1 // Written in the D programming language. 2 3 /++ 4 $(P The `std.uni` module provides an implementation 5 of fundamental Unicode algorithms and data structures. 6 This doesn't include UTF encoding and decoding primitives, 7 see $(REF decode, std,_utf) and $(REF encode, std,_utf) in $(MREF std, utf) 8 for this functionality. ) 9 10 $(SCRIPT inhibitQuickIndex = 1;) 11 $(DIVC quickindex, 12 $(BOOKTABLE, 13 $(TR $(TH Category) $(TH Functions)) 14 $(TR $(TD Decode) $(TD 15 $(LREF byCodePoint) 16 $(LREF byGrapheme) 17 $(LREF decodeGrapheme) 18 $(LREF graphemeStride) 19 )) 20 $(TR $(TD Comparison) $(TD 21 $(LREF icmp) 22 $(LREF sicmp) 23 )) 24 $(TR $(TD Classification) $(TD 25 $(LREF isAlpha) 26 $(LREF isAlphaNum) 27 $(LREF isCodepointSet) 28 $(LREF isControl) 29 $(LREF isFormat) 30 $(LREF isGraphical) 31 $(LREF isIntegralPair) 32 $(LREF isMark) 33 $(LREF isNonCharacter) 34 $(LREF isNumber) 35 $(LREF isPrivateUse) 36 $(LREF isPunctuation) 37 $(LREF isSpace) 38 $(LREF isSurrogate) 39 $(LREF isSurrogateHi) 40 $(LREF isSurrogateLo) 41 $(LREF isSymbol) 42 $(LREF isWhite) 43 )) 44 $(TR $(TD Normalization) $(TD 45 $(LREF NFC) 46 $(LREF NFD) 47 $(LREF NFKD) 48 $(LREF NormalizationForm) 49 $(LREF normalize) 50 )) 51 $(TR $(TD Decompose) $(TD 52 $(LREF decompose) 53 $(LREF decomposeHangul) 54 $(LREF UnicodeDecomposition) 55 )) 56 $(TR $(TD Compose) $(TD 57 $(LREF compose) 58 $(LREF composeJamo) 59 )) 60 $(TR $(TD Sets) $(TD 61 $(LREF CodepointInterval) 62 $(LREF CodepointSet) 63 $(LREF InversionList) 64 $(LREF unicode) 65 )) 66 $(TR $(TD Trie) $(TD 67 $(LREF codepointSetTrie) 68 $(LREF CodepointSetTrie) 69 $(LREF codepointTrie) 70 $(LREF CodepointTrie) 71 $(LREF toTrie) 72 $(LREF toDelegate) 73 )) 74 $(TR $(TD Casing) $(TD 75 $(LREF asCapitalized) 76 $(LREF asLowerCase) 77 $(LREF asUpperCase) 78 $(LREF isLower) 79 $(LREF isUpper) 80 $(LREF toLower) 81 $(LREF toLowerInPlace) 82 $(LREF toUpper) 83 $(LREF toUpperInPlace) 84 )) 85 $(TR $(TD Utf8Matcher) $(TD 86 $(LREF isUtfMatcher) 87 $(LREF MatcherConcept) 88 $(LREF utfMatcher) 89 )) 90 $(TR $(TD Separators) $(TD 91 $(LREF lineSep) 92 $(LREF nelSep) 93 $(LREF paraSep) 94 )) 95 $(TR $(TD Building blocks) $(TD 96 $(LREF allowedIn) 97 $(LREF combiningClass) 98 $(LREF Grapheme) 99 )) 100 )) 101 102 $(P All primitives listed operate on Unicode characters and 103 sets of characters. For functions which operate on ASCII characters 104 and ignore Unicode $(CHARACTERS), see $(MREF std, ascii). 105 For definitions of Unicode $(CHARACTER), $(CODEPOINT) and other terms 106 used throughout this module see the $(S_LINK Terminology, terminology) section 107 below. 108 ) 109 $(P The focus of this module is the core needs of developing Unicode-aware 110 applications. To that effect it provides the following optimized primitives: 111 ) 112 $(UL 113 $(LI Character classification by category and common properties: 114 $(LREF isAlpha), $(LREF isWhite) and others. 115 ) 116 $(LI 117 Case-insensitive string comparison ($(LREF sicmp), $(LREF icmp)). 118 ) 119 $(LI 120 Converting text to any of the four normalization forms via $(LREF normalize). 121 ) 122 $(LI 123 Decoding ($(LREF decodeGrapheme)) and iteration ($(LREF byGrapheme), $(LREF graphemeStride)) 124 by user-perceived characters, that is by $(LREF Grapheme) clusters. 125 ) 126 $(LI 127 Decomposing and composing of individual character(s) according to canonical 128 or compatibility rules, see $(LREF compose) and $(LREF decompose), 129 including the specific version for Hangul syllables $(LREF composeJamo) 130 and $(LREF decomposeHangul). 131 ) 132 ) 133 $(P It's recognized that an application may need further enhancements 134 and extensions, such as less commonly known algorithms, 135 or tailoring existing ones for region specific needs. To help users 136 with building any extra functionality beyond the core primitives, 137 the module provides: 138 ) 139 $(UL 140 $(LI 141 $(LREF CodepointSet), a type for easy manipulation of sets of characters. 142 Besides the typical set algebra it provides an unusual feature: 143 a D source code generator for detection of $(CODEPOINTS) in this set. 144 This is a boon for meta-programming parser frameworks, 145 and is used internally to power classification in small 146 sets like $(LREF isWhite). 147 ) 148 $(LI 149 A way to construct optimal packed multi-stage tables also known as a 150 special case of $(LINK2 https://en.wikipedia.org/wiki/Trie, Trie). 151 The functions $(LREF codepointTrie), $(LREF codepointSetTrie) 152 construct custom tries that map dchar to value. 153 The end result is a fast and predictable $(BIGOH 1) lookup that powers 154 functions like $(LREF isAlpha) and $(LREF combiningClass), 155 but for user-defined data sets. 156 ) 157 $(LI 158 A useful technique for Unicode-aware parsers that perform 159 character classification of encoded $(CODEPOINTS) 160 is to avoid unnecassary decoding at all costs. 161 $(LREF utfMatcher) provides an improvement over the usual workflow 162 of decode-classify-process, combining the decoding and classification 163 steps. By extracting necessary bits directly from encoded 164 $(S_LINK Code unit, code units) matchers achieve 165 significant performance improvements. See $(LREF MatcherConcept) for 166 the common interface of UTF matchers. 167 ) 168 $(LI 169 Generally useful building blocks for customized normalization: 170 $(LREF combiningClass) for querying combining class 171 and $(LREF allowedIn) for testing the Quick_Check 172 property of a given normalization form. 173 ) 174 $(LI 175 Access to a large selection of commonly used sets of $(CODEPOINTS). 176 $(S_LINK Unicode properties, Supported sets) include Script, 177 Block and General Category. The exact contents of a set can be 178 observed in the CLDR utility, on the 179 $(HTTP www.unicode.org/cldr/utility/properties.jsp, property index) page 180 of the Unicode website. 181 See $(LREF unicode) for easy and (optionally) compile-time checked set 182 queries. 183 ) 184 ) 185 $(SECTION Synopsis) 186 --- 187 import std.uni; 188 void main() 189 { 190 // initialize code point sets using script/block or property name 191 // now 'set' contains code points from both scripts. 192 auto set = unicode("Cyrillic") | unicode("Armenian"); 193 // same thing but simpler and checked at compile-time 194 auto ascii = unicode.ASCII; 195 auto currency = unicode.Currency_Symbol; 196 197 // easy set ops 198 auto a = set & ascii; 199 assert(a.empty); // as it has no intersection with ascii 200 a = set | ascii; 201 auto b = currency - a; // subtract all ASCII, Cyrillic and Armenian 202 203 // some properties of code point sets 204 assert(b.length > 45); // 46 items in Unicode 6.1, even more in 6.2 205 // testing presence of a code point in a set 206 // is just fine, it is O(logN) 207 assert(!b['$']); 208 assert(!b['\u058F']); // Armenian dram sign 209 assert(b['¥']); 210 211 // building fast lookup tables, these guarantee O(1) complexity 212 // 1-level Trie lookup table essentially a huge bit-set ~262Kb 213 auto oneTrie = toTrie!1(b); 214 // 2-level far more compact but typically slightly slower 215 auto twoTrie = toTrie!2(b); 216 // 3-level even smaller, and a bit slower yet 217 auto threeTrie = toTrie!3(b); 218 assert(oneTrie['£']); 219 assert(twoTrie['£']); 220 assert(threeTrie['£']); 221 222 // build the trie with the most sensible trie level 223 // and bind it as a functor 224 auto cyrillicOrArmenian = toDelegate(set); 225 auto balance = find!(cyrillicOrArmenian)("Hello ընկեր!"); 226 assert(balance == "ընկեր!"); 227 // compatible with bool delegate(dchar) 228 bool delegate(dchar) bindIt = cyrillicOrArmenian; 229 230 // Normalization 231 string s = "Plain ascii (and not only), is always normalized!"; 232 assert(s is normalize(s));// is the same string 233 234 string nonS = "A\u0308ffin"; // A ligature 235 auto nS = normalize(nonS); // to NFC, the W3C endorsed standard 236 assert(nS == "Äffin"); 237 assert(nS != nonS); 238 string composed = "Äffin"; 239 240 assert(normalize!NFD(composed) == "A\u0308ffin"); 241 // to NFKD, compatibility decomposition useful for fuzzy matching/searching 242 assert(normalize!NFKD("2¹⁰") == "210"); 243 } 244 --- 245 $(SECTION Terminology) 246 $(P The following is a list of important Unicode notions 247 and definitions. Any conventions used specifically in this 248 module alone are marked as such. The descriptions are based on the formal 249 definition as found in $(HTTP www.unicode.org/versions/Unicode6.2.0/ch03.pdf, 250 chapter three of The Unicode Standard Core Specification.) 251 ) 252 $(P $(DEF Abstract character) A unit of information used for the organization, 253 control, or representation of textual data. 254 Note that: 255 $(UL 256 $(LI When representing data, the nature of that data 257 is generally symbolic as opposed to some other 258 kind of data (for example, visual). 259 ) 260 $(LI An abstract character has no concrete form 261 and should not be confused with a $(S_LINK Glyph, glyph). 262 ) 263 $(LI An abstract character does not necessarily 264 correspond to what a user thinks of as a “character” 265 and should not be confused with a $(LREF Grapheme). 266 ) 267 $(LI The abstract characters encoded (see Encoded character) 268 are known as Unicode abstract characters. 269 ) 270 $(LI Abstract characters not directly 271 encoded by the Unicode Standard can often be 272 represented by the use of combining character sequences. 273 ) 274 ) 275 ) 276 $(P $(DEF Canonical decomposition) 277 The decomposition of a character or character sequence 278 that results from recursively applying the canonical 279 mappings found in the Unicode Character Database 280 and these described in Conjoining Jamo Behavior 281 (section 12 of 282 $(HTTP www.unicode.org/uni2book/ch03.pdf, Unicode Conformance)). 283 ) 284 $(P $(DEF Canonical composition) 285 The precise definition of the Canonical composition 286 is the algorithm as specified in $(HTTP www.unicode.org/uni2book/ch03.pdf, 287 Unicode Conformance) section 11. 288 Informally it's the process that does the reverse of the canonical 289 decomposition with the addition of certain rules 290 that e.g. prevent legacy characters from appearing in the composed result. 291 ) 292 $(P $(DEF Canonical equivalent) 293 Two character sequences are said to be canonical equivalents if 294 their full canonical decompositions are identical. 295 ) 296 $(P $(DEF Character) Typically differs by context. 297 For the purpose of this documentation the term $(I character) 298 implies $(I encoded character), that is, a code point having 299 an assigned abstract character (a symbolic meaning). 300 ) 301 $(P $(DEF Code point) Any value in the Unicode codespace; 302 that is, the range of integers from 0 to 10FFFF (hex). 303 Not all code points are assigned to encoded characters. 304 ) 305 $(P $(DEF Code unit) The minimal bit combination that can represent 306 a unit of encoded text for processing or interchange. 307 Depending on the encoding this could be: 308 8-bit code units in the UTF-8 (`char`), 309 16-bit code units in the UTF-16 (`wchar`), 310 and 32-bit code units in the UTF-32 (`dchar`). 311 $(I Note that in UTF-32, a code unit is a code point 312 and is represented by the D `dchar` type.) 313 ) 314 $(P $(DEF Combining character) A character with the General Category 315 of Combining Mark(M). 316 $(UL 317 $(LI All characters with non-zero canonical combining class 318 are combining characters, but the reverse is not the case: 319 there are combining characters with a zero combining class. 320 ) 321 $(LI These characters are not normally used in isolation 322 unless they are being described. They include such characters 323 as accents, diacritics, Hebrew points, Arabic vowel signs, 324 and Indic matras. 325 ) 326 ) 327 ) 328 $(P $(DEF Combining class) 329 A numerical value used by the Unicode Canonical Ordering Algorithm 330 to determine which sequences of combining marks are to be 331 considered canonically equivalent and which are not. 332 ) 333 $(P $(DEF Compatibility decomposition) 334 The decomposition of a character or character sequence that results 335 from recursively applying both the compatibility mappings and 336 the canonical mappings found in the Unicode Character Database, and those 337 described in Conjoining Jamo Behavior no characters 338 can be further decomposed. 339 ) 340 $(P $(DEF Compatibility equivalent) 341 Two character sequences are said to be compatibility 342 equivalents if their full compatibility decompositions are identical. 343 ) 344 $(P $(DEF Encoded character) An association (or mapping) 345 between an abstract character and a code point. 346 ) 347 $(P $(DEF Glyph) The actual, concrete image of a glyph representation 348 having been rasterized or otherwise imaged onto some display surface. 349 ) 350 $(P $(DEF Grapheme base) A character with the property 351 Grapheme_Base, or any standard Korean syllable block. 352 ) 353 $(P $(DEF Grapheme cluster) Defined as the text between 354 grapheme boundaries as specified by Unicode Standard Annex #29, 355 $(HTTP www.unicode.org/reports/tr29/, Unicode text segmentation). 356 Important general properties of a grapheme: 357 $(UL 358 $(LI The grapheme cluster represents a horizontally segmentable 359 unit of text, consisting of some grapheme base (which may 360 consist of a Korean syllable) together with any number of 361 nonspacing marks applied to it. 362 ) 363 $(LI A grapheme cluster typically starts with a grapheme base 364 and then extends across any subsequent sequence of nonspacing marks. 365 A grapheme cluster is most directly relevant to text rendering and 366 processes such as cursor placement and text selection in editing, 367 but may also be relevant to comparison and searching. 368 ) 369 $(LI For many processes, a grapheme cluster behaves as if it was a 370 single character with the same properties as its grapheme base. 371 Effectively, nonspacing marks apply $(I graphically) to the base, 372 but do not change its properties. 373 ) 374 ) 375 $(P This module defines a number of primitives that work with graphemes: 376 $(LREF Grapheme), $(LREF decodeGrapheme) and $(LREF graphemeStride). 377 All of them are using $(I extended grapheme) boundaries 378 as defined in the aforementioned standard annex. 379 ) 380 ) 381 $(P $(DEF Nonspacing mark) A combining character with the 382 General Category of Nonspacing Mark (Mn) or Enclosing Mark (Me). 383 ) 384 $(P $(DEF Spacing mark) A combining character that is not a nonspacing mark. 385 ) 386 $(SECTION Normalization) 387 $(P The concepts of $(S_LINK Canonical equivalent, canonical equivalent) 388 or $(S_LINK Compatibility equivalent, compatibility equivalent) 389 characters in the Unicode Standard make it necessary to have a full, formal 390 definition of equivalence for Unicode strings. 391 String equivalence is determined by a process called normalization, 392 whereby strings are converted into forms which are compared 393 directly for identity. This is the primary goal of the normalization process, 394 see the function $(LREF normalize) to convert into any of 395 the four defined forms. 396 ) 397 $(P A very important attribute of the Unicode Normalization Forms 398 is that they must remain stable between versions of the Unicode Standard. 399 A Unicode string normalized to a particular Unicode Normalization Form 400 in one version of the standard is guaranteed to remain in that Normalization 401 Form for implementations of future versions of the standard. 402 ) 403 $(P The Unicode Standard specifies four normalization forms. 404 Informally, two of these forms are defined by maximal decomposition 405 of equivalent sequences, and two of these forms are defined 406 by maximal $(I composition) of equivalent sequences. 407 $(UL 408 $(LI Normalization Form D (NFD): The $(S_LINK Canonical decomposition, 409 canonical decomposition) of a character sequence.) 410 $(LI Normalization Form KD (NFKD): The $(S_LINK Compatibility decomposition, 411 compatibility decomposition) of a character sequence.) 412 $(LI Normalization Form C (NFC): The canonical composition of the 413 $(S_LINK Canonical decomposition, canonical decomposition) 414 of a coded character sequence.) 415 $(LI Normalization Form KC (NFKC): The canonical composition 416 of the $(S_LINK Compatibility decomposition, 417 compatibility decomposition) of a character sequence) 418 ) 419 ) 420 $(P The choice of the normalization form depends on the particular use case. 421 NFC is the best form for general text, since it's more compatible with 422 strings converted from legacy encodings. NFKC is the preferred form for 423 identifiers, especially where there are security concerns. NFD and NFKD 424 are the most useful for internal processing. 425 ) 426 $(SECTION Construction of lookup tables) 427 $(P The Unicode standard describes a set of algorithms that 428 depend on having the ability to quickly look up various properties 429 of a code point. Given the codespace of about 1 million $(CODEPOINTS), 430 it is not a trivial task to provide a space-efficient solution for 431 the multitude of properties. 432 ) 433 $(P Common approaches such as hash-tables or binary search over 434 sorted code point intervals (as in $(LREF InversionList)) are insufficient. 435 Hash-tables have enormous memory footprint and binary search 436 over intervals is not fast enough for some heavy-duty algorithms. 437 ) 438 $(P The recommended solution (see Unicode Implementation Guidelines) 439 is using multi-stage tables that are an implementation of the 440 $(HTTP en.wikipedia.org/wiki/Trie, Trie) data structure with integer 441 keys and a fixed number of stages. For the remainder of the section 442 this will be called a fixed trie. The following describes a particular 443 implementation that is aimed for the speed of access at the expense 444 of ideal size savings. 445 ) 446 $(P Taking a 2-level Trie as an example the principle of operation is as follows. 447 Split the number of bits in a key (code point, 21 bits) into 2 components 448 (e.g. 15 and 8). The first is the number of bits in the index of the trie 449 and the other is number of bits in each page of the trie. 450 The layout of the trie is then an array of size 2^^bits-of-index followed 451 an array of memory chunks of size 2^^bits-of-page/bits-per-element. 452 ) 453 $(P The number of pages is variable (but not less then 1) 454 unlike the number of entries in the index. The slots of the index 455 all have to contain a number of a page that is present. The lookup is then 456 just a couple of operations - slice the upper bits, 457 lookup an index for these, take a page at this index and use 458 the lower bits as an offset within this page. 459 460 Assuming that pages are laid out consequently 461 in one array at `pages`, the pseudo-code is: 462 ) 463 --- 464 auto elemsPerPage = (2 ^^ bits_per_page) / Value.sizeOfInBits; 465 pages[index[n >> bits_per_page]][n & (elemsPerPage - 1)]; 466 --- 467 $(P Where if `elemsPerPage` is a power of 2 the whole process is 468 a handful of simple instructions and 2 array reads. Subsequent levels 469 of the trie are introduced by recursing on this notion - the index array 470 is treated as values. The number of bits in index is then again 471 split into 2 parts, with pages over 'current-index' and the new 'upper-index'. 472 ) 473 474 $(P For completeness a level 1 trie is simply an array. 475 The current implementation takes advantage of bit-packing values 476 when the range is known to be limited in advance (such as `bool`). 477 See also $(LREF BitPacked) for enforcing it manually. 478 The major size advantage however comes from the fact 479 that multiple $(B identical pages on every level are merged) by construction. 480 ) 481 $(P The process of constructing a trie is more involved and is hidden from 482 the user in a form of the convenience functions $(LREF codepointTrie), 483 $(LREF codepointSetTrie) and the even more convenient $(LREF toTrie). 484 In general a set or built-in AA with `dchar` type 485 can be turned into a trie. The trie object in this module 486 is read-only (immutable); it's effectively frozen after construction. 487 ) 488 $(SECTION Unicode properties) 489 $(P This is a full list of Unicode properties accessible through $(LREF unicode) 490 with specific helpers per category nested within. Consult the 491 $(HTTP www.unicode.org/cldr/utility/properties.jsp, CLDR utility) 492 when in doubt about the contents of a particular set. 493 ) 494 $(P General category sets listed below are only accessible with the 495 $(LREF unicode) shorthand accessor.) 496 $(BOOKTABLE $(B General category ), 497 $(TR $(TH Abb.) $(TH Long form) 498 $(TH Abb.) $(TH Long form)$(TH Abb.) $(TH Long form)) 499 $(TR $(TD L) $(TD Letter) 500 $(TD Cn) $(TD Unassigned) $(TD Po) $(TD Other_Punctuation)) 501 $(TR $(TD Ll) $(TD Lowercase_Letter) 502 $(TD Co) $(TD Private_Use) $(TD Ps) $(TD Open_Punctuation)) 503 $(TR $(TD Lm) $(TD Modifier_Letter) 504 $(TD Cs) $(TD Surrogate) $(TD S) $(TD Symbol)) 505 $(TR $(TD Lo) $(TD Other_Letter) 506 $(TD N) $(TD Number) $(TD Sc) $(TD Currency_Symbol)) 507 $(TR $(TD Lt) $(TD Titlecase_Letter) 508 $(TD Nd) $(TD Decimal_Number) $(TD Sk) $(TD Modifier_Symbol)) 509 $(TR $(TD Lu) $(TD Uppercase_Letter) 510 $(TD Nl) $(TD Letter_Number) $(TD Sm) $(TD Math_Symbol)) 511 $(TR $(TD M) $(TD Mark) 512 $(TD No) $(TD Other_Number) $(TD So) $(TD Other_Symbol)) 513 $(TR $(TD Mc) $(TD Spacing_Mark) 514 $(TD P) $(TD Punctuation) $(TD Z) $(TD Separator)) 515 $(TR $(TD Me) $(TD Enclosing_Mark) 516 $(TD Pc) $(TD Connector_Punctuation) $(TD Zl) $(TD Line_Separator)) 517 $(TR $(TD Mn) $(TD Nonspacing_Mark) 518 $(TD Pd) $(TD Dash_Punctuation) $(TD Zp) $(TD Paragraph_Separator)) 519 $(TR $(TD C) $(TD Other) 520 $(TD Pe) $(TD Close_Punctuation) $(TD Zs) $(TD Space_Separator)) 521 $(TR $(TD Cc) $(TD Control) $(TD Pf) 522 $(TD Final_Punctuation) $(TD -) $(TD Any)) 523 $(TR $(TD Cf) $(TD Format) 524 $(TD Pi) $(TD Initial_Punctuation) $(TD -) $(TD ASCII)) 525 ) 526 $(P Sets for other commonly useful properties that are 527 accessible with $(LREF unicode):) 528 $(BOOKTABLE $(B Common binary properties), 529 $(TR $(TH Name) $(TH Name) $(TH Name)) 530 $(TR $(TD Alphabetic) $(TD Ideographic) $(TD Other_Uppercase)) 531 $(TR $(TD ASCII_Hex_Digit) $(TD IDS_Binary_Operator) $(TD Pattern_Syntax)) 532 $(TR $(TD Bidi_Control) $(TD ID_Start) $(TD Pattern_White_Space)) 533 $(TR $(TD Cased) $(TD IDS_Trinary_Operator) $(TD Quotation_Mark)) 534 $(TR $(TD Case_Ignorable) $(TD Join_Control) $(TD Radical)) 535 $(TR $(TD Dash) $(TD Logical_Order_Exception) $(TD Soft_Dotted)) 536 $(TR $(TD Default_Ignorable_Code_Point) $(TD Lowercase) $(TD STerm)) 537 $(TR $(TD Deprecated) $(TD Math) $(TD Terminal_Punctuation)) 538 $(TR $(TD Diacritic) $(TD Noncharacter_Code_Point) $(TD Unified_Ideograph)) 539 $(TR $(TD Extender) $(TD Other_Alphabetic) $(TD Uppercase)) 540 $(TR $(TD Grapheme_Base) $(TD Other_Default_Ignorable_Code_Point) $(TD Variation_Selector)) 541 $(TR $(TD Grapheme_Extend) $(TD Other_Grapheme_Extend) $(TD White_Space)) 542 $(TR $(TD Grapheme_Link) $(TD Other_ID_Continue) $(TD XID_Continue)) 543 $(TR $(TD Hex_Digit) $(TD Other_ID_Start) $(TD XID_Start)) 544 $(TR $(TD Hyphen) $(TD Other_Lowercase) ) 545 $(TR $(TD ID_Continue) $(TD Other_Math) ) 546 ) 547 $(P Below is the table with block names accepted by $(LREF unicode.block). 548 Note that the shorthand version $(LREF unicode) requires "In" 549 to be prepended to the names of blocks so as to disambiguate 550 scripts and blocks. 551 ) 552 $(BOOKTABLE $(B Blocks), 553 $(TR $(TD Aegean Numbers) $(TD Ethiopic Extended) $(TD Mongolian)) 554 $(TR $(TD Alchemical Symbols) $(TD Ethiopic Extended-A) $(TD Musical Symbols)) 555 $(TR $(TD Alphabetic Presentation Forms) $(TD Ethiopic Supplement) $(TD Myanmar)) 556 $(TR $(TD Ancient Greek Musical Notation) $(TD General Punctuation) $(TD Myanmar Extended-A)) 557 $(TR $(TD Ancient Greek Numbers) $(TD Geometric Shapes) $(TD New Tai Lue)) 558 $(TR $(TD Ancient Symbols) $(TD Georgian) $(TD NKo)) 559 $(TR $(TD Arabic) $(TD Georgian Supplement) $(TD Number Forms)) 560 $(TR $(TD Arabic Extended-A) $(TD Glagolitic) $(TD Ogham)) 561 $(TR $(TD Arabic Mathematical Alphabetic Symbols) $(TD Gothic) $(TD Ol Chiki)) 562 $(TR $(TD Arabic Presentation Forms-A) $(TD Greek and Coptic) $(TD Old Italic)) 563 $(TR $(TD Arabic Presentation Forms-B) $(TD Greek Extended) $(TD Old Persian)) 564 $(TR $(TD Arabic Supplement) $(TD Gujarati) $(TD Old South Arabian)) 565 $(TR $(TD Armenian) $(TD Gurmukhi) $(TD Old Turkic)) 566 $(TR $(TD Arrows) $(TD Halfwidth and Fullwidth Forms) $(TD Optical Character Recognition)) 567 $(TR $(TD Avestan) $(TD Hangul Compatibility Jamo) $(TD Oriya)) 568 $(TR $(TD Balinese) $(TD Hangul Jamo) $(TD Osmanya)) 569 $(TR $(TD Bamum) $(TD Hangul Jamo Extended-A) $(TD Phags-pa)) 570 $(TR $(TD Bamum Supplement) $(TD Hangul Jamo Extended-B) $(TD Phaistos Disc)) 571 $(TR $(TD Basic Latin) $(TD Hangul Syllables) $(TD Phoenician)) 572 $(TR $(TD Batak) $(TD Hanunoo) $(TD Phonetic Extensions)) 573 $(TR $(TD Bengali) $(TD Hebrew) $(TD Phonetic Extensions Supplement)) 574 $(TR $(TD Block Elements) $(TD High Private Use Surrogates) $(TD Playing Cards)) 575 $(TR $(TD Bopomofo) $(TD High Surrogates) $(TD Private Use Area)) 576 $(TR $(TD Bopomofo Extended) $(TD Hiragana) $(TD Rejang)) 577 $(TR $(TD Box Drawing) $(TD Ideographic Description Characters) $(TD Rumi Numeral Symbols)) 578 $(TR $(TD Brahmi) $(TD Imperial Aramaic) $(TD Runic)) 579 $(TR $(TD Braille Patterns) $(TD Inscriptional Pahlavi) $(TD Samaritan)) 580 $(TR $(TD Buginese) $(TD Inscriptional Parthian) $(TD Saurashtra)) 581 $(TR $(TD Buhid) $(TD IPA Extensions) $(TD Sharada)) 582 $(TR $(TD Byzantine Musical Symbols) $(TD Javanese) $(TD Shavian)) 583 $(TR $(TD Carian) $(TD Kaithi) $(TD Sinhala)) 584 $(TR $(TD Chakma) $(TD Kana Supplement) $(TD Small Form Variants)) 585 $(TR $(TD Cham) $(TD Kanbun) $(TD Sora Sompeng)) 586 $(TR $(TD Cherokee) $(TD Kangxi Radicals) $(TD Spacing Modifier Letters)) 587 $(TR $(TD CJK Compatibility) $(TD Kannada) $(TD Specials)) 588 $(TR $(TD CJK Compatibility Forms) $(TD Katakana) $(TD Sundanese)) 589 $(TR $(TD CJK Compatibility Ideographs) $(TD Katakana Phonetic Extensions) $(TD Sundanese Supplement)) 590 $(TR $(TD CJK Compatibility Ideographs Supplement) $(TD Kayah Li) $(TD Superscripts and Subscripts)) 591 $(TR $(TD CJK Radicals Supplement) $(TD Kharoshthi) $(TD Supplemental Arrows-A)) 592 $(TR $(TD CJK Strokes) $(TD Khmer) $(TD Supplemental Arrows-B)) 593 $(TR $(TD CJK Symbols and Punctuation) $(TD Khmer Symbols) $(TD Supplemental Mathematical Operators)) 594 $(TR $(TD CJK Unified Ideographs) $(TD Lao) $(TD Supplemental Punctuation)) 595 $(TR $(TD CJK Unified Ideographs Extension A) $(TD Latin-1 Supplement) $(TD Supplementary Private Use Area-A)) 596 $(TR $(TD CJK Unified Ideographs Extension B) $(TD Latin Extended-A) $(TD Supplementary Private Use Area-B)) 597 $(TR $(TD CJK Unified Ideographs Extension C) $(TD Latin Extended Additional) $(TD Syloti Nagri)) 598 $(TR $(TD CJK Unified Ideographs Extension D) $(TD Latin Extended-B) $(TD Syriac)) 599 $(TR $(TD Combining Diacritical Marks) $(TD Latin Extended-C) $(TD Tagalog)) 600 $(TR $(TD Combining Diacritical Marks for Symbols) $(TD Latin Extended-D) $(TD Tagbanwa)) 601 $(TR $(TD Combining Diacritical Marks Supplement) $(TD Lepcha) $(TD Tags)) 602 $(TR $(TD Combining Half Marks) $(TD Letterlike Symbols) $(TD Tai Le)) 603 $(TR $(TD Common Indic Number Forms) $(TD Limbu) $(TD Tai Tham)) 604 $(TR $(TD Control Pictures) $(TD Linear B Ideograms) $(TD Tai Viet)) 605 $(TR $(TD Coptic) $(TD Linear B Syllabary) $(TD Tai Xuan Jing Symbols)) 606 $(TR $(TD Counting Rod Numerals) $(TD Lisu) $(TD Takri)) 607 $(TR $(TD Cuneiform) $(TD Low Surrogates) $(TD Tamil)) 608 $(TR $(TD Cuneiform Numbers and Punctuation) $(TD Lycian) $(TD Telugu)) 609 $(TR $(TD Currency Symbols) $(TD Lydian) $(TD Thaana)) 610 $(TR $(TD Cypriot Syllabary) $(TD Mahjong Tiles) $(TD Thai)) 611 $(TR $(TD Cyrillic) $(TD Malayalam) $(TD Tibetan)) 612 $(TR $(TD Cyrillic Extended-A) $(TD Mandaic) $(TD Tifinagh)) 613 $(TR $(TD Cyrillic Extended-B) $(TD Mathematical Alphanumeric Symbols) $(TD Transport And Map Symbols)) 614 $(TR $(TD Cyrillic Supplement) $(TD Mathematical Operators) $(TD Ugaritic)) 615 $(TR $(TD Deseret) $(TD Meetei Mayek) $(TD Unified Canadian Aboriginal Syllabics)) 616 $(TR $(TD Devanagari) $(TD Meetei Mayek Extensions) $(TD Unified Canadian Aboriginal Syllabics Extended)) 617 $(TR $(TD Devanagari Extended) $(TD Meroitic Cursive) $(TD Vai)) 618 $(TR $(TD Dingbats) $(TD Meroitic Hieroglyphs) $(TD Variation Selectors)) 619 $(TR $(TD Domino Tiles) $(TD Miao) $(TD Variation Selectors Supplement)) 620 $(TR $(TD Egyptian Hieroglyphs) $(TD Miscellaneous Mathematical Symbols-A) $(TD Vedic Extensions)) 621 $(TR $(TD Emoticons) $(TD Miscellaneous Mathematical Symbols-B) $(TD Vertical Forms)) 622 $(TR $(TD Enclosed Alphanumerics) $(TD Miscellaneous Symbols) $(TD Yijing Hexagram Symbols)) 623 $(TR $(TD Enclosed Alphanumeric Supplement) $(TD Miscellaneous Symbols and Arrows) $(TD Yi Radicals)) 624 $(TR $(TD Enclosed CJK Letters and Months) $(TD Miscellaneous Symbols And Pictographs) $(TD Yi Syllables)) 625 $(TR $(TD Enclosed Ideographic Supplement) $(TD Miscellaneous Technical) ) 626 $(TR $(TD Ethiopic) $(TD Modifier Tone Letters) ) 627 ) 628 $(P Below is the table with script names accepted by $(LREF unicode.script) 629 and by the shorthand version $(LREF unicode):) 630 $(BOOKTABLE $(B Scripts), 631 $(TR $(TD Arabic) $(TD Hanunoo) $(TD Old_Italic)) 632 $(TR $(TD Armenian) $(TD Hebrew) $(TD Old_Persian)) 633 $(TR $(TD Avestan) $(TD Hiragana) $(TD Old_South_Arabian)) 634 $(TR $(TD Balinese) $(TD Imperial_Aramaic) $(TD Old_Turkic)) 635 $(TR $(TD Bamum) $(TD Inherited) $(TD Oriya)) 636 $(TR $(TD Batak) $(TD Inscriptional_Pahlavi) $(TD Osmanya)) 637 $(TR $(TD Bengali) $(TD Inscriptional_Parthian) $(TD Phags_Pa)) 638 $(TR $(TD Bopomofo) $(TD Javanese) $(TD Phoenician)) 639 $(TR $(TD Brahmi) $(TD Kaithi) $(TD Rejang)) 640 $(TR $(TD Braille) $(TD Kannada) $(TD Runic)) 641 $(TR $(TD Buginese) $(TD Katakana) $(TD Samaritan)) 642 $(TR $(TD Buhid) $(TD Kayah_Li) $(TD Saurashtra)) 643 $(TR $(TD Canadian_Aboriginal) $(TD Kharoshthi) $(TD Sharada)) 644 $(TR $(TD Carian) $(TD Khmer) $(TD Shavian)) 645 $(TR $(TD Chakma) $(TD Lao) $(TD Sinhala)) 646 $(TR $(TD Cham) $(TD Latin) $(TD Sora_Sompeng)) 647 $(TR $(TD Cherokee) $(TD Lepcha) $(TD Sundanese)) 648 $(TR $(TD Common) $(TD Limbu) $(TD Syloti_Nagri)) 649 $(TR $(TD Coptic) $(TD Linear_B) $(TD Syriac)) 650 $(TR $(TD Cuneiform) $(TD Lisu) $(TD Tagalog)) 651 $(TR $(TD Cypriot) $(TD Lycian) $(TD Tagbanwa)) 652 $(TR $(TD Cyrillic) $(TD Lydian) $(TD Tai_Le)) 653 $(TR $(TD Deseret) $(TD Malayalam) $(TD Tai_Tham)) 654 $(TR $(TD Devanagari) $(TD Mandaic) $(TD Tai_Viet)) 655 $(TR $(TD Egyptian_Hieroglyphs) $(TD Meetei_Mayek) $(TD Takri)) 656 $(TR $(TD Ethiopic) $(TD Meroitic_Cursive) $(TD Tamil)) 657 $(TR $(TD Georgian) $(TD Meroitic_Hieroglyphs) $(TD Telugu)) 658 $(TR $(TD Glagolitic) $(TD Miao) $(TD Thaana)) 659 $(TR $(TD Gothic) $(TD Mongolian) $(TD Thai)) 660 $(TR $(TD Greek) $(TD Myanmar) $(TD Tibetan)) 661 $(TR $(TD Gujarati) $(TD New_Tai_Lue) $(TD Tifinagh)) 662 $(TR $(TD Gurmukhi) $(TD Nko) $(TD Ugaritic)) 663 $(TR $(TD Han) $(TD Ogham) $(TD Vai)) 664 $(TR $(TD Hangul) $(TD Ol_Chiki) $(TD Yi)) 665 ) 666 $(P Below is the table of names accepted by $(LREF unicode.hangulSyllableType).) 667 $(BOOKTABLE $(B Hangul syllable type), 668 $(TR $(TH Abb.) $(TH Long form)) 669 $(TR $(TD L) $(TD Leading_Jamo)) 670 $(TR $(TD LV) $(TD LV_Syllable)) 671 $(TR $(TD LVT) $(TD LVT_Syllable) ) 672 $(TR $(TD T) $(TD Trailing_Jamo)) 673 $(TR $(TD V) $(TD Vowel_Jamo)) 674 ) 675 References: 676 $(HTTP www.digitalmars.com/d/ascii-table.html, ASCII Table), 677 $(HTTP en.wikipedia.org/wiki/Unicode, Wikipedia), 678 $(HTTP www.unicode.org, The Unicode Consortium), 679 $(HTTP www.unicode.org/reports/tr15/, Unicode normalization forms), 680 $(HTTP www.unicode.org/reports/tr29/, Unicode text segmentation) 681 $(HTTP www.unicode.org/uni2book/ch05.pdf, 682 Unicode Implementation Guidelines) 683 $(HTTP www.unicode.org/uni2book/ch03.pdf, 684 Unicode Conformance) 685 Trademarks: 686 Unicode(tm) is a trademark of Unicode, Inc. 687 688 Copyright: Copyright 2013 - 689 License: $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0). 690 Authors: Dmitry Olshansky 691 Source: $(PHOBOSSRC std/uni/package.d) 692 Standards: $(HTTP www.unicode.org/versions/Unicode6.2.0/, Unicode v6.2) 693 694 Macros: 695 696 SECTION = <h3><a id="$1">$0</a></h3> 697 DEF = <div><a id="$1"><i>$0</i></a></div> 698 S_LINK = <a href="#$1">$+</a> 699 CODEPOINT = $(S_LINK Code point, code point) 700 CODEPOINTS = $(S_LINK Code point, code points) 701 CHARACTER = $(S_LINK Character, character) 702 CHARACTERS = $(S_LINK Character, characters) 703 CLUSTER = $(S_LINK Grapheme cluster, grapheme cluster) 704 +/ 705 module std.uni; 706 707 import std.meta : AliasSeq; 708 import std.range.primitives : back, ElementEncodingType, ElementType, empty, 709 front, hasLength, hasSlicing, isForwardRange, isInputRange, 710 isRandomAccessRange, popFront, put, save; 711 import std.traits : isConvertibleToString, isIntegral, isSomeChar, 712 isSomeString, Unqual, isDynamicArray; 713 // debug = std_uni; 714 715 import std.internal.unicode_tables; // generated file 716 717 debug(std_uni) import std.stdio; // writefln, writeln 718 719 private: 720 721 722 void copyBackwards(T,U)(T[] src, U[] dest) 723 { 724 assert(src.length == dest.length); 725 for (size_t i=src.length; i-- > 0; ) 726 dest[i] = src[i]; 727 } 728 729 void copyForward(T,U)(T[] src, U[] dest) 730 { 731 assert(src.length == dest.length); 732 for (size_t i=0; i<src.length; i++) 733 dest[i] = src[i]; 734 } 735 736 // TODO: update to reflect all major CPUs supporting unaligned reads 737 version (X86) 738 enum hasUnalignedReads = true; 739 else version (X86_64) 740 enum hasUnalignedReads = true; 741 else version (SystemZ) 742 enum hasUnalignedReads = true; 743 else 744 enum hasUnalignedReads = false; // better be safe then sorry 745 746 public enum dchar lineSep = '\u2028'; /// Constant $(CODEPOINT) (0x2028) - line separator. 747 public enum dchar paraSep = '\u2029'; /// Constant $(CODEPOINT) (0x2029) - paragraph separator. 748 public enum dchar nelSep = '\u0085'; /// Constant $(CODEPOINT) (0x0085) - next line. 749 750 // test the intro example 751 @safe unittest 752 { 753 import std.algorithm.searching : find; 754 // initialize code point sets using script/block or property name 755 // set contains code points from both scripts. 756 auto set = unicode("Cyrillic") | unicode("Armenian"); 757 // or simpler and statically-checked look 758 auto ascii = unicode.ASCII; 759 auto currency = unicode.Currency_Symbol; 760 761 // easy set ops 762 auto a = set & ascii; 763 assert(a.empty); // as it has no intersection with ascii 764 a = set | ascii; 765 auto b = currency - a; // subtract all ASCII, Cyrillic and Armenian 766 767 // some properties of code point sets 768 assert(b.length > 45); // 46 items in Unicode 6.1, even more in 6.2 769 // testing presence of a code point in a set 770 // is just fine, it is O(logN) 771 assert(!b['$']); 772 assert(!b['\u058F']); // Armenian dram sign 773 assert(b['¥']); 774 775 // building fast lookup tables, these guarantee O(1) complexity 776 // 1-level Trie lookup table essentially a huge bit-set ~262Kb 777 auto oneTrie = toTrie!1(b); 778 // 2-level far more compact but typically slightly slower 779 auto twoTrie = toTrie!2(b); 780 // 3-level even smaller, and a bit slower yet 781 auto threeTrie = toTrie!3(b); 782 assert(oneTrie['£']); 783 assert(twoTrie['£']); 784 assert(threeTrie['£']); 785 786 // build the trie with the most sensible trie level 787 // and bind it as a functor 788 auto cyrillicOrArmenian = toDelegate(set); 789 auto balance = find!(cyrillicOrArmenian)("Hello ընկեր!"); 790 assert(balance == "ընկեր!"); 791 // compatible with bool delegate(dchar) 792 bool delegate(dchar) bindIt = cyrillicOrArmenian; 793 794 // Normalization 795 string s = "Plain ascii (and not only), is always normalized!"; 796 assert(s is normalize(s));// is the same string 797 798 string nonS = "A\u0308ffin"; // A ligature 799 auto nS = normalize(nonS); // to NFC, the W3C endorsed standard 800 assert(nS == "Äffin"); 801 assert(nS != nonS); 802 string composed = "Äffin"; 803 804 assert(normalize!NFD(composed) == "A\u0308ffin"); 805 // to NFKD, compatibility decomposition useful for fuzzy matching/searching 806 assert(normalize!NFKD("2¹⁰") == "210"); 807 } 808 809 enum lastDchar = 0x10FFFF; 810 811 auto force(T, F)(F from) 812 if (isIntegral!T && !is(T == F)) 813 { 814 assert(from <= T.max && from >= T.min); 815 return cast(T) from; 816 } 817 818 auto force(T, F)(F from) 819 if (isBitPacked!T && !is(T == F)) 820 { 821 assert(from <= 2^^bitSizeOf!T-1); 822 return T(cast(TypeOfBitPacked!T) from); 823 } 824 825 auto force(T, F)(F from) 826 if (is(T == F)) 827 { 828 return from; 829 } 830 831 // repeat X times the bit-pattern in val assuming it's length is 'bits' 832 size_t replicateBits(size_t times, size_t bits)(size_t val) @safe pure nothrow @nogc 833 { 834 static if (times == 1) 835 return val; 836 else static if (bits == 1) 837 { 838 static if (times == size_t.sizeof*8) 839 return val ? size_t.max : 0; 840 else 841 return val ? (1 << times)-1 : 0; 842 } 843 else static if (times % 2) 844 return (replicateBits!(times-1, bits)(val)<<bits) | val; 845 else 846 return replicateBits!(times/2, bits*2)((val << bits) | val); 847 } 848 849 @safe pure nothrow @nogc unittest // for replicate 850 { 851 import std.algorithm.iteration : sum, map; 852 import std.range : iota; 853 size_t m = 0b111; 854 size_t m2 = 0b01; 855 static foreach (i; AliasSeq!(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)) 856 { 857 assert(replicateBits!(i, 3)(m)+1 == (1<<(3*i))); 858 assert(replicateBits!(i, 2)(m2) == iota(0, i).map!"2^^(2*a)"().sum()); 859 } 860 } 861 862 // multiple arrays squashed into one memory block 863 struct MultiArray(Types...) 864 { 865 import std.range.primitives : isOutputRange; 866 this(size_t[] sizes...) @safe pure nothrow 867 { 868 assert(dim == sizes.length); 869 size_t full_size; 870 foreach (i, v; Types) 871 { 872 full_size += spaceFor!(bitSizeOf!v)(sizes[i]); 873 sz[i] = sizes[i]; 874 static if (i >= 1) 875 offsets[i] = offsets[i-1] + 876 spaceFor!(bitSizeOf!(Types[i-1]))(sizes[i-1]); 877 } 878 879 storage = new size_t[full_size]; 880 } 881 882 this(const(size_t)[] raw_offsets, 883 const(size_t)[] raw_sizes, 884 return scope const(size_t)[] data) return scope const @safe pure nothrow @nogc 885 { 886 offsets[] = raw_offsets[]; 887 sz[] = raw_sizes[]; 888 storage = data; 889 } 890 891 @property auto slice(size_t n)()inout pure nothrow @nogc 892 { 893 auto ptr = raw_ptr!n; 894 return packedArrayView!(Types[n])(ptr, sz[n]); 895 } 896 897 @property auto ptr(size_t n)()inout pure nothrow @nogc 898 { 899 auto ptr = raw_ptr!n; 900 return inout(PackedPtr!(Types[n]))(ptr); 901 } 902 903 template length(size_t n) 904 { 905 @property size_t length()const @safe pure nothrow @nogc{ return sz[n]; } 906 907 @property void length(size_t new_size) 908 { 909 if (new_size > sz[n]) 910 {// extend 911 size_t delta = (new_size - sz[n]); 912 sz[n] += delta; 913 delta = spaceFor!(bitSizeOf!(Types[n]))(delta); 914 storage.length += delta;// extend space at end 915 // raw_slice!x must follow resize as it could be moved! 916 // next stmts move all data past this array, last-one-goes-first 917 static if (n != dim-1) 918 { 919 auto start = raw_ptr!(n+1); 920 // len includes delta 921 size_t len = (storage.ptr+storage.length-start); 922 923 copyBackwards(start[0 .. len-delta], start[delta .. len]); 924 925 start[0 .. delta] = 0; 926 // offsets are used for raw_slice, ptr etc. 927 foreach (i; n+1 .. dim) 928 offsets[i] += delta; 929 } 930 } 931 else if (new_size < sz[n]) 932 {// shrink 933 size_t delta = (sz[n] - new_size); 934 sz[n] -= delta; 935 delta = spaceFor!(bitSizeOf!(Types[n]))(delta); 936 // move all data past this array, forward direction 937 static if (n != dim-1) 938 { 939 auto start = raw_ptr!(n+1); 940 size_t len = (storage.ptr+storage.length-start); 941 copyForward(start[0 .. len-delta], start[delta .. len]); 942 943 // adjust offsets last, they affect raw_slice 944 foreach (i; n+1 .. dim) 945 offsets[i] -= delta; 946 } 947 storage.length -= delta; 948 } 949 // else - NOP 950 } 951 } 952 953 @property size_t bytes(size_t n=size_t.max)() const @safe 954 { 955 static if (n == size_t.max) 956 return storage.length*size_t.sizeof; 957 else static if (n != Types.length-1) 958 return (raw_ptr!(n+1)-raw_ptr!n)*size_t.sizeof; 959 else 960 return (storage.ptr+storage.length - raw_ptr!n)*size_t.sizeof; 961 } 962 963 void store(OutRange)(scope OutRange sink) const 964 if (isOutputRange!(OutRange, char)) 965 { 966 import std.format.write : formattedWrite; 967 formattedWrite(sink, "[%( 0x%x, %)]", offsets[]); 968 formattedWrite(sink, ", [%( 0x%x, %)]", sz[]); 969 formattedWrite(sink, ", [%( 0x%x, %)]", storage); 970 } 971 972 private: 973 import std.meta : staticMap; 974 @property auto raw_ptr(size_t n)()inout pure nothrow @nogc 975 { 976 static if (n == 0) 977 return storage.ptr; 978 else 979 { 980 return storage.ptr+offsets[n]; 981 } 982 } 983 enum dim = Types.length; 984 size_t[dim] offsets;// offset for level x 985 size_t[dim] sz;// size of level x 986 alias bitWidth = staticMap!(bitSizeOf, Types); 987 size_t[] storage; 988 } 989 990 @system unittest 991 { 992 import std.conv : text; 993 enum dg = (){ 994 // sizes are: 995 // lvl0: 3, lvl1 : 2, lvl2: 1 996 auto m = MultiArray!(int, ubyte, int)(3,2,1); 997 998 static void check(size_t k, T)(ref T m, int n) 999 { 1000 foreach (i; 0 .. n) 1001 assert(m.slice!(k)[i] == i+1, text("level:",i," : ",m.slice!(k)[0 .. n])); 1002 } 1003 1004 static void checkB(size_t k, T)(ref T m, int n) 1005 { 1006 foreach (i; 0 .. n) 1007 assert(m.slice!(k)[i] == n-i, text("level:",i," : ",m.slice!(k)[0 .. n])); 1008 } 1009 1010 static void fill(size_t k, T)(ref T m, int n) 1011 { 1012 foreach (i; 0 .. n) 1013 m.slice!(k)[i] = force!ubyte(i+1); 1014 } 1015 1016 static void fillB(size_t k, T)(ref T m, int n) 1017 { 1018 foreach (i; 0 .. n) 1019 m.slice!(k)[i] = force!ubyte(n-i); 1020 } 1021 1022 m.length!1 = 100; 1023 fill!1(m, 100); 1024 check!1(m, 100); 1025 1026 m.length!0 = 220; 1027 fill!0(m, 220); 1028 check!1(m, 100); 1029 check!0(m, 220); 1030 1031 m.length!2 = 17; 1032 fillB!2(m, 17); 1033 checkB!2(m, 17); 1034 check!0(m, 220); 1035 check!1(m, 100); 1036 1037 m.length!2 = 33; 1038 checkB!2(m, 17); 1039 fillB!2(m, 33); 1040 checkB!2(m, 33); 1041 check!0(m, 220); 1042 check!1(m, 100); 1043 1044 m.length!1 = 195; 1045 fillB!1(m, 195); 1046 checkB!1(m, 195); 1047 checkB!2(m, 33); 1048 check!0(m, 220); 1049 1050 auto marr = MultiArray!(BitPacked!(uint, 4), BitPacked!(uint, 6))(20, 10); 1051 marr.length!0 = 15; 1052 marr.length!1 = 30; 1053 fill!1(marr, 30); 1054 fill!0(marr, 15); 1055 check!1(marr, 30); 1056 check!0(marr, 15); 1057 return 0; 1058 }; 1059 enum ct = dg(); 1060 auto rt = dg(); 1061 } 1062 1063 @system unittest 1064 {// more bitpacking tests 1065 import std.conv : text; 1066 1067 alias Bitty = 1068 MultiArray!(BitPacked!(size_t, 3) 1069 , BitPacked!(size_t, 4) 1070 , BitPacked!(size_t, 3) 1071 , BitPacked!(size_t, 6) 1072 , bool); 1073 alias fn1 = sliceBits!(13, 16); 1074 alias fn2 = sliceBits!( 9, 13); 1075 alias fn3 = sliceBits!( 6, 9); 1076 alias fn4 = sliceBits!( 0, 6); 1077 static void check(size_t lvl, MA)(ref MA arr){ 1078 for (size_t i = 0; i< arr.length!lvl; i++) 1079 assert(arr.slice!(lvl)[i] == i, text("Mismatch on lvl ", lvl, " idx ", i, " value: ", arr.slice!(lvl)[i])); 1080 } 1081 1082 static void fillIdx(size_t lvl, MA)(ref MA arr){ 1083 for (size_t i = 0; i< arr.length!lvl; i++) 1084 arr.slice!(lvl)[i] = i; 1085 } 1086 Bitty m1; 1087 1088 m1.length!4 = 10; 1089 m1.length!3 = 2^^6; 1090 m1.length!2 = 2^^3; 1091 m1.length!1 = 2^^4; 1092 m1.length!0 = 2^^3; 1093 1094 m1.length!4 = 2^^16; 1095 1096 for (size_t i = 0; i< m1.length!4; i++) 1097 m1.slice!(4)[i] = i % 2; 1098 1099 fillIdx!1(m1); 1100 check!1(m1); 1101 fillIdx!2(m1); 1102 check!2(m1); 1103 fillIdx!3(m1); 1104 check!3(m1); 1105 fillIdx!0(m1); 1106 check!0(m1); 1107 check!3(m1); 1108 check!2(m1); 1109 check!1(m1); 1110 for (size_t i=0; i < 2^^16; i++) 1111 { 1112 m1.slice!(4)[i] = i % 2; 1113 m1.slice!(0)[fn1(i)] = fn1(i); 1114 m1.slice!(1)[fn2(i)] = fn2(i); 1115 m1.slice!(2)[fn3(i)] = fn3(i); 1116 m1.slice!(3)[fn4(i)] = fn4(i); 1117 } 1118 for (size_t i=0; i < 2^^16; i++) 1119 { 1120 assert(m1.slice!(4)[i] == i % 2); 1121 assert(m1.slice!(0)[fn1(i)] == fn1(i)); 1122 assert(m1.slice!(1)[fn2(i)] == fn2(i)); 1123 assert(m1.slice!(2)[fn3(i)] == fn3(i)); 1124 assert(m1.slice!(3)[fn4(i)] == fn4(i)); 1125 } 1126 } 1127 1128 size_t spaceFor(size_t _bits)(size_t new_len) @safe pure nothrow @nogc 1129 { 1130 import std.math.algebraic : nextPow2; 1131 enum bits = _bits == 1 ? 1 : nextPow2(_bits - 1);// see PackedArrayView 1132 static if (bits > 8*size_t.sizeof) 1133 { 1134 static assert(bits % (size_t.sizeof*8) == 0); 1135 return new_len * bits/(8*size_t.sizeof); 1136 } 1137 else 1138 { 1139 enum factor = size_t.sizeof*8/bits; 1140 return (new_len+factor-1)/factor; // rounded up 1141 } 1142 } 1143 1144 template isBitPackableType(T) 1145 { 1146 enum isBitPackableType = isBitPacked!T 1147 || isIntegral!T || is(T == bool) || isSomeChar!T; 1148 } 1149 1150 //============================================================================ 1151 template PackedArrayView(T) 1152 if ((is(T dummy == BitPacked!(U, sz), U, size_t sz) 1153 && isBitPackableType!U) || isBitPackableType!T) 1154 { 1155 import std.math.algebraic : nextPow2; 1156 private enum bits = bitSizeOf!T; 1157 alias PackedArrayView = PackedArrayViewImpl!(T, bits > 1 ? nextPow2(bits - 1) : 1); 1158 } 1159 1160 //unsafe and fast access to a chunk of RAM as if it contains packed values 1161 template PackedPtr(T) 1162 if ((is(T dummy == BitPacked!(U, sz), U, size_t sz) 1163 && isBitPackableType!U) || isBitPackableType!T) 1164 { 1165 import std.math.algebraic : nextPow2; 1166 private enum bits = bitSizeOf!T; 1167 alias PackedPtr = PackedPtrImpl!(T, bits > 1 ? nextPow2(bits - 1) : 1); 1168 } 1169 1170 struct PackedPtrImpl(T, size_t bits) 1171 { 1172 pure nothrow: 1173 static assert(isPow2OrZero(bits)); 1174 1175 this(inout(size_t)* ptr)inout @safe @nogc 1176 { 1177 origin = ptr; 1178 } 1179 1180 private T simpleIndex(size_t n) inout 1181 { 1182 immutable q = n / factor; 1183 immutable r = n % factor; 1184 return cast(T)((origin[q] >> bits*r) & mask); 1185 } 1186 1187 private void simpleWrite(TypeOfBitPacked!T val, size_t n) 1188 in 1189 { 1190 static if (isIntegral!T) 1191 assert(val <= mask); 1192 } 1193 do 1194 { 1195 immutable q = n / factor; 1196 immutable r = n % factor; 1197 immutable tgt_shift = bits*r; 1198 immutable word = origin[q]; 1199 origin[q] = (word & ~(mask << tgt_shift)) 1200 | (cast(size_t) val << tgt_shift); 1201 } 1202 1203 static if (factor == bytesPerWord// can safely pack by byte 1204 || factor == 1 // a whole word at a time 1205 || ((factor == bytesPerWord/2 || factor == bytesPerWord/4) 1206 && hasUnalignedReads)) // this needs unaligned reads 1207 { 1208 static if (factor == bytesPerWord) 1209 alias U = ubyte; 1210 else static if (factor == bytesPerWord/2) 1211 alias U = ushort; 1212 else static if (factor == bytesPerWord/4) 1213 alias U = uint; 1214 else static if (size_t.sizeof == 8 && factor == bytesPerWord/8) 1215 alias U = ulong; 1216 1217 T opIndex(size_t idx) inout 1218 { 1219 T ret; 1220 version (LittleEndian) 1221 ret = __ctfe ? simpleIndex(idx) : 1222 cast(inout(T))(cast(U*) origin)[idx]; 1223 else 1224 ret = simpleIndex(idx); 1225 return ret; 1226 } 1227 1228 static if (isBitPacked!T) // lack of user-defined implicit conversion 1229 { 1230 void opIndexAssign(T val, size_t idx) 1231 { 1232 return opIndexAssign(cast(TypeOfBitPacked!T) val, idx); 1233 } 1234 } 1235 1236 void opIndexAssign(TypeOfBitPacked!T val, size_t idx) 1237 { 1238 version (LittleEndian) 1239 { 1240 if (__ctfe) 1241 simpleWrite(val, idx); 1242 else 1243 (cast(U*) origin)[idx] = cast(U) val; 1244 } 1245 else 1246 simpleWrite(val, idx); 1247 } 1248 } 1249 else 1250 { 1251 T opIndex(size_t n) inout 1252 { 1253 return simpleIndex(n); 1254 } 1255 1256 static if (isBitPacked!T) // lack of user-defined implicit conversion 1257 { 1258 void opIndexAssign(T val, size_t idx) 1259 { 1260 return opIndexAssign(cast(TypeOfBitPacked!T) val, idx); 1261 } 1262 } 1263 1264 void opIndexAssign(TypeOfBitPacked!T val, size_t n) 1265 { 1266 return simpleWrite(val, n); 1267 } 1268 } 1269 1270 private: 1271 // factor - number of elements in one machine word 1272 enum factor = size_t.sizeof*8/bits, mask = 2^^bits-1; 1273 enum bytesPerWord = size_t.sizeof; 1274 size_t* origin; 1275 } 1276 1277 // data is packed only by power of two sized packs per word, 1278 // thus avoiding mul/div overhead at the cost of ultimate packing 1279 // this construct doesn't own memory, only provides access, see MultiArray for usage 1280 struct PackedArrayViewImpl(T, size_t bits) 1281 { 1282 pure nothrow: 1283 1284 this(inout(size_t)* origin, size_t offset, size_t items) inout @safe 1285 { 1286 ptr = inout(PackedPtr!(T))(origin); 1287 ofs = offset; 1288 limit = items; 1289 } 1290 1291 bool zeros(size_t s, size_t e) 1292 in 1293 { 1294 assert(s <= e); 1295 } 1296 do 1297 { 1298 s += ofs; 1299 e += ofs; 1300 immutable pad_s = roundUp(s); 1301 if ( s >= e) 1302 { 1303 foreach (i; s .. e) 1304 if (ptr[i]) 1305 return false; 1306 return true; 1307 } 1308 immutable pad_e = roundDown(e); 1309 size_t i; 1310 for (i=s; i<pad_s; i++) 1311 if (ptr[i]) 1312 return false; 1313 // all in between is x*factor elements 1314 for (size_t j=i/factor; i<pad_e; i+=factor, j++) 1315 if (ptr.origin[j]) 1316 return false; 1317 for (; i<e; i++) 1318 if (ptr[i]) 1319 return false; 1320 return true; 1321 } 1322 1323 T opIndex(size_t idx) inout 1324 in 1325 { 1326 assert(idx < limit); 1327 } 1328 do 1329 { 1330 return ptr[ofs + idx]; 1331 } 1332 1333 static if (isBitPacked!T) // lack of user-defined implicit conversion 1334 { 1335 void opIndexAssign(T val, size_t idx) 1336 { 1337 return opIndexAssign(cast(TypeOfBitPacked!T) val, idx); 1338 } 1339 } 1340 1341 void opIndexAssign(TypeOfBitPacked!T val, size_t idx) 1342 in 1343 { 1344 assert(idx < limit); 1345 } 1346 do 1347 { 1348 ptr[ofs + idx] = val; 1349 } 1350 1351 static if (isBitPacked!T) // lack of user-defined implicit conversions 1352 { 1353 void opSliceAssign(T val, size_t start, size_t end) 1354 { 1355 opSliceAssign(cast(TypeOfBitPacked!T) val, start, end); 1356 } 1357 } 1358 1359 void opSliceAssign(TypeOfBitPacked!T val, size_t start, size_t end) 1360 in 1361 { 1362 assert(start <= end); 1363 assert(end <= limit); 1364 } 1365 do 1366 { 1367 // account for ofsetted view 1368 start += ofs; 1369 end += ofs; 1370 // rounded to factor granularity 1371 immutable pad_start = roundUp(start);// rounded up 1372 if (pad_start >= end) //rounded up >= then end of slice 1373 { 1374 //nothing to gain, use per element assignment 1375 foreach (i; start .. end) 1376 ptr[i] = val; 1377 return; 1378 } 1379 immutable pad_end = roundDown(end); // rounded down 1380 size_t i; 1381 for (i=start; i<pad_start; i++) 1382 ptr[i] = val; 1383 // all in between is x*factor elements 1384 if (pad_start != pad_end) 1385 { 1386 immutable repval = replicateBits!(factor, bits)(val); 1387 for (size_t j=i/factor; i<pad_end; i+=factor, j++) 1388 ptr.origin[j] = repval;// so speed it up by factor 1389 } 1390 for (; i<end; i++) 1391 ptr[i] = val; 1392 } 1393 1394 auto opSlice(size_t from, size_t to)inout 1395 in 1396 { 1397 assert(from <= to); 1398 assert(ofs + to <= limit); 1399 } 1400 do 1401 { 1402 return typeof(this)(ptr.origin, ofs + from, to - from); 1403 } 1404 1405 auto opSlice(){ return opSlice(0, length); } 1406 1407 bool opEquals(T)(auto ref T arr) const 1408 { 1409 if (limit != arr.limit) 1410 return false; 1411 size_t s1 = ofs, s2 = arr.ofs; 1412 size_t e1 = s1 + limit, e2 = s2 + limit; 1413 if (s1 % factor == 0 && s2 % factor == 0 && length % factor == 0) 1414 { 1415 return ptr.origin[s1/factor .. e1/factor] 1416 == arr.ptr.origin[s2/factor .. e2/factor]; 1417 } 1418 for (size_t i=0;i<limit; i++) 1419 if (this[i] != arr[i]) 1420 return false; 1421 return true; 1422 } 1423 1424 @property size_t length()const{ return limit; } 1425 1426 private: 1427 auto roundUp()(size_t val){ return (val+factor-1)/factor*factor; } 1428 auto roundDown()(size_t val){ return val/factor*factor; } 1429 // factor - number of elements in one machine word 1430 enum factor = size_t.sizeof*8/bits; 1431 PackedPtr!(T) ptr; 1432 size_t ofs, limit; 1433 } 1434 1435 1436 private struct SliceOverIndexed(T) 1437 { 1438 enum assignableIndex = is(typeof((){ T.init[0] = Item.init; })); 1439 enum assignableSlice = is(typeof((){ T.init[0 .. 0] = Item.init; })); 1440 auto opIndex(size_t idx)const 1441 in 1442 { 1443 assert(idx < to - from); 1444 } 1445 do 1446 { 1447 return (*arr)[from+idx]; 1448 } 1449 1450 static if (assignableIndex) 1451 void opIndexAssign(Item val, size_t idx) 1452 in 1453 { 1454 assert(idx < to - from); 1455 } 1456 do 1457 { 1458 (*arr)[from+idx] = val; 1459 } 1460 1461 auto opSlice(size_t a, size_t b) 1462 { 1463 return typeof(this)(from+a, from+b, arr); 1464 } 1465 1466 // static if (assignableSlice) 1467 void opSliceAssign(T)(T val, size_t start, size_t end) 1468 { 1469 (*arr)[start+from .. end+from] = val; 1470 } 1471 1472 auto opSlice() 1473 { 1474 return typeof(this)(from, to, arr); 1475 } 1476 1477 @property size_t length()const { return to-from;} 1478 1479 alias opDollar = length; 1480 1481 @property bool empty()const { return from == to; } 1482 1483 @property auto front()const { return (*arr)[from]; } 1484 1485 static if (assignableIndex) 1486 @property void front(Item val) { (*arr)[from] = val; } 1487 1488 @property auto back()const { return (*arr)[to-1]; } 1489 1490 static if (assignableIndex) 1491 @property void back(Item val) { (*arr)[to-1] = val; } 1492 1493 @property auto save() inout { return this; } 1494 1495 void popFront() { from++; } 1496 1497 void popBack() { to--; } 1498 1499 bool opEquals(T)(auto ref T arr) const 1500 { 1501 if (arr.length != length) 1502 return false; 1503 for (size_t i=0; i <length; i++) 1504 if (this[i] != arr[i]) 1505 return false; 1506 return true; 1507 } 1508 private: 1509 alias Item = typeof(T.init[0]); 1510 size_t from, to; 1511 T* arr; 1512 } 1513 1514 @safe pure nothrow @nogc unittest 1515 { 1516 static assert(isRandomAccessRange!(SliceOverIndexed!(int[]))); 1517 } 1518 1519 SliceOverIndexed!(const(T)) sliceOverIndexed(T)(size_t a, size_t b, const(T)* x) 1520 if (is(Unqual!T == T)) 1521 { 1522 return SliceOverIndexed!(const(T))(a, b, x); 1523 } 1524 1525 // BUG? inout is out of reach 1526 //...SliceOverIndexed.arr only parameters or stack based variables can be inout 1527 SliceOverIndexed!T sliceOverIndexed(T)(size_t a, size_t b, T* x) 1528 if (is(Unqual!T == T)) 1529 { 1530 return SliceOverIndexed!T(a, b, x); 1531 } 1532 1533 @system unittest 1534 { 1535 int[] idxArray = [2, 3, 5, 8, 13]; 1536 auto sliced = sliceOverIndexed(0, idxArray.length, &idxArray); 1537 1538 assert(!sliced.empty); 1539 assert(sliced.front == 2); 1540 sliced.front = 1; 1541 assert(sliced.front == 1); 1542 assert(sliced.back == 13); 1543 sliced.popFront(); 1544 assert(sliced.front == 3); 1545 assert(sliced.back == 13); 1546 sliced.back = 11; 1547 assert(sliced.back == 11); 1548 sliced.popBack(); 1549 1550 assert(sliced.front == 3); 1551 assert(sliced[$-1] == 8); 1552 sliced = sliced[]; 1553 assert(sliced[0] == 3); 1554 assert(sliced.back == 8); 1555 sliced = sliced[1..$]; 1556 assert(sliced.front == 5); 1557 sliced = sliced[0..$-1]; 1558 assert(sliced[$-1] == 5); 1559 1560 int[] other = [2, 5]; 1561 assert(sliced[] == sliceOverIndexed(1, 2, &other)); 1562 sliceOverIndexed(0, 2, &idxArray)[0 .. 2] = -1; 1563 assert(idxArray[0 .. 2] == [-1, -1]); 1564 uint[] nullArr = null; 1565 auto nullSlice = sliceOverIndexed(0, 0, &idxArray); 1566 assert(nullSlice.empty); 1567 } 1568 1569 private inout(PackedArrayView!T) packedArrayView(T)(inout(size_t)* ptr, size_t items) 1570 { 1571 return inout(PackedArrayView!T)(ptr, 0, items); 1572 } 1573 1574 1575 //============================================================================ 1576 // Partially unrolled binary search using Shar's method 1577 //============================================================================ 1578 1579 string genUnrolledSwitchSearch(size_t size) @safe pure nothrow 1580 { 1581 import core.bitop : bsr; 1582 import std.array : replace; 1583 import std.conv : to; 1584 assert(isPow2OrZero(size)); 1585 string code = ` 1586 import core.bitop : bsr; 1587 auto power = bsr(m)+1; 1588 switch (power){`; 1589 size_t i = bsr(size); 1590 foreach_reverse (val; 0 .. bsr(size)) 1591 { 1592 auto v = 2^^val; 1593 code ~= ` 1594 case pow: 1595 if (pred(range[idx+m], needle)) 1596 idx += m; 1597 goto case; 1598 `.replace("m", to!string(v)) 1599 .replace("pow", to!string(i)); 1600 i--; 1601 } 1602 code ~= ` 1603 case 0: 1604 if (pred(range[idx], needle)) 1605 idx += 1; 1606 goto default; 1607 `; 1608 code ~= ` 1609 default: 1610 }`; 1611 return code; 1612 } 1613 1614 bool isPow2OrZero(size_t sz) @safe pure nothrow @nogc 1615 { 1616 // See also: std.math.isPowerOf2() 1617 return (sz & (sz-1)) == 0; 1618 } 1619 1620 size_t uniformLowerBound(alias pred, Range, T)(Range range, T needle) 1621 if (is(T : ElementType!Range)) 1622 { 1623 assert(isPow2OrZero(range.length)); 1624 size_t idx = 0, m = range.length/2; 1625 while (m != 0) 1626 { 1627 if (pred(range[idx+m], needle)) 1628 idx += m; 1629 m /= 2; 1630 } 1631 if (pred(range[idx], needle)) 1632 idx += 1; 1633 return idx; 1634 } 1635 1636 size_t switchUniformLowerBound(alias pred, Range, T)(Range range, T needle) 1637 if (is(T : ElementType!Range)) 1638 { 1639 assert(isPow2OrZero(range.length)); 1640 size_t idx = 0, m = range.length/2; 1641 enum max = 1 << 10; 1642 while (m >= max) 1643 { 1644 if (pred(range[idx+m], needle)) 1645 idx += m; 1646 m /= 2; 1647 } 1648 mixin(genUnrolledSwitchSearch(max)); 1649 return idx; 1650 } 1651 1652 template sharMethod(alias uniLowerBound) 1653 { 1654 size_t sharMethod(alias _pred="a<b", Range, T)(Range range, T needle) 1655 if (is(T : ElementType!Range)) 1656 { 1657 import std.functional : binaryFun; 1658 import std.math.algebraic : nextPow2, truncPow2; 1659 alias pred = binaryFun!_pred; 1660 if (range.length == 0) 1661 return 0; 1662 if (isPow2OrZero(range.length)) 1663 return uniLowerBound!pred(range, needle); 1664 size_t n = truncPow2(range.length); 1665 if (pred(range[n-1], needle)) 1666 {// search in another 2^^k area that fully covers the tail of range 1667 size_t k = nextPow2(range.length - n + 1); 1668 return range.length - k + uniLowerBound!pred(range[$-k..$], needle); 1669 } 1670 else 1671 return uniLowerBound!pred(range[0 .. n], needle); 1672 } 1673 } 1674 1675 alias sharLowerBound = sharMethod!uniformLowerBound; 1676 alias sharSwitchLowerBound = sharMethod!switchUniformLowerBound; 1677 1678 @safe unittest 1679 { 1680 import std.array : array; 1681 import std.range : assumeSorted, iota; 1682 1683 auto stdLowerBound(T)(T[] range, T needle) 1684 { 1685 return assumeSorted(range).lowerBound(needle).length; 1686 } 1687 immutable MAX = 5*1173; 1688 auto arr = array(iota(5, MAX, 5)); 1689 assert(arr.length == MAX/5-1); 1690 foreach (i; 0 .. MAX+5) 1691 { 1692 auto st = stdLowerBound(arr, i); 1693 assert(st == sharLowerBound(arr, i)); 1694 assert(st == sharSwitchLowerBound(arr, i)); 1695 } 1696 arr = []; 1697 auto st = stdLowerBound(arr, 33); 1698 assert(st == sharLowerBound(arr, 33)); 1699 assert(st == sharSwitchLowerBound(arr, 33)); 1700 } 1701 //============================================================================ 1702 1703 @safe 1704 { 1705 // hope to see simillar stuff in public interface... once Allocators are out 1706 //@@@BUG moveFront and friends? dunno, for now it's POD-only 1707 1708 @trusted size_t genericReplace(Policy=void, T, Range) 1709 (ref T dest, size_t from, size_t to, Range stuff) 1710 { 1711 import std.algorithm.mutation : copy; 1712 size_t delta = to - from; 1713 size_t stuff_end = from+stuff.length; 1714 if (stuff.length > delta) 1715 {// replace increases length 1716 delta = stuff.length - delta;// now, new is > old by delta 1717 static if (is(Policy == void)) 1718 dest.length = dest.length+delta;//@@@BUG lame @property 1719 else 1720 dest = Policy.realloc(dest, dest.length+delta); 1721 copyBackwards(dest[to .. dest.length-delta], 1722 dest[to+delta .. dest.length]); 1723 copyForward(stuff, dest[from .. stuff_end]); 1724 } 1725 else if (stuff.length == delta) 1726 { 1727 copy(stuff, dest[from .. to]); 1728 } 1729 else 1730 {// replace decreases length by delta 1731 delta = delta - stuff.length; 1732 copy(stuff, dest[from .. stuff_end]); 1733 copyForward(dest[to .. dest.length], 1734 dest[stuff_end .. dest.length-delta]); 1735 static if (is(Policy == void)) 1736 dest.length = dest.length - delta;//@@@BUG lame @property 1737 else 1738 dest = Policy.realloc(dest, dest.length-delta); 1739 } 1740 return stuff_end; 1741 } 1742 1743 1744 // Simple storage manipulation policy 1745 @safe private struct GcPolicy 1746 { 1747 import std.traits : isDynamicArray; 1748 1749 static T[] dup(T)(const T[] arr) 1750 { 1751 return arr.dup; 1752 } 1753 1754 static T[] alloc(T)(size_t size) 1755 { 1756 return new T[size]; 1757 } 1758 1759 static T[] realloc(T)(T[] arr, size_t sz) 1760 { 1761 arr.length = sz; 1762 return arr; 1763 } 1764 1765 static void replaceImpl(T, Range)(ref T[] dest, size_t from, size_t to, Range stuff) 1766 { 1767 replaceInPlace(dest, from, to, stuff); 1768 } 1769 1770 static void append(T, V)(ref T[] arr, V value) 1771 if (!isInputRange!V) 1772 { 1773 arr ~= force!T(value); 1774 } 1775 1776 static void append(T, V)(ref T[] arr, V value) 1777 if (isInputRange!V) 1778 { 1779 insertInPlace(arr, arr.length, value); 1780 } 1781 1782 static void destroy(T)(ref T arr) pure // pure required for -dip25, inferred for -dip1000 1783 if (isDynamicArray!T && is(Unqual!T == T)) 1784 { 1785 debug 1786 { 1787 arr[] = cast(typeof(T.init[0]))(0xdead_beef); 1788 } 1789 arr = null; 1790 } 1791 1792 static void destroy(T)(ref T arr) pure // pure required for -dip25, inferred for -dip1000 1793 if (isDynamicArray!T && !is(Unqual!T == T)) 1794 { 1795 arr = null; 1796 } 1797 } 1798 1799 // ditto 1800 @safe struct ReallocPolicy 1801 { 1802 import std.range.primitives : hasLength; 1803 1804 static T[] dup(T)(const T[] arr) 1805 { 1806 auto result = alloc!T(arr.length); 1807 result[] = arr[]; 1808 return result; 1809 } 1810 1811 static T[] alloc(T)(size_t size) @trusted 1812 { 1813 import std.internal.memory : enforceMalloc; 1814 1815 import core.checkedint : mulu; 1816 bool overflow; 1817 size_t nbytes = mulu(size, T.sizeof, overflow); 1818 if (overflow) assert(0); 1819 1820 auto ptr = cast(T*) enforceMalloc(nbytes); 1821 return ptr[0 .. size]; 1822 } 1823 1824 static T[] realloc(T)(return scope T[] arr, size_t size) @trusted 1825 { 1826 import std.internal.memory : enforceRealloc; 1827 if (!size) 1828 { 1829 destroy(arr); 1830 return null; 1831 } 1832 1833 import core.checkedint : mulu; 1834 bool overflow; 1835 size_t nbytes = mulu(size, T.sizeof, overflow); 1836 if (overflow) assert(0); 1837 1838 auto ptr = cast(T*) enforceRealloc(arr.ptr, nbytes); 1839 return ptr[0 .. size]; 1840 } 1841 1842 static void replaceImpl(T, Range)(ref T[] dest, size_t from, size_t to, Range stuff) 1843 { 1844 genericReplace!(ReallocPolicy)(dest, from, to, stuff); 1845 } 1846 1847 static void append(T, V)(ref T[] arr, V value) 1848 if (!isInputRange!V) 1849 { 1850 if (arr.length == size_t.max) assert(0); 1851 arr = realloc(arr, arr.length+1); 1852 arr[$-1] = force!T(value); 1853 } 1854 1855 pure @safe unittest 1856 { 1857 int[] arr; 1858 ReallocPolicy.append(arr, 3); 1859 1860 import std.algorithm.comparison : equal; 1861 assert(equal(arr, [3])); 1862 } 1863 1864 static void append(T, V)(ref T[] arr, V value) 1865 if (isInputRange!V && hasLength!V) 1866 { 1867 import core.checkedint : addu; 1868 bool overflow; 1869 size_t nelems = addu(arr.length, value.length, overflow); 1870 if (overflow) assert(0); 1871 1872 arr = realloc(arr, nelems); 1873 1874 import std.algorithm.mutation : copy; 1875 copy(value, arr[$-value.length..$]); 1876 } 1877 1878 pure @safe unittest 1879 { 1880 int[] arr; 1881 ReallocPolicy.append(arr, [1,2,3]); 1882 1883 import std.algorithm.comparison : equal; 1884 assert(equal(arr, [1,2,3])); 1885 } 1886 1887 static void destroy(T)(scope ref T[] arr) @trusted 1888 { 1889 import core.memory : pureFree; 1890 if (arr.ptr) 1891 pureFree(arr.ptr); 1892 arr = null; 1893 } 1894 } 1895 1896 //build hack 1897 alias _RealArray = CowArray!ReallocPolicy; 1898 1899 pure @safe unittest 1900 { 1901 import std.algorithm.comparison : equal; 1902 1903 with(ReallocPolicy) 1904 { 1905 bool test(T, U, V)(T orig, size_t from, size_t to, U toReplace, V result, 1906 string file = __FILE__, size_t line = __LINE__) 1907 { 1908 { 1909 replaceImpl(orig, from, to, toReplace); 1910 scope(exit) destroy(orig); 1911 if (!equal(orig, result)) 1912 return false; 1913 } 1914 return true; 1915 } 1916 static T[] arr(T)(T[] args... ) 1917 { 1918 return dup(args); 1919 } 1920 1921 assert(test(arr([1, 2, 3, 4]), 0, 0, [5, 6, 7], [5, 6, 7, 1, 2, 3, 4])); 1922 assert(test(arr([1, 2, 3, 4]), 0, 2, cast(int[])[], [3, 4])); 1923 assert(test(arr([1, 2, 3, 4]), 0, 4, [5, 6, 7], [5, 6, 7])); 1924 assert(test(arr([1, 2, 3, 4]), 0, 2, [5, 6, 7], [5, 6, 7, 3, 4])); 1925 assert(test(arr([1, 2, 3, 4]), 2, 3, [5, 6, 7], [1, 2, 5, 6, 7, 4])); 1926 } 1927 } 1928 1929 /** 1930 Tests if T is some kind a set of code points. Intended for template constraints. 1931 */ 1932 public template isCodepointSet(T) 1933 { 1934 static if (is(T dummy == InversionList!(Args), Args...)) 1935 enum isCodepointSet = true; 1936 else 1937 enum isCodepointSet = false; 1938 } 1939 1940 /** 1941 Tests if `T` is a pair of integers that implicitly convert to `V`. 1942 The following code must compile for any pair `T`: 1943 --- 1944 (T x){ V a = x[0]; V b = x[1];} 1945 --- 1946 The following must not compile: 1947 --- 1948 (T x){ V c = x[2];} 1949 --- 1950 */ 1951 public template isIntegralPair(T, V=uint) 1952 { 1953 enum isIntegralPair = is(typeof((T x){ V a = x[0]; V b = x[1];})) 1954 && !is(typeof((T x){ V c = x[2]; })); 1955 } 1956 1957 1958 /** 1959 The recommended default type for set of $(CODEPOINTS). 1960 For details, see the current implementation: $(LREF InversionList). 1961 */ 1962 public alias CodepointSet = InversionList!GcPolicy; 1963 1964 1965 //@@@BUG: std.typecons tuples depend on std.format to produce fields mixin 1966 // which relies on std.uni.isGraphical and this chain blows up with Forward reference error 1967 // hence below doesn't seem to work 1968 // public alias CodepointInterval = Tuple!(uint, "a", uint, "b"); 1969 1970 /** 1971 The recommended type of $(REF Tuple, std,_typecons) 1972 to represent [a, b$(RPAREN) intervals of $(CODEPOINTS). As used in $(LREF InversionList). 1973 Any interval type should pass $(LREF isIntegralPair) trait. 1974 */ 1975 public struct CodepointInterval 1976 { 1977 pure: 1978 uint[2] _tuple; 1979 alias _tuple this; 1980 1981 @safe pure nothrow @nogc: 1982 1983 this(uint low, uint high) 1984 { 1985 _tuple[0] = low; 1986 _tuple[1] = high; 1987 } 1988 bool opEquals(T)(T val) const 1989 { 1990 return this[0] == val[0] && this[1] == val[1]; 1991 } 1992 @property ref inout(uint) a() return inout { return _tuple[0]; } 1993 @property ref inout(uint) b() return inout { return _tuple[1]; } 1994 } 1995 1996 /** 1997 $(P 1998 `InversionList` is a set of $(CODEPOINTS) 1999 represented as an array of open-right [a, b$(RPAREN) 2000 intervals (see $(LREF CodepointInterval) above). 2001 The name comes from the way the representation reads left to right. 2002 For instance a set of all values [10, 50$(RPAREN), [80, 90$(RPAREN), 2003 plus a singular value 60 looks like this: 2004 ) 2005 --- 2006 10, 50, 60, 61, 80, 90 2007 --- 2008 $(P 2009 The way to read this is: start with negative meaning that all numbers 2010 smaller then the next one are not present in this set (and positive - 2011 the contrary). Then switch positive/negative after each 2012 number passed from left to right. 2013 ) 2014 $(P This way negative spans until 10, then positive until 50, 2015 then negative until 60, then positive until 61, and so on. 2016 As seen this provides a space-efficient storage of highly redundant data 2017 that comes in long runs. A description which Unicode $(CHARACTER) 2018 properties fit nicely. The technique itself could be seen as a variation 2019 on $(LINK2 https://en.wikipedia.org/wiki/Run-length_encoding, RLE encoding). 2020 ) 2021 2022 $(P Sets are value types (just like `int` is) thus they 2023 are never aliased. 2024 ) 2025 Example: 2026 --- 2027 auto a = CodepointSet('a', 'z'+1); 2028 auto b = CodepointSet('A', 'Z'+1); 2029 auto c = a; 2030 a = a | b; 2031 assert(a == CodepointSet('A', 'Z'+1, 'a', 'z'+1)); 2032 assert(a != c); 2033 --- 2034 $(P See also $(LREF unicode) for simpler construction of sets 2035 from predefined ones. 2036 ) 2037 2038 $(P Memory usage is 8 bytes per each contiguous interval in a set. 2039 The value semantics are achieved by using the 2040 $(HTTP en.wikipedia.org/wiki/Copy-on-write, COW) technique 2041 and thus it's $(RED not) safe to cast this type to $(D_KEYWORD shared). 2042 ) 2043 2044 Note: 2045 $(P It's not recommended to rely on the template parameters 2046 or the exact type of a current $(CODEPOINT) set in `std.uni`. 2047 The type and parameters may change when the standard 2048 allocators design is finalized. 2049 Use $(LREF isCodepointSet) with templates or just stick with the default 2050 alias $(LREF CodepointSet) throughout the whole code base. 2051 ) 2052 */ 2053 public struct InversionList(SP=GcPolicy) 2054 { 2055 import std.range : assumeSorted; 2056 2057 /** 2058 Construct from another code point set of any type. 2059 */ 2060 this(Set)(Set set) pure 2061 if (isCodepointSet!Set) 2062 { 2063 uint[] arr; 2064 foreach (v; set.byInterval) 2065 { 2066 arr ~= v.a; 2067 arr ~= v.b; 2068 } 2069 data = CowArray!(SP).reuse(arr); 2070 } 2071 2072 /** 2073 Construct a set from a forward range of code point intervals. 2074 */ 2075 this(Range)(Range intervals) pure 2076 if (isForwardRange!Range && isIntegralPair!(ElementType!Range)) 2077 { 2078 uint[] arr; 2079 foreach (v; intervals) 2080 { 2081 SP.append(arr, v.a); 2082 SP.append(arr, v.b); 2083 } 2084 data = CowArray!(SP).reuse(arr); 2085 sanitize(); //enforce invariant: sort intervals etc. 2086 } 2087 2088 //helper function that avoids sanity check to be CTFE-friendly 2089 private static fromIntervals(Range)(Range intervals) pure 2090 { 2091 import std.algorithm.iteration : map; 2092 import std.range : roundRobin; 2093 auto flattened = roundRobin(intervals.save.map!"a[0]"(), 2094 intervals.save.map!"a[1]"()); 2095 InversionList set; 2096 set.data = CowArray!(SP)(flattened); 2097 return set; 2098 } 2099 //ditto untill sort is CTFE-able 2100 private static fromIntervals()(uint[] intervals...) pure 2101 in 2102 { 2103 import std.conv : text; 2104 assert(intervals.length % 2 == 0, "Odd number of interval bounds [a, b)!"); 2105 for (uint i = 0; i < intervals.length; i += 2) 2106 { 2107 auto a = intervals[i], b = intervals[i+1]; 2108 assert(a < b, text("illegal interval [a, b): ", a, " > ", b)); 2109 } 2110 } 2111 do 2112 { 2113 InversionList set; 2114 set.data = CowArray!(SP)(intervals); 2115 return set; 2116 } 2117 2118 /** 2119 Construct a set from plain values of code point intervals. 2120 */ 2121 this()(uint[] intervals...) 2122 in 2123 { 2124 import std.conv : text; 2125 assert(intervals.length % 2 == 0, "Odd number of interval bounds [a, b)!"); 2126 for (uint i = 0; i < intervals.length; i += 2) 2127 { 2128 auto a = intervals[i], b = intervals[i+1]; 2129 assert(a < b, text("illegal interval [a, b): ", a, " > ", b)); 2130 } 2131 } 2132 do 2133 { 2134 data = CowArray!(SP)(intervals); 2135 sanitize(); //enforce invariant: sort intervals etc. 2136 } 2137 2138 /// 2139 pure @safe unittest 2140 { 2141 import std.algorithm.comparison : equal; 2142 2143 auto set = CodepointSet('a', 'z'+1, 'а', 'я'+1); 2144 foreach (v; 'a'..'z'+1) 2145 assert(set[v]); 2146 // Cyrillic lowercase interval 2147 foreach (v; 'а'..'я'+1) 2148 assert(set[v]); 2149 //specific order is not required, intervals may interesect 2150 auto set2 = CodepointSet('а', 'я'+1, 'a', 'd', 'b', 'z'+1); 2151 //the same end result 2152 assert(set2.byInterval.equal(set.byInterval)); 2153 // test constructor this(Range)(Range intervals) 2154 auto chessPiecesWhite = CodepointInterval(9812, 9818); 2155 auto chessPiecesBlack = CodepointInterval(9818, 9824); 2156 auto set3 = CodepointSet([chessPiecesWhite, chessPiecesBlack]); 2157 foreach (v; '♔'..'♟'+1) 2158 assert(set3[v]); 2159 } 2160 2161 /** 2162 Get range that spans all of the $(CODEPOINT) intervals in this $(LREF InversionList). 2163 */ 2164 @property auto byInterval() scope 2165 { 2166 // TODO: change this to data[] once the -dip1000 errors have been fixed 2167 // see e.g. https://github.com/dlang/phobos/pull/6638 2168 import std.array : array; 2169 return Intervals!(typeof(data.array))(data.array); 2170 } 2171 2172 @safe unittest 2173 { 2174 import std.algorithm.comparison : equal; 2175 import std.typecons : tuple; 2176 2177 auto set = CodepointSet('A', 'D'+1, 'a', 'd'+1); 2178 2179 assert(set.byInterval.equal([tuple('A','E'), tuple('a','e')])); 2180 } 2181 2182 package(std) @property const(CodepointInterval)[] intervals() const 2183 { 2184 import std.array : array; 2185 return Intervals!(typeof(data[]))(data[]).array; 2186 } 2187 2188 /** 2189 Tests the presence of code point `val` in this set. 2190 */ 2191 bool opIndex(uint val) const 2192 { 2193 // the <= ensures that searching in interval of [a, b) for 'a' you get .length == 1 2194 // return assumeSorted!((a,b) => a <= b)(data[]).lowerBound(val).length & 1; 2195 return sharSwitchLowerBound!"a <= b"(data[], val) & 1; 2196 } 2197 2198 /// 2199 pure @safe unittest 2200 { 2201 auto gothic = unicode.Gothic; 2202 // Gothic letter ahsa 2203 assert(gothic['\U00010330']); 2204 // no ascii in Gothic obviously 2205 assert(!gothic['$']); 2206 } 2207 2208 2209 // Linear scan for `ch`. Useful only for small sets. 2210 // TODO: 2211 // used internally in std.regex 2212 // should be properly exposed in a public API ? 2213 package(std) auto scanFor()(dchar ch) const 2214 { 2215 immutable len = data.length; 2216 for (size_t i = 0; i < len; i++) 2217 if (ch < data[i]) 2218 return i & 1; 2219 return 0; 2220 } 2221 2222 /// Number of $(CODEPOINTS) in this set 2223 @property size_t length() 2224 { 2225 size_t sum = 0; 2226 foreach (iv; byInterval) 2227 { 2228 sum += iv.b - iv.a; 2229 } 2230 return sum; 2231 } 2232 2233 // bootstrap full set operations from 4 primitives (suitable as a template mixin): 2234 // addInterval, skipUpTo, dropUpTo & byInterval iteration 2235 //============================================================================ 2236 public: 2237 /** 2238 $(P Sets support natural syntax for set algebra, namely: ) 2239 $(BOOKTABLE , 2240 $(TR $(TH Operator) $(TH Math notation) $(TH Description) ) 2241 $(TR $(TD &) $(TD a ∩ b) $(TD intersection) ) 2242 $(TR $(TD |) $(TD a ∪ b) $(TD union) ) 2243 $(TR $(TD -) $(TD a ∖ b) $(TD subtraction) ) 2244 $(TR $(TD ~) $(TD a ~ b) $(TD symmetric set difference i.e. (a ∪ b) \ (a ∩ b)) ) 2245 ) 2246 */ 2247 This opBinary(string op, U)(U rhs) 2248 if (isCodepointSet!U || is(U:dchar)) 2249 { 2250 static if (op == "&" || op == "|" || op == "~") 2251 {// symmetric ops thus can swap arguments to reuse r-value 2252 static if (is(U:dchar)) 2253 { 2254 auto tmp = this; 2255 mixin("tmp "~op~"= rhs; "); 2256 return tmp; 2257 } 2258 else 2259 { 2260 static if (is(Unqual!U == U)) 2261 { 2262 // try hard to reuse r-value 2263 mixin("rhs "~op~"= this;"); 2264 return rhs; 2265 } 2266 else 2267 { 2268 auto tmp = this; 2269 mixin("tmp "~op~"= rhs;"); 2270 return tmp; 2271 } 2272 } 2273 } 2274 else static if (op == "-") // anti-symmetric 2275 { 2276 auto tmp = this; 2277 tmp -= rhs; 2278 return tmp; 2279 } 2280 else 2281 static assert(0, "no operator "~op~" defined for Set"); 2282 } 2283 2284 /// 2285 pure @safe unittest 2286 { 2287 import std.algorithm.comparison : equal; 2288 import std.range : iota; 2289 2290 auto lower = unicode.LowerCase; 2291 auto upper = unicode.UpperCase; 2292 auto ascii = unicode.ASCII; 2293 2294 assert((lower & upper).empty); // no intersection 2295 auto lowerASCII = lower & ascii; 2296 assert(lowerASCII.byCodepoint.equal(iota('a', 'z'+1))); 2297 // throw away all of the lowercase ASCII 2298 assert((ascii - lower).length == 128 - 26); 2299 2300 auto onlyOneOf = lower ~ ascii; 2301 assert(!onlyOneOf['Δ']); // not ASCII and not lowercase 2302 assert(onlyOneOf['$']); // ASCII and not lowercase 2303 assert(!onlyOneOf['a']); // ASCII and lowercase 2304 assert(onlyOneOf['я']); // not ASCII but lowercase 2305 2306 // throw away all cased letters from ASCII 2307 auto noLetters = ascii - (lower | upper); 2308 assert(noLetters.length == 128 - 26*2); 2309 } 2310 2311 /// The 'op=' versions of the above overloaded operators. 2312 ref This opOpAssign(string op, U)(U rhs) 2313 if (isCodepointSet!U || is(U:dchar)) 2314 { 2315 static if (op == "|") // union 2316 { 2317 static if (is(U:dchar)) 2318 { 2319 this.addInterval(rhs, rhs+1); 2320 return this; 2321 } 2322 else 2323 return this.add(rhs); 2324 } 2325 else static if (op == "&") // intersection 2326 return this.intersect(rhs);// overloaded 2327 else static if (op == "-") // set difference 2328 return this.sub(rhs);// overloaded 2329 else static if (op == "~") // symmetric set difference 2330 { 2331 auto copy = this & rhs; 2332 this |= rhs; 2333 this -= copy; 2334 return this; 2335 } 2336 else 2337 static assert(0, "no operator "~op~" defined for Set"); 2338 } 2339 2340 /** 2341 Tests the presence of codepoint `ch` in this set, 2342 the same as $(LREF opIndex). 2343 */ 2344 bool opBinaryRight(string op: "in", U)(U ch) const 2345 if (is(U : dchar)) 2346 { 2347 return this[ch]; 2348 } 2349 2350 /// 2351 pure @safe unittest 2352 { 2353 assert('я' in unicode.Cyrillic); 2354 assert(!('z' in unicode.Cyrillic)); 2355 } 2356 2357 2358 2359 /** 2360 * Obtains a set that is the inversion of this set. 2361 * 2362 * See_Also: $(LREF inverted) 2363 */ 2364 auto opUnary(string op: "!")() 2365 { 2366 return this.inverted; 2367 } 2368 2369 /** 2370 A range that spans each $(CODEPOINT) in this set. 2371 */ 2372 @property auto byCodepoint() 2373 { 2374 static struct CodepointRange 2375 { 2376 this(This set) 2377 { 2378 r = set.byInterval; 2379 if (!r.empty) 2380 cur = r.front.a; 2381 } 2382 2383 @property dchar front() const 2384 { 2385 return cast(dchar) cur; 2386 } 2387 2388 @property bool empty() const 2389 { 2390 return r.empty; 2391 } 2392 2393 void popFront() 2394 { 2395 cur++; 2396 while (cur >= r.front.b) 2397 { 2398 r.popFront(); 2399 if (r.empty) 2400 break; 2401 cur = r.front.a; 2402 } 2403 } 2404 private: 2405 uint cur; 2406 @(imported!"core.attribute".mutableRefInit) typeof(This.init.byInterval) r; 2407 } 2408 2409 return CodepointRange(this); 2410 } 2411 2412 /// 2413 pure @safe unittest 2414 { 2415 import std.algorithm.comparison : equal; 2416 import std.range : iota; 2417 2418 auto set = unicode.ASCII; 2419 set.byCodepoint.equal(iota(0, 0x80)); 2420 } 2421 2422 /** 2423 $(P Obtain textual representation of this set in from of 2424 open-right intervals and feed it to `sink`. 2425 ) 2426 $(P Used by various standard formatting facilities such as 2427 $(REF formattedWrite, std,format), $(REF write, std,stdio), 2428 $(REF writef, std,stdio), $(REF to, std,conv) and others. 2429 ) 2430 Example: 2431 --- 2432 import std.conv; 2433 assert(unicode.ASCII.to!string == "[0..128$(RPAREN)"); 2434 --- 2435 */ 2436 2437 private import std.format.spec : FormatSpec; 2438 2439 /*************************************** 2440 * Obtain a textual representation of this InversionList 2441 * in form of open-right intervals. 2442 * 2443 * The formatting flag is applied individually to each value, for example: 2444 * $(LI $(B %s) and $(B %d) format the intervals as a [low .. high$(RPAREN) range of integrals) 2445 * $(LI $(B %x) formats the intervals as a [low .. high$(RPAREN) range of lowercase hex characters) 2446 * $(LI $(B %X) formats the intervals as a [low .. high$(RPAREN) range of uppercase hex characters) 2447 */ 2448 void toString(Writer)(scope Writer sink, scope const ref FormatSpec!char fmt) /* const */ 2449 { 2450 import std.format.write : formatValue; 2451 auto range = byInterval; 2452 if (range.empty) 2453 return; 2454 2455 while (1) 2456 { 2457 auto i = range.front; 2458 range.popFront(); 2459 2460 put(sink, "["); 2461 formatValue(sink, i.a, fmt); 2462 put(sink, ".."); 2463 formatValue(sink, i.b, fmt); 2464 put(sink, ")"); 2465 if (range.empty) return; 2466 put(sink, " "); 2467 } 2468 } 2469 2470 /// 2471 pure @safe unittest 2472 { 2473 import std.conv : to; 2474 import std.format : format; 2475 import std.uni : unicode; 2476 2477 // This was originally using Cyrillic script. 2478 // Unfortunately this is a pretty active range for changes, 2479 // and hence broke in an update. 2480 // Therefore the range Basic latin was used instead as it 2481 // unlikely to ever change. 2482 2483 assert(unicode.InBasic_latin.to!string == "[0..128)"); 2484 2485 // The specs '%s' and '%d' are equivalent to the to!string call above. 2486 assert(format("%d", unicode.InBasic_latin) == unicode.InBasic_latin.to!string); 2487 2488 assert(format("%#x", unicode.InBasic_latin) == "[0..0x80)"); 2489 assert(format("%#X", unicode.InBasic_latin) == "[0..0X80)"); 2490 } 2491 2492 pure @safe unittest 2493 { 2494 import std.exception : assertThrown; 2495 import std.format : format, FormatException; 2496 assertThrown!FormatException(format("%z", unicode.ASCII)); 2497 } 2498 2499 2500 /** 2501 Add an interval [a, b$(RPAREN) to this set. 2502 */ 2503 ref add()(uint a, uint b) 2504 { 2505 addInterval(a, b); 2506 return this; 2507 } 2508 2509 /// 2510 pure @safe unittest 2511 { 2512 CodepointSet someSet; 2513 someSet.add('0', '5').add('A','Z'+1); 2514 someSet.add('5', '9'+1); 2515 assert(someSet['0']); 2516 assert(someSet['5']); 2517 assert(someSet['9']); 2518 assert(someSet['Z']); 2519 } 2520 2521 private: 2522 2523 package(std) // used from: std.regex.internal.parser 2524 ref intersect(U)(U rhs) 2525 if (isCodepointSet!U) 2526 { 2527 Marker mark; 2528 foreach ( i; rhs.byInterval) 2529 { 2530 mark = this.dropUpTo(i.a, mark); 2531 mark = this.skipUpTo(i.b, mark); 2532 } 2533 this.dropUpTo(uint.max, mark); 2534 return this; 2535 } 2536 2537 ref intersect()(dchar ch) 2538 { 2539 foreach (i; byInterval) 2540 if (i.a <= ch && ch < i.b) 2541 return this = This.init.add(ch, ch+1); 2542 this = This.init; 2543 return this; 2544 } 2545 2546 pure @safe unittest 2547 { 2548 assert(unicode.Cyrillic.intersect('-').byInterval.empty); 2549 } 2550 2551 ref sub()(dchar ch) 2552 { 2553 return subChar(ch); 2554 } 2555 2556 // same as the above except that skip & drop parts are swapped 2557 package(std) // used from: std.regex.internal.parser 2558 ref sub(U)(U rhs) 2559 if (isCodepointSet!U) 2560 { 2561 Marker mark; 2562 foreach (i; rhs.byInterval) 2563 { 2564 mark = this.skipUpTo(i.a, mark); 2565 mark = this.dropUpTo(i.b, mark); 2566 } 2567 return this; 2568 } 2569 2570 package(std) // used from: std.regex.internal.parse 2571 ref add(U)(U rhs) 2572 if (isCodepointSet!U) 2573 { 2574 Marker start; 2575 foreach (i; rhs.byInterval) 2576 { 2577 start = addInterval(i.a, i.b, start); 2578 } 2579 return this; 2580 } 2581 2582 // end of mixin-able part 2583 //============================================================================ 2584 public: 2585 /** 2586 Obtains a set that is the inversion of this set. 2587 2588 See the '!' $(LREF opUnary) for the same but using operators. 2589 */ 2590 @property auto inverted() 2591 { 2592 InversionList inversion = this; 2593 if (inversion.data.length == 0) 2594 { 2595 inversion.addInterval(0, lastDchar+1); 2596 return inversion; 2597 } 2598 if (inversion.data[0] != 0) 2599 genericReplace(inversion.data, 0, 0, [0]); 2600 else 2601 genericReplace(inversion.data, 0, 1, cast(uint[]) null); 2602 if (data[data.length-1] != lastDchar+1) 2603 genericReplace(inversion.data, 2604 inversion.data.length, inversion.data.length, [lastDchar+1]); 2605 else 2606 genericReplace(inversion.data, 2607 inversion.data.length-1, inversion.data.length, cast(uint[]) null); 2608 2609 return inversion; 2610 } 2611 2612 /// 2613 pure @safe unittest 2614 { 2615 auto set = unicode.ASCII; 2616 // union with the inverse gets all of the code points in the Unicode 2617 assert((set | set.inverted).length == 0x110000); 2618 // no intersection with the inverse 2619 assert((set & set.inverted).empty); 2620 } 2621 2622 package(std) static string toSourceCode(const(CodepointInterval)[] range, string funcName) 2623 { 2624 import std.algorithm.searching : countUntil; 2625 import std.format : format; 2626 enum maxBinary = 3; 2627 static string linearScope(R)(R ivals, string indent) 2628 { 2629 string result = indent~"{\n"; 2630 string deeper = indent~" "; 2631 foreach (ival; ivals) 2632 { 2633 immutable span = ival[1] - ival[0]; 2634 assert(span != 0); 2635 if (span == 1) 2636 { 2637 result ~= format("%sif (ch == %s) return true;\n", deeper, ival[0]); 2638 } 2639 else if (span == 2) 2640 { 2641 result ~= format("%sif (ch == %s || ch == %s) return true;\n", 2642 deeper, ival[0], ival[0]+1); 2643 } 2644 else 2645 { 2646 if (ival[0] != 0) // dchar is unsigned and < 0 is useless 2647 result ~= format("%sif (ch < %s) return false;\n", deeper, ival[0]); 2648 result ~= format("%sif (ch < %s) return true;\n", deeper, ival[1]); 2649 } 2650 } 2651 result ~= format("%sreturn false;\n%s}\n", deeper, indent); // including empty range of intervals 2652 return result; 2653 } 2654 2655 static string binaryScope(R)(R ivals, string indent) @safe 2656 { 2657 // time to do unrolled comparisons? 2658 if (ivals.length < maxBinary) 2659 return linearScope(ivals, indent); 2660 else 2661 return bisect(ivals, ivals.length/2, indent); 2662 } 2663 2664 // not used yet if/elsebinary search is far better with DMD as of 2.061 2665 // and GDC is doing fine job either way 2666 static string switchScope(R)(R ivals, string indent) 2667 { 2668 string result = indent~"switch (ch){\n"; 2669 string deeper = indent~" "; 2670 foreach (ival; ivals) 2671 { 2672 if (ival[0]+1 == ival[1]) 2673 { 2674 result ~= format("%scase %s: return true;\n", 2675 deeper, ival[0]); 2676 } 2677 else 2678 { 2679 result ~= format("%scase %s: .. case %s: return true;\n", 2680 deeper, ival[0], ival[1]-1); 2681 } 2682 } 2683 result ~= deeper~"default: return false;\n"~indent~"}\n"; 2684 return result; 2685 } 2686 2687 static string bisect(R)(R range, size_t idx, string indent) 2688 { 2689 string deeper = indent ~ " "; 2690 // bisect on one [a, b) interval at idx 2691 string result = indent~"{\n"; 2692 // less branch, < a 2693 result ~= format("%sif (ch < %s)\n%s", 2694 deeper, range[idx][0], binaryScope(range[0 .. idx], deeper)); 2695 // middle point, >= a && < b 2696 result ~= format("%selse if (ch < %s) return true;\n", 2697 deeper, range[idx][1]); 2698 // greater or equal branch, >= b 2699 result ~= format("%selse\n%s", 2700 deeper, binaryScope(range[idx+1..$], deeper)); 2701 return result~indent~"}\n"; 2702 } 2703 2704 string code = format("bool %s(dchar ch) @safe pure nothrow @nogc\n", 2705 funcName.empty ? "function" : funcName); 2706 // special case first bisection to be on ASCII vs beyond 2707 auto tillAscii = countUntil!"a[0] > 0x80"(range); 2708 if (tillAscii <= 0) // everything is ASCII or nothing is ascii (-1 & 0) 2709 code ~= binaryScope(range, ""); 2710 else 2711 code ~= bisect(range, tillAscii, ""); 2712 return code; 2713 } 2714 2715 /** 2716 Generates string with D source code of unary function with name of 2717 `funcName` taking a single `dchar` argument. If `funcName` is empty 2718 the code is adjusted to be a lambda function. 2719 2720 The function generated tests if the $(CODEPOINT) passed 2721 belongs to this set or not. The result is to be used with string mixin. 2722 The intended usage area is aggressive optimization via meta programming 2723 in parser generators and the like. 2724 2725 Note: Use with care for relatively small or regular sets. It 2726 could end up being slower then just using multi-staged tables. 2727 2728 Example: 2729 --- 2730 import std.stdio; 2731 2732 // construct set directly from [a, b$RPAREN intervals 2733 auto set = CodepointSet(10, 12, 45, 65, 100, 200); 2734 writeln(set); 2735 writeln(set.toSourceCode("func")); 2736 --- 2737 2738 The above outputs something along the lines of: 2739 --- 2740 bool func(dchar ch) @safe pure nothrow @nogc 2741 { 2742 if (ch < 45) 2743 { 2744 if (ch == 10 || ch == 11) return true; 2745 return false; 2746 } 2747 else if (ch < 65) return true; 2748 else 2749 { 2750 if (ch < 100) return false; 2751 if (ch < 200) return true; 2752 return false; 2753 } 2754 } 2755 --- 2756 */ 2757 string toSourceCode(string funcName="") 2758 { 2759 import std.array : array; 2760 auto range = byInterval.array(); 2761 return toSourceCode(range, funcName); 2762 } 2763 2764 /** 2765 True if this set doesn't contain any $(CODEPOINTS). 2766 */ 2767 @property bool empty() const 2768 { 2769 return data.length == 0; 2770 } 2771 2772 /// 2773 pure @safe unittest 2774 { 2775 CodepointSet emptySet; 2776 assert(emptySet.length == 0); 2777 assert(emptySet.empty); 2778 } 2779 2780 private: 2781 alias This = typeof(this); 2782 alias Marker = size_t; 2783 2784 // a random-access range of integral pairs 2785 static struct Intervals(Range) 2786 { 2787 import std.range.primitives : hasAssignableElements; 2788 2789 this(Range sp) scope 2790 { 2791 slice = sp; 2792 start = 0; 2793 end = sp.length; 2794 } 2795 2796 this(Range sp, size_t s, size_t e) scope 2797 { 2798 slice = sp; 2799 start = s; 2800 end = e; 2801 } 2802 2803 @property auto front()const 2804 { 2805 immutable a = slice[start]; 2806 immutable b = slice[start+1]; 2807 return CodepointInterval(a, b); 2808 } 2809 2810 //may break sorted property - but we need std.sort to access it 2811 //hence package(std) protection attribute 2812 static if (hasAssignableElements!Range) 2813 package(std) @property void front(CodepointInterval val) 2814 { 2815 slice[start] = val.a; 2816 slice[start+1] = val.b; 2817 } 2818 2819 @property auto back()const 2820 { 2821 immutable a = slice[end-2]; 2822 immutable b = slice[end-1]; 2823 return CodepointInterval(a, b); 2824 } 2825 2826 //ditto about package 2827 static if (hasAssignableElements!Range) 2828 package(std) @property void back(CodepointInterval val) 2829 { 2830 slice[end-2] = val.a; 2831 slice[end-1] = val.b; 2832 } 2833 2834 void popFront() 2835 { 2836 start += 2; 2837 } 2838 2839 void popBack() 2840 { 2841 end -= 2; 2842 } 2843 2844 auto opIndex(size_t idx) const 2845 { 2846 immutable a = slice[start+idx*2]; 2847 immutable b = slice[start+idx*2+1]; 2848 return CodepointInterval(a, b); 2849 } 2850 2851 //ditto about package 2852 static if (hasAssignableElements!Range) 2853 package(std) void opIndexAssign(CodepointInterval val, size_t idx) 2854 { 2855 slice[start+idx*2] = val.a; 2856 slice[start+idx*2+1] = val.b; 2857 } 2858 2859 auto opSlice(size_t s, size_t e) 2860 { 2861 return Intervals(slice, s*2+start, e*2+start); 2862 } 2863 2864 @property size_t length()const { return slice.length/2; } 2865 2866 @property bool empty()const { return start == end; } 2867 2868 @property auto save(){ return this; } 2869 private: 2870 size_t start, end; 2871 Range slice; 2872 } 2873 2874 // called after construction from intervals 2875 // to make sure invariants hold 2876 void sanitize() 2877 { 2878 import std.algorithm.comparison : max; 2879 import std.algorithm.mutation : SwapStrategy; 2880 import std.algorithm.sorting : sort; 2881 if (data.length == 0) 2882 return; 2883 alias Ival = CodepointInterval; 2884 //intervals wrapper for a _range_ over packed array 2885 auto ivals = Intervals!(typeof(data[]))(data[]); 2886 //@@@BUG@@@ can't use "a.a < b.a" see 2887 // https://issues.dlang.org/show_bug.cgi?id=12265 2888 sort!((a,b) => a.a < b.a, SwapStrategy.stable)(ivals); 2889 // what follows is a variation on stable remove 2890 // differences: 2891 // - predicate is binary, and is tested against 2892 // the last kept element (at 'i'). 2893 // - predicate mutates lhs (merges rhs into lhs) 2894 size_t len = ivals.length; 2895 size_t i = 0; 2896 size_t j = 1; 2897 while (j < len) 2898 { 2899 if (ivals[i].b >= ivals[j].a) 2900 { 2901 ivals[i] = Ival(ivals[i].a, max(ivals[i].b, ivals[j].b)); 2902 j++; 2903 } 2904 else //unmergable 2905 { 2906 // check if there is a hole after merges 2907 // (in the best case we do 0 writes to ivals) 2908 if (j != i+1) 2909 ivals[i+1] = ivals[j]; //copy over 2910 i++; 2911 j++; 2912 } 2913 } 2914 len = i + 1; 2915 for (size_t k=0; k + 1 < len; k++) 2916 { 2917 assert(ivals[k].a < ivals[k].b); 2918 assert(ivals[k].b < ivals[k+1].a); 2919 } 2920 data.length = len * 2; 2921 } 2922 2923 // special case for normal InversionList 2924 ref subChar(dchar ch) 2925 { 2926 auto mark = skipUpTo(ch); 2927 if (mark != data.length 2928 && data[mark] == ch && data[mark-1] == ch) 2929 { 2930 // it has split, meaning that ch happens to be in one of intervals 2931 data[mark] = data[mark]+1; 2932 } 2933 return this; 2934 } 2935 2936 // 2937 Marker addInterval(int a, int b, Marker hint=Marker.init) scope 2938 in 2939 { 2940 assert(a <= b); 2941 } 2942 do 2943 { 2944 import std.range : assumeSorted, SearchPolicy; 2945 auto range = assumeSorted(data[]); 2946 size_t pos; 2947 size_t a_idx = hint + range[hint..$].lowerBound!(SearchPolicy.gallop)(a).length; 2948 if (a_idx == range.length) 2949 { 2950 // [---+++----++++----++++++] 2951 // [ a b] 2952 data.append(a, b); 2953 return data.length-1; 2954 } 2955 size_t b_idx = range[a_idx .. range.length].lowerBound!(SearchPolicy.gallop)(b).length+a_idx; 2956 uint[3] buf = void; 2957 uint to_insert; 2958 debug(std_uni) 2959 { 2960 writefln("a_idx=%d; b_idx=%d;", a_idx, b_idx); 2961 } 2962 if (b_idx == range.length) 2963 { 2964 // [-------++++++++----++++++-] 2965 // [ s a b] 2966 if (a_idx & 1)// a in positive 2967 { 2968 buf[0] = b; 2969 to_insert = 1; 2970 } 2971 else// a in negative 2972 { 2973 buf[0] = a; 2974 buf[1] = b; 2975 to_insert = 2; 2976 } 2977 pos = genericReplace(data, a_idx, b_idx, buf[0 .. to_insert]); 2978 return pos - 1; 2979 } 2980 2981 uint top = data[b_idx]; 2982 2983 debug(std_uni) 2984 { 2985 writefln("a_idx=%d; b_idx=%d;", a_idx, b_idx); 2986 writefln("a=%s; b=%s; top=%s;", a, b, top); 2987 } 2988 if (a_idx & 1) 2989 {// a in positive 2990 if (b_idx & 1)// b in positive 2991 { 2992 // [-------++++++++----++++++-] 2993 // [ s a b ] 2994 buf[0] = top; 2995 to_insert = 1; 2996 } 2997 else // b in negative 2998 { 2999 // [-------++++++++----++++++-] 3000 // [ s a b ] 3001 if (top == b) 3002 { 3003 assert(b_idx+1 < data.length); 3004 buf[0] = data[b_idx+1]; 3005 pos = genericReplace(data, a_idx, b_idx+2, buf[0 .. 1]); 3006 return pos - 1; 3007 } 3008 buf[0] = b; 3009 buf[1] = top; 3010 to_insert = 2; 3011 } 3012 } 3013 else 3014 { // a in negative 3015 if (b_idx & 1) // b in positive 3016 { 3017 // [----------+++++----++++++-] 3018 // [ a b ] 3019 buf[0] = a; 3020 buf[1] = top; 3021 to_insert = 2; 3022 } 3023 else// b in negative 3024 { 3025 // [----------+++++----++++++-] 3026 // [ a s b ] 3027 if (top == b) 3028 { 3029 assert(b_idx+1 < data.length); 3030 buf[0] = a; 3031 buf[1] = data[b_idx+1]; 3032 pos = genericReplace(data, a_idx, b_idx+2, buf[0 .. 2]); 3033 return pos - 1; 3034 } 3035 buf[0] = a; 3036 buf[1] = b; 3037 buf[2] = top; 3038 to_insert = 3; 3039 } 3040 } 3041 pos = genericReplace(data, a_idx, b_idx+1, buf[0 .. to_insert]); 3042 debug(std_uni) 3043 { 3044 writefln("marker idx: %d; length=%d", pos, data[pos], data.length); 3045 writeln("inserting ", buf[0 .. to_insert]); 3046 } 3047 return pos - 1; 3048 } 3049 3050 // 3051 Marker dropUpTo(uint a, Marker pos=Marker.init) 3052 in 3053 { 3054 assert(pos % 2 == 0); // at start of interval 3055 } 3056 do 3057 { 3058 auto range = assumeSorted!"a <= b"(data[pos .. data.length]); 3059 if (range.empty) 3060 return pos; 3061 size_t idx = pos; 3062 idx += range.lowerBound(a).length; 3063 3064 debug(std_uni) 3065 { 3066 writeln("dropUpTo full length=", data.length); 3067 writeln(pos,"~~~", idx); 3068 } 3069 if (idx == data.length) 3070 return genericReplace(data, pos, idx, cast(uint[])[]); 3071 if (idx & 1) 3072 { // a in positive 3073 //[--+++----++++++----+++++++------...] 3074 // |<---si s a t 3075 genericReplace(data, pos, idx, [a]); 3076 } 3077 else 3078 { // a in negative 3079 //[--+++----++++++----+++++++-------+++...] 3080 // |<---si s a t 3081 genericReplace(data, pos, idx, cast(uint[])[]); 3082 } 3083 return pos; 3084 } 3085 3086 // 3087 Marker skipUpTo(uint a, Marker pos=Marker.init) 3088 out(result) 3089 { 3090 assert(result % 2 == 0);// always start of interval 3091 //(may be 0-width after-split) 3092 } 3093 do 3094 { 3095 assert(data.length % 2 == 0); 3096 auto range = assumeSorted!"a <= b"(data[pos .. data.length]); 3097 size_t idx = pos+range.lowerBound(a).length; 3098 3099 if (idx >= data.length) // could have Marker point to recently removed stuff 3100 return data.length; 3101 3102 if (idx & 1)// inside of interval, check for split 3103 { 3104 3105 immutable top = data[idx]; 3106 if (top == a)// no need to split, it's end 3107 return idx+1; 3108 immutable start = data[idx-1]; 3109 if (a == start) 3110 return idx-1; 3111 // split it up 3112 genericReplace(data, idx, idx+1, [a, a, top]); 3113 return idx+1; // avoid odd index 3114 } 3115 return idx; 3116 } 3117 3118 CowArray!SP data; 3119 } 3120 3121 pure @safe unittest 3122 { 3123 import std.conv : to; 3124 assert(unicode.ASCII.to!string() == "[0..128)"); 3125 } 3126 3127 // pedantic version for ctfe, and aligned-access only architectures 3128 @system private uint safeRead24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc 3129 { 3130 idx *= 3; 3131 version (LittleEndian) 3132 return ptr[idx] + (cast(uint) ptr[idx+1]<<8) 3133 + (cast(uint) ptr[idx+2]<<16); 3134 else 3135 return (cast(uint) ptr[idx]<<16) + (cast(uint) ptr[idx+1]<<8) 3136 + ptr[idx+2]; 3137 } 3138 3139 // ditto 3140 @system private void safeWrite24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc 3141 { 3142 idx *= 3; 3143 version (LittleEndian) 3144 { 3145 ptr[idx] = val & 0xFF; 3146 ptr[idx+1] = (val >> 8) & 0xFF; 3147 ptr[idx+2] = (val >> 16) & 0xFF; 3148 } 3149 else 3150 { 3151 ptr[idx] = (val >> 16) & 0xFF; 3152 ptr[idx+1] = (val >> 8) & 0xFF; 3153 ptr[idx+2] = val & 0xFF; 3154 } 3155 } 3156 3157 // unaligned x86-like read/write functions 3158 @system private uint unalignedRead24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc 3159 { 3160 uint* src = cast(uint*)(ptr+3*idx); 3161 version (LittleEndian) 3162 return *src & 0xFF_FFFF; 3163 else 3164 return *src >> 8; 3165 } 3166 3167 // ditto 3168 @system private void unalignedWrite24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc 3169 { 3170 uint* dest = cast(uint*)(cast(ubyte*) ptr + 3*idx); 3171 version (LittleEndian) 3172 *dest = val | (*dest & 0xFF00_0000); 3173 else 3174 *dest = (val << 8) | (*dest & 0xFF); 3175 } 3176 3177 @system private uint read24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc 3178 { 3179 static if (hasUnalignedReads) 3180 return __ctfe ? safeRead24(ptr, idx) : unalignedRead24(ptr, idx); 3181 else 3182 return safeRead24(ptr, idx); 3183 } 3184 3185 @system private void write24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc 3186 { 3187 static if (hasUnalignedReads) 3188 return __ctfe ? safeWrite24(ptr, val, idx) : unalignedWrite24(ptr, val, idx); 3189 else 3190 return safeWrite24(ptr, val, idx); 3191 } 3192 3193 struct CowArray(SP=GcPolicy) 3194 { 3195 import std.range.primitives : hasLength; 3196 3197 @safe: 3198 static auto reuse(uint[] arr) 3199 { 3200 CowArray cow; 3201 cow.data = arr; 3202 SP.append(cow.data, 1); 3203 assert(cow.refCount == 1); 3204 assert(cow.length == arr.length); 3205 return cow; 3206 } 3207 3208 this(Range)(Range range) 3209 if (isInputRange!Range && hasLength!Range) 3210 { 3211 import std.algorithm.mutation : copy; 3212 length = range.length; 3213 copy(range, data[0..$-1]); 3214 } 3215 3216 this(Range)(Range range) 3217 if (isForwardRange!Range && !hasLength!Range) 3218 { 3219 import std.algorithm.mutation : copy; 3220 import std.range.primitives : walkLength; 3221 immutable len = walkLength(range.save); 3222 length = len; 3223 copy(range, data[0..$-1]); 3224 } 3225 3226 this(this) 3227 { 3228 if (!empty) 3229 { 3230 refCount = refCount + 1; 3231 } 3232 } 3233 3234 ~this() 3235 { 3236 if (!empty) 3237 { 3238 immutable cnt = refCount; 3239 if (cnt == 1) 3240 SP.destroy(data); 3241 else 3242 refCount = cnt - 1; 3243 } 3244 } 3245 3246 // no ref-count for empty U24 array 3247 @property bool empty() const { return data.length == 0; } 3248 3249 // report one less then actual size 3250 @property size_t length() const 3251 { 3252 return data.length ? data.length - 1 : 0; 3253 } 3254 3255 //+ an extra slot for ref-count 3256 @property void length(size_t len) 3257 { 3258 import std.algorithm.comparison : min; 3259 import std.algorithm.mutation : copy; 3260 if (len == 0) 3261 { 3262 if (!empty) 3263 freeThisReference(); 3264 return; 3265 } 3266 immutable total = len + 1; // including ref-count 3267 if (empty) 3268 { 3269 data = SP.alloc!uint(total); 3270 refCount = 1; 3271 return; 3272 } 3273 immutable cur_cnt = refCount; 3274 if (cur_cnt != 1) // have more references to this memory 3275 { 3276 refCount = cur_cnt - 1; 3277 auto new_data = SP.alloc!uint(total); 3278 // take shrinking into account 3279 auto to_copy = min(total, data.length) - 1; 3280 copy(data[0 .. to_copy], new_data[0 .. to_copy]); 3281 data = new_data; // before setting refCount! 3282 refCount = 1; 3283 } 3284 else // 'this' is the only reference 3285 { 3286 // use the realloc (hopefully in-place operation) 3287 data = SP.realloc(data, total); 3288 refCount = 1; // setup a ref-count in the new end of the array 3289 } 3290 } 3291 3292 alias opDollar = length; 3293 3294 uint opIndex()(size_t idx)const 3295 { 3296 return data[idx]; 3297 } 3298 3299 void opIndexAssign(uint val, size_t idx) 3300 { 3301 auto cnt = refCount; 3302 if (cnt != 1) 3303 dupThisReference(cnt); 3304 data[idx] = val; 3305 } 3306 3307 // 3308 auto opSlice(size_t from, size_t to) 3309 { 3310 if (!empty) 3311 { 3312 auto cnt = refCount; 3313 if (cnt != 1) 3314 dupThisReference(cnt); 3315 } 3316 return data[from .. to]; 3317 3318 } 3319 3320 // 3321 auto opSlice(size_t from, size_t to) const 3322 { 3323 return data[from .. to]; 3324 } 3325 3326 // length slices before the ref count 3327 auto opSlice() 3328 { 3329 return opSlice(0, length); 3330 } 3331 3332 // ditto 3333 auto opSlice() const 3334 { 3335 return opSlice(0, length); 3336 } 3337 3338 void append(Range)(Range range) 3339 if (isInputRange!Range && hasLength!Range && is(ElementType!Range : uint)) 3340 { 3341 size_t nl = length + range.length; 3342 length = nl; 3343 copy(range, this[nl-range.length .. nl]); 3344 } 3345 3346 void append()(uint[] val...) 3347 { 3348 length = length + val.length; 3349 data[$-val.length-1 .. $-1] = val[]; 3350 } 3351 3352 bool opEquals()(auto const ref CowArray rhs)const 3353 { 3354 if (empty ^ rhs.empty) 3355 return false; // one is empty and the other isn't 3356 return empty || data[0..$-1] == rhs.data[0..$-1]; 3357 } 3358 3359 private: 3360 // ref-count is right after the data 3361 @property uint refCount() const 3362 { 3363 return data[$-1]; 3364 } 3365 3366 @property void refCount(uint cnt) 3367 { 3368 data[$-1] = cnt; 3369 } 3370 3371 void freeThisReference() 3372 { 3373 immutable count = refCount; 3374 if (count != 1) // have more references to this memory 3375 { 3376 // dec shared ref-count 3377 refCount = count - 1; 3378 data = []; 3379 } 3380 else 3381 SP.destroy(data); 3382 assert(!data.ptr); 3383 } 3384 3385 void dupThisReference(uint count) 3386 in 3387 { 3388 assert(!empty && count != 1 && count == refCount); 3389 } 3390 do 3391 { 3392 import std.algorithm.mutation : copy; 3393 // dec shared ref-count 3394 refCount = count - 1; 3395 // copy to the new chunk of RAM 3396 auto new_data = SP.alloc!uint(data.length); 3397 // bit-blit old stuff except the counter 3398 copy(data[0..$-1], new_data[0..$-1]); 3399 data = new_data; // before setting refCount! 3400 refCount = 1; // so that this updates the right one 3401 } 3402 3403 uint[] data; 3404 } 3405 3406 pure @safe unittest// Uint24 tests 3407 { 3408 import std.algorithm.comparison : equal; 3409 import std.algorithm.mutation : copy; 3410 import std.conv : text; 3411 import std.range : iota, chain; 3412 import std.range.primitives : isBidirectionalRange, isOutputRange; 3413 void funcRef(T)(ref T u24) 3414 { 3415 u24.length = 2; 3416 u24[1] = 1024; 3417 T u24_c = u24; 3418 assert(u24[1] == 1024); 3419 u24.length = 0; 3420 assert(u24.empty); 3421 u24.append([1, 2]); 3422 assert(equal(u24[], [1, 2])); 3423 u24.append(111); 3424 assert(equal(u24[], [1, 2, 111])); 3425 assert(!u24_c.empty && u24_c[1] == 1024); 3426 u24.length = 3; 3427 copy(iota(0, 3), u24[]); 3428 assert(equal(u24[], iota(0, 3))); 3429 assert(u24_c[1] == 1024); 3430 } 3431 3432 void func2(T)(T u24) 3433 { 3434 T u24_2 = u24; 3435 T u24_3; 3436 u24_3 = u24_2; 3437 assert(u24_2 == u24_3); 3438 assert(equal(u24[], u24_2[])); 3439 assert(equal(u24_2[], u24_3[])); 3440 funcRef(u24_3); 3441 3442 assert(equal(u24_3[], iota(0, 3))); 3443 assert(!equal(u24_2[], u24_3[])); 3444 assert(equal(u24_2[], u24[])); 3445 u24_2 = u24_3; 3446 assert(equal(u24_2[], iota(0, 3))); 3447 // to test that passed arg is intact outside 3448 // plus try out opEquals 3449 u24 = u24_3; 3450 u24 = T.init; 3451 u24_3 = T.init; 3452 assert(u24.empty); 3453 assert(u24 == u24_3); 3454 assert(u24 != u24_2); 3455 } 3456 3457 static foreach (Policy; AliasSeq!(GcPolicy, ReallocPolicy)) 3458 {{ 3459 alias Range = typeof(CowArray!Policy.init[]); 3460 alias U24A = CowArray!Policy; 3461 static assert(isForwardRange!Range); 3462 static assert(isBidirectionalRange!Range); 3463 static assert(isOutputRange!(Range, uint)); 3464 static assert(isRandomAccessRange!(Range)); 3465 3466 auto arr = U24A([42u, 36, 100]); 3467 assert(arr[0] == 42); 3468 assert(arr[1] == 36); 3469 arr[0] = 72; 3470 arr[1] = 0xFE_FEFE; 3471 assert(arr[0] == 72); 3472 assert(arr[1] == 0xFE_FEFE); 3473 assert(arr[2] == 100); 3474 U24A arr2 = arr; 3475 assert(arr2[0] == 72); 3476 arr2[0] = 11; 3477 // test COW-ness 3478 assert(arr[0] == 72); 3479 assert(arr2[0] == 11); 3480 // set this to about 100M to stress-test COW memory management 3481 foreach (v; 0 .. 10_000) 3482 func2(arr); 3483 assert(equal(arr[], [72, 0xFE_FEFE, 100])); 3484 3485 auto r2 = U24A(iota(0, 100)); 3486 assert(equal(r2[], iota(0, 100)), text(r2[])); 3487 copy(iota(10, 170, 2), r2[10 .. 90]); 3488 assert(equal(r2[], chain(iota(0, 10), iota(10, 170, 2), iota(90, 100))) 3489 , text(r2[])); 3490 }} 3491 } 3492 3493 pure @safe unittest// core set primitives test 3494 { 3495 import std.conv : text; 3496 alias AllSets = AliasSeq!(InversionList!GcPolicy, InversionList!ReallocPolicy); 3497 foreach (CodeList; AllSets) 3498 { 3499 CodeList a; 3500 //"plug a hole" test 3501 a.add(10, 20).add(25, 30).add(15, 27); 3502 assert(a == CodeList(10, 30), text(a)); 3503 3504 auto x = CodeList.init; 3505 x.add(10, 20).add(30, 40).add(50, 60); 3506 3507 a = x; 3508 a.add(20, 49);//[10, 49) [50, 60) 3509 assert(a == CodeList(10, 49, 50 ,60)); 3510 3511 a = x; 3512 a.add(20, 50); 3513 assert(a == CodeList(10, 60), text(a)); 3514 3515 // simple unions, mostly edge effects 3516 x = CodeList.init; 3517 x.add(10, 20).add(40, 60); 3518 3519 a = x; 3520 a.add(10, 25); //[10, 25) [40, 60) 3521 assert(a == CodeList(10, 25, 40, 60)); 3522 3523 a = x; 3524 a.add(5, 15); //[5, 20) [40, 60) 3525 assert(a == CodeList(5, 20, 40, 60)); 3526 3527 a = x; 3528 a.add(0, 10); // [0, 20) [40, 60) 3529 assert(a == CodeList(0, 20, 40, 60)); 3530 3531 a = x; 3532 a.add(0, 5); // prepand 3533 assert(a == CodeList(0, 5, 10, 20, 40, 60), text(a)); 3534 3535 a = x; 3536 a.add(5, 20); 3537 assert(a == CodeList(5, 20, 40, 60)); 3538 3539 a = x; 3540 a.add(3, 37); 3541 assert(a == CodeList(3, 37, 40, 60)); 3542 3543 a = x; 3544 a.add(37, 65); 3545 assert(a == CodeList(10, 20, 37, 65)); 3546 3547 // some tests on helpers for set intersection 3548 x = CodeList.init.add(10, 20).add(40, 60).add(100, 120); 3549 a = x; 3550 3551 auto m = a.skipUpTo(60); 3552 a.dropUpTo(110, m); 3553 assert(a == CodeList(10, 20, 40, 60, 110, 120), text(a.data[])); 3554 3555 a = x; 3556 a.dropUpTo(100); 3557 assert(a == CodeList(100, 120), text(a.data[])); 3558 3559 a = x; 3560 m = a.skipUpTo(50); 3561 a.dropUpTo(140, m); 3562 assert(a == CodeList(10, 20, 40, 50), text(a.data[])); 3563 a = x; 3564 a.dropUpTo(60); 3565 assert(a == CodeList(100, 120), text(a.data[])); 3566 } 3567 } 3568 3569 3570 //test constructor to work with any order of intervals 3571 pure @safe unittest 3572 { 3573 import std.algorithm.comparison : equal; 3574 import std.conv : text, to; 3575 import std.range : chain, iota; 3576 import std.typecons : tuple; 3577 //ensure constructor handles bad ordering and overlap 3578 auto c1 = CodepointSet('а', 'я'+1, 'А','Я'+1); 3579 foreach (ch; chain(iota('а', 'я'+1), iota('А','Я'+1))) 3580 assert(ch in c1, to!string(ch)); 3581 3582 //contiguos 3583 assert(CodepointSet(1000, 1006, 1006, 1009) 3584 .byInterval.equal([tuple(1000, 1009)])); 3585 //contains 3586 assert(CodepointSet(900, 1200, 1000, 1100) 3587 .byInterval.equal([tuple(900, 1200)])); 3588 //intersect left 3589 assert(CodepointSet(900, 1100, 1000, 1200) 3590 .byInterval.equal([tuple(900, 1200)])); 3591 //intersect right 3592 assert(CodepointSet(1000, 1200, 900, 1100) 3593 .byInterval.equal([tuple(900, 1200)])); 3594 3595 //ditto with extra items at end 3596 assert(CodepointSet(1000, 1200, 900, 1100, 800, 850) 3597 .byInterval.equal([tuple(800, 850), tuple(900, 1200)])); 3598 assert(CodepointSet(900, 1100, 1000, 1200, 800, 850) 3599 .byInterval.equal([tuple(800, 850), tuple(900, 1200)])); 3600 3601 //"plug a hole" test 3602 auto c2 = CodepointSet(20, 40, 3603 60, 80, 100, 140, 150, 200, 3604 40, 60, 80, 100, 140, 150 3605 ); 3606 assert(c2.byInterval.equal([tuple(20, 200)])); 3607 3608 auto c3 = CodepointSet( 3609 20, 40, 60, 80, 100, 140, 150, 200, 3610 0, 10, 15, 100, 10, 20, 200, 220); 3611 assert(c3.byInterval.equal([tuple(0, 140), tuple(150, 220)])); 3612 } 3613 3614 3615 pure @safe unittest 3616 { // full set operations 3617 import std.conv : text; 3618 alias AllSets = AliasSeq!(InversionList!GcPolicy, InversionList!ReallocPolicy); 3619 foreach (CodeList; AllSets) 3620 { 3621 CodeList a, b, c, d; 3622 3623 //"plug a hole" 3624 a.add(20, 40).add(60, 80).add(100, 140).add(150, 200); 3625 b.add(40, 60).add(80, 100).add(140, 150); 3626 c = a | b; 3627 d = b | a; 3628 assert(c == CodeList(20, 200), text(CodeList.stringof," ", c)); 3629 assert(c == d, text(c," vs ", d)); 3630 3631 b = CodeList.init.add(25, 45).add(65, 85).add(95,110).add(150, 210); 3632 c = a | b; //[20,45) [60, 85) [95, 140) [150, 210) 3633 d = b | a; 3634 assert(c == CodeList(20, 45, 60, 85, 95, 140, 150, 210), text(c)); 3635 assert(c == d, text(c," vs ", d)); 3636 3637 b = CodeList.init.add(10, 20).add(30,100).add(145,200); 3638 c = a | b;//[10, 140) [145, 200) 3639 d = b | a; 3640 assert(c == CodeList(10, 140, 145, 200)); 3641 assert(c == d, text(c," vs ", d)); 3642 3643 b = CodeList.init.add(0, 10).add(15, 100).add(10, 20).add(200, 220); 3644 c = a | b;//[0, 140) [150, 220) 3645 d = b | a; 3646 assert(c == CodeList(0, 140, 150, 220)); 3647 assert(c == d, text(c," vs ", d)); 3648 3649 3650 a = CodeList.init.add(20, 40).add(60, 80); 3651 b = CodeList.init.add(25, 35).add(65, 75); 3652 c = a & b; 3653 d = b & a; 3654 assert(c == CodeList(25, 35, 65, 75), text(c)); 3655 assert(c == d, text(c," vs ", d)); 3656 3657 a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200); 3658 b = CodeList.init.add(25, 35).add(65, 75).add(110, 130).add(160, 180); 3659 c = a & b; 3660 d = b & a; 3661 assert(c == CodeList(25, 35, 65, 75, 110, 130, 160, 180), text(c)); 3662 assert(c == d, text(c," vs ", d)); 3663 3664 a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200); 3665 b = CodeList.init.add(10, 30).add(60, 120).add(135, 160); 3666 c = a & b;//[20, 30)[60, 80) [100, 120) [135, 140) [150, 160) 3667 d = b & a; 3668 3669 assert(c == CodeList(20, 30, 60, 80, 100, 120, 135, 140, 150, 160),text(c)); 3670 assert(c == d, text(c, " vs ",d)); 3671 assert((c & a) == c); 3672 assert((d & b) == d); 3673 assert((c & d) == d); 3674 3675 b = CodeList.init.add(40, 60).add(80, 100).add(140, 200); 3676 c = a & b; 3677 d = b & a; 3678 assert(c == CodeList(150, 200), text(c)); 3679 assert(c == d, text(c, " vs ",d)); 3680 assert((c & a) == c); 3681 assert((d & b) == d); 3682 assert((c & d) == d); 3683 3684 assert((a & a) == a); 3685 assert((b & b) == b); 3686 3687 a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200); 3688 b = CodeList.init.add(30, 60).add(75, 120).add(190, 300); 3689 c = a - b;// [30, 40) [60, 75) [120, 140) [150, 190) 3690 d = b - a;// [40, 60) [80, 100) [200, 300) 3691 assert(c == CodeList(20, 30, 60, 75, 120, 140, 150, 190), text(c)); 3692 assert(d == CodeList(40, 60, 80, 100, 200, 300), text(d)); 3693 assert(c - d == c, text(c-d, " vs ", c)); 3694 assert(d - c == d, text(d-c, " vs ", d)); 3695 assert(c - c == CodeList.init); 3696 assert(d - d == CodeList.init); 3697 3698 a = CodeList.init.add(20, 40).add( 60, 80).add(100, 140).add(150, 200); 3699 b = CodeList.init.add(10, 50).add(60, 160).add(190, 300); 3700 c = a - b;// [160, 190) 3701 d = b - a;// [10, 20) [40, 50) [80, 100) [140, 150) [200, 300) 3702 assert(c == CodeList(160, 190), text(c)); 3703 assert(d == CodeList(10, 20, 40, 50, 80, 100, 140, 150, 200, 300), text(d)); 3704 assert(c - d == c, text(c-d, " vs ", c)); 3705 assert(d - c == d, text(d-c, " vs ", d)); 3706 assert(c - c == CodeList.init); 3707 assert(d - d == CodeList.init); 3708 3709 a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200); 3710 b = CodeList.init.add(10, 30).add(45, 100).add(130, 190); 3711 c = a ~ b; // [10, 20) [30, 40) [45, 60) [80, 130) [140, 150) [190, 200) 3712 d = b ~ a; 3713 assert(c == CodeList(10, 20, 30, 40, 45, 60, 80, 130, 140, 150, 190, 200), 3714 text(c)); 3715 assert(c == d, text(c, " vs ", d)); 3716 } 3717 } 3718 3719 } 3720 3721 pure @safe unittest// vs single dchar 3722 { 3723 import std.conv : text; 3724 CodepointSet a = CodepointSet(10, 100, 120, 200); 3725 assert(a - 'A' == CodepointSet(10, 65, 66, 100, 120, 200), text(a - 'A')); 3726 assert((a & 'B') == CodepointSet(66, 67)); 3727 } 3728 3729 pure @safe unittest// iteration & opIndex 3730 { 3731 import std.algorithm.comparison : equal; 3732 import std.conv : text; 3733 import std.typecons : tuple, Tuple; 3734 3735 static foreach (CodeList; AliasSeq!(InversionList!(ReallocPolicy))) 3736 {{ 3737 auto arr = "ABCDEFGHIJKLMabcdefghijklm"d; 3738 auto a = CodeList('A','N','a', 'n'); 3739 assert(equal(a.byInterval, 3740 [tuple(cast(uint)'A', cast(uint)'N'), tuple(cast(uint)'a', cast(uint)'n')] 3741 ), text(a.byInterval)); 3742 3743 // same @@@BUG as in https://issues.dlang.org/show_bug.cgi?id=8949 ? 3744 version (bug8949) 3745 { 3746 import std.range : retro; 3747 assert(equal(retro(a.byInterval), 3748 [tuple(cast(uint)'a', cast(uint)'n'), tuple(cast(uint)'A', cast(uint)'N')] 3749 ), text(retro(a.byInterval))); 3750 } 3751 auto achr = a.byCodepoint; 3752 assert(equal(achr, arr), text(a.byCodepoint)); 3753 foreach (ch; a.byCodepoint) 3754 assert(a[ch]); 3755 auto x = CodeList(100, 500, 600, 900, 1200, 1500); 3756 assert(equal(x.byInterval, [ tuple(100, 500), tuple(600, 900), tuple(1200, 1500)]), text(x.byInterval)); 3757 foreach (ch; x.byCodepoint) 3758 assert(x[ch]); 3759 static if (is(CodeList == CodepointSet)) 3760 { 3761 auto y = CodeList(x.byInterval); 3762 assert(equal(x.byInterval, y.byInterval)); 3763 } 3764 assert(equal(CodepointSet.init.byInterval, cast(Tuple!(uint, uint)[])[])); 3765 assert(equal(CodepointSet.init.byCodepoint, cast(dchar[])[])); 3766 }} 3767 } 3768 3769 //============================================================================ 3770 // Generic Trie template and various ways to build it 3771 //============================================================================ 3772 3773 // debug helper to get a shortened array dump 3774 auto arrayRepr(T)(T x) 3775 { 3776 import std.conv : text; 3777 if (x.length > 32) 3778 { 3779 return text(x[0 .. 16],"~...~", x[x.length-16 .. x.length]); 3780 } 3781 else 3782 return text(x); 3783 } 3784 3785 /** 3786 Maps `Key` to a suitable integer index within the range of `size_t`. 3787 The mapping is constructed by applying predicates from `Prefix` left to right 3788 and concatenating the resulting bits. 3789 3790 The first (leftmost) predicate defines the most significant bits of 3791 the resulting index. 3792 */ 3793 template mapTrieIndex(Prefix...) 3794 { 3795 size_t mapTrieIndex(Key)(Key key) 3796 if (isValidPrefixForTrie!(Key, Prefix)) 3797 { 3798 alias p = Prefix; 3799 size_t idx; 3800 foreach (i, v; p[0..$-1]) 3801 { 3802 idx |= p[i](key); 3803 idx <<= p[i+1].bitSize; 3804 } 3805 idx |= p[$-1](key); 3806 return idx; 3807 } 3808 } 3809 3810 /* 3811 `TrieBuilder` is a type used for incremental construction 3812 of $(LREF Trie)s. 3813 3814 See $(LREF buildTrie) for generic helpers built on top of it. 3815 */ 3816 @trusted private struct TrieBuilder(Value, Key, Args...) 3817 if (isBitPackableType!Value && isValidArgsForTrie!(Key, Args)) 3818 { 3819 import std.exception : enforce; 3820 3821 private: 3822 // last index is not stored in table, it is used as an offset to values in a block. 3823 static if (is(Value == bool))// always pack bool 3824 alias V = BitPacked!(Value, 1); 3825 else 3826 alias V = Value; 3827 static auto deduceMaxIndex(Preds...)() 3828 { 3829 size_t idx = 1; 3830 foreach (v; Preds) 3831 idx *= 2^^v.bitSize; 3832 return idx; 3833 } 3834 3835 static if (is(typeof(Args[0]) : Key)) // Args start with upper bound on Key 3836 { 3837 alias Prefix = Args[1..$]; 3838 enum lastPageSize = 2^^Prefix[$-1].bitSize; 3839 enum translatedMaxIndex = mapTrieIndex!(Prefix)(Args[0]); 3840 enum roughedMaxIndex = 3841 (translatedMaxIndex + lastPageSize-1)/lastPageSize*lastPageSize; 3842 // check warp around - if wrapped, use the default deduction rule 3843 enum maxIndex = roughedMaxIndex < translatedMaxIndex ? 3844 deduceMaxIndex!(Prefix)() : roughedMaxIndex; 3845 } 3846 else 3847 { 3848 alias Prefix = Args; 3849 enum maxIndex = deduceMaxIndex!(Prefix)(); 3850 } 3851 3852 alias getIndex = mapTrieIndex!(Prefix); 3853 3854 enum lastLevel = Prefix.length-1; 3855 struct ConstructState 3856 { 3857 size_t idx_zeros, idx_ones; 3858 } 3859 // iteration over levels of Trie, each indexes its own level and thus a shortened domain 3860 size_t[Prefix.length] indices; 3861 // default filler value to use 3862 Value defValue; 3863 // this is a full-width index of next item 3864 size_t curIndex; 3865 // all-zeros page index, all-ones page index (+ indicator if there is such a page) 3866 ConstructState[Prefix.length] state; 3867 // the table being constructed 3868 MultiArray!(idxTypes!(Key, fullBitSize!(Prefix), Prefix[0..$]), V) table; 3869 3870 @disable this(); 3871 3872 //shortcut for index variable at level 'level' 3873 @property ref idx(size_t level)(){ return indices[level]; } 3874 3875 // this function assumes no holes in the input so 3876 // indices are going one by one 3877 void addValue(size_t level, T)(T val, size_t numVals) 3878 { 3879 alias j = idx!level; 3880 enum pageSize = 1 << Prefix[level].bitSize; 3881 if (numVals == 0) 3882 return; 3883 auto ptr = table.slice!(level); 3884 if (numVals == 1) 3885 { 3886 static if (level == Prefix.length-1) 3887 ptr[j] = val; 3888 else 3889 {// can incur narrowing conversion 3890 assert(j < ptr.length); 3891 ptr[j] = force!(typeof(ptr[j]))(val); 3892 } 3893 j++; 3894 if (j % pageSize == 0) 3895 spillToNextPage!level(ptr); 3896 return; 3897 } 3898 // longer row of values 3899 // get to the next page boundary 3900 immutable nextPB = (j + pageSize) & ~(pageSize-1); 3901 immutable n = nextPB - j;// can fill right in this page 3902 if (numVals < n) //fits in current page 3903 { 3904 ptr[j .. j+numVals] = val; 3905 j += numVals; 3906 return; 3907 } 3908 static if (level != 0)//on the first level it always fits 3909 { 3910 numVals -= n; 3911 //write till the end of current page 3912 ptr[j .. j+n] = val; 3913 j += n; 3914 //spill to the next page 3915 spillToNextPage!level(ptr); 3916 // page at once loop 3917 if (state[level].idx_zeros != size_t.max && val == T.init) 3918 { 3919 alias NextIdx = typeof(table.slice!(level-1)[0]); 3920 addValue!(level-1)(force!NextIdx(state[level].idx_zeros), 3921 numVals/pageSize); 3922 ptr = table.slice!level; //table structure might have changed 3923 numVals %= pageSize; 3924 } 3925 else 3926 { 3927 while (numVals >= pageSize) 3928 { 3929 numVals -= pageSize; 3930 ptr[j .. j+pageSize] = val; 3931 j += pageSize; 3932 spillToNextPage!level(ptr); 3933 } 3934 } 3935 if (numVals) 3936 { 3937 // the leftovers, an incomplete page 3938 ptr[j .. j+numVals] = val; 3939 j += numVals; 3940 } 3941 } 3942 } 3943 3944 void spillToNextPage(size_t level, Slice)(ref Slice ptr) 3945 { 3946 // last level (i.e. topmost) has 1 "page" 3947 // thus it need not to add a new page on upper level 3948 static if (level != 0) 3949 spillToNextPageImpl!(level)(ptr); 3950 } 3951 3952 // this can re-use the current page if duplicate or allocate a new one 3953 // it also makes sure that previous levels point to the correct page in this level 3954 void spillToNextPageImpl(size_t level, Slice)(ref Slice ptr) 3955 { 3956 alias NextIdx = typeof(table.slice!(level-1)[0]); 3957 NextIdx next_lvl_index; 3958 enum pageSize = 1 << Prefix[level].bitSize; 3959 assert(idx!level % pageSize == 0); 3960 immutable last = idx!level-pageSize; 3961 const slice = ptr[idx!level - pageSize .. idx!level]; 3962 size_t j; 3963 for (j=0; j<last; j+=pageSize) 3964 { 3965 if (ptr[j .. j+pageSize] == slice) 3966 { 3967 // get index to it, reuse ptr space for the next block 3968 next_lvl_index = force!NextIdx(j/pageSize); 3969 version (none) 3970 { 3971 import std.stdio : writefln, writeln; 3972 writefln("LEVEL(%s) page mapped idx: %s: 0..%s ---> [%s..%s]" 3973 ,level 3974 ,indices[level-1], pageSize, j, j+pageSize); 3975 writeln("LEVEL(", level 3976 , ") mapped page is: ", slice, ": ", arrayRepr(ptr[j .. j+pageSize])); 3977 writeln("LEVEL(", level 3978 , ") src page is :", ptr, ": ", arrayRepr(slice[0 .. pageSize])); 3979 } 3980 idx!level -= pageSize; // reuse this page, it is duplicate 3981 break; 3982 } 3983 } 3984 if (j == last) 3985 { 3986 L_allocate_page: 3987 next_lvl_index = force!NextIdx(idx!level/pageSize - 1); 3988 if (state[level].idx_zeros == size_t.max && ptr.zeros(j, j+pageSize)) 3989 { 3990 state[level].idx_zeros = next_lvl_index; 3991 } 3992 // allocate next page 3993 version (none) 3994 { 3995 import std.stdio : writefln; 3996 writefln("LEVEL(%s) page allocated: %s" 3997 , level, arrayRepr(slice[0 .. pageSize])); 3998 writefln("LEVEL(%s) index: %s ; page at this index %s" 3999 , level 4000 , next_lvl_index 4001 , arrayRepr( 4002 table.slice!(level) 4003 [pageSize*next_lvl_index..(next_lvl_index+1)*pageSize] 4004 )); 4005 } 4006 table.length!level = table.length!level + pageSize; 4007 } 4008 L_know_index: 4009 // for the previous level, values are indices to the pages in the current level 4010 addValue!(level-1)(next_lvl_index, 1); 4011 ptr = table.slice!level; //re-load the slice after moves 4012 } 4013 4014 // idx - full-width index to fill with v (full-width index != key) 4015 // fills everything in the range of [curIndex, idx) with filler 4016 void putAt(size_t idx, Value v) 4017 { 4018 assert(idx >= curIndex); 4019 immutable numFillers = idx - curIndex; 4020 addValue!lastLevel(defValue, numFillers); 4021 addValue!lastLevel(v, 1); 4022 curIndex = idx + 1; 4023 } 4024 4025 // ditto, but sets the range of [idxA, idxB) to v 4026 void putRangeAt(size_t idxA, size_t idxB, Value v) 4027 { 4028 assert(idxA >= curIndex); 4029 assert(idxB >= idxA); 4030 size_t numFillers = idxA - curIndex; 4031 addValue!lastLevel(defValue, numFillers); 4032 addValue!lastLevel(v, idxB - idxA); 4033 curIndex = idxB; // open-right 4034 } 4035 4036 enum errMsg = "non-monotonic prefix function(s), an unsorted range or "~ 4037 "duplicate key->value mapping"; 4038 4039 public: 4040 /** 4041 Construct a builder, where `filler` is a value 4042 to indicate empty slots (or "not found" condition). 4043 */ 4044 this(Value filler) 4045 { 4046 curIndex = 0; 4047 defValue = filler; 4048 // zeros-page index, ones-page index 4049 foreach (ref v; state) 4050 v = ConstructState(size_t.max, size_t.max); 4051 table = typeof(table)(indices); 4052 // one page per level is a bootstrap minimum 4053 foreach (i, Pred; Prefix) 4054 table.length!i = (1 << Pred.bitSize); 4055 } 4056 4057 /** 4058 Put a value `v` into interval as 4059 mapped by keys from `a` to `b`. 4060 All slots prior to `a` are filled with 4061 the default filler. 4062 */ 4063 void putRange(Key a, Key b, Value v) 4064 { 4065 auto idxA = getIndex(a), idxB = getIndex(b); 4066 // indexes of key should always grow 4067 enforce(idxB >= idxA && idxA >= curIndex, errMsg); 4068 putRangeAt(idxA, idxB, v); 4069 } 4070 4071 /** 4072 Put a value `v` into slot mapped by `key`. 4073 All slots prior to `key` are filled with the 4074 default filler. 4075 */ 4076 void putValue(Key key, Value v) 4077 { 4078 auto idx = getIndex(key); 4079 enforce(idx >= curIndex, errMsg); 4080 putAt(idx, v); 4081 } 4082 4083 /// Finishes construction of Trie, yielding an immutable Trie instance. 4084 auto build() 4085 { 4086 static if (maxIndex != 0) // doesn't cover full range of size_t 4087 { 4088 assert(curIndex <= maxIndex); 4089 addValue!lastLevel(defValue, maxIndex - curIndex); 4090 } 4091 else 4092 { 4093 if (curIndex != 0 // couldn't wrap around 4094 || (Prefix.length != 1 && indices[lastLevel] == 0)) // can be just empty 4095 { 4096 addValue!lastLevel(defValue, size_t.max - curIndex); 4097 addValue!lastLevel(defValue, 1); 4098 } 4099 // else curIndex already completed the full range of size_t by wrapping around 4100 } 4101 return Trie!(V, Key, maxIndex, Prefix)(table); 4102 } 4103 } 4104 4105 /** 4106 $(P A generic Trie data-structure for a fixed number of stages. 4107 The design goal is optimal speed with smallest footprint size. 4108 ) 4109 $(P It's intentionally read-only and doesn't provide constructors. 4110 To construct one use a special builder, 4111 see $(LREF TrieBuilder) and $(LREF buildTrie). 4112 ) 4113 4114 */ 4115 @trusted private struct Trie(Value, Key, Args...) 4116 if (isValidPrefixForTrie!(Key, Args) 4117 || (isValidPrefixForTrie!(Key, Args[1..$]) 4118 && is(typeof(Args[0]) : size_t))) 4119 { 4120 import std.range.primitives : isOutputRange; 4121 static if (is(typeof(Args[0]) : size_t)) 4122 { 4123 private enum maxIndex = Args[0]; 4124 private enum hasBoundsCheck = true; 4125 private alias Prefix = Args[1..$]; 4126 } 4127 else 4128 { 4129 private enum hasBoundsCheck = false; 4130 private alias Prefix = Args; 4131 } 4132 4133 private this()(typeof(_table) table) 4134 { 4135 _table = table; 4136 } 4137 4138 // only for constant Tries constructed from precompiled tables 4139 private this()(const(size_t)[] offsets, const(size_t)[] sizes, 4140 const(size_t)[] data) const 4141 { 4142 _table = typeof(_table)(offsets, sizes, data); 4143 } 4144 4145 /** 4146 $(P Lookup the `key` in this `Trie`. ) 4147 4148 $(P The lookup always succeeds if key fits the domain 4149 provided during construction. The whole domain defined 4150 is covered so instead of not found condition 4151 the sentinel (filler) value could be used. ) 4152 4153 $(P See $(LREF buildTrie), $(LREF TrieBuilder) for how to 4154 define a domain of `Trie` keys and the sentinel value. ) 4155 4156 Note: 4157 Domain range-checking is only enabled in debug builds 4158 and results in assertion failure. 4159 */ 4160 TypeOfBitPacked!Value opIndex()(Key key) const 4161 { 4162 static if (hasBoundsCheck) 4163 assert(mapTrieIndex!Prefix(key) < maxIndex); 4164 size_t idx; 4165 alias p = Prefix; 4166 idx = cast(size_t) p[0](key); 4167 foreach (i, v; p[0..$-1]) 4168 idx = cast(size_t)((_table.ptr!i[idx]<<p[i+1].bitSize) + p[i+1](key)); 4169 return _table.ptr!(p.length-1)[idx]; 4170 } 4171 4172 /// 4173 @property size_t bytes(size_t n=size_t.max)() const 4174 { 4175 return _table.bytes!n; 4176 } 4177 4178 /// 4179 @property size_t pages(size_t n)() const 4180 { 4181 return (bytes!n+2^^(Prefix[n].bitSize-1)) 4182 /2^^Prefix[n].bitSize; 4183 } 4184 4185 /// 4186 void store(OutRange)(scope OutRange sink) const 4187 if (isOutputRange!(OutRange, char)) 4188 { 4189 _table.store(sink); 4190 } 4191 4192 private: 4193 MultiArray!(idxTypes!(Key, fullBitSize!(Prefix), Prefix[0..$]), Value) _table; 4194 } 4195 4196 // create a tuple of 'sliceBits' that slice the 'top' of bits into pieces of sizes 'sizes' 4197 // left-to-right, the most significant bits first 4198 template GetBitSlicing(size_t top, sizes...) 4199 { 4200 static if (sizes.length > 0) 4201 alias GetBitSlicing = 4202 AliasSeq!(sliceBits!(top - sizes[0], top), 4203 GetBitSlicing!(top - sizes[0], sizes[1..$])); 4204 else 4205 alias GetBitSlicing = AliasSeq!(); 4206 } 4207 4208 template callableWith(T) 4209 { 4210 template callableWith(alias Pred) 4211 { 4212 static if (!is(typeof(Pred(T.init)))) 4213 enum callableWith = false; 4214 else 4215 { 4216 alias Result = typeof(Pred(T.init)); 4217 enum callableWith = isBitPackableType!(TypeOfBitPacked!(Result)); 4218 } 4219 } 4220 } 4221 4222 /* 4223 Check if `Prefix` is a valid set of predicates 4224 for `Trie` template having `Key` as the type of keys. 4225 This requires all predicates to be callable, take 4226 single argument of type `Key` and return unsigned value. 4227 */ 4228 template isValidPrefixForTrie(Key, Prefix...) 4229 { 4230 import std.meta : allSatisfy; 4231 enum isValidPrefixForTrie = allSatisfy!(callableWith!Key, Prefix); // TODO: tighten the screws 4232 } 4233 4234 /* 4235 Check if `Args` is a set of maximum key value followed by valid predicates 4236 for `Trie` template having `Key` as the type of keys. 4237 */ 4238 template isValidArgsForTrie(Key, Args...) 4239 { 4240 static if (Args.length > 1) 4241 { 4242 enum isValidArgsForTrie = isValidPrefixForTrie!(Key, Args) 4243 || (isValidPrefixForTrie!(Key, Args[1..$]) && is(typeof(Args[0]) : Key)); 4244 } 4245 else 4246 enum isValidArgsForTrie = isValidPrefixForTrie!Args; 4247 } 4248 4249 @property size_t sumOfIntegerTuple(ints...)() 4250 { 4251 size_t count=0; 4252 foreach (v; ints) 4253 count += v; 4254 return count; 4255 } 4256 4257 /** 4258 A shorthand for creating a custom multi-level fixed Trie 4259 from a `CodepointSet`. `sizes` are numbers of bits per level, 4260 with the most significant bits used first. 4261 4262 Note: The sum of `sizes` must be equal 21. 4263 4264 See_Also: $(LREF toTrie), which is even simpler. 4265 4266 Example: 4267 --- 4268 { 4269 import std.stdio; 4270 auto set = unicode("Number"); 4271 auto trie = codepointSetTrie!(8, 5, 8)(set); 4272 writeln("Input code points to test:"); 4273 foreach (line; stdin.byLine) 4274 { 4275 int count=0; 4276 foreach (dchar ch; line) 4277 if (trie[ch])// is number 4278 count++; 4279 writefln("Contains %d number code points.", count); 4280 } 4281 } 4282 --- 4283 */ 4284 public template codepointSetTrie(sizes...) 4285 if (sumOfIntegerTuple!sizes == 21) 4286 { 4287 auto codepointSetTrie(Set)(Set set) 4288 if (isCodepointSet!Set) 4289 { 4290 auto builder = TrieBuilder!(bool, dchar, lastDchar+1, GetBitSlicing!(21, sizes))(false); 4291 foreach (ival; set.byInterval) 4292 builder.putRange(ival[0], ival[1], true); 4293 return builder.build(); 4294 } 4295 } 4296 4297 /// Type of Trie generated by codepointSetTrie function. 4298 public template CodepointSetTrie(sizes...) 4299 if (sumOfIntegerTuple!sizes == 21) 4300 { 4301 alias Prefix = GetBitSlicing!(21, sizes); 4302 alias CodepointSetTrie = typeof(TrieBuilder!(bool, dchar, lastDchar+1, Prefix)(false).build()); 4303 } 4304 4305 /** 4306 A slightly more general tool for building fixed `Trie` 4307 for the Unicode data. 4308 4309 Specifically unlike `codepointSetTrie` it's allows creating mappings 4310 of `dchar` to an arbitrary type `T`. 4311 4312 Note: Overload taking `CodepointSet`s will naturally convert 4313 only to bool mapping `Trie`s. 4314 4315 CodepointTrie is the type of Trie as generated by codepointTrie function. 4316 */ 4317 public template codepointTrie(T, sizes...) 4318 if (sumOfIntegerTuple!sizes == 21) 4319 { 4320 alias Prefix = GetBitSlicing!(21, sizes); 4321 4322 static if (is(TypeOfBitPacked!T == bool)) 4323 { 4324 auto codepointTrie(Set)(const scope Set set) 4325 if (isCodepointSet!Set) 4326 { 4327 return codepointSetTrie(set); 4328 } 4329 } 4330 4331 /// 4332 auto codepointTrie()(T[dchar] map, T defValue=T.init) 4333 { 4334 return buildTrie!(T, dchar, Prefix)(map, defValue); 4335 } 4336 4337 // unsorted range of pairs 4338 /// 4339 auto codepointTrie(R)(R range, T defValue=T.init) 4340 if (isInputRange!R 4341 && is(typeof(ElementType!R.init[0]) : T) 4342 && is(typeof(ElementType!R.init[1]) : dchar)) 4343 { 4344 // build from unsorted array of pairs 4345 // TODO: expose index sorting functions for Trie 4346 return buildTrie!(T, dchar, Prefix)(range, defValue, true); 4347 } 4348 } 4349 4350 @system pure unittest 4351 { 4352 import std.algorithm.comparison : max; 4353 import std.algorithm.searching : count; 4354 4355 // pick characters from the Greek script 4356 auto set = unicode.Greek; 4357 4358 // a user-defined property (or an expensive function) 4359 // that we want to look up 4360 static uint luckFactor(dchar ch) 4361 { 4362 // here we consider a character lucky 4363 // if its code point has a lot of identical hex-digits 4364 // e.g. arabic letter DDAL (\u0688) has a "luck factor" of 2 4365 ubyte[6] nibbles; // 6 4-bit chunks of code point 4366 uint value = ch; 4367 foreach (i; 0 .. 6) 4368 { 4369 nibbles[i] = value & 0xF; 4370 value >>= 4; 4371 } 4372 uint luck; 4373 foreach (n; nibbles) 4374 luck = cast(uint) max(luck, count(nibbles[], n)); 4375 return luck; 4376 } 4377 4378 // only unsigned built-ins are supported at the moment 4379 alias LuckFactor = BitPacked!(uint, 3); 4380 4381 // create a temporary associative array (AA) 4382 LuckFactor[dchar] map; 4383 foreach (ch; set.byCodepoint) 4384 map[ch] = LuckFactor(luckFactor(ch)); 4385 4386 // bits per stage are chosen randomly, fell free to optimize 4387 auto trie = codepointTrie!(LuckFactor, 8, 5, 8)(map); 4388 4389 // from now on the AA is not needed 4390 foreach (ch; set.byCodepoint) 4391 assert(trie[ch] == luckFactor(ch)); // verify 4392 // CJK is not Greek, thus it has the default value 4393 assert(trie['\u4444'] == 0); 4394 // and here is a couple of quite lucky Greek characters: 4395 // Greek small letter epsilon with dasia 4396 assert(trie['\u1F11'] == 3); 4397 // Ancient Greek metretes sign 4398 assert(trie['\U00010181'] == 3); 4399 4400 } 4401 4402 /// ditto 4403 public template CodepointTrie(T, sizes...) 4404 if (sumOfIntegerTuple!sizes == 21) 4405 { 4406 alias Prefix = GetBitSlicing!(21, sizes); 4407 alias CodepointTrie = typeof(TrieBuilder!(T, dchar, lastDchar+1, Prefix)(T.init).build()); 4408 } 4409 4410 package(std) template cmpK0(alias Pred) 4411 { 4412 import std.typecons : Tuple; 4413 static bool cmpK0(Value, Key) 4414 (Tuple!(Value, Key) a, Tuple!(Value, Key) b) 4415 { 4416 return Pred(a[1]) < Pred(b[1]); 4417 } 4418 } 4419 4420 /** 4421 The most general utility for construction of `Trie`s 4422 short of using `TrieBuilder` directly. 4423 4424 Provides a number of convenience overloads. 4425 `Args` is tuple of maximum key value followed by 4426 predicates to construct index from key. 4427 4428 Alternatively if the first argument is not a value convertible to `Key` 4429 then the whole tuple of `Args` is treated as predicates 4430 and the maximum Key is deduced from predicates. 4431 */ 4432 private template buildTrie(Value, Key, Args...) 4433 if (isValidArgsForTrie!(Key, Args)) 4434 { 4435 static if (is(typeof(Args[0]) : Key)) // prefix starts with upper bound on Key 4436 { 4437 alias Prefix = Args[1..$]; 4438 } 4439 else 4440 alias Prefix = Args; 4441 4442 alias getIndex = mapTrieIndex!(Prefix); 4443 4444 // for multi-sort 4445 template GetComparators(size_t n) 4446 { 4447 static if (n > 0) 4448 alias GetComparators = 4449 AliasSeq!(GetComparators!(n-1), cmpK0!(Prefix[n-1])); 4450 else 4451 alias GetComparators = AliasSeq!(); 4452 } 4453 4454 /* 4455 Build `Trie` from a range of a Key-Value pairs, 4456 assuming it is sorted by Key as defined by the following lambda: 4457 ------ 4458 (a, b) => mapTrieIndex!(Prefix)(a) < mapTrieIndex!(Prefix)(b) 4459 ------ 4460 Exception is thrown if it's detected that the above order doesn't hold. 4461 4462 In other words $(LREF mapTrieIndex) should be a 4463 monotonically increasing function that maps `Key` to an integer. 4464 4465 See_Also: $(REF sort, std,_algorithm), 4466 $(REF SortedRange, std,range), 4467 $(REF setUnion, std,_algorithm). 4468 */ 4469 auto buildTrie(Range)(Range range, Value filler=Value.init) 4470 if (isInputRange!Range && is(typeof(Range.init.front[0]) : Value) 4471 && is(typeof(Range.init.front[1]) : Key)) 4472 { 4473 auto builder = TrieBuilder!(Value, Key, Prefix)(filler); 4474 foreach (v; range) 4475 builder.putValue(v[1], v[0]); 4476 return builder.build(); 4477 } 4478 4479 /* 4480 If `Value` is bool (or BitPacked!(bool, x)) then it's possible 4481 to build `Trie` from a range of open-right intervals of `Key`s. 4482 The requirement on the ordering of keys (and the behavior on the 4483 violation of it) is the same as for Key-Value range overload. 4484 4485 Intervals denote ranges of !`filler` i.e. the opposite of filler. 4486 If no filler provided keys inside of the intervals map to true, 4487 and `filler` is false. 4488 */ 4489 auto buildTrie(Range)(Range range, Value filler=Value.init) 4490 if (is(TypeOfBitPacked!Value == bool) 4491 && isInputRange!Range && is(typeof(Range.init.front[0]) : Key) 4492 && is(typeof(Range.init.front[1]) : Key)) 4493 { 4494 auto builder = TrieBuilder!(Value, Key, Prefix)(filler); 4495 foreach (ival; range) 4496 builder.putRange(ival[0], ival[1], !filler); 4497 return builder.build(); 4498 } 4499 4500 auto buildTrie(Range)(Range range, Value filler, bool unsorted) 4501 if (isInputRange!Range 4502 && is(typeof(Range.init.front[0]) : Value) 4503 && is(typeof(Range.init.front[1]) : Key)) 4504 { 4505 import std.algorithm.sorting : multiSort; 4506 alias Comps = GetComparators!(Prefix.length); 4507 if (unsorted) 4508 multiSort!(Comps)(range); 4509 return buildTrie(range, filler); 4510 } 4511 4512 /* 4513 If `Value` is bool (or BitPacked!(bool, x)) then it's possible 4514 to build `Trie` simply from an input range of `Key`s. 4515 The requirement on the ordering of keys (and the behavior on the 4516 violation of it) is the same as for Key-Value range overload. 4517 4518 Keys found in range denote !`filler` i.e. the opposite of filler. 4519 If no filler provided keys map to true, and `filler` is false. 4520 */ 4521 auto buildTrie(Range)(Range range, Value filler=Value.init) 4522 if (is(TypeOfBitPacked!Value == bool) 4523 && isInputRange!Range && is(typeof(Range.init.front) : Key)) 4524 { 4525 auto builder = TrieBuilder!(Value, Key, Prefix)(filler); 4526 foreach (v; range) 4527 builder.putValue(v, !filler); 4528 return builder.build(); 4529 } 4530 4531 /* 4532 If `Key` is unsigned integer `Trie` could be constructed from array 4533 of values where array index serves as key. 4534 */ 4535 auto buildTrie()(Value[] array, Value filler=Value.init) 4536 if (isUnsigned!Key) 4537 { 4538 auto builder = TrieBuilder!(Value, Key, Prefix)(filler); 4539 foreach (idx, v; array) 4540 builder.putValue(idx, v); 4541 return builder.build(); 4542 } 4543 4544 /* 4545 Builds `Trie` from associative array. 4546 */ 4547 auto buildTrie(Key, Value)(Value[Key] map, Value filler=Value.init) 4548 { 4549 import std.array : array; 4550 import std.range : zip; 4551 auto range = array(zip(map.values, map.keys)); 4552 return buildTrie(range, filler, true); // sort it 4553 } 4554 } 4555 4556 // helper in place of assumeSize to 4557 //reduce mangled name & help DMD inline Trie functors 4558 struct clamp(size_t bits) 4559 { 4560 static size_t opCall(T)(T arg){ return arg; } 4561 enum bitSize = bits; 4562 } 4563 4564 struct clampIdx(size_t idx, size_t bits) 4565 { 4566 static size_t opCall(T)(T arg){ return arg[idx]; } 4567 enum bitSize = bits; 4568 } 4569 4570 /** 4571 Conceptual type that outlines the common properties of all UTF Matchers. 4572 4573 Note: For illustration purposes only, every method 4574 call results in assertion failure. 4575 Use $(LREF utfMatcher) to obtain a concrete matcher 4576 for UTF-8 or UTF-16 encodings. 4577 */ 4578 public struct MatcherConcept 4579 { 4580 /** 4581 $(P Perform a semantic equivalent 2 operations: 4582 decoding a $(CODEPOINT) at front of `inp` and testing if 4583 it belongs to the set of $(CODEPOINTS) of this matcher. ) 4584 4585 $(P The effect on `inp` depends on the kind of function called:) 4586 4587 $(P Match. If the codepoint is found in the set then range `inp` 4588 is advanced by its size in $(S_LINK Code unit, code units), 4589 otherwise the range is not modifed.) 4590 4591 $(P Skip. The range is always advanced by the size 4592 of the tested $(CODEPOINT) regardless of the result of test.) 4593 4594 $(P Test. The range is left unaffected regardless 4595 of the result of test.) 4596 */ 4597 public bool match(Range)(ref Range inp) 4598 if (isRandomAccessRange!Range && is(ElementType!Range : char)) 4599 { 4600 assert(false); 4601 } 4602 4603 ///ditto 4604 public bool skip(Range)(ref Range inp) 4605 if (isRandomAccessRange!Range && is(ElementType!Range : char)) 4606 { 4607 assert(false); 4608 } 4609 4610 ///ditto 4611 public bool test(Range)(ref Range inp) 4612 if (isRandomAccessRange!Range && is(ElementType!Range : char)) 4613 { 4614 assert(false); 4615 } 4616 /// 4617 pure @safe unittest 4618 { 4619 string truth = "2² = 4"; 4620 auto m = utfMatcher!char(unicode.Number); 4621 assert(m.match(truth)); // '2' is a number all right 4622 assert(truth == "² = 4"); // skips on match 4623 assert(m.match(truth)); // so is the superscript '2' 4624 assert(!m.match(truth)); // space is not a number 4625 assert(truth == " = 4"); // unaffected on no match 4626 assert(!m.skip(truth)); // same test ... 4627 assert(truth == "= 4"); // but skips a codepoint regardless 4628 assert(!m.test(truth)); // '=' is not a number 4629 assert(truth == "= 4"); // test never affects argument 4630 } 4631 4632 /** 4633 Advanced feature - provide direct access to a subset of matcher based a 4634 set of known encoding lengths. Lengths are provided in 4635 $(S_LINK Code unit, code units). The sub-matcher then may do less 4636 operations per any `test`/`match`. 4637 4638 Use with care as the sub-matcher won't match 4639 any $(CODEPOINTS) that have encoded length that doesn't belong 4640 to the selected set of lengths. Also the sub-matcher object references 4641 the parent matcher and must not be used past the liftetime 4642 of the latter. 4643 4644 Another caveat of using sub-matcher is that skip is not available 4645 preciesly because sub-matcher doesn't detect all lengths. 4646 */ 4647 @property auto subMatcher(Lengths...)() 4648 { 4649 assert(0); 4650 return this; 4651 } 4652 4653 pure @safe unittest 4654 { 4655 auto m = utfMatcher!char(unicode.Number); 4656 string square = "2²"; 4657 // about sub-matchers 4658 assert(!m.subMatcher!(2,3,4).test(square)); // ASCII no covered 4659 assert(m.subMatcher!1.match(square)); // ASCII-only, works 4660 assert(!m.subMatcher!1.test(square)); // unicode '²' 4661 assert(m.subMatcher!(2,3,4).match(square)); // 4662 assert(square == ""); 4663 wstring wsquare = "2²"; 4664 auto m16 = utfMatcher!wchar(unicode.Number); 4665 // may keep ref, but the orignal (m16) must be kept alive 4666 auto bmp = m16.subMatcher!1; 4667 assert(bmp.match(wsquare)); // Okay, in basic multilingual plan 4668 assert(bmp.match(wsquare)); // And '²' too 4669 } 4670 } 4671 4672 /** 4673 Test if `M` is an UTF Matcher for ranges of `Char`. 4674 */ 4675 public enum isUtfMatcher(M, C) = __traits(compiles, (){ 4676 C[] s; 4677 auto d = s.decoder; 4678 M m; 4679 assert(is(typeof(m.match(d)) == bool)); 4680 assert(is(typeof(m.test(d)) == bool)); 4681 static if (is(typeof(m.skip(d)))) 4682 { 4683 assert(is(typeof(m.skip(d)) == bool)); 4684 assert(is(typeof(m.skip(s)) == bool)); 4685 } 4686 assert(is(typeof(m.match(s)) == bool)); 4687 assert(is(typeof(m.test(s)) == bool)); 4688 }); 4689 4690 pure @safe unittest 4691 { 4692 alias CharMatcher = typeof(utfMatcher!char(CodepointSet.init)); 4693 alias WcharMatcher = typeof(utfMatcher!wchar(CodepointSet.init)); 4694 static assert(isUtfMatcher!(CharMatcher, char)); 4695 static assert(isUtfMatcher!(CharMatcher, immutable(char))); 4696 static assert(isUtfMatcher!(WcharMatcher, wchar)); 4697 static assert(isUtfMatcher!(WcharMatcher, immutable(wchar))); 4698 } 4699 4700 enum Mode { 4701 alwaysSkip, 4702 neverSkip, 4703 skipOnMatch 4704 } 4705 4706 mixin template ForwardStrings() 4707 { 4708 private bool fwdStr(string fn, C)(ref C[] str) const @trusted 4709 { 4710 import std.utf : byCodeUnit; 4711 alias type = typeof(byCodeUnit(str)); 4712 return mixin(fn~"(*cast(type*)&str)"); 4713 } 4714 } 4715 4716 template Utf8Matcher() 4717 { 4718 enum validSize(int sz) = sz >= 1 && sz <= 4; 4719 4720 void badEncoding() pure @safe 4721 { 4722 import std.utf : UTFException; 4723 throw new UTFException("Invalid UTF-8 sequence"); 4724 } 4725 4726 //for 1-stage ASCII 4727 alias AsciiSpec = AliasSeq!(bool, char, clamp!7); 4728 //for 2-stage lookup of 2 byte UTF-8 sequences 4729 alias Utf8Spec2 = AliasSeq!(bool, char[2], 4730 clampIdx!(0, 5), clampIdx!(1, 6)); 4731 //ditto for 3 byte 4732 alias Utf8Spec3 = AliasSeq!(bool, char[3], 4733 clampIdx!(0, 4), 4734 clampIdx!(1, 6), 4735 clampIdx!(2, 6) 4736 ); 4737 //ditto for 4 byte 4738 alias Utf8Spec4 = AliasSeq!(bool, char[4], 4739 clampIdx!(0, 3), clampIdx!(1, 6), 4740 clampIdx!(2, 6), clampIdx!(3, 6) 4741 ); 4742 alias Tables = AliasSeq!( 4743 typeof(TrieBuilder!(AsciiSpec)(false).build()), 4744 typeof(TrieBuilder!(Utf8Spec2)(false).build()), 4745 typeof(TrieBuilder!(Utf8Spec3)(false).build()), 4746 typeof(TrieBuilder!(Utf8Spec4)(false).build()) 4747 ); 4748 alias Table(int size) = Tables[size-1]; 4749 4750 enum leadMask(size_t size) = (cast(size_t) 1<<(7 - size))-1; 4751 enum encMask(size_t size) = ((1 << size)-1)<<(8-size); 4752 4753 char truncate()(char ch) pure @safe 4754 { 4755 ch -= 0x80; 4756 if (ch < 0x40) 4757 { 4758 return ch; 4759 } 4760 else 4761 { 4762 badEncoding(); 4763 return cast(char) 0; 4764 } 4765 } 4766 4767 static auto encode(size_t sz)(dchar ch) 4768 if (sz > 1) 4769 { 4770 import std.utf : encodeUTF = encode; 4771 char[4] buf; 4772 encodeUTF(buf, ch); 4773 char[sz] ret; 4774 buf[0] &= leadMask!sz; 4775 foreach (n; 1 .. sz) 4776 buf[n] = buf[n] & 0x3f; //keep 6 lower bits 4777 ret[] = buf[0 .. sz]; 4778 return ret; 4779 } 4780 4781 auto build(Set)(Set set) 4782 { 4783 import std.algorithm.iteration : map; 4784 auto ascii = set & unicode.ASCII; 4785 auto utf8_2 = set & CodepointSet(0x80, 0x800); 4786 auto utf8_3 = set & CodepointSet(0x800, 0x1_0000); 4787 auto utf8_4 = set & CodepointSet(0x1_0000, lastDchar+1); 4788 auto asciiT = ascii.byCodepoint.map!(x=>cast(char) x).buildTrie!(AsciiSpec); 4789 auto utf8_2T = utf8_2.byCodepoint.map!(x=>encode!2(x)).buildTrie!(Utf8Spec2); 4790 auto utf8_3T = utf8_3.byCodepoint.map!(x=>encode!3(x)).buildTrie!(Utf8Spec3); 4791 auto utf8_4T = utf8_4.byCodepoint.map!(x=>encode!4(x)).buildTrie!(Utf8Spec4); 4792 alias Ret = Impl!(1,2,3,4); 4793 return Ret(asciiT, utf8_2T, utf8_3T, utf8_4T); 4794 } 4795 4796 // Bootstrap UTF-8 static matcher interface 4797 // from 3 primitives: tab!(size), lookup and Sizes 4798 mixin template DefMatcher() 4799 { 4800 import std.format : format; 4801 import std.meta : Erase, staticIndexOf; 4802 enum hasASCII = staticIndexOf!(1, Sizes) >= 0; 4803 alias UniSizes = Erase!(1, Sizes); 4804 4805 //generate dispatch code sequence for unicode parts 4806 static auto genDispatch() 4807 { 4808 string code; 4809 foreach (size; UniSizes) 4810 code ~= format(q{ 4811 if ((ch & ~leadMask!%d) == encMask!(%d)) 4812 return lookup!(%d, mode)(inp); 4813 else 4814 }, size, size, size); 4815 static if (Sizes.length == 4) //covers all code unit cases 4816 code ~= "{ badEncoding(); return false; }"; 4817 else 4818 code ~= "return false;"; //may be just fine but not covered 4819 return code; 4820 } 4821 enum dispatch = genDispatch(); 4822 4823 public bool match(Range)(ref Range inp) const 4824 if (isRandomAccessRange!Range && is(ElementType!Range : char) && 4825 !isDynamicArray!Range) 4826 { 4827 enum mode = Mode.skipOnMatch; 4828 assert(!inp.empty); 4829 immutable ch = inp[0]; 4830 static if (hasASCII) 4831 { 4832 if (ch < 0x80) 4833 { 4834 immutable r = tab!1[ch]; 4835 if (r) 4836 inp.popFront(); 4837 return r; 4838 } 4839 else 4840 mixin(dispatch); 4841 } 4842 else 4843 mixin(dispatch); 4844 } 4845 4846 static if (Sizes.length == 4) // can skip iff can detect all encodings 4847 { 4848 public bool skip(Range)(ref Range inp) const 4849 if (isRandomAccessRange!Range && is(ElementType!Range : char) && 4850 !isDynamicArray!Range) 4851 { 4852 enum mode = Mode.alwaysSkip; 4853 assert(!inp.empty); 4854 auto ch = inp[0]; 4855 static if (hasASCII) 4856 { 4857 if (ch < 0x80) 4858 { 4859 inp.popFront(); 4860 return tab!1[ch]; 4861 } 4862 else 4863 mixin(dispatch); 4864 } 4865 else 4866 mixin(dispatch); 4867 } 4868 } 4869 4870 public bool test(Range)(ref Range inp) const 4871 if (isRandomAccessRange!Range && is(ElementType!Range : char) && 4872 !isDynamicArray!Range) 4873 { 4874 enum mode = Mode.neverSkip; 4875 assert(!inp.empty); 4876 auto ch = inp[0]; 4877 4878 static if (hasASCII) 4879 { 4880 if (ch < 0x80) 4881 return tab!1[ch]; 4882 else 4883 mixin(dispatch); 4884 } 4885 else 4886 mixin(dispatch); 4887 } 4888 4889 bool match(C)(ref C[] str) const 4890 if (isSomeChar!C) 4891 { 4892 return fwdStr!"match"(str); 4893 } 4894 4895 bool skip(C)(ref C[] str) const 4896 if (isSomeChar!C) 4897 { 4898 return fwdStr!"skip"(str); 4899 } 4900 4901 bool test(C)(ref C[] str) const 4902 if (isSomeChar!C) 4903 { 4904 return fwdStr!"test"(str); 4905 } 4906 4907 mixin ForwardStrings; 4908 } 4909 4910 struct Impl(Sizes...) 4911 { 4912 import std.meta : allSatisfy, staticMap; 4913 static assert(allSatisfy!(validSize, Sizes), 4914 "Only lengths of 1, 2, 3 and 4 code unit are possible for UTF-8"); 4915 private: 4916 //pick tables for chosen sizes 4917 alias OurTabs = staticMap!(Table, Sizes); 4918 OurTabs tables; 4919 mixin DefMatcher; 4920 //static disptach helper UTF size ==> table 4921 alias tab(int i) = tables[i - 1]; 4922 4923 package(std) @property CherryPick!(Impl, SizesToPick) subMatcher(SizesToPick...)() 4924 { 4925 return CherryPick!(Impl, SizesToPick)(&this); 4926 } 4927 4928 bool lookup(int size, Mode mode, Range)(ref Range inp) const 4929 { 4930 import std.range : popFrontN; 4931 if (inp.length < size) 4932 { 4933 badEncoding(); 4934 return false; 4935 } 4936 char[size] needle = void; 4937 needle[0] = leadMask!size & inp[0]; 4938 static foreach (i; 1 .. size) 4939 { 4940 needle[i] = truncate(inp[i]); 4941 } 4942 //overlong encoding checks 4943 static if (size == 2) 4944 { 4945 //0x80-0x7FF 4946 //got 6 bits in needle[1], must use at least 8 bits 4947 //must use at least 2 bits in needle[1] 4948 if (needle[0] < 2) badEncoding(); 4949 } 4950 else static if (size == 3) 4951 { 4952 //0x800-0xFFFF 4953 //got 6 bits in needle[2], must use at least 12bits 4954 //must use 6 bits in needle[1] or anything in needle[0] 4955 if (needle[0] == 0 && needle[1] < 0x20) badEncoding(); 4956 } 4957 else static if (size == 4) 4958 { 4959 //0x800-0xFFFF 4960 //got 2x6=12 bits in needle[2 .. 3] must use at least 17bits 4961 //must use 5 bits (or above) in needle[1] or anything in needle[0] 4962 if (needle[0] == 0 && needle[1] < 0x10) badEncoding(); 4963 } 4964 static if (mode == Mode.alwaysSkip) 4965 { 4966 inp.popFrontN(size); 4967 return tab!size[needle]; 4968 } 4969 else static if (mode == Mode.neverSkip) 4970 { 4971 return tab!size[needle]; 4972 } 4973 else 4974 { 4975 static assert(mode == Mode.skipOnMatch); 4976 4977 if (tab!size[needle]) 4978 { 4979 inp.popFrontN(size); 4980 return true; 4981 } 4982 else 4983 return false; 4984 } 4985 } 4986 } 4987 4988 struct CherryPick(I, Sizes...) 4989 { 4990 import std.meta : allSatisfy; 4991 static assert(allSatisfy!(validSize, Sizes), 4992 "Only lengths of 1, 2, 3 and 4 code unit are possible for UTF-8"); 4993 private: 4994 I* m; 4995 @property auto tab(int i)() const { return m.tables[i - 1]; } 4996 bool lookup(int size, Mode mode, Range)(ref Range inp) const 4997 { 4998 return m.lookup!(size, mode)(inp); 4999 } 5000 mixin DefMatcher; 5001 } 5002 } 5003 5004 template Utf16Matcher() 5005 { 5006 enum validSize(int sz) = sz >= 1 && sz <= 2; 5007 5008 void badEncoding() pure @safe 5009 { 5010 import std.utf : UTFException; 5011 throw new UTFException("Invalid UTF-16 sequence"); 5012 } 5013 5014 // 1-stage ASCII 5015 alias AsciiSpec = AliasSeq!(bool, wchar, clamp!7); 5016 //2-stage BMP 5017 alias BmpSpec = AliasSeq!(bool, wchar, sliceBits!(7, 16), sliceBits!(0, 7)); 5018 //4-stage - full Unicode 5019 //assume that 0xD800 & 0xDC00 bits are cleared 5020 //thus leaving 10 bit per wchar to worry about 5021 alias UniSpec = AliasSeq!(bool, wchar[2], 5022 assumeSize!(x=>x[0]>>4, 6), assumeSize!(x=>x[0]&0xf, 4), 5023 assumeSize!(x=>x[1]>>6, 4), assumeSize!(x=>x[1]&0x3f, 6), 5024 ); 5025 alias Ascii = typeof(TrieBuilder!(AsciiSpec)(false).build()); 5026 alias Bmp = typeof(TrieBuilder!(BmpSpec)(false).build()); 5027 alias Uni = typeof(TrieBuilder!(UniSpec)(false).build()); 5028 5029 auto encode2(dchar ch) 5030 { 5031 ch -= 0x1_0000; 5032 assert(ch <= 0xF_FFFF); 5033 wchar[2] ret; 5034 //do not put surrogate bits, they are sliced off 5035 ret[0] = cast(wchar)(ch >> 10); 5036 ret[1] = (ch & 0xFFF); 5037 return ret; 5038 } 5039 5040 auto build(Set)(Set set) 5041 { 5042 import std.algorithm.iteration : map; 5043 auto ascii = set & unicode.ASCII; 5044 auto bmp = (set & CodepointSet.fromIntervals(0x80, 0xFFFF+1)) 5045 - CodepointSet.fromIntervals(0xD800, 0xDFFF+1); 5046 auto other = set - (bmp | ascii); 5047 auto asciiT = ascii.byCodepoint.map!(x=>cast(char) x).buildTrie!(AsciiSpec); 5048 auto bmpT = bmp.byCodepoint.map!(x=>cast(wchar) x).buildTrie!(BmpSpec); 5049 auto otherT = other.byCodepoint.map!(x=>encode2(x)).buildTrie!(UniSpec); 5050 alias Ret = Impl!(1,2); 5051 return Ret(asciiT, bmpT, otherT); 5052 } 5053 5054 //bootstrap full UTF-16 matcher interace from 5055 //sizeFlags, lookupUni and ascii 5056 mixin template DefMatcher() 5057 { 5058 public bool match(Range)(ref Range inp) const 5059 if (isRandomAccessRange!Range && is(ElementType!Range : wchar) && 5060 !isDynamicArray!Range) 5061 { 5062 enum mode = Mode.skipOnMatch; 5063 assert(!inp.empty); 5064 immutable ch = inp[0]; 5065 static if (sizeFlags & 1) 5066 { 5067 if (ch < 0x80) 5068 { 5069 if (ascii[ch]) 5070 { 5071 inp.popFront(); 5072 return true; 5073 } 5074 else 5075 return false; 5076 } 5077 return lookupUni!mode(inp); 5078 } 5079 else 5080 return lookupUni!mode(inp); 5081 } 5082 5083 static if (Sizes.length == 2) 5084 { 5085 public bool skip(Range)(ref Range inp) const 5086 if (isRandomAccessRange!Range && is(ElementType!Range : wchar) && 5087 !isDynamicArray!Range) 5088 { 5089 enum mode = Mode.alwaysSkip; 5090 assert(!inp.empty); 5091 immutable ch = inp[0]; 5092 static if (sizeFlags & 1) 5093 { 5094 if (ch < 0x80) 5095 { 5096 inp.popFront(); 5097 return ascii[ch]; 5098 } 5099 else 5100 return lookupUni!mode(inp); 5101 } 5102 else 5103 return lookupUni!mode(inp); 5104 } 5105 } 5106 5107 public bool test(Range)(ref Range inp) const 5108 if (isRandomAccessRange!Range && is(ElementType!Range : wchar) && 5109 !isDynamicArray!Range) 5110 { 5111 enum mode = Mode.neverSkip; 5112 assert(!inp.empty); 5113 auto ch = inp[0]; 5114 static if (sizeFlags & 1) 5115 return ch < 0x80 ? ascii[ch] : lookupUni!mode(inp); 5116 else 5117 return lookupUni!mode(inp); 5118 } 5119 5120 bool match(C)(ref C[] str) const 5121 if (isSomeChar!C) 5122 { 5123 return fwdStr!"match"(str); 5124 } 5125 5126 bool skip(C)(ref C[] str) const 5127 if (isSomeChar!C) 5128 { 5129 return fwdStr!"skip"(str); 5130 } 5131 5132 bool test(C)(ref C[] str) const 5133 if (isSomeChar!C) 5134 { 5135 return fwdStr!"test"(str); 5136 } 5137 5138 mixin ForwardStrings; //dispatch strings to range versions 5139 } 5140 5141 struct Impl(Sizes...) 5142 if (Sizes.length >= 1 && Sizes.length <= 2) 5143 { 5144 private: 5145 import std.meta : allSatisfy; 5146 static assert(allSatisfy!(validSize, Sizes), 5147 "Only lengths of 1 and 2 code units are possible in UTF-16"); 5148 static if (Sizes.length > 1) 5149 enum sizeFlags = Sizes[0] | Sizes[1]; 5150 else 5151 enum sizeFlags = Sizes[0]; 5152 5153 static if (sizeFlags & 1) 5154 { 5155 Ascii ascii; 5156 Bmp bmp; 5157 } 5158 static if (sizeFlags & 2) 5159 { 5160 Uni uni; 5161 } 5162 mixin DefMatcher; 5163 5164 package(std) @property CherryPick!(Impl, SizesToPick) subMatcher(SizesToPick...)() 5165 { 5166 return CherryPick!(Impl, SizesToPick)(&this); 5167 } 5168 5169 bool lookupUni(Mode mode, Range)(ref Range inp) const 5170 { 5171 wchar x = cast(wchar)(inp[0] - 0xD800); 5172 //not a high surrogate 5173 if (x > 0x3FF) 5174 { 5175 //low surrogate 5176 if (x <= 0x7FF) badEncoding(); 5177 static if (sizeFlags & 1) 5178 { 5179 auto ch = inp[0]; 5180 static if (mode == Mode.alwaysSkip) 5181 inp.popFront(); 5182 static if (mode == Mode.skipOnMatch) 5183 { 5184 if (bmp[ch]) 5185 { 5186 inp.popFront(); 5187 return true; 5188 } 5189 else 5190 return false; 5191 } 5192 else 5193 return bmp[ch]; 5194 } 5195 else //skip is not available for sub-matchers, so just false 5196 return false; 5197 } 5198 else 5199 { 5200 import std.range : popFrontN; 5201 static if (sizeFlags & 2) 5202 { 5203 if (inp.length < 2) 5204 badEncoding(); 5205 wchar y = cast(wchar)(inp[1] - 0xDC00); 5206 //not a low surrogate 5207 if (y > 0x3FF) 5208 badEncoding(); 5209 wchar[2] needle = [inp[0] & 0x3ff, inp[1] & 0x3ff]; 5210 static if (mode == Mode.alwaysSkip) 5211 inp.popFrontN(2); 5212 static if (mode == Mode.skipOnMatch) 5213 { 5214 if (uni[needle]) 5215 { 5216 inp.popFrontN(2); 5217 return true; 5218 } 5219 else 5220 return false; 5221 } 5222 else 5223 return uni[needle]; 5224 } 5225 else //ditto 5226 return false; 5227 } 5228 } 5229 } 5230 5231 struct CherryPick(I, Sizes...) 5232 if (Sizes.length >= 1 && Sizes.length <= 2) 5233 { 5234 private: 5235 import std.meta : allSatisfy; 5236 I* m; 5237 enum sizeFlags = I.sizeFlags; 5238 5239 static if (sizeFlags & 1) 5240 { 5241 @property auto ascii()() const { return m.ascii; } 5242 } 5243 5244 bool lookupUni(Mode mode, Range)(ref Range inp) const 5245 { 5246 return m.lookupUni!mode(inp); 5247 } 5248 mixin DefMatcher; 5249 static assert(allSatisfy!(validSize, Sizes), 5250 "Only lengths of 1 and 2 code units are possible in UTF-16"); 5251 } 5252 } 5253 5254 private auto utf8Matcher(Set)(Set set) 5255 { 5256 return Utf8Matcher!().build(set); 5257 } 5258 5259 private auto utf16Matcher(Set)(Set set) 5260 { 5261 return Utf16Matcher!().build(set); 5262 } 5263 5264 /** 5265 Constructs a matcher object 5266 to classify $(CODEPOINTS) from the `set` for encoding 5267 that has `Char` as code unit. 5268 5269 See $(LREF MatcherConcept) for API outline. 5270 */ 5271 public auto utfMatcher(Char, Set)(Set set) 5272 if (isCodepointSet!Set) 5273 { 5274 static if (is(Char : char)) 5275 return utf8Matcher(set); 5276 else static if (is(Char : wchar)) 5277 return utf16Matcher(set); 5278 else static if (is(Char : dchar)) 5279 static assert(false, "UTF-32 needs no decoding, 5280 and thus not supported by utfMatcher"); 5281 else 5282 static assert(false, "Only character types 'char' and 'wchar' are allowed"); 5283 } 5284 5285 5286 //a range of code units, packed with index to speed up forward iteration 5287 package(std) auto decoder(C)(C[] s, size_t offset=0) 5288 if (is(C : wchar) || is(C : char)) 5289 { 5290 static struct Decoder 5291 { 5292 pure nothrow: 5293 C[] str; 5294 size_t idx; 5295 @property C front(){ return str[idx]; } 5296 @property C back(){ return str[$-1]; } 5297 void popFront(){ idx++; } 5298 void popBack(){ str = str[0..$-1]; } 5299 void popFrontN(size_t n){ idx += n; } 5300 @property bool empty(){ return idx == str.length; } 5301 @property auto save(){ return this; } 5302 auto opIndex(size_t i){ return str[idx+i]; } 5303 @property size_t length(){ return str.length - idx; } 5304 alias opDollar = length; 5305 auto opSlice(size_t a, size_t b){ return Decoder(str[0 .. idx+b], idx+a); } 5306 } 5307 static assert(isRandomAccessRange!Decoder); 5308 static assert(is(ElementType!Decoder : C)); 5309 return Decoder(s, offset); 5310 } 5311 5312 pure @safe unittest 5313 { 5314 string rs = "hi! ネемног砀 текста"; 5315 auto codec = rs.decoder; 5316 auto utf8 = utf8Matcher(unicode.Letter); 5317 auto asc = utf8.subMatcher!(1); 5318 auto uni = utf8.subMatcher!(2,3,4); 5319 5320 // h 5321 assert(asc.test(codec)); 5322 assert(!uni.match(codec)); 5323 assert(utf8.skip(codec)); 5324 assert(codec.idx == 1); 5325 5326 // i 5327 assert(asc.test(codec)); 5328 assert(!uni.match(codec)); 5329 assert(utf8.skip(codec)); 5330 assert(codec.idx == 2); 5331 5332 // ! 5333 assert(!asc.match(codec)); 5334 assert(!utf8.test(codec)); 5335 assert(!utf8.skip(codec)); 5336 assert(codec.idx == 3); 5337 5338 // space 5339 assert(!asc.test(codec)); 5340 assert(!utf8.test(codec)); 5341 assert(!utf8.skip(codec)); 5342 assert(codec.idx == 4); 5343 5344 assert(utf8.test(codec)); 5345 foreach (i; 0 .. 7) 5346 { 5347 assert(!asc.test(codec)); 5348 assert(uni.test(codec)); 5349 assert(utf8.skip(codec)); 5350 } 5351 assert(!utf8.test(codec)); 5352 assert(!utf8.skip(codec)); 5353 5354 //the same with match where applicable 5355 codec = rs.decoder; 5356 assert(utf8.match(codec)); 5357 assert(codec.idx == 1); 5358 assert(utf8.match(codec)); 5359 assert(codec.idx == 2); 5360 assert(!utf8.match(codec)); 5361 assert(codec.idx == 2); 5362 assert(!utf8.skip(codec)); 5363 assert(!utf8.skip(codec)); 5364 5365 foreach (i; 0 .. 7) 5366 { 5367 assert(!asc.test(codec)); 5368 assert(utf8.test(codec)); 5369 assert(utf8.match(codec)); 5370 } 5371 auto i = codec.idx; 5372 assert(!utf8.match(codec)); 5373 assert(codec.idx == i); 5374 } 5375 5376 pure @system unittest 5377 { 5378 import std.range : stride; 5379 static bool testAll(Matcher, Range)(ref Matcher m, ref Range r) @safe 5380 { 5381 bool t = m.test(r); 5382 auto save = r.idx; 5383 assert(t == m.match(r)); 5384 assert(r.idx == save || t); //ether no change or was match 5385 r.idx = save; 5386 static if (is(typeof(m.skip(r)))) 5387 { 5388 assert(t == m.skip(r)); 5389 assert(r.idx != save); //always changed 5390 r.idx = save; 5391 } 5392 return t; 5393 } 5394 auto utf16 = utfMatcher!wchar(unicode.L); 5395 auto bmp = utf16.subMatcher!1; 5396 auto nonBmp = utf16.subMatcher!1; 5397 auto utf8 = utfMatcher!char(unicode.L); 5398 auto ascii = utf8.subMatcher!1; 5399 auto uni2 = utf8.subMatcher!2; 5400 auto uni3 = utf8.subMatcher!3; 5401 auto uni24 = utf8.subMatcher!(2,4); 5402 foreach (ch; unicode.L.byCodepoint.stride(3)) 5403 { 5404 import std.utf : encode; 5405 char[4] buf; 5406 wchar[2] buf16; 5407 auto len = encode(buf, ch); 5408 auto len16 = encode(buf16, ch); 5409 auto c8 = buf[0 .. len].decoder; 5410 auto c16 = buf16[0 .. len16].decoder; 5411 assert(testAll(utf16, c16)); 5412 assert(testAll(bmp, c16) || len16 != 1); 5413 assert(testAll(nonBmp, c16) || len16 != 2); 5414 5415 assert(testAll(utf8, c8)); 5416 5417 //submatchers return false on out of their domain 5418 assert(testAll(ascii, c8) || len != 1); 5419 assert(testAll(uni2, c8) || len != 2); 5420 assert(testAll(uni3, c8) || len != 3); 5421 assert(testAll(uni24, c8) || (len != 2 && len != 4)); 5422 } 5423 } 5424 5425 // cover decode fail cases of Matcher 5426 pure @safe unittest 5427 { 5428 import std.algorithm.iteration : map; 5429 import std.exception : collectException; 5430 import std.format : format; 5431 auto utf16 = utfMatcher!wchar(unicode.L); 5432 auto utf8 = utfMatcher!char(unicode.L); 5433 //decode failure cases UTF-8 5434 alias fails8 = AliasSeq!("\xC1", "\x80\x00","\xC0\x00", "\xCF\x79", 5435 "\xFF\x00\0x00\0x00\x00", "\xC0\0x80\0x80\x80", "\x80\0x00\0x00\x00", 5436 "\xCF\x00\0x00\0x00\x00"); 5437 foreach (msg; fails8) 5438 { 5439 assert(collectException((){ 5440 auto s = msg; 5441 size_t idx = 0; 5442 utf8.test(s); 5443 }()), format("%( %2x %)", cast(immutable(ubyte)[]) msg)); 5444 } 5445 //decode failure cases UTF-16 5446 alias fails16 = AliasSeq!([0xD811], [0xDC02]); 5447 foreach (msg; fails16) 5448 { 5449 assert(collectException((){ 5450 auto s = msg.map!(x => cast(wchar) x); 5451 utf16.test(s); 5452 }())); 5453 } 5454 } 5455 5456 /++ 5457 Convenience function to construct optimal configurations for 5458 packed Trie from any `set` of $(CODEPOINTS). 5459 5460 The parameter `level` indicates the number of trie levels to use, 5461 allowed values are: 1, 2, 3 or 4. Levels represent different trade-offs 5462 speed-size wise. 5463 5464 $(P Level 1 is fastest and the most memory hungry (a bit array). ) 5465 $(P Level 4 is the slowest and has the smallest footprint. ) 5466 5467 See the $(S_LINK Synopsis, Synopsis) section for example. 5468 5469 Note: 5470 Level 4 stays very practical (being faster and more predictable) 5471 compared to using direct lookup on the `set` itself. 5472 5473 5474 +/ 5475 public auto toTrie(size_t level, Set)(Set set) 5476 if (isCodepointSet!Set) 5477 { 5478 static if (level == 1) 5479 return codepointSetTrie!(21)(set); 5480 else static if (level == 2) 5481 return codepointSetTrie!(10, 11)(set); 5482 else static if (level == 3) 5483 return codepointSetTrie!(8, 5, 8)(set); 5484 else static if (level == 4) 5485 return codepointSetTrie!(6, 4, 4, 7)(set); 5486 else 5487 static assert(false, 5488 "Sorry, toTrie doesn't support levels > 4, use codepointSetTrie directly"); 5489 } 5490 5491 /** 5492 $(P Builds a `Trie` with typically optimal speed-size trade-off 5493 and wraps it into a delegate of the following type: 5494 $(D bool delegate(dchar ch)). ) 5495 5496 $(P Effectively this creates a 'tester' lambda suitable 5497 for algorithms like std.algorithm.find that take unary predicates. ) 5498 5499 See the $(S_LINK Synopsis, Synopsis) section for example. 5500 */ 5501 public auto toDelegate(Set)(Set set) 5502 if (isCodepointSet!Set) 5503 { 5504 // 3 is very small and is almost as fast as 2-level (due to CPU caches?) 5505 auto t = toTrie!3(set); 5506 return (dchar ch) => t[ch]; 5507 } 5508 5509 /** 5510 $(P Opaque wrapper around unsigned built-in integers and 5511 code unit (char/wchar/dchar) types. 5512 Parameter `sz` indicates that the value is confined 5513 to the range of [0, 2^^sz$(RPAREN). With this knowledge it can be 5514 packed more tightly when stored in certain 5515 data-structures like trie. ) 5516 5517 Note: 5518 $(P The $(D BitPacked!(T, sz)) is implicitly convertible to `T` 5519 but not vise-versa. Users have to ensure the value fits in 5520 the range required and use the `cast` 5521 operator to perform the conversion.) 5522 */ 5523 struct BitPacked(T, size_t sz) 5524 if (isIntegral!T || is(T:dchar)) 5525 { 5526 enum bitSize = sz; 5527 T _value; 5528 alias _value this; 5529 } 5530 5531 /* 5532 Depending on the form of the passed argument `bitSizeOf` returns 5533 the amount of bits required to represent a given type 5534 or a return type of a given functor. 5535 */ 5536 template bitSizeOf(Args...) 5537 if (Args.length == 1) 5538 { 5539 import std.traits : ReturnType; 5540 alias T = Args[0]; 5541 static if (__traits(compiles, { size_t val = T.bitSize; })) //(is(typeof(T.bitSize) : size_t)) 5542 { 5543 enum bitSizeOf = T.bitSize; 5544 } 5545 else static if (is(ReturnType!T dummy == BitPacked!(U, bits), U, size_t bits)) 5546 { 5547 enum bitSizeOf = bitSizeOf!(ReturnType!T); 5548 } 5549 else 5550 { 5551 enum bitSizeOf = T.sizeof*8; 5552 } 5553 } 5554 5555 /** 5556 Tests if `T` is some instantiation of $(LREF BitPacked)!(U, x) 5557 and thus suitable for packing. 5558 */ 5559 template isBitPacked(T) 5560 { 5561 static if (is(T dummy == BitPacked!(U, bits), U, size_t bits)) 5562 enum isBitPacked = true; 5563 else 5564 enum isBitPacked = false; 5565 } 5566 5567 /** 5568 Gives the type `U` from $(LREF BitPacked)!(U, x) 5569 or `T` itself for every other type. 5570 */ 5571 template TypeOfBitPacked(T) 5572 { 5573 static if (is(T dummy == BitPacked!(U, bits), U, size_t bits)) 5574 alias TypeOfBitPacked = U; 5575 else 5576 alias TypeOfBitPacked = T; 5577 } 5578 5579 /* 5580 Wrapper, used in definition of custom data structures from `Trie` template. 5581 Applying it to a unary lambda function indicates that the returned value always 5582 fits within `bits` of bits. 5583 */ 5584 struct assumeSize(alias Fn, size_t bits) 5585 { 5586 enum bitSize = bits; 5587 static auto ref opCall(T)(auto ref T arg) 5588 { 5589 return Fn(arg); 5590 } 5591 } 5592 5593 /* 5594 A helper for defining lambda function that yields a slice 5595 of certain bits from an unsigned integral value. 5596 The resulting lambda is wrapped in assumeSize and can be used directly 5597 with `Trie` template. 5598 */ 5599 struct sliceBits(size_t from, size_t to) 5600 { 5601 //for now bypass assumeSize, DMD has trouble inlining it 5602 enum bitSize = to-from; 5603 static auto opCall(T)(T x) 5604 out(result) 5605 { 5606 assert(result < (1 << to-from)); 5607 } 5608 do 5609 { 5610 static assert(from < to); 5611 static if (from == 0) 5612 return x & ((1 << to)-1); 5613 else 5614 return (x >> from) & ((1<<(to-from))-1); 5615 } 5616 } 5617 5618 @safe pure nothrow @nogc uint low_8(uint x) { return x&0xFF; } 5619 @safe pure nothrow @nogc uint midlow_8(uint x){ return (x&0xFF00)>>8; } 5620 alias lo8 = assumeSize!(low_8, 8); 5621 alias mlo8 = assumeSize!(midlow_8, 8); 5622 5623 @safe pure nothrow @nogc unittest 5624 { 5625 static assert(bitSizeOf!lo8 == 8); 5626 static assert(bitSizeOf!(sliceBits!(4, 7)) == 3); 5627 static assert(bitSizeOf!(BitPacked!(uint, 2)) == 2); 5628 } 5629 5630 template Sequence(size_t start, size_t end) 5631 { 5632 static if (start < end) 5633 alias Sequence = AliasSeq!(start, Sequence!(start+1, end)); 5634 else 5635 alias Sequence = AliasSeq!(); 5636 } 5637 5638 //---- TRIE TESTS ---- 5639 @system unittest 5640 { 5641 import std.algorithm.iteration : map; 5642 import std.algorithm.sorting : sort; 5643 import std.array : array; 5644 import std.conv : text, to; 5645 import std.range : iota; 5646 static trieStats(TRIE)(TRIE t) 5647 { 5648 version (std_uni_stats) 5649 { 5650 import std.stdio : writefln, writeln; 5651 writeln("---TRIE FOOTPRINT STATS---"); 5652 static foreach (i; 0 .. t.table.dim) 5653 { 5654 writefln("lvl%s = %s bytes; %s pages" 5655 , i, t.bytes!i, t.pages!i); 5656 } 5657 writefln("TOTAL: %s bytes", t.bytes); 5658 version (none) 5659 { 5660 writeln("INDEX (excluding value level):"); 5661 static foreach (i; 0 .. t.table.dim-1) 5662 writeln(t.table.slice!(i)[0 .. t.table.length!i]); 5663 } 5664 writeln("---------------------------"); 5665 } 5666 } 5667 //@@@BUG link failure, lambdas not found by linker somehow (in case of trie2) 5668 // alias lo8 = assumeSize!(8, function (uint x) { return x&0xFF; }); 5669 // alias next8 = assumeSize!(7, function (uint x) { return (x&0x7F00)>>8; }); 5670 alias Set = CodepointSet; 5671 auto set = Set('A','Z','a','z'); 5672 auto trie = buildTrie!(bool, uint, 256, lo8)(set.byInterval);// simple bool array 5673 for (int a='a'; a<'z';a++) 5674 assert(trie[a]); 5675 for (int a='A'; a<'Z';a++) 5676 assert(trie[a]); 5677 for (int a=0; a<'A'; a++) 5678 assert(!trie[a]); 5679 for (int a ='Z'; a<'a'; a++) 5680 assert(!trie[a]); 5681 trieStats(trie); 5682 5683 auto redundant2 = Set( 5684 1, 18, 256+2, 256+111, 512+1, 512+18, 768+2, 768+111); 5685 auto trie2 = buildTrie!(bool, uint, 1024, mlo8, lo8)(redundant2.byInterval); 5686 trieStats(trie2); 5687 foreach (e; redundant2.byCodepoint) 5688 assert(trie2[e], text(cast(uint) e, " - ", trie2[e])); 5689 foreach (i; 0 .. 1024) 5690 { 5691 assert(trie2[i] == (i in redundant2)); 5692 } 5693 5694 5695 auto redundant3 = Set( 5696 2, 4, 6, 8, 16, 5697 2+16, 4+16, 16+6, 16+8, 16+16, 5698 2+32, 4+32, 32+6, 32+8, 5699 ); 5700 5701 enum max3 = 256; 5702 // sliceBits 5703 auto trie3 = buildTrie!(bool, uint, max3, 5704 sliceBits!(6,8), sliceBits!(4,6), sliceBits!(0,4) 5705 )(redundant3.byInterval); 5706 trieStats(trie3); 5707 foreach (i; 0 .. max3) 5708 assert(trie3[i] == (i in redundant3), text(cast(uint) i)); 5709 5710 auto redundant4 = Set( 5711 10, 64, 64+10, 128, 128+10, 256, 256+10, 512, 5712 1000, 2000, 3000, 4000, 5000, 6000 5713 ); 5714 enum max4 = 2^^16; 5715 auto trie4 = buildTrie!(bool, size_t, max4, 5716 sliceBits!(13, 16), sliceBits!(9, 13), sliceBits!(6, 9) , sliceBits!(0, 6) 5717 )(redundant4.byInterval); 5718 foreach (i; 0 .. max4) 5719 { 5720 if (i in redundant4) 5721 assert(trie4[i], text(cast(uint) i)); 5722 } 5723 trieStats(trie4); 5724 5725 alias mapToS = mapTrieIndex!(useItemAt!(0, char)); 5726 string[] redundantS = ["tea", "start", "orange"]; 5727 redundantS.sort!((a,b) => mapToS(a) < mapToS(b))(); 5728 auto strie = buildTrie!(bool, string, useItemAt!(0, char))(redundantS); 5729 // using first char only 5730 assert(redundantS == ["orange", "start", "tea"]); 5731 assert(strie["test"], text(strie["test"])); 5732 assert(!strie["aea"]); 5733 assert(strie["s"]); 5734 5735 // a bit size test 5736 auto a = array(map!(x => to!ubyte(x))(iota(0, 256))); 5737 auto bt = buildTrie!(bool, ubyte, sliceBits!(7, 8), sliceBits!(5, 7), sliceBits!(0, 5))(a); 5738 trieStats(bt); 5739 foreach (i; 0 .. 256) 5740 assert(bt[cast(ubyte) i]); 5741 } 5742 5743 template useItemAt(size_t idx, T) 5744 if (isIntegral!T || is(T: dchar)) 5745 { 5746 size_t impl(const scope T[] arr){ return arr[idx]; } 5747 alias useItemAt = assumeSize!(impl, 8*T.sizeof); 5748 } 5749 5750 template useLastItem(T) 5751 { 5752 size_t impl(const scope T[] arr){ return arr[$-1]; } 5753 alias useLastItem = assumeSize!(impl, 8*T.sizeof); 5754 } 5755 5756 template fullBitSize(Prefix...) 5757 { 5758 static if (Prefix.length > 0) 5759 enum fullBitSize = bitSizeOf!(Prefix[0])+fullBitSize!(Prefix[1..$]); 5760 else 5761 enum fullBitSize = 0; 5762 } 5763 5764 template idxTypes(Key, size_t fullBits, Prefix...) 5765 { 5766 static if (Prefix.length == 1) 5767 {// the last level is value level, so no index once reduced to 1-level 5768 alias idxTypes = AliasSeq!(); 5769 } 5770 else 5771 { 5772 // Important note on bit packing 5773 // Each level has to hold enough of bits to address the next one 5774 // The bottom level is known to hold full bit width 5775 // thus it's size in pages is full_bit_width - size_of_last_prefix 5776 // Recourse on this notion 5777 alias idxTypes = 5778 AliasSeq!( 5779 idxTypes!(Key, fullBits - bitSizeOf!(Prefix[$-1]), Prefix[0..$-1]), 5780 BitPacked!(typeof(Prefix[$-2](Key.init)), fullBits - bitSizeOf!(Prefix[$-1])) 5781 ); 5782 } 5783 } 5784 5785 //============================================================================ 5786 5787 @safe pure int comparePropertyName(Char1, Char2)(const(Char1)[] a, const(Char2)[] b) 5788 if (is(Char1 : dchar) && is(Char2 : dchar)) 5789 { 5790 import std.algorithm.comparison : cmp; 5791 import std.algorithm.iteration : map, filter; 5792 import std.ascii : toLower; 5793 static bool pred(dchar c) {return !c.isWhite && c != '-' && c != '_';} 5794 return cmp( 5795 a.map!toLower.filter!pred, 5796 b.map!toLower.filter!pred); 5797 } 5798 5799 @safe pure unittest 5800 { 5801 assert(!comparePropertyName("foo-bar", "fooBar")); 5802 } 5803 5804 bool propertyNameLess(Char1, Char2)(const(Char1)[] a, const(Char2)[] b) @safe pure 5805 if (is(Char1 : dchar) && is(Char2 : dchar)) 5806 { 5807 return comparePropertyName(a, b) < 0; 5808 } 5809 5810 //============================================================================ 5811 // Utilities for compression of Unicode code point sets 5812 //============================================================================ 5813 5814 @safe void compressTo(uint val, ref scope ubyte[] arr) pure nothrow 5815 { 5816 // not optimized as usually done 1 time (and not public interface) 5817 if (val < 128) 5818 arr ~= cast(ubyte) val; 5819 else if (val < (1 << 13)) 5820 { 5821 arr ~= (0b1_00 << 5) | cast(ubyte)(val >> 8); 5822 arr ~= val & 0xFF; 5823 } 5824 else 5825 { 5826 assert(val < (1 << 21)); 5827 arr ~= (0b1_01 << 5) | cast(ubyte)(val >> 16); 5828 arr ~= (val >> 8) & 0xFF; 5829 arr ~= val & 0xFF; 5830 } 5831 } 5832 5833 @safe uint decompressFrom(scope const(ubyte)[] arr, ref size_t idx) pure 5834 { 5835 import std.exception : enforce; 5836 immutable first = arr[idx++]; 5837 if (!(first & 0x80)) // no top bit -> [0 .. 127] 5838 return first; 5839 immutable extra = ((first >> 5) & 1) + 1; // [1, 2] 5840 uint val = (first & 0x1F); 5841 enforce(idx + extra <= arr.length, "bad code point interval encoding"); 5842 foreach (j; 0 .. extra) 5843 val = (val << 8) | arr[idx+j]; 5844 idx += extra; 5845 return val; 5846 } 5847 5848 5849 package(std) ubyte[] compressIntervals(Range)(Range intervals) 5850 if (isInputRange!Range && isIntegralPair!(ElementType!Range)) 5851 { 5852 ubyte[] storage; 5853 uint base = 0; 5854 // RLE encode 5855 foreach (val; intervals) 5856 { 5857 compressTo(val[0]-base, storage); 5858 base = val[0]; 5859 if (val[1] != lastDchar+1) // till the end of the domain so don't store it 5860 { 5861 compressTo(val[1]-base, storage); 5862 base = val[1]; 5863 } 5864 } 5865 return storage; 5866 } 5867 5868 @safe pure unittest 5869 { 5870 import std.algorithm.comparison : equal; 5871 import std.typecons : tuple; 5872 5873 auto run = [tuple(80, 127), tuple(128, (1 << 10)+128)]; 5874 ubyte[] enc = [cast(ubyte) 80, 47, 1, (0b1_00 << 5) | (1 << 2), 0]; 5875 assert(compressIntervals(run) == enc); 5876 auto run2 = [tuple(0, (1 << 20)+512+1), tuple((1 << 20)+512+4, lastDchar+1)]; 5877 ubyte[] enc2 = [cast(ubyte) 0, (0b1_01 << 5) | (1 << 4), 2, 1, 3]; // odd length-ed 5878 assert(compressIntervals(run2) == enc2); 5879 size_t idx = 0; 5880 assert(decompressFrom(enc, idx) == 80); 5881 assert(decompressFrom(enc, idx) == 47); 5882 assert(decompressFrom(enc, idx) == 1); 5883 assert(decompressFrom(enc, idx) == (1 << 10)); 5884 idx = 0; 5885 assert(decompressFrom(enc2, idx) == 0); 5886 assert(decompressFrom(enc2, idx) == (1 << 20)+512+1); 5887 assert(equal(decompressIntervals(compressIntervals(run)), run)); 5888 assert(equal(decompressIntervals(compressIntervals(run2)), run2)); 5889 } 5890 5891 // Creates a range of `CodepointInterval` that lazily decodes compressed data. 5892 @safe package(std) auto decompressIntervals(const(ubyte)[] data) pure 5893 { 5894 return DecompressedIntervals(data); 5895 } 5896 5897 @safe struct DecompressedIntervals 5898 { 5899 pure: 5900 const(ubyte)[] _stream; 5901 size_t _idx; 5902 CodepointInterval _front; 5903 5904 this(const(ubyte)[] stream) 5905 { 5906 _stream = stream; 5907 popFront(); 5908 } 5909 5910 @property CodepointInterval front() 5911 { 5912 assert(!empty); 5913 return _front; 5914 } 5915 5916 void popFront() 5917 { 5918 if (_idx == _stream.length) 5919 { 5920 _idx = size_t.max; 5921 return; 5922 } 5923 uint base = _front[1]; 5924 _front[0] = base + decompressFrom(_stream, _idx); 5925 if (_idx == _stream.length)// odd length ---> till the end 5926 _front[1] = lastDchar+1; 5927 else 5928 { 5929 base = _front[0]; 5930 _front[1] = base + decompressFrom(_stream, _idx); 5931 } 5932 } 5933 5934 @property bool empty() const 5935 { 5936 return _idx == size_t.max; 5937 } 5938 5939 @property DecompressedIntervals save() return scope { return this; } 5940 } 5941 5942 @safe pure nothrow @nogc unittest 5943 { 5944 static assert(isInputRange!DecompressedIntervals); 5945 static assert(isForwardRange!DecompressedIntervals); 5946 } 5947 5948 //============================================================================ 5949 5950 version (std_uni_bootstrap){} 5951 else 5952 { 5953 5954 // helper for looking up code point sets 5955 ptrdiff_t findUnicodeSet(alias table, C)(const scope C[] name) 5956 { 5957 import std.algorithm.iteration : map; 5958 import std.range : assumeSorted; 5959 auto range = assumeSorted!((a,b) => propertyNameLess(a,b)) 5960 (table.map!"a.name"()); 5961 size_t idx = range.lowerBound(name).length; 5962 if (idx < range.length && comparePropertyName(range[idx], name) == 0) 5963 return idx; 5964 return -1; 5965 } 5966 5967 // another one that loads it 5968 bool loadUnicodeSet(alias table, Set, C)(const scope C[] name, ref Set dest) 5969 { 5970 auto idx = findUnicodeSet!table(name); 5971 if (idx >= 0) 5972 { 5973 dest = Set(asSet(table[idx].compressed)); 5974 return true; 5975 } 5976 return false; 5977 } 5978 5979 bool loadProperty(Set=CodepointSet, C) 5980 (const scope C[] name, ref Set target) pure 5981 { 5982 import std.internal.unicode_tables : uniProps; // generated file 5983 alias ucmp = comparePropertyName; 5984 // conjure cumulative properties by hand 5985 if (ucmp(name, "L") == 0 || ucmp(name, "Letter") == 0) 5986 { 5987 target = asSet(uniProps.Lu); 5988 target |= asSet(uniProps.Ll); 5989 target |= asSet(uniProps.Lt); 5990 target |= asSet(uniProps.Lo); 5991 target |= asSet(uniProps.Lm); 5992 } 5993 else if (ucmp(name,"LC") == 0 || ucmp(name,"Cased Letter")==0) 5994 { 5995 target = asSet(uniProps.Ll); 5996 target |= asSet(uniProps.Lu); 5997 target |= asSet(uniProps.Lt);// Title case 5998 } 5999 else if (ucmp(name, "M") == 0 || ucmp(name, "Mark") == 0) 6000 { 6001 target = asSet(uniProps.Mn); 6002 target |= asSet(uniProps.Mc); 6003 target |= asSet(uniProps.Me); 6004 } 6005 else if (ucmp(name, "N") == 0 || ucmp(name, "Number") == 0) 6006 { 6007 target = asSet(uniProps.Nd); 6008 target |= asSet(uniProps.Nl); 6009 target |= asSet(uniProps.No); 6010 } 6011 else if (ucmp(name, "P") == 0 || ucmp(name, "Punctuation") == 0) 6012 { 6013 target = asSet(uniProps.Pc); 6014 target |= asSet(uniProps.Pd); 6015 target |= asSet(uniProps.Ps); 6016 target |= asSet(uniProps.Pe); 6017 target |= asSet(uniProps.Pi); 6018 target |= asSet(uniProps.Pf); 6019 target |= asSet(uniProps.Po); 6020 } 6021 else if (ucmp(name, "S") == 0 || ucmp(name, "Symbol") == 0) 6022 { 6023 target = asSet(uniProps.Sm); 6024 target |= asSet(uniProps.Sc); 6025 target |= asSet(uniProps.Sk); 6026 target |= asSet(uniProps.So); 6027 } 6028 else if (ucmp(name, "Z") == 0 || ucmp(name, "Separator") == 0) 6029 { 6030 target = asSet(uniProps.Zs); 6031 target |= asSet(uniProps.Zl); 6032 target |= asSet(uniProps.Zp); 6033 } 6034 else if (ucmp(name, "C") == 0 || ucmp(name, "Other") == 0) 6035 { 6036 target = asSet(uniProps.Cc); 6037 target |= asSet(uniProps.Cf); 6038 target |= asSet(uniProps.Cs); 6039 target |= asSet(uniProps.Co); 6040 target |= asSet(uniProps.Cn); 6041 } 6042 else if (ucmp(name, "graphical") == 0) 6043 { 6044 target = asSet(uniProps.Alphabetic); 6045 6046 target |= asSet(uniProps.Mn); 6047 target |= asSet(uniProps.Mc); 6048 target |= asSet(uniProps.Me); 6049 6050 target |= asSet(uniProps.Nd); 6051 target |= asSet(uniProps.Nl); 6052 target |= asSet(uniProps.No); 6053 6054 target |= asSet(uniProps.Pc); 6055 target |= asSet(uniProps.Pd); 6056 target |= asSet(uniProps.Ps); 6057 target |= asSet(uniProps.Pe); 6058 target |= asSet(uniProps.Pi); 6059 target |= asSet(uniProps.Pf); 6060 target |= asSet(uniProps.Po); 6061 6062 target |= asSet(uniProps.Zs); 6063 6064 target |= asSet(uniProps.Sm); 6065 target |= asSet(uniProps.Sc); 6066 target |= asSet(uniProps.Sk); 6067 target |= asSet(uniProps.So); 6068 } 6069 else if (ucmp(name, "any") == 0) 6070 target = Set.fromIntervals(0, 0x110000); 6071 else if (ucmp(name, "ascii") == 0) 6072 target = Set.fromIntervals(0, 0x80); 6073 else 6074 return loadUnicodeSet!(uniProps.tab)(name, target); 6075 return true; 6076 } 6077 6078 // CTFE-only helper for checking property names at compile-time 6079 @safe bool isPrettyPropertyName(C)(const scope C[] name) 6080 { 6081 import std.algorithm.searching : find; 6082 auto names = [ 6083 "L", "Letter", 6084 "LC", "Cased Letter", 6085 "M", "Mark", 6086 "N", "Number", 6087 "P", "Punctuation", 6088 "S", "Symbol", 6089 "Z", "Separator", 6090 "Graphical", 6091 "any", 6092 "ascii" 6093 ]; 6094 auto x = find!(x => comparePropertyName(x, name) == 0)(names); 6095 return !x.empty; 6096 } 6097 6098 // ditto, CTFE-only, not optimized 6099 @safe private static bool findSetName(alias table, C)(const scope C[] name) 6100 { 6101 return findUnicodeSet!table(name) >= 0; 6102 } 6103 6104 template SetSearcher(alias table, string kind) 6105 { 6106 /// Run-time checked search. 6107 static auto opCall(C)(const scope C[] name) 6108 if (is(C : dchar)) 6109 { 6110 import std.conv : to; 6111 CodepointSet set; 6112 if (loadUnicodeSet!table(name, set)) 6113 return set; 6114 throw new Exception("No unicode set for "~kind~" by name " 6115 ~name.to!string()~" was found."); 6116 } 6117 /// Compile-time checked search. 6118 static @property auto opDispatch(string name)() 6119 { 6120 static if (findSetName!table(name)) 6121 { 6122 CodepointSet set; 6123 loadUnicodeSet!table(name, set); 6124 return set; 6125 } 6126 else 6127 static assert(false, "No unicode set for "~kind~" by name " 6128 ~name~" was found."); 6129 } 6130 } 6131 6132 // Characters that need escaping in string posed as regular expressions 6133 package(std) alias Escapables = AliasSeq!('[', ']', '\\', '^', '$', '.', '|', '?', ',', '-', 6134 ';', ':', '#', '&', '%', '/', '<', '>', '`', '*', '+', '(', ')', '{', '}', '~'); 6135 6136 package(std) CodepointSet memoizeExpr(string expr)() 6137 { 6138 if (__ctfe) 6139 return mixin(expr); 6140 alias T = typeof(mixin(expr)); 6141 static T slot; 6142 static bool initialized; 6143 if (!initialized) 6144 { 6145 slot = mixin(expr); 6146 initialized = true; 6147 } 6148 return slot; 6149 } 6150 6151 //property for \w character class 6152 package(std) @property CodepointSet wordCharacter() @safe 6153 { 6154 return memoizeExpr!("unicode.Alphabetic | unicode.Mn | unicode.Mc 6155 | unicode.Me | unicode.Nd | unicode.Pc")(); 6156 } 6157 6158 //basic stack, just in case it gets used anywhere else then Parser 6159 package(std) struct Stack(T) 6160 { 6161 @safe: 6162 T[] data; 6163 @property bool empty(){ return data.empty; } 6164 6165 @property size_t length(){ return data.length; } 6166 6167 void push(T val){ data ~= val; } 6168 6169 @trusted T pop() 6170 { 6171 assert(!empty); 6172 auto val = data[$ - 1]; 6173 data = data[0 .. $ - 1]; 6174 if (!__ctfe) 6175 cast(void) data.assumeSafeAppend(); 6176 return val; 6177 } 6178 6179 @property ref T top() 6180 { 6181 assert(!empty); 6182 return data[$ - 1]; 6183 } 6184 } 6185 6186 //test if a given string starts with hex number of maxDigit that's a valid codepoint 6187 //returns it's value and skips these maxDigit chars on success, throws on failure 6188 package(std) dchar parseUniHex(Range)(ref Range str, size_t maxDigit) 6189 { 6190 import std.exception : enforce; 6191 //std.conv.parse is both @system and bogus 6192 uint val; 6193 for (int k = 0; k < maxDigit; k++) 6194 { 6195 enforce(!str.empty, "incomplete escape sequence"); 6196 //accepts ascii only, so it's OK to index directly 6197 immutable current = str.front; 6198 if ('0' <= current && current <= '9') 6199 val = val * 16 + current - '0'; 6200 else if ('a' <= current && current <= 'f') 6201 val = val * 16 + current -'a' + 10; 6202 else if ('A' <= current && current <= 'F') 6203 val = val * 16 + current - 'A' + 10; 6204 else 6205 throw new Exception("invalid escape sequence"); 6206 str.popFront(); 6207 } 6208 enforce(val <= 0x10FFFF, "invalid codepoint"); 6209 return val; 6210 } 6211 6212 @safe unittest 6213 { 6214 import std.algorithm.searching : canFind; 6215 import std.exception : collectException; 6216 string[] non_hex = [ "000j", "000z", "FffG", "0Z"]; 6217 string[] hex = [ "01", "ff", "00af", "10FFFF" ]; 6218 int[] value = [ 1, 0xFF, 0xAF, 0x10FFFF ]; 6219 foreach (v; non_hex) 6220 assert(collectException(parseUniHex(v, v.length)).msg 6221 .canFind("invalid escape sequence")); 6222 foreach (i, v; hex) 6223 assert(parseUniHex(v, v.length) == value[i]); 6224 string over = "0011FFFF"; 6225 assert(collectException(parseUniHex(over, over.length)).msg 6226 .canFind("invalid codepoint")); 6227 } 6228 6229 auto caseEnclose(CodepointSet set) 6230 { 6231 auto cased = set & unicode.LC; 6232 foreach (dchar ch; cased.byCodepoint) 6233 { 6234 foreach (c; simpleCaseFoldings(ch)) 6235 set |= c; 6236 } 6237 return set; 6238 } 6239 6240 /+ 6241 fetch codepoint set corresponding to a name (InBlock or binary property) 6242 +/ 6243 CodepointSet getUnicodeSet(const scope char[] name, bool negated, bool casefold) @safe 6244 { 6245 CodepointSet s = unicode(name); 6246 //FIXME: caseEnclose for new uni as Set | CaseEnclose(SET && LC) 6247 if (casefold) 6248 s = caseEnclose(s); 6249 if (negated) 6250 s = s.inverted; 6251 return s; 6252 } 6253 6254 struct UnicodeSetParser(Range) 6255 { 6256 import std.exception : enforce; 6257 import std.typecons : tuple, Tuple; 6258 Range range; 6259 bool casefold_; 6260 6261 @property bool empty(){ return range.empty; } 6262 @property dchar front(){ return range.front; } 6263 void popFront(){ range.popFront(); } 6264 6265 //CodepointSet operations relatively in order of priority 6266 enum Operator:uint { 6267 Open = 0, Negate, Difference, SymDifference, Intersection, Union, None 6268 } 6269 6270 //parse unit of CodepointSet spec, most notably escape sequences and char ranges 6271 //also fetches next set operation 6272 Tuple!(CodepointSet,Operator) parseCharTerm() 6273 { 6274 import std.range : drop; 6275 enum privateUseStart = '\U000F0000', privateUseEnd ='\U000FFFFD'; 6276 enum State{ Start, Char, Escape, CharDash, CharDashEscape, 6277 PotentialTwinSymbolOperator } 6278 Operator op = Operator.None; 6279 dchar last; 6280 CodepointSet set; 6281 State state = State.Start; 6282 6283 void addWithFlags(ref CodepointSet set, uint ch) 6284 { 6285 if (casefold_) 6286 { 6287 auto foldings = simpleCaseFoldings(ch); 6288 foreach (v; foldings) 6289 set |= v; 6290 } 6291 else 6292 set |= ch; 6293 } 6294 6295 static Operator twinSymbolOperator(dchar symbol) 6296 { 6297 switch (symbol) 6298 { 6299 case '|': 6300 return Operator.Union; 6301 case '-': 6302 return Operator.Difference; 6303 case '~': 6304 return Operator.SymDifference; 6305 case '&': 6306 return Operator.Intersection; 6307 default: 6308 assert(false); 6309 } 6310 } 6311 6312 L_CharTermLoop: 6313 for (;;) 6314 { 6315 final switch (state) 6316 { 6317 case State.Start: 6318 switch (front) 6319 { 6320 case '|': 6321 case '-': 6322 case '~': 6323 case '&': 6324 state = State.PotentialTwinSymbolOperator; 6325 last = front; 6326 break; 6327 case '[': 6328 op = Operator.Union; 6329 goto case; 6330 case ']': 6331 break L_CharTermLoop; 6332 case '\\': 6333 state = State.Escape; 6334 break; 6335 default: 6336 state = State.Char; 6337 last = front; 6338 } 6339 break; 6340 case State.Char: 6341 // xxx last front xxx 6342 switch (front) 6343 { 6344 case '|': 6345 case '~': 6346 case '&': 6347 // then last is treated as normal char and added as implicit union 6348 state = State.PotentialTwinSymbolOperator; 6349 addWithFlags(set, last); 6350 last = front; 6351 break; 6352 case '-': // still need more info 6353 state = State.CharDash; 6354 break; 6355 case '\\': 6356 set |= last; 6357 state = State.Escape; 6358 break; 6359 case '[': 6360 op = Operator.Union; 6361 goto case; 6362 case ']': 6363 addWithFlags(set, last); 6364 break L_CharTermLoop; 6365 default: 6366 state = State.Char; 6367 addWithFlags(set, last); 6368 last = front; 6369 } 6370 break; 6371 case State.PotentialTwinSymbolOperator: 6372 // xxx last front xxxx 6373 // where last = [|-&~] 6374 if (front == last) 6375 { 6376 op = twinSymbolOperator(last); 6377 popFront();//skip second twin char 6378 break L_CharTermLoop; 6379 } 6380 goto case State.Char; 6381 case State.Escape: 6382 // xxx \ front xxx 6383 switch (front) 6384 { 6385 case 'f': 6386 last = '\f'; 6387 state = State.Char; 6388 break; 6389 case 'n': 6390 last = '\n'; 6391 state = State.Char; 6392 break; 6393 case 'r': 6394 last = '\r'; 6395 state = State.Char; 6396 break; 6397 case 't': 6398 last = '\t'; 6399 state = State.Char; 6400 break; 6401 case 'v': 6402 last = '\v'; 6403 state = State.Char; 6404 break; 6405 case 'c': 6406 last = unicode.parseControlCode(this); 6407 state = State.Char; 6408 break; 6409 foreach (val; Escapables) 6410 { 6411 case val: 6412 } 6413 last = front; 6414 state = State.Char; 6415 break; 6416 case 'p': 6417 set.add(unicode.parsePropertySpec(this, false, casefold_)); 6418 state = State.Start; 6419 continue L_CharTermLoop; //next char already fetched 6420 case 'P': 6421 set.add(unicode.parsePropertySpec(this, true, casefold_)); 6422 state = State.Start; 6423 continue L_CharTermLoop; //next char already fetched 6424 case 'x': 6425 popFront(); 6426 last = parseUniHex(this, 2); 6427 state = State.Char; 6428 continue L_CharTermLoop; 6429 case 'u': 6430 popFront(); 6431 last = parseUniHex(this, 4); 6432 state = State.Char; 6433 continue L_CharTermLoop; 6434 case 'U': 6435 popFront(); 6436 last = parseUniHex(this, 8); 6437 state = State.Char; 6438 continue L_CharTermLoop; 6439 case 'd': 6440 set.add(unicode.Nd); 6441 state = State.Start; 6442 break; 6443 case 'D': 6444 set.add(unicode.Nd.inverted); 6445 state = State.Start; 6446 break; 6447 case 's': 6448 set.add(unicode.White_Space); 6449 state = State.Start; 6450 break; 6451 case 'S': 6452 set.add(unicode.White_Space.inverted); 6453 state = State.Start; 6454 break; 6455 case 'w': 6456 set.add(wordCharacter); 6457 state = State.Start; 6458 break; 6459 case 'W': 6460 set.add(wordCharacter.inverted); 6461 state = State.Start; 6462 break; 6463 default: 6464 if (front >= privateUseStart && front <= privateUseEnd) 6465 enforce(false, "no matching ']' found while parsing character class"); 6466 enforce(false, "invalid escape sequence"); 6467 } 6468 break; 6469 case State.CharDash: 6470 // xxx last - front xxx 6471 switch (front) 6472 { 6473 case '[': 6474 op = Operator.Union; 6475 goto case; 6476 case ']': 6477 //means dash is a single char not an interval specifier 6478 addWithFlags(set, last); 6479 addWithFlags(set, '-'); 6480 break L_CharTermLoop; 6481 case '-'://set Difference again 6482 addWithFlags(set, last); 6483 op = Operator.Difference; 6484 popFront();//skip '-' 6485 break L_CharTermLoop; 6486 case '\\': 6487 state = State.CharDashEscape; 6488 break; 6489 default: 6490 enforce(last <= front, "inverted range"); 6491 if (casefold_) 6492 { 6493 for (uint ch = last; ch <= front; ch++) 6494 addWithFlags(set, ch); 6495 } 6496 else 6497 set.add(last, front + 1); 6498 state = State.Start; 6499 } 6500 break; 6501 case State.CharDashEscape: 6502 //xxx last - \ front xxx 6503 uint end; 6504 switch (front) 6505 { 6506 case 'f': 6507 end = '\f'; 6508 break; 6509 case 'n': 6510 end = '\n'; 6511 break; 6512 case 'r': 6513 end = '\r'; 6514 break; 6515 case 't': 6516 end = '\t'; 6517 break; 6518 case 'v': 6519 end = '\v'; 6520 break; 6521 foreach (val; Escapables) 6522 { 6523 case val: 6524 } 6525 end = front; 6526 break; 6527 case 'c': 6528 end = unicode.parseControlCode(this); 6529 break; 6530 case 'x': 6531 popFront(); 6532 end = parseUniHex(this, 2); 6533 enforce(last <= end,"inverted range"); 6534 set.add(last, end + 1); 6535 state = State.Start; 6536 continue L_CharTermLoop; 6537 case 'u': 6538 popFront(); 6539 end = parseUniHex(this, 4); 6540 enforce(last <= end,"inverted range"); 6541 set.add(last, end + 1); 6542 state = State.Start; 6543 continue L_CharTermLoop; 6544 case 'U': 6545 popFront(); 6546 end = parseUniHex(this, 8); 6547 enforce(last <= end,"inverted range"); 6548 set.add(last, end + 1); 6549 state = State.Start; 6550 continue L_CharTermLoop; 6551 default: 6552 if (front >= privateUseStart && front <= privateUseEnd) 6553 enforce(false, "no matching ']' found while parsing character class"); 6554 enforce(false, "invalid escape sequence"); 6555 } 6556 // Lookahead to check if it's a \T 6557 // where T is sub-pattern terminator in multi-pattern scheme 6558 auto lookahead = range.save.drop(1); 6559 if (end == '\\' && !lookahead.empty) 6560 { 6561 if (lookahead.front >= privateUseStart && lookahead.front <= privateUseEnd) 6562 enforce(false, "no matching ']' found while parsing character class"); 6563 } 6564 enforce(last <= end,"inverted range"); 6565 set.add(last, end + 1); 6566 state = State.Start; 6567 break; 6568 } 6569 popFront(); 6570 enforce(!empty, "unexpected end of CodepointSet"); 6571 } 6572 return tuple(set, op); 6573 } 6574 6575 alias ValStack = Stack!(CodepointSet); 6576 alias OpStack = Stack!(Operator); 6577 6578 CodepointSet parseSet() 6579 { 6580 ValStack vstack; 6581 OpStack opstack; 6582 import std.functional : unaryFun; 6583 enforce(!empty, "unexpected end of input"); 6584 enforce(front == '[', "expected '[' at the start of unicode set"); 6585 // 6586 static bool apply(Operator op, ref ValStack stack) 6587 { 6588 switch (op) 6589 { 6590 case Operator.Negate: 6591 enforce(!stack.empty, "no operand for '^'"); 6592 stack.top = stack.top.inverted; 6593 break; 6594 case Operator.Union: 6595 auto s = stack.pop();//2nd operand 6596 enforce(!stack.empty, "no operand for '||'"); 6597 stack.top.add(s); 6598 break; 6599 case Operator.Difference: 6600 auto s = stack.pop();//2nd operand 6601 enforce(!stack.empty, "no operand for '--'"); 6602 stack.top.sub(s); 6603 break; 6604 case Operator.SymDifference: 6605 auto s = stack.pop();//2nd operand 6606 enforce(!stack.empty, "no operand for '~~'"); 6607 stack.top ~= s; 6608 break; 6609 case Operator.Intersection: 6610 auto s = stack.pop();//2nd operand 6611 enforce(!stack.empty, "no operand for '&&'"); 6612 stack.top.intersect(s); 6613 break; 6614 default: 6615 return false; 6616 } 6617 return true; 6618 } 6619 static bool unrollWhile(alias cond)(ref ValStack vstack, ref OpStack opstack) 6620 { 6621 while (cond(opstack.top)) 6622 { 6623 if (!apply(opstack.pop(),vstack)) 6624 return false;//syntax error 6625 if (opstack.empty) 6626 return false; 6627 } 6628 return true; 6629 } 6630 6631 L_CharsetLoop: 6632 do 6633 { 6634 switch (front) 6635 { 6636 case '[': 6637 opstack.push(Operator.Open); 6638 popFront(); 6639 enforce(!empty, "unexpected end of character class"); 6640 if (front == '^') 6641 { 6642 opstack.push(Operator.Negate); 6643 popFront(); 6644 enforce(!empty, "unexpected end of character class"); 6645 } 6646 else if (front == ']') // []...] is special cased 6647 { 6648 popFront(); 6649 enforce(!empty, "wrong character set"); 6650 auto pair = parseCharTerm(); 6651 pair[0].add(']', ']'+1); 6652 if (pair[1] != Operator.None) 6653 { 6654 if (opstack.top == Operator.Union) 6655 unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack); 6656 opstack.push(pair[1]); 6657 } 6658 vstack.push(pair[0]); 6659 } 6660 break; 6661 case ']': 6662 enforce(unrollWhile!(unaryFun!"a != a.Open")(vstack, opstack), 6663 "character class syntax error"); 6664 enforce(!opstack.empty, "unmatched ']'"); 6665 opstack.pop(); 6666 popFront(); 6667 if (opstack.empty) 6668 break L_CharsetLoop; 6669 auto pair = parseCharTerm(); 6670 if (!pair[0].empty)//not only operator e.g. -- or ~~ 6671 { 6672 vstack.top.add(pair[0]);//apply union 6673 } 6674 if (pair[1] != Operator.None) 6675 { 6676 if (opstack.top == Operator.Union) 6677 unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack); 6678 opstack.push(pair[1]); 6679 } 6680 break; 6681 // 6682 default://yet another pair of term(op)? 6683 auto pair = parseCharTerm(); 6684 if (pair[1] != Operator.None) 6685 { 6686 if (opstack.top == Operator.Union) 6687 unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack); 6688 opstack.push(pair[1]); 6689 } 6690 vstack.push(pair[0]); 6691 } 6692 6693 }while (!empty || !opstack.empty); 6694 while (!opstack.empty) 6695 apply(opstack.pop(),vstack); 6696 assert(vstack.length == 1); 6697 return vstack.top; 6698 } 6699 } 6700 6701 /** 6702 A single entry point to lookup Unicode $(CODEPOINT) sets by name or alias of 6703 a block, script or general category. 6704 6705 It uses well defined standard rules of property name lookup. 6706 This includes fuzzy matching of names, so that 6707 'White_Space', 'white-SpAce' and 'whitespace' are all considered equal 6708 and yield the same set of white space $(CHARACTERS). 6709 */ 6710 @safe public struct unicode 6711 { 6712 import std.exception : enforce; 6713 /** 6714 Performs the lookup of set of $(CODEPOINTS) 6715 with compile-time correctness checking. 6716 This short-cut version combines 3 searches: 6717 across blocks, scripts, and common binary properties. 6718 6719 Note that since scripts and blocks overlap the 6720 usual trick to disambiguate is used - to get a block use 6721 `unicode.InBlockName`, to search a script 6722 use `unicode.ScriptName`. 6723 6724 See_Also: $(LREF block), $(LREF script) 6725 and (not included in this search) $(LREF hangulSyllableType). 6726 */ 6727 6728 static @property auto opDispatch(string name)() pure 6729 { 6730 static if (findAny(name)) 6731 return loadAny(name); 6732 else 6733 static assert(false, "No unicode set by name "~name~" was found."); 6734 } 6735 6736 /// 6737 @safe unittest 6738 { 6739 import std.exception : collectException; 6740 auto ascii = unicode.ASCII; 6741 assert(ascii['A']); 6742 assert(ascii['~']); 6743 assert(!ascii['\u00e0']); 6744 // matching is case-insensitive 6745 assert(ascii == unicode.ascII); 6746 assert(!ascii['à']); 6747 // underscores, '-' and whitespace in names are ignored too 6748 auto latin = unicode.in_latin1_Supplement; 6749 assert(latin['à']); 6750 assert(!latin['$']); 6751 // BTW Latin 1 Supplement is a block, hence "In" prefix 6752 assert(latin == unicode("In Latin 1 Supplement")); 6753 // run-time look up throws if no such set is found 6754 assert(collectException(unicode("InCyrilliac"))); 6755 } 6756 6757 /** 6758 The same lookup across blocks, scripts, or binary properties, 6759 but performed at run-time. 6760 This version is provided for cases where `name` 6761 is not known beforehand; otherwise compile-time 6762 checked $(LREF opDispatch) is typically a better choice. 6763 6764 See the $(S_LINK Unicode properties, table of properties) for available 6765 sets. 6766 */ 6767 static auto opCall(C)(const scope C[] name) 6768 if (is(C : dchar)) 6769 { 6770 return loadAny(name); 6771 } 6772 6773 /** 6774 Narrows down the search for sets of $(CODEPOINTS) to all Unicode blocks. 6775 6776 Note: 6777 Here block names are unambiguous as no scripts are searched 6778 and thus to search use simply `unicode.block.BlockName` notation. 6779 6780 See $(S_LINK Unicode properties, table of properties) for available sets. 6781 See_Also: $(S_LINK Unicode properties, table of properties). 6782 */ 6783 struct block 6784 { 6785 import std.internal.unicode_tables : blocks; // generated file 6786 mixin SetSearcher!(blocks.tab, "block"); 6787 } 6788 6789 /// 6790 @safe unittest 6791 { 6792 // use .block for explicitness 6793 assert(unicode.block.Greek_and_Coptic == unicode.InGreek_and_Coptic); 6794 } 6795 6796 /** 6797 Narrows down the search for sets of $(CODEPOINTS) to all Unicode scripts. 6798 6799 See the $(S_LINK Unicode properties, table of properties) for available 6800 sets. 6801 */ 6802 struct script 6803 { 6804 import std.internal.unicode_tables : scripts; // generated file 6805 mixin SetSearcher!(scripts.tab, "script"); 6806 } 6807 6808 /// 6809 @safe unittest 6810 { 6811 auto arabicScript = unicode.script.arabic; 6812 auto arabicBlock = unicode.block.arabic; 6813 // there is an intersection between script and block 6814 assert(arabicBlock['']); 6815 assert(arabicScript['']); 6816 // but they are different 6817 assert(arabicBlock != arabicScript); 6818 assert(arabicBlock == unicode.inArabic); 6819 assert(arabicScript == unicode.arabic); 6820 } 6821 6822 /** 6823 Fetch a set of $(CODEPOINTS) that have the given hangul syllable type. 6824 6825 Other non-binary properties (once supported) follow the same 6826 notation - `unicode.propertyName.propertyValue` for compile-time 6827 checked access and `unicode.propertyName(propertyValue)` 6828 for run-time checked one. 6829 6830 See the $(S_LINK Unicode properties, table of properties) for available 6831 sets. 6832 */ 6833 struct hangulSyllableType 6834 { 6835 import std.internal.unicode_tables : hangul; // generated file 6836 mixin SetSearcher!(hangul.tab, "hangul syllable type"); 6837 } 6838 6839 /// 6840 @safe unittest 6841 { 6842 // L here is syllable type not Letter as in unicode.L short-cut 6843 auto leadingVowel = unicode.hangulSyllableType("L"); 6844 // check that some leading vowels are present 6845 foreach (vowel; '\u1110'..'\u115F') 6846 assert(leadingVowel[vowel]); 6847 assert(leadingVowel == unicode.hangulSyllableType.L); 6848 } 6849 6850 //parse control code of form \cXXX, c assumed to be the current symbol 6851 static package(std) dchar parseControlCode(Parser)(ref Parser p) 6852 { 6853 with(p) 6854 { 6855 popFront(); 6856 enforce(!empty, "Unfinished escape sequence"); 6857 enforce(('a' <= front && front <= 'z') 6858 || ('A' <= front && front <= 'Z'), 6859 "Only letters are allowed after \\c"); 6860 return front & 0x1f; 6861 } 6862 } 6863 6864 //parse and return a CodepointSet for \p{...Property...} and \P{...Property..}, 6865 //\ - assumed to be processed, p - is current 6866 static package(std) CodepointSet parsePropertySpec(Range)(ref Range p, 6867 bool negated, bool casefold) 6868 { 6869 static import std.ascii; 6870 with(p) 6871 { 6872 enum MAX_PROPERTY = 128; 6873 char[MAX_PROPERTY] result; 6874 uint k = 0; 6875 popFront(); 6876 enforce(!empty, "eof parsing unicode property spec"); 6877 if (front == '{') 6878 { 6879 popFront(); 6880 while (k < MAX_PROPERTY && !empty && front !='}' 6881 && front !=':') 6882 { 6883 if (front != '-' && front != ' ' && front != '_') 6884 result[k++] = cast(char) std.ascii.toLower(front); 6885 popFront(); 6886 } 6887 enforce(k != MAX_PROPERTY, "invalid property name"); 6888 enforce(front == '}', "} expected "); 6889 } 6890 else 6891 {//single char properties e.g.: \pL, \pN ... 6892 enforce(front < 0x80, "invalid property name"); 6893 result[k++] = cast(char) front; 6894 } 6895 auto s = getUnicodeSet(result[0 .. k], negated, casefold); 6896 enforce(!s.empty, "unrecognized unicode property spec"); 6897 popFront(); 6898 return s; 6899 } 6900 } 6901 6902 /** 6903 Parse unicode codepoint set from given `range` using standard regex 6904 syntax '[...]'. The range is advanced skiping over regex set definition. 6905 `casefold` parameter determines if the set should be casefolded - that is 6906 include both lower and upper case versions for any letters in the set. 6907 */ 6908 static CodepointSet parseSet(Range)(ref Range range, bool casefold=false) 6909 if (isInputRange!Range && is(ElementType!Range : dchar)) 6910 { 6911 auto usParser = UnicodeSetParser!Range(range, casefold); 6912 auto set = usParser.parseSet(); 6913 range = usParser.range; 6914 return set; 6915 } 6916 6917 /// 6918 @safe unittest 6919 { 6920 import std.uni : unicode; 6921 string pat = "[a-zA-Z0-9]hello"; 6922 auto set = unicode.parseSet(pat); 6923 // check some of the codepoints 6924 assert(set['a'] && set['A'] && set['9']); 6925 assert(pat == "hello"); 6926 } 6927 6928 private: 6929 alias ucmp = comparePropertyName; 6930 6931 static bool findAny(string name) 6932 { 6933 import std.internal.unicode_tables : blocks, scripts, uniProps; // generated file 6934 return isPrettyPropertyName(name) 6935 || findSetName!(uniProps.tab)(name) || findSetName!(scripts.tab)(name) 6936 || (ucmp(name[0 .. 2],"In") == 0 && findSetName!(blocks.tab)(name[2..$])); 6937 } 6938 6939 static auto loadAny(Set=CodepointSet, C)(const scope C[] name) pure 6940 { 6941 import std.conv : to; 6942 import std.internal.unicode_tables : blocks, scripts; // generated file 6943 Set set; 6944 immutable loaded = loadProperty(name, set) || loadUnicodeSet!(scripts.tab)(name, set) 6945 || (name.length > 2 && ucmp(name[0 .. 2],"In") == 0 6946 && loadUnicodeSet!(blocks.tab)(name[2..$], set)); 6947 if (loaded) 6948 return set; 6949 throw new Exception("No unicode set by name "~name.to!string()~" was found."); 6950 } 6951 6952 // FIXME: re-disable once the compiler is fixed 6953 // Disabled to prevent the mistake of creating instances of this pseudo-struct. 6954 //@disable ~this(); 6955 } 6956 6957 @safe unittest 6958 { 6959 import std.internal.unicode_tables : blocks, uniProps; // generated file 6960 assert(unicode("InHebrew") == asSet(blocks.Hebrew)); 6961 assert(unicode("separator") == (asSet(uniProps.Zs) | asSet(uniProps.Zl) | asSet(uniProps.Zp))); 6962 assert(unicode("In-Kharoshthi") == asSet(blocks.Kharoshthi)); 6963 } 6964 6965 enum EMPTY_CASE_TRIE = ushort.max;// from what gen_uni uses internally 6966 6967 // TODO: redo the most of hangul stuff algorithmically in case of Graphemes too 6968 // Use combined trie instead of checking for '\r' | '\n' | ccTrie, 6969 // or extend | '\u200D' separately 6970 6971 private static bool isRegionalIndicator(dchar ch) @safe pure @nogc nothrow 6972 { 6973 return ch >= '\U0001F1E6' && ch <= '\U0001F1FF'; 6974 } 6975 6976 // Our grapheme decoder is a state machine, this is list of all possible 6977 // states before each code point. 6978 private enum GraphemeState 6979 { 6980 Start, 6981 CR, 6982 RI, 6983 L, 6984 V, 6985 LVT, 6986 Emoji, 6987 EmojiZWJ, 6988 Prepend, 6989 End 6990 } 6991 6992 // Message values whether end of grapheme is reached 6993 private enum TransformRes 6994 { 6995 // No, unless the source range ends here 6996 // (GB2 - break at end of text, unless text is empty) 6997 goOn, 6998 redo, // Run last character again with new state 6999 retInclude, // Yes, after the just iterated character 7000 retExclude // Yes, before the just iterated character 7001 } 7002 7003 // The logic of the grapheme decoding is all here 7004 // GB# means Grapheme Breaking rule number # - see Unicode standard annex #29 7005 // Note, getting GB1 (break at start of text, unless text is empty) right 7006 // relies on the user starting grapheme walking from beginning of the text, and 7007 // not attempting to walk an empty text. 7008 private enum TransformRes 7009 function(ref GraphemeState, dchar) @safe pure nothrow @nogc [] graphemeTransforms = 7010 [ 7011 GraphemeState.Start: (ref state, ch) 7012 { 7013 // GB4. Break after controls. 7014 if (graphemeControlTrie[ch] || ch == '\n') 7015 return TransformRes.retInclude; 7016 7017 with (GraphemeState) state = 7018 ch == '\r' ? CR : 7019 isRegionalIndicator(ch) ? RI : 7020 isHangL(ch) ? L : 7021 hangLV[ch] || isHangV(ch) ? V : 7022 hangLVT[ch] || isHangT(ch) ? LVT : 7023 prependTrie[ch] ? Prepend : 7024 xpictoTrie[ch] ? Emoji : 7025 End; 7026 7027 // No matter what we encountered, we always include the 7028 // first code point in the grapheme. 7029 return TransformRes.goOn; 7030 }, 7031 7032 // GB3, GB4. Do not break between a CR and LF. 7033 // Otherwise, break after controls. 7034 GraphemeState.CR: (ref state, ch) => ch == '\n' ? 7035 TransformRes.retInclude : 7036 TransformRes.retExclude, 7037 7038 // GB12 - GB13. Do not break within emoji flag sequences. 7039 // That is, do not break between regional indicator (RI) symbols if 7040 // there is an odd number of RI characters before the break point. 7041 // This state applies if one and only one RI code point has been 7042 // encountered. 7043 GraphemeState.RI: (ref state, ch) 7044 { 7045 state = GraphemeState.End; 7046 7047 return isRegionalIndicator(ch) ? 7048 TransformRes.goOn : 7049 TransformRes.redo; 7050 }, 7051 7052 // GB6. Do not break Hangul syllable sequences. 7053 GraphemeState.L: (ref state, ch) 7054 { 7055 if (isHangL(ch)) 7056 return TransformRes.goOn; 7057 else if (isHangV(ch) || hangLV[ch]) 7058 { 7059 state = GraphemeState.V; 7060 return TransformRes.goOn; 7061 } 7062 else if (hangLVT[ch]) 7063 { 7064 state = GraphemeState.LVT; 7065 return TransformRes.goOn; 7066 } 7067 7068 state = GraphemeState.End; 7069 return TransformRes.redo; 7070 }, 7071 7072 // GB7. Do not break Hangul syllable sequences. 7073 GraphemeState.V: (ref state, ch) 7074 { 7075 if (isHangV(ch)) 7076 return TransformRes.goOn; 7077 else if (isHangT(ch)) 7078 { 7079 state = GraphemeState.LVT; 7080 return TransformRes.goOn; 7081 } 7082 7083 state = GraphemeState.End; 7084 return TransformRes.redo; 7085 }, 7086 7087 // GB8. Do not break Hangul syllable sequences. 7088 GraphemeState.LVT: (ref state, ch) 7089 { 7090 if (isHangT(ch)) 7091 return TransformRes.goOn; 7092 7093 state = GraphemeState.End; 7094 return TransformRes.redo; 7095 }, 7096 7097 // GB11. Do not break within emoji modifier sequences or emoji 7098 // zwj sequences. This state applies when the last code point was 7099 // NOT a ZWJ. 7100 GraphemeState.Emoji: (ref state, ch) 7101 { 7102 if (graphemeExtendTrie[ch]) 7103 return TransformRes.goOn; 7104 7105 static assert(!graphemeExtendTrie['\u200D']); 7106 7107 if (ch == '\u200D') 7108 { 7109 state = GraphemeState.EmojiZWJ; 7110 return TransformRes.goOn; 7111 } 7112 7113 state = GraphemeState.End; 7114 // There might still be spacing marks are 7115 // at the end, which are not allowed in 7116 // middle of emoji sequences 7117 return TransformRes.redo; 7118 }, 7119 7120 // GB11. Do not break within emoji modifier sequences or emoji 7121 // zwj sequences. This state applies when the last code point was 7122 // a ZWJ. 7123 GraphemeState.EmojiZWJ: (ref state, ch) 7124 { 7125 state = GraphemeState.Emoji; 7126 if (xpictoTrie[ch]) 7127 return TransformRes.goOn; 7128 return TransformRes.redo; 7129 }, 7130 7131 // GB9b. Do not break after Prepend characters. 7132 GraphemeState.Prepend: (ref state, ch) 7133 { 7134 // GB5. Break before controls. 7135 if (graphemeControlTrie[ch] || ch == '\r' || ch == '\n') 7136 return TransformRes.retExclude; 7137 7138 state = GraphemeState.Start; 7139 return TransformRes.redo; 7140 }, 7141 7142 // GB9, GB9a. Do not break before extending characters, ZWJ 7143 // or SpacingMarks. 7144 // GB999. Otherwise, break everywhere. 7145 GraphemeState.End: (ref state, ch) 7146 => !graphemeExtendTrie[ch] && !spacingMarkTrie[ch] && ch != '\u200D' ? 7147 TransformRes.retExclude : 7148 TransformRes.goOn 7149 ]; 7150 7151 template genericDecodeGrapheme(bool getValue) 7152 { 7153 static if (getValue) 7154 alias Value = Grapheme; 7155 else 7156 alias Value = void; 7157 7158 Value genericDecodeGrapheme(Input)(ref Input range) 7159 { 7160 static if (getValue) 7161 Grapheme grapheme; 7162 auto state = GraphemeState.Start; 7163 dchar ch; 7164 7165 assert(!range.empty, "Attempting to decode grapheme from an empty " ~ Input.stringof); 7166 outer: 7167 while (!range.empty) 7168 { 7169 ch = range.front; 7170 7171 rerun: 7172 final switch (graphemeTransforms[state](state, ch)) 7173 with(TransformRes) 7174 { 7175 case goOn: 7176 static if (getValue) 7177 grapheme ~= ch; 7178 range.popFront(); 7179 continue; 7180 7181 case redo: 7182 goto rerun; 7183 7184 case retInclude: 7185 static if (getValue) 7186 grapheme ~= ch; 7187 range.popFront(); 7188 break outer; 7189 7190 case retExclude: 7191 break outer; 7192 } 7193 } 7194 7195 static if (getValue) 7196 return grapheme; 7197 } 7198 } 7199 7200 public: // Public API continues 7201 7202 /++ 7203 Computes the length of grapheme cluster starting at `index`. 7204 Both the resulting length and the `index` are measured 7205 in $(S_LINK Code unit, code units). 7206 7207 Params: 7208 C = type that is implicitly convertible to `dchars` 7209 input = array of grapheme clusters 7210 index = starting index into `input[]` 7211 7212 Returns: 7213 length of grapheme cluster 7214 +/ 7215 size_t graphemeStride(C)(const scope C[] input, size_t index) @safe pure 7216 if (is(C : dchar)) 7217 { 7218 auto src = input[index..$]; 7219 auto n = src.length; 7220 genericDecodeGrapheme!(false)(src); 7221 return n - src.length; 7222 } 7223 7224 /// 7225 @safe unittest 7226 { 7227 assert(graphemeStride(" ", 1) == 1); 7228 // A + combing ring above 7229 string city = "A\u030Arhus"; 7230 size_t first = graphemeStride(city, 0); 7231 assert(first == 3); //\u030A has 2 UTF-8 code units 7232 assert(city[0 .. first] == "A\u030A"); 7233 assert(city[first..$] == "rhus"); 7234 } 7235 7236 @safe unittest 7237 { 7238 // Ensure that graphemeStride is usable from CTFE. 7239 enum c1 = graphemeStride("A", 0); 7240 static assert(c1 == 1); 7241 7242 enum c2 = graphemeStride("A\u0301", 0); 7243 static assert(c2 == 3); // \u0301 has 2 UTF-8 code units 7244 } 7245 7246 // TODO: make this @nogc. Probably no big deal since the state machine is 7247 // already GC-free. 7248 @safe pure nothrow unittest 7249 { 7250 // grinning face ~ emoji modifier fitzpatrick type-5 ~ grinning face 7251 assert(graphemeStride("\U0001F600\U0001f3FE\U0001F600"d, 0) == 2); 7252 // skier ~ female sign ~ '€' 7253 assert(graphemeStride("\u26F7\u2640€"d, 0) == 1); 7254 // skier ~ emoji modifier fitzpatrick type-5 ~ female sign ~ '€' 7255 assert(graphemeStride("\u26F7\U0001f3FE\u2640€"d, 0) == 2); 7256 // skier ~ zero-width joiner ~ female sign ~ '€' 7257 assert(graphemeStride("\u26F7\u200D\u2640€"d, 0) == 3); 7258 // skier ~ emoji modifier fitzpatrick type-5 ~ zero-width joiner 7259 // ~ female sign ~ '€' 7260 assert(graphemeStride("\u26F7\U0001f3FE\u200D\u2640€"d, 0) == 4); 7261 // skier ~ zero-width joiner ~ '€' 7262 assert(graphemeStride("\u26F7\u200D€"d, 0) == 2); 7263 //'€' ~ zero-width joiner ~ skier 7264 assert(graphemeStride("€\u200D\u26F7"d, 0) == 2); 7265 // Kaithi number sign ~ Devanagari digit four ~ Devanagari digit two 7266 assert(graphemeStride("\U000110BD\u096A\u0968"d, 0) == 2); 7267 // Kaithi number sign ~ null 7268 assert(graphemeStride("\U000110BD\0"d, 0) == 1); 7269 } 7270 7271 /++ 7272 Reads one full grapheme cluster from an 7273 $(REF_ALTTEXT input range, isInputRange, std,range,primitives) of dchar `inp`. 7274 7275 For examples see the $(LREF Grapheme) below. 7276 7277 Note: 7278 This function modifies `inp` and thus `inp` 7279 must be an L-value. 7280 +/ 7281 Grapheme decodeGrapheme(Input)(ref Input inp) 7282 if (isInputRange!Input && is(immutable ElementType!Input == immutable dchar)) 7283 { 7284 return genericDecodeGrapheme!true(inp); 7285 } 7286 7287 @safe unittest 7288 { 7289 import std.algorithm.comparison : equal; 7290 7291 Grapheme gr; 7292 string s = " \u0020\u0308 "; 7293 gr = decodeGrapheme(s); 7294 assert(gr.length == 1 && gr[0] == ' '); 7295 gr = decodeGrapheme(s); 7296 assert(gr.length == 2 && equal(gr[0 .. 2], " \u0308")); 7297 s = "\u0300\u0308\u1100"; 7298 assert(equal(decodeGrapheme(s)[], "\u0300\u0308")); 7299 assert(equal(decodeGrapheme(s)[], "\u1100")); 7300 s = "\u11A8\u0308\uAC01"; 7301 assert(equal(decodeGrapheme(s)[], "\u11A8\u0308")); 7302 assert(equal(decodeGrapheme(s)[], "\uAC01")); 7303 7304 // Two Union Jacks of the Great Britain 7305 s = "\U0001F1EC\U0001F1E7\U0001F1EC\U0001F1E7"; 7306 assert(equal(decodeGrapheme(s)[], "\U0001F1EC\U0001F1E7")); 7307 } 7308 7309 /++ 7310 $(P Iterate a string by $(LREF Grapheme).) 7311 7312 $(P Useful for doing string manipulation that needs to be aware 7313 of graphemes.) 7314 7315 See_Also: 7316 $(LREF byCodePoint) 7317 +/ 7318 auto byGrapheme(Range)(Range range) 7319 if (isInputRange!Range && is(immutable ElementType!Range == immutable dchar)) 7320 { 7321 // TODO: Bidirectional access 7322 static struct Result(R) 7323 { 7324 private R _range; 7325 private Grapheme _front; 7326 7327 bool empty() @property 7328 { 7329 return _front.length == 0; 7330 } 7331 7332 Grapheme front() @property 7333 { 7334 return _front; 7335 } 7336 7337 void popFront() 7338 { 7339 _front = _range.empty ? Grapheme.init : _range.decodeGrapheme(); 7340 } 7341 7342 static if (isForwardRange!R) 7343 { 7344 Result save() @property 7345 { 7346 return Result(_range.save, _front); 7347 } 7348 } 7349 } 7350 7351 auto result = Result!(Range)(range); 7352 result.popFront(); 7353 return result; 7354 } 7355 7356 /// 7357 @safe unittest 7358 { 7359 import std.algorithm.comparison : equal; 7360 import std.range.primitives : walkLength; 7361 import std.range : take, drop; 7362 auto text = "noe\u0308l"; // noël using e + combining diaeresis 7363 assert(text.walkLength == 5); // 5 code points 7364 7365 auto gText = text.byGrapheme; 7366 assert(gText.walkLength == 4); // 4 graphemes 7367 7368 assert(gText.take(3).equal("noe\u0308".byGrapheme)); 7369 assert(gText.drop(3).equal("l".byGrapheme)); 7370 } 7371 7372 // For testing non-forward-range input ranges 7373 version (StdUnittest) 7374 private static @safe struct InputRangeString 7375 { 7376 private string s; 7377 7378 bool empty() @property { return s.empty; } 7379 dchar front() @property { return s.front; } 7380 void popFront() { s.popFront(); } 7381 } 7382 7383 @safe unittest 7384 { 7385 import std.algorithm.comparison : equal; 7386 import std.array : array; 7387 import std.range : retro; 7388 import std.range.primitives : walkLength; 7389 assert("".byGrapheme.walkLength == 0); 7390 7391 auto reverse = "le\u0308on"; 7392 assert(reverse.walkLength == 5); 7393 7394 auto gReverse = reverse.byGrapheme; 7395 assert(gReverse.walkLength == 4); 7396 7397 static foreach (text; AliasSeq!("noe\u0308l"c, "noe\u0308l"w, "noe\u0308l"d)) 7398 {{ 7399 assert(text.walkLength == 5); 7400 static assert(isForwardRange!(typeof(text))); 7401 7402 auto gText = text.byGrapheme; 7403 static assert(isForwardRange!(typeof(gText))); 7404 assert(gText.walkLength == 4); 7405 assert(gText.array.retro.equal(gReverse)); 7406 }} 7407 7408 auto nonForwardRange = InputRangeString("noe\u0308l").byGrapheme; 7409 static assert(!isForwardRange!(typeof(nonForwardRange))); 7410 assert(nonForwardRange.walkLength == 4); 7411 } 7412 7413 // Issue 23474 7414 @safe pure unittest 7415 { 7416 import std.range.primitives : walkLength; 7417 assert(byGrapheme("\r\u0308").walkLength == 2); 7418 } 7419 7420 /++ 7421 $(P Lazily transform a range of $(LREF Grapheme)s to a range of code points.) 7422 7423 $(P Useful for converting the result to a string after doing operations 7424 on graphemes.) 7425 7426 $(P If passed in a range of code points, returns a range with equivalent capabilities.) 7427 +/ 7428 auto byCodePoint(Range)(Range range) 7429 if (isInputRange!Range && is(immutable ElementType!Range == immutable Grapheme)) 7430 { 7431 // TODO: Propagate bidirectional access 7432 static struct Result 7433 { 7434 private Range _range; 7435 private size_t i = 0; 7436 7437 bool empty() @property 7438 { 7439 return _range.empty; 7440 } 7441 7442 dchar front() @property 7443 { 7444 return _range.front[i]; 7445 } 7446 7447 void popFront() 7448 { 7449 ++i; 7450 7451 if (i >= _range.front.length) 7452 { 7453 _range.popFront(); 7454 i = 0; 7455 } 7456 } 7457 7458 static if (isForwardRange!Range) 7459 { 7460 Result save() @property 7461 { 7462 return Result(_range.save, i); 7463 } 7464 } 7465 } 7466 7467 return Result(range); 7468 } 7469 7470 /// Ditto 7471 auto byCodePoint(Range)(Range range) 7472 if (isInputRange!Range && is(immutable ElementType!Range == immutable dchar)) 7473 { 7474 import std.range.primitives : isBidirectionalRange, popBack; 7475 import std.traits : isNarrowString; 7476 static if (isNarrowString!Range) 7477 { 7478 static struct Result 7479 { 7480 private Range _range; 7481 @property bool empty() { return _range.empty; } 7482 @property dchar front(){ return _range.front; } 7483 void popFront(){ _range.popFront; } 7484 @property auto save() { return Result(_range.save); } 7485 @property dchar back(){ return _range.back; } 7486 void popBack(){ _range.popBack; } 7487 } 7488 static assert(isBidirectionalRange!(Result)); 7489 return Result(range); 7490 } 7491 else 7492 return range; 7493 } 7494 7495 /// 7496 @safe unittest 7497 { 7498 import std.array : array; 7499 import std.conv : text; 7500 import std.range : retro; 7501 7502 string s = "noe\u0308l"; // noël 7503 7504 // reverse it and convert the result to a string 7505 string reverse = s.byGrapheme 7506 .array 7507 .retro 7508 .byCodePoint 7509 .text; 7510 7511 assert(reverse == "le\u0308on"); // lëon 7512 } 7513 7514 @safe unittest 7515 { 7516 import std.algorithm.comparison : equal; 7517 import std.range.primitives : walkLength; 7518 import std.range : retro; 7519 assert("".byGrapheme.byCodePoint.equal("")); 7520 7521 string text = "noe\u0308l"; 7522 static assert(!__traits(compiles, "noe\u0308l".byCodePoint.length)); 7523 7524 auto gText = InputRangeString(text).byGrapheme; 7525 static assert(!isForwardRange!(typeof(gText))); 7526 7527 auto cpText = gText.byCodePoint; 7528 static assert(!isForwardRange!(typeof(cpText))); 7529 7530 assert(cpText.walkLength == text.walkLength); 7531 7532 auto plainCp = text.byCodePoint; 7533 static assert(isForwardRange!(typeof(plainCp))); 7534 assert(equal(plainCp, text)); 7535 assert(equal(retro(plainCp.save), retro(text.save))); 7536 // Check that we still have length for dstring 7537 assert("абвгд"d.byCodePoint.length == 5); 7538 } 7539 7540 /++ 7541 $(P A structure designed to effectively pack $(CHARACTERS) 7542 of a $(CLUSTER). 7543 ) 7544 7545 $(P `Grapheme` has value semantics so 2 copies of a `Grapheme` 7546 always refer to distinct objects. In most actual scenarios a `Grapheme` 7547 fits on the stack and avoids memory allocation overhead for all but quite 7548 long clusters. 7549 ) 7550 7551 See_Also: $(LREF decodeGrapheme), $(LREF graphemeStride) 7552 +/ 7553 @safe struct Grapheme 7554 { 7555 import std.exception : enforce; 7556 import std.traits : isDynamicArray; 7557 7558 public: 7559 /// Ctor 7560 this(C)(const scope C[] chars...) 7561 if (is(C : dchar)) 7562 { 7563 this ~= chars; 7564 } 7565 7566 ///ditto 7567 this(Input)(Input seq) 7568 if (!isDynamicArray!Input 7569 && isInputRange!Input && is(ElementType!Input : dchar)) 7570 { 7571 this ~= seq; 7572 } 7573 7574 /// Gets a $(CODEPOINT) at the given index in this cluster. 7575 dchar opIndex(size_t index) const @nogc nothrow pure @trusted 7576 { 7577 assert(index < length); 7578 return read24(isBig ? ptr_ : small_.ptr, index); 7579 } 7580 7581 /++ 7582 Writes a $(CODEPOINT) `ch` at given index in this cluster. 7583 7584 Warning: 7585 Use of this facility may invalidate grapheme cluster, 7586 see also $(LREF Grapheme.valid). 7587 +/ 7588 void opIndexAssign(dchar ch, size_t index) @nogc nothrow pure @trusted 7589 { 7590 assert(index < length); 7591 write24(isBig ? ptr_ : small_.ptr, ch, index); 7592 } 7593 7594 /// 7595 @safe unittest 7596 { 7597 auto g = Grapheme("A\u0302"); 7598 assert(g[0] == 'A'); 7599 assert(g.valid); 7600 g[1] = '~'; // ASCII tilda is not a combining mark 7601 assert(g[1] == '~'); 7602 assert(!g.valid); 7603 } 7604 7605 /++ 7606 Random-access range over Grapheme's $(CHARACTERS). 7607 7608 Warning: Invalidates when this Grapheme leaves the scope, 7609 attempts to use it then would lead to memory corruption. 7610 +/ 7611 SliceOverIndexed!Grapheme opSlice(size_t a, size_t b) @nogc nothrow pure return 7612 { 7613 return sliceOverIndexed(a, b, &this); 7614 } 7615 7616 /// ditto 7617 SliceOverIndexed!Grapheme opSlice() @nogc nothrow pure return 7618 { 7619 return sliceOverIndexed(0, length, &this); 7620 } 7621 7622 /// Grapheme cluster length in $(CODEPOINTS). 7623 @property size_t length() const @nogc nothrow pure 7624 { 7625 return isBig ? len_ : slen_ & 0x7F; 7626 } 7627 7628 /++ 7629 Append $(CHARACTER) `ch` to this grapheme. 7630 Warning: 7631 Use of this facility may invalidate grapheme cluster, 7632 see also `valid`. 7633 7634 See_Also: $(LREF Grapheme.valid) 7635 +/ 7636 ref opOpAssign(string op)(dchar ch) @trusted 7637 { 7638 static if (op == "~") 7639 { 7640 import std.internal.memory : enforceRealloc; 7641 if (!isBig) 7642 { 7643 if (slen_ == small_cap) 7644 convertToBig();// & fallthrough to "big" branch 7645 else 7646 { 7647 write24(small_.ptr, ch, smallLength); 7648 slen_++; 7649 return this; 7650 } 7651 } 7652 7653 assert(isBig); 7654 if (len_ == cap_) 7655 { 7656 import core.checkedint : addu, mulu; 7657 bool overflow; 7658 cap_ = addu(cap_, grow, overflow); 7659 auto nelems = mulu(3, addu(cap_, 1, overflow), overflow); 7660 if (overflow) assert(0); 7661 ptr_ = cast(ubyte*) enforceRealloc(ptr_, nelems); 7662 } 7663 write24(ptr_, ch, len_++); 7664 return this; 7665 } 7666 else 7667 static assert(false, "No operation "~op~" defined for Grapheme"); 7668 } 7669 7670 /// 7671 @safe unittest 7672 { 7673 import std.algorithm.comparison : equal; 7674 auto g = Grapheme("A"); 7675 assert(g.valid); 7676 g ~= '\u0301'; 7677 assert(g[].equal("A\u0301")); 7678 assert(g.valid); 7679 g ~= "B"; 7680 // not a valid grapheme cluster anymore 7681 assert(!g.valid); 7682 // still could be useful though 7683 assert(g[].equal("A\u0301B")); 7684 } 7685 7686 /// Append all $(CHARACTERS) from the input range `inp` to this Grapheme. 7687 ref opOpAssign(string op, Input)(scope Input inp) 7688 if (isInputRange!Input && is(ElementType!Input : dchar)) 7689 { 7690 static if (op == "~") 7691 { 7692 foreach (dchar ch; inp) 7693 this ~= ch; 7694 return this; 7695 } 7696 else 7697 static assert(false, "No operation "~op~" defined for Grapheme"); 7698 } 7699 7700 // This is not a good `opEquals`, but formerly the automatically generated 7701 // opEquals was used, which was inferred `@safe` because of bugzilla 20655: 7702 // https://issues.dlang.org/show_bug.cgi?id=20655 7703 // This `@trusted opEquals` is only here to prevent breakage. 7704 bool opEquals(R)(const auto ref R other) const @trusted 7705 { 7706 return this.tupleof == other.tupleof; 7707 } 7708 7709 // Define a default toHash to allow AA usage 7710 size_t toHash() const @trusted 7711 { 7712 return hashOf(slen_, hashOf(small_)); 7713 } 7714 7715 /++ 7716 True if this object contains valid extended grapheme cluster. 7717 Decoding primitives of this module always return a valid `Grapheme`. 7718 7719 Appending to and direct manipulation of grapheme's $(CHARACTERS) may 7720 render it no longer valid. Certain applications may chose to use 7721 Grapheme as a "small string" of any $(CODEPOINTS) and ignore this property 7722 entirely. 7723 +/ 7724 @property bool valid()() /*const*/ 7725 { 7726 auto r = this[]; 7727 genericDecodeGrapheme!false(r); 7728 return r.length == 0; 7729 } 7730 7731 this(this) @nogc nothrow pure @trusted 7732 { 7733 import std.internal.memory : enforceMalloc; 7734 if (isBig) 7735 {// dup it 7736 import core.checkedint : addu, mulu; 7737 bool overflow; 7738 auto raw_cap = mulu(3, addu(cap_, 1, overflow), overflow); 7739 if (overflow) assert(0); 7740 7741 auto p = cast(ubyte*) enforceMalloc(raw_cap); 7742 p[0 .. raw_cap] = ptr_[0 .. raw_cap]; 7743 ptr_ = p; 7744 } 7745 } 7746 7747 ~this() @nogc nothrow pure @trusted 7748 { 7749 import core.memory : pureFree; 7750 if (isBig) 7751 { 7752 pureFree(ptr_); 7753 } 7754 } 7755 7756 7757 private: 7758 enum small_bytes = ((ubyte*).sizeof+3*size_t.sizeof-1); 7759 // "out of the blue" grow rate, needs testing 7760 // (though graphemes are typically small < 9) 7761 enum grow = 20; 7762 enum small_cap = small_bytes/3; 7763 enum small_flag = 0x80, small_mask = 0x7F; 7764 // 16 bytes in 32bits, should be enough for the majority of cases 7765 union 7766 { 7767 struct 7768 { 7769 ubyte* ptr_; 7770 size_t cap_; 7771 size_t len_; 7772 size_t padding_; 7773 } 7774 struct 7775 { 7776 ubyte[small_bytes] small_; 7777 ubyte slen_; 7778 } 7779 } 7780 7781 void convertToBig() @nogc nothrow pure @trusted 7782 { 7783 import std.internal.memory : enforceMalloc; 7784 static assert(grow.max / 3 - 1 >= grow); 7785 enum nbytes = 3 * (grow + 1); 7786 size_t k = smallLength; 7787 ubyte* p = cast(ubyte*) enforceMalloc(nbytes); 7788 for (int i=0; i<k; i++) 7789 write24(p, read24(small_.ptr, i), i); 7790 // now we can overwrite small array data 7791 ptr_ = p; 7792 len_ = slen_; 7793 assert(grow > len_); 7794 cap_ = grow; 7795 setBig(); 7796 } 7797 7798 void setBig() @nogc nothrow pure { slen_ |= small_flag; } 7799 7800 @property size_t smallLength() const @nogc nothrow pure 7801 { 7802 return slen_ & small_mask; 7803 } 7804 @property ubyte isBig() const @nogc nothrow pure 7805 { 7806 return slen_ & small_flag; 7807 } 7808 } 7809 7810 static assert(Grapheme.sizeof == size_t.sizeof*4); 7811 7812 7813 @safe pure /*nothrow @nogc*/ unittest // TODO: string .front is GC and throw 7814 { 7815 import std.algorithm.comparison : equal; 7816 Grapheme[3] data = [Grapheme("Ю"), Grapheme("У"), Grapheme("З")]; 7817 assert(byGrapheme("ЮУЗ").equal(data[])); 7818 } 7819 7820 /// 7821 @safe unittest 7822 { 7823 import std.algorithm.comparison : equal; 7824 import std.algorithm.iteration : filter; 7825 import std.range : isRandomAccessRange; 7826 7827 string bold = "ku\u0308hn"; 7828 7829 // note that decodeGrapheme takes parameter by ref 7830 auto first = decodeGrapheme(bold); 7831 7832 assert(first.length == 1); 7833 assert(first[0] == 'k'); 7834 7835 // the next grapheme is 2 characters long 7836 auto wideOne = decodeGrapheme(bold); 7837 // slicing a grapheme yields a random-access range of dchar 7838 assert(wideOne[].equal("u\u0308")); 7839 assert(wideOne.length == 2); 7840 static assert(isRandomAccessRange!(typeof(wideOne[]))); 7841 7842 // all of the usual range manipulation is possible 7843 assert(wideOne[].filter!isMark().equal("\u0308")); 7844 7845 auto g = Grapheme("A"); 7846 assert(g.valid); 7847 g ~= '\u0301'; 7848 assert(g[].equal("A\u0301")); 7849 assert(g.valid); 7850 g ~= "B"; 7851 // not a valid grapheme cluster anymore 7852 assert(!g.valid); 7853 // still could be useful though 7854 assert(g[].equal("A\u0301B")); 7855 } 7856 7857 @safe unittest 7858 { 7859 auto g = Grapheme("A\u0302"); 7860 assert(g[0] == 'A'); 7861 assert(g.valid); 7862 g[1] = '~'; // ASCII tilda is not a combining mark 7863 assert(g[1] == '~'); 7864 assert(!g.valid); 7865 } 7866 7867 @safe unittest 7868 { 7869 import std.algorithm.comparison : equal; 7870 import std.algorithm.iteration : map; 7871 import std.conv : text; 7872 import std.range : iota; 7873 7874 // not valid clusters (but it just a test) 7875 auto g = Grapheme('a', 'b', 'c', 'd', 'e'); 7876 assert(g[0] == 'a'); 7877 assert(g[1] == 'b'); 7878 assert(g[2] == 'c'); 7879 assert(g[3] == 'd'); 7880 assert(g[4] == 'e'); 7881 g[3] = 'Й'; 7882 assert(g[2] == 'c'); 7883 assert(g[3] == 'Й', text(g[3], " vs ", 'Й')); 7884 assert(g[4] == 'e'); 7885 assert(!g.valid); 7886 7887 g ~= 'ц'; 7888 g ~= '~'; 7889 assert(g[0] == 'a'); 7890 assert(g[1] == 'b'); 7891 assert(g[2] == 'c'); 7892 assert(g[3] == 'Й'); 7893 assert(g[4] == 'e'); 7894 assert(g[5] == 'ц'); 7895 assert(g[6] == '~'); 7896 assert(!g.valid); 7897 7898 Grapheme copy = g; 7899 copy[0] = 'X'; 7900 copy[1] = '-'; 7901 assert(g[0] == 'a' && copy[0] == 'X'); 7902 assert(g[1] == 'b' && copy[1] == '-'); 7903 assert(equal(g[2 .. g.length], copy[2 .. copy.length])); 7904 copy = Grapheme("АБВГДЕЁЖЗИКЛМ"); 7905 assert(equal(copy[0 .. 8], "АБВГДЕЁЖ"), text(copy[0 .. 8])); 7906 copy ~= "xyz"; 7907 assert(equal(copy[13 .. 15], "xy"), text(copy[13 .. 15])); 7908 assert(!copy.valid); 7909 7910 Grapheme h; 7911 foreach (dchar v; iota(cast(int)'A', cast(int)'Z'+1).map!"cast(dchar)a"()) 7912 h ~= v; 7913 assert(equal(h[], iota(cast(int)'A', cast(int)'Z'+1))); 7914 } 7915 7916 // ensure Grapheme can be used as an AA key. 7917 @safe unittest 7918 { 7919 int[Grapheme] aa; 7920 } 7921 7922 /++ 7923 $(P Does basic case-insensitive comparison of `r1` and `r2`. 7924 This function uses simpler comparison rule thus achieving better performance 7925 than $(LREF icmp). However keep in mind the warning below.) 7926 7927 Params: 7928 r1 = an $(REF_ALTTEXT input range, isInputRange, std,range,primitives) of characters 7929 r2 = an $(REF_ALTTEXT input range, isInputRange, std,range,primitives) of characters 7930 7931 Returns: 7932 An `int` that is 0 if the strings match, 7933 <0 if `r1` is lexicographically "less" than `r2`, 7934 >0 if `r1` is lexicographically "greater" than `r2` 7935 7936 Warning: 7937 This function only handles 1:1 $(CODEPOINT) mapping 7938 and thus is not sufficient for certain alphabets 7939 like German, Greek and few others. 7940 7941 See_Also: 7942 $(LREF icmp) 7943 $(REF cmp, std,algorithm,comparison) 7944 +/ 7945 int sicmp(S1, S2)(scope S1 r1, scope S2 r2) 7946 if (isInputRange!S1 && isSomeChar!(ElementEncodingType!S1) 7947 && isInputRange!S2 && isSomeChar!(ElementEncodingType!S2)) 7948 { 7949 import std.internal.unicode_tables : sTable = simpleCaseTable; // generated file 7950 import std.range.primitives : isInfinite; 7951 import std.utf : decodeFront; 7952 import std.traits : isDynamicArray; 7953 import std.typecons : Yes; 7954 static import std.ascii; 7955 7956 static if ((isDynamicArray!S1 || isRandomAccessRange!S1) 7957 && (isDynamicArray!S2 || isRandomAccessRange!S2) 7958 && !(isInfinite!S1 && isInfinite!S2) 7959 && __traits(compiles, 7960 { 7961 size_t s = size_t.sizeof / 2; 7962 r1 = r1[s .. $]; 7963 r2 = r2[s .. $]; 7964 })) 7965 {{ 7966 // ASCII optimization for dynamic arrays & similar. 7967 size_t i = 0; 7968 static if (isInfinite!S1) 7969 immutable end = r2.length; 7970 else static if (isInfinite!S2) 7971 immutable end = r1.length; 7972 else 7973 immutable end = r1.length > r2.length ? r2.length : r1.length; 7974 for (; i < end; ++i) 7975 { 7976 auto lhs = r1[i]; 7977 auto rhs = r2[i]; 7978 if ((lhs | rhs) >= 0x80) goto NonAsciiPath; 7979 if (lhs == rhs) continue; 7980 auto lowDiff = std.ascii.toLower(lhs) - std.ascii.toLower(rhs); 7981 if (lowDiff) return lowDiff; 7982 } 7983 static if (isInfinite!S1) 7984 return 1; 7985 else static if (isInfinite!S2) 7986 return -1; 7987 else 7988 return (r1.length > r2.length) - (r2.length > r1.length); 7989 7990 NonAsciiPath: 7991 r1 = r1[i .. $]; 7992 r2 = r2[i .. $]; 7993 // Fall through to standard case. 7994 }} 7995 7996 while (!r1.empty) 7997 { 7998 immutable lhs = decodeFront!(Yes.useReplacementDchar)(r1); 7999 if (r2.empty) 8000 return 1; 8001 immutable rhs = decodeFront!(Yes.useReplacementDchar)(r2); 8002 int diff = lhs - rhs; 8003 if (!diff) 8004 continue; 8005 if ((lhs | rhs) < 0x80) 8006 { 8007 immutable d = std.ascii.toLower(lhs) - std.ascii.toLower(rhs); 8008 if (!d) continue; 8009 return d; 8010 } 8011 size_t idx = simpleCaseTrie[lhs]; 8012 size_t idx2 = simpleCaseTrie[rhs]; 8013 // simpleCaseTrie is packed index table 8014 if (idx != EMPTY_CASE_TRIE) 8015 { 8016 if (idx2 != EMPTY_CASE_TRIE) 8017 {// both cased chars 8018 // adjust idx --> start of bucket 8019 idx = idx - sTable[idx].n; 8020 idx2 = idx2 - sTable[idx2].n; 8021 if (idx == idx2)// one bucket, equivalent chars 8022 continue; 8023 else// not the same bucket 8024 diff = sTable[idx].ch - sTable[idx2].ch; 8025 } 8026 else 8027 diff = sTable[idx - sTable[idx].n].ch - rhs; 8028 } 8029 else if (idx2 != EMPTY_CASE_TRIE) 8030 { 8031 diff = lhs - sTable[idx2 - sTable[idx2].n].ch; 8032 } 8033 // one of chars is not cased at all 8034 return diff; 8035 } 8036 return int(r2.empty) - 1; 8037 } 8038 8039 /// 8040 @safe @nogc pure nothrow unittest 8041 { 8042 assert(sicmp("Август", "авгусТ") == 0); 8043 // Greek also works as long as there is no 1:M mapping in sight 8044 assert(sicmp("ΌΎ", "όύ") == 0); 8045 // things like the following won't get matched as equal 8046 // Greek small letter iota with dialytika and tonos 8047 assert(sicmp("ΐ", "\u03B9\u0308\u0301") != 0); 8048 8049 // while icmp has no problem with that 8050 assert(icmp("ΐ", "\u03B9\u0308\u0301") == 0); 8051 assert(icmp("ΌΎ", "όύ") == 0); 8052 } 8053 8054 // overloads for the most common cases to reduce compile time 8055 @safe @nogc pure nothrow 8056 { 8057 int sicmp(scope const(char)[] str1, scope const(char)[] str2) 8058 { return sicmp!(const(char)[], const(char)[])(str1, str2); } 8059 8060 int sicmp(scope const(wchar)[] str1, scope const(wchar)[] str2) 8061 { return sicmp!(const(wchar)[], const(wchar)[])(str1, str2); } 8062 8063 int sicmp(scope const(dchar)[] str1, scope const(dchar)[] str2) 8064 { return sicmp!(const(dchar)[], const(dchar)[])(str1, str2); } 8065 } 8066 8067 private int fullCasedCmp(Range)(dchar lhs, dchar rhs, ref Range rtail) 8068 { 8069 import std.algorithm.searching : skipOver; 8070 import std.internal.unicode_tables : fullCaseTable; // generated file 8071 alias fTable = fullCaseTable; 8072 size_t idx = fullCaseTrie[lhs]; 8073 // fullCaseTrie is packed index table 8074 if (idx == EMPTY_CASE_TRIE) 8075 return lhs; 8076 immutable start = idx - fTable[idx].n; 8077 immutable end = fTable[idx].size + start; 8078 assert(fTable[start].entry_len == 1); 8079 for (idx=start; idx<end; idx++) 8080 { 8081 auto entryLen = fTable[idx].entry_len; 8082 if (entryLen == 1) 8083 { 8084 if (fTable[idx].seq[0] == rhs) 8085 { 8086 return 0; 8087 } 8088 } 8089 else 8090 {// OK it's a long chunk, like 'ss' for German 8091 dstring seq = fTable[idx].seq[0 .. entryLen]; 8092 if (rhs == seq[0] 8093 && rtail.skipOver(seq[1..$])) 8094 { 8095 // note that this path modifies rtail 8096 // iff we managed to get there 8097 return 0; 8098 } 8099 } 8100 } 8101 return fTable[start].seq[0]; // new remapped character for accurate diffs 8102 } 8103 8104 /++ 8105 Does case insensitive comparison of `r1` and `r2`. 8106 Follows the rules of full case-folding mapping. 8107 This includes matching as equal german ß with "ss" and 8108 other 1:M $(CODEPOINT) mappings unlike $(LREF sicmp). 8109 The cost of `icmp` being pedantically correct is 8110 slightly worse performance. 8111 8112 Params: 8113 r1 = a forward range of characters 8114 r2 = a forward range of characters 8115 8116 Returns: 8117 An `int` that is 0 if the strings match, 8118 <0 if `str1` is lexicographically "less" than `str2`, 8119 >0 if `str1` is lexicographically "greater" than `str2` 8120 8121 See_Also: 8122 $(LREF sicmp) 8123 $(REF cmp, std,algorithm,comparison) 8124 +/ 8125 int icmp(S1, S2)(S1 r1, S2 r2) 8126 if (isForwardRange!S1 && isSomeChar!(ElementEncodingType!S1) 8127 && isForwardRange!S2 && isSomeChar!(ElementEncodingType!S2)) 8128 { 8129 import std.range.primitives : isInfinite; 8130 import std.traits : isDynamicArray; 8131 import std.utf : byDchar; 8132 static import std.ascii; 8133 8134 static if ((isDynamicArray!S1 || isRandomAccessRange!S1) 8135 && (isDynamicArray!S2 || isRandomAccessRange!S2) 8136 && !(isInfinite!S1 && isInfinite!S2) 8137 && __traits(compiles, 8138 { 8139 size_t s = size_t.max / 2; 8140 r1 = r1[s .. $]; 8141 r2 = r2[s .. $]; 8142 })) 8143 {{ 8144 // ASCII optimization for dynamic arrays & similar. 8145 size_t i = 0; 8146 static if (isInfinite!S1) 8147 immutable end = r2.length; 8148 else static if (isInfinite!S2) 8149 immutable end = r1.length; 8150 else 8151 immutable end = r1.length > r2.length ? r2.length : r1.length; 8152 for (; i < end; ++i) 8153 { 8154 auto lhs = r1[i]; 8155 auto rhs = r2[i]; 8156 if ((lhs | rhs) >= 0x80) goto NonAsciiPath; 8157 if (lhs == rhs) continue; 8158 auto lowDiff = std.ascii.toLower(lhs) - std.ascii.toLower(rhs); 8159 if (lowDiff) return lowDiff; 8160 } 8161 static if (isInfinite!S1) 8162 return 1; 8163 else static if (isInfinite!S2) 8164 return -1; 8165 else 8166 return (r1.length > r2.length) - (r2.length > r1.length); 8167 8168 NonAsciiPath: 8169 r1 = r1[i .. $]; 8170 r2 = r2[i .. $]; 8171 // Fall through to standard case. 8172 }} 8173 8174 auto str1 = r1.byDchar; 8175 auto str2 = r2.byDchar; 8176 8177 for (;;) 8178 { 8179 if (str1.empty) 8180 return str2.empty ? 0 : -1; 8181 immutable lhs = str1.front; 8182 if (str2.empty) 8183 return 1; 8184 immutable rhs = str2.front; 8185 str1.popFront(); 8186 str2.popFront(); 8187 if (!(lhs - rhs)) 8188 continue; 8189 // first try to match lhs to <rhs,right-tail> sequence 8190 immutable cmpLR = fullCasedCmp(lhs, rhs, str2); 8191 if (!cmpLR) 8192 continue; 8193 // then rhs to <lhs,left-tail> sequence 8194 immutable cmpRL = fullCasedCmp(rhs, lhs, str1); 8195 if (!cmpRL) 8196 continue; 8197 // cmpXX contain remapped codepoints 8198 // to obtain stable ordering of icmp 8199 return cmpLR - cmpRL; 8200 } 8201 } 8202 8203 /// 8204 @safe @nogc pure nothrow unittest 8205 { 8206 assert(icmp("Rußland", "Russland") == 0); 8207 assert(icmp("ᾩ -> \u1F70\u03B9", "\u1F61\u03B9 -> ᾲ") == 0); 8208 } 8209 8210 /** 8211 * By using $(REF byUTF, std,utf) and its aliases, GC allocations via auto-decoding 8212 * and thrown exceptions can be avoided, making `icmp` `@safe @nogc nothrow pure`. 8213 */ 8214 @safe @nogc nothrow pure unittest 8215 { 8216 import std.utf : byDchar; 8217 8218 assert(icmp("Rußland".byDchar, "Russland".byDchar) == 0); 8219 assert(icmp("ᾩ -> \u1F70\u03B9".byDchar, "\u1F61\u03B9 -> ᾲ".byDchar) == 0); 8220 } 8221 8222 // test different character types 8223 @safe unittest 8224 { 8225 assert(icmp("Rußland", "Russland") == 0); 8226 assert(icmp("Rußland"w, "Russland") == 0); 8227 assert(icmp("Rußland", "Russland"w) == 0); 8228 assert(icmp("Rußland"w, "Russland"w) == 0); 8229 assert(icmp("Rußland"d, "Russland"w) == 0); 8230 assert(icmp("Rußland"w, "Russland"d) == 0); 8231 } 8232 8233 // overloads for the most common cases to reduce compile time 8234 @safe @nogc pure nothrow 8235 { 8236 int icmp(const(char)[] str1, const(char)[] str2) 8237 { return icmp!(const(char)[], const(char)[])(str1, str2); } 8238 int icmp(const(wchar)[] str1, const(wchar)[] str2) 8239 { return icmp!(const(wchar)[], const(wchar)[])(str1, str2); } 8240 int icmp(const(dchar)[] str1, const(dchar)[] str2) 8241 { return icmp!(const(dchar)[], const(dchar)[])(str1, str2); } 8242 } 8243 8244 @safe unittest 8245 { 8246 import std.algorithm.sorting : sort; 8247 import std.conv : to; 8248 import std.exception : assertCTFEable; 8249 assertCTFEable!( 8250 { 8251 static foreach (cfunc; AliasSeq!(icmp, sicmp)) 8252 {{ 8253 static foreach (S1; AliasSeq!(string, wstring, dstring)) 8254 static foreach (S2; AliasSeq!(string, wstring, dstring)) 8255 { 8256 assert(cfunc("".to!S1(), "".to!S2()) == 0); 8257 assert(cfunc("A".to!S1(), "".to!S2()) > 0); 8258 assert(cfunc("".to!S1(), "0".to!S2()) < 0); 8259 assert(cfunc("abc".to!S1(), "abc".to!S2()) == 0); 8260 assert(cfunc("abcd".to!S1(), "abc".to!S2()) > 0); 8261 assert(cfunc("abc".to!S1(), "abcd".to!S2()) < 0); 8262 assert(cfunc("Abc".to!S1(), "aBc".to!S2()) == 0); 8263 assert(cfunc("авГуст".to!S1(), "АВгУСТ".to!S2()) == 0); 8264 // Check example: 8265 assert(cfunc("Август".to!S1(), "авгусТ".to!S2()) == 0); 8266 assert(cfunc("ΌΎ".to!S1(), "όύ".to!S2()) == 0); 8267 } 8268 // check that the order is properly agnostic to the case 8269 auto strs = [ "Apple", "ORANGE", "orAcle", "amp", "banana"]; 8270 sort!((a,b) => cfunc(a,b) < 0)(strs); 8271 assert(strs == ["amp", "Apple", "banana", "orAcle", "ORANGE"]); 8272 }} 8273 assert(icmp("ßb", "ssa") > 0); 8274 // Check example: 8275 assert(icmp("Russland", "Rußland") == 0); 8276 assert(icmp("ᾩ -> \u1F70\u03B9", "\u1F61\u03B9 -> ᾲ") == 0); 8277 assert(icmp("ΐ"w, "\u03B9\u0308\u0301") == 0); 8278 assert(sicmp("ΐ", "\u03B9\u0308\u0301") != 0); 8279 // https://issues.dlang.org/show_bug.cgi?id=11057 8280 assert( icmp("K", "L") < 0 ); 8281 }); 8282 } 8283 8284 // https://issues.dlang.org/show_bug.cgi?id=17372 8285 @safe pure unittest 8286 { 8287 import std.algorithm.iteration : joiner, map; 8288 import std.algorithm.sorting : sort; 8289 import std.array : array; 8290 auto a = [["foo", "bar"], ["baz"]].map!(line => line.joiner(" ")).array.sort!((a, b) => icmp(a, b) < 0); 8291 } 8292 8293 // This is package(std) for the moment to be used as a support tool for std.regex 8294 // It needs a better API 8295 /* 8296 Return a range of all $(CODEPOINTS) that casefold to 8297 and from this `ch`. 8298 */ 8299 package(std) auto simpleCaseFoldings(dchar ch) @safe 8300 { 8301 import std.internal.unicode_tables : simpleCaseTable; // generated file 8302 alias sTable = simpleCaseTable; 8303 static struct Range 8304 { 8305 @safe pure nothrow: 8306 uint idx; //if == uint.max, then read c. 8307 union 8308 { 8309 dchar c; // == 0 - empty range 8310 uint len; 8311 } 8312 @property bool isSmall() const { return idx == uint.max; } 8313 8314 this(dchar ch) 8315 { 8316 idx = uint.max; 8317 c = ch; 8318 } 8319 8320 this(uint start, uint size) 8321 { 8322 idx = start; 8323 len = size; 8324 } 8325 8326 @property dchar front() const 8327 { 8328 assert(!empty); 8329 if (isSmall) 8330 { 8331 return c; 8332 } 8333 auto ch = sTable[idx].ch; 8334 return ch; 8335 } 8336 8337 @property bool empty() const 8338 { 8339 if (isSmall) 8340 { 8341 return c == 0; 8342 } 8343 return len == 0; 8344 } 8345 8346 @property size_t length() const 8347 { 8348 if (isSmall) 8349 { 8350 return c == 0 ? 0 : 1; 8351 } 8352 return len; 8353 } 8354 8355 void popFront() 8356 { 8357 if (isSmall) 8358 c = 0; 8359 else 8360 { 8361 idx++; 8362 len--; 8363 } 8364 } 8365 } 8366 immutable idx = simpleCaseTrie[ch]; 8367 if (idx == EMPTY_CASE_TRIE) 8368 return Range(ch); 8369 auto entry = sTable[idx]; 8370 immutable start = idx - entry.n; 8371 return Range(start, entry.size); 8372 } 8373 8374 @safe unittest 8375 { 8376 import std.algorithm.comparison : equal; 8377 import std.algorithm.searching : canFind; 8378 import std.array : array; 8379 import std.exception : assertCTFEable; 8380 assertCTFEable!((){ 8381 auto r = simpleCaseFoldings('Э').array; 8382 assert(r.length == 2); 8383 assert(r.canFind('э') && r.canFind('Э')); 8384 auto sr = simpleCaseFoldings('~'); 8385 assert(sr.equal("~")); 8386 //A with ring above - casefolds to the same bucket as Angstrom sign 8387 sr = simpleCaseFoldings('Å'); 8388 assert(sr.length == 3); 8389 assert(sr.canFind('å') && sr.canFind('Å') && sr.canFind('\u212B')); 8390 }); 8391 } 8392 8393 /++ 8394 $(P Returns the $(S_LINK Combining class, combining class) of `ch`.) 8395 +/ 8396 ubyte combiningClass(dchar ch) @safe pure nothrow @nogc 8397 { 8398 return combiningClassTrie[ch]; 8399 } 8400 8401 /// 8402 @safe unittest 8403 { 8404 // shorten the code 8405 alias CC = combiningClass; 8406 8407 // combining tilda 8408 assert(CC('\u0303') == 230); 8409 // combining ring below 8410 assert(CC('\u0325') == 220); 8411 // the simple consequence is that "tilda" should be 8412 // placed after a "ring below" in a sequence 8413 } 8414 8415 @safe pure nothrow @nogc unittest 8416 { 8417 foreach (ch; 0 .. 0x80) 8418 assert(combiningClass(ch) == 0); 8419 assert(combiningClass('\u05BD') == 22); 8420 assert(combiningClass('\u0300') == 230); 8421 assert(combiningClass('\u0317') == 220); 8422 assert(combiningClass('\u1939') == 222); 8423 } 8424 8425 /// Unicode character decomposition type. 8426 enum UnicodeDecomposition { 8427 /// Canonical decomposition. The result is canonically equivalent sequence. 8428 Canonical, 8429 /** 8430 Compatibility decomposition. The result is compatibility equivalent sequence. 8431 Note: Compatibility decomposition is a $(B lossy) conversion, 8432 typically suitable only for fuzzy matching and internal processing. 8433 */ 8434 Compatibility 8435 } 8436 8437 /** 8438 Shorthand aliases for character decomposition type, passed as a 8439 template parameter to $(LREF decompose). 8440 */ 8441 enum { 8442 Canonical = UnicodeDecomposition.Canonical, 8443 Compatibility = UnicodeDecomposition.Compatibility 8444 } 8445 8446 /++ 8447 Try to canonically compose 2 $(CHARACTERS). 8448 Returns the composed $(CHARACTER) if they do compose and dchar.init otherwise. 8449 8450 The assumption is that `first` comes before `second` in the original text, 8451 usually meaning that the first is a starter. 8452 8453 Note: Hangul syllables are not covered by this function. 8454 See `composeJamo` below. 8455 +/ 8456 public dchar compose(dchar first, dchar second) pure nothrow @safe 8457 { 8458 import std.algorithm.iteration : map; 8459 import std.internal.unicode_comp : compositionTable, composeCntShift, composeIdxMask; 8460 import std.range : assumeSorted; 8461 immutable packed = compositionJumpTrie[first]; 8462 if (packed == ushort.max) 8463 return dchar.init; 8464 // unpack offset and length 8465 immutable idx = packed & composeIdxMask, cnt = packed >> composeCntShift; 8466 // TODO: optimize this micro binary search (no more then 4-5 steps) 8467 auto r = compositionTable[idx .. idx+cnt].map!"a.rhs"().assumeSorted(); 8468 immutable target = r.lowerBound(second).length; 8469 if (target == cnt) 8470 return dchar.init; 8471 immutable entry = compositionTable[idx+target]; 8472 if (entry.rhs != second) 8473 return dchar.init; 8474 return entry.composed; 8475 } 8476 8477 /// 8478 @safe unittest 8479 { 8480 assert(compose('A','\u0308') == '\u00C4'); 8481 assert(compose('A', 'B') == dchar.init); 8482 assert(compose('C', '\u0301') == '\u0106'); 8483 // note that the starter is the first one 8484 // thus the following doesn't compose 8485 assert(compose('\u0308', 'A') == dchar.init); 8486 } 8487 8488 /++ 8489 Returns a full $(S_LINK Canonical decomposition, Canonical) 8490 (by default) or $(S_LINK Compatibility decomposition, Compatibility) 8491 decomposition of $(CHARACTER) `ch`. 8492 If no decomposition is available returns a $(LREF Grapheme) 8493 with the `ch` itself. 8494 8495 Note: 8496 This function also decomposes hangul syllables 8497 as prescribed by the standard. 8498 8499 See_Also: $(LREF decomposeHangul) for a restricted version 8500 that takes into account only hangul syllables but 8501 no other decompositions. 8502 +/ 8503 public Grapheme decompose(UnicodeDecomposition decompType=Canonical)(dchar ch) @safe 8504 { 8505 import std.algorithm.searching : until; 8506 import std.internal.unicode_decomp : decompCompatTable, decompCanonTable; 8507 static if (decompType == Canonical) 8508 { 8509 alias table = decompCanonTable; 8510 alias mapping = canonMappingTrie; 8511 } 8512 else static if (decompType == Compatibility) 8513 { 8514 alias table = decompCompatTable; 8515 alias mapping = compatMappingTrie; 8516 } 8517 immutable idx = mapping[ch]; 8518 if (!idx) // not found, check hangul arithmetic decomposition 8519 return decomposeHangul(ch); 8520 auto decomp = table[idx..$].until(0); 8521 return Grapheme(decomp); 8522 } 8523 8524 /// 8525 @safe unittest 8526 { 8527 import std.algorithm.comparison : equal; 8528 8529 assert(compose('A','\u0308') == '\u00C4'); 8530 assert(compose('A', 'B') == dchar.init); 8531 assert(compose('C', '\u0301') == '\u0106'); 8532 // note that the starter is the first one 8533 // thus the following doesn't compose 8534 assert(compose('\u0308', 'A') == dchar.init); 8535 8536 assert(decompose('Ĉ')[].equal("C\u0302")); 8537 assert(decompose('D')[].equal("D")); 8538 assert(decompose('\uD4DC')[].equal("\u1111\u1171\u11B7")); 8539 assert(decompose!Compatibility('¹')[].equal("1")); 8540 } 8541 8542 //---------------------------------------------------------------------------- 8543 // Hangul specific composition/decomposition 8544 enum jamoSBase = 0xAC00; 8545 enum jamoLBase = 0x1100; 8546 enum jamoVBase = 0x1161; 8547 enum jamoTBase = 0x11A7; 8548 enum jamoLCount = 19, jamoVCount = 21, jamoTCount = 28; 8549 enum jamoNCount = jamoVCount * jamoTCount; 8550 enum jamoSCount = jamoLCount * jamoNCount; 8551 8552 // Tests if `ch` is a Hangul leading consonant jamo. 8553 bool isJamoL(dchar ch) pure nothrow @nogc @safe 8554 { 8555 // first cmp rejects ~ 1M code points above leading jamo range 8556 return ch < jamoLBase+jamoLCount && ch >= jamoLBase; 8557 } 8558 8559 // Tests if `ch` is a Hangul vowel jamo. 8560 bool isJamoT(dchar ch) pure nothrow @nogc @safe 8561 { 8562 // first cmp rejects ~ 1M code points above trailing jamo range 8563 // Note: ch == jamoTBase doesn't indicate trailing jamo (TIndex must be > 0) 8564 return ch < jamoTBase+jamoTCount && ch > jamoTBase; 8565 } 8566 8567 // Tests if `ch` is a Hangul trailnig consonant jamo. 8568 bool isJamoV(dchar ch) pure nothrow @nogc @safe 8569 { 8570 // first cmp rejects ~ 1M code points above vowel range 8571 return ch < jamoVBase+jamoVCount && ch >= jamoVBase; 8572 } 8573 8574 int hangulSyllableIndex(dchar ch) pure nothrow @nogc @safe 8575 { 8576 int idxS = cast(int) ch - jamoSBase; 8577 return idxS >= 0 && idxS < jamoSCount ? idxS : -1; 8578 } 8579 8580 // internal helper: compose hangul syllables leaving dchar.init in holes 8581 void hangulRecompose(scope dchar[] seq) pure nothrow @nogc @safe 8582 { 8583 for (size_t idx = 0; idx + 1 < seq.length; ) 8584 { 8585 if (isJamoL(seq[idx]) && isJamoV(seq[idx+1])) 8586 { 8587 immutable int indexL = seq[idx] - jamoLBase; 8588 immutable int indexV = seq[idx+1] - jamoVBase; 8589 immutable int indexLV = indexL * jamoNCount + indexV * jamoTCount; 8590 if (idx + 2 < seq.length && isJamoT(seq[idx+2])) 8591 { 8592 seq[idx] = jamoSBase + indexLV + seq[idx+2] - jamoTBase; 8593 seq[idx+1] = dchar.init; 8594 seq[idx+2] = dchar.init; 8595 idx += 3; 8596 } 8597 else 8598 { 8599 seq[idx] = jamoSBase + indexLV; 8600 seq[idx+1] = dchar.init; 8601 idx += 2; 8602 } 8603 } 8604 else 8605 idx++; 8606 } 8607 } 8608 8609 //---------------------------------------------------------------------------- 8610 public: 8611 8612 /** 8613 Decomposes a Hangul syllable. If `ch` is not a composed syllable 8614 then this function returns $(LREF Grapheme) containing only `ch` as is. 8615 */ 8616 Grapheme decomposeHangul(dchar ch) nothrow pure @safe 8617 { 8618 immutable idxS = cast(int) ch - jamoSBase; 8619 if (idxS < 0 || idxS >= jamoSCount) return Grapheme(ch); 8620 immutable idxL = idxS / jamoNCount; 8621 immutable idxV = (idxS % jamoNCount) / jamoTCount; 8622 immutable idxT = idxS % jamoTCount; 8623 8624 immutable partL = jamoLBase + idxL; 8625 immutable partV = jamoVBase + idxV; 8626 if (idxT > 0) // there is a trailling consonant (T); <L,V,T> decomposition 8627 return Grapheme(partL, partV, jamoTBase + idxT); 8628 else // <L, V> decomposition 8629 return Grapheme(partL, partV); 8630 } 8631 8632 /// 8633 @safe unittest 8634 { 8635 import std.algorithm.comparison : equal; 8636 assert(decomposeHangul('\uD4DB')[].equal("\u1111\u1171\u11B6")); 8637 } 8638 8639 /++ 8640 Try to compose hangul syllable out of a leading consonant (`lead`), 8641 a `vowel` and optional `trailing` consonant jamos. 8642 8643 On success returns the composed LV or LVT hangul syllable. 8644 8645 If any of `lead` and `vowel` are not a valid hangul jamo 8646 of the respective $(CHARACTER) class returns dchar.init. 8647 +/ 8648 dchar composeJamo(dchar lead, dchar vowel, dchar trailing=dchar.init) pure nothrow @nogc @safe 8649 { 8650 if (!isJamoL(lead)) 8651 return dchar.init; 8652 immutable indexL = lead - jamoLBase; 8653 if (!isJamoV(vowel)) 8654 return dchar.init; 8655 immutable indexV = vowel - jamoVBase; 8656 immutable indexLV = indexL * jamoNCount + indexV * jamoTCount; 8657 immutable dchar syllable = jamoSBase + indexLV; 8658 return isJamoT(trailing) ? syllable + (trailing - jamoTBase) : syllable; 8659 } 8660 8661 /// 8662 @safe unittest 8663 { 8664 assert(composeJamo('\u1111', '\u1171', '\u11B6') == '\uD4DB'); 8665 // leaving out T-vowel, or passing any codepoint 8666 // that is not trailing consonant composes an LV-syllable 8667 assert(composeJamo('\u1111', '\u1171') == '\uD4CC'); 8668 assert(composeJamo('\u1111', '\u1171', ' ') == '\uD4CC'); 8669 assert(composeJamo('\u1111', 'A') == dchar.init); 8670 assert(composeJamo('A', '\u1171') == dchar.init); 8671 } 8672 8673 @safe unittest 8674 { 8675 import std.algorithm.comparison : equal; 8676 import std.conv : text; 8677 8678 static void testDecomp(UnicodeDecomposition T)(dchar ch, string r) 8679 { 8680 Grapheme g = decompose!T(ch); 8681 assert(equal(g[], r), text(g[], " vs ", r)); 8682 } 8683 testDecomp!Canonical('\u1FF4', "\u03C9\u0301\u0345"); 8684 testDecomp!Canonical('\uF907', "\u9F9C"); 8685 testDecomp!Compatibility('\u33FF', "\u0067\u0061\u006C"); 8686 testDecomp!Compatibility('\uA7F9', "\u0153"); 8687 8688 // check examples 8689 assert(decomposeHangul('\uD4DB')[].equal("\u1111\u1171\u11B6")); 8690 assert(composeJamo('\u1111', '\u1171', '\u11B6') == '\uD4DB'); 8691 assert(composeJamo('\u1111', '\u1171') == '\uD4CC'); // leave out T-vowel 8692 assert(composeJamo('\u1111', '\u1171', ' ') == '\uD4CC'); 8693 assert(composeJamo('\u1111', 'A') == dchar.init); 8694 assert(composeJamo('A', '\u1171') == dchar.init); 8695 } 8696 8697 /** 8698 Enumeration type for normalization forms, 8699 passed as template parameter for functions like $(LREF normalize). 8700 */ 8701 enum NormalizationForm { 8702 NFC, 8703 NFD, 8704 NFKC, 8705 NFKD 8706 } 8707 8708 8709 enum { 8710 /** 8711 Shorthand aliases from values indicating normalization forms. 8712 */ 8713 NFC = NormalizationForm.NFC, 8714 ///ditto 8715 NFD = NormalizationForm.NFD, 8716 ///ditto 8717 NFKC = NormalizationForm.NFKC, 8718 ///ditto 8719 NFKD = NormalizationForm.NFKD 8720 } 8721 8722 /++ 8723 Returns `input` string normalized to the chosen form. 8724 Form C is used by default. 8725 8726 For more information on normalization forms see 8727 the $(S_LINK Normalization, normalization section). 8728 8729 Note: 8730 In cases where the string in question is already normalized, 8731 it is returned unmodified and no memory allocation happens. 8732 +/ 8733 /* 8734 WARNING: @trusted lambda inside - handle with same care as @trusted 8735 functions 8736 8737 Despite being a template, the attributes do no harm since this doesn't work 8738 with user-defined range or character types anyway. 8739 */ 8740 pure @safe inout(C)[] normalize(NormalizationForm norm=NFC, C) 8741 (return scope inout(C)[] input) 8742 { 8743 import std.algorithm.mutation : SwapStrategy; 8744 import std.algorithm.sorting : sort; 8745 import std.array : appender; 8746 import std.range : zip; 8747 8748 auto anchors = splitNormalized!norm(input); 8749 if (anchors[0] == input.length && anchors[1] == input.length) 8750 return input; 8751 dchar[] decomposed; 8752 decomposed.reserve(31); 8753 ubyte[] ccc; 8754 ccc.reserve(31); 8755 auto app = appender!(C[])(); 8756 do 8757 { 8758 app.put(input[0 .. anchors[0]]); 8759 foreach (dchar ch; input[anchors[0]..anchors[1]]) 8760 static if (norm == NFD || norm == NFC) 8761 { 8762 foreach (dchar c; decompose!Canonical(ch)[]) 8763 decomposed ~= c; 8764 } 8765 else // NFKD & NFKC 8766 { 8767 foreach (dchar c; decompose!Compatibility(ch)[]) 8768 decomposed ~= c; 8769 } 8770 ccc.length = decomposed.length; 8771 size_t firstNonStable = 0; 8772 ubyte lastClazz = 0; 8773 8774 foreach (idx, dchar ch; decomposed) 8775 { 8776 immutable clazz = combiningClass(ch); 8777 ccc[idx] = clazz; 8778 if (clazz == 0 && lastClazz != 0) 8779 { 8780 // found a stable code point after unstable ones 8781 sort!("a[0] < b[0]", SwapStrategy.stable) 8782 (zip(ccc[firstNonStable .. idx], decomposed[firstNonStable .. idx])); 8783 firstNonStable = decomposed.length; 8784 } 8785 else if (clazz != 0 && lastClazz == 0) 8786 { 8787 // found first unstable code point after stable ones 8788 firstNonStable = idx; 8789 } 8790 lastClazz = clazz; 8791 } 8792 sort!("a[0] < b[0]", SwapStrategy.stable) 8793 (zip(ccc[firstNonStable..$], decomposed[firstNonStable..$])); 8794 static if (norm == NFC || norm == NFKC) 8795 { 8796 import std.algorithm.searching : countUntil; 8797 auto first = countUntil(ccc, 0); 8798 if (first >= 0) // no starters?? no recomposition 8799 { 8800 for (;;) 8801 { 8802 immutable second = recompose(first, decomposed, ccc); 8803 if (second == decomposed.length) 8804 break; 8805 first = second; 8806 } 8807 // 2nd pass for hangul syllables 8808 hangulRecompose(decomposed); 8809 } 8810 } 8811 static if (norm == NFD || norm == NFKD) 8812 app.put(decomposed); 8813 else 8814 { 8815 import std.algorithm.mutation : remove; 8816 auto clean = remove!("a == dchar.init", SwapStrategy.stable)(decomposed); 8817 app.put(decomposed[0 .. clean.length]); 8818 } 8819 // reset variables 8820 decomposed.length = 0; 8821 () @trusted { 8822 // assumeSafeAppend isn't considered pure as of writing, hence the 8823 // cast. It isn't pure in the sense that the elements after 8824 // the array in question are affected, but we don't use those 8825 // making the call pure for our purposes. 8826 (cast(void delegate() pure nothrow) {decomposed.assumeSafeAppend();})(); 8827 ccc.length = 0; 8828 (cast(void delegate() pure nothrow) {ccc.assumeSafeAppend();})(); 8829 } (); 8830 input = input[anchors[1]..$]; 8831 // and move on 8832 anchors = splitNormalized!norm(input); 8833 } while (anchors[0] != input.length); 8834 app.put(input[0 .. anchors[0]]); 8835 return () @trusted inout { return cast(inout(C)[]) app.data; } (); 8836 } 8837 8838 /// 8839 @safe pure unittest 8840 { 8841 // any encoding works 8842 wstring greet = "Hello world"; 8843 assert(normalize(greet) is greet); // the same exact slice 8844 8845 // An example of a character with all 4 forms being different: 8846 // Greek upsilon with acute and hook symbol (code point 0x03D3) 8847 assert(normalize!NFC("ϓ") == "\u03D3"); 8848 assert(normalize!NFD("ϓ") == "\u03D2\u0301"); 8849 assert(normalize!NFKC("ϓ") == "\u038E"); 8850 assert(normalize!NFKD("ϓ") == "\u03A5\u0301"); 8851 } 8852 8853 @safe pure unittest 8854 { 8855 import std.conv : text; 8856 8857 assert(normalize!NFD("abc\uF904def") == "abc\u6ED1def", text(normalize!NFD("abc\uF904def"))); 8858 assert(normalize!NFKD("2¹⁰") == "210", normalize!NFKD("2¹⁰")); 8859 assert(normalize!NFD("Äffin") == "A\u0308ffin"); 8860 8861 // test with dstring 8862 dstring greet = "Hello world"; 8863 assert(normalize(greet) is greet); // the same exact slice 8864 } 8865 8866 // canonically recompose given slice of code points, works in-place and mutates data 8867 private size_t recompose(size_t start, scope dchar[] input, scope ubyte[] ccc) pure nothrow @safe 8868 { 8869 assert(input.length == ccc.length); 8870 int accumCC = -1;// so that it's out of 0 .. 255 range 8871 // writefln("recomposing %( %04x %)", input); 8872 // first one is always a starter thus we start at i == 1 8873 size_t i = start+1; 8874 for (; ; ) 8875 { 8876 if (i == input.length) 8877 break; 8878 immutable curCC = ccc[i]; 8879 // In any character sequence beginning with a starter S 8880 // a character C is blocked from S if and only if there 8881 // is some character B between S and C, and either B 8882 // is a starter or it has the same or higher combining class as C. 8883 //------------------------ 8884 // Applying to our case: 8885 // S is input[0] 8886 // accumCC is the maximum CCC of characters between C and S, 8887 // as ccc are sorted 8888 // C is input[i] 8889 8890 if (curCC > accumCC) 8891 { 8892 immutable comp = compose(input[start], input[i]); 8893 if (comp != dchar.init) 8894 { 8895 input[start] = comp; 8896 input[i] = dchar.init;// put a sentinel 8897 // current was merged so its CCC shouldn't affect 8898 // composing with the next one 8899 } 8900 else 8901 { 8902 // if it was a starter then accumCC is now 0, end of loop 8903 accumCC = curCC; 8904 if (accumCC == 0) 8905 break; 8906 } 8907 } 8908 else 8909 { 8910 // ditto here 8911 accumCC = curCC; 8912 if (accumCC == 0) 8913 break; 8914 } 8915 i++; 8916 } 8917 return i; 8918 } 8919 8920 // returns tuple of 2 indexes that delimit: 8921 // normalized text, piece that needs normalization and 8922 // the rest of input starting with stable code point 8923 private auto splitNormalized(NormalizationForm norm, C)(scope const(C)[] input) 8924 { 8925 import std.typecons : tuple; 8926 ubyte lastCC = 0; 8927 8928 foreach (idx, dchar ch; input) 8929 { 8930 static if (norm == NFC) 8931 if (ch < 0x0300) 8932 { 8933 lastCC = 0; 8934 continue; 8935 } 8936 immutable ubyte CC = combiningClass(ch); 8937 if (lastCC > CC && CC != 0) 8938 { 8939 return seekStable!norm(idx, input); 8940 } 8941 8942 if (notAllowedIn!norm(ch)) 8943 { 8944 return seekStable!norm(idx, input); 8945 } 8946 lastCC = CC; 8947 } 8948 return tuple(input.length, input.length); 8949 } 8950 8951 private auto seekStable(NormalizationForm norm, C)(size_t idx, const scope C[] input) 8952 { 8953 import std.typecons : tuple; 8954 import std.utf : codeLength; 8955 8956 auto br = input[0 .. idx]; 8957 size_t region_start = 0;// default 8958 for (;;) 8959 { 8960 if (br.empty)// start is 0 8961 break; 8962 dchar ch = br.back; 8963 if (combiningClass(ch) == 0 && allowedIn!norm(ch)) 8964 { 8965 region_start = br.length - codeLength!C(ch); 8966 break; 8967 } 8968 br.popFront(); 8969 } 8970 ///@@@BUG@@@ can't use find: " find is a nested function and can't be used..." 8971 size_t region_end=input.length;// end is $ by default 8972 foreach (i, dchar ch; input[idx..$]) 8973 { 8974 if (combiningClass(ch) == 0 && allowedIn!norm(ch)) 8975 { 8976 region_end = i+idx; 8977 break; 8978 } 8979 } 8980 // writeln("Region to normalize: ", input[region_start .. region_end]); 8981 return tuple(region_start, region_end); 8982 } 8983 8984 /** 8985 Tests if dchar `ch` is always allowed (Quick_Check=YES) in normalization 8986 form `norm`. 8987 */ 8988 public bool allowedIn(NormalizationForm norm)(dchar ch) 8989 { 8990 return !notAllowedIn!norm(ch); 8991 } 8992 8993 /// 8994 @safe unittest 8995 { 8996 // e.g. Cyrillic is always allowed, so is ASCII 8997 assert(allowedIn!NFC('я')); 8998 assert(allowedIn!NFD('я')); 8999 assert(allowedIn!NFKC('я')); 9000 assert(allowedIn!NFKD('я')); 9001 assert(allowedIn!NFC('Z')); 9002 } 9003 9004 // not user friendly name but more direct 9005 private bool notAllowedIn(NormalizationForm norm)(dchar ch) 9006 { 9007 static if (norm == NFC) 9008 alias qcTrie = nfcQCTrie; 9009 else static if (norm == NFD) 9010 alias qcTrie = nfdQCTrie; 9011 else static if (norm == NFKC) 9012 alias qcTrie = nfkcQCTrie; 9013 else static if (norm == NFKD) 9014 alias qcTrie = nfkdQCTrie; 9015 else 9016 static assert("Unknown normalization form "~norm); 9017 return qcTrie[ch]; 9018 } 9019 9020 @safe unittest 9021 { 9022 assert(allowedIn!NFC('я')); 9023 assert(allowedIn!NFD('я')); 9024 assert(allowedIn!NFKC('я')); 9025 assert(allowedIn!NFKD('я')); 9026 assert(allowedIn!NFC('Z')); 9027 } 9028 9029 } 9030 9031 version (std_uni_bootstrap) 9032 { 9033 // old version used for bootstrapping of gen_uni.d that generates 9034 // up to date optimal versions of all of isXXX functions 9035 @safe pure nothrow @nogc public bool isWhite(dchar c) 9036 { 9037 import std.ascii : isWhite; 9038 return isWhite(c) || 9039 c == lineSep || c == paraSep || 9040 c == '\u0085' || c == '\u00A0' || c == '\u1680' || c == '\u180E' || 9041 (c >= '\u2000' && c <= '\u200A') || 9042 c == '\u202F' || c == '\u205F' || c == '\u3000'; 9043 } 9044 } 9045 else 9046 { 9047 9048 // trusted -> avoid bounds check 9049 @trusted pure nothrow @nogc private 9050 { 9051 import std.internal.unicode_tables; // : toLowerTable, toTitleTable, toUpperTable; // generated file 9052 9053 // hide template instances behind functions 9054 // https://issues.dlang.org/show_bug.cgi?id=13232 9055 ushort toLowerIndex(dchar c) { return toLowerIndexTrie[c]; } 9056 ushort toLowerSimpleIndex(dchar c) { return toLowerSimpleIndexTrie[c]; } 9057 dchar toLowerTab(size_t idx) { return toLowerTable[idx]; } 9058 9059 ushort toTitleIndex(dchar c) { return toTitleIndexTrie[c]; } 9060 ushort toTitleSimpleIndex(dchar c) { return toTitleSimpleIndexTrie[c]; } 9061 dchar toTitleTab(size_t idx) { return toTitleTable[idx]; } 9062 9063 ushort toUpperIndex(dchar c) { return toUpperIndexTrie[c]; } 9064 ushort toUpperSimpleIndex(dchar c) { return toUpperSimpleIndexTrie[c]; } 9065 dchar toUpperTab(size_t idx) { return toUpperTable[idx]; } 9066 } 9067 9068 public: 9069 9070 /++ 9071 Whether or not `c` is a Unicode whitespace $(CHARACTER). 9072 (general Unicode category: Part of C0(tab, vertical tab, form feed, 9073 carriage return, and linefeed characters), Zs, Zl, Zp, and NEL(U+0085)) 9074 +/ 9075 @safe pure nothrow @nogc 9076 public bool isWhite(dchar c) 9077 { 9078 import std.internal.unicode_tables : isWhiteGen; // generated file 9079 return isWhiteGen(c); // call pregenerated binary search 9080 } 9081 9082 /++ 9083 Return whether `c` is a Unicode lowercase $(CHARACTER). 9084 +/ 9085 @safe pure nothrow @nogc 9086 bool isLower(dchar c) 9087 { 9088 import std.ascii : isLower, isASCII; 9089 if (isASCII(c)) 9090 return isLower(c); 9091 return lowerCaseTrie[c]; 9092 } 9093 9094 @safe unittest 9095 { 9096 import std.ascii : isLower; 9097 foreach (v; 0 .. 0x80) 9098 assert(isLower(v) == .isLower(v)); 9099 assert(.isLower('я')); 9100 assert(.isLower('й')); 9101 assert(!.isLower('Ж')); 9102 // Greek HETA 9103 assert(!.isLower('\u0370')); 9104 assert(.isLower('\u0371')); 9105 assert(!.isLower('\u039C')); // capital MU 9106 assert(.isLower('\u03B2')); // beta 9107 // from extended Greek 9108 assert(!.isLower('\u1F18')); 9109 assert(.isLower('\u1F00')); 9110 foreach (v; unicode.lowerCase.byCodepoint) 9111 assert(.isLower(v) && !isUpper(v)); 9112 } 9113 9114 9115 /++ 9116 Return whether `c` is a Unicode uppercase $(CHARACTER). 9117 +/ 9118 @safe pure nothrow @nogc 9119 bool isUpper(dchar c) 9120 { 9121 import std.ascii : isUpper, isASCII; 9122 if (isASCII(c)) 9123 return isUpper(c); 9124 return upperCaseTrie[c]; 9125 } 9126 9127 @safe unittest 9128 { 9129 import std.ascii : isLower; 9130 foreach (v; 0 .. 0x80) 9131 assert(isLower(v) == .isLower(v)); 9132 assert(!isUpper('й')); 9133 assert(isUpper('Ж')); 9134 // Greek HETA 9135 assert(isUpper('\u0370')); 9136 assert(!isUpper('\u0371')); 9137 assert(isUpper('\u039C')); // capital MU 9138 assert(!isUpper('\u03B2')); // beta 9139 // from extended Greek 9140 assert(!isUpper('\u1F00')); 9141 assert(isUpper('\u1F18')); 9142 foreach (v; unicode.upperCase.byCodepoint) 9143 assert(isUpper(v) && !.isLower(v)); 9144 } 9145 9146 9147 //TODO: Hidden for now, needs better API. 9148 //Other transforms could use better API as well, but this one is a new primitive. 9149 @safe pure nothrow @nogc 9150 private dchar toTitlecase(dchar c) 9151 { 9152 // optimize ASCII case 9153 if (c < 0xAA) 9154 { 9155 if (c < 'a') 9156 return c; 9157 if (c <= 'z') 9158 return c - 32; 9159 return c; 9160 } 9161 size_t idx = toTitleSimpleIndex(c); 9162 if (idx != ushort.max) 9163 { 9164 return toTitleTab(idx); 9165 } 9166 return c; 9167 } 9168 9169 private alias UpperTriple = AliasSeq!(toUpperIndex, MAX_SIMPLE_UPPER, toUpperTab); 9170 private alias LowerTriple = AliasSeq!(toLowerIndex, MAX_SIMPLE_LOWER, toLowerTab); 9171 9172 // generic toUpper/toLower on whole string, creates new or returns as is 9173 private ElementEncodingType!S[] toCase(alias indexFn, uint maxIdx, alias tableFn, alias asciiConvert, S)(S s) 9174 if (isSomeString!S || (isRandomAccessRange!S && hasLength!S && hasSlicing!S && isSomeChar!(ElementType!S))) 9175 { 9176 import std.array : appender, array; 9177 import std.ascii : isASCII; 9178 import std.utf : byDchar, codeLength; 9179 9180 alias C = ElementEncodingType!S; 9181 9182 auto r = s.byDchar; 9183 for (size_t i; !r.empty; i += r.front.codeLength!C , r.popFront()) 9184 { 9185 auto cOuter = r.front; 9186 ushort idx = indexFn(cOuter); 9187 if (idx == ushort.max) 9188 continue; 9189 auto result = appender!(C[])(); 9190 result.reserve(s.length); 9191 result.put(s[0 .. i]); 9192 foreach (dchar c; s[i .. $].byDchar) 9193 { 9194 if (c.isASCII) 9195 { 9196 result.put(asciiConvert(c)); 9197 } 9198 else 9199 { 9200 idx = indexFn(c); 9201 if (idx == ushort.max) 9202 result.put(c); 9203 else if (idx < maxIdx) 9204 { 9205 c = tableFn(idx); 9206 result.put(c); 9207 } 9208 else 9209 { 9210 auto val = tableFn(idx); 9211 // unpack length + codepoint 9212 immutable uint len = val >> 24; 9213 result.put(cast(dchar)(val & 0xFF_FFFF)); 9214 foreach (j; idx+1 .. idx+len) 9215 result.put(tableFn(j)); 9216 } 9217 } 9218 } 9219 return result.data; 9220 } 9221 9222 static if (isSomeString!S) 9223 return s; 9224 else 9225 return s.array; 9226 } 9227 9228 // https://issues.dlang.org/show_bug.cgi?id=12428 9229 @safe unittest 9230 { 9231 import std.array : replicate; 9232 auto s = "abcdefghij".replicate(300); 9233 s = s[0 .. 10]; 9234 9235 toUpper(s); 9236 9237 assert(s == "abcdefghij"); 9238 } 9239 9240 // https://issues.dlang.org/show_bug.cgi?id=18993 9241 @safe unittest 9242 { 9243 static assert(`몬스터/A`.toLower.length == `몬스터/a`.toLower.length); 9244 } 9245 9246 9247 // generic toUpper/toLower on whole range, returns range 9248 private auto toCaser(alias indexFn, uint maxIdx, alias tableFn, alias asciiConvert, Range)(Range str) 9249 // Accept range of dchar's 9250 if (isInputRange!Range && 9251 isSomeChar!(ElementEncodingType!Range) && 9252 ElementEncodingType!Range.sizeof == dchar.sizeof) 9253 { 9254 static struct ToCaserImpl 9255 { 9256 @property bool empty() 9257 { 9258 return !nLeft && r.empty; 9259 } 9260 9261 @property auto front() 9262 { 9263 import std.ascii : isASCII; 9264 9265 if (!nLeft) 9266 { 9267 dchar c = r.front; 9268 if (c.isASCII) 9269 { 9270 buf[0] = asciiConvert(c); 9271 nLeft = 1; 9272 } 9273 else 9274 { 9275 const idx = indexFn(c); 9276 if (idx == ushort.max) 9277 { 9278 buf[0] = c; 9279 nLeft = 1; 9280 } 9281 else if (idx < maxIdx) 9282 { 9283 buf[0] = tableFn(idx); 9284 nLeft = 1; 9285 } 9286 else 9287 { 9288 immutable val = tableFn(idx); 9289 // unpack length + codepoint 9290 nLeft = val >> 24; 9291 if (nLeft == 0) 9292 nLeft = 1; 9293 assert(nLeft <= buf.length); 9294 buf[nLeft - 1] = cast(dchar)(val & 0xFF_FFFF); 9295 foreach (j; 1 .. nLeft) 9296 buf[nLeft - j - 1] = tableFn(idx + j); 9297 } 9298 } 9299 } 9300 return buf[nLeft - 1]; 9301 } 9302 9303 void popFront() 9304 { 9305 if (!nLeft) 9306 front; 9307 assert(nLeft); 9308 --nLeft; 9309 if (!nLeft) 9310 r.popFront(); 9311 } 9312 9313 static if (isForwardRange!Range) 9314 { 9315 @property auto save() 9316 { 9317 auto ret = this; 9318 ret.r = r.save; 9319 return ret; 9320 } 9321 } 9322 9323 private: 9324 Range r; 9325 uint nLeft; 9326 dchar[3] buf = void; 9327 } 9328 9329 return ToCaserImpl(str); 9330 } 9331 9332 /********************* 9333 * Convert an $(REF_ALTTEXT input range, isInputRange, std,range,primitives) 9334 * or a string to upper or lower case. 9335 * 9336 * Does not allocate memory. 9337 * Characters in UTF-8 or UTF-16 format that cannot be decoded 9338 * are treated as $(REF replacementDchar, std,utf). 9339 * 9340 * Params: 9341 * str = string or range of characters 9342 * 9343 * Returns: 9344 * an input range of `dchar`s 9345 * 9346 * See_Also: 9347 * $(LREF toUpper), $(LREF toLower) 9348 */ 9349 9350 auto asLowerCase(Range)(Range str) 9351 if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) && 9352 !isConvertibleToString!Range) 9353 { 9354 static if (ElementEncodingType!Range.sizeof < dchar.sizeof) 9355 { 9356 import std.utf : byDchar; 9357 9358 // Decode first 9359 return asLowerCase(str.byDchar); 9360 } 9361 else 9362 { 9363 static import std.ascii; 9364 return toCaser!(LowerTriple, std.ascii.toLower)(str); 9365 } 9366 } 9367 9368 /// ditto 9369 auto asUpperCase(Range)(Range str) 9370 if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) && 9371 !isConvertibleToString!Range) 9372 { 9373 static if (ElementEncodingType!Range.sizeof < dchar.sizeof) 9374 { 9375 import std.utf : byDchar; 9376 9377 // Decode first 9378 return asUpperCase(str.byDchar); 9379 } 9380 else 9381 { 9382 static import std.ascii; 9383 return toCaser!(UpperTriple, std.ascii.toUpper)(str); 9384 } 9385 } 9386 9387 /// 9388 @safe pure unittest 9389 { 9390 import std.algorithm.comparison : equal; 9391 9392 assert("hEllo".asUpperCase.equal("HELLO")); 9393 } 9394 9395 // explicitly undocumented 9396 auto asLowerCase(Range)(auto ref Range str) 9397 if (isConvertibleToString!Range) 9398 { 9399 import std.traits : StringTypeOf; 9400 return asLowerCase!(StringTypeOf!Range)(str); 9401 } 9402 9403 // explicitly undocumented 9404 auto asUpperCase(Range)(auto ref Range str) 9405 if (isConvertibleToString!Range) 9406 { 9407 import std.traits : StringTypeOf; 9408 return asUpperCase!(StringTypeOf!Range)(str); 9409 } 9410 9411 @safe unittest 9412 { 9413 static struct TestAliasedString 9414 { 9415 string get() @safe @nogc pure nothrow { return _s; } 9416 alias get this; 9417 @disable this(this); 9418 string _s; 9419 } 9420 9421 static bool testAliasedString(alias func, Args...)(string s, Args args) 9422 { 9423 import std.algorithm.comparison : equal; 9424 auto a = func(TestAliasedString(s), args); 9425 auto b = func(s, args); 9426 static if (is(typeof(equal(a, b)))) 9427 { 9428 // For ranges, compare contents instead of object identity. 9429 return equal(a, b); 9430 } 9431 else 9432 { 9433 return a == b; 9434 } 9435 } 9436 assert(testAliasedString!asLowerCase("hEllo")); 9437 assert(testAliasedString!asUpperCase("hEllo")); 9438 assert(testAliasedString!asCapitalized("hEllo")); 9439 } 9440 9441 @safe unittest 9442 { 9443 import std.array : array; 9444 9445 auto a = "HELLo".asLowerCase; 9446 auto savea = a.save; 9447 auto s = a.array; 9448 assert(s == "hello"); 9449 s = savea.array; 9450 assert(s == "hello"); 9451 9452 string[] lower = ["123", "abcфеж", "\u0131\u023f\u03c9", "i\u0307\u1Fe2"]; 9453 string[] upper = ["123", "ABCФЕЖ", "I\u2c7e\u2126", "\u0130\u03A5\u0308\u0300"]; 9454 9455 foreach (i, slwr; lower) 9456 { 9457 import std.utf : byChar; 9458 9459 auto sx = slwr.asUpperCase.byChar.array; 9460 assert(sx == toUpper(slwr)); 9461 auto sy = upper[i].asLowerCase.byChar.array; 9462 assert(sy == toLower(upper[i])); 9463 } 9464 9465 // Not necessary to call r.front 9466 for (auto r = lower[3].asUpperCase; !r.empty; r.popFront()) 9467 { 9468 } 9469 9470 import std.algorithm.comparison : equal; 9471 9472 "HELLo"w.asLowerCase.equal("hello"d); 9473 "HELLo"w.asUpperCase.equal("HELLO"d); 9474 "HELLo"d.asLowerCase.equal("hello"d); 9475 "HELLo"d.asUpperCase.equal("HELLO"d); 9476 9477 import std.utf : byChar; 9478 assert(toLower("\u1Fe2") == asLowerCase("\u1Fe2").byChar.array); 9479 } 9480 9481 // generic capitalizer on whole range, returns range 9482 private auto toCapitalizer(alias indexFnUpper, uint maxIdxUpper, alias tableFnUpper, 9483 Range)(Range str) 9484 // Accept range of dchar's 9485 if (isInputRange!Range && 9486 isSomeChar!(ElementEncodingType!Range) && 9487 ElementEncodingType!Range.sizeof == dchar.sizeof) 9488 { 9489 static struct ToCapitalizerImpl 9490 { 9491 @property bool empty() 9492 { 9493 return lower ? lwr.empty : !nLeft && r.empty; 9494 } 9495 9496 @property auto front() 9497 { 9498 if (lower) 9499 return lwr.front; 9500 9501 if (!nLeft) 9502 { 9503 immutable dchar c = r.front; 9504 const idx = indexFnUpper(c); 9505 if (idx == ushort.max) 9506 { 9507 buf[0] = c; 9508 nLeft = 1; 9509 } 9510 else if (idx < maxIdxUpper) 9511 { 9512 buf[0] = tableFnUpper(idx); 9513 nLeft = 1; 9514 } 9515 else 9516 { 9517 immutable val = tableFnUpper(idx); 9518 // unpack length + codepoint 9519 nLeft = val >> 24; 9520 if (nLeft == 0) 9521 nLeft = 1; 9522 assert(nLeft <= buf.length); 9523 buf[nLeft - 1] = cast(dchar)(val & 0xFF_FFFF); 9524 foreach (j; 1 .. nLeft) 9525 buf[nLeft - j - 1] = tableFnUpper(idx + j); 9526 } 9527 } 9528 return buf[nLeft - 1]; 9529 } 9530 9531 void popFront() 9532 { 9533 if (lower) 9534 lwr.popFront(); 9535 else 9536 { 9537 if (!nLeft) 9538 front; 9539 assert(nLeft); 9540 --nLeft; 9541 if (!nLeft) 9542 { 9543 r.popFront(); 9544 lwr = r.asLowerCase(); 9545 lower = true; 9546 } 9547 } 9548 } 9549 9550 static if (isForwardRange!Range) 9551 { 9552 @property auto save() 9553 { 9554 auto ret = this; 9555 ret.r = r.save; 9556 ret.lwr = lwr.save; 9557 return ret; 9558 } 9559 } 9560 9561 private: 9562 Range r; 9563 typeof(r.asLowerCase) lwr; // range representing the lower case rest of string 9564 bool lower = false; // false for first character, true for rest of string 9565 dchar[3] buf = void; 9566 uint nLeft = 0; 9567 } 9568 9569 return ToCapitalizerImpl(str); 9570 } 9571 9572 /********************* 9573 * Capitalize an $(REF_ALTTEXT input range, isInputRange, std,range,primitives) 9574 * or string, meaning convert the first 9575 * character to upper case and subsequent characters to lower case. 9576 * 9577 * Does not allocate memory. 9578 * Characters in UTF-8 or UTF-16 format that cannot be decoded 9579 * are treated as $(REF replacementDchar, std,utf). 9580 * 9581 * Params: 9582 * str = string or range of characters 9583 * 9584 * Returns: 9585 * an InputRange of dchars 9586 * 9587 * See_Also: 9588 * $(LREF toUpper), $(LREF toLower) 9589 * $(LREF asUpperCase), $(LREF asLowerCase) 9590 */ 9591 9592 auto asCapitalized(Range)(Range str) 9593 if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) && 9594 !isConvertibleToString!Range) 9595 { 9596 static if (ElementEncodingType!Range.sizeof < dchar.sizeof) 9597 { 9598 import std.utf : byDchar; 9599 9600 // Decode first 9601 return toCapitalizer!UpperTriple(str.byDchar); 9602 } 9603 else 9604 { 9605 return toCapitalizer!UpperTriple(str); 9606 } 9607 } 9608 9609 /// 9610 @safe pure unittest 9611 { 9612 import std.algorithm.comparison : equal; 9613 9614 assert("hEllo".asCapitalized.equal("Hello")); 9615 } 9616 9617 auto asCapitalized(Range)(auto ref Range str) 9618 if (isConvertibleToString!Range) 9619 { 9620 import std.traits : StringTypeOf; 9621 return asCapitalized!(StringTypeOf!Range)(str); 9622 } 9623 9624 @safe pure nothrow @nogc unittest 9625 { 9626 auto r = "hEllo".asCapitalized(); 9627 assert(r.front == 'H'); 9628 } 9629 9630 @safe unittest 9631 { 9632 import std.array : array; 9633 9634 auto a = "hELLo".asCapitalized; 9635 auto savea = a.save; 9636 auto s = a.array; 9637 assert(s == "Hello"); 9638 s = savea.array; 9639 assert(s == "Hello"); 9640 9641 string[2][] cases = 9642 [ 9643 ["", ""], 9644 ["h", "H"], 9645 ["H", "H"], 9646 ["3", "3"], 9647 ["123", "123"], 9648 ["h123A", "H123a"], 9649 ["феж", "Феж"], 9650 ["\u1Fe2", "\u03a5\u0308\u0300"], 9651 ]; 9652 9653 foreach (i; 0 .. cases.length) 9654 { 9655 import std.utf : byChar; 9656 9657 auto r = cases[i][0].asCapitalized.byChar.array; 9658 auto result = cases[i][1]; 9659 assert(r == result); 9660 } 9661 9662 // Don't call r.front 9663 for (auto r = "\u1Fe2".asCapitalized; !r.empty; r.popFront()) 9664 { 9665 } 9666 9667 import std.algorithm.comparison : equal; 9668 9669 "HELLo"w.asCapitalized.equal("Hello"d); 9670 "hElLO"w.asCapitalized.equal("Hello"d); 9671 "hello"d.asCapitalized.equal("Hello"d); 9672 "HELLO"d.asCapitalized.equal("Hello"d); 9673 9674 import std.utf : byChar; 9675 assert(asCapitalized("\u0130").byChar.array == asUpperCase("\u0130").byChar.array); 9676 } 9677 9678 // TODO: helper, I wish std.utf was more flexible (and stright) 9679 private size_t encodeTo(scope char[] buf, size_t idx, dchar c) @trusted pure nothrow @nogc 9680 { 9681 if (c <= 0x7F) 9682 { 9683 buf[idx] = cast(char) c; 9684 idx++; 9685 } 9686 else if (c <= 0x7FF) 9687 { 9688 buf[idx] = cast(char)(0xC0 | (c >> 6)); 9689 buf[idx+1] = cast(char)(0x80 | (c & 0x3F)); 9690 idx += 2; 9691 } 9692 else if (c <= 0xFFFF) 9693 { 9694 buf[idx] = cast(char)(0xE0 | (c >> 12)); 9695 buf[idx+1] = cast(char)(0x80 | ((c >> 6) & 0x3F)); 9696 buf[idx+2] = cast(char)(0x80 | (c & 0x3F)); 9697 idx += 3; 9698 } 9699 else if (c <= 0x10FFFF) 9700 { 9701 buf[idx] = cast(char)(0xF0 | (c >> 18)); 9702 buf[idx+1] = cast(char)(0x80 | ((c >> 12) & 0x3F)); 9703 buf[idx+2] = cast(char)(0x80 | ((c >> 6) & 0x3F)); 9704 buf[idx+3] = cast(char)(0x80 | (c & 0x3F)); 9705 idx += 4; 9706 } 9707 else 9708 assert(0); 9709 return idx; 9710 } 9711 9712 @safe unittest 9713 { 9714 char[] s = "abcd".dup; 9715 size_t i = 0; 9716 i = encodeTo(s, i, 'X'); 9717 assert(s == "Xbcd"); 9718 9719 i = encodeTo(s, i, cast(dchar)'\u00A9'); 9720 assert(s == "X\xC2\xA9d"); 9721 } 9722 9723 // TODO: helper, I wish std.utf was more flexible (and stright) 9724 private size_t encodeTo(scope wchar[] buf, size_t idx, dchar c) @trusted pure 9725 { 9726 import std.utf : UTFException; 9727 if (c <= 0xFFFF) 9728 { 9729 if (0xD800 <= c && c <= 0xDFFF) 9730 throw (new UTFException("Encoding an isolated surrogate code point in UTF-16")).setSequence(c); 9731 buf[idx] = cast(wchar) c; 9732 idx++; 9733 } 9734 else if (c <= 0x10FFFF) 9735 { 9736 buf[idx] = cast(wchar)((((c - 0x10000) >> 10) & 0x3FF) + 0xD800); 9737 buf[idx+1] = cast(wchar)(((c - 0x10000) & 0x3FF) + 0xDC00); 9738 idx += 2; 9739 } 9740 else 9741 assert(0); 9742 return idx; 9743 } 9744 9745 private size_t encodeTo(scope dchar[] buf, size_t idx, dchar c) @trusted pure nothrow @nogc 9746 { 9747 buf[idx] = c; 9748 idx++; 9749 return idx; 9750 } 9751 9752 private void toCaseInPlace(alias indexFn, uint maxIdx, alias tableFn, C)(ref C[] s) @trusted pure 9753 if (is(C == char) || is(C == wchar) || is(C == dchar)) 9754 { 9755 import std.utf : decode, codeLength; 9756 size_t curIdx = 0; 9757 size_t destIdx = 0; 9758 alias slowToCase = toCaseInPlaceAlloc!(indexFn, maxIdx, tableFn); 9759 size_t lastUnchanged = 0; 9760 // in-buffer move of bytes to a new start index 9761 // the trick is that it may not need to copy at all 9762 static size_t moveTo(C[] str, size_t dest, size_t from, size_t to) 9763 { 9764 // Interestingly we may just bump pointer for a while 9765 // then have to copy if a re-cased char was smaller the original 9766 // later we may regain pace with char that got bigger 9767 // In the end it sometimes flip-flops between the 2 cases below 9768 if (dest == from) 9769 return to; 9770 // got to copy 9771 foreach (C c; str[from .. to]) 9772 str[dest++] = c; 9773 return dest; 9774 } 9775 while (curIdx != s.length) 9776 { 9777 size_t startIdx = curIdx; 9778 immutable ch = decode(s, curIdx); 9779 // TODO: special case for ASCII 9780 immutable caseIndex = indexFn(ch); 9781 if (caseIndex == ushort.max) // unchanged, skip over 9782 { 9783 continue; 9784 } 9785 else if (caseIndex < maxIdx) // 1:1 codepoint mapping 9786 { 9787 // previous cased chars had the same length as uncased ones 9788 // thus can just adjust pointer 9789 destIdx = moveTo(s, destIdx, lastUnchanged, startIdx); 9790 lastUnchanged = curIdx; 9791 immutable cased = tableFn(caseIndex); 9792 immutable casedLen = codeLength!C(cased); 9793 if (casedLen + destIdx > curIdx) // no place to fit cased char 9794 { 9795 // switch to slow codepath, where we allocate 9796 return slowToCase(s, startIdx, destIdx); 9797 } 9798 else 9799 { 9800 destIdx = encodeTo(s, destIdx, cased); 9801 } 9802 } 9803 else // 1:m codepoint mapping, slow codepath 9804 { 9805 destIdx = moveTo(s, destIdx, lastUnchanged, startIdx); 9806 lastUnchanged = curIdx; 9807 return slowToCase(s, startIdx, destIdx); 9808 } 9809 assert(destIdx <= curIdx); 9810 } 9811 if (lastUnchanged != s.length) 9812 { 9813 destIdx = moveTo(s, destIdx, lastUnchanged, s.length); 9814 } 9815 s = s[0 .. destIdx]; 9816 } 9817 9818 // helper to precalculate size of case-converted string 9819 private template toCaseLength(alias indexFn, uint maxIdx, alias tableFn) 9820 { 9821 size_t toCaseLength(C)(const scope C[] str) 9822 { 9823 import std.utf : decode, codeLength; 9824 size_t codeLen = 0; 9825 size_t lastNonTrivial = 0; 9826 size_t curIdx = 0; 9827 while (curIdx != str.length) 9828 { 9829 immutable startIdx = curIdx; 9830 immutable ch = decode(str, curIdx); 9831 immutable ushort caseIndex = indexFn(ch); 9832 if (caseIndex == ushort.max) 9833 continue; 9834 else if (caseIndex < maxIdx) 9835 { 9836 codeLen += startIdx - lastNonTrivial; 9837 lastNonTrivial = curIdx; 9838 immutable cased = tableFn(caseIndex); 9839 codeLen += codeLength!C(cased); 9840 } 9841 else 9842 { 9843 codeLen += startIdx - lastNonTrivial; 9844 lastNonTrivial = curIdx; 9845 immutable val = tableFn(caseIndex); 9846 immutable len = val >> 24; 9847 immutable dchar cased = val & 0xFF_FFFF; 9848 codeLen += codeLength!C(cased); 9849 foreach (j; caseIndex+1 .. caseIndex+len) 9850 codeLen += codeLength!C(tableFn(j)); 9851 } 9852 } 9853 if (lastNonTrivial != str.length) 9854 codeLen += str.length - lastNonTrivial; 9855 return codeLen; 9856 } 9857 } 9858 9859 @safe unittest 9860 { 9861 alias toLowerLength = toCaseLength!(LowerTriple); 9862 assert(toLowerLength("abcd") == 4); 9863 assert(toLowerLength("аБВгд456") == 10+3); 9864 } 9865 9866 // slower code path that preallocates and then copies 9867 // case-converted stuf to the new string 9868 private template toCaseInPlaceAlloc(alias indexFn, uint maxIdx, alias tableFn) 9869 { 9870 void toCaseInPlaceAlloc(C)(ref C[] s, size_t curIdx, 9871 size_t destIdx) @trusted pure 9872 if (is(C == char) || is(C == wchar) || is(C == dchar)) 9873 { 9874 import std.utf : decode; 9875 alias caseLength = toCaseLength!(indexFn, maxIdx, tableFn); 9876 auto trueLength = destIdx + caseLength(s[curIdx..$]); 9877 C[] ns = new C[trueLength]; 9878 ns[0 .. destIdx] = s[0 .. destIdx]; 9879 size_t lastUnchanged = curIdx; 9880 while (curIdx != s.length) 9881 { 9882 immutable startIdx = curIdx; // start of current codepoint 9883 immutable ch = decode(s, curIdx); 9884 immutable caseIndex = indexFn(ch); 9885 if (caseIndex == ushort.max) // skip over 9886 { 9887 continue; 9888 } 9889 else if (caseIndex < maxIdx) // 1:1 codepoint mapping 9890 { 9891 immutable cased = tableFn(caseIndex); 9892 auto toCopy = startIdx - lastUnchanged; 9893 ns[destIdx .. destIdx+toCopy] = s[lastUnchanged .. startIdx]; 9894 lastUnchanged = curIdx; 9895 destIdx += toCopy; 9896 destIdx = encodeTo(ns, destIdx, cased); 9897 } 9898 else // 1:m codepoint mapping, slow codepath 9899 { 9900 auto toCopy = startIdx - lastUnchanged; 9901 ns[destIdx .. destIdx+toCopy] = s[lastUnchanged .. startIdx]; 9902 lastUnchanged = curIdx; 9903 destIdx += toCopy; 9904 auto val = tableFn(caseIndex); 9905 // unpack length + codepoint 9906 immutable uint len = val >> 24; 9907 destIdx = encodeTo(ns, destIdx, cast(dchar)(val & 0xFF_FFFF)); 9908 foreach (j; caseIndex+1 .. caseIndex+len) 9909 destIdx = encodeTo(ns, destIdx, tableFn(j)); 9910 } 9911 } 9912 if (lastUnchanged != s.length) 9913 { 9914 auto toCopy = s.length - lastUnchanged; 9915 ns[destIdx .. destIdx+toCopy] = s[lastUnchanged..$]; 9916 destIdx += toCopy; 9917 } 9918 assert(ns.length == destIdx); 9919 s = ns; 9920 } 9921 } 9922 9923 /++ 9924 Converts `s` to lowercase (by performing Unicode lowercase mapping) in place. 9925 For a few characters string length may increase after the transformation, 9926 in such a case the function reallocates exactly once. 9927 If `s` does not have any uppercase characters, then `s` is unaltered. 9928 +/ 9929 void toLowerInPlace(C)(ref C[] s) @trusted pure 9930 if (is(C == char) || is(C == wchar) || is(C == dchar)) 9931 { 9932 toCaseInPlace!(LowerTriple)(s); 9933 } 9934 // overloads for the most common cases to reduce compile time 9935 @safe pure /*TODO nothrow*/ 9936 { 9937 void toLowerInPlace(ref char[] s) 9938 { toLowerInPlace!char(s); } 9939 void toLowerInPlace(ref wchar[] s) 9940 { toLowerInPlace!wchar(s); } 9941 void toLowerInPlace(ref dchar[] s) 9942 { toLowerInPlace!dchar(s); } 9943 } 9944 9945 /++ 9946 Converts `s` to uppercase (by performing Unicode uppercase mapping) in place. 9947 For a few characters string length may increase after the transformation, 9948 in such a case the function reallocates exactly once. 9949 If `s` does not have any lowercase characters, then `s` is unaltered. 9950 +/ 9951 void toUpperInPlace(C)(ref C[] s) @trusted pure 9952 if (is(C == char) || is(C == wchar) || is(C == dchar)) 9953 { 9954 toCaseInPlace!(UpperTriple)(s); 9955 } 9956 // overloads for the most common cases to reduce compile time/code size 9957 @safe pure /*TODO nothrow*/ 9958 { 9959 void toUpperInPlace(ref char[] s) 9960 { toUpperInPlace!char(s); } 9961 void toUpperInPlace(ref wchar[] s) 9962 { toUpperInPlace!wchar(s); } 9963 void toUpperInPlace(ref dchar[] s) 9964 { toUpperInPlace!dchar(s); } 9965 } 9966 9967 /++ 9968 If `c` is a Unicode uppercase $(CHARACTER), then its lowercase equivalent 9969 is returned. Otherwise `c` is returned. 9970 9971 Warning: certain alphabets like German and Greek have no 1:1 9972 upper-lower mapping. Use overload of toLower which takes full string instead. 9973 +/ 9974 @safe pure nothrow @nogc 9975 dchar toLower(dchar c) 9976 { 9977 // optimize ASCII case 9978 if (c < 0xAA) 9979 { 9980 if (c < 'A') 9981 return c; 9982 if (c <= 'Z') 9983 return c + 32; 9984 return c; 9985 } 9986 size_t idx = toLowerSimpleIndex(c); 9987 if (idx != ushort.max) 9988 { 9989 return toLowerTab(idx); 9990 } 9991 return c; 9992 } 9993 9994 /++ 9995 Creates a new array which is identical to `s` except that all of its 9996 characters are converted to lowercase (by performing Unicode lowercase mapping). 9997 If none of `s` characters were affected, then `s` itself is returned if `s` is a 9998 `string`-like type. 9999 10000 Params: 10001 s = A $(REF_ALTTEXT random access range, isRandomAccessRange, std,range,primitives) 10002 of characters 10003 Returns: 10004 An array with the same element type as `s`. 10005 +/ 10006 ElementEncodingType!S[] toLower(S)(return scope S s) @trusted 10007 if (isSomeString!S) 10008 { 10009 static import std.ascii; 10010 return toCase!(LowerTriple, std.ascii.toLower)(s); 10011 } 10012 10013 /// ditto 10014 ElementEncodingType!S[] toLower(S)(S s) 10015 if (!isSomeString!S && (isRandomAccessRange!S && hasLength!S && hasSlicing!S && isSomeChar!(ElementType!S))) 10016 { 10017 static import std.ascii; 10018 return toCase!(LowerTriple, std.ascii.toLower)(s); 10019 } 10020 10021 // overloads for the most common cases to reduce compile time 10022 @safe pure /*TODO nothrow*/ 10023 { 10024 string toLower(return scope string s) 10025 { return toLower!string(s); } 10026 wstring toLower(return scope wstring s) 10027 { return toLower!wstring(s); } 10028 dstring toLower(return scope dstring s) 10029 { return toLower!dstring(s); } 10030 10031 @safe unittest 10032 { 10033 // https://issues.dlang.org/show_bug.cgi?id=16663 10034 10035 static struct String 10036 { 10037 string data; 10038 alias data this; 10039 } 10040 10041 void foo() 10042 { 10043 auto u = toLower(String("")); 10044 } 10045 } 10046 } 10047 10048 10049 @safe unittest 10050 { 10051 static import std.ascii; 10052 import std.format : format; 10053 foreach (ch; 0 .. 0x80) 10054 assert(std.ascii.toLower(ch) == toLower(ch)); 10055 assert(toLower('Я') == 'я'); 10056 assert(toLower('Δ') == 'δ'); 10057 foreach (ch; unicode.upperCase.byCodepoint) 10058 { 10059 dchar low = ch.toLower(); 10060 assert(low == ch || isLower(low), format("%s -> %s", ch, low)); 10061 } 10062 assert(toLower("АЯ") == "ая"); 10063 10064 assert("\u1E9E".toLower == "\u00df"); 10065 assert("\u00df".toUpper == "SS"); 10066 } 10067 10068 // https://issues.dlang.org/show_bug.cgi?id=9629 10069 @safe unittest 10070 { 10071 wchar[] test = "hello þ world"w.dup; 10072 auto piece = test[6 .. 7]; 10073 toUpperInPlace(piece); 10074 assert(test == "hello Þ world"); 10075 } 10076 10077 10078 @safe unittest 10079 { 10080 import std.algorithm.comparison : cmp; 10081 string s1 = "FoL"; 10082 string s2 = toLower(s1); 10083 assert(cmp(s2, "fol") == 0, s2); 10084 assert(s2 != s1); 10085 10086 char[] s3 = s1.dup; 10087 toLowerInPlace(s3); 10088 assert(s3 == s2); 10089 10090 s1 = "A\u0100B\u0101d"; 10091 s2 = toLower(s1); 10092 s3 = s1.dup; 10093 assert(cmp(s2, "a\u0101b\u0101d") == 0); 10094 assert(s2 !is s1); 10095 toLowerInPlace(s3); 10096 assert(s3 == s2); 10097 10098 s1 = "A\u0460B\u0461d"; 10099 s2 = toLower(s1); 10100 s3 = s1.dup; 10101 assert(cmp(s2, "a\u0461b\u0461d") == 0); 10102 assert(s2 !is s1); 10103 toLowerInPlace(s3); 10104 assert(s3 == s2); 10105 10106 s1 = "\u0130"; 10107 s2 = toLower(s1); 10108 s3 = s1.dup; 10109 assert(s2 == "i\u0307"); 10110 assert(s2 !is s1); 10111 toLowerInPlace(s3); 10112 assert(s3 == s2); 10113 10114 // Test on wchar and dchar strings. 10115 assert(toLower("Some String"w) == "some string"w); 10116 assert(toLower("Some String"d) == "some string"d); 10117 10118 // https://issues.dlang.org/show_bug.cgi?id=12455 10119 dchar c = 'İ'; // '\U0130' LATIN CAPITAL LETTER I WITH DOT ABOVE 10120 assert(isUpper(c)); 10121 assert(toLower(c) == 'i'); 10122 // extends on https://issues.dlang.org/show_bug.cgi?id=12455 report 10123 // check simple-case toUpper too 10124 c = '\u1f87'; 10125 assert(isLower(c)); 10126 assert(toUpper(c) == '\u1F8F'); 10127 } 10128 10129 @safe pure unittest 10130 { 10131 import std.algorithm.comparison : cmp, equal; 10132 import std.utf : byCodeUnit; 10133 auto r1 = "FoL".byCodeUnit; 10134 assert(r1.toLower.cmp("fol") == 0); 10135 auto r2 = "A\u0460B\u0461d".byCodeUnit; 10136 assert(r2.toLower.cmp("a\u0461b\u0461d") == 0); 10137 } 10138 10139 /++ 10140 If `c` is a Unicode lowercase $(CHARACTER), then its uppercase equivalent 10141 is returned. Otherwise `c` is returned. 10142 10143 Warning: 10144 Certain alphabets like German and Greek have no 1:1 10145 upper-lower mapping. Use overload of toUpper which takes full string instead. 10146 10147 toUpper can be used as an argument to $(REF map, std,algorithm,iteration) 10148 to produce an algorithm that can convert a range of characters to upper case 10149 without allocating memory. 10150 A string can then be produced by using $(REF copy, std,algorithm,mutation) 10151 to send it to an $(REF appender, std,array). 10152 +/ 10153 @safe pure nothrow @nogc 10154 dchar toUpper(dchar c) 10155 { 10156 // optimize ASCII case 10157 if (c < 0xAA) 10158 { 10159 if (c < 'a') 10160 return c; 10161 if (c <= 'z') 10162 return c - 32; 10163 return c; 10164 } 10165 size_t idx = toUpperSimpleIndex(c); 10166 if (idx != ushort.max) 10167 { 10168 return toUpperTab(idx); 10169 } 10170 return c; 10171 } 10172 10173 /// 10174 @safe unittest 10175 { 10176 import std.algorithm.iteration : map; 10177 import std.algorithm.mutation : copy; 10178 import std.array : appender; 10179 10180 auto abuf = appender!(char[])(); 10181 "hello".map!toUpper.copy(abuf); 10182 assert(abuf.data == "HELLO"); 10183 } 10184 10185 @safe unittest 10186 { 10187 static import std.ascii; 10188 import std.format : format; 10189 foreach (ch; 0 .. 0x80) 10190 assert(std.ascii.toUpper(ch) == toUpper(ch)); 10191 assert(toUpper('я') == 'Я'); 10192 assert(toUpper('δ') == 'Δ'); 10193 auto title = unicode.Titlecase_Letter; 10194 foreach (ch; unicode.lowerCase.byCodepoint) 10195 { 10196 dchar up = ch.toUpper(); 10197 assert(up == ch || isUpper(up) || title[up], 10198 format("%x -> %x", ch, up)); 10199 } 10200 } 10201 10202 /++ 10203 Allocates a new array which is identical to `s` except that all of its 10204 characters are converted to uppercase (by performing Unicode uppercase mapping). 10205 If none of `s` characters were affected, then `s` itself is returned if `s` 10206 is a `string`-like type. 10207 10208 Params: 10209 s = A $(REF_ALTTEXT random access range, isRandomAccessRange, std,range,primitives) 10210 of characters 10211 Returns: 10212 An new array with the same element type as `s`. 10213 +/ 10214 ElementEncodingType!S[] toUpper(S)(return scope S s) @trusted 10215 if (isSomeString!S) 10216 { 10217 static import std.ascii; 10218 return toCase!(UpperTriple, std.ascii.toUpper)(s); 10219 } 10220 10221 /// ditto 10222 ElementEncodingType!S[] toUpper(S)(S s) 10223 if (!isSomeString!S && (isRandomAccessRange!S && hasLength!S && hasSlicing!S && isSomeChar!(ElementType!S))) 10224 { 10225 static import std.ascii; 10226 return toCase!(UpperTriple, std.ascii.toUpper)(s); 10227 } 10228 10229 // overloads for the most common cases to reduce compile time 10230 @safe pure /*TODO nothrow*/ 10231 { 10232 string toUpper(return scope string s) 10233 { return toUpper!string(s); } 10234 wstring toUpper(return scope wstring s) 10235 { return toUpper!wstring(s); } 10236 dstring toUpper(return scope dstring s) 10237 { return toUpper!dstring(s); } 10238 10239 @safe unittest 10240 { 10241 // https://issues.dlang.org/show_bug.cgi?id=16663 10242 10243 static struct String 10244 { 10245 string data; 10246 alias data this; 10247 } 10248 10249 void foo() 10250 { 10251 auto u = toUpper(String("")); 10252 } 10253 } 10254 } 10255 10256 @safe unittest 10257 { 10258 import std.algorithm.comparison : cmp; 10259 10260 string s1 = "FoL"; 10261 string s2; 10262 char[] s3; 10263 10264 s2 = toUpper(s1); 10265 s3 = s1.dup; toUpperInPlace(s3); 10266 assert(s3 == s2, s3); 10267 assert(cmp(s2, "FOL") == 0); 10268 assert(s2 !is s1); 10269 10270 s1 = "a\u0100B\u0101d"; 10271 s2 = toUpper(s1); 10272 s3 = s1.dup; toUpperInPlace(s3); 10273 assert(s3 == s2); 10274 assert(cmp(s2, "A\u0100B\u0100D") == 0); 10275 assert(s2 !is s1); 10276 10277 s1 = "a\u0460B\u0461d"; 10278 s2 = toUpper(s1); 10279 s3 = s1.dup; toUpperInPlace(s3); 10280 assert(s3 == s2); 10281 assert(cmp(s2, "A\u0460B\u0460D") == 0); 10282 assert(s2 !is s1); 10283 } 10284 10285 @safe unittest 10286 { 10287 static void doTest(C)(const(C)[] s, const(C)[] trueUp, const(C)[] trueLow) 10288 { 10289 import std.format : format; 10290 string diff = "src: %( %x %)\nres: %( %x %)\ntru: %( %x %)"; 10291 auto low = s.toLower() , up = s.toUpper(); 10292 auto lowInp = s.dup, upInp = s.dup; 10293 lowInp.toLowerInPlace(); 10294 upInp.toUpperInPlace(); 10295 assert(low == trueLow, format(diff, low, trueLow)); 10296 assert(up == trueUp, format(diff, up, trueUp)); 10297 assert(lowInp == trueLow, 10298 format(diff, cast(const(ubyte)[]) s, cast(const(ubyte)[]) lowInp, cast(const(ubyte)[]) trueLow)); 10299 assert(upInp == trueUp, 10300 format(diff, cast(const(ubyte)[]) s, cast(const(ubyte)[]) upInp, cast(const(ubyte)[]) trueUp)); 10301 } 10302 static foreach (S; AliasSeq!(dstring, wstring, string)) 10303 {{ 10304 10305 S easy = "123"; 10306 S good = "abCФеж"; 10307 S awful = "\u0131\u023f\u2126"; 10308 S wicked = "\u0130\u1FE2"; 10309 auto options = [easy, good, awful, wicked]; 10310 S[] lower = ["123", "abcфеж", "\u0131\u023f\u03c9", "i\u0307\u1Fe2"]; 10311 S[] upper = ["123", "ABCФЕЖ", "I\u2c7e\u2126", "\u0130\u03A5\u0308\u0300"]; 10312 10313 foreach (val; [easy, good]) 10314 { 10315 auto e = val.dup; 10316 auto g = e; 10317 e.toUpperInPlace(); 10318 assert(e is g); 10319 e.toLowerInPlace(); 10320 assert(e is g); 10321 } 10322 foreach (i, v; options) 10323 { 10324 doTest(v, upper[i], lower[i]); 10325 } 10326 10327 // a few combinatorial runs 10328 foreach (i; 0 .. options.length) 10329 foreach (j; i .. options.length) 10330 foreach (k; j .. options.length) 10331 { 10332 auto sample = options[i] ~ options[j] ~ options[k]; 10333 auto sample2 = options[k] ~ options[j] ~ options[i]; 10334 doTest(sample, upper[i] ~ upper[j] ~ upper[k], 10335 lower[i] ~ lower[j] ~ lower[k]); 10336 doTest(sample2, upper[k] ~ upper[j] ~ upper[i], 10337 lower[k] ~ lower[j] ~ lower[i]); 10338 } 10339 }} 10340 } 10341 10342 // test random access ranges 10343 @safe pure unittest 10344 { 10345 import std.algorithm.comparison : cmp; 10346 import std.utf : byCodeUnit; 10347 auto s1 = "FoL".byCodeUnit; 10348 assert(s1.toUpper.cmp("FOL") == 0); 10349 auto s2 = "a\u0460B\u0461d".byCodeUnit; 10350 assert(s2.toUpper.cmp("A\u0460B\u0460D") == 0); 10351 } 10352 10353 /++ 10354 Returns whether `c` is a Unicode alphabetic $(CHARACTER) 10355 (general Unicode category: Alphabetic). 10356 +/ 10357 @safe pure nothrow @nogc 10358 bool isAlpha(dchar c) 10359 { 10360 // optimization 10361 if (c < 0xAA) 10362 { 10363 return ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z'); 10364 } 10365 10366 return alphaTrie[c]; 10367 } 10368 10369 @safe unittest 10370 { 10371 auto alpha = unicode("Alphabetic"); 10372 foreach (ch; alpha.byCodepoint) 10373 assert(isAlpha(ch)); 10374 foreach (ch; 0 .. 0x4000) 10375 assert((ch in alpha) == isAlpha(ch)); 10376 } 10377 10378 10379 /++ 10380 Returns whether `c` is a Unicode mark 10381 (general Unicode category: Mn, Me, Mc). 10382 +/ 10383 @safe pure nothrow @nogc 10384 bool isMark(dchar c) 10385 { 10386 return markTrie[c]; 10387 } 10388 10389 @safe unittest 10390 { 10391 auto mark = unicode("Mark"); 10392 foreach (ch; mark.byCodepoint) 10393 assert(isMark(ch)); 10394 foreach (ch; 0 .. 0x4000) 10395 assert((ch in mark) == isMark(ch)); 10396 } 10397 10398 /++ 10399 Returns whether `c` is a Unicode numerical $(CHARACTER) 10400 (general Unicode category: Nd, Nl, No). 10401 +/ 10402 @safe pure nothrow @nogc 10403 bool isNumber(dchar c) 10404 { 10405 // optimization for ascii case 10406 if (c <= 0x7F) 10407 { 10408 return c >= '0' && c <= '9'; 10409 } 10410 else 10411 { 10412 return numberTrie[c]; 10413 } 10414 } 10415 10416 @safe unittest 10417 { 10418 auto n = unicode("N"); 10419 foreach (ch; n.byCodepoint) 10420 assert(isNumber(ch)); 10421 foreach (ch; 0 .. 0x4000) 10422 assert((ch in n) == isNumber(ch)); 10423 } 10424 10425 /++ 10426 Returns whether `c` is a Unicode alphabetic $(CHARACTER) or number. 10427 (general Unicode category: Alphabetic, Nd, Nl, No). 10428 10429 Params: 10430 c = any Unicode character 10431 Returns: 10432 `true` if the character is in the Alphabetic, Nd, Nl, or No Unicode 10433 categories 10434 +/ 10435 @safe pure nothrow @nogc 10436 bool isAlphaNum(dchar c) 10437 { 10438 static import std.ascii; 10439 10440 // optimization for ascii case 10441 if (std.ascii.isASCII(c)) 10442 { 10443 return std.ascii.isAlphaNum(c); 10444 } 10445 else 10446 { 10447 return isAlpha(c) || isNumber(c); 10448 } 10449 } 10450 10451 @safe unittest 10452 { 10453 auto n = unicode("N"); 10454 auto alpha = unicode("Alphabetic"); 10455 10456 foreach (ch; n.byCodepoint) 10457 assert(isAlphaNum(ch)); 10458 10459 foreach (ch; alpha.byCodepoint) 10460 assert(isAlphaNum(ch)); 10461 10462 foreach (ch; 0 .. 0x4000) 10463 { 10464 assert(((ch in n) || (ch in alpha)) == isAlphaNum(ch)); 10465 } 10466 } 10467 10468 /++ 10469 Returns whether `c` is a Unicode punctuation $(CHARACTER) 10470 (general Unicode category: Pd, Ps, Pe, Pc, Po, Pi, Pf). 10471 +/ 10472 @safe pure nothrow @nogc 10473 bool isPunctuation(dchar c) 10474 { 10475 static import std.ascii; 10476 10477 // optimization for ascii case 10478 if (c <= 0x7F) 10479 { 10480 return std.ascii.isPunctuation(c); 10481 } 10482 else 10483 { 10484 return punctuationTrie[c]; 10485 } 10486 } 10487 10488 @safe unittest 10489 { 10490 assert(isPunctuation('\u0021')); 10491 assert(isPunctuation('\u0028')); 10492 assert(isPunctuation('\u0029')); 10493 assert(isPunctuation('\u002D')); 10494 assert(isPunctuation('\u005F')); 10495 assert(isPunctuation('\u00AB')); 10496 assert(isPunctuation('\u00BB')); 10497 foreach (ch; unicode("P").byCodepoint) 10498 assert(isPunctuation(ch)); 10499 } 10500 10501 /++ 10502 Returns whether `c` is a Unicode symbol $(CHARACTER) 10503 (general Unicode category: Sm, Sc, Sk, So). 10504 +/ 10505 @safe pure nothrow @nogc 10506 bool isSymbol(dchar c) 10507 { 10508 return symbolTrie[c]; 10509 } 10510 10511 @safe unittest 10512 { 10513 import std.format : format; 10514 assert(isSymbol('\u0024')); 10515 assert(isSymbol('\u002B')); 10516 assert(isSymbol('\u005E')); 10517 assert(isSymbol('\u00A6')); 10518 foreach (ch; unicode("S").byCodepoint) 10519 assert(isSymbol(ch), format("%04x", ch)); 10520 } 10521 10522 /++ 10523 Returns whether `c` is a Unicode space $(CHARACTER) 10524 (general Unicode category: Zs) 10525 Note: This doesn't include '\n', '\r', \t' and other non-space $(CHARACTER). 10526 For commonly used less strict semantics see $(LREF isWhite). 10527 +/ 10528 @safe pure nothrow @nogc 10529 bool isSpace(dchar c) 10530 { 10531 import std.internal.unicode_tables : isSpaceGen; // generated file 10532 return isSpaceGen(c); 10533 } 10534 10535 @safe unittest 10536 { 10537 assert(isSpace('\u0020')); 10538 auto space = unicode.Zs; 10539 foreach (ch; space.byCodepoint) 10540 assert(isSpace(ch)); 10541 foreach (ch; 0 .. 0x1000) 10542 assert(isSpace(ch) == space[ch]); 10543 } 10544 10545 10546 /++ 10547 Returns whether `c` is a Unicode graphical $(CHARACTER) 10548 (general Unicode category: L, M, N, P, S, Zs). 10549 10550 +/ 10551 @safe pure nothrow @nogc 10552 bool isGraphical(dchar c) 10553 { 10554 return graphicalTrie[c]; 10555 } 10556 10557 10558 @safe unittest 10559 { 10560 auto set = unicode("Graphical"); 10561 import std.format : format; 10562 foreach (ch; set.byCodepoint) 10563 assert(isGraphical(ch), format("%4x", ch)); 10564 foreach (ch; 0 .. 0x4000) 10565 assert((ch in set) == isGraphical(ch)); 10566 } 10567 10568 10569 /++ 10570 Returns whether `c` is a Unicode control $(CHARACTER) 10571 (general Unicode category: Cc). 10572 +/ 10573 @safe pure nothrow @nogc 10574 bool isControl(dchar c) 10575 { 10576 import std.internal.unicode_tables : isControlGen; // generated file 10577 return isControlGen(c); 10578 } 10579 10580 @safe unittest 10581 { 10582 assert(isControl('\u0000')); 10583 assert(isControl('\u0081')); 10584 assert(!isControl('\u0100')); 10585 auto cc = unicode.Cc; 10586 foreach (ch; cc.byCodepoint) 10587 assert(isControl(ch)); 10588 foreach (ch; 0 .. 0x1000) 10589 assert(isControl(ch) == cc[ch]); 10590 } 10591 10592 10593 /++ 10594 Returns whether `c` is a Unicode formatting $(CHARACTER) 10595 (general Unicode category: Cf). 10596 +/ 10597 @safe pure nothrow @nogc 10598 bool isFormat(dchar c) 10599 { 10600 import std.internal.unicode_tables : isFormatGen; // generated file 10601 return isFormatGen(c); 10602 } 10603 10604 10605 @safe unittest 10606 { 10607 assert(isFormat('\u00AD')); 10608 foreach (ch; unicode("Format").byCodepoint) 10609 assert(isFormat(ch)); 10610 } 10611 10612 // code points for private use, surrogates are not likely to change in near feature 10613 // if need be they can be generated from unicode data as well 10614 10615 /++ 10616 Returns whether `c` is a Unicode Private Use $(CODEPOINT) 10617 (general Unicode category: Co). 10618 +/ 10619 @safe pure nothrow @nogc 10620 bool isPrivateUse(dchar c) 10621 { 10622 return (0x00_E000 <= c && c <= 0x00_F8FF) 10623 || (0x0F_0000 <= c && c <= 0x0F_FFFD) 10624 || (0x10_0000 <= c && c <= 0x10_FFFD); 10625 } 10626 10627 /++ 10628 Returns whether `c` is a Unicode surrogate $(CODEPOINT) 10629 (general Unicode category: Cs). 10630 +/ 10631 @safe pure nothrow @nogc 10632 bool isSurrogate(dchar c) 10633 { 10634 return (0xD800 <= c && c <= 0xDFFF); 10635 } 10636 10637 /++ 10638 Returns whether `c` is a Unicode high surrogate (lead surrogate). 10639 +/ 10640 @safe pure nothrow @nogc 10641 bool isSurrogateHi(dchar c) 10642 { 10643 return (0xD800 <= c && c <= 0xDBFF); 10644 } 10645 10646 /++ 10647 Returns whether `c` is a Unicode low surrogate (trail surrogate). 10648 +/ 10649 @safe pure nothrow @nogc 10650 bool isSurrogateLo(dchar c) 10651 { 10652 return (0xDC00 <= c && c <= 0xDFFF); 10653 } 10654 10655 /++ 10656 Returns whether `c` is a Unicode non-character i.e. 10657 a $(CODEPOINT) with no assigned abstract character. 10658 (general Unicode category: Cn) 10659 +/ 10660 @safe pure nothrow @nogc 10661 bool isNonCharacter(dchar c) 10662 { 10663 return nonCharacterTrie[c]; 10664 } 10665 10666 @safe unittest 10667 { 10668 auto set = unicode("Cn"); 10669 foreach (ch; set.byCodepoint) 10670 assert(isNonCharacter(ch)); 10671 } 10672 10673 private: 10674 // load static data from pre-generated tables into usable datastructures 10675 10676 10677 @safe auto asSet(const (ubyte)[] compressed) pure 10678 { 10679 return CodepointSet.fromIntervals(decompressIntervals(compressed)); 10680 } 10681 10682 @safe pure nothrow auto asTrie(T...)(const scope TrieEntry!T e) 10683 { 10684 return const(CodepointTrie!T)(e.offsets, e.sizes, e.data); 10685 } 10686 10687 @safe pure nothrow @nogc @property 10688 { 10689 // It's important to use auto return here, so that the compiler 10690 // only runs semantic on the return type if the function gets 10691 // used. Also these are functions rather than templates to not 10692 // increase the object size of the caller. 10693 auto lowerCaseTrie() { static immutable res = asTrie(lowerCaseTrieEntries); return res; } 10694 auto upperCaseTrie() { static immutable res = asTrie(upperCaseTrieEntries); return res; } 10695 auto simpleCaseTrie() { static immutable res = asTrie(simpleCaseTrieEntries); return res; } 10696 auto fullCaseTrie() { static immutable res = asTrie(fullCaseTrieEntries); return res; } 10697 auto alphaTrie() { static immutable res = asTrie(alphaTrieEntries); return res; } 10698 auto markTrie() { static immutable res = asTrie(markTrieEntries); return res; } 10699 auto numberTrie() { static immutable res = asTrie(numberTrieEntries); return res; } 10700 auto punctuationTrie() { static immutable res = asTrie(punctuationTrieEntries); return res; } 10701 auto symbolTrie() { static immutable res = asTrie(symbolTrieEntries); return res; } 10702 auto graphicalTrie() { static immutable res = asTrie(graphicalTrieEntries); return res; } 10703 auto nonCharacterTrie() { static immutable res = asTrie(nonCharacterTrieEntries); return res; } 10704 10705 //normalization quick-check tables 10706 auto nfcQCTrie() 10707 { 10708 import std.internal.unicode_norm : nfcQCTrieEntries; 10709 static immutable res = asTrie(nfcQCTrieEntries); 10710 return res; 10711 } 10712 10713 auto nfdQCTrie() 10714 { 10715 import std.internal.unicode_norm : nfdQCTrieEntries; 10716 static immutable res = asTrie(nfdQCTrieEntries); 10717 return res; 10718 } 10719 10720 auto nfkcQCTrie() 10721 { 10722 import std.internal.unicode_norm : nfkcQCTrieEntries; 10723 static immutable res = asTrie(nfkcQCTrieEntries); 10724 return res; 10725 } 10726 10727 auto nfkdQCTrie() 10728 { 10729 import std.internal.unicode_norm : nfkdQCTrieEntries; 10730 static immutable res = asTrie(nfkdQCTrieEntries); 10731 return res; 10732 } 10733 10734 //grapheme breaking algorithm tables 10735 auto spacingMarkTrie() 10736 { 10737 import std.internal.unicode_grapheme : spacingMarkTrieEntries; 10738 static immutable res = asTrie(spacingMarkTrieEntries); 10739 return res; 10740 } 10741 10742 auto graphemeExtendTrie() 10743 { 10744 import std.internal.unicode_grapheme : graphemeExtendTrieEntries; 10745 static immutable res = asTrie(graphemeExtendTrieEntries); 10746 return res; 10747 } 10748 10749 auto hangLV() 10750 { 10751 import std.internal.unicode_grapheme : hangulLVTrieEntries; 10752 static immutable res = asTrie(hangulLVTrieEntries); 10753 return res; 10754 } 10755 10756 auto hangLVT() 10757 { 10758 import std.internal.unicode_grapheme : hangulLVTTrieEntries; 10759 static immutable res = asTrie(hangulLVTTrieEntries); 10760 return res; 10761 } 10762 10763 auto prependTrie() 10764 { 10765 import std.internal.unicode_grapheme : prependTrieEntries; 10766 static immutable res = asTrie(prependTrieEntries); 10767 return res; 10768 } 10769 10770 auto graphemeControlTrie() 10771 { 10772 import std.internal.unicode_grapheme : controlTrieEntries; 10773 static immutable res = asTrie(controlTrieEntries); 10774 return res; 10775 } 10776 10777 auto xpictoTrie() 10778 { 10779 import std.internal.unicode_grapheme : Extended_PictographicTrieEntries; 10780 static immutable res = asTrie(Extended_PictographicTrieEntries); 10781 return res; 10782 } 10783 10784 // tables below are used for composition/decomposition 10785 auto combiningClassTrie() 10786 { 10787 import std.internal.unicode_comp : combiningClassTrieEntries; 10788 static immutable res = asTrie(combiningClassTrieEntries); 10789 return res; 10790 } 10791 10792 auto compatMappingTrie() 10793 { 10794 import std.internal.unicode_decomp : compatMappingTrieEntries; 10795 static immutable res = asTrie(compatMappingTrieEntries); 10796 return res; 10797 } 10798 10799 auto canonMappingTrie() 10800 { 10801 import std.internal.unicode_decomp : canonMappingTrieEntries; 10802 static immutable res = asTrie(canonMappingTrieEntries); 10803 return res; 10804 } 10805 10806 auto compositionJumpTrie() 10807 { 10808 import std.internal.unicode_comp : compositionJumpTrieEntries; 10809 static immutable res = asTrie(compositionJumpTrieEntries); 10810 return res; 10811 } 10812 10813 //case conversion tables 10814 auto toUpperIndexTrie() { static immutable res = asTrie(toUpperIndexTrieEntries); return res; } 10815 auto toLowerIndexTrie() { static immutable res = asTrie(toLowerIndexTrieEntries); return res; } 10816 auto toTitleIndexTrie() { static immutable res = asTrie(toTitleIndexTrieEntries); return res; } 10817 //simple case conversion tables 10818 auto toUpperSimpleIndexTrie() { static immutable res = asTrie(toUpperSimpleIndexTrieEntries); return res; } 10819 auto toLowerSimpleIndexTrie() { static immutable res = asTrie(toLowerSimpleIndexTrieEntries); return res; } 10820 auto toTitleSimpleIndexTrie() { static immutable res = asTrie(toTitleSimpleIndexTrieEntries); return res; } 10821 10822 } 10823 10824 }// version (!std_uni_bootstrap)