1 // Written in the D programming language. 2 3 /++ 4 $(P The `std.uni` module provides an implementation 5 of fundamental Unicode algorithms and data structures. 6 This doesn't include UTF encoding and decoding primitives, 7 see $(REF decode, std,_utf) and $(REF encode, std,_utf) in $(MREF std, utf) 8 for this functionality. ) 9 10 $(SCRIPT inhibitQuickIndex = 1;) 11 $(DIVC quickindex, 12 $(BOOKTABLE, 13 $(TR $(TH Category) $(TH Functions)) 14 $(TR $(TD Decode) $(TD 15 $(LREF byCodePoint) 16 $(LREF byGrapheme) 17 $(LREF decodeGrapheme) 18 $(LREF graphemeStride) 19 )) 20 $(TR $(TD Comparison) $(TD 21 $(LREF icmp) 22 $(LREF sicmp) 23 )) 24 $(TR $(TD Classification) $(TD 25 $(LREF isAlpha) 26 $(LREF isAlphaNum) 27 $(LREF isCodepointSet) 28 $(LREF isControl) 29 $(LREF isFormat) 30 $(LREF isGraphical) 31 $(LREF isIntegralPair) 32 $(LREF isMark) 33 $(LREF isNonCharacter) 34 $(LREF isNumber) 35 $(LREF isPrivateUse) 36 $(LREF isPunctuation) 37 $(LREF isSpace) 38 $(LREF isSurrogate) 39 $(LREF isSurrogateHi) 40 $(LREF isSurrogateLo) 41 $(LREF isSymbol) 42 $(LREF isWhite) 43 )) 44 $(TR $(TD Normalization) $(TD 45 $(LREF NFC) 46 $(LREF NFD) 47 $(LREF NFKD) 48 $(LREF NormalizationForm) 49 $(LREF normalize) 50 )) 51 $(TR $(TD Decompose) $(TD 52 $(LREF decompose) 53 $(LREF decomposeHangul) 54 $(LREF UnicodeDecomposition) 55 )) 56 $(TR $(TD Compose) $(TD 57 $(LREF compose) 58 $(LREF composeJamo) 59 )) 60 $(TR $(TD Sets) $(TD 61 $(LREF CodepointInterval) 62 $(LREF CodepointSet) 63 $(LREF InversionList) 64 $(LREF unicode) 65 )) 66 $(TR $(TD Trie) $(TD 67 $(LREF codepointSetTrie) 68 $(LREF CodepointSetTrie) 69 $(LREF codepointTrie) 70 $(LREF CodepointTrie) 71 $(LREF toTrie) 72 $(LREF toDelegate) 73 )) 74 $(TR $(TD Casing) $(TD 75 $(LREF asCapitalized) 76 $(LREF asLowerCase) 77 $(LREF asUpperCase) 78 $(LREF isLower) 79 $(LREF isUpper) 80 $(LREF toLower) 81 $(LREF toLowerInPlace) 82 $(LREF toUpper) 83 $(LREF toUpperInPlace) 84 )) 85 $(TR $(TD Utf8Matcher) $(TD 86 $(LREF isUtfMatcher) 
87 $(LREF MatcherConcept) 88 $(LREF utfMatcher) 89 )) 90 $(TR $(TD Separators) $(TD 91 $(LREF lineSep) 92 $(LREF nelSep) 93 $(LREF paraSep) 94 )) 95 $(TR $(TD Building blocks) $(TD 96 $(LREF allowedIn) 97 $(LREF combiningClass) 98 $(LREF Grapheme) 99 )) 100 )) 101 102 $(P All primitives listed operate on Unicode characters and 103 sets of characters. For functions which operate on ASCII characters 104 and ignore Unicode $(CHARACTERS), see $(MREF std, ascii). 105 For definitions of Unicode $(CHARACTER), $(CODEPOINT) and other terms 106 used throughout this module see the $(S_LINK Terminology, terminology) section 107 below. 108 ) 109 $(P The focus of this module is the core needs of developing Unicode-aware 110 applications. To that effect it provides the following optimized primitives: 111 ) 112 $(UL 113 $(LI Character classification by category and common properties: 114 $(LREF isAlpha), $(LREF isWhite) and others. 115 ) 116 $(LI 117 Case-insensitive string comparison ($(LREF sicmp), $(LREF icmp)). 118 ) 119 $(LI 120 Converting text to any of the four normalization forms via $(LREF normalize). 121 ) 122 $(LI 123 Decoding ($(LREF decodeGrapheme)) and iteration ($(LREF byGrapheme), $(LREF graphemeStride)) 124 by user-perceived characters, that is by $(LREF Grapheme) clusters. 125 ) 126 $(LI 127 Decomposing and composing of individual character(s) according to canonical 128 or compatibility rules, see $(LREF compose) and $(LREF decompose), 129 including the specific version for Hangul syllables $(LREF composeJamo) 130 and $(LREF decomposeHangul). 131 ) 132 ) 133 $(P It's recognized that an application may need further enhancements 134 and extensions, such as less commonly known algorithms, 135 or tailoring existing ones for region specific needs. To help users 136 with building any extra functionality beyond the core primitives, 137 the module provides: 138 ) 139 $(UL 140 $(LI 141 $(LREF CodepointSet), a type for easy manipulation of sets of characters. 
142 Besides the typical set algebra it provides an unusual feature: 143 a D source code generator for detection of $(CODEPOINTS) in this set. 144 This is a boon for meta-programming parser frameworks, 145 and is used internally to power classification in small 146 sets like $(LREF isWhite). 147 ) 148 $(LI 149 A way to construct optimal packed multi-stage tables also known as a 150 special case of $(LINK2 https://en.wikipedia.org/wiki/Trie, Trie). 151 The functions $(LREF codepointTrie), $(LREF codepointSetTrie) 152 construct custom tries that map dchar to value. 153 The end result is a fast and predictable $(BIGOH 1) lookup that powers 154 functions like $(LREF isAlpha) and $(LREF combiningClass), 155 but for user-defined data sets. 156 ) 157 $(LI 158 A useful technique for Unicode-aware parsers that perform 159 character classification of encoded $(CODEPOINTS) 160 is to avoid unnecessary decoding at all costs. 161 $(LREF utfMatcher) provides an improvement over the usual workflow 162 of decode-classify-process, combining the decoding and classification 163 steps. By extracting necessary bits directly from encoded 164 $(S_LINK Code unit, code units) matchers achieve 165 significant performance improvements. See $(LREF MatcherConcept) for 166 the common interface of UTF matchers. 167 ) 168 $(LI 169 Generally useful building blocks for customized normalization: 170 $(LREF combiningClass) for querying combining class 171 and $(LREF allowedIn) for testing the Quick_Check 172 property of a given normalization form. 173 ) 174 $(LI 175 Access to a large selection of commonly used sets of $(CODEPOINTS). 176 $(S_LINK Unicode properties, Supported sets) include Script, 177 Block and General Category. The exact contents of a set can be 178 observed in the CLDR utility, on the 179 $(HTTP www.unicode.org/cldr/utility/properties.jsp, property index) page 180 of the Unicode website. 181 See $(LREF unicode) for easy and (optionally) compile-time checked set 182 queries.
183 ) 184 ) 185 $(SECTION Synopsis) 186 --- 187 import std.uni; 188 void main() 189 { 190 // initialize code point sets using script/block or property name 191 // now 'set' contains code points from both scripts. 192 auto set = unicode("Cyrillic") | unicode("Armenian"); 193 // same thing but simpler and checked at compile-time 194 auto ascii = unicode.ASCII; 195 auto currency = unicode.Currency_Symbol; 196 197 // easy set ops 198 auto a = set & ascii; 199 assert(a.empty); // as it has no intersection with ascii 200 a = set | ascii; 201 auto b = currency - a; // subtract all ASCII, Cyrillic and Armenian 202 203 // some properties of code point sets 204 assert(b.length > 45); // 46 items in Unicode 6.1, even more in 6.2 205 // testing presence of a code point in a set 206 // is just fine, it is O(logN) 207 assert(!b['$']); 208 assert(!b['\u058F']); // Armenian dram sign 209 assert(b['¥']); 210 211 // building fast lookup tables, these guarantee O(1) complexity 212 // 1-level Trie lookup table essentially a huge bit-set ~262Kb 213 auto oneTrie = toTrie!1(b); 214 // 2-level far more compact but typically slightly slower 215 auto twoTrie = toTrie!2(b); 216 // 3-level even smaller, and a bit slower yet 217 auto threeTrie = toTrie!3(b); 218 assert(oneTrie['£']); 219 assert(twoTrie['£']); 220 assert(threeTrie['£']); 221 222 // build the trie with the most sensible trie level 223 // and bind it as a functor 224 auto cyrillicOrArmenian = toDelegate(set); 225 auto balance = find!(cyrillicOrArmenian)("Hello ընկեր!"); 226 assert(balance == "ընկեր!"); 227 // compatible with bool delegate(dchar) 228 bool delegate(dchar) bindIt = cyrillicOrArmenian; 229 230 // Normalization 231 string s = "Plain ascii (and not only), is always normalized!"; 232 assert(s is normalize(s));// is the same string 233 234 string nonS = "A\u0308ffin"; // A ligature 235 auto nS = normalize(nonS); // to NFC, the W3C endorsed standard 236 assert(nS == "Äffin"); 237 assert(nS != nonS); 238 string composed = 
"Äffin"; 239 240 assert(normalize!NFD(composed) == "A\u0308ffin"); 241 // to NFKD, compatibility decomposition useful for fuzzy matching/searching 242 assert(normalize!NFKD("2¹⁰") == "210"); 243 } 244 --- 245 $(SECTION Terminology) 246 $(P The following is a list of important Unicode notions 247 and definitions. Any conventions used specifically in this 248 module alone are marked as such. The descriptions are based on the formal 249 definition as found in $(HTTP www.unicode.org/versions/Unicode6.2.0/ch03.pdf, 250 chapter three of The Unicode Standard Core Specification.) 251 ) 252 $(P $(DEF Abstract character) A unit of information used for the organization, 253 control, or representation of textual data. 254 Note that: 255 $(UL 256 $(LI When representing data, the nature of that data 257 is generally symbolic as opposed to some other 258 kind of data (for example, visual). 259 ) 260 $(LI An abstract character has no concrete form 261 and should not be confused with a $(S_LINK Glyph, glyph). 262 ) 263 $(LI An abstract character does not necessarily 264 correspond to what a user thinks of as a “character” 265 and should not be confused with a $(LREF Grapheme). 266 ) 267 $(LI The abstract characters encoded (see Encoded character) 268 are known as Unicode abstract characters. 269 ) 270 $(LI Abstract characters not directly 271 encoded by the Unicode Standard can often be 272 represented by the use of combining character sequences. 273 ) 274 ) 275 ) 276 $(P $(DEF Canonical decomposition) 277 The decomposition of a character or character sequence 278 that results from recursively applying the canonical 279 mappings found in the Unicode Character Database 280 and these described in Conjoining Jamo Behavior 281 (section 12 of 282 $(HTTP www.unicode.org/uni2book/ch03.pdf, Unicode Conformance)). 
283 ) 284 $(P $(DEF Canonical composition) 285 The precise definition of the Canonical composition 286 is the algorithm as specified in $(HTTP www.unicode.org/uni2book/ch03.pdf, 287 Unicode Conformance) section 11. 288 Informally it's the process that does the reverse of the canonical 289 decomposition with the addition of certain rules 290 that e.g. prevent legacy characters from appearing in the composed result. 291 ) 292 $(P $(DEF Canonical equivalent) 293 Two character sequences are said to be canonical equivalents if 294 their full canonical decompositions are identical. 295 ) 296 $(P $(DEF Character) Typically differs by context. 297 For the purpose of this documentation the term $(I character) 298 implies $(I encoded character), that is, a code point having 299 an assigned abstract character (a symbolic meaning). 300 ) 301 $(P $(DEF Code point) Any value in the Unicode codespace; 302 that is, the range of integers from 0 to 10FFFF (hex). 303 Not all code points are assigned to encoded characters. 304 ) 305 $(P $(DEF Code unit) The minimal bit combination that can represent 306 a unit of encoded text for processing or interchange. 307 Depending on the encoding this could be: 308 8-bit code units in the UTF-8 (`char`), 309 16-bit code units in the UTF-16 (`wchar`), 310 and 32-bit code units in the UTF-32 (`dchar`). 311 $(I Note that in UTF-32, a code unit is a code point 312 and is represented by the D `dchar` type.) 313 ) 314 $(P $(DEF Combining character) A character with the General Category 315 of Combining Mark(M). 316 $(UL 317 $(LI All characters with non-zero canonical combining class 318 are combining characters, but the reverse is not the case: 319 there are combining characters with a zero combining class. 320 ) 321 $(LI These characters are not normally used in isolation 322 unless they are being described. They include such characters 323 as accents, diacritics, Hebrew points, Arabic vowel signs, 324 and Indic matras. 
325 ) 326 ) 327 ) 328 $(P $(DEF Combining class) 329 A numerical value used by the Unicode Canonical Ordering Algorithm 330 to determine which sequences of combining marks are to be 331 considered canonically equivalent and which are not. 332 ) 333 $(P $(DEF Compatibility decomposition) 334 The decomposition of a character or character sequence that results 335 from recursively applying both the compatibility mappings and 336 the canonical mappings found in the Unicode Character Database, and those 337 described in Conjoining Jamo Behavior, until no characters 338 can be further decomposed. 339 ) 340 $(P $(DEF Compatibility equivalent) 341 Two character sequences are said to be compatibility 342 equivalents if their full compatibility decompositions are identical. 343 ) 344 $(P $(DEF Encoded character) An association (or mapping) 345 between an abstract character and a code point. 346 ) 347 $(P $(DEF Glyph) The actual, concrete image of a glyph representation 348 having been rasterized or otherwise imaged onto some display surface. 349 ) 350 $(P $(DEF Grapheme base) A character with the property 351 Grapheme_Base, or any standard Korean syllable block. 352 ) 353 $(P $(DEF Grapheme cluster) Defined as the text between 354 grapheme boundaries as specified by Unicode Standard Annex #29, 355 $(HTTP www.unicode.org/reports/tr29/, Unicode text segmentation). 356 Important general properties of a grapheme: 357 $(UL 358 $(LI The grapheme cluster represents a horizontally segmentable 359 unit of text, consisting of some grapheme base (which may 360 consist of a Korean syllable) together with any number of 361 nonspacing marks applied to it. 362 ) 363 $(LI A grapheme cluster typically starts with a grapheme base 364 and then extends across any subsequent sequence of nonspacing marks. 365 A grapheme cluster is most directly relevant to text rendering and 366 processes such as cursor placement and text selection in editing, 367 but may also be relevant to comparison and searching.
368 ) 369 $(LI For many processes, a grapheme cluster behaves as if it was a 370 single character with the same properties as its grapheme base. 371 Effectively, nonspacing marks apply $(I graphically) to the base, 372 but do not change its properties. 373 ) 374 ) 375 $(P This module defines a number of primitives that work with graphemes: 376 $(LREF Grapheme), $(LREF decodeGrapheme) and $(LREF graphemeStride). 377 All of them are using $(I extended grapheme) boundaries 378 as defined in the aforementioned standard annex. 379 ) 380 ) 381 $(P $(DEF Nonspacing mark) A combining character with the 382 General Category of Nonspacing Mark (Mn) or Enclosing Mark (Me). 383 ) 384 $(P $(DEF Spacing mark) A combining character that is not a nonspacing mark. 385 ) 386 $(SECTION Normalization) 387 $(P The concepts of $(S_LINK Canonical equivalent, canonical equivalent) 388 or $(S_LINK Compatibility equivalent, compatibility equivalent) 389 characters in the Unicode Standard make it necessary to have a full, formal 390 definition of equivalence for Unicode strings. 391 String equivalence is determined by a process called normalization, 392 whereby strings are converted into forms which are compared 393 directly for identity. This is the primary goal of the normalization process, 394 see the function $(LREF normalize) to convert into any of 395 the four defined forms. 396 ) 397 $(P A very important attribute of the Unicode Normalization Forms 398 is that they must remain stable between versions of the Unicode Standard. 399 A Unicode string normalized to a particular Unicode Normalization Form 400 in one version of the standard is guaranteed to remain in that Normalization 401 Form for implementations of future versions of the standard. 402 ) 403 $(P The Unicode Standard specifies four normalization forms. 
404 Informally, two of these forms are defined by maximal decomposition 405 of equivalent sequences, and two of these forms are defined 406 by maximal $(I composition) of equivalent sequences. 407 $(UL 408 $(LI Normalization Form D (NFD): The $(S_LINK Canonical decomposition, 409 canonical decomposition) of a character sequence.) 410 $(LI Normalization Form KD (NFKD): The $(S_LINK Compatibility decomposition, 411 compatibility decomposition) of a character sequence.) 412 $(LI Normalization Form C (NFC): The canonical composition of the 413 $(S_LINK Canonical decomposition, canonical decomposition) 414 of a coded character sequence.) 415 $(LI Normalization Form KC (NFKC): The canonical composition 416 of the $(S_LINK Compatibility decomposition, 417 compatibility decomposition) of a character sequence) 418 ) 419 ) 420 $(P The choice of the normalization form depends on the particular use case. 421 NFC is the best form for general text, since it's more compatible with 422 strings converted from legacy encodings. NFKC is the preferred form for 423 identifiers, especially where there are security concerns. NFD and NFKD 424 are the most useful for internal processing. 425 ) 426 $(SECTION Construction of lookup tables) 427 $(P The Unicode standard describes a set of algorithms that 428 depend on having the ability to quickly look up various properties 429 of a code point. Given the codespace of about 1 million $(CODEPOINTS), 430 it is not a trivial task to provide a space-efficient solution for 431 the multitude of properties. 432 ) 433 $(P Common approaches such as hash-tables or binary search over 434 sorted code point intervals (as in $(LREF InversionList)) are insufficient. 435 Hash-tables have enormous memory footprint and binary search 436 over intervals is not fast enough for some heavy-duty algorithms. 
437 ) 438 $(P The recommended solution (see Unicode Implementation Guidelines) 439 is using multi-stage tables that are an implementation of the 440 $(HTTP en.wikipedia.org/wiki/Trie, Trie) data structure with integer 441 keys and a fixed number of stages. For the remainder of the section 442 this will be called a fixed trie. The following describes a particular 443 implementation that is aimed at speed of access at the expense 444 of ideal size savings. 445 ) 446 $(P Taking a 2-level Trie as an example the principle of operation is as follows. 447 Split the number of bits in a key (code point, 21 bits) into 2 components 448 (e.g. 13 and 8). The first is the number of bits in the index of the trie 449 and the other is the number of bits in each page of the trie. 450 The layout of the trie is then an array of size 2^^bits-of-index followed 451 by an array of memory chunks of size 2^^bits-of-page/bits-per-element. 452 ) 453 $(P The number of pages is variable (but not less than 1) 454 unlike the number of entries in the index. The slots of the index 455 all have to contain a number of a page that is present. The lookup is then 456 just a couple of operations - slice the upper bits, 457 lookup an index for these, take a page at this index and use 458 the lower bits as an offset within this page. 459 460 Assuming that pages are laid out consecutively 461 in one array at `pages`, the pseudo-code is: 462 ) 463 --- 464 auto elemsPerPage = (2 ^^ bits_per_page) / Value.sizeOfInBits; 465 pages[index[n >> bits_per_page]][n & (elemsPerPage - 1)]; 466 --- 467 $(P Where if `elemsPerPage` is a power of 2 the whole process is 468 a handful of simple instructions and 2 array reads. Subsequent levels 469 of the trie are introduced by recursing on this notion - the index array 470 is treated as values. The number of bits in index is then again 471 split into 2 parts, with pages over 'current-index' and the new 'upper-index'.
472 ) 473 474 $(P For completeness a level 1 trie is simply an array. 475 The current implementation takes advantage of bit-packing values 476 when the range is known to be limited in advance (such as `bool`). 477 See also $(LREF BitPacked) for enforcing it manually. 478 The major size advantage however comes from the fact 479 that multiple $(B identical pages on every level are merged) by construction. 480 ) 481 $(P The process of constructing a trie is more involved and is hidden from 482 the user in a form of the convenience functions $(LREF codepointTrie), 483 $(LREF codepointSetTrie) and the even more convenient $(LREF toTrie). 484 In general a set or built-in AA with `dchar` type 485 can be turned into a trie. The trie object in this module 486 is read-only (immutable); it's effectively frozen after construction. 487 ) 488 $(SECTION Unicode properties) 489 $(P This is a full list of Unicode properties accessible through $(LREF unicode) 490 with specific helpers per category nested within. Consult the 491 $(HTTP www.unicode.org/cldr/utility/properties.jsp, CLDR utility) 492 when in doubt about the contents of a particular set. 493 ) 494 $(P General category sets listed below are only accessible with the 495 $(LREF unicode) shorthand accessor.) 496 $(BOOKTABLE $(B General category ), 497 $(TR $(TH Abb.) $(TH Long form) 498 $(TH Abb.) $(TH Long form)$(TH Abb.) 
$(TH Long form)) 499 $(TR $(TD L) $(TD Letter) 500 $(TD Cn) $(TD Unassigned) $(TD Po) $(TD Other_Punctuation)) 501 $(TR $(TD Ll) $(TD Lowercase_Letter) 502 $(TD Co) $(TD Private_Use) $(TD Ps) $(TD Open_Punctuation)) 503 $(TR $(TD Lm) $(TD Modifier_Letter) 504 $(TD Cs) $(TD Surrogate) $(TD S) $(TD Symbol)) 505 $(TR $(TD Lo) $(TD Other_Letter) 506 $(TD N) $(TD Number) $(TD Sc) $(TD Currency_Symbol)) 507 $(TR $(TD Lt) $(TD Titlecase_Letter) 508 $(TD Nd) $(TD Decimal_Number) $(TD Sk) $(TD Modifier_Symbol)) 509 $(TR $(TD Lu) $(TD Uppercase_Letter) 510 $(TD Nl) $(TD Letter_Number) $(TD Sm) $(TD Math_Symbol)) 511 $(TR $(TD M) $(TD Mark) 512 $(TD No) $(TD Other_Number) $(TD So) $(TD Other_Symbol)) 513 $(TR $(TD Mc) $(TD Spacing_Mark) 514 $(TD P) $(TD Punctuation) $(TD Z) $(TD Separator)) 515 $(TR $(TD Me) $(TD Enclosing_Mark) 516 $(TD Pc) $(TD Connector_Punctuation) $(TD Zl) $(TD Line_Separator)) 517 $(TR $(TD Mn) $(TD Nonspacing_Mark) 518 $(TD Pd) $(TD Dash_Punctuation) $(TD Zp) $(TD Paragraph_Separator)) 519 $(TR $(TD C) $(TD Other) 520 $(TD Pe) $(TD Close_Punctuation) $(TD Zs) $(TD Space_Separator)) 521 $(TR $(TD Cc) $(TD Control) $(TD Pf) 522 $(TD Final_Punctuation) $(TD -) $(TD Any)) 523 $(TR $(TD Cf) $(TD Format) 524 $(TD Pi) $(TD Initial_Punctuation) $(TD -) $(TD ASCII)) 525 ) 526 $(P Sets for other commonly useful properties that are 527 accessible with $(LREF unicode):) 528 $(BOOKTABLE $(B Common binary properties), 529 $(TR $(TH Name) $(TH Name) $(TH Name)) 530 $(TR $(TD Alphabetic) $(TD Ideographic) $(TD Other_Uppercase)) 531 $(TR $(TD ASCII_Hex_Digit) $(TD IDS_Binary_Operator) $(TD Pattern_Syntax)) 532 $(TR $(TD Bidi_Control) $(TD ID_Start) $(TD Pattern_White_Space)) 533 $(TR $(TD Cased) $(TD IDS_Trinary_Operator) $(TD Quotation_Mark)) 534 $(TR $(TD Case_Ignorable) $(TD Join_Control) $(TD Radical)) 535 $(TR $(TD Dash) $(TD Logical_Order_Exception) $(TD Soft_Dotted)) 536 $(TR $(TD Default_Ignorable_Code_Point) $(TD Lowercase) $(TD STerm)) 537 $(TR $(TD 
Deprecated) $(TD Math) $(TD Terminal_Punctuation)) 538 $(TR $(TD Diacritic) $(TD Noncharacter_Code_Point) $(TD Unified_Ideograph)) 539 $(TR $(TD Extender) $(TD Other_Alphabetic) $(TD Uppercase)) 540 $(TR $(TD Grapheme_Base) $(TD Other_Default_Ignorable_Code_Point) $(TD Variation_Selector)) 541 $(TR $(TD Grapheme_Extend) $(TD Other_Grapheme_Extend) $(TD White_Space)) 542 $(TR $(TD Grapheme_Link) $(TD Other_ID_Continue) $(TD XID_Continue)) 543 $(TR $(TD Hex_Digit) $(TD Other_ID_Start) $(TD XID_Start)) 544 $(TR $(TD Hyphen) $(TD Other_Lowercase) ) 545 $(TR $(TD ID_Continue) $(TD Other_Math) ) 546 ) 547 $(P Below is the table with block names accepted by $(LREF unicode.block). 548 Note that the shorthand version $(LREF unicode) requires "In" 549 to be prepended to the names of blocks so as to disambiguate 550 scripts and blocks. 551 ) 552 $(BOOKTABLE $(B Blocks), 553 $(TR $(TD Aegean Numbers) $(TD Ethiopic Extended) $(TD Mongolian)) 554 $(TR $(TD Alchemical Symbols) $(TD Ethiopic Extended-A) $(TD Musical Symbols)) 555 $(TR $(TD Alphabetic Presentation Forms) $(TD Ethiopic Supplement) $(TD Myanmar)) 556 $(TR $(TD Ancient Greek Musical Notation) $(TD General Punctuation) $(TD Myanmar Extended-A)) 557 $(TR $(TD Ancient Greek Numbers) $(TD Geometric Shapes) $(TD New Tai Lue)) 558 $(TR $(TD Ancient Symbols) $(TD Georgian) $(TD NKo)) 559 $(TR $(TD Arabic) $(TD Georgian Supplement) $(TD Number Forms)) 560 $(TR $(TD Arabic Extended-A) $(TD Glagolitic) $(TD Ogham)) 561 $(TR $(TD Arabic Mathematical Alphabetic Symbols) $(TD Gothic) $(TD Ol Chiki)) 562 $(TR $(TD Arabic Presentation Forms-A) $(TD Greek and Coptic) $(TD Old Italic)) 563 $(TR $(TD Arabic Presentation Forms-B) $(TD Greek Extended) $(TD Old Persian)) 564 $(TR $(TD Arabic Supplement) $(TD Gujarati) $(TD Old South Arabian)) 565 $(TR $(TD Armenian) $(TD Gurmukhi) $(TD Old Turkic)) 566 $(TR $(TD Arrows) $(TD Halfwidth and Fullwidth Forms) $(TD Optical Character Recognition)) 567 $(TR $(TD Avestan) $(TD Hangul 
Compatibility Jamo) $(TD Oriya)) 568 $(TR $(TD Balinese) $(TD Hangul Jamo) $(TD Osmanya)) 569 $(TR $(TD Bamum) $(TD Hangul Jamo Extended-A) $(TD Phags-pa)) 570 $(TR $(TD Bamum Supplement) $(TD Hangul Jamo Extended-B) $(TD Phaistos Disc)) 571 $(TR $(TD Basic Latin) $(TD Hangul Syllables) $(TD Phoenician)) 572 $(TR $(TD Batak) $(TD Hanunoo) $(TD Phonetic Extensions)) 573 $(TR $(TD Bengali) $(TD Hebrew) $(TD Phonetic Extensions Supplement)) 574 $(TR $(TD Block Elements) $(TD High Private Use Surrogates) $(TD Playing Cards)) 575 $(TR $(TD Bopomofo) $(TD High Surrogates) $(TD Private Use Area)) 576 $(TR $(TD Bopomofo Extended) $(TD Hiragana) $(TD Rejang)) 577 $(TR $(TD Box Drawing) $(TD Ideographic Description Characters) $(TD Rumi Numeral Symbols)) 578 $(TR $(TD Brahmi) $(TD Imperial Aramaic) $(TD Runic)) 579 $(TR $(TD Braille Patterns) $(TD Inscriptional Pahlavi) $(TD Samaritan)) 580 $(TR $(TD Buginese) $(TD Inscriptional Parthian) $(TD Saurashtra)) 581 $(TR $(TD Buhid) $(TD IPA Extensions) $(TD Sharada)) 582 $(TR $(TD Byzantine Musical Symbols) $(TD Javanese) $(TD Shavian)) 583 $(TR $(TD Carian) $(TD Kaithi) $(TD Sinhala)) 584 $(TR $(TD Chakma) $(TD Kana Supplement) $(TD Small Form Variants)) 585 $(TR $(TD Cham) $(TD Kanbun) $(TD Sora Sompeng)) 586 $(TR $(TD Cherokee) $(TD Kangxi Radicals) $(TD Spacing Modifier Letters)) 587 $(TR $(TD CJK Compatibility) $(TD Kannada) $(TD Specials)) 588 $(TR $(TD CJK Compatibility Forms) $(TD Katakana) $(TD Sundanese)) 589 $(TR $(TD CJK Compatibility Ideographs) $(TD Katakana Phonetic Extensions) $(TD Sundanese Supplement)) 590 $(TR $(TD CJK Compatibility Ideographs Supplement) $(TD Kayah Li) $(TD Superscripts and Subscripts)) 591 $(TR $(TD CJK Radicals Supplement) $(TD Kharoshthi) $(TD Supplemental Arrows-A)) 592 $(TR $(TD CJK Strokes) $(TD Khmer) $(TD Supplemental Arrows-B)) 593 $(TR $(TD CJK Symbols and Punctuation) $(TD Khmer Symbols) $(TD Supplemental Mathematical Operators)) 594 $(TR $(TD CJK Unified Ideographs) $(TD Lao) $(TD 
Supplemental Punctuation)) 595 $(TR $(TD CJK Unified Ideographs Extension A) $(TD Latin-1 Supplement) $(TD Supplementary Private Use Area-A)) 596 $(TR $(TD CJK Unified Ideographs Extension B) $(TD Latin Extended-A) $(TD Supplementary Private Use Area-B)) 597 $(TR $(TD CJK Unified Ideographs Extension C) $(TD Latin Extended Additional) $(TD Syloti Nagri)) 598 $(TR $(TD CJK Unified Ideographs Extension D) $(TD Latin Extended-B) $(TD Syriac)) 599 $(TR $(TD Combining Diacritical Marks) $(TD Latin Extended-C) $(TD Tagalog)) 600 $(TR $(TD Combining Diacritical Marks for Symbols) $(TD Latin Extended-D) $(TD Tagbanwa)) 601 $(TR $(TD Combining Diacritical Marks Supplement) $(TD Lepcha) $(TD Tags)) 602 $(TR $(TD Combining Half Marks) $(TD Letterlike Symbols) $(TD Tai Le)) 603 $(TR $(TD Common Indic Number Forms) $(TD Limbu) $(TD Tai Tham)) 604 $(TR $(TD Control Pictures) $(TD Linear B Ideograms) $(TD Tai Viet)) 605 $(TR $(TD Coptic) $(TD Linear B Syllabary) $(TD Tai Xuan Jing Symbols)) 606 $(TR $(TD Counting Rod Numerals) $(TD Lisu) $(TD Takri)) 607 $(TR $(TD Cuneiform) $(TD Low Surrogates) $(TD Tamil)) 608 $(TR $(TD Cuneiform Numbers and Punctuation) $(TD Lycian) $(TD Telugu)) 609 $(TR $(TD Currency Symbols) $(TD Lydian) $(TD Thaana)) 610 $(TR $(TD Cypriot Syllabary) $(TD Mahjong Tiles) $(TD Thai)) 611 $(TR $(TD Cyrillic) $(TD Malayalam) $(TD Tibetan)) 612 $(TR $(TD Cyrillic Extended-A) $(TD Mandaic) $(TD Tifinagh)) 613 $(TR $(TD Cyrillic Extended-B) $(TD Mathematical Alphanumeric Symbols) $(TD Transport And Map Symbols)) 614 $(TR $(TD Cyrillic Supplement) $(TD Mathematical Operators) $(TD Ugaritic)) 615 $(TR $(TD Deseret) $(TD Meetei Mayek) $(TD Unified Canadian Aboriginal Syllabics)) 616 $(TR $(TD Devanagari) $(TD Meetei Mayek Extensions) $(TD Unified Canadian Aboriginal Syllabics Extended)) 617 $(TR $(TD Devanagari Extended) $(TD Meroitic Cursive) $(TD Vai)) 618 $(TR $(TD Dingbats) $(TD Meroitic Hieroglyphs) $(TD Variation Selectors)) 619 $(TR $(TD Domino Tiles) $(TD 
Miao) $(TD Variation Selectors Supplement)) 620 $(TR $(TD Egyptian Hieroglyphs) $(TD Miscellaneous Mathematical Symbols-A) $(TD Vedic Extensions)) 621 $(TR $(TD Emoticons) $(TD Miscellaneous Mathematical Symbols-B) $(TD Vertical Forms)) 622 $(TR $(TD Enclosed Alphanumerics) $(TD Miscellaneous Symbols) $(TD Yijing Hexagram Symbols)) 623 $(TR $(TD Enclosed Alphanumeric Supplement) $(TD Miscellaneous Symbols and Arrows) $(TD Yi Radicals)) 624 $(TR $(TD Enclosed CJK Letters and Months) $(TD Miscellaneous Symbols And Pictographs) $(TD Yi Syllables)) 625 $(TR $(TD Enclosed Ideographic Supplement) $(TD Miscellaneous Technical) ) 626 $(TR $(TD Ethiopic) $(TD Modifier Tone Letters) ) 627 ) 628 $(P Below is the table with script names accepted by $(LREF unicode.script) 629 and by the shorthand version $(LREF unicode):) 630 $(BOOKTABLE $(B Scripts), 631 $(TR $(TD Arabic) $(TD Hanunoo) $(TD Old_Italic)) 632 $(TR $(TD Armenian) $(TD Hebrew) $(TD Old_Persian)) 633 $(TR $(TD Avestan) $(TD Hiragana) $(TD Old_South_Arabian)) 634 $(TR $(TD Balinese) $(TD Imperial_Aramaic) $(TD Old_Turkic)) 635 $(TR $(TD Bamum) $(TD Inherited) $(TD Oriya)) 636 $(TR $(TD Batak) $(TD Inscriptional_Pahlavi) $(TD Osmanya)) 637 $(TR $(TD Bengali) $(TD Inscriptional_Parthian) $(TD Phags_Pa)) 638 $(TR $(TD Bopomofo) $(TD Javanese) $(TD Phoenician)) 639 $(TR $(TD Brahmi) $(TD Kaithi) $(TD Rejang)) 640 $(TR $(TD Braille) $(TD Kannada) $(TD Runic)) 641 $(TR $(TD Buginese) $(TD Katakana) $(TD Samaritan)) 642 $(TR $(TD Buhid) $(TD Kayah_Li) $(TD Saurashtra)) 643 $(TR $(TD Canadian_Aboriginal) $(TD Kharoshthi) $(TD Sharada)) 644 $(TR $(TD Carian) $(TD Khmer) $(TD Shavian)) 645 $(TR $(TD Chakma) $(TD Lao) $(TD Sinhala)) 646 $(TR $(TD Cham) $(TD Latin) $(TD Sora_Sompeng)) 647 $(TR $(TD Cherokee) $(TD Lepcha) $(TD Sundanese)) 648 $(TR $(TD Common) $(TD Limbu) $(TD Syloti_Nagri)) 649 $(TR $(TD Coptic) $(TD Linear_B) $(TD Syriac)) 650 $(TR $(TD Cuneiform) $(TD Lisu) $(TD Tagalog)) 651 $(TR $(TD Cypriot) $(TD Lycian) 
$(TD Tagbanwa)) 652 $(TR $(TD Cyrillic) $(TD Lydian) $(TD Tai_Le)) 653 $(TR $(TD Deseret) $(TD Malayalam) $(TD Tai_Tham)) 654 $(TR $(TD Devanagari) $(TD Mandaic) $(TD Tai_Viet)) 655 $(TR $(TD Egyptian_Hieroglyphs) $(TD Meetei_Mayek) $(TD Takri)) 656 $(TR $(TD Ethiopic) $(TD Meroitic_Cursive) $(TD Tamil)) 657 $(TR $(TD Georgian) $(TD Meroitic_Hieroglyphs) $(TD Telugu)) 658 $(TR $(TD Glagolitic) $(TD Miao) $(TD Thaana)) 659 $(TR $(TD Gothic) $(TD Mongolian) $(TD Thai)) 660 $(TR $(TD Greek) $(TD Myanmar) $(TD Tibetan)) 661 $(TR $(TD Gujarati) $(TD New_Tai_Lue) $(TD Tifinagh)) 662 $(TR $(TD Gurmukhi) $(TD Nko) $(TD Ugaritic)) 663 $(TR $(TD Han) $(TD Ogham) $(TD Vai)) 664 $(TR $(TD Hangul) $(TD Ol_Chiki) $(TD Yi)) 665 ) 666 $(P Below is the table of names accepted by $(LREF unicode.hangulSyllableType).) 667 $(BOOKTABLE $(B Hangul syllable type), 668 $(TR $(TH Abb.) $(TH Long form)) 669 $(TR $(TD L) $(TD Leading_Jamo)) 670 $(TR $(TD LV) $(TD LV_Syllable)) 671 $(TR $(TD LVT) $(TD LVT_Syllable) ) 672 $(TR $(TD T) $(TD Trailing_Jamo)) 673 $(TR $(TD V) $(TD Vowel_Jamo)) 674 ) 675 References: 676 $(HTTP www.digitalmars.com/d/ascii-table.html, ASCII Table), 677 $(HTTP en.wikipedia.org/wiki/Unicode, Wikipedia), 678 $(HTTP www.unicode.org, The Unicode Consortium), 679 $(HTTP www.unicode.org/reports/tr15/, Unicode normalization forms), 680 $(HTTP www.unicode.org/reports/tr29/, Unicode text segmentation) 681 $(HTTP www.unicode.org/uni2book/ch05.pdf, 682 Unicode Implementation Guidelines) 683 $(HTTP www.unicode.org/uni2book/ch03.pdf, 684 Unicode Conformance) 685 Trademarks: 686 Unicode(tm) is a trademark of Unicode, Inc. 687 688 Copyright: Copyright 2013 - 689 License: $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0). 
Authors:   Dmitry Olshansky
Source:    $(PHOBOSSRC std/uni/package.d)
Standards: $(HTTP www.unicode.org/versions/Unicode6.2.0/, Unicode v6.2)

Macros:

SECTION = <h3><a id="$1">$0</a></h3>
DEF = <div><a id="$1"><i>$0</i></a></div>
S_LINK = <a href="#$1">$+</a>
CODEPOINT = $(S_LINK Code point, code point)
CODEPOINTS = $(S_LINK Code point, code points)
CHARACTER = $(S_LINK Character, character)
CHARACTERS = $(S_LINK Character, characters)
CLUSTER = $(S_LINK Grapheme cluster, grapheme cluster)
+/
module std.uni;

import std.meta : AliasSeq;
import std.range.primitives : back, ElementEncodingType, ElementType, empty,
    front, hasLength, hasSlicing, isForwardRange, isInputRange,
    isRandomAccessRange, popFront, put, save;
import std.traits : isConvertibleToString, isIntegral, isSomeChar,
    isSomeString, Unqual, isDynamicArray;
// debug = std_uni;

import std.internal.unicode_tables; // generated file

debug(std_uni) import std.stdio; // writefln, writeln

private:


// Element-wise copy of `src` into `dest`, highest index first.
// Walking from the top makes it usable when `dest` overlaps `src`
// and lies at a higher address. Both slices must have equal length.
void copyBackwards(T,U)(T[] src, U[] dest)
{
    assert(src.length == dest.length);
    foreach_reverse (pos; 0 .. src.length)
        dest[pos] = src[pos];
}

// Element-wise copy of `src` into `dest`, lowest index first.
// Walking from the bottom makes it usable when `dest` overlaps `src`
// and lies at a lower address. Both slices must have equal length.
void copyForward(T,U)(T[] src, U[] dest)
{
    assert(src.length == dest.length);
    foreach (pos; 0 .. src.length)
        dest[pos] = src[pos];
}

// TODO: update to reflect all major CPUs supporting unaligned reads
version (X86)
    enum hasUnalignedReads = true;
else version (X86_64)
    enum hasUnalignedReads = true;
else version (SystemZ)
    enum hasUnalignedReads = true;
else
    enum hasUnalignedReads = false; // better be safe than sorry

public enum dchar lineSep = '\u2028'; /// Constant $(CODEPOINT) (0x2028) - line separator.
public enum dchar paraSep = '\u2029'; /// Constant $(CODEPOINT) (0x2029) - paragraph separator.
public enum dchar nelSep = '\u0085'; /// Constant $(CODEPOINT) (0x0085) - next line.

// test the intro example
@safe unittest
{
    import std.algorithm.searching : find;
    // initialize code point sets using script/block or property name
    // set contains code points from both scripts.
    auto set = unicode("Cyrillic") | unicode("Armenian");
    // or simpler and statically-checked look
    auto ascii = unicode.ASCII;
    auto currency = unicode.Currency_Symbol;

    // easy set ops
    auto a = set & ascii;
    assert(a.empty); // as it has no intersection with ascii
    a = set | ascii;
    auto b = currency - a; // subtract all ASCII, Cyrillic and Armenian

    // some properties of code point sets
    assert(b.length > 45); // 46 items in Unicode 6.1, even more in 6.2
    // testing presence of a code point in a set
    // is just fine, it is O(logN)
    assert(!b['$']);
    assert(!b['\u058F']); // Armenian dram sign
    assert(b['¥']);

    // building fast lookup tables, these guarantee O(1) complexity
    // 1-level Trie lookup table essentially a huge bit-set ~262Kb
    auto oneTrie = toTrie!1(b);
    // 2-level far more compact but typically slightly slower
    auto twoTrie = toTrie!2(b);
    // 3-level even smaller, and a bit slower yet
    auto threeTrie = toTrie!3(b);
    assert(oneTrie['£']);
    assert(twoTrie['£']);
    assert(threeTrie['£']);

    // build the trie with the most sensible trie level
    // and bind it as a functor
    auto cyrillicOrArmenian = toDelegate(set);
    auto balance = find!(cyrillicOrArmenian)("Hello ընկեր!");
    assert(balance == "ընկեր!");
    // compatible with bool delegate(dchar)
    bool delegate(dchar) bindIt = cyrillicOrArmenian;

    // Normalization
    string s = "Plain ascii (and not only), is always normalized!";
    assert(s is normalize(s));// is the same string

    string nonS = "A\u0308ffin"; // 'A' followed by U+0308 (combining diaeresis)
    auto nS = normalize(nonS); // to NFC, the W3C endorsed standard
    assert(nS == "Äffin");
    assert(nS != nonS);
    string composed = "Äffin";

    assert(normalize!NFD(composed) == "A\u0308ffin");
    // to NFKD, compatibility decomposition useful for fuzzy matching/searching
    assert(normalize!NFKD("2¹⁰") == "210");
}

enum lastDchar = 0x10FFFF;

// Checked narrowing conversion to an integral type `T`:
// asserts that `from` fits into T's range, then casts.
auto force(T, F)(F from)
if (isIntegral!T && !is(T == F))
{
    assert(from <= T.max && from >= T.min);
    return cast(T) from;
}

// Checked conversion to a BitPacked type `T`:
// asserts that `from` fits into the packed bit width, then wraps it.
auto force(T, F)(F from)
if (isBitPacked!T && !is(T == F))
{
    assert(from <= 2^^bitSizeOf!T-1);
    return T(cast(TypeOfBitPacked!T) from);
}

// Identity overload: source and target types are the same.
auto force(T, F)(F from)
if (is(T == F))
{
    return from;
}

// Repeat `times` times the bit-pattern in `val`, assuming its length is `bits`.
// The copies are laid out back to back starting at bit 0 of the result.
size_t replicateBits(size_t times, size_t bits)(size_t val) @safe pure nothrow @nogc
{
    static if (times == 1)
        return val;
    else static if (bits == 1)
    {
        static if (times == size_t.sizeof*8)
            return val ? size_t.max : 0;
        else
        {
            // Shift in size_t, not int: `1 << times` is an int expression and
            // is not usable for times >= 32 on targets where size_t is 64-bit.
            return val ? (cast(size_t) 1 << times) - 1 : 0;
        }
    }
    else static if (times % 2)
        return (replicateBits!(times-1, bits)(val)<<bits) | val;
    else
        return replicateBits!(times/2, bits*2)((val << bits) | val);
}

@safe pure nothrow @nogc unittest // for replicate
{
    import std.algorithm.iteration : sum, map;
    import std.range : iota;
    size_t m = 0b111;
    size_t m2 = 0b01;
    static foreach (i; AliasSeq!(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
    {
        assert(replicateBits!(i, 3)(m)+1 == (1<<(3*i)));
        assert(replicateBits!(i, 2)(m2) == iota(0, i).map!"2^^(2*a)"().sum());
    }
    // single-bit patterns, including counts that do not fit an int shift
    assert(replicateBits!(5, 1)(1) == 0b11111);
    assert(replicateBits!(5, 1)(0) == 0);
    static if (size_t.sizeof == 8)
        assert(replicateBits!(40, 1)(1) == (cast(size_t) 1 << 40) - 1);
}

// multiple arrays squashed into one memory block
struct MultiArray(Types...)
{
    import std.range.primitives : isOutputRange;
    this(size_t[] sizes...)
@safe pure nothrow
    {
        // One size per packed array; arrays are laid out one after another
        // inside `storage`, each occupying a whole number of size_t words.
        assert(dim == sizes.length);
        size_t full_size;
        foreach (i, v; Types)
        {
            full_size += spaceFor!(bitSizeOf!v)(sizes[i]);
            sz[i] = sizes[i];
            static if (i >= 1)
                offsets[i] = offsets[i-1] +
                    spaceFor!(bitSizeOf!(Types[i-1]))(sizes[i-1]);
        }

        storage = new size_t[full_size];
    }

    // Wraps already laid-out data: word offsets, item counts and the raw words.
    this(const(size_t)[] raw_offsets,
        const(size_t)[] raw_sizes,
        return scope const(size_t)[] data) return scope const @safe pure nothrow @nogc
    {
        offsets[] = raw_offsets[];
        sz[] = raw_sizes[];
        storage = data;
    }

    // Bounds-aware view over the n-th array.
    @property auto slice(size_t n)()inout pure nothrow @nogc
    {
        auto ptr = raw_ptr!n;
        return packedArrayView!(Types[n])(ptr, sz[n]);
    }

    // Unchecked packed pointer to the start of the n-th array.
    @property auto ptr(size_t n)()inout pure nothrow @nogc
    {
        auto ptr = raw_ptr!n;
        return inout(PackedPtr!(Types[n]))(ptr);
    }

    template length(size_t n)
    {
        // Number of items in the n-th array.
        @property size_t length()const @safe pure nothrow @nogc{ return sz[n]; }

        // Resizes the n-th array to `new_size` items, moving every array that
        // follows it inside `storage` and updating their offsets.
        @property void length(size_t new_size)
        {
            if (new_size > sz[n])
            {// extend
                size_t delta = (new_size - sz[n]);
                sz[n] += delta;
                delta = spaceFor!(bitSizeOf!(Types[n]))(delta);
                storage.length += delta;// extend space at end
                // raw_slice!x must follow resize as it could be moved!
                // next stmts move all data past this array, last-one-goes-first
                static if (n != dim-1)
                {
                    auto start = raw_ptr!(n+1);
                    // len includes delta
                    size_t len = (storage.ptr+storage.length-start);

                    copyBackwards(start[0 .. len-delta], start[delta .. len]);

                    start[0 .. delta] = 0;
                    // offsets are used for raw_slice, ptr etc.
                    foreach (i; n+1 .. dim)
                        offsets[i] += delta;
                }
            }
            else if (new_size < sz[n])
            {// shrink
                // Words that become unused: the difference of the rounded-up
                // word counts. Rounding the item delta on its own could
                // release a word that still holds live items
                // (e.g. going from 16 to 15 four-bit items).
                alias wordsFor = spaceFor!(bitSizeOf!(Types[n]));
                immutable delta = wordsFor(sz[n]) - wordsFor(new_size);
                sz[n] = new_size;
                // move all data past this array towards lower addresses,
                // lowest word first so the overlapping copy is safe
                static if (n != dim-1)
                {
                    if (delta != 0)
                    {
                        immutable tail = offsets[n+1];
                        copyForward(storage[tail .. $],
                            storage[tail - delta .. $ - delta]);

                        // adjust offsets last, they affect raw_slice
                        foreach (i; n+1 .. dim)
                            offsets[i] -= delta;
                    }
                }
                storage.length -= delta;
            }
            // else - NOP
        }
    }

    // Size in bytes of the whole storage (default) or of the n-th array only.
    @property size_t bytes(size_t n=size_t.max)() const @safe
    {
        static if (n == size_t.max)
            return storage.length*size_t.sizeof;
        else static if (n != Types.length-1)
            return (raw_ptr!(n+1)-raw_ptr!n)*size_t.sizeof;
        else
            return (storage.ptr+storage.length - raw_ptr!n)*size_t.sizeof;
    }

    // Writes offsets, sizes and raw words to `sink` as three hex array literals.
    void store(OutRange)(scope OutRange sink) const
    if (isOutputRange!(OutRange, char))
    {
        import std.format.write : formattedWrite;
        formattedWrite(sink, "[%( 0x%x, %)]", offsets[]);
        formattedWrite(sink, ", [%( 0x%x, %)]", sz[]);
        formattedWrite(sink, ", [%( 0x%x, %)]", storage);
    }

private:
    import std.meta : staticMap;
    // Pointer to the first word of the n-th array.
    @property auto raw_ptr(size_t n)()inout pure nothrow @nogc
    {
        static if (n == 0)
            return storage.ptr;
        else
        {
            return storage.ptr+offsets[n];
        }
    }
    enum dim = Types.length;
    size_t[dim] offsets;// offset for level x
    size_t[dim] sz;// size of level x
    alias bitWidth = staticMap!(bitSizeOf, Types);
    size_t[] storage;
}

@system unittest
{
    import std.conv : text;
    enum dg = (){
        // sizes are:
        // lvl0: 3, lvl1 : 2, lvl2: 1
        auto m = MultiArray!(int, ubyte, int)(3,2,1);

        static void check(size_t k, T)(ref T m, int n)
        {
            foreach (i; 0 .. n)
                assert(m.slice!(k)[i] == i+1, text("level:",i," : ",m.slice!(k)[0 .. n]));
        }

        static void checkB(size_t k, T)(ref T m, int n)
        {
            foreach (i; 0 .. n)
                assert(m.slice!(k)[i] == n-i, text("level:",i," : ",m.slice!(k)[0 .. n]));
        }

        static void fill(size_t k, T)(ref T m, int n)
        {
            foreach (i; 0 .. n)
                m.slice!(k)[i] = force!ubyte(i+1);
        }

        static void fillB(size_t k, T)(ref T m, int n)
        {
            foreach (i; 0 .. n)
                m.slice!(k)[i] = force!ubyte(n-i);
        }

        m.length!1 = 100;
        fill!1(m, 100);
        check!1(m, 100);

        m.length!0 = 220;
        fill!0(m, 220);
        check!1(m, 100);
        check!0(m, 220);

        m.length!2 = 17;
        fillB!2(m, 17);
        checkB!2(m, 17);
        check!0(m, 220);
        check!1(m, 100);

        m.length!2 = 33;
        checkB!2(m, 17);
        fillB!2(m, 33);
        checkB!2(m, 33);
        check!0(m, 220);
        check!1(m, 100);

        m.length!1 = 195;
        fillB!1(m, 195);
        checkB!1(m, 195);
        checkB!2(m, 33);
        check!0(m, 220);

        auto marr = MultiArray!(BitPacked!(uint, 4), BitPacked!(uint, 6))(20, 10);
        marr.length!0 = 15;
        marr.length!1 = 30;
        fill!1(marr, 30);
        fill!0(marr, 15);
        check!1(marr, 30);
        check!0(marr, 15);

        // shrinking a non-last level must preserve the levels that follow it
        auto shr = MultiArray!(ubyte, ubyte)(16, 4);
        fill!0(shr, 16);
        fill!1(shr, 4);
        shr.length!0 = 8;
        check!0(shr, 8);
        check!1(shr, 4);
        return 0;
    };
    enum ct = dg();
    auto rt = dg();
}

@system unittest
{// more bitpacking tests
    import std.conv : text;

    alias Bitty =
      MultiArray!(BitPacked!(size_t, 3)
                , BitPacked!(size_t, 4)
                , BitPacked!(size_t, 3)
                , BitPacked!(size_t, 6)
                , bool);
    alias fn1 = sliceBits!(13, 16);
    alias fn2 = sliceBits!( 9, 13);
    alias fn3 = sliceBits!( 6,  9);
    alias fn4 = sliceBits!( 0,  6);
    static void check(size_t lvl, MA)(ref MA arr){
        for (size_t i = 0; i< arr.length!lvl; i++)
            assert(arr.slice!(lvl)[i] == i, text("Mismatch on lvl ", lvl, " idx ", i, " value: ", arr.slice!(lvl)[i]));
    }

static void fillIdx(size_t lvl, MA)(ref MA arr){
        for (size_t i = 0; i< arr.length!lvl; i++)
            arr.slice!(lvl)[i] = i;
    }
    Bitty m1;

    // grow every level, last level first, starting from an empty MultiArray
    m1.length!4 = 10;
    m1.length!3 = 2^^6;
    m1.length!2 = 2^^3;
    m1.length!1 = 2^^4;
    m1.length!0 = 2^^3;

    m1.length!4 = 2^^16;

    for (size_t i = 0; i< m1.length!4; i++)
        m1.slice!(4)[i] = i % 2;

    // writes to one level must not disturb the others
    fillIdx!1(m1);
    check!1(m1);
    fillIdx!2(m1);
    check!2(m1);
    fillIdx!3(m1);
    check!3(m1);
    fillIdx!0(m1);
    check!0(m1);
    check!3(m1);
    check!2(m1);
    check!1(m1);
    for (size_t i=0; i < 2^^16; i++)
    {
        m1.slice!(4)[i] = i % 2;
        m1.slice!(0)[fn1(i)] = fn1(i);
        m1.slice!(1)[fn2(i)] = fn2(i);
        m1.slice!(2)[fn3(i)] = fn3(i);
        m1.slice!(3)[fn4(i)] = fn4(i);
    }
    for (size_t i=0; i < 2^^16; i++)
    {
        assert(m1.slice!(4)[i] == i % 2);
        assert(m1.slice!(0)[fn1(i)] == fn1(i));
        assert(m1.slice!(1)[fn2(i)] == fn2(i));
        assert(m1.slice!(2)[fn3(i)] == fn3(i));
        assert(m1.slice!(3)[fn4(i)] == fn4(i));
    }
}

// Number of size_t words needed to store `new_len` items of `_bits` bits each.
// The item width is first widened to `bits` (1 stays 1, otherwise
// nextPow2(_bits - 1)), the same widening PackedArrayView applies.
size_t spaceFor(size_t _bits)(size_t new_len) @safe pure nothrow @nogc
{
    import std.math.algebraic : nextPow2;
    enum bits = _bits == 1 ? 1 : nextPow2(_bits - 1);// see PackedArrayView
    static if (bits > 8*size_t.sizeof)
    {
        // items wider than a word take a whole number of words each
        static assert(bits % (size_t.sizeof*8) == 0);
        return new_len * bits/(8*size_t.sizeof);
    }
    else
    {
        // factor - number of items that fit into one word
        enum factor = size_t.sizeof*8/bits;
        return (new_len+factor-1)/factor; // rounded up
    }
}

// True for types that can be stored in packed form:
// BitPacked wrappers, integral types, bool and character types.
template isBitPackableType(T)
{
    enum isBitPackableType = isBitPacked!T
        || isIntegral!T || is(T == bool) || isSomeChar!T;
}

//============================================================================
// Selects the PackedArrayViewImpl instance for `T`, passing the item width
// as 1 for single-bit items and nextPow2(bits - 1) otherwise.
template PackedArrayView(T)
if ((is(T dummy == BitPacked!(U, sz), U, size_t sz)
    && isBitPackableType!U) || isBitPackableType!T)
{
    import std.math.algebraic : nextPow2;
    private enum bits = bitSizeOf!T;
    alias PackedArrayView = PackedArrayViewImpl!(T, bits > 1 ? nextPow2(bits - 1) : 1);
}

//unsafe and fast access to a chunk of RAM as if it contains packed values
template PackedPtr(T)
if ((is(T dummy == BitPacked!(U, sz), U, size_t sz)
    && isBitPackableType!U) || isBitPackableType!T)
{
    import std.math.algebraic : nextPow2;
    private enum bits = bitSizeOf!T;
    alias PackedPtr = PackedPtrImpl!(T, bits > 1 ?
nextPow2(bits - 1) : 1);
}

// Unchecked pointer-like accessor over size_t words holding items of `bits`
// bits each (`bits` is a power of two, so items never straddle a word).
struct PackedPtrImpl(T, size_t bits)
{
pure nothrow:
    static assert(isPow2OrZero(bits));

    this(inout(size_t)* ptr)inout @safe @nogc
    {
        origin = ptr;
    }

    // Generic read: locate the word, shift the item down and mask it out.
    private T simpleIndex(size_t n) inout
    {
        immutable q = n / factor;
        immutable r = n % factor;
        return cast(T)((origin[q] >> bits*r) & mask);
    }

    // Generic write: clear the item's bit field in its word, then OR in `val`.
    private void simpleWrite(TypeOfBitPacked!T val, size_t n)
    in
    {
        static if (isIntegral!T)
            assert(val <= mask);
    }
    do
    {
        immutable q = n / factor;
        immutable r = n % factor;
        immutable tgt_shift = bits*r;
        immutable word = origin[q];
        origin[q] = (word & ~(mask << tgt_shift))
            | (cast(size_t) val << tgt_shift);
    }

    // When an item is exactly a byte/short/int/word wide, access memory
    // through a pointer of that native type instead of shifting and masking.
    static if (factor == bytesPerWord// can safely pack by byte
         || factor == 1 // a whole word at a time
         || ((factor == bytesPerWord/2 || factor == bytesPerWord/4)
                && hasUnalignedReads)) // this needs unaligned reads
    {
        static if (factor == bytesPerWord)
            alias U = ubyte;
        else static if (factor == bytesPerWord/2)
            alias U = ushort;
        else static if (factor == bytesPerWord/4)
            alias U = uint;
        else static if (size_t.sizeof == 8 && factor == bytesPerWord/8)
            alias U = ulong;

        T opIndex(size_t idx) inout
        {
            T ret;
            // the reinterpreting cast is only used at run time on little-endian
            // targets; CTFE and other targets take the generic path
            version (LittleEndian)
                ret = __ctfe ? simpleIndex(idx) :
                    cast(inout(T))(cast(U*) origin)[idx];
            else
                ret = simpleIndex(idx);
            return ret;
        }

        static if (isBitPacked!T) // lack of user-defined implicit conversion
        {
            void opIndexAssign(T val, size_t idx)
            {
                return opIndexAssign(cast(TypeOfBitPacked!T) val, idx);
            }
        }

        void opIndexAssign(TypeOfBitPacked!T val, size_t idx)
        {
            version (LittleEndian)
            {
                if (__ctfe)
                    simpleWrite(val, idx);
                else
                    (cast(U*) origin)[idx] = cast(U) val;
            }
            else
                simpleWrite(val, idx);
        }
    }
    else
    {
        T opIndex(size_t n) inout
        {
            return simpleIndex(n);
        }

        static if (isBitPacked!T) // lack of user-defined implicit conversion
        {
            void opIndexAssign(T val, size_t idx)
            {
                return opIndexAssign(cast(TypeOfBitPacked!T) val, idx);
            }
        }

        void opIndexAssign(TypeOfBitPacked!T val, size_t n)
        {
            return simpleWrite(val, n);
        }
    }

private:
    // factor - number of elements in one machine word
    enum factor = size_t.sizeof*8/bits, mask = 2^^bits-1;
    enum bytesPerWord = size_t.sizeof;
    size_t* origin;
}

// data is packed only by power of two sized packs per word,
// thus avoiding mul/div overhead at the cost of ultimate packing
// this construct doesn't own memory, only provides access, see MultiArray for usage
struct PackedArrayViewImpl(T, size_t bits)
{
pure nothrow:

    // View of `items` packed values that starts `offset` items past `origin`.
    this(inout(size_t)* origin, size_t offset, size_t items) inout @safe
    {
        ptr = inout(PackedPtr!(T))(origin);
        ofs = offset;
        limit = items;
    }

    // True if every item in [s, e) of this view is zero.
    bool zeros(size_t s, size_t e)
    in
    {
        assert(s <= e);
    }
    do
    {
        s += ofs;
        e += ofs;
        immutable pad_s = roundUp(s);
        // The range ends before the next word boundary: test it item by item.
        // (Comparing `s` instead of `pad_s` here would let the head loop
        // below run up to `pad_s` and inspect items past `e`.)
        if (pad_s >= e)
        {
            foreach (i; s .. e)
                if (ptr[i])
                    return false;
            return true;
        }
        immutable pad_e = roundDown(e);
        size_t i;
        // unaligned head
        for (i=s; i<pad_s; i++)
            if (ptr[i])
                return false;
        // all in between is x*factor elements
        for (size_t j=i/factor; i<pad_e; i+=factor, j++)
            if (ptr.origin[j])
                return false;
        // unaligned tail
        for (; i<e; i++)
            if (ptr[i])
                return false;
        return true;
    }

    T opIndex(size_t idx) inout
    in
    {
        assert(idx < limit);
    }
    do
    {
        return ptr[ofs + idx];
    }

    static if (isBitPacked!T) // lack of user-defined implicit conversion
    {
        void opIndexAssign(T val, size_t idx)
        {
            return opIndexAssign(cast(TypeOfBitPacked!T) val, idx);
        }
    }

    void opIndexAssign(TypeOfBitPacked!T val, size_t idx)
    in
    {
        assert(idx < limit);
    }
    do
    {
        ptr[ofs + idx] = val;
    }

    static if (isBitPacked!T) // lack of user-defined implicit conversions
    {
        void opSliceAssign(T val, size_t start, size_t end)
        {
            opSliceAssign(cast(TypeOfBitPacked!T) val, start, end);
        }
    }

    // Sets every item in [start, end) to `val`; whole words in the middle
    // are written at once with the replicated bit pattern.
    void opSliceAssign(TypeOfBitPacked!T val, size_t start, size_t end)
    in
    {
        assert(start <= end);
        assert(end <= limit);
    }
    do
    {
        // account for the view's offset
        start += ofs;
        end += ofs;
        // rounded to factor granularity
        immutable pad_start = roundUp(start);// rounded up
        if (pad_start >= end) //rounded up >= than end of slice
        {
            //nothing to gain, use per element assignment
            foreach (i; start .. end)
                ptr[i] = val;
            return;
        }
        immutable pad_end = roundDown(end); // rounded down
        size_t i;
        for (i=start; i<pad_start; i++)
            ptr[i] = val;
        // all in between is x*factor elements
        if (pad_start != pad_end)
        {
            immutable repval = replicateBits!(factor, bits)(val);
            for (size_t j=i/factor; i<pad_end; i+=factor, j++)
                ptr.origin[j] = repval;// so speed it up by factor
        }
        for (; i<end; i++)
            ptr[i] = val;
    }

    // Sub-view over items [from, to) of this view.
    auto opSlice(size_t from, size_t to)inout
    in
    {
        assert(from <= to);
        // `limit` is this view's item count, so `to` is checked against it
        // directly; adding `ofs` would reject valid slices of an offset view.
        assert(to <= limit);
    }
    do
    {
        return typeof(this)(ptr.origin, ofs + from, to - from);
    }

    auto opSlice(){ return opSlice(0, length); }

    // Item-wise equality; compares whole words when both views are word-aligned.
    bool opEquals(T)(auto ref T arr) const
    {
        if (limit != arr.limit)
            return false;
        size_t s1 = ofs, s2 = arr.ofs;
        size_t e1 = s1 + limit, e2 = s2 + limit;
        if (s1 % factor == 0 && s2 % factor == 0 && length % factor == 0)
        {
            return ptr.origin[s1/factor .. e1/factor]
                == arr.ptr.origin[s2/factor .. e2/factor];
        }
        for (size_t i=0;i<limit; i++)
            if (this[i] != arr[i])
                return false;
        return true;
    }

    @property size_t length()const{ return limit; }

private:
    // round an item index up / down to a multiple of `factor`
    auto roundUp()(size_t val){ return (val+factor-1)/factor*factor; }
    auto roundDown()(size_t val){ return val/factor*factor; }
    // factor - number of elements in one machine word
    enum factor = size_t.sizeof*8/bits;
    PackedPtr!(T) ptr;
    size_t ofs, limit;
}


private struct SliceOverIndexed(T)
{
    enum assignableIndex = is(typeof((){ T.init[0] = Item.init; }));
    enum assignableSlice = is(typeof((){ T.init[0 ..
0] = Item.init; }));
    // Read item `idx` of the window; `idx` is relative to `from`.
    auto opIndex(size_t idx)const
    in
    {
        assert(idx < to - from);
    }
    do
    {
        return (*arr)[from+idx];
    }

    static if (assignableIndex)
    void opIndexAssign(Item val, size_t idx)
    in
    {
        assert(idx < to - from);
    }
    do
    {
       (*arr)[from+idx] = val;
    }

    // Sub-window [a, b), with both bounds relative to the current window.
    auto opSlice(size_t a, size_t b)
    {
        return typeof(this)(from+a, from+b, arr);
    }

    // static if (assignableSlice)
    void opSliceAssign(T)(T val, size_t start, size_t end)
    {
        (*arr)[start+from .. end+from] = val;
    }

    auto opSlice()
    {
        return typeof(this)(from, to, arr);
    }

    @property size_t length()const { return to-from;}

    alias opDollar = length;

    @property bool empty()const { return from == to; }

    @property auto front()const { return (*arr)[from]; }

    static if (assignableIndex)
    @property void front(Item val) { (*arr)[from] = val; }

    @property auto back()const { return (*arr)[to-1]; }

    static if (assignableIndex)
    @property void back(Item val) { (*arr)[to-1] = val; }

    @property auto save() inout { return this; }

    void popFront() {   from++; }

    void popBack() {    to--; }

    // Length check followed by item-by-item comparison.
    bool opEquals(T)(auto ref T arr) const
    {
        if (arr.length != length)
            return false;
        for (size_t i=0; i <length; i++)
            if (this[i] != arr[i])
                return false;
        return true;
    }
private:
    alias Item = typeof(T.init[0]);
    // window bounds [from, to) into the indexable object `*arr`
    size_t from, to;
    T* arr;
}

@safe pure nothrow @nogc unittest
{
    static assert(isRandomAccessRange!(SliceOverIndexed!(int[])));
}

// Window [a, b) over a const indexable object, referenced by pointer.
SliceOverIndexed!(const(T)) sliceOverIndexed(T)(size_t a, size_t b, const(T)* x)
if (is(Unqual!T == T))
{
    return SliceOverIndexed!(const(T))(a, b, x);
}

// BUG? inout is out of reach
//...SliceOverIndexed.arr only parameters or stack based variables can be inout
// Window [a, b) over a mutable indexable object, referenced by pointer.
SliceOverIndexed!T sliceOverIndexed(T)(size_t a, size_t b, T* x)
if (is(Unqual!T == T))
{
    return SliceOverIndexed!T(a, b, x);
}

@system unittest
{
    int[] idxArray = [2, 3, 5, 8, 13];
    auto sliced = sliceOverIndexed(0, idxArray.length, &idxArray);

    assert(!sliced.empty);
    assert(sliced.front == 2);
    sliced.front = 1;
    assert(sliced.front == 1);
    assert(sliced.back == 13);
    sliced.popFront();
    assert(sliced.front == 3);
    assert(sliced.back == 13);
    sliced.back = 11;
    assert(sliced.back == 11);
    sliced.popBack();

    assert(sliced.front == 3);
    assert(sliced[$-1] == 8);
    sliced = sliced[];
    assert(sliced[0] == 3);
    assert(sliced.back == 8);
    sliced = sliced[1..$];
    assert(sliced.front == 5);
    sliced = sliced[0..$-1];
    assert(sliced[$-1] == 5);

    int[] other = [2, 5];
    assert(sliced[] == sliceOverIndexed(1, 2, &other));
    sliceOverIndexed(0, 2, &idxArray)[0 .. 2] = -1;
    assert(idxArray[0 ..
2] == [-1, -1]);
    uint[] nullArr = null;
    auto nullSlice = sliceOverIndexed(0, 0, &idxArray);
    assert(nullSlice.empty);
}

// View of `items` packed values of type T starting at the word `ptr`.
private inout(PackedArrayView!T) packedArrayView(T)(inout(size_t)* ptr, size_t items)
{
    return inout(PackedArrayView!T)(ptr, 0, items);
}


//============================================================================
// Partially unrolled binary search using Shar's method
//============================================================================

// Generates the source of a switch that finishes a uniform binary search once
// the step `m` has dropped below `size`. The code is mixed into a scope that
// provides `m`, `idx`, `range`, `needle` and `pred`; each case handles one
// power-of-two step and falls through to the next smaller one.
string genUnrolledSwitchSearch(size_t size) @safe pure nothrow
{
    import core.bitop : bsr;
    import std.array : replace;
    import std.conv : to;
    assert(isPow2OrZero(size));
    // bsr is undefined for 0, so a zero step (single-element range) is
    // dispatched to `case 0` explicitly.
    string code = `
    import core.bitop : bsr;
    auto power = m ? bsr(m)+1 : 0;
    switch (power){`;
    size_t i = bsr(size);
    foreach_reverse (val; 0 .. bsr(size))
    {
        auto v = 2^^val;
        code ~= `
        case pow:
            if (pred(range[idx+m], needle))
                idx += m;
            goto case;
        `.replace("m", to!string(v))
        .replace("pow", to!string(i));
        i--;
    }
    code ~= `
        case 0:
            if (pred(range[idx], needle))
                idx += 1;
            goto default;
        `;
    code ~= `
        default:
    }`;
    return code;
}

// True for 0 and for every power of two.
bool isPow2OrZero(size_t sz) @safe pure nothrow @nogc
{
    // See also: std.math.isPowerOf2()
    return (sz & (sz-1)) == 0;
}

// Lower bound over a range whose length is a power of two (or zero):
// returns the number of leading items for which `pred(item, needle)` holds.
size_t uniformLowerBound(alias pred, Range, T)(Range range, T needle)
if (is(T : ElementType!Range))
{
    assert(isPow2OrZero(range.length));
    if (range.length == 0) // nothing to probe
        return 0;
    size_t idx = 0, m = range.length/2;
    while (m != 0)
    {
        if (pred(range[idx+m], needle))
            idx += m;
        m /= 2;
    }
    if (pred(range[idx], needle))
        idx += 1;
    return idx;
}

// Same contract as uniformLowerBound; steps below `max` are handled by the
// unrolled switch produced by genUnrolledSwitchSearch.
size_t switchUniformLowerBound(alias pred, Range, T)(Range range, T needle)
if (is(T : ElementType!Range))
{
    assert(isPow2OrZero(range.length));
    if (range.length == 0) // nothing to probe
        return 0;
    size_t idx = 0, m = range.length/2;
    enum max = 1 << 10;
    while (m >= max)
    {
        if (pred(range[idx+m], needle))
            idx += m;
        m /= 2;
    }
    mixin(genUnrolledSwitchSearch(max));
    return idx;
}

// Extends a power-of-two-only lower bound search to ranges of any length:
// searches either the leading 2^^n items or a 2^^k window covering the tail.
template sharMethod(alias uniLowerBound)
{
    size_t sharMethod(alias _pred="a<b", Range, T)(Range range, T needle)
    if (is(T : ElementType!Range))
    {
        import std.functional : binaryFun;
        import std.math.algebraic : nextPow2, truncPow2;
        alias pred = binaryFun!_pred;
        if (range.length == 0)
            return 0;
        if (isPow2OrZero(range.length))
            return uniLowerBound!pred(range, needle);
        size_t n = truncPow2(range.length);
        if (pred(range[n-1], needle))
        {// search in another 2^^k area that fully covers the tail of range
            size_t k = nextPow2(range.length - n + 1);
            return range.length - k + uniLowerBound!pred(range[$-k..$], needle);
        }
        else
            return uniLowerBound!pred(range[0 .. n], needle);
    }
}

alias sharLowerBound = sharMethod!uniformLowerBound;
alias sharSwitchLowerBound = sharMethod!switchUniformLowerBound;

@safe unittest
{
    import std.array : array;
    import std.range : assumeSorted, iota;

    auto stdLowerBound(T)(T[] range, T needle)
    {
        return assumeSorted(range).lowerBound(needle).length;
    }
    immutable MAX = 5*1173;
    auto arr = array(iota(5, MAX, 5));
    assert(arr.length == MAX/5-1);
    foreach (i; 0 .. MAX+5)
    {
        auto st = stdLowerBound(arr, i);
        assert(st == sharLowerBound(arr, i));
        assert(st == sharSwitchLowerBound(arr, i));
    }
    arr = [];
    auto st = stdLowerBound(arr, 33);
    assert(st == sharLowerBound(arr, 33));
    assert(st == sharSwitchLowerBound(arr, 33));
    // single-element ranges reach the unrolled switch with a zero step
    arr = [7];
    foreach (i; 5 .. 10)
    {
        immutable one = stdLowerBound(arr, i);
        assert(one == sharLowerBound(arr, i));
        assert(one == sharSwitchLowerBound(arr, i));
    }
}
//============================================================================

@safe
{
// hope to see similar stuff in public interface...
// once Allocators are out
//@@@BUG moveFront and friends? dunno, for now it's POD-only

// Replaces dest[from .. to] with the contents of `stuff`, growing or shrinking
// `dest` as needed (through `dest.length` when Policy is void, otherwise
// through Policy.realloc). Returns the index just past the inserted items.
@trusted size_t genericReplace(Policy=void, T, Range)
    (ref T dest, size_t from, size_t to, Range stuff)
{
    import std.algorithm.mutation : copy;
    size_t delta = to - from;
    size_t stuff_end = from+stuff.length;
    if (stuff.length > delta)
    {// replace increases length
        delta = stuff.length - delta;// now, new is > old by delta
        static if (is(Policy == void))
            dest.length = dest.length+delta;//@@@BUG lame @property
        else
            dest = Policy.realloc(dest, dest.length+delta);
        // shift the tail up (highest index first), then drop `stuff` in
        copyBackwards(dest[to .. dest.length-delta],
            dest[to+delta .. dest.length]);
        copyForward(stuff, dest[from .. stuff_end]);
    }
    else if (stuff.length == delta)
    {
        // same size: overwrite in place
        copy(stuff, dest[from .. to]);
    }
    else
    {// replace decreases length by delta
        delta = delta - stuff.length;
        copy(stuff, dest[from .. stuff_end]);
        // shift the tail down (lowest index first), then trim the storage
        copyForward(dest[to .. dest.length],
            dest[stuff_end .. dest.length-delta]);
        static if (is(Policy == void))
            dest.length = dest.length - delta;//@@@BUG lame @property
        else
            dest = Policy.realloc(dest, dest.length-delta);
    }
    return stuff_end;
}


// Simple storage manipulation policy
@safe private struct GcPolicy
{
    import std.traits : isDynamicArray;

    static T[] dup(T)(const T[] arr)
    {
        return arr.dup;
    }

    static T[] alloc(T)(size_t size)
    {
        return new T[size];
    }

    static T[] realloc(T)(T[] arr, size_t sz)
    {
        arr.length = sz;
        return arr;
    }

    static void replaceImpl(T, Range)(ref T[] dest, size_t from, size_t to, Range stuff)
    {
        replaceInPlace(dest, from, to, stuff);
    }

    // append a single value, converted to T with a range check
    static void append(T, V)(ref T[] arr, V value)
    if (!isInputRange!V)
    {
        arr ~= force!T(value);
    }

    // append all items of an input range
    static void append(T, V)(ref T[] arr, V value)
    if (isInputRange!V)
    {
        insertInPlace(arr, arr.length, value);
    }

    // Mutable arrays: in debug builds overwrite the contents with a marker
    // value before dropping the reference.
    static void destroy(T)(ref T arr) pure // pure required for -dip25, inferred for -dip1000
    if (isDynamicArray!T && is(Unqual!T == T))
    {
        debug
        {
            assert(accessIsSafe);
            arr[] = cast(typeof(T.init[0]))(0xdead_beef);
        }
        arr = null;
    }

    // Qualified (non-mutable) arrays: only drop the reference.
    static void destroy(T)(ref T arr) pure // pure required for -dip25, inferred for -dip1000
    if (isDynamicArray!T && !is(Unqual!T == T))
    {
        arr = null;
    }

    // This is unfortunately necessary to "fake pure". It will only ever be called
    // in the destructor for a GC-allocated CowArray, which is the only place where
    // this might return false. Current code expects this to be pure, so we can't
    // break that. But before this change, the code would access the referenced
    // array inside a GC finalizer, which is invalid.
    // declared under the mangled name `gc_inFinalizer`, but with a pure signature
    pragma(mangle, "gc_inFinalizer") private static extern(C) bool pureInGCFinalizer() @safe pure nothrow;

    // True during CTFE or whenever pureInGCFinalizer() reports false.
    static @property bool accessIsSafe() @safe nothrow pure
    {
        return __ctfe || !pureInGCFinalizer;
    }
}

// ditto
@safe struct ReallocPolicy
{
    import std.range.primitives : hasLength;

    // Fresh allocation holding a copy of `arr`.
    static T[] dup(T)(const T[] arr)
    {
        auto result = alloc!T(arr.length);
        result[] = arr[];
        return result;
    }

    // Allocates room for `size` items; halts if the byte count overflows.
    static T[] alloc(T)(size_t size) @trusted
    {
        import std.internal.memory : enforceMalloc;

        import core.checkedint : mulu;
        bool overflow;
        size_t nbytes = mulu(size, T.sizeof, overflow);
        if (overflow) assert(0);

        auto ptr = cast(T*) enforceMalloc(nbytes);
        return ptr[0 .. size];
    }

    // Resizes `arr` to `size` items; a size of zero frees it and returns null.
    static T[] realloc(T)(return scope T[] arr, size_t size) @trusted
    {
        import std.internal.memory : enforceRealloc;
        if (!size)
        {
            destroy(arr);
            return null;
        }

        import core.checkedint : mulu;
        bool overflow;
        size_t nbytes = mulu(size, T.sizeof, overflow);
        if (overflow) assert(0);

        auto ptr = cast(T*) enforceRealloc(arr.ptr, nbytes);
        return ptr[0 .. size];
    }

    static void replaceImpl(T, Range)(ref T[] dest, size_t from, size_t to, Range stuff)
    {
        genericReplace!(ReallocPolicy)(dest, from, to, stuff);
    }

    // append a single value, converted to T with a range check
    static void append(T, V)(ref T[] arr, V value)
    if (!isInputRange!V)
    {
        if (arr.length == size_t.max) assert(0);
        arr = realloc(arr, arr.length+1);
        arr[$-1] = force!T(value);
    }

    pure @safe unittest
    {
        int[] arr;
        ReallocPolicy.append(arr, 3);

        import std.algorithm.comparison : equal;
        assert(equal(arr, [3]));
    }

    // append all items of an input range with a known length
    static void append(T, V)(ref T[] arr, V value)
    if (isInputRange!V && hasLength!V)
    {
        import core.checkedint : addu;
        bool overflow;
        size_t nelems = addu(arr.length, value.length, overflow);
        if (overflow) assert(0);

        arr = realloc(arr, nelems);

        import std.algorithm.mutation : copy;
        copy(value, arr[$-value.length..$]);
    }

    pure @safe unittest
    {
        int[] arr;
        ReallocPolicy.append(arr, [1,2,3]);

        import std.algorithm.comparison : equal;
        assert(equal(arr, [1,2,3]));
    }

    // Frees the memory (if any) and nulls the slice.
    static void destroy(T)(scope ref T[] arr) @trusted
    {
        import core.memory : pureFree;
        if (arr.ptr)
            pureFree(arr.ptr);
        arr = null;
    }

    enum accessIsSafe = true;
}

//build hack
alias _RealArray = CowArray!ReallocPolicy;

pure @safe unittest
{
    import std.algorithm.comparison : equal;

    with(ReallocPolicy)
    {
        // replaces orig[from .. to] with toReplace, compares against `result`
        // and frees `orig` on the way out
        bool test(T, U, V)(T orig, size_t from, size_t to, U toReplace, V result,
                   string file = __FILE__, size_t line = __LINE__)
        {
            {
                replaceImpl(orig, from, to, toReplace);
                scope(exit) destroy(orig);
                if (!equal(orig, result))
                    return false;
            }
            return true;
        }
        static T[] arr(T)(T[] args...
) 1932 { 1933 return dup(args); 1934 } 1935 1936 assert(test(arr([1, 2, 3, 4]), 0, 0, [5, 6, 7], [5, 6, 7, 1, 2, 3, 4])); 1937 assert(test(arr([1, 2, 3, 4]), 0, 2, cast(int[])[], [3, 4])); 1938 assert(test(arr([1, 2, 3, 4]), 0, 4, [5, 6, 7], [5, 6, 7])); 1939 assert(test(arr([1, 2, 3, 4]), 0, 2, [5, 6, 7], [5, 6, 7, 3, 4])); 1940 assert(test(arr([1, 2, 3, 4]), 2, 3, [5, 6, 7], [1, 2, 5, 6, 7, 4])); 1941 } 1942 } 1943 1944 /** 1945 Tests if T is some kind a set of code points. Intended for template constraints. 1946 */ 1947 public template isCodepointSet(T) 1948 { 1949 static if (is(T dummy == InversionList!(Args), Args...)) 1950 enum isCodepointSet = true; 1951 else 1952 enum isCodepointSet = false; 1953 } 1954 1955 /** 1956 Tests if `T` is a pair of integers that implicitly convert to `V`. 1957 The following code must compile for any pair `T`: 1958 --- 1959 (T x){ V a = x[0]; V b = x[1];} 1960 --- 1961 The following must not compile: 1962 --- 1963 (T x){ V c = x[2];} 1964 --- 1965 */ 1966 public template isIntegralPair(T, V=uint) 1967 { 1968 enum isIntegralPair = is(typeof((T x){ V a = x[0]; V b = x[1];})) 1969 && !is(typeof((T x){ V c = x[2]; })); 1970 } 1971 1972 1973 /** 1974 The recommended default type for set of $(CODEPOINTS). 1975 For details, see the current implementation: $(LREF InversionList). 1976 */ 1977 public alias CodepointSet = InversionList!GcPolicy; 1978 1979 1980 //@@@BUG: std.typecons tuples depend on std.format to produce fields mixin 1981 // which relies on std.uni.isGraphical and this chain blows up with Forward reference error 1982 // hence below doesn't seem to work 1983 // public alias CodepointInterval = Tuple!(uint, "a", uint, "b"); 1984 1985 /** 1986 The recommended type of $(REF Tuple, std,_typecons) 1987 to represent [a, b$(RPAREN) intervals of $(CODEPOINTS). As used in $(LREF InversionList). 1988 Any interval type should pass $(LREF isIntegralPair) trait. 
*/
public struct CodepointInterval
{
pure:
    // Storage for the two bounds: index 0 is `a`, index 1 is `b`.
    uint[2] _tuple;
    // Lets the interval be indexed like the underlying array (x[0], x[1]).
    alias _tuple this;

@safe pure nothrow @nogc:

    // Builds the interval from its two bounds.
    this(uint low, uint high)
    {
        _tuple[0] = low;
        _tuple[1] = high;
    }
    // Compares element-wise against anything indexable with [0] and [1].
    bool opEquals(T)(T val) const
    {
        return this[0] == val[0] && this[1] == val[1];
    }
    // Named access to the first and second bound.
    @property ref inout(uint) a() return inout { return _tuple[0]; }
    @property ref inout(uint) b() return inout { return _tuple[1]; }
}

/**
    $(P
    `InversionList` is a set of $(CODEPOINTS)
    represented as an array of open-right [a, b$(RPAREN)
    intervals (see $(LREF CodepointInterval) above).
    The name comes from the way the representation reads left to right.
    For instance a set of all values [10, 50$(RPAREN), [80, 90$(RPAREN),
    plus a singular value 60 looks like this:
    )
    ---
    10, 50, 60, 61, 80, 90
    ---
    $(P
    The way to read this is: start with negative meaning that all numbers
    smaller than the next one are not present in this set (and positive -
    the contrary). Then switch positive/negative after each
    number passed from left to right.
    )
    $(P This way negative spans until 10, then positive until 50,
    then negative until 60, then positive until 61, and so on.
    As seen this provides a space-efficient storage of highly redundant data
    that comes in long runs. A description which Unicode $(CHARACTER)
    properties fit nicely. The technique itself could be seen as a variation
    on $(LINK2 https://en.wikipedia.org/wiki/Run-length_encoding, RLE encoding).
    )

    $(P Sets are value types (just like `int` is) thus they
        are never aliased.
)
        Example:
        ---
        auto a = CodepointSet('a', 'z'+1);
        auto b = CodepointSet('A', 'Z'+1);
        auto c = a;
        a = a | b;
        assert(a == CodepointSet('A', 'Z'+1, 'a', 'z'+1));
        assert(a != c);
        ---
    $(P See also $(LREF unicode) for simpler construction of sets
        from predefined ones.
    )

    $(P Memory usage is 8 bytes per each contiguous interval in a set.
    The value semantics are achieved by using the
    $(HTTP en.wikipedia.org/wiki/Copy-on-write, COW) technique
    and thus it's $(RED not) safe to cast this type to $(D_KEYWORD shared).
    )

    Note:
    $(P It's not recommended to rely on the template parameters
    or the exact type of a current $(CODEPOINT) set in `std.uni`.
    The type and parameters may change when the standard
    allocators design is finalized.
    Use $(LREF isCodepointSet) with templates or just stick with the default
    alias $(LREF CodepointSet) throughout the whole code base.
    )
*/
public struct InversionList(SP=GcPolicy)
{
    import std.range : assumeSorted;

    /**
        Construct from another code point set of any type.
    */
    this(Set)(Set set) pure
    if (isCodepointSet!Set)
    {
        // The buffer is handed to CowArray!(SP).reuse, which grows it with
        // SP.append; it therefore has to be allocated by SP as well
        // (a GC-allocated array must not be passed to a realloc-based policy).
        uint[] arr;
        foreach (v; set.byInterval)
        {
            SP.append(arr, v.a);
            SP.append(arr, v.b);
        }
        data = CowArray!(SP).reuse(arr);
    }

    /**
        Construct a set from a forward range of code point intervals.
    */
    this(Range)(Range intervals) pure
    if (isForwardRange!Range && isIntegralPair!(ElementType!Range))
    {
        uint[] arr;
        foreach (v; intervals)
        {
            SP.append(arr, v.a);
            SP.append(arr, v.b);
        }
        data = CowArray!(SP).reuse(arr);
        sanitize(); //enforce invariant: sort intervals etc.
    }

    //helper function that avoids sanity check to be CTFE-friendly
    // Interleaves the lower and upper bounds of `intervals` into the flat
    // array representation; the input is stored as given (no sanitize call).
    private static fromIntervals(Range)(Range intervals) pure
    {
        import std.algorithm.iteration : map;
        import std.range : roundRobin;
        auto flattened = roundRobin(intervals.save.map!"a[0]"(),
            intervals.save.map!"a[1]"());
        InversionList set;
        set.data = CowArray!(SP)(flattened);
        return set;
    }
    //ditto until sort is CTFE-able
    // Takes the bounds as a flat list a0, b0, a1, b1, ...; the contract checks
    // that the count is even and that every pair satisfies a < b.
    private static fromIntervals()(uint[] intervals...) pure
    in
    {
        import std.conv : text;
        assert(intervals.length % 2 == 0, "Odd number of interval bounds [a, b)!");
        for (uint i = 0; i < intervals.length; i += 2)
        {
            auto a = intervals[i], b = intervals[i+1];
            assert(a < b, text("illegal interval [a, b): ", a, " > ", b));
        }
    }
    do
    {
        InversionList set;
        set.data = CowArray!(SP)(intervals);
        return set;
    }

    /**
        Construct a set from plain values of code point intervals.
    */
    this()(uint[] intervals...)
    in
    {
        import std.conv : text;
        assert(intervals.length % 2 == 0, "Odd number of interval bounds [a, b)!");
        for (uint i = 0; i < intervals.length; i += 2)
        {
            auto a = intervals[i], b = intervals[i+1];
            assert(a < b, text("illegal interval [a, b): ", a, " > ", b));
        }
    }
    do
    {
        data = CowArray!(SP)(intervals);
        sanitize(); //enforce invariant: sort intervals etc.
    }

    ///
    pure @safe unittest
    {
        import std.algorithm.comparison : equal;

        auto set = CodepointSet('a', 'z'+1, 'а', 'я'+1);
        foreach (v; 'a'..'z'+1)
            assert(set[v]);
        // Cyrillic lowercase interval
        foreach (v; 'а'..'я'+1)
            assert(set[v]);
        //specific order is not required, intervals may intersect
        auto set2 = CodepointSet('а', 'я'+1, 'a', 'd', 'b', 'z'+1);
        //the same end result
        assert(set2.byInterval.equal(set.byInterval));
        // test constructor this(Range)(Range intervals)
        auto chessPiecesWhite = CodepointInterval(9812, 9818);
        auto chessPiecesBlack = CodepointInterval(9818, 9824);
        auto set3 = CodepointSet([chessPiecesWhite, chessPiecesBlack]);
        foreach (v; '♔'..'♟'+1)
            assert(set3[v]);
    }

    /**
        Get range that spans all of the $(CODEPOINT) intervals in this $(LREF InversionList).
    */
    @property auto byInterval() scope
    {
        // TODO: change this to data[] once the -dip1000 errors have been fixed
        // see e.g. https://github.com/dlang/phobos/pull/6638
        import std.array : array;
        return Intervals!(typeof(data.array))(data.array);
    }

    @safe unittest
    {
        import std.algorithm.comparison : equal;
        import std.typecons : tuple;

        auto set = CodepointSet('A', 'D'+1, 'a', 'd'+1);

        assert(set.byInterval.equal([tuple('A','E'), tuple('a','e')]));
    }

    // Materializes the intervals of this set as an array of CodepointInterval.
    package(std) @property const(CodepointInterval)[] intervals() const
    {
        import std.array : array;
        return Intervals!(typeof(data[]))(data[]).array;
    }

    /**
        Tests the presence of code point `val` in this set.
*/
    bool opIndex(uint val) const
    {
        // the <= ensures that searching in interval of [a, b) for 'a' you get .length == 1
        // return assumeSorted!((a,b) => a <= b)(data[]).lowerBound(val).length & 1;
        // The low bit of the lower-bound position is the membership flag:
        // the result is true exactly when that position is odd.
        return sharSwitchLowerBound!"a <= b"(data[], val) & 1;
    }

    ///
    pure @safe unittest
    {
        auto gothic = unicode.Gothic;
        // Gothic letter ahsa
        assert(gothic['\U00010330']);
        // no ascii in Gothic obviously
        assert(!gothic['$']);
    }


    // Linear scan for `ch`. Useful only for small sets.
    // TODO:
    // used internally in std.regex
    // should be properly exposed in a public API ?
    // Returns the parity (index & 1) of the first stored bound that is
    // greater than `ch`, or 0 when no bound is greater.
    package(std) auto scanFor()(dchar ch) const
    {
        immutable len = data.length;
        for (size_t i = 0; i < len; i++)
            if (ch < data[i])
                return i & 1;
        return 0;
    }

    /// Number of $(CODEPOINTS) in this set
    @property size_t length()
    {
        // sum of the widths (b - a) of all intervals
        size_t sum = 0;
        foreach (iv; byInterval)
        {
            sum += iv.b - iv.a;
        }
        return sum;
    }

// bootstrap full set operations from 4 primitives (suitable as a template mixin):
// addInterval, skipUpTo, dropUpTo & byInterval iteration
//============================================================================
public:
    /**
        $(P Sets support natural syntax for set algebra, namely: )
        $(BOOKTABLE ,
            $(TR $(TH Operator) $(TH Math notation) $(TH Description) )
            $(TR $(TD &) $(TD a ∩ b) $(TD intersection) )
            $(TR $(TD |) $(TD a ∪ b) $(TD union) )
            $(TR $(TD -) $(TD a ∖ b) $(TD subtraction) )
            $(TR $(TD ~) $(TD a ~ b) $(TD symmetric set difference i.e.
(a ∪ b) \ (a ∩ b)) )
        )
    */
    This opBinary(string op, U)(U rhs)
    if (isCodepointSet!U || is(U:dchar))
    {
        // Every branch works on a copy (or on the by-value `rhs`) and applies
        // the matching op= operator, so `this` is left unmodified.
        static if (op == "&" || op == "|" || op == "~")
        {// symmetric ops thus can swap arguments to reuse r-value
            static if (is(U:dchar))
            {
                auto tmp = this;
                mixin("tmp "~op~"= rhs; ");
                return tmp;
            }
            else
            {
                static if (is(Unqual!U == U))
                {
                    // try hard to reuse r-value
                    mixin("rhs "~op~"= this;");
                    return rhs;
                }
                else
                {
                    auto tmp = this;
                    mixin("tmp "~op~"= rhs;");
                    return tmp;
                }
            }
        }
        else static if (op == "-") // anti-symmetric
        {
            auto tmp = this;
            tmp -= rhs;
            return tmp;
        }
        else
            static assert(0, "no operator "~op~" defined for Set");
    }

    ///
    pure @safe unittest
    {
        import std.algorithm.comparison : equal;
        import std.range : iota;

        auto lower = unicode.LowerCase;
        auto upper = unicode.UpperCase;
        auto ascii = unicode.ASCII;

        assert((lower & upper).empty); // no intersection
        auto lowerASCII = lower & ascii;
        assert(lowerASCII.byCodepoint.equal(iota('a', 'z'+1)));
        // throw away all of the lowercase ASCII
        assert((ascii - lower).length == 128 - 26);

        auto onlyOneOf = lower ~ ascii;
        assert(!onlyOneOf['Δ']); // not ASCII and not lowercase
        assert(onlyOneOf['$']); // ASCII and not lowercase
        assert(!onlyOneOf['a']); // ASCII and lowercase
        assert(onlyOneOf['я']); // not ASCII but lowercase

        // throw away all cased letters from ASCII
        auto noLetters = ascii - (lower | upper);
        assert(noLetters.length == 128 - 26*2);
    }

    /// The 'op=' versions of the above overloaded operators.
ref This opOpAssign(string op, U)(U rhs)
    if (isCodepointSet!U || is(U:dchar))
    {
        // Dispatches at compile time on `op`; every branch modifies this set
        // in place and returns it by reference.
        static if (op == "|")    // union
        {
            static if (is(U:dchar))
            {
                // a single code point is the interval [rhs, rhs+1)
                this.addInterval(rhs, rhs+1);
                return this;
            }
            else
                return this.add(rhs);
        }
        else static if (op == "&")   // intersection
                return this.intersect(rhs);// overloaded
        else static if (op == "-")   // set difference
                return this.sub(rhs);// overloaded
        else static if (op == "~")   // symmetric set difference
        {
            // (this | rhs) - (this & rhs)
            auto copy = this & rhs;
            this |= rhs;
            this -= copy;
            return this;
        }
        else
            static assert(0, "no operator "~op~" defined for Set");
    }

    /**
        Tests the presence of codepoint `ch` in this set,
        the same as $(LREF opIndex).
    */
    bool opBinaryRight(string op: "in", U)(U ch) const
    if (is(U : dchar))
    {
        return this[ch];
    }

    ///
    pure @safe unittest
    {
        assert('я' in unicode.Cyrillic);
        assert(!('z' in unicode.Cyrillic));
    }



    /**
     * Obtains a set that is the inversion of this set.
     *
     * See_Also: $(LREF inverted)
     */
    auto opUnary(string op: "!")()
    {
        return this.inverted;
    }

    /**
        A range that spans each $(CODEPOINT) in this set.
*/
    @property auto byCodepoint()
    {
        // Input range that walks the intervals of the set and yields every
        // code point of each interval in turn.
        static struct CodepointRange
        {
            this(This set)
            {
                r = set.byInterval;
                if (!r.empty)
                    cur = r.front.a;
            }

            @property dchar front() const
            {
                return cast(dchar) cur;
            }

            @property bool empty() const
            {
                return r.empty;
            }

            void popFront()
            {
                cur++;
                // past the end of the current interval: move on to the next
                // one (if any) and restart from its lower bound
                while (cur >= r.front.b)
                {
                    r.popFront();
                    if (r.empty)
                        break;
                    cur = r.front.a;
                }
            }
        private:
            uint cur;
            @(imported!"core.attribute".mutableRefInit) typeof(This.init.byInterval) r;
        }

        return CodepointRange(this);
    }

    ///
    pure @safe unittest
    {
        import std.algorithm.comparison : equal;
        import std.range : iota;

        auto set = unicode.ASCII;
        // the comparison result must be asserted, otherwise nothing is tested
        assert(set.byCodepoint.equal(iota(0, 0x80)));
    }

    /**
        $(P Obtain textual representation of this set in form of
        open-right intervals and feed it to `sink`.
        )
        $(P Used by various standard formatting facilities such as
         $(REF formattedWrite, std,format), $(REF write, std,stdio),
         $(REF writef, std,stdio), $(REF to, std,conv) and others.
        )
        Example:
        ---
        import std.conv;
        assert(unicode.ASCII.to!string == "[0..128$(RPAREN)");
        ---
    */

    private import std.format.spec : FormatSpec;

    /***************************************
     * Obtain a textual representation of this InversionList
     * in form of open-right intervals.
     *
     * The formatting flag is applied individually to each value, for example:
     * $(LI $(B %s) and $(B %d) format the intervals as a [low .. high$(RPAREN) range of integrals)
     * $(LI $(B %x) formats the intervals as a [low .. high$(RPAREN) range of lowercase hex characters)
     * $(LI $(B %X) formats the intervals as a [low ..
high$(RPAREN) range of uppercase hex characters)
     */
    void toString(Writer)(scope Writer sink, scope const ref FormatSpec!char fmt) /* const */
    {
        import std.format.write : formatValue;
        auto range = byInterval;
        // an empty set produces no output at all
        if (range.empty)
            return;

        // emits "[a..b)" per interval, separated by a single space
        while (1)
        {
            auto i = range.front;
            range.popFront();

            put(sink, "[");
            formatValue(sink, i.a, fmt);
            put(sink, "..");
            formatValue(sink, i.b, fmt);
            put(sink, ")");
            if (range.empty) return;
            put(sink, " ");
        }
    }

    ///
    pure @safe unittest
    {
        import std.conv : to;
        import std.format : format;
        import std.uni : unicode;

        // This was originally using Cyrillic script.
        // Unfortunately this is a pretty active range for changes,
        // and hence broke in an update.
        // Therefore the range Basic latin was used instead as it
        // unlikely to ever change.

        assert(unicode.InBasic_latin.to!string == "[0..128)");

        // The specs '%s' and '%d' are equivalent to the to!string call above.
        assert(format("%d", unicode.InBasic_latin) == unicode.InBasic_latin.to!string);

        assert(format("%#x", unicode.InBasic_latin) == "[0..0x80)");
        assert(format("%#X", unicode.InBasic_latin) == "[0..0X80)");
    }

    pure @safe unittest
    {
        import std.exception : assertThrown;
        import std.format : format, FormatException;
        assertThrown!FormatException(format("%z", unicode.ASCII));
    }


    /**
        Add an interval [a, b$(RPAREN) to this set.
*/
    ref add()(uint a, uint b)
    {
        addInterval(a, b);
        return this;
    }

    ///
    pure @safe unittest
    {
        CodepointSet someSet;
        someSet.add('0', '5').add('A','Z'+1);
        someSet.add('5', '9'+1);
        assert(someSet['0']);
        assert(someSet['5']);
        assert(someSet['9']);
        assert(someSet['Z']);
    }

private:

  package(std)  // used from: std.regex.internal.parser
    // In-place intersection with another set: for every interval of `rhs`
    // drop what lies before it and skip over what lies inside it, then drop
    // whatever remains after the last interval.
    ref intersect(U)(U rhs)
    if (isCodepointSet!U)
    {
        Marker mark;
        foreach ( i; rhs.byInterval)
        {
            mark = this.dropUpTo(i.a, mark);
            mark = this.skipUpTo(i.b, mark);
        }
        this.dropUpTo(uint.max, mark);
        return this;
    }

    // In-place intersection with a single code point: the result is either
    // the one-element set [ch, ch+1) or the empty set.
    ref intersect()(dchar ch)
    {
        foreach (i; byInterval)
            if (i.a <= ch && ch < i.b)
                return this = This.init.add(ch, ch+1);
        this = This.init;
        return this;
    }

    pure @safe unittest
    {
        assert(unicode.Cyrillic.intersect('-').byInterval.empty);
    }

    // In-place removal of a single code point; forwards to subChar.
    ref sub()(dchar ch)
    {
        return subChar(ch);
    }

    // same as the above except that skip & drop parts are swapped
  package(std)  // used from: std.regex.internal.parser
    ref sub(U)(U rhs)
    if (isCodepointSet!U)
    {
        Marker mark;
        foreach (i; rhs.byInterval)
        {
            mark = this.skipUpTo(i.a, mark);
            mark = this.dropUpTo(i.b, mark);
        }
        return this;
    }

  package(std)  // used from: std.regex.internal.parser
    // In-place union with another set: adds every interval of `rhs`, passing
    // the marker returned by the previous addInterval call as the next hint.
    ref add(U)(U rhs)
    if (isCodepointSet!U)
    {
        Marker start;
        foreach (i; rhs.byInterval)
        {
            start = addInterval(i.a, i.b, start);
        }
        return this;
    }

// end of mixin-able part
//============================================================================
public:
    /**
        Obtains a set that is the inversion of this set.

        See the '!' $(LREF opUnary) for the same but using operators.
*/
    @property auto inverted()
    {
        InversionList inversion = this;
        // the inverse of the empty set is the whole range [0, lastDchar+1)
        if (inversion.data.length == 0)
        {
            inversion.addInterval(0, lastDchar+1);
            return inversion;
        }
        // toggle the boundary at 0: insert it if absent, remove it if present
        if (inversion.data[0] != 0)
            genericReplace(inversion.data, 0, 0, [0]);
        else
            genericReplace(inversion.data, 0, 1, cast(uint[]) null);
        // likewise toggle the boundary at lastDchar+1 at the other end
        if (data[data.length-1] != lastDchar+1)
            genericReplace(inversion.data,
                inversion.data.length, inversion.data.length, [lastDchar+1]);
        else
            genericReplace(inversion.data,
                inversion.data.length-1, inversion.data.length, cast(uint[]) null);

        return inversion;
    }

    ///
    pure @safe unittest
    {
        auto set = unicode.ASCII;
        // union with the inverse gets all of the code points in the Unicode
        assert((set | set.inverted).length == 0x110000);
        // no intersection with the inverse
        assert((set & set.inverted).empty);
    }

    // Generates the source text of a membership-test function for the given
    // intervals; see the documented overload below for the intended usage.
    package(std) static string toSourceCode(const(CodepointInterval)[] range, string funcName)
    {
        import std.algorithm.searching : countUntil;
        import std.format : format;
        // below this many intervals a linear chain of comparisons is emitted
        enum maxBinary = 3;
        // Emits a brace-enclosed sequence of `if` tests, one group per interval.
        static string linearScope(R)(R ivals, string indent)
        {
            string result = indent~"{\n";
            string deeper = indent~" ";
            foreach (ival; ivals)
            {
                immutable span = ival[1] - ival[0];
                assert(span != 0);
                if (span == 1)
                {
                    result ~= format("%sif (ch == %s) return true;\n", deeper, ival[0]);
                }
                else if (span == 2)
                {
                    result ~= format("%sif (ch == %s || ch == %s) return true;\n",
                        deeper, ival[0], ival[0]+1);
                }
                else
                {
                    if (ival[0] != 0) // dchar is unsigned and < 0 is useless
                        result ~= format("%sif (ch < %s) return false;\n", deeper, ival[0]);
                    result ~= format("%sif (ch < %s) return true;\n", deeper, ival[1]);
                }
            }
            result ~= format("%sreturn false;\n%s}\n", deeper, indent); // including empty range of intervals
            return result;
        }

        // Chooses between the linear form and another bisection step.
        static string binaryScope(R)(R ivals, string indent) @safe
        {
            // time to do unrolled comparisons?
            if (ivals.length < maxBinary)
                return linearScope(ivals, indent);
            else
                return bisect(ivals, ivals.length/2, indent);
        }

        // not used yet: if/else binary search is far better with DMD as of 2.061
        // and GDC is doing fine job either way
        static string switchScope(R)(R ivals, string indent)
        {
            string result = indent~"switch (ch){\n";
            string deeper = indent~" ";
            foreach (ival; ivals)
            {
                if (ival[0]+1 == ival[1])
                {
                    result ~= format("%scase %s: return true;\n",
                        deeper, ival[0]);
                }
                else
                {
                    result ~= format("%scase %s: .. case %s: return true;\n",
                         deeper, ival[0], ival[1]-1);
                }
            }
            result ~= deeper~"default: return false;\n"~indent~"}\n";
            return result;
        }

        // Emits an if / else if / else that splits on the interval at `idx`
        // and recurses (via binaryScope) into the intervals on either side.
        static string bisect(R)(R range, size_t idx, string indent)
        {
            string deeper = indent ~ " ";
            // bisect on one [a, b) interval at idx
            string result = indent~"{\n";
            // less branch, < a
            result ~= format("%sif (ch < %s)\n%s",
                deeper, range[idx][0], binaryScope(range[0 .. idx], deeper));
            // middle point, >= a && < b
            result ~= format("%selse if (ch < %s) return true;\n",
                deeper, range[idx][1]);
            // greater or equal branch, >= b
            result ~= format("%selse\n%s",
                deeper, binaryScope(range[idx+1..$], deeper));
            return result~indent~"}\n";
        }

        string code = format("bool %s(dchar ch) @safe pure nothrow @nogc\n",
            funcName.empty ? "function" : funcName);
        // special case first bisection to be on ASCII vs beyond
        auto tillAscii = countUntil!"a[0] > 0x80"(range);
        if (tillAscii <= 0) // everything is ASCII or nothing is ascii (-1 & 0)
            code ~= binaryScope(range, "");
        else
            code ~= bisect(range, tillAscii, "");
        return code;
    }

    /**
        Generates string with D source code of unary function with name of
        `funcName` taking a single `dchar` argument. If `funcName` is empty
        the code is adjusted to be a lambda function.

        The function generated tests if the $(CODEPOINT) passed
        belongs to this set or not. The result is to be used with string mixin.
        The intended usage area is aggressive optimization via meta programming
        in parser generators and the like.

        Note: Use with care for relatively small or regular sets. It
        could end up being slower than just using multi-staged tables.

        Example:
        ---
        import std.stdio;

        // construct set directly from [a, b$(RPAREN) intervals
        auto set = CodepointSet(10, 12, 45, 65, 100, 200);
        writeln(set);
        writeln(set.toSourceCode("func"));
        ---

        The above outputs something along the lines of:
        ---
        bool func(dchar ch) @safe pure nothrow @nogc
        {
            if (ch < 45)
            {
                if (ch == 10 || ch == 11) return true;
                return false;
            }
            else if (ch < 65) return true;
            else
            {
                if (ch < 100) return false;
                if (ch < 200) return true;
                return false;
            }
        }
        ---
    */
    string toSourceCode(string funcName="")
    {
        import std.array : array;
        auto range = byInterval.array();
        return toSourceCode(range, funcName);
    }

    /**
        True if this set doesn't contain any $(CODEPOINTS).
*/
    @property bool empty() const
    {
        return data.length == 0;
    }

    ///
    pure @safe unittest
    {
        CodepointSet emptySet;
        assert(emptySet.length == 0);
        assert(emptySet.empty);
    }

private:
    alias This = typeof(this);
    alias Marker = size_t;

    // a random-access range of integral pairs
    // `slice` is the flat array of bounds; `start` and `end` are indices into
    // it delimiting the pairs still covered by this range.
    static struct Intervals(Range)
    {
        import std.range.primitives : hasAssignableElements;

        // Covers every pair of `sp`.
        this(Range sp) scope
        {
            slice = sp;
            start = 0;
            end = sp.length;
        }

        // Covers the pairs stored in sp[s .. e] (indices into the flat array).
        this(Range sp, size_t s, size_t e) scope
        {
            slice = sp;
            start = s;
            end = e;
        }

        @property auto front()const
        {
            immutable a = slice[start];
            immutable b = slice[start+1];
            return CodepointInterval(a, b);
        }

        //may break sorted property - but we need std.sort to access it
        //hence package(std) protection attribute
        static if (hasAssignableElements!Range)
        package(std) @property void front(CodepointInterval val)
        {
            slice[start] = val.a;
            slice[start+1] = val.b;
        }

        @property auto back()const
        {
            immutable a = slice[end-2];
            immutable b = slice[end-1];
            return CodepointInterval(a, b);
        }

        //ditto about package
        static if (hasAssignableElements!Range)
        package(std) @property void back(CodepointInterval val)
        {
            slice[end-2] = val.a;
            slice[end-1] = val.b;
        }

        void popFront()
        {
            start += 2;
        }

        void popBack()
        {
            end -= 2;
        }

        // `idx` counts pairs relative to the current front
        auto opIndex(size_t idx) const
        {
            immutable a = slice[start+idx*2];
            immutable b = slice[start+idx*2+1];
            return CodepointInterval(a, b);
        }

        //ditto about package
        static if (hasAssignableElements!Range)
        package(std) void opIndexAssign(CodepointInterval val, size_t idx)
        {
            slice[start+idx*2] = val.a;
            slice[start+idx*2+1] = val.b;
        }

        // `s` and `e` count pairs relative to the current front
        auto opSlice(size_t s, size_t e)
        {
            return Intervals(slice, s*2+start, e*2+start);
        }

        // Number of pairs between `start` and `end`. It must follow
        // popFront/popBack/opSlice, so it cannot be derived from the
        // length of the whole underlying array.
        @property size_t length()const { return (end - start)/2; }

        @property bool empty()const { return start == end; }

        @property auto save(){ return this; }
    private:
        size_t start, end;
        Range slice;
    }

    // called after construction from intervals
    // to make sure invariants hold
    void sanitize()
    {
        import std.algorithm.comparison : max;
        import std.algorithm.mutation : SwapStrategy;
        import std.algorithm.sorting : sort;
        if (data.length == 0)
            return;
        alias Ival = CodepointInterval;
        //intervals wrapper for a _range_ over packed array
        auto ivals = Intervals!(typeof(data[]))(data[]);
        //@@@BUG@@@ can't use "a.a < b.a" see
        // https://issues.dlang.org/show_bug.cgi?id=12265
        sort!((a,b) => a.a < b.a, SwapStrategy.stable)(ivals);
        // what follows is a variation on stable remove
        // differences:
        // - predicate is binary, and is tested against
        //   the last kept element (at 'i').
// - predicate mutates lhs (merges rhs into lhs)
        size_t len = ivals.length;
        size_t i = 0;
        size_t j = 1;
        while (j < len)
        {
            if (ivals[i].b >= ivals[j].a)
            {
                ivals[i] = Ival(ivals[i].a, max(ivals[i].b, ivals[j].b));
                j++;
            }
            else //unmergable
            {
                // check if there is a hole after merges
                // (in the best case we do 0 writes to ivals)
                if (j != i+1)
                    ivals[i+1] = ivals[j]; //copy over
                i++;
                j++;
            }
        }
        len = i + 1;
        for (size_t k=0; k + 1 < len; k++)
        {
            assert(ivals[k].a < ivals[k].b);
            assert(ivals[k].b < ivals[k+1].a);
        }
        data.length = len * 2;
    }

    // special case for normal InversionList
    // Removes the single code point `ch` from this set (no-op if absent).
    ref subChar(dchar ch)
    {
        auto mark = skipUpTo(ch);
        // When `ch` is a member, skipUpTo leaves `mark` on the start of an
        // interval that begins with `ch`: either the interval already started
        // there, or it has just been split at `ch`. Only the element at `mark`
        // may be inspected; there is nothing before it when mark == 0.
        if (mark != data.length && data[mark] == ch)
        {
            if (data[mark+1] == ch + 1)
            {
                // the interval is exactly [ch, ch+1): remove the whole pair
                // instead of leaving a zero-width interval behind
                genericReplace(data, mark, mark+2, cast(uint[])[]);
            }
            else
            {
                // shrink the interval from the left: [ch, b) -> [ch+1, b)
                data[mark] = data[mark]+1;
            }
        }
        return this;
    }

    //
    // Adds the interval [a, b) to the set, merging it with the intervals it
    // touches. `hint` is the index at which the search for `a` starts;
    // the returned marker is the position of the last inserted element.
    Marker addInterval(int a, int b, Marker hint=Marker.init) scope
    in
    {
        assert(a <= b);
    }
    do
    {
        import std.range : assumeSorted, SearchPolicy;
        auto range = assumeSorted(data[]);
        size_t pos;
        size_t a_idx = hint + range[hint..$].lowerBound!(SearchPolicy.gallop)(a).length;
        if (a_idx == range.length)
        {
            // [---+++----++++----++++++]
            // [ a b]
            data.append(a, b);
            return data.length-1;
        }
        size_t b_idx = range[a_idx .. range.length].lowerBound!(SearchPolicy.gallop)(b).length+a_idx;
        uint[3] buf = void;
        uint to_insert;
        debug(std_uni)
        {
            writefln("a_idx=%d; b_idx=%d;", a_idx, b_idx);
        }
        if (b_idx == range.length)
        {
            // [-------++++++++----++++++-]
            // [ s a b]
            if (a_idx & 1)// a in positive
            {
                buf[0] = b;
                to_insert = 1;
            }
            else// a in negative
            {
                buf[0] = a;
                buf[1] = b;
                to_insert = 2;
            }
            pos = genericReplace(data, a_idx, b_idx, buf[0 .. to_insert]);
            return pos - 1;
        }

        uint top = data[b_idx];

        debug(std_uni)
        {
            writefln("a_idx=%d; b_idx=%d;", a_idx, b_idx);
            writefln("a=%s; b=%s; top=%s;", a, b, top);
        }
        if (a_idx & 1)
        {// a in positive
            if (b_idx & 1)// b in positive
            {
                // [-------++++++++----++++++-]
                // [ s a b ]
                buf[0] = top;
                to_insert = 1;
            }
            else // b in negative
            {
                // [-------++++++++----++++++-]
                // [ s a b ]
                if (top == b)
                {
                    assert(b_idx+1 < data.length);
                    buf[0] = data[b_idx+1];
                    pos = genericReplace(data, a_idx, b_idx+2, buf[0 .. 1]);
                    return pos - 1;
                }
                buf[0] = b;
                buf[1] = top;
                to_insert = 2;
            }
        }
        else
        { // a in negative
            if (b_idx & 1) // b in positive
            {
                // [----------+++++----++++++-]
                // [ a b ]
                buf[0] = a;
                buf[1] = top;
                to_insert = 2;
            }
            else// b in negative
            {
                // [----------+++++----++++++-]
                // [ a s b ]
                if (top == b)
                {
                    assert(b_idx+1 < data.length);
                    buf[0] = a;
                    buf[1] = data[b_idx+1];
                    pos = genericReplace(data, a_idx, b_idx+2, buf[0 .. 2]);
                    return pos - 1;
                }
                buf[0] = a;
                buf[1] = b;
                buf[2] = top;
                to_insert = 3;
            }
        }
        pos = genericReplace(data, a_idx, b_idx+1, buf[0 .. to_insert]);
        debug(std_uni)
        {
            // exactly one argument per format specifier
            writefln("marker idx: %d; length=%d", pos, data.length);
            writeln("inserting ", buf[0 .. to_insert]);
        }
        return pos - 1;
    }

    //
    // Removes from the set everything between the marker `pos` and the value
    // `a`; returns the marker from which processing can continue.
    Marker dropUpTo(uint a, Marker pos=Marker.init)
    in
    {
        assert(pos % 2 == 0); // at start of interval
    }
    do
    {
        auto range = assumeSorted!"a <= b"(data[pos .. data.length]);
        if (range.empty)
            return pos;
        size_t idx = pos;
        idx += range.lowerBound(a).length;

        debug(std_uni)
        {
            writeln("dropUpTo full length=", data.length);
            writeln(pos,"~~~", idx);
        }
        if (idx == data.length)
            return genericReplace(data, pos, idx, cast(uint[])[]);
        if (idx & 1)
        {   // a in positive
            //[--+++----++++++----+++++++------...]
            // |<---si s a t
            genericReplace(data, pos, idx, [a]);
        }
        else
        {   // a in negative
            //[--+++----++++++----+++++++-------+++...]
            // |<---si s a t
            genericReplace(data, pos, idx, cast(uint[])[]);
        }
        return pos;
    }

    //
    // Moves the marker forward to the value `a` without removing anything,
    // splitting the interval that contains `a` when `a` falls strictly inside.
    Marker skipUpTo(uint a, Marker pos=Marker.init)
    out(result)
    {
        assert(result % 2 == 0);// always start of interval
        //(may be 0-width after-split)
    }
    do
    {
        assert(data.length % 2 == 0);
        auto range = assumeSorted!"a <= b"(data[pos ..
data.length]); 3112 size_t idx = pos+range.lowerBound(a).length; 3113 3114 if (idx >= data.length) // could have Marker point to recently removed stuff 3115 return data.length; 3116 3117 if (idx & 1)// inside of interval, check for split 3118 { 3119 3120 immutable top = data[idx]; 3121 if (top == a)// no need to split, it's end 3122 return idx+1; 3123 immutable start = data[idx-1]; 3124 if (a == start) 3125 return idx-1; 3126 // split it up 3127 genericReplace(data, idx, idx+1, [a, a, top]); 3128 return idx+1; // avoid odd index 3129 } 3130 return idx; 3131 } 3132 3133 CowArray!SP data; 3134 } 3135 3136 pure @safe unittest 3137 { 3138 import std.conv : to; 3139 assert(unicode.ASCII.to!string() == "[0..128)"); 3140 } 3141 3142 // pedantic version for ctfe, and aligned-access only architectures 3143 @system private uint safeRead24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc 3144 { 3145 idx *= 3; 3146 version (LittleEndian) 3147 return ptr[idx] + (cast(uint) ptr[idx+1]<<8) 3148 + (cast(uint) ptr[idx+2]<<16); 3149 else 3150 return (cast(uint) ptr[idx]<<16) + (cast(uint) ptr[idx+1]<<8) 3151 + ptr[idx+2]; 3152 } 3153 3154 // ditto 3155 @system private void safeWrite24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc 3156 { 3157 idx *= 3; 3158 version (LittleEndian) 3159 { 3160 ptr[idx] = val & 0xFF; 3161 ptr[idx+1] = (val >> 8) & 0xFF; 3162 ptr[idx+2] = (val >> 16) & 0xFF; 3163 } 3164 else 3165 { 3166 ptr[idx] = (val >> 16) & 0xFF; 3167 ptr[idx+1] = (val >> 8) & 0xFF; 3168 ptr[idx+2] = val & 0xFF; 3169 } 3170 } 3171 3172 // unaligned x86-like read/write functions 3173 @system private uint unalignedRead24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc 3174 { 3175 uint* src = cast(uint*)(ptr+3*idx); 3176 version (LittleEndian) 3177 return *src & 0xFF_FFFF; 3178 else 3179 return *src >> 8; 3180 } 3181 3182 // ditto 3183 @system private void unalignedWrite24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc 3184 { 3185 uint* dest = 
cast(uint*)(cast(ubyte*) ptr + 3*idx);
    // `dest` spans 4 bytes but only 3 belong to element `idx`; the remaining
    // byte is read back and preserved by the read-modify-write below.
    version (LittleEndian)
        // Keep only the low 24 bits of `val`, exactly like safeWrite24 does,
        // so an oversized value cannot overwrite the neighbouring byte.
        *dest = (val & 0x00FF_FFFF) | (*dest & 0xFF00_0000);
    else
        *dest = (val << 8) | (*dest & 0xFF);
}

// Reads 24-bit element `idx`: byte-wise during CTFE or when the target has no
// unaligned reads, otherwise through a single 32-bit load.
@system private uint read24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc
{
    static if (hasUnalignedReads)
        return __ctfe ? safeRead24(ptr, idx) : unalignedRead24(ptr, idx);
    else
        return safeRead24(ptr, idx);
}

// Writes 24-bit element `idx`; same dispatch as read24.
@system private void write24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc
{
    static if (hasUnalignedReads)
        return __ctfe ? safeWrite24(ptr, val, idx) : unalignedWrite24(ptr, val, idx);
    else
        return safeWrite24(ptr, val, idx);
}

// Array of `uint` with value semantics implemented through a shared buffer
// and a reference count. The count lives in one extra slot at the very end of
// `data`, so the visible `length` is `data.length - 1`. An empty array has no
// buffer and therefore no count. Mutating accessors first give `this` a
// private copy whenever the count is not 1.
// `SP` supplies the allocation primitives (`alloc`, `realloc`, `append`,
// `destroy`, `accessIsSafe`); it is defined outside this view.
struct CowArray(SP=GcPolicy)
{
    import std.range.primitives : hasLength;

  @safe:
    // Wraps `arr` and lets `SP.append` add the slot for the reference count.
    static auto reuse(uint[] arr)
    {
        CowArray cow;
        cow.data = arr;
        SP.append(cow.data, 1);
        assert(cow.refCount == 1);
        assert(cow.length == arr.length);
        return cow;
    }

    // Builds the array from a range that knows its length.
    this(Range)(Range range)
        if (isInputRange!Range && hasLength!Range)
    {
        import std.algorithm.mutation : copy;
        length = range.length;
        copy(range, data[0..$-1]);
    }

    // Builds the array from a forward range: one pass to count, one to copy.
    this(Range)(Range range)
        if (isForwardRange!Range && !hasLength!Range)
    {
        import std.algorithm.mutation : copy;
        import std.range.primitives : walkLength;
        immutable len = walkLength(range.save);
        length = len;
        copy(range, data[0..$-1]);
    }

    // A copy shares the buffer, so it only bumps the count.
    this(this)
    {
        if (!empty)
        {
            refCount = refCount + 1;
        }
    }

    // Releases this reference; the buffer is destroyed with the last one.
    ~this()
    {
        if (!SP.accessIsSafe)
            // detach from the array, we can no longer access it.
            data = null;

        if (!empty)
        {
            immutable cnt = refCount;
            if (cnt == 1)
                SP.destroy(data);
            else
                refCount = cnt - 1;
        }
    }

    // no ref-count for empty U24 array
    @property bool empty() const { return data.length == 0; }

    // report one less than actual size
    @property size_t length() const
    {
        return data.length ? data.length - 1 : 0;
    }

    //+ an extra slot for ref-count
    // Resizes to `len` elements. A shared buffer is never resized in place:
    // this reference is detached onto a fresh buffer instead.
    @property void length(size_t len)
    {
        import std.algorithm.comparison : min;
        import std.algorithm.mutation : copy;
        if (len == 0)
        {
            if (!empty)
                freeThisReference();
            return;
        }
        immutable total = len + 1; // including ref-count
        if (empty)
        {
            data = SP.alloc!uint(total);
            refCount = 1;
            return;
        }
        immutable cur_cnt = refCount;
        if (cur_cnt != 1) // have more references to this memory
        {
            refCount = cur_cnt - 1;
            auto new_data = SP.alloc!uint(total);
            // take shrinking into account
            auto to_copy = min(total, data.length) - 1;
            copy(data[0 .. to_copy], new_data[0 .. to_copy]);
            data = new_data; // before setting refCount!
            refCount = 1;
        }
        else // 'this' is the only reference
        {
            // use the realloc (hopefully in-place operation)
            data = SP.realloc(data, total);
            refCount = 1; // setup a ref-count in the new end of the array
        }
    }

    alias opDollar = length;

    uint opIndex()(size_t idx)const
    {
        return data[idx];
    }

    // Element write; detaches from a shared buffer first.
    void opIndexAssign(uint val, size_t idx)
    {
        auto cnt = refCount;
        if (cnt != 1)
            dupThisReference(cnt);
        data[idx] = val;
    }

    // Mutable slice; detaches first because the caller may write through it.
    auto opSlice(size_t from, size_t to)
    {
        if (!empty)
        {
            auto cnt = refCount;
            if (cnt != 1)
                dupThisReference(cnt);
        }
        return data[from .. to];

    }

    // Read-only slice; no detaching needed.
    auto opSlice(size_t from, size_t to) const
    {
        return data[from .. to];
    }

    // length slices before the ref count
    auto opSlice()
    {
        return opSlice(0, length);
    }

    // ditto
    auto opSlice() const
    {
        return opSlice(0, length);
    }

    // Appends all elements of `range`.
    void append(Range)(Range range)
        if (isInputRange!Range && hasLength!Range && is(ElementType!Range : uint))
    {
        // scoped import, as in every other member that uses `copy`
        import std.algorithm.mutation : copy;
        size_t nl = length + range.length;
        length = nl;
        copy(range, this[nl-range.length .. nl]);
    }

    // Appends the given values.
    void append()(uint[] val...)
    {
        length = length + val.length;
        data[$-val.length-1 .. $-1] = val[];
    }

    // Compares the elements only; the reference counts are ignored.
    bool opEquals()(auto const ref CowArray rhs)const
    {
        if (empty ^ rhs.empty)
            return false; // one is empty and the other isn't
        return empty || data[0..$-1] == rhs.data[0..$-1];
    }

private:
    // ref-count is right after the data
    @property uint refCount() const
    {
        return data[$-1];
    }

    @property void refCount(uint cnt)
    {
        data[$-1] = cnt;
    }

    // Gives up this reference and leaves `this` empty.
    void freeThisReference()
    {
        immutable count = refCount;
        if (count != 1) // have more references to this memory
        {
            // dec shared ref-count
            refCount = count - 1;
            data = [];
        }
        else
            SP.destroy(data);
        assert(!data.ptr);
    }

    // Detaches from a shared buffer: `this` gets a private copy with count 1.
    // `count` must be the current, non-unique reference count.
    void dupThisReference(uint count)
    in
    {
        assert(!empty && count != 1 && count == refCount);
    }
    do
    {
        import std.algorithm.mutation : copy;
        // dec shared ref-count
        refCount = count - 1;
        // copy to the new chunk of RAM
        auto new_data = SP.alloc!uint(data.length);
        // bit-blit old stuff except the counter
        copy(data[0..$-1], new_data[0..$-1]);
        data = new_data; // before setting refCount!
        refCount = 1; // so that this updates the right one
    }

    uint[] data;
}

pure @safe unittest// Uint24 tests
{
    import std.algorithm.comparison : equal;
    import std.algorithm.mutation : copy;
    import std.conv : text;
    import std.range : iota, chain;
    import std.range.primitives : isBidirectionalRange, isOutputRange;
    void funcRef(T)(ref T u24)
    {
        u24.length = 2;
        u24[1] = 1024;
        T u24_c = u24;
        assert(u24[1] == 1024);
        u24.length = 0;
        assert(u24.empty);
        u24.append([1, 2]);
        assert(equal(u24[], [1, 2]));
        u24.append(111);
        assert(equal(u24[], [1, 2, 111]));
        assert(!u24_c.empty && u24_c[1] == 1024);
        u24.length = 3;
        copy(iota(0, 3), u24[]);
        assert(equal(u24[], iota(0, 3)));
        assert(u24_c[1] == 1024);
    }

    void func2(T)(T u24)
    {
        T u24_2 = u24;
        T u24_3;
        u24_3 = u24_2;
        assert(u24_2 == u24_3);
        assert(equal(u24[], u24_2[]));
        assert(equal(u24_2[], u24_3[]));
        funcRef(u24_3);

        assert(equal(u24_3[], iota(0, 3)));
        assert(!equal(u24_2[], u24_3[]));
        assert(equal(u24_2[], u24[]));
        u24_2 = u24_3;
        assert(equal(u24_2[], iota(0, 3)));
        // to test that passed arg is intact outside
        // plus try out opEquals
        u24 = u24_3;
        u24 = T.init;
        u24_3 = T.init;
        assert(u24.empty);
        assert(u24 == u24_3);
        assert(u24 != u24_2);
    }

    static foreach (Policy; AliasSeq!(GcPolicy, ReallocPolicy))
    {{
        alias Range = typeof(CowArray!Policy.init[]);
        alias U24A = CowArray!Policy;
        static assert(isForwardRange!Range);
        static assert(isBidirectionalRange!Range);
        static assert(isOutputRange!(Range, uint));
        static assert(isRandomAccessRange!(Range));

        auto arr = U24A([42u, 36, 100]);
        assert(arr[0] == 42);
        assert(arr[1] == 36);
        arr[0] = 72;
        arr[1] = 0xFE_FEFE;
        assert(arr[0] == 72);
assert(arr[1] == 0xFE_FEFE);
        assert(arr[2] == 100);
        U24A arr2 = arr;
        assert(arr2[0] == 72);
        arr2[0] = 11;
        // test COW-ness
        assert(arr[0] == 72);
        assert(arr2[0] == 11);
        // set this to about 100M to stress-test COW memory management
        foreach (v; 0 .. 10_000)
            func2(arr);
        assert(equal(arr[], [72, 0xFE_FEFE, 100]));

        auto r2 = U24A(iota(0, 100));
        assert(equal(r2[], iota(0, 100)), text(r2[]));
        copy(iota(10, 170, 2), r2[10 .. 90]);
        assert(equal(r2[], chain(iota(0, 10), iota(10, 170, 2), iota(90, 100)))
               , text(r2[]));
    }}
}

pure @safe unittest// core set primitives test
{
    import std.conv : text;
    alias AllSets = AliasSeq!(InversionList!GcPolicy, InversionList!ReallocPolicy);
    foreach (CodeList; AllSets)
    {
        CodeList a;
        //"plug a hole" test
        a.add(10, 20).add(25, 30).add(15, 27);
        assert(a == CodeList(10, 30), text(a));

        auto x = CodeList.init;
        x.add(10, 20).add(30, 40).add(50, 60);

        a = x;
        a.add(20, 49);//[10, 49) [50, 60)
        assert(a == CodeList(10, 49, 50 ,60));

        a = x;
        a.add(20, 50);
        assert(a == CodeList(10, 60), text(a));

        // simple unions, mostly edge effects
        x = CodeList.init;
        x.add(10, 20).add(40, 60);

        a = x;
        a.add(10, 25); //[10, 25) [40, 60)
        assert(a == CodeList(10, 25, 40, 60));

        a = x;
        a.add(5, 15); //[5, 20) [40, 60)
        assert(a == CodeList(5, 20, 40, 60));

        a = x;
        a.add(0, 10); // [0, 20) [40, 60)
        assert(a == CodeList(0, 20, 40, 60));

        a = x;
        a.add(0, 5); // prepend
        assert(a == CodeList(0, 5, 10, 20, 40, 60), text(a));

        a = x;
        a.add(5, 20);
        assert(a == CodeList(5, 20, 40, 60));

        a = x;
        a.add(3, 37);
        assert(a == CodeList(3, 37, 40, 60));

        a = x;
        a.add(37, 65);
        assert(a == CodeList(10, 20, 37, 65));

        // some tests on helpers for set intersection
        x = CodeList.init.add(10, 20).add(40, 60).add(100, 120);
        a = x;

        // 60 is an interval end: nothing to split, the marker lands on [100, 120)
        auto m = a.skipUpTo(60);
        a.dropUpTo(110, m);
        assert(a == CodeList(10, 20, 40, 60, 110, 120), text(a.data[]));

        a = x;
        a.dropUpTo(100);
        assert(a == CodeList(100, 120), text(a.data[]));

        a = x;
        // 50 is inside [40, 60): the interval is split, then the rest is dropped
        m = a.skipUpTo(50);
        a.dropUpTo(140, m);
        assert(a == CodeList(10, 20, 40, 50), text(a.data[]));
        a = x;
        a.dropUpTo(60);
        assert(a == CodeList(100, 120), text(a.data[]));
    }
}


//test constructor to work with any order of intervals
pure @safe unittest
{
    import std.algorithm.comparison : equal;
    import std.conv : text, to;
    import std.range : chain, iota;
    import std.typecons : tuple;
    //ensure constructor handles bad ordering and overlap
    auto c1 = CodepointSet('а', 'я'+1, 'А','Я'+1);
    foreach (ch; chain(iota('а', 'я'+1), iota('А','Я'+1)))
        assert(ch in c1, to!string(ch));

    //contiguous
    assert(CodepointSet(1000, 1006, 1006, 1009)
        .byInterval.equal([tuple(1000, 1009)]));
    //contains
    assert(CodepointSet(900, 1200, 1000, 1100)
        .byInterval.equal([tuple(900, 1200)]));
    //intersect left
    assert(CodepointSet(900, 1100, 1000, 1200)
        .byInterval.equal([tuple(900, 1200)]));
    //intersect right
    assert(CodepointSet(1000, 1200, 900, 1100)
        .byInterval.equal([tuple(900, 1200)]));

    //ditto with extra items at end
    assert(CodepointSet(1000, 1200, 900, 1100, 800, 850)
        .byInterval.equal([tuple(800, 850), tuple(900, 1200)]));
    assert(CodepointSet(900, 1100, 1000, 1200, 800, 850)
        .byInterval.equal([tuple(800, 850), tuple(900, 1200)]));

    //"plug a hole" test
    auto c2 = CodepointSet(20, 40,
        60, 80, 100, 140, 150, 200,
        40, 60, 80, 100, 140, 150
    );
    assert(c2.byInterval.equal([tuple(20, 200)]));

    auto c3 = CodepointSet(
        20, 40, 60, 80, 100, 140, 150, 200,
        0, 10, 15, 100, 10, 20, 200, 220);
    assert(c3.byInterval.equal([tuple(0, 140), tuple(150, 220)]));
}


pure @safe unittest
{   // full set operations
    import std.conv : text;
    alias AllSets = AliasSeq!(InversionList!GcPolicy, InversionList!ReallocPolicy);
    foreach (CodeList; AllSets)
    {
        CodeList a, b, c, d;

        //"plug a hole"
        a.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
        b.add(40, 60).add(80, 100).add(140, 150);
        c = a | b;
        d = b | a;
        assert(c == CodeList(20, 200), text(CodeList.stringof," ", c));
        assert(c == d, text(c," vs ", d));

        b = CodeList.init.add(25, 45).add(65, 85).add(95,110).add(150, 210);
        c = a | b; //[20,45) [60, 85) [95, 140) [150, 210)
        d = b | a;
        assert(c == CodeList(20, 45, 60, 85, 95, 140, 150, 210), text(c));
        assert(c == d, text(c," vs ", d));

        b = CodeList.init.add(10, 20).add(30,100).add(145,200);
        c = a | b;//[10, 140) [145, 200)
        d = b | a;
        assert(c == CodeList(10, 140, 145, 200));
        assert(c == d, text(c," vs ", d));

        b = CodeList.init.add(0, 10).add(15, 100).add(10, 20).add(200, 220);
        c = a | b;//[0, 140) [150, 220)
        d = b | a;
        assert(c == CodeList(0, 140, 150, 220));
        assert(c == d, text(c," vs ", d));


        a = CodeList.init.add(20, 40).add(60, 80);
        b = CodeList.init.add(25, 35).add(65, 75);
        c = a & b;
        d = b & a;
        assert(c == CodeList(25, 35, 65, 75), text(c));
        assert(c == d, text(c," vs ", d));

        a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
        b = CodeList.init.add(25, 35).add(65, 75).add(110, 130).add(160, 180);
        c = a & b;
        d = b & a;
        assert(c == CodeList(25, 35, 65, 75, 110, 130, 160, 180), text(c));
        assert(c == d, text(c," vs ", d));

        a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
        b = CodeList.init.add(10, 30).add(60, 120).add(135, 160);
        c = a & b;//[20, 30)[60, 80) [100, 120) [135, 140) [150, 160)
        d = b & a;

        assert(c == CodeList(20, 30, 60, 80, 100, 120, 135, 140, 150, 160),text(c));
        assert(c == d, text(c, " vs ",d));
        assert((c & a) == c);
        assert((d & b) == d);
        assert((c & d) == d);

        b = CodeList.init.add(40, 60).add(80, 100).add(140, 200);
        c = a & b;
        d = b & a;
        assert(c == CodeList(150, 200), text(c));
        assert(c == d, text(c, " vs ",d));
        assert((c & a) == c);
        assert((d & b) == d);
        assert((c & d) == d);

        assert((a & a) == a);
        assert((b & b) == b);

        a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
        b = CodeList.init.add(30, 60).add(75, 120).add(190, 300);
        c = a - b;// [20, 30) [60, 75) [120, 140) [150, 190)
        d = b - a;// [40, 60) [80, 100) [200, 300)
        assert(c == CodeList(20, 30, 60, 75, 120, 140, 150, 190), text(c));
        assert(d == CodeList(40, 60, 80, 100, 200, 300), text(d));
        assert(c - d == c, text(c-d, " vs ", c));
        assert(d - c == d, text(d-c, " vs ", d));
        assert(c - c == CodeList.init);
        assert(d - d == CodeList.init);

        a = CodeList.init.add(20, 40).add( 60, 80).add(100, 140).add(150, 200);
        b = CodeList.init.add(10, 50).add(60, 160).add(190, 300);
        c = a - b;// [160, 190)
        d = b - a;// [10, 20) [40, 50) [80, 100) [140, 150) [200, 300)
        assert(c == CodeList(160, 190), text(c));
        assert(d == CodeList(10, 20, 40, 50, 80, 100, 140, 150, 200, 300), text(d));
        assert(c - d == c, text(c-d, " vs ", c));
        assert(d - c == d, text(d-c, " vs ", d));
        assert(c - c == CodeList.init);
        assert(d - d == CodeList.init);

        a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
        b = CodeList.init.add(10, 30).add(45, 100).add(130, 190);
        c = a ~ b; // [10, 20) [30, 40) [45, 60) [80, 130) [140, 150) [190, 200)
        d = b ~ a;
        assert(c == CodeList(10, 20, 30, 40, 45, 60, 80, 130, 140, 150, 190, 200),
               text(c));
        assert(c == d, text(c, " vs ", d));
    }
}

}

pure @safe unittest// vs single dchar
{
    import std.conv : text;
    CodepointSet a = CodepointSet(10, 100, 120, 200);
    assert(a - 'A' == CodepointSet(10, 65, 66, 100, 120, 200), text(a - 'A'));
    assert((a & 'B') == CodepointSet(66, 67));
}

pure @safe unittest// iteration & opIndex
{
    import std.algorithm.comparison : equal;
    import std.conv : text;
    import std.typecons : tuple, Tuple;

    static foreach (CodeList; AliasSeq!(InversionList!(ReallocPolicy)))
    {{
        auto arr = "ABCDEFGHIJKLMabcdefghijklm"d;
        auto a = CodeList('A','N','a', 'n');
        assert(equal(a.byInterval,
                [tuple(cast(uint)'A', cast(uint)'N'), tuple(cast(uint)'a', cast(uint)'n')]
            ), text(a.byInterval));

        // same @@@BUG as in https://issues.dlang.org/show_bug.cgi?id=8949 ?
        version (bug8949)
        {
            import std.range : retro;
            assert(equal(retro(a.byInterval),
                [tuple(cast(uint)'a', cast(uint)'n'), tuple(cast(uint)'A', cast(uint)'N')]
            ), text(retro(a.byInterval)));
        }
        auto achr = a.byCodepoint;
        assert(equal(achr, arr), text(a.byCodepoint));
        foreach (ch; a.byCodepoint)
            assert(a[ch]);
        auto x = CodeList(100, 500, 600, 900, 1200, 1500);
        assert(equal(x.byInterval, [ tuple(100, 500), tuple(600, 900), tuple(1200, 1500)]), text(x.byInterval));
        foreach (ch; x.byCodepoint)
            assert(x[ch]);
        static if (is(CodeList == CodepointSet))
        {
            auto y = CodeList(x.byInterval);
            assert(equal(x.byInterval, y.byInterval));
        }
        assert(equal(CodepointSet.init.byInterval, cast(Tuple!(uint, uint)[])[]));
        assert(equal(CodepointSet.init.byCodepoint, cast(dchar[])[]));
    }}
}

//============================================================================
// Generic Trie template and various ways
// to build it
//============================================================================

// debug helper to get a shortened array dump
// Anything longer than 32 elements is shown as its first and last 16
// elements joined by "~...~".
auto arrayRepr(T)(T x)
{
    import std.conv : text;
    if (x.length > 32)
    {
        return text(x[0 .. 16],"~...~", x[x.length-16 .. x.length]);
    }
    else
        return text(x);
}

/**
    Maps `Key` to a suitable integer index within the range of `size_t`.
    The mapping is constructed by applying predicates from `Prefix` left to right
    and concatenating the resulting bits.

    The first (leftmost) predicate defines the most significant bits of
    the resulting index.
 */
template mapTrieIndex(Prefix...)
{
    size_t mapTrieIndex(Key)(Key key)
        if (isValidPrefixForTrie!(Key, Prefix))
    {
        alias p = Prefix;
        size_t idx;
        foreach (i, v; p[0..$-1])
        {
            idx |= p[i](key);
            // make room for the bits of the next predicate
            idx <<= p[i+1].bitSize;
        }
        idx |= p[$-1](key);
        return idx;
    }
}

/*
    `TrieBuilder` is a type used for incremental construction
    of $(LREF Trie)s.

    See $(LREF buildTrie) for generic helpers built on top of it.
*/
@trusted private struct TrieBuilder(Value, Key, Args...)
if (isBitPackableType!Value && isValidArgsForTrie!(Key, Args))
{
    import std.exception : enforce;

private:
    // last index is not stored in table, it is used as an offset to values in a block.
    static if (is(Value == bool))// always pack bool
        alias V = BitPacked!(Value, 1);
    else
        alias V = Value;
    // Number of distinct indices addressable by `Preds`: 2 ^^ (sum of bit sizes).
    static auto deduceMaxIndex(Preds...)()
    {
        size_t idx = 1;
        foreach (v; Preds)
            idx *= 2^^v.bitSize;
        return idx;
    }

    static if (is(typeof(Args[0]) : Key)) // Args start with upper bound on Key
    {
        alias Prefix = Args[1..$];
        enum lastPageSize = 2^^Prefix[$-1].bitSize;
        enum translatedMaxIndex = mapTrieIndex!(Prefix)(Args[0]);
        // upper bound rounded up to a whole number of last-level pages
        enum roughedMaxIndex =
            (translatedMaxIndex + lastPageSize-1)/lastPageSize*lastPageSize;
        // check wrap-around - if wrapped, use the default deduction rule
        enum maxIndex = roughedMaxIndex < translatedMaxIndex ?
            deduceMaxIndex!(Prefix)() : roughedMaxIndex;
    }
    else
    {
        alias Prefix = Args;
        enum maxIndex = deduceMaxIndex!(Prefix)();
    }

    alias getIndex = mapTrieIndex!(Prefix);

    enum lastLevel = Prefix.length-1;
    struct ConstructState
    {
        size_t idx_zeros, idx_ones;
    }
    // iteration over levels of Trie, each indexes its own level and thus a shortened domain
    size_t[Prefix.length] indices;
    // default filler value to use
    Value defValue;
    // this is a full-width index of next item
    size_t curIndex;
    // all-zeros page index, all-ones page index (+ indicator if there is such a page)
    // (size_t.max means "no such page yet", see the constructor)
    ConstructState[Prefix.length] state;
    // the table being constructed
    MultiArray!(idxTypes!(Key, fullBitSize!(Prefix), Prefix[0..$]), V) table;

    @disable this();

    //shortcut for index variable at level 'level'
    @property ref idx(size_t level)(){ return indices[level]; }

    // this function assumes no holes in the input so
    // indices are going one by one
    // Writes `numVals` copies of `val` at the current position of `level`,
    // advancing that level's index and spilling to the level above whenever
    // a page boundary is reached.
    void addValue(size_t level, T)(T val, size_t numVals)
    {
        alias j = idx!level;
        enum pageSize = 1 << Prefix[level].bitSize;
        if (numVals == 0)
            return;
        auto ptr = table.slice!(level);
        if (numVals == 1)
        {
            static if (level == Prefix.length-1)
                ptr[j] = val;
            else
            {// can incur narrowing conversion
                assert(j < ptr.length);
                ptr[j] = force!(typeof(ptr[j]))(val);
            }
            j++;
            if (j % pageSize == 0)
                spillToNextPage!level(ptr);
            return;
        }
        // longer row of values
        // get to the next page boundary
        immutable nextPB = (j + pageSize) & ~(pageSize-1);
        immutable n =  nextPB - j;// can fill right in this page
        if (numVals < n) //fits in current page
        {
            ptr[j .. j+numVals]  = val;
            j += numVals;
            return;
        }
        static if (level != 0)//on the first level it always fits
        {
            numVals -= n;
            //write till the end of current page
            ptr[j .. j+n]  = val;
            j += n;
            //spill to the next page
            spillToNextPage!level(ptr);
            // page at once loop
            // shortcut: a known all-default page is referenced from the level
            // above once per whole page instead of being written out again
            if (state[level].idx_zeros != size_t.max && val == T.init)
            {
                alias NextIdx = typeof(table.slice!(level-1)[0]);
                addValue!(level-1)(force!NextIdx(state[level].idx_zeros),
                    numVals/pageSize);
                ptr = table.slice!level; //table structure might have changed
                numVals %= pageSize;
            }
            else
            {
                while (numVals >= pageSize)
                {
                    numVals -= pageSize;
                    ptr[j .. j+pageSize]  = val;
                    j += pageSize;
                    spillToNextPage!level(ptr);
                }
            }
            if (numVals)
            {
                // the leftovers, an incomplete page
                ptr[j .. j+numVals]  = val;
                j += numVals;
            }
        }
    }

    void spillToNextPage(size_t level, Slice)(ref Slice ptr)
    {
        // last level (i.e. topmost) has 1 "page"
        // thus it need not to add a new page on upper level
        static if (level != 0)
            spillToNextPageImpl!(level)(ptr);
    }

    // this can re-use the current page if duplicate or allocate a new one
    // it also makes sure that previous levels point to the correct page in this level
    void spillToNextPageImpl(size_t level, Slice)(ref Slice ptr)
    {
        alias NextIdx = typeof(table.slice!(level-1)[0]);
        NextIdx next_lvl_index;
        enum pageSize = 1 << Prefix[level].bitSize;
        assert(idx!level % pageSize == 0);
        // `last` is where the page that was just completed starts
        immutable last = idx!level-pageSize;
        const slice = ptr[idx!level - pageSize .. idx!level];
        size_t j;
        // look for an earlier page with identical contents
        for (j=0; j<last; j+=pageSize)
        {
            if (ptr[j .. j+pageSize] == slice)
            {
                // get index to it, reuse ptr space for the next block
                next_lvl_index = force!NextIdx(j/pageSize);
                version (none)
                {
                import std.stdio : writefln, writeln;
                    writefln("LEVEL(%s) page mapped idx: %s: 0..%s ---> [%s..%s]"
                            ,level
                            ,indices[level-1], pageSize, j, j+pageSize);
                    writeln("LEVEL(", level
                            , ") mapped page is: ", slice, ": ", arrayRepr(ptr[j .. j+pageSize]));
                    writeln("LEVEL(", level
                            , ") src page is :", ptr, ": ", arrayRepr(slice[0 .. pageSize]));
                }
                idx!level -= pageSize; // reuse this page, it is duplicate
                break;
            }
        }
        // the loop ran to completion: no duplicate found, keep the page
        if (j == last)
        {
    L_allocate_page:
            next_lvl_index = force!NextIdx(idx!level/pageSize - 1);
            // remember the first all-zero page for the shortcut in addValue
            if (state[level].idx_zeros == size_t.max && ptr.zeros(j, j+pageSize))
            {
                state[level].idx_zeros = next_lvl_index;
            }
            // allocate next page
            version (none)
            {
            import std.stdio : writefln;
                writefln("LEVEL(%s) page allocated: %s"
                         , level, arrayRepr(slice[0 .. pageSize]));
                writefln("LEVEL(%s) index: %s ; page at this index %s"
                         , level
                         , next_lvl_index
                         , arrayRepr(
                             table.slice!(level)
                             [pageSize*next_lvl_index..(next_lvl_index+1)*pageSize]
                            ));
            }
            table.length!level = table.length!level + pageSize;
        }
    L_know_index:
        // for the previous level, values are indices to the pages in the current level
        addValue!(level-1)(next_lvl_index, 1);
        ptr = table.slice!level; //re-load the slice after moves
    }

    // idx - full-width index to fill with v (full-width index != key)
    // fills everything in the range of [curIndex, idx) with filler
    void putAt(size_t idx, Value v)
    {
        assert(idx >= curIndex);
        immutable numFillers = idx - curIndex;
        addValue!lastLevel(defValue, numFillers);
        addValue!lastLevel(v, 1);
        curIndex = idx + 1;
    }

    // ditto, but sets the range of [idxA, idxB) to v
    void putRangeAt(size_t idxA, size_t idxB, Value v)
    {
        assert(idxA >= curIndex);
        assert(idxB >= idxA);
        size_t numFillers = idxA - curIndex;
        addValue!lastLevel(defValue, numFillers);
        addValue!lastLevel(v, idxB - idxA);
        curIndex = idxB; // open-right
    }

    enum errMsg = "non-monotonic prefix function(s), an unsorted range or "~
        "duplicate key->value mapping";

public:
    /**
        Construct a builder, where `filler` is a value
        to indicate empty slots (or "not found" condition).
    */
    this(Value filler)
    {
        curIndex = 0;
        defValue = filler;
        // zeros-page index, ones-page index
        foreach (ref v; state)
            v = ConstructState(size_t.max, size_t.max);
        table = typeof(table)(indices);
        // one page per level is a bootstrap minimum
        foreach (i, Pred; Prefix)
            table.length!i = (1 << Pred.bitSize);
    }

    /**
        Put a value `v` into interval as
        mapped by keys from `a` to `b`.
        All slots prior to `a` are filled with
        the default filler.
    */
    void putRange(Key a, Key b, Value v)
    {
        auto idxA = getIndex(a), idxB = getIndex(b);
        // indexes of key should always grow
        enforce(idxB >= idxA && idxA >= curIndex, errMsg);
        putRangeAt(idxA, idxB, v);
    }

    /**
        Put a value `v` into slot mapped by `key`.
        All slots prior to `key` are filled with the
        default filler.
    */
    void putValue(Key key, Value v)
    {
        auto idx = getIndex(key);
        enforce(idx >= curIndex, errMsg);
        putAt(idx, v);
    }

    /// Finishes construction of Trie, yielding an immutable Trie instance.
    auto build()
    {
        static if (maxIndex != 0) // doesn't cover full range of size_t
        {
            assert(curIndex <= maxIndex);
            // pad the remainder of the domain with the filler
            addValue!lastLevel(defValue, maxIndex - curIndex);
        }
        else
        {
            if (curIndex != 0 // couldn't wrap around
                || (Prefix.length != 1 && indices[lastLevel] == 0)) // can be just empty
            {
                // two steps, as the number of remaining slots may not fit in size_t
                addValue!lastLevel(defValue, size_t.max - curIndex);
                addValue!lastLevel(defValue, 1);
            }
            // else curIndex already completed the full range of size_t by wrapping around
        }
        return Trie!(V, Key, maxIndex, Prefix)(table);
    }
}

/**
    $(P A generic Trie data-structure for a fixed number of stages.
    The design goal is optimal speed with smallest footprint size.
    )
    $(P It's intentionally read-only and doesn't provide constructors.
     To construct one use a special builder,
     see $(LREF TrieBuilder) and $(LREF buildTrie).
    )

*/
@trusted private struct Trie(Value, Key, Args...)
if (isValidPrefixForTrie!(Key, Args)
    || (isValidPrefixForTrie!(Key, Args[1..$])
    && is(typeof(Args[0]) : size_t)))
{
    import std.range.primitives : isOutputRange;
    // A leading value convertible to size_t is the upper bound on indices;
    // everything after it is the list of per-level predicates.
    static if (is(typeof(Args[0]) : size_t))
    {
        private enum maxIndex = Args[0];
        private enum hasBoundsCheck = true;
        private alias Prefix = Args[1..$];
    }
    else
    {
        private enum hasBoundsCheck = false;
        private alias Prefix = Args;
    }

    // Wraps an already populated table.
    private this()(typeof(_table) table)
    {
        _table = table;
    }

    // only for constant Tries constructed from precompiled tables
    private this()(const(size_t)[] offsets, const(size_t)[] sizes,
        const(size_t)[] data) const
    {
        _table = typeof(_table)(offsets, sizes, data);
    }

    /**
        $(P Lookup the `key` in this `Trie`. )

        $(P The lookup always succeeds if key fits the domain
        provided during construction. The whole domain defined
        is covered so instead of not found condition
        the sentinel (filler) value could be used. )

        $(P See $(LREF buildTrie), $(LREF TrieBuilder) for how to
        define a domain of `Trie` keys and the sentinel value. )

        Note:
        Domain range-checking is done with `assert`, so it is only
        active in builds that keep assertions and results in
        assertion failure.
    */
    TypeOfBitPacked!Value opIndex()(Key key) const
    {
        static if (hasBoundsCheck)
            assert(mapTrieIndex!Prefix(key) < maxIndex);
        // The first predicate indexes the top level directly. Each level
        // then yields a page number, which combined with the next
        // predicate's bits gives the position inside the following level.
        size_t pos = cast(size_t) Prefix[0](key);
        foreach (lvl, pred; Prefix[0 .. $-1])
        {
            pos = cast(size_t)((_table.ptr!lvl[pos] << Prefix[lvl+1].bitSize)
                + Prefix[lvl+1](key));
        }
        return _table.ptr!(Prefix.length-1)[pos];
    }

    ///
    @property size_t bytes(size_t n=size_t.max)() const
    {
        return _table.bytes!n;
    }

    ///
    @property size_t pages(size_t n)() const
    {
        // byte count of level `n` divided by 2^^bitSize, rounded to nearest
        return (bytes!n + 2^^(Prefix[n].bitSize-1)) / 2^^Prefix[n].bitSize;
    }

    ///
    void store(OutRange)(scope OutRange sink) const
        if (isOutputRange!(OutRange, char))
    {
        _table.store(sink);
    }

private:
    MultiArray!(idxTypes!(Key, fullBitSize!(Prefix), Prefix[0..$]), Value) _table;
}

// Builds the sequence of `sliceBits` predicates that cut the lowest `top` bits
// into consecutive fields of the given `sizes`, the most significant field first.
template GetBitSlicing(size_t top, sizes...)
{
    static if (sizes.length == 0)
        alias GetBitSlicing = AliasSeq!();
    else
        alias GetBitSlicing =
            AliasSeq!(sliceBits!(top - sizes[0], top),
                      GetBitSlicing!(top - sizes[0], sizes[1..$]));
}

// `callableWith!T` is a predicate template: `callableWith!T!Pred` holds when
// `Pred(T.init)` compiles and its result is a bit-packable type.
template callableWith(T)
{
    template callableWith(alias Pred)
    {
        static if (is(typeof(Pred(T.init))))
        {
            alias Result = typeof(Pred(T.init));
            enum callableWith = isBitPackableType!(TypeOfBitPacked!(Result));
        }
        else
            enum callableWith = false;
    }
}

/*
    Check if `Prefix` is a valid set of predicates
    for `Trie` template having `Key` as the type of keys.
    This requires all predicates to be callable, take
    single argument of type `Key` and return unsigned value.
*/
template isValidPrefixForTrie(Key, Prefix...)
{
    import std.meta : allSatisfy;
    // every predicate must be callable with a `Key` and yield a bit-packable value
    enum isValidPrefixForTrie = allSatisfy!(callableWith!Key, Prefix); // TODO: tighten the screws
}

/*
    Check if `Args` is a set of maximum key value followed by valid predicates
    for `Trie` template having `Key` as the type of keys.

    A lone argument can only be a predicate, so it is validated against `Key`
    like any other prefix. (Omitting `Key` here would bind the predicate itself
    as the key type and leave an empty, trivially valid, prefix.)
*/
template isValidArgsForTrie(Key, Args...)
{
    static if (Args.length > 1)
    {
        enum isValidArgsForTrie = isValidPrefixForTrie!(Key, Args)
            || (isValidPrefixForTrie!(Key, Args[1..$]) && is(typeof(Args[0]) : Key));
    }
    else
        enum isValidArgsForTrie = isValidPrefixForTrie!(Key, Args);
}

// Sum of a compile-time tuple of integers (0 for an empty tuple).
@property size_t sumOfIntegerTuple(ints...)()
{
    size_t total = 0;
    foreach (v; ints)
        total += v;
    return total;
}

/**
    A shorthand for creating a custom multi-level fixed Trie
    from a `CodepointSet`. `sizes` are numbers of bits per level,
    with the most significant bits used first.

    Note: The sum of `sizes` must be equal 21.

    See_Also: $(LREF toTrie), which is even simpler.

    Example:
    ---
    {
        import std.stdio;
        auto set = unicode("Number");
        auto trie = codepointSetTrie!(8, 5, 8)(set);
        writeln("Input code points to test:");
        foreach (line; stdin.byLine)
        {
            int count=0;
            foreach (dchar ch; line)
                if (trie[ch])// is number
                    count++;
            writefln("Contains %d number code points.", count);
        }
    }
    ---
*/
public template codepointSetTrie(sizes...)
if (sumOfIntegerTuple!sizes == 21)
{
    auto codepointSetTrie(Set)(Set set)
    if (isCodepointSet!Set)
    {
        // default (filler) value is false; members of the set map to true
        auto builder = TrieBuilder!(bool, dchar, lastDchar+1, GetBitSlicing!(21, sizes))(false);
        foreach (ival; set.byInterval)
            builder.putRange(ival[0], ival[1], true);
        return builder.build();
    }
}

/// Type of Trie generated by codepointSetTrie function.
public template CodepointSetTrie(sizes...)
if (sumOfIntegerTuple!sizes == 21)
{
    alias Prefix = GetBitSlicing!(21, sizes);
    alias CodepointSetTrie = typeof(TrieBuilder!(bool, dchar, lastDchar+1, Prefix)(false).build());
}

/**
    A slightly more general tool for building fixed `Trie`
    for the Unicode data.

    Specifically, unlike `codepointSetTrie`, it allows creating mappings
    of `dchar` to an arbitrary type `T`.

    Note: Overload taking `CodepointSet`s will naturally convert
    only to bool mapping `Trie`s.

    CodepointTrie is the type of Trie as generated by codepointTrie function.
*/
public template codepointTrie(T, sizes...)
if (sumOfIntegerTuple!sizes == 21)
{
    alias Prefix = GetBitSlicing!(21, sizes);

    static if (is(TypeOfBitPacked!T == bool))
    {
        auto codepointTrie(Set)(const scope Set set)
        if (isCodepointSet!Set)
        {
            // `codepointSetTrie` is constrained on its level sizes summing
            // to 21, so they have to be forwarded explicitly.
            return codepointSetTrie!sizes(set);
        }
    }

    ///
    auto codepointTrie()(T[dchar] map, T defValue=T.init)
    {
        return buildTrie!(T, dchar, Prefix)(map, defValue);
    }

    // unsorted range of pairs
    ///
    auto codepointTrie(R)(R range, T defValue=T.init)
    if (isInputRange!R
        && is(typeof(ElementType!R.init[0]) : T)
        && is(typeof(ElementType!R.init[1]) : dchar))
    {
        // build from unsorted array of pairs
        // TODO: expose index sorting functions for Trie
        return buildTrie!(T, dchar, Prefix)(range, defValue, true);
    }
}

@system pure unittest
{
    import std.algorithm.comparison : max;
    import std.algorithm.searching : count;

    // pick characters from the Greek script
    auto set = unicode.Greek;

    // a user-defined property (or an expensive function)
    // that we want to look up
    static uint luckFactor(dchar ch)
    {
        // here we consider a character lucky
        // if its code point has a lot of identical hex-digits
        // e.g. arabic letter DDAL (\u0688) has a "luck factor" of 2
        ubyte[6] nibbles; // 6 4-bit chunks of code point
        uint value = ch;
        foreach (i; 0 .. 6)
        {
            nibbles[i] = value & 0xF;
            value >>= 4;
        }
        uint luck;
        foreach (n; nibbles)
            luck = cast(uint) max(luck, count(nibbles[], n));
        return luck;
    }

    // only unsigned built-ins are supported at the moment
    alias LuckFactor = BitPacked!(uint, 3);

    // create a temporary associative array (AA)
    LuckFactor[dchar] map;
    foreach (ch; set.byCodepoint)
        map[ch] = LuckFactor(luckFactor(ch));

    // bits per stage are chosen randomly, feel free to optimize
    auto trie = codepointTrie!(LuckFactor, 8, 5, 8)(map);

    // from now on the AA is not needed
    foreach (ch; set.byCodepoint)
        assert(trie[ch] == luckFactor(ch)); // verify
    // CJK is not Greek, thus it has the default value
    assert(trie['\u4444'] == 0);
    // and here is a couple of quite lucky Greek characters:
    // Greek small letter epsilon with dasia
    assert(trie['\u1F11'] == 3);
    // Ancient Greek metretes sign
    assert(trie['\U00010181'] == 3);

}

/// ditto
public template CodepointTrie(T, sizes...)
if (sumOfIntegerTuple!sizes == 21)
{
    alias Prefix = GetBitSlicing!(21, sizes);
    alias CodepointTrie = typeof(TrieBuilder!(T, dchar, lastDchar+1, Prefix)(T.init).build());
}

// Comparator over (Value, Key) tuples: orders two pairs by `Pred` applied
// to their keys (tuple element 1).
package(std) template cmpK0(alias Pred)
{
    import std.typecons : Tuple;
    static bool cmpK0(Value, Key)
        (Tuple!(Value, Key) a, Tuple!(Value, Key) b)
    {
        return Pred(a[1]) < Pred(b[1]);
    }
}

/**
    The most general utility for construction of `Trie`s
    short of using `TrieBuilder` directly.

    Provides a number of convenience overloads.
    `Args` is tuple of maximum key value followed by
    predicates to construct index from key.
4446 4447 Alternatively if the first argument is not a value convertible to `Key` 4448 then the whole tuple of `Args` is treated as predicates 4449 and the maximum Key is deduced from predicates. 4450 */ 4451 private template buildTrie(Value, Key, Args...) 4452 if (isValidArgsForTrie!(Key, Args)) 4453 { 4454 static if (is(typeof(Args[0]) : Key)) // prefix starts with upper bound on Key 4455 { 4456 alias Prefix = Args[1..$]; 4457 } 4458 else 4459 alias Prefix = Args; 4460 4461 alias getIndex = mapTrieIndex!(Prefix); 4462 4463 // for multi-sort 4464 template GetComparators(size_t n) 4465 { 4466 static if (n > 0) 4467 alias GetComparators = 4468 AliasSeq!(GetComparators!(n-1), cmpK0!(Prefix[n-1])); 4469 else 4470 alias GetComparators = AliasSeq!(); 4471 } 4472 4473 /* 4474 Build `Trie` from a range of a Key-Value pairs, 4475 assuming it is sorted by Key as defined by the following lambda: 4476 ------ 4477 (a, b) => mapTrieIndex!(Prefix)(a) < mapTrieIndex!(Prefix)(b) 4478 ------ 4479 Exception is thrown if it's detected that the above order doesn't hold. 4480 4481 In other words $(LREF mapTrieIndex) should be a 4482 monotonically increasing function that maps `Key` to an integer. 4483 4484 See_Also: $(REF sort, std,_algorithm), 4485 $(REF SortedRange, std,range), 4486 $(REF setUnion, std,_algorithm). 4487 */ 4488 auto buildTrie(Range)(Range range, Value filler=Value.init) 4489 if (isInputRange!Range && is(typeof(Range.init.front[0]) : Value) 4490 && is(typeof(Range.init.front[1]) : Key)) 4491 { 4492 auto builder = TrieBuilder!(Value, Key, Prefix)(filler); 4493 foreach (v; range) 4494 builder.putValue(v[1], v[0]); 4495 return builder.build(); 4496 } 4497 4498 /* 4499 If `Value` is bool (or BitPacked!(bool, x)) then it's possible 4500 to build `Trie` from a range of open-right intervals of `Key`s. 4501 The requirement on the ordering of keys (and the behavior on the 4502 violation of it) is the same as for Key-Value range overload. 
4503 4504 Intervals denote ranges of !`filler` i.e. the opposite of filler. 4505 If no filler provided keys inside of the intervals map to true, 4506 and `filler` is false. 4507 */ 4508 auto buildTrie(Range)(Range range, Value filler=Value.init) 4509 if (is(TypeOfBitPacked!Value == bool) 4510 && isInputRange!Range && is(typeof(Range.init.front[0]) : Key) 4511 && is(typeof(Range.init.front[1]) : Key)) 4512 { 4513 auto builder = TrieBuilder!(Value, Key, Prefix)(filler); 4514 foreach (ival; range) 4515 builder.putRange(ival[0], ival[1], !filler); 4516 return builder.build(); 4517 } 4518 4519 auto buildTrie(Range)(Range range, Value filler, bool unsorted) 4520 if (isInputRange!Range 4521 && is(typeof(Range.init.front[0]) : Value) 4522 && is(typeof(Range.init.front[1]) : Key)) 4523 { 4524 import std.algorithm.sorting : multiSort; 4525 alias Comps = GetComparators!(Prefix.length); 4526 if (unsorted) 4527 multiSort!(Comps)(range); 4528 return buildTrie(range, filler); 4529 } 4530 4531 /* 4532 If `Value` is bool (or BitPacked!(bool, x)) then it's possible 4533 to build `Trie` simply from an input range of `Key`s. 4534 The requirement on the ordering of keys (and the behavior on the 4535 violation of it) is the same as for Key-Value range overload. 4536 4537 Keys found in range denote !`filler` i.e. the opposite of filler. 4538 If no filler provided keys map to true, and `filler` is false. 4539 */ 4540 auto buildTrie(Range)(Range range, Value filler=Value.init) 4541 if (is(TypeOfBitPacked!Value == bool) 4542 && isInputRange!Range && is(typeof(Range.init.front) : Key)) 4543 { 4544 auto builder = TrieBuilder!(Value, Key, Prefix)(filler); 4545 foreach (v; range) 4546 builder.putValue(v, !filler); 4547 return builder.build(); 4548 } 4549 4550 /* 4551 If `Key` is unsigned integer `Trie` could be constructed from array 4552 of values where array index serves as key. 
4553 */ 4554 auto buildTrie()(Value[] array, Value filler=Value.init) 4555 if (isUnsigned!Key) 4556 { 4557 auto builder = TrieBuilder!(Value, Key, Prefix)(filler); 4558 foreach (idx, v; array) 4559 builder.putValue(idx, v); 4560 return builder.build(); 4561 } 4562 4563 /* 4564 Builds `Trie` from associative array. 4565 */ 4566 auto buildTrie(Key, Value)(Value[Key] map, Value filler=Value.init) 4567 { 4568 import std.array : array; 4569 import std.range : zip; 4570 auto range = array(zip(map.values, map.keys)); 4571 return buildTrie(range, filler, true); // sort it 4572 } 4573 } 4574 4575 // helper in place of assumeSize to 4576 //reduce mangled name & help DMD inline Trie functors 4577 struct clamp(size_t bits) 4578 { 4579 static size_t opCall(T)(T arg){ return arg; } 4580 enum bitSize = bits; 4581 } 4582 4583 struct clampIdx(size_t idx, size_t bits) 4584 { 4585 static size_t opCall(T)(T arg){ return arg[idx]; } 4586 enum bitSize = bits; 4587 } 4588 4589 /** 4590 Conceptual type that outlines the common properties of all UTF Matchers. 4591 4592 Note: For illustration purposes only, every method 4593 call results in assertion failure. 4594 Use $(LREF utfMatcher) to obtain a concrete matcher 4595 for UTF-8 or UTF-16 encodings. 4596 */ 4597 public struct MatcherConcept 4598 { 4599 /** 4600 $(P Perform a semantic equivalent 2 operations: 4601 decoding a $(CODEPOINT) at front of `inp` and testing if 4602 it belongs to the set of $(CODEPOINTS) of this matcher. ) 4603 4604 $(P The effect on `inp` depends on the kind of function called:) 4605 4606 $(P Match. If the codepoint is found in the set then range `inp` 4607 is advanced by its size in $(S_LINK Code unit, code units), 4608 otherwise the range is not modifed.) 4609 4610 $(P Skip. The range is always advanced by the size 4611 of the tested $(CODEPOINT) regardless of the result of test.) 4612 4613 $(P Test. The range is left unaffected regardless 4614 of the result of test.) 
*/
    public bool match(Range)(ref Range inp)
    if (isRandomAccessRange!Range && is(ElementType!Range : char))
    {
        assert(false);
    }

    ///ditto
    public bool skip(Range)(ref Range inp)
    if (isRandomAccessRange!Range && is(ElementType!Range : char))
    {
        assert(false);
    }

    ///ditto
    public bool test(Range)(ref Range inp)
    if (isRandomAccessRange!Range && is(ElementType!Range : char))
    {
        assert(false);
    }
    ///
    pure @safe unittest
    {
        string truth = "2² = 4";
        auto m = utfMatcher!char(unicode.Number);
        assert(m.match(truth)); // '2' is a number all right
        assert(truth == "² = 4"); // skips on match
        assert(m.match(truth)); // so is the superscript '2'
        assert(!m.match(truth)); // space is not a number
        assert(truth == " = 4"); // unaffected on no match
        assert(!m.skip(truth)); // same test ...
        assert(truth == "= 4"); // but skips a codepoint regardless
        assert(!m.test(truth)); // '=' is not a number
        assert(truth == "= 4"); // test never affects argument
    }

    /**
        Advanced feature - provide direct access to a subset of matcher based on
        a set of known encoding lengths. Lengths are provided in
        $(S_LINK Code unit, code units). The sub-matcher then may do less
        operations per any `test`/`match`.

        Use with care as the sub-matcher won't match
        any $(CODEPOINTS) that have encoded length that doesn't belong
        to the selected set of lengths. Also the sub-matcher object references
        the parent matcher and must not be used past the lifetime
        of the latter.

        Another caveat of using sub-matcher is that skip is not available
        precisely because sub-matcher doesn't detect all lengths.
    */
    @property auto subMatcher(Lengths...)()
    {
        assert(0);
        // never reached; gives the `auto` return type something to deduce from
        return this;
    }

    pure @safe unittest
    {
        auto m = utfMatcher!char(unicode.Number);
        string square = "2²";
        // about sub-matchers
        assert(!m.subMatcher!(2,3,4).test(square)); // ASCII not covered
        assert(m.subMatcher!1.match(square)); // ASCII-only, works
        assert(!m.subMatcher!1.test(square)); // unicode '²'
        assert(m.subMatcher!(2,3,4).match(square));  //
        assert(square == "");
        wstring wsquare = "2²";
        auto m16 = utfMatcher!wchar(unicode.Number);
        // may keep ref, but the original (m16) must be kept alive
        auto bmp = m16.subMatcher!1;
        assert(bmp.match(wsquare)); // Okay, in basic multilingual plane
        assert(bmp.match(wsquare)); // And '²' too
    }
}

/**
    Test if `M` is a UTF Matcher for ranges of code units of type `C`.

    `M` must provide `match` and `test` returning `bool` for both the internal
    `decoder` range and a plain `C[]`; `skip` is optional (sub-matchers do not
    have it), but when present it must satisfy the same requirement.
*/
public enum isUtfMatcher(M, C) = __traits(compiles, (){
    C[] s;
    auto d = s.decoder;
    M m;
    // These have to be *static* asserts: a run-time `assert(is(...))` compiles
    // no matter what the `is` expression evaluates to, which would make the
    // whole trait hold for any default-constructible `M`.
    static assert(is(typeof(m.match(d)) == bool));
    static assert(is(typeof(m.test(d)) == bool));
    static if (is(typeof(m.skip(d))))
    {
        static assert(is(typeof(m.skip(d)) == bool));
        static assert(is(typeof(m.skip(s)) == bool));
    }
    static assert(is(typeof(m.match(s)) == bool));
    static assert(is(typeof(m.test(s)) == bool));
});

pure @safe unittest
{
    alias CharMatcher = typeof(utfMatcher!char(CodepointSet.init));
    alias WcharMatcher = typeof(utfMatcher!wchar(CodepointSet.init));
    static assert(isUtfMatcher!(CharMatcher, char));
    static assert(isUtfMatcher!(CharMatcher, immutable(char)));
    static assert(isUtfMatcher!(WcharMatcher, wchar));
    static assert(isUtfMatcher!(WcharMatcher, immutable(wchar)));
}

// How a matcher primitive treats its input range after testing the front
// code point.
enum Mode {
    alwaysSkip,
    neverSkip,
    skipOnMatch
}

// Forwards the string overloads of match/skip/test to the range overloads.
mixin template ForwardStrings()
{
    private bool fwdStr(string fn, C)(ref C[] str) const @trusted
    {
        import std.utf : byCodeUnit;
alias type = typeof(byCodeUnit(str));
        // reinterpret the array in place as its byCodeUnit wrapper so that
        // the range overload named by `fn` advances the caller's string
        return mixin(fn~"(*cast(type*)&str)");
    }
}

template Utf8Matcher()
{
    enum validSize(int sz) = sz >= 1 && sz <= 4;

    void badEncoding() pure @safe
    {
        import std.utf : UTFException;
        throw new UTFException("Invalid UTF-8 sequence");
    }

    //for 1-stage ASCII
    alias AsciiSpec = AliasSeq!(bool, char, clamp!7);
    //for 2-stage lookup of 2 byte UTF-8 sequences
    alias Utf8Spec2 = AliasSeq!(bool, char[2],
        clampIdx!(0, 5), clampIdx!(1, 6));
    //ditto for 3 byte
    alias Utf8Spec3 = AliasSeq!(bool, char[3],
        clampIdx!(0, 4),
        clampIdx!(1, 6),
        clampIdx!(2, 6)
    );
    //ditto for 4 byte
    alias Utf8Spec4 = AliasSeq!(bool, char[4],
        clampIdx!(0, 3), clampIdx!(1, 6),
        clampIdx!(2, 6), clampIdx!(3, 6)
    );
    alias Tables = AliasSeq!(
        typeof(TrieBuilder!(AsciiSpec)(false).build()),
        typeof(TrieBuilder!(Utf8Spec2)(false).build()),
        typeof(TrieBuilder!(Utf8Spec3)(false).build()),
        typeof(TrieBuilder!(Utf8Spec4)(false).build())
    );
    alias Table(int size) = Tables[size-1];

    // payload bits of the lead byte of a `size`-byte sequence
    enum leadMask(size_t size) = (cast(size_t) 1<<(7 - size))-1;
    // marker bits of the lead byte of a `size`-byte sequence (0xC0, 0xE0, 0xF0)
    enum encMask(size_t size) = ((1 << size)-1)<<(8-size);

    // strip the 0b10 marker of a continuation byte, throw if it is not one
    char truncate()(char ch) pure @safe
    {
        ch -= 0x80;
        if (ch < 0x40)
        {
            return ch;
        }
        else
        {
            badEncoding();
            return cast(char) 0;
        }
    }

    // UTF-8 encode `ch` and strip the marker bits from every byte, leaving
    // only the payload the tries are keyed on
    static auto encode(size_t sz)(dchar ch)
    if (sz > 1)
    {
        import std.utf : encodeUTF = encode;
        char[4] buf;
        encodeUTF(buf, ch);
        char[sz] ret;
        buf[0] &= leadMask!sz;
        foreach (n; 1 .. sz)
            buf[n] = buf[n] & 0x3f; //keep 6 lower bits
        ret[] = buf[0 .. sz];
        return ret;
    }

    // Split `set` by encoded length and build one trie per length.
    auto build(Set)(Set set)
    {
        import std.algorithm.iteration : map;
        auto ascii = set & unicode.ASCII;
        auto utf8_2 = set & CodepointSet(0x80, 0x800);
        // Surrogates have no UTF-8 encoding; left in (e.g. by an inverted
        // set) they would be handed to std.utf.encode, which rejects them.
        auto utf8_3 = (set & CodepointSet(0x800, 0x1_0000))
            - CodepointSet(0xD800, 0xDFFF+1);
        auto utf8_4 = set & CodepointSet(0x1_0000, lastDchar+1);
        auto asciiT = ascii.byCodepoint.map!(x=>cast(char) x).buildTrie!(AsciiSpec);
        auto utf8_2T = utf8_2.byCodepoint.map!(x=>encode!2(x)).buildTrie!(Utf8Spec2);
        auto utf8_3T = utf8_3.byCodepoint.map!(x=>encode!3(x)).buildTrie!(Utf8Spec3);
        auto utf8_4T = utf8_4.byCodepoint.map!(x=>encode!4(x)).buildTrie!(Utf8Spec4);
        alias Ret = Impl!(1,2,3,4);
        return Ret(asciiT, utf8_2T, utf8_3T, utf8_4T);
    }

    // Bootstrap UTF-8 static matcher interface
    // from 3 primitives: tab!(size), lookup and Sizes
    mixin template DefMatcher()
    {
        import std.format : format;
        import std.meta : Erase, staticIndexOf;
        enum hasASCII = staticIndexOf!(1, Sizes) >= 0;
        alias UniSizes = Erase!(1, Sizes);

        //generate dispatch code sequence for unicode parts
        //(the generated code refers to `ch`, `mode` and `inp` of the host function)
        static auto genDispatch()
        {
            string code;
            foreach (size; UniSizes)
                code ~= format(q{
                    if ((ch & ~leadMask!%d) == encMask!(%d))
                        return lookup!(%d, mode)(inp);
                    else
                }, size, size, size);
            static if (Sizes.length == 4) //covers all code unit cases
                code ~= "{ badEncoding(); return false; }";
            else
                code ~= "return false;"; //may be just fine but not covered
            return code;
        }
        enum dispatch = genDispatch();

        public bool match(Range)(ref Range inp) const
        if (isRandomAccessRange!Range && is(ElementType!Range : char) &&
            !isDynamicArray!Range)
        {
            enum mode = Mode.skipOnMatch;
            assert(!inp.empty);
            immutable ch = inp[0];
            static if (hasASCII)
            {
                if (ch < 0x80)
                {
                    immutable r = tab!1[ch];
                    if (r)
                        inp.popFront();
                    return r;
                }
                else
                    mixin(dispatch);
            }
            else
                mixin(dispatch);
        }

        static if (Sizes.length == 4) // can skip iff can detect all encodings
        {
            public bool skip(Range)(ref Range inp) const
            if (isRandomAccessRange!Range && is(ElementType!Range : char) &&
                !isDynamicArray!Range)
            {
                enum mode = Mode.alwaysSkip;
                assert(!inp.empty);
                auto ch = inp[0];
                static if (hasASCII)
                {
                    if (ch < 0x80)
                    {
                        inp.popFront();
                        return tab!1[ch];
                    }
                    else
                        mixin(dispatch);
                }
                else
                    mixin(dispatch);
            }
        }

        public bool test(Range)(ref Range inp) const
        if (isRandomAccessRange!Range && is(ElementType!Range : char) &&
            !isDynamicArray!Range)
        {
            enum mode = Mode.neverSkip;
            assert(!inp.empty);
            auto ch = inp[0];

            static if (hasASCII)
            {
                if (ch < 0x80)
                    return tab!1[ch];
                else
                    mixin(dispatch);
            }
            else
                mixin(dispatch);
        }

        bool match(C)(ref C[] str) const
        if (isSomeChar!C)
        {
            return fwdStr!"match"(str);
        }

        bool skip(C)(ref C[] str) const
        if (isSomeChar!C)
        {
            return fwdStr!"skip"(str);
        }

        bool test(C)(ref C[] str) const
        if (isSomeChar!C)
        {
            return fwdStr!"test"(str);
        }

        mixin ForwardStrings;
    }

    struct Impl(Sizes...)
    {
        import std.meta : allSatisfy, staticMap;
        static assert(allSatisfy!(validSize, Sizes),
            "Only lengths of 1, 2, 3 and 4 code unit are possible for UTF-8");
    private:
        //pick tables for chosen sizes
        alias OurTabs = staticMap!(Table, Sizes);
        OurTabs tables;
        mixin DefMatcher;
        //static dispatch helper UTF size ==> table
        alias tab(int i) = tables[i - 1];

        package(std) @property CherryPick!(Impl, SizesToPick) subMatcher(SizesToPick...)()
        {
            return CherryPick!(Impl, SizesToPick)(&this);
        }

        // Decode a `size`-byte sequence at the front of `inp` into a trie key,
        // reject truncated, malformed and overlong forms, then test the key
        // and advance `inp` as dictated by `mode`.
        bool lookup(int size, Mode mode, Range)(ref Range inp) const
        {
            import std.range : popFrontN;
            if (inp.length < size)
            {
                badEncoding();
                return false;
            }
            char[size] needle = void;
            needle[0] = leadMask!size & inp[0];
            static foreach (i; 1 .. size)
            {
                needle[i] = truncate(inp[i]);
            }
            //overlong encoding checks
            static if (size == 2)
            {
                //0x80-0x7FF
                //got 6 bits in needle[1], must use at least 8 bits
                //must use at least 2 bits in needle[0]
                if (needle[0] < 2) badEncoding();
            }
            else static if (size == 3)
            {
                //0x800-0xFFFF
                //got 6 bits in needle[2], must use at least 12bits
                //must use 6 bits in needle[1] or anything in needle[0]
                if (needle[0] == 0 && needle[1] < 0x20) badEncoding();
            }
            else static if (size == 4)
            {
                //0x10000-0x10FFFF
                //got 2x6=12 bits in needle[2 .. 4], must use at least 17bits
                //must use 5 bits (or above) in needle[1] or anything in needle[0]
                if (needle[0] == 0 && needle[1] < 0x10) badEncoding();
            }
            static if (mode == Mode.alwaysSkip)
            {
                inp.popFrontN(size);
                return tab!size[needle];
            }
            else static if (mode == Mode.neverSkip)
            {
                return tab!size[needle];
            }
            else
            {
                static assert(mode == Mode.skipOnMatch);

                if (tab!size[needle])
                {
                    inp.popFrontN(size);
                    return true;
                }
                else
                    return false;
            }
        }
    }

    // A view over a parent `Impl` restricted to the chosen `Sizes`;
    // holds a pointer to the parent and must not outlive it.
    struct CherryPick(I, Sizes...)
    {
        import std.meta : allSatisfy;
        static assert(allSatisfy!(validSize, Sizes),
            "Only lengths of 1, 2, 3 and 4 code unit are possible for UTF-8");
    private:
        I* m;
        @property auto tab(int i)() const { return m.tables[i - 1]; }
        bool lookup(int size, Mode mode, Range)(ref Range inp) const
        {
            return m.lookup!(size, mode)(inp);
        }
        mixin DefMatcher;
    }
}

template Utf16Matcher()
{
    enum validSize(int sz) = sz >= 1 && sz <= 2;

    void badEncoding() pure @safe
    {
        import std.utf : UTFException;
        throw new UTFException("Invalid UTF-16 sequence");
    }

    // 1-stage ASCII
    alias AsciiSpec = AliasSeq!(bool, wchar, clamp!7);
    //2-stage BMP
    alias BmpSpec = AliasSeq!(bool, wchar, sliceBits!(7, 16), sliceBits!(0, 7));
    //4-stage - full Unicode
    //assume that 0xD800 & 0xDC00 bits are cleared
    //thus leaving 10 bit per wchar to worry about
    alias UniSpec = AliasSeq!(bool, wchar[2],
        assumeSize!(x=>x[0]>>4, 6), assumeSize!(x=>x[0]&0xf, 4),
        assumeSize!(x=>x[1]>>6, 4), assumeSize!(x=>x[1]&0x3f, 6),
    );
    alias Ascii = typeof(TrieBuilder!(AsciiSpec)(false).build());
    alias Bmp = typeof(TrieBuilder!(BmpSpec)(false).build());
    alias Uni = typeof(TrieBuilder!(UniSpec)(false).build());

    // key of a supplementary code point: the payload bits of its surrogate pair
    auto encode2(dchar ch)
    {
        ch
-= 0x1_0000;
        assert(ch <= 0xF_FFFF);
        wchar[2] ret;
        //do not put surrogate bits, they are sliced off
        ret[0] = cast(wchar)(ch >> 10);
        // low 10 bits only: UniSpec slices 4+6 bits out of this unit and
        // lookupUni builds its needle with `& 0x3ff`
        ret[1] = (ch & 0x3FF);
        return ret;
    }

    // Split `set` into ASCII, the rest of the BMP and supplementary planes
    // and build one trie for each part.
    auto build(Set)(Set set)
    {
        import std.algorithm.iteration : map;
        auto ascii = set & unicode.ASCII;
        auto bmp = (set & CodepointSet.fromIntervals(0x80, 0xFFFF+1))
            - CodepointSet.fromIntervals(0xD800, 0xDFFF+1);
        // Take the supplementary planes explicitly. Subtracting `bmp | ascii`
        // from `set` would also keep any surrogate code points of `set`
        // (e.g. from an inverted set), and encode2 cannot represent those:
        // its `ch -= 0x1_0000` wraps around for anything below 0x1_0000.
        auto other = set & CodepointSet.fromIntervals(0x1_0000, lastDchar+1);
        auto asciiT = ascii.byCodepoint.map!(x=>cast(char) x).buildTrie!(AsciiSpec);
        auto bmpT = bmp.byCodepoint.map!(x=>cast(wchar) x).buildTrie!(BmpSpec);
        auto otherT = other.byCodepoint.map!(x=>encode2(x)).buildTrie!(UniSpec);
        alias Ret = Impl!(1,2);
        return Ret(asciiT, bmpT, otherT);
    }

    //bootstrap full UTF-16 matcher interface from
    //sizeFlags, lookupUni and ascii
    mixin template DefMatcher()
    {
        public bool match(Range)(ref Range inp) const
        if (isRandomAccessRange!Range && is(ElementType!Range : wchar) &&
            !isDynamicArray!Range)
        {
            enum mode = Mode.skipOnMatch;
            assert(!inp.empty);
            immutable ch = inp[0];
            static if (sizeFlags & 1)
            {
                if (ch < 0x80)
                {
                    if (ascii[ch])
                    {
                        inp.popFront();
                        return true;
                    }
                    else
                        return false;
                }
                return lookupUni!mode(inp);
            }
            else
                return lookupUni!mode(inp);
        }

        static if (Sizes.length == 2)
        {
            public bool skip(Range)(ref Range inp) const
            if (isRandomAccessRange!Range && is(ElementType!Range : wchar) &&
                !isDynamicArray!Range)
            {
                enum mode = Mode.alwaysSkip;
                assert(!inp.empty);
                immutable ch = inp[0];
                static if (sizeFlags & 1)
                {
                    if (ch < 0x80)
                    {
                        inp.popFront();
                        return ascii[ch];
                    }
                    else
                        return lookupUni!mode(inp);
                }
                else
                    return lookupUni!mode(inp);
            }
        }

        public bool test(Range)(ref Range inp) const
        if (isRandomAccessRange!Range && is(ElementType!Range : wchar) &&
            !isDynamicArray!Range)
        {
            enum mode = Mode.neverSkip;
            assert(!inp.empty);
            auto ch = inp[0];
            static if (sizeFlags & 1)
                return ch < 0x80 ? ascii[ch] : lookupUni!mode(inp);
            else
                return lookupUni!mode(inp);
        }

        bool match(C)(ref C[] str) const
        if (isSomeChar!C)
        {
            return fwdStr!"match"(str);
        }

        bool skip(C)(ref C[] str) const
        if (isSomeChar!C)
        {
            return fwdStr!"skip"(str);
        }

        bool test(C)(ref C[] str) const
        if (isSomeChar!C)
        {
            return fwdStr!"test"(str);
        }

        mixin ForwardStrings; //dispatch strings to range versions
    }

    struct Impl(Sizes...)
    if (Sizes.length >= 1 && Sizes.length <= 2)
    {
    private:
        import std.meta : allSatisfy;
        static assert(allSatisfy!(validSize, Sizes),
            "Only lengths of 1 and 2 code units are possible in UTF-16");
        // bit 0 set: single code unit forms, bit 1 set: surrogate pairs
        static if (Sizes.length > 1)
            enum sizeFlags = Sizes[0] | Sizes[1];
        else
            enum sizeFlags = Sizes[0];

        static if (sizeFlags & 1)
        {
            Ascii ascii;
            Bmp bmp;
        }
        static if (sizeFlags & 2)
        {
            Uni uni;
        }
        mixin DefMatcher;

        package(std) @property CherryPick!(Impl, SizesToPick) subMatcher(SizesToPick...)()
        {
            return CherryPick!(Impl, SizesToPick)(&this);
        }

        // Test the non-ASCII code point at the front of `inp` and advance
        // `inp` as dictated by `mode`; throws on a lone or mismatched surrogate.
        bool lookupUni(Mode mode, Range)(ref Range inp) const
        {
            wchar x = cast(wchar)(inp[0] - 0xD800);
            //not a high surrogate
            if (x > 0x3FF)
            {
                //low surrogate
                if (x <= 0x7FF) badEncoding();
                static if (sizeFlags & 1)
                {
                    auto ch = inp[0];
                    static if (mode == Mode.alwaysSkip)
                        inp.popFront();
                    static if (mode == Mode.skipOnMatch)
                    {
                        if (bmp[ch])
                        {
                            inp.popFront();
                            return true;
                        }
                        else
                            return false;
                    }
                    else
                        return bmp[ch];
                }
                else //skip is not available for sub-matchers, so just false
                    return false;
            }
            else
            {
                import std.range : popFrontN;
                static if (sizeFlags & 2)
                {
                    if (inp.length < 2)
                        badEncoding();
                    wchar y = cast(wchar)(inp[1] - 0xDC00);
                    //not a low surrogate
                    if (y > 0x3FF)
                        badEncoding();
                    // same key layout as produced by encode2
                    wchar[2] needle = [inp[0] & 0x3ff, inp[1] & 0x3ff];
                    static if (mode == Mode.alwaysSkip)
                        inp.popFrontN(2);
                    static if (mode == Mode.skipOnMatch)
                    {
                        if (uni[needle])
                        {
                            inp.popFrontN(2);
                            return true;
                        }
                        else
                            return false;
                    }
                    else
                        return uni[needle];
                }
                else //ditto
                    return false;
            }
        }
    }

    // A view over a parent `Impl`; holds a pointer to the parent and must
    // not outlive it. Lookups are delegated to the parent.
    struct CherryPick(I, Sizes...)
    if (Sizes.length >= 1 && Sizes.length <= 2)
    {
    private:
        import std.meta : allSatisfy;
        I* m;
        enum sizeFlags = I.sizeFlags;

        static if (sizeFlags & 1)
        {
            @property auto ascii()() const { return m.ascii; }
        }

        bool lookupUni(Mode mode, Range)(ref Range inp) const
        {
            return m.lookupUni!mode(inp);
        }
        mixin DefMatcher;
        static assert(allSatisfy!(validSize, Sizes),
            "Only lengths of 1 and 2 code units are possible in UTF-16");
    }
}

private auto utf8Matcher(Set)(Set set)
{
    return Utf8Matcher!().build(set);
}

private auto utf16Matcher(Set)(Set set)
{
    return Utf16Matcher!().build(set);
}

/**
    Constructs a matcher object
    to classify $(CODEPOINTS) from the `set` for encoding
    that has `Char` as code unit.

    See $(LREF MatcherConcept) for API outline.
*/
public auto utfMatcher(Char, Set)(Set set)
if (isCodepointSet!Set)
{
    // Dispatch on the code unit type; only UTF-8 and UTF-16 have matchers.
    static if (is(Char : char))
        return utf8Matcher(set);
    else static if (is(Char : wchar))
        return utf16Matcher(set);
    else static if (is(Char : dchar))
        // Concatenated so the diagnostic is one line instead of carrying a
        // raw newline plus source indentation inside the literal.
        static assert(false, "UTF-32 needs no decoding, "
            ~ "and thus not supported by utfMatcher");
    else
        static assert(false, "Only character types 'char' and 'wchar' are allowed");
}


// A range of code units packed together with a read index, to speed up
// forward iteration: popFront only bumps `idx` instead of re-slicing `str`.
//
// Params:
//     s      = the code units to iterate over (char or wchar based)
//     offset = initial value of the read index into `s`
// Returns: a random-access range whose element 0 is `s[offset]`.
package(std) auto decoder(C)(C[] s, size_t offset=0)
if (is(C : wchar) || is(C : char))
{
    static struct Decoder
    {
    pure nothrow:
        C[] str;    // underlying code units; only its end moves (popBack/opSlice)
        size_t idx; // index of the current front inside `str`
        @property C front(){ return str[idx]; }
        @property C back(){ return str[$-1]; }
        void popFront(){ idx++; }
        void popBack(){ str = str[0..$-1]; }
        void popFrontN(size_t n){ idx += n; }
        @property bool empty(){ return idx == str.length; }
        @property auto save(){ return this; }
        // indices are relative to the current front
        auto opIndex(size_t i){ return str[idx+i]; }
        @property size_t length(){ return str.length - idx; }
        alias opDollar = length;
        // keeps the original start of `str`, so `idx` stays an absolute offset
        auto opSlice(size_t a, size_t b){ return Decoder(str[0 .. idx+b], idx+a); }
    }
    static assert(isRandomAccessRange!Decoder);
    static assert(is(ElementType!Decoder : C));
    return Decoder(s, offset);
}

// test/match/skip on a UTF-8 decoder range, for the full matcher and sub-matchers
pure @safe unittest
{
    string rs = "hi! ネемног砀 текста";
    auto codec = rs.decoder;
    auto utf8 = utf8Matcher(unicode.Letter);
    auto asc = utf8.subMatcher!(1);
    auto uni = utf8.subMatcher!(2,3,4);

    // h
    assert(asc.test(codec));
    assert(!uni.match(codec));
    assert(utf8.skip(codec));
    assert(codec.idx == 1);

    // i
    assert(asc.test(codec));
    assert(!uni.match(codec));
    assert(utf8.skip(codec));
    assert(codec.idx == 2);

    // !
    assert(!asc.match(codec));
    assert(!utf8.test(codec));
    assert(!utf8.skip(codec));
    assert(codec.idx == 3);

    // space
    assert(!asc.test(codec));
    assert(!utf8.test(codec));
    assert(!utf8.skip(codec));
    assert(codec.idx == 4);

    // the seven non-ASCII letters that follow the space
    assert(utf8.test(codec));
    foreach (i; 0 .. 7)
    {
        assert(!asc.test(codec));
        assert(uni.test(codec));
        assert(utf8.skip(codec));
    }
    assert(!utf8.test(codec));
    assert(!utf8.skip(codec));

    //the same with match where applicable
    codec = rs.decoder;
    assert(utf8.match(codec));
    assert(codec.idx == 1);
    assert(utf8.match(codec));
    assert(codec.idx == 2);
    // match leaves the index alone on a mismatch ...
    assert(!utf8.match(codec));
    assert(codec.idx == 2);
    // ... while skip advances regardless ('!' and the space)
    assert(!utf8.skip(codec));
    assert(!utf8.skip(codec));

    foreach (i; 0 .. 7)
    {
        assert(!asc.test(codec));
        assert(utf8.test(codec));
        assert(utf8.match(codec));
    }
    auto i = codec.idx;
    assert(!utf8.match(codec));
    assert(codec.idx == i);
}

// test, match and skip must agree with each other for every matcher flavour
pure @system unittest
{
    import std.range : stride;
    // Runs test/match (and skip, when the matcher provides it) on the same
    // position, restoring `r.idx` in between, and checks that they agree.
    static bool testAll(Matcher, Range)(ref Matcher m, ref Range r) @safe
    {
        bool t = m.test(r);
        auto save = r.idx;
        assert(t == m.match(r));
        assert(r.idx == save || t); //either no change or was match
        r.idx = save;
        static if (is(typeof(m.skip(r))))
        {
            assert(t == m.skip(r));
            assert(r.idx != save); //always changed
            r.idx = save;
        }
        return t;
    }
    auto utf16 = utfMatcher!wchar(unicode.L);
    auto bmp = utf16.subMatcher!1;
    // two code units (surrogate pairs); was a copy of the `bmp` line
    auto nonBmp = utf16.subMatcher!2;
    auto utf8 = utfMatcher!char(unicode.L);
    auto ascii = utf8.subMatcher!1;
    auto uni2 = utf8.subMatcher!2;
    auto uni3 = utf8.subMatcher!3;
    auto uni24 = utf8.subMatcher!(2,4);
    foreach (ch; unicode.L.byCodepoint.stride(3))
    {
        import std.utf : encode;
        char[4] buf;
        wchar[2] buf16;
        auto len = encode(buf, ch);
        auto len16 = encode(buf16, ch);
        auto c8 = buf[0 .. len].decoder;
        auto c16 = buf16[0 .. len16].decoder;
        assert(testAll(utf16, c16));
        assert(testAll(bmp, c16) || len16 != 1);
        assert(testAll(nonBmp, c16) || len16 != 2);

        assert(testAll(utf8, c8));

        //submatchers return false on out of their domain
        assert(testAll(ascii, c8) || len != 1);
        assert(testAll(uni2, c8) || len != 2);
        assert(testAll(uni3, c8) || len != 3);
        assert(testAll(uni24, c8) || (len != 2 && len != 4));
    }
}

// cover decode fail cases of Matcher
pure @safe unittest
{
    import std.algorithm.iteration : map;
    import std.exception : collectException;
    import std.format : format;
    auto utf16 = utfMatcher!wchar(unicode.L);
    auto utf8 = utfMatcher!char(unicode.L);
    //decode failure cases UTF-8
    // NOTE(review): `\0x00` / `\0x80` below lex as the octal escape `\0`
    // followed by literal text, not as `\x00` / `\x80`. Kept byte-for-byte:
    // rewriting them would change which byte sequences are exercised.
    alias fails8 = AliasSeq!("\xC1", "\x80\x00","\xC0\x00", "\xCF\x79",
        "\xFF\x00\0x00\0x00\x00", "\xC0\0x80\0x80\x80", "\x80\0x00\0x00\x00",
        "\xCF\x00\0x00\0x00\x00");
    foreach (msg; fails8)
    {
        // on failure the assert message is the offending bytes in hex
        assert(collectException((){
            auto s = msg;
            utf8.test(s);
        }()), format("%( %2x %)", cast(immutable(ubyte)[]) msg));
    }
    //decode failure cases UTF-16
    alias fails16 = AliasSeq!([0xD811], [0xDC02]);
    foreach (msg; fails16)
    {
        assert(collectException((){
            auto s = msg.map!(x => cast(wchar) x);
            utf16.test(s);
        }()));
    }
}

/++
    Convenience function to construct optimal configurations for
    packed Trie from any `set` of $(CODEPOINTS).

    The parameter `level` indicates the number of trie levels to use,
    allowed values are: 1, 2, 3 or 4. Levels represent different trade-offs
    speed-size wise.

    $(P Level 1 is fastest and the most memory hungry (a bit array). )
    $(P Level 4 is the slowest and has the smallest footprint.
)

    See the $(S_LINK Synopsis, Synopsis) section for example.

    Note:
    Level 4 stays very practical (being faster and more predictable)
    compared to using direct lookup on the `set` itself.


+/
public auto toTrie(size_t level, Set)(Set set)
if (isCodepointSet!Set)
{
    // Each level count maps to a fixed split of the 21 code point bits
    // (the bit widths in every configuration below sum to 21).
    static if (level == 1)
        return codepointSetTrie!(21)(set);
    else static if (level == 2)
        return codepointSetTrie!(10, 11)(set);
    else static if (level == 3)
        return codepointSetTrie!(8, 5, 8)(set);
    else static if (level == 4)
        return codepointSetTrie!(6, 4, 4, 7)(set);
    else
        // Reached for level 0 as well as for levels above 4, so the
        // diagnostic names the supported range instead of only "> 4".
        static assert(false,
            "toTrie supports only levels 1, 2, 3 and 4, use codepointSetTrie directly");
}

/**
    $(P Builds a `Trie` with typically optimal speed-size trade-off
    and wraps it into a delegate of the following type:
    $(D bool delegate(dchar ch)). )

    $(P Effectively this creates a 'tester' lambda suitable
    for algorithms like std.algorithm.find that take unary predicates. )

    See the $(S_LINK Synopsis, Synopsis) section for example.
*/
public auto toDelegate(Set)(Set set)
if (isCodepointSet!Set)
{
    // 3 is very small and is almost as fast as 2-level (due to CPU caches?)
    auto t = toTrie!3(set);
    // the returned lambda captures the trie `t` built above
    return (dchar ch) => t[ch];
}

/**
    $(P Opaque wrapper around unsigned built-in integers and
    code unit (char/wchar/dchar) types.
    Parameter `sz` indicates that the value is confined
    to the range of [0, 2^^sz$(RPAREN). With this knowledge it can be
    packed more tightly when stored in certain
    data-structures like trie. )

    Note:
    $(P The $(D BitPacked!(T, sz)) is implicitly convertible to `T`
    but not vice versa. Users have to ensure the value fits in
    the range required and use the `cast`
    operator to perform the conversion.)
*/
struct BitPacked(T, size_t sz)
if (isIntegral!T || is(T:dchar))
{
    // number of bits the stored value is promised to fit in
    enum bitSize = sz;
    // the wrapped value; `alias this` makes the wrapper readable as a `T`
    T _value;
    alias _value this;
}

/*
    Yields the number of bits needed for the single argument:
    its own `bitSize` member when it has one, otherwise the `bitSize` of the
    `BitPacked` type it returns when it is a functor returning one,
    otherwise the full storage width of the type (`sizeof` in bits).
*/
template bitSizeOf(Args...)
if (Args.length == 1)
{
    import std.traits : ReturnType;
    alias Subject = Args[0];
    // probe for a `bitSize` member usable as a size_t
    static if (__traits(compiles, { size_t probe = Subject.bitSize; }))
        enum bitSizeOf = Subject.bitSize;
    else static if (is(ReturnType!Subject Unused == BitPacked!(Base, width), Base, size_t width))
        enum bitSizeOf = bitSizeOf!(ReturnType!Subject);
    else
        enum bitSizeOf = Subject.sizeof * 8;
}

/**
    Evaluates to `true` when `T` is an instantiation of
    $(LREF BitPacked)!(U, x), i.e. a type suitable for packing,
    and to `false` otherwise.
*/
template isBitPacked(T)
{
    enum bool isBitPacked = is(T == BitPacked!(Base, width), Base, size_t width);
}

/**
    Unwraps $(LREF BitPacked)!(U, x) to its underlying type `U`;
    any other type `T` is passed through unchanged.
*/
template TypeOfBitPacked(T)
{
    static if (is(T Unused == BitPacked!(Base, width), Base, size_t width))
        alias TypeOfBitPacked = Base;
    else
        alias TypeOfBitPacked = T;
}

/*
    Tags the unary callable `Fn` with a bit width for use in definitions of
    custom data structures built from the `Trie` template: wrapping `Fn`
    states that the value it returns always fits within `bits` bits.
    Calling the wrapper type simply forwards to `Fn`.
*/
struct assumeSize(alias Fn, size_t bits)
{
    enum bitSize = bits;
    static auto ref opCall(Arg)(auto ref Arg input)
    {
        return Fn(input);
    }
}

/*
    A helper for defining lambda function that yields a slice
    of certain bits from an unsigned integral value.
5615 The resulting lambda is wrapped in assumeSize and can be used directly 5616 with `Trie` template. 5617 */ 5618 struct sliceBits(size_t from, size_t to) 5619 { 5620 //for now bypass assumeSize, DMD has trouble inlining it 5621 enum bitSize = to-from; 5622 static auto opCall(T)(T x) 5623 out(result) 5624 { 5625 assert(result < (1 << to-from)); 5626 } 5627 do 5628 { 5629 static assert(from < to); 5630 static if (from == 0) 5631 return x & ((1 << to)-1); 5632 else 5633 return (x >> from) & ((1<<(to-from))-1); 5634 } 5635 } 5636 5637 @safe pure nothrow @nogc uint low_8(uint x) { return x&0xFF; } 5638 @safe pure nothrow @nogc uint midlow_8(uint x){ return (x&0xFF00)>>8; } 5639 alias lo8 = assumeSize!(low_8, 8); 5640 alias mlo8 = assumeSize!(midlow_8, 8); 5641 5642 @safe pure nothrow @nogc unittest 5643 { 5644 static assert(bitSizeOf!lo8 == 8); 5645 static assert(bitSizeOf!(sliceBits!(4, 7)) == 3); 5646 static assert(bitSizeOf!(BitPacked!(uint, 2)) == 2); 5647 } 5648 5649 template Sequence(size_t start, size_t end) 5650 { 5651 static if (start < end) 5652 alias Sequence = AliasSeq!(start, Sequence!(start+1, end)); 5653 else 5654 alias Sequence = AliasSeq!(); 5655 } 5656 5657 //---- TRIE TESTS ---- 5658 @system unittest 5659 { 5660 import std.algorithm.iteration : map; 5661 import std.algorithm.sorting : sort; 5662 import std.array : array; 5663 import std.conv : text, to; 5664 import std.range : iota; 5665 static trieStats(TRIE)(TRIE t) 5666 { 5667 version (std_uni_stats) 5668 { 5669 import std.stdio : writefln, writeln; 5670 writeln("---TRIE FOOTPRINT STATS---"); 5671 static foreach (i; 0 .. t.table.dim) 5672 { 5673 writefln("lvl%s = %s bytes; %s pages" 5674 , i, t.bytes!i, t.pages!i); 5675 } 5676 writefln("TOTAL: %s bytes", t.bytes); 5677 version (none) 5678 { 5679 writeln("INDEX (excluding value level):"); 5680 static foreach (i; 0 .. t.table.dim-1) 5681 writeln(t.table.slice!(i)[0 .. 
t.table.length!i]); 5682 } 5683 writeln("---------------------------"); 5684 } 5685 } 5686 //@@@BUG link failure, lambdas not found by linker somehow (in case of trie2) 5687 // alias lo8 = assumeSize!(8, function (uint x) { return x&0xFF; }); 5688 // alias next8 = assumeSize!(7, function (uint x) { return (x&0x7F00)>>8; }); 5689 alias Set = CodepointSet; 5690 auto set = Set('A','Z','a','z'); 5691 auto trie = buildTrie!(bool, uint, 256, lo8)(set.byInterval);// simple bool array 5692 for (int a='a'; a<'z';a++) 5693 assert(trie[a]); 5694 for (int a='A'; a<'Z';a++) 5695 assert(trie[a]); 5696 for (int a=0; a<'A'; a++) 5697 assert(!trie[a]); 5698 for (int a ='Z'; a<'a'; a++) 5699 assert(!trie[a]); 5700 trieStats(trie); 5701 5702 auto redundant2 = Set( 5703 1, 18, 256+2, 256+111, 512+1, 512+18, 768+2, 768+111); 5704 auto trie2 = buildTrie!(bool, uint, 1024, mlo8, lo8)(redundant2.byInterval); 5705 trieStats(trie2); 5706 foreach (e; redundant2.byCodepoint) 5707 assert(trie2[e], text(cast(uint) e, " - ", trie2[e])); 5708 foreach (i; 0 .. 1024) 5709 { 5710 assert(trie2[i] == (i in redundant2)); 5711 } 5712 5713 5714 auto redundant3 = Set( 5715 2, 4, 6, 8, 16, 5716 2+16, 4+16, 16+6, 16+8, 16+16, 5717 2+32, 4+32, 32+6, 32+8, 5718 ); 5719 5720 enum max3 = 256; 5721 // sliceBits 5722 auto trie3 = buildTrie!(bool, uint, max3, 5723 sliceBits!(6,8), sliceBits!(4,6), sliceBits!(0,4) 5724 )(redundant3.byInterval); 5725 trieStats(trie3); 5726 foreach (i; 0 .. max3) 5727 assert(trie3[i] == (i in redundant3), text(cast(uint) i)); 5728 5729 auto redundant4 = Set( 5730 10, 64, 64+10, 128, 128+10, 256, 256+10, 512, 5731 1000, 2000, 3000, 4000, 5000, 6000 5732 ); 5733 enum max4 = 2^^16; 5734 auto trie4 = buildTrie!(bool, size_t, max4, 5735 sliceBits!(13, 16), sliceBits!(9, 13), sliceBits!(6, 9) , sliceBits!(0, 6) 5736 )(redundant4.byInterval); 5737 foreach (i; 0 .. 
max4) 5738 { 5739 if (i in redundant4) 5740 assert(trie4[i], text(cast(uint) i)); 5741 } 5742 trieStats(trie4); 5743 5744 alias mapToS = mapTrieIndex!(useItemAt!(0, char)); 5745 string[] redundantS = ["tea", "start", "orange"]; 5746 redundantS.sort!((a,b) => mapToS(a) < mapToS(b))(); 5747 auto strie = buildTrie!(bool, string, useItemAt!(0, char))(redundantS); 5748 // using first char only 5749 assert(redundantS == ["orange", "start", "tea"]); 5750 assert(strie["test"], text(strie["test"])); 5751 assert(!strie["aea"]); 5752 assert(strie["s"]); 5753 5754 // a bit size test 5755 auto a = array(map!(x => to!ubyte(x))(iota(0, 256))); 5756 auto bt = buildTrie!(bool, ubyte, sliceBits!(7, 8), sliceBits!(5, 7), sliceBits!(0, 5))(a); 5757 trieStats(bt); 5758 foreach (i; 0 .. 256) 5759 assert(bt[cast(ubyte) i]); 5760 } 5761 5762 template useItemAt(size_t idx, T) 5763 if (isIntegral!T || is(T: dchar)) 5764 { 5765 size_t impl(const scope T[] arr){ return arr[idx]; } 5766 alias useItemAt = assumeSize!(impl, 8*T.sizeof); 5767 } 5768 5769 template useLastItem(T) 5770 { 5771 size_t impl(const scope T[] arr){ return arr[$-1]; } 5772 alias useLastItem = assumeSize!(impl, 8*T.sizeof); 5773 } 5774 5775 template fullBitSize(Prefix...) 5776 { 5777 static if (Prefix.length > 0) 5778 enum fullBitSize = bitSizeOf!(Prefix[0])+fullBitSize!(Prefix[1..$]); 5779 else 5780 enum fullBitSize = 0; 5781 } 5782 5783 template idxTypes(Key, size_t fullBits, Prefix...) 
5784 { 5785 static if (Prefix.length == 1) 5786 {// the last level is value level, so no index once reduced to 1-level 5787 alias idxTypes = AliasSeq!(); 5788 } 5789 else 5790 { 5791 // Important note on bit packing 5792 // Each level has to hold enough of bits to address the next one 5793 // The bottom level is known to hold full bit width 5794 // thus it's size in pages is full_bit_width - size_of_last_prefix 5795 // Recourse on this notion 5796 alias idxTypes = 5797 AliasSeq!( 5798 idxTypes!(Key, fullBits - bitSizeOf!(Prefix[$-1]), Prefix[0..$-1]), 5799 BitPacked!(typeof(Prefix[$-2](Key.init)), fullBits - bitSizeOf!(Prefix[$-1])) 5800 ); 5801 } 5802 } 5803 5804 //============================================================================ 5805 5806 @safe pure int comparePropertyName(Char1, Char2)(const(Char1)[] a, const(Char2)[] b) 5807 if (is(Char1 : dchar) && is(Char2 : dchar)) 5808 { 5809 import std.algorithm.comparison : cmp; 5810 import std.algorithm.iteration : map, filter; 5811 import std.ascii : toLower; 5812 static bool pred(dchar c) {return !c.isWhite && c != '-' && c != '_';} 5813 return cmp( 5814 a.map!toLower.filter!pred, 5815 b.map!toLower.filter!pred); 5816 } 5817 5818 @safe pure unittest 5819 { 5820 assert(!comparePropertyName("foo-bar", "fooBar")); 5821 } 5822 5823 bool propertyNameLess(Char1, Char2)(const(Char1)[] a, const(Char2)[] b) @safe pure 5824 if (is(Char1 : dchar) && is(Char2 : dchar)) 5825 { 5826 return comparePropertyName(a, b) < 0; 5827 } 5828 5829 //============================================================================ 5830 // Utilities for compression of Unicode code point sets 5831 //============================================================================ 5832 5833 @safe void compressTo(uint val, ref scope ubyte[] arr) pure nothrow 5834 { 5835 // not optimized as usually done 1 time (and not public interface) 5836 if (val < 128) 5837 arr ~= cast(ubyte) val; 5838 else if (val < (1 << 13)) 5839 { 5840 arr ~= (0b1_00 << 5) 
| cast(ubyte)(val >> 8); 5841 arr ~= val & 0xFF; 5842 } 5843 else 5844 { 5845 assert(val < (1 << 21)); 5846 arr ~= (0b1_01 << 5) | cast(ubyte)(val >> 16); 5847 arr ~= (val >> 8) & 0xFF; 5848 arr ~= val & 0xFF; 5849 } 5850 } 5851 5852 @safe uint decompressFrom(scope const(ubyte)[] arr, ref size_t idx) pure 5853 { 5854 import std.exception : enforce; 5855 immutable first = arr[idx++]; 5856 if (!(first & 0x80)) // no top bit -> [0 .. 127] 5857 return first; 5858 immutable extra = ((first >> 5) & 1) + 1; // [1, 2] 5859 uint val = (first & 0x1F); 5860 enforce(idx + extra <= arr.length, "bad code point interval encoding"); 5861 foreach (j; 0 .. extra) 5862 val = (val << 8) | arr[idx+j]; 5863 idx += extra; 5864 return val; 5865 } 5866 5867 5868 package(std) ubyte[] compressIntervals(Range)(Range intervals) 5869 if (isInputRange!Range && isIntegralPair!(ElementType!Range)) 5870 { 5871 ubyte[] storage; 5872 uint base = 0; 5873 // RLE encode 5874 foreach (val; intervals) 5875 { 5876 compressTo(val[0]-base, storage); 5877 base = val[0]; 5878 if (val[1] != lastDchar+1) // till the end of the domain so don't store it 5879 { 5880 compressTo(val[1]-base, storage); 5881 base = val[1]; 5882 } 5883 } 5884 return storage; 5885 } 5886 5887 @safe pure unittest 5888 { 5889 import std.algorithm.comparison : equal; 5890 import std.typecons : tuple; 5891 5892 auto run = [tuple(80, 127), tuple(128, (1 << 10)+128)]; 5893 ubyte[] enc = [cast(ubyte) 80, 47, 1, (0b1_00 << 5) | (1 << 2), 0]; 5894 assert(compressIntervals(run) == enc); 5895 auto run2 = [tuple(0, (1 << 20)+512+1), tuple((1 << 20)+512+4, lastDchar+1)]; 5896 ubyte[] enc2 = [cast(ubyte) 0, (0b1_01 << 5) | (1 << 4), 2, 1, 3]; // odd length-ed 5897 assert(compressIntervals(run2) == enc2); 5898 size_t idx = 0; 5899 assert(decompressFrom(enc, idx) == 80); 5900 assert(decompressFrom(enc, idx) == 47); 5901 assert(decompressFrom(enc, idx) == 1); 5902 assert(decompressFrom(enc, idx) == (1 << 10)); 5903 idx = 0; 5904 
assert(decompressFrom(enc2, idx) == 0); 5905 assert(decompressFrom(enc2, idx) == (1 << 20)+512+1); 5906 assert(equal(decompressIntervals(compressIntervals(run)), run)); 5907 assert(equal(decompressIntervals(compressIntervals(run2)), run2)); 5908 } 5909 5910 // Creates a range of `CodepointInterval` that lazily decodes compressed data. 5911 @safe package(std) auto decompressIntervals(const(ubyte)[] data) pure 5912 { 5913 return DecompressedIntervals(data); 5914 } 5915 5916 @safe struct DecompressedIntervals 5917 { 5918 pure: 5919 const(ubyte)[] _stream; 5920 size_t _idx; 5921 CodepointInterval _front; 5922 5923 this(const(ubyte)[] stream) 5924 { 5925 _stream = stream; 5926 popFront(); 5927 } 5928 5929 @property CodepointInterval front() 5930 { 5931 assert(!empty); 5932 return _front; 5933 } 5934 5935 void popFront() 5936 { 5937 if (_idx == _stream.length) 5938 { 5939 _idx = size_t.max; 5940 return; 5941 } 5942 uint base = _front[1]; 5943 _front[0] = base + decompressFrom(_stream, _idx); 5944 if (_idx == _stream.length)// odd length ---> till the end 5945 _front[1] = lastDchar+1; 5946 else 5947 { 5948 base = _front[0]; 5949 _front[1] = base + decompressFrom(_stream, _idx); 5950 } 5951 } 5952 5953 @property bool empty() const 5954 { 5955 return _idx == size_t.max; 5956 } 5957 5958 @property DecompressedIntervals save() return scope { return this; } 5959 } 5960 5961 @safe pure nothrow @nogc unittest 5962 { 5963 static assert(isInputRange!DecompressedIntervals); 5964 static assert(isForwardRange!DecompressedIntervals); 5965 } 5966 5967 //============================================================================ 5968 5969 version (std_uni_bootstrap){} 5970 else 5971 { 5972 5973 // helper for looking up code point sets 5974 ptrdiff_t findUnicodeSet(alias table, C)(const scope C[] name) 5975 { 5976 import std.algorithm.iteration : map; 5977 import std.range : assumeSorted; 5978 auto range = assumeSorted!((a,b) => propertyNameLess(a,b)) 5979 (table.map!"a.name"()); 5980 
size_t idx = range.lowerBound(name).length; 5981 if (idx < range.length && comparePropertyName(range[idx], name) == 0) 5982 return idx; 5983 return -1; 5984 } 5985 5986 // another one that loads it 5987 bool loadUnicodeSet(alias table, Set, C)(const scope C[] name, ref Set dest) 5988 { 5989 auto idx = findUnicodeSet!table(name); 5990 if (idx >= 0) 5991 { 5992 dest = Set(asSet(table[idx].compressed)); 5993 return true; 5994 } 5995 return false; 5996 } 5997 5998 bool loadProperty(Set=CodepointSet, C) 5999 (const scope C[] name, ref Set target) pure 6000 { 6001 import std.internal.unicode_tables : uniProps; // generated file 6002 alias ucmp = comparePropertyName; 6003 // conjure cumulative properties by hand 6004 if (ucmp(name, "L") == 0 || ucmp(name, "Letter") == 0) 6005 { 6006 target = asSet(uniProps.Lu); 6007 target |= asSet(uniProps.Ll); 6008 target |= asSet(uniProps.Lt); 6009 target |= asSet(uniProps.Lo); 6010 target |= asSet(uniProps.Lm); 6011 } 6012 else if (ucmp(name,"LC") == 0 || ucmp(name,"Cased Letter")==0) 6013 { 6014 target = asSet(uniProps.Ll); 6015 target |= asSet(uniProps.Lu); 6016 target |= asSet(uniProps.Lt);// Title case 6017 } 6018 else if (ucmp(name, "M") == 0 || ucmp(name, "Mark") == 0) 6019 { 6020 target = asSet(uniProps.Mn); 6021 target |= asSet(uniProps.Mc); 6022 target |= asSet(uniProps.Me); 6023 } 6024 else if (ucmp(name, "N") == 0 || ucmp(name, "Number") == 0) 6025 { 6026 target = asSet(uniProps.Nd); 6027 target |= asSet(uniProps.Nl); 6028 target |= asSet(uniProps.No); 6029 } 6030 else if (ucmp(name, "P") == 0 || ucmp(name, "Punctuation") == 0) 6031 { 6032 target = asSet(uniProps.Pc); 6033 target |= asSet(uniProps.Pd); 6034 target |= asSet(uniProps.Ps); 6035 target |= asSet(uniProps.Pe); 6036 target |= asSet(uniProps.Pi); 6037 target |= asSet(uniProps.Pf); 6038 target |= asSet(uniProps.Po); 6039 } 6040 else if (ucmp(name, "S") == 0 || ucmp(name, "Symbol") == 0) 6041 { 6042 target = asSet(uniProps.Sm); 6043 target |= asSet(uniProps.Sc); 6044 
target |= asSet(uniProps.Sk);
        target |= asSet(uniProps.So);
    }
    else if (ucmp(name, "Z") == 0 || ucmp(name, "Separator") == 0)
    {
        target = asSet(uniProps.Zs);
        target |= asSet(uniProps.Zl);
        target |= asSet(uniProps.Zp);
    }
    else if (ucmp(name, "C") == 0 || ucmp(name, "Other") == 0)
    {
        target = asSet(uniProps.Cc);
        target |= asSet(uniProps.Cf);
        target |= asSet(uniProps.Cs);
        target |= asSet(uniProps.Co);
        target |= asSet(uniProps.Cn);
    }
    else if (ucmp(name, "graphical") == 0)
    {
        target = asSet(uniProps.Alphabetic);

        target |= asSet(uniProps.Mn);
        target |= asSet(uniProps.Mc);
        target |= asSet(uniProps.Me);

        target |= asSet(uniProps.Nd);
        target |= asSet(uniProps.Nl);
        target |= asSet(uniProps.No);

        target |= asSet(uniProps.Pc);
        target |= asSet(uniProps.Pd);
        target |= asSet(uniProps.Ps);
        target |= asSet(uniProps.Pe);
        target |= asSet(uniProps.Pi);
        target |= asSet(uniProps.Pf);
        target |= asSet(uniProps.Po);

        target |= asSet(uniProps.Zs);

        target |= asSet(uniProps.Sm);
        target |= asSet(uniProps.Sc);
        target |= asSet(uniProps.Sk);
        target |= asSet(uniProps.So);
    }
    else if (ucmp(name, "any") == 0)
        target = Set.fromIntervals(0, 0x110000);
    else if (ucmp(name, "ascii") == 0)
        target = Set.fromIntervals(0, 0x80);
    else
        return loadUnicodeSet!(uniProps.tab)(name, target);
    return true;
}

// CTFE-only helper for checking property names at compile-time.
//
// Returns `true` when `name` matches (using `comparePropertyName`) one of the
// short-hand / composite property names listed below. The list has to stay in
// sync with the names resolved by the run-time lookup chain directly above:
// a name that is accepted at run time but missing here makes the
// compile-time checked form reject what the run-time form accepts.
@safe bool isPrettyPropertyName(C)(const scope C[] name)
{
    import std.algorithm.searching : find;
    auto names = [
        "L", "Letter",
        "LC", "Cased Letter",
        "M", "Mark",
        "N", "Number",
        "P", "Punctuation",
        "S", "Symbol",
        "Z", "Separator",
        // Handled by the `"C"` / `"Other"` branch of the run-time lookup,
        // so it must be recognised here as well.
        "C", "Other",
        "Graphical",
        "any",
        "ascii"
    ];
    auto x = find!(x => comparePropertyName(x, name) == 0)(names);
    return !x.empty;
}

@safe unittest
{
    // every short-hand resolved at run time is also accepted by the checker
    assert(isPrettyPropertyName("C"));
    assert(isPrettyPropertyName("Other"));
    assert(isPrettyPropertyName("Separator"));
    assert(!isPrettyPropertyName("NoSuchProperty"));
}

// ditto, CTFE-only, not optimized
// Returns true when `findUnicodeSet` locates `name` in `table`
// (a non-negative result).
@safe private static bool findSetName(alias table, C)(const scope C[] name)
{
    return findUnicodeSet!table(name) >= 0;
}

// Mixin providing the two lookup flavours over one `table` of named sets:
// `opCall` for names known only at run time and `opDispatch` for names that
// can be verified during compilation. `kind` is only used in error messages.
template SetSearcher(alias table, string kind)
{
    /// Run-time checked search.
    /// Throws: `Exception` when `name` is not present in `table`.
    static auto opCall(C)(const scope C[] name)
        if (is(C : dchar))
    {
        import std.conv : to;
        CodepointSet set;
        if (loadUnicodeSet!table(name, set))
            return set;
        throw new Exception("No unicode set for "~kind~" by name "
            ~name.to!string()~" was found.");
    }
    /// Compile-time checked search.
    static @property auto opDispatch(string name)()
    {
        static if (findSetName!table(name))
        {
            CodepointSet set;
            // the result can be ignored: presence was verified by the `static if`
            loadUnicodeSet!table(name, set);
            return set;
        }
        else
            static assert(false, "No unicode set for "~kind~" by name "
                ~name~" was found.");
    }
}

// Characters that need escaping in string posed as regular expressions
package(std) alias Escapables = AliasSeq!('[', ']', '\\', '^', '$', '.', '|', '?', ',', '-',
    ';', ':', '#', '&', '%', '/', '<', '>', '`', '*', '+', '(', ')', '{', '}', '~');

// Evaluates the D expression `expr` (mixed in) and caches the result.
// During CTFE the expression is simply evaluated; at run time the value is
// computed on the first call and kept in a function-local `static` slot.
package(std) CodepointSet memoizeExpr(string expr)()
{
    if (__ctfe)
        return mixin(expr);
    alias T = typeof(mixin(expr));
    static T slot;
    static bool initialized;
    if (!initialized)
    {
        slot = mixin(expr);
        initialized = true;
    }
    return slot;
}

//property for \w character class
package(std) @property CodepointSet wordCharacter() @safe
{
    return memoizeExpr!("unicode.Alphabetic | unicode.Mn | unicode.Mc
        | unicode.Me | unicode.Nd | unicode.Pc")();
}

//basic stack, just in case it gets used anywhere else then Parser
package(std) struct Stack(T)
{
@safe:
    T[] data;
    @property bool empty(){ return data.empty; }

    @property size_t length(){ return data.length; }

    void push(T val){ data ~= val;  }

    // Removes and returns the top element; the stack must not be empty.
    @trusted T pop()
    {
        assert(!empty);
        auto val = data[$ - 1];
        data = data[0 .. $ - 1];
        // let the next push reuse the slot that was just vacated
        if (!__ctfe)
            cast(void) data.assumeSafeAppend();
        return val;
    }

    // Reference to the top element; the stack must not be empty.
    @property ref T top()
    {
        assert(!empty);
        return data[$ - 1];
    }
}

//test if a given string starts with hex number of maxDigit that's a valid codepoint
//returns its value and skips these maxDigit chars on success, throws on failure
//
// Throws `Exception` with the message
//   "incomplete escape sequence" - input ended before `maxDigit` digits,
//   "invalid escape sequence"    - a non-hexadecimal character was met,
//   "invalid codepoint"          - the parsed value exceeds 0x10FFFF.
package(std) dchar parseUniHex(Range)(ref Range str, size_t maxDigit)
{
    import std.exception : enforce;
    //std.conv.parse is both @system and bogus
    uint val;
    // counter has the same (unsigned) type as the bound it is compared with
    for (size_t k = 0; k < maxDigit; k++)
    {
        enforce(!str.empty, "incomplete escape sequence");
        //accepts ascii only, so it's OK to index directly
        immutable current = str.front;
        if ('0' <= current && current <= '9')
            val = val * 16 + current - '0';
        else if ('a' <= current && current <= 'f')
            val = val * 16 + current -'a' + 10;
        else if ('A' <= current && current <= 'F')
            val = val * 16 + current - 'A' + 10;
        else
            throw new Exception("invalid escape sequence");
        str.popFront();
    }
    enforce(val <= 0x10FFFF, "invalid codepoint");
    return val;
}

@safe unittest
{
    import std.algorithm.searching : canFind;
    import std.exception : collectException;
    string[] non_hex = [ "000j", "000z", "FffG", "0Z"];
    string[] hex = [ "01", "ff", "00af", "10FFFF" ];
    int[] value = [ 1, 0xFF, 0xAF, 0x10FFFF ];
    foreach (v; non_hex)
        assert(collectException(parseUniHex(v, v.length)).msg
          .canFind("invalid escape sequence"));
    foreach (i, v; hex)
        assert(parseUniHex(v, v.length) == value[i]);
    string over = "0011FFFF";
    assert(collectException(parseUniHex(over, over.length)).msg
      .canFind("invalid codepoint"));
}

// Returns `set` extended with the simple case foldings of every cased letter
// (`unicode.LC`) it contains.
auto caseEnclose(CodepointSet set)
{
    auto cased = set & unicode.LC;
    foreach (dchar ch; cased.byCodepoint)
    {
        foreach (c; simpleCaseFoldings(ch))
            set |= c;
    }
    return set;
}

/+
    fetch codepoint set corresponding to a name (InBlock or binary property)
+/
CodepointSet getUnicodeSet(const scope char[] name, bool negated,  bool casefold) @safe
{
    CodepointSet s = unicode(name);
    //FIXME: caseEnclose for new uni as Set | CaseEnclose(SET && LC)
    if (casefold)
       s = caseEnclose(s);
    if (negated)
        s = s.inverted;
    return s;
}

// Parser for regex-style character classes `[...]` over an input range of
// code points. `parseSet` is the entry point; `parseCharTerm` consumes one
// run of literals / escapes / ranges together with the operator following it.
// When `casefold_` is set, every literal and every range is added together
// with its simple case foldings.
struct UnicodeSetParser(Range)
{
    import std.exception : enforce;
    import std.typecons : tuple, Tuple;
    Range range;
    bool casefold_;

    @property bool empty(){ return range.empty; }
    @property dchar front(){ return range.front; }
    void popFront(){ range.popFront(); }

    //CodepointSet operations relatively in order of priority
    enum Operator:uint {
        Open = 0, Negate,  Difference, SymDifference, Intersection, Union, None
    }

    //parse unit of CodepointSet spec, most notably escape sequences and char ranges
    //also fetches next set operation
    //
    // Returns the set collected so far plus the operator that terminated the
    // term (`Operator.None` when the term ended at `]`).
    // Throws `Exception` on malformed or truncated input.
    Tuple!(CodepointSet,Operator) parseCharTerm()
    {
        import std.range : drop;
        enum privateUseStart = '\U000F0000', privateUseEnd ='\U000FFFFD';
        enum State{ Start, Char, Escape, CharDash, CharDashEscape,
            PotentialTwinSymbolOperator }
        Operator op = Operator.None;
        dchar last;
        CodepointSet set;
        State state = State.Start;

        // add a single code point, honouring the case-folding flag
        void addWithFlags(ref CodepointSet set, uint ch)
        {
            if (casefold_)
            {
                auto foldings = simpleCaseFoldings(ch);
                foreach (v; foldings)
                    set |= v;
            }
            else
                set |= ch;
        }

        // add the inclusive range [lo, hi], honouring the case-folding flag;
        // shared by plain (`a-z`) and escaped (`a-\x7A`) range ends so that
        // both spellings produce the same set
        void addRangeWithFlags(ref CodepointSet set, uint lo, uint hi)
        {
            if (casefold_)
            {
                for (uint ch = lo; ch <= hi; ch++)
                    addWithFlags(set, ch);
            }
            else
                set.add(lo, hi + 1);
        }

        static Operator twinSymbolOperator(dchar symbol)
        {
            switch (symbol)
            {
            case '|':
                return Operator.Union;
            case '-':
                return Operator.Difference;
            case '~':
                return Operator.SymDifference;
            case '&':
                return Operator.Intersection;
            default:
                assert(false);
            }
        }

        L_CharTermLoop:
        for (;;)
        {
            // Checked at the top of every iteration: besides the regular
            // advance at the bottom of the loop this also covers the
            // `continue` paths below (\p, \P, \x, \u, \U), which have
            // already consumed input, and a call made on exhausted input.
            enforce(!empty, "unexpected end of CodepointSet");
            final switch (state)
            {
            case State.Start:
                switch (front)
                {
                case '|':
                case '-':
                case '~':
                case '&':
                    state = State.PotentialTwinSymbolOperator;
                    last = front;
                    break;
                case '[':
                    op = Operator.Union;
                    goto case;
                case ']':
                    break L_CharTermLoop;
                case '\\':
                    state = State.Escape;
                    break;
                default:
                    state = State.Char;
                    last = front;
                }
                break;
            case State.Char:
                // xxx last front xxx
                switch (front)
                {
                case '|':
                case '~':
                case '&':
                    // then last is treated as normal char and added as implicit union
                    state = State.PotentialTwinSymbolOperator;
                    addWithFlags(set, last);
                    last = front;
                    break;
                case '-': // still need more info
                    state = State.CharDash;
                    break;
                case '\\':
                    // pending literal goes in like any other literal,
                    // i.e. including its case foldings when requested
                    addWithFlags(set, last);
                    state = State.Escape;
                    break;
                case '[':
                    op = Operator.Union;
                    goto case;
                case ']':
                    addWithFlags(set, last);
                    break L_CharTermLoop;
                default:
                    state = State.Char;
                    addWithFlags(set, last);
                    last = front;
                }
                break;
            case State.PotentialTwinSymbolOperator:
                // xxx last front xxxx
                // where last = [|-&~]
                if (front == last)
                {
                    op = twinSymbolOperator(last);
                    popFront();//skip second twin char
                    break L_CharTermLoop;
                }
                goto case State.Char;
            case State.Escape:
                // xxx \ front xxx
                switch (front)
                {
                case 'f':
                    last = '\f';
                    state = State.Char;
                    break;
                case 'n':
                    last = '\n';
                    state = State.Char;
                    break;
                case 'r':
                    last = '\r';
                    state = State.Char;
                    break;
                case 't':
                    last = '\t';
                    state = State.Char;
                    break;
                case 'v':
                    last = '\v';
                    state = State.Char;
                    break;
                case 'c':
                    last = unicode.parseControlCode(this);
                    state = State.Char;
                    break;
                // one `case` label per escapable punctuation character
                foreach (val; Escapables)
                {
                case val:
                }
                    last = front;
                    state = State.Char;
                    break;
                case 'p':
                    set.add(unicode.parsePropertySpec(this, false, casefold_));
                    state = State.Start;
                    continue L_CharTermLoop; //next char already fetched
                case 'P':
                    set.add(unicode.parsePropertySpec(this, true, casefold_));
                    state = State.Start;
                    continue L_CharTermLoop; //next char already fetched
                case 'x':
                    popFront();
                    last = parseUniHex(this, 2);
                    state = State.Char;
                    continue L_CharTermLoop;
                case 'u':
                    popFront();
                    last = parseUniHex(this, 4);
                    state = State.Char;
                    continue L_CharTermLoop;
                case 'U':
                    popFront();
                    last = parseUniHex(this, 8);
                    state = State.Char;
                    continue L_CharTermLoop;
                case 'd':
                    set.add(unicode.Nd);
                    state = State.Start;
                    break;
                case 'D':
                    set.add(unicode.Nd.inverted);
                    state = State.Start;
                    break;
                case 's':
                    set.add(unicode.White_Space);
                    state = State.Start;
                    break;
                case 'S':
                    set.add(unicode.White_Space.inverted);
                    state = State.Start;
                    break;
                case 'w':
                    set.add(wordCharacter);
                    state = State.Start;
                    break;
                case 'W':
                    set.add(wordCharacter.inverted);
                    state = State.Start;
                    break;
                default:
                    if (front >= privateUseStart && front <= privateUseEnd)
                        enforce(false, "no matching ']' found while parsing character class");
                    enforce(false, "invalid escape sequence");
                }
                break;
            case State.CharDash:
                // xxx last - front xxx
                switch (front)
                {
                case '[':
                    op = Operator.Union;
                    goto case;
                case ']':
                    //means dash is a single char not an interval specifier
                    addWithFlags(set, last);
                    addWithFlags(set, '-');
                    break L_CharTermLoop;
                case '-'://set Difference again
                    addWithFlags(set, last);
                    op = Operator.Difference;
                    popFront();//skip '-'
                    break L_CharTermLoop;
                case '\\':
                    state = State.CharDashEscape;
                    break;
                default:
                    enforce(last <= front, "inverted range");
                    addRangeWithFlags(set, last, front);
                    state = State.Start;
                }
                break;
            case State.CharDashEscape:
            //xxx last - \ front xxx
                uint end;
                switch (front)
                {
                case 'f':
                    end = '\f';
                    break;
                case 'n':
                    end = '\n';
                    break;
                case 'r':
                    end = '\r';
                    break;
                case 't':
                    end = '\t';
                    break;
                case 'v':
                    end = '\v';
                    break;
                foreach (val; Escapables)
                {
                case val:
                }
                    end = front;
                    break;
                case 'c':
                    end = unicode.parseControlCode(this);
                    break;
                case 'x':
                    popFront();
                    end = parseUniHex(this, 2);
                    enforce(last <= end,"inverted range");
                    addRangeWithFlags(set, last, end);
                    state = State.Start;
                    continue L_CharTermLoop;
                case 'u':
                    popFront();
                    end = parseUniHex(this, 4);
                    enforce(last <= end,"inverted range");
                    addRangeWithFlags(set, last, end);
                    state = State.Start;
                    continue L_CharTermLoop;
                case 'U':
                    popFront();
                    end = parseUniHex(this, 8);
                    enforce(last <= end,"inverted range");
                    addRangeWithFlags(set, last, end);
                    state = State.Start;
                    continue L_CharTermLoop;
                default:
                    if (front >= privateUseStart && front <= privateUseEnd)
                        enforce(false, "no matching ']' found while parsing character class");
                    enforce(false, "invalid escape sequence");
                }
                // Lookahead to check if it's a \T
                // where T is sub-pattern terminator in multi-pattern scheme
                auto lookahead = range.save.drop(1);
                if (end == '\\' && !lookahead.empty)
                {
                    if (lookahead.front >= privateUseStart && lookahead.front <= privateUseEnd)
                        enforce(false, "no matching ']' found while parsing character class");
                }
                enforce(last <= end,"inverted range");
                addRangeWithFlags(set, last, end);
                state = State.Start;
                break;
            }
            popFront();
        }
        return tuple(set, op);
    }

    alias ValStack = Stack!(CodepointSet);
    alias OpStack = Stack!(Operator);

    // Parses a complete `[...]` expression (with nesting and the set
    // operators `||`, `--`, `~~`, `&&`) starting at the current position and
    // returns the resulting set.
    // Throws `Exception` on malformed or truncated input.
    CodepointSet parseSet()
    {
        ValStack vstack;
        OpStack opstack;
        import std.functional : unaryFun;
        enforce(!empty, "unexpected end of input");
        enforce(front == '[', "expected '[' at the start of unicode set");
        // applies `op` to the operand(s) on top of `stack`;
        // returns false for operators that cannot be applied (Open, None)
        static bool apply(Operator op, ref ValStack stack)
        {
            switch (op)
            {
            case Operator.Negate:
                enforce(!stack.empty, "no operand for '^'");
                stack.top = stack.top.inverted;
                break;
            case Operator.Union:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '||'");
                stack.top.add(s);
                break;
            case Operator.Difference:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '--'");
                stack.top.sub(s);
                break;
            case Operator.SymDifference:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '~~'");
                stack.top ~= s;
                break;
            case Operator.Intersection:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '&&'");
                stack.top.intersect(s);
                break;
            default:
                return false;
            }
            return true;
        }
        // pops and applies operators while `cond` holds for the top one;
        // returns false on a syntax error or when the operator stack runs dry
        static bool unrollWhile(alias cond)(ref ValStack vstack, ref OpStack opstack)
        {
            while (cond(opstack.top))
            {
                if (!apply(opstack.pop(),vstack))
                    return false;//syntax error
                if (opstack.empty)
                    return false;
            }
            return true;
        }

        L_CharsetLoop:
        do
        {
            // A term may end exactly at the end of input (e.g. right after
            // a twin operator); with brackets still open that is an error,
            // not a reason to read past the end.
            enforce(!empty, "unexpected end of character class");
            switch (front)
            {
            case '[':
                opstack.push(Operator.Open);
                popFront();
                enforce(!empty, "unexpected end of character class");
                if (front == '^')
                {
                    opstack.push(Operator.Negate);
                    popFront();
                    enforce(!empty, "unexpected end of character class");
                }
                else if (front == ']') // []...] is special cased
                {
                    popFront();
                    enforce(!empty, "wrong character set");
                    auto pair = parseCharTerm();
                    pair[0].add(']', ']'+1);
                    if (pair[1] != Operator.None)
                    {
                        if (opstack.top == Operator.Union)
                            unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
                        opstack.push(pair[1]);
                    }
                    vstack.push(pair[0]);
                }
                break;
            case ']':
                enforce(unrollWhile!(unaryFun!"a != a.Open")(vstack, opstack),
                    "character class syntax error");
                enforce(!opstack.empty, "unmatched ']'");
                opstack.pop();
                popFront();
                if (opstack.empty)
                    break L_CharsetLoop;
                auto pair  = parseCharTerm();
                if (!pair[0].empty)//not only operator e.g. -- or ~~
                {
                    vstack.top.add(pair[0]);//apply union
                }
                if (pair[1] != Operator.None)
                {
                    if (opstack.top == Operator.Union)
                        unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
                    opstack.push(pair[1]);
                }
                break;
            //
            default://yet another pair of term(op)?
                auto pair = parseCharTerm();
                if (pair[1] != Operator.None)
                {
                    if (opstack.top == Operator.Union)
                        unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
                    opstack.push(pair[1]);
                }
                vstack.push(pair[0]);
            }

        }while (!empty || !opstack.empty);
        while (!opstack.empty)
            apply(opstack.pop(),vstack);
        assert(vstack.length == 1);
        return vstack.top;
    }
}

@safe unittest
{
    import std.exception : collectException;

    // case folding also applies to a literal that is followed by an escape ...
    string beforeEscape = `[a\n]`;
    auto folded = unicode.parseSet(beforeEscape, true);
    assert(folded['a'] && folded['A'] && folded['\n']);

    // ... and to ranges whose upper bound is written as an escape
    string escapedEnd = `[a-\x7A]`;
    auto foldedRange = unicode.parseSet(escapedEnd, true);
    assert(foldedRange['q'] && foldedRange['Q']);

    // truncated input is reported with an exception
    foreach (bad; ["[a--", "[[a]", `[\x41`])
    {
        string input = bad;
        assert(collectException(unicode.parseSet(input)));
    }
}

/**
    A single entry point to lookup Unicode $(CODEPOINT) sets by name or alias of
    a block, script or general category.

    It uses well defined standard rules of property name lookup.
This includes fuzzy matching of names, so that
    'White_Space', 'white-SpAce' and 'whitespace' are all considered equal
    and yield the same set of white space $(CHARACTERS).
*/
@safe public struct unicode
{
    import std.exception : enforce;
    /**
        Performs the lookup of set of $(CODEPOINTS)
        with compile-time correctness checking.
        This short-cut version combines 3 searches:
        across blocks, scripts, and common binary properties.

        Note that since scripts and blocks overlap the
        usual trick to disambiguate is used - to get a block use
        `unicode.InBlockName`, to search a script
        use `unicode.ScriptName`.

        See_Also: $(LREF block), $(LREF script)
        and (not included in this search) $(LREF hangulSyllableType).
    */

    static @property auto opDispatch(string name)() pure
    {
        static if (findAny(name))
            return loadAny(name);
        else
            static assert(false, "No unicode set by name "~name~" was found.");
    }

    ///
    @safe unittest
    {
        import std.exception : collectException;
        auto ascii = unicode.ASCII;
        assert(ascii['A']);
        assert(ascii['~']);
        assert(!ascii['\u00e0']);
        // matching is case-insensitive
        assert(ascii == unicode.ascII);
        assert(!ascii['à']);
        // underscores, '-' and whitespace in names are ignored too
        auto latin = unicode.in_latin1_Supplement;
        assert(latin['à']);
        assert(!latin['$']);
        // BTW Latin 1 Supplement is a block, hence "In" prefix
        assert(latin == unicode("In Latin 1 Supplement"));
        // run-time look up throws if no such set is found
        assert(collectException(unicode("InCyrilliac")));
    }

    /**
        The same lookup across blocks, scripts, or binary properties,
        but performed at run-time.
        This version is provided for cases where `name`
        is not known beforehand; otherwise compile-time
        checked $(LREF opDispatch) is typically a better choice.

        See the $(S_LINK Unicode properties, table of properties) for available
        sets.
    */
    static auto opCall(C)(const scope C[] name)
        if (is(C : dchar))
    {
        return loadAny(name);
    }

    /**
        Narrows down the search for sets of $(CODEPOINTS) to all Unicode blocks.

        Note:
        Here block names are unambiguous as no scripts are searched
        and thus to search use simply `unicode.block.BlockName` notation.

        See $(S_LINK Unicode properties, table of properties) for available sets.
        See_Also: $(S_LINK Unicode properties, table of properties).
    */
    struct block
    {
        import std.internal.unicode_tables : blocks; // generated file
        mixin SetSearcher!(blocks.tab, "block");
    }

    ///
    @safe unittest
    {
        // use .block for explicitness
        assert(unicode.block.Greek_and_Coptic == unicode.InGreek_and_Coptic);
    }

    /**
        Narrows down the search for sets of $(CODEPOINTS) to all Unicode scripts.

        See the $(S_LINK Unicode properties, table of properties) for available
        sets.
    */
    struct script
    {
        import std.internal.unicode_tables : scripts; // generated file
        mixin SetSearcher!(scripts.tab, "script");
    }

    ///
    @safe unittest
    {
        auto arabicScript = unicode.script.arabic;
        auto arabicBlock = unicode.block.arabic;
        // there is an intersection between script and block
        // (U+0627 ARABIC LETTER ALEF belongs to both)
        assert(arabicBlock['\u0627']);
        assert(arabicScript['\u0627']);
        // but they are different
        assert(arabicBlock != arabicScript);
        assert(arabicBlock == unicode.inArabic);
        assert(arabicScript == unicode.arabic);
    }

    /**
        Fetch a set of $(CODEPOINTS) that have the given hangul syllable type.

        Other non-binary properties (once supported) follow the same
        notation - `unicode.propertyName.propertyValue` for compile-time
        checked access and `unicode.propertyName(propertyValue)`
        for run-time checked one.

        See the $(S_LINK Unicode properties, table of properties) for available
        sets.
    */
    struct hangulSyllableType
    {
        import std.internal.unicode_tables : hangul; // generated file
        mixin SetSearcher!(hangul.tab, "hangul syllable type");
    }

    ///
    @safe unittest
    {
        // L here is syllable type not Letter as in unicode.L short-cut
        auto leadingVowel = unicode.hangulSyllableType("L");
        // check that some leading vowels are present
        foreach (vowel; '\u1110'..'\u115F')
            assert(leadingVowel[vowel]);
        assert(leadingVowel == unicode.hangulSyllableType.L);
    }

    //parse control code of form \cXXX, c assumed to be the current symbol
    // Returns the letter following `c` masked with 0x1f; the letter itself is
    // left as the current symbol. Throws `Exception` if the input ends or the
    // next symbol is not an ASCII letter.
    static package(std) dchar parseControlCode(Parser)(ref Parser p)
    {
        with(p)
        {
            popFront();
            enforce(!empty, "Unfinished escape sequence");
            enforce(('a' <= front && front <= 'z')
                || ('A' <= front && front <= 'Z'),
            "Only letters are allowed after \\c");
            return front & 0x1f;
        }
    }

    //parse and return a CodepointSet for \p{...Property...} and \P{...Property..},
    //\ - assumed to be processed, p - is current
    //
    // On return the whole spec, including the closing '}' (or the single
    // property letter), has been consumed.
    // Throws `Exception` when the spec is truncated, is not closed by '}',
    // contains non-ASCII characters, is too long, or names no known set.
    static package(std) CodepointSet parsePropertySpec(Range)(ref Range p,
        bool negated, bool casefold)
    {
        static import std.ascii;
        with(p)
        {
            enum MAX_PROPERTY = 128;
            char[MAX_PROPERTY] result;
            uint k = 0;
            popFront();
            enforce(!empty, "eof parsing unicode property spec");
            if (front == '{')
            {
                popFront();
                while (k < MAX_PROPERTY && !empty && front !='}'
                    && front !=':')
                {
                    // Same restriction as for the single-letter form below:
                    // the cast to `char` would silently truncate anything
                    // else (U+014C would be read as 'L').
                    enforce(front < 0x80, "invalid property name");
                    if (front != '-' && front != ' ' && front != '_')
                        result[k++] = cast(char) std.ascii.toLower(front);
                    popFront();
                }
                enforce(k != MAX_PROPERTY, "invalid property name");
                // the loop also stops at end of input, where `front` must not be read
                enforce(!empty && front == '}', "} expected ");
            }
            else
            {//single char properties e.g.: \pL, \pN ...
                enforce(front < 0x80, "invalid property name");
                result[k++] = cast(char) front;
            }
            auto s = getUnicodeSet(result[0 .. k], negated, casefold);
            enforce(!s.empty, "unrecognized unicode property spec");
            popFront();
            return s;
        }
    }

    /**
        Parse unicode codepoint set from given `range` using standard regex
        syntax '[...]'. The range is advanced skipping over regex set definition.
        `casefold` parameter determines if the set should be casefolded - that is
        include both lower and upper case versions for any letters in the set.
    */
    static CodepointSet parseSet(Range)(ref Range range, bool casefold=false)
    if (isInputRange!Range && is(ElementType!Range : dchar))
    {
        auto usParser = UnicodeSetParser!Range(range, casefold);
        auto set = usParser.parseSet();
        range = usParser.range;
        return set;
    }

    ///
    @safe unittest
    {
        import std.uni : unicode;
        string pat = "[a-zA-Z0-9]hello";
        auto set = unicode.parseSet(pat);
        // check some of the codepoints
        assert(set['a'] && set['A'] && set['9']);
        assert(pat == "hello");
    }

private:
    alias ucmp = comparePropertyName;

    // Compile-time counterpart of `loadAny`: true when `name` denotes a
    // short-hand property, a table property, a script, or (with an "In"
    // prefix) a block.
    static bool findAny(string name)
    {
        import std.internal.unicode_tables : blocks, scripts, uniProps; // generated file
        // Same length guard as in `loadAny`: without it a one-character name
        // that matches nothing fails on the slice instead of yielding `false`.
        return isPrettyPropertyName(name)
            || findSetName!(uniProps.tab)(name) || findSetName!(scripts.tab)(name)
            || (name.length > 2 && ucmp(name[0 .. 2],"In") == 0
                && findSetName!(blocks.tab)(name[2..$]));
    }

    // Looks `name` up as a property, then a script, then an "In"-prefixed
    // block. Throws `Exception` when nothing matches.
    static auto loadAny(Set=CodepointSet, C)(const scope C[] name) pure
    {
        import std.conv : to;
        import std.internal.unicode_tables : blocks, scripts; // generated file
        Set set;
        immutable loaded = loadProperty(name, set) || loadUnicodeSet!(scripts.tab)(name, set)
            || (name.length > 2 && ucmp(name[0 .. 2],"In") == 0
                && loadUnicodeSet!(blocks.tab)(name[2..$], set));
        if (loaded)
            return set;
        throw new Exception("No unicode set by name "~name.to!string()~" was found.");
    }

    // FIXME: re-disable once the compiler is fixed
    // Disabled to prevent the mistake of creating instances of this pseudo-struct.
    //@disable ~this();
}

@safe unittest
{
    import std.internal.unicode_tables : blocks, uniProps; // generated file
    assert(unicode("InHebrew") == asSet(blocks.Hebrew));
    assert(unicode("separator") == (asSet(uniProps.Zs) | asSet(uniProps.Zl) | asSet(uniProps.Zp)));
    assert(unicode("In-Kharoshthi") == asSet(blocks.Kharoshthi));
}

@safe unittest
{
    import std.exception : collectException;

    // a property spec that is cut off before '}' is rejected with an exception
    string unterminated = `[\p{L`;
    assert(collectException(unicode.parseSet(unterminated)));

    // non-ASCII characters are not silently truncated into ASCII names
    string nonAscii = "[\\p{\u014C}]";
    assert(collectException(unicode.parseSet(nonAscii)));
}

enum EMPTY_CASE_TRIE = ushort.max;// from what gen_uni uses internally

// TODO: redo the most of hangul stuff algorithmically in case of Graphemes too
// Use combined trie instead of checking for '\r' | '\n' | ccTrie,
//   or extend | '\u200D' separately

// True for the regional indicator symbols U+1F1E6 .. U+1F1FF.
private static bool isRegionalIndicator(dchar ch) @safe pure @nogc nothrow
{
    return ch >= '\U0001F1E6' && ch <= '\U0001F1FF';
}

// Our grapheme decoder is a state machine, this is list of all possible
// states before each code point.
private enum GraphemeState
{
    Start,
    CR,
    RI,
    L,
    V,
    LVT,
    Emoji,
    EmojiZWJ,
    Prepend,
    End
}

// Tells the decoding loop whether the end of the grapheme has been reached
private enum TransformRes
{
    // No, unless the source range ends here
    // (GB2 - break at end of text, unless text is empty)
    goOn,
    redo, // Run last character again with new state
    retInclude, // Yes, after the just iterated character
    retExclude // Yes, before the just iterated character
}

// The logic of the grapheme decoding is all here
// GB# means Grapheme Breaking rule number # - see Unicode standard annex #29
// Note, getting GB1 (break at start of text, unless text is empty) right
// relies on the user starting grapheme walking from beginning of the text, and
// not attempting to walk an empty text.
//
// The table is indexed by `GraphemeState`; each handler inspects the current
// code point, may update the state, and reports what the decoder should do.
private enum TransformRes
    function(ref GraphemeState, dchar) @safe pure nothrow @nogc [] graphemeTransforms =
[
    GraphemeState.Start: (ref state, ch)
    {
        // GB4. Break after controls.
        if (graphemeControlTrie[ch] || ch == '\n')
            return TransformRes.retInclude;

        with (GraphemeState) state =
            ch == '\r' ? CR :
            isRegionalIndicator(ch) ? RI :
            isHangL(ch) ? L :
            hangLV[ch] || isHangV(ch) ? V :
            hangLVT[ch] || isHangT(ch) ? LVT :
            prependTrie[ch] ? Prepend :
            xpictoTrie[ch] ? Emoji :
            End;

        // No matter what we encountered, we always include the
        // first code point in the grapheme.
        return TransformRes.goOn;
    },

    // GB3, GB4. Do not break between a CR and LF.
    // Otherwise, break after controls.
    GraphemeState.CR: (ref state, ch) => ch == '\n' ?
        TransformRes.retInclude :
        TransformRes.retExclude,

    // GB12 - GB13. Do not break within emoji flag sequences.
    // That is, do not break between regional indicator (RI) symbols if
    // there is an odd number of RI characters before the break point.
    // This state applies if one and only one RI code point has been
    // encountered.
    GraphemeState.RI: (ref state, ch)
    {
        state = GraphemeState.End;

        return isRegionalIndicator(ch) ?
            TransformRes.goOn :
            TransformRes.redo;
    },

    // GB6. Do not break Hangul syllable sequences.
    GraphemeState.L: (ref state, ch)
    {
        if (isHangL(ch))
            return TransformRes.goOn;
        else if (isHangV(ch) || hangLV[ch])
        {
            state = GraphemeState.V;
            return TransformRes.goOn;
        }
        else if (hangLVT[ch])
        {
            state = GraphemeState.LVT;
            return TransformRes.goOn;
        }

        state = GraphemeState.End;
        return TransformRes.redo;
    },

    // GB7. Do not break Hangul syllable sequences.
    GraphemeState.V: (ref state, ch)
    {
        if (isHangV(ch))
            return TransformRes.goOn;
        else if (isHangT(ch))
        {
            state = GraphemeState.LVT;
            return TransformRes.goOn;
        }

        state = GraphemeState.End;
        return TransformRes.redo;
    },

    // GB8. Do not break Hangul syllable sequences.
    GraphemeState.LVT: (ref state, ch)
    {
        if (isHangT(ch))
            return TransformRes.goOn;

        state = GraphemeState.End;
        return TransformRes.redo;
    },

    // GB11. Do not break within emoji modifier sequences or emoji
    // zwj sequences. This state applies when the last code point was
    // NOT a ZWJ.
    GraphemeState.Emoji: (ref state, ch)
    {
        if (graphemeExtendTrie[ch])
            return TransformRes.goOn;

        static assert(!graphemeExtendTrie['\u200D']);

        if (ch == '\u200D')
        {
            state = GraphemeState.EmojiZWJ;
            return TransformRes.goOn;
        }

        state = GraphemeState.End;
        // There might still be spacing marks at the end,
        // which are not allowed in the middle of emoji sequences
        return TransformRes.redo;
    },

    // GB11. Do not break within emoji modifier sequences or emoji
    // zwj sequences. This state applies when the last code point was
    // a ZWJ.
    GraphemeState.EmojiZWJ: (ref state, ch)
    {
        state = GraphemeState.Emoji;
        if (xpictoTrie[ch])
            return TransformRes.goOn;
        return TransformRes.redo;
    },

    // GB9b. Do not break after Prepend characters.
    GraphemeState.Prepend: (ref state, ch)
    {
        // GB5. Break before controls.
        if (graphemeControlTrie[ch] || ch == '\r' || ch == '\n')
            return TransformRes.retExclude;

        state = GraphemeState.Start;
        return TransformRes.redo;
    },

    // GB9, GB9a. Do not break before extending characters, ZWJ
    // or SpacingMarks.
    // GB999. Otherwise, break everywhere.
    GraphemeState.End: (ref state, ch)
        => !graphemeExtendTrie[ch] && !spacingMarkTrie[ch] && ch != '\u200D' ?
            TransformRes.retExclude :
            TransformRes.goOn
];

// Shared driver of the state machine above. With `getValue == true` the
// consumed code points are collected into a `Grapheme` that is returned;
// with `false` the range is only advanced past one grapheme cluster.
template genericDecodeGrapheme(bool getValue)
{
    static if (getValue)
        alias Value = Grapheme;
    else
        alias Value = void;

    Value genericDecodeGrapheme(Input)(ref Input range)
    {
        static if (getValue)
            Grapheme grapheme;
        auto state = GraphemeState.Start;
        dchar ch;

        assert(!range.empty, "Attempting to decode grapheme from an empty " ~ Input.stringof);
    outer:
        while (!range.empty)
        {
            ch = range.front;

        rerun:
            final switch (graphemeTransforms[state](state, ch))
                with(TransformRes)
            {
            case goOn:
                static if (getValue)
                    grapheme ~= ch;
                range.popFront();
                continue;

            case redo:
                // the handler switched state; feed it the same code point again
                goto rerun;

            case retInclude:
                static if (getValue)
                    grapheme ~= ch;
                range.popFront();
                break outer;

            case retExclude:
                break outer;
            }
        }

        static if (getValue)
            return grapheme;
    }
}

public: // Public API continues

/++
    Computes the length of grapheme cluster starting at `index`.
    Both the resulting length and the `index` are measured
    in $(S_LINK Code unit, code units).

    Params:
        C = type that is implicitly convertible to `dchar`
        input = array of code units to scan
        index = starting index into `input[]`; a grapheme cluster must start
                there, i.e. `input[index .. $]` must not be empty

    Returns:
        length of grapheme cluster
+/
size_t graphemeStride(C)(const scope C[] input, size_t index) @safe pure
if (is(C : dchar))
{
    auto src = input[index..$];
    auto n = src.length;
    genericDecodeGrapheme!(false)(src);
    // number of code units the decoder consumed
    return n - src.length;
}

///
@safe unittest
{
    // two spaces: index 1 has to address an existing code unit
    assert(graphemeStride("  ", 1) == 1);
    // A + combining ring above
    string city = "A\u030Arhus";
    size_t first = graphemeStride(city, 0);
    assert(first == 3); //\u030A has 2 UTF-8 code units
    assert(city[0 .. first] == "A\u030A");
    assert(city[first..$] == "rhus");
}

@safe unittest
{
    // Ensure that graphemeStride is usable from CTFE.
    enum c1 = graphemeStride("A", 0);
    static assert(c1 == 1);

    enum c2 = graphemeStride("A\u0301", 0);
    static assert(c2 == 3); // \u0301 has 2 UTF-8 code units
}

// TODO: make this @nogc. Probably no big deal since the state machine is
// already GC-free.
@safe pure nothrow unittest
{
    // grinning face ~ emoji modifier fitzpatrick type-5 ~ grinning face
    assert(graphemeStride("\U0001F600\U0001f3FE\U0001F600"d, 0) == 2);
    // skier ~ female sign ~ '€'
    assert(graphemeStride("\u26F7\u2640€"d, 0) == 1);
    // skier ~ emoji modifier fitzpatrick type-5 ~ female sign ~ '€'
    assert(graphemeStride("\u26F7\U0001f3FE\u2640€"d, 0) == 2);
    // skier ~ zero-width joiner ~ female sign ~ '€'
    assert(graphemeStride("\u26F7\u200D\u2640€"d, 0) == 3);
    // skier ~ emoji modifier fitzpatrick type-5 ~ zero-width joiner
    // ~ female sign ~ '€'
    assert(graphemeStride("\u26F7\U0001f3FE\u200D\u2640€"d, 0) == 4);
    // skier ~ zero-width joiner ~ '€'
    assert(graphemeStride("\u26F7\u200D€"d, 0) == 2);
    //'€' ~ zero-width joiner ~ skier
    assert(graphemeStride("€\u200D\u26F7"d, 0) == 2);
    // Kaithi number sign ~ Devanagari digit four ~ Devanagari digit two
    assert(graphemeStride("\U000110BD\u096A\u0968"d, 0) == 2);
    // Kaithi number sign ~ null
    assert(graphemeStride("\U000110BD\0"d, 0) == 1);
}

/++
    Reads one full grapheme cluster from an
    $(REF_ALTTEXT input range, isInputRange, std,range,primitives) of dchar `inp`.

    For examples see the $(LREF Grapheme) below.

    Note:
    This function modifies `inp` and thus `inp`
    must be an L-value.
+/
Grapheme decodeGrapheme(Input)(ref Input inp)
if (isInputRange!Input && is(immutable ElementType!Input == immutable dchar))
{
    return genericDecodeGrapheme!true(inp);
}

@safe unittest
{
    import std.algorithm.comparison : equal;

    Grapheme gr;
    string s = " \u0020\u0308 ";
    // every call consumes exactly one cluster from the front of `s`
    gr = decodeGrapheme(s);
    assert(gr.length == 1 && gr[0] == ' ');
    gr = decodeGrapheme(s);
    assert(gr.length == 2 && equal(gr[0 .. 2], " \u0308"));
    s = "\u0300\u0308\u1100";
    assert(equal(decodeGrapheme(s)[], "\u0300\u0308"));
    assert(equal(decodeGrapheme(s)[], "\u1100"));
    s = "\u11A8\u0308\uAC01";
    assert(equal(decodeGrapheme(s)[], "\u11A8\u0308"));
    assert(equal(decodeGrapheme(s)[], "\uAC01"));

    // Two Union Jacks of the Great Britain
    s = "\U0001F1EC\U0001F1E7\U0001F1EC\U0001F1E7";
    assert(equal(decodeGrapheme(s)[], "\U0001F1EC\U0001F1E7"));
}

/++
    $(P Iterate a string by $(LREF Grapheme).)

    $(P Useful for doing string manipulation that needs to be aware
    of graphemes.)

    The returned range decodes one cluster ahead: its `front` is the
    cluster most recently read from `range`, and it reports `empty` once
    that cluster has zero length. It is a forward range whenever `Range` is.

    See_Also:
        $(LREF byCodePoint)
+/
auto byGrapheme(Range)(Range range)
if (isInputRange!Range && is(immutable ElementType!Range == immutable dchar))
{
    // TODO: Bidirectional access
    static struct Result(R)
    {
        private R _source;         // code points not yet decoded
        private Grapheme _current; // cluster exposed through `front`

        @property bool empty()
        {
            return _current.length == 0;
        }

        @property Grapheme front()
        {
            return _current;
        }

        void popFront()
        {
            // An exhausted source yields the empty Grapheme, which is
            // exactly the condition `empty` tests for.
            if (_source.empty)
                _current = Grapheme.init;
            else
                _current = _source.decodeGrapheme();
        }

        static if (isForwardRange!R)
        {
            @property Result save()
            {
                return Result(_source.save, _current);
            }
        }
    }

    auto result = Result!(Range)(range);
    result.popFront(); // prime `front` with the first cluster
    return result;
}

///
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.range.primitives : walkLength;
    import std.range : take, drop;
    auto text = "noe\u0308l"; // noël using e + combining diaeresis
    assert(text.walkLength == 5); // 5 code points

    auto gText = text.byGrapheme;
    assert(gText.walkLength == 4); // 4 graphemes

    assert(gText.take(3).equal("noe\u0308".byGrapheme));
    assert(gText.drop(3).equal("l".byGrapheme));
}

// For testing non-forward-range input ranges
version (StdUnittest)
private static @safe struct InputRangeString
{
    private string s;

    @property bool empty() { return s.empty; }
    @property dchar front() { return s.front; }
    void popFront() { s.popFront(); }
}

@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.array : array;
    import std.range : retro;
    import std.range.primitives : walkLength;
    assert("".byGrapheme.walkLength == 0);

    auto reverse = "le\u0308on";
    assert(reverse.walkLength == 5);

    auto gReverse = reverse.byGrapheme;
    assert(gReverse.walkLength == 4);

    static foreach (text; AliasSeq!("noe\u0308l"c, "noe\u0308l"w, "noe\u0308l"d))
    {{
        assert(text.walkLength == 5);
        static assert(isForwardRange!(typeof(text)));

        auto gText = text.byGrapheme;
        static assert(isForwardRange!(typeof(gText)));
        assert(gText.walkLength == 4);
        assert(gText.array.retro.equal(gReverse));
    }}

    auto nonForwardRange = InputRangeString("noe\u0308l").byGrapheme;
    static
assert(!isForwardRange!(typeof(nonForwardRange)));
    assert(nonForwardRange.walkLength == 4);
}

// Issue 23474
@safe pure unittest
{
    import std.range.primitives : walkLength;
    assert(byGrapheme("\r\u0308").walkLength == 2);
}

/++
    $(P Lazily transform a range of $(LREF Grapheme)s to a range of code points.)

    $(P Useful for converting the result to a string after doing operations
    on graphemes.)

    $(P If passed in a range of code points, returns a range with equivalent capabilities.)
+/
auto byCodePoint(Range)(Range range)
if (isInputRange!Range && is(immutable ElementType!Range == immutable Grapheme))
{
    // TODO: Propagate bidirectional access
    static struct Result
    {
        private Range _range;
        private size_t i = 0; // index of the current code point inside _range.front

        bool empty() @property
        {
            return _range.empty;
        }

        dchar front() @property
        {
            return _range.front[i];
        }

        void popFront()
        {
            ++i;

            // Past the last code point of this cluster: move to the next one.
            if (i >= _range.front.length)
            {
                _range.popFront();
                i = 0;
            }
        }

        static if (isForwardRange!Range)
        {
            Result save() @property
            {
                return Result(_range.save, i);
            }
        }
    }

    return Result(range);
}

/// Ditto
auto byCodePoint(Range)(Range range)
if (isInputRange!Range && is(immutable ElementType!Range == immutable dchar))
{
    import std.range.primitives : isBidirectionalRange, popBack;
    import std.traits : isNarrowString;
    static if (isNarrowString!Range)
    {
        // Wrap narrow strings so that only the range primitives are exposed.
        static struct Result
        {
            private Range _range;
            @property bool empty() { return _range.empty; }
            @property dchar front(){ return _range.front; }
            void popFront(){ _range.popFront; }
            @property auto save() { return Result(_range.save); }
            @property dchar back(){ return _range.back; }
            void popBack(){ _range.popBack; }
        }
        static assert(isBidirectionalRange!(Result));
        return Result(range);
    }
    else
        return range;
}

///
@safe unittest
{
    import std.array : array;
    import std.conv : text;
    import std.range : retro;

    string s = "noe\u0308l"; // noël

    // reverse it and convert the result to a string
    string reverse = s.byGrapheme
        .array
        .retro
        .byCodePoint
        .text;

    assert(reverse == "le\u0308on"); // lëon
}

@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.range.primitives : walkLength;
    import std.range : retro;
    assert("".byGrapheme.byCodePoint.equal(""));

    string text = "noe\u0308l";
    static assert(!__traits(compiles, "noe\u0308l".byCodePoint.length));

    auto gText = InputRangeString(text).byGrapheme;
    static assert(!isForwardRange!(typeof(gText)));

    auto cpText = gText.byCodePoint;
    static assert(!isForwardRange!(typeof(cpText)));

    assert(cpText.walkLength == text.walkLength);

    auto plainCp = text.byCodePoint;
    static assert(isForwardRange!(typeof(plainCp)));
    assert(equal(plainCp, text));
    assert(equal(retro(plainCp.save), retro(text.save)));
    // Check that we still have length for dstring
    assert("абвгд"d.byCodePoint.length == 5);
}

/++
    $(P A structure designed to effectively pack $(CHARACTERS)
    of a $(CLUSTER).
    )

    $(P `Grapheme` has value semantics so 2 copies of a `Grapheme`
    always refer to distinct objects. In most actual scenarios a `Grapheme`
    fits on the stack and avoids memory allocation overhead for all but quite
    long clusters.
    )

    $(P Equality and hashing are defined on the stored $(CODEPOINTS), so a
    copy always compares equal to its original, whether or not the cluster
    is held in a heap buffer.
    )

    See_Also: $(LREF decodeGrapheme), $(LREF graphemeStride)
+/
@safe struct Grapheme
{
    import std.exception : enforce;
    import std.traits : isDynamicArray;

public:
    /// Ctor
    this(C)(const scope C[] chars...)
    if (is(C : dchar))
    {
        this ~= chars;
    }

    ///ditto
    this(Input)(Input seq)
    if (!isDynamicArray!Input
        && isInputRange!Input && is(ElementType!Input : dchar))
    {
        this ~= seq;
    }

    /// Gets a $(CODEPOINT) at the given index in this cluster.
    dchar opIndex(size_t index) const @nogc nothrow pure @trusted
    {
        assert(index < length);
        return read24(isBig ? ptr_ : small_.ptr, index);
    }

    /++
        Writes a $(CODEPOINT) `ch` at given index in this cluster.

        Warning:
        Use of this facility may invalidate grapheme cluster,
        see also $(LREF Grapheme.valid).
    +/
    void opIndexAssign(dchar ch, size_t index) @nogc nothrow pure @trusted
    {
        assert(index < length);
        write24(isBig ? ptr_ : small_.ptr, ch, index);
    }

    ///
    @safe unittest
    {
        auto g = Grapheme("A\u0302");
        assert(g[0] == 'A');
        assert(g.valid);
        g[1] = '~'; // ASCII tilda is not a combining mark
        assert(g[1] == '~');
        assert(!g.valid);
    }

    /++
        Random-access range over Grapheme's $(CHARACTERS).

        Warning: Invalidates when this Grapheme leaves the scope,
        attempts to use it then would lead to memory corruption.
    +/
    SliceOverIndexed!Grapheme opSlice(size_t a, size_t b) @nogc nothrow pure return
    {
        return sliceOverIndexed(a, b, &this);
    }

    /// ditto
    SliceOverIndexed!Grapheme opSlice() @nogc nothrow pure return
    {
        return sliceOverIndexed(0, length, &this);
    }

    /// Grapheme cluster length in $(CODEPOINTS).
    @property size_t length() const @nogc nothrow pure
    {
        return isBig ? len_ : smallLength;
    }

    /++
        Append $(CHARACTER) `ch` to this grapheme.
        Warning:
        Use of this facility may invalidate grapheme cluster,
        see also `valid`.

        See_Also: $(LREF Grapheme.valid)
    +/
    ref opOpAssign(string op)(dchar ch) @trusted
    {
        static if (op == "~")
        {
            import std.internal.memory : enforceRealloc;
            if (!isBig)
            {
                if (slen_ == small_cap)
                    convertToBig();// & fallthrough to "big" branch
                else
                {
                    write24(small_.ptr, ch, smallLength);
                    slen_++;
                    return this;
                }
            }

            assert(isBig);
            if (len_ == cap_)
            {
                import core.checkedint : addu, mulu;
                bool overflow;
                // Work on a local so cap_ only changes once the buffer
                // really has the new size.
                immutable newCap = addu(cap_, grow, overflow);
                immutable nbytes = mulu(3, addu(newCap, 1, overflow), overflow);
                if (overflow) assert(0);
                ptr_ = cast(ubyte*) enforceRealloc(ptr_, nbytes);
                cap_ = newCap;
            }
            write24(ptr_, ch, len_++);
            return this;
        }
        else
            static assert(false, "No operation "~op~" defined for Grapheme");
    }

    ///
    @safe unittest
    {
        import std.algorithm.comparison : equal;
        auto g = Grapheme("A");
        assert(g.valid);
        g ~= '\u0301';
        assert(g[].equal("A\u0301"));
        assert(g.valid);
        g ~= "B";
        // not a valid grapheme cluster anymore
        assert(!g.valid);
        // still could be useful though
        assert(g[].equal("A\u0301B"));
    }

    /// Append all $(CHARACTERS) from the input range `inp` to this Grapheme.
    ref opOpAssign(string op, Input)(scope Input inp)
    if (isInputRange!Input && is(ElementType!Input : dchar))
    {
        static if (op == "~")
        {
            foreach (dchar ch; inp)
                this ~= ch;
            return this;
        }
        else
            static assert(false, "No operation "~op~" defined for Grapheme");
    }

    /++
        Compares two graphemes by content: they are equal when they hold the
        same $(CODEPOINTS) in the same order.

        A member-wise comparison would compare the heap pointer of a long
        cluster, making a copy unequal to its original. For any other operand
        type the former member-wise comparison is kept.
    +/
    bool opEquals(R)(const auto ref R other) const @trusted
    {
        static if (is(immutable R == immutable Grapheme))
        {
            immutable n = length;
            if (n != other.length)
                return false;
            foreach (size_t i; 0 .. n)
            {
                if (this[i] != other[i])
                    return false;
            }
            return true;
        }
        else
            return this.tupleof == other.tupleof;
    }

    /// Hash over the stored $(CODEPOINTS); consistent with `opEquals`, allows AA usage.
    size_t toHash() const @trusted
    {
        immutable n = length;
        size_t h = hashOf(n);
        foreach (size_t i; 0 .. n)
            h = hashOf(this[i], h);
        return h;
    }

    /++
        True if this object contains valid extended grapheme cluster.
        Decoding primitives of this module always return a valid `Grapheme`.

        Appending to and direct manipulation of grapheme's $(CHARACTERS) may
        render it no longer valid. Certain applications may chose to use
        Grapheme as a "small string" of any $(CODEPOINTS) and ignore this property
        entirely.
    +/
    @property bool valid()() /*const*/
    {
        auto r = this[];
        genericDecodeGrapheme!false(r);
        // valid only if a single decode consumed every code point
        return r.length == 0;
    }

    this(this) @nogc nothrow pure @trusted
    {
        import std.internal.memory : enforceMalloc;
        if (isBig)
        {// dup it
            import core.checkedint : addu, mulu;
            bool overflow;
            auto raw_cap = mulu(3, addu(cap_, 1, overflow), overflow);
            if (overflow) assert(0);

            auto p = cast(ubyte*) enforceMalloc(raw_cap);
            p[0 .. raw_cap] = ptr_[0 .. raw_cap];
            ptr_ = p;
        }
    }

    ~this() @nogc nothrow pure @trusted
    {
        import core.memory : pureFree;
        if (isBig)
        {
            pureFree(ptr_);
        }
    }


private:
    enum small_bytes = ((ubyte*).sizeof+3*size_t.sizeof-1);
    // "out of the blue" grow rate, needs testing
    // (though graphemes are typically small < 9)
    enum grow = 20;
    enum small_cap = small_bytes/3;
    enum small_flag = 0x80, small_mask = 0x7F;
    // 16 bytes in 32bits, should be enough for the majority of cases
    union
    {
        struct
        {
            ubyte* ptr_;
            size_t cap_;
            size_t len_;
            size_t padding_;
        }
        struct
        {
            ubyte[small_bytes] small_;
            ubyte slen_; // low 7 bits: small length; top bit: "big" flag
        }
    }

    // Moves the inline code points into a freshly allocated buffer with
    // room for `grow` elements and switches this object to "big" mode.
    void convertToBig() @nogc nothrow pure @trusted
    {
        import std.internal.memory : enforceMalloc;
        static assert(grow.max / 3 - 1 >= grow);
        enum nbytes = 3 * (grow + 1);
        size_t k = smallLength;
        ubyte* p = cast(ubyte*) enforceMalloc(nbytes);
        for (size_t i = 0; i < k; i++)
            write24(p, read24(small_.ptr, i), i);
        // now we can overwrite small array data
        ptr_ = p;
        len_ = slen_;
        assert(grow > len_);
        cap_ = grow;
        setBig();
    }

    void setBig() @nogc nothrow pure { slen_ |= small_flag; }

    @property size_t smallLength() const @nogc nothrow pure
    {
        return slen_ & small_mask;
    }
    @property ubyte isBig() const @nogc nothrow pure
    {
        return slen_ & small_flag;
    }
}

static assert(Grapheme.sizeof == size_t.sizeof*4);


@safe pure /*nothrow @nogc*/ unittest // TODO: string .front is GC and throw
{
    import std.algorithm.comparison : equal;
    Grapheme[3] data = [Grapheme("Ю"), Grapheme("У"), Grapheme("З")];
    assert(byGrapheme("ЮУЗ").equal(data[]));
}

// Equality and hashing follow the content, also for heap-backed clusters.
@safe unittest
{
    auto a = Grapheme("АБВГДЕЁЖЗИКЛМ");
    auto b = Grapheme("АБВГДЕЁЖЗИКЛМ");
    assert(a == b);
    assert(a.toHash == b.toHash);

    Grapheme c = a;
    assert(c == a);
    c[0] = 'X';
    assert(c != a);

    assert(Grapheme("A\u0301") == Grapheme("A\u0301"));
    assert(Grapheme("A") != Grapheme("A\u0301"));
}

///
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.algorithm.iteration : filter;
7844 import std.range : isRandomAccessRange; 7845 7846 string bold = "ku\u0308hn"; 7847 7848 // note that decodeGrapheme takes parameter by ref 7849 auto first = decodeGrapheme(bold); 7850 7851 assert(first.length == 1); 7852 assert(first[0] == 'k'); 7853 7854 // the next grapheme is 2 characters long 7855 auto wideOne = decodeGrapheme(bold); 7856 // slicing a grapheme yields a random-access range of dchar 7857 assert(wideOne[].equal("u\u0308")); 7858 assert(wideOne.length == 2); 7859 static assert(isRandomAccessRange!(typeof(wideOne[]))); 7860 7861 // all of the usual range manipulation is possible 7862 assert(wideOne[].filter!isMark().equal("\u0308")); 7863 7864 auto g = Grapheme("A"); 7865 assert(g.valid); 7866 g ~= '\u0301'; 7867 assert(g[].equal("A\u0301")); 7868 assert(g.valid); 7869 g ~= "B"; 7870 // not a valid grapheme cluster anymore 7871 assert(!g.valid); 7872 // still could be useful though 7873 assert(g[].equal("A\u0301B")); 7874 } 7875 7876 @safe unittest 7877 { 7878 auto g = Grapheme("A\u0302"); 7879 assert(g[0] == 'A'); 7880 assert(g.valid); 7881 g[1] = '~'; // ASCII tilda is not a combining mark 7882 assert(g[1] == '~'); 7883 assert(!g.valid); 7884 } 7885 7886 @safe unittest 7887 { 7888 import std.algorithm.comparison : equal; 7889 import std.algorithm.iteration : map; 7890 import std.conv : text; 7891 import std.range : iota; 7892 7893 // not valid clusters (but it just a test) 7894 auto g = Grapheme('a', 'b', 'c', 'd', 'e'); 7895 assert(g[0] == 'a'); 7896 assert(g[1] == 'b'); 7897 assert(g[2] == 'c'); 7898 assert(g[3] == 'd'); 7899 assert(g[4] == 'e'); 7900 g[3] = 'Й'; 7901 assert(g[2] == 'c'); 7902 assert(g[3] == 'Й', text(g[3], " vs ", 'Й')); 7903 assert(g[4] == 'e'); 7904 assert(!g.valid); 7905 7906 g ~= 'ц'; 7907 g ~= '~'; 7908 assert(g[0] == 'a'); 7909 assert(g[1] == 'b'); 7910 assert(g[2] == 'c'); 7911 assert(g[3] == 'Й'); 7912 assert(g[4] == 'e'); 7913 assert(g[5] == 'ц'); 7914 assert(g[6] == '~'); 7915 assert(!g.valid); 7916 7917 Grapheme 
copy = g; 7918 copy[0] = 'X'; 7919 copy[1] = '-'; 7920 assert(g[0] == 'a' && copy[0] == 'X'); 7921 assert(g[1] == 'b' && copy[1] == '-'); 7922 assert(equal(g[2 .. g.length], copy[2 .. copy.length])); 7923 copy = Grapheme("АБВГДЕЁЖЗИКЛМ"); 7924 assert(equal(copy[0 .. 8], "АБВГДЕЁЖ"), text(copy[0 .. 8])); 7925 copy ~= "xyz"; 7926 assert(equal(copy[13 .. 15], "xy"), text(copy[13 .. 15])); 7927 assert(!copy.valid); 7928 7929 Grapheme h; 7930 foreach (dchar v; iota(cast(int)'A', cast(int)'Z'+1).map!"cast(dchar)a"()) 7931 h ~= v; 7932 assert(equal(h[], iota(cast(int)'A', cast(int)'Z'+1))); 7933 } 7934 7935 // ensure Grapheme can be used as an AA key. 7936 @safe unittest 7937 { 7938 int[Grapheme] aa; 7939 } 7940 7941 /++ 7942 $(P Does basic case-insensitive comparison of `r1` and `r2`. 7943 This function uses simpler comparison rule thus achieving better performance 7944 than $(LREF icmp). However keep in mind the warning below.) 7945 7946 Params: 7947 r1 = an $(REF_ALTTEXT input range, isInputRange, std,range,primitives) of characters 7948 r2 = an $(REF_ALTTEXT input range, isInputRange, std,range,primitives) of characters 7949 7950 Returns: 7951 An `int` that is 0 if the strings match, 7952 <0 if `r1` is lexicographically "less" than `r2`, 7953 >0 if `r1` is lexicographically "greater" than `r2` 7954 7955 Warning: 7956 This function only handles 1:1 $(CODEPOINT) mapping 7957 and thus is not sufficient for certain alphabets 7958 like German, Greek and few others. 
See_Also:
    $(LREF icmp)
    $(REF cmp, std,algorithm,comparison)
+/
int sicmp(S1, S2)(scope S1 r1, scope S2 r2)
if (isInputRange!S1 && isSomeChar!(ElementEncodingType!S1)
    && isInputRange!S2 && isSomeChar!(ElementEncodingType!S2))
{
    import std.internal.unicode_tables : sTable = simpleCaseTable; // generated file
    import std.range.primitives : isInfinite;
    import std.utf : decodeFront;
    import std.traits : isDynamicArray;
    import std.typecons : Yes;
    static import std.ascii;

    static if ((isDynamicArray!S1 || isRandomAccessRange!S1)
        && (isDynamicArray!S2 || isRandomAccessRange!S2)
        && !(isInfinite!S1 && isInfinite!S2)
        && __traits(compiles,
            {
                size_t s = size_t.sizeof / 2;
                r1 = r1[s .. $];
                r2 = r2[s .. $];
            }))
    {{
        // Indexable inputs: compare code units directly for as long as
        // both sides stay ASCII.
        static if (isInfinite!S1)
            immutable limit = r2.length;
        else static if (isInfinite!S2)
            immutable limit = r1.length;
        else
            immutable limit = r1.length > r2.length ? r2.length : r1.length;

        size_t pos = 0;
        bool needsDecoding = false;
        while (pos < limit)
        {
            auto a = r1[pos];
            auto b = r2[pos];
            if ((a | b) >= 0x80)
            {
                // first non-ASCII code unit: hand over to the decoding loop
                needsDecoding = true;
                break;
            }
            if (a != b)
            {
                auto lowDiff = std.ascii.toLower(a) - std.ascii.toLower(b);
                if (lowDiff) return lowDiff;
            }
            ++pos;
        }

        if (!needsDecoding)
        {
            // The common prefix matched; the longer input is the greater one.
            static if (isInfinite!S1)
                return 1;
            else static if (isInfinite!S2)
                return -1;
            else
                return (r1.length > r2.length) - (r2.length > r1.length);
        }

        // Drop what was already compared and continue below.
        r1 = r1[pos .. $];
        r2 = r2[pos .. $];
    }}

    while (!r1.empty)
    {
        immutable left = decodeFront!(Yes.useReplacementDchar)(r1);
        if (r2.empty)
            return 1;
        immutable right = decodeFront!(Yes.useReplacementDchar)(r2);
        if (left == right)
            continue;

        if ((left | right) < 0x80)
        {
            immutable asciiDiff = std.ascii.toLower(left) - std.ascii.toLower(right);
            if (asciiDiff)
                return asciiDiff;
            continue;
        }

        // simpleCaseTrie is packed index table
        immutable size_t li = simpleCaseTrie[left];
        immutable size_t ri = simpleCaseTrie[right];
        immutable leftCased = li != EMPTY_CASE_TRIE;
        immutable rightCased = ri != EMPTY_CASE_TRIE;

        if (leftCased && rightCased)
        {
            // rewind both indices to the start of their buckets
            immutable lBucket = li - sTable[li].n;
            immutable rBucket = ri - sTable[ri].n;
            if (lBucket == rBucket) // one bucket, equivalent chars
                continue;
            return sTable[lBucket].ch - sTable[rBucket].ch;
        }
        // at most one of the two is cased from here on
        if (leftCased)
            return sTable[li - sTable[li].n].ch - right;
        if (rightCased)
            return left - sTable[ri - sTable[ri].n].ch;
        return left - right;
    }
    return int(r2.empty) - 1;
}

///
@safe @nogc pure nothrow unittest
{
    assert(sicmp("Август", "авгусТ") == 0);
    // Greek also works as long as there is no 1:M mapping in sight
    assert(sicmp("ΌΎ", "όύ") == 0);
    // things like the following won't get matched as equal
    // Greek small letter iota with dialytika and tonos
    assert(sicmp("ΐ", "\u03B9\u0308\u0301") != 0);

    // while icmp has no problem with that
    assert(icmp("ΐ", "\u03B9\u0308\u0301") == 0);
    assert(icmp("ΌΎ", "όύ") == 0);
}

// overloads for the most common cases to reduce compile time
@safe @nogc pure nothrow
{
    int sicmp(scope const(char)[] str1, scope const(char)[] str2)
    { return sicmp!(const(char)[], const(char)[])(str1, str2); }

    int sicmp(scope const(wchar)[] str1, scope
const(wchar)[] str2)
    { return sicmp!(const(wchar)[], const(wchar)[])(str1, str2); }

    int sicmp(scope const(dchar)[] str1, scope const(dchar)[] str2)
    { return sicmp!(const(dchar)[], const(dchar)[])(str1, str2); }
}

// Tries to match `lhs` against `rhs` under full case folding.
// Returns 0 on a match; in the multi-code-point case the rest of the
// sequence is then consumed from `rtail`. Otherwise returns `lhs` itself
// when it has no entry in the trie, or the first sequence element of its
// bucket, so that callers can compute a stable difference.
private int fullCasedCmp(Range)(dchar lhs, dchar rhs, ref Range rtail)
{
    import std.algorithm.searching : skipOver;
    import std.internal.unicode_tables : fullCaseTable; // generated file
    alias fTable = fullCaseTable;

    // fullCaseTrie is packed index table
    immutable size_t hit = fullCaseTrie[lhs];
    if (hit == EMPTY_CASE_TRIE)
        return lhs;

    immutable first = hit - fTable[hit].n;
    immutable last = fTable[hit].size + first;
    assert(fTable[first].entry_len == 1);

    foreach (size_t i; first .. last)
    {
        immutable n = fTable[i].entry_len;
        if (n == 1)
        {
            if (fTable[i].seq[0] == rhs)
                return 0;
            continue;
        }
        // OK it's a long chunk, like 'ss' for German
        dstring seq = fTable[i].seq[0 .. n];
        // note that skipOver modifies rtail iff the whole tail matches
        if (rhs == seq[0] && rtail.skipOver(seq[1 .. $]))
            return 0;
    }
    return fTable[first].seq[0]; // new remapped character for accurate diffs
}

/++
    Does case insensitive comparison of `r1` and `r2`.
    Follows the rules of full case-folding mapping.
    This includes matching as equal german ß with "ss" and
    other 1:M $(CODEPOINT) mappings unlike $(LREF sicmp).
    The cost of `icmp` being pedantically correct is
    slightly worse performance.
Params:
    r1 = a forward range of characters
    r2 = a forward range of characters

Returns:
    An `int` that is 0 if the strings match,
    &lt;0 if `r1` is lexicographically "less" than `r2`,
    &gt;0 if `r1` is lexicographically "greater" than `r2`

See_Also:
    $(LREF sicmp)
    $(REF cmp, std,algorithm,comparison)
+/
int icmp(S1, S2)(S1 r1, S2 r2)
if (isForwardRange!S1 && isSomeChar!(ElementEncodingType!S1)
    && isForwardRange!S2 && isSomeChar!(ElementEncodingType!S2))
{
    import std.range.primitives : isInfinite;
    import std.traits : isDynamicArray;
    import std.utf : byDchar;
    static import std.ascii;

    static if ((isDynamicArray!S1 || isRandomAccessRange!S1)
        && (isDynamicArray!S2 || isRandomAccessRange!S2)
        && !(isInfinite!S1 && isInfinite!S2)
        && __traits(compiles,
            {
                size_t s = size_t.max / 2;
                r1 = r1[s .. $];
                r2 = r2[s .. $];
            }))
    {{
        // Indexable inputs: compare code units directly for as long as
        // both sides stay ASCII.
        static if (isInfinite!S1)
            immutable limit = r2.length;
        else static if (isInfinite!S2)
            immutable limit = r1.length;
        else
            immutable limit = r1.length > r2.length ? r2.length : r1.length;

        size_t pos = 0;
        bool needsDecoding = false;
        while (pos < limit)
        {
            auto a = r1[pos];
            auto b = r2[pos];
            if ((a | b) >= 0x80)
            {
                // first non-ASCII code unit: hand over to the decoding loop
                needsDecoding = true;
                break;
            }
            if (a != b)
            {
                auto lowDiff = std.ascii.toLower(a) - std.ascii.toLower(b);
                if (lowDiff) return lowDiff;
            }
            ++pos;
        }

        if (!needsDecoding)
        {
            // The common prefix matched; the longer input is the greater one.
            static if (isInfinite!S1)
                return 1;
            else static if (isInfinite!S2)
                return -1;
            else
                return (r1.length > r2.length) - (r2.length > r1.length);
        }

        // Drop what was already compared and continue below.
        r1 = r1[pos .. $];
        r2 = r2[pos .. $];
    }}

    auto left = r1.byDchar;
    auto right = r2.byDchar;

    while (true)
    {
        if (left.empty)
            return right.empty ? 0 : -1;
        immutable lhs = left.front;
        if (right.empty)
            return 1;
        immutable rhs = right.front;
        left.popFront();
        right.popFront();
        if (lhs == rhs)
            continue;

        // first try to match lhs to <rhs,right-tail> sequence
        immutable cmpLR = fullCasedCmp(lhs, rhs, right);
        if (cmpLR == 0)
            continue;
        // then rhs to <lhs,left-tail> sequence
        immutable cmpRL = fullCasedCmp(rhs, lhs, left);
        if (cmpRL == 0)
            continue;
        // cmpXX contain remapped codepoints
        // to obtain stable ordering of icmp
        return cmpLR - cmpRL;
    }
}

///
@safe @nogc pure nothrow unittest
{
    assert(icmp("Rußland", "Russland") == 0);
    assert(icmp("ᾩ -> \u1F70\u03B9", "\u1F61\u03B9 -> ᾲ") == 0);
}

/**
 * By using $(REF byUTF, std,utf) and its aliases, GC allocations via auto-decoding
 * and thrown exceptions can be avoided, making `icmp` `@safe @nogc nothrow pure`.
 */
@safe @nogc nothrow pure unittest
{
    import std.utf : byDchar;

    assert(icmp("Rußland".byDchar, "Russland".byDchar) == 0);
    assert(icmp("ᾩ -> \u1F70\u03B9".byDchar, "\u1F61\u03B9 -> ᾲ".byDchar) == 0);
}

// test different character types
@safe unittest
{
    assert(icmp("Rußland", "Russland") == 0);
    assert(icmp("Rußland"w, "Russland") == 0);
    assert(icmp("Rußland", "Russland"w) == 0);
    assert(icmp("Rußland"w, "Russland"w) == 0);
    assert(icmp("Rußland"d, "Russland"w) == 0);
    assert(icmp("Rußland"w, "Russland"d) == 0);
}

// overloads for the most common cases to reduce compile time
@safe @nogc pure nothrow
{
    int icmp(const(char)[] str1, const(char)[] str2)
    { return icmp!(const(char)[], const(char)[])(str1, str2); }
    int icmp(const(wchar)[] str1, const(wchar)[] str2)
    { return icmp!(const(wchar)[], const(wchar)[])(str1, str2); }
    int icmp(const(dchar)[] str1, const(dchar)[] str2)
    { return
icmp!(const(dchar)[], const(dchar)[])(str1, str2); } 8261 } 8262 8263 @safe unittest 8264 { 8265 import std.algorithm.sorting : sort; 8266 import std.conv : to; 8267 import std.exception : assertCTFEable; 8268 assertCTFEable!( 8269 { 8270 static foreach (cfunc; AliasSeq!(icmp, sicmp)) 8271 {{ 8272 static foreach (S1; AliasSeq!(string, wstring, dstring)) 8273 static foreach (S2; AliasSeq!(string, wstring, dstring)) 8274 { 8275 assert(cfunc("".to!S1(), "".to!S2()) == 0); 8276 assert(cfunc("A".to!S1(), "".to!S2()) > 0); 8277 assert(cfunc("".to!S1(), "0".to!S2()) < 0); 8278 assert(cfunc("abc".to!S1(), "abc".to!S2()) == 0); 8279 assert(cfunc("abcd".to!S1(), "abc".to!S2()) > 0); 8280 assert(cfunc("abc".to!S1(), "abcd".to!S2()) < 0); 8281 assert(cfunc("Abc".to!S1(), "aBc".to!S2()) == 0); 8282 assert(cfunc("авГуст".to!S1(), "АВгУСТ".to!S2()) == 0); 8283 // Check example: 8284 assert(cfunc("Август".to!S1(), "авгусТ".to!S2()) == 0); 8285 assert(cfunc("ΌΎ".to!S1(), "όύ".to!S2()) == 0); 8286 } 8287 // check that the order is properly agnostic to the case 8288 auto strs = [ "Apple", "ORANGE", "orAcle", "amp", "banana"]; 8289 sort!((a,b) => cfunc(a,b) < 0)(strs); 8290 assert(strs == ["amp", "Apple", "banana", "orAcle", "ORANGE"]); 8291 }} 8292 assert(icmp("ßb", "ssa") > 0); 8293 // Check example: 8294 assert(icmp("Russland", "Rußland") == 0); 8295 assert(icmp("ᾩ -> \u1F70\u03B9", "\u1F61\u03B9 -> ᾲ") == 0); 8296 assert(icmp("ΐ"w, "\u03B9\u0308\u0301") == 0); 8297 assert(sicmp("ΐ", "\u03B9\u0308\u0301") != 0); 8298 // https://issues.dlang.org/show_bug.cgi?id=11057 8299 assert( icmp("K", "L") < 0 ); 8300 }); 8301 } 8302 8303 // https://issues.dlang.org/show_bug.cgi?id=17372 8304 @safe pure unittest 8305 { 8306 import std.algorithm.iteration : joiner, map; 8307 import std.algorithm.sorting : sort; 8308 import std.array : array; 8309 auto a = [["foo", "bar"], ["baz"]].map!(line => line.joiner(" ")).array.sort!((a, b) => icmp(a, b) < 0); 8310 } 8311 8312 // This is package(std) for 
// the moment to be used as a support tool for std.regex
// It needs a better API
/*
    Return a range of all $(CODEPOINTS) that casefold to
    and from this `ch`.

    A `ch` without an entry in the simple case table yields a range holding
    just `ch` itself. This includes U+0000, which previously produced an
    empty range because 0 doubled as the "consumed" marker.
*/
package(std) auto simpleCaseFoldings(dchar ch) @safe
{
    import std.internal.unicode_tables : simpleCaseTable; // generated file
    alias sTable = simpleCaseTable;
    static struct Range
    {
    @safe pure nothrow:
        // Values of `idx` reserved for the single-code-point mode; a real
        // table index is never expected to reach them.
        private enum uint singlePending = uint.max;      // `c` not consumed yet
        private enum uint singleConsumed = uint.max - 1; // `c` already popped

        uint idx; // >= singleConsumed: single mode (read c), else table index
        union
        {
            dchar c;  // the only element in single mode
            uint len; // elements left in table mode
        }
        @property bool isSmall() const { return idx >= singleConsumed; }

        this(dchar ch)
        {
            idx = singlePending;
            c = ch;
        }

        this(uint start, uint size)
        {
            idx = start;
            len = size;
        }

        @property dchar front() const
        {
            assert(!empty);
            if (isSmall)
            {
                return c;
            }
            auto ch = sTable[idx].ch;
            return ch;
        }

        @property bool empty() const
        {
            if (isSmall)
            {
                return idx == singleConsumed;
            }
            return len == 0;
        }

        @property size_t length() const
        {
            if (isSmall)
            {
                return idx == singleConsumed ? 0 : 1;
            }
            return len;
        }

        void popFront()
        {
            if (isSmall)
                idx = singleConsumed;
            else
            {
                idx++;
                len--;
            }
        }
    }
    immutable idx = simpleCaseTrie[ch];
    if (idx == EMPTY_CASE_TRIE)
        return Range(ch);
    auto entry = sTable[idx];
    // entry.n is the offset of this entry inside its bucket
    immutable start = idx - entry.n;
    return Range(start, entry.size);
}

@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.algorithm.searching : canFind;
    import std.array : array;
    import std.exception : assertCTFEable;
    assertCTFEable!((){
        auto r = simpleCaseFoldings('Э').array;
        assert(r.length == 2);
        assert(r.canFind('э') && r.canFind('Э'));
        auto sr = simpleCaseFoldings('~');
        assert(sr.equal("~"));
        //A with ring above - casefolds to the same bucket as Angstrom sign
        sr = simpleCaseFoldings('Å');
        assert(sr.length == 3);
        assert(sr.canFind('å') && sr.canFind('Å') && sr.canFind('\u212B'));
        // U+0000 folds to itself like any other uncased code point
        auto nul = simpleCaseFoldings('\0');
        assert(!nul.empty && nul.length == 1 && nul.front == '\0');
        nul.popFront();
        assert(nul.empty && nul.length == 0);
    });
}

/++
    $(P Returns the $(S_LINK Combining class, combining class) of `ch`.)
+/
ubyte combiningClass(dchar ch) @safe pure nothrow @nogc
{
    return combiningClassTrie[ch];
}

///
@safe unittest
{
    // shorten the code
    alias CC = combiningClass;

    // combining tilda
    assert(CC('\u0303') == 230);
    // combining ring below
    assert(CC('\u0325') == 220);
    // the simple consequence is that  "tilda" should be
    // placed after a "ring below" in a sequence
}

@safe pure nothrow @nogc unittest
{
    foreach (ch; 0 .. 0x80)
        assert(combiningClass(ch) == 0);
    assert(combiningClass('\u05BD') == 22);
    assert(combiningClass('\u0300') == 230);
    assert(combiningClass('\u0317') == 220);
    assert(combiningClass('\u1939') == 222);
}

/// Unicode character decomposition type.
enum UnicodeDecomposition {
    /// Canonical decomposition.
/// The result is a canonically equivalent sequence.
    Canonical,
    /**
       Compatibility decomposition. The result is a compatibility equivalent sequence.
       Note: Compatibility decomposition is a $(B lossy) conversion,
       typically suitable only for fuzzy matching and internal processing.
    */
    Compatibility
}

/**
    Shorthand aliases for character decomposition type, passed as a
    template parameter to $(LREF decompose).
*/
enum {
    Canonical = UnicodeDecomposition.Canonical,
    Compatibility = UnicodeDecomposition.Compatibility
}

/++
    Try to canonically compose 2 $(CHARACTERS).
    Returns the composed $(CHARACTER) if they do compose and dchar.init otherwise.

    The assumption is that `first` comes before `second` in the original text,
    usually meaning that the first is a starter.

    Note: Hangul syllables are not covered by this function.
    See `composeJamo` below.
+/
public dchar compose(dchar first, dchar second) pure nothrow @safe
{
    import std.internal.unicode_comp : compositionTable, composeCntShift, composeIdxMask;

    immutable packed = compositionJumpTrie[first];
    if (packed == ushort.max)
        return dchar.init;

    // `packed` carries the start and the size of the table run that lists
    // every composition beginning with `first`
    immutable base = packed & composeIdxMask;
    immutable count = packed >> composeCntShift;

    // lower-bound binary search for `second` among the run's `rhs` values
    size_t lo = 0;
    size_t hi = count;
    while (lo < hi)
    {
        immutable mid = lo + (hi - lo) / 2;
        if (compositionTable[base + mid].rhs < second)
            lo = mid + 1;
        else
            hi = mid;
    }
    if (lo == count)
        return dchar.init;

    immutable candidate = compositionTable[base + lo];
    return candidate.rhs == second ? candidate.composed : dchar.init;
}

///
@safe unittest
{
    assert(compose('A','\u0308') == '\u00C4');
    assert(compose('A', 'B') == dchar.init);
    assert(compose('C', '\u0301') == '\u0106');
    // note that the starter is the first one
    // thus the following doesn't compose
    assert(compose('\u0308', 'A') == dchar.init);
}

/++
    Returns a full $(S_LINK Canonical decomposition, Canonical)
    (by default) or $(S_LINK Compatibility decomposition, Compatibility)
    decomposition of $(CHARACTER) `ch`.
    If no decomposition is available returns a $(LREF Grapheme)
    with the `ch` itself.

    Note:
    This function also decomposes hangul syllables
    as prescribed by the standard.

    See_Also: $(LREF decomposeHangul) for a restricted version
    that takes into account only hangul syllables but
    no other decompositions.
8521 +/ 8522 public Grapheme decompose(UnicodeDecomposition decompType=Canonical)(dchar ch) @safe 8523 { 8524 import std.algorithm.searching : until; 8525 import std.internal.unicode_decomp : decompCompatTable, decompCanonTable; 8526 static if (decompType == Canonical) 8527 { 8528 alias table = decompCanonTable; 8529 alias mapping = canonMappingTrie; 8530 } 8531 else static if (decompType == Compatibility) 8532 { 8533 alias table = decompCompatTable; 8534 alias mapping = compatMappingTrie; 8535 } 8536 immutable idx = mapping[ch]; 8537 if (!idx) // not found, check hangul arithmetic decomposition 8538 return decomposeHangul(ch); 8539 auto decomp = table[idx..$].until(0); 8540 return Grapheme(decomp); 8541 } 8542 8543 /// 8544 @safe unittest 8545 { 8546 import std.algorithm.comparison : equal; 8547 8548 assert(compose('A','\u0308') == '\u00C4'); 8549 assert(compose('A', 'B') == dchar.init); 8550 assert(compose('C', '\u0301') == '\u0106'); 8551 // note that the starter is the first one 8552 // thus the following doesn't compose 8553 assert(compose('\u0308', 'A') == dchar.init); 8554 8555 assert(decompose('Ĉ')[].equal("C\u0302")); 8556 assert(decompose('D')[].equal("D")); 8557 assert(decompose('\uD4DC')[].equal("\u1111\u1171\u11B7")); 8558 assert(decompose!Compatibility('¹')[].equal("1")); 8559 } 8560 8561 //---------------------------------------------------------------------------- 8562 // Hangul specific composition/decomposition 8563 enum jamoSBase = 0xAC00; 8564 enum jamoLBase = 0x1100; 8565 enum jamoVBase = 0x1161; 8566 enum jamoTBase = 0x11A7; 8567 enum jamoLCount = 19, jamoVCount = 21, jamoTCount = 28; 8568 enum jamoNCount = jamoVCount * jamoTCount; 8569 enum jamoSCount = jamoLCount * jamoNCount; 8570 8571 // Tests if `ch` is a Hangul leading consonant jamo. 
8572 bool isJamoL(dchar ch) pure nothrow @nogc @safe 8573 { 8574 // first cmp rejects ~ 1M code points above leading jamo range 8575 return ch < jamoLBase+jamoLCount && ch >= jamoLBase; 8576 } 8577 8578 // Tests if `ch` is a Hangul trailing consonant jamo. 8579 bool isJamoT(dchar ch) pure nothrow @nogc @safe 8580 { 8581 // first cmp rejects ~ 1M code points above trailing jamo range 8582 // Note: ch == jamoTBase doesn't indicate trailing jamo (TIndex must be > 0) 8583 return ch < jamoTBase+jamoTCount && ch > jamoTBase; 8584 } 8585 8586 // Tests if `ch` is a Hangul vowel jamo. 8587 bool isJamoV(dchar ch) pure nothrow @nogc @safe 8588 { 8589 // first cmp rejects ~ 1M code points above vowel range 8590 return ch < jamoVBase+jamoVCount && ch >= jamoVBase; 8591 } 8592 // Returns the offset of `ch` from jamoSBase if it lies in [0, jamoSCount), otherwise -1. 8593 int hangulSyllableIndex(dchar ch) pure nothrow @nogc @safe 8594 { 8595 int idxS = cast(int) ch - jamoSBase; 8596 return idxS >= 0 && idxS < jamoSCount ? idxS : -1; 8597 } 8598 8599 // internal helper: compose hangul syllables leaving dchar.init in holes 8600 void hangulRecompose(scope dchar[] seq) pure nothrow @nogc @safe 8601 { 8602 for (size_t idx = 0; idx + 1 < seq.length; ) 8603 { 8604 if (isJamoL(seq[idx]) && isJamoV(seq[idx+1])) 8605 { 8606 immutable int indexL = seq[idx] - jamoLBase; 8607 immutable int indexV = seq[idx+1] - jamoVBase; 8608 immutable int indexLV = indexL * jamoNCount + indexV * jamoTCount; 8609 if (idx + 2 < seq.length && isJamoT(seq[idx+2])) 8610 { 8611 seq[idx] = jamoSBase + indexLV + seq[idx+2] - jamoTBase; 8612 seq[idx+1] = dchar.init; 8613 seq[idx+2] = dchar.init; 8614 idx += 3; 8615 } 8616 else 8617 { 8618 seq[idx] = jamoSBase + indexLV; 8619 seq[idx+1] = dchar.init; 8620 idx += 2; 8621 } 8622 } 8623 else 8624 idx++; 8625 } 8626 } 8627 8628 //---------------------------------------------------------------------------- 8629 public: 8630 8631 /** 8632 Decomposes a Hangul syllable.
If `ch` is not a composed syllable 8633 then this function returns $(LREF Grapheme) containing only `ch` as is. 8634 */ 8635 Grapheme decomposeHangul(dchar ch) nothrow pure @safe 8636 { 8637 immutable idxS = cast(int) ch - jamoSBase; 8638 if (idxS < 0 || idxS >= jamoSCount) return Grapheme(ch); 8639 immutable idxL = idxS / jamoNCount; 8640 immutable idxV = (idxS % jamoNCount) / jamoTCount; 8641 immutable idxT = idxS % jamoTCount; 8642 8643 immutable partL = jamoLBase + idxL; 8644 immutable partV = jamoVBase + idxV; 8645 if (idxT > 0) // there is a trailling consonant (T); <L,V,T> decomposition 8646 return Grapheme(partL, partV, jamoTBase + idxT); 8647 else // <L, V> decomposition 8648 return Grapheme(partL, partV); 8649 } 8650 8651 /// 8652 @safe unittest 8653 { 8654 import std.algorithm.comparison : equal; 8655 assert(decomposeHangul('\uD4DB')[].equal("\u1111\u1171\u11B6")); 8656 } 8657 8658 /++ 8659 Try to compose hangul syllable out of a leading consonant (`lead`), 8660 a `vowel` and optional `trailing` consonant jamos. 8661 8662 On success returns the composed LV or LVT hangul syllable. 8663 8664 If any of `lead` and `vowel` are not a valid hangul jamo 8665 of the respective $(CHARACTER) class returns dchar.init. 8666 +/ 8667 dchar composeJamo(dchar lead, dchar vowel, dchar trailing=dchar.init) pure nothrow @nogc @safe 8668 { 8669 if (!isJamoL(lead)) 8670 return dchar.init; 8671 immutable indexL = lead - jamoLBase; 8672 if (!isJamoV(vowel)) 8673 return dchar.init; 8674 immutable indexV = vowel - jamoVBase; 8675 immutable indexLV = indexL * jamoNCount + indexV * jamoTCount; 8676 immutable dchar syllable = jamoSBase + indexLV; 8677 return isJamoT(trailing) ? 
syllable + (trailing - jamoTBase) : syllable; 8678 } 8679 8680 /// 8681 @safe unittest 8682 { 8683 assert(composeJamo('\u1111', '\u1171', '\u11B6') == '\uD4DB'); 8684 // leaving out T-vowel, or passing any codepoint 8685 // that is not trailing consonant composes an LV-syllable 8686 assert(composeJamo('\u1111', '\u1171') == '\uD4CC'); 8687 assert(composeJamo('\u1111', '\u1171', ' ') == '\uD4CC'); 8688 assert(composeJamo('\u1111', 'A') == dchar.init); 8689 assert(composeJamo('A', '\u1171') == dchar.init); 8690 } 8691 8692 @safe unittest 8693 { 8694 import std.algorithm.comparison : equal; 8695 import std.conv : text; 8696 8697 static void testDecomp(UnicodeDecomposition T)(dchar ch, string r) 8698 { 8699 Grapheme g = decompose!T(ch); 8700 assert(equal(g[], r), text(g[], " vs ", r)); 8701 } 8702 testDecomp!Canonical('\u1FF4', "\u03C9\u0301\u0345"); 8703 testDecomp!Canonical('\uF907', "\u9F9C"); 8704 testDecomp!Compatibility('\u33FF', "\u0067\u0061\u006C"); 8705 testDecomp!Compatibility('\uA7F9', "\u0153"); 8706 8707 // check examples 8708 assert(decomposeHangul('\uD4DB')[].equal("\u1111\u1171\u11B6")); 8709 assert(composeJamo('\u1111', '\u1171', '\u11B6') == '\uD4DB'); 8710 assert(composeJamo('\u1111', '\u1171') == '\uD4CC'); // leave out T-vowel 8711 assert(composeJamo('\u1111', '\u1171', ' ') == '\uD4CC'); 8712 assert(composeJamo('\u1111', 'A') == dchar.init); 8713 assert(composeJamo('A', '\u1171') == dchar.init); 8714 } 8715 8716 /** 8717 Enumeration type for normalization forms, 8718 passed as template parameter for functions like $(LREF normalize). 8719 */ 8720 enum NormalizationForm { 8721 NFC, 8722 NFD, 8723 NFKC, 8724 NFKD 8725 } 8726 8727 8728 enum { 8729 /** 8730 Shorthand aliases from values indicating normalization forms. 
8731 */ 8732 NFC = NormalizationForm.NFC, 8733 ///ditto 8734 NFD = NormalizationForm.NFD, 8735 ///ditto 8736 NFKC = NormalizationForm.NFKC, 8737 ///ditto 8738 NFKD = NormalizationForm.NFKD 8739 } 8740 8741 /++ 8742 Returns `input` string normalized to the chosen form. 8743 Form C is used by default. 8744 8745 For more information on normalization forms see 8746 the $(S_LINK Normalization, normalization section). 8747 8748 Note: 8749 In cases where the string in question is already normalized, 8750 it is returned unmodified and no memory allocation happens. 8751 +/ 8752 /* 8753 WARNING: @trusted lambda inside - handle with same care as @trusted 8754 functions 8755 8756 Despite being a template, the attributes do no harm since this doesn't work 8757 with user-defined range or character types anyway. 8758 */ 8759 pure @safe inout(C)[] normalize(NormalizationForm norm=NFC, C) 8760 (return scope inout(C)[] input) 8761 { 8762 import std.algorithm.mutation : SwapStrategy; 8763 import std.algorithm.sorting : sort; 8764 import std.array : appender; 8765 import std.range : zip; 8766 8767 auto anchors = splitNormalized!norm(input); 8768 if (anchors[0] == input.length && anchors[1] == input.length) 8769 return input; 8770 dchar[] decomposed; 8771 decomposed.reserve(31); 8772 ubyte[] ccc; 8773 ccc.reserve(31); 8774 auto app = appender!(C[])(); 8775 do 8776 { 8777 app.put(input[0 .. 
anchors[0]]); 8778 foreach (dchar ch; input[anchors[0]..anchors[1]]) 8779 static if (norm == NFD || norm == NFC) 8780 { 8781 foreach (dchar c; decompose!Canonical(ch)[]) 8782 decomposed ~= c; 8783 } 8784 else // NFKD & NFKC 8785 { 8786 foreach (dchar c; decompose!Compatibility(ch)[]) 8787 decomposed ~= c; 8788 } 8789 ccc.length = decomposed.length; 8790 size_t firstNonStable = 0; 8791 ubyte lastClazz = 0; 8792 8793 foreach (idx, dchar ch; decomposed) 8794 { 8795 immutable clazz = combiningClass(ch); 8796 ccc[idx] = clazz; 8797 if (clazz == 0 && lastClazz != 0) 8798 { 8799 // found a stable code point after unstable ones 8800 sort!("a[0] < b[0]", SwapStrategy.stable) 8801 (zip(ccc[firstNonStable .. idx], decomposed[firstNonStable .. idx])); 8802 firstNonStable = decomposed.length; 8803 } 8804 else if (clazz != 0 && lastClazz == 0) 8805 { 8806 // found first unstable code point after stable ones 8807 firstNonStable = idx; 8808 } 8809 lastClazz = clazz; 8810 } 8811 sort!("a[0] < b[0]", SwapStrategy.stable) 8812 (zip(ccc[firstNonStable..$], decomposed[firstNonStable..$])); 8813 static if (norm == NFC || norm == NFKC) 8814 { 8815 import std.algorithm.searching : countUntil; 8816 auto first = countUntil(ccc, 0); 8817 if (first >= 0) // no starters?? no recomposition 8818 { 8819 for (;;) 8820 { 8821 immutable second = recompose(first, decomposed, ccc); 8822 if (second == decomposed.length) 8823 break; 8824 first = second; 8825 } 8826 // 2nd pass for hangul syllables 8827 hangulRecompose(decomposed); 8828 } 8829 } 8830 static if (norm == NFD || norm == NFKD) 8831 app.put(decomposed); 8832 else 8833 { 8834 import std.algorithm.mutation : remove; 8835 auto clean = remove!("a == dchar.init", SwapStrategy.stable)(decomposed); 8836 app.put(decomposed[0 .. clean.length]); 8837 } 8838 // reset variables 8839 decomposed.length = 0; 8840 () @trusted { 8841 // assumeSafeAppend isn't considered pure as of writing, hence the 8842 // cast. 
It isn't pure in the sense that the elements after 8843 // the array in question are affected, but we don't use those 8844 // making the call pure for our purposes. 8845 (cast(void delegate() pure nothrow) {decomposed.assumeSafeAppend();})(); 8846 ccc.length = 0; 8847 (cast(void delegate() pure nothrow) {ccc.assumeSafeAppend();})(); 8848 } (); 8849 input = input[anchors[1]..$]; 8850 // and move on 8851 anchors = splitNormalized!norm(input); 8852 } while (anchors[0] != input.length); 8853 app.put(input[0 .. anchors[0]]); 8854 return () @trusted inout { return cast(inout(C)[]) app.data; } (); 8855 } 8856 8857 /// 8858 @safe pure unittest 8859 { 8860 // any encoding works 8861 wstring greet = "Hello world"; 8862 assert(normalize(greet) is greet); // the same exact slice 8863 8864 // An example of a character with all 4 forms being different: 8865 // Greek upsilon with acute and hook symbol (code point 0x03D3) 8866 assert(normalize!NFC("ϓ") == "\u03D3"); 8867 assert(normalize!NFD("ϓ") == "\u03D2\u0301"); 8868 assert(normalize!NFKC("ϓ") == "\u038E"); 8869 assert(normalize!NFKD("ϓ") == "\u03A5\u0301"); 8870 } 8871 8872 @safe pure unittest 8873 { 8874 import std.conv : text; 8875 8876 assert(normalize!NFD("abc\uF904def") == "abc\u6ED1def", text(normalize!NFD("abc\uF904def"))); 8877 assert(normalize!NFKD("2¹⁰") == "210", normalize!NFKD("2¹⁰")); 8878 assert(normalize!NFD("Äffin") == "A\u0308ffin"); 8879 8880 // test with dstring 8881 dstring greet = "Hello world"; 8882 assert(normalize(greet) is greet); // the same exact slice 8883 } 8884 8885 // canonically recompose given slice of code points, works in-place and mutates data 8886 private size_t recompose(size_t start, scope dchar[] input, scope ubyte[] ccc) pure nothrow @safe 8887 { 8888 assert(input.length == ccc.length); 8889 int accumCC = -1;// so that it's out of 0 .. 
255 range 8890 // writefln("recomposing %( %04x %)", input); 8891 // first one is always a starter thus we start at i == 1 8892 size_t i = start+1; 8893 for (; ; ) 8894 { 8895 if (i == input.length) 8896 break; 8897 immutable curCC = ccc[i]; 8898 // In any character sequence beginning with a starter S 8899 // a character C is blocked from S if and only if there 8900 // is some character B between S and C, and either B 8901 // is a starter or it has the same or higher combining class as C. 8902 //------------------------ 8903 // Applying to our case: 8904 // S is input[0] 8905 // accumCC is the maximum CCC of characters between C and S, 8906 // as ccc are sorted 8907 // C is input[i] 8908 8909 if (curCC > accumCC) 8910 { 8911 immutable comp = compose(input[start], input[i]); 8912 if (comp != dchar.init) 8913 { 8914 input[start] = comp; 8915 input[i] = dchar.init;// put a sentinel 8916 // current was merged so its CCC shouldn't affect 8917 // composing with the next one 8918 } 8919 else 8920 { 8921 // if it was a starter then accumCC is now 0, end of loop 8922 accumCC = curCC; 8923 if (accumCC == 0) 8924 break; 8925 } 8926 } 8927 else 8928 { 8929 // ditto here 8930 accumCC = curCC; 8931 if (accumCC == 0) 8932 break; 8933 } 8934 i++; 8935 } 8936 return i; 8937 } 8938 8939 // returns tuple of 2 indexes that delimit: 8940 // normalized text, piece that needs normalization and 8941 // the rest of input starting with stable code point 8942 private auto splitNormalized(NormalizationForm norm, C)(scope const(C)[] input) 8943 { 8944 import std.typecons : tuple; 8945 ubyte lastCC = 0; 8946 8947 foreach (idx, dchar ch; input) 8948 { 8949 static if (norm == NFC) 8950 if (ch < 0x0300) 8951 { 8952 lastCC = 0; 8953 continue; 8954 } 8955 immutable ubyte CC = combiningClass(ch); 8956 if (lastCC > CC && CC != 0) 8957 { 8958 return seekStable!norm(idx, input); 8959 } 8960 8961 if (notAllowedIn!norm(ch)) 8962 { 8963 return seekStable!norm(idx, input); 8964 } 8965 lastCC = CC; 8966 } 
8967 return tuple(input.length, input.length); 8968 } 8969 // Widens position `idx` to the region [region_start, region_end) bounded by the nearest code points on each side that have combining class 0 and are allowed in `norm`; defaults are 0 and input.length. 8970 private auto seekStable(NormalizationForm norm, C)(size_t idx, const scope C[] input) 8971 { 8972 import std.typecons : tuple; 8973 import std.utf : codeLength; 8974 8975 auto br = input[0 .. idx]; 8976 size_t region_start = 0;// default 8977 for (;;) 8978 { 8979 if (br.empty)// start is 0 8980 break; 8981 dchar ch = br.back; 8982 if (combiningClass(ch) == 0 && allowedIn!norm(ch)) 8983 { 8984 region_start = br.length - codeLength!C(ch); 8985 break; 8986 } 8987 br.popBack(); // scan backwards: drop the code point `back` just examined (popFront re-tested the same one until empty) 8988 } 8989 ///@@@BUG@@@ can't use find: " find is a nested function and can't be used..." 8990 size_t region_end=input.length;// end is $ by default 8991 foreach (i, dchar ch; input[idx..$]) 8992 { 8993 if (combiningClass(ch) == 0 && allowedIn!norm(ch)) 8994 { 8995 region_end = i+idx; 8996 break; 8997 } 8998 } 8999 // writeln("Region to normalize: ", input[region_start .. region_end]); 9000 return tuple(region_start, region_end); 9001 } 9002 9003 /** 9004 Tests if dchar `ch` is always allowed (Quick_Check=YES) in normalization 9005 form `norm`. 9006 */ 9007 public bool allowedIn(NormalizationForm norm)(dchar ch) 9008 { 9009 return !notAllowedIn!norm(ch); 9010 } 9011 9012 /// 9013 @safe unittest 9014 { 9015 // e.g.
Cyrillic is always allowed, so is ASCII 9016 assert(allowedIn!NFC('я')); 9017 assert(allowedIn!NFD('я')); 9018 assert(allowedIn!NFKC('я')); 9019 assert(allowedIn!NFKD('я')); 9020 assert(allowedIn!NFC('Z')); 9021 } 9022 9023 // not user friendly name but more direct 9024 private bool notAllowedIn(NormalizationForm norm)(dchar ch) 9025 { 9026 static if (norm == NFC) 9027 alias qcTrie = nfcQCTrie; 9028 else static if (norm == NFD) 9029 alias qcTrie = nfdQCTrie; 9030 else static if (norm == NFKC) 9031 alias qcTrie = nfkcQCTrie; 9032 else static if (norm == NFKD) 9033 alias qcTrie = nfkdQCTrie; 9034 else 9035 static assert(0, "Unknown normalization form"); /* a string used as the condition is always true, so the guard never fired */ 9036 return qcTrie[ch]; 9037 } 9038 9039 @safe unittest 9040 { 9041 assert(allowedIn!NFC('я')); 9042 assert(allowedIn!NFD('я')); 9043 assert(allowedIn!NFKC('я')); 9044 assert(allowedIn!NFKD('я')); 9045 assert(allowedIn!NFC('Z')); 9046 } 9047 9048 } 9049 9050 version (std_uni_bootstrap) 9051 { 9052 // old version used for bootstrapping of gen_uni.d that generates 9053 // up to date optimal versions of all of isXXX functions 9054 @safe pure nothrow @nogc public bool isWhite(dchar c) 9055 { 9056 import std.ascii : isWhite; 9057 return isWhite(c) || 9058 c == lineSep || c == paraSep || 9059 c == '\u0085' || c == '\u00A0' || c == '\u1680' || c == '\u180E' || 9060 (c >= '\u2000' && c <= '\u200A') || 9061 c == '\u202F' || c == '\u205F' || c == '\u3000'; 9062 } 9063 } 9064 else 9065 { 9066 9067 // trusted -> avoid bounds check 9068 @trusted pure nothrow @nogc private 9069 { 9070 import std.internal.unicode_tables; // : toLowerTable, toTitleTable, toUpperTable; // generated file 9071 9072 // hide template instances behind functions 9073 // https://issues.dlang.org/show_bug.cgi?id=13232 9074 ushort toLowerIndex(dchar c) { return toLowerIndexTrie[c]; } 9075 ushort toLowerSimpleIndex(dchar c) { return toLowerSimpleIndexTrie[c]; } 9076 dchar toLowerTab(size_t idx) { return toLowerTable[idx]; } 9077 9078 ushort
toTitleIndex(dchar c) { return toTitleIndexTrie[c]; } 9079 ushort toTitleSimpleIndex(dchar c) { return toTitleSimpleIndexTrie[c]; } 9080 dchar toTitleTab(size_t idx) { return toTitleTable[idx]; } 9081 9082 ushort toUpperIndex(dchar c) { return toUpperIndexTrie[c]; } 9083 ushort toUpperSimpleIndex(dchar c) { return toUpperSimpleIndexTrie[c]; } 9084 dchar toUpperTab(size_t idx) { return toUpperTable[idx]; } 9085 } 9086 9087 public: 9088 9089 /++ 9090 Whether or not `c` is a Unicode whitespace $(CHARACTER). 9091 (general Unicode category: Part of C0(tab, vertical tab, form feed, 9092 carriage return, and linefeed characters), Zs, Zl, Zp, and NEL(U+0085)) 9093 +/ 9094 @safe pure nothrow @nogc 9095 public bool isWhite(dchar c) 9096 { 9097 import std.internal.unicode_tables : isWhiteGen; // generated file 9098 return isWhiteGen(c); // call pregenerated binary search 9099 } 9100 9101 /++ 9102 Return whether `c` is a Unicode lowercase $(CHARACTER). 9103 +/ 9104 @safe pure nothrow @nogc 9105 bool isLower(dchar c) 9106 { 9107 import std.ascii : isLower, isASCII; 9108 if (isASCII(c)) 9109 return isLower(c); 9110 return lowerCaseTrie[c]; 9111 } 9112 9113 @safe unittest 9114 { 9115 import std.ascii : isLower; 9116 foreach (v; 0 .. 0x80) 9117 assert(isLower(v) == .isLower(v)); 9118 assert(.isLower('я')); 9119 assert(.isLower('й')); 9120 assert(!.isLower('Ж')); 9121 // Greek HETA 9122 assert(!.isLower('\u0370')); 9123 assert(.isLower('\u0371')); 9124 assert(!.isLower('\u039C')); // capital MU 9125 assert(.isLower('\u03B2')); // beta 9126 // from extended Greek 9127 assert(!.isLower('\u1F18')); 9128 assert(.isLower('\u1F00')); 9129 foreach (v; unicode.lowerCase.byCodepoint) 9130 assert(.isLower(v) && !isUpper(v)); 9131 } 9132 9133 9134 /++ 9135 Return whether `c` is a Unicode uppercase $(CHARACTER). 
9136 +/ 9137 @safe pure nothrow @nogc 9138 bool isUpper(dchar c) 9139 { 9140 import std.ascii : isUpper, isASCII; 9141 if (isASCII(c)) 9142 return isUpper(c); 9143 return upperCaseTrie[c]; 9144 } 9145 9146 @safe unittest 9147 { 9148 static import std.ascii; /* on ASCII the Unicode predicate must agree with std.ascii.isUpper */ 9149 foreach (v; 0 .. 0x80) 9150 assert(std.ascii.isUpper(v) == isUpper(v)); 9151 assert(!isUpper('й')); 9152 assert(isUpper('Ж')); 9153 // Greek HETA 9154 assert(isUpper('\u0370')); 9155 assert(!isUpper('\u0371')); 9156 assert(isUpper('\u039C')); // capital MU 9157 assert(!isUpper('\u03B2')); // beta 9158 // from extended Greek 9159 assert(!isUpper('\u1F00')); 9160 assert(isUpper('\u1F18')); 9161 foreach (v; unicode.upperCase.byCodepoint) 9162 assert(isUpper(v) && !.isLower(v)); 9163 } 9164 9165 9166 //TODO: Hidden for now, needs better API. 9167 //Other transforms could use better API as well, but this one is a new primitive. 9168 @safe pure nothrow @nogc 9169 private dchar toTitlecase(dchar c) 9170 { 9171 // optimize ASCII case 9172 if (c < 0xAA) 9173 { 9174 if (c < 'a') 9175 return c; 9176 if (c <= 'z') 9177 return c - 32; 9178 return c; 9179 } 9180 size_t idx = toTitleSimpleIndex(c); 9181 if (idx != ushort.max) 9182 { 9183 return toTitleTab(idx); 9184 } 9185 return c; 9186 } 9187 9188 private alias UpperTriple = AliasSeq!(toUpperIndex, MAX_SIMPLE_UPPER, toUpperTab); 9189 private alias LowerTriple = AliasSeq!(toLowerIndex, MAX_SIMPLE_LOWER, toLowerTab); 9190 9191 // generic toUpper/toLower on whole string, creates new or returns as is 9192 private ElementEncodingType!S[] toCase(alias indexFn, uint maxIdx, alias tableFn, alias asciiConvert, S)(S s) 9193 if (isSomeString!S || (isRandomAccessRange!S && hasLength!S && hasSlicing!S && isSomeChar!(ElementType!S))) 9194 { 9195 import std.array : appender, array; 9196 import std.ascii : isASCII; 9197 import std.utf : byDchar, codeLength; 9198 9199 alias C = ElementEncodingType!S; 9200 9201 auto r = s.byDchar; 9202 for (size_t i; !r.empty; i += r.front.codeLength!C ,
r.popFront()) 9203 { 9204 auto cOuter = r.front; 9205 ushort idx = indexFn(cOuter); 9206 if (idx == ushort.max) 9207 continue; 9208 auto result = appender!(C[])(); 9209 result.reserve(s.length); 9210 result.put(s[0 .. i]); 9211 foreach (dchar c; s[i .. $].byDchar) 9212 { 9213 if (c.isASCII) 9214 { 9215 result.put(asciiConvert(c)); 9216 } 9217 else 9218 { 9219 idx = indexFn(c); 9220 if (idx == ushort.max) 9221 result.put(c); 9222 else if (idx < maxIdx) 9223 { 9224 c = tableFn(idx); 9225 result.put(c); 9226 } 9227 else 9228 { 9229 auto val = tableFn(idx); 9230 // unpack length + codepoint 9231 immutable uint len = val >> 24; 9232 result.put(cast(dchar)(val & 0xFF_FFFF)); 9233 foreach (j; idx+1 .. idx+len) 9234 result.put(tableFn(j)); 9235 } 9236 } 9237 } 9238 return result.data; 9239 } 9240 9241 static if (isSomeString!S) 9242 return s; 9243 else 9244 return s.array; 9245 } 9246 9247 // https://issues.dlang.org/show_bug.cgi?id=12428 9248 @safe unittest 9249 { 9250 import std.array : replicate; 9251 auto s = "abcdefghij".replicate(300); 9252 s = s[0 .. 
10]; 9253 9254 toUpper(s); 9255 9256 assert(s == "abcdefghij"); 9257 } 9258 9259 // https://issues.dlang.org/show_bug.cgi?id=18993 9260 @safe unittest 9261 { 9262 static assert(`몬스터/A`.toLower.length == `몬스터/a`.toLower.length); 9263 } 9264 9265 9266 // generic toUpper/toLower on whole range, returns range 9267 private auto toCaser(alias indexFn, uint maxIdx, alias tableFn, alias asciiConvert, Range)(Range str) 9268 // Accept range of dchar's 9269 if (isInputRange!Range && 9270 isSomeChar!(ElementEncodingType!Range) && 9271 ElementEncodingType!Range.sizeof == dchar.sizeof) 9272 { 9273 static struct ToCaserImpl 9274 { 9275 @property bool empty() 9276 { 9277 return !nLeft && r.empty; 9278 } 9279 9280 @property auto front() 9281 { 9282 import std.ascii : isASCII; 9283 9284 if (!nLeft) 9285 { 9286 dchar c = r.front; 9287 if (c.isASCII) 9288 { 9289 buf[0] = asciiConvert(c); 9290 nLeft = 1; 9291 } 9292 else 9293 { 9294 const idx = indexFn(c); 9295 if (idx == ushort.max) 9296 { 9297 buf[0] = c; 9298 nLeft = 1; 9299 } 9300 else if (idx < maxIdx) 9301 { 9302 buf[0] = tableFn(idx); 9303 nLeft = 1; 9304 } 9305 else 9306 { 9307 immutable val = tableFn(idx); 9308 // unpack length + codepoint 9309 nLeft = val >> 24; 9310 if (nLeft == 0) 9311 nLeft = 1; 9312 assert(nLeft <= buf.length); 9313 buf[nLeft - 1] = cast(dchar)(val & 0xFF_FFFF); 9314 foreach (j; 1 .. 
nLeft) 9315 buf[nLeft - j - 1] = tableFn(idx + j); 9316 } 9317 } 9318 } 9319 return buf[nLeft - 1]; 9320 } 9321 9322 void popFront() 9323 { 9324 if (!nLeft) 9325 front; 9326 assert(nLeft); 9327 --nLeft; 9328 if (!nLeft) 9329 r.popFront(); 9330 } 9331 9332 static if (isForwardRange!Range) 9333 { 9334 @property auto save() 9335 { 9336 auto ret = this; 9337 ret.r = r.save; 9338 return ret; 9339 } 9340 } 9341 9342 private: 9343 Range r; 9344 uint nLeft; 9345 dchar[3] buf = void; 9346 } 9347 9348 return ToCaserImpl(str); 9349 } 9350 9351 /********************* 9352 * Convert an $(REF_ALTTEXT input range, isInputRange, std,range,primitives) 9353 * or a string to upper or lower case. 9354 * 9355 * Does not allocate memory. 9356 * Characters in UTF-8 or UTF-16 format that cannot be decoded 9357 * are treated as $(REF replacementDchar, std,utf). 9358 * 9359 * Params: 9360 * str = string or range of characters 9361 * 9362 * Returns: 9363 * an input range of `dchar`s 9364 * 9365 * See_Also: 9366 * $(LREF toUpper), $(LREF toLower) 9367 */ 9368 9369 auto asLowerCase(Range)(Range str) 9370 if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) && 9371 !isConvertibleToString!Range) 9372 { 9373 static if (ElementEncodingType!Range.sizeof < dchar.sizeof) 9374 { 9375 import std.utf : byDchar; 9376 9377 // Decode first 9378 return asLowerCase(str.byDchar); 9379 } 9380 else 9381 { 9382 static import std.ascii; 9383 return toCaser!(LowerTriple, std.ascii.toLower)(str); 9384 } 9385 } 9386 9387 /// ditto 9388 auto asUpperCase(Range)(Range str) 9389 if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) && 9390 !isConvertibleToString!Range) 9391 { 9392 static if (ElementEncodingType!Range.sizeof < dchar.sizeof) 9393 { 9394 import std.utf : byDchar; 9395 9396 // Decode first 9397 return asUpperCase(str.byDchar); 9398 } 9399 else 9400 { 9401 static import std.ascii; 9402 return toCaser!(UpperTriple, std.ascii.toUpper)(str); 9403 } 9404 } 9405 9406 /// 9407 @safe pure 
unittest 9408 { 9409 import std.algorithm.comparison : equal; 9410 9411 assert("hEllo".asUpperCase.equal("HELLO")); 9412 } 9413 9414 // explicitly undocumented 9415 auto asLowerCase(Range)(auto ref Range str) 9416 if (isConvertibleToString!Range) 9417 { 9418 import std.traits : StringTypeOf; 9419 return asLowerCase!(StringTypeOf!Range)(str); 9420 } 9421 9422 // explicitly undocumented 9423 auto asUpperCase(Range)(auto ref Range str) 9424 if (isConvertibleToString!Range) 9425 { 9426 import std.traits : StringTypeOf; 9427 return asUpperCase!(StringTypeOf!Range)(str); 9428 } 9429 9430 @safe unittest 9431 { 9432 static struct TestAliasedString 9433 { 9434 string get() @safe @nogc pure nothrow { return _s; } 9435 alias get this; 9436 @disable this(this); 9437 string _s; 9438 } 9439 9440 static bool testAliasedString(alias func, Args...)(string s, Args args) 9441 { 9442 import std.algorithm.comparison : equal; 9443 auto a = func(TestAliasedString(s), args); 9444 auto b = func(s, args); 9445 static if (is(typeof(equal(a, b)))) 9446 { 9447 // For ranges, compare contents instead of object identity. 
9448 return equal(a, b); 9449 } 9450 else 9451 { 9452 return a == b; 9453 } 9454 } 9455 assert(testAliasedString!asLowerCase("hEllo")); 9456 assert(testAliasedString!asUpperCase("hEllo")); 9457 assert(testAliasedString!asCapitalized("hEllo")); 9458 } 9459 9460 @safe unittest 9461 { 9462 import std.array : array; 9463 9464 auto a = "HELLo".asLowerCase; 9465 auto savea = a.save; 9466 auto s = a.array; 9467 assert(s == "hello"); 9468 s = savea.array; 9469 assert(s == "hello"); 9470 9471 string[] lower = ["123", "abcфеж", "\u0131\u023f\u03c9", "i\u0307\u1Fe2"]; 9472 string[] upper = ["123", "ABCФЕЖ", "I\u2c7e\u2126", "\u0130\u03A5\u0308\u0300"]; 9473 9474 foreach (i, slwr; lower) 9475 { 9476 import std.utf : byChar; 9477 9478 auto sx = slwr.asUpperCase.byChar.array; 9479 assert(sx == toUpper(slwr)); 9480 auto sy = upper[i].asLowerCase.byChar.array; 9481 assert(sy == toLower(upper[i])); 9482 } 9483 9484 // Not necessary to call r.front 9485 for (auto r = lower[3].asUpperCase; !r.empty; r.popFront()) 9486 { 9487 } 9488 9489 import std.algorithm.comparison : equal; 9490 /* results of equal() were previously discarded, so these checks could never fail */ 9491 assert("HELLo"w.asLowerCase.equal("hello"d)); 9492 assert("HELLo"w.asUpperCase.equal("HELLO"d)); 9493 assert("HELLo"d.asLowerCase.equal("hello"d)); 9494 assert("HELLo"d.asUpperCase.equal("HELLO"d)); 9495 9496 import std.utf : byChar; 9497 assert(toLower("\u1Fe2") == asLowerCase("\u1Fe2").byChar.array); 9498 } 9499 9500 // generic capitalizer on whole range, returns range 9501 private auto toCapitalizer(alias indexFnUpper, uint maxIdxUpper, alias tableFnUpper, 9502 Range)(Range str) 9503 // Accept range of dchar's 9504 if (isInputRange!Range && 9505 isSomeChar!(ElementEncodingType!Range) && 9506 ElementEncodingType!Range.sizeof == dchar.sizeof) 9507 { 9508 static struct ToCapitalizerImpl 9509 { 9510 @property bool empty() 9511 { 9512 return lower ?
lwr.empty : !nLeft && r.empty; 9513 } 9514 9515 @property auto front() 9516 { 9517 if (lower) 9518 return lwr.front; 9519 9520 if (!nLeft) 9521 { 9522 immutable dchar c = r.front; 9523 const idx = indexFnUpper(c); 9524 if (idx == ushort.max) 9525 { 9526 buf[0] = c; 9527 nLeft = 1; 9528 } 9529 else if (idx < maxIdxUpper) 9530 { 9531 buf[0] = tableFnUpper(idx); 9532 nLeft = 1; 9533 } 9534 else 9535 { 9536 immutable val = tableFnUpper(idx); 9537 // unpack length + codepoint 9538 nLeft = val >> 24; 9539 if (nLeft == 0) 9540 nLeft = 1; 9541 assert(nLeft <= buf.length); 9542 buf[nLeft - 1] = cast(dchar)(val & 0xFF_FFFF); 9543 foreach (j; 1 .. nLeft) 9544 buf[nLeft - j - 1] = tableFnUpper(idx + j); 9545 } 9546 } 9547 return buf[nLeft - 1]; 9548 } 9549 9550 void popFront() 9551 { 9552 if (lower) 9553 lwr.popFront(); 9554 else 9555 { 9556 if (!nLeft) 9557 front; 9558 assert(nLeft); 9559 --nLeft; 9560 if (!nLeft) 9561 { 9562 r.popFront(); 9563 lwr = r.asLowerCase(); 9564 lower = true; 9565 } 9566 } 9567 } 9568 9569 static if (isForwardRange!Range) 9570 { 9571 @property auto save() 9572 { 9573 auto ret = this; 9574 ret.r = r.save; 9575 ret.lwr = lwr.save; 9576 return ret; 9577 } 9578 } 9579 9580 private: 9581 Range r; 9582 typeof(r.asLowerCase) lwr; // range representing the lower case rest of string 9583 bool lower = false; // false for first character, true for rest of string 9584 dchar[3] buf = void; 9585 uint nLeft = 0; 9586 } 9587 9588 return ToCapitalizerImpl(str); 9589 } 9590 9591 /********************* 9592 * Capitalize an $(REF_ALTTEXT input range, isInputRange, std,range,primitives) 9593 * or string, meaning convert the first 9594 * character to upper case and subsequent characters to lower case. 9595 * 9596 * Does not allocate memory. 9597 * Characters in UTF-8 or UTF-16 format that cannot be decoded 9598 * are treated as $(REF replacementDchar, std,utf). 
9599 * 9600 * Params: 9601 * str = string or range of characters 9602 * 9603 * Returns: 9604 * an InputRange of dchars 9605 * 9606 * See_Also: 9607 * $(LREF toUpper), $(LREF toLower) 9608 * $(LREF asUpperCase), $(LREF asLowerCase) 9609 */ 9610 9611 auto asCapitalized(Range)(Range str) 9612 if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) && 9613 !isConvertibleToString!Range) 9614 { 9615 static if (ElementEncodingType!Range.sizeof < dchar.sizeof) 9616 { 9617 import std.utf : byDchar; 9618 9619 // Decode first 9620 return toCapitalizer!UpperTriple(str.byDchar); 9621 } 9622 else 9623 { 9624 return toCapitalizer!UpperTriple(str); 9625 } 9626 } 9627 9628 /// 9629 @safe pure unittest 9630 { 9631 import std.algorithm.comparison : equal; 9632 9633 assert("hEllo".asCapitalized.equal("Hello")); 9634 } 9635 9636 auto asCapitalized(Range)(auto ref Range str) 9637 if (isConvertibleToString!Range) 9638 { 9639 import std.traits : StringTypeOf; 9640 return asCapitalized!(StringTypeOf!Range)(str); 9641 } 9642 9643 @safe pure nothrow @nogc unittest 9644 { 9645 auto r = "hEllo".asCapitalized(); 9646 assert(r.front == 'H'); 9647 } 9648 9649 @safe unittest 9650 { 9651 import std.array : array; 9652 9653 auto a = "hELLo".asCapitalized; 9654 auto savea = a.save; 9655 auto s = a.array; 9656 assert(s == "Hello"); 9657 s = savea.array; 9658 assert(s == "Hello"); 9659 9660 string[2][] cases = 9661 [ 9662 ["", ""], 9663 ["h", "H"], 9664 ["H", "H"], 9665 ["3", "3"], 9666 ["123", "123"], 9667 ["h123A", "H123a"], 9668 ["феж", "Феж"], 9669 ["\u1Fe2", "\u03a5\u0308\u0300"], 9670 ]; 9671 9672 foreach (i; 0 .. 
cases.length) 9673 { 9674 import std.utf : byChar; 9675 9676 auto r = cases[i][0].asCapitalized.byChar.array; 9677 auto result = cases[i][1]; 9678 assert(r == result); 9679 } 9680 9681 // Don't call r.front 9682 for (auto r = "\u1Fe2".asCapitalized; !r.empty; r.popFront()) 9683 { 9684 } 9685 9686 import std.algorithm.comparison : equal; 9687 9688 "HELLo"w.asCapitalized.equal("Hello"d); 9689 "hElLO"w.asCapitalized.equal("Hello"d); 9690 "hello"d.asCapitalized.equal("Hello"d); 9691 "HELLO"d.asCapitalized.equal("Hello"d); 9692 9693 import std.utf : byChar; 9694 assert(asCapitalized("\u0130").byChar.array == asUpperCase("\u0130").byChar.array); 9695 } 9696 9697 // TODO: helper, I wish std.utf was more flexible (and stright) 9698 private size_t encodeTo(scope char[] buf, size_t idx, dchar c) @trusted pure nothrow @nogc 9699 { 9700 if (c <= 0x7F) 9701 { 9702 buf[idx] = cast(char) c; 9703 idx++; 9704 } 9705 else if (c <= 0x7FF) 9706 { 9707 buf[idx] = cast(char)(0xC0 | (c >> 6)); 9708 buf[idx+1] = cast(char)(0x80 | (c & 0x3F)); 9709 idx += 2; 9710 } 9711 else if (c <= 0xFFFF) 9712 { 9713 buf[idx] = cast(char)(0xE0 | (c >> 12)); 9714 buf[idx+1] = cast(char)(0x80 | ((c >> 6) & 0x3F)); 9715 buf[idx+2] = cast(char)(0x80 | (c & 0x3F)); 9716 idx += 3; 9717 } 9718 else if (c <= 0x10FFFF) 9719 { 9720 buf[idx] = cast(char)(0xF0 | (c >> 18)); 9721 buf[idx+1] = cast(char)(0x80 | ((c >> 12) & 0x3F)); 9722 buf[idx+2] = cast(char)(0x80 | ((c >> 6) & 0x3F)); 9723 buf[idx+3] = cast(char)(0x80 | (c & 0x3F)); 9724 idx += 4; 9725 } 9726 else 9727 assert(0); 9728 return idx; 9729 } 9730 9731 @safe unittest 9732 { 9733 char[] s = "abcd".dup; 9734 size_t i = 0; 9735 i = encodeTo(s, i, 'X'); 9736 assert(s == "Xbcd"); 9737 9738 i = encodeTo(s, i, cast(dchar)'\u00A9'); 9739 assert(s == "X\xC2\xA9d"); 9740 } 9741 9742 // TODO: helper, I wish std.utf was more flexible (and stright) 9743 private size_t encodeTo(scope wchar[] buf, size_t idx, dchar c) @trusted pure 9744 { 9745 import std.utf : 
UTFException; 9746 if (c <= 0xFFFF) 9747 { 9748 if (0xD800 <= c && c <= 0xDFFF) 9749 throw (new UTFException("Encoding an isolated surrogate code point in UTF-16")).setSequence(c); 9750 buf[idx] = cast(wchar) c; 9751 idx++; 9752 } 9753 else if (c <= 0x10FFFF) 9754 { 9755 buf[idx] = cast(wchar)((((c - 0x10000) >> 10) & 0x3FF) + 0xD800); 9756 buf[idx+1] = cast(wchar)(((c - 0x10000) & 0x3FF) + 0xDC00); 9757 idx += 2; 9758 } 9759 else 9760 assert(0); 9761 return idx; 9762 } 9763 9764 private size_t encodeTo(scope dchar[] buf, size_t idx, dchar c) @trusted pure nothrow @nogc 9765 { 9766 buf[idx] = c; 9767 idx++; 9768 return idx; 9769 } 9770 9771 private void toCaseInPlace(alias indexFn, uint maxIdx, alias tableFn, C)(ref C[] s) @trusted pure 9772 if (is(C == char) || is(C == wchar) || is(C == dchar)) 9773 { 9774 import std.utf : decode, codeLength; 9775 size_t curIdx = 0; 9776 size_t destIdx = 0; 9777 alias slowToCase = toCaseInPlaceAlloc!(indexFn, maxIdx, tableFn); 9778 size_t lastUnchanged = 0; 9779 // in-buffer move of bytes to a new start index 9780 // the trick is that it may not need to copy at all 9781 static size_t moveTo(C[] str, size_t dest, size_t from, size_t to) 9782 { 9783 // Interestingly we may just bump pointer for a while 9784 // then have to copy if a re-cased char was smaller the original 9785 // later we may regain pace with char that got bigger 9786 // In the end it sometimes flip-flops between the 2 cases below 9787 if (dest == from) 9788 return to; 9789 // got to copy 9790 foreach (C c; str[from .. 
to]) 9791 str[dest++] = c; 9792 return dest; 9793 } 9794 while (curIdx != s.length) 9795 { 9796 size_t startIdx = curIdx; 9797 immutable ch = decode(s, curIdx); 9798 // TODO: special case for ASCII 9799 immutable caseIndex = indexFn(ch); 9800 if (caseIndex == ushort.max) // unchanged, skip over 9801 { 9802 continue; 9803 } 9804 else if (caseIndex < maxIdx) // 1:1 codepoint mapping 9805 { 9806 // previous cased chars had the same length as uncased ones 9807 // thus can just adjust pointer 9808 destIdx = moveTo(s, destIdx, lastUnchanged, startIdx); 9809 lastUnchanged = curIdx; 9810 immutable cased = tableFn(caseIndex); 9811 immutable casedLen = codeLength!C(cased); 9812 if (casedLen + destIdx > curIdx) // no place to fit cased char 9813 { 9814 // switch to slow codepath, where we allocate 9815 return slowToCase(s, startIdx, destIdx); 9816 } 9817 else 9818 { 9819 destIdx = encodeTo(s, destIdx, cased); 9820 } 9821 } 9822 else // 1:m codepoint mapping, slow codepath 9823 { 9824 destIdx = moveTo(s, destIdx, lastUnchanged, startIdx); 9825 lastUnchanged = curIdx; 9826 return slowToCase(s, startIdx, destIdx); 9827 } 9828 assert(destIdx <= curIdx); 9829 } 9830 if (lastUnchanged != s.length) 9831 { 9832 destIdx = moveTo(s, destIdx, lastUnchanged, s.length); 9833 } 9834 s = s[0 .. 
destIdx]; 9835 } 9836 9837 // helper to precalculate size of case-converted string 9838 private template toCaseLength(alias indexFn, uint maxIdx, alias tableFn) 9839 { 9840 size_t toCaseLength(C)(const scope C[] str) 9841 { 9842 import std.utf : decode, codeLength; 9843 size_t codeLen = 0; 9844 size_t lastNonTrivial = 0; 9845 size_t curIdx = 0; 9846 while (curIdx != str.length) 9847 { 9848 immutable startIdx = curIdx; 9849 immutable ch = decode(str, curIdx); 9850 immutable ushort caseIndex = indexFn(ch); 9851 if (caseIndex == ushort.max) 9852 continue; 9853 else if (caseIndex < maxIdx) 9854 { 9855 codeLen += startIdx - lastNonTrivial; 9856 lastNonTrivial = curIdx; 9857 immutable cased = tableFn(caseIndex); 9858 codeLen += codeLength!C(cased); 9859 } 9860 else 9861 { 9862 codeLen += startIdx - lastNonTrivial; 9863 lastNonTrivial = curIdx; 9864 immutable val = tableFn(caseIndex); 9865 immutable len = val >> 24; 9866 immutable dchar cased = val & 0xFF_FFFF; 9867 codeLen += codeLength!C(cased); 9868 foreach (j; caseIndex+1 .. caseIndex+len) 9869 codeLen += codeLength!C(tableFn(j)); 9870 } 9871 } 9872 if (lastNonTrivial != str.length) 9873 codeLen += str.length - lastNonTrivial; 9874 return codeLen; 9875 } 9876 } 9877 9878 @safe unittest 9879 { 9880 alias toLowerLength = toCaseLength!(LowerTriple); 9881 assert(toLowerLength("abcd") == 4); 9882 assert(toLowerLength("аБВгд456") == 10+3); 9883 } 9884 9885 // slower code path that preallocates and then copies 9886 // case-converted stuf to the new string 9887 private template toCaseInPlaceAlloc(alias indexFn, uint maxIdx, alias tableFn) 9888 { 9889 void toCaseInPlaceAlloc(C)(ref C[] s, size_t curIdx, 9890 size_t destIdx) @trusted pure 9891 if (is(C == char) || is(C == wchar) || is(C == dchar)) 9892 { 9893 import std.utf : decode; 9894 alias caseLength = toCaseLength!(indexFn, maxIdx, tableFn); 9895 auto trueLength = destIdx + caseLength(s[curIdx..$]); 9896 C[] ns = new C[trueLength]; 9897 ns[0 .. destIdx] = s[0 .. 
destIdx]; 9898 size_t lastUnchanged = curIdx; 9899 while (curIdx != s.length) 9900 { 9901 immutable startIdx = curIdx; // start of current codepoint 9902 immutable ch = decode(s, curIdx); 9903 immutable caseIndex = indexFn(ch); 9904 if (caseIndex == ushort.max) // skip over 9905 { 9906 continue; 9907 } 9908 else if (caseIndex < maxIdx) // 1:1 codepoint mapping 9909 { 9910 immutable cased = tableFn(caseIndex); 9911 auto toCopy = startIdx - lastUnchanged; 9912 ns[destIdx .. destIdx+toCopy] = s[lastUnchanged .. startIdx]; 9913 lastUnchanged = curIdx; 9914 destIdx += toCopy; 9915 destIdx = encodeTo(ns, destIdx, cased); 9916 } 9917 else // 1:m codepoint mapping, slow codepath 9918 { 9919 auto toCopy = startIdx - lastUnchanged; 9920 ns[destIdx .. destIdx+toCopy] = s[lastUnchanged .. startIdx]; 9921 lastUnchanged = curIdx; 9922 destIdx += toCopy; 9923 auto val = tableFn(caseIndex); 9924 // unpack length + codepoint 9925 immutable uint len = val >> 24; 9926 destIdx = encodeTo(ns, destIdx, cast(dchar)(val & 0xFF_FFFF)); 9927 foreach (j; caseIndex+1 .. caseIndex+len) 9928 destIdx = encodeTo(ns, destIdx, tableFn(j)); 9929 } 9930 } 9931 if (lastUnchanged != s.length) 9932 { 9933 auto toCopy = s.length - lastUnchanged; 9934 ns[destIdx .. destIdx+toCopy] = s[lastUnchanged..$]; 9935 destIdx += toCopy; 9936 } 9937 assert(ns.length == destIdx); 9938 s = ns; 9939 } 9940 } 9941 9942 /++ 9943 Converts `s` to lowercase (by performing Unicode lowercase mapping) in place. 9944 For a few characters string length may increase after the transformation, 9945 in such a case the function reallocates exactly once. 9946 If `s` does not have any uppercase characters, then `s` is unaltered. 
+/
void toLowerInPlace(C)(ref C[] s) @trusted pure
if (is(C == char) || is(C == wchar) || is(C == dchar))
{
    // the generic in-place engine does the work, driven by the lowercase tables
    toCaseInPlace!LowerTriple(s);
}

// Plain (non-template) overloads for the usual array types; having them
// reduces compile time for the most common calls.
void toLowerInPlace(ref char[] s) @safe pure /*TODO nothrow*/
{
    toLowerInPlace!char(s);
}

void toLowerInPlace(ref wchar[] s) @safe pure /*TODO nothrow*/
{
    toLowerInPlace!wchar(s);
}

void toLowerInPlace(ref dchar[] s) @safe pure /*TODO nothrow*/
{
    toLowerInPlace!dchar(s);
}

/++
    Converts `s` to uppercase (by performing Unicode uppercase mapping) in place.
    For a few characters string length may increase after the transformation,
    in such a case the function reallocates exactly once.
    If `s` does not have any lowercase characters, then `s` is unaltered.
+/
void toUpperInPlace(C)(ref C[] s) @trusted pure
if (is(C == char) || is(C == wchar) || is(C == dchar))
{
    // same engine as toLowerInPlace, driven by the uppercase tables
    toCaseInPlace!UpperTriple(s);
}

// Plain (non-template) overloads for the usual array types; having them
// reduces compile time and code size for the most common calls.
void toUpperInPlace(ref char[] s) @safe pure /*TODO nothrow*/
{
    toUpperInPlace!char(s);
}

void toUpperInPlace(ref wchar[] s) @safe pure /*TODO nothrow*/
{
    toUpperInPlace!wchar(s);
}

void toUpperInPlace(ref dchar[] s) @safe pure /*TODO nothrow*/
{
    toUpperInPlace!dchar(s);
}

/++
    If `c` is a Unicode uppercase $(CHARACTER), then its lowercase equivalent
    is returned. Otherwise `c` is returned.

    Warning: certain alphabets like German and Greek have no 1:1
    upper-lower mapping. Use the overload of toLower which takes a full string instead.
+/
@safe pure nothrow @nogc
dchar toLower(dchar c)
{
    if (c >= 0xAA)
    {
        // general case: consult the simple (1:1) lowercase mapping
        size_t tabIndex = toLowerSimpleIndex(c);
        if (tabIndex == ushort.max)
            return c;
        return toLowerTab(tabIndex);
    }

    // fast path: below 0xAA only 'A' .. 'Z' are changed
    if (c < 'A' || c > 'Z')
        return c;
    return c + 32;
}

/++
    Builds an array equal to `s` with every character replaced by its
    lowercase form (Unicode lowercase mapping).
    If none of `s` characters were affected, then `s` itself is returned if `s` is a
    `string`-like type.

    Params:
        s = A $(REF_ALTTEXT random access range, isRandomAccessRange, std,range,primitives)
        of characters
    Returns:
        An array with the same element type as `s`.
+/
ElementEncodingType!S[] toLower(S)(return scope S s) @trusted
if (isSomeString!S)
{
    static import std.ascii;

    return toCase!(LowerTriple, std.ascii.toLower)(s);
}

/// ditto
ElementEncodingType!S[] toLower(S)(S s)
if (!isSomeString!S && isRandomAccessRange!S && hasLength!S && hasSlicing!S
    && isSomeChar!(ElementType!S))
{
    static import std.ascii;

    return toCase!(LowerTriple, std.ascii.toLower)(s);
}

// Plain (non-template) overloads for the three built-in string types; having
// them reduces compile time for the most common calls.
string toLower(return scope string s) @safe pure /*TODO nothrow*/
{
    return toLower!string(s);
}

wstring toLower(return scope wstring s) @safe pure /*TODO nothrow*/
{
    return toLower!wstring(s);
}

dstring toLower(return scope dstring s) @safe pure /*TODO nothrow*/
{
    return toLower!dstring(s);
}

// https://issues.dlang.org/show_bug.cgi?id=16663
// A type that only converts to string through `alias this` must still be
// accepted; compiling the call is the whole test.
@safe pure unittest
{
    static struct String
    {
        string data;
        alias data this;
    }

    void foo()
    {
        auto u = toLower(String(""));
    }
}


@safe unittest
{
    import std.format : format;
    static import std.ascii;

    // the ASCII range has to agree with std.ascii
    foreach (code; 0 .. 0x80)
        assert(std.ascii.toLower(code) == toLower(code));

    assert(toLower('Я') == 'я');
    assert(toLower('Δ') == 'δ');

    // an uppercase code point either has no mapping or maps to a lowercase one
    foreach (upperCp; unicode.upperCase.byCodepoint)
    {
        dchar lowered = upperCp.toLower();
        assert(lowered == upperCp || isLower(lowered),
            format("%s -> %s", upperCp, lowered));
    }

    // string overloads
    assert(toLower("АЯ") == "ая");
    assert("\u1E9E".toLower == "\u00df");
    assert("\u00df".toUpper == "SS");
}

// https://issues.dlang.org/show_bug.cgi?id=9629
// converting a slice in place must write through to the parent array
@safe unittest
{
    wchar[] whole = "hello þ world"w.dup;
    auto thorn = whole[6 .. 7];
    toUpperInPlace(thorn);
    assert(whole == "hello Þ world");
}


@safe unittest
{
    import std.algorithm.comparison : cmp;

    // plain ASCII
    string source = "FoL";
    string lowered = toLower(source);
    assert(cmp(lowered, "fol") == 0, lowered);
    assert(lowered != source);

    char[] inPlace = source.dup;
    toLowerInPlace(inPlace);
    assert(inPlace == lowered);

    // two-byte UTF-8 sequences
    source = "A\u0100B\u0101d";
    lowered = toLower(source);
    inPlace = source.dup;
    assert(cmp(lowered, "a\u0101b\u0101d") == 0);
    assert(lowered !is source);
    toLowerInPlace(inPlace);
    assert(inPlace == lowered);

    source = "A\u0460B\u0461d";
    lowered = toLower(source);
    inPlace = source.dup;
    assert(cmp(lowered, "a\u0461b\u0461d") == 0);
    assert(lowered !is source);
    toLowerInPlace(inPlace);
    assert(inPlace == lowered);

    // one code point that lowercases to two
    source = "\u0130";
    lowered = toLower(source);
    inPlace = source.dup;
    assert(lowered == "i\u0307");
    assert(lowered !is source);
    toLowerInPlace(inPlace);
    assert(inPlace == lowered);

    // Test on wchar and dchar strings.
    assert(toLower("Some String"w) == "some string"w);
    assert(toLower("Some String"d) == "some string"d);

    // https://issues.dlang.org/show_bug.cgi?id=12455
    dchar c = 'İ'; // '\u0130' LATIN CAPITAL LETTER I WITH DOT ABOVE
    assert(isUpper(c));
    assert(toLower(c) == 'i');
    // extends on https://issues.dlang.org/show_bug.cgi?id=12455 report
    // check simple-case toUpper too
    c = '\u1f87';
    assert(isLower(c));
    assert(toUpper(c) == '\u1F8F');
}

// random access ranges of code units
@safe pure unittest
{
    import std.algorithm.comparison : cmp, equal;
    import std.utf : byCodeUnit;

    auto ascii = "FoL".byCodeUnit;
    assert(ascii.toLower.cmp("fol") == 0);

    auto mixed = "A\u0460B\u0461d".byCodeUnit;
    assert(mixed.toLower.cmp("a\u0461b\u0461d") == 0);
}

/++
    If `c` is a Unicode lowercase $(CHARACTER), then its uppercase equivalent
    is returned. Otherwise `c` is returned.

    Warning:
    Certain alphabets like German and Greek have no 1:1
    upper-lower mapping. Use the overload of toUpper which takes a full string instead.

    toUpper can be used as an argument to $(REF map, std,algorithm,iteration)
    to produce an algorithm that can convert a range of characters to upper case
    without allocating memory.
    A string can then be produced by using $(REF copy, std,algorithm,mutation)
    to send it to an $(REF appender, std,array).
+/
@safe pure nothrow @nogc
dchar toUpper(dchar c)
{
    if (c >= 0xAA)
    {
        // general case: consult the simple (1:1) uppercase mapping
        size_t tabIndex = toUpperSimpleIndex(c);
        if (tabIndex == ushort.max)
            return c;
        return toUpperTab(tabIndex);
    }

    // fast path: below 0xAA only 'a' .. 'z' are changed
    if (c < 'a' || c > 'z')
        return c;
    return c - 32;
}

///
@safe unittest
{
    import std.algorithm.iteration : map;
    import std.algorithm.mutation : copy;
    import std.array : appender;

    auto sink = appender!(char[])();
    "hello".map!toUpper.copy(sink);
    assert(sink.data == "HELLO");
}

@safe unittest
{
    import std.format : format;
    static import std.ascii;

    // the ASCII range has to agree with std.ascii
    foreach (code; 0 .. 0x80)
        assert(std.ascii.toUpper(code) == toUpper(code));

    assert(toUpper('я') == 'Я');
    assert(toUpper('δ') == 'Δ');

    // a lowercase code point has no mapping, or maps to an uppercase or
    // titlecase one
    auto titlecase = unicode.Titlecase_Letter;
    foreach (lowerCp; unicode.lowerCase.byCodepoint)
    {
        dchar raised = lowerCp.toUpper();
        assert(raised == lowerCp || isUpper(raised) || titlecase[raised],
            format("%x -> %x", lowerCp, raised));
    }
}

/++
    Allocates a new array which is identical to `s` except that all of its
    characters are converted to uppercase (by performing Unicode uppercase mapping).
    If none of `s` characters were affected, then `s` itself is returned if `s`
    is a `string`-like type.

    Params:
        s = A $(REF_ALTTEXT random access range, isRandomAccessRange, std,range,primitives)
        of characters
    Returns:
        A new array with the same element type as `s`.
+/
ElementEncodingType!S[] toUpper(S)(return scope S s) @trusted
if (isSomeString!S)
{
    static import std.ascii;
    return toCase!(UpperTriple, std.ascii.toUpper)(s);
}

/// ditto
ElementEncodingType!S[] toUpper(S)(S s)
if (!isSomeString!S && (isRandomAccessRange!S && hasLength!S && hasSlicing!S && isSomeChar!(ElementType!S)))
{
    static import std.ascii;
    return toCase!(UpperTriple, std.ascii.toUpper)(s);
}

// overloads for the most common cases to reduce compile time
@safe pure /*TODO nothrow*/
{
    string toUpper(return scope string s)
    { return toUpper!string(s); }
    wstring toUpper(return scope wstring s)
    { return toUpper!wstring(s); }
    dstring toUpper(return scope dstring s)
    { return toUpper!dstring(s); }

    @safe unittest
    {
        // https://issues.dlang.org/show_bug.cgi?id=16663
        // compile-only check: a type converting to string via `alias this`
        // has to be accepted

        static struct String
        {
            string data;
            alias data this;
        }

        void foo()
        {
            auto u = toUpper(String(""));
        }
    }
}

@safe unittest
{
    import std.algorithm.comparison : cmp;

    string s1 = "FoL";
    string s2;
    char[] s3;

    s2 = toUpper(s1);
    s3 = s1.dup; toUpperInPlace(s3);
    assert(s3 == s2, s3);
    assert(cmp(s2, "FOL") == 0);
    assert(s2 !is s1);

    s1 = "a\u0100B\u0101d";
    s2 = toUpper(s1);
    s3 = s1.dup; toUpperInPlace(s3);
    assert(s3 == s2);
    assert(cmp(s2, "A\u0100B\u0100D") == 0);
    assert(s2 !is s1);

    s1 = "a\u0460B\u0461d";
    s2 = toUpper(s1);
    s3 = s1.dup; toUpperInPlace(s3);
    assert(s3 == s2);
    assert(cmp(s2, "A\u0460B\u0460D") == 0);
    assert(s2 !is s1);
}

@safe unittest
{
    // Checks the allocating and the in-place conversions of `s` against the
    // expected upper/lower case forms.
    static void doTest(C)(const(C)[] s, const(C)[] trueUp, const(C)[] trueLow)
    {
        import std.format : format;
        // three placeholders: source, actual result, expected result
        string diff = "src: %( %x %)\nres: %( %x %)\ntru: %( %x %)";
        auto low = s.toLower() , up = s.toUpper();
        auto lowInp = s.dup, upInp = s.dup;
        lowInp.toLowerInPlace();
        upInp.toUpperInPlace();
        // `diff` needs all three arguments; with only two of them `format`
        // throws instead of producing the failure message
        assert(low == trueLow, format(diff, s, low, trueLow));
        assert(up == trueUp,  format(diff, s, up, trueUp));
        assert(lowInp == trueLow,
            format(diff, cast(const(ubyte)[]) s, cast(const(ubyte)[]) lowInp, cast(const(ubyte)[]) trueLow));
        assert(upInp == trueUp,
            format(diff, cast(const(ubyte)[]) s, cast(const(ubyte)[]) upInp, cast(const(ubyte)[]) trueUp));
    }
    static foreach (S; AliasSeq!(dstring, wstring, string))
    {{

        S easy = "123";
        S good = "abCФеж";
        S awful = "\u0131\u023f\u2126";
        S wicked = "\u0130\u1FE2";
        auto options = [easy, good, awful, wicked];
        S[] lower = ["123", "abcфеж", "\u0131\u023f\u03c9", "i\u0307\u1Fe2"];
        S[] upper = ["123", "ABCФЕЖ", "I\u2c7e\u2126", "\u0130\u03A5\u0308\u0300"];

        // these two convert without any reallocation
        foreach (val; [easy, good])
        {
            auto e = val.dup;
            auto g = e;
            e.toUpperInPlace();
            assert(e is g);
            e.toLowerInPlace();
            assert(e is g);
        }
        foreach (i, v; options)
        {
            doTest(v, upper[i], lower[i]);
        }

        // a few combinatorial runs
        foreach (i; 0 .. options.length)
        foreach (j; i .. options.length)
        foreach (k; j .. options.length)
        {
            auto sample = options[i] ~ options[j] ~ options[k];
            auto sample2 = options[k] ~ options[j] ~ options[i];
            doTest(sample, upper[i] ~ upper[j] ~ upper[k],
                lower[i] ~ lower[j] ~ lower[k]);
            doTest(sample2, upper[k] ~ upper[j] ~ upper[i],
                lower[k] ~ lower[j] ~ lower[i]);
        }
    }}
}

// test random access ranges
@safe pure unittest
{
    import std.algorithm.comparison : cmp;
    import std.utf : byCodeUnit;
    auto s1 = "FoL".byCodeUnit;
    assert(s1.toUpper.cmp("FOL") == 0);
    auto s2 = "a\u0460B\u0461d".byCodeUnit;
    assert(s2.toUpper.cmp("A\u0460B\u0460D") == 0);
}

/++
    Returns whether `c` is a Unicode alphabetic $(CHARACTER)
    (general Unicode category: Alphabetic).
+/
@safe pure nothrow @nogc
bool isAlpha(dchar c)
{
    // optimization: below 0xAA only the ASCII letters are accepted,
    // so the trie lookup can be skipped
    if (c < 0xAA)
    {
        return ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z');
    }

    return alphaTrie[c];
}

@safe unittest
{
    auto alpha = unicode("Alphabetic");
    foreach (ch; alpha.byCodepoint)
        assert(isAlpha(ch));
    foreach (ch; 0 .. 0x4000)
        assert((ch in alpha) == isAlpha(ch));
}


/++
    Returns whether `c` is a Unicode mark
    (general Unicode category: Mn, Me, Mc).
+/
@safe pure nothrow @nogc
bool isMark(dchar c)
{
    return markTrie[c];
}

@safe unittest
{
    auto mark = unicode("Mark");
    foreach (ch; mark.byCodepoint)
        assert(isMark(ch));
    foreach (ch; 0 .. 0x4000)
        assert((ch in mark) == isMark(ch));
}

/++
    Returns whether `c` is a Unicode numerical $(CHARACTER)
    (general Unicode category: Nd, Nl, No).
+/
@safe pure nothrow @nogc
bool isNumber(dchar c)
{
    // anything outside ASCII goes through the generated trie
    if (c > 0x7F)
        return numberTrie[c];

    // ASCII shortcut: only the decimal digits qualify
    return '0' <= c && c <= '9';
}

@safe unittest
{
    auto numbers = unicode("N");
    foreach (cp; numbers.byCodepoint)
        assert(isNumber(cp));
    foreach (cp; 0 .. 0x4000)
        assert((cp in numbers) == isNumber(cp));
}

/++
    Returns whether `c` is a Unicode alphabetic $(CHARACTER) or number.
    (general Unicode category: Alphabetic, Nd, Nl, No).

    Params:
        c = any Unicode character
    Returns:
        `true` if the character is in the Alphabetic, Nd, Nl, or No Unicode
        categories
+/
@safe pure nothrow @nogc
bool isAlphaNum(dchar c)
{
    static import std.ascii;

    // non-ASCII input: combine the two Unicode classifications
    if (!std.ascii.isASCII(c))
        return isAlpha(c) || isNumber(c);

    // ASCII shortcut
    return std.ascii.isAlphaNum(c);
}

@safe unittest
{
    auto numbers = unicode("N");
    auto letters = unicode("Alphabetic");

    foreach (cp; numbers.byCodepoint)
        assert(isAlphaNum(cp));

    foreach (cp; letters.byCodepoint)
        assert(isAlphaNum(cp));

    foreach (cp; 0 .. 0x4000)
    {
        assert(((cp in numbers) || (cp in letters)) == isAlphaNum(cp));
    }
}

/++
    Returns whether `c` is a Unicode punctuation $(CHARACTER)
    (general Unicode category: Pd, Ps, Pe, Pc, Po, Pi, Pf).
+/
@safe pure nothrow @nogc
bool isPunctuation(dchar c)
{
    static import std.ascii;

    // anything outside ASCII goes through the generated trie
    if (c > 0x7F)
        return punctuationTrie[c];

    // ASCII shortcut
    return std.ascii.isPunctuation(c);
}

@safe unittest
{
    static immutable dchar[] knownPunctuation =
        ['\u0021', '\u0028', '\u0029', '\u002D', '\u005F', '\u00AB', '\u00BB'];
    foreach (sample; knownPunctuation)
        assert(isPunctuation(sample));

    foreach (cp; unicode("P").byCodepoint)
        assert(isPunctuation(cp));
}

/++
    Returns whether `c` is a Unicode symbol $(CHARACTER)
    (general Unicode category: Sm, Sc, Sk, So).
+/
@safe pure nothrow @nogc
bool isSymbol(dchar c)
{
   return symbolTrie[c];
}

@safe unittest
{
    import std.format : format;

    static immutable dchar[] knownSymbols =
        ['\u0024', '\u002B', '\u005E', '\u00A6'];
    foreach (sample; knownSymbols)
        assert(isSymbol(sample));

    foreach (cp; unicode("S").byCodepoint)
        assert(isSymbol(cp), format("%04x", cp));
}

/++
    Returns whether `c` is a Unicode space $(CHARACTER)
    (general Unicode category: Zs)
    Note: This doesn't include '\n', '\r', '\t' and other non-space $(CHARACTER).
    For commonly used less strict semantics see $(LREF isWhite).
+/
@safe pure nothrow @nogc
bool isSpace(dchar c)
{
    // the predicate itself lives in the generated tables module
    import std.internal.unicode_tables : spaceLookup = isSpaceGen;
    return spaceLookup(c);
}

@safe unittest
{
    assert(isSpace('\u0020'));

    auto separators = unicode.Zs;
    foreach (cp; separators.byCodepoint)
        assert(isSpace(cp));
    foreach (cp; 0 .. 0x1000)
        assert(isSpace(cp) == separators[cp]);
}


/++
    Returns whether `c` is a Unicode graphical $(CHARACTER)
    (general Unicode category: L, M, N, P, S, Zs).

+/
@safe pure nothrow @nogc
bool isGraphical(dchar c)
{
    return graphicalTrie[c];
}


@safe unittest
{
    import std.format : format;

    auto graphical = unicode("Graphical");
    foreach (cp; graphical.byCodepoint)
        assert(isGraphical(cp), format("%4x", cp));
    foreach (cp; 0 .. 0x4000)
        assert((cp in graphical) == isGraphical(cp));
}


/++
    Returns whether `c` is a Unicode control $(CHARACTER)
    (general Unicode category: Cc).
+/
@safe pure nothrow @nogc
bool isControl(dchar c)
{
    // the predicate itself lives in the generated tables module
    import std.internal.unicode_tables : controlLookup = isControlGen;
    return controlLookup(c);
}

@safe unittest
{
    assert(isControl('\u0000'));
    assert(isControl('\u0081'));
    assert(!isControl('\u0100'));

    auto controls = unicode.Cc;
    foreach (cp; controls.byCodepoint)
        assert(isControl(cp));
    foreach (cp; 0 .. 0x1000)
        assert(isControl(cp) == controls[cp]);
}


/++
    Returns whether `c` is a Unicode formatting $(CHARACTER)
    (general Unicode category: Cf).
+/
@safe pure nothrow @nogc
bool isFormat(dchar c)
{
    // the predicate itself lives in the generated tables module
    import std.internal.unicode_tables : formatLookup = isFormatGen;
    return formatLookup(c);
}


@safe unittest
{
    assert(isFormat('\u00AD'));

    foreach (cp; unicode("Format").byCodepoint)
        assert(isFormat(cp));
}

// code points for private use, surrogates are not likely to change in near future
// if need be they can be generated from unicode data as well

/++
    Returns whether `c` is a Unicode Private Use $(CODEPOINT)
    (general Unicode category: Co).
+/
@safe pure nothrow @nogc
bool isPrivateUse(dchar c)
{
    // private use block inside the BMP
    if (c >= 0x00_E000 && c <= 0x00_F8FF)
        return true;
    // plane 15, without its last two code points
    if (c >= 0x0F_0000 && c <= 0x0F_FFFD)
        return true;
    // plane 16, without its last two code points
    return c >= 0x10_0000 && c <= 0x10_FFFD;
}

/++
    Returns whether `c` is a Unicode surrogate $(CODEPOINT)
    (general Unicode category: Cs).
+/
@safe pure nothrow @nogc
bool isSurrogate(dchar c)
{
    return c >= 0xD800 && c <= 0xDFFF;
}

/++
    Returns whether `c` is a Unicode high surrogate (lead surrogate).
+/
@safe pure nothrow @nogc
bool isSurrogateHi(dchar c)
{
    return c >= 0xD800 && c <= 0xDBFF;
}

/++
    Returns whether `c` is a Unicode low surrogate (trail surrogate).
+/
@safe pure nothrow @nogc
bool isSurrogateLo(dchar c)
{
    return c >= 0xDC00 && c <= 0xDFFF;
}

/++
    Returns whether `c` is a Unicode non-character i.e.
    a $(CODEPOINT) with no assigned abstract character.
    (general Unicode category: Cn)
+/
@safe pure nothrow @nogc
bool isNonCharacter(dchar c)
{
    return nonCharacterTrie[c];
}

@safe unittest
{
    auto unassigned = unicode("Cn");
    foreach (cp; unassigned.byCodepoint)
        assert(isNonCharacter(cp));
}

private:
// load static data from pre-generated tables into usable data structures


// Expands a compressed interval list into a CodepointSet.
@safe auto asSet(const (ubyte)[] compressed) pure
{
    auto intervals = decompressIntervals(compressed);
    return CodepointSet.fromIntervals(intervals);
}

// Wraps the raw arrays of a generated TrieEntry into a const CodepointTrie.
@safe pure nothrow auto asTrie(T...)(const scope TrieEntry!T e)
{
    alias Trie = const(CodepointTrie!T);
    return Trie(e.offsets, e.sizes, e.data);
}

@safe pure nothrow @nogc @property
{
    // It's important to use auto return here, so that the compiler
    // only runs semantic on the return type if the function gets
    // used. Also these are functions rather than templates to not
    // increase the object size of the caller.
    auto lowerCaseTrie()
    {
        static immutable trie = asTrie(lowerCaseTrieEntries);
        return trie;
    }

    auto upperCaseTrie()
    {
        static immutable trie = asTrie(upperCaseTrieEntries);
        return trie;
    }

    auto simpleCaseTrie()
    {
        static immutable trie = asTrie(simpleCaseTrieEntries);
        return trie;
    }

    auto fullCaseTrie()
    {
        static immutable trie = asTrie(fullCaseTrieEntries);
        return trie;
    }

    auto alphaTrie()
    {
        static immutable trie = asTrie(alphaTrieEntries);
        return trie;
    }

    auto markTrie()
    {
        static immutable trie = asTrie(markTrieEntries);
        return trie;
    }

    auto numberTrie()
    {
        static immutable trie = asTrie(numberTrieEntries);
        return trie;
    }

    auto punctuationTrie()
    {
        static immutable trie = asTrie(punctuationTrieEntries);
        return trie;
    }

    auto symbolTrie()
    {
        static immutable trie = asTrie(symbolTrieEntries);
        return trie;
    }

    auto graphicalTrie()
    {
        static immutable trie = asTrie(graphicalTrieEntries);
        return trie;
    }

    auto nonCharacterTrie()
    {
        static immutable trie = asTrie(nonCharacterTrieEntries);
        return trie;
    }

    //normalization quick-check tables
    auto nfcQCTrie()
    {
        import std.internal.unicode_norm : entries = nfcQCTrieEntries;
        static immutable trie = asTrie(entries);
        return trie;
    }

    auto nfdQCTrie()
    {
        import std.internal.unicode_norm : entries = nfdQCTrieEntries;
        static immutable trie = asTrie(entries);
        return trie;
    }

    auto nfkcQCTrie()
    {
        import std.internal.unicode_norm : entries = nfkcQCTrieEntries;
        static immutable trie = asTrie(entries);
        return trie;
    }

    auto nfkdQCTrie()
    {
        import std.internal.unicode_norm : entries = nfkdQCTrieEntries;
        static immutable trie = asTrie(entries);
        return trie;
    }

    //grapheme breaking algorithm tables
    auto spacingMarkTrie()
    {
        import std.internal.unicode_grapheme : entries = spacingMarkTrieEntries;
        static immutable trie = asTrie(entries);
        return trie;
    }

    auto graphemeExtendTrie()
    {
        import std.internal.unicode_grapheme : entries = graphemeExtendTrieEntries;
        static immutable trie = asTrie(entries);
        return trie;
    }

    auto hangLV()
    {
        import std.internal.unicode_grapheme : entries = hangulLVTrieEntries;
        static immutable trie = asTrie(entries);
        return trie;
    }

    auto hangLVT()
    {
        import std.internal.unicode_grapheme : entries = hangulLVTTrieEntries;
        static immutable trie = asTrie(entries);
        return trie;
    }

    auto prependTrie()
    {
        import std.internal.unicode_grapheme : entries = prependTrieEntries;
        static immutable trie = asTrie(entries);
        return trie;
    }

    auto graphemeControlTrie()
    {
        import std.internal.unicode_grapheme : entries = controlTrieEntries;
        static immutable trie = asTrie(entries);
        return trie;
    }

    auto xpictoTrie()
    {
        import std.internal.unicode_grapheme : entries = Extended_PictographicTrieEntries;
        static immutable trie = asTrie(entries);
        return trie;
    }

    // tables below are used for composition/decomposition
    auto combiningClassTrie()
    {
        import std.internal.unicode_comp : entries = combiningClassTrieEntries;
        static immutable trie = asTrie(entries);
        return trie;
    }

    auto compatMappingTrie()
    {
        import std.internal.unicode_decomp : entries = compatMappingTrieEntries;
        static immutable trie = asTrie(entries);
        return trie;
    }

    auto canonMappingTrie()
    {
        import std.internal.unicode_decomp : entries = canonMappingTrieEntries;
        static immutable trie = asTrie(entries);
        return trie;
    }

    auto compositionJumpTrie()
    {
        import std.internal.unicode_comp : entries = compositionJumpTrieEntries;
        static immutable trie = asTrie(entries);
        return trie;
    }

    //case conversion tables
    auto toUpperIndexTrie()
    {
        static immutable trie = asTrie(toUpperIndexTrieEntries);
        return trie;
    }

    auto toLowerIndexTrie()
    {
        static immutable trie = asTrie(toLowerIndexTrieEntries);
        return trie;
    }

    auto toTitleIndexTrie()
    {
        static immutable trie = asTrie(toTitleIndexTrieEntries);
        return trie;
    }

    //simple case conversion tables
    auto toUpperSimpleIndexTrie()
    {
        static immutable trie = asTrie(toUpperSimpleIndexTrieEntries);
        return trie;
    }

    auto toLowerSimpleIndexTrie()
    {
        static immutable trie = asTrie(toLowerSimpleIndexTrieEntries);
        return trie;
    }

    auto toTitleSimpleIndexTrie()
    {
        static immutable trie = asTrie(toTitleSimpleIndexTrieEntries);
        return trie;
    }

}

}// version (!std_uni_bootstrap)