std.uni source code

1 // Written in the D programming language.
2 
3 /++
4     $(P The `std.uni` module provides an implementation
5     of fundamental Unicode algorithms and data structures.
6     This doesn't include UTF encoding and decoding primitives,
7     see $(REF decode, std,_utf) and $(REF encode, std,_utf) in $(MREF std, utf)
8     for this functionality. )
9 
10 $(SCRIPT inhibitQuickIndex = 1;)
11 $(DIVC quickindex,
12 $(BOOKTABLE,
13 $(TR $(TH Category) $(TH Functions))
14 $(TR $(TD Decode) $(TD
15     $(LREF byCodePoint)
16     $(LREF byGrapheme)
17     $(LREF decodeGrapheme)
18     $(LREF graphemeStride)
19 ))
20 $(TR $(TD Comparison) $(TD
21     $(LREF icmp)
22     $(LREF sicmp)
23 ))
24 $(TR $(TD Classification) $(TD
25     $(LREF isAlpha)
26     $(LREF isAlphaNum)
27     $(LREF isCodepointSet)
28     $(LREF isControl)
29     $(LREF isFormat)
30     $(LREF isGraphical)
31     $(LREF isIntegralPair)
32     $(LREF isMark)
33     $(LREF isNonCharacter)
34     $(LREF isNumber)
35     $(LREF isPrivateUse)
36     $(LREF isPunctuation)
37     $(LREF isSpace)
38     $(LREF isSurrogate)
39     $(LREF isSurrogateHi)
40     $(LREF isSurrogateLo)
41     $(LREF isSymbol)
42     $(LREF isWhite)
43 ))
44 $(TR $(TD Normalization) $(TD
45     $(LREF NFC)
46     $(LREF NFD)
47     $(LREF NFKD)
48     $(LREF NormalizationForm)
49     $(LREF normalize)
50 ))
51 $(TR $(TD Decompose) $(TD
52     $(LREF decompose)
53     $(LREF decomposeHangul)
54     $(LREF UnicodeDecomposition)
55 ))
56 $(TR $(TD Compose) $(TD
57     $(LREF compose)
58     $(LREF composeJamo)
59 ))
60 $(TR $(TD Sets) $(TD
61     $(LREF CodepointInterval)
62     $(LREF CodepointSet)
63     $(LREF InversionList)
64     $(LREF unicode)
65 ))
66 $(TR $(TD Trie) $(TD
67     $(LREF codepointSetTrie)
68     $(LREF CodepointSetTrie)
69     $(LREF codepointTrie)
70     $(LREF CodepointTrie)
71     $(LREF toTrie)
72     $(LREF toDelegate)
73 ))
74 $(TR $(TD Casing) $(TD
75     $(LREF asCapitalized)
76     $(LREF asLowerCase)
77     $(LREF asUpperCase)
78     $(LREF isLower)
79     $(LREF isUpper)
80     $(LREF toLower)
81     $(LREF toLowerInPlace)
82     $(LREF toUpper)
83     $(LREF toUpperInPlace)
84 ))
85 $(TR $(TD Utf8Matcher) $(TD
86     $(LREF isUtfMatcher)
87     $(LREF MatcherConcept)
88     $(LREF utfMatcher)
89 ))
90 $(TR $(TD Separators) $(TD
91     $(LREF lineSep)
92     $(LREF nelSep)
93     $(LREF paraSep)
94 ))
95 $(TR $(TD Building blocks) $(TD
96     $(LREF allowedIn)
97     $(LREF combiningClass)
98     $(LREF Grapheme)
99 ))
100 ))
101 
102     $(P All primitives listed operate on Unicode characters and
103         sets of characters. For functions which operate on ASCII characters
104         and ignore Unicode $(CHARACTERS), see $(MREF std, ascii).
105         For definitions of Unicode $(CHARACTER), $(CODEPOINT) and other terms
106         used throughout this module see the $(S_LINK Terminology, terminology) section
107         below.
108     )
109     $(P The focus of this module is the core needs of developing Unicode-aware
110         applications. To that effect it provides the following optimized primitives:
111     )
112     $(UL
113         $(LI Character classification by category and common properties:
114             $(LREF isAlpha), $(LREF isWhite) and others.
115         )
116         $(LI
117             Case-insensitive string comparison ($(LREF sicmp), $(LREF icmp)).
118         )
119         $(LI
120             Converting text to any of the four normalization forms via $(LREF normalize).
121         )
122         $(LI
123             Decoding ($(LREF decodeGrapheme))  and iteration ($(LREF byGrapheme), $(LREF graphemeStride))
124             by user-perceived characters, that is by $(LREF Grapheme) clusters.
125         )
126         $(LI
127             Decomposing and composing of individual character(s) according to canonical
128             or compatibility rules, see $(LREF compose) and $(LREF decompose),
129             including the specific version for Hangul syllables $(LREF composeJamo)
130             and $(LREF decomposeHangul).
131         )
132     )
133     $(P It's recognized that an application may need further enhancements
134         and extensions, such as less commonly known algorithms,
135         or tailoring existing ones for region specific needs. To help users
136         with building any extra functionality beyond the core primitives,
137         the module provides:
138     )
139     $(UL
140         $(LI
141             $(LREF CodepointSet), a type for easy manipulation of sets of characters.
142             Besides the typical set algebra it provides an unusual feature:
143             a D source code generator for detection of $(CODEPOINTS) in this set.
144             This is a boon for meta-programming parser frameworks,
145             and is used internally to power classification in small
146             sets like $(LREF isWhite).
147         )
148         $(LI
149             A way to construct optimal packed multi-stage tables also known as a
150             special case of $(LINK2 https://en.wikipedia.org/wiki/Trie, Trie).
151             The functions $(LREF codepointTrie), $(LREF codepointSetTrie)
152             construct custom tries that map dchar to value.
153             The end result is a fast and predictable $(BIGOH 1) lookup that powers
154             functions like $(LREF isAlpha) and $(LREF combiningClass),
155             but for user-defined data sets.
156         )
157         $(LI
158             A useful technique for Unicode-aware parsers that perform
159             character classification of encoded $(CODEPOINTS)
160             is to avoid unnecessary decoding at all costs.
161             $(LREF utfMatcher) provides an improvement over the usual workflow
162             of decode-classify-process, combining the decoding and classification
163             steps. By extracting necessary bits directly from encoded
164             $(S_LINK Code unit, code units) matchers achieve
165             significant performance improvements. See $(LREF MatcherConcept) for
166             the common interface of UTF matchers.
167         )
168         $(LI
169             Generally useful building blocks for customized normalization:
170             $(LREF combiningClass) for querying combining class
171             and $(LREF allowedIn) for testing the Quick_Check
172             property of a given normalization form.
173         )
174         $(LI
175             Access to a large selection of commonly used sets of $(CODEPOINTS).
176             $(S_LINK Unicode properties, Supported sets) include Script,
177             Block and General Category. The exact contents of a set can be
178             observed in the CLDR utility, on the
179             $(HTTP www.unicode.org/cldr/utility/properties.jsp, property index) page
180             of the Unicode website.
181             See $(LREF unicode) for easy and (optionally) compile-time checked set
182             queries.
183         )
184     )
185     $(SECTION Synopsis)
186     ---
187     import std.uni;
188     void main()
189     {
190         // initialize code point sets using script/block or property name
191         // now 'set' contains code points from both scripts.
192         auto set = unicode("Cyrillic") | unicode("Armenian");
193         // same thing but simpler and checked at compile-time
194         auto ascii = unicode.ASCII;
195         auto currency = unicode.Currency_Symbol;
196 
197         // easy set ops
198         auto a = set & ascii;
199         assert(a.empty); // as it has no intersection with ascii
200         a = set | ascii;
201         auto b = currency - a; // subtract all ASCII, Cyrillic and Armenian
202 
203         // some properties of code point sets
204         assert(b.length > 45); // 46 items in Unicode 6.1, even more in 6.2
205         // testing presence of a code point in a set
206         // is just fine, it is O(logN)
207         assert(!b['$']);
208         assert(!b['\u058F']); // Armenian dram sign
209         assert(b['¥']);
210 
211         // building fast lookup tables, these guarantee O(1) complexity
212         // 1-level Trie lookup table essentially a huge bit-set ~262Kb
213         auto oneTrie = toTrie!1(b);
214         // 2-level far more compact but typically slightly slower
215         auto twoTrie = toTrie!2(b);
216         // 3-level even smaller, and a bit slower yet
217         auto threeTrie = toTrie!3(b);
218         assert(oneTrie['£']);
219         assert(twoTrie['£']);
220         assert(threeTrie['£']);
221 
222         // build the trie with the most sensible trie level
223         // and bind it as a functor
224         auto cyrillicOrArmenian = toDelegate(set);
225         auto balance = find!(cyrillicOrArmenian)("Hello ընկեր!");
226         assert(balance == "ընկեր!");
227         // compatible with bool delegate(dchar)
228         bool delegate(dchar) bindIt = cyrillicOrArmenian;
229 
230         // Normalization
231         string s = "Plain ascii (and not only), is always normalized!";
232         assert(s is normalize(s));// is the same string
233 
234         string nonS = "A\u0308ffin"; // 'A' followed by combining diaeresis (U+0308)
235         auto nS = normalize(nonS); // to NFC, the W3C endorsed standard
236         assert(nS == "Äffin");
237         assert(nS != nonS);
238         string composed = "Äffin";
239 
240         assert(normalize!NFD(composed) == "A\u0308ffin");
241         // to NFKD, compatibility decomposition useful for fuzzy matching/searching
242         assert(normalize!NFKD("2¹⁰") == "210");
243     }
244     ---
245     $(SECTION Terminology)
246     $(P The following is a list of important Unicode notions
247     and definitions. Any conventions used specifically in this
248     module alone are marked as such. The descriptions are based on the formal
249     definition as found in $(HTTP www.unicode.org/versions/Unicode6.2.0/ch03.pdf,
250     chapter three of The Unicode Standard Core Specification.)
251     )
252     $(P $(DEF Abstract character) A unit of information used for the organization,
253         control, or representation of textual data.
254         Note that:
255         $(UL
256             $(LI When representing data, the nature of that data
257                 is generally symbolic as opposed to some other
258                 kind of data (for example, visual).
259             )
260              $(LI An abstract character has no concrete form
261                 and should not be confused with a $(S_LINK Glyph, glyph).
262             )
263             $(LI An abstract character does not necessarily
264                 correspond to what a user thinks of as a “character”
265                 and should not be confused with a $(LREF Grapheme).
266             )
267             $(LI The abstract characters encoded (see Encoded character)
268                 are known as Unicode abstract characters.
269             )
270             $(LI Abstract characters not directly
271                 encoded by the Unicode Standard can often be
272                 represented by the use of combining character sequences.
273             )
274         )
275     )
276     $(P $(DEF Canonical decomposition)
277         The decomposition of a character or character sequence
278         that results from recursively applying the canonical
279         mappings found in the Unicode Character Database
280         and those described in Conjoining Jamo Behavior
281         (section 12 of
282         $(HTTP www.unicode.org/uni2book/ch03.pdf, Unicode Conformance)).
283     )
284     $(P $(DEF Canonical composition)
285         The precise definition of the Canonical composition
286         is the algorithm as specified in $(HTTP www.unicode.org/uni2book/ch03.pdf,
287         Unicode Conformance) section 11.
288         Informally it's the process that does the reverse of the canonical
289         decomposition with the addition of certain rules
290         that e.g. prevent legacy characters from appearing in the composed result.
291     )
292     $(P $(DEF Canonical equivalent)
293         Two character sequences are said to be canonical equivalents if
294         their full canonical decompositions are identical.
295     )
296     $(P $(DEF Character) Typically differs by context.
297         For the purpose of this documentation the term $(I character)
298         implies $(I encoded character), that is, a code point having
299         an assigned abstract character (a symbolic meaning).
300     )
301     $(P $(DEF Code point) Any value in the Unicode codespace;
302         that is, the range of integers from 0 to 10FFFF (hex).
303         Not all code points are assigned to encoded characters.
304     )
305     $(P $(DEF Code unit) The minimal bit combination that can represent
306         a unit of encoded text for processing or interchange.
307         Depending on the encoding this could be:
308         8-bit code units in the UTF-8 (`char`),
309         16-bit code units in the UTF-16 (`wchar`),
310         and 32-bit code units in the UTF-32 (`dchar`).
311         $(I Note that in UTF-32, a code unit is a code point
312         and is represented by the D `dchar` type.)
313     )
314     $(P $(DEF Combining character) A character with the General Category
315         of Combining Mark(M).
316         $(UL
317             $(LI All characters with non-zero canonical combining class
318             are combining characters, but the reverse is not the case:
319             there are combining characters with a zero combining class.
320             )
321             $(LI These characters are not normally used in isolation
322             unless they are being described. They include such characters
323             as accents, diacritics, Hebrew points, Arabic vowel signs,
324             and Indic matras.
325             )
326         )
327     )
328     $(P $(DEF Combining class)
329         A numerical value used by the Unicode Canonical Ordering Algorithm
330         to determine which sequences of combining marks are to be
331         considered canonically equivalent and  which are not.
332     )
333     $(P $(DEF Compatibility decomposition)
334         The decomposition of a character or character sequence that results
335         from recursively applying both the compatibility mappings and
336         the canonical mappings found in the Unicode Character Database, and those
337         described in Conjoining Jamo Behavior, until no characters
338         can be further decomposed.
339     )
340     $(P $(DEF Compatibility equivalent)
341         Two character sequences are said to be compatibility
342         equivalents if their full compatibility decompositions are identical.
343     )
344     $(P $(DEF Encoded character) An association (or mapping)
345         between an abstract character and a code point.
346     )
347     $(P $(DEF Glyph) The actual, concrete image of a glyph representation
348         having been rasterized or otherwise imaged onto some display surface.
349     )
350     $(P $(DEF Grapheme base) A character with the property
351         Grapheme_Base, or any standard Korean syllable block.
352     )
353     $(P $(DEF Grapheme cluster) Defined as the text between
354         grapheme boundaries  as specified by Unicode Standard Annex #29,
355         $(HTTP www.unicode.org/reports/tr29/, Unicode text segmentation).
356         Important general properties of a grapheme:
357         $(UL
358             $(LI The grapheme cluster represents a horizontally segmentable
359             unit of text, consisting of some grapheme base (which may
360             consist of a Korean syllable) together with any number of
361             nonspacing marks applied to it.
362             )
363             $(LI  A grapheme cluster typically starts with a grapheme base
364             and then extends across any subsequent sequence of nonspacing marks.
365             A grapheme cluster is most directly relevant to text rendering and
366             processes such as cursor placement and text selection in editing,
367             but may also be relevant to comparison and searching.
368             )
369             $(LI For many processes, a grapheme cluster behaves as if it was a
370             single character with the same properties as its grapheme base.
371             Effectively, nonspacing marks apply $(I graphically) to the base,
372             but do not change its properties.
373             )
374         )
375         $(P This module defines a number of primitives that work with graphemes:
376         $(LREF Grapheme), $(LREF decodeGrapheme) and $(LREF graphemeStride).
377         All of them are using $(I extended grapheme) boundaries
378         as defined in the aforementioned standard annex.
379         )
380     )
381     $(P $(DEF Nonspacing mark) A combining character with the
382         General Category of Nonspacing Mark (Mn) or Enclosing Mark (Me).
383     )
384     $(P $(DEF Spacing mark) A combining character that is not a nonspacing mark.
385     )
386     $(SECTION Normalization)
387     $(P The concepts of $(S_LINK Canonical equivalent, canonical equivalent)
388         or $(S_LINK Compatibility equivalent, compatibility equivalent)
389         characters in the Unicode Standard make it necessary to have a full, formal
390         definition of equivalence for Unicode strings.
391         String equivalence is determined by a process called normalization,
392         whereby strings are converted into forms which are compared
393         directly for identity. This is the primary goal of the normalization process,
394         see the function $(LREF normalize) to convert into any of
395         the four defined forms.
396     )
397     $(P A very important attribute of the Unicode Normalization Forms
398         is that they must remain stable between versions of the Unicode Standard.
399         A Unicode string normalized to a particular Unicode Normalization Form
400         in one version of the standard is guaranteed to remain in that Normalization
401         Form for implementations of future versions of the standard.
402     )
403     $(P The Unicode Standard specifies four normalization forms.
404         Informally, two of these forms are defined by maximal decomposition
405         of equivalent sequences, and two of these forms are defined
406         by maximal $(I composition) of equivalent sequences.
407             $(UL
408             $(LI Normalization Form D (NFD): The $(S_LINK Canonical decomposition,
409                 canonical decomposition) of a character sequence.)
410             $(LI Normalization Form KD (NFKD): The $(S_LINK Compatibility decomposition,
411                 compatibility decomposition) of a character sequence.)
412             $(LI Normalization Form C (NFC): The canonical composition of the
413                 $(S_LINK Canonical decomposition, canonical decomposition)
414                 of a coded character sequence.)
415             $(LI Normalization Form KC (NFKC): The canonical composition
416             of the $(S_LINK Compatibility decomposition,
417                 compatibility decomposition) of a character sequence.)
418             )
419     )
420     $(P The choice of the normalization form depends on the particular use case.
421         NFC is the best form for general text, since it's more compatible with
422         strings converted from legacy encodings. NFKC is the preferred form for
423         identifiers, especially where there are security concerns. NFD and NFKD
424         are the most useful for internal processing.
425     )
426     $(SECTION Construction of lookup tables)
427     $(P The Unicode standard describes a set of algorithms that
428         depend on having the ability to quickly look up various properties
429         of a code point. Given the codespace of about 1 million $(CODEPOINTS),
430         it is not a trivial task to provide a space-efficient solution for
431         the multitude of properties.
432     )
433     $(P Common approaches such as hash-tables or binary search over
434         sorted code point intervals (as in $(LREF InversionList)) are insufficient.
435         Hash-tables have enormous memory footprint and binary search
436         over intervals is not fast enough for some heavy-duty algorithms.
437     )
438     $(P The recommended solution (see Unicode Implementation Guidelines)
439         is using multi-stage tables that are an implementation of the
440         $(HTTP en.wikipedia.org/wiki/Trie, Trie) data structure with integer
441         keys and a fixed number of stages. For the remainder of the section
442         this will be called a fixed trie. The following describes a particular
443         implementation that is aimed for the speed of access at the expense
444         of ideal size savings.
445     )
446     $(P Taking a 2-level Trie as an example the principle of operation is as follows.
447         Split the number of bits in a key (code point, 21 bits) into 2 components
448         (e.g. 13 and 8).  The first is the number of bits in the index of the trie
449          and the other is number of bits in each page of the trie.
450         The layout of the trie is then an array of size 2^^bits-of-index followed by
451         an array of memory chunks of size 2^^bits-of-page/bits-per-element.
452     )
453     $(P The number of pages is variable (but not less than 1)
454         unlike the number of entries in the index. The slots of the index
455         all have to contain a number of a page that is present. The lookup is then
456         just a couple of operations - slice the upper bits,
457         look up an index for these, take a page at this index and use
458         the lower bits as an offset within this page.
459 
460         Assuming that pages are laid out consecutively
461         in one array at `pages`, the pseudo-code is:
462     )
463     ---
464     auto elemsPerPage = (2 ^^ bits_per_page) / Value.sizeOfInBits;
465     pages[index[n >> bits_per_page]][n & (elemsPerPage - 1)];
466     ---
467     $(P Where if `elemsPerPage` is a power of 2 the whole process is
468         a handful of simple instructions and 2 array reads. Subsequent levels
469         of the trie are introduced by recursing on this notion - the index array
470         is treated as values. The number of bits in index is then again
471         split into 2 parts, with pages over 'current-index' and the new 'upper-index'.
472     )
473 
474     $(P For completeness a level 1 trie is simply an array.
475         The current implementation takes advantage of bit-packing values
476         when the range is known to be limited in advance (such as `bool`).
477         See also $(LREF BitPacked) for enforcing it manually.
478         The major size advantage however comes from the fact
479         that multiple $(B identical pages on every level are merged) by construction.
480     )
481     $(P The process of constructing a trie is more involved and is hidden from
482         the user in a form of the convenience functions $(LREF codepointTrie),
483         $(LREF codepointSetTrie) and the even more convenient $(LREF toTrie).
484         In general a set or built-in AA with `dchar` type
485         can be turned into a trie. The trie object in this module
486         is read-only (immutable); it's effectively frozen after construction.
487     )
488     $(SECTION Unicode properties)
489     $(P This is a full list of Unicode properties accessible through $(LREF unicode)
490         with specific helpers per category nested within. Consult the
491         $(HTTP www.unicode.org/cldr/utility/properties.jsp, CLDR utility)
492         when in doubt about the contents of a particular set.
493     )
494     $(P General category sets listed below are only accessible with the
495         $(LREF unicode) shorthand accessor.)
496         $(BOOKTABLE $(B General category ),
497              $(TR $(TH Abb.) $(TH Long form)
498                 $(TH Abb.) $(TH Long form)$(TH Abb.) $(TH Long form))
499             $(TR $(TD L) $(TD Letter)
500                 $(TD Cn) $(TD Unassigned)  $(TD Po) $(TD Other_Punctuation))
501             $(TR $(TD Ll) $(TD Lowercase_Letter)
502                 $(TD Co) $(TD Private_Use) $(TD Ps) $(TD Open_Punctuation))
503             $(TR $(TD Lm) $(TD Modifier_Letter)
504                 $(TD Cs) $(TD Surrogate)   $(TD S) $(TD Symbol))
505             $(TR $(TD Lo) $(TD Other_Letter)
506                 $(TD N) $(TD Number)  $(TD Sc) $(TD Currency_Symbol))
507             $(TR $(TD Lt) $(TD Titlecase_Letter)
508               $(TD Nd) $(TD Decimal_Number)  $(TD Sk) $(TD Modifier_Symbol))
509             $(TR $(TD Lu) $(TD Uppercase_Letter)
510               $(TD Nl) $(TD Letter_Number)   $(TD Sm) $(TD Math_Symbol))
511             $(TR $(TD M) $(TD Mark)
512               $(TD No) $(TD Other_Number)    $(TD So) $(TD Other_Symbol))
513             $(TR $(TD Mc) $(TD Spacing_Mark)
514               $(TD P) $(TD Punctuation) $(TD Z) $(TD Separator))
515             $(TR $(TD Me) $(TD Enclosing_Mark)
516               $(TD Pc) $(TD Connector_Punctuation)   $(TD Zl) $(TD Line_Separator))
517             $(TR $(TD Mn) $(TD Nonspacing_Mark)
518               $(TD Pd) $(TD Dash_Punctuation)    $(TD Zp) $(TD Paragraph_Separator))
519             $(TR $(TD C) $(TD Other)
520               $(TD Pe) $(TD Close_Punctuation) $(TD Zs) $(TD Space_Separator))
521             $(TR $(TD Cc) $(TD Control) $(TD Pf)
522               $(TD Final_Punctuation)   $(TD -) $(TD Any))
523             $(TR $(TD Cf) $(TD Format)
524               $(TD Pi) $(TD Initial_Punctuation) $(TD -) $(TD ASCII))
525     )
526     $(P Sets for other commonly useful properties that are
527         accessible with $(LREF unicode):)
528         $(BOOKTABLE $(B Common binary properties),
529             $(TR $(TH Name) $(TH Name) $(TH Name))
530             $(TR $(TD Alphabetic)  $(TD Ideographic) $(TD Other_Uppercase))
531             $(TR $(TD ASCII_Hex_Digit) $(TD IDS_Binary_Operator) $(TD Pattern_Syntax))
532             $(TR $(TD Bidi_Control)    $(TD ID_Start)    $(TD Pattern_White_Space))
533             $(TR $(TD Cased)   $(TD IDS_Trinary_Operator)    $(TD Quotation_Mark))
534             $(TR $(TD Case_Ignorable)  $(TD Join_Control)    $(TD Radical))
535             $(TR $(TD Dash)    $(TD Logical_Order_Exception) $(TD Soft_Dotted))
536             $(TR $(TD Default_Ignorable_Code_Point)    $(TD Lowercase)   $(TD STerm))
537             $(TR $(TD Deprecated)  $(TD Math)    $(TD Terminal_Punctuation))
538             $(TR $(TD Diacritic)   $(TD Noncharacter_Code_Point) $(TD Unified_Ideograph))
539             $(TR $(TD Extender)    $(TD Other_Alphabetic)    $(TD Uppercase))
540             $(TR $(TD Grapheme_Base)   $(TD Other_Default_Ignorable_Code_Point)  $(TD Variation_Selector))
541             $(TR $(TD Grapheme_Extend) $(TD Other_Grapheme_Extend)   $(TD White_Space))
542             $(TR $(TD Grapheme_Link)   $(TD Other_ID_Continue)   $(TD XID_Continue))
543             $(TR $(TD Hex_Digit)   $(TD Other_ID_Start)  $(TD XID_Start))
544             $(TR $(TD Hyphen)  $(TD Other_Lowercase) )
545             $(TR $(TD ID_Continue) $(TD Other_Math)  )
546     )
547     $(P Below is the table with block names accepted by $(LREF unicode.block).
548         Note that the shorthand version $(LREF unicode) requires "In"
549         to be prepended to the names of blocks so as to disambiguate
550         scripts and blocks.
551     )
552     $(BOOKTABLE $(B Blocks),
553         $(TR $(TD Aegean Numbers)    $(TD Ethiopic Extended) $(TD Mongolian))
554         $(TR $(TD Alchemical Symbols)    $(TD Ethiopic Extended-A)   $(TD Musical Symbols))
555         $(TR $(TD Alphabetic Presentation Forms) $(TD Ethiopic Supplement)   $(TD Myanmar))
556         $(TR $(TD Ancient Greek Musical Notation)    $(TD General Punctuation)   $(TD Myanmar Extended-A))
557         $(TR $(TD Ancient Greek Numbers) $(TD Geometric Shapes)  $(TD New Tai Lue))
558         $(TR $(TD Ancient Symbols)   $(TD Georgian)  $(TD NKo))
559         $(TR $(TD Arabic)    $(TD Georgian Supplement)   $(TD Number Forms))
560         $(TR $(TD Arabic Extended-A) $(TD Glagolitic)    $(TD Ogham))
561         $(TR $(TD Arabic Mathematical Alphabetic Symbols)    $(TD Gothic)    $(TD Ol Chiki))
562         $(TR $(TD Arabic Presentation Forms-A)   $(TD Greek and Coptic)  $(TD Old Italic))
563         $(TR $(TD Arabic Presentation Forms-B)   $(TD Greek Extended)    $(TD Old Persian))
564         $(TR $(TD Arabic Supplement) $(TD Gujarati)  $(TD Old South Arabian))
565         $(TR $(TD Armenian)  $(TD Gurmukhi)  $(TD Old Turkic))
566         $(TR $(TD Arrows)    $(TD Halfwidth and Fullwidth Forms) $(TD Optical Character Recognition))
567         $(TR $(TD Avestan)   $(TD Hangul Compatibility Jamo) $(TD Oriya))
568         $(TR $(TD Balinese)  $(TD Hangul Jamo)   $(TD Osmanya))
569         $(TR $(TD Bamum) $(TD Hangul Jamo Extended-A)    $(TD Phags-pa))
570         $(TR $(TD Bamum Supplement)  $(TD Hangul Jamo Extended-B)    $(TD Phaistos Disc))
571         $(TR $(TD Basic Latin)   $(TD Hangul Syllables)  $(TD Phoenician))
572         $(TR $(TD Batak) $(TD Hanunoo)   $(TD Phonetic Extensions))
573         $(TR $(TD Bengali)   $(TD Hebrew)    $(TD Phonetic Extensions Supplement))
574         $(TR $(TD Block Elements)    $(TD High Private Use Surrogates)   $(TD Playing Cards))
575         $(TR $(TD Bopomofo)  $(TD High Surrogates)   $(TD Private Use Area))
576         $(TR $(TD Bopomofo Extended) $(TD Hiragana)  $(TD Rejang))
577         $(TR $(TD Box Drawing)   $(TD Ideographic Description Characters)    $(TD Rumi Numeral Symbols))
578         $(TR $(TD Brahmi)    $(TD Imperial Aramaic)  $(TD Runic))
579         $(TR $(TD Braille Patterns)  $(TD Inscriptional Pahlavi) $(TD Samaritan))
580         $(TR $(TD Buginese)  $(TD Inscriptional Parthian)    $(TD Saurashtra))
581         $(TR $(TD Buhid) $(TD IPA Extensions)    $(TD Sharada))
582         $(TR $(TD Byzantine Musical Symbols) $(TD Javanese)  $(TD Shavian))
583         $(TR $(TD Carian)    $(TD Kaithi)    $(TD Sinhala))
584         $(TR $(TD Chakma)    $(TD Kana Supplement)   $(TD Small Form Variants))
585         $(TR $(TD Cham)  $(TD Kanbun)    $(TD Sora Sompeng))
586         $(TR $(TD Cherokee)  $(TD Kangxi Radicals)   $(TD Spacing Modifier Letters))
587         $(TR $(TD CJK Compatibility) $(TD Kannada)   $(TD Specials))
588         $(TR $(TD CJK Compatibility Forms)   $(TD Katakana)  $(TD Sundanese))
589         $(TR $(TD CJK Compatibility Ideographs)  $(TD Katakana Phonetic Extensions)  $(TD Sundanese Supplement))
590         $(TR $(TD CJK Compatibility Ideographs Supplement)   $(TD Kayah Li)  $(TD Superscripts and Subscripts))
591         $(TR $(TD CJK Radicals Supplement)   $(TD Kharoshthi)    $(TD Supplemental Arrows-A))
592         $(TR $(TD CJK Strokes)   $(TD Khmer) $(TD Supplemental Arrows-B))
593         $(TR $(TD CJK Symbols and Punctuation)   $(TD Khmer Symbols) $(TD Supplemental Mathematical Operators))
594         $(TR $(TD CJK Unified Ideographs)    $(TD Lao)   $(TD Supplemental Punctuation))
595         $(TR $(TD CJK Unified Ideographs Extension A)    $(TD Latin-1 Supplement)    $(TD Supplementary Private Use Area-A))
596         $(TR $(TD CJK Unified Ideographs Extension B)    $(TD Latin Extended-A)  $(TD Supplementary Private Use Area-B))
597         $(TR $(TD CJK Unified Ideographs Extension C)    $(TD Latin Extended Additional) $(TD Syloti Nagri))
598         $(TR $(TD CJK Unified Ideographs Extension D)    $(TD Latin Extended-B)  $(TD Syriac))
599         $(TR $(TD Combining Diacritical Marks)   $(TD Latin Extended-C)  $(TD Tagalog))
600         $(TR $(TD Combining Diacritical Marks for Symbols)   $(TD Latin Extended-D)  $(TD Tagbanwa))
601         $(TR $(TD Combining Diacritical Marks Supplement)    $(TD Lepcha)    $(TD Tags))
602         $(TR $(TD Combining Half Marks)  $(TD Letterlike Symbols)    $(TD Tai Le))
603         $(TR $(TD Common Indic Number Forms) $(TD Limbu) $(TD Tai Tham))
604         $(TR $(TD Control Pictures)  $(TD Linear B Ideograms)    $(TD Tai Viet))
605         $(TR $(TD Coptic)    $(TD Linear B Syllabary)    $(TD Tai Xuan Jing Symbols))
606         $(TR $(TD Counting Rod Numerals) $(TD Lisu)  $(TD Takri))
607         $(TR $(TD Cuneiform) $(TD Low Surrogates)    $(TD Tamil))
608         $(TR $(TD Cuneiform Numbers and Punctuation) $(TD Lycian)    $(TD Telugu))
609         $(TR $(TD Currency Symbols)  $(TD Lydian)    $(TD Thaana))
610         $(TR $(TD Cypriot Syllabary) $(TD Mahjong Tiles) $(TD Thai))
611         $(TR $(TD Cyrillic)  $(TD Malayalam) $(TD Tibetan))
612         $(TR $(TD Cyrillic Extended-A)   $(TD Mandaic)   $(TD Tifinagh))
613         $(TR $(TD Cyrillic Extended-B)   $(TD Mathematical Alphanumeric Symbols) $(TD Transport And Map Symbols))
614         $(TR $(TD Cyrillic Supplement)   $(TD Mathematical Operators)    $(TD Ugaritic))
615         $(TR $(TD Deseret)   $(TD Meetei Mayek)  $(TD Unified Canadian Aboriginal Syllabics))
616         $(TR $(TD Devanagari)    $(TD Meetei Mayek Extensions)   $(TD Unified Canadian Aboriginal Syllabics Extended))
617         $(TR $(TD Devanagari Extended)   $(TD Meroitic Cursive)  $(TD Vai))
618         $(TR $(TD Dingbats)  $(TD Meroitic Hieroglyphs)  $(TD Variation Selectors))
619         $(TR $(TD Domino Tiles)  $(TD Miao)  $(TD Variation Selectors Supplement))
620         $(TR $(TD Egyptian Hieroglyphs)  $(TD Miscellaneous Mathematical Symbols-A)  $(TD Vedic Extensions))
621         $(TR $(TD Emoticons) $(TD Miscellaneous Mathematical Symbols-B)  $(TD Vertical Forms))
622         $(TR $(TD Enclosed Alphanumerics)    $(TD Miscellaneous Symbols) $(TD Yijing Hexagram Symbols))
623         $(TR $(TD Enclosed Alphanumeric Supplement)  $(TD Miscellaneous Symbols and Arrows)  $(TD Yi Radicals))
624         $(TR $(TD Enclosed CJK Letters and Months)   $(TD Miscellaneous Symbols And Pictographs) $(TD Yi Syllables))
625         $(TR $(TD Enclosed Ideographic Supplement)   $(TD Miscellaneous Technical)   )
626         $(TR $(TD Ethiopic)  $(TD Modifier Tone Letters) )
627     )
628     $(P Below is the table with script names accepted by $(LREF unicode.script)
629         and by the shorthand version $(LREF unicode):)
630         $(BOOKTABLE $(B Scripts),
631             $(TR $(TD Arabic)  $(TD Hanunoo) $(TD Old_Italic))
632             $(TR $(TD Armenian)    $(TD Hebrew)  $(TD Old_Persian))
633             $(TR $(TD Avestan) $(TD Hiragana)    $(TD Old_South_Arabian))
634             $(TR $(TD Balinese)    $(TD Imperial_Aramaic)    $(TD Old_Turkic))
635             $(TR $(TD Bamum)   $(TD Inherited)   $(TD Oriya))
636             $(TR $(TD Batak)   $(TD Inscriptional_Pahlavi)   $(TD Osmanya))
637             $(TR $(TD Bengali) $(TD Inscriptional_Parthian)  $(TD Phags_Pa))
638             $(TR $(TD Bopomofo)    $(TD Javanese)    $(TD Phoenician))
639             $(TR $(TD Brahmi)  $(TD Kaithi)  $(TD Rejang))
640             $(TR $(TD Braille) $(TD Kannada) $(TD Runic))
641             $(TR $(TD Buginese)    $(TD Katakana)    $(TD Samaritan))
642             $(TR $(TD Buhid)   $(TD Kayah_Li)    $(TD Saurashtra))
643             $(TR $(TD Canadian_Aboriginal) $(TD Kharoshthi)  $(TD Sharada))
644             $(TR $(TD Carian)  $(TD Khmer)   $(TD Shavian))
645             $(TR $(TD Chakma)  $(TD Lao) $(TD Sinhala))
646             $(TR $(TD Cham)    $(TD Latin)   $(TD Sora_Sompeng))
647             $(TR $(TD Cherokee)    $(TD Lepcha)  $(TD Sundanese))
648             $(TR $(TD Common)  $(TD Limbu)   $(TD Syloti_Nagri))
649             $(TR $(TD Coptic)  $(TD Linear_B)    $(TD Syriac))
650             $(TR $(TD Cuneiform)   $(TD Lisu)    $(TD Tagalog))
651             $(TR $(TD Cypriot) $(TD Lycian)  $(TD Tagbanwa))
652             $(TR $(TD Cyrillic)    $(TD Lydian)  $(TD Tai_Le))
653             $(TR $(TD Deseret) $(TD Malayalam)   $(TD Tai_Tham))
654             $(TR $(TD Devanagari)  $(TD Mandaic) $(TD Tai_Viet))
655             $(TR $(TD Egyptian_Hieroglyphs)    $(TD Meetei_Mayek)    $(TD Takri))
656             $(TR $(TD Ethiopic)    $(TD Meroitic_Cursive)    $(TD Tamil))
657             $(TR $(TD Georgian)    $(TD Meroitic_Hieroglyphs)    $(TD Telugu))
658             $(TR $(TD Glagolitic)  $(TD Miao)    $(TD Thaana))
659             $(TR $(TD Gothic)  $(TD Mongolian)   $(TD Thai))
660             $(TR $(TD Greek)   $(TD Myanmar) $(TD Tibetan))
661             $(TR $(TD Gujarati)    $(TD New_Tai_Lue) $(TD Tifinagh))
662             $(TR $(TD Gurmukhi)    $(TD Nko) $(TD Ugaritic))
663             $(TR $(TD Han) $(TD Ogham)   $(TD Vai))
664             $(TR $(TD Hangul)  $(TD Ol_Chiki)    $(TD Yi))
665     )
666     $(P Below is the table of names accepted by $(LREF unicode.hangulSyllableType).)
667         $(BOOKTABLE $(B Hangul syllable type),
668             $(TR $(TH Abb.) $(TH Long form))
669             $(TR $(TD L)   $(TD Leading_Jamo))
670             $(TR $(TD LV)  $(TD LV_Syllable))
671             $(TR $(TD LVT) $(TD LVT_Syllable) )
672             $(TR $(TD T)   $(TD Trailing_Jamo))
673             $(TR $(TD V)   $(TD Vowel_Jamo))
674     )
675     References:
676         $(HTTP www.digitalmars.com/d/ascii-table.html, ASCII Table),
677         $(HTTP en.wikipedia.org/wiki/Unicode, Wikipedia),
678         $(HTTP www.unicode.org, The Unicode Consortium),
679         $(HTTP www.unicode.org/reports/tr15/, Unicode normalization forms),
        $(HTTP www.unicode.org/reports/tr29/, Unicode text segmentation),
        $(HTTP www.unicode.org/uni2book/ch05.pdf,
            Unicode Implementation Guidelines),
        $(HTTP www.unicode.org/uni2book/ch03.pdf,
            Unicode Conformance)
685     Trademarks:
686         Unicode(tm) is a trademark of Unicode, Inc.
687 
688     Copyright: Copyright 2013 -
689     License:   $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
690     Authors:   Dmitry Olshansky
691     Source:    $(PHOBOSSRC std/uni/package.d)
692     Standards: $(HTTP www.unicode.org/versions/Unicode6.2.0/, Unicode v6.2)
693 
694 Macros:
695 
696 SECTION = <h3><a id="$1">$0</a></h3>
697 DEF = <div><a id="$1"><i>$0</i></a></div>
698 S_LINK = <a href="#$1">$+</a>
699 CODEPOINT = $(S_LINK Code point, code point)
700 CODEPOINTS = $(S_LINK Code point, code points)
701 CHARACTER = $(S_LINK Character, character)
702 CHARACTERS = $(S_LINK Character, characters)
703 CLUSTER = $(S_LINK Grapheme cluster, grapheme cluster)
704 +/
705 module std.uni;
706 
707 import std.meta : AliasSeq;
708 import std.range.primitives : back, ElementEncodingType, ElementType, empty,
709     front, hasLength, hasSlicing, isForwardRange, isInputRange,
710     isRandomAccessRange, popFront, put, save;
711 import std.traits : isConvertibleToString, isIntegral, isSomeChar,
712     isSomeString, Unqual, isDynamicArray;
713 // debug = std_uni;
714 
715 import std.internal.unicode_tables; // generated file
716 
717 debug(std_uni) import std.stdio; // writefln, writeln
718 
719 private:
720 
721 
// Element-wise copy of `src` into `dest`, walking from the last index down to
// the first. Both slices must have the same length.
void copyBackwards(T,U)(T[] src, U[] dest)
{
    assert(src.length == dest.length);
    foreach_reverse (idx; 0 .. src.length)
        dest[idx] = src[idx];
}
728 
// Element-wise copy of `src` into `dest`, walking from the first index up to
// the last. Both slices must have the same length.
void copyForward(T,U)(T[] src, U[] dest)
{
    assert(src.length == dest.length);
    foreach (idx; 0 .. src.length)
        dest[idx] = src[idx];
}
735 
// Compile-time flag: true on the architectures listed here, false everywhere
// else. PackedPtrImpl consults it before accessing packed 16/32-bit items
// directly through a narrower pointer type.
// TODO: update to reflect all major CPUs supporting unaligned reads
version (X86)
    enum hasUnalignedReads = true;
else version (X86_64)
    enum hasUnalignedReads = true;
else version (SystemZ)
    enum hasUnalignedReads = true;
else
    enum hasUnalignedReads = false; // better be safe than sorry
745 
// Named separator code points. They are marked `public` explicitly because
// they follow the module-level `private:` label.
public enum dchar lineSep = '\u2028'; /// Constant $(CODEPOINT) (0x2028) - line separator.
public enum dchar paraSep = '\u2029'; /// Constant $(CODEPOINT) (0x2029) - paragraph separator.
public enum dchar nelSep  = '\u0085'; /// Constant $(CODEPOINT) (0x0085) - next line.
749 
// test the intro example
@safe unittest
{
    import std.algorithm.searching : find;
    // initialize code point sets using script/block or property name
    // set contains code points from both scripts.
    auto set = unicode("Cyrillic") | unicode("Armenian");
    // or simpler and statically-checked look
    auto ascii = unicode.ASCII;
    auto currency = unicode.Currency_Symbol;

    // easy set ops
    auto a = set & ascii;
    assert(a.empty); // as it has no intersection with ascii
    a = set | ascii;
    auto b = currency - a; // subtract all ASCII, Cyrillic and Armenian

    // some properties of code point sets
    assert(b.length > 45); // 46 items in Unicode 6.1, even more in 6.2
    // testing presence of a code point in a set
    // is just fine, it is O(logN)
    assert(!b['$']);
    assert(!b['\u058F']); // Armenian dram sign
    assert(b['¥']);

    // building fast lookup tables, these guarantee O(1) complexity
    // 1-level Trie lookup table essentially a huge bit-set ~262Kb
    auto oneTrie = toTrie!1(b);
    // 2-level far more compact but typically slightly slower
    auto twoTrie = toTrie!2(b);
    // 3-level even smaller, and a bit slower yet
    auto threeTrie = toTrie!3(b);
    assert(oneTrie['£']);
    assert(twoTrie['£']);
    assert(threeTrie['£']);

    // build the trie with the most sensible trie level
    // and bind it as a functor
    auto cyrillicOrArmenian = toDelegate(set);
    auto balance = find!(cyrillicOrArmenian)("Hello ընկեր!");
    assert(balance == "ընկեր!");
    // compatible with bool delegate(dchar)
    bool delegate(dchar) bindIt = cyrillicOrArmenian;

    // Normalization
    string s = "Plain ascii (and not only), is always normalized!";
    assert(s is normalize(s));// is the same string

    // 'A' followed by U+0308 (combining diaeresis), i.e. a decomposed 'Ä'
    string nonS = "A\u0308ffin";
    auto nS = normalize(nonS); // to NFC, the W3C endorsed standard
    assert(nS == "Äffin");
    assert(nS != nonS);
    string composed = "Äffin";

    // NFD splits the precomposed letter back into base + combining mark
    assert(normalize!NFD(composed) == "A\u0308ffin");
    // to NFKD, compatibility decomposition useful for fuzzy matching/searching
    assert(normalize!NFKD("2¹⁰") == "210");
}
808 
// Highest code point value in the Unicode code space (U+10FFFF).
enum lastDchar = 0x10FFFF;
810 
// Converts `from` to the integral type `T`, asserting beforehand that the
// value lies within [T.min, T.max]. Selected when `T` differs from `F`.
auto force(T, F)(F from)
if (!is(T == F) && isIntegral!T)
{
    assert(T.min <= from && from <= T.max);
    return cast(T) from;
}
817 
// Converts `from` to the bit-packed type `T`, asserting beforehand that the
// value does not exceed 2^^bitSizeOf!T-1. Selected when `T` differs from `F`.
auto force(T, F)(F from)
if (!is(T == F) && isBitPacked!T)
{
    assert(2^^bitSizeOf!T-1 >= from);
    alias Raw = TypeOfBitPacked!T;
    return T(cast(Raw) from);
}
824 
// Identity overload: source and target types are the same, so the value is
// handed back untouched.
auto force(T, F)(F from)
if (is(F == T))
{
    return from;
}
830 
// Repeats the bit pattern in `val` `times` times, assuming its length is
// `bits`. Copies are packed next to each other starting at the least
// significant bit, so replicateBits!(4, 2)(0b01) yields 0b01_01_01_01.
//
// Params:
//     times = number of copies to produce (compile-time)
//     bits  = width of a single copy in bits (compile-time)
//     val   = the pattern; callers are expected to pass a value that fits
//             into `bits` bits
// Returns: the packed repetition as a machine word.
size_t replicateBits(size_t times, size_t bits)(size_t val) @safe pure nothrow @nogc
{
    static if (times == 1)
        return val;
    else static if (bits == 1)
    {
        // a single bit replicated `times` times is simply a run of ones
        static if (times == size_t.sizeof*8)
            return val ? size_t.max : 0;
        else
            // Shift a size_t, not an int literal: `1 << times` is evaluated
            // in 32 bits and breaks as soon as times reaches 32.
            return val ? (size_t(1) << times)-1 : 0;
    }
    else static if (times % 2)
        // odd count: replicate times-1 copies, then append one more
        return (replicateBits!(times-1, bits)(val)<<bits) | val;
    else
        // even count: double the pattern width and halve the copy count
        return replicateBits!(times/2, bits*2)((val << bits) | val);
}
848 
// Checks replicateBits for 1 to 10 copies of a 3-bit and of a 2-bit pattern.
@safe pure nothrow @nogc unittest // for replicate
{
    import std.algorithm.iteration : sum, map;
    import std.range : iota;
    size_t m = 0b111;
    size_t m2 = 0b01;
    static foreach (i; AliasSeq!(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
    {
        // i copies of 0b111 are 3*i consecutive one bits, i.e. 2^^(3*i) - 1
        assert(replicateBits!(i, 3)(m)+1 == (1<<(3*i)));
        // i copies of 0b01 set exactly the even bits 0, 2, .. 2*(i-1)
        assert(replicateBits!(i, 2)(m2) == iota(0, i).map!"2^^(2*a)"().sum());
    }
}
861 
// multiple arrays squashed into one memory block
//
// One bit-packed array is kept per entry of `Types`; all of them are stored
// back to back inside a single `size_t[]` (`storage`). `offsets[i]` is the
// word index at which array `i` starts and `sz[i]` is its length in items.
// The words reserved for array `i` are always at least
// spaceFor!(bitSizeOf!(Types[i]))(sz[i]).
struct MultiArray(Types...)
{
    import std.range.primitives : isOutputRange;

    // Allocates zero-initialized storage for all arrays at once.
    // `sizes[i]` is the item count of array `i`; exactly one size per type
    // has to be supplied.
    this(size_t[] sizes...) @safe pure nothrow
    {
        assert(dim == sizes.length);
        size_t full_size;
        foreach (i, v; Types)
        {
            full_size += spaceFor!(bitSizeOf!v)(sizes[i]);
            sz[i] = sizes[i];
            static if (i >= 1)
                offsets[i] = offsets[i-1] +
                    spaceFor!(bitSizeOf!(Types[i-1]))(sizes[i-1]);
        }

        storage = new size_t[full_size];
    }

    // Wraps already laid out data (offsets, sizes and the raw words) without
    // copying the words; the result is const.
    this(const(size_t)[] raw_offsets,
        const(size_t)[] raw_sizes,
        return scope const(size_t)[] data) return scope const @safe pure nothrow @nogc
    {
        offsets[] = raw_offsets[];
        sz[] = raw_sizes[];
        storage = data;
    }

    // Bounds-aware packed view over array `n` (sz[n] items).
    @property auto slice(size_t n)()inout pure nothrow @nogc
    {
        auto ptr = raw_ptr!n;
        return packedArrayView!(Types[n])(ptr, sz[n]);
    }

    // Unchecked packed pointer to the first item of array `n`.
    @property auto ptr(size_t n)()inout pure nothrow @nogc
    {
        auto ptr = raw_ptr!n;
        return inout(PackedPtr!(Types[n]))(ptr);
    }

    // Getter/setter pair for the item count of array `n`.
    template length(size_t n)
    {
        @property size_t length()const @safe pure nothrow @nogc{ return sz[n]; }

        // Resizes array `n` to `new_size` items. The arrays stored after it
        // are moved inside `storage` and their offsets adjusted; items added
        // by growing read as zero.
        @property void length(size_t new_size)
        {
            if (new_size > sz[n])
            {// extend
                size_t delta = (new_size - sz[n]);
                sz[n] += delta;
                delta = spaceFor!(bitSizeOf!(Types[n]))(delta);
                storage.length +=  delta;// extend space at end
                // raw_slice!x must follow resize as it could be moved!
                // next stmts move all data past this array, last-one-goes-first
                static if (n != dim-1)
                {
                    auto start = raw_ptr!(n+1);
                    // len includes delta
                    size_t len = (storage.ptr+storage.length-start);

                    copyBackwards(start[0 .. len-delta], start[delta .. len]);

                    start[0 .. delta] = 0;
                    // offsets are used for raw_slice, ptr etc.
                    foreach (i; n+1 .. dim)
                        offsets[i] += delta;
                }
            }
            else if (new_size < sz[n])
            {// shrink
                // Number of whole words that become unused. Taking the
                // difference of the required word counts (rather than the
                // words needed for the removed items) guarantees that a word
                // still holding live items is never released.
                immutable delta = spaceFor!(bitSizeOf!(Types[n]))(sz[n])
                    - spaceFor!(bitSizeOf!(Types[n]))(new_size);
                sz[n] = new_size;
                // move all data past this array towards the front of the
                // storage, first-one-goes-first (destination is below source)
                static if (n != dim-1)
                {
                    if (delta != 0)
                    {
                        immutable tail = offsets[n+1];
                        copyForward(storage[tail .. $],
                            storage[tail - delta .. $ - delta]);

                        // adjust offsets last, they affect raw_slice
                        foreach (i; n+1 .. dim)
                            offsets[i] -= delta;
                    }
                }
                storage.length -= delta;
            }
            // else - NOP
        }
    }

    // Size in bytes of the whole storage (default) or of the words reserved
    // for array `n`.
    @property size_t bytes(size_t n=size_t.max)() const @safe
    {
        static if (n == size_t.max)
            return storage.length*size_t.sizeof;
        else
        {
            // Word index where array n begins. Computed from offsets instead
            // of raw pointers so that the function really is @safe; array 0
            // always starts at word 0, exactly as raw_ptr!0 assumes.
            static if (n == 0)
                immutable size_t begin = 0;
            else
                immutable size_t begin = offsets[n];

            static if (n != Types.length-1)
                return (offsets[n+1] - begin)*size_t.sizeof;
            else
                return (storage.length - begin)*size_t.sizeof;
        }
    }

    // Writes offsets, sizes and raw storage to `sink` as three
    // comma-separated hexadecimal array literals.
    void store(OutRange)(scope OutRange sink) const
        if (isOutputRange!(OutRange, char))
    {
        import std.format.write : formattedWrite;
        formattedWrite(sink, "[%( 0x%x, %)]", offsets[]);
        formattedWrite(sink, ", [%( 0x%x, %)]", sz[]);
        formattedWrite(sink, ", [%( 0x%x, %)]", storage);
    }

private:
    import std.meta : staticMap;
    // Pointer to the first word of array `n`.
    @property auto raw_ptr(size_t n)()inout pure nothrow @nogc
    {
        static if (n == 0)
            return storage.ptr;
        else
        {
            return storage.ptr+offsets[n];
        }
    }
    enum dim = Types.length;
    size_t[dim] offsets;// offset for level x
    size_t[dim] sz;// size of level x
    alias bitWidth = staticMap!(bitSizeOf, Types);
    size_t[] storage;
}
989 
// Exercises MultiArray resizing: after every change of one level's length the
// contents of the other levels must be unchanged. The body is a lambda so that
// the same scenario is evaluated both at compile time and at run time.
@system unittest
{
    import std.conv : text;
    enum dg = (){
        // sizes are:
        // lvl0: 3, lvl1 : 2, lvl2: 1
        auto m = MultiArray!(int, ubyte, int)(3,2,1);

        // level k must hold 1, 2, .. n in its first n items
        static void check(size_t k, T)(ref T m, int n)
        {
            foreach (i; 0 .. n)
                assert(m.slice!(k)[i] == i+1, text("level:",i," : ",m.slice!(k)[0 .. n]));
        }

        // level k must hold n, n-1, .. 1 in its first n items
        static void checkB(size_t k, T)(ref T m, int n)
        {
            foreach (i; 0 .. n)
                assert(m.slice!(k)[i] == n-i, text("level:",i," : ",m.slice!(k)[0 .. n]));
        }

        // writes 1, 2, .. n into level k
        static void fill(size_t k, T)(ref T m, int n)
        {
            foreach (i; 0 .. n)
                m.slice!(k)[i] = force!ubyte(i+1);
        }

        // writes n, n-1, .. 1 into level k
        static void fillB(size_t k, T)(ref T m, int n)
        {
            foreach (i; 0 .. n)
                m.slice!(k)[i] = force!ubyte(n-i);
        }

        // grow the middle level
        m.length!1 = 100;
        fill!1(m, 100);
        check!1(m, 100);

        // grow the first level; level 1 is moved and must keep its data
        m.length!0 = 220;
        fill!0(m, 220);
        check!1(m, 100);
        check!0(m, 220);

        // grow the last level; nothing has to be moved
        m.length!2 = 17;
        fillB!2(m, 17);
        checkB!2(m, 17);
        check!0(m, 220);
        check!1(m, 100);

        // grow the last level again; its old items must survive
        m.length!2 = 33;
        checkB!2(m, 17);
        fillB!2(m, 33);
        checkB!2(m, 33);
        check!0(m, 220);
        check!1(m, 100);

        // grow the middle level once more; level 2 is moved
        m.length!1 = 195;
        fillB!1(m, 195);
        checkB!1(m, 195);
        checkB!2(m, 33);
        check!0(m, 220);

        // bit-packed element types, including a shrink of level 0
        auto marr = MultiArray!(BitPacked!(uint, 4), BitPacked!(uint, 6))(20, 10);
        marr.length!0 = 15;
        marr.length!1 = 30;
        fill!1(marr, 30);
        fill!0(marr, 15);
        check!1(marr, 30);
        check!0(marr, 15);
        return 0;
    };
    enum ct = dg(); // forces compile-time evaluation
    auto rt = dg(); // same scenario at run time
}
1062 
// Fills a five-level MultiArray of differently sized bit-packed items and
// verifies that writes to one level never disturb the others.
@system unittest
{// more bitpacking tests
    import std.conv : text;

    alias Bitty =
      MultiArray!(BitPacked!(size_t, 3)
                , BitPacked!(size_t, 4)
                , BitPacked!(size_t, 3)
                , BitPacked!(size_t, 6)
                , bool);
    // bit-field extractors splitting a 16-bit index into 3+4+3+6 bits,
    // matching the widths of levels 0 .. 3
    alias fn1 = sliceBits!(13, 16);
    alias fn2 = sliceBits!( 9, 13);
    alias fn3 = sliceBits!( 6,  9);
    alias fn4 = sliceBits!( 0,  6);
    // every item of level lvl must equal its own index
    static void check(size_t lvl, MA)(ref MA arr){
        for (size_t i = 0; i< arr.length!lvl; i++)
            assert(arr.slice!(lvl)[i] == i, text("Mismatch on lvl ", lvl, " idx ", i, " value: ", arr.slice!(lvl)[i]));
    }

    // stores each index into its own slot of level lvl
    static void fillIdx(size_t lvl, MA)(ref MA arr){
        for (size_t i = 0; i< arr.length!lvl; i++)
            arr.slice!(lvl)[i] = i;
    }
    Bitty m1;

    // size the levels from last to first so that every resize except the
    // first one has to move the levels behind it
    m1.length!4 = 10;
    m1.length!3 = 2^^6;
    m1.length!2 = 2^^3;
    m1.length!1 = 2^^4;
    m1.length!0 = 2^^3;

    m1.length!4 = 2^^16;

    for (size_t i = 0; i< m1.length!4; i++)
        m1.slice!(4)[i] = i % 2;

    fillIdx!1(m1);
    check!1(m1);
    fillIdx!2(m1);
    check!2(m1);
    fillIdx!3(m1);
    check!3(m1);
    fillIdx!0(m1);
    check!0(m1);
    // re-check the earlier levels after the later writes
    check!3(m1);
    check!2(m1);
    check!1(m1);
    // interleaved writes to all five levels ...
    for (size_t i=0; i < 2^^16; i++)
    {
        m1.slice!(4)[i] = i % 2;
        m1.slice!(0)[fn1(i)] = fn1(i);
        m1.slice!(1)[fn2(i)] = fn2(i);
        m1.slice!(2)[fn3(i)] = fn3(i);
        m1.slice!(3)[fn4(i)] = fn4(i);
    }
    // ... followed by reading everything back
    for (size_t i=0; i < 2^^16; i++)
    {
        assert(m1.slice!(4)[i] == i % 2);
        assert(m1.slice!(0)[fn1(i)] == fn1(i));
        assert(m1.slice!(1)[fn2(i)] == fn2(i));
        assert(m1.slice!(2)[fn3(i)] == fn3(i));
        assert(m1.slice!(3)[fn4(i)] == fn4(i));
    }
}
1127 
// Number of machine words needed to hold `new_len` items of `_bits` bits each.
// Item width is first rounded up to a power of two (see PackedArrayView);
// items wider than a word occupy a whole number of words, narrower items are
// packed several to a word and the result is rounded up.
size_t spaceFor(size_t _bits)(size_t new_len) @safe pure nothrow @nogc
{
    import std.math.algebraic : nextPow2;
    enum size_t wordBits = 8*size_t.sizeof;
    static if (_bits == 1)
        enum size_t slotBits = 1;
    else
        enum size_t slotBits = nextPow2(_bits - 1);

    static if (slotBits > wordBits)
    {
        static assert(slotBits % wordBits == 0);
        return new_len * slotBits/wordBits;
    }
    else
    {
        enum perWord = wordBits/slotBits;
        immutable padded = new_len+perWord-1; // rounds the division up
        return padded/perWord;
    }
}
1143 
// True for the element types the packed containers accept: BitPacked
// wrappers, integral types, bool and character types.
enum bool isBitPackableType(T) =
    isBitPacked!T || isIntegral!T || is(T == bool) || isSomeChar!T;
1149 
//============================================================================
// Selects the PackedArrayViewImpl instance for `T`: the item width is
// bitSizeOf!T rounded up to a power of two (1 stays 1).
template PackedArrayView(T)
if ((is(T dummy == BitPacked!(U, sz), U, size_t sz)
    && isBitPackableType!U) || isBitPackableType!T)
{
    import std.math.algebraic : nextPow2;
    private enum width = bitSizeOf!T;
    static if (width > 1)
        alias PackedArrayView = PackedArrayViewImpl!(T, nextPow2(width - 1));
    else
        alias PackedArrayView = PackedArrayViewImpl!(T, 1);
}
1159 
//unsafe and fast access to a chunk of RAM as if it contains packed values
// Selects the PackedPtrImpl instance for `T`: the item width is bitSizeOf!T
// rounded up to a power of two (1 stays 1).
template PackedPtr(T)
if ((is(T dummy == BitPacked!(U, sz), U, size_t sz)
    && isBitPackableType!U) || isBitPackableType!T)
{
    import std.math.algebraic : nextPow2;
    private enum width = bitSizeOf!T;
    static if (width > 1)
        alias PackedPtr = PackedPtrImpl!(T, nextPow2(width - 1));
    else
        alias PackedPtr = PackedPtrImpl!(T, 1);
}
1169 
// Pointer-like accessor that treats a run of size_t words as a sequence of
// `T` values stored `bits` bits apiece, `factor` values per word. It keeps no
// length, so indices are not bounds-checked here.
struct PackedPtrImpl(T, size_t bits)
{
pure nothrow:
    static assert(isPow2OrZero(bits));

    // Wraps `ptr`, the first word of the packed data.
    this(inout(size_t)* ptr)inout @safe @nogc
    {
        origin = ptr;
    }

    // Generic read: pick word n / factor, shift the wanted item down to bit 0
    // and mask off its neighbours.
    private T simpleIndex(size_t n) inout
    {
        immutable q = n / factor;
        immutable r = n % factor;
        return cast(T)((origin[q] >> bits*r) & mask);
    }

    // Generic write: clear the item's bit field inside its word, then OR in
    // the new value. For integral T the value must fit into `bits` bits.
    private void simpleWrite(TypeOfBitPacked!T val, size_t n)
    in
    {
        static if (isIntegral!T)
            assert(val <= mask);
    }
    do
    {
        immutable q = n / factor;
        immutable r = n % factor;
        immutable tgt_shift = bits*r;
        immutable word = origin[q];
        origin[q] = (word & ~(mask << tgt_shift))
            | (cast(size_t) val << tgt_shift);
    }

    // Fast path: an item occupies exactly a byte, a 16-bit or 32-bit unit, or
    // a whole word, so the storage can be reinterpreted as an array of a
    // matching unsigned type U. The 16/32-bit variants are taken only when
    // hasUnalignedReads is set.
    static if (factor == bytesPerWord// can safely pack by byte
         || factor == 1 // a whole word at a time
         || ((factor == bytesPerWord/2 || factor == bytesPerWord/4)
                && hasUnalignedReads)) // this needs unaligned reads
    {
        static if (factor == bytesPerWord)
            alias U = ubyte;
        else static if (factor == bytesPerWord/2)
            alias U = ushort;
        else static if (factor == bytesPerWord/4)
            alias U = uint;
        else static if (size_t.sizeof == 8 && factor == bytesPerWord/8)
            alias U = ulong;

        // Reads item `idx`. The direct U* access is used only on
        // little-endian targets at run time; during CTFE and on other targets
        // the shift-and-mask path is taken.
        T opIndex(size_t idx) inout
        {
            T ret;
            version (LittleEndian)
                ret = __ctfe ? simpleIndex(idx) :
                    cast(inout(T))(cast(U*) origin)[idx];
            else
                ret = simpleIndex(idx);
            return ret;
        }

        static if (isBitPacked!T) // lack of user-defined implicit conversion
        {
            // Accepts the BitPacked wrapper and forwards its raw value.
            void opIndexAssign(T val, size_t idx)
            {
                return opIndexAssign(cast(TypeOfBitPacked!T) val, idx);
            }
        }

        // Writes item `idx`, using the same path selection as opIndex.
        void opIndexAssign(TypeOfBitPacked!T val, size_t idx)
        {
            version (LittleEndian)
            {
                if (__ctfe)
                    simpleWrite(val, idx);
                else
                    (cast(U*) origin)[idx] = cast(U) val;
            }
            else
                simpleWrite(val, idx);
        }
    }
    else
    {
        // Slow path: items are narrower than a byte (or need unaligned access
        // the target is not known to support), so always shift and mask.
        T opIndex(size_t n) inout
        {
            return simpleIndex(n);
        }

        static if (isBitPacked!T) // lack of user-defined implicit conversion
        {
            // Accepts the BitPacked wrapper and forwards its raw value.
            void opIndexAssign(T val, size_t idx)
            {
                return opIndexAssign(cast(TypeOfBitPacked!T) val, idx);
            }
        }

        void opIndexAssign(TypeOfBitPacked!T val, size_t n)
        {
            return simpleWrite(val, n);
        }
    }

private:
    // factor - number of elements in one machine word
    // mask - `bits` low one bits (for a word-sized item the power wraps to 0,
    // giving all ones)
    enum factor = size_t.sizeof*8/bits, mask = 2^^bits-1;
    enum bytesPerWord =  size_t.sizeof;
    size_t* origin;
}
1276 
// data is packed only by power of two sized packs per word,
// thus avoiding mul/div overhead at the cost of ultimate packing
// this construct doesn't own memory, only provides access, see MultiArray for usage
//
// A view covers `limit` items starting at item index `ofs` of the packed data
// behind `ptr`; all public indices are relative to the view.
struct PackedArrayViewImpl(T, size_t bits)
{
pure nothrow:

    // Creates a view of `items` items beginning `offset` items past `origin`.
    this(inout(size_t)* origin, size_t offset, size_t items) inout @safe
    {
        ptr = inout(PackedPtr!(T))(origin);
        ofs = offset;
        limit = items;
    }

    // Returns true if every item in the view-relative range [s, e) is zero.
    // Whole words in the middle of the range are tested in one comparison.
    bool zeros(size_t s, size_t e)
    in
    {
        assert(s <= e);
    }
    do
    {
        s += ofs;
        e += ofs;
        immutable pad_s = roundUp(s);
        // The range ends before the next word boundary: test it item by
        // item. (Comparing the rounded start, not `s`, is essential -
        // otherwise the head loop below would run past `e`.)
        if (pad_s >= e)
        {
            foreach (i; s .. e)
                if (ptr[i])
                    return false;
            return true;
        }
        immutable pad_e = roundDown(e);
        size_t i;
        // unaligned head
        for (i=s; i<pad_s; i++)
            if (ptr[i])
                return false;
        // all in between is x*factor elements
        for (size_t j=i/factor; i<pad_e; i+=factor, j++)
            if (ptr.origin[j])
                return false;
        // unaligned tail
        for (; i<e; i++)
            if (ptr[i])
                return false;
        return true;
    }

    // Reads item `idx` of the view.
    T opIndex(size_t idx) inout
    in
    {
        assert(idx < limit);
    }
    do
    {
        return ptr[ofs + idx];
    }

    static if (isBitPacked!T) // lack of user-defined implicit conversion
    {
        void opIndexAssign(T val, size_t idx)
        {
            return opIndexAssign(cast(TypeOfBitPacked!T) val, idx);
        }
    }

    // Writes item `idx` of the view.
    void opIndexAssign(TypeOfBitPacked!T val, size_t idx)
    in
    {
        assert(idx < limit);
    }
    do
    {
        ptr[ofs + idx] = val;
    }

    static if (isBitPacked!T) // lack of user-defined implicit conversions
    {
        void opSliceAssign(T val, size_t start, size_t end)
        {
            opSliceAssign(cast(TypeOfBitPacked!T) val, start, end);
        }
    }

    // Sets every item in the view-relative range [start, end) to `val`.
    // Whole words in the middle are written at once with a replicated pattern.
    void opSliceAssign(TypeOfBitPacked!T val, size_t start, size_t end)
    in
    {
        assert(start <= end);
        assert(end <= limit);
    }
    do
    {
        // account for ofsetted view
        start += ofs;
        end += ofs;
        // rounded to factor granularity
        immutable pad_start = roundUp(start);// rounded up
        if (pad_start >= end) //rounded up >= then end of slice
        {
            //nothing to gain, use per element assignment
            foreach (i; start .. end)
                ptr[i] = val;
            return;
        }
        immutable pad_end = roundDown(end); // rounded down
        size_t i;
        for (i=start; i<pad_start; i++)
            ptr[i] = val;
        // all in between is x*factor elements
        if (pad_start != pad_end)
        {
            immutable repval = replicateBits!(factor, bits)(val);
            for (size_t j=i/factor; i<pad_end; i+=factor, j++)
                ptr.origin[j] = repval;// so speed it up by factor
        }
        for (; i<end; i++)
            ptr[i] = val;
    }

    // Returns a sub-view covering the view-relative range [from, to).
    auto opSlice(size_t from, size_t to)inout
    in
    {
        assert(from <= to);
        // `limit` is an item count relative to this view, so the offset
        // must not be added when checking the upper bound
        assert(to <= limit);
    }
    do
    {
        return typeof(this)(ptr.origin, ofs + from, to - from);
    }

    // Returns a view over all items.
    auto opSlice(){ return opSlice(0, length); }

    // Item-wise comparison with another view. When both views start on a word
    // boundary and span whole words, the underlying words are compared
    // directly.
    bool opEquals(T)(auto ref T arr) const
    {
        if (limit != arr.limit)
           return false;
        size_t s1 = ofs, s2 = arr.ofs;
        size_t e1 = s1 + limit, e2 = s2 + limit;
        if (s1 % factor == 0 && s2 % factor == 0 && length % factor == 0)
        {
            return ptr.origin[s1/factor .. e1/factor]
                == arr.ptr.origin[s2/factor .. e2/factor];
        }
        for (size_t i=0;i<limit; i++)
            if (this[i] != arr[i])
                return false;
        return true;
    }

    // Number of items in the view.
    @property size_t length()const{ return limit; }

private:
    // round an item index up / down to a multiple of `factor`
    auto roundUp()(size_t val){ return (val+factor-1)/factor*factor; }
    auto roundDown()(size_t val){ return val/factor*factor; }
    // factor - number of elements in one machine word
    enum factor = size_t.sizeof*8/bits;
    PackedPtr!(T) ptr;
    size_t ofs, limit;
}
1434 
1435 
// Random-access range over the index window [from, to) of an indexable object
// referenced through a pointer. Element access is forwarded to `(*arr)[..]`;
// the range does not own the object.
private struct SliceOverIndexed(T)
{
    enum assignableIndex = is(typeof((){ T.init[0] = Item.init; }));
    enum assignableSlice = is(typeof((){ T.init[0 .. 0] = Item.init; }));

    // Element at position `idx` of the window.
    auto opIndex(size_t idx)const
    in (idx < to - from)
    {
        return (*arr)[from+idx];
    }

    // Stores `val` at position `idx` of the window (only if T allows it).
    static if (assignableIndex)
    void opIndexAssign(Item val, size_t idx)
    in (idx < to - from)
    {
       (*arr)[from+idx] = val;
    }

    // Sub-window [a, b), expressed relative to this window.
    auto opSlice(size_t a, size_t b)
    {
        return typeof(this)(from+a, from+b, arr);
    }

    // static if (assignableSlice)
    // Forwards a slice assignment to the underlying object.
    void opSliceAssign(V)(V val, size_t start, size_t end)
    {
        (*arr)[start+from .. end+from] = val;
    }

    // Copy of the whole window.
    auto opSlice()
    {
        return typeof(this)(from, to, arr);
    }

    @property size_t length()const
    {
        return to-from;
    }

    alias opDollar = length;

    @property bool empty()const
    {
        return from == to;
    }

    @property auto front()const
    {
        return (*arr)[from];
    }

    static if (assignableIndex)
    @property void front(Item val)
    {
        (*arr)[from] = val;
    }

    @property auto back()const
    {
        return (*arr)[to-1];
    }

    static if (assignableIndex)
    @property void back(Item val)
    {
        (*arr)[to-1] = val;
    }

    @property auto save() inout
    {
        return this;
    }

    void popFront()
    {
        ++from;
    }

    void popBack()
    {
        --to;
    }

    // Equal when the lengths match and all elements compare equal in order.
    bool opEquals(R)(auto ref R rhs) const
    {
        if (rhs.length != length)
            return false;
        foreach (i; 0 .. length)
        {
            if (this[i] != rhs[i])
                return false;
        }
        return true;
    }
private:
    alias Item = typeof(T.init[0]);
    size_t from, to;
    T* arr;
}
1513 
@safe pure nothrow @nogc unittest
{
    // compile-time check only: the wrapper over a plain array has to
    // satisfy the random-access range interface
    alias IntSlice = SliceOverIndexed!(int[]);
    static assert(isRandomAccessRange!IntSlice);
}
1518 
// Creates a read-only SliceOverIndexed for the index window [a, b) of `*x`.
SliceOverIndexed!(const(T)) sliceOverIndexed(T)(size_t a, size_t b, const(T)* x)
if (is(Unqual!T == T))
{
    alias View = typeof(return);
    return View(a, b, x);
}

// Mutable counterpart of the overload above.
// BUG? inout is out of reach
//...SliceOverIndexed.arr only parameters or stack based variables can be inout
SliceOverIndexed!T sliceOverIndexed(T)(size_t a, size_t b, T* x)
if (is(Unqual!T == T))
{
    alias View = typeof(return);
    return View(a, b, x);
}
1532 
@system unittest
{
    int[] idxArray = [2, 3, 5, 8, 13];
    auto sliced = sliceOverIndexed(0, idxArray.length, &idxArray);

    // front/back access and assignment, shrinking from both ends
    assert(!sliced.empty);
    assert(sliced.front == 2);
    sliced.front = 1;
    assert(sliced.front == 1);
    assert(sliced.back == 13);
    sliced.popFront();
    assert(sliced.front == 3);
    assert(sliced.back == 13);
    sliced.back = 11;
    assert(sliced.back == 11);
    sliced.popBack();

    // indexing and (re-)slicing are relative to the current window
    assert(sliced.front == 3);
    assert(sliced[$-1] == 8);
    sliced = sliced[];
    assert(sliced[0] == 3);
    assert(sliced.back == 8);
    sliced = sliced[1..$];
    assert(sliced.front == 5);
    sliced = sliced[0..$-1];
    assert(sliced[$-1] == 5);

    // comparison with a slice over a different array, slice assignment
    int[] other = [2, 5];
    assert(sliced[] == sliceOverIndexed(1, 2, &other));
    sliceOverIndexed(0, 2, &idxArray)[0 .. 2] = -1;
    assert(idxArray[0 .. 2] == [-1, -1]);

    // an empty window over a null array (previously this sliced idxArray
    // and left nullArr unused)
    uint[] nullArr = null;
    auto nullSlice = sliceOverIndexed(0, 0, &nullArr);
    assert(nullSlice.empty);
    assert(nullSlice.length == 0);
}
1568 
// Builds a PackedArrayView!T from `ptr`, passing 0 as the second constructor
// argument (presumably the start offset) and `items` as the third
// (presumably the element count); `inout` carries the pointer's qualifier over.
private inout(PackedArrayView!T) packedArrayView(T)(inout(size_t)* ptr, size_t items)
{
    alias View = typeof(return);
    return View(ptr, 0, items);
}
1573 
1574 
1575 //============================================================================
1576 // Partially unrolled binary search using Shar's method
1577 //============================================================================
1578 
/*
    Generates, as a string to be mixed in, the unrolled tail of a uniform
    binary search: a `switch` whose cases fall through from the current step
    down to step 1 and finish with the final single-element check.

    The generated code refers to the names `m` (current step), `idx`,
    `range`, `needle` and `pred`, which must be in scope at the mixin site.

    Params:
        size = exclusive upper limit for `m`; must be zero or a power of two
    Returns:
        D source code of the switch statement
*/
string genUnrolledSwitchSearch(size_t size) @safe pure nothrow
{
    import core.bitop : bsr;
    import std.array : replace;
    import std.conv : to;
    assert(isPow2OrZero(size));
    // bsr is undefined for 0, so a zero step is mapped to `case 0` explicitly
    string code = `
    import core.bitop : bsr;
    auto power = m == 0 ? 0 : bsr(m)+1;
    switch (power){`;
    // number of unrolled steps; guarded for the same reason as above
    immutable int steps = size ? bsr(size) : 0;
    foreach_reverse (val; 0 .. steps)
    {
        // `case val+1` performs the step of 2^^val and falls through
        // to the case of the next smaller step
        code ~= `
        case pow:
            if (pred(range[idx+m], needle))
                idx +=  m;
            goto case;
        `.replace("m", to!string(2^^val))
        .replace("pow", to!string(val + 1));
    }
    code ~= `
        case 0:
            if (pred(range[idx], needle))
                idx += 1;
            goto default;
        `;
    code ~= `
        default:
    }`;
    return code;
}
1613 
// True when at most one bit of `sz` is set, i.e. `sz` is zero or a power of two.
bool isPow2OrZero(size_t sz) @safe pure nothrow @nogc
{
    // See also: std.math.isPowerOf2()
    // clearing the lowest set bit leaves nothing iff at most one bit was set
    immutable withoutLowestBit = sz & (sz - 1);
    return withoutLowestBit == 0;
}
1619 
// Uniform binary search over a range whose length is a power of two:
// returns how many leading elements satisfy `pred(element, needle)`,
// assuming those elements form a prefix of `range`.
size_t uniformLowerBound(alias pred, Range, T)(Range range, T needle)
if (is(T : ElementType!Range))
{
    assert(isPow2OrZero(range.length));
    size_t idx = 0;
    // halve the step every round, advancing whenever the probe satisfies pred
    for (size_t m = range.length/2; m != 0; m /= 2)
    {
        if (pred(range[idx+m], needle))
            idx += m;
    }
    // the element at idx itself is still undecided
    return pred(range[idx], needle) ? idx + 1 : idx;
}
1635 
// Same contract as uniformLowerBound, but once the step drops below
// `unrollLimit` the remaining steps are executed by a generated switch.
size_t switchUniformLowerBound(alias pred, Range, T)(Range range, T needle)
if (is(T : ElementType!Range))
{
    assert(isPow2OrZero(range.length));
    enum unrollLimit = 1 << 10;
    // NOTE: the mixed-in code uses `idx`, `m`, `range`, `needle` and `pred`
    // by name, so these identifiers must not be renamed
    size_t idx = 0;
    size_t m = range.length/2;
    for (; m >= unrollLimit; m /= 2)
    {
        if (pred(range[idx+m], needle))
            idx += m;
    }
    mixin(genUnrolledSwitchSearch(unrollLimit));
    return idx;
}
1651 
/*
    Extends a lower-bound search that only works on power-of-two lengths
    (`uniLowerBound`) to ranges of arbitrary length, using Shar's method:
    search either the leading 2^^n elements or a power-of-two sized window
    that ends at the end of the range.

    Params:
        _pred = binary predicate, `"a<b"` by default
        range = random-access range, every element satisfying
                `pred(element, needle)` precedes every element that does not
        needle = value to search for
    Returns:
        the number of elements of `range` that satisfy `pred(element, needle)`
*/
template sharMethod(alias uniLowerBound)
{
    size_t sharMethod(alias _pred="a<b", Range, T)(Range range, T needle)
        if (is(T : ElementType!Range))
    {
        import std.functional : binaryFun;
        import std.math.algebraic : nextPow2, truncPow2;
        alias pred = binaryFun!_pred;
        size_t len = range.length;
        if (len == 0)
            return 0;
        // n is the largest power of two that does not exceed len
        size_t n = truncPow2(len);
        if (n == len)
            return uniLowerBound!pred(range, needle);
        if (!pred(range[n-1], needle))
            return uniLowerBound!pred(range[0 .. n], needle);
        // The answer lies in the tail range[n .. $]: search a 2^^k window that
        // ends at $ and fully covers the tail. nextPow2 is strictly greater
        // than its argument, so for len == 2*n - 1 it yields 2*n > len and
        // `$-k` would underflow; a window of n elements always fits and
        // always covers the tail (len - n < n).
        size_t k = nextPow2(len - n + 1);
        if (k > n)
            k = n;
        return len - k + uniLowerBound!pred(range[$-k..$], needle);
    }
}
1674 
1675 alias sharLowerBound = sharMethod!uniformLowerBound;
1676 alias sharSwitchLowerBound = sharMethod!switchUniformLowerBound;
1677 
@safe unittest
{
    import std.array : array;
    import std.range : assumeSorted, iota;

    // reference result taken from the generic sorted-range search
    auto stdLowerBound(T)(T[] range, T needle)
    {
        return assumeSorted(range).lowerBound(needle).length;
    }

    // both Shar variants must agree with the reference
    void check(int[] haystack, int needle)
    {
        immutable expected = stdLowerBound(haystack, needle);
        assert(expected == sharLowerBound(haystack, needle));
        assert(expected == sharSwitchLowerBound(haystack, needle));
    }

    immutable MAX = 5*1173;
    // multiples of 5 below MAX; probe every value in and around that span
    auto arr = array(iota(5, MAX, 5));
    assert(arr.length == MAX/5-1);
    foreach (needle; 0 .. MAX+5)
        check(arr, needle);
    // empty haystack
    arr = [];
    check(arr, 33);
}
1701 //============================================================================
1702 
1703 @safe
1704 {
// hope to see similar stuff in public interface... once Allocators are out
1706 //@@@BUG moveFront and friends? dunno, for now it's POD-only
1707 
// Replaces dest[from .. to] with the elements of `stuff`, growing or shrinking
// `dest` as needed (through `dest.length` when Policy is void, through
// Policy.realloc otherwise). Returns the index just past the inserted elements.
@trusted size_t genericReplace(Policy=void, T, Range)
    (ref T dest, size_t from, size_t to, Range stuff)
{
    import std.algorithm.mutation : copy;
    immutable size_t replaced = to - from;
    immutable size_t inserted = stuff.length;
    immutable size_t stuff_end = from + inserted;
    if (inserted > replaced)
    {// replace increases length
        immutable size_t growth = inserted - replaced;
        static if (is(Policy == void))
            dest.length = dest.length + growth;//@@@BUG lame @property
        else
            dest = Policy.realloc(dest, dest.length + growth);
        // shift the old tail to the right first, then drop the new elements in
        copyBackwards(dest[to .. dest.length - growth],
            dest[to + growth .. dest.length]);
        copyForward(stuff, dest[from .. stuff_end]);
    }
    else if (inserted < replaced)
    {// replace decreases length
        immutable size_t shrink = replaced - inserted;
        copy(stuff, dest[from .. stuff_end]);
        // pull the old tail to the left, then cut the leftover off
        copyForward(dest[to .. dest.length],
            dest[stuff_end .. dest.length - shrink]);
        static if (is(Policy == void))
            dest.length = dest.length - shrink;//@@@BUG lame @property
        else
            dest = Policy.realloc(dest, dest.length - shrink);
    }
    else
    {// same length: overwrite in place
        copy(stuff, dest[from .. to]);
    }
    return stuff_end;
}
1742 
1743 
// Simple storage manipulation policy: arrays are ordinary D arrays that are
// allocated and resized with the built-in array operations.
@safe private struct GcPolicy
{
    import std.traits : isDynamicArray;

    // Returns a mutable copy of `arr`.
    static T[] dup(T)(const T[] arr)
    {
        return arr.dup;
    }

    // Returns a new array of `size` elements.
    static T[] alloc(T)(size_t size)
    {
        return new T[size];
    }

    // Returns `arr` resized to `sz` elements.
    static T[] realloc(T)(T[] arr, size_t sz)
    {
        arr.length = sz;
        return arr;
    }

    // Replaces dest[from .. to] with the elements of `stuff`.
    static void replaceImpl(T, Range)(ref T[] dest, size_t from, size_t to, Range stuff)
    {
        replaceInPlace(dest, from, to, stuff);
    }

    // Appends a single value.
    static void append(T, V)(ref T[] arr, V value)
        if (!isInputRange!V)
    {
        arr ~= force!T(value);
    }

    // Appends all elements of a range.
    static void append(T, V)(ref T[] arr, V value)
        if (isInputRange!V)
    {
        insertInPlace(arr, arr.length, value);
    }

    // Drops the reference to the array. In debug builds a mutable array is
    // overwritten with a marker value first.
    static void destroy(T)(ref T arr) pure // pure required for -dip25, inferred for -dip1000
        if (isDynamicArray!T)
    {
        static if (is(Unqual!T == T))
        {
            debug
            {
                assert(accessIsSafe);
                arr[] = cast(typeof(T.init[0]))(0xdead_beef);
            }
        }
        arr = null;
    }

    // This is unfortunately necessary to "fake pure". It will only ever be called
    // in the destructor for a GC-allocated CowArray, which is the only place where
    // this might return false. Current code expects this to be pure, so we can't
    // break that. But before this change, the code would access the referenced
    // array inside a GC finalizer, which is invalid.
    pragma(mangle, "gc_inFinalizer") private static extern(C) bool pureInGCFinalizer() @safe pure nothrow;

    // False only while running inside a GC finalizer (never during CTFE).
    static @property bool accessIsSafe() @safe nothrow pure
    {
        if (__ctfe)
            return true;
        return !pureInGCFinalizer();
    }
}
1811 
// Storage manipulation policy backed by the C heap: arrays are obtained with
// enforceMalloc/enforceRealloc and have to be released with `destroy`.
// An empty array is always represented by `null`.
@safe struct ReallocPolicy
{
    import std.range.primitives : hasLength;

    // Returns a newly allocated copy of `arr`.
    static T[] dup(T)(const T[] arr)
    {
        auto result = alloc!T(arr.length);
        result[] = arr[];
        return result;
    }

    // Allocates an uninitialized array of `size` elements; aborts if the
    // byte size overflows.
    static T[] alloc(T)(size_t size) @trusted
    {
        import std.internal.memory : enforceMalloc;

        // Same convention as realloc(): no allocation for an empty array.
        // malloc(0) is allowed to return null, which enforceMalloc would
        // presumably report as an allocation failure.
        if (!size)
            return null;

        import core.checkedint : mulu;
        bool overflow;
        size_t nbytes = mulu(size, T.sizeof, overflow);
        if (overflow) assert(0);

        auto ptr = cast(T*) enforceMalloc(nbytes);
        return ptr[0 .. size];
    }

    // Resizes `arr` to `size` elements; a size of 0 frees the array and
    // returns null. Aborts if the byte size overflows.
    static T[] realloc(T)(return scope T[] arr, size_t size) @trusted
    {
        import std.internal.memory : enforceRealloc;
        if (!size)
        {
            destroy(arr);
            return null;
        }

        import core.checkedint : mulu;
        bool overflow;
        size_t nbytes = mulu(size, T.sizeof, overflow);
        if (overflow) assert(0);

        auto ptr = cast(T*) enforceRealloc(arr.ptr, nbytes);
        return ptr[0 .. size];
    }

    // Replaces dest[from .. to] with the elements of `stuff`.
    static void replaceImpl(T, Range)(ref T[] dest, size_t from, size_t to, Range stuff)
    {
        genericReplace!(ReallocPolicy)(dest, from, to, stuff);
    }

    // Appends a single value.
    static void append(T, V)(ref T[] arr, V value)
        if (!isInputRange!V)
    {
        if (arr.length == size_t.max) assert(0);
        arr = realloc(arr, arr.length+1);
        arr[$-1] = force!T(value);
    }

    pure @safe unittest
    {
        int[] arr;
        ReallocPolicy.append(arr, 3);

        import std.algorithm.comparison : equal;
        assert(equal(arr, [3]));
    }

    // Appends all elements of a range with known length.
    static void append(T, V)(ref T[] arr, V value)
        if (isInputRange!V && hasLength!V)
    {
        import core.checkedint : addu;
        bool overflow;
        size_t nelems = addu(arr.length, value.length, overflow);
        if (overflow) assert(0);

        arr = realloc(arr, nelems);

        import std.algorithm.mutation : copy;
        copy(value, arr[$-value.length..$]);
    }

    pure @safe unittest
    {
        int[] arr;
        ReallocPolicy.append(arr, [1,2,3]);

        import std.algorithm.comparison : equal;
        assert(equal(arr, [1,2,3]));
    }

    // empty arrays never touch the allocator
    pure @safe unittest
    {
        assert(ReallocPolicy.alloc!int(0) is null);
        assert(ReallocPolicy.dup!int(null) is null);
    }

    // Frees the memory of `arr` (if any) and resets it to null.
    static void destroy(T)(scope ref T[] arr) @trusted
    {
        import core.memory : pureFree;
        if (arr.ptr)
            pureFree(arr.ptr);
        arr = null;
    }

    enum accessIsSafe = true;
}
1910 
1911 //build hack
1912 alias _RealArray = CowArray!ReallocPolicy;
1913 
pure @safe unittest
{
    import std.algorithm.comparison : equal;

    with(ReallocPolicy)
    {
        // Replaces orig[from .. to] with toReplace, compares against `result`
        // and releases the array afterwards in either case.
        bool test(T, U, V)(T orig, size_t from, size_t to, U toReplace, V result,
                   string file = __FILE__, size_t line = __LINE__)
        {
            replaceImpl(orig, from, to, toReplace);
            scope(exit) destroy(orig);
            return equal(orig, result);
        }
        // copy of the arguments owned by ReallocPolicy
        static T[] arr(T)(T[] args... )
        {
            return dup(args);
        }

        // pure insertion at the front
        assert(test(arr([1, 2, 3, 4]), 0, 0, [5, 6, 7], [5, 6, 7, 1, 2, 3, 4]));
        // pure removal
        assert(test(arr([1, 2, 3, 4]), 0, 2, cast(int[])[], [3, 4]));
        // shrinking, growing at the front and growing in the middle
        assert(test(arr([1, 2, 3, 4]), 0, 4, [5, 6, 7], [5, 6, 7]));
        assert(test(arr([1, 2, 3, 4]), 0, 2, [5, 6, 7], [5, 6, 7, 3, 4]));
        assert(test(arr([1, 2, 3, 4]), 2, 3, [5, 6, 7], [1, 2, 5, 6, 7, 4]));
    }
}
1943 
/**
    Tests if `T` is some kind of set of code points, that is an instance of
    $(LREF InversionList). Intended for template constraints.
*/
public template isCodepointSet(T)
{
    enum isCodepointSet = is(T == InversionList!(Args), Args...);
}
1954 
/**
    Tests if `T` is a pair of integers that implicitly convert to `V`.
    The following code must compile for any pair `T`:
    ---
    (T x){ V a = x[0]; V b = x[1];}
    ---
    The following must not compile:
     ---
    (T x){ V c = x[2];}
    ---
*/
public template isIntegralPair(T, V=uint)
{
    // both components are readable as V ...
    enum hasBothComponents = is(typeof((T x){ V a = x[0]; V b = x[1];}));
    // ... and there is no third one
    enum hasThirdComponent = is(typeof((T x){ V c = x[2]; }));
    enum isIntegralPair = hasBothComponents && !hasThirdComponent;
}
1971 
1972 
1973 /**
1974     The recommended default type for set of $(CODEPOINTS).
1975     For details, see the current implementation: $(LREF InversionList).
1976 */
1977 public alias CodepointSet = InversionList!GcPolicy;
1978 
1979 
1980 //@@@BUG: std.typecons tuples depend on std.format to produce fields mixin
1981 // which relies on std.uni.isGraphical and this chain blows up with Forward reference error
1982 // hence below doesn't seem to work
1983 // public alias CodepointInterval = Tuple!(uint, "a", uint, "b");
1984 
/**
    The recommended type of $(REF Tuple, std,_typecons)
    to represent [a, b$(RPAREN) intervals of $(CODEPOINTS). As used in $(LREF InversionList).
    Any interval type should pass $(LREF isIntegralPair) trait.
*/
public struct CodepointInterval
{
    // storage: [0] is the lower bound `a`, [1] is the upper bound `b`
    uint[2] _tuple;
    alias _tuple this;

@safe pure nothrow @nogc:

    // Constructs the interval from its two bounds.
    this(uint low, uint high)
    {
        _tuple[0] = low;
        _tuple[1] = high;
    }

    // Equal to anything indexable whose first two elements match the bounds.
    bool opEquals(T)(T val) const
    {
        if (_tuple[0] != val[0])
            return false;
        return _tuple[1] == val[1];
    }

    // Named access to the bounds.
    @property ref inout(uint) a() return inout { return _tuple[0]; }
    @property ref inout(uint) b() return inout { return _tuple[1]; }
}
2010 
2011 /**
2012     $(P
2013     `InversionList` is a set of $(CODEPOINTS)
2014     represented as an array of open-right [a, b$(RPAREN)
2015     intervals (see $(LREF CodepointInterval) above).
2016     The name comes from the way the representation reads left to right.
2017     For instance a set of all values [10, 50$(RPAREN), [80, 90$(RPAREN),
2018     plus a singular value 60 looks like this:
2019     )
2020     ---
2021     10, 50, 60, 61, 80, 90
2022     ---
2023     $(P
    The way to read this is: start with negative meaning that all numbers
    smaller than the next one are not present in this set (and positive -
2026     the contrary). Then switch positive/negative after each
2027     number passed from left to right.
2028     )
2029     $(P This way negative spans until 10, then positive until 50,
2030     then negative until 60, then positive until 61, and so on.
2031     As seen this provides a space-efficient storage of highly redundant data
2032     that comes in long runs. A description which Unicode $(CHARACTER)
2033     properties fit nicely. The technique itself could be seen as a variation
2034     on $(LINK2 https://en.wikipedia.org/wiki/Run-length_encoding, RLE encoding).
2035     )
2036 
2037     $(P Sets are value types (just like `int` is) thus they
2038         are never aliased.
2039     )
2040         Example:
2041         ---
2042         auto a = CodepointSet('a', 'z'+1);
2043         auto b = CodepointSet('A', 'Z'+1);
2044         auto c = a;
2045         a = a | b;
2046         assert(a == CodepointSet('A', 'Z'+1, 'a', 'z'+1));
2047         assert(a != c);
2048         ---
2049     $(P See also $(LREF unicode) for simpler construction of sets
2050         from predefined ones.
2051     )
2052 
2053     $(P Memory usage is 8 bytes per each contiguous interval in a set.
2054     The value semantics are achieved by using the
2055     $(HTTP en.wikipedia.org/wiki/Copy-on-write, COW) technique
2056     and thus it's $(RED not) safe to cast this type to $(D_KEYWORD shared).
2057     )
2058 
2059     Note:
2060     $(P It's not recommended to rely on the template parameters
2061     or the exact type of a current $(CODEPOINT) set in `std.uni`.
2062     The type and parameters may change when the standard
2063     allocators design is finalized.
2064     Use $(LREF isCodepointSet) with templates or just stick with the default
2065     alias $(LREF CodepointSet) throughout the whole code base.
2066     )
2067 */
2068 public struct InversionList(SP=GcPolicy)
2069 {
2070     import std.range : assumeSorted;
2071 
2072     /**
2073         Construct from another code point set of any type.
2074     */
2075     this(Set)(Set set) pure
2076         if (isCodepointSet!Set)
2077     {
2078         uint[] arr;
2079         foreach (v; set.byInterval)
2080         {
2081             arr ~= v.a;
2082             arr ~= v.b;
2083         }
2084         data = CowArray!(SP).reuse(arr);
2085     }
2086 
2087     /**
2088         Construct a set from a forward range of code point intervals.
2089     */
2090     this(Range)(Range intervals) pure
2091         if (isForwardRange!Range && isIntegralPair!(ElementType!Range))
2092     {
2093         uint[] arr;
2094         foreach (v; intervals)
2095         {
2096             SP.append(arr, v.a);
2097             SP.append(arr, v.b);
2098         }
2099         data = CowArray!(SP).reuse(arr);
2100         sanitize(); //enforce invariant: sort intervals etc.
2101     }
2102 
2103     //helper function that avoids sanity check to be CTFE-friendly
2104     private static fromIntervals(Range)(Range intervals) pure
2105     {
2106         import std.algorithm.iteration : map;
2107         import std.range : roundRobin;
2108         auto flattened = roundRobin(intervals.save.map!"a[0]"(),
2109             intervals.save.map!"a[1]"());
2110         InversionList set;
2111         set.data = CowArray!(SP)(flattened);
2112         return set;
2113     }
2114     //ditto untill sort is CTFE-able
2115     private static fromIntervals()(uint[] intervals...) pure
2116     in
2117     {
2118         import std.conv : text;
2119         assert(intervals.length % 2 == 0, "Odd number of interval bounds [a, b)!");
2120         for (uint i = 0; i < intervals.length; i += 2)
2121         {
2122             auto a = intervals[i], b = intervals[i+1];
2123             assert(a < b, text("illegal interval [a, b): ", a, " > ", b));
2124         }
2125     }
2126     do
2127     {
2128         InversionList set;
2129         set.data = CowArray!(SP)(intervals);
2130         return set;
2131     }
2132 
2133     /**
2134         Construct a set from plain values of code point intervals.
2135     */
2136     this()(uint[] intervals...)
2137     in
2138     {
2139         import std.conv : text;
2140         assert(intervals.length % 2 == 0, "Odd number of interval bounds [a, b)!");
2141         for (uint i = 0; i < intervals.length; i += 2)
2142         {
2143             auto a = intervals[i], b = intervals[i+1];
2144             assert(a < b, text("illegal interval [a, b): ", a, " > ", b));
2145         }
2146     }
2147     do
2148     {
2149         data = CowArray!(SP)(intervals);
2150         sanitize(); //enforce invariant: sort intervals etc.
2151     }
2152 
2153     ///
2154     pure @safe unittest
2155     {
2156         import std.algorithm.comparison : equal;
2157 
2158         auto set = CodepointSet('a', 'z'+1, 'а', 'я'+1);
2159         foreach (v; 'a'..'z'+1)
2160             assert(set[v]);
2161         // Cyrillic lowercase interval
2162         foreach (v; 'а'..'я'+1)
2163             assert(set[v]);
2164         //specific order is not required, intervals may interesect
2165         auto set2 = CodepointSet('а', 'я'+1, 'a', 'd', 'b', 'z'+1);
2166         //the same end result
2167         assert(set2.byInterval.equal(set.byInterval));
2168         // test constructor this(Range)(Range intervals)
2169         auto chessPiecesWhite = CodepointInterval(9812, 9818);
2170         auto chessPiecesBlack = CodepointInterval(9818, 9824);
2171         auto set3 = CodepointSet([chessPiecesWhite, chessPiecesBlack]);
2172         foreach (v; '♔'..'♟'+1)
2173             assert(set3[v]);
2174     }
2175 
2176     /**
2177         Get range that spans all of the $(CODEPOINT) intervals in this $(LREF InversionList).
2178     */
2179     @property auto byInterval() scope
2180     {
2181         // TODO: change this to data[] once the -dip1000 errors have been fixed
2182         // see e.g. https://github.com/dlang/phobos/pull/6638
2183         import std.array : array;
2184         return Intervals!(typeof(data.array))(data.array);
2185     }
2186 
2187     @safe unittest
2188     {
2189         import std.algorithm.comparison : equal;
2190         import std.typecons : tuple;
2191 
2192         auto set = CodepointSet('A', 'D'+1, 'a', 'd'+1);
2193 
2194         assert(set.byInterval.equal([tuple('A','E'), tuple('a','e')]));
2195     }
2196 
2197     package(std) @property const(CodepointInterval)[] intervals() const
2198     {
2199         import std.array : array;
2200         return Intervals!(typeof(data[]))(data[]).array;
2201     }
2202 
2203     /**
2204         Tests the presence of code point `val` in this set.
2205     */
2206     bool opIndex(uint val) const
2207     {
2208         // the <= ensures that searching in  interval of [a, b) for 'a' you get .length == 1
2209         // return assumeSorted!((a,b) => a <= b)(data[]).lowerBound(val).length & 1;
2210         return sharSwitchLowerBound!"a <= b"(data[], val) & 1;
2211     }
2212 
2213     ///
2214     pure @safe unittest
2215     {
2216         auto gothic = unicode.Gothic;
2217         // Gothic letter ahsa
2218         assert(gothic['\U00010330']);
2219         // no ascii in Gothic obviously
2220         assert(!gothic['$']);
2221     }
2222 
2223 
2224     // Linear scan for `ch`. Useful only for small sets.
2225     // TODO:
2226     // used internally in std.regex
2227     // should be properly exposed in a public API ?
2228     package(std) auto scanFor()(dchar ch) const
2229     {
2230         immutable len = data.length;
2231         for (size_t i = 0; i < len; i++)
2232             if (ch < data[i])
2233                 return i & 1;
2234         return 0;
2235     }
2236 
2237     /// Number of $(CODEPOINTS) in this set
2238     @property size_t length()
2239     {
2240         size_t sum = 0;
2241         foreach (iv; byInterval)
2242         {
2243             sum += iv.b - iv.a;
2244         }
2245         return sum;
2246     }
2247 
2248 // bootstrap full set operations from 4 primitives (suitable as a template mixin):
2249 // addInterval, skipUpTo, dropUpTo & byInterval iteration
2250 //============================================================================
2251 public:
2252     /**
2253         $(P Sets support natural syntax for set algebra, namely: )
2254         $(BOOKTABLE ,
2255             $(TR $(TH Operator) $(TH Math notation) $(TH Description) )
2256             $(TR $(TD &) $(TD a ∩ b) $(TD intersection) )
2257             $(TR $(TD |) $(TD a ∪ b) $(TD union) )
2258             $(TR $(TD -) $(TD a ∖ b) $(TD subtraction) )
2259             $(TR $(TD ~) $(TD a ~ b) $(TD symmetric set difference i.e. (a ∪ b) \ (a ∩ b)) )
2260         )
2261     */
2262     This opBinary(string op, U)(U rhs)
2263         if (isCodepointSet!U || is(U:dchar))
2264     {
2265         static if (op == "&" || op == "|" || op == "~")
2266         {// symmetric ops thus can swap arguments to reuse r-value
2267             static if (is(U:dchar))
2268             {
2269                 auto tmp = this;
2270                 mixin("tmp "~op~"= rhs; ");
2271                 return tmp;
2272             }
2273             else
2274             {
2275                 static if (is(Unqual!U == U))
2276                 {
2277                     // try hard to reuse r-value
2278                     mixin("rhs "~op~"= this;");
2279                     return rhs;
2280                 }
2281                 else
2282                 {
2283                     auto tmp = this;
2284                     mixin("tmp "~op~"= rhs;");
2285                     return tmp;
2286                 }
2287             }
2288         }
2289         else static if (op == "-") // anti-symmetric
2290         {
2291             auto tmp = this;
2292             tmp -= rhs;
2293             return tmp;
2294         }
2295         else
2296             static assert(0, "no operator "~op~" defined for Set");
2297     }
2298 
2299     ///
2300     pure @safe unittest
2301     {
2302         import std.algorithm.comparison : equal;
2303         import std.range : iota;
2304 
2305         auto lower = unicode.LowerCase;
2306         auto upper = unicode.UpperCase;
2307         auto ascii = unicode.ASCII;
2308 
2309         assert((lower & upper).empty); // no intersection
2310         auto lowerASCII = lower & ascii;
2311         assert(lowerASCII.byCodepoint.equal(iota('a', 'z'+1)));
2312         // throw away all of the lowercase ASCII
2313         assert((ascii - lower).length == 128 - 26);
2314 
2315         auto onlyOneOf = lower ~ ascii;
2316         assert(!onlyOneOf['Δ']); // not ASCII and not lowercase
2317         assert(onlyOneOf['$']); // ASCII and not lowercase
2318         assert(!onlyOneOf['a']); // ASCII and lowercase
2319         assert(onlyOneOf['я']); // not ASCII but lowercase
2320 
2321         // throw away all cased letters from ASCII
2322         auto noLetters = ascii - (lower | upper);
2323         assert(noLetters.length == 128 - 26*2);
2324     }
2325 
2326     /// The 'op=' versions of the above overloaded operators.
2327     ref This opOpAssign(string op, U)(U rhs)
2328         if (isCodepointSet!U || is(U:dchar))
2329     {
2330         static if (op == "|")    // union
2331         {
2332             static if (is(U:dchar))
2333             {
2334                 this.addInterval(rhs, rhs+1);
2335                 return this;
2336             }
2337             else
2338                 return this.add(rhs);
2339         }
2340         else static if (op == "&")   // intersection
2341                 return this.intersect(rhs);// overloaded
2342         else static if (op == "-")   // set difference
2343                 return this.sub(rhs);// overloaded
2344         else static if (op == "~")   // symmetric set difference
2345         {
2346             auto copy = this & rhs;
2347             this |= rhs;
2348             this -= copy;
2349             return this;
2350         }
2351         else
2352             static assert(0, "no operator "~op~" defined for Set");
2353     }
2354 
2355     /**
2356         Tests the presence of codepoint `ch` in this set,
2357         the same as $(LREF opIndex).
2358     */
2359     bool opBinaryRight(string op: "in", U)(U ch) const
2360         if (is(U : dchar))
2361     {
2362         return this[ch];
2363     }
2364 
2365     ///
2366     pure @safe unittest
2367     {
2368         assert('я' in unicode.Cyrillic);
2369         assert(!('z' in unicode.Cyrillic));
2370     }
2371 
2372 
2373 
2374     /**
2375      * Obtains a set that is the inversion of this set.
2376      *
2377      * See_Also: $(LREF inverted)
2378      */
2379     auto opUnary(string op: "!")()
2380     {
2381         return this.inverted;
2382     }
2383 
2384     /**
2385         A range that spans each $(CODEPOINT) in this set.
2386     */
2387     @property auto byCodepoint()
2388     {
2389         static struct CodepointRange
2390         {
2391             this(This set)
2392             {
2393                 r = set.byInterval;
2394                 if (!r.empty)
2395                     cur = r.front.a;
2396             }
2397 
2398             @property dchar front() const
2399             {
2400                 return cast(dchar) cur;
2401             }
2402 
2403             @property bool empty() const
2404             {
2405                 return r.empty;
2406             }
2407 
2408             void popFront()
2409             {
2410                 cur++;
2411                 while (cur >= r.front.b)
2412                 {
2413                     r.popFront();
2414                     if (r.empty)
2415                         break;
2416                     cur = r.front.a;
2417                 }
2418             }
2419         private:
2420             uint cur;
2421             @(imported!"core.attribute".mutableRefInit) typeof(This.init.byInterval) r;
2422         }
2423 
2424         return CodepointRange(this);
2425     }
2426 
2427     ///
2428     pure @safe unittest
2429     {
2430         import std.algorithm.comparison : equal;
2431         import std.range : iota;
2432 
2433         auto set = unicode.ASCII;
2434         set.byCodepoint.equal(iota(0, 0x80));
2435     }
2436 
2437     /**
        $(P Obtain textual representation of this set in the form of
2439         open-right intervals and feed it to `sink`.
2440         )
2441         $(P Used by various standard formatting facilities such as
2442          $(REF formattedWrite, std,format), $(REF write, std,stdio),
2443          $(REF writef, std,stdio), $(REF to, std,conv) and others.
2444         )
2445         Example:
2446         ---
2447         import std.conv;
2448         assert(unicode.ASCII.to!string == "[0..128$(RPAREN)");
2449         ---
2450     */
2451 
2452     private import std.format.spec : FormatSpec;
2453 
2454     /***************************************
2455      * Obtain a textual representation of this InversionList
2456      * in form of open-right intervals.
2457      *
2458      * The formatting flag is applied individually to each value, for example:
2459      * $(LI $(B %s) and $(B %d) format the intervals as a [low .. high$(RPAREN) range of integrals)
2460      * $(LI $(B %x) formats the intervals as a [low .. high$(RPAREN) range of lowercase hex characters)
2461      * $(LI $(B %X) formats the intervals as a [low .. high$(RPAREN) range of uppercase hex characters)
2462      */
2463     void toString(Writer)(scope Writer sink, scope const ref FormatSpec!char fmt) /* const */
2464     {
2465         import std.format.write : formatValue;
2466         auto range = byInterval;
2467         if (range.empty)
2468             return;
2469 
2470         while (1)
2471         {
2472             auto i = range.front;
2473             range.popFront();
2474 
2475             put(sink, "[");
2476             formatValue(sink, i.a, fmt);
2477             put(sink, "..");
2478             formatValue(sink, i.b, fmt);
2479             put(sink, ")");
2480             if (range.empty) return;
2481             put(sink, " ");
2482         }
2483     }
2484 
2485     ///
2486     pure @safe unittest
2487     {
2488         import std.conv : to;
2489         import std.format : format;
2490         import std.uni : unicode;
2491 
2492         // This was originally using Cyrillic script.
2493         // Unfortunately this is a pretty active range for changes,
2494         // and hence broke in an update.
2495         // Therefore the range Basic latin was used instead as it
2496         // unlikely to ever change.
2497 
2498         assert(unicode.InBasic_latin.to!string == "[0..128)");
2499 
2500         // The specs '%s' and '%d' are equivalent to the to!string call above.
2501         assert(format("%d", unicode.InBasic_latin) == unicode.InBasic_latin.to!string);
2502 
2503         assert(format("%#x", unicode.InBasic_latin) == "[0..0x80)");
2504         assert(format("%#X", unicode.InBasic_latin) == "[0..0X80)");
2505     }
2506 
2507     pure @safe unittest
2508     {
2509         import std.exception : assertThrown;
2510         import std.format : format, FormatException;
2511         assertThrown!FormatException(format("%z", unicode.ASCII));
2512     }
2513 
2514 
2515     /**
2516         Add an interval [a, b$(RPAREN) to this set.
2517     */
2518     ref add()(uint a, uint b)
2519     {
2520         addInterval(a, b);
2521         return this;
2522     }
2523 
2524     ///
2525     pure @safe unittest
2526     {
2527         CodepointSet someSet;
2528         someSet.add('0', '5').add('A','Z'+1);
2529         someSet.add('5', '9'+1);
2530         assert(someSet['0']);
2531         assert(someSet['5']);
2532         assert(someSet['9']);
2533         assert(someSet['Z']);
2534     }
2535 
2536 private:
2537 
2538   package(std)  // used from: std.regex.internal.parser
2539     ref intersect(U)(U rhs)
2540         if (isCodepointSet!U)
2541     {
2542         Marker mark;
2543         foreach ( i; rhs.byInterval)
2544         {
2545             mark = this.dropUpTo(i.a, mark);
2546             mark = this.skipUpTo(i.b, mark);
2547         }
2548         this.dropUpTo(uint.max, mark);
2549         return this;
2550     }
2551 
2552     ref intersect()(dchar ch)
2553     {
2554         foreach (i; byInterval)
2555             if (i.a <= ch && ch < i.b)
2556                 return this = This.init.add(ch, ch+1);
2557         this = This.init;
2558         return this;
2559     }
2560 
2561     pure @safe unittest
2562     {
2563         assert(unicode.Cyrillic.intersect('-').byInterval.empty);
2564     }
2565 
2566     ref sub()(dchar ch)
2567     {
2568         return subChar(ch);
2569     }
2570 
2571     // same as the above except that skip & drop parts are swapped
2572   package(std)  // used from: std.regex.internal.parser
2573     ref sub(U)(U rhs)
2574         if (isCodepointSet!U)
2575     {
2576         Marker mark;
2577         foreach (i; rhs.byInterval)
2578         {
2579             mark = this.skipUpTo(i.a, mark);
2580             mark = this.dropUpTo(i.b, mark);
2581         }
2582         return this;
2583     }
2584 
2585   package(std)  // used from: std.regex.internal.parse
2586     ref add(U)(U rhs)
2587         if (isCodepointSet!U)
2588     {
2589         Marker start;
2590         foreach (i; rhs.byInterval)
2591         {
2592             start = addInterval(i.a, i.b, start);
2593         }
2594         return this;
2595     }
2596 
2597 // end of mixin-able part
2598 //============================================================================
2599 public:
2600     /**
2601         Obtains a set that is the inversion of this set.
2602 
2603         See the '!' $(LREF opUnary) for the same but using operators.
2604     */
2605     @property auto inverted()
2606     {
2607         InversionList inversion = this;
2608         if (inversion.data.length == 0)
2609         {
2610             inversion.addInterval(0, lastDchar+1);
2611             return inversion;
2612         }
2613         if (inversion.data[0] != 0)
2614             genericReplace(inversion.data, 0, 0, [0]);
2615         else
2616             genericReplace(inversion.data, 0, 1, cast(uint[]) null);
2617         if (data[data.length-1] != lastDchar+1)
2618             genericReplace(inversion.data,
2619                 inversion.data.length, inversion.data.length, [lastDchar+1]);
2620         else
2621             genericReplace(inversion.data,
2622                 inversion.data.length-1, inversion.data.length, cast(uint[]) null);
2623 
2624         return inversion;
2625     }
2626 
2627     ///
2628     pure @safe unittest
2629     {
2630         auto set = unicode.ASCII;
2631         // union with the inverse gets all of the code points in the Unicode
2632         assert((set | set.inverted).length == 0x110000);
2633         // no intersection with the inverse
2634         assert((set & set.inverted).empty);
2635     }
2636 
2637     package(std) static string toSourceCode(const(CodepointInterval)[] range, string funcName)
2638     {
2639         import std.algorithm.searching : countUntil;
2640         import std.format : format;
2641         enum maxBinary = 3;
2642         static string linearScope(R)(R ivals, string indent)
2643         {
2644             string result = indent~"{\n";
2645             string deeper = indent~"    ";
2646             foreach (ival; ivals)
2647             {
2648                 immutable span = ival[1] - ival[0];
2649                 assert(span != 0);
2650                 if (span == 1)
2651                 {
2652                     result ~= format("%sif (ch == %s) return true;\n", deeper, ival[0]);
2653                 }
2654                 else if (span == 2)
2655                 {
2656                     result ~= format("%sif (ch == %s || ch == %s) return true;\n",
2657                         deeper, ival[0], ival[0]+1);
2658                 }
2659                 else
2660                 {
2661                     if (ival[0] != 0) // dchar is unsigned and  < 0 is useless
2662                         result ~= format("%sif (ch < %s) return false;\n", deeper, ival[0]);
2663                     result ~= format("%sif (ch < %s) return true;\n", deeper, ival[1]);
2664                 }
2665             }
2666             result ~= format("%sreturn false;\n%s}\n", deeper, indent); // including empty range of intervals
2667             return result;
2668         }
2669 
2670         static string binaryScope(R)(R ivals, string indent) @safe
2671         {
2672             // time to do unrolled comparisons?
2673             if (ivals.length < maxBinary)
2674                 return linearScope(ivals, indent);
2675             else
2676                 return bisect(ivals, ivals.length/2, indent);
2677         }
2678 
2679         // not used yet if/elsebinary search is far better with DMD  as of 2.061
2680         // and GDC is doing fine job either way
2681         static string switchScope(R)(R ivals, string indent)
2682         {
2683             string result = indent~"switch (ch){\n";
2684             string deeper = indent~"    ";
2685             foreach (ival; ivals)
2686             {
2687                 if (ival[0]+1 == ival[1])
2688                 {
2689                     result ~= format("%scase %s: return true;\n",
2690                         deeper, ival[0]);
2691                 }
2692                 else
2693                 {
2694                     result ~= format("%scase %s: .. case %s: return true;\n",
2695                          deeper, ival[0], ival[1]-1);
2696                 }
2697             }
2698             result ~= deeper~"default: return false;\n"~indent~"}\n";
2699             return result;
2700         }
2701 
2702         static string bisect(R)(R range, size_t idx, string indent)
2703         {
2704             string deeper = indent ~ "    ";
2705             // bisect on one [a, b) interval at idx
2706             string result = indent~"{\n";
2707             // less branch, < a
2708             result ~= format("%sif (ch < %s)\n%s",
2709                 deeper, range[idx][0], binaryScope(range[0 .. idx], deeper));
2710             // middle point,  >= a && < b
2711             result ~= format("%selse if (ch < %s) return true;\n",
2712                 deeper, range[idx][1]);
2713             // greater or equal branch,  >= b
2714             result ~= format("%selse\n%s",
2715                 deeper, binaryScope(range[idx+1..$], deeper));
2716             return result~indent~"}\n";
2717         }
2718 
2719         string code = format("bool %s(dchar ch) @safe pure nothrow @nogc\n",
2720             funcName.empty ? "function" : funcName);
2721         // special case first bisection to be on ASCII vs beyond
2722         auto tillAscii = countUntil!"a[0] > 0x80"(range);
2723         if (tillAscii <= 0) // everything is ASCII or nothing is ascii (-1 & 0)
2724             code ~= binaryScope(range, "");
2725         else
2726             code ~= bisect(range, tillAscii, "");
2727         return code;
2728     }
2729 
2730     /**
2731         Generates string with D source code of unary function with name of
2732         `funcName` taking a single `dchar` argument. If `funcName` is empty
2733         the code is adjusted to be a lambda function.
2734 
2735         The function generated tests if the $(CODEPOINT) passed
2736         belongs to this set or not. The result is to be used with string mixin.
2737         The intended usage area is aggressive optimization via meta programming
2738         in parser generators and the like.
2739 
2740         Note: Use with care for relatively small or regular sets. It
2741         could end up being slower then just using multi-staged tables.
2742 
2743         Example:
2744         ---
2745         import std.stdio;
2746 
2747         // construct set directly from [a, b$RPAREN intervals
2748         auto set = CodepointSet(10, 12, 45, 65, 100, 200);
2749         writeln(set);
2750         writeln(set.toSourceCode("func"));
2751         ---
2752 
2753         The above outputs something along the lines of:
2754         ---
2755         bool func(dchar ch)  @safe pure nothrow @nogc
2756         {
2757             if (ch < 45)
2758             {
2759                 if (ch == 10 || ch == 11) return true;
2760                 return false;
2761             }
2762             else if (ch < 65) return true;
2763             else
2764             {
2765                 if (ch < 100) return false;
2766                 if (ch < 200) return true;
2767                 return false;
2768             }
2769         }
2770         ---
2771     */
2772     string toSourceCode(string funcName="")
2773     {
2774         import std.array : array;
2775         auto range = byInterval.array();
2776         return toSourceCode(range, funcName);
2777     }
2778 
2779     /**
2780         True if this set doesn't contain any $(CODEPOINTS).
2781     */
2782     @property bool empty() const
2783     {
2784         return data.length == 0;
2785     }
2786 
2787     ///
2788     pure @safe unittest
2789     {
2790         CodepointSet emptySet;
2791         assert(emptySet.length == 0);
2792         assert(emptySet.empty);
2793     }
2794 
2795 private:
2796     alias This = typeof(this);
2797     alias Marker = size_t;
2798 
2799     // a random-access range of integral pairs
2800     static struct Intervals(Range)
2801     {
2802         import std.range.primitives : hasAssignableElements;
2803 
2804         this(Range sp) scope
2805         {
2806             slice = sp;
2807             start = 0;
2808             end = sp.length;
2809         }
2810 
2811         this(Range sp, size_t s, size_t e) scope
2812         {
2813             slice = sp;
2814             start = s;
2815             end = e;
2816         }
2817 
2818         @property auto front()const
2819         {
2820             immutable a = slice[start];
2821             immutable b = slice[start+1];
2822             return CodepointInterval(a, b);
2823         }
2824 
2825         //may break sorted property - but we need std.sort to access it
2826         //hence package(std) protection attribute
2827         static if (hasAssignableElements!Range)
2828         package(std) @property void front(CodepointInterval val)
2829         {
2830             slice[start] = val.a;
2831             slice[start+1] = val.b;
2832         }
2833 
2834         @property auto back()const
2835         {
2836             immutable a = slice[end-2];
2837             immutable b = slice[end-1];
2838             return CodepointInterval(a, b);
2839         }
2840 
2841         //ditto about package
2842         static if (hasAssignableElements!Range)
2843         package(std) @property void back(CodepointInterval val)
2844         {
2845             slice[end-2] = val.a;
2846             slice[end-1] = val.b;
2847         }
2848 
2849         void popFront()
2850         {
2851             start += 2;
2852         }
2853 
2854         void popBack()
2855         {
2856             end -= 2;
2857         }
2858 
2859         auto opIndex(size_t idx) const
2860         {
2861             immutable a = slice[start+idx*2];
2862             immutable b = slice[start+idx*2+1];
2863             return CodepointInterval(a, b);
2864         }
2865 
2866         //ditto about package
2867         static if (hasAssignableElements!Range)
2868         package(std) void opIndexAssign(CodepointInterval val, size_t idx)
2869         {
2870             slice[start+idx*2] = val.a;
2871             slice[start+idx*2+1] = val.b;
2872         }
2873 
2874         auto opSlice(size_t s, size_t e)
2875         {
2876             return Intervals(slice, s*2+start, e*2+start);
2877         }
2878 
2879         @property size_t length()const {  return slice.length/2; }
2880 
2881         @property bool empty()const { return start == end; }
2882 
2883         @property auto save(){ return this; }
2884     private:
2885         size_t start, end;
2886         Range slice;
2887     }
2888 
2889     // called after construction from intervals
2890     // to make sure invariants hold
2891     void sanitize()
2892     {
2893         import std.algorithm.comparison : max;
2894         import std.algorithm.mutation : SwapStrategy;
2895         import std.algorithm.sorting : sort;
2896         if (data.length == 0)
2897             return;
2898         alias Ival = CodepointInterval;
2899         //intervals wrapper for a _range_ over packed array
2900         auto ivals = Intervals!(typeof(data[]))(data[]);
2901         //@@@BUG@@@ can't use "a.a < b.a" see
2902         // https://issues.dlang.org/show_bug.cgi?id=12265
2903         sort!((a,b) => a.a < b.a, SwapStrategy.stable)(ivals);
2904         // what follows is a variation on stable remove
2905         // differences:
2906         // - predicate is binary, and is tested against
2907         //   the last kept element (at 'i').
2908         // - predicate mutates lhs (merges rhs into lhs)
2909         size_t len = ivals.length;
2910         size_t i = 0;
2911         size_t j = 1;
2912         while (j < len)
2913         {
2914             if (ivals[i].b >= ivals[j].a)
2915             {
2916                 ivals[i] = Ival(ivals[i].a, max(ivals[i].b, ivals[j].b));
2917                 j++;
2918             }
2919             else //unmergable
2920             {
2921                 // check if there is a hole after merges
2922                 // (in the best case we do 0 writes to ivals)
2923                 if (j != i+1)
2924                     ivals[i+1] = ivals[j]; //copy over
2925                 i++;
2926                 j++;
2927             }
2928         }
2929         len = i + 1;
2930         for (size_t k=0; k + 1 < len; k++)
2931         {
2932             assert(ivals[k].a < ivals[k].b);
2933             assert(ivals[k].b < ivals[k+1].a);
2934         }
2935         data.length = len * 2;
2936     }
2937 
2938     // special case for normal InversionList
2939     ref subChar(dchar ch)
2940     {
2941         auto mark = skipUpTo(ch);
2942         if (mark != data.length
2943             && data[mark] == ch && data[mark-1] == ch)
2944         {
2945             // it has split, meaning that ch happens to be in one of intervals
2946             data[mark] = data[mark]+1;
2947         }
2948         return this;
2949     }
2950 
2951     //
2952     Marker addInterval(int a, int b, Marker hint=Marker.init) scope
2953     in
2954     {
2955         assert(a <= b);
2956     }
2957     do
2958     {
2959         import std.range : assumeSorted, SearchPolicy;
2960         auto range = assumeSorted(data[]);
2961         size_t pos;
2962         size_t a_idx = hint + range[hint..$].lowerBound!(SearchPolicy.gallop)(a).length;
2963         if (a_idx == range.length)
2964         {
2965             //  [---+++----++++----++++++]
2966             //  [                         a  b]
2967             data.append(a, b);
2968             return data.length-1;
2969         }
2970         size_t b_idx = range[a_idx .. range.length].lowerBound!(SearchPolicy.gallop)(b).length+a_idx;
2971         uint[3] buf = void;
2972         uint to_insert;
2973         debug(std_uni)
2974         {
2975             writefln("a_idx=%d; b_idx=%d;", a_idx, b_idx);
2976         }
2977         if (b_idx == range.length)
2978         {
2979             //  [-------++++++++----++++++-]
2980             //  [      s     a                 b]
2981             if (a_idx & 1)// a in positive
2982             {
2983                 buf[0] = b;
2984                 to_insert = 1;
2985             }
2986             else// a in negative
2987             {
2988                 buf[0] = a;
2989                 buf[1] = b;
2990                 to_insert = 2;
2991             }
2992             pos = genericReplace(data, a_idx, b_idx, buf[0 .. to_insert]);
2993             return pos - 1;
2994         }
2995 
2996         uint top = data[b_idx];
2997 
2998         debug(std_uni)
2999         {
3000             writefln("a_idx=%d; b_idx=%d;", a_idx, b_idx);
3001             writefln("a=%s; b=%s; top=%s;", a, b, top);
3002         }
3003         if (a_idx & 1)
3004         {// a in positive
3005             if (b_idx & 1)// b in positive
3006             {
3007                 //  [-------++++++++----++++++-]
3008                 //  [       s    a        b    ]
3009                 buf[0] = top;
3010                 to_insert = 1;
3011             }
3012             else // b in negative
3013             {
3014                 //  [-------++++++++----++++++-]
3015                 //  [       s    a   b         ]
3016                 if (top == b)
3017                 {
3018                     assert(b_idx+1 < data.length);
3019                     buf[0] = data[b_idx+1];
3020                     pos = genericReplace(data, a_idx, b_idx+2, buf[0 .. 1]);
3021                     return pos - 1;
3022                 }
3023                 buf[0] = b;
3024                 buf[1] = top;
3025                 to_insert = 2;
3026             }
3027         }
3028         else
3029         { // a in negative
3030             if (b_idx & 1) // b in positive
3031             {
3032                 //  [----------+++++----++++++-]
3033                 //  [     a     b              ]
3034                 buf[0] = a;
3035                 buf[1] = top;
3036                 to_insert = 2;
3037             }
3038             else// b in negative
3039             {
3040                 //  [----------+++++----++++++-]
3041                 //  [  a       s      b        ]
3042                 if (top == b)
3043                 {
3044                     assert(b_idx+1 < data.length);
3045                     buf[0] = a;
3046                     buf[1] = data[b_idx+1];
3047                     pos = genericReplace(data, a_idx, b_idx+2, buf[0 .. 2]);
3048                     return pos - 1;
3049                 }
3050                 buf[0] = a;
3051                 buf[1] = b;
3052                 buf[2] = top;
3053                 to_insert = 3;
3054             }
3055         }
3056         pos = genericReplace(data, a_idx, b_idx+1, buf[0 .. to_insert]);
3057         debug(std_uni)
3058         {
3059             writefln("marker idx: %d; length=%d", pos, data[pos], data.length);
3060             writeln("inserting ", buf[0 .. to_insert]);
3061         }
3062         return pos - 1;
3063     }
3064 
3065     //
3066     Marker dropUpTo(uint a, Marker pos=Marker.init)
3067     in
3068     {
3069         assert(pos % 2 == 0); // at start of interval
3070     }
3071     do
3072     {
3073         auto range = assumeSorted!"a <= b"(data[pos .. data.length]);
3074         if (range.empty)
3075             return pos;
3076         size_t idx = pos;
3077         idx += range.lowerBound(a).length;
3078 
3079         debug(std_uni)
3080         {
3081             writeln("dropUpTo full length=", data.length);
3082             writeln(pos,"~~~", idx);
3083         }
3084         if (idx == data.length)
3085             return genericReplace(data, pos, idx, cast(uint[])[]);
3086         if (idx & 1)
3087         {   // a in positive
3088             //[--+++----++++++----+++++++------...]
3089             //      |<---si       s  a  t
3090             genericReplace(data, pos, idx, [a]);
3091         }
3092         else
3093         {   // a in negative
3094             //[--+++----++++++----+++++++-------+++...]
3095             //      |<---si              s  a  t
3096             genericReplace(data, pos, idx, cast(uint[])[]);
3097         }
3098         return pos;
3099     }
3100 
3101     //
3102     Marker skipUpTo(uint a, Marker pos=Marker.init)
3103     out(result)
3104     {
3105         assert(result % 2 == 0);// always start of interval
3106         //(may be  0-width after-split)
3107     }
3108     do
3109     {
3110         assert(data.length % 2 == 0);
3111         auto range = assumeSorted!"a <= b"(data[pos .. data.length]);
3112         size_t idx = pos+range.lowerBound(a).length;
3113 
3114         if (idx >= data.length) // could have Marker point to recently removed stuff
3115             return data.length;
3116 
3117         if (idx & 1)// inside of interval, check for split
3118         {
3119 
3120             immutable top = data[idx];
3121             if (top == a)// no need to split, it's end
3122                 return idx+1;
3123             immutable start = data[idx-1];
3124             if (a == start)
3125                 return idx-1;
3126             // split it up
3127             genericReplace(data, idx, idx+1, [a, a, top]);
3128             return idx+1;        // avoid odd index
3129         }
3130         return idx;
3131     }
3132 
3133     CowArray!SP data;
3134 }
3135 
// formatting of a set through std.conv.to
pure @safe unittest
{
    import std.conv : to;
    auto ascii = unicode.ASCII;
    assert(ascii.to!string() == "[0..128)");
}
3141 
// Byte-wise read of the `idx`-th packed 24-bit value at `ptr`;
// pedantic version for ctfe, and aligned-access only architectures
@system private uint safeRead24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc
{
    immutable size_t base = idx * 3;
    immutable uint b0 = ptr[base];
    immutable uint b1 = ptr[base+1];
    immutable uint b2 = ptr[base+2];
    version (LittleEndian)
        return b0 | (b1 << 8) | (b2 << 16);
    else
        return (b0 << 16) | (b1 << 8) | b2;
}
3153 
// Byte-wise store of the low 24 bits of `val` as the `idx`-th packed value
// at `ptr`; counterpart of safeRead24.
@system private void safeWrite24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc
{
    immutable size_t base = idx * 3;
    immutable ubyte low = val & 0xFF;
    immutable ubyte mid = (val >> 8) & 0xFF;
    immutable ubyte high = (val >> 16) & 0xFF;
    version (LittleEndian)
    {
        ptr[base] = low;
        ptr[base+1] = mid;
        ptr[base+2] = high;
    }
    else
    {
        ptr[base] = high;
        ptr[base+1] = mid;
        ptr[base+2] = low;
    }
}
3171 
// Unaligned x86-like read of the `idx`-th packed 24-bit value: one 32-bit
// load at byte offset 3*idx, keeping only the three bytes of the value.
// Note that the load touches one byte beyond the value itself.
@system private uint unalignedRead24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc
{
    immutable uint word = *cast(const(uint)*)(ptr + 3*idx);
    version (LittleEndian)
        return word & 0xFF_FFFF;
    else
        return word >> 8;
}
3181 
// Unaligned x86-like store of the low 24 bits of `val` as the `idx`-th packed
// value at `ptr`: a 32-bit read-modify-write at byte offset 3*idx that
// preserves the one neighbouring byte the access overlaps.
// `val` is masked to 24 bits so that an over-wide value cannot clobber that
// neighbouring byte (safeWrite24 and the big-endian path truncate as well).
@system private void unalignedWrite24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc
{
    uint* dest = cast(uint*)(cast(ubyte*) ptr + 3*idx);
    version (LittleEndian)
        *dest = (val & 0xFF_FFFF) | (*dest & 0xFF00_0000);
    else
        *dest = (val << 8) | (*dest & 0xFF);
}
3191 
// Reads the `idx`-th packed 24-bit value at `ptr`: the unaligned fast path
// is taken at run time when `hasUnalignedReads` is set, the byte-wise
// version otherwise and always during CTFE.
@system private uint read24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc
{
    static if (hasUnalignedReads)
    {
        if (!__ctfe)
            return unalignedRead24(ptr, idx);
    }
    return safeRead24(ptr, idx);
}
3199 
// Stores `val` as the `idx`-th packed 24-bit value at `ptr`: the unaligned
// fast path is taken at run time when `hasUnalignedReads` is set, the
// byte-wise version otherwise and always during CTFE.
@system private void write24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc
{
    static if (hasUnalignedReads)
    {
        if (!__ctfe)
        {
            unalignedWrite24(ptr, val, idx);
            return;
        }
    }
    safeWrite24(ptr, val, idx);
}
3207 
3208 struct CowArray(SP=GcPolicy)
3209 {
3210     import std.range.primitives : hasLength;
3211 
3212   @safe:
3213     static auto reuse(uint[] arr)
3214     {
3215         CowArray cow;
3216         cow.data = arr;
3217         SP.append(cow.data, 1);
3218         assert(cow.refCount == 1);
3219         assert(cow.length == arr.length);
3220         return cow;
3221     }
3222 
3223     this(Range)(Range range)
3224         if (isInputRange!Range && hasLength!Range)
3225     {
3226         import std.algorithm.mutation : copy;
3227         length = range.length;
3228         copy(range, data[0..$-1]);
3229     }
3230 
3231     this(Range)(Range range)
3232         if (isForwardRange!Range && !hasLength!Range)
3233     {
3234         import std.algorithm.mutation : copy;
3235         import std.range.primitives : walkLength;
3236         immutable len = walkLength(range.save);
3237         length = len;
3238         copy(range, data[0..$-1]);
3239     }
3240 
3241     this(this)
3242     {
3243         if (!empty)
3244         {
3245             refCount = refCount + 1;
3246         }
3247     }
3248 
3249     ~this()
3250     {
3251         if (!SP.accessIsSafe)
3252             // detach from the array, we can no longer access it.
3253             data = null;
3254 
3255         if (!empty)
3256         {
3257             immutable cnt = refCount;
3258             if (cnt == 1)
3259                 SP.destroy(data);
3260             else
3261                 refCount = cnt - 1;
3262         }
3263     }
3264 
3265     // no ref-count for empty U24 array
3266     @property bool empty() const { return data.length == 0; }
3267 
3268     // report one less then actual size
3269     @property size_t length() const
3270     {
3271         return data.length ? data.length - 1 : 0;
3272     }
3273 
3274     //+ an extra slot for ref-count
3275     @property void length(size_t len)
3276     {
3277         import std.algorithm.comparison : min;
3278         import std.algorithm.mutation : copy;
3279         if (len == 0)
3280         {
3281             if (!empty)
3282                 freeThisReference();
3283             return;
3284         }
3285         immutable total = len + 1; // including ref-count
3286         if (empty)
3287         {
3288             data = SP.alloc!uint(total);
3289             refCount = 1;
3290             return;
3291         }
3292         immutable cur_cnt = refCount;
3293         if (cur_cnt != 1) // have more references to this memory
3294         {
3295             refCount = cur_cnt - 1;
3296             auto new_data = SP.alloc!uint(total);
3297             // take shrinking into account
3298             auto to_copy = min(total, data.length) - 1;
3299             copy(data[0 .. to_copy], new_data[0 .. to_copy]);
3300             data = new_data; // before setting refCount!
3301             refCount = 1;
3302         }
3303         else // 'this' is the only reference
3304         {
3305             // use the realloc (hopefully in-place operation)
3306             data = SP.realloc(data, total);
3307             refCount = 1; // setup a ref-count in the new end of the array
3308         }
3309     }
3310 
3311     alias opDollar = length;
3312 
3313     uint opIndex()(size_t idx)const
3314     {
3315         return data[idx];
3316     }
3317 
3318     void opIndexAssign(uint val, size_t idx)
3319     {
3320         auto cnt = refCount;
3321         if (cnt != 1)
3322             dupThisReference(cnt);
3323         data[idx] = val;
3324     }
3325 
3326     //
3327     auto opSlice(size_t from, size_t to)
3328     {
3329         if (!empty)
3330         {
3331             auto cnt = refCount;
3332             if (cnt != 1)
3333                 dupThisReference(cnt);
3334         }
3335         return data[from .. to];
3336 
3337     }
3338 
3339     //
3340     auto opSlice(size_t from, size_t to) const
3341     {
3342         return data[from .. to];
3343     }
3344 
3345     // length slices before the ref count
3346     auto opSlice()
3347     {
3348         return opSlice(0, length);
3349     }
3350 
3351     // ditto
3352     auto opSlice() const
3353     {
3354         return opSlice(0, length);
3355     }
3356 
3357     void append(Range)(Range range)
3358         if (isInputRange!Range && hasLength!Range && is(ElementType!Range : uint))
3359     {
3360         size_t nl = length + range.length;
3361         length = nl;
3362         copy(range, this[nl-range.length .. nl]);
3363     }
3364 
3365     void append()(uint[] val...)
3366     {
3367         length = length + val.length;
3368         data[$-val.length-1 .. $-1] = val[];
3369     }
3370 
3371     bool opEquals()(auto const ref CowArray rhs)const
3372     {
3373         if (empty ^ rhs.empty)
3374             return false; // one is empty and the other isn't
3375         return empty || data[0..$-1] == rhs.data[0..$-1];
3376     }
3377 
3378 private:
    // The ref-count lives in the last slot of `data`, right after the elements.
    // Must only be read while `data` is non-empty.
    @property uint refCount() const
    {
        return data[$-1];
    }
3384 
    // Overwrites the ref-count kept in the last slot of the block `data`
    // currently points to.
    @property void refCount(uint cnt)
    {
        data[$-1] = cnt;
    }
3389 
3390     void freeThisReference()
3391     {
3392         immutable count = refCount;
3393         if (count != 1) // have more references to this memory
3394         {
3395             // dec shared ref-count
3396             refCount = count - 1;
3397             data = [];
3398         }
3399         else
3400             SP.destroy(data);
3401         assert(!data.ptr);
3402     }
3403 
3404     void dupThisReference(uint count)
3405     in
3406     {
3407         assert(!empty && count != 1 && count == refCount);
3408     }
3409     do
3410     {
3411         import std.algorithm.mutation : copy;
3412         // dec shared ref-count
3413         refCount = count - 1;
3414         // copy to the new chunk of RAM
3415         auto new_data = SP.alloc!uint(data.length);
3416         // bit-blit old stuff except the counter
3417         copy(data[0..$-1], new_data[0..$-1]);
3418         data = new_data; // before setting refCount!
3419         refCount = 1; // so that this updates the right one
3420     }
3421 
3422     uint[] data;
3423 }
3424 
pure @safe unittest// Uint24 tests (exercises CowArray with every storage policy)
{
    import std.algorithm.comparison : equal;
    import std.algorithm.mutation : copy;
    import std.conv : text;
    import std.range : iota, chain;
    import std.range.primitives : isBidirectionalRange, isOutputRange;
    // Mutates the array passed by reference while a copy (`u24_c`) is alive;
    // the copy must keep observing the old contents.
    void funcRef(T)(ref T u24)
    {
        u24.length = 2;
        u24[1] = 1024;
        T u24_c = u24;
        assert(u24[1] == 1024);
        u24.length = 0;
        assert(u24.empty);
        u24.append([1, 2]);
        assert(equal(u24[], [1, 2]));
        u24.append(111);
        assert(equal(u24[], [1, 2, 111]));
        assert(!u24_c.empty && u24_c[1] == 1024);
        u24.length = 3;
        copy(iota(0, 3), u24[]);
        assert(equal(u24[], iota(0, 3)));
        assert(u24_c[1] == 1024);
    }

    // Takes the array by value and juggles several copies of it; the caller's
    // array must come out unchanged.
    void func2(T)(T u24)
    {
        T u24_2 = u24;
        T u24_3;
        u24_3 = u24_2;
        assert(u24_2 == u24_3);
        assert(equal(u24[], u24_2[]));
        assert(equal(u24_2[], u24_3[]));
        funcRef(u24_3);

        assert(equal(u24_3[], iota(0, 3)));
        assert(!equal(u24_2[], u24_3[]));
        assert(equal(u24_2[], u24[]));
        u24_2 = u24_3;
        assert(equal(u24_2[], iota(0, 3)));
        // to test that passed arg is intact outside
        // plus try out opEquals
        u24 = u24_3;
        u24 = T.init;
        u24_3 = T.init;
        assert(u24.empty);
        assert(u24 == u24_3);
        assert(u24 != u24_2);
    }

    static foreach (Policy; AliasSeq!(GcPolicy, ReallocPolicy))
    {{
        // the slice type must satisfy the usual range interfaces
        alias Range = typeof(CowArray!Policy.init[]);
        alias U24A = CowArray!Policy;
        static assert(isForwardRange!Range);
        static assert(isBidirectionalRange!Range);
        static assert(isOutputRange!(Range, uint));
        static assert(isRandomAccessRange!(Range));

        auto arr = U24A([42u, 36, 100]);
        assert(arr[0] == 42);
        assert(arr[1] == 36);
        arr[0] = 72;
        arr[1] = 0xFE_FEFE;
        assert(arr[0] == 72);
        assert(arr[1] == 0xFE_FEFE);
        assert(arr[2] == 100);
        U24A arr2 = arr;
        assert(arr2[0] == 72);
        arr2[0] = 11;
        // test COW-ness
        assert(arr[0] == 72);
        assert(arr2[0] == 11);
        // set this to about 100M to stress-test COW memory management
        foreach (v; 0 .. 10_000)
            func2(arr);
        assert(equal(arr[], [72, 0xFE_FEFE, 100]));

        // writing through a sub-slice must leave the rest of the array alone
        auto r2 = U24A(iota(0, 100));
        assert(equal(r2[], iota(0, 100)), text(r2[]));
        copy(iota(10, 170, 2), r2[10 .. 90]);
        assert(equal(r2[], chain(iota(0, 10), iota(10, 170, 2), iota(90, 100)))
               , text(r2[]));
    }}
}
3511 
pure @safe unittest// core set primitives test
{
    import std.conv : text;
    // every check is repeated for both storage policies
    alias AllSets = AliasSeq!(InversionList!GcPolicy, InversionList!ReallocPolicy);
    foreach (CodeList; AllSets)
    {
        CodeList a;
        //"plug a hole" test
        a.add(10, 20).add(25, 30).add(15, 27);
        assert(a == CodeList(10, 30), text(a));

        auto x = CodeList.init;
        x.add(10, 20).add(30, 40).add(50, 60);

        a = x;
        a.add(20, 49);//[10, 49) [50, 60)
        assert(a == CodeList(10, 49, 50 ,60));

        a = x;
        a.add(20, 50);
        assert(a == CodeList(10, 60), text(a));

        // simple unions, mostly edge effects
        x = CodeList.init;
        x.add(10, 20).add(40, 60);

        a = x;
        a.add(10, 25); //[10, 25) [40, 60)
        assert(a == CodeList(10, 25, 40, 60));

        a = x;
        a.add(5, 15); //[5, 20) [40, 60)
        assert(a == CodeList(5, 20, 40, 60));

        a = x;
        a.add(0, 10); // [0, 20) [40, 60)
        assert(a == CodeList(0, 20, 40, 60));

        a = x;
        a.add(0, 5); // prepend
        assert(a == CodeList(0, 5, 10, 20, 40, 60), text(a));

        a = x;
        a.add(5, 20);
        assert(a == CodeList(5, 20, 40, 60));

        a = x;
        a.add(3, 37);
        assert(a == CodeList(3, 37, 40, 60));

        a = x;
        a.add(37, 65);
        assert(a == CodeList(10, 20, 37, 65));

        // some tests on helpers for set intersection
        x = CodeList.init.add(10, 20).add(40, 60).add(100, 120);
        a = x;

        auto m = a.skipUpTo(60);
        a.dropUpTo(110, m);
        assert(a == CodeList(10, 20, 40, 60, 110, 120), text(a.data[]));

        a = x;
        a.dropUpTo(100);
        assert(a == CodeList(100, 120), text(a.data[]));

        a = x;
        m = a.skipUpTo(50);
        a.dropUpTo(140, m);
        assert(a == CodeList(10, 20, 40, 50), text(a.data[]));
        a = x;
        a.dropUpTo(60);
        assert(a == CodeList(100, 120), text(a.data[]));
    }
}
3587 
3588 
// test that the constructor works with any order of intervals
pure @safe unittest
{
    import std.algorithm.comparison : equal;
    import std.conv : text, to;
    import std.range : chain, iota;
    import std.typecons : tuple;
    //ensure constructor handles bad ordering and overlap
    auto c1 = CodepointSet('а', 'я'+1, 'А','Я'+1);
    foreach (ch; chain(iota('а', 'я'+1), iota('А','Я'+1)))
        assert(ch in c1, to!string(ch));

    //contiguous
    assert(CodepointSet(1000, 1006, 1006, 1009)
        .byInterval.equal([tuple(1000, 1009)]));
    //contains
    assert(CodepointSet(900, 1200, 1000, 1100)
        .byInterval.equal([tuple(900, 1200)]));
    //intersect left
    assert(CodepointSet(900, 1100, 1000, 1200)
        .byInterval.equal([tuple(900, 1200)]));
    //intersect right
    assert(CodepointSet(1000, 1200, 900, 1100)
        .byInterval.equal([tuple(900, 1200)]));

    //ditto with extra items at end
    assert(CodepointSet(1000, 1200, 900, 1100, 800, 850)
        .byInterval.equal([tuple(800, 850), tuple(900, 1200)]));
    assert(CodepointSet(900, 1100, 1000, 1200, 800, 850)
        .byInterval.equal([tuple(800, 850), tuple(900, 1200)]));

    //"plug a hole" test
    auto c2 = CodepointSet(20, 40,
        60, 80, 100, 140, 150, 200,
        40, 60, 80, 100, 140, 150
    );
    assert(c2.byInterval.equal([tuple(20, 200)]));

    // unsorted, overlapping and adjacent intervals all at once
    auto c3 = CodepointSet(
        20, 40, 60, 80, 100, 140, 150, 200,
        0, 10, 15, 100, 10, 20, 200, 220);
    assert(c3.byInterval.equal([tuple(0, 140), tuple(150, 220)]));
}
3632 
3633 
pure @safe unittest
{   // full set operations
    import std.conv : text;
    // every check is repeated for both storage policies
    alias AllSets = AliasSeq!(InversionList!GcPolicy, InversionList!ReallocPolicy);
    foreach (CodeList; AllSets)
    {
        CodeList a, b, c, d;

        // ---- union (|), checked in both operand orders ----
        //"plug a hole"
        a.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
        b.add(40, 60).add(80, 100).add(140, 150);
        c = a | b;
        d = b | a;
        assert(c == CodeList(20, 200), text(CodeList.stringof," ", c));
        assert(c == d, text(c," vs ", d));

        b = CodeList.init.add(25, 45).add(65, 85).add(95,110).add(150, 210);
        c = a | b; //[20,45) [60, 85) [95, 140) [150, 210)
        d = b | a;
        assert(c == CodeList(20, 45, 60, 85, 95, 140, 150, 210), text(c));
        assert(c == d, text(c," vs ", d));

        b = CodeList.init.add(10, 20).add(30,100).add(145,200);
        c = a | b;//[10, 140) [145, 200)
        d = b | a;
        assert(c == CodeList(10, 140, 145, 200));
        assert(c == d, text(c," vs ", d));

        b = CodeList.init.add(0, 10).add(15, 100).add(10, 20).add(200, 220);
        c = a | b;//[0, 140) [150, 220)
        d = b | a;
        assert(c == CodeList(0, 140, 150, 220));
        assert(c == d, text(c," vs ", d));


        // ---- intersection (&) ----
        a = CodeList.init.add(20, 40).add(60, 80);
        b = CodeList.init.add(25, 35).add(65, 75);
        c = a & b;
        d = b & a;
        assert(c == CodeList(25, 35, 65, 75), text(c));
        assert(c == d, text(c," vs ", d));

        a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
        b = CodeList.init.add(25, 35).add(65, 75).add(110, 130).add(160, 180);
        c = a & b;
        d = b & a;
        assert(c == CodeList(25, 35, 65, 75, 110, 130, 160, 180), text(c));
        assert(c == d, text(c," vs ", d));

        a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
        b = CodeList.init.add(10, 30).add(60, 120).add(135, 160);
        c = a & b;//[20, 30)[60, 80) [100, 120) [135, 140) [150, 160)
        d = b & a;

        assert(c == CodeList(20, 30, 60, 80, 100, 120, 135, 140, 150, 160),text(c));
        assert(c == d, text(c, " vs ",d));
        assert((c & a) == c);
        assert((d & b) == d);
        assert((c & d) == d);

        b = CodeList.init.add(40, 60).add(80, 100).add(140, 200);
        c = a & b;
        d = b & a;
        assert(c == CodeList(150, 200), text(c));
        assert(c == d, text(c, " vs ",d));
        assert((c & a) == c);
        assert((d & b) == d);
        assert((c & d) == d);

        assert((a & a) == a);
        assert((b & b) == b);

        // ---- difference (-) ----
        a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
        b = CodeList.init.add(30, 60).add(75, 120).add(190, 300);
        c = a - b;// [20, 30) [60, 75) [120, 140) [150, 190)
        d = b - a;// [40, 60) [80, 100) [200, 300)
        assert(c == CodeList(20, 30, 60, 75, 120, 140, 150, 190), text(c));
        assert(d == CodeList(40, 60, 80, 100, 200, 300), text(d));
        assert(c - d == c, text(c-d, " vs ", c));
        assert(d - c == d, text(d-c, " vs ", d));
        assert(c - c == CodeList.init);
        assert(d - d == CodeList.init);

        a = CodeList.init.add(20, 40).add( 60, 80).add(100, 140).add(150,            200);
        b = CodeList.init.add(10,  50).add(60,                           160).add(190, 300);
        c = a - b;// [160, 190)
        d = b - a;// [10, 20) [40, 50) [80, 100) [140, 150) [200, 300)
        assert(c == CodeList(160, 190), text(c));
        assert(d == CodeList(10, 20, 40, 50, 80, 100, 140, 150, 200, 300), text(d));
        assert(c - d == c, text(c-d, " vs ", c));
        assert(d - c == d, text(d-c, " vs ", d));
        assert(c - c == CodeList.init);
        assert(d - d == CodeList.init);

        // ---- symmetric difference (~) ----
        a = CodeList.init.add(20,    40).add(60, 80).add(100,      140).add(150,  200);
        b = CodeList.init.add(10, 30).add(45,         100).add(130,             190);
        c = a ~ b; // [10, 20) [30, 40) [45, 60) [80, 130) [140, 150) [190, 200)
        d = b ~ a;
        assert(c == CodeList(10, 20, 30, 40, 45, 60, 80, 130, 140, 150, 190, 200),
               text(c));
        assert(c == d, text(c, " vs ", d));
    }
}
3737 
3738 }
3739 
pure @safe unittest// vs single dchar
{
    import std.conv : text;
    CodepointSet a = CodepointSet(10, 100, 120, 200);
    // 'A' is code point 65: subtracting it punches a one-element hole in [10, 100)
    assert(a - 'A' == CodepointSet(10, 65, 66, 100, 120, 200), text(a - 'A'));
    // 'B' is code point 66: intersecting leaves just [66, 67)
    assert((a & 'B') == CodepointSet(66, 67));
}
3747 
pure @safe unittest// iteration & opIndex
{
    import std.algorithm.comparison : equal;
    import std.conv : text;
    import std.typecons : tuple, Tuple;

    static foreach (CodeList; AliasSeq!(InversionList!(ReallocPolicy)))
    {{
        auto arr = "ABCDEFGHIJKLMabcdefghijklm"d;
        auto a = CodeList('A','N','a', 'n');
        assert(equal(a.byInterval,
                [tuple(cast(uint)'A', cast(uint)'N'), tuple(cast(uint)'a', cast(uint)'n')]
            ), text(a.byInterval));

        // same @@@BUG as in https://issues.dlang.org/show_bug.cgi?id=8949 ?
        version (bug8949)
        {
            import std.range : retro;
            assert(equal(retro(a.byInterval),
                [tuple(cast(uint)'a', cast(uint)'n'), tuple(cast(uint)'A', cast(uint)'N')]
            ), text(retro(a.byInterval)));
        }
        // every code point produced by iteration must also test as a member
        auto achr = a.byCodepoint;
        assert(equal(achr, arr), text(a.byCodepoint));
        foreach (ch; a.byCodepoint)
            assert(a[ch]);
        auto x = CodeList(100, 500, 600, 900, 1200, 1500);
        assert(equal(x.byInterval, [ tuple(100, 500), tuple(600, 900), tuple(1200, 1500)]), text(x.byInterval));
        foreach (ch; x.byCodepoint)
            assert(x[ch]);
        // round-trip through the interval range (only compiled for CodepointSet)
        static if (is(CodeList == CodepointSet))
        {
            auto y = CodeList(x.byInterval);
            assert(equal(x.byInterval, y.byInterval));
        }
        // an empty set yields empty ranges
        assert(equal(CodepointSet.init.byInterval, cast(Tuple!(uint, uint)[])[]));
        assert(equal(CodepointSet.init.byCodepoint, cast(dchar[])[]));
    }}
}
3787 
3788 //============================================================================
3789 // Generic Trie template and various ways to build it
3790 //============================================================================
3791 
// Debug helper: textual dump of an array. Anything longer than 32 elements
// is abbreviated to its first 16 and last 16 elements joined by "~...~".
auto arrayRepr(T)(T x)
{
    import std.conv : text;
    enum edge = 16;
    if (x.length <= 2 * edge)
        return text(x);
    return text(x[0 .. edge],"~...~", x[x.length-edge .. x.length]);
}
3803 
/**
    Maps `Key` to a suitable integer index within the range of `size_t`.

    The predicates of `Prefix` are applied to the key from left to right and
    the bits they produce are concatenated: before the result of a predicate
    is merged in, the index accumulated so far is shifted left by that
    predicate's `bitSize`.

    Consequently the first (leftmost) predicate defines the most significant
    bits of the resulting index.
 */
template mapTrieIndex(Prefix...)
{
    size_t mapTrieIndex(Key)(Key key)
        if (isValidPrefixForTrie!(Key, Prefix))
    {
        size_t index;
        // the leading predicate supplies the top bits ...
        index |= Prefix[0](key);
        // ... every following one makes room for its own bits, then adds them
        foreach (n, unused; Prefix[1 .. $])
        {
            index <<= Prefix[n+1].bitSize;
            index |= Prefix[n+1](key);
        }
        return index;
    }
}
3828 
/*
    `TrieBuilder` is a type used for incremental construction
    of $(LREF Trie)s.

    Values are fed in order of growing index (`putValue`, `putRange`); gaps
    are filled with the filler value given to the constructor, and `build`
    pads the remainder of the domain before handing the table to `Trie`.

    `Args` is either the list of prefix predicates, or an upper bound on
    `Key` followed by the list of prefix predicates.

    See $(LREF buildTrie) for generic helpers built on top of it.
*/
@trusted private struct TrieBuilder(Value, Key, Args...)
if (isBitPackableType!Value && isValidArgsForTrie!(Key, Args))
{
    import std.exception : enforce;

private:
    // last index is not stored in table, it is used as an offset to values in a block.
    static if (is(Value == bool))// always pack bool
        alias V = BitPacked!(Value, 1);
    else
        alias V = Value;
    // Number of distinct indices addressable by `Preds`:
    // the product of 2^^bitSize over all predicates.
    static auto deduceMaxIndex(Preds...)()
    {
        size_t idx = 1;
        foreach (v; Preds)
            idx *= 2^^v.bitSize;
        return idx;
    }

    static if (is(typeof(Args[0]) : Key)) // Args start with upper bound on Key
    {
        alias Prefix = Args[1..$];
        enum lastPageSize = 2^^Prefix[$-1].bitSize;
        enum translatedMaxIndex = mapTrieIndex!(Prefix)(Args[0]);
        // round the mapped upper bound up to a whole number of last-level pages
        enum roughedMaxIndex =
            (translatedMaxIndex + lastPageSize-1)/lastPageSize*lastPageSize;
        // check wrap around - if wrapped, use the default deduction rule
        enum maxIndex = roughedMaxIndex < translatedMaxIndex ?
            deduceMaxIndex!(Prefix)() : roughedMaxIndex;
    }
    else
    {
        alias Prefix = Args;
        enum maxIndex = deduceMaxIndex!(Prefix)();
    }

    // maps a key to its full-width index
    alias getIndex = mapTrieIndex!(Prefix);

    enum lastLevel = Prefix.length-1;
    // Per-level bookkeeping; size_t.max marks "no such page known yet"
    // (see the constructor and the `idx_zeros` checks below).
    struct ConstructState
    {
        size_t idx_zeros, idx_ones;
    }
    // iteration over levels of Trie, each indexes its own level and thus a shortened domain
    size_t[Prefix.length] indices;
    // default filler value to use
    Value defValue;
    // this is a full-width index of next item
    size_t curIndex;
    // all-zeros page index, all-ones page index (+ indicator if there is such a page)
    ConstructState[Prefix.length] state;
    // the table being constructed
    MultiArray!(idxTypes!(Key, fullBitSize!(Prefix), Prefix[0..$]), V) table;

    @disable this();

    //shortcut for index variable at level 'level'
    @property ref idx(size_t level)(){ return indices[level]; }

    // Writes `numVals` copies of `val` at the current position of `level`.
    // this function assumes no holes in the input so
    // indices are going one by one
    void addValue(size_t level, T)(T val, size_t numVals)
    {
        alias j = idx!level;
        enum pageSize = 1 << Prefix[level].bitSize;
        if (numVals == 0)
            return;
        auto ptr = table.slice!(level);
        if (numVals == 1)
        {
            static if (level == Prefix.length-1)
                ptr[j] = val;
            else
            {// can incur narrowing conversion
                assert(j < ptr.length);
                ptr[j] = force!(typeof(ptr[j]))(val);
            }
            j++;
            // the page just got completed: register it with the level above
            if (j % pageSize == 0)
                spillToNextPage!level(ptr);
            return;
        }
        // longer row of values
        // get to the next page boundary
        immutable nextPB = (j + pageSize) & ~(pageSize-1);
        immutable n =  nextPB - j;// can fill right in this page
        if (numVals < n) //fits in current page
        {
            ptr[j .. j+numVals]  = val;
            j += numVals;
            return;
        }
        static if (level != 0)//on the first level it always fits
        {
            numVals -= n;
            //write till the end of current page
            ptr[j .. j+n]  = val;
            j += n;
            //spill to the next page
            spillToNextPage!level(ptr);
            // page at once loop
            if (state[level].idx_zeros != size_t.max && val == T.init)
            {
                // a page of T.init values is already known on this level:
                // point the upper level at it once per whole page instead of
                // writing the pages out
                alias NextIdx = typeof(table.slice!(level-1)[0]);
                addValue!(level-1)(force!NextIdx(state[level].idx_zeros),
                    numVals/pageSize);
                ptr = table.slice!level; //table structure might have changed
                numVals %= pageSize;
            }
            else
            {
                while (numVals >= pageSize)
                {
                    numVals -= pageSize;
                    ptr[j .. j+pageSize]  = val;
                    j += pageSize;
                    spillToNextPage!level(ptr);
                }
            }
            if (numVals)
            {
                // the leftovers, an incomplete page
                ptr[j .. j+numVals]  = val;
                j += numVals;
            }
        }
    }

    // Called whenever a page of `level` has been filled completely.
    void spillToNextPage(size_t level, Slice)(ref Slice ptr)
    {
        // last level (i.e. topmost) has 1 "page"
        // thus it need not to add a new page on upper level
        static if (level != 0)
            spillToNextPageImpl!(level)(ptr);
    }

    // this can re-use the current page if duplicate or allocate a new one
    // it also makes sure that previous levels point to the correct page in this level
    void spillToNextPageImpl(size_t level, Slice)(ref Slice ptr)
    {
        alias NextIdx = typeof(table.slice!(level-1)[0]);
        NextIdx next_lvl_index;
        enum pageSize = 1 << Prefix[level].bitSize;
        assert(idx!level % pageSize == 0);
        // start of the page that has just been completed
        immutable last = idx!level-pageSize;
        const slice = ptr[idx!level - pageSize .. idx!level];
        size_t j;
        // compare the completed page against every earlier page of this level
        for (j=0; j<last; j+=pageSize)
        {
            if (ptr[j .. j+pageSize] == slice)
            {
                // get index to it, reuse ptr space for the next block
                next_lvl_index = force!NextIdx(j/pageSize);
                version (none)
                {
                import std.stdio : writefln, writeln;
                writefln("LEVEL(%s) page mapped idx: %s: 0..%s  ---> [%s..%s]"
                        ,level
                        ,indices[level-1], pageSize, j, j+pageSize);
                writeln("LEVEL(", level
                        , ") mapped page is: ", slice, ": ", arrayRepr(ptr[j .. j+pageSize]));
                writeln("LEVEL(", level
                        , ") src page is :", ptr, ": ", arrayRepr(slice[0 .. pageSize]));
                }
                idx!level -= pageSize; // reuse this page, it is duplicate
                break;
            }
        }
        // the loop ran to completion, i.e. no earlier page is identical
        if (j == last)
        {
    L_allocate_page:
            next_lvl_index = force!NextIdx(idx!level/pageSize - 1);
            // remember the first all-zeros page of this level
            if (state[level].idx_zeros == size_t.max && ptr.zeros(j, j+pageSize))
            {
                state[level].idx_zeros = next_lvl_index;
            }
            // allocate next page
            version (none)
            {
            import std.stdio : writefln;
            writefln("LEVEL(%s) page allocated: %s"
                     , level, arrayRepr(slice[0 .. pageSize]));
            writefln("LEVEL(%s) index: %s ; page at this index %s"
                     , level
                     , next_lvl_index
                     , arrayRepr(
                         table.slice!(level)
                          [pageSize*next_lvl_index..(next_lvl_index+1)*pageSize]
                        ));
            }
            table.length!level = table.length!level + pageSize;
        }
    L_know_index:
        // for the previous level, values are indices to the pages in the current level
        addValue!(level-1)(next_lvl_index, 1);
        ptr = table.slice!level; //re-load the slice after moves
    }

    // idx - full-width index to fill with v (full-width index != key)
    // fills everything in the range of [curIndex, idx) with filler
    void putAt(size_t idx, Value v)
    {
        assert(idx >= curIndex);
        immutable numFillers = idx - curIndex;
        addValue!lastLevel(defValue, numFillers);
        addValue!lastLevel(v, 1);
        curIndex = idx + 1;
    }

    // ditto, but sets the range of [idxA, idxB) to v
    void putRangeAt(size_t idxA, size_t idxB, Value v)
    {
        assert(idxA >= curIndex);
        assert(idxB >= idxA);
        size_t numFillers = idxA - curIndex;
        addValue!lastLevel(defValue, numFillers);
        addValue!lastLevel(v, idxB - idxA);
        curIndex = idxB; // open-right
    }

    // message of the exception thrown by `putRange`/`putValue` on out-of-order input
    enum errMsg = "non-monotonic prefix function(s), an unsorted range or "~
        "duplicate key->value mapping";

public:
    /**
        Construct a builder, where `filler` is a value
        to indicate empty slots (or "not found" condition).
    */
    this(Value filler)
    {
        curIndex = 0;
        defValue = filler;
        // zeros-page index, ones-page index
        foreach (ref v; state)
            v = ConstructState(size_t.max, size_t.max);
        table = typeof(table)(indices);
        // one page per level is a bootstrap minimum
        foreach (i, Pred; Prefix)
            table.length!i = (1 << Pred.bitSize);
    }

    /**
        Put a value `v` into interval as
        mapped by keys from `a` to `b`.
        All slots prior to `a` are filled with
        the default filler.

        The interval is open on the right: the slot mapped by `b` is not set.
        Throws an exception (via `enforce`) if the mapped indices do not grow.
    */
    void putRange(Key a, Key b, Value v)
    {
        auto idxA = getIndex(a), idxB = getIndex(b);
        // indexes of key should always grow
        enforce(idxB >= idxA && idxA >= curIndex, errMsg);
        putRangeAt(idxA, idxB, v);
    }

    /**
        Put a value `v` into slot mapped by `key`.
        All slots prior to `key` are filled with the
        default filler.

        Throws an exception (via `enforce`) if the mapped index lies before
        the current position.
    */
    void putValue(Key key, Value v)
    {
        auto idx = getIndex(key);
        enforce(idx >= curIndex, errMsg);
        putAt(idx, v);
    }

    /// Finishes construction of Trie, yielding an immutable Trie instance.
    auto build()
    {
        static if (maxIndex != 0) // doesn't cover full range of size_t
        {
            assert(curIndex <= maxIndex);
            // pad the rest of the domain with the filler
            addValue!lastLevel(defValue, maxIndex - curIndex);
        }
        else
        {
            if (curIndex != 0 // couldn't wrap around
                || (Prefix.length != 1 && indices[lastLevel] == 0)) // can be just empty
            {
                // size_t.max - curIndex + 1 slots remain; added in two steps
                // because that total need not fit into a size_t
                addValue!lastLevel(defValue, size_t.max - curIndex);
                addValue!lastLevel(defValue, 1);
            }
            // else curIndex already completed the full range of size_t by wrapping around
        }
        return Trie!(V, Key, maxIndex, Prefix)(table);
    }
}
4123 
/**
    $(P A generic Trie data-structure for a fixed number of stages.
    The design goal is optimal speed with smallest footprint size.
    )
    $(P It's intentionally read-only and doesn't provide constructors.
     To construct one use a special builder,
     see $(LREF TrieBuilder) and $(LREF buildTrie).
    )

    Params:
        Value = type of the values stored in the last stage
        Key = type of the keys accepted by `opIndex`
        Args = the prefix predicates (one per stage, most significant bits
            first), optionally preceded by a `size_t` upper bound on the
            mapped index. A bound of `0` means that the whole range of
            `size_t` is covered, which is what `TrieBuilder.build` passes
            in that case.
*/
@trusted private struct Trie(Value, Key, Args...)
if (isValidPrefixForTrie!(Key, Args)
    || (isValidPrefixForTrie!(Key, Args[1..$])
    && is(typeof(Args[0]) : size_t)))
{
    import std.range.primitives : isOutputRange;
    static if (is(typeof(Args[0]) : size_t))
    {
        // Args = (upper bound on the mapped index, predicates...)
        private enum maxIndex = Args[0];
        private enum hasBoundsCheck = true;
        private alias Prefix = Args[1..$];
    }
    else
    {
        // Args = (predicates...)
        private enum hasBoundsCheck = false;
        private alias Prefix = Args;
    }

    // takes over a table prepared by the builder
    private this()(typeof(_table) table)
    {
        _table = table;
    }

    // only for constant Tries constructed from precompiled tables
    private this()(const(size_t)[] offsets, const(size_t)[] sizes,
        const(size_t)[] data) const
    {
        _table = typeof(_table)(offsets, sizes, data);
    }

    /**
        $(P Lookup the `key` in this `Trie`. )

        $(P The lookup always succeeds if key fits the domain
        provided during construction. The whole domain defined
        is covered so instead of not found condition
        the sentinel (filler) value could be used. )

        $(P See $(LREF buildTrie), $(LREF TrieBuilder) for how to
        define a domain of `Trie` keys and the sentinel value. )

        Note:
        Domain range-checking is only enabled in debug builds
        and results in assertion failure.
    */
    TypeOfBitPacked!Value opIndex()(Key key) const
    {
        static if (hasBoundsCheck)
        {
            // `maxIndex == 0` stands for "covers the full range of size_t":
            // every index is valid then, whereas `index < 0` could never hold
            // and would make each lookup fail the assertion.
            static if (maxIndex != 0)
                assert(mapTrieIndex!Prefix(key) < maxIndex);
        }
        size_t idx;
        alias p = Prefix;
        idx = cast(size_t) p[0](key);
        // each stage yields a page number for the next stage; the next
        // predicate supplies the offset within that page
        foreach (i, v; p[0..$-1])
            idx = cast(size_t)((_table.ptr!i[idx]<<p[i+1].bitSize) + p[i+1](key));
        return _table.ptr!(p.length-1)[idx];
    }

    /// Forwards to the `bytes!n` property of the underlying table.
    @property size_t bytes(size_t n=size_t.max)() const
    {
        return _table.bytes!n;
    }

    /// `bytes!n` divided by the page size (`2^^bitSize`) of stage `n`,
    /// rounded to the nearest integer.
    @property size_t pages(size_t n)() const
    {
        return (bytes!n+2^^(Prefix[n].bitSize-1))
                /2^^Prefix[n].bitSize;
    }

    /// Forwards to the `store` method of the underlying table, passing `sink`.
    void store(OutRange)(scope OutRange sink) const
        if (isOutputRange!(OutRange, char))
    {
        _table.store(sink);
    }

private:
    // the packed multi-stage table holding all levels of the trie
    MultiArray!(idxTypes!(Key, fullBitSize!(Prefix), Prefix[0..$]), Value) _table;
}
4214 
// Builds a sequence of `sliceBits` that cut the topmost `top` bits into
// consecutive pieces whose widths are given by `sizes`, most significant
// piece first: each piece spans [top - sizes[0], top) and the remaining
// sizes continue right below it.
template GetBitSlicing(size_t top, sizes...)
{
    static if (sizes.length == 0)
        alias GetBitSlicing = AliasSeq!();
    else
    {
        // lower bound of the current piece == upper bound of the next one
        enum size_t lower = top - sizes[0];
        alias GetBitSlicing =
            AliasSeq!(sliceBits!(lower, top),
                      GetBitSlicing!(lower, sizes[1..$]));
    }
}
4226 
4227 template callableWith(T)
4228 {
4229     // Curried form: `callableWith!T` is itself a one-argument template,
4230     // which is what `allSatisfy` needs. It is true iff `Pred(T.init)` compiles and
4231     // its result type, passed through `TypeOfBitPacked`, satisfies `isBitPackableType`.
4232     template callableWith(alias Pred)
4233     {
4234         static if (is(typeof(Pred(T.init)) Result))
4235             enum callableWith = isBitPackableType!(TypeOfBitPacked!(Result));
4236         else
4237             enum callableWith = false;
4238     }
4239 }
4240 
4241 /*
4242     Check if `Prefix` is a valid set of predicates
4243     for `Trie` template having `Key` as the type of keys.
4244     This requires every predicate to satisfy `callableWith!Key`: callable with a
4245     single argument of type `Key` and returning a bit-packable value.
4246 */
4247 template isValidPrefixForTrie(Key, Prefix...)
4248 {
4249     import std.meta : allSatisfy;
4250     enum isValidPrefixForTrie = allSatisfy!(callableWith!Key, Prefix); // TODO: tighten the screws
4251 }
4252 
4253 /*
4254     Check if `Args` is either a tuple of valid predicates for a `Trie` keyed by `Key`,
4255     or a maximum key value (convertible to `Key`) followed by such predicates.
4256 */
4257 template isValidArgsForTrie(Key, Args...)
4258 {
4259     static if (Args.length > 1)
4260     {
4261         enum isValidArgsForTrie = isValidPrefixForTrie!(Key, Args)
4262             || (isValidPrefixForTrie!(Key, Args[1..$]) && is(typeof(Args[0]) : Key));
4263     }
4264     else // a lone argument must be a predicate; pass `Key` along so it is really checked against it
4265         enum isValidArgsForTrie = Args.length == 1 && isValidPrefixForTrie!(Key, Args);
4266 }
4267 
4268 @property size_t sumOfIntegerTuple(ints...)()
4269 {
4270     size_t total; // running sum; stays 0 for an empty tuple
4271     foreach (term; ints) // `ints` is a compile-time tuple
4272         total += term;
4273     return total;
4274 }
4275 
4276 /**
4277     A shorthand for creating a custom multi-level fixed Trie
4278     from a `CodepointSet`. `sizes` are numbers of bits per level,
4279     with the most significant bits used first.
4280 
4281     Note: The sum of `sizes` must be equal to 21.
4282 
4283     See_Also: $(LREF toTrie), which is even simpler.
4284 
4285     Example:
4286     ---
4287     {
4288         import std.stdio;
4289         auto set = unicode("Number");
4290         auto trie = codepointSetTrie!(8, 5, 8)(set);
4291         writeln("Input code points to test:");
4292         foreach (line; stdin.byLine)
4293         {
4294             int count=0;
4295             foreach (dchar ch; line)
4296                 if (trie[ch])// is number
4297                     count++;
4298             writefln("Contains %d number code points.", count);
4299         }
4300     }
4301     ---
4302 */
4303 public template codepointSetTrie(sizes...)
4304 if (sumOfIntegerTuple!sizes == 21)
4305 {
4306     auto codepointSetTrie(Set)(Set set)
4307         if (isCodepointSet!Set)
4308     {
4309         auto trieBuilder = TrieBuilder!(bool, dchar, lastDchar+1, GetBitSlicing!(21, sizes))(false); // filler is false
4310         foreach (interval; set.byInterval) // mark every interval of the set as true
4311             trieBuilder.putRange(interval[0], interval[1], true);
4312         return trieBuilder.build();
4313     }
4314 }
4315 
4316 /// Type of the Trie returned by `codepointSetTrie!(sizes)`: a `bool`-valued Trie keyed by `dchar`.
4317 public template CodepointSetTrie(sizes...)
4318 if (sumOfIntegerTuple!sizes == 21)
4319 {
4320     alias Prefix = GetBitSlicing!(21, sizes); // same bit slicing as used by codepointSetTrie
4321     alias CodepointSetTrie = typeof(TrieBuilder!(bool, dchar, lastDchar+1, Prefix)(false).build());
4322 }
4323 
4324 /**
4325     A slightly more general tool for building fixed `Trie`
4326     for the Unicode data.
4327 
4328     Specifically, unlike `codepointSetTrie`, it allows creating mappings
4329     of `dchar` to an arbitrary type `T`.
4330 
4331     Note: Overload taking `CodepointSet`s will naturally convert
4332     only to bool mapping `Trie`s.
4333 
4334     CodepointTrie is the type of Trie as generated by codepointTrie function.
4335 */
4336 public template codepointTrie(T, sizes...)
4337 if (sumOfIntegerTuple!sizes == 21)
4338 {
4339     alias Prefix = GetBitSlicing!(21, sizes);
4340
4341     static if (is(TypeOfBitPacked!T == bool)) // set overload exists only for bool-like values
4342     {
4343         auto codepointTrie(Set)(const scope Set set)
4344             if (isCodepointSet!Set)
4345         {   // `sizes` must be forwarded explicitly: it cannot be deduced from `set`
4346             return codepointSetTrie!(sizes)(set);
4347         }
4348     }
4349
4350     /// Builds the Trie from an associative array, passing `defValue` to `buildTrie` as the filler.
4351     auto codepointTrie()(T[dchar] map, T defValue=T.init)
4352     {
4353         return buildTrie!(T, dchar, Prefix)(map, defValue);
4354     }
4355
4356     // unsorted range of pairs
4357     /// Builds the Trie from an unsorted range of pairs: element `[0]` is the value, `[1]` the code point.
4358     auto codepointTrie(R)(R range, T defValue=T.init)
4359         if (isInputRange!R
4360             && is(typeof(ElementType!R.init[0]) : T)
4361             && is(typeof(ElementType!R.init[1]) : dchar))
4362     {
4363         // build from unsorted array of pairs (the trailing `true` asks buildTrie to sort first)
4364         // TODO: expose index sorting functions for Trie
4365         return buildTrie!(T, dchar, Prefix)(range, defValue, true);
4366     }
4367 }
4368 
4369 @system pure unittest
4370 {
4371     import std.algorithm.comparison : max;
4372     import std.algorithm.searching : count;
4373
4374     // pick characters from the Greek script
4375     auto set = unicode.Greek;
4376
4377     // a user-defined property (or an expensive function)
4378     // that we want to look up
4379     static uint luckFactor(dchar ch)
4380     {
4381         // here we consider a character lucky
4382         // if its code point has a lot of identical hex-digits
4383         // e.g. arabic letter DDAL (\u0688) is 000688 as 6 hex-digits: a "luck factor" of 3
4384         ubyte[6] nibbles; // 6 4-bit chunks of code point
4385         uint value = ch;
4386         foreach (i; 0 .. 6)
4387         {
4388             nibbles[i] = value & 0xF;
4389             value >>= 4;
4390         }
4391         uint luck;
4392         foreach (n; nibbles) // luck = the highest number of occurrences of any one nibble
4393             luck = cast(uint) max(luck, count(nibbles[], n));
4394         return luck;
4395     }
4396
4397     // only unsigned built-ins are supported at the moment
4398     alias LuckFactor = BitPacked!(uint, 3);
4399
4400     // create a temporary associative array (AA)
4401     LuckFactor[dchar] map;
4402     foreach (ch; set.byCodepoint)
4403         map[ch] = LuckFactor(luckFactor(ch));
4404
4405     // bits per stage are chosen randomly, feel free to optimize
4406     auto trie = codepointTrie!(LuckFactor, 8, 5, 8)(map);
4407
4408     // from now on the AA is not needed
4409     foreach (ch; set.byCodepoint)
4410         assert(trie[ch] == luckFactor(ch)); // verify
4411     // CJK is not Greek, thus it has the default value
4412     assert(trie['\u4444'] == 0);
4413     // and here is a couple of quite lucky Greek characters:
4414     // Greek small letter epsilon with dasia
4415     assert(trie['\u1F11'] == 3);
4416     // Ancient Greek metretes sign
4417     assert(trie['\U00010181'] == 3);
4418
4419 }
4420 
4421 /// ditto
4422 public template CodepointTrie(T, sizes...)
4423 if (sumOfIntegerTuple!sizes == 21)
4424 {
4425     alias Prefix = GetBitSlicing!(21, sizes); // same bit slicing as used by codepointTrie
4426     alias CodepointTrie = typeof(TrieBuilder!(T, dchar, lastDchar+1, Prefix)(T.init).build()); // type only, nothing is built
4427 }
4428 
4429 package(std) template cmpK0(alias Pred)
4430 {
4431     import std.typecons : Tuple;
4432     static bool cmpK0(Value, Key)(Tuple!(Value, Key) a, Tuple!(Value, Key) b) // "less" on (value, key) tuples
4433     {
4434         auto lhsKey = Pred(a[1]), rhsKey = Pred(b[1]); // only field 1 (the key) takes part, mapped by Pred
4435         return lhsKey < rhsKey;
4436     }
4437 }
4438 
4439 /**
4440     The most general utility for construction of `Trie`s
4441     short of using `TrieBuilder` directly.
4442 
4443     Provides a number of convenience overloads.
4444     `Args` is tuple of maximum key value followed by
4445     predicates to construct index from key.
4446 
4447     Alternatively if the first argument is not a value convertible to `Key`
4448     then the whole tuple of `Args` is treated as predicates
4449     and the maximum Key is deduced from predicates.
4450 */
4451 private template buildTrie(Value, Key, Args...)
4452 if (isValidArgsForTrie!(Key, Args))
4453 {
4454     static if (is(typeof(Args[0]) : Key)) // prefix starts with upper bound on Key
4455     {
4456         alias Prefix = Args[1..$];
4457     }
4458     else
4459         alias Prefix = Args;
4460 
4461     alias getIndex = mapTrieIndex!(Prefix);
4462 
4463     // for multi-sort: cmpK0!(Prefix[0]), ..., cmpK0!(Prefix[n-1]), in that order
4464     template GetComparators(size_t n)
4465     {
4466         static if (n == 0)
4467             alias GetComparators = AliasSeq!();
4468         else
4469             alias GetComparators =
4470                 AliasSeq!(GetComparators!(n-1), cmpK0!(Prefix[n-1]));
4471     }
4472 
4473     /*
4474         Build `Trie` from a range of a Key-Value pairs,
4475         assuming it is sorted by Key as defined by the following lambda:
4476         ------
4477         (a, b) => mapTrieIndex!(Prefix)(a) < mapTrieIndex!(Prefix)(b)
4478         ------
4479         Exception is thrown if it's detected that the above order doesn't hold.
4480 
4481         In other words $(LREF mapTrieIndex) should be a
4482         monotonically increasing function that maps `Key` to an integer.
4483 
4484         See_Also: $(REF sort, std,_algorithm),
4485         $(REF SortedRange, std,range),
4486         $(REF setUnion, std,_algorithm).
4487     */
4488     auto buildTrie(Range)(Range range, Value filler=Value.init)
4489         if (isInputRange!Range && is(typeof(Range.init.front[0]) : Value)
4490             && is(typeof(Range.init.front[1]) : Key))
4491     {
4492         auto trieBuilder = TrieBuilder!(Value, Key, Prefix)(filler);
4493         foreach (pair; range) // each element is (value, key); putValue takes (key, value)
4494             trieBuilder.putValue(pair[1], pair[0]);
4495         return trieBuilder.build();
4496     }
4497 
4498     /*
4499         If `Value` is bool (or BitPacked!(bool, x)) then it's possible
4500         to build `Trie` from a range of open-right intervals of `Key`s.
4501         The requirement  on the ordering of keys (and the behavior on the
4502         violation of it) is the same as for Key-Value range overload.
4503 
4504         Intervals denote ranges of !`filler` i.e. the opposite of filler.
4505         If no filler provided keys inside of the intervals map to true,
4506         and `filler` is false.
4507     */
4508     auto buildTrie(Range)(Range range, Value filler=Value.init)
4509         if (is(TypeOfBitPacked!Value ==  bool)
4510             && isInputRange!Range && is(typeof(Range.init.front[0]) : Key)
4511             && is(typeof(Range.init.front[1]) : Key))
4512     {
4513         auto trieBuilder = TrieBuilder!(Value, Key, Prefix)(filler);
4514         foreach (interval; range) // keys of each interval get the opposite of the filler
4515             trieBuilder.putRange(interval[0], interval[1], !filler);
4516         return trieBuilder.build();
4517     }
4518 
4519     auto buildTrie(Range)(Range range, Value filler, bool unsorted)
4520         if (isInputRange!Range
4521             && is(typeof(Range.init.front[0]) : Value)
4522             && is(typeof(Range.init.front[1]) : Key))
4523     {
4524         import std.algorithm.sorting : multiSort;
4525         // when asked to, sort in place using one comparator per level of Prefix
4526         if (unsorted)
4527             multiSort!(GetComparators!(Prefix.length))(range);
4528         return buildTrie(range, filler); // then defer to the sorted-input overload
4529     }
4530 
4531     /*
4532         If `Value` is bool (or BitPacked!(bool, x)) then it's possible
4533         to build `Trie` simply from an input range of `Key`s.
4534         The requirement  on the ordering of keys (and the behavior on the
4535         violation of it) is the same as for Key-Value range overload.
4536 
4537         Keys found in range denote !`filler` i.e. the opposite of filler.
4538         If no filler provided keys map to true, and `filler` is false.
4539     */
4540     auto buildTrie(Range)(Range range, Value filler=Value.init)
4541         if (is(TypeOfBitPacked!Value ==  bool)
4542             && isInputRange!Range && is(typeof(Range.init.front) : Key))
4543     {
4544         auto trieBuilder = TrieBuilder!(Value, Key, Prefix)(filler);
4545         foreach (key; range) // every listed key gets the opposite of the filler
4546             trieBuilder.putValue(key, !filler);
4547         return trieBuilder.build();
4548     }
4549 
4550     /*
4551         If `Key` is unsigned integer `Trie` could be constructed from array
4552         of values where array index serves as key.
4553     */
4554     auto buildTrie()(Value[] array, Value filler=Value.init)
4555         if (isUnsigned!Key)
4556     {
4557         auto trieBuilder = TrieBuilder!(Value, Key, Prefix)(filler);
4558         foreach (position, element; array) // the array index serves as the key
4559             trieBuilder.putValue(position, element);
4560         return trieBuilder.build();
4561     }
4562 
4563     /*
4564         Builds `Trie` from associative array.
4565     */
4566     auto buildTrie(Key, Value)(Value[Key] map, Value filler=Value.init)
4567     {
4568         import std.array : array;
4569         import std.range : zip;
4570         auto pairs = zip(map.values, map.keys).array; // (value, key) pairs, in the AA's arbitrary order
4571         return buildTrie(pairs, filler, true); // sort it
4572     }
4573 }
4574 
4575 // Trie level functor used in place of assumeSize to reduce the mangled name
4576 // & help DMD inline Trie functors; hands its argument back unchanged (no masking is done)
4577 struct clamp(size_t bits)
4578 {
4579     enum bitSize = bits; // declared width of this level, in bits
4580     static size_t opCall(T)(T arg){ return arg; }
4581 }
4582 
4583 struct clampIdx(size_t idx, size_t bits) // like clamp, but picks element `idx` of an indexable key
4584 {
4585     enum bitSize = bits;
4586     static size_t opCall(T)(T arg){ return arg[idx]; }
4587 }
4588 
4589 /**
4590     Conceptual type that outlines the common properties of all UTF Matchers.
4591
4592     Note: For illustration purposes only, every method
4593     call results in assertion failure.
4594     Use $(LREF utfMatcher) to obtain a concrete matcher
4595     for UTF-8 or UTF-16 encodings.
4596 */
4597 public struct MatcherConcept
4598 {
4599     /**
4600         $(P Perform a semantic equivalent of 2 operations:
4601         decoding a $(CODEPOINT) at front of `inp` and testing if
4602         it belongs to the set of $(CODEPOINTS) of this matcher. )
4603
4604         $(P The effect on `inp` depends on the kind of function called:)
4605
4606         $(P Match. If the codepoint is found in the set then range `inp`
4607         is advanced by its size in $(S_LINK Code unit, code units),
4608         otherwise the range is not modified.)
4609
4610         $(P Skip. The range is always advanced by the size
4611         of the tested $(CODEPOINT) regardless of the result of test.)
4612
4613         $(P Test. The range is left unaffected regardless
4614         of the result of test.)
4615     */
4616     public bool match(Range)(ref Range inp)
4617         if (isRandomAccessRange!Range && is(ElementType!Range : char))
4618     {
4619        assert(false);
4620     }
4621
4622     ///ditto
4623     public bool skip(Range)(ref Range inp)
4624         if (isRandomAccessRange!Range && is(ElementType!Range : char))
4625     {
4626         assert(false);
4627     }
4628
4629     ///ditto
4630     public bool test(Range)(ref Range inp)
4631         if (isRandomAccessRange!Range && is(ElementType!Range : char))
4632     {
4633         assert(false);
4634     }
4635     ///
4636     pure @safe unittest
4637     {
4638         string truth = "2² = 4";
4639         auto m = utfMatcher!char(unicode.Number);
4640         assert(m.match(truth)); // '2' is a number all right
4641         assert(truth == "² = 4"); // skips on match
4642         assert(m.match(truth)); // so is the superscript '2'
4643         assert(!m.match(truth)); // space is not a number
4644         assert(truth == " = 4"); // unaffected on no match
4645         assert(!m.skip(truth)); // same test ...
4646         assert(truth == "= 4"); // but skips a codepoint regardless
4647         assert(!m.test(truth)); // '=' is not a number
4648         assert(truth == "= 4"); // test never affects argument
4649     }
4650
4651     /**
4652         Advanced feature - provide direct access to a subset of matcher based on a
4653         set of known encoding lengths. Lengths are provided in
4654         $(S_LINK Code unit, code units). The sub-matcher then may do fewer
4655         operations per any `test`/`match`.
4656
4657         Use with care as the sub-matcher won't match
4658         any $(CODEPOINTS) that have encoded length that doesn't belong
4659         to the selected set of lengths. Also the sub-matcher object references
4660         the parent matcher and must not be used past the lifetime
4661         of the latter.
4662
4663         Another caveat of using sub-matcher is that skip is not available
4664         precisely because sub-matcher doesn't detect all lengths.
4665     */
4666     @property auto subMatcher(Lengths...)()
4667     {
4668         assert(0);
4669         return this; // never reached; only gives `auto` a return type to infer
4670     }
4671
4672     pure @safe unittest
4673     {
4674         auto m = utfMatcher!char(unicode.Number);
4675         string square = "2²";
4676         // about sub-matchers
4677         assert(!m.subMatcher!(2,3,4).test(square)); // ASCII not covered
4678         assert(m.subMatcher!1.match(square)); // ASCII-only, works
4679         assert(!m.subMatcher!1.test(square)); // unicode '²'
4680         assert(m.subMatcher!(2,3,4).match(square));  //
4681         assert(square == "");
4682         wstring wsquare = "2²";
4683         auto m16 = utfMatcher!wchar(unicode.Number);
4684         // may keep ref, but the original (m16) must be kept alive
4685         auto bmp = m16.subMatcher!1;
4686         assert(bmp.match(wsquare)); // Okay, in basic multilingual plane
4687         assert(bmp.match(wsquare)); // And '²' too
4688     }
4689 }
4690 
4691 /**
4692     Test if `M` is an UTF Matcher for ranges of `C`.
4693 */
4694 public enum isUtfMatcher(M, C) = __traits(compiles, (){
4695     C[] s;
4696     auto d = s.decoder;
4697     M m;
4698     // NOTE(review): these are run-time asserts inside a lambda that is only checked for
4699     assert(is(typeof(m.test(d)) == bool));
4700     static if (is(typeof(m.skip(d)))) // skip is optional (sub-matchers do not provide it)
4701     {
4702         assert(is(typeof(m.skip(d)) == bool));
4703         assert(is(typeof(m.skip(s)) == bool));
4704     }
4705     assert(is(typeof(m.match(s)) == bool)); // NOTE(review): `assert(false)` still compiles, so a failing
4706     assert(is(typeof(m.test(s)) == bool)); // `is(...)` does not make the trait false - `static assert` may be intended
4707 });
4708 
4709 pure @safe unittest
4710 {
4711     alias CharMatcher = typeof(utfMatcher!char(CodepointSet.init)); // concrete UTF-8 matcher type
4712     alias WcharMatcher = typeof(utfMatcher!wchar(CodepointSet.init)); // concrete UTF-16 matcher type
4713     static assert(isUtfMatcher!(CharMatcher, char));
4714     static assert(isUtfMatcher!(CharMatcher, immutable(char))); // element qualifiers must not matter
4715     static assert(isUtfMatcher!(WcharMatcher, wchar));
4716     static assert(isUtfMatcher!(WcharMatcher, immutable(wchar)));
4717 }
4718 
4719 enum Mode { // what a matcher's lookup does to its input range
4720     alwaysSkip, // pop the code point whatever the result ("skip")
4721     neverSkip, // leave the range as is ("test")
4722     skipOnMatch // pop the code point only when it matched ("match")
4723 }
4724 
4725 mixin template ForwardStrings() // lets a matcher accept plain arrays by forwarding to its range overloads
4726 {
4727     private bool fwdStr(string fn, C)(ref C[] str) const @trusted
4728     {
4729         import std.utf : byCodeUnit;
4730         alias type = typeof(byCodeUnit(str)); // the code-unit range type wrapping a C[]
4731         return mixin(fn~"(*cast(type*)&str)"); // calls method `fn` on `str` reinterpreted in place as `type`; NOTE(review): relies on `type` sharing the array's layout
4732     }
4733 }
4734 
4735 template Utf8Matcher()
4736 {
4737     enum validSize(int sz) = sz >= 1 && sz <= 4; // a UTF-8 sequence is 1 to 4 code units long
4738
4739     void badEncoding() pure @safe // always throws; shared failure path of the UTF-8 matcher
4740     {
4741         import std.utf : UTFException;
4742         throw new UTFException("Invalid UTF-8 sequence");
4743     }
4744 
4745     //for 1-stage ASCII; every spec below is (value type, key type, one predicate per level)
4746     alias AsciiSpec = AliasSeq!(bool, char, clamp!7);
4747     //for 2-stage lookup of 2 byte UTF-8 sequences
4748     alias Utf8Spec2 = AliasSeq!(bool, char[2],
4749         clampIdx!(0, 5), clampIdx!(1, 6)); // lead byte: 5 bits, continuation byte: 6 bits
4750     //ditto for 3 byte
4751     alias Utf8Spec3 = AliasSeq!(bool, char[3],
4752         clampIdx!(0, 4),
4753         clampIdx!(1, 6),
4754         clampIdx!(2, 6)
4755     );
4756     //ditto for 4 byte
4757     alias Utf8Spec4 = AliasSeq!(bool, char[4],
4758         clampIdx!(0, 3), clampIdx!(1, 6),
4759         clampIdx!(2, 6), clampIdx!(3, 6)
4760     );
4761     alias Tables = AliasSeq!( // Trie types, ordered by encoded length
4762         typeof(TrieBuilder!(AsciiSpec)(false).build()),
4763         typeof(TrieBuilder!(Utf8Spec2)(false).build()),
4764         typeof(TrieBuilder!(Utf8Spec3)(false).build()),
4765         typeof(TrieBuilder!(Utf8Spec4)(false).build())
4766     );
4767     alias Table(int size) = Tables[size-1]; // Trie type for sequences of `size` code units
4768
4769     enum leadMask(size_t size) = (cast(size_t) 1<<(7 - size))-1; // low (7 - size) bits: 0x1F, 0x0F, 0x07 for size 2, 3, 4
4770     enum encMask(size_t size) = ((1 << size)-1)<<(8-size); // top `size` bits set: 0xC0, 0xE0, 0xF0 for size 2, 3, 4
4771 
4772     char truncate()(char ch) pure @safe
4773     {
4774         // 0x80 .. 0xBF maps to 0x00 .. 0x3F; any other unit lands at 0x40 or
4775         // above, because `char` arithmetic wraps around
4776         ch -= 0x80;
4777         if (ch >= 0x40)
4778         {
4779             // badEncoding always throws; the return only satisfies the compiler
4780             badEncoding();
4781             return cast(char) 0;
4782         }
4783         return ch;
4784     }
4785 
4786     static auto encode(size_t sz)(dchar ch)
4787         if (sz > 1)
4788     {
4789         import std.utf : encodeUTF = encode;
4790         char[4] units;
4791         encodeUTF(units, ch);
4792         units[0] &= leadMask!sz; // clear the bits of the lead unit outside leadMask
4793         foreach (i; 1 .. sz)
4794             units[i] &= 0x3f; //keep 6 lower bits
4795         char[sz] key;
4796         key[] = units[0 .. sz];
4797         return key;
4798     }
4799 
4800     auto build(Set)(Set set)
4801     {
4802         import std.algorithm.iteration : map;
4803         // one sub-set and one Trie per UTF-8 encoded length (1 to 4 code units)
4804         auto part1 = set & unicode.ASCII;
4805         auto part2 = set & CodepointSet(0x80, 0x800);
4806         auto part3 = set & CodepointSet(0x800, 0x1_0000);
4807         auto part4 = set & CodepointSet(0x1_0000, lastDchar+1);
4808         auto trie1 = part1.byCodepoint.map!(cp=>cast(char) cp).buildTrie!(AsciiSpec);
4809         auto trie2 = part2.byCodepoint.map!(cp=>encode!2(cp)).buildTrie!(Utf8Spec2);
4810         auto trie3 = part3.byCodepoint.map!(cp=>encode!3(cp)).buildTrie!(Utf8Spec3);
4811         auto trie4 = part4.byCodepoint.map!(cp=>encode!4(cp)).buildTrie!(Utf8Spec4);
4812         return Impl!(1,2,3,4)(trie1, trie2, trie3, trie4);
4813     }
4814 
4815     // Bootstrap UTF-8 static matcher interface
4816     // from 3 primitives: tab!(size), lookup and Sizes
4817     mixin template DefMatcher()
4818     {
4819         import std.format : format;
4820         import std.meta : Erase, staticIndexOf;
4821         enum hasASCII = staticIndexOf!(1, Sizes) >= 0; // is length 1 among the handled sizes?
4822         alias UniSizes = Erase!(1, Sizes); // the handled sizes without 1
4823
4824         //generate dispatch code sequence for unicode parts: an if/else chain over UniSizes
4825         static auto genDispatch()
4826         {
4827             string code;
4828             foreach (size; UniSizes)
4829                 code ~= format(q{
4830                     if ((ch & ~leadMask!%d) == encMask!(%d))
4831                         return lookup!(%d, mode)(inp);
4832                     else
4833                 }, size, size, size);
4834             static if (Sizes.length == 4) //covers all code unit cases
4835                 code ~= "{ badEncoding(); return false; }";
4836             else
4837                 code ~= "return false;"; //may be just fine but not covered
4838             return code;
4839         }
4840         enum dispatch = genDispatch(); // mixed in below; it refers to the locals `ch`, `inp` and `mode` by name
4841
4842         public bool match(Range)(ref Range inp) const
4843             if (isRandomAccessRange!Range && is(ElementType!Range : char) &&
4844                 !isDynamicArray!Range)
4845         {
4846             enum mode = Mode.skipOnMatch;
4847             assert(!inp.empty);
4848             immutable ch = inp[0];
4849             static if (hasASCII)
4850             {
4851                 if (ch < 0x80) // single-unit fast path, answered by the first table
4852                 {
4853                     immutable r = tab!1[ch];
4854                     if (r)
4855                         inp.popFront(); // advance on match only
4856                     return r;
4857                 }
4858                 else
4859                     mixin(dispatch);
4860             }
4861             else
4862                 mixin(dispatch);
4863         }
4864
4865         static if (Sizes.length == 4) // can skip iff can detect all encodings
4866         {
4867             public bool skip(Range)(ref Range inp) const
4868                 if (isRandomAccessRange!Range && is(ElementType!Range : char) &&
4869                     !isDynamicArray!Range)
4870             {
4871                 enum mode = Mode.alwaysSkip;
4872                 assert(!inp.empty);
4873                 auto ch = inp[0];
4874                 static if (hasASCII)
4875                 {
4876                     if (ch < 0x80)
4877                     {
4878                         inp.popFront(); // advance first; the saved `ch` still indexes the table
4879                         return tab!1[ch];
4880                     }
4881                     else
4882                         mixin(dispatch);
4883                 }
4884                 else
4885                     mixin(dispatch);
4886             }
4887         }
4888
4889         public bool test(Range)(ref Range inp) const
4890             if (isRandomAccessRange!Range && is(ElementType!Range : char) &&
4891                 !isDynamicArray!Range)
4892         {
4893             enum mode = Mode.neverSkip;
4894             assert(!inp.empty);
4895             auto ch = inp[0];
4896
4897             static if (hasASCII)
4898             {
4899                 if (ch < 0x80)
4900                     return tab!1[ch]; // plain table read, `inp` is not advanced
4901                 else
4902                     mixin(dispatch);
4903             }
4904             else
4905                 mixin(dispatch);
4906         }
4907
4908         bool match(C)(ref C[] str) const // array overloads: forwarded through fwdStr of ForwardStrings
4909             if (isSomeChar!C)
4910         {
4911             return fwdStr!"match"(str);
4912         }
4913
4914         bool skip(C)(ref C[] str) const
4915             if (isSomeChar!C)
4916         {
4917             return fwdStr!"skip"(str);
4918         }
4919
4920         bool test(C)(ref C[] str) const
4921             if (isSomeChar!C)
4922         {
4923             return fwdStr!"test"(str);
4924         }
4925
4926         mixin ForwardStrings;
4927     }
4928 
4929     struct Impl(Sizes...)
4930     {
4931         import std.meta : allSatisfy, staticMap;
4932         static assert(allSatisfy!(validSize, Sizes),
4933             "Only lengths of 1, 2, 3 and 4 code unit are possible for UTF-8");
4934     private:
4935         //pick tables for chosen sizes
4936         alias OurTabs = staticMap!(Table, Sizes);
4937         OurTabs tables;
4938         mixin DefMatcher;
4939         //static dispatch helper UTF size ==> table; NOTE(review): indexes by size - 1, which presumes Sizes is 1, 2, ... in order
4940         alias tab(int i) = tables[i - 1];
4941
4942         package(std) @property CherryPick!(Impl, SizesToPick) subMatcher(SizesToPick...)()
4943         {
4944             return CherryPick!(Impl, SizesToPick)(&this); // the sub-matcher keeps a pointer to this matcher
4945         }
4946
4947         bool lookup(int size, Mode mode, Range)(ref Range inp) const
4948         {
4949             import std.range : popFrontN;
4950             if (inp.length < size) // sequence is cut short
4951             {
4952                 badEncoding();
4953                 return false;
4954             }
4955             char[size] needle = void;
4956             needle[0] = leadMask!size & inp[0]; // payload bits of the lead unit
4957             static foreach (i; 1 .. size)
4958             {
4959                 needle[i] = truncate(inp[i]); // throws unless inp[i] is a continuation unit
4960             }
4961             //overlong encoding checks
4962             static if (size == 2)
4963             {
4964                 //0x80-0x7FF
4965                 //got 6 bits in needle[1], must use at least 8 bits
4966                 //i.e. needle[0] must be at least 2
4967                 if (needle[0] < 2) badEncoding();
4968             }
4969             else static if (size == 3)
4970             {
4971                 //0x800-0xFFFF
4972                 //got 6 bits in needle[2], must use at least 12bits
4973                 //must use 6 bits in needle[1] or anything in needle[0]
4974                 if (needle[0] == 0 && needle[1] < 0x20) badEncoding();
4975             }
4976             else static if (size == 4)
4977             {
4978                 //0x1_0000-0x10_FFFF
4979                 //got 2x6=12 bits in needle[2 .. 3] must use at least 17bits
4980                 //must use 5 bits (or above) in needle[1] or anything in needle[0]
4981                 if (needle[0] == 0 && needle[1] < 0x10) badEncoding();
4982             }
4983             static if (mode == Mode.alwaysSkip)
4984             {
4985                 inp.popFrontN(size);
4986                 return tab!size[needle];
4987             }
4988             else static if (mode == Mode.neverSkip)
4989             {
4990                 return tab!size[needle];
4991             }
4992             else
4993             {
4994                 static assert(mode == Mode.skipOnMatch);
4995
4996                 if (tab!size[needle])
4997                 {
4998                     inp.popFrontN(size);
4999                     return true;
5000                 }
5001                 else
5002                     return false;
5003             }
5004         }
5005     }
5006 
5007     struct CherryPick(I, Sizes...) // matcher restricted to `Sizes` that borrows the tables of a parent matcher `I`
5008     {
5009         import std.meta : allSatisfy;
5010         static assert(allSatisfy!(validSize, Sizes),
5011             "Only lengths of 1, 2, 3 and 4 code unit are possible for UTF-8");
5012     private:
5013         I* m; // parent matcher; not owned, so it must outlive this object
5014         @property auto tab(int i)() const { return m.tables[i - 1]; } // the parent's table at index i - 1
5015         bool lookup(int size, Mode mode, Range)(ref Range inp) const
5016         {
5017             return m.lookup!(size, mode)(inp); // decoding and validation stay in the parent
5018         }
5019         mixin DefMatcher;
5020     }
5021 }
5022 
5023 template Utf16Matcher()
5024 {
5025     enum validSize(int sz) = sz >= 1 && sz <= 2; // a UTF-16 sequence is 1 or 2 code units long
5026
5027     void badEncoding() pure @safe // always throws; shared failure path of the UTF-16 matcher
5028     {
5029         import std.utf : UTFException;
5030         throw new UTFException("Invalid UTF-16 sequence");
5031     }
5032 
5033     // 1-stage ASCII; every spec below is (value type, key type, one predicate per level)
5034     alias AsciiSpec = AliasSeq!(bool, wchar, clamp!7);
5035     //2-stage BMP
5036     alias BmpSpec = AliasSeq!(bool, wchar, sliceBits!(7, 16), sliceBits!(0, 7));
5037     //4-stage - full Unicode
5038     //assume that 0xD800 & 0xDC00 bits are cleared
5039     //thus leaving 10 bit per wchar to worry about
5040     alias UniSpec = AliasSeq!(bool, wchar[2],
5041         assumeSize!(x=>x[0]>>4, 6), assumeSize!(x=>x[0]&0xf, 4), // first unit: upper 6 bits, then lower 4
5042         assumeSize!(x=>x[1]>>6, 4), assumeSize!(x=>x[1]&0x3f, 6), // second unit: upper 4 bits, then lower 6
5043     );
5044     alias Ascii = typeof(TrieBuilder!(AsciiSpec)(false).build());
5045     alias Bmp = typeof(TrieBuilder!(BmpSpec)(false).build());
5046     alias Uni = typeof(TrieBuilder!(UniSpec)(false).build());
5047 
5048     auto encode2(dchar ch) // splits a supplementary code point into two 10-bit halves for UniSpec
5049     {
5050         ch -= 0x1_0000;
5051         assert(ch <= 0xF_FFFF); // only 0x1_0000 .. 0x10_FFFF is encodable as a pair
5052         wchar[2] ret;
5053         //do not put surrogate bits, they are sliced off
5054         ret[0] = cast(wchar)(ch >> 10);
5055         ret[1] = (ch & 0x3FF); // low 10 bits only, as UniSpec declares (the old 0xFFF mask kept 12)
5056         return ret;
5057     }
5058 
5059     auto build(Set)(Set set)
5060     {
5061         import std.algorithm.iteration : map;
5062         auto ascii = set & unicode.ASCII;
5063         auto bmp = (set & CodepointSet.fromIntervals(0x80, 0xFFFF+1))
5064             - CodepointSet.fromIntervals(0xD800, 0xDFFF+1);
5065         // supplementary planes only: surrogate code points in `set` cannot be fed to encode2
5066         auto other = set & CodepointSet.fromIntervals(0x1_0000, lastDchar+1);
5067         auto asciiT = ascii.byCodepoint.map!(x=>cast(char) x).buildTrie!(AsciiSpec);
5068         auto bmpT = bmp.byCodepoint.map!(x=>cast(wchar) x).buildTrie!(BmpSpec);
5069         auto otherT = other.byCodepoint.map!(x=>encode2(x)).buildTrie!(UniSpec);
5070         return Impl!(1,2)(asciiT, bmpT, otherT);
5071     }
5072 
5073     //bootstrap full UTF-16 matcher interface from
5074     //sizeFlags, lookupUni and ascii
5075     mixin template DefMatcher()
5076     {
5077         public bool match(Range)(ref Range inp) const
5078             if (isRandomAccessRange!Range && is(ElementType!Range : wchar) &&
5079                 !isDynamicArray!Range)
5080         {
5081             // "match": `inp` is advanced only when the code point at its front
5082             // belongs to the set
5083             enum mode = Mode.skipOnMatch;
5084             assert(!inp.empty);
5085             immutable first = inp[0];
5086             static if (sizeFlags & 1)
5087             {
5088                 // units below 0x80 are answered by the `ascii` table directly
5089                 if (first < 0x80)
5090                 {
5091                     if (!ascii[first])
5092                         return false; // no match: `inp` stays as it was
5093                     inp.popFront();
5094                     return true;
5095                 }
5096             }
5097             // everything else (or everything, when the static if above is off)
5098             // is handled by lookupUni
5099             return lookupUni!mode(inp);
5100         }
5101 
5102         static if (Sizes.length == 2)
5103         {
5104             public bool skip(Range)(ref Range inp) const
5105                 if (isRandomAccessRange!Range && is(ElementType!Range : wchar) &&
5106                     !isDynamicArray!Range)
5107             {
5108                 enum mode = Mode.alwaysSkip;
5109                 assert(!inp.empty);
5110                 immutable ch = inp[0];
5111                 static if (sizeFlags & 1)
5112                 {
5113                     if (ch < 0x80)
5114                     {
5115                         inp.popFront();
5116                         return ascii[ch];
5117                     }
5118                     else
5119                         return lookupUni!mode(inp);
5120                 }
5121                 else
5122                     return lookupUni!mode(inp);
5123             }
5124         }
5125 
5126         public bool test(Range)(ref Range inp) const
5127             if (isRandomAccessRange!Range && is(ElementType!Range : wchar) &&
5128                 !isDynamicArray!Range)
5129         {
5130             enum mode = Mode.neverSkip;
5131             assert(!inp.empty);
5132             auto ch = inp[0];
5133             static if (sizeFlags & 1)
5134                 return ch < 0x80 ? ascii[ch] : lookupUni!mode(inp);
5135             else
5136                 return lookupUni!mode(inp);
5137         }
5138 
5139         bool match(C)(ref C[] str) const
5140             if (isSomeChar!C)
5141         {
5142             return fwdStr!"match"(str);
5143         }
5144 
5145         bool skip(C)(ref C[] str) const
5146             if (isSomeChar!C)
5147         {
5148             return fwdStr!"skip"(str);
5149         }
5150 
5151         bool test(C)(ref C[] str) const
5152             if (isSomeChar!C)
5153         {
5154             return fwdStr!"test"(str);
5155         }
5156 
5157         mixin ForwardStrings; //dispatch strings to range versions
5158     }
5159 
5160     struct Impl(Sizes...)
5161         if (Sizes.length >= 1 && Sizes.length <= 2)
5162     {
5163     private:
5164         import std.meta : allSatisfy;
5165         static assert(allSatisfy!(validSize, Sizes),
5166             "Only lengths of 1 and 2 code units are possible in UTF-16");
5167         static if (Sizes.length > 1)
5168             enum sizeFlags = Sizes[0] | Sizes[1];
5169         else
5170             enum sizeFlags = Sizes[0];
5171 
5172         static if (sizeFlags & 1)
5173         {
5174             Ascii ascii;
5175             Bmp bmp;
5176         }
5177         static if (sizeFlags & 2)
5178         {
5179             Uni uni;
5180         }
5181         mixin DefMatcher;
5182 
5183         package(std) @property CherryPick!(Impl, SizesToPick) subMatcher(SizesToPick...)()
5184         {
5185             return CherryPick!(Impl, SizesToPick)(&this);
5186         }
5187 
5188         bool lookupUni(Mode mode, Range)(ref Range inp) const
5189         {
5190             wchar x = cast(wchar)(inp[0] - 0xD800);
5191             //not a high surrogate
5192             if (x > 0x3FF)
5193             {
5194                 //low surrogate
5195                 if (x <= 0x7FF) badEncoding();
5196                 static if (sizeFlags & 1)
5197                 {
5198                     auto ch = inp[0];
5199                     static if (mode == Mode.alwaysSkip)
5200                         inp.popFront();
5201                     static if (mode == Mode.skipOnMatch)
5202                     {
5203                         if (bmp[ch])
5204                         {
5205                             inp.popFront();
5206                             return true;
5207                         }
5208                         else
5209                             return false;
5210                     }
5211                     else
5212                         return bmp[ch];
5213                 }
5214                 else //skip is not available for sub-matchers, so just false
5215                     return false;
5216             }
5217             else
5218             {
5219                 import std.range : popFrontN;
5220                 static if (sizeFlags & 2)
5221                 {
5222                     if (inp.length < 2)
5223                         badEncoding();
5224                     wchar y = cast(wchar)(inp[1] - 0xDC00);
5225                     //not a low surrogate
5226                     if (y > 0x3FF)
5227                         badEncoding();
5228                     wchar[2] needle = [inp[0] & 0x3ff, inp[1] & 0x3ff];
5229                     static if (mode == Mode.alwaysSkip)
5230                         inp.popFrontN(2);
5231                     static if (mode == Mode.skipOnMatch)
5232                     {
5233                         if (uni[needle])
5234                         {
5235                             inp.popFrontN(2);
5236                             return true;
5237                         }
5238                         else
5239                             return false;
5240                     }
5241                     else
5242                         return uni[needle];
5243                 }
5244                 else //ditto
5245                     return false;
5246             }
5247         }
5248     }
5249 
5250     struct CherryPick(I, Sizes...)
5251         if (Sizes.length >= 1 && Sizes.length <= 2)
5252     {
5253     private:
5254         import std.meta : allSatisfy;
5255         I* m;
5256         enum sizeFlags = I.sizeFlags;
5257 
5258         static if (sizeFlags & 1)
5259         {
5260             @property auto ascii()() const { return m.ascii; }
5261         }
5262 
5263         bool lookupUni(Mode mode, Range)(ref Range inp) const
5264         {
5265             return m.lookupUni!mode(inp);
5266         }
5267         mixin DefMatcher;
5268         static assert(allSatisfy!(validSize, Sizes),
5269             "Only lengths of 1 and 2 code units are possible in UTF-16");
5270     }
5271 }
5272 
// Shorthand for building a UTF-8 matcher out of `set`.
private auto utf8Matcher(Set)(Set set)
{
    alias Builder = Utf8Matcher!();
    return Builder.build(set);
}
5277 
// Shorthand for building a UTF-16 matcher out of `set`.
private auto utf16Matcher(Set)(Set set)
{
    alias Builder = Utf16Matcher!();
    return Builder.build(set);
}
5282 
/**
    Constructs a matcher object
    to classify $(CODEPOINTS) from the `set` for encoding
    that has `Char` as code unit.

    See $(LREF MatcherConcept) for API outline.

    Params:
        Char = code unit type of the encoding, `char` (UTF-8) or
               `wchar` (UTF-16); `dchar` is rejected at compile time
        set = set of $(CODEPOINTS) the matcher should recognize

    Returns:
        A UTF-8 or UTF-16 matcher built from `set`.
*/
public auto utfMatcher(Char, Set)(Set set)
if (isCodepointSet!Set)
{
    static if (is(Char : char))
        return utf8Matcher(set);
    else static if (is(Char : wchar))
        return utf16Matcher(set);
    else static if (is(Char : dchar))
        // keep the message on one line: a literal spanning two source lines
        // would carry the line break and indentation into the diagnostic
        static assert(false, "UTF-32 needs no decoding,"
            ~ " and thus is not supported by utfMatcher");
    else
        static assert(false, "Only character types 'char' and 'wchar' are allowed");
}
5303 
5304 
// Wraps an array of code units together with a running index, so that
// moving forward only bumps the index instead of re-slicing the array.
package(std) auto decoder(C)(C[] s, size_t offset=0)
if (is(C : char) || is(C : wchar))
{
    static struct Decoder
    {
    pure nothrow:
        C[] str;    // underlying code units; shrinks from the back on popBack
        size_t idx; // position of the current front within str

        @property C front()
        {
            return str[idx];
        }

        @property C back()
        {
            return str[$ - 1];
        }

        void popFront()
        {
            ++idx;
        }

        void popBack()
        {
            str = str[0 .. $ - 1];
        }

        void popFrontN(size_t n)
        {
            idx += n;
        }

        @property bool empty()
        {
            return str.length == idx;
        }

        @property auto save()
        {
            return this;
        }

        // indices are relative to the current front
        auto opIndex(size_t i)
        {
            return str[idx + i];
        }

        @property size_t length()
        {
            return str.length - idx;
        }

        alias opDollar = length;

        // the slice keeps the prefix before idx and starts a units further in
        auto opSlice(size_t a, size_t b)
        {
            return Decoder(str[0 .. idx + b], idx + a);
        }
    }
    static assert(isRandomAccessRange!Decoder);
    static assert(is(ElementType!Decoder : C));
    return Decoder(s, offset);
}
5330 
pure @safe unittest
{
    string sample = "hi! ﾈемног砀 текста";
    auto cur = sample.decoder;
    auto letters =  utf8Matcher(unicode.Letter);
    auto oneByte = letters.subMatcher!(1);
    auto multiByte = letters.subMatcher!(2,3,4);

    // 'h' and 'i': ASCII letters, each skip advances by one code unit
    foreach (expectedIdx; 1 .. 3)
    {
        assert(oneByte.test(cur));
        assert(!multiByte.match(cur));
        assert(letters.skip(cur));
        assert(cur.idx == expectedIdx);
    }

    // '!' is not a letter; skip still moves past it
    assert(!oneByte.match(cur));
    assert(!letters.test(cur));
    assert(!letters.skip(cur));
    assert(cur.idx == 3);

    // same for the space
    assert(!oneByte.test(cur));
    assert(!letters.test(cur));
    assert(!letters.skip(cur));
    assert(cur.idx == 4);

    // seven multi-byte letters follow
    assert(letters.test(cur));
    foreach (_; 0 .. 7)
    {
        assert(!oneByte.test(cur));
        assert(multiByte.test(cur));
        assert(letters.skip(cur));
    }
    assert(!letters.test(cur));
    assert(!letters.skip(cur));

    //the same with match where applicable
    cur = sample.decoder;
    assert(letters.match(cur));
    assert(cur.idx == 1);
    assert(letters.match(cur));
    assert(cur.idx == 2);
    // a failed match leaves the position untouched
    assert(!letters.match(cur));
    assert(cur.idx == 2);
    assert(!letters.skip(cur));
    assert(!letters.skip(cur));

    foreach (_; 0 .. 7)
    {
        assert(!oneByte.test(cur));
        assert(letters.test(cur));
        assert(letters.match(cur));
    }
    immutable mark = cur.idx;
    assert(!letters.match(cur));
    assert(cur.idx == mark);
}
5394 
pure @system unittest
{
    import std.range : stride;
    // Runs test, match and (where the matcher has it) skip on the same
    // position and checks that they agree; `r` is rewound after each call.
    // Returns the common verdict.
    static bool testAll(Matcher, Range)(ref Matcher m, ref Range r) @safe
    {
        bool t = m.test(r);
        auto save = r.idx;
        assert(t == m.match(r));
        assert(r.idx == save || t); //either no change or was match
        r.idx = save;
        static if (is(typeof(m.skip(r))))
        {
            assert(t == m.skip(r));
            assert(r.idx != save); //always changed
            r.idx = save;
        }
        return t;
    }
    auto utf16 = utfMatcher!wchar(unicode.L);
    auto bmp = utf16.subMatcher!1;
    // two code units, i.e. surrogate pairs (was a copy of the line above)
    auto nonBmp = utf16.subMatcher!2;
    auto utf8 = utfMatcher!char(unicode.L);
    auto ascii = utf8.subMatcher!1;
    auto uni2 = utf8.subMatcher!2;
    auto uni3 = utf8.subMatcher!3;
    auto uni24 = utf8.subMatcher!(2,4);
    // every third letter keeps the run time down
    foreach (ch; unicode.L.byCodepoint.stride(3))
    {
        import std.utf : encode;
        char[4] buf;
        wchar[2] buf16;
        auto len = encode(buf, ch);
        auto len16 = encode(buf16, ch);
        auto c8 = buf[0 .. len].decoder;
        auto c16 = buf16[0 .. len16].decoder;
        assert(testAll(utf16, c16));
        assert(testAll(bmp, c16) || len16 != 1);
        assert(testAll(nonBmp, c16) || len16 != 2);

        assert(testAll(utf8, c8));

        //submatchers return false on out of their domain
        assert(testAll(ascii, c8) || len != 1);
        assert(testAll(uni2, c8) || len != 2);
        assert(testAll(uni3, c8) || len != 3);
        assert(testAll(uni24, c8) || (len != 2 && len != 4));
    }
}
5443 
// cover decode fail cases of Matcher
pure @safe unittest
{
    import std.algorithm.iteration : map;
    import std.exception : collectException;
    import std.format : format;
    auto utf16 = utfMatcher!wchar(unicode.L);
    auto utf8 = utfMatcher!char(unicode.L);
    //decode failure cases UTF-8
    // NOTE(review): `\0x00` below is the escape `\0` followed by the three
    // characters "x00", not a single `\x00` byte. The literals are kept
    // exactly as they are, since changing the bytes could change which of
    // these inputs are rejected.
    alias fails8 = AliasSeq!("\xC1", "\x80\x00","\xC0\x00", "\xCF\x79",
        "\xFF\x00\0x00\0x00\x00", "\xC0\0x80\0x80\x80", "\x80\0x00\0x00\x00",
        "\xCF\x00\0x00\0x00\x00");
    foreach (msg; fails8)
    {
        // on failure the assert message shows the offending bytes in hex
        assert(collectException((){
            auto s = msg;
            utf8.test(s);
        }()), format("%( %2x %)", cast(immutable(ubyte)[]) msg));
    }
    //decode failure cases UTF-16
    // a lone high surrogate and a lone low surrogate
    alias fails16 = AliasSeq!([0xD811], [0xDC02]);
    foreach (msg; fails16)
    {
        assert(collectException((){
            auto s = msg.map!(x => cast(wchar) x);
            utf16.test(s);
        }()));
    }
}
5474 
/++
    Convenience function to construct optimal configurations for
    packed Trie from any `set` of $(CODEPOINTS).

    The parameter `level` indicates the number of trie levels to use,
    allowed values are: 1, 2, 3 or 4. Levels represent different trade-offs
    speed-size wise.

    $(P Level 1 is fastest and the most memory hungry (a bit array). )
    $(P Level 4 is the slowest and has the smallest footprint. )

    See the $(S_LINK Synopsis, Synopsis) section for example.

    Note:
    Level 4 stays very practical (being faster and more predictable)
    compared to using direct lookup on the `set` itself.

    Params:
        level = number of trie levels, from 1 to 4; any other value is
                rejected at compile time
        set = set of $(CODEPOINTS) to build the trie from

    Returns:
        The result of `codepointSetTrie` with the bit split chosen for `level`.
+/
public auto toTrie(size_t level, Set)(Set set)
if (isCodepointSet!Set)
{
    // the bit widths of every configuration add up to 21
    static if (level == 1)
        return codepointSetTrie!(21)(set);
    else static if (level == 2)
        return codepointSetTrie!(10, 11)(set);
    else static if (level == 3)
        return codepointSetTrie!(8, 5, 8)(set);
    else static if (level == 4)
        return codepointSetTrie!(6, 4, 4, 7)(set);
    else
        // also reached for level 0, so do not claim "levels > 4" only
        static assert(false,
            "Sorry, toTrie supports only levels 1 to 4, use codepointSetTrie directly");
}
5509 
/**
    $(P Builds a `Trie` with typically optimal speed-size trade-off
    and wraps it into a delegate of the following type:
    $(D bool delegate(dchar ch)). )

    $(P The result is a 'tester' predicate that can be handed to
    algorithms such as std.algorithm.find that take unary predicates. )

    See the $(S_LINK Synopsis, Synopsis) section for example.
*/
public auto toDelegate(Set)(Set set)
if (isCodepointSet!Set)
{
    // 3 levels: very small and almost as fast as 2-level (due to CPU caches?)
    auto trie = toTrie!3(set);
    return (dchar ch) => trie[ch];
}
5527 
/**
    $(P Opaque wrapper around unsigned built-in integers and
    code unit (char/wchar/dchar) types.
    Parameter `sz` states that the wrapped value stays within
    the range of [0, 2^^sz$(RPAREN). Knowing this, containers
    such as trie can store the value in fewer bits. )

    Note:
    $(P The $(D BitPacked!(T, sz)) is implicitly convertible to `T`
    but not vice versa. Users have to ensure the value fits in
    the range required and use the `cast`
    operator to perform the conversion.)
*/
struct BitPacked(T, size_t sz)
if (isIntegral!T || is(T:dchar))
{
    T _value;
    alias _value this;
    enum bitSize = sz;
}
5549 
/*
    Yields the number of bits needed for the single argument, which is
    either a type or a functor:
    - anything exposing a `bitSize` usable as size_t reports that value;
    - otherwise a functor returning some BitPacked!(U, bits) reports `bits`;
    - everything else falls back to its full size, `sizeof * 8`.
*/
template bitSizeOf(Args...)
if (Args.length == 1)
{
    import std.traits : ReturnType;
    alias Arg = Args[0];
    static if (__traits(compiles, { size_t probe = Arg.bitSize; }))
        enum bitSizeOf = Arg.bitSize;
    else static if (is(ReturnType!Arg == BitPacked!(U, bits), U, size_t bits))
        enum bitSizeOf = bits;
    else
        enum bitSizeOf = 8 * Arg.sizeof;
}
5573 
/**
    `true` exactly when `T` is an instantiation of
    $(LREF BitPacked)!(U, x), which makes it suitable for packing.
*/
template isBitPacked(T)
{
    enum isBitPacked = is(T == BitPacked!(U, bits), U, size_t bits);
}
5585 
/**
    Unwraps $(LREF BitPacked)!(U, x) to its value type `U`;
    any other type `T` is passed through unchanged.
*/
template TypeOfBitPacked(T)
{
    static if (is(T == BitPacked!(U, bits), U, size_t bits))
    {
        alias TypeOfBitPacked = U;
    }
    else
    {
        alias TypeOfBitPacked = T;
    }
}
5597 
/*
    Wrapper used when defining custom data structures from the `Trie` template.
    It tags the unary function `Fn` with the promise that whatever it returns
    fits into `bits` bits; calling the wrapper simply calls `Fn`.
*/
struct assumeSize(alias Fn, size_t bits)
{
    static auto ref opCall(Arg)(auto ref Arg value)
    {
        return Fn(value);
    }

    enum bitSize = bits;
}
5611 
/*
    Helper that extracts the bits [from, to) of an unsigned integral value.
    It exposes `bitSize` just like assumeSize does, so it can be used
    directly as a level of the `Trie` template.
*/
struct sliceBits(size_t from, size_t to)
{
    //for now bypass assumeSize, DMD has trouble inlining it
    enum bitSize = to - from;

    static auto opCall(T)(T x)
    out(slice)
    {
        // the extracted value must fit into bitSize bits
        assert(slice < (1 << bitSize));
    }
    do
    {
        static assert(from < to);
        enum mask = (1 << bitSize) - 1;
        static if (from == 0)
            return x & mask;
        else
            return (x >> from) & mask;
    }
}
5636 
// Lowest byte (bits 0 .. 7) of x.
@safe pure nothrow @nogc uint low_8(uint x)
{
    return x & 0xFF;
}

// Second-lowest byte (bits 8 .. 15) of x.
@safe pure nothrow @nogc uint midlow_8(uint x)
{
    return (x >> 8) & 0xFF;
}
// Both byte extractors tagged as yielding 8-bit values.
alias lo8 = assumeSize!(low_8, 8),
      mlo8 = assumeSize!(midlow_8, 8);
5641 
@safe pure nothrow @nogc unittest
{
    // a BitPacked type, a sliceBits range and an assumeSize wrapper
    static assert(bitSizeOf!(BitPacked!(uint, 2)) == 2);
    static assert(bitSizeOf!(sliceBits!(4, 7)) == 3);
    static assert(bitSizeOf!lo8 == 8);
}
5648 
// Compile-time sequence of the values start, start+1, ... end-1
// (empty unless start < end).
template Sequence(size_t start, size_t end)
{
    static if (start >= end)
    {
        alias Sequence = AliasSeq!();
    }
    else
    {
        alias Sequence = AliasSeq!(start, Sequence!(start + 1, end));
    }
}
5656 
//---- TRIE TESTS ----
@system unittest
{
    import std.algorithm.iteration : map;
    import std.algorithm.sorting : sort;
    import std.array : array;
    import std.conv : text, to;
    import std.range : iota;

    // Prints the memory footprint of a trie level by level.
    // Does nothing unless compiled with version std_uni_stats.
    static trieStats(TRIE)(TRIE t)
    {
        version (std_uni_stats)
        {
            import std.stdio : writefln, writeln;
            writeln("---TRIE FOOTPRINT STATS---");
            static foreach (i; 0 .. t.table.dim)
            {
                writefln("lvl%s = %s bytes;  %s pages"
                         , i, t.bytes!i, t.pages!i);
            }
            writefln("TOTAL: %s bytes", t.bytes);
            version (none)
            {
                writeln("INDEX (excluding value level):");
                static foreach (i; 0 .. t.table.dim-1)
                    writeln(t.table.slice!(i)[0 .. t.table.length!i]);
            }
            writeln("---------------------------");
        }
    }
    //@@@BUG link failure, lambdas not found by linker somehow (in case of trie2)
    // alias lo8   = assumeSize!(8, function (uint x) { return x&0xFF; });
    // alias next8 = assumeSize!(7, function (uint x) { return (x&0x7F00)>>8; });
    alias Set = CodepointSet;

    // one level: a simple bool array over [A, Z) and [a, z)
    auto letters = Set('A','Z','a','z');
    auto flat = buildTrie!(bool, uint, 256, lo8)(letters.byInterval);
    foreach (int c; 'a' .. 'z')
        assert(flat[c]);
    foreach (int c; 'A' .. 'Z')
        assert(flat[c]);
    foreach (int c; 0 .. 'A')
        assert(!flat[c]);
    foreach (int c; 'Z' .. 'a')
        assert(!flat[c]);
    trieStats(flat);

    // two levels
    auto redundant2 = Set(
        1, 18, 256+2, 256+111, 512+1, 512+18, 768+2, 768+111);
    auto trie2 = buildTrie!(bool, uint, 1024, mlo8, lo8)(redundant2.byInterval);
    trieStats(trie2);
    foreach (cp; redundant2.byCodepoint)
        assert(trie2[cp], text(cast(uint) cp, " - ", trie2[cp]));
    foreach (i; 0 .. 1024)
        assert(trie2[i] == (i in redundant2));

    // three levels, sliced with sliceBits
    auto redundant3 = Set(
          2,    4,    6,    8,    16,
       2+16, 4+16, 16+6, 16+8, 16+16,
       2+32, 4+32, 32+6, 32+8,
      );
    enum max3 = 256;
    auto trie3 = buildTrie!(bool, uint, max3,
            sliceBits!(6,8), sliceBits!(4,6), sliceBits!(0,4)
        )(redundant3.byInterval);
    trieStats(trie3);
    foreach (i; 0 .. max3)
        assert(trie3[i] == (i in redundant3), text(cast(uint) i));

    // four levels; only members of the set are checked here
    auto redundant4 = Set(
            10, 64, 64+10, 128, 128+10, 256, 256+10, 512,
            1000, 2000, 3000, 4000, 5000, 6000
        );
    enum max4 = 2^^16;
    auto trie4 = buildTrie!(bool, size_t, max4,
            sliceBits!(13, 16), sliceBits!(9, 13), sliceBits!(6, 9) , sliceBits!(0, 6)
        )(redundant4.byInterval);
    foreach (i; 0 .. max4)
    {
        if (i in redundant4)
            assert(trie4[i], text(cast(uint) i));
    }
    trieStats(trie4);

    // string keys, indexed by their first char only
    alias mapToS = mapTrieIndex!(useItemAt!(0, char));
    string[] redundantS = ["tea", "start", "orange"];
    redundantS.sort!((a,b) => mapToS(a) < mapToS(b))();
    auto strie = buildTrie!(bool, string, useItemAt!(0, char))(redundantS);
    assert(redundantS == ["orange", "start", "tea"]);
    assert(strie["test"], text(strie["test"]));
    assert(!strie["aea"]);
    assert(strie["s"]);

    // a bit size test
    auto allBytes = iota(0, 256).map!(x => to!ubyte(x)).array;
    auto bt = buildTrie!(bool, ubyte, sliceBits!(7, 8), sliceBits!(5, 7), sliceBits!(0, 5))(allBytes);
    trieStats(bt);
    foreach (i; 0 .. 256)
        assert(bt[cast(ubyte) i]);
}
5761 
// Trie level functor: uses the element at position `idx` of an array key,
// tagged as needing the full bit width of `T`.
template useItemAt(size_t idx, T)
if (isIntegral!T || is(T: dchar))
{
    size_t pick(const scope T[] arr)
    {
        return arr[idx];
    }
    alias useItemAt = assumeSize!(pick, T.sizeof * 8);
}
5768 
// Trie level functor: uses the last element of an array key,
// tagged as needing the full bit width of `T`.
template useLastItem(T)
{
    size_t pick(const scope T[] arr)
    {
        return arr[$ - 1];
    }
    alias useLastItem = assumeSize!(pick, T.sizeof * 8);
}
5774 
// Sum of bitSizeOf over every element of Prefix (0 for an empty list).
template fullBitSize(Prefix...)
{
    static if (Prefix.length == 0)
        enum fullBitSize = 0;
    else
        enum fullBitSize = bitSizeOf!(Prefix[0]) + fullBitSize!(Prefix[1 .. $]);
}
5782 
// Computes the element types of the index levels of a trie, one
// BitPacked type per level above the value level.
template idxTypes(Key, size_t fullBits, Prefix...)
{
    static if (Prefix.length == 1)
    {
        // the last level is the value level, so a 1-level trie has no index
        alias idxTypes = AliasSeq!();
    }
    else
    {
        // Important note on bit packing:
        // each level has to hold enough bits to address the next one.
        // The bottom level is known to span the full bit width, so its
        // size in pages takes full_bit_width - size_of_last_prefix bits.
        // The same reasoning is applied recursively to the levels above.
        enum pageBits = fullBits - bitSizeOf!(Prefix[$-1]);
        alias UpperLevels = idxTypes!(Key, pageBits, Prefix[0..$-1]);
        alias ThisLevel = BitPacked!(typeof(Prefix[$-2](Key.init)), pageBits);
        alias idxTypes = AliasSeq!(UpperLevels, ThisLevel);
    }
}
5803 
5804 //============================================================================
5805 
// Three-way comparison of two property names that lowercases every character
// (via std.ascii.toLower) and skips white space, '-' and '_'.
// Returns the result of std.algorithm.comparison.cmp on the normalized names.
@safe pure int comparePropertyName(Char1, Char2)(const(Char1)[] a, const(Char2)[] b)
if (is(Char1 : dchar) && is(Char2 : dchar))
{
    import std.algorithm.comparison : cmp;
    import std.algorithm.iteration : map, filter;
    import std.ascii : toLower;

    // characters that take part in the comparison
    static bool significant(dchar c)
    {
        return !(c.isWhite || c == '-' || c == '_');
    }

    auto lhs = a.map!toLower.filter!significant;
    auto rhs = b.map!toLower.filter!significant;
    return cmp(lhs, rhs);
}
5817 
@safe pure unittest
{
    // the dash and letter case do not matter
    assert(comparePropertyName("foo-bar", "fooBar") == 0);
}
5822 
// "Less than" predicate on property names, based on comparePropertyName.
bool propertyNameLess(Char1, Char2)(const(Char1)[] a, const(Char2)[] b) @safe pure
if (is(Char1 : dchar) && is(Char2 : dchar))
{
    immutable order = comparePropertyName(a, b);
    return order < 0;
}
5828 
5829 //============================================================================
5830 // Utilities for compression of Unicode code point sets
5831 //============================================================================
5832 
// Appends `val` to `arr` in a variable-length form:
//   val < 2^^7  : one byte, top bit clear
//   val < 2^^13 : two bytes, first is 0b100_xxxxx
//   val < 2^^21 : three bytes, first is 0b101_xxxxx
// Multi-byte forms store the most significant bits first.
@safe void compressTo(uint val, ref scope ubyte[] arr) pure nothrow
{
    // not optimized as usually done 1 time (and not public interface)
    if (val < 0x80)
    {
        arr ~= cast(ubyte) val;
        return;
    }
    if (val < 0x2000)
    {
        arr ~= cast(ubyte)(0x80 | (val >> 8));
        arr ~= cast(ubyte) val;
        return;
    }
    assert(val < 0x20_0000);
    arr ~= cast(ubyte)(0xA0 | (val >> 16));
    arr ~= cast(ubyte)(val >> 8);
    arr ~= cast(ubyte) val;
}
5851 
// Reads one variable-length value from `arr` starting at `idx` and moves
// `idx` past it. A first byte with the top bit clear is the value itself;
// otherwise bit 5 selects one or two trailing bytes and the low five bits
// are the most significant part of the value.
// Throws (through enforce) if the trailing bytes are missing.
@safe uint decompressFrom(scope const(ubyte)[] arr, ref size_t idx) pure
{
    import std.exception : enforce;
    immutable lead = arr[idx++];
    if ((lead & 0x80) == 0) // no top bit -> [0 .. 127]
        return lead;
    immutable tail = (lead & 0x20) ? 2 : 1;
    enforce(idx + tail <= arr.length, "bad code point interval encoding");
    uint val = lead & 0x1F;
    foreach (b; arr[idx .. idx + tail])
        val = (val << 8) | b;
    idx += tail;
    return val;
}
5866 
5867 
// Packs a range of [a, b) pairs into a byte array. Every boundary is stored
// with compressTo as its distance from the previous boundary; an upper bound
// equal to lastDchar+1 is left out altogether.
package(std) ubyte[] compressIntervals(Range)(Range intervals)
if (isInputRange!Range && isIntegralPair!(ElementType!Range))
{
    ubyte[] packed;
    uint prev = 0;
    foreach (ival; intervals)
    {
        compressTo(ival[0]-prev, packed);
        prev = ival[0];
        // till the end of the domain so don't store it
        if (ival[1] == lastDchar+1)
            continue;
        compressTo(ival[1]-prev, packed);
        prev = ival[1];
    }
    return packed;
}
5886 
@safe pure unittest
{
    import std.algorithm.comparison : equal;
    import std.typecons : tuple;

    // two intervals, all boundaries stored
    auto plain = [tuple(80, 127), tuple(128, (1 << 10)+128)];
    ubyte[] plainBytes = [cast(ubyte) 80, 47, 1, (0b1_00 << 5) | (1 << 2), 0];
    assert(compressIntervals(plain) == plainBytes);

    // last interval runs to the end of the domain, so its end is omitted
    auto openEnded = [tuple(0, (1 << 20)+512+1), tuple((1 << 20)+512+4, lastDchar+1)];
    ubyte[] openBytes = [cast(ubyte) 0, (0b1_01 << 5) | (1 << 4), 2, 1, 3]; // odd length-ed
    assert(compressIntervals(openEnded) == openBytes);

    // decode the individual values back
    size_t pos = 0;
    assert(decompressFrom(plainBytes, pos) == 80);
    assert(decompressFrom(plainBytes, pos) == 47);
    assert(decompressFrom(plainBytes, pos) == 1);
    assert(decompressFrom(plainBytes, pos) == (1 << 10));
    pos = 0;
    assert(decompressFrom(openBytes, pos) == 0);
    assert(decompressFrom(openBytes, pos) == (1 << 20)+512+1);

    // full round trip
    assert(equal(decompressIntervals(compressIntervals(plain)), plain));
    assert(equal(decompressIntervals(compressIntervals(openEnded)), openEnded));
}
5909 
// Wraps compressed interval data into a range of `CodepointInterval`
// that decodes it lazily.
@safe package(std) auto decompressIntervals(const(ubyte)[] data) pure
{
    auto intervals = DecompressedIntervals(data);
    return intervals;
}
5915 
// Forward range of CodepointInterval decoded on demand from a byte stream.
@safe struct DecompressedIntervals
{
pure:
    const(ubyte)[] _stream;   // compressed data
    size_t _idx;              // read position; size_t.max marks exhaustion
    CodepointInterval _front; // most recently decoded interval

    // Stores the stream and decodes the first interval right away.
    this(const(ubyte)[] stream)
    {
        _stream = stream;
        popFront();
    }

    @property CodepointInterval front()
    {
        assert(!empty);
        return _front;
    }

    // Decodes the next interval, or marks the range as empty once the
    // stream has been consumed.
    void popFront()
    {
        if (_idx == _stream.length)
        {
            _idx = size_t.max;
            return;
        }
        // each boundary is stored as a distance from the previous one
        immutable uint start = _front[1] + decompressFrom(_stream, _idx);
        _front[0] = start;
        if (_idx != _stream.length)
            _front[1] = start + decompressFrom(_stream, _idx);
        else // odd length ---> till the end
            _front[1] = lastDchar+1;
    }

    @property bool empty() const
    {
        return size_t.max == _idx;
    }

    @property DecompressedIntervals save() return scope { return this; }
}
5960 
@safe pure nothrow @nogc unittest
{
    // must satisfy both the input and the forward range concepts
    static assert(isForwardRange!DecompressedIntervals);
    static assert(isInputRange!DecompressedIntervals);
}
5966 
5967 //============================================================================
5968 
5969 version (std_uni_bootstrap){}
5970 else
5971 {
5972 
// Looks up `name` among the `name` fields of `table`, comparing with the
// property-name rules. The table is assumed to be sorted accordingly.
// Returns the index of the matching entry, or -1 if there is none.
ptrdiff_t findUnicodeSet(alias table, C)(const scope C[] name)
{
    import std.algorithm.iteration : map;
    import std.range : assumeSorted;
    auto names = assumeSorted!((a,b) => propertyNameLess(a,b))
        (table.map!"a.name"());
    // number of entries strictly below `name` = position of the candidate
    immutable size_t pos = names.lowerBound(name).length;
    if (pos >= names.length || comparePropertyName(names[pos], name) != 0)
        return -1;
    return pos;
}
5985 
// Finds `name` in `table` and, if present, stores the corresponding set
// in `dest`. Returns whether the name was found; `dest` is left untouched
// otherwise.
bool loadUnicodeSet(alias table, Set, C)(const scope C[] name, ref Set dest)
{
    immutable pos = findUnicodeSet!table(name);
    if (pos < 0)
        return false;
    dest = Set(asSet(table[pos].compressed));
    return true;
}
5997 
// Loads the code point set for property `name` into `target`.
// Cumulative general categories (L, LC, M, N, P, S, Z, C), "graphical",
// "any" and "ascii" are assembled by hand; every other name is looked up
// in the generated `uniProps.tab` table.
// Returns: true if `name` was recognized and `target` assigned.
bool loadProperty(Set=CodepointSet, C)
    (const scope C[] name, ref Set target) pure
{
    import std.internal.unicode_tables : uniProps; // generated file
    alias ucmp = comparePropertyName;

    // Overwrites `target` with the first table, then unions in the rest, in order.
    void assign(scope const(ubyte[])[] parts...)
    {
        target = asSet(parts[0]);
        foreach (part; parts[1 .. $])
            target |= asSet(part);
    }

    // conjure cumulative properties by hand
    if (ucmp(name, "L") == 0 || ucmp(name, "Letter") == 0)
        assign(uniProps.Lu, uniProps.Ll, uniProps.Lt, uniProps.Lo, uniProps.Lm);
    else if (ucmp(name,"LC") == 0 || ucmp(name,"Cased Letter")==0)
        assign(uniProps.Ll, uniProps.Lu, uniProps.Lt /* Title case */);
    else if (ucmp(name, "M") == 0 || ucmp(name, "Mark") == 0)
        assign(uniProps.Mn, uniProps.Mc, uniProps.Me);
    else if (ucmp(name, "N") == 0 || ucmp(name, "Number") == 0)
        assign(uniProps.Nd, uniProps.Nl, uniProps.No);
    else if (ucmp(name, "P") == 0 || ucmp(name, "Punctuation") == 0)
        assign(uniProps.Pc, uniProps.Pd, uniProps.Ps, uniProps.Pe,
            uniProps.Pi, uniProps.Pf, uniProps.Po);
    else if (ucmp(name, "S") == 0 || ucmp(name, "Symbol") == 0)
        assign(uniProps.Sm, uniProps.Sc, uniProps.Sk, uniProps.So);
    else if (ucmp(name, "Z") == 0 || ucmp(name, "Separator") == 0)
        assign(uniProps.Zs, uniProps.Zl, uniProps.Zp);
    else if (ucmp(name, "C") == 0 || ucmp(name, "Other") == 0)
        assign(uniProps.Cc, uniProps.Cf, uniProps.Cs, uniProps.Co, uniProps.Cn);
    else if (ucmp(name, "graphical") == 0)
        assign(uniProps.Alphabetic,
            // marks
            uniProps.Mn, uniProps.Mc, uniProps.Me,
            // numbers
            uniProps.Nd, uniProps.Nl, uniProps.No,
            // punctuation
            uniProps.Pc, uniProps.Pd, uniProps.Ps, uniProps.Pe,
            uniProps.Pi, uniProps.Pf, uniProps.Po,
            // space separators
            uniProps.Zs,
            // symbols
            uniProps.Sm, uniProps.Sc, uniProps.Sk, uniProps.So);
    else if (ucmp(name, "any") == 0)
        target = Set.fromIntervals(0, 0x110000);
    else if (ucmp(name, "ascii") == 0)
        target = Set.fromIntervals(0, 0x80);
    else
        return loadUnicodeSet!(uniProps.tab)(name, target);
    return true;
}
6096 
// CTFE-only helper for checking property names at compile-time.
// Returns true when `name` loosely matches (per `comparePropertyName`) one of
// the hand-assembled properties that `loadProperty` builds without consulting
// the generated tables. The list must stay in sync with `loadProperty`;
// "C"/"Other" used to be missing here although `loadProperty` accepts them.
@safe bool isPrettyPropertyName(C)(const scope C[] name)
{
    import std.algorithm.searching : canFind;
    auto names = [
        "L", "Letter",
        "LC", "Cased Letter",
        "M", "Mark",
        "N", "Number",
        "P", "Punctuation",
        "S", "Symbol",
        "Z", "Separator",
        "C", "Other",
        "Graphical",
        "any",
        "ascii"
    ];
    return names.canFind!(candidate => comparePropertyName(candidate, name) == 0);
}
6116 
// ditto, CTFE-only, not optimized:
// true when `table` has an entry whose name loosely matches `name`.
@safe private static bool findSetName(alias table, C)(const scope C[] name)
{
    immutable ptrdiff_t pos = findUnicodeSet!table(name);
    return pos >= 0;
}
6122 
// Mixin that gives a struct two ways to fetch a set from `table`:
// `S("name")` checked at run time and `S.name` checked at compile time.
// `kind` is only used to word the error messages.
template SetSearcher(alias table, string kind)
{
    /// Run-time checked search.
    static auto opCall(C)(const scope C[] name)
        if (is(C : dchar))
    {
        import std.conv : to;
        CodepointSet found;
        if (!loadUnicodeSet!table(name, found))
            throw new Exception("No unicode set for "~kind~" by name "
                ~name.to!string()~" was found.");
        return found;
    }
    /// Compile-time checked search.
    static @property auto opDispatch(string name)()
    {
        static if (!findSetName!table(name))
            static assert(false, "No unicode set for "~kind~" by name "
                ~name~" was found.");
        else
        {
            CodepointSet found;
            // cannot fail: existence was verified at compile time above
            loadUnicodeSet!table(name, found);
            return found;
        }
    }
}
6150 
// Characters that need escaping in a string posed as a regular expression.
// The set parser below accepts a backslash followed by any of these as the
// character itself.
package(std) alias Escapables = AliasSeq!('[', ']', '\\', '^', '$', '.', '|', '?', ',', '-',
    ';', ':', '#', '&', '%', '/', '<', '>', '`',  '*', '+', '(', ')', '{', '}',  '~');
6154 
// Evaluates the D expression `expr` (via mixin) and returns its value.
// At run time the value is computed on first use and kept in a function-local
// static, so later calls return the stored copy; during CTFE the expression
// is simply evaluated every time.
package(std) CodepointSet memoizeExpr(string expr)()
{
    if (!__ctfe)
    {
        alias Result = typeof(mixin(expr));
        static Result cached;
        static bool ready;
        if (ready)
            return cached;
        cached = mixin(expr);
        ready = true;
        return cached;
    }
    return mixin(expr);
}
6169 
// Set used for the \w character class:
// Alphabetic plus marks (Mn, Mc, Me), decimal digits (Nd) and
// connector punctuation (Pc). Memoized through memoizeExpr.
package(std) @property CodepointSet wordCharacter() @safe
{
    enum wordExpr = "unicode.Alphabetic | unicode.Mn | unicode.Mc
        | unicode.Me | unicode.Nd | unicode.Pc";
    return memoizeExpr!(wordExpr)();
}
6176 
// Minimal array-backed LIFO stack, kept generic in case anything besides
// the set parser needs one.
package(std) struct Stack(T)
{
@safe:
    T[] data;

    // true when nothing has been pushed (or everything was popped)
    @property bool empty()
    {
        return data.length == 0;
    }

    // number of stored elements
    @property size_t length()
    {
        return data.length;
    }

    // appends `val` on top
    void push(T val)
    {
        data ~= val;
    }

    // removes and returns the top element; the stack must not be empty
    @trusted T pop()
    {
        assert(!empty);
        immutable lastIdx = data.length - 1;
        auto popped = data[lastIdx];
        data = data[0 .. lastIdx];
        // allow the next push to reuse the freed slot (not available in CTFE)
        if (!__ctfe)
            cast(void) data.assumeSafeAppend();
        return popped;
    }

    // reference to the top element; the stack must not be empty
    @property ref T top()
    {
        assert(!empty);
        return data[data.length - 1];
    }
}
6204 
// Reads exactly `maxDigit` hexadecimal digits from the front of `str` and
// returns the code point they denote, advancing `str` past them.
// Throws: Exception with "incomplete escape sequence" if `str` runs out,
// "invalid escape sequence" on a non-hex character, and
// "invalid codepoint" if the value exceeds U+10FFFF.
// The accumulator stops growing once it is out of range, so sequences longer
// than 8 digits cannot wrap around `uint` and come back as a valid code point.
package(std) dchar parseUniHex(Range)(ref Range str, size_t maxDigit)
{
    import std.exception : enforce;
    enum uint maxCodepoint = 0x10FFFF;
    //std.conv.parse is both @system and bogus
    uint val;
    foreach (_; 0 .. maxDigit)
    {
        enforce(!str.empty, "incomplete escape sequence");
        //accepts ascii only, so it's OK to index directly
        immutable current = str.front;
        uint digit;
        if ('0' <= current && current <= '9')
            digit = current - '0';
        else if ('a' <= current && current <= 'f')
            digit = current - 'a' + 10;
        else if ('A' <= current && current <= 'F')
            digit = current - 'A' + 10;
        else
            throw new Exception("invalid escape sequence");
        // once out of range the value stays out of range; remaining digits
        // are still validated but no longer accumulated (no overflow)
        if (val <= maxCodepoint)
            val = val * 16 + digit;
        str.popFront();
    }
    enforce(val <= maxCodepoint, "invalid codepoint");
    return val;
}
6230 
@safe unittest
{
    import std.algorithm.searching : canFind;
    import std.exception : collectException;

    // any non-hex character is reported as an invalid escape sequence
    foreach (bad; ["000j", "000z", "FffG", "0Z"])
    {
        auto err = collectException(parseUniHex(bad, bad.length));
        assert(err.msg.canFind("invalid escape sequence"));
    }

    // well-formed inputs decode to the expected values
    string[] inputs = [ "01", "ff", "00af", "10FFFF" ];
    int[] expected = [ 1, 0xFF, 0xAF, 0x10FFFF ];
    foreach (idx, text; inputs)
        assert(parseUniHex(text, text.length) == expected[idx]);

    // syntactically fine but beyond U+10FFFF
    string tooBig = "0011FFFF";
    auto rangeErr = collectException(parseUniHex(tooBig, tooBig.length));
    assert(rangeErr.msg.canFind("invalid codepoint"));
}
6247 
// Returns `set` extended with the simple case foldings of every
// cased letter (general category LC) it contains.
auto caseEnclose(CodepointSet set)
{
    // taken up front: `set` itself is grown inside the loop
    auto casedLetters = set & unicode.LC;
    foreach (dchar letter; casedLetters.byCodepoint)
        foreach (variant; simpleCaseFoldings(letter))
            set |= variant;
    return set;
}
6258 
/+
    Fetch the code point set corresponding to a name (InBlock or binary
    property), optionally closed over case (`casefold`) and then
    optionally complemented (`negated`), in that order.
+/
CodepointSet getUnicodeSet(const scope char[] name, bool negated,  bool casefold) @safe
{
    CodepointSet result = unicode(name);
    //FIXME: caseEnclose for new uni as Set | CaseEnclose(SET && LC)
    if (casefold)
        result = caseEnclose(result);
    return negated ? result.inverted : result;
}
6272 
/+
    Parser for bracketed code point set expressions: single characters,
    `a-z` style ranges, backslash escapes (including \p{..}, \d, \s, \w and
    hex forms), `^` negation, nested `[...]` and the twin-symbol operators
    `||`, `--`, `~~` and `&&`.

    `range` is the input being consumed; when `casefold_` is set every
    character and every range is added together with its simple case foldings.
+/
struct UnicodeSetParser(Range)
{
    import std.exception : enforce;
    import std.typecons : tuple, Tuple;
    Range range;
    bool casefold_;

    // input range interface forwarded to `range`; it lets helpers such as
    // parseUniHex and unicode.parseControlCode consume input through the parser
    @property bool empty(){ return range.empty; }
    @property dchar front(){ return range.front; }
    void popFront(){ range.popFront(); }

    //CodepointSet operations relatively in order of priority
    enum Operator:uint {
        Open = 0, Negate,  Difference, SymDifference, Intersection, Union, None
    }

    //parse unit of CodepointSet spec, most notably escape sequences and char ranges
    //also fetches next set operation
    //
    // Returns the set accumulated for this term together with the operator
    // that ended it: a twin-symbol operator, Union when a nested '[' follows,
    // or None when the term stopped at ']'.
    // Throws on malformed input (inverted range, bad escape, premature end).
    Tuple!(CodepointSet,Operator) parseCharTerm()
    {
        import std.range : drop;
        enum privateUseStart = '\U000F0000', privateUseEnd ='\U000FFFFD';
        enum State{ Start, Char, Escape, CharDash, CharDashEscape,
            PotentialTwinSymbolOperator }
        Operator op = Operator.None;
        dchar last; // pending character, not yet committed to `set`
        CodepointSet set;
        State state = State.Start;

        // adds a single code point, plus its case foldings when casefolding
        void addWithFlags(ref CodepointSet set, uint ch)
        {
            if (casefold_)
            {
                auto foldings = simpleCaseFoldings(ch);
                foreach (v; foldings)
                    set |= v;
            }
            else
                set |= ch;
        }

        // adds the closed interval [lo, hi], plus case foldings when casefolding;
        // shared by literal-ended and escape-ended ranges so both behave alike
        void addRangeWithFlags(ref CodepointSet set, uint lo, uint hi)
        {
            if (casefold_)
            {
                for (uint ch = lo; ch <= hi; ch++)
                    addWithFlags(set, ch);
            }
            else
                set.add(lo, hi + 1);
        }

        // maps the first character of a doubled operator to the operation
        static Operator twinSymbolOperator(dchar symbol)
        {
            switch (symbol)
            {
            case '|':
                return Operator.Union;
            case '-':
                return Operator.Difference;
            case '~':
                return Operator.SymDifference;
            case '&':
                return Operator.Intersection;
            default:
                assert(false);
            }
        }

        L_CharTermLoop:
        for (;;)
        {
            final switch (state)
            {
            case State.Start:
                switch (front)
                {
                case '|':
                case '-':
                case '~':
                case '&':
                    state = State.PotentialTwinSymbolOperator;
                    last = front;
                    break;
                case '[':
                    op = Operator.Union;
                    goto case;
                case ']':
                    break L_CharTermLoop;
                case '\\':
                    state = State.Escape;
                    break;
                default:
                    state = State.Char;
                    last = front;
                }
                break;
            case State.Char:
                // xxx last front xxx
                switch (front)
                {
                case '|':
                case '~':
                case '&':
                    // then last is treated as normal char and added as implicit union
                    state = State.PotentialTwinSymbolOperator;
                    addWithFlags(set, last);
                    last = front;
                    break;
                case '-': // still need more info
                    state = State.CharDash;
                    break;
                case '\\':
                    // commit the pending char exactly like every other branch
                    // does, so that casefolding is not skipped before an escape
                    addWithFlags(set, last);
                    state = State.Escape;
                    break;
                case '[':
                    op = Operator.Union;
                    goto case;
                case ']':
                    addWithFlags(set, last);
                    break L_CharTermLoop;
                default:
                    state = State.Char;
                    addWithFlags(set, last);
                    last = front;
                }
                break;
            case State.PotentialTwinSymbolOperator:
                // xxx last front xxxx
                // where last = [|-&~]
                if (front == last)
                {
                    op = twinSymbolOperator(last);
                    popFront();//skip second twin char
                    break L_CharTermLoop;
                }
                goto case State.Char;
            case State.Escape:
                // xxx \ front xxx
                switch (front)
                {
                case 'f':
                    last = '\f';
                    state = State.Char;
                    break;
                case 'n':
                    last = '\n';
                    state = State.Char;
                    break;
                case 'r':
                    last = '\r';
                    state = State.Char;
                    break;
                case 't':
                    last = '\t';
                    state = State.Char;
                    break;
                case 'v':
                    last = '\v';
                    state = State.Char;
                    break;
                case 'c':
                    last = unicode.parseControlCode(this);
                    state = State.Char;
                    break;
                // one case label per escapable character, all sharing the body below
                foreach (val; Escapables)
                {
                case val:
                }
                    last = front;
                    state = State.Char;
                    break;
                case 'p':
                    set.add(unicode.parsePropertySpec(this, false, casefold_));
                    state = State.Start;
                    continue L_CharTermLoop; //next char already fetched
                case 'P':
                    set.add(unicode.parsePropertySpec(this, true, casefold_));
                    state = State.Start;
                    continue L_CharTermLoop; //next char already fetched
                case 'x':
                    popFront();
                    last = parseUniHex(this, 2);
                    state = State.Char;
                    continue L_CharTermLoop;
                case 'u':
                    popFront();
                    last = parseUniHex(this, 4);
                    state = State.Char;
                    continue L_CharTermLoop;
                case 'U':
                    popFront();
                    last = parseUniHex(this, 8);
                    state = State.Char;
                    continue L_CharTermLoop;
                case 'd':
                    set.add(unicode.Nd);
                    state = State.Start;
                    break;
                case 'D':
                    set.add(unicode.Nd.inverted);
                    state = State.Start;
                    break;
                case 's':
                    set.add(unicode.White_Space);
                    state = State.Start;
                    break;
                case 'S':
                    set.add(unicode.White_Space.inverted);
                    state = State.Start;
                    break;
                case 'w':
                    set.add(wordCharacter);
                    state = State.Start;
                    break;
                case 'W':
                    set.add(wordCharacter.inverted);
                    state = State.Start;
                    break;
                default:
                    if (front >= privateUseStart && front <= privateUseEnd)
                        enforce(false, "no matching ']' found while parsing character class");
                    enforce(false, "invalid escape sequence");
                }
                break;
            case State.CharDash:
                // xxx last - front xxx
                switch (front)
                {
                case '[':
                    op = Operator.Union;
                    goto case;
                case ']':
                    //means dash is a single char not an interval specifier
                    addWithFlags(set, last);
                    addWithFlags(set, '-');
                    break L_CharTermLoop;
                 case '-'://set Difference again
                    addWithFlags(set, last);
                    op = Operator.Difference;
                    popFront();//skip '-'
                    break L_CharTermLoop;
                case '\\':
                    state = State.CharDashEscape;
                    break;
                default:
                    enforce(last <= front, "inverted range");
                    addRangeWithFlags(set, last, front);
                    state = State.Start;
                }
                break;
            case State.CharDashEscape:
            //xxx last - \ front xxx
                uint end;
                switch (front)
                {
                case 'f':
                    end = '\f';
                    break;
                case 'n':
                    end = '\n';
                    break;
                case 'r':
                    end = '\r';
                    break;
                case 't':
                    end = '\t';
                    break;
                case 'v':
                    end = '\v';
                    break;
                // one case label per escapable character, all sharing the body below
                foreach (val; Escapables)
                {
                case val:
                }
                    end = front;
                    break;
                case 'c':
                    end = unicode.parseControlCode(this);
                    break;
                case 'x':
                    popFront();
                    end = parseUniHex(this, 2);
                    enforce(last <= end,"inverted range");
                    addRangeWithFlags(set, last, end);
                    state = State.Start;
                    continue L_CharTermLoop;
                case 'u':
                    popFront();
                    end = parseUniHex(this, 4);
                    enforce(last <= end,"inverted range");
                    addRangeWithFlags(set, last, end);
                    state = State.Start;
                    continue L_CharTermLoop;
                case 'U':
                    popFront();
                    end = parseUniHex(this, 8);
                    enforce(last <= end,"inverted range");
                    addRangeWithFlags(set, last, end);
                    state = State.Start;
                    continue L_CharTermLoop;
                default:
                    if (front >= privateUseStart && front <= privateUseEnd)
                        enforce(false, "no matching ']' found while parsing character class");
                    enforce(false, "invalid escape sequence");
                }
                // Lookahead to check if it's a \T
                // where T is sub-pattern terminator in multi-pattern scheme
                auto lookahead = range.save.drop(1);
                if (end == '\\' && !lookahead.empty)
                {
                    if (lookahead.front >= privateUseStart && lookahead.front <= privateUseEnd)
                        enforce(false, "no matching ']' found while parsing character class");
                }
                enforce(last <= end,"inverted range");
                addRangeWithFlags(set, last, end);
                state = State.Start;
                break;
            }
            popFront();
            enforce(!empty, "unexpected end of CodepointSet");
        }
        return tuple(set, op);
    }

    alias ValStack = Stack!(CodepointSet);
    alias OpStack = Stack!(Operator);

    // Parses one complete bracketed set starting at '[' and returns it.
    // Operands and pending operators are kept on two stacks; runs of Union
    // are applied eagerly, everything else when the enclosing ']' is reached.
    // Throws on syntax errors, including input that ends before the set is closed.
    CodepointSet parseSet()
    {
        ValStack vstack;
        OpStack opstack;
        import std.functional : unaryFun;
        enforce(!empty, "unexpected end of input");
        enforce(front == '[', "expected '[' at the start of unicode set");
        // applies `op` to the operand(s) on top of `stack`;
        // returns false for operators that cannot be applied (Open, None)
        static bool apply(Operator op, ref ValStack stack)
        {
            switch (op)
            {
            case Operator.Negate:
                enforce(!stack.empty, "no operand for '^'");
                stack.top = stack.top.inverted;
                break;
            case Operator.Union:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '||'");
                stack.top.add(s);
                break;
            case Operator.Difference:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '--'");
                stack.top.sub(s);
                break;
            case Operator.SymDifference:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '~~'");
                stack.top ~= s;
                break;
            case Operator.Intersection:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '&&'");
                stack.top.intersect(s);
                break;
            default:
                return false;
            }
            return true;
        }
        // pops and applies operators while `cond` holds for the top one;
        // returns false on a syntax error or when the operator stack runs dry
        static bool unrollWhile(alias cond)(ref ValStack vstack, ref OpStack opstack)
        {
            while (cond(opstack.top))
            {
                if (!apply(opstack.pop(),vstack))
                    return false;//syntax error
                if (opstack.empty)
                    return false;
            }
            return true;
        }

        L_CharsetLoop:
        do
        {
            // the loop also continues while operators are pending, so the input
            // may be exhausted here (e.g. after a trailing "--"); report it
            // instead of reading `front` of an empty range
            enforce(!empty, "unexpected end of CodepointSet");
            switch (front)
            {
            case '[':
                opstack.push(Operator.Open);
                popFront();
                enforce(!empty, "unexpected end of character class");
                if (front == '^')
                {
                    opstack.push(Operator.Negate);
                    popFront();
                    enforce(!empty, "unexpected end of character class");
                }
                else if (front == ']') // []...] is special cased
                {
                    popFront();
                    enforce(!empty, "wrong character set");
                    auto pair = parseCharTerm();
                    pair[0].add(']', ']'+1);
                    if (pair[1] != Operator.None)
                    {
                        if (opstack.top == Operator.Union)
                            unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
                        opstack.push(pair[1]);
                    }
                    vstack.push(pair[0]);
                }
                break;
            case ']':
                enforce(unrollWhile!(unaryFun!"a != a.Open")(vstack, opstack),
                    "character class syntax error");
                enforce(!opstack.empty, "unmatched ']'");
                opstack.pop();
                popFront();
                if (opstack.empty)
                    break L_CharsetLoop;
                // an outer '[' is still open, so more input is required
                enforce(!empty, "unexpected end of CodepointSet");
                auto pair  = parseCharTerm();
                if (!pair[0].empty)//not only operator e.g. -- or ~~
                {
                    vstack.top.add(pair[0]);//apply union
                }
                if (pair[1] != Operator.None)
                {
                    if (opstack.top == Operator.Union)
                        unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
                    opstack.push(pair[1]);
                }
                break;
            //
            default://yet another pair of term(op)?
                auto pair = parseCharTerm();
                if (pair[1] != Operator.None)
                {
                    if (opstack.top == Operator.Union)
                        unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
                    opstack.push(pair[1]);
                }
                vstack.push(pair[0]);
            }

        }while (!empty || !opstack.empty);
        while (!opstack.empty)
            apply(opstack.pop(),vstack);
        assert(vstack.length == 1);
        return vstack.top;
    }
}
6719 
6720 /**
6721     A single entry point to lookup Unicode $(CODEPOINT) sets by name or alias of
6722     a block, script or general category.
6723 
6724     It uses well defined standard rules of property name lookup.
6725     This includes fuzzy matching of names, so that
6726     'White_Space', 'white-SpAce' and 'whitespace' are all considered equal
6727     and yield the same set of white space $(CHARACTERS).
6728 */
6729 @safe public struct unicode
6730 {
6731     import std.exception : enforce;
6732     /**
6733         Performs the lookup of set of $(CODEPOINTS)
6734         with compile-time correctness checking.
6735         This short-cut version combines 3 searches:
6736         across blocks, scripts, and common binary properties.
6737 
6738         Note that since scripts and blocks overlap the
6739         usual trick to disambiguate is used - to get a block use
6740         `unicode.InBlockName`, to search a script
6741         use `unicode.ScriptName`.
6742 
6743         See_Also: $(LREF block), $(LREF script)
6744         and (not included in this search) $(LREF hangulSyllableType).
6745     */
6746 
6747     static @property auto opDispatch(string name)() pure
6748     {
6749         static if (findAny(name))
6750             return loadAny(name);
6751         else
6752             static assert(false, "No unicode set by name "~name~" was found.");
6753     }
6754 
6755     ///
6756     @safe unittest
6757     {
6758         import std.exception : collectException;
6759         auto ascii = unicode.ASCII;
6760         assert(ascii['A']);
6761         assert(ascii['~']);
6762         assert(!ascii['\u00e0']);
6763         // matching is case-insensitive
6764         assert(ascii == unicode.ascII);
6765         assert(!ascii['à']);
6766         // underscores, '-' and whitespace in names are ignored too
6767         auto latin = unicode.in_latin1_Supplement;
6768         assert(latin['à']);
6769         assert(!latin['$']);
6770         // BTW Latin 1 Supplement is a block, hence "In" prefix
6771         assert(latin == unicode("In Latin 1 Supplement"));
6772         // run-time look up throws if no such set is found
6773         assert(collectException(unicode("InCyrilliac")));
6774     }
6775 
6776     /**
6777         The same lookup across blocks, scripts, or binary properties,
6778         but performed at run-time.
6779         This version is provided for cases where `name`
6780         is not known beforehand; otherwise compile-time
6781         checked $(LREF opDispatch) is typically a better choice.
6782 
6783         See the $(S_LINK Unicode properties, table of properties) for available
6784         sets.
6785     */
6786     static auto opCall(C)(const scope C[] name)
6787         if (is(C : dchar))
6788     {
6789         return loadAny(name);
6790     }
6791 
    /**
        Narrows down the search for sets of $(CODEPOINTS) to all Unicode blocks.

        Both `unicode.block.BlockName` (checked at compile time) and
        `unicode.block("BlockName")` (checked at run time) are available.

        Note:
        Here block names are unambiguous as no scripts are searched
        and thus to search use simply `unicode.block.BlockName` notation.

        See_Also: $(S_LINK Unicode properties, table of properties)
        for available sets.
    */
    struct block
    {
        import std.internal.unicode_tables : blocks; // generated file
        mixin SetSearcher!(blocks.tab, "block");
    }
6807 
6808     ///
6809     @safe unittest
6810     {
6811         // use .block for explicitness
6812         assert(unicode.block.Greek_and_Coptic == unicode.InGreek_and_Coptic);
6813     }
6814 
    /**
        Narrows down the search for sets of $(CODEPOINTS) to all Unicode scripts.

        Both `unicode.script.ScriptName` (checked at compile time) and
        `unicode.script("ScriptName")` (checked at run time) are available.

        See the $(S_LINK Unicode properties, table of properties) for available
        sets.
    */
    struct script
    {
        import std.internal.unicode_tables : scripts; // generated file
        mixin SetSearcher!(scripts.tab, "script");
    }
6826 
6827     ///
6828     @safe unittest
6829     {
6830         auto arabicScript = unicode.script.arabic;
6831         auto arabicBlock = unicode.block.arabic;
6832         // there is an intersection between script and block
6833         assert(arabicBlock['؁']);
6834         assert(arabicScript['؁']);
6835         // but they are different
6836         assert(arabicBlock != arabicScript);
6837         assert(arabicBlock == unicode.inArabic);
6838         assert(arabicScript == unicode.arabic);
6839     }
6840 
    /**
        Fetch a set of $(CODEPOINTS) that have the given hangul syllable type.

        Other non-binary properties (once supported) follow the same
        notation - `unicode.propertyName.propertyValue` for compile-time
        checked access and `unicode.propertyName(propertyValue)`
        for run-time checked one.

        Note that these names are searched only here; they are not part of
        the combined `unicode.Name` / `unicode("Name")` lookup.

        See the $(S_LINK Unicode properties, table of properties) for available
        sets.
    */
    struct hangulSyllableType
    {
        import std.internal.unicode_tables : hangul; // generated file
        mixin SetSearcher!(hangul.tab, "hangul syllable type");
    }
6857 
6858     ///
6859     @safe unittest
6860     {
6861         // L here is syllable type not Letter as in unicode.L short-cut
6862         auto leadingVowel = unicode.hangulSyllableType("L");
6863         // check that some leading vowels are present
6864         foreach (vowel; '\u1110'..'\u115F')
6865             assert(leadingVowel[vowel]);
6866         assert(leadingVowel == unicode.hangulSyllableType.L);
6867     }
6868 
6869     //parse control code of form \cXXX, c assumed to be the current symbol
6870     static package(std) dchar parseControlCode(Parser)(ref Parser p)
6871     {
6872         with(p)
6873         {
6874             popFront();
6875             enforce(!empty, "Unfinished escape sequence");
6876             enforce(('a' <= front && front <= 'z')
6877                 || ('A' <= front && front <= 'Z'),
6878             "Only letters are allowed after \\c");
6879             return front & 0x1f;
6880         }
6881     }
6882 
6883     //parse and return a CodepointSet for \p{...Property...} and \P{...Property..},
6884     //\ - assumed to be processed, p - is current
6885     static package(std) CodepointSet parsePropertySpec(Range)(ref Range p,
6886         bool negated, bool casefold)
6887     {
6888         static import std.ascii;
6889         with(p)
6890         {
6891             enum MAX_PROPERTY = 128;
6892             char[MAX_PROPERTY] result;
6893             uint k = 0;
6894             popFront();
6895             enforce(!empty, "eof parsing unicode property spec");
6896             if (front == '{')
6897             {
6898                 popFront();
6899                 while (k < MAX_PROPERTY && !empty && front !='}'
6900                     && front !=':')
6901                 {
6902                     if (front != '-' && front != ' ' && front != '_')
6903                         result[k++] = cast(char) std.ascii.toLower(front);
6904                     popFront();
6905                 }
6906                 enforce(k != MAX_PROPERTY, "invalid property name");
6907                 enforce(front == '}', "} expected ");
6908             }
6909             else
6910             {//single char properties e.g.: \pL, \pN ...
6911                 enforce(front < 0x80, "invalid property name");
6912                 result[k++] = cast(char) front;
6913             }
6914             auto s = getUnicodeSet(result[0 .. k], negated, casefold);
6915             enforce(!s.empty, "unrecognized unicode property spec");
6916             popFront();
6917             return s;
6918         }
6919     }
6920 
6921     /**
6922         Parse unicode codepoint set from given `range` using standard regex
6923         syntax '[...]'. The range is advanced skiping over regex set definition.
6924         `casefold` parameter determines if the set should be casefolded - that is
6925         include both lower and upper case versions for any letters in the set.
6926     */
6927     static CodepointSet parseSet(Range)(ref Range range, bool casefold=false)
6928     if (isInputRange!Range && is(ElementType!Range : dchar))
6929     {
6930         auto usParser = UnicodeSetParser!Range(range, casefold);
6931         auto set = usParser.parseSet();
6932         range = usParser.range;
6933         return set;
6934     }
6935 
6936     ///
6937     @safe unittest
6938     {
6939         import std.uni : unicode;
6940         string pat = "[a-zA-Z0-9]hello";
6941         auto set = unicode.parseSet(pat);
6942         // check some of the codepoints
6943         assert(set['a'] && set['A'] && set['9']);
6944         assert(pat == "hello");
6945     }
6946 
6947 private:
6948     alias ucmp = comparePropertyName;
6949 
6950     static bool findAny(string name)
6951     {
6952         import std.internal.unicode_tables : blocks, scripts, uniProps; // generated file
6953         return isPrettyPropertyName(name)
6954             || findSetName!(uniProps.tab)(name) || findSetName!(scripts.tab)(name)
6955             || (ucmp(name[0 .. 2],"In") == 0 && findSetName!(blocks.tab)(name[2..$]));
6956     }
6957 
6958     static auto loadAny(Set=CodepointSet, C)(const scope C[] name) pure
6959     {
6960         import std.conv : to;
6961         import std.internal.unicode_tables : blocks, scripts; // generated file
6962         Set set;
6963         immutable loaded = loadProperty(name, set) || loadUnicodeSet!(scripts.tab)(name, set)
6964             || (name.length > 2 && ucmp(name[0 .. 2],"In") == 0
6965                 && loadUnicodeSet!(blocks.tab)(name[2..$], set));
6966         if (loaded)
6967             return set;
6968         throw new Exception("No unicode set by name "~name.to!string()~" was found.");
6969     }
6970 
6971     // FIXME: re-disable once the compiler is fixed
6972     // Disabled to prevent the mistake of creating instances of this pseudo-struct.
6973     //@disable ~this();
6974 }
6975 
@safe unittest
{
    import std.internal.unicode_tables : blocks, uniProps; // generated file
    // blocks are found through the "In" prefix, with or without a hyphen
    assert(unicode("InHebrew") == asSet(blocks.Hebrew));
    assert(unicode("In-Kharoshthi") == asSet(blocks.Kharoshthi));
    // "separator" is the union of Zs, Zl and Zp
    auto separators = asSet(uniProps.Zs) | asSet(uniProps.Zl) | asSet(uniProps.Zp);
    assert(unicode("separator") == separators);
}
6983 
6984 enum EMPTY_CASE_TRIE = ushort.max;// from what gen_uni uses internally
6985 
6986 // TODO: redo the most of hangul stuff algorithmically in case of Graphemes too
6987 // Use combined trie instead of checking for '\r' | '\n' | ccTrie,
6988 //   or extend | '\u200D' separately
6989 
// True if `ch` lies in the inclusive range U+1F1E6 .. U+1F1FF.
private static bool isRegionalIndicator(dchar ch) @safe pure @nogc nothrow
{
    enum dchar firstRI = '\U0001F1E6';
    enum dchar lastRI = '\U0001F1FF';
    return firstRI <= ch && ch <= lastRI;
}
6994 
// Our grapheme decoder is a state machine, this is list of all possible
// states before each code point.
// The state is chosen by graphemeTransforms from the code points seen so far.
private enum GraphemeState
{
    Start,    // nothing consumed yet
    CR,       // a '\r' was consumed
    RI,       // exactly one regional indicator was consumed
    L,        // inside a Hangul sequence, after a leading jamo
    V,        // inside a Hangul sequence, after a vowel or an LV syllable
    LVT,      // inside a Hangul sequence, after a trailing jamo or an LVT syllable
    Emoji,    // inside an emoji sequence, last code point was not a ZWJ
    EmojiZWJ, // inside an emoji sequence, last code point was a ZWJ
    Prepend,  // a Prepend character was consumed
    End       // only extending characters may still join the cluster
}
7010 
// Result of one state-machine step: tells the decoder whether the end of the
// grapheme has been reached and what to do with the current code point.
private enum TransformRes
{
    // No, unless the source range ends here
    // (GB2 - break at end of text, unless text is empty)
    goOn,
    redo, // Run last character again with new state
    retInclude, // Yes, after the just iterated character
    retExclude // Yes, before the just iterated character
}
7021 
// The logic of the grapheme decoding is all here
// GB# means Grapheme Breaking rule number # - see Unicode standard annex #29
// Note, getting GB1 (break at start of text, unless text is empty) right
// relies on the user starting grapheme walking from beginning of the text, and
// not attempting to walk an empty text.
//
// The table is indexed by GraphemeState. Each entry receives the current
// state by reference (and may change it) together with the code point under
// inspection, and answers with a TransformRes.
private enum TransformRes
    function(ref GraphemeState, dchar) @safe pure nothrow @nogc [] graphemeTransforms =
[
    GraphemeState.Start: (ref state, ch)
    {
        // GB4. Break after controls.
        if (graphemeControlTrie[ch] || ch == '\n')
            return TransformRes.retInclude;

        // Classify the first code point; the order of the tests decides
        // which state wins when several would apply.
        with (GraphemeState) state =
            ch == '\r' ? CR :
            isRegionalIndicator(ch) ? RI :
            isHangL(ch) ? L :
            hangLV[ch] || isHangV(ch) ? V :
            hangLVT[ch] || isHangT(ch) ? LVT :
            prependTrie[ch] ? Prepend :
            xpictoTrie[ch] ? Emoji :
            End;

        // No matter what we encountered, we always include the
        // first code point in the grapheme.
        return TransformRes.goOn;
    },

    // GB3, GB4. Do not break between a CR and LF.
    // Otherwise, break after controls.
    GraphemeState.CR: (ref state, ch) => ch == '\n' ?
        TransformRes.retInclude :
        TransformRes.retExclude,

    // GB12 - GB13. Do not break within emoji flag sequences.
    // That is, do not break between regional indicator (RI) symbols if
    // there is an odd number of RI characters before the break point.
    // This state applies if one and only one RI code point has been
    // encountered.
    GraphemeState.RI: (ref state, ch)
    {
        // Either way the pair is complete (or never formed) after this step.
        state = GraphemeState.End;

        return isRegionalIndicator(ch) ?
            TransformRes.goOn :
            TransformRes.redo;
    },

    // GB6. Do not break Hangul syllable sequences.
    GraphemeState.L: (ref state, ch)
    {
        if (isHangL(ch))
            return TransformRes.goOn;
        else if (isHangV(ch) || hangLV[ch])
        {
            state = GraphemeState.V;
            return TransformRes.goOn;
        }
        else if (hangLVT[ch])
        {
            state = GraphemeState.LVT;
            return TransformRes.goOn;
        }

        state = GraphemeState.End;
        return TransformRes.redo;
    },

    // GB7. Do not break Hangul syllable sequences.
    GraphemeState.V: (ref state, ch)
    {
        if (isHangV(ch))
            return TransformRes.goOn;
        else if (isHangT(ch))
        {
            state = GraphemeState.LVT;
            return TransformRes.goOn;
        }

        state = GraphemeState.End;
        return TransformRes.redo;
    },

    // GB8. Do not break Hangul syllable sequences.
    GraphemeState.LVT: (ref state, ch)
    {
        if (isHangT(ch))
            return TransformRes.goOn;

        state = GraphemeState.End;
        return TransformRes.redo;
    },

    // GB11. Do not break within emoji modifier sequences or emoji
    // zwj sequences. This state applies when the last code point was
    // NOT a ZWJ.
    GraphemeState.Emoji: (ref state, ch)
    {
        if (graphemeExtendTrie[ch])
            return TransformRes.goOn;

        // The ZWJ test below is only reachable because ZWJ is not in the
        // extend trie.
        static assert(!graphemeExtendTrie['\u200D']);

        if (ch == '\u200D')
        {
            state = GraphemeState.EmojiZWJ;
            return TransformRes.goOn;
        }

        state = GraphemeState.End;
        // There might still be spacing marks at the end,
        // which are not allowed in the middle of emoji
        // sequences, so let the End state look at `ch`.
        return TransformRes.redo;
    },

    // GB11. Do not break within emoji modifier sequences or emoji
    // zwj sequences. This state applies when the last code point was
    // a ZWJ.
    GraphemeState.EmojiZWJ: (ref state, ch)
    {
        state = GraphemeState.Emoji;
        if (xpictoTrie[ch])
            return TransformRes.goOn;
        return TransformRes.redo;
    },

    // GB9b. Do not break after Prepend characters.
    GraphemeState.Prepend: (ref state, ch)
    {
        // GB5. Break before controls.
        if (graphemeControlTrie[ch] || ch == '\r' || ch == '\n')
            return TransformRes.retExclude;

        // Anything else is classified as if it were the first code point.
        state = GraphemeState.Start;
        return TransformRes.redo;
    },

    // GB9, GB9a. Do not break before extending characters, ZWJ
    // or SpacingMarks.
    // GB999. Otherwise, break everywhere.
    GraphemeState.End: (ref state, ch)
        => !graphemeExtendTrie[ch] && !spacingMarkTrie[ch] && ch != '\u200D' ?
            TransformRes.retExclude :
            TransformRes.goOn
];
7169 
// Drives the graphemeTransforms state machine over `range`, consuming exactly
// one grapheme cluster from its front.
// With `getValue` set the consumed code points are collected and returned as
// a Grapheme; otherwise the function only advances `range` and returns void.
template genericDecodeGrapheme(bool getValue)
{
    static if (getValue)
        alias Value = Grapheme;
    else
        alias Value = void;

    Value genericDecodeGrapheme(Input)(ref Input range)
    {
        static if (getValue)
            Grapheme cluster;
        auto current = GraphemeState.Start;

        assert(!range.empty, "Attempting to decode grapheme from an empty " ~ Input.stringof);

        bool finished = false;
        while (!finished && !range.empty)
        {
            immutable dchar cp = range.front;

            // A `redo` verdict means the state changed and the same code
            // point has to be looked at again.
            TransformRes verdict;
            do
                verdict = graphemeTransforms[current](current, cp);
            while (verdict == TransformRes.redo);

            // goOn and retInclude both take the code point into the cluster;
            // retExclude leaves it in the range for the next cluster.
            if (verdict != TransformRes.retExclude)
            {
                static if (getValue)
                    cluster ~= cp;
                range.popFront();
            }

            // only goOn keeps the walk going
            finished = verdict != TransformRes.goOn;
        }

        static if (getValue)
            return cluster;
    }
}
7218 
7219 public: // Public API continues
7220 
/++
    Computes the length of the grapheme cluster starting at `index`.
    Both the resulting length and the `index` are measured
    in $(S_LINK Code unit, code units).

    Params:
        C = type that is implicitly convertible to `dchar`
        input = array of code units to examine
        index = starting index into `input[]`

    Returns:
        length of the grapheme cluster, in code units of `input`
+/
size_t graphemeStride(C)(const scope C[] input, size_t index) @safe pure
if (is(C : dchar))
{
    auto rest = input[index .. $];
    immutable before = rest.length;
    // advances `rest` past exactly one grapheme cluster
    genericDecodeGrapheme!false(rest);
    return before - rest.length;
}
7242 
///
@safe unittest
{
    assert(graphemeStride("  ", 1) == 1);
    // A + combining ring above
    string word = "A\u030Arhus";
    immutable size_t stride = graphemeStride(word, 0);
    assert(stride == 3); // \u030A takes 2 UTF-8 code units
    assert(word[0 .. stride] == "A\u030A");
    assert(word[stride..$] == "rhus");
}

@safe unittest
{
    // Ensure that graphemeStride is usable from CTFE.
    enum plain = graphemeStride("A", 0);
    static assert(plain == 1);

    enum accented = graphemeStride("A\u0301", 0);
    static assert(accented == 3); // \u0301 takes 2 UTF-8 code units
}

// TODO: make this @nogc. Probably no big deal since the state machine is
// already GC-free.
@safe pure nothrow unittest
{
    // grinning face ~ emoji modifier fitzpatrick type-5 ~ grinning face
    assert(graphemeStride("\U0001F600\U0001f3FE\U0001F600"d, 0) == 2);
    // skier ~ female sign ~ '€'
    assert(graphemeStride("\u26F7\u2640€"d, 0) == 1);
    // skier ~ emoji modifier fitzpatrick type-5 ~ female sign ~ '€'
    assert(graphemeStride("\u26F7\U0001f3FE\u2640€"d, 0) == 2);
    // skier ~ zero-width joiner ~ female sign ~ '€'
    assert(graphemeStride("\u26F7\u200D\u2640€"d, 0) == 3);
    // skier ~ emoji modifier fitzpatrick type-5 ~ zero-width joiner
    // ~ female sign ~ '€'
    assert(graphemeStride("\u26F7\U0001f3FE\u200D\u2640€"d, 0) == 4);
    // skier ~ zero-width joiner ~ '€'
    assert(graphemeStride("\u26F7\u200D€"d, 0) == 2);
    // '€' ~ zero-width joiner ~ skier
    assert(graphemeStride("€\u200D\u26F7"d, 0) == 2);
    // Kaithi number sign ~ Devanagari digit four ~ Devanagari digit two
    assert(graphemeStride("\U000110BD\u096A\u0968"d, 0) == 2);
    // Kaithi number sign ~ null
    assert(graphemeStride("\U000110BD\0"d, 0) == 1);
}
7289 
/++
    Reads one full grapheme cluster from `inp`, an
    $(REF_ALTTEXT input range, isInputRange, std,range,primitives) of dchar,
    and returns it as a $(LREF Grapheme).

    For examples see the $(LREF Grapheme) below.

    Note:
    This function advances `inp` past the cluster it returns, so `inp`
    must be an L-value.
+/
Grapheme decodeGrapheme(Input)(ref Input inp)
if (isInputRange!Input && is(immutable ElementType!Input == immutable dchar))
{
    return genericDecodeGrapheme!(true)(inp);
}
7305 
@safe unittest
{
    import std.algorithm.comparison : equal;

    Grapheme cluster;
    string src = " \u0020\u0308 ";
    // a lone space first ...
    cluster = decodeGrapheme(src);
    assert(cluster.length == 1 && cluster[0] == ' ');
    // ... then a space carrying a combining diaeresis
    cluster = decodeGrapheme(src);
    assert(cluster.length == 2 && equal(cluster[0 .. 2], " \u0308"));

    src = "\u0300\u0308\u1100";
    assert(equal(decodeGrapheme(src)[], "\u0300\u0308"));
    assert(equal(decodeGrapheme(src)[], "\u1100"));

    src = "\u11A8\u0308\uAC01";
    assert(equal(decodeGrapheme(src)[], "\u11A8\u0308"));
    assert(equal(decodeGrapheme(src)[], "\uAC01"));

    // Two flags in a row, each made of the regional indicators G and B:
    // only the first pair forms the first cluster
    src = "\U0001F1EC\U0001F1E7\U0001F1EC\U0001F1E7";
    assert(equal(decodeGrapheme(src)[], "\U0001F1EC\U0001F1E7"));
}
7327 
/++
    $(P Iterate a string by $(LREF Grapheme).)

    $(P Useful for string manipulation that needs to be aware
    of graphemes.)

    See_Also:
        $(LREF byCodePoint)
+/
auto byGrapheme(Range)(Range range)
if (isInputRange!Range && is(immutable ElementType!Range == immutable dchar))
{
    // TODO: Bidirectional access
    static struct Result(R)
    {
        private R _source;          // code points not yet decoded
        private Grapheme _current;  // cluster decoded ahead; empty marks the end

        @property bool empty()
        {
            return _current.length == 0;
        }

        @property Grapheme front()
        {
            return _current;
        }

        void popFront()
        {
            if (_source.empty)
                _current = Grapheme.init;
            else
                _current = _source.decodeGrapheme();
        }

        static if (isForwardRange!R)
        {
            @property Result save()
            {
                return Result(_source.save, _current);
            }
        }
    }

    auto clusters = Result!(Range)(range);
    // decode the first cluster so that `front` and `empty` are ready
    clusters.popFront();
    return clusters;
}
7374 
///
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.range.primitives : walkLength;
    import std.range : take, drop;
    auto word = "noe\u0308l"; // noël spelled with e + combining diaeresis
    assert(word.walkLength == 5); // 5 code points

    auto clusters = word.byGrapheme;
    assert(clusters.walkLength == 4); // 4 graphemes

    // the e and its diaeresis travel together
    assert(clusters.take(3).equal("noe\u0308".byGrapheme));
    assert(clusters.drop(3).equal("l".byGrapheme));
}
7390 
// A string wrapper that is only an input range (it has no `save`),
// for testing with non-forward ranges.
version (StdUnittest)
private static @safe struct InputRangeString
{
    private string s;

    @property bool empty()
    {
        return s.empty;
    }

    @property dchar front()
    {
        return s.front;
    }

    void popFront()
    {
        s.popFront();
    }
}
7401 
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.array : array;
    import std.range : retro;
    import std.range.primitives : walkLength;
    // no clusters in an empty string
    assert("".byGrapheme.walkLength == 0);

    auto reversed = "le\u0308on";
    assert(reversed.walkLength == 5);

    auto gReversed = reversed.byGrapheme;
    assert(gReversed.walkLength == 4);

    // same checks for every string width
    static foreach (word; AliasSeq!("noe\u0308l"c, "noe\u0308l"w, "noe\u0308l"d))
    {{
        assert(word.walkLength == 5);
        static assert(isForwardRange!(typeof(word)));

        auto gWord = word.byGrapheme;
        static assert(isForwardRange!(typeof(gWord)));
        assert(gWord.walkLength == 4);
        assert(gWord.array.retro.equal(gReversed));
    }}

    // an input-only source yields an input-only result
    auto inputOnly = InputRangeString("noe\u0308l").byGrapheme;
    static assert(!isForwardRange!(typeof(inputOnly)));
    assert(inputOnly.walkLength == 4);
}

// Issue 23474
@safe pure unittest
{
    import std.range.primitives : walkLength;
    // a combining mark does not attach to a preceding CR
    assert(byGrapheme("\r\u0308").walkLength == 2);
}
7438 
/++
    $(P Lazily transform a range of $(LREF Grapheme)s to a range of code points.)

    $(P Useful for converting the result back to a string after operating
    on graphemes.)

    $(P If passed in a range of code points, returns a range with equivalent capabilities.)
+/
auto byCodePoint(Range)(Range range)
if (isInputRange!Range && is(immutable ElementType!Range == immutable Grapheme))
{
    // TODO: Propagate bidirectional access
    static struct Result
    {
        private Range _clusters;    // graphemes still to be walked
        private size_t _index = 0;  // position inside the front grapheme

        @property bool empty()
        {
            return _clusters.empty;
        }

        @property dchar front()
        {
            return _clusters.front[_index];
        }

        void popFront()
        {
            // stay inside the current grapheme while it has code points left
            if (++_index < _clusters.front.length)
                return;

            _clusters.popFront();
            _index = 0;
        }

        static if (isForwardRange!Range)
        {
            @property Result save()
            {
                return Result(_clusters.save, _index);
            }
        }
    }

    return Result(range);
}
7488 
/// Ditto
auto byCodePoint(Range)(Range range)
if (isInputRange!Range && is(immutable ElementType!Range == immutable dchar))
{
    import std.range.primitives : isBidirectionalRange, popBack;
    import std.traits : isNarrowString;
    static if (!isNarrowString!Range)
    {
        // already a range of code points: hand it back untouched
        return range;
    }
    else
    {
        // Narrow strings are wrapped in a bidirectional range that forwards
        // to the string's range primitives.
        static struct Result
        {
            private Range _range;

            @property bool empty()
            {
                return _range.empty;
            }

            @property dchar front()
            {
                return _range.front;
            }

            @property dchar back()
            {
                return _range.back;
            }

            void popFront()
            {
                _range.popFront;
            }

            void popBack()
            {
                _range.popBack;
            }

            @property auto save()
            {
                return Result(_range.save);
            }
        }
        static assert(isBidirectionalRange!(Result));
        return Result(range);
    }
}
7513 
///
@safe unittest
{
    import std.array : array;
    import std.conv : text;
    import std.range : retro;

    string s = "noe\u0308l"; // noël

    // split into graphemes, reverse them, and flatten back to a string
    auto clusters = s.byGrapheme.array;
    string reversed = clusters
        .retro
        .byCodePoint
        .text;

    assert(reversed == "le\u0308on"); // lëon
}

@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.range.primitives : walkLength;
    import std.range : retro;
    assert("".byGrapheme.byCodePoint.equal(""));

    string word = "noe\u0308l";
    // a narrow string has no length in code points
    static assert(!__traits(compiles, "noe\u0308l".byCodePoint.length));

    // input-only all the way through
    auto gWord = InputRangeString(word).byGrapheme;
    static assert(!isForwardRange!(typeof(gWord)));

    auto cpWord = gWord.byCodePoint;
    static assert(!isForwardRange!(typeof(cpWord)));

    assert(cpWord.walkLength == word.walkLength);

    // the wrapper over a narrow string is forward and bidirectional
    auto plainCp = word.byCodePoint;
    static assert(isForwardRange!(typeof(plainCp)));
    assert(equal(plainCp, word));
    assert(equal(retro(plainCp.save), retro(word.save)));
    // Check that we still have length for dstring
    assert("абвгд"d.byCodePoint.length == 5);
}
7558 
/++
    $(P A structure designed to effectively pack $(CHARACTERS)
    of a $(CLUSTER).
    )

    $(P `Grapheme` has value semantics so 2 copies of a `Grapheme`
    always refer to distinct objects. In most actual scenarios a `Grapheme`
    fits on the stack and avoids memory allocation overhead for all but quite
    long clusters.
    )

    See_Also: $(LREF decodeGrapheme), $(LREF graphemeStride)
+/
@safe struct Grapheme
{
    import std.exception : enforce;
    import std.traits : isDynamicArray;

public:
    /// Ctor
    this(C)(const scope C[] chars...)
        if (is(C : dchar))
    {
        this ~= chars;
    }

    ///ditto
    this(Input)(Input seq)
        if (!isDynamicArray!Input
            && isInputRange!Input && is(ElementType!Input : dchar))
    {
        this ~= seq;
    }

    /// Gets a $(CODEPOINT) at the given index in this cluster.
    dchar opIndex(size_t index) const @nogc nothrow pure @trusted
    {
        assert(index < length);
        // read from the heap buffer or from the inline one
        return read24(isBig ? ptr_ : small_.ptr, index);
    }

    /++
        Writes a $(CODEPOINT) `ch` at given index in this cluster.

        Warning:
        Use of this facility may invalidate grapheme cluster,
        see also $(LREF Grapheme.valid).
    +/
    void opIndexAssign(dchar ch, size_t index) @nogc nothrow pure @trusted
    {
        assert(index < length);
        write24(isBig ? ptr_ : small_.ptr, ch, index);
    }

    ///
    @safe unittest
    {
        auto g = Grapheme("A\u0302");
        assert(g[0] == 'A');
        assert(g.valid);
        g[1] = '~'; // ASCII tilde is not a combining mark
        assert(g[1] == '~');
        assert(!g.valid);
    }

    /++
        Random-access range over Grapheme's $(CHARACTERS).

        Warning: Invalidates when this Grapheme leaves the scope,
        attempts to use it then would lead to memory corruption.
    +/
    SliceOverIndexed!Grapheme opSlice(size_t a, size_t b) @nogc nothrow pure return
    {
        return sliceOverIndexed(a, b, &this);
    }

    /// ditto
    SliceOverIndexed!Grapheme opSlice() @nogc nothrow pure return
    {
        return sliceOverIndexed(0, length, &this);
    }

    /// Grapheme cluster length in $(CODEPOINTS).
    @property size_t length() const @nogc nothrow pure
    {
        // 0x7F is the same value as small_mask: it strips the "big" flag bit
        return isBig ? len_ : slen_ & 0x7F;
    }

    /++
        Append $(CHARACTER) `ch` to this grapheme.
        Warning:
        Use of this facility may invalidate grapheme cluster,
        see also `valid`.

        See_Also: $(LREF Grapheme.valid)
    +/
    ref opOpAssign(string op)(dchar ch) @trusted
    {
        static if (op == "~")
        {
            import std.internal.memory : enforceRealloc;
            if (!isBig)
            {
                if (slen_ == small_cap)
                    convertToBig();// & fallthrough to "big" branch
                else
                {
                    // still room in the inline buffer
                    write24(small_.ptr, ch, smallLength);
                    slen_++;
                    return this;
                }
            }

            assert(isBig);
            if (len_ == cap_)
            {
                // heap buffer is full: grow it by `grow` code points,
                // with overflow-checked size arithmetic
                import core.checkedint : addu, mulu;
                bool overflow;
                cap_ = addu(cap_, grow, overflow);
                // 3 bytes per code point, for cap_ + 1 code points
                auto nelems = mulu(3, addu(cap_, 1, overflow), overflow);
                if (overflow) assert(0);
                ptr_ = cast(ubyte*) enforceRealloc(ptr_, nelems);
            }
            write24(ptr_, ch, len_++);
            return this;
        }
        else
            static assert(false, "No operation "~op~" defined for Grapheme");
    }

    ///
    @safe unittest
    {
        import std.algorithm.comparison : equal;
        auto g = Grapheme("A");
        assert(g.valid);
        g ~= '\u0301';
        assert(g[].equal("A\u0301"));
        assert(g.valid);
        g ~= "B";
        // not a valid grapheme cluster anymore
        assert(!g.valid);
        // still could be useful though
        assert(g[].equal("A\u0301B"));
    }

    /// Append all $(CHARACTERS) from the input range `inp` to this Grapheme.
    ref opOpAssign(string op, Input)(scope Input inp)
        if (isInputRange!Input && is(ElementType!Input : dchar))
    {
        static if (op == "~")
        {
            foreach (dchar ch; inp)
                this ~= ch;
            return this;
        }
        else
            static assert(false, "No operation "~op~" defined for Grapheme");
    }

    // This is not a good `opEquals`, but formerly the automatically generated
    // opEquals was used, which was inferred `@safe` because of bugzilla 20655:
    // https://issues.dlang.org/show_bug.cgi?id=20655
    // This `@trusted opEquals` is only here to prevent breakage.
    bool opEquals(R)(const auto ref R other) const @trusted
    {
        return this.tupleof == other.tupleof;
    }

    // Define a default toHash to allow AA usage
    // It hashes slen_ and the raw bytes of small_; for a big grapheme those
    // bytes overlay ptr_, cap_ and len_ (see the union below).
    size_t toHash() const @trusted
    {
        return hashOf(slen_, hashOf(small_));
    }

    /++
        True if this object contains valid extended grapheme cluster.
        Decoding primitives of this module always return a valid `Grapheme`.

        Appending to and direct manipulation of grapheme's $(CHARACTERS) may
        render it no longer valid. Certain applications may choose to use
        Grapheme as a "small string" of any $(CODEPOINTS) and ignore this property
        entirely.
    +/
    @property bool valid()() /*const*/
    {
        // Decode one cluster from our own code points; the grapheme is valid
        // when that single cluster consumes all of them.
        // NOTE(review): genericDecodeGrapheme asserts on an empty range, so
        // an empty Grapheme trips that assert here - confirm this is intended.
        auto r = this[];
        genericDecodeGrapheme!false(r);
        return r.length == 0;
    }

    // Postblit: a copy of a big grapheme gets its own heap buffer.
    this(this) @nogc nothrow pure @trusted
    {
        import std.internal.memory : enforceMalloc;
        if (isBig)
        {// dup it
            import core.checkedint : addu, mulu;
            bool overflow;
            auto raw_cap = mulu(3, addu(cap_, 1, overflow), overflow);
            if (overflow) assert(0);

            auto p = cast(ubyte*) enforceMalloc(raw_cap);
            p[0 .. raw_cap] = ptr_[0 .. raw_cap];
            ptr_ = p;
        }
    }

    // Only a big grapheme owns heap memory.
    ~this() @nogc nothrow pure @trusted
    {
        import core.memory : pureFree;
        if (isBig)
        {
            pureFree(ptr_);
        }
    }


private:
    // size of the "big" struct below minus the one byte taken by slen_
    enum small_bytes = ((ubyte*).sizeof+3*size_t.sizeof-1);
    // "out of the blue" grow rate, needs testing
    // (though graphemes are typically small < 9)
    enum grow = 20;
    // each code point is stored in 3 bytes
    enum small_cap = small_bytes/3;
    // slen_ layout: top bit flags the big representation,
    // low 7 bits hold the length of the small one
    enum small_flag = 0x80, small_mask = 0x7F;
    // 16 bytes in 32bits, should be enough for the majority of cases
    union
    {
        // big (heap) representation
        struct
        {
            ubyte* ptr_;
            size_t cap_;
            size_t len_;
            size_t padding_;
        }
        // small (inline) representation; slen_ is the very last byte
        struct
        {
            ubyte[small_bytes] small_;
            ubyte slen_;
        }
    }

    // Moves the inline code points to a freshly allocated heap buffer of
    // capacity `grow` and switches to the big representation.
    void convertToBig() @nogc nothrow pure @trusted
    {
        import std.internal.memory : enforceMalloc;
        static assert(grow.max / 3 - 1 >= grow);
        enum nbytes = 3 * (grow + 1);
        size_t k = smallLength;
        ubyte* p = cast(ubyte*) enforceMalloc(nbytes);
        for (int i=0; i<k; i++)
            write24(p, read24(small_.ptr, i), i);
        // now we can overwrite small array data
        ptr_ = p;
        len_ = slen_;
        assert(grow > len_);
        cap_ = grow;
        setBig();
    }

    void setBig() @nogc nothrow pure { slen_ |= small_flag; }

    @property size_t smallLength() const @nogc nothrow pure
    {
        return slen_ & small_mask;
    }
    // non-zero when the heap representation is in use
    @property ubyte isBig() const @nogc nothrow pure
    {
        return slen_ & small_flag;
    }
}
7828 
// Both union views of Grapheme must occupy exactly four machine words.
static assert(Grapheme.sizeof == size_t.sizeof*4);
7830 
7831 
// byGrapheme over a string of three single-code-point clusters must compare
// equal, element by element, to graphemes built directly from each letter.
@safe pure /*nothrow @nogc*/ unittest // TODO: string .front is GC and throw
{
    import std.algorithm.comparison : equal;
    Grapheme[3] data = [Grapheme("Ю"), Grapheme("У"), Grapheme("З")];
    assert(byGrapheme("ЮУЗ").equal(data[]));
}
7838 
///
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.algorithm.iteration : filter;
    import std.range : isRandomAccessRange;

    string bold = "ku\u0308hn";

    // note that decodeGrapheme takes parameter by ref
    // (each call below consumes one cluster from the front of `bold`)
    auto first = decodeGrapheme(bold);

    assert(first.length == 1);
    assert(first[0] == 'k');

    // the next grapheme is 2 characters long
    auto wideOne = decodeGrapheme(bold);
    // slicing a grapheme yields a random-access range of dchar
    assert(wideOne[].equal("u\u0308"));
    assert(wideOne.length == 2);
    static assert(isRandomAccessRange!(typeof(wideOne[])));

    // all of the usual range manipulation is possible
    assert(wideOne[].filter!isMark().equal("\u0308"));

    auto g = Grapheme("A");
    assert(g.valid);
    // appending a combining mark keeps the cluster valid
    g ~= '\u0301';
    assert(g[].equal("A\u0301"));
    assert(g.valid);
    g ~= "B";
    // not a valid grapheme cluster anymore
    assert(!g.valid);
    // still could be useful though
    assert(g[].equal("A\u0301B"));
}
7875 
// Overwriting a combining mark through opIndexAssign can invalidate the cluster.
@safe unittest
{
    auto g = Grapheme("A\u0302");
    assert(g[0] == 'A');
    assert(g.valid);
    g[1] = '~'; // ASCII tilde is not a combining mark
    assert(g[1] == '~');
    assert(!g.valid);
}
7885 
// Exercises indexing, index assignment, appending, copying and slicing of
// Grapheme with inputs of several different lengths.
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.algorithm.iteration : map;
    import std.conv : text;
    import std.range : iota;

    // not valid clusters (but it's just a test)
    auto g  = Grapheme('a', 'b', 'c', 'd', 'e');
    assert(g[0] == 'a');
    assert(g[1] == 'b');
    assert(g[2] == 'c');
    assert(g[3] == 'd');
    assert(g[4] == 'e');
    // replacing one element must leave its neighbours untouched
    g[3] = 'Й';
    assert(g[2] == 'c');
    assert(g[3] == 'Й', text(g[3], " vs ", 'Й'));
    assert(g[4] == 'e');
    assert(!g.valid);

    // appending must preserve everything stored so far
    g ~= 'ц';
    g ~= '~';
    assert(g[0] == 'a');
    assert(g[1] == 'b');
    assert(g[2] == 'c');
    assert(g[3] == 'Й');
    assert(g[4] == 'e');
    assert(g[5] == 'ц');
    assert(g[6] == '~');
    assert(!g.valid);

    // a copy must not share storage with the original
    Grapheme copy = g;
    copy[0] = 'X';
    copy[1] = '-';
    assert(g[0] == 'a' && copy[0] == 'X');
    assert(g[1] == 'b' && copy[1] == '-');
    assert(equal(g[2 .. g.length], copy[2 .. copy.length]));
    copy = Grapheme("АБВГДЕЁЖЗИКЛМ");
    assert(equal(copy[0 .. 8], "АБВГДЕЁЖ"), text(copy[0 .. 8]));
    copy ~= "xyz";
    assert(equal(copy[13 .. 15], "xy"), text(copy[13 .. 15]));
    assert(!copy.valid);

    // grow an initially empty grapheme one code point at a time, 'A' through 'Z'
    Grapheme h;
    foreach (dchar v; iota(cast(int)'A', cast(int)'Z'+1).map!"cast(dchar)a"())
        h ~= v;
    assert(equal(h[], iota(cast(int)'A', cast(int)'Z'+1)));
}
7934 
// ensure Grapheme can be used as an AA key.
// (compile-only check: declaring the AA type is the whole test)
@safe unittest
{
    int[Grapheme] aa;
}
7940 
/++
    $(P Does basic case-insensitive comparison of `r1` and `r2`.
    This function uses a simpler comparison rule, thus achieving better performance
    than $(LREF icmp). However keep in mind the warning below.)

    Params:
        r1 = an $(REF_ALTTEXT input range, isInputRange, std,range,primitives) of characters
        r2 = an $(REF_ALTTEXT input range, isInputRange, std,range,primitives) of characters

    Returns:
        An `int` that is 0 if the strings match,
        &lt;0 if `r1` is lexicographically "less" than `r2`,
        &gt;0 if `r1` is lexicographically "greater" than `r2`

    Warning:
    This function only handles 1:1 $(CODEPOINT) mapping
    and thus is not sufficient for certain alphabets
    like German, Greek and a few others.

    See_Also:
        $(LREF icmp)
        $(REF cmp, std,algorithm,comparison)
+/
int sicmp(S1, S2)(scope S1 r1, scope S2 r2)
if (isInputRange!S1 && isSomeChar!(ElementEncodingType!S1)
    && isInputRange!S2 && isSomeChar!(ElementEncodingType!S2))
{
    import std.internal.unicode_tables : sTable = simpleCaseTable; // generated file
    import std.range.primitives : isInfinite;
    import std.utf : decodeFront;
    import std.traits : isDynamicArray;
    import std.typecons : Yes;
    static import std.ascii;

    // The fast path is only compiled in when both inputs can be indexed and
    // re-sliced; the __traits(compiles) probe checks the slicing, the value
    // of `s` itself is irrelevant.
    static if ((isDynamicArray!S1 || isRandomAccessRange!S1)
        && (isDynamicArray!S2 || isRandomAccessRange!S2)
        && !(isInfinite!S1 && isInfinite!S2)
        && __traits(compiles,
            {
                size_t s = size_t.sizeof / 2;
                r1 = r1[s .. $];
                r2 = r2[s .. $];
            }))
    {{
        // ASCII optimization for dynamic arrays & similar.
        size_t i = 0;
        static if (isInfinite!S1)
            immutable end = r2.length;
        else static if (isInfinite!S2)
            immutable end = r1.length;
        else
            immutable end = r1.length > r2.length ? r2.length : r1.length;
        for (; i < end; ++i)
        {
            auto lhs = r1[i];
            auto rhs = r2[i];
            // Any code unit outside ASCII hands the rest over to the generic loop.
            if ((lhs | rhs) >= 0x80) goto NonAsciiPath;
            if (lhs == rhs) continue;
            auto lowDiff = std.ascii.toLower(lhs) - std.ascii.toLower(rhs);
            if (lowDiff) return lowDiff;
        }
        // The common prefix matched completely: the longer input is "greater".
        static if (isInfinite!S1)
            return 1;
        else static if (isInfinite!S2)
            return -1;
        else
            return (r1.length > r2.length) - (r2.length > r1.length);

    NonAsciiPath:
        // Drop the already-compared ASCII prefix from both inputs.
        r1 = r1[i .. $];
        r2 = r2[i .. $];
        // Fall through to standard case.
    }}

    while (!r1.empty)
    {
        immutable lhs = decodeFront!(Yes.useReplacementDchar)(r1);
        if (r2.empty)
            return 1;
        immutable rhs = decodeFront!(Yes.useReplacementDchar)(r2);
        int diff = lhs - rhs;
        if (!diff)
            continue;
        if ((lhs | rhs) < 0x80)
        {
            immutable d = std.ascii.toLower(lhs) - std.ascii.toLower(rhs);
            if (!d) continue;
            return d;
        }
        size_t idx = simpleCaseTrie[lhs];
        size_t idx2 = simpleCaseTrie[rhs];
        // simpleCaseTrie is packed index table
        if (idx != EMPTY_CASE_TRIE)
        {
            if (idx2 != EMPTY_CASE_TRIE)
            {// both cased chars
                // adjust idx --> start of bucket
                idx = idx - sTable[idx].n;
                idx2 = idx2 - sTable[idx2].n;
                if (idx == idx2)// one bucket, equivalent chars
                    continue;
                else//  not the same bucket
                    diff = sTable[idx].ch - sTable[idx2].ch;
            }
            else
                // only lhs is cased: compare its bucket's first entry against rhs
                diff = sTable[idx - sTable[idx].n].ch - rhs;
        }
        else if (idx2 != EMPTY_CASE_TRIE)
        {
            // only rhs is cased: compare lhs against the first entry of rhs's bucket
            diff = lhs - sTable[idx2 - sTable[idx2].n].ch;
        }
        // one of chars is not cased at all
        return diff;
    }
    // r1 is exhausted: equal if r2 is exhausted too, otherwise r1 is "less".
    return int(r2.empty) - 1;
}
8057 
///
@safe @nogc pure nothrow unittest
{
    // strings differing only in letter case compare equal
    assert(sicmp("Август", "авгусТ") == 0);
    // Greek also works as long as there is no 1:M mapping in sight
    assert(sicmp("ΌΎ", "όύ") == 0);
    // things like the following won't get matched as equal
    // Greek small letter iota with dialytika and tonos
    assert(sicmp("ΐ", "\u03B9\u0308\u0301") != 0);

    // while icmp has no problem with that
    assert(icmp("ΐ", "\u03B9\u0308\u0301") == 0);
    assert(icmp("ΌΎ", "όύ") == 0);
}
8072 
// overloads for the most common cases to reduce compile time
// Each one forwards to the range template with explicit template arguments
// and pins the attributes listed on the enclosing block.
@safe @nogc pure nothrow
{
    int sicmp(scope const(char)[] str1, scope const(char)[] str2)
    { return sicmp!(const(char)[], const(char)[])(str1, str2); }

    int sicmp(scope const(wchar)[] str1, scope const(wchar)[] str2)
    { return sicmp!(const(wchar)[], const(wchar)[])(str1, str2); }

    int sicmp(scope const(dchar)[] str1, scope const(dchar)[] str2)
    { return sicmp!(const(dchar)[], const(dchar)[])(str1, str2); }
}
8085 
/*
    Helper of icmp: tries to match `lhs` against `rhs`, optionally followed by
    code points taken from the front of `rtail`, using the full case folding table.

    Returns 0 on a match (`rtail` is advanced only when a multi-code-point
    entry matched). Otherwise returns `lhs` itself if it has no entry in the
    table, or the first code point of its folding bucket.
*/
private int fullCasedCmp(Range)(dchar lhs, dchar rhs, ref Range rtail)
{
    import std.algorithm.searching : skipOver;
    import std.internal.unicode_tables : fullCaseTable; // generated file

    // fullCaseTrie is packed index table
    immutable size_t hit = fullCaseTrie[lhs];
    if (hit == EMPTY_CASE_TRIE)
        return lhs;

    // `n` is the position of the entry within its bucket, `size` the bucket length.
    immutable size_t bucketStart = hit - fullCaseTable[hit].n;
    immutable size_t bucketEnd = bucketStart + fullCaseTable[hit].size;
    assert(fullCaseTable[bucketStart].entry_len == 1);

    foreach (size_t pos; bucketStart .. bucketEnd)
    {
        immutable len = fullCaseTable[pos].entry_len;
        if (len == 1)
        {
            if (fullCaseTable[pos].seq[0] == rhs)
                return 0;
            continue;
        }
        // OK it's a long chunk, like 'ss' for German
        dstring folded = fullCaseTable[pos].seq[0 .. len];
        // note that this path modifies rtail
        // iff we managed to get there
        if (rhs == folded[0] && rtail.skipOver(folded[1 .. $]))
            return 0;
    }
    return fullCaseTable[bucketStart].seq[0]; // new remapped character for accurate diffs
}
8122 
/++
    Does case insensitive comparison of `r1` and `r2`.
    Follows the rules of full case-folding mapping.
    This includes matching as equal German ß with "ss" and
    other 1:M $(CODEPOINT) mappings unlike $(LREF sicmp).
    The cost of `icmp` being pedantically correct is
    slightly worse performance.

    Params:
        r1 = a forward range of characters
        r2 = a forward range of characters

    Returns:
        An `int` that is 0 if the strings match,
        &lt;0 if `r1` is lexicographically "less" than `r2`,
        &gt;0 if `r1` is lexicographically "greater" than `r2`

    See_Also:
        $(LREF sicmp)
        $(REF cmp, std,algorithm,comparison)
+/
int icmp(S1, S2)(S1 r1, S2 r2)
if (isForwardRange!S1 && isSomeChar!(ElementEncodingType!S1)
    && isForwardRange!S2 && isSomeChar!(ElementEncodingType!S2))
{
    import std.range.primitives : isInfinite;
    import std.traits : isDynamicArray;
    import std.utf : byDchar;
    static import std.ascii;

    // The fast path is only compiled in when both inputs can be indexed and
    // re-sliced; the __traits(compiles) probe checks the slicing, the value
    // of `s` itself is irrelevant.
    static if ((isDynamicArray!S1 || isRandomAccessRange!S1)
        && (isDynamicArray!S2 || isRandomAccessRange!S2)
        && !(isInfinite!S1 && isInfinite!S2)
        && __traits(compiles,
            {
                size_t s = size_t.max / 2;
                r1 = r1[s .. $];
                r2 = r2[s .. $];
            }))
    {{
        // ASCII optimization for dynamic arrays & similar.
        size_t i = 0;
        static if (isInfinite!S1)
            immutable end = r2.length;
        else static if (isInfinite!S2)
            immutable end = r1.length;
        else
            immutable end = r1.length > r2.length ? r2.length : r1.length;
        for (; i < end; ++i)
        {
            auto lhs = r1[i];
            auto rhs = r2[i];
            // Any code unit outside ASCII hands the rest over to the generic loop.
            if ((lhs | rhs) >= 0x80) goto NonAsciiPath;
            if (lhs == rhs) continue;
            auto lowDiff = std.ascii.toLower(lhs) - std.ascii.toLower(rhs);
            if (lowDiff) return lowDiff;
        }
        // The common prefix matched completely: the longer input is "greater".
        static if (isInfinite!S1)
            return 1;
        else static if (isInfinite!S2)
            return -1;
        else
            return (r1.length > r2.length) - (r2.length > r1.length);

    NonAsciiPath:
        // Drop the already-compared ASCII prefix from both inputs.
        r1 = r1[i .. $];
        r2 = r2[i .. $];
        // Fall through to standard case.
    }}

    auto str1 = r1.byDchar;
    auto str2 = r2.byDchar;

    for (;;)
    {
        if (str1.empty)
            return str2.empty ? 0 : -1;
        immutable lhs = str1.front;
        if (str2.empty)
            return 1;
        immutable rhs = str2.front;
        str1.popFront();
        str2.popFront();
        if (!(lhs - rhs))
            continue;
        // first try to match lhs to <rhs,right-tail> sequence
        immutable cmpLR = fullCasedCmp(lhs, rhs, str2);
        if (!cmpLR)
            continue;
        // then rhs to <lhs,left-tail> sequence
        immutable cmpRL = fullCasedCmp(rhs, lhs, str1);
        if (!cmpRL)
            continue;
        // cmpXX contain remapped codepoints
        // to obtain stable ordering of icmp
        return cmpLR - cmpRL;
    }
}
8221 
///
@safe @nogc pure nothrow unittest
{
    // 1:M foldings: a single code point matches a multi-code-point sequence
    assert(icmp("Rußland", "Russland") == 0);
    assert(icmp("ᾩ -> \u1F70\u03B9", "\u1F61\u03B9 -> ᾲ") == 0);
}
8228 
/**
 * By using $(REF byUTF, std,utf) and its aliases, GC allocations via auto-decoding
 * and thrown exceptions can be avoided, making `icmp` `@safe @nogc nothrow pure`.
 */
@safe @nogc nothrow pure unittest
{
    import std.utf : byDchar;

    // same comparisons as the string example, fed through byDchar ranges
    assert(icmp("Rußland".byDchar, "Russland".byDchar) == 0);
    assert(icmp("ᾩ -> \u1F70\u03B9".byDchar, "\u1F61\u03B9 -> ᾲ".byDchar) == 0);
}
8240 
// test different character types
// (mixes of char, wchar and dchar strings on either side)
@safe unittest
{
    assert(icmp("Rußland", "Russland") == 0);
    assert(icmp("Rußland"w, "Russland") == 0);
    assert(icmp("Rußland", "Russland"w) == 0);
    assert(icmp("Rußland"w, "Russland"w) == 0);
    assert(icmp("Rußland"d, "Russland"w) == 0);
    assert(icmp("Rußland"w, "Russland"d) == 0);
}
8251 
// overloads for the most common cases to reduce compile time
// Each one forwards to the range template with explicit template arguments
// and pins the attributes listed on the enclosing block.
@safe @nogc pure nothrow
{
    int icmp(const(char)[] str1, const(char)[] str2)
    { return icmp!(const(char)[], const(char)[])(str1, str2); }
    int icmp(const(wchar)[] str1, const(wchar)[] str2)
    { return icmp!(const(wchar)[], const(wchar)[])(str1, str2); }
    int icmp(const(dchar)[] str1, const(dchar)[] str2)
    { return icmp!(const(dchar)[], const(dchar)[])(str1, str2); }
}
8262 
// Runs icmp and sicmp over every pairing of string, wstring and dstring,
// wrapped in assertCTFEable.
@safe unittest
{
    import std.algorithm.sorting : sort;
    import std.conv : to;
    import std.exception : assertCTFEable;
    assertCTFEable!(
    {
    static foreach (cfunc; AliasSeq!(icmp, sicmp))
    {{
        static foreach (S1; AliasSeq!(string, wstring, dstring))
        static foreach (S2; AliasSeq!(string, wstring, dstring))
        {
            // empty inputs and prefix relations
            assert(cfunc("".to!S1(), "".to!S2()) == 0);
            assert(cfunc("A".to!S1(), "".to!S2()) > 0);
            assert(cfunc("".to!S1(), "0".to!S2()) < 0);
            assert(cfunc("abc".to!S1(), "abc".to!S2()) == 0);
            assert(cfunc("abcd".to!S1(), "abc".to!S2()) > 0);
            assert(cfunc("abc".to!S1(), "abcd".to!S2()) < 0);
            // case differences only
            assert(cfunc("Abc".to!S1(), "aBc".to!S2()) == 0);
            assert(cfunc("авГуст".to!S1(), "АВгУСТ".to!S2()) == 0);
            // Check example:
            assert(cfunc("Август".to!S1(), "авгусТ".to!S2()) == 0);
            assert(cfunc("ΌΎ".to!S1(), "όύ".to!S2()) == 0);
        }
        // check that the order is properly agnostic to the case
        auto strs = [ "Apple", "ORANGE",  "orAcle", "amp", "banana"];
        sort!((a,b) => cfunc(a,b) < 0)(strs);
        assert(strs == ["amp", "Apple",  "banana", "orAcle", "ORANGE"]);
    }}
    // ordering after a 1:M match is decided by the remaining characters
    assert(icmp("ßb", "ssa") > 0);
    // Check example:
    assert(icmp("Russland", "Rußland") == 0);
    assert(icmp("ᾩ -> \u1F70\u03B9", "\u1F61\u03B9 -> ᾲ") == 0);
    assert(icmp("ΐ"w, "\u03B9\u0308\u0301") == 0);
    assert(sicmp("ΐ", "\u03B9\u0308\u0301") != 0);
    // https://issues.dlang.org/show_bug.cgi?id=11057
    assert( icmp("K", "L") < 0 );
    });
}
8302 
// https://issues.dlang.org/show_bug.cgi?id=17372
// Compile-time regression check: icmp must accept the `joiner` results
// produced here, which are not arrays.
@safe pure unittest
{
    import std.algorithm.iteration : joiner, map;
    import std.algorithm.sorting : sort;
    import std.array : array;
    auto a = [["foo", "bar"], ["baz"]].map!(line => line.joiner(" ")).array.sort!((a, b) => icmp(a, b) < 0);
}
8311 
// This is package(std) for the moment to be used as a support tool for std.regex
// It needs a better API
/*
    Return a range of all $(CODEPOINTS) that casefold to
    and from this `ch`.

    If `ch` has no entry in the simple case folding table, the result is a
    one-element range holding `ch` itself. This includes U+0000: "empty" is
    tracked through `idx`/`len` rather than by treating a stored code point
    of 0 as the end marker.
*/
package(std) auto simpleCaseFoldings(dchar ch) @safe
{
    import std.internal.unicode_tables : simpleCaseTable; // generated file
    alias sTable = simpleCaseTable;
    static struct Range
    {
    @safe pure nothrow:
        uint idx; // if == uint.max, the range holds the single code point `c`
        union
        {
            dchar c;  // the lone code point; meaningful only while isSmall
            uint len; // table entries left; meaningful only while !isSmall
        }
        @property bool isSmall() const { return idx == uint.max; }

        // Single-code-point range for characters without case foldings.
        this(dchar ch)
        {
            idx = uint.max;
            c = ch;
        }

        // Range over `size` consecutive table entries beginning at `start`.
        this(uint start, uint size)
        {
            idx = start;
            len = size;
        }

        @property dchar front() const
        {
            assert(!empty);
            if (isSmall)
            {
                return c;
            }
            auto ch = sTable[idx].ch;
            return ch;
        }

        @property bool empty() const
        {
            // A small range always holds exactly one code point; it turns
            // into an exhausted table range when popped (see popFront).
            // `len` is read only when it was the member written last.
            return !isSmall && len == 0;
        }

        @property size_t length() const
        {
            return isSmall ? 1 : len;
        }

        void popFront()
        {
            if (isSmall)
            {
                // Become an exhausted table range instead of storing a
                // sentinel code point, so that U+0000 stays representable.
                idx = 0;
                len = 0;
            }
            else
            {
                idx++;
                len--;
            }
        }
    }
    immutable idx = simpleCaseTrie[ch];
    if (idx == EMPTY_CASE_TRIE)
        return Range(ch);
    auto entry = sTable[idx];
    // `n` is the position of the entry within its bucket
    immutable start = idx - entry.n;
    return Range(start, entry.size);
}
8392 
// Covers a two-member bucket, an uncased character and a three-member bucket,
// all under assertCTFEable.
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.algorithm.searching : canFind;
    import std.array : array;
    import std.exception : assertCTFEable;
    assertCTFEable!((){
        auto r = simpleCaseFoldings('Э').array;
        assert(r.length == 2);
        assert(r.canFind('э') && r.canFind('Э'));
        // a character without case foldings yields just itself
        auto sr = simpleCaseFoldings('~');
        assert(sr.equal("~"));
        //A with ring above - casefolds to the same bucket as Angstrom sign
        sr = simpleCaseFoldings('Å');
        assert(sr.length == 3);
        assert(sr.canFind('å') && sr.canFind('Å') && sr.canFind('\u212B'));
    });
}
8411 
/++
    $(P Returns the $(S_LINK Combining class, combining class) of `ch`.)

    The value is a plain lookup of `ch` in the module's combining class trie.
+/
ubyte combiningClass(dchar ch) @safe pure nothrow @nogc
{
    return combiningClassTrie[ch];
}
8419 
///
@safe unittest
{
    // shorten the code
    alias CC = combiningClass;

    // combining tilde
    assert(CC('\u0303') == 230);
    // combining ring below
    assert(CC('\u0325') == 220);
    // the simple consequence is that "tilde" should be
    // placed after a "ring below" in a sequence
}
8433 
// All of ASCII has combining class 0; then spot checks of a few marks.
@safe pure nothrow @nogc unittest
{
    foreach (ch; 0 .. 0x80)
        assert(combiningClass(ch) == 0);
    assert(combiningClass('\u05BD') == 22);
    assert(combiningClass('\u0300') == 230);
    assert(combiningClass('\u0317') == 220);
    assert(combiningClass('\u1939') == 222);
}
8443 
/// Unicode character decomposition type.
enum UnicodeDecomposition {
    /// Canonical decomposition. The result is a canonically equivalent sequence.
    Canonical,
    /**
         Compatibility decomposition. The result is a compatibility equivalent sequence.
         Note: Compatibility decomposition is a $(B lossy) conversion,
         typically suitable only for fuzzy matching and internal processing.
    */
    Compatibility
}
8455 
/**
    Shorthand aliases for the character decomposition types, passed as a
    template parameter to $(LREF decompose).
*/
enum {
    Canonical = UnicodeDecomposition.Canonical,
    Compatibility = UnicodeDecomposition.Compatibility
}
8464 
/++
    Try to canonically compose 2 $(CHARACTERS).
    Returns the composed $(CHARACTER) if they do compose and dchar.init otherwise.

    The assumption is that `first` comes before `second` in the original text,
    usually meaning that the first is a starter.

    Note: Hangul syllables are not covered by this function.
    See `composeJamo` below.
+/
public dchar compose(dchar first, dchar second) pure nothrow @safe
{
    import std.internal.unicode_comp : compositionTable, composeCntShift, composeIdxMask;

    immutable jump = compositionJumpTrie[first];
    if (jump == ushort.max)
        return dchar.init;

    // unpack offset and length
    immutable size_t base = jump & composeIdxMask;
    immutable size_t count = jump >> composeCntShift;
    auto candidates = compositionTable[base .. base + count];

    // Lower-bound binary search over the `rhs` fields, which are treated as
    // sorted within the slice: find the first entry whose `rhs` is not less
    // than `second`.
    size_t lo = 0;
    size_t hi = count;
    while (lo < hi)
    {
        immutable mid = lo + (hi - lo) / 2;
        if (candidates[mid].rhs < second)
            lo = mid + 1;
        else
            hi = mid;
    }

    if (lo == count)
        return dchar.init;
    immutable entry = candidates[lo];
    return entry.rhs == second ? entry.composed : dchar.init;
}
8495 
///
@safe unittest
{
    assert(compose('A','\u0308') == '\u00C4');
    // characters that do not compose yield dchar.init
    assert(compose('A', 'B') == dchar.init);
    assert(compose('C', '\u0301') == '\u0106');
    // note that the starter is the first one
    // thus the following doesn't compose
    assert(compose('\u0308', 'A') == dchar.init);
}
8506 
/++
    Returns a full $(S_LINK Canonical decomposition, Canonical)
    (by default) or $(S_LINK Compatibility decomposition, Compatibility)
    decomposition of $(CHARACTER) `ch`.
    If no decomposition is available returns a $(LREF Grapheme)
    with the `ch` itself.

    Note:
    This function also decomposes hangul syllables
    as prescribed by the standard.

    See_Also: $(LREF decomposeHangul) for a restricted version
    that takes into account only hangul syllables but
    no other decompositions.
+/
public Grapheme decompose(UnicodeDecomposition decompType=Canonical)(dchar ch) @safe
{
    import std.algorithm.searching : until;
    import std.internal.unicode_decomp : decompCompatTable, decompCanonTable;

    // Select the table/trie pair that belongs to the requested decomposition type.
    static if (decompType == Canonical)
    {
        alias table = decompCanonTable;
        alias mapping = canonMappingTrie;
    }
    else static if (decompType == Compatibility)
    {
        alias table = decompCompatTable;
        alias mapping = compatMappingTrie;
    }

    immutable offset = mapping[ch];
    // A non-zero offset points at a 0-terminated run inside the table.
    if (offset)
        return Grapheme(table[offset .. $].until(0));
    // not found, check hangul arithmetic decomposition
    return decomposeHangul(ch);
}
8542 
///
@safe unittest
{
    import std.algorithm.comparison : equal;

    assert(compose('A','\u0308') == '\u00C4');
    assert(compose('A', 'B') == dchar.init);
    assert(compose('C', '\u0301') == '\u0106');
    // note that the starter is the first one
    // thus the following doesn't compose
    assert(compose('\u0308', 'A') == dchar.init);

    assert(decompose('Ĉ')[].equal("C\u0302"));
    // a character without a decomposition is returned as is
    assert(decompose('D')[].equal("D"));
    // Hangul syllables are decomposed as well
    assert(decompose('\uD4DC')[].equal("\u1111\u1171\u11B7"));
    assert(decompose!Compatibility('¹')[].equal("1"));
}
8560 
//----------------------------------------------------------------------------
// Hangul specific composition/decomposition
// Constants for the arithmetic Hangul syllable mapping used by the helpers below.
enum jamoSBase = 0xAC00; // first precomposed syllable
enum jamoLBase = 0x1100; // first leading consonant (L)
enum jamoVBase = 0x1161; // first vowel (V)
enum jamoTBase = 0x11A7; // one before the first trailing consonant (T); index 0 means "no T"
enum jamoLCount = 19, jamoVCount = 21, jamoTCount = 28;
enum jamoNCount = jamoVCount * jamoTCount; // syllables per leading consonant
enum jamoSCount = jamoLCount * jamoNCount; // total number of precomposed syllables
8570 
// Tests if `ch` is a Hangul leading consonant jamo.
bool isJamoL(dchar ch) pure nothrow @nogc @safe
{
    enum pastLast = jamoLBase + jamoLCount;
    // the upper-bound check goes first: it rejects ~ 1M code points
    // above the leading jamo range
    if (!(ch < pastLast))
        return false;
    return ch >= jamoLBase;
}
8577 
// Tests if `ch` is a Hangul trailing consonant jamo.
bool isJamoT(dchar ch) pure nothrow @nogc @safe
{
    enum pastLast = jamoTBase + jamoTCount;
    // the upper-bound check goes first: it rejects ~ 1M code points
    // above the trailing jamo range
    if (!(ch < pastLast))
        return false;
    // Note: ch == jamoTBase doesn't indicate trailing jamo (TIndex must be > 0)
    return ch > jamoTBase;
}
8585 
// Tests if `ch` is a Hangul vowel jamo.
bool isJamoV(dchar ch) pure nothrow @nogc @safe
{
    enum pastLast = jamoVBase + jamoVCount;
    // the upper-bound check goes first: it rejects ~ 1M code points
    // above the vowel range
    if (!(ch < pastLast))
        return false;
    return ch >= jamoVBase;
}
8592 
// Returns the zero-based offset of `ch` from `jamoSBase` when it falls within
// the `jamoSCount` precomposed Hangul syllables, and -1 for anything else.
int hangulSyllableIndex(dchar ch) pure nothrow @nogc @safe
{
    immutable int offset = cast(int) ch - jamoSBase;
    if (offset < 0 || offset >= jamoSCount)
        return -1;
    return offset;
}
8598 
// internal helper: compose hangul syllables leaving dchar.init in holes
// Scans `seq` left to right; every <L,V> or <L,V,T> run is replaced in place by
// the composed syllable in its first slot, and the slots it consumed are
// overwritten with dchar.init.
void hangulRecompose(scope dchar[] seq) pure nothrow @nogc @safe
{
    size_t pos = 0;
    while (pos + 1 < seq.length)
    {
        // A syllable can only start with a leading consonant followed by a vowel.
        if (!isJamoL(seq[pos]) || !isJamoV(seq[pos+1]))
        {
            pos++;
            continue;
        }

        immutable int indexL = seq[pos] - jamoLBase;
        immutable int indexV = seq[pos+1] - jamoVBase;
        immutable int indexLV = indexL * jamoNCount + indexV * jamoTCount;
        immutable bool hasTrailing = pos + 2 < seq.length && isJamoT(seq[pos+2]);

        if (hasTrailing)
        {
            // <L,V,T> --> LVT syllable
            seq[pos] = jamoSBase + indexLV + seq[pos+2] - jamoTBase;
            seq[pos+1] = dchar.init;
            seq[pos+2] = dchar.init;
            pos += 3;
        }
        else
        {
            // <L,V> --> LV syllable
            seq[pos] = jamoSBase + indexLV;
            seq[pos+1] = dchar.init;
            pos += 2;
        }
    }
}
8627 
8628 //----------------------------------------------------------------------------
8629 public:
8630 
/**
    Decomposes a Hangul syllable. If `ch` is not a composed syllable
    then this function returns a $(LREF Grapheme) containing only `ch` as is.
*/
Grapheme decomposeHangul(dchar ch) nothrow pure @safe
{
    immutable idxS = cast(int) ch - jamoSBase;
    // outside of the precomposed syllable block: nothing to decompose
    if (idxS < 0 || idxS >= jamoSCount) return Grapheme(ch);

    // Split the syllable index into its L, V and T components.
    immutable idxL = idxS / jamoNCount;
    immutable idxV = (idxS % jamoNCount) / jamoTCount;
    immutable idxT = idxS % jamoTCount;

    immutable partL = jamoLBase + idxL;
    immutable partV = jamoVBase + idxV;
    if (idxT == 0) // no trailing consonant; <L, V> decomposition
        return Grapheme(partL, partV);
    // there is a trailing consonant (T); <L,V,T> decomposition
    return Grapheme(partL, partV, jamoTBase + idxT);
}
8650 
///
@safe unittest
{
    import std.algorithm.comparison : equal;
    // an LVT syllable splits into leading consonant, vowel and trailing consonant
    assert(decomposeHangul('\uD4DB')[].equal("\u1111\u1171\u11B6"));
}
8657 
/++
    Try to compose a hangul syllable out of a leading consonant (`lead`),
    a `vowel` and an optional `trailing` consonant jamo.

    On success returns the composed LV or LVT hangul syllable.

    If either `lead` or `vowel` is not a valid hangul jamo
    of the respective $(CHARACTER) class, returns dchar.init.
    A `trailing` that is not a trailing consonant jamo is ignored,
    giving the LV syllable.
+/
dchar composeJamo(dchar lead, dchar vowel, dchar trailing=dchar.init) pure nothrow @nogc @safe
{
    if (!isJamoL(lead) || !isJamoV(vowel))
        return dchar.init;

    immutable indexL = lead - jamoLBase;
    immutable indexV = vowel - jamoVBase;
    immutable indexLV = indexL * jamoNCount + indexV * jamoTCount;
    immutable dchar syllable = jamoSBase + indexLV;

    if (isJamoT(trailing))
        return syllable + (trailing - jamoTBase);
    return syllable;
}
8679 
///
@safe unittest
{
    assert(composeJamo('\u1111', '\u1171', '\u11B6') == '\uD4DB');
    // leaving out the trailing consonant (T), or passing any codepoint
    // that is not a trailing consonant, composes an LV-syllable
    assert(composeJamo('\u1111', '\u1171') == '\uD4CC');
    assert(composeJamo('\u1111', '\u1171', ' ') == '\uD4CC');
    // an invalid lead or vowel yields dchar.init
    assert(composeJamo('\u1111', 'A') == dchar.init);
    assert(composeJamo('A', '\u1171') == dchar.init);
}
8691 
8692 @safe unittest
8693 {
8694     import std.algorithm.comparison : equal;
8695     import std.conv : text;
8696 
8697     static void testDecomp(UnicodeDecomposition T)(dchar ch, string r)
8698     {
8699         Grapheme g = decompose!T(ch);
8700         assert(equal(g[], r), text(g[], " vs ", r));
8701     }
8702     testDecomp!Canonical('\u1FF4', "\u03C9\u0301\u0345");
8703     testDecomp!Canonical('\uF907', "\u9F9C");
8704     testDecomp!Compatibility('\u33FF', "\u0067\u0061\u006C");
8705     testDecomp!Compatibility('\uA7F9', "\u0153");
8706 
8707     // check examples
8708     assert(decomposeHangul('\uD4DB')[].equal("\u1111\u1171\u11B6"));
8709     assert(composeJamo('\u1111', '\u1171', '\u11B6') == '\uD4DB');
8710     assert(composeJamo('\u1111', '\u1171') == '\uD4CC'); // leave out T-vowel
8711     assert(composeJamo('\u1111', '\u1171', ' ') == '\uD4CC');
8712     assert(composeJamo('\u1111', 'A') == dchar.init);
8713     assert(composeJamo('A', '\u1171') == dchar.init);
8714 }
8715 
8716 /**
8717     Enumeration type for normalization forms,
8718     passed as template parameter for functions like $(LREF normalize).
8719 */
enum NormalizationForm
{
    // canonical decomposition followed by recomposition
    NFC,
    // canonical decomposition only
    NFD,
    // compatibility decomposition followed by recomposition
    NFKC,
    // compatibility decomposition only
    NFKD
}
8726 
8727 
/**
    Shorthand aliases from values indicating normalization forms.
*/
enum NFC = NormalizationForm.NFC;
///ditto
enum NFD = NormalizationForm.NFD;
///ditto
enum NFKC = NormalizationForm.NFKC;
///ditto
enum NFKD = NormalizationForm.NFKD;
8740 
8741 /++
8742     Returns `input` string normalized to the chosen form.
8743     Form C is used by default.
8744 
8745     For more information on normalization forms see
8746     the $(S_LINK Normalization, normalization section).
8747 
8748     Note:
8749     In cases where the string in question is already normalized,
8750     it is returned unmodified and no memory allocation happens.
8751 +/
8752 /*
8753     WARNING: @trusted lambda inside - handle with same care as @trusted
8754         functions
8755 
8756     Despite being a template, the attributes do no harm since this doesn't work
8757     with user-defined range or character types anyway.
8758 */
// Works region by region: splitNormalized yields a pair of indexes
// (anchors[0], anchors[1]); text before anchors[0] is copied through as is,
// the slice between the two is decomposed, reordered and (for the C forms)
// recomposed, and scanning then resumes at anchors[1].
pure @safe inout(C)[] normalize(NormalizationForm norm=NFC, C)
    (return scope inout(C)[] input)
{
    import std.algorithm.mutation : SwapStrategy;
    import std.algorithm.sorting : sort;
    import std.array : appender;
    import std.range : zip;

    auto anchors = splitNormalized!norm(input);
    // both anchors at the end: nothing to normalize, hand back the same slice
    if (anchors[0] == input.length && anchors[1] == input.length)
        return input;
    // scratch buffers reused for every region: decomposed code points
    // and their combining classes (kept index-aligned)
    dchar[] decomposed;
    decomposed.reserve(31);
    ubyte[] ccc;
    ccc.reserve(31);
    auto app = appender!(C[])();
    do
    {
        // copy the prefix that needs no work
        app.put(input[0 .. anchors[0]]);
        // decompose every code point of the region into `decomposed`
        foreach (dchar ch; input[anchors[0]..anchors[1]])
            static if (norm == NFD || norm == NFC)
            {
                foreach (dchar c; decompose!Canonical(ch)[])
                    decomposed ~= c;
            }
            else // NFKD & NFKC
            {
                foreach (dchar c; decompose!Compatibility(ch)[])
                    decomposed ~= c;
            }
        ccc.length = decomposed.length;
        size_t firstNonStable = 0;
        ubyte lastClazz = 0;

        // record combining classes and stably sort each run of code points
        // with non-zero class by that class
        foreach (idx, dchar ch; decomposed)
        {
            immutable clazz = combiningClass(ch);
            ccc[idx] = clazz;
            if (clazz == 0 && lastClazz != 0)
            {
                // found a stable code point after unstable ones
                sort!("a[0] < b[0]", SwapStrategy.stable)
                    (zip(ccc[firstNonStable .. idx], decomposed[firstNonStable .. idx]));
                // park the marker at the end so the final sort below is a
                // no-op unless another unstable run starts
                firstNonStable = decomposed.length;
            }
            else if (clazz != 0 && lastClazz == 0)
            {
                // found first unstable code point after stable ones
                firstNonStable = idx;
            }
            lastClazz = clazz;
        }
        // sort the trailing unstable run, if any
        sort!("a[0] < b[0]", SwapStrategy.stable)
            (zip(ccc[firstNonStable..$], decomposed[firstNonStable..$]));
        static if (norm == NFC || norm == NFKC)
        {
            import std.algorithm.searching : countUntil;
            auto first = countUntil(ccc, 0);
            if (first >= 0) // no starters?? no recomposition
            {
                // recompose returns the index where it stopped; keep going
                // from there until the end of the buffer is reached
                for (;;)
                {
                    immutable second = recompose(first, decomposed, ccc);
                    if (second == decomposed.length)
                        break;
                    first = second;
                }
                // 2nd pass for hangul syllables
                hangulRecompose(decomposed);
            }
        }
        static if (norm == NFD || norm == NFKD)
            app.put(decomposed);
        else
        {
            // recompose marks merged code points with dchar.init; drop them
            import std.algorithm.mutation : remove;
            auto clean = remove!("a == dchar.init", SwapStrategy.stable)(decomposed);
            app.put(decomposed[0 .. clean.length]);
        }
        // reset variables
        decomposed.length = 0;
        () @trusted {
            // assumeSafeAppend isn't considered pure as of writing, hence the
            // cast. It isn't pure in the sense that the elements after
            // the array in question are affected, but we don't use those
            // making the call pure for our purposes.
            (cast(void delegate() pure nothrow) {decomposed.assumeSafeAppend();})();
            ccc.length = 0;
            (cast(void delegate() pure nothrow) {ccc.assumeSafeAppend();})();
        } ();
        input = input[anchors[1]..$];
        // and move on
        anchors = splitNormalized!norm(input);
    } while (anchors[0] != input.length);
    // whatever is left is already normalized
    app.put(input[0 .. anchors[0]]);
    // the appender holds mutable C[]; cast back to the caller's qualifier
    return () @trusted inout { return cast(inout(C)[]) app.data; } ();
}
8856 
8857 ///
@safe pure unittest
{
    // any encoding works
    wstring greet = "Hello world";
    auto untouched = normalize(greet);
    assert(untouched is greet); // the same exact slice

    // An example of a character with all 4 forms being different:
    // Greek upsilon with acute and hook symbol (code point 0x03D3)
    enum sample = "ϓ";
    assert(normalize!NFC(sample) == "\u03D3");
    assert(normalize!NFD(sample) == "\u03D2\u0301");
    assert(normalize!NFKC(sample) == "\u038E");
    assert(normalize!NFKD(sample) == "\u03A5\u0301");
}

@safe pure unittest
{
    import std.conv : text;

    // a change in the middle of otherwise untouched text
    auto ideograph = normalize!NFD("abc\uF904def");
    assert(ideograph == "abc\u6ED1def", text(ideograph));
    // compatibility decomposition
    auto digits = normalize!NFKD("2¹⁰");
    assert(digits == "210", digits);
    // a change at the very start of the string
    assert(normalize!NFD("Äffin") == "A\u0308ffin");

    // test with dstring
    dstring greet = "Hello world";
    auto untouched = normalize(greet);
    assert(untouched is greet); // the same exact slice
}
8884 
// Canonically recomposes, in place, the code points that follow the starter
// at `input[start]`. Every code point merged into the starter is overwritten
// with `dchar.init` as a sentinel. Returns the index at which scanning
// stopped: either `input.length` or the position of a code point with
// combining class 0 that was not merged.
private size_t recompose(size_t start, scope dchar[] input, scope ubyte[] ccc) pure nothrow @safe
{
    assert(input.length == ccc.length);
    // Combining class of the most recent code point that was NOT merged;
    // -1 lies outside the 0 .. 255 range, so nothing is blocked at first.
    int blockingCC = -1;
    // input[start] is the starter, so candidates begin right after it
    size_t pos = start + 1;
    while (pos != input.length)
    {
        immutable candidateCC = ccc[pos];
        // In any character sequence beginning with a starter S
        // a character C is blocked from S if and only if there
        // is some character B between S and C, and either B
        // is a starter or it has the same or higher combining class as C.
        // Here S is input[start], C is input[pos], and because the classes
        // are sorted blockingCC is the maximum class between S and C.
        if (candidateCC > blockingCC)
        {
            immutable merged = compose(input[start], input[pos]);
            if (merged != dchar.init)
            {
                input[start] = merged;
                input[pos] = dchar.init; // sentinel
                // the merged code point is gone, so its class must not
                // block the ones that follow
                ++pos;
                continue;
            }
        }
        // blocked or not composable: this code point stays in place
        blockingCC = candidateCC;
        if (blockingCC == 0)
            break; // reached the next starter
        ++pos;
    }
    return pos;
}
8938 
// Returns a tuple of 2 indexes that delimit, in order: text that is already
// normalized, the piece that needs normalization, and the rest of the input
// starting with a stable code point. Both indexes equal `input.length`
// when no offending code point is found.
private auto splitNormalized(NormalizationForm norm, C)(scope const(C)[] input)
{
    import std.typecons : tuple;

    ubyte prevCC = 0;
    foreach (pos, dchar ch; input)
    {
        static if (norm == NFC)
        {
            // shortcut: code points below U+0300 are accepted without lookups
            if (ch < 0x0300)
            {
                prevCC = 0;
                continue;
            }
        }
        immutable ubyte curCC = combiningClass(ch);
        // a lower non-zero class after a higher one means wrong ordering
        immutable outOfOrder = curCC != 0 && prevCC > curCC;
        if (outOfOrder || notAllowedIn!norm(ch))
            return seekStable!norm(pos, input);
        prevCC = curCC;
    }
    return tuple(input.length, input.length);
}
8969 
// Given the index `idx` of a code point that breaks normalization, returns a
// tuple (region_start, region_end) of code unit indexes into `input`:
// region_start is the start of the nearest code point before `idx` that is
// stable (combining class 0 and allowed in `norm`), or 0 if there is none;
// region_end is the index of the first stable code point at or after `idx`,
// or `input.length` if there is none.
private auto seekStable(NormalizationForm norm, C)(size_t idx, const scope C[] input)
{
    import std.range.primitives : popBack;
    import std.typecons : tuple;
    import std.utf : codeLength;

    // Walk backwards from `idx`. The scan reads `br.back`, so it has to drop
    // code points from the back as well; popping from the front would look
    // at the same code point over and over until the slice is empty.
    auto br = input[0 .. idx];
    size_t region_start = 0;// default
    while (!br.empty)
    {
        immutable dchar ch = br.back;
        if (combiningClass(ch) == 0 && allowedIn!norm(ch))
        {
            // `br` still ends with `ch`, so this is where `ch` starts
            region_start = br.length - codeLength!C(ch);
            break;
        }
        br.popBack();
    }
    // Walk forwards from `idx` to the first stable code point.
    size_t region_end=input.length;// end is $ by default
    foreach (i, dchar ch; input[idx..$])
    {
        if (combiningClass(ch) == 0 && allowedIn!norm(ch))
        {
            region_end = i+idx;
            break;
        }
    }
    // writeln("Region to normalize: ", input[region_start .. region_end]);
    return tuple(region_start, region_end);
}
9002 
9003 /**
9004     Tests if dchar `ch` is always allowed (Quick_Check=YES) in normalization
9005     form `norm`.
9006 */
public bool allowedIn(NormalizationForm norm)(dchar ch)
{
    // simply the negation of the private quick-check lookup
    immutable rejected = notAllowedIn!norm(ch);
    return !rejected;
}
9011 
9012 ///
@safe unittest
{
    import std.meta : AliasSeq;

    // e.g. Cyrillic is always allowed, so is ASCII
    foreach (form; AliasSeq!(NFC, NFD, NFKC, NFKD))
        assert(allowedIn!form('я'));
    assert(allowedIn!NFC('Z'));
}
9022 
// Not a user friendly name but more direct: looks `ch` up in the quick-check
// trie that belongs to `norm`. The public `allowedIn` is the negation of
// this function.
private bool notAllowedIn(NormalizationForm norm)(dchar ch)
{
    static if (norm == NFC)
        alias qcTrie = nfcQCTrie;
    else static if (norm == NFD)
        alias qcTrie = nfdQCTrie;
    else static if (norm == NFKC)
        alias qcTrie = nfkcQCTrie;
    else static if (norm == NFKD)
        alias qcTrie = nfkdQCTrie;
    else
    {
        // The condition must be `false`: a non-empty string literal used as
        // the condition is always true, so the assert could never fire.
        static assert(false, "Unknown normalization form " ~ norm.stringof);
    }
    return qcTrie[ch];
}
9038 
@safe unittest
{
    // a Cyrillic letter is accepted by every normalization form
    enum dchar cyrillic = 'я';
    assert(allowedIn!NFC(cyrillic));
    assert(allowedIn!NFD(cyrillic));
    assert(allowedIn!NFKC(cyrillic));
    assert(allowedIn!NFKD(cyrillic));
    // and so is plain ASCII
    assert(allowedIn!NFC('Z'));
}
9047 
9048 }
9049 
9050 version (std_uni_bootstrap)
9051 {
9052     // old version used for bootstrapping of gen_uni.d that generates
9053     // up to date optimal versions of all of isXXX functions
9054     @safe pure nothrow @nogc public bool isWhite(dchar c)
9055     {
9056         import std.ascii : isWhite;
9057         return isWhite(c) ||
9058                c == lineSep || c == paraSep ||
9059                c == '\u0085' || c == '\u00A0' || c == '\u1680' || c == '\u180E' ||
9060                (c >= '\u2000' && c <= '\u200A') ||
9061                c == '\u202F' || c == '\u205F' || c == '\u3000';
9062     }
9063 }
9064 else
9065 {
9066 
// trusted -> avoid bounds check
@trusted pure nothrow @nogc private
{
    import std.internal.unicode_tables; // : toLowerTable, toTitleTable, toUpperTable; // generated file

    // hide template instances behind functions
    // https://issues.dlang.org/show_bug.cgi?id=13232

    // --- lower case: trie lookups by code point, table lookup by index ---
    ushort toLowerIndex(dchar c)
    {
        return toLowerIndexTrie[c];
    }

    ushort toLowerSimpleIndex(dchar c)
    {
        return toLowerSimpleIndexTrie[c];
    }

    dchar toLowerTab(size_t idx)
    {
        return toLowerTable[idx];
    }

    // --- title case ---
    ushort toTitleIndex(dchar c)
    {
        return toTitleIndexTrie[c];
    }

    ushort toTitleSimpleIndex(dchar c)
    {
        return toTitleSimpleIndexTrie[c];
    }

    dchar toTitleTab(size_t idx)
    {
        return toTitleTable[idx];
    }

    // --- upper case ---
    ushort toUpperIndex(dchar c)
    {
        return toUpperIndexTrie[c];
    }

    ushort toUpperSimpleIndex(dchar c)
    {
        return toUpperSimpleIndexTrie[c];
    }

    dchar toUpperTab(size_t idx)
    {
        return toUpperTable[idx];
    }
}
9086 
9087 public:
9088 
9089 /++
9090     Whether or not `c` is a Unicode whitespace $(CHARACTER).
9091     (general Unicode category: Part of C0(tab, vertical tab, form feed,
9092     carriage return, and linefeed characters), Zs, Zl, Zp, and NEL(U+0085))
9093 +/
@safe pure nothrow @nogc
public bool isWhite(dchar c)
{
    static import std.internal.unicode_tables; // generated file
    // delegate to the pregenerated binary search
    return std.internal.unicode_tables.isWhiteGen(c);
}
9100 
9101 /++
9102     Return whether `c` is a Unicode lowercase $(CHARACTER).
9103 +/
@safe pure nothrow @nogc
bool isLower(dchar c)
{
    import std.ascii : isASCII, asciiIsLower = isLower;

    // only non-ASCII code points need the trie lookup
    if (!isASCII(c))
        return lowerCaseTrie[c];
    return asciiIsLower(c);
}
9112 
@safe unittest
{
    import std.ascii : asciiIsLower = isLower;

    // must agree with std.ascii over the whole ASCII range
    foreach (cp; 0 .. 0x80)
        assert(asciiIsLower(cp) == .isLower(cp));

    // Cyrillic
    assert(.isLower('я'));
    assert(.isLower('й'));
    assert(!.isLower('Ж'));
    // Greek HETA
    assert(!.isLower('\u0370'));
    assert(.isLower('\u0371'));
    assert(!.isLower('\u039C')); // capital MU
    assert(.isLower('\u03B2')); // beta
    // from extended Greek
    assert(!.isLower('\u1F18'));
    assert(.isLower('\u1F00'));

    // every member of the lowerCase set is lower and not upper
    foreach (cp; unicode.lowerCase.byCodepoint)
        assert(.isLower(cp) && !isUpper(cp));
}
9132 
9133 
9134 /++
9135     Return whether `c` is a Unicode uppercase $(CHARACTER).
9136 +/
@safe pure nothrow @nogc
bool isUpper(dchar c)
{
    import std.ascii : isASCII, asciiIsUpper = isUpper;

    // only non-ASCII code points need the trie lookup
    if (!isASCII(c))
        return upperCaseTrie[c];
    return asciiIsUpper(c);
}
9145 
@safe unittest
{
    // The import is renamed so that the unqualified `isUpper` below keeps
    // referring to this module's Unicode-aware function.
    import std.ascii : asciiIsUpper = isUpper;

    // isUpper must agree with std.ascii over the whole ASCII range
    // (this loop used to check isLower, a copy-paste from the isLower test)
    foreach (v; 0 .. 0x80)
        assert(asciiIsUpper(v) == .isUpper(v));
    assert(!isUpper('й'));
    assert(isUpper('Ж'));
    // Greek HETA
    assert(isUpper('\u0370'));
    assert(!isUpper('\u0371'));
    assert(isUpper('\u039C')); // capital MU
    assert(!isUpper('\u03B2')); // beta
    // from extended Greek
    assert(!isUpper('\u1F00'));
    assert(isUpper('\u1F18'));
    // every member of the upperCase set is upper and not lower
    foreach (v; unicode.upperCase.byCodepoint)
        assert(isUpper(v) && !.isLower(v));
}
9164 
9165 
//TODO: Hidden for now, needs better API.
//Other transforms could use better API as well, but this one is a new primitive.
@safe pure nothrow @nogc
private dchar toTitlecase(dchar c)
{
    if (c >= 0xAA)
    {
        // table-driven path: ushort.max marks "no simple title case mapping"
        immutable size_t slot = toTitleSimpleIndex(c);
        return slot != ushort.max ? toTitleTab(slot) : c;
    }
    // fast path for everything below U+00AA: only 'a' .. 'z' change
    if ('a' <= c && c <= 'z')
        return c - 32;
    return c;
}
9187 
private
{
    // Argument bundles for the generic case converters, in order: index
    // lookup function, the limit passed on as `maxIdx` (indexes below it
    // denote a single code point), and the table lookup function.
    alias UpperTriple = AliasSeq!(toUpperIndex, MAX_SIMPLE_UPPER, toUpperTab);
    alias LowerTriple = AliasSeq!(toLowerIndex, MAX_SIMPLE_LOWER, toLowerTab);
}
9190 
// Generic toUpper/toLower on a whole string. When no code point has a case
// mapping the input is returned as is (or copied with `array` for non-string
// ranges); otherwise a new array with the converted text is built.
private ElementEncodingType!S[] toCase(alias indexFn, uint maxIdx, alias tableFn, alias asciiConvert, S)(S s)
if (isSomeString!S || (isRandomAccessRange!S && hasLength!S && hasSlicing!S && isSomeChar!(ElementType!S)))
{
    import std.array : appender, array;
    import std.ascii : isASCII;
    import std.utf : byDchar, codeLength;

    alias C = ElementEncodingType!S;

    // Number of code units in the leading part that has no case mapping.
    size_t unchanged = 0;
    for (auto probe = s.byDchar; !probe.empty; probe.popFront())
    {
        if (indexFn(probe.front) == ushort.max)
        {
            unchanged += codeLength!C(probe.front);
            continue;
        }

        // First code point with a mapping: copy the untouched prefix,
        // convert everything from here on and return straight away.
        auto converted = appender!(C[])();
        converted.reserve(s.length);
        converted.put(s[0 .. unchanged]);
        foreach (dchar ch; s[unchanged .. $].byDchar)
        {
            if (ch.isASCII)
            {
                converted.put(asciiConvert(ch));
                continue;
            }
            immutable ushort slot = indexFn(ch);
            if (slot == ushort.max)
            {
                // no mapping: keep as is
                converted.put(ch);
            }
            else if (slot < maxIdx)
            {
                // one-to-one mapping
                converted.put(tableFn(slot));
            }
            else
            {
                // one-to-many mapping: the first table entry packs the
                // length (top 8 bits) together with the first code point
                auto packed = tableFn(slot);
                immutable uint count = packed >> 24;
                converted.put(cast(dchar)(packed & 0xFF_FFFF));
                foreach (next; slot+1 .. slot+count)
                    converted.put(tableFn(next));
            }
        }
        return converted.data;
    }

    // nothing to convert
    static if (isSomeString!S)
        return s;
    else
        return s.array;
}
9246 
// https://issues.dlang.org/show_bug.cgi?id=12428
@safe unittest
{
    import std.array : replicate;

    // a short slice at the front of a much larger allocation
    auto s = "abcdefghij".replicate(300);
    s = s[0 .. 10];

    toUpper(s);

    // the call must leave its argument untouched
    assert(s == "abcdefghij");
}

// https://issues.dlang.org/show_bug.cgi?id=18993
@safe unittest
{
    // both conversions are evaluated at compile time
    enum fromUpper = `몬스터/A`.toLower;
    enum fromLower = `몬스터/a`.toLower;
    static assert(fromUpper.length == fromLower.length);
}
9264 
9265 
// Generic toUpper/toLower on a whole range: wraps a range of dchar's in a
// range that converts one source code point at a time, buffering up to
// three result code points.
private auto toCaser(alias indexFn, uint maxIdx, alias tableFn, alias asciiConvert, Range)(Range str)
    // Accept range of dchar's
if (isInputRange!Range &&
    isSomeChar!(ElementEncodingType!Range) &&
    ElementEncodingType!Range.sizeof == dchar.sizeof)
{
    static struct ToCaserImpl
    {
        @property bool empty()
        {
            // buffered code points still count as content
            return pending == 0 && source.empty;
        }

        @property auto front()
        {
            import std.ascii : isASCII;

            // something is already buffered: serve it
            if (pending)
                return cache[pending - 1];

            dchar ch = source.front;
            if (ch.isASCII)
            {
                cache[0] = asciiConvert(ch);
                pending = 1;
            }
            else
            {
                const slot = indexFn(ch);
                if (slot == ushort.max)
                {
                    // no mapping: pass through
                    cache[0] = ch;
                    pending = 1;
                }
                else if (slot < maxIdx)
                {
                    // one-to-one mapping
                    cache[0] = tableFn(slot);
                    pending = 1;
                }
                else
                {
                    // one-to-many mapping: unpack length + codepoint
                    immutable packed = tableFn(slot);
                    pending = packed >> 24;
                    if (pending == 0)
                        pending = 1;
                    assert(pending <= cache.length);
                    // stored back to front, since the buffer is read from
                    // index pending - 1 down to 0
                    cache[pending - 1] = cast(dchar)(packed & 0xFF_FFFF);
                    foreach (k; 1 .. pending)
                        cache[pending - k - 1] = tableFn(slot + k);
                }
            }
            return cache[pending - 1];
        }

        void popFront()
        {
            // make sure the buffer is filled even if front was never called
            if (pending == 0)
                front;
            assert(pending);
            --pending;
            // only advance the source once its conversion is fully consumed
            if (pending == 0)
                source.popFront();
        }

        static if (isForwardRange!Range)
        {
            @property auto save()
            {
                auto copy = this;
                copy.source = source.save;
                return copy;
            }
        }

      private:
        Range source;
        uint pending;           // code points still buffered in `cache`
        dchar[3] cache = void;  // conversion result, stored in reverse order
    }

    return ToCaserImpl(str);
}
9350 
9351 /*********************
9352  * Convert an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
9353  * or a string to upper or lower case.
9354  *
9355  * Does not allocate memory.
9356  * Characters in UTF-8 or UTF-16 format that cannot be decoded
9357  * are treated as $(REF replacementDchar, std,utf).
9358  *
9359  * Params:
9360  *      str = string or range of characters
9361  *
9362  * Returns:
9363  *      an input range of `dchar`s
9364  *
9365  * See_Also:
9366  *      $(LREF toUpper), $(LREF toLower)
9367  */
9368 
auto asLowerCase(Range)(Range str)
if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) &&
    !isConvertibleToString!Range)
{
    enum alreadyDecoded = ElementEncodingType!Range.sizeof >= dchar.sizeof;
    static if (alreadyDecoded)
    {
        static import std.ascii;
        return toCaser!(LowerTriple, std.ascii.toLower)(str);
    }
    else
    {
        import std.utf : byDchar;

        // narrower encodings are decoded to dchar first
        return asLowerCase(str.byDchar);
    }
}
9386 
9387 /// ditto
auto asUpperCase(Range)(Range str)
if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) &&
    !isConvertibleToString!Range)
{
    enum alreadyDecoded = ElementEncodingType!Range.sizeof >= dchar.sizeof;
    static if (alreadyDecoded)
    {
        static import std.ascii;
        return toCaser!(UpperTriple, std.ascii.toUpper)(str);
    }
    else
    {
        import std.utf : byDchar;

        // narrower encodings are decoded to dchar first
        return asUpperCase(str.byDchar);
    }
}
9405 
9406 ///
@safe pure unittest
{
    import std.algorithm.comparison : equal;

    auto shouted = "hEllo".asUpperCase;
    assert(equal(shouted, "HELLO"));
}
9413 
// explicitly undocumented
auto asLowerCase(Range)(auto ref Range str)
if (isConvertibleToString!Range)
{
    import std.traits : StringTypeOf;

    // forward to the range overload using the underlying string type
    alias Str = StringTypeOf!Range;
    return asLowerCase!Str(str);
}
9421 
// explicitly undocumented
auto asUpperCase(Range)(auto ref Range str)
if (isConvertibleToString!Range)
{
    import std.traits : StringTypeOf;

    // forward to the range overload using the underlying string type
    alias Str = StringTypeOf!Range;
    return asUpperCase!Str(str);
}
9429 
@safe unittest
{
    // non-copyable type that converts to string only through `alias this`
    static struct TestAliasedString
    {
        string _s;
        @disable this(this);
        string get() @safe @nogc pure nothrow { return _s; }
        alias get this;
    }

    // calls `func` with the wrapper and with the plain string
    // and reports whether both calls agree
    static bool testAliasedString(alias func, Args...)(string s, Args args)
    {
        import std.algorithm.comparison : equal;

        auto viaWrapper = func(TestAliasedString(s), args);
        auto viaString = func(s, args);
        // For ranges, compare contents instead of object identity.
        static if (is(typeof(equal(viaWrapper, viaString))))
            return equal(viaWrapper, viaString);
        else
            return viaWrapper == viaString;
    }

    assert(testAliasedString!asLowerCase("hEllo"));
    assert(testAliasedString!asUpperCase("hEllo"));
    assert(testAliasedString!asCapitalized("hEllo"));
}
9459 
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.array : array;
    import std.utf : byChar;

    // a saved copy must produce the same sequence as the original
    auto a = "HELLo".asLowerCase;
    auto savea = a.save;
    auto s = a.array;
    assert(s == "hello");
    s = savea.array;
    assert(s == "hello");

    string[] lower = ["123", "abcфеж", "\u0131\u023f\u03c9", "i\u0307\u1Fe2"];
    string[] upper = ["123", "ABCФЕЖ", "I\u2c7e\u2126", "\u0130\u03A5\u0308\u0300"];

    // the lazy ranges must agree with the eager toUpper/toLower
    foreach (i, slwr; lower)
    {
        auto sx = slwr.asUpperCase.byChar.array;
        assert(sx == toUpper(slwr));
        auto sy = upper[i].asLowerCase.byChar.array;
        assert(sy == toLower(upper[i]));
    }

    // Not necessary to call r.front
    for (auto r = lower[3].asUpperCase; !r.empty; r.popFront())
    {
    }

    // UTF-16 and UTF-32 input; the results of `equal` used to be discarded,
    // so these lines checked nothing
    assert("HELLo"w.asLowerCase.equal("hello"d));
    assert("HELLo"w.asUpperCase.equal("HELLO"d));
    assert("HELLo"d.asLowerCase.equal("hello"d));
    assert("HELLo"d.asUpperCase.equal("HELLO"d));

    assert(toLower("\u1Fe2") == asLowerCase("\u1Fe2").byChar.array);
}
9499 
// Generic capitalizer on a whole range, returns a range: the first source
// code point goes through the upper case tables, everything after it is
// served by `asLowerCase`.
private auto toCapitalizer(alias indexFnUpper, uint maxIdxUpper, alias tableFnUpper,
                           Range)(Range str)
    // Accept range of dchar's
if (isInputRange!Range &&
    isSomeChar!(ElementEncodingType!Range) &&
    ElementEncodingType!Range.sizeof == dchar.sizeof)
{
    static struct ToCapitalizerImpl
    {
        @property bool empty()
        {
            if (lowering)
                return tail.empty;
            return pending == 0 && source.empty;
        }

        @property auto front()
        {
            if (lowering)
                return tail.front;

            // the upper-cased first code point is already buffered
            if (pending)
                return cache[pending - 1];

            immutable dchar ch = source.front;
            const slot = indexFnUpper(ch);
            if (slot == ushort.max)
            {
                // no mapping: pass through
                cache[0] = ch;
                pending = 1;
            }
            else if (slot < maxIdxUpper)
            {
                // one-to-one mapping
                cache[0] = tableFnUpper(slot);
                pending = 1;
            }
            else
            {
                // one-to-many mapping: unpack length + codepoint
                immutable packed = tableFnUpper(slot);
                pending = packed >> 24;
                if (pending == 0)
                    pending = 1;
                assert(pending <= cache.length);
                // stored back to front, since the buffer is read from
                // index pending - 1 down to 0
                cache[pending - 1] = cast(dchar)(packed & 0xFF_FFFF);
                foreach (k; 1 .. pending)
                    cache[pending - k - 1] = tableFnUpper(slot + k);
            }
            return cache[pending - 1];
        }

        void popFront()
        {
            if (lowering)
            {
                tail.popFront();
                return;
            }
            // make sure the buffer is filled even if front was never called
            if (pending == 0)
                front;
            assert(pending);
            --pending;
            if (pending == 0)
            {
                // first code point fully consumed: switch to lower casing
                source.popFront();
                tail = source.asLowerCase();
                lowering = true;
            }
        }

        static if (isForwardRange!Range)
        {
            @property auto save()
            {
                auto copy = this;
                copy.source = source.save;
                copy.tail = tail.save;
                return copy;
            }
        }

      private:
        Range source;
        typeof(source.asLowerCase) tail; // range representing the lower case rest of string
        bool lowering = false;  // false for first character, true for rest of string
        dchar[3] cache = void;  // upper-cased first code point, stored in reverse order
        uint pending = 0;       // code points still buffered in `cache`
    }

    return ToCapitalizerImpl(str);
}
9590 
9591 /*********************
9592  * Capitalize an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
9593  * or string, meaning convert the first
9594  * character to upper case and subsequent characters to lower case.
9595  *
9596  * Does not allocate memory.
9597  * Characters in UTF-8 or UTF-16 format that cannot be decoded
9598  * are treated as $(REF replacementDchar, std,utf).
9599  *
9600  * Params:
9601  *      str = string or range of characters
9602  *
9603  * Returns:
9604  *      an InputRange of dchars
9605  *
9606  * See_Also:
9607  *      $(LREF toUpper), $(LREF toLower)
9608  *      $(LREF asUpperCase), $(LREF asLowerCase)
9609  */
9610 
auto asCapitalized(Range)(Range str)
if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) &&
    !isConvertibleToString!Range)
{
    enum alreadyDecoded = ElementEncodingType!Range.sizeof >= dchar.sizeof;
    static if (alreadyDecoded)
    {
        return toCapitalizer!UpperTriple(str);
    }
    else
    {
        import std.utf : byDchar;

        // narrower encodings are decoded to dchar first
        return toCapitalizer!UpperTriple(str.byDchar);
    }
}
9627 
9628 ///
@safe pure unittest
{
    import std.algorithm.comparison : equal;

    auto capitalized = "hEllo".asCapitalized;
    assert(equal(capitalized, "Hello"));
}
9635 
// Overload for types that are convertible to a string: forwards to the range
// overload instantiated with the underlying string type.
auto asCapitalized(Range)(auto ref Range str)
if (isConvertibleToString!Range)
{
    import std.traits : StringTypeOf;

    alias Str = StringTypeOf!Range;
    return asCapitalized!Str(str);
}
9642 
// Compile-time check that asCapitalized and `front` are usable from
// @safe pure nothrow @nogc code.
@safe pure nothrow @nogc unittest
{
    auto r = "hEllo".asCapitalized();
    assert(r.front == 'H');
}
9648 
// Exercises asCapitalized: `save`, a table of narrow-string cases (including
// a code point whose upper case expands to three code points), iteration
// without calling `front`, and wide/dchar string inputs.
@safe unittest
{
    import std.array : array;

    auto a = "hELLo".asCapitalized;
    auto savea = a.save;
    auto s = a.array;
    assert(s == "Hello");
    // the saved copy must be unaffected by consuming `a`
    s = savea.array;
    assert(s == "Hello");

    // [input, expected] pairs
    string[2][] cases =
    [
        ["", ""],
        ["h", "H"],
        ["H", "H"],
        ["3", "3"],
        ["123", "123"],
        ["h123A", "H123a"],
        ["феж", "Феж"],
        ["\u1Fe2", "\u03a5\u0308\u0300"],
    ];

    foreach (i; 0 .. cases.length)
    {
        import std.utf : byChar;

        auto r = cases[i][0].asCapitalized.byChar.array;
        auto result = cases[i][1];
        assert(r == result);
    }

    // Don't call r.front
    for (auto r = "\u1Fe2".asCapitalized; !r.empty; r.popFront())
    {
    }

    import std.algorithm.comparison : equal;

    // The result of `equal` has to be asserted; a bare call checks nothing.
    assert("HELLo"w.asCapitalized.equal("Hello"d));
    assert("hElLO"w.asCapitalized.equal("Hello"d));
    assert("hello"d.asCapitalized.equal("Hello"d));
    assert("HELLO"d.asCapitalized.equal("Hello"d));

    import std.utf : byChar;
    assert(asCapitalized("\u0130").byChar.array == asUpperCase("\u0130").byChar.array);
}
9696 
// TODO: helper, I wish std.utf was more flexible (and straight)
// Writes the UTF-8 code units of `c` into `buf` starting at index `idx`
// and returns the index just past the last code unit written.
private size_t encodeTo(scope char[] buf, size_t idx, dchar c) @trusted pure nothrow @nogc
{
    // 1 code unit
    if (c <= 0x7F)
    {
        buf[idx] = cast(char) c;
        return idx + 1;
    }
    // 2 code units
    if (c <= 0x7FF)
    {
        buf[idx] = cast(char)(0xC0 | (c >> 6));
        buf[idx + 1] = cast(char)(0x80 | (c & 0x3F));
        return idx + 2;
    }
    // 3 code units
    if (c <= 0xFFFF)
    {
        buf[idx] = cast(char)(0xE0 | (c >> 12));
        buf[idx + 1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[idx + 2] = cast(char)(0x80 | (c & 0x3F));
        return idx + 3;
    }
    // 4 code units
    if (c <= 0x10FFFF)
    {
        buf[idx] = cast(char)(0xF0 | (c >> 18));
        buf[idx + 1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        buf[idx + 2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[idx + 3] = cast(char)(0x80 | (c & 0x3F));
        return idx + 4;
    }
    // values above 0x10FFFF are not accepted
    assert(0);
}
9730 
// Checks the char[] overload of encodeTo: a one-unit write followed by a
// two-unit write that continues from the returned index.
@safe unittest
{
    char[] s = "abcd".dup;
    size_t i = 0;
    i = encodeTo(s, i, 'X');
    assert(s == "Xbcd");

    i = encodeTo(s, i, cast(dchar)'\u00A9');
    assert(s == "X\xC2\xA9d");
}
9741 
// TODO: helper, I wish std.utf was more flexible (and straight)
// Writes the UTF-16 code units of `c` into `buf` starting at index `idx`
// and returns the index just past the last code unit written.
// Throws UTFException when `c` is a surrogate code point.
private size_t encodeTo(scope wchar[] buf, size_t idx, dchar c) @trusted pure
{
    import std.utf : UTFException;

    // values above 0x10FFFF are not accepted
    if (c > 0x10FFFF)
        assert(0);

    if (c > 0xFFFF)
    {
        // needs two code units: split the offset from 0x10000 into 10-bit halves
        immutable uint offset = c - 0x10000;
        buf[idx] = cast(wchar)(((offset >> 10) & 0x3FF) + 0xD800);
        buf[idx + 1] = cast(wchar)((offset & 0x3FF) + 0xDC00);
        return idx + 2;
    }

    if (0xD800 <= c && c <= 0xDFFF)
        throw (new UTFException("Encoding an isolated surrogate code point in UTF-16")).setSequence(c);

    buf[idx] = cast(wchar) c;
    return idx + 1;
}
9763 
// dchar[] counterpart of the encodeTo helpers: stores `c` at `buf[idx]`
// and returns the next index.
private size_t encodeTo(scope dchar[] buf, size_t idx, dchar c) @trusted pure nothrow @nogc
{
    buf[idx] = c;
    return idx + 1;
}
9770 
// Case-converts `s` using the mapping triple (indexFn, maxIdx, tableFn),
// writing the result back into the buffer of `s` for as long as it fits.
// As soon as a converted code point does not fit, or a 1:m mapping is met,
// the rest of the work is handed to toCaseInPlaceAlloc (the allocating path).
// On the non-allocating path `s` ends up as a prefix slice of its old buffer.
private void toCaseInPlace(alias indexFn, uint maxIdx, alias tableFn, C)(ref C[] s) @trusted pure
if (is(C == char) || is(C == wchar)  || is(C == dchar))
{
    import std.utf : decode, codeLength;
    // read position; advanced by decode
    size_t curIdx = 0;
    // write position; asserted below to never get ahead of curIdx
    size_t destIdx = 0;
    alias slowToCase = toCaseInPlaceAlloc!(indexFn, maxIdx, tableFn);
    // start of the run of unchanged code units not yet moved to destIdx
    size_t lastUnchanged = 0;
    // in-buffer move of bytes to a new start index
    // the trick is that it may not need to copy at all
    static size_t moveTo(C[] str, size_t dest, size_t from, size_t to)
    {
        // Interestingly we may just bump pointer for a while
        // then have to copy if a re-cased char was smaller the original
        // later we may regain pace with char that got bigger
        // In the end it sometimes flip-flops between the 2 cases below
        if (dest == from)
            return to;
        // got to copy
        foreach (C c; str[from .. to])
            str[dest++] = c;
        return dest;
    }
    while (curIdx != s.length)
    {
        // start of the code point about to be decoded
        size_t startIdx = curIdx;
        immutable ch = decode(s, curIdx);
        // TODO: special case for ASCII
        immutable caseIndex = indexFn(ch);
        if (caseIndex == ushort.max) // unchanged, skip over
        {
            continue;
        }
        else if (caseIndex < maxIdx)  // 1:1 codepoint mapping
        {
            // previous cased chars had the same length as uncased ones
            // thus can just adjust pointer
            destIdx = moveTo(s, destIdx, lastUnchanged, startIdx);
            lastUnchanged = curIdx;
            immutable cased = tableFn(caseIndex);
            immutable casedLen = codeLength!C(cased);
            if (casedLen + destIdx > curIdx) // no place to fit cased char
            {
                // switch to slow codepath, where we allocate
                return slowToCase(s, startIdx, destIdx);
            }
            else
            {
                destIdx = encodeTo(s, destIdx, cased);
            }
        }
        else  // 1:m codepoint mapping, slow codepath
        {
            destIdx = moveTo(s, destIdx, lastUnchanged, startIdx);
            lastUnchanged = curIdx;
            return slowToCase(s, startIdx, destIdx);
        }
        assert(destIdx <= curIdx);
    }
    // move the trailing unchanged run, if any
    if (lastUnchanged != s.length)
    {
        destIdx = moveTo(s, destIdx, lastUnchanged, s.length);
    }
    // shrink `s` to the code units actually written
    s = s[0 .. destIdx];
}
9836 
// Helper that precalculates the size, in code units of type C, of `str`
// after case conversion with the mapping triple (indexFn, maxIdx, tableFn).
private template toCaseLength(alias indexFn, uint maxIdx, alias tableFn)
{
    size_t toCaseLength(C)(const scope C[] str)
    {
        import std.utf : decode, codeLength;

        size_t total = 0;      // code units counted so far
        size_t runStart = 0;   // start of the unchanged run not yet counted
        size_t pos = 0;        // read position; advanced by decode

        while (pos != str.length)
        {
            immutable cpStart = pos;
            immutable ch = decode(str, pos);
            immutable ushort caseIndex = indexFn(ch);

            // unchanged code point: it stays part of the current run
            if (caseIndex == ushort.max)
                continue;

            // count the unchanged run that precedes this code point
            total += cpStart - runStart;
            runStart = pos;

            if (caseIndex < maxIdx)
            {
                // 1:1 mapping: one replacement code point
                total += codeLength!C(tableFn(caseIndex));
            }
            else
            {
                // 1:m mapping: the first table entry packs the sequence length
                // in its top 8 bits and the first code point in the low 24 bits
                immutable val = tableFn(caseIndex);
                immutable len = val >> 24;
                immutable dchar cased = val & 0xFF_FFFF;
                total += codeLength!C(cased);
                foreach (j; caseIndex+1 .. caseIndex+len)
                    total += codeLength!C(tableFn(j));
            }
        }

        // trailing unchanged run
        if (runStart != str.length)
            total += str.length - runStart;
        return total;
    }
}
9877 
// Checks toCaseLength with the lower-case mapping on ASCII and on mixed
// Cyrillic/ASCII UTF-8 input.
@safe unittest
{
    alias toLowerLength = toCaseLength!(LowerTriple);
    assert(toLowerLength("abcd") == 4);
    assert(toLowerLength("аБВгд456") == 10+3);
}
9884 
// Slower code path: computes the final length up front, allocates once
// and copies the case-converted stuff to the new string.
private template toCaseInPlaceAlloc(alias indexFn, uint maxIdx, alias tableFn)
{
    // s       = array being converted; replaced by the new array on return
    // curIdx  = index in `s` of the first code point still to be converted
    // destIdx = count of leading code units of `s` that are already final
    void toCaseInPlaceAlloc(C)(ref C[] s, size_t curIdx,
        size_t destIdx) @trusted pure
        if (is(C == char) || is(C == wchar) || is(C == dchar))
    {
        import std.utf : decode;
        alias caseLength = toCaseLength!(indexFn, maxIdx, tableFn);

        C[] ns = new C[destIdx + caseLength(s[curIdx..$])];
        ns[0 .. destIdx] = s[0 .. destIdx];
        // start of the unchanged run of `s` not yet copied into `ns`
        size_t pending = curIdx;

        while (curIdx != s.length)
        {
            immutable cpStart = curIdx; // start of current codepoint
            immutable ch = decode(s, curIdx);
            immutable caseIndex = indexFn(ch);
            if (caseIndex == ushort.max) // unchanged, keep scanning
                continue;

            // copy the unchanged run that precedes this code point
            immutable runLen = cpStart - pending;
            ns[destIdx .. destIdx+runLen] = s[pending .. cpStart];
            pending = curIdx;
            destIdx += runLen;

            if (caseIndex < maxIdx)  // 1:1 codepoint mapping
            {
                destIdx = encodeTo(ns, destIdx, tableFn(caseIndex));
            }
            else  // 1:m codepoint mapping
            {
                auto val = tableFn(caseIndex);
                // unpack length + codepoint
                immutable uint len = val >> 24;
                destIdx = encodeTo(ns, destIdx, cast(dchar)(val & 0xFF_FFFF));
                foreach (j; caseIndex+1 .. caseIndex+len)
                    destIdx = encodeTo(ns, destIdx, tableFn(j));
            }
        }

        // copy the trailing unchanged run, if any
        if (pending != s.length)
        {
            immutable tailLen = s.length - pending;
            ns[destIdx .. destIdx+tailLen] = s[pending..$];
            destIdx += tailLen;
        }
        assert(ns.length == destIdx);
        s = ns;
    }
}
9941 
/++
    Converts `s` to lowercase in place (by performing Unicode lowercase mapping).
    For a few characters the length of the string may grow as a result of
    the transformation; in that case the function reallocates exactly once.
    If `s` does not have any uppercase characters, then `s` is unaltered.
+/
void toLowerInPlace(C)(ref C[] s) @trusted pure
if (is(C == char) || is(C == wchar) || is(C == dchar))
{
    toCaseInPlace!LowerTriple(s);
}
// non-template overloads for the most common cases to reduce compile time
@safe pure /*TODO nothrow*/
{
    void toLowerInPlace(ref char[] s)
    {
        toLowerInPlace!char(s);
    }

    void toLowerInPlace(ref wchar[] s)
    {
        toLowerInPlace!wchar(s);
    }

    void toLowerInPlace(ref dchar[] s)
    {
        toLowerInPlace!dchar(s);
    }
}
9963 
/++
    Converts `s` to uppercase in place (by performing Unicode uppercase mapping).
    For a few characters the length of the string may grow as a result of
    the transformation; in that case the function reallocates exactly once.
    If `s` does not have any lowercase characters, then `s` is unaltered.
+/
void toUpperInPlace(C)(ref C[] s) @trusted pure
if (is(C == char) || is(C == wchar) || is(C == dchar))
{
    toCaseInPlace!UpperTriple(s);
}
// non-template overloads for the most common cases to reduce compile time/code size
@safe pure /*TODO nothrow*/
{
    void toUpperInPlace(ref char[] s)
    {
        toUpperInPlace!char(s);
    }

    void toUpperInPlace(ref wchar[] s)
    {
        toUpperInPlace!wchar(s);
    }

    void toUpperInPlace(ref dchar[] s)
    {
        toUpperInPlace!dchar(s);
    }
}
9985 
/++
    Returns the lowercase equivalent of `c` if `c` is a Unicode uppercase
    $(CHARACTER); otherwise returns `c` itself.

    Warning: certain alphabets like German and Greek have no 1:1
    upper-lower mapping. Use overload of toLower which takes full string instead.
+/
@safe pure nothrow @nogc
dchar toLower(dchar c)
{
    // optimize ASCII case: below 0xAA only 'A' .. 'Z' are changed
    if (c < 0xAA)
    {
        if ('A' <= c && c <= 'Z')
            return c + 32;
        return c;
    }

    // table lookup; ushort.max means "no mapping"
    immutable size_t idx = toLowerSimpleIndex(c);
    if (idx == ushort.max)
        return c;
    return toLowerTab(idx);
}
10012 
/++
    Produces an array equal to `s` with every character converted to
    lowercase (by performing Unicode lowercase mapping).
    If none of `s` characters were affected, then `s` itself is returned if `s` is a
    `string`-like type.

    Params:
        s = A $(REF_ALTTEXT random access range, isRandomAccessRange, std,range,primitives)
        of characters
    Returns:
        An array with the same element type as `s`.
+/
ElementEncodingType!S[] toLower(S)(return scope S s) @trusted
if (isSomeString!S)
{
    static import std.ascii;

    alias asciiLower = std.ascii.toLower;
    return toCase!(LowerTriple, asciiLower)(s);
}
10031 
/// ditto
ElementEncodingType!S[] toLower(S)(S s)
if (!isSomeString!S && (isRandomAccessRange!S && hasLength!S && hasSlicing!S && isSomeChar!(ElementType!S)))
{
    static import std.ascii;

    // same conversion as the string overload, for sliceable random access ranges
    alias asciiLower = std.ascii.toLower;
    return toCase!(LowerTriple, asciiLower)(s);
}
10039 
// non-template overloads for the most common cases to reduce compile time
@safe pure /*TODO nothrow*/
{
    string toLower(return scope string s)
    {
        return toLower!string(s);
    }

    wstring toLower(return scope wstring s)
    {
        return toLower!wstring(s);
    }

    dstring toLower(return scope dstring s)
    {
        return toLower!dstring(s);
    }

    @safe unittest
    {
        // https://issues.dlang.org/show_bug.cgi?id=16663
        // compile-only check: toLower accepts a struct that exposes a
        // string through `alias this`

        static struct String
        {
            string data;
            alias data this;
        }

        void foo()
        {
            auto u = toLower(String(""));
        }
    }
}
10066 
10067 
// Checks toLower(dchar) against std.ascii for ASCII, on a few non-ASCII
// letters and on every code point of unicode.upperCase, plus a few
// string-level conversions.
@safe unittest
{
    static import std.ascii;
    import std.format : format;
    foreach (ch; 0 .. 0x80)
        assert(std.ascii.toLower(ch) == toLower(ch));
    assert(toLower('Я') == 'я');
    assert(toLower('Δ') == 'δ');
    foreach (ch; unicode.upperCase.byCodepoint)
    {
        dchar low = ch.toLower();
        assert(low == ch || isLower(low), format("%s -> %s", ch, low));
    }
    assert(toLower("АЯ") == "ая");

    assert("\u1E9E".toLower == "\u00df");
    assert("\u00df".toUpper == "SS");
}
10086 
// https://issues.dlang.org/show_bug.cgi?id=9629
// Upper-casing a one-element slice in place must be visible through the
// array it was sliced from.
@safe unittest
{
    wchar[] test = "hello þ world"w.dup;
    auto piece = test[6 .. 7];
    toUpperInPlace(piece);
    assert(test == "hello Þ world");
}
10095 
10096 
// Checks that toLower and toLowerInPlace agree on several narrow strings,
// and covers wide strings and the issue 12455 simple-case mappings.
@safe unittest
{
    import std.algorithm.comparison : cmp;
    string s1 = "FoL";
    string s2 = toLower(s1);
    assert(cmp(s2, "fol") == 0, s2);
    assert(s2 != s1);

    char[] s3 = s1.dup;
    toLowerInPlace(s3);
    assert(s3 == s2);

    s1 = "A\u0100B\u0101d";
    s2 = toLower(s1);
    s3 = s1.dup;
    assert(cmp(s2, "a\u0101b\u0101d") == 0);
    assert(s2 !is s1);
    toLowerInPlace(s3);
    assert(s3 == s2);

    s1 = "A\u0460B\u0461d";
    s2 = toLower(s1);
    s3 = s1.dup;
    assert(cmp(s2, "a\u0461b\u0461d") == 0);
    assert(s2 !is s1);
    toLowerInPlace(s3);
    assert(s3 == s2);

    // U+0130 lower-cases to two code points
    s1 = "\u0130";
    s2 = toLower(s1);
    s3 = s1.dup;
    assert(s2 == "i\u0307");
    assert(s2 !is s1);
    toLowerInPlace(s3);
    assert(s3 == s2);

    // Test on wchar and dchar strings.
    assert(toLower("Some String"w) == "some string"w);
    assert(toLower("Some String"d) == "some string"d);

    // https://issues.dlang.org/show_bug.cgi?id=12455
    dchar c = 'İ'; // '\U0130' LATIN CAPITAL LETTER I WITH DOT ABOVE
    assert(isUpper(c));
    assert(toLower(c) == 'i');
    // extends on https://issues.dlang.org/show_bug.cgi?id=12455 report
    // check simple-case toUpper too
    c = '\u1f87';
    assert(isLower(c));
    assert(toUpper(c) == '\u1F8F');
}
10147 
// Checks toLower on byCodeUnit ranges (the non-string overload).
@safe pure unittest
{
    import std.algorithm.comparison : cmp, equal;
    import std.utf : byCodeUnit;
    auto r1 = "FoL".byCodeUnit;
    assert(r1.toLower.cmp("fol") == 0);
    auto r2 = "A\u0460B\u0461d".byCodeUnit;
    assert(r2.toLower.cmp("a\u0461b\u0461d") == 0);
}
10157 
/++
    Returns the uppercase equivalent of `c` if `c` is a Unicode lowercase
    $(CHARACTER); otherwise returns `c` itself.

    Warning:
    Certain alphabets like German and Greek have no 1:1
    upper-lower mapping. Use overload of toUpper which takes full string instead.

    toUpper can be used as an argument to $(REF map, std,algorithm,iteration)
    to produce an algorithm that can convert a range of characters to upper case
    without allocating memory.
    A string can then be produced by using $(REF copy, std,algorithm,mutation)
    to send it to an $(REF appender, std,array).
+/
@safe pure nothrow @nogc
dchar toUpper(dchar c)
{
    // optimize ASCII case: below 0xAA only 'a' .. 'z' are changed
    if (c < 0xAA)
    {
        if ('a' <= c && c <= 'z')
            return c - 32;
        return c;
    }

    // table lookup; ushort.max means "no mapping"
    immutable size_t idx = toUpperSimpleIndex(c);
    if (idx == ushort.max)
        return c;
    return toUpperTab(idx);
}
10191 
///
@safe unittest
{
    import std.algorithm.iteration : map;
    import std.algorithm.mutation : copy;
    import std.array : appender;

    // upper-case lazily with map and collect the result in an appender
    auto abuf = appender!(char[])();
    "hello".map!toUpper.copy(abuf);
    assert(abuf.data == "HELLO");
}
10203 
// Checks toUpper(dchar) against std.ascii for ASCII, on a few non-ASCII
// letters and on every code point of unicode.lowerCase.
@safe unittest
{
    static import std.ascii;
    import std.format : format;
    foreach (ch; 0 .. 0x80)
        assert(std.ascii.toUpper(ch) == toUpper(ch));
    assert(toUpper('я') == 'Я');
    assert(toUpper('δ') == 'Δ');
    auto title = unicode.Titlecase_Letter;
    foreach (ch; unicode.lowerCase.byCodepoint)
    {
        dchar up = ch.toUpper();
        // the result may also be a titlecase letter
        assert(up == ch || isUpper(up) || title[up],
            format("%x -> %x", ch, up));
    }
}
10220 
/++
    Produces an array equal to `s` with every character converted to
    uppercase (by performing Unicode uppercase mapping).
    If none of `s` characters were affected, then `s` itself is returned if `s`
    is a `string`-like type.

    Params:
        s = A $(REF_ALTTEXT random access range, isRandomAccessRange, std,range,primitives)
        of characters
    Returns:
        A new array with the same element type as `s`.
+/
ElementEncodingType!S[] toUpper(S)(return scope S s) @trusted
if (isSomeString!S)
{
    static import std.ascii;

    alias asciiUpper = std.ascii.toUpper;
    return toCase!(UpperTriple, asciiUpper)(s);
}
10239 
/// ditto
ElementEncodingType!S[] toUpper(S)(S s)
if (!isSomeString!S && (isRandomAccessRange!S && hasLength!S && hasSlicing!S && isSomeChar!(ElementType!S)))
{
    static import std.ascii;

    // same conversion as the string overload, for sliceable random access ranges
    alias asciiUpper = std.ascii.toUpper;
    return toCase!(UpperTriple, asciiUpper)(s);
}
10247 
// non-template overloads for the most common cases to reduce compile time
@safe pure /*TODO nothrow*/
{
    string toUpper(return scope string s)
    {
        return toUpper!string(s);
    }

    wstring toUpper(return scope wstring s)
    {
        return toUpper!wstring(s);
    }

    dstring toUpper(return scope dstring s)
    {
        return toUpper!dstring(s);
    }

    @safe unittest
    {
        // https://issues.dlang.org/show_bug.cgi?id=16663
        // compile-only check: toUpper accepts a struct that exposes a
        // string through `alias this`

        static struct String
        {
            string data;
            alias data this;
        }

        void foo()
        {
            auto u = toUpper(String(""));
        }
    }
}
10274 
// Checks that toUpper and toUpperInPlace agree on several narrow strings
// and that toUpper returns a different array when something changed.
@safe unittest
{
    import std.algorithm.comparison : cmp;

    string s1 = "FoL";
    string s2;
    char[] s3;

    s2 = toUpper(s1);
    s3 = s1.dup; toUpperInPlace(s3);
    assert(s3 == s2, s3);
    assert(cmp(s2, "FOL") == 0);
    assert(s2 !is s1);

    s1 = "a\u0100B\u0101d";
    s2 = toUpper(s1);
    s3 = s1.dup; toUpperInPlace(s3);
    assert(s3 == s2);
    assert(cmp(s2, "A\u0100B\u0100D") == 0);
    assert(s2 !is s1);

    s1 = "a\u0460B\u0461d";
    s2 = toUpper(s1);
    s3 = s1.dup; toUpperInPlace(s3);
    assert(s3 == s2);
    assert(cmp(s2, "A\u0460B\u0460D") == 0);
    assert(s2 !is s1);
}
10303 
// Cross-checks toLower/toUpper against toLowerInPlace/toUpperInPlace for
// string, wstring and dstring on a set of sample strings and on
// concatenations of three of them.
@safe unittest
{
    // Converts `s` with all four functions and compares each result with
    // the expected upper-case and lower-case strings.
    static void doTest(C)(const(C)[] s, const(C)[] trueUp, const(C)[] trueLow)
    {
        import std.format : format;
        string diff = "src: %( %x %)\nres: %( %x %)\ntru: %( %x %)";
        auto low = s.toLower() , up = s.toUpper();
        auto lowInp = s.dup, upInp = s.dup;
        lowInp.toLowerInPlace();
        upInp.toUpperInPlace();
        assert(low == trueLow, format(diff, low, trueLow));
        assert(up == trueUp,  format(diff, up, trueUp));
        assert(lowInp == trueLow,
            format(diff, cast(const(ubyte)[]) s, cast(const(ubyte)[]) lowInp, cast(const(ubyte)[]) trueLow));
        assert(upInp == trueUp,
            format(diff, cast(const(ubyte)[]) s, cast(const(ubyte)[]) upInp, cast(const(ubyte)[]) trueUp));
    }
    static foreach (S; AliasSeq!(dstring, wstring, string))
    {{

        S easy = "123";
        S good = "abCФеж";
        S awful = "\u0131\u023f\u2126";
        S wicked = "\u0130\u1FE2";
        auto options = [easy, good, awful, wicked];
        // expected results, index-aligned with `options`
        S[] lower = ["123", "abcфеж", "\u0131\u023f\u03c9", "i\u0307\u1Fe2"];
        S[] upper = ["123", "ABCФЕЖ", "I\u2c7e\u2126", "\u0130\u03A5\u0308\u0300"];

        // for these inputs the in-place functions must keep the same array
        foreach (val; [easy, good])
        {
            auto e = val.dup;
            auto g = e;
            e.toUpperInPlace();
            assert(e is g);
            e.toLowerInPlace();
            assert(e is g);
        }
        foreach (i, v; options)
        {
            doTest(v, upper[i], lower[i]);
        }

        // a few combinatorial runs
        foreach (i; 0 .. options.length)
        foreach (j; i .. options.length)
        foreach (k; j .. options.length)
        {
            auto sample = options[i] ~ options[j] ~ options[k];
            auto sample2 = options[k] ~ options[j] ~ options[i];
            doTest(sample, upper[i] ~ upper[j] ~ upper[k],
                lower[i] ~ lower[j] ~ lower[k]);
            doTest(sample2, upper[k] ~ upper[j] ~ upper[i],
                lower[k] ~ lower[j] ~ lower[i]);
        }
    }}
}
10360 
// test random access ranges: toUpper on byCodeUnit ranges (the non-string overload)
@safe pure unittest
{
    import std.algorithm.comparison : cmp;
    import std.utf : byCodeUnit;
    auto s1 = "FoL".byCodeUnit;
    assert(s1.toUpper.cmp("FOL") == 0);
    auto s2 = "a\u0460B\u0461d".byCodeUnit;
    assert(s2.toUpper.cmp("A\u0460B\u0460D") == 0);
}
10371 
/++
    Tells whether `c` is a Unicode alphabetic $(CHARACTER)
    (general Unicode category: Alphabetic).
+/
@safe pure nothrow @nogc
bool isAlpha(dchar c)
{
    // everything from 0xAA upwards is answered by the trie
    if (c >= 0xAA)
        return alphaTrie[c];

    // optimization: below 0xAA only the ASCII letters count
    if ('a' <= c && c <= 'z')
        return true;
    return 'A' <= c && c <= 'Z';
}
10387 
// Checks isAlpha against the "Alphabetic" set: every member is accepted and
// both agree on all code points below 0x4000.
@safe unittest
{
    auto alpha = unicode("Alphabetic");
    foreach (ch; alpha.byCodepoint)
        assert(isAlpha(ch));
    foreach (ch; 0 .. 0x4000)
        assert((ch in alpha) == isAlpha(ch));
}
10396 
10397 
/++
    Tells whether `c` is a Unicode mark
    (general Unicode category: Mn, Me, Mc).
+/
@safe pure nothrow @nogc
bool isMark(dchar c)
{
    // plain trie lookup, no ASCII shortcut
    return markTrie[c];
}
10407 
// Checks isMark against the "Mark" set: every member is accepted and both
// agree on all code points below 0x4000.
@safe unittest
{
    auto mark = unicode("Mark");
    foreach (ch; mark.byCodepoint)
        assert(isMark(ch));
    foreach (ch; 0 .. 0x4000)
        assert((ch in mark) == isMark(ch));
}
10416 
/++
    Tells whether `c` is a Unicode numerical $(CHARACTER)
    (general Unicode category: Nd, Nl, No).
+/
@safe pure nothrow @nogc
bool isNumber(dchar c)
{
    // non-ASCII code points are answered by the trie
    if (c > 0x7F)
        return numberTrie[c];

    // optimization for ascii case: only the decimal digits
    return '0' <= c && c <= '9';
}
10434 
// Checks isNumber against the "N" set: every member is accepted and both
// agree on all code points below 0x4000.
@safe unittest
{
    auto n = unicode("N");
    foreach (ch; n.byCodepoint)
        assert(isNumber(ch));
    foreach (ch; 0 .. 0x4000)
        assert((ch in n) == isNumber(ch));
}
10443 
/++
    Tells whether `c` is a Unicode alphabetic $(CHARACTER) or number.
    (general Unicode category: Alphabetic, Nd, Nl, No).

    Params:
        c = any Unicode character
    Returns:
        `true` if the character is in the Alphabetic, Nd, Nl, or No Unicode
        categories
+/
@safe pure nothrow @nogc
bool isAlphaNum(dchar c)
{
    static import std.ascii;

    // non-ASCII: combine the two Unicode predicates
    if (!std.ascii.isASCII(c))
        return isAlpha(c) || isNumber(c);

    // optimization for ascii case
    return std.ascii.isAlphaNum(c);
}
10469 
// Checks isAlphaNum against the union of the "N" and "Alphabetic" sets.
@safe unittest
{
    auto n = unicode("N");
    auto alpha = unicode("Alphabetic");

    foreach (ch; n.byCodepoint)
        assert(isAlphaNum(ch));

    foreach (ch; alpha.byCodepoint)
        assert(isAlphaNum(ch));

    // both must agree on all code points below 0x4000
    foreach (ch; 0 .. 0x4000)
    {
        assert(((ch in n) || (ch in alpha)) == isAlphaNum(ch));
    }
}
10486 
/++
    Tells whether `c` is a Unicode punctuation $(CHARACTER)
    (general Unicode category: Pd, Ps, Pe, Pc, Po, Pi, Pf).
+/
@safe pure nothrow @nogc
bool isPunctuation(dchar c)
{
    static import std.ascii;

    // non-ASCII code points are answered by the trie
    if (c > 0x7F)
        return punctuationTrie[c];

    // optimization for ascii case
    return std.ascii.isPunctuation(c);
}
10506 
// Checks isPunctuation on a few code points and on every member of the
// "P" set.
@safe unittest
{
    assert(isPunctuation('\u0021'));
    assert(isPunctuation('\u0028'));
    assert(isPunctuation('\u0029'));
    assert(isPunctuation('\u002D'));
    assert(isPunctuation('\u005F'));
    assert(isPunctuation('\u00AB'));
    assert(isPunctuation('\u00BB'));
    foreach (ch; unicode("P").byCodepoint)
        assert(isPunctuation(ch));
}
10519 
/++
    Tells whether `c` is a Unicode symbol $(CHARACTER)
    (general Unicode category: Sm, Sc, Sk, So).
+/
@safe pure nothrow @nogc
bool isSymbol(dchar c)
{
    // plain trie lookup, no ASCII shortcut
    return symbolTrie[c];
}
10529 
// Checks isSymbol on a few code points and on every member of the "S" set.
@safe unittest
{
    import std.format : format;
    assert(isSymbol('\u0024'));
    assert(isSymbol('\u002B'));
    assert(isSymbol('\u005E'));
    assert(isSymbol('\u00A6'));
    foreach (ch; unicode("S").byCodepoint)
        assert(isSymbol(ch), format("%04x", ch));
}
10540 
/++
    Tells whether `c` is a Unicode space $(CHARACTER)
    (general Unicode category: Zs)
    Note: This doesn't include '\n', '\r', '\t' and other non-space $(CHARACTER).
    For commonly used less strict semantics see $(LREF isWhite).
+/
@safe pure nothrow @nogc
bool isSpace(dchar c)
{
    // the predicate itself lives in the generated tables module
    import std.internal.unicode_tables : isSpaceGen;

    return isSpaceGen(c);
}
10553 
// Checks isSpace against the Zs set: every member is accepted and both
// agree on all code points below 0x1000.
@safe unittest
{
    assert(isSpace('\u0020'));
    auto space = unicode.Zs;
    foreach (ch; space.byCodepoint)
        assert(isSpace(ch));
    foreach (ch; 0 .. 0x1000)
        assert(isSpace(ch) == space[ch]);
}
10563 
10564 
/++
    Tells whether `c` is a Unicode graphical $(CHARACTER)
    (general Unicode category: L, M, N, P, S, Zs).
+/
@safe pure nothrow @nogc
bool isGraphical(dchar c)
{
    // plain trie lookup, no ASCII shortcut
    return graphicalTrie[c];
}
10575 
10576 
// Checks isGraphical against the "Graphical" set: every member is accepted
// and both agree on all code points below 0x4000.
@safe unittest
{
    auto set = unicode("Graphical");
    import std.format : format;
    foreach (ch; set.byCodepoint)
        assert(isGraphical(ch), format("%4x", ch));
    foreach (ch; 0 .. 0x4000)
        assert((ch in set) == isGraphical(ch));
}
10586 
10587 
/++
    Tells whether `c` is a Unicode control $(CHARACTER)
    (general Unicode category: Cc).
+/
@safe pure nothrow @nogc
bool isControl(dchar c)
{
    // the predicate itself lives in the generated tables module
    import std.internal.unicode_tables : isControlGen;

    return isControlGen(c);
}
10598 
// Checks isControl on a few code points and against the Cc set: every
// member is accepted and both agree on all code points below 0x1000.
@safe unittest
{
    assert(isControl('\u0000'));
    assert(isControl('\u0081'));
    assert(!isControl('\u0100'));
    auto cc = unicode.Cc;
    foreach (ch; cc.byCodepoint)
        assert(isControl(ch));
    foreach (ch; 0 .. 0x1000)
        assert(isControl(ch) == cc[ch]);
}
10610 
10611 
/++
    Returns whether `c` is a Unicode formatting $(CHARACTER)
    (general Unicode category: Cf).
+/
@safe pure nothrow @nogc
bool isFormat(dchar c)
{
    // The actual classification lives in the generated tables module.
    import tables = std.internal.unicode_tables; // generated file
    return tables.isFormatGen(c);
}
10622 
10623 
@safe unittest
{
    // spot check: U+00AD SOFT HYPHEN
    assert(isFormat('\u00AD'));

    // every member of the "Format" set must be accepted
    auto formatSet = unicode("Format");
    foreach (cp; formatSet.byCodepoint)
        assert(isFormat(cp));
}
10630 
// Code points for private use and surrogates are not likely to change in the near future;
// if need be, they can be generated from Unicode data as well.
10633 
/++
    Returns whether `c` is a Unicode Private Use $(CODEPOINT)
    (general Unicode category: Co).
+/
@safe pure nothrow @nogc
bool isPrivateUse(dchar c)
{
    // U+E000 .. U+F8FF
    if (c >= 0x00_E000 && c <= 0x00_F8FF)
        return true;

    // U+F0000 .. U+FFFFD
    if (c >= 0x0F_0000 && c <= 0x0F_FFFD)
        return true;

    // U+100000 .. U+10FFFD
    return c >= 0x10_0000 && c <= 0x10_FFFD;
}
10645 
/++
    Returns whether `c` is a Unicode surrogate $(CODEPOINT)
    (general Unicode category: Cs).
+/
@safe pure nothrow @nogc
bool isSurrogate(dchar c)
{
    // inclusive bounds of the surrogate range
    enum uint firstSurrogate = 0xD800;
    enum uint lastSurrogate = 0xDFFF;

    return firstSurrogate <= c && c <= lastSurrogate;
}
10655 
/++
    Returns whether `c` is a Unicode high surrogate (lead surrogate).
+/
@safe pure nothrow @nogc
bool isSurrogateHi(dchar c)
{
    // anything below U+D800 cannot be a lead surrogate
    if (c < 0xD800)
        return false;

    // lead surrogates end at U+DBFF (inclusive)
    return c <= 0xDBFF;
}
10664 
/++
    Returns whether `c` is a Unicode low surrogate (trail surrogate).
+/
@safe pure nothrow @nogc
bool isSurrogateLo(dchar c)
{
    // a trail surrogate is anything not outside U+DC00 .. U+DFFF
    return !(c < 0xDC00 || c > 0xDFFF);
}
10673 
/++
    Returns whether `c` is a Unicode non-character i.e.
    a $(CODEPOINT) with no assigned abstract character.
    (general Unicode category: Cn)
+/
@safe pure nothrow @nogc
bool isNonCharacter(dchar c)
{
    // membership test against the non-character trie
    immutable bool nonCharacter = nonCharacterTrie[c];
    return nonCharacter;
}
10684 
@safe unittest
{
    // every member of the Cn set must be accepted
    auto unassigned = unicode("Cn");
    foreach (cp; unassigned.byCodepoint)
        assert(isNonCharacter(cp));
}
10691 
10692 private:
10693 // load static data from pre-generated tables into usable datastructures
10694 
10695 
// Builds a CodepointSet from a compressed interval table:
// the bytes are decompressed into intervals, which then populate the set.
@safe auto asSet(const (ubyte)[] compressed) pure
{
    auto intervals = decompressIntervals(compressed);
    return CodepointSet.fromIntervals(intervals);
}
10700 
// Wraps the offsets, sizes and data of a pre-generated TrieEntry
// in a const CodepointTrie with the same template parameters.
@safe pure nothrow auto asTrie(T...)(const scope TrieEntry!T e)
{
    alias Trie = const(CodepointTrie!T);
    return Trie(e.offsets, e.sizes, e.data);
}
10705 
@safe pure nothrow @nogc @property
{
    // Every accessor below follows the same shape: a function-local
    // `static immutable` trie is built from its pre-generated entries
    // via asTrie and then returned.
    //
    // It's important to use auto return here, so that the compiler
    // only runs semantic on the return type if the function gets
    // used. Also these are functions rather than templates to not
    // increase the object size of the caller.

    // character classification tables
    auto lowerCaseTrie()
    {
        static immutable trie = asTrie(lowerCaseTrieEntries);
        return trie;
    }

    auto upperCaseTrie()
    {
        static immutable trie = asTrie(upperCaseTrieEntries);
        return trie;
    }

    auto simpleCaseTrie()
    {
        static immutable trie = asTrie(simpleCaseTrieEntries);
        return trie;
    }

    auto fullCaseTrie()
    {
        static immutable trie = asTrie(fullCaseTrieEntries);
        return trie;
    }

    auto alphaTrie()
    {
        static immutable trie = asTrie(alphaTrieEntries);
        return trie;
    }

    auto markTrie()
    {
        static immutable trie = asTrie(markTrieEntries);
        return trie;
    }

    auto numberTrie()
    {
        static immutable trie = asTrie(numberTrieEntries);
        return trie;
    }

    auto punctuationTrie()
    {
        static immutable trie = asTrie(punctuationTrieEntries);
        return trie;
    }

    auto symbolTrie()
    {
        static immutable trie = asTrie(symbolTrieEntries);
        return trie;
    }

    auto graphicalTrie()
    {
        static immutable trie = asTrie(graphicalTrieEntries);
        return trie;
    }

    auto nonCharacterTrie()
    {
        static immutable trie = asTrie(nonCharacterTrieEntries);
        return trie;
    }

    // normalization quick-check tables
    auto nfcQCTrie()
    {
        import std.internal.unicode_norm : nfcQCTrieEntries;
        static immutable trie = asTrie(nfcQCTrieEntries);
        return trie;
    }

    auto nfdQCTrie()
    {
        import std.internal.unicode_norm : nfdQCTrieEntries;
        static immutable trie = asTrie(nfdQCTrieEntries);
        return trie;
    }

    auto nfkcQCTrie()
    {
        import std.internal.unicode_norm : nfkcQCTrieEntries;
        static immutable trie = asTrie(nfkcQCTrieEntries);
        return trie;
    }

    auto nfkdQCTrie()
    {
        import std.internal.unicode_norm : nfkdQCTrieEntries;
        static immutable trie = asTrie(nfkdQCTrieEntries);
        return trie;
    }

    // grapheme breaking algorithm tables
    auto spacingMarkTrie()
    {
        import std.internal.unicode_grapheme : spacingMarkTrieEntries;
        static immutable trie = asTrie(spacingMarkTrieEntries);
        return trie;
    }

    auto graphemeExtendTrie()
    {
        import std.internal.unicode_grapheme : graphemeExtendTrieEntries;
        static immutable trie = asTrie(graphemeExtendTrieEntries);
        return trie;
    }

    auto hangLV()
    {
        import std.internal.unicode_grapheme : hangulLVTrieEntries;
        static immutable trie = asTrie(hangulLVTrieEntries);
        return trie;
    }

    auto hangLVT()
    {
        import std.internal.unicode_grapheme : hangulLVTTrieEntries;
        static immutable trie = asTrie(hangulLVTTrieEntries);
        return trie;
    }

    auto prependTrie()
    {
        import std.internal.unicode_grapheme : prependTrieEntries;
        static immutable trie = asTrie(prependTrieEntries);
        return trie;
    }

    auto graphemeControlTrie()
    {
        import std.internal.unicode_grapheme : controlTrieEntries;
        static immutable trie = asTrie(controlTrieEntries);
        return trie;
    }

    auto xpictoTrie()
    {
        import std.internal.unicode_grapheme : Extended_PictographicTrieEntries;
        static immutable trie = asTrie(Extended_PictographicTrieEntries);
        return trie;
    }

    // tables below are used for composition/decomposition
    auto combiningClassTrie()
    {
        import std.internal.unicode_comp : combiningClassTrieEntries;
        static immutable trie = asTrie(combiningClassTrieEntries);
        return trie;
    }

    auto compatMappingTrie()
    {
        import std.internal.unicode_decomp : compatMappingTrieEntries;
        static immutable trie = asTrie(compatMappingTrieEntries);
        return trie;
    }

    auto canonMappingTrie()
    {
        import std.internal.unicode_decomp : canonMappingTrieEntries;
        static immutable trie = asTrie(canonMappingTrieEntries);
        return trie;
    }

    auto compositionJumpTrie()
    {
        import std.internal.unicode_comp : compositionJumpTrieEntries;
        static immutable trie = asTrie(compositionJumpTrieEntries);
        return trie;
    }

    // case conversion tables
    auto toUpperIndexTrie()
    {
        static immutable trie = asTrie(toUpperIndexTrieEntries);
        return trie;
    }

    auto toLowerIndexTrie()
    {
        static immutable trie = asTrie(toLowerIndexTrieEntries);
        return trie;
    }

    auto toTitleIndexTrie()
    {
        static immutable trie = asTrie(toTitleIndexTrieEntries);
        return trie;
    }

    // simple case conversion tables
    auto toUpperSimpleIndexTrie()
    {
        static immutable trie = asTrie(toUpperSimpleIndexTrieEntries);
        return trie;
    }

    auto toLowerSimpleIndexTrie()
    {
        static immutable trie = asTrie(toLowerSimpleIndexTrieEntries);
        return trie;
    }

    auto toTitleSimpleIndexTrie()
    {
        static immutable trie = asTrie(toTitleSimpleIndexTrieEntries);
        return trie;
    }

}
10842 
10843 }// version (!std_uni_bootstrap)
The OpenD Programming Language