The OpenD Programming Language

1 // Written in the D programming language.
2 
3 /++
4     $(P The `std.uni` module provides an implementation
5     of fundamental Unicode algorithms and data structures.
6     This doesn't include UTF encoding and decoding primitives,
7     see $(REF decode, std,_utf) and $(REF encode, std,_utf) in $(MREF std, utf)
8     for this functionality. )
9 
10 $(SCRIPT inhibitQuickIndex = 1;)
11 $(DIVC quickindex,
12 $(BOOKTABLE,
13 $(TR $(TH Category) $(TH Functions))
14 $(TR $(TD Decode) $(TD
15     $(LREF byCodePoint)
16     $(LREF byGrapheme)
17     $(LREF decodeGrapheme)
18     $(LREF graphemeStride)
19 ))
20 $(TR $(TD Comparison) $(TD
21     $(LREF icmp)
22     $(LREF sicmp)
23 ))
24 $(TR $(TD Classification) $(TD
25     $(LREF isAlpha)
26     $(LREF isAlphaNum)
27     $(LREF isCodepointSet)
28     $(LREF isControl)
29     $(LREF isFormat)
30     $(LREF isGraphical)
31     $(LREF isIntegralPair)
32     $(LREF isMark)
33     $(LREF isNonCharacter)
34     $(LREF isNumber)
35     $(LREF isPrivateUse)
36     $(LREF isPunctuation)
37     $(LREF isSpace)
38     $(LREF isSurrogate)
39     $(LREF isSurrogateHi)
40     $(LREF isSurrogateLo)
41     $(LREF isSymbol)
42     $(LREF isWhite)
43 ))
44 $(TR $(TD Normalization) $(TD
45     $(LREF NFC)
46     $(LREF NFD)
47     $(LREF NFKD)
48     $(LREF NormalizationForm)
49     $(LREF normalize)
50 ))
51 $(TR $(TD Decompose) $(TD
52     $(LREF decompose)
53     $(LREF decomposeHangul)
54     $(LREF UnicodeDecomposition)
55 ))
56 $(TR $(TD Compose) $(TD
57     $(LREF compose)
58     $(LREF composeJamo)
59 ))
60 $(TR $(TD Sets) $(TD
61     $(LREF CodepointInterval)
62     $(LREF CodepointSet)
63     $(LREF InversionList)
64     $(LREF unicode)
65 ))
66 $(TR $(TD Trie) $(TD
67     $(LREF codepointSetTrie)
68     $(LREF CodepointSetTrie)
69     $(LREF codepointTrie)
70     $(LREF CodepointTrie)
71     $(LREF toTrie)
72     $(LREF toDelegate)
73 ))
74 $(TR $(TD Casing) $(TD
75     $(LREF asCapitalized)
76     $(LREF asLowerCase)
77     $(LREF asUpperCase)
78     $(LREF isLower)
79     $(LREF isUpper)
80     $(LREF toLower)
81     $(LREF toLowerInPlace)
82     $(LREF toUpper)
83     $(LREF toUpperInPlace)
84 ))
85 $(TR $(TD Utf8Matcher) $(TD
86     $(LREF isUtfMatcher)
87     $(LREF MatcherConcept)
88     $(LREF utfMatcher)
89 ))
90 $(TR $(TD Separators) $(TD
91     $(LREF lineSep)
92     $(LREF nelSep)
93     $(LREF paraSep)
94 ))
95 $(TR $(TD Building blocks) $(TD
96     $(LREF allowedIn)
97     $(LREF combiningClass)
98     $(LREF Grapheme)
99 ))
100 ))
101 
102     $(P All primitives listed operate on Unicode characters and
103         sets of characters. For functions which operate on ASCII characters
104         and ignore Unicode $(CHARACTERS), see $(MREF std, ascii).
105         For definitions of Unicode $(CHARACTER), $(CODEPOINT) and other terms
106         used throughout this module see the $(S_LINK Terminology, terminology) section
107         below.
108     )
109     $(P The focus of this module is the core needs of developing Unicode-aware
110         applications. To that effect it provides the following optimized primitives:
111     )
112     $(UL
113         $(LI Character classification by category and common properties:
114             $(LREF isAlpha), $(LREF isWhite) and others.
115         )
116         $(LI
117             Case-insensitive string comparison ($(LREF sicmp), $(LREF icmp)).
118         )
119         $(LI
120             Converting text to any of the four normalization forms via $(LREF normalize).
121         )
122         $(LI
123             Decoding ($(LREF decodeGrapheme))  and iteration ($(LREF byGrapheme), $(LREF graphemeStride))
124             by user-perceived characters, that is by $(LREF Grapheme) clusters.
125         )
126         $(LI
127             Decomposing and composing of individual character(s) according to canonical
128             or compatibility rules, see $(LREF compose) and $(LREF decompose),
129             including the specific version for Hangul syllables $(LREF composeJamo)
130             and $(LREF decomposeHangul).
131         )
132     )
133     $(P It's recognized that an application may need further enhancements
134         and extensions, such as less commonly known algorithms,
135         or tailoring existing ones for region specific needs. To help users
136         with building any extra functionality beyond the core primitives,
137         the module provides:
138     )
139     $(UL
140         $(LI
141             $(LREF CodepointSet), a type for easy manipulation of sets of characters.
142             Besides the typical set algebra it provides an unusual feature:
143             a D source code generator for detection of $(CODEPOINTS) in this set.
144             This is a boon for meta-programming parser frameworks,
145             and is used internally to power classification in small
146             sets like $(LREF isWhite).
147         )
148         $(LI
149             A way to construct optimal packed multi-stage tables also known as a
150             special case of $(LINK2 https://en.wikipedia.org/wiki/Trie, Trie).
151             The functions $(LREF codepointTrie), $(LREF codepointSetTrie)
152             construct custom tries that map dchar to value.
153             The end result is a fast and predictable $(BIGOH 1) lookup that powers
154             functions like $(LREF isAlpha) and $(LREF combiningClass),
155             but for user-defined data sets.
156         )
157         $(LI
158             A useful technique for Unicode-aware parsers that perform
159             character classification of encoded $(CODEPOINTS)
160             is to avoid unnecessary decoding at all costs.
161             $(LREF utfMatcher) provides an improvement over the usual workflow
162             of decode-classify-process, combining the decoding and classification
163             steps. By extracting necessary bits directly from encoded
164             $(S_LINK Code unit, code units) matchers achieve
165             significant performance improvements. See $(LREF MatcherConcept) for
166             the common interface of UTF matchers.
167         )
168         $(LI
169             Generally useful building blocks for customized normalization:
170             $(LREF combiningClass) for querying combining class
171             and $(LREF allowedIn) for testing the Quick_Check
172             property of a given normalization form.
173         )
174         $(LI
175             Access to a large selection of commonly used sets of $(CODEPOINTS).
176             $(S_LINK Unicode properties, Supported sets) include Script,
177             Block and General Category. The exact contents of a set can be
178             observed in the CLDR utility, on the
179             $(HTTP www.unicode.org/cldr/utility/properties.jsp, property index) page
180             of the Unicode website.
181             See $(LREF unicode) for easy and (optionally) compile-time checked set
182             queries.
183         )
184     )
185     $(SECTION Synopsis)
186     ---
187     import std.uni;
188     void main()
189     {
190         // initialize code point sets using script/block or property name
191         // now 'set' contains code points from both scripts.
192         auto set = unicode("Cyrillic") | unicode("Armenian");
193         // same thing but simpler and checked at compile-time
194         auto ascii = unicode.ASCII;
195         auto currency = unicode.Currency_Symbol;
196 
197         // easy set ops
198         auto a = set & ascii;
199         assert(a.empty); // as it has no intersection with ascii
200         a = set | ascii;
201         auto b = currency - a; // subtract all ASCII, Cyrillic and Armenian
202 
203         // some properties of code point sets
204         assert(b.length > 45); // 46 items in Unicode 6.1, even more in 6.2
205         // testing presence of a code point in a set
206         // is just fine, it is O(logN)
207         assert(!b['$']);
208         assert(!b['\u058F']); // Armenian dram sign
209         assert(b['¥']);
210 
211         // building fast lookup tables, these guarantee O(1) complexity
212         // 1-level Trie lookup table essentially a huge bit-set ~262Kb
213         auto oneTrie = toTrie!1(b);
214         // 2-level far more compact but typically slightly slower
215         auto twoTrie = toTrie!2(b);
216         // 3-level even smaller, and a bit slower yet
217         auto threeTrie = toTrie!3(b);
218         assert(oneTrie['£']);
219         assert(twoTrie['£']);
220         assert(threeTrie['£']);
221 
222         // build the trie with the most sensible trie level
223         // and bind it as a functor
224         auto cyrillicOrArmenian = toDelegate(set);
225         auto balance = find!(cyrillicOrArmenian)("Hello ընկեր!");
226         assert(balance == "ընկեր!");
227         // compatible with bool delegate(dchar)
228         bool delegate(dchar) bindIt = cyrillicOrArmenian;
229 
230         // Normalization
231         string s = "Plain ascii (and not only), is always normalized!";
232         assert(s is normalize(s));// is the same string
233 
234         string nonS = "A\u0308ffin"; // 'A' followed by a combining diaeresis (U+0308)
235         auto nS = normalize(nonS); // to NFC, the W3C endorsed standard
236         assert(nS == "Äffin");
237         assert(nS != nonS);
238         string composed = "Äffin";
239 
240         assert(normalize!NFD(composed) == "A\u0308ffin");
241         // to NFKD, compatibility decomposition useful for fuzzy matching/searching
242         assert(normalize!NFKD("2¹⁰") == "210");
243     }
244     ---
245     $(SECTION Terminology)
246     $(P The following is a list of important Unicode notions
247     and definitions. Any conventions used specifically in this
248     module alone are marked as such. The descriptions are based on the formal
249     definition as found in $(HTTP www.unicode.org/versions/Unicode6.2.0/ch03.pdf,
250     chapter three of The Unicode Standard Core Specification.)
251     )
252     $(P $(DEF Abstract character) A unit of information used for the organization,
253         control, or representation of textual data.
254         Note that:
255         $(UL
256             $(LI When representing data, the nature of that data
257                 is generally symbolic as opposed to some other
258                 kind of data (for example, visual).
259             )
260              $(LI An abstract character has no concrete form
261                 and should not be confused with a $(S_LINK Glyph, glyph).
262             )
263             $(LI An abstract character does not necessarily
264                 correspond to what a user thinks of as a “character”
265                 and should not be confused with a $(LREF Grapheme).
266             )
267             $(LI The abstract characters encoded (see Encoded character)
268                 are known as Unicode abstract characters.
269             )
270             $(LI Abstract characters not directly
271                 encoded by the Unicode Standard can often be
272                 represented by the use of combining character sequences.
273             )
274         )
275     )
276     $(P $(DEF Canonical decomposition)
277         The decomposition of a character or character sequence
278         that results from recursively applying the canonical
279         mappings found in the Unicode Character Database
280         and these described in Conjoining Jamo Behavior
281         (section 12 of
282         $(HTTP www.unicode.org/uni2book/ch03.pdf, Unicode Conformance)).
283     )
284     $(P $(DEF Canonical composition)
285         The precise definition of the Canonical composition
286         is the algorithm as specified in $(HTTP www.unicode.org/uni2book/ch03.pdf,
287         Unicode Conformance) section 11.
288         Informally it's the process that does the reverse of the canonical
289         decomposition with the addition of certain rules
290         that e.g. prevent legacy characters from appearing in the composed result.
291     )
292     $(P $(DEF Canonical equivalent)
293         Two character sequences are said to be canonical equivalents if
294         their full canonical decompositions are identical.
295     )
296     $(P $(DEF Character) Typically differs by context.
297         For the purpose of this documentation the term $(I character)
298         implies $(I encoded character), that is, a code point having
299         an assigned abstract character (a symbolic meaning).
300     )
301     $(P $(DEF Code point) Any value in the Unicode codespace;
302         that is, the range of integers from 0 to 10FFFF (hex).
303         Not all code points are assigned to encoded characters.
304     )
305     $(P $(DEF Code unit) The minimal bit combination that can represent
306         a unit of encoded text for processing or interchange.
307         Depending on the encoding this could be:
308         8-bit code units in the UTF-8 (`char`),
309         16-bit code units in the UTF-16 (`wchar`),
310         and 32-bit code units in the UTF-32 (`dchar`).
311         $(I Note that in UTF-32, a code unit is a code point
312         and is represented by the D `dchar` type.)
313     )
314     $(P $(DEF Combining character) A character with the General Category
315         of Combining Mark(M).
316         $(UL
317             $(LI All characters with non-zero canonical combining class
318             are combining characters, but the reverse is not the case:
319             there are combining characters with a zero combining class.
320             )
321             $(LI These characters are not normally used in isolation
322             unless they are being described. They include such characters
323             as accents, diacritics, Hebrew points, Arabic vowel signs,
324             and Indic matras.
325             )
326         )
327     )
328     $(P $(DEF Combining class)
329         A numerical value used by the Unicode Canonical Ordering Algorithm
330         to determine which sequences of combining marks are to be
331         considered canonically equivalent and  which are not.
332     )
333     $(P $(DEF Compatibility decomposition)
334         The decomposition of a character or character sequence that results
335         from recursively applying both the compatibility mappings and
336         the canonical mappings found in the Unicode Character Database, and those
337         described in Conjoining Jamo Behavior, until no characters
338         can be further decomposed.
339     )
340     $(P $(DEF Compatibility equivalent)
341         Two character sequences are said to be compatibility
342         equivalents if their full compatibility decompositions are identical.
343     )
344     $(P $(DEF Encoded character) An association (or mapping)
345         between an abstract character and a code point.
346     )
347     $(P $(DEF Glyph) The actual, concrete image of a glyph representation
348         having been rasterized or otherwise imaged onto some display surface.
349     )
350     $(P $(DEF Grapheme base) A character with the property
351         Grapheme_Base, or any standard Korean syllable block.
352     )
353     $(P $(DEF Grapheme cluster) Defined as the text between
354         grapheme boundaries  as specified by Unicode Standard Annex #29,
355         $(HTTP www.unicode.org/reports/tr29/, Unicode text segmentation).
356         Important general properties of a grapheme:
357         $(UL
358             $(LI The grapheme cluster represents a horizontally segmentable
359             unit of text, consisting of some grapheme base (which may
360             consist of a Korean syllable) together with any number of
361             nonspacing marks applied to it.
362             )
363             $(LI  A grapheme cluster typically starts with a grapheme base
364             and then extends across any subsequent sequence of nonspacing marks.
365             A grapheme cluster is most directly relevant to text rendering and
366             processes such as cursor placement and text selection in editing,
367             but may also be relevant to comparison and searching.
368             )
369             $(LI For many processes, a grapheme cluster behaves as if it were a
370             single character with the same properties as its grapheme base.
371             Effectively, nonspacing marks apply $(I graphically) to the base,
372             but do not change its properties.
373             )
374         )
375         $(P This module defines a number of primitives that work with graphemes:
376         $(LREF Grapheme), $(LREF decodeGrapheme) and $(LREF graphemeStride).
377         All of them are using $(I extended grapheme) boundaries
378         as defined in the aforementioned standard annex.
379         )
380     )
381     $(P $(DEF Nonspacing mark) A combining character with the
382         General Category of Nonspacing Mark (Mn) or Enclosing Mark (Me).
383     )
384     $(P $(DEF Spacing mark) A combining character that is not a nonspacing mark.
385     )
386     $(SECTION Normalization)
387     $(P The concepts of $(S_LINK Canonical equivalent, canonical equivalent)
388         or $(S_LINK Compatibility equivalent, compatibility equivalent)
389         characters in the Unicode Standard make it necessary to have a full, formal
390         definition of equivalence for Unicode strings.
391         String equivalence is determined by a process called normalization,
392         whereby strings are converted into forms which are compared
393         directly for identity. This is the primary goal of the normalization process,
394         see the function $(LREF normalize) to convert into any of
395         the four defined forms.
396     )
397     $(P A very important attribute of the Unicode Normalization Forms
398         is that they must remain stable between versions of the Unicode Standard.
399         A Unicode string normalized to a particular Unicode Normalization Form
400         in one version of the standard is guaranteed to remain in that Normalization
401         Form for implementations of future versions of the standard.
402     )
403     $(P The Unicode Standard specifies four normalization forms.
404         Informally, two of these forms are defined by maximal decomposition
405         of equivalent sequences, and two of these forms are defined
406         by maximal $(I composition) of equivalent sequences.
407             $(UL
408             $(LI Normalization Form D (NFD): The $(S_LINK Canonical decomposition,
409                 canonical decomposition) of a character sequence.)
410             $(LI Normalization Form KD (NFKD): The $(S_LINK Compatibility decomposition,
411                 compatibility decomposition) of a character sequence.)
412             $(LI Normalization Form C (NFC): The canonical composition of the
413                 $(S_LINK Canonical decomposition, canonical decomposition)
414                 of a coded character sequence.)
415             $(LI Normalization Form KC (NFKC): The canonical composition
416             of the $(S_LINK Compatibility decomposition,
417                 compatibility decomposition) of a character sequence.)
418             )
419     )
420     $(P The choice of the normalization form depends on the particular use case.
421         NFC is the best form for general text, since it's more compatible with
422         strings converted from legacy encodings. NFKC is the preferred form for
423         identifiers, especially where there are security concerns. NFD and NFKD
424         are the most useful for internal processing.
425     )
426     $(SECTION Construction of lookup tables)
427     $(P The Unicode standard describes a set of algorithms that
428         depend on having the ability to quickly look up various properties
429         of a code point. Given the codespace of about 1 million $(CODEPOINTS),
430         it is not a trivial task to provide a space-efficient solution for
431         the multitude of properties.
432     )
433     $(P Common approaches such as hash-tables or binary search over
434         sorted code point intervals (as in $(LREF InversionList)) are insufficient.
435         Hash-tables have an enormous memory footprint and binary search
436         over intervals is not fast enough for some heavy-duty algorithms.
437     )
438     $(P The recommended solution (see Unicode Implementation Guidelines)
439         is using multi-stage tables that are an implementation of the
440         $(HTTP en.wikipedia.org/wiki/Trie, Trie) data structure with integer
441         keys and a fixed number of stages. For the remainder of the section
442         this will be called a fixed trie. The following describes a particular
443         implementation that is aimed at speed of access at the expense
444         of ideal size savings.
445     )
446     $(P Taking a 2-level Trie as an example the principle of operation is as follows.
447         Split the number of bits in a key (code point, 21 bits) into 2 components
448         (e.g. 13 and 8).  The first is the number of bits in the index of the trie
449          and the other is the number of bits in each page of the trie.
450         The layout of the trie is then an array of size 2^^bits-of-index followed
451         by an array of memory chunks of size 2^^bits-of-page/bits-per-element.
452     )
453     $(P The number of pages is variable (but not less than 1)
454         unlike the number of entries in the index. The slots of the index
455         all have to contain a number of a page that is present. The lookup is then
456         just a couple of operations - slice the upper bits,
457         lookup an index for these, take a page at this index and use
458         the lower bits as an offset within this page.
459 
460         Assuming that pages are laid out consecutively
461         in one array at `pages`, the pseudo-code is:
462     )
463     ---
464     auto elemsPerPage = (2 ^^ bits_per_page) / Value.sizeOfInBits;
465     pages[index[n >> bits_per_page]][n & (elemsPerPage - 1)];
466     ---
467     $(P Where if `elemsPerPage` is a power of 2 the whole process is
468         a handful of simple instructions and 2 array reads. Subsequent levels
469         of the trie are introduced by recursing on this notion - the index array
470         is treated as values. The number of bits in index is then again
471         split into 2 parts, with pages over 'current-index' and the new 'upper-index'.
472     )
473 
474     $(P For completeness a level 1 trie is simply an array.
475         The current implementation takes advantage of bit-packing values
476         when the range is known to be limited in advance (such as `bool`).
477         See also $(LREF BitPacked) for enforcing it manually.
478         The major size advantage however comes from the fact
479         that multiple $(B identical pages on every level are merged) by construction.
480     )
481     $(P The process of constructing a trie is more involved and is hidden from
482         the user in the form of the convenience functions $(LREF codepointTrie),
483         $(LREF codepointSetTrie) and the even more convenient $(LREF toTrie).
484         In general a set or built-in AA with `dchar` type
485         can be turned into a trie. The trie object in this module
486         is read-only (immutable); it's effectively frozen after construction.
487     )
488     $(SECTION Unicode properties)
489     $(P This is a full list of Unicode properties accessible through $(LREF unicode)
490         with specific helpers per category nested within. Consult the
491         $(HTTP www.unicode.org/cldr/utility/properties.jsp, CLDR utility)
492         when in doubt about the contents of a particular set.
493     )
494     $(P General category sets listed below are only accessible with the
495         $(LREF unicode) shorthand accessor.)
496         $(BOOKTABLE $(B General category ),
497              $(TR $(TH Abb.) $(TH Long form)
498                 $(TH Abb.) $(TH Long form)$(TH Abb.) $(TH Long form))
499             $(TR $(TD L) $(TD Letter)
500                 $(TD Cn) $(TD Unassigned)  $(TD Po) $(TD Other_Punctuation))
501             $(TR $(TD Ll) $(TD Lowercase_Letter)
502                 $(TD Co) $(TD Private_Use) $(TD Ps) $(TD Open_Punctuation))
503             $(TR $(TD Lm) $(TD Modifier_Letter)
504                 $(TD Cs) $(TD Surrogate)   $(TD S) $(TD Symbol))
505             $(TR $(TD Lo) $(TD Other_Letter)
506                 $(TD N) $(TD Number)  $(TD Sc) $(TD Currency_Symbol))
507             $(TR $(TD Lt) $(TD Titlecase_Letter)
508               $(TD Nd) $(TD Decimal_Number)  $(TD Sk) $(TD Modifier_Symbol))
509             $(TR $(TD Lu) $(TD Uppercase_Letter)
510               $(TD Nl) $(TD Letter_Number)   $(TD Sm) $(TD Math_Symbol))
511             $(TR $(TD M) $(TD Mark)
512               $(TD No) $(TD Other_Number)    $(TD So) $(TD Other_Symbol))
513             $(TR $(TD Mc) $(TD Spacing_Mark)
514               $(TD P) $(TD Punctuation) $(TD Z) $(TD Separator))
515             $(TR $(TD Me) $(TD Enclosing_Mark)
516               $(TD Pc) $(TD Connector_Punctuation)   $(TD Zl) $(TD Line_Separator))
517             $(TR $(TD Mn) $(TD Nonspacing_Mark)
518               $(TD Pd) $(TD Dash_Punctuation)    $(TD Zp) $(TD Paragraph_Separator))
519             $(TR $(TD C) $(TD Other)
520               $(TD Pe) $(TD Close_Punctuation) $(TD Zs) $(TD Space_Separator))
521             $(TR $(TD Cc) $(TD Control) $(TD Pf)
522               $(TD Final_Punctuation)   $(TD -) $(TD Any))
523             $(TR $(TD Cf) $(TD Format)
524               $(TD Pi) $(TD Initial_Punctuation) $(TD -) $(TD ASCII))
525     )
526     $(P Sets for other commonly useful properties that are
527         accessible with $(LREF unicode):)
528         $(BOOKTABLE $(B Common binary properties),
529             $(TR $(TH Name) $(TH Name) $(TH Name))
530             $(TR $(TD Alphabetic)  $(TD Ideographic) $(TD Other_Uppercase))
531             $(TR $(TD ASCII_Hex_Digit) $(TD IDS_Binary_Operator) $(TD Pattern_Syntax))
532             $(TR $(TD Bidi_Control)    $(TD ID_Start)    $(TD Pattern_White_Space))
533             $(TR $(TD Cased)   $(TD IDS_Trinary_Operator)    $(TD Quotation_Mark))
534             $(TR $(TD Case_Ignorable)  $(TD Join_Control)    $(TD Radical))
535             $(TR $(TD Dash)    $(TD Logical_Order_Exception) $(TD Soft_Dotted))
536             $(TR $(TD Default_Ignorable_Code_Point)    $(TD Lowercase)   $(TD STerm))
537             $(TR $(TD Deprecated)  $(TD Math)    $(TD Terminal_Punctuation))
538             $(TR $(TD Diacritic)   $(TD Noncharacter_Code_Point) $(TD Unified_Ideograph))
539             $(TR $(TD Extender)    $(TD Other_Alphabetic)    $(TD Uppercase))
540             $(TR $(TD Grapheme_Base)   $(TD Other_Default_Ignorable_Code_Point)  $(TD Variation_Selector))
541             $(TR $(TD Grapheme_Extend) $(TD Other_Grapheme_Extend)   $(TD White_Space))
542             $(TR $(TD Grapheme_Link)   $(TD Other_ID_Continue)   $(TD XID_Continue))
543             $(TR $(TD Hex_Digit)   $(TD Other_ID_Start)  $(TD XID_Start))
544             $(TR $(TD Hyphen)  $(TD Other_Lowercase) )
545             $(TR $(TD ID_Continue) $(TD Other_Math)  )
546     )
547     $(P Below is the table with block names accepted by $(LREF unicode.block).
548         Note that the shorthand version $(LREF unicode) requires "In"
549         to be prepended to the names of blocks so as to disambiguate
550         scripts and blocks.
551     )
552     $(BOOKTABLE $(B Blocks),
553         $(TR $(TD Aegean Numbers)    $(TD Ethiopic Extended) $(TD Mongolian))
554         $(TR $(TD Alchemical Symbols)    $(TD Ethiopic Extended-A)   $(TD Musical Symbols))
555         $(TR $(TD Alphabetic Presentation Forms) $(TD Ethiopic Supplement)   $(TD Myanmar))
556         $(TR $(TD Ancient Greek Musical Notation)    $(TD General Punctuation)   $(TD Myanmar Extended-A))
557         $(TR $(TD Ancient Greek Numbers) $(TD Geometric Shapes)  $(TD New Tai Lue))
558         $(TR $(TD Ancient Symbols)   $(TD Georgian)  $(TD NKo))
559         $(TR $(TD Arabic)    $(TD Georgian Supplement)   $(TD Number Forms))
560         $(TR $(TD Arabic Extended-A) $(TD Glagolitic)    $(TD Ogham))
561         $(TR $(TD Arabic Mathematical Alphabetic Symbols)    $(TD Gothic)    $(TD Ol Chiki))
562         $(TR $(TD Arabic Presentation Forms-A)   $(TD Greek and Coptic)  $(TD Old Italic))
563         $(TR $(TD Arabic Presentation Forms-B)   $(TD Greek Extended)    $(TD Old Persian))
564         $(TR $(TD Arabic Supplement) $(TD Gujarati)  $(TD Old South Arabian))
565         $(TR $(TD Armenian)  $(TD Gurmukhi)  $(TD Old Turkic))
566         $(TR $(TD Arrows)    $(TD Halfwidth and Fullwidth Forms) $(TD Optical Character Recognition))
567         $(TR $(TD Avestan)   $(TD Hangul Compatibility Jamo) $(TD Oriya))
568         $(TR $(TD Balinese)  $(TD Hangul Jamo)   $(TD Osmanya))
569         $(TR $(TD Bamum) $(TD Hangul Jamo Extended-A)    $(TD Phags-pa))
570         $(TR $(TD Bamum Supplement)  $(TD Hangul Jamo Extended-B)    $(TD Phaistos Disc))
571         $(TR $(TD Basic Latin)   $(TD Hangul Syllables)  $(TD Phoenician))
572         $(TR $(TD Batak) $(TD Hanunoo)   $(TD Phonetic Extensions))
573         $(TR $(TD Bengali)   $(TD Hebrew)    $(TD Phonetic Extensions Supplement))
574         $(TR $(TD Block Elements)    $(TD High Private Use Surrogates)   $(TD Playing Cards))
575         $(TR $(TD Bopomofo)  $(TD High Surrogates)   $(TD Private Use Area))
576         $(TR $(TD Bopomofo Extended) $(TD Hiragana)  $(TD Rejang))
577         $(TR $(TD Box Drawing)   $(TD Ideographic Description Characters)    $(TD Rumi Numeral Symbols))
578         $(TR $(TD Brahmi)    $(TD Imperial Aramaic)  $(TD Runic))
579         $(TR $(TD Braille Patterns)  $(TD Inscriptional Pahlavi) $(TD Samaritan))
580         $(TR $(TD Buginese)  $(TD Inscriptional Parthian)    $(TD Saurashtra))
581         $(TR $(TD Buhid) $(TD IPA Extensions)    $(TD Sharada))
582         $(TR $(TD Byzantine Musical Symbols) $(TD Javanese)  $(TD Shavian))
583         $(TR $(TD Carian)    $(TD Kaithi)    $(TD Sinhala))
584         $(TR $(TD Chakma)    $(TD Kana Supplement)   $(TD Small Form Variants))
585         $(TR $(TD Cham)  $(TD Kanbun)    $(TD Sora Sompeng))
586         $(TR $(TD Cherokee)  $(TD Kangxi Radicals)   $(TD Spacing Modifier Letters))
587         $(TR $(TD CJK Compatibility) $(TD Kannada)   $(TD Specials))
588         $(TR $(TD CJK Compatibility Forms)   $(TD Katakana)  $(TD Sundanese))
589         $(TR $(TD CJK Compatibility Ideographs)  $(TD Katakana Phonetic Extensions)  $(TD Sundanese Supplement))
590         $(TR $(TD CJK Compatibility Ideographs Supplement)   $(TD Kayah Li)  $(TD Superscripts and Subscripts))
591         $(TR $(TD CJK Radicals Supplement)   $(TD Kharoshthi)    $(TD Supplemental Arrows-A))
592         $(TR $(TD CJK Strokes)   $(TD Khmer) $(TD Supplemental Arrows-B))
593         $(TR $(TD CJK Symbols and Punctuation)   $(TD Khmer Symbols) $(TD Supplemental Mathematical Operators))
594         $(TR $(TD CJK Unified Ideographs)    $(TD Lao)   $(TD Supplemental Punctuation))
595         $(TR $(TD CJK Unified Ideographs Extension A)    $(TD Latin-1 Supplement)    $(TD Supplementary Private Use Area-A))
596         $(TR $(TD CJK Unified Ideographs Extension B)    $(TD Latin Extended-A)  $(TD Supplementary Private Use Area-B))
597         $(TR $(TD CJK Unified Ideographs Extension C)    $(TD Latin Extended Additional) $(TD Syloti Nagri))
598         $(TR $(TD CJK Unified Ideographs Extension D)    $(TD Latin Extended-B)  $(TD Syriac))
599         $(TR $(TD Combining Diacritical Marks)   $(TD Latin Extended-C)  $(TD Tagalog))
600         $(TR $(TD Combining Diacritical Marks for Symbols)   $(TD Latin Extended-D)  $(TD Tagbanwa))
601         $(TR $(TD Combining Diacritical Marks Supplement)    $(TD Lepcha)    $(TD Tags))
602         $(TR $(TD Combining Half Marks)  $(TD Letterlike Symbols)    $(TD Tai Le))
603         $(TR $(TD Common Indic Number Forms) $(TD Limbu) $(TD Tai Tham))
604         $(TR $(TD Control Pictures)  $(TD Linear B Ideograms)    $(TD Tai Viet))
605         $(TR $(TD Coptic)    $(TD Linear B Syllabary)    $(TD Tai Xuan Jing Symbols))
606         $(TR $(TD Counting Rod Numerals) $(TD Lisu)  $(TD Takri))
607         $(TR $(TD Cuneiform) $(TD Low Surrogates)    $(TD Tamil))
608         $(TR $(TD Cuneiform Numbers and Punctuation) $(TD Lycian)    $(TD Telugu))
609         $(TR $(TD Currency Symbols)  $(TD Lydian)    $(TD Thaana))
610         $(TR $(TD Cypriot Syllabary) $(TD Mahjong Tiles) $(TD Thai))
611         $(TR $(TD Cyrillic)  $(TD Malayalam) $(TD Tibetan))
612         $(TR $(TD Cyrillic Extended-A)   $(TD Mandaic)   $(TD Tifinagh))
613         $(TR $(TD Cyrillic Extended-B)   $(TD Mathematical Alphanumeric Symbols) $(TD Transport And Map Symbols))
614         $(TR $(TD Cyrillic Supplement)   $(TD Mathematical Operators)    $(TD Ugaritic))
615         $(TR $(TD Deseret)   $(TD Meetei Mayek)  $(TD Unified Canadian Aboriginal Syllabics))
616         $(TR $(TD Devanagari)    $(TD Meetei Mayek Extensions)   $(TD Unified Canadian Aboriginal Syllabics Extended))
617         $(TR $(TD Devanagari Extended)   $(TD Meroitic Cursive)  $(TD Vai))
618         $(TR $(TD Dingbats)  $(TD Meroitic Hieroglyphs)  $(TD Variation Selectors))
619         $(TR $(TD Domino Tiles)  $(TD Miao)  $(TD Variation Selectors Supplement))
620         $(TR $(TD Egyptian Hieroglyphs)  $(TD Miscellaneous Mathematical Symbols-A)  $(TD Vedic Extensions))
621         $(TR $(TD Emoticons) $(TD Miscellaneous Mathematical Symbols-B)  $(TD Vertical Forms))
622         $(TR $(TD Enclosed Alphanumerics)    $(TD Miscellaneous Symbols) $(TD Yijing Hexagram Symbols))
623         $(TR $(TD Enclosed Alphanumeric Supplement)  $(TD Miscellaneous Symbols and Arrows)  $(TD Yi Radicals))
624         $(TR $(TD Enclosed CJK Letters and Months)   $(TD Miscellaneous Symbols And Pictographs) $(TD Yi Syllables))
625         $(TR $(TD Enclosed Ideographic Supplement)   $(TD Miscellaneous Technical)   )
626         $(TR $(TD Ethiopic)  $(TD Modifier Tone Letters) )
627     )
628     $(P Below is the table with script names accepted by $(LREF unicode.script)
629         and by the shorthand version $(LREF unicode):)
630         $(BOOKTABLE $(B Scripts),
631             $(TR $(TD Arabic)  $(TD Hanunoo) $(TD Old_Italic))
632             $(TR $(TD Armenian)    $(TD Hebrew)  $(TD Old_Persian))
633             $(TR $(TD Avestan) $(TD Hiragana)    $(TD Old_South_Arabian))
634             $(TR $(TD Balinese)    $(TD Imperial_Aramaic)    $(TD Old_Turkic))
635             $(TR $(TD Bamum)   $(TD Inherited)   $(TD Oriya))
636             $(TR $(TD Batak)   $(TD Inscriptional_Pahlavi)   $(TD Osmanya))
637             $(TR $(TD Bengali) $(TD Inscriptional_Parthian)  $(TD Phags_Pa))
638             $(TR $(TD Bopomofo)    $(TD Javanese)    $(TD Phoenician))
639             $(TR $(TD Brahmi)  $(TD Kaithi)  $(TD Rejang))
640             $(TR $(TD Braille) $(TD Kannada) $(TD Runic))
641             $(TR $(TD Buginese)    $(TD Katakana)    $(TD Samaritan))
642             $(TR $(TD Buhid)   $(TD Kayah_Li)    $(TD Saurashtra))
643             $(TR $(TD Canadian_Aboriginal) $(TD Kharoshthi)  $(TD Sharada))
644             $(TR $(TD Carian)  $(TD Khmer)   $(TD Shavian))
645             $(TR $(TD Chakma)  $(TD Lao) $(TD Sinhala))
646             $(TR $(TD Cham)    $(TD Latin)   $(TD Sora_Sompeng))
647             $(TR $(TD Cherokee)    $(TD Lepcha)  $(TD Sundanese))
648             $(TR $(TD Common)  $(TD Limbu)   $(TD Syloti_Nagri))
649             $(TR $(TD Coptic)  $(TD Linear_B)    $(TD Syriac))
650             $(TR $(TD Cuneiform)   $(TD Lisu)    $(TD Tagalog))
651             $(TR $(TD Cypriot) $(TD Lycian)  $(TD Tagbanwa))
652             $(TR $(TD Cyrillic)    $(TD Lydian)  $(TD Tai_Le))
653             $(TR $(TD Deseret) $(TD Malayalam)   $(TD Tai_Tham))
654             $(TR $(TD Devanagari)  $(TD Mandaic) $(TD Tai_Viet))
655             $(TR $(TD Egyptian_Hieroglyphs)    $(TD Meetei_Mayek)    $(TD Takri))
656             $(TR $(TD Ethiopic)    $(TD Meroitic_Cursive)    $(TD Tamil))
657             $(TR $(TD Georgian)    $(TD Meroitic_Hieroglyphs)    $(TD Telugu))
658             $(TR $(TD Glagolitic)  $(TD Miao)    $(TD Thaana))
659             $(TR $(TD Gothic)  $(TD Mongolian)   $(TD Thai))
660             $(TR $(TD Greek)   $(TD Myanmar) $(TD Tibetan))
661             $(TR $(TD Gujarati)    $(TD New_Tai_Lue) $(TD Tifinagh))
662             $(TR $(TD Gurmukhi)    $(TD Nko) $(TD Ugaritic))
663             $(TR $(TD Han) $(TD Ogham)   $(TD Vai))
664             $(TR $(TD Hangul)  $(TD Ol_Chiki)    $(TD Yi))
665     )
666     $(P Below is the table of names accepted by $(LREF unicode.hangulSyllableType).)
667         $(BOOKTABLE $(B Hangul syllable type),
668             $(TR $(TH Abb.) $(TH Long form))
669             $(TR $(TD L)   $(TD Leading_Jamo))
670             $(TR $(TD LV)  $(TD LV_Syllable))
671             $(TR $(TD LVT) $(TD LVT_Syllable) )
672             $(TR $(TD T)   $(TD Trailing_Jamo))
673             $(TR $(TD V)   $(TD Vowel_Jamo))
674     )
675     References:
676         $(HTTP www.digitalmars.com/d/ascii-table.html, ASCII Table),
677         $(HTTP en.wikipedia.org/wiki/Unicode, Wikipedia),
678         $(HTTP www.unicode.org, The Unicode Consortium),
679         $(HTTP www.unicode.org/reports/tr15/, Unicode normalization forms),
        $(HTTP www.unicode.org/reports/tr29/, Unicode text segmentation),
        $(HTTP www.unicode.org/uni2book/ch05.pdf,
            Unicode Implementation Guidelines),
        $(HTTP www.unicode.org/uni2book/ch03.pdf,
            Unicode Conformance)
685     Trademarks:
686         Unicode(tm) is a trademark of Unicode, Inc.
687 
688     Copyright: Copyright 2013 -
689     License:   $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
690     Authors:   Dmitry Olshansky
691     Source:    $(PHOBOSSRC std/uni/package.d)
692     Standards: $(HTTP www.unicode.org/versions/Unicode6.2.0/, Unicode v6.2)
693 
694 Macros:
695 
696 SECTION = <h3><a id="$1">$0</a></h3>
697 DEF = <div><a id="$1"><i>$0</i></a></div>
698 S_LINK = <a href="#$1">$+</a>
699 CODEPOINT = $(S_LINK Code point, code point)
700 CODEPOINTS = $(S_LINK Code point, code points)
701 CHARACTER = $(S_LINK Character, character)
702 CHARACTERS = $(S_LINK Character, characters)
703 CLUSTER = $(S_LINK Grapheme cluster, grapheme cluster)
704 +/
705 module std.uni;
706 
707 import std.meta : AliasSeq;
708 import std.range.primitives : back, ElementEncodingType, ElementType, empty,
709     front, hasLength, hasSlicing, isForwardRange, isInputRange,
710     isRandomAccessRange, popFront, put, save;
711 import std.traits : isConvertibleToString, isIntegral, isSomeChar,
712     isSomeString, Unqual, isDynamicArray;
713 // debug = std_uni;
714 
715 import std.internal.unicode_tables; // generated file
716 
717 debug(std_uni) import std.stdio; // writefln, writeln
718 
719 private:
720 
721 
// Element-wise copy of `src` into `dest`, walking from the last index down to
// the first. Both slices must have the same length.
// Because the highest index is written first, this order is the one to use
// when `dest` overlaps `src` at a higher address.
void copyBackwards(T,U)(T[] src, U[] dest)
{
    assert(src.length == dest.length);
    foreach_reverse (idx; 0 .. src.length)
        dest[idx] = src[idx];
}
728 
// Element-wise copy of `src` into `dest`, walking from index 0 upwards.
// Both slices must have the same length.
// Because the lowest index is written first, this order is the one to use
// when `dest` overlaps `src` at a lower address.
void copyForward(T,U)(T[] src, U[] dest)
{
    assert(src.length == dest.length);
    foreach (idx; 0 .. src.length)
        dest[idx] = src[idx];
}
735 
// Compile-time flag: true on the targets listed below, false everywhere else.
// PackedPtrImpl consults it before accessing packed data through a narrower
// integer pointer.
// TODO: update to reflect all major CPUs supporting unaligned reads
version (X86)
    enum hasUnalignedReads = true;
else version (X86_64)
    enum hasUnalignedReads = true;
else version (SystemZ)
    enum hasUnalignedReads = true;
else
    enum hasUnalignedReads = false; // better be safe than sorry

public enum dchar lineSep = '\u2028'; /// Constant $(CODEPOINT) (0x2028) - line separator.
public enum dchar paraSep = '\u2029'; /// Constant $(CODEPOINT) (0x2029) - paragraph separator.
public enum dchar nelSep  = '\u0085'; /// Constant $(CODEPOINT) (0x0085) - next line.
749 
// test the intro example
// (keep the statements in sync with the example shown in the module documentation)
@safe unittest
{
    import std.algorithm.searching : find;
    // initialize code point sets using script/block or property name
    // set contains code points from both scripts.
    auto set = unicode("Cyrillic") | unicode("Armenian");
    // or simpler and statically-checked look
    auto ascii = unicode.ASCII;
    auto currency = unicode.Currency_Symbol;

    // easy set ops
    auto a = set & ascii;
    assert(a.empty); // as it has no intersection with ascii
    a = set | ascii;
    auto b = currency - a; // subtract all ASCII, Cyrillic and Armenian

    // some properties of code point sets
    assert(b.length > 45); // 46 items in Unicode 6.1, even more in 6.2
    // testing presence of a code point in a set
    // is just fine, it is O(logN)
    assert(!b['$']);
    assert(!b['\u058F']); // Armenian dram sign
    assert(b['¥']);

    // building fast lookup tables, these guarantee O(1) complexity
    // 1-level Trie lookup table essentially a huge bit-set ~262Kb
    auto oneTrie = toTrie!1(b);
    // 2-level far more compact but typically slightly slower
    auto twoTrie = toTrie!2(b);
    // 3-level even smaller, and a bit slower yet
    auto threeTrie = toTrie!3(b);
    assert(oneTrie['£']);
    assert(twoTrie['£']);
    assert(threeTrie['£']);

    // build the trie with the most sensible trie level
    // and bind it as a functor
    auto cyrillicOrArmenian = toDelegate(set);
    auto balance = find!(cyrillicOrArmenian)("Hello ընկեր!");
    assert(balance == "ընկեր!");
    // compatible with bool delegate(dchar)
    bool delegate(dchar) bindIt = cyrillicOrArmenian;

    // Normalization
    string s = "Plain ascii (and not only), is always normalized!";
    assert(s is normalize(s));// is the same string

    string nonS = "A\u0308ffin"; // 'A' followed by a combining diaeresis (U+0308), i.e. not yet composed
    auto nS = normalize(nonS); // to NFC, the W3C endorsed standard
    assert(nS == "Äffin");
    assert(nS != nonS);
    string composed = "Äffin";

    assert(normalize!NFD(composed) == "A\u0308ffin");
    // to NFKD, compatibility decomposition useful for fuzzy matching/searching
    assert(normalize!NFKD("2¹⁰") == "210");
}
808 
// Numeric value 0x10FFFF, the upper end of the Unicode code point range
// (the same value as `dchar.max`).
enum lastDchar = 0x10FFFF;
810 
// Checked narrowing conversion: asserts that `from` lies within the range of
// the integral type T, then casts it to T.
auto force(T, F)(F from)
if (isIntegral!T && !is(T == F))
{
    assert(from >= T.min && from <= T.max);
    return cast(T) from;
}
817 
// Checked conversion into a BitPacked type: asserts that `from` does not
// exceed 2^^bitSizeOf!T-1, then wraps it in T.
auto force(T, F)(F from)
if (isBitPacked!T && !is(T == F))
{
    alias Raw = TypeOfBitPacked!T;
    enum largest = 2^^bitSizeOf!T-1;
    assert(from <= largest);
    return T(cast(Raw) from);
}
824 
// Identity overload: the requested type equals the source type, so the value
// is handed back without any check or conversion.
auto force(T, F)(F from)
if (is(T == F))
{
    return from;
}
830 
// Repeat `times` times the bit pattern in `val`, assuming its length is `bits`.
// The copies are packed back to back starting at the least significant bit.
// `val` is not masked here, so it must already fit in `bits` bits.
size_t replicateBits(size_t times, size_t bits)(size_t val) @safe pure nothrow @nogc
{
    static if (times == 1)
        return val;
    else static if (bits == 1)
    {
        // a 1-bit pattern replicated N times is either zero or N consecutive ones
        static if (times == size_t.sizeof*8)
            return val ? size_t.max : 0;
        else
            // shift a size_t: `1 << times` would be an int shift, which cannot
            // build masks of 32 bits or more on 64-bit targets
            return val ? (cast(size_t) 1 << times) - 1 : 0;
    }
    else static if (times % 2)
        // odd count: peel off one copy, then handle the even remainder
        return (replicateBits!(times-1, bits)(val)<<bits) | val;
    else
        // even count: double the pattern width and halve the count
        return replicateBits!(times/2, bits*2)((val << bits) | val);
}
848 
@safe pure nothrow @nogc unittest // for replicate
{
    import std.algorithm.iteration : sum, map;
    import std.range : iota;
    size_t m = 0b111; // 3-bit pattern, all ones
    size_t m2 = 0b01; // 2-bit pattern, only the low bit set
    static foreach (i; AliasSeq!(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
    {
        // i copies of 0b111 are 3*i consecutive ones, i.e. 2^^(3*i) - 1
        assert(replicateBits!(i, 3)(m)+1 == (1<<(3*i)));
        // i copies of 0b01 set bits 0, 2, 4, ... 2*(i-1)
        assert(replicateBits!(i, 2)(m2) == iota(0, i).map!"2^^(2*a)"().sum());
    }
}
861 
// Multiple arrays squashed into one memory block.
//
// Array number `i` holds `sz[i]` items of type `Types[i]`, bit-packed into whole
// `size_t` words (see `spaceFor`), and begins `offsets[i]` words into `storage`.
// The arrays follow each other in the order of `Types`.
struct MultiArray(Types...)
{
    import std.range.primitives : isOutputRange;

    // Allocates zeroed storage for `sizes[i]` items in array `i`.
    // Exactly one size per entry of `Types` must be given.
    this(size_t[] sizes...) @safe pure nothrow
    {
        assert(dim == sizes.length);
        size_t full_size;
        foreach (i, v; Types)
        {
            full_size += spaceFor!(bitSizeOf!v)(sizes[i]);
            sz[i] = sizes[i];
            static if (i >= 1)
                offsets[i] = offsets[i-1] +
                    spaceFor!(bitSizeOf!(Types[i-1]))(sizes[i-1]);
        }

        storage = new size_t[full_size];
    }

    // Wraps already laid out data (offsets, sizes and the raw words) without
    // copying the words; `raw_offsets` and `raw_sizes` must have `dim` entries.
    this(const(size_t)[] raw_offsets,
        const(size_t)[] raw_sizes,
        return scope const(size_t)[] data) return scope const @safe pure nothrow @nogc
    {
        offsets[] = raw_offsets[];
        sz[] = raw_sizes[];
        storage = data;
    }

    // Bounds-aware packed view over array `n`.
    @property auto slice(size_t n)()inout pure nothrow @nogc
    {
        auto ptr = raw_ptr!n;
        return packedArrayView!(Types[n])(ptr, sz[n]);
    }

    // Unchecked packed pointer to the start of array `n`.
    @property auto ptr(size_t n)()inout pure nothrow @nogc
    {
        auto ptr = raw_ptr!n;
        return inout(PackedPtr!(Types[n]))(ptr);
    }

    template length(size_t n)
    {
        // Number of items in array `n`.
        @property size_t length()const @safe pure nothrow @nogc{ return sz[n]; }

        // Resizes array `n` to `new_size` items, keeping the contents of all
        // other arrays intact. Views and pointers obtained earlier may be
        // invalidated because the storage can be reallocated.
        @property void length(size_t new_size)
        {
            if (new_size > sz[n])
            {// extend
                size_t delta = (new_size - sz[n]);
                sz[n] += delta;
                delta = spaceFor!(bitSizeOf!(Types[n]))(delta);
                storage.length +=  delta;// extend space at end
                // raw_slice!x must follow resize as it could be moved!
                // next stmts move all data past this array, last-one-goes-first
                static if (n != dim-1)
                {
                    auto start = raw_ptr!(n+1);
                    // len includes delta
                    size_t len = (storage.ptr+storage.length-start);

                    copyBackwards(start[0 .. len-delta], start[delta .. len]);

                    start[0 .. delta] = 0;
                    // offsets are used for raw_slice, ptr etc.
                    foreach (i; n+1 .. dim)
                        offsets[i] += delta;
                }
            }
            else if (new_size < sz[n])
            {// shrink
                sz[n] = new_size;
                // Words currently owned by this array. Extending may have left
                // slack, so measure from the layout, not from the item count.
                // raw_ptr treats array 0 as starting at word 0; do the same.
                static if (n == 0)
                    enum size_t first = 0;
                else
                    immutable size_t first = offsets[n];
                static if (n != dim-1)
                    immutable size_t last = offsets[n+1];
                else
                    immutable size_t last = storage.length;
                immutable size_t owned = last - first;
                immutable size_t needed = spaceFor!(bitSizeOf!(Types[n]))(new_size);
                // release only whole words that no remaining item lives in
                immutable size_t delta = owned > needed ? owned - needed : 0;
                // move all data past this array down by delta words;
                // destination is below the source, so copy in forward direction
                static if (n != dim-1)
                {
                    copyForward(storage[last .. $], storage[last - delta .. $ - delta]);

                    // adjust offsets last, they affect raw_slice
                    foreach (i; n+1 .. dim)
                        offsets[i] -= delta;
                }
                storage.length -= delta;
            }
            // else - NOP
        }
    }

    // Size in bytes of the whole storage (default) or of the words between
    // the start of array `n` and the start of the next one / end of storage.
    @property size_t bytes(size_t n=size_t.max)() const @safe
    {
        static if (n == size_t.max)
            return storage.length*size_t.sizeof;
        else static if (n != Types.length-1)
            return (raw_ptr!(n+1)-raw_ptr!n)*size_t.sizeof;
        else
            return (storage.ptr+storage.length - raw_ptr!n)*size_t.sizeof;
    }

    // Writes offsets, sizes and raw words to `sink` as three bracketed,
    // comma-separated lists of hexadecimal literals.
    void store(OutRange)(scope OutRange sink) const
        if (isOutputRange!(OutRange, char))
    {
        import std.format.write : formattedWrite;
        formattedWrite(sink, "[%( 0x%x, %)]", offsets[]);
        formattedWrite(sink, ", [%( 0x%x, %)]", sz[]);
        formattedWrite(sink, ", [%( 0x%x, %)]", storage);
    }

private:
    import std.meta : staticMap;
    // Pointer to the first word of array `n`; array 0 always starts at the
    // beginning of storage.
    @property auto raw_ptr(size_t n)()inout pure nothrow @nogc
    {
        static if (n == 0)
            return storage.ptr;
        else
        {
            return storage.ptr+offsets[n];
        }
    }
    enum dim = Types.length;
    size_t[dim] offsets;// offset for level x
    size_t[dim] sz;// size of level x
    alias bitWidth = staticMap!(bitSizeOf, Types);
    size_t[] storage;
}
989 
// Resizes the levels of a MultiArray in various orders and checks that
// the contents of every level survive resizing of the others.
// The whole scenario is a lambda so that it can run both in CTFE and at run time.
@system unittest
{
    import std.conv : text;
    enum dg = (){
        // sizes are:
        // lvl0: 3, lvl1 : 2, lvl2: 1
        auto m = MultiArray!(int, ubyte, int)(3,2,1);

        // expects level k to hold 1, 2, ... n
        static void check(size_t k, T)(ref T m, int n)
        {
            foreach (i; 0 .. n)
                assert(m.slice!(k)[i] == i+1, text("level:",i," : ",m.slice!(k)[0 .. n]));
        }

        // expects level k to hold n, n-1, ... 1
        static void checkB(size_t k, T)(ref T m, int n)
        {
            foreach (i; 0 .. n)
                assert(m.slice!(k)[i] == n-i, text("level:",i," : ",m.slice!(k)[0 .. n]));
        }

        // writes 1, 2, ... n into level k
        static void fill(size_t k, T)(ref T m, int n)
        {
            foreach (i; 0 .. n)
                m.slice!(k)[i] = force!ubyte(i+1);
        }

        // writes n, n-1, ... 1 into level k
        static void fillB(size_t k, T)(ref T m, int n)
        {
            foreach (i; 0 .. n)
                m.slice!(k)[i] = force!ubyte(n-i);
        }

        // grow the middle level
        m.length!1 = 100;
        fill!1(m, 100);
        check!1(m, 100);

        // grow the first level; the middle one must be moved intact
        m.length!0 = 220;
        fill!0(m, 220);
        check!1(m, 100);
        check!0(m, 220);

        // grow the last level
        m.length!2 = 17;
        fillB!2(m, 17);
        checkB!2(m, 17);
        check!0(m, 220);
        check!1(m, 100);

        // grow the last level again; old items must still be there
        m.length!2 = 33;
        checkB!2(m, 17);
        fillB!2(m, 33);
        checkB!2(m, 33);
        check!0(m, 220);
        check!1(m, 100);

        // grow the middle level while both neighbours hold data
        m.length!1 = 195;
        fillB!1(m, 195);
        checkB!1(m, 195);
        checkB!2(m, 33);
        check!0(m, 220);

        // bit-packed levels: shrink the first, grow the second
        auto marr = MultiArray!(BitPacked!(uint, 4), BitPacked!(uint, 6))(20, 10);
        marr.length!0 = 15;
        marr.length!1 = 30;
        fill!1(marr, 30);
        fill!0(marr, 15);
        check!1(marr, 30);
        check!0(marr, 15);
        return 0;
    };
    enum ct = dg(); // evaluated at compile time
    auto rt = dg(); // evaluated at run time
}
1062 
@system unittest
{// more bitpacking tests
    import std.conv : text;

    // five levels with item widths of 3, 4, 3, 6 bits and bool
    alias Bitty =
      MultiArray!(BitPacked!(size_t, 3)
                , BitPacked!(size_t, 4)
                , BitPacked!(size_t, 3)
                , BitPacked!(size_t, 6)
                , bool);
    // bit-field extractors used to derive an index for levels 0..3 from one number
    alias fn1 = sliceBits!(13, 16);
    alias fn2 = sliceBits!( 9, 13);
    alias fn3 = sliceBits!( 6,  9);
    alias fn4 = sliceBits!( 0,  6);
    // expects every item of level lvl to equal its own index
    static void check(size_t lvl, MA)(ref MA arr){
        for (size_t i = 0; i< arr.length!lvl; i++)
            assert(arr.slice!(lvl)[i] == i, text("Mismatch on lvl ", lvl, " idx ", i, " value: ", arr.slice!(lvl)[i]));
    }

    // stores each index into its own slot of level lvl
    static void fillIdx(size_t lvl, MA)(ref MA arr){
        for (size_t i = 0; i< arr.length!lvl; i++)
            arr.slice!(lvl)[i] = i;
    }
    Bitty m1;

    // size the levels last-to-first, each one as large as its bit width can index
    m1.length!4 = 10;
    m1.length!3 = 2^^6;
    m1.length!2 = 2^^3;
    m1.length!1 = 2^^4;
    m1.length!0 = 2^^3;

    m1.length!4 = 2^^16;

    for (size_t i = 0; i< m1.length!4; i++)
        m1.slice!(4)[i] = i % 2;

    // fill the levels one by one and verify that earlier ones are untouched
    fillIdx!1(m1);
    check!1(m1);
    fillIdx!2(m1);
    check!2(m1);
    fillIdx!3(m1);
    check!3(m1);
    fillIdx!0(m1);
    check!0(m1);
    check!3(m1);
    check!2(m1);
    check!1(m1);
    // interleaved writes to all levels ...
    for (size_t i=0; i < 2^^16; i++)
    {
        m1.slice!(4)[i] = i % 2;
        m1.slice!(0)[fn1(i)] = fn1(i);
        m1.slice!(1)[fn2(i)] = fn2(i);
        m1.slice!(2)[fn3(i)] = fn3(i);
        m1.slice!(3)[fn4(i)] = fn4(i);
    }
    // ... must read back unchanged
    for (size_t i=0; i < 2^^16; i++)
    {
        assert(m1.slice!(4)[i] == i % 2);
        assert(m1.slice!(0)[fn1(i)] == fn1(i));
        assert(m1.slice!(1)[fn2(i)] == fn2(i));
        assert(m1.slice!(2)[fn3(i)] == fn3(i));
        assert(m1.slice!(3)[fn4(i)] == fn4(i));
    }
}
1127 
// Number of size_t words needed to store `new_len` items of `_bits` bits each.
// The item width is first rounded up to a power of two, the same way
// PackedArrayView does it.
size_t spaceFor(size_t _bits)(size_t new_len) @safe pure nothrow @nogc
{
    import std.math.algebraic : nextPow2;
    enum size_t wordBits = 8*size_t.sizeof;
    static if (_bits == 1)
        enum size_t bits = 1;
    else
        enum size_t bits = nextPow2(_bits - 1);// see PackedArrayView
    static if (bits > wordBits)
    {
        // every item spans a whole number of words
        static assert(bits % wordBits == 0);
        return new_len * bits/wordBits;
    }
    else
    {
        // several items share one word
        enum perWord = wordBits/bits;
        return (new_len+perWord-1)/perWord; // rounded up
    }
}
1143 
// True for types that can be stored in packed form: BitPacked wrappers,
// integral types, bool and character types.
enum isBitPackableType(T) =
    isBitPacked!T || isIntegral!T || is(T == bool) || isSomeChar!T;
1149 
1150 //============================================================================
// Selects the PackedArrayViewImpl instantiation for T: the item width is
// bitSizeOf!T rounded up to a power of two (a width of 1 or less becomes 1).
template PackedArrayView(T)
if ((is(T dummy == BitPacked!(U, sz), U, size_t sz)
    && isBitPackableType!U) || isBitPackableType!T)
{
    import std.math.algebraic : nextPow2;
    private enum width = bitSizeOf!T;
    static if (width > 1)
        alias PackedArrayView = PackedArrayViewImpl!(T, nextPow2(width - 1));
    else
        alias PackedArrayView = PackedArrayViewImpl!(T, 1);
}
1159 
// Unsafe and fast access to a chunk of RAM as if it contains packed values.
// Selects the PackedPtrImpl instantiation for T: the item width is
// bitSizeOf!T rounded up to a power of two (a width of 1 or less becomes 1).
template PackedPtr(T)
if ((is(T dummy == BitPacked!(U, sz), U, size_t sz)
    && isBitPackableType!U) || isBitPackableType!T)
{
    import std.math.algebraic : nextPow2;
    private enum width = bitSizeOf!T;
    static if (width > 1)
        alias PackedPtr = PackedPtrImpl!(T, nextPow2(width - 1));
    else
        alias PackedPtr = PackedPtrImpl!(T, 1);
}
1169 
// Unchecked indexed access to items of `bits` bits each, packed `factor` to a
// size_t word starting at `origin`. No bounds are tracked here.
struct PackedPtrImpl(T, size_t bits)
{
pure nothrow:
    static assert(isPow2OrZero(bits));

    // Wraps `ptr`; the memory is not owned or copied.
    this(inout(size_t)* ptr)inout @safe @nogc
    {
        origin = ptr;
    }

    // Generic read: pick word n / factor, then shift and mask out
    // slot n % factor.
    private T simpleIndex(size_t n) inout
    {
        immutable q = n / factor;
        immutable r = n % factor;
        return cast(T)((origin[q] >> bits*r) & mask);
    }

    // Generic write: clear slot n % factor of word n / factor and OR in `val`.
    // `val` is not masked, hence the contract for integral T.
    private void simpleWrite(TypeOfBitPacked!T val, size_t n)
    in
    {
        static if (isIntegral!T)
            assert(val <= mask);
    }
    do
    {
        immutable q = n / factor;
        immutable r = n % factor;
        immutable tgt_shift = bits*r;
        immutable word = origin[q];
        origin[q] = (word & ~(mask << tgt_shift))
            | (cast(size_t) val << tgt_shift);
    }

    // When an item is exactly as wide as a built-in integer, the words can be
    // reinterpreted as an array of that integer and indexed directly.
    static if (factor == bytesPerWord// can safely pack by byte
         || factor == 1 // a whole word at a time
         || ((factor == bytesPerWord/2 || factor == bytesPerWord/4)
                && hasUnalignedReads)) // this needs unaligned reads
    {
        // U is the integer type whose width equals the item width
        static if (factor == bytesPerWord)
            alias U = ubyte;
        else static if (factor == bytesPerWord/2)
            alias U = ushort;
        else static if (factor == bytesPerWord/4)
            alias U = uint;
        else static if (size_t.sizeof == 8 && factor == bytesPerWord/8)
            alias U = ulong;

        // Reads item `idx`. The direct U-typed access is taken only on
        // little-endian targets at run time; CTFE and other targets fall back
        // to the shift/mask path.
        T opIndex(size_t idx) inout
        {
            T ret;
            version (LittleEndian)
                ret = __ctfe ? simpleIndex(idx) :
                    cast(inout(T))(cast(U*) origin)[idx];
            else
                ret = simpleIndex(idx);
            return ret;
        }

        static if (isBitPacked!T) // lack of user-defined implicit conversion
        {
            // Unwraps the BitPacked value and forwards to the overload below.
            void opIndexAssign(T val, size_t idx)
            {
                return opIndexAssign(cast(TypeOfBitPacked!T) val, idx);
            }
        }

        // Writes item `idx`, using the same path selection as opIndex.
        void opIndexAssign(TypeOfBitPacked!T val, size_t idx)
        {
            version (LittleEndian)
            {
                if (__ctfe)
                    simpleWrite(val, idx);
                else
                    (cast(U*) origin)[idx] = cast(U) val;
            }
            else
                simpleWrite(val, idx);
        }
    }
    else
    {
        // Item width matches no usable built-in integer: always shift and mask.
        T opIndex(size_t n) inout
        {
            return simpleIndex(n);
        }

        static if (isBitPacked!T) // lack of user-defined implicit conversion
        {
            // Unwraps the BitPacked value and forwards to the overload below.
            void opIndexAssign(T val, size_t idx)
            {
                return opIndexAssign(cast(TypeOfBitPacked!T) val, idx);
            }
        }

        void opIndexAssign(TypeOfBitPacked!T val, size_t n)
        {
            return simpleWrite(val, n);
        }
    }

private:
    // factor - number of elements in one machine word
    // mask - the lowest `bits` bits set
    enum factor = size_t.sizeof*8/bits, mask = 2^^bits-1;
    enum bytesPerWord =  size_t.sizeof;
    size_t* origin;
}
1276 
// data is packed only by power of two sized packs per word,
// thus avoiding mul/div overhead at the cost of ultimate packing
// this construct doesn't own memory, only provides access, see MultiArray for usage
//
// A view covers `limit` items starting `ofs` items after `ptr.origin`;
// all public indices are relative to the start of the view.
struct PackedArrayViewImpl(T, size_t bits)
{
pure nothrow:

    // View over `items` items that begin `offset` items past `origin`.
    this(inout(size_t)* origin, size_t offset, size_t items) inout @safe
    {
        ptr = inout(PackedPtr!(T))(origin);
        ofs = offset;
        limit = items;
    }

    // Returns true if every item in [s, e) is zero.
    bool zeros(size_t s, size_t e)
    in
    {
        assert(s <= e);
    }
    do
    {
        s += ofs;
        e += ofs;
        immutable pad_s = roundUp(s);
        // the range ends before the next word boundary: there is no whole
        // word to test, and scanning up to pad_s would read past e
        if (pad_s >= e)
        {
            foreach (i; s .. e)
                if (ptr[i])
                    return false;
            return true;
        }
        immutable pad_e = roundDown(e);
        size_t i;
        // unaligned head, item by item
        for (i=s; i<pad_s; i++)
            if (ptr[i])
                return false;
        // all in between is x*factor elements
        for (size_t j=i/factor; i<pad_e; i+=factor, j++)
            if (ptr.origin[j])
                return false;
        // unaligned tail, item by item
        for (; i<e; i++)
            if (ptr[i])
                return false;
        return true;
    }

    // Reads item `idx` of the view.
    T opIndex(size_t idx) inout
    in
    {
        assert(idx < limit);
    }
    do
    {
        return ptr[ofs + idx];
    }

    static if (isBitPacked!T) // lack of user-defined implicit conversion
    {
        void opIndexAssign(T val, size_t idx)
        {
            return opIndexAssign(cast(TypeOfBitPacked!T) val, idx);
        }
    }

    // Writes item `idx` of the view.
    void opIndexAssign(TypeOfBitPacked!T val, size_t idx)
    in
    {
        assert(idx < limit);
    }
    do
    {
        ptr[ofs + idx] = val;
    }

    static if (isBitPacked!T) // lack of user-defined implicit conversions
    {
        void opSliceAssign(T val, size_t start, size_t end)
        {
            opSliceAssign(cast(TypeOfBitPacked!T) val, start, end);
        }
    }

    // Sets every item in [start, end) to `val`, writing whole words wherever
    // the range covers them completely.
    void opSliceAssign(TypeOfBitPacked!T val, size_t start, size_t end)
    in
    {
        assert(start <= end);
        assert(end <= limit);
    }
    do
    {
        // account for ofsetted view
        start += ofs;
        end += ofs;
        // rounded to factor granularity
        immutable pad_start = roundUp(start);// rounded up
        if (pad_start >= end) //rounded up >= then end of slice
        {
            //nothing to gain, use per element assignment
            foreach (i; start .. end)
                ptr[i] = val;
            return;
        }
        immutable pad_end = roundDown(end); // rounded down
        size_t i;
        for (i=start; i<pad_start; i++)
            ptr[i] = val;
        // all in between is x*factor elements
        if (pad_start != pad_end)
        {
            immutable repval = replicateBits!(factor, bits)(val);
            for (size_t j=i/factor; i<pad_end; i+=factor, j++)
                ptr.origin[j] = repval;// so speed it up by factor
        }
        for (; i<end; i++)
            ptr[i] = val;
    }

    // Sub-view over items [from, to) of this view.
    auto opSlice(size_t from, size_t to)inout
    in
    {
        assert(from <= to);
        // `limit` counts items of this view, so the bound is relative to the
        // view and must not include `ofs`
        assert(to <= limit);
    }
    do
    {
        return typeof(this)(ptr.origin, ofs + from, to - from);
    }

    // View over all items.
    auto opSlice(){ return opSlice(0, length); }

    // Item-wise equality with another view; compares whole words when both
    // views start on a word boundary and the length is a multiple of factor.
    bool opEquals(T)(auto ref T arr) const
    {
        if (limit != arr.limit)
           return false;
        size_t s1 = ofs, s2 = arr.ofs;
        size_t e1 = s1 + limit, e2 = s2 + limit;
        if (s1 % factor == 0 && s2 % factor == 0 && length % factor == 0)
        {
            return ptr.origin[s1/factor .. e1/factor]
                == arr.ptr.origin[s2/factor .. e2/factor];
        }
        for (size_t i=0;i<limit; i++)
            if (this[i] != arr[i])
                return false;
        return true;
    }

    // Number of items in the view.
    @property size_t length()const{ return limit; }

private:
    // nearest multiples of factor at or above / at or below val
    auto roundUp()(size_t val){ return (val+factor-1)/factor*factor; }
    auto roundDown()(size_t val){ return val/factor*factor; }
    // factor - number of elements in one machine word
    enum factor = size_t.sizeof*8/bits;
    PackedPtr!(T) ptr;
    size_t ofs, limit;
}
1434 
1435 
// Range over the index window [from, to) of an indexable object that is
// reached through the pointer `arr`. Reads and writes go straight through to
// the underlying object; narrowing the window never touches it.
private struct SliceOverIndexed(T)
{
    enum assignableIndex = is(typeof((){ T.init[0] = Item.init; }));
    enum assignableSlice = is(typeof((){ T.init[0 .. 0] = Item.init; }));

    // Item `idx`, counted from the start of the window.
    auto opIndex(size_t idx)const
    in (idx < to - from)
    {
        return (*arr)[from+idx];
    }

    static if (assignableIndex)
    void opIndexAssign(Item val, size_t idx)
    in (idx < to - from)
    {
       (*arr)[from+idx] = val;
    }

    // Sub-window [a, b), relative to this window.
    auto opSlice(size_t a, size_t b)
    {
        return typeof(this)(from+a, from+b, arr);
    }

    // static if (assignableSlice)
    void opSliceAssign(T)(T val, size_t start, size_t end)
    {
        (*arr)[start+from .. end+from] = val;
    }

    // The whole window.
    auto opSlice()
    {
        return typeof(this)(from, to, arr);
    }

    @property size_t length()const
    {
        return to-from;
    }

    alias opDollar = length;

    @property bool empty()const
    {
        return from == to;
    }

    @property auto front()const
    {
        return (*arr)[from];
    }

    static if (assignableIndex)
    @property void front(Item val)
    {
        (*arr)[from] = val;
    }

    @property auto back()const
    {
        return (*arr)[to-1];
    }

    static if (assignableIndex)
    @property void back(Item val)
    {
        (*arr)[to-1] = val;
    }

    @property auto save() inout
    {
        return this;
    }

    void popFront()
    {
        from++;
    }

    void popBack()
    {
        to--;
    }

    // Item-wise comparison with anything that offers `length` and indexing.
    bool opEquals(T)(auto ref T arr) const
    {
        if (arr.length != length)
            return false;
        foreach (i; 0 .. length)
            if (this[i] != arr[i])
                return false;
        return true;
    }
private:
    alias Item = typeof(T.init[0]);
    size_t from, to;
    T* arr;
}
1513 
@safe pure nothrow @nogc unittest
{
    // a view over a plain array must model a random-access range
    alias IntView = SliceOverIndexed!(int[]);
    static assert(isRandomAccessRange!IntView);
}
1518 
// Read-only view of positions [a, b) of the object `x` points to.
SliceOverIndexed!(const(T)) sliceOverIndexed(T)(size_t a, size_t b, const(T)* x)
if (is(Unqual!T == T))
{
    alias View = SliceOverIndexed!(const(T));
    return View(a, b, x);
}
1524 
// Mutable view of positions [a, b) of the object `x` points to.
// BUG? inout is out of reach
//...SliceOverIndexed.arr only parameters or stack based variables can be inout
SliceOverIndexed!T sliceOverIndexed(T)(size_t a, size_t b, T* x)
if (is(Unqual!T == T))
{
    alias View = SliceOverIndexed!T;
    return View(a, b, x);
}
1532 
@system unittest
{
    int[] idxArray = [2, 3, 5, 8, 13];
    auto sliced = sliceOverIndexed(0, idxArray.length, &idxArray);

    // element access and in-place modification through the view
    assert(!sliced.empty);
    assert(sliced.front == 2);
    sliced.front = 1;
    assert(sliced.front == 1);
    assert(sliced.back == 13);
    sliced.popFront();
    assert(sliced.front == 3);
    assert(sliced.back == 13);
    sliced.back = 11;
    assert(sliced.back == 11);
    sliced.popBack();

    // slicing is relative to the current window
    assert(sliced.front == 3);
    assert(sliced[$-1] == 8);
    sliced = sliced[];
    assert(sliced[0] == 3);
    assert(sliced.back == 8);
    sliced = sliced[1..$];
    assert(sliced.front == 5);
    sliced = sliced[0..$-1];
    assert(sliced[$-1] == 5);

    // comparison and slice assignment
    int[] other = [2, 5];
    assert(sliced[] == sliceOverIndexed(1, 2, &other));
    sliceOverIndexed(0, 2, &idxArray)[0 .. 2] = -1;
    assert(idxArray[0 .. 2] == [-1, -1]);

    // an empty window over a null array; previously `nullArr` was declared
    // but the view was built over `idxArray`, so this case was never exercised
    uint[] nullArr = null;
    auto nullSlice = sliceOverIndexed(0, 0, &nullArr);
    assert(nullSlice.empty);
}
1568 
// Wraps raw word storage `ptr` into a `PackedArrayView!T` that is constructed
// with an offset of 0 and `items` as its last argument.
private inout(PackedArrayView!T) packedArrayView(T)(inout(size_t)* ptr, size_t items)
{
    enum size_t startOffset = 0;
    return inout(PackedArrayView!T)(ptr, startOffset, items);
}
1573 
1574 
1575 //============================================================================
1576 // Partially unrolled binary search using Shar's method
1577 //============================================================================
1578 
/*
    Generates the source of a `switch` statement that performs the final steps
    of a uniform binary search with steps `size/2, size/4, ..., 1`.

    The generated code is meant to be mixed into a scope that provides `m`
    (the current step), `idx` (the current position), `range`, `needle`
    and `pred`. It jumps to the case matching the current value of `m` and
    falls through all smaller steps down to the final single-element check.

    Params:
        size = upper bound (exclusive) for `m`, must be a power of two or zero
    Returns:
        D source code of the unrolled search
*/
string genUnrolledSwitchSearch(size_t size) @safe pure nothrow
{
    import core.bitop : bsr;
    import std.array : replace;
    import std.conv : to;
    assert(isPow2OrZero(size));
    // `bsr` is undefined for 0, so the generated code maps an `m` of zero
    // (a single-element range) straight to `case 0` instead of calling it.
    string code = `
    import core.bitop : bsr;
    auto power = m ? bsr(m)+1 : 0;
    switch (power){`;
    // same guard for the generator itself: a size of zero yields no steps
    immutable steps = size ? bsr(size) : 0;
    size_t i = steps;
    foreach_reverse (val; 0 .. steps)
    {
        auto v = 2^^val;
        // `case i` handles a step of 2^^(i-1) and then falls through
        code ~= `
        case pow:
            if (pred(range[idx+m], needle))
                idx +=  m;
            goto case;
        `.replace("m", to!string(v))
        .replace("pow", to!string(i));
        i--;
    }
    code ~= `
        case 0:
            if (pred(range[idx], needle))
                idx += 1;
            goto default;
        `;
    code ~= `
        default:
    }`;
    return code;
}
1613 
// True when `sz` has at most one bit set, i.e. it is zero or a power of two.
bool isPow2OrZero(size_t sz) @safe pure nothrow @nogc
{
    // See also: std.math.isPowerOf2()
    // clearing the lowest set bit leaves nothing only if there was at most one
    immutable size_t withoutLowestBit = sz & (sz - 1);
    return withoutLowestBit == 0;
}
1619 
// Uniform binary search over a range whose length is a power of two.
// Returns the position reached by repeatedly advancing past elements for
// which `pred(element, needle)` holds, using halving steps.
size_t uniformLowerBound(alias pred, Range, T)(Range range, T needle)
if (is(T : ElementType!Range))
{
    assert(isPow2OrZero(range.length));
    size_t idx = 0;
    for (size_t step = range.length / 2; step != 0; step /= 2)
    {
        if (pred(range[idx + step], needle))
            idx += step;
    }
    // the element at `idx` itself has not been tested yet
    return pred(range[idx], needle) ? idx + 1 : idx;
}
1635 
// Same search as `uniformLowerBound`, but steps below `max` are executed by
// the unrolled switch produced by `genUnrolledSwitchSearch`.
// The names `idx`, `m`, `range`, `needle` and `pred` are used by the mixin.
size_t switchUniformLowerBound(alias pred, Range, T)(Range range, T needle)
if (is(T : ElementType!Range))
{
    assert(isPow2OrZero(range.length));
    enum max = 1 << 10;
    size_t idx = 0, m = range.length/2;
    // plain loop for the large steps the switch does not cover
    for (; m >= max; m /= 2)
    {
        if (pred(range[idx+m], needle))
            idx += m;
    }
    mixin(genUnrolledSwitchSearch(max));
    return idx;
}
1651 
// Lifts a power-of-two-only lower bound search `uniLowerBound` to ranges of
// arbitrary length by searching one of two overlapping power-of-two windows.
template sharMethod(alias uniLowerBound)
{
    size_t sharMethod(alias _pred="a<b", Range, T)(Range range, T needle)
        if (is(T : ElementType!Range))
    {
        import std.functional : binaryFun;
        import std.math.algebraic : nextPow2, truncPow2;
        alias pred = binaryFun!_pred;

        if (range.length == 0)
            return 0;
        if (isPow2OrZero(range.length))
            return uniLowerBound!pred(range, needle);

        // largest power-of-two prefix of the range
        size_t n = truncPow2(range.length);
        if (!pred(range[n-1], needle))
            return uniLowerBound!pred(range[0 .. n], needle);

        // search in another 2^^k area that fully covers the tail of range
        size_t k = nextPow2(range.length - n + 1);
        return range.length - k + uniLowerBound!pred(range[$-k..$], needle);
    }
}
1674 
1675 alias sharLowerBound = sharMethod!uniformLowerBound;
1676 alias sharSwitchLowerBound = sharMethod!switchUniformLowerBound;
1677 
@safe unittest
{
    import std.array : array;
    import std.range : assumeSorted, iota;

    // reference result computed by the standard sorted-range search
    auto stdLowerBound(T)(T[] range, T needle)
    {
        return assumeSorted(range).lowerBound(needle).length;
    }

    // both Shar variants must agree with the reference
    void check(int[] haystack, int needle)
    {
        auto st = stdLowerBound(haystack, needle);
        assert(st == sharLowerBound(haystack, needle));
        assert(st == sharSwitchLowerBound(haystack, needle));
    }

    immutable MAX = 5*1173;
    auto arr = array(iota(5, MAX, 5));
    assert(arr.length == MAX/5-1);
    foreach (i; 0 .. MAX+5)
        check(arr, i);

    // empty input
    arr = [];
    check(arr, 33);
}
1701 //============================================================================
1702 
1703 @safe
1704 {
1705 // hope to see similar stuff in public interface... once Allocators are out
1706 //@@@BUG moveFront and friends? dunno, for now it's POD-only
1707 
// Replaces `dest[from .. to]` with the contents of `stuff`, resizing `dest`
// as needed (through `Policy.realloc`, or by setting `.length` when `Policy`
// is `void`). Returns the index just past the inserted elements.
@trusted size_t genericReplace(Policy=void, T, Range)
    (ref T dest, size_t from, size_t to, Range stuff)
{
    import std.algorithm.mutation : copy;
    immutable size_t removed = to - from;
    immutable size_t inserted = stuff.length;
    immutable size_t stuff_end = from + inserted;

    if (inserted == removed)
    {
        // same size: overwrite in place
        copy(stuff, dest[from .. to]);
    }
    else if (inserted > removed)
    {// replace increases length
        immutable size_t growth = inserted - removed;
        static if (is(Policy == void))
            dest.length = dest.length + growth;//@@@BUG lame @property
        else
            dest = Policy.realloc(dest, dest.length + growth);
        // shift the tail right first, then fill the gap
        copyBackwards(dest[to .. dest.length - growth],
            dest[to + growth .. dest.length]);
        copyForward(stuff, dest[from .. stuff_end]);
    }
    else
    {// replace decreases length
        immutable size_t shrink = removed - inserted;
        copy(stuff, dest[from .. stuff_end]);
        // pull the tail left, then drop the now unused end
        copyForward(dest[to .. dest.length],
            dest[stuff_end .. dest.length - shrink]);
        static if (is(Policy == void))
            dest.length = dest.length - shrink;//@@@BUG lame @property
        else
            dest = Policy.realloc(dest, dest.length - shrink);
    }
    return stuff_end;
}
1742 
1743 
// Simple storage manipulation policy: arrays are ordinary GC-managed slices.
@safe private struct GcPolicy
{
    import std.traits : isDynamicArray;

    // Mutable copy of `arr`.
    static T[] dup(T)(const T[] arr)
    {
        auto copy = arr.dup;
        return copy;
    }

    // New array of `size` elements.
    static T[] alloc(T)(size_t size)
    {
        return new T[](size);
    }

    // `arr` resized to `sz` elements.
    static T[] realloc(T)(T[] arr, size_t sz)
    {
        arr.length = sz;
        return arr;
    }

    // Replaces `dest[from .. to]` with `stuff`.
    static void replaceImpl(T, Range)(ref T[] dest, size_t from, size_t to, Range stuff)
    {
        replaceInPlace(dest, from, to, stuff);
    }

    // Appends a single value.
    static void append(T, V)(ref T[] arr, V value)
        if (!isInputRange!V)
    {
        arr ~= force!T(value);
    }

    // Appends every element of a range.
    static void append(T, V)(ref T[] arr, V value)
        if (isInputRange!V)
    {
        insertInPlace(arr, arr.length, value);
    }

    // Drops the reference held by `arr`. In debug builds an array with
    // unqualified type is first overwritten with a recognizable pattern.
    static void destroy(T)(ref T arr) pure // pure required for -dip25, inferred for -dip1000
        if (isDynamicArray!T)
    {
        debug
        {
            static if (is(Unqual!T == T))
                arr[] = cast(typeof(T.init[0]))(0xdead_beef);
        }
        arr = null;
    }
}
1798 
// Storage manipulation policy backed by the C heap (malloc/realloc/free).
// Arrays produced here must be released with `destroy`.
@safe struct ReallocPolicy
{
    import std.range.primitives : hasLength;

    // Returns a freshly allocated copy of `arr`.
    static T[] dup(T)(const T[] arr)
    {
        auto result = alloc!T(arr.length);
        result[] = arr[];
        return result;
    }

    // Allocates an uninitialized array of `size` elements.
    // Aborts if the size in bytes overflows `size_t`.
    static T[] alloc(T)(size_t size) @trusted
    {
        import std.internal.memory : enforceMalloc;

        // malloc(0) is allowed to return null, which would be indistinguishable
        // from an allocation failure; an empty request needs no storage at all.
        // This mirrors how `realloc` below treats a size of zero.
        if (!size)
            return null;

        import core.checkedint : mulu;
        bool overflow;
        size_t nbytes = mulu(size, T.sizeof, overflow);
        if (overflow) assert(0);

        auto ptr = cast(T*) enforceMalloc(nbytes);
        return ptr[0 .. size];
    }

    // Resizes `arr` to `size` elements; a size of zero frees the storage and
    // yields `null`. Aborts if the size in bytes overflows `size_t`.
    static T[] realloc(T)(return scope T[] arr, size_t size) @trusted
    {
        import std.internal.memory : enforceRealloc;
        if (!size)
        {
            destroy(arr);
            return null;
        }

        import core.checkedint : mulu;
        bool overflow;
        size_t nbytes = mulu(size, T.sizeof, overflow);
        if (overflow) assert(0);

        auto ptr = cast(T*) enforceRealloc(arr.ptr, nbytes);
        return ptr[0 .. size];
    }

    // Replaces `dest[from .. to]` with `stuff`, resizing through this policy.
    static void replaceImpl(T, Range)(ref T[] dest, size_t from, size_t to, Range stuff)
    {
        genericReplace!(ReallocPolicy)(dest, from, to, stuff);
    }

    // Appends a single value.
    static void append(T, V)(ref T[] arr, V value)
        if (!isInputRange!V)
    {
        if (arr.length == size_t.max) assert(0);
        arr = realloc(arr, arr.length+1);
        arr[$-1] = force!T(value);
    }

    pure @safe unittest
    {
        int[] arr;
        ReallocPolicy.append(arr, 3);

        import std.algorithm.comparison : equal;
        assert(equal(arr, [3]));
    }

    // Appends every element of a range with known length.
    static void append(T, V)(ref T[] arr, V value)
        if (isInputRange!V && hasLength!V)
    {
        import core.checkedint : addu;
        bool overflow;
        size_t nelems = addu(arr.length, value.length, overflow);
        if (overflow) assert(0);

        arr = realloc(arr, nelems);

        import std.algorithm.mutation : copy;
        copy(value, arr[$-value.length..$]);
    }

    pure @safe unittest
    {
        int[] arr;
        ReallocPolicy.append(arr, [1,2,3]);

        import std.algorithm.comparison : equal;
        assert(equal(arr, [1,2,3]));
    }

    // Frees the storage of `arr` (if any) and resets it to `null`.
    static void destroy(T)(scope ref T[] arr) @trusted
    {
        import core.memory : pureFree;
        if (arr.ptr)
            pureFree(arr.ptr);
        arr = null;
    }
}
1895 
1896 //build hack
1897 alias _RealArray = CowArray!ReallocPolicy;
1898 
pure @safe unittest
{
    import std.algorithm.comparison : equal;

    with(ReallocPolicy)
    {
        // Applies the replacement to `orig`, releases it, and reports whether
        // the outcome matched `result`.
        bool test(T, U, V)(T orig, size_t from, size_t to, U toReplace, V result,
                   string file = __FILE__, size_t line = __LINE__)
        {
            replaceImpl(orig, from, to, toReplace);
            scope(exit) destroy(orig);
            return equal(orig, result);
        }
        // heap-allocated copy of the arguments
        static T[] arr(T)(T[] args... )
        {
            return dup(args);
        }

        // insert at the front
        assert(test(arr([1, 2, 3, 4]), 0, 0, [5, 6, 7], [5, 6, 7, 1, 2, 3, 4]));
        // remove a prefix
        assert(test(arr([1, 2, 3, 4]), 0, 2, cast(int[])[], [3, 4]));
        // replace everything with a shorter run
        assert(test(arr([1, 2, 3, 4]), 0, 4, [5, 6, 7], [5, 6, 7]));
        // replace a prefix with a longer run
        assert(test(arr([1, 2, 3, 4]), 0, 2, [5, 6, 7], [5, 6, 7, 3, 4]));
        // replace one element in the middle with several
        assert(test(arr([1, 2, 3, 4]), 2, 3, [5, 6, 7], [1, 2, 5, 6, 7, 4]));
    }
}
1928 
/**
    Tests if `T` is some kind of set of code points, i.e. an instance of
    $(LREF InversionList). Intended for template constraints.
*/
public template isCodepointSet(T)
{
    enum isCodepointSet = is(T == InversionList!(Args), Args...);
}
1939 
/**
    Tests if `T` is a pair of integers that implicitly convert to `V`.
    The following code must compile for any pair `T`:
    ---
    (T x){ V a = x[0]; V b = x[1];}
    ---
    The following must not compile:
     ---
    (T x){ V c = x[2];}
    ---
*/
public template isIntegralPair(T, V=uint)
{
    // both slots exist and convert to V
    private enum hasBothSlots = is(typeof((T x){ V a = x[0]; V b = x[1];}));
    // a third slot would make it more than a pair
    private enum hasThirdSlot = is(typeof((T x){ V c = x[2]; }));
    enum isIntegralPair = hasBothSlots && !hasThirdSlot;
}
1956 
1957 
1958 /**
1959     The recommended default type for set of $(CODEPOINTS).
1960     For details, see the current implementation: $(LREF InversionList).
1961 */
1962 public alias CodepointSet = InversionList!GcPolicy;
1963 
1964 
1965 //@@@BUG: std.typecons tuples depend on std.format to produce fields mixin
1966 // which relies on std.uni.isGraphical and this chain blows up with Forward reference error
1967 // hence below doesn't seem to work
1968 // public alias CodepointInterval = Tuple!(uint, "a", uint, "b");
1969 
/**
    The recommended type of $(REF Tuple, std,_typecons)
    to represent [a, b$(RPAREN) intervals of $(CODEPOINTS). As used in $(LREF InversionList).
    Any interval type should pass $(LREF isIntegralPair) trait.

    The two bounds are stored in a `uint[2]` that the struct aliases itself to,
    so an interval can be indexed with `[0]` and `[1]` or read through the
    `a` and `b` properties.
*/
public struct CodepointInterval
{
pure:
    uint[2] _tuple;
    alias _tuple this;

@safe pure nothrow @nogc:

    /// Creates the interval with bounds `low` and `high`.
    this(uint low, uint high)
    {
        _tuple[0] = low;
        _tuple[1] = high;
    }

    /// Compares both bounds against the first two elements of `val`.
    bool opEquals(T)(T val) const
    {
        if (this[0] != val[0])
            return false;
        return this[1] == val[1];
    }

    /// First bound of the interval.
    @property ref inout(uint) a() return inout
    {
        return _tuple[0];
    }

    /// Second bound of the interval.
    @property ref inout(uint) b() return inout
    {
        return _tuple[1];
    }
}
1995 
1996 /**
1997     $(P
1998     `InversionList` is a set of $(CODEPOINTS)
1999     represented as an array of open-right [a, b$(RPAREN)
2000     intervals (see $(LREF CodepointInterval) above).
2001     The name comes from the way the representation reads left to right.
2002     For instance a set of all values [10, 50$(RPAREN), [80, 90$(RPAREN),
2003     plus a singular value 60 looks like this:
2004     )
2005     ---
2006     10, 50, 60, 61, 80, 90
2007     ---
2008     $(P
2009     The way to read this is: start with negative meaning that all numbers
2010     smaller than the next one are not present in this set (and positive -
2011     the contrary). Then switch positive/negative after each
2012     number passed from left to right.
2013     )
2014     $(P This way negative spans until 10, then positive until 50,
2015     then negative until 60, then positive until 61, and so on.
2016     As seen this provides a space-efficient storage of highly redundant data
2017     that comes in long runs. A description which Unicode $(CHARACTER)
2018     properties fit nicely. The technique itself could be seen as a variation
2019     on $(LINK2 https://en.wikipedia.org/wiki/Run-length_encoding, RLE encoding).
2020     )
2021 
2022     $(P Sets are value types (just like `int` is) thus they
2023         are never aliased.
2024     )
2025         Example:
2026         ---
2027         auto a = CodepointSet('a', 'z'+1);
2028         auto b = CodepointSet('A', 'Z'+1);
2029         auto c = a;
2030         a = a | b;
2031         assert(a == CodepointSet('A', 'Z'+1, 'a', 'z'+1));
2032         assert(a != c);
2033         ---
2034     $(P See also $(LREF unicode) for simpler construction of sets
2035         from predefined ones.
2036     )
2037 
2038     $(P Memory usage is 8 bytes per each contiguous interval in a set.
2039     The value semantics are achieved by using the
2040     $(HTTP en.wikipedia.org/wiki/Copy-on-write, COW) technique
2041     and thus it's $(RED not) safe to cast this type to $(D_KEYWORD shared).
2042     )
2043 
2044     Note:
2045     $(P It's not recommended to rely on the template parameters
2046     or the exact type of a current $(CODEPOINT) set in `std.uni`.
2047     The type and parameters may change when the standard
2048     allocators design is finalized.
2049     Use $(LREF isCodepointSet) with templates or just stick with the default
2050     alias $(LREF CodepointSet) throughout the whole code base.
2051     )
2052 */
2053 public struct InversionList(SP=GcPolicy)
2054 {
2055     import std.range : assumeSorted;
2056 
2057     /**
2058         Construct from another code point set of any type.
2059     */
2060     this(Set)(Set set) pure
2061         if (isCodepointSet!Set)
2062     {
2063         uint[] arr;
2064         foreach (v; set.byInterval)
2065         {
2066             arr ~= v.a;
2067             arr ~= v.b;
2068         }
2069         data = CowArray!(SP).reuse(arr);
2070     }
2071 
2072     /**
2073         Construct a set from a forward range of code point intervals.
2074     */
2075     this(Range)(Range intervals) pure
2076         if (isForwardRange!Range && isIntegralPair!(ElementType!Range))
2077     {
2078         uint[] arr;
2079         foreach (v; intervals)
2080         {
2081             SP.append(arr, v.a);
2082             SP.append(arr, v.b);
2083         }
2084         data = CowArray!(SP).reuse(arr);
2085         sanitize(); //enforce invariant: sort intervals etc.
2086     }
2087 
2088     //helper function that avoids sanity check to be CTFE-friendly
2089     private static fromIntervals(Range)(Range intervals) pure
2090     {
2091         import std.algorithm.iteration : map;
2092         import std.range : roundRobin;
2093         auto flattened = roundRobin(intervals.save.map!"a[0]"(),
2094             intervals.save.map!"a[1]"());
2095         InversionList set;
2096         set.data = CowArray!(SP)(flattened);
2097         return set;
2098     }
2099     //ditto untill sort is CTFE-able
2100     private static fromIntervals()(uint[] intervals...) pure
2101     in
2102     {
2103         import std.conv : text;
2104         assert(intervals.length % 2 == 0, "Odd number of interval bounds [a, b)!");
2105         for (uint i = 0; i < intervals.length; i += 2)
2106         {
2107             auto a = intervals[i], b = intervals[i+1];
2108             assert(a < b, text("illegal interval [a, b): ", a, " > ", b));
2109         }
2110     }
2111     do
2112     {
2113         InversionList set;
2114         set.data = CowArray!(SP)(intervals);
2115         return set;
2116     }
2117 
2118     /**
2119         Construct a set from plain values of code point intervals.
2120     */
2121     this()(uint[] intervals...)
2122     in
2123     {
2124         import std.conv : text;
2125         assert(intervals.length % 2 == 0, "Odd number of interval bounds [a, b)!");
2126         for (uint i = 0; i < intervals.length; i += 2)
2127         {
2128             auto a = intervals[i], b = intervals[i+1];
2129             assert(a < b, text("illegal interval [a, b): ", a, " > ", b));
2130         }
2131     }
2132     do
2133     {
2134         data = CowArray!(SP)(intervals);
2135         sanitize(); //enforce invariant: sort intervals etc.
2136     }
2137 
2138     ///
2139     pure @safe unittest
2140     {
2141         import std.algorithm.comparison : equal;
2142 
2143         auto set = CodepointSet('a', 'z'+1, 'а', 'я'+1);
2144         foreach (v; 'a'..'z'+1)
2145             assert(set[v]);
2146         // Cyrillic lowercase interval
2147         foreach (v; 'а'..'я'+1)
2148             assert(set[v]);
2149         //specific order is not required, intervals may interesect
2150         auto set2 = CodepointSet('а', 'я'+1, 'a', 'd', 'b', 'z'+1);
2151         //the same end result
2152         assert(set2.byInterval.equal(set.byInterval));
2153         // test constructor this(Range)(Range intervals)
2154         auto chessPiecesWhite = CodepointInterval(9812, 9818);
2155         auto chessPiecesBlack = CodepointInterval(9818, 9824);
2156         auto set3 = CodepointSet([chessPiecesWhite, chessPiecesBlack]);
2157         foreach (v; '♔'..'♟'+1)
2158             assert(set3[v]);
2159     }
2160 
2161     /**
2162         Get range that spans all of the $(CODEPOINT) intervals in this $(LREF InversionList).
2163     */
2164     @property auto byInterval() scope
2165     {
2166         // TODO: change this to data[] once the -dip1000 errors have been fixed
2167         // see e.g. https://github.com/dlang/phobos/pull/6638
2168         import std.array : array;
2169         return Intervals!(typeof(data.array))(data.array);
2170     }
2171 
2172     @safe unittest
2173     {
2174         import std.algorithm.comparison : equal;
2175         import std.typecons : tuple;
2176 
2177         auto set = CodepointSet('A', 'D'+1, 'a', 'd'+1);
2178 
2179         assert(set.byInterval.equal([tuple('A','E'), tuple('a','e')]));
2180     }
2181 
2182     package(std) @property const(CodepointInterval)[] intervals() const
2183     {
2184         import std.array : array;
2185         return Intervals!(typeof(data[]))(data[]).array;
2186     }
2187 
2188     /**
2189         Tests the presence of code point `val` in this set.
2190     */
2191     bool opIndex(uint val) const
2192     {
2193         // the <= ensures that searching in  interval of [a, b) for 'a' you get .length == 1
2194         // return assumeSorted!((a,b) => a <= b)(data[]).lowerBound(val).length & 1;
2195         return sharSwitchLowerBound!"a <= b"(data[], val) & 1;
2196     }
2197 
2198     ///
2199     pure @safe unittest
2200     {
2201         auto gothic = unicode.Gothic;
2202         // Gothic letter ahsa
2203         assert(gothic['\U00010330']);
2204         // no ascii in Gothic obviously
2205         assert(!gothic['$']);
2206     }
2207 
2208 
2209     // Linear scan for `ch`. Useful only for small sets.
2210     // TODO:
2211     // used internally in std.regex
2212     // should be properly exposed in a public API ?
2213     package(std) auto scanFor()(dchar ch) const
2214     {
2215         immutable len = data.length;
2216         for (size_t i = 0; i < len; i++)
2217             if (ch < data[i])
2218                 return i & 1;
2219         return 0;
2220     }
2221 
2222     /// Number of $(CODEPOINTS) in this set
2223     @property size_t length()
2224     {
2225         size_t sum = 0;
2226         foreach (iv; byInterval)
2227         {
2228             sum += iv.b - iv.a;
2229         }
2230         return sum;
2231     }
2232 
2233 // bootstrap full set operations from 4 primitives (suitable as a template mixin):
2234 // addInterval, skipUpTo, dropUpTo & byInterval iteration
2235 //============================================================================
2236 public:
2237     /**
2238         $(P Sets support natural syntax for set algebra, namely: )
2239         $(BOOKTABLE ,
2240             $(TR $(TH Operator) $(TH Math notation) $(TH Description) )
2241             $(TR $(TD &) $(TD a ∩ b) $(TD intersection) )
2242             $(TR $(TD |) $(TD a ∪ b) $(TD union) )
2243             $(TR $(TD -) $(TD a ∖ b) $(TD subtraction) )
2244             $(TR $(TD ~) $(TD a ~ b) $(TD symmetric set difference i.e. (a ∪ b) \ (a ∩ b)) )
2245         )
2246     */
2247     This opBinary(string op, U)(U rhs)
2248         if (isCodepointSet!U || is(U:dchar))
2249     {
2250         static if (op == "&" || op == "|" || op == "~")
2251         {// symmetric ops thus can swap arguments to reuse r-value
2252             static if (is(U:dchar))
2253             {
2254                 auto tmp = this;
2255                 mixin("tmp "~op~"= rhs; ");
2256                 return tmp;
2257             }
2258             else
2259             {
2260                 static if (is(Unqual!U == U))
2261                 {
2262                     // try hard to reuse r-value
2263                     mixin("rhs "~op~"= this;");
2264                     return rhs;
2265                 }
2266                 else
2267                 {
2268                     auto tmp = this;
2269                     mixin("tmp "~op~"= rhs;");
2270                     return tmp;
2271                 }
2272             }
2273         }
2274         else static if (op == "-") // anti-symmetric
2275         {
2276             auto tmp = this;
2277             tmp -= rhs;
2278             return tmp;
2279         }
2280         else
2281             static assert(0, "no operator "~op~" defined for Set");
2282     }
2283 
2284     ///
2285     pure @safe unittest
2286     {
2287         import std.algorithm.comparison : equal;
2288         import std.range : iota;
2289 
2290         auto lower = unicode.LowerCase;
2291         auto upper = unicode.UpperCase;
2292         auto ascii = unicode.ASCII;
2293 
2294         assert((lower & upper).empty); // no intersection
2295         auto lowerASCII = lower & ascii;
2296         assert(lowerASCII.byCodepoint.equal(iota('a', 'z'+1)));
2297         // throw away all of the lowercase ASCII
2298         assert((ascii - lower).length == 128 - 26);
2299 
2300         auto onlyOneOf = lower ~ ascii;
2301         assert(!onlyOneOf['Δ']); // not ASCII and not lowercase
2302         assert(onlyOneOf['$']); // ASCII and not lowercase
2303         assert(!onlyOneOf['a']); // ASCII and lowercase
2304         assert(onlyOneOf['я']); // not ASCII but lowercase
2305 
2306         // throw away all cased letters from ASCII
2307         auto noLetters = ascii - (lower | upper);
2308         assert(noLetters.length == 128 - 26*2);
2309     }
2310 
2311     /// The 'op=' versions of the above overloaded operators.
2312     ref This opOpAssign(string op, U)(U rhs)
2313         if (isCodepointSet!U || is(U:dchar))
2314     {
2315         static if (op == "|")    // union
2316         {
2317             static if (is(U:dchar))
2318             {
2319                 this.addInterval(rhs, rhs+1);
2320                 return this;
2321             }
2322             else
2323                 return this.add(rhs);
2324         }
2325         else static if (op == "&")   // intersection
2326                 return this.intersect(rhs);// overloaded
2327         else static if (op == "-")   // set difference
2328                 return this.sub(rhs);// overloaded
2329         else static if (op == "~")   // symmetric set difference
2330         {
2331             auto copy = this & rhs;
2332             this |= rhs;
2333             this -= copy;
2334             return this;
2335         }
2336         else
2337             static assert(0, "no operator "~op~" defined for Set");
2338     }
2339 
2340     /**
2341         Tests the presence of codepoint `ch` in this set,
2342         the same as $(LREF opIndex).
2343     */
2344     bool opBinaryRight(string op: "in", U)(U ch) const
2345         if (is(U : dchar))
2346     {
2347         return this[ch];
2348     }
2349 
2350     ///
2351     pure @safe unittest
2352     {
2353         assert('я' in unicode.Cyrillic);
2354         assert(!('z' in unicode.Cyrillic));
2355     }
2356 
2357 
2358 
2359     /**
2360      * Obtains a set that is the inversion of this set.
2361      *
2362      * See_Also: $(LREF inverted)
2363      */
2364     auto opUnary(string op: "!")()
2365     {
2366         return this.inverted;
2367     }
2368 
2369     /**
2370         A range that spans each $(CODEPOINT) in this set.
2371     */
2372     @property auto byCodepoint()
2373     {
2374         static struct CodepointRange
2375         {
2376             this(This set)
2377             {
2378                 r = set.byInterval;
2379                 if (!r.empty)
2380                     cur = r.front.a;
2381             }
2382 
2383             @property dchar front() const
2384             {
2385                 return cast(dchar) cur;
2386             }
2387 
2388             @property bool empty() const
2389             {
2390                 return r.empty;
2391             }
2392 
2393             void popFront()
2394             {
2395                 cur++;
2396                 while (cur >= r.front.b)
2397                 {
2398                     r.popFront();
2399                     if (r.empty)
2400                         break;
2401                     cur = r.front.a;
2402                 }
2403             }
2404         private:
2405             uint cur;
2406             @(imported!"core.attribute".mutableRefInit) typeof(This.init.byInterval) r;
2407         }
2408 
2409         return CodepointRange(this);
2410     }
2411 
2412     ///
2413     pure @safe unittest
2414     {
2415         import std.algorithm.comparison : equal;
2416         import std.range : iota;
2417 
2418         auto set = unicode.ASCII;
2419         set.byCodepoint.equal(iota(0, 0x80));
2420     }
2421 
2422     /**
2423         $(P Obtain textual representation of this set in form of
2424         open-right intervals and feed it to `sink`.
2425         )
2426         $(P Used by various standard formatting facilities such as
2427          $(REF formattedWrite, std,format), $(REF write, std,stdio),
2428          $(REF writef, std,stdio), $(REF to, std,conv) and others.
2429         )
2430         Example:
2431         ---
2432         import std.conv;
2433         assert(unicode.ASCII.to!string == "[0..128$(RPAREN)");
2434         ---
2435     */
2436 
2437     private import std.format.spec : FormatSpec;
2438 
2439     /***************************************
2440      * Obtain a textual representation of this InversionList
2441      * in form of open-right intervals.
2442      *
2443      * The formatting flag is applied individually to each value, for example:
2444      * $(LI $(B %s) and $(B %d) format the intervals as a [low .. high$(RPAREN) range of integrals)
2445      * $(LI $(B %x) formats the intervals as a [low .. high$(RPAREN) range of lowercase hex characters)
2446      * $(LI $(B %X) formats the intervals as a [low .. high$(RPAREN) range of uppercase hex characters)
2447      */
2448     void toString(Writer)(scope Writer sink, scope const ref FormatSpec!char fmt) /* const */
2449     {
2450         import std.format.write : formatValue;
2451         auto range = byInterval;
2452         if (range.empty)
2453             return;
2454 
2455         while (1)
2456         {
2457             auto i = range.front;
2458             range.popFront();
2459 
2460             put(sink, "[");
2461             formatValue(sink, i.a, fmt);
2462             put(sink, "..");
2463             formatValue(sink, i.b, fmt);
2464             put(sink, ")");
2465             if (range.empty) return;
2466             put(sink, " ");
2467         }
2468     }
2469 
2470     ///
2471     pure @safe unittest
2472     {
2473         import std.conv : to;
2474         import std.format : format;
2475         import std.uni : unicode;
2476 
2477         // This was originally using Cyrillic script.
2478         // Unfortunately this is a pretty active range for changes,
2479         // and hence broke in an update.
2480         // Therefore the range Basic latin was used instead as it
2481         // unlikely to ever change.
2482 
2483         assert(unicode.InBasic_latin.to!string == "[0..128)");
2484 
2485         // The specs '%s' and '%d' are equivalent to the to!string call above.
2486         assert(format("%d", unicode.InBasic_latin) == unicode.InBasic_latin.to!string);
2487 
2488         assert(format("%#x", unicode.InBasic_latin) == "[0..0x80)");
2489         assert(format("%#X", unicode.InBasic_latin) == "[0..0X80)");
2490     }
2491 
2492     pure @safe unittest
2493     {
2494         import std.exception : assertThrown;
2495         import std.format : format, FormatException;
2496         assertThrown!FormatException(format("%z", unicode.ASCII));
2497     }
2498 
2499 
    /**
        Add an interval [a, b$(RPAREN) to this set.

        Params:
            a = first code point of the interval (inclusive)
            b = end of the interval (exclusive)

        Returns: a reference to this set, so that calls can be chained.
    */
    ref add()(uint a, uint b)
    {
        // the marker returned by addInterval is not needed here
        addInterval(a, b);
        return this;
    }
2508 
2509     ///
2510     pure @safe unittest
2511     {
2512         CodepointSet someSet;
2513         someSet.add('0', '5').add('A','Z'+1);
2514         someSet.add('5', '9'+1);
2515         assert(someSet['0']);
2516         assert(someSet['5']);
2517         assert(someSet['9']);
2518         assert(someSet['Z']);
2519     }
2520 
2521 private:
2522 
2523   package(std)  // used from: std.regex.internal.parser
2524     ref intersect(U)(U rhs)
2525         if (isCodepointSet!U)
2526     {
2527         Marker mark;
2528         foreach ( i; rhs.byInterval)
2529         {
2530             mark = this.dropUpTo(i.a, mark);
2531             mark = this.skipUpTo(i.b, mark);
2532         }
2533         this.dropUpTo(uint.max, mark);
2534         return this;
2535     }
2536 
2537     ref intersect()(dchar ch)
2538     {
2539         foreach (i; byInterval)
2540             if (i.a <= ch && ch < i.b)
2541                 return this = This.init.add(ch, ch+1);
2542         this = This.init;
2543         return this;
2544     }
2545 
2546     pure @safe unittest
2547     {
2548         assert(unicode.Cyrillic.intersect('-').byInterval.empty);
2549     }
2550 
    // Removes the single code point `ch` from this set (in place) by
    // forwarding to subChar; returns whatever subChar returns.
    ref sub()(dchar ch)
    {
        return subChar(ch);
    }
2555 
2556     // same as the above except that skip & drop parts are swapped
2557   package(std)  // used from: std.regex.internal.parser
2558     ref sub(U)(U rhs)
2559         if (isCodepointSet!U)
2560     {
2561         Marker mark;
2562         foreach (i; rhs.byInterval)
2563         {
2564             mark = this.skipUpTo(i.a, mark);
2565             mark = this.dropUpTo(i.b, mark);
2566         }
2567         return this;
2568     }
2569 
2570   package(std)  // used from: std.regex.internal.parse
2571     ref add(U)(U rhs)
2572         if (isCodepointSet!U)
2573     {
2574         Marker start;
2575         foreach (i; rhs.byInterval)
2576         {
2577             start = addInterval(i.a, i.b, start);
2578         }
2579         return this;
2580     }
2581 
2582 // end of mixin-able part
2583 //============================================================================
2584 public:
2585     /**
2586         Obtains a set that is the inversion of this set.
2587 
2588         See the '!' $(LREF opUnary) for the same but using operators.
2589     */
2590     @property auto inverted()
2591     {
2592         InversionList inversion = this;
2593         if (inversion.data.length == 0)
2594         {
2595             inversion.addInterval(0, lastDchar+1);
2596             return inversion;
2597         }
2598         if (inversion.data[0] != 0)
2599             genericReplace(inversion.data, 0, 0, [0]);
2600         else
2601             genericReplace(inversion.data, 0, 1, cast(uint[]) null);
2602         if (data[data.length-1] != lastDchar+1)
2603             genericReplace(inversion.data,
2604                 inversion.data.length, inversion.data.length, [lastDchar+1]);
2605         else
2606             genericReplace(inversion.data,
2607                 inversion.data.length-1, inversion.data.length, cast(uint[]) null);
2608 
2609         return inversion;
2610     }
2611 
2612     ///
2613     pure @safe unittest
2614     {
2615         auto set = unicode.ASCII;
2616         // union with the inverse gets all of the code points in the Unicode
2617         assert((set | set.inverted).length == 0x110000);
2618         // no intersection with the inverse
2619         assert((set & set.inverted).empty);
2620     }
2621 
    // Generates D source for a predicate `bool funcName(dchar ch)` that tests
    // membership in the set described by `range` (pairs [a, b) indexed as
    // ival[0], ival[1]). When `funcName` is empty the name `function` is
    // emitted instead. The body is a tree of nested if/else comparisons that
    // falls back to linear tests for small groups of intervals.
    // NOTE(review): the generated comparisons assume `range` is sorted and
    // non-overlapping - confirm against callers.
    package(std) static string toSourceCode(const(CodepointInterval)[] range, string funcName)
    {
        import std.algorithm.searching : countUntil;
        import std.format : format;
        // groups with fewer intervals than this are tested linearly
        enum maxBinary = 3;
        // Emits a braced block testing each interval in turn, ending in `return false`.
        static string linearScope(R)(R ivals, string indent)
        {
            string result = indent~"{\n";
            string deeper = indent~"    ";
            foreach (ival; ivals)
            {
                immutable span = ival[1] - ival[0];
                assert(span != 0);
                if (span == 1)
                {
                    // single code point: one equality test
                    result ~= format("%sif (ch == %s) return true;\n", deeper, ival[0]);
                }
                else if (span == 2)
                {
                    // two code points: two equality tests
                    result ~= format("%sif (ch == %s || ch == %s) return true;\n",
                        deeper, ival[0], ival[0]+1);
                }
                else
                {
                    // wider interval: reject below the start, accept below the end
                    if (ival[0] != 0) // dchar is unsigned and  < 0 is useless
                        result ~= format("%sif (ch < %s) return false;\n", deeper, ival[0]);
                    result ~= format("%sif (ch < %s) return true;\n", deeper, ival[1]);
                }
            }
            result ~= format("%sreturn false;\n%s}\n", deeper, indent); // including empty range of intervals
            return result;
        }

        // Chooses between linear tests and a further bisection at the middle interval.
        static string binaryScope(R)(R ivals, string indent) @safe
        {
            // time to do unrolled comparisons?
            if (ivals.length < maxBinary)
                return linearScope(ivals, indent);
            else
                return bisect(ivals, ivals.length/2, indent);
        }

        // not used yet if/elsebinary search is far better with DMD  as of 2.061
        // and GDC is doing fine job either way
        // Emits a `switch (ch)` with one case (or case range) per interval.
        static string switchScope(R)(R ivals, string indent)
        {
            string result = indent~"switch (ch){\n";
            string deeper = indent~"    ";
            foreach (ival; ivals)
            {
                if (ival[0]+1 == ival[1])
                {
                    result ~= format("%scase %s: return true;\n",
                        deeper, ival[0]);
                }
                else
                {
                    result ~= format("%scase %s: .. case %s: return true;\n",
                         deeper, ival[0], ival[1]-1);
                }
            }
            result ~= deeper~"default: return false;\n"~indent~"}\n";
            return result;
        }

        // Emits a three-way branch around range[idx]: the intervals before it,
        // the interval itself, and the intervals after it.
        static string bisect(R)(R range, size_t idx, string indent)
        {
            string deeper = indent ~ "    ";
            // bisect on one [a, b) interval at idx
            string result = indent~"{\n";
            // less branch, < a
            result ~= format("%sif (ch < %s)\n%s",
                deeper, range[idx][0], binaryScope(range[0 .. idx], deeper));
            // middle point,  >= a && < b
            result ~= format("%selse if (ch < %s) return true;\n",
                deeper, range[idx][1]);
            // greater or equal branch,  >= b
            result ~= format("%selse\n%s",
                deeper, binaryScope(range[idx+1..$], deeper));
            return result~indent~"}\n";
        }

        // function header; the body is appended below
        string code = format("bool %s(dchar ch) @safe pure nothrow @nogc\n",
            funcName.empty ? "function" : funcName);
        // special case first bisection to be on ASCII vs beyond
        // (index of the first interval that starts above 0x80, or -1 if none)
        auto tillAscii = countUntil!"a[0] > 0x80"(range);
        if (tillAscii <= 0) // everything is ASCII or nothing is ascii (-1 & 0)
            code ~= binaryScope(range, "");
        else
            code ~= bisect(range, tillAscii, "");
        return code;
    }
2714 
2715     /**
2716         Generates string with D source code of unary function with name of
2717         `funcName` taking a single `dchar` argument. If `funcName` is empty
2718         the code is adjusted to be a lambda function.
2719 
2720         The function generated tests if the $(CODEPOINT) passed
2721         belongs to this set or not. The result is to be used with string mixin.
2722         The intended usage area is aggressive optimization via meta programming
2723         in parser generators and the like.
2724 
2725         Note: Use with care for relatively small or regular sets. It
2726         could end up being slower then just using multi-staged tables.
2727 
2728         Example:
2729         ---
2730         import std.stdio;
2731 
2732         // construct set directly from [a, b$RPAREN intervals
2733         auto set = CodepointSet(10, 12, 45, 65, 100, 200);
2734         writeln(set);
2735         writeln(set.toSourceCode("func"));
2736         ---
2737 
2738         The above outputs something along the lines of:
2739         ---
2740         bool func(dchar ch)  @safe pure nothrow @nogc
2741         {
2742             if (ch < 45)
2743             {
2744                 if (ch == 10 || ch == 11) return true;
2745                 return false;
2746             }
2747             else if (ch < 65) return true;
2748             else
2749             {
2750                 if (ch < 100) return false;
2751                 if (ch < 200) return true;
2752                 return false;
2753             }
2754         }
2755         ---
2756     */
2757     string toSourceCode(string funcName="")
2758     {
2759         import std.array : array;
2760         auto range = byInterval.array();
2761         return toSourceCode(range, funcName);
2762     }
2763 
2764     /**
2765         True if this set doesn't contain any $(CODEPOINTS).
2766     */
2767     @property bool empty() const
2768     {
2769         return data.length == 0;
2770     }
2771 
2772     ///
2773     pure @safe unittest
2774     {
2775         CodepointSet emptySet;
2776         assert(emptySet.length == 0);
2777         assert(emptySet.empty);
2778     }
2779 
2780 private:
2781     alias This = typeof(this);
2782     alias Marker = size_t;
2783 
2784     // a random-access range of integral pairs
2785     static struct Intervals(Range)
2786     {
2787         import std.range.primitives : hasAssignableElements;
2788 
2789         this(Range sp) scope
2790         {
2791             slice = sp;
2792             start = 0;
2793             end = sp.length;
2794         }
2795 
2796         this(Range sp, size_t s, size_t e) scope
2797         {
2798             slice = sp;
2799             start = s;
2800             end = e;
2801         }
2802 
2803         @property auto front()const
2804         {
2805             immutable a = slice[start];
2806             immutable b = slice[start+1];
2807             return CodepointInterval(a, b);
2808         }
2809 
2810         //may break sorted property - but we need std.sort to access it
2811         //hence package(std) protection attribute
2812         static if (hasAssignableElements!Range)
2813         package(std) @property void front(CodepointInterval val)
2814         {
2815             slice[start] = val.a;
2816             slice[start+1] = val.b;
2817         }
2818 
2819         @property auto back()const
2820         {
2821             immutable a = slice[end-2];
2822             immutable b = slice[end-1];
2823             return CodepointInterval(a, b);
2824         }
2825 
2826         //ditto about package
2827         static if (hasAssignableElements!Range)
2828         package(std) @property void back(CodepointInterval val)
2829         {
2830             slice[end-2] = val.a;
2831             slice[end-1] = val.b;
2832         }
2833 
2834         void popFront()
2835         {
2836             start += 2;
2837         }
2838 
2839         void popBack()
2840         {
2841             end -= 2;
2842         }
2843 
2844         auto opIndex(size_t idx) const
2845         {
2846             immutable a = slice[start+idx*2];
2847             immutable b = slice[start+idx*2+1];
2848             return CodepointInterval(a, b);
2849         }
2850 
2851         //ditto about package
2852         static if (hasAssignableElements!Range)
2853         package(std) void opIndexAssign(CodepointInterval val, size_t idx)
2854         {
2855             slice[start+idx*2] = val.a;
2856             slice[start+idx*2+1] = val.b;
2857         }
2858 
2859         auto opSlice(size_t s, size_t e)
2860         {
2861             return Intervals(slice, s*2+start, e*2+start);
2862         }
2863 
2864         @property size_t length()const {  return slice.length/2; }
2865 
2866         @property bool empty()const { return start == end; }
2867 
2868         @property auto save(){ return this; }
2869     private:
2870         size_t start, end;
2871         Range slice;
2872     }
2873 
    // called after construction from intervals
    // to make sure invariants hold
    //
    // Sorts the intervals stored in `data` by their start, merges those that
    // overlap or touch, and shrinks `data` to the merged result.
    void sanitize()
    {
        import std.algorithm.comparison : max;
        import std.algorithm.mutation : SwapStrategy;
        import std.algorithm.sorting : sort;
        if (data.length == 0)
            return;
        alias Ival = CodepointInterval;
        //intervals wrapper for a _range_ over packed array
        auto ivals = Intervals!(typeof(data[]))(data[]);
        //@@@BUG@@@ can't use "a.a < b.a" see
        // https://issues.dlang.org/show_bug.cgi?id=12265
        sort!((a,b) => a.a < b.a, SwapStrategy.stable)(ivals);
        // what follows is a variation on stable remove
        // differences:
        // - predicate is binary, and is tested against
        //   the last kept element (at 'i').
        // - predicate mutates lhs (merges rhs into lhs)
        size_t len = ivals.length;
        size_t i = 0; // index of the last kept (merged) interval
        size_t j = 1; // index of the next interval to examine
        while (j < len)
        {
            // overlapping or directly adjacent: fold j into i
            if (ivals[i].b >= ivals[j].a)
            {
                ivals[i] = Ival(ivals[i].a, max(ivals[i].b, ivals[j].b));
                j++;
            }
            else //unmergable
            {
                // check if there is a hole after merges
                // (in the best case we do 0 writes to ivals)
                if (j != i+1)
                    ivals[i+1] = ivals[j]; //copy over
                i++;
                j++;
            }
        }
        len = i + 1; // number of intervals kept
        // sanity check: every kept interval but the last is non-empty and
        // strictly separated from its successor
        for (size_t k=0; k + 1 < len; k++)
        {
            assert(ivals[k].a < ivals[k].b);
            assert(ivals[k].b < ivals[k+1].a);
        }
        // two array slots per interval
        data.length = len * 2;
    }
2922 
2923     // special case for normal InversionList
2924     ref subChar(dchar ch)
2925     {
2926         auto mark = skipUpTo(ch);
2927         if (mark != data.length
2928             && data[mark] == ch && data[mark-1] == ch)
2929         {
2930             // it has split, meaning that ch happens to be in one of intervals
2931             data[mark] = data[mark]+1;
2932         }
2933         return this;
2934     }
2935 
2936     //
2937     Marker addInterval(int a, int b, Marker hint=Marker.init) scope
2938     in
2939     {
2940         assert(a <= b);
2941     }
2942     do
2943     {
2944         import std.range : assumeSorted, SearchPolicy;
2945         auto range = assumeSorted(data[]);
2946         size_t pos;
2947         size_t a_idx = hint + range[hint..$].lowerBound!(SearchPolicy.gallop)(a).length;
2948         if (a_idx == range.length)
2949         {
2950             //  [---+++----++++----++++++]
2951             //  [                         a  b]
2952             data.append(a, b);
2953             return data.length-1;
2954         }
2955         size_t b_idx = range[a_idx .. range.length].lowerBound!(SearchPolicy.gallop)(b).length+a_idx;
2956         uint[3] buf = void;
2957         uint to_insert;
2958         debug(std_uni)
2959         {
2960             writefln("a_idx=%d; b_idx=%d;", a_idx, b_idx);
2961         }
2962         if (b_idx == range.length)
2963         {
2964             //  [-------++++++++----++++++-]
2965             //  [      s     a                 b]
2966             if (a_idx & 1)// a in positive
2967             {
2968                 buf[0] = b;
2969                 to_insert = 1;
2970             }
2971             else// a in negative
2972             {
2973                 buf[0] = a;
2974                 buf[1] = b;
2975                 to_insert = 2;
2976             }
2977             pos = genericReplace(data, a_idx, b_idx, buf[0 .. to_insert]);
2978             return pos - 1;
2979         }
2980 
2981         uint top = data[b_idx];
2982 
2983         debug(std_uni)
2984         {
2985             writefln("a_idx=%d; b_idx=%d;", a_idx, b_idx);
2986             writefln("a=%s; b=%s; top=%s;", a, b, top);
2987         }
2988         if (a_idx & 1)
2989         {// a in positive
2990             if (b_idx & 1)// b in positive
2991             {
2992                 //  [-------++++++++----++++++-]
2993                 //  [       s    a        b    ]
2994                 buf[0] = top;
2995                 to_insert = 1;
2996             }
2997             else // b in negative
2998             {
2999                 //  [-------++++++++----++++++-]
3000                 //  [       s    a   b         ]
3001                 if (top == b)
3002                 {
3003                     assert(b_idx+1 < data.length);
3004                     buf[0] = data[b_idx+1];
3005                     pos = genericReplace(data, a_idx, b_idx+2, buf[0 .. 1]);
3006                     return pos - 1;
3007                 }
3008                 buf[0] = b;
3009                 buf[1] = top;
3010                 to_insert = 2;
3011             }
3012         }
3013         else
3014         { // a in negative
3015             if (b_idx & 1) // b in positive
3016             {
3017                 //  [----------+++++----++++++-]
3018                 //  [     a     b              ]
3019                 buf[0] = a;
3020                 buf[1] = top;
3021                 to_insert = 2;
3022             }
3023             else// b in negative
3024             {
3025                 //  [----------+++++----++++++-]
3026                 //  [  a       s      b        ]
3027                 if (top == b)
3028                 {
3029                     assert(b_idx+1 < data.length);
3030                     buf[0] = a;
3031                     buf[1] = data[b_idx+1];
3032                     pos = genericReplace(data, a_idx, b_idx+2, buf[0 .. 2]);
3033                     return pos - 1;
3034                 }
3035                 buf[0] = a;
3036                 buf[1] = b;
3037                 buf[2] = top;
3038                 to_insert = 3;
3039             }
3040         }
3041         pos = genericReplace(data, a_idx, b_idx+1, buf[0 .. to_insert]);
3042         debug(std_uni)
3043         {
3044             writefln("marker idx: %d; length=%d", pos, data[pos], data.length);
3045             writeln("inserting ", buf[0 .. to_insert]);
3046         }
3047         return pos - 1;
3048     }
3049 
    // Removes from the set everything between the marker `pos` and the
    // code point `a` (exclusive), editing `data` in place.
    // `pos` must be the index of an interval start. Returns `pos`, except
    // when everything up to the end of `data` is removed, in which case
    // the result of genericReplace is returned.
    Marker dropUpTo(uint a, Marker pos=Marker.init)
    in
    {
        assert(pos % 2 == 0); // at start of interval
    }
    do
    {
        auto range = assumeSorted!"a <= b"(data[pos .. data.length]);
        if (range.empty)
            return pos;
        size_t idx = pos;
        // with the "a <= b" predicate this presumably counts the boundaries
        // that are <= a, so an odd idx means `a` lies inside an interval
        idx += range.lowerBound(a).length;

        debug(std_uni)
        {
            writeln("dropUpTo full length=", data.length);
            writeln(pos,"~~~", idx);
        }
        // every boundary from pos on is covered: drop them all
        if (idx == data.length)
            return genericReplace(data, pos, idx, cast(uint[])[]);
        if (idx & 1)
        {   // a in positive
            //[--+++----++++++----+++++++------...]
            //      |<---si       s  a  t
            // the interval containing a now starts at a
            genericReplace(data, pos, idx, [a]);
        }
        else
        {   // a in negative
            //[--+++----++++++----+++++++-------+++...]
            //      |<---si              s  a  t
            // whole intervals only: remove them without replacement
            genericReplace(data, pos, idx, cast(uint[])[]);
        }
        return pos;
    }
3085 
    // Advances from the marker `pos` to the code point `a` without removing
    // anything. If `a` falls strictly inside an interval, that interval is
    // split at `a` (a zero-width gap is inserted) so that the returned marker
    // is always the index of an interval start, or data.length.
    Marker skipUpTo(uint a, Marker pos=Marker.init)
    out(result)
    {
        assert(result % 2 == 0);// always start of interval
        //(may be  0-width after-split)
    }
    do
    {
        assert(data.length % 2 == 0);
        auto range = assumeSorted!"a <= b"(data[pos .. data.length]);
        // with the "a <= b" predicate this presumably counts the boundaries
        // that are <= a, so an odd idx means `a` lies inside an interval
        size_t idx = pos+range.lowerBound(a).length;

        if (idx >= data.length) // could have Marker point to recently removed stuff
            return data.length;

        if (idx & 1)// inside of interval, check for split
        {

            immutable top = data[idx]; // end of the interval containing a
            // NOTE(review): if idx counts all boundaries <= a, data[idx] is
            // greater than a and this branch looks unreachable - confirm
            if (top == a)// no need to split, it's end
                return idx+1;
            immutable start = data[idx-1]; // start of the interval containing a
            // a is the first code point of the interval: nothing to split
            if (a == start)
                return idx-1;
            // split it up
            genericReplace(data, idx, idx+1, [a, a, top]);
            return idx+1;        // avoid odd index
        }
        return idx;
    }
3117 
3118     CowArray!SP data;
3119 }
3120 
pure @safe unittest
{
    import std.conv : to;
    // the ASCII set is rendered as a single open-right interval
    auto ascii = unicode.ASCII;
    assert(to!string(ascii) == "[0..128)");
}
3126 
// Byte-wise read of the idx-th 24-bit value stored at `ptr`.
// Pedantic version for CTFE and aligned-access only architectures:
// it touches exactly the three bytes of the element.
@system private uint safeRead24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc
{
    immutable size_t base = idx * 3;
    immutable uint b0 = ptr[base];
    immutable uint b1 = ptr[base + 1];
    immutable uint b2 = ptr[base + 2];
    // assemble the bytes in the byte order of the target
    version (LittleEndian)
        return b0 | (b1 << 8) | (b2 << 16);
    else
        return (b0 << 16) | (b1 << 8) | b2;
}
3138 
// Byte-wise write of the low 24 bits of `val` as the idx-th 24-bit value
// stored at `ptr`. Counterpart of safeRead24: it touches exactly the three
// bytes of the element.
@system private void safeWrite24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc
{
    immutable size_t base = idx * 3;
    immutable ubyte low  = cast(ubyte)(val & 0xFF);
    immutable ubyte mid  = cast(ubyte)((val >> 8) & 0xFF);
    immutable ubyte high = cast(ubyte)((val >> 16) & 0xFF);
    // store the bytes in the byte order of the target
    version (LittleEndian)
    {
        ptr[base] = low;
        ptr[base + 1] = mid;
        ptr[base + 2] = high;
    }
    else
    {
        ptr[base] = high;
        ptr[base + 1] = mid;
        ptr[base + 2] = low;
    }
}
3156 
// unaligned x86-like read/write functions
//
// Reads the idx-th 24-bit value stored at `ptr` with a single 32-bit load
// and discards the byte that does not belong to the element. Note that the
// load covers four bytes starting at ptr + 3*idx.
@system private uint unalignedRead24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc
{
    immutable uint word = *cast(const(uint)*)(ptr + 3*idx);
    version (LittleEndian)
        return word & 0xFF_FFFF;
    else
        return word >> 8;
}
3166 
// Writes the low 24 bits of `val` as the idx-th 24-bit value stored at `ptr`
// using one 32-bit read-modify-write. The fourth byte covered by the access
// belongs to the neighbouring element and is preserved.
@system private void unalignedWrite24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc
{
    uint* dest = cast(uint*)(cast(ubyte*) ptr + 3*idx);
    version (LittleEndian)
    {
        // mask val so that bits above the 24th cannot leak into the
        // neighbouring byte (safeWrite24 drops them as well)
        *dest = (val & 0xFF_FFFF) | (*dest & 0xFF00_0000);
    }
    else
    {
        // the shift already discards the top 8 bits of val
        *dest = (val << 8) | (*dest & 0xFF);
    }
}
3176 
// Reads the idx-th 24-bit value stored at `ptr`. The unaligned 32-bit load
// is used only at run time and only when hasUnalignedReads is set; CTFE and
// all other targets go through the byte-wise safeRead24.
@system private uint read24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc
{
    static if (hasUnalignedReads)
    {
        if (!__ctfe)
            return unalignedRead24(ptr, idx);
    }
    return safeRead24(ptr, idx);
}
3184 
// Writes `val` as the idx-th 24-bit value stored at `ptr`. The unaligned
// 32-bit store is used only at run time and only when hasUnalignedReads is
// set; CTFE and all other targets go through the byte-wise safeWrite24.
@system private void write24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc
{
    static if (hasUnalignedReads)
    {
        if (!__ctfe)
        {
            unalignedWrite24(ptr, val, idx);
            return;
        }
    }
    safeWrite24(ptr, val, idx);
}
3192 
// Reference-counted array of `uint` with value semantics: copies share the
// same memory until one of them is modified. The reference count is stored
// in one extra slot right after the payload, so `data.length` is either 0
// (empty, no count) or payload length + 1. Allocation, reallocation and
// destruction are delegated to the storage policy `SP`.
struct CowArray(SP=GcPolicy)
{
    import std.range.primitives : hasLength;

  @safe:
    // Wraps `arr` without copying its elements: the policy appends the
    // ref-count slot (initialized to 1) to it.
    static auto reuse(uint[] arr)
    {
        CowArray cow;
        cow.data = arr;
        SP.append(cow.data, 1);
        assert(cow.refCount == 1);
        assert(cow.length == arr.length);
        return cow;
    }

    // Constructs from a range with known length.
    this(Range)(Range range)
        if (isInputRange!Range && hasLength!Range)
    {
        import std.algorithm.mutation : copy;
        immutable len = range.length;
        length = len;
        // `data` stays empty for an empty range, so slice by `len`:
        // `$-1` would underflow on an empty array
        copy(range, data[0 .. len]);
    }

    // Constructs from a forward range without length by walking it once
    // to find out how much to allocate.
    this(Range)(Range range)
        if (isForwardRange!Range && !hasLength!Range)
    {
        import std.algorithm.mutation : copy;
        import std.range.primitives : walkLength;
        immutable len = walkLength(range.save);
        length = len;
        // see above: slice by `len` so that an empty range is handled
        copy(range, data[0 .. len]);
    }

    // a copy is one more reference to the same memory
    this(this)
    {
        if (!empty)
        {
            refCount = refCount + 1;
        }
    }

    // the last reference destroys the memory, others only decrement the count
    ~this()
    {
        if (!empty)
        {
            immutable cnt = refCount;
            if (cnt == 1)
                SP.destroy(data);
            else
                refCount = cnt - 1;
        }
    }

    // no ref-count for empty U24 array
    @property bool empty() const { return data.length == 0; }

    // report one less than actual size
    @property size_t length() const
    {
        return data.length ? data.length - 1 : 0;
    }

    //+ an extra slot for ref-count
    @property void length(size_t len)
    {
        import std.algorithm.comparison : min;
        import std.algorithm.mutation : copy;
        if (len == 0)
        {
            if (!empty)
                freeThisReference();
            return;
        }
        immutable total = len + 1; // including ref-count
        if (empty)
        {
            data = SP.alloc!uint(total);
            refCount = 1;
            return;
        }
        immutable cur_cnt = refCount;
        if (cur_cnt != 1) // have more references to this memory
        {
            refCount = cur_cnt - 1;
            auto new_data = SP.alloc!uint(total);
            // take shrinking into account
            auto to_copy = min(total, data.length) - 1;
            copy(data[0 .. to_copy], new_data[0 .. to_copy]);
            data = new_data; // before setting refCount!
            refCount = 1;
        }
        else // 'this' is the only reference
        {
            // use the realloc (hopefully in-place operation)
            data = SP.realloc(data, total);
            refCount = 1; // setup a ref-count in the new end of the array
        }
    }

    alias opDollar = length;

    uint opIndex()(size_t idx)const
    {
        return data[idx];
    }

    // writing through a shared reference first detaches a private copy
    void opIndexAssign(uint val, size_t idx)
    {
        auto cnt = refCount;
        if (cnt != 1)
            dupThisReference(cnt);
        data[idx] = val;
    }

    // A mutable slice may be written through, so a shared reference is
    // detached before the slice is handed out.
    auto opSlice(size_t from, size_t to)
    {
        if (!empty)
        {
            auto cnt = refCount;
            if (cnt != 1)
                dupThisReference(cnt);
        }
        return data[from .. to];

    }

    //
    auto opSlice(size_t from, size_t to) const
    {
        return data[from .. to];
    }

    // length slices before the ref count
    auto opSlice()
    {
        return opSlice(0, length);
    }

    // ditto
    auto opSlice() const
    {
        return opSlice(0, length);
    }

    // Appends all elements of `range`.
    void append(Range)(Range range)
        if (isInputRange!Range && hasLength!Range && is(ElementType!Range : uint))
    {
        // imported locally, like in the other methods that use copy
        import std.algorithm.mutation : copy;
        size_t nl = length + range.length;
        length = nl;
        copy(range, this[nl-range.length .. nl]);
    }

    // Appends the given values.
    void append()(uint[] val...)
    {
        // nothing to append; also keeps `$-1` below from underflowing
        // when the array is empty
        if (val.length == 0)
            return;
        length = length + val.length;
        data[$-val.length-1 .. $-1] = val[];
    }

    // compares payloads only, the ref-count slot is excluded
    bool opEquals()(auto const ref CowArray rhs)const
    {
        if (empty ^ rhs.empty)
            return false; // one is empty and the other isn't
        return empty || data[0..$-1] == rhs.data[0..$-1];
    }

private:
    // ref-count is right after the data
    @property uint refCount() const
    {
        return data[$-1];
    }

    @property void refCount(uint cnt)
    {
        data[$-1] = cnt;
    }

    // Gives up this reference: decrements the shared count or, for the last
    // reference, destroys the memory. Leaves `data` null either way.
    void freeThisReference()
    {
        immutable count = refCount;
        if (count != 1) // have more references to this memory
        {
            // dec shared ref-count
            refCount = count - 1;
            data = [];
        }
        else
            SP.destroy(data);
        assert(!data.ptr);
    }

    // Detaches from shared memory: copies the payload into a fresh chunk
    // owned by this reference alone. `count` is the current (shared) count.
    void dupThisReference(uint count)
    in
    {
        assert(!empty && count != 1 && count == refCount);
    }
    do
    {
        import std.algorithm.mutation : copy;
        // dec shared ref-count
        refCount = count - 1;
        // copy to the new chunk of RAM
        auto new_data = SP.alloc!uint(data.length);
        // bit-blit old stuff except the counter
        copy(data[0..$-1], new_data[0..$-1]);
        data = new_data; // before setting refCount!
        refCount = 1; // so that this updates the right one
    }

    uint[] data;
}
3405 
// Exercises CowArray (aliased as U24A below) with both storage policies:
// element access, resizing, appending, copy-on-write isolation and opEquals.
pure @safe unittest// Uint24 tests
{
    import std.algorithm.comparison : equal;
    import std.algorithm.mutation : copy;
    import std.conv : text;
    import std.range : iota, chain;
    import std.range.primitives : isBidirectionalRange, isOutputRange;
    // Mutates an array received by reference while a copy (u24_c) is alive;
    // the copy must keep seeing the values it had when it was taken.
    void funcRef(T)(ref T u24)
    {
        u24.length = 2;
        u24[1] = 1024;
        T u24_c = u24;
        assert(u24[1] == 1024);
        u24.length = 0;
        assert(u24.empty);
        u24.append([1, 2]);
        assert(equal(u24[], [1, 2]));
        u24.append(111);
        assert(equal(u24[], [1, 2, 111]));
        // the earlier copy is unaffected by the resize/appends above
        assert(!u24_c.empty && u24_c[1] == 1024);
        u24.length = 3;
        copy(iota(0, 3), u24[]);
        assert(equal(u24[], iota(0, 3)));
        assert(u24_c[1] == 1024);
    }

    // Receives the array by value, makes further copies and mutates one of
    // them through funcRef; the remaining copies must stay unchanged.
    void func2(T)(T u24)
    {
        T u24_2 = u24;
        T u24_3;
        u24_3 = u24_2;
        assert(u24_2 == u24_3);
        assert(equal(u24[], u24_2[]));
        assert(equal(u24_2[], u24_3[]));
        funcRef(u24_3);

        // only u24_3 was changed
        assert(equal(u24_3[], iota(0, 3)));
        assert(!equal(u24_2[], u24_3[]));
        assert(equal(u24_2[], u24[]));
        u24_2 = u24_3;
        assert(equal(u24_2[], iota(0, 3)));
        // to test that passed arg is intact outside
        // plus try out opEquals
        u24 = u24_3;
        u24 = T.init;
        u24_3 = T.init;
        assert(u24.empty);
        assert(u24 == u24_3); // empty vs empty
        assert(u24 != u24_2); // empty vs non-empty
    }

    static foreach (Policy; AliasSeq!(GcPolicy, ReallocPolicy))
    {{
        alias Range = typeof(CowArray!Policy.init[]);
        alias U24A = CowArray!Policy;
        // the slice type must satisfy the range concepts used elsewhere
        static assert(isForwardRange!Range);
        static assert(isBidirectionalRange!Range);
        static assert(isOutputRange!(Range, uint));
        static assert(isRandomAccessRange!(Range));

        auto arr = U24A([42u, 36, 100]);
        assert(arr[0] == 42);
        assert(arr[1] == 36);
        arr[0] = 72;
        arr[1] = 0xFE_FEFE;
        assert(arr[0] == 72);
        assert(arr[1] == 0xFE_FEFE);
        assert(arr[2] == 100);
        U24A arr2 = arr;
        assert(arr2[0] == 72);
        arr2[0] = 11;
        // test COW-ness
        assert(arr[0] == 72);
        assert(arr2[0] == 11);
        // set this to about 100M to stress-test COW memory management
        foreach (v; 0 .. 10_000)
            func2(arr);
        // func2 takes its argument by value, so arr itself is untouched
        assert(equal(arr[], [72, 0xFE_FEFE, 100]));

        // construction from a range, then overwriting a sub-slice in place
        auto r2 = U24A(iota(0, 100));
        assert(equal(r2[], iota(0, 100)), text(r2[]));
        copy(iota(10, 170, 2), r2[10 .. 90]);
        assert(equal(r2[], chain(iota(0, 10), iota(10, 170, 2), iota(90, 100)))
               , text(r2[]));
    }}
}
3492 
// Checks InversionList.add (merging of adjacent/overlapping intervals) and the
// skipUpTo/dropUpTo helpers for both storage policies. Intervals are
// half-open: add(a, b) covers [a, b).
pure @safe unittest// core set primitives test
{
    import std.conv : text;
    alias AllSets = AliasSeq!(InversionList!GcPolicy, InversionList!ReallocPolicy);
    foreach (CodeList; AllSets)
    {
        CodeList a;
        //"plug a hole" test
        a.add(10, 20).add(25, 30).add(15, 27);
        assert(a == CodeList(10, 30), text(a));

        auto x = CodeList.init;
        x.add(10, 20).add(30, 40).add(50, 60);

        a = x;
        a.add(20, 49);//[10, 49) [50, 60)
        assert(a == CodeList(10, 49, 50 ,60));

        a = x;
        a.add(20, 50);
        assert(a == CodeList(10, 60), text(a));

        // simple unions, mostly edge effects
        x = CodeList.init;
        x.add(10, 20).add(40, 60);

        a = x;
        a.add(10, 25); //[10, 25) [40, 60)
        assert(a == CodeList(10, 25, 40, 60));

        a = x;
        a.add(5, 15); //[5, 20) [40, 60)
        assert(a == CodeList(5, 20, 40, 60));

        a = x;
        a.add(0, 10); // [0, 20) [40, 60)
        assert(a == CodeList(0, 20, 40, 60));

        a = x;
        a.add(0, 5); // prepend
        assert(a == CodeList(0, 5, 10, 20, 40, 60), text(a));

        a = x;
        a.add(5, 20);
        assert(a == CodeList(5, 20, 40, 60));

        a = x;
        a.add(3, 37);
        assert(a == CodeList(3, 37, 40, 60));

        a = x;
        a.add(37, 65);
        assert(a == CodeList(10, 20, 37, 65));

        // some tests on helpers for set intersection
        x = CodeList.init.add(10, 20).add(40, 60).add(100, 120);
        a = x;

        // skipUpTo returns a marker that the two-argument dropUpTo accepts
        // as its starting point
        auto m = a.skipUpTo(60);
        a.dropUpTo(110, m);
        assert(a == CodeList(10, 20, 40, 60, 110, 120), text(a.data[]));

        a = x;
        a.dropUpTo(100);
        assert(a == CodeList(100, 120), text(a.data[]));

        a = x;
        m = a.skipUpTo(50);
        a.dropUpTo(140, m);
        assert(a == CodeList(10, 20, 40, 50), text(a.data[]));
        a = x;
        a.dropUpTo(60);
        assert(a == CodeList(100, 120), text(a.data[]));
    }
}
3568 
3569 
// Tests that the CodepointSet constructor accepts interval pairs in any order,
// including overlapping and adjacent ones, and yields merged, sorted intervals.
pure @safe unittest
{
    import std.algorithm.comparison : equal;
    import std.conv : text, to;
    import std.range : chain, iota;
    import std.typecons : tuple;
    //ensure constructor handles bad ordering and overlap
    auto c1 = CodepointSet('а', 'я'+1, 'А','Я'+1);
    foreach (ch; chain(iota('а', 'я'+1), iota('А','Я'+1)))
        assert(ch in c1, to!string(ch));

    //contiguous
    assert(CodepointSet(1000, 1006, 1006, 1009)
        .byInterval.equal([tuple(1000, 1009)]));
    //contains
    assert(CodepointSet(900, 1200, 1000, 1100)
        .byInterval.equal([tuple(900, 1200)]));
    //intersect left
    assert(CodepointSet(900, 1100, 1000, 1200)
        .byInterval.equal([tuple(900, 1200)]));
    //intersect right
    assert(CodepointSet(1000, 1200, 900, 1100)
        .byInterval.equal([tuple(900, 1200)]));

    //ditto with extra items at end
    assert(CodepointSet(1000, 1200, 900, 1100, 800, 850)
        .byInterval.equal([tuple(800, 850), tuple(900, 1200)]));
    assert(CodepointSet(900, 1100, 1000, 1200, 800, 850)
        .byInterval.equal([tuple(800, 850), tuple(900, 1200)]));

    //"plug a hole" test
    auto c2 = CodepointSet(20, 40,
        60, 80, 100, 140, 150, 200,
        40, 60, 80, 100, 140, 150
    );
    assert(c2.byInterval.equal([tuple(20, 200)]));

    // second batch of pairs overlaps and extends the first batch
    auto c3 = CodepointSet(
        20, 40, 60, 80, 100, 140, 150, 200,
        0, 10, 15, 100, 10, 20, 200, 220);
    assert(c3.byInterval.equal([tuple(0, 140), tuple(150, 220)]));
}
3613 
3614 
// Checks the binary set operators of InversionList for both storage policies:
// `|` (union), `&` (intersection), `-` (difference) and `~` (symmetric
// difference). Each operator is also checked with swapped operands where the
// result must be the same.
pure @safe unittest
{   // full set operations
    import std.conv : text;
    alias AllSets = AliasSeq!(InversionList!GcPolicy, InversionList!ReallocPolicy);
    foreach (CodeList; AllSets)
    {
        CodeList a, b, c, d;

        //"plug a hole"
        a.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
        b.add(40, 60).add(80, 100).add(140, 150);
        c = a | b;
        d = b | a;
        assert(c == CodeList(20, 200), text(CodeList.stringof," ", c));
        assert(c == d, text(c," vs ", d));

        b = CodeList.init.add(25, 45).add(65, 85).add(95,110).add(150, 210);
        c = a | b; //[20,45) [60, 85) [95, 140) [150, 210)
        d = b | a;
        assert(c == CodeList(20, 45, 60, 85, 95, 140, 150, 210), text(c));
        assert(c == d, text(c," vs ", d));

        b = CodeList.init.add(10, 20).add(30,100).add(145,200);
        c = a | b;//[10, 140) [145, 200)
        d = b | a;
        assert(c == CodeList(10, 140, 145, 200));
        assert(c == d, text(c," vs ", d));

        b = CodeList.init.add(0, 10).add(15, 100).add(10, 20).add(200, 220);
        c = a | b;//[0, 140) [150, 220)
        d = b | a;
        assert(c == CodeList(0, 140, 150, 220));
        assert(c == d, text(c," vs ", d));


        // intersection
        a = CodeList.init.add(20, 40).add(60, 80);
        b = CodeList.init.add(25, 35).add(65, 75);
        c = a & b;
        d = b & a;
        assert(c == CodeList(25, 35, 65, 75), text(c));
        assert(c == d, text(c," vs ", d));

        a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
        b = CodeList.init.add(25, 35).add(65, 75).add(110, 130).add(160, 180);
        c = a & b;
        d = b & a;
        assert(c == CodeList(25, 35, 65, 75, 110, 130, 160, 180), text(c));
        assert(c == d, text(c," vs ", d));

        a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
        b = CodeList.init.add(10, 30).add(60, 120).add(135, 160);
        c = a & b;//[20, 30)[60, 80) [100, 120) [135, 140) [150, 160)
        d = b & a;

        assert(c == CodeList(20, 30, 60, 80, 100, 120, 135, 140, 150, 160),text(c));
        assert(c == d, text(c, " vs ",d));
        // intersecting a result with either of its operands changes nothing
        assert((c & a) == c);
        assert((d & b) == d);
        assert((c & d) == d);

        b = CodeList.init.add(40, 60).add(80, 100).add(140, 200);
        c = a & b;
        d = b & a;
        assert(c == CodeList(150, 200), text(c));
        assert(c == d, text(c, " vs ",d));
        assert((c & a) == c);
        assert((d & b) == d);
        assert((c & d) == d);

        assert((a & a) == a);
        assert((b & b) == b);

        // difference
        a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
        b = CodeList.init.add(30, 60).add(75, 120).add(190, 300);
        c = a - b;// [20, 30) [60, 75) [120, 140) [150, 190)
        d = b - a;// [40, 60) [80, 100) [200, 300)
        assert(c == CodeList(20, 30, 60, 75, 120, 140, 150, 190), text(c));
        assert(d == CodeList(40, 60, 80, 100, 200, 300), text(d));
        // c and d are disjoint, so subtracting one from the other is a no-op
        assert(c - d == c, text(c-d, " vs ", c));
        assert(d - c == d, text(d-c, " vs ", d));
        assert(c - c == CodeList.init);
        assert(d - d == CodeList.init);

        a = CodeList.init.add(20, 40).add( 60, 80).add(100, 140).add(150,            200);
        b = CodeList.init.add(10,  50).add(60,                           160).add(190, 300);
        c = a - b;// [160, 190)
        d = b - a;// [10, 20) [40, 50) [80, 100) [140, 150) [200, 300)
        assert(c == CodeList(160, 190), text(c));
        assert(d == CodeList(10, 20, 40, 50, 80, 100, 140, 150, 200, 300), text(d));
        assert(c - d == c, text(c-d, " vs ", c));
        assert(d - c == d, text(d-c, " vs ", d));
        assert(c - c == CodeList.init);
        assert(d - d == CodeList.init);

        // symmetric difference
        a = CodeList.init.add(20,    40).add(60, 80).add(100,      140).add(150,  200);
        b = CodeList.init.add(10, 30).add(45,         100).add(130,             190);
        c = a ~ b; // [10, 20) [30, 40) [45, 60) [80, 130) [140, 150) [190, 200)
        d = b ~ a;
        assert(c == CodeList(10, 20, 30, 40, 45, 60, 80, 130, 140, 150, 190, 200),
               text(c));
        assert(c == d, text(c, " vs ", d));
    }
}
3718 
3719 }
3720 
// Set operators with a single code point as the right-hand operand.
pure @safe unittest// vs single dchar
{
    import std.conv : text;
    auto a = CodepointSet(10, 100, 120, 200);
    // removing 'A' (65) splits [10, 100) around it
    auto withoutA = a - 'A';
    assert(withoutA == CodepointSet(10, 65, 66, 100, 120, 200), text(withoutA));
    // intersecting with 'B' (66) leaves just that code point
    auto onlyB = a & 'B';
    assert(onlyB == CodepointSet(66, 67));
}
3728 
// Checks byInterval / byCodepoint iteration and membership via opIndex.
pure @safe unittest// iteration & opIndex
{
    import std.algorithm.comparison : equal;
    import std.conv : text;
    import std.typecons : tuple, Tuple;

    // NOTE(review): only the ReallocPolicy instantiation is listed here
    static foreach (CodeList; AliasSeq!(InversionList!(ReallocPolicy)))
    {{
        auto arr = "ABCDEFGHIJKLMabcdefghijklm"d;
        auto a = CodeList('A','N','a', 'n');
        assert(equal(a.byInterval,
                [tuple(cast(uint)'A', cast(uint)'N'), tuple(cast(uint)'a', cast(uint)'n')]
            ), text(a.byInterval));

        // same @@@BUG as in https://issues.dlang.org/show_bug.cgi?id=8949 ?
        version (bug8949)
        {
            import std.range : retro;
            assert(equal(retro(a.byInterval),
                [tuple(cast(uint)'a', cast(uint)'n'), tuple(cast(uint)'A', cast(uint)'N')]
            ), text(retro(a.byInterval)));
        }
        // code point iteration must visit exactly the members, in order
        auto achr = a.byCodepoint;
        assert(equal(achr, arr), text(a.byCodepoint));
        foreach (ch; a.byCodepoint)
            assert(a[ch]);
        auto x = CodeList(100, 500, 600, 900, 1200, 1500);
        assert(equal(x.byInterval, [ tuple(100, 500), tuple(600, 900), tuple(1200, 1500)]), text(x.byInterval));
        foreach (ch; x.byCodepoint)
            assert(x[ch]);
        // round-trip through the interval-range constructor (CodepointSet only)
        static if (is(CodeList == CodepointSet))
        {
            auto y = CodeList(x.byInterval);
            assert(equal(x.byInterval, y.byInterval));
        }
        // an empty set yields empty ranges
        assert(equal(CodepointSet.init.byInterval, cast(Tuple!(uint, uint)[])[]));
        assert(equal(CodepointSet.init.byCodepoint, cast(dchar[])[]));
    }}
}
3768 
3769 //============================================================================
3770 // Generic Trie template and various ways to build it
3771 //============================================================================
3772 
// Debugging aid: renders `x` as text. Arrays longer than 32 elements are
// abbreviated to their first and last 16 elements joined by "~...~".
auto arrayRepr(T)(T x)
{
    import std.conv : text;
    enum edge = 16; // number of elements kept at each end
    immutable len = x.length;
    if (len <= 2 * edge)
        return text(x);
    return text(x[0 .. edge],"~...~", x[len - edge .. len]);
}
3784 
3785 /**
3786     Maps `Key` to a suitable integer index within the range of `size_t`.
3787     The mapping is constructed by applying predicates from `Prefix` left to right
3788     and concatenating the resulting bits.
3789 
3790     The first (leftmost) predicate defines the most significant bits of
3791     the resulting index.
3792  */
template mapTrieIndex(Prefix...)
{
    // Builds the index by concatenating the bits produced by each predicate,
    // first predicate in the most significant position.
    size_t mapTrieIndex(Key)(Key key)
        if (isValidPrefixForTrie!(Key, Prefix))
    {
        size_t acc;
        acc |= Prefix[0](key);
        static foreach (lvl; 1 .. Prefix.length)
        {
            // make room for the next predicate's bits, then merge them in
            acc <<= Prefix[lvl].bitSize;
            acc |= Prefix[lvl](key);
        }
        return acc;
    }
}
3809 
3810 /*
3811     `TrieBuilder` is a type used for incremental construction
3812     of $(LREF Trie)s.
3813 
3814     See $(LREF buildTrie) for generic helpers built on top of it.
3815 */
@trusted private struct TrieBuilder(Value, Key, Args...)
if (isBitPackableType!Value && isValidArgsForTrie!(Key, Args))
{
    import std.exception : enforce;

private:
    // last index is not stored in table, it is used as an offset to values in a block.
    static if (is(Value == bool))// always pack bool
        alias V = BitPacked!(Value, 1);
    else
        alias V = Value;
    // Number of distinct indices addressable by `Preds`:
    // the product of 2^^bitSize over all predicates.
    static auto deduceMaxIndex(Preds...)()
    {
        size_t idx = 1;
        foreach (v; Preds)
            idx *= 2^^v.bitSize;
        return idx;
    }

    static if (is(typeof(Args[0]) : Key)) // Args start with upper bound on Key
    {
        alias Prefix = Args[1..$];
        enum lastPageSize = 2^^Prefix[$-1].bitSize;
        enum translatedMaxIndex = mapTrieIndex!(Prefix)(Args[0]);
        // round the bound up to a whole number of last-level pages
        enum roughedMaxIndex =
            (translatedMaxIndex + lastPageSize-1)/lastPageSize*lastPageSize;
        // check for wrap-around - if wrapped, use the default deduction rule
        enum maxIndex = roughedMaxIndex < translatedMaxIndex ?
            deduceMaxIndex!(Prefix)() : roughedMaxIndex;
    }
    else
    {
        alias Prefix = Args;
        enum maxIndex = deduceMaxIndex!(Prefix)();
    }

    // maps a Key to its full-width index
    alias getIndex = mapTrieIndex!(Prefix);

    enum lastLevel = Prefix.length-1;
    // per-level bookkeeping; size_t.max means "no such page known"
    // NOTE(review): idx_ones is initialised in the constructor but not
    // otherwise referenced within this struct
    struct ConstructState
    {
        size_t idx_zeros, idx_ones;
    }
    // iteration over levels of Trie, each indexes its own level and thus a shortened domain
    size_t[Prefix.length] indices;
    // default filler value to use
    Value defValue;
    // this is a full-width index of next item
    size_t curIndex;
    // all-zeros page index, all-ones page index (+ indicator if there is such a page)
    ConstructState[Prefix.length] state;
    // the table being constructed
    MultiArray!(idxTypes!(Key, fullBitSize!(Prefix), Prefix[0..$]), V) table;

    @disable this();

    //shortcut for index variable at level 'level'
    @property ref idx(size_t level)(){ return indices[level]; }

    // Writes `numVals` consecutive copies of `val` at the current position
    // of level `level`, spilling into further pages whenever a page fills up.
    // this function assumes no holes in the input so
    // indices are going one by one
    void addValue(size_t level, T)(T val, size_t numVals)
    {
        alias j = idx!level;
        enum pageSize = 1 << Prefix[level].bitSize;
        if (numVals == 0)
            return;
        auto ptr = table.slice!(level);
        if (numVals == 1)
        {
            static if (level == Prefix.length-1)
                ptr[j] = val;
            else
            {// can incur narrowing conversion
                assert(j < ptr.length);
                ptr[j] = force!(typeof(ptr[j]))(val);
            }
            j++;
            // the write above completed a page
            if (j % pageSize == 0)
                spillToNextPage!level(ptr);
            return;
        }
        // longer row of values
        // get to the next page boundary
        immutable nextPB = (j + pageSize) & ~(pageSize-1);
        immutable n =  nextPB - j;// can fill right in this page
        if (numVals < n) //fits in current page
        {
            ptr[j .. j+numVals]  = val;
            j += numVals;
            return;
        }
        static if (level != 0)//on the first level it always fits
        {
            numVals -= n;
            //write till the end of current page
            ptr[j .. j+n]  = val;
            j += n;
            //spill to the next page
            spillToNextPage!level(ptr);
            // page at once loop
            if (state[level].idx_zeros != size_t.max && val == T.init)
            {
                // an all-zeros page is already known: rather than writing
                // whole pages of T.init, point the previous level at that
                // page once per full page
                alias NextIdx = typeof(table.slice!(level-1)[0]);
                addValue!(level-1)(force!NextIdx(state[level].idx_zeros),
                    numVals/pageSize);
                ptr = table.slice!level; //table structure might have changed
                numVals %= pageSize;
            }
            else
            {
                // write and spill one full page at a time
                while (numVals >= pageSize)
                {
                    numVals -= pageSize;
                    ptr[j .. j+pageSize]  = val;
                    j += pageSize;
                    spillToNextPage!level(ptr);
                }
            }
            if (numVals)
            {
                // the leftovers, an incomplete page
                ptr[j .. j+numVals]  = val;
                j += numVals;
            }
        }
    }

    // Called when the page ending at idx!level has just been completed.
    void spillToNextPage(size_t level, Slice)(ref Slice ptr)
    {
        // last level (i.e. topmost) has 1 "page"
        // thus it need not to add a new page on upper level
        static if (level != 0)
            spillToNextPageImpl!(level)(ptr);
    }

    // this can re-use the current page if duplicate or allocate a new one
    // it also makes sure that previous levels point to the correct page in this level
    void spillToNextPageImpl(size_t level, Slice)(ref Slice ptr)
    {
        alias NextIdx = typeof(table.slice!(level-1)[0]);
        NextIdx next_lvl_index;
        enum pageSize = 1 << Prefix[level].bitSize;
        assert(idx!level % pageSize == 0);
        // start offset of the page that was just completed
        immutable last = idx!level-pageSize;
        const slice = ptr[idx!level - pageSize .. idx!level];
        size_t j;
        // compare the completed page against every earlier page of this level
        for (j=0; j<last; j+=pageSize)
        {
            if (ptr[j .. j+pageSize] == slice)
            {
                // get index to it, reuse ptr space for the next block
                next_lvl_index = force!NextIdx(j/pageSize);
                version (none)
                {
                import std.stdio : writefln, writeln;
                writefln("LEVEL(%s) page mapped idx: %s: 0..%s  ---> [%s..%s]"
                        ,level
                        ,indices[level-1], pageSize, j, j+pageSize);
                writeln("LEVEL(", level
                        , ") mapped page is: ", slice, ": ", arrayRepr(ptr[j .. j+pageSize]));
                writeln("LEVEL(", level
                        , ") src page is :", ptr, ": ", arrayRepr(slice[0 .. pageSize]));
                }
                idx!level -= pageSize; // reuse this page, it is duplicate
                break;
            }
        }
        // the loop ran to completion, i.e. no earlier duplicate was found
        if (j == last)
        {
    // NOTE(review): no goto in this function targets this label
    L_allocate_page:
            next_lvl_index = force!NextIdx(idx!level/pageSize - 1);
            // remember the first page for which ptr.zeros reports true
            if (state[level].idx_zeros == size_t.max && ptr.zeros(j, j+pageSize))
            {
                state[level].idx_zeros = next_lvl_index;
            }
            // allocate next page
            version (none)
            {
            import std.stdio : writefln;
            writefln("LEVEL(%s) page allocated: %s"
                     , level, arrayRepr(slice[0 .. pageSize]));
            writefln("LEVEL(%s) index: %s ; page at this index %s"
                     , level
                     , next_lvl_index
                     , arrayRepr(
                         table.slice!(level)
                          [pageSize*next_lvl_index..(next_lvl_index+1)*pageSize]
                        ));
            }
            table.length!level = table.length!level + pageSize;
        }
    // NOTE(review): no goto in this function targets this label
    L_know_index:
        // for the previous level, values are indices to the pages in the current level
        addValue!(level-1)(next_lvl_index, 1);
        ptr = table.slice!level; //re-load the slice after moves
    }

    // idx - full-width index to fill with v (full-width index != key)
    // fills everything in the range of [curIndex, idx) with filler
    void putAt(size_t idx, Value v)
    {
        assert(idx >= curIndex);
        immutable numFillers = idx - curIndex;
        addValue!lastLevel(defValue, numFillers);
        addValue!lastLevel(v, 1);
        curIndex = idx + 1;
    }

    // ditto, but sets the range of [idxA, idxB) to v
    void putRangeAt(size_t idxA, size_t idxB, Value v)
    {
        assert(idxA >= curIndex);
        assert(idxB >= idxA);
        size_t numFillers = idxA - curIndex;
        addValue!lastLevel(defValue, numFillers);
        addValue!lastLevel(v, idxB - idxA);
        curIndex = idxB; // open-right
    }

    // message used when keys arrive out of order (see putRange/putValue)
    enum errMsg = "non-monotonic prefix function(s), an unsorted range or "~
        "duplicate key->value mapping";

public:
    /**
        Construct a builder, where `filler` is a value
        to indicate empty slots (or "not found" condition).
    */
    this(Value filler)
    {
        curIndex = 0;
        defValue = filler;
        // zeros-page index, ones-page index
        foreach (ref v; state)
            v = ConstructState(size_t.max, size_t.max);
        table = typeof(table)(indices);
        // one page per level is a bootstrap minimum
        foreach (i, Pred; Prefix)
            table.length!i = (1 << Pred.bitSize);
    }

    /**
        Put a value `v` into interval as
        mapped by keys from `a` to `b`.
        All slots prior to `a` are filled with
        the default filler.

        The interval is half-open: the slot mapped by `b` is not written.
        Throws if the mapped indices are not monotonically increasing.
    */
    void putRange(Key a, Key b, Value v)
    {
        auto idxA = getIndex(a), idxB = getIndex(b);
        // indexes of key should always grow
        enforce(idxB >= idxA && idxA >= curIndex, errMsg);
        putRangeAt(idxA, idxB, v);
    }

    /**
        Put a value `v` into slot mapped by `key`.
        All slots prior to `key` are filled with the
        default filler.

        Throws if the index of `key` lies before the current position.
    */
    void putValue(Key key, Value v)
    {
        auto idx = getIndex(key);
        enforce(idx >= curIndex, errMsg);
        putAt(idx, v);
    }

    /// Finishes construction of Trie, yielding an immutable Trie instance.
    auto build()
    {
        static if (maxIndex != 0) // doesn't cover full range of size_t
        {
            // pad the remainder of the domain with the filler
            assert(curIndex <= maxIndex);
            addValue!lastLevel(defValue, maxIndex - curIndex);
        }
        else
        {
            if (curIndex != 0 // couldn't wrap around
                || (Prefix.length != 1 && indices[lastLevel] == 0)) // can be just empty
            {
                // size_t.max - curIndex fillers plus one more covers all
                // remaining slots without overflowing the count
                addValue!lastLevel(defValue, size_t.max - curIndex);
                addValue!lastLevel(defValue, 1);
            }
            // else curIndex already completed the full range of size_t by wrapping around
        }
        return Trie!(V, Key, maxIndex, Prefix)(table);
    }
}
4104 
4105 /**
4106     $(P A generic Trie data-structure for a fixed number of stages.
4107     The design goal is optimal speed with smallest footprint size.
4108     )
4109     $(P It's intentionally read-only and doesn't provide constructors.
4110      To construct one use a special builder,
4111      see $(LREF TrieBuilder) and $(LREF buildTrie).
4112     )
4113 
4114 */
@trusted private struct Trie(Value, Key, Args...)
if (isValidPrefixForTrie!(Key, Args)
    || (isValidPrefixForTrie!(Key, Args[1..$])
    && is(typeof(Args[0]) : size_t)))
{
    import std.range.primitives : isOutputRange;
    // Args is either a list of predicates, or a maximum index
    // followed by a list of predicates.
    static if (is(typeof(Args[0]) : size_t))
    {
        private enum maxIndex = Args[0];
        private enum hasBoundsCheck = true;
        private alias Prefix = Args[1..$];
    }
    else
    {
        private enum hasBoundsCheck = false;
        private alias Prefix = Args;
    }

    // wraps an already built table
    private this()(typeof(_table) table)
    {
        _table = table;
    }

    // only for constant Tries constructed from precompiled tables
    private this()(const(size_t)[] offsets, const(size_t)[] sizes,
        const(size_t)[] data) const
    {
        _table = typeof(_table)(offsets, sizes, data);
    }

    /**
        $(P Lookup the `key` in this `Trie`. )

        $(P The lookup always succeeds if key fits the domain
        provided during construction. The whole domain defined
        is covered so instead of not found condition
        the sentinel (filler) value could be used. )

        $(P See $(LREF buildTrie), $(LREF TrieBuilder) for how to
        define a domain of `Trie` keys and the sentinel value. )

        Note:
        Domain range-checking is only enabled in debug builds
        and results in assertion failure.
    */
    TypeOfBitPacked!Value opIndex()(Key key) const
    {
        static if (hasBoundsCheck)
            assert(mapTrieIndex!Prefix(key) < maxIndex);
        size_t idx;
        alias p = Prefix;
        // the first predicate indexes directly into level 0
        idx = cast(size_t) p[0](key);
        // the entry read at level i, shifted by the next predicate's bit
        // width, plus that predicate's bits gives the index into level i+1
        foreach (i, v; p[0..$-1])
            idx = cast(size_t)((_table.ptr!i[idx]<<p[i+1].bitSize) + p[i+1](key));
        return _table.ptr!(p.length-1)[idx];
    }

    /// Forwards to the `bytes!n` property of the underlying table.
    @property size_t bytes(size_t n=size_t.max)() const
    {
        return _table.bytes!n;
    }

    /// `bytes!n` divided by 2^^bitSize of level `n`, rounded to the nearest integer.
    @property size_t pages(size_t n)() const
    {
        return (bytes!n+2^^(Prefix[n].bitSize-1))
                /2^^Prefix[n].bitSize;
    }

    /// Forwards to the `store` method of the underlying table, passing `sink`.
    void store(OutRange)(scope OutRange sink) const
        if (isOutputRange!(OutRange, char))
    {
        _table.store(sink);
    }

private:
    // storage for all levels of the trie
    MultiArray!(idxTypes!(Key, fullBitSize!(Prefix), Prefix[0..$]), Value) _table;
}
4195 
4196 // create a tuple of 'sliceBits' that slice the 'top' of bits into pieces of sizes 'sizes'
4197 // left-to-right, the most significant bits first
template GetBitSlicing(size_t top, sizes...)
{
    static if (sizes.length == 0)
    {
        // no widths left: the slicing is complete
        alias GetBitSlicing = AliasSeq!();
    }
    else
    {
        // the first width takes bits [top - sizes[0], top);
        // the remaining widths continue below that
        alias GetBitSlicing =
            AliasSeq!(sliceBits!(top - sizes[0], top),
                      GetBitSlicing!(top - sizes[0], sizes[1 .. $]));
    }
}
4207 
// callableWith!T yields a predicate template: it is true for `Pred` when
// `Pred(T.init)` compiles and its result type is bit-packable.
template callableWith(T)
{
    template callableWith(alias Pred)
    {
        static if (is(typeof(Pred(T.init)) Result))
            enum callableWith = isBitPackableType!(TypeOfBitPacked!(Result));
        else
            enum callableWith = false; // not callable with a T at all
    }
}
4221 
4222 /*
4223     Check if `Prefix` is a valid set of predicates
4224     for `Trie` template having `Key` as the type of keys.
4225     This requires all predicates to be callable, take
4226     single argument of type `Key` and return unsigned value.
4227 */
template isValidPrefixForTrie(Key, Prefix...)
{
    import std.meta : allSatisfy;
    // every predicate must pass the callableWith!Key check
    alias acceptsKey = callableWith!Key;
    enum isValidPrefixForTrie = allSatisfy!(acceptsKey, Prefix); // TODO: tighten the screws
}
4233 
/*
    Check if `Args` is a valid argument list for a `Trie` having `Key`
    as the type of keys. Two layouts are accepted:

    - predicates only;
    - a maximum key value (anything whose type converts to `Key`)
      followed by predicates.

    A single argument has to be a predicate itself; an empty `Args`
    is never valid.
*/
template isValidArgsForTrie(Key, Args...)
{
    static if (Args.length > 1)
    {
        enum isValidArgsForTrie = isValidPrefixForTrie!(Key, Args)
            || (isValidPrefixForTrie!(Key, Args[1..$]) && is(typeof(Args[0]) : Key));
    }
    else static if (Args.length == 1)
    {
        // `Key` must be passed along explicitly: without it the lone
        // predicate would be bound to the `Key` parameter of
        // isValidPrefixForTrie and be checked against an empty prefix,
        // which is vacuously true.
        enum isValidArgsForTrie = isValidPrefixForTrie!(Key, Args);
    }
    else
        enum isValidArgsForTrie = false; // no predicates at all
}
4248 
// Adds up all compile-time integers in `ints`; an empty list sums to 0.
@property size_t sumOfIntegerTuple(ints...)()
{
    size_t total = 0;
    foreach (term; ints)
    {
        total += term;
    }
    return total;
}
4256 
/**
    A shorthand for creating a custom multi-level fixed Trie
    from a `CodepointSet`. `sizes` gives the number of bits used on each
    level, starting with the most significant bits.

    Note: The sum of `sizes` must be equal to 21.

    See_Also: $(LREF toTrie), which is even simpler.

    Example:
    ---
    {
        import std.stdio;
        auto set = unicode("Number");
        auto trie = codepointSetTrie!(8, 5, 8)(set);
        writeln("Input code points to test:");
        foreach (line; stdin.byLine)
        {
            int count=0;
            foreach (dchar ch; line)
                if (trie[ch])// is number
                    count++;
            writefln("Contains %d number code points.", count);
        }
    }
    ---
*/
public template codepointSetTrie(sizes...)
if (sumOfIntegerTuple!sizes == 21)
{
    auto codepointSetTrie(Set)(Set set)
    if (isCodepointSet!Set)
    {
        // every code point maps to false unless covered by an interval of `set`
        auto trieBuilder =
            TrieBuilder!(bool, dchar, lastDchar+1, GetBitSlicing!(21, sizes))(false);
        foreach (interval; set.byInterval)
        {
            trieBuilder.putRange(interval[0], interval[1], true);
        }
        return trieBuilder.build();
    }
}
4296 
/// Type of Trie generated by codepointSetTrie function.
public template CodepointSetTrie(sizes...)
if (sumOfIntegerTuple!sizes == 21)
{
    alias Prefix = GetBitSlicing!(21, sizes);
    // the type is taken from what a freshly created builder would return
    alias CodepointSetTrie =
        typeof(TrieBuilder!(bool, dchar, lastDchar+1, Prefix)(false).build());
}
4304 
/**
    A slightly more general tool for building fixed `Trie`
    for the Unicode data.

    Specifically, unlike `codepointSetTrie`, it allows creating mappings
    of `dchar` to an arbitrary type `T`.

    Note: Overload taking `CodepointSet`s will naturally convert
    only to bool mapping `Trie`s.

    CodepointTrie is the type of Trie as generated by codepointTrie function.
*/
public template codepointTrie(T, sizes...)
if (sumOfIntegerTuple!sizes == 21)
{
    alias Prefix = GetBitSlicing!(21, sizes);

    // Only available when the mapped type is bool (possibly bit-packed).
    static if (is(TypeOfBitPacked!T == bool))
    {
        auto codepointTrie(Set)(const scope Set set)
            if (isCodepointSet!Set)
        {
            // `sizes` has to be forwarded: codepointSetTrie without template
            // arguments can never satisfy its "sum of sizes == 21" constraint.
            return codepointSetTrie!sizes(set);
        }
    }

    /// Builds the trie from an associative array; absent keys map to `defValue`.
    auto codepointTrie()(T[dchar] map, T defValue=T.init)
    {
        return buildTrie!(T, dchar, Prefix)(map, defValue);
    }

    // unsorted range of pairs
    /// Builds the trie from a range of (value, key) pairs in arbitrary order.
    auto codepointTrie(R)(R range, T defValue=T.init)
        if (isInputRange!R
            && is(typeof(ElementType!R.init[0]) : T)
            && is(typeof(ElementType!R.init[1]) : dchar))
    {
        // build from unsorted array of pairs
        // TODO: expose index sorting functions for Trie
        return buildTrie!(T, dchar, Prefix)(range, defValue, true);
    }
}
4349 
@system pure unittest
{
    import std.algorithm.comparison : max;
    import std.algorithm.searching : count;

    // the keys of the mapping: all code points of the Greek script
    auto greek = unicode.Greek;

    // Stand-in for a user-defined property (or an expensive function)
    // worth caching in a trie: the "luck" of a character is the highest
    // number of identical hex digits among the six 4-bit chunks of its
    // code point (leading zeros count too),
    // e.g. \u1F11 is 0x001F11 and has three 1s, hence a luck factor of 3.
    static uint luckFactor(dchar ch)
    {
        ubyte[6] digits; // least significant hex digit first
        uint rest = ch;
        foreach (ref digit; digits)
        {
            digit = rest & 0xF;
            rest >>= 4;
        }
        uint best;
        foreach (digit; digits)
            best = cast(uint) max(best, count(digits[], digit));
        return best;
    }

    // only unsigned built-ins are supported at the moment
    alias LuckFactor = BitPacked!(uint, 3);

    // collect the values in a temporary associative array first
    LuckFactor[dchar] table;
    foreach (ch; greek.byCodepoint)
        table[ch] = LuckFactor(luckFactor(ch));

    // the bits per stage are picked arbitrarily, feel free to optimize
    auto trie = codepointTrie!(LuckFactor, 8, 5, 8)(table);

    // the associative array is no longer needed, the trie answers alone
    foreach (ch; greek.byCodepoint)
        assert(trie[ch] == luckFactor(ch)); // verify
    // CJK is not Greek, thus it has the default value
    assert(trie['\u4444'] == 0);
    // and here is a couple of quite lucky Greek characters:
    // Greek small letter epsilon with dasia
    assert(trie['\u1F11'] == 3);
    // Ancient Greek metretes sign
    assert(trie['\U00010181'] == 3);

}
4401 
/// ditto
public template CodepointTrie(T, sizes...)
if (sumOfIntegerTuple!sizes == 21)
{
    alias Prefix = GetBitSlicing!(21, sizes);
    // the type is taken from what a builder filled with `T.init` would return
    alias CodepointTrie =
        typeof(TrieBuilder!(T, dchar, lastDchar+1, Prefix)(T.init).build());
}
4409 
// "Less than" for (value, key) tuples: the keys (second field) are compared
// after being mapped through `Pred`; the values are ignored.
package(std) template cmpK0(alias Pred)
{
    import std.typecons : Tuple;
    static bool cmpK0(Value, Key)
        (Tuple!(Value, Key) a, Tuple!(Value, Key) b)
    {
        auto lhs = Pred(a[1]);
        auto rhs = Pred(b[1]);
        return lhs < rhs;
    }
}
4419 
/**
    The most general utility for construction of `Trie`s
    short of using `TrieBuilder` directly.

    Offers several convenience overloads that differ in the kind of input.
    `Args` is a tuple holding the maximum key value followed by
    the predicates that construct the index from a key.

    If the first argument is not a value convertible to `Key`,
    all of `Args` are taken to be predicates
    and the maximum Key is deduced from them.
*/
private template buildTrie(Value, Key, Args...)
if (isValidArgsForTrie!(Key, Args))
{
    // a leading argument convertible to Key is the upper bound on Key,
    // everything after it (or everything, if there is none) is a predicate
    static if (!is(typeof(Args[0]) : Key))
        alias Prefix = Args;
    else
        alias Prefix = Args[1..$];

    alias getIndex = mapTrieIndex!(Prefix);

    // comparators for multiSort: one per level, first level first
    template GetComparators(size_t n)
    {
        static if (n == 0)
            alias GetComparators = AliasSeq!();
        else
            alias GetComparators =
                AliasSeq!(GetComparators!(n-1), cmpK0!(Prefix[n-1]));
    }

    /*
        Build `Trie` from a range of (Value, Key) pairs that is expected
        to be ordered by Key according to:
        ------
        (a, b) => mapTrieIndex!(Prefix)(a) < mapTrieIndex!(Prefix)(b)
        ------
        That is, $(LREF mapTrieIndex) should be a monotonically increasing
        function that maps `Key` to an integer.
        NOTE(review): the original comment states that an exception is thrown
        when this order is violated; that check is not part of this template,
        confirm in `TrieBuilder`.

        See_Also: $(REF sort, std,_algorithm),
        $(REF SortedRange, std,range),
        $(REF setUnion, std,_algorithm).
    */
    auto buildTrie(Range)(Range range, Value filler=Value.init)
    if (isInputRange!Range && is(typeof(Range.init.front[0]) : Value)
        && is(typeof(Range.init.front[1]) : Key))
    {
        auto trieBuilder = TrieBuilder!(Value, Key, Prefix)(filler);
        foreach (pair; range)
        {
            // pair layout is (value, key)
            trieBuilder.putValue(pair[1], pair[0]);
        }
        return trieBuilder.build();
    }

    /*
        For a bool `Value` (or BitPacked!(bool, x)) the `Trie` can be built
        from a range of open-right intervals of `Key`s.
        The ordering requirement on keys is the same as for the
        Key-Value range overload.

        Keys inside the intervals map to !`filler`; with the default
        `filler` (false) that means they map to true.
    */
    auto buildTrie(Range)(Range range, Value filler=Value.init)
    if (is(TypeOfBitPacked!Value ==  bool)
        && isInputRange!Range && is(typeof(Range.init.front[0]) : Key)
        && is(typeof(Range.init.front[1]) : Key))
    {
        auto trieBuilder = TrieBuilder!(Value, Key, Prefix)(filler);
        foreach (interval; range)
        {
            trieBuilder.putRange(interval[0], interval[1], !filler);
        }
        return trieBuilder.build();
    }

    /*
        Same input as the (Value, Key) pair overload, but the range is
        first sorted in place, level by level, when `unsorted` is true.
    */
    auto buildTrie(Range)(Range range, Value filler, bool unsorted)
    if (isInputRange!Range
        && is(typeof(Range.init.front[0]) : Value)
        && is(typeof(Range.init.front[1]) : Key))
    {
        import std.algorithm.sorting : multiSort;
        alias Comps = GetComparators!(Prefix.length);
        if (unsorted)
        {
            multiSort!(Comps)(range);
        }
        return buildTrie(range, filler);
    }

    /*
        For a bool `Value` (or BitPacked!(bool, x)) the `Trie` can be built
        from a plain input range of `Key`s.
        The ordering requirement on keys is the same as for the
        Key-Value range overload.

        Every key found in the range maps to !`filler`; with the default
        `filler` (false) that means it maps to true.
    */
    auto buildTrie(Range)(Range range, Value filler=Value.init)
    if (is(TypeOfBitPacked!Value ==  bool)
        && isInputRange!Range && is(typeof(Range.init.front) : Key))
    {
        auto trieBuilder = TrieBuilder!(Value, Key, Prefix)(filler);
        foreach (key; range)
        {
            trieBuilder.putValue(key, !filler);
        }
        return trieBuilder.build();
    }

    /*
        For an unsigned integer `Key` the `Trie` can be built from an array
        of values, the index into the array being the key.
    */
    auto buildTrie()(Value[] array, Value filler=Value.init)
    if (isUnsigned!Key)
    {
        auto trieBuilder = TrieBuilder!(Value, Key, Prefix)(filler);
        foreach (key, value; array)
        {
            trieBuilder.putValue(key, value);
        }
        return trieBuilder.build();
    }

    /*
        Builds `Trie` from associative array: its entries are copied into an
        array of (value, key) tuples which is then sorted and inserted.
    */
    auto buildTrie(Key, Value)(Value[Key] map, Value filler=Value.init)
    {
        import std.array : array;
        import std.range : zip;
        auto pairs = zip(map.values, map.keys).array;
        return buildTrie(pairs, filler, true); // sort it
    }
}
4555 
// Index functor used in place of assumeSize to
// reduce mangled name & help DMD inline Trie functors.
// Calling it returns the argument converted to size_t;
// `bitSize` advertises the width (`bits`) of that value.
struct clamp(size_t bits)
{
    enum bitSize = bits;

    static size_t opCall(T)(T arg)
    {
        return arg;
    }
}
4563 
// Index functor that selects element `idx` of an indexable key and returns
// it converted to size_t; `bitSize` advertises the width (`bits`) of it.
struct clampIdx(size_t idx, size_t bits)
{
    enum bitSize = bits;

    static size_t opCall(T)(T arg)
    {
        return arg[idx];
    }
}
4569 
/**
    Conceptual type that outlines the common properties of all UTF Matchers.

    Note: For illustration purposes only, every method
    call results in assertion failure.
    Use $(LREF utfMatcher) to obtain a concrete matcher
    for UTF-8 or UTF-16 encodings.
*/
public struct MatcherConcept
{
    /**
        $(P Perform the semantic equivalent of 2 operations:
        decoding a $(CODEPOINT) at the front of `inp` and testing if
        it belongs to the set of $(CODEPOINTS) of this matcher. )

        $(P The effect on `inp` depends on the kind of function called:)

        $(P Match. If the codepoint is found in the set then range `inp`
        is advanced by its size in $(S_LINK Code unit, code units),
        otherwise the range is not modified.)

        $(P Skip. The range is always advanced by the size
        of the tested $(CODEPOINT) regardless of the result of test.)

        $(P Test. The range is left unaffected regardless
        of the result of test.)
    */
    public bool match(Range)(ref Range inp)
    if (isRandomAccessRange!Range && is(ElementType!Range : char))
    {
        assert(0);
    }

    ///ditto
    public bool skip(Range)(ref Range inp)
    if (isRandomAccessRange!Range && is(ElementType!Range : char))
    {
        assert(0);
    }

    ///ditto
    public bool test(Range)(ref Range inp)
    if (isRandomAccessRange!Range && is(ElementType!Range : char))
    {
        assert(0);
    }
    ///
    pure @safe unittest
    {
        string truth = "2² = 4";
        auto m = utfMatcher!char(unicode.Number);
        assert(m.match(truth)); // '2' is a number all right
        assert(truth == "² = 4"); // skips on match
        assert(m.match(truth)); // so is the superscript '2'
        assert(!m.match(truth)); // space is not a number
        assert(truth == " = 4"); // unaffected on no match
        assert(!m.skip(truth)); // same test ...
        assert(truth == "= 4"); // but skips a codepoint regardless
        assert(!m.test(truth)); // '=' is not a number
        assert(truth == "= 4"); // test never affects argument
    }

    /**
        Advanced feature - provide direct access to a subset of the matcher
        based on a set of known encoding lengths. Lengths are provided in
        $(S_LINK Code unit, code units). The sub-matcher may then perform
        fewer operations per `test`/`match`.

        Use with care as the sub-matcher won't match
        any $(CODEPOINTS) whose encoded length doesn't belong
        to the selected set of lengths. Also the sub-matcher object references
        the parent matcher and must not be used past the lifetime
        of the latter.

        Another caveat of using a sub-matcher is that skip is not available,
        precisely because the sub-matcher doesn't detect all lengths.
    */
    @property auto subMatcher(Lengths...)()
    {
        assert(false);
        return this;
    }

    pure @safe unittest
    {
        auto matcher = utfMatcher!char(unicode.Number);
        string square = "2²";
        // about sub-matchers
        assert(!matcher.subMatcher!(2,3,4).test(square)); // ASCII not covered
        assert(matcher.subMatcher!1.match(square)); // ASCII-only, works
        assert(!matcher.subMatcher!1.test(square)); // unicode '²'
        assert(matcher.subMatcher!(2,3,4).match(square));  //
        assert(square == "");
        wstring wsquare = "2²";
        auto matcher16 = utfMatcher!wchar(unicode.Number);
        // may keep ref, but the original (matcher16) must be kept alive
        auto bmp = matcher16.subMatcher!1;
        assert(bmp.match(wsquare)); // Okay, in basic multilingual plane
        assert(bmp.match(wsquare)); // And '²' too
    }
}
4671 
/**
    Test if `M` is a UTF Matcher for ranges of `C`.
*/
public enum isUtfMatcher(M, C) = __traits(compiles, (){
    // NOTE(review): the checks below are run-time `assert`s on
    // `is(typeof(...))` expressions; such an expression is well-formed even
    // when the call inside it is not, so as written only the three
    // declarations can make this lambda fail to compile - confirm whether
    // `static assert` was intended.
    C[] units;
    auto dec = units.decoder;
    M matcher;
    assert(is(typeof(matcher.match(dec)) == bool));
    assert(is(typeof(matcher.test(dec)) == bool));
    // skip is optional (sub-matchers do not provide it)
    static if (is(typeof(matcher.skip(dec))))
    {
        assert(is(typeof(matcher.skip(dec)) == bool));
        assert(is(typeof(matcher.skip(units)) == bool));
    }
    assert(is(typeof(matcher.match(units)) == bool));
    assert(is(typeof(matcher.test(units)) == bool));
});
4689 
pure @safe unittest
{
    alias Matcher8 = typeof(utfMatcher!char(CodepointSet.init));
    alias Matcher16 = typeof(utfMatcher!wchar(CodepointSet.init));
    // both mutable and immutable code units have to be accepted
    static assert(isUtfMatcher!(Matcher8, char));
    static assert(isUtfMatcher!(Matcher8, immutable(char)));
    static assert(isUtfMatcher!(Matcher16, wchar));
    static assert(isUtfMatcher!(Matcher16, immutable(wchar)));
}
4699 
// How a matcher operation treats its input range after the lookup.
enum Mode
{
    alwaysSkip,   // advance past the code point whatever the result
    neverSkip,    // leave the range untouched
    skipOnMatch   // advance only when the code point matched
}
4705 
// Mixed into matchers: `fwdStr!"name"(str)` calls the member function
// `name` of the host with `str` reinterpreted in place as the type that
// `byCodeUnit(str)` returns, so the callee works on (and may advance)
// the caller's array.
mixin template ForwardStrings()
{
    private bool fwdStr(string fn, C)(ref C[] str) const @trusted
    {
        import std.utf : byCodeUnit;
        alias Wrapped = typeof(byCodeUnit(str));
        auto view = cast(Wrapped*) &str;
        return mixin(fn ~ "(*view)");
    }
}
4715 
4716 template Utf8Matcher()
4717 {
4718     enum validSize(int sz) = sz >= 1 && sz <= 4;
4719 
4720     void badEncoding() pure @safe
4721     {
4722         import std.utf : UTFException;
4723         throw new UTFException("Invalid UTF-8 sequence");
4724     }
4725 
4726     //for 1-stage ASCII
4727     alias AsciiSpec = AliasSeq!(bool, char, clamp!7);
4728     //for 2-stage lookup of 2 byte UTF-8 sequences
4729     alias Utf8Spec2 = AliasSeq!(bool, char[2],
4730         clampIdx!(0, 5), clampIdx!(1, 6));
4731     //ditto for 3 byte
4732     alias Utf8Spec3 = AliasSeq!(bool, char[3],
4733         clampIdx!(0, 4),
4734         clampIdx!(1, 6),
4735         clampIdx!(2, 6)
4736     );
4737     //ditto for 4 byte
4738     alias Utf8Spec4 = AliasSeq!(bool, char[4],
4739         clampIdx!(0, 3), clampIdx!(1, 6),
4740         clampIdx!(2, 6), clampIdx!(3, 6)
4741     );
4742     alias Tables = AliasSeq!(
4743         typeof(TrieBuilder!(AsciiSpec)(false).build()),
4744         typeof(TrieBuilder!(Utf8Spec2)(false).build()),
4745         typeof(TrieBuilder!(Utf8Spec3)(false).build()),
4746         typeof(TrieBuilder!(Utf8Spec4)(false).build())
4747     );
4748     alias Table(int size) = Tables[size-1];
4749 
4750     enum leadMask(size_t size) = (cast(size_t) 1<<(7 - size))-1;
4751     enum encMask(size_t size) = ((1 << size)-1)<<(8-size);
4752 
4753     char truncate()(char ch) pure @safe
4754     {
4755         ch -= 0x80;
4756         if (ch < 0x40)
4757         {
4758             return ch;
4759         }
4760         else
4761         {
4762             badEncoding();
4763             return cast(char) 0;
4764         }
4765     }
4766 
4767     static auto encode(size_t sz)(dchar ch)
4768         if (sz > 1)
4769     {
4770         import std.utf : encodeUTF = encode;
4771         char[4] buf;
4772         encodeUTF(buf, ch);
4773         char[sz] ret;
4774         buf[0] &= leadMask!sz;
4775         foreach (n; 1 .. sz)
4776             buf[n] = buf[n] & 0x3f; //keep 6 lower bits
4777         ret[] = buf[0 .. sz];
4778         return ret;
4779     }
4780 
4781     auto build(Set)(Set set)
4782     {
4783         import std.algorithm.iteration : map;
4784         auto ascii = set & unicode.ASCII;
4785         auto utf8_2 = set & CodepointSet(0x80, 0x800);
4786         auto utf8_3 = set & CodepointSet(0x800, 0x1_0000);
4787         auto utf8_4 = set & CodepointSet(0x1_0000, lastDchar+1);
4788         auto asciiT = ascii.byCodepoint.map!(x=>cast(char) x).buildTrie!(AsciiSpec);
4789         auto utf8_2T = utf8_2.byCodepoint.map!(x=>encode!2(x)).buildTrie!(Utf8Spec2);
4790         auto utf8_3T = utf8_3.byCodepoint.map!(x=>encode!3(x)).buildTrie!(Utf8Spec3);
4791         auto utf8_4T = utf8_4.byCodepoint.map!(x=>encode!4(x)).buildTrie!(Utf8Spec4);
4792         alias Ret = Impl!(1,2,3,4);
4793         return Ret(asciiT, utf8_2T, utf8_3T, utf8_4T);
4794     }
4795 
    // Bootstrap UTF-8 static matcher interface
    // from 3 primitives: tab!(size), lookup and Sizes
    // (the host that mixes this in also has to provide fwdStr's target
    // functions; leadMask, encMask and badEncoding come from Utf8Matcher)
    mixin template DefMatcher()
    {
        import std.format : format;
        import std.meta : Erase, staticIndexOf;
        // true when 1 code unit (ASCII) sequences are among the handled sizes
        enum hasASCII = staticIndexOf!(1, Sizes) >= 0;
        // the handled sizes without the ASCII one
        alias UniSizes = Erase!(1, Sizes);

        //generate dispatch code sequence for unicode parts
        // The result is an if/else chain, one test per size in UniSizes, that
        // compares the marker bits of the lead byte `ch` and forwards to
        // lookup!(size, mode)(inp). The generated text refers to `ch`, `inp`
        // and `mode`, which must exist at the point where it is mixed in.
        static auto genDispatch()
        {
            string code;
            foreach (size; UniSizes)
                code ~= format(q{
                    if ((ch & ~leadMask!%d) == encMask!(%d))
                        return lookup!(%d, mode)(inp);
                    else
                }, size, size, size);
            // final else branch: with all 4 sizes handled an unrecognized
            // lead byte is an encoding error, otherwise it is just "no match"
            static if (Sizes.length == 4) //covers all code unit cases
                code ~= "{ badEncoding(); return false; }";
            else
                code ~= "return false;"; //may be just fine but not covered
            return code;
        }
        enum dispatch = genDispatch();

        // Tests the code point at the front of `inp` and advances `inp`
        // past it only when it matched.
        public bool match(Range)(ref Range inp) const
            if (isRandomAccessRange!Range && is(ElementType!Range : char) &&
                !isDynamicArray!Range)
        {
            enum mode = Mode.skipOnMatch;
            assert(!inp.empty);
            immutable ch = inp[0];
            static if (hasASCII)
            {
                // ASCII is answered by the 1-stage table directly
                if (ch < 0x80)
                {
                    immutable r = tab!1[ch];
                    if (r)
                        inp.popFront();
                    return r;
                }
                else
                    mixin(dispatch);
            }
            else
                mixin(dispatch);
        }

        static if (Sizes.length == 4) // can skip iff can detect all encodings
        {
            // Tests the code point at the front of `inp` and advances `inp`
            // past it regardless of the result.
            public bool skip(Range)(ref Range inp) const
                if (isRandomAccessRange!Range && is(ElementType!Range : char) &&
                    !isDynamicArray!Range)
            {
                enum mode = Mode.alwaysSkip;
                assert(!inp.empty);
                auto ch = inp[0];
                static if (hasASCII)
                {
                    if (ch < 0x80)
                    {
                        inp.popFront();
                        return tab!1[ch];
                    }
                    else
                        mixin(dispatch);
                }
                else
                    mixin(dispatch);
            }
        }

        // Tests the code point at the front of `inp`; `inp` is never advanced.
        public bool test(Range)(ref Range inp) const
            if (isRandomAccessRange!Range && is(ElementType!Range : char) &&
                !isDynamicArray!Range)
        {
            enum mode = Mode.neverSkip;
            assert(!inp.empty);
            auto ch = inp[0];

            static if (hasASCII)
            {
                if (ch < 0x80)
                    return tab!1[ch];
                else
                    mixin(dispatch);
            }
            else
                mixin(dispatch);
        }

        // Overloads for character arrays: forwarded through fwdStr to the
        // range versions above.
        bool match(C)(ref C[] str) const
            if (isSomeChar!C)
        {
            return fwdStr!"match"(str);
        }

        bool skip(C)(ref C[] str) const
            if (isSomeChar!C)
        {
            return fwdStr!"skip"(str);
        }

        bool test(C)(ref C[] str) const
            if (isSomeChar!C)
        {
            return fwdStr!"test"(str);
        }

        mixin ForwardStrings;
    }
4909 
    // UTF-8 matcher owning one lookup table per sequence length in `Sizes`.
    struct Impl(Sizes...)
    {
        import std.meta : allSatisfy, staticMap;
        static assert(allSatisfy!(validSize, Sizes),
            "Only lengths of 1, 2, 3 and 4 code unit are possible for UTF-8");
    private:
        //pick tables for chosen sizes
        alias OurTabs = staticMap!(Table, Sizes);
        OurTabs tables;
        mixin DefMatcher;
        //static dispatch helper UTF size ==> table
        alias tab(int i) = tables[i - 1];

        // Returns a CherryPick restricted to `SizesToPick`; it is constructed
        // from the address of this matcher.
        package(std) @property CherryPick!(Impl, SizesToPick) subMatcher(SizesToPick...)()
        {
            return CherryPick!(Impl, SizesToPick)(&this);
        }

        // Looks up the `size` code unit sequence at the front of `inp` in the
        // table for that size. Truncated input, invalid continuation bytes
        // and overlong encodings are reported through badEncoding();
        // `mode` selects whether and when `inp` is advanced.
        bool lookup(int size, Mode mode, Range)(ref Range inp) const
        {
            import std.range : popFrontN;
            if (inp.length < size)
            {
                badEncoding();
                return false;
            }
            // the table key: payload bits of every code unit, markers removed
            char[size] needle = void;
            needle[0] = leadMask!size & inp[0];
            static foreach (i; 1 .. size)
            {
                needle[i] = truncate(inp[i]);
            }
            //overlong encoding checks
            static if (size == 2)
            {
                //0x80-0x7FF
                //got 6 bits in needle[1], must use at least 8 bits
                //must use at least 2 bits in needle[1]
                if (needle[0] < 2) badEncoding();
            }
            else static if (size == 3)
            {
                //0x800-0xFFFF
                //got 6 bits in needle[2], must use at least 12bits
                //must use 6 bits in needle[1] or anything in needle[0]
                if (needle[0] == 0 && needle[1] < 0x20) badEncoding();
            }
            else static if (size == 4)
            {
                //0x1_0000-0x10_FFFF
                //got 2x6=12 bits in needle[2 .. 3] must use at least 17bits
                //must use 5 bits (or above) in needle[1] or anything in needle[0]
                if (needle[0] == 0 && needle[1] < 0x10) badEncoding();
            }
            static if (mode == Mode.alwaysSkip)
            {
                inp.popFrontN(size);
                return tab!size[needle];
            }
            else static if (mode == Mode.neverSkip)
            {
                return tab!size[needle];
            }
            else
            {
                static assert(mode == Mode.skipOnMatch);

                if (tab!size[needle])
                {
                    inp.popFrontN(size);
                    return true;
                }
                else
                    return false;
            }
        }
    }
4987 
4988     struct CherryPick(I, Sizes...)
4989     {
4990         import std.meta : allSatisfy;
4991         static assert(allSatisfy!(validSize, Sizes),
4992             "Only lengths of 1, 2, 3 and 4 code unit are possible for UTF-8");
4993     private:
4994         I* m;
4995         @property auto tab(int i)() const { return m.tables[i - 1]; }
4996         bool lookup(int size, Mode mode, Range)(ref Range inp) const
4997         {
4998             return m.lookup!(size, mode)(inp);
4999         }
5000         mixin DefMatcher;
5001     }
5002 }
5003 
5004 template Utf16Matcher()
5005 {
5006     enum validSize(int sz) = sz >= 1 && sz <= 2;
5007 
5008     void badEncoding() pure @safe
5009     {
5010         import std.utf : UTFException;
5011         throw new UTFException("Invalid UTF-16 sequence");
5012     }
5013 
5014     // 1-stage ASCII
5015     alias AsciiSpec = AliasSeq!(bool, wchar, clamp!7);
5016     //2-stage BMP
5017     alias BmpSpec = AliasSeq!(bool, wchar, sliceBits!(7, 16), sliceBits!(0, 7));
5018     //4-stage - full Unicode
5019     //assume that 0xD800 & 0xDC00 bits are cleared
5020     //thus leaving 10 bit per wchar to worry about
5021     alias UniSpec = AliasSeq!(bool, wchar[2],
5022         assumeSize!(x=>x[0]>>4, 6), assumeSize!(x=>x[0]&0xf, 4),
5023         assumeSize!(x=>x[1]>>6, 4), assumeSize!(x=>x[1]&0x3f, 6),
5024     );
5025     alias Ascii = typeof(TrieBuilder!(AsciiSpec)(false).build());
5026     alias Bmp = typeof(TrieBuilder!(BmpSpec)(false).build());
5027     alias Uni = typeof(TrieBuilder!(UniSpec)(false).build());
5028 
5029     auto encode2(dchar ch)
5030     {
5031         ch -= 0x1_0000;
5032         assert(ch <= 0xF_FFFF);
5033         wchar[2] ret;
5034         //do not put surrogate bits, they are sliced off
5035         ret[0] = cast(wchar)(ch >> 10);
5036         ret[1] = (ch & 0xFFF);
5037         return ret;
5038     }
5039 
5040     auto build(Set)(Set set)
5041     {
5042         import std.algorithm.iteration : map;
5043         auto ascii = set & unicode.ASCII;
5044         auto bmp = (set & CodepointSet.fromIntervals(0x80, 0xFFFF+1))
5045             - CodepointSet.fromIntervals(0xD800, 0xDFFF+1);
5046         auto other = set - (bmp | ascii);
5047         auto asciiT = ascii.byCodepoint.map!(x=>cast(char) x).buildTrie!(AsciiSpec);
5048         auto bmpT = bmp.byCodepoint.map!(x=>cast(wchar) x).buildTrie!(BmpSpec);
5049         auto otherT = other.byCodepoint.map!(x=>encode2(x)).buildTrie!(UniSpec);
5050         alias Ret = Impl!(1,2);
5051         return Ret(asciiT, bmpT, otherT);
5052     }
5053 
5054     //bootstrap full UTF-16 matcher interace from
5055     //sizeFlags, lookupUni and ascii
5056     mixin template DefMatcher()
5057     {
5058         public bool match(Range)(ref Range inp) const
5059             if (isRandomAccessRange!Range && is(ElementType!Range : wchar) &&
5060                 !isDynamicArray!Range)
5061         {
5062             enum mode = Mode.skipOnMatch;
5063             assert(!inp.empty);
5064             immutable ch = inp[0];
5065             static if (sizeFlags & 1)
5066             {
5067                 if (ch < 0x80)
5068                 {
5069                   if (ascii[ch])
5070                   {
5071                       inp.popFront();
5072                       return true;
5073                   }
5074                   else
5075                       return false;
5076                 }
5077                 return lookupUni!mode(inp);
5078             }
5079             else
5080                 return lookupUni!mode(inp);
5081         }
5082 
5083         static if (Sizes.length == 2)
5084         {
5085             public bool skip(Range)(ref Range inp) const
5086                 if (isRandomAccessRange!Range && is(ElementType!Range : wchar) &&
5087                     !isDynamicArray!Range)
5088             {
5089                 enum mode = Mode.alwaysSkip;
5090                 assert(!inp.empty);
5091                 immutable ch = inp[0];
5092                 static if (sizeFlags & 1)
5093                 {
5094                     if (ch < 0x80)
5095                     {
5096                         inp.popFront();
5097                         return ascii[ch];
5098                     }
5099                     else
5100                         return lookupUni!mode(inp);
5101                 }
5102                 else
5103                     return lookupUni!mode(inp);
5104             }
5105         }
5106 
5107         public bool test(Range)(ref Range inp) const
5108             if (isRandomAccessRange!Range && is(ElementType!Range : wchar) &&
5109                 !isDynamicArray!Range)
5110         {
5111             enum mode = Mode.neverSkip;
5112             assert(!inp.empty);
5113             auto ch = inp[0];
5114             static if (sizeFlags & 1)
5115                 return ch < 0x80 ? ascii[ch] : lookupUni!mode(inp);
5116             else
5117                 return lookupUni!mode(inp);
5118         }
5119 
5120         bool match(C)(ref C[] str) const
5121             if (isSomeChar!C)
5122         {
5123             return fwdStr!"match"(str);
5124         }
5125 
5126         bool skip(C)(ref C[] str) const
5127             if (isSomeChar!C)
5128         {
5129             return fwdStr!"skip"(str);
5130         }
5131 
5132         bool test(C)(ref C[] str) const
5133             if (isSomeChar!C)
5134         {
5135             return fwdStr!"test"(str);
5136         }
5137 
5138         mixin ForwardStrings; //dispatch strings to range versions
5139     }
5140 
5141     struct Impl(Sizes...)
5142         if (Sizes.length >= 1 && Sizes.length <= 2)
5143     {
5144     private:
5145         import std.meta : allSatisfy;
5146         static assert(allSatisfy!(validSize, Sizes),
5147             "Only lengths of 1 and 2 code units are possible in UTF-16");
5148         static if (Sizes.length > 1)
5149             enum sizeFlags = Sizes[0] | Sizes[1];
5150         else
5151             enum sizeFlags = Sizes[0];
5152 
5153         static if (sizeFlags & 1)
5154         {
5155             Ascii ascii;
5156             Bmp bmp;
5157         }
5158         static if (sizeFlags & 2)
5159         {
5160             Uni uni;
5161         }
5162         mixin DefMatcher;
5163 
5164         package(std) @property CherryPick!(Impl, SizesToPick) subMatcher(SizesToPick...)()
5165         {
5166             return CherryPick!(Impl, SizesToPick)(&this);
5167         }
5168 
5169         bool lookupUni(Mode mode, Range)(ref Range inp) const
5170         {
5171             wchar x = cast(wchar)(inp[0] - 0xD800);
5172             //not a high surrogate
5173             if (x > 0x3FF)
5174             {
5175                 //low surrogate
5176                 if (x <= 0x7FF) badEncoding();
5177                 static if (sizeFlags & 1)
5178                 {
5179                     auto ch = inp[0];
5180                     static if (mode == Mode.alwaysSkip)
5181                         inp.popFront();
5182                     static if (mode == Mode.skipOnMatch)
5183                     {
5184                         if (bmp[ch])
5185                         {
5186                             inp.popFront();
5187                             return true;
5188                         }
5189                         else
5190                             return false;
5191                     }
5192                     else
5193                         return bmp[ch];
5194                 }
5195                 else //skip is not available for sub-matchers, so just false
5196                     return false;
5197             }
5198             else
5199             {
5200                 import std.range : popFrontN;
5201                 static if (sizeFlags & 2)
5202                 {
5203                     if (inp.length < 2)
5204                         badEncoding();
5205                     wchar y = cast(wchar)(inp[1] - 0xDC00);
5206                     //not a low surrogate
5207                     if (y > 0x3FF)
5208                         badEncoding();
5209                     wchar[2] needle = [inp[0] & 0x3ff, inp[1] & 0x3ff];
5210                     static if (mode == Mode.alwaysSkip)
5211                         inp.popFrontN(2);
5212                     static if (mode == Mode.skipOnMatch)
5213                     {
5214                         if (uni[needle])
5215                         {
5216                             inp.popFrontN(2);
5217                             return true;
5218                         }
5219                         else
5220                             return false;
5221                     }
5222                     else
5223                         return uni[needle];
5224                 }
5225                 else //ditto
5226                     return false;
5227             }
5228         }
5229     }
5230 
5231     struct CherryPick(I, Sizes...)
5232         if (Sizes.length >= 1 && Sizes.length <= 2)
5233     {
5234     private:
5235         import std.meta : allSatisfy;
5236         I* m;
5237         enum sizeFlags = I.sizeFlags;
5238 
5239         static if (sizeFlags & 1)
5240         {
5241             @property auto ascii()() const { return m.ascii; }
5242         }
5243 
5244         bool lookupUni(Mode mode, Range)(ref Range inp) const
5245         {
5246             return m.lookupUni!mode(inp);
5247         }
5248         mixin DefMatcher;
5249         static assert(allSatisfy!(validSize, Sizes),
5250             "Only lengths of 1 and 2 code units are possible in UTF-16");
5251     }
5252 }
5253 
// Builds a UTF-8 matcher for `set`.
private auto utf8Matcher(Set)(Set set)
{
    alias Factory = Utf8Matcher!();
    return Factory.build(set);
}
5258 
// Builds a UTF-16 matcher for `set`.
private auto utf16Matcher(Set)(Set set)
{
    alias Factory = Utf16Matcher!();
    return Factory.build(set);
}
5263 
/**
    Constructs a matcher object
    to classify $(CODEPOINTS) from the `set` for encoding
    that has `Char` as code unit.

    Only `char` (UTF-8) and `wchar` (UTF-16) code units are accepted;
    any other `Char` is rejected at compile time.

    See $(LREF MatcherConcept) for API outline.
*/
public auto utfMatcher(Char, Set)(Set set)
if (isCodepointSet!Set)
{
    static if (is(Char : char))
        return utf8Matcher(set);
    else static if (is(Char : wchar))
        return utf16Matcher(set);
    else static if (is(Char : dchar))
        // single-line message: the literal used to span two source lines
        // and leaked a newline plus indentation into the diagnostic
        static assert(false,
            "UTF-32 needs no decoding, and thus not supported by utfMatcher");
    else
        static assert(false, "Only character types 'char' and 'wchar' are allowed");
}
5284 
5285 
// A range of code units bundled with a running index, so that advancing
// the front only bumps the index (speeds up forward iteration).
package(std) auto decoder(C)(C[] s, size_t offset=0)
if (is(C : wchar) || is(C : char))
{
    static struct Decoder
    {
    pure nothrow:
        C[] str;    // the underlying code units
        size_t idx; // position of the current front within `str`

        @property C front()
        {
            return str[idx];
        }

        @property C back()
        {
            return str[$-1];
        }

        void popFront()
        {
            idx++;
        }

        void popBack()
        {
            str = str[0..$-1];
        }

        void popFrontN(size_t n)
        {
            idx += n;
        }

        @property bool empty()
        {
            return idx == str.length;
        }

        @property auto save()
        {
            return this;
        }

        // indices are relative to the current front
        auto opIndex(size_t i)
        {
            return str[idx+i];
        }

        @property size_t length()
        {
            return str.length - idx;
        }

        alias opDollar = length;

        auto opSlice(size_t a, size_t b)
        {
            return Decoder(str[0 .. idx+b], idx+a);
        }
    }
    static assert(isRandomAccessRange!Decoder);
    static assert(is(ElementType!Decoder : C));
    return Decoder(s, offset);
}
5311 
// Walks a mixed ASCII / non-ASCII string with the full UTF-8 matcher and
// two of its sub-matchers, checking both the results and how far the
// decoder index moves for test / match / skip.
pure @safe unittest
{
    string rs = "hi! ネемног砀 текста";
    auto codec = rs.decoder;
    auto utf8 =  utf8Matcher(unicode.Letter);
    auto asc = utf8.subMatcher!(1);     // 1-code-unit sequences only
    auto uni = utf8.subMatcher!(2,3,4); // multi-code-unit sequences only

    // h
    assert(asc.test(codec));
    assert(!uni.match(codec));
    assert(utf8.skip(codec));
    assert(codec.idx == 1);

    // i
    assert(asc.test(codec));
    assert(!uni.match(codec));
    assert(utf8.skip(codec));
    assert(codec.idx == 2);

    // !
    assert(!asc.match(codec));
    assert(!utf8.test(codec));
    assert(!utf8.skip(codec)); // skip advances even though '!' is no letter
    assert(codec.idx == 3);

    // space
    assert(!asc.test(codec));
    assert(!utf8.test(codec));
    assert(!utf8.skip(codec));
    assert(codec.idx == 4);

    // the seven non-ASCII letters that follow
    assert(utf8.test(codec));
    foreach (i; 0 .. 7)
    {
        assert(!asc.test(codec));
        assert(uni.test(codec));
        assert(utf8.skip(codec));
    }
    // the space after them
    assert(!utf8.test(codec));
    assert(!utf8.skip(codec));

    //the same with match where applicable
    codec = rs.decoder;
    assert(utf8.match(codec));
    assert(codec.idx == 1);
    assert(utf8.match(codec));
    assert(codec.idx == 2);
    assert(!utf8.match(codec));
    assert(codec.idx == 2); // a failed match leaves the index untouched
    assert(!utf8.skip(codec));
    assert(!utf8.skip(codec));

    foreach (i; 0 .. 7)
    {
        assert(!asc.test(codec));
        assert(utf8.test(codec));
        assert(utf8.match(codec));
    }
    auto i = codec.idx;
    assert(!utf8.match(codec));
    assert(codec.idx == i);
}
5375 
// Cross-checks test / match / skip of the UTF-8 and UTF-16 matchers (and
// their sub-matchers) against every third code point of unicode.L.
pure @system unittest
{
    import std.range : stride;
    // Runs test, match and (where available) skip on the same position,
    // verifies they agree and that the index moves as each mode promises,
    // then restores the index. Returns the common result.
    static bool testAll(Matcher, Range)(ref Matcher m, ref Range r) @safe
    {
        bool t = m.test(r);
        auto save = r.idx;
        assert(t == m.match(r));
        assert(r.idx == save || t); //either no change or was match
        r.idx = save;
        static if (is(typeof(m.skip(r))))
        {
            assert(t == m.skip(r));
            assert(r.idx != save); //always changed
            r.idx = save;
        }
        return t;
    }
    auto utf16 = utfMatcher!wchar(unicode.L);
    auto bmp = utf16.subMatcher!1;
    // was subMatcher!1, i.e. an exact duplicate of `bmp`
    auto nonBmp = utf16.subMatcher!2;
    auto utf8 = utfMatcher!char(unicode.L);
    auto ascii = utf8.subMatcher!1;
    auto uni2 = utf8.subMatcher!2;
    auto uni3 = utf8.subMatcher!3;
    auto uni24 = utf8.subMatcher!(2,4);
    foreach (ch; unicode.L.byCodepoint.stride(3))
    {
        import std.utf : encode;
        char[4] buf;
        wchar[2] buf16;
        auto len = encode(buf, ch);
        auto len16 = encode(buf16, ch);
        auto c8 = buf[0 .. len].decoder;
        auto c16 = buf16[0 .. len16].decoder;
        assert(testAll(utf16, c16));
        assert(testAll(bmp, c16) || len16 != 1);
        assert(testAll(nonBmp, c16) || len16 != 2);

        assert(testAll(utf8, c8));

        //submatchers return false on out of their domain
        assert(testAll(ascii, c8) || len != 1);
        assert(testAll(uni2, c8) || len != 2);
        assert(testAll(uni3, c8) || len != 3);
        assert(testAll(uni24, c8) || (len != 2 && len != 4));
    }
}
5424 
// cover decode fail cases of Matcher
pure @safe unittest
{
    import std.algorithm.iteration : map;
    import std.exception : collectException;
    import std.format : format;
    auto utf16 = utfMatcher!wchar(unicode.L);
    auto utf8 = utfMatcher!char(unicode.L);
    //decode failure cases UTF-8
    // NOTE(review): `\0x00` below is the escape `\0` followed by the text
    // "x00", not a single NUL byte; possibly `\x00` was meant. Left as-is
    // because the expected failures are tied to these exact bytes.
    alias fails8 = AliasSeq!("\xC1", "\x80\x00","\xC0\x00", "\xCF\x79",
        "\xFF\x00\0x00\0x00\x00", "\xC0\0x80\0x80\x80", "\x80\0x00\0x00\x00",
        "\xCF\x00\0x00\0x00\x00");
    foreach (msg; fails8)
    {
        // every input must raise; the assert message dumps its bytes
        assert(collectException((){
            auto s = msg;
            utf8.test(s);
        }()), format("%( %2x %)", cast(immutable(ubyte)[]) msg));
    }
    //decode failure cases UTF-16
    // a lone high surrogate and a lone low surrogate
    alias fails16 = AliasSeq!([0xD811], [0xDC02]);
    foreach (msg; fails16)
    {
        assert(collectException((){
            auto s = msg.map!(x => cast(wchar) x);
            utf16.test(s);
        }()));
    }
}
5455 
/++
    Convenience function to construct optimal configurations for
    packed Trie from any `set` of $(CODEPOINTS).

    The parameter `level` indicates the number of trie levels to use,
    allowed values are: 1, 2, 3 or 4. Levels represent different trade-offs
    speed-size wise.

    $(P Level 1 is fastest and the most memory hungry (a bit array). )
    $(P Level 4 is the slowest and has the smallest footprint. )

    See the $(S_LINK Synopsis, Synopsis) section for example.

    Note:
    Level 4 stays very practical (being faster and more predictable)
    compared to using direct lookup on the `set` itself.

    Any other `level` is rejected at compile time.
+/
public auto toTrie(size_t level, Set)(Set set)
if (isCodepointSet!Set)
{
    // the per-level bit widths always add up to 21
    static if (level == 1)
        return codepointSetTrie!(21)(set);
    else static if (level == 2)
        return codepointSetTrie!(10, 11)(set);
    else static if (level == 3)
        return codepointSetTrie!(8, 5, 8)(set);
    else static if (level == 4)
        return codepointSetTrie!(6, 4, 4, 7)(set);
    else
        // also reached for level == 0, hence "1 to 4" rather than "> 4"
        static assert(false,
            "Sorry, toTrie only supports levels 1 to 4, use codepointSetTrie directly");
}
5490 
/**
    $(P Builds a `Trie` with typically optimal speed-size trade-off
    and wraps it into a delegate of the following type:
    $(D bool delegate(dchar ch)). )

    $(P Effectively this creates a 'tester' lambda suitable
    for algorithms like std.algorithm.find that take unary predicates. )

    See the $(S_LINK Synopsis, Synopsis) section for example.
*/
public auto toDelegate(Set)(Set set)
if (isCodepointSet!Set)
{
    // 3 is very small and is almost as fast as 2-level (due to CPU caches?)
    auto trie = toTrie!3(set);
    // the delegate captures `trie` and answers membership queries from it
    return delegate(dchar ch) { return trie[ch]; };
}
5508 
/**
    $(P Opaque wrapper around unsigned built-in integers and
    code unit (char/wchar/dchar) types.
    Parameter `sz` indicates that the value is confined
    to the range of [0, 2^^sz$(RPAREN). With this knowledge it can be
    packed more tightly when stored in certain
    data-structures like trie. )

    Note:
    $(P The $(D BitPacked!(T, sz)) is implicitly convertible to `T`
    but not vice versa. Users have to ensure the value fits in
    the range required and use the `cast`
    operator to perform the conversion.)
*/
struct BitPacked(T, size_t sz)
if (isIntegral!T || is(T:dchar))
{
    // the wrapped value; `alias this` provides the implicit conversion to T
    T _value;
    alias _value this;

    // number of bits the value is declared to fit in
    enum bitSize = sz;
}
5530 
/*
    Yields the number of bits needed to represent the single argument:
    its own `bitSize` member when it has one, the bit size of the
    BitPacked type it returns when it is a functor returning one, and
    the full storage width of the type otherwise.
*/
template bitSizeOf(Args...)
if (Args.length == 1)
{
    import std.traits : ReturnType;
    alias Subject = Args[0];
    static if (__traits(compiles, { size_t val = Subject.bitSize; }))
        enum bitSizeOf = Subject.bitSize;
    else static if (is(ReturnType!Subject == BitPacked!(U, bits), U, size_t bits))
        // BitPacked!(U, bits).bitSize is `bits` by definition
        enum bitSizeOf = bits;
    else
        enum bitSizeOf = 8 * Subject.sizeof;
}
5554 
/**
    Tests if `T` is some instantiation of $(LREF BitPacked)!(U, x)
    and thus suitable for packing.
*/
template isBitPacked(T)
{
    enum isBitPacked = is(T == BitPacked!(U, bits), U, size_t bits);
}
5566 
/**
    Gives the type `U` from $(LREF BitPacked)!(U, x)
    or `T` itself for every other type.
*/
template TypeOfBitPacked(T)
{
    static if (is(T == BitPacked!(Wrapped, bits), Wrapped, size_t bits))
    {
        alias TypeOfBitPacked = Wrapped;
    }
    else
    {
        alias TypeOfBitPacked = T;
    }
}
5578 
/*
    Wrapper used when defining custom data structures from the `Trie`
    template. Wrapping a unary function marks its result as always
    fitting within `bits` bits.
*/
struct assumeSize(alias Fn, size_t bits)
{
    // calling the wrapper simply forwards to the wrapped function
    static auto ref opCall(T)(auto ref T arg)
    {
        return Fn(arg);
    }

    // declared width of the result
    enum bitSize = bits;
}
5592 
/*
    Helper that extracts bits [from, to) of an unsigned integral value.
    It exposes `bitSize` itself, so it can be used directly with the
    `Trie` template.
*/
struct sliceBits(size_t from, size_t to)
{
    //for now bypass assumeSize, DMD has trouble inlining it
    enum bitSize = to-from;
    static auto opCall(T)(T x)
    out(result)
    {
        assert(result < (1 << to-from));
    }
    do
    {
        static assert(from < to);
        // mask with the low (to - from) bits set
        enum mask = (1 << (to - from)) - 1;
        static if (from == 0)
        {
            return x & mask;
        }
        else
        {
            return (x >> from) & mask;
        }
    }
}
5617 
// Lowest byte (bits 0 .. 8) of `x`.
@safe pure nothrow @nogc uint low_8(uint x)
{
    enum uint byteMask = 0xFF;
    return x & byteMask;
}

// Second-lowest byte (bits 8 .. 16) of `x`.
@safe pure nothrow @nogc uint midlow_8(uint x)
{
    enum uint byteMask = 0xFF;
    return (x >> 8) & byteMask;
}
// low_8 / midlow_8 wrapped in assumeSize, declaring each result as 8 bits wide.
alias lo8 = assumeSize!(low_8, 8);
alias mlo8 = assumeSize!(midlow_8, 8);
5622 
// bitSizeOf must honor a `bitSize` member wherever it comes from.
@safe pure nothrow @nogc unittest
{
    // BitPacked wrapper
    static assert(bitSizeOf!(BitPacked!(uint, 2)) == 2);
    // sliceBits helper
    static assert(bitSizeOf!(sliceBits!(4, 7)) == 3);
    // assumeSize wrapper
    static assert(bitSizeOf!lo8 == 8);
}
5629 
// Compile-time sequence of the integers in [start, end).
template Sequence(size_t start, size_t end)
{
    static if (start >= end)
        alias Sequence = AliasSeq!();
    else
        alias Sequence = AliasSeq!(start, Sequence!(start+1, end));
}
5637 
//---- TRIE TESTS ----
// Builds tries with 1 to 4 levels over small hand-made sets (plus one
// string-keyed and one ubyte-keyed trie) and checks the lookups against
// the data they were built from.
@system unittest
{
    import std.algorithm.iteration : map;
    import std.algorithm.sorting : sort;
    import std.array : array;
    import std.conv : text, to;
    import std.range : iota;
    // Prints per-level footprint statistics; compiles to nothing unless
    // version std_uni_stats is set.
    static trieStats(TRIE)(TRIE t)
    {
        version (std_uni_stats)
        {
            import std.stdio : writefln, writeln;
            writeln("---TRIE FOOTPRINT STATS---");
            static foreach (i; 0 .. t.table.dim)
            {
                writefln("lvl%s = %s bytes;  %s pages"
                         , i, t.bytes!i, t.pages!i);
            }
            writefln("TOTAL: %s bytes", t.bytes);
            version (none)
            {
                writeln("INDEX (excluding value level):");
                static foreach (i; 0 .. t.table.dim-1)
                    writeln(t.table.slice!(i)[0 .. t.table.length!i]);
            }
            writeln("---------------------------");
        }
    }
    //@@@BUG link failure, lambdas not found by linker somehow (in case of trie2)
    // alias lo8   = assumeSize!(8, function (uint x) { return x&0xFF; });
    // alias next8 = assumeSize!(7, function (uint x) { return (x&0x7F00)>>8; });
    alias Set = CodepointSet;
    auto set = Set('A','Z','a','z');
    auto trie = buildTrie!(bool, uint, 256, lo8)(set.byInterval);// simple bool array
    // the upper bounds 'z' and 'Z' themselves are not probed
    for (int a='a'; a<'z';a++)
        assert(trie[a]);
    for (int a='A'; a<'Z';a++)
        assert(trie[a]);
    for (int a=0; a<'A'; a++)
        assert(!trie[a]);
    for (int a ='Z'; a<'a'; a++)
        assert(!trie[a]);
    trieStats(trie);

    // two levels: the same low-byte patterns repeated at 256 offsets
    auto redundant2 = Set(
        1, 18, 256+2, 256+111, 512+1, 512+18, 768+2, 768+111);
    auto trie2 = buildTrie!(bool, uint, 1024, mlo8, lo8)(redundant2.byInterval);
    trieStats(trie2);
    foreach (e; redundant2.byCodepoint)
        assert(trie2[e], text(cast(uint) e, " - ", trie2[e]));
    foreach (i; 0 .. 1024)
    {
        assert(trie2[i] == (i in redundant2));
    }


    // three levels over an 8-bit key, split 2 + 2 + 4 bits
    auto redundant3 = Set(
          2,    4,    6,    8,    16,
       2+16, 4+16, 16+6, 16+8, 16+16,
       2+32, 4+32, 32+6, 32+8,
      );

    enum max3 = 256;
    // sliceBits
    auto trie3 = buildTrie!(bool, uint, max3,
            sliceBits!(6,8), sliceBits!(4,6), sliceBits!(0,4)
        )(redundant3.byInterval);
    trieStats(trie3);
    foreach (i; 0 .. max3)
        assert(trie3[i] == (i in redundant3), text(cast(uint) i));

    // four levels over a 16-bit key, split 3 + 4 + 3 + 6 bits
    auto redundant4 = Set(
            10, 64, 64+10, 128, 128+10, 256, 256+10, 512,
            1000, 2000, 3000, 4000, 5000, 6000
        );
    enum max4 = 2^^16;
    auto trie4 = buildTrie!(bool, size_t, max4,
            sliceBits!(13, 16), sliceBits!(9, 13), sliceBits!(6, 9) , sliceBits!(0, 6)
        )(redundant4.byInterval);
    // only membership is checked here, not absence
    foreach (i; 0 .. max4)
    {
        if (i in redundant4)
            assert(trie4[i], text(cast(uint) i));
    }
    trieStats(trie4);

        // string keys, indexed by their first character only
        alias mapToS = mapTrieIndex!(useItemAt!(0, char));
        string[] redundantS = ["tea", "start", "orange"];
        redundantS.sort!((a,b) => mapToS(a) < mapToS(b))();
        auto strie = buildTrie!(bool, string, useItemAt!(0, char))(redundantS);
        // using first char only
        assert(redundantS == ["orange", "start", "tea"]);
        assert(strie["test"], text(strie["test"]));
        assert(!strie["aea"]);
        assert(strie["s"]);

    // a bit size test
    // every ubyte value is inserted, so every lookup must succeed
    auto a = array(map!(x => to!ubyte(x))(iota(0, 256)));
    auto bt = buildTrie!(bool, ubyte, sliceBits!(7, 8), sliceBits!(5, 7), sliceBits!(0, 5))(a);
    trieStats(bt);
    foreach (i; 0 .. 256)
        assert(bt[cast(ubyte) i]);
}
5742 
// Trie index functor: uses element `idx` of an array key, with its width
// declared as the full bit width of `T`.
template useItemAt(size_t idx, T)
if (isIntegral!T || is(T: dchar))
{
    size_t pick(const scope T[] arr)
    {
        return arr[idx];
    }
    alias useItemAt = assumeSize!(pick, 8*T.sizeof);
}
5749 
// Trie index functor: uses the last element of an array key, with its
// width declared as the full bit width of `T`.
template useLastItem(T)
{
    size_t pick(const scope T[] arr)
    {
        return arr[$-1];
    }
    alias useLastItem = assumeSize!(pick, 8*T.sizeof);
}
5755 
// Sum of bitSizeOf over all entries of `Prefix` (0 for an empty list).
template fullBitSize(Prefix...)
{
    static if (Prefix.length == 0)
        enum fullBitSize = 0;
    else
        enum fullBitSize = bitSizeOf!(Prefix[0])+fullBitSize!(Prefix[1..$]);
}
5763 
// Computes the BitPacked index types for all levels except the last
// (value) level of a trie keyed by `Key` and split by `Prefix`.
template idxTypes(Key, size_t fullBits, Prefix...)
{
    static if (Prefix.length != 1)
    {
        // Important note on bit packing
        // Each level has to hold enough of bits to address the next one
        // The bottom level is known to hold full bit width
        // thus it's size in pages is full_bit_width - size_of_last_prefix
        // Recourse on this notion
        enum upperBits = fullBits - bitSizeOf!(Prefix[$-1]);
        alias idxTypes =
            AliasSeq!(
                idxTypes!(Key, upperBits, Prefix[0..$-1]),
                BitPacked!(typeof(Prefix[$-2](Key.init)), upperBits)
            );
    }
    else
    {// the last level is value level, so no index once reduced to 1-level
        alias idxTypes = AliasSeq!();
    }
}
5784 
5785 //============================================================================
5786 
// Three-way comparison of two property names that ignores ASCII case,
// white space, '-' and '_'. Returns the result of `cmp` on the
// normalized names.
@safe pure int comparePropertyName(Char1, Char2)(const(Char1)[] a, const(Char2)[] b)
if (is(Char1 : dchar) && is(Char2 : dchar))
{
    import std.algorithm.comparison : cmp;
    import std.algorithm.iteration : map, filter;
    import std.ascii : toLower;
    // characters that take part in the comparison
    static bool significant(dchar c)
    {
        return !(c.isWhite || c == '-' || c == '_');
    }
    auto lhs = a.map!toLower.filter!significant;
    auto rhs = b.map!toLower.filter!significant;
    return cmp(lhs, rhs);
}
5798 
// '-' and letter case are ignored by the comparison.
@safe pure unittest
{
    assert(comparePropertyName("foo-bar", "fooBar") == 0);
}
5803 
// Strict "less than" ordering derived from comparePropertyName.
bool propertyNameLess(Char1, Char2)(const(Char1)[] a, const(Char2)[] b) @safe pure
if (is(Char1 : dchar) && is(Char2 : dchar))
{
    immutable order = comparePropertyName(a, b);
    return order < 0;
}
5809 
5810 //============================================================================
5811 // Utilities for compression of Unicode code point sets
5812 //============================================================================
5813 
// Appends `val` to `arr` in a 1-, 2- or 3-byte variable-length encoding:
//   val < 128      -> one byte, top bit clear
//   val < 2^^13    -> tag bits 100 + 5 high bits, then 1 byte
//   val < 2^^21    -> tag bits 101 + 5 high bits, then 2 bytes
@safe void compressTo(uint val, ref scope ubyte[] arr) pure nothrow
{
    // not optimized as usually done 1 time (and not public interface)
    enum ubyte twoByteTag = 0b1_00 << 5;
    enum ubyte threeByteTag = 0b1_01 << 5;
    if (val >= (1 << 13))
    {
        assert(val < (1 << 21));
        arr ~= cast(ubyte)(threeByteTag | (val >> 16));
        arr ~= cast(ubyte)(val >> 8);
        arr ~= cast(ubyte) val;
    }
    else if (val >= 128)
    {
        arr ~= cast(ubyte)(twoByteTag | (val >> 8));
        arr ~= cast(ubyte) val;
    }
    else
        arr ~= cast(ubyte) val;
}
5832 
// Decodes one value written by compressTo, starting at arr[idx], and
// advances `idx` past it. Throws (via enforce) when the trailing bytes
// announced by the lead byte are missing.
@safe uint decompressFrom(scope const(ubyte)[] arr, ref size_t idx) pure
{
    import std.exception : enforce;
    immutable lead = arr[idx++];
    if ((lead & 0x80) == 0) // no top bit -> [0 .. 127]
        return lead;
    // bit 5 of the lead byte selects one or two trailing bytes
    immutable trailing = (lead & 0x20) ? 2 : 1;
    enforce(idx + trailing <= arr.length, "bad code point interval encoding");
    uint value = lead & 0x1F;
    foreach (octet; arr[idx .. idx + trailing])
        value = (value << 8) | octet;
    idx += trailing;
    return value;
}
5847 
5848 
// Encodes a range of [begin, end) pairs as a byte stream: every bound is
// stored via compressTo as the difference from the bound before it.
package(std) ubyte[] compressIntervals(Range)(Range intervals)
if (isInputRange!Range && isIntegralPair!(ElementType!Range))
{
    ubyte[] encoded;
    uint previous = 0;
    // RLE encode
    foreach (interval; intervals)
    {
        compressTo(interval[0]-previous, encoded);
        previous = interval[0];
        // till the end of the domain so don't store it
        if (interval[1] == lastDchar+1)
            continue;
        compressTo(interval[1]-previous, encoded);
        previous = interval[1];
    }
    return encoded;
}
5867 
// Checks the exact byte encoding produced by compressIntervals, decodes
// it value by value, and verifies the compress/decompress round trip.
@safe pure unittest
{
    import std.algorithm.comparison : equal;
    import std.typecons : tuple;

    auto run = [tuple(80, 127), tuple(128, (1 << 10)+128)];
    // deltas 80, 47, 1 fit in one byte each; 1 << 10 needs the 2-byte form
    ubyte[] enc = [cast(ubyte) 80, 47, 1, (0b1_00 << 5) | (1 << 2), 0];
    assert(compressIntervals(run) == enc);
    // the second interval ends at lastDchar+1, so its end is not stored
    auto run2 = [tuple(0, (1 << 20)+512+1), tuple((1 << 20)+512+4, lastDchar+1)];
    ubyte[] enc2 = [cast(ubyte) 0, (0b1_01 << 5) | (1 << 4), 2, 1, 3]; // odd length-ed
    assert(compressIntervals(run2) == enc2);
    // `idx` is the shared read cursor, advanced by every decompressFrom call
    size_t  idx = 0;
    assert(decompressFrom(enc, idx) == 80);
    assert(decompressFrom(enc, idx) == 47);
    assert(decompressFrom(enc, idx) == 1);
    assert(decompressFrom(enc, idx) == (1 << 10));
    idx = 0;
    assert(decompressFrom(enc2, idx) == 0);
    assert(decompressFrom(enc2, idx) == (1 << 20)+512+1);
    assert(equal(decompressIntervals(compressIntervals(run)), run));
    assert(equal(decompressIntervals(compressIntervals(run2)), run2));
}
5890 
// Wraps compressed interval data in a range of `CodepointInterval`
// that decodes it lazily, one interval at a time.
@safe package(std) auto decompressIntervals(const(ubyte)[] data) pure
{
    auto lazyIntervals = DecompressedIntervals(data);
    return lazyIntervals;
}
5896 
// Forward range over the intervals stored in a compressed byte stream;
// each interval is decoded only when the range advances to it.
@safe struct DecompressedIntervals
{
pure:
    const(ubyte)[] _stream;   // compressed input
    size_t _idx;              // read position; size_t.max marks exhaustion
    CodepointInterval _front; // most recently decoded interval

    // Stores the stream and decodes the first interval, if there is one.
    this(const(ubyte)[] stream)
    {
        _stream = stream;
        popFront();
    }

    @property bool empty() const
    {
        return _idx == size_t.max;
    }

    @property CodepointInterval front()
    {
        assert(!empty);
        return _front;
    }

    void popFront()
    {
        if (_idx == _stream.length)
        {
            // nothing left to decode: flag the range as empty
            _idx = size_t.max;
            return;
        }
        // every bound is stored as a delta from the bound before it
        immutable uint lower = _front[1] + decompressFrom(_stream, _idx);
        _front[0] = lower;
        _front[1] = (_idx == _stream.length)
            ? lastDchar+1 // odd length ---> till the end
            : lower + decompressFrom(_stream, _idx);
    }

    @property DecompressedIntervals save() return scope { return this; }
}
5941 
// DecompressedIntervals must satisfy the forward-range interface
// (and therefore the input-range one as well).
@safe pure nothrow @nogc unittest
{
    static assert(isForwardRange!DecompressedIntervals);
    static assert(isInputRange!DecompressedIntervals);
}
5947 
5948 //============================================================================
5949 
5950 version (std_uni_bootstrap){}
5951 else
5952 {
5953 
// Looks `name` up in `table` by binary search under property-name
// comparison (the table is assumed to be sorted that way).
// Returns the index of the matching entry, or -1 when there is none.
ptrdiff_t findUnicodeSet(alias table, C)(const scope C[] name)
{
    import std.algorithm.iteration : map;
    import std.range : assumeSorted;
    auto names = assumeSorted!((a,b) => propertyNameLess(a,b))
        (table.map!"a.name"());
    // number of entries ordered strictly before `name`
    immutable size_t pos = names.lowerBound(name).length;
    if (pos >= names.length || comparePropertyName(names[pos], name) != 0)
        return -1;
    return pos;
}
5966 
// Looks `name` up in `table` and, when found, stores the matching set
// in `dest`. Returns whether the name was found; `dest` is left alone
// otherwise.
bool loadUnicodeSet(alias table, Set, C)(const scope C[] name, ref Set dest)
{
    immutable pos = findUnicodeSet!table(name);
    if (pos < 0)
        return false;
    dest = Set(asSet(table[pos].compressed));
    return true;
}
5978 
// Loads the code point set for the property `name` into `target`.
// Cumulative categories (L, LC, M, N, P, S, Z, C), "graphical", "any"
// and "ascii" are assembled by hand; every other name is looked up in
// the generated property table. Returns false only when that lookup
// fails.
bool loadProperty(Set=CodepointSet, C)
    (const scope C[] name, ref Set target) pure
{
    import std.internal.unicode_tables : uniProps; // generated file
    alias ucmp = comparePropertyName;
    // conjure cumulative properties by hand
    if (!ucmp(name, "L") || !ucmp(name, "Letter"))
    {
        target = asSet(uniProps.Lu);
        target |= asSet(uniProps.Ll);
        target |= asSet(uniProps.Lt);
        target |= asSet(uniProps.Lo);
        target |= asSet(uniProps.Lm);
        return true;
    }
    if (!ucmp(name,"LC") || !ucmp(name,"Cased Letter"))
    {
        target = asSet(uniProps.Ll);
        target |= asSet(uniProps.Lu);
        target |= asSet(uniProps.Lt);// Title case
        return true;
    }
    if (!ucmp(name, "M") || !ucmp(name, "Mark"))
    {
        target = asSet(uniProps.Mn);
        target |= asSet(uniProps.Mc);
        target |= asSet(uniProps.Me);
        return true;
    }
    if (!ucmp(name, "N") || !ucmp(name, "Number"))
    {
        target = asSet(uniProps.Nd);
        target |= asSet(uniProps.Nl);
        target |= asSet(uniProps.No);
        return true;
    }
    if (!ucmp(name, "P") || !ucmp(name, "Punctuation"))
    {
        target = asSet(uniProps.Pc);
        target |= asSet(uniProps.Pd);
        target |= asSet(uniProps.Ps);
        target |= asSet(uniProps.Pe);
        target |= asSet(uniProps.Pi);
        target |= asSet(uniProps.Pf);
        target |= asSet(uniProps.Po);
        return true;
    }
    if (!ucmp(name, "S") || !ucmp(name, "Symbol"))
    {
        target = asSet(uniProps.Sm);
        target |= asSet(uniProps.Sc);
        target |= asSet(uniProps.Sk);
        target |= asSet(uniProps.So);
        return true;
    }
    if (!ucmp(name, "Z") || !ucmp(name, "Separator"))
    {
        target = asSet(uniProps.Zs);
        target |= asSet(uniProps.Zl);
        target |= asSet(uniProps.Zp);
        return true;
    }
    if (!ucmp(name, "C") || !ucmp(name, "Other"))
    {
        target = asSet(uniProps.Cc);
        target |= asSet(uniProps.Cf);
        target |= asSet(uniProps.Cs);
        target |= asSet(uniProps.Co);
        target |= asSet(uniProps.Cn);
        return true;
    }
    if (!ucmp(name, "graphical"))
    {
        target = asSet(uniProps.Alphabetic);

        // marks
        target |= asSet(uniProps.Mn);
        target |= asSet(uniProps.Mc);
        target |= asSet(uniProps.Me);

        // numbers
        target |= asSet(uniProps.Nd);
        target |= asSet(uniProps.Nl);
        target |= asSet(uniProps.No);

        // punctuation
        target |= asSet(uniProps.Pc);
        target |= asSet(uniProps.Pd);
        target |= asSet(uniProps.Ps);
        target |= asSet(uniProps.Pe);
        target |= asSet(uniProps.Pi);
        target |= asSet(uniProps.Pf);
        target |= asSet(uniProps.Po);

        // space separators
        target |= asSet(uniProps.Zs);

        // symbols
        target |= asSet(uniProps.Sm);
        target |= asSet(uniProps.Sc);
        target |= asSet(uniProps.Sk);
        target |= asSet(uniProps.So);
        return true;
    }
    if (!ucmp(name, "any"))
    {
        target = Set.fromIntervals(0, 0x110000);
        return true;
    }
    if (!ucmp(name, "ascii"))
    {
        target = Set.fromIntervals(0, 0x80);
        return true;
    }
    // not a hand-built property: fall back to the generated table
    return loadUnicodeSet!(uniProps.tab)(name, target);
}
6077 
// CTFE-only helper for checking property names at compile-time.
//
// Returns true when `name` matches (per comparePropertyName) one of the
// cumulative names that loadProperty assembles by hand rather than reading
// from the generated tables. The list has to stay in sync with the special
// cases in loadProperty: "C"/"Other" is handled there and is therefore listed
// here as well, so that the compile-time check accepts the same hand-built
// names as the run-time lookup.
@safe bool isPrettyPropertyName(C)(const scope C[] name)
{
    import std.algorithm.searching : canFind;
    auto names = [
        "L", "Letter",
        "LC", "Cased Letter",
        "M", "Mark",
        "N", "Number",
        "P", "Punctuation",
        "S", "Symbol",
        "Z", "Separator",
        "C", "Other",
        "Graphical",
        "any",
        "ascii"
    ];
    return canFind!(candidate => comparePropertyName(candidate, name) == 0)(names);
}
6097 
// CTFE-only (not optimized): tells whether `table` has an entry for `name`,
// i.e. whether findUnicodeSet reports a non-negative position.
@safe private static bool findSetName(alias table, C)(const scope C[] name)
{
    immutable position = findUnicodeSet!table(name);
    return !(position < 0);
}
6103 
// Mixin template that adds name-based lookup over a single `table`:
// a run-time `opCall` and a compile-time checked `opDispatch`.
// `kind` is only used to word the "not found" diagnostics.
template SetSearcher(alias table, string kind)
{
    /// Run-time checked search.
    static auto opCall(C)(const scope C[] name)
        if (is(C : dchar))
    {
        import std.conv : to;
        CodepointSet found;
        if (!loadUnicodeSet!table(name, found))
        {
            throw new Exception("No unicode set for "~kind~" by name "
                ~name.to!string()~" was found.");
        }
        return found;
    }
    /// Compile-time checked search.
    static @property auto opDispatch(string name)()
    {
        static if (!findSetName!table(name))
        {
            static assert(false, "No unicode set for "~kind~" by name "
                ~name~" was found.");
        }
        else
        {
            // the name is known to exist, so the load result is not checked
            CodepointSet found;
            loadUnicodeSet!table(name, found);
            return found;
        }
    }
}
6131 
// Characters that need escaping in a string posed as a regular expression.
// UnicodeSetParser.parseCharTerm expands this list into `case` labels, so a
// backslash followed by any of these characters denotes the character itself.
package(std) alias Escapables = AliasSeq!('[', ']', '\\', '^', '$', '.', '|', '?', ',', '-',
    ';', ':', '#', '&', '%', '/', '<', '>', '`',  '*', '+', '(', ')', '{', '}',  '~');
6135 
// Evaluates the D expression given as the string `expr` and returns its value.
// During CTFE the expression is evaluated on every call; at run time the
// first result is stored in a function-local static and reused afterwards.
package(std) CodepointSet memoizeExpr(string expr)()
{
    if (!__ctfe)
    {
        static typeof(mixin(expr)) cached;
        static bool ready;
        if (!ready)
        {
            cached = mixin(expr);
            ready = true;
        }
        return cached;
    }
    // compile-time evaluation: statics are of no use here
    return mixin(expr);
}
6150 
// Property for the \w character class: the union of unicode.Alphabetic,
// Mn, Mc, Me, Nd and Pc. The expression is passed to memoizeExpr as a string.
package(std) @property CodepointSet wordCharacter() @safe
{
    return memoizeExpr!("unicode.Alphabetic | unicode.Mn | unicode.Mc
        | unicode.Me | unicode.Nd | unicode.Pc")();
}
6157 
// Basic array-backed stack, kept generic in case it is needed anywhere
// other than the parser.
package(std) struct Stack(T)
{
@safe:
    T[] data; // elements, top of the stack is the last one

    // true when nothing is stored
    @property bool empty()
    {
        return data.length == 0;
    }

    // number of stored elements
    @property size_t length()
    {
        return data.length;
    }

    // appends `val` as the new top
    void push(T val)
    {
        data ~= val;
    }

    // removes the top element and returns it; the stack must not be empty
    @trusted T pop()
    {
        assert(!empty);
        immutable topIndex = data.length - 1;
        auto popped = data[topIndex];
        data = data[0 .. topIndex];
        // allow later pushes to reuse the freed slot (skipped under CTFE)
        if (!__ctfe)
            cast(void) data.assumeSafeAppend();
        return popped;
    }

    // reference to the top element; the stack must not be empty
    @property ref T top()
    {
        assert(!empty);
        return data[data.length - 1];
    }
}
6185 
// Reads exactly `maxDigit` hexadecimal digits from the front of `str` and
// returns the code point they spell. The digits are consumed from `str`.
// Throws: Exception("incomplete escape sequence") if `str` runs out early,
// Exception("invalid escape sequence") on a non-hex character, and
// Exception("invalid codepoint") if the value exceeds 0x10FFFF.
package(std) dchar parseUniHex(Range)(ref Range str, size_t maxDigit)
{
    import std.exception : enforce;
    //std.conv.parse is both @system and bogus
    uint val;
    foreach (_; 0 .. maxDigit)
    {
        enforce(!str.empty, "incomplete escape sequence");
        //only ASCII hex digits are accepted
        immutable current = str.front;
        uint digit;
        if ('0' <= current && current <= '9')
            digit = current - '0';
        else if ('a' <= current && current <= 'f')
            digit = current - 'a' + 10;
        else if ('A' <= current && current <= 'F')
            digit = current - 'A' + 10;
        else
            throw new Exception("invalid escape sequence");
        val = val * 16 + digit;
        str.popFront();
    }
    enforce(val <= 0x10FFFF, "invalid codepoint");
    return val;
}
6211 
// Exercises parseUniHex: rejection of non-hex input, decoding of valid
// input, and the upper bound on the resulting code point.
@safe unittest
{
    import std.algorithm.searching : canFind;
    import std.exception : collectException;
    // each of these contains a character that is not a hex digit
    string[] non_hex = [ "000j", "000z", "FffG", "0Z"];
    // valid inputs and, index for index, the values they must decode to
    string[] hex = [ "01", "ff", "00af", "10FFFF" ];
    int[] value = [ 1, 0xFF, 0xAF, 0x10FFFF ];
    foreach (v; non_hex)
        assert(collectException(parseUniHex(v, v.length)).msg
          .canFind("invalid escape sequence"));
    foreach (i, v; hex)
        assert(parseUniHex(v, v.length) == value[i]);
    // well-formed hex, but the value is above 0x10FFFF
    string over = "0011FFFF";
    assert(collectException(parseUniHex(over, over.length)).msg
      .canFind("invalid codepoint"));
}
6228 
// Returns `set` extended with the simple case foldings of every code point
// of `set` that also belongs to unicode.LC (cased letters).
auto caseEnclose(CodepointSet set)
{
    // computed up front, so additions made below are not iterated again
    auto casedPart = set & unicode.LC;
    foreach (dchar letter; casedPart.byCodepoint)
        foreach (folded; simpleCaseFoldings(letter))
            set |= folded;
    return set;
}
6239 
/+
    Fetch the codepoint set corresponding to a name (InBlock or binary
    property), looked up at run time through `unicode(name)`.
    With `casefold` the set is passed through caseEnclose first;
    with `negated` the (possibly case-enclosed) set is inverted.
+/
CodepointSet getUnicodeSet(const scope char[] name, bool negated,  bool casefold) @safe
{
    CodepointSet result = unicode(name);
    //FIXME: caseEnclose for new uni as Set | CaseEnclose(SET && LC)
    if (casefold)
        result = caseEnclose(result);
    return negated ? result.inverted : result;
}
6253 
// Parser for bracketed character-class expressions such as `[a-z&&[^aeiou]]`.
//
// It wraps an input `range` and exposes it as a range of dchar itself, so the
// helper routines (parseUniHex, unicode.parseControlCode,
// unicode.parsePropertySpec) can consume input through the parser.
// All syntax errors are reported by throwing (via enforce).
struct UnicodeSetParser(Range)
{
    import std.exception : enforce;
    import std.typecons : tuple, Tuple;
    Range range;    // remaining input
    bool casefold_; // when set, characters are added with their simple case foldings

    @property bool empty(){ return range.empty; }
    @property dchar front(){ return range.front; }
    void popFront(){ range.popFront(); }

    //CodepointSet operations relatively in order of priority
    enum Operator:uint {
        Open = 0, Negate,  Difference, SymDifference, Intersection, Union, None
    }

    //parse unit of CodepointSet spec, most notably escape sequences and char ranges
    //also fetches next set operation
    //
    //Returns the set accumulated so far together with the operator that ended
    //the term: a twin-symbol operator (||, --, ~~, &&), Union when a nested '['
    //follows, or None when the term stopped in front of ']'.
    //On '[' and ']' the bracket itself is left unconsumed for parseSet.
    Tuple!(CodepointSet,Operator) parseCharTerm()
    {
        import std.range : drop;
        enum privateUseStart = '\U000F0000', privateUseEnd ='\U000FFFFD';
        enum State{ Start, Char, Escape, CharDash, CharDashEscape,
            PotentialTwinSymbolOperator }
        Operator op = Operator.None;
        dchar last;
        CodepointSet set;
        State state = State.Start;

        // the state machine below starts by inspecting `front`
        enforce(!empty, "unexpected end of CodepointSet");

        // adds a single code point, honouring the case-folding flag
        void addWithFlags(ref CodepointSet set, uint ch)
        {
            if (casefold_)
            {
                auto foldings = simpleCaseFoldings(ch);
                foreach (v; foldings)
                    set |= v;
            }
            else
                set |= ch;
        }

        // adds the inclusive range [first, lastIncl], honouring the
        // case-folding flag; throws on an inverted range
        void addRangeWithFlags(ref CodepointSet set, uint first, uint lastIncl)
        {
            enforce(first <= lastIncl, "inverted range");
            if (casefold_)
            {
                for (uint ch = first; ch <= lastIncl; ch++)
                    addWithFlags(set, ch);
            }
            else
                set.add(first, lastIncl + 1);
        }

        // maps the symbol of a doubled operator (e.g. "&&") to the operator
        static Operator twinSymbolOperator(dchar symbol)
        {
            switch (symbol)
            {
            case '|':
                return Operator.Union;
            case '-':
                return Operator.Difference;
            case '~':
                return Operator.SymDifference;
            case '&':
                return Operator.Intersection;
            default:
                assert(false);
            }
        }

        L_CharTermLoop:
        for (;;)
        {
            final switch (state)
            {
            case State.Start:
                switch (front)
                {
                case '|':
                case '-':
                case '~':
                case '&':
                    state = State.PotentialTwinSymbolOperator;
                    last = front;
                    break;
                case '[':
                    op = Operator.Union;
                    goto case;
                case ']':
                    break L_CharTermLoop;
                case '\\':
                    state = State.Escape;
                    break;
                default:
                    state = State.Char;
                    last = front;
                }
                break;
            case State.Char:
                // xxx last front xxx
                switch (front)
                {
                case '|':
                case '~':
                case '&':
                    // then last is treated as normal char and added as implicit union
                    state = State.PotentialTwinSymbolOperator;
                    addWithFlags(set, last);
                    last = front;
                    break;
                case '-': // still need more info
                    state = State.CharDash;
                    break;
                case '\\':
                    // the pending char is complete; add it like every other
                    // branch does, so case folding is not skipped here
                    addWithFlags(set, last);
                    state = State.Escape;
                    break;
                case '[':
                    op = Operator.Union;
                    goto case;
                case ']':
                    addWithFlags(set, last);
                    break L_CharTermLoop;
                default:
                    state = State.Char;
                    addWithFlags(set, last);
                    last = front;
                }
                break;
            case State.PotentialTwinSymbolOperator:
                // xxx last front xxxx
                // where last = [|-&~]
                if (front == last)
                {
                    op = twinSymbolOperator(last);
                    popFront();//skip second twin char
                    break L_CharTermLoop;
                }
                // a lone operator symbol is an ordinary character
                goto case State.Char;
            case State.Escape:
                // xxx \ front xxx
                switch (front)
                {
                case 'f':
                    last = '\f';
                    state = State.Char;
                    break;
                case 'n':
                    last = '\n';
                    state = State.Char;
                    break;
                case 'r':
                    last = '\r';
                    state = State.Char;
                    break;
                case 't':
                    last = '\t';
                    state = State.Char;
                    break;
                case 'v':
                    last = '\v';
                    state = State.Char;
                    break;
                case 'c':
                    last = unicode.parseControlCode(this);
                    state = State.Char;
                    break;
                // one case label per escapable character
                foreach (val; Escapables)
                {
                case val:
                }
                    last = front;
                    state = State.Char;
                    break;
                case 'p':
                    set.add(unicode.parsePropertySpec(this, false, casefold_));
                    state = State.Start;
                    continue L_CharTermLoop; //next char already fetched
                case 'P':
                    set.add(unicode.parsePropertySpec(this, true, casefold_));
                    state = State.Start;
                    continue L_CharTermLoop; //next char already fetched
                case 'x':
                    popFront();
                    last = parseUniHex(this, 2);
                    state = State.Char;
                    continue L_CharTermLoop;
                case 'u':
                    popFront();
                    last = parseUniHex(this, 4);
                    state = State.Char;
                    continue L_CharTermLoop;
                case 'U':
                    popFront();
                    last = parseUniHex(this, 8);
                    state = State.Char;
                    continue L_CharTermLoop;
                case 'd':
                    set.add(unicode.Nd);
                    state = State.Start;
                    break;
                case 'D':
                    set.add(unicode.Nd.inverted);
                    state = State.Start;
                    break;
                case 's':
                    set.add(unicode.White_Space);
                    state = State.Start;
                    break;
                case 'S':
                    set.add(unicode.White_Space.inverted);
                    state = State.Start;
                    break;
                case 'w':
                    set.add(wordCharacter);
                    state = State.Start;
                    break;
                case 'W':
                    set.add(wordCharacter.inverted);
                    state = State.Start;
                    break;
                default:
                    if (front >= privateUseStart && front <= privateUseEnd)
                        enforce(false, "no matching ']' found while parsing character class");
                    enforce(false, "invalid escape sequence");
                }
                break;
            case State.CharDash:
                // xxx last - front xxx
                switch (front)
                {
                case '[':
                    op = Operator.Union;
                    goto case;
                case ']':
                    //means dash is a single char not an interval specifier
                    addWithFlags(set, last);
                    addWithFlags(set, '-');
                    break L_CharTermLoop;
                 case '-'://set Difference again
                    addWithFlags(set, last);
                    op = Operator.Difference;
                    popFront();//skip '-'
                    break L_CharTermLoop;
                case '\\':
                    state = State.CharDashEscape;
                    break;
                default:
                    addRangeWithFlags(set, last, front);
                    state = State.Start;
                }
                break;
            case State.CharDashEscape:
            //xxx last - \ front xxx
                uint end;
                switch (front)
                {
                case 'f':
                    end = '\f';
                    break;
                case 'n':
                    end = '\n';
                    break;
                case 'r':
                    end = '\r';
                    break;
                case 't':
                    end = '\t';
                    break;
                case 'v':
                    end = '\v';
                    break;
                // one case label per escapable character
                foreach (val; Escapables)
                {
                case val:
                }
                    end = front;
                    break;
                case 'c':
                    end = unicode.parseControlCode(this);
                    break;
                case 'x':
                    popFront();
                    end = parseUniHex(this, 2);
                    addRangeWithFlags(set, last, end);
                    state = State.Start;
                    continue L_CharTermLoop;
                case 'u':
                    popFront();
                    end = parseUniHex(this, 4);
                    addRangeWithFlags(set, last, end);
                    state = State.Start;
                    continue L_CharTermLoop;
                case 'U':
                    popFront();
                    end = parseUniHex(this, 8);
                    addRangeWithFlags(set, last, end);
                    state = State.Start;
                    continue L_CharTermLoop;
                default:
                    if (front >= privateUseStart && front <= privateUseEnd)
                        enforce(false, "no matching ']' found while parsing character class");
                    enforce(false, "invalid escape sequence");
                }
                // Lookahead to check if it's a \T
                // where T is sub-pattern terminator in multi-pattern scheme
                auto lookahead = range.save.drop(1);
                if (end == '\\' && !lookahead.empty)
                {
                    if (lookahead.front >= privateUseStart && lookahead.front <= privateUseEnd)
                        enforce(false, "no matching ']' found while parsing character class");
                }
                addRangeWithFlags(set, last, end);
                state = State.Start;
                break;
            }
            popFront();
            enforce(!empty, "unexpected end of CodepointSet");
        }
        return tuple(set, op);
    }

    alias ValStack = Stack!(CodepointSet);
    alias OpStack = Stack!(Operator);

    // Parses one complete bracketed set expression starting at '[' and
    // returns the resulting set. Operands go to a value stack and pending
    // operators to an operator stack; they are combined when a ']' closes
    // the current group. Parsing stops right after the ']' that closes the
    // outermost '['. Throws on malformed or truncated input.
    CodepointSet parseSet()
    {
        ValStack vstack;
        OpStack opstack;
        import std.functional : unaryFun;
        enforce(!empty, "unexpected end of input");
        enforce(front == '[', "expected '[' at the start of unicode set");
        // applies `op` to the top of `stack`;
        // returns false if `op` is not an applicable operator (Open, None)
        static bool apply(Operator op, ref ValStack stack)
        {
            switch (op)
            {
            case Operator.Negate:
                enforce(!stack.empty, "no operand for '^'");
                stack.top = stack.top.inverted;
                break;
            case Operator.Union:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '||'");
                stack.top.add(s);
                break;
            case Operator.Difference:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '--'");
                stack.top.sub(s);
                break;
            case Operator.SymDifference:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '~~'");
                stack.top ~= s;
                break;
            case Operator.Intersection:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '&&'");
                stack.top.intersect(s);
                break;
            default:
                return false;
            }
            return true;
        }
        // pops and applies operators while `cond` holds for the top one;
        // returns false on a syntax error or if the operator stack runs dry
        static bool unrollWhile(alias cond)(ref ValStack vstack, ref OpStack opstack)
        {
            while (cond(opstack.top))
            {
                if (!apply(opstack.pop(),vstack))
                    return false;//syntax error
                if (opstack.empty)
                    return false;
            }
            return true;
        }

        L_CharsetLoop:
        do
        {
            // a group is still open here, so running out of input is an
            // error; without this check `front` is read on an empty range
            enforce(!empty, "unexpected end of character class");
            switch (front)
            {
            case '[':
                opstack.push(Operator.Open);
                popFront();
                enforce(!empty, "unexpected end of character class");
                if (front == '^')
                {
                    opstack.push(Operator.Negate);
                    popFront();
                    enforce(!empty, "unexpected end of character class");
                }
                else if (front == ']') // []...] is special cased
                {
                    popFront();
                    enforce(!empty, "wrong character set");
                    auto pair = parseCharTerm();
                    pair[0].add(']', ']'+1);
                    if (pair[1] != Operator.None)
                    {
                        if (opstack.top == Operator.Union)
                            unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
                        opstack.push(pair[1]);
                    }
                    vstack.push(pair[0]);
                }
                break;
            case ']':
                enforce(unrollWhile!(unaryFun!"a != a.Open")(vstack, opstack),
                    "character class syntax error");
                enforce(!opstack.empty, "unmatched ']'");
                opstack.pop();
                popFront();
                if (opstack.empty)
                    break L_CharsetLoop;
                // an enclosing group is still open: more input must follow
                enforce(!empty, "unexpected end of character class");
                auto pair  = parseCharTerm();
                if (!pair[0].empty)//not only operator e.g. -- or ~~
                {
                    vstack.top.add(pair[0]);//apply union
                }
                if (pair[1] != Operator.None)
                {
                    if (opstack.top == Operator.Union)
                        unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
                    opstack.push(pair[1]);
                }
                break;
            //
            default://yet another pair of term(op)?
                auto pair = parseCharTerm();
                if (pair[1] != Operator.None)
                {
                    if (opstack.top == Operator.Union)
                        unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
                    opstack.push(pair[1]);
                }
                vstack.push(pair[0]);
            }

        }while (!empty || !opstack.empty);
        while (!opstack.empty)
            apply(opstack.pop(),vstack);
        assert(vstack.length == 1);
        return vstack.top;
    }
}
6700 
6701 /**
6702     A single entry point to lookup Unicode $(CODEPOINT) sets by name or alias of
6703     a block, script or general category.
6704 
6705     It uses well defined standard rules of property name lookup.
6706     This includes fuzzy matching of names, so that
6707     'White_Space', 'white-SpAce' and 'whitespace' are all considered equal
6708     and yield the same set of white space $(CHARACTERS).
6709 */
6710 @safe public struct unicode
6711 {
6712     import std.exception : enforce;
6713     /**
6714         Performs the lookup of set of $(CODEPOINTS)
6715         with compile-time correctness checking.
6716         This short-cut version combines 3 searches:
6717         across blocks, scripts, and common binary properties.
6718 
6719         Note that since scripts and blocks overlap the
6720         usual trick to disambiguate is used - to get a block use
6721         `unicode.InBlockName`, to search a script
6722         use `unicode.ScriptName`.
6723 
6724         See_Also: $(LREF block), $(LREF script)
6725         and (not included in this search) $(LREF hangulSyllableType).
6726     */
6727 
6728     static @property auto opDispatch(string name)() pure
6729     {
6730         static if (findAny(name))
6731             return loadAny(name);
6732         else
6733             static assert(false, "No unicode set by name "~name~" was found.");
6734     }
6735 
6736     ///
    @safe unittest
    {
        import std.exception : collectException;
        // compile-time checked lookup through opDispatch
        auto ascii = unicode.ASCII;
        assert(ascii['A']);
        assert(ascii['~']);
        // U+00E0 is outside of the ASCII set
        assert(!ascii['\u00e0']);
        // matching is case-insensitive
        assert(ascii == unicode.ascII);
        assert(!ascii['à']);
        // underscores, '-' and whitespace in names are ignored too
        auto latin = unicode.in_latin1_Supplement;
        assert(latin['à']);
        assert(!latin['$']);
        // BTW Latin 1 Supplement is a block, hence "In" prefix
        assert(latin == unicode("In Latin 1 Supplement"));
        // run-time look up throws if no such set is found
        assert(collectException(unicode("InCyrilliac")));
    }
6756 
    /**
        The same lookup across blocks, scripts, or binary properties,
        but performed at run-time.
        This version is provided for cases where `name`
        is not known beforehand; otherwise the compile-time
        checked $(LREF opDispatch) is typically a better choice.

        See the $(S_LINK Unicode properties, table of properties) for available
        sets.

        Params:
            name = name of the set to look up; any character type
                   implicitly convertible to `dchar` is accepted

        Throws: an exception if no set by that name is found.
    */
    static auto opCall(C)(const scope C[] name)
        if (is(C : dchar))
    {
        return loadAny(name);
    }
6772 
6773     /**
6774         Narrows down the search for sets of $(CODEPOINTS) to all Unicode blocks.
6775 
6776         Note:
6777         Here block names are unambiguous as no scripts are searched
6778         and thus to search use simply `unicode.block.BlockName` notation.
6779 
6780         See $(S_LINK Unicode properties, table of properties) for available sets.
6781         See_Also: $(S_LINK Unicode properties, table of properties).
6782     */
    struct block
    {
        import std.internal.unicode_tables : blocks; // generated file
        // provides the run-time opCall and the compile-time opDispatch
        // lookups, restricted to the table of blocks
        mixin SetSearcher!(blocks.tab, "block");
    }
6788 
6789     ///
    @safe unittest
    {
        // use .block for explicitness; the general lookup needs the "In" prefix
        assert(unicode.block.Greek_and_Coptic == unicode.InGreek_and_Coptic);
    }
6795 
6796     /**
6797         Narrows down the search for sets of $(CODEPOINTS) to all Unicode scripts.
6798 
6799         See the $(S_LINK Unicode properties, table of properties) for available
6800         sets.
6801     */
    struct script
    {
        import std.internal.unicode_tables : scripts; // generated file
        // provides the run-time opCall and the compile-time opDispatch
        // lookups, restricted to the table of scripts
        mixin SetSearcher!(scripts.tab, "script");
    }
6807 
6808     ///
    @safe unittest
    {
        // the same name looked up as a script and as a block
        auto arabicScript = unicode.script.arabic;
        auto arabicBlock = unicode.block.arabic;
        // there is an intersection between script and block
        assert(arabicBlock['؁']);
        assert(arabicScript['؁']);
        // but they are different
        assert(arabicBlock != arabicScript);
        // in the general lookup the "in" prefix selects the block,
        // the bare name selects the script
        assert(arabicBlock == unicode.inArabic);
        assert(arabicScript == unicode.arabic);
    }
6821 
6822     /**
6823         Fetch a set of $(CODEPOINTS) that have the given hangul syllable type.
6824 
6825         Other non-binary properties (once supported) follow the same
6826         notation - `unicode.propertyName.propertyValue` for compile-time
6827         checked access and `unicode.propertyName(propertyValue)`
6828         for run-time checked one.
6829 
6830         See the $(S_LINK Unicode properties, table of properties) for available
6831         sets.
6832     */
    struct hangulSyllableType
    {
        import std.internal.unicode_tables : hangul; // generated file
        // provides the run-time opCall and the compile-time opDispatch
        // lookups, restricted to the table of hangul syllable types
        mixin SetSearcher!(hangul.tab, "hangul syllable type");
    }
6838 
6839     ///
    @safe unittest
    {
        // L here is syllable type not Letter as in unicode.L short-cut
        // NOTE(review): in the Unicode standard Hangul_Syllable_Type=L is
        // "Leading_Jamo", so "leadingVowel" looks like a misnomer - confirm
        auto leadingVowel = unicode.hangulSyllableType("L");
        // check that the code points from U+1110 up to (excluding) U+115F are present
        foreach (vowel; '\u1110'..'\u115F')
            assert(leadingVowel[vowel]);
        // run-time and compile-time lookups agree
        assert(leadingVowel == unicode.hangulSyllableType.L);
    }
6849 
6850     //parse control code of form \cXXX, c assumed to be the current symbol
6851     static package(std) dchar parseControlCode(Parser)(ref Parser p)
6852     {
6853         with(p)
6854         {
6855             popFront();
6856             enforce(!empty, "Unfinished escape sequence");
6857             enforce(('a' <= front && front <= 'z')
6858                 || ('A' <= front && front <= 'Z'),
6859             "Only letters are allowed after \\c");
6860             return front & 0x1f;
6861         }
6862     }
6863 
6864     //parse and return a CodepointSet for \p{...Property...} and \P{...Property..},
6865     //\ - assumed to be processed, p - is current
6866     static package(std) CodepointSet parsePropertySpec(Range)(ref Range p,
6867         bool negated, bool casefold)
6868     {
6869         static import std.ascii;
6870         with(p)
6871         {
6872             enum MAX_PROPERTY = 128;
6873             char[MAX_PROPERTY] result;
6874             uint k = 0;
6875             popFront();
6876             enforce(!empty, "eof parsing unicode property spec");
6877             if (front == '{')
6878             {
6879                 popFront();
6880                 while (k < MAX_PROPERTY && !empty && front !='}'
6881                     && front !=':')
6882                 {
6883                     if (front != '-' && front != ' ' && front != '_')
6884                         result[k++] = cast(char) std.ascii.toLower(front);
6885                     popFront();
6886                 }
6887                 enforce(k != MAX_PROPERTY, "invalid property name");
6888                 enforce(front == '}', "} expected ");
6889             }
6890             else
6891             {//single char properties e.g.: \pL, \pN ...
6892                 enforce(front < 0x80, "invalid property name");
6893                 result[k++] = cast(char) front;
6894             }
6895             auto s = getUnicodeSet(result[0 .. k], negated, casefold);
6896             enforce(!s.empty, "unrecognized unicode property spec");
6897             popFront();
6898             return s;
6899         }
6900     }
6901 
6902     /**
6903         Parse unicode codepoint set from given `range` using standard regex
6904         syntax '[...]'. The range is advanced skiping over regex set definition.
6905         `casefold` parameter determines if the set should be casefolded - that is
6906         include both lower and upper case versions for any letters in the set.
6907     */
6908     static CodepointSet parseSet(Range)(ref Range range, bool casefold=false)
6909     if (isInputRange!Range && is(ElementType!Range : dchar))
6910     {
6911         auto usParser = UnicodeSetParser!Range(range, casefold);
6912         auto set = usParser.parseSet();
6913         range = usParser.range;
6914         return set;
6915     }
6916 
    ///
    @safe unittest
    {
        import std.uni : unicode;
        string pat = "[a-zA-Z0-9]hello";
        auto set = unicode.parseSet(pat);
        // check some of the codepoints
        assert(set['a'] && set['A'] && set['9']);
        // the set definition has been consumed, only the tail is left
        assert(pat == "hello");
    }
6927 
6928 private:
6929     alias ucmp = comparePropertyName;
6930 
6931     static bool findAny(string name)
6932     {
6933         import std.internal.unicode_tables : blocks, scripts, uniProps; // generated file
6934         return isPrettyPropertyName(name)
6935             || findSetName!(uniProps.tab)(name) || findSetName!(scripts.tab)(name)
6936             || (ucmp(name[0 .. 2],"In") == 0 && findSetName!(blocks.tab)(name[2..$]));
6937     }
6938 
6939     static auto loadAny(Set=CodepointSet, C)(const scope C[] name) pure
6940     {
6941         import std.conv : to;
6942         import std.internal.unicode_tables : blocks, scripts; // generated file
6943         Set set;
6944         immutable loaded = loadProperty(name, set) || loadUnicodeSet!(scripts.tab)(name, set)
6945             || (name.length > 2 && ucmp(name[0 .. 2],"In") == 0
6946                 && loadUnicodeSet!(blocks.tab)(name[2..$], set));
6947         if (loaded)
6948             return set;
6949         throw new Exception("No unicode set by name "~name.to!string()~" was found.");
6950     }
6951 
6952     // FIXME: re-disable once the compiler is fixed
6953     // Disabled to prevent the mistake of creating instances of this pseudo-struct.
6954     //@disable ~this();
6955 }
6956 
// Run-time lookup by name through unicode(...) against the generated tables.
@safe unittest
{
    import std.internal.unicode_tables : blocks, uniProps; // generated file
    // block lookup via the "In" prefix
    assert(unicode("InHebrew") == asSet(blocks.Hebrew));
    // "separator" resolves to the union of the three separator categories
    assert(unicode("separator") == (asSet(uniProps.Zs) | asSet(uniProps.Zl) | asSet(uniProps.Zp)));
    // a '-' inside the name does not prevent the match
    assert(unicode("In-Kharoshthi") == asSet(blocks.Kharoshthi));
}
6964 
6965 enum EMPTY_CASE_TRIE = ushort.max;// from what gen_uni uses internally
6966 
6967 // TODO: redo the most of hangul stuff algorithmically in case of Graphemes too
6968 // Use combined trie instead of checking for '\r' | '\n' | ccTrie,
6969 //   or extend | '\u200D' separately
6970 
// True if `ch` lies in the regional indicator symbol range
// U+1F1E6 .. U+1F1FF (both ends inclusive).
private static bool isRegionalIndicator(dchar ch) @safe pure @nogc nothrow
{
    enum dchar firstIndicator = '\U0001F1E6';
    enum dchar lastIndicator = '\U0001F1FF';
    return firstIndicator <= ch && ch <= lastIndicator;
}
6975 
// Our grapheme decoder is a state machine, this is list of all possible
// states before each code point.
// The members double as indices into `graphemeTransforms`.
private enum GraphemeState
{
    Start,    // nothing of the current cluster has been examined yet
    CR,       // the previous code point was '\r'
    RI,       // exactly one regional indicator has been seen
    L,        // inside a Hangul sequence, after a leading jamo
    V,        // inside a Hangul sequence, after an LV syllable or a vowel jamo
    LVT,      // inside a Hangul sequence, after an LVT syllable or a trailing jamo
    Emoji,    // inside an emoji sequence, previous code point was not a ZWJ
    EmojiZWJ, // inside an emoji sequence, previous code point was a ZWJ
    Prepend,  // the previous code point was a Prepend character
    End       // only extenders, ZWJ and spacing marks may still join the cluster
}
6991 
// Result of a state transition: tells the decoder whether the end of the
// grapheme has been reached and what to do with the current code point.
private enum TransformRes
{
    // No, unless the source range ends here
    // (GB2 - break at end of text, unless text is empty)
    // The current code point is consumed as part of the grapheme.
    goOn,
    redo, // Run last character again with new state
    retInclude, // Yes, after the just iterated character
    retExclude // Yes, before the just iterated character
}
7002 
// The logic of the grapheme decoding is all here
// GB# means Grapheme Breaking rule number # - see Unicode standard annex #29
// Note, getting GB1 (break at start of text, unless text is empty) right
// relies on the user starting grapheme walking from beginning of the text, and
// not attempting to walk an empty text.
//
// The table is indexed by the current GraphemeState. Each handler receives
// the state by reference (and may change it) plus the code point under
// inspection, and answers with a TransformRes telling the decoder whether
// the code point belongs to the current grapheme.
private enum TransformRes
    function(ref GraphemeState, dchar) @safe pure nothrow @nogc [] graphemeTransforms =
[
    GraphemeState.Start: (ref state, ch)
    {
        // GB4. Break after controls.
        if (graphemeControlTrie[ch] || ch == '\n')
            return TransformRes.retInclude;

        with (GraphemeState) state =
            ch == '\r' ? CR :
            isRegionalIndicator(ch) ? RI :
            isHangL(ch) ? L :
            hangLV[ch] || isHangV(ch) ? V :
            hangLVT[ch] || isHangT(ch) ? LVT :
            prependTrie[ch] ? Prepend :
            xpictoTrie[ch] ? Emoji :
            End;

        // No matter what we encountered, we always include the
        // first code point in the grapheme.
        return TransformRes.goOn;
    },

    // GB3, GB4. Do not break between a CR and LF.
    // Otherwise, break after controls.
    GraphemeState.CR: (ref state, ch) => ch == '\n' ?
        TransformRes.retInclude :
        TransformRes.retExclude,

    // GB12 - GB13. Do not break within emoji flag sequences.
    // That is, do not break between regional indicator (RI) symbols if
    // there is an odd number of RI characters before the break point.
    // This state applies if one and only one RI code point has been
    // encountered.
    GraphemeState.RI: (ref state, ch)
    {
        state = GraphemeState.End;

        return isRegionalIndicator(ch) ?
            TransformRes.goOn :
            TransformRes.redo;
    },

    // GB6. Do not break Hangul syllable sequences.
    GraphemeState.L: (ref state, ch)
    {
        if (isHangL(ch))
            return TransformRes.goOn;
        else if (isHangV(ch) || hangLV[ch])
        {
            state = GraphemeState.V;
            return TransformRes.goOn;
        }
        else if (hangLVT[ch])
        {
            state = GraphemeState.LVT;
            return TransformRes.goOn;
        }

        state = GraphemeState.End;
        return TransformRes.redo;
    },

    // GB7. Do not break Hangul syllable sequences.
    GraphemeState.V: (ref state, ch)
    {
        if (isHangV(ch))
            return TransformRes.goOn;
        else if (isHangT(ch))
        {
            state = GraphemeState.LVT;
            return TransformRes.goOn;
        }

        state = GraphemeState.End;
        return TransformRes.redo;
    },

    // GB8. Do not break Hangul syllable sequences.
    GraphemeState.LVT: (ref state, ch)
    {
        if (isHangT(ch))
            return TransformRes.goOn;

        state = GraphemeState.End;
        return TransformRes.redo;
    },

    // GB11. Do not break within emoji modifier sequences or emoji
    // zwj sequences. This state applies when the last code point was
    // NOT a ZWJ.
    GraphemeState.Emoji: (ref state, ch)
    {
        if (graphemeExtendTrie[ch])
            return TransformRes.goOn;

        static assert(!graphemeExtendTrie['\u200D']);

        if (ch == '\u200D')
        {
            state = GraphemeState.EmojiZWJ;
            return TransformRes.goOn;
        }

        state = GraphemeState.End;
        // There might still be spacing marks at the end,
        // which are not allowed in the middle of emoji sequences
        return TransformRes.redo;
    },

    // GB11. Do not break within emoji modifier sequences or emoji
    // zwj sequences. This state applies when the last code point was
    // a ZWJ.
    GraphemeState.EmojiZWJ: (ref state, ch)
    {
        if (xpictoTrie[ch])
        {
            state = GraphemeState.Emoji;
            return TransformRes.goOn;
        }

        // GB11 only joins `ExtPict Extend* ZWJ x ExtPict`. A ZWJ that is
        // followed by anything else terminates the emoji sequence for good:
        // whatever trails it can merely extend the cluster (GB9, GB9a), and
        // a later ZWJ must not glue another pictograph onto this cluster.
        state = GraphemeState.End;
        return TransformRes.redo;
    },

    // GB9b. Do not break after Prepend characters.
    GraphemeState.Prepend: (ref state, ch)
    {
        // GB5. Break before controls.
        if (graphemeControlTrie[ch] || ch == '\r' || ch == '\n')
            return TransformRes.retExclude;

        // Anything else is handled as if it were the first code point,
        // so that it gets included and picks the follow-up state.
        state = GraphemeState.Start;
        return TransformRes.redo;
    },

    // GB9, GB9a. Do not break before extending characters, ZWJ
    // or SpacingMarks.
    // GB999. Otherwise, break everywhere.
    GraphemeState.End: (ref state, ch)
        => !graphemeExtendTrie[ch] && !spacingMarkTrie[ch] && ch != '\u200D' ?
            TransformRes.retExclude :
            TransformRes.goOn
];
7150 
// Drives the grapheme state machine over `range`, consuming exactly one
// grapheme cluster from its front.
// With `getValue == true` the consumed code points are collected and
// returned as a Grapheme; otherwise the function only advances `range`
// and returns nothing.
// The range must not be empty on entry.
template genericDecodeGrapheme(bool getValue)
{
    static if (getValue)
        alias Value = Grapheme;
    else
        alias Value = void;

    Value genericDecodeGrapheme(Input)(ref Input range)
    {
        static if (getValue)
            Grapheme cluster;
        auto state = GraphemeState.Start;

        assert(!range.empty, "Attempting to decode grapheme from an empty " ~ Input.stringof);
        while (!range.empty)
        {
            immutable dchar cp = range.front;

            // A handler answering `redo` has only switched the state; feed
            // it the very same code point again until a verdict is reached.
            TransformRes verdict;
            do
                verdict = graphemeTransforms[state](state, cp);
            while (verdict == TransformRes.redo);

            // the cluster ends in front of this code point
            if (verdict == TransformRes.retExclude)
                break;

            // both remaining verdicts make the code point part of the cluster
            static if (getValue)
                cluster ~= cp;
            range.popFront();

            // the cluster ends right after this code point
            if (verdict == TransformRes.retInclude)
                break;
        }

        static if (getValue)
            return cluster;
    }
}
7199 
7200 public: // Public API continues
7201 
/++
    Computes the length of grapheme cluster starting at `index`.
    Both the resulting length and the `index` are measured
    in $(S_LINK Code unit, code units).

    Params:
        C = code unit type, implicitly convertible to `dchar`
        input = array of code units to look for the cluster in
        index = starting index into `input[]`; there has to be at least
            one code unit at or after it

    Returns:
        length of grapheme cluster
+/
size_t graphemeStride(C)(const scope C[] input, size_t index) @safe pure
if (is(C : dchar))
{
    auto rest = input[index..$];
    immutable before = rest.length;
    // the decoder pops exactly one cluster off the front of `rest`
    genericDecodeGrapheme!(false)(rest);
    return before - rest.length;
}
7223 
///
@safe unittest
{
    assert(graphemeStride("  ", 1) == 1);
    // A + combining ring above
    string city = "A\u030Arhus";
    size_t first = graphemeStride(city, 0);
    assert(first == 3); //\u030A has 2 UTF-8 code units
    assert(city[0 .. first] == "A\u030A");
    assert(city[first..$] == "rhus");
}
7235 
@safe unittest
{
    // Ensure that graphemeStride is usable from CTFE.
    // (initializing an enum forces compile-time evaluation)
    enum c1 = graphemeStride("A", 0);
    static assert(c1 == 1);

    enum c2 = graphemeStride("A\u0301", 0);
    static assert(c2 == 3); // \u0301 has 2 UTF-8 code units
}
7245 
// TODO: make this @nogc. Probably no big deal since the state machine is
// already GC-free.
// Emoji sequence and Prepend handling; the inputs are dstrings, so every
// stride below is a count of code points.
@safe pure nothrow unittest
{
    // grinning face ~ emoji modifier fitzpatrick type-5 ~ grinning face
    assert(graphemeStride("\U0001F600\U0001f3FE\U0001F600"d, 0) == 2);
    // skier ~ female sign ~ '€'
    assert(graphemeStride("\u26F7\u2640€"d, 0) == 1);
    // skier ~ emoji modifier fitzpatrick type-5 ~ female sign ~ '€'
    assert(graphemeStride("\u26F7\U0001f3FE\u2640€"d, 0) == 2);
    // skier ~ zero-width joiner ~ female sign ~ '€'
    assert(graphemeStride("\u26F7\u200D\u2640€"d, 0) == 3);
    // skier ~ emoji modifier fitzpatrick type-5 ~ zero-width joiner
    // ~ female sign ~ '€'
    assert(graphemeStride("\u26F7\U0001f3FE\u200D\u2640€"d, 0) == 4);
    // skier ~ zero-width joiner ~ '€'
    assert(graphemeStride("\u26F7\u200D€"d, 0) == 2);
    //'€' ~ zero-width joiner ~ skier
    assert(graphemeStride("€\u200D\u26F7"d, 0) == 2);
    // Kaithi number sign ~ Devanagari digit four ~ Devanagari digit two
    assert(graphemeStride("\U000110BD\u096A\u0968"d, 0) == 2);
    // Kaithi number sign ~ null
    assert(graphemeStride("\U000110BD\0"d, 0) == 1);
}
7270 
/++
    Reads one full grapheme cluster from an
    $(REF_ALTTEXT input range, isInputRange, std,range,primitives) of dchar `inp`.

    For examples see the $(LREF Grapheme) below.

    Params:
        inp = non-empty input range of `dchar` to read the cluster from

    Returns:
        The $(LREF Grapheme) that was read; `inp` is left positioned
        right after it.

    Note:
    This function modifies `inp` and thus `inp`
    must be an L-value.
+/
Grapheme decodeGrapheme(Input)(ref Input inp)
if (isInputRange!Input && is(immutable ElementType!Input == immutable dchar))
{
    // `true` selects the flavour of the decoder that collects the code points
    return genericDecodeGrapheme!true(inp);
}
7286 
@safe unittest
{
    import std.algorithm.comparison : equal;

    Grapheme gr;
    // a lone space, then a space carrying a combining diaeresis
    string s = " \u0020\u0308 ";
    gr = decodeGrapheme(s);
    assert(gr.length == 1 && gr[0] == ' ');
    gr = decodeGrapheme(s);
    assert(gr.length == 2 && equal(gr[0 .. 2], " \u0308"));
    // combining marks at the very start form a cluster of their own
    s = "\u0300\u0308\u1100";
    assert(equal(decodeGrapheme(s)[], "\u0300\u0308"));
    assert(equal(decodeGrapheme(s)[], "\u1100"));
    // a combining mark ends the Hangul sequence, the syllable starts anew
    s = "\u11A8\u0308\uAC01";
    assert(equal(decodeGrapheme(s)[], "\u11A8\u0308"));
    assert(equal(decodeGrapheme(s)[], "\uAC01"));

    // Two Union Jacks of the Great Britain
    s = "\U0001F1EC\U0001F1E7\U0001F1EC\U0001F1E7";
    // regional indicators pair up, so only the first flag is decoded
    assert(equal(decodeGrapheme(s)[], "\U0001F1EC\U0001F1E7"));
}
7308 
/++
    $(P Iterate a string by $(LREF Grapheme).)

    $(P Useful for doing string manipulation that needs to be aware
    of graphemes.)

    $(P The result is an input range of $(LREF Grapheme); it is a forward
    range whenever `range` is one.)

    See_Also:
        $(LREF byCodePoint)
+/
auto byGrapheme(Range)(Range range)
if (isInputRange!Range && is(immutable ElementType!Range == immutable dchar))
{
    // TODO: Bidirectional access
    static struct Result(R)
    {
        private R _range;
        // The cluster decoded ahead of time; a zero-length Grapheme
        // signals that the source has been exhausted.
        private Grapheme _front;

        bool empty() @property
        {
            return !_front.length;
        }

        Grapheme front() @property
        {
            return _front;
        }

        void popFront()
        {
            if (_range.empty)
                _front = Grapheme.init;
            else
                _front = _range.decodeGrapheme();
        }

        static if (isForwardRange!R)
        {
            Result save() @property
            {
                return Result(_range.save, _front);
            }
        }
    }

    auto result = Result!(Range)(range);
    // prime the range so that `front` holds the first cluster
    result.popFront();
    return result;
}
7355 
///
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.range.primitives : walkLength;
    import std.range : take, drop;
    auto text = "noe\u0308l"; // noël using e + combining diaeresis
    assert(text.walkLength == 5); // 5 code points

    auto gText = text.byGrapheme;
    assert(gText.walkLength == 4); // 4 graphemes

    // slicing by grapheme keeps the e and its diaeresis together
    assert(gText.take(3).equal("noe\u0308".byGrapheme));
    assert(gText.drop(3).equal("l".byGrapheme));
}
7371 
// For testing non-forward-range input ranges: wraps a string and exposes
// nothing beyond the three input range primitives (notably no `save`).
version (StdUnittest)
private static @safe struct InputRangeString
{
    private string s;

    @property bool empty()
    {
        return s.empty;
    }

    @property dchar front()
    {
        return s.front;
    }

    void popFront()
    {
        s.popFront();
    }
}
7382 
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.array : array;
    import std.range : retro;
    import std.range.primitives : walkLength;
    // an empty source yields an empty range of graphemes
    assert("".byGrapheme.walkLength == 0);

    auto reverse = "le\u0308on";
    assert(reverse.walkLength == 5);

    auto gReverse = reverse.byGrapheme;
    assert(gReverse.walkLength == 4);

    // all three string widths behave alike
    static foreach (text; AliasSeq!("noe\u0308l"c, "noe\u0308l"w, "noe\u0308l"d))
    {{
        assert(text.walkLength == 5);
        static assert(isForwardRange!(typeof(text)));

        auto gText = text.byGrapheme;
        static assert(isForwardRange!(typeof(gText)));
        assert(gText.walkLength == 4);
        assert(gText.array.retro.equal(gReverse));
    }}

    // forward-range capability must not be claimed for a plain input range
    auto nonForwardRange = InputRangeString("noe\u0308l").byGrapheme;
    static assert(!isForwardRange!(typeof(nonForwardRange)));
    assert(nonForwardRange.walkLength == 4);
}
7412 
// Issue 23474
// A combining mark must not attach to a preceding CR: only LF may follow
// a CR within one cluster.
@safe pure unittest
{
    import std.range.primitives : walkLength;
    assert(byGrapheme("\r\u0308").walkLength == 2);
}
7419 
/++
    $(P Lazily transform a range of $(LREF Grapheme)s to a range of code points.)

    $(P Useful for converting the result to a string after doing operations
    on graphemes.)

    $(P If passed in a range of code points, returns a range with equivalent capabilities.)
+/
auto byCodePoint(Range)(Range range)
if (isInputRange!Range && is(immutable ElementType!Range == immutable Grapheme))
{
    // TODO: Propagate bidirectional access
    static struct Result
    {
        private Range _range;
        // index of the current code point within `_range.front`
        private size_t _pos = 0;

        bool empty() @property
        {
            return _range.empty;
        }

        dchar front() @property
        {
            return _range.front[_pos];
        }

        void popFront()
        {
            // still inside the current grapheme?
            if (++_pos < _range.front.length)
                return;

            // otherwise move on to the start of the next one
            _range.popFront();
            _pos = 0;
        }

        static if (isForwardRange!Range)
        {
            Result save() @property
            {
                return Result(_range.save, _pos);
            }
        }
    }

    return Result(range);
}
7469 
/// Ditto
auto byCodePoint(Range)(Range range)
if (isInputRange!Range && is(immutable ElementType!Range == immutable dchar))
{
    import std.range.primitives : isBidirectionalRange, popBack;
    import std.traits : isNarrowString;
    static if (!isNarrowString!Range)
    {
        // already a range of code points with nothing to hide
        return range;
    }
    else
    {
        // Narrow strings get wrapped so that the result exposes only the
        // bidirectional range interface and nothing else of the array.
        static struct Result
        {
            private Range _range;

            @property bool empty()
            {
                return _range.empty;
            }

            @property dchar front()
            {
                return _range.front;
            }

            void popFront()
            {
                _range.popFront;
            }

            @property auto save()
            {
                return Result(_range.save);
            }

            @property dchar back()
            {
                return _range.back;
            }

            void popBack()
            {
                _range.popBack;
            }
        }
        static assert(isBidirectionalRange!(Result));
        return Result(range);
    }
}
7494 
///
@safe unittest
{
    import std.array : array;
    import std.conv : text;
    import std.range : retro;

    string s = "noe\u0308l"; // noël

    // reverse it and convert the result to a string
    // (reversing by grapheme keeps the diaeresis on the e)
    string reverse = s.byGrapheme
        .array
        .retro
        .byCodePoint
        .text;

    assert(reverse == "le\u0308on"); // lëon
}
7513 
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.range.primitives : walkLength;
    import std.range : retro;
    assert("".byGrapheme.byCodePoint.equal(""));

    string text = "noe\u0308l";
    // the wrapper around a narrow string must not offer `length`
    static assert(!__traits(compiles, "noe\u0308l".byCodePoint.length));

    // input-only source: neither stage may claim to be a forward range
    auto gText = InputRangeString(text).byGrapheme;
    static assert(!isForwardRange!(typeof(gText)));

    auto cpText = gText.byCodePoint;
    static assert(!isForwardRange!(typeof(cpText)));

    assert(cpText.walkLength == text.walkLength);

    // code point overload on a narrow string: forward and bidirectional
    auto plainCp = text.byCodePoint;
    static assert(isForwardRange!(typeof(plainCp)));
    assert(equal(plainCp, text));
    assert(equal(retro(plainCp.save), retro(text.save)));
    // Check that we still have length for dstring
    assert("абвгд"d.byCodePoint.length == 5);
}
7539 
/++
    $(P A structure designed to effectively pack $(CHARACTERS)
    of a $(CLUSTER).
    )

    $(P `Grapheme` has value semantics so 2 copies of a `Grapheme`
    always refer to distinct objects. In most actual scenarios a `Grapheme`
    fits on the stack and avoids memory allocation overhead for all but quite
    long clusters.
    )

    $(P Two `Grapheme`s compare equal (and hash alike) if and only if they
    hold the same sequence of $(CODEPOINTS).
    )

    See_Also: $(LREF decodeGrapheme), $(LREF graphemeStride)
+/
@safe struct Grapheme
{
    import std.exception : enforce;
    import std.traits : isDynamicArray;

public:
    /// Ctor
    this(C)(const scope C[] chars...)
        if (is(C : dchar))
    {
        this ~= chars;
    }

    ///ditto
    this(Input)(Input seq)
        if (!isDynamicArray!Input
            && isInputRange!Input && is(ElementType!Input : dchar))
    {
        this ~= seq;
    }

    /// Gets a $(CODEPOINT) at the given index in this cluster.
    dchar opIndex(size_t index) const @nogc nothrow pure @trusted
    {
        assert(index < length);
        return read24(isBig ? ptr_ : small_.ptr, index);
    }

    /++
        Writes a $(CODEPOINT) `ch` at given index in this cluster.

        Warning:
        Use of this facility may invalidate grapheme cluster,
        see also $(LREF Grapheme.valid).
    +/
    void opIndexAssign(dchar ch, size_t index) @nogc nothrow pure @trusted
    {
        assert(index < length);
        write24(isBig ? ptr_ : small_.ptr, ch, index);
    }

    ///
    @safe unittest
    {
        auto g = Grapheme("A\u0302");
        assert(g[0] == 'A');
        assert(g.valid);
        g[1] = '~'; // ASCII tilde is not a combining mark
        assert(g[1] == '~');
        assert(!g.valid);
    }

    /++
        Random-access range over Grapheme's $(CHARACTERS).

        Warning: Invalidates when this Grapheme leaves the scope,
        attempts to use it then would lead to memory corruption.
    +/
    SliceOverIndexed!Grapheme opSlice(size_t a, size_t b) @nogc nothrow pure return
    {
        return sliceOverIndexed(a, b, &this);
    }

    /// ditto
    SliceOverIndexed!Grapheme opSlice() @nogc nothrow pure return
    {
        return sliceOverIndexed(0, length, &this);
    }

    /// Grapheme cluster length in $(CODEPOINTS).
    @property size_t length() const @nogc nothrow pure
    {
        return isBig ? len_ : slen_ & 0x7F;
    }

    /++
        Append $(CHARACTER) `ch` to this grapheme.
        Warning:
        Use of this facility may invalidate grapheme cluster,
        see also `valid`.

        See_Also: $(LREF Grapheme.valid)
    +/
    ref opOpAssign(string op)(dchar ch) @trusted
    {
        static if (op == "~")
        {
            import std.internal.memory : enforceRealloc;
            if (!isBig)
            {
                if (slen_ == small_cap)
                    convertToBig();// & fallthrough to "big" branch
                else
                {
                    write24(small_.ptr, ch, smallLength);
                    slen_++;
                    return this;
                }
            }

            assert(isBig);
            if (len_ == cap_)
            {
                import core.checkedint : addu, mulu;
                bool overflow;
                cap_ = addu(cap_, grow, overflow);
                // room for one extra element is kept past the capacity,
                // 3 bytes per element
                auto nelems = mulu(3, addu(cap_, 1, overflow), overflow);
                if (overflow) assert(0);
                ptr_ = cast(ubyte*) enforceRealloc(ptr_, nelems);
            }
            write24(ptr_, ch, len_++);
            return this;
        }
        else
            static assert(false, "No operation "~op~" defined for Grapheme");
    }

    ///
    @safe unittest
    {
        import std.algorithm.comparison : equal;
        auto g = Grapheme("A");
        assert(g.valid);
        g ~= '\u0301';
        assert(g[].equal("A\u0301"));
        assert(g.valid);
        g ~= "B";
        // not a valid grapheme cluster anymore
        assert(!g.valid);
        // still could be useful though
        assert(g[].equal("A\u0301B"));
    }

    /// Append all $(CHARACTERS) from the input range `inp` to this Grapheme.
    ref opOpAssign(string op, Input)(scope Input inp)
        if (isInputRange!Input && is(ElementType!Input : dchar))
    {
        static if (op == "~")
        {
            foreach (dchar ch; inp)
                this ~= ch;
            return this;
        }
        else
            static assert(false, "No operation "~op~" defined for Grapheme");
    }

    // Equality is defined on the stored code points. A comparison of the
    // raw fields would compare heap pointers once the cluster has outgrown
    // the in-place buffer, so that a copy (which gets its own allocation in
    // the postblit) would never equal its original.
    // The templated signature is retained for compatibility; arguments that
    // are not a Grapheme still get the field-wise comparison.
    bool opEquals(R)(const auto ref R other) const @trusted
    {
        static if (is(immutable R == immutable Grapheme))
        {
            immutable n = length;
            if (n != other.length)
                return false;
            foreach (i; 0 .. n)
            {
                if (this[i] != other[i])
                    return false;
            }
            return true;
        }
        else
            return this.tupleof == other.tupleof;
    }

    // Define a default toHash to allow AA usage.
    // Hashes the length and the code points, in line with opEquals.
    size_t toHash() const @trusted
    {
        immutable n = length;
        size_t h = hashOf(n);
        foreach (i; 0 .. n)
            h = hashOf(this[i], h);
        return h;
    }

    /++
        True if this object contains valid extended grapheme cluster.
        Decoding primitives of this module always return a valid `Grapheme`.

        Appending to and direct manipulation of grapheme's $(CHARACTERS) may
        render it no longer valid. Certain applications may choose to use
        Grapheme as a "small string" of any $(CODEPOINTS) and ignore this property
        entirely.

        The grapheme must not be empty: the decoder used for the check
        asserts on empty input.
    +/
    @property bool valid()() /*const*/
    {
        auto r = this[];
        // valid means: decoding a single cluster consumes everything
        genericDecodeGrapheme!false(r);
        return r.length == 0;
    }

    // Postblit: a heap-backed grapheme gets a private copy of its buffer.
    this(this) @nogc nothrow pure @trusted
    {
        import std.internal.memory : enforceMalloc;
        if (isBig)
        {// dup it
            import core.checkedint : addu, mulu;
            bool overflow;
            auto raw_cap = mulu(3, addu(cap_, 1, overflow), overflow);
            if (overflow) assert(0);

            auto p = cast(ubyte*) enforceMalloc(raw_cap);
            p[0 .. raw_cap] = ptr_[0 .. raw_cap];
            ptr_ = p;
        }
    }

    // Releases the heap buffer, if there is one.
    ~this() @nogc nothrow pure @trusted
    {
        import core.memory : pureFree;
        if (isBig)
        {
            pureFree(ptr_);
        }
    }


private:
    // everything but the last byte of the struct, which holds slen_
    enum small_bytes = ((ubyte*).sizeof+3*size_t.sizeof-1);
    // "out of the blue" grow rate, needs testing
    // (though graphemes are typically small < 9)
    enum grow = 20;
    // number of 3-byte code points that fit in place
    enum small_cap = small_bytes/3;
    // top bit of slen_ marks the heap-backed ("big") representation,
    // the remaining bits hold the length of the in-place one
    enum small_flag = 0x80, small_mask = 0x7F;
    // 16 bytes in 32bits, should be enough for the majority of cases
    union
    {
        // "big" representation: heap buffer plus capacity and length,
        // both counted in code points
        struct
        {
            ubyte* ptr_;
            size_t cap_;
            size_t len_;
            size_t padding_;
        }
        // "small" representation: code points stored in place
        struct
        {
            ubyte[small_bytes] small_;
            ubyte slen_;
        }
    }

    // Moves the in-place code points to a freshly allocated heap buffer
    // and switches over to the "big" representation.
    void convertToBig() @nogc nothrow pure @trusted
    {
        import std.internal.memory : enforceMalloc;
        static assert(grow.max / 3 - 1 >= grow);
        enum nbytes = 3 * (grow + 1);
        size_t k = smallLength;
        ubyte* p = cast(ubyte*) enforceMalloc(nbytes);
        for (int i=0; i<k; i++)
            write24(p, read24(small_.ptr, i), i);
        // now we can overwrite small array data
        ptr_ = p;
        len_ = slen_;
        assert(grow > len_);
        cap_ = grow;
        setBig();
    }

    void setBig() @nogc nothrow pure { slen_ |= small_flag; }

    @property size_t smallLength() const @nogc nothrow pure
    {
        return slen_ & small_mask;
    }
    // non-zero if the heap-backed representation is active
    @property ubyte isBig() const @nogc nothrow pure
    {
        return slen_ & small_flag;
    }
}
7809 
7810 static assert(Grapheme.sizeof == size_t.sizeof*4);
7811 
7812 
@safe pure /*nothrow @nogc*/ unittest // TODO: string .front is GC and throw
{
    import std.algorithm.comparison : equal;
    // three stand-alone letters must come out as three separate graphemes
    Grapheme[3] expected = [Grapheme("Ю"), Grapheme("У"), Grapheme("З")];
    auto clusters = byGrapheme("ЮУЗ");
    assert(equal(clusters, expected[]));
}
7819 
///
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.algorithm.iteration : filter;
    import std.range : isRandomAccessRange;

    string input = "ku\u0308hn";

    // decodeGrapheme takes its parameter by ref and consumes one cluster
    auto head = decodeGrapheme(input);
    assert(head.length == 1);
    assert(head[0] == 'k');

    // the next grapheme is 2 characters long
    auto wideOne = decodeGrapheme(input);
    assert(wideOne.length == 2);
    // slicing a grapheme yields a random-access range of dchar
    static assert(isRandomAccessRange!(typeof(wideOne[])));
    assert(equal(wideOne[], "u\u0308"));

    // all of the usual range manipulation is possible
    assert(wideOne[].filter!isMark().equal("\u0308"));

    auto g = Grapheme("A");
    assert(g.valid);

    // appending a combining mark keeps the cluster valid
    g ~= '\u0301';
    assert(g.valid);
    assert(g[].equal("A\u0301"));

    // appending another base character does not
    g ~= "B";
    assert(!g.valid);
    // the contents are still accessible though
    assert(g[].equal("A\u0301B"));
}
7856 
@safe unittest
{
    // replacing the combining mark by a plain character invalidates the cluster
    auto cluster = Grapheme("A\u0302");
    assert(cluster.valid);
    assert(cluster[0] == 'A');

    cluster[1] = '~'; // ASCII tilde is not a combining mark
    assert(cluster[1] == '~');
    assert(!cluster.valid);
}
7866 
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.algorithm.iteration : map;
    import std.conv : text;
    import std.range : iota;

    // not valid clusters (but it is just a test)
    auto g = Grapheme('a', 'b', 'c', 'd', 'e');
    foreach (i, dchar expected; "abcde"d)
        assert(g[i] == expected);

    // overwrite one slot, the neighbours must stay intact
    g[3] = 'Й';
    assert(g[2] == 'c');
    assert(g[3] == 'Й', text(g[3], " vs ", 'Й'));
    assert(g[4] == 'e');
    assert(!g.valid);

    // append past the initial five code points
    g ~= 'ц';
    g ~= '~';
    foreach (i, dchar expected; "abcЙeц~"d)
        assert(g[i] == expected);
    assert(!g.valid);

    // a copy must not share storage with its source
    Grapheme copy = g;
    copy[0] = 'X';
    copy[1] = '-';
    assert(g[0] == 'a' && copy[0] == 'X');
    assert(g[1] == 'b' && copy[1] == '-');
    assert(equal(g[2 .. g.length], copy[2 .. copy.length]));

    // reassign with a longer sequence, then append to it
    copy = Grapheme("АБВГДЕЁЖЗИКЛМ");
    assert(equal(copy[0 .. 8], "АБВГДЕЁЖ"), text(copy[0 .. 8]));
    copy ~= "xyz";
    assert(equal(copy[13 .. 15], "xy"), text(copy[13 .. 15]));
    assert(!copy.valid);

    // build up a grapheme one code point at a time
    auto alphabet = iota(cast(int)'A', cast(int)'Z'+1);
    Grapheme h;
    foreach (dchar v; alphabet.map!"cast(dchar)a"())
        h ~= v;
    assert(equal(h[], alphabet));
}
7915 
// Compile-time check: Grapheme must be usable as an associative array key.
@safe unittest
{
    int[Grapheme] countByCluster;
}
7921 
/++
    $(P Does basic case-insensitive comparison of `r1` and `r2`.
    This function uses simpler comparison rule thus achieving better performance
    than $(LREF icmp). However keep in mind the warning below.)

    Params:
        r1 = an $(REF_ALTTEXT input range, isInputRange, std,range,primitives) of characters
        r2 = an $(REF_ALTTEXT input range, isInputRange, std,range,primitives) of characters

    Returns:
        An `int` that is 0 if the strings match,
        &lt;0 if `r1` is lexicographically "less" than `r2`,
        &gt;0 if `r1` is lexicographically "greater" than `r2`

    Warning:
    This function only handles 1:1 $(CODEPOINT) mapping
    and thus is not sufficient for certain alphabets
    like German, Greek and few others.

    See_Also:
        $(LREF icmp)
        $(REF cmp, std,algorithm,comparison)
+/
int sicmp(S1, S2)(scope S1 r1, scope S2 r2)
if (isInputRange!S1 && isSomeChar!(ElementEncodingType!S1)
    && isInputRange!S2 && isSomeChar!(ElementEncodingType!S2))
{
    import std.internal.unicode_tables : sTable = simpleCaseTable; // generated file
    import std.range.primitives : isInfinite;
    import std.utf : decodeFront;
    import std.traits : isDynamicArray;
    import std.typecons : Yes;
    static import std.ascii;

    // The fast path needs indexing and slicing with a run-time size_t offset.
    // The lambda below is only type-checked, never run; the probe value is
    // kept the same as in icmp.
    static if ((isDynamicArray!S1 || isRandomAccessRange!S1)
        && (isDynamicArray!S2 || isRandomAccessRange!S2)
        && !(isInfinite!S1 && isInfinite!S2)
        && __traits(compiles,
            {
                size_t s = size_t.max / 2;
                r1 = r1[s .. $];
                r2 = r2[s .. $];
            }))
    {{
        // ASCII optimization for dynamic arrays & similar.
        size_t i = 0;
        static if (isInfinite!S1)
            immutable end = r2.length;
        else static if (isInfinite!S2)
            immutable end = r1.length;
        else
            immutable end = r1.length > r2.length ? r2.length : r1.length;
        for (; i < end; ++i)
        {
            auto lhs = r1[i];
            auto rhs = r2[i];
            // first non-ASCII code unit: hand the rest over to the decoding loop
            if ((lhs | rhs) >= 0x80) goto NonAsciiPath;
            if (lhs == rhs) continue;
            auto lowDiff = std.ascii.toLower(lhs) - std.ascii.toLower(rhs);
            if (lowDiff) return lowDiff;
        }
        // the common prefix matched, so the longer input is the greater one
        static if (isInfinite!S1)
            return 1;
        else static if (isInfinite!S2)
            return -1;
        else
            return (r1.length > r2.length) - (r2.length > r1.length);

    NonAsciiPath:
        // everything before index i was ASCII and equal ignoring case
        r1 = r1[i .. $];
        r2 = r2[i .. $];
        // Fall through to standard case.
    }}

    while (!r1.empty)
    {
        immutable lhs = decodeFront!(Yes.useReplacementDchar)(r1);
        if (r2.empty)
            return 1;
        immutable rhs = decodeFront!(Yes.useReplacementDchar)(r2);
        int diff = lhs - rhs;
        if (!diff)
            continue;
        if ((lhs | rhs) < 0x80)
        {
            immutable d = std.ascii.toLower(lhs) - std.ascii.toLower(rhs);
            if (!d) continue;
            return d;
        }
        size_t idx = simpleCaseTrie[lhs];
        size_t idx2 = simpleCaseTrie[rhs];
        // simpleCaseTrie is packed index table
        if (idx != EMPTY_CASE_TRIE)
        {
            if (idx2 != EMPTY_CASE_TRIE)
            {// both cased chars
                // adjust idx --> start of bucket
                idx = idx - sTable[idx].n;
                idx2 = idx2 - sTable[idx2].n;
                if (idx == idx2)// one bucket, equivalent chars
                    continue;
                else//  not the same bucket
                    diff = sTable[idx].ch - sTable[idx2].ch;
            }
            else
                // only lhs is cased: compare its bucket's first entry with rhs
                diff = sTable[idx - sTable[idx].n].ch - rhs;
        }
        else if (idx2 != EMPTY_CASE_TRIE)
        {
            // only rhs is cased: compare lhs with its bucket's first entry
            diff = lhs - sTable[idx2 - sTable[idx2].n].ch;
        }
        // one of chars is not cased at all
        return diff;
    }
    // r1 is exhausted: equal if r2 is exhausted too, otherwise r1 is "less"
    return int(r2.empty) - 1;
}
8038 
///
@safe @nogc pure nothrow unittest
{
    assert(sicmp("Август", "авгусТ") == 0);
    // Greek also works as long as there is no 1:M mapping in sight
    assert(sicmp("ΌΎ", "όύ") == 0);

    // Greek small letter iota with dialytika and tonos,
    // precomposed and as a base letter followed by two marks
    enum precomposed = "ΐ";
    enum sequence = "\u03B9\u0308\u0301";

    // things like these won't get matched as equal by sicmp
    assert(sicmp(precomposed, sequence) != 0);

    // while icmp has no problem with that
    assert(icmp(precomposed, sequence) == 0);
    assert(icmp("ΌΎ", "όύ") == 0);
}
8053 
// Non-template overloads for the most common argument types;
// they forward to the template and exist to reduce compile time.
@safe @nogc pure nothrow
{
    int sicmp(scope const(char)[] str1, scope const(char)[] str2)
    {
        alias Str = const(char)[];
        return sicmp!(Str, Str)(str1, str2);
    }

    int sicmp(scope const(wchar)[] str1, scope const(wchar)[] str2)
    {
        alias Str = const(wchar)[];
        return sicmp!(Str, Str)(str1, str2);
    }

    int sicmp(scope const(dchar)[] str1, scope const(dchar)[] str2)
    {
        alias Str = const(dchar)[];
        return sicmp!(Str, Str)(str1, str2);
    }
}
8066 
/*
    Tries to match `lhs` against `rhs` followed by the front of `rtail`
    using the full case folding table.

    Returns 0 on a match; `rtail` is advanced only when a multi-code-point
    entry matched. Otherwise returns `lhs` itself when it has no table entry,
    or the first code point of the first entry of its bucket.
*/
private int fullCasedCmp(Range)(dchar lhs, dchar rhs, ref Range rtail)
{
    import std.algorithm.searching : skipOver;
    import std.internal.unicode_tables : fullCaseTable; // generated file
    alias fTable = fullCaseTable;

    // fullCaseTrie is packed index table
    immutable size_t hit = fullCaseTrie[lhs];
    if (hit == EMPTY_CASE_TRIE)
        return lhs;

    // bounds of the bucket that contains lhs
    immutable first = hit - fTable[hit].n;
    immutable last = fTable[hit].size + first;
    assert(fTable[first].entry_len == 1);

    foreach (size_t pos; first .. last)
    {
        immutable entryLen = fTable[pos].entry_len;
        if (entryLen == 1)
        {
            if (fTable[pos].seq[0] == rhs)
                return 0;
            continue;
        }
        // OK it's a long chunk, like 'ss' for German
        dstring seq = fTable[pos].seq[0 .. entryLen];
        // note that skipOver modifies rtail iff the whole tail matches
        if (rhs == seq[0] && rtail.skipOver(seq[1 .. $]))
            return 0;
    }
    return fTable[first].seq[0]; // new remapped character for accurate diffs
}
8103 
/++
    Does case insensitive comparison of `r1` and `r2`.
    Follows the rules of full case-folding mapping.
    This includes matching as equal german ß with "ss" and
    other 1:M $(CODEPOINT) mappings unlike $(LREF sicmp).
    The cost of `icmp` being pedantically correct is
    slightly worse performance.

    Params:
        r1 = a forward range of characters
        r2 = a forward range of characters

    Returns:
        An `int` that is 0 if the strings match,
        &lt;0 if `r1` is lexicographically "less" than `r2`,
        &gt;0 if `r1` is lexicographically "greater" than `r2`

    See_Also:
        $(LREF sicmp)
        $(REF cmp, std,algorithm,comparison)
+/
int icmp(S1, S2)(S1 r1, S2 r2)
if (isForwardRange!S1 && isSomeChar!(ElementEncodingType!S1)
    && isForwardRange!S2 && isSomeChar!(ElementEncodingType!S2))
{
    import std.range.primitives : isInfinite;
    import std.traits : isDynamicArray;
    import std.utf : byDchar;
    static import std.ascii;

    static if ((isDynamicArray!S1 || isRandomAccessRange!S1)
        && (isDynamicArray!S2 || isRandomAccessRange!S2)
        && !(isInfinite!S1 && isInfinite!S2)
        && __traits(compiles,
            {
                size_t s = size_t.max / 2;
                r1 = r1[s .. $];
                r2 = r2[s .. $];
            }))
    {{
        // ASCII optimization for dynamic arrays & similar:
        // compare code units in lock step while both sides stay below 0x80.
        static if (isInfinite!S1)
            immutable limit = r2.length;
        else static if (isInfinite!S2)
            immutable limit = r1.length;
        else
            immutable limit = r1.length > r2.length ? r2.length : r1.length;

        size_t pos = 0;
        while (pos < limit)
        {
            auto a = r1[pos];
            auto b = r2[pos];
            if ((a | b) >= 0x80) goto Decode;
            if (a != b)
            {
                auto lowDiff = std.ascii.toLower(a) - std.ascii.toLower(b);
                if (lowDiff) return lowDiff;
            }
            ++pos;
        }
        // the shared prefix matched, so the longer input is the greater one
        static if (isInfinite!S1)
            return 1;
        else static if (isInfinite!S2)
            return -1;
        else
            return (r1.length > r2.length) - (r2.length > r1.length);

    Decode:
        // drop the already compared ASCII prefix
        r1 = r1[pos .. $];
        r2 = r2[pos .. $];
        // Fall through to standard case.
    }}

    auto left = r1.byDchar;
    auto right = r2.byDchar;

    while (true)
    {
        if (left.empty)
            return right.empty ? 0 : -1;
        immutable lhs = left.front;
        if (right.empty)
            return 1;
        immutable rhs = right.front;
        left.popFront();
        right.popFront();
        if (lhs == rhs)
            continue;
        // first try to match lhs to <rhs,right-tail> sequence
        immutable cmpLR = fullCasedCmp(lhs, rhs, right);
        if (cmpLR == 0)
            continue;
        // then rhs to <lhs,left-tail> sequence
        immutable cmpRL = fullCasedCmp(rhs, lhs, left);
        if (cmpRL == 0)
            continue;
        // cmpXX contain remapped codepoints
        // to obtain stable ordering of icmp
        return cmpLR - cmpRL;
    }
}
8202 
///
@safe @nogc pure nothrow unittest
{
    // sharp s matches "ss"
    immutable german = icmp("Rußland", "Russland");
    assert(german == 0);

    // 1:M mappings are matched on both sides
    immutable greek = icmp("ᾩ -> \u1F70\u03B9", "\u1F61\u03B9 -> ᾲ");
    assert(greek == 0);
}
8209 
/**
 * By using $(REF byUTF, std,utf) and its aliases, GC allocations via auto-decoding
 * and thrown exceptions can be avoided, making `icmp` `@safe @nogc nothrow pure`.
 */
@safe @nogc nothrow pure unittest
{
    import std.utf : byDchar;

    auto german = icmp("Rußland".byDchar, "Russland".byDchar);
    assert(german == 0);

    auto greek = icmp("ᾩ -> \u1F70\u03B9".byDchar, "\u1F61\u03B9 -> ᾲ".byDchar);
    assert(greek == 0);
}
8221 
// icmp must accept any combination of UTF-8/16/32 inputs
@safe unittest
{
    // same width on both sides
    assert(icmp("Rußland", "Russland") == 0);
    assert(icmp("Rußland"w, "Russland"w) == 0);

    // mixed widths
    assert(icmp("Rußland"w, "Russland") == 0);
    assert(icmp("Rußland", "Russland"w) == 0);
    assert(icmp("Rußland"d, "Russland"w) == 0);
    assert(icmp("Rußland"w, "Russland"d) == 0);
}
8232 
// Non-template overloads for the most common argument types;
// they forward to the template and exist to reduce compile time.
@safe @nogc pure nothrow
{
    int icmp(const(char)[] str1, const(char)[] str2)
    {
        alias Str = const(char)[];
        return icmp!(Str, Str)(str1, str2);
    }

    int icmp(const(wchar)[] str1, const(wchar)[] str2)
    {
        alias Str = const(wchar)[];
        return icmp!(Str, Str)(str1, str2);
    }

    int icmp(const(dchar)[] str1, const(dchar)[] str2)
    {
        alias Str = const(dchar)[];
        return icmp!(Str, Str)(str1, str2);
    }
}
8243 
@safe unittest
{
    import std.algorithm.sorting : sort;
    import std.conv : to;
    import std.exception : assertCTFEable;
    assertCTFEable!(
    {
    // checks shared by both comparison functions
    static foreach (compare; AliasSeq!(icmp, sicmp))
    {{
        static foreach (S1; AliasSeq!(string, wstring, dstring))
        static foreach (S2; AliasSeq!(string, wstring, dstring))
        {
            // empty inputs and length differences
            assert(compare("".to!S1(), "".to!S2()) == 0);
            assert(compare("A".to!S1(), "".to!S2()) > 0);
            assert(compare("".to!S1(), "0".to!S2()) < 0);
            assert(compare("abc".to!S1(), "abc".to!S2()) == 0);
            assert(compare("abcd".to!S1(), "abc".to!S2()) > 0);
            assert(compare("abc".to!S1(), "abcd".to!S2()) < 0);
            // case differences only
            assert(compare("Abc".to!S1(), "aBc".to!S2()) == 0);
            assert(compare("авГуст".to!S1(), "АВгУСТ".to!S2()) == 0);
            // Check example:
            assert(compare("Август".to!S1(), "авгусТ".to!S2()) == 0);
            assert(compare("ΌΎ".to!S1(), "όύ".to!S2()) == 0);
        }
        // check that the order is properly agnostic to the case
        auto words = [ "Apple", "ORANGE",  "orAcle", "amp", "banana"];
        sort!((a,b) => compare(a,b) < 0)(words);
        assert(words == ["amp", "Apple",  "banana", "orAcle", "ORANGE"]);
    }}
    assert(icmp("ßb", "ssa") > 0);
    // Check example:
    assert(icmp("Russland", "Rußland") == 0);
    assert(icmp("ᾩ -> \u1F70\u03B9", "\u1F61\u03B9 -> ᾲ") == 0);
    assert(icmp("ΐ"w, "\u03B9\u0308\u0301") == 0);
    assert(sicmp("ΐ", "\u03B9\u0308\u0301") != 0);
    // https://issues.dlang.org/show_bug.cgi?id=11057
    assert( icmp("K", "L") < 0 );
    });
}
8283 
// https://issues.dlang.org/show_bug.cgi?id=17372
// icmp must compile with joiner results, e.g. as a sort predicate
@safe pure unittest
{
    import std.algorithm.iteration : joiner, map;
    import std.algorithm.sorting : sort;
    import std.array : array;
    auto lines = [["foo", "bar"], ["baz"]].map!(line => line.joiner(" ")).array;
    auto sorted = lines.sort!((a, b) => icmp(a, b) < 0);
}
8292 
// This is package(std) for the moment to be used as a support tool for std.regex
// It needs a better API
/*
    Return a range of all $(CODEPOINTS) that casefold to
    and from this `ch`.
*/
package(std) auto simpleCaseFoldings(dchar ch) @safe
{
    import std.internal.unicode_tables : simpleCaseTable; // generated file
    alias sTable = simpleCaseTable;

    // Either a single code point kept inline ("small" mode)
    // or a window of `len` entries of sTable starting at `idx`.
    static struct Range
    {
    @safe pure nothrow:
        uint idx; //if == uint.max, then read c.
        union
        {
            dchar c; // == 0 - empty range
            uint len;
        }

        this(dchar ch)
        {
            idx = uint.max;
            c = ch;
        }

        this(uint start, uint size)
        {
            idx = start;
            len = size;
        }

        @property bool isSmall() const
        {
            return idx == uint.max;
        }

        @property bool empty() const
        {
            if (!isSmall)
                return len == 0;
            return c == 0;
        }

        @property size_t length() const
        {
            if (!isSmall)
                return len;
            return c == 0 ? 0 : 1;
        }

        @property dchar front() const
        {
            assert(!empty);
            if (!isSmall)
            {
                auto ch = sTable[idx].ch;
                return ch;
            }
            return c;
        }

        void popFront()
        {
            if (!isSmall)
            {
                idx++;
                len--;
            }
            else
                c = 0;
        }
    }

    immutable idx = simpleCaseTrie[ch];
    if (idx != EMPTY_CASE_TRIE)
    {
        // walk back to the first entry of the bucket
        auto entry = sTable[idx];
        immutable start = idx - entry.n;
        return Range(start, entry.size);
    }
    // no table entry: the range holds just `ch`
    return Range(ch);
}
8373 
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.algorithm.searching : canFind;
    import std.array : array;
    import std.exception : assertCTFEable;
    assertCTFEable!((){
        // a cased letter folds to itself and its counterpart
        auto pair = simpleCaseFoldings('Э').array;
        assert(pair.length == 2);
        assert(pair.canFind('э') && pair.canFind('Э'));

        // an uncased character yields only itself
        auto sr = simpleCaseFoldings('~');
        assert(sr.equal("~"));

        //A with ring above - casefolds to the same bucket as Angstrom sign
        sr = simpleCaseFoldings('Å');
        assert(sr.length == 3);
        assert(sr.canFind('å') && sr.canFind('Å') && sr.canFind('\u212B'));
    });
}
8392 
/++
    $(P Returns the $(S_LINK Combining class, combining class) of `ch`.)
+/
ubyte combiningClass(dchar ch) @safe pure nothrow @nogc
{
    // plain lookup in the combining class trie
    immutable ubyte cls = combiningClassTrie[ch];
    return cls;
}
8400 
///
@safe unittest
{
    // shorten the code
    alias CC = combiningClass;

    // combining ring below
    assert(CC('\u0325') == 220);
    // combining tilde
    assert(CC('\u0303') == 230);
    // the simple consequence is that "tilde" should be
    // placed after a "ring below" in a sequence
}
8414 
@safe pure nothrow @nogc unittest
{
    // every ASCII code point has class 0
    foreach (ch; 0 .. 0x80)
        assert(combiningClass(ch) == 0);

    // spot checks of non-zero classes
    assert(combiningClass('\u0300') == 230);
    assert(combiningClass('\u0317') == 220);
    assert(combiningClass('\u05BD') == 22);
    assert(combiningClass('\u1939') == 222);
}
8424 
/// Unicode character decomposition type.
enum UnicodeDecomposition
{
    /// Canonical decomposition. The result is a canonically equivalent sequence.
    Canonical,

    /**
        Compatibility decomposition. The result is a compatibility equivalent
        sequence.
        Note: Compatibility decomposition is a $(B lossy) conversion,
        typically suitable only for fuzzy matching and internal processing.
    */
    Compatibility
}
8436 
/**
    Shorthand aliases for character decomposition type, passed as a
    template parameter to $(LREF decompose).
*/
enum
{
    Canonical = UnicodeDecomposition.Canonical,
    Compatibility = UnicodeDecomposition.Compatibility
}
8445 
/++
    Try to canonically compose 2 $(CHARACTERS).
    Returns the composed $(CHARACTER) if they do compose and dchar.init otherwise.

    The assumption is that `first` comes before `second` in the original text,
    usually meaning that the first is a starter.

    Note: Hangul syllables are not covered by this function.
    See `composeJamo` below.
+/
public dchar compose(dchar first, dchar second) pure nothrow @safe
{
    import std.algorithm.iteration : map;
    import std.internal.unicode_comp : compositionTable, composeCntShift, composeIdxMask;
    import std.range : assumeSorted;

    immutable packed = compositionJumpTrie[first];
    if (packed == ushort.max)
        return dchar.init; // no table entries for `first`

    // unpack offset and length of the entries that belong to `first`
    immutable idx = packed & composeIdxMask;
    immutable cnt = packed >> composeCntShift;
    auto run = compositionTable[idx .. idx + cnt];

    // TODO: optimize this micro binary search (no more than 4-5 steps)
    immutable target = run.map!"a.rhs"().assumeSorted().lowerBound(second).length;
    if (target == cnt)
        return dchar.init; // every entry sorts below `second`

    immutable entry = run[target];
    if (entry.rhs == second)
        return entry.composed;
    return dchar.init;
}
8476 
///
@safe unittest
{
    // base letter followed by a combining mark
    assert(compose('A','\u0308') == '\u00C4');
    assert(compose('C', '\u0301') == '\u0106');
    // two base letters don't compose
    assert(compose('A', 'B') == dchar.init);
    // note that the starter is the first one
    // thus the following doesn't compose
    assert(compose('\u0308', 'A') == dchar.init);
}
8487 
/++
    Returns a full $(S_LINK Canonical decomposition, Canonical)
    (by default) or $(S_LINK Compatibility decomposition, Compatibility)
    decomposition of $(CHARACTER) `ch`.
    If no decomposition is available returns a $(LREF Grapheme)
    with the `ch` itself.

    Note:
    This function also decomposes hangul syllables
    as prescribed by the standard.

    See_Also: $(LREF decomposeHangul) for a restricted version
    that takes into account only hangul syllables  but
    no other decompositions.
+/
public Grapheme decompose(UnicodeDecomposition decompType=Canonical)(dchar ch) @safe
{
    import std.algorithm.searching : until;
    import std.internal.unicode_decomp : decompCompatTable, decompCanonTable;

    // select the table/trie pair that matches the requested decomposition
    static if (decompType == Canonical)
    {
        alias mapping = canonMappingTrie;
        alias table = decompCanonTable;
    }
    else static if (decompType == Compatibility)
    {
        alias mapping = compatMappingTrie;
        alias table = decompCompatTable;
    }

    immutable idx = mapping[ch];
    if (idx == 0) // not found, check hangul arithmetic decomposition
        return decomposeHangul(ch);
    // table entries are 0-terminated sequences
    return Grapheme(table[idx .. $].until(0));
}
8523 
///
@safe unittest
{
    import std.algorithm.comparison : equal;

    // composition
    assert(compose('A','\u0308') == '\u00C4');
    assert(compose('C', '\u0301') == '\u0106');
    assert(compose('A', 'B') == dchar.init);
    // note that the starter is the first one
    // thus the following doesn't compose
    assert(compose('\u0308', 'A') == dchar.init);

    // decomposition
    assert(equal(decompose('Ĉ')[], "C\u0302"));
    assert(equal(decompose('D')[], "D"));
    assert(equal(decompose('\uD4DC')[], "\u1111\u1171\u11B7"));
    assert(equal(decompose!Compatibility('¹')[], "1"));
}
8541 
//----------------------------------------------------------------------------
// Hangul specific composition/decomposition

// First code point of the precomposed syllables (S) and the bases of the
// leading consonant (L), vowel (V) and trailing consonant (T) jamo ranges.
// T index 0 means "no trailing consonant", so jamoTBase is not itself a
// trailing jamo.
enum jamoSBase = 0xAC00;
enum jamoLBase = 0x1100;
enum jamoVBase = 0x1161;
enum jamoTBase = 0x11A7;
enum jamoLCount = 19;
enum jamoVCount = 21;
enum jamoTCount = 28;
// syllables per leading consonant, and in total
enum jamoNCount = jamoVCount * jamoTCount;
enum jamoSCount = jamoLCount * jamoNCount;

// Tests if `ch` is a Hangul leading consonant jamo.
bool isJamoL(dchar ch) pure nothrow @nogc @safe
{
    // the upper bound is checked first: it rejects ~ 1M code points
    // above the leading jamo range
    if (ch >= jamoLBase + jamoLCount)
        return false;
    return ch >= jamoLBase;
}

// Tests if `ch` is a Hangul trailing consonant jamo.
bool isJamoT(dchar ch) pure nothrow @nogc @safe
{
    // the upper bound is checked first: it rejects ~ 1M code points
    // above the trailing jamo range
    if (ch >= jamoTBase + jamoTCount)
        return false;
    // Note: ch == jamoTBase doesn't indicate trailing jamo (TIndex must be > 0)
    return ch > jamoTBase;
}

// Tests if `ch` is a Hangul vowel jamo.
bool isJamoV(dchar ch) pure nothrow @nogc @safe
{
    // the upper bound is checked first: it rejects ~ 1M code points
    // above the vowel range
    if (ch >= jamoVBase + jamoVCount)
        return false;
    return ch >= jamoVBase;
}

// Returns the offset of `ch` from jamoSBase if it lies in
// [0, jamoSCount), -1 otherwise.
int hangulSyllableIndex(dchar ch) pure nothrow @nogc @safe
{
    immutable int idxS = cast(int) ch - jamoSBase;
    if (idxS < 0 || idxS >= jamoSCount)
        return -1;
    return idxS;
}
8579 
// internal helper: compose hangul syllables in place; an <L,V> or <L,V,T>
// jamo run is replaced by the syllable in its first slot, the remaining
// slots of the run are set to dchar.init
void hangulRecompose(scope dchar[] seq) pure nothrow @nogc @safe
{
    size_t idx = 0;
    while (idx + 1 < seq.length)
    {
        if (!isJamoL(seq[idx]) || !isJamoV(seq[idx+1]))
        {
            idx++;
            continue;
        }
        immutable int indexL = seq[idx] - jamoLBase;
        immutable int indexV = seq[idx+1] - jamoVBase;
        immutable int indexLV = indexL * jamoNCount + indexV * jamoTCount;
        // is there a trailing consonant right after the vowel?
        immutable bool hasT = idx + 2 < seq.length && isJamoT(seq[idx+2]);
        if (hasT)
        {
            seq[idx] = jamoSBase + indexLV + seq[idx+2] - jamoTBase;
            seq[idx+2] = dchar.init;
        }
        else
            seq[idx] = jamoSBase + indexLV;
        seq[idx+1] = dchar.init;
        idx += hasT ? 3 : 2;
    }
}
8608 
8609 //----------------------------------------------------------------------------
8610 public:
8611 
/**
    Decomposes a Hangul syllable. If `ch` is not a composed syllable
    then this function returns $(LREF Grapheme) containing only `ch` as is.
*/
Grapheme decomposeHangul(dchar ch) nothrow pure @safe
{
    immutable idxS = cast(int) ch - jamoSBase;
    immutable isSyllable = idxS >= 0 && idxS < jamoSCount;
    if (!isSyllable)
        return Grapheme(ch);

    // split the syllable index into its L, V and T components
    immutable idxL = idxS / jamoNCount;
    immutable idxV = (idxS % jamoNCount) / jamoTCount;
    immutable idxT = idxS % jamoTCount;

    immutable partL = jamoLBase + idxL;
    immutable partV = jamoVBase + idxV;
    if (idxT == 0) // no trailing consonant; <L, V> decomposition
        return Grapheme(partL, partV);
    // there is a trailing consonant (T); <L,V,T> decomposition
    return Grapheme(partL, partV, jamoTBase + idxT);
}
8631 
///
@safe unittest
{
    import std.algorithm.comparison : equal;
    // an LVT syllable splits into its three jamos
    auto jamos = decomposeHangul('\uD4DB');
    assert(equal(jamos[], "\u1111\u1171\u11B6"));
}
8638 
/++
    Try to compose hangul syllable out of a leading consonant (`lead`),
    a `vowel` and optional `trailing` consonant jamos.

    On success returns the composed LV or LVT hangul syllable.

    If any of `lead` and `vowel` are not a valid hangul jamo
    of the respective $(CHARACTER) class returns dchar.init.
+/
dchar composeJamo(dchar lead, dchar vowel, dchar trailing=dchar.init) pure nothrow @nogc @safe
{
    if (!isJamoL(lead) || !isJamoV(vowel))
        return dchar.init;

    immutable indexL = lead - jamoLBase;
    immutable indexV = vowel - jamoVBase;
    immutable indexLV = indexL * jamoNCount + indexV * jamoTCount;
    immutable dchar syllable = jamoSBase + indexLV;

    // anything that is not a trailing consonant is ignored: LV syllable
    if (!isJamoT(trailing))
        return syllable;
    return syllable + (trailing - jamoTBase);
}
8660 
///
@safe unittest
{
    // lead + vowel + trailing consonant gives an LVT-syllable
    assert(composeJamo('\u1111', '\u1171', '\u11B6') == '\uD4DB');
    // leaving out the trailing consonant, or passing any codepoint
    // that is not a trailing consonant composes an LV-syllable
    assert(composeJamo('\u1111', '\u1171') == '\uD4CC');
    assert(composeJamo('\u1111', '\u1171', ' ') == '\uD4CC');
    // an invalid lead or vowel yields dchar.init
    assert(composeJamo('A', '\u1171') == dchar.init);
    assert(composeJamo('\u1111', 'A') == dchar.init);
}
8672 
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.conv : text;

    // decomposes `ch` with the given decomposition type
    // and compares the result against `expected`
    static void testDecomp(UnicodeDecomposition T)(dchar ch, string expected)
    {
        Grapheme g = decompose!T(ch);
        assert(equal(g[], expected), text(g[], " vs ", expected));
    }
    testDecomp!Canonical('\u1FF4', "\u03C9\u0301\u0345");
    testDecomp!Canonical('\uF907', "\u9F9C");
    testDecomp!Compatibility('\u33FF', "\u0067\u0061\u006C");
    testDecomp!Compatibility('\uA7F9', "\u0153");

    // check examples
    assert(equal(decomposeHangul('\uD4DB')[], "\u1111\u1171\u11B6"));
    assert(composeJamo('\u1111', '\u1171', '\u11B6') == '\uD4DB');
    assert(composeJamo('\u1111', '\u1171') == '\uD4CC'); // no trailing consonant
    assert(composeJamo('\u1111', '\u1171', ' ') == '\uD4CC');
    assert(composeJamo('\u1111', 'A') == dchar.init);
    assert(composeJamo('A', '\u1171') == dchar.init);
}
8696 
/**
    Enumeration of the normalization forms; a value of this type is
    passed as a template parameter to functions like $(LREF normalize).
*/
enum NormalizationForm
{
    /// canonical decomposition followed by recomposition
    NFC,
    /// canonical decomposition
    NFD,
    /// compatibility decomposition followed by recomposition
    NFKC,
    /// compatibility decomposition
    NFKD,
}
8707 
8708 
/**
    Shorthand aliases for the values indicating normalization forms.
*/
enum NFC = NormalizationForm.NFC;
///ditto
enum NFD = NormalizationForm.NFD;
///ditto
enum NFKC = NormalizationForm.NFKC;
///ditto
enum NFKD = NormalizationForm.NFKD;
8721 
/++
    Normalizes `input` to the requested normalization form and returns
    the result. Form C is used unless specified otherwise.

    For more information on normalization forms see
    the $(S_LINK Normalization, normalization section).

    Note:
    A string that is already normalized is returned as is,
    and no memory allocation takes place.
+/
/*
    WARNING: @trusted lambdas inside - handle with same care as @trusted
        functions

    The explicit attributes are harmless even though this is a template,
    since it doesn't work with user-defined range or character types anyway.
*/
pure @safe inout(C)[] normalize(NormalizationForm norm=NFC, C)
    (return scope inout(C)[] input)
{
    import std.algorithm.mutation : SwapStrategy;
    import std.algorithm.sorting : sort;
    import std.array : appender;
    import std.range : zip;

    // NFC & NFD decompose canonically, NFKC & NFKD by compatibility
    static if (norm == NFD || norm == NFC)
        enum decompKind = Canonical;
    else
        enum decompKind = Compatibility;
    // NFC & NFKC recompose after decomposition and reordering
    enum composing = norm == NFC || norm == NFKC;

    auto anchors = splitNormalized!norm(input);
    // nothing to fix: hand back the very same slice
    if (anchors[0] == input.length && anchors[1] == input.length)
        return input;
    dchar[] codepoints;
    codepoints.reserve(31);
    ubyte[] classes;
    classes.reserve(31);
    auto output = appender!(C[])();
    do
    {
        // everything before the first anchor is already normalized
        output.put(input[0 .. anchors[0]]);
        // decompose the region between the anchors
        foreach (dchar ch; input[anchors[0] .. anchors[1]])
            foreach (dchar c; decompose!decompKind(ch)[])
                codepoints ~= c;
        classes.length = codepoints.length;
        size_t runStart = 0;
        ubyte prevClass = 0;

        // record combining classes and stably sort every run of
        // code points with a non-zero class by that class
        foreach (idx, dchar ch; codepoints)
        {
            immutable cls = combiningClass(ch);
            classes[idx] = cls;
            if (cls == 0 && prevClass != 0)
            {
                // a stable code point terminates the current run
                sort!("a[0] < b[0]", SwapStrategy.stable)
                    (zip(classes[runStart .. idx], codepoints[runStart .. idx]));
                runStart = codepoints.length;
            }
            else if (cls != 0 && prevClass == 0)
            {
                // first unstable code point after stable ones opens a run
                runStart = idx;
            }
            prevClass = cls;
        }
        // sort the run (if any) that reaches the end of the buffer
        sort!("a[0] < b[0]", SwapStrategy.stable)
            (zip(classes[runStart .. $], codepoints[runStart .. $]));
        static if (composing)
        {
            import std.algorithm.mutation : remove;
            import std.algorithm.searching : countUntil;

            immutable firstStarter = countUntil(classes, 0);
            if (firstStarter >= 0) // no starters?? no recomposition
            {
                // recompose starter by starter until the buffer is exhausted
                size_t pos = firstStarter;
                do
                    pos = recompose(pos, codepoints, classes);
                while (pos != codepoints.length);
                // 2nd pass for hangul syllables
                hangulRecompose(codepoints);
            }
            // drop the dchar.init sentinels left behind by recomposition
            auto clean = remove!("a == dchar.init", SwapStrategy.stable)(codepoints);
            output.put(codepoints[0 .. clean.length]);
        }
        else
            output.put(codepoints);
        // reset the scratch buffers, keeping their capacity for reuse
        codepoints.length = 0;
        classes.length = 0;
        () @trusted {
            // assumeSafeAppend isn't considered pure as of writing, hence the
            // cast. It isn't pure in the sense that the elements after
            // the array in question are affected, but we don't use those
            // making the call pure for our purposes.
            (cast(void delegate() pure nothrow) {codepoints.assumeSafeAppend();})();
            (cast(void delegate() pure nothrow) {classes.assumeSafeAppend();})();
        } ();
        // and move on to the rest of the input
        input = input[anchors[1] .. $];
        anchors = splitNormalized!norm(input);
    } while (anchors[0] != input.length);
    // the remainder is already normalized
    output.put(input[0 .. anchors[0]]);
    return () @trusted inout { return cast(inout(C)[]) output.data; } ();
}
8837 
///
@safe pure unittest
{
    // any encoding works
    wstring greet = "Hello world";
    assert(normalize(greet) is greet); // the same exact slice

    // An example of a character with all 4 forms being different:
    // Greek upsilon with acute and hook symbol (code point 0x03D3)
    string upsilon = "ϓ";
    assert(normalize!NFC(upsilon) == "\u03D3");
    assert(normalize!NFD(upsilon) == "\u03D2\u0301");
    assert(normalize!NFKC(upsilon) == "\u038E");
    assert(normalize!NFKD(upsilon) == "\u03A5\u0301");
}
8852 
@safe pure unittest
{
    import std.conv : text;

    // canonical decomposition of a CJK compatibility ideograph
    auto ideograph = normalize!NFD("abc\uF904def");
    assert(ideograph == "abc\u6ED1def", text(ideograph));
    // compatibility decomposition of superscript digits
    auto digits = normalize!NFKD("2¹⁰");
    assert(digits == "210", digits);
    // precomposed letter splits into base letter + combining mark
    assert(normalize!NFD("Äffin") == "A\u0308ffin");

    // test with dstring
    dstring greet = "Hello world";
    assert(normalize(greet) is greet); // the same exact slice
}
8865 
// Canonically recomposes the code points following the starter at
// input[start]; works in-place and mutates `input`.
// A code point that gets merged into the starter is overwritten with
// dchar.init as a sentinel.
// Returns the index where scanning stopped: input.length, or the index of
// a code point with combining class 0 that was not merged into the starter.
private size_t recompose(size_t start, scope dchar[] input, scope ubyte[] ccc) pure nothrow @safe
{
    assert(input.length == ccc.length);
    // combining class of the last code point that was NOT merged;
    // -1 is outside of the 0 .. 255 range so nothing is blocked initially
    int blockingCC = -1;
    // input[start] is the starter, candidates begin right after it
    size_t pos = start + 1;
    for (; pos != input.length; ++pos)
    {
        immutable cc = ccc[pos];
        // In any character sequence beginning with a starter S
        // a character C is blocked from S if and only if there
        // is some character B between S and C, and either B
        // is a starter or it has the same or higher combining class as C.
        //------------------------
        // Applying to our case:
        // S is input[start]
        // blockingCC is the maximum CCC of characters between C and S,
        //     as ccc are sorted
        // C is input[pos]
        if (cc > blockingCC)
        {
            immutable merged = compose(input[start], input[pos]);
            if (merged != dchar.init)
            {
                input[start] = merged;
                input[pos] = dchar.init; // put a sentinel
                // the merged code point must not block the following
                // ones, so blockingCC is left as is
                continue;
            }
        }
        // not merged: it blocks from now on; a starter ends the scan
        blockingCC = cc;
        if (blockingCC == 0)
            break;
    }
    return pos;
}
8919 
// Scans `input` for the first code point that breaks normalization form
// `norm` and returns a tuple of 2 indexes that delimit:
// the already normalized text, the piece that needs normalization and
// the rest of input starting with a stable code point.
// Both indexes equal input.length when no offending code point is found.
private auto splitNormalized(NormalizationForm norm, C)(scope const(C)[] input)
{
    import std.typecons : tuple;
    ubyte prevCC = 0;

    foreach (pos, dchar ch; input)
    {
        static if (norm == NFC)
        {
            // under NFC code points below U+0300 skip the table lookups
            if (ch < 0x0300)
            {
                prevCC = 0;
                continue;
            }
        }
        immutable ubyte curCC = combiningClass(ch);
        // combining marks must come in non-decreasing class order
        immutable outOfOrder = prevCC > curCC && curCC != 0;
        if (outOfOrder || notAllowedIn!norm(ch))
            return seekStable!norm(pos, input);
        prevCC = curCC;
    }
    return tuple(input.length, input.length);
}
8950 
// Widens the offset `idx` of a code point that breaks normalization form
// `norm` into a region delimited by stable code points, i.e. code points
// with combining class 0 that are always allowed in `norm`.
// Returns a tuple (region_start, region_end) of offsets into `input`:
//  - region_start is the offset of the closest stable code point
//    before `idx`, or 0 if there is none;
//  - region_end is the offset of the first stable code point at or after
//    `idx`, or input.length if there is none.
private auto seekStable(NormalizationForm norm, C)(size_t idx, const scope C[] input)
{
    import std.typecons : tuple;
    import std.utf : codeLength;

    // search backwards from idx, one code point at a time
    auto br = input[0 .. idx];
    size_t region_start = 0;// default
    while (!br.empty)
    {
        immutable dchar ch = br.back;
        if (combiningClass(ch) == 0 && allowedIn!norm(ch))
        {
            // br ends with ch, so this is the offset where ch begins
            region_start = br.length - codeLength!C(ch);
            break;
        }
        // Drop the code point just examined. Popping from the front would
        // leave br.back unchanged, so the search would never look further
        // back and would always fall through to region_start == 0.
        br.popBack();
    }
    // NOTE: can't use find here: " find is a nested function and can't be used..."
    size_t region_end = input.length;// end is $ by default
    foreach (i, dchar ch; input[idx .. $])
    {
        if (combiningClass(ch) == 0 && allowedIn!norm(ch))
        {
            region_end = i + idx;
            break;
        }
    }
    // writeln("Region to normalize: ", input[region_start .. region_end]);
    return tuple(region_start, region_end);
}
8983 
/**
    Checks whether dchar `ch` is always allowed (Quick_Check=YES) in
    normalization form `norm`.
*/
public bool allowedIn(NormalizationForm norm)(dchar ch)
{
    immutable rejected = notAllowedIn!norm(ch);
    return !rejected;
}
8992 
///
@safe unittest
{
    // e.g. Cyrillic is always allowed in every form ...
    immutable dchar cyrillic = 'я';
    assert(allowedIn!NFC(cyrillic));
    assert(allowedIn!NFD(cyrillic));
    assert(allowedIn!NFKC(cyrillic));
    assert(allowedIn!NFKD(cyrillic));
    // ... and so is ASCII
    assert(allowedIn!NFC('Z'));
}
9003 
// not user friendly name but more direct
// Looks `ch` up in the quick-check trie that belongs to normalization form
// `norm` and returns the stored flag (true == not always allowed).
private bool notAllowedIn(NormalizationForm norm)(dchar ch)
{
    static if (norm == NFC)
        alias qcTrie = nfcQCTrie;
    else static if (norm == NFD)
        alias qcTrie = nfdQCTrie;
    else static if (norm == NFKC)
        alias qcTrie = nfkcQCTrie;
    else static if (norm == NFKD)
        alias qcTrie = nfkdQCTrie;
    else
    {
        // The condition must be `false`: a string literal used as the
        // condition is always true, so the assertion would never fire.
        static assert(false, "Unknown normalization form " ~ norm.stringof);
    }
    return qcTrie[ch];
}
9019 
@safe unittest
{
    // Cyrillic is always allowed, whatever the form
    foreach (form; AliasSeq!(NFC, NFD, NFKC, NFKD))
        assert(allowedIn!form('я'));
    // ASCII as well
    assert(allowedIn!NFC('Z'));
}
9028 
9029 }
9030 
9031 version (std_uni_bootstrap)
9032 {
9033     // old version used for bootstrapping of gen_uni.d that generates
9034     // up to date optimal versions of all of isXXX functions
9035     @safe pure nothrow @nogc public bool isWhite(dchar c)
9036     {
9037         import std.ascii : isWhite;
9038         return isWhite(c) ||
9039                c == lineSep || c == paraSep ||
9040                c == '\u0085' || c == '\u00A0' || c == '\u1680' || c == '\u180E' ||
9041                (c >= '\u2000' && c <= '\u200A') ||
9042                c == '\u202F' || c == '\u205F' || c == '\u3000';
9043     }
9044 }
9045 else
9046 {
9047 
// trusted -> avoid bounds check
@trusted pure nothrow @nogc private
{
    import std.internal.unicode_tables; // : toLowerTable, toTitleTable, toUpperTable; // generated file

    // hide template instances behind functions
    // https://issues.dlang.org/show_bug.cgi?id=13232

    // lower case: full and simple index tries plus the mapping table
    ushort toLowerIndex(dchar c)
    {
        return toLowerIndexTrie[c];
    }

    ushort toLowerSimpleIndex(dchar c)
    {
        return toLowerSimpleIndexTrie[c];
    }

    dchar toLowerTab(size_t idx)
    {
        return toLowerTable[idx];
    }

    // title case: full and simple index tries plus the mapping table
    ushort toTitleIndex(dchar c)
    {
        return toTitleIndexTrie[c];
    }

    ushort toTitleSimpleIndex(dchar c)
    {
        return toTitleSimpleIndexTrie[c];
    }

    dchar toTitleTab(size_t idx)
    {
        return toTitleTable[idx];
    }

    // upper case: full and simple index tries plus the mapping table
    ushort toUpperIndex(dchar c)
    {
        return toUpperIndexTrie[c];
    }

    ushort toUpperSimpleIndex(dchar c)
    {
        return toUpperSimpleIndexTrie[c];
    }

    dchar toUpperTab(size_t idx)
    {
        return toUpperTable[idx];
    }
}
9067 
9068 public:
9069 
/++
    Tells whether `c` is a Unicode whitespace $(CHARACTER).
    (general Unicode category: Part of C0(tab, vertical tab, form feed,
    carriage return, and linefeed characters), Zs, Zl, Zp, and NEL(U+0085))
+/
@safe pure nothrow @nogc
public bool isWhite(dchar c)
{
    import tables = std.internal.unicode_tables; // generated file
    // delegate to the pregenerated binary search
    return tables.isWhiteGen(c);
}
9081 
/++
    Tells whether `c` is a Unicode lowercase $(CHARACTER).
+/
@safe pure nothrow @nogc
bool isLower(dchar c)
{
    import std.ascii : isASCII, asciiIsLower = isLower;
    // anything beyond ASCII is answered by the trie
    if (!isASCII(c))
        return lowerCaseTrie[c];
    return asciiIsLower(c);
}
9093 
@safe unittest
{
    import std.ascii : isLower;
    // must agree with std.ascii over the whole ASCII range
    foreach (v; 0 .. 0x80)
        assert(isLower(v) == .isLower(v));

    // lowercase: Cyrillic, Greek HETA, beta, extended Greek
    assert(.isLower('я'));
    assert(.isLower('й'));
    assert(.isLower('\u0371'));
    assert(.isLower('\u03B2')); // beta
    assert(.isLower('\u1F00'));

    // not lowercase: Cyrillic, Greek HETA, capital MU, extended Greek
    assert(!.isLower('Ж'));
    assert(!.isLower('\u0370'));
    assert(!.isLower('\u039C')); // capital MU
    assert(!.isLower('\u1F18'));

    // every member of the lowerCase set is lower and not upper
    foreach (v; unicode.lowerCase.byCodepoint)
        assert(.isLower(v) && !isUpper(v));
}
9113 
9114 
/++
    Tells whether `c` is a Unicode uppercase $(CHARACTER).
+/
@safe pure nothrow @nogc
bool isUpper(dchar c)
{
    import std.ascii : isASCII, asciiIsUpper = isUpper;
    // anything beyond ASCII is answered by the trie
    if (!isASCII(c))
        return upperCaseTrie[c];
    return asciiIsUpper(c);
}
9126 
@safe unittest
{
    // renamed import so that the unqualified `isUpper` below
    // keeps referring to this module's function
    import std.ascii : asciiIsUpper = isUpper;
    // must agree with std.ascii over the whole ASCII range
    foreach (v; 0 .. 0x80)
        assert(asciiIsUpper(v) == isUpper(v));
    assert(!isUpper('й'));
    assert(isUpper('Ж'));
    // Greek HETA
    assert(isUpper('\u0370'));
    assert(!isUpper('\u0371'));
    assert(isUpper('\u039C')); // capital MU
    assert(!isUpper('\u03B2')); // beta
    // from extended Greek
    assert(!isUpper('\u1F00'));
    assert(isUpper('\u1F18'));
    // every member of the upperCase set is upper and not lower
    foreach (v; unicode.upperCase.byCodepoint)
        assert(isUpper(v) && !.isLower(v));
}
9145 
9146 
//TODO: Hidden for now, needs better API.
//Other transforms could use better API as well, but this one is a new primitive.
// Maps `c` through the simple title case table; code points without
// an entry are returned unchanged.
@safe pure nothrow @nogc
private dchar toTitlecase(dchar c)
{
    // fast path: below 0xAA only 'a' .. 'z' are changed
    if (c < 0xAA)
    {
        if (c >= 'a' && c <= 'z')
            return c - 32;
        return c;
    }
    // ushort.max marks "no mapping"
    immutable size_t idx = toTitleSimpleIndex(c);
    return idx == ushort.max ? c : toTitleTab(idx);
}
9168 
// (index function, limit of single code point mappings, table function)
// triples handed to the generic case conversion templates
private
{
    alias UpperTriple = AliasSeq!(toUpperIndex, MAX_SIMPLE_UPPER, toUpperTab);
    alias LowerTriple = AliasSeq!(toLowerIndex, MAX_SIMPLE_LOWER, toLowerTab);
}
9171 
// generic toUpper/toLower on whole string, creates new or returns as is
//  indexFn      - maps a code point to a table index, ushort.max == no mapping
//  maxIdx       - indexes below it denote single code point mappings
//  tableFn      - fetches a table entry by index
//  asciiConvert - conversion applied to ASCII code points
private ElementEncodingType!S[] toCase(alias indexFn, uint maxIdx, alias tableFn, alias asciiConvert, S)(S s)
if (isSomeString!S || (isRandomAccessRange!S && hasLength!S && hasSlicing!S && isSomeChar!(ElementType!S)))
{
    import std.array : appender, array;
    import std.ascii : isASCII;
    import std.utf : byDchar, codeLength;

    alias C = ElementEncodingType!S;

    // Phase 1: find the code unit offset of the first code point
    // that has a mapping at all
    size_t offset = 0;
    bool needsConversion = false;
    foreach (dchar probe; s.byDchar)
    {
        immutable ushort probeIdx = indexFn(probe);
        if (probeIdx != ushort.max)
        {
            needsConversion = true;
            break;
        }
        offset += codeLength!C(probe);
    }

    // nothing to convert: hand the input back (as an array for ranges)
    if (!needsConversion)
    {
        static if (isSomeString!S)
            return s;
        else
            return s.array;
    }

    // Phase 2: copy the untouched prefix verbatim, convert the rest
    auto result = appender!(C[])();
    result.reserve(s.length);
    result.put(s[0 .. offset]);
    foreach (dchar c; s[offset .. $].byDchar)
    {
        if (c.isASCII)
        {
            result.put(asciiConvert(c));
            continue;
        }
        immutable ushort idx = indexFn(c);
        if (idx == ushort.max)
        {
            // no mapping, keep as is
            result.put(c);
        }
        else if (idx < maxIdx)
        {
            // maps to a single code point
            dchar mapped = tableFn(idx);
            result.put(mapped);
        }
        else
        {
            // maps to a sequence: the first entry packs the length into
            // the top 8 bits next to the first code point
            auto val = tableFn(idx);
            immutable uint len = val >> 24;
            result.put(cast(dchar)(val & 0xFF_FFFF));
            foreach (j; idx+1 .. idx+len)
                result.put(tableFn(j));
        }
    }
    return result.data;
}
9227 
// https://issues.dlang.org/show_bug.cgi?id=12428
@safe unittest
{
    import std.array : replicate;

    string original = "abcdefghij";
    // a short slice in front of a much larger allocation
    auto s = original.replicate(300)[0 .. 10];

    toUpper(s);

    // the input slice must be left intact
    assert(s == original);
}
9239 
// https://issues.dlang.org/show_bug.cgi?id=18993
@safe unittest
{
    // both conversions are evaluated at compile time
    enum fromUpper = `몬스터/A`.toLower;
    enum fromLower = `몬스터/a`.toLower;
    static assert(fromUpper.length == fromLower.length);
}
9245 
9246 
// generic toUpper/toLower on whole range, returns range
// The template parameters have the same meaning as in the string based
// conversion: index function (ushort.max == no mapping), limit of single
// code point mappings, table function and ASCII conversion function.
private auto toCaser(alias indexFn, uint maxIdx, alias tableFn, alias asciiConvert, Range)(Range str)
    // Accept range of dchar's
if (isInputRange!Range &&
    isSomeChar!(ElementEncodingType!Range) &&
    ElementEncodingType!Range.sizeof == dchar.sizeof)
{
    static struct ToCaserImpl
    {
        @property bool empty()
        {
            return pending == 0 && source.empty;
        }

        @property auto front()
        {
            if (pending == 0)
                refill();
            return stash[pending - 1];
        }

        void popFront()
        {
            // make sure the current mapping is loaded even if
            // front was never called
            if (pending == 0)
                refill();
            assert(pending);
            // advance the source once the whole mapping is consumed
            if (--pending == 0)
                source.popFront();
        }

        static if (isForwardRange!Range)
        {
            @property auto save()
            {
                auto copy = this;
                copy.source = source.save;
                return copy;
            }
        }

      private:
        // Converts source.front and stores the result in `stash` in
        // reverse order, so that stash[pending - 1] is the next one to yield.
        void refill()
        {
            import std.ascii : isASCII;

            dchar c = source.front;
            if (c.isASCII)
            {
                stash[0] = asciiConvert(c);
                pending = 1;
                return;
            }
            const idx = indexFn(c);
            if (idx == ushort.max)
            {
                // no mapping, pass through
                stash[0] = c;
                pending = 1;
            }
            else if (idx < maxIdx)
            {
                // single code point mapping
                stash[0] = tableFn(idx);
                pending = 1;
            }
            else
            {
                immutable val = tableFn(idx);
                // unpack length + codepoint
                pending = val >> 24;
                if (pending == 0)
                    pending = 1;
                assert(pending <= stash.length);
                stash[pending - 1] = cast(dchar)(val & 0xFF_FFFF);
                foreach (j; 1 .. pending)
                    stash[pending - j - 1] = tableFn(idx + j);
            }
        }

        Range source;
        uint pending;           // code points of the current mapping not yet consumed
        dchar[3] stash = void;
    }

    return ToCaserImpl(str);
}
9331 
/*********************
 * Lazily convert an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
 * or a string to upper or lower case.
 *
 * Does not allocate memory.
 * Characters in UTF-8 or UTF-16 format that cannot be decoded
 * are treated as $(REF replacementDchar, std,utf).
 *
 * Params:
 *      str = string or range of characters
 *
 * Returns:
 *      an input range of `dchar`s
 *
 * See_Also:
 *      $(LREF toUpper), $(LREF toLower)
 */

auto asLowerCase(Range)(Range str)
if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) &&
    !isConvertibleToString!Range)
{
    static if (ElementEncodingType!Range.sizeof == dchar.sizeof)
    {
        // already a range of dchars, convert directly
        static import std.ascii;
        return toCaser!(LowerTriple, std.ascii.toLower)(str);
    }
    else
    {
        import std.utf : byDchar;

        // narrow encoding: decode first
        return asLowerCase(str.byDchar);
    }
}
9367 
/// ditto
auto asUpperCase(Range)(Range str)
if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) &&
    !isConvertibleToString!Range)
{
    static if (ElementEncodingType!Range.sizeof == dchar.sizeof)
    {
        // already a range of dchars, convert directly
        static import std.ascii;
        return toCaser!(UpperTriple, std.ascii.toUpper)(str);
    }
    else
    {
        import std.utf : byDchar;

        // narrow encoding: decode first
        return asUpperCase(str.byDchar);
    }
}
9386 
///
@safe pure unittest
{
    import std.algorithm.comparison : equal;

    auto upper = "hEllo".asUpperCase;
    assert(equal(upper, "HELLO"));
}
9394 
// explicitly undocumented
// overload for types that implicitly convert to a string
auto asLowerCase(Range)(auto ref Range str)
if (isConvertibleToString!Range)
{
    import std.traits : StringTypeOf;
    alias Str = StringTypeOf!Range;
    return asLowerCase!Str(str);
}
9402 
// explicitly undocumented
// overload for types that implicitly convert to a string
auto asUpperCase(Range)(auto ref Range str)
if (isConvertibleToString!Range)
{
    import std.traits : StringTypeOf;
    alias Str = StringTypeOf!Range;
    return asUpperCase!Str(str);
}
9410 
@safe unittest
{
    // non-copyable wrapper that converts to string via alias this
    static struct TestAliasedString
    {
        string get() @safe @nogc pure nothrow { return _s; }
        alias get this;
        @disable this(this);
        string _s;
    }

    // checks that `func` gives the same result for the wrapper
    // as for the plain string
    static bool testAliasedString(alias func, Args...)(string s, Args args)
    {
        import std.algorithm.comparison : equal;
        auto viaWrapper = func(TestAliasedString(s), args);
        auto viaString = func(s, args);
        static if (!is(typeof(equal(viaWrapper, viaString))))
        {
            return viaWrapper == viaString;
        }
        else
        {
            // For ranges, compare contents instead of object identity.
            return equal(viaWrapper, viaString);
        }
    }
    assert(testAliasedString!asLowerCase("hEllo"));
    assert(testAliasedString!asUpperCase("hEllo"));
    assert(testAliasedString!asCapitalized("hEllo"));
}
9440 
@safe unittest
{
    import std.array : array;

    // a saved copy must yield the same sequence as the original
    auto a = "HELLo".asLowerCase;
    auto savea = a.save;
    auto s = a.array;
    assert(s == "hello");
    s = savea.array;
    assert(s == "hello");

    string[] lower = ["123", "abcфеж", "\u0131\u023f\u03c9", "i\u0307\u1Fe2"];
    string[] upper = ["123", "ABCФЕЖ", "I\u2c7e\u2126", "\u0130\u03A5\u0308\u0300"];

    // the lazy ranges must agree with the eager toUpper / toLower
    foreach (i, slwr; lower)
    {
        import std.utf : byChar;

        auto sx = slwr.asUpperCase.byChar.array;
        assert(sx == toUpper(slwr));
        auto sy = upper[i].asLowerCase.byChar.array;
        assert(sy == toLower(upper[i]));
    }

    // Not necessary to call r.front
    for (auto r = lower[3].asUpperCase; !r.empty; r.popFront())
    {
    }

    import std.algorithm.comparison : equal;

    // results of the comparisons are asserted, not discarded
    assert("HELLo"w.asLowerCase.equal("hello"d));
    assert("HELLo"w.asUpperCase.equal("HELLO"d));
    assert("HELLo"d.asLowerCase.equal("hello"d));
    assert("HELLo"d.asUpperCase.equal("HELLO"d));

    import std.utf : byChar;
    assert(toLower("\u1Fe2") == asLowerCase("\u1Fe2").byChar.array);
}
9480 
// generic capitalizer on whole range, returns range
// The first code point goes through the upper case tables given as
// template parameters, the remainder of the range through asLowerCase.
private auto toCapitalizer(alias indexFnUpper, uint maxIdxUpper, alias tableFnUpper,
                           Range)(Range str)
    // Accept range of dchar's
if (isInputRange!Range &&
    isSomeChar!(ElementEncodingType!Range) &&
    ElementEncodingType!Range.sizeof == dchar.sizeof)
{
    static struct ToCapitalizerImpl
    {
        @property bool empty()
        {
            if (inTail)
                return tail.empty;
            return pending == 0 && source.empty;
        }

        @property auto front()
        {
            if (inTail)
                return tail.front;
            if (pending == 0)
                upcaseFirst();
            return stash[pending - 1];
        }

        void popFront()
        {
            if (inTail)
            {
                tail.popFront();
                return;
            }
            // make sure the mapping of the first code point is loaded
            // even if front was never called
            if (pending == 0)
                upcaseFirst();
            assert(pending);
            if (--pending != 0)
                return;
            // first code point fully consumed: switch over to lower casing
            source.popFront();
            tail = source.asLowerCase();
            inTail = true;
        }

        static if (isForwardRange!Range)
        {
            @property auto save()
            {
                auto copy = this;
                copy.source = source.save;
                copy.tail = tail.save;
                return copy;
            }
        }

      private:
        // Upper cases source.front and stores the result in `stash` in
        // reverse order, so that stash[pending - 1] is the next one to yield.
        void upcaseFirst()
        {
            immutable dchar c = source.front;
            const idx = indexFnUpper(c);
            if (idx == ushort.max)
            {
                // no mapping, pass through
                stash[0] = c;
                pending = 1;
            }
            else if (idx < maxIdxUpper)
            {
                // single code point mapping
                stash[0] = tableFnUpper(idx);
                pending = 1;
            }
            else
            {
                immutable val = tableFnUpper(idx);
                // unpack length + codepoint
                pending = val >> 24;
                if (pending == 0)
                    pending = 1;
                assert(pending <= stash.length);
                stash[pending - 1] = cast(dchar)(val & 0xFF_FFFF);
                foreach (j; 1 .. pending)
                    stash[pending - j - 1] = tableFnUpper(idx + j);
            }
        }

        Range source;
        typeof(source.asLowerCase) tail; // range representing the lower case rest of string
        bool inTail = false;    // false for first character, true for rest of string
        dchar[3] stash = void;
        uint pending = 0;
    }

    return ToCapitalizerImpl(str);
}
9571 
/*********************
 * Lazily capitalize an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
 * or string, that is convert the first
 * character to upper case and all subsequent characters to lower case.
 *
 * Does not allocate memory.
 * Characters in UTF-8 or UTF-16 format that cannot be decoded
 * are treated as $(REF replacementDchar, std,utf).
 *
 * Params:
 *      str = string or range of characters
 *
 * Returns:
 *      an InputRange of dchars
 *
 * See_Also:
 *      $(LREF toUpper), $(LREF toLower)
 *      $(LREF asUpperCase), $(LREF asLowerCase)
 */

auto asCapitalized(Range)(Range str)
if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) &&
    !isConvertibleToString!Range)
{
    static if (ElementEncodingType!Range.sizeof == dchar.sizeof)
    {
        // already a range of dchars
        return toCapitalizer!UpperTriple(str);
    }
    else
    {
        import std.utf : byDchar;

        // narrow encoding: decode first
        return toCapitalizer!UpperTriple(str.byDchar);
    }
}
9608 
///
@safe pure unittest
{
    import std.algorithm.comparison : equal;

    auto capitalized = "hEllo".asCapitalized;
    assert(equal(capitalized, "Hello"));
}
9616 
// overload for types that implicitly convert to a string
auto asCapitalized(Range)(auto ref Range str)
if (isConvertibleToString!Range)
{
    import std.traits : StringTypeOf;
    alias Str = StringTypeOf!Range;
    return asCapitalized!Str(str);
}
9623 
// the attributes on this unittest require asCapitalized and front
// to be usable from @safe pure nothrow @nogc code
@safe pure nothrow @nogc unittest
{
    auto capitalized = "hEllo".asCapitalized();
    assert(capitalized.front == 'H');
}
9629 
@safe unittest
{
    import std.array : array;

    // a saved copy must yield the same sequence as the original
    auto a = "hELLo".asCapitalized;
    auto savea = a.save;
    auto s = a.array;
    assert(s == "Hello");
    s = savea.array;
    assert(s == "Hello");

    // pairs of [input, expected capitalization]
    string[2][] cases =
    [
        ["", ""],
        ["h", "H"],
        ["H", "H"],
        ["3", "3"],
        ["123", "123"],
        ["h123A", "H123a"],
        ["феж", "Феж"],
        ["\u1Fe2", "\u03a5\u0308\u0300"],
    ];

    foreach (i; 0 .. cases.length)
    {
        import std.utf : byChar;

        auto r = cases[i][0].asCapitalized.byChar.array;
        auto result = cases[i][1];
        assert(r == result);
    }

    // Don't call r.front
    for (auto r = "\u1Fe2".asCapitalized; !r.empty; r.popFront())
    {
    }

    import std.algorithm.comparison : equal;

    // results of the comparisons are asserted, not discarded
    assert("HELLo"w.asCapitalized.equal("Hello"d));
    assert("hElLO"w.asCapitalized.equal("Hello"d));
    assert("hello"d.asCapitalized.equal("Hello"d));
    assert("HELLO"d.asCapitalized.equal("Hello"d));

    import std.utf : byChar;
    assert(asCapitalized("\u0130").byChar.array == asUpperCase("\u0130").byChar.array);
}
9677 
// TODO: helper, I wish std.utf was more flexible (and straight)
// Writes the UTF-8 encoding of `c` into `buf` starting at `idx` and returns
// the index one past the last code unit written.
private size_t encodeTo(scope char[] buf, size_t idx, dchar c) @trusted pure nothrow @nogc
{
    if (c <= 0x7F)
    {
        // single code unit
        buf[idx] = cast(char) c;
        return idx + 1;
    }
    if (c <= 0x7FF)
    {
        // lead byte + 1 continuation byte
        buf[idx] = cast(char)(0xC0 | (c >> 6));
        buf[idx + 1] = cast(char)(0x80 | (c & 0x3F));
        return idx + 2;
    }
    if (c <= 0xFFFF)
    {
        // lead byte + 2 continuation bytes
        buf[idx] = cast(char)(0xE0 | (c >> 12));
        buf[idx + 1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[idx + 2] = cast(char)(0x80 | (c & 0x3F));
        return idx + 3;
    }
    if (c <= 0x10FFFF)
    {
        // lead byte + 3 continuation bytes
        buf[idx] = cast(char)(0xF0 | (c >> 18));
        buf[idx + 1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        buf[idx + 2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[idx + 3] = cast(char)(0x80 | (c & 0x3F));
        return idx + 4;
    }
    // values above 0x10FFFF are never expected here
    assert(0);
}

@safe unittest
{
    char[] buf = "abcd".dup;

    // A one-unit code point overwrites exactly one code unit.
    size_t pos = encodeTo(buf, 0, 'X');
    assert(buf == "Xbcd");

    // A two-unit code point is written right behind it.
    pos = encodeTo(buf, pos, cast(dchar)'\u00A9');
    assert(buf == "X\xC2\xA9d");
}
9722 
// TODO: helper, I wish std.utf was more flexible (and straight)
// Writes the UTF-16 encoding of `c` into `buf` starting at `idx` and returns
// the index one past the last code unit written.
// Throws a UTFException when `c` lies in the surrogate range 0xD800 .. 0xDFFF.
private size_t encodeTo(scope wchar[] buf, size_t idx, dchar c) @trusted pure
{
    import std.utf : UTFException;

    if (c <= 0xFFFF)
    {
        immutable bool inSurrogateRange = c >= 0xD800 && c <= 0xDFFF;
        if (inSurrogateRange)
            throw (new UTFException("Encoding an isolated surrogate code point in UTF-16")).setSequence(c);
        buf[idx] = cast(wchar) c;
        return idx + 1;
    }
    if (c <= 0x10FFFF)
    {
        // split the 20-bit offset into a high/low surrogate pair
        buf[idx] = cast(wchar)((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
        buf[idx + 1] = cast(wchar)(((c - 0x10000) & 0x3FF) + 0xDC00);
        return idx + 2;
    }
    // values above 0x10FFFF are never expected here
    assert(0);
}
9744 
// UTF-32 flavour: stores `c` as the single code unit at `idx` and returns the
// index right after it.
private size_t encodeTo(scope dchar[] buf, size_t idx, dchar c) @trusted pure nothrow @nogc
{
    buf[idx] = c;
    return idx + 1;
}
9751 
// Re-cases the contents of `s`, writing the converted text back into the same
// buffer for as long as it fits. `s` is re-sliced to the converted length.
//
// `indexFn(ch)` classifies every decoded code point:
//   - `ushort.max`      : no case change, the code point is kept as is;
//   - less than `maxIdx`: 1:1 mapping, `tableFn(index)` is the replacement;
//   - anything else     : 1:m mapping.
// On a 1:m mapping, or when a replacement would not fit before the read
// position, the rest of the work is handed to `toCaseInPlaceAlloc`, which
// assigns a newly allocated array to `s`.
private void toCaseInPlace(alias indexFn, uint maxIdx, alias tableFn, C)(ref C[] s) @trusted pure
if (is(C == char) || is(C == wchar)  || is(C == dchar))
{
    import std.utf : decode, codeLength;
    size_t curIdx = 0;  // read position, advanced by decode
    size_t destIdx = 0; // write position, never ahead of curIdx
    alias slowToCase = toCaseInPlaceAlloc!(indexFn, maxIdx, tableFn);
    size_t lastUnchanged = 0; // start of the unchanged run not yet moved to destIdx
    // in-buffer move of bytes to a new start index
    // the trick is that it may not need to copy at all
    static size_t moveTo(C[] str, size_t dest, size_t from, size_t to)
    {
        // Interestingly we may just bump pointer for a while
        // then have to copy if a re-cased char was smaller than the original
        // later we may regain pace with char that got bigger
        // In the end it sometimes flip-flops between the 2 cases below
        if (dest == from)
            return to;
        // got to copy
        foreach (C c; str[from .. to])
            str[dest++] = c;
        return dest;
    }
    while (curIdx != s.length)
    {
        size_t startIdx = curIdx; // start of the code point decoded below
        immutable ch = decode(s, curIdx);
        // TODO: special case for ASCII
        immutable caseIndex = indexFn(ch);
        if (caseIndex == ushort.max) // unchanged, skip over
        {
            continue;
        }
        else if (caseIndex < maxIdx)  // 1:1 codepoint mapping
        {
            // previous cased chars had the same length as uncased ones
            // thus can just adjust pointer
            destIdx = moveTo(s, destIdx, lastUnchanged, startIdx);
            lastUnchanged = curIdx;
            immutable cased = tableFn(caseIndex);
            immutable casedLen = codeLength!C(cased);
            if (casedLen + destIdx > curIdx) // no place to fit cased char
            {
                // switch to slow codepath, where we allocate
                // (it restarts at startIdx, i.e. re-decodes the current code point)
                return slowToCase(s, startIdx, destIdx);
            }
            else
            {
                destIdx = encodeTo(s, destIdx, cased);
            }
        }
        else  // 1:m codepoint mapping, slow codepath
        {
            destIdx = moveTo(s, destIdx, lastUnchanged, startIdx);
            lastUnchanged = curIdx;
            return slowToCase(s, startIdx, destIdx);
        }
        assert(destIdx <= curIdx);
    }
    // move the trailing unchanged run, if any, behind the converted text
    if (lastUnchanged != s.length)
    {
        destIdx = moveTo(s, destIdx, lastUnchanged, s.length);
    }
    s = s[0 .. destIdx];
}
9817 
// helper to precalculate size of case-converted string
// (the result is counted in code units of type C)
private template toCaseLength(alias indexFn, uint maxIdx, alias tableFn)
{
    size_t toCaseLength(C)(const scope C[] str)
    {
        import std.utf : decode, codeLength;

        size_t total = 0;      // code units accounted for so far
        size_t runStart = 0;   // start of the unchanged run not yet counted
        size_t pos = 0;        // decoding position

        while (pos != str.length)
        {
            immutable cpStart = pos;
            immutable cp = decode(str, pos);
            immutable ushort caseIndex = indexFn(cp);

            // unchanged code points are counted in bulk later on
            if (caseIndex == ushort.max)
                continue;

            // count the unchanged run that precedes this code point
            total += cpStart - runStart;
            runStart = pos;

            if (caseIndex < maxIdx)
            {
                // 1:1 mapping: a single replacement code point
                total += codeLength!C(tableFn(caseIndex));
            }
            else
            {
                // 1:m mapping: the first entry packs the sequence length
                // (top 8 bits) together with the first code point
                immutable packed = tableFn(caseIndex);
                immutable len = packed >> 24;
                immutable dchar first = packed & 0xFF_FFFF;
                total += codeLength!C(first);
                foreach (j; caseIndex+1 .. caseIndex+len)
                    total += codeLength!C(tableFn(j));
            }
        }

        // trailing unchanged run (zero when the last code point was re-cased)
        total += str.length - runStart;
        return total;
    }
}

@safe unittest
{
    alias lowerLen = toCaseLength!(LowerTriple);

    // ASCII-only input keeps its length.
    assert(lowerLen("abcd") == 4);
    // Five two-unit Cyrillic letters followed by three ASCII digits.
    assert(lowerLen("аБВгд456") == 10 + 3);
}
9865 
// slower code path that preallocates and then copies
// case-converted stuff to the new string
//
// `s[0 .. destIdx]` is taken over as already converted; conversion resumes at
// `s[curIdx]`. On return `s` refers to the newly allocated array.
private template toCaseInPlaceAlloc(alias indexFn, uint maxIdx, alias tableFn)
{
    void toCaseInPlaceAlloc(C)(ref C[] s, size_t curIdx,
        size_t destIdx) @trusted pure
        if (is(C == char) || is(C == wchar) || is(C == dchar))
    {
        import std.utf : decode;

        alias tailLength = toCaseLength!(indexFn, maxIdx, tableFn);

        // exact final size: converted prefix plus the converted remainder
        immutable finalLength = destIdx + tailLength(s[curIdx..$]);
        C[] result = new C[finalLength];
        result[0 .. destIdx] = s[0 .. destIdx];

        size_t runStart = curIdx; // first source unit not yet copied over
        while (curIdx != s.length)
        {
            immutable cpStart = curIdx; // start of current codepoint
            immutable cp = decode(s, curIdx);
            immutable caseIndex = indexFn(cp);

            // unchanged code points are copied in bulk later on
            if (caseIndex == ushort.max)
                continue;

            // copy the unchanged run that precedes this code point
            immutable runLength = cpStart - runStart;
            result[destIdx .. destIdx+runLength] = s[runStart .. cpStart];
            runStart = curIdx;
            destIdx += runLength;

            if (caseIndex < maxIdx)
            {
                // 1:1 codepoint mapping
                destIdx = encodeTo(result, destIdx, tableFn(caseIndex));
            }
            else
            {
                // 1:m codepoint mapping: unpack length + first codepoint
                auto packed = tableFn(caseIndex);
                immutable uint len = packed >> 24;
                destIdx = encodeTo(result, destIdx, cast(dchar)(packed & 0xFF_FFFF));
                foreach (j; caseIndex+1 .. caseIndex+len)
                    destIdx = encodeTo(result, destIdx, tableFn(j));
            }
        }

        // trailing unchanged run, if any
        if (runStart != s.length)
        {
            immutable runLength = s.length - runStart;
            result[destIdx .. destIdx+runLength] = s[runStart..$];
            destIdx += runLength;
        }
        assert(result.length == destIdx);
        s = result;
    }
}
9922 
/++
    Performs the Unicode lowercase mapping on `s` in place.
    A few characters grow when lowercased; should the result no longer fit,
    the function reallocates exactly once.
    When `s` contains no uppercase characters it is left unaltered.
+/
void toLowerInPlace(C)(ref C[] s) @trusted pure
if (is(C == char) || is(C == wchar) || is(C == dchar))
{
    toCaseInPlace!(LowerTriple, C)(s);
}
// overloads for the most common cases to reduce compile time
@safe pure /*TODO nothrow*/
{
    void toLowerInPlace(ref char[] s)
    {
        toLowerInPlace!char(s);
    }

    void toLowerInPlace(ref wchar[] s)
    {
        toLowerInPlace!wchar(s);
    }

    void toLowerInPlace(ref dchar[] s)
    {
        toLowerInPlace!dchar(s);
    }
}
9944 
/++
    Performs the Unicode uppercase mapping on `s` in place.
    A few characters grow when uppercased; should the result no longer fit,
    the function reallocates exactly once.
    When `s` contains no lowercase characters it is left unaltered.
+/
void toUpperInPlace(C)(ref C[] s) @trusted pure
if (is(C == char) || is(C == wchar) || is(C == dchar))
{
    toCaseInPlace!(UpperTriple, C)(s);
}
// overloads for the most common cases to reduce compile time/code size
@safe pure /*TODO nothrow*/
{
    void toUpperInPlace(ref char[] s)
    {
        toUpperInPlace!char(s);
    }

    void toUpperInPlace(ref wchar[] s)
    {
        toUpperInPlace!wchar(s);
    }

    void toUpperInPlace(ref dchar[] s)
    {
        toUpperInPlace!dchar(s);
    }
}
9966 
/++
    Returns the lowercase equivalent of `c` when `c` is a Unicode uppercase
    $(CHARACTER); any other `c` is returned unchanged.

    Warning: certain alphabets like German and Greek have no 1:1
    upper-lower mapping. Use overload of toLower which takes full string instead.
+/
@safe pure nothrow @nogc
dchar toLower(dchar c)
{
    if (c >= 0xAA)
    {
        // everything from 0xAA upwards goes through the lookup tables
        size_t idx = toLowerSimpleIndex(c);
        if (idx == ushort.max)
            return c; // no mapping recorded
        return toLowerTab(idx);
    }

    // optimize ASCII case: below 0xAA only 'A' .. 'Z' change
    if ('A' <= c && c <= 'Z')
        return c + 32;
    return c;
}
9993 
/++
    Builds an array equal to `s` with every character converted to lowercase
    (by performing Unicode lowercase mapping).
    If no character of `s` is affected and `s` is a `string`-like type,
    `s` itself is returned.

    Params:
        s = A $(REF_ALTTEXT random access range, isRandomAccessRange, std,range,primitives)
        of characters
    Returns:
        An array with the same element type as `s`.
+/
ElementEncodingType!S[] toLower(S)(return scope S s) @trusted
if (isSomeString!S)
{
    static import std.ascii;

    return toCase!(LowerTriple, std.ascii.toLower)(s);
}

/// ditto
ElementEncodingType!S[] toLower(S)(S s)
if (!isSomeString!S && (isRandomAccessRange!S && hasLength!S && hasSlicing!S && isSomeChar!(ElementType!S)))
{
    static import std.ascii;

    return toCase!(LowerTriple, std.ascii.toLower)(s);
}

// overloads for the most common cases to reduce compile time
@safe pure /*TODO nothrow*/
{
    string toLower(return scope string s)
    {
        return toLower!string(s);
    }

    wstring toLower(return scope wstring s)
    {
        return toLower!wstring(s);
    }

    dstring toLower(return scope dstring s)
    {
        return toLower!dstring(s);
    }

    @safe unittest
    {
        // https://issues.dlang.org/show_bug.cgi?id=16663
        // A struct that aliases itself to a string must still be accepted.

        static struct Wrapped
        {
            string data;
            alias data this;
        }

        // compile-only check, never called
        void instantiate()
        {
            auto lowered = toLower(Wrapped(""));
        }
    }
}
10047 
10048 
@safe unittest
{
    import std.format : format;
    static import std.ascii;

    // The ASCII range has to agree with std.ascii.
    foreach (code; 0 .. 0x80)
        assert(toLower(code) == std.ascii.toLower(code));

    // Single code points outside ASCII.
    assert(toLower('Я') == 'я');
    assert(toLower('Δ') == 'δ');

    // Every uppercase code point either stays put or maps to a lowercase one.
    foreach (upperCp; unicode.upperCase.byCodepoint)
    {
        dchar mapped = upperCp.toLower();
        assert(mapped == upperCp || isLower(mapped), format("%s -> %s", upperCp, mapped));
    }

    // Whole-string conversions, including the sharp s pair.
    assert(toLower("АЯ") == "ая");
    assert("\u1E9E".toLower == "\u00df");
    assert("\u00df".toUpper == "SS");
}
10067 
// https://issues.dlang.org/show_bug.cgi?id=9629
// Converting a slice in place must be visible through the array it was cut from.
@safe unittest
{
    wchar[] whole = "hello þ world"w.dup;
    auto thorn = whole[6 .. 7];
    toUpperInPlace(thorn);
    assert(whole == "hello Þ world");
}
10076 
10077 
@safe unittest
{
    import std.algorithm.comparison : cmp;

    // plain ASCII
    string src = "FoL";
    string lowered = toLower(src);
    assert(cmp(lowered, "fol") == 0, lowered);
    assert(lowered != src);

    char[] inPlace = src.dup;
    toLowerInPlace(inPlace);
    assert(inPlace == lowered);

    // Latin Extended-A
    src = "A\u0100B\u0101d";
    lowered = toLower(src);
    inPlace = src.dup;
    assert(cmp(lowered, "a\u0101b\u0101d") == 0);
    assert(lowered !is src);
    toLowerInPlace(inPlace);
    assert(inPlace == lowered);

    // Cyrillic omega
    src = "A\u0460B\u0461d";
    lowered = toLower(src);
    inPlace = src.dup;
    assert(cmp(lowered, "a\u0461b\u0461d") == 0);
    assert(lowered !is src);
    toLowerInPlace(inPlace);
    assert(inPlace == lowered);

    // one code point lowercasing to two
    src = "\u0130";
    lowered = toLower(src);
    inPlace = src.dup;
    assert(lowered == "i\u0307");
    assert(lowered !is src);
    toLowerInPlace(inPlace);
    assert(inPlace == lowered);

    // Test on wchar and dchar strings.
    assert(toLower("Some String"w) == "some string"w);
    assert(toLower("Some String"d) == "some string"d);

    // https://issues.dlang.org/show_bug.cgi?id=12455
    dchar cp = 'İ'; // '\U0130' LATIN CAPITAL LETTER I WITH DOT ABOVE
    assert(isUpper(cp));
    assert(toLower(cp) == 'i');
    // extends on https://issues.dlang.org/show_bug.cgi?id=12455 report
    // check simple-case toUpper too
    cp = '\u1f87';
    assert(isLower(cp));
    assert(toUpper(cp) == '\u1F8F');
}
10128 
@safe pure unittest
{
    import std.algorithm.comparison : cmp, equal;
    import std.utf : byCodeUnit;

    // toLower also accepts non-string random access ranges of code units
    auto ascii = "FoL".byCodeUnit;
    assert(cmp(ascii.toLower, "fol") == 0);

    auto mixed = "A\u0460B\u0461d".byCodeUnit;
    assert(cmp(mixed.toLower, "a\u0461b\u0461d") == 0);
}
10138 
/++
    Returns the uppercase equivalent of `c` when `c` is a Unicode lowercase
    $(CHARACTER); any other `c` is returned unchanged.

    Warning:
    Certain alphabets like German and Greek have no 1:1
    upper-lower mapping. Use overload of toUpper which takes full string instead.

    toUpper can be passed to $(REF map, std,algorithm,iteration) to obtain an
    algorithm that converts a range of characters to upper case without
    allocating memory.
    A string can then be produced by using $(REF copy, std,algorithm,mutation)
    to send it to an $(REF appender, std,array).
+/
@safe pure nothrow @nogc
dchar toUpper(dchar c)
{
    if (c >= 0xAA)
    {
        // everything from 0xAA upwards goes through the lookup tables
        size_t idx = toUpperSimpleIndex(c);
        if (idx == ushort.max)
            return c; // no mapping recorded
        return toUpperTab(idx);
    }

    // optimize ASCII case: below 0xAA only 'a' .. 'z' change
    if ('a' <= c && c <= 'z')
        return c - 32;
    return c;
}

///
@safe unittest
{
    import std.algorithm.iteration : map;
    import std.algorithm.mutation : copy;
    import std.array : appender;

    // lazily upper-case each character and collect the result
    auto sink = appender!(char[])();
    "hello".map!toUpper.copy(sink);
    assert(sink.data == "HELLO");
}
10184 
@safe unittest
{
    import std.format : format;
    static import std.ascii;

    // The ASCII range has to agree with std.ascii.
    foreach (code; 0 .. 0x80)
        assert(toUpper(code) == std.ascii.toUpper(code));

    // Single code points outside ASCII.
    assert(toUpper('я') == 'Я');
    assert(toUpper('δ') == 'Δ');

    // Every lowercase code point stays put or maps to an uppercase or
    // titlecase letter.
    auto titleLetters = unicode.Titlecase_Letter;
    foreach (lowerCp; unicode.lowerCase.byCodepoint)
    {
        dchar mapped = lowerCp.toUpper();
        assert(mapped == lowerCp || isUpper(mapped) || titleLetters[mapped],
            format("%x -> %x", lowerCp, mapped));
    }
}
10201 
/++
    Allocates an array equal to `s` with every character converted to
    uppercase (by performing Unicode uppercase mapping).
    If no character of `s` is affected and `s` is a `string`-like type,
    `s` itself is returned.

    Params:
        s = A $(REF_ALTTEXT random access range, isRandomAccessRange, std,range,primitives)
        of characters
    Returns:
        A new array with the same element type as `s`.
+/
ElementEncodingType!S[] toUpper(S)(return scope S s) @trusted
if (isSomeString!S)
{
    static import std.ascii;

    return toCase!(UpperTriple, std.ascii.toUpper)(s);
}

/// ditto
ElementEncodingType!S[] toUpper(S)(S s)
if (!isSomeString!S && (isRandomAccessRange!S && hasLength!S && hasSlicing!S && isSomeChar!(ElementType!S)))
{
    static import std.ascii;

    return toCase!(UpperTriple, std.ascii.toUpper)(s);
}

// overloads for the most common cases to reduce compile time
@safe pure /*TODO nothrow*/
{
    string toUpper(return scope string s)
    {
        return toUpper!string(s);
    }

    wstring toUpper(return scope wstring s)
    {
        return toUpper!wstring(s);
    }

    dstring toUpper(return scope dstring s)
    {
        return toUpper!dstring(s);
    }

    @safe unittest
    {
        // https://issues.dlang.org/show_bug.cgi?id=16663
        // A struct that aliases itself to a string must still be accepted.

        static struct Wrapped
        {
            string data;
            alias data this;
        }

        // compile-only check, never called
        void instantiate()
        {
            auto uppered = toUpper(Wrapped(""));
        }
    }
}
10255 
@safe unittest
{
    import std.algorithm.comparison : cmp;

    string src = "FoL";
    string uppered;
    char[] inPlace;

    // plain ASCII
    uppered = toUpper(src);
    inPlace = src.dup;
    toUpperInPlace(inPlace);
    assert(inPlace == uppered, inPlace);
    assert(cmp(uppered, "FOL") == 0);
    assert(uppered !is src);

    // Latin Extended-A
    src = "a\u0100B\u0101d";
    uppered = toUpper(src);
    inPlace = src.dup;
    toUpperInPlace(inPlace);
    assert(inPlace == uppered);
    assert(cmp(uppered, "A\u0100B\u0100D") == 0);
    assert(uppered !is src);

    // Cyrillic omega
    src = "a\u0460B\u0461d";
    uppered = toUpper(src);
    inPlace = src.dup;
    toUpperInPlace(inPlace);
    assert(inPlace == uppered);
    assert(cmp(uppered, "A\u0460B\u0460D") == 0);
    assert(uppered !is src);
}
10284 
@safe unittest
{
    // Checks the allocating and the in-place conversions of `s` against the
    // expected upper- and lowercase forms.
    static void doTest(C)(const(C)[] s, const(C)[] trueUp, const(C)[] trueLow)
    {
        import std.format : format;
        // three compound specs: source, actual result, expected result
        string diff = "src: %( %x %)\nres: %( %x %)\ntru: %( %x %)";
        auto low = s.toLower() , up = s.toUpper();
        auto lowInp = s.dup, upInp = s.dup;
        lowInp.toLowerInPlace();
        upInp.toUpperInPlace();
        // `diff` needs all three arguments; passing only two would make a
        // failing assert raise a format error instead of showing this message.
        assert(low == trueLow, format(diff, s, low, trueLow));
        assert(up == trueUp,  format(diff, s, up, trueUp));
        assert(lowInp == trueLow,
            format(diff, cast(const(ubyte)[]) s, cast(const(ubyte)[]) lowInp, cast(const(ubyte)[]) trueLow));
        assert(upInp == trueUp,
            format(diff, cast(const(ubyte)[]) s, cast(const(ubyte)[]) upInp, cast(const(ubyte)[]) trueUp));
    }
    static foreach (S; AliasSeq!(dstring, wstring, string))
    {{

        S easy = "123";
        S good = "abCФеж";
        S awful = "\u0131\u023f\u2126";
        S wicked = "\u0130\u1FE2";
        auto options = [easy, good, awful, wicked];
        // expected results, index-aligned with `options`
        S[] lower = ["123", "abcфеж", "\u0131\u023f\u03c9", "i\u0307\u1Fe2"];
        S[] upper = ["123", "ABCФЕЖ", "I\u2c7e\u2126", "\u0130\u03A5\u0308\u0300"];

        // for these inputs the in-place conversions must keep the same array
        foreach (val; [easy, good])
        {
            auto e = val.dup;
            auto g = e;
            e.toUpperInPlace();
            assert(e is g);
            e.toLowerInPlace();
            assert(e is g);
        }
        foreach (i, v; options)
        {
            doTest(v, upper[i], lower[i]);
        }

        // a few combinatorial runs
        foreach (i; 0 .. options.length)
        foreach (j; i .. options.length)
        foreach (k; j .. options.length)
        {
            auto sample = options[i] ~ options[j] ~ options[k];
            auto sample2 = options[k] ~ options[j] ~ options[i];
            doTest(sample, upper[i] ~ upper[j] ~ upper[k],
                lower[i] ~ lower[j] ~ lower[k]);
            doTest(sample2, upper[k] ~ upper[j] ~ upper[i],
                lower[k] ~ lower[j] ~ lower[i]);
        }
    }}
}
10341 
// test random access ranges
@safe pure unittest
{
    import std.algorithm.comparison : cmp;
    import std.utf : byCodeUnit;

    auto ascii = "FoL".byCodeUnit;
    assert(cmp(ascii.toUpper, "FOL") == 0);

    auto mixed = "a\u0460B\u0461d".byCodeUnit;
    assert(cmp(mixed.toUpper, "A\u0460B\u0460D") == 0);
}
10352 
/++
    Tells whether `c` is a Unicode alphabetic $(CHARACTER)
    (general Unicode category: Alphabetic).
+/
@safe pure nothrow @nogc
bool isAlpha(dchar c)
{
    // anything from 0xAA upwards is answered by the trie
    if (c >= 0xAA)
        return alphaTrie[c];

    // optimization: below 0xAA only the ASCII letters qualify
    return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z');
}

@safe unittest
{
    auto alphabetic = unicode("Alphabetic");

    // every member of the set is recognised ...
    foreach (cp; alphabetic.byCodepoint)
        assert(isAlpha(cp));
    // ... and the low code points agree with the set in both directions
    foreach (cp; 0 .. 0x4000)
        assert((cp in alphabetic) == isAlpha(cp));
}
10377 
10378 
/++
    Tells whether `c` is a Unicode mark
    (general Unicode category: Mn, Me, Mc).
+/
@safe pure nothrow @nogc
bool isMark(dchar c)
{
    return markTrie[c];
}

@safe unittest
{
    auto marks = unicode("Mark");

    // every member of the set is recognised ...
    foreach (cp; marks.byCodepoint)
        assert(isMark(cp));
    // ... and the low code points agree with the set in both directions
    foreach (cp; 0 .. 0x4000)
        assert((cp in marks) == isMark(cp));
}
10397 
/++
    Tells whether `c` is a Unicode numerical $(CHARACTER)
    (general Unicode category: Nd, Nl, No).
+/
@safe pure nothrow @nogc
bool isNumber(dchar c)
{
    // non-ASCII code points are answered by the trie
    if (c > 0x7F)
        return numberTrie[c];

    // optimization for ascii case: only the decimal digits
    return '0' <= c && c <= '9';
}

@safe unittest
{
    auto numbers = unicode("N");

    // every member of the set is recognised ...
    foreach (cp; numbers.byCodepoint)
        assert(isNumber(cp));
    // ... and the low code points agree with the set in both directions
    foreach (cp; 0 .. 0x4000)
        assert((cp in numbers) == isNumber(cp));
}
10424 
/++
    Tells whether `c` is a Unicode alphabetic $(CHARACTER) or number.
    (general Unicode category: Alphabetic, Nd, Nl, No).

    Params:
        c = any Unicode character
    Returns:
        `true` if the character is in the Alphabetic, Nd, Nl, or No Unicode
        categories
+/
@safe pure nothrow @nogc
bool isAlphaNum(dchar c)
{
    static import std.ascii;

    // outside ASCII, combine the two Unicode predicates
    if (!std.ascii.isASCII(c))
        return isAlpha(c) || isNumber(c);

    // optimization for ascii case
    return std.ascii.isAlphaNum(c);
}

@safe unittest
{
    auto numbers = unicode("N");
    auto alphabetic = unicode("Alphabetic");

    // members of either set are recognised
    foreach (cp; numbers.byCodepoint)
        assert(isAlphaNum(cp));

    foreach (cp; alphabetic.byCodepoint)
        assert(isAlphaNum(cp));

    // the low code points agree with the union of both sets
    foreach (cp; 0 .. 0x4000)
    {
        assert(((cp in numbers) || (cp in alphabetic)) == isAlphaNum(cp));
    }
}
10467 
/++
    Tells whether `c` is a Unicode punctuation $(CHARACTER)
    (general Unicode category: Pd, Ps, Pe, Pc, Po, Pi, Pf).
+/
@safe pure nothrow @nogc
bool isPunctuation(dchar c)
{
    static import std.ascii;

    // non-ASCII code points are answered by the trie
    if (c > 0x7F)
        return punctuationTrie[c];

    // optimization for ascii case
    return std.ascii.isPunctuation(c);
}

@safe unittest
{
    // a few ASCII and Latin-1 punctuation marks
    assert(isPunctuation('\u0021'));
    assert(isPunctuation('\u0028'));
    assert(isPunctuation('\u0029'));
    assert(isPunctuation('\u002D'));
    assert(isPunctuation('\u005F'));
    assert(isPunctuation('\u00AB'));
    assert(isPunctuation('\u00BB'));

    // every member of the set is recognised
    foreach (cp; unicode("P").byCodepoint)
        assert(isPunctuation(cp));
}
10500 
/++
    Tells whether `c` is a Unicode symbol $(CHARACTER)
    (general Unicode category: Sm, Sc, Sk, So).
+/
@safe pure nothrow @nogc
bool isSymbol(dchar c)
{
    return symbolTrie[c];
}

@safe unittest
{
    import std.format : format;

    // a few ASCII and Latin-1 symbols
    assert(isSymbol('\u0024'));
    assert(isSymbol('\u002B'));
    assert(isSymbol('\u005E'));
    assert(isSymbol('\u00A6'));

    // every member of the set is recognised
    foreach (cp; unicode("S").byCodepoint)
        assert(isSymbol(cp), format("%04x", cp));
}
10521 
/++
    Tells whether `c` is a Unicode space $(CHARACTER)
    (general Unicode category: Zs)
    Note: This doesn't include '\n', '\r', '\t' and other non-space $(CHARACTER).
    For commonly used less strict semantics see $(LREF isWhite).
+/
@safe pure nothrow @nogc
bool isSpace(dchar c)
{
    import std.internal.unicode_tables : isSpaceGen; // generated file

    return isSpaceGen(c);
}

@safe unittest
{
    assert(isSpace('\u0020'));

    auto spaces = unicode.Zs;
    // every member of the set is recognised ...
    foreach (cp; spaces.byCodepoint)
        assert(isSpace(cp));
    // ... and the low code points agree with the set in both directions
    foreach (cp; 0 .. 0x1000)
        assert(isSpace(cp) == spaces[cp]);
}
10544 
10545 
/++
    Tells whether `c` is a Unicode graphical $(CHARACTER)
    (general Unicode category: L, M, N, P, S, Zs).

+/
@safe pure nothrow @nogc
bool isGraphical(dchar c)
{
    return graphicalTrie[c];
}


@safe unittest
{
    import std.format : format;

    auto graphical = unicode("Graphical");

    // every member of the set is recognised ...
    foreach (cp; graphical.byCodepoint)
        assert(isGraphical(cp), format("%4x", cp));
    // ... and the low code points agree with the set in both directions
    foreach (cp; 0 .. 0x4000)
        assert((cp in graphical) == isGraphical(cp));
}
10567 
10568 
/++
    Tells whether `c` is a Unicode control $(CHARACTER)
    (general Unicode category: Cc).
+/
@safe pure nothrow @nogc
bool isControl(dchar c)
{
    import std.internal.unicode_tables : isControlGen; // generated file

    return isControlGen(c);
}

@safe unittest
{
    // spot checks
    assert(isControl('\u0000'));
    assert(isControl('\u0081'));
    assert(!isControl('\u0100'));

    auto controls = unicode.Cc;
    // every member of the set is recognised ...
    foreach (cp; controls.byCodepoint)
        assert(isControl(cp));
    // ... and the low code points agree with the set in both directions
    foreach (cp; 0 .. 0x1000)
        assert(isControl(cp) == controls[cp]);
}
10591 
10592 
/++
    Tells whether `c` is a Unicode formatting $(CHARACTER)
    (general Unicode category: Cf).
+/
@safe pure nothrow @nogc
bool isFormat(dchar c)
{
    import std.internal.unicode_tables : isFormatGen; // generated file

    return isFormatGen(c);
}


@safe unittest
{
    // spot check
    assert(isFormat('\u00AD'));

    // every member of the set is recognised
    foreach (cp; unicode("Format").byCodepoint)
        assert(isFormat(cp));
}
10611 
// code points for private use, surrogates are not likely to change in near future
// if need be they can be generated from unicode data as well

/++
    Tells whether `c` is a Unicode Private Use $(CODEPOINT)
    (general Unicode category: Co).
+/
@safe pure nothrow @nogc
bool isPrivateUse(dchar c)
{
    // first range: 0xE000 .. 0xF8FF
    if (0x00_E000 <= c && c <= 0x00_F8FF)
        return true;
    // second range: 0xF0000 .. 0xFFFFD
    if (0x0F_0000 <= c && c <= 0x0F_FFFD)
        return true;
    // third range: 0x100000 .. 0x10FFFD
    return 0x10_0000 <= c && c <= 0x10_FFFD;
}
10626 
/++
    Tests whether `c` is a Unicode surrogate $(CODEPOINT)
    (general Unicode category: Cs).
+/
@safe pure nothrow @nogc
bool isSurrogate(dchar c)
{
    // Inclusive bounds of the accepted range.
    enum dchar first = 0xD800;
    enum dchar last = 0xDFFF;

    if (c < first)
        return false;
    return c <= last;
}
10636 
/++
    Tests whether `c` is a Unicode high surrogate (lead surrogate).
+/
@safe pure nothrow @nogc
bool isSurrogateHi(dchar c)
{
    // The range 0xD800 .. 0xDBFF is exactly the set of values whose bits
    // above the low ten equal those of 0xD800.
    return (c & 0xFFFF_FC00) == 0xD800;
}
10645 
/++
    Tests whether `c` is a Unicode low surrogate (trail surrogate).
+/
@safe pure nothrow @nogc
bool isSurrogateLo(dchar c)
{
    // The range 0xDC00 .. 0xDFFF is exactly the set of values whose bits
    // above the low ten equal those of 0xDC00.
    return (c & 0xFFFF_FC00) == 0xDC00;
}
10654 
/++
    Tests whether `c` is a Unicode non-character, i.e.
    a $(CODEPOINT) that has no abstract character assigned to it
    (general Unicode category: Cn).
+/
@safe pure nothrow @nogc
bool isNonCharacter(dchar c)
{
    // Membership test against the pre-generated non-character trie.
    const bool member = nonCharacterTrie[c];
    return member;
}
10665 
@safe unittest
{
    // Every member of the Cn set must be reported as a non-character.
    auto unassigned = unicode("Cn");
    foreach (cp; unassigned.byCodepoint)
    {
        assert(isNonCharacter(cp));
    }
}
10672 
10673 private:
// load static data from pre-generated tables into usable data structures
10675 
10676 
// Builds a CodepointSet from a compressed interval table.
@safe auto asSet(const (ubyte)[] compressed) pure
{
    // Decompress first, then hand the intervals to the set factory.
    auto intervals = decompressIntervals(compressed);
    return CodepointSet.fromIntervals(intervals);
}
10681 
// Wraps the offsets, sizes and data of a TrieEntry in a const CodepointTrie.
@safe pure nothrow auto asTrie(T...)(const scope TrieEntry!T e)
{
    alias Trie = const(CodepointTrie!T);
    return Trie(e.offsets, e.sizes, e.data);
}
10686 
@safe pure nothrow @nogc @property
{
    // Each accessor below keeps an `auto` return type on purpose: the compiler
    // then only runs semantic analysis on the return type when the accessor is
    // actually used. They are plain functions rather than templates so that
    // they do not increase the object size of the caller.

    // general classification tables
    auto lowerCaseTrie()
    {
        static immutable trie = asTrie(lowerCaseTrieEntries);
        return trie;
    }

    auto upperCaseTrie()
    {
        static immutable trie = asTrie(upperCaseTrieEntries);
        return trie;
    }

    auto simpleCaseTrie()
    {
        static immutable trie = asTrie(simpleCaseTrieEntries);
        return trie;
    }

    auto fullCaseTrie()
    {
        static immutable trie = asTrie(fullCaseTrieEntries);
        return trie;
    }

    auto alphaTrie()
    {
        static immutable trie = asTrie(alphaTrieEntries);
        return trie;
    }

    auto markTrie()
    {
        static immutable trie = asTrie(markTrieEntries);
        return trie;
    }

    auto numberTrie()
    {
        static immutable trie = asTrie(numberTrieEntries);
        return trie;
    }

    auto punctuationTrie()
    {
        static immutable trie = asTrie(punctuationTrieEntries);
        return trie;
    }

    auto symbolTrie()
    {
        static immutable trie = asTrie(symbolTrieEntries);
        return trie;
    }

    auto graphicalTrie()
    {
        static immutable trie = asTrie(graphicalTrieEntries);
        return trie;
    }

    auto nonCharacterTrie()
    {
        static immutable trie = asTrie(nonCharacterTrieEntries);
        return trie;
    }

    // normalization quick-check tables
    auto nfcQCTrie()
    {
        import std.internal.unicode_norm : nfcQCTrieEntries;

        static immutable trie = asTrie(nfcQCTrieEntries);
        return trie;
    }

    auto nfdQCTrie()
    {
        import std.internal.unicode_norm : nfdQCTrieEntries;

        static immutable trie = asTrie(nfdQCTrieEntries);
        return trie;
    }

    auto nfkcQCTrie()
    {
        import std.internal.unicode_norm : nfkcQCTrieEntries;

        static immutable trie = asTrie(nfkcQCTrieEntries);
        return trie;
    }

    auto nfkdQCTrie()
    {
        import std.internal.unicode_norm : nfkdQCTrieEntries;

        static immutable trie = asTrie(nfkdQCTrieEntries);
        return trie;
    }

    // grapheme breaking algorithm tables
    auto spacingMarkTrie()
    {
        import std.internal.unicode_grapheme : spacingMarkTrieEntries;

        static immutable trie = asTrie(spacingMarkTrieEntries);
        return trie;
    }

    auto graphemeExtendTrie()
    {
        import std.internal.unicode_grapheme : graphemeExtendTrieEntries;

        static immutable trie = asTrie(graphemeExtendTrieEntries);
        return trie;
    }

    auto hangLV()
    {
        import std.internal.unicode_grapheme : hangulLVTrieEntries;

        static immutable trie = asTrie(hangulLVTrieEntries);
        return trie;
    }

    auto hangLVT()
    {
        import std.internal.unicode_grapheme : hangulLVTTrieEntries;

        static immutable trie = asTrie(hangulLVTTrieEntries);
        return trie;
    }

    auto prependTrie()
    {
        import std.internal.unicode_grapheme : prependTrieEntries;

        static immutable trie = asTrie(prependTrieEntries);
        return trie;
    }

    auto graphemeControlTrie()
    {
        import std.internal.unicode_grapheme : controlTrieEntries;

        static immutable trie = asTrie(controlTrieEntries);
        return trie;
    }

    auto xpictoTrie()
    {
        import std.internal.unicode_grapheme : Extended_PictographicTrieEntries;

        static immutable trie = asTrie(Extended_PictographicTrieEntries);
        return trie;
    }

    // composition/decomposition tables
    auto combiningClassTrie()
    {
        import std.internal.unicode_comp : combiningClassTrieEntries;

        static immutable trie = asTrie(combiningClassTrieEntries);
        return trie;
    }

    auto compatMappingTrie()
    {
        import std.internal.unicode_decomp : compatMappingTrieEntries;

        static immutable trie = asTrie(compatMappingTrieEntries);
        return trie;
    }

    auto canonMappingTrie()
    {
        import std.internal.unicode_decomp : canonMappingTrieEntries;

        static immutable trie = asTrie(canonMappingTrieEntries);
        return trie;
    }

    auto compositionJumpTrie()
    {
        import std.internal.unicode_comp : compositionJumpTrieEntries;

        static immutable trie = asTrie(compositionJumpTrieEntries);
        return trie;
    }

    // case conversion tables
    auto toUpperIndexTrie()
    {
        static immutable trie = asTrie(toUpperIndexTrieEntries);
        return trie;
    }

    auto toLowerIndexTrie()
    {
        static immutable trie = asTrie(toLowerIndexTrieEntries);
        return trie;
    }

    auto toTitleIndexTrie()
    {
        static immutable trie = asTrie(toTitleIndexTrieEntries);
        return trie;
    }

    // simple case conversion tables
    auto toUpperSimpleIndexTrie()
    {
        static immutable trie = asTrie(toUpperSimpleIndexTrieEntries);
        return trie;
    }

    auto toLowerSimpleIndexTrie()
    {
        static immutable trie = asTrie(toLowerSimpleIndexTrieEntries);
        return trie;
    }

    auto toTitleSimpleIndexTrie()
    {
        static immutable trie = asTrie(toTitleSimpleIndexTrieEntries);
        return trie;
    }

}
10823 
10824 }// version (!std_uni_bootstrap)