std.utf source code

1 // Written in the D programming language.
2 
3 /++
4     Encode and decode UTF-8, UTF-16 and UTF-32 strings.
5 
6     UTF character support is restricted to
7     $(D '\u0000' &lt;= character &lt;= '\U0010FFFF').
8 
9 $(SCRIPT inhibitQuickIndex = 1;)
10 $(DIVC quickindex,
11 $(BOOKTABLE,
12 $(TR $(TH Category) $(TH Functions))
13 $(TR $(TD Decode) $(TD
14     $(LREF decode)
15     $(LREF decodeFront)
16 ))
17 $(TR $(TD Lazy decode) $(TD
18     $(LREF byCodeUnit)
19     $(LREF byChar)
20     $(LREF byWchar)
21     $(LREF byDchar)
22     $(LREF byUTF)
23 ))
24 $(TR $(TD Encode) $(TD
25     $(LREF encode)
26     $(LREF toUTF8)
27     $(LREF toUTF16)
28     $(LREF toUTF32)
29     $(LREF toUTFz)
30     $(LREF toUTF16z)
31 ))
32 $(TR $(TD Length) $(TD
33     $(LREF codeLength)
34     $(LREF count)
35     $(LREF stride)
36     $(LREF strideBack)
37 ))
38 $(TR $(TD Index) $(TD
39     $(LREF toUCSindex)
40     $(LREF toUTFindex)
41 ))
42 $(TR $(TD Validation) $(TD
43     $(LREF isValidDchar)
44     $(LREF isValidCodepoint)
45     $(LREF validate)
46 ))
47 $(TR $(TD Miscellaneous) $(TD
48     $(LREF replacementDchar)
49     $(LREF UseReplacementDchar)
50     $(LREF UTFException)
51 ))
52 ))
53     See_Also:
54         $(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia)<br>
55         $(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)<br>
56         $(LINK http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335)
57     Copyright: Copyright The D Language Foundation 2000 - 2012.
58     License:   $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
59     Authors:   $(HTTP digitalmars.com, Walter Bright) and
60                $(HTTP jmdavisprog.com, Jonathan M Davis)
61     Source:    $(PHOBOSSRC std/utf.d)
62    +/
63 module std.utf;
64 
65 import std.exception : basicExceptionCtors;
66 import core.exception : UnicodeException;
67 import std.meta : AliasSeq;
68 import std.range;
69 import std.traits : isAutodecodableString, isConvertibleToString,
70     isSomeChar, isSomeString, isStaticArray, Unqual;
71 import std.typecons : Flag, Yes, No;
72 
73 
74 /++
75     Exception thrown on errors in std.utf functions.
76   +/
class UTFException : UnicodeException
{
    import core.internal.string : unsignedToTempString, UnsignedStringBuf;

    // Code units of the offending sequence; only the first `len` entries
    // have been recorded.
    uint[4] sequence;
    // Number of entries of `sequence` that are in use.
    size_t  len;

    // Records up to four code units describing the invalid sequence and
    // returns `this`, so the call can be chained onto a freshly built
    // exception.
    @safe pure nothrow @nogc
    UTFException setSequence(scope uint[] data...) return
    {
        assert(data.length <= 4);

        // Clamp as well as assert, so a build without asserts still cannot
        // index past the end of `sequence`.
        len = data.length;
        if (len > 4)
            len = 4;
        foreach (pos; 0 .. len)
            sequence[pos] = data[pos];

        return this;
    }

    // FIXME: Use std.exception.basicExceptionCtors here once
    // https://issues.dlang.org/show_bug.cgi?id=11500 is fixed

    /**
    Standard exception constructors.
     */
    this(string msg, string file = __FILE__, size_t line = __LINE__,
         Throwable next = null) @nogc @safe pure nothrow
    {
        // No position is known for this overload; 0 is forwarded as the index.
        super(msg, 0, file, line, next);
    }
    /// ditto
    this(string msg, size_t index, string file = __FILE__,
         size_t line = __LINE__, Throwable next = null) @safe pure nothrow
    {
        // The index is rendered into a stack buffer and appended to the message.
        UnsignedStringBuf digits = void;
        msg ~= " (at index " ~ unsignedToTempString(index, digits) ~ ")";
        super(msg, index, file, line, next);
    }

    /**
    Returns:
        A `string` detailing the invalid UTF sequence.
     */
    override string toString() const
    {
        if (len == 0)
        {
            /* No sequence was recorded: use the inherited text.
             * Exception.toString() is not marked as const, although
             * it is const-compatible, hence the cast.
             */
            //return super.toString();
            auto e = () @trusted { return cast(Exception) super; } ();
            return e.toString();
        }

        string text = "Invalid UTF sequence:";

        foreach (unit; sequence[0 .. len])
        {
            UnsignedStringBuf scratch = void;
            auto hex = unsignedToTempString!16(unit, scratch);
            // Each unit is printed as at least two hex digits followed by 'x'.
            text ~= hex.length == 1 ? " 0" : " ";
            text ~= hex;
            text ~= 'x';
        }

        if (super.msg.length > 0)
        {
            text ~= " - ";
            text ~= super.msg;
        }

        return text;
    }
}
153 
154 ///
@safe unittest
{
    import std.exception : assertThrown;

    // Encoding any of these values into a UTF-8 buffer must throw:
    // the four surrogate-range edges (0xD800, 0xDBFF, 0xDC00, 0xDFFF)
    // and 0x110000, the first value above U+10FFFF.
    char[4] buf;
    assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));
}
166 
167 /*
168    Provide array of invalidly encoded UTF strings. Useful for testing.
169 
170    Params:
171         Char = char, wchar, or dchar
172 
173    Returns:
174         an array of invalidly encoded UTF strings
175  */
176 
package auto invalidUTFstrings(Char)() @safe pure @nogc nothrow
if (isSomeChar!Char)
{
    // One `static immutable` table per code unit type; the branch is chosen
    // at compile time from `Char`.
    static if (is(Char == char))
    {
        enum x = 0xDC00;         // invalid surrogate value
        enum y = 0x110000;       // out of range

        static immutable string[8] result =
        [
            "\x80",             // not a start byte
            "\xC0",             // truncated
            "\xC0\xC0",         // invalid continuation
            "\xF0\x82\x82\xAC", // overlong
            // `x` written with the 3-byte layout (lead 0xE0, two 0x80 tails)
            [
              0xE0 | (x >> 12),
              0x80 | ((x >> 6) & 0x3F),
              0x80 | (x & 0x3F)
            ],
            // `y` written with the 4-byte layout (lead 0xF0, three 0x80 tails)
            [
              cast(char)(0xF0 | (y >> 18)),
              cast(char)(0x80 | ((y >> 12) & 0x3F)),
              cast(char)(0x80 | ((y >> 6) & 0x3F)),
              cast(char)(0x80 | (y & 0x3F))
            ],
            [
              cast(char)(0xF8 | 3),     // 5 byte encoding
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
            ],
            [
              cast(char)(0xFC | 3),     // 6 byte encoding
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
            ],
        ];

        return result[];
    }
    else static if (is(Char == wchar))
    {
        static immutable wstring[5] result =
        [
            // a single unit from 0xDC00 .. 0xDFFF with nothing before it
            [
              cast(wchar) 0xDC00,
            ],
            [
              cast(wchar) 0xDFFF,
            ],
            // a unit from 0xD800 .. 0xDBFF followed by another such unit
            [
              cast(wchar) 0xDBFF,
              cast(wchar) 0xDBFF,
            ],
            // a unit from 0xD800 .. 0xDBFF followed by a non-surrogate unit
            [
              cast(wchar) 0xDBFF,
              cast(wchar) 0xE000,
            ],
            // a unit from 0xD800 .. 0xDBFF with nothing after it
            [
              cast(wchar) 0xD800,
            ],
        ];

        return result[];
    }
    else static if (is(Char == dchar))
    {
        static immutable dstring[3] result =
        [
            [ cast(dchar) 0x110000 ],   // above U+10FFFF
            [ cast(dchar) 0x00D800 ],   // first value of the surrogate range
            [ cast(dchar) 0x00DFFF ],   // last value of the surrogate range
        ];

        // NOTE(review): the two branches above return a slice (`result[]`),
        // this one returns the static array itself - confirm that the
        // difference in return type is intended.
        return result;
    }
    else
        static assert(0);
}
260 
261 /++
262     Check whether the given Unicode code point is valid.
263 
264     Params:
265         c = code point to check
266 
267     Returns:
268         `true` if and only if `c` is a valid Unicode code point
269 
270     Note:
271     `'\uFFFE'` and `'\uFFFF'` are considered valid by `isValidDchar`,
272     as they are permitted for internal use by an application, but they are
273     not allowed for interchange by the Unicode standard.
274   +/
bool isValidDchar(dchar c) pure nothrow @safe @nogc
{
    // Nothing above U+10FFFF is a code point.
    if (c > 0x10FFFF)
        return false;

    // Within range, only the surrogate block 0xD800 .. 0xDFFF is excluded.
    immutable bool isSurrogate = c >= 0xD800 && c <= 0xDFFF;
    return !isSurrogate;
}
279 
280 ///
@safe @nogc pure nothrow unittest
{
    // Ordinary values, including 0, are accepted.
    assert( isValidDchar(cast(dchar) 0x41));
    assert( isValidDchar(cast(dchar) 0x00));
    // A surrogate value and a value above 0x10FFFF are rejected.
    assert(!isValidDchar(cast(dchar) 0xD800));
    assert(!isValidDchar(cast(dchar) 0x11FFFF));
}
288 
pure nothrow @safe @nogc unittest
{
    import std.exception;

    // The checks are wrapped in a delegate literal handed to `assertCTFEable`
    // (from std.exception) - presumably so they are also evaluated at
    // compile time; see that template for the exact contract.
    assertCTFEable!(
    {
    assert( isValidDchar(cast(dchar)'a') == true);
    assert( isValidDchar(cast(dchar) 0x1FFFFF) == false);

    // Both ends of the two surrogate halves are invalid.
    assert(!isValidDchar(cast(dchar) 0x00D800));
    assert(!isValidDchar(cast(dchar) 0x00DBFF));
    assert(!isValidDchar(cast(dchar) 0x00DC00));
    assert(!isValidDchar(cast(dchar) 0x00DFFF));
    // 0xFFFE / 0xFFFF and the top of the range are accepted;
    // 0x110000 is the first rejected value above it.
    assert( isValidDchar(cast(dchar) 0x00FFFE));
    assert( isValidDchar(cast(dchar) 0x00FFFF));
    assert( isValidDchar(cast(dchar) 0x01FFFF));
    assert( isValidDchar(cast(dchar) 0x10FFFF));
    assert(!isValidDchar(cast(dchar) 0x110000));
    });
}
309 
310 /**
311 Checks if a single character forms a valid code point.
312 
313 When standing alone, some characters are invalid code points. For
314 example the `wchar` `0xD800` is a so called high surrogate, which can
315 only be interpreted together with a low surrogate following it. As a
316 standalone character it is considered invalid.
317 
318 See $(LINK2 http://www.unicode.org/versions/Unicode13.0.0/,
319 Unicode Standard, D90, D91 and D92) for more details.
320 
321 Params:
322     c = character to test
323     Char = character type of `c`
324 
325 Returns:
326     `true`, if `c` forms a valid code point.
327  */
bool isValidCodepoint(Char)(Char c)
if (isSomeChar!Char)
{
    // Qualifiers (const/immutable/shared) do not matter for the dispatch.
    alias Plain = Unqual!Char;

    static if (is(Plain == dchar))
    {
        // A full code point: defer to the dchar range check.
        return isValidDchar(c);
    }
    else static if (is(Plain == wchar))
    {
        // A lone UTF-16 unit is a code point unless it is a surrogate
        // (0xD800 .. 0xDFFF).
        return !(c > 0xD7FF && c < 0xE000);
    }
    else static if (is(Plain == char))
    {
        // A lone UTF-8 unit is a code point only in the 7-bit range.
        return !(c > 0x7F);
    }
    else
        static assert(false, "unknown character type: `" ~ Char.stringof ~ "`");
}
347 
348 ///
@safe pure nothrow unittest
{
    // char: only values up to 0x7F stand alone.
    assert( isValidCodepoint(cast(char) 0x40));
    assert(!isValidCodepoint(cast(char) 0x80));
    // wchar: everything except surrogates stands alone.
    assert( isValidCodepoint(cast(wchar) 0x1234));
    assert(!isValidCodepoint(cast(wchar) 0xD800));
    // dchar: same rule as isValidDchar.
    assert( isValidCodepoint(cast(dchar) 0x0010FFFF));
    assert(!isValidCodepoint(cast(dchar) 0x12345678));
}
358 
359 /++
360     Calculate the length of the UTF sequence starting at `index`
361     in `str`.
362 
363     Params:
364         str = $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
365         of UTF code units. Must be random access if `index` is passed
366         index = starting index of UTF sequence (default: `0`)
367 
368     Returns:
369         The number of code units in the UTF sequence. For UTF-8, this is a
370         value between 1 and 4 (as per $(HTTP tools.ietf.org/html/rfc3629#section-3, RFC 3629$(COMMA) section 3)).
371         For UTF-16, it is either 1 or 2. For UTF-32, it is always 1.
372 
373     Throws:
374         May throw a `UTFException` if `str[index]` is not the start of a
375         valid UTF sequence.
376 
377     Note:
378         `stride` will only analyze the first `str[index]` element. It
379         will not fully verify the validity of the UTF sequence, nor even verify
380         the presence of the sequence: it will not actually guarantee that
381         $(D index + stride(str, index) <= str.length).
382   +/
uint stride(S)(auto ref S str, size_t index)
if (is(S : const char[]) ||
    (isRandomAccessRange!S && is(immutable ElementType!S == immutable char)))
{
    // The bounds assertion is only compiled in when `str` exposes a length.
    static if (is(typeof(str.length) : ulong))
        assert(index < str.length, "Past the end of the UTF-8 sequence");

    // Only the lead code unit is inspected: values below 0x80 are a
    // one-unit sequence, everything else is classified by strideImpl.
    immutable lead = str[index];
    return lead < 0x80 ? 1 : strideImpl(lead, index);
}
396 
397 /// Ditto
uint stride(S)(auto ref S str)
if (is(S : const char[]) ||
    (isInputRange!S && is(immutable ElementType!S == immutable char)))
{
    // Arrays are indexed directly; any other input range is read
    // through `front`.
    static if (is(S : const char[]))
        immutable lead = str[0];
    else
        immutable lead = str.front;

    // Values below 0x80 are a one-unit sequence; the rest is classified by
    // strideImpl, which reports errors at index 0.
    return lead < 0x80 ? 1 : strideImpl(lead, 0);
}
412 
@system unittest
{
    import core.exception : AssertError;
    import std.conv : to;
    import std.exception;
    import std.string : format;
    import std.traits : FunctionAttribute, functionAttributes, isSafe;
    // Checks that `stride` at code unit index `i` of `s` equals the UTF-8
    // code length of `c`. The check is repeated for the string itself and for
    // the `RandomCU` / `RefRandomCU` wrappers (test helpers defined outside
    // this block - presumably value-type and reference-type random-access
    // ranges; verify against their definitions). `line` defaults to the
    // caller's line so a failure is reported at the `test(...)` call site.
    static void test(string s, dchar c, size_t i = 0, size_t line = __LINE__)
    {
        enforce(stride(s, i) == codeLength!char(c),
                new AssertError(format("Unit test failure string: %s", s), __FILE__, line));

        enforce(stride(RandomCU!char(s), i) == codeLength!char(c),
                new AssertError(format("Unit test failure range: %s", s), __FILE__, line));

        // For the `new`-allocated range, also check that calling `stride`
        // leaves its length unchanged.
        auto refRandom = new RefRandomCU!char(s);
        immutable randLen = refRandom.length;
        enforce(stride(refRandom, i) == codeLength!char(c),
                new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line));
        enforce(refRandom.length == randLen,
                new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line));

        // The index-less overloads can only be compared when `i` is 0.
        if (i == 0)
        {
            enforce(stride(s) == codeLength!char(c),
                    new AssertError(format("Unit test failure string 0: %s", s), __FILE__, line));

            enforce(stride(InputCU!char(s)) == codeLength!char(c),
                    new AssertError(format("Unit test failure range 0: %s", s), __FILE__, line));

            auto refBidir = new RefBidirCU!char(s);
            immutable bidirLen = refBidir.length;
            enforce(stride(refBidir) == codeLength!char(c),
                    new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line));
            enforce(refBidir.length == bidirLen,
                    new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line));
        }
    }

    assertCTFEable!(
    {
    test("a", 'a');
    test(" ", ' ');
    test("\u2029", '\u2029'); //paraSep
    test("\u0100", '\u0100');
    test("\u0430", '\u0430');
    test("\U00010143", '\U00010143');
    test("abcdefcdef", 'a');
    // In UTF-8, "hello" fills indices 0-4, U+10143 takes 4 code units
    // (5-8), U+0100 takes 2 (9-10), and the final U+10143 starts at 11.
    test("hello\U00010143\u0100\U00010143", 'h', 0);
    test("hello\U00010143\u0100\U00010143", 'e', 1);
    test("hello\U00010143\u0100\U00010143", 'l', 2);
    test("hello\U00010143\u0100\U00010143", 'l', 3);
    test("hello\U00010143\u0100\U00010143", 'o', 4);
    test("hello\U00010143\u0100\U00010143", '\U00010143', 5);
    test("hello\U00010143\u0100\U00010143", '\u0100', 9);
    test("hello\U00010143\u0100\U00010143", '\U00010143', 11);

    // Both overloads must be inferred @safe and pure for every
    // mutability of a char array.
    foreach (S; AliasSeq!(char[], const char[], string))
    {
        enum str = to!S("hello world");
        static assert(isSafe!({ stride(str, 0); }));
        static assert(isSafe!({ stride(str);    }));
        static assert((functionAttributes!({ stride(str, 0); }) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!({ stride(str);    }) & FunctionAttribute.pure_) != 0);
    }
    });
}
480 
@safe unittest // invalid start bytes
{
    import std.exception : assertThrown;

    // Code units that `stride` must refuse as the start of a sequence.
    static immutable char[5] rejectedLeadBytes = [
        0b1111_1000, // indicating a sequence length of 5
        0b1111_1100, // 6
        0b1111_1110, // 7
        0b1111_1111, // 8
        0b1000_0000, // continuation byte
    ];

    foreach (lead; rejectedLeadBytes)
    {
        // Wrap each unit in a one-element string and measure it.
        immutable(char)[] single = [lead];
        assertThrown!UTFException(stride(single));
    }
}
494 
495 /// Ditto
uint stride(S)(auto ref S str, size_t index)
if (is(S : const wchar[]) ||
    (isRandomAccessRange!S && is(immutable ElementType!S == immutable wchar)))
{
    // The bounds assertion is only compiled in when `str` exposes a length.
    static if (is(typeof(str.length) : ulong))
        assert(index < str.length, "Past the end of the UTF-16 sequence");

    // A unit in 0xD800 .. 0xDBFF starts a two-unit sequence; any other
    // unit counts as one. Nothing is thrown for UTF-16.
    immutable uint unit = str[index];
    immutable bool leadsPair = unit >= 0xD800 && unit <= 0xDBFF;
    return leadsPair ? 2 : 1;
}
505 
506 /// Ditto
uint stride(S)(auto ref S str) @safe pure
if (is(S : const wchar[]))
{
    // For wchar arrays the index-less form simply measures the sequence
    // starting at index 0 via the indexed overload.
    return stride(str, 0);
}
512 
513 /// Ditto
uint stride(S)(auto ref S str)
if (isInputRange!S && is(immutable ElementType!S == immutable wchar) &&
    !is(S : const wchar[]))
{
    assert(!str.empty, "UTF-16 sequence is empty");

    // Only `front` is read; the range is not advanced. A unit in
    // 0xD800 .. 0xDBFF starts a two-unit sequence, anything else is one.
    immutable uint unit = str.front;
    immutable bool leadsPair = unit >= 0xD800 && unit <= 0xDBFF;
    return leadsPair ? 2 : 1;
}
522 
@system unittest
{
    import core.exception : AssertError;
    import std.conv : to;
    import std.exception;
    import std.string : format;
    import std.traits : FunctionAttribute, functionAttributes, isSafe;
    // Checks that `stride` at code unit index `i` of `s` equals the UTF-16
    // code length of `c`. The check is repeated for the wstring itself and
    // for the `RandomCU` / `RefRandomCU` wrappers (test helpers defined
    // outside this block - verify their exact range category against their
    // definitions). `line` defaults to the caller's line so a failure is
    // reported at the `test(...)` call site.
    static void test(wstring s, dchar c, size_t i = 0, size_t line = __LINE__)
    {
        enforce(stride(s, i) == codeLength!wchar(c),
                new AssertError(format("Unit test failure string: %s", s), __FILE__, line));

        enforce(stride(RandomCU!wchar(s), i) == codeLength!wchar(c),
                new AssertError(format("Unit test failure range: %s", s), __FILE__, line));

        // For the `new`-allocated range, also check that calling `stride`
        // leaves its length unchanged.
        auto refRandom = new RefRandomCU!wchar(s);
        immutable randLen = refRandom.length;
        enforce(stride(refRandom, i) == codeLength!wchar(c),
                new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line));
        enforce(refRandom.length == randLen,
                new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line));

        // The index-less overloads can only be compared when `i` is 0.
        if (i == 0)
        {
            enforce(stride(s) == codeLength!wchar(c),
                    new AssertError(format("Unit test failure string 0: %s", s), __FILE__, line));

            enforce(stride(InputCU!wchar(s)) == codeLength!wchar(c),
                    new AssertError(format("Unit test failure range 0: %s", s), __FILE__, line));

            auto refBidir = new RefBidirCU!wchar(s);
            immutable bidirLen = refBidir.length;
            enforce(stride(refBidir) == codeLength!wchar(c),
                    new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line));
            enforce(refBidir.length == bidirLen,
                    new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line));
        }
    }

    assertCTFEable!(
    {
    test("a", 'a');
    test(" ", ' ');
    test("\u2029", '\u2029'); //paraSep
    test("\u0100", '\u0100');
    test("\u0430", '\u0430');
    test("\U00010143", '\U00010143');
    test("abcdefcdef", 'a');
    // In UTF-16, "hello" fills indices 0-4, U+10143 takes 2 code units
    // (5-6), U+0100 takes 1 (7), and the final U+10143 starts at 8.
    test("hello\U00010143\u0100\U00010143", 'h', 0);
    test("hello\U00010143\u0100\U00010143", 'e', 1);
    test("hello\U00010143\u0100\U00010143", 'l', 2);
    test("hello\U00010143\u0100\U00010143", 'l', 3);
    test("hello\U00010143\u0100\U00010143", 'o', 4);
    test("hello\U00010143\u0100\U00010143", '\U00010143', 5);
    test("hello\U00010143\u0100\U00010143", '\u0100', 7);
    test("hello\U00010143\u0100\U00010143", '\U00010143', 8);

    // Both overloads must be inferred @safe and pure for every
    // mutability of a wchar array.
    foreach (S; AliasSeq!(wchar[], const wchar[], wstring))
    {
        enum str = to!S("hello world");
        static assert(isSafe!(() => stride(str, 0)));
        static assert(isSafe!(() => stride(str)   ));
        static assert((functionAttributes!(() => stride(str, 0)) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!(() => stride(str)   ) & FunctionAttribute.pure_) != 0);
    }
    });
}
590 
591 /// Ditto
uint stride(S)(auto ref S str, size_t index = 0)
if (is(S : const dchar[]) ||
    (isInputRange!S && is(immutable ElementEncodingType!S == immutable dchar)))
{
    // Pick the sanity check that the argument type supports: an index
    // bound when a length is available, a non-empty check otherwise.
    enum bool knowsLength = is(typeof(str.length) : ulong);

    static if (knowsLength)
        assert(index < str.length, "Past the end of the UTF-32 sequence");
    else
        assert(!str.empty, "UTF-32 sequence is empty.");

    // Every UTF-32 code unit is a complete sequence; the value itself is
    // never inspected.
    return 1;
}
602 
603 ///
@safe unittest
{
    // Without an index the first sequence is measured; with an index the
    // sequence starting at that code unit is measured.
    assert("a".stride == 1);
    assert("λ".stride == 2);
    assert("aλ".stride == 1);
    assert("aλ".stride(1) == 2);
    assert("𐐷".stride == 4);
}
612 
@system unittest
{
    import core.exception : AssertError;
    import std.conv : to;
    import std.exception;
    import std.string : format;
    import std.traits : FunctionAttribute, functionAttributes, isSafe;
    // Checks that `stride` at code unit index `i` of `s` equals the UTF-32
    // code length of `c`. The check is repeated for the dstring itself and
    // for the `RandomCU` / `RefRandomCU` wrappers (test helpers defined
    // outside this block - verify their exact range category against their
    // definitions). `line` defaults to the caller's line so a failure is
    // reported at the `test(...)` call site.
    static void test(dstring s, dchar c, size_t i = 0, size_t line = __LINE__)
    {
        enforce(stride(s, i) == codeLength!dchar(c),
                new AssertError(format("Unit test failure string: %s", s), __FILE__, line));

        enforce(stride(RandomCU!dchar(s), i) == codeLength!dchar(c),
                new AssertError(format("Unit test failure range: %s", s), __FILE__, line));

        // For the `new`-allocated range, also check that calling `stride`
        // leaves its length unchanged.
        auto refRandom = new RefRandomCU!dchar(s);
        immutable randLen = refRandom.length;
        enforce(stride(refRandom, i) == codeLength!dchar(c),
                new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line));
        enforce(refRandom.length == randLen,
                new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line));

        // The index-less calls can only be compared when `i` is 0.
        if (i == 0)
        {
            enforce(stride(s) == codeLength!dchar(c),
                    new AssertError(format("Unit test failure string 0: %s", s), __FILE__, line));

            enforce(stride(InputCU!dchar(s)) == codeLength!dchar(c),
                    new AssertError(format("Unit test failure range 0: %s", s), __FILE__, line));

            auto refBidir = new RefBidirCU!dchar(s);
            immutable bidirLen = refBidir.length;
            enforce(stride(refBidir) == codeLength!dchar(c),
                    new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line));
            enforce(refBidir.length == bidirLen,
                    new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line));
        }
    }

    assertCTFEable!(
    {
    test("a", 'a');
    test(" ", ' ');
    test("\u2029", '\u2029'); //paraSep
    test("\u0100", '\u0100');
    test("\u0430", '\u0430');
    test("\U00010143", '\U00010143');
    test("abcdefcdef", 'a');
    // In UTF-32 every character is one code unit, so the three non-ASCII
    // characters sit at indices 5, 6 and 7.
    test("hello\U00010143\u0100\U00010143", 'h', 0);
    test("hello\U00010143\u0100\U00010143", 'e', 1);
    test("hello\U00010143\u0100\U00010143", 'l', 2);
    test("hello\U00010143\u0100\U00010143", 'l', 3);
    test("hello\U00010143\u0100\U00010143", 'o', 4);
    test("hello\U00010143\u0100\U00010143", '\U00010143', 5);
    test("hello\U00010143\u0100\U00010143", '\u0100', 6);
    test("hello\U00010143\u0100\U00010143", '\U00010143', 7);

    // `stride` must be inferred @safe and pure for every mutability of a
    // dchar array, with and without the index argument.
    foreach (S; AliasSeq!(dchar[], const dchar[], dstring))
    {
        enum str = to!S("hello world");
        static assert(isSafe!(() => stride(str, 0)));
        static assert(isSafe!(() => stride(str)   ));
        static assert((functionAttributes!(() => stride(str, 0)) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!(() => stride(str)   ) & FunctionAttribute.pure_) != 0);
    }
    });
}
680 
// Classifies a UTF-8 lead code unit whose top bit is set.
//
// Params:
//     c     = the code unit to classify; the `in` contract requires bit 7 set
//     index = position of `c`, used only for the exception message
//
// Returns: the number of leading 1 bits of `c` when that count is 2, 3 or 4.
//
// Throws: `UTFException` for every other value, i.e. a continuation unit
// (one leading 1 bit) or a unit with five or more leading 1 bits.
private uint strideImpl(char c, size_t index) @trusted pure
in { assert(c & 0x80); }
do
{
    import core.bitop : bsr;

    // 0xFF has no zero bit at all, so its complement is 0. It is handled
    // before the bit scan because the result of `bsr(0)` is undefined.
    if (c != 0xFF)
    {
        // The highest zero bit of `c` is the highest set bit of `~c`;
        // its distance from bit 7 is the number of leading 1 bits.
        immutable msbs = 7 - bsr((~uint(c)) & 0xFF);
        if (msbs >= 2 && msbs <= 4)
            return msbs;
    }

    throw new UTFException("Invalid UTF-8 sequence", index);
}
691 
692 /++
693     Calculate the length of the UTF sequence ending one code unit before
694     `index` in `str`.
695 
696     Params:
697         str = bidirectional range of UTF code units. Must be random access if
698         `index` is passed
699         index = index one past end of UTF sequence (default: `str.length`)
700 
701     Returns:
702         The number of code units in the UTF sequence. For UTF-8, this is a
703         value between 1 and 4 (as per $(HTTP tools.ietf.org/html/rfc3629#section-3, RFC 3629$(COMMA) section 3)).
704         For UTF-16, it is either 1 or 2. For UTF-32, it is always 1.
705 
706     Throws:
        May throw a `UTFException` if `index` is not one past the
        end of a valid UTF sequence.
709 
710     Note:
        `strideBack` will only analyze the element at $(D str[index - 1]).
        It will not fully verify the validity of the UTF sequence, nor
713         even verify the presence of the sequence: it will not actually
714         guarantee that $(D strideBack(str, index) <= index).
715   +/
uint strideBack(S)(auto ref S str, size_t index)
if (is(S : const char[]) ||
    (isRandomAccessRange!S && is(immutable ElementType!S == immutable char)))
{
    // A code unit is a continuation unit when its top two bits are `10`.
    enum uint topTwoBits = 0b1100_0000;
    enum uint continuationTag = 0b1000_0000;

    static if (is(typeof(str.length) : ulong))
        assert(index <= str.length, "Past the end of the UTF-8 sequence");
    assert(index > 0, "Not the end of the UTF-8 sequence");

    // The unit just before `index` is not a continuation: one-unit sequence.
    if ((str[index-1] & topTwoBits) != continuationTag)
        return 1;

    // Walk further back until a non-continuation unit is found; the
    // distance walked is the length of the sequence.
    if (index < 4)
    {
        // Fewer than four units are available, so each step must first
        // check that it stays inside the data.
        static foreach (i; 2 .. 4)
        {
            if (index >= i && (str[index-i] & topTwoBits) != continuationTag)
                return i;
        }
    }
    else
    {
        // Common case: four units are known to exist, no per-step check.
        static foreach (i; 2 .. 5)
        {
            if ((str[index-i] & topTwoBits) != continuationTag)
                return i;
        }
    }

    // Only continuation units were found within reach.
    throw new UTFException("Not the end of the UTF sequence", index);
}
745 
746 /// Ditto
uint strideBack(S)(auto ref S str)
if (is(S : const char[]) ||
    (isRandomAccessRange!S && hasLength!S && is(immutable ElementType!S == immutable char)))
{
    // Measure the sequence that ends at the very end of `str` by handing
    // `str.length` to the indexed overload.
    return strideBack(str, str.length);
}
753 
754 /// Ditto
uint strideBack(S)(auto ref S str)
if (isBidirectionalRange!S && is(immutable ElementType!S == immutable char) && !isRandomAccessRange!S)
{
    assert(!str.empty, "Past the end of the UTF-8 sequence");

    // Work on a saved copy so the caller's range is not consumed.
    auto tail = str.save;

    // Look at up to four units from the back; the first one that is not a
    // continuation unit (top bits `10`) is the start of the sequence.
    for (uint taken = 1; taken <= 4; ++taken)
    {
        if ((tail.back & 0b1100_0000) != 0b1000_0000)
            return taken;
        tail.popBack();
        if (tail.empty)
            break;
    }

    // Four continuation units in a row, or the data ran out first.
    throw new UTFException("The last code unit is not the end of the UTF-8 sequence");
}
770 
@system unittest
{
    import core.exception : AssertError;
    import std.conv : to;
    import std.exception;
    import std.string : format;
    import std.traits : FunctionAttribute, functionAttributes, isSafe;
    // Checks that `strideBack` for the sequence ending just before code unit
    // index `i` of `s` equals the UTF-8 code length of `c`. `size_t.max` is
    // the "no index given" marker and stands for `s.length`. The check is
    // repeated for the string itself and for the `RandomCU` / `RefRandomCU`
    // wrappers (test helpers defined outside this block - verify their exact
    // range category against their definitions). `line` defaults to the
    // caller's line so a failure is reported at the `test(...)` call site.
    static void test(string s, dchar c, size_t i = size_t.max, size_t line = __LINE__)
    {
        enforce(strideBack(s, i == size_t.max ? s.length : i) == codeLength!char(c),
                new AssertError(format("Unit test failure string: %s", s), __FILE__, line));

        enforce(strideBack(RandomCU!char(s), i == size_t.max ? s.length : i) == codeLength!char(c),
                new AssertError(format("Unit test failure range: %s", s), __FILE__, line));

        // For the `new`-allocated range, also check that calling
        // `strideBack` leaves its length unchanged.
        auto refRandom = new RefRandomCU!char(s);
        immutable randLen = refRandom.length;
        enforce(strideBack(refRandom, i == size_t.max ? s.length : i) == codeLength!char(c),
                new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line));
        enforce(refRandom.length == randLen,
                new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line));

        // The index-less overloads are only exercised when no index was given.
        if (i == size_t.max)
        {
            enforce(strideBack(s) == codeLength!char(c),
                    new AssertError(format("Unit test failure string code length: %s", s), __FILE__, line));

            enforce(strideBack(BidirCU!char(s)) == codeLength!char(c),
                    new AssertError(format("Unit test failure range code length: %s", s), __FILE__, line));

            auto refBidir = new RefBidirCU!char(s);
            immutable bidirLen = refBidir.length;
            enforce(strideBack(refBidir) == codeLength!char(c),
                    new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line));
            enforce(refBidir.length == bidirLen,
                    new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line));
        }
    }

    assertCTFEable!(
    {
    test("a", 'a');
    test(" ", ' ');
    test("\u2029", '\u2029'); //paraSep
    test("\u0100", '\u0100');
    test("\u0430", '\u0430');
    test("\U00010143", '\U00010143');
    test("abcdefcdef", 'f');
    // In UTF-8 the string is 4 + 2 + 4 + 5 = 15 code units long: the first
    // U+10143 ends at index 4, U+0100 at 6, the second U+10143 at 10, and
    // "hello" occupies 10-14.
    test("\U00010143\u0100\U00010143hello", 'o', 15);
    test("\U00010143\u0100\U00010143hello", 'l', 14);
    test("\U00010143\u0100\U00010143hello", 'l', 13);
    test("\U00010143\u0100\U00010143hello", 'e', 12);
    test("\U00010143\u0100\U00010143hello", 'h', 11);
    test("\U00010143\u0100\U00010143hello", '\U00010143', 10);
    test("\U00010143\u0100\U00010143hello", '\u0100', 6);
    test("\U00010143\u0100\U00010143hello", '\U00010143', 4);

    // Both overloads must be inferred @safe and pure for every
    // mutability of a char array (the calls are only compiled here,
    // never executed).
    foreach (S; AliasSeq!(char[], const char[], string))
    {
        enum str = to!S("hello world");
        static assert(isSafe!({ strideBack(str, 0); }));
        static assert(isSafe!({ strideBack(str);    }));
        static assert((functionAttributes!({ strideBack(str, 0); }) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!({ strideBack(str);    }) & FunctionAttribute.pure_) != 0);
    }
    });
}
838 
839 //UTF-16 is self synchronizing: The length of strideBack can be found from
840 //the value of a single wchar
841 /// Ditto
uint strideBack(S)(auto ref S str, size_t index)
if (is(S : const wchar[]) ||
    (isRandomAccessRange!S && is(immutable ElementType!S == immutable wchar)))
{
    static if (is(typeof(str.length) : ulong))
        assert(index <= str.length, "Past the end of the UTF-16 sequence");
    assert(index > 0, "Not the end of a UTF-16 sequence");

    // Only the unit just before `index` is read: a unit in
    // 0xDC00 .. 0xDFFF closes a two-unit sequence, anything else is a
    // sequence of one.
    immutable last = str[index-1];
    immutable bool endsPair = last >= 0xDC00 && last < 0xE000;
    return endsPair ? 2 : 1;
}
853 
854 /// Ditto
uint strideBack(S)(auto ref S str)
if (is(S : const wchar[]) ||
    (isBidirectionalRange!S && is(immutable ElementType!S == immutable wchar)))
{
    assert(!str.empty, "UTF-16 sequence is empty");

    // Read the last code unit: by index for arrays, through `back` for any
    // other bidirectional range. The range itself is not modified.
    static if (is(S : const(wchar)[]))
        immutable c2 = str[$ - 1];
    else
        immutable c2 = str.back;

    // A unit in 0xDC00 .. 0xDFFF (inclusive) closes a two-unit sequence.
    // The upper bound is exclusive at 0xE000: U+E000 is an ordinary
    // one-unit character, and the indexed overload uses the same bound.
    return 1 + (0xDC00 <= c2 && c2 < 0xE000);
}
868 
@system unittest
{
    import core.exception : AssertError;
    import std.conv : to;
    import std.exception;
    import std.string : format;
    import std.traits : FunctionAttribute, functionAttributes, isSafe;

    // Runs every UTF-16 strideBack overload against `s` and compares the
    // result with the number of code units `c` encodes to. With the default
    // `i` the check is made at the end of `s`, which additionally exercises
    // the index-less overloads; otherwise it is made at index `i`.
    static void check(wstring s, dchar c, size_t i = size_t.max, size_t line = __LINE__)
    {
        immutable bool atEnd = i == size_t.max;
        immutable size_t pos = atEnd ? s.length : i;
        immutable expected = codeLength!wchar(c);

        if (strideBack(s, pos) != expected)
            throw new AssertError(format("Unit test failure string: %s", s), __FILE__, line);

        if (strideBack(RandomCU!wchar(s), pos) != expected)
            throw new AssertError(format("Unit test failure range: %s", s), __FILE__, line);

        // A reference range must keep its length: strideBack may not consume it.
        auto refRandom = new RefRandomCU!wchar(s);
        immutable randLen = refRandom.length;
        if (strideBack(refRandom, pos) != expected)
            throw new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line);
        if (refRandom.length != randLen)
            throw new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line);

        if (!atEnd)
            return;

        if (strideBack(s) != expected)
            throw new AssertError(format("Unit test failure string code length: %s", s), __FILE__, line);

        if (strideBack(BidirCU!wchar(s)) != expected)
            throw new AssertError(format("Unit test failure range code length: %s", s), __FILE__, line);

        auto refBidir = new RefBidirCU!wchar(s);
        immutable bidirLen = refBidir.length;
        if (strideBack(refBidir) != expected)
            throw new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line);
        if (refBidir.length != bidirLen)
            throw new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line);
    }

    assertCTFEable!(
    {
    // Mixes two code unit and one code unit code points.
    enum wstring mixed = "\U00010143\u0100\U00010143hello";

    check("a", 'a');
    check(" ", ' ');
    check("\u2029", '\u2029'); //paraSep
    check("\u0100", '\u0100');
    check("\u0430", '\u0430');
    check("\U00010143", '\U00010143');
    check("abcdefcdef", 'f');
    check(mixed, 'o', 10);
    check(mixed, 'l', 9);
    check(mixed, 'l', 8);
    check(mixed, 'e', 7);
    check(mixed, 'h', 6);
    check(mixed, '\U00010143', 5);
    check(mixed, '\u0100', 3);
    check(mixed, '\U00010143', 2);

    // Attribute inference: both overloads must be @safe and pure for arrays.
    foreach (S; AliasSeq!(wchar[], const wchar[], wstring))
    {
        enum str = to!S("hello world");
        static assert(isSafe!({ strideBack(str, 0); }));
        static assert(isSafe!({ strideBack(str);    }));
        static assert((functionAttributes!({ strideBack(str, 0); }) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!({ strideBack(str);    }) & FunctionAttribute.pure_) != 0);
    }
    });
}
936 
/// Ditto
uint strideBack(S)(auto ref S str, size_t index)
if (isRandomAccessRange!S && is(immutable ElementEncodingType!S == immutable dchar))
{
    // A UTF-32 code unit is a whole code point, so the stride never varies.
    enum uint utf32Stride = 1;

    static if (is(typeof(str.length) : ulong))
    {
        assert(index <= str.length, "Past the end of the UTF-32 sequence");
    }
    assert(index > 0, "Not the end of the UTF-32 sequence");

    return utf32Stride;
}
946 
/// Ditto
uint strideBack(S)(auto ref S str)
if (isBidirectionalRange!S && is(immutable ElementEncodingType!S == immutable dchar))
{
    // A UTF-32 code unit is a whole code point, so the stride never varies.
    enum uint utf32Stride = 1;

    assert(!str.empty, "Empty UTF-32 sequence");

    return utf32Stride;
}
954 
///
@safe unittest
{
    // One, two and four code unit UTF-8 sequences at the end of a string.
    assert(strideBack("a") == 1);
    assert(strideBack("λ") == 2);
    assert(strideBack("aλ") == 2);
    // With an explicit index the code point ending just before it is measured.
    assert(strideBack("aλ", 1) == 1);
    assert(strideBack("𐐷") == 4);
}
964 
@system unittest
{
    import core.exception : AssertError;
    import std.conv : to;
    import std.exception;
    import std.string : format;
    import std.traits : FunctionAttribute, functionAttributes, isSafe;

    // Runs every UTF-32 strideBack overload against `s` and compares the
    // result with the number of code units `c` encodes to. With the default
    // `i` the check is made at the end of `s`, which additionally exercises
    // the index-less overloads; otherwise it is made at index `i`.
    static void check(dstring s, dchar c, size_t i = size_t.max, size_t line = __LINE__)
    {
        immutable bool atEnd = i == size_t.max;
        immutable size_t pos = atEnd ? s.length : i;
        immutable expected = codeLength!dchar(c);

        if (strideBack(s, pos) != expected)
            throw new AssertError(format("Unit test failure string: %s", s), __FILE__, line);

        if (strideBack(RandomCU!dchar(s), pos) != expected)
            throw new AssertError(format("Unit test failure range: %s", s), __FILE__, line);

        // A reference range must keep its length: strideBack may not consume it.
        auto refRandom = new RefRandomCU!dchar(s);
        immutable randLen = refRandom.length;
        if (strideBack(refRandom, pos) != expected)
            throw new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line);
        if (refRandom.length != randLen)
            throw new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line);

        if (!atEnd)
            return;

        if (strideBack(s) != expected)
            throw new AssertError(format("Unit test failure string code length: %s", s), __FILE__, line);

        if (strideBack(BidirCU!dchar(s)) != expected)
            throw new AssertError(format("Unit test failure range code length: %s", s), __FILE__, line);

        auto refBidir = new RefBidirCU!dchar(s);
        immutable bidirLen = refBidir.length;
        if (strideBack(refBidir) != expected)
            throw new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line);
        if (refBidir.length != bidirLen)
            throw new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line);
    }

    assertCTFEable!(
    {
    // Same text as in the UTF-8 and UTF-16 tests; here every code point is one unit.
    enum dstring mixed = "\U00010143\u0100\U00010143hello";

    check("a", 'a');
    check(" ", ' ');
    check("\u2029", '\u2029'); //paraSep
    check("\u0100", '\u0100');
    check("\u0430", '\u0430');
    check("\U00010143", '\U00010143');
    check("abcdefcdef", 'f');
    check(mixed, 'o', 8);
    check(mixed, 'l', 7);
    check(mixed, 'l', 6);
    check(mixed, 'e', 5);
    check(mixed, 'h', 4);
    check(mixed, '\U00010143', 3);
    check(mixed, '\u0100', 2);
    check(mixed, '\U00010143', 1);

    // Attribute inference: both overloads must be @safe and pure for arrays.
    foreach (S; AliasSeq!(dchar[], const dchar[], dstring))
    {
        enum str = to!S("hello world");
        static assert(isSafe!({ strideBack(str, 0); }));
        static assert(isSafe!({ strideBack(str);    }));
        static assert((functionAttributes!({ strideBack(str, 0); }) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!({ strideBack(str);    }) & FunctionAttribute.pure_) != 0);
    }
    });
}
1032 
1033 
/++
    Converts a code unit index into a code point index.

    `index` is expected to be the index of the first code unit of a code
    point in `str`. The result tells how many code points of `str` come
    before that position.

    Params:
        str = the UTF string to measure
        index = index of a code unit that begins a code point

    Returns:
        The number of code points in `str` before `index`. For UTF-32 this
        is `index` itself.

    Throws:
        $(LREF UTFException) if walking `str` one code point at a time steps
        over `index`, i.e. `index` is not at the start of a code point.
  +/
size_t toUCSindex(C)(const(C)[] str, size_t index) @safe pure
if (isSomeChar!C)
{
    static if (is(immutable C == immutable dchar))
    {
        // One code unit per code point: both indices coincide.
        return index;
    }
    else
    {
        size_t codePoints = 0;
        size_t pos = 0;

        // Hop from code point to code point until `index` is reached.
        while (pos < index)
        {
            pos += stride(str, pos);
            ++codePoints;
        }

        if (pos <= index)
            return codePoints;

        // The last hop went past `index`, so it pointed into a code point.
        static if (is(immutable C == immutable char))
            enum message = "Invalid UTF-8 sequence";
        else
            enum message = "Invalid UTF-16 sequence";

        throw new UTFException(message, index);
    }
}
1065 
///
@safe unittest
{
    // ASCII only: code unit and code point indices agree in every encoding.
    assert(`hello world`.toUCSindex(7) == 7);
    assert(`hello world`w.toUCSindex(7) == 7);
    assert(`hello world`d.toUCSindex(7) == 7);

    // 'é' takes two UTF-8 code units, but only one in UTF-16 and UTF-32.
    assert(`Ma Chérie`.toUCSindex(7) == 6);
    assert(`Ma Chérie`w.toUCSindex(7) == 7);
    assert(`Ma Chérie`d.toUCSindex(7) == 7);

    // Each of the leading characters takes three UTF-8 code units.
    assert(`さいごの果実 / ミツバチと科学者`.toUCSindex(9) == 3);
    assert(`さいごの果実 / ミツバチと科学者`w.toUCSindex(9) == 9);
    assert(`さいごの果実 / ミツバチと科学者`d.toUCSindex(9) == 9);
}
1081 
1082 
/++
    Converts a code point index into a code unit index.

    Params:
        str = the UTF string to walk
        n = how many code points into `str` the wanted code point is

    Returns:
        The array index of the first code unit of that code point. For
        UTF-32 this is `n` itself.
  +/
size_t toUTFindex(C)(const(C)[] str, size_t n) @safe pure
if (isSomeChar!C)
{
    static if (is(immutable C == immutable dchar))
    {
        // One code unit per code point: both indices coincide.
        return n;
    }
    else
    {
        // Skip over `n` code points, adding up their code unit lengths.
        size_t offset = 0;
        foreach (_; 0 .. n)
            offset += stride(str, offset);
        return offset;
    }
}
1105 
///
@safe unittest
{
    // ASCII only: code point and code unit indices agree in every encoding.
    assert(`hello world`.toUTFindex(7) == 7);
    assert(`hello world`w.toUTFindex(7) == 7);
    assert(`hello world`d.toUTFindex(7) == 7);

    // 'é' takes two UTF-8 code units, but only one in UTF-16 and UTF-32.
    assert(`Ma Chérie`.toUTFindex(6) == 7);
    assert(`Ma Chérie`w.toUTFindex(7) == 7);
    assert(`Ma Chérie`d.toUTFindex(7) == 7);

    // Each of the leading characters takes three UTF-8 code units.
    assert(`さいごの果実 / ミツバチと科学者`.toUTFindex(3) == 9);
    assert(`さいごの果実 / ミツバチと科学者`w.toUTFindex(9) == 9);
    assert(`さいごの果実 / ミツバチと科学者`d.toUTFindex(9) == 9);
}
1121 
1122 
1123 /* =================== Decode ======================= */
1124 
/++
    Flag telling the decoding functions whether invalid UTF is to be
    replaced with $(LREF replacementDchar) (`Yes.useReplacementDchar`) or not
    (`No.useReplacementDchar`).
  +/
alias UseReplacementDchar = Flag!"useReplacementDchar";
1127 
/++
    Decodes the code point that starts at `str[index]` and returns it.
    On success `index` is moved to one past the decoded code point. If the
    code point is not well-formed, a `UTFException` is thrown and `index`
    is left as it was.

    decode only accepts strings and random access ranges of code units that
    have length and slicing; use $(LREF decodeFront) for arbitrary input
    ranges of code units.

    Params:
        useReplacementDchar = if invalid UTF, return replacementDchar rather than throwing
        str = input string or indexable Range
        index = starting index into `str`; incremented by number of code units processed

    Returns:
        decoded character

    Throws:
        $(LREF UTFException) if `str[index]` is not the start of a valid UTF
        sequence and useReplacementDchar is `No.useReplacementDchar`
  +/
dchar decode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(auto ref S str, ref size_t index)
if (!isSomeString!S &&
    isRandomAccessRange!S && hasSlicing!S && hasLength!S && isSomeChar!(ElementType!S))
in
{
    assert(index < str.length, "Attempted to decode past the end of a string");
}
out (result)
{
    assert(isValidDchar(result));
}
do
{
    // Anything at or above the limit needs the full decoder.
    if (str[index] >= codeUnitLimit!S)
        return decodeImpl!(true, useReplacementDchar)(str, index);

    // Below the limit the code unit is the code point.
    return str[index++];
}
1168 
/// ditto
dchar decode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
auto ref scope S str, ref size_t index) @trusted pure
if (isSomeString!S)
in
{
    assert(index < str.length, "Attempted to decode past the end of a string");
}
out (result)
{
    assert(isValidDchar(result));
}
do
{
    // Anything at or above the limit needs the full decoder.
    if (str[index] >= codeUnitLimit!S)
    {
        // The `is` expression only serves to name the code unit type C.
        static if (is(immutable S == immutable C[], C))
            return decodeImpl!(true, useReplacementDchar)(cast(const(C)[]) str, index);
    }

    // Below the limit the code unit is the code point.
    return str[index++];
}
1188 
///
@safe pure unittest
{
    size_t i = 0;

    assert(decode("a", i) == 'a');
    assert(i == 1);

    i = 0;
    assert(decode("å", i) == 'å');
    assert(i == 2);

    i = 1;
    assert(decode("aå", i) == 'å');
    assert(i == 3);

    i = 0;
    assert(decode("å"w, i) == 'å');
    assert(i == 1);

    // ë as a multi-code point grapheme
    i = 0;
    assert(decode("e\u0308", i) == 'e');
    assert(i == 1);

    // ë as a single code point grapheme
    i = 0;
    assert(decode("ë", i) == 'ë');
    assert(i == 2);

    i = 0;
    assert(decode("ë"w, i) == 'ë');
    assert(i == 1);
}
1211 
@safe pure unittest // https://issues.dlang.org/show_bug.cgi?id=22867
{
    import std.conv : hexString;

    string data = hexString!"f787a598";
    size_t offset = 0;

    // If decoding throws, the index must not have been moved.
    try
    {
        decode(data, offset);
    }
    catch (UTFException)
    {
        assert(offset == 0);
    }
}
1220 
/++
    `decodeFront` is the variant of $(LREF decode) that decodes the first
    code point of a range. In contrast to $(LREF decode) it accepts any
    $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
    of code units, not only strings and random access ranges. The range is
    taken by `ref` and the decoded code units are popped off it. When
    `numCodeUnits` is passed in, it receives the number of code units that
    made up the decoded code point.

    Params:
        useReplacementDchar = if invalid UTF, return replacementDchar rather than throwing
        str = input string or indexable Range
        numCodeUnits = set to number of code units processed

    Returns:
        decoded character

    Throws:
        $(LREF UTFException) if `str.front` is not the start of a valid UTF
        sequence. After an exception nothing is guaranteed about how many
        code units were popped off: that depends on the kind of range and on
        how many code units had to be read before the code point turned out
        to be invalid.
  +/
dchar decodeFront(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
ref S str, out size_t numCodeUnits)
if (!isSomeString!S && isInputRange!S && isSomeChar!(ElementType!S))
in
{
    assert(!str.empty);
}
out (result)
{
    assert(isValidDchar(result));
}
do
{
    immutable first = str.front;

    if (first >= codeUnitLimit!S)
    {
        // https://issues.dlang.org/show_bug.cgi?id=14447 forces canIndex to be
        // done outside of decodeImpl, which is undesirable, since not all
        // overloads of decodeImpl need it. So, it should be moved back into
        // decodeImpl once https://issues.dlang.org/show_bug.cgi?id=8521
        // has been fixed.
        enum canIndex = is(S : const char[]) || isRandomAccessRange!S && hasSlicing!S && hasLength!S;
        immutable decoded = decodeImpl!(canIndex, useReplacementDchar)(str, numCodeUnits);

        // Sliceable ranges were only indexed, so they are advanced here.
        // The other range types were already popped by decodeImpl.
        static if (isRandomAccessRange!S && hasSlicing!S && hasLength!S)
            str = str[numCodeUnits .. str.length];

        return decoded;
    }

    // Below the limit the single code unit is the code point.
    str.popFront();
    numCodeUnits = 1;
    return first;
}
1283 
/// ditto
dchar decodeFront(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
ref scope S str, out size_t numCodeUnits) @trusted pure
if (isSomeString!S)
in
{
    assert(!str.empty);
}
out (result)
{
    assert(isValidDchar(result));
}
do
{
    if (str[0] >= codeUnitLimit!S)
    {
        // The `is` expression only serves to name the code unit type C.
        static if (is(immutable S == immutable C[], C))
        {
            immutable decoded = decodeImpl!(true, useReplacementDchar)(cast(const(C)[]) str, numCodeUnits);
            str = str[numCodeUnits .. $];
            return decoded;
        }
    }

    // Below the limit the single code unit is the code point.
    numCodeUnits = 1;
    immutable first = str[0];
    str = str[1 .. $];
    return first;
}
1312 
/++ Ditto +/
dchar decodeFront(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(ref S str)
if (isInputRange!S && isSomeChar!(ElementType!S))
{
    // The code unit count is computed by the full overload and then dropped.
    size_t ignoredCount;
    return decodeFront!useReplacementDchar(str, ignoredCount);
}
1320 
///
@safe pure unittest
{
    import std.range.primitives;
    string str = "Hello, World!";

    // The decoded code point is removed from the front of the string.
    assert(decodeFront(str) == 'H');
    assert(str == "ello, World!");

    str = "å";
    assert(decodeFront(str) == 'å');
    assert(str.empty);

    // The second argument receives the number of code units consumed.
    str = "å";
    size_t i;
    assert(decodeFront(str, i) == 'å');
    assert(i == 2);
    assert(str.empty);
}
1334 
/++
    `decodeBack` is the variant of $(LREF decode) that decodes the last
    code point of a range. In contrast to $(LREF decode) it accepts any
    bidirectional range of code units, not only strings and random access
    ranges. The range is taken by `ref` and the decoded code units are popped
    off its back. When `numCodeUnits` is passed in, it receives the number of
    code units that made up the decoded code point.

    Params:
        useReplacementDchar = if invalid UTF, return `replacementDchar` rather than throwing
        str = input string or bidirectional Range
        numCodeUnits = gives the number of code units processed

    Returns:
        A decoded UTF character.

    Throws:
        $(LREF UTFException) if `str.back` is not the end of a valid UTF
        sequence. If an exception is thrown, the `str` itself remains unchanged,
        but there is no guarantee as to the value of `numCodeUnits` (when passed).
  +/
dchar decodeBack(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
    ref S str, out size_t numCodeUnits)
if (isSomeString!S)
in
{
    assert(!str.empty);
}
out (result)
{
    assert(isValidDchar(result));
}
do
{
    if (str[$ - 1] >= codeUnitLimit!S)
    {
        // The `is` expression only serves to name the code unit type C.
        static if (is(immutable S == immutable C[], C))
        {
            // Find where the last code point starts, then decode forwards
            // from there.
            numCodeUnits = strideBack(str);
            immutable keep = str.length - numCodeUnits;
            size_t start = keep;
            immutable decoded = decodeImpl!(true, useReplacementDchar)(cast(const(C)[]) str, start);
            str = str[0 .. keep];
            return decoded;
        }
    }

    // Below the limit the single code unit is the code point.
    numCodeUnits = 1;
    immutable last = str[$ - 1];
    str = str[0 .. $ - 1];
    return last;
}
1386 
/++ Ditto +/
dchar decodeBack(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
    ref S str, out size_t numCodeUnits)
if (!isSomeString!S && isSomeChar!(ElementType!S) && isBidirectionalRange!S
    && ((isRandomAccessRange!S && hasLength!S) || !isRandomAccessRange!S))
in
{
    assert(!str.empty);
}
out (result)
{
    assert(isValidDchar(result));
}
do
{
    if (str.back >= codeUnitLimit!S)
    {
        numCodeUnits = strideBack(str);

        static if (isRandomAccessRange!S)
        {
            // Decode forwards from the start of the last code point and
            // drop its code units afterwards.
            size_t start = str.length - numCodeUnits;
            immutable decoded = decodeImpl!(true, useReplacementDchar)(str, start);
            str.popBackExactly(numCodeUnits);
            return decoded;
        }
        else
        {
            // No indexing available: copy the trailing code units, in their
            // original order, into a small buffer and decode that. The work
            // is done on a saved copy, which replaces `str` only once
            // decoding has returned.
            alias Char = Unqual!(ElementType!S);
            Char[4] buffer;
            S rest = str.save;
            foreach_reverse (slot; 0 .. numCodeUnits)
            {
                buffer[slot] = rest.back;
                rest.popBack();
            }
            const Char[] codePoint = buffer[0 .. numCodeUnits];
            size_t start = 0;
            immutable decoded = decodeImpl!(true, useReplacementDchar)(codePoint, start);
            str = rest;
            return decoded;
        }
    }

    // Below the limit the single code unit is the code point.
    numCodeUnits = 1;
    immutable last = str.back;
    str.popBack();
    return last;
}
1437 
/++ Ditto +/
dchar decodeBack(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(ref S str)
if (isSomeString!S
    || (isRandomAccessRange!S && hasLength!S && isSomeChar!(ElementType!S))
    || (!isRandomAccessRange!S && isBidirectionalRange!S && isSomeChar!(ElementType!S)))
in
{
    assert(!str.empty);
}
out (result)
{
    assert(isValidDchar(result));
}
do
{
    // The code unit count is computed by the full overload and then dropped.
    size_t ignoredCount;
    return decodeBack!useReplacementDchar(str, ignoredCount);
}
1456 
///
@system pure unittest
{
    import std.range.primitives;
    string str = "Hello, World!";

    // The decoded code point is removed from the back of the string.
    assert(decodeBack(str) == '!');
    assert(str == "Hello, World");

    str = "å";
    assert(decodeBack(str) == 'å');
    assert(str.empty);

    // The second argument receives the number of code units consumed.
    str = "å";
    size_t i;
    assert(decodeBack(str, i) == 'å');
    assert(i == 2);
    assert(str.empty);
}
1470 
// Upper bound, per encoding, below which a code unit of the range S is
// guaranteed to be a valid code point encoded in that single unit.
package template codeUnitLimit(S)
if (isSomeChar!(ElementEncodingType!S))
{
    alias Unit = immutable ElementEncodingType!S;

    static if (is(Unit == immutable char))
    {
        enum char codeUnitLimit = 0x80;
    }
    else static if (is(Unit == immutable wchar))
    {
        enum wchar codeUnitLimit = 0xD800;
    }
    else
    {
        enum dchar codeUnitLimit = 0xD800;
    }
}
1483 
/*
 * Decodes one UTF-8 sequence, either at `str[index]` (canIndex) or from the
 * front of `str`.
 *
 * For strings, this function does its own bounds checking to give a
 * more useful error message when attempting to decode past the end of a string.
 * Subsequently it uses a pointer instead of an array to avoid
 * redundant bounds checking.
 *
 * The three overloads of this operate on chars, wchars, and dchars.
 *
 * Params:
 *      canIndex = if S is indexable
 *      useReplacementDchar = if invalid UTF, return replacementDchar rather than throwing
 *      str = input string or Range
 *      index = starting index into str; incremented by number of code units processed
 *
 * Returns:
 *      decoded character
 */
private dchar decodeImpl(bool canIndex, UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
    auto ref S str, ref size_t index)
if (
    is(S : const char[]) || (isInputRange!S && is(immutable ElementEncodingType!S == immutable char)))
{
    /* Bit layouts of UTF-8 sequences. All of them are valid except for the
     * 5 and 6 byte combinations:
     *  0xxxxxxx
     *  110xxxxx 10xxxxxx
     *  1110xxxx 10xxxxxx 10xxxxxx
     *  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     *  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     *  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     */

    /* payloadMask[n - 1] keeps the dchar bits that a sequence of n UTF-8
     * code units can carry.
     */
    alias payloadMask = AliasSeq!((1 << 7) - 1, (1 << 11) - 1, (1 << 16) - 1, (1 << 21) - 1);

    static if (is(S : const char[]))
        auto src = str.ptr + index;    // this is what makes decodeImpl() @system code
    else static if (isRandomAccessRange!S && hasSlicing!S && hasLength!S)
        auto src = str[index .. str.length];
    else
        alias src = str;

    // https://issues.dlang.org/show_bug.cgi?id=14447 forces this to be done
    // outside of decodeImpl
    //enum canIndex = is(S : const char[]) || (isRandomAccessRange!S && hasSlicing!S && hasLength!S);

    static if (canIndex)
    {
        // Number of code units available from `index` onwards.
        immutable remaining = str.length - index;
        ubyte lead = src[0];
    }
    else
    {
        ubyte lead = src.front;
        src.popFront();
    }

    static if (!useReplacementDchar)
    {
        static if (canIndex)
        {
            // Builds an exception that carries the offending code units: the
            // first one plus the continuation units (at most three) after it.
            static UTFException makeException(Units)(Units units, string msg)
            {
                uint[4] sequence = void;
                size_t count;

                do
                {
                    sequence[count] = units[count];
                } while (++count < units.length && count < 4 && (units[count] & 0xC0) == 0x80);

                return new UTFException(msg, count).setSequence(sequence[0 .. count]);
            }
        }

        UTFException malformed()
        {
            static if (canIndex)
               return makeException(src[0 .. remaining], "Invalid UTF-8 sequence");
            else
            {
                //We can't include the invalid sequence with input strings without
                //saving each of the code units along the way, and we can't do it with
                //forward ranges without saving the entire range. Both would incur a
                //cost for the decoding of every character just to provide a better
                //error message for the (hopefully) rare case when an invalid UTF-8
                //sequence is encountered, so we don't bother trying to include the
                //invalid sequence here, unlike with strings and sliceable ranges.
               return new UTFException("Invalid UTF-8 sequence");
            }
        }

        UTFException truncated()
        {
            static if (canIndex)
               return makeException(src[0 .. remaining], "Attempted to decode past the end of a string");
            else
               return new UTFException("Attempted to decode past the end of a string");
        }
    }

    if ((lead & 0b1100_0000) != 0b1100_0000)
    {
        static if (useReplacementDchar)
        {
            ++index;            // always consume bad input to avoid infinite loops
            return replacementDchar;
        }
        else
            throw malformed(); // starter must have at least 2 first bits set
    }

    ubyte cont = void;
    dchar cp = lead; // upper control bits are masked out later
    lead <<= 1;

    // Unrolled at compile time: i is the position of the continuation unit.
    foreach (i; AliasSeq!(1, 2, 3))
    {
        static if (canIndex)
        {
            if (i == remaining)
            {
                static if (useReplacementDchar)
                {
                    index += i;
                    return replacementDchar;
                }
                else
                    throw truncated();
            }

            cont = src[i];
        }
        else
        {
            if (src.empty)
            {
                static if (useReplacementDchar)
                {
                    index += i;
                    return replacementDchar;
                }
                else
                    throw truncated();
            }

            cont = src.front;
            src.popFront();
        }

        // Every continuation unit has the form 10xxxxxx.
        if ((cont & 0xC0) != 0x80)
        {
            static if (useReplacementDchar)
            {
                index += i + 1;
                return replacementDchar;
            }
            else
                throw malformed();
        }

        cp = (cp << 6) | (cont & 0x3F);
        lead <<= 1;

        if (!(lead & 0x80)) // no more bytes
        {
            cp &= payloadMask[i]; // mask out control bits

            // overlong, could have been encoded with i bytes
            if ((cp & ~payloadMask[i - 1]) == 0)
            {
                static if (useReplacementDchar)
                {
                    index += i + 1;
                    return replacementDchar;
                }
                else
                    throw malformed();
            }

            // check for surrogates only needed for 3 bytes
            static if (i == 2)
            {
                if (!isValidDchar(cp))
                {
                    static if (useReplacementDchar)
                    {
                        index += i + 1;
                        return replacementDchar;
                    }
                    else
                        throw malformed();
                }
            }

            // only a 4 byte sequence can exceed dchar.max
            static if (i == 3)
            {
                if (cp > dchar.max)
                {
                    static if (useReplacementDchar)
                        cp = replacementDchar;
                    else
                        throw malformed();
                }
            }

            index += i + 1;
            return cp;
        }
    }

    // The lead unit announced more than 4 code units.
    static if (useReplacementDchar)
    {
        index += 4;             // read 4 chars by now
        return replacementDchar;
    }
    else
        throw malformed();
}
1707 
@safe pure @nogc nothrow
unittest
{
    // Tests for the useReplacementDchar == yes path of the non-indexable
    // (plain input range) UTF-8 decoder.

    // Minimal input range of chars over a string.
    static struct CharInput
    {
      @safe pure @nogc nothrow:
        this(string text) { this.text = text; }
        @property bool empty() { return pos == text.length; }
        @property char front() { return text[pos]; }
        void popFront() { ++pos; }
        size_t pos;
        string text;
    }

    foreach (s; invalidUTFstrings!char())
    {
        auto input = CharInput(s);
        size_t index;
        immutable dchar decoded = decodeImpl!(false, Yes.useReplacementDchar)(input, index);

        // Invalid input yields the replacement character and is consumed.
        assert(decoded == replacementDchar);
        assert(index >= 1);
        assert(index <= s.length);
    }
}
1733 
/*
    UTF-16 implementation of decodeImpl.

    Decodes the code point whose first code unit is at `str[index]` (or at
    the front of `str` when `canIndex` is false) and advances `index` by the
    number of code units consumed. The caller must already have handled
    code units below 0xD800.

    With `Yes.useReplacementDchar`, malformed input yields `replacementDchar`;
    otherwise a `UTFException` is thrown and `index` is left untouched.
 */
private dchar decodeImpl(bool canIndex, UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)
(auto ref S str, ref size_t index)
if (is(S : const wchar[]) || (isInputRange!S && is(immutable ElementEncodingType!S == immutable wchar)))
{
    // View of the code units from `index` on: a pointer for arrays, a slice
    // for sliceable random-access ranges, otherwise the range itself.
    static if (is(S : const wchar[]))
        auto pstr = str.ptr + index;
    else static if (isRandomAccessRange!S && hasSlicing!S && hasLength!S)
        auto pstr = str[index .. str.length];
    else
        alias pstr = str;

    // https://issues.dlang.org/show_bug.cgi?id=14447 forces this to be done
    // outside of decodeImpl
    //enum canIndex = is(S : const wchar[]) || (isRandomAccessRange!S && hasSlicing!S && hasLength!S);

    static if (canIndex)
    {
        immutable remaining = str.length - index;
        uint unit = pstr[0];
    }
    else
    {
        uint unit = pstr.front;
        pstr.popFront();
    }

    static if (!useReplacementDchar)
    {
        // Builds the exception to throw; the first code unit is attached
        // only when the input can still be indexed.
        UTFException failure(string msg)
        {
            static if (canIndex)
                return new UTFException(msg).setSequence(pstr[0]);
            else
                return new UTFException(msg);
        }
    }

    // The < case must be taken care of before decodeImpl is called.
    assert(unit >= 0xD800);

    if (unit <= 0xDBFF)
    {
        // High surrogate: a low surrogate has to follow.
        static if (canIndex)
            immutable truncated = remaining == 1;
        else
            immutable truncated = pstr.empty;

        if (truncated)
        {
            static if (useReplacementDchar)
            {
                ++index;
                return replacementDchar;
            }
            else
                throw failure("surrogate UTF-16 high value past end of string");
        }

        static if (canIndex)
            immutable uint trail = pstr[1];
        else
        {
            immutable uint trail = pstr.front;
            pstr.popFront();
        }

        if (0xDC00 <= trail && trail <= 0xDFFF)
            unit = ((unit - 0xD7C0) << 10) + (trail - 0xDC00);
        else
        {
            static if (useReplacementDchar)
                unit = replacementDchar;
            else
                throw failure("surrogate UTF-16 low value out of range");
        }

        // Both code units count as consumed, even for a bad trail unit.
        index += 2;
        return cast(dchar) unit;
    }

    if (unit >= 0xDC00 && unit <= 0xDFFF)
    {
        // Low surrogate without a preceding high surrogate.
        static if (useReplacementDchar)
            unit = replacementDchar;
        else
            throw failure("unpaired surrogate UTF-16 value");
    }
    ++index;

    // Note: u+FFFE and u+FFFF are specifically permitted by the
    // Unicode standard for application internal use (see isValidDchar)

    return cast(dchar) unit;
}
1825 
@safe pure @nogc nothrow
unittest
{
    // Exercises the Yes.useReplacementDchar path of the UTF-16 decodeImpl,
    // called with canIndex == false on a minimal range of code units.

    static struct CodeUnits
    {
      @safe pure @nogc nothrow:
        this(wstring text) { this.text = text; }
        @property bool empty() { return pos == text.length; }
        @property wchar front() { return text[pos]; }
        void popFront() { ++pos; }
        size_t pos;
        wstring text;
    }

    foreach (bad; invalidUTFstrings!wchar())
    {
        auto units = CodeUnits(bad);
        size_t consumed;
        immutable dchar decoded = decodeImpl!(false, Yes.useReplacementDchar)(units, consumed);

        // Every invalid input must yield the replacement character and
        // report between one code unit and the whole input as consumed.
        assert(decoded == replacementDchar);
        assert(consumed >= 1 && consumed <= bad.length);
    }
}
1851 
/*
    UTF-32 implementation of decodeImpl.

    Reads one code unit, at `str[index]` for arrays and random-access ranges
    or from the front of `str` otherwise, validates it with `isValidDchar`,
    and increments `index`. Ranges that are not indexed are also popped.

    With `Yes.useReplacementDchar` an invalid value is replaced by
    `replacementDchar`; otherwise a `UTFException` is thrown before `index`
    or the range is modified.
 */
private dchar decodeImpl(bool canIndex, UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
    auto ref S str, ref size_t index)
if (is(S : const dchar[]) || (isInputRange!S && is(immutable ElementEncodingType!S == immutable dchar)))
{
    static if (is(S : const dchar[]))
        auto pstr = str.ptr;
    else
        alias pstr = str;

    // Whether the code unit is fetched by index rather than from the front.
    enum byIndex = is(S : const dchar[]) || isRandomAccessRange!S;

    static if (byIndex)
        dchar unit = pstr[index];
    else
        dchar unit = pstr.front;

    if (!isValidDchar(unit))
    {
        static if (useReplacementDchar)
            unit = replacementDchar;
        else
            throw new UTFException("Invalid UTF-32 value").setSequence(unit);
    }

    ++index;
    static if (!byIndex)
        pstr.popFront();
    return unit;
}
1889 
@safe pure @nogc nothrow
unittest
{
    // Exercises the Yes.useReplacementDchar path of the UTF-32 decodeImpl,
    // called with canIndex == false on a minimal range of code units.

    static struct CodeUnits
    {
      @safe pure @nogc nothrow:
        this(dstring text) { this.text = text; }
        @property bool empty() { return pos == text.length; }
        @property dchar front() { return text[pos]; }
        void popFront() { ++pos; }
        size_t pos;
        dstring text;
    }

    foreach (bad; invalidUTFstrings!dchar())
    {
        auto units = CodeUnits(bad);
        size_t consumed;
        immutable dchar decoded = decodeImpl!(false, Yes.useReplacementDchar)(units, consumed);

        // Every invalid input must yield the replacement character and
        // report between one code unit and the whole input as consumed.
        assert(decoded == replacementDchar);
        assert(consumed >= 1 && consumed <= bad.length);
    }
}
1915 
1916 
// Test helper: for random-access ranges that are not narrow strings, checks
// that `decode(range, index)` returns `expectedChar`, advances `index` to
// `expectedIndex`, and leaves the length of `range` unchanged. Failures are
// reported as an AssertError located at the caller's `line`.
version (StdUnittest) private void testDecode(R)(R range,
                                             size_t index,
                                             dchar expectedChar,
                                             size_t expectedIndex,
                                             size_t line = __LINE__)
{
    import core.exception : AssertError;
    import std.string : format;
    import std.traits : isNarrowString;

    static if (hasLength!R)
        immutable originalLength = range.length;

    static if (isRandomAccessRange!R && !isNarrowString!R)
    {
        immutable decoded = decode(range, index);

        if (decoded != expectedChar)
            throw new AssertError(format("decode: Wrong character: %s", decoded), __FILE__, line);
        if (index != expectedIndex)
            throw new AssertError(format("decode: Wrong index: %s", index), __FILE__, line);

        static if (hasLength!R)
        {
            if (range.length != originalLength)
                throw new AssertError(format("decode: length changed: %s", range.length), __FILE__, line);
        }
    }
}
1947 
// Test helper: checks that `decodeFront(range, n)` returns `expectedChar`,
// reports `expectedNumCodeUnits` code units, and (when the range has a
// length) shortens `range` by exactly that many code units. Failures are
// reported as an AssertError located at the caller's `line`.
version (StdUnittest) private void testDecodeFront(R)(ref R range,
                                                  dchar expectedChar,
                                                  size_t expectedNumCodeUnits,
                                                  size_t line = __LINE__)
{
    import core.exception : AssertError;
    import std.string : format;

    static if (hasLength!R)
        immutable originalLength = range.length;

    size_t consumed;
    immutable decoded = decodeFront(range, consumed);

    if (decoded != expectedChar)
        throw new AssertError(format("decodeFront: Wrong character: %s", decoded), __FILE__, line);
    if (consumed != expectedNumCodeUnits)
        throw new AssertError(format("decodeFront: Wrong numCodeUnits: %s", consumed), __FILE__, line);

    static if (hasLength!R)
    {
        if (range.length != originalLength - consumed)
            throw new AssertError(format("decodeFront: wrong length: %s", range.length), __FILE__, line);
    }
}
1973 
// Test helper: the `decodeBack` counterpart of testDecodeFront. It does
// nothing for ranges that are not bidirectional, which lets callers run all
// `decode` helpers over the same list of range types.
version (StdUnittest) private void testDecodeBack(R)(ref R range,
                                                 dchar expectedChar,
                                                 size_t expectedNumCodeUnits,
                                                 size_t line = __LINE__)
{
    static if (isBidirectionalRange!R)
    {
        import core.exception : AssertError;
        import std.string : format;

        static if (hasLength!R)
            immutable originalLength = range.length;

        size_t consumed;
        immutable decoded = decodeBack(range, consumed);

        if (decoded != expectedChar)
            throw new AssertError(format("decodeBack: Wrong character: %s", decoded), __FILE__, line);
        if (consumed != expectedNumCodeUnits)
            throw new AssertError(format("decodeBack: Wrong numCodeUnits: %s", consumed), __FILE__, line);

        static if (hasLength!R)
        {
            if (range.length != originalLength - consumed)
                throw new AssertError(format("decodeBack: wrong length: %s", range.length), __FILE__, line);
        }
    }
}
2005 
// Test helper: runs testDecode (from index 0), testDecodeBack (on a saved
// copy, bidirectional ranges only) and testDecodeFront against a range that
// holds exactly one encoded character of `expectedIndex` code units.
version (StdUnittest) private void testAllDecode(R)(R range,
                                                dchar expectedChar,
                                                size_t expectedIndex,
                                                size_t line = __LINE__)
{
    // decode takes the range by value, so it can go first.
    testDecode(range, 0, expectedChar, expectedIndex, line);

    static if (isBidirectionalRange!R)
    {
        // decodeBack consumes its argument; work on a saved copy so that
        // `range` is still intact for the decodeFront check below.
        auto fromBack = range.save;
        testDecodeBack(fromBack, expectedChar, expectedIndex, line);
    }

    testDecodeFront(range, expectedChar, expectedIndex, line);
}
2019 
// Test helper: checks that decoding `range` fails with a UTFException.
//
// For random-access ranges, `decode(range, index)` must throw and leave both
// `index` and the length of `range` unchanged. When `index` is 0,
// `decodeFront` must throw as well. Failures are reported as an AssertError
// located at the caller's `line`.
version (StdUnittest) private void testBadDecode(R)(R range, size_t index, size_t line = __LINE__)
{
    import core.exception : AssertError;
    import std.exception : assertThrown, enforce;
    import std.string : format;

    immutable initialIndex = index;

    static if (hasLength!R)
        immutable lenBefore = range.length;

    static if (isRandomAccessRange!R)
    {
        assertThrown!UTFException(decode(range, index), null, __FILE__, line);
        enforce(index == initialIndex,
                new AssertError(format("decode: Wrong index: %s", index), __FILE__, line));
        static if (hasLength!R)
        {
            // The format string needs a "%s" for its argument; without one,
            // format throws a FormatException instead of producing the
            // AssertError message.
            enforce(range.length == lenBefore,
                    new AssertError(format("decode: length changed: %s", range.length), __FILE__, line));
        }
    }

    // decodeFront always starts at the front, so it is only comparable to
    // the decode call above when the requested index is 0.
    if (initialIndex == 0)
        assertThrown!UTFException(decodeFront(range, index), null, __FILE__, line);
}
2046 
// Test helper: for bidirectional random-access ranges, checks that
// `decodeBack(range)` throws a UTFException and leaves the length of `range`
// unchanged. Failures are reported as an AssertError located at the caller's
// `line`.
version (StdUnittest) private void testBadDecodeBack(R)(R range, size_t line = __LINE__)
{
    // This condition is to allow unit testing all `decode` functions together
    static if (!isBidirectionalRange!R)
        return;
    else
    {
        import core.exception : AssertError;
        import std.exception : assertThrown, enforce;
        import std.string : format;

        static if (hasLength!R)
            immutable lenBefore = range.length;

        static if (isRandomAccessRange!R)
        {
            assertThrown!UTFException(decodeBack(range), null, __FILE__, line);
            static if (hasLength!R)
            {
                // The format string needs a "%s" for its argument; without
                // one, format throws a FormatException instead of producing
                // the AssertError message.
                enforce(range.length == lenBefore,
                        new AssertError(format("decodeBack: length changed: %s", range.length), __FILE__, line));
            }
        }
    }
}
2072 
@system unittest
{
    import std.conv : to;
    import std.exception;

    // UTF-8: runs the decode helpers over a plain string and over the
    // code-unit range wrappers, at run time and in CTFE.
    assertCTFEable!(
    {
    foreach (S; AliasSeq!(to!string, InputCU!char, RandomCU!char,
                          (string s) => new RefBidirCU!char(s),
                          (string s) => new RefRandomCU!char(s)))
    {
        enum sHasLength = hasLength!(typeof(S("abcd")));

        {
            // One code unit per character, decoded from the front.
            auto ascii = S("abcd");
            testDecode(ascii, 0, 'a', 1);
            testDecode(ascii, 1, 'b', 2);
            testDecodeFront(ascii, 'a', 1);
            testDecodeFront(ascii, 'b', 1);
            assert(decodeFront(ascii) == 'c');
            assert(decodeFront(ascii) == 'd');
        }

        {
            // Three code units per character, decoded from the front.
            auto kana = S("ウェブサイト");
            testDecode(kana, 0, 'ウ', 3);
            testDecode(kana, 3, 'ェ', 6);
            testDecodeFront(kana, 'ウ', 3);
            testDecodeFront(kana, 'ェ', 3);
            assert(decodeFront(kana) == 'ブ');
            assert(decodeFront(kana) == 'サ');
        }

        {
            // One code unit per character, decoded from the back.
            auto ascii = S("abcd");
            testDecodeBack(ascii, 'd', 1);
            testDecodeBack(ascii, 'c', 1);
            testDecodeBack(ascii, 'b', 1);
            testDecodeBack(ascii, 'a', 1);
        }

        {
            // Three code units per character, decoded from the back.
            auto kana = S("ウェブサイト");
            testDecodeBack(kana, 'ト', 3);
            testDecodeBack(kana, 'イ', 3);
            testDecodeBack(kana, 'サ', 3);
            testDecodeBack(kana, 'ブ', 3);
        }

        // Single two- and three-byte characters.
        testAllDecode(S("\xC2\xA9"), '\u00A9', 2);
        testAllDecode(S("\xE2\x89\xA0"), '\u2260', 3);

        // Truncated, overlong, and five-/six-byte sequences must be rejected.
        foreach (bad; ["\xE2\x89", // too short
                       "\xC0\x8A",
                       "\xE0\x80\x8A",
                       "\xF0\x80\x80\x8A",
                       "\xF8\x80\x80\x80\x8A",
                       "\xFC\x80\x80\x80\x80\x8A"])
        {
            testBadDecode(S(bad), 0);
            testBadDecode(S(bad), 1);
            testBadDecodeBack(S(bad));
        }

        // U+FFFE and U+FFFF are expected to decode successfully from their
        // three-byte forms.
        testAllDecode(S("\xEF\xBF\xBE"), cast(dchar) 0xFFFE, 3);
        testAllDecode(S("\xEF\xBF\xBF"), cast(dchar) 0xFFFF, 3);

        // Three-byte forms of the surrogate code points U+D800 .. U+DFFF
        // must be rejected.
        foreach (bad; ["\xED\xA0\x80",
                       "\xED\xAD\xBF",
                       "\xED\xAE\x80",
                       "\xED\xAF\xBF",
                       "\xED\xB0\x80",
                       "\xED\xBE\x80",
                       "\xED\xBF\xBF"])
        {
            testBadDecode(S(bad), 0);
            testBadDecodeBack(S(bad));
        }
    }
    });
}
2156 
@system unittest
{
    import std.exception;

    // UTF-16: runs the decode helpers over a wstring and over the code-unit
    // range wrappers, at run time and in CTFE.
    assertCTFEable!(
    {
    foreach (S; AliasSeq!((wstring s) => s, InputCU!wchar, RandomCU!wchar,
                          (wstring s) => new RefBidirCU!wchar(s),
                          (wstring s) => new RefRandomCU!wchar(s)))
    {
        // Single code units and the first/last surrogate pairs.
        testAllDecode(S([cast(wchar) 0x1111]), cast(dchar) 0x1111, 1);
        testAllDecode(S([cast(wchar) 0xD800, cast(wchar) 0xDC00]), cast(dchar) 0x10000, 2);
        testAllDecode(S([cast(wchar) 0xDBFF, cast(wchar) 0xDFFF]), cast(dchar) 0x10FFFF, 2);
        testAllDecode(S([cast(wchar) 0xFFFE]), cast(dchar) 0xFFFE, 1);
        testAllDecode(S([cast(wchar) 0xFFFF]), cast(dchar) 0xFFFF, 1);

        // A high surrogate alone, or followed by a non-surrogate, is invalid.
        testBadDecode(S([ cast(wchar) 0xD801 ]), 0);
        testBadDecode(S([ cast(wchar) 0xD800, cast(wchar) 0x1200 ]), 0);

        testBadDecodeBack(S([ cast(wchar) 0xD801 ]));
        testBadDecodeBack(S([ cast(wchar) 0x0010, cast(wchar) 0xD800 ]));

        {
            // One code unit per character, decoded from the front.
            auto kana = S("ウェブサイト");
            testDecode(kana, 0, 'ウ', 1);
            testDecode(kana, 1, 'ェ', 2);
            testDecodeFront(kana, 'ウ', 1);
            testDecodeFront(kana, 'ェ', 1);
            assert(decodeFront(kana) == 'ブ');
            assert(decodeFront(kana) == 'サ');
        }

        {
            // One code unit per character, decoded from the back.
            auto kana = S("ウェブサイト");
            testDecodeBack(kana, 'ト', 1);
            testDecodeBack(kana, 'イ', 1);
            testDecodeBack(kana, 'サ', 1);
            testDecodeBack(kana, 'ブ', 1);
        }
    }

    foreach (S; AliasSeq!((wchar[] s) => s.idup, RandomCU!wchar, (wstring s) => new RefRandomCU!wchar(s)))
    {
        // Surrogate pair, single code unit, surrogate pair.
        auto mixed = S([cast(wchar) 0xD800, cast(wchar) 0xDC00,
                        cast(wchar) 0x1400,
                        cast(wchar) 0xDAA7, cast(wchar) 0xDDDE]);
        testDecode(mixed, 0, cast(dchar) 0x10000, 2);
        testDecode(mixed, 2, cast(dchar) 0x1400, 3);
        testDecode(mixed, 3, cast(dchar) 0xB9DDE, 5);
        testDecodeBack(mixed, cast(dchar) 0xB9DDE, 2);
        testDecodeBack(mixed, cast(dchar) 0x1400, 1);
        testDecodeBack(mixed, cast(dchar) 0x10000, 2);
    }
    });
}
2211 
@system unittest
{
    import std.exception;

    // UTF-32: runs the decode helpers over a dstring and over the code-unit
    // range wrappers, at run time and in CTFE.
    assertCTFEable!(
    {
    foreach (S; AliasSeq!((dstring s) => s, RandomCU!dchar, InputCU!dchar,
                          (dstring s) => new RefBidirCU!dchar(s),
                          (dstring s) => new RefRandomCU!dchar(s)))
    {
        // Every valid code point occupies exactly one code unit.
        testAllDecode(S([cast(dchar) 0x1111]), cast(dchar) 0x1111, 1);
        testAllDecode(S([cast(dchar) 0x10000]), cast(dchar) 0x10000, 1);
        testAllDecode(S([cast(dchar) 0x10FFFF]), cast(dchar) 0x10FFFF, 1);
        testAllDecode(S([cast(dchar) 0xFFFE]), cast(dchar) 0xFFFE, 1);
        testAllDecode(S([cast(dchar) 0xFFFF]), cast(dchar) 0xFFFF, 1);

        // Surrogate values and values above U+10FFFF are invalid.
        testBadDecode(S([cast(dchar) 0xD800]), 0);
        testBadDecode(S([cast(dchar) 0xDFFE]), 0);
        testBadDecode(S([cast(dchar) 0x110000]), 0);

        testBadDecodeBack(S([cast(dchar) 0xD800]));
        testBadDecodeBack(S([cast(dchar) 0xDFFE]));
        testBadDecodeBack(S([cast(dchar) 0x110000]));

        {
            // Decoded from the front.
            auto kana = S("ウェブサイト");
            testDecode(kana, 0, 'ウ', 1);
            testDecode(kana, 1, 'ェ', 2);
            testDecodeFront(kana, 'ウ', 1);
            testDecodeFront(kana, 'ェ', 1);
            assert(decodeFront(kana) == 'ブ');
            assert(decodeFront(kana) == 'サ');
        }

        {
            // Decoded from the back.
            auto kana = S("ウェブサイト");
            testDecodeBack(kana, 'ト', 1);
            testDecodeBack(kana, 'イ', 1);
            testDecodeBack(kana, 'サ', 1);
            testDecodeBack(kana, 'ブ', 1);
        }
    }

    foreach (S; AliasSeq!((dchar[] s) => s.idup, RandomCU!dchar, (dstring s) => new RefRandomCU!dchar(s)))
    {
        auto mixed = S([cast(dchar) 0x10000, cast(dchar) 0x1400, cast(dchar) 0xB9DDE]);
        testDecode(mixed, 0, 0x10000, 1);
        testDecode(mixed, 1, 0x1400, 2);
        testDecode(mixed, 2, 0xB9DDE, 3);
        testDecodeBack(mixed, cast(dchar) 0xB9DDE, 1);
        testDecodeBack(mixed, cast(dchar) 0x1400, 1);
        testDecodeBack(mixed, cast(dchar) 0x10000, 1);
    }
    });
}
2266 
@safe unittest
{
    import std.exception;
    import std.traits : FunctionAttribute, functionAttributes, isSafe;

    // Compile-time checks that decoding every built-in string type is
    // inferred @safe (decode, decodeFront) and pure (also decodeBack).
    assertCTFEable!(
    {
    foreach (S; AliasSeq!( char[], const( char)[],  string,
                          wchar[], const(wchar)[], wstring,
                          dchar[], const(dchar)[], dstring))
    {
        // Safety.
        static assert(isSafe!({ S text; size_t pos = 0; decode(text, pos); }));
        static assert(isSafe!({ S text; size_t pos = 0; decodeFront(text, pos); }));
        static assert(isSafe!({ S text; decodeFront(text); }));

        // Purity.
        static assert((functionAttributes!({
            S text; size_t pos = 0; decode(text, pos);
        }) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!({
            S text; size_t pos = 0; decodeFront(text, pos);
        }) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!({
            S text; decodeFront(text);
        }) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!({
            S text; size_t pos = 0; decodeBack(text, pos);
        }) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!({
            S text; decodeBack(text);
        }) & FunctionAttribute.pure_) != 0);
    }
    });
}
2292 
@safe unittest
{
    import std.exception;

    // A four-byte sequence with every payload bit set: its value lies above
    // U+10FFFF, so decode must reject it.
    char[4] units;
    units[0] = 0b1111_0111;
    foreach (ref unit; units[1 .. $])
        unit = 0b1011_1111;

    size_t pos = 0;
    assertThrown!UTFException((){ dchar decoded = decode(units[], pos); }());
}
2304 /* =================== Encode ======================= */
2305 
// Shared error path of the encode overloads: throws a UTFException carrying
// `msg` and the offending code point `c`, or, with Yes.useReplacementDchar,
// returns replacementDchar for the caller to encode instead.
private dchar _utfException(UseReplacementDchar useReplacementDchar)(string msg, dchar c)
{
    static if (!useReplacementDchar)
        throw new UTFException(msg).setSequence(c);
    else
        return replacementDchar;
}
2313 
/++
    Encodes `c` into the static array, `buf`, and returns the number of code
    units written (a number between `1` and `4` for `char[4]` buffers and a
    number between `1` and `2` for `wchar[2]` buffers).

    Params:
        useReplacementDchar = with `Yes.useReplacementDchar`, an invalid `c`
            is encoded as `replacementDchar` instead of raising an exception
        buf = fixed-size buffer that receives the encoded code units
        c = the code point to encode

    Returns:
        The number of leading code units of `buf` that hold the encoding.

    Throws:
        `UTFException` if `c` is not a valid UTF code point (with the default
        `No.useReplacementDchar`).
  +/
size_t encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    out char[4] buf, dchar c) @safe pure
{
    if (c <= 0x7F)
    {
        assert(isValidDchar(c));
        buf[0] = cast(char) c;
        return 1;
    }

    if (c <= 0x7FF)
    {
        assert(isValidDchar(c));
        buf[0] = cast(char)(0xC0 | (c >> 6));
        buf[1] = cast(char)(0x80 | (c & 0x3F));
        return 2;
    }

    if (0xFFFF < c && c <= 0x10FFFF)
    {
        assert(isValidDchar(c));
        buf[0] = cast(char)(0xF0 | (c >> 18));
        buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[3] = cast(char)(0x80 | (c & 0x3F));
        return 4;
    }

    // What remains is either a three-unit code point or an invalid value;
    // an invalid value is swapped for the (three-unit) replacement, or the
    // helper throws.
    if (c > 0x10FFFF)
    {
        assert(!isValidDchar(c));
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-8", c);
    }
    else
    {
        if (0xD800 <= c && c <= 0xDFFF)
            c = _utfException!useReplacementDchar("Encoding a surrogate code point in UTF-8", c);

        assert(isValidDchar(c));
    }

    buf[0] = cast(char)(0xE0 | (c >> 12));
    buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
    buf[2] = cast(char)(0x80 | (c & 0x3F));
    return 3;
}
2365 
///
@safe unittest
{
    import std.exception : assertThrown;
    import std.typecons : Yes;

    char[4] units;

    // The return value is the number of code units written.
    assert(encode(units, '\u0000') == 1);
    assert(units[0 .. 1] == "\u0000");

    assert(encode(units, '\u007F') == 1);
    assert(units[0 .. 1] == "\u007F");

    assert(encode(units, '\u0080') == 2);
    assert(units[0 .. 2] == "\u0080");

    assert(encode(units, '\uE000') == 3);
    assert(units[0 .. 3] == "\uE000");

    assert(encode(units, 0xFFFE) == 3);
    assert(units[0 .. 3] == "\xEF\xBF\xBE");

    // Values above U+10FFFF cannot be encoded ...
    assertThrown!UTFException(encode(units, cast(dchar) 0x110000));

    // ... unless the replacement character is requested instead.
    encode!(Yes.useReplacementDchar)(units, cast(dchar) 0x110000);
    auto encoded = units[];
    assert(encoded.decodeFront == replacementDchar);
}
2385 
///
@safe unittest
{
    import std.exception : assertThrown;
    import std.typecons : Yes;

    wchar[2] units;

    // Code points up to U+FFFF take one code unit.
    assert(encode(units, '\u0000') == 1);
    assert(units[0 .. 1] == "\u0000");

    assert(encode(units, '\uD7FF') == 1);
    assert(units[0 .. 1] == "\uD7FF");

    assert(encode(units, '\uE000') == 1);
    assert(units[0 .. 1] == "\uE000");

    // Code points above U+FFFF take a surrogate pair.
    assert(encode(units, '\U00010000') == 2);
    assert(units[0 .. 2] == "\U00010000");

    assert(encode(units, '\U0010FFFF') == 2);
    assert(units[0 .. 2] == "\U0010FFFF");

    // A surrogate value is not a code point that can be encoded ...
    assertThrown!UTFException(encode(units, cast(dchar) 0xD800));

    // ... and invalid values can be replaced on request.
    encode!(Yes.useReplacementDchar)(units, cast(dchar) 0x110000);
    auto encoded = units[];
    assert(encoded.decodeFront == replacementDchar);
}
2405 
///
@safe unittest
{
    import std.exception : assertThrown;
    import std.typecons : Yes;

    dchar[1] unit;

    // Every valid code point is stored as a single code unit.
    assert(encode(unit, '\u0000') == 1);
    assert(unit[0] == '\u0000');

    assert(encode(unit, '\uD7FF') == 1);
    assert(unit[0] == '\uD7FF');

    assert(encode(unit, '\uE000') == 1);
    assert(unit[0] == '\uE000');

    assert(encode(unit, '\U0010FFFF') == 1);
    assert(unit[0] == '\U0010FFFF');

    // A surrogate value is rejected ...
    assertThrown!UTFException(encode(unit, cast(dchar) 0xD800));

    // ... and invalid values can be replaced on request.
    encode!(Yes.useReplacementDchar)(unit, cast(dchar) 0x110000);
    assert(unit[0] == replacementDchar);
}
2423 
@safe unittest
{
    import std.exception;

    // Boundary values of each UTF-8 length class, at run time and in CTFE.
    assertCTFEable!(
    {
    char[4] units;

    // One code unit.
    assert(encode(units, '\u0000') == 1);
    assert(units[0 .. 1] == "\u0000");
    assert(encode(units, '\u007F') == 1);
    assert(units[0 .. 1] == "\u007F");

    // Two code units.
    assert(encode(units, '\u0080') == 2);
    assert(units[0 .. 2] == "\u0080");
    assert(encode(units, '\u07FF') == 2);
    assert(units[0 .. 2] == "\u07FF");

    // Three code units, on both sides of the surrogate range.
    assert(encode(units, '\u0800') == 3);
    assert(units[0 .. 3] == "\u0800");
    assert(encode(units, '\uD7FF') == 3);
    assert(units[0 .. 3] == "\uD7FF");
    assert(encode(units, '\uE000') == 3);
    assert(units[0 .. 3] == "\uE000");
    assert(encode(units, 0xFFFE) == 3);
    assert(units[0 .. 3] == "\xEF\xBF\xBE");
    assert(encode(units, 0xFFFF) == 3);
    assert(units[0 .. 3] == "\xEF\xBF\xBF");

    // Four code units.
    assert(encode(units, '\U00010000') == 4);
    assert(units[0 .. 4] == "\U00010000");
    assert(encode(units, '\U0010FFFF') == 4);
    assert(units[0 .. 4] == "\U0010FFFF");

    // Surrogates and values above U+10FFFF are rejected.
    assertThrown!UTFException(encode(units, cast(dchar) 0xD800));
    assertThrown!UTFException(encode(units, cast(dchar) 0xDBFF));
    assertThrown!UTFException(encode(units, cast(dchar) 0xDC00));
    assertThrown!UTFException(encode(units, cast(dchar) 0xDFFF));
    assertThrown!UTFException(encode(units, cast(dchar) 0x110000));

    // With replacement enabled the buffer holds an encoded U+FFFD.
    immutable written = encode!(Yes.useReplacementDchar)(units, cast(dchar) 0x110000);
    assert(written == units.stride);
    enum replacementDcharString = "\uFFFD";
    assert(units[0 .. replacementDcharString.length] == replacementDcharString);
    });
}
2454 
2455 
/// Ditto
size_t encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    out wchar[2] buf, dchar c) @safe pure
{
    if (0xFFFF < c && c <= 0x10FFFF)
    {
        // Supplementary code point: split into a surrogate pair.
        assert(isValidDchar(c));
        immutable uint offset = c - 0x10000;
        buf[0] = cast(wchar)(((offset >> 10) & 0x3FF) + 0xD800);
        buf[1] = cast(wchar)((offset & 0x3FF) + 0xDC00);
        return 2;
    }

    // What remains is either a single-unit code point or an invalid value;
    // an invalid value is swapped for the (single-unit) replacement, or the
    // helper throws.
    if (c > 0x10FFFF)
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-16", c);
    else
    {
        if (0xD800 <= c && c <= 0xDFFF)
            c = _utfException!useReplacementDchar("Encoding an isolated surrogate code point in UTF-16", c);

        assert(isValidDchar(c));
    }

    buf[0] = cast(wchar) c;
    return 1;
}
2481 
@safe unittest
{
    import std.exception;

    // Boundary values of each UTF-16 length class, at run time and in CTFE.
    assertCTFEable!(
    {
    wchar[2] units;

    // One code unit.
    assert(encode(units, '\u0000') == 1);
    assert(units[0 .. 1] == "\u0000");
    assert(encode(units, '\uD7FF') == 1);
    assert(units[0 .. 1] == "\uD7FF");
    assert(encode(units, '\uE000') == 1);
    assert(units[0 .. 1] == "\uE000");
    assert(encode(units, 0xFFFE) == 1);
    assert(units[0] == 0xFFFE);
    assert(encode(units, 0xFFFF) == 1);
    assert(units[0] == 0xFFFF);

    // Surrogate pairs.
    assert(encode(units, '\U00010000') == 2);
    assert(units[0 .. 2] == "\U00010000");
    assert(encode(units, '\U0010FFFF') == 2);
    assert(units[0 .. 2] == "\U0010FFFF");

    // Surrogates and values above U+10FFFF are rejected.
    assertThrown!UTFException(encode(units, cast(dchar) 0xD800));
    assertThrown!UTFException(encode(units, cast(dchar) 0xDBFF));
    assertThrown!UTFException(encode(units, cast(dchar) 0xDC00));
    assertThrown!UTFException(encode(units, cast(dchar) 0xDFFF));
    assertThrown!UTFException(encode(units, cast(dchar) 0x110000));

    // With replacement enabled the buffer starts with U+FFFD.
    immutable written = encode!(Yes.useReplacementDchar)(units, cast(dchar) 0x110000);
    assert(written == units.stride);
    assert(units.front == replacementDchar);
    });
}
2507 
2508 
/// Ditto
size_t encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    out dchar[1] buf, dchar c) @safe pure
{
    immutable bool isSurrogate = 0xD800 <= c && c <= 0xDFFF;
    immutable bool isTooLarge = c > 0x10FFFF;

    // UTF-32 stores the code point as is; only its validity needs checking.
    if (isSurrogate || isTooLarge)
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-32", c);
    else
        assert(isValidDchar(c));

    buf[0] = c;
    return 1;
}
2520 
@safe unittest
{
    import std.exception;

    // UTF-32 encoding of boundary values, at run time and in CTFE.
    assertCTFEable!(
    {
    dchar[1] unit;

    // Valid code points are stored unchanged.
    encode(unit, '\u0000');
    assert(unit[0] == '\u0000');
    encode(unit, '\uD7FF');
    assert(unit[0] == '\uD7FF');
    encode(unit, '\uE000');
    assert(unit[0] == '\uE000');
    encode(unit, 0xFFFE );
    assert(unit[0] == 0xFFFE);
    encode(unit, 0xFFFF );
    assert(unit[0] == 0xFFFF);
    encode(unit, '\U0010FFFF');
    assert(unit[0] == '\U0010FFFF');

    // Surrogates and values above U+10FFFF are rejected.
    assertThrown!UTFException(encode(unit, cast(dchar) 0xD800));
    assertThrown!UTFException(encode(unit, cast(dchar) 0xDBFF));
    assertThrown!UTFException(encode(unit, cast(dchar) 0xDC00));
    assertThrown!UTFException(encode(unit, cast(dchar) 0xDFFF));
    assertThrown!UTFException(encode(unit, cast(dchar) 0x110000));

    // With replacement enabled the buffer holds U+FFFD.
    immutable written = encode!(Yes.useReplacementDchar)(unit, cast(dchar) 0x110000);
    assert(written == unit.stride);
    assert(unit.front == replacementDchar);
    });
}
2545 
2546 
/++
    Encodes `c` in `str`'s encoding and appends it to `str`.

    Params:
        useReplacementDchar = with `Yes.useReplacementDchar`, an invalid `c`
            is appended as `replacementDchar` instead of raising an exception
        str = the array to append the encoded code units to
        c = the code point to encode

    Throws:
        `UTFException` if `c` is not a valid UTF code point (with the default
        `No.useReplacementDchar`).
  +/
void encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    ref scope char[] str, dchar c) @safe pure
{
    // A single code unit is appended directly, without the scratch buffer.
    if (c <= 0x7F)
    {
        assert(isValidDchar(c));
        str ~= cast(char) c;
        return;
    }

    char[4] units;
    size_t count;

    if (c <= 0x7FF)
    {
        assert(isValidDchar(c));
        units[0] = cast(char)(0xC0 | (c >> 6));
        units[1] = cast(char)(0x80 | (c & 0x3F));
        count = 2;
    }
    else if (0xFFFF < c && c <= 0x10FFFF)
    {
        assert(isValidDchar(c));
        units[0] = cast(char)(0xF0 | (c >> 18));
        units[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        units[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        units[3] = cast(char)(0x80 | (c & 0x3F));
        count = 4;
    }
    else
    {
        // Either a three-unit code point or an invalid value; an invalid
        // value is swapped for the (three-unit) replacement, or the helper
        // throws.
        if (c > 0x10FFFF)
        {
            assert(!isValidDchar(c));
            c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-8", c);
        }
        else
        {
            if (0xD800 <= c && c <= 0xDFFF)
                c = _utfException!useReplacementDchar("Encoding a surrogate code point in UTF-8", c);

            assert(isValidDchar(c));
        }

        units[0] = cast(char)(0xE0 | (c >> 12));
        units[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        units[2] = cast(char)(0x80 | (c & 0x3F));
        count = 3;
    }

    str ~= units[0 .. count];
}
2603 
///
@safe unittest
{
    char[] text = "abcd".dup;
    immutable dchar ascii = 'a';
    immutable dchar nonAscii = 'ø';

    // An ASCII code point adds exactly one UTF-8 code unit.
    encode(text, ascii);
    assert(text == "abcda");
    assert(text.length == 5);

    // 'ø' needs two UTF-8 code units.
    encode(text, nonAscii);
    assert(text == "abcdaø");
    assert(text.length == 7);
}
2618 
@safe unittest
{
    import std.exception;

    assertCTFEable!(
    {
    char[] acc = "abcd".dup;

    // one code unit
    encode(acc, cast(dchar)'a');
    assert(acc == "abcda");
    assert(acc.length == 5);

    // two code units
    encode(acc, cast(dchar)'\u00A9');
    assert(acc == "abcda\xC2\xA9");
    assert(acc.length == 7);
    //assert(acc == "abcda\u00A9");   // BUG: fix compiler

    // three code units
    encode(acc, cast(dchar)'\u2260');
    assert(acc == "abcda\xC2\xA9\xE2\x89\xA0");
    assert(acc.length == 10);
    });
}
2640 
@safe unittest
{
    import std.exception;
    assertCTFEable!(
    {
    char[] buf;

    // Encodes `cp` onto `buf` and checks that everything from `offset` on
    // is exactly `expected`.
    void appendAndCheck(dchar cp, size_t offset, string expected)
    {
        encode(buf, cp);
        assert(buf[offset .. $] == expected);
    }

    appendAndCheck('\u0000', 0, "\u0000");
    appendAndCheck('\u007F', 1, "\u007F");
    appendAndCheck('\u0080', 2, "\u0080");
    appendAndCheck('\u07FF', 4, "\u07FF");
    appendAndCheck('\u0800', 6, "\u0800");
    appendAndCheck('\uD7FF', 9, "\uD7FF");
    appendAndCheck('\uE000', 12, "\uE000");
    appendAndCheck(0xFFFE, 15, "\xEF\xBF\xBE");
    appendAndCheck(0xFFFF, 18, "\xEF\xBF\xBF");
    appendAndCheck('\U00010000', 21, "\U00010000");
    appendAndCheck('\U0010FFFF', 25, "\U0010FFFF");

    // Surrogates and values beyond U+10FFFF are rejected by default.
    foreach (bad; [cast(dchar) 0xD800, cast(dchar) 0xDBFF, cast(dchar) 0xDC00,
                   cast(dchar) 0xDFFF, cast(dchar) 0x110000])
        assertThrown!UTFException(encode(buf, bad));

    // With the replacement policy the replacement character is appended.
    enum replacementDcharString = "\uFFFD";
    enum rdcslen = replacementDcharString.length;
    assert(buf[$ - rdcslen .. $] != replacementDcharString);
    encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
    assert(buf[$ - rdcslen .. $] == replacementDcharString);
    });
}
2673 
/// ditto
void encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    ref scope wchar[] str, dchar c) @safe pure
{
    // Supplementary planes: emit a high/low surrogate pair.
    if (0xFFFF < c && c <= 0x10FFFF)
    {
        assert(isValidDchar(c));

        immutable offset = c - 0x10000;
        wchar[2] pair;
        pair[0] = cast(wchar)(((offset >> 10) & 0x3FF) + 0xD800);
        pair[1] = cast(wchar)((offset & 0x3FF) + 0xDC00);
        str ~= pair;
        return;
    }

    // Everything else ends up as a single code unit; failures either throw
    // or are replaced by whatever the helper returns.
    if (0x10FFFF < c)
    {
        assert(!isValidDchar(c));
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-16", c);
    }
    else
    {
        if (0xD800 <= c && c <= 0xDFFF)
            c = _utfException!useReplacementDchar("Encoding an isolated surrogate code point in UTF-16", c);

        assert(isValidDchar(c));
    }

    str ~= cast(wchar) c;
}
2703 
@safe unittest
{
    import std.exception;
    assertCTFEable!(
    {
    wchar[] buf;

    // BMP code points outside the surrogate range take one code unit each.
    encode(buf, '\u0000'); assert(buf[0] == '\u0000');
    encode(buf, '\uD7FF'); assert(buf[1] == '\uD7FF');
    encode(buf, '\uE000'); assert(buf[2] == '\uE000');
    encode(buf, 0xFFFE); assert(buf[3] == 0xFFFE);
    encode(buf, 0xFFFF); assert(buf[4] == 0xFFFF);

    // Supplementary code points take a surrogate pair.
    encode(buf, '\U00010000'); assert(buf[5 .. $] == "\U00010000");
    encode(buf, '\U0010FFFF'); assert(buf[7 .. $] == "\U0010FFFF");

    // Surrogates and values beyond U+10FFFF are rejected by default.
    foreach (bad; [cast(dchar) 0xD800, cast(dchar) 0xDBFF, cast(dchar) 0xDC00,
                   cast(dchar) 0xDFFF, cast(dchar) 0x110000])
        assertThrown!UTFException(encode(buf, bad));

    assert(buf.back != replacementDchar);
    encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
    assert(buf.back == replacementDchar);
    });
}
2730 
/// ditto
void encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    ref scope dchar[] str, dchar c) @safe pure
{
    immutable bool isSurrogate = 0xD800 <= c && c <= 0xDFFF;
    immutable bool beyondUnicode = 0x10FFFF < c;

    if (!isSurrogate && !beyondUnicode)
    {
        assert(isValidDchar(c));
    }
    else
    {
        // Throws, or yields the code point to store instead of `c`.
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-32", c);
    }

    str ~= c;
}
2741 
@safe unittest
{
    import std.exception;
    assertCTFEable!(
    {
    dchar[] buf;

    // Every valid code point is stored as a single UTF-32 code unit.
    encode(buf, '\u0000'); assert(buf[0] == '\u0000');
    encode(buf, '\uD7FF'); assert(buf[1] == '\uD7FF');
    encode(buf, '\uE000'); assert(buf[2] == '\uE000');
    encode(buf, 0xFFFE ); assert(buf[3] == 0xFFFE);
    encode(buf, 0xFFFF ); assert(buf[4] == 0xFFFF);
    encode(buf, '\U0010FFFF'); assert(buf[5] == '\U0010FFFF');

    // Surrogates and values beyond U+10FFFF are rejected by default.
    foreach (bad; [cast(dchar) 0xD800, cast(dchar) 0xDBFF, cast(dchar) 0xDC00,
                   cast(dchar) 0xDFFF, cast(dchar) 0x110000])
        assertThrown!UTFException(encode(buf, bad));

    assert(buf.back != replacementDchar);
    encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
    assert(buf.back == replacementDchar);
    });
}
2767 
2768 
/++
    Computes how many code units of type `C` are needed to encode the code
    point `c`.

    No validation of `c` is performed. For 8-bit `C`, a value above
    `0x10FFFF` triggers `assert(false)`.

    Params:
        C = the character type used for the encoding
        c = the code point to measure
    Returns:
        The number of `C` code units needed for `c`.
  +/
ubyte codeLength(C)(dchar c) @safe pure nothrow @nogc
if (isSomeChar!C)
{
    static if (C.sizeof == 4)
    {
        // UTF-32: always a single code unit.
        return 1;
    }
    else static if (C.sizeof == 2)
    {
        // UTF-16: anything above the BMP needs a surrogate pair.
        return c > 0xFFFF ? 2 : 1;
    }
    else
    {
        static assert(C.sizeof == 1);

        // UTF-8: one to four code units depending on magnitude.
        if (c < 0x80) return 1;
        if (c < 0x800) return 2;
        if (c < 0x1_0000) return 3;
        if (c < 0x11_0000) return 4;
        assert(false);
    }
}
2794 
///
@safe pure nothrow @nogc unittest
{
    // UTF-8: between one and four code units
    assert(codeLength!char('a') == 1);
    assert(codeLength!char('\U0010FFFF') == 4);

    // UTF-16: one or two code units
    assert(codeLength!wchar('a') == 1);
    assert(codeLength!wchar('\U0010FFFF') == 2);

    // UTF-32: always one code unit
    assert(codeLength!dchar('a') == 1);
    assert(codeLength!dchar('\U0010FFFF') == 1);
}
2806 
2807 
/++
    Computes how many code units `input` would occupy if it were encoded
    with the character type `C`. This is particularly useful when slicing one
    string with the length of another while the two strings use different
    character types.

    Params:
        C = the character type to get the encoding length for
        input = the $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
        to calculate the encoding length from
    Returns:
        The number of code units in `input` when encoded to `C`
  +/
size_t codeLength(C, InputRange)(InputRange input)
if (isSomeFiniteCharInputRange!InputRange)
{
    alias EncType = Unqual!(ElementEncodingType!InputRange);

    static if (isSomeString!InputRange && is(EncType == C) && is(typeof(input.length)))
    {
        // Already in the requested encoding: the length is the answer.
        return input.length;
    }
    else
    {
        // Walk the code points and add up what each one costs in `C`.
        size_t units = 0;

        for (auto points = input.byDchar; !points.empty; points.popFront())
            units += codeLength!C(points.front);

        return units;
    }
}
2837 
///
@safe unittest
{
    // ASCII text
    assert(codeLength!char("hello world") ==
           "hello world".length);
    assert(codeLength!wchar("hello world") ==
           "hello world"w.length);
    assert(codeLength!dchar("hello world") ==
           "hello world"d.length);

    // non-ASCII text
    assert(codeLength!char(`プログラミング`) ==
           `プログラミング`.length);
    assert(codeLength!wchar(`プログラミング`) ==
           `プログラミング`w.length);
    assert(codeLength!dchar(`プログラミング`) ==
           `プログラミング`d.length);

    // Slicing a UTF-8 string by the length of a UTF-16 string
    string sentence = `Être sans la verité, ça, ce ne serait pas bien.`;
    wstring prefix = `Être sans la verité`;
    assert(sentence[codeLength!char(prefix) .. $] ==
           `, ça, ce ne serait pas bien.`);
}
2860 
@safe unittest
{
    import std.algorithm.iteration : filter;
    import std.conv : to;
    import std.exception;

    assertCTFEable!(
    {
    // Every source string type against every target character type.
    foreach (S; AliasSeq!( char[], const  char[],  string,
                          wchar[], const wchar[], wstring,
                          dchar[], const dchar[], dstring))
    {
        foreach (C; AliasSeq!(char, wchar, dchar))
        {
            foreach (text; AliasSeq!("Walter Bright", `言語`, `ウェブサイト@La_Verité.com`))
                assert(codeLength!C(to!S(text)) == to!(C[])(text).length);

            // The same input seen through a non-string range.
            assert(codeLength!C(to!S(`ウェブサイト@La_Verité.com`).filter!(x => true)()) ==
                   to!(C[])(`ウェブサイト@La_Verité.com`).length);
        }
    }
    });
}
2885 
/+
Internal helper function:

Tells whether the code point `c` can be searched for directly among code
units of type `C`, i.e. without decoding.

This is a runtime check that is used as an optimization in various functions,
particularly in `std.string`.
  +/
package bool canSearchInCodeUnits(C)(dchar c)
if (isSomeChar!C)
{
    static if (C.sizeof == 4)
    {
        // UTF-32: every code point is one code unit.
        return true;
    }
    else static if (C.sizeof == 2)
    {
        // UTF-16: any BMP code point that is not a surrogate.
        return c <= 0xFFFF && !(0xD800 <= c && c <= 0xDFFF);
    }
    else static if (C.sizeof == 1)
    {
        // UTF-8: ASCII only.
        return c < 0x80;
    }
    else
        static assert(0);
}
@safe unittest
{
    // ASCII is searchable in every encoding.
    assert( canSearchInCodeUnits! char('a'));
    assert( canSearchInCodeUnits!wchar('a'));
    assert( canSearchInCodeUnits!dchar('a'));

    // Important test: ö <= 0xFF, yet it is not a single UTF-8 code unit.
    assert(!canSearchInCodeUnits! char('ö'));
    assert(!canSearchInCodeUnits! char(cast(char)'ö'));
    assert( canSearchInCodeUnits!wchar('ö'));
    assert( canSearchInCodeUnits!dchar('ö'));

    // A BMP code point above the Latin range.
    assert(!canSearchInCodeUnits! char('日'));
    assert( canSearchInCodeUnits!wchar('日'));
    assert( canSearchInCodeUnits!dchar('日'));

    // A value inside the surrogate range.
    assert(!canSearchInCodeUnits!wchar(cast(wchar) 0xDA00));
    assert( canSearchInCodeUnits!dchar(cast(dchar) 0xDA00));

    // A supplementary code point.
    assert(!canSearchInCodeUnits! char('\U00010001'));
    assert(!canSearchInCodeUnits!wchar('\U00010001'));
    assert( canSearchInCodeUnits!dchar('\U00010001'));
}
2925 
2926 /* =================== Validation ======================= */
2927 
/++
    Verifies that `str` is well-formed Unicode by decoding it from start to
    end.

    Params:
        str = the string to check

    Throws:
        `UTFException` if `str` is not well-formed.
  +/
void validate(S)(in S str) @safe pure
if (isSomeString!S)
{
    size_t pos = 0;

    // `decode` advances `pos` past each code point and throws on bad input.
    while (pos < str.length)
        decode(str, pos);
}
2943 
///
@safe unittest
{
    import std.exception : assertThrown;

    // Three continuation bytes with no lead byte are not valid UTF-8.
    char[] malformed = [167, 133, 175];
    assertThrown!UTFException(validate(malformed));
}
2951 
// https://issues.dlang.org/show_bug.cgi?id=12923
@safe unittest
{
    import std.exception;

    // Validating a slice of a static array must throw as well.
    static void validateStaticArraySlice() @safe
    {
        char[3] raw = [167, 133, 175];
        validate(raw[]);
    }

    assertThrown(validateStaticArraySlice());
}
2961 
/**
 * Converts the elements of `s` to UTF-8 and returns them as a newly
 * allocated `string`.
 *
 * Params:
 *     s = the range of characters to encode
 * Returns:
 *     A UTF-8 string
 * See_Also:
 *     For a lazy, non-allocating version of these functions, see $(LREF byUTF).
 */
string toUTF8(S)(S s)
if (isSomeFiniteCharInputRange!S)
{
    alias Target = string;
    return toUTFImpl!Target(s);
}
2978 
///
@safe pure unittest
{
    import std.algorithm.comparison : equal;

    // The ø is represented by two UTF-8 code units
    auto greeting = "Hellø"w.toUTF8;
    assert(greeting.equal(['H', 'e', 'l', 'l', 0xC3, 0xB8]));

    // 𐐷 is four code units in UTF-8
    auto supplementary = "𐐷"d.toUTF8;
    assert(supplementary.equal([0xF0, 0x90, 0x90, 0xB7]));
}
2990 
@system pure unittest
{
    import std.algorithm.comparison : equal;
    import std.internal.test.dummyrange : ReferenceInputRange;

    // The same conversions, fed from a reference-type input range.
    alias RT = ReferenceInputRange!(ElementType!(string));

    auto twoUnitSource = new RT("Hellø");
    assert(twoUnitSource.toUTF8.equal(['H', 'e', 'l', 'l', 0xC3, 0xB8]));

    auto fourUnitSource = new RT("𐐷");
    assert(fourUnitSource.toUTF8.equal([0xF0, 0x90, 0x90, 0xB7]));
}
3003 
/**
 * Converts the elements of `s` to UTF-16 and returns them as a newly GC
 * allocated `wstring`.
 *
 * Params:
 *     s = the range to encode
 * Returns:
 *     A UTF-16 string
 * See_Also:
 *     For a lazy, non-allocating version of these functions, see $(LREF byUTF).
 */
wstring toUTF16(S)(S s)
if (isSomeFiniteCharInputRange!S)
{
    alias Target = wstring;
    return toUTFImpl!Target(s);
}
3020 
///
@safe pure unittest
{
    import std.algorithm.comparison : equal;

    // these characters are one code unit in UTF-32 ...
    assert("𤭢"d.length == 1);
    assert("𐐷"d.length == 1);

    // ... and two code units (a surrogate pair) in UTF-16
    auto firstPair = "𤭢"d.toUTF16;
    assert(firstPair.equal([0xD852, 0xDF62]));

    auto secondPair = "𐐷"d.toUTF16;
    assert(secondPair.equal([0xD801, 0xDC37]));
}
3033 
@system pure unittest
{
    import std.algorithm.comparison : equal;
    import std.internal.test.dummyrange : ReferenceInputRange;

    // The same conversions, fed from a reference-type input range.
    alias RT = ReferenceInputRange!(ElementType!(string));

    auto firstSource = new RT("𤭢");
    assert(firstSource.toUTF16.equal([0xD852, 0xDF62]));

    auto secondSource = new RT("𐐷");
    assert(secondSource.toUTF16.equal([0xD801, 0xDC37]));
}
3046 
3047 
/**
 * Converts the elements of `s` to UTF-32 and returns them as a newly GC
 * allocated `dstring`.
 *
 * Params:
 *     s = the range to encode
 * Returns:
 *     A UTF-32 string
 * See_Also:
 *     For a lazy, non-allocating version of these functions, see $(LREF byUTF).
 */
dstring toUTF32(S)(scope S s)
if (isSomeFiniteCharInputRange!S)
{
    alias Target = dstring;
    return toUTFImpl!Target(s);
}
3064 
///
@safe pure unittest
{
    import std.algorithm.comparison : equal;

    // these characters are two code units in UTF-16 ...
    assert("𤭢"w.length == 2);
    assert("𐐷"w.length == 2);

    // ... and a single code unit in UTF-32
    auto first = "𤭢"w.toUTF32;
    assert(first.equal([0x00024B62]));

    auto second = "𐐷"w.toUTF32;
    assert(second.equal([0x00010437]));
}
3077 
// Shared implementation of toUTF8/toUTF16/toUTF32: produces a `T` holding
// the contents of `s`, re-encoded to `T`'s character type where necessary.
private T toUTFImpl(T, S)(scope S s)
{
    static if (!is(S : T))
    {
        import std.array : appender;

        alias TargetUnit = Unqual!(ElementEncodingType!T);
        auto sink = appender!T();

        // Pre-size the buffer when the source length is known up front.
        static if (is(S == C[], C) || hasLength!S)
            sink.reserve(s.length);

        foreach (unit; s.byUTF!TargetUnit)
            sink.put(unit);

        return sink.data;
    }
    else
    {
        // The source already converts to the target type: just copy it.
        return s.idup;
    }
}
3098 
3099 /* =================== toUTFz ======================= */
3100 
/++
    Produces a C-style zero-terminated string equivalent to `str` and returns
    a pointer to it. `str` must not contain embedded `'\0'`'s, because any C
    function will treat the first `'\0'` that it sees as the end of the
    string. If `str.empty` is `true`, then a string containing only `'\0'` is
    returned.

    `toUTFz` accepts any type of string and is templated on the type of
    character pointer that you wish to convert to. It avoids allocating a new
    string when it can, but there's a decent chance that it will end up
    having to allocate one - particularly when dealing with character types
    other than `char`.

    $(RED Warning 1:) If the result of `toUTFz` equals `str.ptr`, then the
    string stops being zero-terminated as soon as anything alters the
    character one past the end of `str` (which is the `'\0'` character
    terminating the string). The most likely scenarios for that are
    appending to `str` without a reallocation taking place, or `str` being a
    slice of a larger array whose element one past the end of `str` gets
    altered. Another case would be a mutable character array that sits
    immediately after `str` in memory (for example, two member variables of a
    user-defined type declared one right after the other) and happens to
    start with `'\0'`. Such scenarios never occur if you use the
    zero-terminated string immediately after calling `toUTFz` and the C
    function using it doesn't keep a reference to it. They are also unlikely
    to occur even if you save the zero-terminated string (the cases above
    are among the few examples of where it could happen). However, if you
    save the zero-terminated string and want to be absolutely certain that it
    stays zero-terminated, then simply append a `'\0'` to the string and use
    its `ptr` property rather than calling `toUTFz`.

    $(RED Warning 2:) When passing a character pointer to a C function, and
    the C function keeps it around for any reason, make sure that you keep a
    reference to it in your D code. Otherwise, it may go away during a
    garbage collection cycle and cause a nasty bug when the C code tries to
    use it.
  +/
template toUTFz(P)
if (is(P == C*, C) && isSomeChar!C)
{
    P toUTFz(S)(S str) @safe pure
    if (isSomeString!S)
    {
        // The overload of the implementation is selected by P and S.
        return toUTFzImpl!(P, S)(str);
    }
}
3147 
///
@safe pure unittest
{
    // Same character type, each pointer mutability
    auto mutablePtr = toUTFz!(char*)("hello world");
    auto constPtr = toUTFz!(const(char)*)("hello world");
    auto immutablePtr = toUTFz!(immutable(char)*)("hello world");

    // Different character types: the string is transcoded
    auto fromUTF32 = toUTFz!(char*)("hello world"d);
    auto toUTF16Ptr = toUTFz!(const(wchar)*)("hello world");
    auto toUTF32Ptr = toUTFz!(immutable(dchar)*)("hello world"w);
}
3158 
private P toUTFzImpl(P, S)(return scope S str) @safe pure
if (is(immutable typeof(*P.init) == typeof(str[0])))
//immutable(C)[] -> C*, const(C)*, or immutable(C)*
{
    alias OutChar = typeof(*P.init);

    // An empty input maps to a fresh array that holds only the terminator.
    if (str.empty)
    {
        OutChar[] terminatorOnly = ['\0'];

        auto firstUnit() @trusted { return terminatorOnly.ptr; }
        return firstUnit();
    }

    alias C = Unqual!(ElementEncodingType!S);

    // The input is immutable, so it can only be handed back as-is when the
    // requested pointee is not mutable. If P is mutable, a copy is required.
    static if (!is(Unqual!OutChar == OutChar))
    {
        if (!__ctfe)
        {
            auto endOf(S s) @trusted { return s.ptr + s.length; }
            immutable pastEnd = endOf(str);

            // Peek past end of str, if it's 0, no conversion necessary.
            // Note that the compiler will put a 0 past the end of static
            // strings, and the storage allocator will put a 0 past the end
            // of newly allocated char[]'s.
            // Is pastEnd dereferenceable? A simple test: if it points to an
            // address multiple of 4, then conservatively assume the pointer
            // might be pointing to a new block of memory, which might be
            // unreadable. Otherwise, it's definitely pointing to valid
            // memory.
            if ((cast(size_t) pastEnd & 3) && *pastEnd == '\0')
                return &str[0];
        }
    }

    // Fall back to the overload that copies / appends the terminator.
    return toUTFzImpl!(P, const(C)[])(cast(const(C)[])str);
}
3201 
// Zero-terminates a mutable or const string whose character type already
// matches the requested pointer type.
//
// When the input and output mutability are compatible, the input's own memory
// is returned if a '\0' already follows it; otherwise a '\0' is appended to
// `str` and a pointer to the result is returned. For the remaining
// combinations a fresh copy with a trailing '\0' is allocated.
private P toUTFzImpl(P, S)(return scope S str) @safe pure
if (is(typeof(str[0]) C) && is(immutable typeof(*P.init) == immutable C) && !is(C == immutable))
//C[] or const(C)[] -> C*, const(C)*, or immutable(C)*
{
    alias InChar  = typeof(str[0]);
    alias OutChar = typeof(*P.init);

    //const(C)[] -> const(C)* or
    //C[] -> C* or const(C)*
    static if (( is(const(Unqual!InChar) == InChar) &&  is(const(Unqual!OutChar) == OutChar)) ||
               (!is(const(Unqual!InChar) == InChar) && !is(immutable(Unqual!OutChar) == OutChar)))
    {
        // The shortcut below returns `&str[0]`, which is only valid for a
        // non-empty `str`. An empty slice (e.g. `a[1 .. 1]`) whose pointer is
        // unaligned and happens to point at a '\0' used to fail the bounds
        // check there, so empty input skips the peek and takes the append path.
        if (!__ctfe && str.length != 0)
        {
            auto trustedPtrAdd(S s) @trusted { return s.ptr + s.length; }
            auto p = trustedPtrAdd(str);

            // Peek one past the end; pointers that are a multiple of 4 are
            // conservatively treated as possibly unreadable.
            if ((cast(size_t) p & 3) && *p == '\0')
                return &str[0];
        }

        str ~= '\0';
        return &str[0];
    }
    //const(C)[] -> C* or immutable(C)* or
    //C[] -> immutable(C)*
    else
    {
        import std.array : uninitializedArray;
        auto copy = uninitializedArray!(Unqual!OutChar[])(str.length + 1);
        copy[0 .. $ - 1] = str[];
        copy[$ - 1] = '\0';

        auto trustedCast(typeof(copy) c) @trusted { return cast(P) c.ptr; }
        return trustedCast(copy);
    }
}
3239 
private P toUTFzImpl(P, S)(S str) @safe pure
if (!is(immutable typeof(*P.init) == immutable typeof(str[0])))
//C1[], const(C1)[], or immutable(C1)[] -> C2*, const(C2)*, or immutable(C2)*
{
    import std.array : appender;

    alias OutChar = typeof(*P.init);
    auto sink = appender!(OutChar[])();

    // Iterating by `dchar` decodes `str`; the appender re-encodes each code
    // point for the target character type.
    foreach (dchar codePoint; str)
        sink.put(codePoint);
    sink.put('\0');

    auto firstUnit() @trusted { return cast(P) sink.data.ptr; }
    return firstUnit();
}
3253 
@safe pure unittest
{
    import core.exception : AssertError;
    import std.algorithm;
    import std.conv : to;
    import std.exception;
    import std.string : format;

    // Same character type in and out, for every pointer mutability.
    assertCTFEable!(
    {
    foreach (S; AliasSeq!(string, wstring, dstring))
    {
        alias C = Unqual!(ElementEncodingType!S);

        auto literal = to!S("hello\U00010143\u0100\U00010143");

        // Build an equal string that is followed by a non-zero code unit.
        auto scratch = new C[](literal.length + 1);
        scratch[0 .. $ - 1] = literal[0 .. $];
        scratch[$ - 1] = '\n';
        scratch.length = scratch.length - 1;
        auto trustedAssumeUnique(T)(T t) @trusted { return assumeUnique(t); }
        auto unterminated = trustedAssumeUnique(scratch);
        assert(literal == unterminated);

        void checkCString(P, Str)(Str s) @trusted
        {
            auto ptr = toUTFz!P(s);
            assert(ptr[0 .. s.length] == s);
            assert(ptr[s.length] == '\0');
        }

        foreach (P; AliasSeq!(C*, const(C)*, immutable(C)*))
        {
            checkCString!P(literal);
            checkCString!P(unterminated);
        }
    }
    });

    // Converts `s` to `P` and compares it against the text up to the first '\0'.
    static void test(P, S)(S s, size_t line = __LINE__) @trusted
    {
        static size_t zeroLen(C)(const(C)* ptr) @trusted
        {
            size_t n = 0;
            for (; *ptr != '\0'; ++ptr)
                ++n;
            return n;
        }

        auto ptr = toUTFz!P(s);
        immutable n = zeroLen(ptr);
        if (cmp(s, ptr[0 .. n]) != 0)
            throw new AssertError(format("Unit test failed: %s %s", P.stringof, S.stringof),
                                  __FILE__, line);
    }

    // Transcoding conversions.
    assertCTFEable!(
    {
    foreach (P; AliasSeq!(wchar*, const(wchar)*, immutable(wchar)*,
                          dchar*, const(dchar)*, immutable(dchar)*))
    {
        test!P("hello\U00010143\u0100\U00010143");
    }
    foreach (P; AliasSeq!( char*, const( char)*, immutable( char)*,
                          dchar*, const(dchar)*, immutable(dchar)*))
    {
        test!P("hello\U00010143\u0100\U00010143"w);
    }
    foreach (P; AliasSeq!( char*, const( char)*, immutable( char)*,
                          wchar*, const(wchar)*, immutable(wchar)*))
    {
        test!P("hello\U00010143\u0100\U00010143"d);
    }
    foreach (S; AliasSeq!( char[], const( char)[],
                          wchar[], const(wchar)[],
                          dchar[], const(dchar)[]))
    {
        auto text = to!S("hello\U00010143\u0100\U00010143");

        foreach (P; AliasSeq!( char*, const( char)*, immutable( char)*,
                              wchar*, const(wchar)*, immutable(wchar)*,
                              dchar*, const(dchar)*, immutable(dchar)*))
        {
            test!P(text);
        }
    }
    });
}
3340 
3341 
/++
    Convenience wrapper around `toUTFz!(const(wchar)*)`.

    Encodes the string `str` into UTF-16 and returns a pointer to the
    zero-terminated result. `toUTF16z` is suitable for calling the 'W'
    functions in the Win32 API that take an `LPCWSTR` argument.
  +/
const(wchar)* toUTF16z(C)(const(C)[] str) @safe pure
if (isSomeChar!C)
{
    alias WidePtr = const(wchar)*;
    return str.toUTFz!WidePtr;
}
3354 
///
@system unittest
{
    string greeting = "Hello, World!";
    const(wchar)* wide = greeting.toUTF16z;

    // The text is ASCII, so the terminator sits at index `greeting.length`.
    assert(wide[greeting.length] == '\0');
}
3362 
@safe pure unittest
{
    import std.conv : to;

    //toUTFz is already thoroughly tested, so this will just verify that
    //toUTF16z compiles properly for the various string types.
    foreach (S; AliasSeq!(string, wstring, dstring))
    {
        auto text = to!S("hello world");
        assert(toUTF16z(text) !is null);
    }
}
3371 
3372 
3373 /* ================================ tests ================================== */
3374 
@safe pure unittest
{
    import std.exception;

    assertCTFEable!(
    {
    // from UTF-8
    assert(toUTF16("hello"c) == "hello");
    assert(toUTF16("hel\u1234o"c) == "hel\u1234o");
    assert(toUTF16("he\U0010AAAAllo"c) == "he\U0010AAAAllo");
    assert(toUTF32("hello"c) == "hello");
    assert(toUTF32("hel\u1234o"c) == "hel\u1234o");
    assert(toUTF32("he\U0010AAAAllo"c) == "he\U0010AAAAllo");

    // from UTF-16
    assert(toUTF8 ("hello"w) == "hello");
    assert(toUTF8 ("hel\u1234o"w) == "hel\u1234o");
    assert(toUTF8 ("he\U0010AAAAllo"w) == "he\U0010AAAAllo");
    assert(toUTF32("hello"w) == "hello");
    assert(toUTF32("hel\u1234o"w) == "hel\u1234o");
    assert(toUTF32("he\U0010AAAAllo"w) == "he\U0010AAAAllo");

    // from UTF-32
    assert(toUTF8 ("hello"d) == "hello");
    assert(toUTF8 ("hel\u1234o"d) == "hel\u1234o");
    assert(toUTF8 ("he\U0010AAAAllo"d) == "he\U0010AAAAllo");
    assert(toUTF16("hello"d) == "hello");
    assert(toUTF16("hel\u1234o"d) == "hel\u1234o");
    assert(toUTF16("he\U0010AAAAllo"d) == "he\U0010AAAAllo");
    });
}
3403 
3404 
/++
    Returns the total number of code points encoded in `str`.

    The code points are counted by walking `str.byDchar`. This function is
    `nothrow` and `@nogc`, so it does $(I not) throw a `UTFException` for
    input that is not well-formed; how a malformed sequence is counted follows
    from how $(LREF byDchar) decodes it. Use $(LREF validate) first if
    malformed input has to be detected.

    Supersedes: This function supersedes $(LREF toUCSindex).

    Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

    Params:
        str = the string whose code points are counted
    Returns:
        The number of code points in `str`.
  +/
size_t count(C)(const(C)[] str) @safe pure nothrow @nogc
if (isSomeChar!C)
{
    return walkLength(str.byDchar);
}
3420 
///
@safe pure nothrow @nogc unittest
{
    // empty and ASCII-only strings
    assert(count("") == 0);
    assert(count("a") == 1);
    assert(count("abc") == 3);

    // the euro sign is one code point, whatever its encoded length
    assert(count("\u20AC100") == 4);
}
3429 
@safe pure nothrow @nogc unittest
{
    import std.exception;

    // The same checks, also evaluated at compile time.
    assertCTFEable!(
    {
    assert(count("\u20AC100") == 4);
    assert(count("abc") == 3);
    assert(count("a") == 1);
    assert(count("") == 0);
    });
}
3441 
3442 
3443 // Ranges of code units for testing.
3444 version (StdUnittest)
3445 {
3446 private:
3447     struct InputCU(C)
3448     {
3449         import std.conv : to;
3450         @property bool empty() { return _str.empty; }
3451         @property C front() { return _str[0]; }
3452         void popFront() { _str = _str[1 .. $]; }
3453 
3454         this(inout(C)[] str)
3455         {
3456             _str = to!(C[])(str);
3457         }
3458 
3459         C[] _str;
3460     }
3461 
3462     struct BidirCU(C)
3463     {
3464         import std.conv : to;
3465         @property bool empty() { return _str.empty; }
3466         @property C front() { return _str[0]; }
3467         void popFront() { _str = _str[1 .. $]; }
3468         @property C back() { return _str[$ - 1]; }
3469         void popBack() { _str = _str[0 .. $ - 1]; }
3470         @property auto save() { return BidirCU(_str); }
3471         @property size_t length() { return _str.length; }
3472 
3473         this(inout(C)[] str)
3474         {
3475             _str = to!(C[])(str);
3476         }
3477 
3478         C[] _str;
3479     }
3480 
3481     struct RandomCU(C)
3482     {
3483         import std.conv : to;
3484         @property bool empty() { return _str.empty; }
3485         @property C front() { return _str[0]; }
3486         void popFront() { _str = _str[1 .. $]; }
3487         @property C back() { return _str[$ - 1]; }
3488         void popBack() { _str = _str[0 .. $ - 1]; }
3489         @property auto save() { return RandomCU(_str); }
3490         @property size_t length() { return _str.length; }
3491         C opIndex(size_t i) { return _str[i]; }
3492         auto opSlice(size_t i, size_t j) { return RandomCU(_str[i .. j]); }
3493 
3494         this(inout(C)[] str)
3495         {
3496             _str = to!(C[])(str);
3497         }
3498 
3499         C[] _str;
3500     }
3501 
// Test helper: the class (reference-type) counterpart of a bidirectional
// code-unit range. Copies of the reference share one cursor; `save` allocates
// a new object so the saved range advances independently.
class RefBidirCU(C)
{
    import std.conv : to;

    // True once all code units have been consumed.
    @property bool empty() { return _str.empty; }
    // First remaining code unit.
    @property C front() { return _str[0]; }
    // Drops the first remaining code unit.
    void popFront() { _str = _str[1 .. $]; }
    // Last remaining code unit.
    @property C back() { return _str[$ - 1]; }
    // Drops the last remaining code unit.
    void popBack() { _str = _str[0 .. $ - 1]; }
    // New object positioned on the same remaining code units.
    @property auto save() { return new RefBidirCU(_str); }
    // Number of code units left.
    @property size_t length() { return _str.length; }

    // Accepts any constness of `C[]` and stores it as a mutable `C[]`
    // through `std.conv.to`.
    this(inout(C)[] str)
    {
        _str = to!(C[])(str);
    }

    // Code units that have not been popped yet.
    C[] _str;
}
3520 
// Test helper: the class (reference-type) counterpart of a random-access
// code-unit range with slicing. `save` and `opSlice` each allocate a new
// object, so their results advance independently of the original.
class RefRandomCU(C)
{
    import std.conv : to;

    // True once all code units have been consumed.
    @property bool empty() { return _str.empty; }
    // First remaining code unit.
    @property C front() { return _str[0]; }
    // Drops the first remaining code unit.
    void popFront() { _str = _str[1 .. $]; }
    // Last remaining code unit.
    @property C back() { return _str[$ - 1]; }
    // Drops the last remaining code unit.
    void popBack() { _str = _str[0 .. $ - 1]; }
    // New object positioned on the same remaining code units.
    @property auto save() { return new RefRandomCU(_str); }
    // Number of code units left.
    @property size_t length() { return _str.length; }
    // Code unit at offset `i` from the current front.
    C opIndex(size_t i) { return _str[i]; }
    // New object covering offsets `i .. j` of the remaining code units.
    auto opSlice(size_t i, size_t j) { return new RefRandomCU(_str[i .. j]); }

    // Accepts any constness of `C[]` and stores it as a mutable `C[]`
    // through `std.conv.to`.
    this(inout(C)[] str)
    {
        _str = to!(C[])(str);
    }

    // Code units that have not been popped yet.
    C[] _str;
}
3541 }
3542 
3543 
/**
 * The Unicode replacement character, U+FFFD.
 *
 * Inserted in place of invalid UTF sequences.
 *
 * References:
 *      $(LINK http://en.wikipedia.org/wiki/Replacement_character#Replacement_character)
 */
enum dchar replacementDchar = '\uFFFD';
3551 
/********************************************
 * Iterate a range of char, wchar, or dchars by code unit.
 *
 * The purpose is to bypass the special case decoding that
 * $(REF front, std,range,primitives) does to character arrays. As a result,
 * using ranges with `byCodeUnit` can be `nothrow` while
 * $(REF front, std,range,primitives) throws when it encounters invalid Unicode
 * sequences.
 *
 * A code unit is a building block of the UTF encodings. Generally, an
 * individual code unit does not represent what's perceived as a full
 * character (a.k.a. a grapheme cluster in Unicode terminology). Many characters
 * are encoded with multiple code units. For example, the UTF-8 code units for
 * `ø` are `0xC3 0xB8`. That means, an individual element of `byCodeUnit`
 * often does not form a character on its own. Attempting to treat it as
 * one while iterating over the resulting range will give nonsensical results.
 *
 * Params:
 *      r = an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
 *      of characters (including strings) or a type that implicitly converts to a string type.
 * Returns:
 *      If `r` is not an auto-decodable string (i.e. a narrow string or a
 *      user-defined type that implicitly converts to a string type), then `r`
 *      is returned.
 *
 *      Otherwise, `r` is converted to its corresponding string type (if it's
 *      not already a string) and wrapped in a random-access range where the
 *      element encoding type of the string (its code unit) is the element type
 *      of the range, and that range is returned. The range has slicing.
 *
 *      If `r` is quirky enough to be a struct or class which is an input range
 *      of characters on its own (i.e. it has the input range API as member
 *      functions), $(I and) it's implicitly convertible to a string type, then
 *      `r` is returned, and no implicit conversion takes place.
 *
 *      If `r` is wrapped in a new range, then that range has a `source`
 *      property for returning the string that's currently contained within that
 *      range.
 *
 * See_Also:
 *      Refer to the $(MREF std, uni) docs for a reference on Unicode
 *      terminology.
 *
 *      For a range that iterates by grapheme cluster (written character) see
 *      $(REF byGrapheme, std,uni).
 */
auto byCodeUnit(R)(R r)
if ((isConvertibleToString!R && !isStaticArray!R) ||
    (isInputRange!R && isSomeChar!(ElementEncodingType!R)))
{
    import std.traits : StringTypeOf;
    static if (// This would be cleaner if we had a way to check whether a type
               // was a range without any implicit conversions.
               (isAutodecodableString!R && !__traits(hasMember, R, "empty") &&
                !__traits(hasMember, R, "front") && !__traits(hasMember, R, "popFront")))
    {
        // Auto-decodable string without range members of its own: wrap it so
        // that every primitive works on raw code units.
        static struct ByCodeUnitImpl
        {
        @safe pure nothrow @nogc:

            @property bool empty() const     { return source.length == 0; }
            @property auto ref front() inout { return source[0]; }
            void popFront()                  { source = source[1 .. $]; }

            @property auto save() { return ByCodeUnitImpl(source.save); }

            @property auto ref back() inout { return source[$ - 1]; }
            void popBack()                  { source = source[0 .. $-1]; }

            auto ref opIndex(size_t index) inout     { return source[index]; }
            auto opSlice(size_t lower, size_t upper) { return ByCodeUnitImpl(source[lower .. upper]); }

            @property size_t length() const { return source.length; }
            alias opDollar = length;

            // The not-yet-consumed part of the wrapped string.
            StringTypeOf!R source;
        }

        static assert(isRandomAccessRange!ByCodeUnitImpl);

        return ByCodeUnitImpl(r);
    }
    else static if (!isInputRange!R ||
                    (is(R : const dchar[]) && !__traits(hasMember, R, "empty") &&
                    !__traits(hasMember, R, "front") && !__traits(hasMember, R, "popFront")))
    {
        // Not a range by itself, or something that converts to a dchar
        // array: hand back the underlying string type without wrapping.
        return cast(StringTypeOf!R) r;
    }
    else
    {
        // byCodeUnit for ranges and dchar[] is a no-op
        return r;
    }
}
3646 
///
@safe unittest
{
    import std.range.primitives;
    import std.traits : isAutodecodableString;

    // The result of byCodeUnit is a random-access range of code units.
    auto r = "Hello, World!".byCodeUnit();
    static assert(hasLength!(typeof(r)));
    static assert(hasSlicing!(typeof(r)));
    static assert(isRandomAccessRange!(typeof(r)));
    static assert(is(ElementType!(typeof(r)) == immutable char));

    // contrast with the range capabilities of standard strings (with or
    // without autodecoding enabled).
    auto s = "Hello, World!";
    // A plain string is a bidirectional range in both configurations.
    static assert(isBidirectionalRange!(typeof(s)));
    static if (isAutodecodableString!(typeof(s)))
    {
        // with autodecoding enabled, strings are non-random-access ranges of
        // dchar.
        static assert(is(ElementType!(typeof(s)) == dchar));
        static assert(!isRandomAccessRange!(typeof(s)));
        static assert(!hasSlicing!(typeof(s)));
        static assert(!hasLength!(typeof(s)));
    }
    else
    {
        // without autodecoding, strings are normal arrays.
        static assert(is(ElementType!(typeof(s)) == immutable char));
        static assert(isRandomAccessRange!(typeof(s)));
        static assert(hasSlicing!(typeof(s)));
        static assert(hasLength!(typeof(s)));
    }
}
3681 
/// `byCodeUnit` does no Unicode decoding
@safe unittest
{
    string noel1 = "noe\u0308l"; // noël using e + combining diaeresis
    // Index 2 is the plain 'e'; the combining mark follows as separate
    // code units.
    assert(noel1.byCodeUnit[2] != 'ë');
    assert(noel1.byCodeUnit[2] == 'e');

    string noel2 = "no\u00EBl"; // noël using a precomposed ë character
    // Because string is UTF-8, the code unit at index 2 is just
    // the first of a sequence that encodes 'ë'
    assert(noel2.byCodeUnit[2] != 'ë');
}
3694 
/// `byCodeUnit` exposes a `source` property when wrapping narrow strings.
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.range : popFrontN;
    import std.traits : isAutodecodableString;
    {
        auto range = byCodeUnit("hello world");
        range.popFrontN(3);
        assert(equal(range.save, "lo world"));
        static if (isAutodecodableString!string) // only enabled with autodecoding
        {
            // `source` reflects what is left after popping, not the original.
            string str = range.source;
            assert(str == "lo world");
        }
    }
    // source only exists if the range was wrapped
    {
        auto range = byCodeUnit("hello world"d);
        static assert(!__traits(compiles, range.source));
    }
}
3717 
// Exhaustive checks of byCodeUnit: idempotence, element access, and the
// result type for strings, alias-this wrappers, user ranges, enums and
// static arrays.
@safe pure nothrow @nogc unittest
{
    import std.range;
    // Applying byCodeUnit twice yields the same code units (char).
    {
        enum testStr = "𐁄𐂌𐃯 hello ディラン";
        char[testStr.length] s;
        int i;
        foreach (c; testStr.byCodeUnit().byCodeUnit())
        {
            s[i++] = c;
        }
        assert(s == testStr);
    }
    // Same for wchar.
    {
        enum testStr = "𐁄𐂌𐃯 hello ディラン"w;
        wchar[testStr.length] s;
        int i;
        foreach (c; testStr.byCodeUnit().byCodeUnit())
        {
            s[i++] = c;
        }
        assert(s == testStr);
    }
    // Same for dchar.
    {
        enum testStr = "𐁄𐂌𐃯 hello ディラン"d;
        dchar[testStr.length] s;
        int i;
        foreach (c; testStr.byCodeUnit().byCodeUnit())
        {
            s[i++] = c;
        }
        assert(s == testStr);
    }
    // length, indexing and slicing.
    {
        auto bcu = "hello".byCodeUnit();
        assert(bcu.length == 5);
        assert(bcu[3] == 'l');
        assert(bcu[2 .. 4][1] == 'l');
    }
    // front and opIndex are assignable when the source is mutable.
    {
        char[5] orig = "hello";
        auto bcu = orig[].byCodeUnit();
        bcu.front = 'H';
        assert(bcu.front == 'H');
        bcu[1] = 'E';
        assert(bcu[1] == 'E');
    }
    // save gives an independent position.
    {
        auto bcu = "hello".byCodeUnit().byCodeUnit();
        static assert(isForwardRange!(typeof(bcu)));
        static assert(is(typeof(bcu) == struct) == isAutodecodableString!string);
        auto s = bcu.save;
        bcu.popFront();
        assert(s.front == 'h');
    }
    // Bidirectional iteration over a string.
    {
        auto bcu = "hello".byCodeUnit();
        static assert(hasSlicing!(typeof(bcu)));
        static assert(isBidirectionalRange!(typeof(bcu)));
        static assert(is(typeof(bcu) == struct) == isAutodecodableString!string);
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        auto ret = bcu.retro;
        assert(ret.front == 'o');
        ret.popFront();
        assert(ret.front == 'l');
    }
    // Bidirectional iteration over a wstring.
    {
        auto bcu = "κόσμε"w.byCodeUnit();
        static assert(hasSlicing!(typeof(bcu)));
        static assert(isBidirectionalRange!(typeof(bcu)));
        static assert(is(typeof(bcu) == struct) == isAutodecodableString!wstring);
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        auto ret = bcu.retro;
        assert(ret.front == 'ε');
        ret.popFront();
        assert(ret.front == 'μ');
    }
    // Struct that converts to string through a field alias this.
    {
        static struct Stringish
        {
            string s;
            alias s this;
        }

        auto orig = Stringish("\U0010fff8 𐁊 foo 𐂓");
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == struct));
        static assert(!is(typeof(bcu) == Stringish) == isAutodecodableString!Stringish);
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == immutable char));
        assert(bcu.front == cast(char) 244);
    }
    // Struct that converts to wstring through a field alias this.
    {
        static struct WStringish
        {
            wstring s;
            alias s this;
        }

        auto orig = WStringish("\U0010fff8 𐁊 foo 𐂓"w);
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == struct));
        static assert(!is(typeof(bcu) == WStringish) == isAutodecodableString!WStringish);
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == immutable wchar));
        assert(bcu.front == cast(wchar) 56319);
    }
    // Struct that converts to dstring: the dstring itself comes back.
    {
        static struct DStringish
        {
            dstring s;
            alias s this;
        }

        auto orig = DStringish("\U0010fff8 𐁊 foo 𐂓"d);
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == dstring));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == immutable dchar));
        assert(bcu.front == cast(dchar) 1114104);
    }
    // Struct that converts to string through a member function alias this.
    {
        static struct FuncStringish
        {
            string str;
            string s() pure nothrow @nogc { return str; }
            alias s this;
        }

        auto orig = FuncStringish("\U0010fff8 𐁊 foo 𐂓");
        auto bcu = orig.byCodeUnit();
        static if (isAutodecodableString!FuncStringish)
            static assert(is(typeof(bcu) == struct));
        else
            static assert(is(typeof(bcu) == string));
        static assert(!is(typeof(bcu) == FuncStringish));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == immutable char));
        assert(bcu.front == cast(char) 244);
    }
    // User-defined input range of char is returned unchanged.
    {
        static struct Range
        {
            string data;
            bool empty() pure nothrow @nogc { return data.empty; }
            char front() pure nothrow @nogc { return data[0]; }
            void popFront() pure nothrow @nogc { data = data[1 .. $]; }
        }

        auto orig = Range("\U0010fff8 𐁊 foo 𐂓");
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == Range));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == char));
        assert(bcu.front == cast(char) 244);
    }
    // User-defined input range of wchar is returned unchanged.
    {
        static struct WRange
        {
            wstring data;
            bool empty() pure nothrow @nogc { return data.empty; }
            wchar front() pure nothrow @nogc { return data[0]; }
            void popFront() pure nothrow @nogc { data = data[1 .. $]; }
        }

        auto orig = WRange("\U0010fff8 𐁊 foo 𐂓"w);
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == WRange));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == wchar));
        assert(bcu.front == 56319);
    }
    // User-defined input range of dchar is returned unchanged.
    {
        static struct DRange
        {
            dstring data;
            bool empty() pure nothrow @nogc { return data.empty; }
            dchar front() pure nothrow @nogc { return data[0]; }
            void popFront() pure nothrow @nogc { data = data[1 .. $]; }
        }

        auto orig = DRange("\U0010fff8 𐁊 foo 𐂓"d);
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == DRange));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == dchar));
        assert(bcu.front == 1114104);
    }
    // A type that is both a range and string-convertible: the range API
    // wins, so `front` comes from `data` ("test.d"), not from `s`.
    {
        static struct RangeAndStringish
        {
            bool empty() pure nothrow @nogc { return data.empty; }
            char front() pure nothrow @nogc { return data[0]; }
            void popFront() pure nothrow @nogc { data = data[1 .. $]; }

            string data;
            string s;
            alias s this;
        }

        auto orig = RangeAndStringish("test.d", "other");
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == RangeAndStringish));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == char));
        assert(bcu.front == 't');
    }
    // Same for wchar.
    {
        static struct WRangeAndStringish
        {
            bool empty() pure nothrow @nogc { return data.empty; }
            wchar front() pure nothrow @nogc { return data[0]; }
            void popFront() pure nothrow @nogc { data = data[1 .. $]; }

            wstring data;
            wstring s;
            alias s this;
        }

        auto orig = WRangeAndStringish("test.d"w, "other"w);
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == WRangeAndStringish));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == wchar));
        assert(bcu.front == 't');
    }
    // Same for dchar.
    {
        static struct DRangeAndStringish
        {
            bool empty() pure nothrow @nogc { return data.empty; }
            dchar front() pure nothrow @nogc { return data[0]; }
            void popFront() pure nothrow @nogc { data = data[1 .. $]; }

            dstring data;
            dstring s;
            alias s this;
        }

        auto orig = DRangeAndStringish("test.d"d, "other"d);
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == DRangeAndStringish));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == dchar));
        assert(bcu.front == 't');
    }
    // Enums with a string base type never come back as the enum type.
    {
        enum Enum : string { a = "test.d" }

        auto orig = Enum.a;
        auto bcu = orig.byCodeUnit();
        static assert(!is(typeof(bcu) == Enum));
        static if (isAutodecodableString!Enum)
            static assert(is(typeof(bcu) == struct));
        else
            static assert(is(typeof(bcu) == string));
        static assert(is(ElementType!(typeof(bcu)) == immutable char));
        assert(bcu.front == 't');
    }
    {
        enum WEnum : wstring { a = "test.d"w }

        auto orig = WEnum.a;
        auto bcu = orig.byCodeUnit();
        static assert(!is(typeof(bcu) == WEnum));
        static if (isAutodecodableString!WEnum)
            static assert(is(typeof(bcu) == struct));
        else
            static assert(is(typeof(bcu) == wstring));
        static assert(is(ElementType!(typeof(bcu)) == immutable wchar));
        assert(bcu.front == 't');
    }
    {
        enum DEnum : dstring { a = "test.d"d }

        auto orig = DEnum.a;
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == dstring));
        static assert(is(ElementType!(typeof(bcu)) == immutable dchar));
        assert(bcu.front == 't');
    }

    // Narrow strings are wrapped only when autodecoding is enabled;
    // dstrings are never wrapped.
    static if (autodecodeStrings)
    {
        static assert(!is(typeof(byCodeUnit("hello")) == string));
        static assert(!is(typeof(byCodeUnit("hello"w)) == wstring));
    }
    else
    {
        static assert(is(typeof(byCodeUnit("hello")) == string));
        static assert(is(typeof(byCodeUnit("hello"w)) == wstring));
    }
    static assert(is(typeof(byCodeUnit("hello"d)) == dstring));

    // Static arrays are rejected by the template constraint.
    static assert(!__traits(compiles, byCodeUnit((char[5]).init)));
    static assert(!__traits(compiles, byCodeUnit((wchar[5]).init)));
    static assert(!__traits(compiles, byCodeUnit((dchar[5]).init)));

    enum SEnum : char[5] { a = "hello" }
    enum WSEnum : wchar[5] { a = "hello"w }
    enum DSEnum : dchar[5] { a = "hello"d }

    // ... and so are enums whose base type is a static array.
    static assert(!__traits(compiles, byCodeUnit(SEnum.a)));
    static assert(!__traits(compiles, byCodeUnit(WSEnum.a)));
    static assert(!__traits(compiles, byCodeUnit(DSEnum.a)));
}
4023 
/****************************
 * Iterate an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
 * of characters by char, wchar, or dchar.
 * These aliases simply forward to $(LREF byUTF) with the
 * corresponding C argument.
 *
 * `byUTF`'s `useReplacementDchar` parameter is left at its default
 * (`Yes.useReplacementDchar`), so invalid UTF in the input is replaced by
 * $(LREF replacementDchar) instead of raising a `UTFException`.
 *
 * Params:
 *      r = input range of characters, or array of characters
 */
alias byChar = byUTF!char;

/// Ditto
alias byWchar = byUTF!wchar;

/// Ditto
alias byDchar = byUTF!dchar;
4040 
// byChar: pass-through on char input, transcoding from wchar/dchar input,
// and substitution of invalid code points.
@safe pure nothrow @nogc unittest
{
    {
        // char input: applying byChar twice still yields the same code units.
        char[5] s;
        int i;
        foreach (c; "hello".byChar.byChar())
        {
            s[i++] = c;
        }
        assert(s == "hello");
    }
    {
        // UTF-8 lengths of the expected output: 5 ASCII + 2 + 3 + 4 bytes,
        // then 3 + 3 for the two U+FFFD replacements.
        char[5+2+3+4+3+3] s;
        int i;
        dchar[10] a;
        a[0 .. 8] = "hello\u07FF\uD7FF\U0010FFFF"d;
        a[8] = 0xD800;   // invalid
        a[9] = cast(dchar) 0x110000; // invalid
        foreach (c; a[].byChar())
        {
            s[i++] = c;
        }
        assert(s == "hello\u07FF\uD7FF\U0010FFFF\uFFFD\uFFFD");
    }
    {
        // wchar -> char
        auto r = "hello"w.byChar();
        r.popFront();
        r.popFront();
        assert(r.front == 'l');
    }
    {
        // dchar -> char
        auto r = "hello"d.byChar();
        r.popFront();
        r.popFront();
        assert(r.front == 'l');
    }
    {
        // The result is a forward range; save is unaffected by later pops.
        auto r = "hello"d.byChar();
        static assert(isForwardRange!(typeof(r)));
        auto s = r.save;
        r.popFront();
        assert(s.front == 'h');
    }
}
4087 
// byWchar: transcoding from char/dchar input and substitution of invalid
// code points.
@safe pure nothrow @nogc unittest
{
    {
        // 5 ASCII + 1 + 1 + 2 (surrogate pair) + 1 + 1 (two U+FFFD) = 11.
        wchar[11] s;
        int i;
        dchar[10] a;
        a[0 .. 8] = "hello\u07FF\uD7FF\U0010FFFF"d;
        a[8] = 0xD800;   // invalid
        a[9] = cast(dchar) 0x110000; // invalid
        foreach (c; a[].byWchar())
        {
            s[i++] = c;
        }
        assert(s == "hello\u07FF\uD7FF\U0010FFFF\uFFFD\uFFFD"w);
    }

    {
        // char -> wchar
        auto r = "hello".byWchar();
        r.popFront();
        r.popFront();
        assert(r.front == 'l');
    }
    {
        // dchar -> wchar
        auto r = "hello"d.byWchar();
        r.popFront();
        r.popFront();
        assert(r.front == 'l');
    }
    {
        // The result is a forward range; save is unaffected by later pops.
        auto r = "hello"d.byWchar();
        static assert(isForwardRange!(typeof(r)));
        auto s = r.save;
        r.popFront();
        assert(s.front == 'h');
    }
}
4129 
// byDchar: decoding of UTF-8 and UTF-16 input, substitution for invalid
// input, and pass-through of UTF-32 input.
@safe pure nothrow @nogc unittest
{
    {
        dchar[9] s;
        int i;
        string a = "hello\u07FF\uD7FF\U00010000\U0010FFFF"; // 1,2,3,4 byte sequences
        foreach (c; a.byDchar())
        {
            s[i++] = c;
        }
        assert(s == "hello\u07FF\uD7FF\U00010000\U0010FFFF"d);
    }
    {
        // Every invalid UTF-8 sample decodes to the replacement character,
        // and calling front repeatedly gives the same answer.
        foreach (s; invalidUTFstrings!char())
        {
            auto r = s.byDchar();
            assert(!r.empty);
            assert(r.front == r.front);
            dchar c = r.front;
            assert(c == replacementDchar);
        }
    }
    {
        auto r = "hello".byDchar();
        r.popFront();
        r.popFront();
        assert(r.front == 'l');
    }

    {
        dchar[8] s;
        int i;
        wstring a = "hello\u07FF\uD7FF\U0010FFFF"w;
        foreach (c; a.byDchar())
        {
            s[i++] = c;
        }
        assert(s == "hello\u07FF\uD7FF\U0010FFFF"d);
    }
    {
        // Every invalid UTF-16 sample decodes to the replacement character.
        foreach (s; invalidUTFstrings!wchar())
        {
            auto r = s.byDchar();
            assert(!r.empty);
            assert(r.front == r.front);
            dchar c = r.front;
            assert(c == replacementDchar);
        }
    }
    {
        wchar[2] ws;
        ws[0] = 0xD800;
        ws[1] = 0xDD00;             // correct surrogate pair
        auto r = ws[].byDchar();
        assert(!r.empty);
        assert(r.front == r.front);
        dchar c = r.front;
        assert(c == '\U00010100');
    }
    {
        auto r = "hello"w.byDchar();
        r.popFront();
        r.popFront();
        assert(r.front == 'l');
    }

    {
        // dchar input: applying byDchar twice still yields the same elements.
        dchar[5] s;
        int i;
        dstring a = "hello"d;
        foreach (c; a.byDchar.byDchar())
        {
            s[i++] = c;
        }
        assert(s == "hello"d);
    }
    {
        // The result is a forward range; save is unaffected by later pops.
        auto r = "hello".byDchar();
        static assert(isForwardRange!(typeof(r)));
        auto s = r.save;
        r.popFront();
        assert(s.front == 'h');
    }
    {
        auto r = "hello"w.byDchar();
        static assert(isForwardRange!(typeof(r)));
        auto s = r.save;
        r.popFront();
        assert(s.front == 'h');
    }
}
4223 
// Test pure, @safe, nothrow, @nogc correctness of byChar/byWchar/byDchar,
// which need to support ranges with and without those attributes.
// This unittest covers the case where the source has all of them: the loops
// must compile inside a `pure @safe nothrow @nogc` context.

pure @safe nothrow @nogc unittest
{
    dchar[5] s = "hello"d;
    foreach (c; s[].byChar())  { }
    foreach (c; s[].byWchar()) { }
    foreach (c; s[].byDchar()) { }
}
4234 
// `impureVariable` is declared only under StdUnittest, so the unittest that
// touches it is placed under the same version; otherwise the unittest would
// refer to an undeclared symbol whenever StdUnittest is not set.
version (StdUnittest)
{
    // Module-level state mutated by the test range below, which makes its
    // `popFront` impure.
    private int impureVariable;

    // Counterpart of the attribute test: byChar/byWchar/byDchar must also
    // compile for a source range that is impure, throwing and @system.
    @system unittest
    {
        static struct ImpureThrowingSystemRange(Char)
        {
            // Always empty, so the loops below never actually call popFront.
            @property bool empty() const { return true; }
            @property Char front() const { return Char.init; }
            void popFront()
            {
                impureVariable++;
                throw new Exception("only for testing nothrow");
            }
        }

        foreach (Char; AliasSeq!(char, wchar, dchar))
        {
            ImpureThrowingSystemRange!Char range;
            foreach (c; range.byChar())  { }
            foreach (c; range.byWchar()) { }
            foreach (c; range.byDchar()) { }
        }
    }
}
4259 
4260 /****************************
4261  * Iterate an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
4262  * of characters by char type `C` by encoding the elements of the range.
4263  *
4264  * UTF sequences that cannot be converted to the specified encoding are either
4265  * replaced by U+FFFD per "5.22 Best Practice for U+FFFD Substitution"
4266  * of the Unicode Standard 6.2 or result in a thrown UTFException.
4267  *  Hence byUTF is not symmetric.
4268  * This algorithm is lazy, and does not allocate memory.
4269  * `@nogc`, `pure`-ity, `nothrow`, and `@safe`-ty are inferred from the
4270  * `r` parameter.
4271  *
4272  * Params:
4273  *      C = `char`, `wchar`, or `dchar`
4274  *      useReplacementDchar = UseReplacementDchar.yes means replace invalid UTF with `replacementDchar`,
4275  *                            UseReplacementDchar.no means throw `UTFException` for invalid UTF
4276  *
4277  * Throws:
4278  *      `UTFException` if invalid UTF sequence and `useReplacementDchar` is set to `UseReplacementDchar.no`
4279  *
4280  * GC:
4281  *      Does not use GC if `useReplacementDchar` is set to `UseReplacementDchar.yes`
4282  *
4283  * Returns:
4284  *      A bidirectional range if `R` is a bidirectional range and not auto-decodable,
4285  *      as defined by $(REF isAutodecodableString, std, traits).
4286  *
4287  *      A forward range if `R` is a forward range and not auto-decodable.
4288  *
4289  *      Or, if `R` is a range and it is auto-decodable and
4290  *      `is(ElementEncodingType!typeof(r) == C)`, then the range is passed
4291  *      to $(LREF byCodeUnit).
4292  *
4293  *      Otherwise, an input range of characters.
4294  */
4295 template byUTF(C, UseReplacementDchar useReplacementDchar = Yes.useReplacementDchar)
4296 if (isSomeChar!C)
4297 {
4298     static if (is(immutable C == immutable UC, UC) && !is(C == UC))
4299         alias byUTF = byUTF!UC;
4300     else:
4301 
4302     auto ref byUTF(R)(R r)
4303         if (isAutodecodableString!R && isInputRange!R && isSomeChar!(ElementEncodingType!R))
4304     {
4305         return byUTF(r.byCodeUnit());
4306     }
4307 
4308     auto ref byUTF(R)(R r)
4309         if (!isAutodecodableString!R && isInputRange!R && isSomeChar!(ElementEncodingType!R))
4310     {
4311         static if (is(immutable ElementEncodingType!R == immutable RC, RC) && is(RC == C))
4312         {
4313             return r.byCodeUnit();
4314         }
4315         else static if (is(C == dchar))
4316         {
4317             static struct Result
4318             {
4319                 enum Empty = uint.max;  // range is empty or just constructed
4320 
4321                 this(return scope R r)
4322                 {
4323                     this.r = r;
4324                 }
4325 
4326                 this(return scope R r, uint buff)
4327                 {
4328                     this.r = r;
4329                     this.buff = buff;
4330                 }
4331 
4332                 static if (isBidirectionalRange!R)
4333                 {
4334                     this(return scope R r, uint frontBuff, uint backBuff)
4335                     {
4336                         this.r = r;
4337                         this.buff = frontBuff;
4338                         this.backBuff = backBuff;
4339                     }
4340                 }
4341 
4342                 @property bool empty()
4343                 {
4344                     static if (isBidirectionalRange!R)
4345                         return buff == Empty && backBuff == Empty && r.empty;
4346                     else
4347                         return buff == Empty && r.empty;
4348                 }
4349 
4350                 @property dchar front() scope // 'scope' required by call to decodeFront() below
4351                 {
4352                     if (buff == Empty)
4353                     {
4354                         auto c = r.front;
4355 
4356                         static if (is(RC == wchar))
4357                             enum firstMulti = 0xD800; // First high surrogate.
4358                         else
4359                             enum firstMulti = 0x80; // First non-ASCII.
4360                         if (c < firstMulti)
4361                         {
4362                             r.popFront;
4363                             buff = cast(dchar) c;
4364                         }
4365                         else
4366                         {
4367                             buff = () @trusted { return decodeFront!(useReplacementDchar)(r); }();
4368                         }
4369                     }
4370                     return cast(dchar) buff;
4371                 }
4372 
4373                 void popFront()
4374                 {
4375                     if (buff == Empty)
4376                         front();
4377                     buff = Empty;
4378                 }
4379 
static if (isForwardRange!R)
{
    // Returns a copy built on `r.save`, carrying over whatever is
    // currently cached in the lookahead slot(s).
    @property auto save()
    {
        static if (!isBidirectionalRange!R)
            return Result(r.save, buff);
        else
            return Result(r.save, buff, backBuff);
    }
}
4394 
4395                 static if (isBidirectionalRange!R)
4396                 {
/+
    Returns the code point at the back, decoding it from the end of `r`
    on first access and caching it in `backBuff` until popBack() clears
    the slot.

    Decoding consumes the corresponding code units from `r`. The last
    remaining code point may therefore already have been pulled out of
    `r` by front() and be sitting in `buff`. In that situation empty() is
    still false, yet `r` has nothing left to read, so the value is taken
    over from `buff` instead of touching the exhausted source.

    Errors from decodeBack propagate to the caller.
 +/
@property dchar back() scope // 'scope' required by call to decodeBack() below
{
    if (backBuff != Empty)
        return cast(dchar) backBuff;

    // Hand-off: the only element left lives in the front slot. Move it
    // to the back slot so that popBack() keeps working unchanged and the
    // element is yielded exactly once. `r.empty` is only consulted when
    // the front slot is occupied.
    if (buff != Empty && r.empty)
    {
        backBuff = buff;
        buff = Empty;
        return cast(dchar) backBuff;
    }

    auto c = r.back;

    // Code units below this threshold stand for themselves and need no
    // decoding.
    static if (is(RC == wchar))
        enum firstMulti = 0xD800; // First high surrogate.
    else
        enum firstMulti = 0x80; // First non-ASCII.

    if (c < firstMulti)
    {
        // Fast path: single code unit, consume it directly.
        r.popBack;
        backBuff = cast(dchar) c;
    }
    else
    {
        // decodeBack shrinks `r` past the whole sequence.
        backBuff = () @trusted { return decodeBack!useReplacementDchar(r); }();
    }
    return cast(dchar) backBuff;
}
4419 
// Drops the back code point.
void popBack()
{
    // Already decoded: the code units were consumed from `r` when the
    // value was cached, so discarding the slot is all that is left.
    if (backBuff != Empty)
    {
        backBuff = Empty;
        return;
    }

    // Nothing cached yet: let back() decode (and thereby consume) the
    // last code point, then throw the cached value away.
    back();
    backBuff = Empty;
}
4426                 }
4427 
4428             private:
4429 
// Source range that front()/back() read from and decode.
R r;

// Front lookahead slot: one decoded code point, or `Empty` while
// nothing is cached.
uint buff = Empty;

static if (isBidirectionalRange!R)
{
    // Back lookahead slot, only present for bidirectional sources;
    // same convention as `buff`.
    uint backBuff = Empty;
}
4434             }
4435 
4436             return Result(r);
4437         }
4438         else
4439         {
/+
    Range that lazily re-encodes the code points of `r` as code units of
    type `C`.

    One code point at a time is decoded from `r` and encoded into a small
    fixed buffer of `4 / C.sizeof` code units; front()/popFront() then
    step through that buffer. For bidirectional sources the same is done
    from the other end.

    The front and back ends use separate buffers (`buf` and `backBuf`).
    Decoding at one end therefore never overwrites units still pending at
    the other end, so front() and back() may be freely interleaved.

    Decoding consumes code units from `r`, so the last code point of the
    source can end up buffered at one end while the other end still needs
    it. When `r` is exhausted, front() and back() take over the units
    remaining in the opposite buffer instead of reading from the empty
    source.

    NOTE(review): `R`, `C`, `RC` and `useReplacementDchar` come from the
    enclosing template; `RC` is presumably the code unit type of `R`.
 +/
static struct Result
{
    // Wraps `r` with nothing buffered.
    this(return scope R r)
    {
        this.r = r;
    }

    // Restores a result from a source range and a saved front buffer
    // state (cursor, number of valid units, buffer contents).
    this(return scope R r, ushort pos, ushort fill, C[4 / C.sizeof] buf)
    {
        this.r = r;
        this.pos = pos;
        this.fill = fill;
        this.buf = buf;
    }

    static if (isBidirectionalRange!R)
    {
        // Kept for backward compatibility with the former shared-buffer
        // layout: both ends are restored from the same buffer contents.
        this(return scope R r, ushort frontPos, ushort frontFill,
             ushort backPos, ushort backFill, C[4 / C.sizeof] buf)
        {
            this.r = r;
            this.pos = frontPos;
            this.fill = frontFill;
            this.backPos = backPos;
            this.backFill = backFill;
            this.buf = buf;
            this.backBuf = buf;
        }

        // Restores both ends, each with its own buffer; used by save().
        this(return scope R r, ushort frontPos, ushort frontFill,
             ushort backPos, ushort backFill,
             C[4 / C.sizeof] buf, C[4 / C.sizeof] backBuf)
        {
            this.r = r;
            this.pos = frontPos;
            this.fill = frontFill;
            this.backPos = backPos;
            this.backFill = backFill;
            this.buf = buf;
            this.backBuf = backBuf;
        }
    }

    // True once neither buffer has pending units and the source is
    // exhausted.
    @property bool empty()
    {
        static if (isBidirectionalRange!R)
            return pos == fill && backPos == backFill && r.empty;
        else
            return pos == fill && r.empty;
    }

    // Returns the code unit at the front, refilling the front buffer
    // from `r` (or from the back buffer, see below) when it has run dry.
    @property auto front() scope // 'scope' required by call to decodeFront() below
    {
        if (pos == fill)
        {
            static if (isBidirectionalRange!R)
            {
                // Hand-off: the source is exhausted but back() still
                // holds unread units. They are stored in stream order at
                // backBuf[0 .. backFill - backPos]; move them over so
                // the regular front()/popFront() logic can drain them.
                // `r.empty` is only consulted when such units exist.
                if (backPos != backFill && r.empty)
                {
                    immutable ushort pending = cast(ushort) (backFill - backPos);
                    foreach (i; 0 .. pending)
                        buf[i] = backBuf[i];
                    pos = 0;
                    fill = pending;
                    backPos = 0;
                    backFill = 0;
                    return buf[0];
                }
            }

            // Mark the buffer as drained before decoding, so that a
            // throwing decode does not leave stale units looking valid.
            pos = 0;
            fill = 0;
            auto c = r.front;

            if (c < firstMulti)
            {
                // Fast path: the unit stands for itself in the target
                // encoding as well.
                fill = 1;
                r.popFront;
                buf[pos] = cast(C) c;
            }
            else
            {
                static if (is(RC == dchar))
                {
                    // The source unit already is a whole code point.
                    r.popFront;
                    dchar dc = c;
                }
                else
                    dchar dc = () @trusted { return decodeFront!(useReplacementDchar)(r); }();
                fill = cast(ushort) encode!(useReplacementDchar)(buf, dc);
            }
        }
        return buf[pos];
    }

    // Drops the front code unit, filling the buffer first if needed.
    void popFront()
    {
        if (pos == fill)
            front;
        ++pos;
    }

    static if (isForwardRange!R)
    {
        // Returns a copy built on `r.save` with the buffer state carried
        // over.
        @property auto save()
        {
            static if (isBidirectionalRange!R)
            {
                return Result(r.save, pos, fill, backPos, backFill, buf, backBuf);
            }
            else
            {
                return Result(r.save, pos, fill, buf);
            }
        }
    }

    static if (isBidirectionalRange!R)
    {
        // Returns the code unit at the back, refilling the back buffer
        // from the end of `r` (or from the front buffer, see below) when
        // it has run dry. `backPos` counts the units already consumed
        // from the end of backBuf[0 .. backFill].
        @property auto back() scope // 'scope' required by call to decodeBack() below
        {
            if (backPos == backFill)
            {
                if (pos != fill && r.empty)
                {
                    // Hand-off: the source is exhausted but front()
                    // still holds unread units in buf[pos .. fill];
                    // move them over so the regular back()/popBack()
                    // logic can drain them from the end.
                    immutable ushort pending = cast(ushort) (fill - pos);
                    foreach (i; 0 .. pending)
                        backBuf[i] = buf[pos + i];
                    backPos = 0;
                    backFill = pending;
                    pos = 0;
                    fill = 0;
                }
                else
                {
                    // Mark the buffer as drained before decoding, so
                    // that a throwing decode does not leave stale units
                    // looking valid.
                    backPos = 0;
                    backFill = 0;
                    auto c = r.back;

                    if (c < firstMulti)
                    {
                        // Fast path: the unit stands for itself in the
                        // target encoding as well.
                        backFill = 1;
                        r.popBack;
                        backBuf[0] = cast(C) c;
                    }
                    else
                    {
                        static if (is(RC == dchar))
                        {
                            // The source unit already is a whole code point.
                            r.popBack;
                            dchar dc = c;
                        }
                        else
                            dchar dc = () @trusted { return decodeBack!(useReplacementDchar)(r); }();
                        backFill = cast(ushort) encode!(useReplacementDchar)(backBuf, dc);
                    }
                }
            }
            return backBuf[cast(ushort) (backFill - backPos - 1)];
        }

        // Drops the back code unit, filling the buffer first if needed.
        void popBack()
        {
            if (backPos == backFill)
                back;
            ++backPos;
        }
    }

private:

    // Source code units below this value are copied through unchanged;
    // everything else goes through decode + encode.
    static if (C.sizeof >= 2 && RC.sizeof >= 2)
        enum firstMulti = 0xD800; // First high surrogate.
    else
        enum firstMulti = 0x80; // First non-ASCII.

    R r;
    // Front buffer: buf[pos .. fill] are the units not yet yielded.
    ushort pos, fill;
    static if (isBidirectionalRange!R)
    {
        // Back buffer: backBuf[0 .. backFill - backPos] are the units
        // not yet yielded, consumed from the high index downwards.
        ushort backPos, backFill;
        C[4 / C.sizeof] backBuf = void;
    }
    C[4 / C.sizeof] buf = void;
}
4580 
4581             return Result(r);
4582         }
4583     }
4584 }
4585 
///
@safe pure nothrow unittest
{
    import std.algorithm.comparison : equal;

    // "hellö": the ö takes two code units in UTF-8 ...
    string hello = "hell\u00F6";
    assert(equal(hello.byUTF!char(), ['h', 'e', 'l', 'l', 0xC3, 0xB6]));

    // ... but fits into a single UTF-16 code unit
    assert(equal(hello.byUTF!wchar(), ['h', 'e', 'l', 'l', 'ö']));

    // 𐐷 needs four code units in UTF-8, two in UTF-16 and one in UTF-32
    string deseret = "𐐷";
    assert(equal(deseret.byUTF!char(), [0xF0, 0x90, 0x90, 0xB7]));
    assert(equal(deseret.byUTF!wchar(), [0xD801, 0xDC37]));
    assert(equal(deseret.byUTF!dchar(), [0x00010437]));
}
4602 
///
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.exception : assertThrown;

    // \xF0 starts a multi-byte sequence that is never completed
    string malformed = "hello\xF0betty";

    // With UseReplacementDchar.yes the bad sequence becomes U+FFFD ...
    auto lenient = malformed.byChar.byUTF!(dchar, UseReplacementDchar.yes);
    assert(equal(lenient, "hello\uFFFDetty"));

    // ... with UseReplacementDchar.no it is reported as an error instead
    assertThrown!UTFException(
        equal(malformed.byChar.byUTF!(dchar, UseReplacementDchar.no), "hello betty"));
}
4612 
// Exercises back()/popBack()/save() of byUTF for every target encoding.
@safe unittest
{
    import std.algorithm.comparison : equal;

    // UTF-16 -> UTF-8: U+0219 is encoded as C8 99, seen back to front
    {
        wchar[] s = ['a', 'b', 0x219];
        auto r = s.byUTF!char;
        static assert(isBidirectionalRange!(typeof(r)));
        assert(r.back == 0x99);
        r.popBack;
        assert(r.back == 0xc8);
        r.popBack;
        assert(r.back == 'b');
    }

    // UTF-16 -> UTF-16
    {
        wchar[] s = ['a', 'b', 0x219];
        auto r = s.byUTF!wchar;
        static assert(isBidirectionalRange!(typeof(r)));
        assert(r.back == 0x219);
        r.popBack;
        assert(r.back == 'b');
    }

    // UTF-16 -> UTF-32
    {
        wchar[] s = ['a', 'b', 0x219];
        auto r = s.byUTF!dchar;
        static assert(isBidirectionalRange!(typeof(r)));
        assert(r.back == 0x219);
        r.popBack;
        assert(r.back == 'b');
    }

    // UTF-32 -> UTF-16: surrogate pairs come out low surrogate first
    {
        dchar[] s = ['𐐷', '😁'];
        auto r = s.byUTF!wchar;
        assert(r.back == 0xde01);
        r.popBack;
        assert(r.back == 0xd83d);
        r.popBack;
        assert(r.back == 0xdc37);
        r.popBack;
        assert(r.back == 0xd801);
    }

    // UTF-32 -> UTF-8, drained completely from the back
    {
        dchar[] s = ['𐐷', '😁'];
        auto r = s.byUTF!char;
        char[] res;
        while (!r.empty)
        {
            res ~= r.back;
            r.popBack;
        }
        assert(res.equal([0x81, 0x98, 0x9f, 0xf0, 0xb7, 0x90, 0x90, 0xf0]));
    }

    // ASCII only, drained completely from the back
    {
        dchar[] res;
        auto r = ['a', 'b', 'c', 'd', 'e'].byUTF!dchar;
        while (!r.empty)
        {
            res ~= r.back;
            r.popBack;
        }
        assert(res.equal(['e', 'd', 'c', 'b', 'a']));
    }

    // save() must carry over partially consumed back state
    {
        wchar[] s = ['Ă','ț'];

        auto rc = s.byUTF!char;
        rc.popBack;
        auto rcCopy = rc.save;
        assert(rc.back == rcCopy.back);
        assert(rcCopy.back == 0xc8);

        auto rd = s.byUTF!dchar;
        rd.popBack;
        auto rdCopy = rd.save;
        assert(rd.back == rdCopy.back);
        assert(rdCopy.back == 'Ă');
    }
}
4700 
///
@safe pure nothrow unittest
{
    import std.range.primitives;

    wchar[] s = ['ă', 'î'];

    // As UTF-8, read from the back: î is C3 AE, ă is C4 83
    auto rc = s.byUTF!char;
    static assert(isBidirectionalRange!(typeof(rc)));
    static immutable ubyte[] utf8BackToFront = [0xae, 0xc3, 0x83, 0xc4];
    foreach (i, unit; utf8BackToFront)
    {
        if (i != 0)
            rc.popBack;
        assert(rc.back == unit);
    }

    // As UTF-16 each character is a single code unit
    auto rw = s.byUTF!wchar;
    static assert(isBidirectionalRange!(typeof(rw)));
    assert(rw.back == 'î');
    rw.popBack;
    assert(rw.back == 'ă');

    // Same for UTF-32
    auto rd = s.byUTF!dchar;
    static assert(isBidirectionalRange!(typeof(rd)));
    assert(rd.back == 'î');
    rd.popBack;
    assert(rd.back == 'ă');
}
The OpenD Programming Language