std.encoding source code

1 // Written in the D programming language.
2 
3 /**
4 Classes and functions for handling and transcoding between various encodings.
5 
6 For cases where the encoding is known at compile-time, functions are provided
7 for arbitrary encoding and decoding of characters, arbitrary transcoding
8 between strings of different type, as well as validation and sanitization.
9 
10 Encodings currently supported are UTF-8, UTF-16, UTF-32, ASCII, ISO-8859-1
11 (also known as LATIN-1), ISO-8859-2 (LATIN-2), WINDOWS-1250, WINDOWS-1251
12 and WINDOWS-1252.
13 
14 $(SCRIPT inhibitQuickIndex = 1;)
15 $(DIVC quickindex,
16 $(BOOKTABLE,
17 $(TR $(TH Category) $(TH Functions))
18 $(TR $(TD Decode) $(TD
19     $(LREF codePoints)
20     $(LREF decode)
21     $(LREF decodeReverse)
22     $(LREF safeDecode)
23 ))
24 $(TR $(TD Conversion) $(TD
25     $(LREF codeUnits)
26     $(LREF sanitize)
27     $(LREF transcode)
28 ))
29 $(TR $(TD Classification) $(TD
30     $(LREF canEncode)
31     $(LREF isValid)
32     $(LREF isValidCodePoint)
33     $(LREF isValidCodeUnit)
34 ))
35 $(TR $(TD BOM) $(TD
36     $(LREF BOM)
37     $(LREF BOMSeq)
38     $(LREF getBOM)
39     $(LREF utfBOM)
40 ))
41 $(TR $(TD Length &amp; Index) $(TD
42     $(LREF firstSequence)
43     $(LREF encodedLength)
44     $(LREF index)
45     $(LREF lastSequence)
46     $(LREF validLength)
47 ))
48 $(TR $(TD Encoding schemes) $(TD
49     $(LREF encodingName)
50     $(LREF EncodingScheme)
51     $(LREF EncodingSchemeASCII)
52     $(LREF EncodingSchemeLatin1)
53     $(LREF EncodingSchemeLatin2)
54     $(LREF EncodingSchemeUtf16Native)
55     $(LREF EncodingSchemeUtf32Native)
56     $(LREF EncodingSchemeUtf8)
57     $(LREF EncodingSchemeWindows1250)
58     $(LREF EncodingSchemeWindows1251)
59     $(LREF EncodingSchemeWindows1252)
60 ))
61 $(TR $(TD Representation) $(TD
62     $(LREF AsciiChar)
63     $(LREF AsciiString)
64     $(LREF Latin1Char)
65     $(LREF Latin1String)
66     $(LREF Latin2Char)
67     $(LREF Latin2String)
68     $(LREF Windows1250Char)
69     $(LREF Windows1250String)
70     $(LREF Windows1251Char)
71     $(LREF Windows1251String)
72     $(LREF Windows1252Char)
73     $(LREF Windows1252String)
74 ))
75 $(TR $(TD Exceptions) $(TD
76     $(LREF INVALID_SEQUENCE)
77     $(LREF EncodingException)
78 ))
79 ))
80 
81 For cases where the encoding is not known at compile-time, but is
82 known at run-time, the abstract class $(LREF EncodingScheme)
and its subclasses are provided.  To construct a run-time encoder/decoder,
84 one does e.g.
85 
86 ----------------------------------------------------
87 auto e = EncodingScheme.create("utf-8");
88 ----------------------------------------------------
89 
90 This library supplies $(LREF EncodingScheme) subclasses for ASCII,
91 ISO-8859-1 (also known as LATIN-1), ISO-8859-2 (LATIN-2), WINDOWS-1250,
92 WINDOWS-1251, WINDOWS-1252, UTF-8, and (on little-endian architectures)
93 UTF-16LE and UTF-32LE; or (on big-endian architectures) UTF-16BE and UTF-32BE.
94 
95 This library provides a mechanism whereby other modules may add $(LREF
96 EncodingScheme) subclasses for any other encoding.
97 
98 Copyright: Copyright Janice Caron 2008 - 2009.
99 License:   $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
100 Authors:   Janice Caron
101 Source:    $(PHOBOSSRC std/encoding.d)
102 */
103 /*
104          Copyright Janice Caron 2008 - 2009.
105 Distributed under the Boost Software License, Version 1.0.
106    (See accompanying file LICENSE_1_0.txt or copy at
107          http://www.boost.org/LICENSE_1_0.txt)
108 */
109 module std.encoding;
110 
111 import std.range.primitives;
112 import std.traits;
113 import std.typecons;
114 
115 @system unittest
116 {
117     static ubyte[][] validStrings =
118     [
119         // Plain ASCII
120         cast(ubyte[])"hello",
121 
122         // First possible sequence of a certain length
123         [ 0x00 ],                       // U+00000000   one byte
124         [ 0xC2, 0x80 ],                 // U+00000080   two bytes
125         [ 0xE0, 0xA0, 0x80 ],           // U+00000800   three bytes
126         [ 0xF0, 0x90, 0x80, 0x80 ],     // U+00010000   three bytes
127 
128         // Last possible sequence of a certain length
129         [ 0x7F ],                       // U+0000007F   one byte
130         [ 0xDF, 0xBF ],                 // U+000007FF   two bytes
131         [ 0xEF, 0xBF, 0xBF ],           // U+0000FFFF   three bytes
132 
133         // Other boundary conditions
134         [ 0xED, 0x9F, 0xBF ],
135         // U+0000D7FF   Last character before surrogates
136         [ 0xEE, 0x80, 0x80 ],
137         // U+0000E000   First character after surrogates
138         [ 0xEF, 0xBF, 0xBD ],
139         // U+0000FFFD   Unicode replacement character
140         [ 0xF4, 0x8F, 0xBF, 0xBF ],
141         // U+0010FFFF   Very last character
142 
143         // Non-character code points
144         /*  NOTE: These are legal in UTF, and may be converted from
145             one UTF to another, however they do not represent Unicode
146             characters. These code points have been reserved by
147             Unicode as non-character code points. They are permissible
148             for data exchange within an application, but they are are
149             not permitted to be used as characters. Since this module
150             deals with UTF, and not with Unicode per se, we choose to
151             accept them here. */
152         [ 0xDF, 0xBE ],                 // U+0000FFFE
153         [ 0xDF, 0xBF ],                 // U+0000FFFF
154     ];
155 
    // Byte sequences that must be rejected as UTF-8.
    // The order of the entries matters: the test below pairs
    // invalidStrings[i] with sanitizedStrings[i], so any insertion, removal
    // or reordering here needs the matching change in sanitizedStrings.
    static ubyte[][] invalidStrings =
    [
        // First possible sequence of a certain length, but greater
        // than U+10FFFF
        [ 0xF8, 0x88, 0x80, 0x80, 0x80 ],           // U+00200000   five bytes
        [ 0xFC, 0x84, 0x80, 0x80, 0x80, 0x80 ],     // U+04000000   six bytes

        // Last possible sequence of a certain length, but greater than U+10FFFF
        [ 0xF7, 0xBF, 0xBF, 0xBF ],                 // U+001FFFFF   four bytes
        [ 0xFB, 0xBF, 0xBF, 0xBF, 0xBF ],           // U+03FFFFFF   five bytes
        [ 0xFD, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF ],     // U+7FFFFFFF   six bytes

        // Other boundary conditions
        [ 0xF4, 0x90, 0x80, 0x80 ],                 // U+00110000
                                                    // First code
                                                    // point after
                                                    // last character

        // Unexpected continuation bytes
        [ 0x80 ],
        [ 0xBF ],
        [ 0x20, 0x80, 0x20 ],
        [ 0x20, 0xBF, 0x20 ],
        [ 0x80, 0x9F, 0xA0 ],

        // Lonely start bytes
        [ 0xC0 ],
        [ 0xCF ],
        [ 0x20, 0xC0, 0x20 ],
        [ 0x20, 0xCF, 0x20 ],
        [ 0xD0 ],
        [ 0xDF ],
        [ 0x20, 0xD0, 0x20 ],
        [ 0x20, 0xDF, 0x20 ],
        [ 0xE0 ],
        [ 0xEF ],
        [ 0x20, 0xE0, 0x20 ],
        [ 0x20, 0xEF, 0x20 ],
        [ 0xF0 ],
        [ 0xF1 ],
        [ 0xF2 ],
        [ 0xF3 ],
        [ 0xF4 ],
        [ 0xF5 ],   // If this were legal it would start a character > U+10FFFF
        [ 0xF6 ],   // If this were legal it would start a character > U+10FFFF
        [ 0xF7 ],   // If this were legal it would start a character > U+10FFFF

        [ 0xEF, 0xBF ],             // Three byte sequence with third byte missing
        [ 0xF7, 0xBF, 0xBF ],       // Four byte sequence with fourth byte missing
        [ 0xEF, 0xBF, 0xF7, 0xBF, 0xBF ],   // Concatenation of the above

        // Impossible bytes
        [ 0xF8 ],
        [ 0xF9 ],
        [ 0xFA ],
        [ 0xFB ],
        [ 0xFC ],
        [ 0xFD ],
        [ 0xFE ],
        [ 0xFF ],
        [ 0x20, 0xF8, 0x20 ],
        [ 0x20, 0xF9, 0x20 ],
        [ 0x20, 0xFA, 0x20 ],
        [ 0x20, 0xFB, 0x20 ],
        [ 0x20, 0xFC, 0x20 ],
        [ 0x20, 0xFD, 0x20 ],
        [ 0x20, 0xFE, 0x20 ],
        [ 0x20, 0xFF, 0x20 ],

        // Overlong sequences, all representing U+002F
        /*  With a safe UTF-8 decoder, all of the following five overlong
            representations of the ASCII character slash ("/") should be
            rejected like a malformed UTF-8 sequence */
        [ 0xC0, 0xAF ],
        [ 0xE0, 0x80, 0xAF ],
        [ 0xF0, 0x80, 0x80, 0xAF ],
        [ 0xF8, 0x80, 0x80, 0x80, 0xAF ],
        [ 0xFC, 0x80, 0x80, 0x80, 0x80, 0xAF ],

        // Maximum overlong sequences
        /*  Below you see the highest Unicode value that is still resulting in
            an overlong sequence if represented with the given number of bytes.
            This is a boundary test for safe UTF-8 decoders. All five
            characters should be rejected like malformed UTF-8 sequences. */
        [ 0xC1, 0xBF ],                             // U+0000007F
        [ 0xE0, 0x9F, 0xBF ],                       // U+000007FF
        [ 0xF0, 0x8F, 0xBF, 0xBF ],                 // U+0000FFFF
        [ 0xF8, 0x87, 0xBF, 0xBF, 0xBF ],           // U+001FFFFF
        [ 0xFC, 0x83, 0xBF, 0xBF, 0xBF, 0xBF ],     // U+03FFFFFF

        // Overlong representation of the NUL character
        /*  The following five sequences should also be rejected like malformed
            UTF-8 sequences and should not be treated like the ASCII NUL
            character. */
        [ 0xC0, 0x80 ],
        [ 0xE0, 0x80, 0x80 ],
        [ 0xF0, 0x80, 0x80, 0x80 ],
        [ 0xF8, 0x80, 0x80, 0x80, 0x80 ],
        [ 0xFC, 0x80, 0x80, 0x80, 0x80, 0x80 ],

        // Illegal code positions
        /*  The following UTF-8 sequences should be rejected like malformed
            sequences, because they never represent valid ISO 10646 characters
            and a UTF-8 decoder that accepts them might introduce security
            problems comparable to overlong UTF-8 sequences. */
        [ 0xED, 0xA0, 0x80 ],       // U+D800
        [ 0xED, 0xAD, 0xBF ],       // U+DB7F
        [ 0xED, 0xAE, 0x80 ],       // U+DB80
        [ 0xED, 0xAF, 0xBF ],       // U+DBFF
        [ 0xED, 0xB0, 0x80 ],       // U+DC00
        [ 0xED, 0xBE, 0x80 ],       // U+DF80
        [ 0xED, 0xBF, 0xBF ],       // U+DFFF
    ];
269 
270     static string[] sanitizedStrings =
271     [
272         "\uFFFD","\uFFFD",
273         "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD"," \uFFFD ",
274         " \uFFFD ","\uFFFD\uFFFD\uFFFD","\uFFFD","\uFFFD"," \uFFFD "," \uFFFD ",
275         "\uFFFD","\uFFFD"," \uFFFD "," \uFFFD ","\uFFFD","\uFFFD"," \uFFFD ",
276         " \uFFFD ","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
277         "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD\uFFFD","\uFFFD","\uFFFD",
278         "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD"," \uFFFD ",
279         " \uFFFD "," \uFFFD "," \uFFFD "," \uFFFD "," \uFFFD "," \uFFFD ",
280         " \uFFFD ","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
281         "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
282         "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
283     ];
284 
285     // HELPER FUNCTIONS
286     // we can probably do this better...
287     static char toHexDigit(int n)
288     {
289         return "0123456789ABCDEF"[n & 0xF];
290     }
291 
292     static string makeReadable(string s)
293     {
294         string r = "\"";
295         foreach (char c;s)
296         {
297             if (c >= 0x20 && c < 0x80)
298             {
299                 r ~= c;
300             }
301             else
302             {
303                 r ~= "\\x";
304                 r ~= toHexDigit(c >> 4);
305                 r ~= toHexDigit(c);
306             }
307         }
308         r ~= "\"";
309         return r;
310     }
311 
312     void transcodeReverse(Src,Dst)(immutable(Src)[] s, out immutable(Dst)[] r)
313     {
314         static if (is(Src == Dst))
315         {
316             return s;
317         }
318         else static if (is(Src == AsciiChar))
319         {
320             transcodeReverse!(char,Dst)(cast(string) s,r);
321         }
322         else
323         {
324             foreach_reverse (d;codePoints(s))
325             {
326                 foreach_reverse (c;codeUnits!(Dst)(d))
327                 {
328                     r = c ~ r;
329                 }
330             }
331         }
332     }
333 
334     // Make sure everything that should be valid, is
335     foreach (a;validStrings)
336     {
337         string s = cast(string) a;
338         assert(isValid(s),"Failed to validate: "~makeReadable(s));
339     }
340 
341     // Make sure everything that shouldn't be valid, isn't
342     foreach (a;invalidStrings)
343     {
344         string s = cast(string) a;
345         assert(!isValid(s),"Incorrectly validated: "~makeReadable(s));
346     }
347 
348     // Make sure we can sanitize everything bad
349     assert(invalidStrings.length == sanitizedStrings.length);
350     for (int i=0; i<invalidStrings.length; ++i)
351     {
352         string s = cast(string) invalidStrings[i];
353         string t = sanitize(s);
354         assert(isValid(t));
355         assert(t == sanitizedStrings[i]);
356         ubyte[] u = cast(ubyte[]) t;
357         validStrings ~= u;
358     }
359 
360     // Make sure all transcodings work in both directions, using both forward
361     // and reverse iteration
362     foreach (a; validStrings)
363     {
364         string s = cast(string) a;
365         string s2;
366         wstring ws, ws2;
367         dstring ds, ds2;
368 
369         transcode(s,ws);
370         assert(isValid(ws));
371         transcode(ws,s2);
372         assert(s == s2);
373 
374         transcode(s,ds);
375         assert(isValid(ds));
376         transcode(ds,s2);
377         assert(s == s2);
378 
379         transcode(ws,s);
380         assert(isValid(s));
381         transcode(s,ws2);
382         assert(ws == ws2);
383 
384         transcode(ws,ds);
385         assert(isValid(ds));
386         transcode(ds,ws2);
387         assert(ws == ws2);
388 
389         transcode(ds,s);
390         assert(isValid(s));
391         transcode(s,ds2);
392         assert(ds == ds2);
393 
394         transcode(ds,ws);
395         assert(isValid(ws));
396         transcode(ws,ds2);
397         assert(ds == ds2);
398 
399         transcodeReverse(s,ws);
400         assert(isValid(ws));
401         transcodeReverse(ws,s2);
402         assert(s == s2);
403 
404         transcodeReverse(s,ds);
405         assert(isValid(ds));
406         transcodeReverse(ds,s2);
407         assert(s == s2);
408 
409         transcodeReverse(ws,s);
410         assert(isValid(s));
411         transcodeReverse(s,ws2);
412         assert(ws == ws2);
413 
414         transcodeReverse(ws,ds);
415         assert(isValid(ds));
416         transcodeReverse(ds,ws2);
417         assert(ws == ws2);
418 
419         transcodeReverse(ds,s);
420         assert(isValid(s));
421         transcodeReverse(s,ds2);
422         assert(ds == ds2);
423 
424         transcodeReverse(ds,ws);
425         assert(isValid(ws));
426         transcodeReverse(ws,ds2);
427         assert(ds == ds2);
428     }
429 
430     // Make sure the non-UTF encodings work too
431     {
432         auto s = "\u20AC100";
433         Windows1252String t;
434         transcode(s,t);
435         assert(t == cast(Windows1252Char[])[0x80, '1', '0', '0']);
436         string u;
437         transcode(s,u);
438         assert(s == u);
439         Latin1String v;
440         transcode(s,v);
441         assert(cast(string) v == "?100");
442         AsciiString w;
443         transcode(v,w);
444         assert(cast(string) w == "?100");
445         s = "\u017Dlu\u0165ou\u010Dk\u00FD k\u016F\u0148";
446         Latin2String x;
447         transcode(s,x);
448         assert(x == cast(Latin2Char[])[0xae, 'l', 'u', 0xbb, 'o', 'u', 0xe8, 'k', 0xfd, ' ', 'k', 0xf9, 0xf2]);
449         Windows1250String y;
450         transcode(s,y);
451         assert(y == cast(Windows1250Char[])[0x8e, 'l', 'u', 0x9d, 'o', 'u', 0xe8, 'k', 0xfd, ' ', 'k', 0xf9, 0xf2]);
452         s = "\u0402lu\u0403ou\u201D\u045C k\u0414\u044F";
453         Windows1251String s51;
454         transcode(s,s51);
455         assert(s51 == cast(Windows1251Char[])[0x80, 'l', 'u', 0x81, 'o', 'u', 0x94, 0x9d, ' ', 'k', 0xc4, 0xff]);
456     }
457 
458     // Make sure we can count properly
459     {
460         assert(encodedLength!(char)('A') == 1);
461         assert(encodedLength!(char)('\u00E3') == 2);
462         assert(encodedLength!(char)('\u2028') == 3);
463         assert(encodedLength!(char)('\U0010FFF0') == 4);
464         assert(encodedLength!(wchar)('A') == 1);
465         assert(encodedLength!(wchar)('\U0010FFF0') == 2);
466     }
467 
468     // Make sure we can write into mutable arrays
469     {
470         char[4] buffer;
471         auto n = encode(cast(dchar)'\u00E3',buffer);
472         assert(n == 2);
473         assert(buffer[0] == 0xC3);
474         assert(buffer[1] == 0xA3);
475     }
476 }
477 
478 //=============================================================================
479 
/**
 * Special value returned by `safeDecode` in place of a decoded code point.
 * It has all 32 bits set, which is outside the Unicode code point range.
 */
enum dchar INVALID_SEQUENCE = cast(dchar) uint.max;
482 
// Mixin that builds the user-facing encode / skip / decode functions on top
// of the primitives supplied by the scope it is mixed into. That scope has
// to provide the code unit type `E` and the templates `encodeViaWrite`,
// `skipViaRead`, `decodeViaRead`, `safeDecodeViaRead` and
// `decodeReverseViaRead`. Those primitives call `read`/`peek`/`canRead` or
// `write`, which the small templates below bind to a concrete source or
// destination.
template EncoderFunctions()
{
    // ---- Readers: consume code units from the slice `s` -------------------

    // Reads from the front of `s`.
    template ReadFromString()
    {
        @property bool canRead()
        {
            return s.length != 0;
        }

        E peek() @safe pure @nogc nothrow
        {
            return s[0];
        }

        E read() @safe pure @nogc nothrow
        {
            E front = s[0];
            s = s[1..$];
            return front;
        }
    }

    // Reads from the back of `s`.
    template ReverseReadFromString()
    {
        @property bool canRead()
        {
            return s.length != 0;
        }

        E peek() @safe pure @nogc nothrow
        {
            return s[$-1];
        }

        E read() @safe pure @nogc nothrow
        {
            E back = s[$-1];
            s = s[0..$-1];
            return back;
        }
    }

    // ---- Writers: deliver code units to a destination ---------------------

    // Appends to an internally owned array `s`.
    template WriteToString()
    {
        E[] s;

        void write(E c) @safe pure nothrow
        {
            s ~= c;
        }
    }

    // Stores into the first element of `array`, then advances the slice.
    template WriteToArray()
    {
        void write(E c) @safe pure @nogc nothrow
        {
            array[0] = c;
            array = array[1..$];
        }
    }

    // Forwards each code unit to the delegate `dg`.
    template WriteToDelegate()
    {
        void write(E c)
        {
            dg(c);
        }
    }

    // ---- Adapters from the encoder's primitives to short names ------------

    template EncodeViaWrite()
    {
        mixin encodeViaWrite;

        void encode(dchar c)
        {
            encodeViaWrite(c);
        }
    }

    template SkipViaRead()
    {
        mixin skipViaRead;

        void skip() @safe pure @nogc nothrow
        {
            skipViaRead();
        }
    }

    template DecodeViaRead()
    {
        mixin decodeViaRead;

        dchar decode() @safe pure @nogc nothrow
        {
            return decodeViaRead();
        }
    }

    template SafeDecodeViaRead()
    {
        mixin safeDecodeViaRead;

        dchar safeDecode() @safe pure @nogc nothrow
        {
            return safeDecodeViaRead();
        }
    }

    template DecodeReverseViaRead()
    {
        mixin decodeReverseViaRead;

        dchar decodeReverse() @safe pure @nogc nothrow
        {
            return decodeReverseViaRead();
        }
    }

    // ---- Encoders: one writer combined with EncodeViaWrite ----------------

    template EncodeToString()
    {
        mixin WriteToString;
        mixin EncodeViaWrite;
    }

    template EncodeToArray()
    {
        mixin WriteToArray;
        mixin EncodeViaWrite;
    }

    template EncodeToDelegate()
    {
        mixin WriteToDelegate;
        mixin EncodeViaWrite;
    }

    // ---- Decoders: one reader combined with a decoding adapter ------------

    template SkipFromString()
    {
        mixin ReadFromString;
        mixin SkipViaRead;
    }

    template DecodeFromString()
    {
        mixin ReadFromString;
        mixin DecodeViaRead;
    }

    template SafeDecodeFromString()
    {
        mixin ReadFromString;
        mixin SafeDecodeViaRead;
    }

    template DecodeReverseFromString()
    {
        mixin ReverseReadFromString;
        mixin DecodeReverseViaRead;
    }

    //=========================================================================

    // The functions ultimately exposed to the user. Each one mixes the
    // matching template into its own body, so the template's `s`, `array`
    // or `dg` refers to that function's parameter (or, for EncodeToString,
    // to the array the mixin itself declares).

    // Encodes `c` and returns the code units in a newly built array.
    E[] encode(dchar c) @safe pure nothrow
    {
        mixin EncodeToString sink;
        sink.encode(c);
        return sink.s;
    }

    // Encodes `c` into the front of `array`; `array` is advanced past the
    // code units written.
    void encode(dchar c, ref E[] array) @safe pure nothrow
    {
        mixin EncodeToArray sink;
        sink.encode(c);
    }

    // Encodes `c`, passing each code unit to `dg`.
    void encode(dchar c, void delegate(E) dg)
    {
        mixin EncodeToDelegate sink;
        sink.encode(c);
    }

    // Advances `s` using the encoder's skipViaRead primitive.
    void skip(ref const(E)[] s) @safe pure nothrow
    {
        mixin SkipFromString source;
        source.skip();
    }

    // Decodes from the front of `s` and advances `s`.
    dchar decode(S)(ref S s)
    {
        mixin DecodeFromString source;
        return source.decode();
    }

    // Like decode, but goes through the encoder's safeDecodeViaRead primitive.
    dchar safeDecode(S)(ref S s)
    {
        mixin SafeDecodeFromString source;
        return source.safeDecode();
    }

    // Decodes from the back of `s` and shortens `s` accordingly.
    dchar decodeReverse(ref const(E)[] s) @safe pure nothrow
    {
        mixin DecodeReverseFromString source;
        return source.decodeReverse();
    }
}
644 
645 //=========================================================================
646 
// Foreach-able view over the code points of an encoded string.
// Iteration consumes the stored slice `s`: after a full forward or reverse
// pass it is empty, and after an early exit it holds whatever has not been
// decoded yet.
struct CodePoints(E)
{
    const(E)[] s;

    // The string must be valid in its encoding (checked by the in-contract).
    this(const(E)[] s)
    in
    {
        assert(isValid(s));
    }
    do
    {
        this.s = s;
    }

    // foreach (dchar c; codePoints)
    int opApply(scope int delegate(ref dchar) dg)
    {
        while (s.length != 0)
        {
            dchar c = decode(s);
            if (auto rc = dg(c))
                return rc;
        }
        return 0;
    }

    // foreach (size_t i, dchar c; codePoints)
    // `i` is the offset, in code units, of the code point's first unit,
    // counted from where this iteration started.
    int opApply(scope int delegate(ref size_t, ref dchar) dg)
    {
        size_t offset = 0;
        while (s.length != 0)
        {
            immutable before = s.length;
            dchar c = decode(s);
            size_t reported = offset; // pass a copy so the delegate cannot corrupt offset
            if (auto rc = dg(reported, c))
                return rc;
            offset += before - s.length;
        }
        return 0;
    }

    // foreach_reverse (dchar c; codePoints)
    int opApplyReverse(scope int delegate(ref dchar) dg)
    {
        while (s.length != 0)
        {
            dchar c = decodeReverse(s);
            if (auto rc = dg(c))
                return rc;
        }
        return 0;
    }

    // foreach_reverse (size_t i, dchar c; codePoints)
    // `i` is the length left after removing the code point, i.e. the
    // offset of its first code unit.
    int opApplyReverse(scope int delegate(ref size_t, ref dchar) dg)
    {
        while (s.length != 0)
        {
            dchar c = decodeReverse(s);
            size_t start = s.length;
            if (auto rc = dg(start, c))
                return rc;
        }
        return 0;
    }
}
714 
// Foreach-able view over the code units that encode one code point.
// The code point is encoded once, in the constructor; iterating leaves the
// stored array `s` untouched.
struct CodeUnits(E)
{
    E[] s;

    // `d` must be a valid code point (checked by the in-contract).
    this(dchar d)
    in
    {
        assert(isValidCodePoint(d));
    }
    do
    {
        s = encode!(E)(d);
    }

    // foreach (E c; codeUnits): first code unit to last
    int opApply(scope int delegate(ref E) dg)
    {
        foreach (E unit; s)
        {
            if (auto rc = dg(unit))
                return rc;
        }
        return 0;
    }

    // foreach_reverse (E c; codeUnits): last code unit to first
    int opApplyReverse(scope int delegate(ref E) dg)
    {
        foreach_reverse (E unit; s)
        {
            if (auto rc = dg(unit))
                return rc;
        }
        return 0;
    }
}
751 
752 //=============================================================================
753 
// Fallback that is selected when no specialization of EncoderInstance exists
// for the requested code unit type; instantiating it is a compile-time error.
template EncoderInstance(E)
{
    static assert(false, "Cannot instantiate EncoderInstance for type " ~ E.stringof);
}
759 
// Shared implementation for single-byte encodings described by tables.
// The scope it is mixed into has to provide:
//   E, EString                    code unit and string types
//   m_charMapStart, m_charMapEnd  first and last code unit covered by charMap
//   charMap                       code point for each covered code unit;
//                                 0xFFFD marks a code unit with no mapping
//   bstMap                        (code point, code unit) pairs laid out as
//                                 an implicit binary search tree: the
//                                 children of index i sit at 2*i+1 (smaller
//                                 code points) and 2*i+2 (larger ones)
// Code points below m_charMapStart, and those above m_charMapEnd but below
// 0x100, are treated as encoding to the code unit of the same value.
private template GenericEncoder()
{
    bool canEncode(dchar c) @safe pure @nogc nothrow
    {
        // Identity-mapped code points
        if (c < m_charMapStart || (c > m_charMapEnd && c < 0x100))
            return true;
        if (c >= 0xFFFD)
            return false;

        // Walk the implicit search tree
        for (auto idx = 0; idx < bstMap.length; )
        {
            immutable key = bstMap[idx][0];
            if (key == c)
                return true;
            idx = key > c ? 2 * idx + 1 : 2 * idx + 2;
        }
        return false;
    }

    bool isValidCodeUnit(E c) @safe pure @nogc nothrow
    {
        immutable covered = c >= m_charMapStart && c <= m_charMapEnd;
        return !covered || charMap[c-m_charMapStart] != 0xFFFD;
    }

    size_t encodedLength(dchar c) @safe pure @nogc nothrow
    in
    {
        assert(canEncode(c));
    }
    do
    {
        // Every encodable code point takes exactly one code unit
        return 1;
    }

    // Writes the code unit for `c`, or '?' when `c` cannot be encoded.
    void encodeViaWrite()(dchar c)
    {
        immutable identity = c < m_charMapStart || (c > m_charMapEnd && c < 0x100);
        if (!identity)
        {
            if (c < 0xFFFD)
            {
                for (auto idx = 0; idx < bstMap.length; )
                {
                    immutable key = bstMap[idx][0];
                    if (key == c)
                    {
                        write(cast(E) bstMap[idx][1]);
                        return;
                    }
                    idx = key > c ? 2 * idx + 1 : 2 * idx + 2;
                }
            }
            // Not found in the tree, or at/above 0xFFFD
            c = '?';
        }
        write(cast(E) c);
    }

    void skipViaRead()()
    {
        // One code unit per character
        read();
    }

    dchar decodeViaRead()()
    {
        E unit = read();
        if (unit >= m_charMapStart && unit <= m_charMapEnd)
            return charMap[unit-m_charMapStart];
        return unit;
    }

    // Like decodeViaRead, but reports unmapped code units as INVALID_SEQUENCE.
    dchar safeDecodeViaRead()()
    {
        immutable E unit = read();
        dchar decoded = unit;
        if (unit >= m_charMapStart && unit <= m_charMapEnd)
            decoded = charMap[unit-m_charMapStart];
        return decoded == 0xFFFD ? INVALID_SEQUENCE : decoded;
    }

    dchar decodeReverseViaRead()()
    {
        // Same table lookup as decodeViaRead; the direction is decided by
        // whichever read() is in scope.
        E unit = read();
        if (unit >= m_charMapStart && unit <= m_charMapEnd)
            return charMap[unit-m_charMapStart];
        return unit;
    }

    @property EString replacementSequence() @safe pure @nogc nothrow
    {
        return cast(EString)("?");
    }

    mixin EncoderFunctions;
}
845 
846 //=============================================================================
847 //          ASCII
848 //=============================================================================
849 
/**
Defines an ASCII-encoded character: a distinct type based on `ubyte`, so that
ASCII data is not confused with other single-byte encodings.
 */
enum AsciiChar : ubyte { _init }
/// Defines an ASCII-encoded string (as an array of `immutable(AsciiChar)`).
alias AsciiString = immutable(AsciiChar)[];
854 
// Encoder for ASCII: code points below 0x80 map to the code unit of the same
// value; anything else is written as '?'.
template EncoderInstance(CharType : AsciiChar)
{
    alias E = AsciiChar;
    alias EString = AsciiString;

    @property string encodingName() @safe pure nothrow @nogc
    {
        return "ASCII";
    }

    bool canEncode(dchar c) @safe pure nothrow @nogc
    {
        return c <= 0x7F;
    }

    bool isValidCodeUnit(AsciiChar c) @safe pure nothrow @nogc
    {
        return c <= 0x7F;
    }

    size_t encodedLength(dchar c) @safe pure nothrow @nogc
    in
    {
        assert(canEncode(c));
    }
    do
    {
        // Always a single code unit
        return 1;
    }

    // Same as encodeViaWrite, but writes to the given `r` instead of the
    // `write` that is in scope.
    void encodeX(Range)(dchar c, Range r)
    {
        r.write(cast(AsciiChar)(canEncode(c) ? c : '?'));
    }

    void encodeViaWrite()(dchar c)
    {
        write(cast(AsciiChar)(canEncode(c) ? c : '?'));
    }

    void skipViaRead()()
    {
        read();
    }

    dchar decodeViaRead()()
    {
        return read();
    }

    // Code units of 0x80 and above are reported as INVALID_SEQUENCE.
    dchar safeDecodeViaRead()()
    {
        immutable unit = read();
        if (!canEncode(unit))
            return INVALID_SEQUENCE;
        return unit;
    }

    dchar decodeReverseViaRead()()
    {
        return read();
    }

    @property EString replacementSequence() @safe pure nothrow @nogc
    {
        return cast(EString)("?");
    }

    mixin EncoderFunctions;
}
925 
926 //=============================================================================
927 //          ISO-8859-1
928 //=============================================================================
929 
/** Defines a Latin1-encoded character. */
enum Latin1Char : ubyte { _init }
/**
Defines a Latin1-encoded string (as an array of $(D
immutable(Latin1Char))).
 */
alias Latin1String = immutable(Latin1Char)[];
937 
// Encoder for ISO-8859-1: code points below 0x100 map to the code unit of
// the same value; anything else is written as '?'. Every code unit is valid,
// so safe decoding never reports an invalid sequence.
template EncoderInstance(CharType : Latin1Char)
{
    alias E = Latin1Char;
    alias EString = Latin1String;

    @property string encodingName() @safe pure nothrow @nogc
    {
        return "ISO-8859-1";
    }

    bool canEncode(dchar c) @safe pure nothrow @nogc
    {
        return c <= 0xFF;
    }

    bool isValidCodeUnit(Latin1Char c) @safe pure nothrow @nogc
    {
        return true;
    }

    size_t encodedLength(dchar c) @safe pure nothrow @nogc
    in
    {
        assert(canEncode(c));
    }
    do
    {
        // Always a single code unit
        return 1;
    }

    void encodeViaWrite()(dchar c)
    {
        write(cast(Latin1Char)(canEncode(c) ? c : '?'));
    }

    void skipViaRead()()
    {
        read();
    }

    dchar decodeViaRead()()
    {
        return read();
    }

    dchar safeDecodeViaRead()()
    {
        return read();
    }

    dchar decodeReverseViaRead()()
    {
        return read();
    }

    @property EString replacementSequence() @safe pure nothrow @nogc
    {
        return cast(EString)("?");
    }

    mixin EncoderFunctions;
}
1001 
1002 //=============================================================================
1003 //          ISO-8859-2
1004 //=============================================================================
1005 
/// Defines a Latin2-encoded character.
enum Latin2Char : ubyte { _init }

/**
 * Defines a Latin2-encoded string (as an array of $(D
 * immutable(Latin2Char))).
 */
alias Latin2String = immutable(Latin2Char)[];
1014 
1015 private template EncoderInstance(CharType : Latin2Char) // ISO-8859-2 encoder, selected when the code unit type is Latin2Char
1016 {
1017     import std.typecons : Tuple, tuple;
1018 
1019     alias E = Latin2Char;         // code unit type of this encoding
1020     alias EString = Latin2String; // immutable array of those code units
1021 
1022     @property string encodingName() @safe pure nothrow @nogc // name reported for this encoding
1023     {
1024         return "ISO-8859-2";
1025     }
1026 
1027     private static immutable dchar m_charMapStart = 0xa1; // code unit that charMap[0] corresponds to
1028     private static immutable dchar m_charMapEnd = 0xff;   // code unit that the last charMap entry corresponds to
1029 
1030     private immutable wstring charMap = // code points for units 0xA1 .. 0xFF, in code unit order (95 entries)
1031         "\u0104\u02D8\u0141\u00A4\u013D\u015A\u00A7\u00A8"~
1032         "\u0160\u015E\u0164\u0179\u00AD\u017D\u017B\u00B0"~
1033         "\u0105\u02DB\u0142\u00B4\u013E\u015B\u02C7\u00B8"~
1034         "\u0161\u015F\u0165\u017A\u02DD\u017E\u017C\u0154"~
1035         "\u00C1\u00C2\u0102\u00C4\u0139\u0106\u00C7\u010C"~
1036         "\u00C9\u0118\u00CB\u011A\u00CD\u00CE\u010E\u0110"~
1037         "\u0143\u0147\u00D3\u00D4\u0150\u00D6\u00D7\u0158"~
1038         "\u016E\u00DA\u0170\u00DC\u00DD\u0162\u00DF\u0155"~
1039         "\u00E1\u00E2\u0103\u00E4\u013A\u0107\u00E7\u010D"~
1040         "\u00E9\u0119\u00EB\u011B\u00ED\u00EE\u010F\u0111"~
1041         "\u0144\u0148\u00F3\u00F4\u0151\u00F6\u00F7\u0159"~
1042         "\u016F\u00FA\u0171\u00FC\u00FD\u0163\u02D9";
1043 
1044     private immutable Tuple!(wchar, char)[] bstMap = [ // (code point, code unit) pairs inverting charMap; NOTE(review): order looks like a search-tree layout for GenericEncoder - confirm before reordering
1045         tuple('\u0148','\xF2'), tuple('\u00F3','\xF3'), tuple('\u0165','\xBB'),
1046         tuple('\u00D3','\xD3'), tuple('\u010F','\xEF'), tuple('\u015B','\xB6'),
1047         tuple('\u017C','\xBF'), tuple('\u00C1','\xC1'), tuple('\u00E1','\xE1'),
1048         tuple('\u0103','\xE3'), tuple('\u013A','\xE5'), tuple('\u0155','\xE0'),
1049         tuple('\u0161','\xB9'), tuple('\u0171','\xFB'), tuple('\u02D8','\xA2'),
1050         tuple('\u00AD','\xAD'), tuple('\u00C9','\xC9'), tuple('\u00DA','\xDA'),
1051         tuple('\u00E9','\xE9'), tuple('\u00FA','\xFA'), tuple('\u0107','\xE6'),
1052         tuple('\u0119','\xEA'), tuple('\u0142','\xB3'), tuple('\u0151','\xF5'),
1053         tuple('\u0159','\xF8'), tuple('\u015F','\xBA'), tuple('\u0163','\xFE'),
1054         tuple('\u016F','\xF9'), tuple('\u017A','\xBC'), tuple('\u017E','\xBE'),
1055         tuple('\u02DB','\xB2'), tuple('\u00A7','\xA7'), tuple('\u00B4','\xB4'),
1056         tuple('\u00C4','\xC4'), tuple('\u00CD','\xCD'), tuple('\u00D6','\xD6'),
1057         tuple('\u00DD','\xDD'), tuple('\u00E4','\xE4'), tuple('\u00ED','\xED'),
1058         tuple('\u00F6','\xF6'), tuple('\u00FD','\xFD'), tuple('\u0105','\xB1'),
1059         tuple('\u010D','\xE8'), tuple('\u0111','\xF0'), tuple('\u011B','\xEC'),
1060         tuple('\u013E','\xB5'), tuple('\u0144','\xF1'), tuple('\u0150','\xD5'),
1061         tuple('\u0154','\xC0'), tuple('\u0158','\xD8'), tuple('\u015A','\xA6'),
1062         tuple('\u015E','\xAA'), tuple('\u0160','\xA9'), tuple('\u0162','\xDE'),
1063         tuple('\u0164','\xAB'), tuple('\u016E','\xD9'), tuple('\u0170','\xDB'),
1064         tuple('\u0179','\xAC'), tuple('\u017B','\xAF'), tuple('\u017D','\xAE'),
1065         tuple('\u02C7','\xB7'), tuple('\u02D9','\xFF'), tuple('\u02DD','\xBD'),
1066         tuple('\u00A4','\xA4'), tuple('\u00A8','\xA8'), tuple('\u00B0','\xB0'),
1067         tuple('\u00B8','\xB8'), tuple('\u00C2','\xC2'), tuple('\u00C7','\xC7'),
1068         tuple('\u00CB','\xCB'), tuple('\u00CE','\xCE'), tuple('\u00D4','\xD4'),
1069         tuple('\u00D7','\xD7'), tuple('\u00DC','\xDC'), tuple('\u00DF','\xDF'),
1070         tuple('\u00E2','\xE2'), tuple('\u00E7','\xE7'), tuple('\u00EB','\xEB'),
1071         tuple('\u00EE','\xEE'), tuple('\u00F4','\xF4'), tuple('\u00F7','\xF7'),
1072         tuple('\u00FC','\xFC'), tuple('\u0102','\xC3'), tuple('\u0104','\xA1'),
1073         tuple('\u0106','\xC6'), tuple('\u010C','\xC8'), tuple('\u010E','\xCF'),
1074         tuple('\u0110','\xD0'), tuple('\u0118','\xCA'), tuple('\u011A','\xCC'),
1075         tuple('\u0139','\xC5'), tuple('\u013D','\xA5'), tuple('\u0141','\xA3'),
1076         tuple('\u0143','\xD1'), tuple('\u0147','\xD2')
1077     ];
1078 
1079     mixin GenericEncoder!(); // NOTE(review): defined outside this view; presumably builds the encoder functions from the tables above
1080 }
1081 
1082 //=============================================================================
1083 //          WINDOWS-1250
1084 //=============================================================================
1085 
1086 /// Defines a Windows1250-encoded character: a distinct `ubyte`-based code unit type for windows-1250.
1087 enum Windows1250Char : ubyte { _init } // `_init` is the only named member
1088 
1089 /**
1090  * Defines a Windows1250-encoded string (as an array of $(D
1091  * immutable(Windows1250Char))).
1092  */
1093 alias Windows1250String = immutable(Windows1250Char)[];
1094 
1095 private template EncoderInstance(CharType : Windows1250Char) // windows-1250 encoder, selected when the code unit type is Windows1250Char
1096 {
1097     import std.typecons : Tuple, tuple;
1098 
1099     alias E = Windows1250Char;         // code unit type of this encoding
1100     alias EString = Windows1250String; // immutable array of those code units
1101 
1102     @property string encodingName() @safe pure nothrow @nogc // name reported for this encoding
1103     {
1104         return "windows-1250";
1105     }
1106 
1107     private static immutable dchar m_charMapStart = 0x80; // code unit that charMap[0] corresponds to
1108     private static immutable dchar m_charMapEnd = 0xff;   // code unit that the last charMap entry corresponds to
1109 
1110     private immutable wstring charMap = // code points for units 0x80 .. 0xFF (128 entries); \uFFFD fills the units that have no pair in bstMap
1111         "\u20AC\uFFFD\u201A\uFFFD\u201E\u2026\u2020\u2021"~
1112         "\uFFFD\u2030\u0160\u2039\u015A\u0164\u017D\u0179"~
1113         "\uFFFD\u2018\u2019\u201C\u201D\u2022\u2013\u2014"~
1114         "\uFFFD\u2122\u0161\u203A\u015B\u0165\u017E\u017A"~
1115         "\u00A0\u02C7\u02D8\u0141\u00A4\u0104\u00A6\u00A7"~
1116         "\u00A8\u00A9\u015E\u00AB\u00AC\u00AD\u00AE\u017B"~
1117         "\u00B0\u00B1\u02DB\u0142\u00B4\u00B5\u00B6\u00B7"~
1118         "\u00B8\u0105\u015F\u00BB\u013D\u02DD\u013E\u017C"~
1119         "\u0154\u00C1\u00C2\u0102\u00C4\u0139\u0106\u00C7"~
1120         "\u010C\u00C9\u0118\u00CB\u011A\u00CD\u00CE\u010E"~
1121         "\u0110\u0143\u0147\u00D3\u00D4\u0150\u00D6\u00D7"~
1122         "\u0158\u016E\u00DA\u0170\u00DC\u00DD\u0162\u00DF"~
1123         "\u0155\u00E1\u00E2\u0103\u00E4\u013A\u0107\u00E7"~
1124         "\u010D\u00E9\u0119\u00EB\u011B\u00ED\u00EE\u010F"~
1125         "\u0111\u0144\u0148\u00F3\u00F4\u0151\u00F6\u00F7"~
1126         "\u0159\u016F\u00FA\u0171\u00FC\u00FD\u0163\u02D9";
1127 
1128     private immutable Tuple!(wchar, char)[] bstMap = [ // (code point, code unit) pairs inverting charMap; NOTE(review): order looks like a search-tree layout for GenericEncoder - confirm before reordering
1129         tuple('\u011A','\xCC'), tuple('\u00DC','\xDC'), tuple('\u0179','\x8F'),
1130         tuple('\u00B7','\xB7'), tuple('\u00FC','\xFC'), tuple('\u0158','\xD8'),
1131         tuple('\u201C','\x93'), tuple('\u00AC','\xAC'), tuple('\u00CB','\xCB'),
1132         tuple('\u00EB','\xEB'), tuple('\u010C','\xC8'), tuple('\u0143','\xD1'),
1133         tuple('\u0162','\xDE'), tuple('\u02D9','\xFF'), tuple('\u2039','\x8B'),
1134         tuple('\u00A7','\xA7'), tuple('\u00B1','\xB1'), tuple('\u00C2','\xC2'),
1135         tuple('\u00D4','\xD4'), tuple('\u00E2','\xE2'), tuple('\u00F4','\xF4'),
1136         tuple('\u0104','\xA5'), tuple('\u0110','\xD0'), tuple('\u013D','\xBC'),
1137         tuple('\u0150','\xD5'), tuple('\u015E','\xAA'), tuple('\u016E','\xD9'),
1138         tuple('\u017D','\x8E'), tuple('\u2014','\x97'), tuple('\u2021','\x87'),
1139         tuple('\u20AC','\x80'), tuple('\u00A4','\xA4'), tuple('\u00A9','\xA9'),
1140         tuple('\u00AE','\xAE'), tuple('\u00B5','\xB5'), tuple('\u00BB','\xBB'),
1141         tuple('\u00C7','\xC7'), tuple('\u00CE','\xCE'), tuple('\u00D7','\xD7'),
1142         tuple('\u00DF','\xDF'), tuple('\u00E7','\xE7'), tuple('\u00EE','\xEE'),
1143         tuple('\u00F7','\xF7'), tuple('\u0102','\xC3'), tuple('\u0106','\xC6'),
1144         tuple('\u010E','\xCF'), tuple('\u0118','\xCA'), tuple('\u0139','\xC5'),
1145         tuple('\u0141','\xA3'), tuple('\u0147','\xD2'), tuple('\u0154','\xC0'),
1146         tuple('\u015A','\x8C'), tuple('\u0160','\x8A'), tuple('\u0164','\x8D'),
1147         tuple('\u0170','\xDB'), tuple('\u017B','\xAF'), tuple('\u02C7','\xA1'),
1148         tuple('\u02DD','\xBD'), tuple('\u2019','\x92'), tuple('\u201E','\x84'),
1149         tuple('\u2026','\x85'), tuple('\u203A','\x9B'), tuple('\u2122','\x99'),
1150         tuple('\u00A0','\xA0'), tuple('\u00A6','\xA6'), tuple('\u00A8','\xA8'),
1151         tuple('\u00AB','\xAB'), tuple('\u00AD','\xAD'), tuple('\u00B0','\xB0'),
1152         tuple('\u00B4','\xB4'), tuple('\u00B6','\xB6'), tuple('\u00B8','\xB8'),
1153         tuple('\u00C1','\xC1'), tuple('\u00C4','\xC4'), tuple('\u00C9','\xC9'),
1154         tuple('\u00CD','\xCD'), tuple('\u00D3','\xD3'), tuple('\u00D6','\xD6'),
1155         tuple('\u00DA','\xDA'), tuple('\u00DD','\xDD'), tuple('\u00E1','\xE1'),
1156         tuple('\u00E4','\xE4'), tuple('\u00E9','\xE9'), tuple('\u00ED','\xED'),
1157         tuple('\u00F3','\xF3'), tuple('\u00F6','\xF6'), tuple('\u00FA','\xFA'),
1158         tuple('\u00FD','\xFD'), tuple('\u0103','\xE3'), tuple('\u0105','\xB9'),
1159         tuple('\u0107','\xE6'), tuple('\u010D','\xE8'), tuple('\u010F','\xEF'),
1160         tuple('\u0111','\xF0'), tuple('\u0119','\xEA'), tuple('\u011B','\xEC'),
1161         tuple('\u013A','\xE5'), tuple('\u013E','\xBE'), tuple('\u0142','\xB3'),
1162         tuple('\u0144','\xF1'), tuple('\u0148','\xF2'), tuple('\u0151','\xF5'),
1163         tuple('\u0155','\xE0'), tuple('\u0159','\xF8'), tuple('\u015B','\x9C'),
1164         tuple('\u015F','\xBA'), tuple('\u0161','\x9A'), tuple('\u0163','\xFE'),
1165         tuple('\u0165','\x9D'), tuple('\u016F','\xF9'), tuple('\u0171','\xFB'),
1166         tuple('\u017A','\x9F'), tuple('\u017C','\xBF'), tuple('\u017E','\x9E'),
1167         tuple('\u02D8','\xA2'), tuple('\u02DB','\xB2'), tuple('\u2013','\x96'),
1168         tuple('\u2018','\x91'), tuple('\u201A','\x82'), tuple('\u201D','\x94'),
1169         tuple('\u2020','\x86'), tuple('\u2022','\x95'), tuple('\u2030','\x89')
1170     ];
1171 
1172     mixin GenericEncoder!(); // NOTE(review): defined outside this view; presumably builds the encoder functions from the tables above
1173 }
1174 
1175 //=============================================================================
1176 //          WINDOWS-1251
1177 //=============================================================================
1178 
1179 /// Defines a Windows1251-encoded character: a distinct `ubyte`-based code unit type for windows-1251.
1180 enum Windows1251Char : ubyte { _init } // `_init` is the only named member
1181 
1182 /**
1183  * Defines a Windows1251-encoded string (as an array of $(D
1184  * immutable(Windows1251Char))).
1185  */
1186 alias Windows1251String = immutable(Windows1251Char)[];
1187 
1188 private template EncoderInstance(CharType : Windows1251Char) // windows-1251 encoder, selected when the code unit type is Windows1251Char
1189 {
1190     import std.typecons : Tuple, tuple;
1191 
1192     alias E = Windows1251Char;         // code unit type of this encoding
1193     alias EString = Windows1251String; // immutable array of those code units
1194 
1195     @property string encodingName() @safe pure nothrow @nogc // name reported for this encoding
1196     {
1197         return "windows-1251";
1198     }
1199 
1200     private static immutable dchar m_charMapStart = 0x80; // code unit that charMap[0] corresponds to
1201     private static immutable dchar m_charMapEnd = 0xff;   // code unit that the last charMap entry corresponds to
1202 
1203     private immutable wstring charMap = // code points for units 0x80 .. 0xFF (128 entries); \uFFFD (unit 0x98) has no pair in bstMap
1204         "\u0402\u0403\u201A\u0453\u201E\u2026\u2020\u2021"~
1205         "\u20AC\u2030\u0409\u2039\u040A\u040C\u040B\u040F"~
1206         "\u0452\u2018\u2019\u201C\u201D\u2022\u2013\u2014"~
1207         "\uFFFD\u2122\u0459\u203A\u045A\u045C\u045B\u045F"~
1208         "\u00A0\u040E\u045E\u0408\u00A4\u0490\u00A6\u00A7"~
1209         "\u0401\u00A9\u0404\u00AB\u00AC\u00AD\u00AE\u0407"~
1210         "\u00B0\u00B1\u0406\u0456\u0491\u00B5\u00B6\u00B7"~
1211         "\u0451\u2116\u0454\u00BB\u0458\u0405\u0455\u0457"~
1212         "\u0410\u0411\u0412\u0413\u0414\u0415\u0416\u0417"~
1213         "\u0418\u0419\u041A\u041B\u041C\u041D\u041E\u041F"~
1214         "\u0420\u0421\u0422\u0423\u0424\u0425\u0426\u0427"~
1215         "\u0428\u0429\u042A\u042B\u042C\u042D\u042E\u042F"~
1216         "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437"~
1217         "\u0438\u0439\u043A\u043B\u043C\u043D\u043E\u043F"~
1218         "\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447"~
1219         "\u0448\u0449\u044A\u044B\u044C\u044D\u044E\u044F";
1220 
1221     private immutable Tuple!(wchar, char)[] bstMap = [ // (code point, code unit) pairs inverting charMap; NOTE(review): order looks like a search-tree layout for GenericEncoder - confirm before reordering
1222         tuple('\u0432','\xE2'),tuple('\u0412','\xC2'),tuple('\u0453','\x83'),
1223         tuple('\u0401','\xA8'),tuple('\u0422','\xD2'),tuple('\u0442','\xF2'),
1224         tuple('\u2018','\x91'),tuple('\u00AD','\xAD'),tuple('\u0409','\x8A'),
1225         tuple('\u041A','\xCA'),tuple('\u042A','\xDA'),tuple('\u043A','\xEA'),
1226         tuple('\u044A','\xFA'),tuple('\u045B','\x9E'),tuple('\u2022','\x95'),
1227         tuple('\u00A7','\xA7'),tuple('\u00B5','\xB5'),tuple('\u0405','\xBD'),
1228         tuple('\u040E','\xA1'),tuple('\u0416','\xC6'),tuple('\u041E','\xCE'),
1229         tuple('\u0426','\xD6'),tuple('\u042E','\xDE'),tuple('\u0436','\xE6'),
1230         tuple('\u043E','\xEE'),tuple('\u0446','\xF6'),tuple('\u044E','\xFE'),
1231         tuple('\u0457','\xBF'),tuple('\u0490','\xA5'),tuple('\u201D','\x94'),
1232         tuple('\u203A','\x9B'),tuple('\u00A4','\xA4'),tuple('\u00AB','\xAB'),
1233         tuple('\u00B0','\xB0'),tuple('\u00B7','\xB7'),tuple('\u0403','\x81'),
1234         tuple('\u0407','\xAF'),tuple('\u040B','\x8E'),tuple('\u0410','\xC0'),
1235         tuple('\u0414','\xC4'),tuple('\u0418','\xC8'),tuple('\u041C','\xCC'),
1236         tuple('\u0420','\xD0'),tuple('\u0424','\xD4'),tuple('\u0428','\xD8'),
1237         tuple('\u042C','\xDC'),tuple('\u0430','\xE0'),tuple('\u0434','\xE4'),
1238         tuple('\u0438','\xE8'),tuple('\u043C','\xEC'),tuple('\u0440','\xF0'),
1239         tuple('\u0444','\xF4'),tuple('\u0448','\xF8'),tuple('\u044C','\xFC'),
1240         tuple('\u0451','\xB8'),tuple('\u0455','\xBE'),tuple('\u0459','\x9A'),
1241         tuple('\u045E','\xA2'),tuple('\u2013','\x96'),tuple('\u201A','\x82'),
1242         tuple('\u2020','\x86'),tuple('\u2030','\x89'),tuple('\u2116','\xB9'),
1243         tuple('\u00A0','\xA0'),tuple('\u00A6','\xA6'),tuple('\u00A9','\xA9'),
1244         tuple('\u00AC','\xAC'),tuple('\u00AE','\xAE'),tuple('\u00B1','\xB1'),
1245         tuple('\u00B6','\xB6'),tuple('\u00BB','\xBB'),tuple('\u0402','\x80'),
1246         tuple('\u0404','\xAA'),tuple('\u0406','\xB2'),tuple('\u0408','\xA3'),
1247         tuple('\u040A','\x8C'),tuple('\u040C','\x8D'),tuple('\u040F','\x8F'),
1248         tuple('\u0411','\xC1'),tuple('\u0413','\xC3'),tuple('\u0415','\xC5'),
1249         tuple('\u0417','\xC7'),tuple('\u0419','\xC9'),tuple('\u041B','\xCB'),
1250         tuple('\u041D','\xCD'),tuple('\u041F','\xCF'),tuple('\u0421','\xD1'),
1251         tuple('\u0423','\xD3'),tuple('\u0425','\xD5'),tuple('\u0427','\xD7'),
1252         tuple('\u0429','\xD9'),tuple('\u042B','\xDB'),tuple('\u042D','\xDD'),
1253         tuple('\u042F','\xDF'),tuple('\u0431','\xE1'),tuple('\u0433','\xE3'),
1254         tuple('\u0435','\xE5'),tuple('\u0437','\xE7'),tuple('\u0439','\xE9'),
1255         tuple('\u043B','\xEB'),tuple('\u043D','\xED'),tuple('\u043F','\xEF'),
1256         tuple('\u0441','\xF1'),tuple('\u0443','\xF3'),tuple('\u0445','\xF5'),
1257         tuple('\u0447','\xF7'),tuple('\u0449','\xF9'),tuple('\u044B','\xFB'),
1258         tuple('\u044D','\xFD'),tuple('\u044F','\xFF'),tuple('\u0452','\x90'),
1259         tuple('\u0454','\xBA'),tuple('\u0456','\xB3'),tuple('\u0458','\xBC'),
1260         tuple('\u045A','\x9C'),tuple('\u045C','\x9D'),tuple('\u045F','\x9F'),
1261         tuple('\u0491','\xB4'),tuple('\u2014','\x97'),tuple('\u2019','\x92'),
1262         tuple('\u201C','\x93'),tuple('\u201E','\x84'),tuple('\u2021','\x87'),
1263         tuple('\u2026','\x85'),tuple('\u2039','\x8B'),tuple('\u20AC','\x88'),
1264         tuple('\u2122','\x99')
1265     ];
1266 
1267     mixin GenericEncoder!(); // NOTE(review): defined outside this view; presumably builds the encoder functions from the tables above
1268 }
1269 
1270 //=============================================================================
1271 //          WINDOWS-1252
1272 //=============================================================================
1273 
1274 /// Defines a Windows1252-encoded character: a distinct `ubyte`-based code unit type for windows-1252.
1275 enum Windows1252Char : ubyte { _init } // `_init` is the only named member
1276 
1277 /**
1278  * Defines a Windows1252-encoded string (as an array of $(D
1279  * immutable(Windows1252Char))).
1280  */
1281 alias Windows1252String = immutable(Windows1252Char)[];
1282 
1283 template EncoderInstance(CharType : Windows1252Char) // windows-1252 encoder; NOTE(review): not `private`, unlike the Windows1251Char specialization - confirm intent
1284 {
1285     import std.typecons : Tuple, tuple;
1286 
1287     alias E = Windows1252Char;         // code unit type of this encoding
1288     alias EString = Windows1252String; // immutable array of those code units
1289 
1290     @property string encodingName() @safe pure nothrow @nogc // name reported for this encoding
1291     {
1292         return "windows-1252";
1293     }
1294 
1295     private static immutable dchar m_charMapStart = 0x80; // code unit that charMap[0] corresponds to
1296     private static immutable dchar m_charMapEnd = 0x9f;   // code unit that the last charMap entry corresponds to
1297 
1298     private immutable wstring charMap = // code points for units 0x80 .. 0x9F only (32 entries); \uFFFD fills the units that have no pair in bstMap
1299         "\u20AC\uFFFD\u201A\u0192\u201E\u2026\u2020\u2021"~
1300         "\u02C6\u2030\u0160\u2039\u0152\uFFFD\u017D\uFFFD"~
1301         "\uFFFD\u2018\u2019\u201C\u201D\u2022\u2013\u2014"~
1302         "\u02DC\u2122\u0161\u203A\u0153\uFFFD\u017E\u0178";
1303 
1304     private immutable Tuple!(wchar, char)[] bstMap = [ // (code point, code unit) pairs inverting charMap; NOTE(review): order looks like a search-tree layout for GenericEncoder - confirm before reordering
1305         tuple('\u201C','\x93'), tuple('\u0192','\x83'), tuple('\u2039','\x8B'),
1306         tuple('\u0161','\x9A'), tuple('\u2014','\x97'), tuple('\u2021','\x87'),
1307         tuple('\u20AC','\x80'), tuple('\u0153','\x9C'), tuple('\u017D','\x8E'),
1308         tuple('\u02DC','\x98'), tuple('\u2019','\x92'), tuple('\u201E','\x84'),
1309         tuple('\u2026','\x85'), tuple('\u203A','\x9B'), tuple('\u2122','\x99'),
1310         tuple('\u0152','\x8C'), tuple('\u0160','\x8A'), tuple('\u0178','\x9F'),
1311         tuple('\u017E','\x9E'), tuple('\u02C6','\x88'), tuple('\u2013','\x96'),
1312         tuple('\u2018','\x91'), tuple('\u201A','\x82'), tuple('\u201D','\x94'),
1313         tuple('\u2020','\x86'), tuple('\u2022','\x95'), tuple('\u2030','\x89')
1314     ];
1315 
1316     mixin GenericEncoder!(); // NOTE(review): defined outside this view; presumably builds the encoder functions from the tables above
1317 }
1318 
1319 //=============================================================================
1320 //          UTF-8
1321 //=============================================================================
1322 
1323 template EncoderInstance(CharType : char) // UTF-8 encoder, selected when the code unit type is char
1324 {
1325     alias E = char;
1326     alias EString = string;
1327 
1328     @property string encodingName() @safe @nogc pure nothrow
1329     {
1330         return "UTF-8";
1331     }
1332 
1333     bool canEncode(dchar c) @safe @nogc pure nothrow // every valid code point is encodable
1334     {
1335         return isValidCodePoint(c);
1336     }
1337 
1338     bool isValidCodeUnit(char c) @safe @nogc pure nothrow // rejects 0xC0, 0xC1 and 0xF5 .. 0xFF
1339     {
1340         return c < 0xC0 || (0xC2 <= c && c <= 0xF4);
1341     }
1342 
1343     immutable ubyte[128] tailTable = // tailTable[b - 0x80]: how many further units tails() reports for byte b
1344     [
1345         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,    // 0x80 .. 0x8F
1346         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,    // 0x90 .. 0x9F
1347         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,    // 0xA0 .. 0xAF
1348         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,    // 0xB0 .. 0xBF
1349         1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,    // 0xC0 .. 0xCF
1350         1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,    // 0xD0 .. 0xDF
1351         2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,    // 0xE0 .. 0xEF
1352         3,3,3,3,3,3,3,3,4,4,4,4,5,5,6,0,    // 0xF0 .. 0xFF
1353     ];
1354 
1355     private int tails(char c) @safe @nogc pure nothrow // table lookup; only defined for bytes >= 0x80
1356     in
1357     {
1358         assert(c > 0x7F);
1359     }
1360     do
1361     {
1362         return tailTable[c - 0x80];
1363     }
1364 
1365     size_t encodedLength(dchar c) @safe @nogc pure nothrow // number of code units (1 .. 4) used for c
1366     in
1367     {
1368         assert(canEncode(c));
1369     }
1370     do
1371     {
1372         return c < 0x80    ? 1
1373              : c < 0x800   ? 2
1374              : c < 0x10000 ? 3
1375              :               4;
1376     }
1377 
1378     void encodeViaWrite()(dchar c) // emits c as one to four code units through write()
1379     {
1380         if (c < 0x80)
1381         {
1382             write(cast(char) c);
1383             return;
1384         }
1385         if (c < 0x800)
1386         {
1387             write(cast(char)(0xC0 + (c >> 6)));
1388             write(cast(char)(0x80 + (c & 0x3F)));
1389             return;
1390         }
1391         if (c < 0x10000)
1392         {
1393             write(cast(char)(0xE0 + (c >> 12)));
1394             write(cast(char)(0x80 + ((c >> 6) & 0x3F)));
1395             write(cast(char)(0x80 + (c & 0x3F)));
1396             return;
1397         }
1398         write(cast(char)(0xF0 + (c >> 18)));
1399         write(cast(char)(0x80 + ((c >> 12) & 0x3F)));
1400         write(cast(char)(0x80 + ((c >> 6) & 0x3F)));
1401         write(cast(char)(0x80 + (c & 0x3F)));
1402     }
1403 
1404     void skipViaRead()() // consumes one lead unit plus the tail units its table entry announces
1405     {
1406         immutable lead = read();
1407         if (lead < 0xC0) return;
1408         int pending = tails(cast(char) lead);
1409         while (pending-- > 0)
1410         {
1411             read();
1412         }
1413     }
1414 
1415     dchar decodeViaRead()() // decodes one sequence without any validation
1416     {
1417         dchar result = read();
1418         if (result < 0xC0) return result;
1419         immutable int tailCount = tails(cast(char) result);
1420         result &= (1 << (6 - tailCount)) - 1;   // keep only the payload bits of the lead unit
1421         foreach (_; 0 .. tailCount)
1422         {
1423             result = (result << 6) + (read() & 0x3F);
1424         }
1425         return result;
1426     }
1427 
1428     dchar safeDecodeViaRead()() // validating decode: yields INVALID_SEQUENCE for malformed input
1429     {
1430         dchar cp = read();
1431         if (cp < 0x80) return cp;
1432         immutable int tailCount = tails(cast(char) cp);
1433         if (tailCount == 0) return INVALID_SEQUENCE;
1434 
1435         if (!canRead) return INVALID_SEQUENCE;
1436         immutable size_t next = peek();
1437         // Judge the lead unit and the first tail unit now; the verdict is applied
1438         // only after the tail units have been consumed.
1439         immutable bool malformed =
1440                cp < 0xC2                                  // overlong 2-byte form
1441             || cp > 0xF4                                  // lead unit that can only start values > 0x10FFFF
1442             || (cp == 0xE0 && (next & 0xE0) == 0x80)      // overlong 3-byte form
1443             || (cp == 0xED && (next & 0xE0) == 0xA0)      // surrogate
1444             || (cp == 0xF0 && (next & 0xF0) == 0x80)      // overlong 4-byte form
1445             || (cp == 0xF4 && (next & 0xF0) >= 0x90);     // code point > 0x10FFFF
1446 
1447         cp &= (1 << (6 - tailCount)) - 1;
1448         foreach (_; 0 .. tailCount)
1449         {
1450             // stop at a missing or non-continuation unit without consuming it
1451             if (!canRead || (peek() & 0xC0) != 0x80)
1452                 return INVALID_SEQUENCE;
1453             cp = (cp << 6) + (read() & 0x3F);
1454         }
1455 
1456         return malformed ? INVALID_SEQUENCE : cp;
1457     }
1458 
1459     dchar decodeReverseViaRead()() // decodes one sequence whose units arrive last-to-first
1460     {
1461         dchar cp = read();
1462         if (cp < 0x80) return cp;
1463         cp &= 0x3F;
1464         // Take up to four more units; each adds its payload bits six positions
1465         // higher than the previous one, and a unit with a tail count ends the walk.
1466         for (size_t shift = 6; shift <= 24; shift += 6)
1467         {
1468             immutable unit = read();
1469             immutable size_t tailCount = tails(cast(char) unit);
1470             immutable mask = tailCount == 0 ? 0x3F : (1 << (6 - tailCount)) - 1;
1471             cp += ((unit & mask) << shift);
1472             if (tailCount != 0) break;
1473         }
1474         return cp;
1475     }
1476 
1477     @property EString replacementSequence() @safe @nogc pure nothrow // U+FFFD encoded as UTF-8
1478     {
1479         return "\uFFFD";
1480     }
1481 
1482     mixin EncoderFunctions;
1483 }
1484 
1485 //=============================================================================
1486 //          UTF-16
1487 //=============================================================================
1488 
1489 template EncoderInstance(CharType : wchar) // UTF-16 encoder, selected when the code unit type is wchar
1490 {
1491     alias E = wchar;
1492     alias EString = wstring;
1493 
1494     @property string encodingName() @safe @nogc pure nothrow
1495     {
1496         return "UTF-16";
1497     }
1498 
1499     bool canEncode(dchar c) @safe @nogc pure nothrow // every valid code point is encodable
1500     {
1501         return isValidCodePoint(c);
1502     }
1503 
1504     bool isValidCodeUnit(wchar c) @safe @nogc pure nothrow // no 16-bit value is rejected on its own
1505     {
1506         return true;
1507     }
1508 
1509     size_t encodedLength(dchar c) @safe @nogc pure nothrow // 1 below 0x10000, otherwise 2
1510     in
1511     {
1512         assert(canEncode(c));
1513     }
1514     do
1515     {
1516         return c >= 0x10000 ? 2 : 1;
1517     }
1518 
1519     void encodeViaWrite()(dchar c)
1520     {
1521         if (c < 0x10000)
1522         {
1523             write(cast(wchar) c);
1524             return;
1525         }
1526         // Above 0xFFFF: emit the offset from 0x10000 as two units, the upper
1527         // bits on top of 0xD800 and the low ten bits on top of 0xDC00.
1528         immutable size_t offset = c - 0x10000;
1529         write(cast(wchar)(0xD800 + (offset >> 10)));
1530         write(cast(wchar)(0xDC00 + (offset & 0x3FF)));
1531     }
1532 
1533     void skipViaRead()() // consumes one unit, or two when the first lies in 0xD800 .. 0xDFFF
1534     {
1535         immutable unit = read();
1536         immutable bool surrogate = unit >= 0xD800 && unit < 0xE000;
1537         if (surrogate) read();
1538     }
1539 
1540     dchar decodeViaRead()() // decodes one code point without validation
1541     {
1542         wchar lead = read();
1543         if (lead < 0xD800 || lead >= 0xE000) return cast(dchar) lead;
1544         wchar trail = read();
1545         trail &= 0x3FF;     // low ten bits of the offset
1546         lead &= 0x3FF;      // high ten bits of the offset
1547         return 0x10000 + (lead << 10) + trail;
1548     }
1549 
1550     dchar safeDecodeViaRead()() // validating decode: yields INVALID_SEQUENCE for a broken pair
1551     {
1552         wchar lead = read();
1553         if (lead < 0xD800 || lead >= 0xE000) return cast(dchar) lead;
1554         // a unit from 0xDC00 up cannot start a pair, and a pair needs a second unit
1555         if (lead >= 0xDC00 || !canRead) return INVALID_SEQUENCE;
1556         wchar trail = peek();
1557         if (trail < 0xDC00 || trail >= 0xE000) return INVALID_SEQUENCE;
1558         trail = read();
1559         trail &= 0x3FF;
1560         lead &= 0x3FF;
1561         return 0x10000 + (lead << 10) + trail;
1562     }
1563 
1564     dchar decodeReverseViaRead()() // as decodeViaRead, but the unit read second supplies the high bits
1565     {
1566         wchar trail = read();
1567         if (trail < 0xD800 || trail >= 0xE000) return cast(dchar) trail;
1568         wchar lead = read();
1569         lead &= 0x3FF;
1570         trail &= 0x3FF;
1571         return 0x10000 + (lead << 10) + trail;
1572     }
1573 
1574     @property EString replacementSequence() @safe @nogc pure nothrow // U+FFFD as a single unit
1575     {
1576         return "\uFFFD"w;
1577     }
1578 
1579     mixin EncoderFunctions;
1580 }
1581 
1582 //=============================================================================
1583 //          UTF-32
1584 //=============================================================================
1585 
1586 template EncoderInstance(CharType : dchar) // UTF-32 encoder: one code unit per code point
1587 {
1588     alias E = dchar;
1589     alias EString = dstring;
1590 
1591     @property string encodingName() @safe @nogc pure nothrow
1592     {
1593         return "UTF-32";
1594     }
1595 
1596     bool canEncode(dchar c) @safe @nogc pure nothrow // every valid code point is encodable
1597     {
1598         return c.isValidCodePoint;
1599     }
1600 
1601     bool isValidCodeUnit(dchar c) @safe @nogc pure nothrow // a unit is valid iff it is a valid code point
1602     {
1603         return c.isValidCodePoint;
1604     }
1605 
1606     size_t encodedLength(dchar c) @safe @nogc pure nothrow // always a single code unit
1607     in
1608     {
1609         assert(canEncode(c));
1610     }
1611     do
1612     {
1613         return 1;
1614     }
1615 
1616     void encodeViaWrite()(dchar c) // the code point is written as is
1617     {
1618         write(c);
1619     }
1620 
1621     void skipViaRead()() // a single read skips one code point
1622     {
1623         cast(void) read();
1624     }
1625 
1626     dchar decodeViaRead()() // no validation: the unit read is the result
1627     {
1628         return cast(dchar) read();
1629     }
1630 
1631     dchar safeDecodeViaRead()() // the unit read, or INVALID_SEQUENCE if it is not a valid code point
1632     {
1633         immutable unit = read();
1634         return !isValidCodePoint(unit) ? INVALID_SEQUENCE : unit;
1635     }
1636 
1637     dchar decodeReverseViaRead()() // identical to decodeViaRead: there is nothing to reassemble
1638     {
1639         return cast(dchar) read();
1640     }
1641 
1642     @property EString replacementSequence() @safe @nogc pure nothrow // U+FFFD as a single unit
1643     {
1644         return "\uFFFD"d;
1645     }
1646 
1647     mixin EncoderFunctions;
1648 }
1649 
1650 //=============================================================================
1651 // Below are forwarding functions which expose the encoder functionality to the user
1652 
1653 /**
1654  Tests whether `c` is a valid code point.
1655 
1656  A value is accepted when it is below 0x110000 and lies outside the
1657  surrogate range 0xD800 .. 0xDFFF. The non-character code points U+FFFE
1658  and U+FFFF are valid code points and are therefore accepted as well.
1659 
1660  Supersedes:
1661  This function supersedes `std.utf.startsValidDchar()`.
1662 
1663  Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
1664  WINDOWS-1251, WINDOWS-1252
1665 
1666  Params:
1667     c = the code point to be tested
1668  */
1669 bool isValidCodePoint(dchar c) @safe @nogc pure nothrow
1670 {
1671     return c < 0x110000 && !(c >= 0xD800 && c < 0xE000);
1672 }
1673 
1674 /**
1675  Returns the name of the encoding selected by the code unit type `T`.
1676 
1677  `T` must be specified explicitly, since there is no argument to deduce it from.
1678 
1679  Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
1680  WINDOWS-1251, WINDOWS-1252
1681  */
1682 @property string encodingName(T)()
1683 {
1684     alias Encoder = EncoderInstance!T;
1685     return Encoder.encodingName;
1686 }
1687 
1688 ///
1689 @safe unittest
1690 {
1691     assert(encodingName!AsciiChar == "ASCII");
1692     assert(encodingName!Latin1Char == "ISO-8859-1");
1693     assert(encodingName!Latin2Char == "ISO-8859-2");
1694     assert(encodingName!Windows1250Char == "windows-1250");
1695     assert(encodingName!Windows1251Char == "windows-1251");
1696     assert(encodingName!Windows1252Char == "windows-1252");
1697     assert(encodingName!char == "UTF-8");
1698     assert(encodingName!wchar == "UTF-16");
1699     assert(encodingName!dchar == "UTF-32");
1700 }
1701 
1702 /**
1703  Returns true iff the code point `c` can be represented in the encoding
1704  selected by the code unit type `E`.
1705 
1706  `E` must be specified explicitly, since it cannot be deduced from `c`.
1707 
1708  Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
1709  WINDOWS-1251, WINDOWS-1252
1710  */
1711 bool canEncode(E)(dchar c)
1712 {
1713     alias Encoder = EncoderInstance!E;
1714     return Encoder.canEncode(c);
1715 }
1716 
1717 ///
1718 @safe pure unittest
1719 {
1720     assert( canEncode!Latin1Char('A'));
1721     assert( canEncode!Latin2Char('A'));
1722     assert(!canEncode!AsciiChar('\u00A0'));         // outside 7-bit ASCII
1723     assert( canEncode!Latin1Char('\u00A0'));
1724     assert( canEncode!Latin2Char('\u00A0'));
1725     assert( canEncode!Windows1250Char('\u20AC'));
1726     assert(!canEncode!Windows1250Char('\u20AD'));
1727     assert(!canEncode!Windows1250Char('\uFFFD'));   // U+FFFD itself is not encodable
1728     assert( canEncode!Windows1251Char('\u0402'));
1729     assert(!canEncode!Windows1251Char('\u20AD'));
1730     assert(!canEncode!Windows1251Char('\uFFFD'));
1731     assert( canEncode!Windows1252Char('\u20AC'));
1732     assert(!canEncode!Windows1252Char('\u20AD'));
1733     assert(!canEncode!Windows1252Char('\uFFFD'));
1734     assert(!canEncode!char(cast(dchar) 0x110000));  // not a valid code point
1735 }
1736 
1737 /// How to check an entire string
1738 @safe pure unittest
1739 {
1740     import std.algorithm.searching : find;
1741     import std.utf : byDchar;
1742 
1743     auto text = "The quick brown fox".byDchar;
1744     auto offending = text.find!(x => !canEncode!AsciiChar(x));
1745     // an empty remainder means that no code point was rejected
1746     assert(offending.empty);
1747 }
1748 
1749 /**
1750  Returns true if the code unit is legal in its encoding. For example, the byte
1751  0x80 is not legal in ASCII, whose code units all lie in the range 0x00 to 0x7F.
1752 
1753  Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
1754  WINDOWS-1251, WINDOWS-1252
1755 
1756  Params:
1757     c = the code unit to be tested
1758  */
1759 bool isValidCodeUnit(E)(E c)
1760 {
1761     alias Encoder = EncoderInstance!E;
1762     return Encoder.isValidCodeUnit(c);
1763 }
1764 
1765 ///
1766 @system pure unittest
1767 {
1768     assert(!isValidCodeUnit(cast(AsciiChar) 0xA0));
1769     assert( isValidCodeUnit(cast(Windows1250Char) 0x80));
1770     assert(!isValidCodeUnit(cast(Windows1250Char) 0x81));
1771     assert( isValidCodeUnit(cast(Windows1251Char) 0x80));
1772     assert(!isValidCodeUnit(cast(Windows1251Char) 0x98));
1773     assert( isValidCodeUnit(cast(Windows1252Char) 0x80));
1774     assert(!isValidCodeUnit(cast(Windows1252Char) 0x81));
1775     assert(!isValidCodeUnit(cast(char) 0xC0));
1776     assert(!isValidCodeUnit(cast(char) 0xFF));
1777     assert( isValidCodeUnit(cast(wchar) 0xD800));   // a surrogate is a legal UTF-16 unit ...
1778     assert(!isValidCodeUnit(cast(dchar) 0xD800));   // ... but not a legal UTF-32 unit
1779 }
1780 
1781 /**
1782  Returns true if the whole string is encoded correctly
1783 
1784  Supersedes:
1785  This function supersedes std.utf.validate(). Note, however, that this
1786  function reports the outcome as a bool, whereas the older function
1787  would throw an exception on invalid input.
1788 
1789  Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
1790  WINDOWS-1251, WINDOWS-1252
1791 
1792  Params:
1793     s = the string to be tested
1794  */
1795 bool isValid(E)(const(E)[] s)
1796 {
1797     return validLength(s) == s.length;
1798 }
1799 
1800 ///
1801 @system pure unittest
1802 {
1803     assert( isValid("\u20AC100"));
1804     assert(!isValid(cast(char[3])[0xA7, 0x85, 0xAF]));
1805 }
1806 
1807 /**
1808  Returns the length, in code units, of the longest validly encoded
1809  substring that starts at the first code unit.
1810 
1811  Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
1812  WINDOWS-1251, WINDOWS-1252
1813 
1814  Params:
1815     s = the string to be tested
1816  */
1817 size_t validLength(E)(const(E)[] s)
1818 {
1819     size_t validSoFar = 0;
1820     for (size_t remaining = s.length; remaining != 0; remaining = s.length)
1821     {
1822         if (EncoderInstance!E.safeDecode(s) == INVALID_SEQUENCE)
1823             break;
1824         validSoFar += remaining - s.length; // safeDecode shortened s by the units it consumed
1825     }
1826     return validSoFar;
1827 }
1828 
1829 /**
1830  Sanitizes a string by replacing malformed code unit sequences with valid
1831  code unit sequences. The result is guaranteed to be valid for this encoding.
1832 
1833  If the input string is already valid, this function returns the original,
1834  otherwise it constructs a new string by replacing all illegal code unit
1835  sequences with the encoding's replacement character. Invalid sequences will
1836  be replaced with the Unicode replacement character (U+FFFD) if the
1837  character repertoire contains it, otherwise invalid sequences will be
1838  replaced with '?'.
1839 
1840  Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
1841  WINDOWS-1251, WINDOWS-1252
1842 
1843  Params:
1844     s = the string to be sanitized
1845  */
immutable(E)[] sanitize(E)(immutable(E)[] s)
{
    alias Encoder = EncoderInstance!(E);

    // Fast path: an already valid string is handed back unchanged.
    size_t n = validLength(s);
    if (n == s.length) return s;

    auto repSeq = Encoder.replacementSequence;

    // Pass 1: compute an upper bound for the output length. Every invalid
    // sequence adds one replacement sequence on top of the input length;
    // overestimating is harmless because the result is trimmed at the end.
    size_t len = s.length;
    for (const(E)[] scan = s[n..$]; scan.length != 0; scan = scan[validLength(scan)..$])
    {
        immutable c = Encoder.safeDecode(scan);
        assert(c == INVALID_SEQUENCE);
        len += repSeq.length;
    }

    // Pass 2: copy the valid prefix, then alternate between emitting a
    // replacement sequence and copying the next valid run.
    E[] array = new E[len];
    array[0 .. n] = s[0 .. n];
    size_t offset = n;

    const(E)[] rest = s[n..$];
    while (rest.length != 0)
    {
        // rest always starts at an invalid sequence here; consume it.
        immutable c = Encoder.safeDecode(rest);
        assert(c == INVALID_SEQUENCE);

        immutable afterRep = offset + repSeq.length;
        array[offset .. afterRep] = repSeq[];

        immutable run = validLength(rest);
        offset = afterRep + run;
        array[afterRep .. offset] = rest[0 .. run];
        rest = rest[run..$];
    }
    return cast(immutable(E)[])array[0 .. offset];
}
1884 
///
@system pure unittest
{
    // The malformed bytes F0 80 are replaced by EF BF BD (U+FFFD in UTF-8).
    string broken = "hello \xF0\x80world";
    string repaired = sanitize(broken);
    assert(repaired == "hello \xEF\xBF\xBDworld");
}
1890 
1891 /**
1892  Returns the length of the first encoded sequence.
1893 
1894  The input to this function MUST be validly encoded.
1895  This is enforced by the function's in-contract.
1896 
1897  Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
1898  WINDOWS-1251, WINDOWS-1252
1899 
1900  Params:
1901  s = the string to be sliced
1902  */
size_t firstSequence(E)(const(E)[] s)
in
{
    assert(s.length != 0);
    // Probe a copy so that s itself is left intact for the body.
    const(E)[] probe = s;
    immutable leading = safeDecode(probe);
    assert(leading != INVALID_SEQUENCE);
}
do
{
    // Skip one sequence on a copy and measure how far it moved.
    const(E)[] rest = s;
    EncoderInstance!(E).skip(rest);
    return s.length - rest.length;
}
1916 
///
@system pure unittest
{
    // A leading euro sign spans as many code units as "\u20AC" alone.
    immutable euroUnits = "\u20AC".length;
    assert(firstSequence("\u20AC1000") == euroUnits);

    // A leading ASCII letter spans as many code units as "h" alone.
    immutable asciiUnits = "h".length;
    assert(firstSequence("hel") == asciiUnits);
}
1923 
1924 /**
1925  Returns the length of the last encoded sequence.
1926 
1927  The input to this function MUST be validly encoded.
1928  This is enforced by the function's in-contract.
1929 
1930  Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
1931  WINDOWS-1251, WINDOWS-1252
1932 
1933  Params:
1934     s = the string to be sliced
1935  */
size_t lastSequence(E)(const(E)[] s)
in
{
    assert(s.length != 0);
    assert(isValid(s));
}
do
{
    // Decode once from the back; the amount s shrank by is the length
    // of the final sequence.
    immutable total = s.length;
    EncoderInstance!(E).decodeReverse(s);
    return total - s.length;
}
1948 
///
@system pure unittest
{
    // A trailing euro sign spans as many code units as "\u20AC" alone.
    immutable euroUnits = "\u20AC".length;
    assert(lastSequence("1000\u20AC") == euroUnits);

    // A trailing 'ö' spans as many code units as "ö" alone.
    immutable umlautUnits = "ö".length;
    assert(lastSequence("hellö") == umlautUnits);
}
1955 
1956 /**
1957  Returns the array index at which the (n+1)th code point begins.
1958 
1959  The input to this function MUST be validly encoded.
1960  This is enforced by the function's in-contract.
1961 
1962  Supersedes:
1963  This function supersedes std.utf.toUTFindex().
1964 
1965  Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
1966  WINDOWS-1251, WINDOWS-1252
1967 
1968  Params:
1969     s = the string to be counted
1970     n = the current code point index
1971  */
ptrdiff_t index(E)(const(E)[] s,int n)
in
{
    assert(isValid(s));
    assert(n >= 0);
}
do
{
    immutable total = s.length;

    // Iterate over a signed range. The previous size_t counter compared
    // against the int n, so a negative n (reachable when contracts are
    // compiled out) was converted to a huge unsigned bound and the loop
    // skipped far past the end of the string. With 0 .. n a negative n
    // simply performs no iterations.
    foreach (_; 0 .. n)
        EncoderInstance!(E).skip(s);

    // The number of code units skipped is the index of the (n+1)th code point.
    return total - s.length;
}
1984 
///
@system pure unittest
{
    // The second code point of "\u20AC100" starts at code unit 3.
    string euroPrice = "\u20AC100";
    assert(index(euroPrice, 1) == 3);

    // The third code point of "hällo" starts at code unit 3.
    string umlautWord = "hällo";
    assert(index(umlautWord, 2) == 3);
}
1991 
1992 /**
1993  Decodes a single code point.
1994 
1995  This function removes one or more code units from the start of a string,
1996  and returns the decoded code point which those code units represent.
1997 
1998  The input to this function MUST be validly encoded.
1999  This is enforced by the function's in-contract.
2000 
2001  Supersedes:
2002  This function supersedes std.utf.decode(), however, note that the
2003  function codePoints() supersedes it more conveniently.
2004 
2005  Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
2006  WINDOWS-1251, WINDOWS-1252
2007 
2008  Params:
2009     s = the string whose first code point is to be decoded
2010  */
dchar decode(S)(ref S s)
in
{
    assert(s.length != 0);
    // Validate on a copy so the caller's s is not advanced by the check.
    auto probe = s;
    immutable leading = safeDecode(probe);
    assert(leading != INVALID_SEQUENCE);
}
do
{
    // The element type of s selects the encoder.
    alias Encoder = EncoderInstance!(typeof(s[0]));
    return Encoder.decode(s);
}
2022 
2023 /**
2024  Decodes a single code point from the end of a string.
2025 
2026  This function removes one or more code units from the end of a string,
2027  and returns the decoded code point which those code units represent.
2028 
2029  The input to this function MUST be validly encoded.
2030  This is enforced by the function's in-contract.
2031 
2032  Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
2033  WINDOWS-1251, WINDOWS-1252
2034 
2035  Params:
2036     s = the string whose last code point is to be decoded
2037  */
dchar decodeReverse(E)(ref const(E)[] s)
in
{
    assert(s.length != 0);
    assert(isValid(s));
}
do
{
    // Hand over to the encoder for E; s is passed by reference and is
    // shortened from the back.
    alias Encoder = EncoderInstance!(E);
    return Encoder.decodeReverse(s);
}
2048 
2049 /**
2050  Decodes a single code point. The input does not have to be valid.
2051 
2052  This function removes one or more code units from the start of a string,
2053  and returns the decoded code point which those code units represent.
2054 
2055  This function will accept an invalidly encoded string as input.
2056  If an invalid sequence is found at the start of the string, this
2057  function will remove it, and return the value INVALID_SEQUENCE.
2058 
2059  Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
2060  WINDOWS-1251, WINDOWS-1252
2061 
2062  Params:
2063     s = the string whose first code point is to be decoded
2064  */
dchar safeDecode(S)(ref S s)
in
{
    assert(s.length != 0);
}
do
{
    // The element type of s selects the encoder.
    alias Encoder = EncoderInstance!(typeof(s[0]));
    return Encoder.safeDecode(s);
}
2074 
2075 /**
2076  Returns the number of code units required to encode a single code point.
2077 
2078  The input to this function MUST be a valid code point.
2079  This is enforced by the function's in-contract.
2080 
2081  The type of the output cannot be deduced. Therefore, it is necessary to
2082  explicitly specify the encoding as a template parameter.
2083 
2084  Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
2085  WINDOWS-1251, WINDOWS-1252
2086 
2087  Params:
2088     c = the code point to be encoded
2089  */
size_t encodedLength(E)(dchar c)
in
{
    assert(isValidCodePoint(c));
}
do
{
    // E must be given explicitly by the caller; it selects the encoder.
    alias Encoder = EncoderInstance!(E);
    return Encoder.encodedLength(c);
}
2099 
2100 /**
2101  Encodes a single code point.
2102 
2103  This function encodes a single code point into one or more code units.
2104  It returns a string containing those code units.
2105 
2106  The input to this function MUST be a valid code point.
2107  This is enforced by the function's in-contract.
2108 
2109  The type of the output cannot be deduced. Therefore, it is necessary to
2110  explicitly specify the encoding as a template parameter.
2111 
2112  Supersedes:
2113  This function supersedes std.utf.encode(), however, note that the
2114  function codeUnits() supersedes it more conveniently.
2115 
2116  Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
2117  WINDOWS-1251, WINDOWS-1252
2118 
2119  Params:
2120     c = the code point to be encoded
2121  */
E[] encode(E)(dchar c)
in
{
    assert(isValidCodePoint(c));
}
do
{
    // E must be given explicitly by the caller; it selects the encoder.
    alias Encoder = EncoderInstance!(E);
    return Encoder.encode(c);
}
2131 
2132 /**
2133  Encodes a single code point into an array.
2134 
2135  This function encodes a single code point into one or more code units.
2136  The code units are stored in a user-supplied fixed-size array,
2137  which must be passed by reference.
2138 
2139  The input to this function MUST be a valid code point.
2140  This is enforced by the function's in-contract.
2141 
2142  The type of the output cannot be deduced. Therefore, it is necessary to
2143  explicitly specify the encoding as a template parameter.
2144 
2145  Supersedes:
2146  This function supersedes std.utf.encode(), however, note that the
2147  function codeUnits() supersedes it more conveniently.
2148 
2149  Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
2150  WINDOWS-1251, WINDOWS-1252
2151 
2152  Params:
2153     c     = the code point to be encoded
2154     array = the destination array
2155 
2156  Returns:
2157           the number of code units written to the array
2158  */
size_t encode(E)(dchar c, E[] array)
in
{
    assert(isValidCodePoint(c));
}
do
{
    // Encode into a second slice over the same memory; the amount by which
    // that slice shrinks is reported as the number of code units written.
    E[] unwritten = array;
    EncoderInstance!(E).encode(c, unwritten);
    return array.length - unwritten.length;
}
2170 
/*
Encodes `c` in units of type `E` and writes the result to the
output range `range`. Returns the number of `E`s written.

Only `char` (UTF-8), `wchar` (UTF-16) and `dchar` (UTF-32) targets are
supported; any other `E` is rejected at compile time.

Every code unit is written with the free function `put(range, unit)`.
Calling `range.put(unit)` UFCS-style is discouraged by std.range.primitives
and was previously used inconsistently by the wchar/dchar branches.
 */
size_t encode(E, R)(dchar c, auto ref R range)
if (isNativeOutputRange!(R, E))
{
    static if (is(immutable E == immutable char))
    {
        // UTF-8: one to four code units depending on the magnitude of c.
        if (c <= 0x7F)
        {
            put(range, cast(char) c);
            return 1;
        }
        if (c <= 0x7FF)
        {
            put(range, cast(char)(0xC0 | (c >> 6)));
            put(range, cast(char)(0x80 | (c & 0x3F)));
            return 2;
        }
        if (c <= 0xFFFF)
        {
            put(range, cast(char)(0xE0 | (c >> 12)));
            put(range, cast(char)(0x80 | ((c >> 6) & 0x3F)));
            put(range, cast(char)(0x80 | (c & 0x3F)));
            return 3;
        }
        if (c <= 0x10FFFF)
        {
            put(range, cast(char)(0xF0 | (c >> 18)));
            put(range, cast(char)(0x80 | ((c >> 12) & 0x3F)));
            put(range, cast(char)(0x80 | ((c >> 6) & 0x3F)));
            put(range, cast(char)(0x80 | (c & 0x3F)));
            return 4;
        }
        else
        {
            // Values above 0x10FFFF cannot be encoded.
            assert(0);
        }
    }
    else static if (is(immutable E == immutable wchar))
    {
        // UTF-16: values up to 0xFFFF are written as a single code unit.
        if (c <= 0xFFFF)
        {
            put(range, cast(wchar) c);
            return 1;
        }
        // Otherwise split the offset from 0x10000 into two 10-bit halves,
        // based at 0xD800 and 0xDC00 respectively.
        immutable offset = c - 0x10000;
        put(range, cast(wchar) (((offset >> 10) & 0x3FF) + 0xD800));
        put(range, cast(wchar) ((offset & 0x3FF) + 0xDC00));
        return 2;
    }
    else static if (is(immutable E == immutable dchar))
    {
        // UTF-32: the code point is its own single code unit.
        put(range, c);
        return 1;
    }
    else
    {
        static assert(0);
    }
}
2232 
@safe pure unittest
{
    import std.array : Appender;

    // 'T' takes exactly one code unit whether the target unit type is
    // char, wchar or dchar; the same appender serves as sink for all three.
    Appender!(char[]) sink;
    assert(encode!(char)('T', sink) == 1);
    assert(encode!(wchar)('T', sink) == 1);
    assert(encode!(dchar)('T', sink) == 1);
}
2241 
2242 /**
2243  Encodes a single code point to a delegate.
2244 
2245  This function encodes a single code point into one or more code units.
2246  The code units are passed one at a time to the supplied delegate.
2247 
2248  The input to this function MUST be a valid code point.
2249  This is enforced by the function's in-contract.
2250 
2251  The type of the output cannot be deduced. Therefore, it is necessary to
2252  explicitly specify the encoding as a template parameter.
2253 
2254  Supersedes:
2255  This function supersedes std.utf.encode(), however, note that the
2256  function codeUnits() supersedes it more conveniently.
2257 
2258  Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
2259  WINDOWS-1251, WINDOWS-1252
2260 
2261  Params:
2262     c  = the code point to be encoded
2263     dg = the delegate to invoke for each code unit
2264  */
void encode(E)(dchar c, void delegate(E) dg)
in
{
    assert(isValidCodePoint(c));
}
do
{
    // The encoder for E feeds the resulting code units to dg.
    alias Encoder = EncoderInstance!(E);
    Encoder.encode(c, dg);
}
2274 
2275 /**
2276 Encodes the contents of `s` in units of type `Tgt`, writing the result to an
2277 output range.
2278 
2279 Returns: The number of `Tgt` elements written.
2280 Params:
2281 Tgt = Element type of `range`.
2282 s = Input array.
2283 range = Output range.
2284  */
size_t encode(Tgt, Src, R)(in Src[] s, R range)
{
    size_t result;
    static if (is(immutable Src == immutable char) || is(immutable Src == immutable wchar))
    {
        // char and wchar strings are multi-unit encodings (UTF-8 / UTF-16):
        // a code point may span several elements of s. Iterating element by
        // element would re-encode every individual code unit as if it were
        // a code point of its own and corrupt all non-ASCII text, so decode
        // whole code points first. As with the other decoding functions in
        // this module, s must be validly encoded (checked by decode's
        // in-contract).
        const(Src)[] rest = s;
        while (rest.length != 0)
        {
            result += encode!(Tgt)(decode(rest), range);
        }
    }
    else
    {
        // For every other source type each element is passed on unchanged
        // as one code point.
        foreach (c; s)
        {
            result += encode!(Tgt)(c, range);
        }
    }
    return result;
}
2294 
2295 /**
2296  Returns a foreachable struct which can bidirectionally iterate over all
2297  code points in a string.
2298 
2299  The input to this function MUST be validly encoded.
2300  This is enforced by the function's in-contract.
2301 
2302  You can foreach either
2303  with or without an index. If an index is specified, it will be initialized
2304  at each iteration with the offset into the string at which the code point
2305  begins.
2306 
2307  Supersedes:
2308  This function supersedes std.utf.decode().
2309 
2310  Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
2311  WINDOWS-1251, WINDOWS-1252
2312 
2313  Params:
2314     s = the string to be decoded
2315 
2316  Example:
2317  --------------------------------------------------------
2318  string s = "hello world";
2319  foreach (c;codePoints(s))
2320  {
2321      // do something with c (which will always be a dchar)
2322  }
2323  --------------------------------------------------------
2324 
2325  Note that, currently, foreach (c;codePoints(s)) is superior to foreach (c;s)
2326  in that the latter will fall over on encountering U+FFFF.
2327  */
CodePoints!(E) codePoints(E)(immutable(E)[] s)
in
{
    assert(isValid(s));
}
do
{
    // Wrap the string in the foreachable helper struct.
    auto iterable = CodePoints!(E)(s);
    return iterable;
}
2337 
///
@system unittest
{
    // Rebuilding an ASCII string from its code points yields the same string.
    string source = "hello";
    string rebuilt;
    foreach (c; codePoints(source))
        rebuilt ~= cast(char) c;
    assert(source == rebuilt);
}
2349 
2350 /**
2351  Returns a foreachable struct which can bidirectionally iterate over all
2352  code units in a code point.
2353 
2354  The input to this function MUST be a valid code point.
2355  This is enforced by the function's in-contract.
2356 
2357  The type of the output cannot be deduced. Therefore, it is necessary to
2358  explicitly specify the encoding type in the template parameter.
2359 
2360  Supersedes:
2361  This function supersedes std.utf.encode().
2362 
2363  Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
2364  WINDOWS-1251, WINDOWS-1252
2365 
2366  Params:
2367     c = the code point to be encoded
2368  */
CodeUnits!(E) codeUnits(E)(dchar c)
in
{
    assert(isValidCodePoint(c));
}
do
{
    // Wrap the code point in the foreachable helper struct for encoding E.
    auto iterable = CodeUnits!(E)(c);
    return iterable;
}
2378 
///
@system unittest
{
    // The euro sign is encoded in UTF-8 as the three bytes E2 82 AC.
    char[] units;
    foreach (unit; codeUnits!(char)(cast(dchar)'\u20AC'))
        units ~= unit;
    assert(units == "\xE2\x82\xAC");
}
2392 
2393 /**
2394  Convert a string from one encoding to another.
2395 
2396  Supersedes:
2397  This function supersedes std.utf.toUTF8(), std.utf.toUTF16() and
2398  std.utf.toUTF32()
2399  (but note that to!() supersedes it more conveniently).
2400 
2401  Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
2402  WINDOWS-1251, WINDOWS-1252
2403 
2404  Params:
2405     s = Source string. $(B Must) be validly encoded.
2406         This is enforced by the function's in-contract.
2407     r = Destination string
2408 
2409  See_Also:
2410     $(REF to, std,conv)
2411  */
void transcode(Src, Dst)(Src[] s, out Dst[] r)
in
{
    assert(isValid(s));
}
do
{
    static if (is(Src == Dst) && is(Src == immutable))
    {
        // Identical immutable element types: hand the input back as is,
        // without copying.
        r = s;
    }
    else static if (is(immutable Src == immutable AsciiChar))
    {
        // ASCII input is re-read as char data and handled by the general path.
        transcode(cast(const(char)[])s, r);
    }
    else
    {
        // Free space (in destination code units) that is ensured before
        // each code point is encoded.
        static if (is(immutable Dst == immutable wchar))
            immutable minReservePlace = 2;
        else static if (is(immutable Dst == immutable dchar))
            immutable minReservePlace = 1;
        else
            immutable minReservePlace = 6;

        // storage owns the output; unwritten is the still-free tail of it
        // and is advanced by the encoder.
        auto storage = new Unqual!Dst[s.length];
        auto unwritten = storage;

        while (s.length != 0)
        {
            if (unwritten.length < minReservePlace)
            {
                // Grow the storage, then re-derive the free tail from the
                // number of code units already written, since growing may
                // have moved the array.
                immutable written = storage.length - unwritten.length;
                storage.length += s.length + minReservePlace;
                unwritten = storage[written .. $];
            }
            EncoderInstance!(Unqual!Dst).encode(decode(s), unwritten);
        }

        // Trim the unused tail.
        r = cast(Dst[]) storage[0 .. storage.length - unwritten.length];
    }
}
2459 
///
@system pure unittest
{
    // transcode from UTF-8 to UTF-16
    wstring utf16;
    transcode("hello world", utf16);
    assert(utf16 == "hello world"w);

    // transcode from UTF-16 to ISO-8859-1
    Latin1String latin1;
    transcode(utf16, latin1);
    assert(latin1 == "hello world");
}
2473 
@system pure unittest
{
    import std.meta;
    import std.range;

    // Sends original through string type S, then through D, then back to
    // UTF-8, and expects the text to come back unchanged.
    static void roundTrip(S, D)(string original)
    {
        S viaS;
        D viaD;
        string back;
        transcode(original, viaS);
        transcode(viaS, viaD);
        transcode(viaD, back);
        assert(original == back);
    }

    {
        import std.conv : to;

        // Text made of ASCII characters only is tried with every pair of
        // string types.
        string asciiCharString = to!string(iota(0, 128, 1));

        alias Types = AliasSeq!(string, Latin1String, Latin2String, AsciiString,
            Windows1250String, Windows1251String, Windows1252String, dstring, wstring);
        foreach (S; Types)
            foreach (D; Types)
                roundTrip!(S, D)(asciiCharString);
    }
    {
        // Czech text is tried with the UTF string types only.
        string czechChars = "Příliš žluťoučký kůň úpěl ďábelské ódy.";
        alias Types = AliasSeq!(string, dstring, wstring);
        foreach (S; Types)
            foreach (D; Types)
                roundTrip!(S, D)(czechChars);
    }
}
2513 
@system unittest // mutable/const input/output
{
    import std.meta : AliasSeq;

    // Every combination of input mutability and output element qualifier
    // must produce the Latin-1 bytes of the input text.
    static foreach (O; AliasSeq!(Latin1Char, const Latin1Char, immutable Latin1Char))
    {{
        O[] result;

        char[] fromMutable = "äbc".dup;
        transcode(fromMutable, result);
        assert(result == [0xE4, 'b', 'c']);

        const char[] fromConst = "öbc";
        transcode(fromConst, result);
        assert(result == [0xF6, 'b', 'c']);

        immutable char[] fromImmutable = "übc";
        transcode(fromImmutable, result);
        assert(result == [0xFC, 'b', 'c']);
    }}

    // Make sure that const/mutable input is copied.
    static foreach (C; AliasSeq!(char, const char))
    {{
        C[] source = "foo".dup;
        C[] copy;
        transcode(source, copy);
        assert(source == copy);
        assert(source !is copy);
    }}

    // But immutable input should not be copied.
    string original = "foo";
    string same;
    transcode(original, same);
    assert(original is same);
}
2551 
2552 //=============================================================================
2553 
/** The base class for exceptions thrown by this module */
class EncodingException : Exception
{
    /// Creates the exception with the given message.
    this(string msg) @safe pure
    {
        super(msg);
    }
}
2556 
class UnrecognizedEncodingException : EncodingException
{
    // The constructor is private, so the exception can only be created
    // from within this module.
    private this(string msg) @safe pure
    {
        super(msg);
    }
}
2561 
2562 /** Abstract base class of all encoding schemes */
2563 abstract class EncodingScheme
2564 {
2565     import std.uni : toLower;
2566 
2567     /**
2568      * Registers a subclass of EncodingScheme.
2569      *
2570      * This function allows user-defined subclasses of EncodingScheme to
2571      * be declared in other modules.
2572      *
2573      * Params:
2574      *     Klass = The subclass of EncodingScheme to register.
2575      *
2576      * Example:
2577      * ----------------------------------------------
2578      * class Amiga1251 : EncodingScheme
2579      * {
2580      *     shared static this()
2581      *     {
2582      *         EncodingScheme.register!Amiga1251;
2583      *     }
2584      * }
2585      * ----------------------------------------------
2586      */
2587     static void register(Klass:EncodingScheme)()
2588     {
2589         scope scheme = new Klass();
2590         foreach (encodingName;scheme.names())
2591         {
2592             supported[toLower(encodingName)] = () => new Klass();
2593         }
2594     }
2595 
2596     deprecated("Please pass the EncodingScheme subclass as template argument instead.")
2597     static void register(string className)
2598     {
2599         auto scheme = cast(EncodingScheme) ClassInfo.find(className).create();
2600         if (scheme is null)
2601             throw new EncodingException("Unable to create class "~className);
2602         foreach (encodingName;scheme.names())
2603         {
2604             supportedFactories[toLower(encodingName)] = className;
2605         }
2606     }
2607 
2608     /**
2609      * Obtains a subclass of EncodingScheme which is capable of encoding
2610      * and decoding the named encoding scheme.
2611      *
2612      * This function is only aware of EncodingSchemes which have been
2613      * registered with the register() function.
2614      *
2615      * Example:
2616      * ---------------------------------------------------
2617      * auto scheme = EncodingScheme.create("Amiga-1251");
2618      * ---------------------------------------------------
2619      */
2620     static EncodingScheme create(string encodingName)
2621     {
2622         static bool registerDefaultEncodings()
2623         {
2624             EncodingScheme.register!EncodingSchemeASCII;
2625             EncodingScheme.register!EncodingSchemeLatin1;
2626             EncodingScheme.register!EncodingSchemeLatin2;
2627             EncodingScheme.register!EncodingSchemeWindows1250;
2628             EncodingScheme.register!EncodingSchemeWindows1251;
2629             EncodingScheme.register!EncodingSchemeWindows1252;
2630             EncodingScheme.register!EncodingSchemeUtf8;
2631             EncodingScheme.register!EncodingSchemeUtf16Native;
2632             EncodingScheme.register!EncodingSchemeUtf32Native;
2633             return true;
2634         }
2635 
2636         static shared bool initialized;
2637         import std.concurrency : initOnce;
2638         initOnce!initialized(registerDefaultEncodings());
2639         encodingName = toLower(encodingName);
2640 
2641         if (auto p = encodingName in supported)
2642             return (*p)();
2643 
2644         auto p = encodingName in supportedFactories;
2645         if (p is null)
2646             throw new EncodingException("Unrecognized Encoding: "~encodingName);
2647         string className = *p;
2648         auto scheme = cast(EncodingScheme) ClassInfo.find(className).create();
2649         if (scheme is null) throw new EncodingException("Unable to create class "~className);
2650         return scheme;
2651     }
2652 
2653     const
2654     {
2655         /**
2656          * Returns the standard name of the encoding scheme
2657          */
2658         abstract override string toString();
2659 
2660         /**
2661          * Returns an array of all known names for this encoding scheme
2662          */
2663         abstract string[] names();
2664 
2665         /**
2666          * Returns true if the character c can be represented
2667          * in this encoding scheme.
2668          */
2669         abstract bool canEncode(dchar c);
2670 
2671         /**
2672          * Returns the number of ubytes required to encode this code point.
2673          *
2674          * The input to this function MUST be a valid code point.
2675          *
2676          * Params:
2677          *    c = the code point to be encoded
2678          *
2679          * Returns:
2680          *    the number of ubytes required.
2681          */
2682         abstract size_t encodedLength(dchar c);
2683 
2684         /**
2685          * Encodes a single code point into a user-supplied, fixed-size buffer.
2686          *
2687          * This function encodes a single code point into one or more ubytes.
2688          * The supplied buffer must be code unit aligned.
2689          * (For example, UTF-16LE or UTF-16BE must be wchar-aligned,
2690          * UTF-32LE or UTF-32BE must be dchar-aligned, etc.)
2691          *
2692          * The input to this function MUST be a valid code point.
2693          *
2694          * Params:
2695          *    c      = the code point to be encoded
2696          *    buffer = the destination array
2697          *
2698          * Returns:
2699          *    the number of ubytes written.
2700          */
2701         abstract size_t encode(dchar c, ubyte[] buffer);
2702 
2703         /**
2704          * Decodes a single code point.
2705          *
2706          * This function removes one or more ubytes from the start of an array,
2707          * and returns the decoded code point which those ubytes represent.
2708          *
2709          * The input to this function MUST be validly encoded.
2710          *
2711          * Params:
2712          *    s = the array whose first code point is to be decoded
2713          */
2714         abstract dchar decode(ref const(ubyte)[] s);
2715 
2716         /**
2717          * Decodes a single code point. The input does not have to be valid.
2718          *
2719          * This function removes one or more ubytes from the start of an array,
2720          * and returns the decoded code point which those ubytes represent.
2721          *
2722          * This function will accept an invalidly encoded array as input.
2723          * If an invalid sequence is found at the start of the string, this
2724          * function will remove it, and return the value INVALID_SEQUENCE.
2725          *
2726          * Params:
2727          *    s = the array whose first code point is to be decoded
2728          */
2729         abstract dchar safeDecode(ref const(ubyte)[] s);
2730 
2731         /**
2732          * Returns the sequence of ubytes to be used to represent
2733          * any character which cannot be represented in the encoding scheme.
2734          *
2735          * Normally this will be a representation of some substitution
2736          * character, such as U+FFFD or '?'.
2737          */
2738         abstract @property immutable(ubyte)[] replacementSequence();
2739     }
2740 
2741     /**
2742      * Returns true if the array is encoded correctly
2743      *
2744      * Params:
2745      *    s = the array to be tested
2746      */
2747     bool isValid(const(ubyte)[] s)
2748     {
2749         while (s.length != 0)
2750         {
2751             if (safeDecode(s) == INVALID_SEQUENCE)
2752                 return false;
2753         }
2754         return true;
2755     }
2756 
2757     /**
2758      * Returns the length of the longest possible substring, starting from
2759      * the first element, which is validly encoded.
2760      *
2761      * Params:
2762      *    s = the array to be tested
2763      */
2764     size_t validLength()(const(ubyte)[] s)
2765     {
2766         const(ubyte)[] r = s;
2767         const(ubyte)[] t = s;
2768         while (s.length != 0)
2769         {
2770             if (safeDecode(s) == INVALID_SEQUENCE) break;
2771             t = s;
2772         }
2773         return r.length - t.length;
2774     }
2775 
2776     /**
2777      * Sanitizes an array by replacing malformed ubyte sequences with valid
2778      * ubyte sequences. The result is guaranteed to be valid for this
2779      * encoding scheme.
2780      *
2781      * If the input array is already valid, this function returns the
2782      * original, otherwise it constructs a new array by replacing all illegal
2783      * sequences with the encoding scheme's replacement sequence.
2784      *
2785      * Params:
2786      *    s = the string to be sanitized
2787      */
2788     immutable(ubyte)[] sanitize()(immutable(ubyte)[] s)
2789     {
2790         auto n = validLength(s);
2791         if (n == s.length) return s;
2792 
2793         auto repSeq = replacementSequence;
2794 
2795         // Count how long the string needs to be.
2796         // Overestimating is not a problem
2797         auto len = s.length;
2798         const(ubyte)[] t = s[n..$];
2799         while (t.length != 0)
2800         {
2801             immutable c = safeDecode(t);
2802             assert(c == INVALID_SEQUENCE);
2803             len += repSeq.length;
2804             t = t[validLength(t)..$];
2805         }
2806 
2807         // Now do the write
2808         ubyte[] array = new ubyte[len];
2809         array[0 .. n] = s[0 .. n];
2810         auto offset = n;
2811 
2812         t = s[n..$];
2813         while (t.length != 0)
2814         {
2815             immutable c = safeDecode(t);
2816             assert(c == INVALID_SEQUENCE);
2817             array[offset .. offset+repSeq.length] = repSeq[];
2818             offset += repSeq.length;
2819             n = validLength(t);
2820             array[offset .. offset+n] = t[0 .. n];
2821             offset += n;
2822             t = t[n..$];
2823         }
2824         return cast(immutable(ubyte)[])array[0 .. offset];
2825     }
2826 
2827     /**
2828      * Returns the length of the first encoded sequence.
2829      *
2830      * The input to this function MUST be validly encoded.
2831      * This is enforced by the function's in-contract.
2832      *
2833      * Params:
2834      *    s = the array to be sliced
2835      */
2836     size_t firstSequence()(const(ubyte)[] s)
2837     in
2838     {
2839         assert(s.length != 0);
2840         const(ubyte)[] u = s;
2841         assert(safeDecode(u) != INVALID_SEQUENCE);
2842     }
2843     do
2844     {
2845         const(ubyte)[] t = s;
2846         decode(s);
2847         return t.length - s.length;
2848     }
2849 
2850     /**
2851      * Returns the total number of code points encoded in a ubyte array.
2852      *
2853      * The input to this function MUST be validly encoded.
2854      * This is enforced by the function's in-contract.
2855      *
2856      * Params:
2857      *    s = the string to be counted
2858      */
2859     size_t count()(const(ubyte)[] s)
2860     in
2861     {
2862         assert(isValid(s));
2863     }
2864     do
2865     {
2866         size_t n = 0;
2867         while (s.length != 0)
2868         {
2869             decode(s);
2870             ++n;
2871         }
2872         return n;
2873     }
2874 
2875     /**
2876      * Returns the array index at which the (n+1)th code point begins.
2877      *
2878      * The input to this function MUST be validly encoded.
2879      * This is enforced by the function's in-contract.
2880      *
2881      * Params:
2882      *    s = the string to be counted
2883      *    n = the current code point index
2884      */
2885     ptrdiff_t index()(const(ubyte)[] s, size_t n)
2886     in
2887     {
2888         assert(isValid(s));
2889         assert(n >= 0);
2890     }
2891     do
2892     {
2893         const(ubyte)[] t = s;
2894         for (size_t i=0; i<n; ++i) decode(s);
2895         return t.length - s.length;
2896     }
2897 
2898     __gshared EncodingScheme function()[string] supported;
2899     __gshared string[string] supportedFactories;
2900 }
2901 
/**
 EncodingScheme implementation for ASCII

 This scheme recognises the following names:
                 "ANSI_X3.4-1968",
                 "ANSI_X3.4-1986",
                 "ASCII",
                 "IBM367",
                 "ISO646-US",
                 "ISO_646.irv:1991",
                 "US-ASCII",
                 "cp367",
                 "csASCII",
                 "iso-ir-6",
                 "us"
 */
class EncodingSchemeASCII : EncodingScheme
{
    /* // moved to std.internal.phobosinit
    shared static this()
    {
        EncodingScheme.register("std.encoding.EncodingSchemeASCII");
    }*/

    // All names under which this scheme is known.
    override string[] names() const @safe pure nothrow
    {
        return [
            "ANSI_X3.4-1968",
            "ANSI_X3.4-1986",
            "ASCII",
            "IBM367",
            "ISO646-US",
            "ISO_646.irv:1991",
            "US-ASCII",
            "cp367",
            "csASCII",
            "iso-ir-6",
            "us"
        ];
    }

    // Display name of the scheme.
    override string toString() const @safe pure nothrow @nogc
    {
        return "ASCII";
    }

    // The remaining members delegate to the compile-time AsciiChar
    // implementation in std.encoding; the byte slices are reinterpreted as
    // AsciiChar slices for that purpose.

    override bool canEncode(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.canEncode!AsciiChar(c);
    }

    override size_t encodedLength(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.encodedLength!AsciiChar(c);
    }

    override size_t encode(dchar c, ubyte[] buffer) const @safe pure nothrow @nogc
    {
        auto target = cast(AsciiChar[]) buffer;
        return std.encoding.encode(c, target);
    }

    override dchar decode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        auto chars = cast(const(AsciiChar)[]) s;
        immutable dchar decoded = std.encoding.decode(chars);
        // Drop from `s` exactly what the typed decoder consumed.
        immutable consumed = s.length - chars.length;
        s = s[consumed .. $];
        return decoded;
    }

    override dchar safeDecode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        auto chars = cast(const(AsciiChar)[]) s;
        immutable dchar decoded = std.encoding.safeDecode(chars);
        // Drop from `s` exactly what the typed decoder consumed.
        immutable consumed = s.length - chars.length;
        s = s[consumed .. $];
        return decoded;
    }

    // Sequence substituted for malformed input.
    override @property immutable(ubyte)[] replacementSequence() const @safe pure nothrow @nogc
    {
        return cast(immutable(ubyte)[])"?";
    }
}
2989 
/**
 EncodingScheme implementation for Latin-1

 This scheme recognises the following names:
                 "CP819",
                 "IBM819",
                 "ISO-8859-1",
                 "ISO_8859-1",
                 "ISO_8859-1:1987",
                 "csISOLatin1",
                 "iso-ir-100",
                 "l1",
                 "latin1"
 */
class EncodingSchemeLatin1 : EncodingScheme
{
    /* // moved to std.internal.phobosinit
    shared static this()
    {
        EncodingScheme.register("std.encoding.EncodingSchemeLatin1");
    }*/

    // All names under which this scheme is known.
    override string[] names() const @safe pure nothrow
    {
        return [
            "CP819",
            "IBM819",
            "ISO-8859-1",
            "ISO_8859-1",
            "ISO_8859-1:1987",
            "csISOLatin1",
            "iso-ir-100",
            "l1",
            "latin1"
        ];
    }

    // Display name of the scheme.
    override string toString() const @safe pure nothrow @nogc
    {
        return "ISO-8859-1";
    }

    // The remaining members delegate to the compile-time Latin1Char
    // implementation in std.encoding; the byte slices are reinterpreted as
    // Latin1Char slices for that purpose.

    override bool canEncode(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.canEncode!Latin1Char(c);
    }

    override size_t encodedLength(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.encodedLength!Latin1Char(c);
    }

    override size_t encode(dchar c, ubyte[] buffer) const @safe pure nothrow @nogc
    {
        auto target = cast(Latin1Char[]) buffer;
        return std.encoding.encode(c, target);
    }

    override dchar decode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        auto chars = cast(const(Latin1Char)[]) s;
        immutable dchar decoded = std.encoding.decode(chars);
        // Drop from `s` exactly what the typed decoder consumed.
        immutable consumed = s.length - chars.length;
        s = s[consumed .. $];
        return decoded;
    }

    override dchar safeDecode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        auto chars = cast(const(Latin1Char)[]) s;
        immutable dchar decoded = std.encoding.safeDecode(chars);
        // Drop from `s` exactly what the typed decoder consumed.
        immutable consumed = s.length - chars.length;
        s = s[consumed .. $];
        return decoded;
    }

    // Sequence substituted for malformed input.
    override @property immutable(ubyte)[] replacementSequence() const @safe pure nothrow @nogc
    {
        return cast(immutable(ubyte)[])"?";
    }
}
3073 
/**
 EncodingScheme implementation for Latin-2

 This scheme recognises the following names:
                 "Latin 2",
                 "ISO-8859-2",
                 "ISO_8859-2",
                 "ISO_8859-2:1999",
                 "windows-28592"
 */
class EncodingSchemeLatin2 : EncodingScheme
{
    /* // moved to std.internal.phobosinit
    shared static this()
    {
        EncodingScheme.register("std.encoding.EncodingSchemeLatin2");
    }*/

    // All names under which this scheme is known.
    override string[] names() const @safe pure nothrow
    {
        return [
            "Latin 2",
            "ISO-8859-2",
            "ISO_8859-2",
            "ISO_8859-2:1999",
            "windows-28592"
        ];
    }

    // Display name of the scheme.
    override string toString() const @safe pure nothrow @nogc
    {
        return "ISO-8859-2";
    }

    // The remaining members delegate to the compile-time Latin2Char
    // implementation in std.encoding; the byte slices are reinterpreted as
    // Latin2Char slices for that purpose.

    override bool canEncode(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.canEncode!Latin2Char(c);
    }

    override size_t encodedLength(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.encodedLength!Latin2Char(c);
    }

    override size_t encode(dchar c, ubyte[] buffer) const @safe pure nothrow @nogc
    {
        auto target = cast(Latin2Char[]) buffer;
        return std.encoding.encode(c, target);
    }

    override dchar decode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        auto chars = cast(const(Latin2Char)[]) s;
        immutable dchar decoded = std.encoding.decode(chars);
        // Drop from `s` exactly what the typed decoder consumed.
        immutable consumed = s.length - chars.length;
        s = s[consumed .. $];
        return decoded;
    }

    override dchar safeDecode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        auto chars = cast(const(Latin2Char)[]) s;
        immutable dchar decoded = std.encoding.safeDecode(chars);
        // Drop from `s` exactly what the typed decoder consumed.
        immutable consumed = s.length - chars.length;
        s = s[consumed .. $];
        return decoded;
    }

    // Sequence substituted for malformed input.
    override @property immutable(ubyte)[] replacementSequence() const @safe pure nothrow @nogc
    {
        return cast(immutable(ubyte)[])"?";
    }
}
3149 
/**
 EncodingScheme implementation for Windows-1250

 This scheme recognises the following names:
                 "windows-1250"
 */
class EncodingSchemeWindows1250 : EncodingScheme
{
    /* // moved to std.internal.phobosinit
    shared static this()
    {
        EncodingScheme.register("std.encoding.EncodingSchemeWindows1250");
    }*/

    // All names under which this scheme is known.
    override string[] names() const @safe pure nothrow
    {
        return [ "windows-1250" ];
    }

    // Display name of the scheme.
    override string toString() const @safe pure nothrow @nogc
    {
        return "windows-1250";
    }

    // The remaining members delegate to the compile-time Windows1250Char
    // implementation in std.encoding; the byte slices are reinterpreted as
    // Windows1250Char slices for that purpose.

    override bool canEncode(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.canEncode!Windows1250Char(c);
    }

    override size_t encodedLength(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.encodedLength!Windows1250Char(c);
    }

    override size_t encode(dchar c, ubyte[] buffer) const @safe pure nothrow @nogc
    {
        auto target = cast(Windows1250Char[]) buffer;
        return std.encoding.encode(c, target);
    }

    override dchar decode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        auto chars = cast(const(Windows1250Char)[]) s;
        immutable dchar decoded = std.encoding.decode(chars);
        // Drop from `s` exactly what the typed decoder consumed.
        immutable consumed = s.length - chars.length;
        s = s[consumed .. $];
        return decoded;
    }

    override dchar safeDecode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        auto chars = cast(const(Windows1250Char)[]) s;
        immutable dchar decoded = std.encoding.safeDecode(chars);
        // Drop from `s` exactly what the typed decoder consumed.
        immutable consumed = s.length - chars.length;
        s = s[consumed .. $];
        return decoded;
    }

    // Sequence substituted for malformed input.
    override @property immutable(ubyte)[] replacementSequence() const @safe pure nothrow @nogc
    {
        return cast(immutable(ubyte)[])"?";
    }
}
3217 
/**
 EncodingScheme implementation for Windows-1251

 This scheme recognises the following names:
                 "windows-1251"
 */
class EncodingSchemeWindows1251 : EncodingScheme
{
    /* // moved to std.internal.phobosinit
    shared static this()
    {
        EncodingScheme.register("std.encoding.EncodingSchemeWindows1251");
    }*/

    // All names under which this scheme is known.
    override string[] names() const @safe pure nothrow
    {
        return [ "windows-1251" ];
    }

    // Display name of the scheme.
    override string toString() const @safe pure nothrow @nogc
    {
        return "windows-1251";
    }

    // The remaining members delegate to the compile-time Windows1251Char
    // implementation in std.encoding; the byte slices are reinterpreted as
    // Windows1251Char slices for that purpose.

    override bool canEncode(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.canEncode!Windows1251Char(c);
    }

    override size_t encodedLength(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.encodedLength!Windows1251Char(c);
    }

    override size_t encode(dchar c, ubyte[] buffer) const @safe pure nothrow @nogc
    {
        auto target = cast(Windows1251Char[]) buffer;
        return std.encoding.encode(c, target);
    }

    override dchar decode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        auto chars = cast(const(Windows1251Char)[]) s;
        immutable dchar decoded = std.encoding.decode(chars);
        // Drop from `s` exactly what the typed decoder consumed.
        immutable consumed = s.length - chars.length;
        s = s[consumed .. $];
        return decoded;
    }

    override dchar safeDecode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        auto chars = cast(const(Windows1251Char)[]) s;
        immutable dchar decoded = std.encoding.safeDecode(chars);
        // Drop from `s` exactly what the typed decoder consumed.
        immutable consumed = s.length - chars.length;
        s = s[consumed .. $];
        return decoded;
    }

    // Sequence substituted for malformed input.
    override @property immutable(ubyte)[] replacementSequence() const @safe pure nothrow @nogc
    {
        return cast(immutable(ubyte)[])"?";
    }
}
3285 
/**
 EncodingScheme implementation for Windows-1252

 This scheme recognises the following names:
                 "windows-1252"
 */
class EncodingSchemeWindows1252 : EncodingScheme
{
    /* // moved to std.internal.phobosinit
    shared static this()
    {
        EncodingScheme.register("std.encoding.EncodingSchemeWindows1252");
    }*/

    // All names under which this scheme is known.
    override string[] names() const @safe pure nothrow
    {
        return [ "windows-1252" ];
    }

    // Display name of the scheme.
    override string toString() const @safe pure nothrow @nogc
    {
        return "windows-1252";
    }

    // The remaining members delegate to the compile-time Windows1252Char
    // implementation in std.encoding; the byte slices are reinterpreted as
    // Windows1252Char slices for that purpose.

    override bool canEncode(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.canEncode!Windows1252Char(c);
    }

    override size_t encodedLength(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.encodedLength!Windows1252Char(c);
    }

    override size_t encode(dchar c, ubyte[] buffer) const @safe pure nothrow @nogc
    {
        auto target = cast(Windows1252Char[]) buffer;
        return std.encoding.encode(c, target);
    }

    override dchar decode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        auto chars = cast(const(Windows1252Char)[]) s;
        immutable dchar decoded = std.encoding.decode(chars);
        // Drop from `s` exactly what the typed decoder consumed.
        immutable consumed = s.length - chars.length;
        s = s[consumed .. $];
        return decoded;
    }

    override dchar safeDecode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        auto chars = cast(const(Windows1252Char)[]) s;
        immutable dchar decoded = std.encoding.safeDecode(chars);
        // Drop from `s` exactly what the typed decoder consumed.
        immutable consumed = s.length - chars.length;
        s = s[consumed .. $];
        return decoded;
    }

    // Sequence substituted for malformed input.
    override @property immutable(ubyte)[] replacementSequence() const @safe pure nothrow @nogc
    {
        return cast(immutable(ubyte)[])"?";
    }
}
3353 
// Exercises the six single-byte encoding schemes through the run-time
// EncodingScheme interface.
@system unittest
{
    // Names passed to EncodingScheme.create; each is also compared against
    // the created scheme's toString() below.
    static string[] schemeNames =
    [
        "ASCII",
        "ISO-8859-1",
        "ISO-8859-2",
        "windows-1250",
        "windows-1251",
        "windows-1252"
    ];

    EncodingScheme[] schemes;

    foreach (name;schemeNames)
    {
       schemes ~= EncodingScheme.create(name);
    }

    ubyte[1] buffer;
    // valid[i] holds code points that schemes[i] must encode into one ubyte.
    static dchar[][] valid =
    [
        //Valid ASCII
        ['\u0001','\u0020','\u0040','\u0060','\u007F'],
        //Valid 8859-1
        ['\u0001','\u0020','\u0070','\u00DA','\u00FF'],
        //Valid 8859-2
        ['\u0020','\u00D7','\u00DF','\u010F','\u02D9'],
        //Valid 1250
        ['\u0020','\u20AC','\u201E','\u2021','\u2039'],
        //Valid 1251
        ['\u0402','\u00A4','\u0415','\u0439','\u044F'],
        //Valid 1252
        ['\u20AC','\u0160','\u2019','\u2122','\u0178'],
    ];

    // One byte per scheme, in scheme order. safeDecode advances this slice,
    // so each loop iteration below sees the next byte.
    static const(ubyte)[] invalid = [0xA0,0xFF,0xFF,0x81,0x98,0x81];

    foreach (i,scheme;schemes)
    {
        assert(scheme.toString() == schemeNames[i],"Error in the name of encoding scheme"~schemeNames[i]);
        assert(!scheme.canEncode('\uFFFD'));
        assert(scheme.encodedLength('A') == 1);
        const(ubyte)[] encodeStr;
        dchar[] decStr;
        // Round-trip every valid code point through encode and decode.
        foreach (chr;valid[i])
        {
            assert(scheme.encode(chr,buffer) == 1);
            encodeStr ~= buffer;
            const(ubyte)[] buf = buffer;
            decStr ~= scheme.decode(buf);
        }

        assert(scheme.isValid(encodeStr),"Not correctly encoded UTF => " ~ schemeNames[i]);
        assert(valid[i] == decStr,"Error encode/decode UTF8 <=> " ~ schemeNames[i]);

        // The byte reserved for the two ISO-8859 schemes is expected to
        // decode successfully; every other scheme must reject its byte.
        if (schemeNames[i] == "ISO-8859-1" || schemeNames[i] == "ISO-8859-2")
        {
            assert(scheme.safeDecode(invalid) != INVALID_SEQUENCE);
        }
        else
        {
            assert(scheme.safeDecode(invalid) == INVALID_SEQUENCE);
        }
        assert(scheme.replacementSequence() == cast(immutable(ubyte)[])"?");
    }
    // Six schemes consumed one byte each, so nothing may be left over.
    assert(invalid.length == 0);
}
3422 
/**
 EncodingScheme implementation for UTF-8

 This scheme recognises the following names:
                 "UTF-8"
 */
class EncodingSchemeUtf8 : EncodingScheme
{
    /* // moved to std.internal.phobosinit
    shared static this()
    {
        EncodingScheme.register("std.encoding.EncodingSchemeUtf8");
    }*/

    // All names under which this scheme is known.
    override string[] names() const @safe pure nothrow
    {
        return [ "UTF-8" ];
    }

    // Display name of the scheme.
    override string toString() const @safe pure nothrow @nogc
    {
        return "UTF-8";
    }

    // The remaining members delegate to the compile-time `char`
    // implementation in std.encoding; the byte slices are reinterpreted as
    // char slices for that purpose.

    override bool canEncode(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.canEncode!char(c);
    }

    override size_t encodedLength(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.encodedLength!char(c);
    }

    override size_t encode(dchar c, ubyte[] buffer) const @safe pure nothrow @nogc
    {
        auto target = cast(char[]) buffer;
        return std.encoding.encode(c, target);
    }

    override dchar decode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        auto chars = cast(const(char)[]) s;
        immutable dchar decoded = std.encoding.decode(chars);
        // Drop from `s` exactly what the typed decoder consumed.
        immutable consumed = s.length - chars.length;
        s = s[consumed .. $];
        return decoded;
    }

    override dchar safeDecode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        auto chars = cast(const(char)[]) s;
        immutable dchar decoded = std.encoding.safeDecode(chars);
        // Drop from `s` exactly what the typed decoder consumed.
        immutable consumed = s.length - chars.length;
        s = s[consumed .. $];
        return decoded;
    }

    // Sequence substituted for malformed input: U+FFFD as UTF-8 code units.
    override @property immutable(ubyte)[] replacementSequence() const @safe pure nothrow @nogc
    {
        return cast(immutable(ubyte)[])"\uFFFD";
    }
}
3490 
/**
 EncodingScheme to handle UTF-16 in native byte order

 All lengths taken and returned by this class are measured in ubytes, i.e.
 two per UTF-16 code unit.

 This scheme recognises the following names:
                 "UTF-16LE" (little-endian architecture only)
                 "UTF-16BE" (big-endian architecture only)
 */
class EncodingSchemeUtf16Native : EncodingScheme
{
    /* // moved to std.internal.phobosinit
    shared static this()
    {
        EncodingScheme.register("std.encoding.EncodingSchemeUtf16Native");
    }*/

    const
    {
        // The single name of this scheme depends on the target's endianness.
        version (LittleEndian) { enum string NAME = "UTF-16LE"; }
        version (BigEndian)    { enum string NAME = "UTF-16BE"; }

        // All names under which this scheme is known.
        override string[] names() @safe pure nothrow
        {
            return [ NAME ];
        }

        // Display name of the scheme.
        override string toString() @safe pure nothrow @nogc
        {
            return NAME;
        }

        override bool canEncode(dchar c) @safe pure nothrow @nogc
        {
            return std.encoding.canEncode!(wchar)(c);
        }

        // Number of ubytes needed to encode `c`. The wchar-level helper
        // counts code units, so the result is scaled to bytes; this keeps
        // it consistent with the value returned by `encode` and lets the
        // result be used to size the buffer passed to `encode`.
        override size_t encodedLength(dchar c) @safe pure nothrow @nogc
        {
            return wchar.sizeof * std.encoding.encodedLength!(wchar)(c);
        }

        // Encodes `c` into `buffer` and returns the number of ubytes written.
        override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc
        {
            auto r = cast(wchar[]) buffer;
            return wchar.sizeof * std.encoding.encode(c,r);
        }

        // Decodes one code point and advances `s` past it. `s` must contain
        // a whole number of 16-bit code units.
        override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc
        in
        {
            assert((s.length & 1) == 0);
        }
        do
        {
            auto t = cast(const(wchar)[]) s;
            dchar c = std.encoding.decode(t);
            // `t` shrank by code units; convert the remainder back to bytes.
            s = s[$-t.length * wchar.sizeof..$];
            return c;
        }

        // Like `decode`, but yields INVALID_SEQUENCE for malformed input.
        override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc
        in
        {
            assert((s.length & 1) == 0);
        }
        do
        {
            auto t = cast(const(wchar)[]) s;
            dchar c = std.encoding.safeDecode(t);
            // `t` shrank by code units; convert the remainder back to bytes.
            s = s[$-t.length * wchar.sizeof..$];
            return c;
        }

        // Sequence substituted for malformed input: U+FFFD as one native
        // UTF-16 code unit.
        override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc
        {
            return cast(immutable(ubyte)[])"\uFFFD"w;
        }
    }
}
// Checks that safeDecode on the native UTF-16 scheme consumes exactly one
// 16-bit code unit of a three-unit sample.
@system unittest
{
    version (LittleEndian)
    {
        auto efrom = EncodingScheme.create("utf-16le");
        // 154 + 1*256 == 410, low byte first.
        ubyte[6] sample = [154,1, 155,1, 156,1];
    }
    version (BigEndian)
    {
        auto efrom = EncodingScheme.create("utf-16be");
        // Same code units as above, high byte first.
        ubyte[6] sample = [1,154, 1,155, 1,156];
    }
    const(ubyte)[] ub = cast(const(ubyte)[])sample;
    dchar dc = efrom.safeDecode(ub);
    assert(dc == 410);
    // Two of the six bytes were consumed.
    assert(ub.length == 4);
}
3586 
/**
 EncodingScheme to handle UTF-32 in native byte order

 All lengths taken and returned by this class are measured in ubytes, i.e.
 four per UTF-32 code unit.

 This scheme recognises the following names:
                 "UTF-32LE" (little-endian architecture only)
                 "UTF-32BE" (big-endian architecture only)
 */
class EncodingSchemeUtf32Native : EncodingScheme
{
    /* // moved to std.internal.phobosinit
    shared static this()
    {
        EncodingScheme.register("std.encoding.EncodingSchemeUtf32Native");
    }*/

    const
    {
        // The single name of this scheme depends on the target's endianness.
        version (LittleEndian) { enum string NAME = "UTF-32LE"; }
        version (BigEndian)    { enum string NAME = "UTF-32BE"; }

        // All names under which this scheme is known.
        override string[] names() @safe pure nothrow
        {
            return [ NAME ];
        }

        // Display name of the scheme.
        override string toString() @safe pure nothrow @nogc
        {
            return NAME;
        }

        override bool canEncode(dchar c) @safe pure nothrow @nogc
        {
            return std.encoding.canEncode!(dchar)(c);
        }

        // Number of ubytes needed to encode `c`. The dchar-level helper
        // counts code units, so the result is scaled to bytes; this keeps
        // it consistent with the value returned by `encode` and lets the
        // result be used to size the buffer passed to `encode`.
        override size_t encodedLength(dchar c) @safe pure nothrow @nogc
        {
            return dchar.sizeof * std.encoding.encodedLength!(dchar)(c);
        }

        // Encodes `c` into `buffer` and returns the number of ubytes written.
        override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc
        {
            auto r = cast(dchar[]) buffer;
            return dchar.sizeof * std.encoding.encode(c,r);
        }

        // Decodes one code point and advances `s` past it. `s` must contain
        // a whole number of 32-bit code units.
        override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc
        in
        {
            assert((s.length & 3) == 0);
        }
        do
        {
            auto t = cast(const(dchar)[]) s;
            dchar c = std.encoding.decode(t);
            // `t` shrank by code units; convert the remainder back to bytes.
            s = s[$-t.length * dchar.sizeof..$];
            return c;
        }

        // Like `decode`, but yields INVALID_SEQUENCE for malformed input.
        override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc
        in
        {
            assert((s.length & 3) == 0);
        }
        do
        {
            auto t = cast(const(dchar)[]) s;
            dchar c = std.encoding.safeDecode(t);
            // `t` shrank by code units; convert the remainder back to bytes.
            s = s[$-t.length * dchar.sizeof..$];
            return c;
        }

        // Sequence substituted for malformed input: U+FFFD as one native
        // UTF-32 code unit.
        override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc
        {
            return cast(immutable(ubyte)[])"\uFFFD"d;
        }
    }
}
// Checks that safeDecode on the native UTF-32 scheme consumes exactly one
// 32-bit code unit of a three-unit sample.
@system unittest
{
    version (LittleEndian)
    {
        auto efrom = EncodingScheme.create("utf-32le");
        // 154 + 1*256 == 410, least significant byte first.
        ubyte[12] sample = [154,1,0,0, 155,1,0,0, 156,1,0,0];
    }
    version (BigEndian)
    {
        auto efrom = EncodingScheme.create("utf-32be");
        // Same code units as above, most significant byte first.
        ubyte[12] sample = [0,0,1,154, 0,0,1,155, 0,0,1,156];
    }
    const(ubyte)[] ub = cast(const(ubyte)[])sample;
    dchar dc = efrom.safeDecode(ub);
    assert(dc == 410);
    // Four of the twelve bytes were consumed.
    assert(ub.length == 8);
}
3682 
3683 //=============================================================================
3684 
3685 
/** Definitions of common Byte Order Marks.
The elements of the `enum` can be used as indices into `bomTable` to get
the matching `BOMSeq`.
*/
enum BOM
{
    none      = 0,  /// no BOM was found
    utf32be   = 1,  /// [0x00, 0x00, 0xFE, 0xFF]
    utf32le   = 2,  /// [0xFF, 0xFE, 0x00, 0x00]
    utf7      = 3,  /** One of the following sequences:
                        [0x2B, 0x2F, 0x76, 0x38],
                        [0x2B, 0x2F, 0x76, 0x39],
                        [0x2B, 0x2F, 0x76, 0x2B],
                        [0x2B, 0x2F, 0x76, 0x2F],
                        [0x2B, 0x2F, 0x76, 0x38, 0x2D]
                    */
    // Values 4 to 7 are left unused: `bomTable` holds one entry per UTF-7
    // sequence at indices 3 to 7.
    utf1      = 8,  /// [0xF7, 0x64, 0x4C]
    utfebcdic = 9,  /// [0xDD, 0x73, 0x66, 0x73]
    scsu      = 10, /// [0x0E, 0xFE, 0xFF]
    bocu1     = 11, /// [0xFB, 0xEE, 0x28]
    gb18030   = 12, /// [0x84, 0x31, 0x95, 0x33]
    utf8      = 13, /// [0xEF, 0xBB, 0xBF]
    utf16be   = 14, /// [0xFE, 0xFF]
    utf16le   = 15  /// [0xFF, 0xFE]
}
3710 
/// The type stored inside `bomTable`: a `BOM` (field `schema`) paired with
/// the ubyte sequence that identifies it (field `sequence`).
alias BOMSeq = Tuple!(BOM, "schema", ubyte[], "sequence");
3713 
/** Mapping of a byte sequence to $(B Byte Order Mark (BOM))

The order of the entries matters to `getBOM`, which returns the first entry
whose sequence is a prefix of its input: a sequence that starts with another
entry's sequence is listed before that entry.
*/
immutable bomTable = [
    BOMSeq(BOM.none, null),
    BOMSeq(BOM.utf32be, cast(ubyte[])([0x00, 0x00, 0xFE, 0xFF])),
    // Listed before utf16le, whose sequence is a prefix of this one.
    BOMSeq(BOM.utf32le, cast(ubyte[])([0xFF, 0xFE, 0x00, 0x00])),
    BOMSeq(BOM.utf7, cast(ubyte[])([0x2B, 0x2F, 0x76, 0x39])),
    BOMSeq(BOM.utf7, cast(ubyte[])([0x2B, 0x2F, 0x76, 0x2B])),
    BOMSeq(BOM.utf7, cast(ubyte[])([0x2B, 0x2F, 0x76, 0x2F])),
    // The five-byte form is listed before its four-byte prefix.
    BOMSeq(BOM.utf7, cast(ubyte[])([0x2B, 0x2F, 0x76, 0x38, 0x2D])),
    BOMSeq(BOM.utf7, cast(ubyte[])([0x2B, 0x2F, 0x76, 0x38])),
    BOMSeq(BOM.utf1, cast(ubyte[])([0xF7, 0x64, 0x4C])),
    BOMSeq(BOM.utfebcdic, cast(ubyte[])([0xDD, 0x73, 0x66, 0x73])),
    BOMSeq(BOM.scsu, cast(ubyte[])([0x0E, 0xFE, 0xFF])),
    BOMSeq(BOM.bocu1, cast(ubyte[])([0xFB, 0xEE, 0x28])),
    BOMSeq(BOM.gb18030, cast(ubyte[])([0x84, 0x31, 0x95, 0x33])),
    BOMSeq(BOM.utf8, cast(ubyte[])([0xEF, 0xBB, 0xBF])),
    BOMSeq(BOM.utf16be, cast(ubyte[])([0xFE, 0xFF])),
    BOMSeq(BOM.utf16le, cast(ubyte[])([0xFF, 0xFE]))
];
3734 
/** Looks up the `BOMSeq` whose byte sequence starts the given `input`.
If no `BOM` is present the `BOMSeq` for `BOM.none` is
returned. The `BOM` sequence at the beginning of the range will
not be consumed from the passed range. If you pass a reference type
range make sure that `save` creates a deep copy.

Params:
    input = The sequence to check for the `BOM`

Returns:
    the found `BOMSeq` corresponding to the passed `input`.
*/
immutable(BOMSeq) getBOM(Range)(Range input)
if (isForwardRange!Range && is(immutable ElementType!Range == immutable ubyte))
{
    import std.algorithm.searching : startsWith;

    // Entry 0 is the `BOM.none` fallback, so the scan starts at entry 1.
    // The first entry whose sequence is a prefix of `input` wins.
    foreach (i; 1 .. bomTable.length)
    {
        // Test against a saved copy so that `input` itself is not advanced.
        if (startsWith(input.save, bomTable[i].sequence))
            return bomTable[i];
    }

    return bomTable[0];
}
3761 
///
@system unittest
{
    import std.format : format;

    // A UTF-32 string that begins with the byte order mark U+FEFF.
    auto ts = dchar(0x0000FEFF) ~ "Hello World"d;

    // Reinterpreted as bytes, the mark appears in the machine's byte order.
    auto entry = getBOM(cast(ubyte[]) ts);
    version (BigEndian)
    {
        assert(entry.schema == BOM.utf32be, format("%s", entry.schema));
    }
    else
    {
        assert(entry.schema == BOM.utf32le, format("%s", entry.schema));
    }
}
3779 
// Checks that every `bomTable` entry is found again when its sequence is
// followed by ordinary text.
@system unittest
{
    import std.format : format;

    foreach (idx, it; bomTable)
    {
        auto s = it[1] ~ cast(ubyte[])"hello world";
        auto i = getBOM(s);
        assert(i[0] == bomTable[idx][0]);

        if (idx < 4 || idx > 7) // get around the multiple utf7 bom's
        {
            // Outside the extra UTF-7 rows the table index equals the
            // numeric value of the BOM member.
            assert(i[0] == BOM.init + idx);
            assert(i[1] == it[1]);
        }
    }
}
3797 
// Exercises getBOM with a range type that is not an array.
@safe pure unittest
{
    // Minimal range wrapper around a ubyte array; `save` returns a copy of
    // the struct.
    struct BOMInputRange
    {
        ubyte[] arr;

        @property ubyte front()
        {
            return this.arr.front;
        }

        @property bool empty()
        {
            return this.arr.empty;
        }

        void popFront()
        {
            this.arr = this.arr[1 .. $];
        }

        @property typeof(this) save()
        {
            return this;
        }
    }

    static assert( isInputRange!BOMInputRange);
    static assert(!isArray!BOMInputRange);

    ubyte[] dummyEnd = [0,0,0,0];

    foreach (idx, it; bomTable[1 .. $])
    {
        {
            // A range holding exactly the BOM sequence must be recognised
            // and must still hold the whole sequence afterwards.
            auto ir = BOMInputRange(it.sequence.dup);

            auto b = getBOM(ir);
            assert(b.schema == it.schema);
            assert(ir.arr == it.sequence);
        }

        {
            // Only the first byte of the sequence, followed by four zero
            // bytes, is expected to match no BOM.
            auto noBom = it.sequence[0 .. 1].dup ~ dummyEnd;
            size_t oldLen = noBom.length;
            assert(oldLen - 4 < it.sequence.length);

            auto ir = BOMInputRange(noBom.dup);
            auto b = getBOM(ir);
            assert(b.schema == BOM.none);
            assert(noBom.length == oldLen);
        }
    }
}
3852 
/** Constant defining a fully decoded BOM: the code point U+FEFF. */
enum dchar utfBOM = 0xfeff;
The OpenD Programming Language