// Written in the D programming language.

/**
Classes and functions for handling and transcoding between various encodings.

For cases where the encoding is known at compile-time, functions are provided
for arbitrary encoding and decoding of characters, arbitrary transcoding
between strings of different type, as well as validation and sanitization.

Encodings currently supported are UTF-8, UTF-16, UTF-32, ASCII, ISO-8859-1
(also known as LATIN-1), ISO-8859-2 (LATIN-2), WINDOWS-1250, WINDOWS-1251
and WINDOWS-1252.

$(SCRIPT inhibitQuickIndex = 1;)
$(DIVC quickindex,
$(BOOKTABLE,
$(TR $(TH Category) $(TH Functions))
$(TR $(TD Decode) $(TD
    $(LREF codePoints)
    $(LREF decode)
    $(LREF decodeReverse)
    $(LREF safeDecode)
))
$(TR $(TD Conversion) $(TD
    $(LREF codeUnits)
    $(LREF sanitize)
    $(LREF transcode)
))
$(TR $(TD Classification) $(TD
    $(LREF canEncode)
    $(LREF isValid)
    $(LREF isValidCodePoint)
    $(LREF isValidCodeUnit)
))
$(TR $(TD BOM) $(TD
    $(LREF BOM)
    $(LREF BOMSeq)
    $(LREF getBOM)
    $(LREF utfBOM)
))
$(TR $(TD Length & Index) $(TD
    $(LREF firstSequence)
    $(LREF encodedLength)
    $(LREF index)
    $(LREF lastSequence)
    $(LREF validLength)
))
$(TR $(TD Encoding schemes) $(TD
    $(LREF encodingName)
    $(LREF EncodingScheme)
    $(LREF EncodingSchemeASCII)
    $(LREF EncodingSchemeLatin1)
    $(LREF EncodingSchemeLatin2)
    $(LREF EncodingSchemeUtf16Native)
    $(LREF EncodingSchemeUtf32Native)
    $(LREF EncodingSchemeUtf8)
    $(LREF EncodingSchemeWindows1250)
    $(LREF EncodingSchemeWindows1251)
    $(LREF EncodingSchemeWindows1252)
))
$(TR $(TD Representation) $(TD
    $(LREF AsciiChar)
    $(LREF AsciiString)
    $(LREF Latin1Char)
    $(LREF Latin1String)
    $(LREF Latin2Char)
    $(LREF Latin2String)
    $(LREF Windows1250Char)
    $(LREF Windows1250String)
    $(LREF Windows1251Char)
    $(LREF Windows1251String)
    $(LREF Windows1252Char)
    $(LREF Windows1252String)
))
$(TR $(TD Exceptions) $(TD
    $(LREF INVALID_SEQUENCE)
    $(LREF EncodingException)
))
))

For cases where the encoding is not known at compile-time, but is
known at run-time, the abstract class $(LREF EncodingScheme)
and its subclasses are provided. To construct a run-time encoder/decoder,
one does e.g.

----------------------------------------------------
auto e = EncodingScheme.create("utf-8");
----------------------------------------------------

This library supplies $(LREF EncodingScheme) subclasses for ASCII,
ISO-8859-1 (also known as LATIN-1), ISO-8859-2 (LATIN-2), WINDOWS-1250,
WINDOWS-1251, WINDOWS-1252, UTF-8, and (on little-endian architectures)
UTF-16LE and UTF-32LE; or (on big-endian architectures) UTF-16BE and UTF-32BE.

This library provides a mechanism whereby other modules may add $(LREF
EncodingScheme) subclasses for any other encoding.

Copyright: Copyright Janice Caron 2008 - 2009.
License:   $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
Authors:   Janice Caron
Source:    $(PHOBOSSRC std/encoding.d)
*/
/*
         Copyright Janice Caron 2008 - 2009.
Distributed under the Boost Software License, Version 1.0.
   (See accompanying file LICENSE_1_0.txt or copy at
         http://www.boost.org/LICENSE_1_0.txt)
*/
module std.encoding;

import std.range.primitives;
import std.traits;
import std.typecons;

@system unittest
{
    static ubyte[][] validStrings =
    [
        // Plain ASCII
        cast(ubyte[])"hello",

        // First possible sequence of a certain length
        [ 0x00 ],                       // U+00000000   one byte
        [ 0xC2, 0x80 ],                 // U+00000080   two bytes
        [ 0xE0, 0xA0, 0x80 ],           // U+00000800   three bytes
        [ 0xF0, 0x90, 0x80, 0x80 ],     // U+00010000   four bytes

        // Last possible sequence of a certain length
        [ 0x7F ],                       // U+0000007F   one byte
        [ 0xDF, 0xBF ],                 // U+000007FF   two bytes
        [ 0xEF, 0xBF, 0xBF ],           // U+0000FFFF   three bytes

        // Other boundary conditions
        [ 0xED, 0x9F, 0xBF ],
        // U+0000D7FF   Last character before surrogates
        [ 0xEE, 0x80, 0x80 ],
        // U+0000E000   First character after surrogates
        [ 0xEF, 0xBF, 0xBD ],
        // U+0000FFFD   Unicode replacement character
        [ 0xF4, 0x8F, 0xBF, 0xBF ],
        // U+0010FFFF   Very last character

        // Non-character code points
        /*  NOTE: These are legal in UTF, and may be converted from
            one UTF to another, however they do not represent Unicode
            characters. These code points have been reserved by
            Unicode as non-character code points. They are permissible
            for data exchange within an application, but they are
            not permitted to be used as characters. Since this module
            deals with UTF, and not with Unicode per se, we choose to
            accept them here. */
        // The three-byte forms below are the UTF-8 encodings of U+FFFE and
        // U+FFFF; the two-byte forms DF BE / DF BF would be U+07FE / U+07FF.
        [ 0xEF, 0xBF, 0xBE ],           // U+0000FFFE
        [ 0xEF, 0xBF, 0xBF ],           // U+0000FFFF
    ];

    static ubyte[][] invalidStrings =
    [
        // First possible sequence of a certain length, but greater
        // than U+10FFFF
        [ 0xF8, 0x88, 0x80, 0x80, 0x80 ],           // U+00200000   five bytes
        [ 0xFC, 0x84, 0x80, 0x80, 0x80, 0x80 ],     // U+04000000   six bytes

        // Last possible sequence of a certain length, but greater than U+10FFFF
        [ 0xF7, 0xBF, 0xBF, 0xBF ],                 // U+001FFFFF   four bytes
        [ 0xFB, 0xBF, 0xBF, 0xBF, 0xBF ],           // U+03FFFFFF   five bytes
        [ 0xFD, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF ],     // U+7FFFFFFF   six bytes

        // Other boundary conditions
        [ 0xF4, 0x90, 0x80, 0x80 ],                 // U+00110000
                                                    // First code
                                                    // point after
                                                    // last character

        // Unexpected continuation bytes
        [ 0x80 ],
        [ 0xBF ],
        [ 0x20, 0x80, 0x20 ],
        [ 0x20, 0xBF, 0x20 ],
        [ 0x80, 0x9F, 0xA0 ],

        // Lonely start bytes
        [ 0xC0 ],
        [ 0xCF ],
        [ 0x20, 0xC0, 0x20 ],
        [ 0x20, 0xCF, 0x20 ],
        [ 0xD0 ],
        [ 0xDF ],
        [ 0x20, 0xD0, 0x20 ],
        [ 0x20, 0xDF, 0x20 ],
        [ 0xE0 ],
        [ 0xEF ],
        [ 0x20, 0xE0, 0x20 ],
        [ 0x20, 0xEF, 0x20 ],
        [ 0xF0 ],
        [ 0xF1 ],
        [ 0xF2 ],
        [ 0xF3 ],
        [ 0xF4 ],
        [ 0xF5 ],   // If this were legal it would start a character > U+10FFFF
        [ 0xF6 ],   // If this were legal it would start a character > U+10FFFF
        [ 0xF7 ],   // If this were legal it would start a character > U+10FFFF

        [ 0xEF, 0xBF ],             // Three byte sequence with third byte missing
        [ 0xF7, 0xBF, 0xBF ],       // Four byte sequence with fourth byte missing
        [ 0xEF, 0xBF, 0xF7, 0xBF, 0xBF ],   // Concatenation of the above

        // Impossible bytes
        [ 0xF8 ],
        [ 0xF9 ],
        [ 0xFA ],
        [ 0xFB ],
        [ 0xFC ],
        [ 0xFD ],
        [ 0xFE ],
        [ 0xFF ],
        [ 0x20, 0xF8, 0x20 ],
        [ 0x20, 0xF9, 0x20 ],
        [ 0x20, 0xFA, 0x20 ],
        [ 0x20, 0xFB, 0x20 ],
        [ 0x20, 0xFC, 0x20 ],
        [ 0x20, 0xFD, 0x20 ],
        [ 0x20, 0xFE, 0x20 ],
        [ 0x20, 0xFF, 0x20 ],

        // Overlong sequences, all representing U+002F
        /*  With a safe UTF-8 decoder, all of the following five overlong
            representations of the ASCII character slash ("/") should be
            rejected like a malformed UTF-8 sequence */
        [ 0xC0, 0xAF ],
        [ 0xE0, 0x80, 0xAF ],
        [ 0xF0, 0x80, 0x80, 0xAF ],
        [ 0xF8, 0x80, 0x80, 0x80, 0xAF ],
        [ 0xFC, 0x80, 0x80, 0x80, 0x80, 0xAF ],

        // Maximum overlong sequences
        /*  Below you see the highest Unicode value that is still resulting in
            an overlong sequence if represented with the given number of bytes.
            This is a boundary test for safe UTF-8 decoders. All five
            characters should be rejected like malformed UTF-8 sequences. */
        [ 0xC1, 0xBF ],                             // U+0000007F
        [ 0xE0, 0x9F, 0xBF ],                       // U+000007FF
        [ 0xF0, 0x8F, 0xBF, 0xBF ],                 // U+0000FFFF
        [ 0xF8, 0x87, 0xBF, 0xBF, 0xBF ],           // U+001FFFFF
        [ 0xFC, 0x83, 0xBF, 0xBF, 0xBF, 0xBF ],     // U+03FFFFFF

        // Overlong representation of the NUL character
        /*  The following five sequences should also be rejected like malformed
            UTF-8 sequences and should not be treated like the ASCII NUL
            character. */
        [ 0xC0, 0x80 ],
        [ 0xE0, 0x80, 0x80 ],
        [ 0xF0, 0x80, 0x80, 0x80 ],
        [ 0xF8, 0x80, 0x80, 0x80, 0x80 ],
        [ 0xFC, 0x80, 0x80, 0x80, 0x80, 0x80 ],

        // Illegal code positions
        /*  The following UTF-8 sequences should be rejected like malformed
            sequences, because they never represent valid ISO 10646 characters
            and a UTF-8 decoder that accepts them might introduce security
            problems comparable to overlong UTF-8 sequences. */
        [ 0xED, 0xA0, 0x80 ],       // U+D800
        [ 0xED, 0xAD, 0xBF ],       // U+DB7F
        [ 0xED, 0xAE, 0x80 ],       // U+DB80
        [ 0xED, 0xAF, 0xBF ],       // U+DBFF
        [ 0xED, 0xB0, 0x80 ],       // U+DC00
        [ 0xED, 0xBE, 0x80 ],       // U+DF80
        [ 0xED, 0xBF, 0xBF ],       // U+DFFF
    ];

    // Expected result of sanitize() for each entry of invalidStrings,
    // in the same order (the lengths are asserted equal below).
    static string[] sanitizedStrings =
    [
        "\uFFFD","\uFFFD",
        "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD"," \uFFFD ",
        " \uFFFD ","\uFFFD\uFFFD\uFFFD","\uFFFD","\uFFFD"," \uFFFD "," \uFFFD ",
        "\uFFFD","\uFFFD"," \uFFFD "," \uFFFD ","\uFFFD","\uFFFD"," \uFFFD ",
        " \uFFFD ","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
        "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD\uFFFD","\uFFFD","\uFFFD",
        "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD"," \uFFFD ",
        " \uFFFD "," \uFFFD "," \uFFFD "," \uFFFD "," \uFFFD "," \uFFFD ",
        " \uFFFD ","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
        "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
        "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
    ];

    // HELPER FUNCTIONS
    // we can probably do this better...

    // Maps the low four bits of n to an upper-case hexadecimal digit.
    static char toHexDigit(int n)
    {
        return "0123456789ABCDEF"[n & 0xF];
    }

    // Renders s as a double-quoted literal for assertion messages: code units
    // in [0x20, 0x80) are copied verbatim, all others become \xHH escapes.
    static string makeReadable(string s)
    {
        string r = "\"";
        foreach (char c;s)
        {
            if (c >= 0x20 && c < 0x80)
            {
                r ~= c;
            }
            else
            {
                r ~= "\\x";
                r ~= toHexDigit(c >> 4);
                r ~= toHexDigit(c);
            }
        }
        r ~= "\"";
        return r;
    }

    // Transcodes s into r by walking the code points, and the code units of
    // each code point, back to front and prepending to r.
    void transcodeReverse(Src,Dst)(immutable(Src)[] s, out immutable(Dst)[] r)
    {
        static if (is(Src == Dst))
        {
            // Same element type: nothing to transcode. The result must go
            // through the out parameter; this function returns void.
            r = s;
        }
        else static if (is(Src == AsciiChar))
        {
            transcodeReverse!(char,Dst)(cast(string) s,r);
        }
        else
        {
            foreach_reverse (d;codePoints(s))
            {
                foreach_reverse (c;codeUnits!(Dst)(d))
                {
                    r = c ~ r;
                }
            }
        }
    }

    // Make sure everything that should be valid, is
    foreach (a;validStrings)
    {
        string s = cast(string) a;
        assert(isValid(s),"Failed to validate: "~makeReadable(s));
    }

    // Make sure everything that shouldn't be valid, isn't
    foreach (a;invalidStrings)
    {
        string s = cast(string) a;
        assert(!isValid(s),"Incorrectly validated: "~makeReadable(s));
    }

    // Make sure we can sanitize everything bad
    assert(invalidStrings.length == sanitizedStrings.length);
    foreach (i, a; invalidStrings)
    {
        string s = cast(string) a;
        string t = sanitize(s);
        assert(isValid(t));
        assert(t == sanitizedStrings[i]);
        // Sanitized output is valid, so it also feeds the round-trip tests.
        ubyte[] u = cast(ubyte[]) t;
        validStrings ~= u;
    }

    // Make sure all transcodings work in both directions, using both forward
    // and reverse iteration
    foreach (a; validStrings)
    {
        string s = cast(string) a;
        string s2;
        wstring ws, ws2;
        dstring ds, ds2;

        transcode(s,ws);
        assert(isValid(ws));
        transcode(ws,s2);
        assert(s == s2);

        transcode(s,ds);
        assert(isValid(ds));
        transcode(ds,s2);
        assert(s == s2);

        transcode(ws,s);
        assert(isValid(s));
        transcode(s,ws2);
        assert(ws == ws2);

        transcode(ws,ds);
        assert(isValid(ds));
        transcode(ds,ws2);
        assert(ws == ws2);

        transcode(ds,s);
        assert(isValid(s));
        transcode(s,ds2);
        assert(ds == ds2);

        transcode(ds,ws);
        assert(isValid(ws));
        transcode(ws,ds2);
        assert(ds == ds2);

        transcodeReverse(s,ws);
        assert(isValid(ws));
        transcodeReverse(ws,s2);
        assert(s == s2);

        transcodeReverse(s,ds);
        assert(isValid(ds));
        transcodeReverse(ds,s2);
        assert(s == s2);

        transcodeReverse(ws,s);
        assert(isValid(s));
        transcodeReverse(s,ws2);
        assert(ws == ws2);

        transcodeReverse(ws,ds);
        assert(isValid(ds));
        transcodeReverse(ds,ws2);
        assert(ws == ws2);

        transcodeReverse(ds,s);
        assert(isValid(s));
        transcodeReverse(s,ds2);
        assert(ds == ds2);

        transcodeReverse(ds,ws);
        assert(isValid(ws));
        transcodeReverse(ws,ds2);
        assert(ds == ds2);
    }

    // Make sure the non-UTF encodings work too
    {
        auto s = "\u20AC100";
        Windows1252String t;
        transcode(s,t);
        assert(t == cast(Windows1252Char[])[0x80, '1', '0', '0']);
        string u;
        transcode(s,u);
        assert(s == u);
        Latin1String v;
        transcode(s,v);
        assert(cast(string) v == "?100");
        AsciiString w;
        transcode(v,w);
        assert(cast(string) w == "?100");
        s = "\u017Dlu\u0165ou\u010Dk\u00FD k\u016F\u0148";
        Latin2String x;
        transcode(s,x);
        assert(x == cast(Latin2Char[])[0xae, 'l', 'u', 0xbb, 'o', 'u', 0xe8, 'k', 0xfd, ' ', 'k', 0xf9, 0xf2]);
        Windows1250String y;
        transcode(s,y);
        assert(y == cast(Windows1250Char[])[0x8e, 'l', 'u', 0x9d, 'o', 'u', 0xe8, 'k', 0xfd, ' ', 'k', 0xf9, 0xf2]);
        s = "\u0402lu\u0403ou\u201D\u045C k\u0414\u044F";
        Windows1251String s51;
        transcode(s,s51);
        assert(s51 == cast(Windows1251Char[])[0x80, 'l', 'u', 0x81, 'o', 'u', 0x94, 0x9d, ' ', 'k', 0xc4, 0xff]);
    }

    // Make sure we can count properly
    {
        assert(encodedLength!(char)('A') == 1);
        assert(encodedLength!(char)('\u00E3') == 2);
        assert(encodedLength!(char)('\u2028') == 3);
        assert(encodedLength!(char)('\U0010FFF0') == 4);
        assert(encodedLength!(wchar)('A') == 1);
        assert(encodedLength!(wchar)('\U0010FFF0') == 2);
    }

    // Make sure we can write into mutable arrays
    {
        char[4] buffer;
        auto n = encode(cast(dchar)'\u00E3',buffer);
        assert(n == 2);
        assert(buffer[0] == 0xC3);
        assert(buffer[1] == 0xA3);
    }
}

//=============================================================================

/** Special value returned by `safeDecode` */
enum dchar INVALID_SEQUENCE = cast(dchar) 0xFFFFFFFF;

// Mixed into every EncoderInstance. The mixing scope must supply the code
// unit type `E` and the primitives `encodeViaWrite`, `skipViaRead`,
// `decodeViaRead`, `safeDecodeViaRead` and `decodeReverseViaRead`; those are
// written against free `read()` / `write()` functions, which the small
// templates below bind to a slice, an array or a delegate.
template EncoderFunctions()
{
    // Various forms of read

    template ReadFromString()
    {
        @property bool canRead() { return s.length != 0; }
        E peek() @safe pure @nogc nothrow { return s[0]; }
        E read() @safe pure @nogc nothrow { E t = s[0]; s = s[1..$]; return t; }
    }

    template ReverseReadFromString()
    {
        @property bool canRead() { return s.length != 0; }
        E peek() @safe pure @nogc nothrow { return s[$-1]; }
        E read() @safe pure @nogc nothrow { E t = s[$-1]; s = s[0..$-1]; return t; }
    }

    // Various forms of Write

    template WriteToString()
    {
        E[] s;
        void write(E c) @safe pure nothrow { s ~= c; }
    }

    template WriteToArray()
    {
        void write(E c) @safe pure @nogc nothrow { array[0] = c; array = array[1..$]; }
    }

    template WriteToDelegate()
    {
        void write(E c) { dg(c); }
    }

    // Functions we will export

    template EncodeViaWrite()
    {
        mixin encodeViaWrite;
        void encode(dchar c) { encodeViaWrite(c); }
    }

    template SkipViaRead()
    {
        mixin skipViaRead;
        void skip() @safe pure @nogc nothrow { skipViaRead(); }
    }

    template DecodeViaRead()
    {
        mixin decodeViaRead;
        dchar decode() @safe pure @nogc nothrow { return decodeViaRead(); }
    }

    template SafeDecodeViaRead()
    {
        mixin safeDecodeViaRead;
        dchar safeDecode() @safe pure @nogc nothrow { return safeDecodeViaRead(); }
    }

    template DecodeReverseViaRead()
    {
        mixin decodeReverseViaRead;
        dchar decodeReverse() @safe pure @nogc nothrow { return decodeReverseViaRead(); }
    }

    // Encoding to different destinations

    template EncodeToString()
    {
        mixin WriteToString;
        mixin EncodeViaWrite;
    }

    template EncodeToArray()
    {
        mixin WriteToArray;
        mixin EncodeViaWrite;
    }

    template EncodeToDelegate()
    {
        mixin WriteToDelegate;
        mixin EncodeViaWrite;
    }

    // Decoding functions

    template SkipFromString()
    {
        mixin ReadFromString;
        mixin SkipViaRead;
    }

    template DecodeFromString()
    {
        mixin ReadFromString;
        mixin DecodeViaRead;
    }

    template SafeDecodeFromString()
    {
        mixin ReadFromString;
        mixin SafeDecodeViaRead;
    }

    template DecodeReverseFromString()
    {
        mixin ReverseReadFromString;
        mixin DecodeReverseViaRead;
    }

    //=========================================================================

    // Below are the functions we will ultimately expose to the user

    E[] encode(dchar c) @safe pure nothrow
    {
        mixin EncodeToString e;
        e.encode(c);
        return e.s;
    }

    void encode(dchar c, ref E[] array) @safe pure nothrow
    {
        mixin EncodeToArray e;
        e.encode(c);
    }

    void encode(dchar c, void delegate(E) dg)
    {
        mixin EncodeToDelegate e;
        e.encode(c);
    }

    void skip(ref const(E)[] s) @safe pure nothrow
    {
        mixin SkipFromString e;
        e.skip();
    }

    dchar decode(S)(ref S s)
    {
        mixin DecodeFromString e;
        return e.decode();
    }

    dchar safeDecode(S)(ref S s)
    {
        mixin SafeDecodeFromString e;
        return e.safeDecode();
    }

    dchar decodeReverse(ref const(E)[] s) @safe pure nothrow
    {
        mixin DecodeReverseFromString e;
        return e.decodeReverse();
    }
}

//=========================================================================

// foreach / foreach_reverse adapter that yields the code points of a slice of
// code units. Iteration consumes the stored slice `s`.
struct CodePoints(E)
{
    const(E)[] s;

    this(const(E)[] s)
    in
    {
        assert(isValid(s));
    }
    do
    {
        this.s = s;
    }

    int opApply(scope int delegate(ref dchar) dg)
    {
        int result = 0;
        while (s.length != 0)
        {
            dchar c = decode(s);
            result = dg(c);
            if (result != 0) break;
        }
        return result;
    }

    int opApply(scope int delegate(ref size_t, ref dchar) dg)
    {
        size_t i = 0;
        int result = 0;
        while (s.length != 0)
        {
            immutable len = s.length;
            dchar c = decode(s);
            size_t j = i; // We don't want the delegate corrupting i
            result = dg(j,c);
            if (result != 0) break;
            // Advance by the number of code units decode() consumed.
            i += len - s.length;
        }
        return result;
    }

    int opApplyReverse(scope int delegate(ref dchar) dg)
    {
        int result = 0;
        while (s.length != 0)
        {
            dchar c = decodeReverse(s);
            result = dg(c);
            if (result != 0) break;
        }
        return result;
    }

    int opApplyReverse(scope int delegate(ref size_t, ref dchar) dg)
    {
        int result = 0;
        while (s.length != 0)
        {
            dchar c = decodeReverse(s);
            // After the trailing code point has been removed, the remaining
            // length is the index at which that code point started.
            size_t i = s.length;
            result = dg(i,c);
            if (result != 0) break;
        }
        return result;
    }
}

// foreach / foreach_reverse adapter that yields the code units produced by
// encoding a single code point as E.
struct CodeUnits(E)
{
    E[] s;

    this(dchar d)
    in
    {
        assert(isValidCodePoint(d));
    }
    do
    {
        s = encode!(E)(d);
    }

    int opApply(scope int delegate(ref E) dg)
    {
        int result = 0;
        foreach (E c;s)
        {
            result = dg(c);
            if (result != 0) break;
        }
        return result;
    }

    int opApplyReverse(scope int delegate(ref E) dg)
    {
        int result = 0;
        foreach_reverse (E c;s)
        {
            result = dg(c);
            if (result != 0) break;
        }
        return result;
    }
}

//=============================================================================

// Fallback: any code unit type without a dedicated specialisation is rejected
// at compile time.
template EncoderInstance(E)
{
    static assert(false,"Cannot instantiate EncoderInstance for type "
        ~ E.stringof);
}

// Shared implementation for table-driven single-byte encodings. The mixing
// scope must provide `E`, `EString`, `m_charMapStart`, `m_charMapEnd`,
// `charMap` (code unit -> code point for [m_charMapStart, m_charMapEnd]) and
// `bstMap` (code point -> code unit, searched below as an implicit binary
// search tree with the children of index i at 2*i+1 and 2*i+2).
private template GenericEncoder()
{
    bool canEncode(dchar c) @safe pure @nogc nothrow
    {
        // Outside the mapped range (and below 0x100) code points map to themselves.
        if (c < m_charMapStart || (c > m_charMapEnd && c < 0x100)) return true;
        if (c >= 0xFFFD) return false;

        auto idx = 0;
        while (idx < bstMap.length)
        {
            if (bstMap[idx][0] == c) return true;
            idx = bstMap[idx][0] > c ? 2 * idx + 1 : 2 * idx + 2; // next BST index
        }

        return false;
    }

    bool isValidCodeUnit(E c) @safe pure @nogc nothrow
    {
        if (c < m_charMapStart || c > m_charMapEnd) return true;
        // 0xFFFD in charMap marks a code unit with no assigned character.
        return charMap[c-m_charMapStart] != 0xFFFD;
    }

    size_t encodedLength(dchar c) @safe pure @nogc nothrow
    in
    {
        assert(canEncode(c));
    }
    do
    {
        return 1;
    }

    void encodeViaWrite()(dchar c)
    {
        if (c < m_charMapStart || (c > m_charMapEnd && c < 0x100)) {}
        else if (c >= 0xFFFD) { c = '?'; }
        else
        {
            auto idx = 0;
            while (idx < bstMap.length)
            {
                if (bstMap[idx][0] == c)
                {
                    write(cast(E) bstMap[idx][1]);
                    return;
                }
                idx = bstMap[idx][0] > c ? 2 * idx + 1 : 2 * idx + 2; // next BST index
            }
            // Not found in the table: substitute '?'.
            c = '?';
        }
        write(cast(E) c);
    }

    void skipViaRead()()
    {
        read();
    }

    dchar decodeViaRead()()
    {
        E c = read();
        return (c >= m_charMapStart && c <= m_charMapEnd) ? charMap[c-m_charMapStart] : c;
    }

    dchar safeDecodeViaRead()()
    {
        immutable E c = read();
        immutable d = (c >= m_charMapStart && c <= m_charMapEnd) ? charMap[c-m_charMapStart] : c;
        return d == 0xFFFD ? INVALID_SEQUENCE : d;
    }

    dchar decodeReverseViaRead()()
    {
        E c = read();
        return (c >= m_charMapStart && c <= m_charMapEnd) ? charMap[c-m_charMapStart] : c;
    }

    @property EString replacementSequence() @safe pure @nogc nothrow
    {
        return cast(EString)("?");
    }

    mixin EncoderFunctions;
}

//=============================================================================
//          ASCII
//=============================================================================

/** Defines various character sets. */
enum AsciiChar : ubyte { _init }
/// Ditto
alias AsciiString = immutable(AsciiChar)[];

template EncoderInstance(CharType : AsciiChar)
{
    alias E = AsciiChar;
    alias EString = AsciiString;

    @property string encodingName() @safe pure nothrow @nogc
    {
        return "ASCII";
    }

    bool canEncode(dchar c) @safe pure nothrow @nogc
    {
        return c < 0x80;
    }

    bool isValidCodeUnit(AsciiChar c) @safe pure nothrow @nogc
    {
        return c < 0x80;
    }

    size_t encodedLength(dchar c) @safe pure nothrow @nogc
    in
    {
        assert(canEncode(c));
    }
    do
    {
        return 1;
    }

    void encodeX(Range)(dchar c, Range r)
    {
        if (!canEncode(c)) c = '?';
        r.write(cast(AsciiChar) c);
    }

    void encodeViaWrite()(dchar c)
    {
        // Unencodable code points are replaced by '?'.
        if (!canEncode(c)) c = '?';
        write(cast(AsciiChar) c);
    }

    void skipViaRead()()
    {
        read();
    }

    dchar decodeViaRead()()
    {
        return read();
    }

    dchar safeDecodeViaRead()()
    {
        immutable c = read();
        return canEncode(c) ? c : INVALID_SEQUENCE;
    }

    dchar decodeReverseViaRead()()
    {
        return read();
    }

    @property EString replacementSequence() @safe pure nothrow @nogc
    {
        return cast(EString)("?");
    }

    mixin EncoderFunctions;
}

//=============================================================================
//          ISO-8859-1
//=============================================================================

/** Defines a Latin1-encoded character. */
enum Latin1Char : ubyte { _init }
/**
Defines a Latin1-encoded string (as an array of $(D
immutable(Latin1Char))).
 */
alias Latin1String = immutable(Latin1Char)[];

template EncoderInstance(CharType : Latin1Char)
{
    alias E = Latin1Char;
    alias EString = Latin1String;

    @property string encodingName() @safe pure nothrow @nogc
    {
        return "ISO-8859-1";
    }

    bool canEncode(dchar c) @safe pure nothrow @nogc
    {
        return c < 0x100;
    }

    bool isValidCodeUnit(Latin1Char c) @safe pure nothrow @nogc
    {
        return true;
    }

    size_t encodedLength(dchar c) @safe pure nothrow @nogc
    in
    {
        assert(canEncode(c));
    }
    do
    {
        return 1;
    }

    void encodeViaWrite()(dchar c)
    {
        // Unencodable code points are replaced by '?'.
        if (!canEncode(c)) c = '?';
        write(cast(Latin1Char) c);
    }

    void skipViaRead()()
    {
        read();
    }

    dchar decodeViaRead()()
    {
        return read();
    }

    dchar safeDecodeViaRead()()
    {
        return read();
    }

    dchar decodeReverseViaRead()()
    {
        return read();
    }

    @property EString replacementSequence() @safe pure nothrow @nogc
    {
        return cast(EString)("?");
    }

    mixin EncoderFunctions;
}

//=============================================================================
//          ISO-8859-2
//=============================================================================

/// Defines a Latin2-encoded character.
enum Latin2Char : ubyte { _init }

/**
 * Defines a Latin2-encoded string (as an array of $(D
 * immutable(Latin2Char))).
 */
alias Latin2String = immutable(Latin2Char)[];

private template EncoderInstance(CharType : Latin2Char)
{
    import std.typecons : Tuple, tuple;

    alias E = Latin2Char;
    alias EString = Latin2String;

    @property string encodingName() @safe pure nothrow @nogc
    {
        return "ISO-8859-2";
    }

    // Code units in [m_charMapStart, m_charMapEnd] are translated through
    // charMap; GenericEncoder treats everything else as mapping to itself.
    private static immutable dchar m_charMapStart = 0xa1;
    private static immutable dchar m_charMapEnd = 0xff;

    // charMap[i] is the code point for code unit m_charMapStart + i.
    private immutable wstring charMap =
        "\u0104\u02D8\u0141\u00A4\u013D\u015A\u00A7\u00A8"~
        "\u0160\u015E\u0164\u0179\u00AD\u017D\u017B\u00B0"~
        "\u0105\u02DB\u0142\u00B4\u013E\u015B\u02C7\u00B8"~
        "\u0161\u015F\u0165\u017A\u02DD\u017E\u017C\u0154"~
        "\u00C1\u00C2\u0102\u00C4\u0139\u0106\u00C7\u010C"~
        "\u00C9\u0118\u00CB\u011A\u00CD\u00CE\u010E\u0110"~
        "\u0143\u0147\u00D3\u00D4\u0150\u00D6\u00D7\u0158"~
        "\u016E\u00DA\u0170\u00DC\u00DD\u0162\u00DF\u0155"~
        "\u00E1\u00E2\u0103\u00E4\u013A\u0107\u00E7\u010D"~
        "\u00E9\u0119\u00EB\u011B\u00ED\u00EE\u010F\u0111"~
        "\u0144\u0148\u00F3\u00F4\u0151\u00F6\u00F7\u0159"~
        "\u016F\u00FA\u0171\u00FC\u00FD\u0163\u02D9";

    // (code point, code unit) pairs laid out for GenericEncoder's
    // array-embedded binary search; the element order is significant.
    private immutable Tuple!(wchar, char)[] bstMap = [
        tuple('\u0148','\xF2'), tuple('\u00F3','\xF3'), tuple('\u0165','\xBB'),
        tuple('\u00D3','\xD3'), tuple('\u010F','\xEF'), tuple('\u015B','\xB6'),
        tuple('\u017C','\xBF'), tuple('\u00C1','\xC1'), tuple('\u00E1','\xE1'),
        tuple('\u0103','\xE3'), tuple('\u013A','\xE5'), tuple('\u0155','\xE0'),
        tuple('\u0161','\xB9'), tuple('\u0171','\xFB'), tuple('\u02D8','\xA2'),
        tuple('\u00AD','\xAD'), tuple('\u00C9','\xC9'), tuple('\u00DA','\xDA'),
        tuple('\u00E9','\xE9'), tuple('\u00FA','\xFA'), tuple('\u0107','\xE6'),
        tuple('\u0119','\xEA'), tuple('\u0142','\xB3'), tuple('\u0151','\xF5'),
        tuple('\u0159','\xF8'), tuple('\u015F','\xBA'), tuple('\u0163','\xFE'),
        tuple('\u016F','\xF9'), tuple('\u017A','\xBC'), tuple('\u017E','\xBE'),
        tuple('\u02DB','\xB2'), tuple('\u00A7','\xA7'), tuple('\u00B4','\xB4'),
        tuple('\u00C4','\xC4'), tuple('\u00CD','\xCD'), tuple('\u00D6','\xD6'),
        tuple('\u00DD','\xDD'), tuple('\u00E4','\xE4'), tuple('\u00ED','\xED'),
        tuple('\u00F6','\xF6'), tuple('\u00FD','\xFD'), tuple('\u0105','\xB1'),
        tuple('\u010D','\xE8'), tuple('\u0111','\xF0'), tuple('\u011B','\xEC'),
        tuple('\u013E','\xB5'), tuple('\u0144','\xF1'), tuple('\u0150','\xD5'),
        tuple('\u0154','\xC0'), tuple('\u0158','\xD8'), tuple('\u015A','\xA6'),
        tuple('\u015E','\xAA'), tuple('\u0160','\xA9'), tuple('\u0162','\xDE'),
        tuple('\u0164','\xAB'), tuple('\u016E','\xD9'), tuple('\u0170','\xDB'),
        tuple('\u0179','\xAC'), tuple('\u017B','\xAF'), tuple('\u017D','\xAE'),
        tuple('\u02C7','\xB7'), tuple('\u02D9','\xFF'), tuple('\u02DD','\xBD'),
        tuple('\u00A4','\xA4'), tuple('\u00A8','\xA8'), tuple('\u00B0','\xB0'),
        tuple('\u00B8','\xB8'), tuple('\u00C2','\xC2'), tuple('\u00C7','\xC7'),
        tuple('\u00CB','\xCB'), tuple('\u00CE','\xCE'), tuple('\u00D4','\xD4'),
        tuple('\u00D7','\xD7'), tuple('\u00DC','\xDC'), tuple('\u00DF','\xDF'),
        tuple('\u00E2','\xE2'), tuple('\u00E7','\xE7'), tuple('\u00EB','\xEB'),
        tuple('\u00EE','\xEE'), tuple('\u00F4','\xF4'), tuple('\u00F7','\xF7'),
        tuple('\u00FC','\xFC'), tuple('\u0102','\xC3'), tuple('\u0104','\xA1'),
        tuple('\u0106','\xC6'), tuple('\u010C','\xC8'), tuple('\u010E','\xCF'),
        tuple('\u0110','\xD0'), tuple('\u0118','\xCA'), tuple('\u011A','\xCC'),
        tuple('\u0139','\xC5'), tuple('\u013D','\xA5'), tuple('\u0141','\xA3'),
        tuple('\u0143','\xD1'), tuple('\u0147','\xD2')
    ];

    mixin GenericEncoder!();
}

//=============================================================================
//          WINDOWS-1250
//=============================================================================

/// Defines a Windows1250-encoded character.
enum Windows1250Char : ubyte { _init }

/**
 * Defines a Windows1250-encoded string (as an array of $(D
 * immutable(Windows1250Char))).
 */
alias Windows1250String = immutable(Windows1250Char)[];

// Encoder for windows-1250: supplies the element/string aliases, the encoding
// name and two lookup tables, then mixes in GenericEncoder.
// NOTE(review): GenericEncoder is defined elsewhere in this file; presumably
// it builds the encode/decode routines from m_charMapStart, m_charMapEnd,
// charMap and bstMap -- confirm against its definition.
private template EncoderInstance(CharType : Windows1250Char)
{
    import std.typecons : Tuple, tuple;

    alias E = Windows1250Char;
    alias EString = Windows1250String;

    @property string encodingName() @safe pure nothrow @nogc
    {
        return "windows-1250";
    }

    // Inclusive range of code units covered by charMap (0x80 .. 0xFF, i.e.
    // 128 entries).
    private static immutable dchar m_charMapStart = 0x80;
    private static immutable dchar m_charMapEnd = 0xff;

    // Code unit -> code point: charMap[i] is the code point for code unit
    // m_charMapStart + i. Slots holding U+FFFD (0x81, 0x83, 0x88, 0x90, 0x98)
    // have no counterpart in bstMap; the module's unittests expect such code
    // units (e.g. 0x81) to be reported as invalid.
    private immutable wstring charMap =
        "\u20AC\uFFFD\u201A\uFFFD\u201E\u2026\u2020\u2021"~
        "\uFFFD\u2030\u0160\u2039\u015A\u0164\u017D\u0179"~
        "\uFFFD\u2018\u2019\u201C\u201D\u2022\u2013\u2014"~
        "\uFFFD\u2122\u0161\u203A\u015B\u0165\u017E\u017A"~
        "\u00A0\u02C7\u02D8\u0141\u00A4\u0104\u00A6\u00A7"~
        "\u00A8\u00A9\u015E\u00AB\u00AC\u00AD\u00AE\u017B"~
        "\u00B0\u00B1\u02DB\u0142\u00B4\u00B5\u00B6\u00B7"~
        "\u00B8\u0105\u015F\u00BB\u013D\u02DD\u013E\u017C"~
        "\u0154\u00C1\u00C2\u0102\u00C4\u0139\u0106\u00C7"~
        "\u010C\u00C9\u0118\u00CB\u011A\u00CD\u00CE\u010E"~
        "\u0110\u0143\u0147\u00D3\u00D4\u0150\u00D6\u00D7"~
        "\u0158\u016E\u00DA\u0170\u00DC\u00DD\u0162\u00DF"~
        "\u0155\u00E1\u00E2\u0103\u00E4\u013A\u0107\u00E7"~
        "\u010D\u00E9\u0119\u00EB\u011B\u00ED\u00EE\u010F"~
        "\u0111\u0144\u0148\u00F3\u00F4\u0151\u00F6\u00F7"~
        "\u0159\u016F\u00FA\u0171\u00FC\u00FD\u0163\u02D9";

    // Code point -> code unit, one entry per assigned code point of charMap.
    // NOTE(review): the entries are deliberately not sorted. The order looks
    // like a binary search tree flattened level by level (root U+011A, then
    // U+00DC / U+0179, ...), so do not re-sort or append entries without
    // checking how GenericEncoder walks this array.
    private immutable Tuple!(wchar, char)[] bstMap = [
        tuple('\u011A','\xCC'), tuple('\u00DC','\xDC'), tuple('\u0179','\x8F'),
        tuple('\u00B7','\xB7'), tuple('\u00FC','\xFC'), tuple('\u0158','\xD8'),
        tuple('\u201C','\x93'), tuple('\u00AC','\xAC'), tuple('\u00CB','\xCB'),
        tuple('\u00EB','\xEB'), tuple('\u010C','\xC8'), tuple('\u0143','\xD1'),
        tuple('\u0162','\xDE'), tuple('\u02D9','\xFF'), tuple('\u2039','\x8B'),
        tuple('\u00A7','\xA7'), tuple('\u00B1','\xB1'), tuple('\u00C2','\xC2'),
        tuple('\u00D4','\xD4'), tuple('\u00E2','\xE2'), tuple('\u00F4','\xF4'),
        tuple('\u0104','\xA5'), tuple('\u0110','\xD0'), tuple('\u013D','\xBC'),
        tuple('\u0150','\xD5'), tuple('\u015E','\xAA'), tuple('\u016E','\xD9'),
        tuple('\u017D','\x8E'), tuple('\u2014','\x97'), tuple('\u2021','\x87'),
        tuple('\u20AC','\x80'), tuple('\u00A4','\xA4'), tuple('\u00A9','\xA9'),
        tuple('\u00AE','\xAE'), tuple('\u00B5','\xB5'), tuple('\u00BB','\xBB'),
        tuple('\u00C7','\xC7'), tuple('\u00CE','\xCE'), tuple('\u00D7','\xD7'),
        tuple('\u00DF','\xDF'), tuple('\u00E7','\xE7'), tuple('\u00EE','\xEE'),
        tuple('\u00F7','\xF7'), tuple('\u0102','\xC3'), tuple('\u0106','\xC6'),
        tuple('\u010E','\xCF'), tuple('\u0118','\xCA'), tuple('\u0139','\xC5'),
        tuple('\u0141','\xA3'), tuple('\u0147','\xD2'), tuple('\u0154','\xC0'),
        tuple('\u015A','\x8C'), tuple('\u0160','\x8A'), tuple('\u0164','\x8D'),
        tuple('\u0170','\xDB'), tuple('\u017B','\xAF'), tuple('\u02C7','\xA1'),
        tuple('\u02DD','\xBD'), tuple('\u2019','\x92'), tuple('\u201E','\x84'),
        tuple('\u2026','\x85'), tuple('\u203A','\x9B'), tuple('\u2122','\x99'),
        tuple('\u00A0','\xA0'), tuple('\u00A6','\xA6'), tuple('\u00A8','\xA8'),
        tuple('\u00AB','\xAB'), tuple('\u00AD','\xAD'), tuple('\u00B0','\xB0'),
        tuple('\u00B4','\xB4'), tuple('\u00B6','\xB6'), tuple('\u00B8','\xB8'),
        tuple('\u00C1','\xC1'), tuple('\u00C4','\xC4'), tuple('\u00C9','\xC9'),
        tuple('\u00CD','\xCD'), tuple('\u00D3','\xD3'), tuple('\u00D6','\xD6'),
        tuple('\u00DA','\xDA'), tuple('\u00DD','\xDD'), tuple('\u00E1','\xE1'),
        tuple('\u00E4','\xE4'), tuple('\u00E9','\xE9'), tuple('\u00ED','\xED'),
        tuple('\u00F3','\xF3'), tuple('\u00F6','\xF6'), tuple('\u00FA','\xFA'),
        tuple('\u00FD','\xFD'), tuple('\u0103','\xE3'), tuple('\u0105','\xB9'),
        tuple('\u0107','\xE6'), tuple('\u010D','\xE8'), tuple('\u010F','\xEF'),
        tuple('\u0111','\xF0'), tuple('\u0119','\xEA'), tuple('\u011B','\xEC'),
        tuple('\u013A','\xE5'), tuple('\u013E','\xBE'), tuple('\u0142','\xB3'),
        tuple('\u0144','\xF1'), tuple('\u0148','\xF2'), tuple('\u0151','\xF5'),
        tuple('\u0155','\xE0'), tuple('\u0159','\xF8'), tuple('\u015B','\x9C'),
        tuple('\u015F','\xBA'), tuple('\u0161','\x9A'), tuple('\u0163','\xFE'),
        tuple('\u0165','\x9D'), tuple('\u016F','\xF9'), tuple('\u0171','\xFB'),
        tuple('\u017A','\x9F'), tuple('\u017C','\xBF'), tuple('\u017E','\x9E'),
        tuple('\u02D8','\xA2'), tuple('\u02DB','\xB2'), tuple('\u2013','\x96'),
        tuple('\u2018','\x91'), tuple('\u201A','\x82'), tuple('\u201D','\x94'),
        tuple('\u2020','\x86'), tuple('\u2022','\x95'), tuple('\u2030','\x89')
    ];

    mixin GenericEncoder!();
}

//=============================================================================
//          WINDOWS-1251
//=============================================================================

/// Defines a Windows1251-encoded character.
enum Windows1251Char : ubyte { _init }

/**
 * Defines a Windows1251-encoded string (as an array of $(D
 * immutable(Windows1251Char))).
*/
alias Windows1251String = immutable(Windows1251Char)[];

// Encoder for windows-1251: supplies the element/string aliases, the encoding
// name and two lookup tables, then mixes in GenericEncoder.
// NOTE(review): GenericEncoder is defined elsewhere in this file; presumably
// it builds the encode/decode routines from m_charMapStart, m_charMapEnd,
// charMap and bstMap -- confirm against its definition.
private template EncoderInstance(CharType : Windows1251Char)
{
    import std.typecons : Tuple, tuple;

    alias E = Windows1251Char;
    alias EString = Windows1251String;

    @property string encodingName() @safe pure nothrow @nogc
    {
        return "windows-1251";
    }

    // Inclusive range of code units covered by charMap (0x80 .. 0xFF, i.e.
    // 128 entries).
    private static immutable dchar m_charMapStart = 0x80;
    private static immutable dchar m_charMapEnd = 0xff;

    // Code unit -> code point: charMap[i] is the code point for code unit
    // m_charMapStart + i. The single U+FFFD slot (0x98) has no counterpart
    // in bstMap; the module's unittests expect 0x98 to be an invalid code
    // unit.
    private immutable wstring charMap =
        "\u0402\u0403\u201A\u0453\u201E\u2026\u2020\u2021"~
        "\u20AC\u2030\u0409\u2039\u040A\u040C\u040B\u040F"~
        "\u0452\u2018\u2019\u201C\u201D\u2022\u2013\u2014"~
        "\uFFFD\u2122\u0459\u203A\u045A\u045C\u045B\u045F"~
        "\u00A0\u040E\u045E\u0408\u00A4\u0490\u00A6\u00A7"~
        "\u0401\u00A9\u0404\u00AB\u00AC\u00AD\u00AE\u0407"~
        "\u00B0\u00B1\u0406\u0456\u0491\u00B5\u00B6\u00B7"~
        "\u0451\u2116\u0454\u00BB\u0458\u0405\u0455\u0457"~
        "\u0410\u0411\u0412\u0413\u0414\u0415\u0416\u0417"~
        "\u0418\u0419\u041A\u041B\u041C\u041D\u041E\u041F"~
        "\u0420\u0421\u0422\u0423\u0424\u0425\u0426\u0427"~
        "\u0428\u0429\u042A\u042B\u042C\u042D\u042E\u042F"~
        "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437"~
        "\u0438\u0439\u043A\u043B\u043C\u043D\u043E\u043F"~
        "\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447"~
        "\u0448\u0449\u044A\u044B\u044C\u044D\u044E\u044F";

    // Code point -> code unit, one entry per assigned code point of charMap.
    // NOTE(review): the entries are deliberately not sorted. The order looks
    // like a binary search tree flattened level by level (root U+0432, then
    // U+0412 / U+0453, ...), so do not re-sort or append entries without
    // checking how GenericEncoder walks this array.
    private immutable Tuple!(wchar, char)[] bstMap = [
        tuple('\u0432','\xE2'),tuple('\u0412','\xC2'),tuple('\u0453','\x83'),
        tuple('\u0401','\xA8'),tuple('\u0422','\xD2'),tuple('\u0442','\xF2'),
        tuple('\u2018','\x91'),tuple('\u00AD','\xAD'),tuple('\u0409','\x8A'),
        tuple('\u041A','\xCA'),tuple('\u042A','\xDA'),tuple('\u043A','\xEA'),
        tuple('\u044A','\xFA'),tuple('\u045B','\x9E'),tuple('\u2022','\x95'),
        tuple('\u00A7','\xA7'),tuple('\u00B5','\xB5'),tuple('\u0405','\xBD'),
        tuple('\u040E','\xA1'),tuple('\u0416','\xC6'),tuple('\u041E','\xCE'),
        tuple('\u0426','\xD6'),tuple('\u042E','\xDE'),tuple('\u0436','\xE6'),
        tuple('\u043E','\xEE'),tuple('\u0446','\xF6'),tuple('\u044E','\xFE'),
        tuple('\u0457','\xBF'),tuple('\u0490','\xA5'),tuple('\u201D','\x94'),
        tuple('\u203A','\x9B'),tuple('\u00A4','\xA4'),tuple('\u00AB','\xAB'),
        tuple('\u00B0','\xB0'),tuple('\u00B7','\xB7'),tuple('\u0403','\x81'),
        tuple('\u0407','\xAF'),tuple('\u040B','\x8E'),tuple('\u0410','\xC0'),
        tuple('\u0414','\xC4'),tuple('\u0418','\xC8'),tuple('\u041C','\xCC'),
        tuple('\u0420','\xD0'),tuple('\u0424','\xD4'),tuple('\u0428','\xD8'),
        tuple('\u042C','\xDC'),tuple('\u0430','\xE0'),tuple('\u0434','\xE4'),
        tuple('\u0438','\xE8'),tuple('\u043C','\xEC'),tuple('\u0440','\xF0'),
        tuple('\u0444','\xF4'),tuple('\u0448','\xF8'),tuple('\u044C','\xFC'),
        tuple('\u0451','\xB8'),tuple('\u0455','\xBE'),tuple('\u0459','\x9A'),
        tuple('\u045E','\xA2'),tuple('\u2013','\x96'),tuple('\u201A','\x82'),
        tuple('\u2020','\x86'),tuple('\u2030','\x89'),tuple('\u2116','\xB9'),
        tuple('\u00A0','\xA0'),tuple('\u00A6','\xA6'),tuple('\u00A9','\xA9'),
        tuple('\u00AC','\xAC'),tuple('\u00AE','\xAE'),tuple('\u00B1','\xB1'),
        tuple('\u00B6','\xB6'),tuple('\u00BB','\xBB'),tuple('\u0402','\x80'),
        tuple('\u0404','\xAA'),tuple('\u0406','\xB2'),tuple('\u0408','\xA3'),
        tuple('\u040A','\x8C'),tuple('\u040C','\x8D'),tuple('\u040F','\x8F'),
        tuple('\u0411','\xC1'),tuple('\u0413','\xC3'),tuple('\u0415','\xC5'),
        tuple('\u0417','\xC7'),tuple('\u0419','\xC9'),tuple('\u041B','\xCB'),
        tuple('\u041D','\xCD'),tuple('\u041F','\xCF'),tuple('\u0421','\xD1'),
        tuple('\u0423','\xD3'),tuple('\u0425','\xD5'),tuple('\u0427','\xD7'),
        tuple('\u0429','\xD9'),tuple('\u042B','\xDB'),tuple('\u042D','\xDD'),
        tuple('\u042F','\xDF'),tuple('\u0431','\xE1'),tuple('\u0433','\xE3'),
        tuple('\u0435','\xE5'),tuple('\u0437','\xE7'),tuple('\u0439','\xE9'),
        tuple('\u043B','\xEB'),tuple('\u043D','\xED'),tuple('\u043F','\xEF'),
        tuple('\u0441','\xF1'),tuple('\u0443','\xF3'),tuple('\u0445','\xF5'),
        tuple('\u0447','\xF7'),tuple('\u0449','\xF9'),tuple('\u044B','\xFB'),
        tuple('\u044D','\xFD'),tuple('\u044F','\xFF'),tuple('\u0452','\x90'),
        tuple('\u0454','\xBA'),tuple('\u0456','\xB3'),tuple('\u0458','\xBC'),
        tuple('\u045A','\x9C'),tuple('\u045C','\x9D'),tuple('\u045F','\x9F'),
        tuple('\u0491','\xB4'),tuple('\u2014','\x97'),tuple('\u2019','\x92'),
        tuple('\u201C','\x93'),tuple('\u201E','\x84'),tuple('\u2021','\x87'),
        tuple('\u2026','\x85'),tuple('\u2039','\x8B'),tuple('\u20AC','\x88'),
        tuple('\u2122','\x99')
    ];

    mixin GenericEncoder!();
}

//=============================================================================
//          WINDOWS-1252
//=============================================================================

/// Defines a Windows1252-encoded character.
enum Windows1252Char : ubyte { _init }

/**
 * Defines a Windows1252-encoded string (as an array of $(D
 * immutable(Windows1252Char))).
*/
alias Windows1252String = immutable(Windows1252Char)[];

// Encoder for windows-1252: supplies the element/string aliases, the encoding
// name and two lookup tables, then mixes in GenericEncoder.
// NOTE(review): GenericEncoder is defined elsewhere in this file; presumably
// it builds the encode/decode routines from m_charMapStart, m_charMapEnd,
// charMap and bstMap -- confirm against its definition.
template EncoderInstance(CharType : Windows1252Char)
{
    import std.typecons : Tuple, tuple;

    alias E = Windows1252Char;
    alias EString = Windows1252String;

    @property string encodingName() @safe pure nothrow @nogc
    {
        return "windows-1252";
    }

    // Inclusive range of code units covered by charMap. Only 0x80 .. 0x9F
    // (32 entries) is tabulated for this encoding.
    private static immutable dchar m_charMapStart = 0x80;
    private static immutable dchar m_charMapEnd = 0x9f;

    // Code unit -> code point: charMap[i] is the code point for code unit
    // m_charMapStart + i. Slots holding U+FFFD (0x81, 0x8D, 0x8F, 0x90, 0x9D)
    // have no counterpart in bstMap.
    private immutable wstring charMap =
        "\u20AC\uFFFD\u201A\u0192\u201E\u2026\u2020\u2021"~
        "\u02C6\u2030\u0160\u2039\u0152\uFFFD\u017D\uFFFD"~
        "\uFFFD\u2018\u2019\u201C\u201D\u2022\u2013\u2014"~
        "\u02DC\u2122\u0161\u203A\u0153\uFFFD\u017E\u0178";

    // Code point -> code unit, one entry per assigned code point of charMap.
    // NOTE(review): the entries are deliberately not sorted. The order looks
    // like a binary search tree flattened level by level (root U+201C, then
    // U+0192 / U+2039, ...), so do not re-sort or append entries without
    // checking how GenericEncoder walks this array.
    private immutable Tuple!(wchar, char)[] bstMap = [
        tuple('\u201C','\x93'), tuple('\u0192','\x83'), tuple('\u2039','\x8B'),
        tuple('\u0161','\x9A'), tuple('\u2014','\x97'), tuple('\u2021','\x87'),
        tuple('\u20AC','\x80'), tuple('\u0153','\x9C'), tuple('\u017D','\x8E'),
        tuple('\u02DC','\x98'), tuple('\u2019','\x92'), tuple('\u201E','\x84'),
        tuple('\u2026','\x85'), tuple('\u203A','\x9B'), tuple('\u2122','\x99'),
        tuple('\u0152','\x8C'), tuple('\u0160','\x8A'), tuple('\u0178','\x9F'),
        tuple('\u017E','\x9E'), tuple('\u02C6','\x88'), tuple('\u2013','\x96'),
        tuple('\u2018','\x91'), tuple('\u201A','\x82'), tuple('\u201D','\x94'),
        tuple('\u2020','\x86'), tuple('\u2022','\x95'), tuple('\u2030','\x89')
    ];

    mixin GenericEncoder!();
}

//=============================================================================
//          UTF-8
//=============================================================================

// Encoder for UTF-8 (code unit type char).
// NOTE(review): write(), read(), peek() and canRead are not defined in this
// template; presumably the EncoderFunctions mixin (defined elsewhere in this
// file) instantiates each *ViaWrite / *ViaRead routine in a context that
// supplies them -- confirm there.
template EncoderInstance(CharType : char)
{
    alias E = char;
    alias EString = immutable(char)[];

    @property string encodingName() @safe pure nothrow @nogc
    {
        return "UTF-8";
    }

    // Every valid code point is encodable.
    bool canEncode(dchar c) @safe pure nothrow @nogc
    {
        return isValidCodePoint(c);
    }

    // Rejects the bytes 0xC0, 0xC1 and 0xF5 .. 0xFF; everything else is
    // accepted as a code unit.
    bool isValidCodeUnit(char c) @safe pure nothrow @nogc
    {
        return (c < 0xC0 || (c >= 0xC2 && c < 0xF5));
    }

    // Indexed by (byte - 0x80): number of trailing bytes announced by a lead
    // byte. 0x80 .. 0xBF -> 0 (continuation bytes), 0xC0 .. 0xDF -> 1,
    // 0xE0 .. 0xEF -> 2, 0xF0 .. 0xF7 -> 3, 0xF8 .. 0xFB -> 4,
    // 0xFC .. 0xFD -> 5, 0xFE -> 6, 0xFF -> 0.
    immutable ubyte[128] tailTable =
    [
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
        2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
        3,3,3,3,3,3,3,3,4,4,4,4,5,5,6,0,
    ];

    // Looks up the trailing-byte count for a non-ASCII byte (c >= 0x80).
    private int tails(char c) @safe pure nothrow @nogc
    in
    {
        assert(c >= 0x80);
    }
    do
    {
        return tailTable[c-0x80];
    }

    // Number of code units (1 .. 4) needed to encode c.
    size_t encodedLength(dchar c) @safe pure nothrow @nogc
    in
    {
        assert(canEncode(c));
    }
    do
    {
        if (c < 0x80) return 1;
        if (c < 0x800) return 2;
        if (c < 0x10000) return 3;
        return 4;
    }

    // Emits the 1- to 4-byte encoding of c through write(), lead byte first.
    void encodeViaWrite()(dchar c)
    {
        if (c < 0x80)
        {
            write(cast(char) c);
        }
        else if (c < 0x800)
        {
            write(cast(char)((c >> 6) + 0xC0));
            write(cast(char)((c & 0x3F) + 0x80));
        }
        else if (c < 0x10000)
        {
            write(cast(char)((c >> 12) + 0xE0));
            write(cast(char)(((c >> 6) & 0x3F) + 0x80));
            write(cast(char)((c & 0x3F) + 0x80));
        }
        else
        {
            write(cast(char)((c >> 18) + 0xF0));
            write(cast(char)(((c >> 12) & 0x3F) + 0x80));
            write(cast(char)(((c >> 6) & 0x3F) + 0x80));
            write(cast(char)((c & 0x3F) + 0x80));
        }
    }

    // Consumes one encoded sequence without decoding it: the lead byte plus
    // as many trailing bytes as tailTable announces. No validation is done.
    void skipViaRead()()
    {
        auto c = read();
        if (c < 0xC0) return;
        int n = tails(cast(char) c);
        for (size_t i=0; i<n; ++i)
        {
            read();
        }
    }

    // Decodes one sequence without validation: the lead byte is masked down
    // to its payload bits, then each trailing byte contributes its low six
    // bits.
    dchar decodeViaRead()()
    {
        dchar c = read();
        if (c < 0xC0) return c;
        int n = tails(cast(char) c);
        c &= (1 << (6 - n)) - 1;
        for (size_t i=0; i<n; ++i)
        {
            c = (c << 6) + (read() & 0x3F);
        }
        return c;
    }

    // Validating decode: returns INVALID_SEQUENCE for a stray continuation
    // byte, a truncated sequence, a trailing byte that is not 10xxxxxx, or
    // any of the lead/second-byte combinations flagged in `err` below.
    dchar safeDecodeViaRead()()
    {
        dchar c = read();
        if (c < 0x80) return c;
        int n = tails(cast(char) c);
        if (n == 0) return INVALID_SEQUENCE;

        if (!canRead) return INVALID_SEQUENCE;
        size_t d = peek();
        // Decided from the lead byte and the (peeked, not yet consumed)
        // second byte; only reported after the trailing bytes were consumed.
        immutable err =
        (
            (c < 0xC2)                              // fail overlong 2-byte sequences
        ||  (c > 0xF4)                              // fail overlong 4-6-byte sequences
        ||  (c == 0xE0 && ((d & 0xE0) == 0x80))     // fail overlong 3-byte sequences
        ||  (c == 0xED && ((d & 0xE0) == 0xA0))     // fail surrogates
        ||  (c == 0xF0 && ((d & 0xF0) == 0x80))     // fail overlong 4-byte sequences
        ||  (c == 0xF4 && ((d & 0xF0) >= 0x90))     // fail code points > 0x10FFFF
        );

        c &= (1 << (6 - n)) - 1;
        for (size_t i=0; i<n; ++i)
        {
            if (!canRead) return INVALID_SEQUENCE;
            d = peek();
            // Stop before consuming a byte that is not a continuation byte.
            if ((d & 0xC0) != 0x80) return INVALID_SEQUENCE;
            c = (c << 6) + (read() & 0x3F);
        }

        return err ? INVALID_SEQUENCE : c;
    }

    // Decodes one sequence back to front without validation.
    // NOTE(review): assumes read() yields code units from the END of the
    // input in this context, so the first unit read is the last byte of the
    // sequence -- confirm against EncoderFunctions.
    dchar decodeReverseViaRead()()
    {
        dchar c = read();
        if (c < 0x80) return c;
        size_t shift = 0;
        c &= 0x3F;
        for (size_t i=0; i<4; ++i)
        {
            shift += 6;
            auto d = read();
            size_t n = tails(cast(char) d);
            // n == 0: another continuation byte (six payload bits);
            // n != 0: the lead byte, whose payload width depends on n.
            immutable mask = n == 0 ? 0x3F : (1 << (6 - n)) - 1;
            c += ((d & mask) << shift);
            if (n != 0) break;
        }
        return c;
    }

    // U+FFFD encoded as UTF-8.
    @property EString replacementSequence() @safe pure nothrow @nogc
    {
        return "\uFFFD";
    }

    mixin EncoderFunctions;
}

//=============================================================================
//          UTF-16
//=============================================================================

// Encoder for UTF-16 (code unit type wchar).
// NOTE(review): write(), read(), peek() and canRead are presumably supplied
// by the EncoderFunctions mixin (defined elsewhere in this file) -- confirm.
template EncoderInstance(CharType : wchar)
{
    alias E = wchar;
    alias EString = immutable(wchar)[];

    @property string encodingName() @safe pure nothrow @nogc
    {
        return "UTF-16";
    }

    // Every valid code point is encodable.
    bool canEncode(dchar c) @safe pure nothrow @nogc
    {
        return isValidCodePoint(c);
    }

    // Any 16-bit value is accepted as a code unit (surrogates included).
    bool isValidCodeUnit(wchar c) @safe pure nothrow @nogc
    {
        return true;
    }

    // One unit below 0x10000, otherwise two (a surrogate pair).
    size_t encodedLength(dchar c) @safe pure nothrow @nogc
    in
    {
        assert(canEncode(c));
    }
    do
    {
        return (c < 0x10000) ? 1 : 2;
    }

    // Writes c as one unit, or as a high (0xD800 + top ten bits) and low
    // (0xDC00 + bottom ten bits) surrogate of c - 0x10000.
    void encodeViaWrite()(dchar c)
    {
        if (c < 0x10000)
        {
            write(cast(wchar) c);
        }
        else
        {
            size_t n = c - 0x10000;
            write(cast(wchar)(0xD800 + (n >> 10)));
            write(cast(wchar)(0xDC00 + (n & 0x3FF)));
        }
    }

    // Consumes one unit, plus a second one if the first is a surrogate.
    void skipViaRead()()
    {
        immutable c = read();
        if (c < 0xD800 || c >= 0xE000) return;
        read();
    }

    // Decodes without validation; a surrogate is assumed to start a pair.
    dchar decodeViaRead()()
    {
        wchar c = read();
        if (c < 0xD800 || c >= 0xE000) return cast(dchar) c;
        wchar d = read();
        c &= 0x3FF;
        d &= 0x3FF;
        return 0x10000 + (c << 10) + d;
    }

    // Validating decode: a leading low surrogate, a missing second unit or a
    // second unit outside 0xDC00 .. 0xDFFF yields INVALID_SEQUENCE. The
    // second unit is only consumed once it is known to be a low surrogate.
    dchar safeDecodeViaRead()()
    {
        wchar c = read();
        if (c < 0xD800 || c >= 0xE000) return cast(dchar) c;
        if (c >= 0xDC00) return INVALID_SEQUENCE;
        if (!canRead) return INVALID_SEQUENCE;
        wchar d = peek();
        if (d < 0xDC00 || d >= 0xE000) return INVALID_SEQUENCE;
        d = read();
        c &= 0x3FF;
        d &= 0x3FF;
        return 0x10000 + (c << 10) + d;
    }

    // Decodes back to front without validation: the unit read first (c)
    // supplies the low ten bits and the unit read second (d) the high ten.
    // NOTE(review): assumes read() yields units from the END of the input in
    // this context -- confirm against EncoderFunctions.
    dchar decodeReverseViaRead()()
    {
        wchar c = read();
        if (c < 0xD800 || c >= 0xE000) return cast(dchar) c;
        wchar d = read();
        c &= 0x3FF;
        d &= 0x3FF;
        return 0x10000 + (d << 10) + c;
    }

    // U+FFFD encoded as UTF-16.
    @property EString replacementSequence() @safe pure nothrow @nogc
    {
        return "\uFFFD"w;
    }

    mixin EncoderFunctions;
}

//=============================================================================
//          UTF-32
//=============================================================================

// Encoder for UTF-32 (code unit type dchar): every code point is exactly one
// code unit, so encoding and decoding are plain copies.
// NOTE(review): write() and read() are presumably supplied by the
// EncoderFunctions mixin (defined elsewhere in this file) -- confirm.
template EncoderInstance(CharType : dchar)
{
    alias E = dchar;
    alias EString = immutable(dchar)[];

    @property string encodingName() @safe pure nothrow @nogc
    {
        return "UTF-32";
    }

    bool canEncode(dchar c) @safe pure @nogc nothrow
    {
        return isValidCodePoint(c);
    }

    // A code unit is valid exactly when it is a valid code point.
    bool isValidCodeUnit(dchar c) @safe pure @nogc nothrow
    {
        return isValidCodePoint(c);
    }

    size_t encodedLength(dchar c) @safe pure @nogc nothrow
    in
    {
        assert(canEncode(c));
    }
    do
    {
        return 1;
    }

    void encodeViaWrite()(dchar c)
    {
        write(c);
    }

    void skipViaRead()()
    {
        read();
    }

    dchar decodeViaRead()()
    {
        return cast(dchar) read();
    }

    // Validating decode: anything that is not a valid code point becomes
    // INVALID_SEQUENCE.
    dchar safeDecodeViaRead()()
    {
        immutable c = read();
        return isValidCodePoint(c) ? c : INVALID_SEQUENCE;
    }

    dchar decodeReverseViaRead()()
    {
        return cast(dchar) read();
    }

    // U+FFFD encoded as UTF-32.
    @property EString replacementSequence() @safe pure nothrow @nogc
    {
        return "\uFFFD"d;
    }

    mixin EncoderFunctions;
}

//=============================================================================
// Below are forwarding functions which expose the function to the user

/**
 Returns true if c is a valid code point, i.e. it lies in the range
 0 .. 0x10FFFF and is not a surrogate (0xD800 .. 0xDFFF).

 Note that this includes the non-character code points U+FFFE and U+FFFF,
 since these are valid code points (even though they are not valid
 characters).

 Supersedes:
 This function supersedes `std.utf.startsValidDchar()`.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1251, WINDOWS-1252

 Params:
    c = the code point to be tested
 */
bool isValidCodePoint(dchar c) @safe pure nothrow @nogc
{
    return c < 0xD800 || (c >= 0xE000 && c < 0x110000);
}

/**
 Returns the name of an encoding.

 The type of encoding cannot be deduced. Therefore, it is necessary to
 explicitly specify the encoding type.
Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
WINDOWS-1251, WINDOWS-1252
*/
@property string encodingName(T)()
{
    // Forward to the encoder specialised for the code unit type T.
    alias Encoder = EncoderInstance!(T);
    return Encoder.encodingName;
}

///
@safe unittest
{
    assert(encodingName!(char) == "UTF-8");
    assert(encodingName!(wchar) == "UTF-16");
    assert(encodingName!(dchar) == "UTF-32");
    assert(encodingName!(AsciiChar) == "ASCII");
    assert(encodingName!(Latin1Char) == "ISO-8859-1");
    assert(encodingName!(Latin2Char) == "ISO-8859-2");
    assert(encodingName!(Windows1250Char) == "windows-1250");
    assert(encodingName!(Windows1251Char) == "windows-1251");
    assert(encodingName!(Windows1252Char) == "windows-1252");
}

/**
 Reports whether the given code point can be represented in the encoding
 whose code unit type is `E`.

 The encoding cannot be inferred from the argument, so `E` has to be
 specified explicitly.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1251, WINDOWS-1252

 Params:
    c = the code point to be tested
 */
bool canEncode(E)(dchar c)
{
    alias Encoder = EncoderInstance!(E);
    return Encoder.canEncode(c);
}

///
@safe pure unittest
{
    assert( canEncode!(Latin1Char)('A'));
    assert( canEncode!(Latin2Char)('A'));
    assert(!canEncode!(AsciiChar)('\u00A0'));
    assert( canEncode!(Latin1Char)('\u00A0'));
    assert( canEncode!(Latin2Char)('\u00A0'));
    assert( canEncode!(Windows1250Char)('\u20AC'));
    assert(!canEncode!(Windows1250Char)('\u20AD'));
    assert(!canEncode!(Windows1250Char)('\uFFFD'));
    assert( canEncode!(Windows1251Char)('\u0402'));
    assert(!canEncode!(Windows1251Char)('\u20AD'));
    assert(!canEncode!(Windows1251Char)('\uFFFD'));
    assert( canEncode!(Windows1252Char)('\u20AC'));
    assert(!canEncode!(Windows1252Char)('\u20AD'));
    assert(!canEncode!(Windows1252Char)('\uFFFD'));
    assert(!canEncode!(char)(cast(dchar) 0x110000));
}

/// How to check an entire string
@safe pure unittest
{
    import std.algorithm.searching : find;
    import std.utf : byDchar;

    assert("The quick brown fox"
        .byDchar
        .find!(x => !canEncode!AsciiChar(x))
        .empty);
}

/**
 Reports whether a single code unit is legal in its encoding. The byte 0x80,
 for instance, is not legal in ASCII, whose code units are confined to the
 range 0x00 to 0x7F.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1251, WINDOWS-1252

 Params:
    c = the code unit to be tested
 */
bool isValidCodeUnit(E)(E c)
{
    alias Encoder = EncoderInstance!(E);
    return Encoder.isValidCodeUnit(c);
}

///
@system pure unittest
{
    assert(!isValidCodeUnit(cast(char) 0xC0));
    assert(!isValidCodeUnit(cast(char) 0xFF));
    assert( isValidCodeUnit(cast(wchar) 0xD800));
    assert(!isValidCodeUnit(cast(dchar) 0xD800));
    assert(!isValidCodeUnit(cast(AsciiChar) 0xA0));
    assert( isValidCodeUnit(cast(Windows1250Char) 0x80));
    assert(!isValidCodeUnit(cast(Windows1250Char) 0x81));
    assert( isValidCodeUnit(cast(Windows1251Char) 0x80));
    assert(!isValidCodeUnit(cast(Windows1251Char) 0x98));
    assert( isValidCodeUnit(cast(Windows1252Char) 0x80));
    assert(!isValidCodeUnit(cast(Windows1252Char) 0x81));
}

/**
 Returns true if the string is encoded correctly

 Supersedes:
 This function supersedes std.utf.validate(), however note that this
 function returns a bool indicating whether the input was valid or not,
 whereas the older function would throw an exception.
Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
WINDOWS-1251, WINDOWS-1252

Params:
    s = the string to be tested
*/
bool isValid(E)(const(E)[] s)
{
    // Valid exactly when the valid prefix spans the whole string.
    return validLength(s) == s.length;
}

///
@system pure unittest
{
    assert( isValid("\u20AC100"));
    assert(!isValid(cast(char[3])[167, 133, 175]));
}

/**
 Measures the longest validly encoded prefix of a string, in code units,
 counting from the first code unit.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1251, WINDOWS-1252

 Params:
    s = the string to be tested
 */
size_t validLength(E)(const(E)[] s)
{
    size_t consumed = 0;
    while (s.length > 0)
    {
        // safeDecode pops the decoded units off the front of s.
        immutable lengthBefore = s.length;
        if (EncoderInstance!(E).safeDecode(s) == INVALID_SEQUENCE)
            break;
        consumed += lengthBefore - s.length;
    }
    return consumed;
}

/**
 Produces a validly encoded string from a possibly malformed one.

 A string that is already valid is returned unchanged (no copy is made).
 Otherwise a new string is built in which every malformed code unit
 sequence is replaced by the encoding's replacement sequence: the Unicode
 replacement character (U+FFFD) if the character repertoire contains it,
 otherwise '?'.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1251, WINDOWS-1252

 Params:
    s = the string to be sanitized
 */
immutable(E)[] sanitize(E)(immutable(E)[] s)
{
    immutable validPrefix = validLength(s);
    if (validPrefix == s.length) return s;

    auto replacement = EncoderInstance!(E).replacementSequence;

    // Pass 1: size the output buffer. The bound may be too large, which is
    // harmless because the result is sliced to the used part at the end.
    size_t capacity = s.length;
    const(E)[] rest = s[validPrefix .. $];
    while (rest.length != 0)
    {
        // rest always starts at a malformed sequence here.
        immutable bad = EncoderInstance!(E).safeDecode(rest);
        assert(bad == INVALID_SEQUENCE);
        capacity += replacement.length;
        rest = rest[validLength(rest) .. $];
    }

    // Pass 2: copy valid runs, substituting each malformed sequence.
    E[] output = new E[capacity];
    output[0 .. validPrefix] = s[0 .. validPrefix];
    size_t pos = validPrefix;

    rest = s[validPrefix .. $];
    while (rest.length != 0)
    {
        immutable bad = EncoderInstance!(E).safeDecode(rest);
        assert(bad == INVALID_SEQUENCE);
        output[pos .. pos + replacement.length] = replacement[];
        pos += replacement.length;

        immutable run = validLength(rest);
        output[pos .. pos + run] = rest[0 .. run];
        pos += run;
        rest = rest[run .. $];
    }
    return cast(immutable(E)[]) output[0 .. pos];
}

///
@system pure unittest
{
    assert(sanitize("hello \xF0\x80world") == "hello \xEF\xBF\xBDworld");
}

/**
 Measures the first encoded sequence of a string, in code units.

 The input to this function MUST be validly encoded.
 This is enforced by the function's in-contract.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1251, WINDOWS-1252

 Params:
    s = the string to be sliced
 */
size_t firstSequence(E)(const(E)[] s)
in
{
    assert(s.length != 0);
    const(E)[] probe = s;
    assert(safeDecode(probe) != INVALID_SEQUENCE);
}
do
{
    // skip pops one whole sequence off the front of s.
    immutable lengthBefore = s.length;
    EncoderInstance!(E).skip(s);
    return lengthBefore - s.length;
}

///
@system pure unittest
{
    assert(firstSequence("\u20AC1000") == "\u20AC".length);
    assert(firstSequence("hel") == "h".length);
}

/**
 Returns the length of the last encoded sequence.

 The input to this function MUST be validly encoded.
 This is enforced by the function's in-contract.
Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
WINDOWS-1251, WINDOWS-1252

Params:
    s = the string to be sliced
*/
size_t lastSequence(E)(const(E)[] s)
in
{
    assert(s.length != 0);
    assert(isValid(s));
}
do
{
    // decodeReverse pops one whole sequence off the back of s.
    immutable lengthBefore = s.length;
    EncoderInstance!(E).decodeReverse(s);
    return lengthBefore - s.length;
}

///
@system pure unittest
{
    assert(lastSequence("1000\u20AC") == "\u20AC".length);
    assert(lastSequence("hellö") == "ö".length);
}

/**
 Converts a code point index into an array index: returns the position, in
 code units, at which the (n+1)th code point begins.

 The input to this function MUST be validly encoded.
 This is enforced by the function's in-contract.

 Supersedes:
 This function supersedes std.utf.toUTFindex().

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1251, WINDOWS-1252

 Params:
    s = the string to be counted
    n = the current code point index
 */
ptrdiff_t index(E)(const(E)[] s,int n)
in
{
    assert(isValid(s));
    assert(n >= 0);
}
do
{
    immutable lengthBefore = s.length;
    // Pop n sequences off the front; what was consumed is the answer.
    for (size_t remaining = n; remaining != 0; --remaining)
        EncoderInstance!(E).skip(s);
    return lengthBefore - s.length;
}

///
@system pure unittest
{
    assert(index("\u20AC100",1) == 3);
    assert(index("hällo",2) == 3);
}

/**
 Decodes a single code point.

 This function removes one or more code units from the start of a string,
 and returns the decoded code point which those code units represent.

 The input to this function MUST be validly encoded.
 This is enforced by the function's in-contract.

 Supersedes:
 This function supersedes std.utf.decode(), however, note that the
 function codePoints() supersedes it more conveniently.
Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
WINDOWS-1251, WINDOWS-1252

Params:
    s = the string whose first code point is to be decoded
*/
dchar decode(S)(ref S s)
in
{
    assert(s.length != 0);
    // Validate on a copy so the caller's slice is left untouched.
    auto probe = s;
    assert(safeDecode(probe) != INVALID_SEQUENCE);
}
do
{
    // The encoding is selected by the element type of the string.
    alias Encoder = EncoderInstance!(typeof(s[0]));
    return Encoder.decode(s);
}

/**
 Decodes the final code point of a string.

 One or more code units are removed from the end of the string, and the
 code point which those code units represent is returned.

 The input to this function MUST be validly encoded.
 This is enforced by the function's in-contract.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1251, WINDOWS-1252

 Params:
    s = the string whose last code point is to be decoded
 */
dchar decodeReverse(E)(ref const(E)[] s)
in
{
    assert(s.length != 0);
    assert(isValid(s));
}
do
{
    alias Encoder = EncoderInstance!(E);
    return Encoder.decodeReverse(s);
}

/**
 Decodes a single code point. The input does not have to be valid.

 This function removes one or more code units from the start of a string,
 and returns the decoded code point which those code units represent.

 This function will accept an invalidly encoded string as input.
 If an invalid sequence is found at the start of the string, this
 function will remove it, and return the value INVALID_SEQUENCE.
Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
WINDOWS-1251, WINDOWS-1252

Params:
    s = the string whose first code point is to be decoded
*/
dchar safeDecode(S)(ref S s)
in
{
    assert(s.length != 0);
}
do
{
    // The encoding is selected by the element type of the string.
    alias Encoder = EncoderInstance!(typeof(s[0]));
    return Encoder.safeDecode(s);
}

/**
 Computes how many code units are needed to encode a single code point.

 The input to this function MUST be a valid code point.
 This is enforced by the function's in-contract.

 The target encoding cannot be inferred from the argument, so it has to be
 specified explicitly as a template parameter.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1251, WINDOWS-1252

 Params:
    c = the code point to be encoded
 */
size_t encodedLength(E)(dchar c)
in
{
    assert(isValidCodePoint(c));
}
do
{
    alias Encoder = EncoderInstance!(E);
    return Encoder.encodedLength(c);
}

/**
 Encodes a single code point.

 This function encodes a single code point into one or more code units.
 It returns a string containing those code units.

 The input to this function MUST be a valid code point.
 This is enforced by the function's in-contract.

 The type of the output cannot be deduced. Therefore, it is necessary to
 explicitly specify the encoding as a template parameter.

 Supersedes:
 This function supersedes std.utf.encode(), however, note that the
 function codeUnits() supersedes it more conveniently.
Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
WINDOWS-1251, WINDOWS-1252

Params:
    c = the code point to be encoded
 */
E[] encode(E)(dchar c)
in
{
    assert(isValidCodePoint(c));
}
do
{
    alias Encoder = EncoderInstance!E;
    return Encoder.encode(c);
}

/**
Encodes a single code point into an array.

The code point is converted to one or more code units, which are written
to the start of a caller-supplied array. The array slice itself is passed
by value; its elements are written in place.

The code point passed in MUST be valid; the in-contract checks this.

Because the target encoding cannot be inferred from the code point alone,
it has to be given explicitly as the template parameter.

Supersedes:
This function supersedes std.utf.encode(), however, note that the
function codeUnits() supersedes it more conveniently.

Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
WINDOWS-1251, WINDOWS-1252

Params:
    c = the code point to be encoded
    array = the destination array

Returns:
    the number of code units written to the array
 */
size_t encode(E)(dchar c, E[] array)
in
{
    assert(isValidCodePoint(c));
}
do
{
    // The encoder is handed a local copy of the slice; the amount by which
    // that copy has shrunk afterwards is reported as the number written.
    E[] remaining = array;
    EncoderInstance!E.encode(c, remaining);
    return array.length - remaining.length;
}

/*
Encodes `c` in units of type `E` and writes the result to the
output range `R`. Returns the number of `E`s written.
*/
size_t encode(E, R)(dchar c, auto ref R range)
if (isNativeOutputRange!(R, E))
{
    static if (is(immutable E == immutable char))
    {
        // UTF-8: the magnitude of the code point selects the sequence length.
        if (c <= 0x7F)
        {
            // Single unit.
            put(range, cast(char) c);
            return 1;
        }
        else if (c <= 0x7FF)
        {
            // Lead unit 110xxxxx followed by one continuation unit.
            put(range, cast(char)(0xC0 | (c >> 6)));
            put(range, cast(char)(0x80 | (c & 0x3F)));
            return 2;
        }
        else if (c <= 0xFFFF)
        {
            // Lead unit 1110xxxx followed by two continuation units.
            put(range, cast(char)(0xE0 | (c >> 12)));
            put(range, cast(char)(0x80 | ((c >> 6) & 0x3F)));
            put(range, cast(char)(0x80 | (c & 0x3F)));
            return 3;
        }
        else if (c <= 0x10FFFF)
        {
            // Lead unit 11110xxx followed by three continuation units.
            put(range, cast(char)(0xF0 | (c >> 18)));
            put(range, cast(char)(0x80 | ((c >> 12) & 0x3F)));
            put(range, cast(char)(0x80 | ((c >> 6) & 0x3F)));
            put(range, cast(char)(0x80 | (c & 0x3F)));
            return 4;
        }
        else
        {
            // Values above 0x10FFFF are not handled.
            assert(0);
        }
    }
    else static if (is(immutable E == immutable wchar))
    {
        // UTF-16: values up to 0xFFFF take one unit, everything else a
        // high/low pair built from the value minus 0x10000.
        if (c <= 0xFFFF)
        {
            range.put(cast(wchar) c);
            return 1;
        }
        else
        {
            range.put(cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800));
            range.put(cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00));
            return 2;
        }
    }
    else static if (is(immutable E == immutable dchar))
    {
        // UTF-32: the code point is its own single code unit.
        range.put(c);
        return 1;
    }
    else
    {
        // No other element type is supported by this overload.
        static assert(0);
    }
}

@safe pure unittest
{
    import std.array;

    // 'T' needs exactly one unit whichever element type is requested.
    Appender!(char[]) sink;
    assert(encode!(char)('T', sink) == 1);
    assert(encode!(wchar)('T', sink) == 1);
    assert(encode!(dchar)('T', sink) == 1);
}

/**
Encodes a single code point to a delegate.

This function encodes a single code point into one or more code units.
The code units are passed one at a time to the supplied delegate.

The input to this function MUST be a valid code point.
This is enforced by the function's in-contract.

The type of the output cannot be deduced.
Therefore, it is necessary to
explicitly specify the encoding as a template parameter.

Supersedes:
This function supersedes std.utf.encode(), however, note that the
function codeUnits() supersedes it more conveniently.

Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
WINDOWS-1251, WINDOWS-1252

Params:
    c = the code point to be encoded
    dg = the delegate to invoke for each code unit
 */
void encode(E)(dchar c, void delegate(E) dg)
in
{
    assert(isValidCodePoint(c));
}
do
{
    EncoderInstance!E.encode(c, dg);
}

/**
Encodes the contents of `s` in units of type `Tgt`, writing the result to an
output range.

`s` is read one code point at a time using `decode`, so a source character
that occupies several `Src` code units is re-encoded as a single code point.
`s` must therefore be validly encoded; this is checked by the in-contract of
`decode`.

Returns: The number of `Tgt` elements written.
Params:
    Tgt = Element type of `range`.
    s = Input array. Must be validly encoded.
    range = Output range.
 */
size_t encode(Tgt, Src, R)(in Src[] s, R range)
{
    size_t result;

    // Work on a local slice: decode() consumes its argument by reference.
    // Iterating raw code units here would treat every UTF-8 byte or UTF-16
    // surrogate as a code point of its own and produce garbage output.
    const(Src)[] rest = s;
    while (rest.length != 0)
    {
        result += encode!(Tgt)(decode(rest), range);
    }
    return result;
}

/**
Returns a foreachable struct which can bidirectionally iterate over all
code points in a string.

The input to this function MUST be validly encoded.
This is enforced by the function's in-contract.

You can foreach either
with or without an index. If an index is specified, it will be initialized
at each iteration with the offset into the string at which the code point
begins.

Supersedes:
This function supersedes std.utf.decode().
Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
WINDOWS-1251, WINDOWS-1252

Params:
    s = the string to be decoded

Example:
--------------------------------------------------------
string s = "hello world";
foreach (c;codePoints(s))
{
    // do something with c (which will always be a dchar)
}
--------------------------------------------------------

Note that, currently, foreach (c;codePoints(s)) is superior to foreach (c;s)
in that the latter will fall over on encountering U+FFFF.
 */
CodePoints!(E) codePoints(E)(immutable(E)[] s)
in
{
    assert(isValid(s));
}
do
{
    alias Result = CodePoints!E;
    return Result(s);
}

///
@system unittest
{
    string source = "hello";
    string rebuilt;
    foreach (cp; codePoints(source))
    {
        rebuilt ~= cast(char) cp;
    }
    assert(source == rebuilt);
}

/**
Returns a foreachable struct which can bidirectionally iterate over all
code units in a code point.

The input to this function MUST be a valid code point.
This is enforced by the function's in-contract.

The type of the output cannot be deduced. Therefore, it is necessary to
explicitly specify the encoding type in the template parameter.

Supersedes:
This function supersedes std.utf.encode().
Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
WINDOWS-1251, WINDOWS-1252

Params:
    c = the code point to be encoded
 */
CodeUnits!(E) codeUnits(E)(dchar c)
in
{
    assert(isValidCodePoint(c));
}
do
{
    alias Result = CodeUnits!E;
    return Result(c);
}

///
@system unittest
{
    char[] units;
    foreach (u; codeUnits!(char)(cast(dchar)'\u20AC'))
    {
        units ~= u;
    }
    assert(units.length == 3);
    assert(units[0] == 0xE2);
    assert(units[1] == 0x82);
    assert(units[2] == 0xAC);
}

/**
Convert a string from one encoding to another.

Supersedes:
This function supersedes std.utf.toUTF8(), std.utf.toUTF16() and
std.utf.toUTF32()
(but note that to!() supersedes it more conveniently).

Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
WINDOWS-1251, WINDOWS-1252

Params:
    s = Source string. $(B Must) be validly encoded.
        This is enforced by the function's in-contract.
r = Destination string

See_Also:
    $(REF to, std,conv)
 */
void transcode(Src, Dst)(Src[] s, out Dst[] r)
in
{
    assert(isValid(s));
}
do
{
    static if (is(Src == Dst) && is(Src == immutable))
    {
        // Same immutable element type on both sides: hand back the input
        // slice itself, no copy is made.
        r = s;
    }
    else static if (is(immutable Src == immutable AsciiChar))
    {
        // ASCII input is re-dispatched as a char string.
        transcode(cast(const(char)[])s, r);
    }
    else
    {
        // Free space (in code units) that must be available in the output
        // buffer before each code point is encoded.
        static if (is(immutable Dst == immutable wchar))
        {
            immutable minReservePlace = 2;
        }
        else static if (is(immutable Dst == immutable dchar))
        {
            immutable minReservePlace = 1;
        }
        else
        {
            immutable minReservePlace = 6;
        }

        alias Unit = Unqual!Dst;

        // `buffer` owns the storage; `unwritten` is its not-yet-filled tail,
        // which the encoder advances as it writes.
        auto buffer = new Unit[s.length];
        auto unwritten = buffer;

        while (s.length != 0)
        {
            if (unwritten.length < minReservePlace)
            {
                // Grow the buffer, then re-derive the tail so that it starts
                // right after the units written so far.
                immutable written = buffer.length - unwritten.length;
                buffer.length += s.length + minReservePlace;
                unwritten = buffer[written .. $];
            }
            EncoderInstance!Unit.encode(decode(s), unwritten);
        }

        // Trim the unused tail off the result.
        r = cast(Dst[]) buffer[0 .. buffer.length - unwritten.length];
    }
}

///
@system pure unittest
{
    wstring ws;
    // transcode from UTF-8 to UTF-16
    transcode("hello world",ws);
    assert(ws == "hello world"w);

    Latin1String ls;
    // transcode from UTF-16 to ISO-8859-1
    transcode(ws, ls);
    assert(ls == "hello world");
}

@system pure unittest
{
    import std.meta;
    import std.range;

    // Round trip through every ordered pair of string types.
    {
        import std.conv : to;

        string asciiCharString = to!string(iota(0, 128, 1));

        alias Types = AliasSeq!(string, Latin1String, Latin2String, AsciiString,
            Windows1250String, Windows1251String, Windows1252String, dstring, wstring);
        foreach (S; Types)
        {
            foreach (D; Types)
            {
                string str;
                S sStr;
                D dStr;
                transcode(asciiCharString, sStr);
                transcode(sStr, dStr);
                transcode(dStr, str);
                assert(asciiCharString == str);
            }
        }
    }

    // The same round trip with non-ASCII text, UTF encodings only.
    {
        string czechChars = "Příliš žluťoučký kůň úpěl ďábelské ódy.";
        alias Types = AliasSeq!(string, dstring, wstring);
        foreach (S; Types)
        {
            foreach (D; Types)
            {
                string str;
                S sStr;
                D dStr;
                transcode(czechChars, sStr);
                transcode(sStr, dStr);
                transcode(dStr, str);
                assert(czechChars == str);
            }
        }
    }
}

@system unittest // mutable/const input/output
{
    import std.meta : AliasSeq;

    static foreach (O; AliasSeq!(Latin1Char, const Latin1Char, immutable Latin1Char))
    {{
        O[] output;

        char[] mutableInput = "äbc".dup;
        transcode(mutableInput, output);
        assert(output == [0xE4, 'b', 'c']);

        const char[] constInput = "öbc";
        transcode(constInput, output);
        assert(output == [0xF6, 'b', 'c']);

        immutable char[] immutInput = "übc";
        transcode(immutInput, output);
        assert(output == [0xFC, 'b', 'c']);
    }}

    // Make sure that const/mutable input is copied.
    static foreach (C; AliasSeq!(char, const char))
    {{
        C[] input = "foo".dup;
        C[] output;
        transcode(input, output);
        assert(input == output);
        assert(input !is output);
    }}

    // But immutable input should not be copied.
    string input = "foo";
    string output;
    transcode(input, output);
    assert(input is output);
}

//=============================================================================

/** The base class for exceptions thrown by this module */
class EncodingException : Exception
{
    this(string msg) @safe pure
    {
        super(msg);
    }
}

// The constructor is private, so only code in this module can create one.
class UnrecognizedEncodingException : EncodingException
{
    private this(string msg) @safe pure
    {
        super(msg);
    }
}

/** Abstract base class of all encoding schemes */
abstract class EncodingScheme
{
    import std.uni : toLower;

    /**
     * Registers a subclass of EncodingScheme.
     *
     * This function allows user-defined subclasses of EncodingScheme to
     * be declared in other modules.
     *
     * Params:
     *     Klass = The subclass of EncodingScheme to register.
*
     * Example:
     * ----------------------------------------------
     * class Amiga1251 : EncodingScheme
     * {
     *     shared static this()
     *     {
     *         EncodingScheme.register!Amiga1251;
     *     }
     * }
     * ----------------------------------------------
     */
    static void register(Klass:EncodingScheme)()
    {
        // A throw-away instance is created only to ask it for its names;
        // each name (lower-cased) is mapped to a factory for Klass.
        scope scheme = new Klass();
        foreach (encodingName; scheme.names())
        {
            supported[toLower(encodingName)] = () => new Klass();
        }
    }

    // Registers a scheme by fully qualified class name. The class is looked
    // up and instantiated at run time.
    //
    // Throws: EncodingException if the class cannot be found, cannot be
    // created, or is not an EncodingScheme.
    deprecated("Please pass the EncodingScheme subclass as template argument instead.")
    static void register(string className)
    {
        // ClassInfo.find yields null for an unknown class name; guard it so
        // that case is reported as an exception rather than a null dereference.
        EncodingScheme scheme;
        if (auto info = ClassInfo.find(className))
            scheme = cast(EncodingScheme) info.create();
        if (scheme is null)
            throw new EncodingException("Unable to create class "~className);
        foreach (encodingName; scheme.names())
        {
            supportedFactories[toLower(encodingName)] = className;
        }
    }

    /**
     * Obtains a subclass of EncodingScheme which is capable of encoding
     * and decoding the named encoding scheme.
     *
     * This function is only aware of EncodingSchemes which have been
     * registered with the register() function.
*
     * The name is matched after conversion to lower case.
     *
     * Params:
     *     encodingName = the name of the encoding scheme to obtain
     *
     * Throws:
     *     EncodingException if the name is not registered, or if the class
     *     registered under it cannot be found or created.
     *
     * Example:
     * ---------------------------------------------------
     * auto scheme = EncodingScheme.create("Amiga-1251");
     * ---------------------------------------------------
     */
    static EncodingScheme create(string encodingName)
    {
        static bool registerDefaultEncodings()
        {
            EncodingScheme.register!EncodingSchemeASCII;
            EncodingScheme.register!EncodingSchemeLatin1;
            EncodingScheme.register!EncodingSchemeLatin2;
            EncodingScheme.register!EncodingSchemeWindows1250;
            EncodingScheme.register!EncodingSchemeWindows1251;
            EncodingScheme.register!EncodingSchemeWindows1252;
            EncodingScheme.register!EncodingSchemeUtf8;
            EncodingScheme.register!EncodingSchemeUtf16Native;
            EncodingScheme.register!EncodingSchemeUtf32Native;
            return true;
        }

        // The built-in schemes are registered through initOnce.
        static shared bool initialized;
        import std.concurrency : initOnce;
        initOnce!initialized(registerDefaultEncodings());
        encodingName = toLower(encodingName);

        // Schemes registered through the template overload come with a factory.
        if (auto p = encodingName in supported)
            return (*p)();

        // Otherwise fall back to schemes registered by class name.
        auto p = encodingName in supportedFactories;
        if (p is null)
            throw new EncodingException("Unrecognized Encoding: "~encodingName);
        string className = *p;

        // ClassInfo.find yields null for an unknown class name; guard it so
        // that case is reported as an exception rather than a null dereference.
        EncodingScheme scheme;
        if (auto info = ClassInfo.find(className))
            scheme = cast(EncodingScheme) info.create();
        if (scheme is null) throw new EncodingException("Unable to create class "~className);
        return scheme;
    }

    const
    {
        /**
         * Returns the standard name of the encoding scheme
         */
        abstract override string toString();

        /**
         * Returns an array of all known names for this encoding scheme
         */
        abstract string[] names();

        /**
         * Returns true if the character c can be represented
         * in this encoding scheme.
         */
        abstract bool canEncode(dchar c);

        /**
         * Returns the number of ubytes required to encode this code point.
*
         * The input to this function MUST be a valid code point.
         *
         * Params:
         *    c = the code point to be encoded
         *
         * Returns:
         *    the number of ubytes required.
         */
        abstract size_t encodedLength(dchar c);

        /**
         * Encodes a single code point into a user-supplied, fixed-size buffer.
         *
         * This function encodes a single code point into one or more ubytes.
         * The supplied buffer must be code unit aligned.
         * (For example, UTF-16LE or UTF-16BE must be wchar-aligned,
         * UTF-32LE or UTF-32BE must be dchar-aligned, etc.)
         *
         * The input to this function MUST be a valid code point.
         *
         * Params:
         *    c = the code point to be encoded
         *    buffer = the destination array
         *
         * Returns:
         *    the number of ubytes written.
         */
        abstract size_t encode(dchar c, ubyte[] buffer);

        /**
         * Decodes a single code point.
         *
         * This function removes one or more ubytes from the start of an array,
         * and returns the decoded code point which those ubytes represent.
         *
         * The input to this function MUST be validly encoded.
         *
         * Params:
         *    s = the array whose first code point is to be decoded
         *
         * Returns:
         *    the decoded code point.
         */
        abstract dchar decode(ref const(ubyte)[] s);

        /**
         * Decodes a single code point. The input does not have to be valid.
         *
         * This function removes one or more ubytes from the start of an array,
         * and returns the decoded code point which those ubytes represent.
         *
         * This function will accept an invalidly encoded array as input.
         * If an invalid sequence is found at the start of the string, this
         * function will remove it, and return the value INVALID_SEQUENCE.
*
         * Params:
         *    s = the array whose first code point is to be decoded
         */
        abstract dchar safeDecode(ref const(ubyte)[] s);

        /**
         * Returns the sequence of ubytes to be used to represent
         * any character which cannot be represented in the encoding scheme.
         *
         * Normally this will be a representation of some substitution
         * character, such as U+FFFD or '?'.
         */
        abstract @property immutable(ubyte)[] replacementSequence();
    }

    /**
     * Tests whether an array is correctly encoded in this scheme.
     *
     * Params:
     *    s = the array to be tested
     *
     * Returns:
     *    true if no invalid sequence is found in `s`
     */
    bool isValid(const(ubyte)[] s)
    {
        // Consume the array sequence by sequence; the first bad one decides.
        for (; s.length != 0; )
        {
            immutable decoded = safeDecode(s);
            if (decoded == INVALID_SEQUENCE)
                return false;
        }
        return true;
    }

    /**
     * Returns the length of the longest possible substring, starting from
     * the first element, which is validly encoded.
     *
     * Params:
     *    s = the array to be tested
     */
    size_t validLength()(const(ubyte)[] s)
    {
        immutable total = s.length;

        // Number of ubytes still unread after the last good sequence.
        size_t leftAfterValid = total;
        while (s.length != 0)
        {
            if (safeDecode(s) == INVALID_SEQUENCE)
                break;
            leftAfterValid = s.length;
        }
        return total - leftAfterValid;
    }

    /**
     * Sanitizes an array by replacing malformed ubyte sequences with valid
     * ubyte sequences. The result is guaranteed to be valid for this
     * encoding scheme.
     *
     * If the input array is already valid, this function returns the
     * original, otherwise it constructs a new array by replacing all illegal
     * sequences with the encoding scheme's replacement sequence.
     *
     * Params:
     *    s = the string to be sanitized
     */
    immutable(ubyte)[] sanitize()(immutable(ubyte)[] s)
    {
        auto validPrefix = validLength(s);
        if (validPrefix == s.length)
            return s;

        auto repSeq = replacementSequence;

        // Pass 1: size the output. Every bad sequence adds one replacement
        // on top of the input length, so this can only overestimate.
        auto capacity = s.length;
        const(ubyte)[] rest = s[validPrefix .. $];
        while (rest.length != 0)
        {
            immutable bad = safeDecode(rest);
            assert(bad == INVALID_SEQUENCE);
            capacity += repSeq.length;
            rest = rest[validLength(rest) .. $];
        }

        // Pass 2: copy the valid prefix, then alternate between one
        // replacement sequence and the valid run that follows it.
        ubyte[] output = new ubyte[capacity];
        output[0 .. validPrefix] = s[0 .. validPrefix];
        auto pos = validPrefix;

        rest = s[validPrefix .. $];
        while (rest.length != 0)
        {
            immutable bad = safeDecode(rest);
            assert(bad == INVALID_SEQUENCE);
            output[pos .. pos + repSeq.length] = repSeq[];
            pos += repSeq.length;

            validPrefix = validLength(rest);
            output[pos .. pos + validPrefix] = rest[0 .. validPrefix];
            pos += validPrefix;
            rest = rest[validPrefix .. $];
        }
        return cast(immutable(ubyte)[])output[0 .. pos];
    }

    /**
     * Returns the length of the first encoded sequence.
     *
     * The input to this function MUST be validly encoded.
     * This is enforced by the function's in-contract.
     *
     * Params:
     *    s = the array to be sliced
     */
    size_t firstSequence()(const(ubyte)[] s)
    in
    {
        assert(s.length != 0);
        const(ubyte)[] probe = s;
        assert(safeDecode(probe) != INVALID_SEQUENCE);
    }
    do
    {
        // decode() shortens `s` by exactly one sequence.
        immutable before = s.length;
        decode(s);
        return before - s.length;
    }

    /**
     * Returns the total number of code points encoded in a ubyte array.
     *
     * The input to this function MUST be validly encoded.
     * This is enforced by the function's in-contract.
     *
     * Params:
     *    s = the string to be counted
     */
    size_t count()(const(ubyte)[] s)
    in
    {
        assert(isValid(s));
    }
    do
    {
        size_t total = 0;
        for (; s.length != 0; ++total)
            decode(s);
        return total;
    }

    /**
     * Returns the array index at which the (n+1)th code point begins.
     *
     * The input to this function MUST be validly encoded.
     * This is enforced by the function's in-contract.
*
     * Params:
     *    s = the array to be indexed
     *    n = the number of code points to skip from the start of `s`
     *
     * Returns:
     *    the number of ubytes occupied by the first `n` code points
     */
    ptrdiff_t index()(const(ubyte)[] s, size_t n)
    in
    {
        // `n` is unsigned, so it needs no lower-bound check.
        assert(isValid(s));
    }
    do
    {
        immutable before = s.length;
        foreach (_; 0 .. n)
            decode(s);
        return before - s.length;
    }

    // Lower-cased encoding name -> factory; filled by register!Klass.
    __gshared EncodingScheme function()[string] supported;
    // Lower-cased encoding name -> class name; filled by register(string).
    __gshared string[string] supportedFactories;
}

/**
 EncodingScheme to handle ASCII

 This scheme recognises the following names:
    "ANSI_X3.4-1968",
    "ANSI_X3.4-1986",
    "ASCII",
    "IBM367",
    "ISO646-US",
    "ISO_646.irv:1991",
    "US-ASCII",
    "cp367",
    "csASCII",
    "iso-ir-6",
    "us"
 */
class EncodingSchemeASCII : EncodingScheme
{
    /* // moved to std.internal.phobosinit
    shared static this()
    {
        EncodingScheme.register("std.encoding.EncodingSchemeASCII");
    }*/

    const
    {
        override string[] names() @safe pure nothrow
        {
            return
            [
                "ANSI_X3.4-1968",
                "ANSI_X3.4-1986",
                "ASCII",
                "IBM367",
                "ISO646-US",
                "ISO_646.irv:1991",
                "US-ASCII",
                "cp367",
                "csASCII",
                "iso-ir-6",
                "us"
            ];
        }

        override string toString() @safe pure nothrow @nogc
        {
            return "ASCII";
        }

        override bool canEncode(dchar c) @safe pure nothrow @nogc
        {
            return std.encoding.canEncode!AsciiChar(c);
        }

        override size_t encodedLength(dchar c) @safe pure nothrow @nogc
        {
            return std.encoding.encodedLength!AsciiChar(c);
        }

        override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc
        {
            // View the ubyte buffer as code units of this encoding.
            auto dst = cast(AsciiChar[]) buffer;
            return std.encoding.encode(c, dst);
        }

        override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc
        {
            // Decode through a typed view, then drop the consumed ubytes
            // from the caller's slice.
            auto units = cast(const(AsciiChar)[]) s;
            dchar cp = std.encoding.decode(units);
            s = s[$ - units.length .. $];
            return cp;
        }

        override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc
        {
            auto units = cast(const(AsciiChar)[]) s;
            dchar cp = std.encoding.safeDecode(units);
            s = s[$ - units.length .. $];
            return cp;
        }

        override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc
        {
            return cast(immutable(ubyte)[])"?";
        }
    }
}

/**
 EncodingScheme to handle Latin-1

 This scheme recognises the following names:
    "CP819",
    "IBM819",
    "ISO-8859-1",
    "ISO_8859-1",
    "ISO_8859-1:1987",
    "csISOLatin1",
    "iso-ir-100",
    "l1",
    "latin1"
 */
class EncodingSchemeLatin1 : EncodingScheme
{
    /* // moved to std.internal.phobosinit
    shared static this()
    {
        EncodingScheme.register("std.encoding.EncodingSchemeLatin1");
    }*/

    const
    {
        override string[] names() @safe pure nothrow
        {
            return
            [
                "CP819",
                "IBM819",
                "ISO-8859-1",
                "ISO_8859-1",
                "ISO_8859-1:1987",
                "csISOLatin1",
                "iso-ir-100",
                "l1",
                "latin1"
            ];
        }

        override string toString() @safe pure nothrow @nogc
        {
            return "ISO-8859-1";
        }

        override bool canEncode(dchar c) @safe pure nothrow @nogc
        {
            return std.encoding.canEncode!Latin1Char(c);
        }

        override size_t encodedLength(dchar c) @safe pure nothrow @nogc
        {
            return std.encoding.encodedLength!Latin1Char(c);
        }

        override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc
        {
            // View the ubyte buffer as code units of this encoding.
            auto dst = cast(Latin1Char[]) buffer;
            return std.encoding.encode(c, dst);
        }

        override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc
        {
            // Decode through a typed view, then drop the consumed ubytes
            // from the caller's slice.
            auto units = cast(const(Latin1Char)[]) s;
            dchar cp = std.encoding.decode(units);
            s = s[$ - units.length .. $];
            return cp;
        }

        override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc
        {
            auto units = cast(const(Latin1Char)[]) s;
            dchar cp = std.encoding.safeDecode(units);
            s = s[$ - units.length .. $];
            return cp;
        }

        override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc
        {
            return cast(immutable(ubyte)[])"?";
        }
    }
}

/**
 EncodingScheme to handle Latin-2

 This scheme recognises the following names:
    "Latin 2",
    "ISO-8859-2",
    "ISO_8859-2",
    "ISO_8859-2:1999",
    "windows-28592"
 */
class EncodingSchemeLatin2 : EncodingScheme
{
    /* // moved to std.internal.phobosinit
    shared static this()
    {
        EncodingScheme.register("std.encoding.EncodingSchemeLatin2");
    }*/

    const
    {
        override string[] names() @safe pure nothrow
        {
            return
            [
                "Latin 2",
                "ISO-8859-2",
                "ISO_8859-2",
                "ISO_8859-2:1999",
                "windows-28592"
            ];
        }

        override string toString() @safe pure nothrow @nogc
        {
            return "ISO-8859-2";
        }

        override bool canEncode(dchar c) @safe pure nothrow @nogc
        {
            return std.encoding.canEncode!Latin2Char(c);
        }

        override size_t encodedLength(dchar c) @safe pure nothrow @nogc
        {
            return std.encoding.encodedLength!Latin2Char(c);
        }

        override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc
        {
            // View the ubyte buffer as code units of this encoding.
            auto dst = cast(Latin2Char[]) buffer;
            return std.encoding.encode(c, dst);
        }

        override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc
        {
            // Decode through a typed view, then drop the consumed ubytes
            // from the caller's slice.
            auto units = cast(const(Latin2Char)[]) s;
            dchar cp = std.encoding.decode(units);
            s = s[$ - units.length .. $];
            return cp;
        }

        override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc
        {
            auto units = cast(const(Latin2Char)[]) s;
            dchar cp = std.encoding.safeDecode(units);
            s = s[$ - units.length .. $];
            return cp;
        }

        override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc
        {
            return cast(immutable(ubyte)[])"?";
        }
    }
}

/**
 EncodingScheme to handle Windows-1250

 This scheme recognises the following names:
    "windows-1250"
 */
class EncodingSchemeWindows1250 : EncodingScheme
{
    /* // moved to std.internal.phobosinit
    shared static this()
    {
        EncodingScheme.register("std.encoding.EncodingSchemeWindows1250");
    }*/

    const
    {
        override string[] names() @safe pure nothrow
        {
            return
            [
                "windows-1250"
            ];
        }

        override string toString() @safe pure nothrow @nogc
        {
            return "windows-1250";
        }

        override bool canEncode(dchar c) @safe pure nothrow @nogc
        {
            return std.encoding.canEncode!Windows1250Char(c);
        }

        override size_t encodedLength(dchar c) @safe pure nothrow @nogc
        {
            return std.encoding.encodedLength!Windows1250Char(c);
        }

        override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc
        {
            // View the ubyte buffer as code units of this encoding.
            auto dst = cast(Windows1250Char[]) buffer;
            return std.encoding.encode(c, dst);
        }

        override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc
        {
            // Decode through a typed view, then drop the consumed ubytes
            // from the caller's slice.
            auto units = cast(const(Windows1250Char)[]) s;
            dchar cp = std.encoding.decode(units);
            s = s[$ - units.length .. $];
            return cp;
        }

        override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc
        {
            auto units = cast(const(Windows1250Char)[]) s;
            dchar cp = std.encoding.safeDecode(units);
            s = s[$ - units.length .. $];
            return cp;
        }

        override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc
        {
            return cast(immutable(ubyte)[])"?";
        }
    }
}

/**
 EncodingScheme to handle Windows-1251

 This scheme recognises the following names:
    "windows-1251"
 */
class EncodingSchemeWindows1251 : EncodingScheme
{
    /* // moved to std.internal.phobosinit
    shared static this()
    {
        EncodingScheme.register("std.encoding.EncodingSchemeWindows1251");
    }*/

    // Every name this scheme answers to.
    override string[] names() const @safe pure nothrow
    {
        return ["windows-1251"];
    }

    // The name reported for this scheme.
    override string toString() const @safe pure nothrow @nogc
    {
        return "windows-1251";
    }

    // Forwards to the compile-time `canEncode` for `Windows1251Char`.
    override bool canEncode(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.canEncode!(Windows1251Char)(c);
    }

    // Forwards to the compile-time `encodedLength` for `Windows1251Char`.
    override size_t encodedLength(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.encodedLength!(Windows1251Char)(c);
    }

    // Encodes `c` into `buffer`, viewed as Windows-1251 code units, and
    // returns whatever the compile-time `encode` reports.
    override size_t encode(dchar c, ubyte[] buffer) const @safe pure nothrow @nogc
    {
        auto dst = cast(Windows1251Char[]) buffer;
        return std.encoding.encode(c, dst);
    }

    // Decodes one character from the front of `s`; `s` is then narrowed to
    // the bytes the decoder left unread.
    override dchar decode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        auto rest = cast(const(Windows1251Char)[]) s;
        const decoded = std.encoding.decode(rest);
        s = s[$ - rest.length .. $];
        return decoded;
    }

    // Same as `decode`, but goes through the compile-time `safeDecode`.
    override dchar safeDecode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        auto rest = cast(const(Windows1251Char)[]) s;
        const decoded = std.encoding.safeDecode(rest);
        s = s[$ - rest.length .. $];
        return decoded;
    }

    // The replacement sequence for this scheme: the single byte '?'.
    override @property immutable(ubyte)[] replacementSequence() const @safe pure nothrow @nogc
    {
        return cast(immutable(ubyte)[])"?";
    }
}

/**
 EncodingScheme to handle Windows-1252

 This scheme recognises the following names:
                 "windows-1252"
 */
class EncodingSchemeWindows1252 : EncodingScheme
{
    /* // moved to std.internal.phobosinit
    shared static this()
    {
        EncodingScheme.register("std.encoding.EncodingSchemeWindows1252");
    }*/

    // Every name this scheme answers to.
    override string[] names() const @safe pure nothrow
    {
        return ["windows-1252"];
    }

    // The name reported for this scheme.
    override string toString() const @safe pure nothrow @nogc
    {
        return "windows-1252";
    }

    // Forwards to the compile-time `canEncode` for `Windows1252Char`.
    override bool canEncode(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.canEncode!(Windows1252Char)(c);
    }

    // Forwards to the compile-time `encodedLength` for `Windows1252Char`.
    override size_t encodedLength(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.encodedLength!(Windows1252Char)(c);
    }

    // Encodes `c` into `buffer`, viewed as Windows-1252 code units, and
    // returns whatever the compile-time `encode` reports.
    override size_t encode(dchar c, ubyte[] buffer) const @safe pure nothrow @nogc
    {
        auto dst = cast(Windows1252Char[]) buffer;
        return std.encoding.encode(c, dst);
    }

    // Decodes one character from the front of `s`; `s` is then narrowed to
    // the bytes the decoder left unread.
    override dchar decode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        auto rest = cast(const(Windows1252Char)[]) s;
        const decoded = std.encoding.decode(rest);
        s = s[$ - rest.length .. $];
        return decoded;
    }

    // Same as `decode`, but goes through the compile-time `safeDecode`.
    override dchar safeDecode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        auto rest = cast(const(Windows1252Char)[]) s;
        const decoded = std.encoding.safeDecode(rest);
        s = s[$ - rest.length .. $];
        return decoded;
    }

    // The replacement sequence for this scheme: the single byte '?'.
    override @property immutable(ubyte)[] replacementSequence() const @safe pure nothrow @nogc
    {
        return cast(immutable(ubyte)[])"?";
    }
}

// Round-trips sample characters through each single-byte scheme and probes
// `safeDecode` with one byte per scheme.
@system unittest
{
    static string[] expectedNames =
    [
        "ASCII",
        "ISO-8859-1",
        "ISO-8859-2",
        "windows-1250",
        "windows-1251",
        "windows-1252"
    ];

    EncodingScheme[] schemes;
    foreach (wanted; expectedNames)
        schemes ~= EncodingScheme.create(wanted);

    ubyte[1] buffer;

    // Row i holds characters that scheme i must be able to encode.
    static dchar[][] samples =
    [
        //Valid ASCII
        ['\u0001','\u0020','\u0040','\u0060','\u007F'],
        //Valid 8859-1
        ['\u0001','\u0020','\u0070','\u00DA','\u00FF'],
        //Valid 8859-2
        ['\u0020','\u00D7','\u00DF','\u010F','\u02D9'],
        //Valid 1250
        ['\u0020','\u20AC','\u201E','\u2021','\u2039'],
        //Valid 1251
        ['\u0402','\u00A4','\u0415','\u0439','\u044F'],
        //Valid 1252
        ['\u20AC','\u0160','\u2019','\u2122','\u0178'],
    ];

    // One probe byte per scheme; each `safeDecode` call below takes the
    // slice by `ref` and consumes from its front.
    static const(ubyte)[] probes = [0xA0,0xFF,0xFF,0x81,0x98,0x81];

    foreach (i, scheme; schemes)
    {
        assert(scheme.toString() == expectedNames[i],"Error in the name of encoding scheme"~expectedNames[i]);
        assert(!scheme.canEncode('\uFFFD'));
        assert(scheme.encodedLength('A') == 1);

        const(ubyte)[] encoded;
        dchar[] decoded;
        foreach (cp; samples[i])
        {
            assert(scheme.encode(cp,buffer) == 1);
            encoded ~= buffer;
            const(ubyte)[] view = buffer;
            decoded ~= scheme.decode(view);
        }

        assert(scheme.isValid(encoded),"Not correctly encoded UTF => " ~ expectedNames[i]);
        assert(samples[i] == decoded,"Error encode/decode UTF8 <=> " ~ expectedNames[i]);

        // The two ISO-8859 schemes are expected to decode their probe byte;
        // every other scheme here must report it as invalid.
        const probeDecodes = expectedNames[i] == "ISO-8859-1" || expectedNames[i] == "ISO-8859-2";
        if (probeDecodes)
        {
            assert(scheme.safeDecode(probes) != INVALID_SEQUENCE);
        }
        else
        {
            assert(scheme.safeDecode(probes) == INVALID_SEQUENCE);
        }
        assert(scheme.replacementSequence() == cast(immutable(ubyte)[])"?");
    }

    // Six schemes, six probe bytes: nothing may be left over.
    assert(probes.length == 0);
}

/**
 EncodingScheme to handle UTF-8

 This scheme recognises the following names:
                 "UTF-8"
 */
class EncodingSchemeUtf8 : EncodingScheme
{
    /* // moved to std.internal.phobosinit
    shared static this()
    {
        EncodingScheme.register("std.encoding.EncodingSchemeUtf8");
    }*/

    // Every name this scheme answers to.
    override string[] names() const @safe pure nothrow
    {
        return ["UTF-8"];
    }

    // The name reported for this scheme.
    override string toString() const @safe pure nothrow @nogc
    {
        return "UTF-8";
    }

    // Forwards to the compile-time `canEncode` for `char`.
    override bool canEncode(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.canEncode!(char)(c);
    }

    // Forwards to the compile-time `encodedLength` for `char`.
    override size_t encodedLength(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.encodedLength!(char)(c);
    }

    // Encodes `c` into `buffer`, viewed as `char` code units, and returns
    // whatever the compile-time `encode` reports.
    override size_t encode(dchar c, ubyte[] buffer) const @safe pure nothrow @nogc
    {
        auto dst = cast(char[]) buffer;
        return std.encoding.encode(c, dst);
    }

    // Decodes one character from the front of `s`; `s` is then narrowed to
    // the bytes the decoder left unread.
    override dchar decode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        auto rest = cast(const(char)[]) s;
        const decoded = std.encoding.decode(rest);
        s = s[$ - rest.length .. $];
        return decoded;
    }

    // Same as `decode`, but goes through the compile-time `safeDecode`.
    override dchar safeDecode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        auto rest = cast(const(char)[]) s;
        const decoded = std.encoding.safeDecode(rest);
        s = s[$ - rest.length .. $];
        return decoded;
    }

    // The replacement sequence for this scheme: U+FFFD as UTF-8 bytes.
    override @property immutable(ubyte)[] replacementSequence() const @safe pure nothrow @nogc
    {
        return cast(immutable(ubyte)[])"\uFFFD";
    }
}

/**
 EncodingScheme to handle UTF-16 in native byte order

 This scheme recognises the following names:
                 "UTF-16LE" (little-endian architecture only)
                 "UTF-16BE" (big-endian architecture only)
 */
class EncodingSchemeUtf16Native : EncodingScheme
{
    /* // moved to std.internal.phobosinit
    shared static this()
    {
        EncodingScheme.register("std.encoding.EncodingSchemeUtf16Native");
    }*/

    // The one name of this scheme, picked by the target's byte order.
    version (LittleEndian) { enum string NAME = "UTF-16LE"; }
    version (BigEndian) { enum string NAME = "UTF-16BE"; }

    // Every name this scheme answers to.
    override string[] names() const @safe pure nothrow
    {
        return [ NAME ];
    }

    // The name reported for this scheme.
    override string toString() const @safe pure nothrow @nogc
    {
        return NAME;
    }

    // Forwards to the compile-time `canEncode` for `wchar`.
    override bool canEncode(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.canEncode!(wchar)(c);
    }

    // Forwards to the compile-time `encodedLength` for `wchar`.
    override size_t encodedLength(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.encodedLength!(wchar)(c);
    }

    // Encodes `c` into `buffer`, viewed as `wchar` code units. The count
    // from the compile-time `encode` is scaled by `wchar.sizeof`.
    override size_t encode(dchar c, ubyte[] buffer) const @safe pure nothrow @nogc
    {
        auto dst = cast(wchar[]) buffer;
        return wchar.sizeof * std.encoding.encode(c, dst);
    }

    // Decodes one character from the front of `s`; `s` is then narrowed to
    // the bytes the decoder left unread. The contract requires an even
    // number of bytes.
    override dchar decode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    in
    {
        assert((s.length & 1) == 0);
    }
    do
    {
        auto rest = cast(const(wchar)[]) s;
        const decoded = std.encoding.decode(rest);
        const bytesLeft = rest.length * wchar.sizeof;
        s = s[$ - bytesLeft .. $];
        return decoded;
    }

    // Same as `decode`, but goes through the compile-time `safeDecode`.
    override dchar safeDecode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    in
    {
        assert((s.length & 1) == 0);
    }
    do
    {
        auto rest = cast(const(wchar)[]) s;
        const decoded = std.encoding.safeDecode(rest);
        const bytesLeft = rest.length * wchar.sizeof;
        s = s[$ - bytesLeft .. $];
        return decoded;
    }

    // The replacement sequence for this scheme: the bytes of a UTF-16
    // string literal holding U+FFFD.
    override @property immutable(ubyte)[] replacementSequence() const @safe pure nothrow @nogc
    {
        return cast(immutable(ubyte)[])"\uFFFD"w;
    }
}
// Decoding one UTF-16 code unit must consume exactly two of the six bytes.
@system unittest
{
    version (LittleEndian)
    {
        auto scheme = EncodingScheme.create("utf-16le");
        ubyte[6] sample = [154,1, 155,1, 156,1];
    }
    version (BigEndian)
    {
        auto scheme = EncodingScheme.create("utf-16be");
        ubyte[6] sample = [1,154, 1,155, 1,156];
    }
    const(ubyte)[] remaining = cast(const(ubyte)[])sample;
    dchar first = scheme.safeDecode(remaining);
    assert(first == 410);
    assert(remaining.length == 4);
}

/**
 EncodingScheme to handle UTF-32 in native byte order

 This scheme recognises the following names:
                 "UTF-32LE" (little-endian architecture only)
                 "UTF-32BE" (big-endian architecture only)
 */
class EncodingSchemeUtf32Native : EncodingScheme
{
    /* // moved to std.internal.phobosinit
    shared static this()
    {
        EncodingScheme.register("std.encoding.EncodingSchemeUtf32Native");
    }*/

    // The one name of this scheme, picked by the target's byte order.
    version (LittleEndian) { enum string NAME = "UTF-32LE"; }
    version (BigEndian) { enum string NAME = "UTF-32BE"; }

    // Every name this scheme answers to.
    override string[] names() const @safe pure nothrow
    {
        return [ NAME ];
    }

    // The name reported for this scheme.
    override string toString() const @safe pure nothrow @nogc
    {
        return NAME;
    }

    // Forwards to the compile-time `canEncode` for `dchar`.
    override bool canEncode(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.canEncode!(dchar)(c);
    }

    // Forwards to the compile-time `encodedLength` for `dchar`.
    override size_t encodedLength(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.encodedLength!(dchar)(c);
    }

    // Encodes `c` into `buffer`, viewed as `dchar` code units. The count
    // from the compile-time `encode` is scaled by `dchar.sizeof`.
    override size_t encode(dchar c, ubyte[] buffer) const @safe pure nothrow @nogc
    {
        auto dst = cast(dchar[]) buffer;
        return dchar.sizeof * std.encoding.encode(c, dst);
    }

    // Decodes one character from the front of `s`; `s` is then narrowed to
    // the bytes the decoder left unread. The contract requires a byte count
    // that is a multiple of four.
    override dchar decode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    in
    {
        assert((s.length & 3) == 0);
    }
    do
    {
        auto rest = cast(const(dchar)[]) s;
        const decoded = std.encoding.decode(rest);
        const bytesLeft = rest.length * dchar.sizeof;
        s = s[$ - bytesLeft .. $];
        return decoded;
    }

    // Same as `decode`, but goes through the compile-time `safeDecode`.
    override dchar safeDecode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    in
    {
        assert((s.length & 3) == 0);
    }
    do
    {
        auto rest = cast(const(dchar)[]) s;
        const decoded = std.encoding.safeDecode(rest);
        const bytesLeft = rest.length * dchar.sizeof;
        s = s[$ - bytesLeft .. $];
        return decoded;
    }

    // The replacement sequence for this scheme: the bytes of a UTF-32
    // string literal holding U+FFFD.
    override @property immutable(ubyte)[] replacementSequence() const @safe pure nothrow @nogc
    {
        return cast(immutable(ubyte)[])"\uFFFD"d;
    }
}
// Decoding one UTF-32 code unit must consume exactly four of the twelve bytes.
@system unittest
{
    version (LittleEndian)
    {
        auto scheme = EncodingScheme.create("utf-32le");
        ubyte[12] sample = [154,1,0,0, 155,1,0,0, 156,1,0,0];
    }
    version (BigEndian)
    {
        auto scheme = EncodingScheme.create("utf-32be");
        ubyte[12] sample = [0,0,1,154, 0,0,1,155, 0,0,1,156];
    }
    const(ubyte)[] remaining = cast(const(ubyte)[])sample;
    dchar first = scheme.safeDecode(remaining);
    assert(first == 410);
    assert(remaining.length == 8);
}

//=============================================================================


/** Definitions of common Byte Order Marks.
The elements of the `enum` can be used as indices into `bomTable` to get
the matching `BOMSeq`.
*/
enum BOM
{
    none      = 0,  /// no BOM was found
    utf32be   = 1,  /// [0x00, 0x00, 0xFE, 0xFF]
    utf32le   = 2,  /// [0xFF, 0xFE, 0x00, 0x00]
    utf7      = 3,  /** [0x2B, 0x2F, 0x76, 0x38]
                        [0x2B, 0x2F, 0x76, 0x39],
                        [0x2B, 0x2F, 0x76, 0x2B],
                        [0x2B, 0x2F, 0x76, 0x2F],
                        [0x2B, 0x2F, 0x76, 0x38, 0x2D]
                    */
    // Values 4 .. 7 are unused: `bomTable` holds five utf7 rows
    // (indices 3 .. 7), so the next member resumes at 8.
    utf1      = 8,  /// [0xF7, 0x64, 0x4C]
    utfebcdic = 9,  /// [0xDD, 0x73, 0x66, 0x73]
    scsu      = 10, /// [0x0E, 0xFE, 0xFF]
    bocu1     = 11, /// [0xFB, 0xEE, 0x28]
    gb18030   = 12, /// [0x84, 0x31, 0x95, 0x33]
    utf8      = 13, /// [0xEF, 0xBB, 0xBF]
    utf16be   = 14, /// [0xFE, 0xFF]
    utf16le   = 15  /// [0xFF, 0xFE]
}

/// The type stored inside `bomTable`.
alias BOMSeq = Tuple!(BOM, "schema", ubyte[], "sequence");

/** Mapping of a byte sequence to $(B Byte Order Mark (BOM))
*/
immutable bomTable = [
    // `getBOM` returns the first row whose sequence prefixes the input, so
    // a row must precede any row whose sequence is a prefix of its own
    // (utf32le before utf16le; the five-byte utf7 mark before the
    // four-byte one ending in 0x38).
    BOMSeq(BOM.none, null),
    BOMSeq(BOM.utf32be, cast(ubyte[])([0x00, 0x00, 0xFE, 0xFF])),
    BOMSeq(BOM.utf32le, cast(ubyte[])([0xFF, 0xFE, 0x00, 0x00])),
    BOMSeq(BOM.utf7, cast(ubyte[])([0x2B, 0x2F, 0x76, 0x39])),
    BOMSeq(BOM.utf7, cast(ubyte[])([0x2B, 0x2F, 0x76, 0x2B])),
    BOMSeq(BOM.utf7, cast(ubyte[])([0x2B, 0x2F, 0x76, 0x2F])),
    BOMSeq(BOM.utf7, cast(ubyte[])([0x2B, 0x2F, 0x76, 0x38, 0x2D])),
    BOMSeq(BOM.utf7, cast(ubyte[])([0x2B, 0x2F, 0x76, 0x38])),
    BOMSeq(BOM.utf1, cast(ubyte[])([0xF7, 0x64, 0x4C])),
    BOMSeq(BOM.utfebcdic, cast(ubyte[])([0xDD, 0x73, 0x66, 0x73])),
    BOMSeq(BOM.scsu, cast(ubyte[])([0x0E, 0xFE, 0xFF])),
    BOMSeq(BOM.bocu1, cast(ubyte[])([0xFB, 0xEE, 0x28])),
    BOMSeq(BOM.gb18030, cast(ubyte[])([0x84, 0x31, 0x95, 0x33])),
    BOMSeq(BOM.utf8, cast(ubyte[])([0xEF, 0xBB, 0xBF])),
    BOMSeq(BOM.utf16be, cast(ubyte[])([0xFE, 0xFF])),
    BOMSeq(BOM.utf16le, cast(ubyte[])([0xFF, 0xFE]))
];

/** Returns a `BOMSeq` for a given `input`.
If no `BOM` is present the `BOMSeq` for `BOM.none` is
returned. The `BOM` sequence at the beginning of the range will
not be consumed from the passed range. If you pass a reference type
range make sure that `save` creates a deep copy.

Params:
    input = The sequence to check for the `BOM`

Returns:
    the found `BOMSeq` corresponding to the passed `input`.
*/
immutable(BOMSeq) getBOM(Range)(Range input)
if (isForwardRange!Range && is(immutable ElementType!Range == immutable ubyte))
{
    import std.algorithm.searching : startsWith;

    // Row 0 is the `BOM.none` entry; it is the fallback, never a candidate.
    foreach (candidate; bomTable[1 .. $])
    {
        // Test against a saved copy so `input` itself is never advanced.
        if (input.save.startsWith(candidate.sequence))
            return candidate;
    }

    return bomTable[0];
}

///
@system unittest
{
    import std.format : format;

    auto ts = dchar(0x0000FEFF) ~ "Hello World"d;

    auto entry = getBOM(cast(ubyte[]) ts);

    // A native-order UTF-32 string must be reported with the host's
    // byte order.
    version (BigEndian)
        enum expected = BOM.utf32be;
    else
        enum expected = BOM.utf32le;

    assert(entry.schema == expected, format("%s", entry.schema));
}

// Every table row, followed by payload text, must be recognised by `getBOM`.
@system unittest
{
    import std.format : format;

    foreach (idx, row; bomTable)
    {
        auto payload = row[1] ~ cast(ubyte[])"hello world";
        auto found = getBOM(payload);
        assert(found[0] == bomTable[idx][0]);

        if (idx < 4 || idx > 7) // get around the multiple utf7 bom's
        {
            assert(found[0] == BOM.init + idx);
            assert(found[1] == row[1]);
        }
    }
}

// `getBOM` must also work on a non-array range and must not advance it.
@safe pure unittest
{
    struct BOMInputRange
    {
        ubyte[] arr;

        @property ubyte front()
        {
            return this.arr.front;
        }

        @property bool empty()
        {
            return this.arr.empty;
        }

        void popFront()
        {
            this.arr = this.arr[1 .. $];
        }

        @property typeof(this) save()
        {
            return this;
        }
    }

    static assert( isInputRange!BOMInputRange);
    static assert(!isArray!BOMInputRange);

    ubyte[] dummyEnd = [0,0,0,0];

    foreach (row; bomTable[1 .. $])
    {
        {
            // A range holding exactly the mark: detected, and the range
            // still holds all of its bytes afterwards.
            auto ir = BOMInputRange(row.sequence.dup);

            auto b = getBOM(ir);
            assert(b.schema == row.schema);
            assert(ir.arr == row.sequence);
        }

        {
            // Only the first byte of the mark, then zero bytes: no BOM.
            auto noBom = row.sequence[0 .. 1].dup ~ dummyEnd;
            size_t oldLen = noBom.length;
            assert(oldLen - 4 < row.sequence.length);

            auto ir = BOMInputRange(noBom.dup);
            auto b = getBOM(ir);
            assert(b.schema == BOM.none);
            assert(noBom.length == oldLen);
        }
    }
}

/** Constant defining a fully decoded BOM */
enum dchar utfBOM = 0xfeff;