1 // Written in the D programming language. 2 3 /++ 4 Encode and decode UTF-8, UTF-16 and UTF-32 strings. 5 6 UTF character support is restricted to 7 $(D '\u0000' <= character <= '\U0010FFFF'). 8 9 $(SCRIPT inhibitQuickIndex = 1;) 10 $(DIVC quickindex, 11 $(BOOKTABLE, 12 $(TR $(TH Category) $(TH Functions)) 13 $(TR $(TD Decode) $(TD 14 $(LREF decode) 15 $(LREF decodeFront) 16 )) 17 $(TR $(TD Lazy decode) $(TD 18 $(LREF byCodeUnit) 19 $(LREF byChar) 20 $(LREF byWchar) 21 $(LREF byDchar) 22 $(LREF byUTF) 23 )) 24 $(TR $(TD Encode) $(TD 25 $(LREF encode) 26 $(LREF toUTF8) 27 $(LREF toUTF16) 28 $(LREF toUTF32) 29 $(LREF toUTFz) 30 $(LREF toUTF16z) 31 )) 32 $(TR $(TD Length) $(TD 33 $(LREF codeLength) 34 $(LREF count) 35 $(LREF stride) 36 $(LREF strideBack) 37 )) 38 $(TR $(TD Index) $(TD 39 $(LREF toUCSindex) 40 $(LREF toUTFindex) 41 )) 42 $(TR $(TD Validation) $(TD 43 $(LREF isValidDchar) 44 $(LREF isValidCodepoint) 45 $(LREF validate) 46 )) 47 $(TR $(TD Miscellaneous) $(TD 48 $(LREF replacementDchar) 49 $(LREF UseReplacementDchar) 50 $(LREF UTFException) 51 )) 52 )) 53 See_Also: 54 $(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia)<br> 55 $(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)<br> 56 $(LINK http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335) 57 Copyright: Copyright The D Language Foundation 2000 - 2012. 58 License: $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0). 59 Authors: $(HTTP digitalmars.com, Walter Bright) and 60 $(HTTP jmdavisprog.com, Jonathan M Davis) 61 Source: $(PHOBOSSRC std/utf.d) 62 +/ 63 module std.utf; 64 65 import std.exception : basicExceptionCtors; 66 import core.exception : UnicodeException; 67 import std.meta : AliasSeq; 68 import std.range; 69 import std.traits : isAutodecodableString, isConvertibleToString, 70 isSomeChar, isSomeString, isStaticArray, Unqual; 71 import std.typecons : Flag, Yes, No; 72 73 74 /++ 75 Exception thrown on errors in std.utf functions. 
/++
    Exception thrown on errors in std.utf functions.
  +/
class UTFException : UnicodeException
{
    import core.internal.string : unsignedToTempString, UnsignedStringBuf;

    // Code units of the offending sequence; only the first `len` entries are set.
    uint[4] sequence;
    // Number of entries of `sequence` filled in by `setSequence`.
    size_t len;

    /*
     * Copies the given code units into `sequence` and records how many were
     * stored in `len`. More than 4 units trips the assert; with assertions
     * disabled the count is clamped to 4. Returns `this` so the call can be
     * chained onto a freshly constructed exception.
     */
    @safe pure nothrow @nogc
    UTFException setSequence(scope uint[] data...) return
    {
        assert(data.length <= 4);

        len = data.length;
        if (len > 4)
            len = 4;
        sequence[0 .. len] = data[0 .. len];

        return this;
    }

    // FIXME: Use std.exception.basicExceptionCtors here once
    // https://issues.dlang.org/show_bug.cgi?id=11500 is fixed

    /**
        Standard exception constructors.

        The overload taking `index` appends `" (at index N)"` to `msg` and
        forwards `index` to the base class; the other overload forwards an
        index of `0`.
      */
    this(string msg, string file = __FILE__, size_t line = __LINE__,
         Throwable next = null) @nogc @safe pure nothrow
    {
        super(msg, 0, file, line, next);
    }
    /// ditto
    this(string msg, size_t index, string file = __FILE__,
         size_t line = __LINE__, Throwable next = null) @safe pure nothrow
    {
        UnsignedStringBuf buf = void;
        msg ~= " (at index " ~ unsignedToTempString(index, buf) ~ ")";
        super(msg, index, file, line, next);
    }

    /**
        Returns:
            A `string` detailing the invalid UTF sequence. When no sequence
            has been recorded (`len == 0`) the result of the inherited
            `toString` is returned instead.
      */
    override string toString() const
    {
        if (len == 0)
        {
            /* Exception.toString() is not marked as const, although
             * it is const-compatible, hence the cast.
             */
            auto e = () @trusted { return cast(Exception) super; } ();
            return e.toString();
        }

        string result = "Invalid UTF sequence:";

        // Each unit is written as " " + at least two hex digits + "x".
        foreach (unit; sequence[0 .. len])
        {
            UnsignedStringBuf buf = void;
            auto hex = unsignedToTempString!16(unit, buf);
            result ~= ' ';
            if (hex.length == 1)
                result ~= '0';
            result ~= hex;
            result ~= 'x';
        }

        if (super.msg.length > 0)
        {
            result ~= " - ";
            result ~= super.msg;
        }

        return result;
    }
}

///
@safe unittest
{
    import std.exception : assertThrown;

    char[4] buf;
    assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));
}

/*
   Provide array of invalidly encoded UTF strings. Useful for testing.

   Params:
        Char = char, wchar, or dchar

   Returns:
        an array of invalidly encoded UTF strings (8 for `char`, 5 for
        `wchar`, 3 for `dchar`)
 */

package auto invalidUTFstrings(Char)() @safe pure @nogc nothrow
if (isSomeChar!Char)
{
    static if (is(Char == char))
    {
        enum x = 0xDC00;         // invalid surrogate value
        enum y = 0x110000;       // out of range

        static immutable string[8] result =
        [
            "\x80",             // not a start byte
            "\xC0",             // truncated
            "\xC0\xC0",         // invalid continuation
            "\xF0\x82\x82\xAC", // overlong
            [
              0xE0 | (x >> 12),
              0x80 | ((x >> 6) & 0x3F),
              0x80 | (x & 0x3F)
            ],
            [
              cast(char)(0xF0 | (y >> 18)),
              cast(char)(0x80 | ((y >> 12) & 0x3F)),
              cast(char)(0x80 | ((y >> 6) & 0x3F)),
              cast(char)(0x80 | (y & 0x3F))
            ],
            [
              cast(char)(0xF8 | 3),     // 5 byte encoding
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
            ],
            [
              cast(char)(0xFC | 3),     // 6 byte encoding
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
            ],
        ];

        return result[];
    }
    else static if (is(Char == wchar))
    {
        static immutable wstring[5] result =
        [
            [
              cast(wchar) 0xDC00,
            ],
            [
              cast(wchar) 0xDFFF,
            ],
            [
              cast(wchar) 0xDBFF,
              cast(wchar) 0xDBFF,
            ],
            [
              cast(wchar) 0xDBFF,
              cast(wchar) 0xE000,
            ],
            [
              cast(wchar) 0xD800,
            ],
        ];

        return result[];
    }
    else static if (is(Char == dchar))
    {
        static immutable dstring[3] result =
        [
            [ cast(dchar) 0x110000 ],
            [ cast(dchar) 0x00D800 ],
            [ cast(dchar) 0x00DFFF ],
        ];

        return result;
    }
    else
        static assert(0);
}
/++
    Check whether the given Unicode code point is valid.

    Params:
        c = code point to check

    Returns:
        `true` if and only if `c` is a valid Unicode code point, i.e. it is
        at most `0x10FFFF` and does not lie in the surrogate range
        `0xD800 .. 0xDFFF`

    Note:
    `'\uFFFE'` and `'\uFFFF'` are considered valid by `isValidDchar`,
    as they are permitted for internal use by an application, but they are
    not allowed for interchange by the Unicode standard.
  +/
bool isValidDchar(dchar c) pure nothrow @safe @nogc
{
    // Everything below the surrogate block is valid.
    if (c < 0xD800)
        return true;

    // Otherwise it must lie above the surrogate block and within range.
    return c > 0xDFFF && c <= 0x10FFFF;
}

///
@safe @nogc pure nothrow unittest
{
    assert( isValidDchar(cast(dchar) 0x41));
    assert( isValidDchar(cast(dchar) 0x00));
    assert(!isValidDchar(cast(dchar) 0xD800));
    assert(!isValidDchar(cast(dchar) 0x11FFFF));
}

pure nothrow @safe @nogc unittest
{
    import std.exception;

    assertCTFEable!(
    {
    assert( isValidDchar(cast(dchar)'a') == true);
    assert( isValidDchar(cast(dchar) 0x1FFFFF) == false);

    assert(!isValidDchar(cast(dchar) 0x00D800));
    assert(!isValidDchar(cast(dchar) 0x00DBFF));
    assert(!isValidDchar(cast(dchar) 0x00DC00));
    assert(!isValidDchar(cast(dchar) 0x00DFFF));
    assert( isValidDchar(cast(dchar) 0x00FFFE));
    assert( isValidDchar(cast(dchar) 0x00FFFF));
    assert( isValidDchar(cast(dchar) 0x01FFFF));
    assert( isValidDchar(cast(dchar) 0x10FFFF));
    assert(!isValidDchar(cast(dchar) 0x110000));
    });
}
/**
Checks if a single character forms a valid code point.

When standing alone, some characters are invalid code points. For
example the `wchar` `0xD800` is a so called high surrogate, which can
only be interpreted together with a low surrogate following it. As a
standalone character it is considered invalid.

See $(LINK2 http://www.unicode.org/versions/Unicode13.0.0/,
Unicode Standard, D90, D91 and D92) for more details.

Params:
    c = character to test
    Char = character type of `c`

Returns:
    `true`, if `c` forms a valid code point: for `char` that is any value
    up to `0x7F`, for `wchar` any value outside `0xD800 .. 0xDFFF`, and
    for `dchar` whatever $(LREF isValidDchar) accepts.
 */
bool isValidCodepoint(Char)(Char c)
if (isSomeChar!Char)
{
    // Qualifiers on Char do not matter for the check.
    alias Bare = Unqual!Char;

    static if (is(Bare == char))
        return c <= 0x7F;
    else static if (is(Bare == wchar))
        return !(c > 0xD7FF && c < 0xE000);
    else static if (is(Bare == dchar))
        return isValidDchar(c);
    else
        static assert(false, "unknown character type: `" ~ Char.stringof ~ "`");
}

///
@safe pure nothrow unittest
{
    assert( isValidCodepoint(cast(char) 0x40));
    assert(!isValidCodepoint(cast(char) 0x80));
    assert( isValidCodepoint(cast(wchar) 0x1234));
    assert(!isValidCodepoint(cast(wchar) 0xD800));
    assert( isValidCodepoint(cast(dchar) 0x0010FFFF));
    assert(!isValidCodepoint(cast(dchar) 0x12345678));
}
/++
    Calculate the length of the UTF sequence starting at `index`
    in `str`.

    Params:
        str = $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
        of UTF code units. Must be random access if `index` is passed
        index = starting index of UTF sequence (default: `0`)

    Returns:
        The number of code units in the UTF sequence. For UTF-8, this is a
        value between 1 and 4 (as per $(HTTP tools.ietf.org/html/rfc3629#section-3, RFC 3629$(COMMA) section 3)).
        For UTF-16, it is either 1 or 2. For UTF-32, it is always 1.

    Throws:
        May throw a `UTFException` if `str[index]` is not the start of a
        valid UTF sequence.

    Note:
        `stride` will only analyze the first `str[index]` element. It
        will not fully verify the validity of the UTF sequence, nor even verify
        the presence of the sequence: it will not actually guarantee that
        $(D index + stride(str, index) <= str.length).
  +/
uint stride(S)(auto ref S str, size_t index)
if (is(S : const char[]) ||
    (isRandomAccessRange!S && is(immutable ElementType!S == immutable char)))
{
    static if (is(typeof(str.length) : ulong))
        assert(index < str.length, "Past the end of the UTF-8 sequence");

    // ASCII is a single unit; anything else is classified from its lead byte.
    immutable lead = str[index];
    return lead < 0x80 ? 1 : strideImpl(lead, index);
}

/// Ditto
uint stride(S)(auto ref S str)
if (is(S : const char[]) ||
    (isInputRange!S && is(immutable ElementType!S == immutable char)))
{
    // Arrays are indexed directly; other ranges are read through `front`.
    static if (is(S : const char[]))
        immutable lead = str[0];
    else
        immutable lead = str.front;

    return lead < 0x80 ? 1 : strideImpl(lead, 0);
}

@system unittest
{
    import core.exception : AssertError;
    import std.conv : to;
    import std.exception;
    import std.string : format;
    import std.traits : FunctionAttribute, functionAttributes, isSafe;
    static void test(string s, dchar c, size_t i = 0, size_t line = __LINE__)
    {
        enforce(stride(s, i) == codeLength!char(c),
                new AssertError(format("Unit test failure string: %s", s), __FILE__, line));

        enforce(stride(RandomCU!char(s), i) == codeLength!char(c),
                new AssertError(format("Unit test failure range: %s", s), __FILE__, line));

        auto refRandom = new RefRandomCU!char(s);
        immutable randLen = refRandom.length;
        enforce(stride(refRandom, i) == codeLength!char(c),
                new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line));
        enforce(refRandom.length == randLen,
                new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line));

        if (i == 0)
        {
            enforce(stride(s) == codeLength!char(c),
                    new AssertError(format("Unit test failure string 0: %s", s), __FILE__, line));

            enforce(stride(InputCU!char(s)) == codeLength!char(c),
                    new AssertError(format("Unit test failure range 0: %s", s), __FILE__, line));

            auto refBidir = new RefBidirCU!char(s);
            immutable bidirLen = refBidir.length;
            enforce(stride(refBidir) == codeLength!char(c),
                    new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line));
            enforce(refBidir.length == bidirLen,
                    new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line));
        }
    }

    assertCTFEable!(
    {
    test("a", 'a');
    test(" ", ' ');
    test("\u2029", '\u2029'); //paraSep
    test("\u0100", '\u0100');
    test("\u0430", '\u0430');
    test("\U00010143", '\U00010143');
    test("abcdefcdef", 'a');
    test("hello\U00010143\u0100\U00010143", 'h', 0);
    test("hello\U00010143\u0100\U00010143", 'e', 1);
    test("hello\U00010143\u0100\U00010143", 'l', 2);
    test("hello\U00010143\u0100\U00010143", 'l', 3);
    test("hello\U00010143\u0100\U00010143", 'o', 4);
    test("hello\U00010143\u0100\U00010143", '\U00010143', 5);
    test("hello\U00010143\u0100\U00010143", '\u0100', 9);
    test("hello\U00010143\u0100\U00010143", '\U00010143', 11);

    foreach (S; AliasSeq!(char[], const char[], string))
    {
        enum str = to!S("hello world");
        static assert(isSafe!({ stride(str, 0); }));
        static assert(isSafe!({ stride(str);    }));
        static assert((functionAttributes!({ stride(str, 0); }) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!({ stride(str);    }) & FunctionAttribute.pure_) != 0);
    }
    });
}

@safe unittest // invalid start bytes
{
    import std.exception : assertThrown;
    immutable char[] invalidStartBytes = [
        0b1111_1000, // indicating a sequence length of 5
        0b1111_1100, // 6
        0b1111_1110, // 7
        0b1111_1111, // 8
        0b1000_0000, // continuation byte
    ];
    foreach (c; invalidStartBytes)
        assertThrown!UTFException(stride([c]));
}

/// Ditto
uint stride(S)(auto ref S str, size_t index)
if (is(S : const wchar[]) ||
    (isRandomAccessRange!S && is(immutable ElementType!S == immutable wchar)))
{
    static if (is(typeof(str.length) : ulong))
        assert(index < str.length, "Past the end of the UTF-16 sequence");

    // A unit in 0xD800 .. 0xDBFF (high surrogate) starts a two-unit sequence.
    immutable uint unit = str[index];
    return (unit >= 0xD800 && unit <= 0xDBFF) ? 2 : 1;
}

/// Ditto
uint stride(S)(auto ref S str) @safe pure
if (is(S : const wchar[]))
{
    return stride(str, 0);
}

/// Ditto
uint stride(S)(auto ref S str)
if (isInputRange!S && is(immutable ElementType!S == immutable wchar) &&
    !is(S : const wchar[]))
{
    assert(!str.empty, "UTF-16 sequence is empty");

    // Same high-surrogate test as the indexed overload, applied to `front`.
    immutable uint unit = str.front;
    return (unit >= 0xD800 && unit <= 0xDBFF) ? 2 : 1;
}

@system unittest
{
    import core.exception : AssertError;
    import std.conv : to;
    import std.exception;
    import std.string : format;
    import std.traits : FunctionAttribute, functionAttributes, isSafe;
    static void test(wstring s, dchar c, size_t i = 0, size_t line = __LINE__)
    {
        enforce(stride(s, i) == codeLength!wchar(c),
                new AssertError(format("Unit test failure string: %s", s), __FILE__, line));

        enforce(stride(RandomCU!wchar(s), i) == codeLength!wchar(c),
                new AssertError(format("Unit test failure range: %s", s), __FILE__, line));

        auto refRandom = new RefRandomCU!wchar(s);
        immutable randLen = refRandom.length;
        enforce(stride(refRandom, i) == codeLength!wchar(c),
                new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line));
        enforce(refRandom.length == randLen,
                new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line));

        if (i == 0)
        {
            enforce(stride(s) == codeLength!wchar(c),
                    new AssertError(format("Unit test failure string 0: %s", s), __FILE__, line));

            enforce(stride(InputCU!wchar(s)) == codeLength!wchar(c),
                    new AssertError(format("Unit test failure range 0: %s", s), __FILE__, line));

            auto refBidir = new RefBidirCU!wchar(s);
            immutable bidirLen = refBidir.length;
            enforce(stride(refBidir) == codeLength!wchar(c),
                    new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line));
            enforce(refBidir.length == bidirLen,
                    new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line));
        }
    }

    assertCTFEable!(
    {
    test("a", 'a');
    test(" ", ' ');
    test("\u2029", '\u2029'); //paraSep
    test("\u0100", '\u0100');
    test("\u0430", '\u0430');
    test("\U00010143", '\U00010143');
    test("abcdefcdef", 'a');
    test("hello\U00010143\u0100\U00010143", 'h', 0);
    test("hello\U00010143\u0100\U00010143", 'e', 1);
    test("hello\U00010143\u0100\U00010143", 'l', 2);
    test("hello\U00010143\u0100\U00010143", 'l', 3);
    test("hello\U00010143\u0100\U00010143", 'o', 4);
    test("hello\U00010143\u0100\U00010143", '\U00010143', 5);
    test("hello\U00010143\u0100\U00010143", '\u0100', 7);
    test("hello\U00010143\u0100\U00010143", '\U00010143', 8);

    foreach (S; AliasSeq!(wchar[], const wchar[], wstring))
    {
        enum str = to!S("hello world");
        static assert(isSafe!(() => stride(str, 0)));
        static assert(isSafe!(() => stride(str)   ));
        static assert((functionAttributes!(() => stride(str, 0)) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!(() => stride(str)   ) & FunctionAttribute.pure_) != 0);
    }
    });
}

/// Ditto
uint stride(S)(auto ref S str, size_t index = 0)
if (is(S : const dchar[]) ||
    (isInputRange!S && is(immutable ElementEncodingType!S == immutable dchar)))
{
    // Only the precondition is checked; a UTF-32 sequence is always one unit.
    static if (is(typeof(str.length) : ulong))
        assert(index < str.length, "Past the end of the UTF-32 sequence");
    else
        assert(!str.empty, "UTF-32 sequence is empty.");
    return 1;
}

///
@safe unittest
{
    assert("a".stride == 1);
    assert("λ".stride == 2);
    assert("aλ".stride == 1);
    assert("aλ".stride(1) == 2);
    assert("𐐷".stride == 4);
}

@system unittest
{
    import core.exception : AssertError;
    import std.conv : to;
    import std.exception;
    import std.string : format;
    import std.traits : FunctionAttribute, functionAttributes, isSafe;
    static void test(dstring s, dchar c, size_t i = 0, size_t line = __LINE__)
    {
        enforce(stride(s, i) == codeLength!dchar(c),
                new AssertError(format("Unit test failure string: %s", s), __FILE__, line));

        enforce(stride(RandomCU!dchar(s), i) == codeLength!dchar(c),
                new AssertError(format("Unit test failure range: %s", s), __FILE__, line));

        auto refRandom = new RefRandomCU!dchar(s);
        immutable randLen = refRandom.length;
        enforce(stride(refRandom, i) == codeLength!dchar(c),
                new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line));
        enforce(refRandom.length == randLen,
                new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line));

        if (i == 0)
        {
            enforce(stride(s) == codeLength!dchar(c),
                    new AssertError(format("Unit test failure string 0: %s", s), __FILE__, line));

            enforce(stride(InputCU!dchar(s)) == codeLength!dchar(c),
                    new AssertError(format("Unit test failure range 0: %s", s), __FILE__, line));

            auto refBidir = new RefBidirCU!dchar(s);
            immutable bidirLen = refBidir.length;
            enforce(stride(refBidir) == codeLength!dchar(c),
                    new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line));
            enforce(refBidir.length == bidirLen,
                    new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line));
        }
    }

    assertCTFEable!(
    {
    test("a", 'a');
    test(" ", ' ');
    test("\u2029", '\u2029'); //paraSep
    test("\u0100", '\u0100');
    test("\u0430", '\u0430');
    test("\U00010143", '\U00010143');
    test("abcdefcdef", 'a');
    test("hello\U00010143\u0100\U00010143", 'h', 0);
    test("hello\U00010143\u0100\U00010143", 'e', 1);
    test("hello\U00010143\u0100\U00010143", 'l', 2);
    test("hello\U00010143\u0100\U00010143", 'l', 3);
    test("hello\U00010143\u0100\U00010143", 'o', 4);
    test("hello\U00010143\u0100\U00010143", '\U00010143', 5);
    test("hello\U00010143\u0100\U00010143", '\u0100', 6);
    test("hello\U00010143\u0100\U00010143", '\U00010143', 7);

    foreach (S; AliasSeq!(dchar[], const dchar[], dstring))
    {
        enum str = to!S("hello world");
        static assert(isSafe!(() => stride(str, 0)));
        static assert(isSafe!(() => stride(str)   ));
        static assert((functionAttributes!(() => stride(str, 0)) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!(() => stride(str)   ) & FunctionAttribute.pure_) != 0);
    }
    });
}

/*
 * Derives the UTF-8 sequence length from a non-ASCII lead byte `c` by
 * counting its leading 1 bits. Throws a `UTFException` carrying `index`
 * when that count is not 2, 3 or 4 (continuation bytes, 5+ byte forms)
 * or when `c` is 0xFF.
 */
private uint strideImpl(char c, size_t index) @trusted pure
in { assert(c & 0x80); }
do
{
    import core.bitop : bsr;

    // Position of the highest 0 bit of `c`, counted from the top, equals
    // the number of leading 1 bits.
    immutable leadingOnes = 7 - bsr((~uint(c)) & 0xFF);

    // 0xFF is tested explicitly: its masked complement is 0, so the bsr
    // result above is not meaningful for it.
    if (c == 0xFF || leadingOnes < 2 || leadingOnes > 4)
        throw new UTFException("Invalid UTF-8 sequence", index);
    return leadingOnes;
}
/++
    Calculate the length of the UTF sequence ending one code unit before
    `index` in `str`.

    Params:
        str = bidirectional range of UTF code units. Must be random access if
        `index` is passed
        index = index one past end of UTF sequence (default: `str.length`)

    Returns:
        The number of code units in the UTF sequence. For UTF-8, this is a
        value between 1 and 4 (as per $(HTTP tools.ietf.org/html/rfc3629#section-3, RFC 3629$(COMMA) section 3)).
        For UTF-16, it is either 1 or 2. For UTF-32, it is always 1.

    Throws:
        May throw a `UTFException` if `str[index]` is not one past the
        end of a valid UTF sequence.

    Note:
        `strideBack` will only analyze the element at $(D str[index - 1]).
        It will not fully verify the validity of the UTF sequence, nor
        even verify the presence of the sequence: it will not actually
        guarantee that $(D strideBack(str, index) <= index).
  +/
uint strideBack(S)(auto ref S str, size_t index)
if (is(S : const char[]) ||
    (isRandomAccessRange!S && is(immutable ElementType!S == immutable char)))
{
    static if (is(typeof(str.length) : ulong))
        assert(index <= str.length, "Past the end of the UTF-8 sequence");
    assert(index > 0, "Not the end of the UTF-8 sequence");

    // A unit that is not a continuation byte (10xx_xxxx) stands alone.
    if ((str[index-1] & 0b1100_0000) != 0b1000_0000)
        return 1;

    // Walk backwards to the first non-continuation byte, at most 4 units.
    if (index >= 4) //single verification for most common case
    {
        static foreach (i; 2 .. 5)
        {
            if ((str[index-i] & 0b1100_0000) != 0b1000_0000)
                return i;
        }
    }
    else
    {
        // Fewer than 4 units precede `index`: guard every access.
        static foreach (i; 2 .. 4)
        {
            if (index >= i && (str[index-i] & 0b1100_0000) != 0b1000_0000)
                return i;
        }
    }
    throw new UTFException("Not the end of the UTF sequence", index);
}

/// Ditto
uint strideBack(S)(auto ref S str)
if (is(S : const char[]) ||
    (isRandomAccessRange!S && hasLength!S && is(immutable ElementType!S == immutable char)))
{
    return strideBack(str, str.length);
}

/// Ditto
uint strideBack(S)(auto ref S str)
if (isBidirectionalRange!S && is(immutable ElementType!S == immutable char) && !isRandomAccessRange!S)
{
    assert(!str.empty, "Past the end of the UTF-8 sequence");

    // Work on a saved copy so the caller's range is not consumed.
    auto temp = str.save;
    foreach (i; AliasSeq!(1, 2, 3, 4))
    {
        if ((temp.back & 0b1100_0000) != 0b1000_0000)
            return i;
        temp.popBack();
        if (temp.empty)
            break;
    }
    throw new UTFException("The last code unit is not the end of the UTF-8 sequence");
}

@system unittest
{
    import core.exception : AssertError;
    import std.conv : to;
    import std.exception;
    import std.string : format;
    import std.traits : FunctionAttribute, functionAttributes, isSafe;
    static void test(string s, dchar c, size_t i = size_t.max, size_t line = __LINE__)
    {
        enforce(strideBack(s, i == size_t.max ? s.length : i) == codeLength!char(c),
                new AssertError(format("Unit test failure string: %s", s), __FILE__, line));

        enforce(strideBack(RandomCU!char(s), i == size_t.max ? s.length : i) == codeLength!char(c),
                new AssertError(format("Unit test failure range: %s", s), __FILE__, line));

        auto refRandom = new RefRandomCU!char(s);
        immutable randLen = refRandom.length;
        enforce(strideBack(refRandom, i == size_t.max ? s.length : i) == codeLength!char(c),
                new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line));
        enforce(refRandom.length == randLen,
                new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line));

        if (i == size_t.max)
        {
            enforce(strideBack(s) == codeLength!char(c),
                    new AssertError(format("Unit test failure string code length: %s", s), __FILE__, line));

            enforce(strideBack(BidirCU!char(s)) == codeLength!char(c),
                    new AssertError(format("Unit test failure range code length: %s", s), __FILE__, line));

            auto refBidir = new RefBidirCU!char(s);
            immutable bidirLen = refBidir.length;
            enforce(strideBack(refBidir) == codeLength!char(c),
                    new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line));
            enforce(refBidir.length == bidirLen,
                    new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line));
        }
    }

    assertCTFEable!(
    {
    test("a", 'a');
    test(" ", ' ');
    test("\u2029", '\u2029'); //paraSep
    test("\u0100", '\u0100');
    test("\u0430", '\u0430');
    test("\U00010143", '\U00010143');
    test("abcdefcdef", 'f');
    test("\U00010143\u0100\U00010143hello", 'o', 15);
    test("\U00010143\u0100\U00010143hello", 'l', 14);
    test("\U00010143\u0100\U00010143hello", 'l', 13);
    test("\U00010143\u0100\U00010143hello", 'e', 12);
    test("\U00010143\u0100\U00010143hello", 'h', 11);
    test("\U00010143\u0100\U00010143hello", '\U00010143', 10);
    test("\U00010143\u0100\U00010143hello", '\u0100', 6);
    test("\U00010143\u0100\U00010143hello", '\U00010143', 4);

    foreach (S; AliasSeq!(char[], const char[], string))
    {
        enum str = to!S("hello world");
        static assert(isSafe!({ strideBack(str, 0); }));
        static assert(isSafe!({ strideBack(str);    }));
        static assert((functionAttributes!({ strideBack(str, 0); }) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!({ strideBack(str);    }) & FunctionAttribute.pure_) != 0);
    }
    });
}

//UTF-16 is self synchronizing: The length of strideBack can be found from
//the value of a single wchar
/// Ditto
uint strideBack(S)(auto ref S str, size_t index)
if (is(S : const wchar[]) ||
    (isRandomAccessRange!S && is(immutable ElementType!S == immutable wchar)))
{
    static if (is(typeof(str.length) : ulong))
        assert(index <= str.length, "Past the end of the UTF-16 sequence");
    assert(index > 0, "Not the end of a UTF-16 sequence");

    // A low surrogate (0xDC00 .. 0xDFFF) is the second unit of a pair.
    immutable c2 = str[index-1];
    return 1 + (0xDC00 <= c2 && c2 < 0xE000);
}

/// Ditto
uint strideBack(S)(auto ref S str)
if (is(S : const wchar[]) ||
    (isBidirectionalRange!S && is(immutable ElementType!S == immutable wchar)))
{
    assert(!str.empty, "UTF-16 sequence is empty");

    static if (is(S : const(wchar)[]))
        immutable c2 = str[$ - 1];
    else
        immutable c2 = str.back;

    // The upper bound is exclusive: 0xE000 is an ordinary BMP code unit,
    // not a low surrogate, so it must yield 1 exactly as in the indexed
    // overload.
    return 1 + (0xDC00 <= c2 && c2 < 0xE000);
}

@system unittest
{
    import core.exception : AssertError;
    import std.conv : to;
    import std.exception;
    import std.string : format;
    import std.traits : FunctionAttribute, functionAttributes, isSafe;
    static void test(wstring s, dchar c, size_t i = size_t.max, size_t line = __LINE__)
    {
        enforce(strideBack(s, i == size_t.max ? s.length : i) == codeLength!wchar(c),
                new AssertError(format("Unit test failure string: %s", s), __FILE__, line));

        enforce(strideBack(RandomCU!wchar(s), i == size_t.max ? s.length : i) == codeLength!wchar(c),
                new AssertError(format("Unit test failure range: %s", s), __FILE__, line));

        auto refRandom = new RefRandomCU!wchar(s);
        immutable randLen = refRandom.length;
        enforce(strideBack(refRandom, i == size_t.max ? s.length : i) == codeLength!wchar(c),
                new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line));
        enforce(refRandom.length == randLen,
                new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line));

        if (i == size_t.max)
        {
            enforce(strideBack(s) == codeLength!wchar(c),
                    new AssertError(format("Unit test failure string code length: %s", s), __FILE__, line));

            enforce(strideBack(BidirCU!wchar(s)) == codeLength!wchar(c),
                    new AssertError(format("Unit test failure range code length: %s", s), __FILE__, line));

            auto refBidir = new RefBidirCU!wchar(s);
            immutable bidirLen = refBidir.length;
            enforce(strideBack(refBidir) == codeLength!wchar(c),
                    new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line));
            enforce(refBidir.length == bidirLen,
                    new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line));
        }
    }

    assertCTFEable!(
    {
    test("a", 'a');
    test(" ", ' ');
    test("\u2029", '\u2029'); //paraSep
    test("\u0100", '\u0100');
    test("\u0430", '\u0430');
    test("\U00010143", '\U00010143');
    test("abcdefcdef", 'f');
    test("\U00010143\u0100\U00010143hello", 'o', 10);
    test("\U00010143\u0100\U00010143hello", 'l', 9);
    test("\U00010143\u0100\U00010143hello", 'l', 8);
    test("\U00010143\u0100\U00010143hello", 'e', 7);
    test("\U00010143\u0100\U00010143hello", 'h', 6);
    test("\U00010143\u0100\U00010143hello", '\U00010143', 5);
    test("\U00010143\u0100\U00010143hello", '\u0100', 3);
    test("\U00010143\u0100\U00010143hello", '\U00010143', 2);

    foreach (S; AliasSeq!(wchar[], const wchar[], wstring))
    {
        enum str = to!S("hello world");
        static assert(isSafe!(() => strideBack(str, 0)));
        static assert(isSafe!(() => strideBack(str)   ));
        static assert((functionAttributes!(() => strideBack(str, 0)) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!(() => strideBack(str)   ) & FunctionAttribute.pure_) != 0);
    }
    });
}

/// Ditto
uint strideBack(S)(auto ref S str, size_t index)
if (isRandomAccessRange!S && is(immutable ElementEncodingType!S == immutable dchar))
{
    // Only the preconditions are checked; UTF-32 sequences are one unit long.
    static if (is(typeof(str.length) : ulong))
        assert(index <= str.length, "Past the end of the UTF-32 sequence");
    assert(index > 0, "Not the end of the UTF-32 sequence");
    return 1;
}

/// Ditto
uint strideBack(S)(auto ref S str)
if (isBidirectionalRange!S && is(immutable ElementEncodingType!S == immutable dchar))
{
    assert(!str.empty, "Empty UTF-32 sequence");
    return 1;
}

///
@safe unittest
{
    assert("a".strideBack == 1);
    assert("λ".strideBack == 2);
    assert("aλ".strideBack == 2);
    assert("aλ".strideBack(1) == 1);
    assert("𐐷".strideBack == 4);
}

@system unittest
{
    import core.exception : AssertError;
    import std.conv : to;
    import std.exception;
    import std.string : format;
    import std.traits : FunctionAttribute, functionAttributes, isSafe;
    static void test(dstring s, dchar c, size_t i = size_t.max, size_t line = __LINE__)
    {
        enforce(strideBack(s, i == size_t.max ? s.length : i) == codeLength!dchar(c),
                new AssertError(format("Unit test failure string: %s", s), __FILE__, line));

        enforce(strideBack(RandomCU!dchar(s), i == size_t.max ? s.length : i) == codeLength!dchar(c),
                new AssertError(format("Unit test failure range: %s", s), __FILE__, line));

        auto refRandom = new RefRandomCU!dchar(s);
        immutable randLen = refRandom.length;
        enforce(strideBack(refRandom, i == size_t.max ? s.length : i) == codeLength!dchar(c),
                new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line));
        enforce(refRandom.length == randLen,
                new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line));

        if (i == size_t.max)
        {
            enforce(strideBack(s) == codeLength!dchar(c),
                    new AssertError(format("Unit test failure string code length: %s", s), __FILE__, line));

            enforce(strideBack(BidirCU!dchar(s)) == codeLength!dchar(c),
                    new AssertError(format("Unit test failure range code length: %s", s), __FILE__, line));

            auto refBidir = new RefBidirCU!dchar(s);
            immutable bidirLen = refBidir.length;
            enforce(strideBack(refBidir) == codeLength!dchar(c),
                    new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line));
            enforce(refBidir.length == bidirLen,
                    new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line));
        }
    }

    assertCTFEable!(
    {
    test("a", 'a');
    test(" ", ' ');
    test("\u2029", '\u2029'); //paraSep
    test("\u0100", '\u0100');
    test("\u0430", '\u0430');
    test("\U00010143", '\U00010143');
    test("abcdefcdef", 'f');
    test("\U00010143\u0100\U00010143hello", 'o', 8);
    test("\U00010143\u0100\U00010143hello", 'l', 7);
    test("\U00010143\u0100\U00010143hello", 'l', 6);
    test("\U00010143\u0100\U00010143hello", 'e', 5);
    test("\U00010143\u0100\U00010143hello", 'h', 4);
    test("\U00010143\u0100\U00010143hello", '\U00010143', 3);
    test("\U00010143\u0100\U00010143hello", '\u0100', 2);
    test("\U00010143\u0100\U00010143hello", '\U00010143', 1);

    foreach (S; AliasSeq!(dchar[], const dchar[], dstring))
    {
        enum str = to!S("hello world");
        static assert(isSafe!(() => strideBack(str, 0)));
        static assert(isSafe!(() => strideBack(str)   ));
        static assert((functionAttributes!(() => strideBack(str, 0)) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!(() => strideBack(str)   ) & FunctionAttribute.pure_) != 0);
    }
    });
}


/++
    Given `index` into `str` and assuming that `index` is at the start
    of a UTF sequence, `toUCSindex` determines the number of UCS characters
    up to `index`. So, `index` is the index of a code unit at the
    beginning of a code point, and the return value is how many code points into
    the string that that code point is.

    Throws:
        A `UTFException` if walking the sequences from the start of `str`
        steps over `index` instead of landing on it.
  +/
size_t toUCSindex(C)(const(C)[] str, size_t index) @safe pure
if (isSomeChar!C)
{
    static if (is(immutable C == immutable dchar))
        return index;
    else
    {
        size_t n = 0;
        size_t j = 0;

        // Hop from sequence start to sequence start, counting code points.
        for (; j < index; ++n)
            j += stride(str, j);

        // Overshooting means `index` pointed into the middle of a sequence.
        if (j > index)
        {
            static if (is(immutable C == immutable char))
                throw new UTFException("Invalid UTF-8 sequence", index);
            else
                throw new UTFException("Invalid UTF-16 sequence", index);
        }

        return n;
    }
}

///
@safe unittest
{
    assert(toUCSindex(`hello world`, 7) == 7);
    assert(toUCSindex(`hello world`w, 7) == 7);
    assert(toUCSindex(`hello world`d, 7) == 7);

    assert(toUCSindex(`Ma Chérie`, 7) == 6);
    assert(toUCSindex(`Ma Chérie`w, 7) == 7);
    assert(toUCSindex(`Ma Chérie`d, 7) == 7);

    assert(toUCSindex(`さいごの果実 / ミツバチと科学者`, 9) == 3);
    assert(toUCSindex(`さいごの果実 / ミツバチと科学者`w, 9) == 9);
    assert(toUCSindex(`さいごの果実 / ミツバチと科学者`d, 9) == 9);
}
1085 So, `n` is how many code points into the string the code point is, and 1086 the array index of the code unit is returned. 1087 +/ 1088 size_t toUTFindex(C)(const(C)[] str, size_t n) @safe pure 1089 if (isSomeChar!C) 1090 { 1091 static if (is(immutable C == immutable dchar)) 1092 { 1093 return n; 1094 } 1095 else 1096 { 1097 size_t i; 1098 while (n--) 1099 { 1100 i += stride(str, i); 1101 } 1102 return i; 1103 } 1104 } 1105 1106 /// 1107 @safe unittest 1108 { 1109 assert(toUTFindex(`hello world`, 7) == 7); 1110 assert(toUTFindex(`hello world`w, 7) == 7); 1111 assert(toUTFindex(`hello world`d, 7) == 7); 1112 1113 assert(toUTFindex(`Ma Chérie`, 6) == 7); 1114 assert(toUTFindex(`Ma Chérie`w, 7) == 7); 1115 assert(toUTFindex(`Ma Chérie`d, 7) == 7); 1116 1117 assert(toUTFindex(`さいごの果実 / ミツバチと科学者`, 3) == 9); 1118 assert(toUTFindex(`さいごの果実 / ミツバチと科学者`w, 9) == 9); 1119 assert(toUTFindex(`さいごの果実 / ミツバチと科学者`d, 9) == 9); 1120 } 1121 1122 1123 /* =================== Decode ======================= */ 1124 1125 /// Whether or not to replace invalid UTF with $(LREF replacementDchar) 1126 alias UseReplacementDchar = Flag!"useReplacementDchar"; 1127 1128 /++ 1129 Decodes and returns the code point starting at `str[index]`. `index` 1130 is advanced to one past the decoded code point. If the code point is not 1131 well-formed, then a `UTFException` is thrown and `index` remains 1132 unchanged. 1133 1134 decode will only work with strings and random access ranges of code units 1135 with length and slicing, whereas $(LREF decodeFront) will work with any 1136 input range of code units. 
1137 1138 Params: 1139 useReplacementDchar = if invalid UTF, return replacementDchar rather than throwing 1140 str = input string or indexable Range 1141 index = starting index into s[]; incremented by number of code units processed 1142 1143 Returns: 1144 decoded character 1145 1146 Throws: 1147 $(LREF UTFException) if `str[index]` is not the start of a valid UTF 1148 sequence and useReplacementDchar is `No.useReplacementDchar` 1149 +/ 1150 dchar decode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(auto ref S str, ref size_t index) 1151 if (!isSomeString!S && 1152 isRandomAccessRange!S && hasSlicing!S && hasLength!S && isSomeChar!(ElementType!S)) 1153 in 1154 { 1155 assert(index < str.length, "Attempted to decode past the end of a string"); 1156 } 1157 out (result) 1158 { 1159 assert(isValidDchar(result)); 1160 } 1161 do 1162 { 1163 if (str[index] < codeUnitLimit!S) 1164 return str[index++]; 1165 else 1166 return decodeImpl!(true, useReplacementDchar)(str, index); 1167 } 1168 1169 /// ditto 1170 dchar decode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)( 1171 auto ref scope S str, ref size_t index) @trusted pure 1172 if (isSomeString!S) 1173 in 1174 { 1175 assert(index < str.length, "Attempted to decode past the end of a string"); 1176 } 1177 out (result) 1178 { 1179 assert(isValidDchar(result)); 1180 } 1181 do 1182 { 1183 if (str[index] < codeUnitLimit!S) 1184 return str[index++]; 1185 else static if (is(immutable S == immutable C[], C)) 1186 return decodeImpl!(true, useReplacementDchar)(cast(const(C)[]) str, index); 1187 } 1188 1189 /// 1190 @safe pure unittest 1191 { 1192 size_t i; 1193 1194 assert("a".decode(i) == 'a' && i == 1); 1195 i = 0; 1196 assert("å".decode(i) == 'å' && i == 2); 1197 i = 1; 1198 assert("aå".decode(i) == 'å' && i == 3); 1199 i = 0; 1200 assert("å"w.decode(i) == 'å' && i == 1); 1201 1202 // ë as a multi-code point grapheme 1203 i = 0; 1204 assert("e\u0308".decode(i) == 'e' && i == 1); 1205 // 
ë as a single code point grapheme 1206 i = 0; 1207 assert("ë".decode(i) == 'ë' && i == 2); 1208 i = 0; 1209 assert("ë"w.decode(i) == 'ë' && i == 1); 1210 } 1211 1212 @safe pure unittest // https://issues.dlang.org/show_bug.cgi?id=22867 1213 { 1214 import std.conv : hexString; 1215 string data = hexString!"f787a598"; 1216 size_t offset = 0; 1217 try data.decode(offset); 1218 catch (UTFException ex) assert(offset == 0); 1219 } 1220 1221 /++ 1222 `decodeFront` is a variant of $(LREF decode) which specifically decodes 1223 the first code point. Unlike $(LREF decode), `decodeFront` accepts any 1224 $(REF_ALTTEXT input range, isInputRange, std,range,primitives) 1225 of code units (rather than just a string or random access 1226 range). It also takes the range by `ref` and pops off the elements as it 1227 decodes them. If `numCodeUnits` is passed in, it gets set to the number 1228 of code units which were in the code point which was decoded. 1229 1230 Params: 1231 useReplacementDchar = if invalid UTF, return replacementDchar rather than throwing 1232 str = input string or indexable Range 1233 numCodeUnits = set to number of code units processed 1234 1235 Returns: 1236 decoded character 1237 1238 Throws: 1239 $(LREF UTFException) if `str.front` is not the start of a valid UTF 1240 sequence. If an exception is thrown, then there is no guarantee as to 1241 the number of code units which were popped off, as it depends on the 1242 type of range being used and how many code units had to be popped off 1243 before the code point was determined to be invalid. 
1244 +/ 1245 dchar decodeFront(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)( 1246 ref S str, out size_t numCodeUnits) 1247 if (!isSomeString!S && isInputRange!S && isSomeChar!(ElementType!S)) 1248 in 1249 { 1250 assert(!str.empty); 1251 } 1252 out (result) 1253 { 1254 assert(isValidDchar(result)); 1255 } 1256 do 1257 { 1258 immutable fst = str.front; 1259 1260 if (fst < codeUnitLimit!S) 1261 { 1262 str.popFront(); 1263 numCodeUnits = 1; 1264 return fst; 1265 } 1266 else 1267 { 1268 // https://issues.dlang.org/show_bug.cgi?id=14447 forces canIndex to be 1269 // done outside of decodeImpl, which is undesirable, since not all 1270 // overloads of decodeImpl need it. So, it should be moved back into 1271 // decodeImpl once https://issues.dlang.org/show_bug.cgi?id=8521 1272 // has been fixed. 1273 enum canIndex = is(S : const char[]) || isRandomAccessRange!S && hasSlicing!S && hasLength!S; 1274 immutable retval = decodeImpl!(canIndex, useReplacementDchar)(str, numCodeUnits); 1275 1276 // The other range types were already popped by decodeImpl. 1277 static if (isRandomAccessRange!S && hasSlicing!S && hasLength!S) 1278 str = str[numCodeUnits .. str.length]; 1279 1280 return retval; 1281 } 1282 } 1283 1284 /// ditto 1285 dchar decodeFront(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)( 1286 ref scope S str, out size_t numCodeUnits) @trusted pure 1287 if (isSomeString!S) 1288 in 1289 { 1290 assert(!str.empty); 1291 } 1292 out (result) 1293 { 1294 assert(isValidDchar(result)); 1295 } 1296 do 1297 { 1298 if (str[0] < codeUnitLimit!S) 1299 { 1300 numCodeUnits = 1; 1301 immutable retval = str[0]; 1302 str = str[1 .. $]; 1303 return retval; 1304 } 1305 else static if (is(immutable S == immutable C[], C)) 1306 { 1307 immutable retval = decodeImpl!(true, useReplacementDchar)(cast(const(C)[]) str, numCodeUnits); 1308 str = str[numCodeUnits .. 
$]; 1309 return retval; 1310 } 1311 } 1312 1313 /++ Ditto +/ 1314 dchar decodeFront(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(ref S str) 1315 if (isInputRange!S && isSomeChar!(ElementType!S)) 1316 { 1317 size_t numCodeUnits; 1318 return decodeFront!useReplacementDchar(str, numCodeUnits); 1319 } 1320 1321 /// 1322 @safe pure unittest 1323 { 1324 import std.range.primitives; 1325 string str = "Hello, World!"; 1326 1327 assert(str.decodeFront == 'H' && str == "ello, World!"); 1328 str = "å"; 1329 assert(str.decodeFront == 'å' && str.empty); 1330 str = "å"; 1331 size_t i; 1332 assert(str.decodeFront(i) == 'å' && i == 2 && str.empty); 1333 } 1334 1335 /++ 1336 `decodeBack` is a variant of $(LREF decode) which specifically decodes 1337 the last code point. Unlike $(LREF decode), `decodeBack` accepts any 1338 bidirectional range of code units (rather than just a string or random access 1339 range). It also takes the range by `ref` and pops off the elements as it 1340 decodes them. If `numCodeUnits` is passed in, it gets set to the number 1341 of code units which were in the code point which was decoded. 1342 1343 Params: 1344 useReplacementDchar = if invalid UTF, return `replacementDchar` rather than throwing 1345 str = input string or bidirectional Range 1346 numCodeUnits = gives the number of code units processed 1347 1348 Returns: 1349 A decoded UTF character. 1350 1351 Throws: 1352 $(LREF UTFException) if `str.back` is not the end of a valid UTF 1353 sequence. If an exception is thrown, the `str` itself remains unchanged, 1354 but there is no guarantee as to the value of `numCodeUnits` (when passed). 
1355 +/ 1356 dchar decodeBack(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)( 1357 ref S str, out size_t numCodeUnits) 1358 if (isSomeString!S) 1359 in 1360 { 1361 assert(!str.empty); 1362 } 1363 out (result) 1364 { 1365 assert(isValidDchar(result)); 1366 } 1367 do 1368 { 1369 if (str[$ - 1] < codeUnitLimit!S) 1370 { 1371 numCodeUnits = 1; 1372 immutable retval = str[$ - 1]; 1373 str = str[0 .. $ - 1]; 1374 return retval; 1375 } 1376 else static if (is(immutable S == immutable C[], C)) 1377 { 1378 numCodeUnits = strideBack(str); 1379 immutable newLength = str.length - numCodeUnits; 1380 size_t index = newLength; 1381 immutable retval = decodeImpl!(true, useReplacementDchar)(cast(const(C)[]) str, index); 1382 str = str[0 .. newLength]; 1383 return retval; 1384 } 1385 } 1386 1387 /++ Ditto +/ 1388 dchar decodeBack(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)( 1389 ref S str, out size_t numCodeUnits) 1390 if (!isSomeString!S && isSomeChar!(ElementType!S) && isBidirectionalRange!S 1391 && ((isRandomAccessRange!S && hasLength!S) || !isRandomAccessRange!S)) 1392 in 1393 { 1394 assert(!str.empty); 1395 } 1396 out (result) 1397 { 1398 assert(isValidDchar(result)); 1399 } 1400 do 1401 { 1402 if (str.back < codeUnitLimit!S) 1403 { 1404 numCodeUnits = 1; 1405 immutable retval = str.back; 1406 str.popBack(); 1407 return retval; 1408 } 1409 else 1410 { 1411 numCodeUnits = strideBack(str); 1412 static if (isRandomAccessRange!S) 1413 { 1414 size_t index = str.length - numCodeUnits; 1415 immutable retval = decodeImpl!(true, useReplacementDchar)(str, index); 1416 str.popBackExactly(numCodeUnits); 1417 return retval; 1418 } 1419 else 1420 { 1421 alias Char = Unqual!(ElementType!S); 1422 Char[4] codeUnits; 1423 S tmp = str.save; 1424 for (size_t i = numCodeUnits; i > 0; ) 1425 { 1426 codeUnits[--i] = tmp.back; 1427 tmp.popBack(); 1428 } 1429 const Char[] codePoint = codeUnits[0 .. 
numCodeUnits]; 1430 size_t index = 0; 1431 immutable retval = decodeImpl!(true, useReplacementDchar)(codePoint, index); 1432 str = tmp; 1433 return retval; 1434 } 1435 } 1436 } 1437 1438 /++ Ditto +/ 1439 dchar decodeBack(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(ref S str) 1440 if (isSomeString!S 1441 || (isRandomAccessRange!S && hasLength!S && isSomeChar!(ElementType!S)) 1442 || (!isRandomAccessRange!S && isBidirectionalRange!S && isSomeChar!(ElementType!S))) 1443 in 1444 { 1445 assert(!str.empty); 1446 } 1447 out (result) 1448 { 1449 assert(isValidDchar(result)); 1450 } 1451 do 1452 { 1453 size_t numCodeUnits; 1454 return decodeBack!useReplacementDchar(str, numCodeUnits); 1455 } 1456 1457 /// 1458 @system pure unittest 1459 { 1460 import std.range.primitives; 1461 string str = "Hello, World!"; 1462 1463 assert(str.decodeBack == '!' && str == "Hello, World"); 1464 str = "å"; 1465 assert(str.decodeBack == 'å' && str.empty); 1466 str = "å"; 1467 size_t i; 1468 assert(str.decodeBack(i) == 'å' && i == 2 && str.empty); 1469 } 1470 1471 // For the given range, code unit values less than this 1472 // are guaranteed to be valid single-codepoint encodings. 1473 package template codeUnitLimit(S) 1474 if (isSomeChar!(ElementEncodingType!S)) 1475 { 1476 static if (is(immutable ElementEncodingType!S == immutable char)) 1477 enum char codeUnitLimit = 0x80; 1478 else static if (is(immutable ElementEncodingType!S == immutable wchar)) 1479 enum wchar codeUnitLimit = 0xD800; 1480 else 1481 enum dchar codeUnitLimit = 0xD800; 1482 } 1483 1484 /* 1485 * For strings, this function does its own bounds checking to give a 1486 * more useful error message when attempting to decode past the end of a string. 1487 * Subsequently it uses a pointer instead of an array to avoid 1488 * redundant bounds checking. 1489 * 1490 * The three overloads of this operate on chars, wchars, and dchars. 
 *
 * Params:
 *      canIndex = if S is indexable
 *      useReplacementDchar = if invalid UTF, return replacementDchar rather than throwing
 *      str = input string or Range
 *      index = starting index into `str`; incremented by number of code units processed
 *
 * Returns:
 *      decoded character
 */
private dchar decodeImpl(bool canIndex, UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
    auto ref S str, ref size_t index)
if (
    is(S : const char[]) || (isInputRange!S && is(immutable ElementEncodingType!S == immutable char)))
{
    /* The following encodings are valid, except for the 5 and 6 byte
     * combinations:
     *  0xxxxxxx
     *  110xxxxx 10xxxxxx
     *  1110xxxx 10xxxxxx 10xxxxxx
     *  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     *  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     *  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     */

    /* Dchar bitmask for different numbers of UTF-8 code units.
     * bitMask[k] keeps the payload bits of a (k + 1)-unit sequence:
     * 7, 11, 16 and 21 bits respectively.
     */
    alias bitMask = AliasSeq!((1 << 7) - 1, (1 << 11) - 1, (1 << 16) - 1, (1 << 21) - 1);

    // `pstr` is a view that starts at the code unit to decode: a raw pointer
    // for arrays, a slice for sliceable random-access ranges, and the range
    // itself otherwise (in which case it is consumed while decoding).
    static if (is(S : const char[]))
        auto pstr = str.ptr + index;    // this is what makes decodeImpl() @system code
    else static if (isRandomAccessRange!S && hasSlicing!S && hasLength!S)
        auto pstr = str[index .. str.length];
    else
        alias pstr = str;

    // https://issues.dlang.org/show_bug.cgi?id=14447 forces this to be done
    // outside of decodeImpl
    //enum canIndex = is(S : const char[]) || (isRandomAccessRange!S && hasSlicing!S && hasLength!S);

    static if (canIndex)
    {
        // Number of code units available from `index` to the end.
        immutable length = str.length - index;
        ubyte fst = pstr[0];
    }
    else
    {
        ubyte fst = pstr.front;
        pstr.popFront();
    }

    static if (!useReplacementDchar)
    {
        static if (canIndex)
        {
            // Builds the exception and attaches the offending code units:
            // the first unit plus up to three following continuation bytes
            // (10xxxxxx).
            static UTFException exception(S)(S str, string msg)
            {
                uint[4] sequence = void;
                size_t i;

                do
                {
                    sequence[i] = str[i];
                } while (++i < str.length && i < 4 && (str[i] & 0xC0) == 0x80);

                return new UTFException(msg, i).setSequence(sequence[0 .. i]);
            }
        }

        UTFException invalidUTF()
        {
            static if (canIndex)
               return exception(pstr[0 .. length], "Invalid UTF-8 sequence");
            else
            {
                //We can't include the invalid sequence with input strings without
                //saving each of the code units along the way, and we can't do it with
                //forward ranges without saving the entire range. Both would incur a
                //cost for the decoding of every character just to provide a better
                //error message for the (hopefully) rare case when an invalid UTF-8
                //sequence is encountered, so we don't bother trying to include the
                //invalid sequence here, unlike with strings and sliceable ranges.
                return new UTFException("Invalid UTF-8 sequence");
            }
        }

        UTFException outOfBounds()
        {
            static if (canIndex)
               return exception(pstr[0 .. length], "Attempted to decode past the end of a string");
            else
               return new UTFException("Attempted to decode past the end of a string");
        }
    }

    // A lead byte of a multi-unit sequence has its two top bits set; anything
    // else reaching this function (callers handle values below 0x80) is a
    // stray continuation byte.
    if ((fst & 0b1100_0000) != 0b1100_0000)
    {
        static if (useReplacementDchar)
        {
            ++index;            // always consume bad input to avoid infinite loops
            return replacementDchar;
        }
        else
            throw invalidUTF(); // starter must have at least 2 first bits set
    }
    ubyte tmp = void;
    dchar d = fst; // upper control bits are masked out later
    // Shifting the lead byte left exposes its length marker one bit at a time:
    // after each continuation byte, bit 7 tells whether another one follows.
    fst <<= 1;

    // Unrolled at compile time: `i` is the number of code units consumed so
    // far, i.e. the position of the continuation byte about to be read.
    foreach (i; AliasSeq!(1, 2, 3))
    {

        static if (canIndex)
        {
            // The sequence is cut off by the end of the input.
            if (i == length)
            {
                static if (useReplacementDchar)
                {
                    index += i;
                    return replacementDchar;
                }
                else
                    throw outOfBounds();
            }
        }
        else
        {
            if (pstr.empty)
            {
                static if (useReplacementDchar)
                {
                    index += i;
                    return replacementDchar;
                }
                else
                    throw outOfBounds();
            }
        }

        static if (canIndex)
            tmp = pstr[i];
        else
        {
            tmp = pstr.front;
            pstr.popFront();
        }

        // Every continuation byte must look like 10xxxxxx.
        if ((tmp & 0xC0) != 0x80)
        {
            static if (useReplacementDchar)
            {
                index += i + 1;
                return replacementDchar;
            }
            else
                throw invalidUTF();
        }

        // Append the six payload bits of the continuation byte.
        d = (d << 6) | (tmp & 0x3F);
        fst <<= 1;

        if (!(fst & 0x80)) // no more bytes
        {
            d &= bitMask[i]; // mask out control bits

            // overlong, could have been encoded with i bytes
            if ((d & ~bitMask[i - 1]) == 0)
            {
                static if (useReplacementDchar)
                {
                    index += i + 1;
                    return replacementDchar;
                }
                else
                    throw invalidUTF();
            }

            // check for surrogates only needed for 3 bytes
            static if (i == 2)
            {
                if (!isValidDchar(d))
                {
                    static if (useReplacementDchar)
                    {
                        index += i + 1;
                        return replacementDchar;
                    }
                    else
                        throw invalidUTF();
                }
            }

            // A 4-byte sequence can encode values beyond the largest code point.
            static if (i == 3)
            {
                if (d > dchar.max)
                {
                    static if (useReplacementDchar)
                        d = replacementDchar;
                    else
                        throw invalidUTF();
                }
            }

            index += i + 1;
            return d;
        }
    }

    // Reaching this point means the lead byte announced more than four code
    // units (the 5 and 6 byte forms listed above), which is never valid.
    static if (useReplacementDchar)
    {
        index += 4;             // read 4 chars by now
        return replacementDchar;
    }
    else
        throw invalidUTF();
}

@safe pure @nogc nothrow
unittest
{
    // Add tests for useReplacementDchar == yes path

    // Minimal non-indexable input range over a string, so that the
    // canIndex == false code path of decodeImpl is exercised.
    static struct R
    {
    @safe pure @nogc nothrow:
        this(string s) { this.s = s; }
        @property bool empty() { return idx == s.length; }
        @property char front() { return s[idx]; }
        void popFront() { ++idx; }
        size_t idx;
        string s;
    }

    foreach (s; invalidUTFstrings!char())
    {
        auto r = R(s);
        size_t index;
        dchar dc = decodeImpl!(false, Yes.useReplacementDchar)(r, index);
        assert(dc == replacementDchar);
        assert(1 <= index && index <= s.length);
    }
}

/*
 * UTF-16 overload of decodeImpl. Callers must have handled code units below
 * 0xD800 already (see the assert below); this function deals with surrogate
 * pairs and with the remaining single-unit values.
 */
private dchar decodeImpl(bool canIndex, UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)
(auto ref S str, ref size_t index)
if (is(S : const wchar[]) || (isInputRange!S && is(immutable ElementEncodingType!S == immutable wchar)))
{
    // Same three-way view as in the UTF-8 overload: pointer, slice, or the
    // range itself.
    static if (is(S : const wchar[]))
        auto pstr = str.ptr + index;
    else static if (isRandomAccessRange!S && hasSlicing!S && hasLength!S)
        auto pstr = str[index .. str.length];
    else
        alias pstr = str;

    // https://issues.dlang.org/show_bug.cgi?id=14447 forces this to be done
    // outside of decodeImpl
    //enum canIndex = is(S : const wchar[]) || (isRandomAccessRange!S && hasSlicing!S && hasLength!S);

    static if (canIndex)
    {
        // Number of code units available from `index` to the end.
        immutable length = str.length - index;
        uint u = pstr[0];
    }
    else
    {
        uint u = pstr.front;
        pstr.popFront();
    }

    static if (!useReplacementDchar)
    {
        // Builds the exception; when indexing is possible the first code unit
        // is attached as the offending sequence.
        UTFException exception(string msg)
        {
            static if (canIndex)
                return new UTFException(msg).setSequence(pstr[0]);
            else
                return new UTFException(msg);
        }
    }

    // The < case must be taken care of before decodeImpl is called.
    assert(u >= 0xD800);

    if (u <= 0xDBFF)
    {
        // High surrogate: a low surrogate has to follow.
        static if (canIndex)
            immutable onlyOneCodeUnit = length == 1;
        else
            immutable onlyOneCodeUnit = pstr.empty;

        if (onlyOneCodeUnit)
        {
            static if (useReplacementDchar)
            {
                ++index;
                return replacementDchar;
            }
            else
                throw exception("surrogate UTF-16 high value past end of string");
        }

        static if (canIndex)
            immutable uint u2 = pstr[1];
        else
        {
            immutable uint u2 = pstr.front;
            pstr.popFront();
        }

        if (u2 < 0xDC00 || u2 > 0xDFFF)
        {
            static if (useReplacementDchar)
                u = replacementDchar;
            else
                throw exception("surrogate UTF-16 low value out of range");
        }
        else
            // 0xD7C0 == 0xD800 - 0x40, so this both strips the high-surrogate
            // tag and adds the 0x10000 offset in one step.
            u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00);
        // Accounts for the second code unit; the first one is counted by the
        // unconditional increment further down.
        ++index;
    }
    else if (u >= 0xDC00 && u <= 0xDFFF)
    {
        // Low surrogate without a preceding high surrogate.
        static if (useReplacementDchar)
            u = replacementDchar;
        else
            throw exception("unpaired surrogate UTF-16 value");
    }
    ++index;

    // Note: u+FFFE and u+FFFF are specifically permitted by the
    // Unicode standard for application internal use (see isValidDchar)

    return cast(dchar) u;
}

@safe pure
@nogc nothrow
unittest
{
    // Add tests for useReplacementDchar == true path

    // Minimal non-indexable input range over a wstring, so that the
    // canIndex == false code path of decodeImpl is exercised.
    static struct R
    {
    @safe pure @nogc nothrow:
        this(wstring s) { this.s = s; }
        @property bool empty() { return idx == s.length; }
        @property wchar front() { return s[idx]; }
        void popFront() { ++idx; }
        size_t idx;
        wstring s;
    }

    foreach (s; invalidUTFstrings!wchar())
    {
        auto r = R(s);
        size_t index;
        dchar dc = decodeImpl!(false, Yes.useReplacementDchar)(r, index);
        assert(dc == replacementDchar);
        assert(1 <= index && index <= s.length);
    }
}

/*
 * UTF-32 overload of decodeImpl: a single code unit is read and validated
 * with isValidDchar. Arrays and random-access ranges are indexed at `index`;
 * other ranges are read from the front and popped. `index` is incremented by
 * one on success and on the replacement path, and left alone when throwing.
 */
private dchar decodeImpl(bool canIndex, UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
    auto ref S str, ref size_t index)
if (is(S : const dchar[]) || (isInputRange!S && is(immutable ElementEncodingType!S == immutable dchar)))
{
    static if (is(S : const dchar[]))
        auto pstr = str.ptr;
    else
        alias pstr = str;

    static if (is(S : const dchar[]) || isRandomAccessRange!S)
    {
        dchar dc = pstr[index];
        if (!isValidDchar(dc))
        {
            static if (useReplacementDchar)
                dc = replacementDchar;
            else
                throw new UTFException("Invalid UTF-32 value").setSequence(dc);
        }
        ++index;
        return dc;
    }
    else
    {
        dchar dc = pstr.front;
        if (!isValidDchar(dc))
        {
            static if (useReplacementDchar)
                dc = replacementDchar;
            else
                throw new UTFException("Invalid UTF-32 value").setSequence(dc);
        }
        ++index;
        pstr.popFront();
        return dc;
    }
}

@safe pure @nogc nothrow
unittest
{
    // Add tests for useReplacementDchar == true path

    // Minimal non-indexable input range over a dstring.
    static struct R
    {
    @safe pure @nogc nothrow:
        this(dstring s) { this.s = s; }
        @property bool empty() { return idx == s.length; }
        @property dchar front() { return s[idx]; }
        void popFront() { ++idx; }
        size_t idx;
        dstring s;
    }

    foreach (s; invalidUTFstrings!dchar())
    {
        auto r = R(s);
        size_t index;
        dchar dc = decodeImpl!(false, Yes.useReplacementDchar)(r, index);
        assert(dc == replacementDchar);
        assert(1 <= index && index <= s.length);
    }
}


// Test helper: decodes at `index` with decode() and checks the resulting
// character, the advanced index and that the range length is unchanged.
// Only runs for random-access ranges that are not narrow strings.
version (StdUnittest) private void testDecode(R)(R range,
                                             size_t index,
                                             dchar expectedChar,
                                             size_t expectedIndex,
                                             size_t line = __LINE__)
{
    import core.exception : AssertError;
    import std.exception : enforce;
    import std.string : format;
    import std.traits : isNarrowString;

    static if (hasLength!R)
        immutable lenBefore = range.length;

    static if (isRandomAccessRange!R && !isNarrowString!R)
    {
        {
            immutable result = decode(range, index);
            enforce(result == expectedChar,
                    new AssertError(format("decode: Wrong character: %s", result), __FILE__, line));
            enforce(index == expectedIndex,
                    new AssertError(format("decode: Wrong index: %s", index), __FILE__, line));
            static if (hasLength!R)
            {
                enforce(range.length == lenBefore,
                        new AssertError(format("decode: length changed: %s", range.length), __FILE__, line));
            }
        }
    }
}

// Test helper: decodes the first code point with decodeFront() and checks the
// character, the reported code unit count and that the range shrank by
// exactly that many code units.
version (StdUnittest) private void testDecodeFront(R)(ref R range,
                                                  dchar expectedChar,
                                                  size_t expectedNumCodeUnits,
                                                  size_t line = __LINE__)
{
    import core.exception : AssertError;
    import std.exception : enforce;
    import std.string : format;

    static if (hasLength!R)
        immutable lenBefore = range.length;

    size_t numCodeUnits;
    immutable result = decodeFront(range, numCodeUnits);
    enforce(result == expectedChar,
            new AssertError(format("decodeFront: Wrong character: %s", result), __FILE__, line));
    enforce(numCodeUnits == expectedNumCodeUnits,
            new AssertError(format("decodeFront: Wrong numCodeUnits: %s", numCodeUnits), __FILE__, line));

    static if (hasLength!R)
    {
        enforce(range.length == lenBefore - numCodeUnits,
                new AssertError(format("decodeFront: wrong length: %s", range.length), __FILE__, line));
    }
}

// Test helper: mirror image of testDecodeFront for decodeBack(). Does nothing
// for ranges that are not bidirectional.
version (StdUnittest) private void testDecodeBack(R)(ref R range,
                                                 dchar expectedChar,
                                                 size_t expectedNumCodeUnits,
                                                 size_t line = __LINE__)
{
    // This condition is to allow unit testing all `decode` functions together
    static if (!isBidirectionalRange!R)
        return;
    else
    {
        import core.exception : AssertError;
        import std.exception : enforce;
        import std.string : format;

        static if (hasLength!R)
            immutable lenBefore = range.length;

        size_t numCodeUnits;
        immutable result = decodeBack(range, numCodeUnits);
        enforce(result == expectedChar,
                new AssertError(format("decodeBack: Wrong character: %s", result), __FILE__, line));
        enforce(numCodeUnits == expectedNumCodeUnits,
                new AssertError(format("decodeBack: Wrong numCodeUnits: %s", numCodeUnits), __FILE__, line));

        static if (hasLength!R)
        {
            enforce(range.length == lenBefore - numCodeUnits,
                    new AssertError(format("decodeBack: wrong length: %s", range.length), __FILE__, line));
        }
    }
}

// Test helper: runs testDecode, testDecodeBack (on a saved copy, when the
// range is bidirectional) and testDecodeFront on the same single code point.
version (StdUnittest) private void testAllDecode(R)(R range,
                                                dchar expectedChar,
                                                size_t expectedIndex,
                                                size_t line = __LINE__)
{
    testDecode(range, 0, expectedChar, expectedIndex, line);
    static if (isBidirectionalRange!R)
    {
        auto rangeCopy = range.save;
        testDecodeBack(rangeCopy, expectedChar, expectedIndex, line);
    }
    testDecodeFront(range, expectedChar, expectedIndex, line);
}

// Test helper: checks that decode() throws UTFException at `index` without
// moving the index or changing the range length, and - when starting at
// index 0 - that decodeFront() throws as well.
version (StdUnittest) private void testBadDecode(R)(R range, size_t index, size_t line = __LINE__)
{
    import core.exception : AssertError;
    import std.exception : assertThrown, enforce;
    import std.string : format;

    immutable initialIndex = index;

    static if (hasLength!R)
        immutable lenBefore = range.length;

    static if (isRandomAccessRange!R)
    {
        assertThrown!UTFException(decode(range, index), null, __FILE__, line);
        enforce(index == initialIndex,
                new AssertError(format("decode: Wrong index: %s", index), __FILE__, line));
        static if (hasLength!R)
        {
            // The format string needs a `%s` for the length argument;
            // without it format() itself throws on the failure path and the
            // intended AssertError is never produced.
            enforce(range.length == lenBefore,
                    new AssertError(format("decode: length changed: %s", range.length), __FILE__, line));
        }
    }

    if (initialIndex == 0)
        assertThrown!UTFException(decodeFront(range, index), null, __FILE__, line);
}

// Test helper: checks that decodeBack() throws UTFException and leaves the
// range length unchanged. Only random-access ranges are exercised; nothing
// happens for ranges that are not bidirectional.
version (StdUnittest) private void testBadDecodeBack(R)(R range, size_t line = __LINE__)
{
    // This condition is to allow unit testing all `decode` functions together
    static if (!isBidirectionalRange!R)
        return;
    else
    {
        import core.exception : AssertError;
        import std.exception : assertThrown, enforce;
        import std.string : format;

        static if (hasLength!R)
            immutable lenBefore = range.length;

        static if (isRandomAccessRange!R)
        {
            assertThrown!UTFException(decodeBack(range), null, __FILE__, line);
            static if (hasLength!R)
            {
                // `%s` added for the same reason as in testBadDecode.
                enforce(range.length == lenBefore,
                        new AssertError(format("decodeBack: length changed: %s", range.length), __FILE__, line));
            }
        }
    }
}

@system unittest
{
    import std.conv : to;
    import std.exception;

    assertCTFEable!(
    {
    foreach (S; AliasSeq!(to!string, InputCU!char, RandomCU!char,
                          (string s) => new RefBidirCU!char(s),
                          (string s) => new RefRandomCU!char(s)))
    {
        enum sHasLength = hasLength!(typeof(S("abcd")));

        {
            auto range = S("abcd");
            testDecode(range, 0, 'a', 1);
            testDecode(range, 1, 'b', 2);
            testDecodeFront(range, 'a', 1);
            testDecodeFront(range, 'b', 1);
            assert(decodeFront(range) == 'c');
            assert(decodeFront(range) == 'd');
2094 } 2095 2096 { 2097 auto range = S("ウェブサイト"); 2098 testDecode(range, 0, 'ウ', 3); 2099 testDecode(range, 3, 'ェ', 6); 2100 testDecodeFront(range, 'ウ', 3); 2101 testDecodeFront(range, 'ェ', 3); 2102 assert(decodeFront(range) == 'ブ'); 2103 assert(decodeFront(range) == 'サ'); 2104 } 2105 2106 { 2107 auto range = S("abcd"); 2108 testDecodeBack(range, 'd', 1); 2109 testDecodeBack(range, 'c', 1); 2110 testDecodeBack(range, 'b', 1); 2111 testDecodeBack(range, 'a', 1); 2112 } 2113 2114 { 2115 auto range = S("ウェブサイト"); 2116 testDecodeBack(range, 'ト', 3); 2117 testDecodeBack(range, 'イ', 3); 2118 testDecodeBack(range, 'サ', 3); 2119 testDecodeBack(range, 'ブ', 3); 2120 } 2121 2122 testAllDecode(S("\xC2\xA9"), '\u00A9', 2); 2123 testAllDecode(S("\xE2\x89\xA0"), '\u2260', 3); 2124 2125 foreach (str; ["\xE2\x89", // too short 2126 "\xC0\x8A", 2127 "\xE0\x80\x8A", 2128 "\xF0\x80\x80\x8A", 2129 "\xF8\x80\x80\x80\x8A", 2130 "\xFC\x80\x80\x80\x80\x8A"]) 2131 { 2132 testBadDecode(S(str), 0); 2133 testBadDecode(S(str), 1); 2134 testBadDecodeBack(S(str)); 2135 } 2136 2137 //Invalid UTF-8 sequence where the first code unit is valid. 2138 testAllDecode(S("\xEF\xBF\xBE"), cast(dchar) 0xFFFE, 3); 2139 testAllDecode(S("\xEF\xBF\xBF"), cast(dchar) 0xFFFF, 3); 2140 2141 //Invalid UTF-8 sequence where the first code unit isn't valid. 
foreach (str; ["\xED\xA0\x80",
                       "\xED\xAD\xBF",
                       "\xED\xAE\x80",
                       "\xED\xAF\xBF",
                       "\xED\xB0\x80",
                       "\xED\xBE\x80",
                       "\xED\xBF\xBF"])
        {
            testBadDecode(S(str), 0);
            testBadDecodeBack(S(str));
        }
    }
    });
}

// UTF-16 decoding, exercised over several range flavours.
@system unittest
{
    import std.exception;
    assertCTFEable!(
    {
    foreach (S; AliasSeq!((wstring s) => s, InputCU!wchar, RandomCU!wchar,
                          (wstring s) => new RefBidirCU!wchar(s),
                          (wstring s) => new RefRandomCU!wchar(s)))
    {
        // Single code units and surrogate pairs at the edges of the
        // supplementary range.
        testAllDecode(S([cast(wchar) 0x1111]), cast(dchar) 0x1111, 1);
        testAllDecode(S([cast(wchar) 0xD800, cast(wchar) 0xDC00]), cast(dchar) 0x10000, 2);
        testAllDecode(S([cast(wchar) 0xDBFF, cast(wchar) 0xDFFF]), cast(dchar) 0x10FFFF, 2);
        testAllDecode(S([cast(wchar) 0xFFFE]), cast(dchar) 0xFFFE, 1);
        testAllDecode(S([cast(wchar) 0xFFFF]), cast(dchar) 0xFFFF, 1);

        // A lone high surrogate, and a high surrogate followed by a
        // non-surrogate, must be rejected.
        testBadDecode(S([ cast(wchar) 0xD801 ]), 0);
        testBadDecode(S([ cast(wchar) 0xD800, cast(wchar) 0x1200 ]), 0);

        testBadDecodeBack(S([ cast(wchar) 0xD801 ]));
        testBadDecodeBack(S([ cast(wchar) 0x0010, cast(wchar) 0xD800 ]));

        {
            auto range = S("ウェブサイト");
            testDecode(range, 0, 'ウ', 1);
            testDecode(range, 1, 'ェ', 2);
            testDecodeFront(range, 'ウ', 1);
            testDecodeFront(range, 'ェ', 1);
            assert(decodeFront(range) == 'ブ');
            assert(decodeFront(range) == 'サ');
        }

        {
            auto range = S("ウェブサイト");
            testDecodeBack(range, 'ト', 1);
            testDecodeBack(range, 'イ', 1);
            testDecodeBack(range, 'サ', 1);
            testDecodeBack(range, 'ブ', 1);
        }
    }

    foreach (S; AliasSeq!((wchar[] s) => s.idup, RandomCU!wchar, (wstring s) => new RefRandomCU!wchar(s)))
    {
        // Surrogate pair, single unit, surrogate pair: five code units that
        // are expected to decode to three code points.
        auto str = S([cast(wchar) 0xD800, cast(wchar) 0xDC00,
                      cast(wchar) 0x1400,
                      cast(wchar) 0xDAA7, cast(wchar) 0xDDDE]);
        testDecode(str, 0, cast(dchar) 0x10000, 2);
        testDecode(str, 2, cast(dchar) 0x1400, 3);
testDecode(str, 3, cast(dchar) 0xB9DDE, 5);
        testDecodeBack(str, cast(dchar) 0xB9DDE, 2);
        testDecodeBack(str, cast(dchar) 0x1400, 1);
        testDecodeBack(str, cast(dchar) 0x10000, 2);
    }
    });
}

// UTF-32 decoding, exercised over several range flavours.
@system unittest
{
    import std.exception;
    assertCTFEable!(
    {
    foreach (S; AliasSeq!((dstring s) => s, RandomCU!dchar, InputCU!dchar,
                          (dstring s) => new RefBidirCU!dchar(s),
                          (dstring s) => new RefRandomCU!dchar(s)))
    {
        // Every code point is exactly one code unit.
        testAllDecode(S([cast(dchar) 0x1111]), cast(dchar) 0x1111, 1);
        testAllDecode(S([cast(dchar) 0x10000]), cast(dchar) 0x10000, 1);
        testAllDecode(S([cast(dchar) 0x10FFFF]), cast(dchar) 0x10FFFF, 1);
        testAllDecode(S([cast(dchar) 0xFFFE]), cast(dchar) 0xFFFE, 1);
        testAllDecode(S([cast(dchar) 0xFFFF]), cast(dchar) 0xFFFF, 1);

        // Surrogates and values above U+10FFFF must be rejected.
        testBadDecode(S([cast(dchar) 0xD800]), 0);
        testBadDecode(S([cast(dchar) 0xDFFE]), 0);
        testBadDecode(S([cast(dchar) 0x110000]), 0);

        testBadDecodeBack(S([cast(dchar) 0xD800]));
        testBadDecodeBack(S([cast(dchar) 0xDFFE]));
        testBadDecodeBack(S([cast(dchar) 0x110000]));

        {
            auto range = S("ウェブサイト");
            testDecode(range, 0, 'ウ', 1);
            testDecode(range, 1, 'ェ', 2);
            testDecodeFront(range, 'ウ', 1);
            testDecodeFront(range, 'ェ', 1);
            assert(decodeFront(range) == 'ブ');
            assert(decodeFront(range) == 'サ');
        }

        {
            auto range = S("ウェブサイト");
            testDecodeBack(range, 'ト', 1);
            testDecodeBack(range, 'イ', 1);
            testDecodeBack(range, 'サ', 1);
            testDecodeBack(range, 'ブ', 1);
        }
    }

    foreach (S; AliasSeq!((dchar[] s) => s.idup, RandomCU!dchar, (dstring s) => new RefRandomCU!dchar(s)))
    {
        auto str = S([cast(dchar) 0x10000, cast(dchar) 0x1400, cast(dchar) 0xB9DDE]);
        testDecode(str, 0, 0x10000, 1);
        testDecode(str, 1, 0x1400, 2);
        testDecode(str, 2, 0xB9DDE, 3);
        testDecodeBack(str, cast(dchar) 0xB9DDE, 1);
        testDecodeBack(str,
cast(dchar) 0x1400, 1);
        testDecodeBack(str, cast(dchar) 0x10000, 1);
    }
    });
}

@safe unittest
{
    import std.exception;
    import std.traits : FunctionAttribute, functionAttributes, isSafe;
    assertCTFEable!(
    {
    foreach (S; AliasSeq!( char[], const( char)[],  string,
                          wchar[], const(wchar)[], wstring,
                          dchar[], const(dchar)[], dstring))
    {
        static assert(isSafe!({ S str; size_t i = 0; decode(str, i); }));
        static assert(isSafe!({ S str; size_t i = 0; decodeFront(str, i); }));
        static assert(isSafe!({ S str; decodeFront(str); }));
        static assert((functionAttributes!({ S str; size_t i = 0; decode(str, i); }) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!({
            S str; size_t i = 0; decodeFront(str, i);
        }) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!({ S str; decodeFront(str); }) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!({
            S str; size_t i = 0; decodeBack(str, i);
        }) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!({ S str; decodeBack(str); }) & FunctionAttribute.pure_) != 0);
    }
    });
}

@safe unittest
{
    import std.exception;
    char[4] val;
    val[0] = 0b1111_0111;
    val[1] = 0b1011_1111;
    val[2] = 0b1011_1111;
    val[3] = 0b1011_1111;
    size_t i = 0;
    assertThrown!UTFException((){ dchar ch = decode(val[], i); }());
}
/* =================== Encode ======================= */

/+
    Shared error path of the `encode` overloads.

    With `Yes.useReplacementDchar` the bad code point is swapped for
    `replacementDchar`; otherwise a `UTFException` carrying `msg` and the
    offending value `c` is thrown.
 +/
private dchar _utfException(UseReplacementDchar useReplacementDchar)(string msg, dchar c)
{
    static if (!useReplacementDchar)
        throw new UTFException(msg).setSequence(c);
    else
        return replacementDchar;
}

/++
    Encodes `c` into the static array, `buf`, and returns the actual
    length of the encoded character (a number between `1` and `4` for
    `char[4]` buffers and a number between
`1` and `2` for
    `wchar[2]` buffers).

    Throws:
        `UTFException` if `c` is not a valid UTF code point.
  +/
size_t encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    out char[4] buf, dchar c) @safe pure
{
    // One code unit: U+0000 .. U+007F.
    if (c <= 0x7F)
    {
        assert(isValidDchar(c));
        buf[0] = cast(char) c;
        return 1;
    }

    // Two code units: U+0080 .. U+07FF.
    if (c <= 0x7FF)
    {
        assert(isValidDchar(c));
        buf[0] = cast(char)(0xC0 | (c >> 6));
        buf[1] = cast(char)(0x80 | (c & 0x3F));
        return 2;
    }

    if (c > 0xFFFF)
    {
        // Four code units: U+10000 .. U+10FFFF.
        if (c <= 0x10FFFF)
        {
            assert(isValidDchar(c));
            buf[0] = cast(char)(0xF0 | (c >> 18));
            buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
            buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
            buf[3] = cast(char)(0x80 | (c & 0x3F));
            return 4;
        }

        // Beyond the Unicode range: throw, or fall through to the
        // three-unit form with the replacement value.
        assert(!isValidDchar(c));
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-8", c);
    }
    else
    {
        // Surrogates cannot be encoded: throw or substitute.
        if (0xD800 <= c && c <= 0xDFFF)
            c = _utfException!useReplacementDchar("Encoding a surrogate code point in UTF-8", c);

        assert(isValidDchar(c));
    }

    // Three code units: U+0800 .. U+FFFF (or the substituted value).
    buf[0] = cast(char)(0xE0 | (c >> 12));
    buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
    buf[2] = cast(char)(0x80 | (c & 0x3F));
    return 3;
}

///
@safe unittest
{
    import std.exception : assertThrown;
    import std.typecons : Yes;

    char[4] buf;

    assert(encode(buf, '\u0000') == 1 && buf[0 .. 1] == "\u0000");
    assert(encode(buf, '\u007F') == 1 && buf[0 .. 1] == "\u007F");
    assert(encode(buf, '\u0080') == 2 && buf[0 .. 2] == "\u0080");
    assert(encode(buf, '\uE000') == 3 && buf[0 .. 3] == "\uE000");
    assert(encode(buf, 0xFFFE) == 3 && buf[0 ..
3] == "\xEF\xBF\xBE");
    assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));

    // With `Yes.useReplacementDchar` the invalid value is replaced instead
    // of throwing.
    encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
    auto slice = buf[];
    assert(slice.decodeFront == replacementDchar);
}

///
@safe unittest
{
    import std.exception : assertThrown;
    import std.typecons : Yes;

    wchar[2] buf;

    assert(encode(buf, '\u0000') == 1 && buf[0 .. 1] == "\u0000");
    assert(encode(buf, '\uD7FF') == 1 && buf[0 .. 1] == "\uD7FF");
    assert(encode(buf, '\uE000') == 1 && buf[0 .. 1] == "\uE000");
    assert(encode(buf, '\U00010000') == 2 && buf[0 .. 2] == "\U00010000");
    assert(encode(buf, '\U0010FFFF') == 2 && buf[0 .. 2] == "\U0010FFFF");
    assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));

    // With `Yes.useReplacementDchar` the invalid value is replaced instead
    // of throwing.
    encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
    auto slice = buf[];
    assert(slice.decodeFront == replacementDchar);
}

///
@safe unittest
{
    import std.exception : assertThrown;
    import std.typecons : Yes;

    dchar[1] buf;

    assert(encode(buf, '\u0000') == 1 && buf[0] == '\u0000');
    assert(encode(buf, '\uD7FF') == 1 && buf[0] == '\uD7FF');
    assert(encode(buf, '\uE000') == 1 && buf[0] == '\uE000');
    assert(encode(buf, '\U0010FFFF') == 1 && buf[0] == '\U0010FFFF');
    assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));

    // With `Yes.useReplacementDchar` the invalid value is replaced instead
    // of throwing.
    encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
    assert(buf[0] == replacementDchar);
}

// Boundary values of each UTF-8 length class, also run through CTFE.
@safe unittest
{
    import std.exception;
    assertCTFEable!(
    {
    char[4] buf;

    assert(encode(buf, '\u0000') == 1 && buf[0 .. 1] == "\u0000");
    assert(encode(buf, '\u007F') == 1 && buf[0 .. 1] == "\u007F");
    assert(encode(buf, '\u0080') == 2 && buf[0 .. 2] == "\u0080");
    assert(encode(buf, '\u07FF') == 2 && buf[0 .. 2] == "\u07FF");
    assert(encode(buf, '\u0800') == 3 && buf[0 ..
3] == "\u0800");
    assert(encode(buf, '\uD7FF') == 3 && buf[0 .. 3] == "\uD7FF");
    assert(encode(buf, '\uE000') == 3 && buf[0 .. 3] == "\uE000");
    assert(encode(buf, 0xFFFE) == 3 && buf[0 .. 3] == "\xEF\xBF\xBE");
    assert(encode(buf, 0xFFFF) == 3 && buf[0 .. 3] == "\xEF\xBF\xBF");
    assert(encode(buf, '\U00010000') == 4 && buf[0 .. 4] == "\U00010000");
    assert(encode(buf, '\U0010FFFF') == 4 && buf[0 .. 4] == "\U0010FFFF");

    assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));

    assert(encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000) == buf.stride);
    enum replacementDcharString = "\uFFFD";
    assert(buf[0 .. replacementDcharString.length] == replacementDcharString);
    });
}


/// Ditto
size_t encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    out wchar[2] buf, dchar c) @safe pure
{
    if (c > 0xFFFF)
    {
        // Supplementary planes: emit a high/low surrogate pair.
        if (c <= 0x10FFFF)
        {
            assert(isValidDchar(c));
            immutable uint offset = c - 0x10000;
            buf[0] = cast(wchar)(((offset >> 10) & 0x3FF) + 0xD800);
            buf[1] = cast(wchar)((offset & 0x3FF) + 0xDC00);
            return 2;
        }

        // Beyond the Unicode range: throw, or continue with the
        // replacement value as a single code unit.
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-16", c);
    }
    else
    {
        // A surrogate on its own is not a code point: throw or substitute.
        if (0xD800 <= c && c <= 0xDFFF)
            c = _utfException!useReplacementDchar("Encoding an isolated surrogate code point in UTF-16", c);

        assert(isValidDchar(c));
    }

    // Basic Multilingual Plane: one code unit.
    buf[0] = cast(wchar) c;
    return 1;
}

@safe unittest
{
    import std.exception;
    assertCTFEable!(
    {
    wchar[2] buf;

    assert(encode(buf, '\u0000') == 1 && buf[0 .. 1] == "\u0000");
    assert(encode(buf, '\uD7FF') == 1 && buf[0 ..
1] == "\uD7FF");
    assert(encode(buf, '\uE000') == 1 && buf[0 .. 1] == "\uE000");
    assert(encode(buf, 0xFFFE) == 1 && buf[0] == 0xFFFE);
    assert(encode(buf, 0xFFFF) == 1 && buf[0] == 0xFFFF);
    assert(encode(buf, '\U00010000') == 2 && buf[0 .. 2] == "\U00010000");
    assert(encode(buf, '\U0010FFFF') == 2 && buf[0 .. 2] == "\U0010FFFF");

    assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));

    assert(encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000) == buf.stride);
    assert(buf.front == replacementDchar);
    });
}


/// Ditto
size_t encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    out dchar[1] buf, dchar c) @safe pure
{
    immutable isSurrogate = 0xD800 <= c && c <= 0xDFFF;
    immutable isOutOfRange = 0x10FFFF < c;

    // UTF-32 stores the code point as is, so only validation is needed.
    if (!isSurrogate && !isOutOfRange)
        assert(isValidDchar(c));
    else
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-32", c);

    buf[0] = c;
    return 1;
}

@safe unittest
{
    import std.exception;
    assertCTFEable!(
    {
    dchar[1] buf;

    encode(buf, '\u0000'); assert(buf[0] == '\u0000');
    encode(buf, '\uD7FF'); assert(buf[0] == '\uD7FF');
    encode(buf, '\uE000'); assert(buf[0] == '\uE000');
    encode(buf, 0xFFFE ); assert(buf[0] == 0xFFFE);
    encode(buf, 0xFFFF ); assert(buf[0] == 0xFFFF);
    encode(buf, '\U0010FFFF'); assert(buf[0] == '\U0010FFFF');

    assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
    assertThrown!UTFException(encode(buf, cast(dchar)
0x110000));

    assert(encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000) == buf.stride);
    assert(buf.front == replacementDchar);
    });
}


/++
    Encodes `c` in `str`'s encoding and appends it to `str`.

    Throws:
        `UTFException` if `c` is not a valid UTF code point.
  +/
void encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    ref scope char[] str, dchar c) @safe pure
{
    // ASCII: append the single code unit directly, no scratch buffer.
    if (c <= 0x7F)
    {
        assert(isValidDchar(c));
        str ~= cast(char) c;
        return;
    }

    char[4] units;
    uint count;

    if (c <= 0x7FF)
    {
        // Two code units.
        assert(isValidDchar(c));
        units[0] = cast(char)(0xC0 | (c >> 6));
        units[1] = cast(char)(0x80 | (c & 0x3F));
        count = 2;
    }
    else if (0xFFFF < c && c <= 0x10FFFF)
    {
        // Four code units.
        assert(isValidDchar(c));
        units[0] = cast(char)(0xF0 | (c >> 18));
        units[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        units[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        units[3] = cast(char)(0x80 | (c & 0x3F));
        count = 4;
    }
    else
    {
        if (0x10FFFF < c)
        {
            // Beyond the Unicode range: throw or substitute.
            assert(!isValidDchar(c));
            c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-8", c);
        }
        else
        {
            // Surrogates cannot be encoded: throw or substitute.
            if (0xD800 <= c && c <= 0xDFFF)
                c = _utfException!useReplacementDchar("Encoding a surrogate code point in UTF-8", c);

            assert(isValidDchar(c));
        }

        // Three code units (also used for the substituted value).
        units[0] = cast(char)(0xE0 | (c >> 12));
        units[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        units[2] = cast(char)(0x80 | (c & 0x3F));
        count = 3;
    }

    str ~= units[0 .. count];
}

///
@safe unittest
{
    char[] s = "abcd".dup;
    dchar d1 = 'a';
    dchar d2 = 'ø';

    encode(s, d1);
    assert(s.length == 5);
    assert(s == "abcda");
    encode(s, d2);
    assert(s.length == 7);
    assert(s == "abcdaø");
}

@safe unittest
{
    import std.exception;

    assertCTFEable!(
    {
    char[] s = "abcd".dup;
    encode(s, cast(dchar)'a');
    assert(s.length == 5);
    assert(s == "abcda");

    encode(s, cast(dchar)'\u00A9');
    assert(s.length == 7);
    assert(s == "abcda\xC2\xA9");
    //assert(s == "abcda\u00A9");   // BUG: fix compiler

    encode(s, cast(dchar)'\u2260');
    assert(s.length == 10);
    assert(s == "abcda\xC2\xA9\xE2\x89\xA0");
    });
}

@safe unittest
{
    import std.exception;
    assertCTFEable!(
    {
    char[] buf;

    encode(buf, '\u0000'); assert(buf[0 .. $] == "\u0000");
    encode(buf, '\u007F'); assert(buf[1 .. $] == "\u007F");
    encode(buf, '\u0080'); assert(buf[2 .. $] == "\u0080");
    encode(buf, '\u07FF'); assert(buf[4 .. $] == "\u07FF");
    encode(buf, '\u0800'); assert(buf[6 .. $] == "\u0800");
    encode(buf, '\uD7FF'); assert(buf[9 .. $] == "\uD7FF");
    encode(buf, '\uE000'); assert(buf[12 .. $] == "\uE000");
    encode(buf, 0xFFFE); assert(buf[15 .. $] == "\xEF\xBF\xBE");
    encode(buf, 0xFFFF); assert(buf[18 .. $] == "\xEF\xBF\xBF");
    encode(buf, '\U00010000'); assert(buf[21 .. $] == "\U00010000");
    encode(buf, '\U0010FFFF'); assert(buf[25 ..
$] == "\U0010FFFF");

    assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));

    enum replacementDcharString = "\uFFFD";
    enum rdcslen = replacementDcharString.length;
    assert(buf[$ - rdcslen .. $] != replacementDcharString);
    encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
    assert(buf[$ - rdcslen .. $] == replacementDcharString);
    });
}

/// ditto
void encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    ref scope wchar[] str, dchar c) @safe pure
{
    // Supplementary planes: append a high/low surrogate pair.
    if (0xFFFF < c && c <= 0x10FFFF)
    {
        wchar[2] pair;

        assert(isValidDchar(c));
        immutable uint offset = c - 0x10000;
        pair[0] = cast(wchar)(((offset >> 10) & 0x3FF) + 0xD800);
        pair[1] = cast(wchar)((offset & 0x3FF) + 0xDC00);
        str ~= pair;
        return;
    }

    if (0x10FFFF < c)
    {
        // Beyond the Unicode range: throw or substitute.
        assert(!isValidDchar(c));
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-16", c);
    }
    else
    {
        // A surrogate on its own is not a code point: throw or substitute.
        if (0xD800 <= c && c <= 0xDFFF)
            c = _utfException!useReplacementDchar("Encoding an isolated surrogate code point in UTF-16", c);

        assert(isValidDchar(c));
    }

    // Single code unit (also used for the substituted value).
    str ~= cast(wchar) c;
}

@safe unittest
{
    import std.exception;
    assertCTFEable!(
    {
    wchar[] buf;

    encode(buf, '\u0000'); assert(buf[0] == '\u0000');
    encode(buf, '\uD7FF'); assert(buf[1] == '\uD7FF');
    encode(buf, '\uE000'); assert(buf[2] == '\uE000');
    encode(buf, 0xFFFE); assert(buf[3] == 0xFFFE);
    encode(buf, 0xFFFF); assert(buf[4] == 0xFFFF);
    encode(buf, '\U00010000'); assert(buf[5 .. $] == "\U00010000");
    encode(buf, '\U0010FFFF'); assert(buf[7 ..
$] == "\U0010FFFF");

    assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));

    assert(buf.back != replacementDchar);
    encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
    assert(buf.back == replacementDchar);
    });
}

/// ditto
void encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    ref scope dchar[] str, dchar c) @safe pure
{
    immutable isSurrogate = 0xD800 <= c && c <= 0xDFFF;
    immutable isOutOfRange = 0x10FFFF < c;

    // UTF-32 stores the code point as is, so only validation is needed.
    if (!isSurrogate && !isOutOfRange)
        assert(isValidDchar(c));
    else
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-32", c);

    str ~= c;
}

@safe unittest
{
    import std.exception;
    assertCTFEable!(
    {
    dchar[] buf;

    encode(buf, '\u0000'); assert(buf[0] == '\u0000');
    encode(buf, '\uD7FF'); assert(buf[1] == '\uD7FF');
    encode(buf, '\uE000'); assert(buf[2] == '\uE000');
    encode(buf, 0xFFFE ); assert(buf[3] == 0xFFFE);
    encode(buf, 0xFFFF ); assert(buf[4] == 0xFFFF);
    encode(buf, '\U0010FFFF'); assert(buf[5] == '\U0010FFFF');

    assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));

    assert(buf.back != replacementDchar);
    encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
    assert(buf.back == replacementDchar);
    });
}


/++
    Returns the number of code units that are required to encode the code point
    `c` when `C` is the character type used to
encode it.
  +/
ubyte codeLength(C)(dchar c) @safe pure nothrow @nogc
if (isSomeChar!C)
{
    static if (C.sizeof == 4)
    {
        // UTF-32: always one code unit.
        return 1;
    }
    else static if (C.sizeof == 2)
    {
        // UTF-16: anything above U+FFFF needs a surrogate pair.
        return c > 0xFFFF ? 2 : 1;
    }
    else
    {
        static assert(C.sizeof == 1);

        // UTF-8: length grows with the magnitude of the code point.
        if (c <= 0x7F)
            return 1;
        if (c <= 0x7FF)
            return 2;
        if (c <= 0xFFFF)
            return 3;
        if (c <= 0x10FFFF)
            return 4;

        // Values above U+10FFFF are a caller error.
        assert(false);
    }
}

///
@safe pure nothrow @nogc unittest
{
    assert(codeLength!char('a') == 1);
    assert(codeLength!wchar('a') == 1);
    assert(codeLength!dchar('a') == 1);

    assert(codeLength!char('\U0010FFFF') == 4);
    assert(codeLength!wchar('\U0010FFFF') == 2);
    assert(codeLength!dchar('\U0010FFFF') == 1);
}


/++
    Returns the number of code units that are required to encode `input`
    in a string whose character type is `C`. This is particularly useful
    when slicing one string with the length of another and the two string
    types use different character types.
2813 2814 Params: 2815 C = the character type to get the encoding length for 2816 input = the $(REF_ALTTEXT input range, isInputRange, std,range,primitives) 2817 to calculate the encoding length from 2818 Returns: 2819 The number of code units in `input` when encoded to `C` 2820 +/ 2821 size_t codeLength(C, InputRange)(InputRange input) 2822 if (isSomeFiniteCharInputRange!InputRange) 2823 { 2824 alias EncType = Unqual!(ElementEncodingType!InputRange); 2825 static if (isSomeString!InputRange && is(EncType == C) && is(typeof(input.length))) 2826 return input.length; 2827 else 2828 { 2829 size_t total = 0; 2830 2831 foreach (c; input.byDchar) 2832 total += codeLength!C(c); 2833 2834 return total; 2835 } 2836 } 2837 2838 /// 2839 @safe unittest 2840 { 2841 assert(codeLength!char("hello world") == 2842 "hello world".length); 2843 assert(codeLength!wchar("hello world") == 2844 "hello world"w.length); 2845 assert(codeLength!dchar("hello world") == 2846 "hello world"d.length); 2847 2848 assert(codeLength!char(`プログラミング`) == 2849 `プログラミング`.length); 2850 assert(codeLength!wchar(`プログラミング`) == 2851 `プログラミング`w.length); 2852 assert(codeLength!dchar(`プログラミング`) == 2853 `プログラミング`d.length); 2854 2855 string haystack = `Être sans la verité, ça, ce ne serait pas bien.`; 2856 wstring needle = `Être sans la verité`; 2857 assert(haystack[codeLength!char(needle) .. 
$] ==
           `, ça, ce ne serait pas bien.`);
}

@safe unittest
{
    import std.algorithm.iteration : filter;
    import std.conv : to;
    import std.exception;

    assertCTFEable!(
    {
    foreach (S; AliasSeq!( char[], const  char[],  string,
                          wchar[], const wchar[], wstring,
                          dchar[], const dchar[], dstring))
    {
        foreach (C; AliasSeq!(char, wchar, dchar))
        {
            assert(codeLength!C(to!S("Walter Bright")) == to!(C[])("Walter Bright").length);
            assert(codeLength!C(to!S(`言語`)) == to!(C[])(`言語`).length);
            assert(codeLength!C(to!S(`ウェブサイト@La_Verité.com`)) ==
                   to!(C[])(`ウェブサイト@La_Verité.com`).length);
            assert(codeLength!C(to!S(`ウェブサイト@La_Verité.com`).filter!(x => true)()) ==
                   to!(C[])(`ウェブサイト@La_Verité.com`).length);
        }
    }
    });
}

/+
Internal helper function:

Returns true if it is safe to search for the code point `c` inside
code units of type `C`, without decoding.

This is a runtime check that is used as an optimization in various
functions, particularly in `std.string`.
  +/
package bool canSearchInCodeUnits(C)(dchar c)
if (isSomeChar!C)
{
    static if (C.sizeof == 4)
    {
        // UTF-32: every code point is a single code unit.
        return true;
    }
    else static if (C.sizeof == 2)
    {
        // UTF-16: a single code unit, and not a surrogate.
        return c <= 0xFFFF && !(0xD800 <= c && c <= 0xDFFF);
    }
    else static if (C.sizeof == 1)
    {
        // UTF-8: ASCII only.
        return c < 0x80;
    }
    else
        static assert(0);
}
@safe unittest
{
    assert( canSearchInCodeUnits! char('a'));
    assert( canSearchInCodeUnits!wchar('a'));
    assert( canSearchInCodeUnits!dchar('a'));
    assert(!canSearchInCodeUnits! char('ö')); //Important test: ö <= 0xFF
    assert(!canSearchInCodeUnits! char(cast(char)'ö')); //Important test: ö <= 0xFF
    assert( canSearchInCodeUnits!wchar('ö'));
    assert( canSearchInCodeUnits!dchar('ö'));
    assert(!canSearchInCodeUnits!
char('日'));
    assert( canSearchInCodeUnits!wchar('日'));
    assert( canSearchInCodeUnits!dchar('日'));
    assert(!canSearchInCodeUnits!wchar(cast(wchar) 0xDA00));
    assert( canSearchInCodeUnits!dchar(cast(dchar) 0xDA00));
    assert(!canSearchInCodeUnits! char('\U00010001'));
    assert(!canSearchInCodeUnits!wchar('\U00010001'));
    assert( canSearchInCodeUnits!dchar('\U00010001'));
}

/* =================== Validation ======================= */

/++
    Checks whether `str` is well-formed Unicode by decoding it from start
    to end.

    Throws:
        `UTFException` if `str` is not well-formed.
  +/
void validate(S)(in S str) @safe pure
if (isSomeString!S)
{
    // `decode` advances `pos` past each sequence it reads and throws on the
    // first malformed one, so the loop needs no body of its own.
    size_t pos = 0;
    while (pos < str.length)
        decode(str, pos);
}

///
@safe unittest
{
    import std.exception : assertThrown;
    char[] a = [167, 133, 175];
    assertThrown!UTFException(validate(a));
}

// https://issues.dlang.org/show_bug.cgi?id=12923
@safe unittest
{
    import std.exception;
    assertThrown((){
        char[3]a=[167, 133, 175];
        validate(a[]);
    }());
}

/**
 * Encodes the elements of `s` to UTF-8 and returns a newly allocated
 * string of the elements.
 *
 * Params:
 * s = the string to encode
 * Returns:
 * A UTF-8 string
 * See_Also:
 * For a lazy, non-allocating version of these functions, see $(LREF byUTF).
*/
string toUTF8(S)(S s)
if (isSomeFiniteCharInputRange!S)
{
    // All of the work is done by the shared conversion helper.
    return toUTFImpl!string(s);
}

///
@safe pure unittest
{
    import std.algorithm.comparison : equal;

    // The ø is represented by two UTF-8 code units
    assert("Hellø"w.toUTF8.equal(['H', 'e', 'l', 'l', 0xC3, 0xB8]));

    // 𐐷 is four code units in UTF-8
    assert("𐐷"d.toUTF8.equal([0xF0, 0x90, 0x90, 0xB7]));
}

// Same conversions, fed through a reference-type input range.
@system pure unittest
{
    import std.algorithm.comparison : equal;
    import std.internal.test.dummyrange : ReferenceInputRange;

    alias RT = ReferenceInputRange!(ElementType!(string));
    auto r1 = new RT("Hellø");
    auto r2 = new RT("𐐷");

    assert(r1.toUTF8.equal(['H', 'e', 'l', 'l', 0xC3, 0xB8]));
    assert(r2.toUTF8.equal([0xF0, 0x90, 0x90, 0xB7]));
}

/**
 * Encodes the elements of `s` to UTF-16 and returns a newly GC allocated
 * `wstring` of the elements.
 *
 * Params:
 * s = the range to encode
 * Returns:
 * A UTF-16 string
 * See_Also:
 * For a lazy, non-allocating version of these functions, see $(LREF byUTF).
*/
wstring toUTF16(S)(S s)
if (isSomeFiniteCharInputRange!S)
{
    // All of the work is done by the shared conversion helper.
    return toUTFImpl!wstring(s);
}

///
@safe pure unittest
{
    import std.algorithm.comparison : equal;

    // these code points are two code units in UTF-16 and one in UTF-32
    assert("𤭢"d.length == 1);
    assert("𐐷"d.length == 1);

    assert("𤭢"d.toUTF16.equal([0xD852, 0xDF62]));
    assert("𐐷"d.toUTF16.equal([0xD801, 0xDC37]));
}

// Same conversions, fed through a reference-type input range.
@system pure unittest
{
    import std.algorithm.comparison : equal;
    import std.internal.test.dummyrange : ReferenceInputRange;

    alias RT = ReferenceInputRange!(ElementType!(string));
    auto r1 = new RT("𤭢");
    auto r2 = new RT("𐐷");

    assert(r1.toUTF16.equal([0xD852, 0xDF62]));
    assert(r2.toUTF16.equal([0xD801, 0xDC37]));
}


/**
 * Encodes the elements of `s` to UTF-32 and returns a newly GC allocated
 * `dstring` of the elements.
 *
 * Params:
 * s = the range to encode
 * Returns:
 * A UTF-32 string
 * See_Also:
 * For a lazy, non-allocating version of these functions, see $(LREF byUTF).
*/
dstring toUTF32(S)(scope S s)
if (isSomeFiniteCharInputRange!S)
{
    // All of the work is done by the shared conversion helper.
    return toUTFImpl!dstring(s);
}

///
@safe pure unittest
{
    import std.algorithm.comparison : equal;

    // these code points are two code units in UTF-16 and one in UTF-32
    assert("𤭢"w.length == 2);
    assert("𐐷"w.length == 2);

    assert("𤭢"w.toUTF32.equal([0x00024B62]));
    assert("𐐷"w.toUTF32.equal([0x00010437]));
}

/+
    Shared implementation of `toUTF8`, `toUTF16` and `toUTF32`.

    `T` is the string type to produce. Input that already converts
    implicitly to `T` is simply duplicated; anything else is transcoded
    code unit by code unit into an appender.
 +/
private T toUTFImpl(T, S)(scope S s)
{
    static if (!is(S : T))
    {
        import std.array : appender;

        alias TargetChar = Unqual!(ElementEncodingType!T);
        auto sink = appender!T();

        // Reserve up front when the input size is known.
        static if (is(S == C[], C) || hasLength!S)
            sink.reserve(s.length);

        foreach (unit; s.byUTF!TargetChar)
            sink.put(unit);

        return sink.data;
    }
    else
    {
        return s.idup;
    }
}

/* =================== toUTFz ======================= */

/++
    Returns a C-style zero-terminated string equivalent to `str`. `str`
    must not contain embedded `'\0'`'s as any C function will treat the first
    `'\0'` that it sees as the end of the string. If `str.empty` is
    `true`, then a string containing only `'\0'` is returned.

    `toUTFz` accepts any type of string and is templated on the type of
    character pointer that you wish to convert to. It will avoid allocating a
    new string if it can, but there's a decent chance that it will end up having
    to allocate a new string - particularly when dealing with character types
    other than `char`.

    $(RED Warning 1:) If the result of `toUTFz` equals `str.ptr`, then if
    anything alters the character one past the end of `str` (which is the
    `'\0'` character terminating the string), then the string won't be
    zero-terminated anymore.
The most likely scenarios for that are if you
    append to `str` and no reallocation takes place or when `str` is a
    slice of a larger array, and you alter the character in the larger array
    which is one character past the end of `str`. Another case where it could
    occur would be if you had a mutable character array immediately after
    `str` in memory (for example, if they're member variables in a
    user-defined type with one declared right after the other) and that
    character array happened to start with `'\0'`. Such scenarios will never
    occur if you immediately use the zero-terminated string after calling
    `toUTFz` and the C function using it doesn't keep a reference to it.
    Also, they are unlikely to occur even if you save the zero-terminated string
    (the cases above would be among the few examples of where it could happen).
    However, if you save the zero-terminated string and want to be absolutely
    certain that the string stays zero-terminated, then simply append a
    `'\0'` to the string and use its `ptr` property rather than calling
    `toUTFz`.

    $(RED Warning 2:) When passing a character pointer to a C function, and the
    C function keeps it around for any reason, make sure that you keep a
    reference to it in your D code. Otherwise, it may go away during a garbage
    collection cycle and cause a nasty bug when the C code tries to use it.
+/
template toUTFz(P)
if (is(P == C*, C) && isSomeChar!C)
{
    // Public entry point: `P` is the requested character pointer type and
    // `S` the string type; the work is done by whichever `toUTFzImpl`
    // overload has a constraint matching the (P, S) pair.
    P toUTFz(S)(S str) @safe pure
    if (isSomeString!S)
    {
        return toUTFzImpl!(P, S)(str);
    }
}

///
@safe pure unittest
{
    auto p1 = toUTFz!(char*)("hello world");
    auto p2 = toUTFz!(const(char)*)("hello world");
    auto p3 = toUTFz!(immutable(char)*)("hello world");
    auto p4 = toUTFz!(char*)("hello world"d);
    auto p5 = toUTFz!(const(wchar)*)("hello world");
    auto p6 = toUTFz!(immutable(dchar)*)("hello world"w);
}

// Overload 1: the element type of `str` is exactly the immutable version of
// the character type that `P` points to (same encoding, immutable source).
private P toUTFzImpl(P, S)(return scope S str) @safe pure
if (is(immutable typeof(*P.init) == typeof(str[0])))
//immutable(C)[] -> C*, const(C)*, or immutable(C)*
{
    // An empty string yields a pointer to a one-element array holding '\0'.
    if (str.empty)
    {
        typeof(*P.init)[] retval = ['\0'];

        auto trustedPtr() @trusted { return retval.ptr; }
        return trustedPtr();
    }

    alias C = Unqual!(ElementEncodingType!S);

    //If the P is mutable, then we have to make a copy.
    static if (is(Unqual!(typeof(*P.init)) == typeof(*P.init)))
    {
        // Re-dispatch with a const(C)[] view of the same data, which no
        // longer satisfies this overload's constraint.
        return toUTFzImpl!(P, const(C)[])(cast(const(C)[])str);
    }
    else
    {
        // The pointer peek below is skipped during CTFE.
        if (!__ctfe)
        {
            auto trustedPtrAdd(S s) @trusted { return s.ptr + s.length; }
            immutable p = trustedPtrAdd(str);

            // Peek past end of str, if it's 0, no conversion necessary.
            // Note that the compiler will put a 0 past the end of static
            // strings, and the storage allocator will put a 0 past the end
            // of newly allocated char[]'s.
            // Is p dereferenceable? A simple test: if the p points to an
            // address multiple of 4, then conservatively assume the pointer
            // might be pointing to a new block of memory, which might be
            // unreadable. Otherwise, it's definitely pointing to valid
            // memory.
            if ((cast(size_t) p & 3) && *p == '\0')
                return &str[0];
        }

        // No usable terminator found (or running in CTFE): fall back to the
        // const(C)[] overload.
        return toUTFzImpl!(P, const(C)[])(cast(const(C)[])str);
    }
}

// Overload 2: same character type on both sides (ignoring qualifiers), and
// the source elements are mutable or const rather than immutable.
private P toUTFzImpl(P, S)(return scope S str) @safe pure
if (is(typeof(str[0]) C) && is(immutable typeof(*P.init) == immutable C) && !is(C == immutable))
//C[] or const(C)[] -> C*, const(C)*, or immutable(C)*
{
    alias InChar = typeof(str[0]);
    alias OutChar = typeof(*P.init);

    //const(C)[] -> const(C)* or
    //C[] -> C* or const(C)*
    static if (( is(const(Unqual!InChar) == InChar) &&  is(const(Unqual!OutChar) == OutChar)) ||
               (!is(const(Unqual!InChar) == InChar) && !is(immutable(Unqual!OutChar) == OutChar)))
    {
        if (!__ctfe)
        {
            auto trustedPtrAdd(S s) @trusted { return s.ptr + s.length; }
            auto p = trustedPtrAdd(str);

            // Same peek-past-the-end test as in the immutable overload:
            // only dereference when the address is not a multiple of 4.
            if ((cast(size_t) p & 3) && *p == '\0')
                return &str[0];
        }

        // Otherwise append the terminator to the local slice and hand out
        // a pointer to its first element.
        str ~= '\0';
        return &str[0];
    }
    //const(C)[] -> C* or immutable(C)* or
    //C[] -> immutable(C)*
    else
    {
        // The qualifiers are incompatible, so build a fresh array with one
        // extra slot for the terminator and copy the data into it.
        import std.array : uninitializedArray;
        auto copy = uninitializedArray!(Unqual!OutChar[])(str.length + 1);
        copy[0 .. $ - 1] = str[];
        copy[$ - 1] = '\0';

        auto trustedCast(typeof(copy) c) @trusted { return cast(P) c.ptr; }
        return trustedCast(copy);
    }
}

// Overload 3: source and target character types differ, so the string is
// iterated as dchar and re-encoded into a new buffer, followed by '\0'.
private P toUTFzImpl(P, S)(S str) @safe pure
if (!is(immutable typeof(*P.init) == immutable typeof(str[0])))
//C1[], const(C1)[], or immutable(C1)[] -> C2*, const(C2)*, or immutable(C2)*
{
    import std.array : appender;
    auto retval = appender!(typeof(*P.init)[])();

    foreach (dchar c; str)
        retval.put(c);
    retval.put('\0');

    return () @trusted { return cast(P) retval.data.ptr; } ();
}

@safe pure unittest
{
    import core.exception : AssertError;
    import std.algorithm;
    import std.conv : to;
    import std.exception;
    import std.string : format;

    assertCTFEable!(
    {
    foreach (S; AliasSeq!(string, wstring, dstring))
    {
        alias C = Unqual!(ElementEncodingType!S);

        auto s1 = to!S("hello\U00010143\u0100\U00010143");
        // s2 has the same contents as s1 but lives in a buffer whose element
        // right after the slice was set to '\n' before the length was shrunk.
        auto temp = new C[](s1.length + 1);
        temp[0 .. $ - 1] = s1[0 .. $];
        temp[$ - 1] = '\n';
        --temp.length;
        auto trustedAssumeUnique(T)(T t) @trusted { return assumeUnique(t); }
        auto s2 = trustedAssumeUnique(temp);
        assert(s1 == s2);

        // The result must match `s` and be followed by a '\0'.
        void trustedCStringAssert(P, S)(S s) @trusted
        {
            auto p = toUTFz!P(s);
            assert(p[0 .. s.length] == s);
            assert(p[s.length] == '\0');
        }

        foreach (P; AliasSeq!(C*, const(C)*, immutable(C)*))
        {
            trustedCStringAssert!P(s1);
            trustedCStringAssert!P(s2);
        }
    }
    });

    // Converts `s` to `P`, measures the result up to its terminator and
    // compares it with `s` via `cmp`.
    static void test(P, S)(S s, size_t line = __LINE__) @trusted
    {
        // strlen-style scan for the first '\0'
        static size_t zeroLen(C)(const(C)* ptr) @trusted
        {
            size_t len = 0;
            while (*ptr != '\0') { ++ptr; ++len; }
            return len;
        }

        auto p = toUTFz!P(s);
        immutable len = zeroLen(p);
        enforce(cmp(s, p[0 ..
len]) == 0,
                new AssertError(format("Unit test failed: %s %s", P.stringof, S.stringof),
                                __FILE__, line));
    }

    assertCTFEable!(
    {
    foreach (P; AliasSeq!(wchar*, const(wchar)*, immutable(wchar)*,
                          dchar*, const(dchar)*, immutable(dchar)*))
    {
        test!P("hello\U00010143\u0100\U00010143");
    }
    foreach (P; AliasSeq!( char*, const( char)*, immutable( char)*,
                          dchar*, const(dchar)*, immutable(dchar)*))
    {
        test!P("hello\U00010143\u0100\U00010143"w);
    }
    foreach (P; AliasSeq!( char*, const( char)*, immutable( char)*,
                          wchar*, const(wchar)*, immutable(wchar)*))
    {
        test!P("hello\U00010143\u0100\U00010143"d);
    }
    foreach (S; AliasSeq!( char[], const( char)[],
                          wchar[], const(wchar)[],
                          dchar[], const(dchar)[]))
    {
        auto s = to!S("hello\U00010143\u0100\U00010143");

        foreach (P; AliasSeq!( char*, const( char)*, immutable( char)*,
                              wchar*, const(wchar)*, immutable(wchar)*,
                              dchar*, const(dchar)*, immutable(dchar)*))
        {
            test!P(s);
        }
    }
    });
}


/++
    Convenience wrapper around `toUTFz!(const(wchar)*)`.

    Converts `str` to a zero-terminated UTF-16 string and returns a pointer
    to it. The result is suitable for the 'W' functions of the Win32 API
    that take an `LPCWSTR` argument.
  +/
const(wchar)* toUTF16z(C)(const(C)[] str) @safe pure
if (isSomeChar!C)
{
    alias toWideZ = toUTFz!(const(wchar)*);
    return toWideZ(str);
}

///
@system unittest
{
    string text = "Hello, World!";
    const(wchar)* wide = text.toUTF16z;
    // the input is pure ASCII, so the terminator sits at index text.length
    assert(wide[text.length] == '\0');
}

@safe pure unittest
{
    import std.conv : to;
    //toUTFz is already thoroughly tested, so this will just verify that
    //toUTF16z compiles properly for the various string types.
foreach (S; AliasSeq!(string, wstring, dstring))
        assert(toUTF16z(to!S("hello world")) !is null);
}


/* ================================ tests ================================== */

@safe pure unittest
{
    import std.exception;

    assertCTFEable!(
    {
    assert(toUTF16("hello"c) == "hello");
    assert(toUTF32("hello"c) == "hello");
    assert(toUTF8 ("hello"w) == "hello");
    assert(toUTF32("hello"w) == "hello");
    assert(toUTF8 ("hello"d) == "hello");
    assert(toUTF16("hello"d) == "hello");

    assert(toUTF16("hel\u1234o"c) == "hel\u1234o");
    assert(toUTF32("hel\u1234o"c) == "hel\u1234o");
    assert(toUTF8 ("hel\u1234o"w) == "hel\u1234o");
    assert(toUTF32("hel\u1234o"w) == "hel\u1234o");
    assert(toUTF8 ("hel\u1234o"d) == "hel\u1234o");
    assert(toUTF16("hel\u1234o"d) == "hel\u1234o");

    assert(toUTF16("he\U0010AAAAllo"c) == "he\U0010AAAAllo");
    assert(toUTF32("he\U0010AAAAllo"c) == "he\U0010AAAAllo");
    assert(toUTF8 ("he\U0010AAAAllo"w) == "he\U0010AAAAllo");
    assert(toUTF32("he\U0010AAAAllo"w) == "he\U0010AAAAllo");
    assert(toUTF8 ("he\U0010AAAAllo"d) == "he\U0010AAAAllo");
    assert(toUTF16("he\U0010AAAAllo"d) == "he\U0010AAAAllo");
    });
}


/++
    Returns the total number of code points encoded in `str`.

    Supersedes: This function supersedes $(LREF toUCSindex).

    Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

    Params:
        str = the string whose code points are counted

    Returns:
        The number of elements obtained by walking `str` with
        $(LREF byDchar).

    Note:
        This function is `nothrow` and does not validate `str`. Ill-formed
        sequences are not reported with a `UTFException`; they are decoded
        by $(LREF byDchar), which substitutes $(LREF replacementDchar) for
        invalid UTF, and the substituted code points are included in the
        result. Use $(LREF validate) first if well-formedness must be
        enforced.
  +/
size_t count(C)(const(C)[] str) @safe pure nothrow @nogc
if (isSomeChar!C)
{
    return walkLength(str.byDchar);
}

///
@safe pure nothrow @nogc unittest
{
    assert(count("") == 0);
    assert(count("a") == 1);
    assert(count("abc") == 3);
    assert(count("\u20AC100") == 4);
}

@safe pure nothrow @nogc unittest
{
    import std.exception;
    assertCTFEable!(
    {
    assert(count("") == 0);
    assert(count("a") == 1);
    assert(count("abc") == 3);
    assert(count("\u20AC100") == 4);
    });
}


// Ranges of code units for testing.
version (StdUnittest)
{
private:
    // Minimal input range over a private copy of the code units.
    struct InputCU(C)
    {
        import std.conv : to;
        @property bool empty() { return _str.empty; }
        @property C front() { return _str[0]; }
        void popFront() { _str = _str[1 .. $]; }

        this(inout(C)[] str)
        {
            _str = to!(C[])(str);
        }

        C[] _str;
    }

    // Adds back/popBack, save and length to the InputCU primitives.
    struct BidirCU(C)
    {
        import std.conv : to;
        @property bool empty() { return _str.empty; }
        @property C front() { return _str[0]; }
        void popFront() { _str = _str[1 .. $]; }
        @property C back() { return _str[$ - 1]; }
        void popBack() { _str = _str[0 .. $ - 1]; }
        @property auto save() { return BidirCU(_str); }
        @property size_t length() { return _str.length; }

        this(inout(C)[] str)
        {
            _str = to!(C[])(str);
        }

        C[] _str;
    }

    // Adds indexing and slicing to the BidirCU primitives.
    struct RandomCU(C)
    {
        import std.conv : to;
        @property bool empty() { return _str.empty; }
        @property C front() { return _str[0]; }
        void popFront() { _str = _str[1 .. $]; }
        @property C back() { return _str[$ - 1]; }
        void popBack() { _str = _str[0 .. $ - 1]; }
        @property auto save() { return RandomCU(_str); }
        @property size_t length() { return _str.length; }
        C opIndex(size_t i) { return _str[i]; }
        auto opSlice(size_t i, size_t j) { return RandomCU(_str[i .. j]); }

        this(inout(C)[] str)
        {
            _str = to!(C[])(str);
        }

        C[] _str;
    }

    // Class (reference type) counterpart of BidirCU.
    class RefBidirCU(C)
    {
        import std.conv : to;
        @property bool empty() { return _str.empty; }
        @property C front() { return _str[0]; }
        void popFront() { _str = _str[1 .. $]; }
        @property C back() { return _str[$ - 1]; }
        void popBack() { _str = _str[0 .. $ - 1]; }
        @property auto save() { return new RefBidirCU(_str); }
        @property size_t length() { return _str.length; }

        this(inout(C)[] str)
        {
            _str = to!(C[])(str);
        }

        C[] _str;
    }

    // Class (reference type) counterpart of RandomCU.
    class RefRandomCU(C)
    {
        import std.conv : to;
        @property bool empty() { return _str.empty; }
        @property C front() { return _str[0]; }
        void popFront() { _str = _str[1 .. $]; }
        @property C back() { return _str[$ - 1]; }
        void popBack() { _str = _str[0 .. $ - 1]; }
        @property auto save() { return new RefRandomCU(_str); }
        @property size_t length() { return _str.length; }
        C opIndex(size_t i) { return _str[i]; }
        auto opSlice(size_t i, size_t j) { return new RefRandomCU(_str[i .. j]); }

        this(inout(C)[] str)
        {
            _str = to!(C[])(str);
        }

        C[] _str;
    }
}


/**
 * Inserted in place of invalid UTF sequences.
 *
 * References:
 *      $(LINK http://en.wikipedia.org/wiki/Replacement_character#Replacement_character)
 */
enum dchar replacementDchar = '\uFFFD';

/********************************************
 * Iterate a range of char, wchar, or dchars by code unit.
 *
 * The purpose is to bypass the special case decoding that
 * $(REF front, std,range,primitives) does to character arrays.
As a result,
 * using ranges with `byCodeUnit` can be `nothrow` while
 * $(REF front, std,range,primitives) throws when it encounters invalid Unicode
 * sequences.
 *
 * A code unit is a building block of the UTF encodings. Generally, an
 * individual code unit does not represent what's perceived as a full
 * character (a.k.a. a grapheme cluster in Unicode terminology). Many characters
 * are encoded with multiple code units. For example, the UTF-8 code units for
 * `ø` are `0xC3 0xB8`. That means, an individual element of `byCodeUnit`
 * often does not form a character on its own. Attempting to treat it as
 * one while iterating over the resulting range will give nonsensical results.
 *
 * Params:
 *      r = an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
 *      of characters (including strings) or a type that implicitly converts to a string type.
 * Returns:
 *      If `r` is not an auto-decodable string (i.e. a narrow string or a
 *      user-defined type that implicitly converts to a string type), then `r`
 *      is returned.
 *
 *      Otherwise, `r` is converted to its corresponding string type (if it's
 *      not already a string) and wrapped in a random-access range where the
 *      element encoding type of the string (its code unit) is the element type
 *      of the range, and that range is returned. The range has slicing.
 *
 *      If `r` is quirky enough to be a struct or class which is an input range
 *      of characters on its own (i.e. it has the input range API as member
 *      functions), $(I and) it's implicitly convertible to a string type, then
 *      `r` is returned, and no implicit conversion takes place.
 *
 *      If `r` is wrapped in a new range, then that range has a `source`
 *      property for returning the string that's currently contained within that
 *      range.
*
 * See_Also:
 *      Refer to the $(MREF std, uni) docs for a reference on Unicode
 *      terminology.
 *
 *      For a range that iterates by grapheme cluster (written character) see
 *      $(REF byGrapheme, std,uni).
 */
auto byCodeUnit(R)(R r)
if ((isConvertibleToString!R && !isStaticArray!R) ||
    (isInputRange!R && isSomeChar!(ElementEncodingType!R)))
{
    import std.traits : StringTypeOf;

    // This would be cleaner if we had a way to check whether a type
    // was a range without any implicit conversions. Instead, look for
    // the input range primitives as actual members of R.
    enum bool hasRangeMembers = __traits(hasMember, R, "empty") ||
                                __traits(hasMember, R, "front") ||
                                __traits(hasMember, R, "popFront");

    static if (isAutodecodableString!R && !hasRangeMembers)
    {
        // Random-access view over the code units of the underlying string.
        static struct ByCodeUnitImpl
        {
        @safe pure nothrow @nogc:

            StringTypeOf!R source;

            @property bool empty() const { return source.length == 0; }
            @property size_t length() const { return source.length; }
            alias opDollar = length;

            @property auto ref front() inout { return source[0]; }
            @property auto ref back() inout { return source[$ - 1]; }
            auto ref opIndex(size_t index) inout { return source[index]; }

            void popFront() { source = source[1 .. $]; }
            void popBack() { source = source[0 .. $-1]; }

            @property auto save() { return ByCodeUnitImpl(source.save); }
            auto opSlice(size_t lower, size_t upper)
            {
                return ByCodeUnitImpl(source[lower .. upper]);
            }
        }

        static assert(isRandomAccessRange!ByCodeUnitImpl);

        return ByCodeUnitImpl(r);
    }
    else static if (!isInputRange!R ||
                    (is(R : const dchar[]) && !hasRangeMembers))
    {
        // Not a range by itself, or something convertible to a dchar
        // string: hand back the plain string type.
        return cast(StringTypeOf!R) r;
    }
    else
    {
        // byCodeUnit for ranges and dchar[] is a no-op
        return r;
    }
}

///
@safe unittest
{
    import std.range.primitives;
    import std.traits : isAutodecodableString;

    auto r = "Hello, World!".byCodeUnit();
    static assert(hasLength!(typeof(r)));
    static assert(hasSlicing!(typeof(r)));
    static assert(isRandomAccessRange!(typeof(r)));
    static assert(is(ElementType!(typeof(r)) == immutable char));

    // contrast with the range capabilities of standard strings (with or
    // without autodecoding enabled).
    auto s = "Hello, World!";
    static assert(isBidirectionalRange!(typeof(r)));
    static if (isAutodecodableString!(typeof(s)))
    {
        // with autodecoding enabled, strings are non-random-access ranges of
        // dchar.
        static assert(is(ElementType!(typeof(s)) == dchar));
        static assert(!isRandomAccessRange!(typeof(s)));
        static assert(!hasSlicing!(typeof(s)));
        static assert(!hasLength!(typeof(s)));
    }
    else
    {
        // without autodecoding, strings are normal arrays.
static assert(is(ElementType!(typeof(s)) == immutable char));
        static assert(isRandomAccessRange!(typeof(s)));
        static assert(hasSlicing!(typeof(s)));
        static assert(hasLength!(typeof(s)));
    }
}

/// `byCodeUnit` does no Unicode decoding
@safe unittest
{
    string noel1 = "noe\u0308l"; // noël using e + combining diaeresis
    assert(noel1.byCodeUnit[2] != 'ë');
    assert(noel1.byCodeUnit[2] == 'e');

    string noel2 = "no\u00EBl"; // noël using a precomposed ë character
    // Because string is UTF-8, the code unit at index 2 is just
    // the first of a sequence that encodes 'ë'
    assert(noel2.byCodeUnit[2] != 'ë');
}

/// `byCodeUnit` exposes a `source` property when wrapping narrow strings.
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.range : popFrontN;
    import std.traits : isAutodecodableString;
    {
        auto range = byCodeUnit("hello world");
        range.popFrontN(3);
        assert(equal(range.save, "lo world"));
        static if (isAutodecodableString!string) // only enabled with autodecoding
        {
            string str = range.source;
            assert(str == "lo world");
        }
    }
    // source only exists if the range was wrapped
    {
        auto range = byCodeUnit("hello world"d);
        static assert(!__traits(compiles, range.source));
    }
}

// Exercises byCodeUnit on strings, string-like user types, user-defined
// ranges, combinations of both, and enums with a string base type.
@safe pure nothrow @nogc unittest
{
    import std.range;
    // Applying byCodeUnit twice still yields every code unit unchanged.
    {
        enum testStr = "𐁄𐂌𐃯 hello ディラン";
        char[testStr.length] s;
        int i;
        foreach (c; testStr.byCodeUnit().byCodeUnit())
        {
            s[i++] = c;
        }
        assert(s == testStr);
    }
    {
        enum testStr = "𐁄𐂌𐃯 hello ディラン"w;
        wchar[testStr.length] s;
        int i;
        foreach (c; testStr.byCodeUnit().byCodeUnit())
        {
            s[i++] = c;
        }
        assert(s == testStr);
    }
    {
        enum testStr = "𐁄𐂌𐃯 hello ディラン"d;
        dchar[testStr.length] s;
        int i;
        foreach (c; testStr.byCodeUnit().byCodeUnit())
        {
            s[i++] = c;
        }
        assert(s == testStr);
    }
    // length, indexing and slicing
    {
        auto bcu = "hello".byCodeUnit();
        assert(bcu.length == 5);
        assert(bcu[3] == 'l');
        assert(bcu[2 .. 4][1] == 'l');
    }
    // front and opIndex are assignable when the source is mutable
    {
        char[5] orig = "hello";
        auto bcu = orig[].byCodeUnit();
        bcu.front = 'H';
        assert(bcu.front == 'H');
        bcu[1] = 'E';
        assert(bcu[1] == 'E');
    }
    // save yields an independent copy
    {
        auto bcu = "hello".byCodeUnit().byCodeUnit();
        static assert(isForwardRange!(typeof(bcu)));
        static assert(is(typeof(bcu) == struct) == isAutodecodableString!string);
        auto s = bcu.save;
        bcu.popFront();
        assert(s.front == 'h');
    }
    // bidirectional iteration via retro
    {
        auto bcu = "hello".byCodeUnit();
        static assert(hasSlicing!(typeof(bcu)));
        static assert(isBidirectionalRange!(typeof(bcu)));
        static assert(is(typeof(bcu) == struct) == isAutodecodableString!string);
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        auto ret = bcu.retro;
        assert(ret.front == 'o');
        ret.popFront();
        assert(ret.front == 'l');
    }
    {
        auto bcu = "κόσμε"w.byCodeUnit();
        static assert(hasSlicing!(typeof(bcu)));
        static assert(isBidirectionalRange!(typeof(bcu)));
        static assert(is(typeof(bcu) == struct) == isAutodecodableString!wstring);
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        auto ret = bcu.retro;
        assert(ret.front == 'ε');
        ret.popFront();
        assert(ret.front == 'μ');
    }
    // Structs that convert to a string through `alias this`.
    {
        static struct Stringish
        {
            string s;
            alias s this;
        }

        auto orig = Stringish("\U0010fff8 𐁊 foo 𐂓");
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == struct));
        static assert(!is(typeof(bcu) == Stringish) == isAutodecodableString!Stringish);
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == immutable char));
        // 244 == 0xF4, the first UTF-8 code unit of U+10FFF8
        assert(bcu.front == cast(char) 244);
    }
    {
        static struct WStringish
        {
            wstring s;
            alias s this;
        }

        auto orig = WStringish("\U0010fff8 𐁊 foo 𐂓"w);
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == struct));
        static assert(!is(typeof(bcu) == WStringish) == isAutodecodableString!WStringish);
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == immutable wchar));
        // 56319 == 0xDBFF, the high surrogate of U+10FFF8
        assert(bcu.front == cast(wchar) 56319);
    }
    {
        static struct DStringish
        {
            dstring s;
            alias s this;
        }

        auto orig = DStringish("\U0010fff8 𐁊 foo 𐂓"d);
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == dstring));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == immutable dchar));
        // 1114104 == 0x10FFF8
        assert(bcu.front == cast(dchar) 1114104);
    }
    // `alias this` to a member function instead of a field.
    {
        static struct FuncStringish
        {
            string str;
            string s() pure nothrow @nogc { return str; }
            alias s this;
        }

        auto orig = FuncStringish("\U0010fff8 𐁊 foo 𐂓");
        auto bcu = orig.byCodeUnit();
        static if (isAutodecodableString!FuncStringish)
            static assert(is(typeof(bcu) == struct));
        else
            static assert(is(typeof(bcu) == string));
        static assert(!is(typeof(bcu) == FuncStringish));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == immutable char));
        assert(bcu.front == cast(char) 244);
    }
    // User-defined input ranges of characters are returned as-is.
    {
        static struct Range
        {
            string data;
            bool empty() pure nothrow @nogc { return data.empty; }
            char front() pure nothrow @nogc { return data[0]; }
            void popFront() pure nothrow @nogc { data = data[1 .. $]; }
        }

        auto orig = Range("\U0010fff8 𐁊 foo 𐂓");
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == Range));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == char));
        assert(bcu.front == cast(char) 244);
    }
    {
        static struct WRange
        {
            wstring data;
            bool empty() pure nothrow @nogc { return data.empty; }
            wchar front() pure nothrow @nogc { return data[0]; }
            void popFront() pure nothrow @nogc { data = data[1 .. $]; }
        }

        auto orig = WRange("\U0010fff8 𐁊 foo 𐂓"w);
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == WRange));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == wchar));
        assert(bcu.front == 56319);
    }
    {
        static struct DRange
        {
            dstring data;
            bool empty() pure nothrow @nogc { return data.empty; }
            dchar front() pure nothrow @nogc { return data[0]; }
            void popFront() pure nothrow @nogc { data = data[1 .. $]; }
        }

        auto orig = DRange("\U0010fff8 𐁊 foo 𐂓"d);
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == DRange));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == dchar));
        assert(bcu.front == 1114104);
    }
    // Types that are both a range and convertible to a string: the range
    // API wins, so iteration sees `data` ("test.d"), not `s` ("other").
    {
        static struct RangeAndStringish
        {
            bool empty() pure nothrow @nogc { return data.empty; }
            char front() pure nothrow @nogc { return data[0]; }
            void popFront() pure nothrow @nogc { data = data[1 .. $]; }

            string data;
            string s;
            alias s this;
        }

        auto orig = RangeAndStringish("test.d", "other");
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == RangeAndStringish));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == char));
        assert(bcu.front == 't');
    }
    {
        static struct WRangeAndStringish
        {
            bool empty() pure nothrow @nogc { return data.empty; }
            wchar front() pure nothrow @nogc { return data[0]; }
            void popFront() pure nothrow @nogc { data = data[1 .. $]; }

            wstring data;
            wstring s;
            alias s this;
        }

        auto orig = WRangeAndStringish("test.d"w, "other"w);
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == WRangeAndStringish));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == wchar));
        assert(bcu.front == 't');
    }
    {
        static struct DRangeAndStringish
        {
            bool empty() pure nothrow @nogc { return data.empty; }
            dchar front() pure nothrow @nogc { return data[0]; }
            void popFront() pure nothrow @nogc { data = data[1 .. $]; }

            dstring data;
            dstring s;
            alias s this;
        }

        auto orig = DRangeAndStringish("test.d"d, "other"d);
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == DRangeAndStringish));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == dchar));
        assert(bcu.front == 't');
    }
    // Enums with a string base type never come back as the enum type.
    {
        enum Enum : string { a = "test.d" }

        auto orig = Enum.a;
        auto bcu = orig.byCodeUnit();
        static assert(!is(typeof(bcu) == Enum));
        static if (isAutodecodableString!Enum)
            static assert(is(typeof(bcu) == struct));
        else
            static assert(is(typeof(bcu) == string));
        static assert(is(ElementType!(typeof(bcu)) == immutable char));
        assert(bcu.front == 't');
    }
    {
        enum WEnum : wstring { a = "test.d"w }

        auto orig = WEnum.a;
        auto bcu = orig.byCodeUnit();
        static assert(!is(typeof(bcu) == WEnum));
        static if (isAutodecodableString!WEnum)
            static assert(is(typeof(bcu) == struct));
        else
            static assert(is(typeof(bcu) == wstring));
        static assert(is(ElementType!(typeof(bcu)) == immutable wchar));
        assert(bcu.front == 't');
    }
    {
        enum DEnum : dstring { a = "test.d"d }

        auto orig = DEnum.a;
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == dstring));
        static assert(is(ElementType!(typeof(bcu)) == immutable dchar));
        assert(bcu.front == 't');
    }

    // Narrow strings are wrapped only when autodecoding is enabled;
    // dstrings are always passed through.
    static if (autodecodeStrings)
    {
        static assert(!is(typeof(byCodeUnit("hello")) == string));
        static assert(!is(typeof(byCodeUnit("hello"w)) == wstring));
    }
    else
    {
        static assert(is(typeof(byCodeUnit("hello")) == string));
        static assert(is(typeof(byCodeUnit("hello"w)) == wstring));
    }
    static assert(is(typeof(byCodeUnit("hello"d)) == dstring));

    // Static arrays, and enums based on them, are rejected.
    static assert(!__traits(compiles, byCodeUnit((char[5]).init)));
    static assert(!__traits(compiles, byCodeUnit((wchar[5]).init)));
    static assert(!__traits(compiles, byCodeUnit((dchar[5]).init)));

    enum SEnum : char[5] { a = "hello" }
    enum WSEnum : wchar[5] { a = "hello"w }
    enum DSEnum : dchar[5] { a = "hello"d }

    static assert(!__traits(compiles, byCodeUnit(SEnum.a)));
    static assert(!__traits(compiles, byCodeUnit(WSEnum.a)));
    static assert(!__traits(compiles, byCodeUnit(DSEnum.a)));
}

/****************************
 * Iterate an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
 * of characters by char, wchar, or dchar.
 * These aliases simply forward to $(LREF byUTF) with the
 * corresponding C argument.
 *
 * Params:
 *      r = input range of characters, or array of characters
 */
alias byChar = byUTF!char;

/// Ditto
alias byWchar = byUTF!wchar;

/// Ditto
alias byDchar = byUTF!dchar;

@safe pure nothrow @nogc unittest
{
    // applying byChar to a range of char keeps the code units unchanged
    {
        char[5] s;
        int i;
        foreach (c; "hello".byChar.byChar())
        {
            //writefln("[%d] '%c'", i, c);
            s[i++] = c;
        }
        assert(s == "hello");
    }
    {
        // 5 ASCII units + 2 + 3 + 4 units for the three valid code points
        // + 3 units for each of the two U+FFFD replacements
        char[5+2+3+4+3+3] s;
        int i;
        dchar[10] a;
        a[0 ..
8] = "hello\u07FF\uD7FF\U0010FFFF"d;
        a[8] = 0xD800; // invalid
        a[9] = cast(dchar) 0x110000; // invalid
        foreach (c; a[].byChar())
        {
            //writefln("[%d] '%c'", i, c);
            s[i++] = c;
        }
        // both invalid dchars come out as U+FFFD
        assert(s == "hello\u07FF\uD7FF\U0010FFFF\uFFFD\uFFFD");
    }
    {
        auto r = "hello"w.byChar();
        r.popFront();
        r.popFront();
        assert(r.front == 'l');
    }
    {
        auto r = "hello"d.byChar();
        r.popFront();
        r.popFront();
        assert(r.front == 'l');
    }
    {
        auto r = "hello"d.byChar();
        assert(isForwardRange!(typeof(r)));
        auto s = r.save;
        r.popFront();
        assert(s.front == 'h');
    }
}

@safe pure nothrow @nogc unittest
{
    {
        // 5 + 1 + 1 + 2 (surrogate pair) + 1 + 1 UTF-16 code units
        wchar[11] s;
        int i;
        dchar[10] a;
        a[0 .. 8] = "hello\u07FF\uD7FF\U0010FFFF"d;
        a[8] = 0xD800; // invalid
        a[9] = cast(dchar) 0x110000; // invalid
        foreach (c; a[].byWchar())
        {
            //writefln("[%d] '%c' x%x", i, c, c);
            s[i++] = c;
        }
        // NOTE(review): this loop has no active statements; it appears to be
        // left over from debugging output.
        foreach (j, wchar c; "hello\u07FF\uD7FF\U0010FFFF\uFFFD\uFFFD"w)
        {
            //writefln("[%d] '%c' x%x", j, c, c);
        }
        assert(s == "hello\u07FF\uD7FF\U0010FFFF\uFFFD\uFFFD"w);
    }

    {
        auto r = "hello".byWchar();
        r.popFront();
        r.popFront();
        assert(r.front == 'l');
    }
    {
        auto r = "hello"d.byWchar();
        r.popFront();
        r.popFront();
        assert(r.front == 'l');
    }
    {
        auto r = "hello"d.byWchar();
        assert(isForwardRange!(typeof(r)));
        auto s = r.save;
        r.popFront();
        assert(s.front == 'h');
    }
}

@safe pure nothrow @nogc unittest
{
    // char -> dchar
    {
        dchar[9] s;
        int i;
        string a = "hello\u07FF\uD7FF\U00010000\U0010FFFF"; // 1,2,3,4 byte sequences
        foreach (c; a.byDchar())
        {
            s[i++] = c;
        }
        assert(s == "hello\u07FF\uD7FF\U00010000\U0010FFFF"d);
    }
    {
        // every invalid string starts with something that decodes to U+FFFD,
        // and reading front repeatedly gives the same value
        foreach (s; invalidUTFstrings!char())
        {
            auto r = s.byDchar();
            assert(!r.empty);
            assert(r.front == r.front);
            dchar c = r.front;
            assert(c == replacementDchar);
        }
    }
    {
        auto r = "hello".byDchar();
        r.popFront();
        r.popFront();
        assert(r.front == 'l');
    }

    // wchar -> dchar
    {
        dchar[8] s;
        int i;
        wstring a = "hello\u07FF\uD7FF\U0010FFFF"w;
        foreach (c; a.byDchar())
        {
            //writefln("[%d] '%c' x%x", i, c, c);
            s[i++] = c;
        }
        assert(s == "hello\u07FF\uD7FF\U0010FFFF"d);
    }
    {
        foreach (s; invalidUTFstrings!wchar())
        {
            auto r = s.byDchar();
            assert(!r.empty);
            assert(r.front == r.front);
            dchar c = r.front;
            assert(c == replacementDchar);
        }
    }
    {
        wchar[2] ws;
        ws[0] = 0xD800;
        ws[1] = 0xDD00; // correct surrogate pair
        auto r = ws[].byDchar();
        assert(!r.empty);
        assert(r.front == r.front);
        dchar c = r.front;
        assert(c == '\U00010100');
    }
    {
        auto r = "hello"w.byDchar();
        r.popFront();
        r.popFront();
        assert(r.front == 'l');
    }

    // dchar -> dchar
    {
        dchar[5] s;
        int i;
        dstring a = "hello"d;
        foreach (c; a.byDchar.byDchar())
        {
            //writefln("[%d] '%c' x%x", i, c, c);
            s[i++] = c;
        }
        assert(s == "hello"d);
    }
    // save yields an independent copy
    {
        auto r = "hello".byDchar();
        assert(isForwardRange!(typeof(r)));
        auto s = r.save;
        r.popFront();
        assert(s.front == 'h');
    }
    {
        auto r = "hello"w.byDchar();
        assert(isForwardRange!(typeof(r)));
        auto s = r.save;
        r.popFront();
        assert(s.front == 'h');
    }
}

// test pure, @safe, nothrow, @nogc correctness of byChar/byWchar/byDchar,
// which needs to support ranges with and without those attributes

pure @safe nothrow @nogc unittest
{
    dchar[5] s = "hello"d;
    foreach (c; s[].byChar())  { }
    foreach (c; s[].byWchar()) { }
    foreach (c; s[].byDchar()) { }
}

version (StdUnittest)
private int
impureVariable;

// byChar/byWchar/byDchar must still compile for a range whose primitives are
// impure, throwing and @system; this only checks attribute inference.
@system unittest
{
    static struct ImpureThrowingSystemRange(Char)
    {
        @property bool empty() const { return true; }
        @property Char front() const { return Char.init; }
        void popFront()
        {
            impureVariable++;
            throw new Exception("only for testing nothrow");
        }
    }

    foreach (Char; AliasSeq!(char, wchar, dchar))
    {
        ImpureThrowingSystemRange!Char range;
        foreach (c; range.byChar()) { }
        foreach (c; range.byWchar()) { }
        foreach (c; range.byDchar()) { }
    }
}

/****************************
 * Iterate an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
 * of characters by char type `C` by encoding the elements of the range.
 *
 * UTF sequences that cannot be converted to the specified encoding are either
 * replaced by U+FFFD per "5.22 Best Practice for U+FFFD Substitution"
 * of the Unicode Standard 6.2 or result in a thrown UTFException.
 * Hence byUTF is not symmetric.
 * This algorithm is lazy, and does not allocate memory
 * (see the GC section below for the throwing variant).
 * `@nogc`, `pure`-ity, `nothrow`, and `@safe`-ty are inferred from the
 * `r` parameter.
 *
 * A qualified character type such as `const(char)` is treated as its
 * unqualified counterpart; `useReplacementDchar` is honoured either way.
 *
 * Params:
 *      C = `char`, `wchar`, or `dchar`
 *      useReplacementDchar = UseReplacementDchar.yes means replace invalid UTF with `replacementDchar`,
 *                            UseReplacementDchar.no means throw `UTFException` for invalid UTF
 *
 * Throws:
 *      `UTFException` if invalid UTF sequence and `useReplacementDchar` is set to `UseReplacementDchar.no`
 *
 * GC:
 *      Does not use GC if `useReplacementDchar` is set to `UseReplacementDchar.yes`
 *
 * Returns:
 *      A bidirectional range if `R` is a bidirectional range and not auto-decodable,
 *      as defined by $(REF isAutodecodableString, std, traits).
 *
 *      A forward range if `R` is a forward range and not auto-decodable.
 *
 *      Or, if `R` is a range and it is auto-decodable and
 *      `is(ElementEncodingType!typeof(r) == C)`, then the range is passed
 *      to $(LREF byCodeUnit).
 *
 *      Otherwise, an input range of characters.
 */
template byUTF(C, UseReplacementDchar useReplacementDchar = Yes.useReplacementDchar)
if (isSomeChar!C)
{
    static if (is(immutable C == immutable UC, UC) && !is(C == UC))
        // Qualified character type: forward to the unqualified instantiation,
        // passing the caller's invalid-sequence policy along unchanged.
        alias byUTF = byUTF!(UC, useReplacementDchar);
    else:

    // Auto-decodable strings are first wrapped so that they are iterated by
    // code unit, then handled by the overload below.
    auto ref byUTF(R)(R r)
    if (isAutodecodableString!R && isInputRange!R && isSomeChar!(ElementEncodingType!R))
    {
        return byUTF(r.byCodeUnit());
    }

    auto ref byUTF(R)(R r)
    if (!isAutodecodableString!R && isInputRange!R && isSomeChar!(ElementEncodingType!R))
    {
        static if (is(immutable ElementEncodingType!R == immutable RC, RC) && is(RC == C))
        {
            // Source and target code unit types agree: no transcoding needed.
            return r.byCodeUnit();
        }
        else static if (is(C == dchar))
        {
            // Decoding to UTF-32: every element is exactly one code point, so
            // a single cached value per end is enough.
            static struct Result
            {
                enum Empty = uint.max;  // range is empty or just constructed

                this(return scope R r)
                {
                    this.r = r;
                }

                this(return scope R r, uint buff)
                {
                    this.r = r;
                    this.buff = buff;
                }

                static if (isBidirectionalRange!R)
                {
                    this(return scope R r, uint frontBuff, uint backBuff)
                    {
                        this.r = r;
                        this.buff = frontBuff;
                        this.backBuff = backBuff;
                    }
                }

                @property bool empty()
                {
                    static if (isBidirectionalRange!R)
                        return buff == Empty && backBuff == Empty && r.empty;
                    else
                        return buff == Empty && r.empty;
                }

                @property dchar front() scope // 'scope' required by call to decodeFront() below
                {
                    if (buff == Empty)
                    {
                        static if (isBidirectionalRange!R)
                        {
                            // The source is exhausted but `back` has already
                            // decoded the last code point: hand it over to
                            // the front instead of reading from an empty `r`.
                            if (backBuff != Empty && r.empty)
                            {
                                buff = backBuff;
                                backBuff = Empty;
                                return cast(dchar) buff;
                            }
                        }

                        auto c = r.front;

                        if (c < firstMulti)
                        {
                            // Single code unit: no decoding required.
                            r.popFront;
                            buff = cast(dchar) c;
                        }
                        else
                        {
                            buff = () @trusted { return decodeFront!(useReplacementDchar)(r); }();
                        }
                    }
                    return cast(dchar) buff;
                }

                void popFront()
                {
                    // Make sure the current code point has been consumed from
                    // `r` before discarding it.
                    if (buff == Empty)
                        front();
                    buff = Empty;
                }

                static if (isForwardRange!R)
                {
                    @property auto save()
                    {
                        static if (isBidirectionalRange!R)
                        {
                            return Result(r.save, buff, backBuff);
                        }
                        else
                        {
                            return Result(r.save, buff);
                        }
                    }
                }

                static if (isBidirectionalRange!R)
                {
                    @property dchar back() scope // 'scope' required by call to decodeBack() below
                    {
                        if (backBuff != Empty)
                            return cast(dchar) backBuff;

                        // The source is exhausted but `front` still caches
                        // the last code point: hand it over to the back.
                        if (buff != Empty && r.empty)
                        {
                            backBuff = buff;
                            buff = Empty;
                            return cast(dchar) backBuff;
                        }

                        auto c = r.back;
                        if (c < firstMulti)
                        {
                            r.popBack;
                            backBuff = cast(dchar) c;
                        }
                        else
                        {
                            backBuff = () @trusted { return decodeBack!useReplacementDchar(r); }();
                        }
                        return cast(dchar) backBuff;

                    }

                    void popBack()
                    {
                        if (backBuff == Empty)
                            back();
                        backBuff = Empty;
                    }
                }

            private:

                // Smallest code unit value that is not passed through as a
                // single-unit code point.
                static if (is(RC == wchar))
                    enum firstMulti = 0xD800; // First high surrogate.
                else
                    enum firstMulti = 0x80; // First non-ASCII.

                R r;
                uint buff = Empty;      // one character lookahead buffer
                static if (isBidirectionalRange!R)
                    uint backBuff = Empty;
            }

            return Result(r);
        }
        else
        {
            // Encoding to UTF-8 or UTF-16: one source code point may expand
            // to several target code units, which are cached per end.
            static struct Result
            {
                this(return scope R r)
                {
                    this.r = r;
                }

                this(return scope R r, ushort pos, ushort fill, C[4 / C.sizeof] buf)
                {
                    this.r = r;
                    this.pos = pos;
                    this.fill = fill;
                    this.buf = buf;
                }

                static if (isBidirectionalRange!R)
                {
                    this(return scope R r, ushort frontPos, ushort frontFill,
                         ushort backPos, ushort backFill,
                         C[4 / C.sizeof] buf, C[4 / C.sizeof] backBuf)
                    {
                        this.r = r;
                        this.pos = frontPos;
                        this.fill = frontFill;
                        this.backPos = backPos;
                        this.backFill = backFill;
                        this.buf = buf;
                        this.backBuf = backBuf;
                    }
                }

                @property bool empty()
                {
                    static if (isBidirectionalRange!R)
                        return pos == fill && backPos == backFill && r.empty;
                    else
                        return pos == fill && r.empty;
                }

                @property auto front() scope // 'scope' required by call to decodeFront() below
                {
                    if (pos == fill)
                    {
                        pos = 0;

                        static if (isBidirectionalRange!R)
                        {
                            // The source is exhausted but `back` still holds
                            // unread code units of the last code point: adopt
                            // them instead of reading from an empty `r`.
                            if (backPos != backFill && r.empty)
                            {
                                fill = cast(ushort) (backFill - backPos);
                                buf[0 .. fill] = backBuf[0 .. fill];
                                backPos = backFill = 0;
                                return buf[pos];
                            }
                        }

                        auto c = r.front;

                        if (c < firstMulti)
                        {
                            // Fits in a single target code unit as is.
                            fill = 1;
                            r.popFront;
                            buf[pos] = cast(C) c;
                        }
                        else
                        {
                            static if (is(RC == dchar))
                            {
                                // Source elements are already code points.
                                r.popFront;
                                dchar dc = c;
                            }
                            else
                                dchar dc = () @trusted { return decodeFront!(useReplacementDchar)(r); }();
                            fill = cast(ushort) encode!(useReplacementDchar)(buf, dc);
                        }
                    }
                    return buf[pos];
                }

                void popFront()
                {
                    // Fill the cache first so that the code point is consumed
                    // from `r` before its first code unit is skipped.
                    if (pos == fill)
                        front;
                    ++pos;
                }

                static if (isForwardRange!R)
                {
                    @property auto save()
                    {
                        static if (isBidirectionalRange!R)
                        {
                            return Result(r.save, pos, fill, backPos, backFill, buf, backBuf);
                        }
                        else
                        {
                            return Result(r.save, pos, fill, buf);
                        }
                    }
                }

                static if (isBidirectionalRange!R)
                {
                    @property auto back() scope // 'scope' required by call to decodeBack() below
                    {
                        // Unread back units are backBuf[0 .. backFill - backPos];
                        // the current back is the last of them.
                        if (backPos != backFill)
                            return backBuf[cast(ushort) (backFill - backPos - 1)];

                        backPos = 0;

                        // The source is exhausted but `front` still holds
                        // unread code units of the last code point: adopt
                        // them instead of reading from an empty `r`.
                        if (pos != fill && r.empty)
                        {
                            backFill = cast(ushort) (fill - pos);
                            backBuf[0 .. backFill] = buf[pos .. fill];
                            pos = fill = 0;
                            return backBuf[cast(ushort) (backFill - 1)];
                        }

                        auto c = r.back;
                        if (c < firstMulti)
                        {
                            backFill = 1;
                            r.popBack;
                            backBuf[cast(ushort) (backFill - backPos - 1)] = cast(C) c;
                        }
                        else
                        {
                            static if (is(RC == dchar))
                            {
                                r.popBack;
                                dchar dc = c;
                            }
                            else
                                dchar dc = () @trusted { return decodeBack!(useReplacementDchar)(r); }();
                            backFill = cast(ushort) encode!(useReplacementDchar)(backBuf, dc);
                        }
                        return backBuf[cast(ushort) (backFill - backPos - 1)];
                    }

                    void popBack()
                    {
                        if (backPos == backFill)
                            back;
                        ++backPos;
                    }
                }

            private:

                // Smallest source value that is not copied through as a
                // single target code unit.
                static if (C.sizeof >= 2 && RC.sizeof >= 2)
                    enum firstMulti = 0xD800; // First high surrogate.
                else
                    enum firstMulti = 0x80; // First non-ASCII.

                R r;
                ushort pos, fill;               // unread front units are buf[pos .. fill]
                static if (isBidirectionalRange!R)
                    ushort backPos, backFill;   // backPos units already popped from backBuf's end
                C[4 / C.sizeof] buf = void;
                // Separate cache for the back end, so that interleaved calls
                // to front and back do not overwrite each other's code units.
                static if (isBidirectionalRange!R)
                    C[4 / C.sizeof] backBuf = void;
            }

            return Result(r);
        }
    }
}

///
@safe pure nothrow unittest
{
    import std.algorithm.comparison : equal;

    // hellö as a range of `char`s, which are UTF-8
    assert("hell\u00F6".byUTF!char().equal(['h', 'e', 'l', 'l', 0xC3, 0xB6]));

    // `wchar`s are able to hold the ö in a single element (UTF-16 code unit)
    assert("hell\u00F6".byUTF!wchar().equal(['h', 'e', 'l', 'l', 'ö']));

    // 𐐷 is four code units in UTF-8, two in UTF-16, and one in UTF-32
    assert("𐐷".byUTF!char().equal([0xF0, 0x90, 0x90, 0xB7]));
    assert("𐐷".byUTF!wchar().equal([0xD801, 0xDC37]));
    assert("𐐷".byUTF!dchar().equal([0x00010437]));
}

///
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.exception : assertThrown;

    assert("hello\xF0betty".byChar.byUTF!(dchar, UseReplacementDchar.yes).equal("hello\uFFFDetty"));
    assertThrown!UTFException("hello\xF0betty".byChar.byUTF!(dchar, UseReplacementDchar.no).equal("hello betty"));
}

@safe unittest
{
    {
        wchar[] s = ['a', 'b', 0x219];
        auto r = s.byUTF!char;
        assert(isBidirectionalRange!(typeof(r)));
        assert(r.back == 0x99);
        r.popBack;
        assert(r.back == 0xc8);
        r.popBack;
        assert(r.back == 'b');

    }

    {
        wchar[] s = ['a', 'b', 0x219];
        auto r = s.byUTF!wchar;
        assert(isBidirectionalRange!(typeof(r)));
        assert(r.back == 0x219);
        r.popBack;
        assert(r.back == 'b');
    }

    {
        wchar[] s = ['a', 'b', 0x219];
        auto r = s.byUTF!dchar;
        assert(isBidirectionalRange!(typeof(r)));
        assert(r.back == 0x219);
        r.popBack;
        assert(r.back == 'b');
    }

    {
        dchar[] s = ['𐐷', '😁'];
        auto r = s.byUTF!wchar;
        assert(r.back == 0xde01);
        r.popBack;
        assert(r.back == 0xd83d);
        r.popBack;
        assert(r.back == 0xdc37);
        r.popBack;
        assert(r.back == 0xd801);
    }

    {
        dchar[] s = ['𐐷', '😁'];
        auto r = s.byUTF!char;
        char[] res;
        while (!r.empty)
        {
            res ~= r.back;
            r.popBack;
        }
        import std.algorithm.comparison : equal;
        assert(res.equal([0x81, 0x98, 0x9f, 0xf0, 0xb7, 0x90, 0x90, 0xf0]));
    }

    {
        dchar[] res;
        auto r = ['a', 'b', 'c', 'd', 'e'].byUTF!dchar;
        while (!r.empty)
        {
            res ~= r.back;
            r.popBack;
        }
        import std.algorithm.comparison : equal;
        assert(res.equal(['e', 'd', 'c', 'b', 'a']));
    }

    {
        //testing the save() function
        wchar[] s = ['Ă','ț'];

        auto rc = s.byUTF!char;
        rc.popBack;
        auto rcCopy = rc.save;
        assert(rc.back == rcCopy.back);
        assert(rcCopy.back == 0xc8);

        auto rd = s.byUTF!dchar;
        rd.popBack;
        auto rdCopy = rd.save;
        assert(rd.back == rdCopy.back);
        assert(rdCopy.back == 'Ă');
    }
}

// Regression tests: a qualified `C` keeps the requested replacement policy,
// and `front`/`back` may be interleaved down to the last code unit.
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.exception : assertThrown;

    // The qualifier on `C` must not reset the policy to its default.
    assertThrown!UTFException("hello\xF0betty".byChar
        .byUTF!(const(dchar), UseReplacementDchar.no).equal("hello betty"));
    assert("hello\xF0betty".byChar
        .byUTF!(const(dchar), UseReplacementDchar.yes).equal("hello\uFFFDetty"));

    {
        // Reading the back must not disturb the cached front code units.
        wchar[] s = ['\u0103', '\u00EE'];
        auto r = s.byUTF!char;
        assert(r.front == 0xC4);
        assert(r.back == 0xAE);
        assert(r.front == 0xC4);
        r.popFront;
        assert(r.front == 0x83);
        assert(r.back == 0xAE);
    }

    {
        // Both ends drain the code units of one shared code point.
        wchar[] s = ['\u00EE'];
        auto r = s.byUTF!char;
        assert(r.front == 0xC3);
        assert(r.back == 0xAE);
        r.popBack;
        assert(!r.empty);
        assert(r.back == 0xC3);
        r.popFront;
        assert(r.empty);
    }

    {
        // Same for the UTF-32 decoder once the source is exhausted.
        wchar[] s = ['a', 'b'];
        auto r = s.byUTF!dchar;
        assert(r.back == 'b');
        assert(r.front == 'a');
        r.popFront;
        assert(!r.empty);
        assert(r.front == 'b');
        r.popFront;
        assert(r.empty);
    }
}

///
@safe pure nothrow unittest
{
    import std.range.primitives;
    wchar[] s = ['ă', 'î'];

    auto rc = s.byUTF!char;
    static assert(isBidirectionalRange!(typeof(rc)));
    assert(rc.back == 0xae);
    rc.popBack;
    assert(rc.back == 0xc3);
    rc.popBack;
    assert(rc.back == 0x83);
    rc.popBack;
    assert(rc.back == 0xc4);

    auto rw = s.byUTF!wchar;
    static assert(isBidirectionalRange!(typeof(rw)));
    assert(rw.back == 'î');
    rw.popBack;
    assert(rw.back == 'ă');

    auto rd = s.byUTF!dchar;
    static assert(isBidirectionalRange!(typeof(rd)));
    assert(rd.back == 'î');
    rd.popBack;
    assert(rd.back == 'ă');
}