// Written in the D programming language.

/**
 * Encode and decode Uniform Resource Identifiers (URIs).
 * URIs are used in internet transfer protocols.
 * Valid URI characters consist of letters, digits,
 * and the characters $(B ;/?:@&=+$,-_.!~*'())
 * Reserved URI characters are $(B ;/?:@&=+$,)
 * Escape sequences consist of $(B %) followed by two hex digits.
 *
 * See_Also:
 *  $(LINK2 https://www.ietf.org/rfc/rfc3986.txt, RFC 3986)<br>
 *  $(LINK2 http://en.wikipedia.org/wiki/Uniform_resource_identifier, Wikipedia)
 * Copyright: Copyright The D Language Foundation 2000 - 2009.
 * License:   $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
 * Authors:   $(HTTP digitalmars.com, Walter Bright)
 * Source:    $(PHOBOSSRC std/uri.d)
 */
/*          Copyright The D Language Foundation 2000 - 2009.
 * Distributed under the Boost Software License, Version 1.0.
 *    (See accompanying file LICENSE_1_0.txt or copy at
 *          http://www.boost.org/LICENSE_1_0.txt)
 */
module std.uri;

//debug=uri;        // uncomment to turn on debugging writefln's
debug(uri) import std.stdio;
import std.traits : isSomeChar;

/** This Exception is thrown if something goes wrong when encoding or
decoding a URI.
*/
class URIException : Exception
{
    import std.exception : basicExceptionCtors;
    mixin basicExceptionCtors;
}

///
@safe unittest
{
    import std.exception : assertThrown;
    assertThrown!URIException("%ab".decode);
}

// Character classes, combined as bit flags in `uri_flags`.
private enum
{
    URI_Alpha = 1,
    URI_Reserved = 2,
    URI_Mark = 4,
    URI_Digit = 8,
    URI_Hash = 0x10,        // '#'
}

private immutable char[16] hex2ascii = "0123456789ABCDEF";

private immutable ubyte[128] uri_flags =      // indexed by character
    ({
        ubyte[128] uflags;

        // Compile time initialize
        uflags['#'] |= URI_Hash;

        foreach (c; 'A' .. 'Z' + 1)
        {
            uflags[c] |= URI_Alpha;
            uflags[c + 0x20] |= URI_Alpha;   // lowercase letters
        }
        foreach (c; '0' .. '9' + 1) uflags[c] |= URI_Digit;
        foreach (c; ";/?:@&=+$,")   uflags[c] |= URI_Reserved;
        foreach (c; "-_.!~*'()")    uflags[c] |= URI_Mark;
        return uflags;
    })();

/* Percent-encodes `str`.
 *
 * Params:
 *  str          = the text to encode, as UTF-32 code points
 *  unescapedSet = bitwise OR of URI_* flags; ASCII characters whose
 *                 `uri_flags` entry intersects this set are copied verbatim
 *
 * Returns:
 *  The encoded string. Every other code point is written as the
 *  %XX escapes of its UTF-8 encoding.
 *
 * Throws:
 *  URIException if `str` contains a value that is not a valid Unicode
 *  code point (a surrogate, or anything above U+10FFFF).
 */
private string URI_Encode(dstring str, uint unescapedSet) @safe pure
{
    import std.utf : isValidDchar;

    // Result buffer: starts on the stack, moves to the GC heap when it grows.
    char[50] buffer = void;
    char[] R;
    size_t Rlen;

    R = buffer[];

    foreach (immutable dchar C; str)
    {
        // if (C in unescapedSet)
        if (C < uri_flags.length && (uri_flags[C] & unescapedSet))
        {
            if (Rlen == R.length)
            {
                auto R2 = new char[R.length * 2];
                R2[0 .. Rlen] = R[0 .. Rlen];
                R = R2;
            }
            R[Rlen] = cast(char) C;
            Rlen++;
            continue;
        }

        // Refuse to emit escapes that URI_Decode itself would reject.
        if (!isValidDchar(C))
            throw new URIException("Undefined UTF-32 code point");

        char[4] Octet;
        uint L;
        immutable uint V = C;

        // Transform V into UTF-8 octets
        if (V <= 0x7F)
        {
            Octet[0] = cast(char) V;
            L = 1;
        }
        else if (V <= 0x7FF)
        {
            Octet[0] = cast(char)(0xC0 | (V >> 6));
            Octet[1] = cast(char)(0x80 | (V & 0x3F));
            L = 2;
        }
        else if (V <= 0xFFFF)
        {
            Octet[0] = cast(char)(0xE0 | (V >> 12));
            Octet[1] = cast(char)(0x80 | ((V >> 6) & 0x3F));
            Octet[2] = cast(char)(0x80 | (V & 0x3F));
            L = 3;
        }
        else
        {
            Octet[0] = cast(char)(0xF0 | (V >> 18));
            Octet[1] = cast(char)(0x80 | ((V >> 12) & 0x3F));
            Octet[2] = cast(char)(0x80 | ((V >> 6) & 0x3F));
            Octet[3] = cast(char)(0x80 | (V & 0x3F));
            L = 4;
        }

        // Each octet becomes three characters: '%' and two hex digits.
        immutable needed = Rlen + L * 3;
        if (needed > R.length)
        {
            auto R2 = new char[2 * needed];
            R2[0 .. Rlen] = R[0 .. Rlen];
            R = R2;
        }

        foreach (j; 0 .. L)
        {
            R[Rlen] = '%';
            R[Rlen + 1] = hex2ascii[Octet[j] >> 4];
            R[Rlen + 2] = hex2ascii[Octet[j] & 15];

            Rlen += 3;
        }
    }

    return R[0 .. Rlen].idup;
}

@safe pure unittest
{
    import std.exception : assertThrown;

    assert(URI_Encode("", 0) == "");
    assert(URI_Encode(URI_Decode("%F0%BF%BF%BF", 0), 0) == "%F0%BF%BF%BF");
    dstring a;
    a ~= cast(dchar) 0xFFFFFFFF;
    assertThrown(URI_Encode(a, 0));
    assert(URI_Encode("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 0).length == 3 * 60);
}

// Value of a hexadecimal digit; callers must check isHexDigit(c) first.
private uint ascii2hex(dchar c) @nogc @safe pure nothrow
{
    return (c <= '9') ? c - '0' :
        (c <= 'F') ? c - 'A' + 10 :
        c - 'a' + 10;
}

/* Decodes the %XX escape sequences in `uri`.
 *
 * Params:
 *  uri         = the text to decode; may be UTF-8, UTF-16 or UTF-32
 *  reservedSet = bitwise OR of URI_* flags; an escape that resolves to an
 *                ASCII character in this set is kept in its escaped form
 *
 * Returns:
 *  The decoded text as UTF-32.
 *
 * Throws:
 *  URIException on a truncated or non-hexadecimal escape, and on escapes
 *  that do not form a valid UTF-8 encoding of a Unicode code point
 *  (bad lead or continuation byte, overlong form, surrogate, or a value
 *  above U+10FFFF).
 */
private dstring URI_Decode(Char)(scope const(Char)[] uri, uint reservedSet)
if (isSomeChar!Char)
{
    import std.ascii : isHexDigit;
    import std.utf : toUTF32;

    // Work on code points, not code units: copying raw UTF-8/UTF-16 units
    // into dchars one at a time would corrupt every non-ASCII character.
    static if (is(immutable Char == immutable dchar))
        auto s = uri;
    else
        auto s = toUTF32(uri);

    immutable len = s.length;

    // The output never has more elements than the input.
    auto R = new dchar[len];
    size_t Rlen;

    for (size_t k = 0; k != len; k++)
    {
        dchar C = s[k];
        if (C != '%')
        {
            R[Rlen] = C;
            Rlen++;
            continue;
        }
        immutable start = k;
        if (k + 2 >= len)
            throw new URIException("Unexpected end of URI");
        if (!isHexDigit(s[k + 1]) || !isHexDigit(s[k + 2]))
            throw new URIException("Expected two hexadecimal digits after '%'");
        char B = cast(char)((ascii2hex(s[k + 1]) << 4) + ascii2hex(s[k + 2]));
        k += 2;
        if ((B & 0x80) == 0)
        {
            C = B;
        }
        else
        {
            // Count the leading 1 bits of B: the length of the UTF-8 sequence
            uint n;

            for (n = 1; ; n++)
            {
                if (n > 4)
                    throw new URIException("UTF-32 code point size too large");
                if (((B << n) & 0x80) == 0)
                {
                    if (n == 1)
                        throw new URIException("UTF-32 code point size too small");
                    break;
                }
            }

            // Pick off (7 - n) significant bits of B from first byte of octet
            uint V = B & ((1 << (7 - n)) - 1);

            // The (n - 1) continuation bytes need 3 characters each
            if (k + (3 * (n - 1)) >= len)
                throw new URIException("UTF-32 unaligned String");
            for (uint j = 1; j != n; j++)
            {
                k++;
                if (s[k] != '%')
                    throw new URIException("Expected: '%'");
                if (!isHexDigit(s[k + 1]) || !isHexDigit(s[k + 2]))
                    throw new URIException("Expected two hexadecimal digits after '%'");
                B = cast(char)((ascii2hex(s[k + 1]) << 4) + ascii2hex(s[k + 2]));
                if ((B & 0xC0) != 0x80)
                    throw new URIException("Incorrect UTF-32 multi-byte sequence");
                k += 2;
                V = (V << 6) | (B & 0x3F);
            }
            if (V > 0x10FFFF)
                throw new URIException("Unknown UTF-32 code point");

            // A code point must use the shortest possible encoding
            immutable uint smallest = (n == 2) ? 0x80 : (n == 3) ? 0x800 : 0x1_0000;
            if (V < smallest)
                throw new URIException("Overlong UTF-8 encoding");
            if (V >= 0xD800 && V <= 0xDFFF)
                throw new URIException("Surrogate code point is not a valid character");

            C = cast(dchar) V;
        }
        if (C < uri_flags.length && (uri_flags[C] & reservedSet))
        {
            // Keep the escape sequence as it was: R ~= s[start .. k + 1];
            immutable width = (k + 1) - start;
            R[Rlen .. Rlen + width] = s[start .. k + 1];
            Rlen += width;
        }
        else
        {
            R[Rlen] = C;
            Rlen++;
        }
    }
    assert(Rlen <= R.length);     // enforce our preallocation size guarantee

    return R[0 .. Rlen].idup;
}

@safe pure unittest
{
    import std.exception : assertThrown;

    assert(URI_Decode("", 0) == "");
    assertThrown!URIException(URI_Decode("%", 0));
    assertThrown!URIException(URI_Decode("%xx", 0));
    assertThrown!URIException(URI_Decode("%FF", 0));
    assertThrown!URIException(URI_Decode("%C0", 0));
    assertThrown!URIException(URI_Decode("%C0000000", 0));
    assertThrown!URIException(URI_Decode("%C0%xx0000", 0));
    assertThrown!URIException(URI_Decode("%C0%C00000", 0));
    assertThrown!URIException(URI_Decode("%F7%BF%BF%BF", 0));
    assert(URI_Decode("%23", URI_Hash) == "%23");
}

@safe pure unittest
{
    import std.exception : assertThrown;

    // Unescaped non-ASCII text survives, whatever the code unit width
    assert(decode("\u00E4%20\u00F6") == "\u00E4 \u00F6");
    assert(decodeComponent("\U0001F600%2F"w) == "\U0001F600/");
    assert(decodeComponent("\U0001F600%2F"d) == "\U0001F600/");

    // Overlong forms and surrogates are not valid UTF-8
    assertThrown!URIException(decodeComponent("%C0%80"));
    assertThrown!URIException(decodeComponent("%E0%80%80"));
    assertThrown!URIException(decodeComponent("%ED%A0%80"));

    // The encoder refuses what the decoder refuses
    dstring beyond;
    beyond ~= cast(dchar) 0x110000;
    assertThrown!URIException(URI_Encode(beyond, 0));
}

/*************************************
 * Decodes the URI string encodedURI into a UTF-8 string and returns it.
 * Escape sequences that resolve to reserved URI characters are not replaced.
 * Escape sequences that resolve to the '#' character are not replaced.
 *
 * Params:
 *  encodedURI = the URI to decode
 * Returns:
 *  The decoded URI as UTF-8.
 * Throws:
 *  $(LREF URIException) if an escape sequence is malformed.
 */
string decode(Char)(scope const(Char)[] encodedURI)
if (isSomeChar!Char)
{
    import std.algorithm.iteration : each;
    import std.utf : encode;
    auto s = URI_Decode(encodedURI, URI_Reserved | URI_Hash);
    char[] r;
    s.each!(c => encode(r, c));
    return r;
}

///
@safe unittest
{
    assert("foo%20bar".decode == "foo bar");
    assert("%3C%3E.@.%E2%84%A2".decode == "<>.@.™");
    assert("foo&/".decode == "foo&/");
    assert("!@#$&*(".decode == "!@#$&*(");
}

/*******************************
 * Decodes the URI string encodedURI into a UTF-8 string and returns it. All
 * escape sequences are decoded.
 *
 * Params:
 *  encodedURIComponent = the URI component to decode
 * Returns:
 *  The decoded component as UTF-8.
 * Throws:
 *  $(LREF URIException) if an escape sequence is malformed.
 */
string decodeComponent(Char)(scope const(Char)[] encodedURIComponent)
if (isSomeChar!Char)
{
    import std.algorithm.iteration : each;
    import std.utf : encode;
    auto s = URI_Decode(encodedURIComponent, 0);
    char[] r;
    s.each!(c => encode(r, c));
    return r;
}

///
@safe unittest
{
    assert("foo%2F%26".decodeComponent == "foo/&");
    assert("dl%C3%A4ng%20r%C3%B6cks".decodeComponent == "dläng röcks");
    assert("!%40%23%24%25%5E%26*(".decodeComponent == "!@#$%^&*(");
}

/*****************************
 * Encodes the UTF-8 string uri into a URI and returns that URI. Any character
 * not a valid URI character is escaped. The '#' character is not escaped.
 *
 * Params:
 *  uri = the text to encode
 * Returns:
 *  The encoded URI.
 */
string encode(Char)(scope const(Char)[] uri)
if (isSomeChar!Char)
{
    import std.utf : toUTF32;
    auto s = toUTF32(uri);
    return URI_Encode(s, URI_Reserved | URI_Hash | URI_Alpha | URI_Digit | URI_Mark);
}

///
@safe unittest
{
    assert("foo bar".encode == "foo%20bar");
    assert("<>.@.™".encode == "%3C%3E.@.%E2%84%A2");
    assert("foo/#?a=1&b=2".encode == "foo/#?a=1&b=2");
    assert("dlang+rocks!".encode == "dlang+rocks!");
    assert("!@#$%^&*(".encode == "!@#$%25%5E&*(");
}

/********************************
 * Encodes the UTF-8 string uriComponent into a URI and returns that URI.
 * Any character not a letter, digit, or one of -_.!~*'() is escaped.
 *
 * Params:
 *  uriComponent = the text to encode
 * Returns:
 *  The encoded URI component.
 */
string encodeComponent(Char)(scope const(Char)[] uriComponent)
if (isSomeChar!Char)
{
    import std.utf : toUTF32;
    auto s = toUTF32(uriComponent);
    return URI_Encode(s, URI_Alpha | URI_Digit | URI_Mark);
}

///
@safe unittest
{
    assert("!@#$%^&*(".encodeComponent == "!%40%23%24%25%5E%26*(");
    assert("<>.@.™".encodeComponent == "%3C%3E.%40.%E2%84%A2");
    assert("foo/&".encodeComponent == "foo%2F%26");
    assert("dläng röcks".encodeComponent == "dl%C3%A4ng%20r%C3%B6cks");
    assert("dlang+rocks!".encodeComponent == "dlang%2Brocks!");
}

/* Encode associative array using www-form-urlencoding
 *
 * Params:
 *      values = an associative array containing the values to be encoded.
 *
 * Returns:
 *      A string encoded using www-form-urlencoding. The pairs appear in the
 *      iteration order of the associative array.
 */
package string urlEncode(scope string[string] values) @safe pure
{
    if (values.length == 0)
        return "";

    import std.array : Appender;
    import std.format.write : formattedWrite;

    Appender!string enc;
    enc.reserve(values.length * 128);

    bool first = true;
    foreach (k, v; values)
    {
        if (!first)
            enc.put('&');
        formattedWrite(enc, "%s=%s", encodeComponent(k), encodeComponent(v));
        first = false;
    }
    return enc.data;
}

@safe pure unittest
{
    string[string] a;
    assert(urlEncode(a) == "");
    assert(urlEncode(["name1" : "value1"]) == "name1=value1");
    auto enc = urlEncode(["name1" : "value1", "name2" : "value2"]);
    assert(enc == "name1=value1&name2=value2" || enc == "name2=value2&name1=value1");
}

/***************************
 * Does string s[] start with a URL?
 * Returns:
 *  -1   it does not
 *  len  it does, and s[0 .. len] is the slice of s[] that is that URL
 */

ptrdiff_t uriLength(Char)(scope const(Char)[] s)
if (isSomeChar!Char)
{
    /* Must start with one of (compared case-insensitively):
     *  http://
     *  https://
     * A bare "www." prefix is not recognized.
     */
    import std.ascii : isAlphaNum;
    import std.uni : icmp;

    ptrdiff_t i;

    if (s.length <= 4)
        return -1;

    if (s.length > 7 && icmp(s[0 .. 7], "http://") == 0)
    {
        i = 7;
    }
    else
    {
        if (s.length > 8 && icmp(s[0 .. 8], "https://") == 0)
            i = 8;
        else
            return -1;
    }

    // Scan the characters accepted in a URL, remembering the last '.'
    ptrdiff_t lastdot;
    for (; i < s.length; i++)
    {
        auto c = s[i];
        if (isAlphaNum(c))
            continue;
        if (c == '-' || c == '_' || c == '?' ||
                c == '=' || c == '%' || c == '&' ||
                c == '/' || c == '+' || c == '#' ||
                c == '~' || c == '$')
            continue;
        if (c == '.')
        {
            lastdot = i;
            continue;
        }
        break;
    }
    // At least one '.' is required after the scheme
    if (!lastdot)
        return -1;

    return i;
}

///
@safe pure unittest
{
    string s1 = "http://www.digitalmars.com/~fred/fredsRX.html#foo end!";
    assert(uriLength(s1) == 49);
    string s2 = "no uri here";
    assert(uriLength(s2) == -1);
    assert(uriLength("issue 14924") < 0);
}

@safe pure nothrow @nogc unittest
{
    assert(uriLength("") == -1);
    assert(uriLength("https://www") == -1);
}

/***************************
 * Does string s[] start with an email address?
 *
 * The address must begin with a letter, and the part after the last '.'
 * of the domain must be two or three characters long.
 * Returns:
 *  -1    it does not
 *  len   it does, and s[0 .. len] is the slice of s[] that is that email address
 * References:
 *  RFC2822
 */
ptrdiff_t emailLength(Char)(scope const(Char)[] s)
if (isSomeChar!Char)
{
    import std.ascii : isAlpha, isAlphaNum;

    ptrdiff_t i;

    if (s.length == 0)
        return -1;

    if (!isAlpha(s[0]))
        return -1;

    // The part before the '@'
    for (i = 1; 1; i++)
    {
        if (i == s.length)
            return -1;
        auto c = s[i];
        if (isAlphaNum(c))
            continue;
        if (c == '-' || c == '_' || c == '.')
            continue;
        if (c != '@')
            return -1;
        i++;
        break;
    }

    /* Now do the part past the '@'
     */
    ptrdiff_t lastdot;
    for (; i < s.length; i++)
    {
        auto c = s[i];
        if (isAlphaNum(c))
            continue;
        if (c == '-' || c == '_')
            continue;
        if (c == '.')
        {
            lastdot = i;
            continue;
        }
        break;
    }
    // i - lastdot counts the '.' itself plus the final label
    if (!lastdot || (i - lastdot != 3 && i - lastdot != 4))
        return -1;

    return i;
}

///
@safe pure unittest
{
    string s1 = "my.e-mail@www.example-domain.com with garbage added";
    assert(emailLength(s1) == 32);
    string s2 = "no email address here";
    assert(emailLength(s2) == -1);
    assert(emailLength("issue 14924") < 0);
}

@safe pure unittest
{
    debug(uri) writeln("uri.encodeURI.unittest");

    string source = "http://www.digitalmars.com/~fred/fred's RX.html#foo";
    string target = "http://www.digitalmars.com/~fred/fred's%20RX.html#foo";

    auto result = encode(source);
    debug(uri) writefln("result = '%s'", result);
    assert(result == target);
    result = decode(target);
    debug(uri) writefln("result = '%s'", result);
    assert(result == source);

    result = encode(decode("%E3%81%82%E3%81%82"));
    assert(result == "%E3%81%82%E3%81%82");

    result = encodeComponent("c++");
    assert(result == "c%2B%2B");

    auto str = new char[10_000_000];
    str[] = 'A';
    result = encodeComponent(str);
    foreach (char c; result)
        assert(c == 'A');

    result = decode("%41%42%43");
    debug(uri) writeln(result);

    import std.meta : AliasSeq;
    static foreach (StringType; AliasSeq!(char[], wchar[], dchar[], string, wstring, dstring))
    {{
        import std.conv : to;
        StringType decoded1 = source.to!StringType;
        string encoded1 = encode(decoded1);
        assert(decoded1 == source.to!StringType); // check that `decoded1` wasn't changed
        assert(encoded1 == target);
        assert(decoded1 == decode(encoded1).to!StringType);

        StringType encoded2 = target.to!StringType;
        string decoded2 = decode(encoded2);
        assert(encoded2 == target.to!StringType); // check that `encoded2` wasn't changed
        assert(decoded2 == source);
        assert(encoded2 == encode(decoded2).to!StringType);
    }}
}

@safe pure nothrow @nogc unittest
{
    assert(emailLength("") == -1);
    assert(emailLength("@") == -1);
    assert(emailLength("abcd") == -1);
    assert(emailLength("blah@blub") == -1);
    assert(emailLength("blah@blub.") == -1);
    assert(emailLength("blah@blub.domain") == -1);
}