1 /++ 2 $(H1 @nogc Simple Base64 parsing) 3 4 License: $(HTTP www.apache.org/licenses/LICENSE-2.0, Apache-2.0) 5 Authors: Harrison Ford 6 Copyright: 2021 Harrison Ford, Symmetry Investments 7 +/ 8 module mir.base64; 9 10 import mir.exception: toMutable; 11 12 package static immutable base64DecodeInvalidCharMsg = "base64: Invalid character encountered."; 13 package static immutable base64DecodeInvalidLenMsg = "Cannot decode a buffer with given length (not a multiple of 4, missing padding?)"; 14 version(D_Exceptions) { 15 package static immutable base64DecodeInvalidCharException = new Exception(base64DecodeInvalidCharMsg); 16 package static immutable base64DecodeInvalidLenException = new Exception(base64DecodeInvalidLenMsg); 17 } 18 19 // NOTE: I do not know if this would work on big-endian systems. 20 // Needs further testing to figure out if it *does* work on them. 21 22 // Technique borrowed from http://0x80.pl/notesen/2016-01-12-sse-base64-encoding.html#branchless-code-for-lookup-table 23 private char lookup_encoding(ubyte i, char plusChar = '+', char slashChar = '/') @safe @nogc pure { 24 assert(i < 64); 25 26 ubyte shift; 27 28 if (i < 26) 29 { 30 // range A-Z 31 shift = 'A'; 32 } 33 else if (i >= 26 && i < 52) 34 { 35 // range a-z 36 shift = 'a' - 26; 37 } 38 else if (i >= 52 && i < 62) 39 { 40 // range 0-9 41 shift = cast(ubyte)('0' - 52); 42 } 43 else if (i == 62) 44 { 45 // character plus 46 shift = cast(ubyte)(plusChar - 62); 47 } 48 else if (i == 63) 49 { 50 // character slash 51 shift = cast(ubyte)(slashChar - 63); 52 } 53 54 return cast(char)(i + shift); 55 } 56 57 // Do the inverse of above (convert an ASCII value into the Base64 character set) 58 private ubyte lookup_decoding(char i, char plusChar = '+', char slashChar = '/') @safe @nogc pure 59 { 60 // Branching bad, but this isn't performance sensitive 61 if (i <= 'Z' && i >= 'A') { 62 return cast(ubyte)(i - 'A'); 63 } 64 else if (i <= 'z' && i >= 'a') { 65 return cast(ubyte)(i - 'a' + 26); 66 } 67 else if (i <= '9' && i >= '0') { 68 return cast(ubyte)(i - '0' + 52); 69 } 70 else if (i == plusChar) { 71 return 62; 72 } 73 else if (i == slashChar) { 74 return 63; 75 } 76 // Just return 0 for padding, 77 // as it typically means nothing. 78 else if (i == '=') { 79 return 0; 80 } 81 else { 82 version(D_Exceptions) { 83 throw base64DecodeInvalidCharException.toMutable; 84 } else { 85 assert(0, base64DecodeInvalidCharMsg); 86 } 87 } 88 89 } 90 91 /++ 92 Decode a Base64 encoded value, returning the buffer. 93 +/ 94 ubyte[] decodeBase64(scope const(char)[] data, char plusChar = '+', char slashChar = '/') @safe pure 95 { 96 import mir.appender : scopedBuffer; 97 auto app = scopedBuffer!ubyte; 98 decodeBase64(data, app, plusChar, slashChar); 99 return app.data.dup; 100 } 101 102 /++ 103 Decode a Base64 encoded value, placing the result onto an Appender. 104 +/ 105 void decodeBase64(Appender)(scope const(char)[] data, 106 scope ref Appender appender, 107 char plusChar = '+', 108 char slashChar = '/') @safe pure 109 { 110 // We expect data should be well-formed (with padding), 111 // so we should throw if it is not well-formed. 112 if (data.length % 4 != 0) 113 { 114 version(D_Exceptions) { 115 throw base64DecodeInvalidLenException.toMutable; 116 } else { 117 assert(0, base64DecodeInvalidLenMsg); 118 } 119 } 120 121 ubyte[3] decodedByteGroup; 122 ubyte sz = 0; 123 124 // We can't use mir.ndslice.chunk.chunks here, as it violates 125 // the scope requirements. 126 for (size_t i = 0; i < data.length; i += 4) 127 { 128 auto group = data[i .. (i + 4)]; 129 130 ubyte[4] decodedBytes; 131 decodedBytes[0] = lookup_decoding(group[0], plusChar, slashChar); 132 decodedBytes[1] = lookup_decoding(group[1], plusChar, slashChar); 133 134 uint transformed_group = (decodedBytes[0] << 26) | (decodedBytes[1] << 20); 135 136 // According to RFC4648 Section 3.3, we don't have to accept extra padding characters, 137 // and we can safely throw (and stay within spec). 138 // x=== is also invalid, so we can just throw on that here. 139 if (group[0] == '=' || group[1] == '=') 140 { 141 version(D_Exceptions) 142 throw base64DecodeInvalidCharException.toMutable; 143 else 144 assert(0, base64DecodeInvalidCharMsg); 145 } 146 147 // xx=(=)? 148 if (group[2] == '=') 149 { 150 // If we are not at the end of a string, according to RFC4648, 151 // we can safely treat a padding character as "non-alphabet data", 152 // and as such, we should throw. See RFC4648 Section 3.3 for more information 153 if ((i / 4) != ((data.length / 4) - 1)) 154 { 155 version(D_Exceptions) 156 throw base64DecodeInvalidCharException.toMutable; 157 else 158 assert(0, base64DecodeInvalidCharMsg); 159 } 160 161 if (group[3] == '=') 162 { 163 // xx== 164 sz = 1; 165 } 166 // xx=x (invalid) 167 // Padding should not be in the middle of a chunk 168 else 169 { 170 version(D_Exceptions) 171 throw base64DecodeInvalidCharException.toMutable; 172 else 173 assert(0, base64DecodeInvalidCharMsg); 174 } 175 } 176 // xxx= 177 else if (group[3] == '=') 178 { 179 // If we are not at the end of a string, according to RFC4648, 180 // we can safely treat a padding character as "non-alphabet data", 181 // and as such, we should throw. See RFC4648 Section 3.3 for more information 182 if ((i / 4) != ((data.length / 4) - 1)) 183 { 184 version(D_Exceptions) 185 throw base64DecodeInvalidCharException.toMutable; 186 else 187 assert(0, base64DecodeInvalidCharMsg); 188 } 189 190 decodedBytes[2] = lookup_decoding(group[2], plusChar, slashChar); 191 transformed_group |= (decodedBytes[2] << 14); 192 sz = 2; 193 } 194 // xxxx 195 else 196 { 197 decodedBytes[2] = lookup_decoding(group[2], plusChar, slashChar); 198 decodedBytes[3] = lookup_decoding(group[3], plusChar, slashChar); 199 transformed_group |= ((decodedBytes[2] << 14) | (decodedBytes[3] << 8)); 200 sz = 3; 201 } 202 203 decodedByteGroup[0] = (transformed_group >> 24) & 0xff; 204 decodedByteGroup[1] = (transformed_group >> 16) & 0xff; 205 decodedByteGroup[2] = (transformed_group >> 8) & 0xff; 206 207 // Only emit the transformed bytes that we got data for. 208 appender.put(decodedByteGroup[0 .. sz]); 209 } 210 } 211 212 /// Test decoding of data which has a length which can be 213 /// cleanly decoded. 214 version(mir_test) 215 @safe pure unittest 216 { 217 { 218 enum data = "QUJD"; 219 assert(data.decodeBase64 == "ABC"); 220 } 221 222 { 223 enum data = "QQ=="; 224 assert(data.decodeBase64 == "A"); 225 } 226 227 { 228 enum data = "YSBiIGMgZCBlIGYgZyBoIGkgaiBrIGwgbSBuIG8gcCBxIHIgcyB0IHUgdiB3IHggeSB6"; 229 assert(data.decodeBase64 == "a b c d e f g h i j k l m n o p q r s t u v w x y z"); 230 } 231 232 { 233 enum data = "LCAuIDsgLyBbICcgXSBcID0gLSAwIDkgOCA3IDYgNSA0IDMgMiAxIGAgfiAhIEAgIyAkICUgXiAmICogKCApIF8gKyB8IDogPCA+ID8="; 234 assert(data.decodeBase64 == ", . ; / [ ' ] \\ = - 0 9 8 7 6 5 4 3 2 1 ` ~ ! @ # $ % ^ & * ( ) _ + | : < > ?"); 235 } 236 237 { 238 enum data = "AAA="; 239 assert(data.decodeBase64 == "\x00\x00"); 240 } 241 242 { 243 enum data = "AAAABBCC"; 244 assert(data.decodeBase64 == "\x00\x00\x00\x04\x10\x82"); 245 } 246 247 { 248 enum data = "AA=="; 249 assert(data.decodeBase64 == "\x00"); 250 } 251 252 { 253 enum data = "AA/="; 254 assert(data.decodeBase64 == "\x00\x0f"); 255 } 256 } 257 258 /// Test decoding invalid data 259 version(mir_test) 260 @safe pure unittest 261 { 262 void testFail(const(char)[] input) @safe pure 263 { 264 bool thrown = false; 265 try { 266 ubyte[] decoded = input.decodeBase64; 267 } catch (Exception t) { 268 thrown = true; 269 } 270 271 assert(thrown); 272 } 273 274 testFail("===A"); 275 testFail("A="); 276 testFail("AA="); 277 testFail("A=AA"); 278 testFail("AA=A"); 279 testFail("AA=A===="); 280 testFail("=AAA"); 281 testFail("AAA=QUJD"); 282 // This fails because we don't allow extra padding (than what is necessary) 283 testFail("AA======"); 284 // This fails because we don't allow padding before the end of the string (otherwise we'd have a side-channel) 285 testFail("QU==QUJD"); 286 testFail("QU======QUJD"); 287 // Invalid data that's out of the alphabet 288 testFail("!@##@@!@"); 289 } 290 291 /++ 292 Encode a ubyte array as Base64, returning the encoded value. 293 +/ 294 string encodeBase64(scope const(ubyte)[] buf, char plusChar = '+', char slashChar = '/') @safe pure 295 { 296 import mir.appender : scopedBuffer; 297 auto app = scopedBuffer!char; 298 encodeBase64(buf, app, plusChar, slashChar); 299 return app.data.idup; 300 } 301 302 /++ 303 Encode a ubyte array as Base64, placing the result onto an Appender. 304 +/ 305 void encodeBase64(Appender)(scope const(ubyte)[] input, 306 scope ref Appender appender, 307 char plusChar = '+', 308 char slashChar = '/') @safe pure 309 { 310 import core.bitop : bswap; 311 import mir.ndslice.topology : bytegroup, map; 312 // Slice our input array so that n % 3 == 0 (we have a multiple of 3) 313 // If we have less then 3, then this is effectively a no-op (will result in a 0-length slice) 314 char[4] encodedByteGroup; 315 const(ubyte)[] window = input[0 .. input.length - (input.length % 3)]; 316 foreach(group; window.bytegroup!(3, uint).map!bswap) { 317 const(ubyte) a = (group >> 26) & 0x3f; 318 const(ubyte) b = (group >> 20) & 0x3f; 319 const(ubyte) c = (group >> 14) & 0x3f; 320 const(ubyte) d = (group >> 8) & 0x3f; 321 322 encodedByteGroup[0] = a.lookup_encoding(plusChar, slashChar); 323 encodedByteGroup[1] = b.lookup_encoding(plusChar, slashChar); 324 encodedByteGroup[2] = c.lookup_encoding(plusChar, slashChar); 325 encodedByteGroup[3] = d.lookup_encoding(plusChar, slashChar); 326 appender.put(encodedByteGroup[]); 327 } 328 329 // If it's a clean multiple of 3, then it requires no padding. 330 // If not, then we need to add padding. 331 if (input.length % 3 != 0) 332 { 333 window = input[window.length .. input.length]; 334 335 uint group = (window[0] << 24); 336 337 if (window.length == 1) { 338 const(ubyte) a = (group >> 26) & 0x3f; 339 const(ubyte) b = (group >> 20) & 0x3f; 340 encodedByteGroup[0] = a.lookup_encoding(plusChar, slashChar); 341 encodedByteGroup[1] = b.lookup_encoding(plusChar, slashChar); 342 encodedByteGroup[2] = '='; 343 encodedByteGroup[3] = '='; 344 } 345 else { 346 // Just in case 347 assert(window.length == 2); 348 349 group |= (window[1] << 16); 350 const(ubyte) a = (group >> 26) & 0x3f; 351 const(ubyte) b = (group >> 20) & 0x3f; 352 const(ubyte) c = (group >> 14) & 0x3f; 353 encodedByteGroup[0] = a.lookup_encoding(plusChar, slashChar); 354 encodedByteGroup[1] = b.lookup_encoding(plusChar, slashChar); 355 encodedByteGroup[2] = c.lookup_encoding(plusChar, slashChar); 356 encodedByteGroup[3] = '='; 357 } 358 359 appender.put(encodedByteGroup[]); 360 } 361 } 362 363 /// Test encoding of data which has a length that can be cleanly 364 /// encoded. 365 version(mir_test) 366 @safe pure unittest 367 { 368 // 3 bytes 369 { 370 enum data = cast(immutable(ubyte)[])"ABC"; 371 assert(data.encodeBase64 == "QUJD"); 372 } 373 374 // 6 bytes 375 { 376 enum data = cast(immutable(ubyte)[])"ABCDEF"; 377 assert(data.encodeBase64 == "QUJDREVG"); 378 } 379 380 // 9 bytes 381 { 382 enum data = cast(immutable(ubyte)[])"ABCDEFGHI"; 383 assert(data.encodeBase64 == "QUJDREVGR0hJ"); 384 } 385 386 // 12 bytes 387 { 388 enum data = cast(immutable(ubyte)[])"ABCDEFGHIJKL"; 389 assert(data.encodeBase64 == "QUJDREVGR0hJSktM"); 390 } 391 } 392 393 /// Test encoding of data which has a length which CANNOT be cleanly encoded. 394 /// This typically means that there's padding. 395 version(mir_test) 396 @safe pure unittest 397 { 398 // 1 byte 399 { 400 enum data = cast(immutable(ubyte)[])"A"; 401 assert(data.encodeBase64 == "QQ=="); 402 } 403 // 2 bytes 404 { 405 enum data = cast(immutable(ubyte)[])"AB"; 406 assert(data.encodeBase64 == "QUI="); 407 } 408 // 2 bytes 409 { 410 enum data = [0xFF, 0xFF]; 411 assert(data.encodeBase64 == "//8="); 412 } 413 // 4 bytes 414 { 415 enum data = [0xDE, 0xAD, 0xBA, 0xBE]; 416 assert(data.encodeBase64 == "3q26vg=="); 417 } 418 // 37 bytes 419 { 420 enum data = cast(immutable(ubyte)[])"A Very Very Very Very Large Test Blob"; 421 assert(data.encodeBase64 == "QSBWZXJ5IFZlcnkgVmVyeSBWZXJ5IExhcmdlIFRlc3QgQmxvYg=="); 422 } 423 } 424 425 /// Test nogc encoding 426 version(mir_test) 427 @safe pure @nogc unittest 428 { 429 import mir.appender : scopedBuffer; 430 431 { 432 enum data = cast(immutable(ubyte)[])"A Very Very Very Very Large Test Blob"; 433 auto appender = scopedBuffer!char(); 434 data.encodeBase64(appender); 435 assert(appender.data == "QSBWZXJ5IFZlcnkgVmVyeSBWZXJ5IExhcmdlIFRlc3QgQmxvYg=="); 436 } 437 438 { 439 enum data = cast(immutable(ubyte)[])"abc123!?$*&()'-=@~"; 440 auto appender = scopedBuffer!char(); 441 data.encodeBase64(appender); 442 assert(appender.data == "YWJjMTIzIT8kKiYoKSctPUB+"); 443 } 444 } 445 446 /// Make sure we can decode what we encode. 447 version(mir_test) 448 @safe pure unittest 449 { 450 // Test an example string 451 { 452 enum data = cast(immutable(ubyte)[])"abc123!?$*&()'-=@~"; 453 assert(data.encodeBase64.decodeBase64 == data); 454 } 455 // Test an example from Ion data 456 { 457 enum data = cast(immutable(ubyte)[])"a b c d e f g h i j k l m n o p q r s t u v w x y z"; 458 assert(data.encodeBase64.decodeBase64 == data); 459 } 460 } 461