The OpenD Programming Language

1 /++
2 $(H1 @nogc Simple Base64 parsing)
3 
4 License: $(HTTP www.apache.org/licenses/LICENSE-2.0, Apache-2.0)
5 Authors: Harrison Ford
6 Copyright: 2021 Harrison Ford, Symmetry Investments
7 +/
8 module mir.base64;
9 
10 import mir.exception: toMutable;
11 
12 package static immutable base64DecodeInvalidCharMsg = "base64: Invalid character encountered.";
13 package static immutable base64DecodeInvalidLenMsg = "Cannot decode a buffer with given length (not a multiple of 4, missing padding?)";
14 version(D_Exceptions) {
15     package static immutable base64DecodeInvalidCharException = new Exception(base64DecodeInvalidCharMsg);
16     package static immutable base64DecodeInvalidLenException = new Exception(base64DecodeInvalidLenMsg);
17 }
18 
19 // NOTE: I do not know if this would work on big-endian systems.
20 // Needs further testing to figure out if it *does* work on them.
21 
22 // Technique borrowed from http://0x80.pl/notesen/2016-01-12-sse-base64-encoding.html#branchless-code-for-lookup-table
23 private char lookup_encoding(ubyte i, char plusChar = '+', char slashChar = '/') @safe @nogc pure {
24     assert(i < 64);
25 
26     ubyte shift;
27 
28     if (i < 26)
29     {
30         // range A-Z
31         shift = 'A';
32     }
33     else if (i >= 26 && i < 52)
34     {
35         // range a-z
36         shift = 'a' - 26;
37     }
38     else if (i >= 52 && i < 62)
39     {
40         // range 0-9
41         shift = cast(ubyte)('0' - 52);
42     }
43     else if (i == 62)
44     {
45         // character plus
46         shift = cast(ubyte)(plusChar - 62);
47     }
48     else if (i == 63)
49     {
50         // character slash
51         shift = cast(ubyte)(slashChar - 63);
52     }
53 
54     return cast(char)(i + shift);
55 }
56 
57 // Do the inverse of above (convert an ASCII value into the Base64 character set)
58 private ubyte lookup_decoding(char i, char plusChar = '+', char slashChar = '/') @safe @nogc pure
59 {
60     // Branching bad, but this isn't performance sensitive
61     if (i <= 'Z' && i >= 'A') {
62         return cast(ubyte)(i - 'A');
63     }
64     else if (i <= 'z' && i >= 'a') {
65         return cast(ubyte)(i - 'a' + 26); 
66     }
67     else if (i <= '9' && i >= '0') {
68         return cast(ubyte)(i - '0' + 52);
69     }
70     else if (i == plusChar) {
71         return 62;
72     }
73     else if (i == slashChar) {
74         return 63;
75     }
76     // Just return 0 for padding,
77     // as it typically means nothing.
78     else if (i == '=') {
79         return 0;
80     }
81     else {
82         version(D_Exceptions) {
83             throw base64DecodeInvalidCharException.toMutable;
84         } else {
85             assert(0, base64DecodeInvalidCharMsg);
86         }
87     }
88 
89 }
90 
91 /++
92 Decode a Base64 encoded value, returning the buffer.
93 +/
94 ubyte[] decodeBase64(scope const(char)[] data, char plusChar = '+', char slashChar = '/') @safe pure
95 {
96     import mir.appender : scopedBuffer;
97     auto app = scopedBuffer!ubyte;
98     decodeBase64(data, app, plusChar, slashChar);
99     return app.data.dup;
100 }
101 
102 /++
103 Decode a Base64 encoded value, placing the result onto an Appender.
104 +/
105 void decodeBase64(Appender)(scope const(char)[] data,
106                             scope ref Appender appender,
107                             char plusChar = '+',
108                             char slashChar = '/') @safe pure
109 {
110     // We expect data should be well-formed (with padding),
111     // so we should throw if it is not well-formed.
112     if (data.length % 4 != 0)
113     {
114         version(D_Exceptions) {
115             throw base64DecodeInvalidLenException.toMutable;
116         } else {
117             assert(0, base64DecodeInvalidLenMsg);
118         }
119     }
120     
121     ubyte[3] decodedByteGroup;
122     ubyte sz = 0;
123 
124     // We can't use mir.ndslice.chunk.chunks here, as it violates
125     // the scope requirements.
126     for (size_t i = 0; i < data.length; i += 4)
127     {
128         auto group = data[i .. (i + 4)];
129 
130         ubyte[4] decodedBytes;
131         decodedBytes[0] = lookup_decoding(group[0], plusChar, slashChar);
132         decodedBytes[1] = lookup_decoding(group[1], plusChar, slashChar);
133 
134         uint transformed_group = (decodedBytes[0] << 26) | (decodedBytes[1] << 20);
135 
136         // According to RFC4648 Section 3.3, we don't have to accept extra padding characters,
137         // and we can safely throw (and stay within spec).
138         // x=== is also invalid, so we can just throw on that here.
139         if (group[0] == '=' || group[1] == '=')
140         {
141             version(D_Exceptions)
142                 throw base64DecodeInvalidCharException.toMutable;
143             else
144                 assert(0, base64DecodeInvalidCharMsg);
145         }
146 
147         // xx=(=)?
148         if (group[2] == '=')
149         {
150             // If we are not at the end of a string, according to RFC4648,
151             // we can safely treat a padding character as "non-alphabet data",
152             // and as such, we should throw. See RFC4648 Section 3.3 for more information
153             if ((i / 4) != ((data.length / 4) - 1))
154             {
155                 version(D_Exceptions)
156                     throw base64DecodeInvalidCharException.toMutable;
157                 else
158                     assert(0, base64DecodeInvalidCharMsg);
159             }
160 
161             if (group[3] == '=')
162             {
163                 // xx==
164                 sz = 1;
165             }
166             // xx=x (invalid)
167             // Padding should not be in the middle of a chunk
168             else
169             {
170                 version(D_Exceptions)
171                     throw base64DecodeInvalidCharException.toMutable;
172                 else
173                     assert(0, base64DecodeInvalidCharMsg);
174             }
175         }
176         // xxx=
177         else if (group[3] == '=')
178         {
179             // If we are not at the end of a string, according to RFC4648,
180             // we can safely treat a padding character as "non-alphabet data",
181             // and as such, we should throw. See RFC4648 Section 3.3 for more information
182             if ((i / 4) != ((data.length / 4) - 1))
183             {
184                 version(D_Exceptions)
185                     throw base64DecodeInvalidCharException.toMutable;
186                 else
187                     assert(0, base64DecodeInvalidCharMsg);
188             }
189 
190             decodedBytes[2] = lookup_decoding(group[2], plusChar, slashChar);
191             transformed_group |= (decodedBytes[2] << 14);
192             sz = 2;
193         }
194         // xxxx
195         else 
196         {
197             decodedBytes[2] = lookup_decoding(group[2], plusChar, slashChar);
198             decodedBytes[3] = lookup_decoding(group[3], plusChar, slashChar);
199             transformed_group |= ((decodedBytes[2] << 14) | (decodedBytes[3] << 8)); 
200             sz = 3;
201         }
202 
203         decodedByteGroup[0] = (transformed_group >> 24) & 0xff;
204         decodedByteGroup[1] = (transformed_group >> 16) & 0xff;
205         decodedByteGroup[2] = (transformed_group >> 8) & 0xff;
206 
207         // Only emit the transformed bytes that we got data for. 
208         appender.put(decodedByteGroup[0 .. sz]);
209     }
210 }
211 
212 /// Test decoding of data which has a length which can be
213 /// cleanly decoded.
214 version(mir_test)
215 @safe pure unittest
216 {
217     {
218         enum data = "QUJD";
219         assert(data.decodeBase64 == "ABC");
220     }
221 
222     {
223         enum data = "QQ==";
224         assert(data.decodeBase64 == "A");
225     }
226 
227     {
228         enum data = "YSBiIGMgZCBlIGYgZyBoIGkgaiBrIGwgbSBuIG8gcCBxIHIgcyB0IHUgdiB3IHggeSB6";
229         assert(data.decodeBase64 == "a b c d e f g h i j k l m n o p q r s t u v w x y z");
230     }
231 
232     {
233         enum data = "LCAuIDsgLyBbICcgXSBcID0gLSAwIDkgOCA3IDYgNSA0IDMgMiAxIGAgfiAhIEAgIyAkICUgXiAmICogKCApIF8gKyB8IDogPCA+ID8=";
234         assert(data.decodeBase64 == ", . ; / [ ' ] \\ = - 0 9 8 7 6 5 4 3 2 1 ` ~ ! @ # $ % ^ & * ( ) _ + | : < > ?");
235     }
236 
237     {
238         enum data = "AAA=";
239         assert(data.decodeBase64 == "\x00\x00");
240     }
241 
242     {
243         enum data = "AAAABBCC";
244         assert(data.decodeBase64 == "\x00\x00\x00\x04\x10\x82");
245     }
246 
247     {
248         enum data = "AA==";
249         assert(data.decodeBase64 == "\x00");
250     }
251     
252     {
253         enum data = "AA/=";
254         assert(data.decodeBase64 == "\x00\x0f");
255     }
256 }
257 
258 /// Test decoding invalid data
259 version(mir_test)
260 @safe pure unittest
261 {
262     void testFail(const(char)[] input) @safe pure
263     {
264         bool thrown = false;
265         try {
266             ubyte[] decoded = input.decodeBase64;
267         } catch (Exception t) {
268             thrown = true;
269         }
270 
271         assert(thrown);
272     }
273 
274     testFail("===A");
275     testFail("A=");
276     testFail("AA=");
277     testFail("A=AA");
278     testFail("AA=A");
279     testFail("AA=A====");
280     testFail("=AAA");
281     testFail("AAA=QUJD");
282     // This fails because we don't allow extra padding (than what is necessary)
283     testFail("AA======");
284     // This fails because we don't allow padding before the end of the string (otherwise we'd have a side-channel)
285     testFail("QU==QUJD");
286     testFail("QU======QUJD");
287     // Invalid data that's out of the alphabet
288     testFail("!@##@@!@");
289 }
290 
291 /++
292 Encode a ubyte array as Base64, returning the encoded value.
293 +/
294 string encodeBase64(scope const(ubyte)[] buf, char plusChar = '+', char slashChar = '/') @safe pure
295 {
296     import mir.appender : scopedBuffer;
297     auto app = scopedBuffer!char;
298     encodeBase64(buf, app, plusChar, slashChar);
299     return app.data.idup;
300 }
301 
302 /++
303 Encode a ubyte array as Base64, placing the result onto an Appender.
304 +/
305 void encodeBase64(Appender)(scope const(ubyte)[] input,
306                             scope ref Appender appender,
307                             char plusChar = '+',
308                             char slashChar = '/') @safe pure
309 {
310     import core.bitop : bswap;
311     import mir.ndslice.topology : bytegroup, map;
312     // Slice our input array so that n % 3 == 0 (we have a multiple of 3) 
313     // If we have less then 3, then this is effectively a no-op (will result in a 0-length slice)
314     char[4] encodedByteGroup;
315     const(ubyte)[] window = input[0 .. input.length - (input.length % 3)];
316     foreach(group; window.bytegroup!(3, uint).map!bswap) {
317         const(ubyte) a = (group >> 26) & 0x3f;
318         const(ubyte) b = (group >> 20) & 0x3f;
319         const(ubyte) c = (group >> 14) & 0x3f;
320         const(ubyte) d = (group >> 8) & 0x3f;
321 
322         encodedByteGroup[0] = a.lookup_encoding(plusChar, slashChar);
323         encodedByteGroup[1] = b.lookup_encoding(plusChar, slashChar);
324         encodedByteGroup[2] = c.lookup_encoding(plusChar, slashChar);
325         encodedByteGroup[3] = d.lookup_encoding(plusChar, slashChar);
326         appender.put(encodedByteGroup[]);
327     }
328 
329     // If it's a clean multiple of 3, then it requires no padding.
330     // If not, then we need to add padding.
331     if (input.length % 3 != 0)
332     {
333         window = input[window.length .. input.length];
334 
335         uint group = (window[0] << 24);
336 
337         if (window.length == 1) {
338             const(ubyte) a = (group >> 26) & 0x3f;
339             const(ubyte) b = (group >> 20) & 0x3f;
340             encodedByteGroup[0] = a.lookup_encoding(plusChar, slashChar);
341             encodedByteGroup[1] = b.lookup_encoding(plusChar, slashChar);
342             encodedByteGroup[2] = '=';
343             encodedByteGroup[3] = '=';
344         }
345         else {
346             // Just in case 
347             assert(window.length == 2);
348 
349             group |= (window[1] << 16);
350             const(ubyte) a = (group >> 26) & 0x3f;
351             const(ubyte) b = (group >> 20) & 0x3f;
352             const(ubyte) c = (group >> 14) & 0x3f;
353             encodedByteGroup[0] = a.lookup_encoding(plusChar, slashChar);
354             encodedByteGroup[1] = b.lookup_encoding(plusChar, slashChar);
355             encodedByteGroup[2] = c.lookup_encoding(plusChar, slashChar);
356             encodedByteGroup[3] = '=';
357         }
358 
359         appender.put(encodedByteGroup[]);
360     }
361 }
362 
363 /// Test encoding of data which has a length that can be cleanly
364 /// encoded.
365 version(mir_test)
366 @safe pure unittest
367 {
368     // 3 bytes
369     {
370         enum data = cast(immutable(ubyte)[])"ABC";
371         assert(data.encodeBase64 == "QUJD");
372     }
373 
374     // 6 bytes
375     {
376         enum data = cast(immutable(ubyte)[])"ABCDEF";
377         assert(data.encodeBase64 == "QUJDREVG");
378     }
379 
380     // 9 bytes
381     {
382         enum data = cast(immutable(ubyte)[])"ABCDEFGHI";
383         assert(data.encodeBase64 == "QUJDREVGR0hJ");
384     }
385 
386     // 12 bytes
387     {
388         enum data = cast(immutable(ubyte)[])"ABCDEFGHIJKL";
389         assert(data.encodeBase64 == "QUJDREVGR0hJSktM");
390     }
391 }
392 
393 /// Test encoding of data which has a length which CANNOT be cleanly encoded.
394 /// This typically means that there's padding.
395 version(mir_test)
396 @safe pure unittest
397 {
398     // 1 byte 
399     {
400         enum data = cast(immutable(ubyte)[])"A";
401         assert(data.encodeBase64 == "QQ==");
402     }
403     // 2 bytes
404     {
405         enum data = cast(immutable(ubyte)[])"AB";
406         assert(data.encodeBase64 == "QUI=");
407     }
408     // 2 bytes
409     {
410         enum data = [0xFF, 0xFF];
411         assert(data.encodeBase64 == "//8=");
412     }
413     // 4 bytes
414     {
415         enum data = [0xDE, 0xAD, 0xBA, 0xBE];
416         assert(data.encodeBase64 == "3q26vg==");
417     }
418     // 37 bytes
419     {
420         enum data = cast(immutable(ubyte)[])"A Very Very Very Very Large Test Blob";
421         assert(data.encodeBase64 == "QSBWZXJ5IFZlcnkgVmVyeSBWZXJ5IExhcmdlIFRlc3QgQmxvYg==");
422     }
423 }
424 
425 /// Test nogc encoding
426 version(mir_test)
427 @safe pure @nogc unittest
428 {
429     import mir.appender : scopedBuffer;
430 
431     {
432         enum data = cast(immutable(ubyte)[])"A Very Very Very Very Large Test Blob";
433         auto appender = scopedBuffer!char();
434         data.encodeBase64(appender); 
435         assert(appender.data == "QSBWZXJ5IFZlcnkgVmVyeSBWZXJ5IExhcmdlIFRlc3QgQmxvYg==");     
436     }
437 
438     {
439         enum data = cast(immutable(ubyte)[])"abc123!?$*&()'-=@~";
440         auto appender = scopedBuffer!char();
441         data.encodeBase64(appender);
442         assert(appender.data == "YWJjMTIzIT8kKiYoKSctPUB+");
443     }
444 }
445 
446 /// Make sure we can decode what we encode.
447 version(mir_test)
448 @safe pure unittest
449 {
450     // Test an example string
451     {
452         enum data = cast(immutable(ubyte)[])"abc123!?$*&()'-=@~";
453         assert(data.encodeBase64.decodeBase64 == data);
454     }
455     // Test an example from Ion data
456     {
457         enum data = cast(immutable(ubyte)[])"a b c d e f g h i j k l m n o p q r s t u v w x y z";
458         assert(data.encodeBase64.decodeBase64 == data);
459     }
460 }
461