mir.deser.text.tokenizer source code

1 /+
2 Tokenizer to split up the contents of an Ion Text file into tokens
3 
4 Authors: Harrison Ford
5 +/
6 module mir.deser.text.tokenizer;
7 
8 import mir.deser.text.readers;
9 import mir.deser.text.skippers;
10 import mir.deser.text.tokens;
11 
/+
Create a tokenizer for a given UTF-8 string.

The string is NOT copied: the returned tokenizer stores the slice it is given
and reads directly from it, so the underlying buffer must stay valid and
unmodified for as long as the tokenizer is in use.

$(NOTE If this string is not a UTF-8 string, consider using the overload which accepts a UTF-16/UTF-32 string.)

Params:
    input = String to tokenize
Returns:
    [IonTokenizer] positioned at the start of `input`
+/
IonTokenizer tokenizeString(const(char)[] input) @safe @nogc pure {
    return IonTokenizer(input);
}
28 
29 /+
30 Tokenizer based off of how ion-go handles tokenization
31 +/
32 struct IonTokenizer {
33     /+ Our input range that we read from +/
34     const(char)[] input;
35 
36     /+ The current window that we're reading from (sliding window) +/
37     const(char)[] window;
38 
39     /+ The escape sequence that we're reading from the wire +/
40     char[4] escapeSequence; 
41 
42     /+ Bool specifying if we want to read through the contents of the current token +/
43     bool finished;
44 
45     /+ Current position within our input range +/
46     size_t position;
47 
48     /+ Current token that we're located on +/
49     IonTokenType currentToken;
50 
51     /+ 
52     Constructor
53     Params:
54         input = The input range to read over 
55     +/
56     this(const(char)[] input) @safe @nogc pure {
57         this.input = input;
58         this.finished = true;
59         resizeWindow(0);
60     }
61 
62     /+
63     Update the sliding window's beginning index
64     Params:
65         start = The beginning index to start at
66     +/
67     void resizeWindow(size_t start) @safe @nogc pure {
68         if (start > input.length) {
69             throw IonTokenizerErrorCode.cannotUpdateWindow.ionTokenizerException;
70         }
71 
72         window = input[start .. $];
73         this.position = start;
74     }
75 
76     /+
77     Clear out the escape sequence buffer.
78     +/
79     void resetEscapeBuffer() @safe @nogc pure {
80         this.escapeSequence[0] = '\0';
81         this.escapeSequence[1] = '\0';
82         this.escapeSequence[2] = '\0';
83         this.escapeSequence[3] = '\0';
84     }
85 
86     /+
87     Check whether we are at the end of our range
88     Returns:
89         true if end of file, false otherwise
90     +/
91     bool isEOF() @safe @nogc pure {
92         return this.window.length == 0
93                || this.currentToken == IonTokenType.TokenEOF 
94                || this.position >= this.input.length;
95     }
96 
97     /+ 
98     Unread a given character by moving the read position back by one character
99     Params:
100         c = Character being unread; only checked against 0 (the EOF marker), which is ignored.
101     +/
102     void unread(char c) @safe @nogc pure  {
103         if (this.position <= 0) {
104             throw IonTokenizerErrorCode.cannotUnreadAtPos0.ionTokenizerException;
105         }
106 
107         if (c == 0) {
108             return;
109         } else {
110             resizeWindow(this.position - 1);
111         }
112     }
113     // Test reading / unreading bytes
114     version(mir_ion_parser_test) unittest
115     {
116         auto t = tokenizeString("abc\rd\ne\r\n");
117 
118         t.testRead('a');
119         t.unread('a');
120 
121         t.testRead('a');
122         t.testRead('b');
123         t.testRead('c');
124         t.unread('c');
125         t.unread('b');
126 
127         t.testRead('b');
128         t.testRead('c');
129         t.testRead('\r');
130         t.unread('\r');
131 
132         t.testRead('\r');
133         t.testRead('d');
134         t.testRead('\n');
135         t.testRead('e');
136         t.testRead('\r');
137         t.testRead('\n');
138         t.testRead(0); // test EOF
139 
140         t.unread(0); // unread EOF
141         t.unread('\n');
142 
143         t.testRead('\n');
144         t.testRead(0); // test EOF
145         t.testRead(0); // test EOF
146     }
147 
148     /+ 
149     Skip a single character within our input range, and discard it 
150     Returns:
151         true if it was able to skip a single character,
152         false if it was unable (due to hitting an EOF or the like)
153     +/
154     bool skipOne() @safe @nogc pure  {
155         const(char) c = readInput();
156         if (c == 0) {
157             return false;
158         }
159         return true;
160     }
161 
162     /+
163     Skip exactly n input characters from the input range
164 
165     $(NOTE
166         This function will only return true IF it is able to skip *the entire amount specified*)
167     Params:
168         n = Number of characters to skip
169     Returns:
170         true if skipped the entire range,
171         false if unable to skip the full range specified.
172     +/
173     bool skipExactly(size_t n) @safe @nogc pure {
174         for (size_t i = 0; i < n; i++) {
175             if (!skipOne()) { 
176                 return false;
177             }
178         }
179         return true;
180     }
181 
182     /+
183     Read ahead at most n characters from the input range without discarding them.
184 
185     $(NOTE
186         This function does not require n characters to be present.
187         If it encounters an EOF, it will simply return a shorter range.)
188     Params:
189         wanted = Max number of characters to peek
190     Returns:
191         Array of peeked characters
192     +/
193     auto peekMax(size_t wanted = 4096) @safe @nogc pure {
194         size_t n = wanted; 
195         if (n >= window.length) {
196             n = window.length;
197         }
198 
199         auto arr = window[0 .. n];
200         return arr;
201     }
202 
203     /+
204     Read ahead exactly n characters from the input range without discarding them.
205 
206     $(NOTE
207         This function will throw if all n characters are not present.
208         If you would like to peek as many as possible, use [peekMax] instead.)
209     Params:
210         required = Number of characters to peek
211     Returns:
212         An array filled with n characters.
213     Throws:
214         [IonTokenizerException]
215     +/
216     auto peekExactly(size_t required = 4096) @safe @nogc pure {
217         size_t n = required; 
218         if (n > window.length) {
219             unexpectedEOF();
220         }
221 
222         auto buf = window[0 .. n];
223 
224         return buf;
225     }
    // Test peekExactly: exact-length peeks succeed without consuming input,
    // and requests longer than the remaining input throw.
    version(mir_ion_parser_test) unittest
    {
        import std.exception : assertThrown;
        import mir.exception : enforce;
        import mir.deser.text.tokens : IonTokenizerException;

        auto t = tokenizeString("abc\r\ndef");
        
        // The peeked slice aliases the window (no copy is made)
        assert(t.peekExactly(1).ptr == t.window.ptr);
        assert(t.peekExactly(1) == "a");
        assert(t.peekExactly(2) == "ab");
        assert(t.peekExactly(3) == "abc");

        t.testRead('a');
        t.testRead('b');
        
        // After consuming two characters the peek starts at 'c'
        assert(t.peekExactly(3).ptr == t.window.ptr);
        assert(t.peekExactly(3) == "c\r\n");
        assert(t.peekExactly(2) == "c\r");
        assert(t.peekExactly(3) == "c\r\n");

        t.testRead('c');
        t.testRead('\r');
        t.testRead('\n');
        t.testRead('d');

        // Only "ef" is left: asking for three characters must throw (repeatably)
        assertThrown!IonTokenizerException(t.peekExactly(3));
        assertThrown!IonTokenizerException(t.peekExactly(3));
        assert(t.peekExactly(2) == "ef");

        t.testRead('e');
        t.testRead('f');
        t.testRead(0);

        // Nothing is left at EOF
        assertThrown!IonTokenizerException(t.peekExactly(10));
    }
263 
264     /+
265     Read ahead one character from the input range without discarding it.
266 
267     $(NOTE
268         This function will throw if it cannot read one character ahead.
269         Use [peekMax] if you want to read without throwing.)
270     Returns:
271         A single character read ahead from the input range.
272     Throws:
273         [IonTokenizerException]
274     +/
275     char peekOne() @safe @nogc pure {
276         if (isEOF) {
277             this.unexpectedEOF();
278         }
279 
280         char c;
281         c = readInput();
282         unread(c);
283         
284         return c;
285     }
286     // Test peeking the next byte in the stream
287     version(mir_ion_parser_test) unittest
288     {
289         import std.exception : assertThrown;
290         import mir.deser.text.tokens : IonTokenizerException;
291 
292         auto t = tokenizeString("abc");
293 
294         t.testPeek('a');
295         t.testPeek('a');
296         t.testRead('a');
297 
298         t.testPeek('b');
299         t.unread('a');
300 
301         t.testPeek('a');
302         t.testRead('a');
303         t.testRead('b');
304         t.testPeek('c');
305         t.testPeek('c');
306         t.testRead('c');
307         
308         assertThrown!IonTokenizerException(t.peekOne() == 0);
309         assertThrown!IonTokenizerException(t.peekOne() == 0);
310         assert(t.readInput() == 0);
311     }
312 
313     /+
314     Read a single character from the front of the current window and advance the position by one
315 
316     $(NOTE `readInput` does NOT normalize CRLF to a simple new-line.)
317     Returns:
318         a single character from the input range, or 0 if the EOF is encountered.
319     Throws:
320         [IonTokenizerException]
321     +/
322     char readInput() @safe @nogc pure {
323         if (isEOF) {
324             return 0;
325         }
326 
327         immutable char c = this.window[0];
328         resizeWindow(this.position + 1);
329         /*
330         if (c == '\r') {
331             // EOFs should've been normalized at the first stage
332             throw Mir(IonTokenizerErrorCode.normalizeEOFFail);
333         }
334         */
335 
336         return c;
337     }
    // Test reading bytes off of a range: characters come back one at a time, in input order
    version(mir_ion_parser_test) unittest 
    {
        auto t = tokenizeString("abcdefghijklmopqrstuvwxyz1234567890");
        t.testRead('a');
        t.testRead('b');
        t.testRead('c');
        t.testRead('d');
        t.testRead('e');
        t.testRead('f');
        t.testRead('g');
        t.testRead('h');
        t.testRead('i');
    }
    // Test that CR / CRLF sequences are passed through unchanged:
    // each '\r' and '\n' is returned as its own character (no normalization)
    version(mir_ion_parser_test) unittest
    {
        auto t = tokenizeString("a\r\nb\r\nc\rd");
        t.testRead('a');
        t.testRead('\r');
        t.testRead('\n');
        t.testRead('b');
        t.testRead('\r');
        t.testRead('\n');
        t.testRead('c');
        t.testRead('\r');
        t.testRead('d');
        t.testRead(0);
    }
367 
368     /+
369     Skip any whitespace that is present between our current token and the next valid token.
370 
371     Additionally, skip comments (or fail on comments).
372 
373     $(NOTE `skipComments` and `failOnComment` cannot both be true.)
374     Returns:
375         The character located directly after the whitespace.
376     Throws:
377         [IonTokenizerException]
378     +/
379     char skipWhitespace(bool skipComments = true, bool failOnComment = false)() @safe @nogc pure 
380     if (skipComments != failOnComment || (skipComments == false && skipComments == failOnComment)) { // just a sanity check, we cannot skip comments and also fail on comments -- it is one or another (fail or skip)
381         while (true) {
382             char c = readInput();
383             sw: switch(c) {
384                 static foreach(member; ION_WHITESPACE) {
385                     case member:
386                         break sw;
387                 }
388                 
389                 case '/': {
390                     static if (failOnComment) {
391                         throw IonTokenizerErrorCode.commentsNotAllowed.ionTokenizerException; 
392                     } else static if(skipComments) {
393                         // Peek on the next letter, and check if it's a second slash / star
394                         // This may fail if we read a comment and do not find the end (newline / '*/')
395                         // Undetermined if I need to unread the last char if this happens?
396                         if (this.skipComment()) 
397                             break;
398                         else
399                             goto default;
400                     }
401                     else {
402                         return '/';
403                     }
404                 }
405                 // If this is a non-whitespace character, unread it
406                 default:
407                     return c;
408             }
409         }
410         return 0;
411     }
    // Test skipping over whitespace (default mode: comments are skipped as well)
    version(mir_ion_parser_test) unittest
    {
        import std.exception : assertNotThrown;
        import mir.exception : enforce;
        import mir.deser.text.tokens : IonTokenizerException;
        // Tokenize `txt` and check the first character returned after whitespace/comments
        void test(string txt, char expectedChar) {
            auto t = tokenizeString(txt);
            assertNotThrown!IonTokenizerException(
                enforce!"skipWhitespace did not return expected character"(t.skipWhitespace() == expectedChar)
            );
        }

        // A lone '/' that does not start a comment is returned as-is
        test("/ 0)", '/');
        test("xyz_", 'x');
        test(" / 0)", '/');
        test(" xyz_", 'x');
        test(" \t\r\n / 0)", '/');
        // Line comments and block comments are skipped along with whitespace
        test("\t\t  // comment\t\r\n\t\t  x", 'x');
        test(" \r\n /* comment *//* \r\n comment */x", 'x');
    }
433 
434     /+
435     Skip whitespace within a clob/blob. 
436 
437     This function is just a wrapper around skipWhitespace with both comment skipping and its "fail on comment" mode
438     turned off, so a '/' is handed back to the caller unchanged instead of being treated as the start of a comment.
439     Returns:
440         a character located after the whitespace within a clob/blob
441     Throws:
442         IonTokenizerException only if reading from the input fails; a '/' does not cause a throw here
443     +/
444     char skipLobWhitespace() @safe @nogc pure {
445         return skipWhitespace!(false, false);
446     }
    // Test skipping over whitespace within a (c|b)lob
    version(mir_ion_parser_test) unittest
    {
        import std.exception : assertNotThrown;
        import mir.exception : enforce;
        import mir.deser.text.tokens : IonTokenizerException;
        // Tokenize `txt` and check the first character returned after the whitespace
        void test(string txt, char expectedChar)() {
            auto t = tokenizeString(txt);
            assertNotThrown!IonTokenizerException(
                enforce!"Lob whitespace did not match expected character"(t.skipLobWhitespace() == expectedChar)
            );
        }

        // "//" is not treated as a comment here: the first '/' is returned and nothing is thrown
        test!("//=", '/');
        test!("xyz_", 'x');
        test!(" //=", '/');
        test!(" xyz_", 'x');
        test!("\r\n\t//=", '/');
        test!("\r\n\txyz_", 'x');
    }
467 
468     /+
469     Check if the next characters within the input range are a double colon, representing an annotation.
470     Returns:
471         true if it finds a double colon, false if it does not.
472     +/
473     bool isDoubleColon() @safe @nogc pure {
474         char c = skipWhitespace();
475         unread(c);
476 
477         auto cs = peekMax(2); 
478         if (cs.length == 2 && cs[0] == ':' && cs[1] == ':') {
479             return true;
480         }
481         return false;
482     }
483 
484     /+
485     Check if the next characters within the input range are the special "nan" type.
486     Params:
487         c = The last character read off of the stream (typically 'n')
488     Returns:
489         true if it is the nan type, false if it is not.
490     +/
491     bool isNAN(char c) @safe @nogc pure {
492         if (c != 'n') return false;
493 
494         auto cs = peekMax(4);
495 
496         if (cs.length < 2 || cs[0] != 'a' || cs[1] != 'n') {
497             return false;
498         }
499         
500         if (cs.length == 2) { // is this just 'an' + EOF?
501             skipExactly(2);
502             return true;
503         } else if (cs.length == 3 && isStopChar(cs[2])) { // is this 'an' + stop char
504             skipExactly(2);
505             return true;
506         // is this 'an' + comment (block or regular)
507         } else if ((cs.length > 2 && cs[2] == '/') && cs.length > 3 && (cs[3] == '/' || cs[3] == '*')) {
508             skipExactly(2);
509             return true;
510         }
511 
512         return false;
513     }
514     // Test scanning for nan
515     version(mir_ion_parser_test) unittest
516     {
517         void test(string txt, bool nan, char after) {
518             auto t = tokenizeString(txt);
519             auto c = t.readInput();
520             assert(t.isNAN(c) == nan);
521             assert(t.readInput() == after);
522         }
523         
524         test("nan", true, 0);
525         test("nan/*", true, '/');
526         test("nan\t", true, '\t');
527         test("nan\n", true, '\n');
528         test("nan ", true, ' ');
529 
530         test("-nan", false, 'n');
531         test("+nan", false, 'n');
532         test("nat\t", false, 'a');
533         test("nat/*", false, 'a');
534         test("nat//", false, 'a');
535         test("na", false, 'a');
536         test("n", false, 0);
537     }
538 
539 
540     /+
541     Check if the next characters within the input range are the special "infinity" type.
542 
543     Params:
544         c = The last character read off of the stream (typically '+' or '-')
545     Returns:
546         true if it is the infinity type, false if it is not.
547     +/
548     bool isInfinity(char c) @safe @nogc pure {
549         if (c != '+' && c != '-') return false;
550 
551         auto cs = peekMax(5);
552 
553         if (cs.length < 3 || cs[0] != 'i' || cs[1] != 'n' || cs[2] != 'f') {
554             return false;
555         }
556 
557         if (cs.length == 3) {
558             skipExactly(3);
559             return true;
560         } else if (cs.length > 3 && isStopChar(cs[3])) { // cleanly terminated with a stop char
561             skipExactly(3);
562             return true;
563         } else if ((cs.length > 3 && cs[3] == '/') && cs.length > 4 && (cs[4] == '/' || cs[4] == '*')) {
564             skipExactly(3);
565             return true;
566         }
567 
568         return false;
569     }
    // Test scanning for inf
    version(mir_ion_parser_test) unittest
    {
        // Read the sign, run isInfinity on it, then check which character is read next
        void test(string txt, bool inf, char after) {
            auto t = tokenizeString(txt);
            auto c = t.readInput();
            assert(t.isInfinity(c) == inf);
            assert(t.readInput() == after);
        }
        
        // Recognized: "inf" is consumed and the terminator is the next character read
        test("+inf", true, 0);
        test("-inf", true, 0);
        test("+inf ", true, ' ');
        test("-inf\t", true, '\t');
        test("-inf\n", true, '\n');
        test("+inf,", true, ',');
        test("-inf}", true, '}');
        test("+inf)", true, ')');
        test("-inf]", true, ']');
        test("+inf//", true, '/');
        test("+inf/*", true, '/');

        // Not recognized: nothing is consumed, so the next read is the character after the sign
        test("+inf/", false, 'i');
        test("-inf/0", false, 'i');
        test("+int//", false, 'i');
        test("+int/*", false, 'i');
        test("+int", false, 'i');
        test("-iot", false, 'i');
        test("+unf", false, 'u');
        test("_inf", false, 'i');

        // Too short to be "inf"
        test("-in", false, 'i');
        test("+i", false, 'i');
        test("+", false, 0);
        test("-", false, 0);
    }
606 
607     /+
608     Check if the current character selected is part of a triple quote (''')
609 
610     $(NOTE This function will not throw if an EOF is hit. It will simply return false.)
611     Returns:
612         true if the character is part of a triple quote,
613         false if it is not.
614     +/
615     bool isTripleQuote() @safe @nogc pure {
616         auto cs = peekMax(2);
617 
618         // If the next two characters are '', then it is a triple-quote.
619         if (cs.length == 2 && cs[0] == '\'' && cs[1] == '\'') { 
620             skipExactly(2);
621             return true;
622         }
623 
624         return false;
625     }
626 
627     /+
628     Check if the current character selected is part of a whole number.
629 
630     If it is part of a whole number, then return the type of number (hex, binary, timestamp, number)
631     Params:
632         c = The last character read from the range
633     Returns:
634         the corresponding number type (or invalid)
635     +/
636     IonTokenType scanForNumber(char c) @safe @nogc pure 
637     in {
638         assert(isDigit(c), "Scan for number called with non-digit number");
639     } do {
640         const(char)[] cs;
641         try {
642             cs = peekMax(4);
643         } catch(IonTokenizerException e) {
644             return IonTokenType.TokenInvalid;
645         }
646 
647         // Check if the first character is a 0, then check if the next character is a radix identifier (binary / hex)
648         if (c == '0' && cs.length > 0) {
649             switch(cs[0]) {
650                 case 'b':
651                 case 'B':
652                     return IonTokenType.TokenBinary;
653                 
654                 case 'x':
655                 case 'X':
656                     return IonTokenType.TokenHex;
657                 
658                 default:
659                     break;
660             }
661         }
662 
663         // Otherwise, it's not, and we check if it's a timestamp or just a plain number.
664         if (cs.length == 4) {
665             if (!isDigit(cs[0])) return IonTokenType.TokenNumber;
666 
667             // "time-of-day" extension
668             if (cs[1] == ':')
669             {
670                 return IonTokenType.TokenTimestamp;
671             }
672             else
673             {
674                 foreach(i; 1 .. 3)
675                 {
676                     if (!isDigit(cs[i])) return IonTokenType.TokenNumber;
677                 }
678                 if (cs[3] == '-' || cs[3] == 'T') {
679                     return IonTokenType.TokenTimestamp;
680                 }
681             }
682 
683         }
684         return IonTokenType.TokenNumber;
685 
686     }
    // Test scanning for numbers 
    version(mir_ion_parser_test) unittest
    {
        import mir.deser.text.tokens : IonTokenType;

        // Read the first digit, then classify the number from the characters that follow it
        void test(string txt, IonTokenType expectedToken) {
            auto t = tokenizeString(txt);
            auto c = t.readInput();
            assert(t.scanForNumber(c) == expectedToken);
        }

        // Radix prefixes are only recognized after a leading '0'
        test("0b0101", IonTokenType.TokenBinary);
        test("0B", IonTokenType.TokenBinary);
        test("0xABCD", IonTokenType.TokenHex);
        test("0X", IonTokenType.TokenHex);
        // Four digits followed by '-' or 'T' make a timestamp
        test("0000-00-00", IonTokenType.TokenTimestamp);
        test("0000T", IonTokenType.TokenTimestamp);

        // Everything else is a plain number
        test("0", IonTokenType.TokenNumber);
        test("1b0101", IonTokenType.TokenNumber);
        test("1B", IonTokenType.TokenNumber);
        test("1x0101", IonTokenType.TokenNumber);
        test("1X", IonTokenType.TokenNumber);
        test("1234", IonTokenType.TokenNumber);
        test("12345", IonTokenType.TokenNumber);
        test("1,23T", IonTokenType.TokenNumber);
        test("12,3T", IonTokenType.TokenNumber);
        test("123,T", IonTokenType.TokenNumber);
    }
716 
717     /+
718     Set the current token, and if we want to go into the token.
719     Params:
720         token = The updated token type
721         finished = Whether or not we want to go into the token (and parse it)
722     +/
723     void ok(IonTokenType token, bool finished) @safe @nogc pure {
724         this.currentToken = token;
725         this.finished = finished;
726     }
727 
728     /+
729     Read the next token from the range.
730     Returns:
731         true if it was able to read a valid token from the range.
732     +/
    bool nextToken() @safe @nogc pure {
        char c;
        // if we're NOT finished with the current value, skip over the rest of it first;
        // the value returned by skipValue is used as the first character of the next token.
        // otherwise, just skip any whitespace (and comments) in front of the next token.
        if (!this.finished) {
            c = this.skipValue();
        } else {
            c = skipWhitespace();
        }

        // NOTE: these variable declarations are up here
        // since we would miss them within the switch decl.

        // have we hit an inf?
        bool inf;

        // second character
        char cs;
        
        // Dispatch on the first character of the token. Every branch records the
        // token type and the `finished` flag through ok() before returning.
        with(IonTokenType) switch(c) {
            // 0 is what readInput reports at EOF
            case 0:
                ok(TokenEOF, false);
                return true;
            case ':':
                // "::" is a double colon; peekOne throws if ':' is the last character
                cs = peekOne();
                if (cs == ':') {
                    skipOne();
                    ok(TokenDoubleColon, true);
                } else {
                    ok(TokenColon, true);
                }
                return true;
            case '{': 
                // "{{" opens a double brace; peekOne throws if '{' is the last character
                cs = peekOne();
                if (cs == '{') {
                    skipOne();
                    ok(TokenOpenDoubleBrace, false);
                } else {
                    ok(TokenOpenBrace, false);
                }
                return true;
            case '}':
                ok(TokenCloseBrace, true);
                return true;
            case '[':
                ok(TokenOpenBracket, false);
                return true;
            case ']':
                ok(TokenCloseBracket, false);
                return true;
            case '(':
                ok(TokenOpenParen, false);
                return true;
            case ')':
                ok(TokenCloseParen, false);
                return true;
            case ',':
                ok(TokenComma, true);
                return true;
            case '.':
                cs = peekOne();
                // NOTE: peekOne leaves `cs` in the input, and unread only steps the
                // position back by one (it looks at its argument solely to ignore 0),
                // so the unread(cs) calls below put the '.' itself back.
                if (isOperatorChar(cs)) {
                    unread(cs);
                    ok(TokenSymbolOperator, false);
                    return true;
                }

                if (cs == ' ' || isIdentifierPart(cs)) {
                    unread(cs);
                }
                ok(TokenDot, true);
                return true;
            case '\'':
                // isTripleQuote consumes the two extra quotes when it matches
                if (isTripleQuote()) {
                    ok(TokenLongString, false);
                    return true;
                }
                ok(TokenSymbolQuoted, false);
                return true;
            case '+':
                // isInfinity consumes "inf" when it matches
                inf = isInfinity(c);
                if (inf) {
                    ok(TokenFloatInf, true);
                    return true;
                }
                // otherwise put the '+' back and report an operator symbol
                unread(c);
                ok(TokenSymbolOperator, false);
                return true;
            case '-':
                cs = peekOne();
                if (isDigit(cs)) {
                    // consume the digit so scanForNumber peeks at what follows it
                    skipOne();
                    IonTokenType tokenType = scanForNumber(cs);
                    if (tokenType == TokenTimestamp) {
                        throw IonTokenizerErrorCode.negativeTimestamp.ionTokenizerException;
                    }
                    // step back over the digit and the '-' so the token starts at the sign
                    unread(cs);
                    unread(c);
                    ok(tokenType, false);
                    return true;
                }

                inf = isInfinity(c);
                if (inf) {
                    ok(TokenFloatMinusInf, true);
                    return true;
                }
                unread(c);
                ok(TokenSymbolOperator, false);
                return true;

           // Remaining operator characters ('+', '-', '"' and '.' have their own cases)
           static foreach(member; ION_OPERATOR_CHARS) {
                static if (member != '+' && member != '-' && member != '"' && member != '.') {
                    case member:
                        unread(c);
                        ok(TokenSymbolOperator, false);
                        return true;
                }
            }

            case '"':
                ok(TokenString, false);
                return true;

            // Identifier start characters; 'n' additionally gets a chance to be "nan"
            static foreach(member; ION_IDENTIFIER_START_CHARS) {
                case member:
                    static if (member == 'n') {
                        // isNAN consumes the "an" when it matches
                        if (isNAN(c)) {
                            ok(TokenFloatNaN, false);
                            return true;
                        }
                    }
                    unread(c);
                    ok(TokenSymbol, false);
                    return true;
            } 

            // Digits: classify the number, then put the first digit back
            static foreach(member; ION_DIGITS) {
                case member:
                    IonTokenType t = scanForNumber(c);
                    unread(c);
                    ok(t, false);
                    return true;
            }

            // Anything else cannot start a token; unexpectedChar throws
            default:
                unexpectedChar(c);
                return false;
        }
    }
883 
884     /+
885     Finish reading the current token, and skip to the end of it.
886     
887     This function will only work if we are in the middle of reading a token.
888     Returns:
889         false if we already finished with a token,
890         true if we were able to skip to the end of it.
891     Throws:
892         IonTokenizerException if we were not able to skip to the end.
893     +/
894     bool finish() @safe @nogc pure {
895         if (finished) {
896             return false;
897         }
898 
899         immutable char c = this.skipValue();
900         unread(c);
901         finished = true;
902         return true;
903     }
904 
905     /+
906     Check if the given character is a "stop" character.
907 
908     Stop characters are typically terminators of objects, but here we overload and check if there's a comment after our character.
909     Params:
910         c = The last character read from the input range.
911     Returns:
912         true if the character is the "stop" character.
913     +/
914     bool isStopChar(char c) @safe @nogc pure {
915         if (mir.deser.text.tokens.isStopChar(c)) { // make sure
916             return true;
917         }
918 
919         if (c == '/') {
920             const(char) c2 = peekOne();
921             if (c2 == '/' || c2 == '*') {
922                 return true;
923             }
924         }
925 
926         return false;
927     }
928 
929     /+
930     Helper to generate a thrown exception (if an unexpected character is hit)
931     +/
932     void unexpectedChar(char c, size_t pos = -1, string file = __FILE__, int line = __LINE__) @safe @nogc pure {
933         static if (__traits(compiles, ()@nogc { throw new Exception(""); }))
934             throw new IonTokenizerException(c ? IonTokenizerErrorCode.unexpectedCharacter : IonTokenizerErrorCode.unexpectedEOF, file, line);
935         else
936             throw ionTokenizerException(c ? IonTokenizerErrorCode.unexpectedCharacter : IonTokenizerErrorCode.unexpectedEOF, /+file, line+/);
937     }
938 
939     /+
940     Helper to throw if an unexpected end-of-file is hit.
941     +/
942     void unexpectedEOF(size_t pos = -1, string file = __FILE__, int line = __LINE__) @safe @nogc pure {
943         if (pos == -1)
944             pos = this.position;
945         unexpectedChar(0, pos, file, line);
946     }
947 
948     /+
949     Ensure that the next item in the range fulfills the predicate given.
950     Params:
951         pred = A predicate that the next character in the range must fulfill
952     Throws:
953         [IonTokenizerException] if the predicate is not fulfilled
954     +/
955     template expect(alias pred = "a", bool noRead = false) {
956         import mir.functional : naryFun;
957         static if (noRead) {
958             char expect(char c, string file = __FILE__, int line = __LINE__) @trusted @nogc pure {
959                 if (!naryFun!pred(c)) {
960                     unexpectedChar(c, -1, file, line);
961                 }
962 
963                 return c;
964             }
965         } else {
966             char expect(string file = __FILE__, int line = __LINE__) @trusted @nogc pure {
967                 char c = readInput();
968                 if (!naryFun!pred(c)) {
969                     unexpectedChar(c, -1, file, line);
970                 }
971 
972                 return c;
973             }
974         }
975     }
976     // Text expect()
977     version(mir_ion_parser_test) unittest
978     {
979         import mir.deser.text.tokens : IonTokenizerException, isHexDigit;
980 
981         void testIsHex(string ts) {
982             auto t = tokenizeString(ts);
983             while (!t.isEOF) {
984                 import std.exception : assertNotThrown;
985                 assertNotThrown!IonTokenizerException(t.expect!(isHexDigit));
986             }
987         }
988 
989         void testFailHex(string ts) {
990             auto t = tokenizeString(ts);
991             while (!t.isEOF) {
992                 import std.exception : assertThrown;
993                 assertThrown!IonTokenizerException(t.expect!(isHexDigit));
994             }
995         }
996 
997         testIsHex("1231231231");
998         testIsHex("BADBAB3");
999         testIsHex("F00BAD");
1000         testIsHex("420");
1001         testIsHex("41414141");
1002         testIsHex("BADF00D");
1003         testIsHex("BaDf00D");
1004         testIsHex("badf00d");
1005         testIsHex("AbCdEf123");
1006 
1007         testFailHex("HIWORLT");
1008         testFailHex("Tst");
1009     }
1010 
1011     /+
1012     Ensure that the next item in the range does NOT fulfill the predicate given.
1013 
1014     This is the opposite of `expect` - which expects that the predicate is fulfilled.
1015     However, for all intents and purposes, the functionality of `expectFalse` is identical to `expect`.
1016     Params:
1017         pred = A predicate that the next character in the range must NOT fulfill.
1018     Throws:
1019         [IonTokenizerException] if the predicate is fulfilled.
1020     +/
1021     template expectFalse(alias pred = "a", bool noRead = false, string file = __FILE__, int line = __LINE__) {
1022         import mir.functional : naryFun;
1023         static if (noRead) {
1024             char expectFalse(char c) @trusted @nogc pure {
1025                 if (naryFun!pred(c)) {
1026                     unexpectedChar(c, -1, file, line);
1027                 }
1028 
1029                 return c;
1030             }
1031         } else {
1032             char expectFalse() @trusted @nogc pure {
1033                 char c = readInput();
1034                 if (naryFun!pred(c)) {
1035                     unexpectedChar(c, -1, file, line);
1036                 }
1037 
1038                 return c;
1039             }
1040         }
1041     }
1042 }
1043 
/+
Unit-test helper: read one character with `readInput` and verify it.

Params:
    t = Object providing `readInput` (the tokenizer under test)
    expected = The character that `readInput` must return
    file = Source file attached to the error (defaults to the caller's file)
    line = Source line attached to the error (defaults to the caller's line)
Throws:
    `MirError` with the message "Expected <expected> but got <actual>" on a mismatch.
+/
void testRead(T)(ref T t, char expected, string file = __FILE__, int line = __LINE__) {
    import mir.exception : MirError;
    import mir.format : stringBuf, print;

    char actual = t.readInput();
    if (actual == expected) {
        return;
    }

    // Mismatch: build the diagnostic and fail at the caller's location.
    auto message = stringBuf;
    message.print("Expected ", expected, " but got ", actual);
    throw new MirError(message.data, file, line);
}
1057 
/+
Unit-test helper: look at one character with `peekOne` and verify it.

Params:
    t = Object providing `peekOne` (the tokenizer under test)
    expected = The character that `peekOne` must return
    file = Source file attached to the error (defaults to the caller's file)
    line = Source line attached to the error (defaults to the caller's line)
Throws:
    `MirError` with the message "Expected <expected> but got <actual>" on a mismatch.
+/
void testPeek(T)(ref T t, char expected, string file = __FILE__, int line = __LINE__) {
    import mir.exception : MirError;
    import mir.format : stringBuf, print;

    char actual = t.peekOne();
    if (actual == expected) {
        return;
    }

    // Mismatch: build the diagnostic and fail at the caller's location.
    auto message = stringBuf;
    message.print("Expected ", expected, " but got ", actual);
    throw new MirError(message.data, file, line);
}
The OpenD Programming Language