The OpenD Programming Language

1 /*
2     Implementation of std.regex IR, an intermediate representation
3     of a regular expression pattern.
4 
5     This is a common ground between frontend regex component (parser)
6     and backend components - generators, matchers and other "filters".
7 */
8 module std.regex.internal.ir;
9 
10 package(std.regex):
11 
12 import std.exception, std.meta, std.range.primitives, std.traits, std.uni;
13 
14 debug(std_regex_parser) import std.stdio;
15 // just a common trait, may be moved elsewhere
16 alias BasicElementOf(Range) = Unqual!(ElementEncodingType!Range);
17 
18 enum privateUseStart = '\U000F0000', privateUseEnd ='\U000FFFFD';
19 
20 // heuristic value determines maximum CodepointSet length suitable for linear search
21 enum maxCharsetUsed = 6;
22 
23 // another variable to tweak behavior of caching generated Tries for character classes
24 enum maxCachedMatchers = 8;
25 
26 alias Trie = CodepointSetTrie!(13, 8);
27 alias makeTrie = codepointSetTrie!(13, 8);
28 
29 CharMatcher[CodepointSet] matcherCache;
30 
31 //accessor with caching
32 @trusted CharMatcher getMatcher(CodepointSet set)
33 {
34     // almost all properties of AA are not @safe
35     // https://issues.dlang.org/show_bug.cgi?id=6357
36     if (__ctfe || maxCachedMatchers == 0)
37         return CharMatcher(set);
38     else
39     {
40         auto p = set in matcherCache;
41         if (p)
42             return *p;
43         if (matcherCache.length == maxCachedMatchers)
44         {
45             // flush enmatchers in trieCache
46             matcherCache = null;
47         }
48         return (matcherCache[set] = CharMatcher(set));
49     }
50 }
51 
52 // Force pure because that is needed
53 // Templated so that we don't pull in std.uni wordCharacter unnecessarily.
54 @property ref wordMatcher()() pure
55 {
56     static auto actual()
57     {
58         static CharMatcher matcher;
59         static bool haveMatcher;
60 
61         if (!haveMatcher)
62         {
63             matcher = CharMatcher(wordCharacter);
64             haveMatcher = true;
65         }
66 
67         return &matcher;
68     }
69 
70     // WORKAROUND: if the compiler won't memoize the output of the function for us,
71     //  we'll do it with pure and there will be casts and it'll be happy about it.
72     // This is unfortunately needed to make std.regex as a whole faster to import & use
73     //  in build times ~500ms.
74     return *(cast(immutable(CharMatcher)* function() @safe nothrow @nogc pure)&actual)();
75 }
76 
77 // some special Unicode white space characters
78 private enum NEL = '\u0085', LS = '\u2028', PS = '\u2029';
79 
80 //Regular expression engine/parser options:
81 // global - search  all nonoverlapping matches in input
82 // casefold - case insensitive matching, do casefolding on match in unicode mode
83 // freeform - ignore whitespace in pattern, to match space use [ ] or \s
84 // multiline - switch  ^, $ detect start and end of linesinstead of just start and end of input
85 enum RegexOption: uint {
86     global = 0x1,
87     casefold = 0x2,
88     freeform = 0x4,
89     nonunicode = 0x8,
90     multiline = 0x10,
91     singleline = 0x20
92 }
93 //do not reorder this list
94 alias RegexOptionNames = AliasSeq!('g', 'i', 'x', 'U', 'm', 's');
95 static assert( RegexOption.max < 0x80);
96 
97 package(std) string regexOptionsToString()(uint flags) nothrow pure @safe
98 {
99     flags &= (RegexOption.max << 1) - 1;
100     if (!flags)
101         return "";
102     char[RegexOptionNames.length] buffer = void;
103     size_t pos = 0;
104     foreach (i, flag; __traits(allMembers, RegexOption))
105         if (flags & __traits(getMember, RegexOption, flag))
106             buffer[pos++] = RegexOptionNames[i];
107     return buffer[0 .. pos].idup;
108 }
109 
110 // flags that allow guide execution of engine
111 enum RegexInfo : uint { oneShot = 0x80 }
112 
113 // IR bit pattern: 0b1_xxxxx_yy
114 // where yy indicates class of instruction, xxxxx for actual operation code
115 //     00: atom, a normal instruction
116 //     01: open, opening of a group, has length of contained IR in the low bits
117 //     10: close, closing of a group, has length of contained IR in the low bits
118 //     11 unused
119 //
120 // Loops with Q (non-greedy, with ? mark) must have the same size / other properties as non Q version
121 // Possible changes:
122 //* merge group, option, infinite/repeat start (to never copy during parsing of (a|b){1,2})
123 //* reorganize groups to make n args easier to find, or simplify the check for groups of similar ops
124 //  (like lookaround), or make it easier to identify hotspots.
125 
126 enum IR:uint {
127     Char              = 0b1_00000_00, //a character
128     Any               = 0b1_00001_00, //any character
129     CodepointSet      = 0b1_00010_00, //a most generic CodepointSet [...]
130     Trie              = 0b1_00011_00, //CodepointSet implemented as Trie
131     //match with any of a consecutive OrChar's in this sequence
132     //(used for case insensitive match)
133     //OrChar holds in upper two bits of data total number of OrChars in this _sequence_
134     //the drawback of this representation is that it is difficult
135     // to detect a jump in the middle of it
136     OrChar             = 0b1_00100_00,
137     Nop                = 0b1_00101_00, //no operation (padding)
138     End                = 0b1_00110_00, //end of program
139     Bol                = 0b1_00111_00, //beginning of a line ^
140     Eol                = 0b1_01000_00, //end of a line $
141     Wordboundary       = 0b1_01001_00, //boundary of a word
142     Notwordboundary    = 0b1_01010_00, //not a word boundary
143     Backref            = 0b1_01011_00, //backreference to a group (that has to be pinned, i.e. locally unique) (group index)
144     GroupStart         = 0b1_01100_00, //start of a group (x) (groupIndex+groupPinning(1bit))
145     GroupEnd           = 0b1_01101_00, //end of a group (x) (groupIndex+groupPinning(1bit))
146     Option             = 0b1_01110_00, //start of an option within an alternation x | y (length)
147     GotoEndOr          = 0b1_01111_00, //end of an option (length of the rest)
148     Bof                = 0b1_10000_00, //begining of "file" (string) ^
149     Eof                = 0b1_10001_00, //end of "file" (string) $
150     //... any additional atoms here
151 
152     OrStart            = 0b1_00000_01, //start of alternation group  (length)
153     OrEnd              = 0b1_00000_10, //end of the or group (length,mergeIndex)
154     //with this instruction order
155     //bit mask 0b1_00001_00 could be used to test/set greediness
156     InfiniteStart      = 0b1_00001_01, //start of an infinite repetition x* (length)
157     InfiniteEnd        = 0b1_00001_10, //end of infinite repetition x* (length,mergeIndex)
158     InfiniteQStart     = 0b1_00010_01, //start of a non eager infinite repetition x*? (length)
159     InfiniteQEnd       = 0b1_00010_10, //end of non eager infinite repetition x*? (length,mergeIndex)
160     InfiniteBloomStart = 0b1_00011_01, //start of an filtered infinite repetition x* (length)
161     InfiniteBloomEnd   = 0b1_00011_10, //end of filtered infinite repetition x* (length,mergeIndex)
162     RepeatStart        = 0b1_00100_01, //start of a {n,m} repetition (length)
163     RepeatEnd          = 0b1_00100_10, //end of x{n,m} repetition (length,step,minRep,maxRep)
164     RepeatQStart       = 0b1_00101_01, //start of a non eager x{n,m}? repetition (length)
165     RepeatQEnd         = 0b1_00101_10, //end of non eager x{n,m}? repetition (length,step,minRep,maxRep)
166 
167     //
168     LookaheadStart     = 0b1_00110_01, //begin of the lookahead group (length)
169     LookaheadEnd       = 0b1_00110_10, //end of a lookahead group (length)
170     NeglookaheadStart  = 0b1_00111_01, //start of a negative lookahead (length)
171     NeglookaheadEnd    = 0b1_00111_10, //end of a negative lookahead (length)
172     LookbehindStart    = 0b1_01000_01, //start of a lookbehind (length)
173     LookbehindEnd      = 0b1_01000_10, //end of a lookbehind (length)
174     NeglookbehindStart = 0b1_01001_01, //start of a negative lookbehind (length)
175     NeglookbehindEnd   = 0b1_01001_10, //end of negative lookbehind (length)
176 }
177 
178 //a shorthand for IR length - full length of specific opcode evaluated at compile time
179 template IRL(IR code)
180 {
181     enum uint IRL =  lengthOfIR(code);
182 }
183 static assert(IRL!(IR.LookaheadStart) == 3);
184 
185 //how many parameters follow the IR, should be optimized fixing some IR bits
186 int immediateParamsIR(IR i) @safe pure nothrow @nogc
187 {
188     switch (i)
189     {
190     case IR.OrEnd,IR.InfiniteEnd,IR.InfiniteQEnd:
191         return 1;  // merge table index
192     case IR.InfiniteBloomEnd:
193         return 2;  // bloom filter index + merge table index
194     case IR.RepeatEnd, IR.RepeatQEnd:
195         return 4;
196     case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart:
197         return 2;  // start-end of captures used
198     default:
199         return 0;
200     }
201 }
202 
203 //full length of IR instruction inlcuding all parameters that might follow it
204 int lengthOfIR(IR i) @safe pure nothrow @nogc
205 {
206     return 1 + immediateParamsIR(i);
207 }
208 
209 //full length of the paired IR instruction inlcuding all parameters that might follow it
210 int lengthOfPairedIR(IR i) @safe pure nothrow @nogc
211 {
212     return 1 + immediateParamsIR(pairedIR(i));
213 }
214 
215 //if the operation has a merge point (this relies on the order of the ops)
216 bool hasMerge(IR i) @safe pure nothrow @nogc
217 {
218     return (i&0b11)==0b10 && i <= IR.RepeatQEnd;
219 }
220 
221 //is an IR that opens a "group"
222 bool isStartIR(IR i) @safe pure nothrow @nogc
223 {
224     return (i&0b11)==0b01;
225 }
226 
227 //is an IR that ends a "group"
228 bool isEndIR(IR i) @safe pure nothrow @nogc
229 {
230     return (i&0b11)==0b10;
231 }
232 
233 //is a standalone IR
234 bool isAtomIR(IR i) @safe pure nothrow @nogc
235 {
236     return (i&0b11)==0b00;
237 }
238 
239 //makes respective pair out of IR i, swapping start/end bits of instruction
240 IR pairedIR(IR i) @safe pure nothrow @nogc
241 {
242     assert(isStartIR(i) || isEndIR(i));
243     return cast(IR) (i ^ 0b11);
244 }
245 
246 //encoded IR instruction
247 @safe pure
248 struct Bytecode
249 {
250     uint raw;
251     //natural constraints
252     enum maxSequence = 2+4;
253     enum maxData = 1 << 22;
254     enum maxRaw = 1 << 31;
255 
256 @safe pure:
257     this(IR code, uint data)
258     {
259         assert(data < (1 << 22) && code < 256);
260         raw = code << 24 | data;
261     }
262 
263     this(IR code, uint data, uint seq)
264     {
265         assert(data < (1 << 22) && code < 256 );
266         assert(seq >= 2 && seq < maxSequence);
267         raw = code << 24 | (seq - 2)<<22 | data;
268     }
269 
270     //store raw data
271     static Bytecode fromRaw(uint data)
272     {
273         Bytecode t;
274         t.raw = data;
275         return t;
276     }
277 
278     // bit twiddling helpers
279     // 0-arg template due to https://issues.dlang.org/show_bug.cgi?id=10985
280     @property uint data()() const { return raw & 0x003f_ffff; }
281 
282     @property void data()(uint val)
283     {
284         raw = (raw & ~0x003f_ffff) | (val & 0x003f_ffff);
285     }
286 
287     // ditto
288     // 0-arg template due to https://issues.dlang.org/show_bug.cgi?id=10985
289     @property uint sequence()() const { return 2 + (raw >> 22 & 0x3); }
290 
291     // ditto
292     // 0-arg template due to https://issues.dlang.org/show_bug.cgi?id=10985
293     @property IR code()() const { return cast(IR)(raw >> 24); }
294 
295     //ditto
296     @property bool hotspot() const { return hasMerge(code); }
297 
298     //test the class of this instruction
299     @property bool isAtom() const { return isAtomIR(code); }
300 
301     //ditto
302     @property bool isStart() const { return isStartIR(code); }
303 
304     //ditto
305     @property bool isEnd() const { return isEndIR(code); }
306 
307     //number of arguments for this instruction
308     @property int args() const { return immediateParamsIR(code); }
309 
310     //mark this GroupStart or GroupEnd as referenced in backreference
311     void setBackrefence()
312     {
313         assert(code == IR.GroupStart || code == IR.GroupEnd);
314         raw = raw | 1 << 23;
315     }
316 
317     //is referenced
318     @property bool backreference() const
319     {
320         assert(code == IR.GroupStart || code == IR.GroupEnd);
321         return cast(bool)(raw & 1 << 23);
322     }
323 
324     //mark as local reference (for backrefs in lookarounds)
325     void setLocalRef()
326     {
327         assert(code == IR.Backref);
328         raw = raw | 1 << 23;
329     }
330 
331     //is a local ref
332     @property bool localRef() const
333     {
334         assert(code == IR.Backref);
335         return cast(bool)(raw & 1 << 23);
336     }
337 
338     //human readable name of instruction
339     @trusted @property string mnemonic()() const
340     {//@@@BUG@@@ to is @system
341         import std.conv : to;
342         return to!string(code);
343     }
344 
345     //full length of instruction
346     @property uint length() const
347     {
348         return lengthOfIR(code);
349     }
350 
351     //full length of respective start/end of this instruction
352     @property uint pairedLength() const
353     {
354         return lengthOfPairedIR(code);
355     }
356 
357     //returns bytecode of paired instruction (assuming this one is start or end)
358     @property Bytecode paired() const
359     {//depends on bit and struct layout order
360         assert(isStart || isEnd);
361         return Bytecode.fromRaw(raw ^ 0b11 << 24);
362     }
363 
364     //gets an index into IR block of the respective pair
365     uint indexOfPair(uint pc) const
366     {
367         assert(isStart || isEnd);
368         return isStart ? pc + data + length  : pc - data - lengthOfPairedIR(code);
369     }
370 }
371 
372 static assert(Bytecode.sizeof == 4);
373 
374 
375 //index entry structure for name --> number of submatch
376 struct NamedGroup
377 {
378     string name;
379     uint group;
380 }
381 
382 //holds pair of start-end markers for a submatch
383 struct Group(DataIndex)
384 {
385     DataIndex begin = DataIndex.max;
386     DataIndex end   = DataIndex.min;
387 
388     bool opCast(T : bool)() const
389     {
390         return begin <= end;
391     }
392 
393     @trusted string toString()() const
394     {
395         if (begin < end)
396             return "(unmatched)";
397         import std.array : appender;
398         import std.format.write : formattedWrite;
399         auto a = appender!string();
400         formattedWrite(a, "%s..%s", begin, end);
401         return a.data;
402     }
403 }
404 
405 //debugging tool, prints out instruction along with opcodes
406 @trusted string disassemble(in Bytecode[] irb, uint pc, in NamedGroup[] dict=[])
407 {
408     import std.array : appender;
409     import std.format.write : formattedWrite;
410     auto output = appender!string();
411     formattedWrite(output,"%s", irb[pc].mnemonic);
412     switch (irb[pc].code)
413     {
414     case IR.Char:
415         formattedWrite(output, " %s (0x%x)",cast(dchar) irb[pc].data, irb[pc].data);
416         break;
417     case IR.OrChar:
418         formattedWrite(output, " %s (0x%x) seq=%d", cast(dchar) irb[pc].data, irb[pc].data, irb[pc].sequence);
419         break;
420     case IR.RepeatStart, IR.InfiniteStart, IR.InfiniteBloomStart,
421     IR.Option, IR.GotoEndOr, IR.OrStart:
422         //forward-jump instructions
423         uint len = irb[pc].data;
424         formattedWrite(output, " pc=>%u", pc+len+IRL!(IR.RepeatStart));
425         break;
426     case IR.RepeatEnd, IR.RepeatQEnd: //backward-jump instructions
427         uint len = irb[pc].data;
428         formattedWrite(output, " pc=>%u min=%u max=%u step=%u",
429             pc - len, irb[pc + 3].raw, irb[pc + 4].raw, irb[pc + 2].raw);
430         break;
431     case IR.InfiniteEnd, IR.InfiniteQEnd, IR.InfiniteBloomEnd, IR.OrEnd: //ditto
432         uint len = irb[pc].data;
433         formattedWrite(output, " pc=>%u", pc-len);
434         break;
435     case  IR.LookaheadEnd, IR.NeglookaheadEnd: //ditto
436         uint len = irb[pc].data;
437         formattedWrite(output, " pc=>%u", pc-len);
438         break;
439     case IR.GroupStart, IR.GroupEnd:
440         uint n = irb[pc].data;
441         string name;
442         foreach (v;dict)
443             if (v.group == n)
444             {
445                 name = "'"~v.name~"'";
446                 break;
447             }
448         formattedWrite(output, " %s #%u " ~ (irb[pc].backreference ? "referenced" : ""),
449                 name, n);
450         break;
451     case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart:
452         uint len = irb[pc].data;
453         uint start = irb[pc+1].raw, end = irb[pc+2].raw;
454         formattedWrite(output, " pc=>%u [%u..%u]", pc + len + IRL!(IR.LookaheadStart), start, end);
455         break;
456     case IR.Backref: case IR.CodepointSet: case IR.Trie:
457         uint n = irb[pc].data;
458         formattedWrite(output, " %u",  n);
459         if (irb[pc].code == IR.Backref)
460             formattedWrite(output, " %s", irb[pc].localRef ? "local" : "global");
461         break;
462     default://all data-free instructions
463     }
464     if (irb[pc].hotspot)
465         formattedWrite(output, " Hotspot %u", irb[pc+1].raw);
466     return output.data;
467 }
468 
469 //disassemble the whole chunk
470 @trusted void printBytecode()(in Bytecode[] slice, in NamedGroup[] dict=[])
471 {
472     import std.stdio : writeln;
473     for (uint pc=0; pc<slice.length; pc += slice[pc].length)
474         writeln("\t", disassemble(slice, pc, dict));
475 }
476 
477 // Encapsulates memory management, explicit ref counting
478 // and the exact type of engine created
479 // there is a single instance per engine combination type x Char
480 // In future may also maintain a (TLS?) cache of memory
481 interface MatcherFactory(Char)
482 {
483 @safe:
484     Matcher!Char create(const ref Regex!Char, in Char[] input) const;
485     Matcher!Char dup(Matcher!Char m, in Char[] input) const;
486     size_t incRef(Matcher!Char m) const;
487     size_t decRef(Matcher!Char m) const;
488 }
489 
490 // Only memory management, no compile-time vs run-time specialities
491 abstract class GenericFactory(alias EngineType, Char) : MatcherFactory!Char
492 {
493     import core.memory : pureFree;
494     import std.internal.memory : enforceMalloc;
495     import core.memory : GC;
496     // round up to next multiple of size_t for alignment purposes
497     enum classSize = (__traits(classInstanceSize, EngineType!Char) + size_t.sizeof - 1) & ~(size_t.sizeof - 1);
498 
499     EngineType!Char construct(const ref Regex!Char re, in Char[] input, void[] memory) const;
500 
501     override EngineType!Char create(const ref Regex!Char re, in Char[] input) const @trusted
502     {
503         immutable size = EngineType!Char.initialMemory(re) + classSize;
504         auto memory = enforceMalloc(size)[0 .. size];
505         scope(failure) pureFree(memory.ptr);
506         GC.addRange(memory.ptr, classSize);
507         auto engine = construct(re, input, memory);
508         assert(engine.refCount == 1);
509         assert(cast(void*) engine == memory.ptr);
510         return engine;
511     }
512 
513     override EngineType!Char dup(Matcher!Char engine, in Char[] input) const @trusted
514     {
515         immutable size = EngineType!Char.initialMemory(engine.pattern) + classSize;
516         auto memory = enforceMalloc(size)[0 .. size];
517         scope(failure) pureFree(memory.ptr);
518         auto copy = construct(engine.pattern, input, memory);
519         GC.addRange(memory.ptr, classSize);
520         engine.dupTo(copy, memory[classSize .. size]);
521         assert(copy.refCount == 1);
522         return copy;
523     }
524 
525     override size_t incRef(Matcher!Char m) const
526     {
527         return ++m.refCount;
528     }
529 
530     override size_t decRef(Matcher!Char m) const  @trusted
531     {
532         assert(m.refCount != 0);
533         auto cnt = --m.refCount;
534         if (cnt == 0)
535         {
536             void* ptr = cast(void*) m;
537             GC.removeRange(ptr);
538             pureFree(ptr);
539         }
540         return cnt;
541     }
542 }
543 
544 // A factory for run-time engines
545 class RuntimeFactory(alias EngineType, Char) : GenericFactory!(EngineType, Char)
546 {
547     override EngineType!Char construct(const ref Regex!Char re, in Char[] input, void[] memory) const
548     {
549         import core.lifetime : emplace;
550         return emplace!(EngineType!Char)(memory[0 .. classSize],
551             re, Input!Char(input), memory[classSize .. $]);
552     }
553 }
554 
555 // A factory for compile-time engine
556 class CtfeFactory(alias EngineType, Char, alias func) : GenericFactory!(EngineType, Char)
557 {
558     override EngineType!Char construct(const ref Regex!Char re, in Char[] input, void[] memory) const
559     {
560         import core.lifetime : emplace;
561         return emplace!(EngineType!Char)(memory[0 .. classSize],
562             re, &func, Input!Char(input), memory[classSize .. $]);
563     }
564 }
565 
566 private auto defaultFactoryImpl(Char)(const ref Regex!Char re)
567 {
568     import std.regex.internal.backtracking : BacktrackingMatcher;
569     import std.regex.internal.thompson : ThompsonMatcher;
570     import std.algorithm.searching : canFind;
571     static MatcherFactory!Char backtrackingFactory;
572     static MatcherFactory!Char thompsonFactory;
573     if (re.backrefed.canFind!"a != 0")
574     {
575         if (backtrackingFactory is null)
576             backtrackingFactory = new RuntimeFactory!(BacktrackingMatcher, Char);
577         return backtrackingFactory;
578     }
579     else
580     {
581         if (thompsonFactory is null)
582             thompsonFactory = new RuntimeFactory!(ThompsonMatcher, Char);
583         return thompsonFactory;
584     }
585 }
586 
587 // Used to generate a pure wrapper for defaultFactoryImpl. Based on the example in
588 // the std.traits.SetFunctionAttributes documentation.
589 auto assumePureFunction(T)(T t)
590 if (isFunctionPointer!T)
591 {
592     enum attrs = functionAttributes!T | FunctionAttribute.pure_;
593     return cast(SetFunctionAttributes!(T, functionLinkage!T, attrs)) t;
594 }
595 
596 // A workaround for R-T enum re = regex(...)
597 template defaultFactory(Char)
598 {
599     // defaultFactory is constructed as a safe, pure wrapper over defaultFactoryImpl.
600     // It can be faked as pure because the static mutable variables are used to cache
601     // the key and character matcher. The technique used avoids delegates and GC.
602     @property MatcherFactory!Char defaultFactory(const ref Regex!Char re) @safe pure
603     {
604         static auto impl(const ref Regex!Char re)
605         {
606             return defaultFactoryImpl(re);
607         }
608 
609         static auto pureImpl(const ref Regex!Char re) @trusted
610         {
611             auto p = assumePureFunction(&impl);
612             return p(re);
613         }
614 
615         return pureImpl(re);
616     }
617 }
618 
619 // Defining it as an interface has the undesired side-effect:
620 // casting any class to an interface silently adjusts pointer to point to a nested vtbl
621 abstract class Matcher(Char)
622 {
623 abstract:
624     // Get a (next) match
625     int match(Group!size_t[] matches) pure;
626     // This only maintains internal ref-count,
627     // deallocation happens inside MatcherFactory
628     @property ref size_t refCount() @safe;
629     // Copy internal state to another engine, using memory arena 'memory'
630     void dupTo(Matcher!Char m, void[] memory);
631     // The pattern loaded
632     @property ref const(Regex!Char) pattern() @safe;
633     // Re-arm the engine with new Input
634     Matcher rearm(in Char[] stream);
635 }
636 
637 /++
638     `Regex` object holds regular expression pattern in compiled form.
639     Instances of this object are constructed via calls to `regex`.
640     This is an intended form for caching and storage of frequently
641     used regular expressions.
642 +/
643 struct Regex(Char)
644 {
645     //temporary workaround for identifier lookup
646     CodepointSet[] charsets; //
647     Bytecode[] ir;      //compiled bytecode of pattern
648 
649 
650     @safe @property bool empty() const nothrow {  return ir is null; }
651     /++
652     `namedCaptures` returns a range of all named captures in a given regular expression.
653     +/
654     @safe @property auto namedCaptures()
655     {
656         static struct NamedGroupRange
657         {
658         private:
659             const(NamedGroup)[] groups;
660             size_t start;
661             size_t end;
662         public:
663             this(const(NamedGroup)[] g, size_t s, size_t e)
664             {
665                 assert(s <= e);
666                 assert(e <= g.length);
667                 groups = g;
668                 start = s;
669                 end = e;
670             }
671 
672             @property string front() { return groups[start].name; }
673             @property string back() { return groups[end-1].name; }
674             @property bool empty() { return start >= end; }
675             @property size_t length() { return end - start; }
676             alias opDollar = length;
677             @property NamedGroupRange save()
678             {
679                 return NamedGroupRange(groups, start, end);
680             }
681             void popFront() { assert(!empty); start++; }
682             void popBack() { assert(!empty); end--; }
683             string opIndex()(size_t i)
684             {
685                 assert(start + i < end,
686                        "Requested named group is out of range.");
687                 return groups[start+i].name;
688             }
689             NamedGroupRange opSlice(size_t low, size_t high) {
690                 assert(low <= high);
691                 assert(start + high <= end);
692                 return NamedGroupRange(groups, start + low, start + high);
693             }
694             NamedGroupRange opSlice() { return this.save; }
695         }
696         return NamedGroupRange(dict, 0, dict.length);
697     }
698 
699 package(std.regex):
700     import std.regex.internal.kickstart : Kickstart; //TODO: get rid of this dependency
701     const(NamedGroup)[] dict;              // maps name -> user group number
702     uint ngroup;                           // number of internal groups
703     uint maxCounterDepth;                  // max depth of nested {n,m} repetitions
704     uint hotspotTableSize;                 // number of entries in merge table
705     uint threadCount;                      // upper bound on number of Thompson VM threads
706     uint flags;                            // global regex flags
707     public const(CharMatcher)[]  matchers; // tables that represent character sets
708     public const(BitTable)[] filters;      // bloom filters for conditional loops
709     uint[] backrefed;                      // bit array of backreferenced submatches
710     Kickstart!Char kickstart;
711     MatcherFactory!Char factory;           // produces optimal matcher for this pattern
712     immutable(Char)[] pattern;             // copy of pattern to serve as cache key
713 
714     const(Regex) withFactory(MatcherFactory!Char factory) pure const @trusted
715     {
716         auto r = cast() this;
717         r.factory = factory;
718         return r;
719     }
720 
721     const(Regex) withFlags(uint newFlags) pure const @trusted
722     {
723         auto r = cast() this;
724         r.flags = newFlags;
725         return r;
726     }
727 
728     const(Regex) withCode(const(Bytecode)[] code) pure const @trusted
729     {
730         auto r = cast() this;
731         r.ir = code.dup; // TODO: sidestep const instead?
732         return r;
733     }
734 
735     const(Regex) withNGroup(uint nGroup) pure const @trusted
736     {
737         auto r = cast() this;
738         r.ngroup = nGroup;
739         return r;
740     }
741 
742     //bit access helper
743     uint isBackref(uint n)
744     {
745         if (n/32 >= backrefed.length)
746             return 0;
747         return backrefed[n / 32] & (1 << (n & 31));
748     }
749 
750     //check if searching is not needed
751     void checkIfOneShot()
752     {
753     L_CheckLoop:
754         for (uint i = 0; i < ir.length; i += ir[i].length)
755         {
756             switch (ir[i].code)
757             {
758                 case IR.Bof:
759                     flags |= RegexInfo.oneShot;
760                     break L_CheckLoop;
761                 case IR.GroupStart, IR.GroupEnd, IR.Bol, IR.Eol, IR.Eof,
762                 IR.Wordboundary, IR.Notwordboundary:
763                     break;
764                 default:
765                     break L_CheckLoop;
766             }
767         }
768     }
769 
770     //print out disassembly a program's IR
771     @trusted debug(std_regex_parser) void print() const
772     {//@@@BUG@@@ write is system
773         for (uint i = 0; i < ir.length; i += ir[i].length)
774         {
775             writefln("%d\t%s ", i, disassemble(ir, i, dict));
776         }
777         writeln("Total merge table size: ", hotspotTableSize);
778         writeln("Max counter nesting depth: ", maxCounterDepth);
779     }
780 
781     public string toString()() const
782     {
783         import std.format : format;
784         static if (is(typeof(pattern) : string))
785             alias patternString = pattern;
786         else
787         {
788             import std.conv : to;
789             auto patternString = conv.to!string(pattern);
790         }
791         auto quotedEscapedPattern = format("%(%s %)", [patternString]);
792         auto flagString = regexOptionsToString(flags);
793         return "Regex!" ~ Char.stringof ~ "(" ~ quotedEscapedPattern ~ ", \"" ~ flagString ~ "\")";
794     }
795 }
796 
797 // The stuff below this point is temporarrily part of IR module
798 // but may need better place in the future (all internals)
799 package(std.regex):
800 
801 //Simple UTF-string abstraction compatible with stream interface
802 struct Input(Char)
803 if (is(Char :dchar))
804 {
805     import std.utf : decode;
806     alias DataIndex = size_t;
807     enum bool isLoopback = false;
808     alias String = const(Char)[];
809     String _origin;
810     size_t _index;
811 
812     //constructs Input object out of plain string
813     this(String input, size_t idx = 0)
814     {
815         _origin = input;
816         _index = idx;
817     }
818 
819     //codepoint at current stream position
820     pragma(inline, true) bool nextChar(ref dchar res, ref size_t pos)
821     {
822         pos = _index;
823         // DMD's inliner hates multiple return functions
824         // but can live with single statement if/else bodies
825         bool n = !(_index == _origin.length);
826         if (n)
827             res = decode(_origin, _index);
828         return n;
829     }
830     @property bool atEnd(){
831         return _index == _origin.length;
832     }
833     bool search(Kickstart)(ref const Kickstart kick, ref dchar res, ref size_t pos)
834     {
835         size_t idx = kick.search(_origin, _index);
836         _index = idx;
837         return nextChar(res, pos);
838     }
839 
840     //index of at End position
841     @property size_t lastIndex(){   return _origin.length; }
842 
843     //support for backtracker engine, might not be present
844     void reset(size_t index){   _index = index;  }
845 
846     String opSlice(size_t start, size_t end){   return _origin[start .. end]; }
847 
848     auto loopBack(size_t index){   return BackLooper!Input(this, index); }
849 }
850 
851 struct BackLooperImpl(Input)
852 {
853     import std.utf : strideBack;
854     alias DataIndex = size_t;
855     alias String = Input.String;
856     enum bool isLoopback = true;
857     String _origin;
858     size_t _index;
859     this(Input input, size_t index)
860     {
861         _origin = input._origin;
862         _index = index;
863     }
864     this(String input)
865     {
866         _origin = input;
867         _index = input.length;
868     }
869     @trusted bool nextChar(ref dchar res,ref size_t pos)
870     {
871         pos = _index;
872         if (_index == 0)
873             return false;
874 
875         res = _origin[0.._index].back;
876         _index -= strideBack(_origin, _index);
877 
878         return true;
879     }
880     @property atEnd(){ return _index == 0 || _index == strideBack(_origin, _index); }
881     auto loopBack(size_t index){   return Input(_origin, index); }
882 
883     //support for backtracker engine, might not be present
884     //void reset(size_t index){   _index = index ? index-std.utf.strideBack(_origin, index) : 0;  }
885     void reset(size_t index){   _index = index;  }
886 
887     String opSlice(size_t start, size_t end){   return _origin[end .. start]; }
888     //index of at End position
889     @property size_t lastIndex(){   return 0; }
890 }
891 
892 template BackLooper(E)
893 {
894     static if (is(E : BackLooperImpl!U, U))
895     {
896         alias BackLooper = U;
897     }
898     else
899     {
900         alias BackLooper = BackLooperImpl!E;
901     }
902 }
903 
904 //both helpers below are internal, on its own are quite "explosive"
905 //unsafe, no initialization of elements
906 @system pure T[] mallocArray(T)(size_t len)
907 {
908     import core.memory : pureMalloc;
909     return (cast(T*) pureMalloc(len * T.sizeof))[0 .. len];
910 }
911 
912 //very unsafe, no initialization
913 @system T[] arrayInChunk(T)(size_t len, ref void[] chunk)
914 {
915     auto ret = (cast(T*) chunk.ptr)[0 .. len];
916     chunk = chunk[len * T.sizeof .. $];
917     return ret;
918 }
919 
920 //
921 @trusted uint lookupNamedGroup(String)(const(NamedGroup)[] dict, String name)
922 {//equal is @system?
923     import std.algorithm.comparison : equal;
924     import std.algorithm.iteration : map;
925     import std.conv : text;
926     import std.range : assumeSorted;
927 
928     auto fnd = assumeSorted!"cmp(a,b) < 0"(map!"a.name"(dict)).lowerBound(name).length;
929     enforce(fnd < dict.length && equal(dict[fnd].name, name),
930         text("no submatch named ", name));
931     return dict[fnd].group;
932 }
933 
934 // whether ch is one of unicode newline sequences
935 // 0-arg template due to https://issues.dlang.org/show_bug.cgi?id=10985
936 bool endOfLine()(dchar front, bool seenCr)
937 {
938     return ((front == '\n') ^ seenCr) || front == '\r'
939     || front == NEL || front == LS || front == PS;
940 }
941 
942 // 0-arg template due to https://issues.dlang.org/show_bug.cgi?id=10985
943 bool startOfLine()(dchar back, bool seenNl)
944 {
945     return ((back == '\r') ^ seenNl) || back == '\n'
946     || back == NEL || back == LS || back == PS;
947 }
948 
949 ///Exception object thrown in case of errors during regex compilation.
950 public class RegexException : Exception
951 {
952     mixin basicExceptionCtors;
953 }
954 
955 // simple 128-entry bit-table used with a hash function
956 struct BitTable {
957     uint[4] filter;
958 
959     this(CodepointSet set){
960         foreach (iv; set.byInterval)
961         {
962             foreach (v; iv.a .. iv.b)
963                 add(v);
964         }
965     }
966 
967     void add()(dchar ch){
968         immutable i = index(ch);
969         filter[i >> 5]  |=  1<<(i & 31);
970     }
971     // non-zero -> might be present, 0 -> absent
972     bool opIndex()(dchar ch) const{
973         immutable i = index(ch);
974         return (filter[i >> 5]>>(i & 31)) & 1;
975     }
976 
977     static uint index()(dchar ch){
978         return ((ch >> 7) ^ ch) & 0x7F;
979     }
980 }
981 
982 struct CharMatcher {
983     BitTable ascii; // fast path for ASCII
984     Trie trie;      // slow path for Unicode
985 
986     this(CodepointSet set)
987     {
988         auto asciiSet = set & unicode.ASCII;
989         ascii = BitTable(asciiSet);
990         trie = makeTrie(set);
991     }
992 
993     bool opIndex()(dchar ch) const
994     {
995         if (ch < 0x80)
996             return ascii[ch];
997         else
998             return trie[ch];
999     }
1000 }
1001 
1002 // Internal non-resizeble array, switches between inline storage and CoW
1003 // POD-only
1004 struct SmallFixedArray(T, uint SMALL=3)
1005 if (!hasElaborateDestructor!T)
1006 {
1007     import std.internal.memory : enforceMalloc;
1008     import core.memory : pureFree;
1009     static struct Payload
1010     {
1011         size_t refcount;
1012         T[0] placeholder;
1013         inout(T)* ptr() inout { return placeholder.ptr; }
1014     }
1015     static assert(Payload.sizeof == size_t.sizeof);
1016     union
1017     {
1018         Payload* big;
1019         T[SMALL] small;
1020     }
1021     size_t _sizeMask;
1022     enum BIG_MASK = size_t(1)<<(8*size_t.sizeof-1);
1023     enum SIZE_MASK = ~BIG_MASK;
1024 
1025     @property bool isBig() const { return (_sizeMask & BIG_MASK) != 0; }
1026     @property size_t length() const { return _sizeMask & SIZE_MASK; }
1027 
1028     this(size_t size)
1029     {
1030         if (size <= SMALL)
1031         {
1032             small[] = T.init;
1033             _sizeMask = size;
1034         }
1035         else
1036         {
1037             big = cast(Payload*) enforceMalloc(Payload.sizeof + T.sizeof*size);
1038             big.refcount = 1;
1039             _sizeMask = size | BIG_MASK;
1040         }
1041     }
1042 
1043     private @trusted @property inout(T)[] internalSlice() inout
1044     {
1045         return isBig ? big.ptr[0 .. length] : small[0 .. length];
1046     }
1047 
1048     this(this) @trusted
1049     {
1050         if (isBig)
1051         {
1052             big.refcount++;
1053         }
1054     }
1055 
1056     bool opEquals(SmallFixedArray a)
1057     {
1058         return internalSlice[] == a.internalSlice[];
1059     }
1060 
1061     size_t toHash() const
1062     {
1063         return hashOf(internalSlice[]);
1064     }
1065 
1066     ref inout(T) opIndex(size_t idx) inout
1067     {
1068         return internalSlice[idx];
1069     }
1070 
1071     // accesses big to test self-referencing so not @safe
1072     @trusted ref opAssign(SmallFixedArray arr)
1073     {
1074         if (isBig)
1075         {
1076             if (arr.isBig)
1077             {
1078                 if (big is arr.big) return this; // self-assign
1079                 else
1080                 {
1081                     abandonRef();
1082                     _sizeMask = arr._sizeMask;
1083                     big = arr.big;
1084                     big.refcount++;
1085                 }
1086             }
1087             else
1088             {
1089                 abandonRef();
1090                 _sizeMask = arr._sizeMask;
1091                 small = arr.small;
1092             }
1093         }
1094         else
1095         {
1096             if (arr.isBig)
1097             {
1098                 _sizeMask = arr._sizeMask;
1099                 big = arr.big;
1100                 big.refcount++;
1101             }
1102             else
1103             {
1104                 _sizeMask = arr._sizeMask;
1105                 small = arr.small;
1106             }
1107         }
1108         return this;
1109     }
1110 
1111     void mutate(scope void delegate(T[]) pure filler)
1112     {
1113         if (isBig && big.refcount != 1) // copy on write
1114         {
1115             auto oldSizeMask = _sizeMask;
1116             auto newbig = cast(Payload*) enforceMalloc(Payload.sizeof + T.sizeof*length);
1117             newbig.refcount = 1;
1118             abandonRef();
1119             big = newbig;
1120             _sizeMask = oldSizeMask;
1121         }
1122         filler(internalSlice);
1123     }
1124 
1125     ~this()
1126     {
1127         if (isBig)
1128         {
1129             abandonRef();
1130         }
1131     }
1132 
1133     @trusted private void abandonRef()
1134     {
1135         assert(isBig);
1136         if (--big.refcount == 0)
1137         {
1138             pureFree(big);
1139             _sizeMask = 0;
1140             assert(!isBig);
1141         }
1142     }
1143 }
1144 
1145 @system unittest
1146 {
1147     alias SA = SmallFixedArray!(int, 2);
1148     SA create(int[] data)
1149     {
1150         SA a = SA(data.length);
1151         a.mutate((slice) { slice[] = data[]; });
1152         assert(a.internalSlice == data);
1153         return a;
1154     }
1155 
1156     {
1157         SA a;
1158         a = SA(1);
1159         assert(a.length == 1);
1160         a = SA.init;
1161         assert(a.length == 0);
1162     }
1163 
1164     {
1165         SA a, b, c, d;
1166         assert(a.length == 0);
1167         assert(a.internalSlice == b.internalSlice);
1168         a = create([1]);
1169         assert(a.internalSlice == [1]);
1170         b = create([2, 3]);
1171         assert(b.internalSlice == [2, 3]);
1172         c = create([3, 4, 5]);
1173         d = create([5, 6, 7, 8]);
1174         assert(c.isBig);
1175         a = c;
1176         assert(a.isBig);
1177         assert(a.big is c.big);
1178         assert(a.big.refcount == 2);
1179         assert(a.internalSlice == [3, 4, 5]);
1180         assert(c.internalSlice == [3, 4, 5]);
1181         a = b;
1182         assert(!a.isBig);
1183         assert(a.internalSlice == [2, 3]);
1184         assert(c.big.refcount == 1);
1185         a = c;
1186         assert(c.big.refcount == 2);
1187 
1188         // mutate copies on write if ref-count is not 1
1189         a.mutate((slice){ slice[] = 1; });
1190         assert(a.internalSlice == [1, 1, 1]);
1191         assert(c.internalSlice == [3, 4, 5]);
1192         assert(a.isBig && c.isBig);
1193         assert(a.big.refcount == 1);
1194         assert(c.big.refcount == 1);
1195 
1196         auto e = d;
1197         assert(e.big.refcount == 2);
1198         auto f = d;
1199         f = a;
1200         assert(f.isBig);
1201         assert(f.internalSlice == [1, 1, 1]);
1202         assert(f.big.refcount == 2); // a & f
1203         assert(e.big.refcount == 2); // d & e
1204         a = c;
1205         assert(f.big.refcount == 1); // f
1206         assert(e.big.refcount == 2); // d & e
1207         a = a;
1208         a = a;
1209         a = a;
1210         assert(a.big.refcount == 2); // a & c
1211     }
1212 }