1 /* 2 Implementation of std.regex IR, an intermediate representation 3 of a regular expression pattern. 4 5 This is a common ground between frontend regex component (parser) 6 and backend components - generators, matchers and other "filters". 7 */ 8 module std.regex.internal.ir; 9 10 package(std.regex): 11 12 import std.exception, std.meta, std.range.primitives, std.traits, std.uni; 13 14 debug(std_regex_parser) import std.stdio; 15 // just a common trait, may be moved elsewhere 16 alias BasicElementOf(Range) = Unqual!(ElementEncodingType!Range); 17 18 enum privateUseStart = '\U000F0000', privateUseEnd ='\U000FFFFD'; 19 20 // heuristic value determines maximum CodepointSet length suitable for linear search 21 enum maxCharsetUsed = 6; 22 23 // another variable to tweak behavior of caching generated Tries for character classes 24 enum maxCachedMatchers = 8; 25 26 alias Trie = CodepointSetTrie!(13, 8); 27 alias makeTrie = codepointSetTrie!(13, 8); 28 29 CharMatcher[CodepointSet] matcherCache; 30 31 //accessor with caching 32 @trusted CharMatcher getMatcher(CodepointSet set) 33 { 34 // almost all properties of AA are not @safe 35 // https://issues.dlang.org/show_bug.cgi?id=6357 36 if (__ctfe || maxCachedMatchers == 0) 37 return CharMatcher(set); 38 else 39 { 40 auto p = set in matcherCache; 41 if (p) 42 return *p; 43 if (matcherCache.length == maxCachedMatchers) 44 { 45 // flush enmatchers in trieCache 46 matcherCache = null; 47 } 48 return (matcherCache[set] = CharMatcher(set)); 49 } 50 } 51 52 // Force pure because that is needed 53 // Templated so that we don't pull in std.uni wordCharacter unnecessarily. 54 @property ref wordMatcher()() pure 55 { 56 static auto actual() 57 { 58 static CharMatcher matcher; 59 static bool haveMatcher; 60 61 if (!haveMatcher) 62 { 63 matcher = CharMatcher(wordCharacter); 64 haveMatcher = true; 65 } 66 67 return &matcher; 68 } 69 70 // WORKAROUND: if the compiler won't memoize the output of the function for us, 71 // we'll do it with pure and there will be casts and it'll be happy about it. 72 // This is unfortunately needed to make std.regex as a whole faster to import & use 73 // in build times ~500ms. 74 return *(cast(immutable(CharMatcher)* function() @safe nothrow @nogc pure)&actual)(); 75 } 76 77 // some special Unicode white space characters 78 private enum NEL = '\u0085', LS = '\u2028', PS = '\u2029'; 79 80 //Regular expression engine/parser options: 81 // global - search all nonoverlapping matches in input 82 // casefold - case insensitive matching, do casefolding on match in unicode mode 83 // freeform - ignore whitespace in pattern, to match space use [ ] or \s 84 // multiline - switch ^, $ detect start and end of linesinstead of just start and end of input 85 enum RegexOption: uint { 86 global = 0x1, 87 casefold = 0x2, 88 freeform = 0x4, 89 nonunicode = 0x8, 90 multiline = 0x10, 91 singleline = 0x20 92 } 93 //do not reorder this list 94 alias RegexOptionNames = AliasSeq!('g', 'i', 'x', 'U', 'm', 's'); 95 static assert( RegexOption.max < 0x80); 96 97 package(std) string regexOptionsToString()(uint flags) nothrow pure @safe 98 { 99 flags &= (RegexOption.max << 1) - 1; 100 if (!flags) 101 return ""; 102 char[RegexOptionNames.length] buffer = void; 103 size_t pos = 0; 104 foreach (i, flag; __traits(allMembers, RegexOption)) 105 if (flags & __traits(getMember, RegexOption, flag)) 106 buffer[pos++] = RegexOptionNames[i]; 107 return buffer[0 .. pos].idup; 108 } 109 110 // flags that allow guide execution of engine 111 enum RegexInfo : uint { oneShot = 0x80 } 112 113 // IR bit pattern: 0b1_xxxxx_yy 114 // where yy indicates class of instruction, xxxxx for actual operation code 115 // 00: atom, a normal instruction 116 // 01: open, opening of a group, has length of contained IR in the low bits 117 // 10: close, closing of a group, has length of contained IR in the low bits 118 // 11 unused 119 // 120 // Loops with Q (non-greedy, with ? mark) must have the same size / other properties as non Q version 121 // Possible changes: 122 //* merge group, option, infinite/repeat start (to never copy during parsing of (a|b){1,2}) 123 //* reorganize groups to make n args easier to find, or simplify the check for groups of similar ops 124 // (like lookaround), or make it easier to identify hotspots. 125 126 enum IR:uint { 127 Char = 0b1_00000_00, //a character 128 Any = 0b1_00001_00, //any character 129 CodepointSet = 0b1_00010_00, //a most generic CodepointSet [...] 130 Trie = 0b1_00011_00, //CodepointSet implemented as Trie 131 //match with any of a consecutive OrChar's in this sequence 132 //(used for case insensitive match) 133 //OrChar holds in upper two bits of data total number of OrChars in this _sequence_ 134 //the drawback of this representation is that it is difficult 135 // to detect a jump in the middle of it 136 OrChar = 0b1_00100_00, 137 Nop = 0b1_00101_00, //no operation (padding) 138 End = 0b1_00110_00, //end of program 139 Bol = 0b1_00111_00, //beginning of a line ^ 140 Eol = 0b1_01000_00, //end of a line $ 141 Wordboundary = 0b1_01001_00, //boundary of a word 142 Notwordboundary = 0b1_01010_00, //not a word boundary 143 Backref = 0b1_01011_00, //backreference to a group (that has to be pinned, i.e. locally unique) (group index) 144 GroupStart = 0b1_01100_00, //start of a group (x) (groupIndex+groupPinning(1bit)) 145 GroupEnd = 0b1_01101_00, //end of a group (x) (groupIndex+groupPinning(1bit)) 146 Option = 0b1_01110_00, //start of an option within an alternation x | y (length) 147 GotoEndOr = 0b1_01111_00, //end of an option (length of the rest) 148 Bof = 0b1_10000_00, //begining of "file" (string) ^ 149 Eof = 0b1_10001_00, //end of "file" (string) $ 150 //... any additional atoms here 151 152 OrStart = 0b1_00000_01, //start of alternation group (length) 153 OrEnd = 0b1_00000_10, //end of the or group (length,mergeIndex) 154 //with this instruction order 155 //bit mask 0b1_00001_00 could be used to test/set greediness 156 InfiniteStart = 0b1_00001_01, //start of an infinite repetition x* (length) 157 InfiniteEnd = 0b1_00001_10, //end of infinite repetition x* (length,mergeIndex) 158 InfiniteQStart = 0b1_00010_01, //start of a non eager infinite repetition x*? (length) 159 InfiniteQEnd = 0b1_00010_10, //end of non eager infinite repetition x*? (length,mergeIndex) 160 InfiniteBloomStart = 0b1_00011_01, //start of an filtered infinite repetition x* (length) 161 InfiniteBloomEnd = 0b1_00011_10, //end of filtered infinite repetition x* (length,mergeIndex) 162 RepeatStart = 0b1_00100_01, //start of a {n,m} repetition (length) 163 RepeatEnd = 0b1_00100_10, //end of x{n,m} repetition (length,step,minRep,maxRep) 164 RepeatQStart = 0b1_00101_01, //start of a non eager x{n,m}? repetition (length) 165 RepeatQEnd = 0b1_00101_10, //end of non eager x{n,m}? repetition (length,step,minRep,maxRep) 166 167 // 168 LookaheadStart = 0b1_00110_01, //begin of the lookahead group (length) 169 LookaheadEnd = 0b1_00110_10, //end of a lookahead group (length) 170 NeglookaheadStart = 0b1_00111_01, //start of a negative lookahead (length) 171 NeglookaheadEnd = 0b1_00111_10, //end of a negative lookahead (length) 172 LookbehindStart = 0b1_01000_01, //start of a lookbehind (length) 173 LookbehindEnd = 0b1_01000_10, //end of a lookbehind (length) 174 NeglookbehindStart = 0b1_01001_01, //start of a negative lookbehind (length) 175 NeglookbehindEnd = 0b1_01001_10, //end of negative lookbehind (length) 176 } 177 178 //a shorthand for IR length - full length of specific opcode evaluated at compile time 179 template IRL(IR code) 180 { 181 enum uint IRL = lengthOfIR(code); 182 } 183 static assert(IRL!(IR.LookaheadStart) == 3); 184 185 //how many parameters follow the IR, should be optimized fixing some IR bits 186 int immediateParamsIR(IR i) @safe pure nothrow @nogc 187 { 188 switch (i) 189 { 190 case IR.OrEnd,IR.InfiniteEnd,IR.InfiniteQEnd: 191 return 1; // merge table index 192 case IR.InfiniteBloomEnd: 193 return 2; // bloom filter index + merge table index 194 case IR.RepeatEnd, IR.RepeatQEnd: 195 return 4; 196 case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart: 197 return 2; // start-end of captures used 198 default: 199 return 0; 200 } 201 } 202 203 //full length of IR instruction inlcuding all parameters that might follow it 204 int lengthOfIR(IR i) @safe pure nothrow @nogc 205 { 206 return 1 + immediateParamsIR(i); 207 } 208 209 //full length of the paired IR instruction inlcuding all parameters that might follow it 210 int lengthOfPairedIR(IR i) @safe pure nothrow @nogc 211 { 212 return 1 + immediateParamsIR(pairedIR(i)); 213 } 214 215 //if the operation has a merge point (this relies on the order of the ops) 216 bool hasMerge(IR i) @safe pure nothrow @nogc 217 { 218 return (i&0b11)==0b10 && i <= IR.RepeatQEnd; 219 } 220 221 //is an IR that opens a "group" 222 bool isStartIR(IR i) @safe pure nothrow @nogc 223 { 224 return (i&0b11)==0b01; 225 } 226 227 //is an IR that ends a "group" 228 bool isEndIR(IR i) @safe pure nothrow @nogc 229 { 230 return (i&0b11)==0b10; 231 } 232 233 //is a standalone IR 234 bool isAtomIR(IR i) @safe pure nothrow @nogc 235 { 236 return (i&0b11)==0b00; 237 } 238 239 //makes respective pair out of IR i, swapping start/end bits of instruction 240 IR pairedIR(IR i) @safe pure nothrow @nogc 241 { 242 assert(isStartIR(i) || isEndIR(i)); 243 return cast(IR) (i ^ 0b11); 244 } 245 246 //encoded IR instruction 247 @safe pure 248 struct Bytecode 249 { 250 uint raw; 251 //natural constraints 252 enum maxSequence = 2+4; 253 enum maxData = 1 << 22; 254 enum maxRaw = 1 << 31; 255 256 @safe pure: 257 this(IR code, uint data) 258 { 259 assert(data < (1 << 22) && code < 256); 260 raw = code << 24 | data; 261 } 262 263 this(IR code, uint data, uint seq) 264 { 265 assert(data < (1 << 22) && code < 256 ); 266 assert(seq >= 2 && seq < maxSequence); 267 raw = code << 24 | (seq - 2)<<22 | data; 268 } 269 270 //store raw data 271 static Bytecode fromRaw(uint data) 272 { 273 Bytecode t; 274 t.raw = data; 275 return t; 276 } 277 278 // bit twiddling helpers 279 // 0-arg template due to https://issues.dlang.org/show_bug.cgi?id=10985 280 @property uint data()() const { return raw & 0x003f_ffff; } 281 282 @property void data()(uint val) 283 { 284 raw = (raw & ~0x003f_ffff) | (val & 0x003f_ffff); 285 } 286 287 // ditto 288 // 0-arg template due to https://issues.dlang.org/show_bug.cgi?id=10985 289 @property uint sequence()() const { return 2 + (raw >> 22 & 0x3); } 290 291 // ditto 292 // 0-arg template due to https://issues.dlang.org/show_bug.cgi?id=10985 293 @property IR code()() const { return cast(IR)(raw >> 24); } 294 295 //ditto 296 @property bool hotspot() const { return hasMerge(code); } 297 298 //test the class of this instruction 299 @property bool isAtom() const { return isAtomIR(code); } 300 301 //ditto 302 @property bool isStart() const { return isStartIR(code); } 303 304 //ditto 305 @property bool isEnd() const { return isEndIR(code); } 306 307 //number of arguments for this instruction 308 @property int args() const { return immediateParamsIR(code); } 309 310 //mark this GroupStart or GroupEnd as referenced in backreference 311 void setBackrefence() 312 { 313 assert(code == IR.GroupStart || code == IR.GroupEnd); 314 raw = raw | 1 << 23; 315 } 316 317 //is referenced 318 @property bool backreference() const 319 { 320 assert(code == IR.GroupStart || code == IR.GroupEnd); 321 return cast(bool)(raw & 1 << 23); 322 } 323 324 //mark as local reference (for backrefs in lookarounds) 325 void setLocalRef() 326 { 327 assert(code == IR.Backref); 328 raw = raw | 1 << 23; 329 } 330 331 //is a local ref 332 @property bool localRef() const 333 { 334 assert(code == IR.Backref); 335 return cast(bool)(raw & 1 << 23); 336 } 337 338 //human readable name of instruction 339 @trusted @property string mnemonic()() const 340 {//@@@BUG@@@ to is @system 341 import std.conv : to; 342 return to!string(code); 343 } 344 345 //full length of instruction 346 @property uint length() const 347 { 348 return lengthOfIR(code); 349 } 350 351 //full length of respective start/end of this instruction 352 @property uint pairedLength() const 353 { 354 return lengthOfPairedIR(code); 355 } 356 357 //returns bytecode of paired instruction (assuming this one is start or end) 358 @property Bytecode paired() const 359 {//depends on bit and struct layout order 360 assert(isStart || isEnd); 361 return Bytecode.fromRaw(raw ^ 0b11 << 24); 362 } 363 364 //gets an index into IR block of the respective pair 365 uint indexOfPair(uint pc) const 366 { 367 assert(isStart || isEnd); 368 return isStart ? pc + data + length : pc - data - lengthOfPairedIR(code); 369 } 370 } 371 372 static assert(Bytecode.sizeof == 4); 373 374 375 //index entry structure for name --> number of submatch 376 struct NamedGroup 377 { 378 string name; 379 uint group; 380 } 381 382 //holds pair of start-end markers for a submatch 383 struct Group(DataIndex) 384 { 385 DataIndex begin = DataIndex.max; 386 DataIndex end = DataIndex.min; 387 388 bool opCast(T : bool)() const 389 { 390 return begin <= end; 391 } 392 393 @trusted string toString()() const 394 { 395 if (begin < end) 396 return "(unmatched)"; 397 import std.array : appender; 398 import std.format.write : formattedWrite; 399 auto a = appender!string(); 400 formattedWrite(a, "%s..%s", begin, end); 401 return a.data; 402 } 403 } 404 405 //debugging tool, prints out instruction along with opcodes 406 @trusted string disassemble(in Bytecode[] irb, uint pc, in NamedGroup[] dict=[]) 407 { 408 import std.array : appender; 409 import std.format.write : formattedWrite; 410 auto output = appender!string(); 411 formattedWrite(output,"%s", irb[pc].mnemonic); 412 switch (irb[pc].code) 413 { 414 case IR.Char: 415 formattedWrite(output, " %s (0x%x)",cast(dchar) irb[pc].data, irb[pc].data); 416 break; 417 case IR.OrChar: 418 formattedWrite(output, " %s (0x%x) seq=%d", cast(dchar) irb[pc].data, irb[pc].data, irb[pc].sequence); 419 break; 420 case IR.RepeatStart, IR.InfiniteStart, IR.InfiniteBloomStart, 421 IR.Option, IR.GotoEndOr, IR.OrStart: 422 //forward-jump instructions 423 uint len = irb[pc].data; 424 formattedWrite(output, " pc=>%u", pc+len+IRL!(IR.RepeatStart)); 425 break; 426 case IR.RepeatEnd, IR.RepeatQEnd: //backward-jump instructions 427 uint len = irb[pc].data; 428 formattedWrite(output, " pc=>%u min=%u max=%u step=%u", 429 pc - len, irb[pc + 3].raw, irb[pc + 4].raw, irb[pc + 2].raw); 430 break; 431 case IR.InfiniteEnd, IR.InfiniteQEnd, IR.InfiniteBloomEnd, IR.OrEnd: //ditto 432 uint len = irb[pc].data; 433 formattedWrite(output, " pc=>%u", pc-len); 434 break; 435 case IR.LookaheadEnd, IR.NeglookaheadEnd: //ditto 436 uint len = irb[pc].data; 437 formattedWrite(output, " pc=>%u", pc-len); 438 break; 439 case IR.GroupStart, IR.GroupEnd: 440 uint n = irb[pc].data; 441 string name; 442 foreach (v;dict) 443 if (v.group == n) 444 { 445 name = "'"~v.name~"'"; 446 break; 447 } 448 formattedWrite(output, " %s #%u " ~ (irb[pc].backreference ? "referenced" : ""), 449 name, n); 450 break; 451 case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart: 452 uint len = irb[pc].data; 453 uint start = irb[pc+1].raw, end = irb[pc+2].raw; 454 formattedWrite(output, " pc=>%u [%u..%u]", pc + len + IRL!(IR.LookaheadStart), start, end); 455 break; 456 case IR.Backref: case IR.CodepointSet: case IR.Trie: 457 uint n = irb[pc].data; 458 formattedWrite(output, " %u", n); 459 if (irb[pc].code == IR.Backref) 460 formattedWrite(output, " %s", irb[pc].localRef ? "local" : "global"); 461 break; 462 default://all data-free instructions 463 } 464 if (irb[pc].hotspot) 465 formattedWrite(output, " Hotspot %u", irb[pc+1].raw); 466 return output.data; 467 } 468 469 //disassemble the whole chunk 470 @trusted void printBytecode()(in Bytecode[] slice, in NamedGroup[] dict=[]) 471 { 472 import std.stdio : writeln; 473 for (uint pc=0; pc<slice.length; pc += slice[pc].length) 474 writeln("\t", disassemble(slice, pc, dict)); 475 } 476 477 // Encapsulates memory management, explicit ref counting 478 // and the exact type of engine created 479 // there is a single instance per engine combination type x Char 480 // In future may also maintain a (TLS?) cache of memory 481 interface MatcherFactory(Char) 482 { 483 @safe: 484 Matcher!Char create(const ref Regex!Char, in Char[] input) const; 485 Matcher!Char dup(Matcher!Char m, in Char[] input) const; 486 size_t incRef(Matcher!Char m) const; 487 size_t decRef(Matcher!Char m) const; 488 } 489 490 // Only memory management, no compile-time vs run-time specialities 491 abstract class GenericFactory(alias EngineType, Char) : MatcherFactory!Char 492 { 493 import core.memory : pureFree; 494 import std.internal.memory : enforceMalloc; 495 import core.memory : GC; 496 // round up to next multiple of size_t for alignment purposes 497 enum classSize = (__traits(classInstanceSize, EngineType!Char) + size_t.sizeof - 1) & ~(size_t.sizeof - 1); 498 499 EngineType!Char construct(const ref Regex!Char re, in Char[] input, void[] memory) const; 500 501 override EngineType!Char create(const ref Regex!Char re, in Char[] input) const @trusted 502 { 503 immutable size = EngineType!Char.initialMemory(re) + classSize; 504 auto memory = enforceMalloc(size)[0 .. size]; 505 scope(failure) pureFree(memory.ptr); 506 GC.addRange(memory.ptr, classSize); 507 auto engine = construct(re, input, memory); 508 assert(engine.refCount == 1); 509 assert(cast(void*) engine == memory.ptr); 510 return engine; 511 } 512 513 override EngineType!Char dup(Matcher!Char engine, in Char[] input) const @trusted 514 { 515 immutable size = EngineType!Char.initialMemory(engine.pattern) + classSize; 516 auto memory = enforceMalloc(size)[0 .. size]; 517 scope(failure) pureFree(memory.ptr); 518 auto copy = construct(engine.pattern, input, memory); 519 GC.addRange(memory.ptr, classSize); 520 engine.dupTo(copy, memory[classSize .. size]); 521 assert(copy.refCount == 1); 522 return copy; 523 } 524 525 override size_t incRef(Matcher!Char m) const 526 { 527 return ++m.refCount; 528 } 529 530 override size_t decRef(Matcher!Char m) const @trusted 531 { 532 assert(m.refCount != 0); 533 auto cnt = --m.refCount; 534 if (cnt == 0) 535 { 536 void* ptr = cast(void*) m; 537 GC.removeRange(ptr); 538 pureFree(ptr); 539 } 540 return cnt; 541 } 542 } 543 544 // A factory for run-time engines 545 class RuntimeFactory(alias EngineType, Char) : GenericFactory!(EngineType, Char) 546 { 547 override EngineType!Char construct(const ref Regex!Char re, in Char[] input, void[] memory) const 548 { 549 import core.lifetime : emplace; 550 return emplace!(EngineType!Char)(memory[0 .. classSize], 551 re, Input!Char(input), memory[classSize .. $]); 552 } 553 } 554 555 // A factory for compile-time engine 556 class CtfeFactory(alias EngineType, Char, alias func) : GenericFactory!(EngineType, Char) 557 { 558 override EngineType!Char construct(const ref Regex!Char re, in Char[] input, void[] memory) const 559 { 560 import core.lifetime : emplace; 561 return emplace!(EngineType!Char)(memory[0 .. classSize], 562 re, &func, Input!Char(input), memory[classSize .. $]); 563 } 564 } 565 566 private auto defaultFactoryImpl(Char)(const ref Regex!Char re) 567 { 568 import std.regex.internal.backtracking : BacktrackingMatcher; 569 import std.regex.internal.thompson : ThompsonMatcher; 570 import std.algorithm.searching : canFind; 571 static MatcherFactory!Char backtrackingFactory; 572 static MatcherFactory!Char thompsonFactory; 573 if (re.backrefed.canFind!"a != 0") 574 { 575 if (backtrackingFactory is null) 576 backtrackingFactory = new RuntimeFactory!(BacktrackingMatcher, Char); 577 return backtrackingFactory; 578 } 579 else 580 { 581 if (thompsonFactory is null) 582 thompsonFactory = new RuntimeFactory!(ThompsonMatcher, Char); 583 return thompsonFactory; 584 } 585 } 586 587 // Used to generate a pure wrapper for defaultFactoryImpl. Based on the example in 588 // the std.traits.SetFunctionAttributes documentation. 589 auto assumePureFunction(T)(T t) 590 if (isFunctionPointer!T) 591 { 592 enum attrs = functionAttributes!T | FunctionAttribute.pure_; 593 return cast(SetFunctionAttributes!(T, functionLinkage!T, attrs)) t; 594 } 595 596 // A workaround for R-T enum re = regex(...) 597 template defaultFactory(Char) 598 { 599 // defaultFactory is constructed as a safe, pure wrapper over defaultFactoryImpl. 600 // It can be faked as pure because the static mutable variables are used to cache 601 // the key and character matcher. The technique used avoids delegates and GC. 602 @property MatcherFactory!Char defaultFactory(const ref Regex!Char re) @safe pure 603 { 604 static auto impl(const ref Regex!Char re) 605 { 606 return defaultFactoryImpl(re); 607 } 608 609 static auto pureImpl(const ref Regex!Char re) @trusted 610 { 611 auto p = assumePureFunction(&impl); 612 return p(re); 613 } 614 615 return pureImpl(re); 616 } 617 } 618 619 // Defining it as an interface has the undesired side-effect: 620 // casting any class to an interface silently adjusts pointer to point to a nested vtbl 621 abstract class Matcher(Char) 622 { 623 abstract: 624 // Get a (next) match 625 int match(Group!size_t[] matches) pure; 626 // This only maintains internal ref-count, 627 // deallocation happens inside MatcherFactory 628 @property ref size_t refCount() @safe; 629 // Copy internal state to another engine, using memory arena 'memory' 630 void dupTo(Matcher!Char m, void[] memory); 631 // The pattern loaded 632 @property ref const(Regex!Char) pattern() @safe; 633 // Re-arm the engine with new Input 634 Matcher rearm(in Char[] stream); 635 } 636 637 /++ 638 `Regex` object holds regular expression pattern in compiled form. 639 Instances of this object are constructed via calls to `regex`. 640 This is an intended form for caching and storage of frequently 641 used regular expressions. 642 +/ 643 struct Regex(Char) 644 { 645 //temporary workaround for identifier lookup 646 CodepointSet[] charsets; // 647 Bytecode[] ir; //compiled bytecode of pattern 648 649 650 @safe @property bool empty() const nothrow { return ir is null; } 651 /++ 652 `namedCaptures` returns a range of all named captures in a given regular expression. 653 +/ 654 @safe @property auto namedCaptures() 655 { 656 static struct NamedGroupRange 657 { 658 private: 659 const(NamedGroup)[] groups; 660 size_t start; 661 size_t end; 662 public: 663 this(const(NamedGroup)[] g, size_t s, size_t e) 664 { 665 assert(s <= e); 666 assert(e <= g.length); 667 groups = g; 668 start = s; 669 end = e; 670 } 671 672 @property string front() { return groups[start].name; } 673 @property string back() { return groups[end-1].name; } 674 @property bool empty() { return start >= end; } 675 @property size_t length() { return end - start; } 676 alias opDollar = length; 677 @property NamedGroupRange save() 678 { 679 return NamedGroupRange(groups, start, end); 680 } 681 void popFront() { assert(!empty); start++; } 682 void popBack() { assert(!empty); end--; } 683 string opIndex()(size_t i) 684 { 685 assert(start + i < end, 686 "Requested named group is out of range."); 687 return groups[start+i].name; 688 } 689 NamedGroupRange opSlice(size_t low, size_t high) { 690 assert(low <= high); 691 assert(start + high <= end); 692 return NamedGroupRange(groups, start + low, start + high); 693 } 694 NamedGroupRange opSlice() { return this.save; } 695 } 696 return NamedGroupRange(dict, 0, dict.length); 697 } 698 699 package(std.regex): 700 import std.regex.internal.kickstart : Kickstart; //TODO: get rid of this dependency 701 const(NamedGroup)[] dict; // maps name -> user group number 702 uint ngroup; // number of internal groups 703 uint maxCounterDepth; // max depth of nested {n,m} repetitions 704 uint hotspotTableSize; // number of entries in merge table 705 uint threadCount; // upper bound on number of Thompson VM threads 706 uint flags; // global regex flags 707 public const(CharMatcher)[] matchers; // tables that represent character sets 708 public const(BitTable)[] filters; // bloom filters for conditional loops 709 uint[] backrefed; // bit array of backreferenced submatches 710 Kickstart!Char kickstart; 711 MatcherFactory!Char factory; // produces optimal matcher for this pattern 712 immutable(Char)[] pattern; // copy of pattern to serve as cache key 713 714 const(Regex) withFactory(MatcherFactory!Char factory) pure const @trusted 715 { 716 auto r = cast() this; 717 r.factory = factory; 718 return r; 719 } 720 721 const(Regex) withFlags(uint newFlags) pure const @trusted 722 { 723 auto r = cast() this; 724 r.flags = newFlags; 725 return r; 726 } 727 728 const(Regex) withCode(const(Bytecode)[] code) pure const @trusted 729 { 730 auto r = cast() this; 731 r.ir = code.dup; // TODO: sidestep const instead? 732 return r; 733 } 734 735 const(Regex) withNGroup(uint nGroup) pure const @trusted 736 { 737 auto r = cast() this; 738 r.ngroup = nGroup; 739 return r; 740 } 741 742 //bit access helper 743 uint isBackref(uint n) 744 { 745 if (n/32 >= backrefed.length) 746 return 0; 747 return backrefed[n / 32] & (1 << (n & 31)); 748 } 749 750 //check if searching is not needed 751 void checkIfOneShot() 752 { 753 L_CheckLoop: 754 for (uint i = 0; i < ir.length; i += ir[i].length) 755 { 756 switch (ir[i].code) 757 { 758 case IR.Bof: 759 flags |= RegexInfo.oneShot; 760 break L_CheckLoop; 761 case IR.GroupStart, IR.GroupEnd, IR.Bol, IR.Eol, IR.Eof, 762 IR.Wordboundary, IR.Notwordboundary: 763 break; 764 default: 765 break L_CheckLoop; 766 } 767 } 768 } 769 770 //print out disassembly a program's IR 771 @trusted debug(std_regex_parser) void print() const 772 {//@@@BUG@@@ write is system 773 for (uint i = 0; i < ir.length; i += ir[i].length) 774 { 775 writefln("%d\t%s ", i, disassemble(ir, i, dict)); 776 } 777 writeln("Total merge table size: ", hotspotTableSize); 778 writeln("Max counter nesting depth: ", maxCounterDepth); 779 } 780 781 public string toString()() const 782 { 783 import std.format : format; 784 static if (is(typeof(pattern) : string)) 785 alias patternString = pattern; 786 else 787 { 788 import std.conv : to; 789 auto patternString = conv.to!string(pattern); 790 } 791 auto quotedEscapedPattern = format("%(%s %)", [patternString]); 792 auto flagString = regexOptionsToString(flags); 793 return "Regex!" ~ Char.stringof ~ "(" ~ quotedEscapedPattern ~ ", \"" ~ flagString ~ "\")"; 794 } 795 } 796 797 // The stuff below this point is temporarrily part of IR module 798 // but may need better place in the future (all internals) 799 package(std.regex): 800 801 //Simple UTF-string abstraction compatible with stream interface 802 struct Input(Char) 803 if (is(Char :dchar)) 804 { 805 import std.utf : decode; 806 alias DataIndex = size_t; 807 enum bool isLoopback = false; 808 alias String = const(Char)[]; 809 String _origin; 810 size_t _index; 811 812 //constructs Input object out of plain string 813 this(String input, size_t idx = 0) 814 { 815 _origin = input; 816 _index = idx; 817 } 818 819 //codepoint at current stream position 820 pragma(inline, true) bool nextChar(ref dchar res, ref size_t pos) 821 { 822 pos = _index; 823 // DMD's inliner hates multiple return functions 824 // but can live with single statement if/else bodies 825 bool n = !(_index == _origin.length); 826 if (n) 827 res = decode(_origin, _index); 828 return n; 829 } 830 @property bool atEnd(){ 831 return _index == _origin.length; 832 } 833 bool search(Kickstart)(ref const Kickstart kick, ref dchar res, ref size_t pos) 834 { 835 size_t idx = kick.search(_origin, _index); 836 _index = idx; 837 return nextChar(res, pos); 838 } 839 840 //index of at End position 841 @property size_t lastIndex(){ return _origin.length; } 842 843 //support for backtracker engine, might not be present 844 void reset(size_t index){ _index = index; } 845 846 String opSlice(size_t start, size_t end){ return _origin[start .. end]; } 847 848 auto loopBack(size_t index){ return BackLooper!Input(this, index); } 849 } 850 851 struct BackLooperImpl(Input) 852 { 853 import std.utf : strideBack; 854 alias DataIndex = size_t; 855 alias String = Input.String; 856 enum bool isLoopback = true; 857 String _origin; 858 size_t _index; 859 this(Input input, size_t index) 860 { 861 _origin = input._origin; 862 _index = index; 863 } 864 this(String input) 865 { 866 _origin = input; 867 _index = input.length; 868 } 869 @trusted bool nextChar(ref dchar res,ref size_t pos) 870 { 871 pos = _index; 872 if (_index == 0) 873 return false; 874 875 res = _origin[0.._index].back; 876 _index -= strideBack(_origin, _index); 877 878 return true; 879 } 880 @property atEnd(){ return _index == 0 || _index == strideBack(_origin, _index); } 881 auto loopBack(size_t index){ return Input(_origin, index); } 882 883 //support for backtracker engine, might not be present 884 //void reset(size_t index){ _index = index ? index-std.utf.strideBack(_origin, index) : 0; } 885 void reset(size_t index){ _index = index; } 886 887 String opSlice(size_t start, size_t end){ return _origin[end .. start]; } 888 //index of at End position 889 @property size_t lastIndex(){ return 0; } 890 } 891 892 template BackLooper(E) 893 { 894 static if (is(E : BackLooperImpl!U, U)) 895 { 896 alias BackLooper = U; 897 } 898 else 899 { 900 alias BackLooper = BackLooperImpl!E; 901 } 902 } 903 904 //both helpers below are internal, on its own are quite "explosive" 905 //unsafe, no initialization of elements 906 @system pure T[] mallocArray(T)(size_t len) 907 { 908 import core.memory : pureMalloc; 909 return (cast(T*) pureMalloc(len * T.sizeof))[0 .. len]; 910 } 911 912 //very unsafe, no initialization 913 @system T[] arrayInChunk(T)(size_t len, ref void[] chunk) 914 { 915 auto ret = (cast(T*) chunk.ptr)[0 .. len]; 916 chunk = chunk[len * T.sizeof .. $]; 917 return ret; 918 } 919 920 // 921 @trusted uint lookupNamedGroup(String)(const(NamedGroup)[] dict, String name) 922 {//equal is @system? 923 import std.algorithm.comparison : equal; 924 import std.algorithm.iteration : map; 925 import std.conv : text; 926 import std.range : assumeSorted; 927 928 auto fnd = assumeSorted!"cmp(a,b) < 0"(map!"a.name"(dict)).lowerBound(name).length; 929 enforce(fnd < dict.length && equal(dict[fnd].name, name), 930 text("no submatch named ", name)); 931 return dict[fnd].group; 932 } 933 934 // whether ch is one of unicode newline sequences 935 // 0-arg template due to https://issues.dlang.org/show_bug.cgi?id=10985 936 bool endOfLine()(dchar front, bool seenCr) 937 { 938 return ((front == '\n') ^ seenCr) || front == '\r' 939 || front == NEL || front == LS || front == PS; 940 } 941 942 // 0-arg template due to https://issues.dlang.org/show_bug.cgi?id=10985 943 bool startOfLine()(dchar back, bool seenNl) 944 { 945 return ((back == '\r') ^ seenNl) || back == '\n' 946 || back == NEL || back == LS || back == PS; 947 } 948 949 ///Exception object thrown in case of errors during regex compilation. 950 public class RegexException : Exception 951 { 952 mixin basicExceptionCtors; 953 } 954 955 // simple 128-entry bit-table used with a hash function 956 struct BitTable { 957 uint[4] filter; 958 959 this(CodepointSet set){ 960 foreach (iv; set.byInterval) 961 { 962 foreach (v; iv.a .. iv.b) 963 add(v); 964 } 965 } 966 967 void add()(dchar ch){ 968 immutable i = index(ch); 969 filter[i >> 5] |= 1<<(i & 31); 970 } 971 // non-zero -> might be present, 0 -> absent 972 bool opIndex()(dchar ch) const{ 973 immutable i = index(ch); 974 return (filter[i >> 5]>>(i & 31)) & 1; 975 } 976 977 static uint index()(dchar ch){ 978 return ((ch >> 7) ^ ch) & 0x7F; 979 } 980 } 981 982 struct CharMatcher { 983 BitTable ascii; // fast path for ASCII 984 Trie trie; // slow path for Unicode 985 986 this(CodepointSet set) 987 { 988 auto asciiSet = set & unicode.ASCII; 989 ascii = BitTable(asciiSet); 990 trie = makeTrie(set); 991 } 992 993 bool opIndex()(dchar ch) const 994 { 995 if (ch < 0x80) 996 return ascii[ch]; 997 else 998 return trie[ch]; 999 } 1000 } 1001 1002 // Internal non-resizeble array, switches between inline storage and CoW 1003 // POD-only 1004 struct SmallFixedArray(T, uint SMALL=3) 1005 if (!hasElaborateDestructor!T) 1006 { 1007 import std.internal.memory : enforceMalloc; 1008 import core.memory : pureFree; 1009 static struct Payload 1010 { 1011 size_t refcount; 1012 T[0] placeholder; 1013 inout(T)* ptr() inout { return placeholder.ptr; } 1014 } 1015 static assert(Payload.sizeof == size_t.sizeof); 1016 union 1017 { 1018 Payload* big; 1019 T[SMALL] small; 1020 } 1021 size_t _sizeMask; 1022 enum BIG_MASK = size_t(1)<<(8*size_t.sizeof-1); 1023 enum SIZE_MASK = ~BIG_MASK; 1024 1025 @property bool isBig() const { return (_sizeMask & BIG_MASK) != 0; } 1026 @property size_t length() const { return _sizeMask & SIZE_MASK; } 1027 1028 this(size_t size) 1029 { 1030 if (size <= SMALL) 1031 { 1032 small[] = T.init; 1033 _sizeMask = size; 1034 } 1035 else 1036 { 1037 big = cast(Payload*) enforceMalloc(Payload.sizeof + T.sizeof*size); 1038 big.refcount = 1; 1039 _sizeMask = size | BIG_MASK; 1040 } 1041 } 1042 1043 private @trusted @property inout(T)[] internalSlice() inout 1044 { 1045 return isBig ? big.ptr[0 .. length] : small[0 .. length]; 1046 } 1047 1048 this(this) @trusted 1049 { 1050 if (isBig) 1051 { 1052 big.refcount++; 1053 } 1054 } 1055 1056 bool opEquals(SmallFixedArray a) 1057 { 1058 return internalSlice[] == a.internalSlice[]; 1059 } 1060 1061 size_t toHash() const 1062 { 1063 return hashOf(internalSlice[]); 1064 } 1065 1066 ref inout(T) opIndex(size_t idx) inout 1067 { 1068 return internalSlice[idx]; 1069 } 1070 1071 // accesses big to test self-referencing so not @safe 1072 @trusted ref opAssign(SmallFixedArray arr) 1073 { 1074 if (isBig) 1075 { 1076 if (arr.isBig) 1077 { 1078 if (big is arr.big) return this; // self-assign 1079 else 1080 { 1081 abandonRef(); 1082 _sizeMask = arr._sizeMask; 1083 big = arr.big; 1084 big.refcount++; 1085 } 1086 } 1087 else 1088 { 1089 abandonRef(); 1090 _sizeMask = arr._sizeMask; 1091 small = arr.small; 1092 } 1093 } 1094 else 1095 { 1096 if (arr.isBig) 1097 { 1098 _sizeMask = arr._sizeMask; 1099 big = arr.big; 1100 big.refcount++; 1101 } 1102 else 1103 { 1104 _sizeMask = arr._sizeMask; 1105 small = arr.small; 1106 } 1107 } 1108 return this; 1109 } 1110 1111 void mutate(scope void delegate(T[]) pure filler) 1112 { 1113 if (isBig && big.refcount != 1) // copy on write 1114 { 1115 auto oldSizeMask = _sizeMask; 1116 auto newbig = cast(Payload*) enforceMalloc(Payload.sizeof + T.sizeof*length); 1117 newbig.refcount = 1; 1118 abandonRef(); 1119 big = newbig; 1120 _sizeMask = oldSizeMask; 1121 } 1122 filler(internalSlice); 1123 } 1124 1125 ~this() 1126 { 1127 if (isBig) 1128 { 1129 abandonRef(); 1130 } 1131 } 1132 1133 @trusted private void abandonRef() 1134 { 1135 assert(isBig); 1136 if (--big.refcount == 0) 1137 { 1138 pureFree(big); 1139 _sizeMask = 0; 1140 assert(!isBig); 1141 } 1142 } 1143 } 1144 1145 @system unittest 1146 { 1147 alias SA = SmallFixedArray!(int, 2); 1148 SA create(int[] data) 1149 { 1150 SA a = SA(data.length); 1151 a.mutate((slice) { slice[] = data[]; }); 1152 assert(a.internalSlice == data); 1153 return a; 1154 } 1155 1156 { 1157 SA a; 1158 a = SA(1); 1159 assert(a.length == 1); 1160 a = SA.init; 1161 assert(a.length == 0); 1162 } 1163 1164 { 1165 SA a, b, c, d; 1166 assert(a.length == 0); 1167 assert(a.internalSlice == b.internalSlice); 1168 a = create([1]); 1169 assert(a.internalSlice == [1]); 1170 b = create([2, 3]); 1171 assert(b.internalSlice == [2, 3]); 1172 c = create([3, 4, 5]); 1173 d = create([5, 6, 7, 8]); 1174 assert(c.isBig); 1175 a = c; 1176 assert(a.isBig); 1177 assert(a.big is c.big); 1178 assert(a.big.refcount == 2); 1179 assert(a.internalSlice == [3, 4, 5]); 1180 assert(c.internalSlice == [3, 4, 5]); 1181 a = b; 1182 assert(!a.isBig); 1183 assert(a.internalSlice == [2, 3]); 1184 assert(c.big.refcount == 1); 1185 a = c; 1186 assert(c.big.refcount == 2); 1187 1188 // mutate copies on write if ref-count is not 1 1189 a.mutate((slice){ slice[] = 1; }); 1190 assert(a.internalSlice == [1, 1, 1]); 1191 assert(c.internalSlice == [3, 4, 5]); 1192 assert(a.isBig && c.isBig); 1193 assert(a.big.refcount == 1); 1194 assert(c.big.refcount == 1); 1195 1196 auto e = d; 1197 assert(e.big.refcount == 2); 1198 auto f = d; 1199 f = a; 1200 assert(f.isBig); 1201 assert(f.internalSlice == [1, 1, 1]); 1202 assert(f.big.refcount == 2); // a & f 1203 assert(e.big.refcount == 2); // d & e 1204 a = c; 1205 assert(f.big.refcount == 1); // f 1206 assert(e.big.refcount == 2); // d & e 1207 a = a; 1208 a = a; 1209 a = a; 1210 assert(a.big.refcount == 2); // a & c 1211 } 1212 }