1 /++ 2 +/ 3 module mir.ion.symbol_table; 4 5 import mir.utility: min, max, swap, _expect; 6 import mir.internal.memory; 7 8 // hash - string hash 9 // key - string start position 10 // value - symbol id 11 12 /++ 13 Each version of the Ion specification defines the corresponding system symbol table version. 14 Ion 1.0 uses the `"$ion"` symbol table, version 1, 15 and future versions of Ion will use larger versions of the `"$ion"` symbol table. 16 `$ion_1_1` will probably use version 2, while `$ion_2_0` might use version 5. 17 18 Applications and users should never have to care about these symbol table versions, 19 since they are never explicit in user data: this specification disallows (by ignoring) imports named `"$ion"`. 20 21 Here are the system symbols for Ion 1.0. 22 +/ 23 static immutable string[] IonSystemSymbolTable_v1 = [ 24 "$0", 25 "$ion", 26 "$ion_1_0", 27 "$ion_symbol_table", 28 "name", 29 "version", 30 "imports", 31 "symbols", 32 "max_id", 33 "$ion_shared_symbol_table", 34 ]; 35 36 /++ 37 +/ 38 enum IonSystemSymbol : ubyte 39 { 40 /// 41 zero, 42 /// 43 ion, 44 /// 45 ion_1_0, 46 /// 47 ion_symbol_table, 48 /// 49 name, 50 /// 51 version_, 52 /// 53 imports, 54 /// 55 symbols, 56 /// 57 max_id, 58 /// 59 ion_shared_symbol_table, 60 } 61 62 package(mir) string[] removeSystemSymbols(const(string)[] keys) @safe pure nothrow 63 { 64 string[] ret; 65 F: foreach (key; keys) switch(key) 66 { 67 static foreach (skey; IonSystemSymbolTable_v1) 68 { 69 case skey: continue F; 70 } 71 default: 72 ret ~= key; 73 } 74 return ret; 75 } 76 77 struct IonSymbolTableSequental 78 { 79 import mir.ser.ion: IonSerializer; 80 import mir.ndslice.slice; 81 import core.stdc.string; 82 import core.stdc.stdio; 83 84 static struct Entry 85 { 86 ulong* data; 87 uint[] ids; 88 } 89 90 ulong[] temporalStorage; 91 Entry[] entries; 92 uint nextID = IonSystemSymbol.max + 1; 93 IonSerializer!(1024, null, false) serializer = void; 94 enum size_t annotationWrapperState = 0; 95 enum size_t annotationsState = 5; 96 enum size_t structState = 5; 97 enum size_t listState = 9; 98 99 @trusted pure nothrow @nogc: 100 101 // $ion_symbol_table:: 102 // { 103 // symbols:[ ... ] 104 // } 105 void initialize()(size_t n = 64) 106 { 107 pragma(inline, true); 108 auto llen = n / ulong.sizeof + (n % ulong.sizeof != 0); 109 auto temporalStoragePtr = cast(ulong*) malloc(llen * ulong.sizeof); 110 this.temporalStorage = temporalStoragePtr[0 .. llen]; 111 auto entriesPtr = cast(Entry*) malloc(n * Entry.sizeof); 112 this.entries = entriesPtr[0 .. n]; 113 this.entries[] = Entry.init; 114 this.nextID = IonSystemSymbol.max + 1; 115 this.serializer.initializeNoTable; 116 117 auto annotationWrapperState = serializer.annotationWrapperBegin; 118 assert(annotationWrapperState == this.annotationWrapperState); 119 serializer.putAnnotationId(IonSystemSymbol.ion_symbol_table); 120 auto annotationsState = serializer.annotationsEnd(annotationWrapperState); 121 assert(annotationsState == this.annotationsState); 122 auto structState = serializer.structBegin(); 123 assert(structState == this.structState); 124 serializer.putKeyId(IonSystemSymbol.symbols); 125 auto listState = serializer.listBegin(); 126 assert(listState == this.listState); 127 } 128 129 void finalize()() 130 { 131 pragma(inline, true); 132 if (nextID > IonSystemSymbol.max + 1) 133 { 134 serializer.listEnd(listState); 135 serializer.structEnd(structState); 136 serializer.annotationWrapperEnd(annotationsState, annotationWrapperState); 137 } 138 else 139 { 140 serializer.buffer._currentLength = 0; 141 } 142 143 temporalStorage.ptr.free; 144 foreach(ref e; entries) 145 { 146 e.data.free; 147 e.ids.ptr.free; 148 } 149 entries.ptr.free; 150 } 151 152 uint insert()(scope const(char)[] str) 153 { 154 pragma(inline, true); 155 auto n = str.length; 156 auto llen = n / ulong.sizeof + (n % ulong.sizeof != 0); 157 if (_expect(n >= entries.length, false)) 158 { 159 auto oldLength = entries.length; 160 auto temporalStoragePtr = cast(ulong*) realloc(temporalStorage.ptr, llen * ulong.sizeof); 161 this.temporalStorage = temporalStoragePtr[0 .. llen]; 162 this.temporalStorage.ptr[0] = 0; 163 auto entriesPtr = cast(Entry*) realloc(entries.ptr, (n + 1) * Entry.sizeof); 164 this.entries = entriesPtr[0 .. n + 1]; 165 this.entries[oldLength .. $] = Entry.init; 166 } 167 temporalStorage.ptr[0] = 0; 168 memcpy(cast(ubyte*)(temporalStorage.ptr + llen) - str.length, str.ptr, str.length); 169 170 // { 171 // auto tempPtr0 = cast(ubyte*)(temporalStorage.ptr + llen) - str.length; 172 // foreach (i; 0 .. str.length) 173 // tempPtr0[i] = str[i]; 174 // } 175 176 with(entries[n]) 177 { 178 if (_expect(ids.length == 0, false)) 179 { 180 auto idsPtr = cast(uint*)malloc(uint.sizeof); 181 ids = idsPtr[0 .. 1]; 182 if (llen) 183 { 184 data = cast(ulong*) malloc(ulong.sizeof * llen); 185 memcpy(data, temporalStorage.ptr, llen * ulong.sizeof); 186 } 187 goto R; 188 } 189 if (llen == 0) 190 return ids[0]; 191 { 192 sizediff_t i = ids.length - 1; 193 L: do 194 { 195 sizediff_t j = llen - 1; 196 auto datai = data + i * llen; 197 for(;;) 198 { 199 if (datai[j] == temporalStorage[j]) 200 { 201 if (--j >= 0) 202 continue; 203 return ids.ptr[i]; 204 } 205 break; 206 } 207 } 208 while (--i >= 0); 209 } 210 211 { 212 auto idsPtr = cast(uint*)realloc(ids.ptr, (ids.length + 1) * uint.sizeof); 213 ids = idsPtr[0 .. ids.length + 1]; 214 data = cast(ulong*) realloc(data, ids.length * ulong.sizeof * llen); 215 memcpy(data + llen * (ids.length - 1), temporalStorage.ptr, llen * ulong.sizeof); 216 } 217 R: 218 serializer.putValue(str); 219 return ids.ptr[ids.length - 1] = nextID++; 220 } 221 } 222 } 223 224 unittest 225 { 226 import mir.test; 227 import mir.ion.conv: text2ion; 228 229 { 230 IonSymbolTableSequental symbolTable = void; 231 symbolTable.initialize; 232 symbolTable.finalize; 233 symbolTable.serializer.data.should == []; 234 } 235 236 237 { 238 IonSymbolTableSequental symbolTable = void; 239 symbolTable.initialize; 240 symbolTable.insert(`id`); 241 symbolTable.finalize; 242 symbolTable.serializer.data.should == [0xE8, 0x81, 0x83, 0xD5, 0x87, 0xB3, 0x82, 0x69, 0x64]; 243 } 244 245 { 246 IonSymbolTableSequental symbolTable = void; 247 symbolTable.initialize; 248 symbolTable.insert(`id`); 249 symbolTable.insert(`id`); 250 symbolTable.insert(`id`); 251 symbolTable.insert(`id`); 252 symbolTable.finalize; 253 symbolTable.serializer.data.should == [0xE8, 0x81, 0x83, 0xD5, 0x87, 0xB3, 0x82, 0x69, 0x64]; 254 } 255 256 { 257 IonSymbolTableSequental symbolTable = void; 258 symbolTable.initialize; 259 auto d = symbolTable.insert(`id`); 260 auto D = symbolTable.insert(`qwertyuioplkjhgfdszxcvbnm`); 261 assert(symbolTable.insert(`id`) == d); 262 assert(symbolTable.insert(`qwertyuioplkjhgfdszxcvbnm`) == D); 263 symbolTable.finalize; 264 symbolTable.serializer.data.should == [0xEE, 0xA5, 0x81, 0x83, 0xDE, 0xA1, 0x87, 0xBE, 0x9E, 0x82, 0x69, 0x64, 0x8E, 0x99, 0x71, 0x77, 0x65, 0x72, 0x74, 0x79, 0x75, 0x69, 0x6F, 0x70, 0x6C, 0x6B, 0x6A, 0x68, 0x67, 0x66, 0x64, 0x73, 0x7A, 0x78, 0x63, 0x76, 0x62, 0x6E, 0x6D]; 265 } 266 267 { 268 IonSymbolTableSequental symbolTable = void; 269 symbolTable.initialize; 270 auto d = symbolTable.insert(`id`); 271 assert(symbolTable.insert(`qwertyuioOlkjhgfdszxcvbnm`) == d + 1); 272 assert(symbolTable.insert(`ID`) == d + 2); 273 assert(symbolTable.insert(`qwertyuioplkjhgfdszxcvbnm`) == d + 3); 274 symbolTable.finalize; 275 symbolTable.serializer.data.should == `[id, qwertyuioOlkjhgfdszxcvbnm, ID, qwertyuioplkjhgfdszxcvbnm]`.text2ion[4 .. $ - 9]; 276 } 277 } 278 279 280 /++ 281 +/ 282 struct IonSymbolTable(bool gc) 283 { 284 private static align(16) struct Entry 285 { 286 int probeCount = -1; 287 uint hash; 288 uint keyPosition; 289 uint keyLength; 290 uint value; 291 292 @safe pure nothrow @nogc @property: 293 294 bool empty() const scope 295 { 296 return probeCount < 0; 297 } 298 } 299 300 enum double maxLoadFactor = 0.8; 301 enum uint initialMaxProbe = 8; 302 enum uint initialLength = 1 << initialMaxProbe; 303 304 Entry* entries; 305 uint nextKeyPosition; 306 uint lengthMinusOne = initialLength - 1; 307 uint maxProbe = initialMaxProbe; 308 uint elementCount; 309 uint startId = IonSystemSymbol.max + 1; 310 ubyte[] keySpace; 311 312 static if (!gc) 313 { 314 Entry[initialLength + initialMaxProbe] initialStackSpace = void; 315 ubyte[8192] initialKeysSpace = void; 316 } 317 318 import mir.ion.type_code: IonTypeCode; 319 import mir.ion.tape: ionPut, ionPutEnd, ionPutVarUInt, ionPutAnnotationsListEnd, ionPutStartLength, ionPutAnnotationsListStartLength; 320 321 @disable this(this); 322 323 pure nothrow: 324 325 static if (!gc) 326 ~this() @trusted 327 { 328 if (entries != initialStackSpace.ptr) 329 free(entries); 330 if (keySpace.ptr != initialKeysSpace.ptr) 331 free(keySpace.ptr); 332 entries = null; 333 keySpace = null; 334 } 335 336 /// 337 bool initialized() @property 338 { 339 return keySpace.length != 0; 340 } 341 342 /// 343 void initializeNull() @property 344 { 345 entries = null; 346 nextKeyPosition = 0; 347 lengthMinusOne = initialLength - 1; 348 maxProbe = initialMaxProbe; 349 elementCount = 0; 350 startId = IonSystemSymbol.max + 1; 351 keySpace = null; 352 } 353 354 /// 355 void initialize() scope 356 { 357 initializeNull; 358 static if (gc) 359 { 360 entries = new Entry[initialLength + initialMaxProbe].ptr; 361 keySpace = new ubyte[1024]; 362 } 363 else 364 { 365 initialStackSpace[] = Entry.init; 366 entries = initialStackSpace.ptr; 367 keySpace = initialKeysSpace[]; 368 } 369 370 assert(nextKeyPosition == 0); 371 nextKeyPosition += ionPutStartLength; // annotation object 372 nextKeyPosition += ionPutVarUInt(keySpace.ptr + nextKeyPosition, ubyte(1u)); 373 nextKeyPosition += ionPutVarUInt(keySpace.ptr + nextKeyPosition, ubyte(IonSystemSymbol.ion_symbol_table)); 374 assert(nextKeyPosition == 5); 375 nextKeyPosition += ionPutStartLength; // object 376 nextKeyPosition += ionPutVarUInt(keySpace.ptr + nextKeyPosition, IonSystemSymbol.symbols); 377 assert(nextKeyPosition == 9); 378 nextKeyPosition += ionPutStartLength; // symbol array 379 assert(nextKeyPosition == unfinilizedFirstKeyPosition); 380 } 381 382 package enum unfinilizedFirstKeyPosition = 12; 383 384 const(ubyte)[] unfinilizedKeysData() scope const { 385 return keySpace[unfinilizedFirstKeyPosition .. nextKeyPosition]; 386 } 387 388 /++ 389 Prepare the table for writing. 390 The table shouldn't be used after that. 391 +/ 392 void finalize() @trusted 393 { 394 if (nextKeyPosition == unfinilizedFirstKeyPosition) 395 { 396 nextKeyPosition = 0; 397 return; 398 } 399 { 400 auto shift = 9; 401 auto length = nextKeyPosition - (shift + ionPutStartLength); 402 nextKeyPosition = cast(uint)(shift + ionPutEnd(keySpace.ptr + shift, IonTypeCode.list, length)); 403 } 404 405 { 406 auto shift = 5; 407 auto length = nextKeyPosition - (shift + ionPutStartLength); 408 nextKeyPosition = cast(uint)(shift + ionPutEnd(keySpace.ptr + shift, IonTypeCode.struct_, length)); 409 } 410 411 { 412 auto shift = 0; 413 auto length = nextKeyPosition - (shift + ionPutStartLength); 414 nextKeyPosition = cast(uint)(shift + ionPutEnd(keySpace.ptr + shift, IonTypeCode.annotations, length)); 415 } 416 } 417 418 inout(ubyte)[] data() inout @property 419 { 420 return keySpace[0 .. nextKeyPosition]; 421 } 422 423 private inout(Entry)[] currentEntries() inout @property 424 { 425 return entries[0 .. lengthMinusOne + 1 + maxProbe]; 426 } 427 428 private void grow() 429 { 430 pragma(inline, false); 431 auto currentEntries = this.currentEntries[0 .. $-1]; 432 433 lengthMinusOne = lengthMinusOne * 2 + 1; 434 maxProbe++; 435 436 static if (gc) 437 { 438 entries = new Entry[lengthMinusOne + 1 + maxProbe].ptr; 439 } 440 else 441 { 442 entries = cast(Entry*)malloc((lengthMinusOne + 1 + maxProbe) * Entry.sizeof); 443 if (entries is null) 444 assert(0); 445 } 446 447 this.currentEntries[] = Entry.init; 448 this.currentEntries[$ - 1].probeCount = 0; 449 450 foreach (i, ref entry; currentEntries) 451 { 452 if (!entry.empty) 453 { 454 auto current = entries + (entry.hash & lengthMinusOne); 455 int probeCount; 456 457 while (current[probeCount].probeCount >= probeCount) 458 probeCount++; 459 460 assert (elementCount + 1 < (lengthMinusOne + 1) * maxLoadFactor && probeCount < maxProbe); 461 462 entry.probeCount = probeCount; 463 current += entry.probeCount; 464 if (current.empty) 465 { 466 *current = entry; 467 continue; 468 } 469 if (entry.probeCount <= current.probeCount) 470 { 471 L: 472 do { 473 entry.probeCount++; 474 current++; 475 } 476 while (entry.probeCount <= current.probeCount); 477 } 478 479 assert (current <= entries + lengthMinusOne + maxProbe); 480 481 swap(*current, entry); 482 if (!entry.empty) 483 goto L; 484 } 485 } 486 487 static if (!gc) 488 { 489 if (currentEntries.ptr != initialStackSpace.ptr) 490 free(currentEntries.ptr); 491 } 492 } 493 494 /// 495 uint insert(scope const(char)[] key) 496 { 497 pragma(inline, true); 498 uint ret = insert(key, cast(uint)dlang_hll_murmurhash(key)); 499 return ret; 500 } 501 502 /// 503 uint insert(scope const(char)[] key, uint hash) 504 { 505 L0: 506 pragma(inline, true); 507 auto current = entries + (hash & lengthMinusOne); 508 int probeCount; 509 for (;;) 510 { 511 if (current[probeCount].probeCount < probeCount) 512 break; 513 probeCount++; 514 if (hash != current[probeCount - 1].hash) 515 continue; 516 auto pos = current[probeCount - 1].keyPosition; 517 auto len = current[probeCount - 1].keyLength; 518 if (key == keySpace[pos .. pos + len]) 519 return current[probeCount - 1].value; 520 } 521 522 if (_expect(elementCount + 1 > (lengthMinusOne + 1) * maxLoadFactor || probeCount == maxProbe, false)) 523 { 524 grow(); 525 goto L0; 526 } 527 528 // add key 529 if (_expect(nextKeyPosition + key.length + 16 > keySpace.length, false)) 530 { 531 auto newLength = max(nextKeyPosition + key.length + 16, keySpace.length * 2); 532 static if (gc) 533 { 534 keySpace.length = newLength; 535 } 536 else 537 { 538 if (keySpace.ptr == initialKeysSpace.ptr) 539 { 540 import core.stdc.string: memcpy; 541 keySpace = (cast(ubyte*)malloc(newLength))[0 .. newLength]; 542 if (keySpace.ptr is null) 543 assert(0); 544 memcpy(keySpace.ptr, initialKeysSpace.ptr, initialKeysSpace.length); 545 } 546 else 547 { 548 if (keySpace.length == 0) 549 assert(0); 550 keySpace = (cast(ubyte*)realloc(keySpace.ptr, newLength))[0 .. newLength]; 551 if (keySpace.ptr is null) 552 assert(0); 553 } 554 } 555 } 556 nextKeyPosition += cast(uint) ionPut(keySpace.ptr + nextKeyPosition, key); 557 558 Entry entry; 559 entry.probeCount = probeCount; 560 entry.hash = hash; 561 entry.value = elementCount++ + startId; 562 entry.keyPosition = nextKeyPosition - cast(uint)key.length; 563 entry.keyLength = cast(uint)key.length; 564 current += entry.probeCount; 565 566 auto ret = entry.value; 567 568 if (current.empty) 569 { 570 *current = entry; 571 return ret; 572 } 573 L1: 574 if (entry.probeCount <= current.probeCount) 575 { 576 L2: 577 do { 578 entry.probeCount++; 579 current++; 580 } 581 while (entry.probeCount <= current.probeCount); 582 } 583 if (_expect(current < entries + lengthMinusOne + maxProbe, true)) 584 { 585 swap(*current, entry); 586 if (!entry.empty) 587 goto L2; 588 return ret; 589 } 590 grow(); 591 current = entries + (entry.hash & lengthMinusOne); 592 entry.probeCount = 0; 593 for (;;) 594 { 595 if (current.probeCount < entry.probeCount) 596 break; 597 current++; 598 entry.probeCount++; 599 } 600 goto L1; 601 } 602 } 603 604 version(mir_ion_test) unittest 605 { 606 IonSymbolTable!true table; 607 table.initialize; 608 609 foreach (i; 0 .. 20) 610 { 611 auto id = table.insert("a key a bit larger then 14"); 612 assert(id == IonSystemSymbol.max + 1); 613 } 614 615 table.finalize; 616 } 617 618 package(mir) auto findKey()(const string[] symbolTable, string key) 619 { 620 import mir.algorithm.iteration: findIndex; 621 auto ret = symbolTable.findIndex!(a => a == key); 622 assert(ret != size_t.max, "Missing key: " ~ key); 623 return ret; 624 } 625 626 private: 627 628 pragma(inline, true) 629 uint dlang_hll_murmurhash(scope const(char)[] data) 630 @trusted pure nothrow @nogc 631 { 632 if (__ctfe) 633 return cast(uint) hashOf(data); 634 return murmur3_32(cast(const ubyte*)data.ptr, data.length); 635 } 636 637 638 pragma(inline, true) 639 @safe pure nothrow @nogc 640 static uint murmur_32_scramble(uint k) { 641 k *= 0xcc9e2d51; 642 k = (k << 15) | (k >> 17); 643 k *= 0x1b873593; 644 return k; 645 } 646 pragma(inline, true) 647 @trusted pure nothrow @nogc 648 uint murmur3_32(const(ubyte)* key, size_t len, uint seed = 0) 649 { 650 uint h = seed; 651 uint k; 652 /* Read in groups of 4. */ 653 for (size_t i = len >> 2; i; i--) { 654 import core.stdc.string; 655 // Here is a source of differing results across endiannesses. 656 // A swap here has no effects on hash properties though. 657 k = *cast(const uint*) key; 658 key += 4; 659 h ^= murmur_32_scramble(k); 660 h = (h << 13) | (h >> 19); 661 h = h * 5 + 0xe6546b64; 662 } 663 /* Read the rest. */ 664 k = 0; 665 for (size_t i = len & 3; i; i--) { 666 k <<= 8; 667 k |= key[i - 1]; 668 } 669 // A swap is *not* necessary here because the preceding loop already 670 // places the low bytes in the low places according to whatever endianness 671 // we use. Swaps only apply when the memory is copied in a chunk. 672 h ^= murmur_32_scramble(k); 673 /* Finalize. */ 674 h ^= len; 675 h ^= h >> 16; 676 h *= 0x85ebca6b; 677 h ^= h >> 13; 678 h *= 0xc2b2ae35; 679 h ^= h >> 16; 680 return h; 681 }