1 /++ 2 Mir String Table designed for fast deserialization routines. 3 4 License: $(HTTP www.apache.org/licenses/LICENSE-2.0, Apache-2.0) 5 Authors: Ilia Ki 6 Macros: 7 +/ 8 module mir.string_table; 9 10 /++ 11 Fast string table used to get key's id. 12 The keys should be first sorted by length and then lexicographically. 13 Params: 14 U = an unsigned type that can hold an index of sorted keys. `U.max` must be less then length of the table. 15 C = character type 16 +/ 17 struct MirStringTable(U, C = char) 18 if (__traits(isUnsigned, U) && (is(C == char) || is(C == wchar) || is(C == dchar))) 19 { 20 /++ 21 Keys sorted by length and then lexicographically. 22 +/ 23 const(immutable(C)[])[] sortedKeys; 24 25 private U[] table; 26 27 /++ 28 The keys should be first sorted by length and then lexicographically. 29 30 The constructor uses GC. 31 It can be used in `@nogc` code when if constructed in compile time. 32 +/ 33 this()(const(immutable(C)[])[] sortedKeys) 34 @trusted pure nothrow 35 { 36 pragma(inline, false); 37 this.sortedKeys = sortedKeys; 38 assert(sortedKeys.length <= U.max); 39 const largest = sortedKeys.length ? sortedKeys[$ - 1].length : 0; 40 table = new U[largest + 2]; 41 size_t ski; 42 foreach (length; 0 .. largest + 1) 43 { 44 while(ski < sortedKeys.length && sortedKeys[ski].length == length) 45 ski++; 46 table[length + 1] = cast(U)ski; 47 } 48 } 49 50 /++ 51 Params: 52 key = string to find index for 53 index = (ref) index to fill with key's position. 54 Returns: 55 true if keys index has been found 56 +/ 57 bool get()(scope const C[] key, ref uint index) 58 const @trusted pure nothrow @nogc 59 { 60 import mir.utility: _expect; 61 if (_expect(key.length + 1 < table.length, true)) 62 { 63 64 // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 65 // 0 1 2 3 4 5 6 8 9 10 12 16 66 67 auto low = table[key.length] + 0u; 68 auto high = table[key.length + 1] + 0u; 69 auto items = sortedKeys.ptr; 70 if (low < high) 71 { 72 version (none) 73 { 74 if (key.length == 0) 75 { 76 index = 0; 77 return true; 78 } 79 } 80 L: do { 81 auto mid = (low + high) / 2; 82 83 version (all) 84 { 85 import core.stdc.string: memcmp; 86 int r = void; 87 88 if (__ctfe) 89 r = __cmp(key, items[mid]); 90 else 91 version (BigEndian) 92 r = memcmp(key.ptr, items[mid].ptr, key.length); 93 else 94 static if (C.sizeof == 1) 95 r = memcmp(key.ptr, items[mid].ptr, key.length); 96 else 97 r = __cmp(key, items[mid]); 98 99 if (r == 0) 100 { 101 index = mid; 102 return true; 103 } 104 if (r > 0) 105 low = mid + 1; 106 else 107 high = mid; 108 } 109 else 110 { 111 size_t i; 112 auto value = items[mid]; 113 do { 114 if (key[i] < value[i]) 115 { 116 high = mid; 117 continue L; 118 } 119 else 120 if (key[i] > value[i]) 121 { 122 low = mid + 1; 123 continue L; 124 } 125 } while (++i < key.length); 126 index = mid; 127 return true; 128 } 129 } 130 while(low < high); 131 } 132 } 133 return false; 134 } 135 136 /// 137 uint opIndex()(scope const C[] key) 138 const @trusted pure nothrow @nogc 139 { 140 import mir.utility: _expect; 141 uint ret = 0; 142 if (get(key, ret)._expect(true)) 143 return ret; 144 assert(0); 145 } 146 } 147 148 /// ditto 149 struct MirStringTable(size_t length, size_t maxKeyLength, bool caseInsensetive = false, C = char) 150 if (is(C == char) || is(C == wchar) || is(C == dchar)) 151 { 152 /// 153 const(immutable(C)[])[length] sortedKeys; 154 155 private alias U = minimalIndexType!length; 156 private U[maxKeyLength + 2] table; 157 158 /++ 159 The keys should be first sorted by length and then lexicographically. 160 161 The constructor uses GC. 162 It can be used in `@nogc` code when if constructed in compile time. 163 +/ 164 this(immutable(C)[][length] sortedKeys) 165 @trusted pure nothrow 166 { 167 pragma(inline, false); 168 this.sortedKeys = sortedKeys; 169 size_t ski; 170 foreach (length; 0 .. maxKeyLength + 1) 171 { 172 while(ski < sortedKeys.length && sortedKeys[ski].length == length) 173 ski++; 174 table[length + 1] = cast(U)ski; 175 } 176 } 177 178 /++ 179 Params: 180 key = string to find index for 181 index = (ref) index to fill with key's position. 182 Returns: 183 true if keys index has been found 184 +/ 185 bool get()(scope const(C)[] key, ref uint index) 186 const @trusted pure nothrow @nogc 187 { 188 import mir.utility: _expect; 189 if (_expect(key.length <= maxKeyLength, true)) 190 { 191 static if (caseInsensetive) 192 { 193 C[maxKeyLength] buffer = void; 194 foreach(i, C c; key) 195 buffer[i] = c.fastToUpper; 196 key = buffer[0 .. key.length]; 197 } 198 auto low = table[key.length] + 0u; 199 auto high = table[key.length + 1] + 0u; 200 auto items = sortedKeys.ptr; 201 if (low < high) 202 { 203 static if (!(maxKeyLength >= 16)) 204 { 205 if (key.length == 0) 206 { 207 index = 0; 208 return true; 209 } 210 } 211 L: do { 212 auto mid = (low + high) / 2; 213 214 static if (maxKeyLength >= 16) 215 { 216 import core.stdc.string: memcmp; 217 int r = void; 218 219 if (__ctfe) 220 r = __cmp(key, items[mid]); 221 else 222 version (BigEndian) 223 r = memcmp(key.ptr, items[mid].ptr, key.length); 224 else 225 static if (C.sizeof == 1) 226 r = memcmp(key.ptr, items[mid].ptr, key.length); 227 else 228 r = __cmp(key, items[mid]); 229 230 if (r == 0) 231 { 232 index = mid; 233 return true; 234 } 235 if (r > 0) 236 low = mid + 1; 237 else 238 high = mid; 239 } 240 else 241 { 242 size_t i; 243 auto value = items[mid]; 244 do { 245 if (key[i] < value[i]) 246 { 247 high = mid; 248 continue L; 249 } 250 else 251 if (key[i] > value[i]) 252 { 253 low = mid + 1; 254 continue L; 255 } 256 } while (++i < key.length); 257 index = mid; 258 return true; 259 } 260 } 261 while(low < high); 262 } 263 } 264 return false; 265 } 266 267 /// 268 uint opIndex()(scope const C[] key) 269 const @trusted pure nothrow @nogc 270 { 271 import mir.utility: _expect; 272 uint ret = 0; 273 if (get(key, ret)._expect(true)) 274 return ret; 275 assert(0); 276 } 277 } 278 279 /// 280 @safe pure nothrow @nogc 281 version(mir_core_test) unittest 282 { 283 static immutable sortedKeys = ["", "a", "b", "aab", "abb", "aaaaa"]; 284 static immutable table = MirStringTable!ubyte(sortedKeys); // CTFE 285 static assert (table[""] == 0); 286 static assert (table["a"] == 1); 287 static assert (table["b"] == 2); 288 static assert (table["abb"] == 4); 289 assert (table["aaaaa"] == 5); 290 } 291 292 293 /// 294 @safe pure nothrow 295 version(mir_core_test) unittest 296 { 297 import mir.utility: simpleSort; 298 auto keys = ["aaaaa", "abb", "", "b", "a", "aab"]; 299 // sorts keys by length and then lexicographically. 300 keys.simpleSort!smallerStringFirst; 301 assert(keys == ["", "a", "b", "aab", "abb", "aaaaa"]); 302 } 303 304 @safe pure nothrow 305 version(mir_core_test) unittest 306 { 307 import mir.utility: simpleSort; 308 auto keys = ["aaaaa"w, "abb"w, ""w, "b"w, "a"w, "aab"w]; 309 // sorts keys by length and then lexicographically. 310 keys.simpleSort!smallerStringFirst; 311 assert(keys == [""w, "a"w, "b"w, "aab"w, "abb"w, "aaaaa"w]); 312 } 313 314 package template minimalIndexType(size_t length) 315 { 316 static if (length <= ubyte.max) 317 alias minimalIndexType = ubyte; 318 else 319 static if (length <= ushort.max) 320 alias minimalIndexType = ushort; 321 else 322 static if (length <= uint.max) 323 alias minimalIndexType = uint; 324 else 325 alias minimalIndexType = ulong; 326 } 327 328 package template minimalSignedIndexType(size_t length) 329 { 330 static if (length <= byte.max) 331 alias minimalSignedIndexType = byte; 332 else 333 static if (length <= short.max) 334 alias minimalSignedIndexType = short; 335 else 336 static if (length <= int.max) 337 alias minimalSignedIndexType = int; 338 else 339 alias minimalSignedIndexType = long; 340 } 341 342 /++ 343 Compares strings by length and then lexicographically. 344 +/ 345 sizediff_t smallerStringFirstCmp(T)(T[] a, T[] b) 346 { 347 if (sizediff_t d = a.length - b.length) 348 { 349 return d; 350 } 351 352 import std.traits: Unqual; 353 static if (is(Unqual!T == ubyte) || is(Unqual!T == char)) 354 { 355 import core.stdc.string: memcmp; 356 if (__ctfe) 357 return __cmp(a, b); 358 else 359 return (() @trusted => memcmp(a.ptr, b.ptr, a.length))(); 360 } 361 else 362 { 363 return __cmp(a, b); 364 } 365 } 366 367 /// 368 @safe pure nothrow @nogc 369 version(mir_core_test) unittest 370 { 371 assert(smallerStringFirstCmp("aa", "bb") < 0); 372 assert(smallerStringFirstCmp("aa", "aa") == 0); 373 assert(smallerStringFirstCmp("aaa", "aa") > 0); 374 375 static assert(smallerStringFirstCmp("aa", "bb") < 0); 376 static assert(smallerStringFirstCmp("aa", "aa") == 0); 377 static assert(smallerStringFirstCmp("aaa", "aa") > 0); 378 } 379 380 /++ 381 Compares strings by length and then lexicographically. 382 +/ 383 template smallerStringFirst(alias direction = "<") 384 if (direction == "<" || direction == ">") 385 { 386 /// 387 bool smallerStringFirst(T)(T[] a, T[] b) 388 { 389 auto r = smallerStringFirstCmp(a, b); 390 static if (direction == "<") 391 return r < 0; 392 else 393 return r > 0; 394 } 395 } 396 397 /// 398 @safe pure nothrow @nogc 399 version(mir_core_test) unittest 400 { 401 assert(smallerStringFirst("aa", "bb") == true); 402 assert(smallerStringFirst("aa", "aa") == false); 403 assert(smallerStringFirst("aaa", "aa") == false); 404 } 405 406 package auto fastToUpper(C)(const C a) 407 { // std.ascii may not be inlined 408 return 'a' <= a && a <= 'z' ? cast(C)(a ^ 0x20) : a; 409 } 410 411 package @safe pure nothrow @nogc 412 C[] fastToUpperInPlace(C)(scope return C[] a) 413 { 414 foreach(ref C e; a) 415 e = e.fastToUpper; 416 return a; 417 } 418 419 package immutable(C)[][] prepareStringTableKeys(bool caseInsensetive = false, C)(immutable(C)[][] keys) 420 { 421 static if (caseInsensetive) 422 { 423 foreach (ref key; keys) 424 { 425 auto upper = cast(immutable) key.dup.fastToUpperInPlace; 426 if (upper != key) 427 key = upper; 428 } 429 } 430 import mir.utility: simpleSort; 431 return keys.simpleSort!smallerStringFirst; 432 } 433 434 package template createTable(C) 435 if (is(C == char) || is(C == wchar) || is(C == dchar)) 436 { 437 auto createTable(immutable(C)[][] keys, bool caseInsensetive = false)() 438 { 439 static immutable C[][] sortedKeys = prepareStringTableKeys!caseInsensetive(keys); 440 alias Table = MirStringTable!(sortedKeys.length, sortedKeys.length ? sortedKeys[$ - 1].length : 0, caseInsensetive, C); 441 static if (sortedKeys.length) 442 return Table(sortedKeys[0 .. sortedKeys.length]); 443 else 444 return Table.init; 445 } 446 }