The OpenD Programming Language

1 /++
2 Mir String Table designed for fast deserialization routines.
3 
4 License: $(HTTP www.apache.org/licenses/LICENSE-2.0, Apache-2.0)
5 Authors: Ilia Ki 
6 Macros:
7 +/
8 module mir.string_table;
9 
10 /++
11 Fast string table used to get key's id.
12 The keys should be first sorted by length and then lexicographically.
13 Params:
14     U = an unsigned type that can hold an index of sorted keys. `U.max` must be less then length of the table.
15     C = character type
16 +/
17 struct MirStringTable(U, C = char)
18     if (__traits(isUnsigned, U) && (is(C == char) || is(C == wchar) || is(C == dchar)))
19 {
20     /++
21     Keys sorted by length and then lexicographically.
22     +/
23     const(immutable(C)[])[] sortedKeys;
24 
25     private U[] table;
26 
27     /++
28     The keys should be first sorted by length and then lexicographically.
29 
30     The constructor uses GC.
31     It can be used in `@nogc` code when if constructed in compile time.
32     +/
33     this()(const(immutable(C)[])[] sortedKeys)
34         @trusted pure nothrow
35     {
36         pragma(inline, false);
37         this.sortedKeys = sortedKeys;
38         assert(sortedKeys.length <= U.max);
39         const largest = sortedKeys.length ? sortedKeys[$ - 1].length : 0;
40         table = new U[largest + 2];
41         size_t ski;
42         foreach (length; 0 .. largest + 1)
43         {
44             while(ski < sortedKeys.length && sortedKeys[ski].length == length)
45                 ski++;
46             table[length + 1] = cast(U)ski;
47         }
48     }
49 
50     /++
51     Params:
52         key = string to find index for
53         index = (ref) index to fill with key's position.
54     Returns:
55         true if keys index has been found
56     +/
57     bool get()(scope const C[] key, ref uint index)
58          const @trusted pure nothrow @nogc
59     {
60         import mir.utility: _expect;
61         if (_expect(key.length + 1 < table.length, true))
62         {
63 
64             // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
65             // 0 1 2 3 4 5 6   8 9 10    12          16
66 
67             auto low = table[key.length] + 0u;
68             auto high = table[key.length + 1] + 0u;
69             auto items = sortedKeys.ptr;
70             if (low < high)
71             {
72                 version (none)
73                 {
74                     if (key.length == 0)
75                     {
76                         index = 0;
77                         return true;
78                     }
79                 }
80                 L: do {
81                     auto mid = (low + high) / 2;
82 
83                     version (all)
84                     {
85                         import core.stdc.string: memcmp;
86                         int r = void;
87 
88                         if (__ctfe)
89                             r = __cmp(key, items[mid]);
90                         else
91                         version (BigEndian)
92                             r = memcmp(key.ptr, items[mid].ptr, key.length);
93                         else
94                         static if (C.sizeof == 1)
95                             r = memcmp(key.ptr, items[mid].ptr, key.length);
96                         else
97                             r = __cmp(key, items[mid]);
98 
99                         if (r == 0)
100                         {
101                             index = mid;
102                             return true;
103                         }
104                         if (r > 0)
105                             low = mid + 1;
106                         else
107                             high = mid;
108                     }
109                     else
110                     {
111                         size_t i;
112                         auto value = items[mid];
113                         do {
114                             if (key[i] < value[i])
115                             {
116                                 high = mid;
117                                 continue L;
118                             }
119                             else
120                             if (key[i] > value[i])
121                             {
122                                 low = mid + 1;
123                                 continue L;
124                             }
125                         } while (++i < key.length);
126                         index = mid;
127                         return true;
128                     }
129                 }
130                 while(low < high);
131             }
132         }
133         return false;
134     }
135 
136     ///
137     uint opIndex()(scope const C[] key)
138          const @trusted pure nothrow @nogc
139     {
140         import mir.utility: _expect;
141         uint ret = 0;
142         if (get(key, ret)._expect(true))
143             return ret;
144         assert(0);
145     }
146 }
147 
148 /// ditto
149 struct MirStringTable(size_t length, size_t maxKeyLength, bool caseInsensetive = false, C = char)
150     if (is(C == char) || is(C == wchar) || is(C == dchar))
151 {
152     ///
153     const(immutable(C)[])[length] sortedKeys;
154 
155     private alias U = minimalIndexType!length;
156     private U[maxKeyLength + 2] table;
157 
158     /++
159     The keys should be first sorted by length and then lexicographically.
160 
161     The constructor uses GC.
162     It can be used in `@nogc` code when if constructed in compile time.
163     +/
164     this(immutable(C)[][length] sortedKeys)
165         @trusted pure nothrow
166     {
167         pragma(inline, false);
168         this.sortedKeys = sortedKeys;
169         size_t ski;
170         foreach (length; 0 .. maxKeyLength + 1)
171         {
172             while(ski < sortedKeys.length && sortedKeys[ski].length == length)
173                 ski++;
174             table[length + 1] = cast(U)ski;
175         }
176     }
177 
178     /++
179     Params:
180         key = string to find index for
181         index = (ref) index to fill with key's position.
182     Returns:
183         true if keys index has been found
184     +/
185     bool get()(scope const(C)[] key, ref uint index)
186          const @trusted pure nothrow @nogc
187     {
188         import mir.utility: _expect;
189         if (_expect(key.length <= maxKeyLength, true))
190         {
191             static if (caseInsensetive)
192             {
193                 C[maxKeyLength] buffer = void;
194                 foreach(i, C c; key)
195                     buffer[i] = c.fastToUpper;
196                 key = buffer[0 .. key.length];
197             }
198             auto low = table[key.length] + 0u;
199             auto high = table[key.length + 1] + 0u;
200             auto items = sortedKeys.ptr;
201             if (low < high)
202             {
203                 static if (!(maxKeyLength >= 16))
204                 {
205                     if (key.length == 0)
206                     {
207                         index = 0;
208                         return true;
209                     }
210                 }
211                 L: do {
212                     auto mid = (low + high) / 2;
213 
214                     static if (maxKeyLength >= 16)
215                     {
216                         import core.stdc.string: memcmp;
217                         int r = void;
218 
219                         if (__ctfe)
220                             r = __cmp(key, items[mid]);
221                         else
222                         version (BigEndian)
223                             r = memcmp(key.ptr, items[mid].ptr, key.length);
224                         else
225                         static if (C.sizeof == 1)
226                             r = memcmp(key.ptr, items[mid].ptr, key.length);
227                         else
228                             r = __cmp(key, items[mid]);
229 
230                         if (r == 0)
231                         {
232                             index = mid;
233                             return true;
234                         }
235                         if (r > 0)
236                             low = mid + 1;
237                         else
238                             high = mid;
239                     }
240                     else
241                     {
242                         size_t i;
243                         auto value = items[mid];
244                         do {
245                             if (key[i] < value[i])
246                             {
247                                 high = mid;
248                                 continue L;
249                             }
250                             else
251                             if (key[i] > value[i])
252                             {
253                                 low = mid + 1;
254                                 continue L;
255                             }
256                         } while (++i < key.length);
257                         index = mid;
258                         return true;
259                     }
260                 }
261                 while(low < high);
262             }
263         }
264         return false;
265     }
266 
267     ///
268     uint opIndex()(scope const C[] key)
269          const @trusted pure nothrow @nogc
270     {
271         import mir.utility: _expect;
272         uint ret = 0;
273         if (get(key, ret)._expect(true))
274             return ret;
275         assert(0);
276     }
277 }
278 
279 ///
280 @safe pure nothrow @nogc
281 version(mir_core_test) unittest
282 {
283     static immutable sortedKeys = ["", "a", "b", "aab", "abb", "aaaaa"];
284     static immutable table = MirStringTable!ubyte(sortedKeys); // CTFE
285     static assert (table[""] == 0);
286     static assert (table["a"] == 1);
287     static assert (table["b"] == 2);
288     static assert (table["abb"] == 4);
289     assert (table["aaaaa"] == 5);
290 }
291 
292 
293 ///
294 @safe pure nothrow
295 version(mir_core_test) unittest
296 {
297     import mir.utility: simpleSort;
298     auto keys = ["aaaaa", "abb", "", "b", "a", "aab"];
299     // sorts keys by length and then lexicographically.
300     keys.simpleSort!smallerStringFirst;
301     assert(keys == ["", "a", "b", "aab", "abb", "aaaaa"]);
302 }
303 
304 @safe pure nothrow
305 version(mir_core_test) unittest
306 {
307     import mir.utility: simpleSort;
308     auto keys = ["aaaaa"w, "abb"w, ""w, "b"w, "a"w, "aab"w];
309     // sorts keys by length and then lexicographically.
310     keys.simpleSort!smallerStringFirst;
311     assert(keys == [""w, "a"w, "b"w, "aab"w, "abb"w, "aaaaa"w]);
312 }
313 
314 package template minimalIndexType(size_t length)
315 {
316     static if (length <= ubyte.max)
317         alias minimalIndexType = ubyte;
318     else
319     static if (length <= ushort.max)
320         alias minimalIndexType = ushort;
321     else
322     static if (length <= uint.max)
323         alias minimalIndexType = uint;
324     else
325         alias minimalIndexType = ulong;
326 }
327 
328 package template minimalSignedIndexType(size_t length)
329 {
330     static if (length <= byte.max)
331         alias minimalSignedIndexType = byte;
332     else
333     static if (length <= short.max)
334         alias minimalSignedIndexType = short;
335     else
336     static if (length <= int.max)
337         alias minimalSignedIndexType = int;
338     else
339         alias minimalSignedIndexType = long;
340 }
341 
342 /++
343 Compares strings by length and then lexicographically.
344 +/
345 sizediff_t smallerStringFirstCmp(T)(T[] a, T[] b)
346 {
347     if (sizediff_t d = a.length - b.length)
348     {
349         return d;
350     }
351 
352     import std.traits: Unqual;
353     static if (is(Unqual!T == ubyte) || is(Unqual!T == char))
354     {
355         import core.stdc.string: memcmp;
356         if (__ctfe)
357             return __cmp(a, b);
358         else
359             return (() @trusted => memcmp(a.ptr, b.ptr, a.length))();
360     }
361     else
362     {
363         return __cmp(a, b);
364     }
365 }
366 
367 ///
368 @safe pure nothrow @nogc
369 version(mir_core_test) unittest
370 {
371     assert(smallerStringFirstCmp("aa", "bb") < 0);
372     assert(smallerStringFirstCmp("aa", "aa") == 0);
373     assert(smallerStringFirstCmp("aaa", "aa") > 0);
374 
375     static assert(smallerStringFirstCmp("aa", "bb") < 0);
376     static assert(smallerStringFirstCmp("aa", "aa") == 0);
377     static assert(smallerStringFirstCmp("aaa", "aa") > 0);
378 }
379 
380 /++
381 Compares strings by length and then lexicographically.
382 +/
383 template smallerStringFirst(alias direction = "<")
384     if (direction == "<" || direction == ">")
385 {
386     ///
387     bool smallerStringFirst(T)(T[] a, T[] b)
388     {
389         auto r = smallerStringFirstCmp(a, b);
390         static if (direction == "<")
391             return r < 0;
392         else
393             return r > 0;
394     }
395 }
396 
397 ///
398 @safe pure nothrow @nogc
399 version(mir_core_test) unittest
400 {
401     assert(smallerStringFirst("aa", "bb") == true);
402     assert(smallerStringFirst("aa", "aa") == false);
403     assert(smallerStringFirst("aaa", "aa") == false);
404 }
405 
406 package auto fastToUpper(C)(const C a)
407 {   // std.ascii may not be inlined
408     return 'a' <= a && a <= 'z' ? cast(C)(a ^ 0x20) : a;
409 }
410 
411 package @safe pure nothrow @nogc
412 C[] fastToUpperInPlace(C)(scope return C[] a)
413 {
414     foreach(ref C e; a)
415         e = e.fastToUpper;
416     return a;
417 }
418 
419 package immutable(C)[][] prepareStringTableKeys(bool caseInsensetive = false, C)(immutable(C)[][] keys)
420 {
421     static if (caseInsensetive)
422     {
423         foreach (ref key; keys)
424         {
425             auto upper = cast(immutable) key.dup.fastToUpperInPlace;
426             if (upper != key)
427                 key = upper;
428         }
429     }
430     import mir.utility: simpleSort;
431     return keys.simpleSort!smallerStringFirst;
432 }
433 
434 package template createTable(C)
435     if (is(C == char) || is(C == wchar) || is(C == dchar))
436 {
437     auto createTable(immutable(C)[][] keys, bool caseInsensetive = false)()
438     {
439         static immutable C[][] sortedKeys = prepareStringTableKeys!caseInsensetive(keys);
440         alias Table = MirStringTable!(sortedKeys.length, sortedKeys.length ? sortedKeys[$ - 1].length : 0, caseInsensetive, C);
441         static if (sortedKeys.length)
442             return Table(sortedKeys[0 .. sortedKeys.length]);
443         else
444             return Table.init;
445     }
446 }