The OpenD Programming Language

1 /**
2 * SSE4.2 intrinsics.
3 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE4_2
4 *
5 * Copyright: Guillaume Piolat 2022.
6 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
7 */
8 module inteli.nmmintrin;
9 
10 public import inteli.types;
11 import inteli.internals;
12 public import inteli.smmintrin;
13 import core.bitop: bsf, bsr;
14 
15 
16 // Note: this header will work whether you have SSE4.2 enabled or not.
17 // With LDC, use "dflags-ldc": ["-mattr=+sse4.2"] or equivalent to actively 
18 // generate SSE4.2 instructions (they are often enabled with -O1 or greater).
19 // - Additionally, you need ["-mattr=+crc"] on ARM if you want hardware CRC instructions.
20 // - Since LDC 1.30, you need ["-mattr=+crc32"] on x86_64 if you want hardware CRC instructions,
21 //   it is not considered implied by sse4.2 anymore.
22 // With GDC, use "dflags-gdc": ["-msse4.2"] or equivalent to generate SSE4.2 instructions.
23 
24 nothrow @nogc:
25 
26 // <Data size and signedness>
27 
28 /// String contains unsigned 8-bit characters (default).
29 enum int _SIDD_UBYTE_OPS = 0;
30 
31 /// String contains unsigned 16-bit characters.
32 enum int _SIDD_UWORD_OPS = 1;
33 
34 /// String contains signed 8-bit characters.
35 enum int _SIDD_SBYTE_OPS = 2;
36 
37 /// String contains signed 16-bit characters.
38 enum int _SIDD_SWORD_OPS = 3;
39 
40 // </Data size and signedness>
41 
42 
43 // <Comparison options>
44 
45 /// For each character in `b`, find if it is in `a` (default)
46 /// The resulting mask has bit set at b positions that were found in a.
47 enum int _SIDD_CMP_EQUAL_ANY = 0;
48 
49 /// For each character in `b`, determine if
50 /// `a[0] <= c <= a[1] or a[1] <= c <= a[2]...`
51 /// Contrarily to false documentation on the Internet, pairs must be in `a`!
52 enum int _SIDD_CMP_RANGES = 4;
53 
54 /// The strings defined by `a` and `b` are equal
55 enum int _SIDD_CMP_EQUAL_EACH = 8;
56 
57 /// Search for the defined substring in the target
58 enum int _SIDD_CMP_EQUAL_ORDERED = 12;
59 
60 // </Comparison options>
61 
62 // <Result polarity>
63 
enum : int
{
    /// Leave the results as they are (default, no effect).
    _SIDD_POSITIVE_POLARITY = 0,

    /// Negate the results.
    _SIDD_NEGATIVE_POLARITY = 16,

    /// No effect: the results are not negated.
    /// You basically never want this.
    _SIDD_MASKED_POSITIVE_POLARITY = 32,

    /// Negate the results only before the end of the string.
    _SIDD_MASKED_NEGATIVE_POLARITY = 48,
}
76 
77 // </Result polarity>
78 
79 // <Bit returned>
80 
enum : int
{
    /// **Index only**: report the least significant set bit (default).
    _SIDD_LEAST_SIGNIFICANT = 0,

    /// **Index only**: report the most significant set bit.
    _SIDD_MOST_SIGNIFICANT = 64,
}
86 
87 // </Bit returned>
88 
enum : int
{
    /// **Mask only**: produce a bit mask, one bit per character (default).
    _SIDD_BIT_MASK = 0,

    /// **Mask only**: produce a byte/word mask, one whole unit per character.
    _SIDD_UNIT_MASK = 64,
}
94 
95 /// So SSE4.2 has a lot of hard-to-understand instructions. Here is another explanation.
96 ///
97 /// Alternative explanation of imm8
98 ///
99 /// imm8 is an 8-bit immediate operand specifying whether the characters are bytes or
100 ///    words and the type of comparison to do.
101 ///
102 ///    Bits [1:0]: Determine source data format.
103 ///      00: 16 unsigned bytes
104 ///      01: 8 unsigned words
105 ///      10: 16 signed bytes
106 ///      11: 8 signed words
107 ///
108 ///    Bits [3:2]: Determine comparison type and aggregation method.
109 ///      00: Subset: Each character in B is compared for equality with all
110 ///          the characters in A.
111 ///      01: Ranges: Each character in B is compared to A pairs. The comparison
112 ///          basis is greater than or equal for even-indexed elements in A,
113 ///          and less than or equal for odd-indexed elements in A.
114 ///      10: Match: Compare each pair of corresponding characters in A and
115 ///          B for equality.
116 ///      11: Substring: Search B for substring matches of A.
117 ///
118 ///    Bits [5:4]: Determine whether to do a one's complement on the bit
119 ///                mask of the comparison results.
120 ///      00: No effect.
121 ///      01: Negate the bit mask.
122 ///      10: No effect.
123 ///      11: Negate the bit mask only for bits with an index less than or equal
124 ///          to the size of A or B.
125 ///
126 
127 
128 
/// Compare packed strings in `a` and `b` with explicit lengths `la` and `lb`,
/// using the control in `imm8`. Returns 1 when the resulting mask is zero and
/// `b` "does not contain a null character", and 0 otherwise.
/// Warning: for this explicit-length variant the "null character" condition is
///          really a length condition: `\0` is accepted in the input, and the
///          software path simply requires `lb >= Count` (8 words or 16 bytes).
///          It's not clear for what purpose.
int _mm_cmpestra(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return cast(int) __builtin_ia32_pcmpestria128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestria128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        // Number of characters held by one vector: 8 words or 16 bytes.
        enum int unitCount = (imm8 & 1) ? 8 : 16;

        __m128i unitMask = cmpstrMaskExplicit!imm8(a, la, b, lb);

        // The mask is empty when every one of its 16 bytes compares equal to zero.
        __m128i zeroBytes = _mm_cmpeq_epi8(unitMask, _mm_setzero_si128());
        bool maskIsEmpty = (_mm_movemask_epi8(zeroBytes) == 0xffff);

        // NOTE(review): `lb` is deliberately read only after the call above; the
        // unittest expects negative lengths to be clamped, so presumably
        // cmpstrMaskExplicit normalizes the lengths in place -- confirm.
        return maskIsEmpty && (lb >= unitCount);
    }
}
unittest
{
    // Element-wise equality with negated results: mismatches set mask bits, so
    // an all-zero mask means the two inputs are identical (a-la strcmp).
    enum int eqBytes = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY;
    enum int eqWords = _SIDD_SWORD_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY;

    char[16] reference = "Maximum\x00length!!";
    char[16] altered   = "Mbximum\x00length!!";
    __m128i mmRef = _mm_loadu_si128(cast(__m128i*)reference.ptr);
    __m128i mmAlt = _mm_loadu_si128(cast(__m128i*)altered.ptr);

    // Full 16 bytes of data: identical inputs give 1, differing inputs give 0.
    assert(1 == _mm_cmpestra!eqBytes(mmRef, 16, mmRef, 16));
    assert(0 == _mm_cmpestra!eqBytes(mmRef, 16, mmAlt, 16));

    // Negative lengths: these will be clamped to 16.
    assert(1 == _mm_cmpestra!eqBytes(mmRef, -160, mmRef, -17));

    // It seems you can't compare shorter strings for equality using _mm_cmpestra (!)

    // Same check with the 16-bit format.
    assert(1 == _mm_cmpestra!eqWords(mmRef, 8, mmRef, 8));
}
181 
/// Compare packed strings in `a` and `b` with explicit lengths `la` and `lb`,
/// using the control in `imm8`. Returns 1 if the resulting mask is non-zero,
/// and 0 otherwise.
int _mm_cmpestrc(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return cast(int) __builtin_ia32_pcmpestric128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return cast(int) __builtin_ia32_pcmpestric128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        // Software path: any byte of the mask with its top bit set makes the result 1.
        __m128i unitMask = cmpstrMaskExplicit!imm8(a, la, b, lb);
        return _mm_movemask_epi8(unitMask) != 0;
    }
}
unittest
{
    // Compare two shorter strings.
    // Element-wise equality with negated results: a match gives 0, like strcmp.
    enum int strcmpLike = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY;

    char[16] A = "Hello world";
    char[16] B = "Hello moon";
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);

    // The first 6 characters ("Hello ") are identical: no mismatch reported.
    assert(0 == _mm_cmpestrc!strcmpLike(mmA, 6, mmB, 6));

    // Including index 6 ('w' versus 'm') brings in a mismatch.
    assert(1 == _mm_cmpestrc!strcmpLike(mmA, 7, mmB, 7));
}
221 
/// Compare packed strings in `a` and `b` with explicit lengths `la` and `lb`,
/// using the control in `imm8`, and return the generated index.
/// `_SIDD_MOST_SIGNIFICANT` selects the highest set bit of the mask, otherwise
/// the lowest set bit is reported.
/// Note: if the mask is all zeroes, the returned index is always `Count`
/// (8 or 16 depending on size).
int _mm_cmpestri(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestri128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestri128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        enum bool wordUnits = (imm8 & 1) != 0;
        enum int unitCount = wordUnits ? 8 : 16;

        __m128i unitMask = cmpstrMaskExplicit!imm8(a, la, b, lb);

        // Narrow 16-bit units to bytes so that movemask yields one bit per character.
        static if (wordUnits)
            unitMask = _mm_packs_epi16(unitMask, _mm_setzero_si128());

        int bits = _mm_movemask_epi8(unitMask);

        // Empty mask: by convention the index is the unit count.
        if (bits == 0)
            return unitCount;

        static if (imm8 & _SIDD_MOST_SIGNIFICANT)
            return bsr(bits);
        else
            return bsf(bits);
    }
}
unittest
{
    // Element-wise equality with negated results: set bits mark differences.
    enum int differences = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY;
    enum int firstDifference = differences | _SIDD_LEAST_SIGNIFICANT;
    enum int lastDifference  = differences | _SIDD_MOST_SIGNIFICANT;

    // The first difference is at index 6.
    //                  v
    char[16] A = "Hello sun";
    char[16] B = "Hello moon";
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);

    assert(_mm_cmpestri!firstDifference(mmA, 9, mmB, 10) == 6);

    // Limited to their first six chars, the strings must compare equal
    // regardless of what comes after that length: empty mask, index == Count.
    assert(_mm_cmpestri!firstDifference(mmA, 6, mmB, 6) == 16);
    assert(_mm_cmpestri!lastDifference(mmA, 6, mmB, 6) == 16);
}
unittest
{
    // Range comparison, negated only before the end of the string: set units
    // mark characters of the text that fall outside every range.
    enum int outsideRanges = _SIDD_UBYTE_OPS | _SIDD_CMP_RANGES | _SIDD_MASKED_NEGATIVE_POLARITY;

    // Goal: identify the last character that isn't an identifier character.
    //                      v (at index 7)
    char[16] text = "my_i(en)ifie";
    char[16] identRanges = "__azAz09";
    __m128i mmText = _mm_loadu_si128(cast(__m128i*)text.ptr);
    __m128i mmRanges = _mm_loadu_si128(cast(__m128i*)identRanges.ptr);

    // Unit mask: '(' at index 4 and ')' at index 7 are flagged.
    byte16 flagged = cast(byte16)_mm_cmpestrm!(outsideRanges | _SIDD_UNIT_MASK)(mmRanges, 8, mmText, 12);
    byte[16] expected = [0, 0, 0, 0, -1, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(flagged.array == expected);

    // ')' is the last char not to be in [__azAz09]
    int last = _mm_cmpestri!(outsideRanges | _SIDD_MOST_SIGNIFICANT)(mmRanges, 8, mmText, 12);
    assert(last == 7);
}
unittest
{
    // testing _SIDD_CMP_RANGES with 16-bit units, unsigned then signed
    // (signedness only makes sense for _SIDD_CMP_RANGES)
    short[8] ranges  = [0,  -1,  1000, 2000,    0,    0,    0, 0];
    short[8] numbers = [-32768, -1000, -1, -0, 0, 1, 1000, 32767];
    __m128i mmRanges = _mm_loadu_si128(cast(__m128i*)ranges.ptr);
    __m128i mmNumbers = _mm_loadu_si128(cast(__m128i*)numbers.ptr);

    // Unsigned words: the pair (0, -1) reads as [0, 65535], which contains
    // every 16-bit value, so all eight numbers match.
    short8 mask = cast(short8)_mm_cmpestrm!(_SIDD_UWORD_OPS
                                          | _SIDD_CMP_RANGES
                                          | _SIDD_UNIT_MASK)(mmRanges, 4, mmNumbers, 8);
    short[8] correctM = [ -1, -1, -1, -1, -1, -1, -1, -1];
    assert(mask.array == correctM); // this check was missing: correctM was never compared

    // Signed words: the pair (0, -1) is an empty range, so only [1000, 2000]
    // can match, and 1000 sits at index 6.
    mask = cast(short8)_mm_cmpestrm!(_SIDD_SWORD_OPS
                                   | _SIDD_CMP_RANGES
                                   | _SIDD_UNIT_MASK)(mmRanges, 4, mmNumbers, 8);
    short[8] correctZ = [ 0, 0, 0, 0, 0, 0, -1, 0];
    assert(mask.array == correctZ);
}
unittest
{
    // Substring search: set positions mark where the needle starts in the haystack.
    enum int substring = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ORDERED;

    char[16] needle   = "def";
    char[16] haystack = "abcdefghdefff";
    char[16] other    = "no substring";
    __m128i mmNeedle   = _mm_loadu_si128(cast(__m128i*)needle.ptr);
    __m128i mmHaystack = _mm_loadu_si128(cast(__m128i*)haystack.ptr);
    __m128i mmOther    = _mm_loadu_si128(cast(__m128i*)other.ptr);

    // "def" starts at indices 3 and 8.
    byte16 starts = cast(byte16)_mm_cmpestrm!(substring | _SIDD_UNIT_MASK)(mmNeedle, 3, mmHaystack, 13);
    byte[16] expectedStarts = [0, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0];
    assert(starts.array == expectedStarts);

    int firstMatch = _mm_cmpestri!substring(mmNeedle, 3, mmHaystack, 13);
    assert(firstMatch == 3);

    int lastMatch = _mm_cmpestri!(substring | _SIDD_MOST_SIGNIFICANT)(mmNeedle, 3, mmHaystack, 13);
    assert(lastMatch == 8);

    // Negative lengths are used here as well.
    firstMatch = _mm_cmpestri!substring(mmNeedle, -3, mmOther, -12);
    assert(firstMatch == 16); // no substring found
}
363 
/// Compare packed strings in `a` and `b` with explicit lengths `la` and `lb`,
/// using the control in `imm8`, and return the generated mask.
/// With `_SIDD_UNIT_MASK` the result holds one whole byte/word per character;
/// otherwise (`_SIDD_BIT_MASK`) it holds one bit per character in its low bits.
__m128i _mm_cmpestrm(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return cast(__m128i) __builtin_ia32_pcmpestrm128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return cast(__m128i) __builtin_ia32_pcmpestrm128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        __m128i result = cmpstrMaskExplicit!imm8(a, la, b, lb);

        static if ((imm8 & _SIDD_UNIT_MASK) == 0)
        {
            // _SIDD_BIT_MASK: collapse the unit mask into one bit per character.
            // 16-bit units are first narrowed to bytes so that movemask sees
            // exactly one byte per character.
            static if (imm8 & 1)
                result = _mm_packs_epi16(result, _mm_setzero_si128());

            result = _mm_cvtsi32_si128(_mm_movemask_epi8(result));
        }

        return result;
    }
}
unittest
{
    // Set membership: which letters from B were found in A.
    enum int anyOf = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY;

    char[16] A = "Hello world!";
    char[16] B = "aeiou!";
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);

    // Bit mask (lengths given as negative values):
    // 'e', 'o', and '!' were found, i.e. bits 1, 3 and 5 => 42.
    byte16 bitResult = cast(byte16)_mm_cmpestrm!(anyOf | _SIDD_BIT_MASK)(mmA, -12, mmB, -6);
    byte[16] expectedBits = [42, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(bitResult.array == expectedBits);

    // Unit mask: the same three positions, one full byte each.
    byte16 unitResult = cast(byte16)_mm_cmpestrm!(anyOf | _SIDD_UNIT_MASK)(mmA, 12, mmB, 6);
    byte[16] expectedUnits = [0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(unitResult.array == expectedUnits);
}
415 
/// Compare packed strings in `a` and `b` with explicit lengths `la` and `lb`,
/// using the control in `imm8`, and return bit 0 of the resulting bit mask.
int _mm_cmpestro(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestrio128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestrio128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        // Software path: the lowest bit of the first 32-bit lane of the mask
        // belongs to character 0.
        int4 lanes = cast(int4) cmpstrMaskExplicit!imm8(a, la, b, lb);
        int firstLane = lanes.array[0];
        return firstLane & 1;
    }
}
unittest
{
    // Set membership: which letters from B were found in A.
    enum int anyOfBits = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK;

    char[16] A = "Hallo world!";
    char[16] B = "aeiou!";
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);

    // Bit 0 is set because 'a' was found in "Hallo world!"
    assert(_mm_cmpestro!anyOfBits(mmA, 12, mmB, -6) == 1);
}
448 
/// Returns 1 if "any character in a was null", and 0 otherwise.
/// Warning: in practice this means it returns 1 when the magnitude of the given
///          length `la` is below `Count` (8 words or 16 bytes).
int _mm_cmpestrs(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestris128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestris128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        // Yes, this intrinsic exists for symmetry and is probably useless.
        enum int unitCount = (imm8 & 1) ? 8 : 16;

        // Lengths saturate (the Intrinsics Guide doesn't tell this):
        // the sign is dropped and the value is capped at 16.
        int magnitude = (la < 0) ? -la : la;
        int saturated = (magnitude > 16) ? 16 : magnitude;
        return saturated < unitCount;
    }
}
unittest
{
    __m128i zero;
    zero = 0;

    // Byte units: Count is 16, so a length of 15 is "short" and 16 is not.
    assert(_mm_cmpestrs!_SIDD_UBYTE_OPS(zero, 15, zero, 8) == 1);
    assert(_mm_cmpestrs!_SIDD_UBYTE_OPS(zero, 16, zero, 8) == 0);

    // The sign of the length does not change the outcome.
    assert(_mm_cmpestrs!_SIDD_UBYTE_OPS(zero, -15, zero, 8) == 1);
    assert(_mm_cmpestrs!_SIDD_UBYTE_OPS(zero, -16, zero, 8) == 0);
}
480 
/// Returns 1 if "any character in b was null", and 0 otherwise.
/// Warning: in practice this means it returns 1 when the magnitude of the given
///          length `lb` is below `Count` (8 words or 16 bytes).
int _mm_cmpestrz(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestriz128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestriz128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        // Yes, this intrinsic exists for symmetry and is probably useless.
        enum int unitCount = (imm8 & 1) ? 8 : 16;

        // Lengths saturate (the Intrinsics Guide doesn't tell this):
        // the sign is dropped and the value is capped at 16.
        int magnitude = (lb < 0) ? -lb : lb;
        int saturated = (magnitude > 16) ? 16 : magnitude;
        return saturated < unitCount;
    }
}
unittest
{
    // This test follows _mm_cmpestrz but previously called _mm_cmpestrs, so
    // _mm_cmpestrz was never exercised. It now targets _mm_cmpestrz.
    __m128i b;
    b = 0;

    // Byte units: Count is 16, so |lb| == 15 is "short" and |lb| == 16 is not.
    assert(_mm_cmpestrz!_SIDD_UBYTE_OPS(b, 15, b, 15) == 1);
    assert(_mm_cmpestrz!_SIDD_UBYTE_OPS(b, 16, b, 16) == 0);
    assert(_mm_cmpestrz!_SIDD_UBYTE_OPS(b, -15, b, -15) == 1);
    assert(_mm_cmpestrz!_SIDD_UBYTE_OPS(b, -16, b, -16) == 0);

    // Only `lb` matters; `la` must not influence the result.
    assert(_mm_cmpestrz!_SIDD_UBYTE_OPS(b, 16, b, 15) == 1);
    assert(_mm_cmpestrz!_SIDD_UBYTE_OPS(b, 15, b, 16) == 0);
}
512 
/// Compare packed signed 64-bit integers in `a` and `b` for greater-than.
/// Each 64-bit lane of the result is all ones where `a > b`, and zero elsewhere.
__m128i _mm_cmpgt_epi64 (__m128i a, __m128i b) pure @trusted
{
    long2 lhs = cast(long2)a;
    long2 rhs = cast(long2)b;

    // PERF: returning `lhs > rhs` directly under SIMD_COMPARISON_MASKS_16B for
    // every compiler is not enabled: with DMD it requires SSE4.2, hence D_AVX.
    static if (GDC_with_SSE42)
    {
        return cast(__m128i) __builtin_ia32_pcmpgtq(lhs, rhs);
    }
    else version(LDC)
    {
        // LDC x86: Optimized since LDC 1.1.0 -O1
        //   arm64: Optimized since LDC 1.8.0 -O1
        // When SSE4.2 is disabled, this gives the same sequence as the scalar path below.
        static if (SIMD_COMPARISON_MASKS_16B)
            return cast(__m128i)(lhs > rhs);
        else
            return cast(__m128i)( greaterMask!long2(lhs, rhs));
    }
    else
    {
        // Scalar path: build the mask lane by lane (-1 is all ones).
        long2 r;
        foreach (lane; 0 .. 2)
            r.ptr[lane] = (lhs.array[lane] > rhs.array[lane]) ? -1 : 0;
        return cast(__m128i)r;
    }
}
unittest
{
    // This test previously called _mm_cmpgt_epi32 and only passed by
    // coincidence. It now exercises _mm_cmpgt_epi64.
    __m128i A = _mm_setr_epi64(-3,  2);
    __m128i B = _mm_setr_epi64(4, -2);
    long[2] correct = [ 0, -1 ]; // -3 > 4 is false, 2 > -2 is true
    long2 R = cast(long2)(_mm_cmpgt_epi64(A, B));
    assert(R.array == correct);

    // A case where a 32-bit lane comparison would give a different answer:
    // 2^32 has a zero low half, and -1 has both halves set.
    __m128i C = _mm_setr_epi64(0x1_0000_0000, -1);
    __m128i D = _mm_setr_epi64(1, 0);
    long[2] correct2 = [ -1, 0 ]; // 2^32 > 1 is true, -1 > 0 is false
    long2 R2 = cast(long2)(_mm_cmpgt_epi64(C, D));
    assert(R2.array == correct2);
}
554 
/// Compare packed strings with implicit lengths in `a` and `b`, using the control
/// in `imm8`. Returns 1 if `b` did not contain a null character and the resulting
/// mask was zero, and 0 otherwise.
int _mm_cmpistra(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
    {
        return cast(int) __builtin_ia32_pcmpistria128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistria128(cast(byte16)a, cast(byte16)b, imm8);
    }
    else
    {
        // Software path: measure both strings with the helper matching the unit
        // size, then defer to the explicit-length variant.
        static if ((imm8 & 1) == 0)
        {
            int lenA = findLengthByte(a);
            int lenB = findLengthByte(b);
        }
        else
        {
            int lenA = findLengthShort(a);
            int lenB = findLengthShort(b);
        }
        return _mm_cmpestra!imm8(a, lenA, b, lenB);
    }
}
unittest
{
    // String matching a-la strcmp: with negated results the mask must be null
    // for the strings to match.
    enum int eachEqual = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH;

    char[16] A = "Maximum\x00one";
    char[16] B = "Maximum\x00four";
    char[16] C = "Mbximum\x00length!";
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);
    __m128i mmC = _mm_loadu_si128(cast(__m128i*)C.ptr);

    // match, but b is too short
    assert(0 == _mm_cmpistra!(eachEqual | _SIDD_MASKED_NEGATIVE_POLARITY)(mmA, mmB));

    // do not match
    assert(0 == _mm_cmpistra!(eachEqual | _SIDD_NEGATIVE_POLARITY)(mmA, mmC));
}
602 
/// Compare packed strings with implicit lengths in `a` and `b`, using the control
/// in `imm8`. Returns 1 if the resulting mask was non-zero, and 0 otherwise.
int _mm_cmpistrc(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
    {
        return cast(int) __builtin_ia32_pcmpistric128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return cast(int) __builtin_ia32_pcmpistric128(cast(byte16)a, cast(byte16)b, imm8);
    }
    else
    {
        // Software path: measure both strings with the helper matching the unit
        // size, then defer to the explicit-length variant.
        static if ((imm8 & 1) == 0)
        {
            int lenA = findLengthByte(a);
            int lenB = findLengthByte(b);
        }
        else
        {
            int lenA = findLengthShort(a);
            int lenB = findLengthShort(b);
        }
        return _mm_cmpestrc!imm8(a, lenA, b, lenB);
    }
}
unittest
{
    // Compare two shorter strings.
    // Element-wise equality with negated results: a match gives 0, like strcmp.
    enum int strcmpLike = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY;

    char[16] A = "Hello";
    char[16] B = "Hello moon";
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);

    // A string compared with itself reports no difference.
    assert(0 == _mm_cmpistrc!strcmpLike(mmA, mmA));

    // "Hello" against "Hello moon" reports a difference.
    assert(1 == _mm_cmpistrc!strcmpLike(mmA, mmB));
}
646 
/// Compare packed strings with implicit lengths in `a` and `b`, using the control
/// in `imm8`, and return the generated index.
/// Note: if the mask is all zeroes, the returned index is always `Count`
/// (8 or 16 depending on size).
int _mm_cmpistri(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistri128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistri128(cast(byte16)a, cast(byte16)b, imm8);
    }
    else
    {
        // Software path: measure both strings with the helper matching the unit
        // size, then defer to the explicit-length variant.
        static if ((imm8 & 1) == 0)
        {
            int lenA = findLengthByte(a);
            int lenB = findLengthByte(b);
        }
        else
        {
            int lenA = findLengthShort(a);
            int lenB = findLengthShort(b);
        }
        return _mm_cmpestri!imm8(a, lenA, b, lenB);
    }
}
unittest
{
    // Range comparison, negated only before the end of the string: set units
    // mark characters of the text that fall outside every range.
    enum int outsideRanges = _SIDD_UBYTE_OPS | _SIDD_CMP_RANGES | _SIDD_MASKED_NEGATIVE_POLARITY;

    // Goal: identify the last character that isn't an identifier character.
    //                      v (at index 7)
    char[16] text = "my_i(en)ifie";
    char[16] identRanges = "__azAz09";
    __m128i mmText = _mm_loadu_si128(cast(__m128i*)text.ptr);
    __m128i mmRanges = _mm_loadu_si128(cast(__m128i*)identRanges.ptr);

    // Unit mask: '(' at index 4 and ')' at index 7 are flagged.
    byte16 flagged = cast(byte16)_mm_cmpistrm!(outsideRanges | _SIDD_UNIT_MASK)(mmRanges, mmText);
    byte[16] expected = [0, 0, 0, 0, -1, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(flagged.array == expected);

    // ')' is the last char not to be in [__azAz09]
    int last = _mm_cmpistri!(outsideRanges | _SIDD_MOST_SIGNIFICANT)(mmRanges, mmText);
    assert(last == 7);
}
697 
/// Compare packed strings with implicit lengths in `a` and `b`, using the control
/// in `imm8`, and return the generated mask.
__m128i _mm_cmpistrm(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
    {
        return cast(__m128i) __builtin_ia32_pcmpistrm128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return cast(__m128i) __builtin_ia32_pcmpistrm128(cast(byte16)a, cast(byte16)b, imm8);
    }
    else
    {
        // Software path: measure both strings with the helper matching the unit
        // size, then defer to the explicit-length variant.
        static if ((imm8 & 1) == 0)
        {
            int lenA = findLengthByte(a);
            int lenB = findLengthByte(b);
        }
        else
        {
            int lenA = findLengthShort(a);
            int lenB = findLengthShort(b);
        }
        return _mm_cmpestrm!imm8(a, lenA, b, lenB);
    }
}
unittest
{
    // Set membership: which letters from B were found in A.
    enum int anyOf = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY;

    char[16] A = "Hello world!";
    char[16] B = "aeiou!";
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);

    // Bit mask: 'e', 'o', and '!' were found, i.e. bits 1, 3 and 5 => 42.
    byte16 bitResult = cast(byte16)_mm_cmpistrm!(anyOf | _SIDD_BIT_MASK)(mmA, mmB);
    byte[16] expectedBits = [42, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(bitResult.array == expectedBits);

    // Unit mask: the same three positions, one full byte each.
    byte16 unitResult = cast(byte16)_mm_cmpistrm!(anyOf | _SIDD_UNIT_MASK)(mmA, mmB);
    byte[16] expectedUnits = [0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(unitResult.array == expectedUnits);
}
745 
/// Compare packed strings with implicit lengths in `a` and `b`, using the control
/// in `imm8`, and return bit 0 of the resulting bit mask.
int _mm_cmpistro(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistrio128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistrio128(cast(byte16)a, cast(byte16)b, imm8);
    }
    else
    {
        // Software path: measure both strings with the helper matching the unit
        // size, then defer to the explicit-length variant.
        static if ((imm8 & 1) == 0)
        {
            int lenA = findLengthByte(a);
            int lenB = findLengthByte(b);
        }
        else
        {
            int lenA = findLengthShort(a);
            int lenB = findLengthShort(b);
        }
        return _mm_cmpestro!imm8(a, lenA, b, lenB);
    }
}
unittest
{
    // Set membership: which letters from the second operand were found in A.
    enum int anyOfBits = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK;

    char[16] A = "Hallo world!";
    char[16] B = "aeiou!";
    char[16] C = "Z";
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);
    __m128i mmC = _mm_loadu_si128(cast(__m128i*)C.ptr);

    // Bit 0 is set because 'a' was found in "Hallo world!"
    assert(_mm_cmpistro!anyOfBits(mmA, mmB) == 1);

    // Bit 0 is clear because 'Z' wasn't found in A
    assert(_mm_cmpistro!anyOfBits(mmA, mmC) == 0);
}
793 
/// Returns 1 if any character in `a` was null, and 0 otherwise.
/// `b` does not take part in the software path.
int _mm_cmpistrs(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistris128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistris128(cast(byte16)a, cast(byte16)b, imm8);
    }
    else
    {
        // A string without terminator has the full length: 8 words or 16 bytes.
        enum int unitCount = (imm8 & 1) ? 8 : 16;

        static if (imm8 & 1)
            int lenA = findLengthShort(a);
        else
            int lenA = findLengthByte(a);

        return lenA != unitCount;
    }
}
unittest
{
    char[16] empty = "";
    char[16] partial = "hello";
    char[16] full = "Maximum length!!";
    __m128i mmEmpty = _mm_loadu_si128(cast(__m128i*)empty.ptr);
    __m128i mmPartial = _mm_loadu_si128(cast(__m128i*)partial.ptr);
    __m128i mmFull = _mm_loadu_si128(cast(__m128i*)full.ptr);

    // Strings shorter than the vector contain a null character.
    assert(_mm_cmpistrs!_SIDD_UBYTE_OPS(mmEmpty, mmEmpty) == 1);
    assert(_mm_cmpistrs!_SIDD_SBYTE_OPS(mmPartial, mmPartial) == 1);

    // A string filling the whole vector does not (checked with 16-bit units).
    assert(_mm_cmpistrs!_SIDD_UWORD_OPS(mmFull, mmFull) == 0);
}
831 
/// Returns 1 if any character in `b` was null, and 0 otherwise.
/// In the software fallback, bit 0 of `imm8` selects 16-bit characters
/// (8 per register) instead of 8-bit characters (16 per register).
int _mm_cmpistrz(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistriz128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistriz128(cast(byte16)a, cast(byte16)b, imm8);
    }
    else
    {
        // `b` contains a null exactly when its implicit length is shorter
        // than the number of characters a register can hold.
        enum bool wideChars = (imm8 & 1) != 0;
        static if (wideChars)
            return findLengthShort(b) != 8;
        else
            return findLengthByte(b) != 16;
    }
}
unittest
{
    char[16] empty = "";
    char[16] hello = "hello";
    char[16] full  = "Maximum length!!"; // 16 characters, no terminator fits
    __m128i mmEmpty = _mm_loadu_si128(cast(__m128i*)empty.ptr);
    __m128i mmHello = _mm_loadu_si128(cast(__m128i*)hello.ptr);
    __m128i mmFull  = _mm_loadu_si128(cast(__m128i*)full.ptr);

    // Only the second operand is inspected.
    assert(_mm_cmpistrz!_SIDD_UBYTE_OPS(mmFull, mmEmpty) == 1);
    assert(_mm_cmpistrz!_SIDD_SBYTE_OPS(mmFull, mmHello) == 1);
    assert(_mm_cmpistrz!_SIDD_UWORD_OPS(mmEmpty, mmFull) == 0);
}
869 
870 
/// Starting with the initial value in `crc`, accumulates a CRC32 value
/// for unsigned 16-bit integer `v`.
/// Warning: this is computing CRC-32C (Castagnoli), not CRC-32.
uint _mm_crc32_u16 (uint crc, ushort v) @safe
{
    static if (GDC_with_SSE42 || LDC_with_CRC32)
    {
        return __builtin_ia32_crc32hi(crc, v);
    }
    else static if (LDC_with_ARM64_CRC)
    {
        return __crc32ch(crc, v);
    }
    else
    {
        // Software path: accumulate the low byte first, then the high byte.
        const ubyte lowByte  = cast(ubyte)(v & 0xff);
        const ubyte highByte = cast(ubyte)(v >> 8);
        return _mm_crc32_u8(_mm_crc32_u8(crc, lowByte), highByte);
    }
}
unittest
{
    assert(_mm_crc32_u16(0x12345678, 0x4512) == 0x39c3f0ff);
    assert(_mm_crc32_u16(0x76543210, 0xf50f) == 0xcffbcf07);
    assert(_mm_crc32_u16(0xDEADBEEF, 0x0017) == 0xc7e3fe85);
}
904 
/// Starting with the initial value in `crc`, accumulates a CRC32 value
/// for unsigned 32-bit integer `v`.
/// Warning: this is computing CRC-32C (Castagnoli), not CRC-32.
uint _mm_crc32_u32 (uint crc, uint v) @safe
{
    static if (GDC_with_SSE42 || LDC_with_CRC32)
    {
        return __builtin_ia32_crc32si(crc, v);
    }
    else static if (LDC_with_ARM64_CRC)
    {
        return __crc32cw(crc, v);
    }
    else
    {
        // Software path: accumulate the four bytes of `v`, lowest byte first.
        for (int shift = 0; shift < 32; shift += 8)
            crc = _mm_crc32_u8(crc, cast(ubyte)(v >> shift));
        return crc;
    }
}
unittest
{
    assert(_mm_crc32_u32(0x12345678, 0x45123563) == 0x22a6ec54);
    assert(_mm_crc32_u32(0x76543210, 0xf50f9993) == 0x7019a6cf);
    assert(_mm_crc32_u32(0xDEADBEEF, 0x00170017) == 0xbc552c27);
}
940 
/// Starting with the initial value in `crc`, accumulates a CRC32
/// value for unsigned 64-bit integer `v`.
/// Warning: this is computing CRC-32C (Castagnoli), not CRC-32.
/// The ARM64 and software paths only use the low 32 bits of `crc`.
ulong _mm_crc32_u64 (ulong crc, ulong v)
{
    // The 64-bit x86 builtin is only usable when targeting x86_64.
    version(X86_64)
        enum bool useX86Builtin = GDC_with_SSE42 || LDC_with_CRC32;
    else
        enum bool useX86Builtin = false;

    static if (useX86Builtin)
    {
        return __builtin_ia32_crc32di(crc, v);
    }
    else static if (LDC_with_ARM64_CRC)
    {
        return __crc32cd(cast(uint)crc, v);
    }
    else
    {
        // Software path: accumulate the eight bytes of `v`, lowest byte first.
        uint acc = cast(uint)crc;
        for (int shift = 0; shift < 64; shift += 8)
            acc = _mm_crc32_u8(acc, cast(ubyte)(v >> shift));
        return acc;
    }
}
unittest
{
    assert(_mm_crc32_u64(0x1234567812345678, 0x39C3F0FFCFFBCF07) == 0xd66b1074);
    assert(_mm_crc32_u64(0x7654321001234567, 0xFACEFEED) == 0xac12f9c6);
    assert(_mm_crc32_u64(0xDEADBEEFCAFEBABE, 0x0017C7E3FE850017) == 0xa2d13dd8);
}
982 
/// Starting with the initial value in `crc`, accumulates a CRC32 value
/// for unsigned 8-bit integer `v`.
/// Warning: this is computing CRC-32C (Castagnoli), not CRC-32.
uint _mm_crc32_u8 (uint crc, ubyte v) @safe
{
    static if (GDC_with_SSE42 || LDC_with_CRC32)
    {
        return __builtin_ia32_crc32qi(crc, v);
    }
    else static if (LDC_with_ARM64_CRC)
    {
        return __crc32cb(crc, v);
    }
    else
    {
        // Table-driven step: the low byte of (crc ^ v) selects the table
        // entry, which is combined with the remaining 24 bits of `crc`.
        const uint slot = (crc ^ v) & 0xFF;
        return (crc >> 8) ^ CRC32cTable[slot];
    }
}
unittest
{
    assert(_mm_crc32_u8(0x12345678, 0x45) == 0x8fd93134);
    assert(_mm_crc32_u8(0x76543210, 0xf5) == 0xd6b7e834);
    assert(_mm_crc32_u8(0xDEADBEEF, 0x00) == 0xbdfd3980);
}
1014 
1015 
1016 // Utilities for this file
1017 
1018 private:
1019 
// Decides whether the software CRC-32C lookup table has to be compiled in.
static if (GDC_with_SSE42 || LDC_with_CRC32)
{
    // x86 CRC builtins available: the table is dropped on x86_64 only.
    version(X86_64)
        enum bool NeedCRC32CTable = false;
    else
        enum bool NeedCRC32CTable = true;
}
else static if (LDC_with_ARM64_CRC)
{
    // ARM64 CRC builtins available.
    enum bool NeedCRC32CTable = false;
}
else
{
    // No hardware CRC support selected.
    enum bool NeedCRC32CTable = true;
}
1042 
// Lookup table used by the software fallback of `_mm_crc32_u8`, which indexes
// it with `(crc ^ v) & 0xFF`. Only compiled in when `NeedCRC32CTable` is true.
static if (NeedCRC32CTable)
{
    static immutable uint[256] CRC32cTable =
    [
        0x0, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, 0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb,
        0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b, 0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24,
        0x105ec76f, 0xe235446c, 0xf165b798, 0x30e349b, 0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384,
        0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54, 0x5d1d08bf, 0xaf768bbc, 0xbc267848, 0x4e4dfb4b,
        0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a, 0xe72719c1, 0x154c9ac2, 0x61c6936, 0xf477ea35,
        0xaa64d611, 0x580f5512, 0x4b5fa6e6, 0xb93425e5, 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa,
        0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45, 0xf779deae, 0x5125dad, 0x1642ae59, 0xe4292d5a,
        0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a, 0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595,
        0x417b1dbc, 0xb3109ebf, 0xa0406d4b, 0x522bee48, 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957,
        0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687, 0xc38d26c, 0xfe53516f, 0xed03a29b, 0x1f682198,
        0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927, 0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38,
        0xdbfc821c, 0x2997011f, 0x3ac7f2eb, 0xc8ac71e8, 0x1c661503, 0xee0d9600, 0xfd5d65f4, 0xf36e6f7,
        0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096, 0xa65c047d, 0x5437877e, 0x4767748a, 0xb50cf789,
        0xeb1fcbad, 0x197448ae, 0xa24bb5a, 0xf84f3859, 0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46,
        0x7198540d, 0x83f3d70e, 0x90a324fa, 0x62c8a7f9, 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6,
        0xfb410cc2, 0x92a8fc1, 0x1a7a7c35, 0xe811ff36, 0x3cdb9bdd, 0xceb018de, 0xdde0eb2a, 0x2f8b6829,
        0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c, 0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93,
        0x82f63b7, 0xfa44e0b4, 0xe9141340, 0x1b7f9043, 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c,
        0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3, 0x55326b08, 0xa759e80b, 0xb4091bff, 0x466298fc,
        0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0xb21572c, 0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033,
        0xa24bb5a6, 0x502036a5, 0x4370c551, 0xb11b4652, 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d,
        0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d, 0xef087a76, 0x1d63f975, 0xe330a81, 0xfc588982,
        0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d, 0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622,
        0x38cc2a06, 0xcaa7a905, 0xd9f75af1, 0x2b9cd9f2, 0xff56bd19, 0xd3d3e1a, 0x1e6dcdee, 0xec064eed,
        0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530, 0x417b1db, 0xf67c32d8, 0xe52cc12c, 0x1747422f,
        0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff, 0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0,
        0xd3d3e1ab, 0x21b862a8, 0x32e8915c, 0xc083125f, 0x144976b4, 0xe622f5b7, 0xf5720643, 0x7198540,
        0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90, 0x9e902e7b, 0x6cfbad78, 0x7fab5e8c, 0x8dc0dd8f,
        0xe330a81a, 0x115b2b19, 0x20bd8ed, 0xf0605bee, 0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1,
        0x69e9f0d5, 0x9b8273d6, 0x88d28022, 0x7ab90321, 0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e,
        0xf36e6f75, 0x105ec76, 0x12551f82, 0xe03e9c81, 0x34f4f86a, 0xc69f7b69, 0xd5cf889d, 0x27a40b9e,
        0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e, 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351,
    ];
}
1081 
// Returns the index of the first zero byte of `a`, or 16 when `a` holds
// no zero byte (implicit length of an 8-bit string).
int findLengthByte(__m128i a) pure @safe
{
    // One bit per byte lane, set where the byte equals zero.
    const int zeroLanes = _mm_movemask_epi8(_mm_cmpeq_epi8(a, _mm_setzero_si128()));

    // The lowest set bit is the position of the first zero byte.
    return (zeroLanes != 0) ? bsf(zeroLanes) : 16;
}
unittest
{
    char[16] shortText = "Hel!o";
    char[16] fullText  = "Maximum length!!";
    __m128i mmShort = _mm_loadu_si128(cast(__m128i*)shortText.ptr);
    __m128i mmFull  = _mm_loadu_si128(cast(__m128i*)fullText.ptr);
    assert(findLengthByte(mmShort) == 5);
    assert(findLengthByte(mmFull) == 16);
}
1101 
// Returns the index of the first zero 16-bit element of `a`, or 8 when `a`
// holds no zero element (implicit length of a 16-bit string).
int findLengthShort(__m128i a) pure @safe
{
    // Two bits per 16-bit lane (one per byte), set where the lane equals zero.
    const int zeroLanes = _mm_movemask_epi8(_mm_cmpeq_epi16(a, _mm_setzero_si128()));

    // The lowest set bit is a byte position; halve it to get the lane index.
    return (zeroLanes != 0) ? (bsf(zeroLanes) >> 1) : 8;
}
unittest
{
    short[8] withZero    = [10, 5423, 475, 0, 1, 1, 1, 1 ];
    short[8] withoutZero = [-1, -2, -3, 4, 5, 6, -32768, 1];
    __m128i mmWithZero    = _mm_loadu_si128(cast(__m128i*)withZero.ptr);
    __m128i mmWithoutZero = _mm_loadu_si128(cast(__m128i*)withoutZero.ptr);
    assert(findLengthShort(mmWithZero) == 3);
    assert(findLengthShort(mmWithoutZero) == 8);
}
1121 
// 16 all-ones bytes followed by 16 zero bytes. A 16-byte window starting at
// offset (16 - n) begins with n all-ones bytes and ends with zeros.
static immutable byte[32] MASK_DATA =
[
    // all-ones half
    -1, -1, -1, -1,   -1, -1, -1, -1,   -1, -1, -1, -1,   -1, -1, -1, -1,
    // all-zeros half
     0,  0,  0,  0,    0,  0,  0,  0,    0,  0,  0,  0,    0,  0,  0,  0,
];
1129 
// Makes a byte validity mask for an explicit-length string: the first `len`
// bytes of the result are 0xFF and the remaining ones are 0.
// `len` must be within 0 .. 16 (inclusive), otherwise the load would start
// outside of `MASK_DATA`.
__m128i validMask8e(int len) @trusted
{
    assert(len >= 0 && len <= 16);

    // Slide a 16-byte window over MASK_DATA so that it starts `len` bytes
    // before the all-ones / all-zeros boundary.
    return _mm_loadu_si128(cast(__m128i*) &MASK_DATA[16-len]);
}
unittest
{
    byte[16] correct0  = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    byte[16] correct5  = [-1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    byte[16] correct16 = [-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1];
    byte16 M0  = cast(byte16) validMask8e(0);
    byte16 M5  = cast(byte16) validMask8e(5);
    byte16 M16 = cast(byte16) validMask8e(16);
    assert(M0.array == correct0);
    assert(M5.array == correct5);
    assert(M16.array == correct16);
}
1146 
// Makes a 16-bit validity mask for an explicit-length string: the first `len`
// 16-bit elements of the result are 0xFFFF and the remaining ones are 0.
// `len` is counted in 16-bit elements and must be within 0 .. 8 (inclusive),
// otherwise the load would start outside of `MASK_DATA`.
__m128i validMask16e(int len) @trusted
{
    assert(len >= 0 && len <= 8);

    // Each element is two bytes wide, hence the window moves by `len*2` bytes.
    return _mm_loadu_si128(cast(__m128i*) &MASK_DATA[16-len*2]);
}
unittest
{
    short[8] correct0 = [0, 0, 0, 0, 0, 0, 0, 0];
    short[8] correct3 = [-1, -1, -1, 0, 0, 0, 0, 0];
    short[8] correct8 = [-1, -1, -1, -1, -1, -1, -1, -1];
    short8 M0 = cast(short8) validMask16e(0);
    short8 M3 = cast(short8) validMask16e(3);
    short8 M8 = cast(short8) validMask16e(8);
    assert(M0.array == correct0);
    assert(M3.array == correct3);
    assert(M8.array == correct8);
}
1159 
// Internal implementation for non-SSE4.2
// Compare 8-bit or 16-bit explicit-length strings, get a mask.
// `la` and `lb` are the lengths of `a` and `b`, counted in characters.
// They are passed by reference and normalized in place: the absolute value
// is taken, then the result is saturated to the number of characters a
// register holds (8 when bit 0 of `imm8` selects 16-bit characters, else 16).
// The validity masks built from these lengths are then handed to `cmpstrMask`.
__m128i cmpstrMaskExplicit(int imm8)(__m128i a, 
                                     ref int la, 
                                     __m128i b, 
                                     ref int lb) @safe
{
    // Number of characters in a 128-bit operand.
    enum int maxLen = (imm8 & 1) ? 8 : 16;

    // saturates lengths (the Intrinsics Guide doesn't tell this)
    if (la < 0) la = -la;
    if (lb < 0) lb = -lb;

    // `-int.min` is still negative, so also saturate anything below zero.
    // Saturating to `maxLen` (and not always 16) keeps the 16-bit mask
    // lookup inside of its table.
    if (la < 0 || la > maxLen) la = maxLen;
    if (lb < 0 || lb > maxLen) lb = maxLen;

    static if (imm8 & 1)
    {
        __m128i aValid = validMask16e(la);
        __m128i bValid = validMask16e(lb);
    }
    else
    {
        __m128i aValid = validMask8e(la);
        __m128i bValid = validMask8e(lb);
    }
    return cmpstrMask!imm8(a, aValid, b, bValid);
}
1187 
// Internal implementation for non-SSE4.2
// Compare 8-bit or 16-bit strings, get a mask with one lane per character
// of `b` (all bits set where the comparison holds).
// `aValid` and `bValid` are byte-masks or word-masks of the valid
// zone in `a` and `b`.
// `imm8` is decoded as follows:
//   - bit 0: characters are 16-bit instead of 8-bit
//   - bit 1: characters are signed (only matters for the ranges mode)
//   - bits 2-3: comparison mode (0 = equal any, 1 = ranges,
//     2 = equal each, 3 = equal ordered)
//   - `_SIDD_NEGATIVE_POLARITY` / `_SIDD_MASKED_POSITIVE_POLARITY` bits:
//     optional negation of the result, see the end of the function.
__m128i cmpstrMask(int imm8)(__m128i a, 
                             __m128i aValid, 
                             __m128i b, 
                             const __m128i bValid) @safe
{
    enum bool chars16Bits = (imm8 & 1) != 0;
    enum int Mode = (imm8 >> 2) & 3;

    static if (Mode == 0) // equal any
    {
        // For each character of b, tell whether it equals any valid character of a.
        __m128i R = _mm_setzero_si128();
        static if (chars16Bits) // 64 comparisons
        {
            for (int k = 0; k < 8; ++k)
            {
                __m128i eqMask = _mm_cmpeq_epi16(a, b);
                eqMask = _mm_and_si128(eqMask, aValid);
                R = _mm_or_si128(R, eqMask);

                // rotate a and aValid by one character, so that after all
                // iterations every character of a has faced every one of b
                a = _mm_or_si128(_mm_srli_si128!2(a), _mm_slli_si128!14(a));
                aValid = _mm_or_si128(_mm_srli_si128!2(aValid), _mm_slli_si128!14(aValid));
            }
        }
        else // 256 comparisons
        {
            for (int k = 0; k < 16; ++k)
            {
                __m128i eqMask = _mm_cmpeq_epi8(a, b);
                eqMask = _mm_and_si128(eqMask, aValid);
                R = _mm_or_si128(R, eqMask);

                // rotate a and aValid by one character
                a = _mm_or_si128(_mm_srli_si128!1(a), _mm_slli_si128!15(a));
                aValid = _mm_or_si128(_mm_srli_si128!1(aValid), _mm_slli_si128!15(aValid));
            }
        }
        // invalid b part never matches
        R = _mm_and_si128(R, bValid);
    }
    else static if (Mode == 1) // ranges
    {
        enum bool signed = (imm8 & 2) != 0;

        // For each character in b, the returned mask says if it was found in a range-pair in `a`.
        __m128i R = _mm_setzero_si128();
        static if (chars16Bits)
        {
            for (int pos = 0; pos < 8; pos += 2)
            {
                // a[pos] and a[pos+1] are the inclusive bounds of one range
                short min = (cast(short8)a).array[pos];
                short max = (cast(short8)a).array[pos+1];
                static if (signed)
                {
                    __m128i ge = ~_mm_cmplt_epi16(b, _mm_set1_epi16(min));
                    __m128i le = ~_mm_cmpgt_epi16(b, _mm_set1_epi16(max));
                }
                else
                {
                    // No SSE way to do 16-bit unsigned comparisons, 
                    // but flipping the sign bit lets us use signed comparisons
                    __m128i firstBits = _mm_set1_epi16(-32768);
                    __m128i reverseB = _mm_xor_si128(b, firstBits);
                    __m128i reverseMin = _mm_xor_si128(_mm_set1_epi16(min), firstBits);
                    __m128i reverseMax = _mm_xor_si128(_mm_set1_epi16(max), firstBits);
                    __m128i ge = ~_mm_cmplt_epi16(reverseB, reverseMin);
                    __m128i le = ~_mm_cmpgt_epi16(reverseB, reverseMax);
                }
                __m128i inRange = _mm_and_si128(le, ge);

                // The range only counts if its upper bound is a valid character of a.
                short aValidHere = (cast(short8)aValid).array[pos+1];
                __m128i mmAValidHere = _mm_set1_epi16(aValidHere);
                inRange = _mm_and_si128(inRange, mmAValidHere); 

                R = _mm_or_si128(R, inRange);
            }            
        }
        else // 8-bits
        {
            for (int pos = 0; pos < 16; pos += 2)
            {
                // a[pos] and a[pos+1] are the inclusive bounds of one range
                byte min = (cast(byte16)a).array[pos];
                byte max = (cast(byte16)a).array[pos+1];
                static if (signed)
                {
                    // b >= min is NOT(b < min), b <= max is NOT(b > max)
                    __m128i ge = ~_mm_cmplt_epi8(b, _mm_set1_epi8(min));
                    __m128i le = ~_mm_cmpgt_epi8(b, _mm_set1_epi8(max));
                }
                else
                {
                    // No SSE way to do 8-bit unsigned comparisons, 
                    // but flipping the sign bit lets us use signed comparisons
                    __m128i firstBits = _mm_set1_epi8(-128);
                    __m128i reverseB = _mm_xor_si128(b, firstBits);
                    __m128i reverseMin = _mm_xor_si128(_mm_set1_epi8(min), firstBits);
                    __m128i reverseMax = _mm_xor_si128(_mm_set1_epi8(max), firstBits);
                    __m128i ge = ~_mm_cmplt_epi8(reverseB, reverseMin);
                    __m128i le = ~_mm_cmpgt_epi8(reverseB, reverseMax);
                }
                __m128i inRange = _mm_and_si128(le, ge);

                // The range only counts if its upper bound is a valid character of a.
                byte aValidHere = (cast(byte16)aValid).array[pos+1];
                __m128i mmAValidHere = _mm_set1_epi8(aValidHere);
                inRange = _mm_and_si128(inRange, mmAValidHere); 

                R = _mm_or_si128(R, inRange);
            }
        }
        // invalid b part is not in range
        R = _mm_and_si128(R, bValid);
    }
    else static if (Mode == 2) // equal each, just 16 comparisons not 256
    {
        static if (chars16Bits)
        {
            __m128i R = _mm_cmpeq_epi16(a, b);
        }
        else
        {
            __m128i R = _mm_cmpeq_epi8(a, b);
        }

        // if only a or b is invalid, consider not equal
        R = _mm_andnot_si128(_mm_xor_si128(aValid, bValid), R);

        // if a and b are both invalid, consider equal
        R = _mm_or_si128(R, ~_mm_or_si128(aValid, bValid));
    }  
    else static if (Mode == 3) // equal ordered
    {
        // a is searched in b.

        __m128i bValidShift = bValid;

        __m128i R = _mm_set1_epi32(-1); // all b positions possible for containing a
        static if (chars16Bits)
        {
            for (int pos = 0; pos < 8; ++pos)
            {
                // compare character `pos` of a, where can it go in b?
                short charK = (cast(short8)a).array[pos];
                __m128i mmcharK = _mm_set1_epi16(charK);

                short aValidHere = (cast(short8)aValid).array[pos];
                __m128i mmAValidHere = _mm_set1_epi16(aValidHere);
                __m128i mmAInvalidHere = _mm_xor_si128(mmAValidHere, _mm_set1_epi32(-1));
                __m128i eqMask = _mm_cmpeq_epi16(mmcharK, b);

                // Where A is invalid, the comparison always holds "equal"
                eqMask = _mm_or_si128(eqMask, mmAInvalidHere);

                // Where B is invalid, and A is valid, the comparison is forced to false
                eqMask = _mm_and_si128(eqMask, _mm_or_si128(bValidShift, mmAInvalidHere));

                // a start position survives only if every character of a matches
                R = _mm_and_si128(R, eqMask);

                // drop first char of b
                b = _mm_srli_si128!2(b);
                bValidShift = _mm_srli_si128!2(bValidShift);
            }
        }
        else
        {
            for (int pos = 0; pos < 16; ++pos)
            {
                // compare character `pos` of a, where can it go in b?
                byte charK = (cast(byte16)a).array[pos];
                __m128i mmcharK = _mm_set1_epi8(charK);

                byte aValidHere = (cast(byte16)aValid).array[pos];            
                __m128i mmAValidHere = _mm_set1_epi8(aValidHere);
                __m128i mmAInvalidHere = _mm_xor_si128(mmAValidHere, _mm_set1_epi32(-1));
                __m128i eqMask = _mm_cmpeq_epi8(mmcharK, b);

                // Where A is invalid, the comparison always holds "equal"
                eqMask = _mm_or_si128(eqMask, mmAInvalidHere);

                // Where B is invalid, and A is valid, the comparison is forced to false
                eqMask = _mm_and_si128(eqMask, _mm_or_si128(bValidShift, mmAInvalidHere));

                // a start position survives only if every character of a matches
                R = _mm_and_si128(R, eqMask);

                // drop first char of b
                b = _mm_srli_si128!1(b);
                bValidShift = _mm_srli_si128!1(bValidShift);
            }
        }
    }
    else 
        static assert(0);

    // Optionally negate result
    static if (imm8 & _SIDD_NEGATIVE_POLARITY)
    {
        static if (imm8 & _SIDD_MASKED_POSITIVE_POLARITY) 
        {
            R = _mm_xor_si128(R, bValid); // only negate valid b
        }
        else
        {
            R = _mm_xor_si128(R, _mm_set1_epi32(-1)); // negate all
        }
    }
    return R;
}