/**
* SSE4.2 intrinsics.
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE4_2
*
* Copyright: Guillaume Piolat 2022.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.nmmintrin;

public import inteli.types;
import inteli.internals;
public import inteli.smmintrin;
import core.bitop: bsf, bsr;


// Note: this header will work whether you have SSE4.2 enabled or not.
// With LDC, use "dflags-ldc": ["-mattr=+sse4.2"] or equivalent to actively
// generate SSE4.2 instructions (they are often enabled with -O1 or greater).
//   - Additionally, you need ["-mattr=+crc"] on ARM if you want hardware CRC instructions.
//   - Since LDC 1.30, you need ["-mattr=+crc32"] on x86_64 if you want hardware CRC instructions,
//     it is not considered implied by sse4.2 anymore.
// With GDC, use "dflags-gdc": ["-msse4.2"] or equivalent to generate SSE4.2 instructions.

nothrow @nogc:

// <Data size and signedness>

/// String contains unsigned 8-bit characters (default).
enum int _SIDD_UBYTE_OPS = 0;

/// String contains unsigned 16-bit characters.
enum int _SIDD_UWORD_OPS = 1;

/// String contains signed 8-bit characters.
enum int _SIDD_SBYTE_OPS = 2;

/// String contains signed 16-bit characters.
enum int _SIDD_SWORD_OPS = 3;

// </Data size and signedness>


// <Comparison options>

/// For each character in `b`, find if it is in `a` (default).
/// The resulting mask has a bit set at each `b` position that was found in `a`.
enum int _SIDD_CMP_EQUAL_ANY = 0;

/// For each character `c` in `b`, determine if
/// `a[0] <= c <= a[1] or a[2] <= c <= a[3]...`
/// i.e. `a` holds (lower bound, upper bound) pairs.
/// Contrary to false documentation on the Internet, the pairs must be in `a`!
enum int _SIDD_CMP_RANGES = 4;

/// The strings defined by `a` and `b` are equal
enum int _SIDD_CMP_EQUAL_EACH = 8;

/// Search for the defined substring in the target
enum int _SIDD_CMP_EQUAL_ORDERED = 12;

// </Comparison options>

// <Result polarity>

/// Do not negate results (default, no effect)
enum int _SIDD_POSITIVE_POLARITY = 0;

/// Negates results
enum int _SIDD_NEGATIVE_POLARITY = 16;

/// No effect. Do not negate results before the end of the string. (default when using `_SIDD_NEGATIVE_POLARITY`)
/// You basically never want this.
enum int _SIDD_MASKED_POSITIVE_POLARITY = 32;

/// Negates results only before the end of the string
enum int _SIDD_MASKED_NEGATIVE_POLARITY = 48;

// </Result polarity>

// <Bit returned>

/// **Index only**: return the least significant bit (default).
enum int _SIDD_LEAST_SIGNIFICANT = 0;

/// **Index only**: return the most significant bit.
enum int _SIDD_MOST_SIGNIFICANT = 64;

// </Bit returned>

/// **Mask only**: return the bit mask (default).
enum int _SIDD_BIT_MASK = 0;

/// **Mask only**: return the byte/word mask.
enum int _SIDD_UNIT_MASK = 64;

/// So SSE4.2 has a lot of hard-to-understand instructions. Here is another explanation.
///
/// Alternative explanation of imm8
///
/// imm8 is an 8-bit immediate operand specifying whether the characters are bytes or
/// words and the type of comparison to do.
///
/// Bits [1:0]: Determine source data format.
///   00: 16 unsigned bytes
///   01: 8 unsigned words
///   10: 16 signed bytes
///   11: 8 signed words
///
/// Bits [3:2]: Determine comparison type and aggregation method.
///   00: Subset: Each character in B is compared for equality with all
///       the characters in A.
///   01: Ranges: Each character in B is compared to A pairs. The comparison
///       basis is greater than or equal for even-indexed elements in A,
///       and less than or equal for odd-indexed elements in A.
///   10: Match: Compare each pair of corresponding characters in A and
///       B for equality.
///   11: Substring: Search B for substring matches of A.
///
/// Bits [5:4]: Determine whether to do a one's complement on the bit
///             mask of the comparison results.
///   00: No effect.
///   01: Negate the bit mask.
///   10: No effect.
///   11: Negate the bit mask only for bits with an index less than or equal
///       to the size of A or B.
///
/// Bit [6]: Selects the result variant, see `_SIDD_MOST_SIGNIFICANT` (index-returning
///          intrinsics) and `_SIDD_UNIT_MASK` (mask-returning intrinsics).
///



/// Compare packed strings in `a` and `b` with lengths `la` and `lb` using
/// the control in `imm8`, and returns 1 if `b` "does not contain a null character"
/// and the resulting mask was zero, and 0 otherwise.
/// Warning: actually it seems the instruction does accept \0 in input, just the length must be >= count.
///          It's not clear for what purpose.
int _mm_cmpestra(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return cast(int) __builtin_ia32_pcmpestria128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestria128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        // Number of characters held by one vector: 8 words or 16 bytes.
        enum int Count = (imm8 & 1) ? 8 : 16;

        // NOTE(review): `la`/`lb` are handed over as lvalues and `lb` is read again below;
        // the negative-length unittest suggests the helper normalizes them - confirm
        // against cmpstrMaskExplicit.
        __m128i resultMask = cmpstrMaskExplicit!imm8(a, la, b, lb);

        // One bit per byte of the mask that compares equal to zero.
        int zeroBytes = _mm_movemask_epi8(_mm_cmpeq_epi8(resultMask, _mm_setzero_si128()));

        if (zeroBytes != 0xffff)
            return 0; // the mask has at least one non-zero byte
        return (lb >= Count) ? 1 : 0;
    }
}
unittest
{
    char[16] A = "Maximum\x00length!!";
    char[16] B = "Mbximum\x00length!!";
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);

    // string matching a-la strcmp, for 16-bytes of data
    // Use _SIDD_NEGATIVE_POLARITY since mask must be null, and all match must be one
    enum int bytesEqual = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY;
    assert(1 == _mm_cmpestra!bytesEqual(mmA, 16, mmA, 16));
    assert(0 == _mm_cmpestra!bytesEqual(mmA, 16, mmB, 16));

    // test negative length, this will be clamped to 16
    assert(1 == _mm_cmpestra!bytesEqual(mmA, -160, mmA, -17));

    // it seems you can't compare shorter strings for equality using _mm_cmpestra (!)

    // Test 16-bit format
    enum int wordsEqual = _SIDD_SWORD_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY;
    assert(1 == _mm_cmpestra!wordsEqual(mmA, 8, mmA, 8));
}

/// Compare packed strings in `a` and `b` with lengths `la` and `lb` using
/// the control in `imm8`, and returns 1 if the resulting mask was non-zero,
/// and 0 otherwise.
int _mm_cmpestrc(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return cast(int) __builtin_ia32_pcmpestric128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return cast(int) __builtin_ia32_pcmpestric128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        // Collect the byte sign bits of the comparison mask; the result is 1 if any is set.
        const int anySignBit = _mm_movemask_epi8(cmpstrMaskExplicit!imm8(a, la, b, lb));
        return anySignBit ? 1 : 0;
    }
}
unittest
{
    // Compare two shorter strings
    {
        char[16] A = "Hello world";
        char[16] B = "Hello moon";
        __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
        __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);

        // match gives 0 like strcmp
        enum int strcmpLike = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY;

        // The mask itself is computed here but not inspected.
        __m128i mask = _mm_cmpestrm!strcmpLike(mmA, 6, mmB, 6);

        assert(0 == _mm_cmpestrc!strcmpLike(mmA, 6, mmB, 6)); // "Hello " == "Hello "
        assert(1 == _mm_cmpestrc!strcmpLike(mmA, 7, mmB, 7)); // "Hello w" != "Hello m"
    }
}

/// Compare packed strings in `a` and `b` with lengths `la` and `lb` using
/// the control in `imm8`, and return the generated index.
/// Note: if the mask is all zeroes, the returned index is always `Count`
/// (8 or 16 depending on size).
int _mm_cmpestri(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestri128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestri128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        __m128i mask = cmpstrMaskExplicit!imm8(a, la, b, lb);

        // Convert the unit mask to bit mask
        static if (imm8 & 1)
        {
            // 16-bit characters: narrow each word to one byte so that
            // movemask yields one bit per character.
            enum int Count = 8;
            mask = _mm_packs_epi16(mask, _mm_setzero_si128());
        }
        else
        {
            enum int Count = 16;
        }
        int signbits = _mm_movemask_epi8(mask);

        // An empty mask always reports `Count`, whichever end is requested.
        if (signbits == 0)
            return Count;

        static if (imm8 & _SIDD_MOST_SIGNIFICANT)
            return bsr(signbits);
        else
            return bsf(signbits);
    }
}
unittest
{
    // Find the index of the first difference (at index 6)
    //                  v
    char[16] A = "Hello sun";
    char[16] B = "Hello moon";

    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);

    int index = _mm_cmpestri!(_SIDD_UBYTE_OPS
                            | _SIDD_CMP_EQUAL_EACH
                            | _SIDD_NEGATIVE_POLARITY
                            | _SIDD_LEAST_SIGNIFICANT)(mmA, 9, mmB, 10);
    assert(index == 6);

    // Those strings must compare equal, regardless of what happens after their length.
    index = _mm_cmpestri!(_SIDD_UBYTE_OPS
                        | _SIDD_CMP_EQUAL_EACH
                        | _SIDD_NEGATIVE_POLARITY
                        | _SIDD_LEAST_SIGNIFICANT)(mmA, 6, mmB, 6); // only look first six chars
    assert(index == 16);

    index = _mm_cmpestri!(_SIDD_UBYTE_OPS
                        | _SIDD_CMP_EQUAL_EACH
                        | _SIDD_NEGATIVE_POLARITY
                        | _SIDD_MOST_SIGNIFICANT)(mmA, 6, mmB, 6); // only look first six chars
    assert(index == 16);
}
unittest
{
    // Identify the last character that isn't an identifier character.
    //                   v (at index 7)
    char[16] A = "my_i(en)ifie";
    char[16] identRanges = "__azAz09";
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmI = _mm_loadu_si128(cast(__m128i*)identRanges.ptr);
    byte16 mask = cast(byte16)_mm_cmpestrm!(_SIDD_UBYTE_OPS
                                          | _SIDD_CMP_RANGES
                                          | _SIDD_MASKED_NEGATIVE_POLARITY
                                          | _SIDD_UNIT_MASK)(mmI, 8, mmA, 12);
    byte[16] correctM = [0, 0, 0, 0, -1, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(mask.array == correctM);

    int index = _mm_cmpestri!(_SIDD_UBYTE_OPS
                            | _SIDD_CMP_RANGES
                            | _SIDD_MASKED_NEGATIVE_POLARITY
                            | _SIDD_MOST_SIGNIFICANT)(mmI, 8, mmA, 12);
    assert(index == 7); // ')' is the last char not to be in [__azAz09]
}
unittest
{
    // testing _SIDD_CMP_RANGES but with signed shorts comparison instead (this only makes sense for _SIDD_CMP_RANGES)
    short[8] ranges  = [0, -1, 1000, 2000, 0, 0, 0, 0];
    short[8] numbers = [-32768, -1000, -1, -0, 0, 1, 1000, 32767];
    __m128i mmRanges  = _mm_loadu_si128(cast(__m128i*)ranges.ptr);
    __m128i mmNumbers = _mm_loadu_si128(cast(__m128i*)numbers.ptr);

    // Unsigned: the first pair reads as [0, 65535], so every number is in range.
    short8 mask = cast(short8)_mm_cmpestrm!(_SIDD_UWORD_OPS
                                          | _SIDD_CMP_RANGES
                                          | _SIDD_UNIT_MASK)(mmRanges, 4, mmNumbers, 8);
    short[8] correctM = [ -1, -1, -1, -1, -1, -1, -1, -1];
    assert(mask.array == correctM); // this check was missing: correctM was never compared

    // Signed: the first pair [0, -1] is empty, only 1000 falls in [1000, 2000].
    mask = cast(short8)_mm_cmpestrm!(_SIDD_SWORD_OPS
                                   | _SIDD_CMP_RANGES
                                   | _SIDD_UNIT_MASK)(mmRanges, 4, mmNumbers, 8);
    short[8] correctZ = [ 0, 0, 0, 0, 0, 0, -1, 0];
    assert(mask.array == correctZ);
}
unittest
{
    // Find a substring
    char[16] A = "def";
    char[16] B = "abcdefghdefff";
    char[16] C = "no substring";
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);
    __m128i mmC = _mm_loadu_si128(cast(__m128i*)C.ptr);

    byte16 mask = cast(byte16)_mm_cmpestrm!(_SIDD_UBYTE_OPS
                                          | _SIDD_CMP_EQUAL_ORDERED
                                          | _SIDD_UNIT_MASK)(mmA, 3, mmB, 13);
    byte[16] correctM = [0, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0];
    assert(mask.array == correctM);

    int firstMatch = _mm_cmpestri!(_SIDD_UBYTE_OPS
                                 | _SIDD_CMP_EQUAL_ORDERED)(mmA, 3, mmB, 13);
    assert(firstMatch == 3);

    int lastMatch = _mm_cmpestri!(_SIDD_UBYTE_OPS
                                | _SIDD_CMP_EQUAL_ORDERED
                                | _SIDD_MOST_SIGNIFICANT)(mmA, 3, mmB, 13);
    assert(lastMatch == 8);
    firstMatch = _mm_cmpestri!(_SIDD_UBYTE_OPS
                             | _SIDD_CMP_EQUAL_ORDERED)(mmA, -3, mmC, -12);
    assert(firstMatch == 16); // no substring found
}

/// Compare packed strings in `a` and `b` with lengths `la` and `lb` using
/// the control in `imm8`, and return the generated mask.
/// With `_SIDD_UNIT_MASK` the mask has one all-ones byte/word per match; otherwise
/// (`_SIDD_BIT_MASK`) the matches are packed as bits in the lowest 32-bit lane.
__m128i _mm_cmpestrm(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return cast(__m128i) __builtin_ia32_pcmpestrm128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return cast(__m128i) __builtin_ia32_pcmpestrm128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        __m128i mask = cmpstrMaskExplicit!imm8(a, la, b, lb);

        static if (imm8 & _SIDD_UNIT_MASK)
        {
            return mask;
        }
        else
        {
            // _SIDD_BIT_MASK
            static if (imm8 & 1)
            {
                // 16-bit characters: one byte per character before movemask.
                mask = _mm_packs_epi16(mask, _mm_setzero_si128());
            }
            return _mm_cvtsi32_si128( _mm_movemask_epi8(mask));
        }
    }
}
unittest
{
    char[16] A = "Hello world!";
    char[16] B = "aeiou!";
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);

    // Find which letters from B were found in A.
    byte16 R = cast(byte16)_mm_cmpestrm!(_SIDD_UBYTE_OPS
                                       | _SIDD_CMP_EQUAL_ANY
                                       | _SIDD_BIT_MASK)(mmA, -12, mmB, -6);
    // because 'e', 'o', and '!' were found (bits 1, 3 and 5)
    byte[16] correctR = [42, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(R.array == correctR);
    byte16 M = cast(byte16) _mm_cmpestrm!(_SIDD_UBYTE_OPS
                                        | _SIDD_CMP_EQUAL_ANY
                                        | _SIDD_UNIT_MASK)(mmA, 12, mmB, 6);
    byte[16] correctM = [0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(M.array == correctM);
}

/// Compare packed strings in `a` and `b` with lengths `la` and `lb` using
/// the control in `imm8`, and returns bit 0 of the resulting bit mask.
int _mm_cmpestro(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestrio128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestrio128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        // The lowest bit of the first lane is the result for character 0.
        int4 mask = cast(int4) cmpstrMaskExplicit!imm8(a, la, b, lb);
        return mask.array[0] & 1;
    }
}
unittest
{
    char[16] A = "Hallo world!";
    char[16] B = "aeiou!";
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);

    // Find which letters from B were found in A.
    int res = _mm_cmpestro!(_SIDD_UBYTE_OPS
                          | _SIDD_CMP_EQUAL_ANY
                          | _SIDD_BIT_MASK)(mmA, 12, mmB, -6);
    // because 'a' was found in "Hallo world!"
    assert(res == 1);
}

/// Returns 1 if "any character in a was null", and 0 otherwise.
/// Warning: what they mean is it returns 1 if the given length `la` is < Count.
int _mm_cmpestrs(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestris128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestris128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        // Yes, this intrinsic is there for symmetrical reasons and probably useless.
        // The length is taken as a saturated absolute value (the Intrinsics Guide
        // doesn't tell this), so the result is 1 exactly when -Count < la < Count.
        // Testing both bounds avoids negating `la`, which wraps around for int.min.
        enum int Count = (imm8 & 1) ? 8 : 16;
        return (la > -Count) && (la < Count);
    }
}
unittest
{
    __m128i a;
    a = 0;
    assert(_mm_cmpestrs!_SIDD_UBYTE_OPS(a, 15, a, 8) == 1);
    assert(_mm_cmpestrs!_SIDD_UBYTE_OPS(a, 16, a, 8) == 0);
    assert(_mm_cmpestrs!_SIDD_UBYTE_OPS(a, -15, a, 8) == 1);
    assert(_mm_cmpestrs!_SIDD_UBYTE_OPS(a, -16, a, 8) == 0);

    // 16-bit characters: the threshold is 8
    assert(_mm_cmpestrs!_SIDD_UWORD_OPS(a, 7, a, 8) == 1);
    assert(_mm_cmpestrs!_SIDD_UWORD_OPS(a, 8, a, 8) == 0);
}

/// Returns 1 if "any character in b was null", and 0 otherwise.
/// Warning: what they mean is it returns 1 if the given length `lb` is < Count.
int _mm_cmpestrz(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestriz128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestriz128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        // Yes, this intrinsic is there for symmetrical reasons and probably useless.
        // The length is taken as a saturated absolute value (the Intrinsics Guide
        // doesn't tell this), so the result is 1 exactly when -Count < lb < Count.
        // Testing both bounds avoids negating `lb`, which wraps around for int.min.
        enum int Count = (imm8 & 1) ? 8 : 16;
        return (lb > -Count) && (lb < Count);
    }
}
unittest
{
    __m128i b;
    b = 0;
    // These used to call _mm_cmpestrs, leaving _mm_cmpestrz untested.
    assert(_mm_cmpestrz!_SIDD_UBYTE_OPS(b, 15, b, 15) == 1);
    assert(_mm_cmpestrz!_SIDD_UBYTE_OPS(b, 16, b, 16) == 0);
    assert(_mm_cmpestrz!_SIDD_UBYTE_OPS(b, -15, b, -15) == 1);
    assert(_mm_cmpestrz!_SIDD_UBYTE_OPS(b, -16, b, -16) == 0);

    // Only `lb` matters, `la` must be ignored.
    assert(_mm_cmpestrz!_SIDD_UBYTE_OPS(b, 16, b, 15) == 1);
    assert(_mm_cmpestrz!_SIDD_UBYTE_OPS(b, 15, b, 16) == 0);
}

/// Compare packed signed 64-bit integers in a and b for greater-than.
/// Each 64-bit lane of the result is all ones where `a > b`, and zero otherwise.
__m128i _mm_cmpgt_epi64 (__m128i a, __m128i b) pure @trusted
{
    long2 la = cast(long2)a;
    long2 lb = cast(long2)b;
    // PERF: with DMD, enabling this requires SSE4.2, hence D_AVX
    /*static if (SIMD_COMPARISON_MASKS_16B)
    {
        return cast(__m128i)(la > lb);
    }
    else*/
    static if (GDC_with_SSE42)
    {
        return cast(__m128i) __builtin_ia32_pcmpgtq(la, lb);
    }
    else version(LDC)
    {
        // LDC x86: Optimized since LDC 1.1.0 -O1
        //   arm64: Optimized since LDC 1.8.0 -O1
        // When SSE4.2 is disabled, this gives same sequence than below.
        static if (SIMD_COMPARISON_MASKS_16B)
            return cast(__m128i)(la > lb);
        else
            return cast(__m128i)( greaterMask!long2(la, lb));
    }
    else
    {
        // Scalar fallback, one lane at a time.
        long2 r;
        r.ptr[0] = (la.array[0] > lb.array[0]) ? 0xffffffff_ffffffff : 0;
        r.ptr[1] = (la.array[1] > lb.array[1]) ? 0xffffffff_ffffffff : 0;
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi64(-3,  2);
    __m128i B = _mm_setr_epi64(4,  -2);
    long[2] correct = [ 0, -1 ];
    // This used to call _mm_cmpgt_epi32, which happens to give the same lanes for these inputs.
    long2 R = cast(long2)(_mm_cmpgt_epi64(A, B));
    assert(R.array == correct);

    // Inputs for which a 32-bit comparison would give a different answer:
    // per 32-bit lane, (1, 0) > (0, 0) yields 0x00000000_ffffffff instead of -1.
    __m128i C = _mm_setr_epi64(1, 0);
    __m128i D = _mm_setr_epi64(0, 1);
    long[2] correct2 = [ -1, 0 ];
    long2 R2 = cast(long2)(_mm_cmpgt_epi64(C, D));
    assert(R2.array == correct2);
}

/// Compare packed strings with implicit lengths in `a` and `b` using the control in `imm8`,
/// and returns 1 if `b` did not contain a null character and the resulting mask was zero,
/// and 0 otherwise.
int _mm_cmpistra(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
    {
        return cast(int) __builtin_ia32_pcmpistria128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistria128(cast(byte16)a, cast(byte16)b, imm8);
    }
    else
    {
        // Measure both strings with the helper matching the character width,
        // then defer to the explicit-length intrinsic.
        static if (imm8 & 1)
            alias measure = findLengthShort;
        else
            alias measure = findLengthByte;

        int la = measure(a);
        int lb = measure(b);
        return _mm_cmpestra!imm8(a, la, b, lb);
    }
}
unittest
{
    char[16] A = "Maximum\x00one";
    char[16] B = "Maximum\x00four";
    char[16] C = "Mbximum\x00length!";
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);
    __m128i mmC = _mm_loadu_si128(cast(__m128i*)C.ptr);

    // string matching a-la strcmp, for 16-bytes of data
    // Use _SIDD_NEGATIVE_POLARITY since mask must be null, and all match must be one
    enum int maskedEqual = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_MASKED_NEGATIVE_POLARITY;
    assert(0 == _mm_cmpistra!maskedEqual(mmA, mmB)); // match, but b is too short

    enum int plainEqual = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY;
    assert(0 == _mm_cmpistra!plainEqual(mmA, mmC)); // do not match
}

/// Compare packed strings with implicit lengths in `a` and `b` using the control in `imm8`,
/// and returns 1 if the resulting mask was non-zero, and 0 otherwise.
int _mm_cmpistrc(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
    {
        return cast(int) __builtin_ia32_pcmpistric128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return cast(int) __builtin_ia32_pcmpistric128(cast(byte16)a, cast(byte16)b, imm8);
    }
    else
    {
        // Measure both strings with the helper matching the character width,
        // then defer to the explicit-length intrinsic.
        static if (imm8 & 1)
            alias measure = findLengthShort;
        else
            alias measure = findLengthByte;

        int la = measure(a);
        int lb = measure(b);
        return _mm_cmpestrc!imm8(a, la, b, lb);
    }
}
unittest
{
    // Compare two shorter strings
    {
        char[16] A = "Hello";
        char[16] B = "Hello moon";
        __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
        __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);

        // match gives 0 like strcmp
        enum int strcmpLike = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY;
        assert(0 == _mm_cmpistrc!strcmpLike(mmA, mmA));
        assert(1 == _mm_cmpistrc!strcmpLike(mmA, mmB));
    }
}

/// Compare packed strings with implicit lengths in `a` and `b` using the control in `imm8`
/// and return the generated index.
/// Note: if the mask is all zeroes, the returned index is always `Count`
/// (8 or 16 depending on size).
int _mm_cmpistri(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistri128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistri128(cast(byte16)a, cast(byte16)b, imm8);
    }
    else
    {
        // Measure both strings with the helper matching the character width,
        // then defer to the explicit-length intrinsic.
        static if (imm8 & 1)
            alias measure = findLengthShort;
        else
            alias measure = findLengthByte;

        int la = measure(a);
        int lb = measure(b);
        return _mm_cmpestri!imm8(a, la, b, lb);
    }
}
unittest
{
    // Identify the last character that isn't an identifier character.
    //                   v (at index 7)
    char[16] A = "my_i(en)ifie";
    char[16] identRanges = "__azAz09";
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmI = _mm_loadu_si128(cast(__m128i*)identRanges.ptr);

    enum int notInRanges = _SIDD_UBYTE_OPS | _SIDD_CMP_RANGES | _SIDD_MASKED_NEGATIVE_POLARITY;

    byte16 mask = cast(byte16)_mm_cmpistrm!(notInRanges | _SIDD_UNIT_MASK)(mmI, mmA);
    byte[16] correctM = [0, 0, 0, 0, -1, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(mask.array == correctM);

    int index = _mm_cmpistri!(notInRanges | _SIDD_MOST_SIGNIFICANT)(mmI, mmA);
    assert(index == 7); // ')' is the last char not to be in [__azAz09]
}

/// Compare packed strings with implicit lengths in `a` and `b` using the control in
/// `imm8`, and return the generated mask.
__m128i _mm_cmpistrm(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
    {
        return cast(__m128i) __builtin_ia32_pcmpistrm128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return cast(__m128i) __builtin_ia32_pcmpistrm128(cast(byte16)a, cast(byte16)b, imm8);
    }
    else
    {
        // Measure both strings with the helper matching the character width,
        // then defer to the explicit-length intrinsic.
        static if (imm8 & 1)
            alias measure = findLengthShort;
        else
            alias measure = findLengthByte;

        int la = measure(a);
        int lb = measure(b);
        return _mm_cmpestrm!imm8(a, la, b, lb);
    }
}
unittest
{
    char[16] A = "Hello world!";
    char[16] B = "aeiou!";
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);

    enum int anyOf = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY;

    // Find which letters from B were found in A.
    byte16 R = cast(byte16)_mm_cmpistrm!(anyOf | _SIDD_BIT_MASK)(mmA, mmB);
    // because 'e', 'o', and '!' were found
    byte[16] correctR = [42, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(R.array == correctR);

    byte16 M = cast(byte16) _mm_cmpistrm!(anyOf | _SIDD_UNIT_MASK)(mmA, mmB);
    byte[16] correctM = [0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(M.array == correctM);
}

/// Compare packed strings with implicit lengths in `a` and `b` using
/// the control in `imm8`, and returns bit 0 of the resulting bit mask.
int _mm_cmpistro(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistrio128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistrio128(cast(byte16)a, cast(byte16)b, imm8);
    }
    else
    {
        // Measure both strings with the helper matching the character width,
        // then defer to the explicit-length intrinsic.
        static if (imm8 & 1)
            alias measure = findLengthShort;
        else
            alias measure = findLengthByte;

        int la = measure(a);
        int lb = measure(b);
        return _mm_cmpestro!imm8(a, la, b, lb);
    }
}
unittest
{
    char[16] A = "Hallo world!";
    char[16] B = "aeiou!";
    char[16] C = "Z";
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);
    __m128i mmC = _mm_loadu_si128(cast(__m128i*)C.ptr);

    enum int anyOfBits = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK;

    // Find which letters from B were found in A.
    int res = _mm_cmpistro!anyOfBits(mmA, mmB);
    // because 'a' was found in "Hallo world!"
    assert(res == 1);

    res = _mm_cmpistro!anyOfBits(mmA, mmC);
    assert(res == 0); // because 'Z' wasn't found in A
}

/// Returns 1 if any character in `a` was null, and 0 otherwise.
int _mm_cmpistrs(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistris128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistris128(cast(byte16)a, cast(byte16)b, imm8);
    }
    else
    {
        // `a` holds a null character exactly when its measured length is
        // not the full vector (8 words or 16 bytes).
        enum int Count = (imm8 & 1) ? 8 : 16;
        static if (imm8 & 1)
            const int la = findLengthShort(a);
        else
            const int la = findLengthByte(a);
        return (la == Count) ? 0 : 1;
    }
}
unittest
{
    char[16] A = "";
    char[16] B = "hello";
    char[16] C = "Maximum length!!";
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);
    __m128i mmC = _mm_loadu_si128(cast(__m128i*)C.ptr);
    assert(1 == _mm_cmpistrs!_SIDD_UBYTE_OPS(mmA, mmA)); // empty string
    assert(1 == _mm_cmpistrs!_SIDD_SBYTE_OPS(mmB, mmB)); // terminated after 5 chars
    assert(0 == _mm_cmpistrs!_SIDD_UWORD_OPS(mmC, mmC)); // no terminator at all
}

/// Returns 1 if any character in `b` was null, and 0 otherwise.
int _mm_cmpistrz(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistriz128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistriz128(cast(byte16)a, cast(byte16)b, imm8);
    }
    else
    {
        // `b` holds a null character exactly when its measured length is
        // not the full vector (8 words or 16 bytes).
        enum int Count = (imm8 & 1) ? 8 : 16;
        static if (imm8 & 1)
            const int lb = findLengthShort(b);
        else
            const int lb = findLengthByte(b);
        return (lb == Count) ? 0 : 1;
    }
}
unittest
{
    char[16] A = "";
    char[16] B = "hello";
    char[16] C = "Maximum length!!";
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);
    __m128i mmC = _mm_loadu_si128(cast(__m128i*)C.ptr);
    assert(1 == _mm_cmpistrz!_SIDD_UBYTE_OPS(mmC, mmA)); // b is empty
    assert(1 == _mm_cmpistrz!_SIDD_SBYTE_OPS(mmC, mmB)); // b is terminated after 5 chars
    assert(0 == _mm_cmpistrz!_SIDD_UWORD_OPS(mmA, mmC)); // b has no terminator
}


/// Starting with the initial value in `crc`, accumulates a CRC32 value
/// for unsigned 16-bit integer `v`.
/// Warning: this is computing CRC-32C (Castagnoli), not CRC-32.
uint _mm_crc32_u16 (uint crc, ushort v) @safe
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_crc32hi(crc, v);
    }
    else static if (LDC_with_CRC32)
    {
        return __builtin_ia32_crc32hi(crc, v);
    }
    else static if (LDC_with_ARM64_CRC)
    {
        return __crc32ch(crc, v);
    }
    else
    {
        // Software path: feed the low byte first, then the high byte.
        const uint afterLowByte = _mm_crc32_u8(crc, v & 0xff);
        return _mm_crc32_u8(afterLowByte, v >> 8);
    }
}
unittest
{
    assert(_mm_crc32_u16(0x12345678, 0x4512) == 0x39c3f0ff);
    assert(_mm_crc32_u16(0x76543210, 0xf50f) == 0xcffbcf07);
    assert(_mm_crc32_u16(0xDEADBEEF, 0x0017) == 0xc7e3fe85);
}

/// Starting with the initial value in `crc`, accumulates a CRC32 value
/// for unsigned 32-bit integer `v`.
/// Warning: this is computing CRC-32C (Castagnoli), not CRC-32.
uint _mm_crc32_u32 (uint crc, uint v) @safe
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_crc32si(crc, v);
    }
    else static if (LDC_with_CRC32)
    {
        return __builtin_ia32_crc32si(crc, v);
    }
    else static if (LDC_with_ARM64_CRC)
    {
        return __crc32cw(crc, v);
    }
    else
    {
        // Software path: feed the four bytes, least significant first.
        for (int shift = 0; shift < 32; shift += 8)
            crc = _mm_crc32_u8(crc, (v >> shift) & 0xff);
        return crc;
    }
}
unittest
{
    assert(_mm_crc32_u32(0x12345678, 0x45123563) == 0x22a6ec54);
    assert(_mm_crc32_u32(0x76543210, 0xf50f9993) == 0x7019a6cf);
    assert(_mm_crc32_u32(0xDEADBEEF, 0x00170017) == 0xbc552c27);
}

/// Starting with the initial value in `crc`, accumulates a CRC32
/// value for unsigned 64-bit integer `v`.
/// Warning: this is computing CRC-32C (Castagnoli), not CRC-32.
ulong _mm_crc32_u64 (ulong crc, ulong v)
{
    // The 64-bit x86 builtin is only used when targeting x86_64.
    version(X86_64)
        enum bool hasX86Intrin = GDC_with_SSE42 || LDC_with_CRC32;
    else
        enum bool hasX86Intrin = false; // intrinsics not available in 32-bit

    static if (hasX86Intrin)
    {
        return __builtin_ia32_crc32di(crc, v);
    }
    else static if (LDC_with_ARM64_CRC)
    {
        return __crc32cd(cast(uint)crc, v);
    }
    else
    {
        // Software path: only the low 32 bits of `crc` take part, and the
        // eight bytes of `v` are fed least significant first.
        uint acc = cast(uint)crc;
        for (int shift = 0; shift < 64; shift += 8)
            acc = _mm_crc32_u8(acc, (v >> shift) & 0xff);
        return acc;
    }
}
unittest
{
    assert(_mm_crc32_u64(0x1234567812345678, 0x39C3F0FFCFFBCF07) == 0xd66b1074);
    assert(_mm_crc32_u64(0x7654321001234567, 0xFACEFEED) == 0xac12f9c6);
    assert(_mm_crc32_u64(0xDEADBEEFCAFEBABE, 0x0017C7E3FE850017) == 0xa2d13dd8);
}

/// Starting with the initial value in `crc`, accumulates a CRC32 value
/// for unsigned 8-bit integer `v`.
/// Warning: this is computing CRC-32C (Castagnoli), not CRC-32.
uint _mm_crc32_u8 (uint crc, ubyte v) @safe
{
    // GDC and LDC both reach the x86 instruction through the same builtin.
    static if (GDC_with_SSE42 || LDC_with_CRC32)
    {
        return __builtin_ia32_crc32qi(crc, v);
    }
    else static if (LDC_with_ARM64_CRC)
    {
        return __crc32cb(crc, v);
    }
    else
    {
        // Software fallback: one table lookup per input byte.
        uint slot = (crc ^ v) & 0xFF;
        return (crc >> 8) ^ CRC32cTable[slot];
    }
}
unittest
{
    uint r0 = _mm_crc32_u8(0x12345678, 0x45);
    assert(r0 == 0x8fd93134);

    uint r1 = _mm_crc32_u8(0x76543210, 0xf5);
    assert(r1 == 0xd6b7e834);

    uint r2 = _mm_crc32_u8(0xDEADBEEF, 0x00);
    assert(r2 == 0xbdfd3980);
}


// Utilities for this file

private:

// Compile-time decision: is the CRC-32C lookup table emitted in this build?
static if (GDC_with_SSE42 || LDC_with_CRC32)
{
    // x86 with hardware CRC32: the table is only emitted for non-x86_64 targets.
    version(X86_64)
        enum bool NeedCRC32CTable = false;
    else
        enum bool NeedCRC32CTable = true;
}
else
{
    // Otherwise the table is needed unless the ARM64 CRC instructions are used.
    enum bool NeedCRC32CTable = !LDC_with_ARM64_CRC;
}

static if (NeedCRC32CTable)
{
    // Byte-wise lookup table used by the software CRC-32C fallback.
    static immutable uint[256] CRC32cTable =
    [
        0x0, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, 0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb,
        0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b, 0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24,
        0x105ec76f, 0xe235446c, 0xf165b798, 0x30e349b, 0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384,
        0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54, 0x5d1d08bf, 0xaf768bbc, 0xbc267848, 0x4e4dfb4b,
        0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a, 0xe72719c1, 0x154c9ac2, 0x61c6936, 0xf477ea35,
        0xaa64d611, 0x580f5512, 0x4b5fa6e6, 0xb93425e5, 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa,
        0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45, 0xf779deae, 0x5125dad, 0x1642ae59, 0xe4292d5a,
        0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a, 0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595,
        0x417b1dbc, 0xb3109ebf, 0xa0406d4b, 0x522bee48, 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957,
        0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687, 0xc38d26c, 0xfe53516f, 0xed03a29b, 0x1f682198,
        0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927, 0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38,
        0xdbfc821c, 0x2997011f, 0x3ac7f2eb, 0xc8ac71e8, 0x1c661503, 0xee0d9600, 0xfd5d65f4, 0xf36e6f7,
        0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096, 0xa65c047d, 0x5437877e, 0x4767748a, 0xb50cf789,
        0xeb1fcbad, 0x197448ae, 0xa24bb5a, 0xf84f3859, 0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46,
        0x7198540d, 0x83f3d70e, 0x90a324fa, 0x62c8a7f9, 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6,
        0xfb410cc2, 0x92a8fc1, 0x1a7a7c35, 0xe811ff36, 0x3cdb9bdd, 0xceb018de, 0xdde0eb2a, 0x2f8b6829,
        0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c, 0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93,
        0x82f63b7, 0xfa44e0b4, 0xe9141340, 0x1b7f9043, 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c,
        0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3, 0x55326b08, 0xa759e80b, 0xb4091bff, 0x466298fc,
        0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0xb21572c, 0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033,
        0xa24bb5a6, 0x502036a5, 0x4370c551, 0xb11b4652, 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d,
        0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d, 0xef087a76, 0x1d63f975, 0xe330a81, 0xfc588982,
        0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d, 0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622,
        0x38cc2a06, 0xcaa7a905, 0xd9f75af1, 0x2b9cd9f2, 0xff56bd19, 0xd3d3e1a, 0x1e6dcdee, 0xec064eed,
        0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530, 0x417b1db, 0xf67c32d8, 0xe52cc12c, 0x1747422f,
        0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff, 0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0,
        0xd3d3e1ab, 0x21b862a8, 0x32e8915c, 0xc083125f, 0x144976b4, 0xe622f5b7, 0xf5720643, 0x7198540,
        0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90, 0x9e902e7b, 0x6cfbad78, 0x7fab5e8c, 0x8dc0dd8f,
        0xe330a81a, 0x115b2b19, 0x20bd8ed, 0xf0605bee, 0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1,
        0x69e9f0d5, 0x9b8273d6, 0x88d28022, 0x7ab90321, 0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e,
        0xf36e6f75, 0x105ec76, 0x12551f82, 0xe03e9c81, 0x34f4f86a, 0xc69f7b69, 0xd5cf889d, 0x27a40b9e,
        0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e, 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351,
    ];
}

// Index of the first zero byte in `a`, or 16 when no byte is zero.
int findLengthByte(__m128i a) pure @safe
{
    // One bit per byte lane, set where that lane compares equal to zero.
    int zeroBits = _mm_movemask_epi8(_mm_cmpeq_epi8(a, _mm_setzero_si128()));
    return (zeroBits != 0) ? bsf(zeroBits) : 16;
}
unittest
{
    char[16] A = "Hel!o";
    char[16] B = "Maximum length!!";
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    assert(findLengthByte(mmA) == 5);
    __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);
    assert(findLengthByte(mmB) == 16);
}

// Index of the first zero 16-bit word in `a`, or 8 when no word is zero.
int findLengthShort(__m128i a) pure @safe
{
    // The byte movemask yields two bits per word lane, so the position of
    // the lowest set bit is halved to obtain a word index.
    int zeroBits = _mm_movemask_epi8(_mm_cmpeq_epi16(a, _mm_setzero_si128()));
    return (zeroBits != 0) ? (bsf(zeroBits) >> 1) : 8;
}
unittest
{
    short[8] A = [10, 5423, 475, 0, 1, 1, 1, 1 ];
    short[8] B = [-1, -2, -3, 4, 5, 6, -32768, 1];
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    assert(findLengthShort(mmA) == 3);
    __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);
    assert(findLengthShort(mmB) == 8);
}

// 16 bytes of -1 followed by 16 bytes of 0. The validity masks below are
// 16-byte windows read from this table.
static immutable byte[32] MASK_DATA =
[
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
];

// Makes a byte validity mask with a given explicit length string.
__m128i validMask8e(int len) @trusted
{
    // Starting `len` bytes before the -1/0 boundary gives `len` leading -1 bytes.
    immutable(byte)* window = &MASK_DATA[16-len];
    return _mm_loadu_si128(cast(__m128i*) window);
}
unittest
{
    char[16] A = "";
    char[16] B = "0123456789abcdef";
    byte[16] correctA = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    byte[16] correctB = [-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1];
    byte16 MA = cast(byte16) validMask8e(0);
    assert(MA.array == correctA);
    byte16 MB = cast(byte16) validMask8e(16);
    assert(MB.array == correctB);
}

// Makes a short validity mask with a given explicit length string.
__m128i validMask16e(int len) @trusted
{
    // Each 16-bit character covers two mask bytes, hence the doubled offset.
    immutable(byte)* window = &MASK_DATA[16-len*2];
    return _mm_loadu_si128(cast(__m128i*) window);
}
unittest
{
    short[8] A = [3, 4, 5, 0, 3, 4, 5, 6];
    short[8] correctA = [-1, -1, -1, 0, 0, 0, 0, 0];
    short8 MA = cast(short8) validMask16e(3);
    assert(MA.array == correctA);
}

// Internal implementation for non-SSE4.2
// Compare 8-bit or 16-bit strings, get a mask.
// `aValid` and `bValid` are byte-mask or word-mask of the valid
// zone in `a` and `b`.
// `la` and `lb` are explicit lengths in characters. They are passed by
// reference and overwritten with their saturated absolute value, so the
// caller observes the length that was actually used.
__m128i cmpstrMaskExplicit(int imm8)(__m128i a, 
                                     ref int la, 
                                     __m128i b, 
                                     ref int lb) @safe
{
    // Saturates lengths (the Intrinsics Guide doesn't tell this).
    // A vector holds 16 byte characters or 8 word characters; a longer
    // length would make the validity-mask lookup read before MASK_DATA.
    enum int maxLen = (imm8 & 1) ? 8 : 16;

    if (la < 0) la = -la;
    if (lb < 0) lb = -lb;

    // A length that is still negative here was int.min, whose negation
    // overflows; treat it as "too long" like any other large magnitude.
    if (la < 0 || la > maxLen) la = maxLen;
    if (lb < 0 || lb > maxLen) lb = maxLen;

    static if (imm8 & 1)
    {
        __m128i aValid = validMask16e(la);
        __m128i bValid = validMask16e(lb);
    }
    else
    {
        __m128i aValid = validMask8e(la);
        __m128i bValid = validMask8e(lb);
    }
    return cmpstrMask!imm8(a, aValid, b, bValid);
}

// Core string comparison for the non-SSE4.2 path.
//
// `imm8` is decoded at compile time:
//   bit 0      characters are 16-bit (else 8-bit)
//   bit 1      characters are signed (only consulted in ranges mode)
//   bits [3:2] comparison mode: 0 equal any, 1 ranges, 2 equal each,
//              3 equal ordered
//   _SIDD_NEGATIVE_POLARITY / _SIDD_MASKED_POSITIVE_POLARITY bits select
//              the optional negation of the result.
//
// `aValid` and `bValid` are byte-masks or word-masks of the valid zone in
// `a` and `b`. The result is a vector mask with one lane per character
// position.
__m128i cmpstrMask(int imm8)(__m128i a, 
                             __m128i aValid, 
                             __m128i b, 
                             const __m128i bValid) @safe
{
    enum bool chars16Bits = imm8 & 1;
    enum int Mode = (imm8 >> 2) & 3;

    static if (Mode == 0) // equal any
    {
        // Every character of `b` is compared against every valid character
        // of `a`, by rotating `a` through all of its lane positions.
        __m128i R = _mm_setzero_si128();
        static if (chars16Bits) // 64 comparisons
        {
            for (int k = 0; k < 8; ++k)
            {
                __m128i eqMask = _mm_cmpeq_epi16(a, b);
                eqMask = _mm_and_si128(eqMask, aValid);
                R = _mm_or_si128(R, eqMask);

                // rotate a and aValid
                a = _mm_or_si128(_mm_srli_si128!2(a), _mm_slli_si128!14(a));
                aValid = _mm_or_si128(_mm_srli_si128!2(aValid), _mm_slli_si128!14(aValid));
            }
        }
        else // 256 comparisons
        {
            for (int k = 0; k < 16; ++k)
            {
                __m128i eqMask = _mm_cmpeq_epi8(a, b);
                eqMask = _mm_and_si128(eqMask, aValid);
                R = _mm_or_si128(R, eqMask);

                // rotate a and aValid
                a = _mm_or_si128(_mm_srli_si128!1(a), _mm_slli_si128!15(a));
                aValid = _mm_or_si128(_mm_srli_si128!1(aValid), _mm_slli_si128!15(aValid));
            }
        }
        // invalid b part never matches
        R = _mm_and_si128(R, bValid);
    }
    else static if (Mode == 1) // ranges
    {
        enum bool signed = (imm8 & 2) != 0;

        // For each character in b, the returned mask says if it was found in a range-pair in `a`.
        __m128i R = _mm_setzero_si128();
        static if (chars16Bits)
        {
            for (int pos = 0; pos < 8; pos += 2)
            {
                short min = (cast(short8)a).array[pos];
                short max = (cast(short8)a).array[pos+1];
                static if (signed)
                {
                    __m128i ge = ~_mm_cmplt_epi16(b, _mm_set1_epi16(min));
                    __m128i le = ~_mm_cmpgt_epi16(b, _mm_set1_epi16(max));
                }
                else
                {
                    // No SSE way to do 16-bit unsigned comparisons,
                    // but flipping the sign bit lets us use signed comparisons.
                    __m128i firstBits = _mm_set1_epi16(-32768);
                    __m128i reverseB = _mm_xor_si128(b, firstBits);
                    __m128i reverseMin = _mm_xor_si128(_mm_set1_epi16(min), firstBits);
                    __m128i reverseMax = _mm_xor_si128(_mm_set1_epi16(max), firstBits);
                    __m128i ge = ~_mm_cmplt_epi16(reverseB, reverseMin);
                    __m128i le = ~_mm_cmpgt_epi16(reverseB, reverseMax);
                }
                __m128i inRange = _mm_and_si128(le, ge);

                // A pair is ignored when its upper bound is outside the valid part of `a`.
                short aValidHere = (cast(short8)aValid).array[pos+1];
                __m128i mmAValidHere = _mm_set1_epi16(aValidHere);
                inRange = _mm_and_si128(inRange, mmAValidHere);

                R = _mm_or_si128(R, inRange);
            }
        }
        else // 8-bits
        {
            for (int pos = 0; pos < 16; pos += 2)
            {
                byte min = (cast(byte16)a).array[pos];
                byte max = (cast(byte16)a).array[pos+1];
                static if (signed)
                {
                    // ">=" is "not <" and "<=" is "not >".
                    __m128i ge = ~_mm_cmplt_epi8(b, _mm_set1_epi8(min));
                    __m128i le = ~_mm_cmpgt_epi8(b, _mm_set1_epi8(max));
                }
                else
                {
                    // No SSE way to do 8-bit unsigned comparisons,
                    // but flipping the sign bit lets us use signed comparisons.
                    __m128i firstBits = _mm_set1_epi8(-128);
                    __m128i reverseB = _mm_xor_si128(b, firstBits);
                    __m128i reverseMin = _mm_xor_si128(_mm_set1_epi8(min), firstBits);
                    __m128i reverseMax = _mm_xor_si128(_mm_set1_epi8(max), firstBits);
                    __m128i ge = ~_mm_cmplt_epi8(reverseB, reverseMin);
                    __m128i le = ~_mm_cmpgt_epi8(reverseB, reverseMax);
                }
                __m128i inRange = _mm_and_si128(le, ge);

                // A pair is ignored when its upper bound is outside the valid part of `a`.
                byte aValidHere = (cast(byte16)aValid).array[pos+1];
                __m128i mmAValidHere = _mm_set1_epi8(aValidHere);
                inRange = _mm_and_si128(inRange, mmAValidHere);

                R = _mm_or_si128(R, inRange);
            }
        }
        // invalid b part is not in range
        R = _mm_and_si128(R, bValid);
    }
    else static if (Mode == 2) // equal each, just 16 comparisons not 256
    {
        static if (chars16Bits)
        {
            __m128i R = _mm_cmpeq_epi16(a, b);
        }
        else
        {
            __m128i R = _mm_cmpeq_epi8(a, b);
        }

        // if only a or b is invalid, consider not equal
        R = _mm_andnot_si128(_mm_xor_si128(aValid, bValid), R);

        // if a and b are both invalid, consider equal
        R = _mm_or_si128(R, ~_mm_or_si128(aValid, bValid));
    }
    else static if (Mode == 3) // equal ordered
    {
        // a is searched in b.
        // Lane i of the result stays set only if every valid character
        // a[pos] equals b[i + pos]; b is shifted down by one character per
        // iteration so that lane i always faces b[i + pos].

        __m128i bValidShift = bValid;

        __m128i R = _mm_set1_epi32(-1); // all b positions possible for containing a
        static if (chars16Bits)
        {
            for (int pos = 0; pos < 8; ++pos)
            {
                // compare character k of a, where can it go in b?
                short charK = (cast(short8)a).array[pos];
                __m128i mmcharK = _mm_set1_epi16(charK);

                short aValidHere = (cast(short8)aValid).array[pos];
                __m128i mmAValidHere = _mm_set1_epi16(aValidHere);
                __m128i mmAInvalidHere = _mm_xor_si128(mmAValidHere, _mm_set1_epi32(-1));
                __m128i eqMask = _mm_cmpeq_epi16(mmcharK, b);

                // Where A is invalid, the comparison always holds "equal"
                eqMask = _mm_or_si128(eqMask, mmAInvalidHere);

                // Where B is invalid, and A is valid, the comparison is forced to false
                eqMask = _mm_and_si128(eqMask, _mm_or_si128(bValidShift, mmAInvalidHere));

                // Accumulate: a start position survives only if every character matched so far.
                R = _mm_and_si128(R, eqMask);

                // drop first char of b
                b = _mm_srli_si128!2(b);
                bValidShift = _mm_srli_si128!2(bValidShift);
            }
        }
        else
        {
            for (int pos = 0; pos < 16; ++pos)
            {
                // compare character k of a, where can it go in b?
                byte charK = (cast(byte16)a).array[pos];
                __m128i mmcharK = _mm_set1_epi8(charK);

                byte aValidHere = (cast(byte16)aValid).array[pos];
                __m128i mmAValidHere = _mm_set1_epi8(aValidHere);
                __m128i mmAInvalidHere = _mm_xor_si128(mmAValidHere, _mm_set1_epi32(-1));
                __m128i eqMask = _mm_cmpeq_epi8(mmcharK, b);

                // Where A is invalid, the comparison always holds "equal"
                eqMask = _mm_or_si128(eqMask, mmAInvalidHere);

                // Where B is invalid, and A is valid, the comparison is forced to false
                eqMask = _mm_and_si128(eqMask, _mm_or_si128(bValidShift, mmAInvalidHere));

                // Accumulate: a start position survives only if every character matched so far.
                R = _mm_and_si128(R, eqMask);

                // drop first char of b
                b = _mm_srli_si128!1(b);
                bValidShift = _mm_srli_si128!1(bValidShift);
            }
        }
    }
    else
        static assert(0);

    // Optionally negate result
    static if (imm8 & _SIDD_NEGATIVE_POLARITY)
    {
        static if (imm8 & _SIDD_MASKED_POSITIVE_POLARITY)
        {
            R = _mm_xor_si128(R, bValid); // only negate valid b
        }
        else
        {
            R = _mm_xor_si128(R, _mm_set1_epi32(-1)); // negate all
        }
    }
    return R;
}