1 /** 2 * SSE2 intrinsics. 3 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE2 4 * 5 * Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019. 6 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) 7 */ 8 module inteli.emmintrin; 9 10 public import inteli.types; 11 public import inteli.xmmintrin; // SSE2 includes SSE1 12 import inteli.mmx; 13 import inteli.internals; 14 15 nothrow @nogc: 16 17 18 // SSE2 instructions 19 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE2 20 21 /// Add packed 16-bit integers in `a` and `b`. 22 __m128i _mm_add_epi16 (__m128i a, __m128i b) pure @safe 23 { 24 pragma(inline, true); 25 return cast(__m128i)(cast(short8)a + cast(short8)b); 26 } 27 unittest 28 { 29 __m128i A = _mm_setr_epi16(4, 8, 13, -7, -1, 0, 9, 77); 30 short8 R = cast(short8) _mm_add_epi16(A, A); 31 short[8] correct = [8, 16, 26, -14, -2, 0, 18, 154]; 32 assert(R.array == correct); 33 } 34 35 /// Add packed 32-bit integers in `a` and `b`. 36 __m128i _mm_add_epi32 (__m128i a, __m128i b) pure @safe 37 { 38 pragma(inline, true); 39 return cast(__m128i)(cast(int4)a + cast(int4)b); 40 } 41 unittest 42 { 43 __m128i A = _mm_setr_epi32( -7, -1, 0, 9); 44 int4 R = _mm_add_epi32(A, A); 45 int[4] correct = [ -14, -2, 0, 18 ]; 46 assert(R.array == correct); 47 } 48 49 /// Add packed 64-bit integers in `a` and `b`. 50 __m128i _mm_add_epi64 (__m128i a, __m128i b) pure @safe 51 { 52 pragma(inline, true); 53 return cast(__m128i)(cast(long2)a + cast(long2)b); 54 } 55 unittest 56 { 57 __m128i A = _mm_setr_epi64(-1, 0x8000_0000_0000_0000); 58 long2 R = cast(long2) _mm_add_epi64(A, A); 59 long[2] correct = [ -2, 0 ]; 60 assert(R.array == correct); 61 } 62 63 /// Add packed 8-bit integers in `a` and `b`. 64 __m128i _mm_add_epi8 (__m128i a, __m128i b) pure @safe 65 { 66 pragma(inline, true); 67 return cast(__m128i)(cast(byte16)a + cast(byte16)b); 68 } 69 unittest 70 { 71 __m128i A = _mm_setr_epi8(4, 8, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -1, 0, 9, 78); 72 byte16 R = cast(byte16) _mm_add_epi8(A, A); 73 byte[16] correct = [8, 16, 26, -14, -2, 0, 18, -102, 8, 16, 26, -14, -2, 0, 18, -100]; 74 assert(R.array == correct); 75 } 76 77 /// Add the lower double-precision (64-bit) floating-point element 78 /// in `a` and `b`, store the result in the lower element of dst, 79 /// and copy the upper element from `a` to the upper element of destination. 80 __m128d _mm_add_sd(__m128d a, __m128d b) pure @safe 81 { 82 static if (DMD_with_DSIMD) 83 { 84 return cast(__m128d) __simd(XMM.ADDSD, a, b); 85 } 86 else static if (GDC_with_SSE2) 87 { 88 return __builtin_ia32_addsd(a, b); 89 } 90 else version(DigitalMars) 91 { 92 // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599 93 // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again 94 asm pure nothrow @nogc @trusted { nop;} 95 a[0] = a[0] + b[0]; 96 return a; 97 } 98 else 99 { 100 a[0] += b[0]; 101 return a; 102 } 103 } 104 unittest 105 { 106 __m128d a = [1.5, -2.0]; 107 a = _mm_add_sd(a, a); 108 assert(a.array == [3.0, -2.0]); 109 } 110 111 /// Add packed double-precision (64-bit) floating-point elements in `a` and `b`. 112 __m128d _mm_add_pd (__m128d a, __m128d b) pure @safe 113 { 114 pragma(inline, true); 115 return a + b; 116 } 117 unittest 118 { 119 __m128d a = [1.5, -2.0]; 120 a = _mm_add_pd(a, a); 121 assert(a.array == [3.0, -4.0]); 122 } 123 124 /// Add 64-bit integers `a` and `b`. 
125 __m64 _mm_add_si64 (__m64 a, __m64 b) pure @safe 126 { 127 // PERF DMD 128 pragma(inline, true); 129 return a + b; 130 } 131 132 /// Add packed 16-bit integers in `a` and `b` using signed saturation. 133 __m128i _mm_adds_epi16(__m128i a, __m128i b) pure @trusted 134 { 135 static if (DMD_with_DSIMD) 136 { 137 return cast(__m128i) __simd(XMM.PADDSW, a, b); 138 } 139 else static if (GDC_with_SSE2) 140 { 141 return cast(__m128i) __builtin_ia32_paddsw128(cast(short8)a, cast(short8)b); 142 } 143 else static if(LDC_with_saturated_intrinsics) 144 { 145 return cast(__m128i) inteli_llvm_adds!short8(cast(short8)a, cast(short8)b); 146 } 147 else 148 { 149 short[8] res; // PERF =void; 150 short8 sa = cast(short8)a; 151 short8 sb = cast(short8)b; 152 foreach(i; 0..8) 153 res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]); 154 return _mm_loadu_si128(cast(int4*)res.ptr); 155 } 156 } 157 unittest 158 { 159 short8 res = cast(short8) _mm_adds_epi16(_mm_setr_epi16( 7, 6, 5, -32768, 3, 3, 32767, 0), 160 _mm_setr_epi16( 7, 6, 5, -30000, 3, 1, 1, -10)); 161 static immutable short[8] correctResult = [14, 12, 10, -32768, 6, 4, 32767, -10]; 162 assert(res.array == correctResult); 163 } 164 165 /// Add packed 8-bit signed integers in `a` and `b` using signed saturation. 166 __m128i _mm_adds_epi8(__m128i a, __m128i b) pure @trusted 167 { 168 static if (DMD_with_DSIMD) 169 { 170 return cast(__m128i) __simd(XMM.PADDSB, a, b); 171 } 172 else static if (GDC_with_SSE2) 173 { 174 return cast(__m128i) __builtin_ia32_paddsb128(cast(ubyte16)a, cast(ubyte16)b); 175 } 176 else static if(LDC_with_saturated_intrinsics) 177 { 178 return cast(__m128i) inteli_llvm_adds!byte16(cast(byte16)a, cast(byte16)b); 179 } 180 else 181 { 182 byte[16] res; // PERF =void; 183 byte16 sa = cast(byte16)a; 184 byte16 sb = cast(byte16)b; 185 foreach(i; 0..16) 186 res[i] = saturateSignedWordToSignedByte(sa[i] + sb[i]); 187 return _mm_loadu_si128(cast(int4*)res.ptr); 188 } 189 } 190 unittest 191 { 192 byte16 res = cast(byte16) _mm_adds_epi8(_mm_set_epi8(15, 14, 13, 12, 11, 127, 9, 8, 7, 6, 5, -128, 3, 2, 1, 0), 193 _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, -4, 3, 2, 1, 0)); 194 static immutable byte[16] correctResult = [0, 2, 4, 6, -128, 10, 12, 14, 195 16, 18, 127, 22, 24, 26, 28, 30]; 196 assert(res.array == correctResult); 197 } 198 199 /// Add packed 8-bit unsigned integers in `a` and `b` using unsigned saturation. 
200 __m128i _mm_adds_epu8(__m128i a, __m128i b) pure @trusted 201 { 202 static if (DMD_with_DSIMD) 203 { 204 return cast(__m128i) __simd(XMM.PADDUSB, a, b); 205 } 206 else static if (GDC_with_SSE2) 207 { 208 return cast(__m128i) __builtin_ia32_paddusb128(cast(ubyte16)a, cast(ubyte16)b); 209 } 210 else static if(LDC_with_saturated_intrinsics) 211 { 212 return cast(__m128i) inteli_llvm_addus!byte16(cast(byte16)a, cast(byte16)b); 213 } 214 else 215 { 216 ubyte[16] res; // PERF =void; 217 byte16 sa = cast(byte16)a; 218 byte16 sb = cast(byte16)b; 219 foreach(i; 0..16) 220 res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i])); 221 return _mm_loadu_si128(cast(int4*)res.ptr); 222 } 223 } 224 unittest 225 { 226 byte16 res = cast(byte16) 227 _mm_adds_epu8(_mm_set_epi8(7, 6, 5, 4, 3, 2, cast(byte)255, 0, 7, 6, 5, 4, 3, 2, cast(byte)255, 0), 228 _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0)); 229 static immutable byte[16] correctResult = [0, cast(byte)255, 4, 6, 8, 10, 12, 14, 230 0, cast(byte)255, 4, 6, 8, 10, 12, 14]; 231 assert(res.array == correctResult); 232 } 233 234 /// Add packed unsigned 16-bit integers in `a` and `b` using unsigned saturation. 235 __m128i _mm_adds_epu16(__m128i a, __m128i b) pure @trusted 236 { 237 static if (DMD_with_DSIMD) 238 { 239 // Note: DMD generates a reverted paddusw vs LDC and GDC, but that doesn't change the result anyway 240 return cast(__m128i) __simd(XMM.PADDUSW, a, b); 241 } 242 else static if (GDC_with_SSE2) 243 { 244 return cast(__m128i) __builtin_ia32_paddusw128(cast(short8)a, cast(short8)b); 245 } 246 else static if(LDC_with_saturated_intrinsics) 247 { 248 return cast(__m128i) inteli_llvm_addus!short8(cast(short8)a, cast(short8)b); 249 } 250 else 251 { 252 ushort[8] res; // PERF =void; 253 short8 sa = cast(short8)a; 254 short8 sb = cast(short8)b; 255 foreach(i; 0..8) 256 res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i])); 257 return _mm_loadu_si128(cast(int4*)res.ptr); 258 } 259 } 260 unittest 261 { 262 short8 res = cast(short8) _mm_adds_epu16(_mm_set_epi16(3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0), 263 _mm_set_epi16(3, 2, 1, 0, 3, 2, 1, 0)); 264 static immutable short[8] correctResult = [0, cast(short)65535, 4, 6, 0, cast(short)65535, 4, 6]; 265 assert(res.array == correctResult); 266 } 267 268 /// Compute the bitwise AND of packed double-precision (64-bit) 269 /// floating-point elements in `a` and `b`. 270 __m128d _mm_and_pd (__m128d a, __m128d b) pure @safe 271 { 272 pragma(inline, true); 273 return cast(__m128d)( cast(long2)a & cast(long2)b ); 274 } 275 unittest 276 { 277 double a = 4.32; 278 double b = -78.99; 279 long correct = (*cast(long*)(&a)) & (*cast(long*)(&b)); 280 __m128d A = _mm_set_pd(a, b); 281 __m128d B = _mm_set_pd(b, a); 282 long2 R = cast(long2)( _mm_and_pd(A, B) ); 283 assert(R.array[0] == correct); 284 assert(R.array[1] == correct); 285 } 286 287 /// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`. 288 __m128i _mm_and_si128 (__m128i a, __m128i b) pure @safe 289 { 290 pragma(inline, true); 291 return a & b; 292 } 293 unittest 294 { 295 __m128i A = _mm_set1_epi32(7); 296 __m128i B = _mm_set1_epi32(14); 297 __m128i R = _mm_and_si128(A, B); 298 int[4] correct = [6, 6, 6, 6]; 299 assert(R.array == correct); 300 } 301 302 /// Compute the bitwise NOT of packed double-precision (64-bit) 303 /// floating-point elements in `a` and then AND with `b`. 
304 __m128d _mm_andnot_pd (__m128d a, __m128d b) pure @safe 305 { 306 static if (DMD_with_DSIMD) 307 { 308 return cast(__m128d) __simd(XMM.ANDNPD, a, b); 309 } 310 else 311 { 312 return cast(__m128d)( ~(cast(long2)a) & cast(long2)b); 313 } 314 } 315 unittest 316 { 317 double a = 4.32; 318 double b = -78.99; 319 long correct = (~*cast(long*)(&a)) & ( *cast(long*)(&b)); 320 long correct2 = ( *cast(long*)(&a)) & (~*cast(long*)(&b)); 321 __m128d A = _mm_setr_pd(a, b); 322 __m128d B = _mm_setr_pd(b, a); 323 long2 R = cast(long2)( _mm_andnot_pd(A, B) ); 324 assert(R.array[0] == correct); 325 assert(R.array[1] == correct2); 326 } 327 328 /// Compute the bitwise NOT of 128 bits (representing integer data) 329 /// in `a` and then AND with `b`. 330 __m128i _mm_andnot_si128 (__m128i a, __m128i b) pure @safe 331 { 332 static if (DMD_with_DSIMD) 333 { 334 return cast(__m128i) __simd(XMM.PANDN, a, b); 335 } 336 else 337 { 338 return (~a) & b; 339 } 340 } 341 unittest 342 { 343 __m128i A = _mm_setr_epi32(7, -2, 9, 54654); 344 __m128i B = _mm_setr_epi32(14, 78, 111, -256); 345 __m128i R = _mm_andnot_si128(A, B); 346 int[4] correct = [8, 0, 102, -54784]; 347 assert(R.array == correct); 348 } 349 350 /// Average packed unsigned 16-bit integers in `a` and `b`. 351 __m128i _mm_avg_epu16 (__m128i a, __m128i b) pure @trusted 352 { 353 static if (DMD_with_DSIMD) 354 { 355 return cast(__m128i) __simd(XMM.PAVGW, a, b); 356 } 357 else static if (GDC_with_SSE2) 358 { 359 return cast(__m128i) __builtin_ia32_pavgw128(cast(short8)a, cast(short8)b); 360 } 361 else static if (LDC_with_ARM64) 362 { 363 return cast(__m128i) vrhadd_u16(cast(short8)a, cast(short8)b); 364 } 365 else static if (LDC_with_SSE2 && __VERSION__ >= 2094) 366 { 367 // Exists since LDC 1.18 368 return cast(__m128i) __builtin_ia32_pavgw128(cast(short8)a, cast(short8)b); 369 } 370 else static if (LDC_with_optimizations) 371 { 372 // Generates pavgw even in LDC 1.0, even in -O0 373 // But not in ARM 374 enum ir = ` 375 %ia = zext <8 x i16> %0 to <8 x i32> 376 %ib = zext <8 x i16> %1 to <8 x i32> 377 %isum = add <8 x i32> %ia, %ib 378 %isum1 = add <8 x i32> %isum, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 379 %isums = lshr <8 x i32> %isum1, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 380 %r = trunc <8 x i32> %isums to <8 x i16> 381 ret <8 x i16> %r`; 382 return cast(__m128i) LDCInlineIR!(ir, short8, short8, short8)(cast(short8)a, cast(short8)b); 383 } 384 else 385 { 386 short8 sa = cast(short8)a; 387 short8 sb = cast(short8)b; 388 short8 sr = void; 389 foreach(i; 0..8) 390 { 391 sr.ptr[i] = cast(ushort)( (cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]) + 1) >> 1 ); 392 } 393 return cast(int4)sr; 394 } 395 } 396 unittest 397 { 398 __m128i A = _mm_set1_epi16(31); 399 __m128i B = _mm_set1_epi16(64); 400 short8 avg = cast(short8)(_mm_avg_epu16(A, B)); 401 foreach(i; 0..8) 402 assert(avg.array[i] == 48); 403 } 404 405 /// Average packed unsigned 8-bit integers in `a` and `b`. 
406 __m128i _mm_avg_epu8 (__m128i a, __m128i b) pure @trusted 407 { 408 static if (DMD_with_DSIMD) 409 { 410 return cast(__m128i) __simd(XMM.PAVGB, a, b); 411 } 412 else static if (GDC_with_SSE2) 413 { 414 return cast(__m128i) __builtin_ia32_pavgb128(cast(ubyte16)a, cast(ubyte16)b); 415 } 416 else static if (LDC_with_SSE2 && __VERSION__ >= 2094) 417 { 418 // Exists since LDC 1.18 419 return cast(__m128i) __builtin_ia32_pavgb128(cast(byte16)a, cast(byte16)b); 420 } 421 else static if (LDC_with_ARM64) 422 { 423 return cast(__m128i) vrhadd_u8(cast(byte16)a, cast(byte16)b); 424 } 425 else static if (LDC_with_optimizations) 426 { 427 // Generates pavgb even in LDC 1.0, even in -O0 428 // But not in ARM 429 enum ir = ` 430 %ia = zext <16 x i8> %0 to <16 x i16> 431 %ib = zext <16 x i8> %1 to <16 x i16> 432 %isum = add <16 x i16> %ia, %ib 433 %isum1 = add <16 x i16> %isum, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 434 %isums = lshr <16 x i16> %isum1, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 435 %r = trunc <16 x i16> %isums to <16 x i8> 436 ret <16 x i8> %r`; 437 return cast(__m128i) LDCInlineIR!(ir, byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b); 438 } 439 else 440 { 441 byte16 sa = cast(byte16)a; 442 byte16 sb = cast(byte16)b; 443 byte16 sr = void; 444 foreach(i; 0..16) 445 { 446 sr.ptr[i] = cast(ubyte)( (cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]) + 1) >> 1 ); 447 } 448 return cast(int4)sr; 449 } 450 } 451 unittest 452 { 453 __m128i A = _mm_set1_epi8(31); 454 __m128i B = _mm_set1_epi8(64); 455 byte16 avg = cast(byte16)(_mm_avg_epu8(A, B)); 456 foreach(i; 0..16) 457 assert(avg.array[i] == 48); 458 } 459 460 /// Shift `a` left by `bytes` bytes while shifting in zeros. 461 alias _mm_bslli_si128 = _mm_slli_si128; 462 unittest 463 { 464 __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 465 byte[16] exact = [0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]; 466 __m128i result = _mm_bslli_si128!5(toShift); 467 assert( (cast(byte16)result).array == exact); 468 } 469 470 /// Shift `v` right by `bytes` bytes while shifting in zeros. 471 alias _mm_bsrli_si128 = _mm_srli_si128; 472 unittest 473 { 474 __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 475 byte[16] exact = [5, 6, 7, 8, 9,10,11,12,13,14, 15, 0, 0, 0, 0, 0]; 476 __m128i result = _mm_bsrli_si128!5(toShift); 477 assert( (cast(byte16)result).array == exact); 478 } 479 480 /// Cast vector of type `__m128d` to type `__m128`. 481 /// Note: Also possible with a regular `cast(__m128)(a)`. 482 __m128 _mm_castpd_ps (__m128d a) pure @safe 483 { 484 return cast(__m128)a; 485 } 486 487 /// Cast vector of type `__m128d` to type `__m128i`. 488 /// Note: Also possible with a regular `cast(__m128i)(a)`. 489 __m128i _mm_castpd_si128 (__m128d a) pure @safe 490 { 491 return cast(__m128i)a; 492 } 493 494 /// Cast vector of type `__m128` to type `__m128d`. 495 /// Note: Also possible with a regular `cast(__m128d)(a)`. 496 __m128d _mm_castps_pd (__m128 a) pure @safe 497 { 498 return cast(__m128d)a; 499 } 500 501 /// Cast vector of type `__m128` to type `__m128i`. 502 /// Note: Also possible with a regular `cast(__m128i)(a)`. 503 __m128i _mm_castps_si128 (__m128 a) pure @safe 504 { 505 return cast(__m128i)a; 506 } 507 508 /// Cast vector of type `__m128i` to type `__m128d`. 509 /// Note: Also possible with a regular `cast(__m128d)(a)`. 
510 __m128d _mm_castsi128_pd (__m128i a) pure @safe 511 { 512 return cast(__m128d)a; 513 } 514 515 /// Cast vector of type `__m128i` to type `__m128`. 516 /// Note: Also possible with a regular `cast(__m128)(a)`. 517 __m128 _mm_castsi128_ps (__m128i a) pure @safe 518 { 519 return cast(__m128)a; 520 } 521 522 /// Invalidate and flush the cache line that contains `p` 523 /// from all levels of the cache hierarchy. 524 void _mm_clflush (const(void)* p) @trusted 525 { 526 static if (GDC_with_SSE2) 527 { 528 __builtin_ia32_clflush(p); 529 } 530 else static if (LDC_with_SSE2) 531 { 532 __builtin_ia32_clflush(cast(void*)p); 533 } 534 else version(D_InlineAsm_X86) 535 { 536 asm pure nothrow @nogc @trusted 537 { 538 mov EAX, p; 539 clflush [EAX]; 540 } 541 } 542 else version(D_InlineAsm_X86_64) 543 { 544 asm pure nothrow @nogc @trusted 545 { 546 mov RAX, p; 547 clflush [RAX]; 548 } 549 } 550 else 551 { 552 // Do nothing. Invalidating cacheline does 553 // not affect correctness. 554 } 555 } 556 unittest 557 { 558 ubyte[64] cacheline; 559 _mm_clflush(cacheline.ptr); 560 } 561 562 /// Compare packed 16-bit integers in `a` and `b` for equality. 563 __m128i _mm_cmpeq_epi16 (__m128i a, __m128i b) pure @safe 564 { 565 static if (SIMD_COMPARISON_MASKS_16B) 566 { 567 return cast(__m128i)(cast(short8)a == cast(short8)b); 568 } 569 else static if (GDC_with_SSE2) 570 { 571 return cast(__m128i) __builtin_ia32_pcmpeqw128(cast(short8)a, cast(short8)b); 572 } 573 else 574 { 575 return cast(__m128i) equalMask!short8(cast(short8)a, cast(short8)b); 576 } 577 } 578 unittest 579 { 580 short8 A = [-3, -2, -1, 0, 0, 1, 2, 3]; 581 short8 B = [ 4, 3, 2, 1, 0, -1, -2, -3]; 582 short[8] E = [ 0, 0, 0, 0, -1, 0, 0, 0]; 583 short8 R = cast(short8)(_mm_cmpeq_epi16(cast(__m128i)A, cast(__m128i)B)); 584 assert(R.array == E); 585 } 586 587 /// Compare packed 32-bit integers in `a` and `b` for equality. 588 __m128i _mm_cmpeq_epi32 (__m128i a, __m128i b) pure @safe 589 { 590 static if (SIMD_COMPARISON_MASKS_16B) 591 { 592 return cast(__m128i)(cast(int4)a == cast(int4)b); 593 } 594 else static if (GDC_with_SSE2) 595 { 596 return __builtin_ia32_pcmpeqd128(a, b); 597 } 598 else 599 { 600 return equalMask!__m128i(a, b); 601 } 602 } 603 unittest 604 { 605 int4 A = [-3, -2, -1, 0]; 606 int4 B = [ 4, -2, 2, 0]; 607 int[4] E = [ 0, -1, 0, -1]; 608 int4 R = cast(int4)(_mm_cmpeq_epi32(A, B)); 609 assert(R.array == E); 610 } 611 612 /// Compare packed 8-bit integers in `a` and `b` for equality. 613 __m128i _mm_cmpeq_epi8 (__m128i a, __m128i b) pure @safe 614 { 615 static if (SIMD_COMPARISON_MASKS_16B) 616 { 617 return cast(__m128i)(cast(byte16)a == cast(byte16)b); 618 } 619 else static if (GDC_with_SSE2) 620 { 621 return cast(__m128i) __builtin_ia32_pcmpeqb128(cast(ubyte16)a, cast(ubyte16)b); 622 } 623 else 624 { 625 return cast(__m128i) equalMask!byte16(cast(byte16)a, cast(byte16)b); 626 } 627 } 628 unittest 629 { 630 __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1); 631 __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1); 632 byte16 C = cast(byte16) _mm_cmpeq_epi8(A, B); 633 byte[16] correct = [0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0, -1]; 634 assert(C.array == correct); 635 } 636 637 /// Compare packed double-precision (64-bit) floating-point elements 638 /// in `a` and `b` for equality. 
639 __m128d _mm_cmpeq_pd (__m128d a, __m128d b) pure @safe 640 { 641 static if (SIMD_COMPARISON_MASKS_16B) 642 { 643 return cast(double2)(cast(double2)a == cast(double2)b); 644 } 645 else static if (GDC_with_SSE2) 646 { 647 return __builtin_ia32_cmpeqpd(a, b); 648 } 649 else 650 { 651 return cast(__m128d) cmppd!(FPComparison.oeq)(a, b); 652 } 653 } 654 unittest 655 { 656 double2 A = _mm_setr_pd(1.0, 2.0); 657 double2 B = _mm_setr_pd(0.0, 2.0); 658 double2 N = _mm_setr_pd(double.nan, double.nan); 659 long2 C = cast(long2) _mm_cmpeq_pd(A, B); 660 long[2] correctC = [0, -1]; 661 assert(C.array == correctC); 662 long2 D = cast(long2) _mm_cmpeq_pd(N, N); 663 long[2] correctD = [0, 0]; 664 assert(D.array == correctD); 665 } 666 667 /// Compare the lower double-precision (64-bit) floating-point elements 668 /// in `a` and `b` for equality, store the result in the lower element, 669 /// and copy the upper element from `a`. 670 __m128d _mm_cmpeq_sd (__m128d a, __m128d b) pure @safe 671 { 672 static if (DMD_with_DSIMD) 673 { 674 return cast(__m128d) __simd(XMM.CMPSD, a, b, 0); 675 } 676 else static if (GDC_with_SSE2) 677 { 678 return __builtin_ia32_cmpeqsd(a, b); 679 } 680 else 681 { 682 return cast(__m128d) cmpsd!(FPComparison.oeq)(a, b); 683 } 684 } 685 unittest 686 { 687 double2 A = _mm_setr_pd(0.0, 2.0); 688 double2 B = _mm_setr_pd(1.0, 2.0); 689 double2 C = _mm_setr_pd(1.0, 3.0); 690 double2 D = cast(double2) _mm_cmpeq_sd(A, B); 691 long2 E = cast(long2) _mm_cmpeq_sd(B, C); 692 double[2] correctD = [0.0, 2.0]; 693 double two = 2.0; 694 long[2] correctE = [-1, *cast(long*)&two]; 695 assert(D.array == correctD); 696 assert(E.array == correctE); 697 } 698 699 /// Compare packed 16-bit integers elements in `a` and `b` for greater-than-or-equal. 700 /// #BONUS 701 __m128i _mm_cmpge_epi16 (__m128i a, __m128i b) pure @safe 702 { 703 static if (SIMD_COMPARISON_MASKS_16B) 704 { 705 return cast(__m128i)(cast(short8)a >= cast(short8)b); 706 } 707 else version (LDC) 708 { 709 // LDC ARM64: generates cmge since -O1 710 return cast(__m128i) greaterOrEqualMask!short8(cast(short8)a, cast(short8)b); 711 } 712 else 713 { 714 return _mm_xor_si128(_mm_cmpeq_epi16(a, b), _mm_cmpgt_epi16(a, b)); 715 } 716 } 717 unittest 718 { 719 short8 A = [-3, -2, -32768, 0, 0, 1, 2, 3]; 720 short8 B = [ 4, 3, 32767, 1, 0, -1, -2, -3]; 721 short[8] E = [ 0, 0, 0, 0, -1, -1, -1, -1]; 722 short8 R = cast(short8)(_mm_cmpge_epi16(cast(__m128i)A, cast(__m128i)B)); 723 assert(R.array == E); 724 } 725 726 /// Compare packed double-precision (64-bit) floating-point elements 727 /// in `a` and `b` for greater-than-or-equal. 728 __m128d _mm_cmpge_pd (__m128d a, __m128d b) pure @safe 729 { 730 static if (SIMD_COMPARISON_MASKS_16B) 731 { 732 return cast(__m128d)(a >= b); 733 } 734 else static if (GDC_with_SSE2) 735 { 736 return __builtin_ia32_cmpgepd(a, b); 737 } 738 else 739 { 740 return cast(__m128d) cmppd!(FPComparison.oge)(a, b); 741 } 742 } 743 744 /// Compare the lower double-precision (64-bit) floating-point elements 745 /// in `a` and `b` for greater-than-or-equal, store the result in the 746 /// lower element, and copy the upper element from `a`. 
747 __m128d _mm_cmpge_sd (__m128d a, __m128d b) pure @safe 748 { 749 static if (DMD_with_DSIMD) 750 { 751 return cast(__m128d) __simd(XMM.CMPSD, b, a, 2); 752 } 753 else static if (GDC_with_SSE2) 754 { 755 return __builtin_ia32_cmplesd(b, a); 756 } 757 else 758 { 759 return cast(__m128d) cmpsd!(FPComparison.oge)(a, b); 760 } 761 } 762 unittest 763 { 764 __m128d A = _mm_setr_pd(1.0, 0.0); 765 __m128d B = _mm_setr_pd(double.nan, 0.0); 766 __m128d C = _mm_setr_pd(2.0, 0.0); 767 assert( (cast(long2)_mm_cmpge_sd(A, A)).array[0] == -1); 768 assert( (cast(long2)_mm_cmpge_sd(A, B)).array[0] == 0); 769 assert( (cast(long2)_mm_cmpge_sd(A, C)).array[0] == 0); 770 assert( (cast(long2)_mm_cmpge_sd(B, A)).array[0] == 0); 771 assert( (cast(long2)_mm_cmpge_sd(B, B)).array[0] == 0); 772 assert( (cast(long2)_mm_cmpge_sd(B, C)).array[0] == 0); 773 assert( (cast(long2)_mm_cmpge_sd(C, A)).array[0] == -1); 774 assert( (cast(long2)_mm_cmpge_sd(C, B)).array[0] == 0); 775 assert( (cast(long2)_mm_cmpge_sd(C, C)).array[0] == -1); 776 } 777 778 /// Compare packed 16-bit integers in `a` and `b` for greater-than. 779 __m128i _mm_cmpgt_epi16 (__m128i a, __m128i b) pure @safe 780 { 781 static if (SIMD_COMPARISON_MASKS_16B) 782 { 783 return cast(__m128i)(cast(short8)a > cast(short8)b); 784 } 785 else static if (GDC_with_SSE2) 786 { 787 return cast(__m128i) __builtin_ia32_pcmpgtw128(cast(short8)a, cast(short8)b); 788 } 789 else 790 { 791 return cast(__m128i) greaterMask!short8(cast(short8)a, cast(short8)b); 792 } 793 } 794 unittest 795 { 796 short8 A = [-3, -2, -1, 0, 0, 1, 2, 3]; 797 short8 B = [ 4, 3, 2, 1, 0, -1, -2, -3]; 798 short[8] E = [ 0, 0, 0, 0, 0, -1, -1, -1]; 799 short8 R = cast(short8)(_mm_cmpgt_epi16(cast(__m128i)A, cast(__m128i)B)); 800 assert(R.array == E); 801 } 802 803 /// Compare packed 32-bit integers in `a` and `b` for greater-than. 804 __m128i _mm_cmpgt_epi32 (__m128i a, __m128i b) pure @safe 805 { 806 static if (SIMD_COMPARISON_MASKS_16B) 807 { 808 return cast(__m128i)(cast(int4)a > cast(int4)b); 809 } 810 else static if (GDC_with_SSE2) 811 { 812 return __builtin_ia32_pcmpgtd128(a, b); 813 } 814 else 815 { 816 return cast(__m128i)( greaterMask!int4(a, b)); 817 } 818 } 819 unittest 820 { 821 int4 A = [-3, 2, -1, 0]; 822 int4 B = [ 4, -2, 2, 0]; 823 int[4] E = [ 0, -1, 0, 0]; 824 int4 R = cast(int4)(_mm_cmpgt_epi32(A, B)); 825 assert(R.array == E); 826 } 827 828 /// Compare packed 8-bit integers in `a` and `b` for greater-than. 829 __m128i _mm_cmpgt_epi8 (__m128i a, __m128i b) pure @safe 830 { 831 static if (SIMD_COMPARISON_MASKS_16B) 832 { 833 return cast(__m128i)(cast(byte16)a > cast(byte16)b); 834 } 835 else 836 { 837 // Note: __builtin_ia32_pcmpgtb128 is buggy, do not use with GDC 838 // TODO: re-check that 839 return cast(__m128i) greaterMask!byte16(cast(byte16)a, cast(byte16)b); 840 } 841 } 842 unittest 843 { 844 __m128i A = _mm_setr_epi8(1, 2, 3, 1, 127, -80, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1); 845 __m128i B = _mm_setr_epi8(2, 2, 1, 2, -128, -42, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1); 846 byte16 C = cast(byte16) _mm_cmpgt_epi8(A, B); 847 byte[16] correct = [0, 0,-1, 0, -1, 0, 0, 0,-1,-1,-1, 0, 0, 0,-1, 0]; 848 __m128i D = _mm_cmpeq_epi8(A, B); 849 assert(C.array == correct); 850 } 851 852 /// Compare packed double-precision (64-bit) floating-point elements 853 /// in `a` and `b` for greater-than. 
__m128d _mm_cmpgt_pd (__m128d a, __m128d b) pure @safe
{
    static if (SIMD_COMPARISON_MASKS_16B)
    {
        return cast(__m128d)(a > b);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpgtpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ogt)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for greater-than, store the result in the lower element,
/// and copy the upper element from `a`.
__m128d _mm_cmpgt_sd (__m128d a, __m128d b) pure @safe
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128d) __simd(XMM.CMPSD, b, a, 1);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpltsd(b, a);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ogt)(a, b);
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 0.0);
    __m128d B = _mm_setr_pd(double.nan, 0.0);
    __m128d C = _mm_setr_pd(2.0, 0.0);
    assert( (cast(long2)_mm_cmpgt_sd(A, A)).array[0] == 0);
    assert( (cast(long2)_mm_cmpgt_sd(A, B)).array[0] == 0);
    assert( (cast(long2)_mm_cmpgt_sd(A, C)).array[0] == 0);
    assert( (cast(long2)_mm_cmpgt_sd(B, A)).array[0] == 0);
    assert( (cast(long2)_mm_cmpgt_sd(B, B)).array[0] == 0);
    assert( (cast(long2)_mm_cmpgt_sd(B, C)).array[0] == 0);
    assert( (cast(long2)_mm_cmpgt_sd(C, A)).array[0] == -1);
    assert( (cast(long2)_mm_cmpgt_sd(C, B)).array[0] == 0);
    assert( (cast(long2)_mm_cmpgt_sd(C, C)).array[0] == 0);
}


/// Compare packed 16-bit integer elements in `a` and `b` for less-than-or-equal.
/// #BONUS
__m128i _mm_cmple_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (SIMD_COMPARISON_MASKS_16B)
    {
        return cast(__m128i)(cast(short8)a <= cast(short8)b);
    }
    else version (LDC)
    {
        // LDC ARM64: generates cmge since -O1
        return cast(__m128i) greaterOrEqualMask!short8(cast(short8)b, cast(short8)a);
    }
    else
    {
        return _mm_xor_si128(_mm_cmpeq_epi16(b, a), _mm_cmpgt_epi16(b, a));
    }
}
unittest
{
    short8 A = [-3, -2, -32768,  1, 0,  1,  2,  3];
    short8 B = [ 4,  3,  32767,  0, 0, -1, -2, -3];
    short[8] E = [-1, -1, -1,  0, -1,  0,  0,  0];
    short8 R = cast(short8)(_mm_cmple_epi16(cast(__m128i)A, cast(__m128i)B));
    assert(R.array == E);
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for less-than-or-equal.
__m128d _mm_cmple_pd (__m128d a, __m128d b) pure @safe
{
    static if (SIMD_COMPARISON_MASKS_16B)
    {
        return cast(__m128d)(a <= b);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmplepd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ole)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for less-than-or-equal, store the result in the
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmple_sd (__m128d a, __m128d b) pure @safe
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128d) __simd(XMM.CMPSD, a, b, 2);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmplesd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ole)(a, b);
    }
}

/// Compare packed 16-bit integers in `a` and `b` for less-than.
970 __m128i _mm_cmplt_epi16 (__m128i a, __m128i b) pure @safe 971 { 972 return _mm_cmpgt_epi16(b, a); 973 } 974 975 /// Compare packed 32-bit integers in `a` and `b` for less-than. 976 __m128i _mm_cmplt_epi32 (__m128i a, __m128i b) pure @safe 977 { 978 return _mm_cmpgt_epi32(b, a); 979 } 980 981 /// Compare packed 8-bit integers in `a` and `b` for less-than. 982 __m128i _mm_cmplt_epi8 (__m128i a, __m128i b) pure @safe 983 { 984 return _mm_cmpgt_epi8(b, a); 985 } 986 987 /// Compare packed double-precision (64-bit) floating-point elements 988 /// in `a` and `b` for less-than. 989 __m128d _mm_cmplt_pd (__m128d a, __m128d b) pure @safe 990 { 991 static if (SIMD_COMPARISON_MASKS_16B) 992 { 993 return cast(__m128d)(a < b); 994 } 995 else static if (GDC_with_SSE2) 996 { 997 return __builtin_ia32_cmpltpd(a, b); 998 } 999 else 1000 { 1001 return cast(__m128d) cmppd!(FPComparison.olt)(a, b); 1002 } 1003 } 1004 1005 /// Compare the lower double-precision (64-bit) floating-point elements 1006 /// in `a` and `b` for less-than, store the result in the lower 1007 /// element, and copy the upper element from `a`. 1008 __m128d _mm_cmplt_sd (__m128d a, __m128d b) pure @safe 1009 { 1010 static if (DMD_with_DSIMD) 1011 { 1012 return cast(__m128d) __simd(XMM.CMPSD, a, b, 1); 1013 } 1014 else static if (GDC_with_SSE2) 1015 { 1016 return __builtin_ia32_cmpltsd(a, b); 1017 } 1018 else 1019 { 1020 return cast(__m128d) cmpsd!(FPComparison.olt)(a, b); 1021 } 1022 } 1023 1024 /// Compare packed double-precision (64-bit) floating-point elements 1025 /// in `a` and `b` for not-equal. 1026 __m128d _mm_cmpneq_pd (__m128d a, __m128d b) pure @safe 1027 { 1028 static if (GDC_with_SSE2) 1029 { 1030 return __builtin_ia32_cmpneqpd(a, b); 1031 } 1032 else 1033 { 1034 return cast(__m128d) cmppd!(FPComparison.une)(a, b); 1035 } 1036 } 1037 1038 /// Compare the lower double-precision (64-bit) floating-point elements 1039 /// in `a` and `b` for not-equal, store the result in the lower 1040 /// element, and copy the upper element from `a`. 1041 __m128d _mm_cmpneq_sd (__m128d a, __m128d b) pure @safe 1042 { 1043 static if (GDC_with_SSE2) 1044 { 1045 return __builtin_ia32_cmpneqsd(a, b); 1046 } 1047 else 1048 { 1049 return cast(__m128d) cmpsd!(FPComparison.une)(a, b); 1050 } 1051 } 1052 1053 /// Compare packed double-precision (64-bit) floating-point elements 1054 /// in `a` and `b` for not-greater-than-or-equal. 1055 __m128d _mm_cmpnge_pd (__m128d a, __m128d b) pure @safe 1056 { 1057 static if (GDC_with_SSE2) 1058 { 1059 return __builtin_ia32_cmpngepd(a, b); 1060 } 1061 else 1062 { 1063 return cast(__m128d) cmppd!(FPComparison.ult)(a, b); 1064 } 1065 } 1066 1067 /// Compare the lower double-precision (64-bit) floating-point elements 1068 /// in `a` and `b` for not-greater-than-or-equal, store the result in 1069 /// the lower element, and copy the upper element from `a`. 1070 __m128d _mm_cmpnge_sd (__m128d a, __m128d b) pure @safe 1071 { 1072 // Note: There is no __builtin_ia32_cmpngesd builtin. 1073 static if (GDC_with_SSE2) 1074 { 1075 return __builtin_ia32_cmpltsd(b, a); 1076 } 1077 else 1078 { 1079 return cast(__m128d) cmpsd!(FPComparison.ult)(a, b); 1080 } 1081 } 1082 1083 /// Compare packed double-precision (64-bit) floating-point elements 1084 /// in `a` and `b` for not-greater-than. 
1085 __m128d _mm_cmpngt_pd (__m128d a, __m128d b) pure @safe 1086 { 1087 static if (GDC_with_SSE2) 1088 { 1089 return __builtin_ia32_cmpngtpd(a, b); 1090 } 1091 else 1092 { 1093 return cast(__m128d) cmppd!(FPComparison.ule)(a, b); 1094 } 1095 } 1096 1097 /// Compare the lower double-precision (64-bit) floating-point elements 1098 /// in `a` and `b` for not-greater-than, store the result in the 1099 /// lower element, and copy the upper element from `a`. 1100 __m128d _mm_cmpngt_sd (__m128d a, __m128d b) pure @safe 1101 { 1102 // Note: There is no __builtin_ia32_cmpngtsd builtin. 1103 static if (GDC_with_SSE2) 1104 { 1105 return __builtin_ia32_cmplesd(b, a); 1106 } 1107 else 1108 { 1109 return cast(__m128d) cmpsd!(FPComparison.ule)(a, b); 1110 } 1111 } 1112 1113 /// Compare packed double-precision (64-bit) floating-point elements 1114 /// in `a` and `b` for not-less-than-or-equal. 1115 __m128d _mm_cmpnle_pd (__m128d a, __m128d b) pure @safe 1116 { 1117 static if (GDC_with_SSE2) 1118 { 1119 return __builtin_ia32_cmpnlepd(a, b); 1120 } 1121 else 1122 { 1123 return cast(__m128d) cmppd!(FPComparison.ugt)(a, b); 1124 } 1125 } 1126 1127 /// Compare the lower double-precision (64-bit) floating-point elements 1128 /// in `a` and `b` for not-less-than-or-equal, store the result in the 1129 /// lower element, and copy the upper element from `a`. 1130 __m128d _mm_cmpnle_sd (__m128d a, __m128d b) pure @safe 1131 { 1132 static if (GDC_with_SSE2) 1133 { 1134 return __builtin_ia32_cmpnlesd(a, b); 1135 } 1136 else 1137 { 1138 return cast(__m128d) cmpsd!(FPComparison.ugt)(a, b); 1139 } 1140 } 1141 1142 /// Compare packed double-precision (64-bit) floating-point elements 1143 /// in `a` and `b` for not-less-than. 1144 __m128d _mm_cmpnlt_pd (__m128d a, __m128d b) pure @safe 1145 { 1146 static if (GDC_with_SSE2) 1147 { 1148 return __builtin_ia32_cmpnltpd(a, b); 1149 } 1150 else 1151 { 1152 return cast(__m128d) cmppd!(FPComparison.uge)(a, b); 1153 } 1154 } 1155 1156 /// Compare the lower double-precision (64-bit) floating-point elements 1157 /// in `a` and `b` for not-less-than, store the result in the lower 1158 /// element, and copy the upper element from `a`. 1159 __m128d _mm_cmpnlt_sd (__m128d a, __m128d b) pure @safe 1160 { 1161 static if (GDC_with_SSE2) 1162 { 1163 return __builtin_ia32_cmpnltsd(a, b); 1164 } 1165 else 1166 { 1167 return cast(__m128d) cmpsd!(FPComparison.uge)(a, b); 1168 } 1169 } 1170 1171 /// Compare packed double-precision (64-bit) floating-point elements 1172 /// in `a` and `b` to see if neither is NaN. 1173 __m128d _mm_cmpord_pd (__m128d a, __m128d b) pure @safe 1174 { 1175 static if (GDC_with_SSE2) 1176 { 1177 return __builtin_ia32_cmpordpd(a, b); 1178 } 1179 else 1180 { 1181 return cast(__m128d) cmppd!(FPComparison.ord)(a, b); 1182 } 1183 } 1184 1185 /// Compare the lower double-precision (64-bit) floating-point elements 1186 /// in `a` and `b` to see if neither is NaN, store the result in the 1187 /// lower element, and copy the upper element from `a` to the upper element. 1188 __m128d _mm_cmpord_sd (__m128d a, __m128d b) pure @safe 1189 { 1190 static if (GDC_with_SSE2) 1191 { 1192 return __builtin_ia32_cmpordsd(a, b); 1193 } 1194 else 1195 { 1196 return cast(__m128d) cmpsd!(FPComparison.ord)(a, b); 1197 } 1198 } 1199 1200 /// Compare packed double-precision (64-bit) floating-point elements 1201 /// in `a` and `b` to see if either is NaN. 
1202 __m128d _mm_cmpunord_pd (__m128d a, __m128d b) pure @safe 1203 { 1204 static if (GDC_with_SSE2) 1205 { 1206 return __builtin_ia32_cmpunordpd(a, b); 1207 } 1208 else 1209 { 1210 return cast(__m128d) cmppd!(FPComparison.uno)(a, b); 1211 } 1212 } 1213 1214 /// Compare the lower double-precision (64-bit) floating-point elements 1215 /// in `a` and `b` to see if either is NaN, store the result in the lower 1216 /// element, and copy the upper element from `a` to the upper element. 1217 __m128d _mm_cmpunord_sd (__m128d a, __m128d b) pure @safe 1218 { 1219 static if (GDC_with_SSE2) 1220 { 1221 return __builtin_ia32_cmpunordsd(a, b); 1222 } 1223 else 1224 { 1225 return cast(__m128d) cmpsd!(FPComparison.uno)(a, b); 1226 } 1227 } 1228 1229 /// Compare the lower double-precision (64-bit) floating-point element 1230 /// in `a` and `b` for equality, and return the boolean result (0 or 1). 1231 int _mm_comieq_sd (__m128d a, __m128d b) pure @safe 1232 { 1233 // Note: For some of the _mm_comixx_sx intrinsics, NaN semantics of the intrinsic are not the same as the 1234 // comisd instruction, it returns false in case of unordered instead. 1235 // 1236 // Actually C++ compilers disagree over the meaning of that instruction. 1237 // GCC will manage NaNs like the comisd instruction (return true if unordered), 1238 // but ICC, clang and MSVC will deal with NaN like the Intel Intrinsics Guide says. 1239 // We choose to do like the most numerous. It seems GCC is buggy with NaNs. 1240 return a.array[0] == b.array[0]; 1241 } 1242 unittest 1243 { 1244 assert(1 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(78.0))); 1245 assert(0 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0))); 1246 assert(0 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan))); 1247 assert(0 == _mm_comieq_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22))); 1248 assert(1 == _mm_comieq_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0))); 1249 } 1250 1251 /// Compare the lower double-precision (64-bit) floating-point element 1252 /// in `a` and `b` for greater-than-or-equal, and return the boolean 1253 /// result (0 or 1). 1254 int _mm_comige_sd (__m128d a, __m128d b) pure @safe 1255 { 1256 return a.array[0] >= b.array[0]; 1257 } 1258 unittest 1259 { 1260 assert(1 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(78.0))); 1261 assert(1 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0))); 1262 assert(0 == _mm_comige_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0))); 1263 assert(0 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan))); 1264 assert(0 == _mm_comige_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22))); 1265 assert(1 == _mm_comige_sd(_mm_set_sd(-0.0), _mm_set_sd(0.0))); 1266 } 1267 1268 /// Compare the lower double-precision (64-bit) floating-point element 1269 /// in `a` and `b` for greater-than, and return the boolean result (0 or 1). 1270 int _mm_comigt_sd (__m128d a, __m128d b) pure @safe 1271 { 1272 return a.array[0] > b.array[0]; 1273 } 1274 unittest 1275 { 1276 assert(0 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(78.0))); 1277 assert(1 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0))); 1278 assert(0 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan))); 1279 assert(0 == _mm_comigt_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22))); 1280 assert(0 == _mm_comigt_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0))); 1281 } 1282 1283 /// Compare the lower double-precision (64-bit) floating-point element 1284 /// in `a` and `b` for less-than-or-equal. 
1285 int _mm_comile_sd (__m128d a, __m128d b) pure @safe 1286 { 1287 return a.array[0] <= b.array[0]; 1288 } 1289 unittest 1290 { 1291 assert(1 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(78.0))); 1292 assert(0 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0))); 1293 assert(1 == _mm_comile_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0))); 1294 assert(0 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan))); 1295 assert(0 == _mm_comile_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22))); 1296 assert(1 == _mm_comile_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0))); 1297 } 1298 1299 /// Compare the lower double-precision (64-bit) floating-point element 1300 /// in `a` and `b` for less-than, and return the boolean result (0 or 1). 1301 int _mm_comilt_sd (__m128d a, __m128d b) pure @safe 1302 { 1303 return a.array[0] < b.array[0]; 1304 } 1305 unittest 1306 { 1307 assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(78.0))); 1308 assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0))); 1309 assert(1 == _mm_comilt_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0))); 1310 assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan))); 1311 assert(0 == _mm_comilt_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22))); 1312 assert(0 == _mm_comilt_sd(_mm_set_sd(-0.0), _mm_set_sd(0.0))); 1313 } 1314 1315 /// Compare the lower double-precision (64-bit) floating-point element 1316 /// in `a` and `b` for not-equal, and return the boolean result (0 or 1). 1317 int _mm_comineq_sd (__m128d a, __m128d b) pure @safe 1318 { 1319 return a.array[0] != b.array[0]; 1320 } 1321 unittest 1322 { 1323 assert(0 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(78.0))); 1324 assert(1 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0))); 1325 assert(1 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan))); 1326 assert(1 == _mm_comineq_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22))); 1327 assert(0 == _mm_comineq_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0))); 1328 } 1329 1330 /// Convert packed 32-bit integers in `a` to packed double-precision (64-bit) 1331 /// floating-point elements. 1332 __m128d _mm_cvtepi32_pd (__m128i a) pure @trusted 1333 { 1334 static if (LDC_with_optimizations) 1335 { 1336 // Generates cvtdq2pd since LDC 1.0, even without optimizations 1337 enum ir = ` 1338 %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1> 1339 %r = sitofp <2 x i32> %v to <2 x double> 1340 ret <2 x double> %r`; 1341 return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128i)(a); 1342 } 1343 else static if (GDC_with_SSE2) 1344 { 1345 return __builtin_ia32_cvtdq2pd(a); 1346 } 1347 else 1348 { 1349 double2 r = void; 1350 r.ptr[0] = a.array[0]; 1351 r.ptr[1] = a.array[1]; 1352 return r; 1353 } 1354 } 1355 unittest 1356 { 1357 __m128d A = _mm_cvtepi32_pd(_mm_set1_epi32(54)); 1358 assert(A.array[0] == 54.0); 1359 assert(A.array[1] == 54.0); 1360 } 1361 1362 /// Convert packed 32-bit integers in `a` to packed single-precision (32-bit) 1363 /// floating-point elements. 1364 __m128 _mm_cvtepi32_ps(__m128i a) pure @trusted 1365 { 1366 static if (DMD_with_DSIMD) 1367 { 1368 return cast(__m128)__simd(XMM.CVTDQ2PS, cast(void16) a); 1369 } 1370 else static if (GDC_with_SSE2) 1371 { 1372 return __builtin_ia32_cvtdq2ps(a); 1373 } 1374 else static if (LDC_with_optimizations) 1375 { 1376 // See #86 for why we had to resort to LLVM IR. 1377 // Plain code below was leading to catastrophic behaviour. 
        // x86: Generates cvtdq2ps since LDC 1.1.0 -O0
        // ARM: Generates scvtf.4s since LDC 1.8.0 -O0
        enum ir = `
            %r = sitofp <4 x i32> %0 to <4 x float>
            ret <4 x float> %r`;
        return cast(__m128) LDCInlineIR!(ir, float4, int4)(a);
    }
    else
    {
        __m128 res; // PERF =void;
        res.ptr[0] = cast(float)a.array[0];
        res.ptr[1] = cast(float)a.array[1];
        res.ptr[2] = cast(float)a.array[2];
        res.ptr[3] = cast(float)a.array[3];
        return res;
    }
}
unittest
{
    __m128 a = _mm_cvtepi32_ps(_mm_setr_epi32(-1, 0, 1, 1000));
    assert(a.array == [-1.0f, 0.0f, 1.0f, 1000.0f]);
}

/// Convert packed double-precision (64-bit) floating-point elements
/// in `a` to packed 32-bit integers.
__m128i _mm_cvtpd_epi32 (__m128d a) @trusted
{
    // PERF ARM32
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2dq(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2dq(a);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();
        long2 i;
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     i = vcvtnq_s64_f64(a); break;
            case _MM_ROUND_DOWN_ARM:        i = vcvtmq_s64_f64(a); break;
            case _MM_ROUND_UP_ARM:          i = vcvtpq_s64_f64(a); break;
            case _MM_ROUND_TOWARD_ZERO_ARM: i = vcvtzq_s64_f64(a); break;
        }
        int4 zero = 0;
        return cast(__m128i) shufflevectorLDC!(int4, 0, 2, 4, 6)(cast(int4)i, zero); // PERF: this slows down the build for nothing, test without shufflevector
    }
    else
    {
        // PERF ARM32
        __m128i r = _mm_setzero_si128();
        r.ptr[0] = convertDoubleToInt32UsingMXCSR(a.array[0]);
        r.ptr[1] = convertDoubleToInt32UsingMXCSR(a.array[1]);
        return r;
    }
}
unittest
{
    int4 A = _mm_cvtpd_epi32(_mm_set_pd(61.0, 55.0));
    assert(A.array[0] == 55 && A.array[1] == 61 && A.array[2] == 0 && A.array[3] == 0);
}

/// Convert packed double-precision (64-bit) floating-point elements in `v`
/// to packed 32-bit integers.
__m64 _mm_cvtpd_pi32 (__m128d v) @safe
{
    return to_m64(_mm_cvtpd_epi32(v));
}
unittest
{
    int2 A = cast(int2) _mm_cvtpd_pi32(_mm_set_pd(61.0, 55.0));
    assert(A.array[0] == 55 && A.array[1] == 61);
}

/// Convert packed double-precision (64-bit) floating-point elements
/// in `a` to packed single-precision (32-bit) floating-point elements.
__m128 _mm_cvtpd_ps (__m128d a) pure @trusted
{
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2ps(a); // can't be done with IR unfortunately
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2ps(a);
    }
    else
    {
        __m128 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        r.ptr[2] = 0;
        r.ptr[3] = 0;
        return r;
    }
}
unittest
{
    __m128d A = _mm_set_pd(5.25, 4.0);
    __m128 B = _mm_cvtpd_ps(A);
    assert(B.array == [4.0f, 5.25f, 0, 0]);
}
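
// Additional illustrative check, not from the original test suite: _mm_cvtpd_ps narrows
// each double to a float, and values too large for float overflow to infinity
// (assuming the default round-to-nearest mode at program start).
unittest
{
    __m128 R = _mm_cvtpd_ps(_mm_setr_pd(1.0e300, -2.5));
    assert(R.array[0] == float.infinity);
    assert(R.array[1] == -2.5f);
    assert(R.array[2] == 0.0f && R.array[3] == 0.0f);
}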
/// Convert packed 32-bit integers in `v` to packed double-precision
/// (64-bit) floating-point elements.
__m128d _mm_cvtpi32_pd (__m64 v) pure @safe
{
    return _mm_cvtepi32_pd(to_m128i(v));
}
unittest
{
    __m128d A = _mm_cvtpi32_pd(_mm_setr_pi32(4, -5));
    assert(A.array[0] == 4.0 && A.array[1] == -5.0);
}

/// Convert packed single-precision (32-bit) floating-point elements
/// in `a` to packed 32-bit integers.
__m128i _mm_cvtps_epi32 (__m128 a) @trusted
{
    static if (LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_cvtps2dq(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtps2dq(a);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return vcvtnq_s32_f32(a);
            case _MM_ROUND_DOWN_ARM:        return vcvtmq_s32_f32(a);
            case _MM_ROUND_UP_ARM:          return vcvtpq_s32_f32(a);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvtzq_s32_f32(a);
        }
    }
    else
    {
        __m128i r = void;
        r.ptr[0] = convertFloatToInt32UsingMXCSR(a.array[0]);
        r.ptr[1] = convertFloatToInt32UsingMXCSR(a.array[1]);
        r.ptr[2] = convertFloatToInt32UsingMXCSR(a.array[2]);
        r.ptr[3] = convertFloatToInt32UsingMXCSR(a.array[3]);
        return r;
    }
}
unittest
{
    // GDC bug #98607
    // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98607
    // GDC does not provide an optimization barrier for the rounding mode.
    // Worked around with different literals. This bug will likely only manifest in unittest.
    // GCC people provided no actual fix and instead say other compilers are buggy... when they aren't.

    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    __m128i A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [1, -2, 54, -3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.3f, -2.11f, 53.4f, -2.8f));
    assert(A.array == [1, -3, 53, -3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f));
    assert(A.array == [2, -2, 54, -2]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.17f, 53.8f, -2.91f));
    assert(A.array == [1, -2, 53, -2]);

    _MM_SET_ROUNDING_MODE(savedRounding);
}

/// Convert packed single-precision (32-bit) floating-point elements
/// in `a` to packed double-precision (64-bit) floating-point elements.
__m128d _mm_cvtps_pd (__m128 a) pure @trusted
{
    static if (LDC_with_optimizations)
    {
        // Generates cvtps2pd since LDC 1.0 -O0
        enum ir = `
            %v = shufflevector <4 x float> %0,<4 x float> %0, <2 x i32> <i32 0, i32 1>
            %r = fpext <2 x float> %v to <2 x double>
            ret <2 x double> %r`;
        return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128)(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtps2pd(a);
    }
    else
    {
        double2 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_cvtps_pd(_mm_set1_ps(54.0f));
    assert(A.array[0] == 54.0);
    assert(A.array[1] == 54.0);
}
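
// Additional illustrative check, not from the original test suite: widening with
// _mm_cvtps_pd is exact, so the result compares equal to the original float value,
// but not to the closest double of the same decimal literal.
unittest
{
    __m128d A = _mm_cvtps_pd(_mm_set1_ps(0.1f));
    assert(A.array[0] == 0.1f); // exact widening of the float value
    assert(A.array[0] != 0.1);  // 0.1 as a double is a different value
}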
/// Copy the lower double-precision (64-bit) floating-point element of `a`.
double _mm_cvtsd_f64 (__m128d a) pure @safe
{
    return a.array[0];
}

/// Convert the lower double-precision (64-bit) floating-point element
/// in `a` to a 32-bit integer.
int _mm_cvtsd_si32 (__m128d a) @safe
{
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_cvtsd2si(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtsd2si(a);
    }
    else
    {
        return convertDoubleToInt32UsingMXCSR(a[0]);
    }
}
unittest
{
    assert(4 == _mm_cvtsd_si32(_mm_set1_pd(4.0)));
}

/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer.
long _mm_cvtsd_si64 (__m128d a) @trusted
{
    static if (LDC_with_SSE2)
    {
        version (X86_64)
        {
            return __builtin_ia32_cvtsd2si64(a);
        }
        else
        {
            // Note: In 32-bit x86, there is no way to convert from float/double to 64-bit integer
            // using SSE instructions only. So the builtin doesn't exist for this arch.
            return convertDoubleToInt64UsingMXCSR(a[0]);
        }
    }
    else
    {
        return convertDoubleToInt64UsingMXCSR(a.array[0]);
    }
}
unittest
{
    assert(-4 == _mm_cvtsd_si64(_mm_set1_pd(-4.0)));

    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.49)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    assert(-56468486187 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.1)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    assert(56468486187 == _mm_cvtsd_si64(_mm_set1_pd(56468486186.1)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.9)));

    _MM_SET_ROUNDING_MODE(savedRounding);
}

deprecated("Use _mm_cvtsd_si64 instead") alias _mm_cvtsd_si64x = _mm_cvtsd_si64; ///

/// Convert the lower double-precision (64-bit) floating-point element in `b` to a single-precision (32-bit)
/// floating-point element, store that in the lower element of result, and copy the upper 3 packed elements from `a`
/// to the upper elements of result.
__m128 _mm_cvtsd_ss (__m128 a, __m128d b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtsd2ss(a, b);
    }
    else
    {
        // Generates cvtsd2ss since LDC 1.3 -O0
        a.ptr[0] = b.array[0];
        return a;
    }
}
unittest
{
    __m128 R = _mm_cvtsd_ss(_mm_set1_ps(4.0f), _mm_set1_pd(3.0));
    assert(R.array == [3.0f, 4.0f, 4.0f, 4.0f]);
}

/// Get the lower 32-bit integer in `a`.
int _mm_cvtsi128_si32 (__m128i a) pure @safe
{
    return a.array[0];
}

/// Get the lower 64-bit integer in `a`.
long _mm_cvtsi128_si64 (__m128i a) pure @safe
{
    long2 la = cast(long2)a;
    return la.array[0];
}
deprecated("Use _mm_cvtsi128_si64 instead") alias _mm_cvtsi128_si64x = _mm_cvtsi128_si64;
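
// Additional illustrative check, not from the original test suite, for _mm_cvtsi128_si32
// and _mm_cvtsi128_si64 above: both read the low lane of the vector as-is.
unittest
{
    __m128i A = _mm_setr_epi32(-1, 2, 3, 4);
    assert(_mm_cvtsi128_si32(A) == -1);
    assert(_mm_cvtsi128_si64(A) == 0x0000_0002_FFFF_FFFF);
}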
/// Convert the signed 32-bit integer `b` to a double-precision (64-bit) floating-point element, store that in the
/// lower element of result, and copy the upper element from `a` to the upper element of result.
__m128d _mm_cvtsi32_sd(__m128d a, int b) pure @trusted
{
    a.ptr[0] = cast(double)b;
    return a;
}
unittest
{
    __m128d a = _mm_cvtsi32_sd(_mm_set1_pd(0.0f), 42);
    assert(a.array == [42.0, 0]);
}

/// Copy 32-bit integer `a` to the lower element of result, and zero the upper elements.
__m128i _mm_cvtsi32_si128 (int a) pure @trusted
{
    int4 r = [0, 0, 0, 0];
    r.ptr[0] = a;
    return r;
}
unittest
{
    __m128i a = _mm_cvtsi32_si128(65);
    assert(a.array == [65, 0, 0, 0]);
}

/// Convert the signed 64-bit integer `b` to a double-precision (64-bit) floating-point element, store the result in
/// the lower element of result, and copy the upper element from `a` to the upper element of result.
__m128d _mm_cvtsi64_sd(__m128d a, long b) pure @trusted
{
    a.ptr[0] = cast(double)b;
    return a;
}
unittest
{
    __m128d a = _mm_cvtsi64_sd(_mm_set1_pd(0.0f), 42);
    assert(a.array == [42.0, 0]);
}

/// Copy 64-bit integer `a` to the lower element of result, and zero the upper element.
__m128i _mm_cvtsi64_si128 (long a) pure @trusted
{
    long2 r = [0, 0];
    r.ptr[0] = a;
    return cast(__m128i)(r);
}

deprecated("Use _mm_cvtsi64_sd instead") alias _mm_cvtsi64x_sd = _mm_cvtsi64_sd; ///
deprecated("Use _mm_cvtsi64_si128 instead") alias _mm_cvtsi64x_si128 = _mm_cvtsi64_si128; ///

/// Convert the lower single-precision (32-bit) floating-point element in `b` to a double-precision (64-bit)
/// floating-point element, store that in the lower element of result, and copy the upper element from `a` to the upper
/// element of result.
double2 _mm_cvtss_sd(double2 a, float4 b) pure @trusted
{
    a.ptr[0] = b.array[0];
    return a;
}
unittest
{
    __m128d a = _mm_cvtss_sd(_mm_set1_pd(0.0f), _mm_set1_ps(42.0f));
    assert(a.array == [42.0, 0]);
}

/// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit integer with truncation.
long _mm_cvttss_si64 (__m128 a) pure @safe
{
    return cast(long)(a.array[0]); // Generates cvttss2si as expected
}
unittest
{
    assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
}

/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed 32-bit integers with truncation.
/// Put zeroes in the upper elements of result.
__m128i _mm_cvttpd_epi32 (__m128d a) pure @trusted
{
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_cvttpd2dq(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvttpd2dq(a);
    }
    else
    {
        // Note: doesn't generate cvttpd2dq as of LDC 1.13
        __m128i r; // PERF =void;
        r.ptr[0] = cast(int)a.array[0];
        r.ptr[1] = cast(int)a.array[1];
        r.ptr[2] = 0;
        r.ptr[3] = 0;
        return r;
    }
}
unittest
{
    __m128i R = _mm_cvttpd_epi32(_mm_setr_pd(-4.9, 45641.5f));
    assert(R.array == [-4, 45641, 0, 0]);
}
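
// Additional illustrative check, not from the original test suite: unlike _mm_cvtpd_epi32,
// the truncating conversion ignores the MXCSR rounding mode and always rounds toward zero.
unittest
{
    uint savedRounding = _MM_GET_ROUNDING_MODE();
    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    __m128i R = _mm_cvttpd_epi32(_mm_setr_pd(1.9, -1.9));
    assert(R.array == [1, -1, 0, 0]);
    _MM_SET_ROUNDING_MODE(savedRounding);
}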
/// Convert packed double-precision (64-bit) floating-point elements in `v`
/// to packed 32-bit integers with truncation.
__m64 _mm_cvttpd_pi32 (__m128d v) pure @safe
{
    return to_m64(_mm_cvttpd_epi32(v));
}
unittest
{
    int2 R = cast(int2) _mm_cvttpd_pi32(_mm_setr_pd(-4.9, 45641.7f));
    int[2] correct = [-4, 45641];
    assert(R.array == correct);
}

/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit integers with truncation.
__m128i _mm_cvttps_epi32 (__m128 a) pure @trusted
{
    // x86: Generates cvttps2dq since LDC 1.3 -O2
    // ARM64: generates fcvtze since LDC 1.8 -O2
    __m128i r; // PERF = void;
    r.ptr[0] = cast(int)a.array[0];
    r.ptr[1] = cast(int)a.array[1];
    r.ptr[2] = cast(int)a.array[2];
    r.ptr[3] = cast(int)a.array[3];
    return r;
}
unittest
{
    __m128i R = _mm_cvttps_epi32(_mm_setr_ps(-4.9, 45641.5f, 0.0f, 1.0f));
    assert(R.array == [-4, 45641, 0, 1]);
}

/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 32-bit integer with truncation.
int _mm_cvttsd_si32 (__m128d a)
{
    // Generates cvttsd2si since LDC 1.3 -O0
    return cast(int)a.array[0];
}

/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer with truncation.
long _mm_cvttsd_si64 (__m128d a)
{
    // Generates cvttsd2si since LDC 1.3 -O0
    // but in 32-bit instead, it's a long sequence that resorts to the FPU
    return cast(long)a.array[0];
}

deprecated("Use _mm_cvttsd_si64 instead") alias _mm_cvttsd_si64x = _mm_cvttsd_si64; ///

/// Divide packed double-precision (64-bit) floating-point elements in `a` by packed elements in `b`.
__m128d _mm_div_pd(__m128d a, __m128d b) pure @safe
{
    pragma(inline, true);
    return a / b;
}

/// Divide the lower double-precision (64-bit) floating-point element in `a` by the lower element in `b`,
/// store the result in the lower element, and copy the upper element from `a`.
__m128d _mm_div_sd(__m128d a, __m128d b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_divsd(a, b);
    }
    else version(DigitalMars)
    {
        // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
        // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
        asm pure nothrow @nogc @trusted { nop;}
        a.array[0] = a.array[0] / b.array[0];
        return a;
    }
    else
    {
        a.ptr[0] /= b.array[0];
        return a;
    }
}
unittest
{
    __m128d a = [2.0, 4.5];
    a = _mm_div_sd(a, a);
    assert(a.array == [1.0, 4.5]);
}

/// Extract a 16-bit integer from `v`, selected with `index`.
/// Warning: the returned value is zero-extended to 32 bits.
int _mm_extract_epi16(__m128i v, int index) pure @safe
{
    short8 r = cast(short8)v;
    return cast(ushort)(r.array[index & 7]);
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, -1);
    assert(_mm_extract_epi16(A, 6) == 6);
    assert(_mm_extract_epi16(A, 0) == 65535);
    assert(_mm_extract_epi16(A, 5 + 8) == 5);
}
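
// Additional illustrative check, not from the original test suite: _mm_insert_epi16 (below)
// and _mm_extract_epi16 round-trip a value through a chosen lane.
unittest
{
    __m128i A = _mm_setzero_si128();
    __m128i B = _mm_insert_epi16(A, 12345, 3);
    assert(_mm_extract_epi16(B, 3) == 12345);
    assert(_mm_extract_epi16(B, 0) == 0);
}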
__m128i _mm_insert_epi16 (__m128i v, int i, int index) @trusted
{
    short8 r = cast(short8)v;
    r.ptr[index & 7] = cast(short)i;
    return cast(__m128i)r;
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
    short8 R = cast(short8) _mm_insert_epi16(A, 42, 6);
    short[8] correct = [0, 1, 2, 3, 4, 5, 42, 7];
    assert(R.array == correct);
}

/// Perform a serializing operation on all load-from-memory instructions that were issued prior
/// to this instruction. Guarantees that every load instruction that precedes, in program order,
/// the load-fence instruction is globally visible before any load instruction which follows the
/// fence in program order.
void _mm_lfence() @trusted
{
    version(GNU)
    {
        static if (GDC_with_SSE2)
        {
            __builtin_ia32_lfence();
        }
        else version(X86)
        {
            asm pure nothrow @nogc @trusted
            {
                "lfence;\n" : : : ;
            }
        }
        else
            static assert(false);
    }
    else static if (LDC_with_SSE2)
    {
        __builtin_ia32_lfence();
    }
    else static if (LDC_with_ARM64)
    {
        __builtin_arm_dmb(9);  // dmb ishld
    }
    else static if (DMD_with_asm)
    {
        asm nothrow @nogc pure @trusted
        {
            lfence;
        }
    }
    else version(LDC)
    {
        // When the architecture is unknown, generate a full memory barrier,
        // as the semantics of lfence do not really match those of atomics.
        llvm_memory_fence();
    }
    else
        static assert(false);
}
unittest
{
    _mm_lfence();
}

/// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
__m128d _mm_load_pd (const(double)* mem_addr) pure
{
    pragma(inline, true);
    __m128d* aligned = cast(__m128d*)mem_addr;
    return *aligned;
}
unittest
{
    align(16) double[2] S = [-5.0, 7.0];
    __m128d R = _mm_load_pd(S.ptr);
    assert(R.array == S);
}

/// Load a double-precision (64-bit) floating-point element from memory into both elements of dst.
/// `mem_addr` does not need to be aligned on any particular boundary.
__m128d _mm_load_pd1 (const(double)* mem_addr) pure
{
    double m = *mem_addr;
    __m128d r; // PERF =void;
    r.ptr[0] = m;
    r.ptr[1] = m;
    return r;
}
unittest
{
    double what = 4;
    __m128d R = _mm_load_pd1(&what);
    double[2] correct = [4.0, 4];
    assert(R.array == correct);
}

/// Load a double-precision (64-bit) floating-point element from memory into the lower element of result, and zero the
/// upper element. `mem_addr` does not need to be aligned on any particular boundary.
__m128d _mm_load_sd (const(double)* mem_addr) pure @trusted
{
    double2 r = [0, 0];
    r.ptr[0] = *mem_addr;
    return r;
}
unittest
{
    double x = -42;
    __m128d a = _mm_load_sd(&x);
    assert(a.array == [-42.0, 0.0]);
}

/// Load 128-bits of integer data from memory into dst.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
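/// See_also: `_mm_loadu_si128` to load from unaligned memory.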
__m128i _mm_load_si128 (const(__m128i)* mem_addr) pure @safe
{
    pragma(inline, true);
    return *mem_addr;
}
unittest
{
    align(16) int[4] correct = [-1, 2, 3, 4];
    int4 A = cast(int4) _mm_load_si128(cast(__m128i*) correct.ptr);
    assert(A.array == correct);
}

alias _mm_load1_pd = _mm_load_pd1; ///

/// Load a double-precision (64-bit) floating-point element from memory into the upper element of result, and copy the
/// lower element from `a` to result. `mem_addr` does not need to be aligned on any particular boundary.
__m128d _mm_loadh_pd (__m128d a, const(double)* mem_addr) pure @trusted
{
    pragma(inline, true);
    a.ptr[1] = *mem_addr;
    return a;
}
unittest
{
    double A = 7.0;
    __m128d B = _mm_setr_pd(4.0, -5.0);
    __m128d R = _mm_loadh_pd(B, &A);
    double[2] correct = [ 4.0, 7.0 ];
    assert(R.array == correct);
}

/// Load 64-bit integer from memory into the first element of result. Zero out the upper element.
/// Note: strange signature, since the memory doesn't have to be aligned and should point to an addressable 64-bit value, not a 128-bit one.
/// You may use `_mm_loadu_si64` instead.
__m128i _mm_loadl_epi64 (const(__m128i)* mem_addr) pure @trusted
{
    pragma(inline, true);
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd(XMM.LODQ, *cast(__m128i*)mem_addr);
    }
    else
    {
        auto pLong = cast(const(long)*)mem_addr;
        long2 r = [0, 0];
        r.ptr[0] = *pLong;
        return cast(__m128i)(r);
    }
}
unittest
{
    long A = 0x7878787870707070;
    long2 R = cast(long2) _mm_loadl_epi64(cast(__m128i*)&A);
    long[2] correct = [0x7878787870707070, 0];
    assert(R.array == correct);
}

/// Load a double-precision (64-bit) floating-point element from memory into the lower element of result, and copy the
/// upper element from `a` to result. `mem_addr` does not need to be aligned on any particular boundary.
__m128d _mm_loadl_pd (__m128d a, const(double)* mem_addr) pure @trusted
{
    a.ptr[0] = *mem_addr;
    return a;
}
unittest
{
    double A = 7.0;
    __m128d B = _mm_setr_pd(4.0, -5.0);
    __m128d R = _mm_loadl_pd(B, &A);
    double[2] correct = [ 7.0, -5.0 ];
    assert(R.array == correct);
}

/// Load 2 double-precision (64-bit) floating-point elements from memory into result in reverse order.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
__m128d _mm_loadr_pd (const(double)* mem_addr) pure @trusted
{
    __m128d a = *cast(__m128d*)(mem_addr);
    __m128d r; // PERF =void;
    r.ptr[0] = a.array[1];
    r.ptr[1] = a.array[0];
    return r;
}
unittest
{
    align(16) double[2] A = [56.0, -74.0];
    __m128d R = _mm_loadr_pd(A.ptr);
    double[2] correct = [-74.0, 56.0];
    assert(R.array == correct);
}

/// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
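/// See_also: `_mm_load_pd` for the aligned equivalent.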
2111 __m128d _mm_loadu_pd (const(double)* mem_addr) pure @trusted 2112 { 2113 pragma(inline, true); 2114 static if (GDC_with_SSE2) 2115 { 2116 return __builtin_ia32_loadupd(mem_addr); 2117 } 2118 else static if (LDC_with_optimizations) 2119 { 2120 return loadUnaligned!(double2)(mem_addr); 2121 } 2122 else version(DigitalMars) 2123 { 2124 // Apparently inside __simd you can use aligned dereferences without fear. 2125 // That was issue 23048 on dlang's Bugzilla. 2126 static if (DMD_with_DSIMD) 2127 { 2128 return cast(__m128d)__simd(XMM.LODUPD, *cast(double2*)mem_addr); 2129 } 2130 else static if (SSESizedVectorsAreEmulated) 2131 { 2132 // Since this vector is emulated, it doesn't have alignement constraints 2133 // and as such we can just cast it. 2134 return *cast(__m128d*)(mem_addr); 2135 } 2136 else 2137 { 2138 __m128d result; 2139 result.ptr[0] = mem_addr[0]; 2140 result.ptr[1] = mem_addr[1]; 2141 return result; 2142 } 2143 } 2144 else 2145 { 2146 __m128d result; 2147 result.ptr[0] = mem_addr[0]; 2148 result.ptr[1] = mem_addr[1]; 2149 return result; 2150 } 2151 } 2152 unittest 2153 { 2154 double[2] A = [56.0, -75.0]; 2155 __m128d R = _mm_loadu_pd(A.ptr); 2156 double[2] correct = [56.0, -75.0]; 2157 assert(R.array == correct); 2158 } 2159 2160 /// Load 128-bits of integer data from memory. `mem_addr` does not need to be aligned on any particular boundary. 2161 __m128i _mm_loadu_si128 (const(__m128i)* mem_addr) pure @trusted 2162 { 2163 // PERF DMD 2164 pragma(inline, true); 2165 static if (GDC_with_SSE2) 2166 { 2167 return cast(__m128i) __builtin_ia32_loaddqu(cast(const(char*))mem_addr); 2168 } 2169 else static if (LDC_with_optimizations) 2170 { 2171 return loadUnaligned!(__m128i)(cast(int*)mem_addr); 2172 } 2173 else 2174 { 2175 const(int)* p = cast(const(int)*)mem_addr; 2176 __m128i r = void; 2177 r.ptr[0] = p[0]; 2178 r.ptr[1] = p[1]; 2179 r.ptr[2] = p[2]; 2180 r.ptr[3] = p[3]; 2181 return r; 2182 } 2183 } 2184 unittest 2185 { 2186 align(16) int[4] correct = [-1, 2, -3, 4]; 2187 int4 A = cast(int4) _mm_loadu_si128(cast(__m128i*) correct.ptr); 2188 assert(A.array == correct); 2189 } 2190 2191 /// Load unaligned 16-bit integer from memory into the first element, fill with zeroes otherwise. 2192 __m128i _mm_loadu_si16(const(void)* mem_addr) pure @trusted // TODO: should be @system actually 2193 { 2194 static if (DMD_with_DSIMD) 2195 { 2196 int r = *cast(short*)(mem_addr); 2197 return cast(__m128i) __simd(XMM.LODD, *cast(__m128i*)&r); 2198 } 2199 else version(DigitalMars) 2200 { 2201 // Workaround issue: https://issues.dlang.org/show_bug.cgi?id=21672 2202 // DMD cannot handle the below code... 2203 align(16) short[8] r = [0, 0, 0, 0, 0, 0, 0, 0]; 2204 r[0] = *cast(short*)(mem_addr); 2205 return *cast(int4*)(r.ptr); 2206 } 2207 else 2208 { 2209 short r = *cast(short*)(mem_addr); 2210 short8 result = [0, 0, 0, 0, 0, 0, 0, 0]; 2211 result.ptr[0] = r; 2212 return cast(__m128i)result; 2213 } 2214 } 2215 unittest 2216 { 2217 short r = 13; 2218 short8 A = cast(short8) _mm_loadu_si16(&r); 2219 short[8] correct = [13, 0, 0, 0, 0, 0, 0, 0]; 2220 assert(A.array == correct); 2221 } 2222 2223 /// Load unaligned 32-bit integer from memory into the first element of result. 
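/// The three upper elements of the result are zeroed.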
2224 __m128i _mm_loadu_si32 (const(void)* mem_addr) pure @trusted // TODO: should be @system actually 2225 { 2226 pragma(inline, true); 2227 int r = *cast(int*)(mem_addr); 2228 int4 result = [0, 0, 0, 0]; 2229 result.ptr[0] = r; 2230 return result; 2231 } 2232 unittest 2233 { 2234 int r = 42; 2235 __m128i A = _mm_loadu_si32(&r); 2236 int[4] correct = [42, 0, 0, 0]; 2237 assert(A.array == correct); 2238 } 2239 2240 /// Load unaligned 64-bit integer from memory into the first element of result. 2241 /// Upper 64-bit is zeroed. 2242 __m128i _mm_loadu_si64 (const(void)* mem_addr) pure @system 2243 { 2244 pragma(inline, true); 2245 static if (DMD_with_DSIMD) 2246 { 2247 return cast(__m128i) __simd(XMM.LODQ, *cast(__m128i*)mem_addr); 2248 } 2249 else 2250 { 2251 auto pLong = cast(const(long)*)mem_addr; 2252 long2 r = [0, 0]; 2253 r.ptr[0] = *pLong; 2254 return cast(__m128i)r; 2255 } 2256 } 2257 unittest 2258 { 2259 long r = 446446446446; 2260 long2 A = cast(long2) _mm_loadu_si64(&r); 2261 long[2] correct = [446446446446, 0]; 2262 assert(A.array == correct); 2263 } 2264 2265 /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate 2266 /// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, 2267 /// and pack the results in destination. 2268 __m128i _mm_madd_epi16 (__m128i a, __m128i b) pure @trusted 2269 { 2270 static if (GDC_with_SSE2) 2271 { 2272 return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b); 2273 } 2274 else static if (LDC_with_SSE2) 2275 { 2276 return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b); 2277 } 2278 else static if (LDC_with_optimizations) 2279 { 2280 // 5 inst with arm64 + LDC 1.32 + -O1 2281 enum ir = ` 2282 %ia = sext <8 x i16> %0 to <8 x i32> 2283 %ib = sext <8 x i16> %1 to <8 x i32> 2284 %p = mul <8 x i32> %ia, %ib 2285 %p_even = shufflevector <8 x i32> %p, <8 x i32> undef, <4 x i32> <i32 0, i32 2,i32 4, i32 6> 2286 %p_odd = shufflevector <8 x i32> %p, <8 x i32> undef, <4 x i32> <i32 1, i32 3,i32 5, i32 7> 2287 %p_sum = add <4 x i32> %p_even, %p_odd 2288 ret <4 x i32> %p_sum`; 2289 return cast(__m128i) LDCInlineIR!(ir, int4, short8, short8)(cast(short8)a, cast(short8)b); 2290 } 2291 else 2292 { 2293 short8 sa = cast(short8)a; 2294 short8 sb = cast(short8)b; 2295 int4 r; 2296 foreach(i; 0..4) 2297 { 2298 r.ptr[i] = sa.array[2*i] * sb.array[2*i] + sa.array[2*i+1] * sb.array[2*i+1]; 2299 } 2300 return r; 2301 } 2302 } 2303 unittest 2304 { 2305 short8 A = [0, 1, 2, 3, -32768, -32768, 32767, 32767]; 2306 short8 B = [0, 1, 2, 3, -32768, -32768, 32767, 32767]; 2307 int4 R = _mm_madd_epi16(cast(__m128i)A, cast(__m128i)B); 2308 int[4] correct = [1, 13, -2147483648, 2*32767*32767]; 2309 assert(R.array == correct); 2310 } 2311 2312 /// Conditionally store 8-bit integer elements from `a` into memory using `mask` 2313 /// (elements are not stored when the highest bit is not set in the corresponding element) 2314 /// and a non-temporal memory hint. `mem_addr` does not need to be aligned on any particular 2315 /// boundary. 
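/// Note: the actual `maskmovdqu` instruction (and its non-temporal hint) is only used when SSE2 builtins are
/// available; the portable fallbacks perform ordinary byte stores instead.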
2316 void _mm_maskmoveu_si128 (__m128i a, __m128i mask, void* mem_addr) @trusted 2317 { 2318 static if (GDC_with_SSE2) 2319 { 2320 return __builtin_ia32_maskmovdqu(cast(ubyte16)a, cast(ubyte16)mask, cast(char*)mem_addr); 2321 } 2322 else static if (LDC_with_SSE2) 2323 { 2324 return __builtin_ia32_maskmovdqu(cast(byte16)a, cast(byte16)mask, cast(char*)mem_addr); 2325 } 2326 else static if (LDC_with_ARM64) 2327 { 2328 // PERF: catastrophic on ARM32 2329 byte16 bmask = cast(byte16)mask; 2330 byte16 shift = 7; 2331 bmask = bmask >> shift; // sign-extend to have a 0xff or 0x00 mask 2332 mask = cast(__m128i) bmask; 2333 __m128i dest = loadUnaligned!__m128i(cast(int*)mem_addr); 2334 dest = (a & mask) | (dest & ~mask); 2335 storeUnaligned!__m128i(dest, cast(int*)mem_addr); 2336 } 2337 else 2338 { 2339 byte16 b = cast(byte16)a; 2340 byte16 m = cast(byte16)mask; 2341 byte* dest = cast(byte*)(mem_addr); 2342 foreach(j; 0..16) 2343 { 2344 if (m.array[j] & 128) 2345 { 2346 dest[j] = b.array[j]; 2347 } 2348 } 2349 } 2350 } 2351 unittest 2352 { 2353 ubyte[16] dest = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]; 2354 __m128i mask = _mm_setr_epi8(0,-1, 0,-1,-1, 1,-1,-1, 0,-1,-4,-1,-1, 0,-127, 0); 2355 __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15); 2356 _mm_maskmoveu_si128(A, mask, dest.ptr); 2357 ubyte[16] correct = [42, 1,42, 3, 4,42, 6, 7,42, 9,10,11,12,42,14,42]; 2358 assert(dest == correct); 2359 } 2360 2361 /// Compare packed signed 16-bit integers in `a` and `b`, and return packed maximum values. 2362 __m128i _mm_max_epi16 (__m128i a, __m128i b) pure @safe 2363 { 2364 static if (GDC_with_SSE2) 2365 { 2366 return cast(__m128i) __builtin_ia32_pmaxsw128(cast(short8)a, cast(short8)b); 2367 } 2368 else version(LDC) 2369 { 2370 // x86: pmaxsw since LDC 1.0 -O1 2371 // ARM: smax.8h since LDC 1.5 -01 2372 short8 sa = cast(short8)a; 2373 short8 sb = cast(short8)b; 2374 static if (SIMD_COMPARISON_MASKS_16B) 2375 short8 greater = sa > sb; 2376 else 2377 short8 greater = greaterMask!short8(sa, sb); 2378 return cast(__m128i)( (greater & sa) | (~greater & sb) ); 2379 } 2380 else 2381 { 2382 __m128i lowerShorts = _mm_cmpgt_epi16(a, b); // ones where a should be selected, b else 2383 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b 2384 __m128i mask = _mm_and_si128(aTob, lowerShorts); 2385 return _mm_xor_si128(b, mask); 2386 } 2387 } 2388 unittest 2389 { 2390 short8 R = cast(short8) _mm_max_epi16(_mm_setr_epi16(32767, 1, -4, -8, 9, 7, 0,-57), 2391 _mm_setr_epi16(-4,-8, 9, 7, 0,-32768, 0, 0)); 2392 short[8] correct = [32767, 1, 9, 7, 9, 7, 0, 0]; 2393 assert(R.array == correct); 2394 } 2395 2396 /// Compare packed unsigned 8-bit integers in a and b, and return packed maximum values. 
2397 __m128i _mm_max_epu8 (__m128i a, __m128i b) pure @safe 2398 { 2399 // PERF DMD 2400 static if (GDC_with_SSE2) 2401 { 2402 return cast(__m128i) __builtin_ia32_pmaxub128(cast(ubyte16)a, cast(ubyte16)b); 2403 } 2404 else version(LDC) 2405 { 2406 // x86: pmaxub since LDC 1.0.0 -O1 2407 // ARM64: umax.16b since LDC 1.5.0 -O1 2408 // PERF: catastrophic on ARM32 2409 ubyte16 sa = cast(ubyte16)a; 2410 ubyte16 sb = cast(ubyte16)b; 2411 static if (SIMD_COMPARISON_MASKS_16B) 2412 ubyte16 greater = (cast(ubyte16)a > cast(ubyte16)b); 2413 else 2414 ubyte16 greater = cast(ubyte16) greaterMask!ubyte16(sa, sb); 2415 return cast(__m128i)( (greater & sa) | (~greater & sb) ); 2416 } 2417 else 2418 { 2419 // PERF: use algorithm from _mm_max_epu16 2420 __m128i value128 = _mm_set1_epi8(-128); 2421 __m128i higher = _mm_cmpgt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison 2422 __m128i aTob = a ^ b; // a ^ (a ^ b) == b 2423 __m128i mask = aTob & higher; 2424 return b ^ mask; 2425 2426 } 2427 } 2428 unittest 2429 { 2430 byte16 R = cast(byte16) _mm_max_epu8(_mm_setr_epi8(45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0), 2431 _mm_setr_epi8(-4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57)); 2432 byte[16] correct = [-4,-8, -4, -8, 9,-57, 0,-57, -4,-8, -4, -8, 9,-57, 0,-57]; 2433 assert(R.array == correct); 2434 } 2435 2436 /// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return 2437 /// packed maximum values. 2438 __m128d _mm_max_pd (__m128d a, __m128d b) pure @trusted 2439 { 2440 static if (GDC_with_SSE2) 2441 { 2442 return __builtin_ia32_maxpd(a, b); 2443 } 2444 else 2445 { 2446 // x86: Generates maxpd starting with LDC 1.9 -O2 2447 a.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0]; 2448 a.ptr[1] = (a.array[1] > b.array[1]) ? a.array[1] : b.array[1]; 2449 return a; 2450 } 2451 } 2452 unittest 2453 { 2454 __m128d A = _mm_setr_pd(4.0, 1.0); 2455 __m128d B = _mm_setr_pd(1.0, 8.0); 2456 __m128d M = _mm_max_pd(A, B); 2457 assert(M.array[0] == 4.0); 2458 assert(M.array[1] == 8.0); 2459 } 2460 2461 /// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`, store the maximum value in the 2462 /// lower element of result, and copy the upper element from `a` to the upper element of result. 2463 __m128d _mm_max_sd (__m128d a, __m128d b) pure @trusted 2464 { 2465 static if (GDC_with_SSE2) 2466 { 2467 return __builtin_ia32_maxsd(a, b); 2468 } 2469 else 2470 { 2471 __m128d r = a; 2472 // Generates maxsd starting with LDC 1.3 2473 r.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0]; 2474 return r; 2475 } 2476 } 2477 unittest 2478 { 2479 __m128d A = _mm_setr_pd(1.0, 1.0); 2480 __m128d B = _mm_setr_pd(4.0, 2.0); 2481 __m128d M = _mm_max_sd(A, B); 2482 assert(M.array[0] == 4.0); 2483 assert(M.array[1] == 1.0); 2484 } 2485 2486 /// Perform a serializing operation on all load-from-memory and store-to-memory instructions that were issued prior to 2487 /// this instruction. Guarantees that every memory access that precedes, in program order, the memory fence instruction 2488 /// is globally visible before any memory instruction which follows the fence in program order. 2489 void _mm_mfence() @trusted // not pure! 
2490 { 2491 version(GNU) 2492 { 2493 static if (GDC_with_SSE2) 2494 { 2495 __builtin_ia32_mfence(); 2496 } 2497 else version(X86) 2498 { 2499 asm pure nothrow @nogc @trusted 2500 { 2501 "mfence;\n" : : : ; 2502 } 2503 } 2504 else 2505 static assert(false); 2506 } 2507 else static if (LDC_with_SSE2) 2508 { 2509 __builtin_ia32_mfence(); 2510 } 2511 else static if (DMD_with_asm) 2512 { 2513 asm nothrow @nogc pure @trusted 2514 { 2515 mfence; 2516 } 2517 } 2518 else version(LDC) 2519 { 2520 // Note: will generate the DMB ish instruction on ARM 2521 llvm_memory_fence(); 2522 } 2523 else 2524 static assert(false); 2525 } 2526 unittest 2527 { 2528 _mm_mfence(); 2529 } 2530 2531 /// Compare packed signed 16-bit integers in `a` and `b`, and return packed minimum values. 2532 __m128i _mm_min_epi16 (__m128i a, __m128i b) pure @safe 2533 { 2534 static if (GDC_with_SSE2) 2535 { 2536 return cast(__m128i) __builtin_ia32_pminsw128(cast(short8)a, cast(short8)b); 2537 } 2538 else version(LDC) 2539 { 2540 // x86: pminsw since LDC 1.0 -O1 2541 // ARM64: smin.8h since LDC 1.5 -01 2542 short8 sa = cast(short8)a; 2543 short8 sb = cast(short8)b; 2544 static if (SIMD_COMPARISON_MASKS_16B) 2545 short8 greater = sa > sb; 2546 else 2547 short8 greater = greaterMask!short8(sa, sb); 2548 return cast(__m128i)( (~greater & sa) | (greater & sb) ); 2549 } 2550 else 2551 { 2552 __m128i lowerShorts = _mm_cmplt_epi16(a, b); // ones where a should be selected, b else 2553 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b 2554 __m128i mask = _mm_and_si128(aTob, lowerShorts); 2555 return _mm_xor_si128(b, mask); 2556 } 2557 } 2558 unittest 2559 { 2560 short8 R = cast(short8) _mm_min_epi16(_mm_setr_epi16(45, 1, -4, -8, 9, 7, 0,-32768), 2561 _mm_setr_epi16(-4,-8, 9, 7, 0,-57, 0, 0)); 2562 short[8] correct = [-4,-8, -4, -8, 0,-57, 0, -32768]; 2563 assert(R.array == correct); 2564 } 2565 2566 /// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed minimum values. 2567 __m128i _mm_min_epu8 (__m128i a, __m128i b) pure @safe 2568 { 2569 static if (GDC_with_SSE2) 2570 { 2571 return cast(__m128i) __builtin_ia32_pminub128(cast(ubyte16)a, cast(ubyte16)b); 2572 } 2573 else version(LDC) 2574 { 2575 // x86: pminub since LDC 1.0.0 -O1 2576 // ARM: umin.16b since LDC 1.5.0 -O1 2577 // PERF: catastrophic on ARM32 2578 ubyte16 sa = cast(ubyte16)a; 2579 ubyte16 sb = cast(ubyte16)b; 2580 static if (SIMD_COMPARISON_MASKS_16B) 2581 ubyte16 greater = (cast(ubyte16)a > cast(ubyte16)b); 2582 else 2583 ubyte16 greater = cast(ubyte16) greaterMask!ubyte16(sa, sb); 2584 return cast(__m128i)( (~greater & sa) | (greater & sb) ); 2585 } 2586 else 2587 { 2588 // PERF: use the algorithm from _mm_max_epu16 2589 __m128i value128 = _mm_set1_epi8(-128); 2590 __m128i lower = _mm_cmplt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison 2591 __m128i aTob = a ^ b; // a ^ (a ^ b) == b 2592 __m128i mask = aTob & lower; 2593 return b ^ mask; 2594 } 2595 } 2596 unittest 2597 { 2598 byte16 R = cast(byte16) _mm_min_epu8(_mm_setr_epi8(45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0), 2599 _mm_setr_epi8(-4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57)); 2600 byte[16] correct = [45, 1, 9, 7, 0, 7, 0, 0, 45, 1, 9, 7, 0, 7, 0, 0]; 2601 assert(R.array == correct); 2602 } 2603 2604 /// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return packed minimum values. 
2605 __m128d _mm_min_pd (__m128d a, __m128d b) pure @trusted 2606 { 2607 static if (GDC_with_SSE2) 2608 { 2609 return __builtin_ia32_minpd(a, b); 2610 } 2611 else 2612 { 2613 // Generates minpd starting with LDC 1.9 2614 a.ptr[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0]; 2615 a.ptr[1] = (a.array[1] < b.array[1]) ? a.array[1] : b.array[1]; 2616 return a; 2617 } 2618 } 2619 unittest 2620 { 2621 __m128d A = _mm_setr_pd(1.0, 2.0); 2622 __m128d B = _mm_setr_pd(4.0, 1.0); 2623 __m128d M = _mm_min_pd(A, B); 2624 assert(M.array[0] == 1.0); 2625 assert(M.array[1] == 1.0); 2626 } 2627 2628 /// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`, store the minimum value in 2629 /// the lower element of result, and copy the upper element from `a` to the upper element of result. 2630 __m128d _mm_min_sd (__m128d a, __m128d b) pure @safe 2631 { 2632 static if (GDC_with_SSE2) 2633 { 2634 return __builtin_ia32_minsd(a, b); 2635 } 2636 else 2637 { 2638 // Generates minsd starting with LDC 1.3 2639 __m128d r = a; 2640 r.array[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0]; 2641 return r; 2642 } 2643 } 2644 unittest 2645 { 2646 __m128d A = _mm_setr_pd(1.0, 3.0); 2647 __m128d B = _mm_setr_pd(4.0, 2.0); 2648 __m128d M = _mm_min_sd(A, B); 2649 assert(M.array[0] == 1.0); 2650 assert(M.array[1] == 3.0); 2651 } 2652 2653 /// Copy the lower 64-bit integer in `a` to the lower element of result, and zero the upper element. 2654 __m128i _mm_move_epi64 (__m128i a) pure @trusted 2655 { 2656 static if (GDC_with_SSE2) 2657 { 2658 // slightly better with GDC -O0 2659 return cast(__m128i) __builtin_ia32_movq128(cast(long2)a); 2660 } 2661 else 2662 { 2663 long2 result = [ 0, 0 ]; 2664 long2 la = cast(long2) a; 2665 result.ptr[0] = la.array[0]; 2666 return cast(__m128i)(result); 2667 } 2668 } 2669 unittest 2670 { 2671 long2 A = [13, 47]; 2672 long2 B = cast(long2) _mm_move_epi64( cast(__m128i)A ); 2673 long[2] correct = [13, 0]; 2674 assert(B.array == correct); 2675 } 2676 2677 /// Move the lower double-precision (64-bit) floating-point element from `b` to the lower element of result, and copy 2678 /// the upper element from `a` to the upper element of dst. 2679 __m128d _mm_move_sd (__m128d a, __m128d b) pure @trusted 2680 { 2681 static if (GDC_with_SSE2) 2682 { 2683 return __builtin_ia32_movsd(a, b); 2684 } 2685 else 2686 { 2687 b.ptr[1] = a.array[1]; 2688 return b; 2689 } 2690 } 2691 unittest 2692 { 2693 double2 A = [13.0, 47.0]; 2694 double2 B = [34.0, 58.0]; 2695 double2 C = _mm_move_sd(A, B); 2696 double[2] correct = [34.0, 47.0]; 2697 assert(C.array == correct); 2698 } 2699 2700 /// Create mask from the most significant bit of each 8-bit element in `v`. 2701 int _mm_movemask_epi8 (__m128i a) pure @trusted 2702 { 2703 // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047 2704 static if (GDC_with_SSE2) 2705 { 2706 return __builtin_ia32_pmovmskb128(cast(ubyte16)a); 2707 } 2708 else static if (LDC_with_SSE2) 2709 { 2710 return __builtin_ia32_pmovmskb128(cast(byte16)a); 2711 } 2712 else static if (LDC_with_ARM64) 2713 { 2714 // Solution from https://stackoverflow.com/questions/11870910/sse-mm-movemask-epi8-equivalent-method-for-arm-neon 2715 // The other two solutions lead to unfound intrinsics in LLVM and that took a long time. 2716 // SO there might be something a bit faster, but this one is reasonable and branchless. 
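        // Isolate each byte's sign bit, shift it down so that byte i contributes bit i within its
        // 64-bit half, then three pairwise adds collapse the 8 bits of each half into a single byte.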
        byte8 mask_shift;
        mask_shift.ptr[0] = 7;
        mask_shift.ptr[1] = 6;
        mask_shift.ptr[2] = 5;
        mask_shift.ptr[3] = 4;
        mask_shift.ptr[4] = 3;
        mask_shift.ptr[5] = 2;
        mask_shift.ptr[6] = 1;
        mask_shift.ptr[7] = 0;
        byte8 mask_and = byte8(-128);
        byte8 lo = vget_low_u8(cast(byte16)a);
        byte8 hi = vget_high_u8(cast(byte16)a);
        lo = vand_u8(lo, mask_and);
        lo = vshr_u8(lo, mask_shift);
        hi = vand_u8(hi, mask_and);
        hi = vshr_u8(hi, mask_shift);
        lo = vpadd_u8(lo,lo);
        lo = vpadd_u8(lo,lo);
        lo = vpadd_u8(lo,lo);
        hi = vpadd_u8(hi,hi);
        hi = vpadd_u8(hi,hi);
        hi = vpadd_u8(hi,hi);
        return (cast(ubyte)(hi[0]) << 8) | cast(ubyte)(lo[0]);
    }
    else
    {
        byte16 ai = cast(byte16)a;
        int r = 0;
        foreach(bit; 0..16)
        {
            if (ai.array[bit] < 0) r += (1 << bit);
        }
        return r;
    }
}
unittest
{
    assert(0x9C36 == _mm_movemask_epi8(_mm_set_epi8(-1, 1, 2, -3, -1, -1, 4, 8, 127, 0, -1, -1, 0, -1, -1, 0)));
}

/// Create mask from the most significant bit of each 16-bit element in `a`. #BONUS
int _mm_movemask_epi16 (__m128i a) pure @trusted
{
    return _mm_movemask_epi8(_mm_packs_epi16(a, _mm_setzero_si128()));
}
unittest
{
    assert(0x9C == _mm_movemask_epi16(_mm_set_epi16(-1, 1, 2, -3, -32768, -1, 32767, 8)));
}

/// Set each bit of mask result based on the most significant bit of the corresponding packed double-precision (64-bit)
/// floating-point element in `v`.
int _mm_movemask_pd(__m128d v) pure @safe
{
    // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047
    static if (GDC_or_LDC_with_SSE2)
    {
        return __builtin_ia32_movmskpd(v);
    }
    else
    {
        long2 lv = cast(long2)v;
        int r = 0;
        if (lv.array[0] < 0) r += 1;
        if (lv.array[1] < 0) r += 2;
        return r;
    }
}
unittest
{
    __m128d A = cast(__m128d) _mm_set_epi64x(-1, 0);
    assert(_mm_movemask_pd(A) == 2);
}

/// Copy the lower 64-bit integer in `v`.
__m64 _mm_movepi64_pi64 (__m128i v) pure @safe
{
    long2 lv = cast(long2)v;
    return long1(lv.array[0]);
}
unittest
{
    __m128i A = _mm_set_epi64x(-1, -2);
    __m64 R = _mm_movepi64_pi64(A);
    assert(R.array[0] == -2);
}

/// Copy the 64-bit integer `a` to the lower element of result, and zero the upper element.
__m128i _mm_movpi64_epi64 (__m64 a) pure @trusted
{
    long2 r;
    r.ptr[0] = a.array[0];
    r.ptr[1] = 0;
    return cast(__m128i)r;
}

/// Multiply the low unsigned 32-bit integers from each packed 64-bit element in `a` and `b`,
/// and store the unsigned 64-bit results.
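/// In other words, elements 0 and 2 of the 32-bit view of `a` and `b` are multiplied as unsigned values,
/// yielding two 64-bit products.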
2815 __m128i _mm_mul_epu32 (__m128i a, __m128i b) pure @trusted 2816 { 2817 // PERF DMD D_SIMD 2818 static if (GDC_with_SSE2) 2819 { 2820 return cast(__m128i) __builtin_ia32_pmuludq128 (a, b); 2821 } 2822 else 2823 { 2824 version(LDC) 2825 { 2826 static if (__VERSION__ >= 2088) 2827 { 2828 // Need LLVM9 for proper optimization 2829 long2 la, lb; 2830 la.ptr[0] = cast(uint)a.array[0]; 2831 la.ptr[1] = cast(uint)a.array[2]; 2832 lb.ptr[0] = cast(uint)b.array[0]; 2833 lb.ptr[1] = cast(uint)b.array[2]; 2834 } 2835 else 2836 { 2837 __m128i zero; 2838 zero = 0; 2839 long2 la = cast(long2) shufflevectorLDC!(int4, 0, 4, 2, 6)(a, zero); 2840 long2 lb = cast(long2) shufflevectorLDC!(int4, 0, 4, 2, 6)(b, zero); 2841 } 2842 } 2843 else 2844 { 2845 long2 la, lb; 2846 la.ptr[0] = cast(uint)a.array[0]; 2847 la.ptr[1] = cast(uint)a.array[2]; 2848 lb.ptr[0] = cast(uint)b.array[0]; 2849 lb.ptr[1] = cast(uint)b.array[2]; 2850 } 2851 2852 version(DigitalMars) 2853 { 2854 // DMD has no long2 mul 2855 la.ptr[0] *= lb.array[0]; 2856 la.ptr[1] *= lb.array[1]; 2857 return cast(__m128i)(la); 2858 } 2859 else 2860 { 2861 static if (__VERSION__ >= 2076) 2862 { 2863 return cast(__m128i)(la * lb); 2864 } 2865 else 2866 { 2867 // long2 mul not supported before LDC 1.5 2868 la.ptr[0] *= lb.array[0]; 2869 la.ptr[1] *= lb.array[1]; 2870 return cast(__m128i)(la); 2871 } 2872 } 2873 } 2874 } 2875 unittest 2876 { 2877 __m128i A = _mm_set_epi32(42, 0xDEADBEEF, 42, 0xffffffff); 2878 __m128i B = _mm_set_epi32(42, 0xCAFEBABE, 42, 0xffffffff); 2879 __m128i C = _mm_mul_epu32(A, B); 2880 long2 LC = cast(long2)C; 2881 assert(LC.array[0] == 18446744065119617025uL); 2882 assert(LC.array[1] == 12723420444339690338uL); 2883 } 2884 2885 /// Multiply packed double-precision (64-bit) floating-point elements in `a` and `b`, and return the results. 2886 __m128d _mm_mul_pd(__m128d a, __m128d b) pure @safe 2887 { 2888 pragma(inline, true); 2889 return a * b; 2890 } 2891 unittest 2892 { 2893 __m128d a = [-2.0, 1.5]; 2894 a = _mm_mul_pd(a, a); 2895 assert(a.array == [4.0, 2.25]); 2896 } 2897 2898 /// Multiply the lower double-precision (64-bit) floating-point element in `a` and `b`, store the result in the lower 2899 /// element of result, and copy the upper element from `a` to the upper element of result. 2900 __m128d _mm_mul_sd(__m128d a, __m128d b) pure @trusted 2901 { 2902 version(DigitalMars) 2903 { 2904 // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599 2905 // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again 2906 asm pure nothrow @nogc @trusted { nop;} 2907 a.array[0] = a.array[0] * b.array[0]; 2908 return a; 2909 } 2910 else static if (GDC_with_SSE2) 2911 { 2912 return __builtin_ia32_mulsd(a, b); 2913 } 2914 else 2915 { 2916 a.ptr[0] *= b.array[0]; 2917 return a; 2918 } 2919 } 2920 unittest 2921 { 2922 __m128d a = [-2.0, 1.5]; 2923 a = _mm_mul_sd(a, a); 2924 assert(a.array == [4.0, 1.5]); 2925 } 2926 2927 /// Multiply the low unsigned 32-bit integers from `a` and `b`, 2928 /// and get an unsigned 64-bit result. 
2929 __m64 _mm_mul_su32 (__m64 a, __m64 b) pure @safe 2930 { 2931 return to_m64(_mm_mul_epu32(to_m128i(a), to_m128i(b))); 2932 } 2933 unittest 2934 { 2935 __m64 A = _mm_set_pi32(42, 0xDEADBEEF); 2936 __m64 B = _mm_set_pi32(42, 0xCAFEBABE); 2937 __m64 C = _mm_mul_su32(A, B); 2938 assert(C.array[0] == 0xDEADBEEFuL * 0xCAFEBABEuL); 2939 } 2940 2941 /// Multiply the packed signed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the 2942 /// high 16 bits of the intermediate integers. 2943 __m128i _mm_mulhi_epi16 (__m128i a, __m128i b) pure @trusted 2944 { 2945 static if (GDC_with_SSE2) 2946 { 2947 return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b); 2948 } 2949 else static if (LDC_with_SSE2) 2950 { 2951 return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b); 2952 } 2953 else 2954 { 2955 // ARM64: LDC 1.5 -O2 or later gives a nice sequence with 2 x ext.16b, 2 x smull.4s and shrn.4h shrn2.8h 2956 // PERF: it seems the simde solution has one less instruction in ARM64. 2957 // PERF: Catastrophic in ARM32. 2958 short8 sa = cast(short8)a; 2959 short8 sb = cast(short8)b; 2960 short8 r = void; 2961 r.ptr[0] = (sa.array[0] * sb.array[0]) >> 16; 2962 r.ptr[1] = (sa.array[1] * sb.array[1]) >> 16; 2963 r.ptr[2] = (sa.array[2] * sb.array[2]) >> 16; 2964 r.ptr[3] = (sa.array[3] * sb.array[3]) >> 16; 2965 r.ptr[4] = (sa.array[4] * sb.array[4]) >> 16; 2966 r.ptr[5] = (sa.array[5] * sb.array[5]) >> 16; 2967 r.ptr[6] = (sa.array[6] * sb.array[6]) >> 16; 2968 r.ptr[7] = (sa.array[7] * sb.array[7]) >> 16; 2969 return cast(__m128i)r; 2970 } 2971 } 2972 unittest 2973 { 2974 __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7); 2975 __m128i B = _mm_set1_epi16(16384); 2976 short8 R = cast(short8)_mm_mulhi_epi16(A, B); 2977 short[8] correct = [0, -4, 0, 0, 1, 2, 4, 1]; 2978 assert(R.array == correct); 2979 } 2980 2981 /// Multiply the packed unsigned 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the 2982 /// high 16 bits of the intermediate integers. 2983 __m128i _mm_mulhi_epu16 (__m128i a, __m128i b) pure @trusted 2984 { 2985 static if (GDC_with_SSE2) 2986 { 2987 return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b); 2988 } 2989 else static if (LDC_with_SSE2) 2990 { 2991 return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b); 2992 } 2993 else 2994 { 2995 // ARM64: LDC 1.5 -O2 or later gives a nice sequence with 2 x ext.16b, 2 x umull.4s and shrn.4h shrn2.8h 2996 // it seems the simde solution has one less instruction in ARM64 2997 // PERF: Catastrophic in ARM32. 
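        // Scalar fallback: each lane pair is reinterpreted as unsigned, multiplied as a 32-bit product,
        // and bits 16..31 of that product are kept. The final cast(short) only keeps those 16 bits, so the
        // sign of the intermediate `int` product does not affect the result.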
2998 short8 sa = cast(short8)a; 2999 short8 sb = cast(short8)b; 3000 short8 r = void; 3001 r.ptr[0] = cast(short)( (cast(ushort)sa.array[0] * cast(ushort)sb.array[0]) >> 16 ); 3002 r.ptr[1] = cast(short)( (cast(ushort)sa.array[1] * cast(ushort)sb.array[1]) >> 16 ); 3003 r.ptr[2] = cast(short)( (cast(ushort)sa.array[2] * cast(ushort)sb.array[2]) >> 16 ); 3004 r.ptr[3] = cast(short)( (cast(ushort)sa.array[3] * cast(ushort)sb.array[3]) >> 16 ); 3005 r.ptr[4] = cast(short)( (cast(ushort)sa.array[4] * cast(ushort)sb.array[4]) >> 16 ); 3006 r.ptr[5] = cast(short)( (cast(ushort)sa.array[5] * cast(ushort)sb.array[5]) >> 16 ); 3007 r.ptr[6] = cast(short)( (cast(ushort)sa.array[6] * cast(ushort)sb.array[6]) >> 16 ); 3008 r.ptr[7] = cast(short)( (cast(ushort)sa.array[7] * cast(ushort)sb.array[7]) >> 16 ); 3009 return cast(__m128i)r; 3010 } 3011 } 3012 unittest 3013 { 3014 __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7); 3015 __m128i B = _mm_set1_epi16(16384); 3016 short8 R = cast(short8)_mm_mulhi_epu16(A, B); 3017 short[8] correct = [0, 0x3FFC, 0, 0, 1, 2, 4, 1]; 3018 assert(R.array == correct); 3019 } 3020 3021 /// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the low 16 3022 /// bits of the intermediate integers. 3023 __m128i _mm_mullo_epi16 (__m128i a, __m128i b) pure @safe 3024 { 3025 return cast(__m128i)(cast(short8)a * cast(short8)b); 3026 } 3027 unittest 3028 { 3029 __m128i A = _mm_setr_epi16(16384, -16, 0, 3, 4, 1, 16, 7); 3030 __m128i B = _mm_set1_epi16(16384); 3031 short8 R = cast(short8)_mm_mullo_epi16(A, B); 3032 short[8] correct = [0, 0, 0, -16384, 0, 16384, 0, -16384]; 3033 assert(R.array == correct); 3034 } 3035 3036 /// Compute the bitwise NOT of 128 bits in `a`. #BONUS 3037 __m128i _mm_not_si128 (__m128i a) pure @safe 3038 { 3039 return ~a; 3040 } 3041 unittest 3042 { 3043 __m128i A = _mm_set1_epi32(-748); 3044 int4 notA = cast(int4) _mm_not_si128(A); 3045 int[4] correct = [747, 747, 747, 747]; 3046 assert(notA.array == correct); 3047 } 3048 3049 /// Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in `a` and `b`. 3050 __m128d _mm_or_pd (__m128d a, __m128d b) pure @safe 3051 { 3052 pragma(inline, true); 3053 return cast(__m128d)( cast(__m128i)a | cast(__m128i)b ); 3054 } 3055 3056 /// Compute the bitwise OR of 128 bits (representing integer data) in `a` and `b`. 3057 __m128i _mm_or_si128 (__m128i a, __m128i b) pure @safe 3058 { 3059 pragma(inline, true); 3060 return a | b; 3061 } 3062 3063 /// Convert packed signed 32-bit integers from `a` and `b` to packed 16-bit integers using signed saturation. 
3064 __m128i _mm_packs_epi32 (__m128i a, __m128i b) pure @trusted 3065 { 3066 static if (DMD_with_DSIMD) 3067 { 3068 return cast(__m128i) __simd(XMM.PACKSSDW, a, b); 3069 } 3070 else static if (GDC_with_SSE2) 3071 { 3072 return cast(__m128i) __builtin_ia32_packssdw128(a, b); 3073 } 3074 else static if (LDC_with_SSE2) 3075 { 3076 return cast(__m128i) __builtin_ia32_packssdw128(a, b); 3077 } 3078 else static if (LDC_with_ARM64) 3079 { 3080 short4 ra = vqmovn_s32(cast(int4)a); 3081 short4 rb = vqmovn_s32(cast(int4)b); 3082 return cast(__m128i)vcombine_s16(ra, rb); 3083 } 3084 else 3085 { 3086 // PERF: catastrophic on ARM32 3087 short8 r; 3088 r.ptr[0] = saturateSignedIntToSignedShort(a.array[0]); 3089 r.ptr[1] = saturateSignedIntToSignedShort(a.array[1]); 3090 r.ptr[2] = saturateSignedIntToSignedShort(a.array[2]); 3091 r.ptr[3] = saturateSignedIntToSignedShort(a.array[3]); 3092 r.ptr[4] = saturateSignedIntToSignedShort(b.array[0]); 3093 r.ptr[5] = saturateSignedIntToSignedShort(b.array[1]); 3094 r.ptr[6] = saturateSignedIntToSignedShort(b.array[2]); 3095 r.ptr[7] = saturateSignedIntToSignedShort(b.array[3]); 3096 return cast(__m128i)r; 3097 } 3098 } 3099 unittest 3100 { 3101 __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0); 3102 short8 R = cast(short8) _mm_packs_epi32(A, A); 3103 short[8] correct = [32767, -32768, 1000, 0, 32767, -32768, 1000, 0]; 3104 assert(R.array == correct); 3105 } 3106 3107 /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using signed saturation. 3108 __m128i _mm_packs_epi16 (__m128i a, __m128i b) pure @trusted 3109 { 3110 static if (DMD_with_DSIMD) 3111 { 3112 return cast(__m128i) __simd(XMM.PACKSSWB, a, b); 3113 } 3114 else static if (GDC_with_SSE2) 3115 { 3116 return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b); 3117 } 3118 else static if (LDC_with_SSE2) 3119 { 3120 return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b); 3121 } 3122 else static if (LDC_with_ARM64) 3123 { 3124 // generate a nice pair of sqxtn.8b + sqxtn2 since LDC 1.5 -02 3125 byte8 ra = vqmovn_s16(cast(short8)a); 3126 byte8 rb = vqmovn_s16(cast(short8)b); 3127 return cast(__m128i)vcombine_s8(ra, rb); 3128 } 3129 else 3130 { 3131 // PERF: ARM32 is missing 3132 byte16 r; 3133 short8 sa = cast(short8)a; 3134 short8 sb = cast(short8)b; 3135 foreach(i; 0..8) 3136 r.ptr[i] = saturateSignedWordToSignedByte(sa.array[i]); 3137 foreach(i; 0..8) 3138 r.ptr[i+8] = saturateSignedWordToSignedByte(sb.array[i]); 3139 return cast(__m128i)r; 3140 } 3141 } 3142 unittest 3143 { 3144 __m128i A = _mm_setr_epi16(1000, -1000, 1000, 0, 256, -129, 254, 0); 3145 byte16 R = cast(byte16) _mm_packs_epi16(A, A); 3146 byte[16] correct = [127, -128, 127, 0, 127, -128, 127, 0, 3147 127, -128, 127, 0, 127, -128, 127, 0]; 3148 assert(R.array == correct); 3149 } 3150 3151 /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using unsigned saturation. 
3152 __m128i _mm_packus_epi16 (__m128i a, __m128i b) pure @trusted 3153 { 3154 // PERF DMD catastrophic 3155 static if (DMD_with_DSIMD) 3156 { 3157 return cast(__m128i) __simd(XMM.PACKUSWB, a, b); 3158 } 3159 else static if (GDC_with_SSE2) 3160 { 3161 return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b); 3162 } 3163 else static if (LDC_with_SSE2) 3164 { 3165 return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b); 3166 } 3167 else static if (LDC_with_ARM64) 3168 { 3169 // generate a nice pair of sqxtun + sqxtun2 since LDC 1.5 -02 3170 byte8 ra = vqmovun_s16(cast(short8)a); 3171 byte8 rb = vqmovun_s16(cast(short8)b); 3172 return cast(__m128i)vcombine_s8(ra, rb); 3173 } 3174 else 3175 { 3176 short8 sa = cast(short8)a; 3177 short8 sb = cast(short8)b; 3178 align(16) ubyte[16] result = void; 3179 for (int i = 0; i < 8; ++i) 3180 { 3181 short s = sa[i]; 3182 if (s < 0) s = 0; 3183 if (s > 255) s = 255; 3184 result[i] = cast(ubyte)s; 3185 3186 s = sb[i]; 3187 if (s < 0) s = 0; 3188 if (s > 255) s = 255; 3189 result[i+8] = cast(ubyte)s; 3190 } 3191 return *cast(__m128i*)(result.ptr); 3192 } 3193 } 3194 unittest 3195 { 3196 __m128i A = _mm_setr_epi16(-10, 400, 0, 256, 255, 2, 1, 0); 3197 byte16 AA = cast(byte16) _mm_packus_epi16(A, A); 3198 static immutable ubyte[16] correctResult = [0, 255, 0, 255, 255, 2, 1, 0, 3199 0, 255, 0, 255, 255, 2, 1, 0]; 3200 foreach(i; 0..16) 3201 assert(AA.array[i] == cast(byte)(correctResult[i])); 3202 } 3203 3204 /// Provide a hint to the processor that the code sequence is a spin-wait loop. This can help improve the performance 3205 /// and power consumption of spin-wait loops. 3206 void _mm_pause() @trusted 3207 { 3208 version(GNU) 3209 { 3210 static if (GDC_with_SSE2) 3211 { 3212 __builtin_ia32_pause(); 3213 } 3214 else version(X86) 3215 { 3216 asm pure nothrow @nogc @trusted 3217 { 3218 "pause;\n" : : : ; 3219 } 3220 } 3221 else 3222 static assert(false); 3223 } 3224 else static if (LDC_with_SSE2) 3225 { 3226 __builtin_ia32_pause(); 3227 } 3228 else static if (DMD_with_asm) 3229 { 3230 asm nothrow @nogc pure @trusted 3231 { 3232 rep; nop; // F3 90 = pause 3233 } 3234 } 3235 else version (LDC) 3236 { 3237 // PERF: Do nothing currently , could be the "yield" intruction on ARM. 3238 } 3239 else 3240 static assert(false); 3241 } 3242 unittest 3243 { 3244 _mm_pause(); 3245 } 3246 3247 /// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`, then horizontally sum each 3248 /// consecutive 8 differences to produce two unsigned 16-bit integers, and pack these unsigned 16-bit integers in the 3249 /// low 16 bits of 64-bit elements in result. 
3250 __m128i _mm_sad_epu8 (__m128i a, __m128i b) pure @trusted 3251 { 3252 static if (GDC_with_SSE2) 3253 { 3254 return cast(__m128i) __builtin_ia32_psadbw128(cast(ubyte16)a, cast(ubyte16)b); 3255 } 3256 else static if (LDC_with_SSE2) 3257 { 3258 return cast(__m128i) __builtin_ia32_psadbw128(cast(byte16)a, cast(byte16)b); 3259 } 3260 else static if (LDC_with_ARM64) 3261 { 3262 ushort8 t = cast(ushort8) vpaddlq_u8(vabdq_u8(cast(byte16) a, cast(byte16) b)); 3263 3264 // PERF: Looks suboptimal vs addp 3265 ushort r0 = cast(ushort)(t[0] + t[1] + t[2] + t[3]); 3266 ushort r4 = cast(ushort)(t[4] + t[5] + t[6] + t[7]); 3267 ushort8 r = 0; 3268 r[0] = r0; 3269 r[4] = r4; 3270 return cast(__m128i) r; 3271 } 3272 else 3273 { 3274 // PERF: ARM32 is lacking 3275 byte16 ab = cast(byte16)a; 3276 byte16 bb = cast(byte16)b; 3277 ubyte[16] t; 3278 foreach(i; 0..16) 3279 { 3280 int diff = cast(ubyte)(ab.array[i]) - cast(ubyte)(bb.array[i]); 3281 if (diff < 0) diff = -diff; 3282 t[i] = cast(ubyte)(diff); 3283 } 3284 int4 r = _mm_setzero_si128(); 3285 r.ptr[0] = t[0] + t[1] + t[2] + t[3] + t[4] + t[5] + t[6] + t[7]; 3286 r.ptr[2] = t[8] + t[9] + t[10]+ t[11]+ t[12]+ t[13]+ t[14]+ t[15]; 3287 return r; 3288 } 3289 } 3290 unittest 3291 { 3292 __m128i A = _mm_setr_epi8(3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54); // primes + 1 3293 __m128i B = _mm_set1_epi8(1); 3294 __m128i R = _mm_sad_epu8(A, B); 3295 int[4] correct = [2 + 3 + 5 + 7 + 11 + 13 + 17 + 19, 3296 0, 3297 23 + 29 + 31 + 37 + 41 + 43 + 47 + 53, 3298 0]; 3299 assert(R.array == correct); 3300 } 3301 3302 /// Set packed 16-bit integers with the supplied values. 3303 __m128i _mm_set_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted 3304 { 3305 short8 r = void; 3306 r.ptr[0] = e0; 3307 r.ptr[1] = e1; 3308 r.ptr[2] = e2; 3309 r.ptr[3] = e3; 3310 r.ptr[4] = e4; 3311 r.ptr[5] = e5; 3312 r.ptr[6] = e6; 3313 r.ptr[7] = e7; 3314 return cast(__m128i) r; 3315 } 3316 unittest 3317 { 3318 __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); 3319 short8 B = cast(short8) A; 3320 foreach(i; 0..8) 3321 assert(B.array[i] == i); 3322 } 3323 3324 /// Set packed 32-bit integers with the supplied values. 3325 __m128i _mm_set_epi32 (int e3, int e2, int e1, int e0) pure @trusted 3326 { 3327 // PERF: does a constant inline correctly? vs int4 field assignment 3328 align(16) int[4] r = [e0, e1, e2, e3]; 3329 return *cast(int4*)&r; 3330 } 3331 unittest 3332 { 3333 __m128i A = _mm_set_epi32(3, 2, 1, 0); 3334 foreach(i; 0..4) 3335 assert(A.array[i] == i); 3336 } 3337 3338 /// Set packed 64-bit integers with the supplied values. 3339 __m128i _mm_set_epi64(__m64 e1, __m64 e0) pure @trusted 3340 { 3341 pragma(inline, true); 3342 long2 r = void; 3343 r.ptr[0] = e0.array[0]; 3344 r.ptr[1] = e1.array[0]; 3345 return cast(__m128i)(r); 3346 } 3347 unittest 3348 { 3349 __m128i A = _mm_set_epi64(_mm_cvtsi64_m64(1234), _mm_cvtsi64_m64(5678)); 3350 long2 B = cast(long2) A; 3351 assert(B.array[0] == 5678); 3352 assert(B.array[1] == 1234); 3353 } 3354 3355 /// Set packed 64-bit integers with the supplied values. 
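/// Note: unlike `_mm_set_epi64`, this variant takes plain `long` arguments rather than `__m64`.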
__m128i _mm_set_epi64x (long e1, long e0) pure @trusted
{
    pragma(inline, true);
    long2 r = void;
    r.ptr[0] = e0;
    r.ptr[1] = e1;
    return cast(__m128i)(r);
}
unittest
{
    __m128i A = _mm_set_epi64x(1234, -5678);
    long2 B = cast(long2) A;
    assert(B.array[0] == -5678);
    assert(B.array[1] == 1234);
}

/// Set packed 8-bit integers with the supplied values.
__m128i _mm_set_epi8 (byte e15, byte e14, byte e13, byte e12,
                      byte e11, byte e10, byte e9, byte e8,
                      byte e7, byte e6, byte e5, byte e4,
                      byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    align(16) byte[16] result = [e0, e1, e2, e3, e4, e5, e6, e7,
                                 e8, e9, e10, e11, e12, e13, e14, e15];
    return *cast(__m128i*)(result.ptr);
}
unittest
{
    byte16 R = cast(byte16) _mm_set_epi8(-1, 0, 56, 127, -128, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14);
    byte[16] correct = [14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, -128, 127, 56, 0, -1];
    assert(R.array == correct);
}

/// Set packed double-precision (64-bit) floating-point elements with the supplied values.
__m128d _mm_set_pd (double e1, double e0) pure @trusted
{
    pragma(inline, true);
    double2 r = void;
    r.ptr[0] = e0;
    r.ptr[1] = e1;
    return r;
}
unittest
{
    __m128d A = _mm_set_pd(61.0, 55.0);
    double[2] correct = [55.0, 61.0];
    assert(A.array == correct);
}

/// Broadcast double-precision (64-bit) floating-point value `a` to all elements.
__m128d _mm_set_pd1 (double a) pure @trusted
{
    pragma(inline, true);
    __m128d r = void;
    r.ptr[0] = a;
    r.ptr[1] = a;
    return r;
}
unittest
{
    __m128d A = _mm_set_pd1(61.0);
    double[2] correct = [61.0, 61.0];
    assert(A.array == correct);
}

/// Copy double-precision (64-bit) floating-point element `a` to the lower element of result,
/// and zero the upper element.
__m128d _mm_set_sd (double a) pure @trusted
{
    double2 r = void;
    r.ptr[0] = a;
    r.ptr[1] = 0.0;
    return r;
}
unittest
{
    __m128d A = _mm_set_sd(61.0);
    double[2] correct = [61.0, 0.0];
    assert(A.array == correct);
}

/// Broadcast 16-bit integer `a` to all elements of dst.
__m128i _mm_set1_epi16 (short a) pure @trusted
{
    version(DigitalMars) // workaround https://issues.dlang.org/show_bug.cgi?id=21469
    {
        short8 v = a;
        return cast(__m128i) v;
    }
    else
    {
        pragma(inline, true);
        return cast(__m128i)(short8(a));
    }
}
unittest
{
    short8 a = cast(short8) _mm_set1_epi16(31);
    for (int i = 0; i < 8; ++i)
        assert(a.array[i] == 31);
}

/// Broadcast 32-bit integer `a` to all elements.
__m128i _mm_set1_epi32 (int a) pure @trusted
{
    pragma(inline, true);
    return cast(__m128i)(int4(a));
}
unittest
{
    int4 a = cast(int4) _mm_set1_epi32(31);
    for (int i = 0; i < 4; ++i)
        assert(a.array[i] == 31);
}

/// Broadcast 64-bit integer `a` to all elements.
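/// See_also: `_mm_set1_epi64x`, which takes a plain `long`.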
3472 __m128i _mm_set1_epi64 (__m64 a) pure @safe 3473 { 3474 return _mm_set_epi64(a, a); 3475 } 3476 unittest 3477 { 3478 long b = 0x1DEADCAFE; 3479 __m64 a; 3480 a.ptr[0] = b; 3481 long2 c = cast(long2) _mm_set1_epi64(a); 3482 assert(c.array[0] == b); 3483 assert(c.array[1] == b); 3484 } 3485 3486 /// Broadcast 64-bit integer `a` to all elements 3487 __m128i _mm_set1_epi64x (long a) pure @trusted 3488 { 3489 long2 b = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470 3490 return cast(__m128i)(b); 3491 } 3492 unittest 3493 { 3494 long b = 0x1DEADCAFE; 3495 long2 c = cast(long2) _mm_set1_epi64x(b); 3496 for (int i = 0; i < 2; ++i) 3497 assert(c.array[i] == b); 3498 } 3499 3500 /// Broadcast 8-bit integer `a` to all elements. 3501 __m128i _mm_set1_epi8 (byte a) pure @trusted 3502 { 3503 pragma(inline, true); 3504 byte16 b = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470 3505 return cast(__m128i)(b); 3506 } 3507 unittest 3508 { 3509 byte16 b = cast(byte16) _mm_set1_epi8(31); 3510 for (int i = 0; i < 16; ++i) 3511 assert(b.array[i] == 31); 3512 } 3513 3514 alias _mm_set1_pd = _mm_set_pd1; 3515 3516 /// Set packed 16-bit integers with the supplied values in reverse order. 3517 __m128i _mm_setr_epi16 (short e7, short e6, short e5, short e4, 3518 short e3, short e2, short e1, short e0) pure @trusted 3519 { 3520 short8 r = void; 3521 r.ptr[0] = e7; 3522 r.ptr[1] = e6; 3523 r.ptr[2] = e5; 3524 r.ptr[3] = e4; 3525 r.ptr[4] = e3; 3526 r.ptr[5] = e2; 3527 r.ptr[6] = e1; 3528 r.ptr[7] = e0; 3529 return cast(__m128i)(r); 3530 } 3531 unittest 3532 { 3533 short8 A = cast(short8) _mm_setr_epi16(7, 6, 5, -32768, 32767, 2, 1, 0); 3534 short[8] correct = [7, 6, 5, -32768, 32767, 2, 1, 0]; 3535 assert(A.array == correct); 3536 } 3537 3538 /// Set packed 32-bit integers with the supplied values in reverse order. 3539 __m128i _mm_setr_epi32 (int e3, int e2, int e1, int e0) pure @trusted 3540 { 3541 // Performs better than = void; with GDC 3542 pragma(inline, true); 3543 align(16) int[4] result = [e3, e2, e1, e0]; 3544 return *cast(__m128i*)(result.ptr); 3545 } 3546 unittest 3547 { 3548 int4 A = cast(int4) _mm_setr_epi32(-1, 0, -2147483648, 2147483647); 3549 int[4] correct = [-1, 0, -2147483648, 2147483647]; 3550 assert(A.array == correct); 3551 } 3552 3553 /// Set packed 64-bit integers with the supplied values in reverse order. 3554 __m128i _mm_setr_epi64 (long e1, long e0) pure @trusted 3555 { 3556 long2 r = void; 3557 r.ptr[0] = e1; 3558 r.ptr[1] = e0; 3559 return cast(__m128i)(r); 3560 } 3561 unittest 3562 { 3563 long2 A = cast(long2) _mm_setr_epi64(-1, 0); 3564 long[2] correct = [-1, 0]; 3565 assert(A.array == correct); 3566 } 3567 3568 /// Set packed 8-bit integers with the supplied values in reverse order. 
3569 __m128i _mm_setr_epi8 (byte e15, byte e14, byte e13, byte e12, 3570 byte e11, byte e10, byte e9, byte e8, 3571 byte e7, byte e6, byte e5, byte e4, 3572 byte e3, byte e2, byte e1, byte e0) pure @trusted 3573 { 3574 align(16) byte[16] result = [e15, e14, e13, e12, e11, e10, e9, e8, 3575 e7, e6, e5, e4, e3, e2, e1, e0]; 3576 return *cast(__m128i*)(result.ptr); 3577 } 3578 unittest 3579 { 3580 byte16 R = cast(byte16) _mm_setr_epi8(-1, 0, 56, 127, -128, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14); 3581 byte[16] correct = [-1, 0, 56, 127, -128, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]; 3582 assert(R.array == correct); 3583 } 3584 3585 /// Set packed double-precision (64-bit) floating-point elements with the supplied values in reverse order. 3586 __m128d _mm_setr_pd (double e1, double e0) pure @trusted 3587 { 3588 pragma(inline, true); 3589 double2 result; 3590 result.ptr[0] = e1; 3591 result.ptr[1] = e0; 3592 return result; 3593 } 3594 unittest 3595 { 3596 __m128d A = _mm_setr_pd(61.0, 55.0); 3597 double[2] correct = [61.0, 55.0]; 3598 assert(A.array == correct); 3599 } 3600 3601 /// Return vector of type `__m128d` with all elements set to zero. 3602 __m128d _mm_setzero_pd() pure @trusted 3603 { 3604 pragma(inline, true); 3605 double2 r = void; 3606 r.ptr[0] = 0.0; 3607 r.ptr[1] = 0.0; 3608 return r; 3609 } 3610 unittest 3611 { 3612 __m128d A = _mm_setzero_pd(); 3613 double[2] correct = [0.0, 0.0]; 3614 assert(A.array == correct); 3615 } 3616 3617 /// Return vector of type `__m128i` with all elements set to zero. 3618 __m128i _mm_setzero_si128() pure @trusted 3619 { 3620 pragma(inline, true); 3621 int4 r = void; 3622 r.ptr[0] = 0; 3623 r.ptr[1] = 0; 3624 r.ptr[2] = 0; 3625 r.ptr[3] = 0; 3626 return r; 3627 } 3628 unittest 3629 { 3630 __m128i A = _mm_setzero_si128(); 3631 int[4] correct = [0, 0, 0, 0]; 3632 assert(A.array == correct); 3633 } 3634 3635 /// Shuffle 32-bit integers in `a` using the control in `imm8`. 3636 /// See_also: `_MM_SHUFFLE`. 3637 __m128i _mm_shuffle_epi32(int imm8)(__m128i a) pure @trusted 3638 { 3639 // PERF DMD D_SIMD 3640 static if (GDC_with_SSE2) 3641 { 3642 return __builtin_ia32_pshufd(a, imm8); 3643 } 3644 else static if (LDC_with_optimizations) 3645 { 3646 return shufflevectorLDC!(int4, (imm8 >> 0) & 3, 3647 (imm8 >> 2) & 3, 3648 (imm8 >> 4) & 3, 3649 (imm8 >> 6) & 3)(a, a); 3650 } 3651 else 3652 { 3653 int4 r = void; 3654 r.ptr[0] = a.ptr[(imm8 >> 0) & 3]; 3655 r.ptr[1] = a.ptr[(imm8 >> 2) & 3]; 3656 r.ptr[2] = a.ptr[(imm8 >> 4) & 3]; 3657 r.ptr[3] = a.ptr[(imm8 >> 6) & 3]; 3658 return r; 3659 } 3660 } 3661 unittest 3662 { 3663 __m128i A = _mm_setr_epi32(0, 1, 2, 3); 3664 enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3); 3665 int4 B = cast(int4) _mm_shuffle_epi32!SHUFFLE(A); 3666 int[4] expectedB = [ 3, 2, 1, 0 ]; 3667 assert(B.array == expectedB); 3668 } 3669 3670 /// Shuffle double-precision (64-bit) floating-point elements using the control in `imm8`. 3671 /// See_also: `_MM_SHUFFLE2`. 
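/// Bit 0 of `imm8` selects the lower element of the result from `a`, bit 1 selects the upper element from `b`.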
__m128d _mm_shuffle_pd (int imm8)(__m128d a, __m128d b) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_shufpd(a, b, imm8);
    }
    else version(LDC)
    {
        return shufflevectorLDC!(double2, 0 + ( imm8 & 1 ),
                                          2 + ( (imm8 >> 1) & 1 ))(a, b);
    }
    else
    {
        double2 r = void;
        r.ptr[0] = a.array[imm8 & 1];
        r.ptr[1] = b.array[(imm8 >> 1) & 1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(0.5, 2.0);
    __m128d B = _mm_setr_pd(4.0, 5.0);
    enum int SHUFFLE = _MM_SHUFFLE2(1, 1);
    __m128d R = _mm_shuffle_pd!SHUFFLE(A, B);
    double[2] correct = [ 2.0, 5.0 ];
    assert(R.array == correct);
}

/// Shuffle 16-bit integers in the high 64 bits of `a` using the control in `imm8`. Store the results in the high
/// 64 bits of result, with the low 64 bits being copied from `a` to result.
/// See_also: `_MM_SHUFFLE`.
__m128i _mm_shufflehi_epi16(int imm8)(__m128i a) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd(XMM.PSHUFHW, a, a, cast(ubyte)imm8);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pshufhw(cast(short8)a, imm8);
    }
    else static if (LDC_with_optimizations)
    {
        return cast(__m128i) shufflevectorLDC!(short8, 0, 1, 2, 3,
                                               4 + ( (imm8 >> 0) & 3 ),
                                               4 + ( (imm8 >> 2) & 3 ),
                                               4 + ( (imm8 >> 4) & 3 ),
                                               4 + ( (imm8 >> 6) & 3 ))(cast(short8)a, cast(short8)a);
    }
    else
    {
        short8 r = cast(short8)a;
        short8 sa = cast(short8)a;
        r.ptr[4] = sa.array[4 + ( (imm8 >> 0) & 3 ) ];
        r.ptr[5] = sa.array[4 + ( (imm8 >> 2) & 3 ) ];
        r.ptr[6] = sa.array[4 + ( (imm8 >> 4) & 3 ) ];
        r.ptr[7] = sa.array[4 + ( (imm8 >> 6) & 3 ) ];
        return cast(__m128i) r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    short8 C = cast(short8) _mm_shufflehi_epi16!SHUFFLE(A);
    short[8] expectedC = [ 0, 1, 2, 3, 7, 6, 5, 4 ];
    assert(C.array == expectedC);
}

/// Shuffle 16-bit integers in the low 64 bits of `a` using the control in `imm8`. Store the results in the low 64
/// bits of result, with the high 64 bits being copied from `a` to result.
/// See_also: `_MM_SHUFFLE`.
3746 __m128i _mm_shufflelo_epi16(int imm8)(__m128i a) pure @trusted 3747 { 3748 static if (DMD_with_DSIMD) 3749 { 3750 return cast(__m128i) __simd(XMM.PSHUFLW, a, a, cast(ubyte)imm8); 3751 } 3752 else static if (GDC_with_SSE2) 3753 { 3754 return cast(__m128i) __builtin_ia32_pshuflw(cast(short8)a, imm8); 3755 } 3756 else static if (LDC_with_optimizations) 3757 { 3758 return cast(__m128i) shufflevectorLDC!(short8, ( (imm8 >> 0) & 3 ), 3759 ( (imm8 >> 2) & 3 ), 3760 ( (imm8 >> 4) & 3 ), 3761 ( (imm8 >> 6) & 3 ), 4, 5, 6, 7)(cast(short8)a, cast(short8)a); 3762 } 3763 else 3764 { 3765 short8 r = cast(short8)a; 3766 short8 sa = cast(short8)a; 3767 r.ptr[0] = sa.array[(imm8 >> 0) & 3]; 3768 r.ptr[1] = sa.array[(imm8 >> 2) & 3]; 3769 r.ptr[2] = sa.array[(imm8 >> 4) & 3]; 3770 r.ptr[3] = sa.array[(imm8 >> 6) & 3]; 3771 return cast(__m128i) r; 3772 } 3773 } 3774 unittest 3775 { 3776 __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); 3777 enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3); 3778 short8 B = cast(short8) _mm_shufflelo_epi16!SHUFFLE(A); 3779 short[8] expectedB = [ 3, 2, 1, 0, 4, 5, 6, 7 ]; 3780 assert(B.array == expectedB); 3781 } 3782 3783 /// Shift packed 32-bit integers in `a` left by `count` while shifting in zeros. 3784 deprecated("Use _mm_slli_epi32 instead.") __m128i _mm_sll_epi32 (__m128i a, __m128i count) pure @trusted 3785 { 3786 static if (LDC_with_SSE2) 3787 { 3788 return __builtin_ia32_pslld128(a, count); 3789 } 3790 else static if (GDC_with_SSE2) 3791 { 3792 return __builtin_ia32_pslld128(a, count); 3793 } 3794 else static if (DMD_with_32bit_asm) 3795 { 3796 asm pure nothrow @nogc @trusted 3797 { 3798 movdqu XMM0, a; 3799 movdqu XMM1, count; 3800 pslld XMM0, XMM1; 3801 movdqu a, XMM0; 3802 } 3803 return a; 3804 } 3805 else 3806 { 3807 int4 r = void; 3808 long2 lc = cast(long2)count; 3809 int bits = cast(int)(lc.array[0]); 3810 foreach(i; 0..4) 3811 r[i] = cast(uint)(a[i]) << bits; 3812 return r; 3813 } 3814 } 3815 3816 /// Shift packed 64-bit integers in `a` left by `count` while shifting in zeros. 3817 deprecated("Use _mm_slli_epi64 instead.") __m128i _mm_sll_epi64 (__m128i a, __m128i count) pure @trusted 3818 { 3819 static if (LDC_with_SSE2) 3820 { 3821 return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count); 3822 } 3823 else static if (GDC_with_SSE2) 3824 { 3825 return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count); 3826 } 3827 else static if (DMD_with_32bit_asm) 3828 { 3829 asm pure nothrow @nogc @trusted 3830 { 3831 movdqu XMM0, a; 3832 movdqu XMM1, count; 3833 psllq XMM0, XMM1; 3834 movdqu a, XMM0; 3835 } 3836 return a; 3837 } 3838 else 3839 { 3840 // ARM: good since LDC 1.12 -O2 3841 // ~but -O0 version is catastrophic 3842 long2 r = void; 3843 long2 sa = cast(long2)a; 3844 long2 lc = cast(long2)count; 3845 int bits = cast(int)(lc.array[0]); 3846 foreach(i; 0..2) 3847 r.array[i] = cast(ulong)(sa.array[i]) << bits; 3848 return cast(__m128i)r; 3849 } 3850 } 3851 3852 /// Shift packed 16-bit integers in `a` left by `count` while shifting in zeros. 
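// For the whole _mm_sll_* / _mm_srl_* / _mm_sra_* family the shift amount is read from
// the low 64 bits of the `count` vector rather than per lane. As an illustrative sketch
// (assuming `_mm_cvtsi32_si128`, defined earlier in this module):
//     _mm_sll_epi32(v, _mm_cvtsi32_si128(3))  ==  _mm_slli_epi32(v, 3)
// which is why the immediate-count variants are the recommended replacements named in
// the deprecation messages.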
3853 deprecated("Use _mm_slli_epi16 instead.") __m128i _mm_sll_epi16 (__m128i a, __m128i count) pure @trusted 3854 { 3855 static if (GDC_or_LDC_with_SSE2) 3856 { 3857 return cast(__m128i)__builtin_ia32_psllw128(cast(short8)a, cast(short8)count); 3858 } 3859 else static if (DMD_with_32bit_asm) 3860 { 3861 asm pure nothrow @nogc @trusted 3862 { 3863 movdqu XMM0, a; 3864 movdqu XMM1, count; 3865 psllw XMM0, XMM1; 3866 movdqu a, XMM0; 3867 } 3868 return a; 3869 } 3870 else 3871 { 3872 short8 sa = cast(short8)a; 3873 long2 lc = cast(long2)count; 3874 int bits = cast(int)(lc.array[0]); 3875 short8 r = void; 3876 foreach(i; 0..8) 3877 r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) << bits); 3878 return cast(int4)r; 3879 } 3880 } 3881 3882 3883 /// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros. 3884 __m128i _mm_slli_epi32 (__m128i a, int imm8) pure @trusted 3885 { 3886 static if (GDC_with_SSE2) 3887 { 3888 return __builtin_ia32_pslldi128(a, cast(ubyte)imm8); 3889 } 3890 else static if (LDC_with_SSE2) 3891 { 3892 return __builtin_ia32_pslldi128(a, cast(ubyte)imm8); 3893 } 3894 else 3895 { 3896 // Note: the intrinsics guarantee imm8[0..7] is taken, however 3897 // D says "It's illegal to shift by the same or more bits 3898 // than the size of the quantity being shifted" 3899 // and it's UB instead. 3900 int4 r = _mm_setzero_si128(); 3901 3902 ubyte count = cast(ubyte) imm8; 3903 if (count > 31) 3904 return r; 3905 3906 foreach(i; 0..4) 3907 r.array[i] = cast(uint)(a.array[i]) << count; 3908 return r; 3909 } 3910 } 3911 unittest 3912 { 3913 __m128i A = _mm_setr_epi32(0, 2, 3, -4); 3914 __m128i B = _mm_slli_epi32(A, 1); 3915 __m128i B2 = _mm_slli_epi32(A, 1 + 256); 3916 int[4] expectedB = [ 0, 4, 6, -8]; 3917 assert(B.array == expectedB); 3918 assert(B2.array == expectedB); 3919 3920 __m128i C = _mm_slli_epi32(A, 0); 3921 int[4] expectedC = [ 0, 2, 3, -4]; 3922 assert(C.array == expectedC); 3923 3924 __m128i D = _mm_slli_epi32(A, 65); 3925 int[4] expectedD = [ 0, 0, 0, 0]; 3926 assert(D.array == expectedD); 3927 } 3928 3929 /// Shift packed 64-bit integers in `a` left by `imm8` while shifting in zeros. 3930 __m128i _mm_slli_epi64 (__m128i a, int imm8) pure @trusted 3931 { 3932 static if (GDC_with_SSE2) 3933 { 3934 return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8); 3935 } 3936 else static if (LDC_with_SSE2) 3937 { 3938 return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8); 3939 } 3940 else 3941 { 3942 long2 sa = cast(long2)a; 3943 3944 // Note: the intrinsics guarantee imm8[0..7] is taken, however 3945 // D says "It's illegal to shift by the same or more bits 3946 // than the size of the quantity being shifted" 3947 // and it's UB instead. 
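            // For instance `x << 64` on a 64-bit operand is UB in D (as in C), whereas the
            // hardware psllq with a count of 64 or more simply produces zero; hence the
            // explicit count check below instead of shifting by `imm8` directly.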
3948 long2 r = cast(long2) _mm_setzero_si128(); 3949 ubyte count = cast(ubyte) imm8; 3950 if (count > 63) 3951 return cast(__m128i)r; 3952 3953 r.ptr[0] = cast(ulong)(sa.array[0]) << count; 3954 r.ptr[1] = cast(ulong)(sa.array[1]) << count; 3955 return cast(__m128i)r; 3956 } 3957 } 3958 unittest 3959 { 3960 __m128i A = _mm_setr_epi64(8, -4); 3961 long2 B = cast(long2) _mm_slli_epi64(A, 1); 3962 long2 B2 = cast(long2) _mm_slli_epi64(A, 1 + 1024); 3963 long[2] expectedB = [ 16, -8]; 3964 assert(B.array == expectedB); 3965 assert(B2.array == expectedB); 3966 3967 long2 C = cast(long2) _mm_slli_epi64(A, 0); 3968 long[2] expectedC = [ 8, -4]; 3969 assert(C.array == expectedC); 3970 3971 long2 D = cast(long2) _mm_slli_epi64(A, 64); 3972 long[2] expectedD = [ 0, -0]; 3973 assert(D.array == expectedD); 3974 } 3975 3976 /// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros. 3977 __m128i _mm_slli_epi16(__m128i a, int imm8) pure @trusted 3978 { 3979 static if (GDC_with_SSE2) 3980 { 3981 return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8); 3982 } 3983 else static if (LDC_with_SSE2) 3984 { 3985 return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8); 3986 } 3987 else static if (LDC_with_ARM64) 3988 { 3989 short8 sa = cast(short8)a; 3990 short8 r = cast(short8)_mm_setzero_si128(); 3991 ubyte count = cast(ubyte) imm8; 3992 if (count > 15) 3993 return cast(__m128i)r; 3994 r = sa << short8(count); 3995 return cast(__m128i)r; 3996 } 3997 else 3998 { 3999 short8 sa = cast(short8)a; 4000 short8 r = cast(short8)_mm_setzero_si128(); 4001 ubyte count = cast(ubyte) imm8; 4002 if (count > 15) 4003 return cast(__m128i)r; 4004 foreach(i; 0..8) 4005 r.ptr[i] = cast(short)(sa.array[i] << count); 4006 return cast(__m128i)r; 4007 } 4008 } 4009 unittest 4010 { 4011 __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7); 4012 short8 B = cast(short8)( _mm_slli_epi16(A, 1) ); 4013 short8 B2 = cast(short8)( _mm_slli_epi16(A, 1 + 256) ); 4014 short[8] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14 ]; 4015 assert(B.array == expectedB); 4016 assert(B2.array == expectedB); 4017 4018 short8 C = cast(short8)( _mm_slli_epi16(A, 16) ); 4019 short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0 ]; 4020 assert(C.array == expectedC); 4021 } 4022 4023 4024 /// Shift `a` left by `bytes` bytes while shifting in zeros. 
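// The byte-wise shifts (`_mm_slli_si128` just below and `_mm_srli_si128` further down)
// take their count as a template argument because the underlying pslldq/psrldq
// instructions only accept an immediate; counts of 16 or more yield an all-zero vector.
// Shifting whole lanes toward the low end is a cheap way to do horizontal reductions,
// as the distance() example at the end of this module shows with `_mm_srli_ps`.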
4025 __m128i _mm_slli_si128(ubyte bytes)(__m128i op) pure @trusted 4026 { 4027 static if (bytes & 0xF0) 4028 { 4029 return _mm_setzero_si128(); 4030 } 4031 else static if (DMD_with_DSIMD) 4032 { 4033 return cast(__m128i) __simd_ib(XMM.PSLLDQ, op, bytes); 4034 } 4035 else static if (GDC_with_SSE2) 4036 { 4037 pragma(inline, true); // else it doesn't seem to be inlined at all by GDC TODO _mm_srli_si128 4038 return cast(__m128i) __builtin_ia32_pslldqi128(cast(long2)op, cast(ubyte)(bytes * 8)); 4039 } 4040 else static if (LDC_with_optimizations) 4041 { 4042 return cast(__m128i) shufflevectorLDC!(byte16, 4043 16 - bytes, 17 - bytes, 18 - bytes, 19 - bytes, 20 - bytes, 21 - bytes, 4044 22 - bytes, 23 - bytes, 24 - bytes, 25 - bytes, 26 - bytes, 27 - bytes, 4045 28 - bytes, 29 - bytes, 30 - bytes, 31 - bytes) 4046 (cast(byte16)_mm_setzero_si128(), cast(byte16)op); 4047 } 4048 else static if (DMD_with_32bit_asm) 4049 { 4050 asm pure nothrow @nogc @trusted // somehow doesn't work for x86_64 4051 { 4052 movdqu XMM0, op; 4053 pslldq XMM0, bytes; 4054 movdqu op, XMM0; 4055 } 4056 return op; 4057 } 4058 else 4059 { 4060 byte16 A = cast(byte16)op; 4061 byte16 R = void; 4062 for (int n = 15; n >= bytes; --n) 4063 R.ptr[n] = A.array[n-bytes]; 4064 for (int n = bytes-1; n >= 0; --n) 4065 R.ptr[n] = 0; 4066 return cast(__m128i)R; 4067 } 4068 } 4069 unittest 4070 { 4071 __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); 4072 short8 R = cast(short8) _mm_slli_si128!8(A); // shift 8 bytes to the left 4073 short[8] correct = [ 0, 0, 0, 0, 0, 1, 2, 3 ]; 4074 assert(R.array == correct); 4075 4076 __m128i B = _mm_slli_si128!16(_mm_set1_epi32(-1)); 4077 int[4] expectedB = [0, 0, 0, 0]; 4078 assert(B.array == expectedB); 4079 } 4080 4081 /// Compute the square root of packed double-precision (64-bit) floating-point elements in `vec`. 4082 __m128d _mm_sqrt_pd(__m128d vec) pure @trusted 4083 { 4084 version(LDC) 4085 { 4086 // Disappeared with LDC 1.11 4087 static if (__VERSION__ < 2081) 4088 return __builtin_ia32_sqrtpd(vec); 4089 else 4090 { 4091 // PERF: use llvm_sqrt on the vector 4092 vec.array[0] = llvm_sqrt(vec.array[0]); 4093 vec.array[1] = llvm_sqrt(vec.array[1]); 4094 return vec; 4095 } 4096 } 4097 else static if (GDC_with_SSE2) 4098 { 4099 return __builtin_ia32_sqrtpd(vec); 4100 } 4101 else 4102 { 4103 vec.ptr[0] = sqrt(vec.array[0]); 4104 vec.ptr[1] = sqrt(vec.array[1]); 4105 return vec; 4106 } 4107 } 4108 4109 /// Compute the square root of the lower double-precision (64-bit) floating-point element in `b`, store the result in 4110 /// the lower element of result, and copy the upper element from `a` to the upper element of result. 4111 __m128d _mm_sqrt_sd(__m128d a, __m128d b) pure @trusted 4112 { 4113 // Note: the builtin has one argument, since the legacy `sqrtsd` SSE2 instruction operates on the same register only. 4114 // "128-bit Legacy SSE version: The first source operand and the destination operand are the same. 4115 // The quadword at bits 127:64 of the destination operand remains unchanged." 
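    // In other words, only `b.array[0]` feeds the square root; `a` merely supplies the
    // upper lane, which is passed through unchanged, as the fallback below makes explicit.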
4116 version(LDC) 4117 { 4118 // Disappeared with LDC 1.11 4119 static if (__VERSION__ < 2081) 4120 { 4121 __m128d c = __builtin_ia32_sqrtsd(b); 4122 a[0] = c[0]; 4123 return a; 4124 } 4125 else 4126 { 4127 a.array[0] = llvm_sqrt(b.array[0]); 4128 return a; 4129 } 4130 } 4131 else static if (GDC_with_SSE2) 4132 { 4133 __m128d c = __builtin_ia32_sqrtsd(b); 4134 a.ptr[0] = c.array[0]; 4135 return a; 4136 } 4137 else 4138 { 4139 a.ptr[0] = sqrt(b.array[0]); 4140 return a; 4141 } 4142 } 4143 unittest 4144 { 4145 __m128d A = _mm_setr_pd(1.0, 3.0); 4146 __m128d B = _mm_setr_pd(4.0, 5.0); 4147 __m128d R = _mm_sqrt_sd(A, B); 4148 double[2] correct = [2.0, 3.0 ]; 4149 assert(R.array == correct); 4150 } 4151 4152 /// Shift packed 16-bit integers in `a` right by `count` while shifting in sign bits. 4153 deprecated("Use _mm_srai_epi16 instead.") __m128i _mm_sra_epi16 (__m128i a, __m128i count) pure @trusted 4154 { 4155 static if (GDC_with_SSE2) 4156 { 4157 return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count); 4158 } 4159 else static if (LDC_with_SSE2) 4160 { 4161 return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count); 4162 } 4163 else 4164 { 4165 short8 sa = cast(short8)a; 4166 long2 lc = cast(long2)count; 4167 int bits = cast(int)(lc.array[0]); 4168 short8 r = void; 4169 foreach(i; 0..8) 4170 r.ptr[i] = cast(short)(sa.array[i] >> bits); 4171 return cast(int4)r; 4172 } 4173 } 4174 4175 /// Shift packed 32-bit integers in `a` right by `count` while shifting in sign bits. 4176 deprecated("Use _mm_srai_epi32 instead.") __m128i _mm_sra_epi32 (__m128i a, __m128i count) pure @trusted 4177 { 4178 static if (LDC_with_SSE2) 4179 { 4180 return __builtin_ia32_psrad128(a, count); 4181 } 4182 else static if (GDC_with_SSE2) 4183 { 4184 return __builtin_ia32_psrad128(a, count); 4185 } 4186 else 4187 { 4188 int4 r = void; 4189 long2 lc = cast(long2)count; 4190 int bits = cast(int)(lc.array[0]); 4191 r.ptr[0] = (a.array[0] >> bits); 4192 r.ptr[1] = (a.array[1] >> bits); 4193 r.ptr[2] = (a.array[2] >> bits); 4194 r.ptr[3] = (a.array[3] >> bits); 4195 return r; 4196 } 4197 } 4198 4199 4200 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits. 4201 __m128i _mm_srai_epi16 (__m128i a, int imm8) pure @trusted 4202 { 4203 static if (GDC_with_SSE2) 4204 { 4205 return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8); 4206 } 4207 else static if (LDC_with_SSE2) 4208 { 4209 return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8); 4210 } 4211 else static if (LDC_with_ARM64) 4212 { 4213 short8 sa = cast(short8)a; 4214 ubyte count = cast(ubyte)imm8; 4215 if (count > 15) 4216 count = 15; 4217 short8 r = sa >> short8(count); 4218 return cast(__m128i)r; 4219 } 4220 else 4221 { 4222 short8 sa = cast(short8)a; 4223 short8 r = void; 4224 4225 // Note: the intrinsics guarantee imm8[0..7] is taken, however 4226 // D says "It's illegal to shift by the same or more bits 4227 // than the size of the quantity being shifted" 4228 // and it's UB instead. 
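        // Unlike the logical shifts, an arithmetic right shift by 16 or more does not
        // zero the element: the count saturates at 15 so every bit becomes a copy of the
        // sign bit (e.g. _mm_srai_epi16(v, 200) behaves like _mm_srai_epi16(v, 15)).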
4229 ubyte count = cast(ubyte)imm8; 4230 if (count > 15) 4231 count = 15; 4232 foreach(i; 0..8) 4233 r.ptr[i] = cast(short)(sa.array[i] >> count); 4234 return cast(int4)r; 4235 } 4236 } 4237 unittest 4238 { 4239 __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7); 4240 short8 B = cast(short8)( _mm_srai_epi16(A, 1) ); 4241 short8 B2 = cast(short8)( _mm_srai_epi16(A, 1 + 256) ); 4242 short[8] expectedB = [ 0, 0, 1, 1, -2, -3, 3, 3 ]; 4243 assert(B.array == expectedB); 4244 assert(B2.array == expectedB); 4245 4246 short8 C = cast(short8)( _mm_srai_epi16(A, 18) ); 4247 short[8] expectedC = [ 0, 0, 0, 0, -1, -1, 0, 0 ]; 4248 assert(C.array == expectedC); 4249 } 4250 4251 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits. 4252 __m128i _mm_srai_epi32 (__m128i a, int imm8) pure @trusted 4253 { 4254 static if (LDC_with_SSE2) 4255 { 4256 return __builtin_ia32_psradi128(a, cast(ubyte)imm8); 4257 } 4258 else static if (GDC_with_SSE2) 4259 { 4260 return __builtin_ia32_psradi128(a, cast(ubyte)imm8); 4261 } 4262 else 4263 { 4264 int4 r = void; 4265 4266 // Note: the intrinsics guarantee imm8[0..7] is taken, however 4267 // D says "It's illegal to shift by the same or more bits 4268 // than the size of the quantity being shifted" 4269 // and it's UB instead. 4270 // See Issue: #56 4271 ubyte count = cast(ubyte) imm8; 4272 if (count > 31) 4273 count = 31; 4274 4275 r.ptr[0] = (a.array[0] >> count); 4276 r.ptr[1] = (a.array[1] >> count); 4277 r.ptr[2] = (a.array[2] >> count); 4278 r.ptr[3] = (a.array[3] >> count); 4279 return r; 4280 } 4281 } 4282 unittest 4283 { 4284 __m128i A = _mm_setr_epi32(0, 2, 3, -4); 4285 __m128i B = _mm_srai_epi32(A, 1); 4286 __m128i B2 = _mm_srai_epi32(A, 1 + 256); 4287 int[4] expectedB = [ 0, 1, 1, -2]; 4288 assert(B.array == expectedB); 4289 assert(B2.array == expectedB); 4290 4291 __m128i C = _mm_srai_epi32(A, 32); 4292 int[4] expectedC = [ 0, 0, 0, -1]; 4293 assert(C.array == expectedC); 4294 4295 __m128i D = _mm_srai_epi32(A, 0); 4296 int[4] expectedD = [ 0, 2, 3, -4]; 4297 assert(D.array == expectedD); 4298 } 4299 4300 deprecated("Use _mm_srli_epi16 instead.") __m128i _mm_srl_epi16 (__m128i a, __m128i count) pure @trusted 4301 { 4302 static if (LDC_with_SSE2) 4303 { 4304 return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count); 4305 } 4306 else static if (GDC_with_SSE2) 4307 { 4308 return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count); 4309 } 4310 else 4311 { 4312 short8 sa = cast(short8)a; 4313 long2 lc = cast(long2)count; 4314 int bits = cast(int)(lc.array[0]); 4315 short8 r = void; 4316 foreach(i; 0..8) 4317 r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) >> bits); 4318 return cast(int4)r; 4319 } 4320 } 4321 4322 deprecated("Use _mm_srli_epi32 instead.") __m128i _mm_srl_epi32 (__m128i a, __m128i count) pure @trusted 4323 { 4324 static if (LDC_with_SSE2) 4325 { 4326 return __builtin_ia32_psrld128(a, count); 4327 } 4328 else static if (GDC_with_SSE2) 4329 { 4330 return __builtin_ia32_psrld128(a, count); 4331 } 4332 else 4333 { 4334 int4 r = void; 4335 long2 lc = cast(long2)count; 4336 int bits = cast(int)(lc.array[0]); 4337 r.ptr[0] = cast(uint)(a.array[0]) >> bits; 4338 r.ptr[1] = cast(uint)(a.array[1]) >> bits; 4339 r.ptr[2] = cast(uint)(a.array[2]) >> bits; 4340 r.ptr[3] = cast(uint)(a.array[3]) >> bits; 4341 return r; 4342 } 4343 } 4344 4345 deprecated("Use _mm_srli_epi64 instead.") __m128i _mm_srl_epi64 (__m128i a, __m128i count) pure @trusted 4346 { 4347 static if (LDC_with_SSE2) 4348 { 
4349 return cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count); 4350 } 4351 else static if (GDC_with_SSE2) 4352 { 4353 return cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count); 4354 } 4355 else 4356 { 4357 // Workaround for https://issues.dlang.org/show_bug.cgi?id=23047 4358 // => avoid void initialization. 4359 long2 r; 4360 long2 sa = cast(long2)a; 4361 long2 lc = cast(long2)count; 4362 int bits = cast(int)(lc.array[0]); 4363 r.ptr[0] = cast(ulong)(sa.array[0]) >> bits; 4364 r.ptr[1] = cast(ulong)(sa.array[1]) >> bits; 4365 return cast(__m128i)r; 4366 } 4367 } 4368 4369 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros. 4370 __m128i _mm_srli_epi16 (__m128i a, int imm8) pure @trusted 4371 { 4372 static if (GDC_with_SSE2) 4373 { 4374 return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8); 4375 } 4376 else static if (LDC_with_SSE2) 4377 { 4378 return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8); 4379 } 4380 else static if (LDC_with_ARM64) 4381 { 4382 short8 sa = cast(short8)a; 4383 short8 r = cast(short8) _mm_setzero_si128(); 4384 4385 ubyte count = cast(ubyte)imm8; 4386 if (count >= 16) 4387 return cast(__m128i)r; 4388 4389 r = sa >>> short8(count); // This facility offered with LDC, but not DMD. 4390 return cast(__m128i)r; 4391 } 4392 else 4393 { 4394 short8 sa = cast(short8)a; 4395 ubyte count = cast(ubyte)imm8; 4396 4397 short8 r = cast(short8) _mm_setzero_si128(); 4398 if (count >= 16) 4399 return cast(__m128i)r; 4400 4401 foreach(i; 0..8) 4402 r.array[i] = cast(short)(cast(ushort)(sa.array[i]) >> count); 4403 return cast(__m128i)r; 4404 } 4405 } 4406 unittest 4407 { 4408 __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7); 4409 short8 B = cast(short8)( _mm_srli_epi16(A, 1) ); 4410 short8 B2 = cast(short8)( _mm_srli_epi16(A, 1 + 256) ); 4411 short[8] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ]; 4412 assert(B.array == expectedB); 4413 assert(B2.array == expectedB); 4414 4415 short8 C = cast(short8)( _mm_srli_epi16(A, 16) ); 4416 short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0]; 4417 assert(C.array == expectedC); 4418 4419 short8 D = cast(short8)( _mm_srli_epi16(A, 0) ); 4420 short[8] expectedD = [ 0, 1, 2, 3, -4, -5, 6, 7 ]; 4421 assert(D.array == expectedD); 4422 } 4423 4424 4425 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros. 4426 __m128i _mm_srli_epi32 (__m128i a, int imm8) pure @trusted 4427 { 4428 static if (GDC_with_SSE2) 4429 { 4430 return __builtin_ia32_psrldi128(a, cast(ubyte)imm8); 4431 } 4432 else static if (LDC_with_SSE2) 4433 { 4434 return __builtin_ia32_psrldi128(a, cast(ubyte)imm8); 4435 } 4436 else 4437 { 4438 ubyte count = cast(ubyte) imm8; 4439 4440 // Note: the intrinsics guarantee imm8[0..7] is taken, however 4441 // D says "It's illegal to shift by the same or more bits 4442 // than the size of the quantity being shifted" 4443 // and it's UB instead. 
4444 int4 r = _mm_setzero_si128(); 4445 if (count >= 32) 4446 return r; 4447 r.ptr[0] = a.array[0] >>> count; 4448 r.ptr[1] = a.array[1] >>> count; 4449 r.ptr[2] = a.array[2] >>> count; 4450 r.ptr[3] = a.array[3] >>> count; 4451 return r; 4452 } 4453 } 4454 unittest 4455 { 4456 __m128i A = _mm_setr_epi32(0, 2, 3, -4); 4457 __m128i B = _mm_srli_epi32(A, 1); 4458 __m128i B2 = _mm_srli_epi32(A, 1 + 256); 4459 int[4] expectedB = [ 0, 1, 1, 0x7FFFFFFE]; 4460 assert(B.array == expectedB); 4461 assert(B2.array == expectedB); 4462 4463 __m128i C = _mm_srli_epi32(A, 255); 4464 int[4] expectedC = [ 0, 0, 0, 0 ]; 4465 assert(C.array == expectedC); 4466 } 4467 4468 /// Shift packed 64-bit integers in `a` right by `imm8` while shifting in zeros. 4469 __m128i _mm_srli_epi64 (__m128i a, int imm8) pure @trusted 4470 { 4471 // PERF DMD 4472 static if (GDC_with_SSE2) 4473 { 4474 return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8); 4475 } 4476 else static if (LDC_with_SSE2) 4477 { 4478 return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8); 4479 } 4480 else 4481 { 4482 long2 r = cast(long2) _mm_setzero_si128(); 4483 long2 sa = cast(long2)a; 4484 4485 ubyte count = cast(ubyte) imm8; 4486 if (count >= 64) 4487 return cast(__m128i)r; 4488 4489 r.ptr[0] = sa.array[0] >>> count; 4490 r.ptr[1] = sa.array[1] >>> count; 4491 return cast(__m128i)r; 4492 } 4493 } 4494 unittest 4495 { 4496 __m128i A = _mm_setr_epi64(8, -4); 4497 long2 B = cast(long2) _mm_srli_epi64(A, 1); 4498 long2 B2 = cast(long2) _mm_srli_epi64(A, 1 + 512); 4499 long[2] expectedB = [ 4, 0x7FFFFFFFFFFFFFFE]; 4500 assert(B.array == expectedB); 4501 assert(B2.array == expectedB); 4502 4503 long2 C = cast(long2) _mm_srli_epi64(A, 64); 4504 long[2] expectedC = [ 0, 0 ]; 4505 assert(C.array == expectedC); 4506 } 4507 4508 /// Shift `v` right by `bytes` bytes while shifting in zeros. 4509 __m128i _mm_srli_si128(ubyte bytes)(__m128i v) pure @trusted 4510 { 4511 static if (bytes & 0xF0) 4512 { 4513 return _mm_setzero_si128(); 4514 } 4515 else static if (DMD_with_DSIMD) 4516 { 4517 return cast(__m128i) __simd_ib(XMM.PSRLDQ, v, bytes); 4518 } 4519 else static if (GDC_with_SSE2) 4520 { 4521 return cast(__m128i) __builtin_ia32_psrldqi128(cast(long2)v, cast(ubyte)(bytes * 8)); 4522 } 4523 else static if (DMD_with_32bit_asm) 4524 { 4525 asm pure nothrow @nogc @trusted 4526 { 4527 movdqu XMM0, v; 4528 psrldq XMM0, bytes; 4529 movdqu v, XMM0; 4530 } 4531 return v; 4532 } 4533 else static if (LDC_with_optimizations) 4534 { 4535 return cast(__m128i) shufflevectorLDC!(byte16, 4536 bytes+0, bytes+1, bytes+2, bytes+3, bytes+4, bytes+5, bytes+6, bytes+7, 4537 bytes+8, bytes+9, bytes+10, bytes+11, bytes+12, bytes+13, bytes+14, bytes+15) 4538 (cast(byte16) v, cast(byte16)_mm_setzero_si128()); 4539 } 4540 else 4541 { 4542 byte16 A = cast(byte16)v; 4543 byte16 R = void; 4544 for (int n = 0; n < bytes; ++n) 4545 R.ptr[15-n] = 0; 4546 for (int n = bytes; n < 16; ++n) 4547 R.ptr[15-n] = A.array[15 - n + bytes]; 4548 return cast(__m128i)R; 4549 } 4550 } 4551 unittest 4552 { 4553 __m128i R = _mm_srli_si128!4(_mm_set_epi32(4, 3, -2, 1)); 4554 int[4] correct = [-2, 3, 4, 0]; 4555 assert(R.array == correct); 4556 4557 __m128i A = _mm_srli_si128!16(_mm_set1_epi32(-1)); 4558 int[4] expectedA = [0, 0, 0, 0]; 4559 assert(A.array == expectedA); 4560 } 4561 4562 /// Shift `v` right by `bytes` bytes while shifting in zeros. 
4563 /// #BONUS 4564 __m128 _mm_srli_ps(ubyte bytes)(__m128 v) pure @safe 4565 { 4566 return cast(__m128)_mm_srli_si128!bytes(cast(__m128i)v); 4567 } 4568 unittest 4569 { 4570 __m128 R = _mm_srli_ps!8(_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f)); 4571 float[4] correct = [3.0f, 4.0f, 0, 0]; 4572 assert(R.array == correct); 4573 } 4574 4575 /// Shift `v` right by `bytes` bytes while shifting in zeros. 4576 /// #BONUS 4577 __m128d _mm_srli_pd(ubyte bytes)(__m128d v) pure @safe 4578 { 4579 return cast(__m128d) _mm_srli_si128!bytes(cast(__m128i)v); 4580 } 4581 4582 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from `a` into memory. 4583 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated. 4584 void _mm_store_pd (double* mem_addr, __m128d a) pure @trusted 4585 { 4586 pragma(inline, true); 4587 __m128d* aligned = cast(__m128d*)mem_addr; 4588 *aligned = a; 4589 } 4590 unittest 4591 { 4592 align(16) double[2] A; 4593 __m128d B = _mm_setr_pd(-8.0, 9.0); 4594 _mm_store_pd(A.ptr, B); 4595 assert(A == [-8.0, 9.0]); 4596 } 4597 4598 /// Store the lower double-precision (64-bit) floating-point element from `a` into 2 contiguous elements in memory. 4599 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated. 4600 void _mm_store_pd1 (double* mem_addr, __m128d a) pure @trusted 4601 { 4602 __m128d* aligned = cast(__m128d*)mem_addr; 4603 __m128d r; // PERF =void; 4604 r.ptr[0] = a.array[0]; 4605 r.ptr[1] = a.array[0]; 4606 *aligned = r; 4607 } 4608 4609 /// Store the lower double-precision (64-bit) floating-point element from `a` into memory. `mem_addr` does not need to 4610 /// be aligned on any particular boundary. 4611 void _mm_store_sd (double* mem_addr, __m128d a) pure @safe 4612 { 4613 pragma(inline, true); 4614 *mem_addr = a.array[0]; 4615 } 4616 4617 /// Store 128-bits of integer data from `a` into memory. `mem_addr` must be aligned on a 16-byte boundary or a 4618 /// general-protection exception may be generated. 4619 void _mm_store_si128 (__m128i* mem_addr, __m128i a) pure @safe 4620 { 4621 pragma(inline, true); 4622 *mem_addr = a; 4623 } 4624 4625 alias _mm_store1_pd = _mm_store_pd1; /// 4626 4627 /// Store the upper double-precision (64-bit) floating-point element from `a` into memory. 4628 void _mm_storeh_pd (double* mem_addr, __m128d a) pure @safe 4629 { 4630 pragma(inline, true); 4631 *mem_addr = a.array[1]; 4632 } 4633 4634 // Note: `mem_addr` doesn't have to actually be aligned, which breaks 4635 // expectations from the user point of view. This problem also exist in C++. 4636 void _mm_storel_epi64 (__m128i* mem_addr, __m128i a) pure @safe 4637 { 4638 pragma(inline, true); 4639 long* dest = cast(long*)mem_addr; 4640 long2 la = cast(long2)a; 4641 *dest = la.array[0]; 4642 } 4643 unittest 4644 { 4645 long[3] A = [1, 2, 3]; 4646 _mm_storel_epi64(cast(__m128i*)(&A[1]), _mm_set_epi64x(0x1_0000_0000, 0x1_0000_0000)); 4647 long[3] correct = [1, 0x1_0000_0000, 3]; 4648 assert(A == correct); 4649 } 4650 4651 /// Store the lower double-precision (64-bit) floating-point element from `a` into memory. 4652 void _mm_storel_pd (double* mem_addr, __m128d a) pure @safe 4653 { 4654 pragma(inline, true); 4655 *mem_addr = a.array[0]; 4656 } 4657 4658 /// Store 2 double-precision (64-bit) floating-point elements from `a` into memory in reverse 4659 /// order. `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception 4660 /// may be generated. 
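// Quick orientation for the store family around here (everything named below is defined
// in this module): _mm_store_pd and _mm_store_si128 require a 16-byte aligned address,
// while _mm_storeu_pd / _mm_storeu_si128 do not; _mm_store_sd and _mm_storel_pd write
// only the low lane, _mm_storeh_pd only the high lane, and _mm_store_pd1 (alias
// _mm_store1_pd) broadcasts the low lane to both slots of an aligned destination.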
4661 void _mm_storer_pd (double* mem_addr, __m128d a) pure @system 4662 { 4663 __m128d reversed = void; 4664 reversed.ptr[0] = a.array[1]; 4665 reversed.ptr[1] = a.array[0]; 4666 *cast(__m128d*)mem_addr = reversed; 4667 } 4668 unittest 4669 { 4670 align(16) double[2] A = [0.0, 1.0]; 4671 _mm_storer_pd(A.ptr, _mm_setr_pd(2.0, 3.0)); 4672 assert(A[0] == 3.0 && A[1] == 2.0); 4673 } 4674 4675 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from 4676 /// `a` into memory. `mem_addr` does not need to be aligned on any particular boundary. 4677 void _mm_storeu_pd (double* mem_addr, __m128d a) pure @trusted // TODO: signature, should be system 4678 { 4679 // PERF DMD 4680 pragma(inline, true); 4681 static if (GDC_with_SSE2) 4682 { 4683 __builtin_ia32_storeupd(mem_addr, a); 4684 } 4685 else static if (LDC_with_optimizations) 4686 { 4687 storeUnaligned!double2(a, mem_addr); 4688 } 4689 else 4690 { 4691 mem_addr[0] = a.array[0]; 4692 mem_addr[1] = a.array[1]; 4693 } 4694 } 4695 unittest 4696 { 4697 __m128d A = _mm_setr_pd(3.0, 4.0); 4698 align(16) double[4] R = [0.0, 0, 0, 0]; 4699 double[2] correct = [3.0, 4.0]; 4700 _mm_storeu_pd(&R[1], A); 4701 assert(R[1..3] == correct); 4702 } 4703 4704 /// Store 128-bits of integer data from `a` into memory. `mem_addr` does not need to be aligned on any particular 4705 /// boundary. 4706 void _mm_storeu_si128 (__m128i* mem_addr, __m128i a) pure @trusted // TODO: signature is wrong, mem_addr is not aligned. Make it @system 4707 { 4708 // PERF: DMD 4709 pragma(inline, true); 4710 static if (GDC_with_SSE2) 4711 { 4712 __builtin_ia32_storedqu(cast(char*)mem_addr, cast(ubyte16)a); 4713 } 4714 else static if (LDC_with_optimizations) 4715 { 4716 storeUnaligned!__m128i(a, cast(int*)mem_addr); 4717 } 4718 else 4719 { 4720 int* p = cast(int*)mem_addr; 4721 p[0] = a.array[0]; 4722 p[1] = a.array[1]; 4723 p[2] = a.array[2]; 4724 p[3] = a.array[3]; 4725 } 4726 } 4727 unittest 4728 { 4729 __m128i A = _mm_setr_epi32(1, 2, 3, 4); 4730 align(16) int[6] R = [0, 0, 0, 0, 0, 0]; 4731 int[4] correct = [1, 2, 3, 4]; 4732 _mm_storeu_si128(cast(__m128i*)(&R[1]), A); 4733 assert(R[1..5] == correct); 4734 } 4735 4736 /// Store 16-bit integer from the first element of `a` into memory. 4737 /// `mem_addr` does not need to be aligned on any particular boundary. 4738 void _mm_storeu_si16 (void* mem_addr, __m128i a) pure @system 4739 { 4740 short* dest = cast(short*)mem_addr; 4741 *dest = (cast(short8)a).array[0]; 4742 } 4743 unittest 4744 { 4745 short[2] arr = [-24, 12]; 4746 _mm_storeu_si16(&arr[1], _mm_set1_epi16(26)); 4747 short[2] correct = [-24, 26]; 4748 assert(arr == correct); 4749 } 4750 4751 /// Store 32-bit integer from the first element of `a` into memory. 4752 /// `mem_addr` does not need to be aligned on any particular boundary. 4753 void _mm_storeu_si32 (void* mem_addr, __m128i a) pure @trusted // TODO should really be @ssytem 4754 { 4755 pragma(inline, true); 4756 int* dest = cast(int*)mem_addr; 4757 *dest = a.array[0]; 4758 } 4759 unittest 4760 { 4761 int[2] arr = [-24, 12]; 4762 _mm_storeu_si32(&arr[1], _mm_setr_epi32(-1, -2, -6, -7)); 4763 assert(arr == [-24, -1]); 4764 } 4765 4766 /// Store 64-bit integer from the first element of `a` into memory. 4767 /// `mem_addr` does not need to be aligned on any particular boundary. 
4768 void _mm_storeu_si64 (void* mem_addr, __m128i a) pure @system 4769 { 4770 pragma(inline, true); 4771 long* dest = cast(long*)mem_addr; 4772 long2 la = cast(long2)a; 4773 *dest = la.array[0]; 4774 } 4775 unittest 4776 { 4777 long[3] A = [1, 2, 3]; 4778 _mm_storeu_si64(&A[1], _mm_set_epi64x(0x1_0000_0000, 0x1_0000_0000)); 4779 long[3] correct = [1, 0x1_0000_0000, 3]; 4780 assert(A == correct); 4781 } 4782 4783 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) 4784 /// from `a` into memory using a non-temporal memory hint. `mem_addr` must be aligned on a 16-byte 4785 /// boundary or a general-protection exception may be generated. 4786 /// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads. 4787 void _mm_stream_pd (double* mem_addr, __m128d a) pure @system 4788 { 4789 // PERF DMD D_SIMD 4790 static if (GDC_with_SSE2) 4791 { 4792 return __builtin_ia32_movntpd(mem_addr, a); 4793 } 4794 else static if (LDC_with_InlineIREx && LDC_with_optimizations) 4795 { 4796 enum prefix = `!0 = !{ i32 1 }`; 4797 enum ir = ` 4798 store <2 x double> %1, <2 x double>* %0, align 16, !nontemporal !0 4799 ret void`; 4800 LDCInlineIREx!(prefix, ir, "", void, double2*, double2)(cast(double2*)mem_addr, a); 4801 } 4802 else 4803 { 4804 // Regular store instead. 4805 __m128d* dest = cast(__m128d*)mem_addr; 4806 *dest = a; 4807 } 4808 } 4809 unittest 4810 { 4811 align(16) double[2] A; 4812 __m128d B = _mm_setr_pd(-8.0, 9.0); 4813 _mm_stream_pd(A.ptr, B); 4814 assert(A == [-8.0, 9.0]); 4815 } 4816 4817 /// Store 128-bits of integer data from a into memory using a non-temporal memory hint. 4818 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception 4819 /// may be generated. 4820 /// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads. 4821 void _mm_stream_si128 (__m128i* mem_addr, __m128i a) pure @trusted 4822 { 4823 // PERF DMD D_SIMD 4824 static if (GDC_with_SSE2) 4825 { 4826 return __builtin_ia32_movntdq (cast(long2*)mem_addr, cast(long2)a); 4827 } 4828 else static if (LDC_with_InlineIREx && LDC_with_optimizations) 4829 { 4830 enum prefix = `!0 = !{ i32 1 }`; 4831 enum ir = ` 4832 store <4 x i32> %1, <4 x i32>* %0, align 16, !nontemporal !0 4833 ret void`; 4834 LDCInlineIREx!(prefix, ir, "", void, int4*, int4)(cast(int4*)mem_addr, a); 4835 } 4836 else 4837 { 4838 // Regular store instead. 4839 __m128i* dest = cast(__m128i*)mem_addr; 4840 *dest = a; 4841 } 4842 } 4843 unittest 4844 { 4845 align(16) int[4] A; 4846 __m128i B = _mm_setr_epi32(-8, 9, 10, -11); 4847 _mm_stream_si128(cast(__m128i*)A.ptr, B); 4848 assert(A == [-8, 9, 10, -11]); 4849 } 4850 4851 /// Store 32-bit integer a into memory using a non-temporal hint to minimize cache 4852 /// pollution. If the cache line containing address `mem_addr` is already in the cache, 4853 /// the cache will be updated. 4854 /// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads. 4855 void _mm_stream_si32 (int* mem_addr, int a) pure @trusted 4856 { 4857 // PERF DMD D_SIMD 4858 static if (GDC_with_SSE2) 4859 { 4860 return __builtin_ia32_movnti(mem_addr, a); 4861 } 4862 else static if (LDC_with_InlineIREx && LDC_with_optimizations) 4863 { 4864 enum prefix = `!0 = !{ i32 1 }`; 4865 enum ir = ` 4866 store i32 %1, i32* %0, !nontemporal !0 4867 ret void`; 4868 LDCInlineIREx!(prefix, ir, "", void, int*, int)(mem_addr, a); 4869 } 4870 else 4871 { 4872 // Regular store instead. 
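        // Without the SSE2 builtin or LDC inline IR there is no portable way to express
        // the non-temporal hint, so this fallback is an ordinary cached store; only the
        // paths above minimize cache pollution, and the `_mm_sfence()` advice in the
        // documentation applies either way.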
4873 *mem_addr = a; 4874 } 4875 } 4876 unittest 4877 { 4878 int A; 4879 _mm_stream_si32(&A, -34); 4880 assert(A == -34); 4881 } 4882 4883 /// Store 64-bit integer a into memory using a non-temporal hint to minimize 4884 /// cache pollution. If the cache line containing address `mem_addr` is already 4885 /// in the cache, the cache will be updated. 4886 /// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads. 4887 void _mm_stream_si64 (long* mem_addr, long a) pure @trusted 4888 { 4889 // PERF DMD D_SIMD 4890 static if (GDC_with_SSE2) 4891 { 4892 return __builtin_ia32_movnti64(mem_addr, a); 4893 } 4894 else static if (LDC_with_InlineIREx && LDC_with_optimizations) 4895 { 4896 enum prefix = `!0 = !{ i32 1 }`; 4897 enum ir = ` 4898 store i64 %1, i64* %0, !nontemporal !0 4899 ret void`; 4900 LDCInlineIREx!(prefix, ir, "", void, long*, long)(mem_addr, a); 4901 4902 } 4903 else 4904 { 4905 // Regular store instead. 4906 *mem_addr = a; 4907 } 4908 } 4909 unittest 4910 { 4911 long A; 4912 _mm_stream_si64(&A, -46); 4913 assert(A == -46); 4914 } 4915 4916 /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`. 4917 __m128i _mm_sub_epi16(__m128i a, __m128i b) pure @safe 4918 { 4919 pragma(inline, true); 4920 return cast(__m128i)(cast(short8)a - cast(short8)b); 4921 } 4922 unittest 4923 { 4924 __m128i A = _mm_setr_epi16(16, 32767, 1, 2, 3, 4, 6, 6); 4925 __m128i B = _mm_setr_epi16(15, -32768, 6, 8, 1000, 1, 5, 6); 4926 short8 C = cast(short8) _mm_sub_epi16(A, B); 4927 short[8] correct = [ 1, -1,-5,-6, -997, 3, 1, 0]; 4928 assert(C.array == correct); 4929 } 4930 4931 /// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`. 4932 __m128i _mm_sub_epi32(__m128i a, __m128i b) pure @safe 4933 { 4934 pragma(inline, true); 4935 return cast(__m128i)(cast(int4)a - cast(int4)b); 4936 } 4937 unittest 4938 { 4939 __m128i A = _mm_setr_epi32(16, int.max, 1, 8); 4940 __m128i B = _mm_setr_epi32(15, int.min, 6, 2); 4941 int4 C = cast(int4) _mm_sub_epi32(A, B); 4942 int[4] correct = [ 1, -1,-5, 6]; 4943 assert(C.array == correct); 4944 } 4945 4946 /// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`. 4947 __m128i _mm_sub_epi64(__m128i a, __m128i b) pure @safe 4948 { 4949 pragma(inline, true); 4950 return cast(__m128i)(cast(long2)a - cast(long2)b); 4951 } 4952 unittest 4953 { 4954 __m128i A = _mm_setr_epi64( 16, long.max); 4955 __m128i B = _mm_setr_epi64( 199, long.min); 4956 long2 C = cast(long2) _mm_sub_epi64(A, B); 4957 long[2] correct = [-183, -1]; 4958 assert(C.array == correct); 4959 } 4960 4961 /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`. 4962 __m128i _mm_sub_epi8(__m128i a, __m128i b) pure @safe 4963 { 4964 pragma(inline, true); 4965 return cast(__m128i)(cast(byte16)a - cast(byte16)b); 4966 } 4967 unittest 4968 { 4969 __m128i A = _mm_setr_epi8(16, 127, 1, 2, 3, 4, 6, 6, 16, 127, 1, 2, 3, 4, 6, 6); 4970 __m128i B = _mm_setr_epi8(15, -128, 6, 8, 3, 1, 5, 6, 16, 127, 1, 2, 3, 4, 6, 6); 4971 byte16 C = cast(byte16) _mm_sub_epi8(A, B); 4972 byte[16] correct = [ 1, -1,-5,-6, 0, 3, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]; 4973 assert(C.array == correct); 4974 } 4975 4976 /// Subtract packed double-precision (64-bit) floating-point elements in `b` from packed double-precision (64-bit) 4977 /// floating-point elements in `a`. 
4978 __m128d _mm_sub_pd(__m128d a, __m128d b) pure @safe 4979 { 4980 pragma(inline, true); 4981 return a - b; 4982 } 4983 unittest 4984 { 4985 __m128d A = _mm_setr_pd(4000.0, -8.0); 4986 __m128d B = _mm_setr_pd(12.0, -8450.0); 4987 __m128d C = _mm_sub_pd(A, B); 4988 double[2] correct = [3988.0, 8442.0]; 4989 assert(C.array == correct); 4990 } 4991 4992 /// Subtract the lower double-precision (64-bit) floating-point element in `b` from the lower double-precision (64-bit) 4993 /// floating-point element in `a`, store that in the lower element of result, and copy the upper element from `a` to the 4994 /// upper element of result. 4995 __m128d _mm_sub_sd(__m128d a, __m128d b) pure @trusted 4996 { 4997 version(DigitalMars) 4998 { 4999 // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599 5000 // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again 5001 asm pure nothrow @nogc @trusted { nop;} 5002 a[0] = a[0] - b[0]; 5003 return a; 5004 } 5005 else static if (GDC_with_SSE2) 5006 { 5007 return __builtin_ia32_subsd(a, b); 5008 } 5009 else 5010 { 5011 a.ptr[0] -= b.array[0]; 5012 return a; 5013 } 5014 } 5015 unittest 5016 { 5017 __m128d a = [1.5, -2.0]; 5018 a = _mm_sub_sd(a, a); 5019 assert(a.array == [0.0, -2.0]); 5020 } 5021 5022 /// Subtract 64-bit integer `b` from 64-bit integer `a`. 5023 __m64 _mm_sub_si64 (__m64 a, __m64 b) pure @safe 5024 { 5025 pragma(inline, true); 5026 return a - b; 5027 } 5028 unittest 5029 { 5030 __m64 A, B; 5031 A = -1214; 5032 B = 489415; 5033 __m64 C = _mm_sub_si64(B, A); 5034 assert(C.array[0] == 489415 + 1214); 5035 } 5036 5037 /// Subtract packed signed 16-bit integers in `b` from packed 16-bit integers in `a` using 5038 /// saturation. 5039 __m128i _mm_subs_epi16(__m128i a, __m128i b) pure @trusted 5040 { 5041 // PERF DMD psubsw 5042 static if(LDC_with_saturated_intrinsics) 5043 { 5044 return cast(__m128i) inteli_llvm_subs!short8(cast(short8)a, cast(short8)b); 5045 } 5046 else static if (GDC_with_SSE2) 5047 { 5048 return cast(__m128i) __builtin_ia32_psubsw128(cast(short8) a, cast(short8) b); 5049 } 5050 else 5051 { 5052 short[8] res; // PERF =void; 5053 short8 sa = cast(short8)a; 5054 short8 sb = cast(short8)b; 5055 foreach(i; 0..8) 5056 res.ptr[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]); 5057 return _mm_loadu_si128(cast(int4*)res.ptr); 5058 } 5059 } 5060 unittest 5061 { 5062 short8 res = cast(short8) _mm_subs_epi16(_mm_setr_epi16(32760, -32760, 5, 4, 3, 2, 1, 0), 5063 _mm_setr_epi16(-10 , 16, 5, 4, 3, 2, 1, 0)); 5064 static immutable short[8] correctResult = [32767, -32768, 0, 0, 0, 0, 0, 0]; 5065 assert(res.array == correctResult); 5066 } 5067 5068 /// Subtract packed signed 8-bit integers in `b` from packed 8-bit integers in `a` using 5069 /// saturation. 
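// Worked example of the saturation shown above: 32760 - (-10) = 32770 does not fit in a
// short, so _mm_subs_epi16 clamps it to short.max (32767); likewise -32760 - 16 = -32776
// clamps to short.min (-32768). Plain _mm_sub_epi16 would wrap around instead.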
5070 __m128i _mm_subs_epi8(__m128i a, __m128i b) pure @trusted 5071 { 5072 static if(LDC_with_saturated_intrinsics) 5073 { 5074 return cast(__m128i) inteli_llvm_subs!byte16(cast(byte16)a, cast(byte16)b); 5075 } 5076 else static if (GDC_with_SSE2) 5077 { 5078 return cast(__m128i) __builtin_ia32_psubsb128(cast(ubyte16) a, cast(ubyte16) b); 5079 } 5080 else 5081 { 5082 byte[16] res; // PERF =void; 5083 byte16 sa = cast(byte16)a; 5084 byte16 sb = cast(byte16)b; 5085 foreach(i; 0..16) 5086 res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]); 5087 return _mm_loadu_si128(cast(int4*)res.ptr); 5088 } 5089 } 5090 unittest 5091 { 5092 byte16 res = cast(byte16) _mm_subs_epi8(_mm_setr_epi8(-128, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), 5093 _mm_setr_epi8( 15, -14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); 5094 static immutable byte[16] correctResult = [-128, 127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; 5095 assert(res.array == correctResult); 5096 } 5097 5098 /// Subtract packed 16-bit unsigned integers in `a` and `b` using unsigned saturation. 5099 __m128i _mm_subs_epu16(__m128i a, __m128i b) pure @trusted 5100 { 5101 static if(LDC_with_saturated_intrinsics) 5102 { 5103 return cast(__m128i) inteli_llvm_subus!short8(cast(short8)a, cast(short8)b); 5104 } 5105 else static if (GDC_with_SSE2) 5106 { 5107 return cast(__m128i) __builtin_ia32_psubusw128(cast(short8)a, cast(short8)b); 5108 } 5109 else 5110 { 5111 short[8] res; // PERF =void; 5112 short8 sa = cast(short8)a; 5113 short8 sb = cast(short8)b; 5114 foreach(i; 0..8) 5115 { 5116 int sum = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]); 5117 res[i] = saturateSignedIntToUnsignedShort(sum); 5118 } 5119 return _mm_loadu_si128(cast(int4*)res.ptr); 5120 } 5121 } 5122 unittest 5123 { 5124 short8 R = cast(short8) _mm_subs_epu16(_mm_setr_epi16(cast(short)65534, 1, 5, 4, 3, 2, 1, 0), 5125 _mm_setr_epi16(cast(short)65535, 16, 4, 4, 3, 0, 1, 0)); 5126 static immutable short[8] correct = [ 0, 0, 1, 0, 0, 2, 0, 0]; 5127 assert(R.array == correct); 5128 } 5129 5130 /// Subtract packed 8-bit unsigned integers in `a` and `b` using unsigned saturation. 5131 __m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted 5132 { 5133 static if(LDC_with_saturated_intrinsics) 5134 { 5135 return cast(__m128i) inteli_llvm_subus!byte16(cast(byte16)a, cast(byte16)b); 5136 } 5137 else static if (GDC_with_SSE2) 5138 { 5139 return cast(__m128i) __builtin_ia32_psubusb128(cast(ubyte16) a, cast(ubyte16) b); 5140 } 5141 else 5142 { 5143 ubyte[16] res; // PERF =void; 5144 byte16 sa = cast(byte16)a; 5145 byte16 sb = cast(byte16)b; 5146 foreach(i; 0..16) 5147 res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i])); 5148 return _mm_loadu_si128(cast(int4*)res.ptr); 5149 } 5150 } 5151 unittest 5152 { 5153 byte16 res = cast(byte16) _mm_subs_epu8(_mm_setr_epi8(cast(byte)254, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), 5154 _mm_setr_epi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); 5155 static immutable byte[16] correctResult = [ 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; 5156 assert(res.array == correctResult); 5157 } 5158 5159 // Note: the only difference between these intrinsics is the signalling 5160 // behaviour of quiet NaNs. This is incorrect but the case where 5161 // you would want to differentiate between qNaN and sNaN and then 5162 // treat them differently on purpose seems extremely rare. 
5163 alias _mm_ucomieq_sd = _mm_comieq_sd; /// 5164 alias _mm_ucomige_sd = _mm_comige_sd; /// 5165 alias _mm_ucomigt_sd = _mm_comigt_sd; /// 5166 alias _mm_ucomile_sd = _mm_comile_sd; /// 5167 alias _mm_ucomilt_sd = _mm_comilt_sd; /// 5168 alias _mm_ucomineq_sd = _mm_comineq_sd; /// 5169 5170 /// Return vector of type `__m128d` with undefined elements. 5171 __m128d _mm_undefined_pd() pure @safe 5172 { 5173 pragma(inline, true); 5174 __m128d result = void; 5175 return result; 5176 } 5177 5178 /// Return vector of type `__m128i` with undefined elements. 5179 __m128i _mm_undefined_si128() pure @safe 5180 { 5181 pragma(inline, true); 5182 __m128i result = void; 5183 return result; 5184 } 5185 5186 /// Unpack and interleave 16-bit integers from the high half of `a` and `b`. 5187 __m128i _mm_unpackhi_epi16 (__m128i a, __m128i b) pure @trusted 5188 { 5189 static if (DMD_with_DSIMD) 5190 { 5191 return cast(__m128i) __simd(XMM.PUNPCKHWD, a, b); 5192 } 5193 else static if (GDC_with_SSE2) 5194 { 5195 return cast(__m128i) __builtin_ia32_punpckhwd128(cast(short8) a, cast(short8) b); 5196 } 5197 else static if (LDC_with_optimizations) 5198 { 5199 enum ir = `%r = shufflevector <8 x i16> %0, <8 x i16> %1, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> 5200 ret <8 x i16> %r`; 5201 return cast(__m128i) LDCInlineIR!(ir, short8, short8, short8)(cast(short8)a, cast(short8)b); 5202 } 5203 else static if (DMD_with_32bit_asm || LDC_with_x86_asm) 5204 { 5205 asm pure nothrow @nogc @trusted 5206 { 5207 movdqu XMM0, a; 5208 movdqu XMM1, b; 5209 punpckhwd XMM0, XMM1; 5210 movdqu a, XMM0; 5211 } 5212 return a; 5213 } 5214 else 5215 { 5216 short8 r = void; 5217 short8 sa = cast(short8)a; 5218 short8 sb = cast(short8)b; 5219 r.ptr[0] = sa.array[4]; 5220 r.ptr[1] = sb.array[4]; 5221 r.ptr[2] = sa.array[5]; 5222 r.ptr[3] = sb.array[5]; 5223 r.ptr[4] = sa.array[6]; 5224 r.ptr[5] = sb.array[6]; 5225 r.ptr[6] = sa.array[7]; 5226 r.ptr[7] = sb.array[7]; 5227 return cast(__m128i)r; 5228 } 5229 } 5230 unittest 5231 { 5232 __m128i A = _mm_setr_epi16(4, 5, 6, 7, 8, 9, 10, 11); 5233 __m128i B = _mm_setr_epi16(12, 13, 14, 15, 16, 17, 18, 19); 5234 short8 C = cast(short8)(_mm_unpackhi_epi16(A, B)); 5235 short[8] correct = [8, 16, 9, 17, 10, 18, 11, 19]; 5236 assert(C.array == correct); 5237 } 5238 5239 /// Unpack and interleave 32-bit integers from the high half of `a` and `b`. 5240 __m128i _mm_unpackhi_epi32 (__m128i a, __m128i b) pure @trusted 5241 { 5242 static if (DMD_with_DSIMD) 5243 { 5244 return cast(__m128i) __simd(XMM.PUNPCKHDQ, a, b); 5245 } 5246 else static if (GDC_with_SSE2) 5247 { 5248 return __builtin_ia32_punpckhdq128(a, b); 5249 } 5250 else static if (LDC_with_optimizations) 5251 { 5252 enum ir = `%r = shufflevector <4 x i32> %0, <4 x i32> %1, <4 x i32> <i32 2, i32 6, i32 3, i32 7> 5253 ret <4 x i32> %r`; 5254 return LDCInlineIR!(ir, int4, int4, int4)(cast(int4)a, cast(int4)b); 5255 } 5256 else 5257 { 5258 __m128i r = void; 5259 r.ptr[0] = a.array[2]; 5260 r.ptr[1] = b.array[2]; 5261 r.ptr[2] = a.array[3]; 5262 r.ptr[3] = b.array[3]; 5263 return r; 5264 } 5265 } 5266 unittest 5267 { 5268 __m128i A = _mm_setr_epi32(1, 2, 3, 4); 5269 __m128i B = _mm_setr_epi32(5, 6, 7, 8); 5270 __m128i C = _mm_unpackhi_epi32(A, B); 5271 int[4] correct = [3, 7, 4, 8]; 5272 assert(C.array == correct); 5273 } 5274 5275 /// Unpack and interleave 64-bit integers from the high half of `a` and `b`. 
5276 __m128i _mm_unpackhi_epi64 (__m128i a, __m128i b) pure @trusted 5277 { 5278 static if (GDC_with_SSE2) 5279 { 5280 return cast(__m128i) __builtin_ia32_punpckhqdq128(cast(long2) a, cast(long2) b); 5281 } 5282 else 5283 { 5284 __m128i r = cast(__m128i)b; 5285 r[0] = a[2]; 5286 r[1] = a[3]; 5287 return r; 5288 } 5289 } 5290 unittest // Issue #36 5291 { 5292 __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333); 5293 __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555); 5294 long2 C = cast(long2)(_mm_unpackhi_epi64(A, B)); 5295 long[2] correct = [0x33333333_33333333, 0x55555555_55555555]; 5296 assert(C.array == correct); 5297 } 5298 5299 /// Unpack and interleave 8-bit integers from the high half of `a` and `b`. 5300 __m128i _mm_unpackhi_epi8 (__m128i a, __m128i b) pure @trusted 5301 { 5302 static if (DMD_with_DSIMD) 5303 { 5304 return cast(__m128i) __simd(XMM.PUNPCKHBW, a, b); 5305 } 5306 else static if (GDC_with_SSE2) 5307 { 5308 return cast(__m128i) __builtin_ia32_punpckhbw128(cast(ubyte16)a, cast(ubyte16)b); 5309 } 5310 else static if (LDC_with_optimizations) 5311 { 5312 enum ir = `%r = shufflevector <16 x i8> %0, <16 x i8> %1, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31> 5313 ret <16 x i8> %r`; 5314 return cast(__m128i)LDCInlineIR!(ir, byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b); 5315 } 5316 else static if (DMD_with_32bit_asm || LDC_with_x86_asm) 5317 { 5318 asm pure nothrow @nogc @trusted 5319 { 5320 movdqu XMM0, a; 5321 movdqu XMM1, b; 5322 punpckhbw XMM0, XMM1; 5323 movdqu a, XMM0; 5324 } 5325 return a; 5326 } 5327 else 5328 { 5329 byte16 r = void; 5330 byte16 ba = cast(byte16)a; 5331 byte16 bb = cast(byte16)b; 5332 r.ptr[0] = ba.array[8]; 5333 r.ptr[1] = bb.array[8]; 5334 r.ptr[2] = ba.array[9]; 5335 r.ptr[3] = bb.array[9]; 5336 r.ptr[4] = ba.array[10]; 5337 r.ptr[5] = bb.array[10]; 5338 r.ptr[6] = ba.array[11]; 5339 r.ptr[7] = bb.array[11]; 5340 r.ptr[8] = ba.array[12]; 5341 r.ptr[9] = bb.array[12]; 5342 r.ptr[10] = ba.array[13]; 5343 r.ptr[11] = bb.array[13]; 5344 r.ptr[12] = ba.array[14]; 5345 r.ptr[13] = bb.array[14]; 5346 r.ptr[14] = ba.array[15]; 5347 r.ptr[15] = bb.array[15]; 5348 return cast(__m128i)r; 5349 } 5350 } 5351 unittest 5352 { 5353 __m128i A = _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 5354 __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); 5355 byte16 C = cast(byte16) _mm_unpackhi_epi8(A, B); 5356 byte[16] correct = [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31]; 5357 assert(C.array == correct); 5358 } 5359 5360 /// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of `a` and `b`. 
5361 __m128d _mm_unpackhi_pd (__m128d a, __m128d b) pure @trusted 5362 { 5363 // PERF DMD D_SIMD 5364 static if (GDC_with_SSE2) 5365 { 5366 return __builtin_ia32_unpckhpd(a, b); 5367 } 5368 else static if (LDC_with_optimizations) 5369 { 5370 enum ir = `%r = shufflevector <2 x double> %0, <2 x double> %1, <2 x i32> <i32 1, i32 3> 5371 ret <2 x double> %r`; 5372 return LDCInlineIR!(ir, double2, double2, double2)(a, b); 5373 } 5374 else 5375 { 5376 double2 r = void; 5377 r.ptr[0] = a.array[1]; 5378 r.ptr[1] = b.array[1]; 5379 return r; 5380 } 5381 } 5382 unittest 5383 { 5384 __m128d A = _mm_setr_pd(4.0, 6.0); 5385 __m128d B = _mm_setr_pd(7.0, 9.0); 5386 __m128d C = _mm_unpackhi_pd(A, B); 5387 double[2] correct = [6.0, 9.0]; 5388 assert(C.array == correct); 5389 } 5390 5391 /// Unpack and interleave 16-bit integers from the low half of `a` and `b`. 5392 __m128i _mm_unpacklo_epi16 (__m128i a, __m128i b) pure @trusted 5393 { 5394 static if (DMD_with_DSIMD) 5395 { 5396 return cast(__m128i) __simd(XMM.PUNPCKLWD, a, b); 5397 } 5398 else static if (GDC_with_SSE2) 5399 { 5400 return cast(__m128i) __builtin_ia32_punpcklwd128(cast(short8) a, cast(short8) b); 5401 } 5402 else static if (LDC_with_optimizations) 5403 { 5404 enum ir = `%r = shufflevector <8 x i16> %0, <8 x i16> %1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11> 5405 ret <8 x i16> %r`; 5406 return cast(__m128i) LDCInlineIR!(ir, short8, short8, short8)(cast(short8)a, cast(short8)b); 5407 } 5408 else static if (DMD_with_32bit_asm || LDC_with_x86_asm) 5409 { 5410 asm pure nothrow @nogc @trusted 5411 { 5412 movdqu XMM0, a; 5413 movdqu XMM1, b; 5414 punpcklwd XMM0, XMM1; 5415 movdqu a, XMM0; 5416 } 5417 return a; 5418 } 5419 else 5420 { 5421 short8 r = void; 5422 short8 sa = cast(short8)a; 5423 short8 sb = cast(short8)b; 5424 r.ptr[0] = sa.array[0]; 5425 r.ptr[1] = sb.array[0]; 5426 r.ptr[2] = sa.array[1]; 5427 r.ptr[3] = sb.array[1]; 5428 r.ptr[4] = sa.array[2]; 5429 r.ptr[5] = sb.array[2]; 5430 r.ptr[6] = sa.array[3]; 5431 r.ptr[7] = sb.array[3]; 5432 return cast(__m128i)r; 5433 } 5434 } 5435 unittest 5436 { 5437 __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); 5438 __m128i B = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15); 5439 short8 C = cast(short8) _mm_unpacklo_epi16(A, B); 5440 short[8] correct = [0, 8, 1, 9, 2, 10, 3, 11]; 5441 assert(C.array == correct); 5442 } 5443 5444 /// Unpack and interleave 32-bit integers from the low half of `a` and `b`. 5445 __m128i _mm_unpacklo_epi32 (__m128i a, __m128i b) pure @trusted 5446 { 5447 static if (DMD_with_DSIMD) 5448 { 5449 return cast(__m128i) __simd(XMM.PUNPCKLDQ, a, b); 5450 } 5451 else static if (GDC_with_SSE2) 5452 { 5453 return __builtin_ia32_punpckldq128(a, b); 5454 } 5455 else static if (LDC_with_optimizations) 5456 { 5457 enum ir = `%r = shufflevector <4 x i32> %0, <4 x i32> %1, <4 x i32> <i32 0, i32 4, i32 1, i32 5> 5458 ret <4 x i32> %r`; 5459 return LDCInlineIR!(ir, int4, int4, int4)(cast(int4)a, cast(int4)b); 5460 } 5461 else 5462 { 5463 __m128i r; 5464 r.ptr[0] = a.array[0]; 5465 r.ptr[1] = b.array[0]; 5466 r.ptr[2] = a.array[1]; 5467 r.ptr[3] = b.array[1]; 5468 return r; 5469 } 5470 } 5471 unittest 5472 { 5473 __m128i A = _mm_setr_epi32(1, 2, 3, 4); 5474 __m128i B = _mm_setr_epi32(5, 6, 7, 8); 5475 __m128i C = _mm_unpacklo_epi32(A, B); 5476 int[4] correct = [1, 5, 2, 6]; 5477 assert(C.array == correct); 5478 } 5479 5480 /// Unpack and interleave 64-bit integers from the low half of `a` and `b`. 
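// A common use of these interleaves, shown as a sketch rather than a test: interleaving
// with a zero vector widens unsigned lanes, e.g.
//     _mm_unpacklo_epi16(v, _mm_setzero_si128())
// zero-extends the four low ushorts of `v` to 32-bit integers, and the epi8/epi32
// variants widen bytes and uints in the same way.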
5481 __m128i _mm_unpacklo_epi64 (__m128i a, __m128i b) pure @trusted 5482 { 5483 static if (GDC_with_SSE2) 5484 { 5485 return cast(__m128i) __builtin_ia32_punpcklqdq128(cast(long2) a, cast(long2) b); 5486 } 5487 else 5488 { 5489 long2 lA = cast(long2)a; 5490 long2 lB = cast(long2)b; 5491 long2 R; // PERF =void; 5492 R.ptr[0] = lA.array[0]; 5493 R.ptr[1] = lB.array[0]; 5494 return cast(__m128i)R; 5495 } 5496 } 5497 unittest // Issue #36 5498 { 5499 __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333); 5500 __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555); 5501 long2 C = cast(long2)(_mm_unpacklo_epi64(A, B)); 5502 long[2] correct = [0x22222222_22222222, 0x44444444_44444444]; 5503 assert(C.array == correct); 5504 } 5505 5506 /// Unpack and interleave 8-bit integers from the low half of `a` and `b`. 5507 __m128i _mm_unpacklo_epi8 (__m128i a, __m128i b) pure @trusted 5508 { 5509 static if (DMD_with_DSIMD) 5510 { 5511 return cast(__m128i) __simd(XMM.PUNPCKLBW, a, b); 5512 } 5513 else static if (GDC_with_SSE2) 5514 { 5515 return cast(__m128i) __builtin_ia32_punpcklbw128(cast(ubyte16) a, cast(ubyte16) b); 5516 } 5517 else static if (LDC_with_optimizations) 5518 { 5519 enum ir = `%r = shufflevector <16 x i8> %0, <16 x i8> %1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23> 5520 ret <16 x i8> %r`; 5521 return cast(__m128i)LDCInlineIR!(ir, byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b); 5522 } 5523 else static if (DMD_with_32bit_asm || LDC_with_x86_asm) 5524 { 5525 asm pure nothrow @nogc @trusted 5526 { 5527 movdqu XMM0, a; 5528 movdqu XMM1, b; 5529 punpcklbw XMM0, XMM1; 5530 movdqu a, XMM0; 5531 } 5532 return a; 5533 } 5534 else 5535 { 5536 byte16 r = void; 5537 byte16 ba = cast(byte16)a; 5538 byte16 bb = cast(byte16)b; 5539 r.ptr[0] = ba.array[0]; 5540 r.ptr[1] = bb.array[0]; 5541 r.ptr[2] = ba.array[1]; 5542 r.ptr[3] = bb.array[1]; 5543 r.ptr[4] = ba.array[2]; 5544 r.ptr[5] = bb.array[2]; 5545 r.ptr[6] = ba.array[3]; 5546 r.ptr[7] = bb.array[3]; 5547 r.ptr[8] = ba.array[4]; 5548 r.ptr[9] = bb.array[4]; 5549 r.ptr[10] = ba.array[5]; 5550 r.ptr[11] = bb.array[5]; 5551 r.ptr[12] = ba.array[6]; 5552 r.ptr[13] = bb.array[6]; 5553 r.ptr[14] = ba.array[7]; 5554 r.ptr[15] = bb.array[7]; 5555 return cast(__m128i)r; 5556 } 5557 } 5558 unittest 5559 { 5560 __m128i A = _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 5561 __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); 5562 byte16 C = cast(byte16) _mm_unpacklo_epi8(A, B); 5563 byte[16] correct = [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23]; 5564 assert(C.array == correct); 5565 } 5566 5567 /// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of `a` and `b`. 
5568 __m128d _mm_unpacklo_pd (__m128d a, __m128d b) pure @trusted 5569 { 5570 // PERF DMD D_SIMD 5571 static if (GDC_with_SSE2) 5572 { 5573 return __builtin_ia32_unpcklpd(a, b); 5574 } 5575 else static if (LDC_with_optimizations) 5576 { 5577 enum ir = `%r = shufflevector <2 x double> %0, <2 x double> %1, <2 x i32> <i32 0, i32 2> 5578 ret <2 x double> %r`; 5579 return LDCInlineIR!(ir, double2, double2, double2)(a, b); 5580 } 5581 else 5582 { 5583 double2 r = void; 5584 r.ptr[0] = a.array[0]; 5585 r.ptr[1] = b.array[0]; 5586 return r; 5587 } 5588 } 5589 unittest 5590 { 5591 __m128d A = _mm_setr_pd(4.0, 6.0); 5592 __m128d B = _mm_setr_pd(7.0, 9.0); 5593 __m128d C = _mm_unpacklo_pd(A, B); 5594 double[2] correct = [4.0, 7.0]; 5595 assert(C.array == correct); 5596 } 5597 5598 /// Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in `a` and `b`. 5599 __m128d _mm_xor_pd (__m128d a, __m128d b) pure @safe 5600 { 5601 return cast(__m128d)(cast(__m128i)a ^ cast(__m128i)b); 5602 } 5603 unittest 5604 { 5605 __m128d A = _mm_setr_pd(-4.0, 6.0); 5606 __m128d B = _mm_setr_pd(4.0, -6.0); 5607 long2 R = cast(long2) _mm_xor_pd(A, B); 5608 long[2] correct = [long.min, long.min]; 5609 assert(R.array == correct); 5610 } 5611 5612 /// Compute the bitwise XOR of 128 bits (representing integer data) in `a` and `b`. 5613 __m128i _mm_xor_si128 (__m128i a, __m128i b) pure @safe 5614 { 5615 return a ^ b; 5616 } 5617 unittest 5618 { 5619 __m128i A = _mm_setr_epi64(975394, 619809709); 5620 __m128i B = _mm_setr_epi64(-920275025, -6); 5621 long2 R = cast(long2) _mm_xor_si128(A, B); 5622 long[2] correct = [975394 ^ (-920275025L), 619809709L ^ -6]; 5623 assert(R.array == correct); 5624 } 5625 5626 unittest 5627 { 5628 float distance(float[4] a, float[4] b) nothrow @nogc 5629 { 5630 __m128 va = _mm_loadu_ps(a.ptr); 5631 __m128 vb = _mm_loadu_ps(b.ptr); 5632 __m128 diffSquared = _mm_sub_ps(va, vb); 5633 diffSquared = _mm_mul_ps(diffSquared, diffSquared); 5634 __m128 sum = _mm_add_ps(diffSquared, _mm_srli_ps!8(diffSquared)); 5635 sum = _mm_add_ps(sum, _mm_srli_ps!4(sum)); 5636 return _mm_cvtss_f32(_mm_sqrt_ss(sum)); 5637 } 5638 assert(distance([0, 2, 0, 0], [0, 0, 0, 0]) == 2); 5639 }
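// A supplementary usage sketch for a few intrinsics above that ship without a unittest
// of their own (_mm_sqrt_pd, _mm_store_pd1, _mm_storeh_pd, _mm_storel_pd, _mm_srli_pd);
// it relies only on declarations from this module.
unittest
{
    // _mm_sqrt_pd: element-wise square root.
    __m128d A = _mm_setr_pd(4.0, 9.0);
    __m128d S = _mm_sqrt_pd(A);
    assert(S.array == [2.0, 3.0]);

    // _mm_store_pd1: broadcast the low lane to both slots of an aligned destination.
    align(16) double[2] buf;
    _mm_store_pd1(buf.ptr, S);
    assert(buf == [2.0, 2.0]);

    // _mm_storeh_pd / _mm_storel_pd: store one lane each, no alignment required.
    double hi, lo;
    _mm_storeh_pd(&hi, S);
    _mm_storel_pd(&lo, S);
    assert(hi == 3.0 && lo == 2.0);

    // _mm_srli_pd!8: shifting right by 8 bytes moves the high lane into the low lane.
    __m128d R = _mm_srli_pd!8(S);
    assert(R.array == [3.0, 0.0]);
}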