/**
*  SSE4.1 intrinsics.
*  https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE4_1
*
*  Copyright: Guillaume Piolat 2021.
*             Johan Engelen 2021.
*  License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.smmintrin;

// SSE4.1 instructions
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE4_1
// Note: this header will work whether you have SSE4.1 enabled or not.
// With LDC, use "dflags-ldc": ["-mattr=+sse4.1"] or equivalent to actively
// generate SSE4.1 instructions.
// With GDC, use "dflags-gdc": ["-msse4.1"] or equivalent to generate SSE4.1 instructions.

public import inteli.types;
import inteli.internals;

// smmintrin pulls in all previous instruction set intrinsics.
public import inteli.tmmintrin;

nothrow @nogc:

enum int _MM_FROUND_TO_NEAREST_INT = 0x00; /// SSE4.1 rounding modes
enum int _MM_FROUND_TO_NEG_INF     = 0x01; /// ditto
enum int _MM_FROUND_TO_POS_INF     = 0x02; /// ditto
enum int _MM_FROUND_TO_ZERO        = 0x03; /// ditto
enum int _MM_FROUND_CUR_DIRECTION  = 0x04; /// ditto
enum int _MM_FROUND_RAISE_EXC      = 0x00; /// ditto
enum int _MM_FROUND_NO_EXC         = 0x08; /// ditto

enum int _MM_FROUND_NINT      = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT);
enum int _MM_FROUND_FLOOR     = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF);
enum int _MM_FROUND_CEIL      = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF);
enum int _MM_FROUND_TRUNC     = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO);
enum int _MM_FROUND_RINT      = (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION);
enum int _MM_FROUND_NEARBYINT = (_MM_FROUND_NO_EXC    | _MM_FROUND_CUR_DIRECTION);

/// Blend packed 16-bit integers from `a` and `b` using control mask `imm8`, and store the results.
// Note: changed signature, GDC needs a compile-time value for imm8.
__m128i _mm_blend_epi16(int imm8)(__m128i a, __m128i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        pragma(inline, true); // else wouldn't inline in _mm256_blend_epi16
        return cast(__m128i) __builtin_ia32_pblendw128(cast(short8)a, cast(short8)b, imm8);
    }
    else
    {
        // LDC x86: this generates pblendw since LDC 1.1 and -O2
        short8 r;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        for (int n = 0; n < 8; ++n)
        {
            r.ptr[n] = (imm8 & (1 << n)) ? sb.array[n] : sa.array[n];
        }
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    __m128i B = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
    short8 C = cast(short8) _mm_blend_epi16!147(A, B); // 10010011
    short[8] correct = [8, 9, 2, 3, 12, 5, 6, 15];
    assert(C.array == correct);
}


/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using control mask `imm8`.
// Note: changed signature, GDC needs a compile-time value for `imm8`.
__m128d _mm_blend_pd(int imm8)(__m128d a, __m128d b) @trusted
{
    static assert(imm8 >= 0 && imm8 < 4);
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(double2) __builtin_ia32_blendpd(cast(double2)a, cast(double2)b, imm8);
    }
    else
    {
        // LDC x86: blendpd since LDC 1.1 -O2, uses blendps after LDC 1.12
        double2 r;
        for (int n = 0; n < 2; ++n)
        {
            r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n];
        }
        return cast(__m128d)r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(0, 1);
    __m128d B = _mm_setr_pd(8, 9);
    double2 C = _mm_blend_pd!2(A, B);
    double[2] correct = [0, 9];
    assert(C.array == correct);
}


/// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using control
/// mask `imm8`.
// Note: changed signature, GDC needs a compile-time value for imm8.
__m128 _mm_blend_ps(int imm8)(__m128 a, __m128 b) pure @trusted
{
    // PERF DMD
    static assert(imm8 >= 0 && imm8 < 16);
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_blendps(a, b, imm8);
    }
    else version(LDC)
    {
        // LDC x86: generates blendps since LDC 1.1 -O2
        //   arm64: pretty good, two instructions worst case
        return shufflevectorLDC!(float4, (imm8 & 1) ? 4 : 0,
                                         (imm8 & 2) ? 5 : 1,
                                         (imm8 & 4) ? 6 : 2,
                                         (imm8 & 8) ? 7 : 3)(a, b);
    }
    else
    {
        // PERF GDC without SSE4.1 is quite bad
        __m128 r;
        for (int n = 0; n < 4; ++n)
        {
            r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n];
        }
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(0, 1, 2, 3);
    __m128 B = _mm_setr_ps(8, 9, 10, 11);
    float4 C = cast(float4) _mm_blend_ps!13(A, B); // 1101
    float[4] correct = [8, 1, 10, 11];
    assert(C.array == correct);
}

/// Blend packed 8-bit integers from `a` and `b` using `mask`.
__m128i _mm_blendv_epi8 (__m128i a, __m128i b, __m128i mask) @trusted
{
    // PERF DMD
    /*static if (GDC_with_SSE41)
    {
        // This intrinsic does nothing in GDC 12.
        // TODO report to GDC. No problem in GCC.
        return cast(__m128i) __builtin_ia32_pblendvb128 (cast(ubyte16)a, cast(ubyte16)b, cast(ubyte16)mask);
    }
    else*/
    static if (LDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pblendvb(cast(byte16)a, cast(byte16)b, cast(byte16)mask);
    }
    else static if (LDC_with_ARM64)
    {
        // LDC arm64: two instructions since LDC 1.12 -O2
        byte16 maskSX = vshrq_n_s8(cast(byte16)mask, 7);
        return cast(__m128i) vbslq_s8(maskSX, cast(byte16)b, cast(byte16)a);
    }
    else
    {
        __m128i m = _mm_cmpgt_epi8(_mm_setzero_si128(), mask);
        return _mm_xor_si128(_mm_subs_epu8(_mm_xor_si128(a, b), m), b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8( 0,  1,  2,  3,  4,  5,  6,  7,
                               8,  9, 10, 11, 12, 13, 14, 15);
    __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23,
                              24, 25, 26, 27, 28, 29, 30, 31);
    __m128i M = _mm_setr_epi8( 1, -1,  1,  1, -4,  1, -8, 127,
                               1,  1, -1, -1,  4,  1,  8, -128);
    byte16 R = cast(byte16) _mm_blendv_epi8(A, B, M);
    byte[16] correct = [ 0, 17,  2,  3, 20,  5, 22,  7,
                         8,  9, 26, 27, 12, 13, 14, 31 ];
    assert(R.array == correct);
}


/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using `mask`.
__m128d _mm_blendv_pd (__m128d a, __m128d b, __m128d mask) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE42)
    {
        // PERF Amazingly enough, GCC/GDC generates the blendvpd instruction
        // with -msse4.2 but not -msse4.1.
        // Not sure what the reason is, and there is a replacement sequence.
        // Sounds like a bug.
        return __builtin_ia32_blendvpd(a, b, mask);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_blendvpd(a, b, mask);
    }
    else static if (LDC_with_ARM64)
    {
        long2 shift;
        shift = 63;
        long2 lmask = cast(long2)mask >> shift;
        return cast(__m128d) vbslq_s64(lmask, cast(long2)b, cast(long2)a);
    }
    else
    {
        __m128d r; // PERF =void;
        long2 lmask = cast(long2)mask;
        for (int n = 0; n < 2; ++n)
        {
            r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n];
        }
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(3.0, 4.0);
    __m128d M1 = _mm_setr_pd(-3.0, 2.0);
    __m128d R1 = _mm_blendv_pd(A, B, M1);
    double[2] correct1 = [3.0, 2.0];
    assert(R1.array == correct1);

    // Note: wouldn't work with -double.nan, since in some AArch64 archs the NaN sign bit is lost
    // See Issue #78
    __m128d M2 = _mm_setr_pd(double.nan, double.infinity);
    __m128d R2 = _mm_blendv_pd(A, B, M2);
    double[2] correct2 = [1.0, 2.0];
    assert(R2.array == correct2);
}


/// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using `mask`.
__m128 _mm_blendv_ps (__m128 a, __m128 b, __m128 mask) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_blendvps(a, b, mask);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_blendvps(a, b, mask);
    }
    else static if (LDC_with_ARM64)
    {
        int4 shift;
        shift = 31;
        int4 lmask = cast(int4)mask >> shift;
        return cast(__m128) vbslq_s32(lmask, cast(int4)b, cast(int4)a);
    }
    else
    {
        __m128 r; // PERF =void;
        int4 lmask = cast(int4)mask;
        for (int n = 0; n < 4; ++n)
        {
            r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n];
        }
        return r;
    }
}
unittest
{
    __m128 A  = _mm_setr_ps( 0.0f, 1.0f, 2.0f, 3.0f);
    __m128 B  = _mm_setr_ps( 4.0f, 5.0f, 6.0f, 7.0f);
    __m128 M1 = _mm_setr_ps(-3.0f, 2.0f, 1.0f, -10000.0f);
    __m128 M2 = _mm_setr_ps(float.nan, float.nan, -0.0f, +0.0f);
    __m128 R1 = _mm_blendv_ps(A, B, M1);
    __m128 R2 = _mm_blendv_ps(A, B, M2);
    float[4] correct1 = [ 4.0f, 1.0f, 2.0f, 7.0f];
    float[4] correct2 = [ 0.0f, 1.0f, 6.0f, 3.0f];
    assert(R1.array == correct1);

    // Note: wouldn't work with -float.nan, since in some AArch64 archs the NaN sign bit is lost
    // See Issue #78
    assert(R2.array == correct2);
}
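
// Typical usage sketch (an added illustration, not from the upstream test suite): the mask
// consumed by the blendv family is usually produced by a comparison intrinsic, whose all-ones
// lanes have the sign bit set. Here _mm_cmplt_ps (from the SSE1 header pulled in above) is
// used to replace negative lanes of X by zero.
unittest
{
    __m128 X    = _mm_setr_ps(-1.5f, 2.0f, -0.25f, 3.0f);
    __m128 zero = _mm_setzero_ps();
    __m128 mask = _mm_cmplt_ps(X, zero);         // all-ones where X < 0
    __m128 R    = _mm_blendv_ps(X, zero, mask);  // take `zero` where the mask sign bit is set
    float[4] correct = [0.0f, 2.0f, 0.0f, 3.0f];
    assert(R.array == correct);
}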

/// Round the packed double-precision (64-bit) floating-point elements in `a` up to an integer value,
/// and store the results as packed double-precision floating-point elements.
__m128d _mm_ceil_pd (__m128d a) @trusted
{
    static if (LDC_with_ARM64)
    {
        // LDC arm64 acceptable since 1.8 -O2
        // Unfortunately x86 intrinsics force a round-trip back to double2
        // ARM neon semantics wouldn't have that
        long2 l = vcvtpq_s64_f64(a);
        double2 r;
        r.ptr[0] = l.array[0];
        r.ptr[1] = l.array[1];
        return r;
    }
    else
    {
        return _mm_round_pd!2(a);
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.3f, -2.12f);
    __m128d B = _mm_setr_pd(53.6f, -2.7f);
    A = _mm_ceil_pd(A);
    B = _mm_ceil_pd(B);
    double[2] correctA = [2.0, -2.0];
    double[2] correctB = [54.0, -2.0];
    assert(A.array == correctA);
    assert(B.array == correctB);
}

/// Round the packed single-precision (32-bit) floating-point elements in `a` up to an integer value,
/// and store the results as packed single-precision floating-point elements.
__m128 _mm_ceil_ps (__m128 a) @trusted
{
    static if (LDC_with_ARM64)
    {
        // LDC arm64 acceptable since 1.8 -O1
        int4 l = vcvtpq_s32_f32(a);
        float4 r;
        r.ptr[0] = l.array[0];
        r.ptr[1] = l.array[1];
        r.ptr[2] = l.array[2];
        r.ptr[3] = l.array[3];
        return r;
    }
    else
    {
        return _mm_round_ps!2(a);
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f);
    __m128 C = _mm_ceil_ps(A);
    float[4] correct = [2.0f, -2.0f, 54.0f, -2.0f];
    assert(C.array == correct);
}

/// Round the lower double-precision (64-bit) floating-point element in `b` up to an integer value,
/// store the result as a double-precision floating-point element in the lower element of result,
/// and copy the upper element from `a` to the upper element of dst.
__m128d _mm_ceil_sd (__m128d a, __m128d b) @trusted
{
    static if (LDC_with_ARM64)
    {
        a[0] = vcvtps_s64_f64(b[0]);
        return a;
    }
    else
    {
        return _mm_round_sd!2(a, b);
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.3, -2.12);
    __m128d B = _mm_setr_pd(53.6, -3.7);
    __m128d C = _mm_ceil_sd(A, B);
    double[2] correct = [54.0, -2.12];
    assert(C.array == correct);
}

/// Round the lower single-precision (32-bit) floating-point element in `b` up to an integer value,
/// store the result as a single-precision floating-point element in the lower element of result,
/// and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_ceil_ss (__m128 a, __m128 b) @trusted
{
    static if (LDC_with_ARM64)
    {
        a[0] = vcvtps_s32_f32(b[0]);
        return a;
    }
    else
    {
        return _mm_round_ss!2(a, b);
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.3f, -2.12f, -4.5f, 1.1f);
    __m128 B = _mm_setr_ps(53.6f, -3.7f, 8.0f, 7.0f);
    __m128 C = _mm_ceil_ss(A, B);
    float[4] correct = [54.0f, -2.12f, -4.5f, 1.1f];
    assert(C.array == correct);
}

/// Compare packed 64-bit integers in `a` and `b` for equality.
__m128i _mm_cmpeq_epi64 (__m128i a, __m128i b) @trusted
{
    static if (SIMD_COMPARISON_MASKS_16B)
    {
        version(DigitalMars)
        {
            // DMD doesn't recognize long2 == long2
            long2 la = cast(long2)a;
            long2 lb = cast(long2)b;
            long2 res;
            res.ptr[0] = (la.array[0] == lb.array[0]) ? -1 : 0;
            res.ptr[1] = (la.array[1] == lb.array[1]) ? -1 : 0;
            return cast(__m128i)res;
        }
        else
        {
            return cast(__m128i)(cast(long2)a == cast(long2)b);
        }
    }
    else static if (GDC_with_SSE41)
    {
        return cast(__m128i)__builtin_ia32_pcmpeqq(cast(long2)a, cast(long2)b);
    }
    else version(LDC)
    {
        // LDC x86: generates pcmpeqq since LDC 1.1 -O1
        //   arm64: generates cmeq since LDC 1.8 -O1
        return cast(__m128i) equalMask!long2(cast(long2)a, cast(long2)b);
    }
    else
    {
        // Clever pcmpeqd + pand use with LDC 1.24 -O2
        long2 la = cast(long2)a;
        long2 lb = cast(long2)b;
        long2 res;
        res.ptr[0] = (la.array[0] == lb.array[0]) ? -1 : 0;
        res.ptr[1] = (la.array[1] == lb.array[1]) ? -1 : 0;
        return cast(__m128i)res;
    }
}
unittest
{
    __m128i A = _mm_setr_epi64(-1, -2);
    __m128i B = _mm_setr_epi64(-3, -2);
    __m128i C = _mm_setr_epi64(-1, -4);
    long2 AB = cast(long2) _mm_cmpeq_epi64(A, B);
    long2 AC = cast(long2) _mm_cmpeq_epi64(A, C);
    long[2] correct1 = [0, -1];
    long[2] correct2 = [-1, 0];
    assert(AB.array == correct1);
    assert(AC.array == correct2);
}


/// Sign extend packed 16-bit integers in `a` to packed 32-bit integers.
__m128i _mm_cvtepi16_epi32 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i)__builtin_ia32_pmovsxwd128(cast(short8)a);
    }
    else static if (LDC_with_optimizations)
    {
        // LDC x86: Generates pmovsxwd since LDC 1.1 -O0, also good in arm64
        enum ir = `
            %v = shufflevector <8 x i16> %0,<8 x i16> %0, <4 x i32> <i32 0, i32 1,i32 2, i32 3>
            %r = sext <4 x i16> %v to <4 x i32>
            ret <4 x i32> %r`;
        return cast(__m128i) LDCInlineIR!(ir, int4, short8)(cast(short8)a);
    }
    else
    {
        short8 sa = cast(short8)a;
        int4 r;
        r.ptr[0] = sa.array[0];
        r.ptr[1] = sa.array[1];
        r.ptr[2] = sa.array[2];
        r.ptr[3] = sa.array[3];
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0);
    int4 C = cast(int4) _mm_cvtepi16_epi32(A);
    int[4] correct = [-1, 0, -32768, 32767];
    assert(C.array == correct);
}

/// Sign extend packed 16-bit integers in `a` to packed 64-bit integers.
__m128i _mm_cvtepi16_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i)__builtin_ia32_pmovsxwq128(cast(short8)a);
    }
    else static if (LDC_with_optimizations)
    {
        // LDC x86: Generates pmovsxwq since LDC 1.1 -O0, also good in arm64
        enum ir = `
            %v = shufflevector <8 x i16> %0,<8 x i16> %0, <2 x i32> <i32 0, i32 1>
            %r = sext <2 x i16> %v to <2 x i64>
            ret <2 x i64> %r`;
        return cast(__m128i) LDCInlineIR!(ir, long2, short8)(cast(short8)a);
    }
    else
    {
        short8 sa = cast(short8)a;
        long2 r;
        r.ptr[0] = sa.array[0];
        r.ptr[1] = sa.array[1];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-32768, 32767, 0, 0, 0, 0, 0, 0);
    long2 C = cast(long2) _mm_cvtepi16_epi64(A);
    long[2] correct = [-32768, 32767];
    assert(C.array == correct);
}
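
// Additional illustration (a sketch, not from the upstream test suite): on the same input,
// the _mm_cvtepi*_* intrinsics replicate the sign bit while the _mm_cvtepu*_* intrinsics
// (defined further below) zero-extend, so a 16-bit -1 widens to -1 or to 65535 respectively.
unittest
{
    __m128i A = _mm_setr_epi16(-1, 2, 3, 4, 0, 0, 0, 0);
    int4 S = cast(int4) _mm_cvtepi16_epi32(A); // sign extension
    int4 U = cast(int4) _mm_cvtepu16_epi32(A); // zero extension
    int[4] correctS = [   -1, 2, 3, 4];
    int[4] correctU = [65535, 2, 3, 4];
    assert(S.array == correctS);
    assert(U.array == correctU);
}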

/// Sign extend packed 32-bit integers in `a` to packed 64-bit integers.
__m128i _mm_cvtepi32_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i)__builtin_ia32_pmovsxdq128(cast(int4)a);
    }
    else static if (LDC_with_optimizations)
    {
        // LDC x86: Generates pmovsxdq since LDC 1.1 -O0, also good in arm64
        enum ir = `
            %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1>
            %r = sext <2 x i32> %v to <2 x i64>
            ret <2 x i64> %r`;
        return cast(__m128i) LDCInlineIR!(ir, long2, int4)(cast(int4)a);
    }
    else
    {
        int4 sa = cast(int4)a;
        long2 r;
        r.ptr[0] = sa.array[0];
        r.ptr[1] = sa.array[1];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(-4, 42, 0, 0);
    long2 C = cast(long2) _mm_cvtepi32_epi64(A);
    long[2] correct = [-4, 42];
    assert(C.array == correct);
}


/// Sign extend packed 8-bit integers in `a` to packed 16-bit integers.
__m128i _mm_cvtepi8_epi16 (__m128i a) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i)__builtin_ia32_pmovsxbw128(cast(ubyte16)a);
    }
    else static if (LDC_with_optimizations)
    {
        // LDC x86: pmovsxbw generated since LDC 1.1.0 -O0
        // LDC ARM64: sshll generated since LDC 1.8.0 -O1
        enum ir = `
            %v = shufflevector <16 x i8> %0,<16 x i8> %0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
            %r = sext <8 x i8> %v to <8 x i16>
            ret <8 x i16> %r`;
        return cast(__m128i) LDCInlineIR!(ir, short8, byte16)(cast(byte16)a);
    }
    else
    {
        byte16 sa = cast(byte16)a;
        short8 r;
        foreach(n; 0..8)
            r.ptr[n] = sa.array[n];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    short8 C = cast(short8) _mm_cvtepi8_epi16(A);
    short[8] correct = [127, -128, 1, -1, 0, 2, -4, -8];
    assert(C.array == correct);
}


/// Sign extend packed 8-bit integers in `a` to packed 32-bit integers.
__m128i _mm_cvtepi8_epi32 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i)__builtin_ia32_pmovsxbd128(cast(ubyte16)a);
    }
    else static if (LDC_with_SSE41 && LDC_with_optimizations)
    {
        // LDC x86: Generates pmovsxbd since LDC 1.1 -O0
        enum ir = `
            %v = shufflevector <16 x i8> %0,<16 x i8> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
            %r = sext <4 x i8> %v to <4 x i32>
            ret <4 x i32> %r`;
        return cast(__m128i) LDCInlineIR!(ir, int4, byte16)(cast(byte16)a);
    }
    else
    {
        // LDC ARM64: this gives the same codegen as a vmovl_s16/vmovl_s8 sequence would
        byte16 sa = cast(byte16)a;
        int4 r;
        r.ptr[0] = sa.array[0];
        r.ptr[1] = sa.array[1];
        r.ptr[2] = sa.array[2];
        r.ptr[3] = sa.array[3];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    int4 C = cast(int4) _mm_cvtepi8_epi32(A);
    int[4] correct = [127, -128, 1, -1];
    assert(C.array == correct);
}


/// Sign extend packed 8-bit integers in the low 8 bytes of `a` to packed 64-bit integers.
__m128i _mm_cvtepi8_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i)__builtin_ia32_pmovsxbq128(cast(ubyte16)a);
    }
    else static if (LDC_with_optimizations)
    {
        // LDC x86: Generates pmovsxbq since LDC 1.1 -O0,
        // LDC arm64: it's ok since LDC 1.8 -O1
        enum ir = `
            %v = shufflevector <16 x i8> %0,<16 x i8> %0, <2 x i32> <i32 0, i32 1>
            %r = sext <2 x i8> %v to <2 x i64>
            ret <2 x i64> %r`;
        return cast(__m128i) LDCInlineIR!(ir, long2, byte16)(cast(byte16)a);
    }
    else
    {
        byte16 sa = cast(byte16)a;
        long2 r;
        foreach(n; 0..2)
            r.ptr[n] = sa.array[n];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    long2 C = cast(long2) _mm_cvtepi8_epi64(A);
    long[2] correct = [127, -128];
    assert(C.array == correct);
}


/// Zero extend packed unsigned 16-bit integers in `a` to packed 32-bit integers.
__m128i _mm_cvtepu16_epi32 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmovzxwd128(cast(short8)a);
    }
    else
    {
        // LDC x86: generates pmovzxwd since LDC 1.12 -O1, also good without SSE4.1
        //   arm64: ushll since LDC 1.12 -O1
        short8 sa = cast(short8)a;
        int4 r;
        r.ptr[0] = cast(ushort)sa.array[0];
        r.ptr[1] = cast(ushort)sa.array[1];
        r.ptr[2] = cast(ushort)sa.array[2];
        r.ptr[3] = cast(ushort)sa.array[3];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0);
    int4 C = cast(int4) _mm_cvtepu16_epi32(A);
    int[4] correct = [65535, 0, 32768, 32767];
    assert(C.array == correct);
}


/// Zero extend packed unsigned 16-bit integers in `a` to packed 64-bit integers.
__m128i _mm_cvtepu16_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmovzxwq128(cast(short8)a);
    }
    else static if (LDC_with_ARM64)
    {
        // LDC arm64: a bit shorter than below, in -O2
        short8 sa = cast(short8)a;
        long2 r;
        for(int n = 0; n < 2; ++n)
            r.ptr[n] = cast(ushort)sa.array[n];
        return cast(__m128i)r;
    }
    else
    {
        // LDC x86: generates pmovzxwd since LDC 1.12 -O1, also good without SSE4.1
        short8 sa = cast(short8)a;
        long2 r;
        r.ptr[0] = cast(ushort)sa.array[0];
        r.ptr[1] = cast(ushort)sa.array[1];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0);
    long2 C = cast(long2) _mm_cvtepu16_epi64(A);
    long[2] correct = [65535, 0];
    assert(C.array == correct);
}


/// Zero extend packed unsigned 32-bit integers in `a` to packed 64-bit integers.
__m128i _mm_cvtepu32_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmovzxdq128(cast(int4)a);
    }
    else
    {
        // LDC x86: generates pmovzxdq since LDC 1.12 -O1, also good without SSE4.1
        //   arm64: generates ushll since LDC 1.12 -O1
        int4 sa = cast(int4)a;
        long2 r;
        r.ptr[0] = cast(uint)sa.array[0];
        r.ptr[1] = cast(uint)sa.array[1];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(-1, 42, 0, 0);
    long2 C = cast(long2) _mm_cvtepu32_epi64(A);
    long[2] correct = [4294967295, 42];
    assert(C.array == correct);
}


/// Zero extend packed unsigned 8-bit integers in `a` to packed 16-bit integers.
__m128i _mm_cvtepu8_epi16 (__m128i a) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmovzxbw128(cast(ubyte16)a);
    }
    else static if (LDC_with_optimizations)
    {
        enum ir = `
            %v = shufflevector <16 x i8> %0,<16 x i8> %0, <8 x i32> <i32 0, i32 1,i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
            %r = zext <8 x i8> %v to <8 x i16>
            ret <8 x i16> %r`;
        return cast(__m128i) LDCInlineIR!(ir, short8, byte16)(cast(byte16)a);
    }
    else
    {
        return _mm_unpacklo_epi8(a, _mm_setzero_si128());
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    short8 C = cast(short8) _mm_cvtepu8_epi16(A);
    short[8] correct = [127, 128, 1, 255, 0, 2, 252, 248];
    assert(C.array == correct);
}


/// Zero extend packed unsigned 8-bit integers in `a` to packed 32-bit integers.
__m128i _mm_cvtepu8_epi32 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i) __builtin_ia32_pmovzxbd128(cast(ubyte16)a);
    }
    else static if (LDC_with_ARM64)
    {
        // LDC arm64: a bit better than below in -O2
        byte16 sa = cast(byte16)a;
        int4 r;
        for(int n = 0; n < 4; ++n)
            r.ptr[n] = cast(ubyte)sa.array[n];
        return cast(__m128i)r;
    }
    else
    {
        // LDC x86: generates pmovzxbd since LDC 1.12 -O1, also good without SSE4.1
        // PERF: catastrophic with GDC without SSE4.1
        byte16 sa = cast(byte16)a;
        int4 r;
        r.ptr[0] = cast(ubyte)sa.array[0];
        r.ptr[1] = cast(ubyte)sa.array[1];
        r.ptr[2] = cast(ubyte)sa.array[2];
        r.ptr[3] = cast(ubyte)sa.array[3];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    int4 C = cast(int4) _mm_cvtepu8_epi32(A);
    int[4] correct = [127, 128, 1, 255];
    assert(C.array == correct);
}

/// Zero extend packed unsigned 8-bit integers in the low 8 bytes of `a` to packed 64-bit integers.
__m128i _mm_cvtepu8_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i)__builtin_ia32_pmovzxbq128(cast(ubyte16)a);
    }
    else static if (LDC_with_ARM64)
    {
        // LDC arm64: this optimizes better than the loop below
        byte16 sa = cast(byte16)a;
        long2 r;
        for (int n = 0; n < 2; ++n)
            r.ptr[n] = cast(ubyte)sa.array[n];
        return cast(__m128i)r;
    }
    else
    {
        // LDC x86: Generates pmovzxbq since LDC 1.1 -O0, a pshufb without SSE4.1
        byte16 sa = cast(byte16)a;
        long2 r;
        r.ptr[0] = cast(ubyte)sa.array[0];
        r.ptr[1] = cast(ubyte)sa.array[1];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, -2, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    long2 C = cast(long2) _mm_cvtepu8_epi64(A);
    long[2] correct = [127, 254];
    assert(C.array == correct);
}

/// Conditionally multiply the packed double-precision (64-bit) floating-point elements
/// in `a` and `b` using the high 4 bits in `imm8`, sum the two products, and conditionally
/// store the sum in dst using the low 4 bits of `imm8`.
__m128d _mm_dp_pd(int imm8)(__m128d a, __m128d b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_dppd(a, b, imm8 & 0x33);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_dppd(a, b, imm8 & 0x33);
    }
    else
    {
        __m128d zero = _mm_setzero_pd();
        __m128d temp = _mm_blend_pd!( (imm8 >>> 4) & 3)(zero, a * b);
        double sum = temp.array[0] + temp.array[1];
        return _mm_blend_pd!(imm8 & 3)(zero, _mm_set1_pd(sum));
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(4.0, 8.0);
    double2 R1 = _mm_dp_pd!(0x10 + 0x3 + 0x44)(A, B);
    double2 R2 = _mm_dp_pd!(0x20 + 0x1 + 0x88)(A, B);
    double2 R3 = _mm_dp_pd!(0x30 + 0x2 + 0x00)(A, B);
    double[2] correct1 = [ 4.0,  4.0];
    double[2] correct2 = [16.0,  0.0];
    double[2] correct3 = [ 0.0, 20.0];
    assert(R1.array == correct1);
    assert(R2.array == correct2);
    assert(R3.array == correct3);
}
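
// Worked example (a sketch, not from the upstream test suite) of the imm8 convention used
// by _mm_dp_pd above: the high nibble selects which products participate in the sum, and
// the low nibble selects which result lanes receive the sum (the others are zeroed).
unittest
{
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(4.0, 8.0);
    // 0x31: multiply both lanes (1*4 + 2*8 = 20), write the sum to lane 0 only.
    double2 R = _mm_dp_pd!0x31(A, B);
    double[2] correct = [20.0, 0.0];
    assert(R.array == correct);
}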

/// Conditionally multiply the packed single-precision (32-bit) floating-point elements
/// in `a` and `b` using the high 4 bits in `imm8`, sum the four products,
/// and conditionally store the sum in result using the low 4 bits of `imm8`.
__m128 _mm_dp_ps(int imm8)(__m128 a, __m128 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_dpps(a, b, cast(ubyte)imm8);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_dpps(a, b, cast(byte)imm8);
    }
    else
    {
        __m128 zero = _mm_setzero_ps();
        __m128 temp = _mm_blend_ps!( (imm8 >>> 4) & 15)(zero, a * b);
        float sum = temp.array[0] + temp.array[1] + temp.array[2] + temp.array[3];
        return _mm_blend_ps!(imm8 & 15)(zero, _mm_set1_ps(sum));
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 4.0f, 8.0f);
    __m128 B = _mm_setr_ps(9.0f, 7.0f, 5.0f, 3.0f);
    float4 R1 = _mm_dp_ps!(0xf0 + 0xf)(A, B);
    float4 R2 = _mm_dp_ps!(0x30 + 0x5)(A, B);
    float4 R3 = _mm_dp_ps!(0x50 + 0xa)(A, B);
    float[4] correct1 = [67.0f, 67.0f, 67.0f, 67.0f];
    float[4] correct2 = [23.0f,  0.0f, 23.0f,  0.0f];
    float[4] correct3 = [ 0.0f, 29.0f,  0.0f, 29.0f];
    assert(R1.array == correct1);
    assert(R2.array == correct2);
    assert(R3.array == correct3);
}


/// Extract a 32-bit integer from `a`, selected with `imm8`.
int _mm_extract_epi32 (__m128i a, const int imm8) pure @trusted
{
    return (cast(int4)a).array[imm8 & 3];
}
unittest
{
    __m128i A = _mm_setr_epi32(1, 2, 3, 4);
    assert(_mm_extract_epi32(A, 0) == 1);
    assert(_mm_extract_epi32(A, 1 + 8) == 2);
    assert(_mm_extract_epi32(A, 3 + 4) == 4);
}

/// Extract a 64-bit integer from `a`, selected with `imm8`.
long _mm_extract_epi64 (__m128i a, const int imm8) pure @trusted
{
    long2 la = cast(long2)a;
    return la.array[imm8 & 1];
}
unittest
{
    __m128i A = _mm_setr_epi64(45, -67);
    assert(_mm_extract_epi64(A, 0) == 45);
    assert(_mm_extract_epi64(A, 1) == -67);
    assert(_mm_extract_epi64(A, 2) == 45);
}

/// Extract an 8-bit integer from `a`, selected with `imm8`.
/// Warning: the returned value is zero-extended to 32 bits.
int _mm_extract_epi8 (__m128i a, const int imm8) @trusted
{
    byte16 ba = cast(byte16)a;
    return cast(ubyte) ba.array[imm8 & 15];
}
unittest
{
    __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, 14, 15);
    assert(_mm_extract_epi8(A, 7) == 7);
    assert(_mm_extract_epi8(A, 13) == 255);
    assert(_mm_extract_epi8(A, 7 + 16) == 7);
}

/// Extract a single-precision (32-bit) floating-point element from `a`, selected with `imm8`.
/// Note: returns a 32-bit $(I integer).
int _mm_extract_ps (__m128 a, const int imm8) @trusted
{
    return (cast(int4)a).array[imm8 & 3];
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, -4.0f);
    assert(_mm_extract_ps(A, 0) == 0x3f800000);
    assert(_mm_extract_ps(A, 1 + 8) == 0x40000000);
    assert(_mm_extract_ps(A, 3 + 4) == cast(int)0xc0800000);
}
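
// Additional illustration (a sketch, not from the upstream test suite): _mm_extract_ps above
// returns the raw bit pattern of the selected lane as an int, so recovering the float value
// requires reinterpreting the bits rather than converting the integer.
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, -4.0f);
    int bits = _mm_extract_ps(A, 2);
    float f = *cast(float*)(&bits); // reinterpret, don't convert
    assert(f == 3.0f);
}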


/// Round the packed double-precision (64-bit) floating-point elements in `a` down to an
/// integer value, and store the results as packed double-precision floating-point elements.
__m128d _mm_floor_pd (__m128d a) @trusted
{
    static if (LDC_with_ARM64)
    {
        // LDC arm64 acceptable since 1.8 -O2
        long2 l = vcvtmq_s64_f64(a);
        double2 r;
        r.ptr[0] = l.array[0];
        r.ptr[1] = l.array[1];
        return r;
    }
    else
    {
        return _mm_round_pd!1(a);
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.3f, -2.12f);
    __m128d B = _mm_setr_pd(53.6f, -2.7f);
    A = _mm_floor_pd(A);
    B = _mm_floor_pd(B);
    double[2] correctA = [1.0, -3.0];
    double[2] correctB = [53.0, -3.0];
    assert(A.array == correctA);
    assert(B.array == correctB);
}

/// Round the packed single-precision (32-bit) floating-point elements in `a` down to an
/// integer value, and store the results as packed single-precision floating-point elements.
__m128 _mm_floor_ps (__m128 a) @trusted
{
    static if (LDC_with_ARM64)
    {
        // LDC arm64 acceptable since 1.8 -O1
        int4 l = vcvtmq_s32_f32(a);
        float4 r;
        r.ptr[0] = l.array[0];
        r.ptr[1] = l.array[1];
        r.ptr[2] = l.array[2];
        r.ptr[3] = l.array[3];
        return r;
    }
    else
    {
        return _mm_round_ps!1(a);
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f);
    __m128 C = _mm_floor_ps(A);
    float[4] correct = [1.0f, -3.0f, 53.0f, -3.0f];
    assert(C.array == correct);
}

/// Round the lower double-precision (64-bit) floating-point element in `b` down to an
/// integer value, store the result as a double-precision floating-point element in the
/// lower element, and copy the upper element from `a` to the upper element.
__m128d _mm_floor_sd (__m128d a, __m128d b) @trusted
{
    static if (LDC_with_ARM64)
    {
        a[0] = vcvtms_s64_f64(b[0]);
        return a;
    }
    else
    {
        return _mm_round_sd!1(a, b);
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.3, -2.12);
    __m128d B = _mm_setr_pd(-53.1, -3.7);
    __m128d C = _mm_floor_sd(A, B);
    double[2] correct = [-54.0, -2.12];
    assert(C.array == correct);
}

/// Round the lower single-precision (32-bit) floating-point element in `b` down to an
/// integer value, store the result as a single-precision floating-point element in the
/// lower element, and copy the upper 3 packed elements from `a` to the upper elements.
__m128 _mm_floor_ss (__m128 a, __m128 b) @trusted
{
    static if (LDC_with_ARM64)
    {
        a[0] = vcvtms_s32_f32(b[0]);
        return a;
    }
    else
    {
        return _mm_round_ss!1(a, b);
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.3f, -2.12f, -4.5f, 1.1f);
    __m128 B = _mm_setr_ps(-539.3f, -3.7f, 8.0f, 7.0f);
    __m128 C = _mm_floor_ss(A, B);
    float[4] correct = [-540.0f, -2.12f, -4.5f, 1.1f];
    assert(C.array == correct);
}

/// Insert the 32-bit integer `i` into `a` at the location specified by `imm8[1:0]`.
__m128i _mm_insert_epi32 (__m128i a, int i, const int imm8) pure @trusted
{
    // GDC: nothing special to do, pinsrd generated with -O1 -msse4.1
    // LDC x86: pinsrd since LDC 1.1 -O2 with -mattr=+sse4.1
    // LDC arm64: ins.s since LDC 1.8 -O2
    int4 ia = cast(int4)a;
    ia.ptr[imm8 & 3] = i;
    return cast(__m128i)ia;
}
unittest
{
    __m128i A = _mm_setr_epi32(1, 2, 3, 4);
    int4 C = cast(int4) _mm_insert_epi32(A, 5, 2 + 4);
    int[4] result = [1, 2, 5, 4];
    assert(C.array == result);
}

/// Insert the 64-bit integer `i` into `a` at the location specified by `imm8[0]`.
__m128i _mm_insert_epi64 (__m128i a, long i, const int imm8) pure @trusted
{
    // GDC: nothing special to do, pinsrq generated with -O1 -msse4.1
    // LDC x86: always does something sensible.
    long2 la = cast(long2)a;
    la.ptr[imm8 & 1] = i;
    return cast(__m128i)la;
}
unittest
{
    __m128i A = _mm_setr_epi64(1, 2);
    long2 C = cast(long2) _mm_insert_epi64(A, 5, 1 + 2);
    long[2] result = [1, 5];
    assert(C.array == result);
}

/// Insert the 8-bit integer `i` into `a` at the location specified by `imm8[2:0]`.
/// Copy a to dst, and insert the lower 8-bit integer from i into dst at the location specified by imm8.
__m128i _mm_insert_epi8 (__m128i a, int i, const int imm8) @trusted
{
    // GDC: nothing special to do, pinsrb generated with -O1 -msse4.1
    // LDC x86: doesn't do pinsrb, maybe it's slower. arm64 also spills to memory.
    byte16 ba = cast(byte16)a;
    ba.ptr[imm8 & 15] = cast(byte)i;
    return cast(__m128i)ba;
}
unittest
{
    __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    byte16 C = cast(byte16) _mm_insert_epi8(A, 30, 4 + 16);
    byte[16] result = [0, 1, 2, 3, 30, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
    assert(C.array == result);
}


/// Warning: of course it does something totally different from `_mm_insert_epi32`!
/// Copy `a` to `tmp`, then insert a single-precision (32-bit) floating-point element from `b`
/// into `tmp` using the control in `imm8`. Store `tmp` to result using the mask in `imm8[3:0]`
/// (elements are zeroed out when the corresponding bit is set).
__m128 _mm_insert_ps(int imm8)(__m128 a, __m128 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_insertps128(a, b, cast(ubyte)imm8);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_insertps128(a, b, cast(byte)imm8);
    }
    else
    {
        float4 tmp2 = a;
        float tmp1 = b.array[(imm8 >> 6) & 3];
        tmp2.ptr[(imm8 >> 4) & 3] = tmp1;
        return _mm_blend_ps!(imm8 & 15)(tmp2, _mm_setzero_ps());
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
    __m128 C = _mm_insert_ps!(128 + (32 + 16) + 4)(A, B);
    float[4] correct = [1.0f, 2.0f, 0.0f, 7.0f];
    assert(C.array == correct);
}
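
// Worked example (a sketch, not from the upstream test suite) of the imm8 fields used by
// _mm_insert_ps above: bits 7:6 pick the source lane of `b`, bits 5:4 pick the destination
// lane in `a`, and bits 3:0 zero the corresponding result lanes.
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
    // 0xC0 = (3 << 6) | (0 << 4) | 0: put b[3] into lane 0, zero nothing.
    __m128 R = _mm_insert_ps!0xC0(A, B);
    float[4] correct = [8.0f, 2.0f, 3.0f, 4.0f];
    assert(R.array == correct);
}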

/// Compare packed signed 32-bit integers in `a` and `b`, returns packed maximum values.
__m128i _mm_max_epi32 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmaxsd128(cast(int4)a, cast(int4)b);
    }
    else version(LDC)
    {
        // x86: pmaxsd since LDC 1.1 -O1
        // ARM: smax.4s since LDC 1.8 -O1
        int4 sa = cast(int4)a;
        int4 sb = cast(int4)b;
        static if (SIMD_COMPARISON_MASKS_16B)
            int4 greater = sa > sb;
        else
            int4 greater = greaterMask!int4(sa, sb);
        return cast(__m128i)( (greater & sa) | (~greater & sb) );
    }
    else
    {
        __m128i higher = _mm_cmpgt_epi32(a, b);
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, higher);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    int4 R = cast(int4) _mm_max_epi32(_mm_setr_epi32(0x7fffffff, 1, -4, 7),
                                      _mm_setr_epi32(        -4,-8,  9, -8));
    int[4] correct =                                [0x7fffffff, 1,  9,  7];
    assert(R.array == correct);
}

/// Compare packed signed 8-bit integers in `a` and `b`,
/// and return packed maximum values.
__m128i _mm_max_epi8 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmaxsb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else version(LDC)
    {
        // x86: pmaxsb since LDC 1.1 -O1
        // ARM64: smax.16b since LDC 1.8.0 -O1
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        static if (SIMD_COMPARISON_MASKS_16B)
            byte16 greater = sa > sb;
        else
            byte16 greater = cast(byte16) greaterMask!byte16(sa, sb);
        return cast(__m128i)( (greater & sa) | (~greater & sb) );
    }
    else
    {
        __m128i lower = _mm_cmpgt_epi8(a, b); // ones where a should be selected, b else
        __m128i aTob = _mm_xor_si128(a, b);   // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, lower);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127,  1, -4, -8, 9,    7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0);
    __m128i B = _mm_setr_epi8(  4, -8,  9, -7, 0, -128, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0);
    byte16 R = cast(byte16) _mm_max_epi8(A, B);
    byte[16] correct =       [127,  1,  9, -7, 9,    7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(R.array == correct);
}

/// Compare packed unsigned 16-bit integers in `a` and `b`, returns packed maximum values.
__m128i _mm_max_epu16 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmaxuw128(cast(short8)a, cast(short8)b);
    }
    else version(LDC)
    {
        // x86: pmaxuw since LDC 1.1 -O1
        // ARM64: umax.8h since LDC 1.8.0 -O1
        // PERF: without sse4.1, LLVM 12 produces a very interesting
        //       psubusw xmm0, xmm1
        //       paddw   xmm0, xmm1
        //       sequence that maybe should go in other min/max intrinsics?
        ushort8 sa = cast(ushort8)a;
        ushort8 sb = cast(ushort8)b;
        static if (SIMD_COMPARISON_MASKS_16B)
        {
            // Note: doesn't work well with GDC, which prefers the builtin.
            ushort8 greater = sa > sb;
        }
        else
            ushort8 greater = cast(ushort8) greaterMask!ushort8(sa, sb);
        return cast(__m128i)( (greater & sa) | (~greater & sb) );
    }
    else
    {
        b = _mm_subs_epu16(b, a);
        b = _mm_add_epi16(b, a);
        return b;
    }
}
unittest
{
    short8 R = cast(short8) _mm_max_epu16(_mm_setr_epi16(32767,  1, -4, -8, 9,      7, 0, 57),
                                          _mm_setr_epi16(   -4, -8,  9, -7, 0, -32768, 0,  0));
    short[8] correct =                                   [   -4, -8, -4, -7, 9, -32768, 0, 57];
    assert(R.array == correct);
}

/// Compare packed unsigned 32-bit integers in `a` and `b`, returns packed maximum values.
__m128i _mm_max_epu32 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmaxud128(cast(int4)a, cast(int4)b);
    }
    else version(LDC)
    {
        // x86: pmaxud since LDC 1.1 -O1, also good without sse4.1
        // ARM64: umax.4s since LDC 1.8.0 -O1
        uint4 sa = cast(uint4)a;
        uint4 sb = cast(uint4)b;
        static if (SIMD_COMPARISON_MASKS_16B)
            uint4 greater = sa > sb;
        else
            uint4 greater = cast(uint4) greaterMask!uint4(sa, sb);
        return cast(__m128i)( (greater & sa) | (~greater & sb) );
    }
    else
    {
        // PERF: LLVM suggests to replace the _mm_add_epi32 by _mm_xor_si128, and the last xor by an "_mm_or_si128"
        /+
            movdqa xmm2, xmmword ptr [-0x80000000, -0x80000000, -0x80000000, -0x80000000]
            movdqa xmm3, xmm1
            pxor xmm3, xmm2
            pxor xmm2, xmm0
            pcmpgtd xmm2, xmm3
            pand xmm0, xmm2
            pandn xmm2, xmm1
            por xmm0, xmm2
        +/
        __m128i valueShift = _mm_set1_epi32(-0x80000000);
        __m128i higher = _mm_cmpgt_epi32(_mm_add_epi32(a, valueShift), _mm_add_epi32(b, valueShift));
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, higher);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    int4 R = cast(int4) _mm_max_epu32(_mm_setr_epi32(0x7fffffff, 1, 4, -7),
                                      _mm_setr_epi32(        -4,-8, 9, -8));
    int[4] correct =                                [        -4,-8, 9, -7];
    assert(R.array == correct);
}

/// Compare packed signed 32-bit integers in `a` and `b`, returns packed minimum values.
__m128i _mm_min_epi32 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pminsd128(cast(int4)a, cast(int4)b);
    }
    else version(LDC)
    {
        // x86: pminsd since LDC 1.1 -O1, also good without sse4.1
        // ARM: smin.4s since LDC 1.8 -O1
        int4 sa = cast(int4)a;
        int4 sb = cast(int4)b;
        static if (SIMD_COMPARISON_MASKS_16B)
            int4 greater = sa > sb;
        else
            int4 greater = greaterMask!int4(sa, sb);
        return cast(__m128i)( (~greater & sa) | (greater & sb) );
    }
    else
    {
        __m128i higher = _mm_cmplt_epi32(a, b);
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, higher);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    int4 R = cast(int4) _mm_min_epi32(_mm_setr_epi32(0x7fffffff,  1, -4,  7),
                                      _mm_setr_epi32(        -4, -8,  9, -8));
    int[4] correct =                                [        -4, -8, -4, -8];
    assert(R.array == correct);
}
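
// Additional illustration (a sketch, not from the upstream test suite): the signed and
// unsigned minimum intrinsics disagree whenever a lane has its top bit set, e.g. -1 is the
// smallest signed value here but the largest unsigned one (_mm_min_epu32 is defined below).
unittest
{
    __m128i A = _mm_setr_epi32(-1, 10, -5, 3);
    __m128i B = _mm_setr_epi32( 1, 20,  7, 2);
    int4 S = cast(int4) _mm_min_epi32(A, B);
    int4 U = cast(int4) _mm_min_epu32(A, B);
    int[4] correctS = [-1, 10, -5, 2];
    int[4] correctU = [ 1, 10,  7, 2];
    assert(S.array == correctS);
    assert(U.array == correctU);
}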

/// Compare packed signed 8-bit integers in `a` and `b`,
/// and return packed minimum values.
__m128i _mm_min_epi8 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pminsb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else version(LDC)
    {
        // x86: pminsb since LDC 1.1 -O1
        // ARM64: smin.16b since LDC 1.8.0 -O1
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        static if (SIMD_COMPARISON_MASKS_16B)
            byte16 greater = sa > sb;
        else
            byte16 greater = cast(byte16) greaterMask!byte16(sa, sb);
        return cast(__m128i)( (~greater & sa) | (greater & sb) );
    }
    else
    {
        __m128i lower = _mm_cmplt_epi8(a, b); // ones where a should be selected, b else
        __m128i aTob = _mm_xor_si128(a, b);   // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, lower);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127,  1, -4, -8, 9,    7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0);
    __m128i B = _mm_setr_epi8(  4, -8,  9, -7, 0, -128, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0);
    byte16 R = cast(byte16) _mm_min_epi8(A, B);
    byte[16] correct =       [  4, -8, -4, -8, 0, -128, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(R.array == correct);
}

/// Compare packed unsigned 16-bit integers in `a` and `b`, and store packed minimum values in dst.
__m128i _mm_min_epu16 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pminuw128(cast(short8)a, cast(short8)b);
    }
    else version(LDC)
    {
        // x86: pminuw since LDC 1.1 -O1, psubusw+psubw sequence without sse4.1
        // ARM64: umin.8h since LDC 1.8.0 -O1
        ushort8 sa = cast(ushort8)a;
        ushort8 sb = cast(ushort8)b;
        static if (SIMD_COMPARISON_MASKS_16B)
            ushort8 greater = (sb > sa);
        else
            ushort8 greater = cast(ushort8) greaterMask!ushort8(sb, sa);
        return cast(__m128i)( (greater & sa) | (~greater & sb) );
    }
    else
    {
        __m128i c = _mm_subs_epu16(b, a);
        b = _mm_sub_epi16(b, c);
        return b;
    }
}
unittest
{
    short8 R = cast(short8) _mm_min_epu16(_mm_setr_epi16(32767,  1, -4, -8, 9,      7, 0, 57),
                                          _mm_setr_epi16(   -4, -8,  9, -7, 0, -32768, 0,  0));
    short[8] correct =                                   [32767,  1,  9, -8, 0,      7, 0,  0];
    assert(R.array == correct);
}

/// Compare packed unsigned 32-bit integers in `a` and `b`, and store packed minimum values in dst.
__m128i _mm_min_epu32 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pminud128(cast(int4)a, cast(int4)b);
    }
    else version(LDC)
    {
        // x86: pminud since LDC 1.1 -O1, also good without sse4.1
        // ARM64: umin.4s since LDC 1.8.0 -O1
        uint4 sa = cast(uint4)a;
        uint4 sb = cast(uint4)b;
        static if (SIMD_COMPARISON_MASKS_16B)
            uint4 greater = sa > sb;
        else
            uint4 greater = cast(uint4) greaterMask!uint4(sa, sb);
        return cast(__m128i)( (~greater & sa) | (greater & sb) );
    }
    else
    {
        // PERF: same remark as in _mm_max_epu32
        __m128i valueShift = _mm_set1_epi32(-0x80000000);
        __m128i higher = _mm_cmpgt_epi32(_mm_add_epi32(b, valueShift), _mm_add_epi32(a, valueShift));
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, higher);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    int4 R = cast(int4) _mm_min_epu32(_mm_setr_epi32(0x7fffffff, 1, 4, -7),
                                      _mm_setr_epi32(        -4,-8, 9, -8));
    int[4] correct =                                [0x7fffffff, 1, 4, -8];
    assert(R.array == correct);
}

/// Horizontally compute the minimum amongst the packed unsigned 16-bit integers in `a`,
/// store the minimum and index in return value, and zero the remaining bits.
__m128i _mm_minpos_epu16 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_phminposuw128(cast(short8)a);
    }
    else static if (LDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_phminposuw128(cast(short8)a);
    }
    else static if (LDC_with_ARM64)
    {
        __m128i indices = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        __m128i combinedLo = _mm_unpacklo_epi16(indices, a);
        __m128i combinedHi = _mm_unpackhi_epi16(indices, a);
        __m128i best = _mm_min_epu32(combinedLo, combinedHi);
        best = _mm_min_epu32(best, _mm_srli_si128!8(best));
        best = _mm_min_epu32(best, _mm_srli_si128!4(best));
        short8 sbest = cast(short8)best;
        short8 r;
        r[0] = sbest[1];
        r[1] = sbest[0]; // Note: the search must have inverted index in order to prioritize lower index in case of tie
        r[2] = 0;
        r[3] = 0;
        r[4] = 0;
        r[5] = 0;
        r[6] = 0;
        r[7] = 0;
        return cast(__m128i)r;
    }
    else
    {
        short8 sa = cast(short8)a;
        ushort min = 0xffff;
        int index = 0;
        for(int n = 0; n < 8; ++n)
        {
            ushort c = sa.array[n];
            if (c < min)
            {
                min = c;
                index = n;
            }
        }
        short8 r;
        r.ptr[0] = min;
        r.ptr[1] = cast(short)index;
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(14, 15, 1, 2, -3, 4, 5, 6);
    __m128i B = _mm_setr_epi16(14,  4, 4, 2, -3, 2, 5, 6);
    short8 R1 = cast(short8) _mm_minpos_epu16(A);
    short8 R2 = cast(short8) _mm_minpos_epu16(B);
    short[8] correct1 = [1, 2, 0, 0, 0, 0, 0, 0];
    short[8] correct2 = [2, 3, 0, 0, 0, 0, 0, 0];
    assert(R1.array == correct1);
    assert(R2.array == correct2);
}

/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers
/// in `a` compared to those in `b`, and store the 16-bit results in dst.
/// Eight SADs are performed using one quadruplet from `b` and eight quadruplets from `a`.
/// One quadruplet is selected from `b` starting at the offset specified in `imm8[1:0]`.
/// Eight quadruplets are formed from sequential 8-bit integers selected from `a` starting
/// at the offset specified in `imm8[2]`.
__m128i _mm_mpsadbw_epu8(int imm8)(__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_mpsadbw128(cast(ubyte16)a, cast(ubyte16)b, cast(ubyte)imm8);
    }
    else static if (LDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_mpsadbw128(cast(byte16)a, cast(byte16)b, cast(byte)imm8);
    }
    else
    {
        int a_offset = ((imm8 & 4) >> 2) * 4; // Yes, the two high order quadruplets are unaddressable...
        int b_offset = (imm8 & 3) * 4;

        byte16 ba = cast(byte16)a;
        byte16 bb = cast(byte16)b;
        short8 r;

        __m128i comp_b = _mm_setr_epi32(b.array[imm8 & 3], 0, b.array[imm8 & 3], 0);

        for (int j = 0; j < 8; j += 2)
        {
            int k = a_offset + j;
            __m128i comp_a = _mm_setr_epi8(ba[k+0], ba[k+1], ba[k+2], ba[k+3],
                                           0, 0, 0, 0,
                                           ba[k+1], ba[k+2], ba[k+3], ba[k+4],
                                           0, 0, 0, 0);
            short8 diffs = cast(short8) _mm_sad_epu8(comp_a, comp_b); // reusing this wins instructions in both x86 and arm64
            r.ptr[j]   = diffs.array[0];
            r.ptr[j+1] = diffs.array[4];
        }
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(0, 1, 2, 3,  4,  5, 6,  7, 8, 9, 10, 11, 12, 13, 14, 15);
    __m128i B = _mm_setr_epi8(9, 1, 2, 3, -1, -1, 0, -1, 5, 5,  5,  5, 12, 13, 14, 15);
    short[8] correct0 = [9, 11, 13, 15, 17, 19, 21, 23];
    short[8] correct1 = [763, 761, 759, 757, 755, 753, 751, 749];
    short[8] correct4 = [17, 19, 21, 23, 25, 27, 31, 35];
    short[8] correct5 = [755, 753, 751, 749, 747, 745, 743, 741];
    short[8] correct7 = [32, 28, 24, 20, 16, 12, 8, 4];
    short8 r1 = cast(short8) _mm_mpsadbw_epu8!1(A, B);
    short8 r4 = cast(short8) _mm_mpsadbw_epu8!4(A, B);
    short8 r5 = cast(short8) _mm_mpsadbw_epu8!5(A, B);
    short8 r7 = cast(short8) _mm_mpsadbw_epu8!7(A, B);
    short8 r8 = cast(short8) _mm_mpsadbw_epu8!8(A, B);
    assert(r1.array == correct1);
    assert(r4.array == correct4);
    assert(r5.array == correct5);
    assert(r7.array == correct7);
    assert(r8.array == correct0);
}
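
// Worked example (a sketch, not from the upstream test suite) of how a single output element
// of _mm_mpsadbw_epu8 above is formed: with imm8 = 1 the quadruplet taken from B is its
// bytes 4..7, i.e. (255, 255, 0, 255) as unsigned values, and result element 0 is its SAD
// against bytes 0..3 of A.
unittest
{
    __m128i A = _mm_setr_epi8(0, 1, 2, 3,  4,  5, 6,  7, 8, 9, 10, 11, 12, 13, 14, 15);
    __m128i B = _mm_setr_epi8(9, 1, 2, 3, -1, -1, 0, -1, 5, 5,  5,  5, 12, 13, 14, 15);
    short8 R = cast(short8) _mm_mpsadbw_epu8!1(A, B);
    // |0-255| + |1-255| + |2-0| + |3-255| = 255 + 254 + 2 + 252 = 763
    assert(R.array[0] == 763);
}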

/// Multiply the low signed 32-bit integers from each packed 64-bit element in `a` and `b`,
/// and store the signed 64-bit results in dst.
__m128i _mm_mul_epi32 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmuldq128(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_SSE41 && LDC_with_optimizations)
    {
        // For some reason, clang has the builtin but it's not in IntrinsicsX86.td
        // Use IR instead.
        // This generates pmuldq since LDC 1.2.0 -O0
        enum ir = `
            %ia = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 2>
            %ib = shufflevector <4 x i32> %1,<4 x i32> %1, <2 x i32> <i32 0, i32 2>
            %la = sext <2 x i32> %ia to <2 x i64>
            %lb = sext <2 x i32> %ib to <2 x i64>
            %r = mul <2 x i64> %la, %lb
            ret <2 x i64> %r`;
        return cast(__m128i) LDCInlineIR!(ir, long2, int4, int4)(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_ARM64)
    {
        // 3 instructions since LDC 1.8 -O2
        // But had to make vmull_s32 be a builtin else it wouldn't optimize to smull
        int2 a_lo = vmovn_s64(cast(long2)a);
        int2 b_lo = vmovn_s64(cast(long2)b);
        return cast(__m128i) vmull_s32(a_lo, b_lo);
    }
    else
    {
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        long2 r;
        r.ptr[0] = cast(long)ia.array[0] * ib.array[0];
        r.ptr[1] = cast(long)ia.array[2] * ib.array[2];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(61616461, 1915324654, 4564061, 3);
    __m128i B = _mm_setr_epi32(49716422, -915616216, -121144, 0);
    long2 R = cast(long2) _mm_mul_epi32(A, B);
    long[2] correct = [cast(long)61616461 * 49716422, cast(long)4564061 * -121144];
    assert(R.array == correct);
}

/// Multiply the packed 32-bit integers in `a` and `b`, producing intermediate 64-bit integers,
/// return the low 32 bits of the intermediate integers.
__m128i _mm_mullo_epi32 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    // PERF GDC without SSE4.1 could be better
    static if (GDC_with_SSE41)
    {
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        // Note: older GDC doesn't have that op, but older GDC
        // also has no support for -msse4.1 detection
        return cast(__m128i)(a * b);
    }
    else version(LDC)
    {
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        return cast(__m128i)(a * b);
    }
    else
    {
        // DMD doesn't take the above
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        int4 r;
        r.ptr[0] = ia.array[0] * ib.array[0];
        r.ptr[1] = ia.array[1] * ib.array[1];
        r.ptr[2] = ia.array[2] * ib.array[2];
        r.ptr[3] = ia.array[3] * ib.array[3];
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(61616461, 1915324654, 4564061, 3);
    __m128i B = _mm_setr_epi32(49716422, -915616216, -121144, 0);
    int4 R = cast(int4) _mm_mullo_epi32(A, B);
    int[4] correct = [cast(int)0xBF370D8E, cast(int)(1915324654 * -915616216), cast(int)(4564061 * -121144), 0];
    assert(R.array == correct);
}


/// Convert packed signed 32-bit integers from `a` and `b`
/// to packed 16-bit integers using unsigned saturation.
__m128i _mm_packus_epi32 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_packusdw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_packusdw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        int4 z;
        z = 0;
        return cast(__m128i) vcombine_u16(vqmovn_u32(vmaxq_s32(z, cast(int4)a)),
                                          vqmovn_u32(vmaxq_s32(z, cast(int4)b)));
    }
    else
    {
        __m128i i32768 = _mm_set1_epi32(32768);
        __m128i s32768 = _mm_set1_epi16(-32768);
        a = _mm_sub_epi32(a, i32768);
        b = _mm_sub_epi32(b, i32768);
        __m128i clampedSigned = _mm_packs_epi32(a, b);
        return _mm_add_epi16(clampedSigned, s32768);
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0);
    short8 R = cast(short8) _mm_packus_epi32(A, A);
    short[8] correct = [cast(short)65535, 0, 1000, 0, cast(short)65535, 0, 1000, 0];
    assert(R.array == correct);
}


/// Round the packed double-precision (64-bit) floating-point elements in `a` using the
/// rounding parameter, and store the results as packed double-precision floating-point elements.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
__m128d _mm_round_pd(int rounding)(__m128d a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_roundpd(a, rounding);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_roundpd(a, rounding);
    }
    else
    {
        static if (rounding & _MM_FROUND_CUR_DIRECTION)
        {
            // Convert to 64-bit integers
            long lo = _mm_cvtsd_si64(a);
            a.ptr[0] = a.array[1];
            long hi = _mm_cvtsd_si64(a);
            return _mm_setr_pd(lo, hi);
        }
        else
        {
            version(GNU) pragma(inline, false); // else fail unittest with optimizations

            uint old = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);

            // Convert to 64-bit integers
            long lo = _mm_cvtsd_si64(a);
            a.ptr[0] = a.array[1];
            long hi = _mm_cvtsd_si64(a);

            // Convert back to double to achieve the rounding
            // The problem is that a 64-bit double can't represent all the values
            // a 64-bit integer can (and vice-versa). So this function won't work for
            // large values. (TODO: what range exactly?)
            _MM_SET_ROUNDING_MODE(old);
            return _mm_setr_pd(lo, hi);
        }
    }
}
unittest
{
    // tested in other intrinsics
}
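
// Additional illustration (a sketch, not from the upstream test suite) of the rounding
// constants accepted by _mm_round_pd above: round-to-nearest takes -1.75 to -2.0, while
// _MM_FROUND_CEIL rounds it up to -1.0.
unittest
{
    __m128d A = _mm_setr_pd(2.25, -1.75);
    double2 N = _mm_round_pd!(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)(A);
    double2 C = _mm_round_pd!(_MM_FROUND_CEIL)(A);
    double[2] correctN = [2.0, -2.0];
    double[2] correctC = [3.0, -1.0];
    assert(N.array == correctN);
    assert(C.array == correctC);
}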

/// Round the packed single-precision (32-bit) floating-point elements in `a` using the
/// rounding parameter, and store the results as packed single-precision floating-point elements.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
///    _MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
__m128 _mm_round_ps(int rounding)(__m128 a) @trusted
{
    // PERF ARM64: there is duplication because this isn't optimal for ARM64, so it is avoided externally
    static if (GDC_or_LDC_with_SSE41)
    {
        return __builtin_ia32_roundps(a, rounding);
    }
    else
    {
        static if (rounding & _MM_FROUND_CUR_DIRECTION)
        {
            __m128i integers = _mm_cvtps_epi32(a);
            return _mm_cvtepi32_ps(integers);
        }
        else
        {
            version(LDC) pragma(inline, false); // else _MM_SET_ROUNDING_MODE and _mm_cvtps_epi32 get shuffled
            uint old = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
            scope(exit) _MM_SET_ROUNDING_MODE(old);

            // Convert to 32-bit integers
            __m128i integers = _mm_cvtps_epi32(a);

            // Convert back to float to achieve the rounding.
            // The problem is that a 32-bit float can't represent all the values
            // a 32-bit integer can (and vice-versa). So this function won't work for
            // large values. (TODO: what range exactly?)
            __m128 result = _mm_cvtepi32_ps(integers);

            return result;
        }
    }
}
unittest
{
    // tested in other intrinsics
}
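
// Added note (sketch): the MXCSR-based fallbacks above and below work because the low two
// bits of the explicit _MM_FROUND_* modes, shifted left by 13, are exactly the MXCSR.RC
// encodings that _MM_SET_ROUNDING_MODE expects (_MM_ROUND_NEAREST, _MM_ROUND_DOWN,
// _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO).
unittest
{
    static assert((_MM_FROUND_TO_NEAREST_INT << 13) == 0x0000); // _MM_ROUND_NEAREST
    static assert((_MM_FROUND_TO_NEG_INF     << 13) == 0x2000); // _MM_ROUND_DOWN
    static assert((_MM_FROUND_TO_POS_INF     << 13) == 0x4000); // _MM_ROUND_UP
    static assert((_MM_FROUND_TO_ZERO        << 13) == 0x6000); // _MM_ROUND_TOWARD_ZERO
}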

/// Round the lower double-precision (64-bit) floating-point element in `b` using the
/// rounding parameter, store the result as a double-precision floating-point element
/// in the lower element of result, and copy the upper element from `a` to the upper element of result.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
///    _MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
__m128d _mm_round_sd(int rounding)(__m128d a, __m128d b) @trusted
{
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_roundsd(a, b, rounding);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_roundsd(a, b, rounding);
    }
    else
    {
        static if (rounding & _MM_FROUND_CUR_DIRECTION)
        {
            // Convert to 64-bit integer
            long b0 = _mm_cvtsd_si64(b);
            a.ptr[0] = b0;
            return a;
        }
        else
        {
            version(GNU) pragma(inline, false); // else it fails unittests with optimizations

            uint old = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);

            // Convert to 64-bit integer
            long b0 = _mm_cvtsd_si64(b);
            a.ptr[0] = b0;

            // Convert back to double to achieve the rounding.
            // The problem is that a 64-bit double can't represent all the values
            // a 64-bit integer can (and vice-versa). So this function won't work for
            // large values. (TODO: what range exactly?)
            _MM_SET_ROUNDING_MODE(old);
            return a;
        }
    }
}
unittest
{
    // tested in other intrinsics
}


/// Round the lower single-precision (32-bit) floating-point element in `b` using the
/// rounding parameter, store the result as a single-precision floating-point element
/// in the lower element of result, and copy the upper 3 packed elements from `a`
/// to the upper elements of result.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
///    _MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
__m128 _mm_round_ss(int rounding)(__m128 a, __m128 b) @trusted
{
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_roundss(a, b, rounding);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_roundss(a, b, rounding);
    }
    else
    {
        static if (rounding & _MM_FROUND_CUR_DIRECTION)
        {
            int b0 = _mm_cvtss_si32(b);
            a.ptr[0] = b0;
            return a;
        }
        else version(GNU)
        {
            pragma(inline, false)
            __m128 GDCworkaround() nothrow @nogc @trusted
            {
                uint old = _MM_GET_ROUNDING_MODE();
                _MM_SET_ROUNDING_MODE((rounding & 3) << 13);

                // Convert to 32-bit integer
                int b0 = _mm_cvtss_si32(b);
                a.ptr[0] = b0;

                // Convert back to float to achieve the rounding.
                // The problem is that a 32-bit float can't represent all the values
                // a 32-bit integer can (and vice-versa). So this function won't work for
                // large values. (TODO: what range exactly?)
                _MM_SET_ROUNDING_MODE(old);
                return a;
            }
            return GDCworkaround();
        }
        else
        {
            uint old = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);

            // Convert to 32-bit integer
            int b0 = _mm_cvtss_si32(b);
            a.ptr[0] = b0;

            // Convert back to float to achieve the rounding.
            // The problem is that a 32-bit float can't represent all the values
            // a 32-bit integer can (and vice-versa). So this function won't work for
            // large values. (TODO: what range exactly?)
            _MM_SET_ROUNDING_MODE(old);
            return a;
        }
    }
}
unittest
{
    // tested in other intrinsics
}
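
// Illustrative example (added; not an upstream unittest): `_mm_round_ss` only replaces the
// lowest lane with the rounded value from `b`; the upper three lanes are copied from `a`.
unittest
{
    __m128 A = _mm_setr_ps(9.0f, 10.0f, 11.0f, 12.0f);
    __m128 B = _mm_setr_ps(2.7f, -100.0f, -100.0f, -100.0f);
    float4 R = _mm_round_ss!(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)(A, B);
    float[4] correct = [2.0f, 10.0f, 11.0f, 12.0f];
    assert(R.array == correct);
}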

/// Load 128-bits of integer data from memory using a non-temporal memory hint.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
/// exception may be generated.
__m128i _mm_stream_load_si128 (__m128i * mem_addr) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_movntdqa(cast(long2*)mem_addr);
    }
    else static if (LDC_with_InlineIREx && LDC_with_optimizations)
    {
        enum prefix = `!0 = !{ i32 1 }`;
        enum ir = `
            %r = load <4 x i32>, <4 x i32>* %0, !nontemporal !0
            ret <4 x i32> %r`;
        return cast(__m128i) LDCInlineIREx!(prefix, ir, "", int4, int4*)(mem_addr);
    }
    else
    {
        return *mem_addr; // regular move instead
    }
}
unittest
{
    align(16) static immutable int[4] correct = [1, 2, 3, 4];
    __m128i A = _mm_stream_load_si128(cast(__m128i*)(correct.ptr));
    _mm_mfence();
    assert(A.array == correct);
}

/// Return 1 if all bits in `a` are 1, else return 0.
int _mm_test_all_ones (__m128i a) @safe
{
    return _mm_testc_si128(a, _mm_set1_epi32(-1));
}
unittest
{
    __m128i A = _mm_set1_epi32(-1);
    __m128i B = _mm_set_epi32(-1, -2, -1, -1);
    assert(_mm_test_all_ones(A) == 1);
    assert(_mm_test_all_ones(B) == 0);
}

/// Return 1 if all bits in `a` are 0, else return 0.
// This is a #BONUS since it is lacking in the Intel Intrinsics API.
int _mm_test_all_zeros (__m128i a) @safe
{
    return _mm_testz_si128(a, _mm_set1_epi32(-1));
}
unittest
{
    __m128i A = _mm_set1_epi32(0);
    __m128i B = _mm_set_epi32(0, 8, 0, 0);
    assert(_mm_test_all_zeros(A) == 1);
    assert(_mm_test_all_zeros(B) == 0);
}

/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `mask`,
/// and return 1 if the result is zero, otherwise return 0.
int _mm_test_all_zeros (__m128i a, __m128i mask) @safe
{
    return _mm_testz_si128(a, mask); // it's really the same, but with a better name
}

/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `mask`, and set ZF to 1
/// if the result is zero, otherwise set ZF to 0. Compute the bitwise NOT of `a` and then AND with
/// `mask`, and set CF to 1 if the result is zero, otherwise set CF to 0. Return 1 if both the ZF and
/// CF values are zero, otherwise return 0.
int _mm_test_mix_ones_zeros (__m128i a, __m128i mask) @trusted
{
    return _mm_testnzc_si128(a, mask);
}
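
// Illustrative example (added; not an upstream unittest): `_mm_test_mix_ones_zeros` returns 1
// only when `mask` covers both some 1-bits and some 0-bits of `a`.
unittest
{
    __m128i A    = _mm_setr_epi32(0xF0, 0, 0, 0);
    __m128i both = _mm_setr_epi32(0xFF, 0, 0, 0); // selects ones (0xF0) and zeros (0x0F) of A
    __m128i ones = _mm_setr_epi32(0xF0, 0, 0, 0); // selects only ones of A
    assert(_mm_test_mix_ones_zeros(A, both) == 1);
    assert(_mm_test_mix_ones_zeros(A, ones) == 0);
    assert(_mm_test_mix_ones_zeros(A, _mm_setzero_si128()) == 0);
}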

/// Compute the bitwise NOT of `a` and then AND with `b`, and return 1 if the
/// result is zero, otherwise return 0.
/// In other words, test if all bits masked by `b` are 1 in `a`.
int _mm_testc_si128 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_ptestc128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_ptestc128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_ARM64)
    {
        // Acceptable since LDC 1.8 -O2
        long2 s64 = vbicq_s64(cast(long2)b, cast(long2)a);
        return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
    }
    else
    {
        __m128i c = ~a & b;
        int[4] zero = [0, 0, 0, 0];
        return c.array == zero;
    }
}
unittest
{
    __m128i A  = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
    __m128i M1 = _mm_setr_epi32(0xfe, 0xfd, 0x00, 0x00);
    __m128i M2 = _mm_setr_epi32(0x00, 0x00, 0x04, 0x00);
    assert(_mm_testc_si128(A, A) == 1);
    assert(_mm_testc_si128(A, M1) == 0);
    assert(_mm_testc_si128(A, M2) == 1);
}

/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`,
/// and set ZF to 1 if the result is zero, otherwise set ZF to 0.
/// Compute the bitwise NOT of `a` and then AND with `b`, and set CF to 1 if the
/// result is zero, otherwise set CF to 0.
/// Return 1 if both the ZF and CF values are zero, otherwise return 0.
int _mm_testnzc_si128 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_ptestnzc128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_ptestnzc128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_ARM64)
    {
        long2 s640 = vandq_s64(cast(long2)b, cast(long2)a);
        long2 s641 = vbicq_s64(cast(long2)b, cast(long2)a);

        return !( !(vgetq_lane_s64(s641, 0) | vgetq_lane_s64(s641, 1))
                | !(vgetq_lane_s64(s640, 0) | vgetq_lane_s64(s640, 1)) );
    }
    else
    {
        __m128i c = a & b;
        __m128i d = ~a & b;
        int[4] zero = [0, 0, 0, 0];
        return !( (c.array == zero) || (d.array == zero));
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
    __m128i M = _mm_setr_epi32(0x01, 0x40, 0x00, 0x00);
    __m128i Z = _mm_setzero_si128();
    assert(_mm_testnzc_si128(A, Z) == 0);
    assert(_mm_testnzc_si128(A, M) == 1);
    assert(_mm_testnzc_si128(A, A) == 0);
}
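
// Illustrative example (added; not an upstream unittest): a typical use of `_mm_testc_si128`
// is checking that a comparison result is all-ones, i.e. that a predicate holds in every lane.
unittest
{
    __m128i a   = _mm_setr_epi32(1, 2, 3, 4);
    __m128i b   = _mm_setr_epi32(1, 2, 3, 4);
    __m128i c   = _mm_setr_epi32(1, 2, 3, 5);
    __m128i all = _mm_set1_epi32(-1);
    assert(_mm_testc_si128(_mm_cmpeq_epi32(a, b), all) == 1); // all lanes equal
    assert(_mm_testc_si128(_mm_cmpeq_epi32(a, c), all) == 0); // last lane differs
}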

/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`,
/// and return 1 if the result is zero, otherwise return 0.
/// In other words, test if all bits masked by `b` are 0 in `a`.
int _mm_testz_si128 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_ptestz128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_ptestz128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_ARM64)
    {
        // Acceptable since LDC 1.8 -O2
        long2 s64 = vandq_s64(cast(long2)a, cast(long2)b);
        return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
    }
    else
    {
        __m128i c = a & b;
        int[4] zero = [0, 0, 0, 0];
        return c.array == zero;
    }
}
unittest
{
    __m128i A  = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
    __m128i M1 = _mm_setr_epi32(0xfe, 0xfd, 0x00, 0x07);
    __m128i M2 = _mm_setr_epi32(0x00, 0x00, 0x04, 0x00);
    assert(_mm_testz_si128(A, A) == 0);
    assert(_mm_testz_si128(A, M1) == 1);
    assert(_mm_testz_si128(A, M2) == 0);
}
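
// Illustrative example (added; not an upstream unittest): `_mm_testz_si128` can be read as a
// disjointness test, returning 1 when `a` and `b` share no set bit.
unittest
{
    __m128i evenBits = _mm_set1_epi8(0x55);
    __m128i oddBits  = _mm_set1_epi8(cast(byte)0xAA);
    assert(_mm_testz_si128(evenBits, oddBits) == 1);  // disjoint bit patterns
    assert(_mm_testz_si128(evenBits, evenBits) == 0); // shared bits
}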