/**
* MMX intrinsics.
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=MMX
*
* Copyright: Copyright Guillaume Piolat 2019-2020.
* License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.mmx;

public import inteli.types;
import inteli.internals;

import inteli.xmmintrin;
import inteli.emmintrin;

nothrow @nogc:

// Important: you don't need to call _mm_empty when using the "MMX" capabilities of
// intel-intrinsics, since it just generates the right IR; cleaning up the FPU registers
// is left to the codegen. intel-intrinsics only provides the semantics.
// Even GDC does not seem to use the mm0-mm7 registers, preferring xmm0-xmm7 instead.


/// Add packed 16-bit integers in `a` and `b`.
__m64 _mm_add_pi16 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64)(cast(short4)a + cast(short4)b);
}
unittest
{
    short4 R = cast(short4) _mm_add_pi16(_mm_set1_pi16(4), _mm_set1_pi16(3));
    short[4] correct = [7, 7, 7, 7];
    assert(R.array == correct);
}

/// Add packed 32-bit integers in `a` and `b`.
__m64 _mm_add_pi32 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64)(cast(int2)a + cast(int2)b);
}
unittest
{
    int2 R = cast(int2) _mm_add_pi32(_mm_set1_pi32(4), _mm_set1_pi32(3));
    int[2] correct = [7, 7];
    assert(R.array == correct);
}

/// Add packed 8-bit integers in `a` and `b`.
__m64 _mm_add_pi8 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64)(cast(byte8)a + cast(byte8)b);
}
unittest
{
    byte8 R = cast(byte8) _mm_add_pi8(_mm_set1_pi8(127), _mm_set1_pi8(-128));
    byte[8] correct = [-1, -1, -1, -1, -1, -1, -1, -1];
    assert(R.array == correct);
}

/// Add packed 16-bit integers in `a` and `b` using signed saturation.
// PERF: PADDSW not generated
__m64 _mm_adds_pi16(__m64 a, __m64 b) pure @trusted
{
    return to_m64(_mm_adds_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 res = cast(short4) _mm_adds_pi16(_mm_set_pi16(3, 2, 1, 0),
                                            _mm_set_pi16(3, 2, 1, 0));
    static immutable short[4] correctResult = [0, 2, 4, 6];
    assert(res.array == correctResult);
}

/// Add packed 8-bit integers in `a` and `b` using signed saturation.
// PERF: PADDSB not generated
__m64 _mm_adds_pi8(__m64 a, __m64 b) pure @trusted
{
    return to_m64(_mm_adds_epi8(to_m128i(a), to_m128i(b)));
}
unittest
{
    byte8 res = cast(byte8) _mm_adds_pi8(_mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0),
                                         _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[8] correctResult = [0, 2, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}

/// Add packed 16-bit integers in `a` and `b` using unsigned saturation.
// PERF: PADDUSW not generated
__m64 _mm_adds_pu16(__m64 a, __m64 b) pure @trusted
{
    return to_m64(_mm_adds_epu16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 res = cast(short4) _mm_adds_pu16(_mm_set_pi16(3, 2, cast(short)65535, 0),
                                            _mm_set_pi16(3, 2, 1, 0));
    static immutable short[4] correctResult = [0, cast(short)65535, 4, 6];
    assert(res.array == correctResult);
}
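// Illustrative sketch (not a test of a single intrinsic): contrasting wrapping
// _mm_add_pi16 with saturating _mm_adds_pi16 near the signed maximum.
unittest
{
    __m64 A = _mm_set1_pi16(32767);
    __m64 B = _mm_set1_pi16(1);
    short4 wrapped   = cast(short4) _mm_add_pi16(A, B);  // wraps around to -32768
    short4 saturated = cast(short4) _mm_adds_pi16(A, B); // clamps at 32767
    short[4] correctWrap = [-32768, -32768, -32768, -32768];
    short[4] correctSat  = [ 32767,  32767,  32767,  32767];
    assert(wrapped.array == correctWrap);
    assert(saturated.array == correctSat);
}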
/// Add packed 8-bit integers in `a` and `b` using unsigned saturation.
// PERF: PADDUSB not generated
__m64 _mm_adds_pu8(__m64 a, __m64 b) pure @trusted
{
    return to_m64(_mm_adds_epu8(to_m128i(a), to_m128i(b)));
}
unittest
{
    byte8 res = cast(byte8) _mm_adds_pu8(_mm_set_pi8(7, 6, 5, 4, 3, 2, cast(byte)255, 0),
                                         _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[8] correctResult = [0, cast(byte)255, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}

/// Compute the bitwise AND of 64 bits (representing integer data) in `a` and `b`.
__m64 _mm_and_si64 (__m64 a, __m64 b) pure @safe
{
    return a & b;
}
unittest
{
    __m64 A = [7];
    __m64 B = [14];
    __m64 R = _mm_and_si64(A, B);
    assert(R.array[0] == 6);
}

/// Compute the bitwise NOT of 64 bits (representing integer data) in `a`, and then AND with `b`.
__m64 _mm_andnot_si64 (__m64 a, __m64 b) pure @safe
{
    return (~a) & b;
}
unittest
{
    __m64 A = [7];
    __m64 B = [14];
    __m64 R = _mm_andnot_si64(A, B);
    assert(R.array[0] == 8);
}

/// Compare packed 16-bit integers in `a` and `b` for equality.
__m64 _mm_cmpeq_pi16 (__m64 a, __m64 b) pure @safe
{
    static if (SIMD_COMPARISON_MASKS_8B)
    {
        return cast(__m64)(cast(short4)a == cast(short4)b);
    }
    else static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpeqw(cast(short4)a, cast(short4)b);
    }
    else
    {
        return cast(__m64) equalMask!short4(cast(short4)a, cast(short4)b);
    }
}
unittest
{
    short4 A   = [-3, -2, -1, 0];
    short4 B   = [ 4,  3,  2, 1];
    short[4] E = [ 0,  0,  0, 0];
    short4 R = cast(short4)(_mm_cmpeq_pi16(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}

/// Compare packed 32-bit integers in `a` and `b` for equality.
__m64 _mm_cmpeq_pi32 (__m64 a, __m64 b) pure @safe
{
    static if (SIMD_COMPARISON_MASKS_8B)
    {
        return cast(__m64)(cast(int2)a == cast(int2)b);
    }
    else static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpeqd(cast(int2)a, cast(int2)b);
    }
    else
    {
        return cast(__m64) equalMask!int2(cast(int2)a, cast(int2)b);
    }
}
unittest
{
    int2 A   = [-3, -2];
    int2 B   = [ 4, -2];
    int[2] E = [ 0, -1];
    int2 R = cast(int2)(_mm_cmpeq_pi32(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}

/// Compare packed 8-bit integers in `a` and `b` for equality.
__m64 _mm_cmpeq_pi8 (__m64 a, __m64 b) pure @safe
{
    static if (SIMD_COMPARISON_MASKS_8B)
    {
        return cast(__m64)(cast(byte8)a == cast(byte8)b);
    }
    else static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpeqb(cast(ubyte8)a, cast(ubyte8)b);
    }
    else
    {
        return cast(__m64) equalMask!byte8(cast(byte8)a, cast(byte8)b);
    }
}
unittest
{
    __m64 A = _mm_setr_pi8(1, 2, 3, 1, 2, 1, 1, 2);
    __m64 B = _mm_setr_pi8(2, 2, 1, 2, 3, 1, 2, 3);
    byte8 C = cast(byte8) _mm_cmpeq_pi8(A, B);
    byte[8] correct = [0,-1, 0, 0, 0,-1, 0, 0];
    assert(C.array == correct);
}
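// Usage sketch: the comparison intrinsics return all-ones / all-zeros masks, which
// combine with the bitwise intrinsics into a branchless per-element maximum.
// _mm_cmpgt_pi16 is defined just below.
unittest
{
    __m64 a = _mm_setr_pi16(1, 9, -5, 4);
    __m64 b = _mm_setr_pi16(3, 2, -7, 4);
    __m64 mask = _mm_cmpgt_pi16(a, b);                   // -1 where a > b, else 0
    __m64 maxv = _mm_or_si64(_mm_and_si64(mask, a),      // take a where a > b
                             _mm_andnot_si64(mask, b));  // take b elsewhere
    short4 R = cast(short4) maxv;
    short[4] correct = [3, 9, -5, 4];
    assert(R.array == correct);
}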
/// Compare packed 16-bit integers in `a` and `b` for greater-than.
__m64 _mm_cmpgt_pi16 (__m64 a, __m64 b) pure @safe
{
    static if (SIMD_COMPARISON_MASKS_8B)
    {
        return cast(__m64)(cast(short4)a > cast(short4)b);
    }
    else static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpgtw(cast(short4)a, cast(short4)b);
    }
    else
    {
        return cast(__m64) greaterMask!short4(cast(short4)a, cast(short4)b);
    }
}
unittest
{
    short4 A   = [-3, -2, -1, 0];
    short4 B   = [ 4,  3,  2, 1];
    short[4] E = [ 0,  0,  0, 0];
    short4 R = cast(short4)(_mm_cmpgt_pi16(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}

/// Compare packed 32-bit integers in `a` and `b` for greater-than.
__m64 _mm_cmpgt_pi32 (__m64 a, __m64 b) pure @safe
{
    static if (SIMD_COMPARISON_MASKS_8B)
    {
        return cast(__m64)(cast(int2)a > cast(int2)b);
    }
    else static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpgtd(cast(int2)a, cast(int2)b);
    }
    else
    {
        return cast(__m64) greaterMask!int2(cast(int2)a, cast(int2)b);
    }
}
unittest
{
    int2 A   = [-3,  2];
    int2 B   = [ 4, -2];
    int[2] E = [ 0, -1];
    int2 R = cast(int2)(_mm_cmpgt_pi32(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}

/// Compare packed signed 8-bit integers in `a` and `b` for greater-than.
__m64 _mm_cmpgt_pi8 (__m64 a, __m64 b) pure @safe
{
    static if (SIMD_COMPARISON_MASKS_8B)
    {
        return cast(__m64)(cast(byte8)a > cast(byte8)b);
    }
    else static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpgtb(cast(ubyte8)a, cast(ubyte8)b);
    }
    else
    {
        return cast(__m64) greaterMask!byte8(cast(byte8)a, cast(byte8)b);
    }
}
unittest
{
    __m64 A = _mm_setr_pi8(1, 2, 3, 1, 2, 1, 1, 2);
    __m64 B = _mm_setr_pi8(2, 2, 1, 2, 3, 1, 2, 3);
    byte8 C = cast(byte8) _mm_cmpgt_pi8(A, B);
    byte[8] correct = [0, 0,-1, 0, 0, 0, 0, 0];
    assert(C.array == correct);
}

/// Copy 64-bit integer `a` to `dst`.
long _mm_cvtm64_si64 (__m64 a) pure @safe
{
    return a.array[0];
}
unittest
{
    __m64 A = _mm_setr_pi32(2, 1);
    assert(_mm_cvtm64_si64(A) == 0x100000002);
}

/// Copy 32-bit integer `a` to the lower element of `dst`, and zero the upper element.
__m64 _mm_cvtsi32_si64 (int a) pure @trusted
{
    __m64 r = void;
    r.ptr[0] = cast(uint)a; // zero-extend, so that bits 63:32 are cleared
    return r;
}
unittest
{
    __m64 R = _mm_cvtsi32_si64(-1);
    assert(R.array[0] == 0xffff_ffff);
}

/// Copy 64-bit integer `a` to `dst`.
__m64 _mm_cvtsi64_m64 (long a) pure @trusted
{
    __m64 r = void;
    r.ptr[0] = a;
    return r;
}
unittest
{
    __m64 R = _mm_cvtsi64_m64(0x123456789A);
    assert(R.array[0] == 0x123456789A);
}

/// Get the lower 32-bit integer in `a`.
int _mm_cvtsi64_si32 (__m64 a) pure @safe
{
    int2 r = cast(int2)a;
    return r.array[0];
}
unittest
{
    __m64 A = _mm_setr_pi32(-6, 5);
    int R = _mm_cvtsi64_si32(A);
    assert(R == -6);
}
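// Round-trip sketch: the scalar <-> __m64 conversions compose as expected,
// and only the low 32-bit lane survives _mm_cvtsi64_si32.
unittest
{
    long x = 0x0000_0042_dead_beef;
    assert(_mm_cvtm64_si64(_mm_cvtsi64_m64(x)) == x);
    assert(_mm_cvtsi64_si32(_mm_cvtsi64_m64(x)) == cast(int)0xdead_beef);
}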
/// Empty the MMX state, which marks the x87 FPU registers as available for
/// use by x87 instructions.
/// This instruction is supposed to be used at the end of all MMX technology procedures,
/// but this is useless when using `intel-intrinsics`, with all D compilers.
void _mm_empty() pure @safe
{
    // do nothing, see comment on top of file
}


deprecated alias _m_empty = _mm_empty; /// Deprecated intrinsics.
deprecated alias _m_from_int = _mm_cvtsi32_si64; ///ditto
deprecated alias _m_from_int64 = _mm_cvtsi64_m64; ///ditto

/// Multiply packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers.
/// Horizontally add adjacent pairs of intermediate 32-bit integers.
__m64 _mm_madd_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_madd_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 A = [-32768, -32768, 32767, 32767];
    short4 B = [-32768, -32768, 32767, 32767];
    int2 R = cast(int2) _mm_madd_pi16(cast(__m64)A, cast(__m64)B);
    int[2] correct = [-2147483648, 2*32767*32767];
    assert(R.array == correct);
}

/// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers,
/// and store the high 16 bits of the intermediate integers.
__m64 _mm_mulhi_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_mulhi_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    __m64 A = _mm_setr_pi16(4, 8, -16, 7);
    __m64 B = _mm_set1_pi16(16384);
    short4 R = cast(short4)_mm_mulhi_pi16(A, B);
    short[4] correct = [1, 2, -4, 1];
    assert(R.array == correct);
}

/// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers,
/// and store the low 16 bits of the intermediate integers.
__m64 _mm_mullo_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_mullo_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    __m64 A = _mm_setr_pi16(4, 1, 16, 7);
    __m64 B = _mm_set1_pi16(16384);
    short4 R = cast(short4)_mm_mullo_pi16(A, B);
    short[4] correct = [0, 16384, 0, -16384];
    assert(R.array == correct);
}

/// Compute the bitwise OR of 64 bits in `a` and `b`.
__m64 _mm_or_si64 (__m64 a, __m64 b) pure @safe
{
    return a | b;
}
unittest
{
    __m64 A = _mm_setr_pi16(255, 1, -1, 0);
    __m64 B = _mm_set1_pi16(15);
    short4 R = cast(short4)_mm_or_si64(A, B);
    short[4] correct = [255, 15, -1, 15];
    assert(R.array == correct);
}

/// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers using signed saturation.
__m64 _mm_packs_pi16 (__m64 a, __m64 b) pure @trusted
{
    int4 p = cast(int4) _mm_packs_epi16(to_m128i(a), to_m128i(b));
    int2 r;
    r.ptr[0] = p.array[0];
    r.ptr[1] = p.array[2];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi16(256, -129, 254, 0);
    byte8 R = cast(byte8) _mm_packs_pi16(A, A);
    byte[8] correct = [127, -128, 127, 0, 127, -128, 127, 0];
    assert(R.array == correct);
}

/// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers using signed saturation.
__m64 _mm_packs_pi32 (__m64 a, __m64 b) pure @trusted
{
    int4 p = cast(int4) _mm_packs_epi32(to_m128i(a), to_m128i(b));
    int2 r;
    r.ptr[0] = p.array[0];
    r.ptr[1] = p.array[2];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi32(100000, -100000);
    short4 R = cast(short4) _mm_packs_pi32(A, A);
    short[4] correct = [32767, -32768, 32767, -32768];
    assert(R.array == correct);
}
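// Illustrative sketch, assuming the little-endian lane order that intel-intrinsics
// targets: _mm_mullo_pi16 and _mm_mulhi_pi16 together recover exact 32-bit products,
// interleaved via _mm_unpacklo_pi16 / _mm_unpackhi_pi16 (defined later in this module).
unittest
{
    __m64 a = _mm_setr_pi16(1000, -2000,  3000, 4000);
    __m64 b = _mm_setr_pi16(7000,  6000, -5000, 4000);
    __m64 lo = _mm_mullo_pi16(a, b); // low 16 bits of each product
    __m64 hi = _mm_mulhi_pi16(a, b); // high 16 bits of each product
    int2 prodLow  = cast(int2) _mm_unpacklo_pi16(lo, hi);
    int2 prodHigh = cast(int2) _mm_unpackhi_pi16(lo, hi);
    int[2] correctLow  = [  7_000_000, -12_000_000];
    int[2] correctHigh = [-15_000_000,  16_000_000];
    assert(prodLow.array == correctLow);
    assert(prodHigh.array == correctHigh);
}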
/// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers using unsigned saturation.
__m64 _mm_packs_pu16 (__m64 a, __m64 b) pure @trusted
{
    int4 p = cast(int4) _mm_packus_epi16(to_m128i(a), to_m128i(b));
    int2 r;
    r.ptr[0] = p.array[0];
    r.ptr[1] = p.array[2];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi16(256, -129, 254, 0);
    byte8 R = cast(byte8) _mm_packs_pu16(A, A);
    ubyte[8] correct = [255, 0, 254, 0, 255, 0, 254, 0];
    assert(R.array == cast(byte[8])correct);
}

deprecated alias
    _m_packssdw = _mm_packs_pi32,     /// Deprecated intrinsics.
    _m_packsswb = _mm_packs_pi16,     ///ditto
    _m_packuswb = _mm_packs_pu16,     ///ditto
    _m_paddb = _mm_add_pi8,           ///ditto
    _m_paddd = _mm_add_pi32,          ///ditto
    _m_paddsb = _mm_adds_pi8,         ///ditto
    _m_paddsw = _mm_adds_pi16,        ///ditto
    _m_paddusb = _mm_adds_pu8,        ///ditto
    _m_paddusw = _mm_adds_pu16,       ///ditto
    _m_paddw = _mm_add_pi16,          ///ditto
    _m_pand = _mm_and_si64,           ///ditto
    _m_pandn = _mm_andnot_si64,       ///ditto
    _m_pcmpeqb = _mm_cmpeq_pi8,       ///ditto
    _m_pcmpeqd = _mm_cmpeq_pi32,      ///ditto
    _m_pcmpeqw = _mm_cmpeq_pi16,      ///ditto
    _m_pcmpgtb = _mm_cmpgt_pi8,       ///ditto
    _m_pcmpgtd = _mm_cmpgt_pi32,      ///ditto
    _m_pcmpgtw = _mm_cmpgt_pi16,      ///ditto
    _m_pmaddwd = _mm_madd_pi16,       ///ditto
    _m_pmulhw = _mm_mulhi_pi16,       ///ditto
    _m_pmullw = _mm_mullo_pi16,       ///ditto
    _m_por = _mm_or_si64,             ///ditto
    _m_pslld = _mm_sll_pi32,          ///ditto
    _m_pslldi = _mm_slli_pi32,        ///ditto
    _m_psllq = _mm_sll_si64,          ///ditto
    _m_psllqi = _mm_slli_si64,        ///ditto
    _m_psllw = _mm_sll_pi16,          ///ditto
    _m_psllwi = _mm_slli_pi16,        ///ditto
    _m_psrad = _mm_sra_pi32,          ///ditto
    _m_psradi = _mm_srai_pi32,        ///ditto
    _m_psraw = _mm_sra_pi16,          ///ditto
    _m_psrawi = _mm_srai_pi16,        ///ditto
    _m_psrld = _mm_srl_pi32,          ///ditto
    _m_psrldi = _mm_srli_pi32,        ///ditto
    _m_psrlq = _mm_srl_si64,          ///ditto
    _m_psrlqi = _mm_srli_si64,        ///ditto
    _m_psrlw = _mm_srl_pi16,          ///ditto
    _m_psrlwi = _mm_srli_pi16,        ///ditto
    _m_psubb = _mm_sub_pi8,           ///ditto
    _m_psubd = _mm_sub_pi32,          ///ditto
    _m_psubsb = _mm_subs_pi8,         ///ditto
    _m_psubsw = _mm_subs_pi16,        ///ditto
    _m_psubusb = _mm_subs_pu8,        ///ditto
    _m_psubusw = _mm_subs_pu16,       ///ditto
    _m_psubw = _mm_sub_pi16,          ///ditto
    _m_punpckhbw = _mm_unpackhi_pi8,  ///ditto
    _m_punpckhdq = _mm_unpackhi_pi32, ///ditto
    _m_punpckhwd = _mm_unpackhi_pi16, ///ditto
    _m_punpcklbw = _mm_unpacklo_pi8,  ///ditto
    _m_punpckldq = _mm_unpacklo_pi32, ///ditto
    _m_punpcklwd = _mm_unpacklo_pi16, ///ditto
    _m_pxor = _mm_xor_si64;           ///ditto

/// Set packed 16-bit integers with the supplied values.
__m64 _mm_set_pi16 (short e3, short e2, short e1, short e0) pure @trusted
{
    short[4] arr = [e0, e1, e2, e3];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    short4 R = cast(short4) _mm_set_pi16(3, 2, 1, 0);
    short[4] correct = [0, 1, 2, 3];
    assert(R.array == correct);
}

/// Set packed 32-bit integers with the supplied values.
__m64 _mm_set_pi32 (int e1, int e0) pure @trusted
{
    int[2] arr = [e0, e1];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    int2 R = cast(int2) _mm_set_pi32(1, 0);
    int[2] correct = [0, 1];
    assert(R.array == correct);
}
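// Ordering sketch: _mm_set_* takes arguments from the highest element down to the
// lowest, while _mm_setr_* (defined below) takes them in memory order, so these
// two calls build the same vector.
unittest
{
    short4 A = cast(short4) _mm_set_pi16(3, 2, 1, 0);
    short4 B = cast(short4) _mm_setr_pi16(0, 1, 2, 3);
    assert(A.array == B.array);
}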
/// Set packed 8-bit integers with the supplied values.
__m64 _mm_set_pi8 (byte e7, byte e6, byte e5, byte e4, byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    byte[8] arr = [e0, e1, e2, e3, e4, e5, e6, e7];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    byte8 R = cast(byte8) _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0);
    byte[8] correct = [0, 1, 2, 3, 4, 5, 6, 7];
    assert(R.array == correct);
}

/// Broadcast 16-bit integer `a` to all elements.
__m64 _mm_set1_pi16 (short a) pure @trusted
{
    return cast(__m64)(short4(a));
}
unittest
{
    short4 R = cast(short4) _mm_set1_pi16(44);
    short[4] correct = [44, 44, 44, 44];
    assert(R.array == correct);
}

/// Broadcast 32-bit integer `a` to all elements.
__m64 _mm_set1_pi32 (int a) pure @trusted
{
    return cast(__m64)(int2(a));
}
unittest
{
    int2 R = cast(int2) _mm_set1_pi32(43);
    int[2] correct = [43, 43];
    assert(R.array == correct);
}

/// Broadcast 8-bit integer `a` to all elements.
__m64 _mm_set1_pi8 (byte a) pure @trusted
{
    return cast(__m64)(byte8(a));
}
unittest
{
    byte8 R = cast(byte8) _mm_set1_pi8(42);
    byte[8] correct = [42, 42, 42, 42, 42, 42, 42, 42];
    assert(R.array == correct);
}

/// Set packed 16-bit integers with the supplied values in reverse order.
__m64 _mm_setr_pi16 (short e3, short e2, short e1, short e0) pure @trusted
{
    short[4] arr = [e3, e2, e1, e0];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    short4 R = cast(short4) _mm_setr_pi16(0, 1, 2, 3);
    short[4] correct = [0, 1, 2, 3];
    assert(R.array == correct);
}

/// Set packed 32-bit integers with the supplied values in reverse order.
__m64 _mm_setr_pi32 (int e1, int e0) pure @trusted
{
    int[2] arr = [e1, e0];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    int2 R = cast(int2) _mm_setr_pi32(0, 1);
    int[2] correct = [0, 1];
    assert(R.array == correct);
}

/// Set packed 8-bit integers with the supplied values in reverse order.
__m64 _mm_setr_pi8 (byte e7, byte e6, byte e5, byte e4, byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    byte[8] arr = [e7, e6, e5, e4, e3, e2, e1, e0];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    byte8 R = cast(byte8) _mm_setr_pi8(0, 1, 2, 3, 4, 5, 6, 7);
    byte[8] correct = [0, 1, 2, 3, 4, 5, 6, 7];
    assert(R.array == correct);
}

/// Return vector of type `__m64` with all elements set to zero.
__m64 _mm_setzero_si64 () pure @trusted
{
    __m64 r; // PERF =void;
    r.ptr[0] = 0;
    return r;
}
unittest
{
    __m64 R = _mm_setzero_si64();
    assert(R.array[0] == 0);
}

/// Shift packed 16-bit integers in `a` left by `bits` while shifting in zeros.
deprecated("Use _mm_slli_pi16 instead.") __m64 _mm_sll_pi16 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_sll_epi16(to_m128i(a), to_m128i(bits)));
}

/// Shift packed 32-bit integers in `a` left by `bits` while shifting in zeros.
deprecated("Use _mm_slli_pi32 instead.") __m64 _mm_sll_pi32 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_sll_epi32(to_m128i(a), to_m128i(bits)));
}
/// Shift 64-bit integer `a` left by `bits` while shifting in zeros.
deprecated("Use _mm_slli_si64 instead.") __m64 _mm_sll_si64 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_sll_epi64(to_m128i(a), to_m128i(bits)));
}

/// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros.
__m64 _mm_slli_pi16 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_slli_epi16(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_setr_pi16(-4, -5, 6, 7);
    short4 B = cast(short4)( _mm_slli_pi16(A, 1) );
    short[4] correct = [ -8, -10, 12, 14 ];
    assert(B.array == correct);
}

/// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros.
__m64 _mm_slli_pi32 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_slli_epi32(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_setr_pi32(-4, 5);
    int2 B = cast(int2)( _mm_slli_pi32(A, 1) );
    int[2] correct = [ -8, 10 ];
    assert(B.array == correct);
}

/// Shift 64-bit integer `a` left by `imm8` while shifting in zeros.
__m64 _mm_slli_si64 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_slli_epi64(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_cvtsi64_m64(-1);
    long1 R = cast(long1)( _mm_slli_si64(A, 1) );
    long[1] correct = [ -2 ];
    assert(R.array == correct);
}

/// Shift packed 16-bit integers in `a` right by `bits` while shifting in sign bits.
deprecated("Use _mm_srai_pi16 instead.") __m64 _mm_sra_pi16 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_sra_epi16(to_m128i(a), to_m128i(bits)));
}

/// Shift packed 32-bit integers in `a` right by `bits` while shifting in sign bits.
deprecated("Use _mm_srai_pi32 instead.") __m64 _mm_sra_pi32 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_sra_epi32(to_m128i(a), to_m128i(bits)));
}

/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits.
__m64 _mm_srai_pi16 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_srai_epi16(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_setr_pi16(-4, -5, 6, 7);
    short4 B = cast(short4)( _mm_srai_pi16(A, 1) );
    short[4] correct = [ -2, -3, 3, 3 ];
    assert(B.array == correct);
}

/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits.
__m64 _mm_srai_pi32 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_srai_epi32(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_setr_pi32(-4, 5);
    int2 B = cast(int2)( _mm_srai_pi32(A, 1) );
    int[2] correct = [ -2, 2 ];
    assert(B.array == correct);
}

/// Shift packed 16-bit integers in `a` right by `bits` while shifting in zeros.
deprecated("Use _mm_srli_pi16 instead.") __m64 _mm_srl_pi16 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_srl_epi16(to_m128i(a), to_m128i(bits)));
}

/// Shift packed 32-bit integers in `a` right by `bits` while shifting in zeros.
deprecated("Use _mm_srli_pi32 instead.") __m64 _mm_srl_pi32 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_srl_epi32(to_m128i(a), to_m128i(bits)));
}

/// Shift 64-bit integer `a` right by `bits` while shifting in zeros.
deprecated("Use _mm_srli_si64 instead.") __m64 _mm_srl_si64 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_srl_epi64(to_m128i(a), to_m128i(bits)));
}
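// Contrast sketch: arithmetic right shift keeps the sign bit, logical right shift
// fills with zeros. _mm_srli_pi16 is defined just below.
unittest
{
    __m64 A = _mm_set1_pi16(-2);
    short4 sra = cast(short4) _mm_srai_pi16(A, 1); // sign-extends: -2 >> 1 == -1
    short4 srl = cast(short4) _mm_srli_pi16(A, 1); // zero-fills: 0xfffe >>> 1 == 0x7fff
    short[4] correctA = [-1, -1, -1, -1];
    short[4] correctL = [0x7fff, 0x7fff, 0x7fff, 0x7fff];
    assert(sra.array == correctA);
    assert(srl.array == correctL);
}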
/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros.
__m64 _mm_srli_pi16 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_srli_epi16(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_setr_pi16(-4, -5, 6, 7);
    short4 B = cast(short4)( _mm_srli_pi16(A, 1) );
    short[4] correct = [ 0x7ffe, 0x7ffd, 3, 3 ];
    assert(B.array == correct);
}

/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros.
__m64 _mm_srli_pi32 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_srli_epi32(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_setr_pi32(-4, 5);
    int2 B = cast(int2)( _mm_srli_pi32(A, 1) );
    int[2] correct = [ 0x7ffffffe, 2 ];
    assert(B.array == correct);
}

/// Shift 64-bit integer `a` right by `imm8` while shifting in zeros.
__m64 _mm_srli_si64 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_srli_epi64(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_cvtsi64_m64(-1);
    long1 R = cast(long1)( _mm_srli_si64(A, 1) );
    long[1] correct = [ 0x7fff_ffff_ffff_ffff ];
    assert(R.array == correct);
}

/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`.
__m64 _mm_sub_pi16 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64)(cast(short4)a - cast(short4)b);
}
unittest
{
    short4 R = cast(short4) _mm_sub_pi16(_mm_setr_pi16(cast(short)65534, 1, 5, -32768),
                                         _mm_setr_pi16(cast(short)65535, 16, 4, 4));
    static immutable short[4] correct = [ -1,-15, 1, 32764];
    assert(R.array == correct);
}

/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
__m64 _mm_sub_pi32 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64)(cast(int2)a - cast(int2)b);
}
unittest
{
    int2 R = cast(int2) _mm_sub_pi32(_mm_setr_pi32( 10, 4),
                                     _mm_setr_pi32( 15, -70));
    static immutable int[2] correct = [ -5, 74];
    assert(R.array == correct);
}

/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`.
__m64 _mm_sub_pi8 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64)(cast(byte8)a - cast(byte8)b);
}
unittest
{
    byte8 R = cast(byte8) _mm_sub_pi8(_mm_setr_pi8(cast(byte)254, 127, 13, 12, 11, 10, 9, -128),
                                      _mm_setr_pi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8));
    static immutable byte[8] correct = [ -1, 7, -1,-30, 0, 0, 0, 120 ];
    assert(R.array == correct);
}

/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` using saturation.
__m64 _mm_subs_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_subs_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 R = cast(short4) _mm_subs_pi16(_mm_setr_pi16(cast(short)65534, 1, 5, -32768),
                                          _mm_setr_pi16(cast(short)65535, 16, 4, 4));
    static immutable short[4] correct = [ -1,-15, 1, -32768];
    assert(R.array == correct);
}
/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a` using saturation.
__m64 _mm_subs_pi8 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_subs_epi8(to_m128i(a), to_m128i(b)));
}
unittest
{
    byte8 R = cast(byte8) _mm_subs_pi8(_mm_setr_pi8(cast(byte)254, 127, 13, 12, 11, 10, 9, -128),
                                       _mm_setr_pi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8));
    static immutable byte[8] correct = [ -1, 7, -1,-30, 0, 0, 0, -128 ];
    assert(R.array == correct);
}

/// Subtract packed unsigned 16-bit integers in `b` from packed unsigned 16-bit integers in `a`
/// using saturation.
__m64 _mm_subs_pu16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_subs_epu16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 R = cast(short4) _mm_subs_pu16(_mm_setr_pi16(cast(short)65534, 1, 5, 4),
                                          _mm_setr_pi16(cast(short)65535, 16, 4, 4));
    static immutable short[4] correct = [ 0, 0, 1, 0];
    assert(R.array == correct);
}

/// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit integers in `a`
/// using saturation.
__m64 _mm_subs_pu8 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_subs_epu8(to_m128i(a), to_m128i(b)));
}
unittest
{
    byte8 R = cast(byte8) _mm_subs_pu8(_mm_setr_pi8(cast(byte)254, 127, 13, 12, 11, 10, 9, 8),
                                       _mm_setr_pi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8));
    static immutable byte[8] correct = [ 0, 7, 0, 0, 0, 0, 0, 0 ];
    assert(R.array == correct);
}

deprecated alias _m_to_int = _mm_cvtsi64_si32; /// Deprecated intrinsics.
deprecated alias _m_to_int64 = _mm_cvtm64_si64; ///ditto

/// Unpack and interleave 16-bit integers from the high half of `a` and `b`.
__m64 _mm_unpackhi_pi16 (__m64 a, __m64 b) pure @trusted
{
    static if (LDC_with_optimizations)
    {
        enum ir = `%r = shufflevector <4 x i16> %0, <4 x i16> %1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
                   ret <4 x i16> %r`;
        return cast(__m64) LDCInlineIR!(ir, short4, short4, short4)(cast(short4)a, cast(short4)b);
    }
    else
    {
        short4 ia = cast(short4)a;
        short4 ib = cast(short4)b;
        short4 r;
        r.ptr[0] = ia.array[2];
        r.ptr[1] = ib.array[2];
        r.ptr[2] = ia.array[3];
        r.ptr[3] = ib.array[3];
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi16(4, 8, -16, 7);
    __m64 B = _mm_setr_pi16(5, 9, -3, 10);
    short4 R = cast(short4) _mm_unpackhi_pi16(A, B);
    short[4] correct = [-16, -3, 7, 10];
    assert(R.array == correct);
}

/// Unpack and interleave 32-bit integers from the high half of `a` and `b`.
__m64 _mm_unpackhi_pi32 (__m64 a, __m64 b) pure @trusted
{
    // Generates punpckldq as far back as LDC 1.0.0 -O1
    // (Yes, LLVM does generate punpckldq to reuse SSE2 instructions)
    int2 ia = cast(int2)a;
    int2 ib = cast(int2)b;
    int2 r;
    r.ptr[0] = ia.array[1];
    r.ptr[1] = ib.array[1];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi32(4, 8);
    __m64 B = _mm_setr_pi32(5, 9);
    int2 R = cast(int2) _mm_unpackhi_pi32(A, B);
    int[2] correct = [8, 9];
    assert(R.array == correct);
}
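// A classic MMX idiom as a sketch: the absolute difference of unsigned bytes is
// the OR of the two saturating subtractions, since one of them always clamps to zero.
unittest
{
    __m64 a = _mm_setr_pi8(10, cast(byte)250, 0, 77, 1, 2, 3, 4);
    __m64 b = _mm_setr_pi8(13, cast(byte)200, 5, 77, 0, 0, 0, 0);
    byte8 diff = cast(byte8) _mm_or_si64(_mm_subs_pu8(a, b), _mm_subs_pu8(b, a));
    byte[8] correct = [3, 50, 5, 0, 1, 2, 3, 4];
    assert(diff.array == correct);
}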
/// Unpack and interleave 8-bit integers from the high half of `a` and `b`.
__m64 _mm_unpackhi_pi8 (__m64 a, __m64 b) pure @trusted
{
    static if (LDC_with_optimizations)
    {
        enum ir = `%r = shufflevector <8 x i8> %0, <8 x i8> %1, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
                   ret <8 x i8> %r`;
        return cast(__m64) LDCInlineIR!(ir, byte8, byte8, byte8)(cast(byte8)a, cast(byte8)b);
    }
    else
    {
        byte8 ia = cast(byte8)a;
        byte8 ib = cast(byte8)b;
        byte8 r;
        r.ptr[0] = ia.array[4];
        r.ptr[1] = ib.array[4];
        r.ptr[2] = ia.array[5];
        r.ptr[3] = ib.array[5];
        r.ptr[4] = ia.array[6];
        r.ptr[5] = ib.array[6];
        r.ptr[6] = ia.array[7];
        r.ptr[7] = ib.array[7];
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi8( 1,  2,  3,  4,  5,  6,  7,  8);
    __m64 B = _mm_setr_pi8(-1, -2, -3, -4, -5, -6, -7, -8);
    byte8 R = cast(byte8) _mm_unpackhi_pi8(A, B);
    byte[8] correct = [5, -5, 6, -6, 7, -7, 8, -8];
    assert(R.array == correct);
}

/// Unpack and interleave 16-bit integers from the low half of `a` and `b`.
__m64 _mm_unpacklo_pi16 (__m64 a, __m64 b) pure @trusted
{
    // Generates punpcklwd since LDC 1.0.0 -O1
    short4 ia = cast(short4)a;
    short4 ib = cast(short4)b;
    short4 r;
    r.ptr[0] = ia.array[0];
    r.ptr[1] = ib.array[0];
    r.ptr[2] = ia.array[1];
    r.ptr[3] = ib.array[1];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi16(4, 8, -16, 7);
    __m64 B = _mm_setr_pi16(5, 9, -3, 10);
    short4 R = cast(short4) _mm_unpacklo_pi16(A, B);
    short[4] correct = [4, 5, 8, 9];
    assert(R.array == correct);
}

/// Unpack and interleave 32-bit integers from the low half of `a` and `b`.
__m64 _mm_unpacklo_pi32 (__m64 a, __m64 b) pure @trusted
{
    // x86: Generates punpckldq as far back as LDC 1.0.0 -O1
    // ARM: Generates zip as far back as LDC 1.8.0 -O1
    int2 ia = cast(int2)a;
    int2 ib = cast(int2)b;
    int2 r;
    r.ptr[0] = ia.array[0];
    r.ptr[1] = ib.array[0];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi32(4, 8);
    __m64 B = _mm_setr_pi32(5, 9);
    int2 R = cast(int2) _mm_unpacklo_pi32(A, B);
    int[2] correct = [4, 5];
    assert(R.array == correct);
}

/// Unpack and interleave 8-bit integers from the low half of `a` and `b`.
__m64 _mm_unpacklo_pi8 (__m64 a, __m64 b) pure @trusted
{
    static if (LDC_with_optimizations)
    {
        enum ir = `%r = shufflevector <8 x i8> %0, <8 x i8> %1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
                   ret <8 x i8> %r`;
        return cast(__m64) LDCInlineIR!(ir, byte8, byte8, byte8)(cast(byte8)a, cast(byte8)b);
    }
    else
    {
        byte8 ia = cast(byte8)a;
        byte8 ib = cast(byte8)b;
        byte8 r;
        r.ptr[0] = ia.array[0];
        r.ptr[1] = ib.array[0];
        r.ptr[2] = ia.array[1];
        r.ptr[3] = ib.array[1];
        r.ptr[4] = ia.array[2];
        r.ptr[5] = ib.array[2];
        r.ptr[6] = ia.array[3];
        r.ptr[7] = ib.array[3];
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi8( 1,  2,  3,  4,  5,  6,  7,  8);
    __m64 B = _mm_setr_pi8(-1, -2, -3, -4, -5, -6, -7, -8);
    byte8 R = cast(byte8) _mm_unpacklo_pi8(A, B);
    byte[8] correct = [1, -1, 2, -2, 3, -3, 4, -4];
    assert(R.array == correct);
}
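// Widening sketch, assuming little-endian lane order: interleaving with a zero
// vector zero-extends unsigned bytes to 16-bit lanes, the usual MMX widening idiom.
unittest
{
    __m64 A = _mm_setr_pi8(1, 2, 3, 4, cast(byte)250, 6, 7, 8);
    __m64 Z = _mm_setzero_si64();
    short4 lo = cast(short4) _mm_unpacklo_pi8(A, Z);
    short4 hi = cast(short4) _mm_unpackhi_pi8(A, Z);
    short[4] correctLo = [1, 2, 3, 4];
    short[4] correctHi = [250, 6, 7, 8];
    assert(lo.array == correctLo);
    assert(hi.array == correctHi);
}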
/// Compute the bitwise XOR of 64 bits (representing integer data) in `a` and `b`.
__m64 _mm_xor_si64 (__m64 a, __m64 b) pure @safe
{
    return a ^ b;
}
unittest
{
    __m64 A = _mm_setr_pi16(255, 1, -1, 0);
    __m64 B = _mm_set1_pi16(15);
    short4 R = cast(short4)_mm_xor_si64(A, B);
    short[4] correct = [240, 14, -16, 15];
    assert(R.array == correct);
}
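// Capstone sketch: a 4-element signed dot product with _mm_madd_pi16, folding the
// two 32-bit lanes together with a 32-bit shift and add.
unittest
{
    __m64 a = _mm_setr_pi16(1, 2, 3, 4);
    __m64 b = _mm_setr_pi16(5, 6, 7, 8);
    __m64 p = _mm_madd_pi16(a, b);                    // [1*5 + 2*6, 3*7 + 4*8]
    __m64 s = _mm_add_pi32(p, _mm_srli_si64(p, 32));  // fold high lane onto low lane
    assert(_mm_cvtsi64_si32(s) == 1*5 + 2*6 + 3*7 + 4*8); // 70
}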