1 /** 2 * AVX2 intrinsics. 3 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=AVX2 4 * 5 * Copyright: Guillaume Piolat 2022. 6 * Johan Engelen 2022. 7 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) 8 */ 9 module inteli.avx2intrin; 10 11 // AVX2 instructions 12 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=AVX2 13 // Note: this header will work whether you have AVX2 enabled or not. 14 // With LDC, use "dflags-ldc": ["-mattr=+avx2"] or equivalent to actively 15 // generate AVX2 instructions. 16 // With GDC, use "dflags-gdc": ["-mavx2"] or equivalent to actively 17 // generate AVX2 instructions. 18 19 20 // Note: many special cases for GDC, because when suporting SIMD_COMPARISON_MASKS_32B but not having AVX2, 21 // the replaced operators have terrible performance. Mostly a problem for -mavx on x86 22 23 public import inteli.types; 24 import inteli.internals; 25 26 // Pull in all previous instruction set intrinsics. 27 public import inteli.avxintrin; 28 29 nothrow @nogc: 30 31 /// Compute the absolute value of packed signed 16-bit integers in `a`. 32 __m256i _mm256_abs_epi16 (__m256i a) @trusted 33 { 34 // PERF DMD 35 version(LDC) 36 enum split = true; // akways beneficial in LDC neon, ssse3, or even sse2 37 else 38 enum split = GDC_with_SSSE3; 39 40 static if (GDC_with_AVX2) 41 { 42 return cast(__m256i) __builtin_ia32_pabsw256(cast(short16)a); 43 } 44 else static if (__VERSION__ >= 2097 && LDC_with_AVX2) 45 { 46 // Before LDC 1.27 llvm.abs LLVM intrinsic didn't exist, and hence 47 // no good way to do abs(256-bit) 48 return cast(__m256i) inteli_llvm_abs!short16(cast(short16)a, false); 49 } 50 else static if (split) 51 { 52 __m128i a_lo = _mm256_extractf128_si256!0(a); 53 __m128i a_hi = _mm256_extractf128_si256!1(a); 54 __m128i r_lo = _mm_abs_epi16(a_lo); 55 __m128i r_hi = _mm_abs_epi16(a_hi); 56 return _mm256_set_m128i(r_hi, r_lo); 57 } 58 else 59 { 60 short16 sa = cast(short16)a; 61 for (int i = 0; i < 16; ++i) 62 { 63 short s = sa.array[i]; 64 sa.ptr[i] = s >= 0 ? s : cast(short)(-cast(int)(s)); 65 } 66 return cast(__m256i)sa; 67 } 68 } 69 unittest 70 { 71 __m256i A = _mm256_setr_epi16(0, -1, -32768, 32767, 10, -10, 1000, -1000, 72 1, -1, -32768, 32767, 12, -13, 1000, -1040); 73 short16 B = cast(short16) _mm256_abs_epi16(A); 74 short[16] correct = [0, 1, -32768, 32767, 10, 10, 1000, 1000, 75 1, 1, -32768, 32767, 12, 13, 1000, 1040]; 76 assert(B.array == correct); 77 } 78 79 /// Compute the absolute value of packed signed 32-bit integers in `a`. 80 __m256i _mm256_abs_epi32 (__m256i a) @trusted 81 { 82 // PERF DMD 83 version(LDC) 84 enum split = true; // always beneficial in LDC neon, ssse3, or even sse2 85 else 86 enum split = false; // GDC manages to split and use pabsd in SSSE3 without guidance 87 88 static if (GDC_with_AVX2) 89 { 90 return cast(__m256i) __builtin_ia32_pabsd256(cast(int8)a); 91 } 92 else static if (__VERSION__ >= 2097 && LDC_with_AVX2) 93 { 94 // Before LDC 1.27 llvm.abs LLVM intrinsic didn't exist, and hence 95 // no good way to do abs(256-bit) 96 return cast(__m256i) inteli_llvm_abs!int8(cast(int8)a, false); 97 } 98 else static if (split) 99 { 100 __m128i a_lo = _mm256_extractf128_si256!0(a); 101 __m128i a_hi = _mm256_extractf128_si256!1(a); 102 __m128i r_lo = _mm_abs_epi32(a_lo); 103 __m128i r_hi = _mm_abs_epi32(a_hi); 104 return _mm256_set_m128i(r_hi, r_lo); 105 } 106 else 107 { 108 int8 sa = cast(int8)a; 109 for (int i = 0; i < 8; ++i) 110 { 111 int s = sa.array[i]; 112 sa.ptr[i] = (s >= 0 ? s : -s); 113 } 114 return cast(__m256i)sa; 115 } 116 } 117 unittest 118 { 119 __m256i A = _mm256_setr_epi32(0, -1, -2_147_483_648, -2_147_483_647, -1, 0, -2_147_483_648, -2_147_483_646); 120 int8 B = cast(int8) _mm256_abs_epi32(A); 121 int[8] correct = [0, 1, -2_147_483_648, 2_147_483_647, 1, 0, -2_147_483_648, 2_147_483_646]; 122 assert(B.array == correct); 123 } 124 125 /// Compute the absolute value of packed signed 8-bit integers in `a`. 126 __m256i _mm256_abs_epi8 (__m256i a) @trusted 127 { 128 // PERF DMD 129 // PERF GDC in SSSE3 to AVX doesn't use pabsb and split is catastrophic because of _mm_min_epu8 130 version(LDC) 131 enum split = true; // akways beneficial in LDC neon, ssse3, sse2 132 else 133 enum split = false; 134 135 static if (GDC_with_AVX2) 136 { 137 return cast(__m256i) __builtin_ia32_pabsb256(cast(ubyte32)a); 138 } 139 else static if (__VERSION__ >= 2097 && LDC_with_AVX2) 140 { 141 // Before LDC 1.27 llvm.abs LLVM intrinsic didn't exist, and hence 142 // no good way to do abs(256-bit) 143 return cast(__m256i) inteli_llvm_abs!byte32(cast(byte32)a, false); 144 } 145 else static if (split) 146 { 147 __m128i a_lo = _mm256_extractf128_si256!0(a); 148 __m128i a_hi = _mm256_extractf128_si256!1(a); 149 __m128i r_lo = _mm_abs_epi8(a_lo); 150 __m128i r_hi = _mm_abs_epi8(a_hi); 151 return _mm256_set_m128i(r_hi, r_lo); 152 } 153 else 154 { 155 // Basically this loop is poison for LDC optimizer 156 byte32 sa = cast(byte32)a; 157 for (int i = 0; i < 32; ++i) 158 { 159 byte s = sa.array[i]; 160 sa.ptr[i] = s >= 0 ? s : cast(byte)(-cast(int)(s)); 161 } 162 return cast(__m256i)sa; 163 } 164 } 165 unittest 166 { 167 __m256i A = _mm256_setr_epi8(0, -1, -128, -127, 127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 168 0, -1, -128, -126, 127, -6, -5, -4, -3, -2, 0, 1, 2, 3, 4, 5); 169 byte32 B = cast(byte32) _mm256_abs_epi8(A); 170 byte[32] correct = [0, 1, -128, 127, 127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 171 0, 1, -128, 126, 127, 6, 5, 4, 3, 2, 0, 1, 2, 3, 4, 5]; 172 assert(B.array == correct); 173 } 174 175 /// Add packed 16-bit integers in `a` and `b`. 176 __m256i _mm256_add_epi16 (__m256i a, __m256i b) pure @safe 177 { 178 pragma(inline, true); 179 return cast(__m256i)(cast(short16)a + cast(short16)b); 180 } 181 unittest 182 { 183 __m256i A = _mm256_setr_epi16( -7, -1, 0, 9, -100, 100, 234, 432, -32768, 32767, 0, -1, -20000, 0, 6, -2); 184 short16 R = cast(short16) _mm256_add_epi16(A, A); 185 short[16] correct = [ -14, -2, 0, 18, -200, 200, 468, 864, 0, -2, 0, -2, 25536, 0, 12, -4 ]; 186 assert(R.array == correct); 187 } 188 189 /// Add packed 32-bit integers in `a` and `b`. 190 __m256i _mm256_add_epi32(__m256i a, __m256i b) pure @safe 191 { 192 pragma(inline, true); 193 return cast(__m256i)(cast(int8)a + cast(int8)b); 194 } 195 unittest 196 { 197 __m256i A = _mm256_setr_epi32( -7, -1, 0, 9, -100, 100, 234, 432); 198 int8 R = cast(int8) _mm256_add_epi32(A, A); 199 int[8] correct = [ -14, -2, 0, 18, -200, 200, 468, 864 ]; 200 assert(R.array == correct); 201 } 202 203 /// Add packed 64-bit integers in `a` and `b`. 204 __m256i _mm256_add_epi64 (__m256i a, __m256i b) pure @safe 205 { 206 pragma(inline, true); 207 return a + b; 208 } 209 unittest 210 { 211 __m256i A = _mm256_setr_epi64(-1, 0x8000_0000_0000_0000, 42, -12); 212 long4 R = cast(__m256i) _mm256_add_epi64(A, A); 213 long[4] correct = [ -2, 0, 84, -24 ]; 214 assert(R.array == correct); 215 } 216 217 /// Add packed 8-bit integers in `a` and `b`. 218 __m256i _mm256_add_epi8 (__m256i a, __m256i b) pure @safe 219 { 220 pragma(inline, true); 221 return cast(__m256i)(cast(byte32)a + cast(byte32)b); 222 } 223 unittest 224 { 225 __m256i A = _mm256_setr_epi8(4, 8, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -1, 0, 9, 78, 226 4, 9, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -2, 0, 10, 78); 227 byte32 R = cast(byte32) _mm256_add_epi8(A, A); 228 byte[32] correct = [8, 16, 26, -14, -2, 0, 18, -102, 8, 16, 26, -14, -2, 0, 18, -100, 229 8, 18, 26, -14, -2, 0, 18, -102, 8, 16, 26, -14, -4, 0, 20, -100]; 230 assert(R.array == correct); 231 } 232 233 /// Add packed 16-bit signed integers in `a` and `b` using signed saturation. 234 __m256i _mm256_adds_epi16 (__m256i a, __m256i b) pure @trusted 235 { 236 // PERF DMD 237 static if (GDC_with_AVX2) 238 { 239 return cast(__m256i) __builtin_ia32_paddsw256(cast(short16)a, cast(short16)b); 240 } 241 else static if(LDC_with_saturated_intrinsics) 242 { 243 return cast(__m256i) inteli_llvm_adds!short16(cast(short16)a, cast(short16)b); 244 } 245 else 246 { 247 short16 r; 248 short16 sa = cast(short16)a; 249 short16 sb = cast(short16)b; 250 foreach(i; 0..16) 251 r.ptr[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]); 252 return cast(__m256i)r; 253 } 254 } 255 unittest 256 { 257 short16 res = cast(short16) _mm256_adds_epi16(_mm256_setr_epi16( 7, 6, 5, -32768, 3, 3, 32767, 0, 7, 6, 5, -32768, 3, 3, 32767, 0), 258 _mm256_setr_epi16( 7, 6, 5, -30000, 3, 1, 1, -10, 7, 6, 5, -30000, 3, 1, 1, -10)); 259 static immutable short[16] correctResult = [14, 12, 10, -32768, 6, 4, 32767, -10, 14, 12, 10, -32768, 6, 4, 32767, -10]; 260 assert(res.array == correctResult); 261 } 262 263 /// Add packed 8-bit signed integers in `a` and `b` using signed saturation. 264 __m256i _mm256_adds_epi8 (__m256i a, __m256i b) pure @trusted 265 { 266 // PERF DMD 267 static if (GDC_with_AVX2) 268 { 269 return cast(__m256i) __builtin_ia32_paddsb256(cast(ubyte32)a, cast(ubyte32)b); 270 } 271 else static if(LDC_with_saturated_intrinsics) 272 { 273 return cast(__m256i) inteli_llvm_adds!byte32(cast(byte32)a, cast(byte32)b); 274 } 275 else 276 { 277 byte32 r; 278 byte32 sa = cast(byte32)a; 279 byte32 sb = cast(byte32)b; 280 foreach(i; 0..32) 281 r.ptr[i] = saturateSignedWordToSignedByte(sa.array[i] + sb.array[i]); 282 return cast(__m256i)r; 283 } 284 } 285 unittest 286 { 287 byte32 res = cast(byte32) _mm256_adds_epi8(_mm256_setr_epi8(15, 14, 13, 12, 11, 127, 9, 8, 7, 6, 5, -128, 3, 2, 1, 0, 15, 14, 13, 12, 11, 127, 9, 8, 7, 6, 5, -128, 3, 2, 1, 0), 288 _mm256_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, -4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, -4, 3, 2, 1, 0)); 289 static immutable byte[32] correctResult = [30, 28, 26, 24, 22, 127,18,16,14,12,10, -128, 6, 4, 2, 0, 30, 28, 26, 24, 22, 127,18,16,14,12,10, -128, 6, 4, 2, 0]; 290 assert(res.array == correctResult); 291 } 292 293 /// Add packed 16-bit unsigned integers in `a` and `b` using unsigned saturation. 294 __m256i _mm256_adds_epu16 (__m256i a, __m256i b) pure @trusted 295 { 296 // PERF DMD 297 static if (GDC_with_AVX2) 298 { 299 return cast(__m256i) __builtin_ia32_paddusw256(cast(short16)a, cast(short16)b); 300 } 301 else static if(LDC_with_saturated_intrinsics) 302 { 303 return cast(__m256i) inteli_llvm_addus!short16(cast(short16)a, cast(short16)b); 304 } 305 else 306 { 307 short16 r; 308 short16 sa = cast(short16)a; 309 short16 sb = cast(short16)b; 310 foreach(i; 0..16) 311 r.ptr[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i])); 312 return cast(__m256i)r; 313 } 314 } 315 unittest 316 { 317 short16 res = cast(short16) _mm256_adds_epu16(_mm256_set_epi16(3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0), 318 _mm256_set_epi16(3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0)); 319 static immutable short[16] correctResult = [0, cast(short)65535, 4, 6, 0, cast(short)65535, 4, 6, 0, cast(short)65535, 4, 6, 0, cast(short)65535, 4, 6]; 320 assert(res.array == correctResult); 321 } 322 323 /// Add packed 8-bit unsigned integers in `a` and `b` using unsigned saturation. 324 __m256i _mm256_adds_epu8 (__m256i a, __m256i b) pure @trusted 325 { 326 // PERF DMD 327 static if (GDC_with_AVX2) 328 { 329 return cast(__m256i) __builtin_ia32_paddusb256(cast(ubyte32)a, cast(ubyte32)b); 330 } 331 else static if(LDC_with_saturated_intrinsics) 332 { 333 return cast(__m256i) inteli_llvm_addus!byte32(cast(byte32)a, cast(byte32)b); 334 } 335 else 336 { 337 byte32 r; 338 byte32 sa = cast(byte32)a; 339 byte32 sb = cast(byte32)b; 340 foreach(i; 0..32) 341 r.ptr[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i])); 342 return cast(__m256i)r; 343 } 344 } 345 unittest 346 { 347 __m256i A = _mm256_setr_epi8(0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, cast(byte)255, 0, 0, 0, 0, 0, 0, 0, 0, cast(byte)136, 0, 0, 0, cast(byte)136, 0, 0, 0, 0, 0, 0); 348 __m256i B = _mm256_setr_epi8(0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, cast(byte)136, 0, 0, 0, 40, 0, 0, 0, 0, 0, 0); 349 byte32 R = cast(byte32) _mm256_adds_epu8(A, B); 350 static immutable byte[32] correct = [0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, cast(byte)255, 0, 0, 0, 0, 0, 0, 0, 0, cast(byte)255, 0, 0, 0, cast(byte)176, 0, 0, 0, 0, 0, 0]; 351 assert(R.array == correct); 352 } 353 354 /// Concatenate pairs of 16-byte blocks in `a` and `b` into a 32-byte temporary result, shift the 355 /// result right by `imm8` bytes, and return the low 16 bytes of that in each lane. 356 __m256i _mm256_alignr_epi8(ubyte count)(__m256i a, __m256i b) pure @trusted 357 { 358 359 // PERF DMD 360 static if (GDC_with_AVX2) 361 { 362 return cast(__m256i)__builtin_ia32_palignr256(a, b, count * 8); 363 } 364 else 365 { 366 // Note that palignr 256-bit does the same as palignr 128-bit by lane. Can split. 367 // With LDC 1.24 + avx2 feature + -02, that correctly gives a AVX2 vpalignr despite being split. 368 // I guess we could do it with a big 32-items shufflevector but not sure if best. 369 // 2 inst on ARM64 neon, which is optimal. 370 __m128i a_lo = _mm256_extractf128_si256!0(a); 371 __m128i a_hi = _mm256_extractf128_si256!1(a); 372 __m128i b_lo = _mm256_extractf128_si256!0(b); 373 __m128i b_hi = _mm256_extractf128_si256!1(b); 374 __m128i r_lo = _mm_alignr_epi8!count(a_lo, b_lo); 375 __m128i r_hi = _mm_alignr_epi8!count(a_hi, b_hi); 376 return _mm256_set_m128i(r_hi, r_lo); 377 } 378 } 379 unittest 380 { 381 __m128i A = _mm_setr_epi8( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); 382 __m128i B = _mm_setr_epi8(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32); 383 __m256i AA = _mm256_set_m128i(A, A); 384 __m256i BB = _mm256_set_m128i(B, B); 385 386 { 387 byte32 C = cast(byte32) _mm256_alignr_epi8!0(AA, BB); 388 byte[32] correct = [17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]; 389 assert(C.array == correct); 390 } 391 { 392 byte32 C = cast(byte32) _mm256_alignr_epi8!20(AA, BB); 393 byte[32] correct = [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0, 0, 0, 0, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0, 0, 0, 0]; 394 assert(C.array == correct); 395 } 396 { 397 byte32 C = cast(byte32) _mm256_alignr_epi8!34(AA, BB); 398 byte[32] correct = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; 399 assert(C.array == correct); 400 } 401 } 402 403 /// Compute the bitwise AND of 256 bits (representing integer data) in `a` and `b`. 404 __m256i _mm256_and_si256 (__m256i a, __m256i b) pure @safe 405 { 406 pragma(inline, true); 407 return a & b; 408 } 409 unittest 410 { 411 __m256i A = _mm256_set1_epi32(7); 412 __m256i B = _mm256_set1_epi32(14); 413 int8 R = cast(int8) _mm256_and_si256(A, B); 414 int[8] correct = [6, 6, 6, 6, 6, 6, 6, 6]; 415 assert(R.array == correct); 416 } 417 418 /// Compute the bitwise NOT of 256 bits (representing integer data) in `a` and then AND with `b`. 419 __m256i _mm256_andnot_si256 (__m256i a, __m256i b) pure @safe 420 { 421 // See: https://issues.dlang.org/show_bug.cgi?id=24283, 422 // need workaround if we ever use DMD AVX codegen 423 424 pragma(inline, true); 425 return (~a) & b; 426 } 427 unittest 428 { 429 __m256i A = _mm256_setr_epi32(7, -2, 9, 54654, 7, -2, 9, 54654); 430 __m256i B = _mm256_setr_epi32(14, 78, 111, -256, 14, 78, 111, -256); 431 int8 R = cast(int8) _mm256_andnot_si256(A, B); 432 int[8] correct = [8, 0, 102, -54784, 8, 0, 102, -54784]; 433 assert(R.array == correct); 434 } 435 436 /// Average packed unsigned 16-bit integers in `a` and `b`. 437 __m256i _mm256_avg_epu16 (__m256i a, __m256i b) pure @trusted 438 { 439 static if (GDC_with_AVX2) 440 { 441 return cast(__m256i) __builtin_ia32_pavgw256(cast(short16)a, cast(short16)b); 442 } 443 else static if (LDC_with_AVX2 && __VERSION__ >= 2094) 444 { 445 return cast(__m256i) __builtin_ia32_pavgw256(cast(short16)a, cast(short16)b); 446 } 447 else 448 { 449 // Splitting is always beneficial here, except -O0 450 __m128i a_lo = _mm256_extractf128_si256!0(a); 451 __m128i a_hi = _mm256_extractf128_si256!1(a); 452 __m128i b_lo = _mm256_extractf128_si256!0(b); 453 __m128i b_hi = _mm256_extractf128_si256!1(b); 454 __m128i r_lo = _mm_avg_epu16(a_lo, b_lo); 455 __m128i r_hi = _mm_avg_epu16(a_hi, b_hi); 456 return _mm256_set_m128i(r_hi, r_lo); 457 } 458 } 459 unittest 460 { 461 __m256i A = _mm256_set1_epi16(31457); 462 __m256i B = _mm256_set1_epi16(cast(short)64000); 463 short16 avg = cast(short16)(_mm256_avg_epu16(A, B)); 464 foreach(i; 0..16) 465 assert(avg.array[i] == cast(short)47729); 466 } 467 468 /// Average packed unsigned 8-bit integers in `a` and `b`. 469 __m256i _mm256_avg_epu8 (__m256i a, __m256i b) pure @trusted 470 { 471 static if (GDC_with_AVX2) 472 { 473 return cast(__m256i) __builtin_ia32_pavgb256(cast(ubyte32)a, cast(ubyte32)b); 474 } 475 else static if (LDC_with_AVX2 && __VERSION__ >= 2094) 476 { 477 return cast(__m256i) __builtin_ia32_pavgb256(cast(byte32)a, cast(byte32)b); 478 } 479 else 480 { 481 // Splitting is always beneficial here, except -O0 482 __m128i a_lo = _mm256_extractf128_si256!0(a); 483 __m128i a_hi = _mm256_extractf128_si256!1(a); 484 __m128i b_lo = _mm256_extractf128_si256!0(b); 485 __m128i b_hi = _mm256_extractf128_si256!1(b); 486 __m128i r_lo = _mm_avg_epu8(a_lo, b_lo); 487 __m128i r_hi = _mm_avg_epu8(a_hi, b_hi); 488 return _mm256_set_m128i(r_hi, r_lo); 489 } 490 } 491 unittest 492 { 493 __m256i A = _mm256_set1_epi8(-1); 494 __m256i B = _mm256_set1_epi8(13); 495 byte32 avg = cast(byte32)(_mm256_avg_epu8(A, B)); 496 foreach(i; 0..32) 497 assert(avg.array[i] == cast(byte)134); 498 } 499 500 /// Blend packed 16-bit integers from `a` and `b` within 128-bit lanes using 8-bit control 501 /// mask `imm8`, in each of the two lanes. 502 /// Note: this is functionally equivalent to two `_mm_blend_epi16`. 503 __m256i _mm256_blend_epi16(int imm8) (__m256i a, __m256i b) pure @trusted 504 { 505 // PERF DMD 506 assert(imm8 >= 0 && imm8 < 256); 507 enum bool split = true; // makes things better, except on ARM32 which is no better than naive 508 509 static if (GDC_with_AVX2) 510 { 511 return cast(__m256i) __builtin_ia32_pblendw256(cast(short16)a, cast(short16)b, imm8); 512 } 513 else static if (split) 514 { 515 __m128i a_lo = _mm256_extractf128_si256!0(a); 516 __m128i a_hi = _mm256_extractf128_si256!1(a); 517 __m128i b_lo = _mm256_extractf128_si256!0(b); 518 __m128i b_hi = _mm256_extractf128_si256!1(b); 519 __m128i r_lo = _mm_blend_epi16!(imm8)(a_lo, b_lo); 520 __m128i r_hi = _mm_blend_epi16!(imm8)(a_hi, b_hi); 521 return _mm256_set_m128i(r_hi, r_lo); 522 } 523 } 524 unittest 525 { 526 __m256i A = _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 0, -1, -2, -3, -4, -5, -6, -7); 527 __m256i B = _mm256_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15, -8, -9, -10, -11, -12, -13, -14, -15); 528 short16 C = cast(short16) _mm256_blend_epi16!147(A, B); // 10010011 10010011 529 short[16] correct = [8, 9, 2, 3, 12, 5, 6, 15, -8, -9, -2, -3, -12, -5, -6, -15]; 530 assert(C.array == correct); 531 } 532 533 /// Blend packed 32-bit integers from `a` and `b` using 4-bit control mask `imm8`. 534 __m128i _mm_blend_epi32(int imm8)(__m128i a, __m128i b) pure @trusted 535 { 536 // This one is interesting, it is functionally equivalent to SSE4.1 blendps (_mm_blend_ps) 537 // So without AVX2 we can always fallback to _mm_blend_ps 538 // And indeed, a shufflevector!int4 doesn't even use vpblendd with LDC, and prefer 539 // blendps and shufps so why bother. 540 541 // PERF DMD 542 static assert(imm8 >= 0 && imm8 < 16); 543 static if (GDC_with_AVX2) 544 { 545 return __builtin_ia32_pblendd128(a, b, imm8); 546 } 547 else 548 { 549 return cast(__m128i) _mm_blend_ps!imm8(cast(__m128)a, cast(__m128)b); 550 } 551 } 552 unittest 553 { 554 __m128i A = _mm_setr_epi32(0, 1, 2, 3); 555 __m128i B = _mm_setr_epi32(8, 9, 10, 11); 556 int4 C = _mm_blend_epi32!13(A, B); // 1101 557 int[4] correct = [8, 1, 10, 11]; 558 assert(C.array == correct); 559 } 560 561 /// Blend packed 32-bit integers from `a` and `b` using 8-bit control mask `imm8`. 562 __m256i _mm256_blend_epi32(int imm8)(__m256i a, __m256i b) pure @trusted 563 { 564 // This one is functionally equivalent to AVX _mm256_blend_ps, except with integers. 565 // With LDC, doing a shufflevector here would select the vblendps instruction anyway, 566 // so we might as well defer to _mm256_blend_ps. 567 568 // PERF DMD 569 static assert(imm8 >= 0 && imm8 < 256); 570 static if (GDC_with_AVX2) 571 { 572 return cast(__m256i) __builtin_ia32_pblendd256 (cast(int8)a, cast(int8)b, imm8); 573 } 574 else 575 { 576 return cast(__m256i) _mm256_blend_ps!imm8(cast(__m256)a, cast(__m256)b); 577 } 578 } 579 unittest 580 { 581 __m256i A = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); 582 __m256i B = _mm256_setr_epi32(8, 9, 10, 11, 12, 13, 147, 15); 583 int8 C = cast(int8) _mm256_blend_epi32!0xe7(A, B); 584 int[8] correct = [8, 9, 10, 3, 4, 13, 147, 15]; 585 assert(C.array == correct); 586 } 587 588 // TODO __m256i _mm256_blendv_epi8 (__m256i a, __m256i b, __m256i mask) pure @safe 589 590 /// Broadcast the low packed 8-bit integer from `a` to all elements of result. 591 __m128i _mm_broadcastb_epi8 (__m128i a) pure @safe 592 { 593 byte16 ba = cast(byte16)a; 594 byte16 r; 595 r = ba.array[0]; 596 return cast(__m128i)r; 597 } 598 unittest 599 { 600 byte16 A; 601 A.ptr[0] = 2; 602 byte16 B = cast(byte16) _mm_broadcastb_epi8(cast(__m128i)A); 603 byte[16] correct = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]; 604 assert(B.array == correct); 605 } 606 607 /// Bro0adcast the low packed 8-bit integer from `a` to all elements of result. 608 __m256i _mm256_broadcastb_epi8(__m128i a) pure @safe 609 { 610 byte16 ba = cast(byte16)a; 611 byte32 r; 612 r = ba.array[0]; 613 return cast(__m256i)r; 614 } 615 unittest 616 { 617 byte16 A; 618 A.ptr[0] = 2; 619 byte32 B = cast(byte32) _mm256_broadcastb_epi8(cast(__m128i)A); 620 byte[32] correct = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 621 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]; 622 assert(B.array == correct); 623 } 624 625 /// Broadcast the low packed 32-bit integer from `a` to all elements of result. 626 __m128i _mm_broadcastd_epi32 (__m128i a) pure @safe 627 { 628 int4 ba = cast(int4)a; 629 int4 r; 630 r = ba.array[0]; 631 return cast(__m128i)r; 632 } 633 unittest 634 { 635 int4 A; 636 A.ptr[0] = -2; 637 int4 B = cast(int4) _mm_broadcastd_epi32(cast(__m128i)A); 638 int[4] correct = [-2, -2, -2, -2]; 639 assert(B.array == correct); 640 } 641 642 /// Broadcast the low packed 32-bit integer from `a` to all elements of result. 643 __m256i _mm256_broadcastd_epi32 (__m128i a) pure @safe 644 { 645 int4 ba = cast(int4)a; 646 int8 r; 647 r = ba.array[0]; 648 return cast(__m256i)r; 649 } 650 unittest 651 { 652 int4 A; 653 A.ptr[0] = -2; 654 int8 B = cast(int8) _mm256_broadcastd_epi32(cast(__m128i)A); 655 int[8] correct = [-2, -2, -2, -2, -2, -2, -2, -2]; 656 assert(B.array == correct); 657 } 658 659 /// Broadcast the low packed 64-bit integer from `a` to all elements of result. 660 __m128i _mm_broadcastq_epi64 (__m128i a) pure @safe 661 { 662 long2 ba = cast(long2)a; 663 long2 r; 664 r = ba.array[0]; 665 return cast(__m128i)r; 666 } 667 unittest 668 { 669 long2 A; 670 A.ptr[0] = -2; 671 long2 B = cast(long2) _mm_broadcastq_epi64(cast(__m128i)A); 672 long[2] correct = [-2, -2]; 673 assert(B.array == correct); 674 } 675 676 /// Broadcast the low packed 64-bit integer from `a` to all elements of result. 677 __m256i _mm256_broadcastq_epi64 (__m128i a) pure @safe 678 { 679 long2 ba = cast(long2)a; 680 long4 r; 681 r = ba.array[0]; 682 return cast(__m256i)r; 683 } 684 unittest 685 { 686 long2 A; 687 A.ptr[0] = -2; 688 long4 B = cast(long4) _mm256_broadcastq_epi64(cast(__m128i)A); 689 long[4] correct = [-2, -2, -2, -2]; 690 assert(B.array == correct); 691 } 692 693 /// Broadcast the low double-precision (64-bit) floating-point element from `a` to all elements of result. 694 __m128d _mm_broadcastsd_pd (__m128d a) pure @safe 695 { 696 double2 r; 697 r = a.array[0]; 698 return r; 699 } 700 unittest 701 { 702 double2 A; 703 A.ptr[0] = 2; 704 double2 B = _mm_broadcastsd_pd(A); 705 double[2] correct = [2.0, 2.0]; 706 assert(B.array == correct); 707 } 708 709 /// Broadcast the low double-precision (64-bit) floating-point element from `a` to all elements of result. 710 __m256d _mm256_broadcastsd_pd (__m128d a) pure @safe 711 { 712 double4 r; 713 r = a.array[0]; 714 return r; 715 } 716 unittest 717 { 718 double2 A; 719 A.ptr[0] = 3; 720 double4 B = _mm256_broadcastsd_pd(A); 721 double[4] correct = [3.0, 3, 3, 3]; 722 assert(B.array == correct); 723 } 724 725 /// Broadcast 128 bits of integer data from ``a to all 128-bit lanes in result. 726 /// Note: also exist with name `_mm256_broadcastsi128_si256` which is identical. 727 __m256i _mm_broadcastsi128_si256 (__m128i a) pure @trusted 728 { 729 // Note that GDC will prefer vinserti128 to vbroadcast, for some reason 730 // So in the end it's the same as naive code. 731 // For this reason, __builtin_ia32_vbroadcastsi256 isn't used 732 long2 ba = cast(long2)a; 733 long4 r; 734 r.ptr[0] = ba.array[0]; 735 r.ptr[1] = ba.array[1]; 736 r.ptr[2] = ba.array[0]; 737 r.ptr[3] = ba.array[1]; 738 return cast(__m256i)r; 739 } 740 unittest 741 { 742 long2 A; 743 A.ptr[0] = 34; 744 A.ptr[1] = -56; 745 long4 B = cast(long4) _mm_broadcastsi128_si256(cast(__m128i)A); 746 long[4] correct = [34, -56, 34, -56]; 747 assert(B.array == correct); 748 } 749 750 ///ditto 751 alias _mm256_broadcastsi128_si256 = _mm_broadcastsi128_si256; // intrinsics is duplicated in the Guide, for some reason 752 753 /// Broadcast the low single-precision (32-bit) floating-point element from `a` to all elements of result. 754 __m128 _mm_broadcastss_ps (__m128 a) pure @safe 755 { 756 float4 r; 757 r = a.array[0]; 758 return r; 759 } 760 unittest 761 { 762 float4 A; 763 A.ptr[0] = 2; 764 float4 B = _mm_broadcastss_ps(A); 765 float[4] correct = [2.0f, 2, 2, 2]; 766 assert(B.array == correct); 767 } 768 769 /// Broadcast the low single-precision (32-bit) floating-point element from `a` to all elements of result. 770 __m256 _mm256_broadcastss_ps (__m128 a) pure @safe 771 { 772 float8 r; 773 r = a.array[0]; 774 return r; 775 } 776 unittest 777 { 778 float4 A; 779 A.ptr[0] = 2; 780 float8 B = _mm256_broadcastss_ps(A); 781 float[8] correct = [2.0f, 2, 2, 2, 2, 2, 2, 2]; 782 assert(B.array == correct); 783 } 784 785 /// Broadcast the low packed 16-bit integer from `a` to all elements of result. 786 __m128i _mm_broadcastw_epi16 (__m128i a) pure @safe 787 { 788 short8 ba = cast(short8)a; 789 short8 r; 790 r = ba.array[0]; 791 return cast(__m128i)r; 792 } 793 unittest 794 { 795 short8 A; 796 A.ptr[0] = 13; 797 short8 B = cast(short8) _mm_broadcastw_epi16(cast(__m128i)A); 798 short[8] correct = [13, 13, 13, 13, 13, 13, 13, 13]; 799 assert(B.array == correct); 800 } 801 802 /// Broadcast the low packed 16-bit integer from `a` to all elements of result. 803 __m256i _mm256_broadcastw_epi16 (__m128i a) pure @safe 804 { 805 short8 ba = cast(short8)a; 806 short16 r; 807 r = ba.array[0]; 808 return cast(__m256i)r; 809 } 810 unittest 811 { 812 short8 A; 813 A.ptr[0] = 13; 814 short16 B = cast(short16) _mm256_broadcastw_epi16(cast(__m128i)A); 815 short[16] correct = [13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13]; 816 assert(B.array == correct); 817 } 818 819 // TODO __m256i _mm256_bslli_epi128 (__m256i a, const int imm8) pure @safe 820 // TODO __m256i _mm256_bsrli_epi128 (__m256i a, const int imm8) pure @safe 821 822 /// Compare packed 16-bit integers in `a` and `b` for equality. 823 __m256i _mm256_cmpeq_epi16 (__m256i a, __m256i b) pure @trusted 824 { 825 // PERF: GDC without AVX 826 // PERF: DMD 827 static if (SIMD_COMPARISON_MASKS_32B) 828 { 829 // PERF: catastrophic in GDC without AVX2 830 return cast(__m256i)(cast(short16)a == cast(short16)b); 831 } 832 else static if (GDC_with_AVX2) 833 { 834 return cast(__m256i) __builtin_ia32_pcmpeqw256(cast(short16)a, cast(short16)b); 835 } 836 else version(LDC) 837 { 838 return cast(__m256i) equalMask!short16(cast(short16)a, cast(short16)b); 839 } 840 else 841 { 842 short16 sa = cast(short16)a; 843 short16 sb = cast(short16)b; 844 short16 sr; 845 for (int n = 0; n < 16; ++n) 846 { 847 bool cond = sa.array[n] == sb.array[n]; 848 sr.ptr[n] = cond ? -1 : 0; 849 } 850 return cast(__m256i) sr; 851 } 852 } 853 unittest 854 { 855 short16 A = [-3, -2, -1, 0, 0, 1, 2, 3, -3, -2, -1, 0, 0, 1, 2, 3]; 856 short16 B = [ 4, 3, 2, 1, 0, -1, -2, -3, -3, 3, 2, 1, 0, -1, -2, -3]; 857 short[16] E = [ 0, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0]; 858 short16 R = cast(short16)(_mm256_cmpeq_epi16(cast(__m256i)A, cast(__m256i)B)); 859 assert(R.array == E); 860 } 861 862 /// Compare packed 32-bit integers in `a` and `b` for equality. 863 __m256i _mm256_cmpeq_epi32 (__m256i a, __m256i b) pure @trusted 864 { 865 // PERF: GDC without AVX 866 // PERF: DMD 867 static if (SIMD_COMPARISON_MASKS_32B) 868 { 869 // Quite bad in GDC -mavx (with no AVX2) 870 return cast(__m256i)(cast(int8)a == cast(int8)b); 871 } 872 else static if (GDC_with_AVX2) 873 { 874 return cast(__m256i) __builtin_ia32_pcmpeqd256(cast(int8)a, cast(int8)b); 875 } 876 else version(LDC) 877 { 878 return cast(__m256i) equalMask!int8(cast(int8)a, cast(int8)b); 879 } 880 else 881 { 882 int8 ia = cast(int8)a; 883 int8 ib = cast(int8)b; 884 int8 ir; 885 for (int n = 0; n < 8; ++n) 886 { 887 bool cond = ia.array[n] == ib.array[n]; 888 ir.ptr[n] = cond ? -1 : 0; 889 } 890 return cast(__m256i) ir; 891 } 892 } 893 unittest 894 { 895 int8 A = [-3, -2, -1, 0, -3, -2, -1, 0]; 896 int8 B = [ 4, -2, 2, 0, 4, -2, 2, 0]; 897 int[8] E = [ 0, -1, 0, -1, 0, -1, 0, -1]; 898 int8 R = cast(int8)(_mm256_cmpeq_epi32(cast(__m256i)A, cast(__m256i)B)); 899 assert(R.array == E); 900 } 901 902 /// Compare packed 64-bit integers in `a` and `b` for equality. 903 __m256i _mm256_cmpeq_epi64 (__m256i a, __m256i b) pure @trusted 904 { 905 // PERF: GDC without AVX 906 // PERF: DMD 907 static if (SIMD_COMPARISON_MASKS_32B) 908 { 909 // Note: enabling this with DMD will probably lead to same bug as _mm_cmpeq_epi64 910 return cast(__m256i)(cast(long4)a == cast(long4)b); 911 } 912 else static if (GDC_with_AVX2) 913 { 914 return cast(__m256i)__builtin_ia32_pcmpeqq256(cast(long4)a, cast(long4)b); 915 } 916 else version(LDC) 917 { 918 return cast(__m256i) equalMask!long4(cast(long4)a, cast(long4)b); 919 } 920 else 921 { 922 long4 la = cast(long4)a; 923 long4 lb = cast(long4)b; 924 long4 res; 925 res.ptr[0] = (la.array[0] == lb.array[0]) ? -1 : 0; 926 res.ptr[1] = (la.array[1] == lb.array[1]) ? -1 : 0; 927 res.ptr[2] = (la.array[2] == lb.array[2]) ? -1 : 0; 928 res.ptr[3] = (la.array[3] == lb.array[3]) ? -1 : 0; 929 return cast(__m256i)res; 930 } 931 } 932 unittest 933 { 934 __m256i A = _mm256_setr_epi64(-1, -2, -1, -2); 935 __m256i B = _mm256_setr_epi64(-3, -2, -3, -3); 936 __m256i C = _mm256_setr_epi64(-1, -4, -1, -2); 937 long4 AB = cast(long4) _mm256_cmpeq_epi64(A, B); 938 long4 AC = cast(long4) _mm256_cmpeq_epi64(A, C); 939 long[4] correct1 = [ 0, -1, 0, 0]; 940 long[4] correct2 = [-1, 0, -1, -1]; 941 assert(AB.array == correct1); 942 assert(AC.array == correct2); 943 } 944 945 /// Compare packed 8-bit integers in `a` and `b` for equality. 946 __m256i _mm256_cmpeq_epi8 (__m256i a, __m256i b) pure @trusted 947 { 948 // PERF: GDC without AVX2, need split 949 // PERF: DMD 950 static if (SIMD_COMPARISON_MASKS_32B) 951 { 952 return cast(__m256i)(cast(byte32)a == cast(byte32)b); 953 } 954 else static if (GDC_with_AVX2) 955 { 956 return cast(__m256i) __builtin_ia32_pcmpeqb256(cast(ubyte32)a, cast(ubyte32)b); 957 } 958 else version(LDC) 959 { 960 return cast(__m256i) equalMask!byte32(cast(byte32)a, cast(byte32)b); 961 } 962 else 963 { 964 byte32 ba = cast(byte32)a; 965 byte32 bb = cast(byte32)b; 966 byte32 br; 967 for (int n = 0; n < 32; ++n) 968 { 969 bool cond = ba.array[n] == bb.array[n]; 970 br.ptr[n] = cond ? -1 : 0; 971 } 972 return cast(__m256i) br; 973 } 974 } 975 unittest 976 { 977 __m256i A = _mm256_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 978 1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 42); 979 __m256i B = _mm256_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1, 980 2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1); 981 byte32 C = cast(byte32) _mm256_cmpeq_epi8(A, B); 982 byte[32] correct = [0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0, -1, 983 0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0, 0]; 984 assert(C.array == correct); 985 } 986 987 /// Compare packed signed 16-bit integers in `a` and `b` for greater-than. 988 __m256i _mm256_cmpgt_epi16 (__m256i a, __m256i b) pure @safe 989 { 990 version(GNU) 991 enum bool mayUseComparisonOperator = GDC_with_AVX2; // too slow in GDC without AVX2 992 else 993 enum bool mayUseComparisonOperator = true; 994 995 static if (SIMD_COMPARISON_MASKS_32B && mayUseComparisonOperator) 996 { 997 return cast(__m256i)(cast(short16)a > cast(short16)b); 998 } 999 else static if (GDC_with_AVX2) 1000 { 1001 return cast(__m256i) __builtin_ia32_pcmpgtw256(cast(short16)a, cast(short16)b); 1002 } 1003 else // split 1004 { 1005 __m128i a_lo = _mm256_extractf128_si256!0(a); 1006 __m128i a_hi = _mm256_extractf128_si256!1(a); 1007 __m128i b_lo = _mm256_extractf128_si256!0(b); 1008 __m128i b_hi = _mm256_extractf128_si256!1(b); 1009 __m128i r_lo = _mm_cmpgt_epi16(a_lo, b_lo); 1010 __m128i r_hi = _mm_cmpgt_epi16(a_hi, b_hi); 1011 return _mm256_set_m128i(r_hi, r_lo); 1012 } 1013 } 1014 unittest 1015 { 1016 short16 A = [-3, -2, -1, 0, 0, 1, 2, 3, -3, -2, -1, 0, 0, 1, 2, 3]; 1017 short16 B = [ 4, 3, 2, 1, 0, -1, -2, -3, 4, -3, 2, 1, 0, -1, -2, -3]; 1018 short[16] E = [ 0, 0, 0, 0, 0, -1, -1, -1, 0, -1, 0, 0, 0, -1, -1, -1]; 1019 short16 R = cast(short16)(_mm256_cmpgt_epi16(cast(__m256i)A, cast(__m256i)B)); 1020 assert(R.array == E); 1021 } 1022 1023 /// Compare packed signed 32-bit integers in `a` and `b` for greater-than. 1024 __m256i _mm256_cmpgt_epi32 (__m256i a, __m256i b) pure @safe 1025 { 1026 version(GNU) 1027 enum bool mayUseComparisonOperator = GDC_with_AVX2; // too slow in GDC else 1028 else 1029 enum bool mayUseComparisonOperator = true; 1030 1031 static if (SIMD_COMPARISON_MASKS_32B && mayUseComparisonOperator) 1032 { 1033 return cast(__m256i)(cast(int8)a > cast(int8)b); 1034 } 1035 else static if (GDC_with_AVX2) 1036 { 1037 return cast(__m256i) __builtin_ia32_pcmpgtd256(cast(int8)a, cast(int8)b); 1038 } 1039 else // split 1040 { 1041 __m128i a_lo = _mm256_extractf128_si256!0(a); 1042 __m128i a_hi = _mm256_extractf128_si256!1(a); 1043 __m128i b_lo = _mm256_extractf128_si256!0(b); 1044 __m128i b_hi = _mm256_extractf128_si256!1(b); 1045 __m128i r_lo = _mm_cmpgt_epi32(a_lo, b_lo); 1046 __m128i r_hi = _mm_cmpgt_epi32(a_hi, b_hi); 1047 return _mm256_set_m128i(r_hi, r_lo); 1048 } 1049 } 1050 unittest 1051 { 1052 int8 A = [-3, 2, -1, 0, -3, 2, -1, 0]; 1053 int8 B = [ 4, -2, 2, 0, 4, -2, 2, 0]; 1054 int[8] E = [ 0, -1, 0, 0, 0, -1, 0, 0]; 1055 int8 R = cast(int8) _mm256_cmpgt_epi32(cast(__m256i)A, cast(__m256i)B); 1056 assert(R.array == E); 1057 } 1058 1059 __m256i _mm256_cmpgt_epi64 (__m256i a, __m256i b) pure @safe 1060 { 1061 version(GNU) 1062 enum bool mayUseComparisonOperator = GDC_with_AVX2; // too slow in GDC else 1063 else 1064 enum bool mayUseComparisonOperator = true; 1065 1066 static if (SIMD_COMPARISON_MASKS_32B && mayUseComparisonOperator) 1067 { 1068 return cast(__m256i)(cast(long4)a > cast(long4)b); 1069 } 1070 else static if (GDC_with_AVX2) 1071 { 1072 return cast(__m256i) __builtin_ia32_pcmpgtq256(cast(long4)a, cast(long4)b); 1073 } 1074 else // split 1075 { 1076 __m128i a_lo = _mm256_extractf128_si256!0(a); 1077 __m128i a_hi = _mm256_extractf128_si256!1(a); 1078 __m128i b_lo = _mm256_extractf128_si256!0(b); 1079 __m128i b_hi = _mm256_extractf128_si256!1(b); 1080 __m128i r_lo = _mm_cmpgt_epi64(a_lo, b_lo); 1081 __m128i r_hi = _mm_cmpgt_epi64(a_hi, b_hi); 1082 return _mm256_set_m128i(r_hi, r_lo); 1083 } 1084 } 1085 unittest 1086 { 1087 __m256i A = _mm256_setr_epi64(-3, 2, 70, 2); 1088 __m256i B = _mm256_setr_epi64 (4, -2, 4, -2); 1089 long[4] correct = [ 0, -1, -1, -1 ]; 1090 long4 R = cast(long4)(_mm256_cmpgt_epi64(A, B)); 1091 assert(R.array == correct); 1092 } 1093 1094 /// Compare packed signed 8-bit integers in `a` and `b` for greater-than. 1095 __m256i _mm256_cmpgt_epi8 (__m256i a, __m256i b) pure @safe 1096 { 1097 version(GNU) 1098 enum bool mayUseComparisonOperator = GDC_with_AVX2; // too slow in GDC without AVX2 1099 else 1100 enum bool mayUseComparisonOperator = true; 1101 1102 static if (SIMD_COMPARISON_MASKS_32B && mayUseComparisonOperator) 1103 { 1104 return cast(__m256i)(cast(byte32)a > cast(byte32)b); 1105 } 1106 else static if (GDC_with_AVX2) 1107 { 1108 return cast(__m256i) __builtin_ia32_pcmpgtb256(cast(short16)a, cast(short16)b); 1109 } 1110 else // split 1111 { 1112 __m128i a_lo = _mm256_extractf128_si256!0(a); 1113 __m128i a_hi = _mm256_extractf128_si256!1(a); 1114 __m128i b_lo = _mm256_extractf128_si256!0(b); 1115 __m128i b_hi = _mm256_extractf128_si256!1(b); 1116 __m128i r_lo = _mm_cmpgt_epi8(a_lo, b_lo); 1117 __m128i r_hi = _mm_cmpgt_epi8(a_hi, b_hi); 1118 return _mm256_set_m128i(r_hi, r_lo); 1119 } 1120 } 1121 unittest 1122 { 1123 __m256i A = _mm256_setr_epi8(1, 2, 3, 1, 127, -80, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1, 2, 3, 1, 127, -80, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1); 1124 __m256i B = _mm256_setr_epi8(2, 2, 1, 2, -128, -42, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1, 2, 2, 1, 2, -128, -42, 2, 3, 2, 1, 0, 0, 1, 2, 1, 0); 1125 byte32 C = cast(byte32) _mm256_cmpgt_epi8(A, B); 1126 byte[32] correct = [0, 0,-1, 0, -1, 0, 0, 0,-1,-1,-1, 0, 0, 0,-1, 0, 0, 0,-1, 0, -1, 0, 0, 0,-1,-1,-1, 0, 0, 0,-1,-1]; 1127 assert(C.array == correct); 1128 } 1129 1130 1131 /// Sign extend packed 16-bit integers in `a` to packed 32-bit integers. 1132 __m256i _mm256_cvtepi16_epi32 (__m128i a) pure @trusted 1133 { 1134 static if (GDC_with_AVX2) 1135 { 1136 return cast(__m256i) __builtin_ia32_pmovsxwd256(cast(short8)a); 1137 } 1138 else static if (LDC_with_optimizations) 1139 { 1140 enum ir = ` 1141 %r = sext <8 x i16> %0 to <8 x i32> 1142 ret <8 x i32> %r`; 1143 return cast(__m256i) LDCInlineIR!(ir, int8, short8)(cast(short8)a); 1144 } 1145 else 1146 { 1147 short8 sa = cast(short8)a; 1148 int8 r; 1149 r.ptr[0] = sa.array[0]; 1150 r.ptr[1] = sa.array[1]; 1151 r.ptr[2] = sa.array[2]; 1152 r.ptr[3] = sa.array[3]; 1153 r.ptr[4] = sa.array[4]; 1154 r.ptr[5] = sa.array[5]; 1155 r.ptr[6] = sa.array[6]; 1156 r.ptr[7] = sa.array[7]; 1157 return cast(__m256i)r; 1158 } 1159 } 1160 unittest 1161 { 1162 __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, -1, 0, -32768, 32767); 1163 int8 C = cast(int8) _mm256_cvtepi16_epi32(A); 1164 int[8] correct = [-1, 0, -32768, 32767, -1, 0, -32768, 32767]; 1165 assert(C.array == correct); 1166 } 1167 1168 1169 /// Sign extend packed 16-bit integers in `a` to packed 64-bit integers. 1170 __m256i _mm256_cvtepi16_epi64 (__m128i a) pure @trusted 1171 { 1172 static if (GDC_with_AVX2) 1173 { 1174 return cast(__m256i) __builtin_ia32_pmovsxwq256(cast(short8)a); 1175 } 1176 else static if (LDC_with_optimizations) 1177 { 1178 enum ir = ` 1179 %v = shufflevector <8 x i16> %0,<8 x i16> %0, <4 x i32> <i32 0, i32 1,i32 2, i32 3> 1180 %r = sext <4 x i16> %v to <4 x i64> 1181 ret <4 x i64> %r`; 1182 return cast(__m256i) LDCInlineIR!(ir, long4, short8)(cast(short8)a); 1183 } 1184 else 1185 { 1186 // LDC x86 generates vpmovsxwq since LDC 1.12 -O1 1187 short8 sa = cast(short8)a; 1188 long4 r; 1189 r.ptr[0] = sa.array[0]; 1190 r.ptr[1] = sa.array[1]; 1191 r.ptr[2] = sa.array[2]; 1192 r.ptr[3] = sa.array[3]; 1193 return cast(__m256i)r; 1194 } 1195 } 1196 unittest 1197 { 1198 __m128i A = _mm_setr_epi16(-1, 0, short.min, short.max, 2, 3, 4, 5); 1199 long4 C = cast(long4) _mm256_cvtepi16_epi64(A); 1200 long[4] correct = [-1, 0, short.min, short.max]; 1201 assert(C.array == correct); 1202 } 1203 1204 /// Sign extend packed 32-bit integers in `a` to packed 64-bit integers. 1205 __m256i _mm256_cvtepi32_epi64 (__m128i a) pure @trusted 1206 { 1207 long4 r; 1208 r.ptr[0] = a.array[0]; 1209 r.ptr[1] = a.array[1]; 1210 r.ptr[2] = a.array[2]; 1211 r.ptr[3] = a.array[3]; 1212 return cast(__m256i)r; 1213 } 1214 unittest 1215 { 1216 __m128i A = _mm_setr_epi32(-1, 0, int.min, int.max); 1217 long4 C = cast(long4) _mm256_cvtepi32_epi64(A); 1218 long[4] correct = [-1, 0, int.min, int.max]; 1219 assert(C.array == correct); 1220 } 1221 1222 /// Sign extend packed 8-bit integers in `a` to packed 16-bit integers. 1223 __m256i _mm256_cvtepi8_epi16 (__m128i a) pure @trusted 1224 { 1225 static if (GDC_with_AVX2) 1226 { 1227 return cast(__m256i) __builtin_ia32_pmovsxbw256(cast(ubyte16)a); 1228 } 1229 else static if (LDC_with_optimizations) 1230 { 1231 enum ir = ` 1232 %r = sext <16 x i8> %0 to <16 x i16> 1233 ret <16 x i16> %r`; 1234 return cast(__m256i) LDCInlineIR!(ir, short16, byte16)(cast(byte16)a); 1235 } 1236 else 1237 { 1238 short16 r; 1239 byte16 ba = cast(byte16)a; 1240 for (int n = 0; n < 16; ++n) 1241 { 1242 r.ptr[n] = ba.array[n]; 1243 } 1244 return cast(__m256i)r; 1245 } 1246 } 1247 unittest 1248 { 1249 __m128i A = _mm_setr_epi8(-1, 0, byte.min, byte.max, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13); 1250 short16 C = cast(short16) _mm256_cvtepi8_epi16(A); 1251 short[16] correct = [-1, 0, byte.min, byte.max, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]; 1252 assert(C.array == correct); 1253 } 1254 1255 /// Sign extend packed 8-bit integers in `a` to packed 32-bit integers. 1256 __m256i _mm256_cvtepi8_epi32 (__m128i a) pure @trusted 1257 { 1258 static if (GDC_with_AVX2) 1259 { 1260 return cast(__m256i) __builtin_ia32_pmovsxbd256(cast(ubyte16)a); 1261 } 1262 else static if (LDC_with_optimizations) 1263 { 1264 enum ir = ` 1265 %v = shufflevector <16 x i8> %0,<16 x i8> undef, <8 x i32> <i32 0, i32 1,i32 2, i32 3, i32 4, i32 5,i32 6, i32 7> 1266 %r = sext <8 x i8> %v to <8 x i32> 1267 ret <8 x i32> %r`; 1268 return cast(__m256i) LDCInlineIR!(ir, int8, byte16)(cast(byte16)a); 1269 } 1270 else 1271 { 1272 // PERF This is rather bad in GDC without AVX, or with DMD 1273 // should split that 1274 int8 r; 1275 byte16 ba = cast(byte16)a; 1276 for (int n = 0; n < 8; ++n) 1277 { 1278 r.ptr[n] = ba.array[n]; 1279 } 1280 return cast(__m256i)r; 1281 } 1282 } 1283 unittest 1284 { 1285 __m128i A = _mm_setr_epi8(-1, 0, byte.min, byte.max, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13); 1286 int8 C = cast(int8) _mm256_cvtepi8_epi32(A); 1287 int[8] correct = [-1, 0, byte.min, byte.max, 2, 3, 4, 5]; 1288 assert(C.array == correct); 1289 } 1290 1291 /// Sign extend packed 8-bit integers in the low 8 bytes of `a` to packed 64-bit integers. 1292 __m256i _mm256_cvtepi8_epi64 (__m128i a) pure @trusted 1293 { 1294 // PERF This is rather bad in GDC without AVX 1295 static if (GDC_with_AVX2) 1296 { 1297 return cast(__m256i) __builtin_ia32_pmovsxbq256(cast(ubyte16)a); 1298 } 1299 else static if (LDC_with_ARM64) 1300 { 1301 // 4 inst since LDC 1.22 -O2 1302 return _mm256_cvtepi16_epi64(_mm_cvtepi8_epi16(a)); 1303 } 1304 else static if (LDC_with_optimizations) 1305 { 1306 enum ir = ` 1307 %v = shufflevector <16 x i8> %0,<16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1308 %r = sext <4 x i8> %v to <4 x i64> 1309 ret <4 x i64> %r`; 1310 return cast(__m256i) LDCInlineIR!(ir, long4, byte16)(cast(byte16)a); 1311 } 1312 else 1313 { 1314 long4 r; 1315 byte16 ba = cast(byte16)a; 1316 for (int n = 0; n < 4; ++n) 1317 { 1318 r.ptr[n] = ba.array[n]; 1319 } 1320 return cast(__m256i)r; 1321 } 1322 } 1323 unittest 1324 { 1325 __m128i A = _mm_setr_epi8(-1, 0, byte.min, byte.max, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13); 1326 long4 C = cast(long4) _mm256_cvtepi8_epi64(A); 1327 long[4] correct = [-1, 0, byte.min, byte.max]; 1328 assert(C.array == correct); 1329 } 1330 1331 /// Zero-extend packed unsigned 16-bit integers in `a` to packed 32-bit integers. 1332 __m256i _mm256_cvtepu16_epi32(__m128i a) pure @trusted 1333 { 1334 static if (GDC_with_AVX2) 1335 { 1336 return cast(__m256i) __builtin_ia32_pmovzxwd256(cast(short8)a); 1337 } 1338 else 1339 { 1340 short8 sa = cast(short8)a; 1341 int8 r; 1342 r.ptr[0] = cast(ushort)sa.array[0]; 1343 r.ptr[1] = cast(ushort)sa.array[1]; 1344 r.ptr[2] = cast(ushort)sa.array[2]; 1345 r.ptr[3] = cast(ushort)sa.array[3]; 1346 r.ptr[4] = cast(ushort)sa.array[4]; 1347 r.ptr[5] = cast(ushort)sa.array[5]; 1348 r.ptr[6] = cast(ushort)sa.array[6]; 1349 r.ptr[7] = cast(ushort)sa.array[7]; 1350 return cast(__m256i)r; 1351 } 1352 } 1353 unittest 1354 { 1355 __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, -1, 0, -32768, 32767); 1356 int8 C = cast(int8) _mm256_cvtepu16_epi32(A); 1357 int[8] correct = [65535, 0, 32768, 32767, 65535, 0, 32768, 32767]; 1358 assert(C.array == correct); 1359 } 1360 1361 /// Zero-extend packed unsigned 16-bit integers in `a` to packed 64-bit integers. 1362 __m256i _mm256_cvtepu16_epi64(__m128i a) pure @trusted 1363 { 1364 static if (GDC_with_AVX2) 1365 { 1366 return cast(__m256i) __builtin_ia32_pmovzxwq256(cast(short8)a); 1367 } 1368 else static if (LDC_with_optimizations) 1369 { 1370 enum ir = ` 1371 %v = shufflevector <8 x i16> %0,<8 x i16> %0, <4 x i32> <i32 0, i32 1,i32 2, i32 3> 1372 %r = zext <4 x i16> %v to <4 x i64> 1373 ret <4 x i64> %r`; 1374 return cast(__m256i) LDCInlineIR!(ir, long4, short8)(cast(short8)a); 1375 } 1376 else 1377 { 1378 short8 sa = cast(short8)a; 1379 long4 r; 1380 r.ptr[0] = cast(ushort)sa.array[0]; 1381 r.ptr[1] = cast(ushort)sa.array[1]; 1382 r.ptr[2] = cast(ushort)sa.array[2]; 1383 r.ptr[3] = cast(ushort)sa.array[3]; 1384 return cast(__m256i)r; 1385 } 1386 } 1387 unittest 1388 { 1389 __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 2, 3, 4, 5); 1390 long4 C = cast(long4) _mm256_cvtepu16_epi64(A); 1391 long[4] correct = [65535, 0, 32768, 32767]; 1392 assert(C.array == correct); 1393 } 1394 1395 /// Zero-extend packed unsigned 32-bit integers in `a` to packed 64-bit integers. 1396 __m256i _mm256_cvtepu32_epi64 (__m128i a) pure @trusted 1397 { 1398 static if (GDC_with_AVX2) 1399 { 1400 return cast(__m256i) __builtin_ia32_pmovzxdq256(cast(int4)a); 1401 } 1402 else static if (LDC_with_optimizations) 1403 { 1404 enum ir = ` 1405 %r = zext <4 x i32> %0 to <4 x i64> 1406 ret <4 x i64> %r`; 1407 return cast(__m256i) LDCInlineIR!(ir, long4, int4)(cast(int4)a); 1408 } 1409 else 1410 { 1411 long4 r; 1412 r.ptr[0] = cast(uint)a.array[0]; 1413 r.ptr[1] = cast(uint)a.array[1]; 1414 r.ptr[2] = cast(uint)a.array[2]; 1415 r.ptr[3] = cast(uint)a.array[3]; 1416 return cast(__m256i)r; 1417 } 1418 } 1419 unittest 1420 { 1421 __m128i A = _mm_setr_epi32(-1, 0, int.min, int.max); 1422 long4 C = cast(long4) _mm256_cvtepu32_epi64(A); 1423 long[4] correct = [uint.max, 0, 2_147_483_648, int.max]; 1424 assert(C.array == correct); 1425 } 1426 1427 /// Zero-extend packed unsigned 8-bit integers in `a` to packed 16-bit integers. 1428 __m256i _mm256_cvtepu8_epi16 (__m128i a) pure @trusted 1429 { 1430 static if (GDC_with_AVX2) 1431 { 1432 return cast(__m256i) __builtin_ia32_pmovzxbw256(cast(ubyte16)a); 1433 } 1434 else static if (LDC_with_optimizations) 1435 { 1436 enum ir = ` 1437 %r = zext <16 x i8> %0 to <16 x i16> 1438 ret <16 x i16> %r`; 1439 return cast(__m256i) LDCInlineIR!(ir, short16, byte16)(cast(byte16)a); 1440 } 1441 else 1442 { 1443 short16 r; 1444 byte16 ba = cast(byte16)a; 1445 for (int n = 0; n < 16; ++n) 1446 { 1447 r.ptr[n] = cast(ubyte)ba.array[n]; 1448 } 1449 return cast(__m256i)r; 1450 } 1451 } 1452 unittest 1453 { 1454 __m128i A = _mm_setr_epi8(-1, 0, -128, 127, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13); 1455 short16 C = cast(short16) _mm256_cvtepu8_epi16(A); 1456 short[16] correct = [255, 0, 128, 127, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]; 1457 assert(C.array == correct); 1458 } 1459 1460 /// Zero-extend packed unsigned 8-bit integers in `a` to packed 32-bit integers. 1461 __m256i _mm256_cvtepu8_epi32 (__m128i a) pure @trusted 1462 { 1463 static if (GDC_with_AVX2) 1464 { 1465 return cast(__m256i) __builtin_ia32_pmovzxbd256(cast(ubyte16)a); 1466 } 1467 else static if (LDC_with_optimizations) 1468 { 1469 enum ir = ` 1470 %v = shufflevector <16 x i8> %0,<16 x i8> %0, <8 x i32> <i32 0, i32 1,i32 2, i32 3, i32 4, i32 5,i32 6, i32 7> 1471 %r = zext <8 x i8> %v to <8 x i32> 1472 ret <8 x i32> %r`; 1473 return cast(__m256i) LDCInlineIR!(ir, int8, byte16)(cast(byte16)a); 1474 } 1475 else 1476 { 1477 int8 r; 1478 byte16 ba = cast(byte16)a; 1479 for (int n = 0; n < 8; ++n) 1480 { 1481 r.ptr[n] = cast(ubyte)ba.array[n]; 1482 } 1483 return cast(__m256i)r; 1484 } 1485 } 1486 unittest 1487 { 1488 __m128i A = _mm_setr_epi8(-1, 0, -128, 127, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13); 1489 int8 C = cast(int8) _mm256_cvtepu8_epi32(A); 1490 int[8] correct = [255, 0, 128, 127, 2, 3, 4, 5]; 1491 assert(C.array == correct); 1492 } 1493 1494 /// Zero-extend packed unsigned 8-bit integers in `a` to packed 64-bit integers. 1495 __m256i _mm256_cvtepu8_epi64 (__m128i a) pure @trusted 1496 { 1497 // PERF ARM64+LDC, not awesome 1498 static if (GDC_with_AVX2) 1499 { 1500 return cast(__m256i) __builtin_ia32_pmovzxbq256(cast(ubyte16)a); 1501 } 1502 else static if (LDC_with_optimizations) 1503 { 1504 enum ir = ` 1505 %v = shufflevector <16 x i8> %0,<16 x i8> %0, <4 x i32> <i32 0, i32 1,i32 2, i32 3> 1506 %r = zext <4 x i8> %v to <4 x i64> 1507 ret <4 x i64> %r`; 1508 return cast(__m256i) LDCInlineIR!(ir, long4, byte16)(cast(byte16)a); 1509 } 1510 else 1511 { 1512 long4 r; 1513 byte16 ba = cast(byte16)a; 1514 for (int n = 0; n < 4; ++n) 1515 { 1516 r.ptr[n] = cast(ubyte)ba.array[n]; 1517 } 1518 return cast(__m256i)r; 1519 } 1520 } 1521 unittest 1522 { 1523 __m128i A = _mm_setr_epi8(-1, 0, -128, 127, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13); 1524 long4 C = cast(long4) _mm256_cvtepu8_epi64(A); 1525 long[4] correct = [255, 0, 128, 127]; 1526 assert(C.array == correct); 1527 } 1528 1529 /// Extract a 16-bit integer from `a`, selected with index. 1530 int _mm256_extract_epi16 (__m256i a, int index) pure @trusted 1531 { 1532 short16 sa = cast(short16)a; 1533 return sa.ptr[index & 15]; 1534 } 1535 unittest 1536 { 1537 short16 b; 1538 b = 43; 1539 assert(_mm256_extract_epi16(cast(__m256i)b, 7) == 43); 1540 } 1541 1542 /// Extract a 8-bit integer from `a`, selected with index. 1543 int _mm256_extract_epi8 (__m256i a, int index) pure @trusted 1544 { 1545 byte32 sa = cast(byte32)a; 1546 return sa.ptr[index & 31]; 1547 } 1548 unittest 1549 { 1550 byte32 b; 1551 b = -44; 1552 assert(_mm256_extract_epi8(cast(__m256i)b, 5) == -44); 1553 assert(_mm256_extract_epi8(cast(__m256i)b, 5 + 32) == -44); 1554 } 1555 1556 /// Extract 128 bits (composed of integer data) from `a`, selected with `imm8`. 1557 __m128i _mm256_extracti128_si256(int imm8)(__m256i a) pure @trusted 1558 if ( (imm8 == 0) || (imm8 == 1) ) 1559 { 1560 pragma(inline, true); 1561 1562 static if (GDC_with_AVX2) 1563 { 1564 return cast(__m128i) __builtin_ia32_extract128i256(a, imm8); 1565 } 1566 else static if (LDC_with_optimizations) 1567 { 1568 enum str = (imm8 == 1) ? "<i32 2, i32 3>" : "<i32 0, i32 1>"; 1569 enum ir = "%r = shufflevector <4 x i64> %0, <4 x i64> undef, <2 x i32>" ~ str ~ "\n" ~ 1570 "ret <2 x i64> %r"; 1571 return cast(__m128i) LDCInlineIR!(ir, ulong2, ulong4)(cast(ulong4)a); 1572 } 1573 else 1574 { 1575 long4 al = cast(long4) a; 1576 long2 ret; 1577 ret.ptr[0] = (imm8==1) ? al.array[2] : al.array[0]; 1578 ret.ptr[1] = (imm8==1) ? al.array[3] : al.array[1]; 1579 return cast(__m128i) ret; 1580 } 1581 } 1582 unittest 1583 { 1584 __m256i A = _mm256_setr_epi32( -7, -1, 0, 9, -100, 100, 234, 432 ); 1585 int[4] correct0 = [ -7, -1, 0, 9 ]; 1586 int[4] correct1 = [ -100, 100, 234, 432 ]; 1587 __m128i R0 = _mm256_extracti128_si256!(0)(A); 1588 __m128i R1 = _mm256_extracti128_si256!(1)(A); 1589 assert(R0.array == correct0); 1590 assert(R1.array == correct1); 1591 } 1592 1593 // TODO __m256i _mm256_hadd_epi16 (__m256i a, __m256i b) pure @safe 1594 // TODO __m256i _mm256_hadd_epi32 (__m256i a, __m256i b) pure @safe 1595 // TODO __m256i _mm256_hadds_epi16 (__m256i a, __m256i b) pure @safe 1596 // TODO __m256i _mm256_hsub_epi16 (__m256i a, __m256i b) pure @safe 1597 // TODO __m256i _mm256_hsub_epi32 (__m256i a, __m256i b) pure @safe 1598 // TODO __m256i _mm256_hsubs_epi16 (__m256i a, __m256i b) pure @safe 1599 1600 // TODO __m128i _mm_i32gather_epi32 (int const* base_addr, __m128i vindex, const int scale) pure @safe 1601 // TODO __m128i _mm_mask_i32gather_epi32 (__m128i src, int const* base_addr, __m128i vindex, __m128i mask, const int scale) pure @safe 1602 // TODO __m256i _mm256_i32gather_epi32 (int const* base_addr, __m256i vindex, const int scale) pure @safe 1603 // TODO __m256i _mm256_mask_i32gather_epi32 (__m256i src, int const* base_addr, __m256i vindex, __m256i mask, const int scale) pure @safe 1604 // TODO __m128i _mm_i32gather_epi64 (__int64 const* base_addr, __m128i vindex, const int scale) pure @safe 1605 // TODO __m128i _mm_mask_i32gather_epi64 (__m128i src, __int64 const* base_addr, __m128i vindex, __m128i mask, const int scale) pure @safe 1606 // TODO __m256i _mm256_i32gather_epi64 (__int64 const* base_addr, __m128i vindex, const int scale) pure @safe 1607 // TODO __m256i _mm256_mask_i32gather_epi64 (__m256i src, __int64 const* base_addr, __m128i vindex, __m256i mask, const int scale) pure @safe 1608 // TODO __m128d _mm_i32gather_pd (double const* base_addr, __m128i vindex, const int scale) pure @safe 1609 // TODO __m128d _mm_mask_i32gather_pd (__m128d src, double const* base_addr, __m128i vindex, __m128d mask, const int scale) pure @safe 1610 // TODO __m256d _mm256_i32gather_pd (double const* base_addr, __m128i vindex, const int scale) pure @safe 1611 // TODO __m256d _mm256_mask_i32gather_pd (__m256d src, double const* base_addr, __m128i vindex, __m256d mask, const int scale) pure @safe 1612 // TODO __m128 _mm_i32gather_ps (float const* base_addr, __m128i vindex, const int scale) pure @safe 1613 // TODO __m128 _mm_mask_i32gather_ps (__m128 src, float const* base_addr, __m128i vindex, __m128 mask, const int scale) pure @safe 1614 // TODO __m256 _mm256_i32gather_ps (float const* base_addr, __m256i vindex, const int scale) pure @safe 1615 // TODO __m256 _mm256_mask_i32gather_ps (__m256 src, float const* base_addr, __m256i vindex, __m256 mask, const int scale) pure @safe 1616 // TODO __m128i _mm_i64gather_epi32 (int const* base_addr, __m128i vindex, const int scale) pure @safe 1617 // TODO __m128i _mm_mask_i64gather_epi32 (__m128i src, int const* base_addr, __m128i vindex, __m128i mask, const int scale) pure @safe 1618 // TODO __m128i _mm256_i64gather_epi32 (int const* base_addr, __m256i vindex, const int scale) pure @safe 1619 // TODO __m128i _mm256_mask_i64gather_epi32 (__m128i src, int const* base_addr, __m256i vindex, __m128i mask, const int scale) pure @safe 1620 // TODO __m128i _mm_i64gather_epi64 (__int64 const* base_addr, __m128i vindex, const int scale) pure @safe 1621 // TODO __m128i _mm_mask_i64gather_epi64 (__m128i src, __int64 const* base_addr, __m128i vindex, __m128i mask, const int scale) pure @safe 1622 // TODO __m256i _mm256_i64gather_epi64 (__int64 const* base_addr, __m256i vindex, const int scale) pure @safe 1623 // TODO __m256i _mm256_mask_i64gather_epi64 (__m256i src, __int64 const* base_addr, __m256i vindex, __m256i mask, const int scale) pure @safe 1624 // TODO __m128d _mm_i64gather_pd (double const* base_addr, __m128i vindex, const int scale) pure @safe 1625 // TODO __m128d _mm_mask_i64gather_pd (__m128d src, double const* base_addr, __m128i vindex, __m128d mask, const int scale) pure @safe 1626 // TODO __m256d _mm256_i64gather_pd (double const* base_addr, __m256i vindex, const int scale) pure @safe 1627 // TODO __m256d _mm256_mask_i64gather_pd (__m256d src, double const* base_addr, __m256i vindex, __m256d mask, const int scale) pure @safe 1628 // TODO __m128 _mm_i64gather_ps (float const* base_addr, __m128i vindex, const int scale) pure @safe 1629 // TODO __m128 _mm_mask_i64gather_ps (__m128 src, float const* base_addr, __m128i vindex, __m128 mask, const int scale) pure @safe 1630 // TODO __m128 _mm256_i64gather_ps (float const* base_addr, __m256i vindex, const int scale) pure @safe 1631 // TODO __m128 _mm256_mask_i64gather_ps (__m128 src, float const* base_addr, __m256i vindex, __m128 mask, const int scale) pure @safe 1632 1633 1634 /// Copy `a` to result, then insert 128 bits from `b` into result at the location specified by 1635 /// `imm8`. 1636 __m256i _mm256_inserti128_si256 (__m256i a, __m128i b, const int imm8) pure @trusted 1637 { 1638 long2 lb = cast(long2)b; 1639 a.ptr[(imm8 & 1)*2 ] = lb.array[0]; 1640 a.ptr[(imm8 & 1)*2+1] = lb.array[1]; 1641 return a; 1642 } 1643 unittest 1644 { 1645 __m256i A = [0, 1, 2, 3]; 1646 long2 B = [4, 5]; 1647 __m256i C = _mm256_inserti128_si256(A, cast(__m128i)B, 0 + 8); 1648 __m256i D = _mm256_inserti128_si256(A, cast(__m128i)B, 1); 1649 long[4] correctC = [4, 5, 2, 3]; 1650 long[4] correctD = [0, 1, 4, 5]; 1651 assert(C.array == correctC); 1652 assert(D.array == correctD); 1653 } 1654 1655 /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate 1656 /// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, 1657 /// and pack the results in destination. 1658 __m256i _mm256_madd_epi16 (__m256i a, __m256i b) pure @trusted 1659 { 1660 static if (GDC_with_AVX2) 1661 { 1662 return cast(__m256i) __builtin_ia32_pmaddwd256(cast(short16)a, cast(short16)b); 1663 } 1664 else static if (LDC_with_AVX2) 1665 { 1666 return cast(__m256i) __builtin_ia32_pmaddwd256(cast(short16)a, cast(short16)b); 1667 } 1668 else 1669 { 1670 // split is beneficial for ARM64, LDC and GDC without AVX2 1671 __m128i a_lo = _mm256_extractf128_si256!0(a); 1672 __m128i a_hi = _mm256_extractf128_si256!1(a); 1673 __m128i b_lo = _mm256_extractf128_si256!0(b); 1674 __m128i b_hi = _mm256_extractf128_si256!1(b); 1675 __m128i r_lo = _mm_madd_epi16(a_lo, b_lo); 1676 __m128i r_hi = _mm_madd_epi16(a_hi, b_hi); 1677 return _mm256_set_m128i(r_hi, r_lo); 1678 } 1679 } 1680 unittest 1681 { 1682 short16 A = [0, 1, 2, 3, -32768, -32768, 32767, 32767, 0, 1, 2, 3, -32768, -32768, 32767, 32767]; 1683 short16 B = [0, 1, 2, 3, -32768, -32768, 32767, 32767, 0, 1, 2, 3, -32768, -32768, 32767, 32767]; 1684 int8 R = cast(int8) _mm256_madd_epi16(cast(__m256i)A, cast(__m256i)B); 1685 int[8] correct = [1, 13, -2147483648, 2*32767*32767, 1, 13, -2147483648, 2*32767*32767]; 1686 assert(R.array == correct); 1687 } 1688 1689 // TODO __m256i _mm256_maddubs_epi16 (__m256i a, __m256i b) pure @safe 1690 1691 version(DigitalMars) 1692 { 1693 // this avoids a bug with DMD < 2.099 -a x86 -O 1694 private enum bool maskLoadWorkaroundDMD = (__VERSION__ < 2099); 1695 } 1696 else 1697 { 1698 private enum bool maskLoadWorkaroundDMD = false; 1699 } 1700 1701 /// Load packed 32-bit integers from memory using `mask` (elements are zeroed out when the highest 1702 /// bit is not set in the corresponding element). 1703 /// Warning: See "Note about mask load/store" to know why you must address valid memory only. 1704 __m128i _mm_maskload_epi32 (const(int)* mem_addr, __m128i mask) /* pure */ @system 1705 { 1706 // PERF DMD 1707 static if (LDC_with_AVX2) 1708 { 1709 // MAYDO report that the builtin is impure 1710 return __builtin_ia32_maskloadd(mem_addr, mask); 1711 } 1712 else static if (GDC_with_AVX2) 1713 { 1714 return __builtin_ia32_maskloadd(cast(__m128i*)mem_addr, mask); 1715 } 1716 else 1717 { 1718 return cast(__m128i) _mm_maskload_ps(cast(const(float)*)mem_addr, mask); 1719 } 1720 } 1721 unittest 1722 { 1723 static if (!maskLoadWorkaroundDMD) 1724 { 1725 int[4] A = [7, 1, 2, 3]; 1726 int4 B = _mm_maskload_epi32(A.ptr, _mm_setr_epi32(1, -1, -1, 1)); // can address invalid memory with mask load and writes! 1727 int[4] correct = [0, 1, 2, 0]; 1728 assert(B.array == correct); 1729 } 1730 } 1731 1732 1733 // TODO __m256i _mm256_maskload_epi32 (int const* mem_addr, __m256i mask) pure @safe 1734 // TODO __m128i _mm_maskload_epi64 (__int64 const* mem_addr, __m128i mask) pure @safe 1735 // TODO __m256i _mm256_maskload_epi64 (__int64 const* mem_addr, __m256i mask) pure @safe 1736 1737 /// Compare packed signed 16-bit integers in `a` and `b`, and return packed maximum values. 1738 __m256i _mm256_max_epi16 (__m256i a, __m256i b) pure @safe 1739 { 1740 // PERF D_SIMD 1741 version(GNU) 1742 enum bool split = true; 1743 else static if (SIMD_COMPARISON_MASKS_32B) 1744 enum bool split = false; 1745 else 1746 enum bool split = true; 1747 1748 static if (GDC_with_AVX2) 1749 { 1750 return cast(__m256i) __builtin_ia32_pmaxsw256(cast(short16)a, cast(short16)b); 1751 } 1752 else static if (split) 1753 { 1754 // split 1755 __m128i a_lo = _mm256_extractf128_si256!0(a); 1756 __m128i a_hi = _mm256_extractf128_si256!1(a); 1757 __m128i b_lo = _mm256_extractf128_si256!0(b); 1758 __m128i b_hi = _mm256_extractf128_si256!1(b); 1759 __m128i r_lo = _mm_max_epi16(a_lo, b_lo); 1760 __m128i r_hi = _mm_max_epi16(a_hi, b_hi); 1761 return _mm256_set_m128i(r_hi, r_lo); 1762 } 1763 else static if (SIMD_COMPARISON_MASKS_32B) 1764 { 1765 // catastrophic with GDC x86 for some reason. Sad. 1766 short16 sa = cast(short16)a; 1767 short16 sb = cast(short16)b; 1768 short16 greater = sa > sb; 1769 return cast(__m256i)( (greater & sa) | (~greater & sb) ); 1770 } 1771 else 1772 static assert(0); 1773 } 1774 unittest 1775 { 1776 short16 R = cast(short16) _mm256_max_epi16(_mm256_setr_epi16(32767, 1, -4, -8, 9, 7, 0,-57, 1, 0, 0, 0, 1, 0, 0, 0), 1777 _mm256_setr_epi16( -4,-8, 9, 7, 0,-32768, 0, 0, 0, 2, 0, 4, 2, 1, 2, -4)); 1778 short[16] correct = [32767, 1, 9, 7, 9, 7, 0, 0, 1, 2, 0, 4, 2, 1, 2, 0]; 1779 assert(R.array == correct); 1780 } 1781 1782 /// Compare packed signed 32-bit integers in `a` and `b`, and return packed maximum values. 1783 __m256i _mm256_max_epi32 (__m256i a, __m256i b) pure @safe 1784 { 1785 // PERF D_SIMD 1786 version(GNU) 1787 enum bool split = true; 1788 else static if (SIMD_COMPARISON_MASKS_32B) 1789 enum bool split = false; 1790 else 1791 enum bool split = true; 1792 1793 static if (GDC_with_AVX2) 1794 { 1795 return cast(__m256i) __builtin_ia32_pmaxsd256(cast(int8)a, cast(int8)b); 1796 } 1797 else static if (split) 1798 { 1799 // split 1800 __m128i a_lo = _mm256_extractf128_si256!0(a); 1801 __m128i a_hi = _mm256_extractf128_si256!1(a); 1802 __m128i b_lo = _mm256_extractf128_si256!0(b); 1803 __m128i b_hi = _mm256_extractf128_si256!1(b); 1804 __m128i r_lo = _mm_max_epi32(a_lo, b_lo); 1805 __m128i r_hi = _mm_max_epi32(a_hi, b_hi); 1806 return _mm256_set_m128i(r_hi, r_lo); 1807 } 1808 else static if (SIMD_COMPARISON_MASKS_32B) 1809 { 1810 // catastrophic with GDC x86 for some reason, like for 16-bit numbers. 1811 int8 sa = cast(int8)a; 1812 int8 sb = cast(int8)b; 1813 int8 greater = sa > sb; 1814 return cast(__m256i)( (greater & sa) | (~greater & sb) ); 1815 } 1816 else 1817 static assert(0); 1818 } 1819 unittest 1820 { 1821 int8 R = cast(int8) _mm256_max_epi32(_mm256_setr_epi32(0x7fffffff, 1, -4, 7, 0x7fffffff, 2, -4, 7), 1822 _mm256_setr_epi32( -4,-8, 9, -8,-0x80000000,-8, 9, -8)); 1823 int[8] correct = [0x7fffffff, 1, 9, 7, 0x7fffffff, 2, 9, 7]; 1824 assert(R.array == correct); 1825 } 1826 1827 /// Compare packed signed 8-bit integers in `a` and `b`, and return packed maximum values. 1828 __m256i _mm256_max_epi8 (__m256i a, __m256i b) pure @trusted 1829 { 1830 // PERF D_SIMD 1831 version(GNU) 1832 enum bool split = true; 1833 else static if (SIMD_COMPARISON_MASKS_32B) 1834 enum bool split = false; 1835 else 1836 enum bool split = true; 1837 static if (GDC_with_AVX2) 1838 { 1839 // Strangely, GDC asks for unsigned ubyte32 1840 return cast(__m256i) __builtin_ia32_pmaxsb256(cast(ubyte32)a, cast(ubyte32)b); 1841 } 1842 else static if (split) 1843 { 1844 // split 1845 __m128i a_lo = _mm256_extractf128_si256!0(a); 1846 __m128i a_hi = _mm256_extractf128_si256!1(a); 1847 __m128i b_lo = _mm256_extractf128_si256!0(b); 1848 __m128i b_hi = _mm256_extractf128_si256!1(b); 1849 __m128i r_lo = _mm_max_epi8(a_lo, b_lo); 1850 __m128i r_hi = _mm_max_epi8(a_hi, b_hi); 1851 return _mm256_set_m128i(r_hi, r_lo); 1852 } 1853 else static if (SIMD_COMPARISON_MASKS_32B) 1854 { 1855 // This is real bad with GDC, again 1856 byte32 sa = cast(byte32)a; 1857 byte32 sb = cast(byte32)b; 1858 byte32 greater = cast(byte32)(sa > sb); 1859 return cast(__m256i)( (greater & sa) | (~greater & sb) ); 1860 } 1861 else 1862 static assert(false); 1863 } 1864 unittest 1865 { 1866 __m256i A = _mm256_setr_epi8(127, 1, -4, -8, 9, 7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0, 127, 1, -4, -8, 9, 7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0); 1867 __m256i B = _mm256_setr_epi8( 4, -8, 9, -7, 0, -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, -8, 9, -7, 0, -128, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0); 1868 byte32 R = cast(byte32) _mm256_max_epi8(A, B); 1869 byte[32] correct = [127, 1, 9, -7, 9, 7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0, 127, 1, 9, -7, 9, 7, 0, 57, 0, 0, 0, 0, 0, 4, 0, 0]; 1870 assert(R.array == correct); 1871 } 1872 1873 /// Compare packed unsigned 16-bit integers in `a` and `b`, and return packed maximum values. 1874 __m256i _mm256_max_epu16 (__m256i a, __m256i b) pure @trusted 1875 { 1876 // PERF D_SIMD 1877 version(GNU) 1878 enum bool split = true; 1879 else static if (SIMD_COMPARISON_MASKS_32B) 1880 enum bool split = false; 1881 else 1882 enum bool split = true; 1883 1884 static if (GDC_with_AVX2) 1885 { 1886 return cast(__m256i) __builtin_ia32_pmaxuw256(cast(short16)a, cast(short16)b); 1887 } 1888 else static if (split) 1889 { 1890 // split 1891 __m128i a_lo = _mm256_extractf128_si256!0(a); 1892 __m128i a_hi = _mm256_extractf128_si256!1(a); 1893 __m128i b_lo = _mm256_extractf128_si256!0(b); 1894 __m128i b_hi = _mm256_extractf128_si256!1(b); 1895 __m128i r_lo = _mm_max_epu16(a_lo, b_lo); 1896 __m128i r_hi = _mm_max_epu16(a_hi, b_hi); 1897 return _mm256_set_m128i(r_hi, r_lo); 1898 } 1899 else static if (SIMD_COMPARISON_MASKS_32B) 1900 { 1901 // catastrophic with GDC x86_64, good with LDC 1902 short16 sa = cast(short16)a; 1903 short16 sb = cast(short16)b; 1904 short16 greater = cast(short16)(cast(ushort16)sa > cast(ushort16)sb); 1905 return cast(__m256i)( (greater & sa) | (~greater & sb) ); 1906 } 1907 else 1908 static assert(false); 1909 } 1910 unittest 1911 { 1912 short16 R = cast(short16) _mm256_max_epu16(_mm256_setr_epi16(32767, 1, -4, -8, 9, 7, 0,-57, 1, 0, 0, 0, 1, 0, 0, -6), 1913 _mm256_setr_epi16( -4,-8, 9, 7, 0,-32768, 0, 0, 0, 2, 0, 4, 2, 1, 2, -4)); 1914 short[16] correct = [-4,-8, -4, -8, 9,-32768, 0,-57, 1, 2, 0, 4, 2, 1, 2, -4]; 1915 assert(R.array == correct); 1916 } 1917 1918 /// Compare packed unsigned 32-bit integers in `a` and `b`, and return packed maximum values. 1919 __m256i _mm256_max_epu32 (__m256i a, __m256i b) pure @safe 1920 { 1921 // PERF D_SIMD 1922 version(GNU) 1923 enum bool split = true; 1924 else static if (SIMD_COMPARISON_MASKS_32B) 1925 enum bool split = false; 1926 else 1927 enum bool split = true; 1928 1929 static if (GDC_with_AVX2) 1930 { 1931 return cast(__m256i) __builtin_ia32_pmaxud256(cast(int8)a, cast(int8)b); 1932 } 1933 else static if (split) 1934 { 1935 // split 1936 __m128i a_lo = _mm256_extractf128_si256!0(a); 1937 __m128i a_hi = _mm256_extractf128_si256!1(a); 1938 __m128i b_lo = _mm256_extractf128_si256!0(b); 1939 __m128i b_hi = _mm256_extractf128_si256!1(b); 1940 __m128i r_lo = _mm_max_epu32(a_lo, b_lo); 1941 __m128i r_hi = _mm_max_epu32(a_hi, b_hi); 1942 return _mm256_set_m128i(r_hi, r_lo); 1943 } 1944 else static if (SIMD_COMPARISON_MASKS_32B) 1945 { 1946 // catastrophic with GDC x86 for some reason, like for 16-bit numbers. 1947 uint8 sa = cast(uint8)a; 1948 uint8 sb = cast(uint8)b; 1949 uint8 greater = sa > sb; 1950 return cast(__m256i)( (greater & sa) | (~greater & sb) ); 1951 } 1952 else 1953 static assert(0); 1954 } 1955 unittest 1956 { 1957 int8 R = cast(int8) _mm256_max_epu32(_mm256_setr_epi32(0x7fffffff, 1, 4, -7, 0x7fffffff, 1, 11, -7), 1958 _mm256_setr_epi32( -4,-8, 9, -8, -4,-8, 9, -8)); 1959 int[8] correct = [ -4,-8, 9, -7, -4,-8, 11, -7]; 1960 assert(R.array == correct); 1961 } 1962 1963 /// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed maximum values. 1964 __m256i _mm256_max_epu8 (__m256i a, __m256i b) pure @safe 1965 { 1966 // PERF D_SIMD 1967 version(GNU) 1968 enum bool split = true; 1969 else static if (SIMD_COMPARISON_MASKS_32B) 1970 enum bool split = false; 1971 else 1972 enum bool split = true; 1973 static if (GDC_with_AVX2) 1974 { 1975 return cast(__m256i) __builtin_ia32_pmaxub256(cast(ubyte32)a, cast(ubyte32)b); 1976 } 1977 else static if (split) 1978 { 1979 // split 1980 __m128i a_lo = _mm256_extractf128_si256!0(a); 1981 __m128i a_hi = _mm256_extractf128_si256!1(a); 1982 __m128i b_lo = _mm256_extractf128_si256!0(b); 1983 __m128i b_hi = _mm256_extractf128_si256!1(b); 1984 __m128i r_lo = _mm_max_epu8(a_lo, b_lo); 1985 __m128i r_hi = _mm_max_epu8(a_hi, b_hi); 1986 return _mm256_set_m128i(r_hi, r_lo); 1987 } 1988 else static if (SIMD_COMPARISON_MASKS_32B) 1989 { 1990 // This is real bad with GDC, again 1991 ubyte32 sa = cast(ubyte32)a; 1992 ubyte32 sb = cast(ubyte32)b; 1993 ubyte32 greater = cast(ubyte32)(sa > sb); 1994 return cast(__m256i)( (greater & sa) | (~greater & sb) ); 1995 } 1996 else 1997 static assert(false); 1998 } 1999 unittest 2000 { 2001 byte32 R = cast(byte32) _mm256_max_epu8(_mm256_setr_epi8(45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0), 2002 _mm256_setr_epi8(-4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57)); 2003 byte[32] correct = [-4,-8, -4, -8, 9,-57, 0,-57, -4,-8, -4, -8, 9,-57, 0,-57, -4,-8, -4, -8, 9,-57, 0,-57, -4,-8, -4, -8, 9,-57, 0,-57]; 2004 assert(R.array == correct); 2005 } 2006 2007 // Compare packed signed 16-bit integers in `a` and `b`, and return packed minimum values. 2008 __m256i _mm256_min_epi16 (__m256i a, __m256i b) pure @safe 2009 { 2010 // PERF D_SIMD 2011 version(GNU) 2012 enum bool split = true; 2013 else static if (SIMD_COMPARISON_MASKS_32B) 2014 enum bool split = false; 2015 else 2016 enum bool split = true; 2017 2018 static if (GDC_with_AVX2) 2019 { 2020 return cast(__m256i) __builtin_ia32_pminsw256(cast(short16)a, cast(short16)b); 2021 } 2022 else static if (split) 2023 { 2024 // split 2025 __m128i a_lo = _mm256_extractf128_si256!0(a); 2026 __m128i a_hi = _mm256_extractf128_si256!1(a); 2027 __m128i b_lo = _mm256_extractf128_si256!0(b); 2028 __m128i b_hi = _mm256_extractf128_si256!1(b); 2029 __m128i r_lo = _mm_min_epi16(a_lo, b_lo); 2030 __m128i r_hi = _mm_min_epi16(a_hi, b_hi); 2031 return _mm256_set_m128i(r_hi, r_lo); 2032 } 2033 else static if (SIMD_COMPARISON_MASKS_32B) 2034 { 2035 // same as _mm256_min_epi16, this is catastrophic with GDC -mavx 2036 short16 sa = cast(short16)a; 2037 short16 sb = cast(short16)b; 2038 short16 greater = sa > sb; 2039 return cast(__m256i)( (~greater & sa) | (greater & sb) ); 2040 } 2041 else 2042 static assert(0); 2043 } 2044 unittest 2045 { 2046 short16 R = cast(short16) _mm256_min_epi16(_mm256_setr_epi16(32767, 1, -4, -8, 9, 7, 0,-57, 1, 0, 0, 0, 1, 0, 0, 0), 2047 _mm256_setr_epi16( -4,-8, 9, 7, 0,-32768, 0, 0, 0, 2, 0, 4, 2, 1, 2, -4)); 2048 short[16] correct = [ -4,-8, -4, -8, 0,-32768, 0,-57, 0, 0, 0, 0, 1, 0, 0, -4]; 2049 assert(R.array == correct); 2050 } 2051 2052 /// Compare packed signed 32-bit integers in `a` and `b`, and return packed minimum values. 2053 __m256i _mm256_min_epi32 (__m256i a, __m256i b) pure @safe 2054 { 2055 // PERF D_SIMD 2056 version(GNU) 2057 enum bool split = true; 2058 else static if (SIMD_COMPARISON_MASKS_32B) 2059 enum bool split = false; 2060 else 2061 enum bool split = true; 2062 2063 static if (GDC_with_AVX2) 2064 { 2065 return cast(__m256i) __builtin_ia32_pminsd256(cast(int8)a, cast(int8)b); 2066 } 2067 else static if (split) 2068 { 2069 // split 2070 __m128i a_lo = _mm256_extractf128_si256!0(a); 2071 __m128i a_hi = _mm256_extractf128_si256!1(a); 2072 __m128i b_lo = _mm256_extractf128_si256!0(b); 2073 __m128i b_hi = _mm256_extractf128_si256!1(b); 2074 __m128i r_lo = _mm_min_epi32(a_lo, b_lo); 2075 __m128i r_hi = _mm_min_epi32(a_hi, b_hi); 2076 return _mm256_set_m128i(r_hi, r_lo); 2077 } 2078 else static if (SIMD_COMPARISON_MASKS_32B) 2079 { 2080 // Not checked this one, probably same badness issue with GDC 2081 int8 sa = cast(int8)a; 2082 int8 sb = cast(int8)b; 2083 int8 greater = sa > sb; 2084 return cast(__m256i)( (~greater & sa) | (greater & sb) ); 2085 } 2086 else 2087 static assert(0); 2088 } 2089 unittest 2090 { 2091 int8 R = cast(int8) _mm256_min_epi32(_mm256_setr_epi32(0x7fffffff, 1, -4, 7, 0x7fffffff, 2, -4, 7), 2092 _mm256_setr_epi32( -4,-8, 9, -8,-0x80000000,-8, 9, -8)); 2093 int[8] correct = [ - 4,-8, -4, -8,-0x80000000,-8, -4, -8]; 2094 assert(R.array == correct); 2095 } 2096 2097 2098 /// Compare packed signed 8-bit integers in `a` and `b`, and return packed minimum values. 2099 __m256i _mm256_min_epi8 (__m256i a, __m256i b) pure @trusted 2100 { 2101 // PERF D_SIMD 2102 version(GNU) 2103 enum bool split = true; 2104 else static if (SIMD_COMPARISON_MASKS_32B) 2105 enum bool split = false; 2106 else 2107 enum bool split = true; 2108 static if (GDC_with_AVX2) 2109 { 2110 // Strangely, GDC asks for unsigned ubyte32 2111 return cast(__m256i) __builtin_ia32_pminsb256(cast(ubyte32)a, cast(ubyte32)b); 2112 } 2113 else static if (split) 2114 { 2115 // split 2116 __m128i a_lo = _mm256_extractf128_si256!0(a); 2117 __m128i a_hi = _mm256_extractf128_si256!1(a); 2118 __m128i b_lo = _mm256_extractf128_si256!0(b); 2119 __m128i b_hi = _mm256_extractf128_si256!1(b); 2120 __m128i r_lo = _mm_min_epi8(a_lo, b_lo); 2121 __m128i r_hi = _mm_min_epi8(a_hi, b_hi); 2122 return _mm256_set_m128i(r_hi, r_lo); 2123 } 2124 else static if (SIMD_COMPARISON_MASKS_32B) 2125 { 2126 // This is real bad with GDC, again 2127 byte32 sa = cast(byte32)a; 2128 byte32 sb = cast(byte32)b; 2129 byte32 greater = cast(byte32)(sa > sb); 2130 return cast(__m256i)( (~greater & sa) | (greater & sb) ); 2131 } 2132 else 2133 static assert(false); 2134 } 2135 unittest 2136 { 2137 __m256i A = _mm256_setr_epi8(127, 1, -4, -8, 9, 7, 0, -57, 0, 0, 0, 0, 0, 0, 0, 0, 127, 1, -4, -8, 9, 7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0); 2138 __m256i B = _mm256_setr_epi8( 4, -8, 9, -7, 0, -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, -8, 9, -7, 0, -128, 0, 0, 0, 0, 0, 0, 0, -4, 0, 0); 2139 byte32 R = cast(byte32) _mm256_min_epi8(A, B); 2140 byte[32] correct = [ 4, -8, -4, -8, 0, -128, 0, -57, 0, 0, 0, 0, 0, 0, 0, 0, 4, -8, -4, -8, 0, -128, 0, 0, 0, 0, 0, 0, 0, -4, 0, 0]; 2141 assert(R.array == correct); 2142 } 2143 2144 /// Compare packed unsigned 16-bit integers in `a` and `b`, and return packed minimum values. 2145 __m256i _mm256_min_epu16 (__m256i a, __m256i b) pure @trusted 2146 { 2147 // PERF D_SIMD 2148 version(GNU) 2149 enum bool split = true; 2150 else static if (SIMD_COMPARISON_MASKS_32B) 2151 enum bool split = false; 2152 else 2153 enum bool split = true; 2154 2155 static if (GDC_with_AVX2) 2156 { 2157 return cast(__m256i) __builtin_ia32_pminuw256(cast(short16)a, cast(short16)b); 2158 } 2159 else static if (split) 2160 { 2161 // split 2162 __m128i a_lo = _mm256_extractf128_si256!0(a); 2163 __m128i a_hi = _mm256_extractf128_si256!1(a); 2164 __m128i b_lo = _mm256_extractf128_si256!0(b); 2165 __m128i b_hi = _mm256_extractf128_si256!1(b); 2166 __m128i r_lo = _mm_min_epu16(a_lo, b_lo); 2167 __m128i r_hi = _mm_min_epu16(a_hi, b_hi); 2168 return _mm256_set_m128i(r_hi, r_lo); 2169 } 2170 else static if (SIMD_COMPARISON_MASKS_32B) 2171 { 2172 // catastrophic with GDC x86_64 2173 short16 sa = cast(short16)a; 2174 short16 sb = cast(short16)b; 2175 short16 greater = cast(short16)(cast(ushort16)sa > cast(ushort16)sb); 2176 return cast(__m256i)( (~greater & sa) | (greater & sb) ); 2177 } 2178 else 2179 static assert(false); 2180 } 2181 unittest 2182 { 2183 short16 R = cast(short16) _mm256_min_epu16(_mm256_setr_epi16(32767, 1, -4, -8, 9, 7, 0,-57, 1, 0, 0, 0, 1, 0, 0, -6), 2184 _mm256_setr_epi16( -4, -8, 9, 7, 0,-32768, 0, 0, 0, 2, 0, 4, 2, 1, 2, -4)); 2185 short[16] correct = [32767, 1, 9, 7, 0, 7, 0, 0, 0, 0, 0, 0, 1, 0, 0, -6]; 2186 assert(R.array == correct); 2187 } 2188 2189 /// Compare packed unsigned 32-bit integers in `a` and `b`, and return packed minimum values. 2190 __m256i _mm256_min_epu32 (__m256i a, __m256i b) pure @safe 2191 { 2192 // PERF D_SIMD 2193 version(GNU) 2194 enum bool split = true; 2195 else static if (SIMD_COMPARISON_MASKS_32B) 2196 enum bool split = false; 2197 else 2198 enum bool split = true; 2199 2200 static if (GDC_with_AVX2) 2201 { 2202 return cast(__m256i) __builtin_ia32_pminud256(cast(int8)a, cast(int8)b); 2203 } 2204 else static if (split) 2205 { 2206 // split 2207 __m128i a_lo = _mm256_extractf128_si256!0(a); 2208 __m128i a_hi = _mm256_extractf128_si256!1(a); 2209 __m128i b_lo = _mm256_extractf128_si256!0(b); 2210 __m128i b_hi = _mm256_extractf128_si256!1(b); 2211 __m128i r_lo = _mm_min_epu32(a_lo, b_lo); 2212 __m128i r_hi = _mm_min_epu32(a_hi, b_hi); 2213 return _mm256_set_m128i(r_hi, r_lo); 2214 } 2215 else static if (SIMD_COMPARISON_MASKS_32B) 2216 { 2217 // catastrophic with GDC, so in this case split instead 2218 uint8 sa = cast(uint8)a; 2219 uint8 sb = cast(uint8)b; 2220 uint8 greater = sa > sb; 2221 return cast(__m256i)( (greater & sb) | (~greater & sa) ); 2222 } 2223 else 2224 static assert(0); 2225 } 2226 unittest 2227 { 2228 int8 R = cast(int8) _mm256_min_epu32(_mm256_setr_epi32(0x7fffffff, 1, 4, -7, 0x7fffffff, 1, 11, -7), 2229 _mm256_setr_epi32( -4,-8, 9, -8, -4,-8, 9, -8)); 2230 int[8] correct = [0x7fffffff, 1, 4, -8, 0x7fffffff, 1, 9, -8]; 2231 assert(R.array == correct); 2232 } 2233 2234 /// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed minimum values. 2235 __m256i _mm256_min_epu8 (__m256i a, __m256i b) pure @safe 2236 { 2237 // PERF D_SIMD 2238 version(GNU) 2239 enum bool split = true; 2240 else static if (SIMD_COMPARISON_MASKS_32B) 2241 enum bool split = false; 2242 else 2243 enum bool split = true; 2244 static if (GDC_with_AVX2) 2245 { 2246 return cast(__m256i) __builtin_ia32_pminub256(cast(ubyte32)a, cast(ubyte32)b); 2247 } 2248 else static if (split) 2249 { 2250 // split 2251 __m128i a_lo = _mm256_extractf128_si256!0(a); 2252 __m128i a_hi = _mm256_extractf128_si256!1(a); 2253 __m128i b_lo = _mm256_extractf128_si256!0(b); 2254 __m128i b_hi = _mm256_extractf128_si256!1(b); 2255 __m128i r_lo = _mm_min_epu8(a_lo, b_lo); 2256 __m128i r_hi = _mm_min_epu8(a_hi, b_hi); 2257 return _mm256_set_m128i(r_hi, r_lo); 2258 } 2259 else static if (SIMD_COMPARISON_MASKS_32B) 2260 { 2261 ubyte32 sa = cast(ubyte32)a; 2262 ubyte32 sb = cast(ubyte32)b; 2263 ubyte32 greater = cast(ubyte32)(sa > sb); 2264 return cast(__m256i)( (~greater & sa) | (greater & sb) ); 2265 } 2266 else 2267 static assert(false); 2268 } 2269 unittest 2270 { 2271 byte32 R = cast(byte32) _mm256_min_epu8(_mm256_setr_epi8(45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0), 2272 _mm256_setr_epi8(-4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57)); 2273 byte[32] correct = [45, 1, 9, 7, 0, 7, 0, 0, 45, 1, 9, 7, 0, 7, 0, 0, 45, 1, 9, 7, 0, 7, 0, 0, 45, 1, 9, 7, 0, 7, 0, 0]; 2274 assert(R.array == correct); 2275 } 2276 2277 2278 // TODO int _mm256_movemask_epi8 (__m256i a) pure @safe 2279 // TODO __m256i _mm256_mpsadbw_epu8 (__m256i a, __m256i b, const int imm8) pure @safe 2280 2281 /// Multiply the low signed 32-bit integers from each packed 64-bit element in `a` and `b`, and 2282 /// return the signed 64-bit results. 2283 __m256i _mm256_mul_epi32 (__m256i a, __m256i b) pure @trusted 2284 { 2285 // PERF LDC + SSE2 to SSSE3. I don't quite see what to do, same problem in _mm_mul_epi32. 2286 static if (GDC_with_AVX2) 2287 { 2288 return cast(__m256i) __builtin_ia32_pmuldq256(cast(int8)a, cast(int8)b); 2289 } 2290 else static if ( (LDC_with_SSE41 || LDC_with_AVX2) && LDC_with_optimizations) 2291 { 2292 // good with LDC + SSE4.1 to AVX2, else need to split 2293 enum ir = ` 2294 %ia = shufflevector <8 x i32> %0,<8 x i32> %0, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 2295 %ib = shufflevector <8 x i32> %1,<8 x i32> %1, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 2296 %la = sext <4 x i32> %ia to <4 x i64> 2297 %lb = sext <4 x i32> %ib to <4 x i64> 2298 %r = mul <4 x i64> %la, %lb 2299 ret <4 x i64> %r`; 2300 return cast(__m256i) LDCInlineIR!(ir, long4, int8, int8)(cast(int8)a, cast(int8)b); 2301 } 2302 else 2303 { 2304 // split, very beneficial with LDC+ARM64 2305 __m128i a_lo = _mm256_extractf128_si256!0(a); 2306 __m128i a_hi = _mm256_extractf128_si256!1(a); 2307 __m128i b_lo = _mm256_extractf128_si256!0(b); 2308 __m128i b_hi = _mm256_extractf128_si256!1(b); 2309 __m128i r_lo = _mm_mul_epi32(a_lo, b_lo); 2310 __m128i r_hi = _mm_mul_epi32(a_hi, b_hi); 2311 return _mm256_set_m128i(r_hi, r_lo); 2312 } 2313 } 2314 unittest 2315 { 2316 __m256i A = _mm256_setr_epi32(61616461, 1915324654, 4564061, 3, 61616466, 1915324654, 4564061, 3); 2317 __m256i B = _mm256_setr_epi32(49716422, -915616216, -121144, 0, 49716422, -915616216, -121145, 0); 2318 long4 R = cast(long4) _mm256_mul_epi32(A, B); 2319 long[4] correct = [cast(long)61616461 * 49716422, cast(long)4564061 * -121144, cast(long)61616466 * 49716422, cast(long)4564061 * -121145]; 2320 assert(R.array == correct); 2321 } 2322 2323 /// Multiply the low unsigned 32-bit integers from each packed 64-bit element in `a` and `b`, and 2324 /// return the unsigned 64-bit results. 2325 __m256i _mm256_mul_epu32 (__m256i a, __m256i b) pure @trusted 2326 { 2327 // PERF DMD 2328 static if (GDC_with_AVX2) 2329 { 2330 return cast(__m256i) __builtin_ia32_pmuludq256(cast(int8)a, cast(int8)b); 2331 } 2332 else version(GNU) 2333 { 2334 // explicit split needed for GDC without avx2 2335 __m128i a_lo = _mm256_extractf128_si256!0(a); 2336 __m128i a_hi = _mm256_extractf128_si256!1(a); 2337 __m128i b_lo = _mm256_extractf128_si256!0(b); 2338 __m128i b_hi = _mm256_extractf128_si256!1(b); 2339 __m128i r_lo = _mm_mul_epu32(a_lo, b_lo); 2340 __m128i r_hi = _mm_mul_epu32(a_hi, b_hi); 2341 return _mm256_set_m128i(r_hi, r_lo); 2342 } 2343 else 2344 { 2345 // Works well in all LDC cases, surprisingly. 2346 int8 ia = cast(int8)a; 2347 int8 ib = cast(int8)b; 2348 long4 r; 2349 r.ptr[0] = cast(long)cast(uint)ia.array[0] * cast(long)cast(uint)ib.array[0]; 2350 r.ptr[1] = cast(long)cast(uint)ia.array[2] * cast(long)cast(uint)ib.array[2]; 2351 r.ptr[2] = cast(long)cast(uint)ia.array[4] * cast(long)cast(uint)ib.array[4]; 2352 r.ptr[3] = cast(long)cast(uint)ia.array[6] * cast(long)cast(uint)ib.array[6]; 2353 return cast(__m256i)r; 2354 } 2355 } 2356 unittest 2357 { 2358 __m256i A = _mm256_set_epi32(42, 0xDEADBEEF, 42, 0xffffffff, 42, 0xDEADBEEF, 42, 0xffffffff); 2359 __m256i B = _mm256_set_epi32(42, 0xCAFEBABE, 42, 0xffffffff, 42, 0xCAFEBABE, 42, 0xffffffff); 2360 __m256i C = _mm256_mul_epu32(A, B); 2361 long4 LC = cast(long4)C; 2362 long[4] correct = [18446744065119617025uL, 12723420444339690338uL, 18446744065119617025uL, 12723420444339690338uL]; 2363 assert(LC.array == correct); 2364 } 2365 2366 // TODO __m256i _mm256_mulhi_epi16 (__m256i a, __m256i b) pure @safe 2367 // TODO __m256i _mm256_mulhi_epu16 (__m256i a, __m256i b) pure @safe 2368 // TODO __m256i _mm256_mulhrs_epi16 (__m256i a, __m256i b) pure @safe 2369 // TODO __m256i _mm256_mullo_epi16 (__m256i a, __m256i b) pure @safe 2370 // TODO __m256i _mm256_mullo_epi32 (__m256i a, __m256i b) pure @safe 2371 2372 /// Compute the bitwise OR of 256 bits (representing integer data) in `a` and `b`. 2373 __m256i _mm256_or_si256 (__m256i a, __m256i b) pure @safe 2374 { 2375 return a | b; 2376 } 2377 unittest 2378 { 2379 long A = 0x55555555_55555555; 2380 long B = 0xAAAAAAAA_AAAAAAAA; 2381 __m256i vA = _mm256_set_epi64(A, B, A, B); 2382 __m256i vB = _mm256_set_epi64(B, A, 0, B); 2383 __m256i R = _mm256_or_si256(vA, vB); 2384 long[4] correct = [B, A, -1, -1]; 2385 assert(R.array == correct); 2386 } 2387 2388 /// Convert packed signed 16-bit integers from `a` and `b `to packed 8-bit integers using signed saturation. 2389 /// Warning: `a` and `b` are interleaved per-lane. 2390 /// Result has: `a` lane 0, `b` lane 0, `a` lane 1, `b` lane 1. 2391 __m256i _mm256_packs_epi16 (__m256i a, __m256i b) pure @safe 2392 { 2393 // PERF D_SIMD 2394 static if (GDC_with_AVX2) 2395 { 2396 return cast(__m256i) __builtin_ia32_packsswb256(cast(short16)a, cast(short16)b); 2397 } 2398 else static if (LDC_with_AVX2) 2399 { 2400 return cast(__m256i) __builtin_ia32_packsswb256(cast(short16)a, cast(short16)b); 2401 } 2402 else 2403 { 2404 __m128i a_lo = _mm256_extractf128_si256!0(a); 2405 __m128i a_hi = _mm256_extractf128_si256!1(a); 2406 __m128i b_lo = _mm256_extractf128_si256!0(b); 2407 __m128i b_hi = _mm256_extractf128_si256!1(b); 2408 __m128i r_lo = _mm_packs_epi16(a_lo, b_lo); 2409 __m128i r_hi = _mm_packs_epi16(a_hi, b_hi); 2410 return _mm256_set_m128i(r_hi, r_lo); 2411 } 2412 } 2413 unittest 2414 { 2415 __m256i A = _mm256_setr_epi16(1000, -1000, 1000, 0, 256, -129, 254, 0, 2416 -1000, -1000, 1000, 0, 256, -129, 254, 0); 2417 byte32 R = cast(byte32) _mm256_packs_epi16(A, A); 2418 byte[32] correct = [127, -128, 127, 0, 127, -128, 127, 0, 2419 127, -128, 127, 0, 127, -128, 127, 0, 2420 -128, -128, 127, 0, 127, -128, 127, 0, 2421 -128, -128, 127, 0, 127, -128, 127, 0]; 2422 assert(R.array == correct); 2423 } 2424 2425 /// Convert packed signed 32-bit integers from `a` and `b `to packed 16-bit integers using signed saturation. 2426 /// Warning: `a` and `b` are interleaved per-lane. 2427 /// Result has: `a` lane 0, `b` lane 0, `a` lane 1, `b` lane 1. 2428 __m256i _mm256_packs_epi32 (__m256i a, __m256i b) pure @safe 2429 { 2430 // PERF D_SIMD 2431 static if (GDC_with_AVX2) 2432 { 2433 return cast(__m256i) __builtin_ia32_packssdw256(cast(int8)a, cast(int8)b); 2434 } 2435 else static if (LDC_with_AVX2) 2436 { 2437 return cast(__m256i) __builtin_ia32_packssdw256(cast(int8)a, cast(int8)b); 2438 } 2439 else 2440 { 2441 __m128i a_lo = _mm256_extractf128_si256!0(a); 2442 __m128i a_hi = _mm256_extractf128_si256!1(a); 2443 __m128i b_lo = _mm256_extractf128_si256!0(b); 2444 __m128i b_hi = _mm256_extractf128_si256!1(b); 2445 __m128i r_lo = _mm_packs_epi32(a_lo, b_lo); 2446 __m128i r_hi = _mm_packs_epi32(a_hi, b_hi); 2447 return _mm256_set_m128i(r_hi, r_lo); 2448 } 2449 } 2450 unittest 2451 { 2452 __m256i A = _mm256_setr_epi32(100000, -100000, 1000, 0, 4, 5, -100000, 7); 2453 short16 R = cast(short16) _mm256_packs_epi32(A, A); 2454 short[16] correct = [32767, -32768, 1000, 0, 32767, -32768, 1000, 0, 4, 5, -32768, 7, 4, 5, -32768, 7]; 2455 assert(R.array == correct); 2456 } 2457 2458 2459 /// Convert packed signed 16-bit integers from `a` and `b `to packed 8-bit integers using unsigned saturation. 2460 /// Warning: `a` and `b` are interleaved per-lane. 2461 /// Result has: `a` lane 0, `b` lane 0, `a` lane 1, `b` lane 1. 2462 __m256i _mm256_packus_epi16 (__m256i a, __m256i b) pure @trusted 2463 { 2464 // PERF D_SIMD 2465 static if (GDC_with_AVX2) 2466 { 2467 return cast(__m256i) __builtin_ia32_packuswb256(cast(short16)a, cast(short16)b); 2468 } 2469 else static if (LDC_with_AVX2) 2470 { 2471 return cast(__m256i) __builtin_ia32_packuswb256(cast(short16)a, cast(short16)b); 2472 } 2473 else 2474 { 2475 // Always beneficial with LDC. 2476 // arm64: 4 inst with LDC -O1 2477 __m128i a_lo = _mm256_extractf128_si256!0(a); 2478 __m128i a_hi = _mm256_extractf128_si256!1(a); 2479 __m128i b_lo = _mm256_extractf128_si256!0(b); 2480 __m128i b_hi = _mm256_extractf128_si256!1(b); 2481 __m128i r_lo = _mm_packus_epi16(a_lo, b_lo); 2482 __m128i r_hi = _mm_packus_epi16(a_hi, b_hi); 2483 return _mm256_set_m128i(r_hi, r_lo); 2484 } 2485 } 2486 unittest 2487 { 2488 __m256i A = _mm256_setr_epi16(-10, 400, 0, 256, 255, 2, 1, 0, -10, 400, 0, 256, -32768, 2, 1, 0); 2489 __m256i B = _mm256_setr_epi16( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 2490 byte32 R = cast(byte32) _mm256_packus_epi16(A, B); 2491 align(32) static immutable byte[32] correctResult = [0, -1, 0, -1, -1, 2, 1, 0, 0, 1, 2, 3, 4, 5, 6, 7, 2492 0, -1, 0, -1, 0 , 2, 1, 0, 8, 9, 10, 11, 12, 13, 14, 15]; 2493 assert(R.array == correctResult); 2494 } 2495 2496 /// Convert packed signed 32-bit integers from `a` and `b `to packed 16-bit integers using unsigned saturation. 2497 /// Warning: `a` and `b` are interleaved per-lane. 2498 /// Result has: `a` lane 0, `b` lane 0, `a` lane 1, `b` lane 1. 2499 __m256i _mm256_packus_epi32 (__m256i a, __m256i b) pure @safe 2500 { 2501 // PERF D_SIMD 2502 static if (GDC_with_AVX2) 2503 { 2504 return cast(__m256i) __builtin_ia32_packusdw256(cast(int8)a, cast(int8)b); 2505 } 2506 else static if (LDC_with_AVX2) 2507 { 2508 return cast(__m256i) __builtin_ia32_packusdw256(cast(int8)a, cast(int8)b); 2509 } 2510 else 2511 { 2512 // 8 inst in arm64 since LDC 1.22 -O2, 2513 // sounds a bit underperforming maybe 2514 __m128i a_lo = _mm256_extractf128_si256!0(a); 2515 __m128i a_hi = _mm256_extractf128_si256!1(a); 2516 __m128i b_lo = _mm256_extractf128_si256!0(b); 2517 __m128i b_hi = _mm256_extractf128_si256!1(b); 2518 __m128i r_lo = _mm_packus_epi32(a_lo, b_lo); 2519 __m128i r_hi = _mm_packus_epi32(a_hi, b_hi); 2520 return _mm256_set_m128i(r_hi, r_lo); 2521 } 2522 } 2523 unittest 2524 { 2525 __m256i A = _mm256_setr_epi32(100000, -100000, 1000, 0, 100000, -100000, 1000, 1); 2526 short16 R = cast(short16) _mm256_packus_epi32(A, A); 2527 short[16] correct = [cast(short)65535, 0, 1000, 0, cast(short)65535, 0, 1000, 0, 2528 cast(short)65535, 0, 1000, 1, cast(short)65535, 0, 1000, 1]; 2529 assert(R.array == correct); 2530 } 2531 2532 2533 2534 // TODO __m256i _mm256_permute2x128_si256 (__m256i a, __m256i b, const int imm8) pure @safe 2535 // TODO __m256i _mm256_permute4x64_epi64 (__m256i a, const int imm8) pure @safe 2536 // TODO __m256d _mm256_permute4x64_pd (__m256d a, const int imm8) pure @safe 2537 // TODO __m256i _mm256_permutevar8x32_epi32 (__m256i a, __m256i idx) pure @safe 2538 // TODO __m256 _mm256_permutevar8x32_ps (__m256 a, __m256i idx) pure @safe 2539 2540 /// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`, then horizontally sum each 2541 /// consecutive 8 differences to produce two unsigned 16-bit integers, and pack these unsigned 16-bit integers in the 2542 /// low 16 bits of 64-bit elements in result. 2543 __m256i _mm256_sad_epu8 (__m256i a, __m256i b) pure @trusted 2544 { 2545 static if (GDC_with_AVX2) 2546 { 2547 return cast(__m256i) __builtin_ia32_psadbw256(cast(ubyte32)a, cast(ubyte32)b); 2548 } 2549 else static if (LDC_with_AVX2) 2550 { 2551 return cast(__m256i) __builtin_ia32_psadbw256(cast(byte32)a, cast(byte32)b); 2552 } 2553 else 2554 { 2555 // split is beneficial for ARM64, LDC and GDC without AVX2 2556 __m128i a_lo = _mm256_extractf128_si256!0(a); 2557 __m128i a_hi = _mm256_extractf128_si256!1(a); 2558 __m128i b_lo = _mm256_extractf128_si256!0(b); 2559 __m128i b_hi = _mm256_extractf128_si256!1(b); 2560 __m128i r_lo = _mm_sad_epu8(a_lo, b_lo); 2561 __m128i r_hi = _mm_sad_epu8(a_hi, b_hi); 2562 return _mm256_set_m128i(r_hi, r_lo); 2563 } 2564 } 2565 unittest 2566 { 2567 __m256i A = _mm256_setr_epi8(3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54, 2568 3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54); // primes + 1 2569 __m256i B = _mm256_set1_epi8(1); 2570 int8 R = cast(int8) _mm256_sad_epu8(A, B); 2571 int[8] correct = [2 + 3 + 5 + 7 + 11 + 13 + 17 + 19, 2572 0, 2573 23 + 29 + 31 + 37 + 41 + 43 + 47 + 53, 2574 0, 2575 2 + 3 + 5 + 7 + 11 + 13 + 17 + 19, 2576 0, 2577 23 + 29 + 31 + 37 + 41 + 43 + 47 + 53, 2578 0]; 2579 assert(R.array == correct); 2580 } 2581 2582 2583 // TODO __m256i _mm256_shuffle_epi32 (__m256i a, const int imm8) pure @safe 2584 // TODO __m256i _mm256_shuffle_epi8 (__m256i a, __m256i b) pure @safe 2585 // TODO __m256i _mm256_shufflehi_epi16 (__m256i a, const int imm8) pure @safe 2586 // TODO __m256i _mm256_shufflelo_epi16 (__m256i a, const int imm8) pure @safe 2587 // TODO __m256i _mm256_sign_epi16 (__m256i a, __m256i b) pure @safe 2588 // TODO __m256i _mm256_sign_epi32 (__m256i a, __m256i b) pure @safe 2589 // TODO __m256i _mm256_sign_epi8 (__m256i a, __m256i b) pure @safe 2590 // TODO __m256i _mm256_sll_epi16 (__m256i a, __m128i count) pure @safe 2591 // TODO __m256i _mm256_sll_epi32 (__m256i a, __m128i count) pure @safe 2592 // TODO __m256i _mm256_sll_epi64 (__m256i a, __m128i count) pure @safe 2593 2594 /// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros. 2595 __m256i _mm256_slli_epi16(__m256i a, int imm8) pure @safe 2596 { 2597 static if (GDC_or_LDC_with_AVX2) 2598 { 2599 return cast(__m256i) __builtin_ia32_psllwi256(cast(short16)a, cast(ubyte)imm8); 2600 } 2601 else // split 2602 { 2603 __m128i a_lo = _mm256_extractf128_si256!0(a); 2604 __m128i a_hi = _mm256_extractf128_si256!1(a); 2605 __m128i r_lo = _mm_slli_epi16(a_lo, imm8); 2606 __m128i r_hi = _mm_slli_epi16(a_hi, imm8); 2607 return _mm256_set_m128i(r_hi, r_lo); 2608 } 2609 } 2610 unittest 2611 { 2612 __m256i A = _mm256_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7, 0, 1, 2, 3, -4, -5, 6, 7); 2613 short16 B = cast(short16)( _mm256_slli_epi16(A, 1) ); 2614 short16 B2 = cast(short16)( _mm256_slli_epi16(A, 1 + 256) ); 2615 short[16] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14, 0, 2, 4, 6, -8, -10, 12, 14 ]; 2616 assert(B.array == expectedB); 2617 assert(B2.array == expectedB); 2618 2619 short16 C = cast(short16)( _mm256_slli_epi16(A, 16) ); 2620 short[16] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ]; 2621 assert(C.array == expectedC); 2622 } 2623 2624 /// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros. 2625 __m256i _mm256_slli_epi32 (__m256i a, int imm8) pure @safe 2626 { 2627 static if (GDC_or_LDC_with_AVX2) 2628 { 2629 return cast(__m256i) __builtin_ia32_pslldi256(cast(int8)a, cast(ubyte)imm8); 2630 } 2631 else 2632 { 2633 __m128i a_lo = _mm256_extractf128_si256!0(a); 2634 __m128i a_hi = _mm256_extractf128_si256!1(a); 2635 __m128i r_lo = _mm_slli_epi32(a_lo, imm8); 2636 __m128i r_hi = _mm_slli_epi32(a_hi, imm8); 2637 return _mm256_set_m128i(r_hi, r_lo); 2638 } 2639 } 2640 unittest 2641 { 2642 __m256i A = _mm256_setr_epi32(0, 2, 3, -4, 0, 2, 3, -9); 2643 int8 B = cast(int8) _mm256_slli_epi32(A, 1); 2644 int8 B2 = cast(int8) _mm256_slli_epi32(A, 1 + 256); 2645 int[8] expectedB = [ 0, 4, 6, -8, 0, 4, 6, -18 ]; 2646 assert(B.array == expectedB); 2647 assert(B2.array == expectedB); 2648 2649 int8 C = cast(int8) _mm256_slli_epi32(A, 0); 2650 int[8] expectedC = [ 0, 2, 3, -4, 0, 2, 3, -9 ]; 2651 assert(C.array == expectedC); 2652 2653 int8 D = cast(int8) _mm256_slli_epi32(A, 65); 2654 int[8] expectedD = [ 0, 0, 0, 0, 0, 0, 0, 0 ]; 2655 assert(D.array == expectedD); 2656 } 2657 2658 /// Shift packed 64-bit integers in `a` left by `imm8` while shifting in zeros. 2659 __m256i _mm256_slli_epi64 (__m256i a, int imm8) pure @safe 2660 { 2661 static if (GDC_or_LDC_with_AVX2) 2662 { 2663 return cast(__m256i) __builtin_ia32_psllqi256(cast(long4)a, cast(ubyte)imm8); 2664 } 2665 else 2666 { 2667 __m128i a_lo = _mm256_extractf128_si256!0(a); 2668 __m128i a_hi = _mm256_extractf128_si256!1(a); 2669 __m128i r_lo = _mm_slli_epi64(a_lo, imm8); 2670 __m128i r_hi = _mm_slli_epi64(a_hi, imm8); 2671 return _mm256_set_m128i(r_hi, r_lo); 2672 } 2673 } 2674 unittest 2675 { 2676 __m256i A = _mm256_setr_epi64(23, -4, 1, long.max); 2677 long4 B = cast(long4) _mm256_slli_epi64(A, 1); 2678 long4 B2 = cast(long4) _mm256_slli_epi64(A, 1 + 256); 2679 2680 long[4] expectedB = [ 46, -8, 2, -2]; 2681 assert(B.array == expectedB); 2682 assert(B2.array == expectedB); 2683 2684 long4 C = cast(long4) _mm256_slli_epi64(A, 0); 2685 long[4] expectedC = [ 23, -4, 1, long.max ]; 2686 assert(C.array == expectedC); 2687 2688 long4 D = cast(long4) _mm256_slli_epi64(A, 65); 2689 long[4] expectedD = [ 0, 0, 0, 0 ]; 2690 assert(D.array == expectedD); 2691 } 2692 2693 // TODO __m256i _mm256_slli_si256 (__m256i a, const int imm8) pure @safe 2694 // TODO __m128i _mm_sllv_epi32 (__m128i a, __m128i count) pure @safe 2695 // TODO __m256i _mm256_sllv_epi32 (__m256i a, __m256i count) pure @safe 2696 // TODO __m128i _mm_sllv_epi64 (__m128i a, __m128i count) pure @safe 2697 // TODO __m256i _mm256_sllv_epi64 (__m256i a, __m256i count) pure @safe 2698 // TODO __m256i _mm256_sra_epi16 (__m256i a, __m128i count) pure @safe 2699 // TODO __m256i _mm256_sra_epi32 (__m256i a, __m128i count) pure @safe 2700 2701 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits. 2702 __m256i _mm256_srai_epi16 (__m256i a, int imm8) pure @safe 2703 { 2704 static if (GDC_or_LDC_with_AVX2) 2705 { 2706 return cast(__m256i) __builtin_ia32_psrawi256(cast(short16)a, cast(ubyte)imm8); 2707 } 2708 else 2709 { 2710 // split 2711 __m128i a_lo = _mm256_extractf128_si256!0(a); 2712 __m128i a_hi = _mm256_extractf128_si256!1(a); 2713 __m128i r_lo = _mm_srai_epi16(a_lo, imm8); 2714 __m128i r_hi = _mm_srai_epi16(a_hi, imm8); 2715 return _mm256_set_m128i(r_hi, r_lo); 2716 } 2717 } 2718 unittest 2719 { 2720 __m256i A = _mm256_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7, short.min, short.max, 2, 3, -4, -5, 6, 7); 2721 short16 B = cast(short16)( _mm256_srai_epi16(A, 1) ); 2722 short16 B2 = cast(short16)( _mm256_srai_epi16(A, 1 + 256) ); 2723 short[16] expectedB = [ 0, 0, 1, 1, -2, -3, 3, 3, -16384, 16383, 1, 1, -2, -3, 3, 3 ]; 2724 assert(B.array == expectedB); 2725 assert(B2.array == expectedB); 2726 2727 short16 C = cast(short16)( _mm256_srai_epi16(A, 18) ); 2728 short[16] expectedC = [ 0, 0, 0, 0, -1, -1, 0, 0, 2729 -1, 0, 0, 0, -1, -1, 0, 0 ]; 2730 assert(C.array == expectedC); 2731 } 2732 2733 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits. 2734 __m256i _mm256_srai_epi32 (__m256i a, int imm8) pure @safe 2735 { 2736 static if (GDC_or_LDC_with_AVX2) 2737 { 2738 return cast(__m256i) __builtin_ia32_psradi256(cast(int8)a, cast(ubyte)imm8); 2739 } 2740 else // split 2741 { 2742 __m128i a_lo = _mm256_extractf128_si256!0(a); 2743 __m128i a_hi = _mm256_extractf128_si256!1(a); 2744 __m128i r_lo = _mm_srai_epi32(a_lo, imm8); 2745 __m128i r_hi = _mm_srai_epi32(a_hi, imm8); 2746 return _mm256_set_m128i(r_hi, r_lo); 2747 } 2748 } 2749 unittest 2750 { 2751 __m256i A = _mm256_setr_epi32(0, 2, 3, -4, 0, 2, 3, -4); 2752 int8 B = cast(int8) _mm256_srai_epi32(A, 1); 2753 int8 B2 = cast(int8) _mm256_srai_epi32(A, 1 + 256); 2754 int[8] expectedB = [ 0, 1, 1, -2, 0, 1, 1, -2]; 2755 assert(B.array == expectedB); 2756 assert(B2.array == expectedB); 2757 2758 int8 C = cast(int8) _mm256_srai_epi32(A, 32); 2759 int[8] expectedC = [ 0, 0, 0, -1, 0, 0, 0, -1]; 2760 assert(C.array == expectedC); 2761 2762 int8 D = cast(int8) _mm256_srai_epi32(A, 0); 2763 int[8] expectedD = [ 0, 2, 3, -4, 0, 2, 3, -4]; 2764 assert(D.array == expectedD); 2765 } 2766 2767 // TODO __m128i _mm_srav_epi32 (__m128i a, __m128i count) pure @safe 2768 // TODO __m256i _mm256_srav_epi32 (__m256i a, __m256i count) pure @safe 2769 // TODO __m256i _mm256_srl_epi16 (__m256i a, __m128i count) pure @safe 2770 // TODO __m256i _mm256_srl_epi32 (__m256i a, __m128i count) pure @safe 2771 // TODO __m256i _mm256_srl_epi64 (__m256i a, __m128i count) pure @safe 2772 2773 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros. 2774 __m256i _mm256_srli_epi16 (__m256i a, int imm8) pure @trusted 2775 { 2776 static if (GDC_with_AVX2) 2777 { 2778 return cast(__m256i) __builtin_ia32_psrlwi256(cast(short16)a, cast(ubyte)imm8); 2779 } 2780 else static if (LDC_with_AVX2) 2781 { 2782 return cast(__m256i) __builtin_ia32_psrlwi256(cast(short16)a, cast(ubyte)imm8); 2783 } 2784 else 2785 { 2786 __m128i a_lo = _mm256_extractf128_si256!0(a); 2787 __m128i a_hi = _mm256_extractf128_si256!1(a); 2788 __m128i r_lo = _mm_srli_epi16(a_lo, imm8); 2789 __m128i r_hi = _mm_srli_epi16(a_hi, imm8); 2790 return _mm256_set_m128i(r_hi, r_lo); 2791 } 2792 } 2793 unittest 2794 { 2795 __m256i A = _mm256_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7, 0, 1, 2, 3, -4, -5, 6, 7); 2796 short16 B = cast(short16) _mm256_srli_epi16(A, 1); 2797 short16 B2 = cast(short16) _mm256_srli_epi16(A, 1 + 256); 2798 short[16] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3, 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ]; 2799 assert(B.array == expectedB); 2800 assert(B2.array == expectedB); 2801 2802 short16 C = cast(short16) _mm256_srli_epi16(A, 16); 2803 short[16] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ]; 2804 assert(C.array == expectedC); 2805 2806 short16 D = cast(short16) _mm256_srli_epi16(A, 0); 2807 short[16] expectedD = [ 0, 1, 2, 3, -4, -5, 6, 7, 0, 1, 2, 3, -4, -5, 6, 7 ]; 2808 assert(D.array == expectedD); 2809 } 2810 2811 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros. 2812 __m256i _mm256_srli_epi32 (__m256i a, int imm8) pure @trusted 2813 { 2814 static if (GDC_with_AVX2) 2815 { 2816 return cast(__m256i) __builtin_ia32_psrldi256(cast(int8)a, cast(ubyte)imm8); 2817 } 2818 else static if (LDC_with_AVX2) 2819 { 2820 return cast(__m256i) __builtin_ia32_psrldi256(cast(int8)a, cast(ubyte)imm8); 2821 } 2822 else 2823 { 2824 // split 2825 __m128i a_lo = _mm256_extractf128_si256!0(a); 2826 __m128i a_hi = _mm256_extractf128_si256!1(a); 2827 __m128i r_lo = _mm_srli_epi32(a_lo, imm8); 2828 __m128i r_hi = _mm_srli_epi32(a_hi, imm8); 2829 return _mm256_set_m128i(r_hi, r_lo); 2830 } 2831 } 2832 unittest 2833 { 2834 __m256i A = _mm256_setr_epi32(0, 2, 3, -4, 0, 2, 3, -4); 2835 int8 B = cast(int8) _mm256_srli_epi32(A, 1); 2836 int8 B2 = cast(int8) _mm256_srli_epi32(A, 1 + 256); 2837 int[8] expectedB = [ 0, 1, 1, 0x7FFFFFFE, 0, 1, 1, 0x7FFFFFFE]; 2838 assert(B.array == expectedB); 2839 assert(B2.array == expectedB); 2840 2841 int8 C = cast(int8) _mm256_srli_epi32(A, 255); 2842 int[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0 ]; 2843 assert(C.array == expectedC); 2844 } 2845 2846 // TODO __m256i _mm256_srli_epi64 (__m256i a, int imm8) pure @safe 2847 // TODO __m256i _mm256_srli_si256 (__m256i a, const int imm8) pure @safe 2848 // TODO __m128i _mm_srlv_epi32 (__m128i a, __m128i count) pure @safe 2849 // TODO __m256i _mm256_srlv_epi32 (__m256i a, __m256i count) pure @safe 2850 // TODO __m128i _mm_srlv_epi64 (__m128i a, __m128i count) pure @safe 2851 // TODO __m256i _mm256_srlv_epi64 (__m256i a, __m256i count) pure @safe 2852 2853 // TODO __m256i _mm256_stream_load_si256 (__m256i const* mem_addr) pure @safe 2854 2855 /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`. 2856 __m256i _mm256_sub_epi16 (__m256i a, __m256i b) pure @safe 2857 { 2858 pragma(inline, true); 2859 return cast(__m256i)(cast(short16)a - cast(short16)b); 2860 } 2861 unittest 2862 { 2863 __m256i A = _mm256_setr_epi16( -7, -1, 0, 9, -100, 100, 234, 432, -32768, 32767, 0, -1, -20000, 0, 6, -2); 2864 short16 R = cast(short16) _mm256_sub_epi16(A, A); 2865 short[16] correct = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]; 2866 assert(R.array == correct); 2867 } 2868 2869 /// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`. 2870 __m256i _mm256_sub_epi32(__m256i a, __m256i b) pure @safe 2871 { 2872 pragma(inline, true); 2873 return cast(__m256i)(cast(int8)a - cast(int8)b); 2874 } 2875 unittest 2876 { 2877 __m256i A = _mm256_setr_epi32( -7, -1, 0, 9, -100, 100, 234, 432); 2878 int8 R = cast(int8) _mm256_sub_epi32(A, A); 2879 int[8] correct = [ 0, 0, 0, 0, 0, 0, 0, 0]; 2880 assert(R.array == correct); 2881 } 2882 2883 /// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`. 2884 __m256i _mm256_sub_epi64 (__m256i a, __m256i b) pure @safe 2885 { 2886 pragma(inline, true); 2887 return a - b; 2888 } 2889 unittest 2890 { 2891 __m256i A = _mm256_setr_epi64(-1, 0x8000_0000_0000_0000, 42, -12); 2892 long4 R = cast(__m256i) _mm256_sub_epi64(A, A); 2893 long[4] correct = [ 0, 0, 0, 0 ]; 2894 assert(R.array == correct); 2895 } 2896 2897 /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`. 2898 __m256i _mm256_sub_epi8 (__m256i a, __m256i b) pure @safe 2899 { 2900 pragma(inline, true); 2901 return cast(__m256i)(cast(byte32)a - cast(byte32)b); 2902 } 2903 unittest 2904 { 2905 __m256i A = _mm256_setr_epi8(4, 8, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -1, 0, 9, 78, 2906 4, 9, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -2, 0, 10, 78); 2907 byte32 R = cast(byte32) _mm256_sub_epi8(A, A); 2908 byte[32] correct; // zero initialized 2909 assert(R.array == correct); 2910 } 2911 2912 /// Subtract packed signed 16-bit integers in `b` from packed 16-bit integers in `a` using 2913 /// saturation. 2914 __m256i _mm256_subs_epi16 (__m256i a, __m256i b) pure @trusted 2915 { 2916 // PERF DMD 2917 static if (GDC_with_AVX2) 2918 { 2919 return cast(__m256i) __builtin_ia32_psubsw256(cast(short16)a, cast(short16)b); 2920 } 2921 else static if(LDC_with_saturated_intrinsics) 2922 { 2923 return cast(__m256i) inteli_llvm_subs!short16(cast(short16)a, cast(short16)b); 2924 } 2925 else 2926 { 2927 short16 r; 2928 short16 sa = cast(short16)a; 2929 short16 sb = cast(short16)b; 2930 foreach(i; 0..16) 2931 r.ptr[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]); 2932 return cast(__m256i)r; 2933 } 2934 } 2935 unittest 2936 { 2937 short16 res = cast(short16) _mm256_subs_epi16(_mm256_setr_epi16( 7, 6, 5, -32768, 3, 3, 32766, 0, 7, 6, 5, -32750, 3, 3, 32767, 0), 2938 _mm256_setr_epi16( 7, 6, 5, -30000, 3, 1, -2, -10, 7, 6, 5, 100, 3, 1, 1, -10)); 2939 static immutable short[16] correctResult = [ 0, 0, 0, -2768, 0, 2, 32767, 10, 0, 0, 0, -32768, 0, 2, 32766, 10]; 2940 assert(res.array == correctResult); 2941 } 2942 2943 2944 /// Subtract packed signed 8-bit integers in `b` from packed 8-bit integers in `a` using 2945 /// saturation. 2946 __m256i _mm256_subs_epi8 (__m256i a, __m256i b) pure @trusted 2947 { 2948 // PERF DMD 2949 static if (GDC_with_AVX2) 2950 { 2951 return cast(__m256i) __builtin_ia32_psubsb256(cast(ubyte32)a, cast(ubyte32)b); 2952 } 2953 else static if(LDC_with_saturated_intrinsics) 2954 { 2955 return cast(__m256i) inteli_llvm_subs!byte32(cast(byte32)a, cast(byte32)b); 2956 } 2957 else 2958 { 2959 byte32 r; 2960 byte32 sa = cast(byte32)a; 2961 byte32 sb = cast(byte32)b; 2962 foreach(i; 0..32) 2963 r.ptr[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]); 2964 return cast(__m256i)r; 2965 } 2966 } 2967 unittest 2968 { 2969 byte32 R = cast(byte32) _mm256_subs_epi8(_mm256_setr_epi8(15, 14, 13, 12, 11, 127, 9, 8, 7, 6, 5, -128, 3, 2, 1, 0, 15, 14, 13, 12, 11, 126, 9, 8, 7, 6, 5, -127, 3, 2, 1, 0), 2970 _mm256_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, -10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); 2971 static immutable byte[32] correct = [ 0, 0, 0, 0, 0, 117, 0, 0, 0, 0, 0, -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127, 0, 0, 0, 0, 0, -128, 0, 0, 0, 0]; 2972 assert(R.array == correct); 2973 } 2974 2975 /// Subtract packed unsigned 16-bit integers in `b` from packed unsigned 16-bit integers in `a` 2976 /// using saturation. 2977 __m256i _mm256_subs_epu16 (__m256i a, __m256i b) pure @trusted 2978 { 2979 // PERF DMD 2980 static if (GDC_with_AVX2) 2981 { 2982 return cast(__m256i) __builtin_ia32_psubusw256(cast(short16)a, cast(short16)b); 2983 } 2984 else static if(LDC_with_saturated_intrinsics) 2985 { 2986 return cast(__m256i) inteli_llvm_subus!short16(cast(short16)a, cast(short16)b); 2987 } 2988 else 2989 { 2990 short16 r; 2991 short16 sa = cast(short16)a; 2992 short16 sb = cast(short16)b; 2993 foreach(i; 0..16) 2994 r.ptr[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i])); 2995 return cast(__m256i)r; 2996 } 2997 } 2998 unittest 2999 { 3000 short16 R = cast(short16) _mm256_subs_epu16(_mm256_setr_epi16(3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0, 3, 2, cast(short)65534, 0), 3001 _mm256_setr_epi16(3, 4, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 20, cast(short)65535, 0)); 3002 static immutable short[16] correct = [0, 0, cast(short)65534, 0, 0, 0, cast(short)65534, 0, 0, 0, cast(short)65534, 0, 0, 0, 0, 0]; 3003 assert(R.array == correct); 3004 } 3005 3006 /// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit integers in `a` using 3007 /// saturation. 3008 __m256i _mm256_subs_epu8 (__m256i a, __m256i b) pure @trusted 3009 { 3010 // PERF DMD 3011 // PERF GDC without AVX2 3012 static if (GDC_with_AVX2) 3013 { 3014 return cast(__m256i) __builtin_ia32_psubusb256(cast(ubyte32)a, cast(ubyte32)b); 3015 } 3016 else static if(LDC_with_saturated_intrinsics) 3017 { 3018 return cast(__m256i) inteli_llvm_subus!byte32(cast(byte32)a, cast(byte32)b); 3019 } 3020 else 3021 { 3022 byte32 r; 3023 byte32 sa = cast(byte32)a; 3024 byte32 sb = cast(byte32)b; 3025 foreach(i; 0..32) 3026 r.ptr[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i])); 3027 return cast(__m256i)r; 3028 } 3029 } 3030 unittest 3031 { 3032 __m256i A = _mm256_setr_epi8(0, 0, 5, 4, 5, 0, 0, 0, 0, 0, 0, 0, cast(byte)255, 0, 0, 0, 0, 0, 0, 0, 0, cast(byte)136, 0, 0, 0, cast(byte)136, 0, 0, 0, 0, 0, 0); 3033 __m256i B = _mm256_setr_epi8(0, 0, 4, 5, 5, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, cast(byte)137, 0, 0, 0, 40, 0, 0, 0, 0, 0, 0); 3034 byte32 R = cast(byte32) _mm256_subs_epu8(A, B); 3035 static immutable byte[32] correct = [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, cast(byte)254, 0, 0, 0, 0, 0, 0, 0, 0, cast(byte)0, 0, 0, 0, cast(byte) 96, 0, 0, 0, 0, 0, 0]; 3036 assert(R.array == correct); 3037 } 3038 3039 /// Unpack and interleave 16-bit integers from the high half of each 128-bit lane in `a` and `b`. 3040 __m256i _mm256_unpackhi_epi16 (__m256i a, __m256i b) pure @safe 3041 { 3042 static if (GDC_with_AVX2) 3043 { 3044 return cast(long4) __builtin_ia32_punpckhwd256(cast(short16)a, cast(short16)b); 3045 } 3046 else static if (LDC_with_optimizations) 3047 { 3048 enum ir = `%r = shufflevector <16 x i16> %0, <16 x i16> %1, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12,i32 28, i32 13,i32 29, i32 14,i32 30, i32 15,i32 31> 3049 ret <16 x i16> %r`; 3050 return cast(__m256i)LDCInlineIR!(ir, short16, short16, short16)(cast(short16)a, cast(short16)b); 3051 } 3052 else 3053 { 3054 // Better for arm64, GDC without AVX2 3055 __m128i a_lo = _mm256_extractf128_si256!0(a); 3056 __m128i a_hi = _mm256_extractf128_si256!1(a); 3057 __m128i b_lo = _mm256_extractf128_si256!0(b); 3058 __m128i b_hi = _mm256_extractf128_si256!1(b); 3059 __m128i r_lo = _mm_unpackhi_epi16(a_lo, b_lo); 3060 __m128i r_hi = _mm_unpackhi_epi16(a_hi, b_hi); 3061 return _mm256_set_m128i(r_hi, r_lo); 3062 } 3063 } 3064 unittest 3065 { 3066 __m256i A = _mm256_setr_epi16( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 3067 __m256i B = _mm256_setr_epi16(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); 3068 short16 C = cast(short16) _mm256_unpackhi_epi16(A, B); 3069 short[16] correct = [4, 20, 5, 21, 6, 22, 7, 23, 3070 12, 28, 13, 29, 14, 30, 15, 31]; 3071 assert(C.array == correct); 3072 } 3073 3074 /// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in `a` and `b`. 3075 __m256i _mm256_unpackhi_epi32 (__m256i a, __m256i b) pure @trusted 3076 { 3077 static if (GDC_with_AVX2) 3078 enum bool split = false; 3079 else version(GNU) 3080 enum bool split = true; 3081 else 3082 enum bool split = false; 3083 3084 static if (GDC_with_AVX2) 3085 { 3086 return cast(long4) __builtin_ia32_punpckhdq256(cast(int8)a, cast(int8)b); 3087 } 3088 else static if (LDC_with_optimizations) 3089 { 3090 // LDC AVX2: Suprisingly, this start using vunpckhps in LDC 1.31 -O2 3091 enum ir = `%r = shufflevector <8 x i32> %0, <8 x i32> %1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15> 3092 ret <8 x i32> %r`; 3093 return cast(__m256i)LDCInlineIR!(ir, int8, int8, int8)(cast(int8)a, cast(int8)b); 3094 } 3095 else static if (split) 3096 { 3097 __m128i a_lo = _mm256_extractf128_si256!0(a); 3098 __m128i a_hi = _mm256_extractf128_si256!1(a); 3099 __m128i b_lo = _mm256_extractf128_si256!0(b); 3100 __m128i b_hi = _mm256_extractf128_si256!1(b); 3101 __m128i r_lo = _mm_unpackhi_epi32(a_lo, b_lo); 3102 __m128i r_hi = _mm_unpackhi_epi32(a_hi, b_hi); 3103 return _mm256_set_m128i(r_hi, r_lo); 3104 } 3105 else 3106 { 3107 int8 R; 3108 int8 ai = cast(int8)a; 3109 int8 bi = cast(int8)b; 3110 R.ptr[0] = ai.array[2]; 3111 R.ptr[1] = bi.array[2]; 3112 R.ptr[2] = ai.array[3]; 3113 R.ptr[3] = bi.array[3]; 3114 R.ptr[4] = ai.array[6]; 3115 R.ptr[5] = bi.array[6]; 3116 R.ptr[6] = ai.array[7]; 3117 R.ptr[7] = bi.array[7]; 3118 return cast(__m256i) R; 3119 } 3120 } 3121 unittest 3122 { 3123 __m256i A = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); 3124 __m256i B = _mm256_setr_epi32(8, 9, 10, 11, 12, 13, 14, 15); 3125 int8 C = cast(int8) _mm256_unpackhi_epi32(A, B); 3126 int[8] correct = [2, 10, 3, 11, 6, 14, 7, 15]; 3127 assert(C.array == correct); 3128 } 3129 3130 /// Unpack and interleave 8-bit integers from the high half of each 128-bit lane in `a` and `b`, 3131 __m256i _mm256_unpackhi_epi8 (__m256i a, __m256i b) pure @trusted 3132 { 3133 static if (GDC_with_AVX2) 3134 { 3135 return cast(__m256i) __builtin_ia32_punpckhbw256(cast(ubyte32)a, cast(ubyte32)b); 3136 } 3137 else static if (LDC_with_optimizations) 3138 { 3139 enum ir = `%r = shufflevector <32 x i8> %0, <32 x i8> %1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63> 3140 ret <32 x i8> %r`; 3141 return cast(__m256i)LDCInlineIR!(ir, byte32, byte32, byte32)(cast(byte32)a, cast(byte32)b); 3142 } 3143 else 3144 { 3145 // Splitting always beneficial 3146 __m128i a_lo = _mm256_extractf128_si256!0(a); 3147 __m128i a_hi = _mm256_extractf128_si256!1(a); 3148 __m128i b_lo = _mm256_extractf128_si256!0(b); 3149 __m128i b_hi = _mm256_extractf128_si256!1(b); 3150 __m128i r_lo = _mm_unpackhi_epi8(a_lo, b_lo); 3151 __m128i r_hi = _mm_unpackhi_epi8(a_hi, b_hi); 3152 return _mm256_set_m128i(r_hi, r_lo); 3153 } 3154 } 3155 unittest 3156 { 3157 __m256i A = _mm256_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 3158 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); 3159 __m256i B = _mm256_setr_epi8( 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 3160 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); 3161 byte32 C = cast(byte32) _mm256_unpackhi_epi8(A, B); 3162 byte[32] correct = [ 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, 3163 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63 ]; 3164 assert(C.array == correct); 3165 } 3166 3167 /// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in `a` and `b`. 3168 __m256i _mm256_unpackhi_epi64 (__m256i a, __m256i b) pure @trusted 3169 { 3170 version(GNU) 3171 enum split = true; // Benefits GDC in non-AVX2 3172 else 3173 enum split = false; 3174 3175 static if (GDC_with_AVX2) 3176 { 3177 return __builtin_ia32_punpckhqdq256(a, b); 3178 } 3179 else static if (LDC_with_optimizations) 3180 { 3181 enum ir = `%r = shufflevector <4 x i64> %0, <4 x i64> %1, <4 x i32> <i32 1, i32 5, i32 3, i32 7> 3182 ret <4 x i64> %r`; 3183 return cast(__m256i)LDCInlineIR!(ir, long4, long4, long4)(a, b); 3184 } 3185 else static if (split) 3186 { 3187 __m128i a_lo = _mm256_extractf128_si256!0(a); 3188 __m128i a_hi = _mm256_extractf128_si256!1(a); 3189 __m128i b_lo = _mm256_extractf128_si256!0(b); 3190 __m128i b_hi = _mm256_extractf128_si256!1(b); 3191 __m128i r_lo = _mm_unpackhi_epi64(a_lo, b_lo); 3192 __m128i r_hi = _mm_unpackhi_epi64(a_hi, b_hi); 3193 return _mm256_set_m128i(r_hi, r_lo); 3194 } 3195 else 3196 { 3197 long4 R; 3198 R.ptr[0] = a.array[1]; 3199 R.ptr[1] = b.array[1]; 3200 R.ptr[2] = a.array[3]; 3201 R.ptr[3] = b.array[3]; 3202 return R; 3203 } 3204 } 3205 unittest 3206 { 3207 __m256i A = _mm256_setr_epi64(0x22222222_22222222, 0x33333333_33333333, 2, 3); 3208 __m256i B = _mm256_setr_epi64(0x44444444_44444444, 0x55555555_55555555, 4, 5); 3209 long4 C = _mm256_unpackhi_epi64(A, B); 3210 long[4] correct = [0x33333333_33333333, 0x55555555_55555555, 3, 5]; 3211 assert(C.array == correct); 3212 } 3213 3214 /// Unpack and interleave 16-bit integers from the low half of each 128-bit lane in `a` and `b`. 3215 __m256i _mm256_unpacklo_epi16 (__m256i a, __m256i b) pure @safe 3216 { 3217 static if (GDC_with_AVX2) 3218 { 3219 return cast(__m256i) __builtin_ia32_punpcklwd256(cast(short16)a, cast(short16)b); 3220 } 3221 else static if (LDC_with_optimizations) 3222 { 3223 enum ir = `%r = shufflevector <16 x i16> %0, <16 x i16> %1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27> 3224 ret <16 x i16> %r`; 3225 return cast(__m256i)LDCInlineIR!(ir, short16, short16, short16)(cast(short16)a, cast(short16)b); 3226 } 3227 else 3228 { 3229 __m128i a_lo = _mm256_extractf128_si256!0(a); 3230 __m128i a_hi = _mm256_extractf128_si256!1(a); 3231 __m128i b_lo = _mm256_extractf128_si256!0(b); 3232 __m128i b_hi = _mm256_extractf128_si256!1(b); 3233 __m128i r_lo = _mm_unpacklo_epi16(a_lo, b_lo); 3234 __m128i r_hi = _mm_unpacklo_epi16(a_hi, b_hi); 3235 return _mm256_set_m128i(r_hi, r_lo); 3236 } 3237 } 3238 unittest 3239 { 3240 __m256i A = _mm256_setr_epi16( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 3241 __m256i B = _mm256_setr_epi16(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); 3242 short16 C = cast(short16) _mm256_unpacklo_epi16(A, B); 3243 short[16] correct = [0, 16, 1, 17, 2, 18, 3, 19, 3244 8, 24, 9, 25, 10, 26, 11, 27]; 3245 assert(C.array == correct); 3246 } 3247 3248 /// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in `a` and `b`. 3249 __m256i _mm256_unpacklo_epi32 (__m256i a, __m256i b) pure @trusted 3250 { 3251 static if (GDC_with_AVX2) 3252 enum bool split = false; 3253 else version(GNU) 3254 enum bool split = true; 3255 else 3256 enum bool split = false; 3257 3258 static if (GDC_with_AVX2) 3259 { 3260 return cast(long4) __builtin_ia32_punpckldq256(cast(int8)a, cast(int8)b); 3261 } 3262 else static if (LDC_with_optimizations) 3263 { 3264 // LDC AVX2: Suprisingly, this start using vunpcklps in LDC 1.31 -O1 3265 enum ir = `%r = shufflevector <8 x i32> %0, <8 x i32> %1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13> 3266 ret <8 x i32> %r`; 3267 return cast(__m256i)LDCInlineIR!(ir, int8, int8, int8)(cast(int8)a, cast(int8)b); 3268 } 3269 else static if (split) 3270 { 3271 __m128i a_lo = _mm256_extractf128_si256!0(a); 3272 __m128i a_hi = _mm256_extractf128_si256!1(a); 3273 __m128i b_lo = _mm256_extractf128_si256!0(b); 3274 __m128i b_hi = _mm256_extractf128_si256!1(b); 3275 __m128i r_lo = _mm_unpacklo_epi32(a_lo, b_lo); 3276 __m128i r_hi = _mm_unpacklo_epi32(a_hi, b_hi); 3277 return _mm256_set_m128i(r_hi, r_lo); 3278 } 3279 else 3280 { 3281 int8 R; 3282 int8 ai = cast(int8)a; 3283 int8 bi = cast(int8)b; 3284 R.ptr[0] = ai.array[0]; 3285 R.ptr[1] = bi.array[0]; 3286 R.ptr[2] = ai.array[1]; 3287 R.ptr[3] = bi.array[1]; 3288 R.ptr[4] = ai.array[4]; 3289 R.ptr[5] = bi.array[4]; 3290 R.ptr[6] = ai.array[5]; 3291 R.ptr[7] = bi.array[5]; 3292 return cast(__m256i) R; 3293 } 3294 } 3295 unittest 3296 { 3297 __m256i A = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); 3298 __m256i B = _mm256_setr_epi32(8, 9, 10, 11, 12, 13, 14, 15); 3299 int8 C = cast(int8) _mm256_unpacklo_epi32(A, B); 3300 int[8] correct = [0, 8, 1, 9, 4, 12, 5, 13]; 3301 assert(C.array == correct); 3302 } 3303 3304 /// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in `a` and `b`. 3305 __m256i _mm256_unpacklo_epi64 (__m256i a, __m256i b) pure @trusted 3306 { 3307 version(GNU) 3308 enum split = true; // Benefits GDC in non-AVX2 3309 else 3310 enum split = false; 3311 3312 static if (GDC_with_AVX2) 3313 { 3314 return __builtin_ia32_punpcklqdq256(a, b); 3315 } 3316 else static if (LDC_with_optimizations) 3317 { 3318 enum ir = `%r = shufflevector <4 x i64> %0, <4 x i64> %1, <4 x i32> <i32 0, i32 4, i32 2, i32 6> 3319 ret <4 x i64> %r`; 3320 return cast(__m256i)LDCInlineIR!(ir, long4, long4, long4)(a, b); 3321 } 3322 else static if (split) 3323 { 3324 __m128i a_lo = _mm256_extractf128_si256!0(a); 3325 __m128i a_hi = _mm256_extractf128_si256!1(a); 3326 __m128i b_lo = _mm256_extractf128_si256!0(b); 3327 __m128i b_hi = _mm256_extractf128_si256!1(b); 3328 __m128i r_lo = _mm_unpacklo_epi64(a_lo, b_lo); 3329 __m128i r_hi = _mm_unpacklo_epi64(a_hi, b_hi); 3330 return _mm256_set_m128i(r_hi, r_lo); 3331 } 3332 else 3333 { 3334 long4 R; 3335 R.ptr[0] = a.array[0]; 3336 R.ptr[1] = b.array[0]; 3337 R.ptr[2] = a.array[2]; 3338 R.ptr[3] = b.array[2]; 3339 return R; 3340 } 3341 } 3342 unittest 3343 { 3344 __m256i A = _mm256_setr_epi64(0x22222222_22222222, 0x33333333_33333333, 2, 3); 3345 __m256i B = _mm256_setr_epi64(0x44444444_44444444, 0x55555555_55555555, 4, 5); 3346 long4 C = _mm256_unpacklo_epi64(A, B); 3347 long[4] correct = [0x22222222_22222222, 0x44444444_44444444, 2, 4]; 3348 assert(C.array == correct); 3349 } 3350 3351 /// Unpack and interleave 8-bit integers from the low half of each 128-bit lane in `a` and `b`. 3352 __m256i _mm256_unpacklo_epi8 (__m256i a, __m256i b) pure @trusted 3353 { 3354 static if (GDC_with_AVX2) 3355 { 3356 return cast(__m256i) __builtin_ia32_punpcklbw256(cast(ubyte32)a, cast(ubyte32)b); 3357 } 3358 else static if (LDC_with_optimizations) 3359 { 3360 enum ir = `%r = shufflevector <32 x i8> %0, <32 x i8> %1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55> 3361 ret <32 x i8> %r`; 3362 return cast(__m256i)LDCInlineIR!(ir, byte32, byte32, byte32)(cast(byte32)a, cast(byte32)b); 3363 } 3364 else 3365 { 3366 // Splitting always beneficial 3367 __m128i a_lo = _mm256_extractf128_si256!0(a); 3368 __m128i a_hi = _mm256_extractf128_si256!1(a); 3369 __m128i b_lo = _mm256_extractf128_si256!0(b); 3370 __m128i b_hi = _mm256_extractf128_si256!1(b); 3371 __m128i r_lo = _mm_unpacklo_epi8(a_lo, b_lo); 3372 __m128i r_hi = _mm_unpacklo_epi8(a_hi, b_hi); 3373 return _mm256_set_m128i(r_hi, r_lo); 3374 } 3375 } 3376 unittest 3377 { 3378 __m256i A = _mm256_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 3379 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); 3380 __m256i B = _mm256_setr_epi8( 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 3381 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); 3382 byte32 C = cast(byte32) _mm256_unpacklo_epi8(A, B); 3383 byte[32] correct = [ 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 3384 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55 ]; 3385 assert(C.array == correct); 3386 } 3387 3388 /// Compute the bitwise XOR of 256 bits (representing integer data) in `a` and `b`. 3389 __m256i _mm256_xor_si256 (__m256i a, __m256i b) pure @safe 3390 { 3391 return a ^ b; 3392 } 3393 unittest 3394 { 3395 __m256i A = _mm256_setr_epi64(975394, 619809709, -1, 54); 3396 __m256i B = _mm256_setr_epi64(-920275025, -6, 85873, 96644); 3397 long4 R = cast(long4) _mm256_xor_si256(A, B); 3398 long[4] correct = [975394 ^ (-920275025L), 619809709L ^ -6, (-1) ^ 85873, 54 ^ 96644]; 3399 assert(R.array == correct); 3400 } 3401 3402 3403 /+ 3404 3405 pragma(LDC_intrinsic, "llvm.x86.avx2.gather.d.d") 3406 int4 __builtin_ia32_gatherd_d(int4, const void*, int4, int4, byte); 3407 3408 pragma(LDC_intrinsic, "llvm.x86.avx2.gather.d.d.256") 3409 int8 __builtin_ia32_gatherd_d256(int8, const void*, int8, int8, byte); 3410 3411 pragma(LDC_intrinsic, "llvm.x86.avx2.gather.d.pd") 3412 double2 __builtin_ia32_gatherd_pd(double2, const void*, int4, double2, byte); 3413 3414 pragma(LDC_intrinsic, "llvm.x86.avx2.gather.d.pd.256") 3415 double4 __builtin_ia32_gatherd_pd256(double4, const void*, int4, double4, byte); 3416 3417 pragma(LDC_intrinsic, "llvm.x86.avx2.gather.d.ps") 3418 float4 __builtin_ia32_gatherd_ps(float4, const void*, int4, float4, byte); 3419 3420 pragma(LDC_intrinsic, "llvm.x86.avx2.gather.d.ps.256") 3421 float8 __builtin_ia32_gatherd_ps256(float8, const void*, int8, float8, byte); 3422 3423 pragma(LDC_intrinsic, "llvm.x86.avx2.gather.d.q") 3424 long2 __builtin_ia32_gatherd_q(long2, const void*, int4, long2, byte); 3425 3426 pragma(LDC_intrinsic, "llvm.x86.avx2.gather.d.q.256") 3427 long4 __builtin_ia32_gatherd_q256(long4, const void*, int4, long4, byte); 3428 3429 pragma(LDC_intrinsic, "llvm.x86.avx2.gather.q.d") 3430 int4 __builtin_ia32_gatherq_d(int4, const void*, long2, int4, byte); 3431 3432 pragma(LDC_intrinsic, "llvm.x86.avx2.gather.q.d.256") 3433 int4 __builtin_ia32_gatherq_d256(int4, const void*, long4, int4, byte); 3434 3435 pragma(LDC_intrinsic, "llvm.x86.avx2.gather.q.pd") 3436 double2 __builtin_ia32_gatherq_pd(double2, const void*, long2, double2, byte); 3437 3438 pragma(LDC_intrinsic, "llvm.x86.avx2.gather.q.pd.256") 3439 double4 __builtin_ia32_gatherq_pd256(double4, const void*, long4, double4, byte); 3440 3441 pragma(LDC_intrinsic, "llvm.x86.avx2.gather.q.ps") 3442 float4 __builtin_ia32_gatherq_ps(float4, const void*, long2, float4, byte); 3443 3444 pragma(LDC_intrinsic, "llvm.x86.avx2.gather.q.ps.256") 3445 float4 __builtin_ia32_gatherq_ps256(float4, const void*, long4, float4, byte); 3446 3447 pragma(LDC_intrinsic, "llvm.x86.avx2.gather.q.q") 3448 long2 __builtin_ia32_gatherq_q(long2, const void*, long2, long2, byte); 3449 3450 pragma(LDC_intrinsic, "llvm.x86.avx2.gather.q.q.256") 3451 long4 __builtin_ia32_gatherq_q256(long4, const void*, long4, long4, byte); 3452 3453 pragma(LDC_intrinsic, "llvm.x86.avx2.maskload.d") 3454 int4 __builtin_ia32_maskloadd(const void*, int4); 3455 3456 pragma(LDC_intrinsic, "llvm.x86.avx2.maskload.d.256") 3457 int8 __builtin_ia32_maskloadd256(const void*, int8); 3458 3459 pragma(LDC_intrinsic, "llvm.x86.avx2.maskload.q") 3460 long2 __builtin_ia32_maskloadq(const void*, long2); 3461 3462 pragma(LDC_intrinsic, "llvm.x86.avx2.maskload.q.256") 3463 long4 __builtin_ia32_maskloadq256(const void*, long4); 3464 3465 pragma(LDC_intrinsic, "llvm.x86.avx2.maskstore.d") 3466 void __builtin_ia32_maskstored(void*, int4, int4); 3467 3468 pragma(LDC_intrinsic, "llvm.x86.avx2.maskstore.d.256") 3469 void __builtin_ia32_maskstored256(void*, int8, int8); 3470 3471 pragma(LDC_intrinsic, "llvm.x86.avx2.maskstore.q") 3472 void __builtin_ia32_maskstoreq(void*, long2, long2); 3473 3474 pragma(LDC_intrinsic, "llvm.x86.avx2.maskstore.q.256") 3475 void __builtin_ia32_maskstoreq256(void*, long4, long4); 3476 3477 pragma(LDC_intrinsic, "llvm.x86.avx2.mpsadbw") 3478 short16 __builtin_ia32_mpsadbw256(byte32, byte32, byte) pure @safe; 3479 3480 3481 pragma(LDC_intrinsic, "llvm.x86.avx2.pblendvb") 3482 byte32 __builtin_ia32_pblendvb256(byte32, byte32, byte32) pure @safe; 3483 3484 pragma(LDC_intrinsic, "llvm.x86.avx2.permd") 3485 int8 __builtin_ia32_permvarsi256(int8, int8) pure @safe; 3486 3487 pragma(LDC_intrinsic, "llvm.x86.avx2.permps") 3488 float8 __builtin_ia32_permvarsf256(float8, int8) pure @safe; 3489 3490 pragma(LDC_intrinsic, "llvm.x86.avx2.phadd.d") 3491 int8 __builtin_ia32_phaddd256(int8, int8) pure @safe; 3492 3493 pragma(LDC_intrinsic, "llvm.x86.avx2.phadd.sw") 3494 short16 __builtin_ia32_phaddsw256(short16, short16) pure @safe; 3495 3496 pragma(LDC_intrinsic, "llvm.x86.avx2.phadd.w") 3497 short16 __builtin_ia32_phaddw256(short16, short16) pure @safe; 3498 3499 pragma(LDC_intrinsic, "llvm.x86.avx2.phsub.d") 3500 int8 __builtin_ia32_phsubd256(int8, int8) pure @safe; 3501 3502 pragma(LDC_intrinsic, "llvm.x86.avx2.phsub.sw") 3503 short16 __builtin_ia32_phsubsw256(short16, short16) pure @safe; 3504 3505 pragma(LDC_intrinsic, "llvm.x86.avx2.phsub.w") 3506 short16 __builtin_ia32_phsubw256(short16, short16) pure @safe; 3507 3508 pragma(LDC_intrinsic, "llvm.x86.avx2.pmadd.ub.sw") 3509 short16 __builtin_ia32_pmaddubsw256(byte32, byte32) pure @safe; 3510 3511 pragma(LDC_intrinsic, "llvm.x86.avx2.pmadd.wd") 3512 int8 __builtin_ia32_pmaddwd256(short16, short16) pure @safe; 3513 3514 pragma(LDC_intrinsic, "llvm.x86.avx2.pmovmskb") 3515 int __builtin_ia32_pmovmskb256(byte32) pure @safe; 3516 3517 pragma(LDC_intrinsic, "llvm.x86.avx2.pmul.hr.sw") 3518 short16 __builtin_ia32_pmulhrsw256(short16, short16) pure @safe; 3519 3520 pragma(LDC_intrinsic, "llvm.x86.avx2.pmulh.w") 3521 short16 __builtin_ia32_pmulhw256(short16, short16) pure @safe; 3522 3523 pragma(LDC_intrinsic, "llvm.x86.avx2.pmulhu.w") 3524 short16 __builtin_ia32_pmulhuw256(short16, short16) pure @safe; 3525 3526 pragma(LDC_intrinsic, "llvm.x86.avx2.psad.bw") 3527 long4 __builtin_ia32_psadbw256(byte32, byte32) pure @safe; 3528 3529 pragma(LDC_intrinsic, "llvm.x86.avx2.pshuf.b") 3530 byte32 __builtin_ia32_pshufb256(byte32, byte32) pure @safe; 3531 3532 pragma(LDC_intrinsic, "llvm.x86.avx2.psign.b") 3533 byte32 __builtin_ia32_psignb256(byte32, byte32) pure @safe; 3534 3535 pragma(LDC_intrinsic, "llvm.x86.avx2.psign.d") 3536 int8 __builtin_ia32_psignd256(int8, int8) pure @safe; 3537 3538 pragma(LDC_intrinsic, "llvm.x86.avx2.psign.w") 3539 short16 __builtin_ia32_psignw256(short16, short16) pure @safe; 3540 3541 pragma(LDC_intrinsic, "llvm.x86.avx2.psll.d") 3542 int8 __builtin_ia32_pslld256(int8, int4) pure @safe; 3543 3544 pragma(LDC_intrinsic, "llvm.x86.avx2.psll.q") 3545 long4 __builtin_ia32_psllq256(long4, long2) pure @safe; 3546 3547 pragma(LDC_intrinsic, "llvm.x86.avx2.psll.w") 3548 short16 __builtin_ia32_psllw256(short16, short8) pure @safe; 3549 3550 pragma(LDC_intrinsic, "llvm.x86.avx2.pslli.d") 3551 int8 __builtin_ia32_pslldi256(int8, int) pure @safe; 3552 3553 pragma(LDC_intrinsic, "llvm.x86.avx2.pslli.q") 3554 long4 __builtin_ia32_psllqi256(long4, int) pure @safe; 3555 3556 pragma(LDC_intrinsic, "llvm.x86.avx2.pslli.w") 3557 short16 __builtin_ia32_psllwi256(short16, int) pure @safe; 3558 3559 pragma(LDC_intrinsic, "llvm.x86.avx2.psllv.d") 3560 int4 __builtin_ia32_psllv4si(int4, int4) pure @safe; 3561 3562 pragma(LDC_intrinsic, "llvm.x86.avx2.psllv.d.256") 3563 int8 __builtin_ia32_psllv8si(int8, int8) pure @safe; 3564 3565 pragma(LDC_intrinsic, "llvm.x86.avx2.psllv.q") 3566 long2 __builtin_ia32_psllv2di(long2, long2) pure @safe; 3567 3568 pragma(LDC_intrinsic, "llvm.x86.avx2.psllv.q.256") 3569 long4 __builtin_ia32_psllv4di(long4, long4) pure @safe; 3570 3571 pragma(LDC_intrinsic, "llvm.x86.avx2.psra.d") 3572 int8 __builtin_ia32_psrad256(int8, int4) pure @safe; 3573 3574 pragma(LDC_intrinsic, "llvm.x86.avx2.psra.w") 3575 short16 __builtin_ia32_psraw256(short16, short8) pure @safe; 3576 3577 pragma(LDC_intrinsic, "llvm.x86.avx2.psrai.d") 3578 int8 __builtin_ia32_psradi256(int8, int) pure @safe; 3579 3580 pragma(LDC_intrinsic, "llvm.x86.avx2.psrai.w") 3581 short16 __builtin_ia32_psrawi256(short16, int) pure @safe; 3582 3583 pragma(LDC_intrinsic, "llvm.x86.avx2.psrav.d") 3584 int4 __builtin_ia32_psrav4si(int4, int4) pure @safe; 3585 3586 pragma(LDC_intrinsic, "llvm.x86.avx2.psrav.d.256") 3587 int8 __builtin_ia32_psrav8si(int8, int8) pure @safe; 3588 3589 pragma(LDC_intrinsic, "llvm.x86.avx2.psrl.d") 3590 int8 __builtin_ia32_psrld256(int8, int4) pure @safe; 3591 3592 pragma(LDC_intrinsic, "llvm.x86.avx2.psrl.q") 3593 long4 __builtin_ia32_psrlq256(long4, long2) pure @safe; 3594 3595 pragma(LDC_intrinsic, "llvm.x86.avx2.psrl.w") 3596 short16 __builtin_ia32_psrlw256(short16, short8) pure @safe; 3597 3598 pragma(LDC_intrinsic, "llvm.x86.avx2.psrli.d") 3599 int8 __builtin_ia32_psrldi256(int8, int) pure @safe; 3600 3601 pragma(LDC_intrinsic, "llvm.x86.avx2.psrli.q") 3602 long4 __builtin_ia32_psrlqi256(long4, int) pure @safe; 3603 3604 pragma(LDC_intrinsic, "llvm.x86.avx2.psrli.w") 3605 short16 __builtin_ia32_psrlwi256(short16, int) pure @safe; 3606 3607 pragma(LDC_intrinsic, "llvm.x86.avx2.psrlv.d") 3608 int4 __builtin_ia32_psrlv4si(int4, int4) pure @safe; 3609 3610 pragma(LDC_intrinsic, "llvm.x86.avx2.psrlv.d.256") 3611 int8 __builtin_ia32_psrlv8si(int8, int8) pure @safe; 3612 3613 pragma(LDC_intrinsic, "llvm.x86.avx2.psrlv.q") 3614 long2 __builtin_ia32_psrlv2di(long2, long2) pure @safe; 3615 3616 pragma(LDC_intrinsic, "llvm.x86.avx2.psrlv.q.256") 3617 long4 __builtin_ia32_psrlv4di(long4, long4) pure @safe; 3618 3619 +/