/**
* AVX2 intrinsics.
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=AVX2
*
* Copyright: Guillaume Piolat 2022.
*            Johan Engelen 2022.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.avx2intrin;

// AVX2 instructions
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=AVX2
// Note: this header will work whether you have AVX2 enabled or not.
// With LDC, use "dflags-ldc": ["-mattr=+avx2"] or equivalent to actively
// generate AVX2 instructions.
// With GDC, use "dflags-gdc": ["-mavx2"] or equivalent to actively
// generate AVX2 instructions.
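//
// As a minimal sketch (the package name below is hypothetical; the flags are the
// ones quoted above), a dub.json fragment enabling AVX2 code generation could
// look like:
//
//     {
//         "name": "myapp",
//         "dflags-ldc": ["-mattr=+avx2"],
//         "dflags-gdc": ["-mavx2"]
//     }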


// Note: many special cases for GDC, because when supporting SIMD_COMPARISON_MASKS_32B
// but not having AVX2, the replaced operators have terrible performance.
// Mostly a problem for -mavx on x86.

public import inteli.types;
import inteli.internals;

// Pull in all previous instruction set intrinsics.
public import inteli.avxintrin;

nothrow @nogc:

/// Compute the absolute value of packed signed 16-bit integers in `a`.
__m256i _mm256_abs_epi16 (__m256i a) @trusted
{
    // PERF DMD
    version(LDC)
        enum split = true; // always beneficial in LDC with neon, ssse3, or even sse2
    else
        enum split = GDC_with_SSSE3;

    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pabsw256(cast(short16)a);
    }
    else static if (__VERSION__ >= 2097 && LDC_with_AVX2)
    {
        // Before LDC 1.27, the llvm.abs LLVM intrinsic didn't exist, hence there was
        // no good way to do abs(256-bit).
        return cast(__m256i) inteli_llvm_abs!short16(cast(short16)a, false);
    }
    else static if (split)
    {
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i r_lo = _mm_abs_epi16(a_lo);
        __m128i r_hi = _mm_abs_epi16(a_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
    else
    {
        short16 sa = cast(short16)a;
        for (int i = 0; i < 16; ++i)
        {
            short s = sa.array[i];
            sa.ptr[i] = s >= 0 ? s : cast(short)(-cast(int)(s));
        }
        return cast(__m256i)sa;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi16(0, -1, -32768, 32767, 10, -10, 1000, -1000,
                                  1, -1, -32768, 32767, 12, -13, 1000, -1040);
    short16 B = cast(short16) _mm256_abs_epi16(A);
    short[16] correct = [0, 1, -32768, 32767, 10, 10, 1000, 1000,
                         1, 1, -32768, 32767, 12, 13, 1000, 1040];
    assert(B.array == correct);
}

/// Compute the absolute value of packed signed 32-bit integers in `a`.
__m256i _mm256_abs_epi32 (__m256i a) @trusted
{
    // PERF DMD
    version(LDC)
        enum split = true; // always beneficial in LDC with neon, ssse3, or even sse2
    else
        enum split = false; // GDC manages to split and use pabsd in SSSE3 without guidance

    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pabsd256(cast(int8)a);
    }
    else static if (__VERSION__ >= 2097 && LDC_with_AVX2)
    {
        // Before LDC 1.27, the llvm.abs LLVM intrinsic didn't exist, hence there was
        // no good way to do abs(256-bit).
        return cast(__m256i) inteli_llvm_abs!int8(cast(int8)a, false);
    }
    else static if (split)
    {
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i r_lo = _mm_abs_epi32(a_lo);
        __m128i r_hi = _mm_abs_epi32(a_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
    else
    {
        int8 sa = cast(int8)a;
        for (int i = 0; i < 8; ++i)
        {
            int s = sa.array[i];
            sa.ptr[i] = (s >= 0 ? s : -s);
        }
        return cast(__m256i)sa;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi32(0, -1, -2_147_483_648, -2_147_483_647, -1, 0, -2_147_483_648, -2_147_483_646);
    int8 B = cast(int8) _mm256_abs_epi32(A);
    int[8] correct = [0, 1, -2_147_483_648, 2_147_483_647, 1, 0, -2_147_483_648, 2_147_483_646];
    assert(B.array == correct);
}

/// Compute the absolute value of packed signed 8-bit integers in `a`.
__m256i _mm256_abs_epi8 (__m256i a) @trusted
{
    // PERF DMD
    // PERF GDC with SSSE3 to AVX doesn't use pabsb, and splitting is catastrophic because of _mm_min_epu8
    version(LDC)
        enum split = true; // always beneficial in LDC with neon, ssse3, or sse2
    else
        enum split = false;

    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pabsb256(cast(ubyte32)a);
    }
    else static if (__VERSION__ >= 2097 && LDC_with_AVX2)
    {
        // Before LDC 1.27, the llvm.abs LLVM intrinsic didn't exist, hence there was
        // no good way to do abs(256-bit).
        return cast(__m256i) inteli_llvm_abs!byte32(cast(byte32)a, false);
    }
    else static if (split)
    {
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i r_lo = _mm_abs_epi8(a_lo);
        __m128i r_hi = _mm_abs_epi8(a_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
    else
    {
        // Basically this loop is poison for the LDC optimizer
        byte32 sa = cast(byte32)a;
        for (int i = 0; i < 32; ++i)
        {
            byte s = sa.array[i];
            sa.ptr[i] = s >= 0 ? s : cast(byte)(-cast(int)(s));
        }
        return cast(__m256i)sa;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi8(0, -1, -128, -127, 127,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0,
                                 0, -1, -128, -126, 127, -6, -5, -4, -3, -2, 0, 1, 2, 3, 4, 5);
    byte32 B = cast(byte32) _mm256_abs_epi8(A);
    byte[32] correct =          [0,  1, -128,  127, 127,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0,
                                 0,  1, -128,  126, 127,  6,  5,  4,  3,  2, 0, 1, 2, 3, 4, 5];
    assert(B.array == correct);
}

/// Add packed 16-bit integers in `a` and `b`.
__m256i _mm256_add_epi16 (__m256i a, __m256i b) pure @safe
{
    pragma(inline, true);
    return cast(__m256i)(cast(short16)a + cast(short16)b);
}
unittest
{
    __m256i A = _mm256_setr_epi16( -7, -1, 0, 9, -100, 100, 234, 432, -32768, 32767, 0, -1, -20000, 0,  6, -2);
    short16 R = cast(short16) _mm256_add_epi16(A, A);
    short[16] correct         = [ -14, -2, 0, 18, -200, 200, 468, 864,     0,    -2, 0, -2,  25536, 0, 12, -4 ];
    assert(R.array == correct);
}

/// Add packed 32-bit integers in `a` and `b`.
__m256i _mm256_add_epi32(__m256i a, __m256i b) pure @safe
{
    pragma(inline, true);
    return cast(__m256i)(cast(int8)a + cast(int8)b);
}
unittest
{
    __m256i A = _mm256_setr_epi32( -7, -1, 0, 9, -100, 100, 234, 432);
    int8 R = cast(int8) _mm256_add_epi32(A, A);
    int[8] correct = [ -14, -2, 0, 18, -200, 200, 468, 864 ];
    assert(R.array == correct);
}

/// Add packed 64-bit integers in `a` and `b`.
__m256i _mm256_add_epi64 (__m256i a, __m256i b) pure @safe
{
    pragma(inline, true);
    return a + b;
}
unittest
{
    __m256i A = _mm256_setr_epi64(-1, 0x8000_0000_0000_0000, 42, -12);
    long4 R = cast(long4) _mm256_add_epi64(A, A);
    long[4] correct = [ -2, 0, 84, -24 ];
    assert(R.array == correct);
}

/// Add packed 8-bit integers in `a` and `b`.
__m256i _mm256_add_epi8 (__m256i a, __m256i b) pure @safe
{
    pragma(inline, true);
    return cast(__m256i)(cast(byte32)a + cast(byte32)b);
}
unittest
{
    __m256i A = _mm256_setr_epi8(4, 8, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -1, 0, 9, 78,
                                 4, 9, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -2, 0, 10, 78);
    byte32 R = cast(byte32) _mm256_add_epi8(A, A);
    byte[32] correct = [8, 16, 26, -14, -2, 0, 18, -102, 8, 16, 26, -14, -2, 0, 18, -100,
                        8, 18, 26, -14, -2, 0, 18, -102, 8, 16, 26, -14, -4, 0, 20, -100];
    assert(R.array == correct);
}

/// Add packed 16-bit signed integers in `a` and `b` using signed saturation.
__m256i _mm256_adds_epi16 (__m256i a, __m256i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_paddsw256(cast(short16)a, cast(short16)b);
    }
    else static if (LDC_with_saturated_intrinsics)
    {
        return cast(__m256i) inteli_llvm_adds!short16(cast(short16)a, cast(short16)b);
    }
    else
    {
        short16 r;
        short16 sa = cast(short16)a;
        short16 sb = cast(short16)b;
        foreach(i; 0..16)
            r.ptr[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]);
        return cast(__m256i)r;
    }
}
unittest
{
    short16 res = cast(short16) _mm256_adds_epi16(_mm256_setr_epi16( 7,  6,  5, -32768, 3, 3, 32767,   0,  7,  6,  5, -32768, 3, 3, 32767,   0),
                                                  _mm256_setr_epi16( 7,  6,  5, -30000, 3, 1,     1, -10,  7,  6,  5, -30000, 3, 1,     1, -10));
    static immutable short[16] correctResult                    =  [14, 12, 10, -32768, 6, 4, 32767, -10, 14, 12, 10, -32768, 6, 4, 32767, -10];
    assert(res.array == correctResult);
}

/// Add packed 8-bit signed integers in `a` and `b` using signed saturation.
__m256i _mm256_adds_epi8 (__m256i a, __m256i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_paddsb256(cast(ubyte32)a, cast(ubyte32)b);
    }
    else static if (LDC_with_saturated_intrinsics)
    {
        return cast(__m256i) inteli_llvm_adds!byte32(cast(byte32)a, cast(byte32)b);
    }
    else
    {
        byte32 r;
        byte32 sa = cast(byte32)a;
        byte32 sb = cast(byte32)b;
        foreach(i; 0..32)
            r.ptr[i] = saturateSignedWordToSignedByte(sa.array[i] + sb.array[i]);
        return cast(__m256i)r;
    }
}
unittest
{
    byte32 res = cast(byte32) _mm256_adds_epi8(_mm256_setr_epi8(15, 14, 13, 12, 11, 127, 9, 8, 7, 6, 5, -128, 3, 2, 1, 0, 15, 14, 13, 12, 11, 127, 9, 8, 7, 6, 5, -128, 3, 2, 1, 0),
                                               _mm256_setr_epi8(15, 14, 13, 12, 11,  10, 9, 8, 7, 6, 5,   -4, 3, 2, 1, 0, 15, 14, 13, 12, 11,  10, 9, 8, 7, 6, 5,   -4, 3, 2, 1, 0));
    static immutable byte[32] correctResult                  = [30, 28, 26, 24, 22, 127,18,16,14,12,10, -128, 6, 4, 2, 0, 30, 28, 26, 24, 22, 127,18,16,14,12,10, -128, 6, 4, 2, 0];
    assert(res.array == correctResult);
}

/// Add packed 16-bit unsigned integers in `a` and `b` using unsigned saturation.
__m256i _mm256_adds_epu16 (__m256i a, __m256i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_paddusw256(cast(short16)a, cast(short16)b);
    }
    else static if (LDC_with_saturated_intrinsics)
    {
        return cast(__m256i) inteli_llvm_addus!short16(cast(short16)a, cast(short16)b);
    }
    else
    {
        short16 r;
        short16 sa = cast(short16)a;
        short16 sb = cast(short16)b;
        foreach(i; 0..16)
            r.ptr[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]));
        return cast(__m256i)r;
    }
}
unittest
{
    short16 res = cast(short16) _mm256_adds_epu16(_mm256_set_epi16(3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0),
                                                  _mm256_set_epi16(3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0));
    static immutable short[16] correctResult = [0, cast(short)65535, 4, 6, 0, cast(short)65535, 4, 6, 0, cast(short)65535, 4, 6, 0, cast(short)65535, 4, 6];
    assert(res.array == correctResult);
}

/// Add packed 8-bit unsigned integers in `a` and `b` using unsigned saturation.
__m256i _mm256_adds_epu8 (__m256i a, __m256i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_paddusb256(cast(ubyte32)a, cast(ubyte32)b);
    }
    else static if (LDC_with_saturated_intrinsics)
    {
        return cast(__m256i) inteli_llvm_addus!byte32(cast(byte32)a, cast(byte32)b);
    }
    else
    {
        byte32 r;
        byte32 sa = cast(byte32)a;
        byte32 sb = cast(byte32)b;
        foreach(i; 0..32)
            r.ptr[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]));
        return cast(__m256i)r;
    }
}
unittest
{
    __m256i A          = _mm256_setr_epi8(0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, cast(byte)255, 0, 0, 0, 0, 0, 0, 0, 0, cast(byte)136, 0, 0, 0, cast(byte)136, 0, 0, 0, 0, 0, 0);
    __m256i B          = _mm256_setr_epi8(0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0,             1, 0, 0, 0, 0, 0, 0, 0, 0, cast(byte)136, 0, 0, 0,            40, 0, 0, 0, 0, 0, 0);
    byte32 R = cast(byte32) _mm256_adds_epu8(A, B);
    static immutable byte[32] correct =  [0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, cast(byte)255, 0, 0, 0, 0, 0, 0, 0, 0, cast(byte)255, 0, 0, 0, cast(byte)176, 0, 0, 0, 0, 0, 0];
    assert(R.array == correct);
}

/// Concatenate pairs of 16-byte blocks in `a` and `b` into a 32-byte temporary result, shift the
/// result right by `count` bytes, and return the low 16 bytes of that, in each 128-bit lane.
__m256i _mm256_alignr_epi8(ubyte count)(__m256i a, __m256i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX2)
    {
        return cast(__m256i)__builtin_ia32_palignr256(a, b, count * 8);
    }
    else
    {
        // Note that palignr 256-bit does the same as palignr 128-bit, by lane. Can split.
        // With LDC 1.24 + avx2 feature + -O2, that correctly gives an AVX2 vpalignr despite being split.
        // I guess we could do it with a big 32-item shufflevector, but not sure if that's best.
        // 2 inst on ARM64 neon, which is optimal.
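        // As the unit test below illustrates, a count of 32 or more shifts the
        // whole 32-byte temporary out, so every lane becomes all zeroes.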
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_alignr_epi8!count(a_lo, b_lo);
        __m128i r_hi = _mm_alignr_epi8!count(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8( 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16);
    __m128i B = _mm_setr_epi8(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
    __m256i AA = _mm256_set_m128i(A, A);
    __m256i BB = _mm256_set_m128i(B, B);

    {
        byte32 C = cast(byte32) _mm256_alignr_epi8!0(AA, BB);
        byte[32] correct = [17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
        assert(C.array == correct);
    }
    {
        byte32 C = cast(byte32) _mm256_alignr_epi8!20(AA, BB);
        byte[32] correct = [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0, 0, 0, 0, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0, 0, 0, 0];
        assert(C.array == correct);
    }
    {
        byte32 C = cast(byte32) _mm256_alignr_epi8!34(AA, BB);
        byte[32] correct = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
        assert(C.array == correct);
    }
}

/// Compute the bitwise AND of 256 bits (representing integer data) in `a` and `b`.
__m256i _mm256_and_si256 (__m256i a, __m256i b) pure @safe
{
    pragma(inline, true);
    return a & b;
}
unittest
{
    __m256i A = _mm256_set1_epi32(7);
    __m256i B = _mm256_set1_epi32(14);
    int8 R = cast(int8) _mm256_and_si256(A, B);
    int[8] correct = [6, 6, 6, 6, 6, 6, 6, 6];
    assert(R.array == correct);
}

/// Compute the bitwise NOT of 256 bits (representing integer data) in `a` and then AND with `b`.
__m256i _mm256_andnot_si256 (__m256i a, __m256i b) pure @safe
{
    // See: https://issues.dlang.org/show_bug.cgi?id=24283,
    // need workaround if we ever use DMD AVX codegen

    pragma(inline, true);
    return (~a) & b;
}
unittest
{
    __m256i A = _mm256_setr_epi32(7, -2, 9, 54654, 7, -2, 9, 54654);
    __m256i B = _mm256_setr_epi32(14, 78, 111, -256, 14, 78, 111, -256);
    int8 R = cast(int8) _mm256_andnot_si256(A, B);
    int[8] correct = [8, 0, 102, -54784, 8, 0, 102, -54784];
    assert(R.array == correct);
}

/// Average packed unsigned 16-bit integers in `a` and `b`.
__m256i _mm256_avg_epu16 (__m256i a, __m256i b) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pavgw256(cast(short16)a, cast(short16)b);
    }
    else static if (LDC_with_AVX2 && __VERSION__ >= 2094)
    {
        return cast(__m256i) __builtin_ia32_pavgw256(cast(short16)a, cast(short16)b);
    }
    else
    {
        // Splitting is always beneficial here, except at -O0
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_avg_epu16(a_lo, b_lo);
        __m128i r_hi = _mm_avg_epu16(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    __m256i A = _mm256_set1_epi16(31457);
    __m256i B = _mm256_set1_epi16(cast(short)64000);
    short16 avg = cast(short16)(_mm256_avg_epu16(A, B));
    foreach(i; 0..16)
        assert(avg.array[i] == cast(short)47729);
}

/// Average packed unsigned 8-bit integers in `a` and `b`.
__m256i _mm256_avg_epu8 (__m256i a, __m256i b) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pavgb256(cast(ubyte32)a, cast(ubyte32)b);
    }
    else static if (LDC_with_AVX2 && __VERSION__ >= 2094)
    {
        return cast(__m256i) __builtin_ia32_pavgb256(cast(byte32)a, cast(byte32)b);
    }
    else
    {
        // Splitting is always beneficial here, except at -O0
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_avg_epu8(a_lo, b_lo);
        __m128i r_hi = _mm_avg_epu8(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    __m256i A = _mm256_set1_epi8(-1);
    __m256i B = _mm256_set1_epi8(13);
    byte32 avg = cast(byte32)(_mm256_avg_epu8(A, B));
    foreach(i; 0..32)
        assert(avg.array[i] == cast(byte)134);
}

/// Blend packed 16-bit integers from `a` and `b` using 8-bit control mask `imm8`,
/// applied within each of the two 128-bit lanes.
/// Note: this is functionally equivalent to two `_mm_blend_epi16`.
__m256i _mm256_blend_epi16(int imm8) (__m256i a, __m256i b) pure @trusted
{
    // PERF DMD
    static assert(imm8 >= 0 && imm8 < 256);
    enum bool split = true; // makes things better, except on ARM32 which is no better than naive

    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pblendw256(cast(short16)a, cast(short16)b, imm8);
    }
    else static if (split)
    {
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_blend_epi16!(imm8)(a_lo, b_lo);
        __m128i r_hi = _mm_blend_epi16!(imm8)(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    __m256i A = _mm256_setr_epi16(0, 1,  2,  3,  4,  5,  6,  7,  0, -1,  -2,  -3,  -4,  -5,  -6,  -7);
    __m256i B = _mm256_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15, -8, -9, -10, -11, -12, -13, -14, -15);
    short16 C = cast(short16) _mm256_blend_epi16!147(A, B); // 10010011 10010011
    short[16] correct =        [8, 9,  2,  3, 12,  5,  6, 15, -8, -9,  -2, -3, -12,  -5,  -6, -15];
    assert(C.array == correct);
}

/// Blend packed 32-bit integers from `a` and `b` using 4-bit control mask `imm8`.
__m128i _mm_blend_epi32(int imm8)(__m128i a, __m128i b) pure @trusted
{
    // This one is interesting: it is functionally equivalent to SSE4.1 blendps (_mm_blend_ps),
    // so without AVX2 we can always fall back to _mm_blend_ps.
    // And indeed, a shufflevector!int4 doesn't even use vpblendd with LDC, and prefers
    // blendps and shufps, so why bother.

    // PERF DMD
    static assert(imm8 >= 0 && imm8 < 16);
    static if (GDC_with_AVX2)
    {
        return __builtin_ia32_pblendd128(a, b, imm8);
    }
    else
    {
        return cast(__m128i) _mm_blend_ps!imm8(cast(__m128)a, cast(__m128)b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 1,  2,  3);
    __m128i B = _mm_setr_epi32(8, 9, 10, 11);
    int4 C = _mm_blend_epi32!13(A, B); // 1101
    int[4] correct =    [8, 1, 10, 11];
    assert(C.array == correct);
}

/// Blend packed 32-bit integers from `a` and `b` using 8-bit control mask `imm8`.
__m256i _mm256_blend_epi32(int imm8)(__m256i a, __m256i b) pure @trusted
{
    // This one is functionally equivalent to AVX _mm256_blend_ps, except with integers.
    // With LDC, doing a shufflevector here would select the vblendps instruction anyway,
    // so we might as well defer to _mm256_blend_ps.

    // PERF DMD
    static assert(imm8 >= 0 && imm8 < 256);
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pblendd256 (cast(int8)a, cast(int8)b, imm8);
    }
    else
    {
        return cast(__m256i) _mm256_blend_ps!imm8(cast(__m256)a, cast(__m256)b);
    }
}
unittest
{
    __m256i A = _mm256_setr_epi32(0, 1,  2,  3,  4,  5,  6,  7);
    __m256i B = _mm256_setr_epi32(8, 9, 10, 11, 12, 13, 147, 15);
    int8 C = cast(int8) _mm256_blend_epi32!0xe7(A, B); // 11100111
    int[8] correct =             [8, 9, 10,  3,  4, 13, 147, 15];
    assert(C.array == correct);
}

// TODO __m256i _mm256_blendv_epi8 (__m256i a, __m256i b, __m256i mask) pure @safe

/// Broadcast the low packed 8-bit integer from `a` to all elements of result.
__m128i _mm_broadcastb_epi8 (__m128i a) pure @safe
{
    byte16 ba = cast(byte16)a;
    byte16 r;
    r = ba.array[0];
    return cast(__m128i)r;
}
unittest
{
    byte16 A;
    A.ptr[0] = 2;
    byte16 B = cast(byte16) _mm_broadcastb_epi8(cast(__m128i)A);
    byte[16] correct = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2];
    assert(B.array == correct);
}

/// Broadcast the low packed 8-bit integer from `a` to all elements of result.
__m256i _mm256_broadcastb_epi8(__m128i a) pure @safe
{
    byte16 ba = cast(byte16)a;
    byte32 r;
    r = ba.array[0];
    return cast(__m256i)r;
}
unittest
{
    byte16 A;
    A.ptr[0] = 2;
    byte32 B = cast(byte32) _mm256_broadcastb_epi8(cast(__m128i)A);
    byte[32] correct = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
                        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2];
    assert(B.array == correct);
}

/// Broadcast the low packed 32-bit integer from `a` to all elements of result.
__m128i _mm_broadcastd_epi32 (__m128i a) pure @safe
{
    int4 ba = cast(int4)a;
    int4 r;
    r = ba.array[0];
    return cast(__m128i)r;
}
unittest
{
    int4 A;
    A.ptr[0] = -2;
    int4 B = cast(int4) _mm_broadcastd_epi32(cast(__m128i)A);
    int[4] correct = [-2, -2, -2, -2];
    assert(B.array == correct);
}

/// Broadcast the low packed 32-bit integer from `a` to all elements of result.
__m256i _mm256_broadcastd_epi32 (__m128i a) pure @safe
{
    int4 ba = cast(int4)a;
    int8 r;
    r = ba.array[0];
    return cast(__m256i)r;
}
unittest
{
    int4 A;
    A.ptr[0] = -2;
    int8 B = cast(int8) _mm256_broadcastd_epi32(cast(__m128i)A);
    int[8] correct = [-2, -2, -2, -2, -2, -2, -2, -2];
    assert(B.array == correct);
}

/// Broadcast the low packed 64-bit integer from `a` to all elements of result.
__m128i _mm_broadcastq_epi64 (__m128i a) pure @safe
{
    long2 ba = cast(long2)a;
    long2 r;
    r = ba.array[0];
    return cast(__m128i)r;
}
unittest
{
    long2 A;
    A.ptr[0] = -2;
    long2 B = cast(long2) _mm_broadcastq_epi64(cast(__m128i)A);
    long[2] correct = [-2, -2];
    assert(B.array == correct);
}

/// Broadcast the low packed 64-bit integer from `a` to all elements of result.
__m256i _mm256_broadcastq_epi64 (__m128i a) pure @safe
{
    long2 ba = cast(long2)a;
    long4 r;
    r = ba.array[0];
    return cast(__m256i)r;
}
unittest
{
    long2 A;
    A.ptr[0] = -2;
    long4 B = cast(long4) _mm256_broadcastq_epi64(cast(__m128i)A);
    long[4] correct = [-2, -2, -2, -2];
    assert(B.array == correct);
}

/// Broadcast the low double-precision (64-bit) floating-point element from `a` to all elements of result.
__m128d _mm_broadcastsd_pd (__m128d a) pure @safe
{
    double2 r;
    r = a.array[0];
    return r;
}
unittest
{
    double2 A;
    A.ptr[0] = 2;
    double2 B = _mm_broadcastsd_pd(A);
    double[2] correct = [2.0, 2.0];
    assert(B.array == correct);
}

/// Broadcast the low double-precision (64-bit) floating-point element from `a` to all elements of result.
__m256d _mm256_broadcastsd_pd (__m128d a) pure @safe
{
    double4 r;
    r = a.array[0];
    return r;
}
unittest
{
    double2 A;
    A.ptr[0] = 3;
    double4 B = _mm256_broadcastsd_pd(A);
    double[4] correct = [3.0, 3, 3, 3];
    assert(B.array == correct);
}

/// Broadcast 128 bits of integer data from `a` to all 128-bit lanes of result.
/// Note: also exists under the name `_mm256_broadcastsi128_si256`, which is identical.
__m256i _mm_broadcastsi128_si256 (__m128i a) pure @trusted
{
    // Note that GDC will prefer vinserti128 to vbroadcast, for some reason.
    // So in the end it's the same as the naive code.
    // For this reason, __builtin_ia32_vbroadcastsi256 isn't used.
    long2 ba = cast(long2)a;
    long4 r;
    r.ptr[0] = ba.array[0];
    r.ptr[1] = ba.array[1];
    r.ptr[2] = ba.array[0];
    r.ptr[3] = ba.array[1];
    return cast(__m256i)r;
}
unittest
{
    long2 A;
    A.ptr[0] = 34;
    A.ptr[1] = -56;
    long4 B = cast(long4) _mm_broadcastsi128_si256(cast(__m128i)A);
    long[4] correct = [34, -56, 34, -56];
    assert(B.array == correct);
}

///ditto
alias _mm256_broadcastsi128_si256 = _mm_broadcastsi128_si256; // the intrinsic is duplicated in the Guide, for some reason

/// Broadcast the low single-precision (32-bit) floating-point element from `a` to all elements of result.
__m128 _mm_broadcastss_ps (__m128 a) pure @safe
{
    float4 r;
    r = a.array[0];
    return r;
}
unittest
{
    float4 A;
    A.ptr[0] = 2;
    float4 B = _mm_broadcastss_ps(A);
    float[4] correct = [2.0f, 2, 2, 2];
    assert(B.array == correct);
}

/// Broadcast the low single-precision (32-bit) floating-point element from `a` to all elements of result.
__m256 _mm256_broadcastss_ps (__m128 a) pure @safe
{
    float8 r;
    r = a.array[0];
    return r;
}
unittest
{
    float4 A;
    A.ptr[0] = 2;
    float8 B = _mm256_broadcastss_ps(A);
    float[8] correct = [2.0f, 2, 2, 2, 2, 2, 2, 2];
    assert(B.array == correct);
}

/// Broadcast the low packed 16-bit integer from `a` to all elements of result.
__m128i _mm_broadcastw_epi16 (__m128i a) pure @safe
{
    short8 ba = cast(short8)a;
    short8 r;
    r = ba.array[0];
    return cast(__m128i)r;
}
unittest
{
    short8 A;
    A.ptr[0] = 13;
    short8 B = cast(short8) _mm_broadcastw_epi16(cast(__m128i)A);
    short[8] correct = [13, 13, 13, 13, 13, 13, 13, 13];
    assert(B.array == correct);
}

/// Broadcast the low packed 16-bit integer from `a` to all elements of result.
__m256i _mm256_broadcastw_epi16 (__m128i a) pure @safe
{
    short8 ba = cast(short8)a;
    short16 r;
    r = ba.array[0];
    return cast(__m256i)r;
}
unittest
{
    short8 A;
    A.ptr[0] = 13;
    short16 B = cast(short16) _mm256_broadcastw_epi16(cast(__m128i)A);
    short[16] correct = [13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13];
    assert(B.array == correct);
}

// TODO __m256i _mm256_bslli_epi128 (__m256i a, const int imm8) pure @safe
// TODO __m256i _mm256_bsrli_epi128 (__m256i a, const int imm8) pure @safe

/// Compare packed 16-bit integers in `a` and `b` for equality.
__m256i _mm256_cmpeq_epi16 (__m256i a, __m256i b) pure @trusted
{
    // PERF: GDC without AVX
    // PERF: DMD
    static if (SIMD_COMPARISON_MASKS_32B)
    {
        // PERF: catastrophic in GDC without AVX2
        return cast(__m256i)(cast(short16)a == cast(short16)b);
    }
    else static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pcmpeqw256(cast(short16)a, cast(short16)b);
    }
    else version(LDC)
    {
        return cast(__m256i) equalMask!short16(cast(short16)a, cast(short16)b);
    }
    else
    {
        short16 sa = cast(short16)a;
        short16 sb = cast(short16)b;
        short16 sr;
        for (int n = 0; n < 16; ++n)
        {
            bool cond = sa.array[n] == sb.array[n];
            sr.ptr[n] = cond ? -1 : 0;
        }
        return cast(__m256i) sr;
    }
}
unittest
{
    short16   A = [-3, -2, -1,  0,  0,  1,  2,  3, -3, -2, -1,  0,  0,  1,  2,  3];
    short16   B = [ 4,  3,  2,  1,  0, -1, -2, -3, -3,  3,  2,  1,  0, -1, -2, -3];
    short[16] E = [ 0,  0,  0,  0, -1,  0,  0,  0, -1,  0,  0,  0, -1,  0,  0,  0];
    short16   R = cast(short16)(_mm256_cmpeq_epi16(cast(__m256i)A, cast(__m256i)B));
    assert(R.array == E);
}

/// Compare packed 32-bit integers in `a` and `b` for equality.
__m256i _mm256_cmpeq_epi32 (__m256i a, __m256i b) pure @trusted
{
    // PERF: GDC without AVX
    // PERF: DMD
    static if (SIMD_COMPARISON_MASKS_32B)
    {
        // Quite bad in GDC -mavx (with no AVX2)
        return cast(__m256i)(cast(int8)a == cast(int8)b);
    }
    else static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pcmpeqd256(cast(int8)a, cast(int8)b);
    }
    else version(LDC)
    {
        return cast(__m256i) equalMask!int8(cast(int8)a, cast(int8)b);
    }
    else
    {
        int8 ia = cast(int8)a;
        int8 ib = cast(int8)b;
        int8 ir;
        for (int n = 0; n < 8; ++n)
        {
            bool cond = ia.array[n] == ib.array[n];
            ir.ptr[n] = cond ? -1 : 0;
        }
        return cast(__m256i) ir;
    }
}
unittest
{
    int8   A = [-3, -2, -1,  0, -3, -2, -1,  0];
    int8   B = [ 4, -2,  2,  0,  4, -2,  2,  0];
    int[8] E = [ 0, -1,  0, -1,  0, -1,  0, -1];
    int8   R = cast(int8)(_mm256_cmpeq_epi32(cast(__m256i)A, cast(__m256i)B));
    assert(R.array == E);
}

/// Compare packed 64-bit integers in `a` and `b` for equality.
__m256i _mm256_cmpeq_epi64 (__m256i a, __m256i b) pure @trusted
{
    // PERF: GDC without AVX
    // PERF: DMD
    static if (SIMD_COMPARISON_MASKS_32B)
    {
        // Note: enabling this with DMD will probably lead to the same bug as _mm_cmpeq_epi64
        return cast(__m256i)(cast(long4)a == cast(long4)b);
    }
    else static if (GDC_with_AVX2)
    {
        return cast(__m256i)__builtin_ia32_pcmpeqq256(cast(long4)a, cast(long4)b);
    }
    else version(LDC)
    {
        return cast(__m256i) equalMask!long4(cast(long4)a, cast(long4)b);
    }
    else
    {
        long4 la = cast(long4)a;
        long4 lb = cast(long4)b;
        long4 res;
        res.ptr[0] = (la.array[0] == lb.array[0]) ? -1 : 0;
        res.ptr[1] = (la.array[1] == lb.array[1]) ? -1 : 0;
        res.ptr[2] = (la.array[2] == lb.array[2]) ? -1 : 0;
        res.ptr[3] = (la.array[3] == lb.array[3]) ? -1 : 0;
        return cast(__m256i)res;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi64(-1, -2, -1, -2);
    __m256i B = _mm256_setr_epi64(-3, -2, -3, -3);
    __m256i C = _mm256_setr_epi64(-1, -4, -1, -2);
    long4 AB = cast(long4) _mm256_cmpeq_epi64(A, B);
    long4 AC = cast(long4) _mm256_cmpeq_epi64(A, C);
    long[4] correct1 = [ 0, -1,  0,  0];
    long[4] correct2 = [-1,  0, -1, -1];
    assert(AB.array == correct1);
    assert(AC.array == correct2);
}

/// Compare packed 8-bit integers in `a` and `b` for equality.
__m256i _mm256_cmpeq_epi8 (__m256i a, __m256i b) pure @trusted
{
    // PERF: GDC without AVX2, need split
    // PERF: DMD
    static if (SIMD_COMPARISON_MASKS_32B)
    {
        return cast(__m256i)(cast(byte32)a == cast(byte32)b);
    }
    else static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pcmpeqb256(cast(ubyte32)a, cast(ubyte32)b);
    }
    else version(LDC)
    {
        return cast(__m256i) equalMask!byte32(cast(byte32)a, cast(byte32)b);
    }
    else
    {
        byte32 ba = cast(byte32)a;
        byte32 bb = cast(byte32)b;
        byte32 br;
        for (int n = 0; n < 32; ++n)
        {
            bool cond = ba.array[n] == bb.array[n];
            br.ptr[n] = cond ? -1 : 0;
        }
        return cast(__m256i) br;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1,
                                 1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 42);
    __m256i B = _mm256_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1,
                                 2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
    byte32 C = cast(byte32) _mm256_cmpeq_epi8(A, B);
    byte[32] correct =       [0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0, -1,
                              0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0,  0];
    assert(C.array == correct);
}

/// Compare packed signed 16-bit integers in `a` and `b` for greater-than.
__m256i _mm256_cmpgt_epi16 (__m256i a, __m256i b) pure @safe
{
    version(GNU)
        enum bool mayUseComparisonOperator = GDC_with_AVX2; // too slow in GDC without AVX2
    else
        enum bool mayUseComparisonOperator = true;

    static if (SIMD_COMPARISON_MASKS_32B && mayUseComparisonOperator)
    {
        return cast(__m256i)(cast(short16)a > cast(short16)b);
    }
    else static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pcmpgtw256(cast(short16)a, cast(short16)b);
    }
    else // split
    {
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_cmpgt_epi16(a_lo, b_lo);
        __m128i r_hi = _mm_cmpgt_epi16(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    short16   A = [-3, -2, -1,  0,  0,  1,  2,  3, -3, -2, -1,  0,  0,  1,  2,  3];
    short16   B = [ 4,  3,  2,  1,  0, -1, -2, -3,  4, -3,  2,  1,  0, -1, -2, -3];
    short[16] E = [ 0,  0,  0,  0,  0, -1, -1, -1,  0, -1,  0,  0,  0, -1, -1, -1];
    short16   R = cast(short16)(_mm256_cmpgt_epi16(cast(__m256i)A, cast(__m256i)B));
    assert(R.array == E);
}

/// Compare packed signed 32-bit integers in `a` and `b` for greater-than.
__m256i _mm256_cmpgt_epi32 (__m256i a, __m256i b) pure @safe
{
    version(GNU)
        enum bool mayUseComparisonOperator = GDC_with_AVX2; // too slow in GDC otherwise
    else
        enum bool mayUseComparisonOperator = true;

    static if (SIMD_COMPARISON_MASKS_32B && mayUseComparisonOperator)
    {
        return cast(__m256i)(cast(int8)a > cast(int8)b);
    }
    else static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pcmpgtd256(cast(int8)a, cast(int8)b);
    }
    else // split
    {
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_cmpgt_epi32(a_lo, b_lo);
        __m128i r_hi = _mm_cmpgt_epi32(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    int8   A = [-3,  2, -1,  0, -3,  2, -1,  0];
    int8   B = [ 4, -2,  2,  0,  4, -2,  2,  0];
    int[8] E = [ 0, -1,  0,  0,  0, -1,  0,  0];
    int8   R = cast(int8) _mm256_cmpgt_epi32(cast(__m256i)A, cast(__m256i)B);
    assert(R.array == E);
}

/// Compare packed signed 64-bit integers in `a` and `b` for greater-than.
__m256i _mm256_cmpgt_epi64 (__m256i a, __m256i b) pure @safe
{
    version(GNU)
        enum bool mayUseComparisonOperator = GDC_with_AVX2; // too slow in GDC otherwise
    else
        enum bool mayUseComparisonOperator = true;

    static if (SIMD_COMPARISON_MASKS_32B && mayUseComparisonOperator)
    {
        return cast(__m256i)(cast(long4)a > cast(long4)b);
    }
    else static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pcmpgtq256(cast(long4)a, cast(long4)b);
    }
    else // split
    {
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_cmpgt_epi64(a_lo, b_lo);
        __m128i r_hi = _mm_cmpgt_epi64(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    __m256i A = _mm256_setr_epi64(-3,  2, 70,  2);
    __m256i B = _mm256_setr_epi64 (4, -2,  4, -2);
    long[4] correct = [ 0, -1, -1, -1 ];
    long4 R = cast(long4)(_mm256_cmpgt_epi64(A, B));
    assert(R.array == correct);
}

/// Compare packed signed 8-bit integers in `a` and `b` for greater-than.
__m256i _mm256_cmpgt_epi8 (__m256i a, __m256i b) pure @safe
{
    version(GNU)
        enum bool mayUseComparisonOperator = GDC_with_AVX2; // too slow in GDC without AVX2
    else
        enum bool mayUseComparisonOperator = true;

    static if (SIMD_COMPARISON_MASKS_32B && mayUseComparisonOperator)
    {
        return cast(__m256i)(cast(byte32)a > cast(byte32)b);
    }
    else static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pcmpgtb256(cast(ubyte32)a, cast(ubyte32)b);
    }
    else // split
    {
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_cmpgt_epi8(a_lo, b_lo);
        __m128i r_hi = _mm_cmpgt_epi8(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    __m256i A = _mm256_setr_epi8(1, 2, 3, 1,  127, -80, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1,   1, 2, 3, 1,  127, -80, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
    __m256i B = _mm256_setr_epi8(2, 2, 1, 2, -128, -42, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1,   2, 2, 1, 2, -128, -42, 2, 3, 2, 1, 0, 0, 1, 2, 1, 0);
    byte32 C = cast(byte32) _mm256_cmpgt_epi8(A, B);
    byte[32] correct =          [0, 0,-1, 0,   -1,   0, 0, 0,-1,-1,-1, 0, 0, 0,-1, 0,   0, 0,-1, 0,   -1,   0, 0, 0,-1,-1,-1, 0, 0, 0,-1,-1];
    assert(C.array == correct);
}


/// Sign extend packed 16-bit integers in `a` to packed 32-bit integers.
__m256i _mm256_cvtepi16_epi32 (__m128i a) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pmovsxwd256(cast(short8)a);
    }
    else static if (LDC_with_optimizations)
    {
        enum ir = `
            %r = sext <8 x i16> %0 to <8 x i32>
            ret <8 x i32> %r`;
        return cast(__m256i) LDCInlineIR!(ir, int8, short8)(cast(short8)a);
    }
    else
    {
        short8 sa = cast(short8)a;
        int8 r;
        r.ptr[0] = sa.array[0];
        r.ptr[1] = sa.array[1];
        r.ptr[2] = sa.array[2];
        r.ptr[3] = sa.array[3];
        r.ptr[4] = sa.array[4];
        r.ptr[5] = sa.array[5];
        r.ptr[6] = sa.array[6];
        r.ptr[7] = sa.array[7];
        return cast(__m256i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, -1, 0, -32768, 32767);
    int8 C = cast(int8) _mm256_cvtepi16_epi32(A);
    int[8] correct = [-1, 0, -32768, 32767, -1, 0, -32768, 32767];
    assert(C.array == correct);
}


/// Sign extend packed 16-bit integers in `a` to packed 64-bit integers.
__m256i _mm256_cvtepi16_epi64 (__m128i a) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pmovsxwq256(cast(short8)a);
    }
    else static if (LDC_with_optimizations)
    {
        enum ir = `
            %v = shufflevector <8 x i16> %0,<8 x i16> %0, <4 x i32> <i32 0, i32 1,i32 2, i32 3>
            %r = sext <4 x i16> %v to <4 x i64>
            ret <4 x i64> %r`;
        return cast(__m256i) LDCInlineIR!(ir, long4, short8)(cast(short8)a);
    }
    else
    {
        // LDC x86 generates vpmovsxwq since LDC 1.12 -O1
        short8 sa = cast(short8)a;
        long4 r;
        r.ptr[0] = sa.array[0];
        r.ptr[1] = sa.array[1];
        r.ptr[2] = sa.array[2];
        r.ptr[3] = sa.array[3];
        return cast(__m256i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-1, 0, short.min, short.max, 2, 3, 4, 5);
    long4 C = cast(long4) _mm256_cvtepi16_epi64(A);
    long[4] correct = [-1, 0, short.min, short.max];
    assert(C.array == correct);
}

/// Sign extend packed 32-bit integers in `a` to packed 64-bit integers.
__m256i _mm256_cvtepi32_epi64 (__m128i a) pure @trusted
{
    long4 r;
    r.ptr[0] = a.array[0];
    r.ptr[1] = a.array[1];
    r.ptr[2] = a.array[2];
    r.ptr[3] = a.array[3];
    return cast(__m256i)r;
}
unittest
{
    __m128i A = _mm_setr_epi32(-1, 0, int.min, int.max);
    long4 C = cast(long4) _mm256_cvtepi32_epi64(A);
    long[4] correct = [-1, 0, int.min, int.max];
    assert(C.array == correct);
}

/// Sign extend packed 8-bit integers in `a` to packed 16-bit integers.
__m256i _mm256_cvtepi8_epi16 (__m128i a) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pmovsxbw256(cast(ubyte16)a);
    }
    else static if (LDC_with_optimizations)
    {
        enum ir = `
            %r = sext <16 x i8> %0 to <16 x i16>
            ret <16 x i16> %r`;
        return cast(__m256i) LDCInlineIR!(ir, short16, byte16)(cast(byte16)a);
    }
    else
    {
        short16 r;
        byte16 ba = cast(byte16)a;
        for (int n = 0; n < 16; ++n)
        {
            r.ptr[n] = ba.array[n];
        }
        return cast(__m256i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(-1, 0, byte.min, byte.max, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13);
    short16 C = cast(short16) _mm256_cvtepi8_epi16(A);
    short[16] correct = [-1, 0, byte.min, byte.max, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13];
    assert(C.array == correct);
}

/// Sign extend packed 8-bit integers in `a` to packed 32-bit integers.
__m256i _mm256_cvtepi8_epi32 (__m128i a) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pmovsxbd256(cast(ubyte16)a);
    }
    else static if (LDC_with_optimizations)
    {
        enum ir = `
            %v = shufflevector <16 x i8> %0,<16 x i8> undef, <8 x i32> <i32 0, i32 1,i32 2, i32 3, i32 4, i32 5,i32 6, i32 7>
            %r = sext <8 x i8> %v to <8 x i32>
            ret <8 x i32> %r`;
        return cast(__m256i) LDCInlineIR!(ir, int8, byte16)(cast(byte16)a);
    }
    else
    {
        // PERF: This is rather bad in GDC without AVX, or with DMD; should split that.
        int8 r;
        byte16 ba = cast(byte16)a;
        for (int n = 0; n < 8; ++n)
        {
            r.ptr[n] = ba.array[n];
        }
        return cast(__m256i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(-1, 0, byte.min, byte.max, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13);
    int8 C = cast(int8) _mm256_cvtepi8_epi32(A);
    int[8] correct = [-1, 0, byte.min, byte.max, 2, 3, 4, 5];
    assert(C.array == correct);
}

/// Sign extend packed 8-bit integers in the low 4 bytes of `a` to packed 64-bit integers.
__m256i _mm256_cvtepi8_epi64 (__m128i a) pure @trusted
{
    // PERF This is rather bad in GDC without AVX
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pmovsxbq256(cast(ubyte16)a);
    }
    else static if (LDC_with_ARM64)
    {
        // 4 inst since LDC 1.22 -O2
        return _mm256_cvtepi16_epi64(_mm_cvtepi8_epi16(a));
    }
    else static if (LDC_with_optimizations)
    {
        enum ir = `
            %v = shufflevector <16 x i8> %0,<16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
            %r = sext <4 x i8> %v to <4 x i64>
            ret <4 x i64> %r`;
        return cast(__m256i) LDCInlineIR!(ir, long4, byte16)(cast(byte16)a);
    }
    else
    {
        long4 r;
        byte16 ba = cast(byte16)a;
        for (int n = 0; n < 4; ++n)
        {
            r.ptr[n] = ba.array[n];
        }
        return cast(__m256i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(-1, 0, byte.min, byte.max, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13);
    long4 C = cast(long4) _mm256_cvtepi8_epi64(A);
    long[4] correct = [-1, 0, byte.min, byte.max];
    assert(C.array == correct);
}

/// Zero-extend packed unsigned 16-bit integers in `a` to packed 32-bit integers.
__m256i _mm256_cvtepu16_epi32(__m128i a) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pmovzxwd256(cast(short8)a);
    }
    else
    {
        short8 sa = cast(short8)a;
        int8 r;
        r.ptr[0] = cast(ushort)sa.array[0];
        r.ptr[1] = cast(ushort)sa.array[1];
        r.ptr[2] = cast(ushort)sa.array[2];
        r.ptr[3] = cast(ushort)sa.array[3];
        r.ptr[4] = cast(ushort)sa.array[4];
        r.ptr[5] = cast(ushort)sa.array[5];
        r.ptr[6] = cast(ushort)sa.array[6];
        r.ptr[7] = cast(ushort)sa.array[7];
        return cast(__m256i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, -1, 0, -32768, 32767);
    int8 C = cast(int8) _mm256_cvtepu16_epi32(A);
    int[8] correct = [65535, 0, 32768, 32767, 65535, 0, 32768, 32767];
    assert(C.array == correct);
}

/// Zero-extend packed unsigned 16-bit integers in `a` to packed 64-bit integers.
__m256i _mm256_cvtepu16_epi64(__m128i a) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pmovzxwq256(cast(short8)a);
    }
    else static if (LDC_with_optimizations)
    {
        enum ir = `
            %v = shufflevector <8 x i16> %0,<8 x i16> %0, <4 x i32> <i32 0, i32 1,i32 2, i32 3>
            %r = zext <4 x i16> %v to <4 x i64>
            ret <4 x i64> %r`;
        return cast(__m256i) LDCInlineIR!(ir, long4, short8)(cast(short8)a);
    }
    else
    {
        short8 sa = cast(short8)a;
        long4 r;
        r.ptr[0] = cast(ushort)sa.array[0];
        r.ptr[1] = cast(ushort)sa.array[1];
        r.ptr[2] = cast(ushort)sa.array[2];
        r.ptr[3] = cast(ushort)sa.array[3];
        return cast(__m256i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 2, 3, 4, 5);
    long4 C = cast(long4) _mm256_cvtepu16_epi64(A);
    long[4] correct = [65535, 0, 32768, 32767];
    assert(C.array == correct);
}

/// Zero-extend packed unsigned 32-bit integers in `a` to packed 64-bit integers.
__m256i _mm256_cvtepu32_epi64 (__m128i a) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pmovzxdq256(cast(int4)a);
    }
    else static if (LDC_with_optimizations)
    {
        enum ir = `
            %r = zext <4 x i32> %0 to <4 x i64>
            ret <4 x i64> %r`;
        return cast(__m256i) LDCInlineIR!(ir, long4, int4)(cast(int4)a);
    }
    else
    {
        long4 r;
        r.ptr[0] = cast(uint)a.array[0];
        r.ptr[1] = cast(uint)a.array[1];
        r.ptr[2] = cast(uint)a.array[2];
        r.ptr[3] = cast(uint)a.array[3];
        return cast(__m256i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(-1, 0, int.min, int.max);
    long4 C = cast(long4) _mm256_cvtepu32_epi64(A);
    long[4] correct = [uint.max, 0, 2_147_483_648, int.max];
    assert(C.array == correct);
}

/// Zero-extend packed unsigned 8-bit integers in `a` to packed 16-bit integers.
__m256i _mm256_cvtepu8_epi16 (__m128i a) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pmovzxbw256(cast(ubyte16)a);
    }
    else static if (LDC_with_optimizations)
    {
        enum ir = `
            %r = zext <16 x i8> %0 to <16 x i16>
            ret <16 x i16> %r`;
        return cast(__m256i) LDCInlineIR!(ir, short16, byte16)(cast(byte16)a);
    }
    else
    {
        short16 r;
        byte16 ba = cast(byte16)a;
        for (int n = 0; n < 16; ++n)
        {
            r.ptr[n] = cast(ubyte)ba.array[n];
        }
        return cast(__m256i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(-1, 0, -128, 127, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13);
    short16 C = cast(short16) _mm256_cvtepu8_epi16(A);
    short[16] correct     = [255, 0,  128, 127, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13];
    assert(C.array == correct);
}

/// Zero-extend packed unsigned 8-bit integers in `a` to packed 32-bit integers.
__m256i _mm256_cvtepu8_epi32 (__m128i a) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pmovzxbd256(cast(ubyte16)a);
    }
    else static if (LDC_with_optimizations)
    {
        enum ir = `
            %v = shufflevector <16 x i8> %0,<16 x i8> %0, <8 x i32> <i32 0, i32 1,i32 2, i32 3, i32 4, i32 5,i32 6, i32 7>
            %r = zext <8 x i8> %v to <8 x i32>
            ret <8 x i32> %r`;
        return cast(__m256i) LDCInlineIR!(ir, int8, byte16)(cast(byte16)a);
    }
    else
    {
        int8 r;
        byte16 ba = cast(byte16)a;
        for (int n = 0; n < 8; ++n)
        {
            r.ptr[n] = cast(ubyte)ba.array[n];
        }
        return cast(__m256i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(-1, 0, -128, 127, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13);
    int8 C = cast(int8) _mm256_cvtepu8_epi32(A);
    int[8] correct     = [255, 0,  128, 127, 2, 3, 4, 5];
    assert(C.array == correct);
}

/// Zero-extend packed unsigned 8-bit integers in `a` to packed 64-bit integers.
__m256i _mm256_cvtepu8_epi64 (__m128i a) pure @trusted
{
    // PERF ARM64+LDC, not awesome
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pmovzxbq256(cast(ubyte16)a);
    }
    else static if (LDC_with_optimizations)
    {
        enum ir = `
            %v = shufflevector <16 x i8> %0,<16 x i8> %0, <4 x i32> <i32 0, i32 1,i32 2, i32 3>
            %r = zext <4 x i8> %v to <4 x i64>
            ret <4 x i64> %r`;
        return cast(__m256i) LDCInlineIR!(ir, long4, byte16)(cast(byte16)a);
    }
    else
    {
        long4 r;
        byte16 ba = cast(byte16)a;
        for (int n = 0; n < 4; ++n)
        {
            r.ptr[n] = cast(ubyte)ba.array[n];
        }
        return cast(__m256i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(-1, 0, -128, 127, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13);
    long4 C = cast(long4) _mm256_cvtepu8_epi64(A);
    long[4] correct     = [255, 0,  128, 127];
    assert(C.array == correct);
}

1529 /// Extract a 16-bit integer from `a`, selected with index.
1530 int _mm256_extract_epi16 (__m256i a, int index) pure @trusted
1531 {
1532     short16 sa = cast(short16)a;
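    // Only the low 4 bits of index are used, so the index always stays within the 16 lanes.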
1533     return sa.ptr[index & 15];
1534 }
1535 unittest
1536 {
1537     short16 b;
1538     b = 43;
1539     assert(_mm256_extract_epi16(cast(__m256i)b, 7) == 43);
1540 }
1541 
1542 /// Extract an 8-bit integer from `a`, selected with index.
1543 int _mm256_extract_epi8 (__m256i a, int index) pure @trusted
1544 {
1545     byte32 sa = cast(byte32)a;
1546     return sa.ptr[index & 31];
1547 }
1548 unittest
1549 {
1550     byte32 b;
1551     b = -44;
1552     assert(_mm256_extract_epi8(cast(__m256i)b, 5) == -44);
1553     assert(_mm256_extract_epi8(cast(__m256i)b, 5 + 32) == -44);
1554 }
1555 
1556 /// Extract 128 bits (composed of integer data) from `a`, selected with `imm8`.
1557 __m128i _mm256_extracti128_si256(int imm8)(__m256i a) pure @trusted
1558     if ( (imm8 == 0) || (imm8 == 1) )
1559 {
1560     pragma(inline, true);
1561 
1562     static if (GDC_with_AVX2)
1563     {
1564         return cast(__m128i) __builtin_ia32_extract128i256(a, imm8);
1565     }
1566     else static if (LDC_with_optimizations)
1567     {
1568         enum str = (imm8 == 1) ? "<i32 2, i32 3>" : "<i32 0, i32 1>";
1569         enum ir = "%r = shufflevector <4 x i64> %0, <4 x i64> undef, <2 x i32>" ~ str ~ "\n" ~
1570                   "ret <2 x i64> %r";
1571         return cast(__m128i) LDCInlineIR!(ir, ulong2, ulong4)(cast(ulong4)a);
1572     }
1573     else
1574     {
1575         long4 al = cast(long4) a;
1576         long2 ret;
1577         ret.ptr[0] = (imm8==1) ? al.array[2] : al.array[0];
1578         ret.ptr[1] = (imm8==1) ? al.array[3] : al.array[1];
1579         return cast(__m128i) ret;
1580     }
1581 }
1582 unittest
1583 {
1584     __m256i A = _mm256_setr_epi32( -7, -1, 0, 9, -100, 100, 234, 432 );
1585     int[4] correct0 = [ -7, -1, 0, 9 ];
1586     int[4] correct1 = [ -100, 100, 234, 432 ];
1587     __m128i R0 = _mm256_extracti128_si256!(0)(A);
1588     __m128i R1 = _mm256_extracti128_si256!(1)(A);
1589     assert(R0.array == correct0);
1590     assert(R1.array == correct1);
1591 }
1592 
1593 // TODO __m256i _mm256_hadd_epi16 (__m256i a, __m256i b) pure @safe
1594 // TODO __m256i _mm256_hadd_epi32 (__m256i a, __m256i b) pure @safe
1595 // TODO __m256i _mm256_hadds_epi16 (__m256i a, __m256i b) pure @safe
1596 // TODO __m256i _mm256_hsub_epi16 (__m256i a, __m256i b) pure @safe
1597 // TODO __m256i _mm256_hsub_epi32 (__m256i a, __m256i b) pure @safe
1598 // TODO __m256i _mm256_hsubs_epi16 (__m256i a, __m256i b) pure @safe
1599 
1600 // TODO __m128i _mm_i32gather_epi32 (int const* base_addr, __m128i vindex, const int scale) pure @safe
1601 // TODO __m128i _mm_mask_i32gather_epi32 (__m128i src, int const* base_addr, __m128i vindex, __m128i mask, const int scale) pure @safe
1602 // TODO __m256i _mm256_i32gather_epi32 (int const* base_addr, __m256i vindex, const int scale) pure @safe
1603 // TODO __m256i _mm256_mask_i32gather_epi32 (__m256i src, int const* base_addr, __m256i vindex, __m256i mask, const int scale) pure @safe
1604 // TODO __m128i _mm_i32gather_epi64 (__int64 const* base_addr, __m128i vindex, const int scale) pure @safe
1605 // TODO __m128i _mm_mask_i32gather_epi64 (__m128i src, __int64 const* base_addr, __m128i vindex, __m128i mask, const int scale) pure @safe
1606 // TODO __m256i _mm256_i32gather_epi64 (__int64 const* base_addr, __m128i vindex, const int scale) pure @safe
1607 // TODO __m256i _mm256_mask_i32gather_epi64 (__m256i src, __int64 const* base_addr, __m128i vindex, __m256i mask, const int scale) pure @safe
1608 // TODO __m128d _mm_i32gather_pd (double const* base_addr, __m128i vindex, const int scale) pure @safe
1609 // TODO __m128d _mm_mask_i32gather_pd (__m128d src, double const* base_addr, __m128i vindex, __m128d mask, const int scale) pure @safe
1610 // TODO __m256d _mm256_i32gather_pd (double const* base_addr, __m128i vindex, const int scale) pure @safe
1611 // TODO __m256d _mm256_mask_i32gather_pd (__m256d src, double const* base_addr, __m128i vindex, __m256d mask, const int scale) pure @safe
1612 // TODO __m128 _mm_i32gather_ps (float const* base_addr, __m128i vindex, const int scale) pure @safe
1613 // TODO __m128 _mm_mask_i32gather_ps (__m128 src, float const* base_addr, __m128i vindex, __m128 mask, const int scale) pure @safe
1614 // TODO __m256 _mm256_i32gather_ps (float const* base_addr, __m256i vindex, const int scale) pure @safe
1615 // TODO __m256 _mm256_mask_i32gather_ps (__m256 src, float const* base_addr, __m256i vindex, __m256 mask, const int scale) pure @safe
1616 // TODO __m128i _mm_i64gather_epi32 (int const* base_addr, __m128i vindex, const int scale) pure @safe
1617 // TODO __m128i _mm_mask_i64gather_epi32 (__m128i src, int const* base_addr, __m128i vindex, __m128i mask, const int scale) pure @safe
1618 // TODO __m128i _mm256_i64gather_epi32 (int const* base_addr, __m256i vindex, const int scale) pure @safe
1619 // TODO __m128i _mm256_mask_i64gather_epi32 (__m128i src, int const* base_addr, __m256i vindex, __m128i mask, const int scale) pure @safe
1620 // TODO __m128i _mm_i64gather_epi64 (__int64 const* base_addr, __m128i vindex, const int scale) pure @safe
1621 // TODO __m128i _mm_mask_i64gather_epi64 (__m128i src, __int64 const* base_addr, __m128i vindex, __m128i mask, const int scale) pure @safe
1622 // TODO __m256i _mm256_i64gather_epi64 (__int64 const* base_addr, __m256i vindex, const int scale) pure @safe
1623 // TODO __m256i _mm256_mask_i64gather_epi64 (__m256i src, __int64 const* base_addr, __m256i vindex, __m256i mask, const int scale) pure @safe
1624 // TODO __m128d _mm_i64gather_pd (double const* base_addr, __m128i vindex, const int scale) pure @safe
1625 // TODO __m128d _mm_mask_i64gather_pd (__m128d src, double const* base_addr, __m128i vindex, __m128d mask, const int scale) pure @safe
1626 // TODO __m256d _mm256_i64gather_pd (double const* base_addr, __m256i vindex, const int scale) pure @safe
1627 // TODO __m256d _mm256_mask_i64gather_pd (__m256d src, double const* base_addr, __m256i vindex, __m256d mask, const int scale) pure @safe
1628 // TODO __m128 _mm_i64gather_ps (float const* base_addr, __m128i vindex, const int scale) pure @safe
1629 // TODO __m128 _mm_mask_i64gather_ps (__m128 src, float const* base_addr, __m128i vindex, __m128 mask, const int scale) pure @safe
1630 // TODO __m128 _mm256_i64gather_ps (float const* base_addr, __m256i vindex, const int scale) pure @safe
1631 // TODO __m128 _mm256_mask_i64gather_ps (__m128 src, float const* base_addr, __m256i vindex, __m128 mask, const int scale) pure @safe
1632 
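// Illustrative sketch, not part of the API: the plain gathers above amount to
// per-element indexed loads (the mask variants additionally blend with `src`).
// Assuming intel-intrinsics vector types, a hypothetical scalar fallback for
// _mm_i32gather_epi32, with `scale` as a compile-time byte multiplier, could look like:
//
//     __m128i scalarGather32(int scale)(const(int)* base_addr, __m128i vindex) @system
//     {
//         __m128i r;
//         foreach (i; 0 .. 4)
//         {
//             // Each lane of vindex is scaled to a byte offset from base_addr.
//             long offset = cast(long)vindex.array[i] * scale;
//             r.ptr[i] = *cast(const(int)*)(cast(const(ubyte)*)base_addr + offset);
//         }
//         return r;
//     }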
1633 
1634 /// Copy `a` to result, then insert 128 bits from `b` into result at the location specified by 
1635 /// `imm8`.
1636 __m256i _mm256_inserti128_si256 (__m256i a, __m128i b, const int imm8) pure @trusted
1637 {
1638     long2 lb = cast(long2)b;
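    // Only the low bit of imm8 selects the destination lane, so e.g. imm8 = 8 behaves like 0.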
1639     a.ptr[(imm8 & 1)*2  ] = lb.array[0];
1640     a.ptr[(imm8 & 1)*2+1] = lb.array[1];
1641     return a; 
1642 }
1643 unittest
1644 {
1645     __m256i A = [0, 1, 2, 3];
1646     long2 B = [4, 5];
1647     __m256i C = _mm256_inserti128_si256(A, cast(__m128i)B, 0 + 8);
1648     __m256i D = _mm256_inserti128_si256(A, cast(__m128i)B, 1);
1649     long[4] correctC = [4, 5, 2, 3]; 
1650     long[4] correctD = [0, 1, 4, 5];
1651     assert(C.array == correctC);
1652     assert(D.array == correctD);
1653 }
1654 
1655 /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate
1656 /// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers,
1657 /// and pack the results in destination.
1658 __m256i _mm256_madd_epi16 (__m256i a, __m256i b) pure @trusted
1659 {
1660     static if (GDC_with_AVX2)
1661     {
1662         return cast(__m256i) __builtin_ia32_pmaddwd256(cast(short16)a, cast(short16)b);
1663     }
1664     else static if (LDC_with_AVX2)
1665     {
1666         return cast(__m256i) __builtin_ia32_pmaddwd256(cast(short16)a, cast(short16)b);
1667     }
1668     else
1669     {
1670         // split is beneficial for ARM64, LDC and GDC without AVX2
1671         __m128i a_lo = _mm256_extractf128_si256!0(a);
1672         __m128i a_hi = _mm256_extractf128_si256!1(a);
1673         __m128i b_lo = _mm256_extractf128_si256!0(b);
1674         __m128i b_hi = _mm256_extractf128_si256!1(b);
1675         __m128i r_lo = _mm_madd_epi16(a_lo, b_lo);
1676         __m128i r_hi = _mm_madd_epi16(a_hi, b_hi);
1677         return _mm256_set_m128i(r_hi, r_lo);
1678     }
1679 }
1680 unittest
1681 {
1682     short16 A = [0, 1, 2, 3, -32768, -32768, 32767, 32767, 0, 1, 2, 3, -32768, -32768, 32767, 32767];
1683     short16 B = [0, 1, 2, 3, -32768, -32768, 32767, 32767, 0, 1, 2, 3, -32768, -32768, 32767, 32767];
1684     int8 R = cast(int8) _mm256_madd_epi16(cast(__m256i)A, cast(__m256i)B);
1685     int[8] correct = [1, 13, -2147483648, 2*32767*32767, 1, 13, -2147483648, 2*32767*32767];
1686     assert(R.array == correct);
1687 }
1688 
1689 // TODO __m256i _mm256_maddubs_epi16 (__m256i a, __m256i b) pure @safe
1690 
1691 version(DigitalMars)
1692 {
1693     // this avoids a bug with DMD < 2.099 on x86 with -O
1694     private enum bool maskLoadWorkaroundDMD = (__VERSION__ < 2099);
1695 }
1696 else
1697 {
1698     private enum bool maskLoadWorkaroundDMD = false;
1699 }
1700 
1701 /// Load packed 32-bit integers from memory using `mask` (elements are zeroed out when the highest
1702 /// bit is not set in the corresponding element).
1703 /// Warning: See "Note about mask load/store" to know why you must address valid memory only.
1704 __m128i _mm_maskload_epi32 (const(int)* mem_addr, __m128i mask) /* pure */ @system
1705 {
1706     // PERF DMD
1707     static if (LDC_with_AVX2)
1708     {
1709         // MAYDO report that the builtin is impure
1710         return __builtin_ia32_maskloadd(mem_addr, mask);
1711     }
1712     else static if (GDC_with_AVX2)
1713     {
1714         return __builtin_ia32_maskloadd(cast(__m128i*)mem_addr, mask);
1715     }
1716     else
1717     {
1718         return cast(__m128i) _mm_maskload_ps(cast(const(float)*)mem_addr, mask);
1719     }
1720 }
1721 unittest
1722 {
1723     static if (!maskLoadWorkaroundDMD)
1724     {
1725         int[4] A = [7, 1, 2, 3];
1726         int4 B = _mm_maskload_epi32(A.ptr, _mm_setr_epi32(1, -1, -1, 1));  // lanes whose mask high bit is clear read as zero, but the memory must still be valid
1727         int[4] correct = [0, 1, 2, 0];
1728         assert(B.array == correct);
1729     }
1730 }
1731 
1732 
1733 // TODO __m256i _mm256_maskload_epi32 (int const* mem_addr, __m256i mask) pure @safe
1734 // TODO __m128i _mm_maskload_epi64 (__int64 const* mem_addr, __m128i mask) pure @safe
1735 // TODO __m256i _mm256_maskload_epi64 (__int64 const* mem_addr, __m256i mask) pure @safe
1736 
1737 /// Compare packed signed 16-bit integers in `a` and `b`, and return packed maximum values.
1738 __m256i _mm256_max_epi16 (__m256i a, __m256i b) pure @safe
1739 {
1740     // PERF D_SIMD
1741     version(GNU)
1742         enum bool split = true;
1743     else static if (SIMD_COMPARISON_MASKS_32B)
1744         enum bool split = false;
1745     else
1746         enum bool split = true;
1747 
1748     static if (GDC_with_AVX2)
1749     {
1750         return cast(__m256i) __builtin_ia32_pmaxsw256(cast(short16)a, cast(short16)b);
1751     }
1752     else static if (split)
1753     {
1754         // split
1755         __m128i a_lo = _mm256_extractf128_si256!0(a);
1756         __m128i a_hi = _mm256_extractf128_si256!1(a);
1757         __m128i b_lo = _mm256_extractf128_si256!0(b);
1758         __m128i b_hi = _mm256_extractf128_si256!1(b);
1759         __m128i r_lo = _mm_max_epi16(a_lo, b_lo);
1760         __m128i r_hi = _mm_max_epi16(a_hi, b_hi);
1761         return _mm256_set_m128i(r_hi, r_lo);
1762     }
1763     else static if (SIMD_COMPARISON_MASKS_32B)
1764     {
1765         // catastrophic with GDC x86 for some reason. Sad.
1766         short16 sa = cast(short16)a;
1767         short16 sb = cast(short16)b;
1768         short16 greater = sa > sb;
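        // greater is all-ones per lane where sa > sb, enabling a branchless select of the maximum.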
1769         return cast(__m256i)( (greater & sa) | (~greater & sb) );
1770     }
1771     else
1772         static assert(0);    
1773 }
1774 unittest
1775 {
1776     short16 R = cast(short16) _mm256_max_epi16(_mm256_setr_epi16(32767, 1, -4, -8, 9,     7, 0,-57, 1, 0, 0, 0, 1, 0, 0, 0),
1777                                                _mm256_setr_epi16(   -4,-8,  9,  7, 0,-32768, 0,  0, 0, 2, 0, 4, 2, 1, 2, -4));
1778     short[16] correct =                                         [32767, 1,  9,  7, 9,     7, 0,  0, 1, 2, 0, 4, 2, 1, 2, 0];
1779     assert(R.array == correct);
1780 }
1781 
1782 /// Compare packed signed 32-bit integers in `a` and `b`, and return packed maximum values.
1783 __m256i _mm256_max_epi32 (__m256i a, __m256i b) pure @safe
1784 {
1785     // PERF D_SIMD
1786     version(GNU)
1787         enum bool split = true;
1788     else static if (SIMD_COMPARISON_MASKS_32B)
1789         enum bool split = false;
1790     else
1791         enum bool split = true;
1792 
1793     static if (GDC_with_AVX2)
1794     {
1795         return cast(__m256i) __builtin_ia32_pmaxsd256(cast(int8)a, cast(int8)b);
1796     }
1797     else static if (split)
1798     {
1799         // split
1800         __m128i a_lo = _mm256_extractf128_si256!0(a);
1801         __m128i a_hi = _mm256_extractf128_si256!1(a);
1802         __m128i b_lo = _mm256_extractf128_si256!0(b);
1803         __m128i b_hi = _mm256_extractf128_si256!1(b);
1804         __m128i r_lo = _mm_max_epi32(a_lo, b_lo);
1805         __m128i r_hi = _mm_max_epi32(a_hi, b_hi);
1806         return _mm256_set_m128i(r_hi, r_lo);
1807     }
1808     else static if (SIMD_COMPARISON_MASKS_32B) 
1809     {
1810         // catastrophic with GDC x86 for some reason, like for 16-bit numbers.
1811         int8 sa = cast(int8)a;
1812         int8 sb = cast(int8)b;
1813         int8 greater = sa > sb;
1814         return cast(__m256i)( (greater & sa) | (~greater & sb) );
1815     }
1816     else
1817         static assert(0);    
1818 }
1819 unittest
1820 {
1821     int8 R = cast(int8) _mm256_max_epi32(_mm256_setr_epi32(0x7fffffff, 1, -4,  7, 0x7fffffff, 2, -4,  7),
1822                                          _mm256_setr_epi32(        -4,-8,  9, -8,-0x80000000,-8,  9, -8));
1823     int[8] correct =                                      [0x7fffffff, 1,  9,  7, 0x7fffffff, 2,  9,  7];
1824     assert(R.array == correct);
1825 }
1826 
1827 /// Compare packed signed 8-bit integers in `a` and `b`, and return packed maximum values.
1828 __m256i _mm256_max_epi8 (__m256i a, __m256i b) pure @trusted
1829 {
1830     // PERF D_SIMD
1831     version(GNU)
1832         enum bool split = true;
1833     else static if (SIMD_COMPARISON_MASKS_32B)
1834         enum bool split = false;
1835     else
1836         enum bool split = true;
1837     static if (GDC_with_AVX2)
1838     {
1839         // Strangely, GDC asks for unsigned ubyte32
1840         return cast(__m256i) __builtin_ia32_pmaxsb256(cast(ubyte32)a, cast(ubyte32)b);
1841     }
1842     else static if (split)
1843     {
1844         // split
1845         __m128i a_lo = _mm256_extractf128_si256!0(a);
1846         __m128i a_hi = _mm256_extractf128_si256!1(a);
1847         __m128i b_lo = _mm256_extractf128_si256!0(b);
1848         __m128i b_hi = _mm256_extractf128_si256!1(b);
1849         __m128i r_lo = _mm_max_epi8(a_lo, b_lo);
1850         __m128i r_hi = _mm_max_epi8(a_hi, b_hi);
1851         return _mm256_set_m128i(r_hi, r_lo);
1852     }
1853     else static if (SIMD_COMPARISON_MASKS_32B)
1854     {
1855         // This is real bad with GDC, again
1856         byte32 sa = cast(byte32)a;
1857         byte32 sb = cast(byte32)b;
1858         byte32 greater = cast(byte32)(sa > sb);
1859         return cast(__m256i)( (greater & sa) | (~greater & sb) );
1860     }
1861     else
1862         static assert(false);
1863 }
1864 unittest
1865 {
1866     __m256i A = _mm256_setr_epi8(127,  1, -4, -8, 9,    7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0,   127,  1, -4, -8, 9,    7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0);
1867     __m256i B = _mm256_setr_epi8(  4, -8,  9, -7, 0, -128, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0,     4, -8,  9, -7, 0, -128, 0,  0, 0, 0, 0, 0, 0, 4, 0, 0);
1868     byte32 R = cast(byte32) _mm256_max_epi8(A, B);
1869     byte[32] correct =          [127,  1,  9, -7, 9,    7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0,   127,  1,  9, -7, 9,    7, 0, 57, 0, 0, 0, 0, 0, 4, 0, 0];
1870     assert(R.array == correct);
1871 }
1872 
1873 /// Compare packed unsigned 16-bit integers in `a` and `b`, and return packed maximum values.
1874 __m256i _mm256_max_epu16 (__m256i a, __m256i b) pure @trusted
1875 {
1876     // PERF D_SIMD
1877     version(GNU)
1878         enum bool split = true;
1879     else static if (SIMD_COMPARISON_MASKS_32B)
1880         enum bool split = false;
1881     else
1882         enum bool split = true;
1883 
1884     static if (GDC_with_AVX2)
1885     {
1886         return cast(__m256i) __builtin_ia32_pmaxuw256(cast(short16)a, cast(short16)b);
1887     }
1888     else static if (split)
1889     {
1890         // split
1891         __m128i a_lo = _mm256_extractf128_si256!0(a);
1892         __m128i a_hi = _mm256_extractf128_si256!1(a);
1893         __m128i b_lo = _mm256_extractf128_si256!0(b);
1894         __m128i b_hi = _mm256_extractf128_si256!1(b);
1895         __m128i r_lo = _mm_max_epu16(a_lo, b_lo);
1896         __m128i r_hi = _mm_max_epu16(a_hi, b_hi);
1897         return _mm256_set_m128i(r_hi, r_lo);
1898     }
1899     else static if (SIMD_COMPARISON_MASKS_32B)
1900     {
1901         // catastrophic with GDC x86_64, good with LDC
1902         short16 sa = cast(short16)a;
1903         short16 sb = cast(short16)b;
1904         short16 greater = cast(short16)(cast(ushort16)sa > cast(ushort16)sb);
1905         return cast(__m256i)( (greater & sa) | (~greater & sb) );
1906     }
1907     else
1908         static assert(false);
1909 }
1910 unittest
1911 {
1912     short16 R = cast(short16) _mm256_max_epu16(_mm256_setr_epi16(32767, 1, -4, -8, 9,     7, 0,-57, 1, 0, 0, 0, 1, 0, 0, -6),
1913                                                 _mm256_setr_epi16(  -4,-8,  9,  7, 0,-32768, 0,  0, 0, 2, 0, 4, 2, 1, 2, -4));
1914     short[16] correct =                                            [-4,-8, -4, -8, 9,-32768, 0,-57, 1, 2, 0, 4, 2, 1, 2, -4];
1915     assert(R.array == correct);
1916 }
1917 
1918 /// Compare packed unsigned 32-bit integers in `a` and `b`, and return packed maximum values.
1919 __m256i _mm256_max_epu32 (__m256i a, __m256i b) pure @safe
1920 {
1921     // PERF D_SIMD
1922     version(GNU)
1923         enum bool split = true;
1924     else static if (SIMD_COMPARISON_MASKS_32B)
1925         enum bool split = false;
1926     else
1927         enum bool split = true;
1928 
1929     static if (GDC_with_AVX2)
1930     {
1931         return cast(__m256i) __builtin_ia32_pmaxud256(cast(int8)a, cast(int8)b);
1932     }
1933     else static if (split)
1934     {
1935         // split
1936         __m128i a_lo = _mm256_extractf128_si256!0(a);
1937         __m128i a_hi = _mm256_extractf128_si256!1(a);
1938         __m128i b_lo = _mm256_extractf128_si256!0(b);
1939         __m128i b_hi = _mm256_extractf128_si256!1(b);
1940         __m128i r_lo = _mm_max_epu32(a_lo, b_lo);
1941         __m128i r_hi = _mm_max_epu32(a_hi, b_hi);
1942         return _mm256_set_m128i(r_hi, r_lo);
1943     }
1944     else static if (SIMD_COMPARISON_MASKS_32B) 
1945     {
1946         // catastrophic with GDC x86 for some reason, like for 16-bit numbers.
1947         uint8 sa = cast(uint8)a;
1948         uint8 sb = cast(uint8)b;
1949         uint8 greater = sa > sb;
1950         return cast(__m256i)( (greater & sa) | (~greater & sb) );
1951     }
1952     else
1953         static assert(0);
1954 }
1955 unittest
1956 {
1957     int8 R = cast(int8) _mm256_max_epu32(_mm256_setr_epi32(0x7fffffff, 1,  4, -7, 0x7fffffff, 1, 11, -7),
1958                                          _mm256_setr_epi32(        -4,-8,  9, -8,         -4,-8,  9, -8));
1959     int[8] correct =                                      [        -4,-8,  9, -7,         -4,-8, 11, -7];
1960     assert(R.array == correct);
1961 }
1962 
1963 /// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed maximum values.
1964 __m256i _mm256_max_epu8 (__m256i a, __m256i b) pure @safe
1965 {
1966     // PERF D_SIMD
1967     version(GNU)
1968         enum bool split = true;
1969     else static if (SIMD_COMPARISON_MASKS_32B)
1970         enum bool split = false;
1971     else
1972         enum bool split = true;
1973     static if (GDC_with_AVX2)
1974     {
1975         return cast(__m256i) __builtin_ia32_pmaxub256(cast(ubyte32)a, cast(ubyte32)b);
1976     }
1977     else static if (split)
1978     {
1979         // split
1980         __m128i a_lo = _mm256_extractf128_si256!0(a);
1981         __m128i a_hi = _mm256_extractf128_si256!1(a);
1982         __m128i b_lo = _mm256_extractf128_si256!0(b);
1983         __m128i b_hi = _mm256_extractf128_si256!1(b);
1984         __m128i r_lo = _mm_max_epu8(a_lo, b_lo);
1985         __m128i r_hi = _mm_max_epu8(a_hi, b_hi);
1986         return _mm256_set_m128i(r_hi, r_lo);
1987     }
1988     else static if (SIMD_COMPARISON_MASKS_32B)
1989     {
1990         // This is real bad with GDC, again
1991         ubyte32 sa = cast(ubyte32)a;
1992         ubyte32 sb = cast(ubyte32)b;
1993         ubyte32 greater = cast(ubyte32)(sa > sb);
1994         return cast(__m256i)( (greater & sa) | (~greater & sb) );
1995     }
1996     else
1997         static assert(false);
1998 }
1999 unittest
2000 {
2001     byte32 R = cast(byte32) _mm256_max_epu8(_mm256_setr_epi8(45, 1, -4, -8, 9,  7, 0,-57, -4,-8,  9,  7, 0,-57, 0,  0,   45, 1, -4, -8, 9,  7, 0,-57, -4,-8,  9,  7, 0,-57, 0,  0),
2002                                             _mm256_setr_epi8(-4,-8,  9,  7, 0,-57, 0,  0, 45, 1, -4, -8, 9,  7, 0,-57,   -4,-8,  9,  7, 0,-57, 0,  0, 45, 1, -4, -8, 9,  7, 0,-57));
2003     byte[32] correct =                                      [-4,-8, -4, -8, 9,-57, 0,-57, -4,-8, -4, -8, 9,-57, 0,-57,   -4,-8, -4, -8, 9,-57, 0,-57, -4,-8, -4, -8, 9,-57, 0,-57];
2004     assert(R.array == correct);
2005 }
2006 
2007 /// Compare packed signed 16-bit integers in `a` and `b`, and return packed minimum values.
2008 __m256i _mm256_min_epi16 (__m256i a, __m256i b) pure @safe
2009 {
2010     // PERF D_SIMD
2011     version(GNU)
2012         enum bool split = true;
2013     else static if (SIMD_COMPARISON_MASKS_32B)
2014         enum bool split = false;
2015     else
2016         enum bool split = true;
2017 
2018     static if (GDC_with_AVX2)
2019     {
2020         return cast(__m256i) __builtin_ia32_pminsw256(cast(short16)a, cast(short16)b);
2021     }
2022     else static if (split)
2023     {
2024         // split
2025         __m128i a_lo = _mm256_extractf128_si256!0(a);
2026         __m128i a_hi = _mm256_extractf128_si256!1(a);
2027         __m128i b_lo = _mm256_extractf128_si256!0(b);
2028         __m128i b_hi = _mm256_extractf128_si256!1(b);
2029         __m128i r_lo = _mm_min_epi16(a_lo, b_lo);
2030         __m128i r_hi = _mm_min_epi16(a_hi, b_hi);
2031         return _mm256_set_m128i(r_hi, r_lo);
2032     }
2033     else static if (SIMD_COMPARISON_MASKS_32B)
2034     {
2035         // same as _mm256_max_epi16, this is catastrophic with GDC -mavx
2036         short16 sa = cast(short16)a;
2037         short16 sb = cast(short16)b;
2038         short16 greater = sa > sb;
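        // ~greater selects sa where it is not greater: the per-lane minimum.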
2039         return cast(__m256i)( (~greater & sa) | (greater & sb) );
2040     }
2041     else
2042         static assert(0);
2043 }
2044 unittest
2045 {
2046     short16 R = cast(short16) _mm256_min_epi16(_mm256_setr_epi16(32767, 1, -4, -8, 9,     7, 0,-57, 1, 0, 0, 0, 1, 0, 0,  0),
2047                                                _mm256_setr_epi16(   -4,-8,  9,  7, 0,-32768, 0,  0, 0, 2, 0, 4, 2, 1, 2, -4));
2048     short[16] correct =                                         [   -4,-8, -4, -8, 0,-32768, 0,-57, 0, 0, 0, 0, 1, 0, 0, -4];
2049     assert(R.array == correct);
2050 }
2051 
2052 /// Compare packed signed 32-bit integers in `a` and `b`, and return packed minimum values.
2053 __m256i _mm256_min_epi32 (__m256i a, __m256i b) pure @safe
2054 {
2055     // PERF D_SIMD
2056     version(GNU)
2057         enum bool split = true;
2058     else static if (SIMD_COMPARISON_MASKS_32B)
2059         enum bool split = false;
2060     else
2061         enum bool split = true;
2062 
2063     static if (GDC_with_AVX2)
2064     {
2065         return cast(__m256i) __builtin_ia32_pminsd256(cast(int8)a, cast(int8)b);
2066     }
2067     else static if (split)
2068     {
2069         // split
2070         __m128i a_lo = _mm256_extractf128_si256!0(a);
2071         __m128i a_hi = _mm256_extractf128_si256!1(a);
2072         __m128i b_lo = _mm256_extractf128_si256!0(b);
2073         __m128i b_hi = _mm256_extractf128_si256!1(b);
2074         __m128i r_lo = _mm_min_epi32(a_lo, b_lo);
2075         __m128i r_hi = _mm_min_epi32(a_hi, b_hi);
2076         return _mm256_set_m128i(r_hi, r_lo);
2077     }
2078     else static if (SIMD_COMPARISON_MASKS_32B) 
2079     {
2080         // Not checked this one; probably the same badness issue with GDC
2081         int8 sa = cast(int8)a;
2082         int8 sb = cast(int8)b;
2083         int8 greater = sa > sb;
2084         return cast(__m256i)( (~greater & sa) | (greater & sb) );
2085     }
2086     else
2087         static assert(0);    
2088 }
2089 unittest
2090 {
2091     int8 R = cast(int8) _mm256_min_epi32(_mm256_setr_epi32(0x7fffffff, 1, -4,  7, 0x7fffffff, 2, -4,  7),
2092                                          _mm256_setr_epi32(        -4,-8,  9, -8,-0x80000000,-8,  9, -8));
2093     int[8] correct =                                      [        -4,-8, -4, -8,-0x80000000,-8, -4, -8];
2094     assert(R.array == correct);
2095 }
2096 
2097 
2098 /// Compare packed signed 8-bit integers in `a` and `b`, and return packed minimum values.
2099 __m256i _mm256_min_epi8 (__m256i a, __m256i b) pure @trusted
2100 {
2101     // PERF D_SIMD
2102     version(GNU)
2103         enum bool split = true;
2104     else static if (SIMD_COMPARISON_MASKS_32B)
2105         enum bool split = false;
2106     else
2107         enum bool split = true;
2108     static if (GDC_with_AVX2)
2109     {
2110         // Strangely, GDC asks for unsigned ubyte32
2111         return cast(__m256i) __builtin_ia32_pminsb256(cast(ubyte32)a, cast(ubyte32)b);
2112     }
2113     else static if (split)
2114     {
2115         // split
2116         __m128i a_lo = _mm256_extractf128_si256!0(a);
2117         __m128i a_hi = _mm256_extractf128_si256!1(a);
2118         __m128i b_lo = _mm256_extractf128_si256!0(b);
2119         __m128i b_hi = _mm256_extractf128_si256!1(b);
2120         __m128i r_lo = _mm_min_epi8(a_lo, b_lo);
2121         __m128i r_hi = _mm_min_epi8(a_hi, b_hi);
2122         return _mm256_set_m128i(r_hi, r_lo);
2123     }
2124     else static if (SIMD_COMPARISON_MASKS_32B)
2125     {
2126         // This is real bad with GDC, again
2127         byte32 sa = cast(byte32)a;
2128         byte32 sb = cast(byte32)b;
2129         byte32 greater = cast(byte32)(sa > sb);
2130         return cast(__m256i)( (~greater & sa) | (greater & sb) );
2131     }
2132     else
2133         static assert(false);
2134 }
2135 unittest
2136 {
2137     __m256i A = _mm256_setr_epi8(127,  1, -4, -8, 9,    7, 0, -57, 0, 0, 0, 0, 0, 0, 0, 0,   127,  1, -4, -8, 9,    7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0);
2138     __m256i B = _mm256_setr_epi8(  4, -8,  9, -7, 0, -128, 0,   0, 0, 0, 0, 0, 0, 0, 0, 0,     4, -8,  9, -7, 0, -128, 0,  0, 0, 0, 0, 0, 0, -4, 0, 0);
2139     byte32 R = cast(byte32) _mm256_min_epi8(A, B);
2140     byte[32] correct =          [  4, -8, -4, -8, 0, -128, 0, -57, 0, 0, 0, 0, 0, 0, 0, 0,     4, -8, -4, -8, 0, -128, 0,  0, 0, 0, 0, 0, 0, -4, 0, 0];
2141     assert(R.array == correct);
2142 }
2143 
2144 /// Compare packed unsigned 16-bit integers in `a` and `b`, and return packed minimum values.
2145 __m256i _mm256_min_epu16 (__m256i a, __m256i b) pure @trusted
2146 {
2147     // PERF D_SIMD
2148     version(GNU)
2149         enum bool split = true;
2150     else static if (SIMD_COMPARISON_MASKS_32B)
2151         enum bool split = false;
2152     else
2153         enum bool split = true;
2154 
2155     static if (GDC_with_AVX2)
2156     {
2157         return cast(__m256i) __builtin_ia32_pminuw256(cast(short16)a, cast(short16)b);
2158     }
2159     else static if (split)
2160     {
2161         // split
2162         __m128i a_lo = _mm256_extractf128_si256!0(a);
2163         __m128i a_hi = _mm256_extractf128_si256!1(a);
2164         __m128i b_lo = _mm256_extractf128_si256!0(b);
2165         __m128i b_hi = _mm256_extractf128_si256!1(b);
2166         __m128i r_lo = _mm_min_epu16(a_lo, b_lo);
2167         __m128i r_hi = _mm_min_epu16(a_hi, b_hi);
2168         return _mm256_set_m128i(r_hi, r_lo);
2169     }
2170     else static if (SIMD_COMPARISON_MASKS_32B)
2171     {
2172         // catastrophic with GDC x86_64
2173         short16 sa = cast(short16)a;
2174         short16 sb = cast(short16)b;
2175         short16 greater = cast(short16)(cast(ushort16)sa > cast(ushort16)sb);
2176         return cast(__m256i)( (~greater & sa) | (greater & sb) );
2177     }
2178     else
2179         static assert(false);
2180 }
2181 unittest
2182 {
2183     short16 R = cast(short16) _mm256_min_epu16(_mm256_setr_epi16(32767, 1, -4, -8, 9,     7, 0,-57, 1, 0, 0, 0, 1, 0, 0, -6),
2184                                                _mm256_setr_epi16(  -4, -8,  9,  7, 0,-32768, 0,  0, 0, 2, 0, 4, 2, 1, 2, -4));
2185     short[16] correct =                                         [32767, 1,  9,  7, 0,     7, 0,  0, 0, 0, 0, 0, 1, 0, 0, -6];
2186     assert(R.array == correct);
2187 }
2188 
2189 /// Compare packed unsigned 32-bit integers in `a` and `b`, and return packed minimum values.
2190 __m256i _mm256_min_epu32 (__m256i a, __m256i b) pure @safe
2191 {
2192     // PERF D_SIMD
2193     version(GNU)
2194         enum bool split = true;
2195     else static if (SIMD_COMPARISON_MASKS_32B)
2196         enum bool split = false;
2197     else
2198         enum bool split = true;
2199 
2200     static if (GDC_with_AVX2)
2201     {
2202         return cast(__m256i) __builtin_ia32_pminud256(cast(int8)a, cast(int8)b);
2203     }
2204     else static if (split)
2205     {
2206         // split
2207         __m128i a_lo = _mm256_extractf128_si256!0(a);
2208         __m128i a_hi = _mm256_extractf128_si256!1(a);
2209         __m128i b_lo = _mm256_extractf128_si256!0(b);
2210         __m128i b_hi = _mm256_extractf128_si256!1(b);
2211         __m128i r_lo = _mm_min_epu32(a_lo, b_lo);
2212         __m128i r_hi = _mm_min_epu32(a_hi, b_hi);
2213         return _mm256_set_m128i(r_hi, r_lo);
2214     }
2215     else static if (SIMD_COMPARISON_MASKS_32B) 
2216     {
2217         // catastrophic with GDC, which takes the split path above instead
2218         uint8 sa = cast(uint8)a;
2219         uint8 sb = cast(uint8)b;
2220         uint8 greater = sa > sb;
2221         return cast(__m256i)( (greater & sb) | (~greater & sa) );
2222     }
2223     else
2224         static assert(0);
2225 }
2226 unittest
2227 {
2228     int8 R = cast(int8) _mm256_min_epu32(_mm256_setr_epi32(0x7fffffff, 1,  4, -7, 0x7fffffff, 1, 11, -7),
2229                                          _mm256_setr_epi32(        -4,-8,  9, -8,         -4,-8,  9, -8));
2230     int[8] correct =                                      [0x7fffffff, 1,  4, -8, 0x7fffffff, 1,  9, -8];
2231     assert(R.array == correct);
2232 }
2233 
2234 /// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed minimum values.
2235 __m256i _mm256_min_epu8 (__m256i a, __m256i b) pure @safe
2236 {
2237     // PERF D_SIMD
2238     version(GNU)
2239         enum bool split = true;
2240     else static if (SIMD_COMPARISON_MASKS_32B)
2241         enum bool split = false;
2242     else
2243         enum bool split = true;
2244     static if (GDC_with_AVX2)
2245     {
2246         return cast(__m256i) __builtin_ia32_pminub256(cast(ubyte32)a, cast(ubyte32)b);
2247     }
2248     else static if (split)
2249     {
2250         // split
2251         __m128i a_lo = _mm256_extractf128_si256!0(a);
2252         __m128i a_hi = _mm256_extractf128_si256!1(a);
2253         __m128i b_lo = _mm256_extractf128_si256!0(b);
2254         __m128i b_hi = _mm256_extractf128_si256!1(b);
2255         __m128i r_lo = _mm_min_epu8(a_lo, b_lo);
2256         __m128i r_hi = _mm_min_epu8(a_hi, b_hi);
2257         return _mm256_set_m128i(r_hi, r_lo);
2258     }
2259     else static if (SIMD_COMPARISON_MASKS_32B)
2260     {
2261         ubyte32 sa = cast(ubyte32)a;
2262         ubyte32 sb = cast(ubyte32)b;
2263         ubyte32 greater = cast(ubyte32)(sa > sb);
2264         return cast(__m256i)( (~greater & sa) | (greater & sb) );
2265     }
2266     else
2267         static assert(false);
2268 }
2269 unittest
2270 {
2271     byte32 R = cast(byte32) _mm256_min_epu8(_mm256_setr_epi8(45, 1, -4, -8, 9,  7, 0,-57, -4,-8,  9,  7, 0,-57, 0,  0,   45, 1, -4, -8, 9,  7, 0,-57, -4,-8,  9,  7, 0,-57, 0,  0),
2272                                             _mm256_setr_epi8(-4,-8,  9,  7, 0,-57, 0,  0, 45, 1, -4, -8, 9,  7, 0,-57,   -4,-8,  9,  7, 0,-57, 0,  0, 45, 1, -4, -8, 9,  7, 0,-57));
2273     byte[32] correct =                                      [45, 1,  9,  7, 0,  7, 0,  0, 45, 1,  9,  7, 0,  7, 0,  0,   45, 1,  9,  7, 0,  7, 0,  0, 45, 1,  9,  7, 0,  7, 0,  0];
2274     assert(R.array == correct);
2275 }
2276 
2277 
2278 // TODO int _mm256_movemask_epi8 (__m256i a) pure @safe
2279 // TODO __m256i _mm256_mpsadbw_epu8 (__m256i a, __m256i b, const int imm8) pure @safe
2280 
2281 /// Multiply the low signed 32-bit integers from each packed 64-bit element in `a` and `b`, and 
2282 /// return the signed 64-bit results.
2283 __m256i _mm256_mul_epi32 (__m256i a, __m256i b) pure @trusted
2284 {
2285     // PERF LDC + SSE2 to SSSE3. I don't quite see what to do, same problem in _mm_mul_epi32.
2286     static if (GDC_with_AVX2)
2287     {
2288         return cast(__m256i) __builtin_ia32_pmuldq256(cast(int8)a, cast(int8)b);
2289     }
2290     else static if ( (LDC_with_SSE41 || LDC_with_AVX2) && LDC_with_optimizations) 
2291     {
2292         // good with LDC + SSE4.1 to AVX2, else need to split
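        // Take the even 32-bit lanes (the low half of each 64-bit element), sign-extend, multiply.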
2293         enum ir = `
2294             %ia = shufflevector <8 x i32> %0,<8 x i32> %0, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
2295             %ib = shufflevector <8 x i32> %1,<8 x i32> %1, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
2296             %la = sext <4 x i32> %ia to <4 x i64>
2297             %lb = sext <4 x i32> %ib to <4 x i64>
2298             %r = mul <4 x i64> %la, %lb
2299             ret <4 x i64> %r`;
2300         return cast(__m256i) LDCInlineIR!(ir, long4, int8, int8)(cast(int8)a, cast(int8)b);
2301     }
2302     else
2303     {
2304         // split, very beneficial with LDC+ARM64
2305         __m128i a_lo = _mm256_extractf128_si256!0(a);
2306         __m128i a_hi = _mm256_extractf128_si256!1(a);
2307         __m128i b_lo = _mm256_extractf128_si256!0(b);
2308         __m128i b_hi = _mm256_extractf128_si256!1(b);
2309         __m128i r_lo = _mm_mul_epi32(a_lo, b_lo);
2310         __m128i r_hi = _mm_mul_epi32(a_hi, b_hi);
2311         return _mm256_set_m128i(r_hi, r_lo);
2312     }
2313 }
2314 unittest
2315 {
2316     __m256i A = _mm256_setr_epi32(61616461, 1915324654, 4564061, 3, 61616466, 1915324654, 4564061, 3);
2317     __m256i B = _mm256_setr_epi32(49716422, -915616216, -121144, 0, 49716422, -915616216, -121145, 0);
2318     long4 R = cast(long4) _mm256_mul_epi32(A, B);
2319     long[4] correct = [cast(long)61616461 * 49716422, cast(long)4564061 * -121144, cast(long)61616466 * 49716422, cast(long)4564061 * -121145];
2320     assert(R.array == correct);
2321 }
2322 
2323 /// Multiply the low unsigned 32-bit integers from each packed 64-bit element in `a` and `b`, and 
2324 /// return the unsigned 64-bit results.
2325 __m256i _mm256_mul_epu32 (__m256i a, __m256i b) pure @trusted
2326 {
2327     // PERF DMD
2328     static if (GDC_with_AVX2)
2329     {
2330         return cast(__m256i) __builtin_ia32_pmuludq256(cast(int8)a, cast(int8)b);
2331     }
2332     else version(GNU)
2333     {
2334         // explicit split needed for GDC without avx2
2335         __m128i a_lo = _mm256_extractf128_si256!0(a);
2336         __m128i a_hi = _mm256_extractf128_si256!1(a);
2337         __m128i b_lo = _mm256_extractf128_si256!0(b);
2338         __m128i b_hi = _mm256_extractf128_si256!1(b);
2339         __m128i r_lo = _mm_mul_epu32(a_lo, b_lo);
2340         __m128i r_hi = _mm_mul_epu32(a_hi, b_hi);
2341         return _mm256_set_m128i(r_hi, r_lo);
2342     }       
2343     else
2344     {
2345         // Works well in all LDC cases, surprisingly.
2346         int8 ia = cast(int8)a;
2347         int8 ib = cast(int8)b;
2348         long4 r;
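        // The low 32-bit half of each 64-bit element sits at even int indices 0, 2, 4, 6.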
2349         r.ptr[0] = cast(long)cast(uint)ia.array[0] * cast(long)cast(uint)ib.array[0];
2350         r.ptr[1] = cast(long)cast(uint)ia.array[2] * cast(long)cast(uint)ib.array[2];
2351         r.ptr[2] = cast(long)cast(uint)ia.array[4] * cast(long)cast(uint)ib.array[4];
2352         r.ptr[3] = cast(long)cast(uint)ia.array[6] * cast(long)cast(uint)ib.array[6];
2353         return cast(__m256i)r;
2354     }
2355 }
2356 unittest
2357 {
2358     __m256i A = _mm256_set_epi32(42, 0xDEADBEEF, 42, 0xffffffff, 42, 0xDEADBEEF, 42, 0xffffffff);
2359     __m256i B = _mm256_set_epi32(42, 0xCAFEBABE, 42, 0xffffffff, 42, 0xCAFEBABE, 42, 0xffffffff);
2360     __m256i C = _mm256_mul_epu32(A, B);
2361     long4 LC = cast(long4)C;
2362     long[4] correct = [18446744065119617025uL, 12723420444339690338uL, 18446744065119617025uL, 12723420444339690338uL];
2363     assert(LC.array == correct);
2364 }
2365 
2366 // TODO __m256i _mm256_mulhi_epi16 (__m256i a, __m256i b) pure @safe
2367 // TODO __m256i _mm256_mulhi_epu16 (__m256i a, __m256i b) pure @safe
2368 // TODO __m256i _mm256_mulhrs_epi16 (__m256i a, __m256i b) pure @safe
2369 // TODO __m256i _mm256_mullo_epi16 (__m256i a, __m256i b) pure @safe
2370 // TODO __m256i _mm256_mullo_epi32 (__m256i a, __m256i b) pure @safe
2371 
2372 /// Compute the bitwise OR of 256 bits (representing integer data) in `a` and `b`.
2373 __m256i _mm256_or_si256 (__m256i a, __m256i b) pure @safe
2374 {
2375     return a | b;
2376 }
2377 unittest
2378 {
2379     long A = 0x55555555_55555555;
2380     long B = 0xAAAAAAAA_AAAAAAAA;
2381     __m256i vA = _mm256_set_epi64(A, B, A, B);
2382     __m256i vB = _mm256_set_epi64(B, A, 0, B);
2383     __m256i R  = _mm256_or_si256(vA, vB);
2384     long[4] correct = [B, A, -1, -1];
2385     assert(R.array == correct);
2386 }
2387 
2388 /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using signed saturation.
2389 /// Warning: `a` and `b` are interleaved per-lane. 
2390 ///           Result has: `a` lane 0, `b` lane 0, `a` lane 1, `b` lane 1.
2391 __m256i _mm256_packs_epi16 (__m256i a, __m256i b) pure @safe
2392 {
2393     // PERF D_SIMD
2394     static if (GDC_with_AVX2)
2395     {
2396         return cast(__m256i) __builtin_ia32_packsswb256(cast(short16)a, cast(short16)b);
2397     }
2398     else static if (LDC_with_AVX2)
2399     {
2400         return cast(__m256i) __builtin_ia32_packsswb256(cast(short16)a, cast(short16)b);
2401     }
2402     else
2403     {
2404         __m128i a_lo = _mm256_extractf128_si256!0(a);
2405         __m128i a_hi = _mm256_extractf128_si256!1(a);
2406         __m128i b_lo = _mm256_extractf128_si256!0(b);
2407         __m128i b_hi = _mm256_extractf128_si256!1(b);
2408         __m128i r_lo = _mm_packs_epi16(a_lo, b_lo);
2409         __m128i r_hi = _mm_packs_epi16(a_hi, b_hi);
2410         return _mm256_set_m128i(r_hi, r_lo);
2411     }
2412 }
2413 unittest
2414 {
2415     __m256i A = _mm256_setr_epi16(1000, -1000, 1000, 0, 256, -129, 254, 0, 
2416                                  -1000, -1000, 1000, 0, 256, -129, 254, 0);
2417     byte32 R = cast(byte32) _mm256_packs_epi16(A, A);
2418     byte[32] correct = [127, -128, 127, 0, 127, -128, 127, 0,
2419                         127, -128, 127, 0, 127, -128, 127, 0,
2420                        -128, -128, 127, 0, 127, -128, 127, 0,
2421                        -128, -128, 127, 0, 127, -128, 127, 0];
2422     assert(R.array == correct);
2423 }
2424 
2425 /// Convert packed signed 32-bit integers from `a` and `b` to packed 16-bit integers using signed saturation.
2426 /// Warning: `a` and `b` are interleaved per-lane.
2427 ///           Result has: `a` lane 0, `b` lane 0, `a` lane 1, `b` lane 1.
2428 __m256i _mm256_packs_epi32 (__m256i a, __m256i b) pure @safe
2429 {
2430     // PERF D_SIMD
2431     static if (GDC_with_AVX2)
2432     {
2433         return cast(__m256i) __builtin_ia32_packssdw256(cast(int8)a, cast(int8)b);
2434     }
2435     else static if (LDC_with_AVX2)
2436     {
2437         return cast(__m256i) __builtin_ia32_packssdw256(cast(int8)a, cast(int8)b);
2438     }
2439     else
2440     {
2441         __m128i a_lo = _mm256_extractf128_si256!0(a);
2442         __m128i a_hi = _mm256_extractf128_si256!1(a);
2443         __m128i b_lo = _mm256_extractf128_si256!0(b);
2444         __m128i b_hi = _mm256_extractf128_si256!1(b);
2445         __m128i r_lo = _mm_packs_epi32(a_lo, b_lo);
2446         __m128i r_hi = _mm_packs_epi32(a_hi, b_hi);
2447         return _mm256_set_m128i(r_hi, r_lo);
2448     }
2449 }
2450 unittest
2451 {
2452     __m256i A = _mm256_setr_epi32(100000, -100000, 1000, 0, 4, 5, -100000, 7);
2453     short16 R = cast(short16) _mm256_packs_epi32(A, A);
2454     short[16] correct = [32767, -32768, 1000, 0, 32767, -32768, 1000, 0, 4, 5, -32768, 7, 4, 5, -32768, 7];
2455     assert(R.array == correct);
2456 }
2457 
2458 
2459 /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using unsigned saturation.
2460 /// Warning: `a` and `b` are interleaved per-lane. 
2461 ///           Result has: `a` lane 0, `b` lane 0, `a` lane 1, `b` lane 1.
2462 __m256i _mm256_packus_epi16 (__m256i a, __m256i b) pure @trusted
2463 {
2464     // PERF D_SIMD
2465     static if (GDC_with_AVX2)
2466     {
2467         return cast(__m256i) __builtin_ia32_packuswb256(cast(short16)a, cast(short16)b);
2468     }
2469     else static if (LDC_with_AVX2)
2470     {
2471         return cast(__m256i) __builtin_ia32_packuswb256(cast(short16)a, cast(short16)b);
2472     }
2473     else
2474     {
2475         // Always beneficial with LDC.
2476         // arm64: 4 inst with LDC  -O1
2477         __m128i a_lo = _mm256_extractf128_si256!0(a);
2478         __m128i a_hi = _mm256_extractf128_si256!1(a);
2479         __m128i b_lo = _mm256_extractf128_si256!0(b);
2480         __m128i b_hi = _mm256_extractf128_si256!1(b);
2481         __m128i r_lo = _mm_packus_epi16(a_lo, b_lo);
2482         __m128i r_hi = _mm_packus_epi16(a_hi, b_hi);
2483         return _mm256_set_m128i(r_hi, r_lo);
2484     }
2485 }
2486 unittest
2487 {
2488     __m256i A = _mm256_setr_epi16(-10, 400, 0, 256, 255, 2, 1, 0, -10, 400,  0, 256, -32768,  2,  1, 0);
2489     __m256i B = _mm256_setr_epi16(  0,   1, 2,   3,   4, 5, 6, 7,   8,   9, 10,  11,     12, 13, 14, 15);
2490     byte32 R = cast(byte32) _mm256_packus_epi16(A, B);
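    // Saturated 255 values read as -1 below, since the result is reinterpreted as signed bytes.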
2491    align(32) static immutable byte[32] correctResult = [0, -1, 0, -1, -1, 2, 1, 0, 0, 1,  2,  3,  4,  5,  6,  7,
2492                                                         0, -1, 0, -1,  0, 2, 1, 0, 8, 9, 10, 11, 12, 13, 14, 15];
2493     assert(R.array == correctResult);
2494 }
2495 
2496 /// Convert packed signed 32-bit integers from `a` and `b` to packed 16-bit integers using unsigned saturation.
2497 /// Warning: `a` and `b` are interleaved per-lane.
2498 ///           Result has: `a` lane 0, `b` lane 0, `a` lane 1, `b` lane 1.
2499 __m256i _mm256_packus_epi32 (__m256i a, __m256i b) pure @safe
2500 {
2501     // PERF D_SIMD
2502     static if (GDC_with_AVX2)
2503     {
2504         return cast(__m256i) __builtin_ia32_packusdw256(cast(int8)a, cast(int8)b);
2505     }
2506     else static if (LDC_with_AVX2)
2507     {
2508         return cast(__m256i) __builtin_ia32_packusdw256(cast(int8)a, cast(int8)b);
2509     }
2510     else
2511     {
2512         // 8 instructions on arm64 since LDC 1.22 -O2,
2513         // which sounds a bit underperforming, maybe
2514         __m128i a_lo = _mm256_extractf128_si256!0(a);
2515         __m128i a_hi = _mm256_extractf128_si256!1(a);
2516         __m128i b_lo = _mm256_extractf128_si256!0(b);
2517         __m128i b_hi = _mm256_extractf128_si256!1(b);
2518         __m128i r_lo = _mm_packus_epi32(a_lo, b_lo);
2519         __m128i r_hi = _mm_packus_epi32(a_hi, b_hi);
2520         return _mm256_set_m128i(r_hi, r_lo);
2521     }
2522 }
2523 unittest
2524 {
2525     __m256i A = _mm256_setr_epi32(100000, -100000, 1000, 0, 100000, -100000, 1000, 1);
2526     short16 R = cast(short16) _mm256_packus_epi32(A, A);
2527     short[16] correct = [cast(short)65535, 0, 1000, 0, cast(short)65535, 0, 1000, 0,
2528                          cast(short)65535, 0, 1000, 1, cast(short)65535, 0, 1000, 1];
2529     assert(R.array == correct);
2530 }
2531 
2532 
2533 
2534 // TODO __m256i _mm256_permute2x128_si256 (__m256i a, __m256i b, const int imm8) pure @safe
2535 // TODO __m256i _mm256_permute4x64_epi64 (__m256i a, const int imm8) pure @safe
2536 // TODO __m256d _mm256_permute4x64_pd (__m256d a, const int imm8) pure @safe
2537 // TODO __m256i _mm256_permutevar8x32_epi32 (__m256i a, __m256i idx) pure @safe
2538 // TODO __m256 _mm256_permutevar8x32_ps (__m256 a, __m256i idx) pure @safe
2539 
2540 /// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`, then horizontally sum each
2541 /// consecutive 8 differences to produce four unsigned 16-bit integers, and pack these unsigned 16-bit integers in the
2542 /// low 16 bits of 64-bit elements in result.
2543 __m256i _mm256_sad_epu8 (__m256i a, __m256i b) pure @trusted
2544 {
2545     static if (GDC_with_AVX2)
2546     {
2547         return cast(__m256i) __builtin_ia32_psadbw256(cast(ubyte32)a, cast(ubyte32)b);
2548     }
2549     else static if (LDC_with_AVX2)
2550     {
2551         return cast(__m256i) __builtin_ia32_psadbw256(cast(byte32)a, cast(byte32)b);
2552     }
2553     else
2554     {
2555         // split is beneficial for ARM64, LDC and GDC without AVX2
2556         __m128i a_lo = _mm256_extractf128_si256!0(a);
2557         __m128i a_hi = _mm256_extractf128_si256!1(a);
2558         __m128i b_lo = _mm256_extractf128_si256!0(b);
2559         __m128i b_hi = _mm256_extractf128_si256!1(b);
2560         __m128i r_lo = _mm_sad_epu8(a_lo, b_lo);
2561         __m128i r_hi = _mm_sad_epu8(a_hi, b_hi);
2562         return _mm256_set_m128i(r_hi, r_lo);
2563     }
2564 }
2565 unittest
2566 {
2567     __m256i A = _mm256_setr_epi8(3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54,
2568                                  3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54); // primes + 1
2569     __m256i B = _mm256_set1_epi8(1);
2570     int8 R = cast(int8) _mm256_sad_epu8(A, B);
2571     int[8] correct = [2 + 3 + 5 + 7 + 11 + 13 + 17 + 19,
2572                       0,
2573                       23 + 29 + 31 + 37 + 41 + 43 + 47 + 53,
2574                       0,
2575                       2 + 3 + 5 + 7 + 11 + 13 + 17 + 19,
2576                       0,
2577                       23 + 29 + 31 + 37 + 41 + 43 + 47 + 53,
2578                       0];
2579     assert(R.array == correct);
2580 }
2581 
2582 
2583 // TODO __m256i _mm256_shuffle_epi32 (__m256i a, const int imm8) pure @safe
2584 // TODO __m256i _mm256_shuffle_epi8 (__m256i a, __m256i b) pure @safe
2585 // TODO __m256i _mm256_shufflehi_epi16 (__m256i a, const int imm8) pure @safe
2586 // TODO __m256i _mm256_shufflelo_epi16 (__m256i a, const int imm8) pure @safe
2587 // TODO __m256i _mm256_sign_epi16 (__m256i a, __m256i b) pure @safe
2588 // TODO __m256i _mm256_sign_epi32 (__m256i a, __m256i b) pure @safe
2589 // TODO __m256i _mm256_sign_epi8 (__m256i a, __m256i b) pure @safe
2590 // TODO __m256i _mm256_sll_epi16 (__m256i a, __m128i count) pure @safe
2591 // TODO __m256i _mm256_sll_epi32 (__m256i a, __m128i count) pure @safe
2592 // TODO __m256i _mm256_sll_epi64 (__m256i a, __m128i count) pure @safe
2593 
2594 /// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros.
2595 __m256i _mm256_slli_epi16(__m256i a, int imm8) pure @safe
2596 {
2597     static if (GDC_or_LDC_with_AVX2)
2598     {
2599         return cast(__m256i) __builtin_ia32_psllwi256(cast(short16)a, cast(ubyte)imm8);
2600     }
2601     else // split
2602     {
2603         __m128i a_lo = _mm256_extractf128_si256!0(a);
2604         __m128i a_hi = _mm256_extractf128_si256!1(a);
2605         __m128i r_lo = _mm_slli_epi16(a_lo, imm8);
2606         __m128i r_hi = _mm_slli_epi16(a_hi, imm8);
2607         return _mm256_set_m128i(r_hi, r_lo);
2608     }
2609 }
2610 unittest
2611 {
2612     __m256i A = _mm256_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7, 0, 1, 2, 3, -4, -5, 6, 7);
2613     short16 B = cast(short16)( _mm256_slli_epi16(A, 1) );
2614     short16 B2 = cast(short16)( _mm256_slli_epi16(A, 1 + 256) );
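    // Only the low 8 bits of the shift count are considered, so 257 behaves like 1.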
2615     short[16] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14, 0, 2, 4, 6, -8, -10, 12, 14 ];
2616     assert(B.array == expectedB);
2617     assert(B2.array == expectedB);
2618 
2619     short16 C = cast(short16)( _mm256_slli_epi16(A, 16) );
2620     short[16] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ];
2621     assert(C.array == expectedC);
2622 }
2623 
2624 /// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros.
2625 __m256i _mm256_slli_epi32 (__m256i a, int imm8) pure @safe
2626 {
2627     static if (GDC_or_LDC_with_AVX2)
2628     {
2629         return cast(__m256i) __builtin_ia32_pslldi256(cast(int8)a, cast(ubyte)imm8);
2630     }
2631     else
2632     {
2633         __m128i a_lo = _mm256_extractf128_si256!0(a);
2634         __m128i a_hi = _mm256_extractf128_si256!1(a);
2635         __m128i r_lo = _mm_slli_epi32(a_lo, imm8);
2636         __m128i r_hi = _mm_slli_epi32(a_hi, imm8);
2637         return _mm256_set_m128i(r_hi, r_lo);
2638     }
2639 }
2640 unittest
2641 {
2642     __m256i A = _mm256_setr_epi32(0, 2, 3, -4, 0, 2, 3, -9);
2643     int8 B = cast(int8) _mm256_slli_epi32(A, 1);
2644     int8 B2 = cast(int8) _mm256_slli_epi32(A, 1 + 256);
2645     int[8] expectedB = [ 0, 4, 6, -8, 0, 4, 6, -18 ];
2646     assert(B.array == expectedB);
2647     assert(B2.array == expectedB);
2648 
2649     int8 C = cast(int8) _mm256_slli_epi32(A, 0);
2650     int[8] expectedC = [ 0, 2, 3, -4, 0, 2, 3, -9 ];
2651     assert(C.array == expectedC);
2652 
2653     int8 D = cast(int8) _mm256_slli_epi32(A, 65);
2654     int[8] expectedD = [ 0, 0, 0, 0, 0, 0, 0, 0 ];
2655     assert(D.array == expectedD);
2656 }
2657 
2658 /// Shift packed 64-bit integers in `a` left by `imm8` while shifting in zeros.
2659 __m256i _mm256_slli_epi64 (__m256i a, int imm8) pure @safe
2660 {
2661     static if (GDC_or_LDC_with_AVX2)
2662     {
2663         return cast(__m256i) __builtin_ia32_psllqi256(cast(long4)a, cast(ubyte)imm8);
2664     }
2665     else
2666     {
2667         __m128i a_lo = _mm256_extractf128_si256!0(a);
2668         __m128i a_hi = _mm256_extractf128_si256!1(a);
2669         __m128i r_lo = _mm_slli_epi64(a_lo, imm8);
2670         __m128i r_hi = _mm_slli_epi64(a_hi, imm8);
2671         return _mm256_set_m128i(r_hi, r_lo);
2672     }
2673 }
2674 unittest
2675 {
2676     __m256i A = _mm256_setr_epi64(23, -4, 1, long.max);
2677     long4 B = cast(long4) _mm256_slli_epi64(A, 1);
2678     long4 B2 = cast(long4) _mm256_slli_epi64(A, 1 + 256);
2679 
2680     long[4] expectedB = [ 46, -8, 2, -2];
2681     assert(B.array == expectedB);
2682     assert(B2.array == expectedB);
2683 
2684     long4 C = cast(long4) _mm256_slli_epi64(A, 0);
2685     long[4] expectedC = [ 23, -4, 1, long.max ];
2686     assert(C.array == expectedC);
2687 
2688     long4 D = cast(long4) _mm256_slli_epi64(A, 65);
2689     long[4] expectedD = [ 0, 0, 0, 0 ];
2690     assert(D.array == expectedD);
2691 }
2692 
2693 // TODO __m256i _mm256_slli_si256 (__m256i a, const int imm8) pure @safe
2694 // TODO __m128i _mm_sllv_epi32 (__m128i a, __m128i count) pure @safe
2695 // TODO __m256i _mm256_sllv_epi32 (__m256i a, __m256i count) pure @safe
2696 // TODO __m128i _mm_sllv_epi64 (__m128i a, __m128i count) pure @safe
2697 // TODO __m256i _mm256_sllv_epi64 (__m256i a, __m256i count) pure @safe
2698 // TODO __m256i _mm256_sra_epi16 (__m256i a, __m128i count) pure @safe
2699 // TODO __m256i _mm256_sra_epi32 (__m256i a, __m128i count) pure @safe
2700 
2701 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits.
2702 __m256i _mm256_srai_epi16 (__m256i a, int imm8) pure @safe
2703 {
2704     static if (GDC_or_LDC_with_AVX2)
2705     {
2706         return cast(__m256i) __builtin_ia32_psrawi256(cast(short16)a, cast(ubyte)imm8);
2707     }
2708     else 
2709     {
2710         // split
2711         __m128i a_lo = _mm256_extractf128_si256!0(a);
2712         __m128i a_hi = _mm256_extractf128_si256!1(a);
2713         __m128i r_lo = _mm_srai_epi16(a_lo, imm8);
2714         __m128i r_hi = _mm_srai_epi16(a_hi, imm8);
2715         return _mm256_set_m128i(r_hi, r_lo);
2716     }
2717 }
2718 unittest
2719 {
2720     __m256i A  = _mm256_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7, short.min, short.max, 2, 3, -4, -5, 6, 7);
2721     short16 B  = cast(short16)( _mm256_srai_epi16(A, 1) );
2722     short16 B2 = cast(short16)( _mm256_srai_epi16(A, 1 + 256) );
2723     short[16] expectedB = [ 0, 0, 1, 1, -2, -3, 3, 3, -16384, 16383, 1, 1, -2, -3, 3, 3 ];
2724     assert(B.array == expectedB);
2725     assert(B2.array == expectedB);
2726 
2727     short16 C = cast(short16)( _mm256_srai_epi16(A, 18) );
2728     short[16] expectedC = [ 0, 0, 0, 0, -1, -1, 0, 0,
2729                            -1, 0, 0, 0, -1, -1, 0, 0 ];
2730     assert(C.array == expectedC);
2731 }
2732 
2733 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits.
2734 __m256i _mm256_srai_epi32 (__m256i a, int imm8) pure @safe
2735 {
2736     static if (GDC_or_LDC_with_AVX2)
2737     {
2738         return cast(__m256i) __builtin_ia32_psradi256(cast(int8)a, cast(ubyte)imm8);
2739     }
2740     else // split
2741     {
2742         __m128i a_lo = _mm256_extractf128_si256!0(a);
2743         __m128i a_hi = _mm256_extractf128_si256!1(a);
2744         __m128i r_lo = _mm_srai_epi32(a_lo, imm8);
2745         __m128i r_hi = _mm_srai_epi32(a_hi, imm8);
2746         return _mm256_set_m128i(r_hi, r_lo);
2747     }
2748 }
2749 unittest
2750 {
2751     __m256i A = _mm256_setr_epi32(0, 2, 3, -4, 0, 2, 3, -4);
2752     int8 B = cast(int8) _mm256_srai_epi32(A, 1);
2753     int8 B2 = cast(int8) _mm256_srai_epi32(A, 1 + 256);
2754     int[8] expectedB = [ 0, 1, 1, -2, 0, 1, 1, -2];
2755     assert(B.array == expectedB);
2756     assert(B2.array == expectedB);
2757 
2758     int8 C = cast(int8) _mm256_srai_epi32(A, 32);
2759     int[8] expectedC = [ 0, 0, 0, -1, 0, 0, 0, -1];
2760     assert(C.array == expectedC);
2761 
2762     int8 D = cast(int8) _mm256_srai_epi32(A, 0);
2763     int[8] expectedD = [ 0, 2, 3, -4, 0, 2, 3, -4];
2764     assert(D.array == expectedD);
2765 }
2766 
2767 // TODO __m128i _mm_srav_epi32 (__m128i a, __m128i count) pure @safe
2768 // TODO __m256i _mm256_srav_epi32 (__m256i a, __m256i count) pure @safe
2769 // TODO __m256i _mm256_srl_epi16 (__m256i a, __m128i count) pure @safe
2770 // TODO __m256i _mm256_srl_epi32 (__m256i a, __m128i count) pure @safe
2771 // TODO __m256i _mm256_srl_epi64 (__m256i a, __m128i count) pure @safe
2772 
2773 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros.
2774 __m256i _mm256_srli_epi16 (__m256i a, int imm8) pure @trusted
2775 {
2776     static if (GDC_with_AVX2)
2777     {
2778         return cast(__m256i) __builtin_ia32_psrlwi256(cast(short16)a, cast(ubyte)imm8);
2779     }
2780     else static if (LDC_with_AVX2)
2781     {
2782         return cast(__m256i) __builtin_ia32_psrlwi256(cast(short16)a, cast(ubyte)imm8);
2783     }
2784     else
2785     {
2786         __m128i a_lo = _mm256_extractf128_si256!0(a);
2787         __m128i a_hi = _mm256_extractf128_si256!1(a);
2788         __m128i r_lo = _mm_srli_epi16(a_lo, imm8);
2789         __m128i r_hi = _mm_srli_epi16(a_hi, imm8);
2790         return _mm256_set_m128i(r_hi, r_lo);
2791     }
2792 }
2793 unittest
2794 {
2795     __m256i A = _mm256_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7, 0, 1, 2, 3, -4, -5, 6, 7);
2796     short16 B = cast(short16) _mm256_srli_epi16(A, 1);
2797     short16 B2 = cast(short16) _mm256_srli_epi16(A, 1 + 256);
2798     short[16] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3, 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ];
2799     assert(B.array == expectedB);
2800     assert(B2.array == expectedB);
2801 
2802     short16 C = cast(short16) _mm256_srli_epi16(A, 16);
2803     short[16] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ];
2804     assert(C.array == expectedC);
2805 
2806     short16 D = cast(short16) _mm256_srli_epi16(A, 0);
2807     short[16] expectedD = [ 0, 1, 2, 3, -4, -5, 6, 7, 0, 1, 2, 3, -4, -5, 6, 7 ];
2808     assert(D.array == expectedD);
2809 }
2810 
2811 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros.
2812 __m256i _mm256_srli_epi32 (__m256i a, int imm8) pure @trusted
2813 {
2814     static if (GDC_or_LDC_with_AVX2)
2815     {
2816         return cast(__m256i) __builtin_ia32_psrldi256(cast(int8)a, cast(ubyte)imm8);
2817     }
2822     else
2823     {
2824         // split
2825         __m128i a_lo = _mm256_extractf128_si256!0(a);
2826         __m128i a_hi = _mm256_extractf128_si256!1(a);
2827         __m128i r_lo = _mm_srli_epi32(a_lo, imm8);
2828         __m128i r_hi = _mm_srli_epi32(a_hi, imm8);
2829         return _mm256_set_m128i(r_hi, r_lo);
2830     }
2831 }
2832 unittest
2833 {
2834     __m256i A = _mm256_setr_epi32(0, 2, 3, -4, 0, 2, 3, -4);
2835     int8 B = cast(int8) _mm256_srli_epi32(A, 1);
2836     int8 B2 = cast(int8) _mm256_srli_epi32(A, 1 + 256);
2837     int[8] expectedB = [ 0, 1, 1, 0x7FFFFFFE, 0, 1, 1, 0x7FFFFFFE];
2838     assert(B.array == expectedB);
2839     assert(B2.array == expectedB);
2840 
2841     int8 C = cast(int8) _mm256_srli_epi32(A, 255);
2842     int[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0 ];
2843     assert(C.array == expectedC);
2844 }
2845 
2846 // TODO __m256i _mm256_srli_epi64 (__m256i a, int imm8) pure @safe (see scalar sketch below)
2847 // TODO __m256i _mm256_srli_si256 (__m256i a, const int imm8) pure @safe
2848 // TODO __m128i _mm_srlv_epi32 (__m128i a, __m128i count) pure @safe
2849 // TODO __m256i _mm256_srlv_epi32 (__m256i a, __m256i count) pure @safe
2850 // TODO __m128i _mm_srlv_epi64 (__m128i a, __m128i count) pure @safe
2851 // TODO __m256i _mm256_srlv_epi64 (__m256i a, __m256i count) pure @safe
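
// A minimal scalar sketch for the _mm256_srli_epi64 TODO above, assuming the
// usual PSRLQ semantics (shift counts above 63 zero the lane) and this
// module's style for scalar fallbacks. The `_sketch` suffix and version(none)
// guard mark this as illustrative only, not part of the API.
version(none)
{
    __m256i _mm256_srli_epi64_sketch (__m256i a, int imm8) pure @trusted
    {
        long4 r;
        foreach(i; 0..4)
        {
            // logical shift: shift the unsigned view; counts above 63 zero the lane
            r.ptr[i] = (cast(uint)imm8 > 63) ? 0 : cast(long)(cast(ulong)a.array[i] >> imm8);
        }
        return r;
    }
}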
2852 
2853 // TODO __m256i _mm256_stream_load_si256 (__m256i const* mem_addr) pure @safe
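
// The _mm256_stream_load_si256 TODO above can always fall back to a plain
// 32-byte aligned load: VMOVNTDQA's non-temporal hint affects performance,
// not results. A minimal sketch (hypothetical name, not compiled):
version(none)
{
    __m256i _mm256_stream_load_si256_sketch (const(__m256i)* mem_addr) pure @trusted
    {
        return *mem_addr; // correctness-only fallback; the NT hint is dropped
    }
}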
2854 
2855 /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`.
2856 __m256i _mm256_sub_epi16 (__m256i a, __m256i b) pure @safe
2857 {
2858     pragma(inline, true);
2859     return cast(__m256i)(cast(short16)a - cast(short16)b);
2860 }
2861 unittest
2862 {
2863     __m256i A = _mm256_setr_epi16( -7, -1, 0, 9, -100, 100, 234, 432, -32768, 32767, 0, -1, -20000, 0,  6, -2);
2864     short16 R = cast(short16) _mm256_sub_epi16(A, A);
2865     short[16] correct         = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0];
2866     assert(R.array == correct);
2867 }
2868 
2869 /// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
2870 __m256i _mm256_sub_epi32(__m256i a, __m256i b) pure @safe
2871 {
2872     pragma(inline, true);
2873     return cast(__m256i)(cast(int8)a - cast(int8)b);
2874 }
2875 unittest
2876 {
2877     __m256i A = _mm256_setr_epi32( -7, -1, 0, 9, -100, 100, 234, 432);
2878     int8 R = cast(int8) _mm256_sub_epi32(A, A);
2879     int[8] correct = [ 0, 0, 0, 0, 0, 0, 0, 0];
2880     assert(R.array == correct);
2881 }
2882 
2883 /// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`.
2884 __m256i _mm256_sub_epi64 (__m256i a, __m256i b) pure @safe
2885 {
2886     pragma(inline, true);
2887     return a - b;
2888 }
2889 unittest
2890 {
2891     __m256i A = _mm256_setr_epi64(-1, 0x8000_0000_0000_0000, 42, -12);
2892     long4 R = _mm256_sub_epi64(A, A);
2893     long[4] correct = [ 0, 0, 0, 0 ];
2894     assert(R.array == correct);
2895 }
2896 
2897 /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`.
2898 __m256i _mm256_sub_epi8 (__m256i a, __m256i b) pure @safe
2899 {
2900     pragma(inline, true);
2901     return cast(__m256i)(cast(byte32)a - cast(byte32)b);
2902 }
2903 unittest
2904 {
2905     __m256i A = _mm256_setr_epi8(4, 8, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -1, 0, 9, 78,
2906                                  4, 9, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -2, 0, 10, 78);
2907     byte32 R = cast(byte32) _mm256_sub_epi8(A, A);
2908     byte[32] correct; // zero initialized
2909     assert(R.array == correct);
2910 }
2911 
2912 /// Subtract packed signed 16-bit integers in `b` from packed 16-bit integers in `a` using 
2913 /// saturation.
2914 __m256i _mm256_subs_epi16 (__m256i a, __m256i b) pure @trusted
2915 {
2916     // PERF DMD
2917     static if (GDC_with_AVX2)
2918     {
2919         return cast(__m256i) __builtin_ia32_psubsw256(cast(short16)a, cast(short16)b);
2920     }
2921     else static if (LDC_with_saturated_intrinsics)
2922     {
2923         return cast(__m256i) inteli_llvm_subs!short16(cast(short16)a, cast(short16)b);
2924     }
2925     else
2926     {
2927         short16 r;
2928         short16 sa = cast(short16)a;
2929         short16 sb = cast(short16)b;
2930         foreach(i; 0..16)
2931             r.ptr[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
2932         return cast(__m256i)r;
2933     }
2934 }
2935 unittest
2936 {
2937     short16 res = cast(short16) _mm256_subs_epi16(_mm256_setr_epi16( 7,  6,  5, -32768, 3, 3, 32766,   0,  7,  6,  5, -32750, 3, 3, 32767,   0),
2938                                                   _mm256_setr_epi16( 7,  6,  5, -30000, 3, 1,    -2, -10,  7,  6,  5,    100, 3, 1,     1, -10));
2939     static immutable short[16] correctResult                    =  [ 0,  0,  0,  -2768, 0, 2, 32767,  10,  0,  0,  0, -32768, 0, 2, 32766,  10];
2940     assert(res.array == correctResult);
2941 }
2942 
2944 /// Subtract packed signed 8-bit integers in `b` from packed 8-bit integers in `a` using
2945 /// saturation.
2946 __m256i _mm256_subs_epi8 (__m256i a, __m256i b) pure @trusted
2947 {
2948     // PERF DMD
2949     static if (GDC_with_AVX2)
2950     {
2951         return cast(__m256i) __builtin_ia32_psubsb256(cast(ubyte32)a, cast(ubyte32)b);
2952     }
2953     else static if (LDC_with_saturated_intrinsics)
2954     {
2955         return cast(__m256i) inteli_llvm_subs!byte32(cast(byte32)a, cast(byte32)b);
2956     }
2957     else
2958     {
2959         byte32 r;
2960         byte32 sa = cast(byte32)a;
2961         byte32 sb = cast(byte32)b;
2962         foreach(i; 0..32)
2963             r.ptr[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
2964         return cast(__m256i)r;
2965     }
2966 }
2967 unittest
2968 {
2969     byte32 R = cast(byte32) _mm256_subs_epi8(_mm256_setr_epi8(15, 14, 13, 12, 11, 127, 9, 8, 7, 6, 5, -128, 3, 2, 1, 0, 15, 14, 13, 12, 11, 126, 9, 8, 7, 6, 5, -127, 3, 2, 1, 0),
2970                                              _mm256_setr_epi8(15, 14, 13, 12, 11,  10, 9, 8, 7, 6, 5,    4, 3, 2, 1, 0, 15, 14, 13, 12, 11, -10, 9, 8, 7, 6, 5,    4, 3, 2, 1, 0));
2971     static immutable byte[32] correct                      = [ 0,  0,  0,  0,  0, 117, 0, 0, 0, 0, 0, -128, 0, 0, 0, 0,  0,  0,  0,  0,  0, 127, 0, 0, 0, 0, 0, -128, 0, 0, 0, 0]; 
2972     assert(R.array == correct);
2973 }
2974 
2975 /// Subtract packed unsigned 16-bit integers in `b` from packed unsigned 16-bit integers in `a` 
2976 /// using saturation.
2977 __m256i _mm256_subs_epu16 (__m256i a, __m256i b) pure @trusted
2978 {
2979     // PERF DMD
2980     static if (GDC_with_AVX2)
2981     {
2982         return cast(__m256i) __builtin_ia32_psubusw256(cast(short16)a, cast(short16)b);
2983     }
2984     else static if (LDC_with_saturated_intrinsics)
2985     {
2986         return cast(__m256i) inteli_llvm_subus!short16(cast(short16)a, cast(short16)b);
2987     }
2988     else
2989     {
2990         short16 r;
2991         short16 sa = cast(short16)a;
2992         short16 sb = cast(short16)b;
2993         foreach(i; 0..16)
2994             r.ptr[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]));
2995         return cast(__m256i)r;
2996     }
2997 }
2998 unittest
2999 {
3000     short16 R = cast(short16) _mm256_subs_epu16(_mm256_setr_epi16(3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0, 3,  2, cast(short)65534, 0),
3001                                                 _mm256_setr_epi16(3, 4,                1, 0, 3, 2,                1, 0, 3, 2,                1, 0, 3, 20, cast(short)65535, 0));
3002     static immutable short[16] correct =                         [0, 0, cast(short)65534, 0, 0, 0, cast(short)65534, 0, 0, 0, cast(short)65534, 0, 0,  0,                0, 0];
3003     assert(R.array == correct);
3004 }
3005 
3006 /// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit integers in `a` using
3007 /// saturation.
3008 __m256i _mm256_subs_epu8 (__m256i a, __m256i b) pure @trusted
3009 {
3010     // PERF DMD
3011     // PERF GDC without AVX2
3012     static if (GDC_with_AVX2)
3013     {
3014         return cast(__m256i) __builtin_ia32_psubusb256(cast(ubyte32)a, cast(ubyte32)b);
3015     }
3016     else static if (LDC_with_saturated_intrinsics)
3017     {
3018         return cast(__m256i) inteli_llvm_subus!byte32(cast(byte32)a, cast(byte32)b);
3019     }
3020     else
3021     {
3022         byte32 r;
3023         byte32 sa = cast(byte32)a;
3024         byte32 sb = cast(byte32)b;
3025         foreach(i; 0..32)
3026             r.ptr[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
3027         return cast(__m256i)r;
3028     }
3029 }
3030 unittest
3031 {
3032     __m256i A          = _mm256_setr_epi8(0, 0, 5, 4, 5, 0, 0, 0, 0, 0, 0, 0, cast(byte)255, 0, 0, 0, 0, 0, 0, 0, 0, cast(byte)136, 0, 0, 0, cast(byte)136, 0, 0, 0, 0, 0, 0);
3033     __m256i B          = _mm256_setr_epi8(0, 0, 4, 5, 5, 0, 0, 0, 0, 0, 0, 0,             1, 0, 0, 0, 0, 0, 0, 0, 0, cast(byte)137, 0, 0, 0,            40, 0, 0, 0, 0, 0, 0);
3034     byte32 R = cast(byte32) _mm256_subs_epu8(A, B);
3035     static immutable byte[32] correct =  [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, cast(byte)254, 0, 0, 0, 0, 0, 0, 0, 0,   cast(byte)0, 0, 0, 0, cast(byte) 96, 0, 0, 0, 0, 0, 0];
3036     assert(R.array == correct);
3037 }
3038 
3039 /// Unpack and interleave 16-bit integers from the high half of each 128-bit lane in `a` and `b`.
3040 __m256i _mm256_unpackhi_epi16 (__m256i a, __m256i b) pure @safe
3041 {
3042     static if (GDC_with_AVX2)
3043     {
3044         return cast(long4) __builtin_ia32_punpckhwd256(cast(short16)a, cast(short16)b);
3045     }
3046     else static if (LDC_with_optimizations)
3047     {
3048         enum ir = `%r = shufflevector <16 x i16> %0, <16 x i16> %1, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
3049             ret <16 x i16> %r`;
3050         return cast(__m256i)LDCInlineIR!(ir, short16, short16, short16)(cast(short16)a, cast(short16)b);
3051     }
3052     else
3053     {
3054         // Better for arm64, GDC without AVX2
3055         __m128i a_lo = _mm256_extractf128_si256!0(a);
3056         __m128i a_hi = _mm256_extractf128_si256!1(a);
3057         __m128i b_lo = _mm256_extractf128_si256!0(b);
3058         __m128i b_hi = _mm256_extractf128_si256!1(b);
3059         __m128i r_lo = _mm_unpackhi_epi16(a_lo, b_lo);
3060         __m128i r_hi = _mm_unpackhi_epi16(a_hi, b_hi);
3061         return _mm256_set_m128i(r_hi, r_lo);
3062     }
3063 }
3064 unittest
3065 {
3066     __m256i A = _mm256_setr_epi16( 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15);
3067     __m256i B = _mm256_setr_epi16(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
3068     short16 C = cast(short16) _mm256_unpackhi_epi16(A, B);
3069     short[16] correct = [4,  20, 5,  21, 6, 22, 7, 23, 
3070                          12, 28, 13, 29, 14, 30, 15, 31];
3071     assert(C.array == correct);
3072 }
3073 
3074 /// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in `a` and `b`.
3075 __m256i _mm256_unpackhi_epi32 (__m256i a, __m256i b) pure @trusted
3076 {
3077     static if (GDC_with_AVX2)
3078         enum bool split = false;
3079     else version(GNU)
3080         enum bool split = true;
3081     else
3082         enum bool split = false;
3083 
3084     static if (GDC_with_AVX2)
3085     {
3086         return cast(long4) __builtin_ia32_punpckhdq256(cast(int8)a, cast(int8)b);
3087     }
3088     else static if (LDC_with_optimizations)
3089     {
3090         // LDC AVX2: Surprisingly, this starts using vunpckhps in LDC 1.31 -O2
3091         enum ir = `%r = shufflevector <8 x i32> %0, <8 x i32> %1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
3092             ret <8 x i32> %r`;
3093         return cast(__m256i)LDCInlineIR!(ir, int8, int8, int8)(cast(int8)a, cast(int8)b);
3094     }
3095     else static if (split)
3096     {
3097         __m128i a_lo = _mm256_extractf128_si256!0(a);
3098         __m128i a_hi = _mm256_extractf128_si256!1(a);
3099         __m128i b_lo = _mm256_extractf128_si256!0(b);
3100         __m128i b_hi = _mm256_extractf128_si256!1(b);
3101         __m128i r_lo = _mm_unpackhi_epi32(a_lo, b_lo);
3102         __m128i r_hi = _mm_unpackhi_epi32(a_hi, b_hi);
3103         return _mm256_set_m128i(r_hi, r_lo);
3104     }
3105     else
3106     {
3107         int8 R;
3108         int8 ai = cast(int8)a;
3109         int8 bi = cast(int8)b;
3110         R.ptr[0] = ai.array[2];
3111         R.ptr[1] = bi.array[2];
3112         R.ptr[2] = ai.array[3];
3113         R.ptr[3] = bi.array[3];
3114         R.ptr[4] = ai.array[6];
3115         R.ptr[5] = bi.array[6];
3116         R.ptr[6] = ai.array[7];
3117         R.ptr[7] = bi.array[7];
3118         return cast(__m256i) R;
3119     }
3120 }
3121 unittest
3122 {
3123     __m256i A = _mm256_setr_epi32(0, 1,  2,  3,  4,  5,  6,  7);
3124     __m256i B = _mm256_setr_epi32(8, 9, 10, 11, 12, 13, 14, 15);
3125     int8 C = cast(int8) _mm256_unpackhi_epi32(A, B);
3126     int[8] correct = [2, 10, 3, 11, 6, 14, 7, 15];
3127     assert(C.array == correct);
3128 }
3129 
3130 /// Unpack and interleave 8-bit integers from the high half of each 128-bit lane in `a` and `b`.
3131 __m256i _mm256_unpackhi_epi8 (__m256i a, __m256i b) pure @trusted
3132 {
3133     static if (GDC_with_AVX2)
3134     {
3135         return cast(__m256i) __builtin_ia32_punpckhbw256(cast(ubyte32)a, cast(ubyte32)b);
3136     }
3137     else static if (LDC_with_optimizations)
3138     {
3139         enum ir = `%r = shufflevector <32 x i8> %0, <32 x i8> %1, <32 x i32> <i32 8, i32 40,  i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
3140             ret <32 x i8> %r`;
3141         return cast(__m256i)LDCInlineIR!(ir, byte32, byte32, byte32)(cast(byte32)a, cast(byte32)b);
3142     }
3143     else
3144     {
3145         // Splitting always beneficial
3146         __m128i a_lo = _mm256_extractf128_si256!0(a);
3147         __m128i a_hi = _mm256_extractf128_si256!1(a);
3148         __m128i b_lo = _mm256_extractf128_si256!0(b);
3149         __m128i b_hi = _mm256_extractf128_si256!1(b);
3150         __m128i r_lo = _mm_unpackhi_epi8(a_lo, b_lo);
3151         __m128i r_hi = _mm_unpackhi_epi8(a_hi, b_hi);
3152         return _mm256_set_m128i(r_hi, r_lo);
3153     }
3154 }
3155 unittest
3156 {
3157     __m256i A = _mm256_setr_epi8(  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
3158                                   16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
3159     __m256i B = _mm256_setr_epi8( 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
3160                                   48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
3161     byte32 C = cast(byte32) _mm256_unpackhi_epi8(A, B);
3162     byte[32] correct =          [  8, 40,  9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
3163                                   24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63 ];
3164     assert(C.array == correct);
3165 }
3166 
3167 /// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in `a` and `b`.
3168 __m256i _mm256_unpackhi_epi64 (__m256i a, __m256i b) pure @trusted
3169 {
3170     version(GNU)
3171         enum split = true; // Benefits GDC in non-AVX2
3172     else
3173         enum split = false;
3174 
3175     static if (GDC_with_AVX2)
3176     {
3177         return __builtin_ia32_punpckhqdq256(a, b);
3178     }
3179     else static if (LDC_with_optimizations)
3180     {
3181         enum ir = `%r = shufflevector <4 x i64> %0, <4 x i64> %1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
3182             ret <4 x i64> %r`;
3183         return cast(__m256i)LDCInlineIR!(ir, long4, long4, long4)(a, b);
3184     }
3185     else static if (split)
3186     {
3187         __m128i a_lo = _mm256_extractf128_si256!0(a);
3188         __m128i a_hi = _mm256_extractf128_si256!1(a);
3189         __m128i b_lo = _mm256_extractf128_si256!0(b);
3190         __m128i b_hi = _mm256_extractf128_si256!1(b);
3191         __m128i r_lo = _mm_unpackhi_epi64(a_lo, b_lo);
3192         __m128i r_hi = _mm_unpackhi_epi64(a_hi, b_hi);
3193         return _mm256_set_m128i(r_hi, r_lo);
3194     }
3195     else
3196     {        
3197         long4 R;
3198         R.ptr[0] = a.array[1];
3199         R.ptr[1] = b.array[1];
3200         R.ptr[2] = a.array[3];
3201         R.ptr[3] = b.array[3];
3202         return R;
3203     }
3204 }
3205 unittest
3206 {
3207     __m256i A = _mm256_setr_epi64(0x22222222_22222222, 0x33333333_33333333, 2, 3);
3208     __m256i B = _mm256_setr_epi64(0x44444444_44444444, 0x55555555_55555555, 4, 5);
3209     long4 C = _mm256_unpackhi_epi64(A, B);
3210     long[4] correct = [0x33333333_33333333, 0x55555555_55555555, 3, 5];
3211     assert(C.array == correct);
3212 }
3213 
3214 /// Unpack and interleave 16-bit integers from the low half of each 128-bit lane in `a` and `b`.
3215 __m256i _mm256_unpacklo_epi16 (__m256i a, __m256i b) pure @safe
3216 {
3217     static if (GDC_with_AVX2)
3218     {
3219         return cast(__m256i) __builtin_ia32_punpcklwd256(cast(short16)a, cast(short16)b);
3220     }
3221     else static if (LDC_with_optimizations)
3222     {
3223         enum ir = `%r = shufflevector <16 x i16> %0, <16 x i16> %1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
3224             ret <16 x i16> %r`;
3225         return cast(__m256i)LDCInlineIR!(ir, short16, short16, short16)(cast(short16)a, cast(short16)b);
3226     }
3227     else
3228     {
3229         __m128i a_lo = _mm256_extractf128_si256!0(a);
3230         __m128i a_hi = _mm256_extractf128_si256!1(a);
3231         __m128i b_lo = _mm256_extractf128_si256!0(b);
3232         __m128i b_hi = _mm256_extractf128_si256!1(b);
3233         __m128i r_lo = _mm_unpacklo_epi16(a_lo, b_lo);
3234         __m128i r_hi = _mm_unpacklo_epi16(a_hi, b_hi);
3235         return _mm256_set_m128i(r_hi, r_lo);
3236     }
3237 }
3238 unittest
3239 {
3240     __m256i A = _mm256_setr_epi16( 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15);
3241     __m256i B = _mm256_setr_epi16(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
3242     short16 C = cast(short16) _mm256_unpacklo_epi16(A, B);
3243     short[16] correct = [0,  16, 1,  17, 2, 18, 3, 19, 
3244                          8,  24, 9,  25, 10, 26, 11, 27];
3245     assert(C.array == correct);
3246 }
3247 
3248 /// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in `a` and `b`.
3249 __m256i _mm256_unpacklo_epi32 (__m256i a, __m256i b) pure @trusted
3250 {
3251     static if (GDC_with_AVX2)
3252         enum bool split = false;
3253     else version(GNU)
3254         enum bool split = true;
3255     else
3256         enum bool split = false;
3257 
3258     static if (GDC_with_AVX2)
3259     {
3260         return cast(long4) __builtin_ia32_punpckldq256(cast(int8)a, cast(int8)b);
3261     }
3262     else static if (LDC_with_optimizations)
3263     {
3264         // LDC AVX2: Surprisingly, this starts using vunpcklps in LDC 1.31 -O1
3265         enum ir = `%r = shufflevector <8 x i32> %0, <8 x i32> %1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
3266             ret <8 x i32> %r`;
3267         return cast(__m256i)LDCInlineIR!(ir, int8, int8, int8)(cast(int8)a, cast(int8)b);
3268     }
3269     else static if (split)
3270     {
3271         __m128i a_lo = _mm256_extractf128_si256!0(a);
3272         __m128i a_hi = _mm256_extractf128_si256!1(a);
3273         __m128i b_lo = _mm256_extractf128_si256!0(b);
3274         __m128i b_hi = _mm256_extractf128_si256!1(b);
3275         __m128i r_lo = _mm_unpacklo_epi32(a_lo, b_lo);
3276         __m128i r_hi = _mm_unpacklo_epi32(a_hi, b_hi);
3277         return _mm256_set_m128i(r_hi, r_lo);
3278     }
3279     else
3280     {
3281         int8 R;
3282         int8 ai = cast(int8)a;
3283         int8 bi = cast(int8)b;
3284         R.ptr[0] = ai.array[0];
3285         R.ptr[1] = bi.array[0];
3286         R.ptr[2] = ai.array[1];
3287         R.ptr[3] = bi.array[1];
3288         R.ptr[4] = ai.array[4];
3289         R.ptr[5] = bi.array[4];
3290         R.ptr[6] = ai.array[5];
3291         R.ptr[7] = bi.array[5];
3292         return cast(__m256i) R;
3293     }
3294 }
3295 unittest
3296 {
3297     __m256i A = _mm256_setr_epi32(0, 1,  2,  3,  4,  5,  6,  7);
3298     __m256i B = _mm256_setr_epi32(8, 9, 10, 11, 12, 13, 14, 15);
3299     int8 C = cast(int8) _mm256_unpacklo_epi32(A, B);
3300     int[8] correct = [0, 8, 1, 9, 4, 12, 5, 13];
3301     assert(C.array == correct);
3302 }
3303 
3304 /// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in `a` and `b`.
3305 __m256i _mm256_unpacklo_epi64 (__m256i a, __m256i b) pure @trusted
3306 {
3307     version(GNU)
3308         enum split = true; // Benefits GDC in non-AVX2
3309     else
3310         enum split = false;
3311 
3312     static if (GDC_with_AVX2)
3313     {
3314         return __builtin_ia32_punpcklqdq256(a, b);
3315     }
3316     else static if (LDC_with_optimizations)
3317     {
3318         enum ir = `%r = shufflevector <4 x i64> %0, <4 x i64> %1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
3319             ret <4 x i64> %r`;
3320         return cast(__m256i)LDCInlineIR!(ir, long4, long4, long4)(a, b);
3321     }
3322     else static if (split)
3323     {
3324         __m128i a_lo = _mm256_extractf128_si256!0(a);
3325         __m128i a_hi = _mm256_extractf128_si256!1(a);
3326         __m128i b_lo = _mm256_extractf128_si256!0(b);
3327         __m128i b_hi = _mm256_extractf128_si256!1(b);
3328         __m128i r_lo = _mm_unpacklo_epi64(a_lo, b_lo);
3329         __m128i r_hi = _mm_unpacklo_epi64(a_hi, b_hi);
3330         return _mm256_set_m128i(r_hi, r_lo);
3331     }
3332     else
3333     {        
3334         long4 R;
3335         R.ptr[0] = a.array[0];
3336         R.ptr[1] = b.array[0];
3337         R.ptr[2] = a.array[2];
3338         R.ptr[3] = b.array[2];
3339         return R;
3340     }
3341 }
3342 unittest
3343 {
3344     __m256i A = _mm256_setr_epi64(0x22222222_22222222, 0x33333333_33333333, 2, 3);
3345     __m256i B = _mm256_setr_epi64(0x44444444_44444444, 0x55555555_55555555, 4, 5);
3346     long4 C = _mm256_unpacklo_epi64(A, B);
3347     long[4] correct = [0x22222222_22222222, 0x44444444_44444444, 2, 4];
3348     assert(C.array == correct);
3349 }
3350 
3351 /// Unpack and interleave 8-bit integers from the low half of each 128-bit lane in `a` and `b`. 
3352 __m256i _mm256_unpacklo_epi8 (__m256i a, __m256i b) pure @trusted
3353 {
3354     static if (GDC_with_AVX2)
3355     {
3356         return cast(__m256i) __builtin_ia32_punpcklbw256(cast(ubyte32)a, cast(ubyte32)b);
3357     }
3358     else static if (LDC_with_optimizations)
3359     {
3360         enum ir = `%r = shufflevector <32 x i8> %0, <32 x i8> %1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
3361             ret <32 x i8> %r`;
3362         return cast(__m256i)LDCInlineIR!(ir, byte32, byte32, byte32)(cast(byte32)a, cast(byte32)b);
3363     }
3364     else
3365     {
3366         // Splitting always beneficial
3367         __m128i a_lo = _mm256_extractf128_si256!0(a);
3368         __m128i a_hi = _mm256_extractf128_si256!1(a);
3369         __m128i b_lo = _mm256_extractf128_si256!0(b);
3370         __m128i b_hi = _mm256_extractf128_si256!1(b);
3371         __m128i r_lo = _mm_unpacklo_epi8(a_lo, b_lo);
3372         __m128i r_hi = _mm_unpacklo_epi8(a_hi, b_hi);
3373         return _mm256_set_m128i(r_hi, r_lo);
3374     }
3375 }
3376 unittest
3377 {
3378     __m256i A = _mm256_setr_epi8(  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
3379                                   16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
3380     __m256i B = _mm256_setr_epi8( 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
3381                                   48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
3382     byte32 C = cast(byte32) _mm256_unpacklo_epi8(A, B);
3383     byte[32] correct =          [  0, 32,  1, 33,  2, 34,  3, 35,  4, 36,  5, 37,  6, 38,  7, 39,
3384                                   16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55 ];
3385     assert(C.array == correct);
3386 }
3387 
3388 /// Compute the bitwise XOR of 256 bits (representing integer data) in `a` and `b`.
3389 __m256i _mm256_xor_si256 (__m256i a, __m256i b) pure @safe
3390 {
3391     return a ^ b;
3392 }
3393 unittest
3394 {
3395     __m256i A = _mm256_setr_epi64(975394,    619809709,    -1,    54);
3396     __m256i B = _mm256_setr_epi64(-920275025,       -6, 85873, 96644);
3397     long4 R = cast(long4) _mm256_xor_si256(A, B);
3398     long[4] correct = [975394 ^ (-920275025L), 619809709L ^ -6, (-1) ^ 85873, 54 ^ 96644];
3399     assert(R.array == correct);
3400 }
3401 
3402 
3403 /+
3404 
3405 pragma(LDC_intrinsic, "llvm.x86.avx2.gather.d.d")
3406 int4 __builtin_ia32_gatherd_d(int4, const void*, int4, int4, byte);
3407 
3408 pragma(LDC_intrinsic, "llvm.x86.avx2.gather.d.d.256")
3409 int8 __builtin_ia32_gatherd_d256(int8, const void*, int8, int8, byte);
3410 
3411 pragma(LDC_intrinsic, "llvm.x86.avx2.gather.d.pd")
3412 double2 __builtin_ia32_gatherd_pd(double2, const void*, int4, double2, byte);
3413 
3414 pragma(LDC_intrinsic, "llvm.x86.avx2.gather.d.pd.256")
3415 double4 __builtin_ia32_gatherd_pd256(double4, const void*, int4, double4, byte);
3416 
3417 pragma(LDC_intrinsic, "llvm.x86.avx2.gather.d.ps")
3418 float4 __builtin_ia32_gatherd_ps(float4, const void*, int4, float4, byte);
3419 
3420 pragma(LDC_intrinsic, "llvm.x86.avx2.gather.d.ps.256")
3421 float8 __builtin_ia32_gatherd_ps256(float8, const void*, int8, float8, byte);
3422 
3423 pragma(LDC_intrinsic, "llvm.x86.avx2.gather.d.q")
3424 long2 __builtin_ia32_gatherd_q(long2, const void*, int4, long2, byte);
3425 
3426 pragma(LDC_intrinsic, "llvm.x86.avx2.gather.d.q.256")
3427 long4 __builtin_ia32_gatherd_q256(long4, const void*, int4, long4, byte);
3428 
3429 pragma(LDC_intrinsic, "llvm.x86.avx2.gather.q.d")
3430 int4 __builtin_ia32_gatherq_d(int4, const void*, long2, int4, byte);
3431 
3432 pragma(LDC_intrinsic, "llvm.x86.avx2.gather.q.d.256")
3433 int4 __builtin_ia32_gatherq_d256(int4, const void*, long4, int4, byte);
3434 
3435 pragma(LDC_intrinsic, "llvm.x86.avx2.gather.q.pd")
3436 double2 __builtin_ia32_gatherq_pd(double2, const void*, long2, double2, byte);
3437 
3438 pragma(LDC_intrinsic, "llvm.x86.avx2.gather.q.pd.256")
3439 double4 __builtin_ia32_gatherq_pd256(double4, const void*, long4, double4, byte);
3440 
3441 pragma(LDC_intrinsic, "llvm.x86.avx2.gather.q.ps")
3442 float4 __builtin_ia32_gatherq_ps(float4, const void*, long2, float4, byte);
3443 
3444 pragma(LDC_intrinsic, "llvm.x86.avx2.gather.q.ps.256")
3445 float4 __builtin_ia32_gatherq_ps256(float4, const void*, long4, float4, byte);
3446 
3447 pragma(LDC_intrinsic, "llvm.x86.avx2.gather.q.q")
3448 long2 __builtin_ia32_gatherq_q(long2, const void*, long2, long2, byte);
3449 
3450 pragma(LDC_intrinsic, "llvm.x86.avx2.gather.q.q.256")
3451 long4 __builtin_ia32_gatherq_q256(long4, const void*, long4, long4, byte);
3452 
3453 pragma(LDC_intrinsic, "llvm.x86.avx2.maskload.d")
3454 int4 __builtin_ia32_maskloadd(const void*, int4);
3455 
3456 pragma(LDC_intrinsic, "llvm.x86.avx2.maskload.d.256")
3457 int8 __builtin_ia32_maskloadd256(const void*, int8);
3458 
3459 pragma(LDC_intrinsic, "llvm.x86.avx2.maskload.q")
3460 long2 __builtin_ia32_maskloadq(const void*, long2);
3461 
3462 pragma(LDC_intrinsic, "llvm.x86.avx2.maskload.q.256")
3463 long4 __builtin_ia32_maskloadq256(const void*, long4);
3464 
3465 pragma(LDC_intrinsic, "llvm.x86.avx2.maskstore.d")
3466 void __builtin_ia32_maskstored(void*, int4, int4);
3467 
3468 pragma(LDC_intrinsic, "llvm.x86.avx2.maskstore.d.256")
3469 void __builtin_ia32_maskstored256(void*, int8, int8);
3470 
3471 pragma(LDC_intrinsic, "llvm.x86.avx2.maskstore.q")
3472 void __builtin_ia32_maskstoreq(void*, long2, long2);
3473 
3474 pragma(LDC_intrinsic, "llvm.x86.avx2.maskstore.q.256")
3475 void __builtin_ia32_maskstoreq256(void*, long4, long4);
3476 
3477 pragma(LDC_intrinsic, "llvm.x86.avx2.mpsadbw")
3478 short16 __builtin_ia32_mpsadbw256(byte32, byte32, byte) pure @safe;
3479 
3481 pragma(LDC_intrinsic, "llvm.x86.avx2.pblendvb")
3482 byte32 __builtin_ia32_pblendvb256(byte32, byte32, byte32) pure @safe;
3483 
3484 pragma(LDC_intrinsic, "llvm.x86.avx2.permd")
3485 int8 __builtin_ia32_permvarsi256(int8, int8) pure @safe;
3486 
3487 pragma(LDC_intrinsic, "llvm.x86.avx2.permps")
3488 float8 __builtin_ia32_permvarsf256(float8, int8) pure @safe;
3489 
3490 pragma(LDC_intrinsic, "llvm.x86.avx2.phadd.d")
3491 int8 __builtin_ia32_phaddd256(int8, int8) pure @safe;
3492 
3493 pragma(LDC_intrinsic, "llvm.x86.avx2.phadd.sw")
3494 short16 __builtin_ia32_phaddsw256(short16, short16) pure @safe;
3495 
3496 pragma(LDC_intrinsic, "llvm.x86.avx2.phadd.w")
3497 short16 __builtin_ia32_phaddw256(short16, short16) pure @safe;
3498 
3499 pragma(LDC_intrinsic, "llvm.x86.avx2.phsub.d")
3500 int8 __builtin_ia32_phsubd256(int8, int8) pure @safe;
3501 
3502 pragma(LDC_intrinsic, "llvm.x86.avx2.phsub.sw")
3503 short16 __builtin_ia32_phsubsw256(short16, short16) pure @safe;
3504 
3505 pragma(LDC_intrinsic, "llvm.x86.avx2.phsub.w")
3506 short16 __builtin_ia32_phsubw256(short16, short16) pure @safe;
3507 
3508 pragma(LDC_intrinsic, "llvm.x86.avx2.pmadd.ub.sw")
3509 short16 __builtin_ia32_pmaddubsw256(byte32, byte32) pure @safe;
3510 
3511 pragma(LDC_intrinsic, "llvm.x86.avx2.pmadd.wd")
3512 int8 __builtin_ia32_pmaddwd256(short16, short16) pure @safe;
3513 
3514 pragma(LDC_intrinsic, "llvm.x86.avx2.pmovmskb")
3515 int __builtin_ia32_pmovmskb256(byte32) pure @safe;
3516 
3517 pragma(LDC_intrinsic, "llvm.x86.avx2.pmul.hr.sw")
3518 short16 __builtin_ia32_pmulhrsw256(short16, short16) pure @safe;
3519 
3520 pragma(LDC_intrinsic, "llvm.x86.avx2.pmulh.w")
3521 short16 __builtin_ia32_pmulhw256(short16, short16) pure @safe;
3522 
3523 pragma(LDC_intrinsic, "llvm.x86.avx2.pmulhu.w")
3524 short16 __builtin_ia32_pmulhuw256(short16, short16) pure @safe;
3525 
3526 pragma(LDC_intrinsic, "llvm.x86.avx2.psad.bw")
3527 long4 __builtin_ia32_psadbw256(byte32, byte32) pure @safe;
3528 
3529 pragma(LDC_intrinsic, "llvm.x86.avx2.pshuf.b")
3530 byte32 __builtin_ia32_pshufb256(byte32, byte32) pure @safe;
3531 
3532 pragma(LDC_intrinsic, "llvm.x86.avx2.psign.b")
3533 byte32 __builtin_ia32_psignb256(byte32, byte32) pure @safe;
3534 
3535 pragma(LDC_intrinsic, "llvm.x86.avx2.psign.d")
3536 int8 __builtin_ia32_psignd256(int8, int8) pure @safe;
3537 
3538 pragma(LDC_intrinsic, "llvm.x86.avx2.psign.w")
3539 short16 __builtin_ia32_psignw256(short16, short16) pure @safe;
3540 
3541 pragma(LDC_intrinsic, "llvm.x86.avx2.psll.d")
3542 int8 __builtin_ia32_pslld256(int8, int4) pure @safe;
3543 
3544 pragma(LDC_intrinsic, "llvm.x86.avx2.psll.q")
3545 long4 __builtin_ia32_psllq256(long4, long2) pure @safe;
3546 
3547 pragma(LDC_intrinsic, "llvm.x86.avx2.psll.w")
3548 short16 __builtin_ia32_psllw256(short16, short8) pure @safe;
3549 
3550 pragma(LDC_intrinsic, "llvm.x86.avx2.pslli.d")
3551 int8 __builtin_ia32_pslldi256(int8, int) pure @safe;
3552 
3553 pragma(LDC_intrinsic, "llvm.x86.avx2.pslli.q")
3554 long4 __builtin_ia32_psllqi256(long4, int) pure @safe;
3555 
3556 pragma(LDC_intrinsic, "llvm.x86.avx2.pslli.w")
3557 short16 __builtin_ia32_psllwi256(short16, int) pure @safe;
3558 
3559 pragma(LDC_intrinsic, "llvm.x86.avx2.psllv.d")
3560 int4 __builtin_ia32_psllv4si(int4, int4) pure @safe;
3561 
3562 pragma(LDC_intrinsic, "llvm.x86.avx2.psllv.d.256")
3563 int8 __builtin_ia32_psllv8si(int8, int8) pure @safe;
3564 
3565 pragma(LDC_intrinsic, "llvm.x86.avx2.psllv.q")
3566 long2 __builtin_ia32_psllv2di(long2, long2) pure @safe;
3567 
3568 pragma(LDC_intrinsic, "llvm.x86.avx2.psllv.q.256")
3569 long4 __builtin_ia32_psllv4di(long4, long4) pure @safe;
3570 
3571 pragma(LDC_intrinsic, "llvm.x86.avx2.psra.d")
3572 int8 __builtin_ia32_psrad256(int8, int4) pure @safe;
3573 
3574 pragma(LDC_intrinsic, "llvm.x86.avx2.psra.w")
3575 short16 __builtin_ia32_psraw256(short16, short8) pure @safe;
3576 
3577 pragma(LDC_intrinsic, "llvm.x86.avx2.psrai.d")
3578 int8 __builtin_ia32_psradi256(int8, int) pure @safe;
3579 
3580 pragma(LDC_intrinsic, "llvm.x86.avx2.psrai.w")
3581 short16 __builtin_ia32_psrawi256(short16, int) pure @safe;
3582 
3583 pragma(LDC_intrinsic, "llvm.x86.avx2.psrav.d")
3584 int4 __builtin_ia32_psrav4si(int4, int4) pure @safe;
3585 
3586 pragma(LDC_intrinsic, "llvm.x86.avx2.psrav.d.256")
3587 int8 __builtin_ia32_psrav8si(int8, int8) pure @safe;
3588 
3589 pragma(LDC_intrinsic, "llvm.x86.avx2.psrl.d")
3590 int8 __builtin_ia32_psrld256(int8, int4) pure @safe;
3591 
3592 pragma(LDC_intrinsic, "llvm.x86.avx2.psrl.q")
3593 long4 __builtin_ia32_psrlq256(long4, long2) pure @safe;
3594 
3595 pragma(LDC_intrinsic, "llvm.x86.avx2.psrl.w")
3596 short16 __builtin_ia32_psrlw256(short16, short8) pure @safe;
3597 
3598 pragma(LDC_intrinsic, "llvm.x86.avx2.psrli.d")
3599 int8 __builtin_ia32_psrldi256(int8, int) pure @safe;
3600 
3601 pragma(LDC_intrinsic, "llvm.x86.avx2.psrli.q")
3602 long4 __builtin_ia32_psrlqi256(long4, int) pure @safe;
3603 
3604 pragma(LDC_intrinsic, "llvm.x86.avx2.psrli.w")
3605 short16 __builtin_ia32_psrlwi256(short16, int) pure @safe;
3606 
3607 pragma(LDC_intrinsic, "llvm.x86.avx2.psrlv.d")
3608 int4 __builtin_ia32_psrlv4si(int4, int4) pure @safe;
3609 
3610 pragma(LDC_intrinsic, "llvm.x86.avx2.psrlv.d.256")
3611 int8 __builtin_ia32_psrlv8si(int8, int8) pure @safe;
3612 
3613 pragma(LDC_intrinsic, "llvm.x86.avx2.psrlv.q")
3614 long2 __builtin_ia32_psrlv2di(long2, long2) pure @safe;
3615 
3616 pragma(LDC_intrinsic, "llvm.x86.avx2.psrlv.q.256")
3617 long4 __builtin_ia32_psrlv4di(long4, long4) pure @safe;
3618 
3619 +/
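
// For illustration of how the parked declarations above would be used once
// enabled: a hedged sketch of a variable-shift wrapper in the style of the
// rest of this module, built on __builtin_ia32_psllv8si (VPSLLVD). The name,
// guard, and scalar fallback are assumptions, not the library's final API;
// per VPSLLVD semantics, counts above 31 zero the lane.
version(none)
{
    __m256i _mm256_sllv_epi32_sketch (__m256i a, __m256i count) pure @trusted
    {
        static if (LDC_with_AVX2)
        {
            return cast(__m256i) __builtin_ia32_psllv8si(cast(int8)a, cast(int8)count);
        }
        else
        {
            int8 r;
            int8 sa = cast(int8)a;
            int8 sc = cast(int8)count;
            foreach(i; 0..8)
            {
                uint n = cast(uint) sc.array[i];
                r.ptr[i] = (n > 31) ? 0 : (sa.array[i] << n);
            }
            return cast(__m256i)r;
        }
    }
}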