
1 /**
2 * AVX intrinsics.
3 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=AVX
4 *
5 * Copyright: Guillaume Piolat 2022.
6 *            Johan Engelen 2022.
7 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
8 */
9 module inteli.avxintrin;
10 
11 // AVX instructions
12 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=AVX
13 // Note: this header will work whether you have AVX enabled or not.
14 // With LDC, use "dflags-ldc": ["-mattr=+avx"] or equivalent to actively
15 // generate AVX instructions.
16 // With GDC, use "dflags-gdc": ["-mavx"] or equivalent to actively
17 // generate AVX instructions.
18 
19 
20 /// IMPORTANT NOTE ABOUT MASK LOAD/STORE:
21 ///
22 /// In theory, masked load/store can address unaddressable memory provided the mask is zero.
23 /// In practice, that is not the case for the following reasons:
24 /// 
25 /// - AMD manual says:
26 ///   "Exception and trap behavior for elements not selected for loading or storing from/to memory
27 ///   is implementation dependent. For instance, a given implementation may signal a data 
28 ///   breakpoint or a page fault for doublewords that are zero-masked and not actually written."
29 ///
30 /// - Intel fetches the whole cacheline anyway:
31 ///   https://erik.science/2019/06/21/AVX-fun.html
32 ///   "Even if the mask is stored in the special mask registers, it will still first fetch the data
33 ///    before checking the mask."
34 ///
35 /// So intel-intrinsics adopted the tightened semantics of only addressing fully addressable memory
36 /// with masked loads and stores.
37 
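// The unittest below is an added sketch of what the tightened semantics mean for callers.
// It assumes the _mm256_maskload_pd intrinsic defined later in this module: even lanes whose
// mask bit is clear must point at valid, addressable memory.
unittest
{
    align(32) double[4] buf = [1.0, 2.0, 3.0, 4.0];
    __m256i mask = _mm256_setr_epi64x(-1, -1, 0, 0); // sign bit set = load, clear = zero
    __m256d r = _mm256_maskload_pd(buf.ptr, mask);
    assert(r.array[0] == 1.0 && r.array[1] == 2.0);
    assert(r.array[2] == 0.0 && r.array[3] == 0.0);
    // With intel-intrinsics, all 32 bytes at buf.ptr must be addressable,
    // even for the zero-masked lanes.
}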
38 
39 /// Some AVX intrinsics take a float comparison constant.
40 /// When labelled "ordered", it means "AND ordered".
41 /// When labelled "unordered", it means "OR unordered". See the example after the enum below.
42 alias _CMP_EQ = int;
43 ///ditto
44 enum : _CMP_EQ
45 {
46     _CMP_EQ_OQ    = 0x00, // Equal (ordered, non-signaling)
47     _CMP_LT_OS    = 0x01, // Less-than (ordered, signaling)
48     _CMP_LE_OS    = 0x02, // Less-than-or-equal (ordered, signaling)
49     _CMP_UNORD_Q  = 0x03, // Unordered (non-signaling)
50     _CMP_NEQ_UQ   = 0x04, // Not-equal (unordered, non-signaling)
51     _CMP_NLT_US   = 0x05, // Not-less-than (unordered, signaling)
52     _CMP_NLE_US   = 0x06, // Not-less-than-or-equal (unordered, signaling)
53     _CMP_ORD_Q    = 0x07, // Ordered (non-signaling)
54     _CMP_EQ_UQ    = 0x08, // Equal (unordered, non-signaling)
55     _CMP_NGE_US   = 0x09, // Not-greater-than-or-equal (unordered, signaling)
56     _CMP_NGT_US   = 0x0a, // Not-greater-than (unordered, signaling)
57     _CMP_FALSE_OQ = 0x0b, // False (ordered, non-signaling)
58     _CMP_NEQ_OQ   = 0x0c, // Not-equal (ordered, non-signaling)
59     _CMP_GE_OS    = 0x0d, // Greater-than-or-equal (ordered, signaling)
60     _CMP_GT_OS    = 0x0e, // Greater-than (ordered, signaling)
61     _CMP_TRUE_UQ  = 0x0f, // True (unordered, non-signaling)
62     _CMP_EQ_OS    = 0x10, // Equal (ordered, signaling)
63     _CMP_LT_OQ    = 0x11, // Less-than (ordered, non-signaling)
64     _CMP_LE_OQ    = 0x12, // Less-than-or-equal (ordered, non-signaling)
65     _CMP_UNORD_S  = 0x13, // Unordered (signaling)
66     _CMP_NEQ_US   = 0x14, // Not-equal (unordered, signaling)
67     _CMP_NLT_UQ   = 0x15, // Not-less-than (unordered, non-signaling)
68     _CMP_NLE_UQ   = 0x16, // Not-less-than-or-equal (unordered, non-signaling)
69     _CMP_ORD_S    = 0x17, // Ordered (signaling)
70     _CMP_EQ_US    = 0x18, // Equal (unordered, signaling)
71     _CMP_NGE_UQ   = 0x19, // Not-greater-than-or-equal (unordered, non-signaling)
72     _CMP_NGT_UQ   = 0x1a, // Not-greater-than (unordered, non-signaling)
73     _CMP_FALSE_OS = 0x1b, // False (ordered, signaling)
74     _CMP_NEQ_OS   = 0x1c, // Not-equal (ordered, signaling)
75     _CMP_GE_OQ    = 0x1d, // Greater-than-or-equal (ordered, non-signaling)
76     _CMP_GT_OQ    = 0x1e, // Greater-than (ordered, non-signaling)
77     _CMP_TRUE_US  = 0x1f  // True (unordered, signaling)
78 }
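// Added example of the ordered/unordered distinction described above, using _mm_cmp_pd
// (defined further down in this module): with a NaN operand, the "AND ordered" predicate
// yields false while the "OR unordered" one yields true.
unittest
{
    __m128d A = _mm_setr_pd(double.nan, 1.0);
    __m128d B = _mm_setr_pd(0.0,        1.0);
    long2 ordered   = cast(long2) _mm_cmp_pd!_CMP_EQ_OQ(A, B);
    long2 unordered = cast(long2) _mm_cmp_pd!_CMP_EQ_UQ(A, B);
    assert(ordered.array[0]   ==  0); // NaN == 0.0 is false when "AND ordered"
    assert(unordered.array[0] == -1); // but true when "OR unordered"
    assert(ordered.array[1]   == -1); // 1.0 == 1.0 either way
    assert(unordered.array[1] == -1);
}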
79 
80 public import inteli.types;
81 import inteli.internals;
82 
83 // Pull in all previous instruction set intrinsics.
84 public import inteli.smmintrin;
85 public import inteli.tmmintrin;
86 public import inteli.nmmintrin;
87 
88 
89 
90 // On 32-bit x86, earlier LDC versions may have trouble preserving the stack pointer when an
91 // unsupported 256-bit vector type is passed and AVX is disabled.
92 // Some intrinsics are therefore disabled in this particular situation, since they are not
93 // safe for the caller.
94 version(LDC)
95 {
96     version(X86)
97     {
98         enum llvm256BitStackWorkaroundIn32BitX86 = __VERSION__ < 2099;
99     }
100     else 
101         enum llvm256BitStackWorkaroundIn32BitX86 = false;
102 }
103 else
104     enum llvm256BitStackWorkaroundIn32BitX86 = false;
105 
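// Sketch (added, with a hypothetical intrinsic name) of how this flag is meant to be used,
// so that 32-bit x86 callers never hit the unsafe 256-bit ABI:
//
//     static if (!llvm256BitStackWorkaroundIn32BitX86)
//     {
//         __m256d _mm256_some_256bit_intrinsic (__m256d a) { ... }
//     }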
106 
107 
108 
109 nothrow @nogc:
110 
111 /// Add packed double-precision (64-bit) floating-point elements in `a` and `b`.
112 __m256d _mm256_add_pd (__m256d a, __m256d b) pure @trusted
113 {
114     return a + b;
115 }
116 unittest
117 {
118     align(32) double[4] A = [-1, 2, -3, 40000];
119     align(32) double[4] B = [ 9, -7, 8, -0.5];
120     __m256d R = _mm256_add_pd(_mm256_load_pd(A.ptr), _mm256_load_pd(B.ptr));
121     double[4] correct = [8, -5, 5, 39999.5];
122     assert(R.array == correct);
123 }
124 
125 /// Add packed single-precision (32-bit) floating-point elements in `a` and `b`.
126 __m256 _mm256_add_ps (__m256 a, __m256 b) pure @trusted
127 {
128     return a + b;
129 }
130 unittest
131 {
132     align(32) float[8] A = [-1.0f, 2, -3, 40000, 0, 3, 5, 6];
133     align(32) float[8] B = [ 9.0f, -7, 8,  -0.5, 8, 7, 3, -1];
134     __m256 R = _mm256_add_ps(_mm256_load_ps(A.ptr), _mm256_load_ps(B.ptr));
135     float[8] correct     = [8, -5, 5, 39999.5, 8, 10, 8, 5];
136     assert(R.array == correct);
137 }
138 
139 /// Alternatively add and subtract packed double-precision (64-bit) floating-point
140 ///  elements in `a` to/from packed elements in `b`.
141 __m256d _mm256_addsub_pd (__m256d a, __m256d b) pure @trusted
142 {
143     // PERF DMD
144     static if (GDC_or_LDC_with_AVX)
145     {
146         return __builtin_ia32_addsubpd256(a, b);
147     }
148     else
149     {
150         //// Note: GDC x86 generates addsubpd since GDC 11.1 with -O3
151         ////       LDC x86 generates addsubpd since LDC 1.18 with -O2
152         //// LDC ARM: not fantastic, ok since LDC 1.18 -O2
153         a.ptr[0] = a.array[0] + (-b.array[0]);
154         a.ptr[1] = a.array[1] + b.array[1];
155         a.ptr[2] = a.array[2] + (-b.array[2]);
156         a.ptr[3] = a.array[3] + b.array[3];
157         return a;
158     }
159 }
160 unittest
161 {
162     align(32) double[4] A = [-1, 2, -3, 40000];
163     align(32) double[4] B = [ 9, -7, 8, -0.5];
164     __m256d R = _mm256_addsub_pd(_mm256_load_pd(A.ptr), _mm256_load_pd(B.ptr));
165     double[4] correct = [-10, -5, -11, 39999.5];
166     assert(R.array == correct);
167 }
168 
169 /// Alternatively add and subtract packed single-precision (32-bit) floating-point elements 
170 /// in `a` to/from packed elements in `b`.
171 __m256 _mm256_addsub_ps (__m256 a, __m256 b) pure @trusted
172 {
173     // PERF DMD
174     static if (GDC_or_LDC_with_AVX)
175     {
176         return __builtin_ia32_addsubps256(a, b);
177     }
178     else
179     {
180         // Note: GDC x86 generates addsubps since GDC 11 -O3
181         //               and in absence of AVX, a pair of SSE3 addsubps since GDC 12 -O2
182         //       LDC x86 generates addsubps since LDC 1.18 -O2
183         //               and in absence of AVX, a pair of SSE3 addsubps since LDC 1.1 -O1
184         // LDC ARM: neat output since LDC 1.21 -O2
185    
186         a.ptr[0] = a.array[0] + (-b.array[0]);
187         a.ptr[1] = a.array[1] + b.array[1];
188         a.ptr[2] = a.array[2] + (-b.array[2]);
189         a.ptr[3] = a.array[3] + b.array[3];
190         a.ptr[4] = a.array[4] + (-b.array[4]);
191         a.ptr[5] = a.array[5] + b.array[5];
192         a.ptr[6] = a.array[6] + (-b.array[6]);
193         a.ptr[7] = a.array[7] + b.array[7];
194         return a;
195     }
196 }
197 unittest
198 {
199     align(32) float[8] A = [-1.0f,  2,  -3, 40000,    0, 3,  5,  6];
200     align(32) float[8] B = [ 9.0f, -7,   8,  -0.5,    8, 7,  3, -1];
201     __m256 R = _mm256_addsub_ps(_mm256_load_ps(A.ptr), _mm256_load_ps(B.ptr));
202     float[8] correct     = [  -10, -5, -11, 39999.5, -8, 10, 2,  5];
203     assert(R.array == correct);
204 }
205 
206 /// Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in `a` and `b`.
207 __m256d _mm256_and_pd (__m256d a, __m256d b) pure @trusted
208 {
209     // Note: GCC avxintrin.h uses the builtins for AND, ANDNOT, and OR of _ps and _pd,
210     //       but those do not seem needed at any optimization level.
211     return cast(__m256d)(cast(__m256i)a & cast(__m256i)b);
212 }
213 unittest
214 {
215     double a = 4.32;
216     double b = -78.99;
217     long correct = (*cast(long*)(&a)) & (*cast(long*)(&b));
218     __m256d A = _mm256_set_pd(a, b, a, b);
219     __m256d B = _mm256_set_pd(b, a, b, a);
220     long4 R = cast(long4)( _mm256_and_pd(A, B) );
221     assert(R.array[0] == correct);
222     assert(R.array[1] == correct);
223     assert(R.array[2] == correct);
224     assert(R.array[3] == correct);
225 }
226 
227 /// Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in `a` and `b`.
228 __m256 _mm256_and_ps (__m256 a, __m256 b) pure @trusted
229 {
230     return cast(__m256)(cast(__m256i)a & cast(__m256i)b);
231 }
232 unittest
233 {
234     float a = 4.32f;
235     float b = -78.99f;
236     int correct = (*cast(int*)(&a)) & (*cast(int*)(&b));
237     __m256 A = _mm256_set_ps(a, b, a, b, a, b, a, b);
238     __m256 B = _mm256_set_ps(b, a, b, a, b, a, b, a);
239     int8 R = cast(int8)( _mm256_and_ps(A, B) );
240     foreach(i; 0..8)
241         assert(R.array[i] == correct);
242 }
243 
244 /// Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in `a`
245 /// and then AND with b.
246 __m256d _mm256_andnot_pd (__m256d a, __m256d b) pure @trusted
247 {
248     // PERF DMD
249     __m256i notA = _mm256_not_si256(cast(__m256i)a);
250     __m256i ib = cast(__m256i)b;
251     __m256i ab = notA & ib;
252     return cast(__m256d)ab;
253 }
254 unittest
255 {
256     double a = 4.32;
257     double b = -78.99;
258     long notA = ~ ( *cast(long*)(&a) );
259     long correct = notA & (*cast(long*)(&b));
260     __m256d A = _mm256_set_pd(a, a, a, a);
261     __m256d B = _mm256_set_pd(b, b, b, b);
262     long4 R = cast(long4)( _mm256_andnot_pd(A, B) );
263     foreach(i; 0..4)
264         assert(R.array[i] == correct);
265 }
266 
267 /// Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in `a`
268 /// and then AND with b.
269 __m256 _mm256_andnot_ps (__m256 a, __m256 b) pure @trusted
270 {
271     // PERF DMD
272     __m256i notA = _mm256_not_si256(cast(__m256i)a);
273     __m256i ib = cast(__m256i)b;
274     __m256i ab = notA & ib;
275     return cast(__m256)ab;
276 }
277 unittest
278 {
279     float a = 4.32f;
280     float b = -78.99f;
281     int notA = ~ ( *cast(int*)(&a) );
282     int correct = notA & (*cast(int*)(&b));
283     __m256 A = _mm256_set1_ps(a);
284     __m256 B = _mm256_set1_ps(b);
285     int8 R = cast(int8)( _mm256_andnot_ps(A, B) );
286     foreach(i; 0..8)
287         assert(R.array[i] == correct);
288 }
289 
290 /// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using control 
291 /// mask `imm8`.
292 __m256d _mm256_blend_pd(int imm8)(__m256d a, __m256d b)
293 {
294     static assert(imm8 >= 0 && imm8 < 16);
295 
296     // PERF DMD
297     static if (GDC_with_AVX)
298     {
299         return __builtin_ia32_blendpd256 (a, b, imm8);
300     }
301     else
302     {
303         // Works great with LDC.
304         double4 r;
305         for (int n = 0; n < 4; ++n)
306         {
307             r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n];
308         }
309         return r;
310     }
311 }
312 unittest
313 {
314     __m256d A = _mm256_setr_pd(0, 1, 2, 3);
315     __m256d B = _mm256_setr_pd(8, 9, 10, 11);
316     double4 C = _mm256_blend_pd!0x06(A, B);
317     double[4] correct =    [0, 9, 10, 3];
318     assert(C.array == correct);
319 }
320 
321 /// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using control 
322 /// mask `imm8`.
323 __m256 _mm256_blend_ps(int imm8)(__m256 a, __m256 b) pure @trusted
324 {
325     static assert(imm8 >= 0 && imm8 < 256);
326     // PERF DMD
327     static if (GDC_with_AVX)
328     {
329         return __builtin_ia32_blendps256 (a, b, imm8);
330     }
331     else version(LDC)
332     {
333         // LDC x86: generates a vblendps since LDC 1.1 -O0
334         //   arm64: pretty good, four instructions worst case
335         return shufflevectorLDC!(float8, (imm8 & 1) ? 8 : 0,
336                                  (imm8 & 2) ? 9 : 1,
337                                  (imm8 & 4) ? 10 : 2,
338                                  (imm8 & 8) ? 11 : 3,
339                                  (imm8 & 16) ? 12 : 4,
340                                  (imm8 & 32) ? 13 : 5,
341                                  (imm8 & 64) ? 14 : 6,
342                                  (imm8 & 128) ? 15 : 7)(a, b);
343     }
344     else
345     {
346         // LDC x86: vblendps generated since LDC 1.27 -O1
347         float8 r;
348         for (int n = 0; n < 8; ++n)
349         {
350             r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n];
351         }
352         return r;
353     }
354 }
355 unittest
356 {
357     __m256 A = _mm256_setr_ps(0, 1,  2,  3,  4,  5,  6,  7);
358     __m256 B = _mm256_setr_ps(8, 9, 10, 11, 12, 13, 14, 15);
359     float8 C = _mm256_blend_ps!0xe7(A, B);
360     float[8] correct =       [8, 9, 10,  3,  4, 13, 14, 15];
361     assert(C.array == correct);
362 }
363 
364 /// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using mask.
365 __m256d _mm256_blendv_pd (__m256d a, __m256d b, __m256d mask) @trusted
366 {
367     // PERF DMD
368     static if (GDC_with_AVX)
369     {
370         // Amazingly enough, GCC/GDC generates the vblendvpd instruction
371         // with -mavx2 but not -mavx, and emits a replacement sequence instead.
372         // Not sure what the reason is.
373         // Sounds like a bug, similar to _mm_blendv_pd,
374         // or maybe the instruction is unsafe?
375         return __builtin_ia32_blendvpd256(a, b, mask);
376     }
377     else static if (LDC_with_AVX)
378     {
379         return __builtin_ia32_blendvpd256(a, b, mask);
380     }
381     else
382     {
383         // LDC x86: vblendvpd since LDC 1.27 -O2
384         //     arm64: only 4 instructions, since LDC 1.27 -O2
385         __m256d r;
386         long4 lmask = cast(long4)mask;
387         for (int n = 0; n < 4; ++n)
388         {
389             r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n];
390         }
391         return r;
392     }
393 }
394 unittest
395 {
396     __m256d A = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
397     __m256d B = _mm256_setr_pd(5.0, 6.0, 7.0, 8.0);
398     __m256d M = _mm256_setr_pd(-3.0, 2.0, 1.0, -4.0);
399     __m256d R = _mm256_blendv_pd(A, B, M);
400     double[4] correct1 = [5.0, 2.0, 3.0, 8.0];
401     assert(R.array == correct1);
402 }
403 
404 /// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` 
405 /// using `mask`.
406 __m256 _mm256_blendv_ps (__m256 a, __m256 b, __m256 mask) @trusted
407 {
408     // PERF DMD
409     static if (GDC_or_LDC_with_AVX)
410     {
411         return __builtin_ia32_blendvps256(a, b, mask);
412     }
413     else static if (LDC_with_ARM64)
414     {
415         int8 shift;
416         shift = 31;
417         int8 lmask = cast(int8)mask >> shift;     
418         int8 ia = cast(int8)a;   
419         int8 ib = cast(int8)b;
420         return cast(__m256)(ia ^ ((ia ^ ib) & lmask));
421     }
422     else
423     {
424         // In both LDC and GDC with SSE4.1, this generates blendvps as fallback
425         __m256 r;
426         int8 lmask = cast(int8)mask;
427         for (int n = 0; n < 8; ++n)
428         {
429             r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n];
430         }
431         return r;
432     }
433 }
434 unittest
435 {
436     __m256 A = _mm256_setr_ps(1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
437     __m256 B = _mm256_setr_ps(5.0f, 6.0f, 7.0f, 8.0f, 5.0f, 6.0f, 7.0f, 8.0f);
438     __m256 M = _mm256_setr_ps(-3.0f, 2.0f, 1.0f, -4.0f, -3.0f, 2.0f, 1.0f, -4.0f);
439     __m256 R = _mm256_blendv_ps(A, B, M);
440     float[8] correct1 = [5.0f, 2.0f, 3.0f, 8.0f, 5.0f, 2.0f, 3.0f, 8.0f];
441     assert(R.array == correct1);
442 }
443 
444 /// Broadcast 128 bits from memory (composed of 2 packed double-precision (64-bit)
445 /// floating-point elements) to all elements.
446 /// This effectively duplicates the 128-bit vector.
447 __m256d _mm256_broadcast_pd (const(__m128d)* mem_addr) pure @trusted
448 {
449     // PERF DMD
450     static if (GDC_with_AVX)
451     {
452         return __builtin_ia32_vbroadcastf128_pd256(cast(float4*)mem_addr);
453     }
454     else
455     {
456         const(double)* p = cast(const(double)*) mem_addr;
457         __m256d r;
458         r.ptr[0] = p[0];
459         r.ptr[1] = p[1];
460         r.ptr[2] = p[0];
461         r.ptr[3] = p[1];
462         return r;
463     }
464 }
465 unittest
466 {
467     __m128d A = _mm_setr_pd(3, -4);
468     __m256d B = _mm256_broadcast_pd(&A);
469     double[4] correct = [3, -4, 3, -4];
470     assert(B.array == correct);
471 }
472 
473 /// Broadcast 128 bits from memory (composed of 4 packed single-precision (32-bit) 
474 /// floating-point elements) to all elements.
475 /// This effectively duplicates the 128-bit vector.
476 __m256 _mm256_broadcast_ps (const(__m128)* mem_addr) pure @trusted
477 {
478     // PERF DMD
479     static if (GDC_with_AVX)
480     {
481         return __builtin_ia32_vbroadcastf128_ps256(cast(float4*)mem_addr);
482     }   
483     else
484     {
485         const(float)* p = cast(const(float)*)mem_addr;
486         __m256 r;
487         r.ptr[0] = p[0];
488         r.ptr[1] = p[1];
489         r.ptr[2] = p[2];
490         r.ptr[3] = p[3];
491         r.ptr[4] = p[0];
492         r.ptr[5] = p[1];
493         r.ptr[6] = p[2];
494         r.ptr[7] = p[3];
495         return r;
496     }
497 }
498 unittest
499 {
500     __m128 A = _mm_setr_ps(1, 2, 3, -4);
501     __m256 B = _mm256_broadcast_ps(&A);
502     float[8] correct = [1.0f, 2, 3, -4, 1, 2, 3, -4];
503     assert(B.array == correct);
504 }
505 
506 /// Broadcast a double-precision (64-bit) floating-point element from memory to all elements.
507 __m256d _mm256_broadcast_sd (const(double)* mem_addr) pure @trusted
508 {
509     static if (GDC_with_AVX)
510     {
511         return __builtin_ia32_vbroadcastsd256(mem_addr);
512     }
513     else
514     {
515         double a = *mem_addr;
516         __m256d r;
517         r.ptr[0] = a;
518         r.ptr[1] = a;
519         r.ptr[2] = a;
520         r.ptr[3] = a;
521         return r;
522     }
523 }
524 unittest
525 {
526     double t = 7.5f;
527     __m256d A = _mm256_broadcast_sd(&t);
528     double[4] correct = [7.5, 7.5, 7.5, 7.5];
529     assert(A.array == correct);
530 }
531 
532 /// Broadcast a single-precision (32-bit) floating-point element from memory to all elements.
533 __m128 _mm_broadcast_ss (const(float)* mem_addr) pure @trusted
534 {
535     // PERF DMD
536     static if (GDC_with_AVX)
537     {
538         return __builtin_ia32_vbroadcastss(mem_addr);
539     }
540     else
541     {
542         float a = *mem_addr;
543         __m128 r;
544         r.ptr[0] = a;
545         r.ptr[1] = a;
546         r.ptr[2] = a;
547         r.ptr[3] = a;
548         return r;
549     }
550 }
551 unittest
552 {
553     float t = 7.5f;
554     __m128 A = _mm_broadcast_ss(&t);
555     float[4] correct = [7.5f, 7.5f, 7.5f, 7.5f];
556     assert(A.array == correct);
557 }
558 
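/// Broadcast a single-precision (32-bit) floating-point element from memory to all elements.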
559 __m256 _mm256_broadcast_ss (const(float)* mem_addr)
560 {
561     // PERF DMD
562     static if (GDC_with_AVX)
563     {
564         return __builtin_ia32_vbroadcastss256 (mem_addr);
565     }
566     else
567     {
568         float a = *mem_addr;
569         __m256 r = __m256(a);
570         return r;
571     }
572 }
573 unittest
574 {
575     float t = 7.5f;
576     __m256 A = _mm256_broadcast_ss(&t);
577     float[8] correct = [7.5f, 7.5f, 7.5f, 7.5f, 7.5f, 7.5f, 7.5f, 7.5f];
578     assert(A.array == correct);
579 }
580 
581 /// Cast vector of type `__m256d` to type `__m256`.
582 __m256 _mm256_castpd_ps (__m256d a) pure @safe
583 {
584     return cast(__m256)a;
585 }
586 
587 /// Cast vector of type `__m256d` to type `__m256i`.
588 __m256i _mm256_castpd_si256 (__m256d a) pure @safe
589 {
590     return cast(__m256i)a;
591 }
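// Added check (not in the original): these casts reinterpret bits without converting values.
unittest
{
    __m256d A = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
    __m256d back = _mm256_castps_pd(_mm256_castpd_ps(A));
    double[4] correct = [1.0, 2.0, 3.0, 4.0];
    assert(back.array == correct);
}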
592 
593 /// Cast vector of type `__m128d` to type `__m256d`; the upper 128 bits of the result are undefined.
594 __m256d _mm256_castpd128_pd256 (__m128d a) pure @trusted
595 {
596     static if (GDC_with_AVX)
597     {
598         return __builtin_ia32_pd256_pd(a);
599     }
600     else
601     {
602         __m256d r = void;
603         r.ptr[0] = a.array[0];
604         r.ptr[1] = a.array[1];
605         return r;
606     }
607 }
608 unittest
609 {
610     __m128d A = _mm_setr_pd(4.0, -6.125);
611     __m256d B = _mm256_castpd128_pd256(A);
612     assert(B.array[0] == 4.0);
613     assert(B.array[1] == -6.125);
614 }
615 
616 /// Cast vector of type `__m256d` to type `__m128d`; the upper 128 bits of `a` are lost.
617 __m128d _mm256_castpd256_pd128 (__m256d a) pure @trusted
618 {
619     static if (GDC_with_AVX)
620     {
621         return __builtin_ia32_pd_pd256(a);
622     }
623     else
624     {
625         __m128d r;
626         r.ptr[0] = a.array[0];
627         r.ptr[1] = a.array[1];
628         return r;
629     }
630 }
631 unittest
632 {
633     __m256d A = _mm256_set_pd(1, 2, -6.25, 4.0);
634     __m128d B = _mm256_castpd256_pd128(A);
635     assert(B.array[0] == 4.0);
636     assert(B.array[1] == -6.25);
637 }
638 
639 /// Cast vector of type `__m256` to type `__m256d`.
640 __m256d _mm256_castps_pd (__m256 a) pure @safe
641 {
642     return cast(__m256d)a;
643 }
644 
645 /// Cast vector of type `__m256` to type `__m256i`.
646 __m256i _mm256_castps_si256 (__m256 a) pure @safe
647 {
648     return cast(__m256i)a;
649 }
650 
651 /// Cast vector of type `__m128` to type `__m256`; the upper 128 bits of the result are undefined.
652 __m256 _mm256_castps128_ps256 (__m128 a) pure @trusted
653 {
654     static if (GDC_with_AVX)
655     {
656         return __builtin_ia32_ps256_ps(a);
657     }
658     else
659     {
660         __m256 r = void;
661         r.ptr[0] = a.array[0];
662         r.ptr[1] = a.array[1];
663         r.ptr[2] = a.array[2];
664         r.ptr[3] = a.array[3];
665         return r;
666     }
667 }
668 unittest
669 {
670     __m128 A = _mm_setr_ps(1.0f, 2, 3, 4);
671     __m256 B = _mm256_castps128_ps256(A);
672     float[4] correct = [1.0f, 2, 3, 4];
673     assert(B.array[0..4] == correct);
674 }
675 
676 /// Cast vector of type `__m256` to type `__m128`. The upper 128 bits of `a` are lost.
677 __m128 _mm256_castps256_ps128 (__m256 a) pure @trusted
678 {
679     return *cast(const(__m128)*)(&a);
680 }
681 unittest
682 {
683     __m256 A = _mm256_setr_ps(1.0f, 2, 3, 4, 5, 6, 7, 8);
684     __m128 B = _mm256_castps256_ps128(A);
685     float[4] correct = [1.0f, 2, 3, 4];
686     assert(B.array == correct);
687 }
688 
689 /// Cast vector of type `__m128i` to type `__m256i`; the upper 128 bits of the result are undefined.
690 __m256i _mm256_castsi128_si256 (__m128i a) pure @trusted
691 {
692     long2 la = cast(long2)a;
693     long4 r = void;
694     r.ptr[0] = la.array[0];
695     r.ptr[1] = la.array[1];
696     return r;
697 }
698 unittest
699 {
700     __m128i A = _mm_setr_epi64(-1, 42);
701     __m256i B = _mm256_castsi128_si256(A);
702     long[2] correct = [-1, 42];
703     assert(B.array[0..2] == correct);
704 }
705 
706 /// Cast vector of type `__m256i` to type `__m256d`.
707 __m256d _mm256_castsi256_pd (__m256i a) pure @safe
708 {
709     return cast(__m256d)a;
710 }
711 
712 /// Cast vector of type `__m256i` to type `__m256`.
713 __m256 _mm256_castsi256_ps (__m256i a) pure @safe
714 {
715     return cast(__m256)a;
716 }
717 
718 /// Cast vector of type `__m256i` to type `__m128i`. The upper 128 bits of `a` are lost.
719 __m128i _mm256_castsi256_si128 (__m256i a) pure @trusted
720 {
721     long2 r = void;
722     r.ptr[0] = a.array[0];
723     r.ptr[1] = a.array[1];
724     return cast(__m128i)r;
725 }
726 unittest
727 {
728     long4 A;
729     A.ptr[0] = -1;
730     A.ptr[1] = 42;
731     long2 B = cast(long2)(_mm256_castsi256_si128(A));
732     long[2] correct = [-1, 42];
733     assert(B.array[0..2] == correct);
734 }
735 
736 /// Round the packed double-precision (64-bit) floating-point elements in `a` up to an integer 
737 /// value, and store the results as packed double-precision floating-point elements.
738 __m256d _mm256_ceil_pd (__m256d a) @safe
739 {
740     static if (LDC_with_ARM64)
741     {
742         __m128d lo = _mm256_extractf128_pd!0(a);
743         __m128d hi = _mm256_extractf128_pd!1(a);
744         __m128d ilo = _mm_ceil_pd(lo);
745         __m128d ihi = _mm_ceil_pd(hi);
746         return _mm256_set_m128d(ihi, ilo);
747     }
748     else
749     {
750         return _mm256_round_pd!2(a);
751     }
752 }
753 unittest
754 {
755     __m256d A = _mm256_setr_pd(1.3f, -2.12f, 53.6f, -2.7f);
756     A = _mm256_ceil_pd(A);
757     double[4] correct = [2.0, -2.0, 54.0, -2.0];
758     assert(A.array == correct);
759 }
760 
761 /// Round the packed single-precision (32-bit) floating-point elements in `a` up to an integer 
762 /// value, and store the results as packed single-precision floating-point elements.
763 __m256 _mm256_ceil_ps (__m256 a) @safe
764 {
765     static if (LDC_with_ARM64)
766     {
767         __m128 lo = _mm256_extractf128_ps!0(a);
768         __m128 hi = _mm256_extractf128_ps!1(a);
769         __m128 ilo = _mm_ceil_ps(lo);
770         __m128 ihi = _mm_ceil_ps(hi);
771         return _mm256_set_m128(ihi, ilo);
772     }
773     else
774     {
775         return _mm256_round_ps!2(a);
776     }
777 }
778 unittest
779 {
780     __m256 A = _mm256_setr_ps(1.3f, -2.12f, 53.6f, -2.7f, -1.3f, 2.12f, -53.6f, 2.7f);
781     __m256 C = _mm256_ceil_ps(A);
782     float[8] correct       = [2.0f, -2.0f,  54.0f, -2.0f, -1,    3,     -53,    3];
783     assert(C.array == correct);
784 }
785 
786 /// Compare packed double-precision (64-bit) floating-point elements in `a` and `b` based on the 
787 /// comparison operand specified by `imm8`. 
788 __m128d _mm_cmp_pd(int imm8)(__m128d a, __m128d b) pure @safe
789 {
790     enum comparison = mapAVXFPComparison(imm8);
791     return cast(__m128d) cmppd!comparison(a, b);
792 }
793 unittest
794 {
795     __m128d A = _mm_setr_pd(double.infinity, double.nan);
796     __m128d B = _mm_setr_pd(3.0,             4.0);
797     long2 R = cast(long2) _mm_cmp_pd!_CMP_GT_OS(A, B);
798     long[2] correct = [-1, 0];
799     assert(R.array == correct);
800 
801     long2 R2 = cast(long2) _mm_cmp_pd!_CMP_NLE_UQ(A, B);
802     long[2] correct2 = [-1, -1];
803     assert(R2.array == correct2);
804 }
805 
806 ///ditto
807 __m256d _mm256_cmp_pd(int imm8)(__m256d a, __m256d b) pure @safe
808 {
809     enum comparison = mapAVXFPComparison(imm8);
810     return cast(__m256d) cmppd256!comparison(a, b);
811 }
812 unittest
813 {
814     __m256d A = _mm256_setr_pd(1.0, 2.0, 3.0, double.nan);
815     __m256d B = _mm256_setr_pd(3.0, 2.0, 1.0, double.nan);
816     __m256i R = cast(__m256i) _mm256_cmp_pd!_CMP_LT_OS(A, B);
817     long[4] correct = [-1, 0, 0, 0];
818     assert(R.array == correct);
819 }
820 
821 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` based on the 
822 /// comparison operand specified by `imm8`. 
823 __m128 _mm_cmp_ps(int imm8)(__m128 a, __m128 b) pure @safe
824 {
825     enum comparison = mapAVXFPComparison(imm8);
826     return cast(__m128) cmpps!comparison(a, b);
827 }
828 
829 ///ditto
830 __m256 _mm256_cmp_ps(int imm8)(__m256 a, __m256 b) pure @safe
831 {
832     enum comparison = mapAVXFPComparison(imm8);
833     return cast(__m256) cmpps256!comparison(a, b);
834 }
835 
836 /// Compare the lower double-precision (64-bit) floating-point element in `a` and `b` based on the
837 /// comparison operand specified by `imm8`, store the result in the lower element of result, and 
838 /// copy the upper element from `a` to the upper element of result.
839 __m128d _mm_cmp_sd(int imm8)(__m128d a, __m128d b) pure @safe
840 {
841     enum comparison = mapAVXFPComparison(imm8);
842     return cast(__m128d) cmpsd!comparison(a, b);
843 }
844 
845 /// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` based on the
846 /// comparison operand specified by `imm8`, store the result in the lower element of result, and 
847 /// copy the upper 3 packed elements from `a` to the upper elements of result.
848 __m128 _mm_cmp_ss(int imm8)(__m128 a, __m128 b) pure @safe
849 {
850     enum comparison = mapAVXFPComparison(imm8);
851     return cast(__m128) cmpss!comparison(a, b);
852 }
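// Added test for _mm_cmp_ss: only the low lane is compared, the upper lanes come from `a`.
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 10.0f, 20.0f, 30.0f);
    __m128 B = _mm_setr_ps(2.0f,  0.0f,  0.0f,  0.0f);
    __m128 R = _mm_cmp_ss!_CMP_LT_OS(A, B);
    assert((cast(int4)R).array[0] == -1); // 1.0 < 2.0 (ordered)
    assert(R.array[1] == 10.0f);
    assert(R.array[2] == 20.0f);
    assert(R.array[3] == 30.0f);
}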
853 
854 /// Convert packed signed 32-bit integers in a to packed double-precision (64-bit) floating-point 
855 /// elements.
856 __m256d _mm256_cvtepi32_pd (__m128i a) pure @trusted
857 {
858     static if (LDC_with_optimizations)
859     {
860         enum ir = `
861             %r = sitofp <4 x i32> %0 to <4 x double>
862             ret <4 x double> %r`;
863         return LDCInlineIR!(ir, double4, __m128i)(a);
864     }
865     else static if (GDC_with_AVX)
866     {
867         return __builtin_ia32_cvtdq2pd256(a);
868     }
869     else
870     {
871         double4 r;
872         r.ptr[0] = a.array[0];
873         r.ptr[1] = a.array[1];
874         r.ptr[2] = a.array[2];
875         r.ptr[3] = a.array[3];
876         return r;
877     }
878 }
879 unittest
880 {
881     __m256d R = _mm256_cvtepi32_pd(_mm_set1_epi32(54));
882     double[4] correct = [54.0, 54, 54, 54];
883     assert(R.array == correct);
884 }
885 
886 /// Convert packed signed 32-bit integers in `a` to packed single-precision (32-bit) floating-point 
887 /// elements.
888 __m256 _mm256_cvtepi32_ps (__m256i a) pure @trusted
889 {
890     static if (LDC_with_optimizations)
891     {
892         enum ir = `
893             %r = sitofp <8 x i32> %0 to <8 x float>
894             ret <8 x float> %r`;
895         return LDCInlineIR!(ir, float8, int8)(cast(int8)a);
896     }
897     else static if (GDC_with_AVX)
898     {
899         return __builtin_ia32_cvtdq2ps256(cast(int8)a);
900     }
901     else
902     {
903         int8 ia = cast(int8)a;
904         __m256 r;
905         r.ptr[0] = ia.array[0];
906         r.ptr[1] = ia.array[1];
907         r.ptr[2] = ia.array[2];
908         r.ptr[3] = ia.array[3];
909         r.ptr[4] = ia.array[4];
910         r.ptr[5] = ia.array[5];
911         r.ptr[6] = ia.array[6];
912         r.ptr[7] = ia.array[7];
913         return r;
914     }
915 }
916 unittest
917 {
918     __m256 R = _mm256_cvtepi32_ps(_mm256_set1_epi32(5));
919     float[8] correct = [5.0f, 5, 5, 5, 5, 5, 5, 5];
920     assert(R.array == correct);
921 }
922 
923 /// Convert packed double-precision (64-bit) floating-point elements in `a` to packed 32-bit 
924 /// integers. Follows the current rounding mode.
925 __m128i _mm256_cvtpd_epi32 (__m256d a) @safe
926 {
927     static if (GDC_or_LDC_with_AVX)
928     {
929         return __builtin_ia32_cvtpd2dq256(a);
930     }
931     else
932     {
933         __m128d lo = _mm256_extractf128_pd!0(a);
934         __m128d hi = _mm256_extractf128_pd!1(a);
935         __m128i ilo = _mm_cvtpd_epi32(lo); // Only lower 64-bit contains significant values
936         __m128i ihi = _mm_cvtpd_epi32(hi);
937         return _mm_unpacklo_epi64(ilo, ihi);
938     }
939 }
940 unittest
941 {
942     int4 A = _mm256_cvtpd_epi32(_mm256_setr_pd(61.0, 55.0, -100, 1_000_000));
943     int[4] correct = [61, 55, -100, 1_000_000];
944     assert(A.array == correct);
945 }
946 
947 /// Convert packed double-precision (64-bit) floating-point elements in `a` to packed single-precision (32-bit) 
948 /// floating-point elements.
949 __m128 _mm256_cvtpd_ps (__m256d a) pure @trusted
950 {
951     // PERF DMD
952     static if (GDC_or_LDC_with_AVX)
953     {
954         return __builtin_ia32_cvtpd2ps256(a);
955     }
956     else
957     {
958         __m128 r;
959         r.ptr[0] = a.array[0];
960         r.ptr[1] = a.array[1];
961         r.ptr[2] = a.array[2];
962         r.ptr[3] = a.array[3];
963         return r;
964     }
965 }
966 unittest
967 {
968     __m256d A = _mm256_setr_pd(1.0, 2, 3, 5);
969     __m128 R = _mm256_cvtpd_ps(A);
970     float[4] correct = [1.0f, 2, 3, 5];
971     assert(R.array == correct);
972 }
973 
974 /// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit 
975 /// integers, using the current rounding mode.
976 __m256i _mm256_cvtps_epi32 (__m256 a) @trusted
977 {
978     static if (GDC_or_LDC_with_AVX)
979     {
980         return cast(__m256i) __builtin_ia32_cvtps2dq256(a);
981     }
982     else
983     {
984         __m128 lo = _mm256_extractf128_ps!0(a);
985         __m128 hi = _mm256_extractf128_ps!1(a);
986         __m128i ilo = _mm_cvtps_epi32(lo);
987         __m128i ihi = _mm_cvtps_epi32(hi);
988         return _mm256_set_m128i(ihi, ilo);
989     }
990 }
991 unittest
992 {
993     uint savedRounding = _MM_GET_ROUNDING_MODE();
994 
995     _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
996     __m256i A = _mm256_cvtps_epi32(_mm256_setr_ps(1.4f, -2.1f, 53.5f, -2.9f, -1.4f, 2.1f, -53.5f, 2.9f));
997     assert( (cast(int8)A).array == [1, -2, 54, -3, -1, 2, -54, 3]);
998 
999     _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
1000     A = _mm256_cvtps_epi32(_mm256_setr_ps(1.3f, -2.11f, 53.4f, -2.8f, -1.3f, 2.11f, -53.4f, 2.8f));
1001     assert( (cast(int8)A).array == [1, -3, 53, -3, -2, 2, -54, 2]);
1002 
1003     _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
1004     A = _mm256_cvtps_epi32(_mm256_setr_ps(1.3f, -2.12f, 53.6f, -2.7f, -1.3f, 2.12f, -53.6f, 2.7f));
1005     assert( (cast(int8)A).array == [2, -2, 54, -2, -1, 3, -53, 3]);
1006 
1007     _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
1008     A = _mm256_cvtps_epi32(_mm256_setr_ps(1.4f, -2.17f, 53.8f, -2.91f, -1.4f, 2.17f, -53.8f, 2.91f));
1009     assert( (cast(int8)A).array == [1, -2, 53, -2, -1, 2, -53, 2]);
1010 
1011     _MM_SET_ROUNDING_MODE(savedRounding);
1012 }
1013 
1014 
1015 /// Convert packed single-precision (32-bit) floating-point elements in `a` to packed double-precision 
1016 /// (64-bit) floating-point elements.
1017 __m256d _mm256_cvtps_pd (__m128 a) pure @trusted
1018 {   
1019     // PERF DMD
1020     static if (GDC_with_AVX)
1021     {
1022         return __builtin_ia32_cvtps2pd256(a); // LDC doesn't have the builtin
1023     }
1024     else
1025     {
1026         // LDC: x86, needs -O2 to generate cvtps2pd since LDC 1.2.0
1027         __m256d r;
1028         r.ptr[0] = a.array[0];
1029         r.ptr[1] = a.array[1];
1030         r.ptr[2] = a.array[2];
1031         r.ptr[3] = a.array[3];
1032         return r;
1033     }
1034 }
1035 unittest
1036 {
1037     __m128 A = _mm_setr_ps(1.0f, 2, 3, 5);
1038     __m256d R = _mm256_cvtps_pd(A);
1039     double[4] correct = [1.0, 2, 3, 5];
1040     assert(R.array == correct);
1041 }
1042 
1043 /// Return the lower double-precision (64-bit) floating-point element of `a`.
1044 double _mm256_cvtsd_f64 (__m256d a) pure @safe
1045 {
1046     return a.array[0];
1047 }
1048 
1049 /// Return the lower 32-bit integer in `a`.
1050 int _mm256_cvtsi256_si32 (__m256i a) pure @safe
1051 {
1052     return (cast(int8)a).array[0];
1053 }
1054 
1055 /// Return the lower single-precision (32-bit) floating-point element of `a`.
1056 float _mm256_cvtss_f32 (__m256 a) pure @safe
1057 {
1058     return a.array[0];
1059 }
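// Added quick checks for the three scalar extraction helpers above.
unittest
{
    assert(_mm256_cvtsd_f64(_mm256_setr_pd(4.5, 0, 0, 0)) == 4.5);
    assert(_mm256_cvtsi256_si32(_mm256_set1_epi32(-42)) == -42);
    assert(_mm256_cvtss_f32(_mm256_setr_ps(2.5f, 0, 0, 0, 0, 0, 0, 0)) == 2.5f);
}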
1060 
1061 /// Convert packed double-precision (64-bit) floating-point elements in `a` to packed 32-bit 
1062 /// integers with truncation.
1063 __m128i _mm256_cvttpd_epi32 (__m256d a) pure @trusted
1064 {
1065     // PERF DMD
1066     static if (GDC_or_LDC_with_AVX)
1067     {
1068         return cast(__m128i)__builtin_ia32_cvttpd2dq256(a);
1069     }
1070     else
1071     {
1072         __m128i r;
1073         r.ptr[0] = cast(int)a.array[0];
1074         r.ptr[1] = cast(int)a.array[1];
1075         r.ptr[2] = cast(int)a.array[2];
1076         r.ptr[3] = cast(int)a.array[3];
1077         return r;
1078     }
1079 }
1080 unittest
1081 {
1082     __m256d A = _mm256_set_pd(4.7, -1000.9, -7.1, 3.1);
1083     __m128i R = _mm256_cvttpd_epi32(A);
1084     int[4] correct = [3, -7, -1000, 4];
1085     assert(R.array == correct);
1086 }
1087 
1088 /// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit integers with truncation.
1089 __m256i _mm256_cvttps_epi32 (__m256 a) pure @trusted
1090 {
1091     // PERF DMD
1092     static if (GDC_or_LDC_with_AVX)
1093     {
1094         return cast(__m256i)__builtin_ia32_cvttps2dq256(a);
1095     }
1096     else
1097     {
1098         int8 r;
1099         r.ptr[0] = cast(int)a.array[0];
1100         r.ptr[1] = cast(int)a.array[1];
1101         r.ptr[2] = cast(int)a.array[2];
1102         r.ptr[3] = cast(int)a.array[3];
1103         r.ptr[4] = cast(int)a.array[4];
1104         r.ptr[5] = cast(int)a.array[5];
1105         r.ptr[6] = cast(int)a.array[6];
1106         r.ptr[7] = cast(int)a.array[7];
1107         return cast(__m256i)r;
1108     }
1109 }
1110 unittest
1111 {
1112     __m256 A = _mm256_set_ps(4.7, -1000.9, -7.1, 3.1, 1.4, 2.9, -2.9, 0);
1113     int8 R = cast(int8) _mm256_cvttps_epi32(A);
1114     int[8] correct = [0, -2, 2, 1, 3, -7, -1000, 4];
1115     assert(R.array == correct);
1116 }
1117 
1118 /// Divide packed double-precision (64-bit) floating-point elements in `a` by packed elements in `b`.
1119 __m256d _mm256_div_pd (__m256d a, __m256d b) pure @safe
1120 {
1121     return a / b;
1122 }
1123 unittest
1124 {
1125     __m256d a = [1.5, -2.0, 3.0, 1.0];
1126     a = _mm256_div_pd(a, a);
1127     double[4] correct = [1.0, 1.0, 1.0, 1.0];
1128     assert(a.array == correct);
1129 }
1130 
1131 /// Divide packed single-precision (32-bit) floating-point elements in `a` by packed elements in `b`.
1132 __m256 _mm256_div_ps (__m256 a, __m256 b) pure @safe
1133 {
1134     return a / b;
1135 }
1136 unittest
1137 {
1138     __m256 a = [1.5f, -2.0f, 3.0f, 1.0f, 4.5f, -5.0f, 6.0f, 7.0f];
1139     a = _mm256_div_ps(a, a);
1140     float[8] correct = [1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f];
1141     assert(a.array == correct);
1142 }
1143 
1144 /// Conditionally multiply the packed single-precision (32-bit) floating-point elements in `a` and 
1145 /// `b` using the high 4 bits in `imm8`, sum the four products, and conditionally store the sum 
1146 /// using the low 4 bits of `imm8`.
1147 __m256 _mm256_dp_ps(int imm8)(__m256 a, __m256 b)
1148 {
1149     // PERF DMD
1150     static if (GDC_or_LDC_with_AVX)
1151     {
1152         return __builtin_ia32_dpps256(a, b, cast(ubyte)imm8);
1153     }
1154     else
1155     {
1156         // Note: in LDC with SSE4.1 but no AVX, we _could_ increase perf a bit by using two 
1157         // _mm_dp_ps.
1158         __m256 zero = _mm256_setzero_ps();
1159         enum ubyte op = (imm8 >>> 4) & 15;
1160         __m256 temp = _mm256_blend_ps!( op | (op << 4) )(zero, a * b);
1161         float lo = temp.array[0] + temp.array[1] + temp.array[2] + temp.array[3];
1162         float hi = temp.array[4] + temp.array[5] + temp.array[6] + temp.array[7];
1163         __m256 r = _mm256_set_m128(_mm_set1_ps(hi), _mm_set1_ps(lo));
1164         enum ubyte op2 = (imm8 & 15);
1165         return _mm256_blend_ps!(op2 | (op2 << 4))(zero, r);
1166     }
1167 }
1168 unittest
1169 {
1170     // Products:                 9    14    20   24     6    16    12   -24
1171     __m256 A = _mm256_setr_ps(1.0f, 2.0f, 4.0f, 8.0f, 1.0f, 2.0f, 4.0f, 8.0f);
1172     __m256 B = _mm256_setr_ps(9.0f, 7.0f, 5.0f, 3.0f, 6.0f, 8.0f, 3.0f,-3.0f);
1173     float8 R1 = _mm256_dp_ps!(0xf0 + 0xf)(A, B);
1174     float8 R2 = _mm256_dp_ps!(0x30 + 0x5)(A, B);
1175     float8 R3 = _mm256_dp_ps!(0x50 + 0xa)(A, B);
1176     float[8] correct1 =   [67.0f, 67.0f, 67.0f,67.0f,  10,   10,   10,  10];
1177     float[8] correct2 =   [23.0f, 0.0f, 23.0f,  0.0f,  22,    0,   22,   0];
1178     float[8] correct3 =   [0.0f, 29.0f, 0.0f,  29.0f,   0,   18,    0,  18];
1179     assert(R1.array == correct1);
1180     assert(R2.array == correct2);
1181     assert(R3.array == correct3);
1182 }
1183 
1184 /// Extract a 32-bit integer from `a`, selected with `imm8`.
1185 int _mm256_extract_epi32 (__m256i a, const int imm8) pure @trusted
1186 {
1187     return (cast(int8)a).array[imm8 & 7];
1188 }
1189 unittest
1190 {
1191     align(16) int[8] data = [-1, 2, -3, 4, 9, -7, 8, -6];
1192     auto A = _mm256_loadu_si256(cast(__m256i*) data.ptr);
1193     assert(_mm256_extract_epi32(A, 0) == -1);
1194     assert(_mm256_extract_epi32(A, 1 + 8) == 2);
1195     assert(_mm256_extract_epi32(A, 3 + 16) == 4);
1196     assert(_mm256_extract_epi32(A, 7 + 32) == -6);
1197 }
1198 
1199 /// Extract a 64-bit integer from `a`, selected with `index`.
1200 long _mm256_extract_epi64 (__m256i a, const int index) pure @safe
1201 {
1202     return a.array[index & 3];
1203 }
1204 unittest
1205 {
1206     __m256i A = _mm256_setr_epi64x(-7, 6, 42, 0);
1207     assert(_mm256_extract_epi64(A, -8) == -7);
1208     assert(_mm256_extract_epi64(A, 1) == 6);
1209     assert(_mm256_extract_epi64(A, 2 + 4) == 42);
1210 }
1211 
1212 /// Extract a 128-bit lane from `a`, selected with `imm8` (0 or 1).
1213 /// Note: `_mm256_extractf128_pd!0` is equivalent to `_mm256_castpd256_pd128`.
1214 __m128d _mm256_extractf128_pd(ubyte imm8)(__m256d a) pure @trusted
1215 {
1216     version(GNU) pragma(inline, true); // else GDC has trouble inlining this
1217 
1218     // PERF DMD
1219     static if (GDC_with_AVX)
1220     {
1221         // Note: needs to be a template intrinsic because of this builtin.
1222         return __builtin_ia32_vextractf128_pd256(a, imm8 & 1);
1223     }
1224     else
1225     {
1226         double2 r = void;
1227         enum int index = 2*(imm8 & 1);
1228         r.ptr[0] = a.array[index+0];
1229         r.ptr[1] = a.array[index+1];
1230         return r;
1231     }
1232 }
1233 unittest
1234 {
1235     __m256d A = _mm256_setr_pd(1.0, 2, 3, 4);
1236     double[4] correct = [1.0, 2, 3, 4];
1237     __m128d l0 = _mm256_extractf128_pd!18(A);
1238     __m128d l1 = _mm256_extractf128_pd!55(A);
1239     assert(l0.array == correct[0..2]);
1240     assert(l1.array == correct[2..4]);
1241 }
1242 
1243 ///ditto
1244 __m128 _mm256_extractf128_ps(ubyte imm8)(__m256 a) pure @trusted
1245 {
1246     version(GNU) pragma(inline, true); // else GDC has trouble inlining this
1247 
1248     // PERF DMD
1249     static if (GDC_with_AVX)
1250     {
1251         return __builtin_ia32_vextractf128_ps256(a, imm8 & 1);
1252     }
1253     else
1254     {
1255         float4 r = void; // Optimize well since LDC 1.1 -O1
1256         enum int index = 4*(imm8 & 1);
1257         r.ptr[0] = a.array[index+0];
1258         r.ptr[1] = a.array[index+1];
1259         r.ptr[2] = a.array[index+2];
1260         r.ptr[3] = a.array[index+3];
1261         return r;
1262     }
1263 }
1264 unittest
1265 {
1266     __m256 A = _mm256_setr_ps(1.0, 2, 3, 4, 5, 6, 7, 8);
1267     float[8] correct = [1.0, 2, 3, 4, 5, 6, 7, 8];
1268     __m128 l0 = _mm256_extractf128_ps!8(A);
1269     __m128 l1 = _mm256_extractf128_ps!255(A);
1270     assert(l0.array == correct[0..4]);
1271     assert(l1.array == correct[4..8]);
1272 }
1273 
1274 ///ditto
1275 __m128i _mm256_extractf128_si256(ubyte imm8)(__m256i a) pure @trusted
1276 {
1277     version(GNU) pragma(inline, true); // else GDC has trouble inlining this
1278 
1279     // PERF DMD
1280     static if (GDC_with_AVX)
1281     {
1282         // Note: if it weren't for this GDC intrinsic, _mm256_extractf128_si256
1283         // could be a non-template, however, this wins in -O0.
1284         // Same story for _mm256_extractf128_ps and _mm256_extractf128_pd
1285         return __builtin_ia32_vextractf128_si256(cast(int8)a, imm8 & 1);
1286     }
1287     else
1288     {
1289         long2 r = void;
1290         enum int index = 2*(imm8 & 1);
1291         r.ptr[0] = a.array[index+0];
1292         r.ptr[1] = a.array[index+1];
1293         return cast(__m128i)r;
1294     }
1295 }
1296 unittest
1297 {
1298     __m256i A = _mm256_setr_epi32(9, 2, 3, 4, 5, 6, 7, 8);
1299     int[8] correct = [9, 2, 3, 4, 5, 6, 7, 8];
1300     __m128i l0 = _mm256_extractf128_si256!0(A);
1301     __m128i l1 = _mm256_extractf128_si256!1(A);
1302     assert(l0.array == correct[0..4]);
1303     assert(l1.array == correct[4..8]);
1304 }
1305 
1306 /// Round the packed double-precision (64-bit) floating-point elements in `a` down to an integer 
1307 /// value, and store the results as packed double-precision floating-point elements.
1308 __m256d _mm256_floor_pd (__m256d a) @safe
1309 {
1310     static if (LDC_with_ARM64)
1311     {
1312         __m128d lo = _mm256_extractf128_pd!0(a);
1313         __m128d hi = _mm256_extractf128_pd!1(a);
1314         __m128d ilo = _mm_floor_pd(lo);
1315         __m128d ihi = _mm_floor_pd(hi);
1316         return _mm256_set_m128d(ihi, ilo);
1317     }
1318     else
1319     {
1320         return _mm256_round_pd!1(a);
1321     }
1322 }
1323 unittest
1324 {
1325     __m256d A = _mm256_setr_pd(1.3f, -2.12f, 53.6f, -2.7f);
1326     A = _mm256_floor_pd(A);
1327     double[4] correct = [1.0, -3.0, 53.0, -3.0];
1328     assert(A.array == correct);
1329 }
1330 
1331 /// Round the packed single-precision (32-bit) floating-point elements in `a` down to an integer 
1332 /// value, and store the results as packed single-precision floating-point elements.
1333 __m256 _mm256_floor_ps (__m256 a) @safe
1334 {
1335     static if (LDC_with_ARM64)
1336     {
1337         __m128 lo = _mm256_extractf128_ps!0(a);
1338         __m128 hi = _mm256_extractf128_ps!1(a);
1339         __m128 ilo = _mm_floor_ps(lo);
1340         __m128 ihi = _mm_floor_ps(hi);
1341         return _mm256_set_m128(ihi, ilo);
1342     }
1343     else
1344     {
1345         return _mm256_round_ps!1(a);
1346     }
1347 }
1348 unittest
1349 {
1350     __m256 A = _mm256_setr_ps(1.3f, -2.12f, 53.6f, -2.7f, -1.3f, 2.12f, -53.6f, 2.7f);
1351     __m256 C = _mm256_floor_ps(A);
1352     float[8] correct       = [1.0f, -3.0f,  53.0f, -3.0f, -2,    2,     -54,    2];
1353     assert(C.array == correct);
1354 }
1355 
1356 /// Horizontally add adjacent pairs of double-precision (64-bit) floating-point elements in `a` 
1357 /// and `b`. 
1358 __m256d _mm256_hadd_pd (__m256d a, __m256d b) pure @trusted
1359 {
1360     static if (GDC_or_LDC_with_AVX)
1361     {
1362         return __builtin_ia32_haddpd256(a, b);
1363     }
1364     else
1365     {
1366         __m256d res;
1367         res.ptr[0] = a.array[1] + a.array[0];
1368         res.ptr[1] = b.array[1] + b.array[0];
1369         res.ptr[2] = a.array[3] + a.array[2];
1370         res.ptr[3] = b.array[3] + b.array[2];
1371         return res;
1372     }
1373 }
1374 unittest
1375 {
1376     __m256d A =_mm256_setr_pd(1.5, 2.0, 21.0, 9.0);
1377     __m256d B =_mm256_setr_pd(1.0, 7.0, 100.0, 14.0);
1378     __m256d C = _mm256_hadd_pd(A, B);
1379     double[4] correct =      [3.5, 8.0, 30.0, 114.0];
1380     assert(C.array == correct);
1381 }
1382 
1383 /// Horizontally add adjacent pairs of single-precision (32-bit) floating-point elements in `a` and
1384 /// `b`.
1385 __m256 _mm256_hadd_ps (__m256 a, __m256 b) pure @trusted
1386 {
1387     // PERF DMD
1388     static if (GDC_or_LDC_with_AVX)
1389     {
1390         return __builtin_ia32_haddps256(a, b);
1391     }
1392     else static if (LDC_with_ARM64)
1393     {
1394         __m128 a_hi = _mm256_extractf128_ps!1(a);
1395         __m128 a_lo = _mm256_extractf128_ps!0(a);
1396         __m128 b_hi = _mm256_extractf128_ps!1(b);
1397         __m128 b_lo = _mm256_extractf128_ps!0(b);
1398         __m128 hi = vpaddq_f32(a_hi, b_hi);
1399         __m128 lo = vpaddq_f32(a_lo, b_lo);
1400         return _mm256_set_m128(hi, lo);
1401     }
1402     else
1403     {    
1404         __m256 res;
1405         res.ptr[0] = a.array[1] + a.array[0];
1406         res.ptr[1] = a.array[3] + a.array[2];
1407         res.ptr[2] = b.array[1] + b.array[0];
1408         res.ptr[3] = b.array[3] + b.array[2];
1409         res.ptr[4] = a.array[5] + a.array[4];
1410         res.ptr[5] = a.array[7] + a.array[6];
1411         res.ptr[6] = b.array[5] + b.array[4];
1412         res.ptr[7] = b.array[7] + b.array[6];
1413         return res;
1414     }
1415 }
1416 unittest
1417 {
1418     __m256 A =_mm256_setr_ps(1.0f, 2.0f, 3.0f, 5.0f, 1.0f, 2.0f, 3.0f, 5.0f);
1419     __m256 B =_mm256_setr_ps(1.5f, 2.0f, 3.5f, 4.0f, 1.5f, 2.0f, 3.5f, 5.0f);
1420     __m256 R = _mm256_hadd_ps(A, B);
1421     float[8] correct =      [3.0f, 8.0f, 3.5f, 7.5f, 3.0f, 8.0f, 3.5f, 8.5f];
1422     assert(R.array == correct);
1423 }
1424 
1425 /// Horizontally subtract adjacent pairs of double-precision (64-bit) floating-point elements in
1426 /// `a` and `b`. 
1427 __m256d _mm256_hsub_pd (__m256d a, __m256d b) pure @trusted
1428 {
1429     static if (GDC_or_LDC_with_AVX)
1430     {
1431         return __builtin_ia32_hsubpd256(a, b);
1432     }
1433     else 
1434     {
1435         // 2 zip1, 2 zip2, 2 fsub... I don't think there is anything better on arm64.
1436         __m256d res;
1437         res.ptr[0] = a.array[0] - a.array[1];
1438         res.ptr[1] = b.array[0] - b.array[1];
1439         res.ptr[2] = a.array[2] - a.array[3];
1440         res.ptr[3] = b.array[2] - b.array[3];
1441         return res;
1442     }
1443 }
1444 unittest
1445 {
1446     __m256d A =_mm256_setr_pd(1.5, 2.0, 21.0, 9.0);
1447     __m256d B =_mm256_setr_pd(1.0, 7.0, 100.0, 14.0);
1448     __m256d C = _mm256_hsub_pd(A, B);
1449     double[4] correct =      [-0.5, -6.0, 12.0, 86.0];
1450     assert(C.array == correct);
1451 }
1452 
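/// Horizontally subtract adjacent pairs of single-precision (32-bit) floating-point elements in
/// `a` and `b`.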
1453 __m256 _mm256_hsub_ps (__m256 a, __m256 b) pure @trusted
1454 {
1455     // PERF DMD
1456     static if (GDC_or_LDC_with_AVX)
1457     {
1458         return __builtin_ia32_hsubps256(a, b);
1459     }
1460     else
1461     {
1462         __m128 a_hi = _mm256_extractf128_ps!1(a);
1463         __m128 a_lo = _mm256_extractf128_ps!0(a);
1464         __m128 b_hi = _mm256_extractf128_ps!1(b);
1465         __m128 b_lo = _mm256_extractf128_ps!0(b);
1466         __m128 hi = _mm_hsub_ps(a_hi, b_hi);
1467         __m128 lo = _mm_hsub_ps(a_lo, b_lo);
1468         return _mm256_set_m128(hi, lo);
1469     }
1470 }
1471 unittest
1472 {
1473     __m256 A =_mm256_setr_ps(1.0f, 2.0f, 3.0f, 5.0f, 1.0f, 2.0f, 3.0f, 5.0f);
1474     __m256 B =_mm256_setr_ps(1.5f, 2.0f, 3.5f, 4.0f, 1.5f, 2.0f, 3.5f, 5.0f);
1475     __m256 R = _mm256_hsub_ps(A, B);
1476     float[8] correct =   [-1.0f, -2.0f, -0.5f, -0.5f, -1.0f, -2.0f, -0.5f, -1.5f];
1477     assert(R.array == correct);
1478 }
1479 
1480 /// Copy `a`, and insert the 16-bit integer `i` into the result at the location specified by 
1481 /// `index & 15`.
1482 __m256i _mm256_insert_epi16 (__m256i a, short i, const int index) pure @trusted
1483 {
1484     short16 sa = cast(short16)a;
1485     sa.ptr[index & 15] = i;
1486     return cast(__m256i)sa;
1487 }
1488 unittest
1489 {
1490     __m256i A = _mm256_set1_epi16(1);
1491     short16 R = cast(short16) _mm256_insert_epi16(A, 2, 16 + 16 + 7);
1492     short[16] correct = [1, 1, 1, 1, 1, 1, 1, 2, 
1493                          1, 1, 1, 1, 1, 1, 1, 1 ];
1494     assert(R.array == correct);
1495 }
1496 
1497 /// Copy `a`, and insert the 32-bit integer `i` into the result at the location specified by 
1498 /// `index & 7`.
1499 __m256i _mm256_insert_epi32 (__m256i a, int i, const int index) pure @trusted
1500 {
1501     int8 ia = cast(int8)a;
1502     ia.ptr[index & 7] = i;
1503     return cast(__m256i)ia;
1504 }
1505 unittest
1506 {
1507     __m256i A = _mm256_set1_epi32(1);
1508     int8 R = cast(int8) _mm256_insert_epi32(A, -2, 8 + 8 + 1);
1509     int[8] correct = [1, -2, 1, 1, 1, 1, 1, 1];
1510     assert(R.array == correct);
1511 }
1512 
1513 /// Copy `a`, and insert the 64-bit integer `i` into the result at the location specified by 
1514 /// `index & 3`.
1515 __m256i _mm256_insert_epi64(__m256i a, long i, const int index) pure @trusted
1516 {
1517     a.ptr[index & 3] = i;
1518     return a;
1519 }
1520 unittest
1521 {
1522     __m256i A = _mm256_set1_epi64(1);
1523     long4 R = cast(long4) _mm256_insert_epi64(A, -2, 2 - 4 - 4);
1524     long[4] correct = [1, 1, -2, 1];
1525     assert(R.array == correct);
1526 }
1527 
1528 /// Copy `a`, and insert the 8-bit integer `i` into the result at the location specified by 
1529 /// `index & 31`.
1530 __m256i _mm256_insert_epi8(__m256i a, byte i, const int index) pure @trusted
1531 {
1532     byte32 ba = cast(byte32)a;
1533     ba.ptr[index & 31] = i;
1534     return cast(__m256i)ba;
1535 }
1536 unittest
1537 {
1538     __m256i A = _mm256_set1_epi8(1);
1539     byte32 R = cast(byte32) _mm256_insert_epi8(A, -2, 7 - 32 - 32);
1540     byte[32] correct = [1, 1, 1, 1, 1, 1, 1,-2, 1, 1, 1, 1, 1, 1, 1, 1,
1541                         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ];
1542     assert(R.array == correct);
1543 }
1544 
1545 /// Copy `a`, then insert 128 bits (composed of 2 packed double-precision (64-bit) 
1546 /// floating-point elements) from `b` at the location specified by `imm8`.
1547 __m256d _mm256_insertf128_pd(int imm8)(__m256d a, __m128d b) pure @trusted
1548 {
1549     static if (GDC_with_AVX)
1550     {
1551         enum ubyte lane = imm8 & 1;
1552         return __builtin_ia32_vinsertf128_pd256(a, b, lane);
1553     }
1554     else
1555     {
1556         __m256d r = a;
1557         enum int index = (imm8 & 1) ? 2 : 0;
1558         r.ptr[index] = b.array[0];
1559         r.ptr[index+1] = b.array[1];
1560         return r;
1561     }
1562 }
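// Added test for _mm256_insertf128_pd: bit 0 of imm8 selects the destination lane.
unittest
{
    __m256d A = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
    __m128d B = _mm_setr_pd(-5.0, -6.0);
    __m256d R = _mm256_insertf128_pd!1(A, B);
    double[4] correct = [1.0, 2.0, -5.0, -6.0];
    assert(R.array == correct);
}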
1563 
1564 /// Copy `a` then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point
1565 /// elements) from `b`, at the location specified by `imm8`.
1566 __m256 _mm256_insertf128_ps(int imm8)(__m256 a, __m128 b) pure @trusted
1567 {
1568     static if (GDC_with_AVX)
1569     {
1570         enum ubyte lane = imm8 & 1;
1571         return __builtin_ia32_vinsertf128_ps256(a, b, lane);
1572     }
1573     else
1574     {
1575         __m256 r = a;
1576         enum int index = (imm8 & 1) ? 4 : 0;
1577         r.ptr[index] = b.array[0];
1578         r.ptr[index+1] = b.array[1];
1579         r.ptr[index+2] = b.array[2];
1580         r.ptr[index+3] = b.array[3];
1581         return r;
1582     }
1583 }
1584 
1585 /// Copy `a`, then insert 128 bits from `b` at the location specified by `imm8`.
1586 __m256i _mm256_insertf128_si256(int imm8)(__m256i a, __m128i b) pure @trusted
1587 {
1588     static if (GDC_with_AVX)
1589     {
1590         enum ubyte lane = imm8 & 1;
1591         return cast(__m256i) __builtin_ia32_vinsertf128_si256 (cast(int8)a, b, lane);
1592     }
1593     else
1594     {
1595         long2 lb = cast(long2)b;
1596         __m256i r = a;
1597         enum int index = (imm8 & 1) ? 2 : 0;
1598         r.ptr[index] = lb.array[0];
1599         r.ptr[index+1] = lb.array[1];
1600         return r;
1601     }
1602 }
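// Added test for _mm256_insertf128_si256.
unittest
{
    __m256i A = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    __m128i B = _mm_setr_epi32(-1, -2, -3, -4);
    int8 R = cast(int8) _mm256_insertf128_si256!0(A, B);
    int[8] correct = [-1, -2, -3, -4, 4, 5, 6, 7];
    assert(R.array == correct);
}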
1603 
1604 /// Load 256-bits of integer data from unaligned memory into dst. 
1605 /// This intrinsic may run better than `_mm256_loadu_si256` when the data crosses a cache 
1606 /// line boundary.
1607 __m256i _mm256_lddqu_si256(const(__m256i)* mem_addr) @trusted
1608 {
1609     // PERF DMD
1610     static if (GDC_or_LDC_with_AVX)
1611     {
1612         return cast(__m256i) __builtin_ia32_lddqu256(cast(const(char)*)mem_addr);
1613     }
1614     else
1615         return _mm256_loadu_si256(mem_addr);
1616 }
1617 unittest
1618 {
1619     int[10] correct = [0, -1, 2, -3, 4, 9, -7, 8, -6, 34];
1620     int8 A = cast(int8) _mm256_lddqu_si256(cast(__m256i*) &correct[1]);
1621     assert(A.array == correct[1..9]);
1622 }
1623 
1624 /// Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) 
1625 /// from memory. `mem_addr` must be aligned on a 32-byte boundary or a general-protection 
1626 /// exception may be generated.
1627 __m256d _mm256_load_pd (const(double)* mem_addr) pure @trusted
1628 {
1629     return *cast(__m256d*)mem_addr;
1630 }
1631 unittest
1632 {
1633     static immutable align(32) double[4] correct = [1.0, 2.0, 3.5, -42.0];
1634     __m256d A = _mm256_load_pd(correct.ptr);
1635     assert(A.array == correct);
1636 }
1637 
1638 /// Load 256-bits (composed of 8 packed single-precision (32-bit) 
1639 /// floating-point elements) from memory. 
1640 /// `mem_addr` must be aligned on a 32-byte boundary or a 
1641 /// general-protection exception may be generated.
1642 __m256 _mm256_load_ps (const(float)* mem_addr) pure @trusted
1643 {
1644     return *cast(__m256*)mem_addr;
1645 }
1646 unittest
1647 {
1648     static immutable align(32) float[8] correct = 
1649         [1.0, 2.0, 3.5, -42.0, 7.43f, 0.0f, 3, 2];
1650     __m256 A = _mm256_load_ps(correct.ptr);
1651     assert(A.array == correct);
1652 }
1653 
1654 /// Load 256-bits of integer data from memory. `mem_addr` does not need to be aligned on
1655 /// any particular boundary.
1656 // See this dlang forum post => https://forum.dlang.org/thread/vymrsngsfibkmqsqffce@forum.dlang.org
1657 __m256i _mm256_loadu_si256 (const(__m256i)* mem_addr) pure @trusted
1658 {
1659     // PERF DMD
1660     static if (GDC_with_AVX)
1661     {
1662         return cast(__m256i) __builtin_ia32_loaddqu256(cast(const(char)*) mem_addr);
1663     }
1664     else static if (LDC_with_optimizations)
1665     {
1666         return loadUnaligned!(__m256i)(cast(long*)mem_addr);
1667     }
1668     else
1669     {
1670         const(long)* p = cast(const(long)*)mem_addr; 
1671         long4 r;
1672         r.ptr[0] = p[0];
1673         r.ptr[1] = p[1];
1674         r.ptr[2] = p[2];
1675         r.ptr[3] = p[3];
1676         return r;
1677     }
1678 }
1679 unittest
1680 {
1681     align(16) int[8] correct = [-1, 2, -3, 4, 9, -7, 8, -6];
1682     int8 A = cast(int8) _mm256_loadu_si256(cast(__m256i*) correct.ptr);
1683     assert(A.array == correct);
1684 }
1685 
1686 /// Load 256-bits of integer data from memory. `mem_addr` must be aligned on a 
1687 /// 32-byte boundary or a general-protection exception may be generated.
1688 __m256i _mm256_load_si256 (const(void)* mem_addr) pure @system
1689 {
1690     return *cast(__m256i*)mem_addr;
1691 }
1692 unittest
1693 {
1694     static immutable align(64) long[4] correct = [1, -2, long.min, long.max];
1695     __m256i A = _mm256_load_si256(correct.ptr);
1696     assert(A.array == correct);
1697 }
1698 
1699 /// Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) 
1700 /// from memory. `mem_addr` does not need to be aligned on any particular boundary.
1701 __m256d _mm256_loadu_pd (const(void)* mem_addr) pure @system
1702 {
1703     // PERF DMD
1704     static if (GDC_with_AVX)
1705     {
1706         return __builtin_ia32_loadupd256 ( cast(const(double)*) mem_addr);
1707     }
1708     else static if (LDC_with_optimizations)
1709     {
1710         return loadUnaligned!(__m256d)(cast(double*)mem_addr);
1711     }    
1712     else
1713     {
1714         const(double)* p = cast(const(double)*)mem_addr; 
1715         double4 r;
1716         r.ptr[0] = p[0];
1717         r.ptr[1] = p[1];
1718         r.ptr[2] = p[2];
1719         r.ptr[3] = p[3];
1720         return r;
1721     }
1722 }
1723 unittest
1724 {
1725     double[4] correct = [1.0, -2.0, 0.0, 768.5];
1726     __m256d A = _mm256_loadu_pd(correct.ptr);
1727     assert(A.array == correct);
1728 }
1729 
1730 /// Load 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from memory.
1731 /// `mem_addr` does not need to be aligned on any particular boundary.
1732 __m256 _mm256_loadu_ps (const(float)* mem_addr) pure @system
1733 {
1734     // PERF DMD
1735     static if (GDC_with_AVX)
1736     {
1737         return __builtin_ia32_loadups256 ( cast(const(float)*) mem_addr);
1738     }
1739     else static if (LDC_with_optimizations)
1740     {
1741         return loadUnaligned!(__m256)(cast(float*)mem_addr);
1742     }    
1743     else
1744     {
1745         const(float)* p = cast(const(float)*)mem_addr; 
1746         float8 r = void;
1747         r.ptr[0] = p[0];
1748         r.ptr[1] = p[1];
1749         r.ptr[2] = p[2];
1750         r.ptr[3] = p[3];
1751         r.ptr[4] = p[4];
1752         r.ptr[5] = p[5];
1753         r.ptr[6] = p[6];
1754         r.ptr[7] = p[7];
1755         return r;
1756     }
1757 }
1758 unittest
1759 {
1760     align(32) float[10] correct = [0.0f, 1, 2, 3, 4, 5, 6, 7, 8, 9];
1761     __m256 A = _mm256_loadu_ps(&correct[1]);
1762     assert(A.array == correct[1..9]);
1763 }
1764 
1765 /// Load two 128-bit values (composed of 4 packed single-precision (32-bit) floating-point 
1766 /// elements) from memory, and combine them into a 256-bit value. 
1767 /// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
1768 __m256 _mm256_loadu2_m128 (const(float)* hiaddr, const(float)* loaddr) pure @system
1769 {
1770     // Note: no particular instruction for this in x86.
1771     return _mm256_set_m128(_mm_loadu_ps(hiaddr), _mm_loadu_ps(loaddr));
1772 }
1773 unittest
1774 {
1775     align(32) float[6] A = [4.5f, 2, 8, 97, -1, 3];
1776     align(32) float[6] B = [6.5f, 3, 9, 98, -2, 4];
1777     __m256 R = _mm256_loadu2_m128(&B[1], &A[1]);
1778     float[8] correct = [2.0f, 8, 97, -1, 3, 9, 98, -2];
1779     assert(R.array == correct);
1780 }
1781 
1782 /// Load two 128-bit values (composed of 2 packed double-precision (64-bit) floating-point
1783 /// elements) from memory, and combine them into a 256-bit value. 
1784 /// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
1785 __m256d _mm256_loadu2_m128d (const(double)* hiaddr, const(double)* loaddr) pure @system
1786 {
1787     // Note: no particular instruction for this in x86.
1788     return _mm256_set_m128d(_mm_loadu_pd(hiaddr), _mm_loadu_pd(loaddr));
1789 }
1790 unittest
1791 {
1792     align(32) double[4] A = [4.5f, 2, 8, 97];
1793     align(32) double[4] B = [6.5f, 3, 9, 98];
1794     __m256d R = _mm256_loadu2_m128d(&B[1], &A[1]);
1795     double[4] correct = [2.0, 8, 3, 9];
1796     assert(R.array == correct);
1797 }
1798 
1799 /// Load two 128-bit values (composed of integer data) from memory, and combine them into a 
1800 /// 256-bit value. `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
1801 __m256i _mm256_loadu2_m128i (const(__m128i)* hiaddr, const(__m128i)* loaddr) pure @trusted
1802 {
1803     // Note: no particular instruction for this in x86.
1804     return _mm256_set_m128i(_mm_loadu_si128(hiaddr), _mm_loadu_si128(loaddr));
1805 }
1806 unittest
1807 {
1808     align(32) long[4] A = [5, 2, 8, 97];
1809     align(32) long[4] B = [6, 3, 9, 98];
1810     __m256i R = _mm256_loadu2_m128i(cast(const(__m128i)*) &B[1], cast(const(__m128i)*)  &A[1]);
1811     long[4] correct = [2, 8, 3, 9];
1812     assert(R.array == correct);
1813 }
1814 
1815 version(DigitalMars)
1816 {
1817     // this avoids a bug with DMD < 2.099 when building for x86 with -O
1818     private enum bool maskLoadWorkaroundDMD = (__VERSION__ < 2099);
1819 }
1820 else
1821 {
1822     private enum bool maskLoadWorkaroundDMD = false;
1823 }
1824 
1825 /// Load packed double-precision (64-bit) floating-point elements from memory using `mask` 
1826 /// (elements are zeroed out when the high bit of the corresponding element is not set).
1827 /// Note: emulating this instruction isn't efficient, since the real instruction performs
1828 /// memory accesses only for mask-selected elements.
1829 /// See: "Note about mask load/store" to know why you must address valid memory only.
1830 __m128d _mm_maskload_pd (const(double)* mem_addr, __m128i mask) /* pure */ @system
1831 {
1832     // PERF DMD
1833     static if (LDC_with_AVX)
1834     {
1835         // MAYDO report that the builtin is impure
1836         return __builtin_ia32_maskloadpd(mem_addr, cast(long2)mask);
1837     }
1838     else static if (GDC_with_AVX)
1839     {
1840         return __builtin_ia32_maskloadpd(cast(double2*)mem_addr, cast(long2)mask);
1841     }
1842     else
1843     {
1844         __m128d a = _mm_loadu_pd(mem_addr);
1845         __m128d zero = _mm_setzero_pd();
1846         return _mm_blendv_pd(zero, a, cast(double2)mask);
1847     }
1848 }
1849 unittest
1850 {
1851     static if (!maskLoadWorkaroundDMD) 
1852     {
1853         double[2] A = [7.5, 1];
1854         double2 B = _mm_maskload_pd(A.ptr, _mm_setr_epi64(-1, 1));
1855         double[2] correct = [7.5, 0];
1856         assert(B.array == correct);
1857     }
1858 }
1859 
1860 /// Load packed double-precision (64-bit) floating-point elements from memory using `mask`
1861 /// (elements are zeroed out when the high bit of the corresponding element is not set).
1862 /// See: "Note about mask load/store" to know why you must address valid memory only.
1863 __m256d _mm256_maskload_pd (const(double)* mem_addr, __m256i mask) /*pure*/ @system
1864 {
1865     // PERF DMD
1866     static if (LDC_with_AVX)
1867     {
1868         // MAYDO that the builtin is impure
1869         return __builtin_ia32_maskloadpd256(mem_addr, mask);
1870     }
1871     else static if (GDC_with_AVX)
1872     {
1873         return __builtin_ia32_maskloadpd256(cast(double4*)mem_addr, mask);
1874     }
1875     else
1876     {
1877         __m256d a = _mm256_loadu_pd(mem_addr);
1878         __m256d zero = _mm256_setzero_pd();
1879         return _mm256_blendv_pd(zero, a, cast(double4)mask);
1880     }
1881 }
1882 unittest
1883 {
1884     static if (!maskLoadWorkaroundDMD)
1885     {
1886         double[4] A = [7.5, 1, 2, 3];
1887         double4 B = _mm256_maskload_pd(A.ptr, _mm256_setr_epi64(1, -1, -1, 1));
1888         double[4] correct = [0.0, 1, 2, 0];
1889         assert(B.array == correct);
1890     }
1891 }
1892 
1893 /// Load packed single-precision (32-bit) floating-point elements from memory using mask (elements
1894 /// are zeroed out when the high bit of the corresponding element is not set).
1895 /// Warning: See "Note about mask load/store" to know why you must address valid memory only.
1896 __m128 _mm_maskload_ps (const(float)* mem_addr, __m128i mask) /* pure */ @system
1897 {
1898     // PERF DMD
1899     static if (LDC_with_AVX)
1900     {
1901         // MAYDO report that the builtin is impure
1902         return __builtin_ia32_maskloadps(mem_addr, mask);
1903     }
1904     else static if (GDC_with_AVX)
1905     {
1906         return __builtin_ia32_maskloadps(cast(float4*)mem_addr, mask);
1907     }
1908     else
1909     {
1910         __m128 a = _mm_loadu_ps(mem_addr);
1911         __m128 zero = _mm_setzero_ps();
1912         return _mm_blendv_ps(zero, a, cast(float4)mask);
1913     }
1914 }
1915 unittest
1916 {
1917     static if (!maskLoadWorkaroundDMD)
1918     {
1919         float[4] A = [7.5f, 1, 2, 3];
1920         float4 B = _mm_maskload_ps(A.ptr, _mm_setr_epi32(1, -1, -1, 1));
1921         float[4] correct = [0.0f, 1, 2, 0];
1922         assert(B.array == correct);
1923     }
1924 }
1925 
1926 /// Load packed single-precision (32-bit) floating-point elements from memory using `mask`
1927 /// (elements are zeroed out when the high bit of the corresponding element is not set).
1928 /// Note: emulating this instruction isn't efficient, since the real instruction performs
1929 /// memory accesses only for mask-selected elements.
1930 /// See: "Note about mask load/store" to know why you must address valid memory only.
1931 __m256 _mm256_maskload_ps (const(float)* mem_addr, __m256i mask) /*pure*/ @system
1932 {
1933     // PERF DMD
1934     static if (LDC_with_AVX)
1935     {
1936         // MAYDO that the builtin is impure
1937         return __builtin_ia32_maskloadps256(mem_addr, cast(int8)mask);
1938     }
1939     else static if (GDC_with_AVX)
1940     {
1941         return __builtin_ia32_maskloadps256(cast(float8*)mem_addr, cast(int8)mask);
1942     }
1943     else
1944     {
1945         __m256 a = _mm256_loadu_ps(mem_addr);
1946         __m256 zero = _mm256_setzero_ps();
1947         return _mm256_blendv_ps(zero, a, cast(float8)mask);
1948     }
1949 }
1950 unittest
1951 {
1952     float[8] A                  = [1,   7.5f,  1,  2, 3,  4,  5, 6];
1953     __m256i  M = _mm256_setr_epi32(1,     -1,  1, -1, 1, -1, -1, 1);
1954     float8 B = _mm256_maskload_ps(A.ptr, M);
1955     float[8] correct =            [0.0f, 7.5f, 0,  2, 0,  4,  5, 0];
1956     assert(B.array == correct);
1957 }
1958 
1959 /// Store packed double-precision (64-bit) floating-point elements from `a` into memory using `mask`.
1960 /// Note: emulating this instruction isn't efficient, since the real instruction performs
1961 /// memory accesses only for mask-selected elements.
1962 /// See: "Note about mask load/store" to know why you must address valid memory only.
1963 void _mm_maskstore_pd (double * mem_addr, __m128i mask, __m128d a) /* pure */ @system
1964 {
1965     // PERF DMD
1966     static if (LDC_with_AVX)
1967     {
1968         // MAYDO that the builtin is impure
1969         __builtin_ia32_maskstorepd(mem_addr, cast(long2)mask, a);
1970     }
1971     else static if (GDC_with_AVX)
1972     {
1973         __builtin_ia32_maskstorepd(cast(double2*)mem_addr, cast(long2)mask, a);
1974     }
1975     else
1976     {
1977         __m128d source = _mm_loadu_pd(mem_addr);
1978         __m128d r = _mm_blendv_pd(source, a, cast(double2) mask);
1979         _mm_storeu_pd(mem_addr, r);
1980     }
1981 }
1982 unittest
1983 {
1984     double[2] A = [0.0, 1.0];
1985     __m128i M = _mm_setr_epi64(-1, 0);
1986     __m128d B = _mm_setr_pd(2.0, 3.0);
1987     _mm_maskstore_pd(A.ptr, M, B);
1988     double[2] correct = [2.0, 1.0];
1989     assert(A == correct);
1990 }
1991 
1992 
1993 /// Store packed double-precision (64-bit) floating-point elements from `a` into memory using `mask`.
1994 /// See: "Note about mask load/store" to know why you must address valid memory only.
1995 static if (!llvm256BitStackWorkaroundIn32BitX86)
1996 {
1997     void _mm256_maskstore_pd (double * mem_addr, __m256i mask, __m256d a) /* pure */ @system
1998     {
1999         // PERF DMD
2000         static if (LDC_with_AVX)
2001         {
2002             // MAYDO that the builtin is impure
2003             __builtin_ia32_maskstorepd256(mem_addr, cast(long4)mask, a);
2004         }
2005         else static if (GDC_with_AVX)
2006         {
2007             __builtin_ia32_maskstorepd256(cast(double4*)mem_addr, cast(long4)mask, a);
2008         }
2009         else
2010         {
2011             __m256d source = _mm256_loadu_pd(mem_addr);
2012             __m256d r = _mm256_blendv_pd(source, a, cast(double4) mask);
2013             _mm256_storeu_pd(mem_addr, r);
2014         }
2015     }
2016     unittest
2017     {
2018         double[4] A = [0.0, 1, 2, 3];
2019         __m256i M = _mm256_setr_epi64x(-9, 0, -1, 0);
2020         __m256d B = _mm256_setr_pd(2, 3, 4, 5);
2021         _mm256_maskstore_pd(A.ptr, M, B);
2022         double[4] correct = [2.0, 1, 4, 3];
2023         assert(A == correct);
2024     }
2025 }
2026 
2027 /// Store packed single-precision (32-bit) floating-point elements from `a` into memory using `mask`.
2028 /// Note: emulating this instruction isn't efficient, since the real instruction performs
2029 /// memory accesses only for mask-selected elements.
2030 /// See: "Note about mask load/store" to know why you must address valid memory only.
2031 void _mm_maskstore_ps (float * mem_addr, __m128i mask, __m128 a)  /* pure */ @system
2032 {
2033     // PERF DMD
2034     static if (LDC_with_AVX)
2035     {
2036         // MAYDO report that the builtin is impure
2037         __builtin_ia32_maskstoreps(mem_addr, mask, a);
2038     }
2039     else static if (GDC_with_AVX)
2040     {
2041         __builtin_ia32_maskstoreps(cast(float4*)mem_addr, mask, a);
2042     }
2043     else
2044     {
2045         __m128 source = _mm_loadu_ps(mem_addr);
2046         __m128 r = _mm_blendv_ps(source, a, cast(float4) mask);
2047         _mm_storeu_ps(mem_addr, r);
2048     }
2049 }
2050 unittest
2051 {
2052     float[4] A = [0.0f, 1, 2, 6];
2053     __m128i M = _mm_setr_epi32(-1, 0, -1, 0);
2054     __m128 B = _mm_setr_ps(2, 3, 4, 5);
2055     _mm_maskstore_ps(A.ptr, M, B);
2056     float[4] correct = [2.0f, 1, 4, 6];
2057     assert(A == correct);
2058 }
2059 
2060 static if (!llvm256BitStackWorkaroundIn32BitX86)
2061 {
2062     /// Store packed single-precision (32-bit) floating-point elements from `a` into memory using `mask`.
2063     /// See: "Note about mask load/store" to know why you must address valid memory only.
2064     void _mm256_maskstore_ps (float * mem_addr, __m256i mask, __m256 a) /* pure */ @system
2065     {
2066         // PERF DMD
2067         static if (LDC_with_AVX)
2068         {
2069             // MAYDO report that the builtin is impure
2070             __builtin_ia32_maskstoreps256(mem_addr, cast(int8)mask, a);
2071         }
2072         else static if (GDC_with_AVX)
2073         {
2074             __builtin_ia32_maskstoreps256(cast(float8*)mem_addr, cast(int8)mask, a);
2075         }
2076         else
2077         {
2078             __m256 source = _mm256_loadu_ps(mem_addr);
2079             __m256 r = _mm256_blendv_ps(source, a, cast(float8) mask);
2080             _mm256_storeu_ps(mem_addr, r);
2081         }
2082     }
2083     unittest
2084     {
2085         float[8] A                 = [0.0f, 0, 1,  2, 3,  4,  5, 7];
2086         __m256i M = _mm256_setr_epi32(  0, -1, 0, -1, 0, -1, -1, 0);
2087         __m256 B = _mm256_set1_ps(6.0f);
2088         _mm256_maskstore_ps(A.ptr, M, B);
2089         float[8] correct           = [0.0f, 6, 1,  6, 3,  6,  6, 7];
2090         assert(A == correct);
2091     }
2092 }
2093 
2094 /// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return 
2095 /// packed maximum values.
2096 __m256d _mm256_max_pd (__m256d a, __m256d b) pure @trusted
2097 {    
2098     // PERF DMD
2099     static if (GDC_or_LDC_with_AVX)
2100     {
2101         return __builtin_ia32_maxpd256(a, b);
2102     }
2103     else
2104     {
2105         // LDC: becomes good in -O2
2106         // PERF: GDC without AVX
2107         a.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0];
2108         a.ptr[1] = (a.array[1] > b.array[1]) ? a.array[1] : b.array[1];
2109         a.ptr[2] = (a.array[2] > b.array[2]) ? a.array[2] : b.array[2];
2110         a.ptr[3] = (a.array[3] > b.array[3]) ? a.array[3] : b.array[3];
2111         return a;
2112     }
2113 }
2114 unittest
2115 {
2116     __m256d A = _mm256_setr_pd(4.0, 1.0, -9.0, double.infinity);
2117     __m256d B = _mm256_setr_pd(1.0, 8.0,  0.0, 100000.0);
2118     __m256d M = _mm256_max_pd(A, B);
2119     double[4] correct =       [4.0, 8.0, 0.0, double.infinity];
    assert(M.array == correct);
2120 }
2121 
2122 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b`, and return 
2123 /// packed maximum values.
2124 __m256 _mm256_max_ps (__m256 a, __m256 b) pure @trusted
2125 {
2126     // PERF DMD
2127     static if (GDC_or_LDC_with_AVX)
2128     {
2129         return __builtin_ia32_maxps256(a, b);
2130     }
2131     else
2132     {
2133         // LDC: becomes good in -O2, but looks brittle.
2134         // PERF GDC without AVX
2135         a.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0];
2136         a.ptr[1] = (a.array[1] > b.array[1]) ? a.array[1] : b.array[1];
2137         a.ptr[2] = (a.array[2] > b.array[2]) ? a.array[2] : b.array[2];
2138         a.ptr[3] = (a.array[3] > b.array[3]) ? a.array[3] : b.array[3];
2139         a.ptr[4] = (a.array[4] > b.array[4]) ? a.array[4] : b.array[4];
2140         a.ptr[5] = (a.array[5] > b.array[5]) ? a.array[5] : b.array[5];
2141         a.ptr[6] = (a.array[6] > b.array[6]) ? a.array[6] : b.array[6];
2142         a.ptr[7] = (a.array[7] > b.array[7]) ? a.array[7] : b.array[7];
2143         return a;
2144     }
2145 }
2146 unittest
2147 {
2148     __m256 A = _mm256_setr_ps(4.0, 1.0, -9.0, float.infinity, 1, 2, 3, 4);
2149     __m256 B = _mm256_setr_ps(1.0, 8.0,  0.0, 100000.0f     , 4, 3, 2, 1);
2150     __m256 M = _mm256_max_ps(A, B);
2151     float[8] correct =       [4.0, 8.0,  0.0, float.infinity , 4, 3, 3, 4];
    assert(M.array == correct);
2152 }
2153 
2154 /// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return 
2155 /// packed minimum values.
2156 __m256d _mm256_min_pd (__m256d a, __m256d b) pure @trusted
2157 {
2158     // PERF DMD
2159     static if (GDC_or_LDC_with_AVX)
2160     {
2161         return __builtin_ia32_minpd256(a, b);
2162     }
2163     else
2164     {
2165         // LDC: becomes good in -O2
2166         // PERF: GDC without AVX
2167         a.ptr[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
2168         a.ptr[1] = (a.array[1] < b.array[1]) ? a.array[1] : b.array[1];
2169         a.ptr[2] = (a.array[2] < b.array[2]) ? a.array[2] : b.array[2];
2170         a.ptr[3] = (a.array[3] < b.array[3]) ? a.array[3] : b.array[3];
2171         return a;
2172     }
2173 }
2174 unittest
2175 {
2176     __m256d A = _mm256_setr_pd(4.0, 1.0, -9.0, double.infinity);
2177     __m256d B = _mm256_setr_pd(1.0, 8.0,  0.0, 100000.0);
2178     __m256d M = _mm256_min_pd(A, B);
2179     double[4] correct =       [1.0, 1.0, -9.0, 100000.0];
    assert(M.array == correct);
2180 }
2181 
2182 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b`, and return 
2183 /// packed minimum values.
2184 __m256 _mm256_min_ps (__m256 a, __m256 b) pure @trusted
2185 {
2186     // PERF DMD
2187     static if (GDC_or_LDC_with_AVX)
2188     {
2189         return __builtin_ia32_minps256(a, b);
2190     }
2191     else
2192     {
2193         // LDC: becomes good in -O2, but looks brittle.
2194         // PERF GDC without AVX
2195         a.ptr[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
2196         a.ptr[1] = (a.array[1] < b.array[1]) ? a.array[1] : b.array[1];
2197         a.ptr[2] = (a.array[2] < b.array[2]) ? a.array[2] : b.array[2];
2198         a.ptr[3] = (a.array[3] < b.array[3]) ? a.array[3] : b.array[3];
2199         a.ptr[4] = (a.array[4] < b.array[4]) ? a.array[4] : b.array[4];
2200         a.ptr[5] = (a.array[5] < b.array[5]) ? a.array[5] : b.array[5];
2201         a.ptr[6] = (a.array[6] < b.array[6]) ? a.array[6] : b.array[6];
2202         a.ptr[7] = (a.array[7] < b.array[7]) ? a.array[7] : b.array[7];
2203         return a;
2204     }
2205 }
2206 unittest
2207 {
2208     __m256 A = _mm256_setr_ps(4.0, 1.0, -9.0, float.infinity, 1, 2, 3, 4);
2209     __m256 B = _mm256_setr_ps(1.0, 8.0,  0.0, 100000.0f     , 4, 3, 2, 1);
2210     __m256 M = _mm256_min_ps(A, B);
2211     float[8] correct =       [1.0, 1.0, -9.0, 100000.0f     , 1, 2, 2, 1];
    assert(M.array == correct);
2212 }
2213 
2214 /// Duplicate even-indexed double-precision (64-bit) floating-point elements from `a`.
2215 __m256d _mm256_movedup_pd (__m256d a) @trusted
2216 {
2217     // PERF DMD
2218     static if (GDC_with_AVX)
2219     {
2220         return __builtin_ia32_movddup256 (a);
2221     }
2222     else
2223     {
2224         a.ptr[1] = a.array[0];
2225         a.ptr[3] = a.array[2];
2226         return a;
2227     }
2228 }
2229 unittest
2230 {
2231     __m256d A = _mm256_setr_pd(1.0, 2, 3, 4);
2232     A = _mm256_movedup_pd(A);
2233     double[4] correct = [1.0, 1, 3, 3];
2234     assert(A.array == correct);
2235 }
2236 
2237 /// Duplicate odd-indexed single-precision (32-bit) floating-point elements from `a`.
2238 __m256 _mm256_movehdup_ps (__m256 a) @trusted
2239 {
2240     // PERF DMD
2241     static if (GDC_with_AVX)
2242     {
2243         return __builtin_ia32_movshdup256 (a);
2244     }
2245     else
2246     {
2247         a.ptr[0] = a.array[1];
2248         a.ptr[2] = a.array[3];
2249         a.ptr[4] = a.array[5];
2250         a.ptr[6] = a.array[7];
2251         return a;
2252     }
2253 }
2254 unittest
2255 {
2256     __m256 A = _mm256_setr_ps(1.0f, 2, 3, 4, 5, 6, 7, 8);
2257     A = _mm256_movehdup_ps(A);
2258     float[8] correct = [2.0, 2, 4, 4, 6, 6, 8, 8];
2259     assert(A.array == correct);
2260 }
2261 
2262 /// Duplicate even-indexed single-precision (32-bit) floating-point elements from `a`.
2263 __m256 _mm256_moveldup_ps (__m256 a) @trusted
2264 {
2265     // PERF DMD
2266     static if (GDC_with_AVX)
2267     {
2268         return __builtin_ia32_movsldup256 (a);
2269     }
2270     else
2271     {
2272         a.ptr[1] = a.array[0];
2273         a.ptr[3] = a.array[2];
2274         a.ptr[5] = a.array[4];
2275         a.ptr[7] = a.array[6];
2276         return a;
2277     }
2278 }
2279 unittest
2280 {
2281     __m256 A = _mm256_setr_ps(1.0f, 2, 3, 4, 5, 6, 7, 8);
2282     A = _mm256_moveldup_ps(A);
2283     float[8] correct = [1.0, 1, 3, 3, 5, 5, 7, 7];
2284     assert(A.array == correct);
2285 }
2286 
2287 /// Set each bit of result mask based on the most significant bit of the corresponding packed 
2288 /// double-precision (64-bit) floating-point element in `a`.
2289 int _mm256_movemask_pd (__m256d a) @safe
2290 {
2291     // PERF DMD
2292     static if (GDC_or_LDC_with_AVX)
2293     {
2294         return __builtin_ia32_movmskpd256(a);
2295     }
2296     else static if (LDC_with_SSE2)
2297     {
2298         // this doesn't benefit GDC, and not clear for arm64.
2299         __m128d A_lo = _mm256_extractf128_pd!0(a);
2300         __m128d A_hi = _mm256_extractf128_pd!1(a);
2301 
2302         return (_mm_movemask_pd(A_hi) << 2) | _mm_movemask_pd(A_lo);
2303     }
2304     else
2305     {
2306         // Fortunately, branchless on arm64
2307         long4 lv = cast(long4)a;
2308         int r = 0;
2309         if (lv.array[0] < 0) r += 1;
2310         if (lv.array[1] < 0) r += 2;
2311         if (lv.array[2] < 0) r += 4;
2312         if (lv.array[3] < 0) r += 8;
2313         return r;
2314     }
2315 }
2316 unittest
2317 {
2318     __m256d A = _mm256_setr_pd(-1, -double.infinity, 0, -1);
2319     assert(_mm256_movemask_pd(A) == 1 + 2 + 8);
2320 }
2321 
2322 /// Set each bit of the result mask based on the most significant bit of the corresponding packed 
2323 /// single-precision (32-bit) floating-point element in `a`.
2324 int _mm256_movemask_ps (__m256 a) @system
2325 {
2326     // PERF DMD
2327     // PERF GDC without AVX
2328     static if (GDC_or_LDC_with_AVX)
2329     {
2330         return __builtin_ia32_movmskps256(a);
2331     }
2332     else version(LDC)
2333     {
2334         // this doesn't benefit GDC (unable to inline), but benefits both LDC with SSE2 and ARM64
2335         __m128 A_lo = _mm256_extractf128_ps!0(a);
2336         __m128 A_hi = _mm256_extractf128_ps!1(a);
2337         return (_mm_movemask_ps(A_hi) << 4) | _mm_movemask_ps(A_lo);
2338     }
2339     else
2340     {
2341         int8 lv = cast(int8)a;
2342         int r = 0;
2343         if (lv.array[0] < 0) r += 1;
2344         if (lv.array[1] < 0) r += 2;
2345         if (lv.array[2] < 0) r += 4;
2346         if (lv.array[3] < 0) r += 8;
2347         if (lv.array[4] < 0) r += 16;
2348         if (lv.array[5] < 0) r += 32;
2349         if (lv.array[6] < 0) r += 64;
2350         if (lv.array[7] < 0) r += 128;
2351         return r;
2352     }
2353 }
2354 unittest
2355 {
2356     __m256 A = _mm256_setr_ps(-1, -double.infinity, 0, -1, 1, double.infinity, -2, double.nan);
2357     assert(_mm256_movemask_ps(A) == 1 + 2 + 8 + 64);
2358 }
2359 
2360 /// Multiply packed double-precision (64-bit) floating-point elements in `a` and `b`.
2361 __m256d _mm256_mul_pd (__m256d a, __m256d b) pure @safe
2362 {
2363     return a * b;
2364 }
2365 unittest
2366 {
2367     __m256d a = [-2.0, 1.5, -2.0, 1.5];
2368     a = _mm256_mul_pd(a, a);
2369     assert(a.array == [4.0, 2.25, 4.0, 2.25]);
2370 }
2371 
2372 /// Multiply packed single-precision (32-bit) floating-point elements in `a` and `b`.
2373 __m256 _mm256_mul_ps (__m256 a, __m256 b) pure @safe
2374 {
2375     return a * b;
2376 }
2377 unittest
2378 {
2379     __m256 a = [1.5f, -2.0f, 3.0f, 1.0f, 1.5f, -2.0f, 3.0f, 1.0f];
2380     a = _mm256_mul_ps(a, a);
2381     float[8] correct = [2.25f, 4.0f, 9.0f, 1.0f, 2.25f, 4.0f, 9.0f, 1.0f];
2382     assert(a.array == correct);
2383 }
2384 
2385 
2386 /// Compute the bitwise NOT of 256 bits in `a`. #BONUS
2387 __m256i _mm256_not_si256 (__m256i a) pure @safe
2388 {
2389     return ~a;
2390 }
2391 unittest
2392 {
2393     __m256i A = _mm256_set1_epi64x(-748);
2394     long4 notA = cast(long4) _mm256_not_si256(A);
2395     int[4] correct = [747, 747, 747, 747];
2396     assert(notA.array == correct);
2397 }
2398 
2399 /// Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in `a` and `b`.
2400 __m256d _mm256_or_pd (__m256d a, __m256d b) pure @safe
2401 {
2402     return cast(__m256d)( cast(__m256i)a | cast(__m256i)b );
2403 }
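
// Editor's sketch (not an upstream test): OR-ing in the sign bit of -0.0 negates each
// (positive) element, which illustrates the bitwise nature of this intrinsic.
unittest
{
    __m256d A = _mm256_setr_pd(1.0, 2.0, 4.0, 8.0);
    __m256d S = _mm256_setr_pd(-0.0, -0.0, -0.0, -0.0); // only the sign bit set
    __m256d R = _mm256_or_pd(A, S);
    double[4] correct = [-1.0, -2.0, -4.0, -8.0];
    assert(R.array == correct);
}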
2404 
2405 /// Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in `a` and `b`.
2406 __m256 _mm256_or_ps (__m256 a, __m256 b) pure @safe
2407 {
2408     return cast(__m256)( cast(__m256i)a | cast(__m256i)b );
2409 }
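
// Editor's sketch (not an upstream test): same sign-bit trick as the `pd` example above.
unittest
{
    __m256 A = _mm256_setr_ps(1.0f, 2, 3, 4, 5, 6, 7, 8);
    __m256 R = _mm256_or_ps(A, _mm256_set1_ps(-0.0f));
    float[8] correct = [-1.0f, -2, -3, -4, -5, -6, -7, -8];
    assert(R.array == correct);
}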
2410 
2411 /// Shuffle double-precision (64-bit) floating-point elements in `a` using the control in `imm8`.
2412 __m128d _mm_permute_pd(int imm8)(__m128d a) pure @trusted
2413 {
2414     static if (GDC_with_AVX)
2415     {
2416         return __builtin_ia32_vpermilpd(a, imm8 & 3);
2417     }
2418     else
2419     {
2420         // Shufflevector not particularly better for LDC here
2421         __m128d r;
2422         r.ptr[0] = a.array[imm8 & 1];
2423         r.ptr[1] = a.array[(imm8 >> 1) & 1];
2424         return r;
2425     }
2426 }
2427 unittest
2428 {
2429     __m128d A = _mm_setr_pd(5, 6);
2430     __m128d B = _mm_permute_pd!1(A);
2431     __m128d C = _mm_permute_pd!3(A);
2432     double[2] RB = [6, 5];
2433     double[2] RC = [6, 6];
2434     assert(B.array == RB);
2435     assert(C.array == RC);
2436 }
2437 
2438 ///ditto
2439 __m256d _mm256_permute_pd(int imm8)(__m256d a) pure @trusted
2440 {
2441     // PERF DMD
2442     static if (GDC_with_AVX)
2443     {
2444         return __builtin_ia32_vpermilpd256(a, imm8 & 15);
2445     }
2446     else version(LDC)
2447     {
2448         return shufflevectorLDC!(double4,        
2449                                        (imm8 >> 0) & 1,
2450                                      ( (imm8 >> 1) & 1),
2451                                  2 + ( (imm8 >> 2) & 1),
2452                                  2 + ( (imm8 >> 3) & 1) )(a, a);
2453     }
2454     else
2455     {
2456         __m256d r;
2457         r.ptr[0] = a.array[ imm8       & 1];
2458         r.ptr[1] = a.array[(imm8 >> 1) & 1];
2459         r.ptr[2] = a.array[2 + ((imm8 >> 2) & 1)];
2460         r.ptr[3] = a.array[2 + ((imm8 >> 3) & 1)];
2461         return r;
2462     }
2463 }
2464 unittest
2465 {
2466     __m256d A = _mm256_setr_pd(0.0, 1, 2, 3);
2467     __m256d R = _mm256_permute_pd!(1 + 4)(A);
2468     double[4] correct = [1.0, 0, 3, 2];
2469     assert(R.array == correct);
2470 }
2471 
2472 /// Shuffle single-precision (32-bit) floating-point elements in `a` using the control in `imm8`.
2473 __m128 _mm_permute_ps(int imm8)(__m128 a) pure @trusted
2474 {
2475     // PERF DMD
2476     static if (GDC_with_AVX)
2477     {
2478         return __builtin_ia32_vpermilps(a, cast(ubyte)imm8);
2479     }
2480     else version(LDC)
2481     {
2482         return shufflevectorLDC!(float4, (imm8 >> 0) & 3, (imm8 >> 2) & 3, (imm8 >> 4) & 3, 
2483             (imm8 >> 6) & 3)(a, a);
2484     }
2485     else
2486     {
2487         // PERF: could use _mm_shuffle_ps which is a super set
2488         // when AVX isn't available
2489         __m128 r;
2490         r.ptr[0] = a.array[(imm8 >> 0) & 3];
2491         r.ptr[1] = a.array[(imm8 >> 2) & 3];
2492         r.ptr[2] = a.array[(imm8 >> 4) & 3];
2493         r.ptr[3] = a.array[(imm8 >> 6) & 3];
2494         return r;
2495     }
2496 }
2497 unittest
2498 {
2499     __m128 A = _mm_setr_ps(0.0f, 1, 2, 3);
2500     __m128 R = _mm_permute_ps!(1 + 4 * 3 + 16 * 0 + 64 * 2)(A);
2501     float[4] correct = [1.0f, 3, 0, 2];
2502     assert(R.array == correct);
2503 }
2504 
2505 /// Shuffle single-precision (32-bit) floating-point elements in `a` within 128-bit lanes using 
2506 /// the control in `imm8`. The same shuffle is applied to the lower and upper 128-bit lanes.
2507 __m256 _mm256_permute_ps(int imm8)(__m256 a) pure @trusted
2508 {
2509     // PERF DMD
2510     static if (GDC_with_AVX)
2511     {
2512         return __builtin_ia32_vpermilps256(a, cast(ubyte)imm8);
2513     }
2514     else version(LDC)
2515     {
2516         return shufflevectorLDC!(float8, 
2517             (imm8 >> 0) & 3, (imm8 >> 2) & 3, (imm8 >> 4) & 3, (imm8 >> 6) & 3,
2518             4 + ((imm8 >> 0) & 3), 4 + ((imm8 >> 2) & 3), 4 + ((imm8 >> 4) & 3), 
2519             4 + ((imm8 >> 6) & 3))(a, a);
2520     }
2521     else
2522     {
2523         __m256 r;
2524         r.ptr[0] = a.array[(imm8 >> 0) & 3];
2525         r.ptr[1] = a.array[(imm8 >> 2) & 3];
2526         r.ptr[2] = a.array[(imm8 >> 4) & 3];
2527         r.ptr[3] = a.array[(imm8 >> 6) & 3];
2528         r.ptr[4] = a.array[4 + ((imm8 >> 0) & 3)];
2529         r.ptr[5] = a.array[4 + ((imm8 >> 2) & 3)];
2530         r.ptr[6] = a.array[4 + ((imm8 >> 4) & 3)];
2531         r.ptr[7] = a.array[4 + ((imm8 >> 6) & 3)];
2532         return r;
2533     }
2534 }
2535 unittest
2536 {
2537     __m256 A = _mm256_setr_ps(0.0f, 1, 2, 3, 4, 5, 6, 7);
2538     __m256 R = _mm256_permute_ps!(1 + 4 * 3 + 16 * 0 + 64 * 2)(A);
2539     float[8] correct = [1.0f, 3, 0, 2, 5, 7, 4, 6];
2540     assert(R.array == correct);
2541 } 
2542 
2543 /// Shuffle 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) 
2544 /// selected by `imm8` from `a` and `b`.
2545 __m256d _mm256_permute2f128_pd(int imm8)(__m256d a, __m256d b) pure @safe
2546 {
2547     return cast(__m256d) _mm256_permute2f128_si256!imm8(cast(__m256i)a, cast(__m256i)b);
2548 }
2549 ///ditto
2550 __m256 _mm256_permute2f128_ps(int imm8)(__m256 a, __m256 b) pure @safe
2551 {
2552     return cast(__m256) _mm256_permute2f128_si256!imm8(cast(__m256i)a, cast(__m256i)b);
2553 }
2554 ///ditto
2555 __m256i _mm256_permute2f128_si256(int imm8)(__m256i a, __m256i b) pure @trusted
2556 {
2557     static if (GDC_with_AVX)
2558     {
2559         return cast(__m256i) __builtin_ia32_vperm2f128_si256(cast(int8)a, cast(int8)b, cast(ubyte)imm8);
2560     }
2561     else 
2562     {
2563         static __m128i SELECT4(int imm4)(__m256i a, __m256i b) pure @trusted
2564         {
2565             static assert(imm4 >= 0 && imm4 <= 15);
2566             static if (imm4 & 8)
2567             {
2568                 return _mm_setzero_si128();
2569             }
2570             else static if ((imm4 & 2) == 0)
2571             {
2572                 long2 r;
2573                 enum int index = 2*(imm4 & 1);
2574                 r.ptr[0] = a.array[index+0];
2575                 r.ptr[1] = a.array[index+1];
2576                 return cast(__m128i)r;
2577             }
2578             else
2579             {
2580                 static assert( (imm4 & 2) != 0);
2581                 long2 r;
2582                 enum int index = 2*(imm4 & 1);
2583                 r.ptr[0] = b.array[index+0];
2584                 r.ptr[1] = b.array[index+1];
2585                 return cast(__m128i)r;
2586             }
2587         }
2588 
2590         __m128i lo = SELECT4!(imm8 & 15)(a, b);
2591         __m128i hi = SELECT4!((imm8 >> 4) & 15)(a, b);
2592         return _mm256_set_m128i(hi, lo);
2593     }
2594 }
2595 unittest
2596 {
2597     __m256d A = _mm256_setr_pd(8.0, 1, 2, 3);
2598     __m256d B = _mm256_setr_pd(4.0, 5, 6, 7);
2599     __m256d R = _mm256_permute2f128_pd!(128 + 2)(A, B);
2600     double[4] correct = [4.0, 5.0, 0.0, 0.0];
2601     assert(R.array == correct);
2602 
2603     __m256d R2 = _mm256_permute2f128_pd!(3*16 + 1)(A, B);
2604     double[4] correct2 = [2.0, 3.0, 6.0, 7.0];
2605     assert(R2.array == correct2);
2606 }
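
// Editor's sketch (not an upstream test): the integer variant uses the same encoding;
// each nibble of `imm8` selects a 128-bit lane (or zero) for the low/high half of the result.
unittest
{
    __m256i A = _mm256_setr_epi32(0, 1, 2,  3,  4,  5,  6,  7);
    __m256i B = _mm256_setr_epi32(8, 9, 10, 11, 12, 13, 14, 15);
    // 0x21: low half <- upper lane of A, high half <- lower lane of B
    int8 R = cast(int8) _mm256_permute2f128_si256!0x21(A, B);
    int[8] correct = [4, 5, 6, 7, 8, 9, 10, 11];
    assert(R.array == correct);
}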
2607 
2608 /// Shuffle double-precision (64-bit) floating-point elements in `a` using the control in `b`.
2609 /// Warning: the selector is in bit 1, not bit 0, of each 64-bit element!
2610 ///          This is really not intuitive.
2611 __m128d _mm_permutevar_pd(__m128d a, __m128i b) pure @trusted
2612 {
2613     enum bool implementWithByteShuffle = GDC_with_SSSE3 || LDC_with_SSSE3 || LDC_with_ARM64;
2614 
2615     static if (GDC_or_LDC_with_AVX)
2616     {
2617         return cast(__m128d) __builtin_ia32_vpermilvarpd(a, cast(long2)b);
2618     }
2619     else static if (implementWithByteShuffle)
2620     {
2621         align(16) static immutable byte[16] mmAddBase_u8 = [0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7];
2622         align(16) static immutable byte[16] mmBroadcast_u8 = [0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8];
2623         int4 bi = cast(int4)b;
2624         long2 two;
2625         two = 2;
2626         bi = _mm_slli_epi64(cast(__m128i)( (cast(long2)bi) & two), 2);
2627         bi = _mm_shuffle_epi8(bi, *cast(__m128i*)mmBroadcast_u8.ptr);
2628         // bi is now [ind0 ind0 ind0 ind0 ind0 ind0 ind0 ind0 ind1 ind1 ind1 ind1 ind1 ind1 ind1 ind1 ]
2629         byte16 bytesIndices = cast(byte16)bi;
2630         bytesIndices = bytesIndices + *cast(byte16*)mmAddBase_u8.ptr;
2631 
2632         // which allows us to make a single _mm_shuffle_epi8
2633         return cast(__m128d) _mm_shuffle_epi8(cast(__m128i)a, cast(__m128i)bytesIndices);
2634     }
2635     else
2636     {
2637         // This isn't great in ARM64, TBL or TBX instructions can't do that.
2638         // that could fit the bill, if it had 64-bit operands. But it only has 8-bit operands.
2639         // SVE2 could do it with svtbx[_f64] probably.
2640         long2 bl = cast(long2)b;
2641         __m128d r;
2642         r.ptr[0] = a.array[ (bl.array[0] & 2) >> 1];
2643         r.ptr[1] = a.array[ (bl.array[1] & 2) >> 1];
2644         return r;
2645     }
2646 }
2647 unittest
2648 {
2649     __m128d A = _mm_setr_pd(5, 6);
2650     __m128d B = _mm_permutevar_pd(A, _mm_setr_epi64(2, 1));
2651     __m128d C = _mm_permutevar_pd(A, _mm_setr_epi64(1 + 2 + 4, 2));    
2652     // yup, this is super strange, it's actually taking bit 1 and not bit 0 of each 64-bit element
2653     double[2] RB = [6, 5];
2654     double[2] RC = [6, 6];
2655     assert(B.array == RB);
2656     assert(C.array == RC);
2657 }
2658 
2659 ///ditto
2660 __m256d _mm256_permutevar_pd (__m256d a, __m256i b) pure @trusted
2661 {
2662     // Worth it: for GDC, in SSSE3+
2663     //           for LDC, all the time
2664     version(LDC)
2665         enum bool implementWithByteShuffle = true;
2666     else
2667         enum bool implementWithByteShuffle = GDC_with_SSSE3;
2668 
2669     // PERF DMD
2670     static if (GDC_or_LDC_with_AVX)
2671     {
2672         return cast(__m256d) __builtin_ia32_vpermilvarpd256(a, cast(long4)b);
2673     }
2674     else static if (implementWithByteShuffle)
2675     {
2676         // because we don't have 256-bit vectors, split and use _mm_permutevar_pd
2677         __m128d a_lo = _mm256_extractf128_pd!0(a);
2678         __m128d a_hi = _mm256_extractf128_pd!1(a);
2679         __m128i b_lo = _mm256_extractf128_si256!0(b);
2680         __m128i b_hi = _mm256_extractf128_si256!1(b);
2681         __m128d r_lo = _mm_permutevar_pd(a_lo, b_lo);
2682         __m128d r_hi = _mm_permutevar_pd(a_hi, b_hi);
2683         return _mm256_set_m128d(r_hi, r_lo);
2684     }
2685     else
2686     {
2687         long4 bl = cast(long4)b;
2688         __m256d r;
2689         r.ptr[0] = a.array[ (bl.array[0] & 2) >> 1];
2690         r.ptr[1] = a.array[ (bl.array[1] & 2) >> 1];
2691         r.ptr[2] = a.array[2 + ((bl.array[2] & 2) >> 1)];
2692         r.ptr[3] = a.array[2 + ((bl.array[3] & 2) >> 1)];
2693         return r;
2694     }
2695 }
2696 unittest
2697 {
2698     __m256d A = _mm256_setr_pd(5, 6, 7, 8);
2699     __m256d B = _mm256_permutevar_pd(A, _mm256_setr_epi64(2, 1, 0, 2));
2700     __m256d C = _mm256_permutevar_pd(A, _mm256_setr_epi64(1 + 2 + 4, 2, 2, 0));
2701     // yup, this is super strange, it's actually taking bit 1 and not bit 0 of each 64-bit element
2702     double[4] RB = [6, 5, 7, 8];
2703     double[4] RC = [6, 6, 8, 7];
2704     assert(B.array == RB);
2705     assert(C.array == RC);
2706 }
2707 
2708 /// Shuffle single-precision (32-bit) floating-point elements in `a` using the control in `b`.
2709 __m128 _mm_permutevar_ps (__m128 a, __m128i b) @trusted
2710 {
2711     // PERF DMD
2712 
2713     enum bool implementWithByteShuffle = GDC_with_SSSE3 || LDC_with_SSSE3 || LDC_with_ARM64;
2714 
2715     static if (GDC_or_LDC_with_AVX)
2716     {
2717         return cast(__m128) __builtin_ia32_vpermilvarps(a, cast(int4)b);
2718     }
2719     else static if (implementWithByteShuffle)
2720     {
2721         // This workaround is worth it: in GDC with SSSE3, in LDC with SSSE3, in ARM64 (neon)
2722         int4 bi = cast(int4)b;
2723         int4 three;
2724         three = 3;
2725         bi = _mm_slli_epi32(bi & three, 2);
2726         // bi is [ind0 0 0 0 ind1 0 0 0 ind2 0 0 0 ind3 0 0 0]
2727         bi = bi | _mm_slli_si128!1(bi);
2728         bi = bi | _mm_slli_si128!2(bi);
2729         // bi is now [ind0 ind0 ind0 ind0 ind1 ind1 ind1 ind1 ind2 ind2 ind2 ind2 ind3 ind3 ind3 ind3]
2730         byte16 bytesIndices = cast(byte16)bi;
2731         align(16) static immutable byte[16] mmAddBase_u8 = [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3];
2732         bytesIndices = bytesIndices + *cast(byte16*)mmAddBase_u8.ptr;
2733 
2734         // which allows us to make a single _mm_shuffle_epi8
2735         return cast(__m128) _mm_shuffle_epi8(cast(__m128i)a, cast(__m128i)bytesIndices);
2736     }
2737     else
2738     {
2739 
2740         int4 bi = cast(int4)b;
2741         __m128 r;
2742         r.ptr[0] = a.array[ (bi.array[0] & 3) ];
2743         r.ptr[1] = a.array[ (bi.array[1] & 3) ];
2744         r.ptr[2] = a.array[ (bi.array[2] & 3) ];
2745         r.ptr[3] = a.array[ (bi.array[3] & 3) ];
2746         return r;
2747     }
2748 }
2749 unittest
2750 {
2751     __m128 A = _mm_setr_ps(5, 6, 7, 8);
2752     __m128 B = _mm_permutevar_ps(A, _mm_setr_epi32(2, 1, 0, 2 + 4));
2753     __m128 C = _mm_permutevar_ps(A, _mm_setr_epi32(2, 3 + 8, 1, 0));
2754     float[4] RB = [7, 6, 5, 7];
2755     float[4] RC = [7, 8, 6, 5];
2756     assert(B.array == RB);
2757     assert(C.array == RC);
2758 }
2759 
2760 ///ditto
2761 __m256 _mm256_permutevar_ps (__m256 a, __m256i b) @trusted
2762 {
2763     // For the SSSE3 and ARM64 paths below, the emulation splits into 128-bit halves and
    // reuses _mm_permutevar_ps, which is itself based on _mm_shuffle_epi8.
2764     enum bool implementWithByteShuffle = GDC_with_SSSE3 || LDC_with_SSSE3 || LDC_with_ARM64;
2765 
2766     static if (GDC_or_LDC_with_AVX)
2767     {
2768         return __builtin_ia32_vpermilvarps256(a, cast(int8)b);
2769     }
2770     else static if (implementWithByteShuffle)
2771     {
2772         // because we don't have 256-bit vectors, split and use _mm_permutevar_ps
2773         __m128 a_lo = _mm256_extractf128_ps!0(a);
2774         __m128 a_hi = _mm256_extractf128_ps!1(a);
2775         __m128i b_lo = _mm256_extractf128_si256!0(b);
2776         __m128i b_hi = _mm256_extractf128_si256!1(b);
2777         __m128 r_lo = _mm_permutevar_ps(a_lo, b_lo);
2778         __m128 r_hi = _mm_permutevar_ps(a_hi, b_hi);
2779         return _mm256_set_m128(r_hi, r_lo);
2780     }
2781     else
2782     {
2783         int8 bi = cast(int8)b;
2784         __m256 r;
2785         r.ptr[0] = a.array[ (bi.array[0] & 3) ];
2786         r.ptr[1] = a.array[ (bi.array[1] & 3) ];
2787         r.ptr[2] = a.array[ (bi.array[2] & 3) ];
2788         r.ptr[3] = a.array[ (bi.array[3] & 3) ];
2789         r.ptr[4] = a.array[ 4 + (bi.array[4] & 3) ];
2790         r.ptr[5] = a.array[ 4 + (bi.array[5] & 3) ];
2791         r.ptr[6] = a.array[ 4 + (bi.array[6] & 3) ];
2792         r.ptr[7] = a.array[ 4 + (bi.array[7] & 3) ];
2793         return r;
2794     } 
2795 }
2796 unittest
2797 {
2798     __m256 A = _mm256_setr_ps(1, 2, 3, 4, 5, 6, 7, 8);
2799     __m256 B = _mm256_permutevar_ps(A, _mm256_setr_epi32(2,     1, 0, 2, 3, 2, 1, 0));
2800     __m256 C = _mm256_permutevar_ps(A, _mm256_setr_epi32(2, 3 + 8, 1, 0, 2, 3, 0, 1));
2801     float[8] RB = [3.0f, 2, 1, 3, 8, 7, 6, 5];
2802     float[8] RC = [3.0f, 4, 2, 1, 7, 8, 5, 6];
2803     assert(B.array == RB);
2804     assert(C.array == RC);
2805 }
2806 
2807 /// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements
2808 /// in `a`. The maximum relative error for this approximation is less than 1.5*2^-12.
2809 __m256 _mm256_rcp_ps (__m256 a) pure @trusted
2810 {
2811     // PERF DMD
2812     static if (GDC_or_LDC_with_AVX)
2813     {
2814         return __builtin_ia32_rcpps256(a);
2815     }
2816     else
2817     {        
2818         a.ptr[0] = 1.0f / a.array[0];
2819         a.ptr[1] = 1.0f / a.array[1];
2820         a.ptr[2] = 1.0f / a.array[2];
2821         a.ptr[3] = 1.0f / a.array[3];
2822         a.ptr[4] = 1.0f / a.array[4];
2823         a.ptr[5] = 1.0f / a.array[5];
2824         a.ptr[6] = 1.0f / a.array[6];
2825         a.ptr[7] = 1.0f / a.array[7];
2826         return a;
2827     }
2828 }
2829 unittest
2830 {
2831     __m256 A = _mm256_setr_ps(2.34f, -70000.0f, 0.00001f, 345.5f, 9, -46, 1869816, 55583);
2832     __m256 groundTruth = _mm256_set1_ps(1.0f) / A;
2833     __m256 result = _mm256_rcp_ps(A);
2834     foreach(i; 0..8)
2835     {
2836         double relError = (cast(double)(groundTruth.array[i]) / result.array[i]) - 1;
2837         assert(abs_double(relError) < 0.00037); // 1.5*2^-12 is 0.00036621093
2838     }
2839 }
2840 
2841 /// Round the packed double-precision (64-bit) floating-point elements in `a` using the 
2842 /// rounding parameter, and store the results as packed double-precision floating-point elements.
2843 /// Rounding is done according to the rounding[3:0] parameter, which can be one of:
2844 ///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
2845 ///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
2846 ///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
2847 ///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
2848 ///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
2849 __m256d _mm256_round_pd(int rounding)(__m256d a) @trusted
2850 {
2851     // PERF DMD
2852     static if (GDC_with_AVX)
2853     {
2854         return __builtin_ia32_roundpd256(a, rounding);
2855     }
2856     else static if (LDC_with_AVX)
2857     {
2858         return __builtin_ia32_roundpd256(a, rounding);
2859     }
2860     else
2861     {
2862         static if (rounding & _MM_FROUND_CUR_DIRECTION)
2863         {
2864             // PERF: non-AVX x86, it would probably be faster to convert those doubles to int64 at once
2865 
2866             __m128d A_lo = _mm256_extractf128_pd!0(a);
2867             __m128d A_hi = _mm256_extractf128_pd!1(a);
2868 
2869             // Convert to 64-bit integers one by one
2870             long x0 = _mm_cvtsd_si64(A_lo);
2871             long x2 = _mm_cvtsd_si64(A_hi);
2872             A_lo.ptr[0] = A_lo.array[1];
2873             A_hi.ptr[0] = A_hi.array[1];
2874             long x1 = _mm_cvtsd_si64(A_lo);
2875             long x3 = _mm_cvtsd_si64(A_hi);
2876 
2877             return _mm256_setr_pd(x0, x1, x2, x3);
2878         }
2879         else
2880         {
2881             version(GNU) pragma(inline, false); // this was required for SSE4.1 rounding; keep it here
2882 
2883             uint old = _MM_GET_ROUNDING_MODE();
2884             _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
2885             
2886             __m128d A_lo = _mm256_extractf128_pd!0(a);
2887             __m128d A_hi = _mm256_extractf128_pd!1(a);
2888 
2889             // Convert to 64-bit integers one by one
2890             long x0 = _mm_cvtsd_si64(A_lo);
2891             long x2 = _mm_cvtsd_si64(A_hi);
2892             A_lo.ptr[0] = A_lo.array[1];
2893             A_hi.ptr[0] = A_hi.array[1];
2894             long x1 = _mm_cvtsd_si64(A_lo);
2895             long x3 = _mm_cvtsd_si64(A_hi);
2896 
2897             // Convert back to double to achieve the rounding
2898             // The problem is that a 64-bit double can't represent all the values 
2899             // a 64-bit integer can (and vice-versa). So this function won't work for
2900             // large values. (FUTURE: what range exactly?)
2901             _MM_SET_ROUNDING_MODE(old);
2902             return _mm256_setr_pd(x0, x1, x2, x3);
2903         }
2904     }
2905 }
2906 unittest
2907 {
2908     // tested in other intrinsics
2909 }
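
// Editor's sketch (not an upstream test): truncation mode, using the rounding
// constants listed in the documentation above.
unittest
{
    __m256d A = _mm256_setr_pd(1.4, -2.7, 3.5, -0.2);
    __m256d R = _mm256_round_pd!(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)(A);
    double[4] correct = [1.0, -2.0, 3.0, 0.0];
    assert(R.array == correct);
}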
2910 
2911 /// Round the packed single-precision (32-bit) floating-point elements in `a` using the 
2912 /// rounding parameter, and store the results as packed single-precision floating-point elements.
2913 /// Rounding is done according to the rounding[3:0] parameter, which can be one of:
2914 ///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
2915 ///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
2916 ///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
2917 ///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
2918 ///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
2919 __m256 _mm256_round_ps(int rounding)(__m256 a) @trusted
2920 {
2921     // PERF DMD
2922     static if (GDC_or_LDC_with_AVX)
2923     {
2924         return __builtin_ia32_roundps256(a, rounding);
2925     }
2926     else static if (GDC_or_LDC_with_SSE41)
2927     {
2928         // we can use _mm_round_ps
2929         __m128 lo = _mm256_extractf128_ps!0(a);
2930         __m128 hi = _mm256_extractf128_ps!1(a);
2931         __m128 ilo = _mm_round_ps!rounding(lo); // unfortunately _mm_round_ps isn't fast for arm64, so we avoid that in that case
2932         __m128 ihi = _mm_round_ps!rounding(hi);
2933         return _mm256_set_m128(ihi, ilo);
2934     }
2935     else
2936     {
2937         static if (rounding & _MM_FROUND_CUR_DIRECTION)
2938         {
2939             __m256i integers = _mm256_cvtps_epi32(a);
2940             return _mm256_cvtepi32_ps(integers);
2941         }
2942         else
2943         {
2944             version(LDC) pragma(inline, false); // else _MM_SET_ROUNDING_MODE and _mm_cvtps_epi32 gets shuffled
2945             uint old = _MM_GET_ROUNDING_MODE();
2946             _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
2947             scope(exit) _MM_SET_ROUNDING_MODE(old);
2948 
2949             // Convert to 32-bit integers
2950             __m256i integers = _mm256_cvtps_epi32(a);
2951 
2952             // Convert back to float to achieve the rounding
2953             // The problem is that a 32-bit float can't represent all the values 
2954             // a 32-bit integer can (and vice-versa). So this function won't work for
2955             // large values. (FUTURE: what range exactly?)
2956             __m256 result = _mm256_cvtepi32_ps(integers);
2957 
2958             return result;
2959         }
2960     }
2961 }
2962 unittest
2963 {
2964     // tested in other intrinsics
2965 }
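
// Editor's sketch (not an upstream test): round-to-nearest-even, using the rounding
// constants listed in the documentation above.
unittest
{
    __m256 A = _mm256_setr_ps(1.4f, -2.7f, 3.5f, -0.2f, 0.6f, -1.5f, 2.5f, -3.5f);
    __m256 R = _mm256_round_ps!(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)(A);
    float[8] correct = [1.0f, -3, 4, 0, 1, -2, 2, -4];
    assert(R.array == correct);
}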
2966 
2967 
2968 /// Compute the approximate reciprocal square root of packed single-precision (32-bit) 
2969 /// floating-point elements in `a`. The maximum relative error for this approximation is less than
2970 /// 1.5*2^-12.
2971 __m256 _mm256_rsqrt_ps (__m256 a) pure @trusted
2972 {
2973     static if (GDC_or_LDC_with_AVX)
2974     {
2975         return __builtin_ia32_rsqrtps256(a);
2976     }
2977     else version(LDC)
2978     {
2979         a[0] = 1.0f / llvm_sqrt(a[0]);
2980         a[1] = 1.0f / llvm_sqrt(a[1]);
2981         a[2] = 1.0f / llvm_sqrt(a[2]);
2982         a[3] = 1.0f / llvm_sqrt(a[3]);
2983         a[4] = 1.0f / llvm_sqrt(a[4]);
2984         a[5] = 1.0f / llvm_sqrt(a[5]);
2985         a[6] = 1.0f / llvm_sqrt(a[6]);
2986         a[7] = 1.0f / llvm_sqrt(a[7]);
2987         return a;
2988     }
2989     else
2990     {
2991         a.ptr[0] = 1.0f / sqrt(a.array[0]);
2992         a.ptr[1] = 1.0f / sqrt(a.array[1]);
2993         a.ptr[2] = 1.0f / sqrt(a.array[2]);
2994         a.ptr[3] = 1.0f / sqrt(a.array[3]);
2995         a.ptr[4] = 1.0f / sqrt(a.array[4]);
2996         a.ptr[5] = 1.0f / sqrt(a.array[5]);
2997         a.ptr[6] = 1.0f / sqrt(a.array[6]);
2998         a.ptr[7] = 1.0f / sqrt(a.array[7]);
2999         return a;
3000     }
3001 }
3002 unittest
3003 {
3004     __m256 A = _mm256_setr_ps(2.34f, 70000.0f, 0.00001f, 345.5f, 2.34f, 70000.0f, 0.00001f, 345.5f);
3005     __m256 groundTruth = _mm256_setr_ps(0.65372045f, 0.00377964473f, 316.227766f, 0.05379921937f,
3006                                         0.65372045f, 0.00377964473f, 316.227766f, 0.05379921937f);
3007     __m256 result = _mm256_rsqrt_ps(A);
3008     foreach(i; 0..8)
3009     {
3010         double relError = (cast(double)(groundTruth.array[i]) / result.array[i]) - 1;
3011         assert(abs_double(relError) < 0.00037); // 1.5*2^-12 is 0.00036621093
3012     }
3013 }
3014 
3015 /// Set packed 16-bit integers with the supplied values.
3016 __m256i _mm256_set_epi16 (short e15, short e14, short e13, short e12, short e11, short e10, short e9, short e8, short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted
3017 {
3018     short16 r; // Note: = void would prevent GDC from inlining a constant short16...
3019     r.ptr[0] = e0;
3020     r.ptr[1] = e1;
3021     r.ptr[2] = e2;
3022     r.ptr[3] = e3;
3023     r.ptr[4] = e4;
3024     r.ptr[5] = e5;
3025     r.ptr[6] = e6;
3026     r.ptr[7] = e7;
3027     r.ptr[8] = e8;
3028     r.ptr[9] = e9;
3029     r.ptr[10] = e10;
3030     r.ptr[11] = e11;
3031     r.ptr[12] = e12;
3032     r.ptr[13] = e13;
3033     r.ptr[14] = e14;
3034     r.ptr[15] = e15;
3035     return cast(__m256i) r;
3036 }
3037 unittest
3038 {
3039     short16 A = cast(short16) _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 
3040                                                7, 6, 5, 4, 3, 2, 1, 0);
3041     foreach(i; 0..16)
3042         assert(A.array[i] == i);
3043 }
3044 
3045 /// Set packed 32-bit integers with the supplied values.
3046 __m256i _mm256_set_epi32 (int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0) pure @trusted
3047 {
3048     // Inlines a constant with GCC -O1, LDC -O2
3049     int8 r; // = void would prevent GCC from inlining a constant call
3050     r.ptr[0] = e0;
3051     r.ptr[1] = e1;
3052     r.ptr[2] = e2;
3053     r.ptr[3] = e3;
3054     r.ptr[4] = e4;
3055     r.ptr[5] = e5;
3056     r.ptr[6] = e6;
3057     r.ptr[7] = e7;
3058     return cast(__m256i)r;
3059 }
3060 unittest
3061 {
3062     int8 A = cast(int8) _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
3063     foreach(i; 0..8)
3064         assert(A.array[i] == i);
3065 }
3066 
3067 /// Set packed 64-bit integers with the supplied values.
3068 __m256i _mm256_set_epi64x (long e3, long e2, long e1, long e0) pure @trusted
3069 {
3070     long4 r = void;
3071     r.ptr[0] = e0;
3072     r.ptr[1] = e1;
3073     r.ptr[2] = e2;
3074     r.ptr[3] = e3;
3075     return r;
3076 }
3077 unittest
3078 {
3079     __m256i A = _mm256_set_epi64x(-1, 42, long.min, long.max);
3080     long[4] correct = [long.max, long.min, 42, -1];
3081     assert(A.array == correct);
3082 }
3083 
3084 ///ditto
3085 alias _mm256_set_epi64 = _mm256_set_epi64x; // #BONUS, not sure why this isn't in Intel Intrinsics API.
3086 
3087 /// Set packed 8-bit integers with the supplied values.
3088 __m256i _mm256_set_epi8 (byte e31, byte e30, byte e29, byte e28, byte e27, byte e26, byte e25, byte e24, 
3089                          byte e23, byte e22, byte e21, byte e20, byte e19, byte e18, byte e17, byte e16, 
3090                          byte e15, byte e14, byte e13, byte e12, byte e11, byte e10,  byte e9,  byte e8, 
3091                           byte e7,  byte e6,  byte e5,  byte e4,  byte e3,  byte e2,  byte e1,  byte e0)
3092 {
3093     // Inline a constant call in GDC -O1 and LDC -O2
3094     align(32) byte[32] result = [ e0,  e1,  e2,  e3,  e4,  e5,  e6,  e7,
3095                                   e8,  e9, e10, e11, e12, e13, e14, e15,
3096                                  e16, e17, e18, e19, e20, e21, e22, e23,
3097                                  e24, e25, e26, e27, e28, e29, e30, e31 ];
3098     return *cast(__m256i*)(result.ptr);
3099 }
3100 unittest
3101 {
3102     byte32 R = cast(byte32) _mm256_set_epi8(-1, 0, 56, 127, -128, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7);
3103     byte[32] correct = [7, 6, 5, 4, 7, 6, 5, 4, 3, 2, 1, 0, 3, 2, 1, 0,
3104                         14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, -128, 127, 56, 0, -1];
3105     assert(R.array == correct);
3106 }
3107 
3108 /// Set packed `__m256` vector with the supplied values.
3109 __m256 _mm256_set_m128 (__m128 hi, __m128 lo) pure @trusted
3110 {
3111     // DMD PERF
3112     static if (GDC_with_AVX)
3113     {
3114         __m256 r = __builtin_ia32_ps256_ps(lo);
3115         return __builtin_ia32_vinsertf128_ps256(r, hi, 1);
3116     }
3117     else
3118     {
3119         __m256 r = void;
3120         r.ptr[0] = lo.array[0];
3121         r.ptr[1] = lo.array[1];
3122         r.ptr[2] = lo.array[2];
3123         r.ptr[3] = lo.array[3];
3124         r.ptr[4] = hi.array[0];
3125         r.ptr[5] = hi.array[1];
3126         r.ptr[6] = hi.array[2];
3127         r.ptr[7] = hi.array[3];
3128         return r;
3129     }
3130 
3131     /*
3132         // BUG, doesn't work if AVX vector is emulated, but SSE vector is not
3133         // See issue #108
3134         __m256 r = void;
3135         __m128* p = cast(__m128*)(&r);
3136         p[0] = lo;
3137         p[1] = hi;
3138         return r;
3139     */
3140 }
3141 unittest
3142 {
3143     __m128 lo = _mm_setr_ps(1.0f, 2, 3, 4);
3144     __m128 hi = _mm_setr_ps(3.0f, 4, 5, 6);
3145     __m256 R = _mm256_set_m128(hi, lo);
3146     float[8] correct = [1.0f, 2, 3, 4, 3, 4, 5, 6];
3147     assert(R.array == correct);
3148 }
3149 
3150 /// Set packed `__m256d` vector with the supplied values.
3151 __m256d _mm256_set_m128d (__m128d hi, __m128d lo) pure @trusted
3152 {
3153     __m256d r = void;
3154     r.ptr[0] = lo.array[0];
3155     r.ptr[1] = lo.array[1];
3156     r.ptr[2] = hi.array[0];
3157     r.ptr[3] = hi.array[1];
3158     return r;
3159 }
3160 unittest
3161 {
3162     __m128d lo = _mm_setr_pd(1.0, 2.0);
3163     __m128d hi = _mm_setr_pd(3.0, 4.0);
3164     __m256d R = _mm256_set_m128d(hi, lo);
3165     double[4] correct = [1.0, 2.0, 3.0, 4.0];
3166     assert(R.array == correct);
3167 }
3168 
3169 /// Set packed `__m256i` vector with the supplied values.
3170 __m256i _mm256_set_m128i (__m128i hi, __m128i lo) pure @trusted
3171 {
3172     // DMD PERF
3173     static if (GDC_with_AVX)
3174     {
3175         __m256i r = cast(long4) __builtin_ia32_si256_si (lo);
3176         return cast(long4) __builtin_ia32_vinsertf128_si256(cast(int8)r, hi, 1);
3177     }
3178     else
3179     {
3180         int8 r = void;
3181         r.ptr[0] = lo.array[0];
3182         r.ptr[1] = lo.array[1];
3183         r.ptr[2] = lo.array[2];
3184         r.ptr[3] = lo.array[3];
3185         r.ptr[4] = hi.array[0];
3186         r.ptr[5] = hi.array[1];
3187         r.ptr[6] = hi.array[2];
3188         r.ptr[7] = hi.array[3];
3189         return cast(long4)r;
3190     }
3191 }
3192 unittest
3193 {
3194     __m128i lo = _mm_setr_epi32( 1,  2,  3,  4);
3195     __m128i hi =  _mm_set_epi32(-3, -4, -5, -6);
3196     int8 R = cast(int8)_mm256_set_m128i(hi, lo);
3197     int[8] correct = [1, 2, 3, 4, -6, -5, -4, -3];
3198     assert(R.array == correct);
3199 }
3200 
3201 /// Set packed double-precision (64-bit) floating-point elements with the supplied values.
3202 __m256d _mm256_set_pd (double e3, double e2, double e1, double e0) pure @trusted
3203 {
3204     __m256d r = void;
3205     r.ptr[0] = e0;
3206     r.ptr[1] = e1;
3207     r.ptr[2] = e2;
3208     r.ptr[3] = e3;
3209     return r;
3210 }
3211 unittest
3212 {
3213     __m256d A = _mm256_set_pd(3, 2, 1, 546);
3214     double[4] correct = [546.0, 1.0, 2.0, 3.0];
3215     assert(A.array == correct);
3216 }
3217 
3218 /// Set packed single-precision (32-bit) floating-point elements with the supplied values.
3219 __m256 _mm256_set_ps (float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0) pure @trusted
3220 {
3221     // PERF: see #102, use = void?
3222     __m256 r;
3223     r.ptr[0] = e0;
3224     r.ptr[1] = e1;
3225     r.ptr[2] = e2;
3226     r.ptr[3] = e3;
3227     r.ptr[4] = e4;
3228     r.ptr[5] = e5;
3229     r.ptr[6] = e6;
3230     r.ptr[7] = e7;
3231     return r;
3232 }
3233 unittest
3234 {
3235     __m256 A = _mm256_set_ps(3, 2, 1, 546.0f, -1.25f, -2, -3, 0);
3236     float[8] correct = [0, -3, -2, -1.25f, 546.0f, 1.0, 2.0, 3.0];
3237     assert(A.array == correct);
3238 }
3239 
3240 /// Broadcast 16-bit integer `a` to all elements of the return value.
3241 __m256i _mm256_set1_epi16 (short a) pure @trusted
3242 {
3243     version(DigitalMars) 
3244     {
3245         // workaround https://issues.dlang.org/show_bug.cgi?id=21469
3246         // It used to ICE, after that the codegen was just wrong.
3247         // No issue anymore in DMD 2.101, we can eventually remove that
3248         static if (__VERSION__ < 2101)
3249         {
3250             short16 v = a;
3251             return cast(__m256i) v;
3252         }
3253         else
3254         {
3255             pragma(inline, true);
3256             return cast(__m256i)(short16(a));
3257         }
3258     }
3259     else
3260     {
3261         pragma(inline, true);
3262         return cast(__m256i)(short16(a));
3263     }
3264 }
3265 unittest
3266 {
3267     short16 a = cast(short16) _mm256_set1_epi16(31);
3268     for (int i = 0; i < 16; ++i)
3269         assert(a.array[i] == 31);
3270 }
3271 
3272 /// Broadcast 32-bit integer `a` to all elements.
3273 __m256i _mm256_set1_epi32 (int a) pure @trusted
3274 {
3275     version(DigitalMars) 
3276     {
3277         // No issue anymore in DMD 2.101, we can eventually remove that
3278         static if (__VERSION__ < 2101)
3279         {
3280             int8 v = a;
3281             return cast(__m256i) v;
3282         }
3283         else
3284         {
3285             pragma(inline, true);
3286             return cast(__m256i)(int8(a));
3287         }
3288     }
3289     else
3290     {
3291         pragma(inline, true);
3292         return cast(__m256i)(int8(a));
3293     }
3294 }
3295 unittest
3296 {
3297     int8 a = cast(int8) _mm256_set1_epi32(31);
3298     for (int i = 0; i < 8; ++i)
3299         assert(a.array[i] == 31);
3300 }
3301 
3302 /// Broadcast 64-bit integer `a` to all elements of the return value.
3303 __m256i _mm256_set1_epi64x (long a)
3304 {
3305     return cast(__m256i)(long4(a));
3306 }
3307 unittest
3308 {
3309     long4 a = cast(long4) _mm256_set1_epi64x(-31);
3310     for (int i = 0; i < 4; ++i)
3311         assert(a.array[i] == -31);
3312 }
3313 ///ditto
3314 alias _mm256_set1_epi64 = _mm256_set1_epi64x; // #BONUS, not sure why this isn't in Intel Intrinsics API.
3315 
3316 /// Broadcast 8-bit integer `a` to all elements of the return value.
3317 __m256i _mm256_set1_epi8 (byte a) pure @trusted
3318 {
3319     version(DigitalMars) // workaround https://issues.dlang.org/show_bug.cgi?id=21469
3320     {
3321         byte32 v = a;
3322         return cast(__m256i) v;
3323     }
3324     else
3325     {
3326         pragma(inline, true);
3327         return cast(__m256i)(byte32(a));
3328     }
3329 }
3330 unittest
3331 {
3332     byte32 a = cast(byte32) _mm256_set1_epi8(31);
3333     for (int i = 0; i < 32; ++i)
3334         assert(a.array[i] == 31);
3335 }
3336 
3337 /// Broadcast double-precision (64-bit) floating-point value `a` to all elements of the return value.
3338 __m256d _mm256_set1_pd (double a) pure @trusted
3339 {
3340     return __m256d(a);
3341 }
3342 unittest
3343 {
3344     double a = 464.21;
3345     double[4] correct = [a, a, a, a];
3346     double4 A = cast(double4) _mm256_set1_pd(a);
3347     assert(A.array == correct);
3348 }
3349 
3350 /// Broadcast single-precision (32-bit) floating-point value `a` to all elements of the return value.
3351 __m256 _mm256_set1_ps (float a) pure @trusted
3352 {
3353     return __m256(a);
3354 }
3355 unittest
3356 {
3357     float a = 464.21f;
3358     float[8] correct = [a, a, a, a, a, a, a, a];
3359     float8 A = cast(float8) _mm256_set1_ps(a);
3360     assert(A.array == correct);
3361 }
3362 
3363 /// Set packed 16-bit integers with the supplied values in reverse order.
3364 __m256i _mm256_setr_epi16 (short e15, short e14, short e13, short e12, short e11, short e10, short e9,  short e8,
3365                            short e7,  short e6,  short e5,  short e4,  short e3,  short e2,  short e1,  short e0) pure @trusted
3366 {
3367     short[16] result = [ e15,  e14,  e13,  e12,  e11,  e10,  e9,   e8,
3368                          e7,   e6,   e5,   e4,   e3,   e2,   e1,   e0];
3369     static if (GDC_with_AVX)
3370     {
3371          return cast(__m256i) __builtin_ia32_loaddqu256(cast(const(char)*) result.ptr);
3372     }
3373     else static if (LDC_with_optimizations)
3374     {
3375         return cast(__m256i)( loadUnaligned!(short16)(result.ptr) );
3376     }
3377     else
3378     {
3379         short16 r;
3380         for(int n = 0; n < 16; ++n)
3381             r.ptr[n] = result[n];
3382         return cast(__m256i)r;
3383     }
3384 }
3385 unittest
3386 {
3387     short16 A = cast(short16) _mm256_setr_epi16(-1, 0, -21, 21, 42, 127, -42, -128,
3388                                                 -1, 0, -21, 21, 42, 127, -42, -128);
3389     short[16] correct = [-1, 0, -21, 21, 42, 127, -42, -128,
3390                          -1, 0, -21, 21, 42, 127, -42, -128];
3391     assert(A.array == correct);
3392 }
3393 
3394 /// Set packed 32-bit integers with the supplied values in reverse order.
3395 __m256i _mm256_setr_epi32 (int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0) pure @trusted
3396 {
3397     // Inlines a constant with GDC -O1, LDC -O2
3398     int8 r; // = void would prevent GDC from inlining a constant call
3399     r.ptr[0] = e7;
3400     r.ptr[1] = e6;
3401     r.ptr[2] = e5;
3402     r.ptr[3] = e4;
3403     r.ptr[4] = e3;
3404     r.ptr[5] = e2;
3405     r.ptr[6] = e1;
3406     r.ptr[7] = e0;
3407     return cast(__m256i)r;
3408 }
3409 unittest
3410 {
3411     int8 A = cast(int8) _mm256_setr_epi32(-1, 0, -2147483648, 2147483647, 42, 666, -42, -666);
3412     int[8] correct = [-1, 0, -2147483648, 2147483647, 42, 666, -42, -666];
3413     assert(A.array == correct);
3414 }
3415 
3416 /// Set packed 64-bit integers with the supplied values in reverse order.
3417 __m256i _mm256_setr_epi64x (long e3, long e2, long e1, long e0) pure @trusted
3418 {
3419     long4 r = void;
3420     r.ptr[0] = e3;
3421     r.ptr[1] = e2;
3422     r.ptr[2] = e1;
3423     r.ptr[3] = e0;
3424     return r;
3425 }
3426 unittest
3427 {
3428     __m256i A = _mm256_setr_epi64x(-1, 42, long.min, long.max);
3429     long[4] correct = [-1, 42, long.min, long.max];
3430     assert(A.array == correct);
3431 }
3432 ///ditto
3433 alias _mm256_setr_epi64 = _mm256_setr_epi64x; // #BONUS, not sure why this isn't in Intel Intrinsics API.
3434 
3435 /// Set packed 8-bit integers with the supplied values in reverse order.
3436 __m256i _mm256_setr_epi8 (byte e31, byte e30, byte e29, byte e28, byte e27, byte e26, byte e25, byte e24,
3437                           byte e23, byte e22, byte e21, byte e20, byte e19, byte e18, byte e17, byte e16,
3438                           byte e15, byte e14, byte e13, byte e12, byte e11, byte e10, byte e9,  byte e8,
3439                           byte e7,  byte e6,  byte e5,  byte e4,  byte e3,  byte e2,  byte e1,  byte e0) pure @trusted
3440 {
3441     // Inline a constant call in GDC -O1 and LDC -O2
3442     align(32) byte[32] result = [ e31,  e30,  e29,  e28,  e27,  e26,  e25,  e24,
3443                                   e23,  e22,  e21,  e20,  e19,  e18,  e17,  e16,
3444                                   e15,  e14,  e13,  e12,  e11,  e10,  e9,   e8,
3445                                    e7,   e6,   e5,   e4,   e3,   e2,   e1,   e0];
3446     return *cast(__m256i*)(result.ptr);
3447 }
3448 unittest
3449 {
3450     byte32 A = cast(byte32) _mm256_setr_epi8( -1, 0, -21, 21, 42, 127, -42, -128,
3451                                               -1, 0, -21, 21, 42, 127, -42, -128,
3452                                               -1, 0, -21, 21, 42, 127, -42, -128,
3453                                               -1, 0, -21, 21, 42, 127, -42, -128);
3454     byte[32] correct = [-1, 0, -21, 21, 42, 127, -42, -128,
3455                         -1, 0, -21, 21, 42, 127, -42, -128,
3456                         -1, 0, -21, 21, 42, 127, -42, -128,
3457                         -1, 0, -21, 21, 42, 127, -42, -128];
3458     assert(A.array == correct);
3459 }
3460 
3461 /// Set packed `__m256` vector with the supplied values.
3462 __m256 _mm256_setr_m128 (__m128 lo, __m128 hi)
3463 {
3464     return _mm256_set_m128(hi, lo);
3465 }
3466 unittest
3467 {
3468     __m128 A = _mm_setr_ps(1.0f, 2, 3, 4);
3469     __m128 B = _mm_setr_ps(3.0f, 4, 5, 6);
3470     __m256 R = _mm256_setr_m128(B, A);
3471     float[8] correct = [3.0f, 4, 5, 6, 1, 2, 3, 4,];
3472     assert(R.array == correct);
3473 }
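// Informative addition (not part of the upstream API docs): `_mm256_setr_m128(lo, hi)` is simply
// `_mm256_set_m128(hi, lo)` with the halves named in memory order, so both calls below agree.
unittest
{
    __m128 lo = _mm_setr_ps(1.0f, 2, 3, 4);
    __m128 hi = _mm_setr_ps(5.0f, 6, 7, 8);
    __m256 A = _mm256_set_m128(hi, lo);
    __m256 B = _mm256_setr_m128(lo, hi);
    assert(A.array == B.array);
}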
3474 
3475 /// Set packed `__m256d` vector with the supplied values.
3476 __m256d _mm256_setr_m128d (__m128d lo, __m128d hi)
3477 {
3478     return _mm256_set_m128d(hi, lo);
3479 }
3480 unittest
3481 {
3482     __m128d A = _mm_setr_pd(1.0, 2.0);
3483     __m128d B = _mm_setr_pd(3.0, 4.0);
3484     __m256d R = _mm256_setr_m128d(B, A);
3485     double[4] correct = [3.0, 4.0, 1.0, 2.0];
3486     assert(R.array == correct);
3487 }
3488 
3489 /// Set packed `__m256i` vector with the supplied values.
3490 __m256i _mm256_setr_m128i (__m128i lo, __m128i hi)
3491 {
3492     return _mm256_set_m128i(hi, lo);
3493 }
3494 unittest
3495 {
3496     __m128i A = _mm_setr_epi32( 1,  2,  3,  4);
3497     __m128i B =  _mm_set_epi32(-3, -4, -5, -6);
3498     int8 R = cast(int8)_mm256_setr_m128i(B, A);
3499     int[8] correct = [-6, -5, -4, -3, 1, 2, 3, 4];
3500     assert(R.array == correct);
3501 }
3502 
3503 /// Set packed double-precision (64-bit) floating-point elements with the supplied values in reverse order.
3504 __m256d _mm256_setr_pd (double e3, double e2, double e1, double e0) pure @trusted
3505 {
3506     static if (LDC_with_optimizations)
3507     {
3508         // PERF, probably not the best
3509         double[4] result = [e3, e2, e1, e0];
3510         return loadUnaligned!(double4)(result.ptr);
3511     }
3512     else
3513     {
3514         __m256d r;
3515         r.ptr[0] = e3;
3516         r.ptr[1] = e2;
3517         r.ptr[2] = e1;
3518         r.ptr[3] = e0;
3519         return r;
3520     }
3521 }
3522 unittest
3523 {
3524     __m256d A = _mm256_setr_pd(3, 2, 1, 546.125);
3525     double[4] correct = [3.0, 2.0, 1.0, 546.125];
3526     assert(A.array == correct);
3527 }
3528 
3529 
3530 /// Set packed single-precision (32-bit) floating-point elements with the supplied values in reverse order.
3531 __m256 _mm256_setr_ps (float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0) pure @trusted
3532 {
3533     // PERF DMD
3534     static if (GDC_with_AVX)
3535     {
3536         align(32) float[8] r = [ e7,   e6,   e5,   e4,   e3,   e2,   e1,   e0];
3537         return *cast(__m256*)r;
3538     }
3539     else version(LDC)
3540     {
3541         align(32) float[8] r = [ e7,   e6,   e5,   e4,   e3,   e2,   e1,   e0];
3542         return *cast(__m256*)r;
3543     }
3544     else
3545     {
3546         __m256 r;
3547         r.ptr[0] = e7;
3548         r.ptr[1] = e6;
3549         r.ptr[2] = e5;
3550         r.ptr[3] = e4;
3551         r.ptr[4] = e3;
3552         r.ptr[5] = e2;
3553         r.ptr[6] = e1;
3554         r.ptr[7] = e0;
3555         return r;
3556     }
3557 }
3558 unittest
3559 {
3560     __m256 A = _mm256_setr_ps(   3, 2, 1, 546.125f, 4, 5, 6, 7);
3561     float[8] correct       = [3.0f, 2, 1, 546.125f, 4, 5, 6, 7];
3562     assert(A.array == correct);
3563 }
3564 
3565 /// Return vector of type `__m256d` with all elements set to zero.
3566 __m256d _mm256_setzero_pd() pure @safe
3567 {
3568     return double4(0.0);
3569 }
3570 unittest
3571 {
3572     __m256d A = _mm256_setzero_pd();
3573     double[4] correct = [0.0, 0.0, 0.0, 0.0];
3574     assert(A.array == correct);
3575 }
3576 
3577 /// Return vector of type `__m256` with all elements set to zero.
3578 __m256 _mm256_setzero_ps() pure @safe
3579 {
3580     return float8(0.0f);
3581 }
3582 unittest
3583 {
3584     __m256 A = _mm256_setzero_ps();
3585     float[8] correct = [0.0f, 0, 0, 0, 0, 0, 0, 0];
3586     assert(A.array == correct);
3587 }
3588 
3589 /// Return vector of type `__m256i` with all elements set to zero.
3590 __m256i _mm256_setzero_si256() pure @trusted
3591 {
3592     return __m256i(0);
3593 }
3594 unittest
3595 {
3596     __m256i A = _mm256_setzero_si256();
3597     long[4] correct = [0, 0, 0, 0];
3598     assert(A.array == correct);
3599 }
3600 
3601 /// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the 
3602 /// control in `imm8`.
3603 __m256d _mm256_shuffle_pd(int imm8)(__m256d a, __m256d b) pure @trusted
3604 {
3605     // PERF DMD
3606     static if (GDC_with_AVX)
3607     {
3608         return __builtin_ia32_shufpd256(a, b, imm8);
3609     }
3610     else version(LDC)
3611     {
3612         return shufflevectorLDC!(double4,        
3613                                        (imm8 >> 0) & 1,
3614                                  4 + ( (imm8 >> 1) & 1),
3615                                  2 + ( (imm8 >> 2) & 1),
3616                                  6 + ( (imm8 >> 3) & 1) )(a, b);
3617     }
3618     else
3619     {
3620         double4 r = void;
3621         r.ptr[0] = a.array[(imm8 >> 0) & 1];
3622         r.ptr[1] = b.array[(imm8 >> 1) & 1];
3623         r.ptr[2] = a.array[2 + ( (imm8 >> 2) & 1)];
3624         r.ptr[3] = b.array[2 + ( (imm8 >> 3) & 1)];
3625         return r;
3626     }
3627 }
3628 unittest
3629 {
3630     __m256d A = _mm256_setr_pd( 0, 1, 2, 3);
3631     __m256d B = _mm256_setr_pd( 4, 5, 6, 7);
3632     __m256d C = _mm256_shuffle_pd!75 /* 01001011 */(A, B);
3633     double[4] correct = [1.0, 5.0, 2.0, 7.0];
3634     assert(C.array == correct);
3635 } 
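// Informative addition: how the 4 low bits of `imm8` are consumed, lane by lane.
// Bit 0 picks a[0] or a[1], bit 1 picks b[0] or b[1] (low lane);
// bit 2 picks a[2] or a[3], bit 3 picks b[2] or b[3] (high lane).
unittest
{
    __m256d A = _mm256_setr_pd(10.0, 11, 12, 13);
    __m256d B = _mm256_setr_pd(20.0, 21, 22, 23);
    __m256d C = _mm256_shuffle_pd!0b0101(A, B); // bits 1,0,1,0 => a[1], b[0], a[3], b[2]
    double[4] correct = [11.0, 20.0, 13.0, 22.0];
    assert(C.array == correct);
}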
3636 
3637 /// Shuffle single-precision (32-bit) floating-point elements in `a` and `b` within 128-bit lanes
3638 /// using the control in `imm8`.
3639 __m256 _mm256_shuffle_ps(int imm8)(__m256 a, __m256 b) pure @trusted
3640 {
3641     // PERF DMD
3642     static if (GDC_with_AVX)
3643     {
3644         return __builtin_ia32_shufps256(a, b, imm8);
3645     }
3646     else version(LDC)
3647     {
3648         return shufflevectorLDC!(float8, (imm8 >> 0) & 3,
3649                                  (imm8 >> 2) & 3,
3650                                  8 + ( (imm8 >> 4) & 3),
3651                                  8 + ( (imm8 >> 6) & 3),
3652                                  4 + ( (imm8 >> 0) & 3),
3653                                  4 + ( (imm8 >> 2) & 3),
3654                                  12 + ( (imm8 >> 4) & 3),
3655                                  12 + ( (imm8 >> 6) & 3) )(a, b);
3656     }
3657     else
3658     {
3659         float8 r = void;
3660         r.ptr[0] = a.array[(imm8 >> 0) & 3];
3661         r.ptr[1] = a.array[(imm8 >> 2) & 3];
3662         r.ptr[2] = b.array[(imm8 >> 4) & 3];
3663         r.ptr[3] = b.array[(imm8 >> 6) & 3];
3664         r.ptr[4] = a.array[4 + ( (imm8 >> 0) & 3 )];
3665         r.ptr[5] = a.array[4 + ( (imm8 >> 2) & 3 )];
3666         r.ptr[6] = b.array[4 + ( (imm8 >> 4) & 3 )];
3667         r.ptr[7] = b.array[4 + ( (imm8 >> 6) & 3 )];
3668         return r;
3669     }
3670 }
3671 unittest
3672 {
3673     __m256 A = _mm256_setr_ps( 0,  1,  2,  3,  4,  5,  6,  7);
3674     __m256 B = _mm256_setr_ps( 8,  9, 10, 11, 12, 13, 14, 15);
3675     __m256 C = _mm256_shuffle_ps!75 /* 01001011 */(A, B);
3676     float[8] correct = [3.0f, 2, 8, 9, 7, 6, 12, 13];
3677     assert(C.array == correct);
3678 } 
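// Informative addition: `imm8` holds four 2-bit selectors, applied identically to both 128-bit
// lanes; the two lowest selectors pick from `a`, the two highest pick from `b`.
unittest
{
    __m256 A = _mm256_setr_ps( 0.0f,  1,  2,  3,  4,  5,  6,  7);
    __m256 B = _mm256_setr_ps(10.0f, 11, 12, 13, 14, 15, 16, 17);
    // 0b00_01_10_11: a[3], a[2], then b[1], b[0], repeated in the upper lane.
    __m256 C = _mm256_shuffle_ps!0b00011011(A, B);
    float[8] correct = [3.0f, 2, 11, 10, 7, 6, 15, 14];
    assert(C.array == correct);
}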
3679 
3680 /// Compute the square root of packed double-precision (64-bit) floating-point elements in `a`.
3681 __m256d _mm256_sqrt_pd (__m256d a) pure @trusted
3682 {
3683     static if (GDC_with_AVX)
3684     {
3685         return __builtin_ia32_sqrtpd256(a);
3686     } 
3687     else version(LDC)
3688     {   
3689         static if (__VERSION__ >= 2084) 
3690             return llvm_sqrt(a); // that capability appeared in LDC 1.14
3691         else
3692         {
3693             a.ptr[0] = llvm_sqrt(a.array[0]);
3694             a.ptr[1] = llvm_sqrt(a.array[1]);
3695             a.ptr[2] = llvm_sqrt(a.array[2]);
3696             a.ptr[3] = llvm_sqrt(a.array[3]);
3697             return a;
3698         }
3699     }    
3700     else
3701     {
3702         a.ptr[0] = sqrt(a.array[0]);
3703         a.ptr[1] = sqrt(a.array[1]);
3704         a.ptr[2] = sqrt(a.array[2]);
3705         a.ptr[3] = sqrt(a.array[3]);
3706         return a;
3707     }
3708 }
3709 unittest
3710 {
3711     __m256d A = _mm256_sqrt_pd(_mm256_set1_pd(4.0));
3712     double[4] correct = [2.0, 2, 2, 2];
3713     assert(A.array == correct);
3714 }
3715 
3716 /// Compute the square root of packed single-precision (32-bit) floating-point elements in `a`.
3717 __m256 _mm256_sqrt_ps (__m256 a) pure @trusted
3718 {
3719     static if (GDC_with_AVX)
3720     {
3721         return __builtin_ia32_sqrtps256(a);
3722     } 
3723     else version(LDC)
3724     {  
3725         static if (__VERSION__ >= 2084) 
3726             return llvm_sqrt(a); // that capability appeared in LDC 1.14
3727         else
3728         {  
3729             a.ptr[0] = llvm_sqrt(a.array[0]);
3730             a.ptr[1] = llvm_sqrt(a.array[1]);
3731             a.ptr[2] = llvm_sqrt(a.array[2]);
3732             a.ptr[3] = llvm_sqrt(a.array[3]);
3733             a.ptr[4] = llvm_sqrt(a.array[4]);
3734             a.ptr[5] = llvm_sqrt(a.array[5]);
3735             a.ptr[6] = llvm_sqrt(a.array[6]);
3736             a.ptr[7] = llvm_sqrt(a.array[7]);
3737             return a;
3738         }
3739     }    
3740     else
3741     {
3742         a.ptr[0] = sqrt(a.array[0]);
3743         a.ptr[1] = sqrt(a.array[1]);
3744         a.ptr[2] = sqrt(a.array[2]);
3745         a.ptr[3] = sqrt(a.array[3]);
3746         a.ptr[4] = sqrt(a.array[4]);
3747         a.ptr[5] = sqrt(a.array[5]);
3748         a.ptr[6] = sqrt(a.array[6]);
3749         a.ptr[7] = sqrt(a.array[7]);
3750         return a;
3751     }
3752 }
3753 unittest
3754 {
3755     __m256 A = _mm256_sqrt_ps(_mm256_set1_ps(4.0f));
3756     float[8] correct = [2.0f, 2, 2, 2, 2, 2, 2, 2];
3757     assert(A.array == correct);
3758 }
3759 
3760 /// Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from 
3761 /// `a` into memory. `mem_addr` must be aligned on a 32-byte boundary or a general-protection 
3762 /// exception may be generated.
3763 void _mm256_store_pd (double* mem_addr, __m256d a) pure @system
3764 {
3765     *cast(__m256d*)mem_addr = a;
3766 }
3767 unittest
3768 {
3769     align(32) double[4] mem;
3770     double[4] correct = [1.0, 2, 3, 4];
3771     _mm256_store_pd(mem.ptr, _mm256_setr_pd(1.0, 2, 3, 4));
3772     assert(mem == correct);
3773 }
3774 
3775 /// Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from 
3776 /// `a` into memory. `mem_addr` must be aligned on a 32-byte boundary or a general-protection 
3777 /// exception may be generated.
3778 void _mm256_store_ps (float* mem_addr, __m256 a) pure @system
3779 {
3780     *cast(__m256*)mem_addr = a;
3781 }
3782 unittest
3783 {
3784     align(32) float[8] mem;
3785     float[8] correct = [1.0, 2, 3, 4, 5, 6, 7, 8];
3786     _mm256_store_ps(mem.ptr, _mm256_set_ps(8.0, 7, 6, 5, 4, 3, 2, 1));
3787     assert(mem == correct);
3788 }
3789 
3790 /// Store 256-bits of integer data from `a` into memory. `mem_addr` must be aligned on a 32-byte 
3791 /// boundary or a general-protection exception may be generated.
3792 void _mm256_store_si256 (__m256i * mem_addr, __m256i a) pure @safe
3793 {
3794     *mem_addr = a;
3795 }
3796 unittest
3797 {
3798     align(32) long[4] mem;
3799     long[4] correct = [5, -6, -7, 8];
3800     _mm256_store_si256(cast(__m256i*)(mem.ptr), _mm256_setr_epi64x(5, -6, -7, 8));
3801     assert(mem == correct);
3802 }
3803 
3804 /// Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from 
3805 /// `a` into memory. `mem_addr` does not need to be aligned on any particular boundary.
3806 void _mm256_storeu_pd (double * mem_addr, __m256d a) pure @system
3807 {
3808     // PERF DMD
3809     static if (GDC_with_AVX)
3810     {
3811         __builtin_ia32_storeupd256(mem_addr, a);
3812     }
3813     else static if (LDC_with_optimizations)
3814     {
3815         storeUnaligned!__m256d(a, mem_addr);
3816     }
3817     else
3818     {
3819         for(int n = 0; n < 4; ++n)
3820             mem_addr[n] = a.array[n];
3821     }
3822 }
3823 unittest
3824 {
3825     align(32) double[6] arr = [0.0, 0, 0, 0, 0, 0];
3826     _mm256_storeu_pd(&arr[1], _mm256_set1_pd(4.0));
3827     double[4] correct = [4.0, 4, 4, 4];
3828     assert(arr[1..5] == correct);
3829 }
3830 
3831 /// Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from 
3832 /// `a` into memory. `mem_addr` does not need to be aligned on any particular boundary.
3833 void _mm256_storeu_ps (float* mem_addr, __m256 a) pure @system
3834 {
3835     // PERF DMD
3836     static if (GDC_with_AVX)
3837     {
3838         __builtin_ia32_storeups256(mem_addr, a);
3839     }
3840     else static if (LDC_with_optimizations)
3841     {
3842         storeUnaligned!__m256(a, mem_addr);
3843     }
3844     else
3845     {
3846         for(int n = 0; n < 8; ++n)
3847             mem_addr[n] = a.array[n];
3848     }
3849 }
3850 unittest
3851 {
3852     align(32) float[10] arr = [0.0f, 0, 0, 0, 0, 0, 0, 0, 0, 0];
3853     _mm256_storeu_ps(&arr[1], _mm256_set1_ps(4.0f));
3854     float[8] correct = [4.0f, 4, 4, 4, 4, 4, 4, 4];
3855     assert(arr[1..9] == correct);
3856 }
3857 
3858 
3859 /// Store 256-bits of integer data from `a` into memory. `mem_addr` does not need to be aligned
3860 ///  on any particular boundary.
3861 void _mm256_storeu_si256 (__m256i* mem_addr, __m256i a) pure @trusted
3862 {
3863     // PERF DMD
3864     static if (GDC_with_AVX)
3865     {
3866         __builtin_ia32_storedqu256(cast(char*)mem_addr, cast(ubyte32) a);
3867     }
3868     else static if (LDC_with_optimizations)
3869     {
3870         storeUnaligned!__m256i(a, cast(long*)mem_addr);
3871     }
3872     else
3873     {
3874         long4 v = cast(long4)a;
3875         long* p = cast(long*)mem_addr;
3876         for(int n = 0; n < 4; ++n)
3877             p[n] = v[n];
3878     }
3879 }
3880 unittest
3881 {
3882     align(32) long[6] arr = [0, 0, 0, 0, 0, 0];
3883     _mm256_storeu_si256( cast(__m256i*) &arr[1], _mm256_set1_epi64x(4));
3884     long[4] correct = [4, 4, 4, 4];
3885     assert(arr[1..5] == correct);
3886 }
3887 
3888 /// Store the high and low 128-bit halves (each composed of 4 packed single-precision (32-bit) 
3889 /// floating-point elements) from `a` into two different 128-bit memory locations. 
3890 /// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
3891 void _mm256_storeu2_m128 (float* hiaddr, float* loaddr, __m256 a) pure @system
3892 {
3893     // Much better codegen on GDC (and similarly on LDC) than going through other intrinsics
3894     loaddr[0] = a.array[0];
3895     loaddr[1] = a.array[1];
3896     loaddr[2] = a.array[2];
3897     loaddr[3] = a.array[3];
3898     hiaddr[0] = a.array[4];
3899     hiaddr[1] = a.array[5];
3900     hiaddr[2] = a.array[6];
3901     hiaddr[3] = a.array[7];
3902 }
3903 unittest
3904 {
3905     align(32) float[11] A = [0.0f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
3906     _mm256_storeu2_m128(&A[1], &A[6], _mm256_set1_ps(2.0f));
3907     float[11] correct     = [0.0f, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0];
3908     assert(A == correct);
3909 }
3910 
3911 /// Store the high and low 128-bit halves (each composed of 2 packed double-precision (64-bit)
3912 /// floating-point elements) from `a` into two different 128-bit memory locations. 
3913 /// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
3914 void _mm256_storeu2_m128d (double* hiaddr, double* loaddr, __m256d a) pure @system
3915 {
3916     loaddr[0] = a.array[0];
3917     loaddr[1] = a.array[1];
3918     hiaddr[0] = a.array[2];
3919     hiaddr[1] = a.array[3];
3920 }
3921 unittest
3922 {
3923     double[2] A;
3924     double[2] B;
3925     _mm256_storeu2_m128d(A.ptr, B.ptr, _mm256_set1_pd(-43.0));
3926     double[2] correct = [-43.0, -43];
3927     assert(A == correct);
3928     assert(B == correct);
3929 }
3930 
3931 /// Store the high and low 128-bit halves (each composed of integer data) from `a` into two 
3932 /// different 128-bit memory locations. 
3933 /// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
3934 void _mm256_storeu2_m128i (__m128i* hiaddr, __m128i* loaddr, __m256i a) pure @trusted
3935 {
3936     long* hi = cast(long*)hiaddr;
3937     long* lo = cast(long*)loaddr;
3938     lo[0] = a.array[0];
3939     lo[1] = a.array[1];
3940     hi[0] = a.array[2];
3941     hi[1] = a.array[3];
3942 }
3943 unittest
3944 {
3945     long[2] A;
3946     long[2] B;
3947     _mm256_storeu2_m128i(cast(__m128i*)A.ptr, cast(__m128i*)B.ptr, _mm256_set1_epi64x(-42));
3948     long[2] correct = [-42, -42];
3949     assert(A == correct);
3950     assert(B == correct);
3951 }
3952 
3953 /// Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from
3954 /// `a` into memory using a non-temporal memory hint. `mem_addr` must be aligned on a 32-byte 
3955 /// boundary or a general-protection exception may be generated.
3956 /// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
3957 void _mm256_stream_pd (double* mem_addr, __m256d a) pure @system
3958 {
3959     // PERF DMD
3960     // PERF GDC + SSE2
3961     static if (LDC_with_InlineIREx && LDC_with_optimizations)
3962     {
3963         enum prefix = `!0 = !{ i32 1 }`;
3964         enum ir = `
3965             store <4 x double> %1, <4 x double>* %0, align 32, !nontemporal !0
3966             ret void`;
3967         LDCInlineIREx!(prefix, ir, "", void, double4*, double4)(cast(double4*)mem_addr, a);
3968     }   
3969     else static if (GDC_with_AVX) // this builtin emits the non-temporal vmovntpd
3970     {
3971         __builtin_ia32_movntpd256 (mem_addr, a);
3972     }
3973     else
3974     {
3975         // Regular store instead.
3976         __m256d* dest = cast(__m256d*)mem_addr;
3977         *dest = a;
3978     }
3979 }
3980 unittest
3981 {
3982     align(32) double[4] mem;
3983     double[4] correct = [5.0, -6, -7, 8];
3984     _mm256_stream_pd(mem.ptr, _mm256_setr_pd(5.0, -6, -7, 8));
3985     assert(mem == correct);
3986 }
3987 
3988 /// Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from
3989 /// `a` into memory using a non-temporal memory hint. `mem_addr` must be aligned on a 32-byte 
3990 /// boundary or a general-protection exception may be generated.
3991 /// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
3992 void _mm256_stream_ps (float* mem_addr, __m256 a) pure @system
3993 {
3994     // PERF DMD
3995     // PERF GDC + SSE2
3996     static if (LDC_with_InlineIREx && LDC_with_optimizations)
3997     {
3998         enum prefix = `!0 = !{ i32 1 }`;
3999         enum ir = `
4000             store <8 x float> %1, <8 x float>* %0, align 32, !nontemporal !0
4001             ret void`;
4002         LDCInlineIREx!(prefix, ir, "", void, float8*, float8)(cast(float8*)mem_addr, a);
4003     }   
4004     else static if (GDC_with_AVX)
4005     {
4006         __builtin_ia32_movntps256 (mem_addr, a);
4007     }
4008     else
4009     {
4010         // Regular store instead.
4011         __m256* dest = cast(__m256*)mem_addr;
4012         *dest = a;
4013     }
4014 }
4015 unittest
4016 {
4017     align(32) float[8] mem;
4018     float[8] correct = [5, -6, -7, 8, 1, 2, 3, 4];
4019     _mm256_stream_ps(mem.ptr, _mm256_setr_ps(5, -6, -7, 8, 1, 2, 3, 4));
4020     assert(mem == correct);
4021 }
4022 
4023 /// Store 256-bits of integer data from `a` into memory using a non-temporal memory hint. 
4024 /// `mem_addr` must be aligned on a 32-byte boundary or a general-protection exception may be
4025 /// generated.
4026 /// Note: there isn't any particular instruction in AVX to do that. It just defers to SSE2.
4027 /// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
4028 void _mm256_stream_si256 (__m256i * mem_addr, __m256i a) pure @trusted
4029 {
4030     // PERF DMD
4031     // PERF GDC
4032     static if (LDC_with_InlineIREx && LDC_with_optimizations)
4033     {
4034         enum prefix = `!0 = !{ i32 1 }`;
4035         enum ir = `
4036             store <4 x i64> %1, <4 x i64>* %0, align 16, !nontemporal !0
4037             ret void`;
4038         LDCInlineIREx!(prefix, ir, "", void, long4*, long4)(mem_addr, a);
4039     }
4040     else static if (GDC_with_SSE2) // any hope to be non-temporal? Using SSE2 instructions.
4041     {
4042         long2 lo, hi;
4043         lo.ptr[0] = a.array[0];
4044         lo.ptr[1] = a.array[1];
4045         hi.ptr[0] = a.array[2];
4046         hi.ptr[1] = a.array[3];
4047         _mm_stream_si128(cast(__m128i*)mem_addr, cast(__m128i)lo);
4048         _mm_stream_si128((cast(__m128i*)mem_addr) + 1, cast(__m128i)hi);
4049     }
4050     else
4051     {
4052         // Regular store instead.
4053         __m256i* dest = cast(__m256i*)mem_addr;
4054         *dest = a;
4055     }
4056 }
4057 unittest
4058 {
4059     align(32) long[4] mem;
4060     long[4] correct = [5, -6, -7, 8];
4061     _mm256_stream_si256(cast(__m256i*)(mem.ptr), _mm256_setr_epi64x(5, -6, -7, 8));
4062     assert(mem == correct);
4063 }
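// Usage sketch (informative): the non-temporal stores above bypass the cache, so a writer thread
// should publish with `_mm_sfence` before signalling readers, as the notes above recommend.
// Assumes `_mm_sfence` from the SSE intrinsics is reachable from this module.
unittest
{
    align(32) long[4] buffer;
    _mm256_stream_si256(cast(__m256i*) buffer.ptr, _mm256_set1_epi64x(123));
    _mm_sfence(); // order the non-temporal store before any subsequent "data ready" flag store
    long[4] correct = [123, 123, 123, 123];
    assert(buffer == correct);
}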
4064 
4065 /// Subtract packed double-precision (64-bit) floating-point elements in `b` from 
4066 /// packed double-precision (64-bit) floating-point elements in `a`.
4067 __m256d _mm256_sub_pd (__m256d a, __m256d b) pure @safe
4068 {
4069     return a - b;
4070 }
4071 unittest
4072 {
4073     __m256d a = [1.5, -2.0, 3.0, 200000.0];
4074     a = _mm256_sub_pd(a, a);
4075     double[4] correct = [0.0, 0, 0, 0];
4076     assert(a.array == correct);
4077 }
4078 
4079 /// Subtract packed single-precision (32-bit) floating-point elements in `b` from 
4080 /// packed single-precision (32-bit) floating-point elements in `a`.
4081 __m256 _mm256_sub_ps (__m256 a, __m256 b) pure @safe
4082 {
4083     return a - b;
4084 }
4085 unittest
4086 {
4087     __m256 a = [1.5f, -2.0f, 3.0f, 1.0f, 1.5f, -2000.0f, 3.0f, 1.0f];
4088     a = _mm256_sub_ps(a, a);
4089     float[8] correct = [0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f];
4090     assert(a.array == correct);
4091 }
4092 
4093 /// Compute the bitwise NOT of `a` and then AND with `b`, producing an intermediate value, and 
4094 /// return 1 if the sign bit of each 64-bit element in the intermediate value is zero, 
4095 /// otherwise return 0.
4096 int _mm_testc_pd (__m128d a, __m128d b) pure @trusted
4097 {
4098     static if (GDC_or_LDC_with_AVX)
4099     {
4100         return __builtin_ia32_vtestcpd(a, b);
4101     }
4102     else
4103     {
4104         // PERF: maybe do the generic version more like simde
4105         long2 la = cast(long2)a;
4106         long2 lb = cast(long2)b;
4107         long2 r = ~la & lb;
4108         return r.array[0] >= 0 && r.array[1] >= 0;
4109     }
4110 }
4111 unittest
4112 {
4113     __m128d A  = _mm_setr_pd(-1, 1);
4114     __m128d B = _mm_setr_pd(-1, -1);
4115     __m128d C = _mm_setr_pd(1, -1);
4116     assert(_mm_testc_pd(A, A) == 1);
4117     assert(_mm_testc_pd(A, B) == 0);
4118     assert(_mm_testc_pd(B, A) == 1);
4119 }
4120 
4121 ///ditto
4122 int _mm256_testc_pd (__m256d a, __m256d b) pure @safe
4123 {
4124     static if (GDC_or_LDC_with_AVX)
4125     {
4126         return __builtin_ia32_vtestcpd256(a, b);
4127     }
4128     else static if (LDC_with_ARM64)
4129     {
4130         // better to split than do vanilla (down to 10 inst)
4131         __m128d lo_a = _mm256_extractf128_pd!0(a);
4132         __m128d lo_b = _mm256_extractf128_pd!0(b);
4133         __m128d hi_a = _mm256_extractf128_pd!1(a);
4134         __m128d hi_b = _mm256_extractf128_pd!1(b);
4135         return _mm_testc_pd(lo_a, lo_b) & _mm_testc_pd(hi_a, hi_b);
4136     }
4137     else
4138     {
4139         // PERF: do the generic version more like simde, maybe this gets rid of the arm64 version
4140         long4 la = cast(long4)a;
4141         long4 lb = cast(long4)b;
4142         long4 r = ~la & lb;
4143         return r.array[0] >= 0 && r.array[1] >= 0 && r.array[2] >= 0 && r.array[3] >= 0;
4144     }
4145 }
4146 unittest
4147 {
4148     __m256d A = _mm256_setr_pd(-1, 1, -1, 1);
4149     __m256d B = _mm256_setr_pd(-1, -1, -1, -1);
4150     __m256d C = _mm256_setr_pd(1, -1, 1, -1);
4151     assert(_mm256_testc_pd(A, A) == 1);
4152     assert(_mm256_testc_pd(A, B) == 0);
4153     assert(_mm256_testc_pd(B, A) == 1);
4154 }
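// Usage sketch (informative): `_mm256_testc_pd(a, b) == 1` exactly when every lane whose sign bit
// is set in `b` also has its sign bit set in `a` (sign-bit containment).
unittest
{
    __m256d a = _mm256_setr_pd(-1.0, -2.0, 3.0, 4.0);
    __m256d b = _mm256_setr_pd(-5.0,  6.0, 7.0, 8.0); // negative lanes of b are a subset of those of a
    assert(_mm256_testc_pd(a, b) == 1);
    __m256d c = _mm256_setr_pd( 1.0, -2.0, 3.0, 4.0); // lane 0 is negative in b but not in c
    assert(_mm256_testc_pd(c, b) == 0);
}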
4155 
4156 /// Compute the bitwise NOT of `a` and then AND with `b`, producing an intermediate value, and 
4157 /// return 1 if the sign bit of each 32-bit element in the intermediate value is zero, 
4158 /// otherwise return 0.
4159 int _mm_testc_ps (__m128 a, __m128 b) pure @safe
4160 {
4161     // PERF DMD
4162     static if (GDC_or_LDC_with_AVX)
4163     {
4164         return __builtin_ia32_vtestcps(a, b);
4165     }   
4166     else static if (LDC_with_ARM64)
4167     {
4168         int4 la = cast(int4)a;
4169         int4 lb = cast(int4)b;
4170         int4 r = ~la & lb;
4171         int4 shift;
4172         shift = 31;
4173         r >>= shift;
4174         int[4] zero = [0, 0, 0, 0];
4175         return r.array == zero;
4176     }
4177     else
4178     {
4179         // PERF: do the generic version more like simde, maybe this gets rid of the arm64 version
4180         int4 la = cast(int4)a;
4181         int4 lb = cast(int4)b;
4182         int4 r = ~la & lb;
4183         return r.array[0] >= 0 && r.array[1] >= 0 && r.array[2] >= 0 && r.array[3] >= 0;
4184     }
4185 }
4186 unittest
4187 {
4188     __m128 A = _mm_setr_ps(-1, 1, -1, 1);
4189     __m128 B = _mm_setr_ps(-1, -1, -1, -1);
4190     __m128 C = _mm_setr_ps(1, -1, 1, -1);
4191     assert(_mm_testc_ps(A, A) == 1);
4192     assert(_mm_testc_ps(A, B) == 0);
4193     assert(_mm_testc_ps(B, A) == 1);
4194 }
4195 
4196 ///ditto
4197 int _mm256_testc_ps (__m256 a, __m256 b) pure @safe
4198 {
4199     // PERF DMD
4200     static if (GDC_or_LDC_with_AVX)
4201     {
4202         return __builtin_ia32_vtestcps256(a, b);
4203     }
4204     else static if (LDC_with_ARM64)
4205     {
4206         int8 la = cast(int8)a;
4207         int8 lb = cast(int8)b;
4208         int8 r = ~la & lb;
4209         int8 shift;
4210         shift = 31;
4211         r >>= shift;
4212         int[8] zero = [0, 0, 0, 0, 0, 0, 0, 0];
4213         return r.array == zero;
4214     }
4215     else
4216     {
4217         // PERF: do the generic version more like simde, maybe this gets rid of the arm64 version
4218         int8 la = cast(int8)a;
4219         int8 lb = cast(int8)b;
4220         int8 r = ~la & lb;
4221         return r.array[0] >= 0 
4222             && r.array[1] >= 0
4223             && r.array[2] >= 0
4224             && r.array[3] >= 0
4225             && r.array[4] >= 0
4226             && r.array[5] >= 0
4227             && r.array[6] >= 0
4228             && r.array[7] >= 0;
4229     }
4230 }
4231 unittest
4232 {
4233     __m256 A = _mm256_setr_ps(-1,  1, -1,  1, -1,  1, -1,  1);
4234     __m256 B = _mm256_setr_ps(-1, -1, -1, -1, -1, -1, -1, -1);
4235     __m256 C = _mm256_setr_ps( 1, -1,  1, -1,  1,  1,  1,  1);
4236     assert(_mm256_testc_ps(A, A) == 1);
4237     assert(_mm256_testc_ps(B, B) == 1);
4238     assert(_mm256_testc_ps(A, B) == 0);
4239     assert(_mm256_testc_ps(B, A) == 1);
4240     assert(_mm256_testc_ps(C, B) == 0);
4241     assert(_mm256_testc_ps(B, C) == 1);
4242 }
4243 
4244 /// Compute the bitwise NOT of `a` and then AND with `b`, and return 1 if the result is zero,
4245 /// otherwise return 0.
4246 /// In other words, test if all bits masked by `b` are also 1 in `a`.
4247 int _mm256_testc_si256 (__m256i a, __m256i b) pure @trusted
4248 {
4249     static if (GDC_or_LDC_with_AVX)
4250     {
4251         return __builtin_ia32_ptestc256(cast(long4)a, cast(long4)b);
4252     }
4253     else static if (LDC_with_ARM64)
4254     {
4255         // better to split than do vanilla (down to 10 inst)
4256         __m128i lo_a = _mm256_extractf128_si256!0(a);
4257         __m128i lo_b = _mm256_extractf128_si256!0(b);
4258         __m128i hi_a = _mm256_extractf128_si256!1(a);
4259         __m128i hi_b = _mm256_extractf128_si256!1(b);
4260         return _mm_testc_si128(lo_a, lo_b) & _mm_testc_si128(hi_a, hi_b);
4261     }
4262     else
4263     {
4264         __m256i c = ~a & b;
4265         long[4] zero = [0, 0, 0, 0];
4266         return c.array == zero;
4267     }
4268 }
4269 unittest
4270 {
4271     __m256i A  = _mm256_setr_epi64(0x01, 0x02, 0x04, 0xf8);
4272     __m256i M1 = _mm256_setr_epi64(0xfe, 0xfd, 0x00, 0x00);
4273     __m256i M2 = _mm256_setr_epi64(0x00, 0x00, 0x04, 0x00);
4274     assert(_mm256_testc_si256(A, A) == 1);
4275     assert(_mm256_testc_si256(A, M1) == 0);
4276     assert(_mm256_testc_si256(A, M2) == 1);
4277 }
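// Usage sketch (informative): with a bit mask in `b`, `_mm256_testc_si256(a, b) == 1` means every
// bit set in the mask is also set in `a`, e.g. "are all required flags present?".
unittest
{
    __m256i flags    = _mm256_setr_epi64x(0xff, 0x0f, 0, 0);
    __m256i required = _mm256_setr_epi64x(0x0f, 0x03, 0, 0);
    __m256i extra    = _mm256_setr_epi64x(0x100, 0, 0, 0);
    assert(_mm256_testc_si256(flags, required) == 1); // all required bits present
    assert(_mm256_testc_si256(flags, extra) == 0);    // bit 8 missing from flags
}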
4278 
4279 /// Compute the bitwise AND of 128 bits (representing double-precision (64-bit) floating-point 
4280 /// elements) in `a` and `b`, producing an intermediate 128-bit value, and set ZF to 1 if the 
4281 /// sign bit of each 64-bit element in the intermediate value is zero, otherwise set ZF to 0. 
4282 /// Compute the bitwise NOT of a and then AND with b, producing an intermediate value, and set
4283 /// CF to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise
4284 /// set CF to 0. Return 1 if both the ZF and CF values are zero, otherwise return 0.
4285 ///
4286 /// In other words: there is at least one negative number in `b` that corresponds to a positive number in `a`,
4287 ///             AND there is at least one negative number in `b` that corresponds to a negative number in `a`.
4288 int _mm_testnzc_pd (__m128d a, __m128d b) pure @safe
4289 {
4290     // PERF DMD
4291     static if (GDC_or_LDC_with_AVX)
4292     {
4293         return __builtin_ia32_vtestnzcpd(a, b);
4294     }
4295     else
4296     {
4297         // ZF = 0 means "there is at least one pair of negative numbers"
4298         // ZF = 1 means "no pairs of negative numbers"
4299         // CF = 0 means "there is a negative number in b that is next to a positive number in a"
4300         // CF = 1 means "all negative numbers in b are also negative in a"
4301         // Consequently, CF = 0 and ZF = 0 means:
4302         //   "There is a pair of matching negative numbers in a and b, 
4303         //   AND also there is a negative number in b, that is matching a positive number in a"
4304         // Phew.
4305 
4306         // courtesy of simd-everywhere
4307         __m128i m = _mm_and_si128(cast(__m128i)a, cast(__m128i)b);
4308         __m128i m2 = _mm_andnot_si128(cast(__m128i)a, cast(__m128i)b);
4309         m = _mm_srli_epi64(m, 63);
4310         m2 = _mm_srli_epi64(m2, 63);
4311         return cast(int)( m.array[0] | m.array[2]) & (m2.array[0] | m2.array[2]);
4312     }
4313 }
4314 unittest
4315 {
4316     __m128d PM = _mm_setr_pd( 1, -1);
4317     __m128d MP = _mm_setr_pd(-1,  1);
4318     __m128d MM = _mm_setr_pd(-1, -1);
4319     assert(_mm_testnzc_pd(PM, MP) == 0);
4320     assert(_mm_testnzc_pd(PM, MM) == 1);
4321     assert(_mm_testnzc_pd(MP, MP) == 0);
4322     assert(_mm_testnzc_pd(MP, MM) == 1);
4323     assert(_mm_testnzc_pd(MM, MM) == 0);
4324 }
4325 
4326 /// Compute the bitwise AND of 256 bits (representing double-precision (64-bit) floating-point 
4327 /// elements) in `a` and `b`, producing an intermediate 256-bit value, and set ZF to 1 if the 
4328 /// sign bit of each 64-bit element in the intermediate value is zero, otherwise set ZF to 0. 
4329 /// Compute the bitwise NOT of a and then AND with b, producing an intermediate value, and set
4330 /// CF to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise
4331 /// set CF to 0. Return 1 if both the ZF and CF values are zero, otherwise return 0.
4332 ///
4333 /// In other words: there is at least one negative number in `b` that corresponds to a positive number in `a`,
4334 ///             AND there is at least one negative number in `b` that corresponds to a negative number in `a`.
4335 int _mm256_testnzc_pd (__m256d a, __m256d b) pure @safe
4336 {
4337     // PERF DMD
4338     static if (GDC_or_LDC_with_AVX)
4339     {
4340         return __builtin_ia32_vtestnzcpd256(a, b);
4341     }
4342     else
4343     {
4344         long4 la = cast(long4)a;
4345         long4 lb = cast(long4)b;
4346         long4 r = la & lb;
4347         long m = r.array[0] | r.array[1] | r.array[2] | r.array[3];
4348         int ZF = (~m >> 63) & 1;
4349         long4 r2 = ~la & lb;
4350         long m2 = r2.array[0] | r2.array[1] | r2.array[2] | r2.array[3];
4351         int CF = (~m2 >> 63) & 1;
4352         return (CF | ZF) == 0;
4353     }
4354 }
4355 unittest
4356 {
4357     __m256d PM = _mm256_setr_pd( 1, -1, 1, 1);
4358     __m256d MP = _mm256_setr_pd(-1,  1, 1, 1);
4359     __m256d MM = _mm256_setr_pd(-1, -1, 1, 1);
4360     assert(_mm256_testnzc_pd(PM, MP) == 0);
4361     assert(_mm256_testnzc_pd(PM, MM) == 1);
4362     assert(_mm256_testnzc_pd(MP, MP) == 0);
4363     assert(_mm256_testnzc_pd(MP, MM) == 1);
4364     assert(_mm256_testnzc_pd(MM, MM) == 0);
4365 }
4366 
4367 /// Compute the bitwise AND of 128 bits (representing single-precision (32-bit) floating-point 
4368 /// elements) in `a` and `b`, producing an intermediate 128-bit value, and set ZF to 1 if the 
4369 /// sign bit of each 32-bit element in the intermediate value is zero, otherwise set ZF to 0. 
4370 /// Compute the bitwise NOT of a and then AND with b, producing an intermediate value, and set
4371 /// CF to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise
4372 /// set CF to 0. Return 1 if both the ZF and CF values are zero, otherwise return 0.
4373 ///
4374 /// In other words: there is at least one negative number in `b` that corresponds to a positive number in `a`,
4375 ///             AND there is at least one negative number in `b` that corresponds to a negative number in `a`.
4376 int _mm_testnzc_ps (__m128 a, __m128 b) pure @safe
4377 {
4378     // PERF DMD
4379     static if (GDC_or_LDC_with_AVX)
4380     {
4381         return __builtin_ia32_vtestnzcps(a, b);
4382     }
4383     else
4384     {
4385         int4 la = cast(int4)a;
4386         int4 lb = cast(int4)b;
4387         int4 r = la & lb;
4388         int m = r.array[0] | r.array[1] | r.array[2] | r.array[3];
4389         int ZF = (~m >> 31) & 1;
4390         int4 r2 = ~la & lb;
4391         int m2 = r2.array[0] | r2.array[1] | r2.array[2] | r2.array[3];
4392         int CF = (~m2 >> 31) & 1;
4393         return (CF | ZF) == 0;
4394     }
4395 }
4396 unittest
4397 {
4398     __m128 PM = _mm_setr_ps( 1, -1, 1, 1);
4399     __m128 MP = _mm_setr_ps(-1,  1, 1, 1);
4400     __m128 MM = _mm_setr_ps(-1, -1, 1, 1);
4401     assert(_mm_testnzc_ps(PM, MP) == 0);
4402     assert(_mm_testnzc_ps(PM, MM) == 1);
4403     assert(_mm_testnzc_ps(MP, MP) == 0);
4404     assert(_mm_testnzc_ps(MP, MM) == 1);
4405     assert(_mm_testnzc_ps(MM, MM) == 0);
4406 }
4407 
4408 /// Compute the bitwise AND of 256 bits (representing single-precision (32-bit) floating-point 
4409 /// elements) in `a` and `b`, producing an intermediate 256-bit value, and set ZF to 1 if the 
4410 /// sign bit of each 32-bit element in the intermediate value is zero, otherwise set ZF to 0. 
4411 /// Compute the bitwise NOT of a and then AND with b, producing an intermediate value, and set
4412 /// CF to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise
4413 /// set CF to 0. Return 1 if both the ZF and CF values are zero, otherwise return 0.
4414 ///
4415 /// In other words: there is at least one negative number in `b` that corresponds to a positive number in `a`,
4416 ///             AND there is at least one negative number in `b` that corresponds to a negative number in `a`.
4417 int _mm256_testnzc_ps (__m256 a, __m256 b) pure @safe
4418 {
4419     // PERF DMD
4420     static if (GDC_or_LDC_with_AVX)
4421     {
4422         return __builtin_ia32_vtestnzcps256(a, b);
4423     }
4424     else
4425     {
4426         int8 la = cast(int8)a;
4427         int8 lb = cast(int8)b;
4428         int8 r = la & lb;
4429         int m = r.array[0] | r.array[1] | r.array[2] | r.array[3]
4430             |   r.array[4] | r.array[5] | r.array[6] | r.array[7];
4431         int ZF = (~m >> 31) & 1;
4432         int8 r2 = ~la & lb;
4433         int m2 = r2.array[0] | r2.array[1] | r2.array[2] | r2.array[3]
4434                | r2.array[4] | r2.array[5] | r2.array[6] | r2.array[7];
4435         int CF = (~m2 >> 31) & 1;
4436         return (CF | ZF) == 0;
4437     }
4438 }
4439 unittest
4440 {
4441     __m256 PM = _mm256_setr_ps(1, 1, 1, 1,  1, -1, 1, 1);
4442     __m256 MP = _mm256_setr_ps(1, 1, 1, 1, -1,  1, 1, 1);
4443     __m256 MM = _mm256_setr_ps(1, 1, 1, 1, -1, -1, 1, 1);
4444     assert(_mm256_testnzc_ps(PM, MP) == 0);
4445     assert(_mm256_testnzc_ps(PM, MM) == 1);
4446     assert(_mm256_testnzc_ps(MP, MP) == 0);
4447     assert(_mm256_testnzc_ps(MP, MM) == 1);
4448     assert(_mm256_testnzc_ps(MM, MM) == 0);
4449 }
4450 
4451 /// Compute the bitwise AND of 256 bits (representing integer data) in `a` and `b`, 
4452 /// and set ZF to 1 if the result is zero, otherwise set ZF to 0. 
4453 /// Compute the bitwise NOT of `a` and then AND with `b`, and set CF to 1 if the 
4454 /// result is zero, otherwise set CF to 0. 
4455 /// Return 1 if both the ZF and CF values are zero, otherwise return 0.
4456 int _mm256_testnzc_si256 (__m256i a, __m256i b) pure @trusted
4457 {
4458     // PERF ARM64
4459     // PERF DMD
4460     // PERF LDC without AVX
4461     static if (GDC_or_LDC_with_AVX)
4462     {
4463         return __builtin_ia32_ptestnzc256(cast(long4) a, cast(long4) b);
4464     }
4465     else
4466     {
4467         // Need to defer to _mm_testnzc_si128 if possible, for more speed
4468         __m256i c = a & b;
4469         __m256i d = ~a & b;
4470         long m = c.array[0] | c.array[1] | c.array[2] | c.array[3];
4471         long n = d.array[0] | d.array[1] | d.array[2] | d.array[3];
4472         return (m != 0) & (n != 0);
4473     }
4474 }
4475 unittest
4476 {
4477     __m256i A  = _mm256_setr_epi32(0x01, 0x02, 0x04, 0xf8, 0, 0, 0, 0);
4478     __m256i M  = _mm256_setr_epi32(0x01, 0x40, 0x00, 0x00, 0, 0, 0, 0);
4479     __m256i Z = _mm256_setzero_si256();
4480     assert(_mm256_testnzc_si256(A, Z) == 0);
4481     assert(_mm256_testnzc_si256(A, M) == 1);
4482     assert(_mm256_testnzc_si256(A, A) == 0);
4483 }
4484 
4485 /// Compute the bitwise AND of 128 bits (representing double-precision (64-bit) floating-point 
4486 /// elements) in `a` and `b`, producing an intermediate 128-bit value, and return 1 if the sign bit of
4487 /// each 64-bit element in the intermediate value is zero, otherwise return 0.
4488 /// In other words, return 1 if `a` and `b` don't both have a negative number in the same lane.
4489 int _mm_testz_pd (__m128d a, __m128d b) pure @trusted
4490 {
4491     static if (GDC_or_LDC_with_AVX)
4492     {
4493         return __builtin_ia32_vtestzpd(a, b);
4494     }
4495     else
4496     {
4497         long2 la = cast(long2)a;
4498         long2 lb = cast(long2)b;
4499         long2 r = la & lb;
4500         long m = r.array[0] | r.array[1];
4501         return (~m >> 63) & 1;
4502     }
4503 }
4504 unittest
4505 {
4506     __m128d A  = _mm_setr_pd(-1, 1);
4507     __m128d B = _mm_setr_pd(-1, -1);
4508     __m128d C = _mm_setr_pd(1, -1);
4509     assert(_mm_testz_pd(A, A) == 0);
4510     assert(_mm_testz_pd(A, B) == 0);
4511     assert(_mm_testz_pd(C, A) == 1);
4512 }
4513 
4514 /// Compute the bitwise AND of 256 bits (representing double-precision (64-bit) floating-point 
4515 /// elements) in `a` and `b`, producing an intermediate 256-bit value, and return 1 if the sign bit of
4516 /// each 64-bit element in the intermediate value is zero, otherwise return 0.
4517 /// In other words, return 1 if `a` and `b` don't both have a negative number in the same lane.
4518 int _mm256_testz_pd (__m256d a, __m256d b) pure @trusted
4519 {
4520     static if (GDC_or_LDC_with_AVX)
4521     {
4522         return __builtin_ia32_vtestzpd256(a, b);
4523     }
4524     else
4525     {
4526         long4 la = cast(long4)a;
4527         long4 lb = cast(long4)b;
4528         long4 r = la & lb;
4529         long r2 = r.array[0] | r.array[1] | r.array[2] | r.array[3];
4530         return (~r2 >> 63) & 1;
4531     }
4532 }
4533 unittest
4534 {
4535     __m256d A = _mm256_setr_pd(-1, 1, -1, 1);
4536     __m256d B = _mm256_setr_pd(1,  1, -1, 1);
4537     __m256d C = _mm256_setr_pd(1, -1, 1, -1);
4538     assert(_mm256_testz_pd(A, A) == 0);
4539     assert(_mm256_testz_pd(A, B) == 0);
4540     assert(_mm256_testz_pd(C, A) == 1);
4541 }
4542 
4543 /// Compute the bitwise AND of 128 bits (representing single-precision (32-bit) floating-point 
4544 /// elements) in `a` and `b`, producing an intermediate 128-bit value, and return 1 if the sign bit of
4545 /// each 32-bit element in the intermediate value is zero, otherwise return 0.
4546 /// In other words, return 1 if `a` and `b` don't both have a negative number in the same lane.
4547 int _mm_testz_ps (__m128 a, __m128 b) pure @safe
4548 {
4549     // PERF DMD
4550     static if (GDC_or_LDC_with_AVX)
4551     {
4552         return __builtin_ia32_vtestzps(a, b);
4553     }
4554     else
4555     {
4556         int4 la = cast(int4)a;
4557         int4 lb = cast(int4)b;
4558         int4 r = la & lb;
4559         int m = r.array[0] | r.array[1] | r.array[2] | r.array[3];
4560         return (~m >> 31) & 1;
4561     }
4562 }
4563 unittest
4564 {
4565     __m128 A = _mm_setr_ps(-1,  1, -1,  1);
4566     __m128 B = _mm_setr_ps( 1,  1, -1,  1);
4567     __m128 C = _mm_setr_ps( 1, -1,  1, -1);
4568     assert(_mm_testz_ps(A, A) == 0);
4569     assert(_mm_testz_ps(A, B) == 0);
4570     assert(_mm_testz_ps(C, A) == 1);
4571     assert(_mm_testz_ps(C, B) == 1);
4572 }
4573 
4574 /// Compute the bitwise AND of 256 bits (representing single-precision (32-bit) floating-point 
4575 /// elements) in `a` and `b`, producing an intermediate 256-bit value, and return 1 if the sign bit of
4576 /// each 32-bit element in the intermediate value is zero, otherwise return 0.
4577 /// In other words, return 1 if `a` and `b` don't both have a negative number in the same lane.
4578 int _mm256_testz_ps (__m256 a, __m256 b) pure @safe
4579 {
4580     // PERF DMD
4581     static if (GDC_or_LDC_with_AVX)
4582     {
4583         return __builtin_ia32_vtestzps256(a, b);
4584     }
4585     else
4586     {
4587         int8 la = cast(int8)a;
4588         int8 lb = cast(int8)b;
4589         int8 r = la & lb;
4590         int m = r.array[0] | r.array[1] | r.array[2] | r.array[3]
4591             | r.array[4] | r.array[5] | r.array[6] | r.array[7];
4592         return (~m >> 31) & 1;
4593     }
4594 }
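// Added test (informative), mirroring the `_mm_testz_ps` cases above at 256-bit width.
unittest
{
    __m256 A = _mm256_setr_ps(-1,  1, -1,  1, -1,  1, -1,  1);
    __m256 B = _mm256_setr_ps( 1,  1, -1,  1,  1,  1, -1,  1);
    __m256 C = _mm256_setr_ps( 1, -1,  1, -1,  1, -1,  1, -1);
    assert(_mm256_testz_ps(A, A) == 0);
    assert(_mm256_testz_ps(A, B) == 0);
    assert(_mm256_testz_ps(C, A) == 1);
    assert(_mm256_testz_ps(C, B) == 1);
}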
4595 
4596 /// Compute the bitwise AND of 256 bits (representing integer data) in `a` and `b`,
4597 /// and return 1 if the result is zero, otherwise return 0.
4598 /// In other words, test if all bits masked by `b` are 0 in `a`.
4599 int _mm256_testz_si256 (__m256i a, __m256i b) @trusted
4600 {
4601     // PERF DMD
4602     static if (GDC_with_AVX)
4603     {
4604         return __builtin_ia32_ptestz256(cast(long4)a, cast(long4)b);
4605     }
4606     else static if (LDC_with_AVX)
4607     {
4608         return __builtin_ia32_ptestz256(cast(long4)a, cast(long4)b);
4609     }
    else version(LDC)
    {
        // Better to split into two 128-bit tests than use the generic path (down to 8 instructions on arm64).
        __m128i lo_a = _mm256_extractf128_si256!0(a);
        __m128i lo_b = _mm256_extractf128_si256!0(b);
        __m128i hi_a = _mm256_extractf128_si256!1(a);
        __m128i hi_b = _mm256_extractf128_si256!1(b);
        return _mm_testz_si128(lo_a, lo_b) & _mm_testz_si128(hi_a, hi_b);
    }
    else
    {
        __m256i c = a & b;
        long[4] zero = [0, 0, 0, 0];
        return c.array == zero;
    }
}
unittest
{
    __m256i A  = _mm256_setr_epi32(0x01, 0x02, 0x04, 0xf8, 0x01, 0x02, 0x04, 0xf8);
    __m256i M1 = _mm256_setr_epi32(0xfe, 0xfd, 0x00, 0x07, 0xfe, 0xfd, 0x00, 0x07);
    __m256i M2 = _mm256_setr_epi32(0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00);
    assert(_mm256_testz_si256(A, A) == 0);
    assert(_mm256_testz_si256(A, M1) == 1);
    assert(_mm256_testz_si256(A, M2) == 0);
}

/// Return vector of type __m256d with undefined elements.
__m256d _mm256_undefined_pd () pure @safe
{
    __m256d r = void;
    return r;
}

/// Return vector of type __m256 with undefined elements.
__m256 _mm256_undefined_ps () pure @safe
{
    __m256 r = void;
    return r;
}

/// Return vector of type __m256i with undefined elements.
__m256i _mm256_undefined_si256 () pure @safe
{
    __m256i r = void;
    return r;
}

/// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of
/// each 128-bit lane in `a` and `b`.
__m256d _mm256_unpackhi_pd (__m256d a, __m256d b) pure @trusted
{
    static if (LDC_with_optimizations)
    {
        enum ir = `%r = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
                   ret <4 x double> %r`;
        return LDCInlineIR!(ir, double4, double4, double4)(a, b);
    }
    else static if (GDC_with_AVX)
    {
        return __builtin_ia32_unpckhpd256 (a, b);
    }
    else
    {
        __m256d r;
        r.ptr[0] = a.array[1];
        r.ptr[1] = b.array[1];
        r.ptr[2] = a.array[3];
        r.ptr[3] = b.array[3];
        return r;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(1.0, 2, 3, 4);
    __m256d B = _mm256_setr_pd(5.0, 6, 7, 8);
    __m256d C = _mm256_unpackhi_pd(A, B);
    double[4] correct =       [2.0, 6, 4, 8];
    assert(C.array == correct);
}

/// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of
/// each 128-bit lane in `a` and `b`.
__m256 _mm256_unpackhi_ps (__m256 a, __m256 b) pure @trusted
{
    static if (LDC_with_optimizations)
    {
        enum ir = `%r = shufflevector <8 x float> %0, <8 x float> %1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
            ret <8 x float> %r`;
        return LDCInlineIR!(ir, float8, float8, float8)(a, b);
    }
    else static if (GDC_with_AVX)
    {
        return __builtin_ia32_unpckhps256 (a, b);
    }
    else
    {
        __m256 r;
        r.ptr[0] = a.array[2];
        r.ptr[1] = b.array[2];
        r.ptr[2] = a.array[3];
        r.ptr[3] = b.array[3];
        r.ptr[4] = a.array[6];
        r.ptr[5] = b.array[6];
        r.ptr[6] = a.array[7];
        r.ptr[7] = b.array[7];
        return r;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(0.0f,  1,  2,  3,  4,  5,  6,  7);
    __m256 B = _mm256_setr_ps(8.0f,  9, 10, 11, 12, 13, 14, 15);
    __m256 C = _mm256_unpackhi_ps(A, B);
    float[8] correct =       [2.0f, 10,  3, 11,  6, 14,  7, 15];
    assert(C.array == correct);
}

/// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of
/// each 128-bit lane in `a` and `b`.
__m256d _mm256_unpacklo_pd (__m256d a, __m256d b)
{
    static if (LDC_with_optimizations)
    {
        enum ir = `%r = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
            ret <4 x double> %r`;
        return LDCInlineIR!(ir, double4, double4, double4)(a, b);
    }
    else static if (GDC_with_AVX)
    {
        return __builtin_ia32_unpcklpd256 (a, b);
    }
    else
    {
        __m256d r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = b.array[0];
        r.ptr[2] = a.array[2];
        r.ptr[3] = b.array[2];
        return r;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(1.0, 2, 3, 4);
    __m256d B = _mm256_setr_pd(5.0, 6, 7, 8);
    __m256d C = _mm256_unpacklo_pd(A, B);
    double[4] correct =       [1.0, 5, 3, 7];
    assert(C.array == correct);
}

/// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of
/// each 128-bit lane in `a` and `b`.
__m256 _mm256_unpacklo_ps (__m256 a, __m256 b)
{
    static if (LDC_with_optimizations)
    {
        enum ir = `%r = shufflevector <8 x float> %0, <8 x float> %1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
            ret <8 x float> %r`;
        return LDCInlineIR!(ir, float8, float8, float8)(a, b);
    }
    else static if (GDC_with_AVX)
    {
        return __builtin_ia32_unpcklps256 (a, b);
    }
    else
    {
        __m256 r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = b.array[0];
        r.ptr[2] = a.array[1];
        r.ptr[3] = b.array[1];
        r.ptr[4] = a.array[4];
        r.ptr[5] = b.array[4];
        r.ptr[6] = a.array[5];
        r.ptr[7] = b.array[5];
        return r;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(0.0f,  1,  2,  3,  4,  5,  6,  7);
    __m256 B = _mm256_setr_ps(8.0f,  9, 10, 11, 12, 13, 14, 15);
    __m256 C = _mm256_unpacklo_ps(A, B);
    float[8] correct =       [0.0f,  8,  1,  9,  4, 12,  5, 13];
    assert(C.array == correct);
}

/// Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m256d _mm256_xor_pd (__m256d a, __m256d b) pure @safe
{
    return cast(__m256d)( cast(__m256i)a ^ cast(__m256i)b );
}
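// Example (illustrative sketch, added here): XOR against a sign-bit mask negates every lane,
// a common use of bitwise XOR on floating-point data.
unittest
{
    __m256d A = _mm256_setr_pd(1.0, -2.0, 3.0, -4.0);
    __m256d signbits = _mm256_setr_pd(-0.0, -0.0, -0.0, -0.0);
    __m256d C = _mm256_xor_pd(A, signbits);
    double[4] correct = [-1.0, 2.0, -3.0, 4.0];
    assert(C.array == correct);
}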

/// Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in `a` and `b`.
__m256 _mm256_xor_ps (__m256 a, __m256 b) pure @safe
{
    return cast(__m256)( cast(__m256i)a ^ cast(__m256i)b );
}
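// Example (illustrative sketch, added here): the same sign-flip trick on 32-bit lanes.
unittest
{
    __m256 A = _mm256_setr_ps(1.0f, -2.0f, 3.0f, -4.0f, 5.0f, -6.0f, 7.0f, -8.0f);
    __m256 signbits = _mm256_setr_ps(-0.0f, -0.0f, -0.0f, -0.0f, -0.0f, -0.0f, -0.0f, -0.0f);
    __m256 C = _mm256_xor_ps(A, signbits);
    float[8] correct = [-1.0f, 2.0f, -3.0f, 4.0f, -5.0f, 6.0f, -7.0f, 8.0f];
    assert(C.array == correct);
}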

/// Zero the contents of all XMM or YMM registers.
void _mm256_zeroall () pure @safe
{
    // PERF DMD needs to do it explicitly if AVX is ever used one day.

    static if (GDC_with_AVX)
    {
        __builtin_ia32_vzeroall();
    }
    else
    {
        // Do nothing. The transition penalties are supposed to be handled by the backend (eg: LLVM).
    }
}

/// Zero the upper 128 bits of all YMM registers; the lower 128 bits are left unmodified.
void _mm256_zeroupper () pure @safe
{
    // PERF DMD needs to do it explicitly if AVX is ever used.

    static if (GDC_with_AVX)
    {
        __builtin_ia32_vzeroupper();
    }
    else
    {
        // Do nothing. The transition penalties are supposed to be handled by the backend (eg: LLVM).
    }
}

/// Cast vector of type `__m128d` to type `__m256d`; the upper 128 bits of the result are zeroed.
__m256d _mm256_zextpd128_pd256 (__m128d a) pure @trusted
{
    __m256d r;
    r.ptr[0] = a.array[0];
    r.ptr[1] = a.array[1];
    r.ptr[2] = 0;
    r.ptr[3] = 0;
    return r;
}
unittest
{
    __m256d R = _mm256_zextpd128_pd256(_mm_setr_pd(2.0, -3.0));
    double[4] correct = [2.0, -3, 0, 0];
    assert(R.array == correct);
}

/// Cast vector of type `__m128` to type `__m256`; the upper 128 bits of the result are zeroed.
__m256 _mm256_zextps128_ps256 (__m128 a) pure @trusted
{
    double2 la = cast(double2)a;
    double4 r;
    r.ptr[0] = la.array[0];
    r.ptr[1] = la.array[1];
    r.ptr[2] = 0;
    r.ptr[3] = 0;
    return cast(__m256)r;
}
unittest
{
    __m256 R = _mm256_zextps128_ps256(_mm_setr_ps(2.0, -3.0, 4, -5));
    float[8] correct = [2.0, -3, 4, -5, 0, 0, 0, 0];
    assert(R.array == correct);
}

/// Cast vector of type `__m128i` to type `__m256i`; the upper 128 bits of the result are zeroed.
__m256i _mm256_zextsi128_si256 (__m128i a) pure @trusted
{
    long2 la = cast(long2)a;
    __m256i r;
    r.ptr[0] = la.array[0];
    r.ptr[1] = la.array[1];
    r.ptr[2] = 0;
    r.ptr[3] = 0;
    return r;
}
unittest
{
    __m256i R = _mm256_zextsi128_si256(_mm_setr_epi64(-1, 99));
    long[4] correct = [-1, 99, 0, 0];
    assert(R.array == correct);
}