1 /**
2 * SSE2 intrinsics. 
3 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE2
4 *
5 * Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
6 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
7 */
8 module inteli.emmintrin;
9 
10 public import inteli.types;
11 public import inteli.xmmintrin; // SSE2 includes SSE1
12 import inteli.mmx;
13 import inteli.internals;
14 
15 nothrow @nogc:
16 
17 
18 // SSE2 instructions
19 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE2
20 
21 /// Add packed 16-bit integers in `a` and `b`.
22 __m128i _mm_add_epi16 (__m128i a, __m128i b) pure @safe
23 {
24     pragma(inline, true);
25     return cast(__m128i)(cast(short8)a + cast(short8)b);
26 }
27 unittest
28 {
29     __m128i A = _mm_setr_epi16(4, 8, 13, -7, -1, 0, 9, 77);
30     short8 R = cast(short8) _mm_add_epi16(A, A);
31     short[8] correct = [8, 16, 26, -14, -2, 0, 18, 154];
32     assert(R.array == correct);
33 }
34 
35 /// Add packed 32-bit integers in `a` and `b`.
36 __m128i _mm_add_epi32 (__m128i a, __m128i b) pure @safe
37 {
38     pragma(inline, true);
39     return cast(__m128i)(cast(int4)a + cast(int4)b);
40 }
41 unittest
42 {
43     __m128i A = _mm_setr_epi32( -7, -1, 0, 9);
44     int4 R = _mm_add_epi32(A, A);
45     int[4] correct = [ -14, -2, 0, 18 ];
46     assert(R.array == correct);
47 }
48 
49 /// Add packed 64-bit integers in `a` and `b`.
50 __m128i _mm_add_epi64 (__m128i a, __m128i b) pure @safe
51 {
52     pragma(inline, true);
53     return cast(__m128i)(cast(long2)a + cast(long2)b);
54 }
55 unittest
56 {
57     __m128i A = _mm_setr_epi64(-1, 0x8000_0000_0000_0000);
58     long2 R = cast(long2) _mm_add_epi64(A, A);
59     long[2] correct = [ -2, 0 ];
60     assert(R.array == correct);
61 }
62 
63 /// Add packed 8-bit integers in `a` and `b`.
64 __m128i _mm_add_epi8 (__m128i a, __m128i b) pure @safe
65 {
66     pragma(inline, true);
67     return cast(__m128i)(cast(byte16)a + cast(byte16)b);
68 }
69 unittest
70 {
71     __m128i A = _mm_setr_epi8(4, 8, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -1, 0, 9, 78);
72     byte16 R = cast(byte16) _mm_add_epi8(A, A);
73     byte[16] correct = [8, 16, 26, -14, -2, 0, 18, -102, 8, 16, 26, -14, -2, 0, 18, -100];
74     assert(R.array == correct);
75 }
76 
77 /// Add the lower double-precision (64-bit) floating-point elements
78 /// in `a` and `b`, store the sum in the lower element of the result,
79 /// and copy the upper element from `a` to the upper element of the result.
80 __m128d _mm_add_sd(__m128d a, __m128d b) pure @safe
81 {
82     static if (DMD_with_DSIMD)
83     {
84         return cast(__m128d) __simd(XMM.ADDSD, a, b);
85     }
86     else static if (GDC_with_SSE2)
87     {
88         return __builtin_ia32_addsd(a, b);
89     }
90     else version(DigitalMars)
91     {
92         // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
93         // Note: this workaround seems unneeded since at least DMD 2.094.0, but that hasn't been re-checked.
94         asm pure nothrow @nogc @trusted { nop;}
95         a[0] = a[0] + b[0];
96         return a;
97     }
98     else
99     {
100         a[0] += b[0];
101         return a;
102     }
103 }
104 unittest
105 {
106     __m128d a = [1.5, -2.0];
107     a = _mm_add_sd(a, a);
108     assert(a.array == [3.0, -2.0]);
109 }
110 
111 /// Add packed double-precision (64-bit) floating-point elements in `a` and `b`.
112 __m128d _mm_add_pd (__m128d a, __m128d b) pure @safe
113 {
114     pragma(inline, true);
115     return a + b;
116 }
117 unittest
118 {
119     __m128d a = [1.5, -2.0];
120     a = _mm_add_pd(a, a);
121     assert(a.array == [3.0, -4.0]);
122 }
123 
124 /// Add 64-bit integers `a` and `b`.
125 __m64 _mm_add_si64 (__m64 a, __m64 b) pure @safe
126 {
127     // PERF DMD
128     pragma(inline, true);
129     return a + b;
130 }
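// Illustrative extra unittest (not from the upstream source); a minimal sketch assuming
// `_mm_setr_pi32`, which is already used elsewhere in this module.
unittest
{
    __m64 A = _mm_setr_pi32(1, 2); // low 32 bits = 1, high 32 bits = 2
    __m64 B = _mm_setr_pi32(3, 4);
    __m64 R = _mm_add_si64(A, B);
    assert(R.array[0] == ((6L << 32) | 4));
}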
131 
132 /// Add packed 16-bit integers in `a` and `b` using signed saturation.
133 __m128i _mm_adds_epi16(__m128i a, __m128i b) pure @trusted
134 {
135     static if (DMD_with_DSIMD)
136     {
137         return cast(__m128i) __simd(XMM.PADDSW, a, b);
138     }
139     else static if (GDC_with_SSE2)
140     {
141         return cast(__m128i) __builtin_ia32_paddsw128(cast(short8)a, cast(short8)b);
142     }
143     else static if(LDC_with_saturated_intrinsics)
144     {
145         return cast(__m128i) inteli_llvm_adds!short8(cast(short8)a, cast(short8)b);
146     }
147     else
148     {
149         short[8] res; // PERF =void;
150         short8 sa = cast(short8)a;
151         short8 sb = cast(short8)b;
152         foreach(i; 0..8)
153             res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]);
154         return _mm_loadu_si128(cast(int4*)res.ptr);
155     }
156 }
157 unittest
158 {
159     short8 res = cast(short8) _mm_adds_epi16(_mm_setr_epi16( 7,  6,  5, -32768, 3, 3, 32767,   0),
160                                              _mm_setr_epi16( 7,  6,  5, -30000, 3, 1,     1, -10));
161     static immutable short[8] correctResult             =  [14, 12, 10, -32768, 6, 4, 32767, -10];
162     assert(res.array == correctResult);
163 }
164 
165 /// Add packed 8-bit signed integers in `a` and `b` using signed saturation.
166 __m128i _mm_adds_epi8(__m128i a, __m128i b) pure @trusted
167 {
168     static if (DMD_with_DSIMD)
169     {
170         return cast(__m128i) __simd(XMM.PADDSB, a, b);
171     }
172     else static if (GDC_with_SSE2)
173     {
174         return cast(__m128i) __builtin_ia32_paddsb128(cast(ubyte16)a, cast(ubyte16)b);
175     }
176     else static if(LDC_with_saturated_intrinsics)
177     {
178         return cast(__m128i) inteli_llvm_adds!byte16(cast(byte16)a, cast(byte16)b);
179     }
180     else
181     {
182         byte[16] res; // PERF =void;
183         byte16 sa = cast(byte16)a;
184         byte16 sb = cast(byte16)b;
185         foreach(i; 0..16)
186             res[i] = saturateSignedWordToSignedByte(sa[i] + sb[i]);
187         return _mm_loadu_si128(cast(int4*)res.ptr);
188     }
189 }
190 unittest
191 {
192     byte16 res = cast(byte16) _mm_adds_epi8(_mm_set_epi8(15, 14, 13, 12, 11, 127, 9, 8, 7, 6, 5, -128, 3, 2, 1, 0),
193                                             _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, -4, 3, 2, 1, 0));
194     static immutable byte[16] correctResult = [0, 2, 4, 6, -128, 10, 12, 14,
195                                                16, 18, 127, 22, 24, 26, 28, 30];
196     assert(res.array == correctResult);
197 }
198 
199 /// Add packed 8-bit unsigned integers in `a` and `b` using unsigned saturation.
200 __m128i _mm_adds_epu8(__m128i a, __m128i b) pure @trusted
201 {
202     static if (DMD_with_DSIMD)
203     {
204         return cast(__m128i) __simd(XMM.PADDUSB, a, b);
205     }
206     else static if (GDC_with_SSE2)
207     {
208         return cast(__m128i) __builtin_ia32_paddusb128(cast(ubyte16)a, cast(ubyte16)b);
209     }
210     else static if(LDC_with_saturated_intrinsics)
211     {
212         return cast(__m128i) inteli_llvm_addus!byte16(cast(byte16)a, cast(byte16)b);
213     }
214     else
215     {
216         ubyte[16] res; // PERF =void;
217         byte16 sa = cast(byte16)a;
218         byte16 sb = cast(byte16)b;
219         foreach(i; 0..16)
220             res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]));
221         return _mm_loadu_si128(cast(int4*)res.ptr);
222     }
223 }
224 unittest
225 {
226     byte16 res = cast(byte16) 
227         _mm_adds_epu8(_mm_set_epi8(7, 6, 5, 4, 3, 2, cast(byte)255, 0, 7, 6, 5, 4, 3, 2, cast(byte)255, 0),
228                       _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0));
229     static immutable byte[16] correctResult = [0, cast(byte)255, 4, 6, 8, 10, 12, 14, 
230                                                0, cast(byte)255, 4, 6, 8, 10, 12, 14];
231     assert(res.array == correctResult);
232 }
233 
234 /// Add packed unsigned 16-bit integers in `a` and `b` using unsigned saturation.
235 __m128i _mm_adds_epu16(__m128i a, __m128i b) pure @trusted
236 {
237     static if (DMD_with_DSIMD)
238     {
239         // Note: DMD generates paddusw with reversed operands compared to LDC and GDC, but that doesn't change the result.
240         return cast(__m128i) __simd(XMM.PADDUSW, a, b);
241     }
242     else static if (GDC_with_SSE2)
243     {
244         return cast(__m128i) __builtin_ia32_paddusw128(cast(short8)a, cast(short8)b);
245     }
246     else static if(LDC_with_saturated_intrinsics)
247     {
248         return cast(__m128i) inteli_llvm_addus!short8(cast(short8)a, cast(short8)b);
249     }
250     else
251     {
252         ushort[8] res; // PERF =void;
253         short8 sa = cast(short8)a;
254         short8 sb = cast(short8)b;
255         foreach(i; 0..8)
256             res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]));
257         return _mm_loadu_si128(cast(int4*)res.ptr);
258     }
259 }
260 unittest
261 {
262     short8 res = cast(short8) _mm_adds_epu16(_mm_set_epi16(3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0),
263                                              _mm_set_epi16(3, 2, 1, 0, 3, 2, 1, 0));
264     static immutable short[8] correctResult = [0, cast(short)65535, 4, 6, 0, cast(short)65535, 4, 6];
265     assert(res.array == correctResult);
266 }
267 
268 /// Compute the bitwise AND of packed double-precision (64-bit) 
269 /// floating-point elements in `a` and `b`.
270 __m128d _mm_and_pd (__m128d a, __m128d b) pure @safe
271 {
272     pragma(inline, true);
273     return cast(__m128d)( cast(long2)a & cast(long2)b );
274 }
275 unittest
276 {
277     double a = 4.32;
278     double b = -78.99;
279     long correct = (*cast(long*)(&a)) & (*cast(long*)(&b));
280     __m128d A = _mm_set_pd(a, b);
281     __m128d B = _mm_set_pd(b, a);
282     long2 R = cast(long2)( _mm_and_pd(A, B) );
283     assert(R.array[0] == correct);
284     assert(R.array[1] == correct);
285 }
286 
287 /// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`.
288 __m128i _mm_and_si128 (__m128i a, __m128i b) pure @safe
289 {
290     pragma(inline, true);
291     return a & b;
292 }
293 unittest
294 {
295     __m128i A = _mm_set1_epi32(7);
296     __m128i B = _mm_set1_epi32(14);
297     __m128i R = _mm_and_si128(A, B);
298     int[4] correct = [6, 6, 6, 6];
299     assert(R.array == correct);
300 }
301 
302 /// Compute the bitwise NOT of packed double-precision (64-bit) 
303 /// floating-point elements in `a` and then AND with `b`.
304 __m128d _mm_andnot_pd (__m128d a, __m128d b) pure @safe
305 {
306     static if (DMD_with_DSIMD)
307     {
308         return cast(__m128d) __simd(XMM.ANDNPD, a, b);
309     }
310     else
311     {
312         return cast(__m128d)( ~(cast(long2)a) & cast(long2)b);
313     }
314 }
315 unittest
316 {
317     double a = 4.32;
318     double b = -78.99;
319     long correct  = (~*cast(long*)(&a)) & ( *cast(long*)(&b));
320     long correct2 = ( *cast(long*)(&a)) & (~*cast(long*)(&b));
321     __m128d A = _mm_setr_pd(a, b);
322     __m128d B = _mm_setr_pd(b, a);
323     long2 R = cast(long2)( _mm_andnot_pd(A, B) );
324     assert(R.array[0] == correct);
325     assert(R.array[1] == correct2);
326 }
327 
328 /// Compute the bitwise NOT of 128 bits (representing integer data) 
329 /// in `a` and then AND with `b`.
330 __m128i _mm_andnot_si128 (__m128i a, __m128i b) pure @safe
331 {
332     static if (DMD_with_DSIMD)
333     {
334         return cast(__m128i) __simd(XMM.PANDN, a, b);
335     }
336     else
337     {
338         return (~a) & b;
339     }
340 }
341 unittest
342 {
343     __m128i A = _mm_setr_epi32(7, -2, 9, 54654);
344     __m128i B = _mm_setr_epi32(14, 78, 111, -256);
345     __m128i R = _mm_andnot_si128(A, B);
346     int[4] correct = [8, 0, 102, -54784];
347     assert(R.array == correct);
348 }
349 
350 /// Average packed unsigned 16-bit integers in `a` and `b`.
351 __m128i _mm_avg_epu16 (__m128i a, __m128i b) pure @trusted
352 {
353     static if (DMD_with_DSIMD)
354     {
355         return cast(__m128i) __simd(XMM.PAVGW, a, b);
356     }
357     else static if (GDC_with_SSE2)
358     {
359         return cast(__m128i) __builtin_ia32_pavgw128(cast(short8)a, cast(short8)b);
360     }
361     else static if (LDC_with_ARM64)
362     {
363         return cast(__m128i) vrhadd_u16(cast(short8)a, cast(short8)b);
364     }
365     else static if (LDC_with_SSE2 && __VERSION__ >= 2094)
366     {
367         // Exists since LDC 1.18
368         return cast(__m128i) __builtin_ia32_pavgw128(cast(short8)a, cast(short8)b);
369     }
370     else static if (LDC_with_optimizations)
371     {
372         // Generates pavgw even in LDC 1.0, even in -O0
373         // But not in ARM
374         enum ir = `
375             %ia = zext <8 x i16> %0 to <8 x i32>
376             %ib = zext <8 x i16> %1 to <8 x i32>
377             %isum = add <8 x i32> %ia, %ib
378             %isum1 = add <8 x i32> %isum, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
379             %isums = lshr <8 x i32> %isum1, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
380             %r = trunc <8 x i32> %isums to <8 x i16>
381             ret <8 x i16> %r`;
382         return cast(__m128i) LDCInlineIR!(ir, short8, short8, short8)(cast(short8)a, cast(short8)b);
383     }
384     else
385     {
386         short8 sa = cast(short8)a;
387         short8 sb = cast(short8)b;
388         short8 sr = void;
389         foreach(i; 0..8)
390         {
391             sr.ptr[i] = cast(ushort)( (cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]) + 1) >> 1 );
392         }
393         return cast(int4)sr;
394     }
395 }
396 unittest
397 {
398     __m128i A = _mm_set1_epi16(31);
399     __m128i B = _mm_set1_epi16(64);
400     short8 avg = cast(short8)(_mm_avg_epu16(A, B));
401     foreach(i; 0..8)
402         assert(avg.array[i] == 48);
403 }
404 
405 /// Average packed unsigned 8-bit integers in `a` and `b`.
406 __m128i _mm_avg_epu8 (__m128i a, __m128i b) pure @trusted
407 {
408     static if (DMD_with_DSIMD)
409     {
410         return cast(__m128i) __simd(XMM.PAVGB, a, b);
411     }
412     else static if (GDC_with_SSE2)
413     {
414         return cast(__m128i) __builtin_ia32_pavgb128(cast(ubyte16)a, cast(ubyte16)b);
415     }
416     else static if (LDC_with_SSE2 && __VERSION__ >= 2094)
417     {
418         // Exists since LDC 1.18
419         return cast(__m128i) __builtin_ia32_pavgb128(cast(byte16)a, cast(byte16)b);
420     }
421     else static if (LDC_with_ARM64)
422     {
423         return cast(__m128i) vrhadd_u8(cast(byte16)a, cast(byte16)b);
424     }
425     else static if (LDC_with_optimizations)
426     {
427         // Generates pavgb even in LDC 1.0, even in -O0
428         // But not in ARM
429         enum ir = `
430             %ia = zext <16 x i8> %0 to <16 x i16>
431             %ib = zext <16 x i8> %1 to <16 x i16>
432             %isum = add <16 x i16> %ia, %ib
433             %isum1 = add <16 x i16> %isum, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
434             %isums = lshr <16 x i16> %isum1, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
435             %r = trunc <16 x i16> %isums to <16 x i8>
436             ret <16 x i8> %r`;
437         return cast(__m128i) LDCInlineIR!(ir, byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
438     }
439     else
440     {
441         byte16 sa = cast(byte16)a;
442         byte16 sb = cast(byte16)b;
443         byte16 sr = void;
444         foreach(i; 0..16)
445         {
446             sr.ptr[i] = cast(ubyte)( (cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]) + 1) >> 1 );
447         }
448         return cast(int4)sr;
449     }
450 }
451 unittest
452 {
453     __m128i A = _mm_set1_epi8(31);
454     __m128i B = _mm_set1_epi8(64);
455     byte16 avg = cast(byte16)(_mm_avg_epu8(A, B));
456     foreach(i; 0..16)
457         assert(avg.array[i] == 48);
458 }
459 
460 /// Shift `a` left by `bytes` bytes while shifting in zeros.
461 alias _mm_bslli_si128 = _mm_slli_si128;
462 unittest
463 {
464     __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
465     byte[16] exact =               [0, 0, 0, 0, 0, 0, 1, 2, 3, 4,  5,  6,  7,  8,  9, 10];
466     __m128i result = _mm_bslli_si128!5(toShift);
467     assert( (cast(byte16)result).array == exact);
468 }
469 
470 /// Shift `v` right by `bytes` bytes while shifting in zeros.
471 alias _mm_bsrli_si128 = _mm_srli_si128;
472 unittest
473 {
474     __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
475     byte[16] exact =               [5, 6, 7, 8, 9,10,11,12,13,14, 15,  0,  0,  0,  0,  0];
476     __m128i result = _mm_bsrli_si128!5(toShift);
477     assert( (cast(byte16)result).array == exact);
478 }
479 
480 /// Cast vector of type `__m128d` to type `__m128`. 
481 /// Note: Also possible with a regular `cast(__m128)(a)`.
482 __m128 _mm_castpd_ps (__m128d a) pure @safe
483 {
484     return cast(__m128)a;
485 }
486 
487 /// Cast vector of type `__m128d` to type `__m128i`. 
488 /// Note: Also possible with a regular `cast(__m128i)(a)`.
489 __m128i _mm_castpd_si128 (__m128d a) pure @safe
490 {
491     return cast(__m128i)a;
492 }
493 
494 /// Cast vector of type `__m128` to type `__m128d`. 
495 /// Note: Also possible with a regular `cast(__m128d)(a)`.
496 __m128d _mm_castps_pd (__m128 a) pure @safe
497 {
498     return cast(__m128d)a;
499 }
500 
501 /// Cast vector of type `__m128` to type `__m128i`. 
502 /// Note: Also possible with a regular `cast(__m128i)(a)`.
503 __m128i _mm_castps_si128 (__m128 a) pure @safe
504 {
505     return cast(__m128i)a;
506 }
507 
508 /// Cast vector of type `__m128i` to type `__m128d`. 
509 /// Note: Also possible with a regular `cast(__m128d)(a)`.
510 __m128d _mm_castsi128_pd (__m128i a) pure @safe
511 {
512     return cast(__m128d)a;
513 }
514 
515 /// Cast vector of type `__m128i` to type `__m128`. 
516 /// Note: Also possible with a regular `cast(__m128)(a)`.
517 __m128 _mm_castsi128_ps (__m128i a) pure @safe
518 {
519     return cast(__m128)a;
520 }
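// Illustrative extra unittest (not from the upstream source): the cast intrinsics only
// reinterpret bits, they do not convert values. 0x3f800000 is the bit pattern of 1.0f.
unittest
{
    __m128i A = _mm_setr_epi32(0x3f800000, 0, 0x3f800000, 0);
    __m128 F = _mm_castsi128_ps(A);
    assert(F.array[0] == 1.0f);
    assert(F.array[2] == 1.0f);
}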
521 
522 /// Invalidate and flush the cache line that contains `p` 
523 /// from all levels of the cache hierarchy.
524 void _mm_clflush (const(void)* p) @trusted
525 {
526     static if (GDC_with_SSE2)
527     {
528         __builtin_ia32_clflush(p);
529     }
530     else static if (LDC_with_SSE2)
531     {
532         __builtin_ia32_clflush(cast(void*)p);
533     }
534     else version(D_InlineAsm_X86)
535     {
536         asm pure nothrow @nogc @trusted
537         {
538             mov EAX, p;
539             clflush [EAX];
540         }
541     }
542     else version(D_InlineAsm_X86_64)
543     {
544         asm pure nothrow @nogc @trusted
545         {
546             mov RAX, p;
547             clflush [RAX];
548         }
549     }
550     else 
551     {
552         // Do nothing. Invalidating cacheline does
553         // not affect correctness.
554     }
555 }
556 unittest
557 {
558     ubyte[64] cacheline;
559     _mm_clflush(cacheline.ptr);
560 }
561 
562 /// Compare packed 16-bit integers in `a` and `b` for equality.
563 __m128i _mm_cmpeq_epi16 (__m128i a, __m128i b) pure @safe
564 {
565     static if (SIMD_COMPARISON_MASKS_16B)
566     {
567         return cast(__m128i)(cast(short8)a == cast(short8)b);
568     }
569     else static if (GDC_with_SSE2)
570     {
571         return cast(__m128i) __builtin_ia32_pcmpeqw128(cast(short8)a, cast(short8)b);
572     }
573     else
574     {
575         return cast(__m128i) equalMask!short8(cast(short8)a, cast(short8)b);
576     }
577 }
578 unittest
579 {
580     short8   A = [-3, -2, -1,  0,  0,  1,  2,  3];
581     short8   B = [ 4,  3,  2,  1,  0, -1, -2, -3];
582     short[8] E = [ 0,  0,  0,  0, -1,  0,  0,  0];
583     short8   R = cast(short8)(_mm_cmpeq_epi16(cast(__m128i)A, cast(__m128i)B));
584     assert(R.array == E);
585 }
586 
587 /// Compare packed 32-bit integers in `a` and `b` for equality.
588 __m128i _mm_cmpeq_epi32 (__m128i a, __m128i b) pure @safe
589 {
590     static if (SIMD_COMPARISON_MASKS_16B)
591     {
592         return cast(__m128i)(cast(int4)a == cast(int4)b);
593     }
594     else static if (GDC_with_SSE2)
595     {
596         return __builtin_ia32_pcmpeqd128(a, b);
597     }
598     else
599     {
600         return equalMask!__m128i(a, b);
601     }
602 }
603 unittest
604 {
605     int4   A = [-3, -2, -1,  0];
606     int4   B = [ 4, -2,  2,  0];
607     int[4] E = [ 0, -1,  0, -1];
608     int4   R = cast(int4)(_mm_cmpeq_epi32(A, B));
609     assert(R.array == E);
610 }
611 
612 /// Compare packed 8-bit integers in `a` and `b` for equality.
613 __m128i _mm_cmpeq_epi8 (__m128i a, __m128i b) pure @safe
614 {
615     static if (SIMD_COMPARISON_MASKS_16B)
616     {
617         return cast(__m128i)(cast(byte16)a == cast(byte16)b);
618     }
619     else static if (GDC_with_SSE2)
620     {
621         return cast(__m128i) __builtin_ia32_pcmpeqb128(cast(ubyte16)a, cast(ubyte16)b);
622     }
623     else
624     {
625         return cast(__m128i) equalMask!byte16(cast(byte16)a, cast(byte16)b);
626     }
627 }
628 unittest
629 {
630     __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
631     __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
632     byte16 C = cast(byte16) _mm_cmpeq_epi8(A, B);
633     byte[16] correct =       [0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0, -1];
634     assert(C.array == correct);
635 }
636 
637 /// Compare packed double-precision (64-bit) floating-point elements 
638 /// in `a` and `b` for equality.
639 __m128d _mm_cmpeq_pd (__m128d a, __m128d b) pure @safe
640 {
641     static if (SIMD_COMPARISON_MASKS_16B)
642     {
643         return cast(double2)(cast(double2)a == cast(double2)b);
644     }
645     else static if (GDC_with_SSE2)
646     {
647         return __builtin_ia32_cmpeqpd(a, b);
648     }
649     else
650     {
651         return cast(__m128d) cmppd!(FPComparison.oeq)(a, b);
652     }
653 }
654 unittest
655 {
656     double2 A = _mm_setr_pd(1.0, 2.0);
657     double2 B = _mm_setr_pd(0.0, 2.0);
658     double2 N = _mm_setr_pd(double.nan, double.nan);
659     long2 C = cast(long2) _mm_cmpeq_pd(A, B);
660     long[2] correctC = [0, -1];
661     assert(C.array == correctC);
662     long2 D = cast(long2) _mm_cmpeq_pd(N, N);
663     long[2] correctD = [0, 0];
664     assert(D.array == correctD);
665 }
666 
667 /// Compare the lower double-precision (64-bit) floating-point elements
668 /// in `a` and `b` for equality, store the result in the lower element,
669 /// and copy the upper element from `a`.
670 __m128d _mm_cmpeq_sd (__m128d a, __m128d b) pure @safe
671 {
672     static if (DMD_with_DSIMD)
673     {
674         return cast(__m128d) __simd(XMM.CMPSD, a, b, 0);
675     }
676     else static if (GDC_with_SSE2)
677     {
678         return __builtin_ia32_cmpeqsd(a, b);
679     }
680     else
681     {
682         return cast(__m128d) cmpsd!(FPComparison.oeq)(a, b);
683     }
684 }
685 unittest
686 {
687     double2 A = _mm_setr_pd(0.0, 2.0);
688     double2 B = _mm_setr_pd(1.0, 2.0);
689     double2 C = _mm_setr_pd(1.0, 3.0);
690     double2 D = cast(double2) _mm_cmpeq_sd(A, B);
691     long2 E = cast(long2) _mm_cmpeq_sd(B, C);
692     double[2] correctD = [0.0, 2.0];
693     double two = 2.0;
694     long[2] correctE = [-1, *cast(long*)&two];
695     assert(D.array == correctD);
696     assert(E.array == correctE);
697 }
698 
699 /// Compare packed 16-bit integer elements in `a` and `b` for greater-than-or-equal.
700 /// #BONUS
701 __m128i _mm_cmpge_epi16 (__m128i a, __m128i b) pure @safe
702 {
703     static if (SIMD_COMPARISON_MASKS_16B)
704     {
705         return cast(__m128i)(cast(short8)a >= cast(short8)b);
706     }
707     else version (LDC)
708     {
709         // LDC ARM64: generates cmge since -O1
710         return cast(__m128i) greaterOrEqualMask!short8(cast(short8)a, cast(short8)b);
711     }
712     else
713     {        
714         return _mm_xor_si128(_mm_cmpeq_epi16(a, b), _mm_cmpgt_epi16(a, b));
715     }
716 }
717 unittest
718 {
719     short8   A = [-3, -2, -32768,  0,  0,  1,  2,  3];
720     short8   B = [ 4,  3,  32767,  1,  0, -1, -2, -3];
721     short[8] E = [ 0,  0,      0,  0,  -1, -1, -1, -1];
722     short8   R = cast(short8)(_mm_cmpge_epi16(cast(__m128i)A, cast(__m128i)B));
723     assert(R.array == E);
724 }
725 
726 /// Compare packed double-precision (64-bit) floating-point elements 
727 /// in `a` and `b` for greater-than-or-equal.
728 __m128d _mm_cmpge_pd (__m128d a, __m128d b) pure @safe
729 {
730     static if (SIMD_COMPARISON_MASKS_16B)
731     {
732         return cast(__m128d)(a >= b);
733     }
734     else static if (GDC_with_SSE2)
735     {
736         return __builtin_ia32_cmpgepd(a, b);
737     }
738     else
739     {
740         return cast(__m128d) cmppd!(FPComparison.oge)(a, b);
741     }
742 }
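// Illustrative extra unittest (not from the upstream source); a minimal sketch.
unittest
{
    __m128d A = _mm_setr_pd(1.0, double.nan);
    __m128d B = _mm_setr_pd(1.0, 2.0);
    long2 R = cast(long2) _mm_cmpge_pd(A, B);
    long[2] correct = [-1, 0]; // ordered comparison: NaN compares false
    assert(R.array == correct);
}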
743 
744 /// Compare the lower double-precision (64-bit) floating-point elements 
745 /// in `a` and `b` for greater-than-or-equal, store the result in the 
746 /// lower element, and copy the upper element from `a`.
747 __m128d _mm_cmpge_sd (__m128d a, __m128d b) pure @safe
748 {
749     static if (DMD_with_DSIMD)
750     {
751         return cast(__m128d) __simd(XMM.CMPSD, b, a, 2);
752     }
753     else static if (GDC_with_SSE2)
754     {
755         return __builtin_ia32_cmplesd(b, a);
756     }
757     else
758     {
759         return cast(__m128d) cmpsd!(FPComparison.oge)(a, b);
760     }
761 }
762 unittest
763 {
764     __m128d A = _mm_setr_pd(1.0, 0.0);
765     __m128d B = _mm_setr_pd(double.nan, 0.0);
766     __m128d C = _mm_setr_pd(2.0, 0.0);
767     assert( (cast(long2)_mm_cmpge_sd(A, A)).array[0] == -1);
768     assert( (cast(long2)_mm_cmpge_sd(A, B)).array[0] ==  0);
769     assert( (cast(long2)_mm_cmpge_sd(A, C)).array[0] ==  0);
770     assert( (cast(long2)_mm_cmpge_sd(B, A)).array[0] ==  0);
771     assert( (cast(long2)_mm_cmpge_sd(B, B)).array[0] ==  0);
772     assert( (cast(long2)_mm_cmpge_sd(B, C)).array[0] ==  0);
773     assert( (cast(long2)_mm_cmpge_sd(C, A)).array[0] == -1);
774     assert( (cast(long2)_mm_cmpge_sd(C, B)).array[0] ==  0);
775     assert( (cast(long2)_mm_cmpge_sd(C, C)).array[0] == -1);
776 }
777 
778 /// Compare packed 16-bit integers in `a` and `b` for greater-than.
779 __m128i _mm_cmpgt_epi16 (__m128i a, __m128i b) pure @safe
780 {
781     static if (SIMD_COMPARISON_MASKS_16B)
782     {
783         return cast(__m128i)(cast(short8)a > cast(short8)b);
784     }
785     else static if (GDC_with_SSE2)
786     {
787         return cast(__m128i) __builtin_ia32_pcmpgtw128(cast(short8)a, cast(short8)b);
788     }
789     else
790     {
791         return cast(__m128i) greaterMask!short8(cast(short8)a, cast(short8)b);
792     }
793 }
794 unittest
795 {
796     short8   A = [-3, -2, -1,  0,  0,  1,  2,  3];
797     short8   B = [ 4,  3,  2,  1,  0, -1, -2, -3];
798     short[8] E = [ 0,  0,  0,  0,  0, -1, -1, -1];
799     short8   R = cast(short8)(_mm_cmpgt_epi16(cast(__m128i)A, cast(__m128i)B));
800     assert(R.array == E);
801 }
802 
803 /// Compare packed 32-bit integers in `a` and `b` for greater-than.
804 __m128i _mm_cmpgt_epi32 (__m128i a, __m128i b) pure @safe
805 {
806     static if (SIMD_COMPARISON_MASKS_16B)
807     {
808         return cast(__m128i)(cast(int4)a > cast(int4)b);
809     }
810     else static if (GDC_with_SSE2)
811     {
812         return __builtin_ia32_pcmpgtd128(a, b); 
813     }
814     else
815     {
816         return cast(__m128i)( greaterMask!int4(a, b));
817     }
818 }
819 unittest
820 {
821     int4   A = [-3,  2, -1,  0];
822     int4   B = [ 4, -2,  2,  0];
823     int[4] E = [ 0, -1,  0,  0];
824     int4   R = cast(int4)(_mm_cmpgt_epi32(A, B));
825     assert(R.array == E);
826 }
827 
828 /// Compare packed 8-bit integers in `a` and `b` for greater-than.
829 __m128i _mm_cmpgt_epi8 (__m128i a, __m128i b) pure @safe
830 {
831     static if (SIMD_COMPARISON_MASKS_16B)
832     {
833         return cast(__m128i)(cast(byte16)a > cast(byte16)b);
834     }
835     else
836     {
837         // Note: __builtin_ia32_pcmpgtb128 is buggy, do not use with GDC
838         // TODO: re-check that
839         return cast(__m128i) greaterMask!byte16(cast(byte16)a, cast(byte16)b);
840     }
841 }
842 unittest
843 {
844     __m128i A = _mm_setr_epi8(1, 2, 3, 1,  127, -80, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
845     __m128i B = _mm_setr_epi8(2, 2, 1, 2, -128, -42, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
846     byte16 C = cast(byte16) _mm_cmpgt_epi8(A, B);
847     byte[16] correct =       [0, 0,-1, 0,   -1,   0, 0, 0,-1,-1,-1, 0, 0, 0,-1, 0];
848     __m128i D = _mm_cmpeq_epi8(A, B);
849     assert(C.array == correct);
850 }
851 
852 /// Compare packed double-precision (64-bit) floating-point elements 
853 /// in `a` and `b` for greater-than.
854 __m128d _mm_cmpgt_pd (__m128d a, __m128d b) pure @safe
855 {
856     static if (SIMD_COMPARISON_MASKS_16B)
857     {
858         return cast(__m128d)(a > b);
859     }
860     else static if (GDC_with_SSE2)
861     {
862         return __builtin_ia32_cmpgtpd(a, b); 
863     }
864     else
865     {
866         return cast(__m128d) cmppd!(FPComparison.ogt)(a, b);
867     }
868 }
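// Illustrative extra unittest (not from the upstream source); a minimal sketch.
unittest
{
    __m128d A = _mm_setr_pd(2.0, double.nan);
    __m128d B = _mm_setr_pd(1.0, 2.0);
    long2 R = cast(long2) _mm_cmpgt_pd(A, B);
    long[2] correct = [-1, 0]; // ordered comparison: NaN compares false
    assert(R.array == correct);
}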
869 
870 /// Compare the lower double-precision (64-bit) floating-point elements 
871 /// in `a` and `b` for greater-than, store the result in the lower element,
872 /// and copy the upper element from `a`.
873 __m128d _mm_cmpgt_sd (__m128d a, __m128d b) pure @safe
874 {
875     static if (DMD_with_DSIMD)
876     {
877         return cast(__m128d) __simd(XMM.CMPSD, b, a, 1);
878     }
879     else static if (GDC_with_SSE2)
880     {
881         return __builtin_ia32_cmpltsd(b, a);
882     }
883     else
884     {
885         return cast(__m128d) cmpsd!(FPComparison.ogt)(a, b);
886     }
887 }
888 unittest
889 {
890     __m128d A = _mm_setr_pd(1.0, 0.0);
891     __m128d B = _mm_setr_pd(double.nan, 0.0);
892     __m128d C = _mm_setr_pd(2.0, 0.0);
893     assert( (cast(long2)_mm_cmpgt_sd(A, A)).array[0] ==  0);
894     assert( (cast(long2)_mm_cmpgt_sd(A, B)).array[0] ==  0);
895     assert( (cast(long2)_mm_cmpgt_sd(A, C)).array[0] ==  0);
896     assert( (cast(long2)_mm_cmpgt_sd(B, A)).array[0] ==  0);
897     assert( (cast(long2)_mm_cmpgt_sd(B, B)).array[0] ==  0);
898     assert( (cast(long2)_mm_cmpgt_sd(B, C)).array[0] ==  0);
899     assert( (cast(long2)_mm_cmpgt_sd(C, A)).array[0] == -1);
900     assert( (cast(long2)_mm_cmpgt_sd(C, B)).array[0] ==  0);
901     assert( (cast(long2)_mm_cmpgt_sd(C, C)).array[0] ==  0);
902 }
903 
904 
905 /// Compare packed 16-bit integer elements in `a` and `b` for less-than-or-equal.
906 /// #BONUS
907 __m128i _mm_cmple_epi16 (__m128i a, __m128i b) pure @safe
908 {
909     static if (SIMD_COMPARISON_MASKS_16B)
910     {
911         return cast(__m128i)(cast(short8)a <= cast(short8)b);
912     }
913     else version (LDC)
914     {
915         // LDC ARM64: generates cmge since -O1
916         return cast(__m128i) greaterOrEqualMask!short8(cast(short8)b, cast(short8)a);
917     }
918     else
919     {
920         return _mm_xor_si128(_mm_cmpeq_epi16(b, a), _mm_cmpgt_epi16(b, a));
921     }
922 }
923 unittest
924 {
925     short8   A = [-3, -2, -32768,  1,  0,  1,  2,  3];
926     short8   B = [ 4,  3,  32767,  0,  0, -1, -2, -3];
927     short[8] E = [-1, -1,     -1,  0,  -1, 0,  0,  0];
928     short8   R = cast(short8)(_mm_cmple_epi16(cast(__m128i)A, cast(__m128i)B));
929     assert(R.array == E);
930 }
931 
932 /// Compare packed double-precision (64-bit) floating-point elements 
933 /// in `a` and `b` for less-than-or-equal.
934 __m128d _mm_cmple_pd (__m128d a, __m128d b) pure @safe
935 {
936     static if (SIMD_COMPARISON_MASKS_16B)
937     {
938         return cast(__m128d)(a <= b);
939     }
940     else static if (GDC_with_SSE2)
941     {
942         return __builtin_ia32_cmplepd(a, b); 
943     }
944     else
945     {
946         return cast(__m128d) cmppd!(FPComparison.ole)(a, b);
947     }
948 }
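// Illustrative extra unittest (not from the upstream source); a minimal sketch.
unittest
{
    __m128d A = _mm_setr_pd(1.0, double.nan);
    __m128d B = _mm_setr_pd(1.0, 2.0);
    long2 R = cast(long2) _mm_cmple_pd(A, B);
    long[2] correct = [-1, 0]; // 1.0 <= 1.0 is true; NaN compares false (ordered)
    assert(R.array == correct);
}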
949 
950 /// Compare the lower double-precision (64-bit) floating-point elements 
951 /// in `a` and `b` for less-than-or-equal, store the result in the 
952 /// lower element, and copy the upper element from `a`.
953 __m128d _mm_cmple_sd (__m128d a, __m128d b) pure @safe
954 {
955     static if (DMD_with_DSIMD)
956     {
957         return cast(__m128d) __simd(XMM.CMPSD, a, b, 2);
958     }
959     else static if (GDC_with_SSE2)
960     {
961         return __builtin_ia32_cmplesd(a, b); 
962     }
963     else
964     {
965         return cast(__m128d) cmpsd!(FPComparison.ole)(a, b);
966     }
967 }
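// Illustrative extra unittest (not from the upstream source), in the style of the
// _mm_cmpeq_sd test above.
unittest
{
    __m128d A = _mm_setr_pd(1.0, 3.0);
    __m128d B = _mm_setr_pd(2.0, 4.0);
    long2 R = cast(long2) _mm_cmple_sd(A, B);
    double upper = 3.0;
    long[2] correct = [-1, *cast(long*)&upper]; // lower: 1.0 <= 2.0; upper copied from `a`
    assert(R.array == correct);
}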
968 
969 /// Compare packed 16-bit integers in `a` and `b` for less-than.
970 __m128i _mm_cmplt_epi16 (__m128i a, __m128i b) pure @safe
971 {
972     return _mm_cmpgt_epi16(b, a);
973 }
974 
975 /// Compare packed 32-bit integers in `a` and `b` for less-than.
976 __m128i _mm_cmplt_epi32 (__m128i a, __m128i b) pure @safe
977 {
978     return _mm_cmpgt_epi32(b, a);
979 }
980 
981 /// Compare packed 8-bit integers in `a` and `b` for less-than.
982 __m128i _mm_cmplt_epi8 (__m128i a, __m128i b) pure @safe
983 {
984     return _mm_cmpgt_epi8(b, a);
985 }
986 
987 /// Compare packed double-precision (64-bit) floating-point elements
988 /// in `a` and `b` for less-than.
989 __m128d _mm_cmplt_pd (__m128d a, __m128d b) pure @safe
990 {
991     static if (SIMD_COMPARISON_MASKS_16B)
992     {
993         return cast(__m128d)(a < b);
994     }
995     else static if (GDC_with_SSE2)
996     {
997         return __builtin_ia32_cmpltpd(a, b); 
998     }
999     else
1000     {
1001         return cast(__m128d) cmppd!(FPComparison.olt)(a, b);
1002     }
1003 }
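// Illustrative extra unittest (not from the upstream source); a minimal sketch.
unittest
{
    __m128d A = _mm_setr_pd(1.0, double.nan);
    __m128d B = _mm_setr_pd(2.0, 2.0);
    long2 R = cast(long2) _mm_cmplt_pd(A, B);
    long[2] correct = [-1, 0]; // ordered comparison: NaN compares false
    assert(R.array == correct);
}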
1004 
1005 /// Compare the lower double-precision (64-bit) floating-point elements
1006 /// in `a` and `b` for less-than, store the result in the lower 
1007 /// element, and copy the upper element from `a`.
1008 __m128d _mm_cmplt_sd (__m128d a, __m128d b) pure @safe
1009 {
1010     static if (DMD_with_DSIMD)
1011     {
1012         return cast(__m128d) __simd(XMM.CMPSD, a, b, 1);
1013     }
1014     else static if (GDC_with_SSE2)
1015     {
1016         return __builtin_ia32_cmpltsd(a, b); 
1017     }
1018     else
1019     {
1020         return cast(__m128d) cmpsd!(FPComparison.olt)(a, b);
1021     }
1022 }
1023 
1024 /// Compare packed double-precision (64-bit) floating-point elements
1025 /// in `a` and `b` for not-equal.
1026 __m128d _mm_cmpneq_pd (__m128d a, __m128d b) pure @safe
1027 {
1028     static if (GDC_with_SSE2)
1029     {
1030         return __builtin_ia32_cmpneqpd(a, b); 
1031     }
1032     else
1033     {
1034         return cast(__m128d) cmppd!(FPComparison.une)(a, b);
1035     }
1036 }
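// Illustrative extra unittest (not from the upstream source): not-equal is an unordered
// comparison, so a NaN operand compares as not-equal.
unittest
{
    __m128d A = _mm_setr_pd(1.0, double.nan);
    __m128d B = _mm_setr_pd(1.0, 2.0);
    long2 R = cast(long2) _mm_cmpneq_pd(A, B);
    long[2] correct = [0, -1];
    assert(R.array == correct);
}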
1037 
1038 /// Compare the lower double-precision (64-bit) floating-point elements
1039 /// in `a` and `b` for not-equal, store the result in the lower 
1040 /// element, and copy the upper element from `a`.
1041 __m128d _mm_cmpneq_sd (__m128d a, __m128d b) pure @safe
1042 {
1043     static if (GDC_with_SSE2)
1044     {
1045         return __builtin_ia32_cmpneqsd(a, b); 
1046     }
1047     else
1048     {
1049         return cast(__m128d) cmpsd!(FPComparison.une)(a, b);
1050     }
1051 }
1052 
1053 /// Compare packed double-precision (64-bit) floating-point elements 
1054 /// in `a` and `b` for not-greater-than-or-equal.
1055 __m128d _mm_cmpnge_pd (__m128d a, __m128d b) pure @safe
1056 {
1057     static if (GDC_with_SSE2)
1058     {
1059         return __builtin_ia32_cmpngepd(a, b); 
1060     }
1061     else
1062     {
1063         return cast(__m128d) cmppd!(FPComparison.ult)(a, b);
1064     }
1065 }
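// Illustrative extra unittest (not from the upstream source): unlike _mm_cmplt_pd,
// not-greater-than-or-equal is true when either operand is NaN.
unittest
{
    __m128d A = _mm_setr_pd(3.0, double.nan);
    __m128d B = _mm_setr_pd(2.0, 2.0);
    long2 R = cast(long2) _mm_cmpnge_pd(A, B);
    long[2] correct = [0, -1];
    assert(R.array == correct);
}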
1066 
1067 /// Compare the lower double-precision (64-bit) floating-point elements 
1068 /// in `a` and `b` for not-greater-than-or-equal, store the result in 
1069 /// the lower element, and copy the upper element from `a`.
1070 __m128d _mm_cmpnge_sd (__m128d a, __m128d b) pure @safe
1071 {
1072     // Note: There is no __builtin_ia32_cmpngesd builtin.
1073     static if (GDC_with_SSE2)
1074     {
1075         return __builtin_ia32_cmpltsd(b, a); 
1076     }
1077     else
1078     {
1079         return cast(__m128d) cmpsd!(FPComparison.ult)(a, b);
1080     }
1081 }
1082 
1083 /// Compare packed double-precision (64-bit) floating-point elements 
1084 /// in `a` and `b` for not-greater-than.
1085 __m128d _mm_cmpngt_pd (__m128d a, __m128d b) pure @safe
1086 {
1087     static if (GDC_with_SSE2)
1088     {
1089         return __builtin_ia32_cmpngtpd(a, b);
1090     }
1091     else
1092     {
1093         return cast(__m128d) cmppd!(FPComparison.ule)(a, b);
1094     }
1095 }
1096 
1097 /// Compare the lower double-precision (64-bit) floating-point elements 
1098 /// in `a` and `b` for not-greater-than, store the result in the 
1099 /// lower element, and copy the upper element from `a`.
1100 __m128d _mm_cmpngt_sd (__m128d a, __m128d b) pure @safe
1101 {
1102     // Note: There is no __builtin_ia32_cmpngtsd builtin.
1103     static if (GDC_with_SSE2)
1104     {
1105         return __builtin_ia32_cmplesd(b, a);
1106     }
1107     else
1108     {
1109         return cast(__m128d) cmpsd!(FPComparison.ule)(a, b);
1110     }
1111 }
1112 
1113 /// Compare packed double-precision (64-bit) floating-point elements 
1114 /// in `a` and `b` for not-less-than-or-equal.
1115 __m128d _mm_cmpnle_pd (__m128d a, __m128d b) pure @safe
1116 {
1117     static if (GDC_with_SSE2)
1118     {
1119         return __builtin_ia32_cmpnlepd(a, b);
1120     }
1121     else
1122     {
1123         return cast(__m128d) cmppd!(FPComparison.ugt)(a, b);
1124     }
1125 }
1126 
1127 /// Compare the lower double-precision (64-bit) floating-point elements 
1128 /// in `a` and `b` for not-less-than-or-equal, store the result in the 
1129 /// lower element, and copy the upper element from `a`.
1130 __m128d _mm_cmpnle_sd (__m128d a, __m128d b) pure @safe
1131 {
1132     static if (GDC_with_SSE2)
1133     {
1134         return __builtin_ia32_cmpnlesd(a, b);
1135     }
1136     else
1137     {
1138         return cast(__m128d) cmpsd!(FPComparison.ugt)(a, b);
1139     }
1140 }
1141  
1142 /// Compare packed double-precision (64-bit) floating-point elements 
1143 /// in `a` and `b` for not-less-than.
1144 __m128d _mm_cmpnlt_pd (__m128d a, __m128d b) pure @safe
1145 {
1146     static if (GDC_with_SSE2)
1147     {
1148         return __builtin_ia32_cmpnltpd(a, b);
1149     }
1150     else
1151     {
1152         return cast(__m128d) cmppd!(FPComparison.uge)(a, b);
1153     }
1154 }
1155 
1156 /// Compare the lower double-precision (64-bit) floating-point elements 
1157 /// in `a` and `b` for not-less-than, store the result in the lower 
1158 /// element, and copy the upper element from `a`.
1159 __m128d _mm_cmpnlt_sd (__m128d a, __m128d b) pure @safe
1160 {
1161     static if (GDC_with_SSE2)
1162     {
1163         return __builtin_ia32_cmpnltsd(a, b);
1164     }
1165     else
1166     {
1167         return cast(__m128d) cmpsd!(FPComparison.uge)(a, b);
1168     }
1169 }
1170 
1171 /// Compare packed double-precision (64-bit) floating-point elements 
1172 /// in `a` and `b` to see if neither is NaN.
1173 __m128d _mm_cmpord_pd (__m128d a, __m128d b) pure @safe
1174 {
1175     static if (GDC_with_SSE2)
1176     {
1177         return __builtin_ia32_cmpordpd(a, b);
1178     }
1179     else
1180     {
1181         return cast(__m128d) cmppd!(FPComparison.ord)(a, b);
1182     }
1183 }
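// Illustrative extra unittest (not from the upstream source); a minimal sketch.
unittest
{
    __m128d A = _mm_setr_pd(1.0, double.nan);
    __m128d B = _mm_setr_pd(2.0, 3.0);
    long2 R = cast(long2) _mm_cmpord_pd(A, B);
    long[2] correct = [-1, 0]; // neither operand is NaN in lane 0 only
    assert(R.array == correct);
}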
1184 
1185 /// Compare the lower double-precision (64-bit) floating-point elements 
1186 /// in `a` and `b` to see if neither is NaN, store the result in the 
1187 /// lower element, and copy the upper element from `a` to the upper element.
1188 __m128d _mm_cmpord_sd (__m128d a, __m128d b) pure @safe
1189 {
1190     static if (GDC_with_SSE2)
1191     {
1192         return __builtin_ia32_cmpordsd(a, b);
1193     }
1194     else
1195     {
1196         return cast(__m128d) cmpsd!(FPComparison.ord)(a, b);
1197     }
1198 }
1199 
1200 /// Compare packed double-precision (64-bit) floating-point elements 
1201 /// in `a` and `b` to see if either is NaN.
1202 __m128d _mm_cmpunord_pd (__m128d a, __m128d b) pure @safe
1203 {
1204     static if (GDC_with_SSE2)
1205     {
1206         return __builtin_ia32_cmpunordpd(a, b);
1207     }
1208     else
1209     {
1210         return cast(__m128d) cmppd!(FPComparison.uno)(a, b);
1211     }
1212 }
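// Illustrative extra unittest (not from the upstream source); a minimal sketch.
unittest
{
    __m128d A = _mm_setr_pd(1.0, double.nan);
    __m128d B = _mm_setr_pd(2.0, 3.0);
    long2 R = cast(long2) _mm_cmpunord_pd(A, B);
    long[2] correct = [0, -1]; // at least one operand is NaN in lane 1 only
    assert(R.array == correct);
}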
1213 
1214 /// Compare the lower double-precision (64-bit) floating-point elements 
1215 /// in `a` and `b` to see if either is NaN, store the result in the lower 
1216 /// element, and copy the upper element from `a` to the upper element.
1217 __m128d _mm_cmpunord_sd (__m128d a, __m128d b) pure @safe
1218 {
1219     static if (GDC_with_SSE2)
1220     {
1221         return __builtin_ia32_cmpunordsd(a, b);
1222     }
1223     else
1224     {
1225         return cast(__m128d) cmpsd!(FPComparison.uno)(a, b);
1226     }
1227 }
1228 
1229 /// Compare the lower double-precision (64-bit) floating-point element 
1230 /// in `a` and `b` for equality, and return the boolean result (0 or 1).
1231 int _mm_comieq_sd (__m128d a, __m128d b) pure @safe
1232 {
1233     // Note: for some of the _mm_comixx_sx intrinsics, the NaN semantics are not the same as those of the
1234     // comisd instruction: the intrinsic returns false in the unordered case instead.
1235     //
1236     // C++ compilers actually disagree over the meaning of that instruction.
1237     // GCC handles NaNs like the comisd instruction (returns true if unordered),
1238     // but ICC, clang and MSVC handle NaN the way the Intel Intrinsics Guide says.
1239     // We follow the majority; GCC appears to be buggy with NaNs here.
1240     return a.array[0] == b.array[0];
1241 }
1242 unittest
1243 {
1244     assert(1 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1245     assert(0 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1246     assert(0 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1247     assert(0 == _mm_comieq_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1248     assert(1 == _mm_comieq_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
1249 }
1250 
1251 /// Compare the lower double-precision (64-bit) floating-point element 
1252 /// in `a` and `b` for greater-than-or-equal, and return the boolean 
1253 /// result (0 or 1).
1254 int _mm_comige_sd (__m128d a, __m128d b) pure @safe
1255 {
1256     return a.array[0] >= b.array[0];
1257 }
1258 unittest
1259 {
1260     assert(1 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1261     assert(1 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1262     assert(0 == _mm_comige_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
1263     assert(0 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1264     assert(0 == _mm_comige_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1265     assert(1 == _mm_comige_sd(_mm_set_sd(-0.0), _mm_set_sd(0.0)));
1266 }
1267 
1268 /// Compare the lower double-precision (64-bit) floating-point element 
1269 /// in `a` and `b` for greater-than, and return the boolean result (0 or 1).
1270 int _mm_comigt_sd (__m128d a, __m128d b) pure @safe
1271 {
1272     return a.array[0] > b.array[0];
1273 }
1274 unittest
1275 {
1276     assert(0 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1277     assert(1 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1278     assert(0 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1279     assert(0 == _mm_comigt_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1280     assert(0 == _mm_comigt_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
1281 }
1282 
1283 /// Compare the lower double-precision (64-bit) floating-point element 
1284 /// in `a` and `b` for less-than-or-equal.
1285 int _mm_comile_sd (__m128d a, __m128d b) pure @safe
1286 {
1287     return a.array[0] <= b.array[0];
1288 }
1289 unittest
1290 {
1291     assert(1 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1292     assert(0 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1293     assert(1 == _mm_comile_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
1294     assert(0 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1295     assert(0 == _mm_comile_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1296     assert(1 == _mm_comile_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
1297 }
1298 
1299 /// Compare the lower double-precision (64-bit) floating-point element 
1300 /// in `a` and `b` for less-than, and return the boolean result (0 or 1).
1301 int _mm_comilt_sd (__m128d a, __m128d b) pure @safe
1302 {
1303     return a.array[0] < b.array[0];
1304 }
1305 unittest
1306 {
1307     assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1308     assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1309     assert(1 == _mm_comilt_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
1310     assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1311     assert(0 == _mm_comilt_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1312     assert(0 == _mm_comilt_sd(_mm_set_sd(-0.0), _mm_set_sd(0.0)));
1313 }
1314 
1315 /// Compare the lower double-precision (64-bit) floating-point element
1316 /// in `a` and `b` for not-equal, and return the boolean result (0 or 1).
1317 int _mm_comineq_sd (__m128d a, __m128d b) pure @safe
1318 {
1319     return a.array[0] != b.array[0];
1320 }
1321 unittest
1322 {
1323     assert(0 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1324     assert(1 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1325     assert(1 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1326     assert(1 == _mm_comineq_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1327     assert(0 == _mm_comineq_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
1328 }
1329 
1330 /// Convert packed 32-bit integers in `a` to packed double-precision (64-bit)
1331 /// floating-point elements.
1332 __m128d _mm_cvtepi32_pd (__m128i a) pure @trusted
1333 {
1334     static if (LDC_with_optimizations)
1335     {
1336         // Generates cvtdq2pd since LDC 1.0, even without optimizations
1337         enum ir = `
1338             %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1>
1339             %r = sitofp <2 x i32> %v to <2 x double>
1340             ret <2 x double> %r`;
1341         return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128i)(a);
1342     }
1343     else static if (GDC_with_SSE2)
1344     {
1345         return __builtin_ia32_cvtdq2pd(a);
1346     }
1347     else
1348     {
1349         double2 r = void;
1350         r.ptr[0] = a.array[0];
1351         r.ptr[1] = a.array[1];
1352         return r;
1353     }
1354 }
1355 unittest
1356 {
1357     __m128d A = _mm_cvtepi32_pd(_mm_set1_epi32(54));
1358     assert(A.array[0] == 54.0);
1359     assert(A.array[1] == 54.0);
1360 }
1361 
1362 /// Convert packed 32-bit integers in `a` to packed single-precision (32-bit) 
1363 /// floating-point elements.
1364 __m128 _mm_cvtepi32_ps(__m128i a) pure @trusted
1365 {
1366     static if (DMD_with_DSIMD)
1367     {
1368         return cast(__m128)__simd(XMM.CVTDQ2PS, cast(void16) a);
1369     }
1370     else static if (GDC_with_SSE2)
1371     {
1372         return __builtin_ia32_cvtdq2ps(a);
1373     }
1374     else static if (LDC_with_optimizations)
1375     {
1376         // See #86 for why we had to resort to LLVM IR.
1377         // Plain code below was leading to catastrophic behaviour. 
1378         // x86: Generates cvtdq2ps since LDC 1.1.0 -O0
1379         // ARM: Generates scvtf.4s since LDC 1.8.0 -O0
1380         enum ir = `
1381             %r = sitofp <4 x i32> %0 to <4 x float>
1382             ret <4 x float> %r`;
1383         return cast(__m128) LDCInlineIR!(ir, float4, int4)(a);
1384     }
1385     else
1386     {
1387         __m128 res; // PERF =void;
1388         res.ptr[0] = cast(float)a.array[0];
1389         res.ptr[1] = cast(float)a.array[1];
1390         res.ptr[2] = cast(float)a.array[2];
1391         res.ptr[3] = cast(float)a.array[3];
1392         return res;
1393     }
1394 }
1395 unittest
1396 {
1397     __m128 a = _mm_cvtepi32_ps(_mm_setr_epi32(-1, 0, 1, 1000));
1398     assert(a.array == [-1.0f, 0.0f, 1.0f, 1000.0f]);
1399 }
1400 
1401 /// Convert packed double-precision (64-bit) floating-point elements 
1402 /// in `a` to packed 32-bit integers.
1403 __m128i _mm_cvtpd_epi32 (__m128d a) @trusted
1404 {
1405     // PERF ARM32
1406     static if (LDC_with_SSE2)
1407     {
1408         return __builtin_ia32_cvtpd2dq(a);
1409     }
1410     else static if (GDC_with_SSE2)
1411     {
1412         return __builtin_ia32_cvtpd2dq(a);
1413     }
1414     else static if (LDC_with_ARM64)
1415     {
1416         // Get current rounding mode.
1417         uint fpscr = arm_get_fpcr();
1418         long2 i;
1419         switch(fpscr & _MM_ROUND_MASK_ARM)
1420         {
1421             default:
1422             case _MM_ROUND_NEAREST_ARM:     i = vcvtnq_s64_f64(a); break;
1423             case _MM_ROUND_DOWN_ARM:        i = vcvtmq_s64_f64(a); break;
1424             case _MM_ROUND_UP_ARM:          i = vcvtpq_s64_f64(a); break;
1425             case _MM_ROUND_TOWARD_ZERO_ARM: i = vcvtzq_s64_f64(a); break;
1426         }
1427         int4 zero = 0;
1428         return cast(__m128i) shufflevectorLDC!(int4, 0, 2, 4, 6)(cast(int4)i, zero); // PERF: this slows down the build for nothing; try without shufflevector
1429     }
1430     else
1431     {
1432         // PERF ARM32
1433         __m128i r = _mm_setzero_si128();
1434         r.ptr[0] = convertDoubleToInt32UsingMXCSR(a.array[0]);
1435         r.ptr[1] = convertDoubleToInt32UsingMXCSR(a.array[1]);
1436         return r;
1437     }
1438 }
1439 unittest
1440 {
1441     int4 A = _mm_cvtpd_epi32(_mm_set_pd(61.0, 55.0));
1442     assert(A.array[0] == 55 && A.array[1] == 61 && A.array[2] == 0 && A.array[3] == 0);
1443 }
1444 
1445 /// Convert packed double-precision (64-bit) floating-point elements in `v`
1446 /// to packed 32-bit integers
1447 __m64 _mm_cvtpd_pi32 (__m128d v) @safe
1448 {
1449     return to_m64(_mm_cvtpd_epi32(v));
1450 }
1451 unittest
1452 {
1453     int2 A = cast(int2) _mm_cvtpd_pi32(_mm_set_pd(61.0, 55.0));
1454     assert(A.array[0] == 55 && A.array[1] == 61);
1455 }
1456 
1457 /// Convert packed double-precision (64-bit) floating-point elements 
1458 /// in `a` to packed single-precision (32-bit) floating-point elements.
1459 __m128 _mm_cvtpd_ps (__m128d a) pure @trusted
1460 {
1461     static if (LDC_with_SSE2)
1462     {
1463         return __builtin_ia32_cvtpd2ps(a); // can't be done with IR unfortunately
1464     }
1465     else static if (GDC_with_SSE2)
1466     {
1467         return __builtin_ia32_cvtpd2ps(a);
1468     }
1469     else
1470     { 
1471         __m128 r = void;
1472         r.ptr[0] = a.array[0];
1473         r.ptr[1] = a.array[1];
1474         r.ptr[2] = 0;
1475         r.ptr[3] = 0;
1476         return r;
1477     }
1478 }
1479 unittest
1480 {
1481     __m128d A = _mm_set_pd(5.25, 4.0);
1482     __m128 B = _mm_cvtpd_ps(A);
1483     assert(B.array == [4.0f, 5.25f, 0, 0]);
1484 }
1485 
1486 /// Convert packed 32-bit integers in `v` to packed double-precision 
1487 /// (64-bit) floating-point elements.
1488 __m128d _mm_cvtpi32_pd (__m64 v) pure @safe
1489 {
1490     return _mm_cvtepi32_pd(to_m128i(v));
1491 }
1492 unittest
1493 {
1494     __m128d A = _mm_cvtpi32_pd(_mm_setr_pi32(4, -5));
1495     assert(A.array[0] == 4.0 && A.array[1] == -5.0);
1496 }
1497 
1498 /// Convert packed single-precision (32-bit) floating-point elements 
1499 /// in `a` to packed 32-bit integers
1500 __m128i _mm_cvtps_epi32 (__m128 a) @trusted
1501 {
1502     static if (LDC_with_SSE2)
1503     {
1504         return cast(__m128i) __builtin_ia32_cvtps2dq(a);
1505     }
1506     else static if (GDC_with_SSE2)
1507     {
1508         return __builtin_ia32_cvtps2dq(a);
1509     }
1510     else static if (LDC_with_ARM64)
1511     {
1512         // Get current rounding mode.
1513         uint fpscr = arm_get_fpcr();
1514         switch(fpscr & _MM_ROUND_MASK_ARM)
1515         {
1516             default:
1517             case _MM_ROUND_NEAREST_ARM:     return vcvtnq_s32_f32(a);
1518             case _MM_ROUND_DOWN_ARM:        return vcvtmq_s32_f32(a);
1519             case _MM_ROUND_UP_ARM:          return vcvtpq_s32_f32(a);
1520             case _MM_ROUND_TOWARD_ZERO_ARM: return vcvtzq_s32_f32(a);
1521         }
1522     }
1523     else
1524     {
1525         __m128i r = void;
1526         r.ptr[0] = convertFloatToInt32UsingMXCSR(a.array[0]);
1527         r.ptr[1] = convertFloatToInt32UsingMXCSR(a.array[1]);
1528         r.ptr[2] = convertFloatToInt32UsingMXCSR(a.array[2]);
1529         r.ptr[3] = convertFloatToInt32UsingMXCSR(a.array[3]);
1530         return r;
1531     }
1532 }
1533 unittest
1534 {
1535     // GDC bug #98607
1536     // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98607
1537     // GDC does not provide an optimization barrier for the rounding mode.
1538     // Worked around with different literals. This bug will likely only manifest in unittests.
1539     // The GCC developers provided no actual fix and instead say other compilers are buggy... when they aren't.
1540 
1541     uint savedRounding = _MM_GET_ROUNDING_MODE();
1542 
1543     _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
1544     __m128i A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
1545     assert(A.array == [1, -2, 54, -3]);
1546 
1547     _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
1548     A = _mm_cvtps_epi32(_mm_setr_ps(1.3f, -2.11f, 53.4f, -2.8f));
1549     assert(A.array == [1, -3, 53, -3]);
1550 
1551     _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
1552     A = _mm_cvtps_epi32(_mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f));
1553     assert(A.array == [2, -2, 54, -2]);
1554 
1555     _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
1556     A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.17f, 53.8f, -2.91f));
1557     assert(A.array == [1, -2, 53, -2]);
1558 
1559     _MM_SET_ROUNDING_MODE(savedRounding);
1560 }
1561 
1562 /// Convert packed single-precision (32-bit) floating-point elements 
1563 /// in `a` to packed double-precision (64-bit) floating-point elements.
1564 __m128d _mm_cvtps_pd (__m128 a) pure @trusted
1565 {
1566     static if (LDC_with_optimizations)
1567     {
1568         // Generates cvtps2pd since LDC 1.0 -O0
1569         enum ir = `
1570             %v = shufflevector <4 x float> %0,<4 x float> %0, <2 x i32> <i32 0, i32 1>
1571             %r = fpext <2 x float> %v to <2 x double>
1572             ret <2 x double> %r`;
1573         return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128)(a);
1574     }
1575     else static if (GDC_with_SSE2)
1576     {
1577         return __builtin_ia32_cvtps2pd(a);
1578     }
1579     else
1580     {
1581         double2 r = void;
1582         r.ptr[0] = a.array[0];
1583         r.ptr[1] = a.array[1];
1584         return r;
1585     }
1586 }
1587 unittest
1588 {
1589     __m128d A = _mm_cvtps_pd(_mm_set1_ps(54.0f));
1590     assert(A.array[0] == 54.0);
1591     assert(A.array[1] == 54.0);
1592 }
1593 
1594 /// Copy the lower double-precision (64-bit) floating-point element of `a`.
1595 double _mm_cvtsd_f64 (__m128d a) pure @safe
1596 {
1597     return a.array[0];
1598 }
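// Illustrative extra unittest (not from the upstream source); a minimal sketch.
unittest
{
    assert(_mm_cvtsd_f64(_mm_setr_pd(4.5, -2.0)) == 4.5);
}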
1599 
1600 /// Convert the lower double-precision (64-bit) floating-point element
1601 /// in `a` to a 32-bit integer.
1602 int _mm_cvtsd_si32 (__m128d a) @safe
1603 {
1604     static if (LDC_with_SSE2)
1605     {
1606         return __builtin_ia32_cvtsd2si(a);
1607     }
1608     else static if (GDC_with_SSE2)
1609     {
1610         return __builtin_ia32_cvtsd2si(a);
1611     }
1612     else
1613     {
1614         return convertDoubleToInt32UsingMXCSR(a[0]);
1615     }
1616 }
1617 unittest
1618 {
1619     assert(4 == _mm_cvtsd_si32(_mm_set1_pd(4.0)));
1620 }
1621 
1622 /// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer.
1623 long _mm_cvtsd_si64 (__m128d a) @trusted
1624 {
1625     static if (LDC_with_SSE2)
1626     {
1627         version (X86_64)
1628         {
1629             return __builtin_ia32_cvtsd2si64(a);
1630         }
1631         else
1632         {
1633             // Note: In 32-bit x86, there is no way to convert from float/double to 64-bit integer
1634             // using SSE instructions only. So the builtin doesn't exist for this arch.
1635             return convertDoubleToInt64UsingMXCSR(a[0]);
1636         }
1637     }
1638     else
1639     {
1640         return convertDoubleToInt64UsingMXCSR(a.array[0]);
1641     }
1642 }
1643 unittest
1644 {
1645     assert(-4 == _mm_cvtsd_si64(_mm_set1_pd(-4.0)));
1646 
1647     uint savedRounding = _MM_GET_ROUNDING_MODE();
1648 
1649     _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
1650     assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.49)));
1651 
1652     _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
1653     assert(-56468486187 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.1)));
1654 
1655     _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
1656     assert(56468486187 == _mm_cvtsd_si64(_mm_set1_pd(56468486186.1)));
1657 
1658     _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
1659     assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.9)));
1660 
1661     _MM_SET_ROUNDING_MODE(savedRounding);
1662 }
1663 
1664 deprecated("Use _mm_cvtsd_si64 instead") alias _mm_cvtsd_si64x = _mm_cvtsd_si64; ///
1665 
1666 /// Convert the lower double-precision (64-bit) floating-point element in `b` to a single-precision (32-bit) 
1667 /// floating-point element, store that in the lower element of result, and copy the upper 3 packed elements from `a`
1668 /// to the upper elements of result.
1669 __m128 _mm_cvtsd_ss (__m128 a, __m128d b) pure @trusted
1670 {
1671     static if (GDC_with_SSE2)
1672     {
1673         return __builtin_ia32_cvtsd2ss(a, b); 
1674     }
1675     else
1676     {
1677         // Generates cvtsd2ss since LDC 1.3 -O0
1678         a.ptr[0] = b.array[0];
1679         return a;
1680     }
1681 }
1682 unittest
1683 {
1684     __m128 R = _mm_cvtsd_ss(_mm_set1_ps(4.0f), _mm_set1_pd(3.0));
1685     assert(R.array == [3.0f, 4.0f, 4.0f, 4.0f]);
1686 }
1687 
1688 /// Get the lower 32-bit integer in `a`.
1689 int _mm_cvtsi128_si32 (__m128i a) pure @safe
1690 {
1691     return a.array[0];
1692 }
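unittest
{
    // Small sketch: the lower 32-bit lane comes back unchanged.
    assert(-1 == _mm_cvtsi128_si32(_mm_set1_epi32(-1)));
}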
1693 
1694 /// Get the lower 64-bit integer in `a`.
1695 long _mm_cvtsi128_si64 (__m128i a) pure @safe
1696 {
1697     long2 la = cast(long2)a;
1698     return la.array[0];
1699 }
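unittest
{
    // Small sketch: the lower 64-bit lane comes back unchanged.
    assert(-42 == _mm_cvtsi128_si64(_mm_set_epi64x(7, -42)));
}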
1700 deprecated("Use _mm_cvtsi128_si64 instead") alias _mm_cvtsi128_si64x = _mm_cvtsi128_si64;
1701 
1702 /// Convert the signed 32-bit integer `b` to a double-precision (64-bit) floating-point element, store that in the 
1703 /// lower element of result, and copy the upper element from `a` to the upper element of result.
1704 __m128d _mm_cvtsi32_sd(__m128d a, int b) pure @trusted
1705 {
1706     a.ptr[0] = cast(double)b;
1707     return a;
1708 }
1709 unittest
1710 {
1711     __m128d a = _mm_cvtsi32_sd(_mm_set1_pd(0.0f), 42);
1712     assert(a.array == [42.0, 0]);
1713 }
1714 
1715 /// Copy 32-bit integer `a` to the lower element of result, and zero the upper elements.
1716 __m128i _mm_cvtsi32_si128 (int a) pure @trusted
1717 {
1718     int4 r = [0, 0, 0, 0];
1719     r.ptr[0] = a;
1720     return r;
1721 }
1722 unittest
1723 {
1724     __m128i a = _mm_cvtsi32_si128(65);
1725     assert(a.array == [65, 0, 0, 0]);
1726 }
1727 
1728 /// Convert the signed 64-bit integer `b` to a double-precision (64-bit) floating-point element, store the result in 
1729 /// the lower element of result, and copy the upper element from `a` to the upper element of result.
1731 __m128d _mm_cvtsi64_sd(__m128d a, long b) pure @trusted
1732 {
1733     a.ptr[0] = cast(double)b;
1734     return a;
1735 }
1736 unittest
1737 {
1738     __m128d a = _mm_cvtsi64_sd(_mm_set1_pd(0.0f), 42);
1739     assert(a.array == [42.0, 0]);
1740 }
1741 
1742 /// Copy 64-bit integer `a` to the lower element of result, and zero the upper element.
1743 __m128i _mm_cvtsi64_si128 (long a) pure @trusted
1744 {
1745     long2 r = [0, 0];
1746     r.ptr[0] = a;
1747     return cast(__m128i)(r);
1748 }
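unittest
{
    // Illustrative check: the integer lands in the lower lane, the upper lane is zeroed.
    long2 R = cast(long2) _mm_cvtsi64_si128(0x7878787870707070);
    long[2] correct = [0x7878787870707070, 0];
    assert(R.array == correct);
}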
1749 
1750 deprecated("Use _mm_cvtsi64_sd instead") alias _mm_cvtsi64x_sd = _mm_cvtsi64_sd; ///
1751 deprecated("Use _mm_cvtsi64_si128 instead") alias _mm_cvtsi64x_si128 = _mm_cvtsi64_si128; ///
1752 
1753 /// Convert the lower single-precision (32-bit) floating-point element in `b` to a double-precision (64-bit) 
1754 /// floating-point element, store that in the lower element of result, and copy the upper element from `a` to the upper 
1755 /// element of result.
1756 double2 _mm_cvtss_sd(double2 a, float4 b) pure @trusted
1757 {
1758     a.ptr[0] = b.array[0];
1759     return a;
1760 }
1761 unittest
1762 {
1763     __m128d a = _mm_cvtss_sd(_mm_set1_pd(0.0f), _mm_set1_ps(42.0f));
1764     assert(a.array == [42.0, 0]);
1765 }
1766 
1767 /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit integer with truncation.
1768 long _mm_cvttss_si64 (__m128 a) pure @safe
1769 {
1770     return cast(long)(a.array[0]); // Generates cvttss2si as expected
1771 }
1772 unittest
1773 {
1774     assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
1775 }
1776 
1777 /// Convert packed double-precision (64-bit) floating-point elements in `a` to packed 32-bit integers with truncation.
1778 /// Put zeroes in the upper elements of result.
1779 __m128i _mm_cvttpd_epi32 (__m128d a) pure @trusted
1780 {
1781     static if (LDC_with_SSE2)
1782     {
1783         return __builtin_ia32_cvttpd2dq(a);
1784     }
1785     else static if (GDC_with_SSE2)
1786     {
1787         return __builtin_ia32_cvttpd2dq(a);
1788     }
1789     else
1790     {
1791         // Note: doesn't generate cvttpd2dq as of LDC 1.13
1792         __m128i r; // PERF =void;
1793         r.ptr[0] = cast(int)a.array[0];
1794         r.ptr[1] = cast(int)a.array[1];
1795         r.ptr[2] = 0;
1796         r.ptr[3] = 0;
1797         return r;
1798     }
1799 }
1800 unittest
1801 {
1802     __m128i R = _mm_cvttpd_epi32(_mm_setr_pd(-4.9, 45641.5f));
1803     assert(R.array == [-4, 45641, 0, 0]);
1804 }
1805 
1806 /// Convert packed double-precision (64-bit) floating-point elements in `v` 
1807 /// to packed 32-bit integers with truncation.
1808 __m64 _mm_cvttpd_pi32 (__m128d v) pure @safe
1809 {
1810     return to_m64(_mm_cvttpd_epi32(v));
1811 }
1812 unittest
1813 {
1814     int2 R = cast(int2) _mm_cvttpd_pi32(_mm_setr_pd(-4.9, 45641.7f));
1815     int[2] correct = [-4, 45641];
1816     assert(R.array == correct);
1817 }
1818 
1819 /// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit integers with truncation.
1820 __m128i _mm_cvttps_epi32 (__m128 a) pure @trusted
1821 {
1822     // x86: Generates cvttps2dq since LDC 1.3 -O2
1823     // ARM64: generates fcvtze since LDC 1.8 -O2
1824     __m128i r; // PERF = void;
1825     r.ptr[0] = cast(int)a.array[0];
1826     r.ptr[1] = cast(int)a.array[1];
1827     r.ptr[2] = cast(int)a.array[2];
1828     r.ptr[3] = cast(int)a.array[3];
1829     return r;
1830 }
1831 unittest
1832 {
1833     __m128i R = _mm_cvttps_epi32(_mm_setr_ps(-4.9, 45641.5f, 0.0f, 1.0f));
1834     assert(R.array == [-4, 45641, 0, 1]);
1835 }
1836 
1837 /// Convert the lower double-precision (64-bit) floating-point element in `a` to a 32-bit integer with truncation.
1838 int _mm_cvttsd_si32 (__m128d a)
1839 {
1840     // Generates cvttsd2si since LDC 1.3 -O0
1841     return cast(int)a.array[0];
1842 }
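unittest
{
    // Illustrative check of truncation toward zero.
    assert(_mm_cvttsd_si32(_mm_set1_pd(-4.9)) == -4);
    assert(_mm_cvttsd_si32(_mm_set1_pd(4.9)) == 4);
}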
1843 
1844 /// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer with truncation.
1845 long _mm_cvttsd_si64 (__m128d a)
1846 {
1847     // Generates cvttsd2si since LDC 1.3 -O0
1848     // but in 32-bit mode it is instead a long sequence that resorts to the FPU
1849     return cast(long)a.array[0];
1850 }
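unittest
{
    // Illustrative check of truncation toward zero, with a value beyond 32-bit range.
    assert(_mm_cvttsd_si64(_mm_set1_pd(-56468486186.9)) == -56468486186);
}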
1851 
1852 deprecated("Use _mm_cvttsd_si64 instead") alias _mm_cvttsd_si64x = _mm_cvttsd_si64; ///
1853 
1854 /// Divide packed double-precision (64-bit) floating-point elements in `a` by packed elements in `b`.
1855 __m128d _mm_div_pd(__m128d a, __m128d b) pure @safe
1856 {
1857     pragma(inline, true);
1858     return a / b;
1859 }
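unittest
{
    // Illustrative element-wise division.
    __m128d A = _mm_setr_pd(9.0, -8.0);
    __m128d B = _mm_setr_pd(3.0, 2.0);
    __m128d R = _mm_div_pd(A, B);
    double[2] correct = [3.0, -4.0];
    assert(R.array == correct);
}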
1860 
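/// Divide the lower double-precision (64-bit) floating-point element in `a` by the lower element in `b`, store the
/// result in the lower element of result, and copy the upper element from `a` to the upper element of result.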
1861 __m128d _mm_div_sd(__m128d a, __m128d b) pure @trusted
1862 {
1863     static if (GDC_with_SSE2)
1864     {
1865         return __builtin_ia32_divsd(a, b);
1866     }
1867     else version(DigitalMars)
1868     {
1869         // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
1870         // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
1871         asm pure nothrow @nogc @trusted { nop;}
1872         a.array[0] = a.array[0] / b.array[0];
1873         return a;
1874     }
1875     else
1876     {
1877         a.ptr[0] /= b.array[0];
1878         return a;
1879     }
1880 }
1881 unittest
1882 {
1883     __m128d a = [2.0, 4.5];
1884     a = _mm_div_sd(a, a);
1885     assert(a.array == [1.0, 4.5]);
1886 }
1887 
1888 /// Extract a 16-bit integer from `v`, selected with `index`.
1889 /// Warning: the returned value is zero-extended to 32 bits.
1890 int _mm_extract_epi16(__m128i v, int index) pure @safe
1891 {
1892     short8 r = cast(short8)v;
1893     return cast(ushort)(r.array[index & 7]);
1894 }
1895 unittest
1896 {
1897     __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, -1);
1898     assert(_mm_extract_epi16(A, 6) == 6);
1899     assert(_mm_extract_epi16(A, 0) == 65535);
1900     assert(_mm_extract_epi16(A, 5 + 8) == 5);
1901 }
1902 
1903 /// Copy `v`, and insert the 16-bit integer `i` at the location specified by `index`.
1904 __m128i _mm_insert_epi16 (__m128i v, int i, int index) @trusted
1905 {
1906     short8 r = cast(short8)v;
1907     r.ptr[index & 7] = cast(short)i;
1908     return cast(__m128i)r;
1909 }
1910 unittest
1911 {
1912     __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
1913     short8 R = cast(short8) _mm_insert_epi16(A, 42, 6);
1914     short[8] correct = [0, 1, 2, 3, 4, 5, 42, 7];
1915     assert(R.array == correct);
1916 }
1917 
1918 /// Perform a serializing operation on all load-from-memory instructions that were issued prior 
1919 /// to this instruction. Guarantees that every load instruction that precedes, in program order, 
1920 /// is globally visible before any load instruction which follows the fence in program order.
1921 void _mm_lfence() @trusted
1922 {
1923     version(GNU)
1924     {
1925         static if (GDC_with_SSE2)
1926         {
1927             __builtin_ia32_lfence();
1928         }
1929         else version(X86)
1930         {
1931             asm pure nothrow @nogc @trusted
1932             {
1933                 "lfence;\n" : : : ;
1934             }
1935         }
1936         else
1937             static assert(false);
1938     }
1939     else static if (LDC_with_SSE2)
1940     {
1941         __builtin_ia32_lfence();
1942     }
1943     else static if (LDC_with_ARM64)
1944     {
1945          __builtin_arm_dmb(9);  // dmb ishld
1946     }
1947     else static if (DMD_with_asm)
1948     {
1949         asm nothrow @nogc pure @trusted
1950         {
1951             lfence;
1952         }
1953     }
1954     else version(LDC)
1955     {
1956         // When the architecture is unknown, generate a full memory barrier,
1957         // as the semantics of lfence do not really match those of atomics.
1958         llvm_memory_fence();
1959     }
1960     else
1961         static assert(false);
1962 }
1963 unittest
1964 {
1965     _mm_lfence();
1966 }
1967 
1968 /// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory.
1969 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
1970 __m128d _mm_load_pd (const(double) * mem_addr) pure
1971 {
1972     pragma(inline, true);
1973     __m128d* aligned = cast(__m128d*)mem_addr;
1974     return *aligned;
1975 }
1976 unittest
1977 {
1978     align(16) double[2] S = [-5.0, 7.0];
1979     __m128d R = _mm_load_pd(S.ptr);
1980     assert(R.array == S);
1981 }
1982 
1983 /// Load a double-precision (64-bit) floating-point element from memory into both elements of dst.
1984 /// `mem_addr` does not need to be aligned on any particular boundary.
1985 __m128d _mm_load_pd1 (const(double)* mem_addr) pure
1986 {
1987     double m = *mem_addr;
1988     __m128d r; // PERF =void;
1989     r.ptr[0] = m;
1990     r.ptr[1] = m;
1991     return r;
1992 }
1993 unittest
1994 {
1995     double what = 4;
1996     __m128d R = _mm_load_pd1(&what);
1997     double[2] correct = [4.0, 4];
1998     assert(R.array == correct);
1999 }
2000 
2001 /// Load a double-precision (64-bit) floating-point element from memory into the lower element of result, and zero the upper
2002 /// element. `mem_addr` does not need to be aligned on any particular boundary.
2003 __m128d _mm_load_sd (const(double)* mem_addr) pure @trusted
2004 {
2005     double2 r = [0, 0];
2006     r.ptr[0] = *mem_addr;
2007     return r;
2008 }
2009 unittest
2010 {
2011     double x = -42;
2012     __m128d a = _mm_load_sd(&x);
2013     assert(a.array == [-42.0, 0.0]);
2014 }
2015 
2016 /// Load 128-bits of integer data from memory into dst. 
2017 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
2018 __m128i _mm_load_si128 (const(__m128i)* mem_addr) pure @safe
2019 {
2020     pragma(inline, true);
2021     return *mem_addr;
2022 }
2023 unittest
2024 {
2025     align(16) int[4] correct = [-1, 2, 3, 4];
2026     int4 A = cast(int4) _mm_load_si128(cast(__m128i*) correct.ptr);
2027     assert(A.array == correct);
2028 }
2029 
2030 alias _mm_load1_pd = _mm_load_pd1; ///
2031 
2032 /// Load a double-precision (64-bit) floating-point element from memory into the upper element of result, and copy the 
2033 /// lower element from `a` to the lower element of result. `mem_addr` does not need to be aligned on any particular boundary.
2034 __m128d _mm_loadh_pd (__m128d a, const(double)* mem_addr) pure @trusted
2035 {
2036     pragma(inline, true);
2037     a.ptr[1] = *mem_addr;
2038     return a;
2039 }
2040 unittest
2041 {
2042     double A = 7.0;
2043     __m128d B = _mm_setr_pd(4.0, -5.0);
2044     __m128d R = _mm_loadh_pd(B, &A);
2045     double[2] correct = [ 4.0, 7.0 ];
2046     assert(R.array == correct);
2047 }
2048 
2049 /// Load a 64-bit integer from memory into the lower element of result, and zero the upper element.
2050 /// Note: strange signature, since the memory does not have to be aligned and only needs to point to 64 addressable bits, not 128.
2051 /// You may use `_mm_loadu_si64` instead.
2052 __m128i _mm_loadl_epi64 (const(__m128i)* mem_addr) pure @trusted
2053 {
2054     pragma(inline, true);
2055     static if (DMD_with_DSIMD)
2056     {
2057         return cast(__m128i) __simd(XMM.LODQ, *cast(__m128i*)mem_addr);
2058     }
2059     else
2060     {
2061         auto pLong = cast(const(long)*)mem_addr;
2062         long2 r = [0, 0];
2063         r.ptr[0] = *pLong;
2064         return cast(__m128i)(r);
2065     }
2066 }
2067 unittest
2068 {
2069     long A = 0x7878787870707070;
2070     long2 R = cast(long2) _mm_loadl_epi64(cast(__m128i*)&A);
2071     long[2] correct = [0x7878787870707070, 0];
2072     assert(R.array == correct);
2073 }
2074 
2075 /// Load a double-precision (64-bit) floating-point element from memory into the lower element of result, and copy the 
2076 /// upper element from `a` to the upper element of result. `mem_addr` does not need to be aligned on any particular boundary.
2077 __m128d _mm_loadl_pd (__m128d a, const(double)* mem_addr) pure @trusted
2078 {
2079     a.ptr[0] = *mem_addr;
2080     return a;
2081 }
2082 unittest
2083 {
2084     double A = 7.0;
2085     __m128d B = _mm_setr_pd(4.0, -5.0);
2086     __m128d R = _mm_loadl_pd(B, &A);
2087     double[2] correct = [ 7.0, -5.0 ];
2088     assert(R.array == correct);
2089 }
2090 
2091 /// Load 2 double-precision (64-bit) floating-point elements from memory into result in reverse order. 
2092 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
2093 __m128d _mm_loadr_pd (const(double)* mem_addr) pure @trusted
2094 {
2095     __m128d a = *cast(__m128d*)(mem_addr);
2096     __m128d r; // PERF =void;
2097     r.ptr[0] = a.array[1];
2098     r.ptr[1] = a.array[0];
2099     return r;
2100 }
2101 unittest
2102 {
2103     align(16) double[2] A = [56.0, -74.0];
2104     __m128d R = _mm_loadr_pd(A.ptr);
2105     double[2] correct = [-74.0, 56.0];
2106     assert(R.array == correct);
2107 }
2108 
2109 /// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory. 
2110 /// `mem_addr` does not need to be aligned on any particular boundary.
2111 __m128d _mm_loadu_pd (const(double)* mem_addr) pure @trusted
2112 {
2113     pragma(inline, true);
2114     static if (GDC_with_SSE2)
2115     {
2116         return __builtin_ia32_loadupd(mem_addr); 
2117     }
2118     else static if (LDC_with_optimizations)
2119     {
2120         return loadUnaligned!(double2)(mem_addr);
2121     }
2122     else version(DigitalMars)
2123     {
2124         // Apparently inside __simd you can use aligned dereferences without fear.
2125         // That was issue 23048 on dlang's Bugzilla.
2126         static if (DMD_with_DSIMD)
2127         {
2128             return cast(__m128d)__simd(XMM.LODUPD, *cast(double2*)mem_addr);
2129         }
2130         else static if (SSESizedVectorsAreEmulated)
2131         {
2132             // Since this vector is emulated, it doesn't have alignment constraints
2133             // and as such we can just cast it.
2134             return *cast(__m128d*)(mem_addr);
2135         }
2136         else
2137         {
2138             __m128d result;
2139             result.ptr[0] = mem_addr[0];
2140             result.ptr[1] = mem_addr[1];
2141             return result;
2142         }
2143     }
2144     else
2145     {
2146         __m128d result;
2147         result.ptr[0] = mem_addr[0];
2148         result.ptr[1] = mem_addr[1];
2149         return result;
2150     }
2151 }
2152 unittest
2153 {
2154     double[2] A = [56.0, -75.0];
2155     __m128d R = _mm_loadu_pd(A.ptr);
2156     double[2] correct = [56.0, -75.0];
2157     assert(R.array == correct);
2158 }
2159 
2160 /// Load 128-bits of integer data from memory. `mem_addr` does not need to be aligned on any particular boundary.
2161 __m128i _mm_loadu_si128 (const(__m128i)* mem_addr) pure @trusted
2162 {
2163     // PERF DMD
2164     pragma(inline, true);
2165     static if (GDC_with_SSE2)
2166     {
2167         return cast(__m128i) __builtin_ia32_loaddqu(cast(const(char*))mem_addr);
2168     }
2169     else static if (LDC_with_optimizations)
2170     {
2171         return loadUnaligned!(__m128i)(cast(int*)mem_addr);
2172     }
2173     else
2174     {
2175         const(int)* p = cast(const(int)*)mem_addr;
2176         __m128i r = void;
2177         r.ptr[0] = p[0];
2178         r.ptr[1] = p[1];
2179         r.ptr[2] = p[2];
2180         r.ptr[3] = p[3];
2181         return r;
2182     }
2183 }
2184 unittest
2185 {
2186     align(16) int[4] correct = [-1, 2, -3, 4];
2187     int4 A = cast(int4) _mm_loadu_si128(cast(__m128i*) correct.ptr);
2188     assert(A.array == correct);
2189 }
2190 
2191 /// Load unaligned 16-bit integer from memory into the first element, fill with zeroes otherwise.
2192 __m128i _mm_loadu_si16(const(void)* mem_addr) pure @trusted // TODO: should be @system actually
2193 {
2194     static if (DMD_with_DSIMD)
2195     {
2196         int r = *cast(short*)(mem_addr);
2197         return cast(__m128i) __simd(XMM.LODD, *cast(__m128i*)&r);
2198     }
2199     else version(DigitalMars)
2200     {
2201         // Workaround issue: https://issues.dlang.org/show_bug.cgi?id=21672
2202         // DMD cannot handle the below code...
2203         align(16) short[8] r = [0, 0, 0, 0, 0, 0, 0, 0];
2204         r[0] = *cast(short*)(mem_addr);
2205         return *cast(int4*)(r.ptr);
2206     }
2207     else
2208     {
2209         short r = *cast(short*)(mem_addr);
2210         short8 result = [0, 0, 0, 0, 0, 0, 0, 0];
2211         result.ptr[0] = r;
2212         return cast(__m128i)result;
2213     }
2214 }
2215 unittest
2216 {
2217     short r = 13;
2218     short8 A = cast(short8) _mm_loadu_si16(&r);
2219     short[8] correct = [13, 0, 0, 0, 0, 0, 0, 0];
2220     assert(A.array == correct);
2221 }
2222 
2223 /// Load unaligned 32-bit integer from memory into the first element of result. Zero the upper elements.
2224 __m128i _mm_loadu_si32 (const(void)* mem_addr) pure @trusted // TODO: should be @system actually
2225 {
2226     pragma(inline, true);
2227     int r = *cast(int*)(mem_addr);
2228     int4 result = [0, 0, 0, 0];
2229     result.ptr[0] = r;
2230     return result;
2231 }
2232 unittest
2233 {
2234     int r = 42;
2235     __m128i A = _mm_loadu_si32(&r);
2236     int[4] correct = [42, 0, 0, 0];
2237     assert(A.array == correct);
2238 }
2239 
2240 /// Load unaligned 64-bit integer from memory into the first element of result.
2241 /// The upper 64 bits are zeroed.
2242 __m128i _mm_loadu_si64 (const(void)* mem_addr) pure @system
2243 {
2244     pragma(inline, true);
2245     static if (DMD_with_DSIMD)
2246     {
2247         return cast(__m128i) __simd(XMM.LODQ, *cast(__m128i*)mem_addr);
2248     }
2249     else
2250     {    
2251         auto pLong = cast(const(long)*)mem_addr;
2252         long2 r = [0, 0];
2253         r.ptr[0] = *pLong;
2254         return cast(__m128i)r;
2255     }
2256 }
2257 unittest
2258 {
2259     long r = 446446446446;
2260     long2 A = cast(long2) _mm_loadu_si64(&r);
2261     long[2] correct = [446446446446, 0];
2262     assert(A.array == correct);
2263 }
2264 
2265 /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate
2266 /// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers,
2267 /// and pack the results in destination.
2268 __m128i _mm_madd_epi16 (__m128i a, __m128i b) pure @trusted
2269 {
2270     static if (GDC_with_SSE2)
2271     {
2272         return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b);
2273     }
2274     else static if (LDC_with_SSE2)
2275     {
2276         return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b);
2277     }
2278     else static if (LDC_with_optimizations)
2279     {
2280         // 5 inst with arm64 + LDC 1.32 + -O1
2281         enum ir = `            
2282             %ia = sext <8 x i16> %0 to <8 x i32>
2283             %ib = sext <8 x i16> %1 to <8 x i32>
2284             %p = mul <8 x i32> %ia, %ib
2285             %p_even = shufflevector <8 x i32> %p, <8 x i32> undef, <4 x i32> <i32 0, i32 2,i32 4, i32 6>
2286             %p_odd  = shufflevector <8 x i32> %p, <8 x i32> undef, <4 x i32> <i32 1, i32 3,i32 5, i32 7>            
2287             %p_sum = add <4 x i32> %p_even, %p_odd
2288             ret <4 x i32> %p_sum`;
2289         return cast(__m128i) LDCInlineIR!(ir, int4, short8, short8)(cast(short8)a, cast(short8)b);
2290     }
2291     else
2292     {
2293         short8 sa = cast(short8)a;
2294         short8 sb = cast(short8)b;
2295         int4 r;
2296         foreach(i; 0..4)
2297         {
2298             r.ptr[i] = sa.array[2*i] * sb.array[2*i] + sa.array[2*i+1] * sb.array[2*i+1];
2299         }
2300         return r;
2301     }
2302 }
2303 unittest
2304 {
2305     short8 A = [0, 1, 2, 3, -32768, -32768, 32767, 32767];
2306     short8 B = [0, 1, 2, 3, -32768, -32768, 32767, 32767];
2307     int4 R = _mm_madd_epi16(cast(__m128i)A, cast(__m128i)B);
2308     int[4] correct = [1, 13, -2147483648, 2*32767*32767];
2309     assert(R.array == correct);
2310 }
2311 
2312 /// Conditionally store 8-bit integer elements from `a` into memory using `mask`
2313 /// (elements are not stored when the highest bit is not set in the corresponding element)
2314 /// and a non-temporal memory hint. `mem_addr` does not need to be aligned on any particular
2315 /// boundary.
2316 void _mm_maskmoveu_si128 (__m128i a, __m128i mask, void* mem_addr) @trusted
2317 {
2318     static if (GDC_with_SSE2)
2319     {    
2320         return __builtin_ia32_maskmovdqu(cast(ubyte16)a, cast(ubyte16)mask, cast(char*)mem_addr);
2321     }
2322     else static if (LDC_with_SSE2)
2323     {
2324         return __builtin_ia32_maskmovdqu(cast(byte16)a, cast(byte16)mask, cast(char*)mem_addr);
2325     }
2326     else static if (LDC_with_ARM64)
2327     {
2328         // PERF: catastrophic on ARM32
2329         byte16 bmask  = cast(byte16)mask;
2330         byte16 shift = 7;
2331         bmask = bmask >> shift; // sign-extend to have a 0xff or 0x00 mask
2332         mask = cast(__m128i) bmask;
2333         __m128i dest = loadUnaligned!__m128i(cast(int*)mem_addr);
2334         dest = (a & mask) | (dest & ~mask);
2335         storeUnaligned!__m128i(dest, cast(int*)mem_addr);
2336     }
2337     else
2338     {
2339         byte16 b = cast(byte16)a;
2340         byte16 m = cast(byte16)mask;
2341         byte* dest = cast(byte*)(mem_addr);
2342         foreach(j; 0..16)
2343         {
2344             if (m.array[j] & 128)
2345             {
2346                 dest[j] = b.array[j];
2347             }
2348         }
2349     }
2350 }
2351 unittest
2352 {
2353     ubyte[16] dest =           [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42];
2354     __m128i mask = _mm_setr_epi8(0,-1, 0,-1,-1, 1,-1,-1, 0,-1,-4,-1,-1, 0,-127, 0);
2355     __m128i A    = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15);
2356     _mm_maskmoveu_si128(A, mask, dest.ptr);
2357     ubyte[16] correct =        [42, 1,42, 3, 4,42, 6, 7,42, 9,10,11,12,42,14,42];
2358     assert(dest == correct);
2359 }
2360 
2361 /// Compare packed signed 16-bit integers in `a` and `b`, and return packed maximum values.
2362 __m128i _mm_max_epi16 (__m128i a, __m128i b) pure @safe
2363 {
2364     static if (GDC_with_SSE2)
2365     {
2366         return cast(__m128i) __builtin_ia32_pmaxsw128(cast(short8)a, cast(short8)b);
2367     }
2368     else version(LDC)
2369     {
2370         // x86: pmaxsw since LDC 1.0 -O1
2371         // ARM: smax.8h since LDC 1.5 -O1
2372         short8 sa = cast(short8)a;
2373         short8 sb = cast(short8)b;
2374         static if (SIMD_COMPARISON_MASKS_16B)
2375             short8 greater = sa > sb;
2376         else
2377             short8 greater = greaterMask!short8(sa, sb);
2378         return cast(__m128i)( (greater & sa) | (~greater & sb) );
2379     }
2380     else
2381     {
2382         __m128i lowerShorts = _mm_cmpgt_epi16(a, b); // ones where a should be selected, b else
2383         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
2384         __m128i mask = _mm_and_si128(aTob, lowerShorts);
2385         return _mm_xor_si128(b, mask);
2386     }
2387 }
2388 unittest
2389 {
2390     short8 R = cast(short8) _mm_max_epi16(_mm_setr_epi16(32767, 1, -4, -8, 9,  7, 0,-57),
2391                                           _mm_setr_epi16(-4,-8,  9,  7, 0,-32768, 0,  0));
2392     short[8] correct =                                  [32767, 1,  9,  7, 9,  7, 0,  0];
2393     assert(R.array == correct);
2394 }
2395 
2396 /// Compare packed unsigned 8-bit integers in a and b, and return packed maximum values.
2397 __m128i _mm_max_epu8 (__m128i a, __m128i b) pure @safe
2398 {
2399     // PERF DMD
2400     static if (GDC_with_SSE2)
2401     {
2402         return cast(__m128i) __builtin_ia32_pmaxub128(cast(ubyte16)a, cast(ubyte16)b);
2403     }
2404     else version(LDC)
2405     {
2406         // x86: pmaxub since LDC 1.0.0 -O1
2407         // ARM64: umax.16b since LDC 1.5.0 -O1
2408         // PERF: catastrophic on ARM32
2409         ubyte16 sa = cast(ubyte16)a;
2410         ubyte16 sb = cast(ubyte16)b;
2411         static if (SIMD_COMPARISON_MASKS_16B)
2412             ubyte16 greater = (cast(ubyte16)a > cast(ubyte16)b);
2413         else
2414             ubyte16 greater = cast(ubyte16) greaterMask!ubyte16(sa, sb);
2415         return cast(__m128i)( (greater & sa) | (~greater & sb) );
2416     }
2417     else
2418     {
2419         // PERF: use algorithm from _mm_max_epu16
2420         __m128i value128 = _mm_set1_epi8(-128);
2421         __m128i higher = _mm_cmpgt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison
2422         __m128i aTob = a ^ b; // a ^ (a ^ b) == b
2423         __m128i mask = aTob & higher;
2424         return b ^ mask;
2426     }
2427 }
2428 unittest
2429 {
2430     byte16 R = cast(byte16) _mm_max_epu8(_mm_setr_epi8(45, 1, -4, -8, 9,  7, 0,-57, -4,-8,  9,  7, 0,-57, 0,  0),
2431                                          _mm_setr_epi8(-4,-8,  9,  7, 0,-57, 0,  0, 45, 1, -4, -8, 9,  7, 0,-57));
2432     byte[16] correct =                                [-4,-8, -4, -8, 9,-57, 0,-57, -4,-8, -4, -8, 9,-57, 0,-57];
2433     assert(R.array == correct);
2434 }
2435 
2436 /// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return 
2437 /// packed maximum values.
2438 __m128d _mm_max_pd (__m128d a, __m128d b) pure @trusted
2439 {
2440     static if (GDC_with_SSE2)
2441     {
2442         return __builtin_ia32_maxpd(a, b);
2443     }
2444     else
2445     {
2446         // x86: Generates maxpd starting with LDC 1.9 -O2
2447         a.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0];
2448         a.ptr[1] = (a.array[1] > b.array[1]) ? a.array[1] : b.array[1];
2449         return a;
2450     }
2451 }
2452 unittest
2453 {
2454     __m128d A = _mm_setr_pd(4.0, 1.0);
2455     __m128d B = _mm_setr_pd(1.0, 8.0);
2456     __m128d M = _mm_max_pd(A, B);
2457     assert(M.array[0] == 4.0);
2458     assert(M.array[1] == 8.0);
2459 }
2460 
2461 /// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`, store the maximum value in the 
2462 /// lower element of result, and copy the upper element from `a` to the upper element of result.
2463 __m128d _mm_max_sd (__m128d a, __m128d b) pure @trusted
2464 {
2465     static if (GDC_with_SSE2)
2466     {
2467         return __builtin_ia32_maxsd(a, b);
2468     }
2469     else
2470     {
2471         __m128d r = a;
2472         // Generates maxsd starting with LDC 1.3
2473         r.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0];
2474         return r;
2475     }
2476 }
2477 unittest
2478 {
2479     __m128d A = _mm_setr_pd(1.0, 1.0);
2480     __m128d B = _mm_setr_pd(4.0, 2.0);
2481     __m128d M = _mm_max_sd(A, B);
2482     assert(M.array[0] == 4.0);
2483     assert(M.array[1] == 1.0);
2484 }
2485 
2486 /// Perform a serializing operation on all load-from-memory and store-to-memory instructions that were issued prior to 
2487 /// this instruction. Guarantees that every memory access that precedes, in program order, the memory fence instruction 
2488 /// is globally visible before any memory instruction which follows the fence in program order.
2489 void _mm_mfence() @trusted // not pure!
2490 {
2491     version(GNU)
2492     {
2493         static if (GDC_with_SSE2)
2494         {
2495             __builtin_ia32_mfence();
2496         }
2497         else version(X86)
2498         {
2499             asm pure nothrow @nogc @trusted
2500             {
2501                 "mfence;\n" : : : ;
2502             }
2503         }
2504         else
2505             static assert(false);
2506     }
2507     else static if (LDC_with_SSE2)
2508     {
2509         __builtin_ia32_mfence();
2510     }
2511     else static if (DMD_with_asm)
2512     {
2513         asm nothrow @nogc pure @trusted
2514         {
2515             mfence;
2516         }
2517     }
2518     else version(LDC)
2519     {
2520         // Note: will generate the DMB ish instruction on ARM
2521         llvm_memory_fence();
2522     }
2523     else
2524         static assert(false);
2525 }
2526 unittest
2527 {
2528     _mm_mfence();
2529 }
2530 
2531 /// Compare packed signed 16-bit integers in `a` and `b`, and return packed minimum values.
2532 __m128i _mm_min_epi16 (__m128i a, __m128i b) pure @safe
2533 {
2534     static if (GDC_with_SSE2)
2535     {
2536         return cast(__m128i) __builtin_ia32_pminsw128(cast(short8)a, cast(short8)b);
2537     }
2538     else version(LDC)
2539     {
2540         // x86: pminsw since LDC 1.0 -O1
2541         // ARM64: smin.8h since LDC 1.5 -O1
2542         short8 sa = cast(short8)a;
2543         short8 sb = cast(short8)b;
2544         static if (SIMD_COMPARISON_MASKS_16B)
2545             short8 greater = sa > sb;
2546         else
2547             short8 greater = greaterMask!short8(sa, sb);
2548         return cast(__m128i)( (~greater & sa) | (greater & sb) );
2549     }
2550     else
2551     {
2552         __m128i lowerShorts = _mm_cmplt_epi16(a, b); // ones where a should be selected, b else
2553         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
2554         __m128i mask = _mm_and_si128(aTob, lowerShorts);
2555         return _mm_xor_si128(b, mask);
2556     }
2557 }
2558 unittest
2559 {
2560     short8 R = cast(short8) _mm_min_epi16(_mm_setr_epi16(45, 1, -4, -8, 9,  7, 0,-32768),
2561                                           _mm_setr_epi16(-4,-8,  9,  7, 0,-57, 0,  0));
2562     short[8] correct =                                  [-4,-8, -4, -8, 0,-57, 0, -32768];
2563     assert(R.array == correct);
2564 }
2565 
2566 /// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed minimum values.
2567 __m128i _mm_min_epu8 (__m128i a, __m128i b) pure @safe
2568 {
2569     static if (GDC_with_SSE2)
2570     {
2571         return cast(__m128i) __builtin_ia32_pminub128(cast(ubyte16)a, cast(ubyte16)b);
2572     }
2573     else version(LDC)
2574     {
2575         // x86: pminub since LDC 1.0.0 -O1
2576         // ARM: umin.16b since LDC 1.5.0 -O1
2577         // PERF: catastrophic on ARM32
2578         ubyte16 sa = cast(ubyte16)a;
2579         ubyte16 sb = cast(ubyte16)b;
2580         static if (SIMD_COMPARISON_MASKS_16B)
2581             ubyte16 greater = (cast(ubyte16)a > cast(ubyte16)b);
2582         else
2583             ubyte16 greater = cast(ubyte16) greaterMask!ubyte16(sa, sb);
2584         return cast(__m128i)( (~greater & sa) | (greater & sb) );
2585     }
2586     else
2587     {
2588         // PERF: use the algorithm from _mm_max_epu16
2589         __m128i value128 = _mm_set1_epi8(-128);
2590         __m128i lower = _mm_cmplt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison
2591         __m128i aTob = a ^ b; // a ^ (a ^ b) == b
2592         __m128i mask = aTob & lower;
2593         return b ^ mask;
2594     }
2595 }
2596 unittest
2597 {
2598     byte16 R = cast(byte16) _mm_min_epu8(_mm_setr_epi8(45, 1, -4, -8, 9,  7, 0,-57, -4,-8,  9,  7, 0,-57, 0,  0),
2599                                          _mm_setr_epi8(-4,-8,  9,  7, 0,-57, 0,  0, 45, 1, -4, -8, 9,  7, 0,-57));
2600     byte[16] correct =                                [45, 1,  9,  7, 0,  7, 0,  0, 45, 1,  9,  7, 0,  7, 0,  0];
2601     assert(R.array == correct);
2602 }
2603 
2604 /// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return packed minimum values.
2605 __m128d _mm_min_pd (__m128d a, __m128d b) pure @trusted
2606 {
2607     static if (GDC_with_SSE2)
2608     {
2609         return __builtin_ia32_minpd(a, b);
2610     }
2611     else
2612     {
2613         // Generates minpd starting with LDC 1.9
2614         a.ptr[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
2615         a.ptr[1] = (a.array[1] < b.array[1]) ? a.array[1] : b.array[1];
2616         return a;
2617     }
2618 }
2619 unittest
2620 {
2621     __m128d A = _mm_setr_pd(1.0, 2.0);
2622     __m128d B = _mm_setr_pd(4.0, 1.0);
2623     __m128d M = _mm_min_pd(A, B);
2624     assert(M.array[0] == 1.0);
2625     assert(M.array[1] == 1.0);
2626 }
2627 
2628 /// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`, store the minimum value in 
2629 /// the lower element of result, and copy the upper element from `a` to the upper element of result.
2630 __m128d _mm_min_sd (__m128d a, __m128d b) pure @safe
2631 {
2632     static if (GDC_with_SSE2)
2633     {
2634         return __builtin_ia32_minsd(a, b);
2635     }
2636     else
2637     {
2638         // Generates minsd starting with LDC 1.3
2639         __m128d r = a;
2640         r.array[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
2641         return r;
2642     }
2643 }
2644 unittest
2645 {
2646     __m128d A = _mm_setr_pd(1.0, 3.0);
2647     __m128d B = _mm_setr_pd(4.0, 2.0);
2648     __m128d M = _mm_min_sd(A, B);
2649     assert(M.array[0] == 1.0);
2650     assert(M.array[1] == 3.0);
2651 }
2652 
2653 /// Copy the lower 64-bit integer in `a` to the lower element of result, and zero the upper element.
2654 __m128i _mm_move_epi64 (__m128i a) pure @trusted
2655 {
2656     static if (GDC_with_SSE2)
2657     {
2658         // slightly better with GDC -O0
2659         return cast(__m128i) __builtin_ia32_movq128(cast(long2)a); 
2660     }
2661     else
2662     {
2663         long2 result = [ 0, 0 ];
2664         long2 la = cast(long2) a;
2665         result.ptr[0] = la.array[0];
2666         return cast(__m128i)(result);
2667     }
2668 }
2669 unittest
2670 {
2671     long2 A = [13, 47];
2672     long2 B = cast(long2) _mm_move_epi64( cast(__m128i)A );
2673     long[2] correct = [13, 0];
2674     assert(B.array == correct);
2675 }
2676 
2677 /// Move the lower double-precision (64-bit) floating-point element from `b` to the lower element of result, and copy 
2678 /// the upper element from `a` to the upper element of dst.
2679 __m128d _mm_move_sd (__m128d a, __m128d b) pure @trusted
2680 {
2681     static if (GDC_with_SSE2)
2682     {
2683         return __builtin_ia32_movsd(a, b); 
2684     }
2685     else
2686     {
2687         b.ptr[1] = a.array[1];
2688         return b;
2689     }
2690 }
2691 unittest
2692 {
2693     double2 A = [13.0, 47.0];
2694     double2 B = [34.0, 58.0];
2695     double2 C = _mm_move_sd(A, B);
2696     double[2] correct = [34.0, 47.0];
2697     assert(C.array == correct);
2698 }
2699 
2700 /// Create mask from the most significant bit of each 8-bit element in `a`.
2701 int _mm_movemask_epi8 (__m128i a) pure @trusted
2702 {
2703     // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047
2704     static if (GDC_with_SSE2)
2705     {
2706         return __builtin_ia32_pmovmskb128(cast(ubyte16)a);
2707     }
2708     else static if (LDC_with_SSE2)
2709     {
2710         return __builtin_ia32_pmovmskb128(cast(byte16)a);
2711     }
2712     else static if (LDC_with_ARM64)
2713     {
2714         // Solution from https://stackoverflow.com/questions/11870910/sse-mm-movemask-epi8-equivalent-method-for-arm-neon
2715         // The other two solutions there rely on intrinsics that LLVM does not expose, and chasing that down took a long time.
2716         // So there might be something a bit faster, but this one is reasonable and branchless.
2717         byte8 mask_shift;
2718         mask_shift.ptr[0] = 7;
2719         mask_shift.ptr[1] = 6;
2720         mask_shift.ptr[2] = 5;
2721         mask_shift.ptr[3] = 4;
2722         mask_shift.ptr[4] = 3;
2723         mask_shift.ptr[5] = 2;
2724         mask_shift.ptr[6] = 1;
2725         mask_shift.ptr[7] = 0;
2726         byte8 mask_and = byte8(-128);
2727         byte8 lo = vget_low_u8(cast(byte16)a);
2728         byte8 hi = vget_high_u8(cast(byte16)a);
2729         lo = vand_u8(lo, mask_and);
2730         lo = vshr_u8(lo, mask_shift);
2731         hi = vand_u8(hi, mask_and);
2732         hi = vshr_u8(hi, mask_shift);
2733         lo = vpadd_u8(lo,lo);
2734         lo = vpadd_u8(lo,lo);
2735         lo = vpadd_u8(lo,lo);
2736         hi = vpadd_u8(hi,hi);
2737         hi = vpadd_u8(hi,hi);
2738         hi = vpadd_u8(hi,hi);
2739         return (cast(ubyte)(hi[0]) << 8) | cast(ubyte)(lo[0]);
2740     }
2741     else
2742     {
2743         byte16 ai = cast(byte16)a;
2744         int r = 0;
2745         foreach(bit; 0..16)
2746         {
2747             if (ai.array[bit] < 0) r += (1 << bit);
2748         }
2749         return r;
2750     }
2751 }
2752 unittest
2753 {
2754     assert(0x9C36 == _mm_movemask_epi8(_mm_set_epi8(-1, 1, 2, -3, -1, -1, 4, 8, 127, 0, -1, -1, 0, -1, -1, 0)));
2755 }
2756 
2757 /// Create mask from the most significant bit of each 16-bit element in `a`. #BONUS
2758 int _mm_movemask_epi16 (__m128i a) pure @trusted
2759 {
2760     return _mm_movemask_epi8(_mm_packs_epi16(a, _mm_setzero_si128()));
2761 }
2762 unittest
2763 {
2764     assert(0x9C == _mm_movemask_epi16(_mm_set_epi16(-1, 1, 2, -3, -32768, -1, 32767, 8)));
2765 }
2766 
2767 /// Set each bit of mask result based on the most significant bit of the corresponding packed double-precision (64-bit) 
2768 /// floating-point element in `v`.
2769 int _mm_movemask_pd(__m128d v) pure @safe
2770 {
2771     // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047
2772     static if (GDC_or_LDC_with_SSE2)
2773     {
2774         return __builtin_ia32_movmskpd(v);
2775     }
2776     else
2777     {
2778         long2 lv = cast(long2)v;
2779         int r = 0;
2780         if (lv.array[0] < 0) r += 1;
2781         if (lv.array[1] < 0) r += 2;
2782         return r;
2783     }
2784 }
2785 unittest
2786 {
2787     __m128d A = cast(__m128d) _mm_set_epi64x(-1, 0);
2788     assert(_mm_movemask_pd(A) == 2);
2789 }
2790 
2791 /// Copy the lower 64-bit integer in `v`.
2792 __m64 _mm_movepi64_pi64 (__m128i v) pure @safe
2793 {
2794     long2 lv = cast(long2)v;
2795     return long1(lv.array[0]);
2796 }
2797 unittest
2798 {
2799     __m128i A = _mm_set_epi64x(-1, -2);
2800     __m64 R = _mm_movepi64_pi64(A);
2801     assert(R.array[0] == -2);
2802 }
2803 
2804 /// Copy the 64-bit integer `a` to the lower element of dest, and zero the upper element.
2805 __m128i _mm_movpi64_epi64 (__m64 a) pure @trusted
2806 {
2807     long2 r;
2808     r.ptr[0] = a.array[0];
2809     r.ptr[1] = 0;
2810     return cast(__m128i)r;
2811 }
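unittest
{
    // Illustrative round-trip through __m64: the lower lane survives, the upper lane is zeroed.
    __m64 A = _mm_movepi64_pi64(_mm_set_epi64x(13, -42));
    long2 R = cast(long2) _mm_movpi64_epi64(A);
    long[2] correct = [-42, 0];
    assert(R.array == correct);
}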
2812 
2813 /// Multiply the low unsigned 32-bit integers from each packed 64-bit element in `a` and `b`, 
2814 /// and store the unsigned 64-bit results.
2815 __m128i _mm_mul_epu32 (__m128i a, __m128i b) pure @trusted
2816 {    
2817     // PERF DMD D_SIMD
2818     static if (GDC_with_SSE2)
2819     {
2820         return cast(__m128i) __builtin_ia32_pmuludq128 (a, b);
2821     }
2822     else
2823     {
2824         version(LDC)
2825         {
2826             static if (__VERSION__ >= 2088)
2827             {
2828                 // Need LLVM9 for proper optimization
2829                 long2 la, lb;
2830                 la.ptr[0] = cast(uint)a.array[0];
2831                 la.ptr[1] = cast(uint)a.array[2];
2832                 lb.ptr[0] = cast(uint)b.array[0];
2833                 lb.ptr[1] = cast(uint)b.array[2];
2834             }
2835             else
2836             {
2837                 __m128i zero;
2838                 zero = 0;
2839                 long2 la = cast(long2) shufflevectorLDC!(int4, 0, 4, 2, 6)(a, zero);
2840                 long2 lb = cast(long2) shufflevectorLDC!(int4, 0, 4, 2, 6)(b, zero);
2841             }
2842         }
2843         else
2844         {
2845             long2 la, lb;
2846             la.ptr[0] = cast(uint)a.array[0];
2847             la.ptr[1] = cast(uint)a.array[2];
2848             lb.ptr[0] = cast(uint)b.array[0];
2849             lb.ptr[1] = cast(uint)b.array[2];
2850         }
2851 
2852         version(DigitalMars)
2853         {
2854             // DMD has no long2 mul
2855             la.ptr[0] *= lb.array[0];
2856             la.ptr[1] *= lb.array[1];
2857             return cast(__m128i)(la);
2858         }
2859         else
2860         {
2861             static if (__VERSION__ >= 2076)
2862             {
2863                 return cast(__m128i)(la * lb);
2864             }
2865             else
2866             {
2867                 // long2 mul not supported before LDC 1.5
2868                 la.ptr[0] *= lb.array[0];
2869                 la.ptr[1] *= lb.array[1];
2870                 return cast(__m128i)(la);
2871             }
2872         }
2873     }
2874 }
2875 unittest
2876 {
2877     __m128i A = _mm_set_epi32(42, 0xDEADBEEF, 42, 0xffffffff);
2878     __m128i B = _mm_set_epi32(42, 0xCAFEBABE, 42, 0xffffffff);
2879     __m128i C = _mm_mul_epu32(A, B);
2880     long2 LC = cast(long2)C;
2881     assert(LC.array[0] == 18446744065119617025uL);
2882     assert(LC.array[1] == 12723420444339690338uL);
2883 }
2884 
2885 /// Multiply packed double-precision (64-bit) floating-point elements in `a` and `b`, and return the results. 
2886 __m128d _mm_mul_pd(__m128d a, __m128d b) pure @safe
2887 {
2888     pragma(inline, true);
2889     return a * b;
2890 }
2891 unittest
2892 {
2893     __m128d a = [-2.0, 1.5];
2894     a = _mm_mul_pd(a, a);
2895     assert(a.array == [4.0, 2.25]);
2896 }
2897 
2898 /// Multiply the lower double-precision (64-bit) floating-point element in `a` and `b`, store the result in the lower 
2899 /// element of result, and copy the upper element from `a` to the upper element of result.
2900 __m128d _mm_mul_sd(__m128d a, __m128d b) pure @trusted
2901 {
2902     version(DigitalMars)
2903     {    
2904         // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
2905         // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
2906         asm pure nothrow @nogc @trusted { nop;}
2907         a.array[0] = a.array[0] * b.array[0];
2908         return a;
2909     }
2910     else static if (GDC_with_SSE2)
2911     {
2912         return __builtin_ia32_mulsd(a, b);
2913     }
2914     else
2915     {
2916         a.ptr[0] *= b.array[0];
2917         return a;
2918     }
2919 }
2920 unittest
2921 {
2922     __m128d a = [-2.0, 1.5];
2923     a = _mm_mul_sd(a, a);
2924     assert(a.array == [4.0, 1.5]);
2925 }
2926 
2927 /// Multiply the low unsigned 32-bit integers from `a` and `b`, 
2928 /// and get an unsigned 64-bit result.
2929 __m64 _mm_mul_su32 (__m64 a, __m64 b) pure @safe
2930 {
2931     return to_m64(_mm_mul_epu32(to_m128i(a), to_m128i(b)));
2932 }
2933 unittest
2934 {
2935     __m64 A = _mm_set_pi32(42, 0xDEADBEEF);
2936     __m64 B = _mm_set_pi32(42, 0xCAFEBABE);
2937     __m64 C = _mm_mul_su32(A, B);
2938     assert(C.array[0] == 0xDEADBEEFuL * 0xCAFEBABEuL);
2939 }
2940 
2941 /// Multiply the packed signed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the 
2942 /// high 16 bits of the intermediate integers.
2943 __m128i _mm_mulhi_epi16 (__m128i a, __m128i b) pure @trusted
2944 {
2945     static if (GDC_with_SSE2)
2946     {
2947         return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b);
2948     }
2949     else static if (LDC_with_SSE2)
2950     {
2951         return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b);
2952     }
2953     else
2954     {
2955         // ARM64: LDC 1.5 -O2 or later gives a nice sequence with 2 x ext.16b, 2 x smull.4s and shrn.4h shrn2.8h
2956         //        PERF: it seems the simde solution has one less instruction in ARM64.
2957         // PERF: Catastrophic in ARM32.
2958         short8 sa = cast(short8)a;
2959         short8 sb = cast(short8)b;
2960         short8 r = void;
2961         r.ptr[0] = (sa.array[0] * sb.array[0]) >> 16;
2962         r.ptr[1] = (sa.array[1] * sb.array[1]) >> 16;
2963         r.ptr[2] = (sa.array[2] * sb.array[2]) >> 16;
2964         r.ptr[3] = (sa.array[3] * sb.array[3]) >> 16;
2965         r.ptr[4] = (sa.array[4] * sb.array[4]) >> 16;
2966         r.ptr[5] = (sa.array[5] * sb.array[5]) >> 16;
2967         r.ptr[6] = (sa.array[6] * sb.array[6]) >> 16;
2968         r.ptr[7] = (sa.array[7] * sb.array[7]) >> 16;
2969         return cast(__m128i)r;
2970     }
2971 }
2972 unittest
2973 {
2974     __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7);
2975     __m128i B = _mm_set1_epi16(16384);
2976     short8 R = cast(short8)_mm_mulhi_epi16(A, B);
2977     short[8] correct = [0, -4, 0, 0, 1, 2, 4, 1];
2978     assert(R.array == correct);
2979 }
2980 
2981 /// Multiply the packed unsigned 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the 
2982 /// high 16 bits of the intermediate integers.
2983 __m128i _mm_mulhi_epu16 (__m128i a, __m128i b) pure @trusted
2984 {
2985     static if (GDC_with_SSE2)
2986     {
2987         return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b);
2988     }
2989     else static if (LDC_with_SSE2)
2990     {
2991         return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b);
2992     }
2993     else
2994     {
2995         // ARM64: LDC 1.5 -O2 or later gives a nice sequence with 2 x ext.16b, 2 x umull.4s and shrn.4h shrn2.8h
2996         //      it seems the simde solution has one less instruction in ARM64
2997         // PERF: Catastrophic in ARM32.
2998         short8 sa = cast(short8)a;
2999         short8 sb = cast(short8)b;
3000         short8 r = void;
3001         r.ptr[0] = cast(short)( (cast(ushort)sa.array[0] * cast(ushort)sb.array[0]) >> 16 );
3002         r.ptr[1] = cast(short)( (cast(ushort)sa.array[1] * cast(ushort)sb.array[1]) >> 16 );
3003         r.ptr[2] = cast(short)( (cast(ushort)sa.array[2] * cast(ushort)sb.array[2]) >> 16 );
3004         r.ptr[3] = cast(short)( (cast(ushort)sa.array[3] * cast(ushort)sb.array[3]) >> 16 );
3005         r.ptr[4] = cast(short)( (cast(ushort)sa.array[4] * cast(ushort)sb.array[4]) >> 16 );
3006         r.ptr[5] = cast(short)( (cast(ushort)sa.array[5] * cast(ushort)sb.array[5]) >> 16 );
3007         r.ptr[6] = cast(short)( (cast(ushort)sa.array[6] * cast(ushort)sb.array[6]) >> 16 );
3008         r.ptr[7] = cast(short)( (cast(ushort)sa.array[7] * cast(ushort)sb.array[7]) >> 16 );
3009         return cast(__m128i)r;
3010     }
3011 }
3012 unittest
3013 {
3014     __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7);
3015     __m128i B = _mm_set1_epi16(16384);
3016     short8 R = cast(short8)_mm_mulhi_epu16(A, B);
3017     short[8] correct = [0, 0x3FFC, 0, 0, 1, 2, 4, 1];
3018     assert(R.array == correct);
3019 }
3020 
3021 /// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the low 16 
3022 /// bits of the intermediate integers.
3023 __m128i _mm_mullo_epi16 (__m128i a, __m128i b) pure @safe
3024 {
3025     return cast(__m128i)(cast(short8)a * cast(short8)b);
3026 }
3027 unittest
3028 {
3029     __m128i A = _mm_setr_epi16(16384, -16, 0,      3, 4, 1, 16, 7);
3030     __m128i B = _mm_set1_epi16(16384);
3031     short8 R = cast(short8)_mm_mullo_epi16(A, B);
3032     short[8] correct = [0, 0, 0, -16384, 0, 16384, 0, -16384];
3033     assert(R.array == correct);
3034 }
3035 
3036 /// Compute the bitwise NOT of 128 bits in `a`. #BONUS
3037 __m128i _mm_not_si128 (__m128i a) pure @safe
3038 {
3039     return ~a;
3040 }
3041 unittest
3042 {
3043     __m128i A = _mm_set1_epi32(-748);
3044     int4 notA = cast(int4) _mm_not_si128(A);
3045     int[4] correct = [747, 747, 747, 747];
3046     assert(notA.array == correct);
3047 }
3048 
3049 /// Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in `a` and `b`.
3050 __m128d _mm_or_pd (__m128d a, __m128d b) pure @safe
3051 {
3052     pragma(inline, true);
3053     return cast(__m128d)( cast(__m128i)a | cast(__m128i)b );
3054 }
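unittest
{
    // Bitwise OR on doubles; here it is used to force the sign bits on (illustrative values).
    __m128d A = _mm_setr_pd(4.0, 0.0);
    __m128d B = _mm_setr_pd(-0.0, -0.0);
    __m128d R = _mm_or_pd(A, B);
    assert(R.array[0] == -4.0);
    assert(_mm_movemask_pd(R) == 3); // both sign bits are set
}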
3055 
3056 /// Compute the bitwise OR of 128 bits (representing integer data) in `a` and `b`.
3057 __m128i _mm_or_si128 (__m128i a, __m128i b) pure @safe
3058 {
3059     pragma(inline, true);
3060     return a | b;
3061 }
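unittest
{
    // Illustrative bitwise OR over the four 32-bit lanes.
    __m128i A = _mm_setr_epi32(0x0F0F0F0F, 0, -1, 0x70000000);
    __m128i B = _mm_setr_epi32(0xF0F0F0F0, 4,  0, 0x0FFFFFFF);
    int4 R = cast(int4) _mm_or_si128(A, B);
    int[4] correct = [-1, 4, -1, 0x7FFFFFFF];
    assert(R.array == correct);
}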
3062 
3063 /// Convert packed signed 32-bit integers from `a` and `b` to packed 16-bit integers using signed saturation.
3064 __m128i _mm_packs_epi32 (__m128i a, __m128i b) pure @trusted
3065 {
3066     static if (DMD_with_DSIMD)
3067     {
3068         return cast(__m128i) __simd(XMM.PACKSSDW, a, b);
3069     }
3070     else static if (GDC_with_SSE2)
3071     {
3072         return cast(__m128i) __builtin_ia32_packssdw128(a, b);
3073     }    
3074     else static if (LDC_with_SSE2)
3075     {
3076         return cast(__m128i) __builtin_ia32_packssdw128(a, b);
3077     }
3078     else static if (LDC_with_ARM64)
3079     {
3080         short4 ra = vqmovn_s32(cast(int4)a);
3081         short4 rb = vqmovn_s32(cast(int4)b);
3082         return cast(__m128i)vcombine_s16(ra, rb);
3083     }
3084     else
3085     {
3086         // PERF: catastrophic on ARM32
3087         short8 r;
3088         r.ptr[0] = saturateSignedIntToSignedShort(a.array[0]);
3089         r.ptr[1] = saturateSignedIntToSignedShort(a.array[1]);
3090         r.ptr[2] = saturateSignedIntToSignedShort(a.array[2]);
3091         r.ptr[3] = saturateSignedIntToSignedShort(a.array[3]);
3092         r.ptr[4] = saturateSignedIntToSignedShort(b.array[0]);
3093         r.ptr[5] = saturateSignedIntToSignedShort(b.array[1]);
3094         r.ptr[6] = saturateSignedIntToSignedShort(b.array[2]);
3095         r.ptr[7] = saturateSignedIntToSignedShort(b.array[3]);
3096         return cast(__m128i)r;
3097     }
3098 }
3099 unittest
3100 {
3101     __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0);
3102     short8 R = cast(short8) _mm_packs_epi32(A, A);
3103     short[8] correct = [32767, -32768, 1000, 0, 32767, -32768, 1000, 0];
3104     assert(R.array == correct);
3105 }
3106 
3107 /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using signed saturation.
3108 __m128i _mm_packs_epi16 (__m128i a, __m128i b) pure @trusted
3109 {
3110     static if (DMD_with_DSIMD)
3111     {
3112         return cast(__m128i) __simd(XMM.PACKSSWB, a, b);
3113     }
3114     else static if (GDC_with_SSE2)
3115     {
3116         return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b);
3117     }
3118     else static if (LDC_with_SSE2)
3119     {
3120         return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b);
3121     }
3122     else static if (LDC_with_ARM64)
3123     {
3124         // Generates a nice pair of sqxtn.8b + sqxtn2 since LDC 1.5 -O2
3125         byte8 ra = vqmovn_s16(cast(short8)a);
3126         byte8 rb = vqmovn_s16(cast(short8)b);
3127         return cast(__m128i)vcombine_s8(ra, rb);
3128     }
3129     else
3130     {
3131         // PERF: ARM32 is missing
3132         byte16 r;
3133         short8 sa = cast(short8)a;
3134         short8 sb = cast(short8)b;
3135         foreach(i; 0..8)
3136             r.ptr[i] = saturateSignedWordToSignedByte(sa.array[i]);
3137         foreach(i; 0..8)
3138             r.ptr[i+8] = saturateSignedWordToSignedByte(sb.array[i]);
3139         return cast(__m128i)r;
3140     }
3141 }
3142 unittest
3143 {
3144     __m128i A = _mm_setr_epi16(1000, -1000, 1000, 0, 256, -129, 254, 0);
3145     byte16 R = cast(byte16) _mm_packs_epi16(A, A);
3146     byte[16] correct = [127, -128, 127, 0, 127, -128, 127, 0,
3147                         127, -128, 127, 0, 127, -128, 127, 0];
3148     assert(R.array == correct);
3149 }
3150 
3151 /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using unsigned saturation.
3152 __m128i _mm_packus_epi16 (__m128i a, __m128i b) pure @trusted
3153 {
3154     // PERF DMD catastrophic
3155     static if (DMD_with_DSIMD)
3156     {
3157         return cast(__m128i) __simd(XMM.PACKUSWB, a, b);
3158     }
3159     else static if (GDC_with_SSE2)
3160     {
3161         return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b);
3162     }
3163     else static if (LDC_with_SSE2)
3164     {
3165         return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b);
3166     }
3167     else static if (LDC_with_ARM64)
3168     {
3169         // Generates a nice pair of sqxtun + sqxtun2 since LDC 1.5 -O2
3170         byte8 ra = vqmovun_s16(cast(short8)a);
3171         byte8 rb = vqmovun_s16(cast(short8)b);
3172         return cast(__m128i)vcombine_s8(ra, rb);
3173     }
3174     else
3175     {
3176         short8 sa = cast(short8)a;
3177         short8 sb = cast(short8)b;
3178         align(16) ubyte[16] result = void;
3179         for (int i = 0; i < 8; ++i)
3180         {
3181             short s = sa[i];
3182             if (s < 0) s = 0;
3183             if (s > 255) s = 255;
3184             result[i] = cast(ubyte)s;
3185 
3186             s = sb[i];
3187             if (s < 0) s = 0;
3188             if (s > 255) s = 255;
3189             result[i+8] = cast(ubyte)s;
3190         }
3191         return *cast(__m128i*)(result.ptr);
3192     }
3193 }
3194 unittest
3195 {
3196     __m128i A = _mm_setr_epi16(-10, 400, 0, 256, 255, 2, 1, 0);
3197     byte16 AA = cast(byte16) _mm_packus_epi16(A, A);
3198     static immutable ubyte[16] correctResult = [0, 255, 0, 255, 255, 2, 1, 0,
3199                                                 0, 255, 0, 255, 255, 2, 1, 0];
3200     foreach(i; 0..16)
3201         assert(AA.array[i] == cast(byte)(correctResult[i]));
3202 }
3203 
3204 /// Provide a hint to the processor that the code sequence is a spin-wait loop. This can help improve the performance 
3205 /// and power consumption of spin-wait loops.
3206 void _mm_pause() @trusted
3207 {
3208     version(GNU)
3209     {
3210         static if (GDC_with_SSE2)
3211         {
3212             __builtin_ia32_pause();
3213         }
3214         else version(X86)
3215         {
3216             asm pure nothrow @nogc @trusted
3217             {
3218                 "pause;\n" : : : ;
3219             }
3220         }
3221         else
3222             static assert(false);
3223     }
3224     else static if (LDC_with_SSE2)
3225     {
3226         __builtin_ia32_pause();
3227     }
3228     else static if (DMD_with_asm)
3229     {
3230         asm nothrow @nogc pure @trusted
3231         {
3232             rep; nop; // F3 90 =  pause
3233         }
3234     }
3235     else version (LDC)
3236     {
3237         // PERF: Do nothing currently; could use the "yield" instruction on ARM.
3238     }
3239     else
3240         static assert(false);
3241 }
3242 unittest
3243 {
3244     _mm_pause();
3245 }
3246 
3247 /// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`, then horizontally sum each 
3248 /// consecutive 8 differences to produce two unsigned 16-bit integers, and pack these unsigned 16-bit integers in the 
3249 /// low 16 bits of 64-bit elements in result.
3250 __m128i _mm_sad_epu8 (__m128i a, __m128i b) pure @trusted
3251 {
3252     static if (GDC_with_SSE2)
3253     {
3254         return cast(__m128i) __builtin_ia32_psadbw128(cast(ubyte16)a, cast(ubyte16)b);
3255     }
3256     else static if (LDC_with_SSE2)
3257     {
3258         return cast(__m128i) __builtin_ia32_psadbw128(cast(byte16)a, cast(byte16)b);
3259     }
3260     else static if (LDC_with_ARM64)
3261     {
3262         ushort8 t = cast(ushort8) vpaddlq_u8(vabdq_u8(cast(byte16) a, cast(byte16) b));
3263 
3264         // PERF: Looks suboptimal vs addp
3265         ushort r0 = cast(ushort)(t[0] + t[1] + t[2] + t[3]);
3266         ushort r4 = cast(ushort)(t[4] + t[5] + t[6] + t[7]);
3267         ushort8 r = 0;
3268         r[0] = r0;
3269         r[4] = r4;
3270         return cast(__m128i) r;
3271     }
3272     else
3273     {
3274         // PERF: ARM32 is lacking
3275         byte16 ab = cast(byte16)a;
3276         byte16 bb = cast(byte16)b;
3277         ubyte[16] t;
3278         foreach(i; 0..16)
3279         {
3280             int diff = cast(ubyte)(ab.array[i]) - cast(ubyte)(bb.array[i]);
3281             if (diff < 0) diff = -diff;
3282             t[i] = cast(ubyte)(diff);
3283         }
3284         int4 r = _mm_setzero_si128();
3285         r.ptr[0] = t[0] + t[1] + t[2] + t[3] + t[4] + t[5] + t[6] + t[7];
3286         r.ptr[2] = t[8] + t[9] + t[10]+ t[11]+ t[12]+ t[13]+ t[14]+ t[15];
3287         return r;
3288     }
3289 }
3290 unittest
3291 {
3292     __m128i A = _mm_setr_epi8(3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54); // primes + 1
3293     __m128i B = _mm_set1_epi8(1);
3294     __m128i R = _mm_sad_epu8(A, B);
3295     int[4] correct = [2 + 3 + 5 + 7 + 11 + 13 + 17 + 19,
3296                       0,
3297                       23 + 29 + 31 + 37 + 41 + 43 + 47 + 53,
3298                       0];
3299     assert(R.array == correct);
3300 }
3301 
3302 /// Set packed 16-bit integers with the supplied values.
3303 __m128i _mm_set_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted
3304 {
3305     short8 r = void;
3306     r.ptr[0] = e0;
3307     r.ptr[1] = e1;
3308     r.ptr[2] = e2;
3309     r.ptr[3] = e3;
3310     r.ptr[4] = e4;
3311     r.ptr[5] = e5;
3312     r.ptr[6] = e6;
3313     r.ptr[7] = e7;
3314     return cast(__m128i) r;
3315 }
3316 unittest
3317 {
3318     __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
3319     short8 B = cast(short8) A;
3320     foreach(i; 0..8)
3321         assert(B.array[i] == i);
3322 }
3323 
3324 /// Set packed 32-bit integers with the supplied values.
3325 __m128i _mm_set_epi32 (int e3, int e2, int e1, int e0) pure @trusted
3326 {
3327     // PERF: does a constant array literal get inlined correctly here, vs. int4 field assignment?
3328     align(16) int[4] r = [e0, e1, e2, e3];
3329     return *cast(int4*)&r;
3330 }
3331 unittest
3332 {
3333     __m128i A = _mm_set_epi32(3, 2, 1, 0);
3334     foreach(i; 0..4)
3335         assert(A.array[i] == i);
3336 }
3337 
3338 /// Set packed 64-bit integers with the supplied values.
3339 __m128i _mm_set_epi64(__m64 e1, __m64 e0) pure @trusted
3340 {
3341     pragma(inline, true);
3342     long2 r = void;
3343     r.ptr[0] = e0.array[0];
3344     r.ptr[1] = e1.array[0];
3345     return cast(__m128i)(r);
3346 }
3347 unittest
3348 {
3349     __m128i A = _mm_set_epi64(_mm_cvtsi64_m64(1234), _mm_cvtsi64_m64(5678));
3350     long2 B = cast(long2) A;
3351     assert(B.array[0] == 5678);
3352     assert(B.array[1] == 1234);
3353 }
3354 
3355 /// Set packed 64-bit integers with the supplied values.
3356 __m128i _mm_set_epi64x (long e1, long e0) pure @trusted
3357 {
3358     pragma(inline, true);
3359     long2 r = void;
3360     r.ptr[0] = e0;
3361     r.ptr[1] = e1;
3362     return cast(__m128i)(r);
3363 }
3364 unittest
3365 {
3366     __m128i A = _mm_set_epi64x(1234, -5678);
3367     long2 B = cast(long2) A;
3368     assert(B.array[0] == -5678);
3369     assert(B.array[1] == 1234);
3370 }
3371 
3372 /// Set packed 8-bit integers with the supplied values.
3373 __m128i _mm_set_epi8 (byte e15, byte e14, byte e13, byte e12,
3374                       byte e11, byte e10, byte e9, byte e8,
3375                       byte e7, byte e6, byte e5, byte e4,
3376                       byte e3, byte e2, byte e1, byte e0) pure @trusted
3377 {
3378     align(16) byte[16] result = [e0, e1,  e2,  e3,  e4,  e5,  e6, e7,
3379                                  e8, e9, e10, e11, e12, e13, e14, e15];
3380     return *cast(__m128i*)(result.ptr);
3381 }
3382 unittest
3383 {
3384     byte16 R = cast(byte16) _mm_set_epi8(-1, 0, 56, 127, -128, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14);
3385     byte[16] correct = [14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, -128, 127, 56, 0, -1];
3386     assert(R.array == correct);
3387 }
3388 
3389 /// Set packed double-precision (64-bit) floating-point elements with the supplied values.
3390 __m128d _mm_set_pd (double e1, double e0) pure @trusted
3391 {
3392     pragma(inline, true);
3393     double2 r = void;
3394     r.ptr[0] = e0;
3395     r.ptr[1] = e1;
3396     return r;
3397 }
3398 unittest
3399 {
3400     __m128d A = _mm_set_pd(61.0, 55.0);
3401     double[2] correct = [55.0, 61.0];
3402     assert(A.array == correct);
3403 }
3404 
3405 /// Broadcast double-precision (64-bit) floating-point value `a` to all elements.
3406 __m128d _mm_set_pd1 (double a) pure @trusted
3407 {
3408     pragma(inline, true);
3409     __m128d r = void;
3410     r.ptr[0] = a;
3411     r.ptr[1] = a;
3412     return r;
3413 }
3414 unittest
3415 {
3416     __m128d A = _mm_set_pd1(61.0);
3417     double[2] correct = [61.0, 61.0];
3418     assert(A.array == correct);
3419 }
3420 
3421 /// Copy double-precision (64-bit) floating-point element `a` to the lower element of result, 
3422 /// and zero the upper element.
3423 __m128d _mm_set_sd (double a) pure @trusted
3424 {
3425     double2 r = void;
3426     r.ptr[0] = a;
3427     r.ptr[1] = 0.0;
3428     return r;
3429 }
3430 unittest
3431 {
3432     __m128d A = _mm_set_sd(61.0);
3433     double[2] correct = [61.0, 0.0];
3434     assert(A.array == correct);
3435 }
3436 
3437 /// Broadcast 16-bit integer `a` to all elements.
3438 __m128i _mm_set1_epi16 (short a) pure @trusted
3439 {
3440     version(DigitalMars) // workaround https://issues.dlang.org/show_bug.cgi?id=21469 
3441     {
3442         short8 v = a;
3443         return cast(__m128i) v;
3444     }
3445     else
3446     {
3447         pragma(inline, true);
3448         return cast(__m128i)(short8(a));
3449     }
3450 }
3451 unittest
3452 {
3453     short8 a = cast(short8) _mm_set1_epi16(31);
3454     for (int i = 0; i < 8; ++i)
3455         assert(a.array[i] == 31);
3456 }
3457 
3458 /// Broadcast 32-bit integer `a` to all elements.
3459 __m128i _mm_set1_epi32 (int a) pure @trusted
3460 {
3461     pragma(inline, true);
3462     return cast(__m128i)(int4(a));
3463 }
3464 unittest
3465 {
3466     int4 a = cast(int4) _mm_set1_epi32(31);
3467     for (int i = 0; i < 4; ++i)
3468         assert(a.array[i] == 31);
3469 }
3470 
3471 /// Broadcast 64-bit integer `a` to all elements.
3472 __m128i _mm_set1_epi64 (__m64 a) pure @safe
3473 {
3474     return _mm_set_epi64(a, a);
3475 }
3476 unittest
3477 {
3478     long b = 0x1DEADCAFE; 
3479     __m64 a;
3480     a.ptr[0] = b;
3481     long2 c = cast(long2) _mm_set1_epi64(a);
3482     assert(c.array[0] == b);
3483     assert(c.array[1] == b);
3484 }
3485 
3486 /// Broadcast 64-bit integer `a` to all elements.
3487 __m128i _mm_set1_epi64x (long a) pure @trusted
3488 {
3489     long2 b = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470
3490     return cast(__m128i)(b);
3491 }
3492 unittest
3493 {
3494     long b = 0x1DEADCAFE;
3495     long2 c = cast(long2) _mm_set1_epi64x(b);
3496     for (int i = 0; i < 2; ++i)
3497         assert(c.array[i] == b);
3498 }
3499 
3500 /// Broadcast 8-bit integer `a` to all elements.
3501 __m128i _mm_set1_epi8 (byte a) pure @trusted
3502 {
3503     pragma(inline, true);
3504     byte16 b = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470
3505     return cast(__m128i)(b);
3506 }
3507 unittest
3508 {
3509     byte16 b = cast(byte16) _mm_set1_epi8(31);
3510     for (int i = 0; i < 16; ++i)
3511         assert(b.array[i] == 31);
3512 }
3513 
3514 alias _mm_set1_pd = _mm_set_pd1;
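// Added example (not in the upstream source): the alias behaves exactly like `_mm_set_pd1`.
unittest
{
    __m128d A = _mm_set1_pd(12.0);
    double[2] correct = [12.0, 12.0];
    assert(A.array == correct);
}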
3515 
3516 /// Set packed 16-bit integers with the supplied values in reverse order.
3517 __m128i _mm_setr_epi16 (short e7, short e6, short e5, short e4, 
3518                         short e3, short e2, short e1, short e0) pure @trusted
3519 {
3520     short8 r = void;
3521     r.ptr[0] = e7;
3522     r.ptr[1] = e6;
3523     r.ptr[2] = e5;
3524     r.ptr[3] = e4;
3525     r.ptr[4] = e3;
3526     r.ptr[5] = e2;
3527     r.ptr[6] = e1;
3528     r.ptr[7] = e0;
3529     return cast(__m128i)(r);
3530 }
3531 unittest
3532 {
3533     short8 A = cast(short8) _mm_setr_epi16(7, 6, 5, -32768, 32767, 2, 1, 0);
3534     short[8] correct = [7, 6, 5, -32768, 32767, 2, 1, 0];
3535     assert(A.array == correct);
3536 }
3537 
3538 /// Set packed 32-bit integers with the supplied values in reverse order.
3539 __m128i _mm_setr_epi32 (int e3, int e2, int e1, int e0) pure @trusted
3540 {
3541     // Performs better than = void; with GDC
3542     pragma(inline, true);
3543     align(16) int[4] result = [e3, e2, e1, e0];
3544     return *cast(__m128i*)(result.ptr);
3545 }
3546 unittest
3547 {
3548     int4 A = cast(int4) _mm_setr_epi32(-1, 0, -2147483648, 2147483647);
3549     int[4] correct = [-1, 0, -2147483648, 2147483647];
3550     assert(A.array == correct);
3551 }
3552 
3553 /// Set packed 64-bit integers with the supplied values in reverse order.
3554 __m128i _mm_setr_epi64 (long e1, long e0) pure @trusted
3555 {
3556     long2 r = void;
3557     r.ptr[0] = e1;
3558     r.ptr[1] = e0;
3559     return cast(__m128i)(r);
3560 }
3561 unittest
3562 {
3563     long2 A = cast(long2) _mm_setr_epi64(-1, 0);
3564     long[2] correct = [-1, 0];
3565     assert(A.array == correct);
3566 }
3567 
3568 /// Set packed 8-bit integers with the supplied values in reverse order.
3569 __m128i _mm_setr_epi8 (byte e15, byte e14, byte e13, byte e12,
3570                        byte e11, byte e10, byte e9,  byte e8,
3571                        byte e7,  byte e6,  byte e5,  byte e4,
3572                        byte e3,  byte e2,  byte e1,  byte e0) pure @trusted
3573 {
3574     align(16) byte[16] result = [e15, e14, e13, e12, e11, e10, e9, e8,
3575                                  e7,  e6,  e5,  e4,  e3,  e2, e1, e0];
3576     return *cast(__m128i*)(result.ptr);
3577 }
3578 unittest
3579 {
3580     byte16 R = cast(byte16) _mm_setr_epi8(-1, 0, 56, 127, -128, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14);
3581     byte[16] correct = [-1, 0, 56, 127, -128, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14];
3582     assert(R.array == correct);
3583 }
3584 
3585 /// Set packed double-precision (64-bit) floating-point elements with the supplied values in reverse order.
3586 __m128d _mm_setr_pd (double e1, double e0) pure @trusted
3587 {
3588     pragma(inline, true);
3589     double2 result;
3590     result.ptr[0] = e1;
3591     result.ptr[1] = e0;
3592     return result;
3593 }
3594 unittest
3595 {
3596     __m128d A = _mm_setr_pd(61.0, 55.0);
3597     double[2] correct = [61.0, 55.0];
3598     assert(A.array == correct);
3599 }
3600 
3601 /// Return vector of type `__m128d` with all elements set to zero.
3602 __m128d _mm_setzero_pd() pure @trusted
3603 {
3604     pragma(inline, true);
3605     double2 r = void;
3606     r.ptr[0] = 0.0;
3607     r.ptr[1] = 0.0;
3608     return r;
3609 }
3610 unittest
3611 {
3612     __m128d A = _mm_setzero_pd();
3613     double[2] correct = [0.0, 0.0];
3614     assert(A.array == correct);
3615 }
3616 
3617 /// Return vector of type `__m128i` with all elements set to zero.
3618 __m128i _mm_setzero_si128() pure @trusted
3619 {
3620     pragma(inline, true);
3621     int4 r = void;
3622     r.ptr[0] = 0;
3623     r.ptr[1] = 0;
3624     r.ptr[2] = 0;
3625     r.ptr[3] = 0;
3626     return r;
3627 }
3628 unittest
3629 {
3630     __m128i A = _mm_setzero_si128();
3631     int[4] correct = [0, 0, 0, 0];
3632     assert(A.array == correct);
3633 }
3634 
3635 /// Shuffle 32-bit integers in `a` using the control in `imm8`.
3636 /// See_also: `_MM_SHUFFLE`.
3637 __m128i _mm_shuffle_epi32(int imm8)(__m128i a) pure @trusted
3638 {
3639     // PERF DMD D_SIMD
3640     static if (GDC_with_SSE2)
3641     {
3642         return __builtin_ia32_pshufd(a, imm8);
3643     }
3644     else static if (LDC_with_optimizations)
3645     {
3646         return shufflevectorLDC!(int4, (imm8 >> 0) & 3,
3647                                  (imm8 >> 2) & 3,
3648                                  (imm8 >> 4) & 3,
3649                                  (imm8 >> 6) & 3)(a, a);
3650     }
3651     else
3652     {
3653         int4 r = void;
3654         r.ptr[0] = a.ptr[(imm8 >> 0) & 3];
3655         r.ptr[1] = a.ptr[(imm8 >> 2) & 3];
3656         r.ptr[2] = a.ptr[(imm8 >> 4) & 3];
3657         r.ptr[3] = a.ptr[(imm8 >> 6) & 3];
3658         return r;
3659     }
3660 }
3661 unittest
3662 {
3663     __m128i A = _mm_setr_epi32(0, 1, 2, 3);
3664     enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
3665     int4 B = cast(int4) _mm_shuffle_epi32!SHUFFLE(A);
3666     int[4] expectedB = [ 3, 2, 1, 0 ];
3667     assert(B.array == expectedB);
3668 }
3669 
3670 /// Shuffle double-precision (64-bit) floating-point elements using the control in `imm8`.
3671 /// See_also: `_MM_SHUFFLE2`.
3672 __m128d _mm_shuffle_pd (int imm8)(__m128d a, __m128d b) pure @trusted
3673 {
3674     // PERF DMD D_SIMD
3675     static if (GDC_with_SSE2)
3676     {
3677         return __builtin_ia32_shufpd(a, b, imm8);
3678     }
3679     else version(LDC)
3680     {
3681         return shufflevectorLDC!(double2, 0 + ( imm8 & 1 ),
3682                                  2 + ( (imm8 >> 1) & 1 ))(a, b);
3683     }
3684     else
3685     {
3686         double2 r = void;
3687         r.ptr[0] = a.array[imm8 & 1];
3688         r.ptr[1] = b.array[(imm8 >> 1) & 1];
3689         return r;
3690     }
3691 }
3692 unittest
3693 {
3694     __m128d A = _mm_setr_pd(0.5, 2.0);
3695     __m128d B = _mm_setr_pd(4.0, 5.0);
3696     enum int SHUFFLE = _MM_SHUFFLE2(1, 1);
3697     __m128d R = _mm_shuffle_pd!SHUFFLE(A, B);
3698     double[2] correct = [ 2.0, 5.0 ];
3699     assert(R.array == correct);
3700 }
3701 
3702 /// Shuffle 16-bit integers in the high 64 bits of `a` using the control in `imm8`. Store the results in the high 
3703 /// 64 bits of result, with the low 64 bits being copied from `a` to result.
3704 /// See_also: `_MM_SHUFFLE`.
3705 __m128i _mm_shufflehi_epi16(int imm8)(__m128i a) pure @trusted
3706 {
3707     static if (DMD_with_DSIMD)
3708     {
3709         return cast(__m128i) __simd(XMM.PSHUFHW, a, a, cast(ubyte)imm8);
3710     }
3711     else static if (GDC_with_SSE2)
3712     {
3713         return cast(__m128i) __builtin_ia32_pshufhw(cast(short8)a, imm8);
3714     }
3715     else static if (LDC_with_optimizations)
3716     {
3717         return cast(__m128i) shufflevectorLDC!(short8, 0, 1, 2, 3,
3718                                           4 + ( (imm8 >> 0) & 3 ),
3719                                           4 + ( (imm8 >> 2) & 3 ),
3720                                           4 + ( (imm8 >> 4) & 3 ),
3721                                           4 + ( (imm8 >> 6) & 3 ))(cast(short8)a, cast(short8)a);
3722     }
3723     else
3724     {
3725         short8 r = cast(short8)a;
3726         short8 sa = cast(short8)a;
3727         r.ptr[4] = sa.array[4 + ( (imm8 >> 0) & 3 ) ];
3728         r.ptr[5] = sa.array[4 + ( (imm8 >> 2) & 3 ) ];
3729         r.ptr[6] = sa.array[4 + ( (imm8 >> 4) & 3 ) ];
3730         r.ptr[7] = sa.array[4 + ( (imm8 >> 6) & 3 ) ];
3731         return cast(__m128i) r;
3732     }
3733 }
3734 unittest
3735 {
3736     __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3737     enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
3738     short8 C = cast(short8) _mm_shufflehi_epi16!SHUFFLE(A);
3739     short[8] expectedC = [ 0, 1, 2, 3, 7, 6, 5, 4 ];
3740     assert(C.array == expectedC);
3741 }
3742 
3743 /// Shuffle 16-bit integers in the low 64 bits of `a` using the control in `imm8`. Store the results in the low 64 
3744 /// bits of result, with the high 64 bits being copied from `a` to result.
3745 /// See_also: `_MM_SHUFFLE`.
3746 __m128i _mm_shufflelo_epi16(int imm8)(__m128i a) pure @trusted
3747 {
3748     static if (DMD_with_DSIMD)
3749     {
3750         return cast(__m128i) __simd(XMM.PSHUFLW, a, a, cast(ubyte)imm8);
3751     }
3752     else static if (GDC_with_SSE2)
3753     {
3754         return cast(__m128i) __builtin_ia32_pshuflw(cast(short8)a, imm8);
3755     }
3756     else static if (LDC_with_optimizations)
3757     {
3758         return cast(__m128i) shufflevectorLDC!(short8, ( (imm8 >> 0) & 3 ),
3759                                                        ( (imm8 >> 2) & 3 ),
3760                                                        ( (imm8 >> 4) & 3 ),
3761                                                        ( (imm8 >> 6) & 3 ), 4, 5, 6, 7)(cast(short8)a, cast(short8)a);
3762     }
3763     else
3764     {
3765         short8 r = cast(short8)a;
3766         short8 sa = cast(short8)a;
3767         r.ptr[0] = sa.array[(imm8 >> 0) & 3];
3768         r.ptr[1] = sa.array[(imm8 >> 2) & 3];
3769         r.ptr[2] = sa.array[(imm8 >> 4) & 3];
3770         r.ptr[3] = sa.array[(imm8 >> 6) & 3];
3771         return cast(__m128i) r;
3772     }
3773 }
3774 unittest
3775 {
3776     __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3777     enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
3778     short8 B = cast(short8) _mm_shufflelo_epi16!SHUFFLE(A);
3779     short[8] expectedB = [ 3, 2, 1, 0, 4, 5, 6, 7 ];
3780     assert(B.array == expectedB);
3781 }
3782 
3783 /// Shift packed 32-bit integers in `a` left by `count` while shifting in zeros.
3784 deprecated("Use _mm_slli_epi32 instead.") __m128i _mm_sll_epi32 (__m128i a, __m128i count) pure @trusted
3785 {
3786     static if (LDC_with_SSE2)
3787     {
3788         return __builtin_ia32_pslld128(a, count);
3789     }
3790     else static if (GDC_with_SSE2)
3791     {
3792         return __builtin_ia32_pslld128(a, count);
3793     }
3794     else static if (DMD_with_32bit_asm)
3795     {
3796         asm pure nothrow @nogc @trusted
3797         {
3798             movdqu XMM0, a;
3799             movdqu XMM1, count;
3800             pslld XMM0, XMM1;
3801             movdqu a, XMM0;
3802         }
3803         return a;
3804     }
3805     else
3806     {
3807         int4 r = void;
3808         long2 lc = cast(long2)count;
3809         int bits = cast(int)(lc.array[0]);
3810         foreach(i; 0..4)
3811             r[i] = cast(uint)(a[i]) << bits;
3812         return r;
3813     }
3814 }
3815 
3816 /// Shift packed 64-bit integers in `a` left by `count` while shifting in zeros.
3817 deprecated("Use _mm_slli_epi64 instead.") __m128i _mm_sll_epi64 (__m128i a, __m128i count) pure @trusted
3818 {
3819     static if (LDC_with_SSE2)
3820     {
3821         return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count);
3822     }
3823     else static if (GDC_with_SSE2)
3824     {
3825         return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count);
3826     }
3827     else static if (DMD_with_32bit_asm)
3828     {
3829         asm pure nothrow @nogc @trusted
3830         {
3831             movdqu XMM0, a;
3832             movdqu XMM1, count;
3833             psllq XMM0, XMM1;
3834             movdqu a, XMM0;
3835         }
3836         return a;
3837     }
3838     else
3839     {
3840         // ARM: good since LDC 1.12 -O2
3841         // but the -O0 version is catastrophic
3842         long2 r = void;
3843         long2 sa = cast(long2)a;
3844         long2 lc = cast(long2)count;
3845         int bits = cast(int)(lc.array[0]);
3846         foreach(i; 0..2)
3847             r.array[i] = cast(ulong)(sa.array[i]) << bits;
3848         return cast(__m128i)r;
3849     }
3850 }
3851 
3852 /// Shift packed 16-bit integers in `a` left by `count` while shifting in zeros.
3853 deprecated("Use _mm_slli_epi16 instead.") __m128i _mm_sll_epi16 (__m128i a, __m128i count) pure @trusted
3854 {
3855     static if (GDC_or_LDC_with_SSE2)
3856     {
3857         return cast(__m128i)__builtin_ia32_psllw128(cast(short8)a, cast(short8)count);
3858     }
3859     else static if (DMD_with_32bit_asm)
3860     {
3861         asm pure nothrow @nogc @trusted
3862         {
3863             movdqu XMM0, a;
3864             movdqu XMM1, count;
3865             psllw XMM0, XMM1;
3866             movdqu a, XMM0;
3867         }
3868         return a;
3869     }
3870     else
3871     {
3872         short8 sa = cast(short8)a;
3873         long2 lc = cast(long2)count;
3874         int bits = cast(int)(lc.array[0]);
3875         short8 r = void;
3876         foreach(i; 0..8)
3877             r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) << bits);
3878         return cast(int4)r;
3879     }
3880 }
3881 
3882 
3883 /// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros.
3884 __m128i _mm_slli_epi32 (__m128i a, int imm8) pure @trusted
3885 {
3886     static if (GDC_with_SSE2)
3887     {
3888         return __builtin_ia32_pslldi128(a, cast(ubyte)imm8);
3889     }
3890     else static if (LDC_with_SSE2)
3891     {
3892         return __builtin_ia32_pslldi128(a, cast(ubyte)imm8);
3893     }
3894     else
3895     {
3896         // Note: the intrinsics guarantee imm8[0..7] is taken, however
3897         //       D says "It's illegal to shift by the same or more bits 
3898         //       than the size of the quantity being shifted"
3899         //       so an oversized shift would be UB; hence the explicit check below.
3900         int4 r = _mm_setzero_si128();
3901 
3902         ubyte count = cast(ubyte) imm8;
3903         if (count > 31)
3904             return r;
3905         
3906         foreach(i; 0..4)
3907             r.array[i] = cast(uint)(a.array[i]) << count;
3908         return r;
3909     }
3910 }
3911 unittest
3912 {
3913     __m128i A = _mm_setr_epi32(0, 2, 3, -4);
3914     __m128i B = _mm_slli_epi32(A, 1);
3915     __m128i B2 = _mm_slli_epi32(A, 1 + 256);
3916     int[4] expectedB = [ 0, 4, 6, -8];
3917     assert(B.array == expectedB);
3918     assert(B2.array == expectedB);
3919 
3920     __m128i C = _mm_slli_epi32(A, 0);
3921     int[4] expectedC = [ 0, 2, 3, -4];
3922     assert(C.array == expectedC);
3923 
3924     __m128i D = _mm_slli_epi32(A, 65);
3925     int[4] expectedD = [ 0, 0, 0, 0];
3926     assert(D.array == expectedD);
3927 }
3928 
3929 /// Shift packed 64-bit integers in `a` left by `imm8` while shifting in zeros.
3930 __m128i _mm_slli_epi64 (__m128i a, int imm8) pure @trusted
3931 {
3932     static if (GDC_with_SSE2)
3933     {
3934         return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8);
3935     }
3936     else static if (LDC_with_SSE2)
3937     {
3938         return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8);
3939     }
3940     else
3941     {
3942         long2 sa = cast(long2)a;
3943 
3944         // Note: the intrinsics guarantee imm8[0..7] is taken, however
3945         //       D says "It's illegal to shift by the same or more bits 
3946         //       than the size of the quantity being shifted"
3947         //       so an oversized shift would be UB; hence the explicit check below.
3948         long2 r = cast(long2) _mm_setzero_si128();
3949         ubyte count = cast(ubyte) imm8;
3950         if (count > 63)
3951             return cast(__m128i)r;
3952 
3953         r.ptr[0] = cast(ulong)(sa.array[0]) << count;
3954         r.ptr[1] = cast(ulong)(sa.array[1]) << count;
3955         return cast(__m128i)r;
3956     }
3957 }
3958 unittest
3959 {
3960     __m128i A = _mm_setr_epi64(8, -4);
3961     long2 B = cast(long2) _mm_slli_epi64(A, 1);
3962     long2 B2 = cast(long2) _mm_slli_epi64(A, 1 + 1024);
3963     long[2] expectedB = [ 16, -8];
3964     assert(B.array == expectedB);
3965     assert(B2.array == expectedB);
3966 
3967     long2 C = cast(long2) _mm_slli_epi64(A, 0);
3968     long[2] expectedC = [ 8, -4];
3969     assert(C.array == expectedC);
3970 
3971     long2 D = cast(long2) _mm_slli_epi64(A, 64);
3972     long[2] expectedD = [ 0, -0];
3973     assert(D.array == expectedD);
3974 }
3975 
3976 /// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros.
3977 __m128i _mm_slli_epi16(__m128i a, int imm8) pure @trusted
3978 {
3979     static if (GDC_with_SSE2)
3980     {
3981         return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8);
3982     }
3983     else static if (LDC_with_SSE2)
3984     {
3985         return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8);
3986     }
3987     else static if (LDC_with_ARM64)
3988     {
3989         short8 sa = cast(short8)a;
3990         short8 r = cast(short8)_mm_setzero_si128();
3991         ubyte count = cast(ubyte) imm8;
3992         if (count > 15)
3993             return cast(__m128i)r;
3994         r = sa << short8(count);
3995         return cast(__m128i)r;
3996     }
3997     else
3998     {
3999         short8 sa = cast(short8)a;
4000         short8 r = cast(short8)_mm_setzero_si128();
4001         ubyte count = cast(ubyte) imm8;
4002         if (count > 15)
4003             return cast(__m128i)r;
4004         foreach(i; 0..8)
4005             r.ptr[i] = cast(short)(sa.array[i] << count);
4006         return cast(__m128i)r;
4007     }
4008 }
4009 unittest
4010 {
4011     __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
4012     short8 B = cast(short8)( _mm_slli_epi16(A, 1) );
4013     short8 B2 = cast(short8)( _mm_slli_epi16(A, 1 + 256) );
4014     short[8] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14 ];
4015     assert(B.array == expectedB);
4016     assert(B2.array == expectedB);
4017 
4018     short8 C = cast(short8)( _mm_slli_epi16(A, 16) );
4019     short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0 ];
4020     assert(C.array == expectedC);
4021 }
4022 
4023 
4024 /// Shift `a` left by `bytes` bytes while shifting in zeros.
4025 __m128i _mm_slli_si128(ubyte bytes)(__m128i op) pure @trusted
4026 {
4027     static if (bytes & 0xF0)
4028     {
4029         return _mm_setzero_si128();
4030     }
4031     else static if (DMD_with_DSIMD)
4032     {
4033         return cast(__m128i) __simd_ib(XMM.PSLLDQ, op, bytes);
4034     }
4035     else static if (GDC_with_SSE2)
4036     {
4037         pragma(inline, true); // else it doesn't seem to be inlined at all by GDC; TODO: check _mm_srli_si128 as well
4038         return cast(__m128i) __builtin_ia32_pslldqi128(cast(long2)op, cast(ubyte)(bytes * 8)); 
4039     }
4040     else static if (LDC_with_optimizations)
4041     {
4042         return cast(__m128i) shufflevectorLDC!(byte16,
4043                                                16 - bytes, 17 - bytes, 18 - bytes, 19 - bytes, 20 - bytes, 21 - bytes,
4044                                                22 - bytes, 23 - bytes, 24 - bytes, 25 - bytes, 26 - bytes, 27 - bytes,
4045                                                28 - bytes, 29 - bytes, 30 - bytes, 31 - bytes)
4046                                                (cast(byte16)_mm_setzero_si128(), cast(byte16)op);
4047     }
4048     else static if (DMD_with_32bit_asm)
4049     {
4050         asm pure nothrow @nogc @trusted // somehow doesn't work for x86_64
4051         {
4052             movdqu XMM0, op;
4053             pslldq XMM0, bytes;
4054             movdqu op, XMM0;
4055         }
4056         return op;
4057     }
4058     else
4059     {
4060         byte16 A = cast(byte16)op;
4061         byte16 R = void;
4062         for (int n = 15; n >= bytes; --n)
4063             R.ptr[n] = A.array[n-bytes];
4064         for (int n = bytes-1; n >= 0; --n)
4065             R.ptr[n] = 0;
4066         return cast(__m128i)R;
4067     }
4068 }
4069 unittest
4070 {
4071     __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4072     short8 R = cast(short8) _mm_slli_si128!8(A); // shift 8 bytes to the left
4073     short[8] correct = [ 0, 0, 0, 0, 0, 1, 2, 3 ];
4074     assert(R.array == correct);
4075 
4076     __m128i B = _mm_slli_si128!16(_mm_set1_epi32(-1));
4077     int[4] expectedB = [0, 0, 0, 0];
4078     assert(B.array == expectedB);
4079 }
4080 
4081 /// Compute the square root of packed double-precision (64-bit) floating-point elements in `vec`.
4082 __m128d _mm_sqrt_pd(__m128d vec) pure @trusted
4083 {
4084     version(LDC)
4085     {
4086         // Disappeared with LDC 1.11
4087         static if (__VERSION__ < 2081)
4088             return __builtin_ia32_sqrtpd(vec);
4089         else
4090         {
4091             // PERF: use llvm_sqrt on the vector
4092             vec.array[0] = llvm_sqrt(vec.array[0]); 
4093             vec.array[1] = llvm_sqrt(vec.array[1]);
4094             return vec;
4095         }
4096     }
4097     else static if (GDC_with_SSE2)    
4098     {
4099         return __builtin_ia32_sqrtpd(vec);
4100     }
4101     else
4102     {
4103         vec.ptr[0] = sqrt(vec.array[0]);
4104         vec.ptr[1] = sqrt(vec.array[1]);
4105         return vec;
4106     }
4107 }
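// Added example (not in the upstream source): exact perfect squares keep the check deterministic.
unittest
{
    __m128d A = _mm_setr_pd(4.0, 16.0);
    __m128d R = _mm_sqrt_pd(A);
    double[2] correct = [2.0, 4.0];
    assert(R.array == correct);
}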
4108 
4109 /// Compute the square root of the lower double-precision (64-bit) floating-point element in `b`, store the result in 
4110 /// the lower element of result, and copy the upper element from `a` to the upper element of result.
4111 __m128d _mm_sqrt_sd(__m128d a, __m128d b) pure @trusted
4112 {
4113     // Note: the builtin has one argument, since the legacy `sqrtsd` SSE2 instruction operates on the same register only.
4114     //       "128-bit Legacy SSE version: The first source operand and the destination operand are the same. 
4115     //        The quadword at bits 127:64 of the destination operand remains unchanged."
4116     version(LDC)
4117     {
4118         // Disappeared with LDC 1.11
4119         static if (__VERSION__ < 2081)
4120         {
4121             __m128d c = __builtin_ia32_sqrtsd(b);
4122             a[0] = c[0];
4123             return a;
4124         }
4125         else
4126         {
4127             a.array[0] = llvm_sqrt(b.array[0]);
4128             return a;
4129         }
4130     }
4131     else static if (GDC_with_SSE2)
4132     {
4133         __m128d c = __builtin_ia32_sqrtsd(b);
4134         a.ptr[0] = c.array[0];
4135         return a;
4136     }
4137     else
4138     {
4139         a.ptr[0] = sqrt(b.array[0]);
4140         return a;
4141     }
4142 }
4143 unittest
4144 {
4145     __m128d A = _mm_setr_pd(1.0, 3.0);
4146     __m128d B = _mm_setr_pd(4.0, 5.0);
4147     __m128d R = _mm_sqrt_sd(A, B);
4148     double[2] correct = [2.0, 3.0 ];
4149     assert(R.array == correct);
4150 }
4151 
4152 /// Shift packed 16-bit integers in `a` right by `count` while shifting in sign bits.
4153 deprecated("Use _mm_srai_epi16 instead.") __m128i _mm_sra_epi16 (__m128i a, __m128i count) pure @trusted
4154 {
4155     static if (GDC_with_SSE2)
4156     {
4157         return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count);
4158     }
4159     else static if (LDC_with_SSE2)
4160     {
4161         return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count);
4162     }
4163     else
4164     {
4165         short8 sa = cast(short8)a;
4166         long2 lc = cast(long2)count;
4167         int bits = cast(int)(lc.array[0]);
4168         short8 r = void;
4169         foreach(i; 0..8)
4170             r.ptr[i] = cast(short)(sa.array[i] >> bits);
4171         return cast(int4)r;
4172     }
4173 }
4174 
4175 /// Shift packed 32-bit integers in `a` right by `count` while shifting in sign bits.
4176 deprecated("Use _mm_srai_epi32 instead.") __m128i _mm_sra_epi32 (__m128i a, __m128i count) pure @trusted
4177 {
4178     static if (LDC_with_SSE2)
4179     {
4180         return __builtin_ia32_psrad128(a, count);
4181     }
4182     else static if (GDC_with_SSE2)
4183     {
4184         return __builtin_ia32_psrad128(a, count);
4185     }
4186     else
4187     {    
4188         int4 r = void;
4189         long2 lc = cast(long2)count;
4190         int bits = cast(int)(lc.array[0]);
4191         r.ptr[0] = (a.array[0] >> bits);
4192         r.ptr[1] = (a.array[1] >> bits);
4193         r.ptr[2] = (a.array[2] >> bits);
4194         r.ptr[3] = (a.array[3] >> bits);
4195         return r;
4196     }
4197 }
4198 
4199 
4200 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits.
4201 __m128i _mm_srai_epi16 (__m128i a, int imm8) pure @trusted
4202 {
4203     static if (GDC_with_SSE2)
4204     {
4205         return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8);
4206     }
4207     else static if (LDC_with_SSE2)
4208     {
4209         return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8);
4210     }
4211     else static if (LDC_with_ARM64)
4212     {
4213         short8 sa = cast(short8)a;
4214         ubyte count = cast(ubyte)imm8;
4215         if (count > 15) 
4216             count = 15;
4217         short8 r = sa >> short8(count);
4218         return cast(__m128i)r;
4219     }
4220     else
4221     {
4222         short8 sa = cast(short8)a;
4223         short8 r = void;
4224 
4225         // Note: the intrinsics guarantee imm8[0..7] is taken, however
4226         //       D says "It's illegal to shift by the same or more bits 
4227         //       than the size of the quantity being shifted"
4228         //       so an oversized shift would be UB; hence the explicit check below.
4229         ubyte count = cast(ubyte)imm8;
4230         if (count > 15) 
4231             count = 15;
4232         foreach(i; 0..8)
4233             r.ptr[i] = cast(short)(sa.array[i] >> count);
4234         return cast(int4)r;
4235     }
4236 }
4237 unittest
4238 {
4239     __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
4240     short8 B = cast(short8)( _mm_srai_epi16(A, 1) );
4241     short8 B2 = cast(short8)( _mm_srai_epi16(A, 1 + 256) );
4242     short[8] expectedB = [ 0, 0, 1, 1, -2, -3, 3, 3 ];
4243     assert(B.array == expectedB);
4244     assert(B2.array == expectedB);
4245 
4246     short8 C = cast(short8)( _mm_srai_epi16(A, 18) );
4247     short[8] expectedC = [ 0, 0, 0, 0, -1, -1, 0, 0 ];
4248     assert(C.array == expectedC);
4249 }
4250 
4251 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits.
4252 __m128i _mm_srai_epi32 (__m128i a, int imm8) pure @trusted
4253 {
4254     static if (LDC_with_SSE2)
4255     {
4256         return __builtin_ia32_psradi128(a, cast(ubyte)imm8);
4257     }
4258     else static if (GDC_with_SSE2)
4259     {
4260         return __builtin_ia32_psradi128(a, cast(ubyte)imm8);
4261     }
4262     else
4263     {
4264         int4 r = void;
4265 
4266         // Note: the intrinsics guarantee imm8[0..7] is taken, however
4267         //       D says "It's illegal to shift by the same or more bits 
4268         //       than the size of the quantity being shifted"
4269         //       so an oversized shift would be UB; hence the explicit check below.
4270         // See Issue: #56
4271         ubyte count = cast(ubyte) imm8;
4272         if (count > 31)
4273             count = 31;
4274 
4275         r.ptr[0] = (a.array[0] >> count);
4276         r.ptr[1] = (a.array[1] >> count);
4277         r.ptr[2] = (a.array[2] >> count);
4278         r.ptr[3] = (a.array[3] >> count);
4279         return r;
4280     }
4281 }
4282 unittest
4283 {
4284     __m128i A = _mm_setr_epi32(0, 2, 3, -4);
4285     __m128i B = _mm_srai_epi32(A, 1);
4286     __m128i B2 = _mm_srai_epi32(A, 1 + 256);
4287     int[4] expectedB = [ 0, 1, 1, -2];
4288     assert(B.array == expectedB);
4289     assert(B2.array == expectedB);
4290 
4291     __m128i C = _mm_srai_epi32(A, 32);
4292     int[4] expectedC = [ 0, 0, 0, -1];
4293     assert(C.array == expectedC);
4294 
4295     __m128i D = _mm_srai_epi32(A, 0);
4296     int[4] expectedD = [ 0, 2, 3, -4];
4297     assert(D.array == expectedD);
4298 }
4299 
4300 deprecated("Use _mm_srli_epi16 instead.") __m128i _mm_srl_epi16 (__m128i a, __m128i count) pure @trusted
4301 {
4302     static if (LDC_with_SSE2)
4303     {
4304         return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count);
4305     }
4306     else static if (GDC_with_SSE2)
4307     {
4308         return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count);
4309     }
4310     else
4311     {
4312         short8 sa = cast(short8)a;
4313         long2 lc = cast(long2)count;
4314         int bits = cast(int)(lc.array[0]);
4315         short8 r = void;
4316         foreach(i; 0..8)
4317             r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) >> bits);
4318         return cast(int4)r;
4319     }
4320 }
4321 
4322 deprecated("Use _mm_srli_epi32 instead.") __m128i _mm_srl_epi32 (__m128i a, __m128i count) pure @trusted
4323 {
4324     static if (LDC_with_SSE2)
4325     {
4326         return __builtin_ia32_psrld128(a, count);
4327     }
4328     else static if (GDC_with_SSE2)
4329     {
4330         return __builtin_ia32_psrld128(a, count);
4331     }
4332     else
4333     {
4334         int4 r = void;
4335         long2 lc = cast(long2)count;
4336         int bits = cast(int)(lc.array[0]);
4337         r.ptr[0] = cast(uint)(a.array[0]) >> bits;
4338         r.ptr[1] = cast(uint)(a.array[1]) >> bits;
4339         r.ptr[2] = cast(uint)(a.array[2]) >> bits;
4340         r.ptr[3] = cast(uint)(a.array[3]) >> bits;
4341         return r;
4342     }
4343 }
4344 
4345 deprecated("Use _mm_srli_epi64 instead.") __m128i _mm_srl_epi64 (__m128i a, __m128i count) pure @trusted
4346 {
4347     static if (LDC_with_SSE2)
4348     {
4349         return cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count);
4350     }
4351     else static if (GDC_with_SSE2)
4352     {
4353         return cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count);
4354     }
4355     else
4356     {
4357         // Workaround for https://issues.dlang.org/show_bug.cgi?id=23047
4358         // => avoid void initialization.
4359         long2 r;
4360         long2 sa = cast(long2)a;
4361         long2 lc = cast(long2)count;
4362         int bits = cast(int)(lc.array[0]);
4363         r.ptr[0] = cast(ulong)(sa.array[0]) >> bits;
4364         r.ptr[1] = cast(ulong)(sa.array[1]) >> bits;
4365         return cast(__m128i)r;
4366     }
4367 }
4368 
4369 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros.
4370 __m128i _mm_srli_epi16 (__m128i a, int imm8) pure @trusted
4371 {
4372     static if (GDC_with_SSE2)
4373     {
4374         return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8);
4375     }
4376     else static if (LDC_with_SSE2)
4377     {
4378         return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8);
4379     }
4380     else static if (LDC_with_ARM64)
4381     {
4382         short8 sa = cast(short8)a;
4383         short8 r = cast(short8) _mm_setzero_si128();
4384 
4385         ubyte count = cast(ubyte)imm8;
4386         if (count >= 16)
4387             return cast(__m128i)r;
4388 
4389         r = sa >>> short8(count); // This facility is offered by LDC, but not DMD.
4390         return cast(__m128i)r;
4391     }
4392     else
4393     {
4394         short8 sa = cast(short8)a;
4395         ubyte count = cast(ubyte)imm8;
4396 
4397         short8 r = cast(short8) _mm_setzero_si128();
4398         if (count >= 16)
4399             return cast(__m128i)r;
4400 
4401         foreach(i; 0..8)
4402             r.array[i] = cast(short)(cast(ushort)(sa.array[i]) >> count);
4403         return cast(__m128i)r;
4404     }
4405 }
4406 unittest
4407 {
4408     __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
4409     short8 B = cast(short8)( _mm_srli_epi16(A, 1) );
4410     short8 B2 = cast(short8)( _mm_srli_epi16(A, 1 + 256) );
4411     short[8] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ];
4412     assert(B.array == expectedB);
4413     assert(B2.array == expectedB);
4414 
4415     short8 C = cast(short8)( _mm_srli_epi16(A, 16) );
4416     short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0];
4417     assert(C.array == expectedC);
4418 
4419     short8 D = cast(short8)( _mm_srli_epi16(A, 0) );
4420     short[8] expectedD = [ 0, 1, 2, 3, -4, -5, 6, 7 ];
4421     assert(D.array == expectedD);
4422 }
4423 
4424 
4425 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros.
4426 __m128i _mm_srli_epi32 (__m128i a, int imm8) pure @trusted
4427 {
4428     static if (GDC_with_SSE2)
4429     {
4430         return __builtin_ia32_psrldi128(a, cast(ubyte)imm8);
4431     }
4432     else static if (LDC_with_SSE2)
4433     {
4434         return __builtin_ia32_psrldi128(a, cast(ubyte)imm8);
4435     }
4436     else
4437     {
4438         ubyte count = cast(ubyte) imm8;
4439 
4440         // Note: the intrinsics guarantee imm8[0..7] is taken, however
4441         //       D says "It's illegal to shift by the same or more bits 
4442         //       than the size of the quantity being shifted"
4443         //       so an oversized shift would be UB; hence the explicit check below.
4444         int4 r = _mm_setzero_si128();
4445         if (count >= 32)
4446             return r;
4447         r.ptr[0] = a.array[0] >>> count;
4448         r.ptr[1] = a.array[1] >>> count;
4449         r.ptr[2] = a.array[2] >>> count;
4450         r.ptr[3] = a.array[3] >>> count;
4451         return r;
4452     }
4453 }
4454 unittest
4455 {
4456     __m128i A = _mm_setr_epi32(0, 2, 3, -4);
4457     __m128i B = _mm_srli_epi32(A, 1);
4458     __m128i B2 = _mm_srli_epi32(A, 1 + 256);
4459     int[4] expectedB = [ 0, 1, 1, 0x7FFFFFFE];
4460     assert(B.array == expectedB);
4461     assert(B2.array == expectedB);
4462  
4463     __m128i C = _mm_srli_epi32(A, 255);
4464     int[4] expectedC = [ 0, 0, 0, 0 ];
4465     assert(C.array == expectedC);
4466 }
4467 
4468 /// Shift packed 64-bit integers in `a` right by `imm8` while shifting in zeros.
4469 __m128i _mm_srli_epi64 (__m128i a, int imm8) pure @trusted
4470 {
4471     // PERF DMD
4472     static if (GDC_with_SSE2)
4473     {
4474         return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8);
4475     }
4476     else static if (LDC_with_SSE2)
4477     {
4478         return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8);
4479     }
4480     else
4481     {
4482         long2 r = cast(long2) _mm_setzero_si128();
4483         long2 sa = cast(long2)a;
4484 
4485         ubyte count = cast(ubyte) imm8;
4486         if (count >= 64)
4487             return cast(__m128i)r;
4488 
4489         r.ptr[0] = sa.array[0] >>> count;
4490         r.ptr[1] = sa.array[1] >>> count;
4491         return cast(__m128i)r;
4492     }
4493 }
4494 unittest
4495 {
4496     __m128i A = _mm_setr_epi64(8, -4);
4497     long2 B = cast(long2) _mm_srli_epi64(A, 1);
4498     long2 B2 = cast(long2) _mm_srli_epi64(A, 1 + 512);
4499     long[2] expectedB = [ 4, 0x7FFFFFFFFFFFFFFE];
4500     assert(B.array == expectedB);
4501     assert(B2.array == expectedB);
4502 
4503     long2 C = cast(long2) _mm_srli_epi64(A, 64);
4504     long[2] expectedC = [ 0, 0 ];
4505     assert(C.array == expectedC);
4506 }
4507 
4508 /// Shift `v` right by `bytes` bytes while shifting in zeros.
4509 __m128i _mm_srli_si128(ubyte bytes)(__m128i v) pure @trusted
4510 {
4511     static if (bytes & 0xF0)
4512     {
4513         return _mm_setzero_si128();
4514     }
4515     else static if (DMD_with_DSIMD)
4516     {
4517         return cast(__m128i) __simd_ib(XMM.PSRLDQ, v, bytes);
4518     }
4519     else static if (GDC_with_SSE2)
4520     {
4521         return cast(__m128i) __builtin_ia32_psrldqi128(cast(long2)v, cast(ubyte)(bytes * 8));
4522     }
4523     else static if (DMD_with_32bit_asm)
4524     {
4525         asm pure nothrow @nogc @trusted
4526         {
4527             movdqu XMM0, v;
4528             psrldq XMM0, bytes;
4529             movdqu v, XMM0;
4530         }
4531         return v;
4532     }
4533     else static if (LDC_with_optimizations)
4534     {
4535         return cast(__m128i) shufflevectorLDC!(byte16,
4536                                                bytes+0, bytes+1, bytes+2, bytes+3, bytes+4, bytes+5, bytes+6, bytes+7,
4537                                                bytes+8, bytes+9, bytes+10, bytes+11, bytes+12, bytes+13, bytes+14, bytes+15)
4538                                                (cast(byte16) v, cast(byte16)_mm_setzero_si128());
4539     }
4540     else
4541     {
4542         byte16 A = cast(byte16)v;
4543         byte16 R = void;
4544         for (int n = 0; n < bytes; ++n)
4545             R.ptr[15-n] = 0;
4546         for (int n = bytes; n < 16; ++n)
4547             R.ptr[15-n] = A.array[15 - n + bytes];
4548         return cast(__m128i)R;
4549     }
4550 }
4551 unittest
4552 {
4553     __m128i R = _mm_srli_si128!4(_mm_set_epi32(4, 3, -2, 1));
4554     int[4] correct = [-2, 3, 4, 0];
4555     assert(R.array == correct);
4556 
4557     __m128i A = _mm_srli_si128!16(_mm_set1_epi32(-1));
4558     int[4] expectedA = [0, 0, 0, 0];
4559     assert(A.array == expectedA);
4560 }
4561 
4562 /// Shift `v` right by `bytes` bytes while shifting in zeros.
4563 /// #BONUS
4564 __m128 _mm_srli_ps(ubyte bytes)(__m128 v) pure @safe
4565 {
4566     return cast(__m128)_mm_srli_si128!bytes(cast(__m128i)v);
4567 }
4568 unittest
4569 {
4570     __m128 R = _mm_srli_ps!8(_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f));
4571     float[4] correct = [3.0f, 4.0f, 0, 0];
4572     assert(R.array == correct);
4573 }
4574 
4575 /// Shift `v` right by `bytes` bytes while shifting in zeros.
4576 /// #BONUS
4577 __m128d _mm_srli_pd(ubyte bytes)(__m128d v) pure @safe
4578 {
4579     return cast(__m128d) _mm_srli_si128!bytes(cast(__m128i)v);
4580 }
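// Added example (not in the upstream source): shifting by 8 bytes moves the high lane into the low lane.
unittest
{
    __m128d R = _mm_srli_pd!8(_mm_setr_pd(2.0, 3.0));
    double[2] correct = [3.0, 0.0];
    assert(R.array == correct);
}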
4581 
4582 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from `a` into memory. 
4583 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
4584 void _mm_store_pd (double* mem_addr, __m128d a) pure @trusted
4585 {
4586     pragma(inline, true);
4587     __m128d* aligned = cast(__m128d*)mem_addr;
4588     *aligned = a;
4589 }
4590 unittest
4591 {
4592     align(16) double[2] A;
4593     __m128d B = _mm_setr_pd(-8.0, 9.0);
4594     _mm_store_pd(A.ptr, B);
4595     assert(A == [-8.0, 9.0]);
4596 }
4597 
4598 /// Store the lower double-precision (64-bit) floating-point element from `a` into 2 contiguous elements in memory. 
4599 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
4600 void _mm_store_pd1 (double* mem_addr, __m128d a) pure @trusted
4601 {
4602     __m128d* aligned = cast(__m128d*)mem_addr;
4603     __m128d r; // PERF =void;
4604     r.ptr[0] = a.array[0];
4605     r.ptr[1] = a.array[0];
4606     *aligned = r;
4607 }
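// Added example (not in the upstream source): the low element is duplicated into both memory slots.
unittest
{
    align(16) double[2] A;
    _mm_store_pd1(A.ptr, _mm_setr_pd(5.5, -1.0));
    assert(A == [5.5, 5.5]);
}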
4608 
4609 /// Store the lower double-precision (64-bit) floating-point element from `a` into memory. `mem_addr` does not need to 
4610 /// be aligned on any particular boundary.
4611 void _mm_store_sd (double* mem_addr, __m128d a) pure @safe
4612 {
4613     pragma(inline, true);
4614     *mem_addr = a.array[0];
4615 }
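// Added example (not in the upstream source): only the first double is written.
unittest
{
    double[2] A = [1.0, 2.0];
    _mm_store_sd(A.ptr, _mm_setr_pd(61.0, 55.0));
    assert(A == [61.0, 2.0]);
}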
4616 
4617 /// Store 128-bits of integer data from `a` into memory. `mem_addr` must be aligned on a 16-byte boundary or a 
4618 /// general-protection exception may be generated.
4619 void _mm_store_si128 (__m128i* mem_addr, __m128i a) pure @safe
4620 {
4621     pragma(inline, true);
4622     *mem_addr = a;
4623 }
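// Added example (not in the upstream source), mirroring the `_mm_stream_si128` unittest below.
unittest
{
    align(16) int[4] A;
    _mm_store_si128(cast(__m128i*)(A.ptr), _mm_setr_epi32(-8, 9, 10, -11));
    assert(A == [-8, 9, 10, -11]);
}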
4624 
4625 alias _mm_store1_pd = _mm_store_pd1; ///
4626 
4627 /// Store the upper double-precision (64-bit) floating-point element from `a` into memory.
4628 void _mm_storeh_pd (double* mem_addr, __m128d a) pure @safe
4629 {
4630     pragma(inline, true);
4631     *mem_addr = a.array[1];
4632 }
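// Added example (not in the upstream source): the upper lane of `a` is stored.
unittest
{
    double A = 13.0;
    _mm_storeh_pd(&A, _mm_setr_pd(-8.0, 9.0));
    assert(A == 9.0);
}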
4633 
4634 /// Store 64-bit integer from the first element of `a` into memory. Note: `mem_addr` doesn't have to
4635 /// actually be aligned, which breaks user expectations. This problem also exists in C++.
4636 void _mm_storel_epi64 (__m128i* mem_addr, __m128i a) pure @safe
4637 {
4638     pragma(inline, true);
4639     long* dest = cast(long*)mem_addr;
4640     long2 la = cast(long2)a;
4641     *dest = la.array[0];
4642 }
4643 unittest
4644 {
4645     long[3] A = [1, 2, 3];
4646     _mm_storel_epi64(cast(__m128i*)(&A[1]), _mm_set_epi64x(0x1_0000_0000, 0x1_0000_0000));
4647     long[3] correct = [1, 0x1_0000_0000, 3];
4648     assert(A == correct);
4649 }
4650 
4651 /// Store the lower double-precision (64-bit) floating-point element from `a` into memory.
4652 void _mm_storel_pd (double* mem_addr, __m128d a) pure @safe
4653 {
4654     pragma(inline, true);
4655     *mem_addr = a.array[0];
4656 }
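// Added example (not in the upstream source): the lower lane of `a` is stored.
unittest
{
    double A = 13.0;
    _mm_storel_pd(&A, _mm_setr_pd(-8.0, 9.0));
    assert(A == -8.0);
}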
4657 
4658 /// Store 2 double-precision (64-bit) floating-point elements from `a` into memory in reverse 
4659 /// order. `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception 
4660 /// may be generated.
4661 void _mm_storer_pd (double* mem_addr, __m128d a) pure @system
4662 {
4663     __m128d reversed = void;
4664     reversed.ptr[0] = a.array[1];
4665     reversed.ptr[1] = a.array[0];
4666     *cast(__m128d*)mem_addr = reversed;
4667 }
4668 unittest
4669 {
4670     align(16) double[2] A = [0.0, 1.0];
4671     _mm_storer_pd(A.ptr, _mm_setr_pd(2.0, 3.0));
4672     assert(A[0] == 3.0 && A[1] == 2.0);
4673 }
4674 
4675 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from 
4676 /// `a` into memory. `mem_addr` does not need to be aligned on any particular boundary.
4677 void _mm_storeu_pd (double* mem_addr, __m128d a) pure @trusted // TODO: signature is wrong, should be @system
4678 {
4679     // PERF DMD
4680     pragma(inline, true);
4681     static if (GDC_with_SSE2)
4682     {
4683         __builtin_ia32_storeupd(mem_addr, a);
4684     }
4685     else static if (LDC_with_optimizations)
4686     {
4687         storeUnaligned!double2(a, mem_addr);
4688     }
4689     else
4690     {
4691         mem_addr[0] = a.array[0];
4692         mem_addr[1] = a.array[1];
4693     }
4694 }
4695 unittest
4696 {
4697     __m128d A = _mm_setr_pd(3.0, 4.0);
4698     align(16) double[4] R = [0.0, 0, 0, 0];
4699     double[2] correct = [3.0, 4.0];
4700     _mm_storeu_pd(&R[1], A);
4701     assert(R[1..3] == correct);
4702 }
4703 
4704 /// Store 128-bits of integer data from `a` into memory. `mem_addr` does not need to be aligned on any particular 
4705 /// boundary.
4706 void _mm_storeu_si128 (__m128i* mem_addr, __m128i a) pure @trusted // TODO: signature is wrong, mem_addr is not aligned. Make it @system
4707 {
4708     // PERF: DMD
4709     pragma(inline, true);
4710     static if (GDC_with_SSE2)
4711     {
4712         __builtin_ia32_storedqu(cast(char*)mem_addr, cast(ubyte16)a);
4713     }
4714     else static if (LDC_with_optimizations)
4715     {
4716         storeUnaligned!__m128i(a, cast(int*)mem_addr);
4717     }
4718     else
4719     {
4720         int* p = cast(int*)mem_addr;
4721         p[0] = a.array[0];
4722         p[1] = a.array[1];
4723         p[2] = a.array[2];
4724         p[3] = a.array[3];
4725     }
4726 }
4727 unittest
4728 {
4729     __m128i A = _mm_setr_epi32(1, 2, 3, 4);
4730     align(16) int[6] R = [0, 0, 0, 0, 0, 0];
4731     int[4] correct = [1, 2, 3, 4];
4732     _mm_storeu_si128(cast(__m128i*)(&R[1]), A);
4733     assert(R[1..5] == correct);
4734 }
4735 
4736 /// Store 16-bit integer from the first element of `a` into memory. 
4737 /// `mem_addr` does not need to be aligned on any particular boundary.
4738 void _mm_storeu_si16 (void* mem_addr, __m128i a) pure @system
4739 {
4740     short* dest = cast(short*)mem_addr;
4741     *dest = (cast(short8)a).array[0];
4742 }
4743 unittest
4744 {
4745     short[2] arr = [-24, 12];
4746     _mm_storeu_si16(&arr[1], _mm_set1_epi16(26));
4747     short[2] correct = [-24, 26];
4748     assert(arr == correct);
4749 }
4750 
4751 /// Store 32-bit integer from the first element of `a` into memory. 
4752 /// `mem_addr` does not need to be aligned on any particular boundary.
4753 void _mm_storeu_si32 (void* mem_addr, __m128i a) pure @trusted // TODO: should really be @system
4754 {
4755     pragma(inline, true);
4756     int* dest = cast(int*)mem_addr;
4757     *dest = a.array[0];
4758 }
4759 unittest
4760 {
4761     int[2] arr = [-24, 12];
4762     _mm_storeu_si32(&arr[1], _mm_setr_epi32(-1, -2, -6, -7));
4763     assert(arr == [-24, -1]);
4764 }
4765 
4766 /// Store 64-bit integer from the first element of `a` into memory. 
4767 /// `mem_addr` does not need to be aligned on any particular boundary.
4768 void _mm_storeu_si64 (void* mem_addr, __m128i a) pure @system
4769 {
4770     pragma(inline, true);
4771     long* dest = cast(long*)mem_addr;
4772     long2 la = cast(long2)a;
4773     *dest = la.array[0];
4774 }
4775 unittest
4776 {
4777     long[3] A = [1, 2, 3];
4778     _mm_storeu_si64(&A[1], _mm_set_epi64x(0x1_0000_0000, 0x1_0000_0000));
4779     long[3] correct = [1, 0x1_0000_0000, 3];
4780     assert(A == correct);
4781 }
4782 
4783 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements)
4784 /// from `a` into memory using a non-temporal memory hint. `mem_addr` must be aligned on a 16-byte
4785 /// boundary or a general-protection exception may be generated.
4786 /// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
4787 void _mm_stream_pd (double* mem_addr, __m128d a) pure @system
4788 {
4789     // PERF DMD D_SIMD
4790     static if (GDC_with_SSE2)
4791     {
4792         return __builtin_ia32_movntpd(mem_addr, a); 
4793     }
4794     else static if (LDC_with_InlineIREx && LDC_with_optimizations)
4795     {
4796         enum prefix = `!0 = !{ i32 1 }`;
4797         enum ir = `
4798             store <2 x double> %1, <2 x double>* %0, align 16, !nontemporal !0
4799             ret void`;
4800         LDCInlineIREx!(prefix, ir, "", void, double2*, double2)(cast(double2*)mem_addr, a);
4801     }
4802     else
4803     {
4804         // Regular store instead.
4805         __m128d* dest = cast(__m128d*)mem_addr;
4806         *dest = a;
4807     }
4808 }
4809 unittest
4810 {
4811     align(16) double[2] A;
4812     __m128d B = _mm_setr_pd(-8.0, 9.0);
4813     _mm_stream_pd(A.ptr, B);
4814     assert(A == [-8.0, 9.0]);
4815 }
4816 
4817 /// Store 128-bits of integer data from a into memory using a non-temporal memory hint.
4818 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception
4819 /// may be generated.
4820 /// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
4821 void _mm_stream_si128 (__m128i* mem_addr, __m128i a) pure @trusted
4822 {
4823     // PERF DMD D_SIMD
4824     static if (GDC_with_SSE2)
4825     {
4826         return __builtin_ia32_movntdq (cast(long2*)mem_addr, cast(long2)a); 
4827     }
4828     else static if (LDC_with_InlineIREx && LDC_with_optimizations)
4829     {
4830         enum prefix = `!0 = !{ i32 1 }`;
4831         enum ir = `
4832             store <4 x i32> %1, <4 x i32>* %0, align 16, !nontemporal !0
4833             ret void`;
4834         LDCInlineIREx!(prefix, ir, "", void, int4*, int4)(cast(int4*)mem_addr, a);
4835     }
4836     else
4837     {
4838         // Regular store instead.
4839         __m128i* dest = cast(__m128i*)mem_addr;
4840         *dest = a;
4841     }
4842 }
4843 unittest
4844 {
4845     align(16) int[4] A;
4846     __m128i B = _mm_setr_epi32(-8, 9, 10, -11);
4847     _mm_stream_si128(cast(__m128i*)A.ptr, B);
4848     assert(A == [-8, 9, 10, -11]);
4849 }
4850 
4851 /// Store 32-bit integer `a` into memory using a non-temporal hint to minimize cache
4852 /// pollution. If the cache line containing address `mem_addr` is already in the cache,
4853 /// the cache will be updated.
4854 /// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
4855 void _mm_stream_si32 (int* mem_addr, int a) pure @trusted
4856 {
4857     // PERF DMD D_SIMD
4858     static if (GDC_with_SSE2)
4859     {
4860         return __builtin_ia32_movnti(mem_addr, a);
4861     }
4862     else static if (LDC_with_InlineIREx && LDC_with_optimizations)
4863     {
4864         enum prefix = `!0 = !{ i32 1 }`;
4865         enum ir = `
4866             store i32 %1, i32* %0, !nontemporal !0
4867             ret void`;
4868         LDCInlineIREx!(prefix, ir, "", void, int*, int)(mem_addr, a);
4869     }
4870     else
4871     {
4872         // Regular store instead.
4873         *mem_addr = a;
4874     }
4875 }
4876 unittest
4877 {
4878     int A;
4879     _mm_stream_si32(&A, -34);
4880     assert(A == -34);
4881 }
4882 
4883 /// Store 64-bit integer `a` into memory using a non-temporal hint to minimize
4884 /// cache pollution. If the cache line containing address `mem_addr` is already
4885 /// in the cache, the cache will be updated.
4886 /// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
4887 void _mm_stream_si64 (long* mem_addr, long a) pure @trusted
4888 {
4889     // PERF DMD D_SIMD
4890     static if (GDC_with_SSE2)
4891     {
4892         return __builtin_ia32_movnti64(mem_addr, a);
4893     }
4894     else static if (LDC_with_InlineIREx && LDC_with_optimizations)
4895     {
4896         enum prefix = `!0 = !{ i32 1 }`;
4897         enum ir = `
4898             store i64 %1, i64* %0, !nontemporal !0
4899             ret void`;
4900         LDCInlineIREx!(prefix, ir, "", void, long*, long)(mem_addr, a);
4901 
4902     }
4903     else
4904     {
4905         // Regular store instead.
4906         *mem_addr = a;
4907     }
4908 }
4909 unittest
4910 {
4911     long A;
4912     _mm_stream_si64(&A, -46);
4913     assert(A == -46);
4914 }
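
// Usage sketch, not part of the original module: as noted above, non-temporal stores
// are typically followed by `_mm_sfence()` before the written memory is handed to
// another thread. A minimal single-threaded illustration, using only intrinsics
// provided by this package:
unittest
{
    align(16) int[8] buffer;
    __m128i v = _mm_set1_epi32(42);
    _mm_stream_si128(cast(__m128i*) &buffer[0], v);
    _mm_stream_si128(cast(__m128i*) &buffer[4], v);
    _mm_sfence(); // drain the streaming stores before "publishing" buffer
    assert(buffer[0] == 42 && buffer[7] == 42);
}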
4915 
4916 /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`.
4917 __m128i _mm_sub_epi16(__m128i a, __m128i b) pure @safe
4918 {
4919     pragma(inline, true);
4920     return cast(__m128i)(cast(short8)a - cast(short8)b);
4921 }
4922 unittest
4923 {
4924     __m128i A = _mm_setr_epi16(16,  32767, 1, 2,    3, 4, 6, 6);
4925     __m128i B = _mm_setr_epi16(15, -32768, 6, 8, 1000, 1, 5, 6);
4926     short8 C = cast(short8) _mm_sub_epi16(A, B);
4927     short[8] correct =        [ 1,     -1,-5,-6, -997, 3, 1, 0];
4928     assert(C.array == correct);
4929 }
4930 
4931 /// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
4932 __m128i _mm_sub_epi32(__m128i a, __m128i b) pure @safe
4933 {
4934     pragma(inline, true);
4935     return cast(__m128i)(cast(int4)a - cast(int4)b);
4936 }
4937 unittest
4938 {
4939     __m128i A = _mm_setr_epi32(16, int.max, 1, 8);
4940     __m128i B = _mm_setr_epi32(15, int.min, 6, 2);
4941     int4 C = cast(int4) _mm_sub_epi32(A, B);
4942     int[4] correct =          [ 1,      -1,-5, 6];
4943     assert(C.array == correct);
4944 }
4945 
4946 /// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`.
4947 __m128i _mm_sub_epi64(__m128i a, __m128i b) pure @safe
4948 {
4949     pragma(inline, true);
4950     return cast(__m128i)(cast(long2)a - cast(long2)b);
4951 }
4952 unittest
4953 {
4954     __m128i A = _mm_setr_epi64(  16, long.max);
4955     __m128i B = _mm_setr_epi64( 199, long.min);
4956     long2 C = cast(long2) _mm_sub_epi64(A, B);
4957     long[2] correct =         [-183,       -1];
4958     assert(C.array == correct);
4959 }
4960 
4961 /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`.
4962 __m128i _mm_sub_epi8(__m128i a, __m128i b) pure @safe
4963 {
4964     pragma(inline, true);
4965     return cast(__m128i)(cast(byte16)a - cast(byte16)b);
4966 }
4967 unittest
4968 {
4969     __m128i A = _mm_setr_epi8(16,  127, 1, 2, 3, 4, 6, 6, 16,  127, 1, 2, 3, 4, 6, 6);
4970     __m128i B = _mm_setr_epi8(15, -128, 6, 8, 3, 1, 5, 6, 16,  127, 1, 2, 3, 4, 6, 6);
4971     byte16 C = cast(byte16) _mm_sub_epi8(A, B);
4972     byte[16] correct =       [ 1,   -1,-5,-6, 0, 3, 1, 0,  0,    0, 0, 0, 0, 0, 0, 0];
4973     assert(C.array == correct);
4974 }
4975 
4976 /// Subtract packed double-precision (64-bit) floating-point elements in `b` from packed double-precision (64-bit) 
4977 /// floating-point elements in `a`.
4978 __m128d _mm_sub_pd(__m128d a, __m128d b) pure @safe
4979 {
4980     pragma(inline, true);
4981     return a - b;
4982 }
4983 unittest
4984 {
4985     __m128d A = _mm_setr_pd(4000.0, -8.0);
4986     __m128d B = _mm_setr_pd(12.0, -8450.0);
4987     __m128d C = _mm_sub_pd(A, B);
4988     double[2] correct =     [3988.0, 8442.0];
4989     assert(C.array == correct);
4990 }
4991 
4992 /// Subtract the lower double-precision (64-bit) floating-point element in `b` from the lower double-precision (64-bit) 
4993 /// floating-point element in `a`, store that in the lower element of result, and copy the upper element from `a` to the
4994 /// upper element of result.
4995 __m128d _mm_sub_sd(__m128d a, __m128d b) pure @trusted
4996 {
4997     version(DigitalMars)
4998     {
4999         // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
5000         // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
5001         asm pure nothrow @nogc @trusted { nop;}
5002         a[0] = a[0] - b[0];
5003         return a;
5004     }
5005     else static if (GDC_with_SSE2)
5006     {
5007         return __builtin_ia32_subsd(a, b);
5008     }
5009     else
5010     {
5011         a.ptr[0] -= b.array[0];
5012         return a;
5013     }
5014 }
5015 unittest
5016 {
5017     __m128d a = [1.5, -2.0];
5018     a = _mm_sub_sd(a, a);
5019     assert(a.array == [0.0, -2.0]);
5020 }
5021 
5022 /// Subtract 64-bit integer `b` from 64-bit integer `a`.
5023 __m64 _mm_sub_si64 (__m64 a, __m64 b) pure @safe
5024 {
5025     pragma(inline, true);
5026     return a - b;
5027 }
5028 unittest
5029 {
5030     __m64 A, B;
5031     A = -1214;
5032     B = 489415;
5033     __m64 C = _mm_sub_si64(B, A);
5034     assert(C.array[0] == 489415 + 1214);
5035 }
5036 
5037 /// Subtract packed signed 16-bit integers in `b` from packed 16-bit integers in `a` using
5038 /// saturation.
5039 __m128i _mm_subs_epi16(__m128i a, __m128i b) pure @trusted
5040 {
5041     // PERF DMD psubsw
5042     static if(LDC_with_saturated_intrinsics)
5043     {
5044         return cast(__m128i) inteli_llvm_subs!short8(cast(short8)a, cast(short8)b);
5045     }
5046     else static if (GDC_with_SSE2)
5047     {
5048         return cast(__m128i) __builtin_ia32_psubsw128(cast(short8) a, cast(short8) b);
5049     }
5050     else
5051     {
5052         short[8] res; // PERF =void;
5053         short8 sa = cast(short8)a;
5054         short8 sb = cast(short8)b;
5055         foreach(i; 0..8)
5056             res.ptr[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
5057         return _mm_loadu_si128(cast(int4*)res.ptr);
5058     }
5059 }
5060 unittest
5061 {
5062     short8 res = cast(short8) _mm_subs_epi16(_mm_setr_epi16(32760, -32760, 5, 4, 3, 2, 1, 0),
5063                                              _mm_setr_epi16(-10  ,     16, 5, 4, 3, 2, 1, 0));
5064     static immutable short[8] correctResult =              [32767, -32768, 0, 0, 0, 0, 0, 0];
5065     assert(res.array == correctResult);
5066 }
5067 
5068 /// Subtract packed signed 8-bit integers in `b` from packed 8-bit integers in `a` using
5069 /// saturation.
5070 __m128i _mm_subs_epi8(__m128i a, __m128i b) pure @trusted
5071 {
5072     static if(LDC_with_saturated_intrinsics)
5073     {
5074         return cast(__m128i) inteli_llvm_subs!byte16(cast(byte16)a, cast(byte16)b);
5075     }
5076     else static if (GDC_with_SSE2)
5077     {
5078         return cast(__m128i) __builtin_ia32_psubsb128(cast(ubyte16) a, cast(ubyte16) b);
5079     }
5080     else
5081     {
5082         byte[16] res; // PERF =void;
5083         byte16 sa = cast(byte16)a;
5084         byte16 sb = cast(byte16)b;
5085         foreach(i; 0..16)
5086             res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
5087         return _mm_loadu_si128(cast(int4*)res.ptr);
5088     }
5089 }
5090 unittest
5091 {
5092     byte16 res = cast(byte16) _mm_subs_epi8(_mm_setr_epi8(-128, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
5093                                             _mm_setr_epi8(  15, -14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
5094     static immutable byte[16] correctResult            = [-128, 127,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
5095     assert(res.array == correctResult);
5096 }
5097 
5098 /// Subtract packed unsigned 16-bit integers in `b` from packed unsigned 16-bit integers in `a` using saturation.
5099 __m128i _mm_subs_epu16(__m128i a, __m128i b) pure @trusted
5100 {
5101     static if(LDC_with_saturated_intrinsics)
5102     {
5103         return cast(__m128i) inteli_llvm_subus!short8(cast(short8)a, cast(short8)b);
5104     }
5105     else static if (GDC_with_SSE2)
5106     {
5107         return cast(__m128i) __builtin_ia32_psubusw128(cast(short8)a, cast(short8)b);
5108     }
5109     else
5110     {
5111         short[8] res; // PERF =void;
5112         short8 sa = cast(short8)a;
5113         short8 sb = cast(short8)b;
5114         foreach(i; 0..8)
5115         {
5116             int sum = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]);
5117             res[i] = saturateSignedIntToUnsignedShort(sum);
5118         }
5119         return _mm_loadu_si128(cast(int4*)res.ptr);
5120     }
5121 }
5122 unittest
5123 {
5124     short8 R = cast(short8) _mm_subs_epu16(_mm_setr_epi16(cast(short)65534,  1, 5, 4, 3, 2, 1, 0),
5125                                            _mm_setr_epi16(cast(short)65535, 16, 4, 4, 3, 0, 1, 0));
5126     static immutable short[8] correct =                  [               0,  0, 1, 0, 0, 2, 0, 0];
5127     assert(R.array == correct);
5128 }
5129 
5130 /// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit integers in `a` using saturation.
5131 __m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted
5132 {
5133     static if(LDC_with_saturated_intrinsics)
5134     {
5135         return cast(__m128i) inteli_llvm_subus!byte16(cast(byte16)a, cast(byte16)b);
5136     }
5137     else static if (GDC_with_SSE2)
5138     {
5139         return cast(__m128i) __builtin_ia32_psubusb128(cast(ubyte16) a, cast(ubyte16) b);
5140     }
5141     else
5142     {
5143         ubyte[16] res; // PERF =void;
5144         byte16 sa = cast(byte16)a;
5145         byte16 sb = cast(byte16)b;
5146         foreach(i; 0..16)
5147             res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
5148         return _mm_loadu_si128(cast(int4*)res.ptr);
5149     }
5150 }
5151 unittest
5152 {
5153     byte16 res = cast(byte16) _mm_subs_epu8(_mm_setr_epi8(cast(byte)254, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
5154                                             _mm_setr_epi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
5155     static immutable byte[16] correctResult =            [            0,   7,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
5156     assert(res.array == correctResult);
5157 }
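
// Idiom sketch, not part of the original module: saturating unsigned subtraction in
// both directions, OR'd together, yields the per-byte absolute difference |a - b|,
// because whichever direction would go negative saturates to zero.
unittest
{
    __m128i x = _mm_set1_epi8(20);
    __m128i y = _mm_set1_epi8(23);
    byte16 absDiff = cast(byte16) _mm_or_si128(_mm_subs_epu8(x, y), _mm_subs_epu8(y, x));
    assert(absDiff.array[0] == 3 && absDiff.array[15] == 3);
}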
5158 
5159 // Note: the only difference between these intrinsics and their _mm_comi*_sd
5160 //       counterparts is the signalling behaviour on quiet NaNs. Aliasing them is
5161 //       technically incorrect, but cases where one would want to distinguish qNaN
5162 //       from sNaN and treat them differently on purpose seem extremely rare.
5163 alias _mm_ucomieq_sd = _mm_comieq_sd; ///
5164 alias _mm_ucomige_sd = _mm_comige_sd; ///
5165 alias _mm_ucomigt_sd = _mm_comigt_sd; ///
5166 alias _mm_ucomile_sd = _mm_comile_sd; ///
5167 alias _mm_ucomilt_sd = _mm_comilt_sd; ///
5168 alias _mm_ucomineq_sd = _mm_comineq_sd; ///
5169 
5170 /// Return vector of type `__m128d` with undefined elements.
5171 __m128d _mm_undefined_pd() pure @safe
5172 {
5173     pragma(inline, true);
5174     __m128d result = void;
5175     return result;
5176 }
5177 
5178 /// Return vector of type `__m128i` with undefined elements.
5179 __m128i _mm_undefined_si128() pure @safe
5180 {
5181     pragma(inline, true);
5182     __m128i result = void;
5183     return result;
5184 }
5185 
5186 /// Unpack and interleave 16-bit integers from the high half of `a` and `b`.
5187 __m128i _mm_unpackhi_epi16 (__m128i a, __m128i b) pure @trusted
5188 {
5189     static if (DMD_with_DSIMD)
5190     {
5191         return cast(__m128i) __simd(XMM.PUNPCKHWD, a, b);
5192     }
5193     else static if (GDC_with_SSE2)
5194     {
5195         return cast(__m128i) __builtin_ia32_punpckhwd128(cast(short8) a, cast(short8) b);
5196     }
5197     else static if (LDC_with_optimizations)
5198     {
5199         enum ir = `%r = shufflevector <8 x i16> %0, <8 x i16> %1, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
5200                    ret <8 x i16> %r`;
5201         return cast(__m128i) LDCInlineIR!(ir, short8, short8, short8)(cast(short8)a, cast(short8)b);
5202     }
5203     else static if (DMD_with_32bit_asm || LDC_with_x86_asm)
5204     {
5205         asm pure nothrow @nogc @trusted
5206         {
5207             movdqu XMM0, a;
5208             movdqu XMM1, b;
5209             punpckhwd XMM0, XMM1;
5210             movdqu a, XMM0;
5211         }
5212         return a;
5213     }   
5214     else
5215     {
5216         short8 r = void;
5217         short8 sa = cast(short8)a;
5218         short8 sb = cast(short8)b;
5219         r.ptr[0] = sa.array[4];
5220         r.ptr[1] = sb.array[4];
5221         r.ptr[2] = sa.array[5];
5222         r.ptr[3] = sb.array[5];
5223         r.ptr[4] = sa.array[6];
5224         r.ptr[5] = sb.array[6];
5225         r.ptr[6] = sa.array[7];
5226         r.ptr[7] = sb.array[7];
5227         return cast(__m128i)r;
5228     }
5229 }
5230 unittest
5231 {
5232     __m128i A = _mm_setr_epi16(4,   5,  6,  7,  8,  9, 10, 11);
5233     __m128i B = _mm_setr_epi16(12, 13, 14, 15, 16, 17, 18, 19);
5234     short8 C = cast(short8)(_mm_unpackhi_epi16(A, B));
5235     short[8] correct = [8, 16, 9, 17, 10, 18, 11, 19];
5236     assert(C.array == correct);
5237 }
5238 
5239 /// Unpack and interleave 32-bit integers from the high half of `a` and `b`.
5240 __m128i _mm_unpackhi_epi32 (__m128i a, __m128i b) pure @trusted
5241 {
5242     static if (DMD_with_DSIMD)
5243     {
5244         return cast(__m128i) __simd(XMM.PUNPCKHDQ, a, b);
5245     }
5246     else static if (GDC_with_SSE2)
5247     {
5248         return __builtin_ia32_punpckhdq128(a, b);
5249     }
5250     else static if (LDC_with_optimizations)
5251     {
5252         enum ir = `%r = shufflevector <4 x i32> %0, <4 x i32> %1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
5253                    ret <4 x i32> %r`;
5254         return LDCInlineIR!(ir, int4, int4, int4)(cast(int4)a, cast(int4)b);
5255     }
5256     else
5257     {
5258         __m128i r = void;
5259         r.ptr[0] = a.array[2];
5260         r.ptr[1] = b.array[2];
5261         r.ptr[2] = a.array[3];
5262         r.ptr[3] = b.array[3];
5263         return r;
5264     }
5265 }
5266 unittest
5267 {
5268     __m128i A = _mm_setr_epi32(1, 2, 3, 4);
5269     __m128i B = _mm_setr_epi32(5, 6, 7, 8);
5270     __m128i C = _mm_unpackhi_epi32(A, B);
5271     int[4] correct = [3, 7, 4, 8];
5272     assert(C.array == correct);
5273 }
5274 
5275 /// Unpack and interleave 64-bit integers from the high half of `a` and `b`.
5276 __m128i _mm_unpackhi_epi64 (__m128i a, __m128i b) pure @trusted
5277 {
5278     static if (GDC_with_SSE2)
5279     {
5280         return cast(__m128i) __builtin_ia32_punpckhqdq128(cast(long2) a, cast(long2) b);
5281     }
5282     else
5283     {
5284         __m128i r = cast(__m128i)b;
5285         r[0] = a[2];
5286         r[1] = a[3];
5287         return r; 
5288     }
5289 }
5290 unittest // Issue #36
5291 {
5292     __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333);
5293     __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555);
5294     long2 C = cast(long2)(_mm_unpackhi_epi64(A, B));
5295     long[2] correct = [0x33333333_33333333, 0x55555555_55555555];
5296     assert(C.array == correct);
5297 }
5298 
5299 /// Unpack and interleave 8-bit integers from the high half of `a` and `b`.
5300 __m128i _mm_unpackhi_epi8 (__m128i a, __m128i b) pure @trusted
5301 {
5302     static if (DMD_with_DSIMD)
5303     {
5304         return cast(__m128i) __simd(XMM.PUNPCKHBW, a, b);
5305     }
5306     else static if (GDC_with_SSE2)
5307     {
5308         return cast(__m128i) __builtin_ia32_punpckhbw128(cast(ubyte16)a, cast(ubyte16)b);
5309     }
5310     else static if (LDC_with_optimizations)
5311     {
5312         enum ir = `%r = shufflevector <16 x i8> %0, <16 x i8> %1, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
5313                    ret <16 x i8> %r`;
5314         return cast(__m128i)LDCInlineIR!(ir, byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
5315     }
5316     else static if (DMD_with_32bit_asm || LDC_with_x86_asm)
5317     {
5318         asm pure nothrow @nogc @trusted
5319         {
5320             movdqu XMM0, a;
5321             movdqu XMM1, b;
5322             punpckhbw XMM0, XMM1;
5323             movdqu a, XMM0;
5324         }
5325         return a;
5326     }
5327     else
5328     {
5329         byte16 r = void;
5330         byte16 ba = cast(byte16)a;
5331         byte16 bb = cast(byte16)b;
5332         r.ptr[0] = ba.array[8];
5333         r.ptr[1] = bb.array[8];
5334         r.ptr[2] = ba.array[9];
5335         r.ptr[3] = bb.array[9];
5336         r.ptr[4] = ba.array[10];
5337         r.ptr[5] = bb.array[10];
5338         r.ptr[6] = ba.array[11];
5339         r.ptr[7] = bb.array[11];
5340         r.ptr[8] = ba.array[12];
5341         r.ptr[9] = bb.array[12];
5342         r.ptr[10] = ba.array[13];
5343         r.ptr[11] = bb.array[13];
5344         r.ptr[12] = ba.array[14];
5345         r.ptr[13] = bb.array[14];
5346         r.ptr[14] = ba.array[15];
5347         r.ptr[15] = bb.array[15];
5348         return cast(__m128i)r;
5349     }
5350 }
5351 unittest
5352 {
5353     __m128i A = _mm_setr_epi8( 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15);
5354     __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
5355     byte16 C = cast(byte16) _mm_unpackhi_epi8(A, B);
5356     byte[16] correct = [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31];
5357     assert(C.array == correct);
5358 }
5359 
5360 /// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of `a` and `b`.
5361 __m128d _mm_unpackhi_pd (__m128d a, __m128d b) pure @trusted
5362 {
5363     // PERF DMD D_SIMD
5364     static if (GDC_with_SSE2)
5365     {
5366         return __builtin_ia32_unpckhpd(a, b);
5367     }
5368     else static if (LDC_with_optimizations)
5369     {
5370         enum ir = `%r = shufflevector <2 x double> %0, <2 x double> %1, <2 x i32> <i32 1, i32 3>
5371                    ret <2 x double> %r`;
5372         return LDCInlineIR!(ir, double2, double2, double2)(a, b);
5373     }
5374     else
5375     {
5376         double2 r = void;
5377         r.ptr[0] = a.array[1];
5378         r.ptr[1] = b.array[1];
5379         return r;
5380     }
5381 }
5382 unittest
5383 {
5384     __m128d A = _mm_setr_pd(4.0, 6.0);
5385     __m128d B = _mm_setr_pd(7.0, 9.0);
5386     __m128d C = _mm_unpackhi_pd(A, B);
5387     double[2] correct = [6.0, 9.0];
5388     assert(C.array == correct);
5389 }
5390 
5391 /// Unpack and interleave 16-bit integers from the low half of `a` and `b`.
5392 __m128i _mm_unpacklo_epi16 (__m128i a, __m128i b) pure @trusted
5393 {
5394     static if (DMD_with_DSIMD)
5395     {
5396         return cast(__m128i) __simd(XMM.PUNPCKLWD, a, b);
5397     }
5398     else static if (GDC_with_SSE2)
5399     {
5400         return cast(__m128i) __builtin_ia32_punpcklwd128(cast(short8) a, cast(short8) b);
5401     }
5402     else static if (LDC_with_optimizations)
5403     {
5404         enum ir = `%r = shufflevector <8 x i16> %0, <8 x i16> %1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
5405             ret <8 x i16> %r`;
5406         return cast(__m128i) LDCInlineIR!(ir, short8, short8, short8)(cast(short8)a, cast(short8)b);
5407     }
5408     else static if (DMD_with_32bit_asm || LDC_with_x86_asm)
5409     {
5410         asm pure nothrow @nogc @trusted
5411         {
5412             movdqu XMM0, a;
5413             movdqu XMM1, b;
5414             punpcklwd XMM0, XMM1;
5415             movdqu a, XMM0;
5416         }
5417         return a;
5418     }
5419     else
5420     {
5421         short8 r = void;
5422         short8 sa = cast(short8)a;
5423         short8 sb = cast(short8)b;
5424         r.ptr[0] = sa.array[0];
5425         r.ptr[1] = sb.array[0];
5426         r.ptr[2] = sa.array[1];
5427         r.ptr[3] = sb.array[1];
5428         r.ptr[4] = sa.array[2];
5429         r.ptr[5] = sb.array[2];
5430         r.ptr[6] = sa.array[3];
5431         r.ptr[7] = sb.array[3];
5432         return cast(__m128i)r;
5433     }
5434 }
5435 unittest
5436 {
5437     __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
5438     __m128i B = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
5439     short8 C = cast(short8) _mm_unpacklo_epi16(A, B);
5440     short[8] correct = [0, 8, 1, 9, 2, 10, 3, 11];
5441     assert(C.array == correct);
5442 }
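
// Idiom sketch, not part of the original module: interleaving a vector with itself
// and arithmetic-shifting each 32-bit lane right by 16 sign-extends the four low
// 16-bit integers to 32-bit, using only SSE2 intrinsics from this module.
unittest
{
    __m128i a = _mm_setr_epi16(-1, 2, -3, 4, 0, 0, 0, 0);
    int4 widened = cast(int4) _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16);
    int[4] correct = [-1, 2, -3, 4];
    assert(widened.array == correct);
}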
5443 
5444 /// Unpack and interleave 32-bit integers from the low half of `a` and `b`.
5445 __m128i _mm_unpacklo_epi32 (__m128i a, __m128i b) pure @trusted
5446 {
5447     static if (DMD_with_DSIMD)
5448     {
5449         return cast(__m128i) __simd(XMM.PUNPCKLDQ, a, b);
5450     }
5451     else static if (GDC_with_SSE2)
5452     {
5453         return __builtin_ia32_punpckldq128(a, b);
5454     }
5455     else static if (LDC_with_optimizations)
5456     {
5457         enum ir = `%r = shufflevector <4 x i32> %0, <4 x i32> %1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
5458             ret <4 x i32> %r`;
5459         return LDCInlineIR!(ir, int4, int4, int4)(cast(int4)a, cast(int4)b);
5460     }
5461     else
5462     {
5463         __m128i r;
5464         r.ptr[0] = a.array[0];
5465         r.ptr[1] = b.array[0];
5466         r.ptr[2] = a.array[1];
5467         r.ptr[3] = b.array[1];
5468         return r;
5469     }
5470 }
5471 unittest
5472 {
5473     __m128i A = _mm_setr_epi32(1, 2, 3, 4);
5474     __m128i B = _mm_setr_epi32(5, 6, 7, 8);
5475     __m128i C = _mm_unpacklo_epi32(A, B);
5476     int[4] correct = [1, 5, 2, 6];
5477     assert(C.array == correct);
5478 }
5479 
5480 /// Unpack and interleave 64-bit integers from the low half of `a` and `b`.
5481 __m128i _mm_unpacklo_epi64 (__m128i a, __m128i b) pure @trusted
5482 {
5483     static if (GDC_with_SSE2)
5484     {
5485         return cast(__m128i) __builtin_ia32_punpcklqdq128(cast(long2) a, cast(long2) b);
5486     }
5487     else
5488     {
5489         long2 lA = cast(long2)a;
5490         long2 lB = cast(long2)b;
5491         long2 R; // PERF =void;
5492         R.ptr[0] = lA.array[0];
5493         R.ptr[1] = lB.array[0];
5494         return cast(__m128i)R;
5495     }
5496 }
5497 unittest // Issue #36
5498 {
5499     __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333);
5500     __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555);
5501     long2 C = cast(long2)(_mm_unpacklo_epi64(A, B));
5502     long[2] correct = [0x22222222_22222222, 0x44444444_44444444];
5503     assert(C.array == correct);
5504 }
5505 
5506 /// Unpack and interleave 8-bit integers from the low half of `a` and `b`.
5507 __m128i _mm_unpacklo_epi8 (__m128i a, __m128i b) pure @trusted
5508 {
5509     static if (DMD_with_DSIMD)
5510     {
5511         return cast(__m128i) __simd(XMM.PUNPCKLBW, a, b);
5512     }
5513     else static if (GDC_with_SSE2)
5514     {
5515         return cast(__m128i) __builtin_ia32_punpcklbw128(cast(ubyte16) a, cast(ubyte16) b);
5516     }
5517     else static if (LDC_with_optimizations)
5518     {
5519         enum ir = `%r = shufflevector <16 x i8> %0, <16 x i8> %1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
5520             ret <16 x i8> %r`;
5521         return cast(__m128i)LDCInlineIR!(ir, byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
5522     }
5523     else static if (DMD_with_32bit_asm || LDC_with_x86_asm)
5524     {
5525         asm pure nothrow @nogc @trusted
5526         {
5527             movdqu XMM0, a;
5528             movdqu XMM1, b;
5529             punpcklbw XMM0, XMM1;
5530             movdqu a, XMM0;
5531         }
5532         return a;
5533     }
5534     else
5535     {
5536         byte16 r = void;
5537         byte16 ba = cast(byte16)a;
5538         byte16 bb = cast(byte16)b;
5539         r.ptr[0] = ba.array[0];
5540         r.ptr[1] = bb.array[0];
5541         r.ptr[2] = ba.array[1];
5542         r.ptr[3] = bb.array[1];
5543         r.ptr[4] = ba.array[2];
5544         r.ptr[5] = bb.array[2];
5545         r.ptr[6] = ba.array[3];
5546         r.ptr[7] = bb.array[3];
5547         r.ptr[8] = ba.array[4];
5548         r.ptr[9] = bb.array[4];
5549         r.ptr[10] = ba.array[5];
5550         r.ptr[11] = bb.array[5];
5551         r.ptr[12] = ba.array[6];
5552         r.ptr[13] = bb.array[6];
5553         r.ptr[14] = ba.array[7];
5554         r.ptr[15] = bb.array[7];
5555         return cast(__m128i)r;
5556     }
5557 }
5558 unittest
5559 {
5560     __m128i A = _mm_setr_epi8( 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15);
5561     __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
5562     byte16 C = cast(byte16) _mm_unpacklo_epi8(A, B);
5563     byte[16] correct = [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23];
5564     assert(C.array == correct);
5565 }
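
// Idiom sketch, not part of the original module: interleaving with a zero vector
// zero-extends unsigned 8-bit data to 16-bit; _mm_unpacklo_epi8 widens the low eight
// bytes and _mm_unpackhi_epi8 the high eight.
unittest
{
    __m128i bytes = _mm_setr_epi8(cast(byte)250, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
    short8 lo = cast(short8) _mm_unpacklo_epi8(bytes, _mm_setzero_si128());
    assert(lo.array[0] == 250 && lo.array[1] == 7);
}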
5566 
5567 /// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of `a` and `b`.
5568 __m128d _mm_unpacklo_pd (__m128d a, __m128d b) pure @trusted
5569 {
5570     // PERF DMD D_SIMD
5571     static if (GDC_with_SSE2)
5572     {
5573         return __builtin_ia32_unpcklpd(a, b);
5574     }
5575     else static if (LDC_with_optimizations)
5576     {
5577         enum ir = `%r = shufflevector <2 x double> %0, <2 x double> %1, <2 x i32> <i32 0, i32 2>
5578                    ret <2 x double> %r`;
5579         return LDCInlineIR!(ir, double2, double2, double2)(a, b);
5580     }
5581     else
5582     {
5583         double2 r = void;
5584         r.ptr[0] = a.array[0];
5585         r.ptr[1] = b.array[0];
5586         return r;
5587     }
5588 }
5589 unittest
5590 {
5591     __m128d A = _mm_setr_pd(4.0, 6.0);
5592     __m128d B = _mm_setr_pd(7.0, 9.0);
5593     __m128d C = _mm_unpacklo_pd(A, B);
5594     double[2] correct = [4.0, 7.0];
5595     assert(C.array == correct);
5596 }
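
// Usage sketch, not part of the original module: together, _mm_unpacklo_pd and
// _mm_unpackhi_pd transpose a 2x2 matrix of doubles held in two row vectors.
unittest
{
    __m128d row0 = _mm_setr_pd(1.0, 2.0);
    __m128d row1 = _mm_setr_pd(3.0, 4.0);
    __m128d col0 = _mm_unpacklo_pd(row0, row1);
    __m128d col1 = _mm_unpackhi_pd(row0, row1);
    assert(col0.array == [1.0, 3.0]);
    assert(col1.array == [2.0, 4.0]);
}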
5597 
5598 /// Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in `a` and `b`.
5599 __m128d _mm_xor_pd (__m128d a, __m128d b) pure @safe
5600 {
5601     return cast(__m128d)(cast(__m128i)a ^ cast(__m128i)b);
5602 }
5603 unittest
5604 {
5605     __m128d A = _mm_setr_pd(-4.0, 6.0);
5606     __m128d B = _mm_setr_pd(4.0, -6.0);
5607     long2 R = cast(long2) _mm_xor_pd(A, B);
5608     long[2] correct = [long.min, long.min];
5609     assert(R.array == correct);
5610 }
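
// Idiom sketch, not part of the original module: XORing with a vector of -0.0 flips
// only the sign bit of each lane, giving a branchless negation of packed doubles.
unittest
{
    __m128d v = _mm_setr_pd(2.5, -8.0);
    __m128d negated = _mm_xor_pd(v, _mm_set1_pd(-0.0));
    assert(negated.array == [-2.5, 8.0]);
}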
5611 
5612 /// Compute the bitwise XOR of 128 bits (representing integer data) in `a` and `b`.
5613 __m128i _mm_xor_si128 (__m128i a, __m128i b) pure @safe
5614 {
5615     return a ^ b;
5616 }
5617 unittest
5618 {
5619     __m128i A = _mm_setr_epi64(975394, 619809709);
5620     __m128i B = _mm_setr_epi64(-920275025, -6);
5621     long2 R = cast(long2) _mm_xor_si128(A, B);
5622     long[2] correct = [975394 ^ (-920275025L), 619809709L ^ -6];
5623     assert(R.array == correct);
5624 }
5625 
5626 unittest
5627 {
5628     float distance(float[4] a, float[4] b) nothrow @nogc
5629     {
5630         __m128 va = _mm_loadu_ps(a.ptr);
5631         __m128 vb = _mm_loadu_ps(b.ptr);
5632         __m128 diffSquared = _mm_sub_ps(va, vb);
5633         diffSquared = _mm_mul_ps(diffSquared, diffSquared);
5634         __m128 sum = _mm_add_ps(diffSquared, _mm_srli_ps!8(diffSquared));
5635         sum = _mm_add_ps(sum, _mm_srli_ps!4(sum));
5636         return _mm_cvtss_f32(_mm_sqrt_ss(sum));
5637     }
5638     assert(distance([0, 2, 0, 0], [0, 0, 0, 0]) == 2);
5639 }