The OpenD Programming Language

1 /**
2 * SSE4.1 intrinsics.
3 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE4_1
4 *
5 * Copyright: Guillaume Piolat 2021.
6 *            Johan Engelen 2021.
7 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
8 */
9 module inteli.smmintrin;
10 
11 // SSE4.1 instructions
12 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE4_1
13 // Note: this header will work whether you have SSE4.1 enabled or not.
14 // With LDC, use "dflags-ldc": ["-mattr=+sse4.1"] or equivalent to actively
15 // generate SSE4.1 instructions.
16 // With GDC, use "dflags-gdc": ["-msse4.1"] or equivalent to generate SSE4.1 instructions.
17 
18 public import inteli.types;
19 import inteli.internals;
20 
21 // smmintrin pulls in all previous instruction set intrinsics.
22 public import inteli.tmmintrin;
23 
24 nothrow @nogc:
25 
26 enum int _MM_FROUND_TO_NEAREST_INT = 0x00; /// SSE4.1 rounding modes
27 enum int _MM_FROUND_TO_NEG_INF     = 0x01; /// ditto
28 enum int _MM_FROUND_TO_POS_INF     = 0x02; /// ditto
29 enum int _MM_FROUND_TO_ZERO        = 0x03; /// ditto
30 enum int _MM_FROUND_CUR_DIRECTION  = 0x04; /// ditto
31 enum int _MM_FROUND_RAISE_EXC      = 0x00; /// ditto
32 enum int _MM_FROUND_NO_EXC         = 0x08; /// ditto
33 
34 enum int _MM_FROUND_NINT      = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT);
35 enum int _MM_FROUND_FLOOR     = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF);
36 enum int _MM_FROUND_CEIL      = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF);
37 enum int _MM_FROUND_TRUNC     = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO);
38 enum int _MM_FROUND_RINT      = (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION);
39 enum int _MM_FROUND_NEARBYINT = (_MM_FROUND_NO_EXC    | _MM_FROUND_CUR_DIRECTION);
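// Illustrative example: these composite flags are intended as the rounding argument of the
// `_mm_round_*` intrinsics defined later in this module (see `_mm_floor_pd`/`_mm_ceil_pd`, which
// use the same encodings). `_MM_FROUND_FLOOR` selects rounding toward negative infinity:
unittest
{
    __m128d v = _mm_setr_pd(1.7, -1.2);
    double2 r = _mm_round_pd!_MM_FROUND_FLOOR(v);
    double[2] expected = [1.0, -2.0];
    assert(r.array == expected);
}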
40 
41 /// Blend packed 16-bit integers from `a` and `b` using control mask `imm8`, and store the results.
42 // Note: changed signature, GDC needs a compile-time value for imm8.
43 __m128i _mm_blend_epi16(int imm8)(__m128i a, __m128i b) pure @trusted
44 {
45     // PERF DMD
46     static if (GDC_with_SSE41)
47     {
48         pragma(inline, true); // else wouldn't inline in _mm256_blend_epi16
49         return cast(__m128i) __builtin_ia32_pblendw128(cast(short8)a, cast(short8)b, imm8);
50     }
51     else 
52     {
53         // LDC x86: this generates pblendw since LDC 1.1 with -O2
54         short8 r;
55         short8 sa = cast(short8)a;
56         short8 sb = cast(short8)b;
57         for (int n = 0; n < 8; ++n)
58         {
59             r.ptr[n] = (imm8 & (1 << n)) ? sb.array[n] : sa.array[n];
60         }
61         return cast(__m128i)r;
62     }
63 }
64 unittest
65 {
66     __m128i A = _mm_setr_epi16(0, 1,  2,  3,  4,  5,  6,  7);
67     __m128i B = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
68     short8 C = cast(short8) _mm_blend_epi16!147(A, B); // 10010011
69     short[8] correct =        [8, 9,  2,  3, 12,  5,  6, 15];
70     assert(C.array == correct);
71 }
72 
73 
74 /// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using control mask `imm8`.
75 // Note: changed signature, GDC needs a compile-time value for `imm8`.
76 __m128d _mm_blend_pd(int imm8)(__m128d a, __m128d b) @trusted
77 {
78     static assert(imm8 >= 0 && imm8 < 4);
79     // PERF DMD
80     static if (GDC_with_SSE41)
81     {
82         return cast(double2) __builtin_ia32_blendpd(cast(double2)a, cast(double2)b, imm8);
83     }
84     else
85     {
86         // LDC x86: blendpd since LDC 1.1 -O2, uses blendps after LDC 1.12
87         double2 r;
88         for (int n = 0; n < 2; ++n)
89         {
90             r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n];
91         }
92         return cast(__m128d)r;
93     }
94 }
95 unittest
96 {
97     __m128d A = _mm_setr_pd(0, 1);
98     __m128d B = _mm_setr_pd(8, 9);
99     double2 C = _mm_blend_pd!2(A, B);
100     double[2] correct =    [0, 9];
101     assert(C.array == correct);
102 }
103 
104 
105 /// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using control 
106 /// mask `imm8`.
107 // Note: changed signature, GDC needs a compile-time value for imm8.
108 __m128 _mm_blend_ps(int imm8)(__m128 a, __m128 b) pure @trusted
109 {
110     // PERF DMD
111     static assert(imm8 >= 0 && imm8 < 16);
112     static if (GDC_with_SSE41)
113     {
114         return __builtin_ia32_blendps(a, b, imm8);
115     }
116     else version(LDC)
117     {
118         // LDC x86: generates blendps since LDC 1.1 -O2
119         //   arm64: pretty good, two instructions worst case
120         return shufflevectorLDC!(float4, (imm8 & 1) ? 4 : 0,
121                                          (imm8 & 2) ? 5 : 1,
122                                          (imm8 & 4) ? 6 : 2,
123                                          (imm8 & 8) ? 7 : 3)(a, b);
124     }
125     else
126     {
127         // PERF GDC without SSE4.1 is quite bad
128         __m128 r;
129         for (int n = 0; n < 4; ++n)
130         {
131             r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n];
132         }
133         return r;
134     }
135 }
136 unittest
137 {
138     __m128 A = _mm_setr_ps(0, 1,  2,  3);
139     __m128 B = _mm_setr_ps(8, 9, 10, 11);
140     float4 C = cast(float4) _mm_blend_ps!13(A, B); // 1101
141     float[4] correct =    [8, 1, 10, 11];
142     assert(C.array == correct);
143 }
144 
145 /// Blend packed 8-bit integers from `a` and `b` using `mask`.
146 __m128i _mm_blendv_epi8 (__m128i a, __m128i b, __m128i mask) @trusted
147 {
148     // PERF DMD
149     /*static if (GDC_with_SSE41)
150     {
151         // This intrinsic does nothing in GDC 12.
152         // TODO report to GDC. No problem in GCC.
153         return cast(__m128i) __builtin_ia32_pblendvb128 (cast(ubyte16)a, cast(ubyte16)b, cast(ubyte16)mask);
154     }
155     else*/
156     static if (LDC_with_SSE41)
157     {
158         return cast(__m128i) __builtin_ia32_pblendvb(cast(byte16)a, cast(byte16)b, cast(byte16)mask);
159     }
160     else static if (LDC_with_ARM64)
161     {
162         // LDC arm64: two instructions since LDC 1.12 -O2
163         byte16 maskSX = vshrq_n_s8(cast(byte16)mask, 7);
164         return cast(__m128i) vbslq_s8(maskSX, cast(byte16)b, cast(byte16)a);
165     }
166     else
167     {
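        // Branchless select without SSE4.1 (why the xor/subs trick below works):
        // m is 0xFF in bytes where the mask byte is negative, 0x00 otherwise.
        // If m == 0xFF, (a ^ b) saturating-minus 0xFF is 0, and 0 ^ b == b.
        // If m == 0x00, (a ^ b) is left unchanged, and (a ^ b) ^ b == a.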
168         __m128i m = _mm_cmpgt_epi8(_mm_setzero_si128(), mask);
169         return _mm_xor_si128(_mm_subs_epu8(_mm_xor_si128(a, b), m), b);
170     }
171 }
172 unittest
173 {
174     __m128i A = _mm_setr_epi8( 0,  1,  2,  3,  4,  5,  6,  7,  
175                                8,  9, 10, 11, 12, 13, 14, 15);
176     __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 
177                               24, 25, 26, 27, 28, 29, 30, 31);
178     __m128i M = _mm_setr_epi8( 1, -1,  1,  1, -4,  1, -8,  127,  
179                                1,  1, -1, -1,  4,  1,  8, -128);
180     byte16 R = cast(byte16) _mm_blendv_epi8(A, B, M);
181     byte[16] correct =      [  0, 17,  2,  3, 20,  5, 22,  7,
182                                8,  9, 26, 27, 12, 13, 14, 31 ];
183     assert(R.array == correct);
184 }
185 
186 
187 /// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using `mask`.
188 __m128d _mm_blendv_pd (__m128d a, __m128d b, __m128d mask) @trusted
189 {
190     // PERF DMD
191     static if (GDC_with_SSE42)
192     {
193         // PERF Amazingly enough, GCC/GDC generates the blendvpd instruction
194         // with -msse4.2 but not -msse4.1.
195         // Not sure what the reason is, and there is a replacement sequence below.
196         // Sounds like a bug.
197         return __builtin_ia32_blendvpd(a, b, mask);
198     }
199     else static if (LDC_with_SSE41)
200     {
201         return __builtin_ia32_blendvpd(a, b, mask);
202     }
203     else static if (LDC_with_ARM64)
204     {
205         long2 shift;
206         shift = 63;
207         long2 lmask = cast(long2)mask >> shift;
208         return cast(__m128d) vbslq_s64(lmask, cast(long2)b, cast(long2)a);
209     }
210     else
211     {
212         __m128d r; // PERF =void;
213         long2 lmask = cast(long2)mask;
214         for (int n = 0; n < 2; ++n)
215         {
216             r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n];
217         }
218         return r;
219     }
220 }
221 unittest
222 {
223     __m128d A = _mm_setr_pd(1.0, 2.0);
224     __m128d B = _mm_setr_pd(3.0, 4.0);
225     __m128d M1 = _mm_setr_pd(-3.0, 2.0);
226     __m128d R1 = _mm_blendv_pd(A, B, M1);
227     double[2] correct1 = [3.0, 2.0];
228     assert(R1.array == correct1);
229 
230     // Note: wouldn't work with -double.nan, since in some AArch64 archs the NaN sign bit is lost
231     // See Issue #78
232     __m128d M2 = _mm_setr_pd(double.nan, double.infinity);
233     __m128d R2 = _mm_blendv_pd(A, B, M2);
234     double[2] correct2 = [1.0, 2.0];
235     assert(R2.array == correct2);
236 }
237 
238 
239 /// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using `mask`.
240 __m128 _mm_blendv_ps (__m128 a, __m128 b, __m128 mask) @trusted
241 {
242     // PERF DMD
243     static if (GDC_with_SSE41)
244     {
245         return __builtin_ia32_blendvps(a, b, mask);
246     }
247     else static if (LDC_with_SSE41)
248     {
249         return __builtin_ia32_blendvps(a, b, mask);
250     }
251     else static if (LDC_with_ARM64)
252     {
253         int4 shift;
254         shift = 31;
255         int4 lmask = cast(int4)mask >> shift;
256         return cast(__m128) vbslq_s32(lmask, cast(int4)b, cast(int4)a);
257     }
258     else
259     {
260         __m128 r; // PERF =void;
261         int4 lmask = cast(int4)mask;
262         for (int n = 0; n < 4; ++n)
263         {
264             r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n];
265         }
266         return r;
267     }
268 }
269 unittest
270 {
271     __m128 A  = _mm_setr_ps( 0.0f, 1.0f, 2.0f, 3.0f);
272     __m128 B  = _mm_setr_ps( 4.0f, 5.0f, 6.0f, 7.0f);
273     __m128 M1 = _mm_setr_ps(-3.0f, 2.0f, 1.0f, -10000.0f);
274     __m128 M2 = _mm_setr_ps(float.nan, float.nan, -0.0f, +0.0f);
275     __m128 R1 = _mm_blendv_ps(A, B, M1);
276     __m128 R2 = _mm_blendv_ps(A, B, M2);
277     float[4] correct1 =    [ 4.0f, 1.0f, 2.0f, 7.0f];
278     float[4] correct2 =    [ 0.0f, 1.0f, 6.0f, 3.0f];
279     assert(R1.array == correct1);
280 
281     // Note: wouldn't work with -float.nan, since in some AArch64 archs the NaN sign bit is lost
282     // See Issue #78
283     assert(R2.array == correct2);
284 }
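// Usage sketch (illustrative): a comparison result is a natural mask for `_mm_blendv_ps`, since
// its sign bits are all-ones exactly where the comparison holds. This selects `hi` where x > y:
unittest
{
    __m128 x  = _mm_setr_ps(1.0f, 5.0f, 2.0f, 8.0f);
    __m128 y  = _mm_setr_ps(4.0f, 3.0f, 2.0f, 9.0f);
    __m128 lo = _mm_set1_ps(-1.0f);
    __m128 hi = _mm_set1_ps(+1.0f);
    __m128 r  = _mm_blendv_ps(lo, hi, _mm_cmpgt_ps(x, y));
    float[4] expected = [-1.0f, 1.0f, -1.0f, -1.0f];
    assert(r.array == expected);
}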
285 
286 /// Round the packed double-precision (64-bit) floating-point elements in `a` up to an integer value, 
287 /// and store the results as packed double-precision floating-point elements.
288 __m128d _mm_ceil_pd (__m128d a) @trusted
289 {
290     static if (LDC_with_ARM64)
291     {
292         // LDC arm64 acceptable since 1.8 -O2
293         // Unfortunately x86 intrinsics force a round-trip back to double2
294         // ARM neon semantics wouldn't have that
295         long2 l = vcvtpq_s64_f64(a);
296         double2 r;
297         r.ptr[0] = l.array[0];
298         r.ptr[1] = l.array[1];
299         return r;
300     }
301     else
302     {
303         return _mm_round_pd!2(a);
304     }
305 }
306 unittest
307 {
308     __m128d A = _mm_setr_pd(1.3f, -2.12f);
309     __m128d B = _mm_setr_pd(53.6f, -2.7f);
310     A = _mm_ceil_pd(A);
311     B = _mm_ceil_pd(B);
312     double[2] correctA = [2.0, -2.0];
313     double[2] correctB = [54.0, -2.0];
314     assert(A.array == correctA);
315     assert(B.array == correctB);
316 }
317 
318 /// Round the packed single-precision (32-bit) floating-point elements in `a` up to an integer value, 
319 /// and store the results as packed single-precision floating-point elements.
320 __m128 _mm_ceil_ps (__m128 a) @trusted
321 {
322     static if (LDC_with_ARM64)
323     {
324         // LDC arm64 acceptable since 1.8 -O1
325         int4 l = vcvtpq_s32_f32(a);
326         float4 r;
327         r.ptr[0] = l.array[0];
328         r.ptr[1] = l.array[1];
329         r.ptr[2] = l.array[2];
330         r.ptr[3] = l.array[3];
331         return r;
332     }
333     else
334     {
335         return _mm_round_ps!2(a);
336     }
337 }
338 unittest
339 {
340     __m128 A = _mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f);
341     __m128 C = _mm_ceil_ps(A);
342     float[4] correct = [2.0f, -2.0f, 54.0f, -2.0f];
343     assert(C.array == correct);
344 }
345 
346 /// Round the lower double-precision (64-bit) floating-point element in `b` up to an integer value, 
347 /// store the result as a double-precision floating-point element in the lower element of result, 
348 /// and copy the upper element from `a` to the upper element of dst.
349 __m128d _mm_ceil_sd (__m128d a, __m128d b) @trusted
350 {
351     static if (LDC_with_ARM64)
352     {
353         a[0] = vcvtps_s64_f64(b[0]);
354         return a;
355     }
356     else
357     {
358         return _mm_round_sd!2(a, b);
359     }
360 }
361 unittest
362 {
363     __m128d A = _mm_setr_pd(1.3, -2.12);
364     __m128d B = _mm_setr_pd(53.6, -3.7);
365     __m128d C = _mm_ceil_sd(A, B);
366     double[2] correct = [54.0, -2.12];
367     assert(C.array == correct);
368 }
369 
370 /// Round the lower single-precision (32-bit) floating-point element in `b` up to an integer value,
371 /// store the result as a single-precision floating-point element in the lower element of result, 
372 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
373 __m128 _mm_ceil_ss (__m128 a, __m128 b) @trusted
374 {
375     static if (LDC_with_ARM64)
376     {
377         a[0] = vcvtps_s32_f32(b[0]);
378         return a;
379     }
380     else
381     {
382         return _mm_round_ss!2(a, b);
383     }
384 }
385 unittest
386 {
387     __m128 A = _mm_setr_ps(1.3f, -2.12f, -4.5f, 1.1f);
388     __m128 B = _mm_setr_ps(53.6f, -3.7f, 8.0f, 7.0f);
389     __m128 C = _mm_ceil_ss(A, B);
390     float[4] correct = [54.0f, -2.12f, -4.5f, 1.1f];
391     assert(C.array == correct);
392 }
393 
394 /// Compare packed 64-bit integers in `a` and `b` for equality.
395 __m128i _mm_cmpeq_epi64 (__m128i a, __m128i b) @trusted
396 {
397     static if (SIMD_COMPARISON_MASKS_16B)
398     {
399         version(DigitalMars)
400         {
401             // DMD doesn't recognize long2 == long2
402             long2 la = cast(long2)a;
403             long2 lb = cast(long2)b;
404             long2 res;
405             res.ptr[0] = (la.array[0] == lb.array[0]) ? -1 : 0;
406             res.ptr[1] = (la.array[1] == lb.array[1]) ? -1 : 0;
407             return cast(__m128i)res;
408         }
409         else
410         {
411             return cast(__m128i)(cast(long2)a == cast(long2)b);
412         }
413     }
414     else static if (GDC_with_SSE41)
415     {
416         return cast(__m128i)__builtin_ia32_pcmpeqq(cast(long2)a, cast(long2)b);
417     }
418     else version(LDC)
419     {
420         // LDC x86: generates pcmpeqq since LDC 1.1 -O1
421         //     arm64: generates cmeq since LDC 1.8 -O1
422         return cast(__m128i) equalMask!long2(cast(long2)a, cast(long2)b);
423     }
424     else
425     {
426         // Clever pcmpeqd + pand use with LDC 1.24 -O2
427         long2 la = cast(long2)a;
428         long2 lb = cast(long2)b;
429         long2 res;
430         res.ptr[0] = (la.array[0] == lb.array[0]) ? -1 : 0;
431         res.ptr[1] = (la.array[1] == lb.array[1]) ? -1 : 0;
432         return cast(__m128i)res;
433     }
434 }
435 unittest
436 {
437     __m128i A = _mm_setr_epi64(-1, -2);
438     __m128i B = _mm_setr_epi64(-3, -2);
439     __m128i C = _mm_setr_epi64(-1, -4);
440     long2 AB = cast(long2) _mm_cmpeq_epi64(A, B);
441     long2 AC = cast(long2) _mm_cmpeq_epi64(A, C);
442     long[2] correct1 = [0, -1];
443     long[2] correct2 = [-1, 0];
444     assert(AB.array == correct1);
445     assert(AC.array == correct2);
446 }
447 
448 
449 /// Sign extend packed 16-bit integers in `a` to packed 32-bit integers.
450 __m128i _mm_cvtepi16_epi32 (__m128i a) @trusted
451 {
452     // PERF DMD
453     static if (GDC_with_SSE41)
454     {
455         return cast(__m128i)__builtin_ia32_pmovsxwd128(cast(short8)a);
456     }
457     else static if (LDC_with_optimizations)
458     {
459         // LDC x86: Generates pmovsxwd since LDC 1.1 -O0, also good in arm64
460         enum ir = `
461             %v = shufflevector <8 x i16> %0,<8 x i16> %0, <4 x i32> <i32 0, i32 1,i32 2, i32 3>
462             %r = sext <4 x i16> %v to <4 x i32>
463             ret <4 x i32> %r`;
464         return cast(__m128i) LDCInlineIR!(ir, int4, short8)(cast(short8)a);
465     }
466     else
467     {
468         short8 sa = cast(short8)a;
469         int4 r;
470         r.ptr[0] = sa.array[0];
471         r.ptr[1] = sa.array[1];
472         r.ptr[2] = sa.array[2];
473         r.ptr[3] = sa.array[3];
474         return r;
475     }
476 }
477 unittest
478 {
479     __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0);
480     int4 C = cast(int4) _mm_cvtepi16_epi32(A);
481     int[4] correct = [-1, 0, -32768, 32767];
482     assert(C.array == correct);
483 }
484 
485 /// Sign extend packed 16-bit integers in `a` to packed 64-bit integers.
486 __m128i _mm_cvtepi16_epi64 (__m128i a) @trusted
487 {
488     // PERF DMD
489     static if (GDC_with_SSE41)
490     {
491         return cast(__m128i)__builtin_ia32_pmovsxwq128(cast(short8)a);
492     }
493     else static if (LDC_with_optimizations)
494     {
495         // LDC x86: Generates pmovsxwq since LDC 1.1 -O0, also good in arm64
496         enum ir = `
497             %v = shufflevector <8 x i16> %0,<8 x i16> %0, <2 x i32> <i32 0, i32 1>
498             %r = sext <2 x i16> %v to <2 x i64>
499             ret <2 x i64> %r`;
500         return cast(__m128i) LDCInlineIR!(ir, long2, short8)(cast(short8)a);
501     }
502     else
503     {
504         short8 sa = cast(short8)a;
505         long2 r;
506         r.ptr[0] = sa.array[0];
507         r.ptr[1] = sa.array[1];
508         return cast(__m128i)r;
509     }
510 }
511 unittest
512 {
513     __m128i A = _mm_setr_epi16(-32768, 32767, 0, 0, 0, 0, 0, 0);
514     long2 C = cast(long2) _mm_cvtepi16_epi64(A);
515     long[2] correct = [-32768, 32767];
516     assert(C.array == correct);
517 }
518 
519 /// Sign extend packed 32-bit integers in `a` to packed 64-bit integers.
520 __m128i _mm_cvtepi32_epi64 (__m128i a) @trusted
521 {
522     // PERF DMD
523     static if (GDC_with_SSE41)
524     {
525         return cast(__m128i)__builtin_ia32_pmovsxdq128(cast(int4)a);
526     }
527     else static if (LDC_with_optimizations)
528     {
529         // LDC x86: Generates pmovsxdq since LDC 1.1 -O0, also good in arm64
530         enum ir = `
531             %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1>
532             %r = sext <2 x i32> %v to <2 x i64>
533             ret <2 x i64> %r`;
534         return cast(__m128i) LDCInlineIR!(ir, long2, int4)(cast(int4)a);
535     }
536     else
537     {
538         int4 sa = cast(int4)a;
539         long2 r;
540         r.ptr[0] = sa.array[0];
541         r.ptr[1] = sa.array[1];
542         return cast(__m128i)r;
543     }
544 }
545 unittest
546 {
547     __m128i A = _mm_setr_epi32(-4, 42, 0, 0);
548     long2 C = cast(long2) _mm_cvtepi32_epi64(A);
549     long[2] correct = [-4, 42];
550     assert(C.array == correct);
551 }
552 
553 
554 /// Sign extend packed 8-bit integers in `a` to packed 16-bit integers.
555 __m128i _mm_cvtepi8_epi16 (__m128i a) pure @trusted
556 {
557     // PERF DMD
558     static if (GDC_with_SSE41)
559     {
560         alias ubyte16 = __vector(ubyte[16]);
561         return cast(__m128i)__builtin_ia32_pmovsxbw128(cast(ubyte16)a);
562     }
563     else static if (LDC_with_optimizations)
564     {
565         // LDC x86: pmovsxbw generated since LDC 1.1.0 -O0 
566         // LDC ARM64: sshll generated since LDC 1.8.0 -O1
567         enum ir = `
568             %v = shufflevector <16 x i8> %0,<16 x i8> %0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
569             %r = sext <8 x i8> %v to <8 x i16>
570             ret <8 x i16> %r`;
571         return cast(__m128i) LDCInlineIR!(ir, short8, byte16)(cast(byte16)a);
572     }
573     else
574     {
575         byte16 sa = cast(byte16)a;
576         short8 r;
577         foreach(n; 0..8)
578             r.ptr[n] = sa.array[n];
579         return cast(__m128i)r;
580     }
581 }
582 unittest
583 {
584     __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
585     short8 C = cast(short8) _mm_cvtepi8_epi16(A);
586     short[8] correct = [127, -128, 1, -1, 0, 2, -4, -8];
587     assert(C.array == correct);
588 }
589 
590 
591 /// Sign extend packed 8-bit integers in `a` to packed 32-bit integers.
592 __m128i _mm_cvtepi8_epi32 (__m128i a) @trusted
593 {
594     // PERF DMD
595     static if (GDC_with_SSE41)
596     {
597         alias ubyte16 = __vector(ubyte[16]);
598         return cast(__m128i)__builtin_ia32_pmovsxbd128(cast(ubyte16)a);
599     }
600     else static if (LDC_with_SSE41 && LDC_with_optimizations)
601     {
602         // LDC x86: Generates pmovsxbd since LDC 1.1 -O0
603         enum ir = `
604             %v = shufflevector <16 x i8> %0,<16 x i8> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
605             %r = sext <4 x i8> %v to <4 x i32>
606             ret <4 x i32> %r`;
607         return cast(__m128i) LDCInlineIR!(ir, int4, byte16)(cast(byte16)a);
608     }
609     else
610     {
611         // LDC ARM64: this gives the same codegen as a vmovl_s16/vmovl_s8 sequence would
612         byte16 sa = cast(byte16)a;
613         int4 r;
614         r.ptr[0] = sa.array[0];
615         r.ptr[1] = sa.array[1];
616         r.ptr[2] = sa.array[2];
617         r.ptr[3] = sa.array[3];
618         return cast(__m128i)r;
619     }
620 }
621 unittest
622 {
623     __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
624     int4 C = cast(int4) _mm_cvtepi8_epi32(A);
625     int[4] correct = [127, -128, 1, -1];
626     assert(C.array == correct);
627 }
628 
629 
630 /// Sign extend packed 8-bit integers in the low 8 bytes of `a` to packed 64-bit integers.
631 __m128i _mm_cvtepi8_epi64 (__m128i a) @trusted
632 {
633     // PERF DMD
634     static if (GDC_with_SSE41)
635     {
636         alias ubyte16 = __vector(ubyte[16]);
637         return cast(__m128i)__builtin_ia32_pmovsxbq128(cast(ubyte16)a);
638     }
639     else static if (LDC_with_optimizations)
640     {
641         // LDC x86: Generates pmovsxbq since LDC 1.1 -O0, 
642         // LDC arm64: it's ok since LDC 1.8 -O1
643         enum ir = `
644             %v = shufflevector <16 x i8> %0,<16 x i8> %0, <2 x i32> <i32 0, i32 1>
645             %r = sext <2 x i8> %v to <2 x i64>
646             ret <2 x i64> %r`;
647         return cast(__m128i) LDCInlineIR!(ir, long2, byte16)(cast(byte16)a);
648     }
649     else
650     {
651         byte16 sa = cast(byte16)a;
652         long2 r;
653         foreach(n; 0..2)
654             r.ptr[n] = sa.array[n];
655         return cast(__m128i)r;
656     }
657 }
658 unittest
659 {
660     __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
661     long2 C = cast(long2) _mm_cvtepi8_epi64(A);
662     long[2] correct = [127, -128];
663     assert(C.array == correct);
664 }
665 
666 
667 /// Zero extend packed unsigned 16-bit integers in `a` to packed 32-bit integers.
668 __m128i _mm_cvtepu16_epi32 (__m128i a) @trusted
669 {
670     // PERF DMD
671     static if (GDC_with_SSE41)
672     {
673         return cast(__m128i) __builtin_ia32_pmovzxwd128(cast(short8)a);
674     }
675     else
676     {
677         // LDC x86: generates pmovzxwd since LDC 1.12 -O1 also good without SSE4.1
678         //     arm64: ushll since LDC 1.12 -O1
679         short8 sa = cast(short8)a;
680         int4 r;
681         r.ptr[0] = cast(ushort)sa.array[0];
682         r.ptr[1] = cast(ushort)sa.array[1];
683         r.ptr[2] = cast(ushort)sa.array[2];
684         r.ptr[3] = cast(ushort)sa.array[3];
685         return cast(__m128i)r;
686     }
687 }
688 unittest
689 {
690     __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0);
691     int4 C = cast(int4) _mm_cvtepu16_epi32(A);
692     int[4] correct = [65535, 0, 32768, 32767];
693     assert(C.array == correct);
694 }
695 
696 
697 /// Zero extend packed unsigned 16-bit integers in `a` to packed 64-bit integers.
698 __m128i _mm_cvtepu16_epi64 (__m128i a) @trusted
699 {
700     // PERF DMD
701     static if (GDC_with_SSE41)
702     {
703         return cast(__m128i) __builtin_ia32_pmovzxwq128(cast(short8)a);
704     }
705     else static if (LDC_with_ARM64)
706     {
707         // LDC arm64: a bit shorter than below, in -O2
708         short8 sa = cast(short8)a;
709         long2 r;
710         for(int n = 0; n < 2; ++n)
711             r.ptr[n] = cast(ushort)sa.array[n];
712         return cast(__m128i)r;
713     }
714     else
715     {
716         // LDC x86: generates pmovzxwq since LDC 1.12 -O1, also good without SSE4.1
717         short8 sa = cast(short8)a;
718         long2 r;
719         r.ptr[0] = cast(ushort)sa.array[0];
720         r.ptr[1] = cast(ushort)sa.array[1];
721         return cast(__m128i)r;
722     }
723 }
724 unittest
725 {
726     __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0);
727     long2 C = cast(long2) _mm_cvtepu16_epi64(A);
728     long[2] correct = [65535, 0];
729     assert(C.array == correct);
730 }
731 
732 
733 /// Zero extend packed unsigned 32-bit integers in `a` to packed 64-bit integers.
734 __m128i _mm_cvtepu32_epi64 (__m128i a) @trusted
735 {
736     // PERF DMD
737     static if (GDC_with_SSE41)
738     {
739         return cast(__m128i) __builtin_ia32_pmovzxdq128(cast(int4)a);
740     }
741     else
742     {
743         // LDC x86: generates pmovzxdq since LDC 1.12 -O1 also good without SSE4.1
744         //     arm64: generates ushll since LDC 1.12 -O1
745         int4 sa = cast(int4)a;
746         long2 r;
747         r.ptr[0] = cast(uint)sa.array[0];
748         r.ptr[1] = cast(uint)sa.array[1];
749         return cast(__m128i)r;
750     }
751 }
752 unittest
753 {
754     __m128i A = _mm_setr_epi32(-1, 42, 0, 0);
755     long2 C = cast(long2) _mm_cvtepu32_epi64(A);
756     long[2] correct = [4294967295, 42];
757     assert(C.array == correct);
758 }
759 
760 
761 /// Zero extend packed unsigned 8-bit integers in `a` to packed 16-bit integers.
762 __m128i _mm_cvtepu8_epi16 (__m128i a) pure @trusted
763 {
764     // PERF DMD
765     static if (GDC_with_SSE41)
766     {
767         return cast(__m128i) __builtin_ia32_pmovzxbw128(cast(ubyte16)a);
768     }
769     else static if (LDC_with_optimizations)
770     {
771         enum ir = `
772             %v = shufflevector <16 x i8> %0,<16 x i8> %0, <8 x i32> <i32 0, i32 1,i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
773             %r = zext <8 x i8> %v to <8 x i16>
774             ret <8 x i16> %r`;
775         return cast(__m128i) LDCInlineIR!(ir, short8, byte16)(cast(byte16)a);
776     }
777     else
778     {
779         return _mm_unpacklo_epi8(a, _mm_setzero_si128());
780     }
781 }
782 unittest
783 {
784     __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
785     short8 C = cast(short8) _mm_cvtepu8_epi16(A);
786     short[8] correct = [127, 128, 1, 255, 0, 2, 252, 248];
787     assert(C.array == correct);
788 }
789 
790 
791 /// Zero extend packed unsigned 8-bit integers in `a` to packed 32-bit integers.
792 __m128i _mm_cvtepu8_epi32 (__m128i a) @trusted
793 {
794     // PERF DMD
795     static if (GDC_with_SSE41)
796     {
797         alias ubyte16 = __vector(ubyte[16]);
798         return cast(__m128i) __builtin_ia32_pmovzxbd128(cast(ubyte16)a);
799     }
800     else static if (LDC_with_ARM64)
801     {
802         // LDC arm64: a bit better than below in -O2
803         byte16 sa = cast(byte16)a;
804         int4 r;
805         for(int n = 0; n < 4; ++n) 
806             r.ptr[n] = cast(ubyte)sa.array[n];
807         return cast(__m128i)r;
808     }
809     else
810     {
811         // LDC x86: generates pmovzxbd since LDC 1.12 -O1 also good without SSE4.1
812         // PERF: catastrophic with GDC without SSE4.1
813         byte16 sa = cast(byte16)a;
814         int4 r;
815         r.ptr[0] = cast(ubyte)sa.array[0];
816         r.ptr[1] = cast(ubyte)sa.array[1];
817         r.ptr[2] = cast(ubyte)sa.array[2];
818         r.ptr[3] = cast(ubyte)sa.array[3];
819         return cast(__m128i)r;
820     }
821 }
822 unittest
823 {
824     __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
825     int4 C = cast(int4) _mm_cvtepu8_epi32(A);
826     int[4] correct = [127, 128, 1, 255];
827     assert(C.array == correct);
828 }
829 
830 /// Zero extend packed unsigned 8-bit integers in the low 8 bytes of `a` to packed 64-bit integers.
831 __m128i _mm_cvtepu8_epi64 (__m128i a) @trusted
832 {
833     // PERF DMD
834     static if (GDC_with_SSE41)
835     {
836         alias ubyte16 = __vector(ubyte[16]);
837         return cast(__m128i)__builtin_ia32_pmovzxbq128(cast(ubyte16)a);
838     }
839     else static if (LDC_with_ARM64)
840     {
841         // LDC arm64: this optimizes better than the loop below
842         byte16 sa = cast(byte16)a;
843         long2 r;
844         for (int n = 0; n < 2; ++n)
845             r.ptr[n] = cast(ubyte)sa.array[n];
846         return cast(__m128i)r;
847     }
848     else
849     {
850         // LDC x86: Generates pmovzxbq since LDC 1.1 -O0, a pshufb without SSE4.1
851         byte16 sa = cast(byte16)a;
852         long2 r;
853         r.ptr[0] = cast(ubyte)sa.array[0];
854         r.ptr[1] = cast(ubyte)sa.array[1];
855         return cast(__m128i)r;
856     }
857 }
858 unittest
859 {
860     __m128i A = _mm_setr_epi8(127, -2, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
861     long2 C = cast(long2) _mm_cvtepu8_epi64(A);
862     long[2] correct = [127, 254];
863     assert(C.array == correct);
864 }
865 
866 /// Conditionally multiply the packed double-precision (64-bit) floating-point elements 
867 /// in `a` and `b` using the high 4 bits in `imm8`, sum the four products, and conditionally
868 /// store the sum in dst using the low 4 bits of `imm8`.
869 __m128d _mm_dp_pd(int imm8)(__m128d a, __m128d b) @trusted
870 {
871     // PERF DMD
872     static if (GDC_with_SSE41)
873     {
874         return __builtin_ia32_dppd(a, b, imm8 & 0x33);
875     }
876     else static if (LDC_with_SSE41)
877     {
878         return __builtin_ia32_dppd(a, b, imm8 & 0x33);
879     }
880     else
881     {
882         __m128d zero = _mm_setzero_pd();
883         __m128d temp = _mm_blend_pd!( (imm8 >>> 4) & 3)(zero, a * b);
884         double sum = temp.array[0] + temp.array[1];
885         return _mm_blend_pd!(imm8 & 3)(zero, _mm_set1_pd(sum));
886     }
887 }
888 unittest
889 {
890     __m128d A = _mm_setr_pd(1.0, 2.0);
891     __m128d B = _mm_setr_pd(4.0, 8.0);
892     double2 R1 = _mm_dp_pd!(0x10 + 0x3 + 0x44)(A, B);
893     double2 R2 = _mm_dp_pd!(0x20 + 0x1 + 0x88)(A, B);
894     double2 R3 = _mm_dp_pd!(0x30 + 0x2 + 0x00)(A, B);
895     double[2] correct1 = [ 4.0,  4.0];
896     double[2] correct2 = [16.0,  0.0];
897     double[2] correct3 = [ 0.0, 20.0];
898     assert(R1.array == correct1);
899     assert(R2.array == correct2);
900     assert(R3.array == correct3);
901 }
902 
903 /// Conditionally multiply the packed single-precision (32-bit) floating-point elements 
904 /// in `a` and `b` using the high 4 bits in `imm8`, sum the four products, 
905 /// and conditionally store the sum in result using the low 4 bits of `imm8`.
906 __m128 _mm_dp_ps(int imm8)(__m128 a, __m128 b) @trusted
907 {
908     // PERF DMD
909     static if (GDC_with_SSE41)
910     {
911         return __builtin_ia32_dpps(a, b, cast(ubyte)imm8);
912     }
913     else static if (LDC_with_SSE41)
914     {
915         return __builtin_ia32_dpps(a, b, cast(byte)imm8);
916     }
917     else
918     {
919         __m128 zero = _mm_setzero_ps();
920         __m128 temp = _mm_blend_ps!( (imm8 >>> 4) & 15)(zero, a * b);
921         float sum = temp.array[0] + temp.array[1] + temp.array[2] + temp.array[3];
922         return _mm_blend_ps!(imm8 & 15)(zero, _mm_set1_ps(sum));
923     }        
924 }
925 unittest
926 {
927     __m128 A = _mm_setr_ps(1.0f, 2.0f, 4.0f, 8.0f);
928     __m128 B = _mm_setr_ps(9.0f, 7.0f, 5.0f, 3.0f);
929     float4 R1 = _mm_dp_ps!(0xf0 + 0xf)(A, B);
930     float4 R2 = _mm_dp_ps!(0x30 + 0x5)(A, B);
931     float4 R3 = _mm_dp_ps!(0x50 + 0xa)(A, B);
932     float[4] correct1 =   [67.0f, 67.0f, 67.0f, 67.0f];
933     float[4] correct2 =   [23.0f, 0.0f, 23.0f, 0.0f];
934     float[4] correct3 =   [0.0f, 29.0f, 0.0f, 29.0f];
935     assert(R1.array == correct1);
936     assert(R2.array == correct2);
937     assert(R3.array == correct3);
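// Usage sketch (illustrative): with imm8 = 0xFF all four products are summed and the sum is
// broadcast to every lane, giving a full 4-element dot product; `_mm_cvtss_f32` reads it back:
unittest
{
    __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 b = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
    float dot = _mm_cvtss_f32(_mm_dp_ps!0xFF(a, b));
    assert(dot == 70.0f); // 5 + 12 + 21 + 32
}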
938 }
939 
940 
941 /// Extract a 32-bit integer from `a`, selected with `imm8`.
942 int _mm_extract_epi32 (__m128i a, const int imm8) pure @trusted
943 {
944     return (cast(int4)a).array[imm8 & 3];
945 }
946 unittest
947 {
948     __m128i A = _mm_setr_epi32(1, 2, 3, 4);
949     assert(_mm_extract_epi32(A, 0) == 1);
950     assert(_mm_extract_epi32(A, 1 + 8) == 2);
951     assert(_mm_extract_epi32(A, 3 + 4) == 4);
952 }
953 
954 /// Extract a 64-bit integer from `a`, selected with `imm8`.
955 long _mm_extract_epi64 (__m128i a, const int imm8) pure @trusted
956 {
957     long2 la = cast(long2)a;
958     return la.array[imm8 & 1];
959 }
960 unittest
961 {
962     __m128i A = _mm_setr_epi64(45, -67);
963     assert(_mm_extract_epi64(A, 0) == 45);
964     assert(_mm_extract_epi64(A, 1) == -67);
965     assert(_mm_extract_epi64(A, 2) == 45);
966 }
967 
968 /// Extract an 8-bit integer from `a`, selected with `imm8`.
969 /// Warning: the returned value is zero-extended to 32-bits.
970 int _mm_extract_epi8 (__m128i a, const int imm8) @trusted
971 {
972     byte16 ba = cast(byte16)a;
973     return cast(ubyte) ba.array[imm8 & 15];
974 }
975 unittest
976 {
977     __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, 14, 15);
978     assert(_mm_extract_epi8(A, 7) == 7);
979     assert(_mm_extract_epi8(A, 13) == 255);
980     assert(_mm_extract_epi8(A, 7 + 16) == 7);
981 }
982 
983 /// Extract a single-precision (32-bit) floating-point element from `a`, selected with `imm8`.
984 /// Note: returns a 32-bit $(I integer).
985 int _mm_extract_ps (__m128 a, const int imm8) @trusted
986 {
987     return (cast(int4)a).array[imm8 & 3];
988 }
989 unittest
990 {
991     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, -4.0f);
992     assert(_mm_extract_ps(A, 0) == 0x3f800000);
993     assert(_mm_extract_ps(A, 1 + 8) == 0x40000000);
994     assert(_mm_extract_ps(A, 3 + 4) == cast(int)0xc0800000);
995 }
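// Illustrative sketch: `_mm_extract_ps` returns the raw bit pattern. To recover the float value
// itself, reinterpret the returned 32-bit integer (or simply read `a.array[imm8]`):
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, -4.0f);
    int bits = _mm_extract_ps(A, 2);
    float f = *cast(float*)&bits; // reinterpret the bits as a float
    assert(f == 3.0f);
}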
996 
997 
998 
999 /// Round the packed double-precision (64-bit) floating-point elements in `a` down to an 
1000 /// integer value, and store the results as packed double-precision floating-point elements.
1001 __m128d _mm_floor_pd (__m128d a) @trusted
1002 {
1003     static if (LDC_with_ARM64)
1004     {
1005         // LDC arm64 acceptable since 1.8 -O2
1006         long2 l = vcvtmq_s64_f64(a);
1007         double2 r;
1008         r.ptr[0] = l.array[0];
1009         r.ptr[1] = l.array[1];
1010         return r;
1011     }
1012     else
1013     {
1014         return _mm_round_pd!1(a);
1015     }
1016 }
1017 unittest
1018 {
1019     __m128d A = _mm_setr_pd(1.3f, -2.12f);
1020     __m128d B = _mm_setr_pd(53.6f, -2.7f);
1021     A = _mm_floor_pd(A);
1022     B = _mm_floor_pd(B);
1023     double[2] correctA = [1.0, -3.0];
1024     double[2] correctB = [53.0, -3.0];
1025     assert(A.array == correctA);
1026     assert(B.array == correctB);
1027 }
1028 
1029 /// Round the packed single-precision (32-bit) floating-point elements in `a` down to an 
1030 /// integer value, and store the results as packed single-precision floating-point elements.
1031 __m128 _mm_floor_ps (__m128 a) @trusted
1032 {
1033     static if (LDC_with_ARM64)
1034     {
1035         // LDC arm64 acceptable since 1.8 -O1
1036         int4 l = vcvtmq_s32_f32(a);
1037         float4 r;
1038         r.ptr[0] = l.array[0];
1039         r.ptr[1] = l.array[1];
1040         r.ptr[2] = l.array[2];
1041         r.ptr[3] = l.array[3];
1042         return r;
1043     }
1044     else
1045     {
1046         return _mm_round_ps!1(a);
1047     }
1048 }
1049 unittest
1050 {
1051     __m128 A = _mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f);
1052     __m128 C = _mm_floor_ps(A);
1053     float[4] correct = [1.0f, -3.0f, 53.0f, -3.0f];
1054     assert(C.array == correct);
1055 }
1056 
1057 /// Round the lower double-precision (64-bit) floating-point element in `b` down to an 
1058 /// integer value, store the result as a double-precision floating-point element in the 
1059 /// lower element, and copy the upper element from `a` to the upper element.
1060 __m128d _mm_floor_sd (__m128d a, __m128d b) @trusted
1061 {
1062     static if (LDC_with_ARM64)
1063     {
1064         a[0] = vcvtms_s64_f64(b[0]);
1065         return a;
1066     }
1067     else
1068     {
1069         return _mm_round_sd!1(a, b);
1070     }
1071 }
1072 unittest
1073 {
1074     __m128d A = _mm_setr_pd(1.3, -2.12);
1075     __m128d B = _mm_setr_pd(-53.1, -3.7);
1076     __m128d C = _mm_floor_sd(A, B);
1077     double[2] correct = [-54.0, -2.12];
1078     assert(C.array == correct);
1079 }
1080 
1081 /// Round the lower single-precision (32-bit) floating-point element in `b` down to an
1082 /// integer value, store the result as a single-precision floating-point element in the
1083 /// lower element, and copy the upper 3 packed elements from `a` to the upper elements.
1084 __m128 _mm_floor_ss (__m128 a, __m128 b) @trusted
1085 {
1086     static if (LDC_with_ARM64)
1087     {
1088         a[0] = vcvtms_s32_f32(b[0]);
1089         return a;
1090     }
1091     else
1092     {
1093         return _mm_round_ss!1(a, b);
1094     }
1095 }
1096 unittest
1097 {
1098     __m128 A = _mm_setr_ps(1.3f, -2.12f, -4.5f, 1.1f);
1099     __m128 B = _mm_setr_ps(-539.3f, -3.7f, 8.0f, 7.0f);
1100     __m128 C = _mm_floor_ss(A, B);
1101     float[4] correct = [-540.0f, -2.12f, -4.5f, 1.1f];
1102     assert(C.array == correct);
1103 }
1104 
1105 /// Insert the 32-bit integer `i` into `a` at the location specified by `imm8[1:0]`.
1106 __m128i _mm_insert_epi32 (__m128i a, int i, const int imm8) pure @trusted
1107 {
1108     // GDC: nothing special to do, pinsrd generated with -O1 -msse4.1
1109     // LDC x86: pinsrd since LDC 1.1 -O2 with -mattr=+sse4.1
1110     // LDC arm64: ins.s since LDC 1.8 -O2
1111     int4 ia = cast(int4)a;
1112     ia.ptr[imm8 & 3] = i;
1113     return cast(__m128i)ia; 
1114 }
1115 unittest
1116 {
1117     __m128i A = _mm_setr_epi32(1, 2, 3, 4);
1118     int4 C = cast(int4) _mm_insert_epi32(A, 5, 2 + 4);
1119     int[4] result = [1, 2, 5, 4];
1120     assert(C.array == result);
1121 }
1122 
1123 /// Insert the 64-bit integer `i` into `a` at the location specified by `imm8[0]`.
1124 __m128i _mm_insert_epi64 (__m128i a, long i, const int imm8) pure @trusted
1125 {
1126     // GDC: nothing special to do, pinsrq generated with -O1 -msse4.1
1127     // LDC x86: always do something sensible.
1128     long2 la = cast(long2)a;
1129     la.ptr[imm8 & 1] = i;
1130     return cast(__m128i)la;
1131 }
1132 unittest
1133 {
1134     __m128i A = _mm_setr_epi64(1, 2);
1135     long2 C = cast(long2) _mm_insert_epi64(A, 5, 1 + 2);
1136     long[2] result = [1, 5];
1137     assert(C.array == result);
1138 }
1139 
1140 /// Insert the 8-bit integer `i` into `a` at the location specified by `imm8[2:0]`.
1141 /// Copy a to dst, and insert the lower 8-bit integer from i into dst at the location specified by imm8.
1142 __m128i _mm_insert_epi8 (__m128i a, int i, const int imm8) @trusted
1143 {
1144     // GDC: nothing special to do, pinsrb generated with -O1 -msse4.1
1145     // LDC x86: doesn't do pinsrb, maybe it's slower. arm64 also spills to memory.
1146     byte16 ba = cast(byte16)a;
1147     ba.ptr[imm8 & 15] = cast(byte)i;
1148     return cast(__m128i)ba; 
1149 }
1150 unittest
1151 {
1152     __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
1153     byte16 C = cast(byte16) _mm_insert_epi8(A, 30, 4 + 16);
1154     byte[16] result = [0, 1, 2, 3, 30, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
1155     assert(C.array == result);
1156 }
1157 
1158 
1159 /// Warning: of course it does something totally different from `_mm_insert_epi32`!
1160 /// Copy `a` to `tmp`, then insert a single-precision (32-bit) floating-point element from `b` 
1161 /// into `tmp` using the control in `imm8`. Store `tmp` to result using the mask in `imm8[3:0]` 
1162 /// (elements are zeroed out when the corresponding bit is set).
1163 __m128 _mm_insert_ps(int imm8)(__m128 a, __m128 b) @trusted
1164 {
1165     // PERF DMD
1166     static if (GDC_with_SSE41)
1167     {
1168         return __builtin_ia32_insertps128(a, b, cast(ubyte)imm8);
1169     }
1170     else static if (LDC_with_SSE41)
1171     {
1172         return __builtin_ia32_insertps128(a, b, cast(byte)imm8);
1173     }
1174     else
1175     {
1176         float4 tmp2 = a;
1177         float tmp1 = b.array[(imm8 >> 6) & 3];
1178         tmp2.ptr[(imm8 >> 4) & 3] = tmp1;
1179         return _mm_blend_ps!(imm8 & 15)(tmp2, _mm_setzero_ps());
1180     }
1181 }
1182 unittest
1183 {
1184     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
1185     __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
1186     __m128 C = _mm_insert_ps!(128 + (32 + 16) + 4)(A, B);
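    // imm8 = 180 = 0b10_11_0100: source = B[2] (bits 7:6), destination = element 3 (bits 5:4),
    // and the low nibble 0b0100 zeroes element 2.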
1187     float[4] correct =    [1.0f, 2.0f, 0.0f, 7.0f];
1188     assert(C.array == correct);
1189 }
1190 
1191 
1192 /// Compare packed signed 32-bit integers in `a` and `b`, returns packed maximum values.
1193 __m128i _mm_max_epi32 (__m128i a, __m128i b) pure @trusted
1194 {
1195     static if (GDC_with_SSE41)
1196     {
1197         return cast(__m128i) __builtin_ia32_pmaxsd128(cast(int4)a, cast(int4)b);
1198     }
1199     else version(LDC)
1200     {
1201         // x86: pmaxsd since LDC 1.1 -O1
1202         // ARM: smax.4s since LDC 1.8 -O1
1203         int4 sa = cast(int4)a;
1204         int4 sb = cast(int4)b;
1205         static if (SIMD_COMPARISON_MASKS_16B)
1206             int4 greater = sa > sb;
1207         else
1208             int4 greater = greaterMask!int4(sa, sb);
1209         return cast(__m128i)( (greater & sa) | (~greater & sb) );
1210     }
1211     else
1212     {
1213         __m128i higher = _mm_cmpgt_epi32(a, b);
1214         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
1215         __m128i mask = _mm_and_si128(aTob, higher);
1216         return _mm_xor_si128(b, mask);
1217     }
1218 }
1219 unittest
1220 {
1221     int4 R = cast(int4) _mm_max_epi32(_mm_setr_epi32(0x7fffffff, 1, -4, 7),
1222                                       _mm_setr_epi32(        -4,-8,  9, -8));
1223     int[4] correct =                               [0x7fffffff, 1,  9,  7];
1224     assert(R.array == correct);
1225 }
1226 
1227 /// Compare packed signed 8-bit integers in `a` and `b`, 
1228 /// and return packed maximum values.
1229 __m128i _mm_max_epi8 (__m128i a, __m128i b) pure @trusted
1230 {
1231     // PERF DMD
1232     static if (GDC_with_SSE41)
1233     {
1234         return cast(__m128i) __builtin_ia32_pmaxsb128(cast(ubyte16)a, cast(ubyte16)b);
1235     }
1236     else version(LDC)
1237     {
1238         // x86: pmaxsb since LDC 1.1 -O1
1239         // ARM64: smax.16b since LDC 1.8.0 -O1
1240         byte16 sa = cast(byte16)a;
1241         byte16 sb = cast(byte16)b;
1242         static if (SIMD_COMPARISON_MASKS_16B)
1243             byte16 greater = sa > sb;
1244         else
1245             byte16 greater = cast(byte16) greaterMask!byte16(sa, sb);
1246         return cast(__m128i)( (greater & sa) | (~greater & sb) );
1247     }
1248     else
1249     {
1250         __m128i lower = _mm_cmpgt_epi8(a, b); // ones where a should be selected, b else
1251         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
1252         __m128i mask = _mm_and_si128(aTob, lower);
1253         return _mm_xor_si128(b, mask);
1254     }
1255 }
1256 unittest
1257 {
1258     __m128i A = _mm_setr_epi8(127,  1, -4, -8, 9,    7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0);
1259     __m128i B = _mm_setr_epi8(  4, -8,  9, -7, 0, -128, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0);
1260     byte16 R = cast(byte16) _mm_max_epi8(A, B);
1261     byte[16] correct =       [127,  1,  9, -7, 9,    7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0];
1262     assert(R.array == correct);
1263 }
1264 
1265 /// Compare packed unsigned 16-bit integers in `a` and `b`, returns packed maximum values.
1266 __m128i _mm_max_epu16 (__m128i a, __m128i b) pure @trusted
1267 {
1268     // PERF DMD
1269     static if (GDC_with_SSE41)
1270     {
1271         return cast(__m128i) __builtin_ia32_pmaxuw128(cast(short8)a, cast(short8)b);
1272     }
1273     else version(LDC)
1274     {
1275         // x86: pmaxuw since LDC 1.1 -O1
1276         // ARM64: umax.8h since LDC 1.8.0 -O1
1277         // PERF: without sse4.1, LLVM 12 produces a very interesting
1278         //          psubusw xmm0, xmm1
1279         //          paddw   xmm0, xmm1
1280         //       sequence that maybe should go in other min/max intrinsics? 
1281         ushort8 sa = cast(ushort8)a;
1282         ushort8 sb = cast(ushort8)b;
1283         static if (SIMD_COMPARISON_MASKS_16B)
1284         {
1285             // Note: doesn't work well with GDC, which prefers the builtin.
1286             ushort8 greater = sa > sb;
1287         }
1288         else
1289             ushort8 greater = cast(ushort8) greaterMask!ushort8(sa, sb);
1290         return cast(__m128i)( (greater & sa) | (~greater & sb) );
1291     }
1292     else
1293     {
1294         b = _mm_subs_epu16(b, a);
1295         b = _mm_add_epi16(b, a);
1296         return b;
1297     }
1298 }
1299 unittest
1300 {
1301     short8 R = cast(short8) _mm_max_epu16(_mm_setr_epi16(32767,  1, -4, -8, 9,     7, 0, 57),
1302                                           _mm_setr_epi16(   -4, -8,  9, -7, 0,-32768, 0,  0));
1303     short[8] correct =                                  [   -4, -8, -4, -7, 9,-32768, 0, 57];
1304     assert(R.array == correct);
1305 }
1306 
1307 /// Compare packed unsigned 32-bit integers in `a` and `b`, returns packed maximum values.
1308 __m128i _mm_max_epu32 (__m128i a, __m128i b) pure @trusted
1309 {
1310     // PERF DMD
1311     static if (GDC_with_SSE41)
1312     {
1313         return cast(__m128i) __builtin_ia32_pmaxud128(cast(int4)a, cast(int4)b);
1314     }
1315     else version(LDC)
1316     {
1317         // x86: pmaxud since LDC 1.1 -O1, also good without sse4.1
1318         // ARM64: umax.4s since LDC 1.8.0 -O1
1319         uint4 sa = cast(uint4)a;
1320         uint4 sb = cast(uint4)b;
1321         static if (SIMD_COMPARISON_MASKS_16B)
1322             uint4 greater = sa > sb;
1323         else
1324             uint4 greater = cast(uint4) greaterMask!uint4(sa, sb);
1325         return cast(__m128i)( (greater & sa) | (~greater & sb) );
1326     }
1327     else
1328     {
1329         // PERF: LLVM suggests to replace the _mm_add_epi32 by _mm_xor_si128, and the last xor by an "_mm_or_si128"
1330         /+
1331         movdqa  xmm2, xmmword ptr [-0x80000000, -0x80000000, -0x80000000, -0x80000000]
1332         movdqa  xmm3, xmm1
1333         pxor    xmm3, xmm2
1334         pxor    xmm2, xmm0
1335         pcmpgtd xmm2, xmm3
1336         pand    xmm0, xmm2
1337         pandn   xmm2, xmm1
1338         por     xmm0, xmm2
1339         +/
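        // Sign-flip trick: SSE2 only has a signed 32-bit comparison (pcmpgtd), so adding
        // 0x80000000 to both operands (flipping their sign bits) makes it order them as unsigned.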
1340         __m128i valueShift = _mm_set1_epi32(-0x80000000);
1341         __m128i higher = _mm_cmpgt_epi32(_mm_add_epi32(a, valueShift), _mm_add_epi32(b, valueShift));
1342         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
1343         __m128i mask = _mm_and_si128(aTob, higher);
1344         return _mm_xor_si128(b, mask);
1345     }
1346 }
1347 unittest
1348 {
1349     int4 R = cast(int4) _mm_max_epu32(_mm_setr_epi32(0x7fffffff, 1,  4, -7),
1350                                       _mm_setr_epi32(        -4,-8,  9, -8));
1351     int[4] correct =                                [        -4,-8,  9, -7];
1352     assert(R.array == correct);
1353 }
1354 
1355 /// Compare packed signed 32-bit integers in `a` and `b`, returns packed minimum values.
1356 __m128i _mm_min_epi32 (__m128i a, __m128i b) pure @trusted
1357 {
1358     // PERF DMD
1359     static if (GDC_with_SSE41)
1360     {
1361         return cast(__m128i) __builtin_ia32_pminsd128(cast(int4)a, cast(int4)b);
1362     }
1363     else version(LDC)
1364     {
1365         // x86: pminsd since LDC 1.1 -O1, also good without sse4.1
1366         // ARM: smin.4s since LDC 1.8 -O1
1367         int4 sa = cast(int4)a;
1368         int4 sb = cast(int4)b;
1369         static if (SIMD_COMPARISON_MASKS_16B)
1370             int4 greater = sa > sb;
1371         else
1372             int4 greater = greaterMask!int4(sa, sb);
1373         return cast(__m128i)( (~greater & sa) | (greater & sb) );
1374     }
1375     else
1376     {
1377         __m128i higher = _mm_cmplt_epi32(a, b);
1378         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
1379         __m128i mask = _mm_and_si128(aTob, higher);
1380         return _mm_xor_si128(b, mask);
1381     }
1382 }
1383 unittest
1384 {
1385     int4 R = cast(int4) _mm_min_epi32(_mm_setr_epi32(0x7fffffff,  1, -4, 7),
1386                                       _mm_setr_epi32(        -4, -8,  9, -8));
1387     int[4] correct =                               [         -4, -8, -4, -8];
1388     assert(R.array == correct);
1389 }
1390 
1391 /// Compare packed signed 8-bit integers in `a` and `b`, 
1392 /// and return packed minimum values.
1393 __m128i _mm_min_epi8 (__m128i a, __m128i b) pure @trusted
1394 {
1395     // PERF DMD
1396     static if (GDC_with_SSE41)
1397     {
1398         return cast(__m128i) __builtin_ia32_pminsb128(cast(ubyte16)a, cast(ubyte16)b);
1399     }
1400     else version(LDC)
1401     {
1402         // x86: pminsb since LDC 1.1 -O1
1403         // ARM64: smin.16b since LDC 1.8.0 -O1
1404         byte16 sa = cast(byte16)a;
1405         byte16 sb = cast(byte16)b;
1406         static if (SIMD_COMPARISON_MASKS_16B)
1407             byte16 greater = sa > sb;
1408         else
1409             byte16 greater = cast(byte16) greaterMask!byte16(sa, sb);
1410         return cast(__m128i)( (~greater & sa) | (greater & sb) );
1411     }
1412     else
1413     {
1414         __m128i lower = _mm_cmplt_epi8(a, b); // ones where a should be selected, b else
1415         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
1416         __m128i mask = _mm_and_si128(aTob, lower);
1417         return _mm_xor_si128(b, mask);
1418     }
1419 }
1420 unittest
1421 {
1422     __m128i A = _mm_setr_epi8(127,  1, -4, -8, 9,    7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0);
1423     __m128i B = _mm_setr_epi8(  4, -8,  9, -7, 0, -128, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0);
1424     byte16 R = cast(byte16) _mm_min_epi8(A, B);
1425     byte[16] correct =       [  4, -8, -4, -8, 0, -128, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0];
1426     assert(R.array == correct);
1427 }
1428 
1429 /// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst.
1430 __m128i _mm_min_epu16 (__m128i a, __m128i b) pure @trusted
1431 {
1432     // PERF DMD
1433     static if (GDC_with_SSE41)
1434     {
1435         return cast(__m128i) __builtin_ia32_pminuw128(cast(short8)a, cast(short8)b);
1436     }
1437     else version(LDC)
1438     {
1439         // x86: pminuw since LDC 1.1 -O1, psubusw+psubw sequence without sse4.1
1440         // ARM64: umin.8h since LDC 1.8.0 -O1
1441         ushort8 sa = cast(ushort8)a;
1442         ushort8 sb = cast(ushort8)b;
1443         static if (SIMD_COMPARISON_MASKS_16B)
1444             ushort8 greater = (sb > sa);
1445         else
1446             ushort8 greater = cast(ushort8) greaterMask!ushort8(sb, sa);
1447         return cast(__m128i)( (greater & sa) | (~greater & sb) );
1448     }
1449     else
1450     {
1451         __m128i c = _mm_subs_epu16(b, a);
1452         b = _mm_sub_epi16(b, c);
1453         return b;
1454     }
1455 }
1456 unittest
1457 {
1458     short8 R = cast(short8) _mm_min_epu16(_mm_setr_epi16(32767,  1, -4, -8, 9,     7, 0, 57),
1459                                           _mm_setr_epi16(   -4, -8,  9, -7, 0,-32768, 0,  0));
1460     short[8] correct =                                  [32767,  1,  9, -8, 0,     7, 0,  0];
1461     assert(R.array == correct);
1462 }
1463 
1464 /// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst.
1465 __m128i _mm_min_epu32 (__m128i a, __m128i b) pure @trusted
1466 {
1467     // PERF DMD
1468     static if (GDC_with_SSE41)
1469     {
1470         return cast(__m128i) __builtin_ia32_pminud128(cast(int4)a, cast(int4)b);
1471     }
1472     else version(LDC)
1473     {
1474         // x86: pminud since LDC 1.1 -O1, also good without sse4.1
1475         // ARM64: umin.4s since LDC 1.8.0 -O1
1476         uint4 sa = cast(uint4)a;
1477         uint4 sb = cast(uint4)b;
1478         static if (SIMD_COMPARISON_MASKS_16B)
1479             uint4 greater = sa > sb;
1480         else
1481             uint4 greater = cast(uint4) greaterMask!uint4(sa, sb);
1482         return cast(__m128i)( (~greater & sa) | (greater & sb) );
1483     }
1484     else
1485     {
1486         // PERF: same remark as in _mm_max_epu32
1487         __m128i valueShift = _mm_set1_epi32(-0x80000000);
1488         __m128i higher = _mm_cmpgt_epi32(_mm_add_epi32(b, valueShift), _mm_add_epi32(a, valueShift));
1489         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
1490         __m128i mask = _mm_and_si128(aTob, higher);
1491         return _mm_xor_si128(b, mask);
1492     }
1493 }
1494 unittest
1495 {
1496     int4 R = cast(int4) _mm_min_epu32(_mm_setr_epi32(0x7fffffff, 1,  4, -7),
1497                                       _mm_setr_epi32(        -4,-8,  9, -8));
1498     int[4] correct =                                [0x7fffffff, 1,  4, -8];
1499     assert(R.array == correct);
1500 }
1501 
1502 /// Horizontally compute the minimum amongst the packed unsigned 16-bit integers in `a`, 
1503 /// store the minimum and index in return value, and zero the remaining bits.
1504 __m128i _mm_minpos_epu16 (__m128i a) @trusted
1505 {
1506     // PERF DMD
1507     static if (GDC_with_SSE41)
1508     {
1509         return cast(__m128i) __builtin_ia32_phminposuw128(cast(short8)a);
1510     }
1511     else static if (LDC_with_SSE41)
1512     {
1513         return cast(__m128i) __builtin_ia32_phminposuw128(cast(short8)a);
1514     }
1515     else static if (LDC_with_ARM64)
1516     {
1517         __m128i indices = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
1518         __m128i combinedLo = _mm_unpacklo_epi16(indices, a);
1519         __m128i combinedHi = _mm_unpackhi_epi16(indices, a);
1520         __m128i best = _mm_min_epu32(combinedLo, combinedHi);
1521         best = _mm_min_epu32(best, _mm_srli_si128!8(best));
1522         best = _mm_min_epu32(best, _mm_srli_si128!4(best));
1523         short8 sbest = cast(short8)best;
1524         short8 r;
1525         r[0] = sbest[1];
1526         r[1] = sbest[0]; // Note: the index was packed below the value so the lowest index wins ties; value and index therefore come out swapped here
1527         r[2] = 0;
1528         r[3] = 0;
1529         r[4] = 0;
1530         r[5] = 0;
1531         r[6] = 0;
1532         r[7] = 0;
1533         return cast(__m128i)r;
1534     }
1535     else
1536     {
1537         short8 sa = cast(short8)a;
1538         ushort min = 0xffff;
1539         int index = 0;
1540         for(int n = 0; n < 8; ++n)
1541         {
1542             ushort c = sa.array[n];
1543             if (c < min)
1544             {
1545                 min = c;
1546                 index = n;
1547             }
1548         }
1549         short8 r;
1550         r.ptr[0] = min;
1551         r.ptr[1] = cast(short)index;
1552         return cast(__m128i)r;
1553     }
1554 }
1555 unittest
1556 {
1557     __m128i A = _mm_setr_epi16(14, 15, 1, 2, -3, 4, 5, 6);
1558     __m128i B = _mm_setr_epi16(14,  4, 4, 2, -3, 2, 5, 6);
1559     short8 R1 = cast(short8) _mm_minpos_epu16(A);
1560     short8 R2 = cast(short8) _mm_minpos_epu16(B);
1561     short[8] correct1 = [1, 2, 0, 0, 0, 0, 0, 0];
1562     short[8] correct2 = [2, 3, 0, 0, 0, 0, 0, 0];
1563     assert(R1.array == correct1);
1564     assert(R2.array == correct2);
1565 }
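// Usage sketch (illustrative): the minimum lands in lane 0 and its index in lane 1, so both can
// be read back with the SSE2 `_mm_extract_epi16`:
unittest
{
    __m128i v = _mm_setr_epi16(9, 4, 7, 4, 8, 9, 9, 9);
    __m128i r = _mm_minpos_epu16(v);
    assert(_mm_extract_epi16(r, 0) == 4); // minimum value
    assert(_mm_extract_epi16(r, 1) == 1); // index of its first occurrence
}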
1566 
1567 /// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers 
1568 /// in `a` compared to those in `b`, and store the 16-bit results in dst. 
1569 /// Eight SADs are performed using one quadruplet from `b` and eight quadruplets from `a`. 
1570 /// One quadruplet is selected from `b` starting at the offset specified in `imm8[1:0]`. 
1571 /// Eight quadruplets are formed from sequential 8-bit integers selected from `a` starting 
1572 /// at the offset specified in `imm8[2]`.
1573 __m128i _mm_mpsadbw_epu8(int imm8)(__m128i a, __m128i b) @trusted
1574 {
1575     // PERF DMD
1576     static if (GDC_with_SSE41)
1577     {
1578         return cast(__m128i) __builtin_ia32_mpsadbw128(cast(ubyte16)a, cast(ubyte16)b, cast(ubyte)imm8);  
1579     }
1580     else static if (LDC_with_SSE41)
1581     {
1582         return cast(__m128i) __builtin_ia32_mpsadbw128(cast(byte16)a, cast(byte16)b, cast(byte)imm8);
1583     }
1584     else
1585     {
1586         int a_offset = ((imm8 & 4) >> 2) * 4; // Yes, the two high-order quadruplets are unaddressable...
1587         int b_offset = (imm8 & 3) * 4;
1588 
1589         byte16 ba = cast(byte16)a;
1590         byte16 bb = cast(byte16)b;
1591         short8 r;
1592 
1593         __m128i comp_b = _mm_setr_epi32(b.array[imm8 & 3], 0, b.array[imm8 & 3], 0);
1594 
1595         for (int j = 0; j < 8; j += 2)
1596         {
1597             int k = a_offset + j;
1598             __m128i comp_a = _mm_setr_epi8(ba[k+0], ba[k+1], ba[k+2], ba[k+3],
1599                                            0, 0, 0, 0, 
1600                                            ba[k+1], ba[k+2], ba[k+3], ba[k+4],
1601                                            0, 0, 0, 0);
1602             short8 diffs = cast(short8) _mm_sad_epu8(comp_a, comp_b); // reusing this saves instructions on both x86 and arm64
1603             r.ptr[j] = diffs.array[0];
1604             r.ptr[j+1] = diffs.array[4];
1605         }
1606         return cast(__m128i)r;
1607     }
1608 }
1609 unittest
1610 {
1611     __m128i A = _mm_setr_epi8(0, 1, 2, 3,  4,  5, 6,  7, 8, 9, 10, 11, 12, 13, 14, 15);
1612     __m128i B = _mm_setr_epi8(9, 1, 2, 3, -1, -1, 0, -1, 5, 5,  5,  5, 12, 13, 14, 15);
1613     short[8] correct0 = [9, 11, 13, 15, 17, 19, 21, 23];
1614     short[8] correct1 = [763, 761, 759, 757, 755, 753, 751, 749];
1615     short[8] correct4 = [17, 19, 21, 23, 25, 27, 31, 35];
1616     short[8] correct5 = [755, 753, 751, 749, 747, 745, 743, 741];
1617     short[8] correct7 = [32, 28, 24, 20, 16, 12, 8, 4];
1618     short8 r1 = cast(short8) _mm_mpsadbw_epu8!1(A, B);
1619     short8 r4 = cast(short8) _mm_mpsadbw_epu8!4(A, B);
1620     short8 r5 = cast(short8) _mm_mpsadbw_epu8!5(A, B);
1621     short8 r7 = cast(short8) _mm_mpsadbw_epu8!7(A, B);
1622     short8 r8 = cast(short8) _mm_mpsadbw_epu8!8(A, B);
1623     assert(r1.array == correct1);
1624     assert(r4.array == correct4);
1625     assert(r5.array == correct5);
1626     assert(r7.array == correct7);
1627     assert(r8.array == correct0);
1628 }
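// To make the quadruplet indexing concrete, here is a scalar sketch of the same
// computation for imm8 = 1 (so b_offset = 4 and a_offset = 0): r[j] is the sum of
// |a[j+i] - b[4+i]| for i = 0..3, with bytes treated as unsigned.
unittest
{
    __m128i A = _mm_setr_epi8(0, 1, 2, 3,  4,  5, 6,  7, 8, 9, 10, 11, 12, 13, 14, 15);
    __m128i B = _mm_setr_epi8(9, 1, 2, 3, -1, -1, 0, -1, 5, 5,  5,  5, 12, 13, 14, 15);
    byte16 ba = cast(byte16)A;
    byte16 bb = cast(byte16)B;
    short8 expected;
    for (int j = 0; j < 8; ++j)
    {
        int sum = 0;
        for (int i = 0; i < 4; ++i)
        {
            int va = cast(ubyte) ba.array[j + i]; // a_offset = 0 since imm8[2] == 0
            int vb = cast(ubyte) bb.array[4 + i]; // b_offset = 4 since imm8[1:0] == 1
            sum += (va > vb) ? (va - vb) : (vb - va);
        }
        expected.ptr[j] = cast(short) sum;
    }
    short8 R = cast(short8) _mm_mpsadbw_epu8!1(A, B);
    assert(R.array == expected.array);
}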
1629 
1630 /// Multiply the low signed 32-bit integers from each packed 64-bit element in `a` and `b`, and store the signed 64-bit results in dst.
1631 __m128i _mm_mul_epi32 (__m128i a, __m128i b) pure @trusted
1632 {
1633     // PERF DMD
1634     static if (GDC_with_SSE41)
1635     {
1636         return cast(__m128i) __builtin_ia32_pmuldq128(cast(int4)a, cast(int4)b);
1637     }
1638     else static if (LDC_with_SSE41 && LDC_with_optimizations)
1639     {
1640         // For some reason, clang has the builtin but it's not in IntrinsicsX86.td
1641         // Use IR instead.
1642         // This generates pmuldq since LDC 1.2.0 -O0
1643         enum ir = `
1644             %ia = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 2>
1645             %ib = shufflevector <4 x i32> %1,<4 x i32> %1, <2 x i32> <i32 0, i32 2>
1646             %la = sext <2 x i32> %ia to <2 x i64>
1647             %lb = sext <2 x i32> %ib to <2 x i64>
1648             %r = mul <2 x i64> %la, %lb
1649             ret <2 x i64> %r`;
1650         return cast(__m128i) LDCInlineIR!(ir, long2, int4, int4)(cast(int4)a, cast(int4)b);
1651     }
1652     else static if (LDC_with_ARM64)  
1653     {
1654         // 3 instructions since LDC 1.8 -O2
1655         // But had to make vmull_s32 be a builtin else it wouldn't optimize to smull
1656         int2 a_lo = vmovn_s64(cast(long2)a);
1657         int2 b_lo = vmovn_s64(cast(long2)b);
1658         return cast(__m128i) vmull_s32(a_lo, b_lo);
1659     }
1660     else
1661     {
1662         int4 ia = cast(int4)a;
1663         int4 ib = cast(int4)b;
1664         long2 r;
1665         r.ptr[0] = cast(long)ia.array[0] * ib.array[0];
1666         r.ptr[1] = cast(long)ia.array[2] * ib.array[2];
1667         return cast(__m128i)r;
1668     }
1669 }
1670 unittest
1671 {
1672     __m128i A = _mm_setr_epi32(61616461, 1915324654, 4564061, 3);
1673     __m128i B = _mm_setr_epi32(49716422, -915616216, -121144, 0);
1674     long2 R = cast(long2) _mm_mul_epi32(A, B);
1675     long[2] correct = [cast(long)61616461 * 49716422, cast(long)4564061 * -121144];
1676     assert(R.array == correct);
1677 }
1678 
1679 /// Multiply the packed 32-bit integers in `a` and `b`, producing intermediate 64-bit integers, 
1680 /// return the low 32 bits of the intermediate integers.
1681 __m128i _mm_mullo_epi32 (__m128i a, __m128i b) @trusted
1682 {
1683     // PERF DMD
1684     // PERF GDC without SSE4.1 could be better
1685     static if (GDC_with_SSE41)
1686     {
1687         int4 ia = cast(int4)a;
1688         int4 ib = cast(int4)b;
1689         // Note: older GDC doesn't have this vector operation, but such older
1690         // GDC versions also have no support for -msse4.1 detection
1691         return cast(__m128i)(ia * ib);
1692     }
1693     else version(LDC)
1694     {
1695         int4 ia = cast(int4)a;
1696         int4 ib = cast(int4)b;
1697         return cast(__m128i)(ia * ib);
1698     }
1699     else
1700     {
1701         // DMD doesn't accept the vector multiply above
1702         int4 ia = cast(int4)a;
1703         int4 ib = cast(int4)b;
1704         int4 r;
1705         r.ptr[0] = ia.array[0] * ib.array[0];
1706         r.ptr[1] = ia.array[1] * ib.array[1];
1707         r.ptr[2] = ia.array[2] * ib.array[2];
1708         r.ptr[3] = ia.array[3] * ib.array[3];
1709         return r;
1710     }
1711 }
1712 unittest
1713 {
1714     __m128i A = _mm_setr_epi32(61616461, 1915324654, 4564061, 3);
1715     __m128i B = _mm_setr_epi32(49716422, -915616216, -121144, 0);
1716     int4 R = cast(int4) _mm_mullo_epi32(A, B);
1717     int[4] correct = [cast(int)0xBF370D8E, cast(int)(1915324654 * -915616216), cast(int)(4564061 * -121144), 0];
1718     assert(R.array == correct);
1719 }
1720 
1721 
1722 /// Convert packed signed 32-bit integers from `a` and `b` 
1723 /// to packed 16-bit integers using unsigned saturation.
1724 __m128i _mm_packus_epi32 (__m128i a, __m128i b) pure @trusted
1725 {
1726     static if (GDC_with_SSE41)
1727     {
1728         return cast(__m128i) __builtin_ia32_packusdw128(cast(short8)a, cast(short8)b);
1729     }
1730     else static if (LDC_with_SSE41)
1731     {
1732         return cast(__m128i) __builtin_ia32_packusdw128(cast(short8)a, cast(short8)b);
1733     }
1734     else static if (LDC_with_ARM64)
1735     {
1736        int4 z;
1737        z = 0;       
1738        return cast(__m128i) vcombine_u16(vqmovn_u32(vmaxq_s32(z, cast(int4)a)),
1739                                          vqmovn_u32(vmaxq_s32(z, cast(int4)b)));
1740     }
1741     else
1742     {
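        // Bias trick: shift the signed range down by 32768, use signed saturation
        // (_mm_packs_epi32), then shift back up by adding -32768 as a 16-bit value.
        // Inputs in [0, 65535] pass through unchanged, negative inputs clamp to 0,
        // and inputs above 65535 clamp to 65535.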
1743         __m128i i32768 = _mm_set1_epi32(32768);
1744         __m128i s32768 = _mm_set1_epi16(-32768);
1745         a = _mm_sub_epi32(a, i32768);
1746         b = _mm_sub_epi32(b, i32768);
1747         __m128i clampedSigned = _mm_packs_epi32(a, b);
1748         return _mm_add_epi16(clampedSigned, s32768);
1749     }
1750 }
1751 unittest
1752 {
1753     __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0);
1754     short8 R = cast(short8) _mm_packus_epi32(A, A);
1755     short[8] correct = [cast(short)65535, 0, 1000, 0, cast(short)65535, 0, 1000, 0];
1756     assert(R.array == correct);
1757 }
1758 
1759 
1760 /// Round the packed double-precision (64-bit) floating-point elements in `a` using the 
1761 /// rounding parameter, and store the results as packed double-precision floating-point elements.
1762 /// Rounding is done according to the rounding[3:0] parameter, which can be one of:
1763 ///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
1764 ///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
1765 ///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
1766 ///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
1767 ///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
1768 __m128d _mm_round_pd(int rounding)(__m128d a) @trusted
1769 {
1770     // PERF DMD
1771     static if (GDC_with_SSE41)
1772     {
1773         return __builtin_ia32_roundpd(a, rounding);
1774     }
1775     else static if (LDC_with_SSE41)
1776     {
1777         return __builtin_ia32_roundpd(a, rounding);
1778     }
1779     else
1780     {
1781         static if (rounding & _MM_FROUND_CUR_DIRECTION)
1782         {
1783             // Convert to 64-bit integers
1784             long lo = _mm_cvtsd_si64(a);
1785             a.ptr[0] = a.array[1];
1786             long hi = _mm_cvtsd_si64(a);
1787             return _mm_setr_pd(lo, hi);
1788         }
1789         else
1790         {
1791             version(GNU) pragma(inline, false); // else the unittest fails with optimizations
1792 
1793             uint old = _MM_GET_ROUNDING_MODE();
1794             _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
1795             
1796             // Convert to 64-bit integers
1797             long lo = _mm_cvtsd_si64(a);
1798             a.ptr[0] = a.array[1];
1799             long hi = _mm_cvtsd_si64(a);
1800 
1801             // Convert back to double to achieve the rounding
1802             // The problem is that a 64-bit double can't represent all the values 
1803             // a 64-bit integer can (and vice-versa). So this function won't work for
1804             // large values. (TODO: what range exactly?)
1805             _MM_SET_ROUNDING_MODE(old);
1806             return _mm_setr_pd(lo, hi);
1807         }
1808     }
1809 }
1810 unittest
1811 {
1812     // tested in other intrinsics
1813 }
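// A minimal usage sketch of the rounding-mode parameter (values per the
// _MM_FROUND_* constants defined in this module):
unittest
{
    __m128d A = _mm_setr_pd(1.3, -2.7);
    __m128d down = _mm_round_pd!(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)(A);
    __m128d up   = _mm_round_pd!(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)(A);
    double[2] correctDown = [1.0, -3.0];
    double[2] correctUp   = [2.0, -2.0];
    assert(down.array == correctDown);
    assert(up.array == correctUp);
}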
1814 
1815 /// Round the packed single-precision (32-bit) floating-point elements in `a` using the 
1816 /// rounding parameter, and store the results as packed single-precision floating-point elements.
1817 /// Rounding is done according to the rounding[3:0] parameter, which can be one of:
1818 ///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
1819 ///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
1820 ///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
1821 ///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
1822 ///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
1823 __m128 _mm_round_ps(int rounding)(__m128 a) @trusted
1824 {
1825     // PERF ARM64: there is duplication because this isn't optimal for ARM64, so it is avoided externally
1826     static if (GDC_or_LDC_with_SSE41)
1827     {
1828         return __builtin_ia32_roundps(a, rounding);
1829     }
1830     else
1831     {
1832         static if (rounding & _MM_FROUND_CUR_DIRECTION)
1833         {
1834             __m128i integers = _mm_cvtps_epi32(a);
1835             return _mm_cvtepi32_ps(integers);
1836         }
1837         else
1838         {
1839             version(LDC) pragma(inline, false); // else _MM_SET_ROUNDING_MODE and _mm_cvtps_epi32 get shuffled
1840             uint old = _MM_GET_ROUNDING_MODE();
1841             _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
1842             scope(exit) _MM_SET_ROUNDING_MODE(old);
1843 
1844             // Convert to 32-bit integers
1845             __m128i integers = _mm_cvtps_epi32(a);
1846 
1847             // Convert back to float to achieve the rounding
1848             // The problem is that a 32-bit float can't represent all the values 
1849             // a 32-bit integer can (and vice-versa). So this function won't work for
1850             // large values. (TODO: what range exactly?)
1851             __m128 result = _mm_cvtepi32_ps(integers);
1852 
1853             return result;
1854         }
1855     }
1856 }
1857 unittest
1858 {
1859     // tested in other intrinsics
1860 }
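// Likewise for the packed single-precision variant, a brief sketch:
unittest
{
    __m128 A = _mm_setr_ps(1.5f, -1.5f, 2.5f, -2.5f);
    __m128 r = _mm_round_ps!(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)(A);
    float[4] correct = [1.0f, -1.0f, 2.0f, -2.0f];
    assert(r.array == correct);
}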
1861 
1862 
1863 /// Round the lower double-precision (64-bit) floating-point element in `b` using the
1864 /// rounding parameter, store the result as a double-precision floating-point element 
1865 /// in the lower element of result, and copy the upper element from `a` to the upper element of result.
1866 /// Rounding is done according to the rounding[3:0] parameter, which can be one of:
1867 ///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
1868 ///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
1869 ///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
1870 ///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
1871 ///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
1872 __m128d _mm_round_sd(int rounding)(__m128d a, __m128d b) @trusted
1873 {
1874     static if (GDC_with_SSE41)
1875     {
1876         return __builtin_ia32_roundsd(a, b, rounding);
1877     }
1878     else static if (LDC_with_SSE41)
1879     {
1880         return __builtin_ia32_roundsd(a, b, rounding);
1881     }
1882     else
1883     {
1884         static if (rounding & _MM_FROUND_CUR_DIRECTION)
1885         {
1886             // Convert to 64-bit integer
1887             long b0 = _mm_cvtsd_si64(b);
1888             a.ptr[0] = b0;
1889             return a;
1890         }
1891         else
1892         {
1893             version(GNU) pragma(inline, false); // else the unittest fails with optimizations
1894 
1895             uint old = _MM_GET_ROUNDING_MODE();
1896             _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
1897             
1898             // Convert to 64-bit integer
1899             long b0 = _mm_cvtsd_si64(b);
1900             a.ptr[0] = b0;
1901 
1902             // Convert back to double to achieve the rounding
1903             // The problem is that a 64-bit double can't represent all the values 
1904             // a 64-bit integer can (and vice-versa). So this function won't work for
1905             // large values. (TODO: what range exactly?)
1906             _MM_SET_ROUNDING_MODE(old);
1907             return a;
1908         }
1909     }
1910 }
1911 unittest
1912 {
1913     // tested in other intrinsics
1914 }
1915 
1916 
1917 /// Round the lower single-precision (32-bit) floating-point element in `b` using the 
1918 /// rounding parameter, store the result as a single-precision floating-point element 
1919 /// in the lower element of result, and copy the upper 3 packed elements from `a`
1920 /// to the upper elements of result.
1921 /// Rounding is done according to the rounding[3:0] parameter, which can be one of:
1922 ///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
1923 ///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
1924 ///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
1925 ///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
1926 ///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
1927 __m128 _mm_round_ss(int rounding)(__m128 a, __m128 b) @trusted
1928 {
1929     static if (GDC_with_SSE41)
1930     {
1931         return __builtin_ia32_roundss(a, b, rounding);
1932     }
1933     else static if (LDC_with_SSE41)
1934     {
1935         return __builtin_ia32_roundss(a, b, rounding);
1936     }
1937     else
1938     {
1939         static if (rounding & _MM_FROUND_CUR_DIRECTION)
1940         {
1941             int b0 = _mm_cvtss_si32(b);
1942             a.ptr[0] = b0;   
1943             return a;
1944         }
1945         else version(GNU)
1946         {
1947             pragma(inline, false)
1948             __m128 GDCworkaround() nothrow @nogc @trusted 
1949             {
1950                 uint old = _MM_GET_ROUNDING_MODE();
1951                 _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
1952 
1953                 // Convert to 32-bit integer
1954                 int b0 = _mm_cvtss_si32(b);
1955                 a.ptr[0] = b0;       
1956 
1957                 // Convert back to float to achieve the rounding
1958                 // The problem is that a 32-bit float can't represent all the values 
1959                 // a 32-bit integer can (and vice-versa). So this function won't work for
1960                 // large values. (TODO: what range exactly?)
1961                 _MM_SET_ROUNDING_MODE(old);
1962                 return a;
1963             }
1964             return GDCworkaround();
1965         }
1966         else
1967         {
1968             uint old = _MM_GET_ROUNDING_MODE();
1969             _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
1970 
1971             // Convert to 32-bit integer
1972             int b0 = _mm_cvtss_si32(b);
1973             a.ptr[0] = b0;       
1974 
1975             // Convert back to float to achieve the rounding
1976             // The problem is that a 32-bit float can't represent all the values 
1977             // a 32-bit integer can (and vice-versa). So this function won't work for
1978             // large values. (TODO: what range exactly?)
1979             _MM_SET_ROUNDING_MODE(old);
1980             return a;
1981         }
1982     }
1983 }
1984 unittest
1985 {
1986     // tested in other intrinsics
1987 }
1988 
1989 
1990 /// Load 128-bits of integer data from memory using a non-temporal memory hint. 
1991 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection 
1992 /// exception may be generated.
1993 __m128i _mm_stream_load_si128 (__m128i * mem_addr) pure @trusted
1994 {
1995     // PERF DMD D_SIMD
1996     static if (GDC_with_SSE41)
1997     {
1998         return cast(__m128i) __builtin_ia32_movntdqa(cast(long2*)mem_addr);
1999     }
2000     else static if (LDC_with_InlineIREx && LDC_with_optimizations)
2001     {
2002         enum prefix = `!0 = !{ i32 1 }`;
2003         enum ir = `
2004             %r = load <4 x i32>, <4 x i32>* %0, !nontemporal !0
2005             ret <4 x i32> %r`;
2006         return cast(__m128i) LDCInlineIREx!(prefix, ir, "", int4, int4*)(mem_addr);
2007     }
2008     else
2009     {
2010         return *mem_addr; // regular move instead
2011     }
2012 }
2013 unittest
2014 {
2015     align(16) static immutable int[4] correct = [1, 2, 3, 4];
2016     __m128i A = _mm_stream_load_si128(cast(__m128i*)(correct.ptr));
2017     _mm_mfence();
2018     assert(A.array == correct);
2019 }
2020 
2021 /// Return 1 if all bits in `a` are 1's. Else return 0.
2022 int _mm_test_all_ones (__m128i a) @safe
2023 {
2024     return _mm_testc_si128(a, _mm_set1_epi32(-1));
2025 }
2026 unittest
2027 {
2028     __m128i A = _mm_set1_epi32(-1);
2029     __m128i B = _mm_set_epi32(-1, -2, -1, -1);
2030     assert(_mm_test_all_ones(A) == 1);
2031     assert(_mm_test_all_ones(B) == 0);
2032 }
2033 
2034 /// Return 1 if all bits in `a` are 0's. Else return 0.
2035 // This is a #BONUS since it was lacking in Intel Intrinsics API.
2036 int _mm_test_all_zeros (__m128i a) @safe
2037 {
2038     return _mm_testz_si128(a, _mm_set1_epi32(-1));
2039 }
2040 unittest
2041 {
2042     __m128i A = _mm_set1_epi32(0);
2043     __m128i B = _mm_set_epi32(0, 8, 0, 0);
2044     assert(_mm_test_all_zeros(A) == 1);
2045     assert(_mm_test_all_zeros(B) == 0);
2046 }
2047 
2048 /// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `mask`, 
2049 /// and return 1 if the result is zero, otherwise return 0.
2050 int _mm_test_all_zeros (__m128i a, __m128i mask) @safe
2051 {
2052     return _mm_testz_si128(a, mask); // it's really the same, but with a good name
2053 }
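// A small illustration of the masked variant:
unittest
{
    __m128i A    = _mm_setr_epi32(0x10, 0, 0, 0);
    __m128i mask = _mm_setr_epi32(0x0F, 0, 0, 0);
    assert(_mm_test_all_zeros(A, mask) == 1); // A & mask == 0
    assert(_mm_test_all_zeros(A, A) == 0);    // A & A != 0
}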
2054 
2055 /// Compute the bitwise AND of 128 bits (representing integer data) in `a` and mask, and set ZF to 1 
2056 /// if the result is zero, otherwise set ZF to 0. Compute the bitwise NOT of a and then AND with 
2057 /// mask, and set CF to 1 if the result is zero, otherwise set CF to 0. Return 1 if both the ZF and
2058 /// CF values are zero, otherwise return 0.
2059 int _mm_test_mix_ones_zeros (__m128i a, __m128i mask) @trusted
2060 {
2061     return _mm_testnzc_si128(a, mask);
2062 }
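// A small illustration: the result is 1 only when `mask` selects both some set
// and some cleared bits of `a`.
unittest
{
    __m128i A      = _mm_setr_epi32(0x0F, 0, 0, 0);
    __m128i mixed  = _mm_setr_epi32(0x33, 0, 0, 0); // overlaps A and its complement
    __m128i inside = _mm_setr_epi32(0x03, 0, 0, 0); // fully contained in A
    assert(_mm_test_mix_ones_zeros(A, mixed) == 1);
    assert(_mm_test_mix_ones_zeros(A, inside) == 0);
}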
2063 
2064 /// Compute the bitwise NOT of a and then AND with b, and return 1 if the 
2065 /// result is zero, otherwise return 0.
2066 /// In other words, test if all bits masked by `b` are 1 in `a`.
2067 int _mm_testc_si128 (__m128i a, __m128i b) pure @trusted
2068 {
2069     // PERF DMD
2070     static if (GDC_with_SSE41)
2071     {
2072         return __builtin_ia32_ptestc128(cast(long2)a, cast(long2)b);
2073     }
2074     else static if (LDC_with_SSE41)
2075     {
2076         return __builtin_ia32_ptestc128(cast(long2)a, cast(long2)b);
2077     }
2078     else static if (LDC_with_ARM64)
2079     {
2080         // Acceptable since LDC 1.8 -O2
2081         long2 s64 = vbicq_s64(cast(long2)b, cast(long2)a);
2082         return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
2083     }
2084     else
2085     {
2086         __m128i c = ~a & b;
2087         int[4] zero = [0, 0, 0, 0];
2088         return c.array == zero;
2089     }
2090 }
2091 unittest
2092 {
2093     __m128i A  = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
2094     __m128i M1 = _mm_setr_epi32(0xfe, 0xfd, 0x00, 0x00);
2095     __m128i M2 = _mm_setr_epi32(0x00, 0x00, 0x04, 0x00);
2096     assert(_mm_testc_si128(A, A) == 1);
2097     assert(_mm_testc_si128(A, M1) == 0);
2098     assert(_mm_testc_si128(A, M2) == 1);
2099 }
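// Usage sketch: _mm_testc_si128 as a "contains all mask bits" test.
unittest
{
    __m128i flags    = _mm_setr_epi32(0x0F, 0, 0, 0);
    __m128i required = _mm_setr_epi32(0x05, 0, 0, 0);
    assert(_mm_testc_si128(flags, required) == 1); // every bit of `required` is set in `flags`
    required = _mm_setr_epi32(0x15, 0, 0, 0);
    assert(_mm_testc_si128(flags, required) == 0); // bit 0x10 is not set in `flags`
}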
2100 
2101 /// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`, 
2102 /// and set ZF to 1 if the result is zero, otherwise set ZF to 0. 
2103 /// Compute the bitwise NOT of `a` and then AND with `b`, and set CF to 1 if the 
2104 /// result is zero, otherwise set CF to 0. 
2105 /// Return 1 if both the ZF and CF values are zero, otherwise return 0.
2106 int _mm_testnzc_si128 (__m128i a, __m128i b) @trusted
2107 {
2108     // PERF DMD
2109     static if (GDC_with_SSE41)
2110     {
2111         return __builtin_ia32_ptestnzc128(cast(long2)a, cast(long2)b);
2112     }
2113     else static if (LDC_with_SSE41)
2114     {
2115         return __builtin_ia32_ptestnzc128(cast(long2)a, cast(long2)b);
2116     }
2117     else static if (LDC_with_ARM64)
2118     {
2119         long2 s640 = vandq_s64(cast(long2)b, cast(long2)a);
2120         long2 s641 = vbicq_s64(cast(long2)b, cast(long2)a);
2121 
2122         return !( !(vgetq_lane_s64(s641, 0) | vgetq_lane_s64(s641, 1))
2123                 | !(vgetq_lane_s64(s640, 0) | vgetq_lane_s64(s640, 1)) );
2124     }
2125     else
2126     {
2127         __m128i c = a & b;
2128         __m128i d = ~a & b;
2129         int[4] zero = [0, 0, 0, 0];
2130         return !( (c.array == zero) || (d.array == zero));
2131     }    
2132 }
2133 unittest
2134 {
2135     __m128i A  = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
2136     __m128i M  = _mm_setr_epi32(0x01, 0x40, 0x00, 0x00);
2137     __m128i Z = _mm_setzero_si128();
2138     assert(_mm_testnzc_si128(A, Z) == 0);
2139     assert(_mm_testnzc_si128(A, M) == 1);
2140     assert(_mm_testnzc_si128(A, A) == 0);
2141 }
2142 
2143 /// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`, 
2144 /// and return 1 if the result is zero, otherwise return 0.
2145 /// In other words, test if all bits masked by `b` are 0 in `a`.
2146 int _mm_testz_si128 (__m128i a, __m128i b) @trusted
2147 {
2148     // PERF DMD
2149     static if (GDC_with_SSE41)
2150     {
2151         return __builtin_ia32_ptestz128(cast(long2)a, cast(long2)b);
2152     }
2153     else static if (LDC_with_SSE41)
2154     {
2155         return __builtin_ia32_ptestz128(cast(long2)a, cast(long2)b);
2156     }
2157     else static if (LDC_with_ARM64)
2158     {
2159         // Acceptable since LDC 1.8 -O2
2160         long2 s64 = vandq_s64(cast(long2)a, cast(long2)b);
2161         return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
2162     }
2163     else 
2164     {
2165         __m128i c = a & b;
2166         int[4] zero = [0, 0, 0, 0];
2167         return c.array == zero;
2168     }    
2169 }
2170 unittest
2171 {
2172     __m128i A  = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
2173     __m128i M1 = _mm_setr_epi32(0xfe, 0xfd, 0x00, 0x07);
2174     __m128i M2 = _mm_setr_epi32(0x00, 0x00, 0x04, 0x00);
2175     assert(_mm_testz_si128(A, A) == 0);
2176     assert(_mm_testz_si128(A, M1) == 1);
2177     assert(_mm_testz_si128(A, M2) == 0);
2178 }
2179