1 /**
2 * SSE2 intrinsics.
3 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE2
4 *
5 * Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
6 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
7 */
8 module inteli.emmintrin;
9
10 public import inteli.types;
11 public import inteli.xmmintrin; // SSE2 includes SSE1
12 import inteli.mmx;
13 import inteli.internals;
14
15 nothrow @nogc:
16
17
18 // SSE2 instructions
19 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE2
20
21 /// Add packed 16-bit integers in `a` and `b`.
22 __m128i _mm_add_epi16 (__m128i a, __m128i b) pure @safe
23 {
24 pragma(inline, true);
25 return cast(__m128i)(cast(short8)a + cast(short8)b);
26 }
27 unittest
28 {
29 __m128i A = _mm_setr_epi16(4, 8, 13, -7, -1, 0, 9, 77);
30 short8 R = cast(short8) _mm_add_epi16(A, A);
31 short[8] correct = [8, 16, 26, -14, -2, 0, 18, 154];
32 assert(R.array == correct);
33 }
34
35 /// Add packed 32-bit integers in `a` and `b`.
36 __m128i _mm_add_epi32 (__m128i a, __m128i b) pure @safe
37 {
38 pragma(inline, true);
39 return cast(__m128i)(cast(int4)a + cast(int4)b);
40 }
41 unittest
42 {
43 __m128i A = _mm_setr_epi32( -7, -1, 0, 9);
44 int4 R = _mm_add_epi32(A, A);
45 int[4] correct = [ -14, -2, 0, 18 ];
46 assert(R.array == correct);
47 }
48
49 /// Add packed 64-bit integers in `a` and `b`.
50 __m128i _mm_add_epi64 (__m128i a, __m128i b) pure @safe
51 {
52 pragma(inline, true);
53 return cast(__m128i)(cast(long2)a + cast(long2)b);
54 }
55 unittest
56 {
57 __m128i A = _mm_setr_epi64(-1, 0x8000_0000_0000_0000);
58 long2 R = cast(long2) _mm_add_epi64(A, A);
59 long[2] correct = [ -2, 0 ];
60 assert(R.array == correct);
61 }
62
63 /// Add packed 8-bit integers in `a` and `b`.
64 __m128i _mm_add_epi8 (__m128i a, __m128i b) pure @safe
65 {
66 pragma(inline, true);
67 return cast(__m128i)(cast(byte16)a + cast(byte16)b);
68 }
69 unittest
70 {
71 __m128i A = _mm_setr_epi8(4, 8, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -1, 0, 9, 78);
72 byte16 R = cast(byte16) _mm_add_epi8(A, A);
73 byte[16] correct = [8, 16, 26, -14, -2, 0, 18, -102, 8, 16, 26, -14, -2, 0, 18, -100];
74 assert(R.array == correct);
75 }
76
77 /// Add the lower double-precision (64-bit) floating-point element
78 /// in `a` and `b`, store the result in the lower element of dst,
79 /// and copy the upper element from `a` to the upper element of destination.
80 __m128d _mm_add_sd(__m128d a, __m128d b) pure @safe
81 {
82 static if (DMD_with_DSIMD)
83 {
84 return cast(__m128d) __simd(XMM.ADDSD, a, b);
85 }
86 else static if (GDC_with_SSE2)
87 {
88 return __builtin_ia32_addsd(a, b);
89 }
90 else version(DigitalMars)
91 {
92 // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
93 // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
94 asm pure nothrow @nogc @trusted { nop;}
95 a[0] = a[0] + b[0];
96 return a;
97 }
98 else
99 {
100 a[0] += b[0];
101 return a;
102 }
103 }
104 unittest
105 {
106 __m128d a = [1.5, -2.0];
107 a = _mm_add_sd(a, a);
108 assert(a.array == [3.0, -2.0]);
109 }
110
111 /// Add packed double-precision (64-bit) floating-point elements in `a` and `b`.
112 __m128d _mm_add_pd (__m128d a, __m128d b) pure @safe
113 {
114 pragma(inline, true);
115 return a + b;
116 }
117 unittest
118 {
119 __m128d a = [1.5, -2.0];
120 a = _mm_add_pd(a, a);
121 assert(a.array == [3.0, -4.0]);
122 }
123
124 /// Add 64-bit integers `a` and `b`.
125 __m64 _mm_add_si64 (__m64 a, __m64 b) pure @safe
126 {
127 // PERF DMD
128 pragma(inline, true);
129 return a + b;
130 }
131
132 /// Add packed 16-bit integers in `a` and `b` using signed saturation.
133 __m128i _mm_adds_epi16(__m128i a, __m128i b) pure @trusted
134 {
135 static if (DMD_with_DSIMD)
136 {
137 return cast(__m128i) __simd(XMM.PADDSW, a, b);
138 }
139 else static if (GDC_with_SSE2)
140 {
141 return cast(__m128i) __builtin_ia32_paddsw128(cast(short8)a, cast(short8)b);
142 }
143 else static if(LDC_with_saturated_intrinsics)
144 {
145 return cast(__m128i) inteli_llvm_adds!short8(cast(short8)a, cast(short8)b);
146 }
147 else
148 {
149 short[8] res; // PERF =void;
150 short8 sa = cast(short8)a;
151 short8 sb = cast(short8)b;
152 foreach(i; 0..8)
153 res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]);
154 return _mm_loadu_si128(cast(int4*)res.ptr);
155 }
156 }
157 unittest
158 {
159 short8 res = cast(short8) _mm_adds_epi16(_mm_setr_epi16( 7, 6, 5, -32768, 3, 3, 32767, 0),
160 _mm_setr_epi16( 7, 6, 5, -30000, 3, 1, 1, -10));
161 static immutable short[8] correctResult = [14, 12, 10, -32768, 6, 4, 32767, -10];
162 assert(res.array == correctResult);
163 }
164
165 /// Add packed 8-bit signed integers in `a` and `b` using signed saturation.
166 __m128i _mm_adds_epi8(__m128i a, __m128i b) pure @trusted
167 {
168 static if (DMD_with_DSIMD)
169 {
170 return cast(__m128i) __simd(XMM.PADDSB, a, b);
171 }
172 else static if (GDC_with_SSE2)
173 {
174 return cast(__m128i) __builtin_ia32_paddsb128(cast(ubyte16)a, cast(ubyte16)b);
175 }
176 else static if(LDC_with_saturated_intrinsics)
177 {
178 return cast(__m128i) inteli_llvm_adds!byte16(cast(byte16)a, cast(byte16)b);
179 }
180 else
181 {
182 byte[16] res; // PERF =void;
183 byte16 sa = cast(byte16)a;
184 byte16 sb = cast(byte16)b;
185 foreach(i; 0..16)
186 res[i] = saturateSignedWordToSignedByte(sa[i] + sb[i]);
187 return _mm_loadu_si128(cast(int4*)res.ptr);
188 }
189 }
190 unittest
191 {
192 byte16 res = cast(byte16) _mm_adds_epi8(_mm_set_epi8(15, 14, 13, 12, 11, 127, 9, 8, 7, 6, 5, -128, 3, 2, 1, 0),
193 _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, -4, 3, 2, 1, 0));
194 static immutable byte[16] correctResult = [0, 2, 4, 6, -128, 10, 12, 14,
195 16, 18, 127, 22, 24, 26, 28, 30];
196 assert(res.array == correctResult);
197 }
198
199 /// Add packed 8-bit unsigned integers in `a` and `b` using unsigned saturation.
200 __m128i _mm_adds_epu8(__m128i a, __m128i b) pure @trusted
201 {
202 static if (DMD_with_DSIMD)
203 {
204 return cast(__m128i) __simd(XMM.PADDUSB, a, b);
205 }
206 else static if (GDC_with_SSE2)
207 {
208 return cast(__m128i) __builtin_ia32_paddusb128(cast(ubyte16)a, cast(ubyte16)b);
209 }
210 else static if(LDC_with_saturated_intrinsics)
211 {
212 return cast(__m128i) inteli_llvm_addus!byte16(cast(byte16)a, cast(byte16)b);
213 }
214 else
215 {
216 ubyte[16] res; // PERF =void;
217 byte16 sa = cast(byte16)a;
218 byte16 sb = cast(byte16)b;
219 foreach(i; 0..16)
220 res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]));
221 return _mm_loadu_si128(cast(int4*)res.ptr);
222 }
223 }
224 unittest
225 {
226 byte16 res = cast(byte16)
227 _mm_adds_epu8(_mm_set_epi8(7, 6, 5, 4, 3, 2, cast(byte)255, 0, 7, 6, 5, 4, 3, 2, cast(byte)255, 0),
228 _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0));
229 static immutable byte[16] correctResult = [0, cast(byte)255, 4, 6, 8, 10, 12, 14,
230 0, cast(byte)255, 4, 6, 8, 10, 12, 14];
231 assert(res.array == correctResult);
232 }
233
234 /// Add packed unsigned 16-bit integers in `a` and `b` using unsigned saturation.
235 __m128i _mm_adds_epu16(__m128i a, __m128i b) pure @trusted
236 {
237 static if (DMD_with_DSIMD)
238 {
239 // Note: DMD generates a reverted paddusw vs LDC and GDC, but that doesn't change the result anyway
240 return cast(__m128i) __simd(XMM.PADDUSW, a, b);
241 }
242 else static if (GDC_with_SSE2)
243 {
244 return cast(__m128i) __builtin_ia32_paddusw128(cast(short8)a, cast(short8)b);
245 }
246 else static if(LDC_with_saturated_intrinsics)
247 {
248 return cast(__m128i) inteli_llvm_addus!short8(cast(short8)a, cast(short8)b);
249 }
250 else
251 {
252 ushort[8] res; // PERF =void;
253 short8 sa = cast(short8)a;
254 short8 sb = cast(short8)b;
255 foreach(i; 0..8)
256 res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]));
257 return _mm_loadu_si128(cast(int4*)res.ptr);
258 }
259 }
260 unittest
261 {
262 short8 res = cast(short8) _mm_adds_epu16(_mm_set_epi16(3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0),
263 _mm_set_epi16(3, 2, 1, 0, 3, 2, 1, 0));
264 static immutable short[8] correctResult = [0, cast(short)65535, 4, 6, 0, cast(short)65535, 4, 6];
265 assert(res.array == correctResult);
266 }
267
268 /// Compute the bitwise AND of packed double-precision (64-bit)
269 /// floating-point elements in `a` and `b`.
270 __m128d _mm_and_pd (__m128d a, __m128d b) pure @safe
271 {
272 pragma(inline, true);
273 return cast(__m128d)( cast(long2)a & cast(long2)b );
274 }
275 unittest
276 {
277 double a = 4.32;
278 double b = -78.99;
279 long correct = (*cast(long*)(&a)) & (*cast(long*)(&b));
280 __m128d A = _mm_set_pd(a, b);
281 __m128d B = _mm_set_pd(b, a);
282 long2 R = cast(long2)( _mm_and_pd(A, B) );
283 assert(R.array[0] == correct);
284 assert(R.array[1] == correct);
285 }
286
287 /// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`.
288 __m128i _mm_and_si128 (__m128i a, __m128i b) pure @safe
289 {
290 pragma(inline, true);
291 return a & b;
292 }
293 unittest
294 {
295 __m128i A = _mm_set1_epi32(7);
296 __m128i B = _mm_set1_epi32(14);
297 __m128i R = _mm_and_si128(A, B);
298 int[4] correct = [6, 6, 6, 6];
299 assert(R.array == correct);
300 }
301
302 /// Compute the bitwise NOT of packed double-precision (64-bit)
303 /// floating-point elements in `a` and then AND with `b`.
304 __m128d _mm_andnot_pd (__m128d a, __m128d b) pure @safe
305 {
306 static if (DMD_with_DSIMD)
307 {
308 return cast(__m128d) __simd(XMM.ANDNPD, a, b);
309 }
310 else
311 {
312 return cast(__m128d)( ~(cast(long2)a) & cast(long2)b);
313 }
314 }
315 unittest
316 {
317 double a = 4.32;
318 double b = -78.99;
319 long correct = (~*cast(long*)(&a)) & ( *cast(long*)(&b));
320 long correct2 = ( *cast(long*)(&a)) & (~*cast(long*)(&b));
321 __m128d A = _mm_setr_pd(a, b);
322 __m128d B = _mm_setr_pd(b, a);
323 long2 R = cast(long2)( _mm_andnot_pd(A, B) );
324 assert(R.array[0] == correct);
325 assert(R.array[1] == correct2);
326 }
327
328 /// Compute the bitwise NOT of 128 bits (representing integer data)
329 /// in `a` and then AND with `b`.
330 __m128i _mm_andnot_si128 (__m128i a, __m128i b) pure @safe
331 {
332 static if (DMD_with_DSIMD)
333 {
334 return cast(__m128i) __simd(XMM.PANDN, a, b);
335 }
336 else
337 {
338 return (~a) & b;
339 }
340 }
341 unittest
342 {
343 __m128i A = _mm_setr_epi32(7, -2, 9, 54654);
344 __m128i B = _mm_setr_epi32(14, 78, 111, -256);
345 __m128i R = _mm_andnot_si128(A, B);
346 int[4] correct = [8, 0, 102, -54784];
347 assert(R.array == correct);
348 }
349
350 /// Average packed unsigned 16-bit integers in `a` and `b`.
351 __m128i _mm_avg_epu16 (__m128i a, __m128i b) pure @trusted
352 {
353 static if (DMD_with_DSIMD)
354 {
355 return cast(__m128i) __simd(XMM.PAVGW, a, b);
356 }
357 else static if (GDC_with_SSE2)
358 {
359 return cast(__m128i) __builtin_ia32_pavgw128(cast(short8)a, cast(short8)b);
360 }
361 else static if (LDC_with_ARM64)
362 {
363 return cast(__m128i) vrhadd_u16(cast(short8)a, cast(short8)b);
364 }
365 else static if (LDC_with_SSE2 && __VERSION__ >= 2094)
366 {
367 // Exists since LDC 1.18
368 return cast(__m128i) __builtin_ia32_pavgw128(cast(short8)a, cast(short8)b);
369 }
370 else static if (LDC_with_optimizations)
371 {
372 // Generates pavgw even in LDC 1.0, even in -O0
373 // But not in ARM
374 enum ir = `
375 %ia = zext <8 x i16> %0 to <8 x i32>
376 %ib = zext <8 x i16> %1 to <8 x i32>
377 %isum = add <8 x i32> %ia, %ib
378 %isum1 = add <8 x i32> %isum, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
379 %isums = lshr <8 x i32> %isum1, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
380 %r = trunc <8 x i32> %isums to <8 x i16>
381 ret <8 x i16> %r`;
382 return cast(__m128i) LDCInlineIR!(ir, short8, short8, short8)(cast(short8)a, cast(short8)b);
383 }
384 else
385 {
386 short8 sa = cast(short8)a;
387 short8 sb = cast(short8)b;
388 short8 sr = void;
389 foreach(i; 0..8)
390 {
391 sr.ptr[i] = cast(ushort)( (cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]) + 1) >> 1 );
392 }
393 return cast(int4)sr;
394 }
395 }
396 unittest
397 {
398 __m128i A = _mm_set1_epi16(31);
399 __m128i B = _mm_set1_epi16(64);
400 short8 avg = cast(short8)(_mm_avg_epu16(A, B));
401 foreach(i; 0..8)
402 assert(avg.array[i] == 48);
403 }
404
405 /// Average packed unsigned 8-bit integers in `a` and `b`.
406 __m128i _mm_avg_epu8 (__m128i a, __m128i b) pure @trusted
407 {
408 static if (DMD_with_DSIMD)
409 {
410 return cast(__m128i) __simd(XMM.PAVGB, a, b);
411 }
412 else static if (GDC_with_SSE2)
413 {
414 return cast(__m128i) __builtin_ia32_pavgb128(cast(ubyte16)a, cast(ubyte16)b);
415 }
416 else static if (LDC_with_SSE2 && __VERSION__ >= 2094)
417 {
418 // Exists since LDC 1.18
419 return cast(__m128i) __builtin_ia32_pavgb128(cast(byte16)a, cast(byte16)b);
420 }
421 else static if (LDC_with_ARM64)
422 {
423 return cast(__m128i) vrhadd_u8(cast(byte16)a, cast(byte16)b);
424 }
425 else static if (LDC_with_optimizations)
426 {
427 // Generates pavgb even in LDC 1.0, even in -O0
428 // But not in ARM
429 enum ir = `
430 %ia = zext <16 x i8> %0 to <16 x i16>
431 %ib = zext <16 x i8> %1 to <16 x i16>
432 %isum = add <16 x i16> %ia, %ib
433 %isum1 = add <16 x i16> %isum, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
434 %isums = lshr <16 x i16> %isum1, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
435 %r = trunc <16 x i16> %isums to <16 x i8>
436 ret <16 x i8> %r`;
437 return cast(__m128i) LDCInlineIR!(ir, byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
438 }
439 else
440 {
441 byte16 sa = cast(byte16)a;
442 byte16 sb = cast(byte16)b;
443 byte16 sr = void;
444 foreach(i; 0..16)
445 {
446 sr.ptr[i] = cast(ubyte)( (cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]) + 1) >> 1 );
447 }
448 return cast(int4)sr;
449 }
450 }
451 unittest
452 {
453 __m128i A = _mm_set1_epi8(31);
454 __m128i B = _mm_set1_epi8(64);
455 byte16 avg = cast(byte16)(_mm_avg_epu8(A, B));
456 foreach(i; 0..16)
457 assert(avg.array[i] == 48);
458 }
459
460 /// Shift `a` left by `bytes` bytes while shifting in zeros.
461 alias _mm_bslli_si128 = _mm_slli_si128;
462 unittest
463 {
464 __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
465 byte[16] exact = [0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
466 __m128i result = _mm_bslli_si128!5(toShift);
467 assert( (cast(byte16)result).array == exact);
468 }
469
470 /// Shift `v` right by `bytes` bytes while shifting in zeros.
471 alias _mm_bsrli_si128 = _mm_srli_si128;
472 unittest
473 {
474 __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
475 byte[16] exact = [5, 6, 7, 8, 9,10,11,12,13,14, 15, 0, 0, 0, 0, 0];
476 __m128i result = _mm_bsrli_si128!5(toShift);
477 assert( (cast(byte16)result).array == exact);
478 }
479
480 /// Cast vector of type `__m128d` to type `__m128`.
481 /// Note: Also possible with a regular `cast(__m128)(a)`.
482 __m128 _mm_castpd_ps (__m128d a) pure @safe
483 {
484 return cast(__m128)a;
485 }
486
487 /// Cast vector of type `__m128d` to type `__m128i`.
488 /// Note: Also possible with a regular `cast(__m128i)(a)`.
489 __m128i _mm_castpd_si128 (__m128d a) pure @safe
490 {
491 return cast(__m128i)a;
492 }
493
494 /// Cast vector of type `__m128` to type `__m128d`.
495 /// Note: Also possible with a regular `cast(__m128d)(a)`.
496 __m128d _mm_castps_pd (__m128 a) pure @safe
497 {
498 return cast(__m128d)a;
499 }
500
501 /// Cast vector of type `__m128` to type `__m128i`.
502 /// Note: Also possible with a regular `cast(__m128i)(a)`.
503 __m128i _mm_castps_si128 (__m128 a) pure @safe
504 {
505 return cast(__m128i)a;
506 }
507
508 /// Cast vector of type `__m128i` to type `__m128d`.
509 /// Note: Also possible with a regular `cast(__m128d)(a)`.
510 __m128d _mm_castsi128_pd (__m128i a) pure @safe
511 {
512 return cast(__m128d)a;
513 }
514
515 /// Cast vector of type `__m128i` to type `__m128`.
516 /// Note: Also possible with a regular `cast(__m128)(a)`.
517 __m128 _mm_castsi128_ps (__m128i a) pure @safe
518 {
519 return cast(__m128)a;
520 }
521
522 /// Invalidate and flush the cache line that contains `p`
523 /// from all levels of the cache hierarchy.
524 void _mm_clflush (const(void)* p) @trusted
525 {
526 static if (GDC_with_SSE2)
527 {
528 __builtin_ia32_clflush(p);
529 }
530 else static if (LDC_with_SSE2)
531 {
532 __builtin_ia32_clflush(cast(void*)p);
533 }
534 else version(D_InlineAsm_X86)
535 {
536 asm pure nothrow @nogc @trusted
537 {
538 mov EAX, p;
539 clflush [EAX];
540 }
541 }
542 else version(D_InlineAsm_X86_64)
543 {
544 asm pure nothrow @nogc @trusted
545 {
546 mov RAX, p;
547 clflush [RAX];
548 }
549 }
550 else
551 {
552 // Do nothing. Invalidating cacheline does
553 // not affect correctness.
554 }
555 }
556 unittest
557 {
558 ubyte[64] cacheline;
559 _mm_clflush(cacheline.ptr);
560 }
561
562 /// Compare packed 16-bit integers in `a` and `b` for equality.
563 __m128i _mm_cmpeq_epi16 (__m128i a, __m128i b) pure @safe
564 {
565 static if (SIMD_COMPARISON_MASKS_16B)
566 {
567 return cast(__m128i)(cast(short8)a == cast(short8)b);
568 }
569 else static if (GDC_with_SSE2)
570 {
571 return cast(__m128i) __builtin_ia32_pcmpeqw128(cast(short8)a, cast(short8)b);
572 }
573 else
574 {
575 return cast(__m128i) equalMask!short8(cast(short8)a, cast(short8)b);
576 }
577 }
578 unittest
579 {
580 short8 A = [-3, -2, -1, 0, 0, 1, 2, 3];
581 short8 B = [ 4, 3, 2, 1, 0, -1, -2, -3];
582 short[8] E = [ 0, 0, 0, 0, -1, 0, 0, 0];
583 short8 R = cast(short8)(_mm_cmpeq_epi16(cast(__m128i)A, cast(__m128i)B));
584 assert(R.array == E);
585 }
586
587 /// Compare packed 32-bit integers in `a` and `b` for equality.
588 __m128i _mm_cmpeq_epi32 (__m128i a, __m128i b) pure @safe
589 {
590 static if (SIMD_COMPARISON_MASKS_16B)
591 {
592 return cast(__m128i)(cast(int4)a == cast(int4)b);
593 }
594 else static if (GDC_with_SSE2)
595 {
596 return __builtin_ia32_pcmpeqd128(a, b);
597 }
598 else
599 {
600 return equalMask!__m128i(a, b);
601 }
602 }
603 unittest
604 {
605 int4 A = [-3, -2, -1, 0];
606 int4 B = [ 4, -2, 2, 0];
607 int[4] E = [ 0, -1, 0, -1];
608 int4 R = cast(int4)(_mm_cmpeq_epi32(A, B));
609 assert(R.array == E);
610 }
611
612 /// Compare packed 8-bit integers in `a` and `b` for equality.
613 __m128i _mm_cmpeq_epi8 (__m128i a, __m128i b) pure @safe
614 {
615 static if (SIMD_COMPARISON_MASKS_16B)
616 {
617 return cast(__m128i)(cast(byte16)a == cast(byte16)b);
618 }
619 else static if (GDC_with_SSE2)
620 {
621 return cast(__m128i) __builtin_ia32_pcmpeqb128(cast(ubyte16)a, cast(ubyte16)b);
622 }
623 else
624 {
625 return cast(__m128i) equalMask!byte16(cast(byte16)a, cast(byte16)b);
626 }
627 }
628 unittest
629 {
630 __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
631 __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
632 byte16 C = cast(byte16) _mm_cmpeq_epi8(A, B);
633 byte[16] correct = [0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0, -1];
634 assert(C.array == correct);
635 }
636
637 /// Compare packed double-precision (64-bit) floating-point elements
638 /// in `a` and `b` for equality.
639 __m128d _mm_cmpeq_pd (__m128d a, __m128d b) pure @safe
640 {
641 static if (SIMD_COMPARISON_MASKS_16B)
642 {
643 return cast(double2)(cast(double2)a == cast(double2)b);
644 }
645 else static if (GDC_with_SSE2)
646 {
647 return __builtin_ia32_cmpeqpd(a, b);
648 }
649 else
650 {
651 return cast(__m128d) cmppd!(FPComparison.oeq)(a, b);
652 }
653 }
654 unittest
655 {
656 double2 A = _mm_setr_pd(1.0, 2.0);
657 double2 B = _mm_setr_pd(0.0, 2.0);
658 double2 N = _mm_setr_pd(double.nan, double.nan);
659 long2 C = cast(long2) _mm_cmpeq_pd(A, B);
660 long[2] correctC = [0, -1];
661 assert(C.array == correctC);
662 long2 D = cast(long2) _mm_cmpeq_pd(N, N);
663 long[2] correctD = [0, 0];
664 assert(D.array == correctD);
665 }
666
667 /// Compare the lower double-precision (64-bit) floating-point elements
668 /// in `a` and `b` for equality, store the result in the lower element,
669 /// and copy the upper element from `a`.
670 __m128d _mm_cmpeq_sd (__m128d a, __m128d b) pure @safe
671 {
672 static if (DMD_with_DSIMD)
673 {
674 return cast(__m128d) __simd(XMM.CMPSD, a, b, 0);
675 }
676 else static if (GDC_with_SSE2)
677 {
678 return __builtin_ia32_cmpeqsd(a, b);
679 }
680 else
681 {
682 return cast(__m128d) cmpsd!(FPComparison.oeq)(a, b);
683 }
684 }
685 unittest
686 {
687 double2 A = _mm_setr_pd(0.0, 2.0);
688 double2 B = _mm_setr_pd(1.0, 2.0);
689 double2 C = _mm_setr_pd(1.0, 3.0);
690 double2 D = cast(double2) _mm_cmpeq_sd(A, B);
691 long2 E = cast(long2) _mm_cmpeq_sd(B, C);
692 double[2] correctD = [0.0, 2.0];
693 double two = 2.0;
694 long[2] correctE = [-1, *cast(long*)&two];
695 assert(D.array == correctD);
696 assert(E.array == correctE);
697 }
698
699 /// Compare packed 16-bit integers elements in `a` and `b` for greater-than-or-equal.
700 /// #BONUS
701 __m128i _mm_cmpge_epi16 (__m128i a, __m128i b) pure @safe
702 {
703 static if (SIMD_COMPARISON_MASKS_16B)
704 {
705 return cast(__m128i)(cast(short8)a >= cast(short8)b);
706 }
707 else version (LDC)
708 {
709 // LDC ARM64: generates cmge since -O1
710 return cast(__m128i) greaterOrEqualMask!short8(cast(short8)a, cast(short8)b);
711 }
712 else
713 {
714 return _mm_xor_si128(_mm_cmpeq_epi16(a, b), _mm_cmpgt_epi16(a, b));
715 }
716 }
717 unittest
718 {
719 short8 A = [-3, -2, -32768, 0, 0, 1, 2, 3];
720 short8 B = [ 4, 3, 32767, 1, 0, -1, -2, -3];
721 short[8] E = [ 0, 0, 0, 0, -1, -1, -1, -1];
722 short8 R = cast(short8)(_mm_cmpge_epi16(cast(__m128i)A, cast(__m128i)B));
723 assert(R.array == E);
724 }
725
726 /// Compare packed double-precision (64-bit) floating-point elements
727 /// in `a` and `b` for greater-than-or-equal.
728 __m128d _mm_cmpge_pd (__m128d a, __m128d b) pure @safe
729 {
730 static if (SIMD_COMPARISON_MASKS_16B)
731 {
732 return cast(__m128d)(a >= b);
733 }
734 else static if (GDC_with_SSE2)
735 {
736 return __builtin_ia32_cmpgepd(a, b);
737 }
738 else
739 {
740 return cast(__m128d) cmppd!(FPComparison.oge)(a, b);
741 }
742 }
743
744 /// Compare the lower double-precision (64-bit) floating-point elements
745 /// in `a` and `b` for greater-than-or-equal, store the result in the
746 /// lower element, and copy the upper element from `a`.
747 __m128d _mm_cmpge_sd (__m128d a, __m128d b) pure @safe
748 {
749 static if (DMD_with_DSIMD)
750 {
751 return cast(__m128d) __simd(XMM.CMPSD, b, a, 2);
752 }
753 else static if (GDC_with_SSE2)
754 {
755 return __builtin_ia32_cmplesd(b, a);
756 }
757 else
758 {
759 return cast(__m128d) cmpsd!(FPComparison.oge)(a, b);
760 }
761 }
762 unittest
763 {
764 __m128d A = _mm_setr_pd(1.0, 0.0);
765 __m128d B = _mm_setr_pd(double.nan, 0.0);
766 __m128d C = _mm_setr_pd(2.0, 0.0);
767 assert( (cast(long2)_mm_cmpge_sd(A, A)).array[0] == -1);
768 assert( (cast(long2)_mm_cmpge_sd(A, B)).array[0] == 0);
769 assert( (cast(long2)_mm_cmpge_sd(A, C)).array[0] == 0);
770 assert( (cast(long2)_mm_cmpge_sd(B, A)).array[0] == 0);
771 assert( (cast(long2)_mm_cmpge_sd(B, B)).array[0] == 0);
772 assert( (cast(long2)_mm_cmpge_sd(B, C)).array[0] == 0);
773 assert( (cast(long2)_mm_cmpge_sd(C, A)).array[0] == -1);
774 assert( (cast(long2)_mm_cmpge_sd(C, B)).array[0] == 0);
775 assert( (cast(long2)_mm_cmpge_sd(C, C)).array[0] == -1);
776 }
777
778 /// Compare packed 16-bit integers in `a` and `b` for greater-than.
779 __m128i _mm_cmpgt_epi16 (__m128i a, __m128i b) pure @safe
780 {
781 static if (SIMD_COMPARISON_MASKS_16B)
782 {
783 return cast(__m128i)(cast(short8)a > cast(short8)b);
784 }
785 else static if (GDC_with_SSE2)
786 {
787 return cast(__m128i) __builtin_ia32_pcmpgtw128(cast(short8)a, cast(short8)b);
788 }
789 else
790 {
791 return cast(__m128i) greaterMask!short8(cast(short8)a, cast(short8)b);
792 }
793 }
794 unittest
795 {
796 short8 A = [-3, -2, -1, 0, 0, 1, 2, 3];
797 short8 B = [ 4, 3, 2, 1, 0, -1, -2, -3];
798 short[8] E = [ 0, 0, 0, 0, 0, -1, -1, -1];
799 short8 R = cast(short8)(_mm_cmpgt_epi16(cast(__m128i)A, cast(__m128i)B));
800 assert(R.array == E);
801 }
802
803 /// Compare packed 32-bit integers in `a` and `b` for greater-than.
804 __m128i _mm_cmpgt_epi32 (__m128i a, __m128i b) pure @safe
805 {
806 static if (SIMD_COMPARISON_MASKS_16B)
807 {
808 return cast(__m128i)(cast(int4)a > cast(int4)b);
809 }
810 else static if (GDC_with_SSE2)
811 {
812 return __builtin_ia32_pcmpgtd128(a, b);
813 }
814 else
815 {
816 return cast(__m128i)( greaterMask!int4(a, b));
817 }
818 }
819 unittest
820 {
821 int4 A = [-3, 2, -1, 0];
822 int4 B = [ 4, -2, 2, 0];
823 int[4] E = [ 0, -1, 0, 0];
824 int4 R = cast(int4)(_mm_cmpgt_epi32(A, B));
825 assert(R.array == E);
826 }
827
828 /// Compare packed 8-bit integers in `a` and `b` for greater-than.
829 __m128i _mm_cmpgt_epi8 (__m128i a, __m128i b) pure @safe
830 {
831 static if (SIMD_COMPARISON_MASKS_16B)
832 {
833 return cast(__m128i)(cast(byte16)a > cast(byte16)b);
834 }
835 else
836 {
837 // Note: __builtin_ia32_pcmpgtb128 is buggy, do not use with GDC
838 // TODO: re-check that
839 return cast(__m128i) greaterMask!byte16(cast(byte16)a, cast(byte16)b);
840 }
841 }
842 unittest
843 {
844 __m128i A = _mm_setr_epi8(1, 2, 3, 1, 127, -80, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
845 __m128i B = _mm_setr_epi8(2, 2, 1, 2, -128, -42, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
846 byte16 C = cast(byte16) _mm_cmpgt_epi8(A, B);
847 byte[16] correct = [0, 0,-1, 0, -1, 0, 0, 0,-1,-1,-1, 0, 0, 0,-1, 0];
848 __m128i D = _mm_cmpeq_epi8(A, B);
849 assert(C.array == correct);
850 }
851
852 /// Compare packed double-precision (64-bit) floating-point elements
853 /// in `a` and `b` for greater-than.
854 __m128d _mm_cmpgt_pd (__m128d a, __m128d b) pure @safe
855 {
856 static if (SIMD_COMPARISON_MASKS_16B)
857 {
858 return cast(__m128d)(a > b);
859 }
860 else static if (GDC_with_SSE2)
861 {
862 return __builtin_ia32_cmpgtpd(a, b);
863 }
864 else
865 {
866 return cast(__m128d) cmppd!(FPComparison.ogt)(a, b);
867 }
868 }
869
870 /// Compare the lower double-precision (64-bit) floating-point elements
871 /// in `a` and `b` for greater-than, store the result in the lower element,
872 /// and copy the upper element from `a`.
873 __m128d _mm_cmpgt_sd (__m128d a, __m128d b) pure @safe
874 {
875 static if (DMD_with_DSIMD)
876 {
877 return cast(__m128d) __simd(XMM.CMPSD, b, a, 1);
878 }
879 else static if (GDC_with_SSE2)
880 {
881 return __builtin_ia32_cmpltsd(b, a);
882 }
883 else
884 {
885 return cast(__m128d) cmpsd!(FPComparison.ogt)(a, b);
886 }
887 }
888 unittest
889 {
890 __m128d A = _mm_setr_pd(1.0, 0.0);
891 __m128d B = _mm_setr_pd(double.nan, 0.0);
892 __m128d C = _mm_setr_pd(2.0, 0.0);
893 assert( (cast(long2)_mm_cmpgt_sd(A, A)).array[0] == 0);
894 assert( (cast(long2)_mm_cmpgt_sd(A, B)).array[0] == 0);
895 assert( (cast(long2)_mm_cmpgt_sd(A, C)).array[0] == 0);
896 assert( (cast(long2)_mm_cmpgt_sd(B, A)).array[0] == 0);
897 assert( (cast(long2)_mm_cmpgt_sd(B, B)).array[0] == 0);
898 assert( (cast(long2)_mm_cmpgt_sd(B, C)).array[0] == 0);
899 assert( (cast(long2)_mm_cmpgt_sd(C, A)).array[0] == -1);
900 assert( (cast(long2)_mm_cmpgt_sd(C, B)).array[0] == 0);
901 assert( (cast(long2)_mm_cmpgt_sd(C, C)).array[0] == 0);
902 }
903
904
905 /// Compare packed 16-bit integers elements in `a` and `b` for greater-than-or-equal.
906 /// #BONUS
907 __m128i _mm_cmple_epi16 (__m128i a, __m128i b) pure @safe
908 {
909 static if (SIMD_COMPARISON_MASKS_16B)
910 {
911 return cast(__m128i)(cast(short8)a <= cast(short8)b);
912 }
913 else version (LDC)
914 {
915 // LDC ARM64: generates cmge since -O1
916 return cast(__m128i) greaterOrEqualMask!short8(cast(short8)b, cast(short8)a);
917 }
918 else
919 {
920 return _mm_xor_si128(_mm_cmpeq_epi16(b, a), _mm_cmpgt_epi16(b, a));
921 }
922 }
923 unittest
924 {
925 short8 A = [-3, -2, -32768, 1, 0, 1, 2, 3];
926 short8 B = [ 4, 3, 32767, 0, 0, -1, -2, -3];
927 short[8] E = [-1, -1, -1, 0, -1, 0, 0, 0];
928 short8 R = cast(short8)(_mm_cmple_epi16(cast(__m128i)A, cast(__m128i)B));
929 assert(R.array == E);
930 }
931
932 /// Compare packed double-precision (64-bit) floating-point elements
933 /// in `a` and `b` for less-than-or-equal.
934 __m128d _mm_cmple_pd (__m128d a, __m128d b) pure @safe
935 {
936 static if (SIMD_COMPARISON_MASKS_16B)
937 {
938 return cast(__m128d)(a <= b);
939 }
940 else static if (GDC_with_SSE2)
941 {
942 return __builtin_ia32_cmplepd(a, b);
943 }
944 else
945 {
946 return cast(__m128d) cmppd!(FPComparison.ole)(a, b);
947 }
948 }
949
950 /// Compare the lower double-precision (64-bit) floating-point elements
951 /// in `a` and `b` for less-than-or-equal, store the result in the
952 /// lower element, and copy the upper element from `a`.
953 __m128d _mm_cmple_sd (__m128d a, __m128d b) pure @safe
954 {
955 static if (DMD_with_DSIMD)
956 {
957 return cast(__m128d) __simd(XMM.CMPSD, a, b, 2);
958 }
959 else static if (GDC_with_SSE2)
960 {
961 return __builtin_ia32_cmplesd(a, b);
962 }
963 else
964 {
965 return cast(__m128d) cmpsd!(FPComparison.ole)(a, b);
966 }
967 }
968
969 /// Compare packed 16-bit integers in `a` and `b` for less-than.
970 __m128i _mm_cmplt_epi16 (__m128i a, __m128i b) pure @safe
971 {
972 return _mm_cmpgt_epi16(b, a);
973 }
974
975 /// Compare packed 32-bit integers in `a` and `b` for less-than.
976 __m128i _mm_cmplt_epi32 (__m128i a, __m128i b) pure @safe
977 {
978 return _mm_cmpgt_epi32(b, a);
979 }
980
981 /// Compare packed 8-bit integers in `a` and `b` for less-than.
982 __m128i _mm_cmplt_epi8 (__m128i a, __m128i b) pure @safe
983 {
984 return _mm_cmpgt_epi8(b, a);
985 }
986
987 /// Compare packed double-precision (64-bit) floating-point elements
988 /// in `a` and `b` for less-than.
989 __m128d _mm_cmplt_pd (__m128d a, __m128d b) pure @safe
990 {
991 static if (SIMD_COMPARISON_MASKS_16B)
992 {
993 return cast(__m128d)(a < b);
994 }
995 else static if (GDC_with_SSE2)
996 {
997 return __builtin_ia32_cmpltpd(a, b);
998 }
999 else
1000 {
1001 return cast(__m128d) cmppd!(FPComparison.olt)(a, b);
1002 }
1003 }
1004
1005 /// Compare the lower double-precision (64-bit) floating-point elements
1006 /// in `a` and `b` for less-than, store the result in the lower
1007 /// element, and copy the upper element from `a`.
1008 __m128d _mm_cmplt_sd (__m128d a, __m128d b) pure @safe
1009 {
1010 static if (DMD_with_DSIMD)
1011 {
1012 return cast(__m128d) __simd(XMM.CMPSD, a, b, 1);
1013 }
1014 else static if (GDC_with_SSE2)
1015 {
1016 return __builtin_ia32_cmpltsd(a, b);
1017 }
1018 else
1019 {
1020 return cast(__m128d) cmpsd!(FPComparison.olt)(a, b);
1021 }
1022 }
1023
1024 /// Compare packed double-precision (64-bit) floating-point elements
1025 /// in `a` and `b` for not-equal.
1026 __m128d _mm_cmpneq_pd (__m128d a, __m128d b) pure @safe
1027 {
1028 static if (GDC_with_SSE2)
1029 {
1030 return __builtin_ia32_cmpneqpd(a, b);
1031 }
1032 else
1033 {
1034 return cast(__m128d) cmppd!(FPComparison.une)(a, b);
1035 }
1036 }
1037
1038 /// Compare the lower double-precision (64-bit) floating-point elements
1039 /// in `a` and `b` for not-equal, store the result in the lower
1040 /// element, and copy the upper element from `a`.
1041 __m128d _mm_cmpneq_sd (__m128d a, __m128d b) pure @safe
1042 {
1043 static if (GDC_with_SSE2)
1044 {
1045 return __builtin_ia32_cmpneqsd(a, b);
1046 }
1047 else
1048 {
1049 return cast(__m128d) cmpsd!(FPComparison.une)(a, b);
1050 }
1051 }
1052
1053 /// Compare packed double-precision (64-bit) floating-point elements
1054 /// in `a` and `b` for not-greater-than-or-equal.
1055 __m128d _mm_cmpnge_pd (__m128d a, __m128d b) pure @safe
1056 {
1057 static if (GDC_with_SSE2)
1058 {
1059 return __builtin_ia32_cmpngepd(a, b);
1060 }
1061 else
1062 {
1063 return cast(__m128d) cmppd!(FPComparison.ult)(a, b);
1064 }
1065 }
1066
1067 /// Compare the lower double-precision (64-bit) floating-point elements
1068 /// in `a` and `b` for not-greater-than-or-equal, store the result in
1069 /// the lower element, and copy the upper element from `a`.
1070 __m128d _mm_cmpnge_sd (__m128d a, __m128d b) pure @safe
1071 {
1072 // Note: There is no __builtin_ia32_cmpngesd builtin.
1073 static if (GDC_with_SSE2)
1074 {
1075 return __builtin_ia32_cmpltsd(b, a);
1076 }
1077 else
1078 {
1079 return cast(__m128d) cmpsd!(FPComparison.ult)(a, b);
1080 }
1081 }
1082
1083 /// Compare packed double-precision (64-bit) floating-point elements
1084 /// in `a` and `b` for not-greater-than.
1085 __m128d _mm_cmpngt_pd (__m128d a, __m128d b) pure @safe
1086 {
1087 static if (GDC_with_SSE2)
1088 {
1089 return __builtin_ia32_cmpngtpd(a, b);
1090 }
1091 else
1092 {
1093 return cast(__m128d) cmppd!(FPComparison.ule)(a, b);
1094 }
1095 }
1096
1097 /// Compare the lower double-precision (64-bit) floating-point elements
1098 /// in `a` and `b` for not-greater-than, store the result in the
1099 /// lower element, and copy the upper element from `a`.
1100 __m128d _mm_cmpngt_sd (__m128d a, __m128d b) pure @safe
1101 {
1102 // Note: There is no __builtin_ia32_cmpngtsd builtin.
1103 static if (GDC_with_SSE2)
1104 {
1105 return __builtin_ia32_cmplesd(b, a);
1106 }
1107 else
1108 {
1109 return cast(__m128d) cmpsd!(FPComparison.ule)(a, b);
1110 }
1111 }
1112
1113 /// Compare packed double-precision (64-bit) floating-point elements
1114 /// in `a` and `b` for not-less-than-or-equal.
1115 __m128d _mm_cmpnle_pd (__m128d a, __m128d b) pure @safe
1116 {
1117 static if (GDC_with_SSE2)
1118 {
1119 return __builtin_ia32_cmpnlepd(a, b);
1120 }
1121 else
1122 {
1123 return cast(__m128d) cmppd!(FPComparison.ugt)(a, b);
1124 }
1125 }
1126
1127 /// Compare the lower double-precision (64-bit) floating-point elements
1128 /// in `a` and `b` for not-less-than-or-equal, store the result in the
1129 /// lower element, and copy the upper element from `a`.
1130 __m128d _mm_cmpnle_sd (__m128d a, __m128d b) pure @safe
1131 {
1132 static if (GDC_with_SSE2)
1133 {
1134 return __builtin_ia32_cmpnlesd(a, b);
1135 }
1136 else
1137 {
1138 return cast(__m128d) cmpsd!(FPComparison.ugt)(a, b);
1139 }
1140 }
1141
1142 /// Compare packed double-precision (64-bit) floating-point elements
1143 /// in `a` and `b` for not-less-than.
1144 __m128d _mm_cmpnlt_pd (__m128d a, __m128d b) pure @safe
1145 {
1146 static if (GDC_with_SSE2)
1147 {
1148 return __builtin_ia32_cmpnltpd(a, b);
1149 }
1150 else
1151 {
1152 return cast(__m128d) cmppd!(FPComparison.uge)(a, b);
1153 }
1154 }
1155
1156 /// Compare the lower double-precision (64-bit) floating-point elements
1157 /// in `a` and `b` for not-less-than, store the result in the lower
1158 /// element, and copy the upper element from `a`.
1159 __m128d _mm_cmpnlt_sd (__m128d a, __m128d b) pure @safe
1160 {
1161 static if (GDC_with_SSE2)
1162 {
1163 return __builtin_ia32_cmpnltsd(a, b);
1164 }
1165 else
1166 {
1167 return cast(__m128d) cmpsd!(FPComparison.uge)(a, b);
1168 }
1169 }
1170
1171 /// Compare packed double-precision (64-bit) floating-point elements
1172 /// in `a` and `b` to see if neither is NaN.
1173 __m128d _mm_cmpord_pd (__m128d a, __m128d b) pure @safe
1174 {
1175 static if (GDC_with_SSE2)
1176 {
1177 return __builtin_ia32_cmpordpd(a, b);
1178 }
1179 else
1180 {
1181 return cast(__m128d) cmppd!(FPComparison.ord)(a, b);
1182 }
1183 }
1184
1185 /// Compare the lower double-precision (64-bit) floating-point elements
1186 /// in `a` and `b` to see if neither is NaN, store the result in the
1187 /// lower element, and copy the upper element from `a` to the upper element.
1188 __m128d _mm_cmpord_sd (__m128d a, __m128d b) pure @safe
1189 {
1190 static if (GDC_with_SSE2)
1191 {
1192 return __builtin_ia32_cmpordsd(a, b);
1193 }
1194 else
1195 {
1196 return cast(__m128d) cmpsd!(FPComparison.ord)(a, b);
1197 }
1198 }
1199
1200 /// Compare packed double-precision (64-bit) floating-point elements
1201 /// in `a` and `b` to see if either is NaN.
1202 __m128d _mm_cmpunord_pd (__m128d a, __m128d b) pure @safe
1203 {
1204 static if (GDC_with_SSE2)
1205 {
1206 return __builtin_ia32_cmpunordpd(a, b);
1207 }
1208 else
1209 {
1210 return cast(__m128d) cmppd!(FPComparison.uno)(a, b);
1211 }
1212 }
1213
1214 /// Compare the lower double-precision (64-bit) floating-point elements
1215 /// in `a` and `b` to see if either is NaN, store the result in the lower
1216 /// element, and copy the upper element from `a` to the upper element.
1217 __m128d _mm_cmpunord_sd (__m128d a, __m128d b) pure @safe
1218 {
1219 static if (GDC_with_SSE2)
1220 {
1221 return __builtin_ia32_cmpunordsd(a, b);
1222 }
1223 else
1224 {
1225 return cast(__m128d) cmpsd!(FPComparison.uno)(a, b);
1226 }
1227 }
1228
1229 /// Compare the lower double-precision (64-bit) floating-point element
1230 /// in `a` and `b` for equality, and return the boolean result (0 or 1).
1231 int _mm_comieq_sd (__m128d a, __m128d b) pure @safe
1232 {
1233 // Note: For some of the _mm_comixx_sx intrinsics, NaN semantics of the intrinsic are not the same as the
1234 // comisd instruction, it returns false in case of unordered instead.
1235 //
1236 // Actually C++ compilers disagree over the meaning of that instruction.
1237 // GCC will manage NaNs like the comisd instruction (return true if unordered),
1238 // but ICC, clang and MSVC will deal with NaN like the Intel Intrinsics Guide says.
1239 // We choose to do like the most numerous. It seems GCC is buggy with NaNs.
1240 return a.array[0] == b.array[0];
1241 }
1242 unittest
1243 {
1244 assert(1 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1245 assert(0 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1246 assert(0 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1247 assert(0 == _mm_comieq_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1248 assert(1 == _mm_comieq_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
1249 }
1250
1251 /// Compare the lower double-precision (64-bit) floating-point element
1252 /// in `a` and `b` for greater-than-or-equal, and return the boolean
1253 /// result (0 or 1).
1254 int _mm_comige_sd (__m128d a, __m128d b) pure @safe
1255 {
1256 return a.array[0] >= b.array[0];
1257 }
1258 unittest
1259 {
1260 assert(1 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1261 assert(1 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1262 assert(0 == _mm_comige_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
1263 assert(0 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1264 assert(0 == _mm_comige_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1265 assert(1 == _mm_comige_sd(_mm_set_sd(-0.0), _mm_set_sd(0.0)));
1266 }
1267
1268 /// Compare the lower double-precision (64-bit) floating-point element
1269 /// in `a` and `b` for greater-than, and return the boolean result (0 or 1).
1270 int _mm_comigt_sd (__m128d a, __m128d b) pure @safe
1271 {
1272 return a.array[0] > b.array[0];
1273 }
1274 unittest
1275 {
1276 assert(0 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1277 assert(1 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1278 assert(0 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1279 assert(0 == _mm_comigt_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1280 assert(0 == _mm_comigt_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
1281 }
1282
1283 /// Compare the lower double-precision (64-bit) floating-point element
1284 /// in `a` and `b` for less-than-or-equal.
1285 int _mm_comile_sd (__m128d a, __m128d b) pure @safe
1286 {
1287 return a.array[0] <= b.array[0];
1288 }
1289 unittest
1290 {
1291 assert(1 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1292 assert(0 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1293 assert(1 == _mm_comile_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
1294 assert(0 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1295 assert(0 == _mm_comile_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1296 assert(1 == _mm_comile_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
1297 }
1298
1299 /// Compare the lower double-precision (64-bit) floating-point element
1300 /// in `a` and `b` for less-than, and return the boolean result (0 or 1).
1301 int _mm_comilt_sd (__m128d a, __m128d b) pure @safe
1302 {
1303 return a.array[0] < b.array[0];
1304 }
1305 unittest
1306 {
1307 assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1308 assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1309 assert(1 == _mm_comilt_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
1310 assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1311 assert(0 == _mm_comilt_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1312 assert(0 == _mm_comilt_sd(_mm_set_sd(-0.0), _mm_set_sd(0.0)));
1313 }
1314
1315 /// Compare the lower double-precision (64-bit) floating-point element
1316 /// in `a` and `b` for not-equal, and return the boolean result (0 or 1).
1317 int _mm_comineq_sd (__m128d a, __m128d b) pure @safe
1318 {
1319 return a.array[0] != b.array[0];
1320 }
1321 unittest
1322 {
1323 assert(0 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1324 assert(1 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1325 assert(1 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1326 assert(1 == _mm_comineq_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1327 assert(0 == _mm_comineq_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
1328 }
1329
1330 /// Convert packed 32-bit integers in `a` to packed double-precision (64-bit)
1331 /// floating-point elements.
1332 __m128d _mm_cvtepi32_pd (__m128i a) pure @trusted
1333 {
1334 static if (LDC_with_optimizations)
1335 {
1336 // Generates cvtdq2pd since LDC 1.0, even without optimizations
1337 enum ir = `
1338 %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1>
1339 %r = sitofp <2 x i32> %v to <2 x double>
1340 ret <2 x double> %r`;
1341 return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128i)(a);
1342 }
1343 else static if (GDC_with_SSE2)
1344 {
1345 return __builtin_ia32_cvtdq2pd(a);
1346 }
1347 else
1348 {
1349 double2 r = void;
1350 r.ptr[0] = a.array[0];
1351 r.ptr[1] = a.array[1];
1352 return r;
1353 }
1354 }
1355 unittest
1356 {
1357 __m128d A = _mm_cvtepi32_pd(_mm_set1_epi32(54));
1358 assert(A.array[0] == 54.0);
1359 assert(A.array[1] == 54.0);
1360 }
1361
1362 /// Convert packed 32-bit integers in `a` to packed single-precision (32-bit)
1363 /// floating-point elements.
1364 __m128 _mm_cvtepi32_ps(__m128i a) pure @trusted
1365 {
1366 static if (DMD_with_DSIMD)
1367 {
1368 return cast(__m128)__simd(XMM.CVTDQ2PS, cast(void16) a);
1369 }
1370 else static if (GDC_with_SSE2)
1371 {
1372 return __builtin_ia32_cvtdq2ps(a);
1373 }
1374 else static if (LDC_with_optimizations)
1375 {
1376 // See #86 for why we had to resort to LLVM IR.
1377 // Plain code below was leading to catastrophic behaviour.
1378 // x86: Generates cvtdq2ps since LDC 1.1.0 -O0
1379 // ARM: Generats scvtf.4s since LDC 1.8.0 -O0
1380 enum ir = `
1381 %r = sitofp <4 x i32> %0 to <4 x float>
1382 ret <4 x float> %r`;
1383 return cast(__m128) LDCInlineIR!(ir, float4, int4)(a);
1384 }
1385 else
1386 {
1387 __m128 res; // PERF =void;
1388 res.ptr[0] = cast(float)a.array[0];
1389 res.ptr[1] = cast(float)a.array[1];
1390 res.ptr[2] = cast(float)a.array[2];
1391 res.ptr[3] = cast(float)a.array[3];
1392 return res;
1393 }
1394 }
1395 unittest
1396 {
1397 __m128 a = _mm_cvtepi32_ps(_mm_setr_epi32(-1, 0, 1, 1000));
1398 assert(a.array == [-1.0f, 0.0f, 1.0f, 1000.0f]);
1399 }
1400
1401 /// Convert packed double-precision (64-bit) floating-point elements
1402 /// in `a` to packed 32-bit integers.
1403 __m128i _mm_cvtpd_epi32 (__m128d a) @trusted
1404 {
1405 // PERF ARM32
1406 static if (LDC_with_SSE2)
1407 {
1408 return __builtin_ia32_cvtpd2dq(a);
1409 }
1410 else static if (GDC_with_SSE2)
1411 {
1412 return __builtin_ia32_cvtpd2dq(a);
1413 }
1414 else static if (LDC_with_ARM64)
1415 {
1416 // Get current rounding mode.
1417 uint fpscr = arm_get_fpcr();
1418 long2 i;
1419 switch(fpscr & _MM_ROUND_MASK_ARM)
1420 {
1421 default:
1422 case _MM_ROUND_NEAREST_ARM: i = vcvtnq_s64_f64(a); break;
1423 case _MM_ROUND_DOWN_ARM: i = vcvtmq_s64_f64(a); break;
1424 case _MM_ROUND_UP_ARM: i = vcvtpq_s64_f64(a); break;
1425 case _MM_ROUND_TOWARD_ZERO_ARM: i = vcvtzq_s64_f64(a); break;
1426 }
1427 int4 zero = 0;
1428 return cast(__m128i) shufflevectorLDC!(int4, 0, 2, 4, 6)(cast(int4)i, zero); // PERF: this slow down build for nothing, test without shufflevector
1429 }
1430 else
1431 {
1432 // PERF ARM32
1433 __m128i r = _mm_setzero_si128();
1434 r.ptr[0] = convertDoubleToInt32UsingMXCSR(a.array[0]);
1435 r.ptr[1] = convertDoubleToInt32UsingMXCSR(a.array[1]);
1436 return r;
1437 }
1438 }
1439 unittest
1440 {
1441 int4 A = _mm_cvtpd_epi32(_mm_set_pd(61.0, 55.0));
1442 assert(A.array[0] == 55 && A.array[1] == 61 && A.array[2] == 0 && A.array[3] == 0);
1443 }
1444
1445 /// Convert packed double-precision (64-bit) floating-point elements in `v`
1446 /// to packed 32-bit integers
1447 __m64 _mm_cvtpd_pi32 (__m128d v) @safe
1448 {
1449 return to_m64(_mm_cvtpd_epi32(v));
1450 }
1451 unittest
1452 {
1453 int2 A = cast(int2) _mm_cvtpd_pi32(_mm_set_pd(61.0, 55.0));
1454 assert(A.array[0] == 55 && A.array[1] == 61);
1455 }
1456
1457 /// Convert packed double-precision (64-bit) floating-point elements
1458 /// in `a` to packed single-precision (32-bit) floating-point elements.
1459 __m128 _mm_cvtpd_ps (__m128d a) pure @trusted
1460 {
1461 static if (LDC_with_SSE2)
1462 {
1463 return __builtin_ia32_cvtpd2ps(a); // can't be done with IR unfortunately
1464 }
1465 else static if (GDC_with_SSE2)
1466 {
1467 return __builtin_ia32_cvtpd2ps(a);
1468 }
1469 else
1470 {
1471 __m128 r = void;
1472 r.ptr[0] = a.array[0];
1473 r.ptr[1] = a.array[1];
1474 r.ptr[2] = 0;
1475 r.ptr[3] = 0;
1476 return r;
1477 }
1478 }
1479 unittest
1480 {
1481 __m128d A = _mm_set_pd(5.25, 4.0);
1482 __m128 B = _mm_cvtpd_ps(A);
1483 assert(B.array == [4.0f, 5.25f, 0, 0]);
1484 }
1485
1486 /// Convert packed 32-bit integers in `v` to packed double-precision
1487 /// (64-bit) floating-point elements.
1488 __m128d _mm_cvtpi32_pd (__m64 v) pure @safe
1489 {
1490 return _mm_cvtepi32_pd(to_m128i(v));
1491 }
1492 unittest
1493 {
1494 __m128d A = _mm_cvtpi32_pd(_mm_setr_pi32(4, -5));
1495 assert(A.array[0] == 4.0 && A.array[1] == -5.0);
1496 }
1497
1498 /// Convert packed single-precision (32-bit) floating-point elements
1499 /// in `a` to packed 32-bit integers
1500 __m128i _mm_cvtps_epi32 (__m128 a) @trusted
1501 {
1502 static if (LDC_with_SSE2)
1503 {
1504 return cast(__m128i) __builtin_ia32_cvtps2dq(a);
1505 }
1506 else static if (GDC_with_SSE2)
1507 {
1508 return __builtin_ia32_cvtps2dq(a);
1509 }
1510 else static if (LDC_with_ARM64)
1511 {
1512 // Get current rounding mode.
1513 uint fpscr = arm_get_fpcr();
1514 switch(fpscr & _MM_ROUND_MASK_ARM)
1515 {
1516 default:
1517 case _MM_ROUND_NEAREST_ARM: return vcvtnq_s32_f32(a);
1518 case _MM_ROUND_DOWN_ARM: return vcvtmq_s32_f32(a);
1519 case _MM_ROUND_UP_ARM: return vcvtpq_s32_f32(a);
1520 case _MM_ROUND_TOWARD_ZERO_ARM: return vcvtzq_s32_f32(a);
1521 }
1522 }
1523 else
1524 {
1525 __m128i r = void;
1526 r.ptr[0] = convertFloatToInt32UsingMXCSR(a.array[0]);
1527 r.ptr[1] = convertFloatToInt32UsingMXCSR(a.array[1]);
1528 r.ptr[2] = convertFloatToInt32UsingMXCSR(a.array[2]);
1529 r.ptr[3] = convertFloatToInt32UsingMXCSR(a.array[3]);
1530 return r;
1531 }
1532 }
1533 unittest
1534 {
1535 // GDC bug #98607
1536 // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98607
1537 // GDC does not provide optimization barrier for rounding mode.
1538 // Workarounded with different literals. This bug will likely only manifest in unittest.
1539 // GCC people provided no actual fix and instead say other compilers are buggy... when they aren't.
1540
1541 uint savedRounding = _MM_GET_ROUNDING_MODE();
1542
1543 _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
1544 __m128i A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
1545 assert(A.array == [1, -2, 54, -3]);
1546
1547 _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
1548 A = _mm_cvtps_epi32(_mm_setr_ps(1.3f, -2.11f, 53.4f, -2.8f));
1549 assert(A.array == [1, -3, 53, -3]);
1550
1551 _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
1552 A = _mm_cvtps_epi32(_mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f));
1553 assert(A.array == [2, -2, 54, -2]);
1554
1555 _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
1556 A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.17f, 53.8f, -2.91f));
1557 assert(A.array == [1, -2, 53, -2]);
1558
1559 _MM_SET_ROUNDING_MODE(savedRounding);
1560 }
1561
1562 /// Convert packed single-precision (32-bit) floating-point elements
1563 /// in `a` to packed double-precision (64-bit) floating-point elements.
1564 __m128d _mm_cvtps_pd (__m128 a) pure @trusted
1565 {
1566 static if (LDC_with_optimizations)
1567 {
1568 // Generates cvtps2pd since LDC 1.0 -O0
1569 enum ir = `
1570 %v = shufflevector <4 x float> %0,<4 x float> %0, <2 x i32> <i32 0, i32 1>
1571 %r = fpext <2 x float> %v to <2 x double>
1572 ret <2 x double> %r`;
1573 return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128)(a);
1574 }
1575 else static if (GDC_with_SSE2)
1576 {
1577 return __builtin_ia32_cvtps2pd(a);
1578 }
1579 else
1580 {
1581 double2 r = void;
1582 r.ptr[0] = a.array[0];
1583 r.ptr[1] = a.array[1];
1584 return r;
1585 }
1586 }
1587 unittest
1588 {
1589 __m128d A = _mm_cvtps_pd(_mm_set1_ps(54.0f));
1590 assert(A.array[0] == 54.0);
1591 assert(A.array[1] == 54.0);
1592 }
1593
1594 /// Copy the lower double-precision (64-bit) floating-point element of `a`.
1595 double _mm_cvtsd_f64 (__m128d a) pure @safe
1596 {
1597 return a.array[0];
1598 }
1599
1600 /// Convert the lower double-precision (64-bit) floating-point element
1601 /// in `a` to a 32-bit integer.
1602 int _mm_cvtsd_si32 (__m128d a) @safe
1603 {
1604 static if (LDC_with_SSE2)
1605 {
1606 return __builtin_ia32_cvtsd2si(a);
1607 }
1608 else static if (GDC_with_SSE2)
1609 {
1610 return __builtin_ia32_cvtsd2si(a);
1611 }
1612 else
1613 {
1614 return convertDoubleToInt32UsingMXCSR(a[0]);
1615 }
1616 }
1617 unittest
1618 {
1619 assert(4 == _mm_cvtsd_si32(_mm_set1_pd(4.0)));
1620 }
1621
1622 /// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer.
1623 long _mm_cvtsd_si64 (__m128d a) @trusted
1624 {
1625 static if (LDC_with_SSE2)
1626 {
1627 version (X86_64)
1628 {
1629 return __builtin_ia32_cvtsd2si64(a);
1630 }
1631 else
1632 {
1633 // Note: In 32-bit x86, there is no way to convert from float/double to 64-bit integer
1634 // using SSE instructions only. So the builtin doesn't exist for this arch.
1635 return convertDoubleToInt64UsingMXCSR(a[0]);
1636 }
1637 }
1638 else
1639 {
1640 return convertDoubleToInt64UsingMXCSR(a.array[0]);
1641 }
1642 }
1643 unittest
1644 {
1645 assert(-4 == _mm_cvtsd_si64(_mm_set1_pd(-4.0)));
1646
1647 uint savedRounding = _MM_GET_ROUNDING_MODE();
1648
1649 _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
1650 assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.49)));
1651
1652 _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
1653 assert(-56468486187 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.1)));
1654
1655 _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
1656 assert(56468486187 == _mm_cvtsd_si64(_mm_set1_pd(56468486186.1)));
1657
1658 _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
1659 assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.9)));
1660
1661 _MM_SET_ROUNDING_MODE(savedRounding);
1662 }
1663
1664 deprecated("Use _mm_cvtsd_si64 instead") alias _mm_cvtsd_si64x = _mm_cvtsd_si64; ///
1665
1666 /// Convert the lower double-precision (64-bit) floating-point element in `b` to a single-precision (32-bit)
1667 /// floating-point element, store that in the lower element of result, and copy the upper 3 packed elements from `a`
1668 /// to the upper elements of result.
1669 __m128 _mm_cvtsd_ss (__m128 a, __m128d b) pure @trusted
1670 {
1671 static if (GDC_with_SSE2)
1672 {
1673 return __builtin_ia32_cvtsd2ss(a, b);
1674 }
1675 else
1676 {
1677 // Generates cvtsd2ss since LDC 1.3 -O0
1678 a.ptr[0] = b.array[0];
1679 return a;
1680 }
1681 }
1682 unittest
1683 {
1684 __m128 R = _mm_cvtsd_ss(_mm_set1_ps(4.0f), _mm_set1_pd(3.0));
1685 assert(R.array == [3.0f, 4.0f, 4.0f, 4.0f]);
1686 }
1687
1688 /// Get the lower 32-bit integer in `a`.
1689 int _mm_cvtsi128_si32 (__m128i a) pure @safe
1690 {
1691 return a.array[0];
1692 }
1693
1694 /// Get the lower 64-bit integer in `a`.
1695 long _mm_cvtsi128_si64 (__m128i a) pure @safe
1696 {
1697 long2 la = cast(long2)a;
1698 return la.array[0];
1699 }
1700 deprecated("Use _mm_cvtsi128_si64 instead") alias _mm_cvtsi128_si64x = _mm_cvtsi128_si64;
1701
1702 /// Convert the signed 32-bit integer `b` to a double-precision (64-bit) floating-point element, store that in the
1703 /// lower element of result, and copy the upper element from `a` to the upper element of result.
1704 __m128d _mm_cvtsi32_sd(__m128d a, int b) pure @trusted
1705 {
1706 a.ptr[0] = cast(double)b;
1707 return a;
1708 }
1709 unittest
1710 {
1711 __m128d a = _mm_cvtsi32_sd(_mm_set1_pd(0.0f), 42);
1712 assert(a.array == [42.0, 0]);
1713 }
1714
1715 /// Copy 32-bit integer `a` to the lower element of result, and zero the upper elements.
1716 __m128i _mm_cvtsi32_si128 (int a) pure @trusted
1717 {
1718 int4 r = [0, 0, 0, 0];
1719 r.ptr[0] = a;
1720 return r;
1721 }
1722 unittest
1723 {
1724 __m128i a = _mm_cvtsi32_si128(65);
1725 assert(a.array == [65, 0, 0, 0]);
1726 }
1727
1728 /// Convert the signed 64-bit integer `b` to a double-precision (64-bit) floating-point element, store the result in
1729 /// the lower element of result, and copy the upper element from `a` to the upper element of result.
1730
1731 __m128d _mm_cvtsi64_sd(__m128d a, long b) pure @trusted
1732 {
1733 a.ptr[0] = cast(double)b;
1734 return a;
1735 }
1736 unittest
1737 {
1738 __m128d a = _mm_cvtsi64_sd(_mm_set1_pd(0.0f), 42);
1739 assert(a.array == [42.0, 0]);
1740 }
1741
1742 /// Copy 64-bit integer `a` to the lower element of result, and zero the upper element.
1743 __m128i _mm_cvtsi64_si128 (long a) pure @trusted
1744 {
1745 long2 r = [0, 0];
1746 r.ptr[0] = a;
1747 return cast(__m128i)(r);
1748 }
1749
1750 deprecated("Use _mm_cvtsi64_sd instead") alias _mm_cvtsi64x_sd = _mm_cvtsi64_sd; ///
1751 deprecated("Use _mm_cvtsi64_si128 instead") alias _mm_cvtsi64x_si128 = _mm_cvtsi64_si128; ///
1752
1753 /// Convert the lower single-precision (32-bit) floating-point element in `b` to a double-precision (64-bit)
1754 /// floating-point element, store that in the lower element of result, and copy the upper element from `a` to the upper
1755 // element of result.
1756 double2 _mm_cvtss_sd(double2 a, float4 b) pure @trusted
1757 {
1758 a.ptr[0] = b.array[0];
1759 return a;
1760 }
1761 unittest
1762 {
1763 __m128d a = _mm_cvtss_sd(_mm_set1_pd(0.0f), _mm_set1_ps(42.0f));
1764 assert(a.array == [42.0, 0]);
1765 }
1766
1767 /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit integer with truncation.
1768 long _mm_cvttss_si64 (__m128 a) pure @safe
1769 {
1770 return cast(long)(a.array[0]); // Generates cvttss2si as expected
1771 }
1772 unittest
1773 {
1774 assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
1775 }
1776
1777 /// Convert packed double-precision (64-bit) floating-point elements in `a` to packed 32-bit integers with truncation.
1778 /// Put zeroes in the upper elements of result.
1779 __m128i _mm_cvttpd_epi32 (__m128d a) pure @trusted
1780 {
1781 static if (LDC_with_SSE2)
1782 {
1783 return __builtin_ia32_cvttpd2dq(a);
1784 }
1785 else static if (GDC_with_SSE2)
1786 {
1787 return __builtin_ia32_cvttpd2dq(a);
1788 }
1789 else
1790 {
1791 // Note: doesn't generate cvttpd2dq as of LDC 1.13
1792 __m128i r; // PERF =void;
1793 r.ptr[0] = cast(int)a.array[0];
1794 r.ptr[1] = cast(int)a.array[1];
1795 r.ptr[2] = 0;
1796 r.ptr[3] = 0;
1797 return r;
1798 }
1799 }
1800 unittest
1801 {
1802 __m128i R = _mm_cvttpd_epi32(_mm_setr_pd(-4.9, 45641.5f));
1803 assert(R.array == [-4, 45641, 0, 0]);
1804 }
1805
1806 /// Convert packed double-precision (64-bit) floating-point elements in `v`
1807 /// to packed 32-bit integers with truncation.
1808 __m64 _mm_cvttpd_pi32 (__m128d v) pure @safe
1809 {
1810 return to_m64(_mm_cvttpd_epi32(v));
1811 }
1812 unittest
1813 {
1814 int2 R = cast(int2) _mm_cvttpd_pi32(_mm_setr_pd(-4.9, 45641.7f));
1815 int[2] correct = [-4, 45641];
1816 assert(R.array == correct);
1817 }
1818
1819 /// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit integers with truncation.
1820 __m128i _mm_cvttps_epi32 (__m128 a) pure @trusted
1821 {
1822 // x86: Generates cvttps2dq since LDC 1.3 -O2
1823 // ARM64: generates fcvtze since LDC 1.8 -O2
1824 __m128i r; // PERF = void;
1825 r.ptr[0] = cast(int)a.array[0];
1826 r.ptr[1] = cast(int)a.array[1];
1827 r.ptr[2] = cast(int)a.array[2];
1828 r.ptr[3] = cast(int)a.array[3];
1829 return r;
1830 }
1831 unittest
1832 {
1833 __m128i R = _mm_cvttps_epi32(_mm_setr_ps(-4.9, 45641.5f, 0.0f, 1.0f));
1834 assert(R.array == [-4, 45641, 0, 1]);
1835 }
1836
1837 /// Convert the lower double-precision (64-bit) floating-point element in `a` to a 32-bit integer with truncation.
1838 int _mm_cvttsd_si32 (__m128d a)
1839 {
1840 // Generates cvttsd2si since LDC 1.3 -O0
1841 return cast(int)a.array[0];
1842 }
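// Minimal check of the semantics described above: the conversion truncates toward zero.
unittest
{
    assert(-4 == _mm_cvttsd_si32(_mm_setr_pd(-4.9, 45641.5)));
}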
1843
1844 /// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer with truncation.
1845 long _mm_cvttsd_si64 (__m128d a)
1846 {
1847 // Generates cvttsd2si since LDC 1.3 -O0
// but in 32-bit mode it's instead a long sequence that resorts to the FPU
1849 return cast(long)a.array[0];
1850 }
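// Same truncation semantics, exercised with a value that does not fit in 32 bits.
unittest
{
    assert(-4 == _mm_cvttsd_si64(_mm_setr_pd(-4.9, 45641.5)));
    assert(4294967296 == _mm_cvttsd_si64(_mm_setr_pd(4294967296.9, 0.0)));
}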
1851
1852 deprecated("Use _mm_cvttsd_si64 instead") alias _mm_cvttsd_si64x = _mm_cvttsd_si64; ///
1853
1854 /// Divide packed double-precision (64-bit) floating-point elements in `a` by packed elements in `b`.
1855 __m128d _mm_div_pd(__m128d a, __m128d b) pure @safe
1856 {
1857 pragma(inline, true);
1858 return a / b;
1859 }
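// Basic check with exactly representable quotients.
unittest
{
    __m128d a = [12.0, 3.0];
    __m128d b = [4.0, 2.0];
    a = _mm_div_pd(a, b);
    assert(a.array == [3.0, 1.5]);
}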
1860
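/// Divide the lower double-precision (64-bit) floating-point element in `a` by the lower element in `b`, store the
/// result in the lower element of result, and copy the upper element from `a` to the upper element of result.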
1861 __m128d _mm_div_sd(__m128d a, __m128d b) pure @trusted
1862 {
1863 static if (GDC_with_SSE2)
1864 {
1865 return __builtin_ia32_divsd(a, b);
1866 }
1867 else version(DigitalMars)
1868 {
1869 // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
1870 // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
1871 asm pure nothrow @nogc @trusted { nop;}
1872 a.array[0] = a.array[0] / b.array[0];
1873 return a;
1874 }
1875 else
1876 {
1877 a.ptr[0] /= b.array[0];
1878 return a;
1879 }
1880 }
1881 unittest
1882 {
1883 __m128d a = [2.0, 4.5];
1884 a = _mm_div_sd(a, a);
1885 assert(a.array == [1.0, 4.5]);
1886 }
1887
1888 /// Extract a 16-bit integer from `v`, selected with `index`.
1889 /// Warning: the returned value is zero-extended to 32-bits.
1890 int _mm_extract_epi16(__m128i v, int index) pure @safe
1891 {
1892 short8 r = cast(short8)v;
1893 return cast(ushort)(r.array[index & 7]);
1894 }
1895 unittest
1896 {
1897 __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, -1);
1898 assert(_mm_extract_epi16(A, 6) == 6);
1899 assert(_mm_extract_epi16(A, 0) == 65535);
1900 assert(_mm_extract_epi16(A, 5 + 8) == 5);
1901 }
1902
1903 /// Copy `v`, and insert the 16-bit integer `i` at the location specified by `index`.
1904 __m128i _mm_insert_epi16 (__m128i v, int i, int index) @trusted
1905 {
1906 short8 r = cast(short8)v;
1907 r.ptr[index & 7] = cast(short)i;
1908 return cast(__m128i)r;
1909 }
1910 unittest
1911 {
1912 __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
1913 short8 R = cast(short8) _mm_insert_epi16(A, 42, 6);
1914 short[8] correct = [0, 1, 2, 3, 4, 5, 42, 7];
1915 assert(R.array == correct);
1916 }
1917
1918 /// Perform a serializing operation on all load-from-memory instructions that were issued prior
/// to this instruction. Guarantees that every load instruction that precedes, in program order, the load fence
/// instruction is globally visible before any load instruction that follows the fence in program order.
1921 void _mm_lfence() @trusted
1922 {
1923 version(GNU)
1924 {
1925 static if (GDC_with_SSE2)
1926 {
1927 __builtin_ia32_lfence();
1928 }
1929 else version(X86)
1930 {
1931 asm pure nothrow @nogc @trusted
1932 {
1933 "lfence;\n" : : : ;
1934 }
1935 }
1936 else
1937 static assert(false);
1938 }
1939 else static if (LDC_with_SSE2)
1940 {
1941 __builtin_ia32_lfence();
1942 }
1943 else static if (LDC_with_ARM64)
1944 {
1945 __builtin_arm_dmb(9); // dmb ishld
1946 }
1947 else static if (DMD_with_asm)
1948 {
1949 asm nothrow @nogc pure @trusted
1950 {
1951 lfence;
1952 }
1953 }
1954 else version(LDC)
1955 {
1956 // When the architecture is unknown, generate a full memory barrier,
// as the semantics of lfence do not really match those of atomics.
1958 llvm_memory_fence();
1959 }
1960 else
1961 static assert(false);
1962 }
1963 unittest
1964 {
1965 _mm_lfence();
1966 }
1967
1968 /// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory.
1969 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
1970 __m128d _mm_load_pd (const(double) * mem_addr) pure
1971 {
1972 pragma(inline, true);
1973 __m128d* aligned = cast(__m128d*)mem_addr;
1974 return *aligned;
1975 }
1976 unittest
1977 {
1978 align(16) double[2] S = [-5.0, 7.0];
1979 __m128d R = _mm_load_pd(S.ptr);
1980 assert(R.array == S);
1981 }
1982
1983 /// Load a double-precision (64-bit) floating-point element from memory into both elements of dst.
1984 /// `mem_addr` does not need to be aligned on any particular boundary.
1985 __m128d _mm_load_pd1 (const(double)* mem_addr) pure
1986 {
1987 double m = *mem_addr;
1988 __m128d r; // PERF =void;
1989 r.ptr[0] = m;
1990 r.ptr[1] = m;
1991 return r;
1992 }
1993 unittest
1994 {
1995 double what = 4;
1996 __m128d R = _mm_load_pd1(&what);
1997 double[2] correct = [4.0, 4];
1998 assert(R.array == correct);
1999 }
2000
2001 /// Load a double-precision (64-bit) floating-point element from memory into the lower of result, and zero the upper
2002 /// element. `mem_addr` does not need to be aligned on any particular boundary.
2003 __m128d _mm_load_sd (const(double)* mem_addr) pure @trusted
2004 {
2005 double2 r = [0, 0];
2006 r.ptr[0] = *mem_addr;
2007 return r;
2008 }
2009 unittest
2010 {
2011 double x = -42;
2012 __m128d a = _mm_load_sd(&x);
2013 assert(a.array == [-42.0, 0.0]);
2014 }
2015
2016 /// Load 128-bits of integer data from memory into dst.
2017 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
2018 __m128i _mm_load_si128 (const(__m128i)* mem_addr) pure @safe
2019 {
2020 pragma(inline, true);
2021 return *mem_addr;
2022 }
2023 unittest
2024 {
2025 align(16) int[4] correct = [-1, 2, 3, 4];
2026 int4 A = cast(int4) _mm_load_si128(cast(__m128i*) correct.ptr);
2027 assert(A.array == correct);
2028 }
2029
2030 alias _mm_load1_pd = _mm_load_pd1; ///
2031
2032 /// Load a double-precision (64-bit) floating-point element from memory into the upper element of result, and copy the
2033 /// lower element from `a` to result. `mem_addr` does not need to be aligned on any particular boundary.
2034 __m128d _mm_loadh_pd (__m128d a, const(double)* mem_addr) pure @trusted
2035 {
2036 pragma(inline, true);
2037 a.ptr[1] = *mem_addr;
2038 return a;
2039 }
2040 unittest
2041 {
2042 double A = 7.0;
2043 __m128d B = _mm_setr_pd(4.0, -5.0);
2044 __m128d R = _mm_loadh_pd(B, &A);
2045 double[2] correct = [ 4.0, 7.0 ];
2046 assert(R.array == correct);
2047 }
2048
/// Load a 64-bit integer from memory into the lower element of result, and zero the upper element.
/// Note: strange signature, since the memory doesn't have to be aligned and should point to an addressable
/// 64-bit location, not a 128-bit one. You may use `_mm_loadu_si64` instead.
2052 __m128i _mm_loadl_epi64 (const(__m128i)* mem_addr) pure @trusted
2053 {
2054 pragma(inline, true);
2055 static if (DMD_with_DSIMD)
2056 {
2057 return cast(__m128i) __simd(XMM.LODQ, *cast(__m128i*)mem_addr);
2058 }
2059 else
2060 {
2061 auto pLong = cast(const(long)*)mem_addr;
2062 long2 r = [0, 0];
2063 r.ptr[0] = *pLong;
2064 return cast(__m128i)(r);
2065 }
2066 }
2067 unittest
2068 {
2069 long A = 0x7878787870707070;
2070 long2 R = cast(long2) _mm_loadl_epi64(cast(__m128i*)&A);
2071 long[2] correct = [0x7878787870707070, 0];
2072 assert(R.array == correct);
2073 }
2074
2075 /// Load a double-precision (64-bit) floating-point element from memory into the lower element of result, and copy the
2076 /// upper element from `a` to result. mem_addr does not need to be aligned on any particular boundary.
2077 __m128d _mm_loadl_pd (__m128d a, const(double)* mem_addr) pure @trusted
2078 {
2079 a.ptr[0] = *mem_addr;
2080 return a;
2081 }
2082 unittest
2083 {
2084 double A = 7.0;
2085 __m128d B = _mm_setr_pd(4.0, -5.0);
2086 __m128d R = _mm_loadl_pd(B, &A);
2087 double[2] correct = [ 7.0, -5.0 ];
2088 assert(R.array == correct);
2089 }
2090
2091 /// Load 2 double-precision (64-bit) floating-point elements from memory into result in reverse order.
2092 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
2093 __m128d _mm_loadr_pd (const(double)* mem_addr) pure @trusted
2094 {
2095 __m128d a = *cast(__m128d*)(mem_addr);
2096 __m128d r; // PERF =void;
2097 r.ptr[0] = a.array[1];
2098 r.ptr[1] = a.array[0];
2099 return r;
2100 }
2101 unittest
2102 {
2103 align(16) double[2] A = [56.0, -74.0];
2104 __m128d R = _mm_loadr_pd(A.ptr);
2105 double[2] correct = [-74.0, 56.0];
2106 assert(R.array == correct);
2107 }
2108
2109 /// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory.
2110 /// `mem_addr` does not need to be aligned on any particular boundary.
2111 __m128d _mm_loadu_pd (const(double)* mem_addr) pure @trusted
2112 {
2113 pragma(inline, true);
2114 static if (GDC_with_SSE2)
2115 {
2116 return __builtin_ia32_loadupd(mem_addr);
2117 }
2118 else static if (LDC_with_optimizations)
2119 {
2120 return loadUnaligned!(double2)(mem_addr);
2121 }
2122 else version(DigitalMars)
2123 {
2124 // Apparently inside __simd you can use aligned dereferences without fear.
2125 // That was issue 23048 on dlang's Bugzilla.
2126 static if (DMD_with_DSIMD)
2127 {
2128 return cast(__m128d)__simd(XMM.LODUPD, *cast(double2*)mem_addr);
2129 }
2130 else static if (SSESizedVectorsAreEmulated)
2131 {
// Since this vector is emulated, it doesn't have alignment constraints
2133 // and as such we can just cast it.
2134 return *cast(__m128d*)(mem_addr);
2135 }
2136 else
2137 {
2138 __m128d result;
2139 result.ptr[0] = mem_addr[0];
2140 result.ptr[1] = mem_addr[1];
2141 return result;
2142 }
2143 }
2144 else
2145 {
2146 __m128d result;
2147 result.ptr[0] = mem_addr[0];
2148 result.ptr[1] = mem_addr[1];
2149 return result;
2150 }
2151 }
2152 unittest
2153 {
2154 double[2] A = [56.0, -75.0];
2155 __m128d R = _mm_loadu_pd(A.ptr);
2156 double[2] correct = [56.0, -75.0];
2157 assert(R.array == correct);
2158 }
2159
2160 /// Load 128-bits of integer data from memory. `mem_addr` does not need to be aligned on any particular boundary.
2161 __m128i _mm_loadu_si128 (const(__m128i)* mem_addr) pure @trusted
2162 {
2163 // PERF DMD
2164 pragma(inline, true);
2165 static if (GDC_with_SSE2)
2166 {
2167 return cast(__m128i) __builtin_ia32_loaddqu(cast(const(char*))mem_addr);
2168 }
2169 else static if (LDC_with_optimizations)
2170 {
2171 return loadUnaligned!(__m128i)(cast(int*)mem_addr);
2172 }
2173 else
2174 {
2175 const(int)* p = cast(const(int)*)mem_addr;
2176 __m128i r = void;
2177 r.ptr[0] = p[0];
2178 r.ptr[1] = p[1];
2179 r.ptr[2] = p[2];
2180 r.ptr[3] = p[3];
2181 return r;
2182 }
2183 }
2184 unittest
2185 {
2186 align(16) int[4] correct = [-1, 2, -3, 4];
2187 int4 A = cast(int4) _mm_loadu_si128(cast(__m128i*) correct.ptr);
2188 assert(A.array == correct);
2189 }
2190
/// Load an unaligned 16-bit integer from memory into the first element of result, and zero the other elements.
2192 __m128i _mm_loadu_si16(const(void)* mem_addr) pure @trusted // TODO: should be @system actually
2193 {
2194 static if (DMD_with_DSIMD)
2195 {
2196 int r = *cast(short*)(mem_addr);
2197 return cast(__m128i) __simd(XMM.LODD, *cast(__m128i*)&r);
2198 }
2199 else version(DigitalMars)
2200 {
2201 // Workaround issue: https://issues.dlang.org/show_bug.cgi?id=21672
2202 // DMD cannot handle the below code...
2203 align(16) short[8] r = [0, 0, 0, 0, 0, 0, 0, 0];
2204 r[0] = *cast(short*)(mem_addr);
2205 return *cast(int4*)(r.ptr);
2206 }
2207 else
2208 {
2209 short r = *cast(short*)(mem_addr);
2210 short8 result = [0, 0, 0, 0, 0, 0, 0, 0];
2211 result.ptr[0] = r;
2212 return cast(__m128i)result;
2213 }
2214 }
2215 unittest
2216 {
2217 short r = 13;
2218 short8 A = cast(short8) _mm_loadu_si16(&r);
2219 short[8] correct = [13, 0, 0, 0, 0, 0, 0, 0];
2220 assert(A.array == correct);
2221 }
2222
/// Load an unaligned 32-bit integer from memory into the first element of result, and zero the upper elements.
2224 __m128i _mm_loadu_si32 (const(void)* mem_addr) pure @trusted // TODO: should be @system actually
2225 {
2226 pragma(inline, true);
2227 int r = *cast(int*)(mem_addr);
2228 int4 result = [0, 0, 0, 0];
2229 result.ptr[0] = r;
2230 return result;
2231 }
2232 unittest
2233 {
2234 int r = 42;
2235 __m128i A = _mm_loadu_si32(&r);
2236 int[4] correct = [42, 0, 0, 0];
2237 assert(A.array == correct);
2238 }
2239
2240 /// Load unaligned 64-bit integer from memory into the first element of result.
2241 /// Upper 64-bit is zeroed.
2242 __m128i _mm_loadu_si64 (const(void)* mem_addr) pure @system
2243 {
2244 pragma(inline, true);
2245 static if (DMD_with_DSIMD)
2246 {
2247 return cast(__m128i) __simd(XMM.LODQ, *cast(__m128i*)mem_addr);
2248 }
2249 else
2250 {
2251 auto pLong = cast(const(long)*)mem_addr;
2252 long2 r = [0, 0];
2253 r.ptr[0] = *pLong;
2254 return cast(__m128i)r;
2255 }
2256 }
2257 unittest
2258 {
2259 long r = 446446446446;
2260 long2 A = cast(long2) _mm_loadu_si64(&r);
2261 long[2] correct = [446446446446, 0];
2262 assert(A.array == correct);
2263 }
2264
2265 /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate
2266 /// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers,
2267 /// and pack the results in destination.
2268 __m128i _mm_madd_epi16 (__m128i a, __m128i b) pure @trusted
2269 {
2270 static if (GDC_with_SSE2)
2271 {
2272 return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b);
2273 }
2274 else static if (LDC_with_SSE2)
2275 {
2276 return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b);
2277 }
2278 else static if (LDC_with_optimizations)
2279 {
2280 // 5 inst with arm64 + LDC 1.32 + -O1
2281 enum ir = `
2282 %ia = sext <8 x i16> %0 to <8 x i32>
2283 %ib = sext <8 x i16> %1 to <8 x i32>
2284 %p = mul <8 x i32> %ia, %ib
2285 %p_even = shufflevector <8 x i32> %p, <8 x i32> undef, <4 x i32> <i32 0, i32 2,i32 4, i32 6>
2286 %p_odd = shufflevector <8 x i32> %p, <8 x i32> undef, <4 x i32> <i32 1, i32 3,i32 5, i32 7>
2287 %p_sum = add <4 x i32> %p_even, %p_odd
2288 ret <4 x i32> %p_sum`;
2289 return cast(__m128i) LDCInlineIR!(ir, int4, short8, short8)(cast(short8)a, cast(short8)b);
2290 }
2291 else
2292 {
2293 short8 sa = cast(short8)a;
2294 short8 sb = cast(short8)b;
2295 int4 r;
2296 foreach(i; 0..4)
2297 {
2298 r.ptr[i] = sa.array[2*i] * sb.array[2*i] + sa.array[2*i+1] * sb.array[2*i+1];
2299 }
2300 return r;
2301 }
2302 }
2303 unittest
2304 {
2305 short8 A = [0, 1, 2, 3, -32768, -32768, 32767, 32767];
2306 short8 B = [0, 1, 2, 3, -32768, -32768, 32767, 32767];
2307 int4 R = _mm_madd_epi16(cast(__m128i)A, cast(__m128i)B);
2308 int[4] correct = [1, 13, -2147483648, 2*32767*32767];
2309 assert(R.array == correct);
2310 }
2311
2312 /// Conditionally store 8-bit integer elements from `a` into memory using `mask`
2313 /// (elements are not stored when the highest bit is not set in the corresponding element)
2314 /// and a non-temporal memory hint. `mem_addr` does not need to be aligned on any particular
2315 /// boundary.
2316 void _mm_maskmoveu_si128 (__m128i a, __m128i mask, void* mem_addr) @trusted
2317 {
2318 static if (GDC_with_SSE2)
2319 {
2320 return __builtin_ia32_maskmovdqu(cast(ubyte16)a, cast(ubyte16)mask, cast(char*)mem_addr);
2321 }
2322 else static if (LDC_with_SSE2)
2323 {
2324 return __builtin_ia32_maskmovdqu(cast(byte16)a, cast(byte16)mask, cast(char*)mem_addr);
2325 }
2326 else static if (LDC_with_ARM64)
2327 {
2328 // PERF: catastrophic on ARM32
2329 byte16 bmask = cast(byte16)mask;
2330 byte16 shift = 7;
2331 bmask = bmask >> shift; // sign-extend to have a 0xff or 0x00 mask
2332 mask = cast(__m128i) bmask;
2333 __m128i dest = loadUnaligned!__m128i(cast(int*)mem_addr);
2334 dest = (a & mask) | (dest & ~mask);
2335 storeUnaligned!__m128i(dest, cast(int*)mem_addr);
2336 }
2337 else
2338 {
2339 byte16 b = cast(byte16)a;
2340 byte16 m = cast(byte16)mask;
2341 byte* dest = cast(byte*)(mem_addr);
2342 foreach(j; 0..16)
2343 {
2344 if (m.array[j] & 128)
2345 {
2346 dest[j] = b.array[j];
2347 }
2348 }
2349 }
2350 }
2351 unittest
2352 {
2353 ubyte[16] dest = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42];
2354 __m128i mask = _mm_setr_epi8(0,-1, 0,-1,-1, 1,-1,-1, 0,-1,-4,-1,-1, 0,-127, 0);
2355 __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15);
2356 _mm_maskmoveu_si128(A, mask, dest.ptr);
2357 ubyte[16] correct = [42, 1,42, 3, 4,42, 6, 7,42, 9,10,11,12,42,14,42];
2358 assert(dest == correct);
2359 }
2360
2361 /// Compare packed signed 16-bit integers in `a` and `b`, and return packed maximum values.
2362 __m128i _mm_max_epi16 (__m128i a, __m128i b) pure @safe
2363 {
2364 static if (GDC_with_SSE2)
2365 {
2366 return cast(__m128i) __builtin_ia32_pmaxsw128(cast(short8)a, cast(short8)b);
2367 }
2368 else version(LDC)
2369 {
2370 // x86: pmaxsw since LDC 1.0 -O1
// ARM: smax.8h since LDC 1.5 -O1
2372 short8 sa = cast(short8)a;
2373 short8 sb = cast(short8)b;
2374 static if (SIMD_COMPARISON_MASKS_16B)
2375 short8 greater = sa > sb;
2376 else
2377 short8 greater = greaterMask!short8(sa, sb);
2378 return cast(__m128i)( (greater & sa) | (~greater & sb) );
2379 }
2380 else
2381 {
2382 __m128i lowerShorts = _mm_cmpgt_epi16(a, b); // ones where a should be selected, b else
2383 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
2384 __m128i mask = _mm_and_si128(aTob, lowerShorts);
2385 return _mm_xor_si128(b, mask);
2386 }
2387 }
2388 unittest
2389 {
2390 short8 R = cast(short8) _mm_max_epi16(_mm_setr_epi16(32767, 1, -4, -8, 9, 7, 0,-57),
2391 _mm_setr_epi16(-4,-8, 9, 7, 0,-32768, 0, 0));
2392 short[8] correct = [32767, 1, 9, 7, 9, 7, 0, 0];
2393 assert(R.array == correct);
2394 }
2395
/// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed maximum values.
2397 __m128i _mm_max_epu8 (__m128i a, __m128i b) pure @safe
2398 {
2399 // PERF DMD
2400 static if (GDC_with_SSE2)
2401 {
2402 return cast(__m128i) __builtin_ia32_pmaxub128(cast(ubyte16)a, cast(ubyte16)b);
2403 }
2404 else version(LDC)
2405 {
2406 // x86: pmaxub since LDC 1.0.0 -O1
2407 // ARM64: umax.16b since LDC 1.5.0 -O1
2408 // PERF: catastrophic on ARM32
2409 ubyte16 sa = cast(ubyte16)a;
2410 ubyte16 sb = cast(ubyte16)b;
2411 static if (SIMD_COMPARISON_MASKS_16B)
2412 ubyte16 greater = (cast(ubyte16)a > cast(ubyte16)b);
2413 else
2414 ubyte16 greater = cast(ubyte16) greaterMask!ubyte16(sa, sb);
2415 return cast(__m128i)( (greater & sa) | (~greater & sb) );
2416 }
2417 else
2418 {
2419 // PERF: use algorithm from _mm_max_epu16
2420 __m128i value128 = _mm_set1_epi8(-128);
2421 __m128i higher = _mm_cmpgt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison
2422 __m128i aTob = a ^ b; // a ^ (a ^ b) == b
2423 __m128i mask = aTob & higher;
2424 return b ^ mask;
2426 }
2427 }
2428 unittest
2429 {
2430 byte16 R = cast(byte16) _mm_max_epu8(_mm_setr_epi8(45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0),
2431 _mm_setr_epi8(-4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57));
2432 byte[16] correct = [-4,-8, -4, -8, 9,-57, 0,-57, -4,-8, -4, -8, 9,-57, 0,-57];
2433 assert(R.array == correct);
2434 }
2435
2436 /// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return
2437 /// packed maximum values.
2438 __m128d _mm_max_pd (__m128d a, __m128d b) pure @trusted
2439 {
2440 static if (GDC_with_SSE2)
2441 {
2442 return __builtin_ia32_maxpd(a, b);
2443 }
2444 else
2445 {
2446 // x86: Generates maxpd starting with LDC 1.9 -O2
2447 a.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0];
2448 a.ptr[1] = (a.array[1] > b.array[1]) ? a.array[1] : b.array[1];
2449 return a;
2450 }
2451 }
2452 unittest
2453 {
2454 __m128d A = _mm_setr_pd(4.0, 1.0);
2455 __m128d B = _mm_setr_pd(1.0, 8.0);
2456 __m128d M = _mm_max_pd(A, B);
2457 assert(M.array[0] == 4.0);
2458 assert(M.array[1] == 8.0);
2459 }
2460
2461 /// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`, store the maximum value in the
2462 /// lower element of result, and copy the upper element from `a` to the upper element of result.
2463 __m128d _mm_max_sd (__m128d a, __m128d b) pure @trusted
2464 {
2465 static if (GDC_with_SSE2)
2466 {
2467 return __builtin_ia32_maxsd(a, b);
2468 }
2469 else
2470 {
2471 __m128d r = a;
2472 // Generates maxsd starting with LDC 1.3
2473 r.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0];
2474 return r;
2475 }
2476 }
2477 unittest
2478 {
2479 __m128d A = _mm_setr_pd(1.0, 1.0);
2480 __m128d B = _mm_setr_pd(4.0, 2.0);
2481 __m128d M = _mm_max_sd(A, B);
2482 assert(M.array[0] == 4.0);
2483 assert(M.array[1] == 1.0);
2484 }
2485
2486 /// Perform a serializing operation on all load-from-memory and store-to-memory instructions that were issued prior to
2487 /// this instruction. Guarantees that every memory access that precedes, in program order, the memory fence instruction
2488 /// is globally visible before any memory instruction which follows the fence in program order.
2489 void _mm_mfence() @trusted // not pure!
2490 {
2491 version(GNU)
2492 {
2493 static if (GDC_with_SSE2)
2494 {
2495 __builtin_ia32_mfence();
2496 }
2497 else version(X86)
2498 {
2499 asm pure nothrow @nogc @trusted
2500 {
2501 "mfence;\n" : : : ;
2502 }
2503 }
2504 else
2505 static assert(false);
2506 }
2507 else static if (LDC_with_SSE2)
2508 {
2509 __builtin_ia32_mfence();
2510 }
2511 else static if (DMD_with_asm)
2512 {
2513 asm nothrow @nogc pure @trusted
2514 {
2515 mfence;
2516 }
2517 }
2518 else version(LDC)
2519 {
2520 // Note: will generate the DMB ish instruction on ARM
2521 llvm_memory_fence();
2522 }
2523 else
2524 static assert(false);
2525 }
2526 unittest
2527 {
2528 _mm_mfence();
2529 }
2530
2531 /// Compare packed signed 16-bit integers in `a` and `b`, and return packed minimum values.
2532 __m128i _mm_min_epi16 (__m128i a, __m128i b) pure @safe
2533 {
2534 static if (GDC_with_SSE2)
2535 {
2536 return cast(__m128i) __builtin_ia32_pminsw128(cast(short8)a, cast(short8)b);
2537 }
2538 else version(LDC)
2539 {
2540 // x86: pminsw since LDC 1.0 -O1
// ARM64: smin.8h since LDC 1.5 -O1
2542 short8 sa = cast(short8)a;
2543 short8 sb = cast(short8)b;
2544 static if (SIMD_COMPARISON_MASKS_16B)
2545 short8 greater = sa > sb;
2546 else
2547 short8 greater = greaterMask!short8(sa, sb);
2548 return cast(__m128i)( (~greater & sa) | (greater & sb) );
2549 }
2550 else
2551 {
2552 __m128i lowerShorts = _mm_cmplt_epi16(a, b); // ones where a should be selected, b else
2553 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
2554 __m128i mask = _mm_and_si128(aTob, lowerShorts);
2555 return _mm_xor_si128(b, mask);
2556 }
2557 }
2558 unittest
2559 {
2560 short8 R = cast(short8) _mm_min_epi16(_mm_setr_epi16(45, 1, -4, -8, 9, 7, 0,-32768),
2561 _mm_setr_epi16(-4,-8, 9, 7, 0,-57, 0, 0));
2562 short[8] correct = [-4,-8, -4, -8, 0,-57, 0, -32768];
2563 assert(R.array == correct);
2564 }
2565
2566 /// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed minimum values.
2567 __m128i _mm_min_epu8 (__m128i a, __m128i b) pure @safe
2568 {
2569 static if (GDC_with_SSE2)
2570 {
2571 return cast(__m128i) __builtin_ia32_pminub128(cast(ubyte16)a, cast(ubyte16)b);
2572 }
2573 else version(LDC)
2574 {
2575 // x86: pminub since LDC 1.0.0 -O1
2576 // ARM: umin.16b since LDC 1.5.0 -O1
2577 // PERF: catastrophic on ARM32
2578 ubyte16 sa = cast(ubyte16)a;
2579 ubyte16 sb = cast(ubyte16)b;
2580 static if (SIMD_COMPARISON_MASKS_16B)
2581 ubyte16 greater = (cast(ubyte16)a > cast(ubyte16)b);
2582 else
2583 ubyte16 greater = cast(ubyte16) greaterMask!ubyte16(sa, sb);
2584 return cast(__m128i)( (~greater & sa) | (greater & sb) );
2585 }
2586 else
2587 {
2588 // PERF: use the algorithm from _mm_max_epu16
2589 __m128i value128 = _mm_set1_epi8(-128);
2590 __m128i lower = _mm_cmplt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison
2591 __m128i aTob = a ^ b; // a ^ (a ^ b) == b
2592 __m128i mask = aTob & lower;
2593 return b ^ mask;
2594 }
2595 }
2596 unittest
2597 {
2598 byte16 R = cast(byte16) _mm_min_epu8(_mm_setr_epi8(45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0),
2599 _mm_setr_epi8(-4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57));
2600 byte[16] correct = [45, 1, 9, 7, 0, 7, 0, 0, 45, 1, 9, 7, 0, 7, 0, 0];
2601 assert(R.array == correct);
2602 }
2603
2604 /// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return packed minimum values.
2605 __m128d _mm_min_pd (__m128d a, __m128d b) pure @trusted
2606 {
2607 static if (GDC_with_SSE2)
2608 {
2609 return __builtin_ia32_minpd(a, b);
2610 }
2611 else
2612 {
2613 // Generates minpd starting with LDC 1.9
2614 a.ptr[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
2615 a.ptr[1] = (a.array[1] < b.array[1]) ? a.array[1] : b.array[1];
2616 return a;
2617 }
2618 }
2619 unittest
2620 {
2621 __m128d A = _mm_setr_pd(1.0, 2.0);
2622 __m128d B = _mm_setr_pd(4.0, 1.0);
2623 __m128d M = _mm_min_pd(A, B);
2624 assert(M.array[0] == 1.0);
2625 assert(M.array[1] == 1.0);
2626 }
2627
2628 /// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`, store the minimum value in
2629 /// the lower element of result, and copy the upper element from `a` to the upper element of result.
2630 __m128d _mm_min_sd (__m128d a, __m128d b) pure @safe
2631 {
2632 static if (GDC_with_SSE2)
2633 {
2634 return __builtin_ia32_minsd(a, b);
2635 }
2636 else
2637 {
2638 // Generates minsd starting with LDC 1.3
2639 __m128d r = a;
2640 r.array[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
2641 return r;
2642 }
2643 }
2644 unittest
2645 {
2646 __m128d A = _mm_setr_pd(1.0, 3.0);
2647 __m128d B = _mm_setr_pd(4.0, 2.0);
2648 __m128d M = _mm_min_sd(A, B);
2649 assert(M.array[0] == 1.0);
2650 assert(M.array[1] == 3.0);
2651 }
2652
2653 /// Copy the lower 64-bit integer in `a` to the lower element of result, and zero the upper element.
2654 __m128i _mm_move_epi64 (__m128i a) pure @trusted
2655 {
2656 static if (GDC_with_SSE2)
2657 {
2658 // slightly better with GDC -O0
2659 return cast(__m128i) __builtin_ia32_movq128(cast(long2)a);
2660 }
2661 else
2662 {
2663 long2 result = [ 0, 0 ];
2664 long2 la = cast(long2) a;
2665 result.ptr[0] = la.array[0];
2666 return cast(__m128i)(result);
2667 }
2668 }
2669 unittest
2670 {
2671 long2 A = [13, 47];
2672 long2 B = cast(long2) _mm_move_epi64( cast(__m128i)A );
2673 long[2] correct = [13, 0];
2674 assert(B.array == correct);
2675 }
2676
2677 /// Move the lower double-precision (64-bit) floating-point element from `b` to the lower element of result, and copy
2678 /// the upper element from `a` to the upper element of dst.
2679 __m128d _mm_move_sd (__m128d a, __m128d b) pure @trusted
2680 {
2681 static if (GDC_with_SSE2)
2682 {
2683 return __builtin_ia32_movsd(a, b);
2684 }
2685 else
2686 {
2687 b.ptr[1] = a.array[1];
2688 return b;
2689 }
2690 }
2691 unittest
2692 {
2693 double2 A = [13.0, 47.0];
2694 double2 B = [34.0, 58.0];
2695 double2 C = _mm_move_sd(A, B);
2696 double[2] correct = [34.0, 47.0];
2697 assert(C.array == correct);
2698 }
2699
/// Create mask from the most significant bit of each 8-bit element in `a`.
2701 int _mm_movemask_epi8 (__m128i a) pure @trusted
2702 {
2703 // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047
2704 static if (GDC_with_SSE2)
2705 {
2706 return __builtin_ia32_pmovmskb128(cast(ubyte16)a);
2707 }
2708 else static if (LDC_with_SSE2)
2709 {
2710 return __builtin_ia32_pmovmskb128(cast(byte16)a);
2711 }
2712 else static if (LDC_with_ARM64)
2713 {
2714 // Solution from https://stackoverflow.com/questions/11870910/sse-mm-movemask-epi8-equivalent-method-for-arm-neon
// The other two solutions there use intrinsics that LLVM could not find, and that took a long time to track down.
// So there might be something a bit faster, but this one is reasonable and branchless.
2717 byte8 mask_shift;
2718 mask_shift.ptr[0] = 7;
2719 mask_shift.ptr[1] = 6;
2720 mask_shift.ptr[2] = 5;
2721 mask_shift.ptr[3] = 4;
2722 mask_shift.ptr[4] = 3;
2723 mask_shift.ptr[5] = 2;
2724 mask_shift.ptr[6] = 1;
2725 mask_shift.ptr[7] = 0;
2726 byte8 mask_and = byte8(-128);
2727 byte8 lo = vget_low_u8(cast(byte16)a);
2728 byte8 hi = vget_high_u8(cast(byte16)a);
2729 lo = vand_u8(lo, mask_and);
2730 lo = vshr_u8(lo, mask_shift);
2731 hi = vand_u8(hi, mask_and);
2732 hi = vshr_u8(hi, mask_shift);
2733 lo = vpadd_u8(lo,lo);
2734 lo = vpadd_u8(lo,lo);
2735 lo = vpadd_u8(lo,lo);
2736 hi = vpadd_u8(hi,hi);
2737 hi = vpadd_u8(hi,hi);
2738 hi = vpadd_u8(hi,hi);
2739 return (cast(ubyte)(hi[0]) << 8) | cast(ubyte)(lo[0]);
2740 }
2741 else
2742 {
2743 byte16 ai = cast(byte16)a;
2744 int r = 0;
2745 foreach(bit; 0..16)
2746 {
2747 if (ai.array[bit] < 0) r += (1 << bit);
2748 }
2749 return r;
2750 }
2751 }
2752 unittest
2753 {
2754 assert(0x9C36 == _mm_movemask_epi8(_mm_set_epi8(-1, 1, 2, -3, -1, -1, 4, 8, 127, 0, -1, -1, 0, -1, -1, 0)));
2755 }
2756
/// Create mask from the most significant bit of each 16-bit element in `a`. #BONUS
2758 int _mm_movemask_epi16 (__m128i a) pure @trusted
2759 {
2760 return _mm_movemask_epi8(_mm_packs_epi16(a, _mm_setzero_si128()));
2761 }
2762 unittest
2763 {
2764 assert(0x9C == _mm_movemask_epi16(_mm_set_epi16(-1, 1, 2, -3, -32768, -1, 32767, 8)));
2765 }
2766
2767 /// Set each bit of mask result based on the most significant bit of the corresponding packed double-precision (64-bit)
/// floating-point element in `v`.
2769 int _mm_movemask_pd(__m128d v) pure @safe
2770 {
2771 // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047
2772 static if (GDC_or_LDC_with_SSE2)
2773 {
2774 return __builtin_ia32_movmskpd(v);
2775 }
2776 else
2777 {
2778 long2 lv = cast(long2)v;
2779 int r = 0;
2780 if (lv.array[0] < 0) r += 1;
2781 if (lv.array[1] < 0) r += 2;
2782 return r;
2783 }
2784 }
2785 unittest
2786 {
2787 __m128d A = cast(__m128d) _mm_set_epi64x(-1, 0);
2788 assert(_mm_movemask_pd(A) == 2);
2789 }
2790
/// Copy the lower 64-bit integer in `v` to result.
2792 __m64 _mm_movepi64_pi64 (__m128i v) pure @safe
2793 {
2794 long2 lv = cast(long2)v;
2795 return long1(lv.array[0]);
2796 }
2797 unittest
2798 {
2799 __m128i A = _mm_set_epi64x(-1, -2);
2800 __m64 R = _mm_movepi64_pi64(A);
2801 assert(R.array[0] == -2);
2802 }
2803
2804 /// Copy the 64-bit integer `a` to the lower element of dest, and zero the upper element.
2805 __m128i _mm_movpi64_epi64 (__m64 a) pure @trusted
2806 {
2807 long2 r;
2808 r.ptr[0] = a.array[0];
2809 r.ptr[1] = 0;
2810 return cast(__m128i)r;
2811 }
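// Minimal check: the lower lane carries the 64-bit integer, the upper lane is zeroed.
unittest
{
    __m64 A = _mm_cvtsi64_m64(-1);
    long2 R = cast(long2) _mm_movpi64_epi64(A);
    long[2] correct = [-1, 0];
    assert(R.array == correct);
}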
2812
2813 /// Multiply the low unsigned 32-bit integers from each packed 64-bit element in `a` and `b`,
2814 /// and store the unsigned 64-bit results.
2815 __m128i _mm_mul_epu32 (__m128i a, __m128i b) pure @trusted
2816 {
2817 // PERF DMD D_SIMD
2818 static if (GDC_with_SSE2)
2819 {
2820 return cast(__m128i) __builtin_ia32_pmuludq128 (a, b);
2821 }
2822 else
2823 {
2824 version(LDC)
2825 {
2826 static if (__VERSION__ >= 2088)
2827 {
2828 // Need LLVM9 for proper optimization
2829 long2 la, lb;
2830 la.ptr[0] = cast(uint)a.array[0];
2831 la.ptr[1] = cast(uint)a.array[2];
2832 lb.ptr[0] = cast(uint)b.array[0];
2833 lb.ptr[1] = cast(uint)b.array[2];
2834 }
2835 else
2836 {
2837 __m128i zero;
2838 zero = 0;
2839 long2 la = cast(long2) shufflevectorLDC!(int4, 0, 4, 2, 6)(a, zero);
2840 long2 lb = cast(long2) shufflevectorLDC!(int4, 0, 4, 2, 6)(b, zero);
2841 }
2842 }
2843 else
2844 {
2845 long2 la, lb;
2846 la.ptr[0] = cast(uint)a.array[0];
2847 la.ptr[1] = cast(uint)a.array[2];
2848 lb.ptr[0] = cast(uint)b.array[0];
2849 lb.ptr[1] = cast(uint)b.array[2];
2850 }
2851
2852 version(DigitalMars)
2853 {
2854 // DMD has no long2 mul
2855 la.ptr[0] *= lb.array[0];
2856 la.ptr[1] *= lb.array[1];
2857 return cast(__m128i)(la);
2858 }
2859 else
2860 {
2861 static if (__VERSION__ >= 2076)
2862 {
2863 return cast(__m128i)(la * lb);
2864 }
2865 else
2866 {
2867 // long2 mul not supported before LDC 1.5
2868 la.ptr[0] *= lb.array[0];
2869 la.ptr[1] *= lb.array[1];
2870 return cast(__m128i)(la);
2871 }
2872 }
2873 }
2874 }
2875 unittest
2876 {
2877 __m128i A = _mm_set_epi32(42, 0xDEADBEEF, 42, 0xffffffff);
2878 __m128i B = _mm_set_epi32(42, 0xCAFEBABE, 42, 0xffffffff);
2879 __m128i C = _mm_mul_epu32(A, B);
2880 long2 LC = cast(long2)C;
2881 assert(LC.array[0] == 18446744065119617025uL);
2882 assert(LC.array[1] == 12723420444339690338uL);
2883 }
2884
2885 /// Multiply packed double-precision (64-bit) floating-point elements in `a` and `b`, and return the results.
2886 __m128d _mm_mul_pd(__m128d a, __m128d b) pure @safe
2887 {
2888 pragma(inline, true);
2889 return a * b;
2890 }
2891 unittest
2892 {
2893 __m128d a = [-2.0, 1.5];
2894 a = _mm_mul_pd(a, a);
2895 assert(a.array == [4.0, 2.25]);
2896 }
2897
2898 /// Multiply the lower double-precision (64-bit) floating-point element in `a` and `b`, store the result in the lower
2899 /// element of result, and copy the upper element from `a` to the upper element of result.
2900 __m128d _mm_mul_sd(__m128d a, __m128d b) pure @trusted
2901 {
2902 version(DigitalMars)
2903 {
2904 // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
2905 // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
2906 asm pure nothrow @nogc @trusted { nop;}
2907 a.array[0] = a.array[0] * b.array[0];
2908 return a;
2909 }
2910 else static if (GDC_with_SSE2)
2911 {
2912 return __builtin_ia32_mulsd(a, b);
2913 }
2914 else
2915 {
2916 a.ptr[0] *= b.array[0];
2917 return a;
2918 }
2919 }
2920 unittest
2921 {
2922 __m128d a = [-2.0, 1.5];
2923 a = _mm_mul_sd(a, a);
2924 assert(a.array == [4.0, 1.5]);
2925 }
2926
2927 /// Multiply the low unsigned 32-bit integers from `a` and `b`,
2928 /// and get an unsigned 64-bit result.
2929 __m64 _mm_mul_su32 (__m64 a, __m64 b) pure @safe
2930 {
2931 return to_m64(_mm_mul_epu32(to_m128i(a), to_m128i(b)));
2932 }
2933 unittest
2934 {
2935 __m64 A = _mm_set_pi32(42, 0xDEADBEEF);
2936 __m64 B = _mm_set_pi32(42, 0xCAFEBABE);
2937 __m64 C = _mm_mul_su32(A, B);
2938 assert(C.array[0] == 0xDEADBEEFuL * 0xCAFEBABEuL);
2939 }
2940
2941 /// Multiply the packed signed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the
2942 /// high 16 bits of the intermediate integers.
2943 __m128i _mm_mulhi_epi16 (__m128i a, __m128i b) pure @trusted
2944 {
2945 static if (GDC_with_SSE2)
2946 {
2947 return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b);
2948 }
2949 else static if (LDC_with_SSE2)
2950 {
2951 return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b);
2952 }
2953 else
2954 {
2955 // ARM64: LDC 1.5 -O2 or later gives a nice sequence with 2 x ext.16b, 2 x smull.4s and shrn.4h shrn2.8h
2956 // PERF: it seems the simde solution has one less instruction in ARM64.
2957 // PERF: Catastrophic in ARM32.
2958 short8 sa = cast(short8)a;
2959 short8 sb = cast(short8)b;
2960 short8 r = void;
2961 r.ptr[0] = (sa.array[0] * sb.array[0]) >> 16;
2962 r.ptr[1] = (sa.array[1] * sb.array[1]) >> 16;
2963 r.ptr[2] = (sa.array[2] * sb.array[2]) >> 16;
2964 r.ptr[3] = (sa.array[3] * sb.array[3]) >> 16;
2965 r.ptr[4] = (sa.array[4] * sb.array[4]) >> 16;
2966 r.ptr[5] = (sa.array[5] * sb.array[5]) >> 16;
2967 r.ptr[6] = (sa.array[6] * sb.array[6]) >> 16;
2968 r.ptr[7] = (sa.array[7] * sb.array[7]) >> 16;
2969 return cast(__m128i)r;
2970 }
2971 }
2972 unittest
2973 {
2974 __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7);
2975 __m128i B = _mm_set1_epi16(16384);
2976 short8 R = cast(short8)_mm_mulhi_epi16(A, B);
2977 short[8] correct = [0, -4, 0, 0, 1, 2, 4, 1];
2978 assert(R.array == correct);
2979 }
2980
2981 /// Multiply the packed unsigned 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the
2982 /// high 16 bits of the intermediate integers.
2983 __m128i _mm_mulhi_epu16 (__m128i a, __m128i b) pure @trusted
2984 {
2985 static if (GDC_with_SSE2)
2986 {
2987 return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b);
2988 }
2989 else static if (LDC_with_SSE2)
2990 {
2991 return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b);
2992 }
2993 else
2994 {
2995 // ARM64: LDC 1.5 -O2 or later gives a nice sequence with 2 x ext.16b, 2 x umull.4s and shrn.4h shrn2.8h
2996 // it seems the simde solution has one less instruction in ARM64
2997 // PERF: Catastrophic in ARM32.
2998 short8 sa = cast(short8)a;
2999 short8 sb = cast(short8)b;
3000 short8 r = void;
3001 r.ptr[0] = cast(short)( (cast(ushort)sa.array[0] * cast(ushort)sb.array[0]) >> 16 );
3002 r.ptr[1] = cast(short)( (cast(ushort)sa.array[1] * cast(ushort)sb.array[1]) >> 16 );
3003 r.ptr[2] = cast(short)( (cast(ushort)sa.array[2] * cast(ushort)sb.array[2]) >> 16 );
3004 r.ptr[3] = cast(short)( (cast(ushort)sa.array[3] * cast(ushort)sb.array[3]) >> 16 );
3005 r.ptr[4] = cast(short)( (cast(ushort)sa.array[4] * cast(ushort)sb.array[4]) >> 16 );
3006 r.ptr[5] = cast(short)( (cast(ushort)sa.array[5] * cast(ushort)sb.array[5]) >> 16 );
3007 r.ptr[6] = cast(short)( (cast(ushort)sa.array[6] * cast(ushort)sb.array[6]) >> 16 );
3008 r.ptr[7] = cast(short)( (cast(ushort)sa.array[7] * cast(ushort)sb.array[7]) >> 16 );
3009 return cast(__m128i)r;
3010 }
3011 }
3012 unittest
3013 {
3014 __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7);
3015 __m128i B = _mm_set1_epi16(16384);
3016 short8 R = cast(short8)_mm_mulhi_epu16(A, B);
3017 short[8] correct = [0, 0x3FFC, 0, 0, 1, 2, 4, 1];
3018 assert(R.array == correct);
3019 }
3020
3021 /// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the low 16
3022 /// bits of the intermediate integers.
3023 __m128i _mm_mullo_epi16 (__m128i a, __m128i b) pure @safe
3024 {
3025 return cast(__m128i)(cast(short8)a * cast(short8)b);
3026 }
3027 unittest
3028 {
3029 __m128i A = _mm_setr_epi16(16384, -16, 0, 3, 4, 1, 16, 7);
3030 __m128i B = _mm_set1_epi16(16384);
3031 short8 R = cast(short8)_mm_mullo_epi16(A, B);
3032 short[8] correct = [0, 0, 0, -16384, 0, 16384, 0, -16384];
3033 assert(R.array == correct);
3034 }
3035
3036 /// Compute the bitwise NOT of 128 bits in `a`. #BONUS
3037 __m128i _mm_not_si128 (__m128i a) pure @safe
3038 {
3039 return ~a;
3040 }
3041 unittest
3042 {
3043 __m128i A = _mm_set1_epi32(-748);
3044 int4 notA = cast(int4) _mm_not_si128(A);
3045 int[4] correct = [747, 747, 747, 747];
3046 assert(notA.array == correct);
3047 }
3048
3049 /// Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in `a` and `b`.
3050 __m128d _mm_or_pd (__m128d a, __m128d b) pure @safe
3051 {
3052 pragma(inline, true);
3053 return cast(__m128d)( cast(__m128i)a | cast(__m128i)b );
3054 }
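// Checked through an integer view, since the bitwise OR of double bit patterns is easiest to verify that way.
unittest
{
    __m128d A = cast(__m128d) _mm_set_epi64x(5, 1);
    __m128d B = cast(__m128d) _mm_set_epi64x(2, 4);
    long2 R = cast(long2) _mm_or_pd(A, B);
    long[2] correct = [5, 7];
    assert(R.array == correct);
}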
3055
3056 /// Compute the bitwise OR of 128 bits (representing integer data) in `a` and `b`.
3057 __m128i _mm_or_si128 (__m128i a, __m128i b) pure @safe
3058 {
3059 pragma(inline, true);
3060 return a | b;
3061 }
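// Basic check of the 128-bit OR.
unittest
{
    __m128i A = _mm_setr_epi32(1, 2, 4, 8);
    __m128i B = _mm_setr_epi32(2, 2, 8, 8);
    int4 R = cast(int4) _mm_or_si128(A, B);
    int[4] correct = [3, 2, 12, 8];
    assert(R.array == correct);
}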
3062
3063 /// Convert packed signed 32-bit integers from `a` and `b` to packed 16-bit integers using signed saturation.
3064 __m128i _mm_packs_epi32 (__m128i a, __m128i b) pure @trusted
3065 {
3066 static if (DMD_with_DSIMD)
3067 {
3068 return cast(__m128i) __simd(XMM.PACKSSDW, a, b);
3069 }
3070 else static if (GDC_with_SSE2)
3071 {
3072 return cast(__m128i) __builtin_ia32_packssdw128(a, b);
3073 }
3074 else static if (LDC_with_SSE2)
3075 {
3076 return cast(__m128i) __builtin_ia32_packssdw128(a, b);
3077 }
3078 else static if (LDC_with_ARM64)
3079 {
3080 short4 ra = vqmovn_s32(cast(int4)a);
3081 short4 rb = vqmovn_s32(cast(int4)b);
3082 return cast(__m128i)vcombine_s16(ra, rb);
3083 }
3084 else
3085 {
3086 // PERF: catastrophic on ARM32
3087 short8 r;
3088 r.ptr[0] = saturateSignedIntToSignedShort(a.array[0]);
3089 r.ptr[1] = saturateSignedIntToSignedShort(a.array[1]);
3090 r.ptr[2] = saturateSignedIntToSignedShort(a.array[2]);
3091 r.ptr[3] = saturateSignedIntToSignedShort(a.array[3]);
3092 r.ptr[4] = saturateSignedIntToSignedShort(b.array[0]);
3093 r.ptr[5] = saturateSignedIntToSignedShort(b.array[1]);
3094 r.ptr[6] = saturateSignedIntToSignedShort(b.array[2]);
3095 r.ptr[7] = saturateSignedIntToSignedShort(b.array[3]);
3096 return cast(__m128i)r;
3097 }
3098 }
3099 unittest
3100 {
3101 __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0);
3102 short8 R = cast(short8) _mm_packs_epi32(A, A);
3103 short[8] correct = [32767, -32768, 1000, 0, 32767, -32768, 1000, 0];
3104 assert(R.array == correct);
3105 }
3106
3107 /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using signed saturation.
3108 __m128i _mm_packs_epi16 (__m128i a, __m128i b) pure @trusted
3109 {
3110 static if (DMD_with_DSIMD)
3111 {
3112 return cast(__m128i) __simd(XMM.PACKSSWB, a, b);
3113 }
3114 else static if (GDC_with_SSE2)
3115 {
3116 return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b);
3117 }
3118 else static if (LDC_with_SSE2)
3119 {
3120 return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b);
3121 }
3122 else static if (LDC_with_ARM64)
3123 {
// Generates a nice pair of sqxtn.8b + sqxtn2 since LDC 1.5 -O2
3125 byte8 ra = vqmovn_s16(cast(short8)a);
3126 byte8 rb = vqmovn_s16(cast(short8)b);
3127 return cast(__m128i)vcombine_s8(ra, rb);
3128 }
3129 else
3130 {
3131 // PERF: ARM32 is missing
3132 byte16 r;
3133 short8 sa = cast(short8)a;
3134 short8 sb = cast(short8)b;
3135 foreach(i; 0..8)
3136 r.ptr[i] = saturateSignedWordToSignedByte(sa.array[i]);
3137 foreach(i; 0..8)
3138 r.ptr[i+8] = saturateSignedWordToSignedByte(sb.array[i]);
3139 return cast(__m128i)r;
3140 }
3141 }
3142 unittest
3143 {
3144 __m128i A = _mm_setr_epi16(1000, -1000, 1000, 0, 256, -129, 254, 0);
3145 byte16 R = cast(byte16) _mm_packs_epi16(A, A);
3146 byte[16] correct = [127, -128, 127, 0, 127, -128, 127, 0,
3147 127, -128, 127, 0, 127, -128, 127, 0];
3148 assert(R.array == correct);
3149 }
3150
3151 /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using unsigned saturation.
3152 __m128i _mm_packus_epi16 (__m128i a, __m128i b) pure @trusted
3153 {
3154 // PERF DMD catastrophic
3155 static if (DMD_with_DSIMD)
3156 {
3157 return cast(__m128i) __simd(XMM.PACKUSWB, a, b);
3158 }
3159 else static if (GDC_with_SSE2)
3160 {
3161 return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b);
3162 }
3163 else static if (LDC_with_SSE2)
3164 {
3165 return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b);
3166 }
3167 else static if (LDC_with_ARM64)
3168 {
// Generates a nice pair of sqxtun + sqxtun2 since LDC 1.5 -O2
3170 byte8 ra = vqmovun_s16(cast(short8)a);
3171 byte8 rb = vqmovun_s16(cast(short8)b);
3172 return cast(__m128i)vcombine_s8(ra, rb);
3173 }
3174 else
3175 {
3176 short8 sa = cast(short8)a;
3177 short8 sb = cast(short8)b;
3178 align(16) ubyte[16] result = void;
3179 for (int i = 0; i < 8; ++i)
3180 {
3181 short s = sa[i];
3182 if (s < 0) s = 0;
3183 if (s > 255) s = 255;
3184 result[i] = cast(ubyte)s;
3185
3186 s = sb[i];
3187 if (s < 0) s = 0;
3188 if (s > 255) s = 255;
3189 result[i+8] = cast(ubyte)s;
3190 }
3191 return *cast(__m128i*)(result.ptr);
3192 }
3193 }
3194 unittest
3195 {
3196 __m128i A = _mm_setr_epi16(-10, 400, 0, 256, 255, 2, 1, 0);
3197 byte16 AA = cast(byte16) _mm_packus_epi16(A, A);
3198 static immutable ubyte[16] correctResult = [0, 255, 0, 255, 255, 2, 1, 0,
3199 0, 255, 0, 255, 255, 2, 1, 0];
3200 foreach(i; 0..16)
3201 assert(AA.array[i] == cast(byte)(correctResult[i]));
3202 }
3203
3204 /// Provide a hint to the processor that the code sequence is a spin-wait loop. This can help improve the performance
3205 /// and power consumption of spin-wait loops.
3206 void _mm_pause() @trusted
3207 {
3208 version(GNU)
3209 {
3210 static if (GDC_with_SSE2)
3211 {
3212 __builtin_ia32_pause();
3213 }
3214 else version(X86)
3215 {
3216 asm pure nothrow @nogc @trusted
3217 {
3218 "pause;\n" : : : ;
3219 }
3220 }
3221 else
3222 static assert(false);
3223 }
3224 else static if (LDC_with_SSE2)
3225 {
3226 __builtin_ia32_pause();
3227 }
3228 else static if (DMD_with_asm)
3229 {
3230 asm nothrow @nogc pure @trusted
3231 {
3232 rep; nop; // F3 90 = pause
3233 }
3234 }
3235 else version (LDC)
3236 {
// PERF: Does nothing currently; could be the "yield" instruction on ARM.
3238 }
3239 else
3240 static assert(false);
3241 }
3242 unittest
3243 {
3244 _mm_pause();
3245 }
3246
3247 /// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`, then horizontally sum each
3248 /// consecutive 8 differences to produce two unsigned 16-bit integers, and pack these unsigned 16-bit integers in the
3249 /// low 16 bits of 64-bit elements in result.
3250 __m128i _mm_sad_epu8 (__m128i a, __m128i b) pure @trusted
3251 {
3252 static if (GDC_with_SSE2)
3253 {
3254 return cast(__m128i) __builtin_ia32_psadbw128(cast(ubyte16)a, cast(ubyte16)b);
3255 }
3256 else static if (LDC_with_SSE2)
3257 {
3258 return cast(__m128i) __builtin_ia32_psadbw128(cast(byte16)a, cast(byte16)b);
3259 }
3260 else static if (LDC_with_ARM64)
3261 {
3262 ushort8 t = cast(ushort8) vpaddlq_u8(vabdq_u8(cast(byte16) a, cast(byte16) b));
3263
3264 // PERF: Looks suboptimal vs addp
3265 ushort r0 = cast(ushort)(t[0] + t[1] + t[2] + t[3]);
3266 ushort r4 = cast(ushort)(t[4] + t[5] + t[6] + t[7]);
3267 ushort8 r = 0;
3268 r[0] = r0;
3269 r[4] = r4;
3270 return cast(__m128i) r;
3271 }
3272 else
3273 {
3274 // PERF: ARM32 is lacking
3275 byte16 ab = cast(byte16)a;
3276 byte16 bb = cast(byte16)b;
3277 ubyte[16] t;
3278 foreach(i; 0..16)
3279 {
3280 int diff = cast(ubyte)(ab.array[i]) - cast(ubyte)(bb.array[i]);
3281 if (diff < 0) diff = -diff;
3282 t[i] = cast(ubyte)(diff);
3283 }
3284 int4 r = _mm_setzero_si128();
3285 r.ptr[0] = t[0] + t[1] + t[2] + t[3] + t[4] + t[5] + t[6] + t[7];
3286 r.ptr[2] = t[8] + t[9] + t[10]+ t[11]+ t[12]+ t[13]+ t[14]+ t[15];
3287 return r;
3288 }
3289 }
3290 unittest
3291 {
3292 __m128i A = _mm_setr_epi8(3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54); // primes + 1
3293 __m128i B = _mm_set1_epi8(1);
3294 __m128i R = _mm_sad_epu8(A, B);
3295 int[4] correct = [2 + 3 + 5 + 7 + 11 + 13 + 17 + 19,
3296 0,
3297 23 + 29 + 31 + 37 + 41 + 43 + 47 + 53,
3298 0];
3299 assert(R.array == correct);
3300 }
3301
3302 /// Set packed 16-bit integers with the supplied values.
3303 __m128i _mm_set_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted
3304 {
3305 short8 r = void;
3306 r.ptr[0] = e0;
3307 r.ptr[1] = e1;
3308 r.ptr[2] = e2;
3309 r.ptr[3] = e3;
3310 r.ptr[4] = e4;
3311 r.ptr[5] = e5;
3312 r.ptr[6] = e6;
3313 r.ptr[7] = e7;
3314 return cast(__m128i) r;
3315 }
3316 unittest
3317 {
3318 __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
3319 short8 B = cast(short8) A;
3320 foreach(i; 0..8)
3321 assert(B.array[i] == i);
3322 }
3323
3324 /// Set packed 32-bit integers with the supplied values.
3325 __m128i _mm_set_epi32 (int e3, int e2, int e1, int e0) pure @trusted
3326 {
3327 // PERF: does a constant inline correctly? vs int4 field assignment
3328 align(16) int[4] r = [e0, e1, e2, e3];
3329 return *cast(int4*)&r;
3330 }
3331 unittest
3332 {
3333 __m128i A = _mm_set_epi32(3, 2, 1, 0);
3334 foreach(i; 0..4)
3335 assert(A.array[i] == i);
3336 }
3337
3338 /// Set packed 64-bit integers with the supplied values.
3339 __m128i _mm_set_epi64(__m64 e1, __m64 e0) pure @trusted
3340 {
3341 pragma(inline, true);
3342 long2 r = void;
3343 r.ptr[0] = e0.array[0];
3344 r.ptr[1] = e1.array[0];
3345 return cast(__m128i)(r);
3346 }
3347 unittest
3348 {
3349 __m128i A = _mm_set_epi64(_mm_cvtsi64_m64(1234), _mm_cvtsi64_m64(5678));
3350 long2 B = cast(long2) A;
3351 assert(B.array[0] == 5678);
3352 assert(B.array[1] == 1234);
3353 }
3354
3355 /// Set packed 64-bit integers with the supplied values.
3356 __m128i _mm_set_epi64x (long e1, long e0) pure @trusted
3357 {
3358 pragma(inline, true);
3359 long2 r = void;
3360 r.ptr[0] = e0;
3361 r.ptr[1] = e1;
3362 return cast(__m128i)(r);
3363 }
3364 unittest
3365 {
3366 __m128i A = _mm_set_epi64x(1234, -5678);
3367 long2 B = cast(long2) A;
3368 assert(B.array[0] == -5678);
3369 assert(B.array[1] == 1234);
3370 }
3371
3372 /// Set packed 8-bit integers with the supplied values.
3373 __m128i _mm_set_epi8 (byte e15, byte e14, byte e13, byte e12,
3374 byte e11, byte e10, byte e9, byte e8,
3375 byte e7, byte e6, byte e5, byte e4,
3376 byte e3, byte e2, byte e1, byte e0) pure @trusted
3377 {
3378 align(16) byte[16] result = [e0, e1, e2, e3, e4, e5, e6, e7,
3379 e8, e9, e10, e11, e12, e13, e14, e15];
3380 return *cast(__m128i*)(result.ptr);
3381 }
3382 unittest
3383 {
3384 byte16 R = cast(byte16) _mm_set_epi8(-1, 0, 56, 127, -128, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14);
3385 byte[16] correct = [14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, -128, 127, 56, 0, -1];
3386 assert(R.array == correct);
3387 }
3388
3389 /// Set packed double-precision (64-bit) floating-point elements with the supplied values.
3390 __m128d _mm_set_pd (double e1, double e0) pure @trusted
3391 {
3392 pragma(inline, true);
3393 double2 r = void;
3394 r.ptr[0] = e0;
3395 r.ptr[1] = e1;
3396 return r;
3397 }
3398 unittest
3399 {
3400 __m128d A = _mm_set_pd(61.0, 55.0);
3401 double[2] correct = [55.0, 61.0];
3402 assert(A.array == correct);
3403 }
3404
/// Broadcast double-precision (64-bit) floating-point value `a` to all elements.
3406 __m128d _mm_set_pd1 (double a) pure @trusted
3407 {
3408 pragma(inline, true);
3409 __m128d r = void;
3410 r.ptr[0] = a;
3411 r.ptr[1] = a;
3412 return r;
3413 }
3414 unittest
3415 {
3416 __m128d A = _mm_set_pd1(61.0);
3417 double[2] correct = [61.0, 61.0];
3418 assert(A.array == correct);
3419 }
3420
3421 /// Copy double-precision (64-bit) floating-point element `a` to the lower element of result,
3422 /// and zero the upper element.
3423 __m128d _mm_set_sd (double a) pure @trusted
3424 {
3425 double2 r = void;
3426 r.ptr[0] = a;
3427 r.ptr[1] = 0.0;
3428 return r;
3429 }
3430 unittest
3431 {
3432 __m128d A = _mm_set_sd(61.0);
3433 double[2] correct = [61.0, 0.0];
3434 assert(A.array == correct);
3435 }
3436
/// Broadcast 16-bit integer `a` to all elements.
3438 __m128i _mm_set1_epi16 (short a) pure @trusted
3439 {
3440 version(DigitalMars) // workaround https://issues.dlang.org/show_bug.cgi?id=21469
3441 {
3442 short8 v = a;
3443 return cast(__m128i) v;
3444 }
3445 else
3446 {
3447 pragma(inline, true);
3448 return cast(__m128i)(short8(a));
3449 }
3450 }
3451 unittest
3452 {
3453 short8 a = cast(short8) _mm_set1_epi16(31);
3454 for (int i = 0; i < 8; ++i)
3455 assert(a.array[i] == 31);
3456 }
3457
3458 /// Broadcast 32-bit integer `a` to all elements.
3459 __m128i _mm_set1_epi32 (int a) pure @trusted
3460 {
3461 pragma(inline, true);
3462 return cast(__m128i)(int4(a));
3463 }
3464 unittest
3465 {
3466 int4 a = cast(int4) _mm_set1_epi32(31);
3467 for (int i = 0; i < 4; ++i)
3468 assert(a.array[i] == 31);
3469 }
3470
3471 /// Broadcast 64-bit integer `a` to all elements.
3472 __m128i _mm_set1_epi64 (__m64 a) pure @safe
3473 {
3474 return _mm_set_epi64(a, a);
3475 }
3476 unittest
3477 {
3478 long b = 0x1DEADCAFE;
3479 __m64 a;
3480 a.ptr[0] = b;
3481 long2 c = cast(long2) _mm_set1_epi64(a);
3482 assert(c.array[0] == b);
3483 assert(c.array[1] == b);
3484 }
3485
/// Broadcast 64-bit integer `a` to all elements.
3487 __m128i _mm_set1_epi64x (long a) pure @trusted
3488 {
3489 long2 b = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470
3490 return cast(__m128i)(b);
3491 }
3492 unittest
3493 {
3494 long b = 0x1DEADCAFE;
3495 long2 c = cast(long2) _mm_set1_epi64x(b);
3496 for (int i = 0; i < 2; ++i)
3497 assert(c.array[i] == b);
3498 }
3499
3500 /// Broadcast 8-bit integer `a` to all elements.
3501 __m128i _mm_set1_epi8 (byte a) pure @trusted
3502 {
3503 pragma(inline, true);
3504 byte16 b = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470
3505 return cast(__m128i)(b);
3506 }
3507 unittest
3508 {
3509 byte16 b = cast(byte16) _mm_set1_epi8(31);
3510 for (int i = 0; i < 16; ++i)
3511 assert(b.array[i] == 31);
3512 }
3513
alias _mm_set1_pd = _mm_set_pd1; ///
3515
3516 /// Set packed 16-bit integers with the supplied values in reverse order.
3517 __m128i _mm_setr_epi16 (short e7, short e6, short e5, short e4,
3518 short e3, short e2, short e1, short e0) pure @trusted
3519 {
3520 short8 r = void;
3521 r.ptr[0] = e7;
3522 r.ptr[1] = e6;
3523 r.ptr[2] = e5;
3524 r.ptr[3] = e4;
3525 r.ptr[4] = e3;
3526 r.ptr[5] = e2;
3527 r.ptr[6] = e1;
3528 r.ptr[7] = e0;
3529 return cast(__m128i)(r);
3530 }
3531 unittest
3532 {
3533 short8 A = cast(short8) _mm_setr_epi16(7, 6, 5, -32768, 32767, 2, 1, 0);
3534 short[8] correct = [7, 6, 5, -32768, 32767, 2, 1, 0];
3535 assert(A.array == correct);
3536 }
3537
3538 /// Set packed 32-bit integers with the supplied values in reverse order.
3539 __m128i _mm_setr_epi32 (int e3, int e2, int e1, int e0) pure @trusted
3540 {
3541 // Performs better than = void; with GDC
3542 pragma(inline, true);
3543 align(16) int[4] result = [e3, e2, e1, e0];
3544 return *cast(__m128i*)(result.ptr);
3545 }
3546 unittest
3547 {
3548 int4 A = cast(int4) _mm_setr_epi32(-1, 0, -2147483648, 2147483647);
3549 int[4] correct = [-1, 0, -2147483648, 2147483647];
3550 assert(A.array == correct);
3551 }
3552
3553 /// Set packed 64-bit integers with the supplied values in reverse order.
3554 __m128i _mm_setr_epi64 (long e1, long e0) pure @trusted
3555 {
3556 long2 r = void;
3557 r.ptr[0] = e1;
3558 r.ptr[1] = e0;
3559 return cast(__m128i)(r);
3560 }
3561 unittest
3562 {
3563 long2 A = cast(long2) _mm_setr_epi64(-1, 0);
3564 long[2] correct = [-1, 0];
3565 assert(A.array == correct);
3566 }
3567
3568 /// Set packed 8-bit integers with the supplied values in reverse order.
3569 __m128i _mm_setr_epi8 (byte e15, byte e14, byte e13, byte e12,
3570 byte e11, byte e10, byte e9, byte e8,
3571 byte e7, byte e6, byte e5, byte e4,
3572 byte e3, byte e2, byte e1, byte e0) pure @trusted
3573 {
3574 align(16) byte[16] result = [e15, e14, e13, e12, e11, e10, e9, e8,
3575 e7, e6, e5, e4, e3, e2, e1, e0];
3576 return *cast(__m128i*)(result.ptr);
3577 }
3578 unittest
3579 {
3580 byte16 R = cast(byte16) _mm_setr_epi8(-1, 0, 56, 127, -128, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14);
3581 byte[16] correct = [-1, 0, 56, 127, -128, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14];
3582 assert(R.array == correct);
3583 }
3584
3585 /// Set packed double-precision (64-bit) floating-point elements with the supplied values in reverse order.
3586 __m128d _mm_setr_pd (double e1, double e0) pure @trusted
3587 {
3588 pragma(inline, true);
3589 double2 result;
3590 result.ptr[0] = e1;
3591 result.ptr[1] = e0;
3592 return result;
3593 }
3594 unittest
3595 {
3596 __m128d A = _mm_setr_pd(61.0, 55.0);
3597 double[2] correct = [61.0, 55.0];
3598 assert(A.array == correct);
3599 }
3600
3601 /// Return vector of type `__m128d` with all elements set to zero.
3602 __m128d _mm_setzero_pd() pure @trusted
3603 {
3604 pragma(inline, true);
3605 double2 r = void;
3606 r.ptr[0] = 0.0;
3607 r.ptr[1] = 0.0;
3608 return r;
3609 }
3610 unittest
3611 {
3612 __m128d A = _mm_setzero_pd();
3613 double[2] correct = [0.0, 0.0];
3614 assert(A.array == correct);
3615 }
3616
3617 /// Return vector of type `__m128i` with all elements set to zero.
3618 __m128i _mm_setzero_si128() pure @trusted
3619 {
3620 pragma(inline, true);
3621 int4 r = void;
3622 r.ptr[0] = 0;
3623 r.ptr[1] = 0;
3624 r.ptr[2] = 0;
3625 r.ptr[3] = 0;
3626 return r;
3627 }
3628 unittest
3629 {
3630 __m128i A = _mm_setzero_si128();
3631 int[4] correct = [0, 0, 0, 0];
3632 assert(A.array == correct);
3633 }
3634
3635 /// Shuffle 32-bit integers in `a` using the control in `imm8`.
3636 /// See_also: `_MM_SHUFFLE`.
3637 __m128i _mm_shuffle_epi32(int imm8)(__m128i a) pure @trusted
3638 {
3639 // PERF DMD D_SIMD
3640 static if (GDC_with_SSE2)
3641 {
3642 return __builtin_ia32_pshufd(a, imm8);
3643 }
3644 else static if (LDC_with_optimizations)
3645 {
3646 return shufflevectorLDC!(int4, (imm8 >> 0) & 3,
3647 (imm8 >> 2) & 3,
3648 (imm8 >> 4) & 3,
3649 (imm8 >> 6) & 3)(a, a);
3650 }
3651 else
3652 {
3653 int4 r = void;
3654 r.ptr[0] = a.ptr[(imm8 >> 0) & 3];
3655 r.ptr[1] = a.ptr[(imm8 >> 2) & 3];
3656 r.ptr[2] = a.ptr[(imm8 >> 4) & 3];
3657 r.ptr[3] = a.ptr[(imm8 >> 6) & 3];
3658 return r;
3659 }
3660 }
3661 unittest
3662 {
3663 __m128i A = _mm_setr_epi32(0, 1, 2, 3);
3664 enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
3665 int4 B = cast(int4) _mm_shuffle_epi32!SHUFFLE(A);
3666 int[4] expectedB = [ 3, 2, 1, 0 ];
3667 assert(B.array == expectedB);
3668 }
3669
3670 /// Shuffle double-precision (64-bit) floating-point elements using the control in `imm8`.
3671 /// See_also: `_MM_SHUFFLE2`.
3672 __m128d _mm_shuffle_pd (int imm8)(__m128d a, __m128d b) pure @trusted
3673 {
3674 // PERF DMD D_SIMD
3675 static if (GDC_with_SSE2)
3676 {
3677 return __builtin_ia32_shufpd(a, b, imm8);
3678 }
3679 else version(LDC)
3680 {
3681 return shufflevectorLDC!(double2, 0 + ( imm8 & 1 ),
3682 2 + ( (imm8 >> 1) & 1 ))(a, b);
3683 }
3684 else
3685 {
3686 double2 r = void;
3687 r.ptr[0] = a.array[imm8 & 1];
3688 r.ptr[1] = b.array[(imm8 >> 1) & 1];
3689 return r;
3690 }
3691 }
3692 unittest
3693 {
3694 __m128d A = _mm_setr_pd(0.5, 2.0);
3695 __m128d B = _mm_setr_pd(4.0, 5.0);
3696 enum int SHUFFLE = _MM_SHUFFLE2(1, 1);
3697 __m128d R = _mm_shuffle_pd!SHUFFLE(A, B);
3698 double[2] correct = [ 2.0, 5.0 ];
3699 assert(R.array == correct);
3700 }
3701
/// Shuffle 16-bit integers in the high 64 bits of `a` using the control in `imm8`. Store the results in the high
/// 64 bits of result, with the low 64 bits being copied from `a` to result.
/// See_also: `_MM_SHUFFLE`.
3705 __m128i _mm_shufflehi_epi16(int imm8)(__m128i a) pure @trusted
3706 {
3707 static if (DMD_with_DSIMD)
3708 {
3709 return cast(__m128i) __simd(XMM.PSHUFHW, a, a, cast(ubyte)imm8);
3710 }
3711 else static if (GDC_with_SSE2)
3712 {
3713 return cast(__m128i) __builtin_ia32_pshufhw(cast(short8)a, imm8);
3714 }
3715 else static if (LDC_with_optimizations)
3716 {
3717 return cast(__m128i) shufflevectorLDC!(short8, 0, 1, 2, 3,
3718 4 + ( (imm8 >> 0) & 3 ),
3719 4 + ( (imm8 >> 2) & 3 ),
3720 4 + ( (imm8 >> 4) & 3 ),
3721 4 + ( (imm8 >> 6) & 3 ))(cast(short8)a, cast(short8)a);
3722 }
3723 else
3724 {
3725 short8 r = cast(short8)a;
3726 short8 sa = cast(short8)a;
3727 r.ptr[4] = sa.array[4 + ( (imm8 >> 0) & 3 ) ];
3728 r.ptr[5] = sa.array[4 + ( (imm8 >> 2) & 3 ) ];
3729 r.ptr[6] = sa.array[4 + ( (imm8 >> 4) & 3 ) ];
3730 r.ptr[7] = sa.array[4 + ( (imm8 >> 6) & 3 ) ];
3731 return cast(__m128i) r;
3732 }
3733 }
3734 unittest
3735 {
3736 __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3737 enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
3738 short8 C = cast(short8) _mm_shufflehi_epi16!SHUFFLE(A);
3739 short[8] expectedC = [ 0, 1, 2, 3, 7, 6, 5, 4 ];
3740 assert(C.array == expectedC);
3741 }
3742
3743 /// Shuffle 16-bit integers in the low 64 bits of `a` using the control in `imm8`. Store the results in the low 64
/// bits of result, with the high 64 bits being copied from `a` to result.
3745 /// See_also: `_MM_SHUFFLE`.
3746 __m128i _mm_shufflelo_epi16(int imm8)(__m128i a) pure @trusted
3747 {
3748 static if (DMD_with_DSIMD)
3749 {
3750 return cast(__m128i) __simd(XMM.PSHUFLW, a, a, cast(ubyte)imm8);
3751 }
3752 else static if (GDC_with_SSE2)
3753 {
3754 return cast(__m128i) __builtin_ia32_pshuflw(cast(short8)a, imm8);
3755 }
3756 else static if (LDC_with_optimizations)
3757 {
3758 return cast(__m128i) shufflevectorLDC!(short8, ( (imm8 >> 0) & 3 ),
3759 ( (imm8 >> 2) & 3 ),
3760 ( (imm8 >> 4) & 3 ),
3761 ( (imm8 >> 6) & 3 ), 4, 5, 6, 7)(cast(short8)a, cast(short8)a);
3762 }
3763 else
3764 {
3765 short8 r = cast(short8)a;
3766 short8 sa = cast(short8)a;
3767 r.ptr[0] = sa.array[(imm8 >> 0) & 3];
3768 r.ptr[1] = sa.array[(imm8 >> 2) & 3];
3769 r.ptr[2] = sa.array[(imm8 >> 4) & 3];
3770 r.ptr[3] = sa.array[(imm8 >> 6) & 3];
3771 return cast(__m128i) r;
3772 }
3773 }
3774 unittest
3775 {
3776 __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3777 enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
3778 short8 B = cast(short8) _mm_shufflelo_epi16!SHUFFLE(A);
3779 short[8] expectedB = [ 3, 2, 1, 0, 4, 5, 6, 7 ];
3780 assert(B.array == expectedB);
3781 }
3782
3783 /// Shift packed 32-bit integers in `a` left by `count` while shifting in zeros.
3784 deprecated("Use _mm_slli_epi32 instead.") __m128i _mm_sll_epi32 (__m128i a, __m128i count) pure @trusted
3785 {
3786 static if (LDC_with_SSE2)
3787 {
3788 return __builtin_ia32_pslld128(a, count);
3789 }
3790 else static if (GDC_with_SSE2)
3791 {
3792 return __builtin_ia32_pslld128(a, count);
3793 }
3794 else static if (DMD_with_32bit_asm)
3795 {
3796 asm pure nothrow @nogc @trusted
3797 {
3798 movdqu XMM0, a;
3799 movdqu XMM1, count;
3800 pslld XMM0, XMM1;
3801 movdqu a, XMM0;
3802 }
3803 return a;
3804 }
3805 else
3806 {
3807 int4 r = void;
3808 long2 lc = cast(long2)count;
3809 int bits = cast(int)(lc.array[0]);
3810 foreach(i; 0..4)
3811 r[i] = cast(uint)(a[i]) << bits;
3812 return r;
3813 }
3814 }
3815
3816 /// Shift packed 64-bit integers in `a` left by `count` while shifting in zeros.
3817 deprecated("Use _mm_slli_epi64 instead.") __m128i _mm_sll_epi64 (__m128i a, __m128i count) pure @trusted
3818 {
3819 static if (LDC_with_SSE2)
3820 {
3821 return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count);
3822 }
3823 else static if (GDC_with_SSE2)
3824 {
3825 return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count);
3826 }
3827 else static if (DMD_with_32bit_asm)
3828 {
3829 asm pure nothrow @nogc @trusted
3830 {
3831 movdqu XMM0, a;
3832 movdqu XMM1, count;
3833 psllq XMM0, XMM1;
3834 movdqu a, XMM0;
3835 }
3836 return a;
3837 }
3838 else
3839 {
3840 // ARM: good since LDC 1.12 -O2
        // but the -O0 version is catastrophic
3842 long2 r = void;
3843 long2 sa = cast(long2)a;
3844 long2 lc = cast(long2)count;
3845 int bits = cast(int)(lc.array[0]);
3846 foreach(i; 0..2)
3847 r.array[i] = cast(ulong)(sa.array[i]) << bits;
3848 return cast(__m128i)r;
3849 }
3850 }
3851
3852 /// Shift packed 16-bit integers in `a` left by `count` while shifting in zeros.
3853 deprecated("Use _mm_slli_epi16 instead.") __m128i _mm_sll_epi16 (__m128i a, __m128i count) pure @trusted
3854 {
3855 static if (GDC_or_LDC_with_SSE2)
3856 {
3857 return cast(__m128i)__builtin_ia32_psllw128(cast(short8)a, cast(short8)count);
3858 }
3859 else static if (DMD_with_32bit_asm)
3860 {
3861 asm pure nothrow @nogc @trusted
3862 {
3863 movdqu XMM0, a;
3864 movdqu XMM1, count;
3865 psllw XMM0, XMM1;
3866 movdqu a, XMM0;
3867 }
3868 return a;
3869 }
3870 else
3871 {
3872 short8 sa = cast(short8)a;
3873 long2 lc = cast(long2)count;
3874 int bits = cast(int)(lc.array[0]);
3875 short8 r = void;
3876 foreach(i; 0..8)
3877 r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) << bits);
3878 return cast(int4)r;
3879 }
3880 }
3881
3882
3883 /// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros.
3884 __m128i _mm_slli_epi32 (__m128i a, int imm8) pure @trusted
3885 {
3886 static if (GDC_with_SSE2)
3887 {
3888 return __builtin_ia32_pslldi128(a, cast(ubyte)imm8);
3889 }
3890 else static if (LDC_with_SSE2)
3891 {
3892 return __builtin_ia32_pslldi128(a, cast(ubyte)imm8);
3893 }
3894 else
3895 {
        // Note: the intrinsic guarantees that only imm8[0..7] is taken into account,
        // but in D shifting by the bit width or more is undefined behaviour
        // ("It's illegal to shift by the same or more bits than the size of the
        // quantity being shifted"), so out-of-range counts are handled explicitly.
3900 int4 r = _mm_setzero_si128();
3901
3902 ubyte count = cast(ubyte) imm8;
3903 if (count > 31)
3904 return r;
3905
3906 foreach(i; 0..4)
3907 r.array[i] = cast(uint)(a.array[i]) << count;
3908 return r;
3909 }
3910 }
3911 unittest
3912 {
3913 __m128i A = _mm_setr_epi32(0, 2, 3, -4);
3914 __m128i B = _mm_slli_epi32(A, 1);
3915 __m128i B2 = _mm_slli_epi32(A, 1 + 256);
3916 int[4] expectedB = [ 0, 4, 6, -8];
3917 assert(B.array == expectedB);
3918 assert(B2.array == expectedB);
3919
3920 __m128i C = _mm_slli_epi32(A, 0);
3921 int[4] expectedC = [ 0, 2, 3, -4];
3922 assert(C.array == expectedC);
3923
3924 __m128i D = _mm_slli_epi32(A, 65);
3925 int[4] expectedD = [ 0, 0, 0, 0];
3926 assert(D.array == expectedD);
3927 }
3928
3929 /// Shift packed 64-bit integers in `a` left by `imm8` while shifting in zeros.
3930 __m128i _mm_slli_epi64 (__m128i a, int imm8) pure @trusted
3931 {
3932 static if (GDC_with_SSE2)
3933 {
3934 return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8);
3935 }
3936 else static if (LDC_with_SSE2)
3937 {
3938 return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8);
3939 }
3940 else
3941 {
3942 long2 sa = cast(long2)a;
3943
        // Note: the intrinsic guarantees that only imm8[0..7] is taken into account,
        // but in D shifting by the bit width or more is undefined behaviour
        // ("It's illegal to shift by the same or more bits than the size of the
        // quantity being shifted"), so out-of-range counts are handled explicitly.
3948 long2 r = cast(long2) _mm_setzero_si128();
3949 ubyte count = cast(ubyte) imm8;
3950 if (count > 63)
3951 return cast(__m128i)r;
3952
3953 r.ptr[0] = cast(ulong)(sa.array[0]) << count;
3954 r.ptr[1] = cast(ulong)(sa.array[1]) << count;
3955 return cast(__m128i)r;
3956 }
3957 }
3958 unittest
3959 {
3960 __m128i A = _mm_setr_epi64(8, -4);
3961 long2 B = cast(long2) _mm_slli_epi64(A, 1);
3962 long2 B2 = cast(long2) _mm_slli_epi64(A, 1 + 1024);
3963 long[2] expectedB = [ 16, -8];
3964 assert(B.array == expectedB);
3965 assert(B2.array == expectedB);
3966
3967 long2 C = cast(long2) _mm_slli_epi64(A, 0);
3968 long[2] expectedC = [ 8, -4];
3969 assert(C.array == expectedC);
3970
3971 long2 D = cast(long2) _mm_slli_epi64(A, 64);
3972 long[2] expectedD = [ 0, -0];
3973 assert(D.array == expectedD);
3974 }
3975
3976 /// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros.
3977 __m128i _mm_slli_epi16(__m128i a, int imm8) pure @trusted
3978 {
3979 static if (GDC_with_SSE2)
3980 {
3981 return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8);
3982 }
3983 else static if (LDC_with_SSE2)
3984 {
3985 return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8);
3986 }
3987 else static if (LDC_with_ARM64)
3988 {
3989 short8 sa = cast(short8)a;
3990 short8 r = cast(short8)_mm_setzero_si128();
3991 ubyte count = cast(ubyte) imm8;
3992 if (count > 15)
3993 return cast(__m128i)r;
3994 r = sa << short8(count);
3995 return cast(__m128i)r;
3996 }
3997 else
3998 {
3999 short8 sa = cast(short8)a;
4000 short8 r = cast(short8)_mm_setzero_si128();
4001 ubyte count = cast(ubyte) imm8;
4002 if (count > 15)
4003 return cast(__m128i)r;
4004 foreach(i; 0..8)
4005 r.ptr[i] = cast(short)(sa.array[i] << count);
4006 return cast(__m128i)r;
4007 }
4008 }
4009 unittest
4010 {
4011 __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
4012 short8 B = cast(short8)( _mm_slli_epi16(A, 1) );
4013 short8 B2 = cast(short8)( _mm_slli_epi16(A, 1 + 256) );
4014 short[8] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14 ];
4015 assert(B.array == expectedB);
4016 assert(B2.array == expectedB);
4017
4018 short8 C = cast(short8)( _mm_slli_epi16(A, 16) );
4019 short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0 ];
4020 assert(C.array == expectedC);
4021 }
4022
4023
4024 /// Shift `a` left by `bytes` bytes while shifting in zeros.
4025 __m128i _mm_slli_si128(ubyte bytes)(__m128i op) pure @trusted
4026 {
4027 static if (bytes & 0xF0)
4028 {
4029 return _mm_setzero_si128();
4030 }
4031 else static if (DMD_with_DSIMD)
4032 {
4033 return cast(__m128i) __simd_ib(XMM.PSLLDQ, op, bytes);
4034 }
4035 else static if (GDC_with_SSE2)
4036 {
        pragma(inline, true); // else it doesn't seem to be inlined at all by GDC; TODO: do the same for _mm_srli_si128
4038 return cast(__m128i) __builtin_ia32_pslldqi128(cast(long2)op, cast(ubyte)(bytes * 8));
4039 }
4040 else static if (LDC_with_optimizations)
4041 {
4042 return cast(__m128i) shufflevectorLDC!(byte16,
4043 16 - bytes, 17 - bytes, 18 - bytes, 19 - bytes, 20 - bytes, 21 - bytes,
4044 22 - bytes, 23 - bytes, 24 - bytes, 25 - bytes, 26 - bytes, 27 - bytes,
4045 28 - bytes, 29 - bytes, 30 - bytes, 31 - bytes)
4046 (cast(byte16)_mm_setzero_si128(), cast(byte16)op);
4047 }
4048 else static if (DMD_with_32bit_asm)
4049 {
4050 asm pure nothrow @nogc @trusted // somehow doesn't work for x86_64
4051 {
4052 movdqu XMM0, op;
4053 pslldq XMM0, bytes;
4054 movdqu op, XMM0;
4055 }
4056 return op;
4057 }
4058 else
4059 {
4060 byte16 A = cast(byte16)op;
4061 byte16 R = void;
4062 for (int n = 15; n >= bytes; --n)
4063 R.ptr[n] = A.array[n-bytes];
4064 for (int n = bytes-1; n >= 0; --n)
4065 R.ptr[n] = 0;
4066 return cast(__m128i)R;
4067 }
4068 }
4069 unittest
4070 {
4071 __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4072 short8 R = cast(short8) _mm_slli_si128!8(A); // shift 8 bytes to the left
4073 short[8] correct = [ 0, 0, 0, 0, 0, 1, 2, 3 ];
4074 assert(R.array == correct);
4075
4076 __m128i B = _mm_slli_si128!16(_mm_set1_epi32(-1));
4077 int[4] expectedB = [0, 0, 0, 0];
4078 assert(B.array == expectedB);
4079 }
4080
4081 /// Compute the square root of packed double-precision (64-bit) floating-point elements in `vec`.
4082 __m128d _mm_sqrt_pd(__m128d vec) pure @trusted
4083 {
4084 version(LDC)
4085 {
4086 // Disappeared with LDC 1.11
4087 static if (__VERSION__ < 2081)
4088 return __builtin_ia32_sqrtpd(vec);
4089 else
4090 {
4091 // PERF: use llvm_sqrt on the vector
4092 vec.array[0] = llvm_sqrt(vec.array[0]);
4093 vec.array[1] = llvm_sqrt(vec.array[1]);
4094 return vec;
4095 }
4096 }
4097 else static if (GDC_with_SSE2)
4098 {
4099 return __builtin_ia32_sqrtpd(vec);
4100 }
4101 else
4102 {
4103 vec.ptr[0] = sqrt(vec.array[0]);
4104 vec.ptr[1] = sqrt(vec.array[1]);
4105 return vec;
4106 }
4107 }
4108
4109 /// Compute the square root of the lower double-precision (64-bit) floating-point element in `b`, store the result in
4110 /// the lower element of result, and copy the upper element from `a` to the upper element of result.
4111 __m128d _mm_sqrt_sd(__m128d a, __m128d b) pure @trusted
4112 {
4113 // Note: the builtin has one argument, since the legacy `sqrtsd` SSE2 instruction operates on the same register only.
4114 // "128-bit Legacy SSE version: The first source operand and the destination operand are the same.
4115 // The quadword at bits 127:64 of the destination operand remains unchanged."
4116 version(LDC)
4117 {
4118 // Disappeared with LDC 1.11
4119 static if (__VERSION__ < 2081)
4120 {
4121 __m128d c = __builtin_ia32_sqrtsd(b);
4122 a[0] = c[0];
4123 return a;
4124 }
4125 else
4126 {
4127 a.array[0] = llvm_sqrt(b.array[0]);
4128 return a;
4129 }
4130 }
4131 else static if (GDC_with_SSE2)
4132 {
4133 __m128d c = __builtin_ia32_sqrtsd(b);
4134 a.ptr[0] = c.array[0];
4135 return a;
4136 }
4137 else
4138 {
4139 a.ptr[0] = sqrt(b.array[0]);
4140 return a;
4141 }
4142 }
4143 unittest
4144 {
4145 __m128d A = _mm_setr_pd(1.0, 3.0);
4146 __m128d B = _mm_setr_pd(4.0, 5.0);
4147 __m128d R = _mm_sqrt_sd(A, B);
4148 double[2] correct = [2.0, 3.0 ];
4149 assert(R.array == correct);
4150 }
4151
4152 /// Shift packed 16-bit integers in `a` right by `count` while shifting in sign bits.
4153 deprecated("Use _mm_srai_epi16 instead.") __m128i _mm_sra_epi16 (__m128i a, __m128i count) pure @trusted
4154 {
4155 static if (GDC_with_SSE2)
4156 {
4157 return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count);
4158 }
4159 else static if (LDC_with_SSE2)
4160 {
4161 return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count);
4162 }
4163 else
4164 {
4165 short8 sa = cast(short8)a;
4166 long2 lc = cast(long2)count;
4167 int bits = cast(int)(lc.array[0]);
4168 short8 r = void;
4169 foreach(i; 0..8)
4170 r.ptr[i] = cast(short)(sa.array[i] >> bits);
4171 return cast(int4)r;
4172 }
4173 }
4174
4175 /// Shift packed 32-bit integers in `a` right by `count` while shifting in sign bits.
4176 deprecated("Use _mm_srai_epi32 instead.") __m128i _mm_sra_epi32 (__m128i a, __m128i count) pure @trusted
4177 {
4178 static if (LDC_with_SSE2)
4179 {
4180 return __builtin_ia32_psrad128(a, count);
4181 }
4182 else static if (GDC_with_SSE2)
4183 {
4184 return __builtin_ia32_psrad128(a, count);
4185 }
4186 else
4187 {
4188 int4 r = void;
4189 long2 lc = cast(long2)count;
4190 int bits = cast(int)(lc.array[0]);
4191 r.ptr[0] = (a.array[0] >> bits);
4192 r.ptr[1] = (a.array[1] >> bits);
4193 r.ptr[2] = (a.array[2] >> bits);
4194 r.ptr[3] = (a.array[3] >> bits);
4195 return r;
4196 }
4197 }
4198
4199
4200 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits.
4201 __m128i _mm_srai_epi16 (__m128i a, int imm8) pure @trusted
4202 {
4203 static if (GDC_with_SSE2)
4204 {
4205 return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8);
4206 }
4207 else static if (LDC_with_SSE2)
4208 {
4209 return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8);
4210 }
4211 else static if (LDC_with_ARM64)
4212 {
4213 short8 sa = cast(short8)a;
4214 ubyte count = cast(ubyte)imm8;
4215 if (count > 15)
4216 count = 15;
4217 short8 r = sa >> short8(count);
4218 return cast(__m128i)r;
4219 }
4220 else
4221 {
4222 short8 sa = cast(short8)a;
4223 short8 r = void;
4224
        // Note: the intrinsic guarantees that only imm8[0..7] is taken into account,
        // but in D shifting by the bit width or more is undefined behaviour
        // ("It's illegal to shift by the same or more bits than the size of the
        // quantity being shifted"), so out-of-range counts are handled explicitly.
4229 ubyte count = cast(ubyte)imm8;
4230 if (count > 15)
4231 count = 15;
4232 foreach(i; 0..8)
4233 r.ptr[i] = cast(short)(sa.array[i] >> count);
4234 return cast(int4)r;
4235 }
4236 }
4237 unittest
4238 {
4239 __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
4240 short8 B = cast(short8)( _mm_srai_epi16(A, 1) );
4241 short8 B2 = cast(short8)( _mm_srai_epi16(A, 1 + 256) );
4242 short[8] expectedB = [ 0, 0, 1, 1, -2, -3, 3, 3 ];
4243 assert(B.array == expectedB);
4244 assert(B2.array == expectedB);
4245
4246 short8 C = cast(short8)( _mm_srai_epi16(A, 18) );
4247 short[8] expectedC = [ 0, 0, 0, 0, -1, -1, 0, 0 ];
4248 assert(C.array == expectedC);
4249 }
4250
4251 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits.
4252 __m128i _mm_srai_epi32 (__m128i a, int imm8) pure @trusted
4253 {
4254 static if (LDC_with_SSE2)
4255 {
4256 return __builtin_ia32_psradi128(a, cast(ubyte)imm8);
4257 }
4258 else static if (GDC_with_SSE2)
4259 {
4260 return __builtin_ia32_psradi128(a, cast(ubyte)imm8);
4261 }
4262 else
4263 {
4264 int4 r = void;
4265
        // Note: the intrinsic guarantees that only imm8[0..7] is taken into account,
        // but in D shifting by the bit width or more is undefined behaviour
        // ("It's illegal to shift by the same or more bits than the size of the
        // quantity being shifted"), so out-of-range counts are handled explicitly.
4270 // See Issue: #56
4271 ubyte count = cast(ubyte) imm8;
4272 if (count > 31)
4273 count = 31;
4274
4275 r.ptr[0] = (a.array[0] >> count);
4276 r.ptr[1] = (a.array[1] >> count);
4277 r.ptr[2] = (a.array[2] >> count);
4278 r.ptr[3] = (a.array[3] >> count);
4279 return r;
4280 }
4281 }
4282 unittest
4283 {
4284 __m128i A = _mm_setr_epi32(0, 2, 3, -4);
4285 __m128i B = _mm_srai_epi32(A, 1);
4286 __m128i B2 = _mm_srai_epi32(A, 1 + 256);
4287 int[4] expectedB = [ 0, 1, 1, -2];
4288 assert(B.array == expectedB);
4289 assert(B2.array == expectedB);
4290
4291 __m128i C = _mm_srai_epi32(A, 32);
4292 int[4] expectedC = [ 0, 0, 0, -1];
4293 assert(C.array == expectedC);
4294
4295 __m128i D = _mm_srai_epi32(A, 0);
4296 int[4] expectedD = [ 0, 2, 3, -4];
4297 assert(D.array == expectedD);
4298 }
4299
4300 deprecated("Use _mm_srli_epi16 instead.") __m128i _mm_srl_epi16 (__m128i a, __m128i count) pure @trusted
4301 {
4302 static if (LDC_with_SSE2)
4303 {
4304 return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count);
4305 }
4306 else static if (GDC_with_SSE2)
4307 {
4308 return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count);
4309 }
4310 else
4311 {
4312 short8 sa = cast(short8)a;
4313 long2 lc = cast(long2)count;
4314 int bits = cast(int)(lc.array[0]);
4315 short8 r = void;
4316 foreach(i; 0..8)
4317 r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) >> bits);
4318 return cast(int4)r;
4319 }
4320 }
4321
4322 deprecated("Use _mm_srli_epi32 instead.") __m128i _mm_srl_epi32 (__m128i a, __m128i count) pure @trusted
4323 {
4324 static if (LDC_with_SSE2)
4325 {
4326 return __builtin_ia32_psrld128(a, count);
4327 }
4328 else static if (GDC_with_SSE2)
4329 {
4330 return __builtin_ia32_psrld128(a, count);
4331 }
4332 else
4333 {
4334 int4 r = void;
4335 long2 lc = cast(long2)count;
4336 int bits = cast(int)(lc.array[0]);
4337 r.ptr[0] = cast(uint)(a.array[0]) >> bits;
4338 r.ptr[1] = cast(uint)(a.array[1]) >> bits;
4339 r.ptr[2] = cast(uint)(a.array[2]) >> bits;
4340 r.ptr[3] = cast(uint)(a.array[3]) >> bits;
4341 return r;
4342 }
4343 }
4344
4345 deprecated("Use _mm_srli_epi64 instead.") __m128i _mm_srl_epi64 (__m128i a, __m128i count) pure @trusted
4346 {
4347 static if (LDC_with_SSE2)
4348 {
4349 return cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count);
4350 }
4351 else static if (GDC_with_SSE2)
4352 {
4353 return cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count);
4354 }
4355 else
4356 {
4357 // Workaround for https://issues.dlang.org/show_bug.cgi?id=23047
4358 // => avoid void initialization.
4359 long2 r;
4360 long2 sa = cast(long2)a;
4361 long2 lc = cast(long2)count;
4362 int bits = cast(int)(lc.array[0]);
4363 r.ptr[0] = cast(ulong)(sa.array[0]) >> bits;
4364 r.ptr[1] = cast(ulong)(sa.array[1]) >> bits;
4365 return cast(__m128i)r;
4366 }
4367 }
4368
4369 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros.
4370 __m128i _mm_srli_epi16 (__m128i a, int imm8) pure @trusted
4371 {
4372 static if (GDC_with_SSE2)
4373 {
4374 return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8);
4375 }
4376 else static if (LDC_with_SSE2)
4377 {
4378 return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8);
4379 }
4380 else static if (LDC_with_ARM64)
4381 {
4382 short8 sa = cast(short8)a;
4383 short8 r = cast(short8) _mm_setzero_si128();
4384
4385 ubyte count = cast(ubyte)imm8;
4386 if (count >= 16)
4387 return cast(__m128i)r;
4388
        r = sa >>> short8(count); // Vector >>> is available with LDC, but not with DMD.
4390 return cast(__m128i)r;
4391 }
4392 else
4393 {
4394 short8 sa = cast(short8)a;
4395 ubyte count = cast(ubyte)imm8;
4396
4397 short8 r = cast(short8) _mm_setzero_si128();
4398 if (count >= 16)
4399 return cast(__m128i)r;
4400
4401 foreach(i; 0..8)
4402 r.array[i] = cast(short)(cast(ushort)(sa.array[i]) >> count);
4403 return cast(__m128i)r;
4404 }
4405 }
4406 unittest
4407 {
4408 __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
4409 short8 B = cast(short8)( _mm_srli_epi16(A, 1) );
4410 short8 B2 = cast(short8)( _mm_srli_epi16(A, 1 + 256) );
4411 short[8] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ];
4412 assert(B.array == expectedB);
4413 assert(B2.array == expectedB);
4414
4415 short8 C = cast(short8)( _mm_srli_epi16(A, 16) );
4416 short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0];
4417 assert(C.array == expectedC);
4418
4419 short8 D = cast(short8)( _mm_srli_epi16(A, 0) );
4420 short[8] expectedD = [ 0, 1, 2, 3, -4, -5, 6, 7 ];
4421 assert(D.array == expectedD);
4422 }
4423
4424
4425 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros.
4426 __m128i _mm_srli_epi32 (__m128i a, int imm8) pure @trusted
4427 {
4428 static if (GDC_with_SSE2)
4429 {
4430 return __builtin_ia32_psrldi128(a, cast(ubyte)imm8);
4431 }
4432 else static if (LDC_with_SSE2)
4433 {
4434 return __builtin_ia32_psrldi128(a, cast(ubyte)imm8);
4435 }
4436 else
4437 {
4438 ubyte count = cast(ubyte) imm8;
4439
        // Note: the intrinsic guarantees that only imm8[0..7] is taken into account,
        // but in D shifting by the bit width or more is undefined behaviour
        // ("It's illegal to shift by the same or more bits than the size of the
        // quantity being shifted"), so out-of-range counts are handled explicitly.
4444 int4 r = _mm_setzero_si128();
4445 if (count >= 32)
4446 return r;
4447 r.ptr[0] = a.array[0] >>> count;
4448 r.ptr[1] = a.array[1] >>> count;
4449 r.ptr[2] = a.array[2] >>> count;
4450 r.ptr[3] = a.array[3] >>> count;
4451 return r;
4452 }
4453 }
4454 unittest
4455 {
4456 __m128i A = _mm_setr_epi32(0, 2, 3, -4);
4457 __m128i B = _mm_srli_epi32(A, 1);
4458 __m128i B2 = _mm_srli_epi32(A, 1 + 256);
4459 int[4] expectedB = [ 0, 1, 1, 0x7FFFFFFE];
4460 assert(B.array == expectedB);
4461 assert(B2.array == expectedB);
4462
4463 __m128i C = _mm_srli_epi32(A, 255);
4464 int[4] expectedC = [ 0, 0, 0, 0 ];
4465 assert(C.array == expectedC);
4466 }
4467
4468 /// Shift packed 64-bit integers in `a` right by `imm8` while shifting in zeros.
4469 __m128i _mm_srli_epi64 (__m128i a, int imm8) pure @trusted
4470 {
4471 // PERF DMD
4472 static if (GDC_with_SSE2)
4473 {
4474 return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8);
4475 }
4476 else static if (LDC_with_SSE2)
4477 {
4478 return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8);
4479 }
4480 else
4481 {
4482 long2 r = cast(long2) _mm_setzero_si128();
4483 long2 sa = cast(long2)a;
4484
4485 ubyte count = cast(ubyte) imm8;
4486 if (count >= 64)
4487 return cast(__m128i)r;
4488
4489 r.ptr[0] = sa.array[0] >>> count;
4490 r.ptr[1] = sa.array[1] >>> count;
4491 return cast(__m128i)r;
4492 }
4493 }
4494 unittest
4495 {
4496 __m128i A = _mm_setr_epi64(8, -4);
4497 long2 B = cast(long2) _mm_srli_epi64(A, 1);
4498 long2 B2 = cast(long2) _mm_srli_epi64(A, 1 + 512);
4499 long[2] expectedB = [ 4, 0x7FFFFFFFFFFFFFFE];
4500 assert(B.array == expectedB);
4501 assert(B2.array == expectedB);
4502
4503 long2 C = cast(long2) _mm_srli_epi64(A, 64);
4504 long[2] expectedC = [ 0, 0 ];
4505 assert(C.array == expectedC);
4506 }
4507
4508 /// Shift `v` right by `bytes` bytes while shifting in zeros.
4509 __m128i _mm_srli_si128(ubyte bytes)(__m128i v) pure @trusted
4510 {
4511 static if (bytes & 0xF0)
4512 {
4513 return _mm_setzero_si128();
4514 }
4515 else static if (DMD_with_DSIMD)
4516 {
4517 return cast(__m128i) __simd_ib(XMM.PSRLDQ, v, bytes);
4518 }
4519 else static if (GDC_with_SSE2)
4520 {
4521 return cast(__m128i) __builtin_ia32_psrldqi128(cast(long2)v, cast(ubyte)(bytes * 8));
4522 }
4523 else static if (DMD_with_32bit_asm)
4524 {
4525 asm pure nothrow @nogc @trusted
4526 {
4527 movdqu XMM0, v;
4528 psrldq XMM0, bytes;
4529 movdqu v, XMM0;
4530 }
4531 return v;
4532 }
4533 else static if (LDC_with_optimizations)
4534 {
4535 return cast(__m128i) shufflevectorLDC!(byte16,
4536 bytes+0, bytes+1, bytes+2, bytes+3, bytes+4, bytes+5, bytes+6, bytes+7,
4537 bytes+8, bytes+9, bytes+10, bytes+11, bytes+12, bytes+13, bytes+14, bytes+15)
4538 (cast(byte16) v, cast(byte16)_mm_setzero_si128());
4539 }
4540 else
4541 {
4542 byte16 A = cast(byte16)v;
4543 byte16 R = void;
4544 for (int n = 0; n < bytes; ++n)
4545 R.ptr[15-n] = 0;
4546 for (int n = bytes; n < 16; ++n)
4547 R.ptr[15-n] = A.array[15 - n + bytes];
4548 return cast(__m128i)R;
4549 }
4550 }
4551 unittest
4552 {
4553 __m128i R = _mm_srli_si128!4(_mm_set_epi32(4, 3, -2, 1));
4554 int[4] correct = [-2, 3, 4, 0];
4555 assert(R.array == correct);
4556
4557 __m128i A = _mm_srli_si128!16(_mm_set1_epi32(-1));
4558 int[4] expectedA = [0, 0, 0, 0];
4559 assert(A.array == expectedA);
4560 }
4561
4562 /// Shift `v` right by `bytes` bytes while shifting in zeros.
4563 /// #BONUS
4564 __m128 _mm_srli_ps(ubyte bytes)(__m128 v) pure @safe
4565 {
4566 return cast(__m128)_mm_srli_si128!bytes(cast(__m128i)v);
4567 }
4568 unittest
4569 {
4570 __m128 R = _mm_srli_ps!8(_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f));
4571 float[4] correct = [3.0f, 4.0f, 0, 0];
4572 assert(R.array == correct);
4573 }
4574
4575 /// Shift `v` right by `bytes` bytes while shifting in zeros.
4576 /// #BONUS
4577 __m128d _mm_srli_pd(ubyte bytes)(__m128d v) pure @safe
4578 {
4579 return cast(__m128d) _mm_srli_si128!bytes(cast(__m128i)v);
4580 }
4581
4582 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from `a` into memory.
4583 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
4584 void _mm_store_pd (double* mem_addr, __m128d a) pure @trusted
4585 {
4586 pragma(inline, true);
4587 __m128d* aligned = cast(__m128d*)mem_addr;
4588 *aligned = a;
4589 }
4590 unittest
4591 {
4592 align(16) double[2] A;
4593 __m128d B = _mm_setr_pd(-8.0, 9.0);
4594 _mm_store_pd(A.ptr, B);
4595 assert(A == [-8.0, 9.0]);
4596 }
4597
4598 /// Store the lower double-precision (64-bit) floating-point element from `a` into 2 contiguous elements in memory.
4599 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
4600 void _mm_store_pd1 (double* mem_addr, __m128d a) pure @trusted
4601 {
4602 __m128d* aligned = cast(__m128d*)mem_addr;
4603 __m128d r; // PERF =void;
4604 r.ptr[0] = a.array[0];
4605 r.ptr[1] = a.array[0];
4606 *aligned = r;
4607 }
4608
4609 /// Store the lower double-precision (64-bit) floating-point element from `a` into memory. `mem_addr` does not need to
4610 /// be aligned on any particular boundary.
4611 void _mm_store_sd (double* mem_addr, __m128d a) pure @safe
4612 {
4613 pragma(inline, true);
4614 *mem_addr = a.array[0];
4615 }
4616
4617 /// Store 128-bits of integer data from `a` into memory. `mem_addr` must be aligned on a 16-byte boundary or a
4618 /// general-protection exception may be generated.
4619 void _mm_store_si128 (__m128i* mem_addr, __m128i a) pure @safe
4620 {
4621 pragma(inline, true);
4622 *mem_addr = a;
4623 }
4624
4625 alias _mm_store1_pd = _mm_store_pd1; ///
4626
4627 /// Store the upper double-precision (64-bit) floating-point element from `a` into memory.
4628 void _mm_storeh_pd (double* mem_addr, __m128d a) pure @safe
4629 {
4630 pragma(inline, true);
4631 *mem_addr = a.array[1];
4632 }
4633
/// Store 64-bit integer from the first element of `a` into memory.
// Note: `mem_addr` doesn't have to actually be aligned, which breaks
// expectations from the user point of view. This problem also exists in C++.
4636 void _mm_storel_epi64 (__m128i* mem_addr, __m128i a) pure @safe
4637 {
4638 pragma(inline, true);
4639 long* dest = cast(long*)mem_addr;
4640 long2 la = cast(long2)a;
4641 *dest = la.array[0];
4642 }
4643 unittest
4644 {
4645 long[3] A = [1, 2, 3];
4646 _mm_storel_epi64(cast(__m128i*)(&A[1]), _mm_set_epi64x(0x1_0000_0000, 0x1_0000_0000));
4647 long[3] correct = [1, 0x1_0000_0000, 3];
4648 assert(A == correct);
4649 }
4650
4651 /// Store the lower double-precision (64-bit) floating-point element from `a` into memory.
4652 void _mm_storel_pd (double* mem_addr, __m128d a) pure @safe
4653 {
4654 pragma(inline, true);
4655 *mem_addr = a.array[0];
4656 }
4657
4658 /// Store 2 double-precision (64-bit) floating-point elements from `a` into memory in reverse
4659 /// order. `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception
4660 /// may be generated.
4661 void _mm_storer_pd (double* mem_addr, __m128d a) pure @system
4662 {
4663 __m128d reversed = void;
4664 reversed.ptr[0] = a.array[1];
4665 reversed.ptr[1] = a.array[0];
4666 *cast(__m128d*)mem_addr = reversed;
4667 }
4668 unittest
4669 {
4670 align(16) double[2] A = [0.0, 1.0];
4671 _mm_storer_pd(A.ptr, _mm_setr_pd(2.0, 3.0));
4672 assert(A[0] == 3.0 && A[1] == 2.0);
4673 }
4674
4675 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from
4676 /// `a` into memory. `mem_addr` does not need to be aligned on any particular boundary.
void _mm_storeu_pd (double* mem_addr, __m128d a) pure @trusted // TODO: signature is wrong, should be @system
4678 {
4679 // PERF DMD
4680 pragma(inline, true);
4681 static if (GDC_with_SSE2)
4682 {
4683 __builtin_ia32_storeupd(mem_addr, a);
4684 }
4685 else static if (LDC_with_optimizations)
4686 {
4687 storeUnaligned!double2(a, mem_addr);
4688 }
4689 else
4690 {
4691 mem_addr[0] = a.array[0];
4692 mem_addr[1] = a.array[1];
4693 }
4694 }
4695 unittest
4696 {
4697 __m128d A = _mm_setr_pd(3.0, 4.0);
4698 align(16) double[4] R = [0.0, 0, 0, 0];
4699 double[2] correct = [3.0, 4.0];
4700 _mm_storeu_pd(&R[1], A);
4701 assert(R[1..3] == correct);
4702 }
4703
4704 /// Store 128-bits of integer data from `a` into memory. `mem_addr` does not need to be aligned on any particular
4705 /// boundary.
4706 void _mm_storeu_si128 (__m128i* mem_addr, __m128i a) pure @trusted // TODO: signature is wrong, mem_addr is not aligned. Make it @system
4707 {
4708 // PERF: DMD
4709 pragma(inline, true);
4710 static if (GDC_with_SSE2)
4711 {
4712 __builtin_ia32_storedqu(cast(char*)mem_addr, cast(ubyte16)a);
4713 }
4714 else static if (LDC_with_optimizations)
4715 {
4716 storeUnaligned!__m128i(a, cast(int*)mem_addr);
4717 }
4718 else
4719 {
4720 int* p = cast(int*)mem_addr;
4721 p[0] = a.array[0];
4722 p[1] = a.array[1];
4723 p[2] = a.array[2];
4724 p[3] = a.array[3];
4725 }
4726 }
4727 unittest
4728 {
4729 __m128i A = _mm_setr_epi32(1, 2, 3, 4);
4730 align(16) int[6] R = [0, 0, 0, 0, 0, 0];
4731 int[4] correct = [1, 2, 3, 4];
4732 _mm_storeu_si128(cast(__m128i*)(&R[1]), A);
4733 assert(R[1..5] == correct);
4734 }
4735
4736 /// Store 16-bit integer from the first element of `a` into memory.
4737 /// `mem_addr` does not need to be aligned on any particular boundary.
4738 void _mm_storeu_si16 (void* mem_addr, __m128i a) pure @system
4739 {
4740 short* dest = cast(short*)mem_addr;
4741 *dest = (cast(short8)a).array[0];
4742 }
4743 unittest
4744 {
4745 short[2] arr = [-24, 12];
4746 _mm_storeu_si16(&arr[1], _mm_set1_epi16(26));
4747 short[2] correct = [-24, 26];
4748 assert(arr == correct);
4749 }
4750
4751 /// Store 32-bit integer from the first element of `a` into memory.
4752 /// `mem_addr` does not need to be aligned on any particular boundary.
void _mm_storeu_si32 (void* mem_addr, __m128i a) pure @trusted // TODO: should really be @system
4754 {
4755 pragma(inline, true);
4756 int* dest = cast(int*)mem_addr;
4757 *dest = a.array[0];
4758 }
4759 unittest
4760 {
4761 int[2] arr = [-24, 12];
4762 _mm_storeu_si32(&arr[1], _mm_setr_epi32(-1, -2, -6, -7));
4763 assert(arr == [-24, -1]);
4764 }
4765
4766 /// Store 64-bit integer from the first element of `a` into memory.
4767 /// `mem_addr` does not need to be aligned on any particular boundary.
4768 void _mm_storeu_si64 (void* mem_addr, __m128i a) pure @system
4769 {
4770 pragma(inline, true);
4771 long* dest = cast(long*)mem_addr;
4772 long2 la = cast(long2)a;
4773 *dest = la.array[0];
4774 }
4775 unittest
4776 {
4777 long[3] A = [1, 2, 3];
4778 _mm_storeu_si64(&A[1], _mm_set_epi64x(0x1_0000_0000, 0x1_0000_0000));
4779 long[3] correct = [1, 0x1_0000_0000, 3];
4780 assert(A == correct);
4781 }
4782
4783 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements)
4784 /// from `a` into memory using a non-temporal memory hint. `mem_addr` must be aligned on a 16-byte
4785 /// boundary or a general-protection exception may be generated.
4786 /// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
4787 void _mm_stream_pd (double* mem_addr, __m128d a) pure @system
4788 {
4789 // PERF DMD D_SIMD
4790 static if (GDC_with_SSE2)
4791 {
4792 return __builtin_ia32_movntpd(mem_addr, a);
4793 }
4794 else static if (LDC_with_InlineIREx && LDC_with_optimizations)
4795 {
4796 enum prefix = `!0 = !{ i32 1 }`;
4797 enum ir = `
4798 store <2 x double> %1, <2 x double>* %0, align 16, !nontemporal !0
4799 ret void`;
4800 LDCInlineIREx!(prefix, ir, "", void, double2*, double2)(cast(double2*)mem_addr, a);
4801 }
4802 else
4803 {
4804 // Regular store instead.
4805 __m128d* dest = cast(__m128d*)mem_addr;
4806 *dest = a;
4807 }
4808 }
4809 unittest
4810 {
4811 align(16) double[2] A;
4812 __m128d B = _mm_setr_pd(-8.0, 9.0);
4813 _mm_stream_pd(A.ptr, B);
4814 assert(A == [-8.0, 9.0]);
4815 }
4816
4817 /// Store 128-bits of integer data from a into memory using a non-temporal memory hint.
4818 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception
4819 /// may be generated.
4820 /// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
4821 void _mm_stream_si128 (__m128i* mem_addr, __m128i a) pure @trusted
4822 {
4823 // PERF DMD D_SIMD
4824 static if (GDC_with_SSE2)
4825 {
4826 return __builtin_ia32_movntdq (cast(long2*)mem_addr, cast(long2)a);
4827 }
4828 else static if (LDC_with_InlineIREx && LDC_with_optimizations)
4829 {
4830 enum prefix = `!0 = !{ i32 1 }`;
4831 enum ir = `
4832 store <4 x i32> %1, <4 x i32>* %0, align 16, !nontemporal !0
4833 ret void`;
4834 LDCInlineIREx!(prefix, ir, "", void, int4*, int4)(cast(int4*)mem_addr, a);
4835 }
4836 else
4837 {
4838 // Regular store instead.
4839 __m128i* dest = cast(__m128i*)mem_addr;
4840 *dest = a;
4841 }
4842 }
4843 unittest
4844 {
4845 align(16) int[4] A;
4846 __m128i B = _mm_setr_epi32(-8, 9, 10, -11);
4847 _mm_stream_si128(cast(__m128i*)A.ptr, B);
4848 assert(A == [-8, 9, 10, -11]);
4849 }
4850
4851 /// Store 32-bit integer a into memory using a non-temporal hint to minimize cache
4852 /// pollution. If the cache line containing address `mem_addr` is already in the cache,
4853 /// the cache will be updated.
4854 /// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
4855 void _mm_stream_si32 (int* mem_addr, int a) pure @trusted
4856 {
4857 // PERF DMD D_SIMD
4858 static if (GDC_with_SSE2)
4859 {
4860 return __builtin_ia32_movnti(mem_addr, a);
4861 }
4862 else static if (LDC_with_InlineIREx && LDC_with_optimizations)
4863 {
4864 enum prefix = `!0 = !{ i32 1 }`;
4865 enum ir = `
4866 store i32 %1, i32* %0, !nontemporal !0
4867 ret void`;
4868 LDCInlineIREx!(prefix, ir, "", void, int*, int)(mem_addr, a);
4869 }
4870 else
4871 {
4872 // Regular store instead.
4873 *mem_addr = a;
4874 }
4875 }
4876 unittest
4877 {
4878 int A;
4879 _mm_stream_si32(&A, -34);
4880 assert(A == -34);
4881 }
4882
4883 /// Store 64-bit integer a into memory using a non-temporal hint to minimize
4884 /// cache pollution. If the cache line containing address `mem_addr` is already
4885 /// in the cache, the cache will be updated.
4886 /// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
4887 void _mm_stream_si64 (long* mem_addr, long a) pure @trusted
4888 {
4889 // PERF DMD D_SIMD
4890 static if (GDC_with_SSE2)
4891 {
4892 return __builtin_ia32_movnti64(mem_addr, a);
4893 }
4894 else static if (LDC_with_InlineIREx && LDC_with_optimizations)
4895 {
4896 enum prefix = `!0 = !{ i32 1 }`;
4897 enum ir = `
4898 store i64 %1, i64* %0, !nontemporal !0
4899 ret void`;
4900 LDCInlineIREx!(prefix, ir, "", void, long*, long)(mem_addr, a);
4901
4902 }
4903 else
4904 {
4905 // Regular store instead.
4906 *mem_addr = a;
4907 }
4908 }
4909 unittest
4910 {
4911 long A;
4912 _mm_stream_si64(&A, -46);
4913 assert(A == -46);
4914 }
4915
4916 /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`.
4917 __m128i _mm_sub_epi16(__m128i a, __m128i b) pure @safe
4918 {
4919 pragma(inline, true);
4920 return cast(__m128i)(cast(short8)a - cast(short8)b);
4921 }
4922 unittest
4923 {
4924 __m128i A = _mm_setr_epi16(16, 32767, 1, 2, 3, 4, 6, 6);
4925 __m128i B = _mm_setr_epi16(15, -32768, 6, 8, 1000, 1, 5, 6);
4926 short8 C = cast(short8) _mm_sub_epi16(A, B);
4927 short[8] correct = [ 1, -1,-5,-6, -997, 3, 1, 0];
4928 assert(C.array == correct);
4929 }
4930
4931 /// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
4932 __m128i _mm_sub_epi32(__m128i a, __m128i b) pure @safe
4933 {
4934 pragma(inline, true);
4935 return cast(__m128i)(cast(int4)a - cast(int4)b);
4936 }
4937 unittest
4938 {
4939 __m128i A = _mm_setr_epi32(16, int.max, 1, 8);
4940 __m128i B = _mm_setr_epi32(15, int.min, 6, 2);
4941 int4 C = cast(int4) _mm_sub_epi32(A, B);
4942 int[4] correct = [ 1, -1,-5, 6];
4943 assert(C.array == correct);
4944 }
4945
4946 /// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`.
4947 __m128i _mm_sub_epi64(__m128i a, __m128i b) pure @safe
4948 {
4949 pragma(inline, true);
4950 return cast(__m128i)(cast(long2)a - cast(long2)b);
4951 }
4952 unittest
4953 {
4954 __m128i A = _mm_setr_epi64( 16, long.max);
4955 __m128i B = _mm_setr_epi64( 199, long.min);
4956 long2 C = cast(long2) _mm_sub_epi64(A, B);
4957 long[2] correct = [-183, -1];
4958 assert(C.array == correct);
4959 }
4960
4961 /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`.
4962 __m128i _mm_sub_epi8(__m128i a, __m128i b) pure @safe
4963 {
4964 pragma(inline, true);
4965 return cast(__m128i)(cast(byte16)a - cast(byte16)b);
4966 }
4967 unittest
4968 {
4969 __m128i A = _mm_setr_epi8(16, 127, 1, 2, 3, 4, 6, 6, 16, 127, 1, 2, 3, 4, 6, 6);
4970 __m128i B = _mm_setr_epi8(15, -128, 6, 8, 3, 1, 5, 6, 16, 127, 1, 2, 3, 4, 6, 6);
4971 byte16 C = cast(byte16) _mm_sub_epi8(A, B);
4972 byte[16] correct = [ 1, -1,-5,-6, 0, 3, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0];
4973 assert(C.array == correct);
4974 }
4975
4976 /// Subtract packed double-precision (64-bit) floating-point elements in `b` from packed double-precision (64-bit)
4977 /// floating-point elements in `a`.
4978 __m128d _mm_sub_pd(__m128d a, __m128d b) pure @safe
4979 {
4980 pragma(inline, true);
4981 return a - b;
4982 }
4983 unittest
4984 {
4985 __m128d A = _mm_setr_pd(4000.0, -8.0);
4986 __m128d B = _mm_setr_pd(12.0, -8450.0);
4987 __m128d C = _mm_sub_pd(A, B);
4988 double[2] correct = [3988.0, 8442.0];
4989 assert(C.array == correct);
4990 }
4991
4992 /// Subtract the lower double-precision (64-bit) floating-point element in `b` from the lower double-precision (64-bit)
4993 /// floating-point element in `a`, store that in the lower element of result, and copy the upper element from `a` to the
4994 /// upper element of result.
4995 __m128d _mm_sub_sd(__m128d a, __m128d b) pure @trusted
4996 {
4997 version(DigitalMars)
4998 {
4999 // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
5000 // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
5001 asm pure nothrow @nogc @trusted { nop;}
5002 a[0] = a[0] - b[0];
5003 return a;
5004 }
5005 else static if (GDC_with_SSE2)
5006 {
5007 return __builtin_ia32_subsd(a, b);
5008 }
5009 else
5010 {
5011 a.ptr[0] -= b.array[0];
5012 return a;
5013 }
5014 }
5015 unittest
5016 {
5017 __m128d a = [1.5, -2.0];
5018 a = _mm_sub_sd(a, a);
5019 assert(a.array == [0.0, -2.0]);
5020 }
5021
5022 /// Subtract 64-bit integer `b` from 64-bit integer `a`.
5023 __m64 _mm_sub_si64 (__m64 a, __m64 b) pure @safe
5024 {
5025 pragma(inline, true);
5026 return a - b;
5027 }
5028 unittest
5029 {
5030 __m64 A, B;
5031 A = -1214;
5032 B = 489415;
5033 __m64 C = _mm_sub_si64(B, A);
5034 assert(C.array[0] == 489415 + 1214);
5035 }
5036
5037 /// Subtract packed signed 16-bit integers in `b` from packed 16-bit integers in `a` using
5038 /// saturation.
5039 __m128i _mm_subs_epi16(__m128i a, __m128i b) pure @trusted
5040 {
5041 // PERF DMD psubsw
5042 static if(LDC_with_saturated_intrinsics)
5043 {
5044 return cast(__m128i) inteli_llvm_subs!short8(cast(short8)a, cast(short8)b);
5045 }
5046 else static if (GDC_with_SSE2)
5047 {
5048 return cast(__m128i) __builtin_ia32_psubsw128(cast(short8) a, cast(short8) b);
5049 }
5050 else
5051 {
5052 short[8] res; // PERF =void;
5053 short8 sa = cast(short8)a;
5054 short8 sb = cast(short8)b;
5055 foreach(i; 0..8)
5056 res.ptr[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
5057 return _mm_loadu_si128(cast(int4*)res.ptr);
5058 }
5059 }
5060 unittest
5061 {
5062 short8 res = cast(short8) _mm_subs_epi16(_mm_setr_epi16(32760, -32760, 5, 4, 3, 2, 1, 0),
5063 _mm_setr_epi16(-10 , 16, 5, 4, 3, 2, 1, 0));
5064 static immutable short[8] correctResult = [32767, -32768, 0, 0, 0, 0, 0, 0];
5065 assert(res.array == correctResult);
5066 }
5067
5068 /// Subtract packed signed 8-bit integers in `b` from packed 8-bit integers in `a` using
5069 /// saturation.
5070 __m128i _mm_subs_epi8(__m128i a, __m128i b) pure @trusted
5071 {
5072 static if(LDC_with_saturated_intrinsics)
5073 {
5074 return cast(__m128i) inteli_llvm_subs!byte16(cast(byte16)a, cast(byte16)b);
5075 }
5076 else static if (GDC_with_SSE2)
5077 {
5078 return cast(__m128i) __builtin_ia32_psubsb128(cast(ubyte16) a, cast(ubyte16) b);
5079 }
5080 else
5081 {
5082 byte[16] res; // PERF =void;
5083 byte16 sa = cast(byte16)a;
5084 byte16 sb = cast(byte16)b;
5085 foreach(i; 0..16)
5086 res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
5087 return _mm_loadu_si128(cast(int4*)res.ptr);
5088 }
5089 }
5090 unittest
5091 {
5092 byte16 res = cast(byte16) _mm_subs_epi8(_mm_setr_epi8(-128, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
5093 _mm_setr_epi8( 15, -14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
5094 static immutable byte[16] correctResult = [-128, 127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
5095 assert(res.array == correctResult);
5096 }
5097
/// Subtract packed unsigned 16-bit integers in `b` from packed unsigned 16-bit integers in `a` using unsigned saturation.
5099 __m128i _mm_subs_epu16(__m128i a, __m128i b) pure @trusted
5100 {
5101 static if(LDC_with_saturated_intrinsics)
5102 {
5103 return cast(__m128i) inteli_llvm_subus!short8(cast(short8)a, cast(short8)b);
5104 }
5105 else static if (GDC_with_SSE2)
5106 {
5107 return cast(__m128i) __builtin_ia32_psubusw128(cast(short8)a, cast(short8)b);
5108 }
5109 else
5110 {
5111 short[8] res; // PERF =void;
5112 short8 sa = cast(short8)a;
5113 short8 sb = cast(short8)b;
5114 foreach(i; 0..8)
5115 {
5116 int sum = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]);
5117 res[i] = saturateSignedIntToUnsignedShort(sum);
5118 }
5119 return _mm_loadu_si128(cast(int4*)res.ptr);
5120 }
5121 }
5122 unittest
5123 {
5124 short8 R = cast(short8) _mm_subs_epu16(_mm_setr_epi16(cast(short)65534, 1, 5, 4, 3, 2, 1, 0),
5125 _mm_setr_epi16(cast(short)65535, 16, 4, 4, 3, 0, 1, 0));
5126 static immutable short[8] correct = [ 0, 0, 1, 0, 0, 2, 0, 0];
5127 assert(R.array == correct);
5128 }
5129
/// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit integers in `a` using unsigned saturation.
5131 __m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted
5132 {
5133 static if(LDC_with_saturated_intrinsics)
5134 {
5135 return cast(__m128i) inteli_llvm_subus!byte16(cast(byte16)a, cast(byte16)b);
5136 }
5137 else static if (GDC_with_SSE2)
5138 {
5139 return cast(__m128i) __builtin_ia32_psubusb128(cast(ubyte16) a, cast(ubyte16) b);
5140 }
5141 else
5142 {
5143 ubyte[16] res; // PERF =void;
5144 byte16 sa = cast(byte16)a;
5145 byte16 sb = cast(byte16)b;
5146 foreach(i; 0..16)
5147 res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
5148 return _mm_loadu_si128(cast(int4*)res.ptr);
5149 }
5150 }
5151 unittest
5152 {
5153 byte16 res = cast(byte16) _mm_subs_epu8(_mm_setr_epi8(cast(byte)254, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
5154 _mm_setr_epi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
5155 static immutable byte[16] correctResult = [ 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
5156 assert(res.array == correctResult);
5157 }
5158
// Note: the only difference between the ucomi and comi comparisons is their
// exception behaviour on quiet NaNs (comi signals invalid, ucomi does not).
// Aliasing them is therefore slightly incorrect, but code that relies on
// that distinction on purpose seems extremely rare.
5163 alias _mm_ucomieq_sd = _mm_comieq_sd; ///
5164 alias _mm_ucomige_sd = _mm_comige_sd; ///
5165 alias _mm_ucomigt_sd = _mm_comigt_sd; ///
5166 alias _mm_ucomile_sd = _mm_comile_sd; ///
5167 alias _mm_ucomilt_sd = _mm_comilt_sd; ///
5168 alias _mm_ucomineq_sd = _mm_comineq_sd; ///
5169
5170 /// Return vector of type `__m128d` with undefined elements.
5171 __m128d _mm_undefined_pd() pure @safe
5172 {
5173 pragma(inline, true);
5174 __m128d result = void;
5175 return result;
5176 }
5177
5178 /// Return vector of type `__m128i` with undefined elements.
5179 __m128i _mm_undefined_si128() pure @safe
5180 {
5181 pragma(inline, true);
5182 __m128i result = void;
5183 return result;
5184 }
5185
5186 /// Unpack and interleave 16-bit integers from the high half of `a` and `b`.
5187 __m128i _mm_unpackhi_epi16 (__m128i a, __m128i b) pure @trusted
5188 {
5189 static if (DMD_with_DSIMD)
5190 {
5191 return cast(__m128i) __simd(XMM.PUNPCKHWD, a, b);
5192 }
5193 else static if (GDC_with_SSE2)
5194 {
5195 return cast(__m128i) __builtin_ia32_punpckhwd128(cast(short8) a, cast(short8) b);
5196 }
5197 else static if (LDC_with_optimizations)
5198 {
5199 enum ir = `%r = shufflevector <8 x i16> %0, <8 x i16> %1, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
5200 ret <8 x i16> %r`;
5201 return cast(__m128i) LDCInlineIR!(ir, short8, short8, short8)(cast(short8)a, cast(short8)b);
5202 }
5203 else static if (DMD_with_32bit_asm || LDC_with_x86_asm)
5204 {
5205 asm pure nothrow @nogc @trusted
5206 {
5207 movdqu XMM0, a;
5208 movdqu XMM1, b;
5209 punpckhwd XMM0, XMM1;
5210 movdqu a, XMM0;
5211 }
5212 return a;
5213 }
5214 else
5215 {
5216 short8 r = void;
5217 short8 sa = cast(short8)a;
5218 short8 sb = cast(short8)b;
5219 r.ptr[0] = sa.array[4];
5220 r.ptr[1] = sb.array[4];
5221 r.ptr[2] = sa.array[5];
5222 r.ptr[3] = sb.array[5];
5223 r.ptr[4] = sa.array[6];
5224 r.ptr[5] = sb.array[6];
5225 r.ptr[6] = sa.array[7];
5226 r.ptr[7] = sb.array[7];
5227 return cast(__m128i)r;
5228 }
5229 }
5230 unittest
5231 {
5232 __m128i A = _mm_setr_epi16(4, 5, 6, 7, 8, 9, 10, 11);
5233 __m128i B = _mm_setr_epi16(12, 13, 14, 15, 16, 17, 18, 19);
5234 short8 C = cast(short8)(_mm_unpackhi_epi16(A, B));
5235 short[8] correct = [8, 16, 9, 17, 10, 18, 11, 19];
5236 assert(C.array == correct);
5237 }
5238
5239 /// Unpack and interleave 32-bit integers from the high half of `a` and `b`.
5240 __m128i _mm_unpackhi_epi32 (__m128i a, __m128i b) pure @trusted
5241 {
5242 static if (DMD_with_DSIMD)
5243 {
5244 return cast(__m128i) __simd(XMM.PUNPCKHDQ, a, b);
5245 }
5246 else static if (GDC_with_SSE2)
5247 {
5248 return __builtin_ia32_punpckhdq128(a, b);
5249 }
5250 else static if (LDC_with_optimizations)
5251 {
5252 enum ir = `%r = shufflevector <4 x i32> %0, <4 x i32> %1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
5253 ret <4 x i32> %r`;
5254 return LDCInlineIR!(ir, int4, int4, int4)(cast(int4)a, cast(int4)b);
5255 }
5256 else
5257 {
5258 __m128i r = void;
5259 r.ptr[0] = a.array[2];
5260 r.ptr[1] = b.array[2];
5261 r.ptr[2] = a.array[3];
5262 r.ptr[3] = b.array[3];
5263 return r;
5264 }
5265 }
5266 unittest
5267 {
5268 __m128i A = _mm_setr_epi32(1, 2, 3, 4);
5269 __m128i B = _mm_setr_epi32(5, 6, 7, 8);
5270 __m128i C = _mm_unpackhi_epi32(A, B);
5271 int[4] correct = [3, 7, 4, 8];
5272 assert(C.array == correct);
5273 }
5274
5275 /// Unpack and interleave 64-bit integers from the high half of `a` and `b`.
5276 __m128i _mm_unpackhi_epi64 (__m128i a, __m128i b) pure @trusted
5277 {
5278 static if (GDC_with_SSE2)
5279 {
5280 return cast(__m128i) __builtin_ia32_punpckhqdq128(cast(long2) a, cast(long2) b);
5281 }
5282 else
5283 {
5284 __m128i r = cast(__m128i)b;
5285 r[0] = a[2];
5286 r[1] = a[3];
5287 return r;
5288 }
5289 }
5290 unittest // Issue #36
5291 {
5292 __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333);
5293 __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555);
5294 long2 C = cast(long2)(_mm_unpackhi_epi64(A, B));
5295 long[2] correct = [0x33333333_33333333, 0x55555555_55555555];
5296 assert(C.array == correct);
5297 }
5298
5299 /// Unpack and interleave 8-bit integers from the high half of `a` and `b`.
5300 __m128i _mm_unpackhi_epi8 (__m128i a, __m128i b) pure @trusted
5301 {
5302 static if (DMD_with_DSIMD)
5303 {
5304 return cast(__m128i) __simd(XMM.PUNPCKHBW, a, b);
5305 }
5306 else static if (GDC_with_SSE2)
5307 {
5308 return cast(__m128i) __builtin_ia32_punpckhbw128(cast(ubyte16)a, cast(ubyte16)b);
5309 }
5310 else static if (LDC_with_optimizations)
5311 {
5312 enum ir = `%r = shufflevector <16 x i8> %0, <16 x i8> %1, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
5313 ret <16 x i8> %r`;
5314 return cast(__m128i)LDCInlineIR!(ir, byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
5315 }
5316 else static if (DMD_with_32bit_asm || LDC_with_x86_asm)
5317 {
5318 asm pure nothrow @nogc @trusted
5319 {
5320 movdqu XMM0, a;
5321 movdqu XMM1, b;
5322 punpckhbw XMM0, XMM1;
5323 movdqu a, XMM0;
5324 }
5325 return a;
5326 }
5327 else
5328 {
5329 byte16 r = void;
5330 byte16 ba = cast(byte16)a;
5331 byte16 bb = cast(byte16)b;
5332 r.ptr[0] = ba.array[8];
5333 r.ptr[1] = bb.array[8];
5334 r.ptr[2] = ba.array[9];
5335 r.ptr[3] = bb.array[9];
5336 r.ptr[4] = ba.array[10];
5337 r.ptr[5] = bb.array[10];
5338 r.ptr[6] = ba.array[11];
5339 r.ptr[7] = bb.array[11];
5340 r.ptr[8] = ba.array[12];
5341 r.ptr[9] = bb.array[12];
5342 r.ptr[10] = ba.array[13];
5343 r.ptr[11] = bb.array[13];
5344 r.ptr[12] = ba.array[14];
5345 r.ptr[13] = bb.array[14];
5346 r.ptr[14] = ba.array[15];
5347 r.ptr[15] = bb.array[15];
5348 return cast(__m128i)r;
5349 }
5350 }
5351 unittest
5352 {
5353 __m128i A = _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
5354 __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
5355 byte16 C = cast(byte16) _mm_unpackhi_epi8(A, B);
5356 byte[16] correct = [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31];
5357 assert(C.array == correct);
5358 }
5359
5360 /// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of `a` and `b`.
5361 __m128d _mm_unpackhi_pd (__m128d a, __m128d b) pure @trusted
5362 {
5363 // PERF DMD D_SIMD
5364 static if (GDC_with_SSE2)
5365 {
5366 return __builtin_ia32_unpckhpd(a, b);
5367 }
5368 else static if (LDC_with_optimizations)
5369 {
5370 enum ir = `%r = shufflevector <2 x double> %0, <2 x double> %1, <2 x i32> <i32 1, i32 3>
5371 ret <2 x double> %r`;
5372 return LDCInlineIR!(ir, double2, double2, double2)(a, b);
5373 }
5374 else
5375 {
5376 double2 r = void;
5377 r.ptr[0] = a.array[1];
5378 r.ptr[1] = b.array[1];
5379 return r;
5380 }
5381 }
5382 unittest
5383 {
5384 __m128d A = _mm_setr_pd(4.0, 6.0);
5385 __m128d B = _mm_setr_pd(7.0, 9.0);
5386 __m128d C = _mm_unpackhi_pd(A, B);
5387 double[2] correct = [6.0, 9.0];
5388 assert(C.array == correct);
5389 }
5390
5391 /// Unpack and interleave 16-bit integers from the low half of `a` and `b`.
5392 __m128i _mm_unpacklo_epi16 (__m128i a, __m128i b) pure @trusted
5393 {
5394 static if (DMD_with_DSIMD)
5395 {
5396 return cast(__m128i) __simd(XMM.PUNPCKLWD, a, b);
5397 }
5398 else static if (GDC_with_SSE2)
5399 {
5400 return cast(__m128i) __builtin_ia32_punpcklwd128(cast(short8) a, cast(short8) b);
5401 }
5402 else static if (LDC_with_optimizations)
5403 {
5404 enum ir = `%r = shufflevector <8 x i16> %0, <8 x i16> %1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
5405 ret <8 x i16> %r`;
5406 return cast(__m128i) LDCInlineIR!(ir, short8, short8, short8)(cast(short8)a, cast(short8)b);
5407 }
5408 else static if (DMD_with_32bit_asm || LDC_with_x86_asm)
5409 {
5410 asm pure nothrow @nogc @trusted
5411 {
5412 movdqu XMM0, a;
5413 movdqu XMM1, b;
5414 punpcklwd XMM0, XMM1;
5415 movdqu a, XMM0;
5416 }
5417 return a;
5418 }
5419 else
5420 {
5421 short8 r = void;
5422 short8 sa = cast(short8)a;
5423 short8 sb = cast(short8)b;
5424 r.ptr[0] = sa.array[0];
5425 r.ptr[1] = sb.array[0];
5426 r.ptr[2] = sa.array[1];
5427 r.ptr[3] = sb.array[1];
5428 r.ptr[4] = sa.array[2];
5429 r.ptr[5] = sb.array[2];
5430 r.ptr[6] = sa.array[3];
5431 r.ptr[7] = sb.array[3];
5432 return cast(__m128i)r;
5433 }
5434 }
5435 unittest
5436 {
5437 __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
5438 __m128i B = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
5439 short8 C = cast(short8) _mm_unpacklo_epi16(A, B);
5440 short[8] correct = [0, 8, 1, 9, 2, 10, 3, 11];
5441 assert(C.array == correct);
5442 }

/// Unpack and interleave 32-bit integers from the low half of `a` and `b`.
__m128i _mm_unpacklo_epi32 (__m128i a, __m128i b) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd(XMM.PUNPCKLDQ, a, b);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_punpckldq128(a, b);
    }
    else static if (LDC_with_optimizations)
    {
        enum ir = `%r = shufflevector <4 x i32> %0, <4 x i32> %1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
                   ret <4 x i32> %r`;
        return LDCInlineIR!(ir, int4, int4, int4)(cast(int4)a, cast(int4)b);
    }
    else
    {
        __m128i r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = b.array[0];
        r.ptr[2] = a.array[1];
        r.ptr[3] = b.array[1];
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(1, 2, 3, 4);
    __m128i B = _mm_setr_epi32(5, 6, 7, 8);
    __m128i C = _mm_unpacklo_epi32(A, B);
    int[4] correct = [1, 5, 2, 6];
    assert(C.array == correct);
}

/// Unpack and interleave 64-bit integers from the low half of `a` and `b`.
__m128i _mm_unpacklo_epi64 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_punpcklqdq128(cast(long2) a, cast(long2) b);
    }
    else
    {
        long2 lA = cast(long2)a;
        long2 lB = cast(long2)b;
        long2 R; // PERF =void;
        R.ptr[0] = lA.array[0];
        R.ptr[1] = lB.array[0];
        return cast(__m128i)R;
    }
}
unittest // Issue #36
{
    __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333);
    __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555);
    long2 C = cast(long2)(_mm_unpacklo_epi64(A, B));
    long[2] correct = [0x22222222_22222222, 0x44444444_44444444];
    assert(C.array == correct);
}
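
// Added illustrative example (not from the original test suite): passing the
// same operand twice broadcasts the low 64-bit lane into both lanes.
unittest
{
    __m128i A = _mm_setr_epi64(0x1111_1111_1111_1111, 0x2222_2222_2222_2222);
    long2 R = cast(long2) _mm_unpacklo_epi64(A, A);
    long[2] correct = [0x1111_1111_1111_1111, 0x1111_1111_1111_1111];
    assert(R.array == correct);
}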

/// Unpack and interleave 8-bit integers from the low half of `a` and `b`.
__m128i _mm_unpacklo_epi8 (__m128i a, __m128i b) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd(XMM.PUNPCKLBW, a, b);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_punpcklbw128(cast(ubyte16) a, cast(ubyte16) b);
    }
    else static if (LDC_with_optimizations)
    {
        enum ir = `%r = shufflevector <16 x i8> %0, <16 x i8> %1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
                   ret <16 x i8> %r`;
        return cast(__m128i)LDCInlineIR!(ir, byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
    }
    else static if (DMD_with_32bit_asm || LDC_with_x86_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, b;
            punpcklbw XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        byte16 r = void;
        byte16 ba = cast(byte16)a;
        byte16 bb = cast(byte16)b;
        r.ptr[0] = ba.array[0];
        r.ptr[1] = bb.array[0];
        r.ptr[2] = ba.array[1];
        r.ptr[3] = bb.array[1];
        r.ptr[4] = ba.array[2];
        r.ptr[5] = bb.array[2];
        r.ptr[6] = ba.array[3];
        r.ptr[7] = bb.array[3];
        r.ptr[8] = ba.array[4];
        r.ptr[9] = bb.array[4];
        r.ptr[10] = ba.array[5];
        r.ptr[11] = bb.array[5];
        r.ptr[12] = ba.array[6];
        r.ptr[13] = bb.array[6];
        r.ptr[14] = ba.array[7];
        r.ptr[15] = bb.array[7];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
    byte16 C = cast(byte16) _mm_unpacklo_epi8(A, B);
    byte[16] correct = [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23];
    assert(C.array == correct);
}
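
// Added illustrative example (not from the original test suite): the classic
// zero-extension idiom, interleaving with a zero vector to widen the low
// eight (non-negative) bytes to 16-bit integers.
unittest
{
    __m128i A = _mm_setr_epi8(10, 20, 30, 40, 50, 60, 70, 80,
                              0, 0, 0, 0, 0, 0, 0, 0);
    short8 widened = cast(short8) _mm_unpacklo_epi8(A, _mm_setzero_si128());
    short[8] correct = [10, 20, 30, 40, 50, 60, 70, 80];
    assert(widened.array == correct);
}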

/// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of `a` and `b`.
__m128d _mm_unpacklo_pd (__m128d a, __m128d b) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_unpcklpd(a, b);
    }
    else static if (LDC_with_optimizations)
    {
        enum ir = `%r = shufflevector <2 x double> %0, <2 x double> %1, <2 x i32> <i32 0, i32 2>
                   ret <2 x double> %r`;
        return LDCInlineIR!(ir, double2, double2, double2)(a, b);
    }
    else
    {
        double2 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = b.array[0];
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(4.0, 6.0);
    __m128d B = _mm_setr_pd(7.0, 9.0);
    __m128d C = _mm_unpacklo_pd(A, B);
    double[2] correct = [4.0, 7.0];
    assert(C.array == correct);
}
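
// Added illustrative example (not from the original test suite): combined with
// _mm_unpackhi_pd, this interleave performs a 2x2 transpose of double-precision
// data held in two registers.
unittest
{
    __m128d row0 = _mm_setr_pd(1.0, 2.0);
    __m128d row1 = _mm_setr_pd(3.0, 4.0);
    __m128d col0 = _mm_unpacklo_pd(row0, row1); // first element of each row
    __m128d col1 = _mm_unpackhi_pd(row0, row1); // second element of each row
    double[2] correct0 = [1.0, 3.0];
    double[2] correct1 = [2.0, 4.0];
    assert(col0.array == correct0);
    assert(col1.array == correct1);
}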

/// Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m128d _mm_xor_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d)(cast(__m128i)a ^ cast(__m128i)b);
}
unittest
{
    __m128d A = _mm_setr_pd(-4.0, 6.0);
    __m128d B = _mm_setr_pd(4.0, -6.0);
    long2 R = cast(long2) _mm_xor_pd(A, B);
    long[2] correct = [long.min, long.min];
    assert(R.array == correct);
}
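
// Added illustrative example (not from the original test suite): negating both
// lanes by XORing with the sign-bit mask -0.0, a common use of _mm_xor_pd.
unittest
{
    __m128d signMask = _mm_set1_pd(-0.0);
    __m128d X = _mm_setr_pd(2.5, -8.0);
    __m128d negated = _mm_xor_pd(X, signMask);
    double[2] correct = [-2.5, 8.0];
    assert(negated.array == correct);
}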

/// Compute the bitwise XOR of 128 bits (representing integer data) in `a` and `b`.
__m128i _mm_xor_si128 (__m128i a, __m128i b) pure @safe
{
    return a ^ b;
}
unittest
{
    __m128i A = _mm_setr_epi64(975394, 619809709);
    __m128i B = _mm_setr_epi64(-920275025, -6);
    long2 R = cast(long2) _mm_xor_si128(A, B);
    long[2] correct = [975394 ^ (-920275025L), 619809709L ^ -6];
    assert(R.array == correct);
}
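
// Added illustrative example (not from the original test suite): XORing with an
// all-ones vector computes the bitwise complement of every lane.
unittest
{
    __m128i allOnes = _mm_set1_epi32(-1);
    __m128i A = _mm_setr_epi32(0, -1, 0x0F0F0F0F, 123456);
    int4 R = cast(int4) _mm_xor_si128(A, allOnes);
    int[4] correct = [-1, 0, ~0x0F0F0F0F, ~123456];
    assert(R.array == correct);
}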

// End-to-end sanity check: computes the Euclidean distance between two 4D points
// using only intrinsics provided by this module and inteli.xmmintrin.
unittest
{
    float distance(float[4] a, float[4] b) nothrow @nogc
    {
        __m128 va = _mm_loadu_ps(a.ptr);
        __m128 vb = _mm_loadu_ps(b.ptr);
        __m128 diffSquared = _mm_sub_ps(va, vb);
        diffSquared = _mm_mul_ps(diffSquared, diffSquared);
        // Horizontal sum of the four squared differences, then square root of lane 0.
        __m128 sum = _mm_add_ps(diffSquared, _mm_srli_ps!8(diffSquared));
        sum = _mm_add_ps(sum, _mm_srli_ps!4(sum));
        return _mm_cvtss_f32(_mm_sqrt_ss(sum));
    }
    assert(distance([0, 2, 0, 0], [0, 0, 0, 0]) == 2);
}