
/**
 * MMX intrinsics.
 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=MMX
 *
 * Copyright: Copyright Guillaume Piolat 2019-2020.
 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
 */
module inteli.mmx;

public import inteli.types;
import inteli.internals;

import inteli.xmmintrin;
import inteli.emmintrin;

nothrow @nogc:

// Important: you don't need to call _mm_empty when using the MMX capabilities of
// intel-intrinsics, since it just generates the right IR; cleaning up FPU registers
// is up to the codegen. intel-intrinsics only provides the semantics.
// Even GDC does not seem to use the mm0-mm7 registers, preferring xmm0-xmm7 instead.

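// Illustrative sketch (added example, not part of the original module): MMX-style
// intrinsics mix freely with ordinary D code, with no _mm_empty() bookkeeping needed.
unittest
{
    __m64 acc = _mm_setzero_si64();
    acc = _mm_add_pi16(acc, _mm_set1_pi16(3));
    short4 r = cast(short4) acc;
    short[4] correct = [3, 3, 3, 3];
    assert(r.array == correct);
    // No _mm_empty() call required before returning to FPU code.
}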

/// Add packed 16-bit integers in `a` and `b`.
__m64 _mm_add_pi16 (__m64 a, __m64 b)
{
    return cast(__m64)(cast(short4)a + cast(short4)b);
}
unittest
{
    short4 R = cast(short4) _mm_add_pi16(_mm_set1_pi16(4), _mm_set1_pi16(3));
    short[4] correct = [7, 7, 7, 7];
    assert(R.array == correct);
}

/// Add packed 32-bit integers in `a` and `b`.
__m64 _mm_add_pi32 (__m64 a, __m64 b)
{
    return cast(__m64)(cast(int2)a + cast(int2)b);
}
unittest
{
    int2 R = cast(int2) _mm_add_pi32(_mm_set1_pi32(4), _mm_set1_pi32(3));
    int[2] correct = [7, 7];
    assert(R.array == correct);
}

/// Add packed 8-bit integers in `a` and `b`.
__m64 _mm_add_pi8 (__m64 a, __m64 b)
{
    return cast(__m64)(cast(byte8)a + cast(byte8)b);
}
unittest
{
    byte8 R = cast(byte8) _mm_add_pi8(_mm_set1_pi8(127), _mm_set1_pi8(-128));
    byte[8] correct = [-1, -1, -1, -1, -1, -1, -1, -1];
    assert(R.array == correct);
}

/// Add packed 16-bit integers in `a` and `b` using signed saturation.
// PERF: PADDSW not generated
__m64 _mm_adds_pi16(__m64 a, __m64 b) pure @trusted
{
    return to_m64(_mm_adds_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 res = cast(short4) _mm_adds_pi16(_mm_set_pi16(3, 2, 1, 0),
                                            _mm_set_pi16(3, 2, 1, 0));
    static immutable short[4] correctResult = [0, 2, 4, 6];
    assert(res.array == correctResult);
}

/// Add packed 8-bit integers in `a` and `b` using signed saturation.
// PERF: PADDSB not generated
__m64 _mm_adds_pi8(__m64 a, __m64 b) pure @trusted
{
    return to_m64(_mm_adds_epi8(to_m128i(a), to_m128i(b)));
}
unittest
{
    byte8 res = cast(byte8) _mm_adds_pi8(_mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0),
                                         _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[8] correctResult = [0, 2, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}

/// Add packed 16-bit integers in `a` and `b` using unsigned saturation.
// PERF: PADDUSW not generated
__m64 _mm_adds_pu16(__m64 a, __m64 b) pure @trusted
{
    return to_m64(_mm_adds_epu16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 res = cast(short4) _mm_adds_pu16(_mm_set_pi16(3, 2, cast(short)65535, 0),
                                            _mm_set_pi16(3, 2, 1, 0));
    static immutable short[4] correctResult = [0, cast(short)65535, 4, 6];
    assert(res.array == correctResult);
}

/// Add packed 8-bit integers in `a` and `b` using unsigned saturation.
// PERF: PADDUSB not generated
__m64 _mm_adds_pu8(__m64 a, __m64 b) pure @trusted
{
    return to_m64(_mm_adds_epu8(to_m128i(a), to_m128i(b)));
}
unittest
{
    byte8 res = cast(byte8) _mm_adds_pu8(_mm_set_pi8(7, 6, 5, 4, 3, 2, cast(byte)255, 0),
                                         _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[8] correctResult = [0, cast(byte)255, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}

/// Compute the bitwise AND of 64 bits (representing integer data) in `a` and `b`.
__m64 _mm_and_si64 (__m64 a, __m64 b) pure @safe
{
    return a & b;
}
unittest
{
    __m64 A = [7];
    __m64 B = [14];
    __m64 R = _mm_and_si64(A, B);
    assert(R.array[0] == 6);
}

/// Compute the bitwise NOT of 64 bits (representing integer data) in `a` and then AND with `b`.
__m64 _mm_andnot_si64 (__m64 a, __m64 b)
{
    return (~a) & b;
}
unittest
{
    __m64 A = [7];
    __m64 B = [14];
    __m64 R = _mm_andnot_si64(A, B);
    assert(R.array[0] == 8);
}

/// Compare packed 16-bit integers in `a` and `b` for equality.
__m64 _mm_cmpeq_pi16 (__m64 a, __m64 b) pure @safe
{
    static if (SIMD_COMPARISON_MASKS_8B)
    {
        return cast(__m64)(cast(short4)a == cast(short4)b);
    }
    else static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpeqw(cast(short4)a, cast(short4)b);
    }
    else
    {
        return cast(__m64) equalMask!short4(cast(short4)a, cast(short4)b);
    }
}
unittest
{
    short4   A = [-3, -2, -1,  0];
    short4   B = [ 4,  3,  2,  1];
    short[4] E = [ 0,  0,  0,  0];
    short4   R = cast(short4)(_mm_cmpeq_pi16(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}

/// Compare packed 32-bit integers in `a` and `b` for equality.
__m64 _mm_cmpeq_pi32 (__m64 a, __m64 b) pure @safe
{
    static if (SIMD_COMPARISON_MASKS_8B)
    {
        return cast(__m64)(cast(int2)a == cast(int2)b);
    }
    else static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpeqd(cast(int2)a, cast(int2)b);
    }
    else
    {
        return cast(__m64) equalMask!int2(cast(int2)a, cast(int2)b);
    }
}
unittest
{
    int2   A = [-3, -2];
    int2   B = [ 4, -2];
    int[2] E = [ 0, -1];
    int2   R = cast(int2)(_mm_cmpeq_pi32(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}

/// Compare packed 8-bit integers in `a` and `b` for equality.
__m64 _mm_cmpeq_pi8 (__m64 a, __m64 b) pure @safe
{
    static if (SIMD_COMPARISON_MASKS_8B)
    {
        return cast(__m64)(cast(byte8)a == cast(byte8)b);
    }
    else static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpeqb(cast(ubyte8)a, cast(ubyte8)b);
    }
    else
    {
        return cast(__m64) equalMask!byte8(cast(byte8)a, cast(byte8)b);
    }
}
unittest
{
    __m64 A = _mm_setr_pi8(1, 2, 3, 1, 2, 1, 1, 2);
    __m64 B = _mm_setr_pi8(2, 2, 1, 2, 3, 1, 2, 3);
    byte8 C = cast(byte8) _mm_cmpeq_pi8(A, B);
    byte[8] correct =     [0,-1, 0, 0, 0,-1, 0, 0];
    assert(C.array == correct);
}

/// Compare packed 16-bit integers in `a` and `b` for greater-than.
__m64 _mm_cmpgt_pi16 (__m64 a, __m64 b) pure @safe
{
    static if (SIMD_COMPARISON_MASKS_8B)
    {
        return cast(__m64)(cast(short4)a > cast(short4)b);
    }
    else static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpgtw(cast(short4)a, cast(short4)b);
    }
    else
    {
        return cast(__m64) greaterMask!short4(cast(short4)a, cast(short4)b);
    }
}
unittest
{
    short4   A = [-3, -2, -1,  0];
    short4   B = [ 4,  3,  2,  1];
    short[4] E = [ 0,  0,  0,  0];
    short4   R = cast(short4)(_mm_cmpgt_pi16(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}

/// Compare packed 32-bit integers in `a` and `b` for greater-than.
__m64 _mm_cmpgt_pi32 (__m64 a, __m64 b) pure @safe
{
    static if (SIMD_COMPARISON_MASKS_8B)
    {
        return cast(__m64)(cast(int2)a > cast(int2)b);
    }
    else static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpgtd(cast(int2)a, cast(int2)b);
    }
    else
    {
        return cast(__m64) greaterMask!int2(cast(int2)a, cast(int2)b);
    }
}
unittest
{
    int2   A = [-3,  2];
    int2   B = [ 4, -2];
    int[2] E = [ 0, -1];
    int2   R = cast(int2)(_mm_cmpgt_pi32(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}

/// Compare packed signed 8-bit integers in `a` and `b` for greater-than.
__m64 _mm_cmpgt_pi8 (__m64 a, __m64 b) pure @safe
{
    static if (SIMD_COMPARISON_MASKS_8B)
    {
        return cast(__m64)(cast(byte8)a > cast(byte8)b);
    }
    else static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpgtb(cast(ubyte8)a, cast(ubyte8)b);
    }
    else
    {
        return cast(__m64) greaterMask!byte8(cast(byte8)a, cast(byte8)b);
    }
}
unittest
{
    __m64 A = _mm_setr_pi8(1, 2, 3, 1, 2, 1, 1, 2);
    __m64 B = _mm_setr_pi8(2, 2, 1, 2, 3, 1, 2, 3);
    byte8 C = cast(byte8) _mm_cmpgt_pi8(A, B);
    byte[8] correct =     [0, 0,-1, 0, 0, 0, 0, 0];
    assert(C.array == correct);
}

/// Copy 64-bit integer `a` to `dst`.
long _mm_cvtm64_si64 (__m64 a) pure @safe
{
    return a.array[0];
}
unittest
{
    __m64 A = _mm_setr_pi32(2, 1);
    assert(_mm_cvtm64_si64(A) == 0x100000002);
}

/// Copy 32-bit integer `a` to the lower element of `dst`, and zero the upper element of `dst`.
__m64 _mm_cvtsi32_si64 (int a) pure @trusted
{
    __m64 r = void;
    r.ptr[0] = a;
    return r;
}
unittest
{
    __m64 R = _mm_cvtsi32_si64(-1);
    assert(R.array[0] == -1);
}

/// Copy 64-bit integer `a` to `dst`.
__m64 _mm_cvtsi64_m64 (long a) pure @trusted
{
    __m64 r = void;
    r.ptr[0] = a;
    return r;
}
unittest
{
    __m64 R = _mm_cvtsi64_m64(0x123456789A);
    assert(R.array[0] == 0x123456789A);
}

/// Get the lower 32-bit integer in `a`.
int _mm_cvtsi64_si32 (__m64 a) pure @safe
{
    int2 r = cast(int2)a;
    return r.array[0];
}
unittest
{
    __m64 A = _mm_setr_pi32(-6, 5);
    int R = _mm_cvtsi64_si32(A);
    assert(R == -6);
}

/// Empty the MMX state, which marks the x87 FPU registers as available for
/// use by x87 instructions.
/// This instruction is normally required at the end of every MMX technology procedure,
/// but it is unnecessary with `intel-intrinsics`, on all D compilers.
void _mm_empty() pure @safe
{
    // Do nothing; see the comment at the top of this file.
}
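
// Illustrative sketch (added example): _mm_empty is a no-op with intel-intrinsics,
// so code ported from C can keep calling it harmlessly.
unittest
{
    _mm_empty();
}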

deprecated alias _m_empty = _mm_empty; /// Deprecated intrinsics.
deprecated alias _m_from_int = _mm_cvtsi32_si64; ///ditto
deprecated alias _m_from_int64 = _mm_cvtsi64_m64; ///ditto

/// Multiply packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers.
/// Horizontally add adjacent pairs of intermediate 32-bit integers.
__m64 _mm_madd_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_madd_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 A = [-32768, -32768, 32767, 32767];
    short4 B = [-32768, -32768, 32767, 32767];
    int2 R = cast(int2) _mm_madd_pi16(cast(__m64)A, cast(__m64)B);
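    // Lane 0: (-32768)*(-32768) + (-32768)*(-32768) = 0x8000_0000, which is
    // int.min when read back as a signed 32-bit integer; lane 1 is 2*32767*32767.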
    int[2] correct = [-2147483648, 2*32767*32767];
    assert(R.array == correct);
}

/// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers,
/// and store the high 16 bits of the intermediate integers.
__m64 _mm_mulhi_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_mulhi_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    __m64 A = _mm_setr_pi16(4, 8, -16, 7);
    __m64 B = _mm_set1_pi16(16384);
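    // e.g. 4 * 16384 = 0x0001_0000, whose high 16 bits are 1.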
    short4 R = cast(short4)_mm_mulhi_pi16(A, B);
    short[4] correct = [1, 2, -4, 1];
    assert(R.array == correct);
}

/// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers,
/// and store the low 16 bits of the intermediate integers.
__m64 _mm_mullo_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_mullo_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    __m64 A = _mm_setr_pi16(4, 1, 16, 7);
    __m64 B = _mm_set1_pi16(16384);
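    // e.g. 7 * 16384 = 0x0001_C000, whose low 16 bits read back as -16384.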
    short4 R = cast(short4)_mm_mullo_pi16(A, B);
    short[4] correct = [0, 16384, 0, -16384];
    assert(R.array == correct);
}

/// Compute the bitwise OR of 64 bits in `a` and `b`.
__m64 _mm_or_si64 (__m64 a, __m64 b) pure @safe
{
    return a | b;
}
unittest
{
    __m64 A = _mm_setr_pi16(255, 1, -1, 0);
    __m64 B = _mm_set1_pi16(15);
    short4 R = cast(short4)_mm_or_si64(A, B);
    short[4] correct =     [255, 15, -1, 15];
    assert(R.array == correct);
}

/// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers using signed saturation.
__m64 _mm_packs_pi16 (__m64 a, __m64 b) pure @trusted
{
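    // Pack both operands with the SSE2 routine, then keep 32-bit lanes 0 and 2,
    // which hold the packed results of `a` and `b` respectively.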
    int4 p = cast(int4) _mm_packs_epi16(to_m128i(a), to_m128i(b));
    int2 r;
    r.ptr[0] = p.array[0];
    r.ptr[1] = p.array[2];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi16(256, -129, 254, 0);
    byte8 R = cast(byte8) _mm_packs_pi16(A, A);
    byte[8] correct = [127, -128, 127, 0, 127, -128, 127, 0];
    assert(R.array == correct);
}

/// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers using signed saturation.
__m64 _mm_packs_pi32 (__m64 a, __m64 b) pure @trusted
{
    int4 p = cast(int4) _mm_packs_epi32(to_m128i(a), to_m128i(b));
    int2 r;
    r.ptr[0] = p.array[0];
    r.ptr[1] = p.array[2];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi32(100000, -100000);
    short4 R = cast(short4) _mm_packs_pi32(A, A);
    short[4] correct = [32767, -32768, 32767, -32768];
    assert(R.array == correct);
}

/// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers using unsigned saturation.
__m64 _mm_packs_pu16 (__m64 a, __m64 b) pure @trusted
{
    int4 p = cast(int4) _mm_packus_epi16(to_m128i(a), to_m128i(b));
    int2 r;
    r.ptr[0] = p.array[0];
    r.ptr[1] = p.array[2];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi16(256, -129, 254, 0);
    byte8 R = cast(byte8) _mm_packs_pu16(A, A);
    ubyte[8] correct = [255, 0, 254, 0, 255, 0, 254, 0];
    assert(R.array == cast(byte[8])correct);
}

deprecated alias
    _m_packssdw = _mm_packs_pi32,     /// Deprecated intrinsics.
    _m_packsswb = _mm_packs_pi16,     ///ditto
    _m_packuswb = _mm_packs_pu16,     ///ditto
    _m_paddb = _mm_add_pi8,           ///ditto
    _m_paddd = _mm_add_pi32,          ///ditto
    _m_paddsb = _mm_adds_pi8,         ///ditto
    _m_paddsw = _mm_adds_pi16,        ///ditto
    _m_paddusb = _mm_adds_pu8,        ///ditto
    _m_paddusw = _mm_adds_pu16,       ///ditto
    _m_paddw = _mm_add_pi16,          ///ditto
    _m_pand = _mm_and_si64,           ///ditto
    _m_pandn = _mm_andnot_si64,       ///ditto
    _m_pcmpeqb = _mm_cmpeq_pi8,       ///ditto
    _m_pcmpeqd = _mm_cmpeq_pi32,      ///ditto
    _m_pcmpeqw = _mm_cmpeq_pi16,      ///ditto
    _m_pcmpgtb = _mm_cmpgt_pi8,       ///ditto
    _m_pcmpgtd = _mm_cmpgt_pi32,      ///ditto
    _m_pcmpgtw = _mm_cmpgt_pi16,      ///ditto
    _m_pmaddwd = _mm_madd_pi16,       ///ditto
    _m_pmulhw = _mm_mulhi_pi16,       ///ditto
    _m_pmullw = _mm_mullo_pi16,       ///ditto
    _m_por = _mm_or_si64,             ///ditto
    _m_pslld = _mm_sll_pi32,          ///ditto
    _m_pslldi = _mm_slli_pi32,        ///ditto
    _m_psllq = _mm_sll_si64,          ///ditto
    _m_psllqi = _mm_slli_si64,        ///ditto
    _m_psllw = _mm_sll_pi16,          ///ditto
    _m_psllwi = _mm_slli_pi16,        ///ditto
    _m_psrad = _mm_sra_pi32,          ///ditto
    _m_psradi = _mm_srai_pi32,        ///ditto
    _m_psraw = _mm_sra_pi16,          ///ditto
    _m_psrawi = _mm_srai_pi16,        ///ditto
    _m_psrld = _mm_srl_pi32,          ///ditto
    _m_psrldi = _mm_srli_pi32,        ///ditto
    _m_psrlq = _mm_srl_si64,          ///ditto
    _m_psrlqi = _mm_srli_si64,        ///ditto
    _m_psrlw = _mm_srl_pi16,          ///ditto
    _m_psrlwi = _mm_srli_pi16,        ///ditto
    _m_psubb = _mm_sub_pi8,           ///ditto
    _m_psubd = _mm_sub_pi32,          ///ditto
    _m_psubsb = _mm_subs_pi8,         ///ditto
    _m_psubsw = _mm_subs_pi16,        ///ditto
    _m_psubusb = _mm_subs_pu8,        ///ditto
    _m_psubusw = _mm_subs_pu16,       ///ditto
    _m_psubw = _mm_sub_pi16,          ///ditto
    _m_punpckhbw = _mm_unpackhi_pi8,  ///ditto
    _m_punpckhdq = _mm_unpackhi_pi32, ///ditto
    _m_punpckhwd = _mm_unpackhi_pi16, ///ditto
    _m_punpcklbw = _mm_unpacklo_pi8,  ///ditto
    _m_punpckldq = _mm_unpacklo_pi32, ///ditto
    _m_punpcklwd = _mm_unpacklo_pi16, ///ditto
    _m_pxor = _mm_xor_si64;           ///ditto

/// Set packed 16-bit integers with the supplied values.
__m64 _mm_set_pi16 (short e3, short e2, short e1, short e0) pure @trusted
{
    short[4] arr = [e0, e1, e2, e3];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    short4 R = cast(short4) _mm_set_pi16(3, 2, 1, 0);
    short[4] correct = [0, 1, 2, 3];
    assert(R.array == correct);
}

/// Set packed 32-bit integers with the supplied values.
__m64 _mm_set_pi32 (int e1, int e0) pure @trusted
{
    int[2] arr = [e0, e1];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    int2 R = cast(int2) _mm_set_pi32(1, 0);
    int[2] correct = [0, 1];
    assert(R.array == correct);
}

/// Set packed 8-bit integers with the supplied values.
__m64 _mm_set_pi8 (byte e7, byte e6, byte e5, byte e4, byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    byte[8] arr = [e0, e1, e2, e3, e4, e5, e6, e7];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    byte8 R = cast(byte8) _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0);
    byte[8] correct = [0, 1, 2, 3, 4, 5, 6, 7];
    assert(R.array == correct);
}

/// Broadcast 16-bit integer `a` to all elements.
__m64 _mm_set1_pi16 (short a) pure @trusted
{
    return cast(__m64)(short4(a));
}
unittest
{
    short4 R = cast(short4) _mm_set1_pi16(44);
    short[4] correct = [44, 44, 44, 44];
    assert(R.array == correct);
}

/// Broadcast 32-bit integer `a` to all elements.
__m64 _mm_set1_pi32 (int a) pure @trusted
{
    return cast(__m64)(int2(a));
}
unittest
{
    int2 R = cast(int2) _mm_set1_pi32(43);
    int[2] correct = [43, 43];
    assert(R.array == correct);
}

/// Broadcast 8-bit integer `a` to all elements.
__m64 _mm_set1_pi8 (byte a) pure @trusted
{
    return cast(__m64)(byte8(a));
}
unittest
{
    byte8 R = cast(byte8) _mm_set1_pi8(42);
    byte[8] correct = [42, 42, 42, 42, 42, 42, 42, 42];
    assert(R.array == correct);
}

/// Set packed 16-bit integers with the supplied values in reverse order.
__m64 _mm_setr_pi16 (short e3, short e2, short e1, short e0) pure @trusted
{
    short[4] arr = [e3, e2, e1, e0];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    short4 R = cast(short4) _mm_setr_pi16(0, 1, 2, 3);
    short[4] correct = [0, 1, 2, 3];
    assert(R.array == correct);
}

/// Set packed 32-bit integers with the supplied values in reverse order.
__m64 _mm_setr_pi32 (int e1, int e0) pure @trusted
{
    int[2] arr = [e1, e0];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    int2 R = cast(int2) _mm_setr_pi32(0, 1);
    int[2] correct = [0, 1];
    assert(R.array == correct);
}

/// Set packed 8-bit integers with the supplied values in reverse order.
__m64 _mm_setr_pi8 (byte e7, byte e6, byte e5, byte e4, byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    byte[8] arr = [e7, e6, e5, e4, e3, e2, e1, e0];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    byte8 R = cast(byte8) _mm_setr_pi8(0, 1, 2, 3, 4, 5, 6, 7);
    byte[8] correct = [0, 1, 2, 3, 4, 5, 6, 7];
    assert(R.array == correct);
}

/// Return vector of type `__m64` with all elements set to zero.
__m64 _mm_setzero_si64 () pure @trusted
{
    __m64 r; // PERF =void;
    r.ptr[0] = 0;
    return r;
}
unittest
{
    __m64 R = _mm_setzero_si64();
    assert(R.array[0] == 0);
}

/// Shift packed 16-bit integers in `a` left by `bits` while shifting in zeros.
deprecated("Use _mm_slli_pi16 instead.") __m64 _mm_sll_pi16 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_sll_epi16(to_m128i(a), to_m128i(bits)));
}

/// Shift packed 32-bit integers in `a` left by `bits` while shifting in zeros.
deprecated("Use _mm_slli_pi32 instead.") __m64 _mm_sll_pi32 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_sll_epi32(to_m128i(a), to_m128i(bits)));
}

/// Shift 64-bit integer `a` left by `bits` while shifting in zeros.
deprecated("Use _mm_slli_si64 instead.") __m64 _mm_sll_si64 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_sll_epi64(to_m128i(a), to_m128i(bits)));
}

/// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros.
__m64 _mm_slli_pi16 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_slli_epi16(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_setr_pi16(-4, -5, 6, 7);
    short4 B = cast(short4)( _mm_slli_pi16(A, 1) );
    short[4] correct = [ -8, -10, 12, 14 ];
    assert(B.array == correct);
}

/// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros.
__m64 _mm_slli_pi32 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_slli_epi32(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_setr_pi32(-4, 5);
    int2 B = cast(int2)( _mm_slli_pi32(A, 1) );
    int[2] correct = [ -8, 10 ];
    assert(B.array == correct);
}

/// Shift 64-bit integer `a` left by `imm8` while shifting in zeros.
__m64 _mm_slli_si64 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_slli_epi64(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_cvtsi64_m64(-1);
    long1 R = cast(long1)( _mm_slli_si64(A, 1) );
    long[1] correct = [ -2 ];
    assert(R.array == correct);
}

/// Shift packed 16-bit integers in `a` right by `bits` while shifting in sign bits.
deprecated("Use _mm_srai_pi16 instead.") __m64 _mm_sra_pi16 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_sra_epi16(to_m128i(a), to_m128i(bits)));
}

/// Shift packed 32-bit integers in `a` right by `bits` while shifting in sign bits.
deprecated("Use _mm_srai_pi32 instead.") __m64 _mm_sra_pi32 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_sra_epi32(to_m128i(a), to_m128i(bits)));
}

/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits.
__m64 _mm_srai_pi16 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_srai_epi16(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_setr_pi16(-4, -5, 6, 7);
    short4 B = cast(short4)( _mm_srai_pi16(A, 1) );
    short[4] correct = [ -2, -3, 3, 3 ];
    assert(B.array == correct);
}

/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits.
__m64 _mm_srai_pi32 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_srai_epi32(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_setr_pi32(-4, 5);
    int2 B = cast(int2)( _mm_srai_pi32(A, 1) );
    int[2] correct = [ -2, 2 ];
    assert(B.array == correct);
}

/// Shift packed 16-bit integers in `a` right by `bits` while shifting in zeros.
deprecated("Use _mm_srli_pi16 instead.") __m64 _mm_srl_pi16 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_srl_epi16(to_m128i(a), to_m128i(bits)));
}

/// Shift packed 32-bit integers in `a` right by `bits` while shifting in zeros.
deprecated("Use _mm_srli_pi32 instead.") __m64 _mm_srl_pi32 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_srl_epi32(to_m128i(a), to_m128i(bits)));
}

/// Shift 64-bit integer `a` right by `bits` while shifting in zeros.
deprecated("Use _mm_srli_si64 instead.") __m64 _mm_srl_si64 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_srl_epi64(to_m128i(a), to_m128i(bits)));
}

/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros.
__m64 _mm_srli_pi16 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_srli_epi16(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_setr_pi16(-4, -5, 6, 7);
    short4 B = cast(short4)( _mm_srli_pi16(A, 1) );
    short[4] correct = [ 0x7ffe, 0x7ffd, 3, 3 ];
    assert(B.array == correct);
}

/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros.
__m64 _mm_srli_pi32 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_srli_epi32(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_setr_pi32(-4, 5);
    int2 B = cast(int2)( _mm_srli_pi32(A, 1) );
    int[2] correct = [ 0x7ffffffe, 2 ];
    assert(B.array == correct);
}

/// Shift 64-bit integer `a` right by `imm8` while shifting in zeros.
__m64 _mm_srli_si64 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_srli_epi64(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_cvtsi64_m64(-1);
    long1 R = cast(long1)( _mm_srli_si64(A, 1) );
    long[1] correct = [ 0x7fff_ffff_ffff_ffff ];
    assert(R.array == correct);
}

/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`.
__m64 _mm_sub_pi16 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64)(cast(short4)a - cast(short4)b);
}
unittest
{
    short4 R = cast(short4) _mm_sub_pi16(_mm_setr_pi16(cast(short)65534,  1, 5, -32768),
                                         _mm_setr_pi16(cast(short)65535, 16, 4, 4));
    static immutable short[4] correct =                            [ -1,-15, 1, 32764];
    assert(R.array == correct);
}

/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
__m64 _mm_sub_pi32 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64)(cast(int2)a - cast(int2)b);
}
unittest
{
    int2 R = cast(int2) _mm_sub_pi32(_mm_setr_pi32( 10,   4),
                                     _mm_setr_pi32( 15, -70));
    static immutable int[2] correct =             [ -5,  74];
    assert(R.array == correct);
}

/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`.
__m64 _mm_sub_pi8 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64)(cast(byte8)a - cast(byte8)b);
}
unittest
{
    byte8 R = cast(byte8) _mm_sub_pi8(_mm_setr_pi8(cast(byte)254, 127, 13, 12, 11, 10, 9, -128),
                                      _mm_setr_pi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8));
    static immutable byte[8] correct =                 [      -1,   7, -1,-30,  0,  0, 0, 120 ];
    assert(R.array == correct);
}

/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` using saturation.
__m64 _mm_subs_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_subs_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 R = cast(short4) _mm_subs_pi16(_mm_setr_pi16(cast(short)65534,  1, 5, -32768),
                                          _mm_setr_pi16(cast(short)65535, 16, 4, 4));
    static immutable short[4] correct =                             [ -1,-15, 1, -32768];
    assert(R.array == correct);
}

/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a` using saturation.
__m64 _mm_subs_pi8 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_subs_epi8(to_m128i(a), to_m128i(b)));
}
unittest
{
    byte8 R = cast(byte8) _mm_subs_pi8(_mm_setr_pi8(cast(byte)254, 127, 13, 12, 11, 10, 9, -128),
                                       _mm_setr_pi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8));
    static immutable byte[8] correct =                 [       -1,   7, -1,-30,  0,  0, 0, -128 ];
    assert(R.array == correct);
}

/// Subtract packed unsigned 16-bit integers in `b` from packed unsigned 16-bit integers in `a`
/// using saturation.
__m64 _mm_subs_pu16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_subs_epu16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 R = cast(short4) _mm_subs_pu16(_mm_setr_pi16(cast(short)65534,  1, 5, 4),
                                          _mm_setr_pi16(cast(short)65535, 16, 4, 4));
    static immutable short[4] correct =                              [ 0,  0, 1, 0];
    assert(R.array == correct);
}

/// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit integers in `a`
/// using saturation.
__m64 _mm_subs_pu8 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_subs_epu8(to_m128i(a), to_m128i(b)));
}
unittest
{
    byte8 R = cast(byte8) _mm_subs_pu8(_mm_setr_pi8(cast(byte)254, 127, 13, 12, 11, 10, 9, 8),
                                       _mm_setr_pi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8));
    static immutable byte[8] correct =                 [        0,   7,  0,  0,  0,  0, 0, 0 ];
    assert(R.array == correct);
}

deprecated alias _m_to_int = _mm_cvtsi64_si32;  /// Deprecated intrinsics.
deprecated alias _m_to_int64 = _mm_cvtm64_si64; ///ditto

/// Unpack and interleave 16-bit integers from the high half of `a` and `b`.
__m64 _mm_unpackhi_pi16 (__m64 a, __m64 b) pure @trusted
{
    static if (LDC_with_optimizations)
    {
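        // The shuffle mask <2, 6, 3, 7> interleaves the upper halves:
        // result = [a2, b2, a3, b3].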
        enum ir = `%r = shufflevector <4 x i16> %0, <4 x i16> %1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
                   ret <4 x i16> %r`;
        return cast(__m64) LDCInlineIR!(ir, short4, short4, short4)(cast(short4)a, cast(short4)b);
    }
    else
    {
        short4 ia = cast(short4)a;
        short4 ib = cast(short4)b;
        short4 r;
        r.ptr[0] = ia.array[2];
        r.ptr[1] = ib.array[2];
        r.ptr[2] = ia.array[3];
        r.ptr[3] = ib.array[3];
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi16(4, 8, -16, 7);
    __m64 B = _mm_setr_pi16(5, 9,  -3, 10);
    short4 R = cast(short4) _mm_unpackhi_pi16(A, B);
    short[4] correct = [-16, -3, 7, 10];
    assert(R.array == correct);
}

/// Unpack and interleave 32-bit integers from the high half of `a` and `b`.
__m64 _mm_unpackhi_pi32 (__m64 a, __m64 b) pure @trusted
{
    // Generates punpckldq as far back as LDC 1.0.0 -O1
    // (Yes, LLVM does generate punpckldq to reuse SSE2 instructions.)
    int2 ia = cast(int2)a;
    int2 ib = cast(int2)b;
    int2 r;
    r.ptr[0] = ia.array[1];
    r.ptr[1] = ib.array[1];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi32(4, 8);
    __m64 B = _mm_setr_pi32(5, 9);
    int2 R = cast(int2) _mm_unpackhi_pi32(A, B);
    int[2] correct = [8, 9];
    assert(R.array == correct);
}

/// Unpack and interleave 8-bit integers from the high half of `a` and `b`.
__m64 _mm_unpackhi_pi8 (__m64 a, __m64 b)
{
    static if (LDC_with_optimizations)
    {
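        // Mask <4, 12, 5, 13, 6, 14, 7, 15> interleaves the upper halves:
        // result = [a4, b4, a5, b5, a6, b6, a7, b7].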
        enum ir = `%r = shufflevector <8 x i8> %0, <8 x i8> %1, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
                   ret <8 x i8> %r`;
        return cast(__m64) LDCInlineIR!(ir, byte8, byte8, byte8)(cast(byte8)a, cast(byte8)b);
    }
    else
    {
        byte8 ia = cast(byte8)a;
        byte8 ib = cast(byte8)b;
        byte8 r;
        r.ptr[0] = ia.array[4];
        r.ptr[1] = ib.array[4];
        r.ptr[2] = ia.array[5];
        r.ptr[3] = ib.array[5];
        r.ptr[4] = ia.array[6];
        r.ptr[5] = ib.array[6];
        r.ptr[6] = ia.array[7];
        r.ptr[7] = ib.array[7];
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi8( 1,  2,  3,  4,  5,  6,  7,  8);
    __m64 B = _mm_setr_pi8(-1, -2, -3, -4, -5, -6, -7, -8);
    byte8 R = cast(byte8) _mm_unpackhi_pi8(A, B);
    byte[8] correct = [5, -5, 6, -6, 7, -7, 8, -8];
    assert(R.array == correct);
}

/// Unpack and interleave 16-bit integers from the low half of `a` and `b`.
__m64 _mm_unpacklo_pi16 (__m64 a, __m64 b)
{
    // Generates punpcklwd since LDC 1.0.0 -O1
    short4 ia = cast(short4)a;
    short4 ib = cast(short4)b;
    short4 r;
    r.ptr[0] = ia.array[0];
    r.ptr[1] = ib.array[0];
    r.ptr[2] = ia.array[1];
    r.ptr[3] = ib.array[1];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi16(4, 8, -16, 7);
    __m64 B = _mm_setr_pi16(5, 9,  -3, 10);
    short4 R = cast(short4) _mm_unpacklo_pi16(A, B);
    short[4] correct = [4, 5, 8, 9];
    assert(R.array == correct);
}

/// Unpack and interleave 32-bit integers from the low half of `a` and `b`.
__m64 _mm_unpacklo_pi32 (__m64 a, __m64 b) pure @trusted
{
    // x86: Generates punpckldq as far back as LDC 1.0.0 -O1
    // ARM: Generates zip as far back as LDC 1.8.0 -O1
    int2 ia = cast(int2)a;
    int2 ib = cast(int2)b;
    int2 r;
    r.ptr[0] = ia.array[0];
    r.ptr[1] = ib.array[0];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi32(4, 8);
    __m64 B = _mm_setr_pi32(5, 9);
    int2 R = cast(int2) _mm_unpacklo_pi32(A, B);
    int[2] correct = [4, 5];
    assert(R.array == correct);
}

/// Unpack and interleave 8-bit integers from the low half of `a` and `b`.
__m64 _mm_unpacklo_pi8 (__m64 a, __m64 b)
{
    static if (LDC_with_optimizations)
    {
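        // Mask <0, 8, 1, 9, 2, 10, 3, 11> interleaves the lower halves:
        // result = [a0, b0, a1, b1, a2, b2, a3, b3].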
        enum ir = `%r = shufflevector <8 x i8> %0, <8 x i8> %1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
                   ret <8 x i8> %r`;
        return cast(__m64) LDCInlineIR!(ir, byte8, byte8, byte8)(cast(byte8)a, cast(byte8)b);
    }
    else
    {
        byte8 ia = cast(byte8)a;
        byte8 ib = cast(byte8)b;
        byte8 r;
        r.ptr[0] = ia.array[0];
        r.ptr[1] = ib.array[0];
        r.ptr[2] = ia.array[1];
        r.ptr[3] = ib.array[1];
        r.ptr[4] = ia.array[2];
        r.ptr[5] = ib.array[2];
        r.ptr[6] = ia.array[3];
        r.ptr[7] = ib.array[3];
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi8( 1,  2,  3,  4,  5,  6,  7,  8);
    __m64 B = _mm_setr_pi8(-1, -2, -3, -4, -5, -6, -7, -8);
    byte8 R = cast(byte8) _mm_unpacklo_pi8(A, B);
    byte[8] correct = [1, -1, 2, -2, 3, -3, 4, -4];
    assert(R.array == correct);
}

/// Compute the bitwise XOR of 64 bits (representing integer data) in `a` and `b`.
__m64 _mm_xor_si64 (__m64 a, __m64 b)
{
    return a ^ b;
}
unittest
{
    __m64 A = _mm_setr_pi16(255, 1, -1, 0);
    __m64 B = _mm_set1_pi16(15);
    short4 R = cast(short4)_mm_xor_si64(A, B);
    short[4] correct =     [240, 14, -16, 15];
    assert(R.array == correct);
}