
// Written in the D programming language.

/**
 * Builtin SIMD intrinsics
 *
 * Source: $(DRUNTIMESRC core/_simd.d)
 *
 * Copyright: Copyright Digital Mars 2012-2020
 * License:   $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
 * Authors:   $(HTTP digitalmars.com, Walter Bright),
 * Source:    $(DRUNTIMESRC core/_simd.d)
 */

module core.simd;

pure:
nothrow:
@safe:
@nogc:

/*******************************
 * Create a vector type.
 *
 * Parameters:
 *      T = one of double[2], float[4], void[16], byte[16], ubyte[16],
 *      short[8], ushort[8], int[4], uint[4], long[2], ulong[2].
 *      For 256 bit vectors,
 *      one of double[4], float[8], void[32], byte[32], ubyte[32],
 *      short[16], ushort[16], int[8], uint[8], long[4], ulong[4]
 */

template Vector(T)
{
    /* __vector is compiler magic, hide it behind a template.
     * The compiler will reject T's that don't work.
     */
    alias __vector(T) Vector;
}

/* Handy aliases
 */
version (LDC)
{
static if (is(Vector!(void[4])))    alias Vector!(void[4])    void4;        ///
static if (is(Vector!(byte[4])))    alias Vector!(byte[4])    byte4;        ///
static if (is(Vector!(ubyte[4])))   alias Vector!(ubyte[4])   ubyte4;       ///
static if (is(Vector!(short[2])))   alias Vector!(short[2])   short2;       ///
static if (is(Vector!(ushort[2])))  alias Vector!(ushort[2])  ushort2;      ///
}
static if (is(Vector!(void[8])))    alias Vector!(void[8])    void8;        ///
static if (is(Vector!(double[1])))  alias Vector!(double[1])  double1;      ///
static if (is(Vector!(float[2])))   alias Vector!(float[2])   float2;       ///
static if (is(Vector!(byte[8])))    alias Vector!(byte[8])    byte8;        ///
static if (is(Vector!(ubyte[8])))   alias Vector!(ubyte[8])   ubyte8;       ///
static if (is(Vector!(short[4])))   alias Vector!(short[4])   short4;       ///
static if (is(Vector!(ushort[4])))  alias Vector!(ushort[4])  ushort4;      ///
static if (is(Vector!(int[2])))     alias Vector!(int[2])     int2;         ///
static if (is(Vector!(uint[2])))    alias Vector!(uint[2])    uint2;        ///
static if (is(Vector!(long[1])))    alias Vector!(long[1])    long1;        ///
static if (is(Vector!(ulong[1])))   alias Vector!(ulong[1])   ulong1;       ///

static if (is(Vector!(void[16])))   alias Vector!(void[16])   void16;       ///
static if (is(Vector!(double[2])))  alias Vector!(double[2])  double2;      ///
static if (is(Vector!(float[4])))   alias Vector!(float[4])   float4;       ///
static if (is(Vector!(byte[16])))   alias Vector!(byte[16])   byte16;       ///
static if (is(Vector!(ubyte[16])))  alias Vector!(ubyte[16])  ubyte16;      ///
static if (is(Vector!(short[8])))   alias Vector!(short[8])   short8;       ///
static if (is(Vector!(ushort[8])))  alias Vector!(ushort[8])  ushort8;      ///
static if (is(Vector!(int[4])))     alias Vector!(int[4])     int4;         ///
static if (is(Vector!(uint[4])))    alias Vector!(uint[4])    uint4;        ///
static if (is(Vector!(long[2])))    alias Vector!(long[2])    long2;        ///
static if (is(Vector!(ulong[2])))   alias Vector!(ulong[2])   ulong2;       ///

static if (is(Vector!(void[32])))   alias Vector!(void[32])   void32;       ///
static if (is(Vector!(double[4])))  alias Vector!(double[4])  double4;      ///
static if (is(Vector!(float[8])))   alias Vector!(float[8])   float8;       ///
static if (is(Vector!(byte[32])))   alias Vector!(byte[32])   byte32;       ///
static if (is(Vector!(ubyte[32])))  alias Vector!(ubyte[32])  ubyte32;      ///
static if (is(Vector!(short[16])))  alias Vector!(short[16])  short16;      ///
static if (is(Vector!(ushort[16]))) alias Vector!(ushort[16]) ushort16;     ///
static if (is(Vector!(int[8])))     alias Vector!(int[8])     int8;         ///
static if (is(Vector!(uint[8])))    alias Vector!(uint[8])    uint8;        ///
static if (is(Vector!(long[4])))    alias Vector!(long[4])    long4;        ///
static if (is(Vector!(ulong[4])))   alias Vector!(ulong[4])   ulong4;       ///

static if (is(Vector!(void[64])))   alias Vector!(void[64])   void64;       ///
static if (is(Vector!(double[8])))  alias Vector!(double[8])  double8;      ///
static if (is(Vector!(float[16])))  alias Vector!(float[16])  float16;      ///
static if (is(Vector!(byte[64])))   alias Vector!(byte[64])   byte64;       ///
static if (is(Vector!(ubyte[64])))  alias Vector!(ubyte[64])  ubyte64;      ///
static if (is(Vector!(short[32])))  alias Vector!(short[32])  short32;      ///
static if (is(Vector!(ushort[32]))) alias Vector!(ushort[32]) ushort32;     ///
static if (is(Vector!(int[16])))    alias Vector!(int[16])    int16;        ///
static if (is(Vector!(uint[16])))   alias Vector!(uint[16])   uint16;       ///
static if (is(Vector!(long[8])))    alias Vector!(long[8])    long8;        ///
static if (is(Vector!(ulong[8])))   alias Vector!(ulong[8])   ulong8;       ///

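/* Example: element-wise arithmetic on the vector aliases above. A minimal
 * sketch assuming a target with 128-bit float vectors (e.g. x86 with SSE);
 * the static if guard skips it where float4 is not available.
 */
static if (is(Vector!(float[4])))
{
    unittest
    {
        float4 a = [1.0f, 2.0f, 3.0f, 4.0f];
        float4 b = [2.0f, 2.0f, 2.0f, 2.0f];
        float4 c = a * b + a;            // per-lane multiply and add

        float[4] expected = [3.0f, 6.0f, 9.0f, 12.0f];
        assert(c.array == expected);     // .array exposes the underlying float[4]
    }
}
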
version (LDC)
{
    public import ldc.simd : loadUnaligned, storeUnaligned;

    /*********************
    * Emit prefetch instruction.
    * Params:
    *    address = address to be prefetched
    *    writeFetch = true for write fetch, false for read fetch
    *    locality = 0..3 (0 meaning least local, 3 meaning most local)
    */
    pragma(inline, true)
    void prefetch(bool writeFetch, ubyte locality)(const(void)* address)
    {
        import ldc.intrinsics : llvm_prefetch;
        static assert(locality < 4, "0..3 expected for locality");
        enum dataCache = 1;
        llvm_prefetch(address, writeFetch, locality, dataCache);
    }

    unittest
    {
        float[4] data = [ 0.5, 1, 1.5, 2 ];
        auto ptr = &data[0];

        prefetch!(false, 0)(ptr);
        auto v = loadUnaligned!float4(ptr);
        v *= 2;
        storeUnaligned!float4(v, ptr);

        float[4] expected = [ 1, 2, 3, 4 ];
        assert(data == expected);
    }
}
else version (D_SIMD)
{
    /** XMM opcodes that conform to the following:
    *
    *  opcode xmm1,xmm2/mem
    *
    * and do not have side effects (i.e. do not write to memory).
    */
    enum XMM
    {
        ADDSS = 0xF30F58,
        ADDSD = 0xF20F58,
        ADDPS = 0x000F58,
        ADDPD = 0x660F58,
        PADDB = 0x660FFC,
        PADDW = 0x660FFD,
        PADDD = 0x660FFE,
        PADDQ = 0x660FD4,

        SUBSS = 0xF30F5C,
        SUBSD = 0xF20F5C,
        SUBPS = 0x000F5C,
        SUBPD = 0x660F5C,
        PSUBB = 0x660FF8,
        PSUBW = 0x660FF9,
        PSUBD = 0x660FFA,
        PSUBQ = 0x660FFB,

        MULSS = 0xF30F59,
        MULSD = 0xF20F59,
        MULPS = 0x000F59,
        MULPD = 0x660F59,
        PMULLW = 0x660FD5,

        DIVSS = 0xF30F5E,
        DIVSD = 0xF20F5E,
        DIVPS = 0x000F5E,
        DIVPD = 0x660F5E,

        PAND  = 0x660FDB,
        POR   = 0x660FEB,

        UCOMISS = 0x000F2E,
        UCOMISD = 0x660F2E,

        XORPS = 0x000F57,
        XORPD = 0x660F57,

        // Use STO and LOD instead of MOV to distinguish the direction
        // (Destination is first operand, Source is second operand)
        STOSS  = 0xF30F11,        /// MOVSS xmm1/m32, xmm2
        STOSD  = 0xF20F11,        /// MOVSD xmm1/m64, xmm2
        STOAPS = 0x000F29,        /// MOVAPS xmm2/m128, xmm1
        STOAPD = 0x660F29,        /// MOVAPD xmm2/m128, xmm1
        STODQA = 0x660F7F,        /// MOVDQA xmm2/m128, xmm1
        STOD   = 0x660F7E,        /// MOVD reg/mem64, xmm   66 0F 7E /r
        STOQ   = 0x660FD6,        /// MOVQ xmm2/m64, xmm1

        LODSS  = 0xF30F10,        /// MOVSS xmm1, xmm2/m32
        LODSD  = 0xF20F10,        /// MOVSD xmm1, xmm2/m64
        LODAPS = 0x000F28,        /// MOVAPS xmm1, xmm2/m128
        LODAPD = 0x660F28,        /// MOVAPD xmm1, xmm2/m128
        LODDQA = 0x660F6F,        /// MOVDQA xmm1, xmm2/m128
        LODD   = 0x660F6E,        /// MOVD xmm, reg/mem64   66 0F 6E /r
        LODQ   = 0xF30F7E,        /// MOVQ xmm1, xmm2/m64

        LODDQU   = 0xF30F6F,      /// MOVDQU xmm1, xmm2/mem128  F3 0F 6F /r
        STODQU   = 0xF30F7F,      /// MOVDQU xmm1/mem128, xmm2  F3 0F 7F /r
        MOVDQ2Q  = 0xF20FD6,      /// MOVDQ2Q mmx, xmm          F2 0F D6 /r
        MOVHLPS  = 0x0F12,        /// MOVHLPS xmm1, xmm2        0F 12 /r
        LODHPD   = 0x660F16,      /// MOVHPD xmm1, m64
        STOHPD   = 0x660F17,      /// MOVHPD mem64, xmm1        66 0F 17 /r
        LODHPS   = 0x0F16,        /// MOVHPS xmm1, m64
        STOHPS   = 0x0F17,        /// MOVHPS m64, xmm1
        MOVLHPS  = 0x0F16,        /// MOVLHPS xmm1, xmm2
        LODLPD   = 0x660F12,      /// MOVLPD xmm1, m64
        STOLPD   = 0x660F13,      /// MOVLPD m64, xmm1
        LODLPS   = 0x0F12,        /// MOVLPS xmm1, m64
        STOLPS   = 0x0F13,        /// MOVLPS m64, xmm1
        MOVMSKPD = 0x660F50,      /// MOVMSKPD reg, xmm
        MOVMSKPS = 0x0F50,        /// MOVMSKPS reg, xmm
        MOVNTDQ  = 0x660FE7,      /// MOVNTDQ m128, xmm1
        MOVNTI   = 0x0FC3,        /// MOVNTI m32, r32
        MOVNTPD  = 0x660F2B,      /// MOVNTPD m128, xmm1
        MOVNTPS  = 0x0F2B,        /// MOVNTPS m128, xmm1
        MOVNTQ   = 0x0FE7,        /// MOVNTQ m64, mm
        MOVQ2DQ  = 0xF30FD6,      /// MOVQ2DQ
        LODUPD   = 0x660F10,      /// MOVUPD xmm1, xmm2/m128
        STOUPD   = 0x660F11,      /// MOVUPD xmm2/m128, xmm1
        LODUPS   = 0x0F10,        /// MOVUPS xmm1, xmm2/m128
        STOUPS   = 0x0F11,        /// MOVUPS xmm2/m128, xmm1

        PACKSSDW = 0x660F6B,
        PACKSSWB = 0x660F63,
        PACKUSWB = 0x660F67,
        PADDSB = 0x660FEC,
        PADDSW = 0x660FED,
        PADDUSB = 0x660FDC,
        PADDUSW = 0x660FDD,
        PANDN = 0x660FDF,
        PCMPEQB = 0x660F74,
        PCMPEQD = 0x660F76,
        PCMPEQW = 0x660F75,
        PCMPGTB = 0x660F64,
        PCMPGTD = 0x660F66,
        PCMPGTW = 0x660F65,
        PMADDWD = 0x660FF5,
        PSLLW = 0x660FF1,
        PSLLD = 0x660FF2,
        PSLLQ = 0x660FF3,
        PSRAW = 0x660FE1,
        PSRAD = 0x660FE2,
        PSRLW = 0x660FD1,
        PSRLD = 0x660FD2,
        PSRLQ = 0x660FD3,
        PSUBSB = 0x660FE8,
        PSUBSW = 0x660FE9,
        PSUBUSB = 0x660FD8,
        PSUBUSW = 0x660FD9,
        PUNPCKHBW = 0x660F68,
        PUNPCKHDQ = 0x660F6A,
        PUNPCKHWD = 0x660F69,
        PUNPCKLBW = 0x660F60,
        PUNPCKLDQ = 0x660F62,
        PUNPCKLWD = 0x660F61,
        PXOR = 0x660FEF,
        ANDPD = 0x660F54,
        ANDPS = 0x0F54,
        ANDNPD = 0x660F55,
        ANDNPS = 0x0F55,
        CMPPS = 0x0FC2,
        CMPPD = 0x660FC2,
        CMPSD = 0xF20FC2,
        CMPSS = 0xF30FC2,
        COMISD = 0x660F2F,
        COMISS = 0x0F2F,
        CVTDQ2PD = 0xF30FE6,
        CVTDQ2PS = 0x0F5B,
        CVTPD2DQ = 0xF20FE6,
        CVTPD2PI = 0x660F2D,
        CVTPD2PS = 0x660F5A,
        CVTPI2PD = 0x660F2A,
        CVTPI2PS = 0x0F2A,
        CVTPS2DQ = 0x660F5B,
        CVTPS2PD = 0x0F5A,
        CVTPS2PI = 0x0F2D,
        CVTSD2SI = 0xF20F2D,
        CVTSD2SS = 0xF20F5A,
        CVTSI2SD = 0xF20F2A,
        CVTSI2SS = 0xF30F2A,
        CVTSS2SD = 0xF30F5A,
        CVTSS2SI = 0xF30F2D,
        CVTTPD2PI = 0x660F2C,
        CVTTPD2DQ = 0x660FE6,
        CVTTPS2DQ = 0xF30F5B,
        CVTTPS2PI = 0x0F2C,
        CVTTSD2SI = 0xF20F2C,
        CVTTSS2SI = 0xF30F2C,
        MASKMOVDQU = 0x660FF7,
        MASKMOVQ = 0x0FF7,
        MAXPD = 0x660F5F,
        MAXPS = 0x0F5F,
        MAXSD = 0xF20F5F,
        MAXSS = 0xF30F5F,
        MINPD = 0x660F5D,
        MINPS = 0x0F5D,
        MINSD = 0xF20F5D,
        MINSS = 0xF30F5D,
        ORPD = 0x660F56,
        ORPS = 0x0F56,
        PAVGB = 0x660FE0,
        PAVGW = 0x660FE3,
        PMAXSW = 0x660FEE,
        //PINSRW = 0x660FC4,
        PMAXUB = 0x660FDE,
        PMINSW = 0x660FEA,
        PMINUB = 0x660FDA,
        //PMOVMSKB = 0x660FD7,
        PMULHUW = 0x660FE4,
        PMULHW = 0x660FE5,
        PMULUDQ = 0x660FF4,
        PSADBW = 0x660FF6,
        PUNPCKHQDQ = 0x660F6D,
        PUNPCKLQDQ = 0x660F6C,
        RCPPS = 0x0F53,
        RCPSS = 0xF30F53,
        RSQRTPS = 0x0F52,
        RSQRTSS = 0xF30F52,
        SQRTPD = 0x660F51,
        SHUFPD = 0x660FC6,
        SHUFPS = 0x0FC6,
        SQRTPS = 0x0F51,
        SQRTSD = 0xF20F51,
        SQRTSS = 0xF30F51,
        UNPCKHPD = 0x660F15,
        UNPCKHPS = 0x0F15,
        UNPCKLPD = 0x660F14,
        UNPCKLPS = 0x0F14,

        PSHUFD = 0x660F70,
        PSHUFHW = 0xF30F70,
        PSHUFLW = 0xF20F70,
        PSHUFW = 0x0F70,
        PSLLDQ = 0x07660F73,
        PSRLDQ = 0x03660F73,

        //PREFETCH = 0x0F18,

        // SSE3 Pentium 4 (Prescott)

        ADDSUBPD = 0x660FD0,
        ADDSUBPS = 0xF20FD0,
        HADDPD   = 0x660F7C,
        HADDPS   = 0xF20F7C,
        HSUBPD   = 0x660F7D,
        HSUBPS   = 0xF20F7D,
        MOVDDUP  = 0xF20F12,
        MOVSHDUP = 0xF30F16,
        MOVSLDUP = 0xF30F12,
        LDDQU    = 0xF20FF0,
        MONITOR  = 0x0F01C8,
        MWAIT    = 0x0F01C9,

        // SSSE3
        PALIGNR = 0x660F3A0F,
        PHADDD = 0x660F3802,
        PHADDW = 0x660F3801,
        PHADDSW = 0x660F3803,
        PABSB = 0x660F381C,
        PABSD = 0x660F381E,
        PABSW = 0x660F381D,
        PSIGNB = 0x660F3808,
        PSIGND = 0x660F380A,
        PSIGNW = 0x660F3809,
        PSHUFB = 0x660F3800,
        PMADDUBSW = 0x660F3804,
        PMULHRSW = 0x660F380B,
        PHSUBD = 0x660F3806,
        PHSUBW = 0x660F3805,
        PHSUBSW = 0x660F3807,

        // SSE4.1

        BLENDPD   = 0x660F3A0D,
        BLENDPS   = 0x660F3A0C,
        BLENDVPD  = 0x660F3815,
        BLENDVPS  = 0x660F3814,
        DPPD      = 0x660F3A41,
        DPPS      = 0x660F3A40,
        EXTRACTPS = 0x660F3A17,
        INSERTPS  = 0x660F3A21,
        MPSADBW   = 0x660F3A42,
        PBLENDVB  = 0x660F3810,
        PBLENDW   = 0x660F3A0E,
        PEXTRD    = 0x660F3A16,
        PEXTRQ    = 0x660F3A16,
        PINSRB    = 0x660F3A20,
        PINSRD    = 0x660F3A22,
        PINSRQ    = 0x660F3A22,

        MOVNTDQA = 0x660F382A,
        PACKUSDW = 0x660F382B,
        PCMPEQQ = 0x660F3829,
        PEXTRB = 0x660F3A14,
        PHMINPOSUW = 0x660F3841,
        PMAXSB = 0x660F383C,
        PMAXSD = 0x660F383D,
        PMAXUD = 0x660F383F,
        PMAXUW = 0x660F383E,
        PMINSB = 0x660F3838,
        PMINSD = 0x660F3839,
        PMINUD = 0x660F383B,
        PMINUW = 0x660F383A,
        PMOVSXBW = 0x660F3820,
        PMOVSXBD = 0x660F3821,
        PMOVSXBQ = 0x660F3822,
        PMOVSXWD = 0x660F3823,
        PMOVSXWQ = 0x660F3824,
        PMOVSXDQ = 0x660F3825,
        PMOVZXBW = 0x660F3830,
        PMOVZXBD = 0x660F3831,
        PMOVZXBQ = 0x660F3832,
        PMOVZXWD = 0x660F3833,
        PMOVZXWQ = 0x660F3834,
        PMOVZXDQ = 0x660F3835,
        PMULDQ   = 0x660F3828,
        PMULLD   = 0x660F3840,
        PTEST    = 0x660F3817,

        ROUNDPD = 0x660F3A09,
        ROUNDPS = 0x660F3A08,
        ROUNDSD = 0x660F3A0B,
        ROUNDSS = 0x660F3A0A,

        // SSE4.2
        PCMPESTRI  = 0x660F3A61,
        PCMPESTRM  = 0x660F3A60,
        PCMPISTRI  = 0x660F3A63,
        PCMPISTRM  = 0x660F3A62,
        PCMPGTQ    = 0x660F3837,
        //CRC32

        // SSE4a (AMD only)
        // EXTRQ,INSERTQ,MOVNTSD,MOVNTSS

        // POPCNT and LZCNT (have their own CPUID bits)
        POPCNT     = 0xF30FB8,
        // LZCNT
    }

    /**
    * Generate two operand instruction with XMM 128 bit operands.
    *
    * This is a compiler magic function - it doesn't behave like
    * regular D functions.
    *
    * Parameters:
    *      opcode = any of the XMM opcodes; it must be a compile time constant
    *      op1    = first operand
    *      op2    = second operand
    * Returns:
    *      result of opcode
    * Example:
    ---
    import core.simd;
    import core.stdc.stdio;

    void main()
    {
        float4 A = [2.34f, -70000.0f, 0.00001f, 345.5f];
        float4 R = A;
        R = cast(float4) __simd(XMM.RCPSS, R, A);
        printf("%g %g %g %g\n", R.array[0], R.array[1], R.array[2], R.array[3]);
    }
    ---
    * Prints `0.427368 -70000 1e-05 345.5`.
    * The use of the two operand form for `XMM.RCPSS` is necessary because the result of the instruction
    * contains elements of both operands.
    * Example:
    ---
    double[2] A = [56.0, -75.0];
    double2 R = cast(double2) __simd(XMM.LODUPD, *cast(double2*)A.ptr);
    ---
    * The cast to `double2*` is necessary because the type of `*A.ptr` is `double`.
    */
    pure @safe void16 __simd(XMM opcode, void16 op1, void16 op2);

    ///
    unittest
    {
        float4 a;
        a = cast(float4)__simd(XMM.PXOR, a, a);
    }

    /**
    * Unary SIMD instructions.
    */
    pure @safe void16 __simd(XMM opcode, void16 op1);
    pure @safe void16 __simd(XMM opcode, double d);   ///
    pure @safe void16 __simd(XMM opcode, float f);    ///

    ///
    unittest
    {
        float4 a;
        a = cast(float4)__simd(XMM.LODSS, a);
    }

    /****
    * For instructions:
    * CMPPD, CMPSS, CMPSD, CMPPS,
    * PSHUFD, PSHUFHW, PSHUFLW,
    * BLENDPD, BLENDPS, DPPD, DPPS,
    * MPSADBW, PBLENDW,
    * ROUNDPD, ROUNDPS, ROUNDSD, ROUNDSS
    * Parameters:
    *      opcode = any of the above XMM opcodes; it must be a compile time constant
    *      op1    = first operand
    *      op2    = second operand
    *      imm8   = third operand; must be a compile time constant
    * Returns:
    *      result of opcode
    */
    pure @safe void16 __simd(XMM opcode, void16 op1, void16 op2, ubyte imm8);

    ///
    unittest
    {
        float4 a;
        a = cast(float4)__simd(XMM.CMPPD, a, a, 0x7A);
    }

    /***
    * For instructions with the imm8 version:
    * PSLLD, PSLLQ, PSLLW, PSRAD, PSRAW, PSRLD, PSRLQ, PSRLW,
    * PSRLDQ, PSLLDQ
    * Parameters:
    *      opcode = any of the XMM opcodes; it must be a compile time constant
    *      op1    = first operand
    *      imm8   = second operand; must be a compile time constant
    * Returns:
    *      result of opcode
    */
    pure @safe void16 __simd_ib(XMM opcode, void16 op1, ubyte imm8);

    ///
    unittest
    {
        float4 a;
        a = cast(float4) __simd_ib(XMM.PSRLQ, a, 0x7A);
    }

    /*****
    * For "store" operations of the form:
    *    op1 op= op2
    * such as MOVLPS.
    * Returns:
    *    op2
    * These cannot be marked as pure, as semantic() doesn't check them.
    */
    @safe void16 __simd_sto(XMM opcode, void16 op1, void16 op2);
    @safe void16 __simd_sto(XMM opcode, double op1, void16 op2); ///
    @safe void16 __simd_sto(XMM opcode, float op1, void16 op2);  ///
    @safe void16 __simd_sto(XMM opcode, void16 op1, long op2); ///

    ///
    unittest
    {
        void16 a;
        float f = 1;
        double d = 1;

        cast(void)__simd_sto(XMM.STOUPS, a, a);
        cast(void)__simd_sto(XMM.STOUPS, f, a);
        cast(void)__simd_sto(XMM.STOUPS, d, a);
    }

    /* The following use overloading to ensure correct typing.
    * Compile with inlining on for best performance.
    */

    pure @safe short8 pcmpeq()(short8 v1, short8 v2)
    {
        return cast(short8)__simd(XMM.PCMPEQW, v1, v2);
    }

    pure @safe ushort8 pcmpeq()(ushort8 v1, ushort8 v2)
    {
        return cast(ushort8)__simd(XMM.PCMPEQW, v1, v2);
    }

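    /* A minimal sketch of using the pcmpeq overloads above: PCMPEQW sets each
    * 16-bit lane where the operands compare equal to all ones (-1 as a short)
    * and every other lane to zero.
    */
    unittest
    {
        short8 a = [1, 2, 3, 4, 5, 6, 7, 8];
        short8 b = [1, 0, 3, 0, 5, 0, 7, 0];
        short8 m = pcmpeq(a, b);

        assert(m.array[0] == -1);   // equal lane: all bits set
        assert(m.array[1] == 0);    // unequal lane: zero
    }
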
    /*********************
    * Emit prefetch instruction.
    * Params:
    *    address = address to be prefetched
    *    writeFetch = true for write fetch, false for read fetch
    *    locality = 0..3 (0 meaning least local, 3 meaning most local)
    * Note:
    *    The Intel mappings are:
    *    $(TABLE
    *    $(THEAD writeFetch, locality, Instruction)
    *    $(TROW false, 0, prefetchnta)
    *    $(TROW false, 1, prefetch2)
    *    $(TROW false, 2, prefetch1)
    *    $(TROW false, 3, prefetch0)
    *    $(TROW true, 0, prefetchw)
    *    $(TROW true, 1, prefetchw)
    *    $(TROW true, 2, prefetchw)
    *    $(TROW true, 3, prefetchw)
    *    )
    */
    void prefetch(bool writeFetch, ubyte locality)(const(void)* address)
    {
        static if (writeFetch)
            __prefetch(address, 4);
        else static if (locality < 4)
            __prefetch(address, 3 - locality);
        else
            static assert(0, "0..3 expected for locality");
    }

    private void __prefetch(const(void*) address, ubyte encoding);

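    /* A minimal sketch of using prefetch: hint that `data` will be accessed
    * soon. Per the table above, (false, 3) selects prefetch0 (the most local
    * read fetch) and (true, 0) selects prefetchw. Prefetching is only a hint,
    * so there is nothing observable to assert on.
    */
    unittest
    {
        ubyte[64] data;
        prefetch!(false, 3)(&data[0]);  // read fetch, keep in all cache levels
        prefetch!(true, 0)(&data[0]);   // write fetch
    }
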
    /*************************************
    * Load unaligned vector from address.
    * This is a compiler intrinsic.
    * Params:
    *    p = pointer to vector
    * Returns:
    *    vector
    */

    V loadUnaligned(V)(const V* p)
        if (is(V == void16) ||
            is(V == byte16) ||
            is(V == ubyte16) ||
            is(V == short8) ||
            is(V == ushort8) ||
            is(V == int4) ||
            is(V == uint4) ||
            is(V == long2) ||
            is(V == ulong2) ||
            is(V == double2) ||
            is(V == float4))
    {
        pragma(inline, true);
        static if (is(V == double2))
            return cast(V)__simd(XMM.LODUPD, *cast(const void16*)p);
        else static if (is(V == float4))
            return cast(V)__simd(XMM.LODUPS, *cast(const void16*)p);
        else
            return cast(V)__simd(XMM.LODDQU, *cast(const void16*)p);
    }
} // D_SIMD (keep loadUnaligned unittest for LDC)

    @system
    unittest
    {
        // Memory to load into the vector:
        // Should have enough data to test all 16-byte alignments, and still
        // have room for a 16-byte vector
        ubyte[32] data;
        foreach (i; 0..data.length)
        {
            data[i] = cast(ubyte)i;
        }

        // to test all alignments from 1 ~ 16
        foreach (i; 0..16)
        {
            ubyte* d = &data[i];

            void test(T)()
            {
                // load the data
                T v = loadUnaligned(cast(T*)d);

                // check that the data was loaded correctly
                ubyte* ptrToV = cast(ubyte*)&v;
                foreach (j; 0..T.sizeof)
                {
                    assert(ptrToV[j] == d[j]);
                }
            }

            test!void16();
            test!byte16();
            test!ubyte16();
            test!short8();
            test!ushort8();
            test!int4();
            test!uint4();
            test!long2();
            test!ulong2();
            test!double2();
            test!float4();
        }
    }

version (D_SIMD) // LDC
{
    /*************************************
    * Store vector to unaligned address.
    * This is a compiler intrinsic.
    * Params:
    *    p = pointer to vector
    *    value = value to store
    * Returns:
    *    value
    */

    V storeUnaligned(V)(V* p, V value)
        if (is(V == void16) ||
            is(V == byte16) ||
            is(V == ubyte16) ||
            is(V == short8) ||
            is(V == ushort8) ||
            is(V == int4) ||
            is(V == uint4) ||
            is(V == long2) ||
            is(V == ulong2) ||
            is(V == double2) ||
            is(V == float4))
    {
        pragma(inline, true);
        static if (is(V == double2))
            return cast(V)__simd_sto(XMM.STOUPD, *cast(void16*)p, value);
        else static if (is(V == float4))
            return cast(V)__simd_sto(XMM.STOUPS, *cast(void16*)p, value);
        else
            return cast(V)__simd_sto(XMM.STODQU, *cast(void16*)p, value);
    }
} // D_SIMD (keep storeUnaligned unittest for LDC)

    @system
    unittest
    {
        // Memory to store the vector to:
        // Should have enough data to test all 16-byte alignments, and still
        // have room for a 16-byte vector
        ubyte[32] data;

        // to test all alignments from 1 ~ 16
        foreach (i; 0..16)
        {
            ubyte* d = &data[i];

            void test(T)()
            {
                T v;

                // populate `v` with data
                ubyte* ptrToV = cast(ubyte*)&v;
                foreach (j; 0..T.sizeof)
                {
                    ptrToV[j] = cast(ubyte)j;
                }

                // store `v` to location pointed to by `d`
                storeUnaligned(cast(T*)d, v);

                // check that the data was stored correctly
                foreach (j; 0..T.sizeof)
                {
                    assert(ptrToV[j] == d[j]);
                }
            }

            test!void16();
            test!byte16();
            test!ubyte16();
            test!short8();
            test!ushort8();
            test!int4();
            test!uint4();
            test!long2();
            test!ulong2();
            test!double2();
            test!float4();
        }
    }
//} no D_SIMD scope to terminate for LDC