// Written in the D programming language.

/**
 * Builtin SIMD intrinsics
 *
 * Source: $(DRUNTIMESRC core/_simd.d)
 *
 * Copyright: Copyright Digital Mars 2012-2020
 * License:   $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
 * Authors:   $(HTTP digitalmars.com, Walter Bright)
 */

module core.simd;

pure:
nothrow:
@safe:
@nogc:

/*******************************
 * Create a vector type.
 *
 * Parameters:
 *      T = one of double[2], float[4], void[16], byte[16], ubyte[16],
 *      short[8], ushort[8], int[4], uint[4], long[2], ulong[2].
 *      For 256-bit vectors,
 *      one of double[4], float[8], void[32], byte[32], ubyte[32],
 *      short[16], ushort[16], int[8], uint[8], long[4], ulong[4];
 *      for 512-bit vectors,
 *      one of double[8], float[16], void[64], byte[64], ubyte[64],
 *      short[32], ushort[32], int[16], uint[16], long[8], ulong[8]
 */

template Vector(T)
{
    /* __vector is compiler magic, hide it behind a template.
     * The compiler will reject T's that don't work.
     */
    alias __vector(T) Vector;
}

/* Handy aliases
 */
version (LDC)
{
    static if (is(Vector!(void[4]))) alias Vector!(void[4]) void4;       ///
    static if (is(Vector!(byte[4]))) alias Vector!(byte[4]) byte4;       ///
    static if (is(Vector!(ubyte[4]))) alias Vector!(ubyte[4]) ubyte4;    ///
    static if (is(Vector!(short[2]))) alias Vector!(short[2]) short2;    ///
    static if (is(Vector!(ushort[2]))) alias Vector!(ushort[2]) ushort2; ///
}
static if (is(Vector!(void[8]))) alias Vector!(void[8]) void8;           ///
static if (is(Vector!(double[1]))) alias Vector!(double[1]) double1;     ///
static if (is(Vector!(float[2]))) alias Vector!(float[2]) float2;        ///
static if (is(Vector!(byte[8]))) alias Vector!(byte[8]) byte8;           ///
static if (is(Vector!(ubyte[8]))) alias Vector!(ubyte[8]) ubyte8;        ///
static if (is(Vector!(short[4]))) alias Vector!(short[4]) short4;        ///
static if (is(Vector!(ushort[4]))) alias Vector!(ushort[4]) ushort4;     ///
static if (is(Vector!(int[2]))) alias Vector!(int[2]) int2;              ///
static if (is(Vector!(uint[2]))) alias Vector!(uint[2]) uint2;           ///
static if (is(Vector!(long[1]))) alias Vector!(long[1]) long1;           ///
static if (is(Vector!(ulong[1]))) alias Vector!(ulong[1]) ulong1;        ///

static if (is(Vector!(void[16]))) alias Vector!(void[16]) void16;        ///
static if (is(Vector!(double[2]))) alias Vector!(double[2]) double2;     ///
static if (is(Vector!(float[4]))) alias Vector!(float[4]) float4;        ///
static if (is(Vector!(byte[16]))) alias Vector!(byte[16]) byte16;        ///
static if (is(Vector!(ubyte[16]))) alias Vector!(ubyte[16]) ubyte16;     ///
static if (is(Vector!(short[8]))) alias Vector!(short[8]) short8;        ///
static if (is(Vector!(ushort[8]))) alias Vector!(ushort[8]) ushort8;     ///
static if (is(Vector!(int[4]))) alias Vector!(int[4]) int4;              ///
static if (is(Vector!(uint[4]))) alias Vector!(uint[4]) uint4;           ///
static if (is(Vector!(long[2]))) alias Vector!(long[2]) long2;           ///
static if (is(Vector!(ulong[2]))) alias Vector!(ulong[2]) ulong2;        ///

static if (is(Vector!(void[32]))) alias Vector!(void[32]) void32;        ///
static if (is(Vector!(double[4]))) alias Vector!(double[4]) double4;     ///
static if (is(Vector!(float[8]))) alias Vector!(float[8]) float8;        ///
static if (is(Vector!(byte[32]))) alias Vector!(byte[32]) byte32;        ///
static if (is(Vector!(ubyte[32]))) alias Vector!(ubyte[32]) ubyte32;     ///
static if (is(Vector!(short[16]))) alias Vector!(short[16]) short16;     ///
static if (is(Vector!(ushort[16]))) alias Vector!(ushort[16]) ushort16;  ///
static if (is(Vector!(int[8]))) alias Vector!(int[8]) int8;              ///
static if (is(Vector!(uint[8]))) alias Vector!(uint[8]) uint8;           ///
static if (is(Vector!(long[4]))) alias Vector!(long[4]) long4;           ///
static if (is(Vector!(ulong[4]))) alias Vector!(ulong[4]) ulong4;        ///

static if (is(Vector!(void[64]))) alias Vector!(void[64]) void64;        ///
static if (is(Vector!(double[8]))) alias Vector!(double[8]) double8;     ///
static if (is(Vector!(float[16]))) alias Vector!(float[16]) float16;     ///
static if (is(Vector!(byte[64]))) alias Vector!(byte[64]) byte64;        ///
static if (is(Vector!(ubyte[64]))) alias Vector!(ubyte[64]) ubyte64;     ///
static if (is(Vector!(short[32]))) alias Vector!(short[32]) short32;     ///
static if (is(Vector!(ushort[32]))) alias Vector!(ushort[32]) ushort32;  ///
static if (is(Vector!(int[16]))) alias Vector!(int[16]) int16;           ///
static if (is(Vector!(uint[16]))) alias Vector!(uint[16]) uint16;        ///
static if (is(Vector!(long[8]))) alias Vector!(long[8]) long8;           ///
static if (is(Vector!(ulong[8]))) alias Vector!(ulong[8]) ulong8;        ///
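
/* Illustrative sketch (editor's addition, not part of the original module):
 * vector types support element-wise arithmetic, a scalar initializer is
 * broadcast to every lane, and `.array` exposes the individual elements.
 */
unittest
{
    static if (is(Vector!(float[4])))
    {
        float4 a = [1.0f, 2.0f, 3.0f, 4.0f];
        float4 b = 2.0f;        // scalar is broadcast to all four lanes
        float4 c = a * b + a;   // element-wise multiply and add
        float[4] expected = [3, 6, 9, 12];
        assert(c.array == expected);
    }
}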

version (LDC)
{
    public import ldc.simd : loadUnaligned, storeUnaligned;

    /*********************
     * Emit prefetch instruction.
     * Params:
     *    address = address to be prefetched
     *    writeFetch = true for write fetch, false for read fetch
     *    locality = 0..3 (0 meaning least local, 3 meaning most local)
     */
    pragma(inline, true)
    void prefetch(bool writeFetch, ubyte locality)(const(void)* address)
    {
        import ldc.intrinsics : llvm_prefetch;
        static assert(locality < 4, "0..3 expected for locality");
        enum dataCache = 1;
        llvm_prefetch(address, writeFetch, locality, dataCache);
    }

    unittest
    {
        float[4] data = [ 0.5, 1, 1.5, 2 ];
        auto ptr = &data[0];

        prefetch!(false, 0)(ptr);
        auto v = loadUnaligned!float4(ptr);
        v *= 2;
        storeUnaligned!float4(v, ptr);

        float[4] expected = [ 1, 2, 3, 4 ];
        assert(data == expected);
    }
}
else version (D_SIMD)
{

/** XMM opcodes that conform to the following:
 *
 *  opcode xmm1,xmm2/mem
 *
 * and do not have side effects (i.e. do not write to memory).
 */
enum XMM
{
    ADDSS = 0xF30F58,
    ADDSD = 0xF20F58,
    ADDPS = 0x000F58,
    ADDPD = 0x660F58,
    PADDB = 0x660FFC,
    PADDW = 0x660FFD,
    PADDD = 0x660FFE,
    PADDQ = 0x660FD4,

    SUBSS = 0xF30F5C,
    SUBSD = 0xF20F5C,
    SUBPS = 0x000F5C,
    SUBPD = 0x660F5C,
    PSUBB = 0x660FF8,
    PSUBW = 0x660FF9,
    PSUBD = 0x660FFA,
    PSUBQ = 0x660FFB,

    MULSS = 0xF30F59,
    MULSD = 0xF20F59,
    MULPS = 0x000F59,
    MULPD = 0x660F59,
    PMULLW = 0x660FD5,

    DIVSS = 0xF30F5E,
    DIVSD = 0xF20F5E,
    DIVPS = 0x000F5E,
    DIVPD = 0x660F5E,

    PAND = 0x660FDB,
    POR = 0x660FEB,

    UCOMISS = 0x000F2E,
    UCOMISD = 0x660F2E,

    XORPS = 0x000F57,
    XORPD = 0x660F57,

    // Use STO and LOD instead of MOV to distinguish the direction
    // (Destination is first operand, Source is second operand)
    STOSS = 0xF30F11,   /// MOVSS xmm1/m32, xmm2
    STOSD = 0xF20F11,   /// MOVSD xmm1/m64, xmm2
    STOAPS = 0x000F29,  /// MOVAPS xmm2/m128, xmm1
    STOAPD = 0x660F29,  /// MOVAPD xmm2/m128, xmm1
    STODQA = 0x660F7F,  /// MOVDQA xmm2/m128, xmm1
    STOD = 0x660F7E,    /// MOVD reg/mem64, xmm   66 0F 7E /r
    STOQ = 0x660FD6,    /// MOVQ xmm2/m64, xmm1

    LODSS = 0xF30F10,   /// MOVSS xmm1, xmm2/m32
    LODSD = 0xF20F10,   /// MOVSD xmm1, xmm2/m64
    LODAPS = 0x000F28,  /// MOVAPS xmm1, xmm2/m128
    LODAPD = 0x660F28,  /// MOVAPD xmm1, xmm2/m128
    LODDQA = 0x660F6F,  /// MOVDQA xmm1, xmm2/m128
    LODD = 0x660F6E,    /// MOVD xmm, reg/mem64   66 0F 6E /r
    LODQ = 0xF30F7E,    /// MOVQ xmm1, xmm2/m64

    LODDQU = 0xF30F6F,  /// MOVDQU xmm1, xmm2/mem128   F3 0F 6F /r
    STODQU = 0xF30F7F,  /// MOVDQU xmm1/mem128, xmm2   F3 0F 7F /r
    MOVDQ2Q = 0xF20FD6, /// MOVDQ2Q mmx, xmm           F2 0F D6 /r
    MOVHLPS = 0x0F12,   /// MOVHLPS xmm1, xmm2         0F 12 /r
    LODHPD = 0x660F16,  /// MOVHPD xmm1, m64
    STOHPD = 0x660F17,  /// MOVHPD mem64, xmm1         66 0F 17 /r
    LODHPS = 0x0F16,    /// MOVHPS xmm1, m64
    STOHPS = 0x0F17,    /// MOVHPS m64, xmm1
    MOVLHPS = 0x0F16,   /// MOVLHPS xmm1, xmm2
    LODLPD = 0x660F12,  /// MOVLPD xmm1, m64
    STOLPD = 0x660F13,  /// MOVLPD m64, xmm1
    LODLPS = 0x0F12,    /// MOVLPS xmm1, m64
    STOLPS = 0x0F13,    /// MOVLPS m64, xmm1
    MOVMSKPD = 0x660F50, /// MOVMSKPD reg, xmm
    MOVMSKPS = 0x0F50,  /// MOVMSKPS reg, xmm
    MOVNTDQ = 0x660FE7, /// MOVNTDQ m128, xmm1
    MOVNTI = 0x0FC3,    /// MOVNTI m32, r32
    MOVNTPD = 0x660F2B, /// MOVNTPD m128, xmm1
    MOVNTPS = 0x0F2B,   /// MOVNTPS m128, xmm1
    MOVNTQ = 0x0FE7,    /// MOVNTQ m64, mm
    MOVQ2DQ = 0xF30FD6, /// MOVQ2DQ xmm, mmx           F3 0F D6 /r
    LODUPD = 0x660F10,  /// MOVUPD xmm1, xmm2/m128
    STOUPD = 0x660F11,  /// MOVUPD xmm2/m128, xmm1
    LODUPS = 0x0F10,    /// MOVUPS xmm1, xmm2/m128
    STOUPS = 0x0F11,    /// MOVUPS xmm2/m128, xmm1

    PACKSSDW = 0x660F6B,
    PACKSSWB = 0x660F63,
    PACKUSWB = 0x660F67,
    PADDSB = 0x660FEC,
    PADDSW = 0x660FED,
    PADDUSB = 0x660FDC,
    PADDUSW = 0x660FDD,
    PANDN = 0x660FDF,
    PCMPEQB = 0x660F74,
    PCMPEQD = 0x660F76,
    PCMPEQW = 0x660F75,
    PCMPGTB = 0x660F64,
    PCMPGTD = 0x660F66,
    PCMPGTW = 0x660F65,
    PMADDWD = 0x660FF5,
    PSLLW = 0x660FF1,
    PSLLD = 0x660FF2,
    PSLLQ = 0x660FF3,
    PSRAW = 0x660FE1,
    PSRAD = 0x660FE2,
    PSRLW = 0x660FD1,
    PSRLD = 0x660FD2,
    PSRLQ = 0x660FD3,
    PSUBSB = 0x660FE8,
    PSUBSW = 0x660FE9,
    PSUBUSB = 0x660FD8,
    PSUBUSW = 0x660FD9,
    PUNPCKHBW = 0x660F68,
    PUNPCKHDQ = 0x660F6A,
    PUNPCKHWD = 0x660F69,
    PUNPCKLBW = 0x660F60,
    PUNPCKLDQ = 0x660F62,
    PUNPCKLWD = 0x660F61,
    PXOR = 0x660FEF,
    ANDPD = 0x660F54,
    ANDPS = 0x0F54,
    ANDNPD = 0x660F55,
    ANDNPS = 0x0F55,
    CMPPS = 0x0FC2,
    CMPPD = 0x660FC2,
    CMPSD = 0xF20FC2,
    CMPSS = 0xF30FC2,
    COMISD = 0x660F2F,
    COMISS = 0x0F2F,
    CVTDQ2PD = 0xF30FE6,
    CVTDQ2PS = 0x0F5B,
    CVTPD2DQ = 0xF20FE6,
    CVTPD2PI = 0x660F2D,
    CVTPD2PS = 0x660F5A,
    CVTPI2PD = 0x660F2A,
    CVTPI2PS = 0x0F2A,
    CVTPS2DQ = 0x660F5B,
    CVTPS2PD = 0x0F5A,
    CVTPS2PI = 0x0F2D,
    CVTSD2SI = 0xF20F2D,
    CVTSD2SS = 0xF20F5A,
    CVTSI2SD = 0xF20F2A,
    CVTSI2SS = 0xF30F2A,
    CVTSS2SD = 0xF30F5A,
    CVTSS2SI = 0xF30F2D,
    CVTTPD2PI = 0x660F2C,
    CVTTPD2DQ = 0x660FE6,
    CVTTPS2DQ = 0xF30F5B,
    CVTTPS2PI = 0x0F2C,
    CVTTSD2SI = 0xF20F2C,
    CVTTSS2SI = 0xF30F2C,
    MASKMOVDQU = 0x660FF7,
    MASKMOVQ = 0x0FF7,
    MAXPD = 0x660F5F,
    MAXPS = 0x0F5F,
    MAXSD = 0xF20F5F,
    MAXSS = 0xF30F5F,
    MINPD = 0x660F5D,
    MINPS = 0x0F5D,
    MINSD = 0xF20F5D,
    MINSS = 0xF30F5D,
    ORPD = 0x660F56,
    ORPS = 0x0F56,
    PAVGB = 0x660FE0,
    PAVGW = 0x660FE3,
    PMAXSW = 0x660FEE,
    //PINSRW = 0x660FC4,
    PMAXUB = 0x660FDE,
    PMINSW = 0x660FEA,
    PMINUB = 0x660FDA,
    //PMOVMSKB = 0x660FD7,
    PMULHUW = 0x660FE4,
    PMULHW = 0x660FE5,
    PMULUDQ = 0x660FF4,
    PSADBW = 0x660FF6,
    PUNPCKHQDQ = 0x660F6D,
    PUNPCKLQDQ = 0x660F6C,
    RCPPS = 0x0F53,
    RCPSS = 0xF30F53,
    RSQRTPS = 0x0F52,
    RSQRTSS = 0xF30F52,
    SQRTPD = 0x660F51,
    SHUFPD = 0x660FC6,
    SHUFPS = 0x0FC6,
    SQRTPS = 0x0F51,
    SQRTSD = 0xF20F51,
    SQRTSS = 0xF30F51,
    UNPCKHPD = 0x660F15,
    UNPCKHPS = 0x0F15,
    UNPCKLPD = 0x660F14,
    UNPCKLPS = 0x0F14,

    PSHUFD = 0x660F70,
    PSHUFHW = 0xF30F70,
    PSHUFLW = 0xF20F70,
    PSHUFW = 0x0F70,
    PSLLDQ = 0x07660F73,
    PSRLDQ = 0x03660F73,

    //PREFETCH = 0x0F18,

    // SSE3 Pentium 4 (Prescott)

    ADDSUBPD = 0x660FD0,
    ADDSUBPS = 0xF20FD0,
    HADDPD = 0x660F7C,
    HADDPS = 0xF20F7C,
    HSUBPD = 0x660F7D,
    HSUBPS = 0xF20F7D,
    MOVDDUP = 0xF20F12,
    MOVSHDUP = 0xF30F16,
    MOVSLDUP = 0xF30F12,
    LDDQU = 0xF20FF0,
    MONITOR = 0x0F01C8,
    MWAIT = 0x0F01C9,

    // SSSE3
    PALIGNR = 0x660F3A0F,
    PHADDD = 0x660F3802,
    PHADDW = 0x660F3801,
    PHADDSW = 0x660F3803,
    PABSB = 0x660F381C,
    PABSD = 0x660F381E,
    PABSW = 0x660F381D,
    PSIGNB = 0x660F3808,
    PSIGND = 0x660F380A,
    PSIGNW = 0x660F3809,
    PSHUFB = 0x660F3800,
    PMADDUBSW = 0x660F3804,
    PMULHRSW = 0x660F380B,
    PHSUBD = 0x660F3806,
    PHSUBW = 0x660F3805,
    PHSUBSW = 0x660F3807,

    // SSE4.1

    BLENDPD = 0x660F3A0D,
    BLENDPS = 0x660F3A0C,
    BLENDVPD = 0x660F3815,
    BLENDVPS = 0x660F3814,
    DPPD = 0x660F3A41,
    DPPS = 0x660F3A40,
    EXTRACTPS = 0x660F3A17,
    INSERTPS = 0x660F3A21,
    MPSADBW = 0x660F3A42,
    PBLENDVB = 0x660F3810,
    PBLENDW = 0x660F3A0E,
    PEXTRD = 0x660F3A16,
    PEXTRQ = 0x660F3A16,
    PINSRB = 0x660F3A20,
    PINSRD = 0x660F3A22,
    PINSRQ = 0x660F3A22,

    MOVNTDQA = 0x660F382A,
    PACKUSDW = 0x660F382B,
    PCMPEQQ = 0x660F3829,
    PEXTRB = 0x660F3A14,
    PHMINPOSUW = 0x660F3841,
    PMAXSB = 0x660F383C,
    PMAXSD = 0x660F383D,
    PMAXUD = 0x660F383F,
    PMAXUW = 0x660F383E,
    PMINSB = 0x660F3838,
    PMINSD = 0x660F3839,
    PMINUD = 0x660F383B,
    PMINUW = 0x660F383A,
    PMOVSXBW = 0x660F3820,
    PMOVSXBD = 0x660F3821,
    PMOVSXBQ = 0x660F3822,
    PMOVSXWD = 0x660F3823,
    PMOVSXWQ = 0x660F3824,
    PMOVSXDQ = 0x660F3825,
    PMOVZXBW = 0x660F3830,
    PMOVZXBD = 0x660F3831,
    PMOVZXBQ = 0x660F3832,
    PMOVZXWD = 0x660F3833,
    PMOVZXWQ = 0x660F3834,
    PMOVZXDQ = 0x660F3835,
    PMULDQ = 0x660F3828,
    PMULLD = 0x660F3840,
    PTEST = 0x660F3817,

    ROUNDPD = 0x660F3A09,
    ROUNDPS = 0x660F3A08,
    ROUNDSD = 0x660F3A0B,
    ROUNDSS = 0x660F3A0A,

    // SSE4.2
    PCMPESTRI = 0x660F3A61,
    PCMPESTRM = 0x660F3A60,
    PCMPISTRI = 0x660F3A63,
    PCMPISTRM = 0x660F3A62,
    PCMPGTQ = 0x660F3837,
    //CRC32

    // SSE4a (AMD only)
    // EXTRQ,INSERTQ,MOVNTSD,MOVNTSS

    // POPCNT and LZCNT (have their own CPUID bits)
    POPCNT = 0xF30FB8,
    // LZCNT
}

/**
 * Generate two operand instruction with XMM 128 bit operands.
 *
 * This is a compiler magic function - it doesn't behave like
 * regular D functions.
 *
 * Parameters:
 *      opcode = any of the XMM opcodes; it must be a compile time constant
 *      op1 = first operand
 *      op2 = second operand
 * Returns:
 *      result of opcode
 * Example:
---
import core.simd;
import core.stdc.stdio;

void main()
{
    float4 A = [2.34f, -70000.0f, 0.00001f, 345.5f];
    float4 R = A;
    R = cast(float4) __simd(XMM.RCPSS, R, A);
    printf("%g %g %g %g\n", R.array[0], R.array[1], R.array[2], R.array[3]);
}
---
 * Prints `0.427368 -70000 1e-05 345.5`.
 * The use of the two operand form for `XMM.RCPSS` is necessary because the result of the instruction
 * contains elements of both operands.
 * Example:
---
double[2] A = [56.0, -75.0];
double2 R = cast(double2) __simd(XMM.LODUPD, *cast(double2*)A.ptr);
---
 * The cast to `double2*` is necessary because the type of `*A.ptr` is `double`.
 */
pure @safe void16 __simd(XMM opcode, void16 op1, void16 op2);

///
unittest
{
    float4 a;
    a = cast(float4)__simd(XMM.PXOR, a, a);
}
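
/* Illustrative sketch (editor's addition): element-wise addition of two
 * float4 vectors via the raw ADDPS opcode; equivalent to the built-in `+`.
 */
unittest
{
    float4 a = [1.0f, 2.0f, 3.0f, 4.0f];
    float4 b = [10.0f, 20.0f, 30.0f, 40.0f];
    float4 r = cast(float4) __simd(XMM.ADDPS, a, b);
    float[4] expected = [11, 22, 33, 44];
    assert(r.array == expected);
}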

/**
 * Unary SIMD instructions.
 */
pure @safe void16 __simd(XMM opcode, void16 op1);
pure @safe void16 __simd(XMM opcode, double d);   ///
pure @safe void16 __simd(XMM opcode, float f);    ///

///
unittest
{
    float4 a;
    a = cast(float4)__simd(XMM.LODSS, a);
}
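
/* Illustrative sketch (editor's addition): assumes the unary form accepts
 * SQRTPS like the LODSS example above. The inputs are exact squares, so the
 * element-wise square roots are exact in float.
 */
unittest
{
    float4 a = [1.0f, 4.0f, 9.0f, 16.0f];
    float4 r = cast(float4) __simd(XMM.SQRTPS, a);
    float[4] expected = [1, 2, 3, 4];
    assert(r.array == expected);
}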

/****
 * For instructions:
 * CMPPD, CMPSS, CMPSD, CMPPS,
 * PSHUFD, PSHUFHW, PSHUFLW,
 * BLENDPD, BLENDPS, DPPD, DPPS,
 * MPSADBW, PBLENDW,
 * ROUNDPD, ROUNDPS, ROUNDSD, ROUNDSS
 * Parameters:
 *      opcode = any of the above XMM opcodes; it must be a compile time constant
 *      op1 = first operand
 *      op2 = second operand
 *      imm8 = third operand; must be a compile time constant
 * Returns:
 *      result of opcode
 */
pure @safe void16 __simd(XMM opcode, void16 op1, void16 op2, ubyte imm8);

///
unittest
{
    float4 a;
    a = cast(float4)__simd(XMM.CMPPD, a, a, 0x7A);
}
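
/* Illustrative sketch (editor's addition): PSHUFD selects each destination
 * lane from the source by a 2-bit field of imm8; 0x1B (0b00_01_10_11)
 * reverses the order of the four 32-bit lanes.
 */
unittest
{
    int4 a = [1, 2, 3, 4];
    int4 r = cast(int4) __simd(XMM.PSHUFD, a, a, 0x1B);
    int[4] expected = [4, 3, 2, 1];
    assert(r.array == expected);
}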

/***
 * For instructions with the imm8 version:
 * PSLLD, PSLLQ, PSLLW, PSRAD, PSRAW, PSRLD, PSRLQ, PSRLW,
 * PSRLDQ, PSLLDQ
 * Parameters:
 *      opcode = any of the XMM opcodes; it must be a compile time constant
 *      op1 = first operand
 *      imm8 = second operand; must be a compile time constant
 * Returns:
 *      result of opcode
 */
pure @safe void16 __simd_ib(XMM opcode, void16 op1, ubyte imm8);

///
unittest
{
    float4 a;
    a = cast(float4) __simd_ib(XMM.PSRLQ, a, 0x7A);
}
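
/* Illustrative sketch (editor's addition): PSLLD shifts each 32-bit lane
 * left by the immediate count, here 4 (i.e. multiply by 16).
 */
unittest
{
    int4 a = [1, 2, 3, 4];
    int4 r = cast(int4) __simd_ib(XMM.PSLLD, a, 4);
    int[4] expected = [16, 32, 48, 64];
    assert(r.array == expected);
}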

/*****
 * For "store" operations of the form:
 *    op1 op= op2
 * such as MOVLPS.
 * Returns:
 *    op2
 * These cannot be marked as pure, as semantic() doesn't check them.
 */
@safe void16 __simd_sto(XMM opcode, void16 op1, void16 op2);
@safe void16 __simd_sto(XMM opcode, double op1, void16 op2);  ///
@safe void16 __simd_sto(XMM opcode, float op1, void16 op2);   ///
@safe void16 __simd_sto(XMM opcode, void16 op1, long op2);    ///

///
unittest
{
    void16 a;
    float f = 1;
    double d = 1;

    cast(void)__simd_sto(XMM.STOUPS, a, a);
    cast(void)__simd_sto(XMM.STOUPS, f, a);
    cast(void)__simd_sto(XMM.STOUPS, d, a);
}

/* The following use overloading to ensure correct typing.
 * Compile with inlining on for best performance.
 */

pure @safe short8 pcmpeq()(short8 v1, short8 v2)
{
    return cast(short8)__simd(XMM.PCMPEQW, v1, v2);
}

pure @safe ushort8 pcmpeq()(ushort8 v1, ushort8 v2)
{
    return cast(ushort8)__simd(XMM.PCMPEQW, v1, v2);
}
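
/* Illustrative sketch (editor's addition): lanes that compare equal are set
 * to all ones (-1 in a signed lane), the rest to zero.
 */
unittest
{
    short8 a = [1, 2, 3, 4, 5, 6, 7, 8];
    short8 b = [1, 0, 3, 0, 5, 0, 7, 0];
    short8 r = pcmpeq(a, b);
    short[8] expected = [-1, 0, -1, 0, -1, 0, -1, 0];
    assert(r.array == expected);
}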

/*********************
 * Emit prefetch instruction.
 * Params:
 *    address = address to be prefetched
 *    writeFetch = true for write fetch, false for read fetch
 *    locality = 0..3 (0 meaning least local, 3 meaning most local)
 * Note:
 *    The Intel mappings are:
 *    $(TABLE
 *    $(THEAD writeFetch, locality, Instruction)
 *    $(TROW false, 0, prefetchnta)
 *    $(TROW false, 1, prefetcht2)
 *    $(TROW false, 2, prefetcht1)
 *    $(TROW false, 3, prefetcht0)
 *    $(TROW true, 0, prefetchw)
 *    $(TROW true, 1, prefetchw)
 *    $(TROW true, 2, prefetchw)
 *    $(TROW true, 3, prefetchw)
 *    )
 */
void prefetch(bool writeFetch, ubyte locality)(const(void)* address)
{
    static if (writeFetch)
        __prefetch(address, 4);
    else static if (locality < 4)
        __prefetch(address, 3 - locality);
    else
        static assert(0, "0..3 expected for locality");
}

private void __prefetch(const(void*) address, ubyte encoding);

/*************************************
 * Load unaligned vector from address.
 * This is a compiler intrinsic.
 * Params:
 *    p = pointer to vector
 * Returns:
 *    vector
 */

V loadUnaligned(V)(const V* p)
    if (is(V == void16) ||
        is(V == byte16) ||
        is(V == ubyte16) ||
        is(V == short8) ||
        is(V == ushort8) ||
        is(V == int4) ||
        is(V == uint4) ||
        is(V == long2) ||
        is(V == ulong2) ||
        is(V == double2) ||
        is(V == float4))
{
    pragma(inline, true);
    static if (is(V == double2))
        return cast(V)__simd(XMM.LODUPD, *cast(const void16*)p);
    else static if (is(V == float4))
        return cast(V)__simd(XMM.LODUPS, *cast(const void16*)p);
    else
        return cast(V)__simd(XMM.LODDQU, *cast(const void16*)p);
}
} // D_SIMD (keep loadUnaligned unittest for LDC)

@system
unittest
{
    // Memory to load into the vector:
    // Should have enough data to test all 16-byte alignments, and still
    // have room for a 16-byte vector
    ubyte[32] data;
    foreach (i; 0..data.length)
    {
        data[i] = cast(ubyte)i;
    }

    // to test all alignments from 1 ~ 16
    foreach (i; 0..16)
    {
        ubyte* d = &data[i];

        void test(T)()
        {
            // load the data
            T v = loadUnaligned(cast(T*)d);

            // check that the data was loaded correctly
            ubyte* ptrToV = cast(ubyte*)&v;
            foreach (j; 0..T.sizeof)
            {
                assert(ptrToV[j] == d[j]);
            }
        }

        test!void16();
        test!byte16();
        test!ubyte16();
        test!short8();
        test!ushort8();
        test!int4();
        test!uint4();
        test!long2();
        test!ulong2();
        test!double2();
        test!float4();
    }
}

version (D_SIMD) // LDC
{
/*************************************
 * Store vector to unaligned address.
 * This is a compiler intrinsic.
 * Params:
 *    p = pointer to vector
 *    value = value to store
 * Returns:
 *    value
 */

V storeUnaligned(V)(V* p, V value)
    if (is(V == void16) ||
        is(V == byte16) ||
        is(V == ubyte16) ||
        is(V == short8) ||
        is(V == ushort8) ||
        is(V == int4) ||
        is(V == uint4) ||
        is(V == long2) ||
        is(V == ulong2) ||
        is(V == double2) ||
        is(V == float4))
{
    pragma(inline, true);
    static if (is(V == double2))
        return cast(V)__simd_sto(XMM.STOUPD, *cast(void16*)p, value);
    else static if (is(V == float4))
        return cast(V)__simd_sto(XMM.STOUPS, *cast(void16*)p, value);
    else
        return cast(V)__simd_sto(XMM.STODQU, *cast(void16*)p, value);
}
} // D_SIMD (keep storeUnaligned unittest for LDC)

@system
unittest
{
    // Memory to store the vector to:
    // Should have enough data to test all 16-byte alignments, and still
    // have room for a 16-byte vector
    ubyte[32] data;

    // to test all alignments from 1 ~ 16
    foreach (i; 0..16)
    {
        ubyte* d = &data[i];

        void test(T)()
        {
            T v;

            // populate `v` with data
            ubyte* ptrToV = cast(ubyte*)&v;
            foreach (j; 0..T.sizeof)
            {
                ptrToV[j] = cast(ubyte)j;
            }

            // store `v` to the location pointed to by `d`
            storeUnaligned(cast(T*)d, v);

            // check that the data was stored correctly
            foreach (j; 0..T.sizeof)
            {
                assert(ptrToV[j] == d[j]);
            }
        }

        test!void16();
        test!byte16();
        test!ubyte16();
        test!short8();
        test!ushort8();
        test!int4();
        test!uint4();
        test!long2();
        test!ulong2();
        test!double2();
        test!float4();
    }
}
//} no D_SIMD scope to terminate for LDC
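
/* Illustrative end-to-end sketch (editor's addition, D_SIMD path only, since
 * the ldc.simd intrinsics take element pointers instead): stream through two
 * float arrays four lanes at a time with the unaligned load/store intrinsics
 * above. Assumes array lengths are multiples of 4.
 */
version (D_SIMD)
@system unittest
{
    float[8] x = [1, 2, 3, 4, 5, 6, 7, 8];
    float[8] y = [8, 7, 6, 5, 4, 3, 2, 1];
    float[8] z;

    for (size_t i = 0; i < x.length; i += 4)
    {
        // load four lanes from each input, add element-wise, store the result
        float4 a = loadUnaligned(cast(const float4*)&x[i]);
        float4 b = loadUnaligned(cast(const float4*)&y[i]);
        storeUnaligned(cast(float4*)&z[i], a + b);
    }

    float[8] expected = [9, 9, 9, 9, 9, 9, 9, 9];
    assert(z == expected);
}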