1 /** 2 * AVX intrinsics. 3 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=AVX 4 * 5 * Copyright: Guillaume Piolat 2022. 6 * Johan Engelen 2022. 7 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) 8 */ 9 module inteli.avxintrin; 10 11 // AVX instructions 12 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=AVX 13 // Note: this header will work whether you have AVX enabled or not. 14 // With LDC, use "dflags-ldc": ["-mattr=+avx"] or equivalent to actively 15 // generate AVX instructions. 16 // With GDC, use "dflags-gdc": ["-mavx"] or equivalent to actively 17 // generate AVX instructions. 18 19 20 /// IMPORTANT NOTE ABOUT MASK LOAD/STORE: 21 /// 22 /// In theory, masked load/store can adress unadressable memory provided the mask is zero. 23 /// In practice, that is not the case for the following reasons: 24 /// 25 /// - AMD manual says: 26 /// "Exception and trap behavior for elements not selected for loading or storing from/to memory 27 /// is implementation dependent. For instance, a given implementation may signal a data 28 /// breakpoint or a page fault for doublewords that are zero-masked and not actually written." 29 /// 30 /// - Intel fetches the whole cacheline anyway: 31 /// https://erik.science/2019/06/21/AVX-fun.html 32 /// "Even if the mask is stored in the special mask registers, it will still first fetch the data 33 /// before checking the mask." 34 /// 35 /// So intel-intrinsics adopted the tightened semantics of only adressing fully addressable memory 36 /// with masked loads and stores. 37 38 39 /// Some AVX intrinsics takes a float comparison constant. 40 /// When labelled "ordered" it means "AND ordered" 41 /// When labelled "unordered" it means "OR unordered" 42 alias _CMP_EQ = int; 43 ///ditto 44 enum : _CMP_EQ 45 { 46 _CMP_EQ_OQ = 0x00, // Equal (ordered, non-signaling) 47 _CMP_LT_OS = 0x01, // Less-than (ordered, signaling) 48 _CMP_LE_OS = 0x02, // Less-than-or-equal (ordered, signaling) 49 _CMP_UNORD_Q = 0x03, // Unordered (non-signaling) 50 _CMP_NEQ_UQ = 0x04, // Not-equal (unordered, non-signaling) 51 _CMP_NLT_US = 0x05, // Not-less-than (unordered, signaling) 52 _CMP_NLE_US = 0x06, // Not-less-than-or-equal (unordered, signaling) 53 _CMP_ORD_Q = 0x07, // Ordered (nonsignaling) 54 _CMP_EQ_UQ = 0x08, // Equal (unordered, non-signaling) 55 _CMP_NGE_US = 0x09, // Not-greater-than-or-equal (unordered, signaling) 56 _CMP_NGT_US = 0x0a, // Not-greater-than (unordered, signaling) 57 _CMP_FALSE_OQ = 0x0b, // False (ordered, non-signaling) 58 _CMP_NEQ_OQ = 0x0c, // Not-equal (ordered, non-signaling) 59 _CMP_GE_OS = 0x0d, // Greater-than-or-equal (ordered, signaling) 60 _CMP_GT_OS = 0x0e, // Greater-than (ordered, signaling) 61 _CMP_TRUE_UQ = 0x0f, // True (unordered, non-signaling) 62 _CMP_EQ_OS = 0x10, // Equal (ordered, signaling) 63 _CMP_LT_OQ = 0x11, // Less-than (ordered, non-signaling) 64 _CMP_LE_OQ = 0x12, // Less-than-or-equal (ordered, non-signaling) 65 _CMP_UNORD_S = 0x13, // Unordered (signaling) 66 _CMP_NEQ_US = 0x14, // Not-equal (unordered, signaling) 67 _CMP_NLT_UQ = 0x15, // Not-less-than (unordered, non-signaling) 68 _CMP_NLE_UQ = 0x16, // Not-less-than-or-equal (unordered, non-signaling) 69 _CMP_ORD_S = 0x17, // Ordered (signaling) 70 _CMP_EQ_US = 0x18, // Equal (unordered, signaling) 71 _CMP_NGE_UQ = 0x19, // Not-greater-than-or-equal (unordered, non-signaling) 72 _CMP_NGT_UQ = 0x1a, // Not-greater-than (unordered, non-signaling) 73 _CMP_FALSE_OS = 0x1b, // False (ordered, 
signaling) 74 _CMP_NEQ_OS = 0x1c, // Not-equal (ordered, signaling) 75 _CMP_GE_OQ = 0x1d, // Greater-than-or-equal (ordered, non-signaling) 76 _CMP_GT_OQ = 0x1e, // Greater-than (ordered, non-signaling) 77 _CMP_TRUE_US = 0x1f // (unordered, signaling) 78 } 79 80 public import inteli.types; 81 import inteli.internals; 82 83 // Pull in all previous instruction set intrinsics. 84 public import inteli.smmintrin; 85 public import inteli.tmmintrin; 86 public import inteli.nmmintrin; 87 88 89 90 // In x86, LDC earlier version may have trouble preserving the stack pointer when an unsupported 91 // 256-bit vector type is passed, and AVX is disabled. 92 // This leads to disabling some intrinsics in this particular situation, since they are not safe for 93 // the caller. 94 version(LDC) 95 { 96 version(X86) 97 { 98 enum llvm256BitStackWorkaroundIn32BitX86 = __VERSION__ < 2099; 99 } 100 else 101 enum llvm256BitStackWorkaroundIn32BitX86 = false; 102 } 103 else 104 enum llvm256BitStackWorkaroundIn32BitX86 = false; 105 106 107 108 109 nothrow @nogc: 110 111 /// Add packed double-precision (64-bit) floating-point elements in `a` and `b`. 112 __m256d _mm256_add_pd (__m256d a, __m256d b) pure @trusted 113 { 114 return a + b; 115 } 116 unittest 117 { 118 align(32) double[4] A = [-1, 2, -3, 40000]; 119 align(32) double[4] B = [ 9, -7, 8, -0.5]; 120 __m256d R = _mm256_add_pd(_mm256_load_pd(A.ptr), _mm256_load_pd(B.ptr)); 121 double[4] correct = [8, -5, 5, 39999.5]; 122 assert(R.array == correct); 123 } 124 125 /// Add packed single-precision (32-bit) floating-point elements in `a` and `b`. 126 __m256 _mm256_add_ps (__m256 a, __m256 b) pure @trusted 127 { 128 return a + b; 129 } 130 unittest 131 { 132 align(32) float[8] A = [-1.0f, 2, -3, 40000, 0, 3, 5, 6]; 133 align(32) float[8] B = [ 9.0f, -7, 8, -0.5, 8, 7, 3, -1]; 134 __m256 R = _mm256_add_ps(_mm256_load_ps(A.ptr), _mm256_load_ps(B.ptr)); 135 float[8] correct = [8, -5, 5, 39999.5, 8, 10, 8, 5]; 136 assert(R.array == correct); 137 } 138 139 /// Alternatively add and subtract packed double-precision (64-bit) floating-point 140 /// elements in `a` to/from packed elements in `b`. 141 __m256d _mm256_addsub_pd (__m256d a, __m256d b) pure @trusted 142 { 143 // PERF DMD 144 static if (GDC_or_LDC_with_AVX) 145 { 146 return __builtin_ia32_addsubpd256(a, b); 147 } 148 else 149 { 150 //// Note: GDC x86 generates addsubpd since GDC 11.1 with -O3 151 //// LDC x86 generates addsubpd since LDC 1.18 with -O2 152 //// LDC ARM: not fantastic, ok since LDC 1.18 -O2 153 a.ptr[0] = a.array[0] + (-b.array[0]); 154 a.ptr[1] = a.array[1] + b.array[1]; 155 a.ptr[2] = a.array[2] + (-b.array[2]); 156 a.ptr[3] = a.array[3] + b.array[3]; 157 return a; 158 } 159 } 160 unittest 161 { 162 align(32) double[4] A = [-1, 2, -3, 40000]; 163 align(32) double[4] B = [ 9, -7, 8, -0.5]; 164 __m256d R = _mm256_addsub_pd(_mm256_load_pd(A.ptr), _mm256_load_pd(B.ptr)); 165 double[4] correct = [-10, -5, -11, 39999.5]; 166 assert(R.array == correct); 167 } 168 169 /// Alternatively add and subtract packed single-precision (32-bit) floating-point elements 170 /// in `a` to/from packed elements in `b`. 
171 __m256 _mm256_addsub_ps (__m256 a, __m256 b) pure @trusted 172 { 173 // PERF DMD 174 static if (GDC_or_LDC_with_AVX) 175 { 176 return __builtin_ia32_addsubps256(a, b); 177 } 178 else 179 { 180 // Note: GDC x86 generates addsubps since GDC 11 -O3 181 // and in absence of AVX, a pair of SSE3 addsubps since GDC 12 -O2 182 // LDC x86 generates addsubps since LDC 1.18 -O2 183 // and in absence of AVX, a pair of SSE3 addsubps since LDC 1.1 -O1 184 // LDC ARM: neat output since LDC 1.21 -O2 185 186 a.ptr[0] = a.array[0] + (-b.array[0]); 187 a.ptr[1] = a.array[1] + b.array[1]; 188 a.ptr[2] = a.array[2] + (-b.array[2]); 189 a.ptr[3] = a.array[3] + b.array[3]; 190 a.ptr[4] = a.array[4] + (-b.array[4]); 191 a.ptr[5] = a.array[5] + b.array[5]; 192 a.ptr[6] = a.array[6] + (-b.array[6]); 193 a.ptr[7] = a.array[7] + b.array[7]; 194 return a; 195 } 196 } 197 unittest 198 { 199 align(32) float[8] A = [-1.0f, 2, -3, 40000, 0, 3, 5, 6]; 200 align(32) float[8] B = [ 9.0f, -7, 8, -0.5, 8, 7, 3, -1]; 201 __m256 R = _mm256_addsub_ps(_mm256_load_ps(A.ptr), _mm256_load_ps(B.ptr)); 202 float[8] correct = [ -10, -5, -11, 39999.5, -8, 10, 2, 5]; 203 assert(R.array == correct); 204 } 205 206 /// Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in `a` and `b`. 207 __m256d _mm256_and_pd (__m256d a, __m256d b) pure @trusted 208 { 209 // Note: GCC avxintrin.h uses the builtins for AND NOTAND OR of _ps and _pd, 210 // but those do not seem needed at any optimization level. 211 return cast(__m256d)(cast(__m256i)a & cast(__m256i)b); 212 } 213 unittest 214 { 215 double a = 4.32; 216 double b = -78.99; 217 long correct = (*cast(long*)(&a)) & (*cast(long*)(&b)); 218 __m256d A = _mm256_set_pd(a, b, a, b); 219 __m256d B = _mm256_set_pd(b, a, b, a); 220 long4 R = cast(long4)( _mm256_and_pd(A, B) ); 221 assert(R.array[0] == correct); 222 assert(R.array[1] == correct); 223 assert(R.array[2] == correct); 224 assert(R.array[3] == correct); 225 } 226 227 /// Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in `a` and `b`. 228 __m256 _mm256_and_ps (__m256 a, __m256 b) pure @trusted 229 { 230 return cast(__m256)(cast(__m256i)a & cast(__m256i)b); 231 } 232 unittest 233 { 234 float a = 4.32f; 235 float b = -78.99f; 236 int correct = (*cast(int*)(&a)) & (*cast(int*)(&b)); 237 __m256 A = _mm256_set_ps(a, b, a, b, a, b, a, b); 238 __m256 B = _mm256_set_ps(b, a, b, a, b, a, b, a); 239 int8 R = cast(int8)( _mm256_and_ps(A, B) ); 240 foreach(i; 0..8) 241 assert(R.array[i] == correct); 242 } 243 244 /// Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in `a` 245 /// and then AND with b. 246 __m256d _mm256_andnot_pd (__m256d a, __m256d b) pure @trusted 247 { 248 // PERF DMD 249 __m256i notA = _mm256_not_si256(cast(__m256i)a); 250 __m256i ib = cast(__m256i)b; 251 __m256i ab = notA & ib; 252 return cast(__m256d)ab; 253 } 254 unittest 255 { 256 double a = 4.32; 257 double b = -78.99; 258 long notA = ~ ( *cast(long*)(&a) ); 259 long correct = notA & (*cast(long*)(&b)); 260 __m256d A = _mm256_set_pd(a, a, a, a); 261 __m256d B = _mm256_set_pd(b, b, b, b); 262 long4 R = cast(long4)( _mm256_andnot_pd(A, B) ); 263 foreach(i; 0..4) 264 assert(R.array[i] == correct); 265 } 266 267 /// Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in `a` 268 /// and then AND with b. 
269 __m256 _mm256_andnot_ps (__m256 a, __m256 b) pure @trusted 270 { 271 // PERF DMD 272 __m256i notA = _mm256_not_si256(cast(__m256i)a); 273 __m256i ib = cast(__m256i)b; 274 __m256i ab = notA & ib; 275 return cast(__m256)ab; 276 } 277 unittest 278 { 279 float a = 4.32f; 280 float b = -78.99f; 281 int notA = ~ ( *cast(int*)(&a) ); 282 int correct = notA & (*cast(int*)(&b)); 283 __m256 A = _mm256_set1_ps(a); 284 __m256 B = _mm256_set1_ps(b); 285 int8 R = cast(int8)( _mm256_andnot_ps(A, B) ); 286 foreach(i; 0..8) 287 assert(R.array[i] == correct); 288 } 289 290 /// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using control 291 /// mask `imm8`. 292 __m256d _mm256_blend_pd(int imm8)(__m256d a, __m256d b) 293 { 294 static assert(imm8 >= 0 && imm8 < 16); 295 296 // PERF DMD 297 static if (GDC_with_AVX) 298 { 299 return __builtin_ia32_blendpd256 (a, b, imm8); 300 } 301 else 302 { 303 // Works great with LDC. 304 double4 r; 305 for (int n = 0; n < 4; ++n) 306 { 307 r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n]; 308 } 309 return r; 310 } 311 } 312 unittest 313 { 314 __m256d A = _mm256_setr_pd(0, 1, 2, 3); 315 __m256d B = _mm256_setr_pd(8, 9, 10, 11); 316 double4 C = _mm256_blend_pd!0x06(A, B); 317 double[4] correct = [0, 9, 10, 3]; 318 assert(C.array == correct); 319 } 320 321 /// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using control 322 /// mask `imm8`. 323 __m256 _mm256_blend_ps(int imm8)(__m256 a, __m256 b) pure @trusted 324 { 325 static assert(imm8 >= 0 && imm8 < 256); 326 // PERF DMD 327 static if (GDC_with_AVX) 328 { 329 return __builtin_ia32_blendps256 (a, b, imm8); 330 } 331 else version(LDC) 332 { 333 // LDC x86: generates a vblendps since LDC 1.1 -O0 334 // arm64: pretty good, four instructions worst case 335 return shufflevectorLDC!(float8, (imm8 & 1) ? 8 : 0, 336 (imm8 & 2) ? 9 : 1, 337 (imm8 & 4) ? 10 : 2, 338 (imm8 & 8) ? 11 : 3, 339 (imm8 & 16) ? 12 : 4, 340 (imm8 & 32) ? 13 : 5, 341 (imm8 & 64) ? 14 : 6, 342 (imm8 & 128) ? 15 : 7)(a, b); 343 } 344 else 345 { 346 // LDC x86: vblendps generated since LDC 1.27 -O1 347 float8 r; 348 for (int n = 0; n < 8; ++n) 349 { 350 r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n]; 351 } 352 return r; 353 } 354 } 355 unittest 356 { 357 __m256 A = _mm256_setr_ps(0, 1, 2, 3, 4, 5, 6, 7); 358 __m256 B = _mm256_setr_ps(8, 9, 10, 11, 12, 13, 14, 15); 359 float8 C = _mm256_blend_ps!0xe7(A, B); 360 float[8] correct = [8, 9, 10, 3, 4, 13, 14, 15]; 361 assert(C.array == correct); 362 } 363 364 /// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using mask. 365 __m256d _mm256_blendv_pd (__m256d a, __m256d b, __m256d mask) @trusted 366 { 367 // PERF DMD 368 static if (GDC_with_AVX) 369 { 370 // Amazingly enough, GCC/GDC generates the vblendvpd instruction 371 // with -mavx2 but not -mavx. 372 // Not sure what is the reason, and there is a replacement sequence. 373 // Sounds like a bug, similar to _mm_blendv_pd 374 // or maybe the instruction in unsafe? 375 return __builtin_ia32_blendvpd256(a, b, mask); 376 } 377 else static if (LDC_with_AVX) 378 { 379 return __builtin_ia32_blendvpd256(a, b, mask); 380 } 381 else 382 { 383 // LDC x86: vblendvpd since LDC 1.27 -O2 384 // arm64: only 4 instructions, since LDC 1.27 -O2 385 __m256d r; 386 long4 lmask = cast(long4)mask; 387 for (int n = 0; n < 4; ++n) 388 { 389 r.ptr[n] = (lmask.array[n] < 0) ? 
b.array[n] : a.array[n]; 390 } 391 return r; 392 } 393 } 394 unittest 395 { 396 __m256d A = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0); 397 __m256d B = _mm256_setr_pd(5.0, 6.0, 7.0, 8.0); 398 __m256d M = _mm256_setr_pd(-3.0, 2.0, 1.0, -4.0); 399 __m256d R = _mm256_blendv_pd(A, B, M); 400 double[4] correct1 = [5.0, 2.0, 3.0, 8.0]; 401 assert(R.array == correct1); 402 } 403 404 /// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` 405 /// using `mask`. 406 __m256 _mm256_blendv_ps (__m256 a, __m256 b, __m256 mask) @trusted 407 { 408 // PERF DMD 409 static if (GDC_or_LDC_with_AVX) 410 { 411 return __builtin_ia32_blendvps256(a, b, mask); 412 } 413 else static if (LDC_with_ARM64) 414 { 415 int8 shift; 416 shift = 31; 417 int8 lmask = cast(int8)mask >> shift; 418 int8 ia = cast(int8)a; 419 int8 ib = cast(int8)b; 420 return cast(__m256)(ia ^ ((ia ^ ib) & lmask)); 421 } 422 else 423 { 424 // In both LDC and GDC with SSE4.1, this generates blendvps as fallback 425 __m256 r; 426 int8 lmask = cast(int8)mask; 427 for (int n = 0; n < 8; ++n) 428 { 429 r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n]; 430 } 431 return r; 432 } 433 } 434 unittest 435 { 436 __m256 A = _mm256_setr_ps(1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f); 437 __m256 B = _mm256_setr_ps(5.0f, 6.0f, 7.0f, 8.0f, 5.0f, 6.0f, 7.0f, 8.0f); 438 __m256 M = _mm256_setr_ps(-3.0f, 2.0f, 1.0f, -4.0f, -3.0f, 2.0f, 1.0f, -4.0f); 439 __m256 R = _mm256_blendv_ps(A, B, M); 440 float[8] correct1 = [5.0f, 2.0f, 3.0f, 8.0f, 5.0f, 2.0f, 3.0f, 8.0f]; 441 assert(R.array == correct1); 442 } 443 444 /// Broadcast 128 bits from memory (composed of 2 packed double-precision (64-bit) 445 /// floating-point elements) to all elements. 446 /// This effectively duplicates the 128-bit vector. 447 __m256d _mm256_broadcast_pd (const(__m128d)* mem_addr) pure @trusted 448 { 449 // PERF DMD 450 static if (GDC_with_AVX) 451 { 452 return __builtin_ia32_vbroadcastf128_pd256(cast(float4*)mem_addr); 453 } 454 else 455 { 456 const(double)* p = cast(const(double)*) mem_addr; 457 __m256d r; 458 r.ptr[0] = p[0]; 459 r.ptr[1] = p[1]; 460 r.ptr[2] = p[0]; 461 r.ptr[3] = p[1]; 462 return r; 463 } 464 } 465 unittest 466 { 467 __m128d A = _mm_setr_pd(3, -4); 468 __m256d B = _mm256_broadcast_pd(&A); 469 double[4] correct = [3, -4, 3, -4]; 470 assert(B.array == correct); 471 } 472 473 /// Broadcast 128 bits from memory (composed of 4 packed single-precision (32-bit) 474 /// floating-point elements) to all elements. 475 /// This effectively duplicates the 128-bit vector. 476 __m256 _mm256_broadcast_ps (const(__m128)* mem_addr) pure @trusted 477 { 478 // PERF DMD 479 static if (GDC_with_AVX) 480 { 481 return __builtin_ia32_vbroadcastf128_ps256(cast(float4*)mem_addr); 482 } 483 else 484 { 485 const(float)* p = cast(const(float)*)mem_addr; 486 __m256 r; 487 r.ptr[0] = p[0]; 488 r.ptr[1] = p[1]; 489 r.ptr[2] = p[2]; 490 r.ptr[3] = p[3]; 491 r.ptr[4] = p[0]; 492 r.ptr[5] = p[1]; 493 r.ptr[6] = p[2]; 494 r.ptr[7] = p[3]; 495 return r; 496 } 497 } 498 unittest 499 { 500 __m128 A = _mm_setr_ps(1, 2, 3, -4); 501 __m256 B = _mm256_broadcast_ps(&A); 502 float[8] correct = [1.0f, 2, 3, -4, 1, 2, 3, -4]; 503 assert(B.array == correct); 504 } 505 506 /// Broadcast a single-precision (32-bit) floating-point element from memory to all elements. 
507 __m256d _mm256_broadcast_sd (const(double)* mem_addr) pure @trusted 508 { 509 static if (GDC_with_AVX) 510 { 511 return __builtin_ia32_vbroadcastsd256(mem_addr); 512 } 513 else 514 { 515 double a = *mem_addr; 516 __m256d r; 517 r.ptr[0] = a; 518 r.ptr[1] = a; 519 r.ptr[2] = a; 520 r.ptr[3] = a; 521 return r; 522 } 523 } 524 unittest 525 { 526 double t = 7.5f; 527 __m256d A = _mm256_broadcast_sd(&t); 528 double[4] correct = [7.5, 7.5, 7.5, 7.5]; 529 assert(A.array == correct); 530 } 531 532 /// Broadcast a single-precision (32-bit) floating-point element from memory to all elements. 533 __m128 _mm_broadcast_ss (const(float)* mem_addr) pure @trusted 534 { 535 // PERF DMD 536 static if (GDC_with_AVX) 537 { 538 return __builtin_ia32_vbroadcastss(mem_addr); 539 } 540 else 541 { 542 float a = *mem_addr; 543 __m128 r; 544 r.ptr[0] = a; 545 r.ptr[1] = a; 546 r.ptr[2] = a; 547 r.ptr[3] = a; 548 return r; 549 } 550 } 551 unittest 552 { 553 float t = 7.5f; 554 __m128 A = _mm_broadcast_ss(&t); 555 float[4] correct = [7.5f, 7.5f, 7.5f, 7.5f]; 556 assert(A.array == correct); 557 } 558 559 __m256 _mm256_broadcast_ss (const(float)* mem_addr) 560 { 561 // PERF DMD 562 static if (GDC_with_AVX) 563 { 564 return __builtin_ia32_vbroadcastss256 (mem_addr); 565 } 566 else 567 { 568 float a = *mem_addr; 569 __m256 r = __m256(a); 570 return r; 571 } 572 } 573 unittest 574 { 575 float t = 7.5f; 576 __m256 A = _mm256_broadcast_ss(&t); 577 float[8] correct = [7.5f, 7.5f, 7.5f, 7.5f, 7.5f, 7.5f, 7.5f, 7.5f]; 578 assert(A.array == correct); 579 } 580 581 /// Cast vector of type `__m256d` to type `__m256`. 582 __m256 _mm256_castpd_ps (__m256d a) pure @safe 583 { 584 return cast(__m256)a; 585 } 586 587 /// Cast vector of type `__m256d` to type `__m256i`. 588 __m256i _mm256_castpd_si256 (__m256d a) pure @safe 589 { 590 return cast(__m256i)a; 591 } 592 593 /// Cast vector of type `__m128d` to type `__m256d`; the upper 128 bits of the result are undefined. 594 __m256d _mm256_castpd128_pd256 (__m128d a) pure @trusted 595 { 596 static if (GDC_with_AVX) 597 { 598 return __builtin_ia32_pd256_pd(a); 599 } 600 else 601 { 602 __m256d r = void; 603 r.ptr[0] = a.array[0]; 604 r.ptr[1] = a.array[1]; 605 return r; 606 } 607 } 608 unittest 609 { 610 __m128d A = _mm_setr_pd(4.0, -6.125); 611 __m256d B = _mm256_castpd128_pd256(A); 612 assert(B.array[0] == 4.0); 613 assert(B.array[1] == -6.125); 614 } 615 616 /// Cast vector of type `__m256d` to type `__m128d`; the upper 128 bits of `a` are lost. 617 __m128d _mm256_castpd256_pd128 (__m256d a) pure @trusted 618 { 619 static if (GDC_with_AVX) 620 { 621 return __builtin_ia32_pd_pd256(a); 622 } 623 else 624 { 625 __m128d r; 626 r.ptr[0] = a.array[0]; 627 r.ptr[1] = a.array[1]; 628 return r; 629 } 630 } 631 unittest 632 { 633 __m256d A = _mm256_set_pd(1, 2, -6.25, 4.0); 634 __m128d B = _mm256_castpd256_pd128(A); 635 assert(B.array[0] == 4.0); 636 assert(B.array[1] == -6.25); 637 } 638 639 /// Cast vector of type `__m256` to type `__m256d`. 640 __m256d _mm256_castps_pd (__m256 a) pure @safe 641 { 642 return cast(__m256d)a; 643 } 644 645 /// Cast vector of type `__m256` to type `__m256i`. 646 __m256i _mm256_castps_si256 (__m256 a) pure @safe 647 { 648 return cast(__m256i)a; 649 } 650 651 /// Cast vector of type `__m128` to type `__m256`; the upper 128 bits of the result are undefined. 
652 __m256 _mm256_castps128_ps256 (__m128 a) pure @trusted 653 { 654 static if (GDC_with_AVX) 655 { 656 return __builtin_ia32_ps256_ps(a); 657 } 658 else 659 { 660 __m256 r = void; 661 r.ptr[0] = a.array[0]; 662 r.ptr[1] = a.array[1]; 663 r.ptr[2] = a.array[2]; 664 r.ptr[3] = a.array[3]; 665 return r; 666 } 667 } 668 unittest 669 { 670 __m128 A = _mm_setr_ps(1.0f, 2, 3, 4); 671 __m256 B = _mm256_castps128_ps256(A); 672 float[4] correct = [1.0f, 2, 3, 4]; 673 assert(B.array[0..4] == correct); 674 } 675 676 /// Cast vector of type `__m256` to type `__m128`. The upper 128-bit of `a` are lost. 677 __m128 _mm256_castps256_ps128 (__m256 a) pure @trusted 678 { 679 return *cast(const(__m128)*)(&a); 680 } 681 unittest 682 { 683 __m256 A = _mm256_setr_ps(1.0f, 2, 3, 4, 5, 6, 7, 8); 684 __m128 B = _mm256_castps256_ps128(A); 685 float[4] correct = [1.0f, 2, 3, 4]; 686 assert(B.array == correct); 687 } 688 689 /// Cast vector of type `__m128i` to type `__m256i`; the upper 128 bits of the result are undefined. 690 __m256i _mm256_castsi128_si256 (__m128i a) pure @trusted 691 { 692 long2 la = cast(long2)a; 693 long4 r = void; 694 r.ptr[0] = la.array[0]; 695 r.ptr[1] = la.array[1]; 696 return r; 697 } 698 unittest 699 { 700 __m128i A = _mm_setr_epi64(-1, 42); 701 __m256i B = _mm256_castsi128_si256(A); 702 long[2] correct = [-1, 42]; 703 assert(B.array[0..2] == correct); 704 } 705 706 /// Cast vector of type `__m256i` to type `__m256d`. 707 __m256d _mm256_castsi256_pd (__m256i a) pure @safe 708 { 709 return cast(__m256d)a; 710 } 711 712 /// Cast vector of type `__m256i` to type `__m256`. 713 __m256 _mm256_castsi256_ps (__m256i a) pure @safe 714 { 715 return cast(__m256)a; 716 } 717 718 /// Cast vector of type `__m256i` to type `__m128i`. The upper 128-bit of `a` are lost. 719 __m128i _mm256_castsi256_si128 (__m256i a) pure @trusted 720 { 721 long2 r = void; 722 r.ptr[0] = a.array[0]; 723 r.ptr[1] = a.array[1]; 724 return cast(__m128i)r; 725 } 726 unittest 727 { 728 long4 A; 729 A.ptr[0] = -1; 730 A.ptr[1] = 42; 731 long2 B = cast(long2)(_mm256_castsi256_si128(A)); 732 long[2] correct = [-1, 42]; 733 assert(B.array[0..2] == correct); 734 } 735 736 /// Round the packed double-precision (64-bit) floating-point elements in `a` up to an integer 737 /// value, and store the results as packed double-precision floating-point elements. 738 __m256d _mm256_ceil_pd (__m256d a) @safe 739 { 740 static if (LDC_with_ARM64) 741 { 742 __m128d lo = _mm256_extractf128_pd!0(a); 743 __m128d hi = _mm256_extractf128_pd!1(a); 744 __m128d ilo = _mm_ceil_pd(lo); 745 __m128d ihi = _mm_ceil_pd(hi); 746 return _mm256_set_m128d(ihi, ilo); 747 } 748 else 749 { 750 return _mm256_round_pd!2(a); 751 } 752 } 753 unittest 754 { 755 __m256d A = _mm256_setr_pd(1.3f, -2.12f, 53.6f, -2.7f); 756 A = _mm256_ceil_pd(A); 757 double[4] correct = [2.0, -2.0, 54.0, -2.0]; 758 assert(A.array == correct); 759 } 760 761 /// Round the packed single-precision (32-bit) floating-point elements in `a` up to an integer 762 /// value, and store the results as packed single-precision floating-point elements. 
763 __m256 _mm256_ceil_ps (__m256 a) @safe 764 { 765 static if (LDC_with_ARM64) 766 { 767 __m128 lo = _mm256_extractf128_ps!0(a); 768 __m128 hi = _mm256_extractf128_ps!1(a); 769 __m128 ilo = _mm_ceil_ps(lo); 770 __m128 ihi = _mm_ceil_ps(hi); 771 return _mm256_set_m128(ihi, ilo); 772 } 773 else 774 { 775 return _mm256_round_ps!2(a); 776 } 777 } 778 unittest 779 { 780 __m256 A = _mm256_setr_ps(1.3f, -2.12f, 53.6f, -2.7f, -1.3f, 2.12f, -53.6f, 2.7f); 781 __m256 C = _mm256_ceil_ps(A); 782 float[8] correct = [2.0f, -2.0f, 54.0f, -2.0f, -1, 3, -53, 3]; 783 assert(C.array == correct); 784 } 785 786 /// Compare packed double-precision (64-bit) floating-point elements in `a` and `b` based on the 787 /// comparison operand specified by `imm8`. 788 __m128d _mm_cmp_pd(int imm8)(__m128d a, __m128d b) pure @safe 789 { 790 enum comparison = mapAVXFPComparison(imm8); 791 return cast(__m128d) cmppd!comparison(a, b); 792 } 793 unittest 794 { 795 __m128d A = _mm_setr_pd(double.infinity, double.nan); 796 __m128d B = _mm_setr_pd(3.0, 4.0); 797 long2 R = cast(long2) _mm_cmp_pd!_CMP_GT_OS(A, B); 798 long[2] correct = [-1, 0]; 799 assert(R.array == correct); 800 801 long2 R2 = cast(long2) _mm_cmp_pd!_CMP_NLE_UQ(A, B); 802 long[2] correct2 = [-1, -1]; 803 assert(R2.array == correct2); 804 } 805 806 ///ditto 807 __m256d _mm256_cmp_pd(int imm8)(__m256d a, __m256d b) pure @safe 808 { 809 enum comparison = mapAVXFPComparison(imm8); 810 return cast(__m256d) cmppd256!comparison(a, b); 811 } 812 unittest 813 { 814 __m256d A = _mm256_setr_pd(1.0, 2.0, 3.0, double.nan); 815 __m256d B = _mm256_setr_pd(3.0, 2.0, 1.0, double.nan); 816 __m256i R = cast(__m256i) _mm256_cmp_pd!_CMP_LT_OS(A, B); 817 long[4] correct = [-1, 0, 0, 0]; 818 assert(R.array == correct); 819 } 820 821 /// Compare packed double-precision (32-bit) floating-point elements in `a` and `b` based on the 822 /// comparison operand specified by `imm8`. 823 __m128 _mm_cmp_ps(int imm8)(__m128 a, __m128 b) pure @safe 824 { 825 enum comparison = mapAVXFPComparison(imm8); 826 return cast(__m128) cmpps!comparison(a, b); 827 } 828 829 ///ditto 830 __m256 _mm256_cmp_ps(int imm8)(__m256 a, __m256 b) pure @safe 831 { 832 enum comparison = mapAVXFPComparison(imm8); 833 return cast(__m256) cmpps256!comparison(a, b); 834 } 835 836 /// Compare the lower double-precision (64-bit) floating-point element in `a` and `b` based on the 837 /// comparison operand specified by `imm8`, store the result in the lower element of result, and 838 /// copy the upper element from `a` to the upper element of result. 839 __m128d _mm_cmp_sd(int imm8)(__m128d a, __m128d b) pure @safe 840 { 841 enum comparison = mapAVXFPComparison(imm8); 842 return cast(__m128d) cmpsd!comparison(a, b); 843 } 844 845 /// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` based on the 846 /// comparison operand specified by `imm8`, store the result in the lower element of result, and 847 /// copy the upper 3 packed elements from `a` to the upper elements of result. 848 __m128 _mm_cmp_ss(int imm8)(__m128 a, __m128 b) pure @safe 849 { 850 enum comparison = mapAVXFPComparison(imm8); 851 return cast(__m128) cmpss!comparison(a, b); 852 } 853 854 /// Convert packed signed 32-bit integers in a to packed double-precision (64-bit) floating-point 855 /// elements. 
856 __m256d _mm256_cvtepi32_pd (__m128i a) pure @trusted 857 { 858 static if (LDC_with_optimizations) 859 { 860 enum ir = ` 861 %r = sitofp <4 x i32> %0 to <4 x double> 862 ret <4 x double> %r`; 863 return LDCInlineIR!(ir, double4, __m128i)(a); 864 } 865 else static if (GDC_with_AVX) 866 { 867 return __builtin_ia32_cvtdq2pd256(a); 868 } 869 else 870 { 871 double4 r; 872 r.ptr[0] = a.array[0]; 873 r.ptr[1] = a.array[1]; 874 r.ptr[2] = a.array[2]; 875 r.ptr[3] = a.array[3]; 876 return r; 877 } 878 } 879 unittest 880 { 881 __m256d R = _mm256_cvtepi32_pd(_mm_set1_epi32(54)); 882 double[4] correct = [54.0, 54, 54, 54]; 883 assert(R.array == correct); 884 } 885 886 /// Convert packed signed 32-bit integers in `a` to packed single-precision (32-bit) floating-point 887 /// elements. 888 __m256 _mm256_cvtepi32_ps (__m256i a) pure @trusted 889 { 890 static if (LDC_with_optimizations) 891 { 892 enum ir = ` 893 %r = sitofp <8 x i32> %0 to <8 x float> 894 ret <8 x float> %r`; 895 return LDCInlineIR!(ir, float8, int8)(cast(int8)a); 896 } 897 else static if (GDC_with_AVX) 898 { 899 return __builtin_ia32_cvtdq2ps256(cast(int8)a); 900 } 901 else 902 { 903 int8 ia = cast(int8)a; 904 __m256 r; 905 r.ptr[0] = ia.array[0]; 906 r.ptr[1] = ia.array[1]; 907 r.ptr[2] = ia.array[2]; 908 r.ptr[3] = ia.array[3]; 909 r.ptr[4] = ia.array[4]; 910 r.ptr[5] = ia.array[5]; 911 r.ptr[6] = ia.array[6]; 912 r.ptr[7] = ia.array[7]; 913 return r; 914 } 915 } 916 unittest 917 { 918 __m256 R = _mm256_cvtepi32_ps(_mm256_set1_epi32(5)); 919 float[8] correct = [5.0f, 5, 5, 5, 5, 5, 5, 5]; 920 assert(R.array == correct); 921 } 922 923 /// Convert packed double-precision (64-bit) floating-point elements in `a` to packed 32-bit 924 /// integers. Follows the current rounding mode. 925 __m128i _mm256_cvtpd_epi32 (__m256d a) @safe 926 { 927 static if (GDC_or_LDC_with_AVX) 928 { 929 return __builtin_ia32_cvtpd2dq256(a); 930 } 931 else 932 { 933 __m128d lo = _mm256_extractf128_pd!0(a); 934 __m128d hi = _mm256_extractf128_pd!1(a); 935 __m128i ilo = _mm_cvtpd_epi32(lo); // Only lower 64-bit contains significant values 936 __m128i ihi = _mm_cvtpd_epi32(hi); 937 return _mm_unpacklo_epi64(ilo, ihi); 938 } 939 } 940 unittest 941 { 942 int4 A = _mm256_cvtpd_epi32(_mm256_setr_pd(61.0, 55.0, -100, 1_000_000)); 943 int[4] correct = [61, 55, -100, 1_000_000]; 944 assert(A.array == correct); 945 } 946 947 /// Convert packed double-precision (64-bit) floating-point elements in `a` to packed single-precision (32-bit) 948 /// floating-point elements. 949 __m128 _mm256_cvtpd_ps (__m256d a) pure @trusted 950 { 951 // PERF DMD 952 static if (GDC_or_LDC_with_AVX) 953 { 954 return __builtin_ia32_cvtpd2ps256(a); 955 } 956 else 957 { 958 __m128 r; 959 r.ptr[0] = a.array[0]; 960 r.ptr[1] = a.array[1]; 961 r.ptr[2] = a.array[2]; 962 r.ptr[3] = a.array[3]; 963 return r; 964 } 965 } 966 unittest 967 { 968 __m256d A = _mm256_setr_pd(1.0, 2, 3, 5); 969 __m128 R = _mm256_cvtpd_ps(A); 970 float[4] correct = [1.0f, 2, 3, 5]; 971 assert(R.array == correct); 972 } 973 974 /// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit 975 /// integers, using the current rounding mode. 
976 __m256i _mm256_cvtps_epi32 (__m256 a) @trusted 977 { 978 static if (GDC_or_LDC_with_AVX) 979 { 980 return cast(__m256i) __builtin_ia32_cvtps2dq256(a); 981 } 982 else 983 { 984 __m128 lo = _mm256_extractf128_ps!0(a); 985 __m128 hi = _mm256_extractf128_ps!1(a); 986 __m128i ilo = _mm_cvtps_epi32(lo); 987 __m128i ihi = _mm_cvtps_epi32(hi); 988 return _mm256_set_m128i(ihi, ilo); 989 } 990 } 991 unittest 992 { 993 uint savedRounding = _MM_GET_ROUNDING_MODE(); 994 995 _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); 996 __m256i A = _mm256_cvtps_epi32(_mm256_setr_ps(1.4f, -2.1f, 53.5f, -2.9f, -1.4f, 2.1f, -53.5f, 2.9f)); 997 assert( (cast(int8)A).array == [1, -2, 54, -3, -1, 2, -54, 3]); 998 999 _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN); 1000 A = _mm256_cvtps_epi32(_mm256_setr_ps(1.3f, -2.11f, 53.4f, -2.8f, -1.3f, 2.11f, -53.4f, 2.8f)); 1001 assert( (cast(int8)A).array == [1, -3, 53, -3, -2, 2, -54, 2]); 1002 1003 _MM_SET_ROUNDING_MODE(_MM_ROUND_UP); 1004 A = _mm256_cvtps_epi32(_mm256_setr_ps(1.3f, -2.12f, 53.6f, -2.7f, -1.3f, 2.12f, -53.6f, 2.7f)); 1005 assert( (cast(int8)A).array == [2, -2, 54, -2, -1, 3, -53, 3]); 1006 1007 _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); 1008 A = _mm256_cvtps_epi32(_mm256_setr_ps(1.4f, -2.17f, 53.8f, -2.91f, -1.4f, 2.17f, -53.8f, 2.91f)); 1009 assert( (cast(int8)A).array == [1, -2, 53, -2, -1, 2, -53, 2]); 1010 1011 _MM_SET_ROUNDING_MODE(savedRounding); 1012 } 1013 1014 1015 /// Convert packed single-precision (32-bit) floating-point elements in `a`` to packed double-precision 1016 /// (64-bit) floating-point elements. 1017 __m256d _mm256_cvtps_pd (__m128 a) pure @trusted 1018 { 1019 // PERF DMD 1020 static if (GDC_with_AVX) 1021 { 1022 return __builtin_ia32_cvtps2pd256(a); // LDC doesn't have the builtin 1023 } 1024 else 1025 { 1026 // LDC: x86, needs -O2 to generate cvtps2pd since LDC 1.2.0 1027 __m256d r; 1028 r.ptr[0] = a.array[0]; 1029 r.ptr[1] = a.array[1]; 1030 r.ptr[2] = a.array[2]; 1031 r.ptr[3] = a.array[3]; 1032 return r; 1033 } 1034 } 1035 unittest 1036 { 1037 __m128 A = _mm_setr_ps(1.0f, 2, 3, 5); 1038 __m256d R = _mm256_cvtps_pd(A); 1039 double[4] correct = [1.0, 2, 3, 5]; 1040 assert(R.array == correct); 1041 } 1042 1043 /// Return the lower double-precision (64-bit) floating-point element of `a`. 1044 double _mm256_cvtsd_f64 (__m256d a) pure @safe 1045 { 1046 return a.array[0]; 1047 } 1048 1049 /// Return the lower 32-bit integer in `a`. 1050 int _mm256_cvtsi256_si32 (__m256i a) pure @safe 1051 { 1052 return (cast(int8)a).array[0]; 1053 } 1054 1055 /// Return the lower single-precision (32-bit) floating-point element of `a`. 1056 float _mm256_cvtss_f32 (__m256 a) pure @safe 1057 { 1058 return a.array[0]; 1059 } 1060 1061 /// Convert packed double-precision (64-bit) floating-point elements in `a` to packed 32-bit 1062 /// integers with truncation. 1063 __m128i _mm256_cvttpd_epi32 (__m256d a) pure @trusted 1064 { 1065 // PERF DMD 1066 static if (GDC_or_LDC_with_AVX) 1067 { 1068 return cast(__m128i)__builtin_ia32_cvttpd2dq256(a); 1069 } 1070 else 1071 { 1072 __m128i r; 1073 r.ptr[0] = cast(int)a.array[0]; 1074 r.ptr[1] = cast(int)a.array[1]; 1075 r.ptr[2] = cast(int)a.array[2]; 1076 r.ptr[3] = cast(int)a.array[3]; 1077 return r; 1078 } 1079 } 1080 unittest 1081 { 1082 __m256d A = _mm256_set_pd(4.7, -1000.9, -7.1, 3.1); 1083 __m128i R = _mm256_cvttpd_epi32(A); 1084 int[4] correct = [3, -7, -1000, 4]; 1085 assert(R.array == correct); 1086 } 1087 1088 /// Convert packed single-precision (32-bit) floating-point elements in `a`. 
1089 __m256i _mm256_cvttps_epi32 (__m256 a) pure @trusted 1090 { 1091 // PERF DMD 1092 static if (GDC_or_LDC_with_AVX) 1093 { 1094 return cast(__m256i)__builtin_ia32_cvttps2dq256(a); 1095 } 1096 else 1097 { 1098 int8 r; 1099 r.ptr[0] = cast(int)a.array[0]; 1100 r.ptr[1] = cast(int)a.array[1]; 1101 r.ptr[2] = cast(int)a.array[2]; 1102 r.ptr[3] = cast(int)a.array[3]; 1103 r.ptr[4] = cast(int)a.array[4]; 1104 r.ptr[5] = cast(int)a.array[5]; 1105 r.ptr[6] = cast(int)a.array[6]; 1106 r.ptr[7] = cast(int)a.array[7]; 1107 return cast(__m256i)r; 1108 } 1109 } 1110 unittest 1111 { 1112 __m256 A = _mm256_set_ps(4.7, -1000.9, -7.1, 3.1, 1.4, 2.9, -2.9, 0); 1113 int8 R = cast(int8) _mm256_cvttps_epi32(A); 1114 int[8] correct = [0, -2, 2, 1, 3, -7, -1000, 4]; 1115 assert(R.array == correct); 1116 } 1117 1118 /// Divide packed double-precision (64-bit) floating-point elements in `a` by packed elements in `b`. 1119 __m256d _mm256_div_pd (__m256d a, __m256d b) pure @safe 1120 { 1121 return a / b; 1122 } 1123 unittest 1124 { 1125 __m256d a = [1.5, -2.0, 3.0, 1.0]; 1126 a = _mm256_div_pd(a, a); 1127 double[4] correct = [1.0, 1.0, 1.0, 1.0]; 1128 assert(a.array == correct); 1129 } 1130 1131 /// Divide packed single-precision (32-bit) floating-point elements in `a` by packed elements in `b`. 1132 __m256 _mm256_div_ps (__m256 a, __m256 b) pure @safe 1133 { 1134 return a / b; 1135 } 1136 unittest 1137 { 1138 __m256 a = [1.5f, -2.0f, 3.0f, 1.0f, 4.5f, -5.0f, 6.0f, 7.0f]; 1139 a = _mm256_div_ps(a, a); 1140 float[8] correct = [1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f]; 1141 assert(a.array == correct); 1142 } 1143 1144 /// Conditionally multiply the packed single-precision (32-bit) floating-point elements in `a` and 1145 /// `b` using the high 4 bits in `imm8`, sum the four products, and conditionally store the sum 1146 /// using the low 4 bits of `imm8`. 1147 __m256 _mm256_dp_ps(int imm8)(__m256 a, __m256 b) 1148 { 1149 // PERF DMD 1150 static if (GDC_or_LDC_with_AVX) 1151 { 1152 return __builtin_ia32_dpps256(a, b, cast(ubyte)imm8); 1153 } 1154 else 1155 { 1156 // Note: in LDC with SSE4.1 but no AVX, we _could_ increase perf a bit by using two 1157 // _mm_dp_ps. 1158 __m256 zero = _mm256_setzero_ps(); 1159 enum ubyte op = (imm8 >>> 4) & 15; 1160 __m256 temp = _mm256_blend_ps!( op | (op << 4) )(zero, a * b); 1161 float lo = temp.array[0] + temp.array[1] + temp.array[2] + temp.array[3]; 1162 float hi = temp.array[4] + temp.array[5] + temp.array[6] + temp.array[7]; 1163 __m256 r = _mm256_set_m128(_mm_set1_ps(hi), _mm_set1_ps(lo)); 1164 enum ubyte op2 = (imm8 & 15); 1165 return _mm256_blend_ps!(op2 | (op2 << 4))(zero, r); 1166 } 1167 } 1168 unittest 1169 { 1170 // Products: 9 14 20 24 6 16 12 -24 1171 __m256 A = _mm256_setr_ps(1.0f, 2.0f, 4.0f, 8.0f, 1.0f, 2.0f, 4.0f, 8.0f); 1172 __m256 B = _mm256_setr_ps(9.0f, 7.0f, 5.0f, 3.0f, 6.0f, 8.0f, 3.0f,-3.0f); 1173 float8 R1 = _mm256_dp_ps!(0xf0 + 0xf)(A, B); 1174 float8 R2 = _mm256_dp_ps!(0x30 + 0x5)(A, B); 1175 float8 R3 = _mm256_dp_ps!(0x50 + 0xa)(A, B); 1176 float[8] correct1 = [67.0f, 67.0f, 67.0f,67.0f, 10, 10, 10, 10]; 1177 float[8] correct2 = [23.0f, 0.0f, 23.0f, 0.0f, 22, 0, 22, 0]; 1178 float[8] correct3 = [0.0f, 29.0f, 0.0f, 29.0f, 0, 18, 0, 18]; 1179 assert(R1.array == correct1); 1180 assert(R2.array == correct2); 1181 assert(R3.array == correct3); 1182 } 1183 1184 /// Extract a 32-bit integer from `a`, selected with `imm8`. 
1185 int _mm256_extract_epi32 (__m256i a, const int imm8) pure @trusted 1186 { 1187 return (cast(int8)a).array[imm8 & 7]; 1188 } 1189 unittest 1190 { 1191 align(16) int[8] data = [-1, 2, -3, 4, 9, -7, 8, -6]; 1192 auto A = _mm256_loadu_si256(cast(__m256i*) data.ptr); 1193 assert(_mm256_extract_epi32(A, 0) == -1); 1194 assert(_mm256_extract_epi32(A, 1 + 8) == 2); 1195 assert(_mm256_extract_epi32(A, 3 + 16) == 4); 1196 assert(_mm256_extract_epi32(A, 7 + 32) == -6); 1197 } 1198 1199 /// Extract a 64-bit integer from `a`, selected with `index`. 1200 long _mm256_extract_epi64 (__m256i a, const int index) pure @safe 1201 { 1202 return a.array[index & 3]; 1203 } 1204 unittest 1205 { 1206 __m256i A = _mm256_setr_epi64x(-7, 6, 42, 0); 1207 assert(_mm256_extract_epi64(A, -8) == -7); 1208 assert(_mm256_extract_epi64(A, 1) == 6); 1209 assert(_mm256_extract_epi64(A, 2 + 4) == 42); 1210 } 1211 1212 /// Extract a 128-bits lane from `a`, selected with `index` (0 or 1). 1213 /// Note: `_mm256_extractf128_pd!0` is equivalent to `_mm256_castpd256_pd128`. 1214 __m128d _mm256_extractf128_pd(ubyte imm8)(__m256d a) pure @trusted 1215 { 1216 version(GNU) pragma(inline, true); // else GDC has trouble inlining this 1217 1218 // PERF DMD 1219 static if (GDC_with_AVX) 1220 { 1221 // Note: needs to be a template intrinsics because of this builtin. 1222 return __builtin_ia32_vextractf128_pd256(a, imm8 & 1); 1223 } 1224 else 1225 { 1226 double2 r = void; 1227 enum int index = 2*(imm8 & 1); 1228 r.ptr[0] = a.array[index+0]; 1229 r.ptr[1] = a.array[index+1]; 1230 return r; 1231 } 1232 } 1233 unittest 1234 { 1235 __m256d A = _mm256_setr_pd(1.0, 2, 3, 4); 1236 double[4] correct = [1.0, 2, 3, 4]; 1237 __m128d l0 = _mm256_extractf128_pd!18(A); 1238 __m128d l1 = _mm256_extractf128_pd!55(A); 1239 assert(l0.array == correct[0..2]); 1240 assert(l1.array == correct[2..4]); 1241 } 1242 1243 ///ditto 1244 __m128 _mm256_extractf128_ps(ubyte imm8)(__m256 a) pure @trusted 1245 { 1246 version(GNU) pragma(inline, true); // else GDC has trouble inlining this 1247 1248 // PERF DMD 1249 static if (GDC_with_AVX) 1250 { 1251 return __builtin_ia32_vextractf128_ps256(a, imm8 & 1); 1252 } 1253 else 1254 { 1255 float4 r = void; // Optimize well since LDC 1.1 -O1 1256 enum int index = 4*(imm8 & 1); 1257 r.ptr[0] = a.array[index+0]; 1258 r.ptr[1] = a.array[index+1]; 1259 r.ptr[2] = a.array[index+2]; 1260 r.ptr[3] = a.array[index+3]; 1261 return r; 1262 } 1263 } 1264 unittest 1265 { 1266 __m256 A = _mm256_setr_ps(1.0, 2, 3, 4, 5, 6, 7, 8); 1267 float[8] correct = [1.0, 2, 3, 4, 5, 6, 7, 8]; 1268 __m128 l0 = _mm256_extractf128_ps!8(A); 1269 __m128 l1 = _mm256_extractf128_ps!255(A); 1270 assert(l0.array == correct[0..4]); 1271 assert(l1.array == correct[4..8]); 1272 } 1273 1274 ///ditto 1275 __m128i _mm256_extractf128_si256(ubyte imm8)(__m256i a) pure @trusted 1276 { 1277 version(GNU) pragma(inline, true); // else GDC has trouble inlining this 1278 1279 // PERF DMD 1280 static if (GDC_with_AVX) 1281 { 1282 // Note: if it weren't for this GDC intrinsic, _mm256_extractf128_si256 1283 // could be a non-template, however, this wins in -O0. 
1284 // Same story for _mm256_extractf128_ps and _mm256_extractf128_pd 1285 return __builtin_ia32_vextractf128_si256(cast(int8)a, imm8 & 1); 1286 } 1287 else 1288 { 1289 long2 r = void; 1290 enum int index = 2*(imm8 & 1); 1291 r.ptr[0] = a.array[index+0]; 1292 r.ptr[1] = a.array[index+1]; 1293 return cast(__m128i)r; 1294 } 1295 } 1296 unittest 1297 { 1298 __m256i A = _mm256_setr_epi32(9, 2, 3, 4, 5, 6, 7, 8); 1299 int[8] correct = [9, 2, 3, 4, 5, 6, 7, 8]; 1300 __m128i l0 = _mm256_extractf128_si256!0(A); 1301 __m128i l1 = _mm256_extractf128_si256!1(A); 1302 assert(l0.array == correct[0..4]); 1303 assert(l1.array == correct[4..8]); 1304 } 1305 1306 /// Round the packed double-precision (64-bit) floating-point elements in `a` down to an integer 1307 /// value, and store the results as packed double-precision floating-point elements. 1308 __m256d _mm256_floor_pd (__m256d a) @safe 1309 { 1310 static if (LDC_with_ARM64) 1311 { 1312 __m128d lo = _mm256_extractf128_pd!0(a); 1313 __m128d hi = _mm256_extractf128_pd!1(a); 1314 __m128d ilo = _mm_floor_pd(lo); 1315 __m128d ihi = _mm_floor_pd(hi); 1316 return _mm256_set_m128d(ihi, ilo); 1317 } 1318 else 1319 { 1320 return _mm256_round_pd!1(a); 1321 } 1322 } 1323 unittest 1324 { 1325 __m256d A = _mm256_setr_pd(1.3f, -2.12f, 53.6f, -2.7f); 1326 A = _mm256_floor_pd(A); 1327 double[4] correct = [1.0, -3.0, 53.0, -3.0]; 1328 assert(A.array == correct); 1329 } 1330 1331 /// Round the packed single-precision (32-bit) floating-point elements in `a` down to an integer 1332 /// value, and store the results as packed single-precision floating-point elements. 1333 __m256 _mm256_floor_ps (__m256 a) @safe 1334 { 1335 static if (LDC_with_ARM64) 1336 { 1337 __m128 lo = _mm256_extractf128_ps!0(a); 1338 __m128 hi = _mm256_extractf128_ps!1(a); 1339 __m128 ilo = _mm_floor_ps(lo); 1340 __m128 ihi = _mm_floor_ps(hi); 1341 return _mm256_set_m128(ihi, ilo); 1342 } 1343 else 1344 { 1345 return _mm256_round_ps!1(a); 1346 } 1347 } 1348 unittest 1349 { 1350 __m256 A = _mm256_setr_ps(1.3f, -2.12f, 53.6f, -2.7f, -1.3f, 2.12f, -53.6f, 2.7f); 1351 __m256 C = _mm256_floor_ps(A); 1352 float[8] correct = [1.0f, -3.0f, 53.0f, -3.0f, -2, 2, -54, 2]; 1353 assert(C.array == correct); 1354 } 1355 1356 /// Horizontally add adjacent pairs of double-precision (64-bit) floating-point elements in `a` 1357 /// and `b`. 1358 __m256d _mm256_hadd_pd (__m256d a, __m256d b) pure @trusted 1359 { 1360 static if (GDC_or_LDC_with_AVX) 1361 { 1362 return __builtin_ia32_haddpd256(a, b); 1363 } 1364 else 1365 { 1366 __m256d res; 1367 res.ptr[0] = a.array[1] + a.array[0]; 1368 res.ptr[1] = b.array[1] + b.array[0]; 1369 res.ptr[2] = a.array[3] + a.array[2]; 1370 res.ptr[3] = b.array[3] + b.array[2]; 1371 return res; 1372 } 1373 } 1374 unittest 1375 { 1376 __m256d A =_mm256_setr_pd(1.5, 2.0, 21.0, 9.0); 1377 __m256d B =_mm256_setr_pd(1.0, 7.0, 100.0, 14.0); 1378 __m256d C = _mm256_hadd_pd(A, B); 1379 double[4] correct = [3.5, 8.0, 30.0, 114.0]; 1380 assert(C.array == correct); 1381 } 1382 1383 /// Horizontally add adjacent pairs of single-precision (32-bit) floating-point elements in `a` and 1384 /// `b`. 
1385 __m256 _mm256_hadd_ps (__m256 a, __m256 b) pure @trusted 1386 { 1387 // PERD DMD 1388 static if (GDC_or_LDC_with_AVX) 1389 { 1390 return __builtin_ia32_haddps256(a, b); 1391 } 1392 else static if (LDC_with_ARM64) 1393 { 1394 __m128 a_hi = _mm256_extractf128_ps!1(a); 1395 __m128 a_lo = _mm256_extractf128_ps!0(a); 1396 __m128 b_hi = _mm256_extractf128_ps!1(b); 1397 __m128 b_lo = _mm256_extractf128_ps!0(b); 1398 __m128 hi = vpaddq_f32(a_hi, b_hi); 1399 __m128 lo = vpaddq_f32(a_lo, b_lo); 1400 return _mm256_set_m128(hi, lo); 1401 } 1402 else 1403 { 1404 __m256 res; 1405 res.ptr[0] = a.array[1] + a.array[0]; 1406 res.ptr[1] = a.array[3] + a.array[2]; 1407 res.ptr[2] = b.array[1] + b.array[0]; 1408 res.ptr[3] = b.array[3] + b.array[2]; 1409 res.ptr[4] = a.array[5] + a.array[4]; 1410 res.ptr[5] = a.array[7] + a.array[6]; 1411 res.ptr[6] = b.array[5] + b.array[4]; 1412 res.ptr[7] = b.array[7] + b.array[6]; 1413 return res; 1414 } 1415 } 1416 unittest 1417 { 1418 __m256 A =_mm256_setr_ps(1.0f, 2.0f, 3.0f, 5.0f, 1.0f, 2.0f, 3.0f, 5.0f); 1419 __m256 B =_mm256_setr_ps(1.5f, 2.0f, 3.5f, 4.0f, 1.5f, 2.0f, 3.5f, 5.0f); 1420 __m256 R = _mm256_hadd_ps(A, B); 1421 float[8] correct = [3.0f, 8.0f, 3.5f, 7.5f, 3.0f, 8.0f, 3.5f, 8.5f]; 1422 assert(R.array == correct); 1423 } 1424 1425 /// Horizontally subtract adjacent pairs of double-precision (64-bit) floating-point elements in 1426 /// `a` and `b`. 1427 __m256d _mm256_hsub_pd (__m256d a, __m256d b) pure @trusted 1428 { 1429 static if (GDC_or_LDC_with_AVX) 1430 { 1431 return __builtin_ia32_hsubpd256(a, b); 1432 } 1433 else 1434 { 1435 // 2 zip1, 2 zip2, 2 fsub... I don't think there is better in arm64 1436 __m256d res; 1437 res.ptr[0] = a.array[0] - a.array[1]; 1438 res.ptr[1] = b.array[0] - b.array[1]; 1439 res.ptr[2] = a.array[2] - a.array[3]; 1440 res.ptr[3] = b.array[2] - b.array[3]; 1441 return res; 1442 } 1443 } 1444 unittest 1445 { 1446 __m256d A =_mm256_setr_pd(1.5, 2.0, 21.0, 9.0); 1447 __m256d B =_mm256_setr_pd(1.0, 7.0, 100.0, 14.0); 1448 __m256d C = _mm256_hsub_pd(A, B); 1449 double[4] correct = [-0.5, -6.0, 12.0, 86.0]; 1450 assert(C.array == correct); 1451 } 1452 1453 __m256 _mm256_hsub_ps (__m256 a, __m256 b) pure @trusted 1454 { 1455 // PERD DMD 1456 static if (GDC_or_LDC_with_AVX) 1457 { 1458 return __builtin_ia32_hsubps256(a, b); 1459 } 1460 else 1461 { 1462 __m128 a_hi = _mm256_extractf128_ps!1(a); 1463 __m128 a_lo = _mm256_extractf128_ps!0(a); 1464 __m128 b_hi = _mm256_extractf128_ps!1(b); 1465 __m128 b_lo = _mm256_extractf128_ps!0(b); 1466 __m128 hi = _mm_hsub_ps(a_hi, b_hi); 1467 __m128 lo = _mm_hsub_ps(a_lo, b_lo); 1468 return _mm256_set_m128(hi, lo); 1469 } 1470 } 1471 unittest 1472 { 1473 __m256 A =_mm256_setr_ps(1.0f, 2.0f, 3.0f, 5.0f, 1.0f, 2.0f, 3.0f, 5.0f); 1474 __m256 B =_mm256_setr_ps(1.5f, 2.0f, 3.5f, 4.0f, 1.5f, 2.0f, 3.5f, 5.0f); 1475 __m256 R = _mm256_hsub_ps(A, B); 1476 float[8] correct = [-1.0f, -2.0f, -0.5f, -0.5f, -1.0f, -2.0f, -0.5f, -1.5f]; 1477 assert(R.array == correct); 1478 } 1479 1480 /// Copy `a`, and insert the 16-bit integer `i` into the result at the location specified by 1481 /// `index & 15`. 
1482 __m256i _mm256_insert_epi16 (__m256i a, short i, const int index) pure @trusted 1483 { 1484 short16 sa = cast(short16)a; 1485 sa.ptr[index & 15] = i; 1486 return cast(__m256i)sa; 1487 } 1488 unittest 1489 { 1490 __m256i A = _mm256_set1_epi16(1); 1491 short16 R = cast(short16) _mm256_insert_epi16(A, 2, 16 + 16 + 7); 1492 short[16] correct = [1, 1, 1, 1, 1, 1, 1, 2, 1493 1, 1, 1, 1, 1, 1, 1, 1 ]; 1494 assert(R.array == correct); 1495 } 1496 1497 /// Copy `a`, and insert the 32-bit integer `i` into the result at the location specified by 1498 /// `index & 7`. 1499 __m256i _mm256_insert_epi32 (__m256i a, int i, const int index) pure @trusted 1500 { 1501 int8 ia = cast(int8)a; 1502 ia.ptr[index & 7] = i; 1503 return cast(__m256i)ia; 1504 } 1505 unittest 1506 { 1507 __m256i A = _mm256_set1_epi32(1); 1508 int8 R = cast(int8) _mm256_insert_epi32(A, -2, 8 + 8 + 1); 1509 int[8] correct = [1, -2, 1, 1, 1, 1, 1, 1]; 1510 assert(R.array == correct); 1511 } 1512 1513 /// Copy `a`, and insert the 64-bit integer `i` into the result at the location specified by 1514 /// `index & 3`. 1515 __m256i _mm256_insert_epi64(__m256i a, long i, const int index) pure @trusted 1516 { 1517 a.ptr[index & 3] = i; 1518 return a; 1519 } 1520 unittest 1521 { 1522 __m256i A = _mm256_set1_epi64(1); 1523 long4 R = cast(long4) _mm256_insert_epi64(A, -2, 2 - 4 - 4); 1524 long[4] correct = [1, 1, -2, 1]; 1525 assert(R.array == correct); 1526 } 1527 1528 /// Copy `a`, and insert the 8-bit integer `i` into the result at the location specified by 1529 /// `index & 31`. 1530 __m256i _mm256_insert_epi8(__m256i a, byte i, const int index) pure @trusted 1531 { 1532 byte32 ba = cast(byte32)a; 1533 ba.ptr[index & 31] = i; 1534 return cast(__m256i)ba; 1535 } 1536 unittest 1537 { 1538 __m256i A = _mm256_set1_epi8(1); 1539 byte32 R = cast(byte32) _mm256_insert_epi8(A, -2, 7 - 32 - 32); 1540 byte[32] correct = [1, 1, 1, 1, 1, 1, 1,-2, 1, 1, 1, 1, 1, 1, 1, 1, 1541 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ]; 1542 assert(R.array == correct); 1543 } 1544 1545 /// Copy `a`, then insert 128 bits (composed of 2 packed double-precision (64-bit) 1546 /// floating-point elements) from `b` at the location specified by `imm8`. 1547 __m256d _mm256_insertf128_pd(int imm8)(__m256d a, __m128d b) pure @trusted 1548 { 1549 static if (GDC_with_AVX) 1550 { 1551 enum ubyte lane = imm8 & 1; 1552 return __builtin_ia32_vinsertf128_pd256(a, b, lane); 1553 } 1554 else 1555 { 1556 __m256d r = a; 1557 enum int index = (imm8 & 1) ? 2 : 0; 1558 r.ptr[index] = b.array[0]; 1559 r.ptr[index+1] = b.array[1]; 1560 return r; 1561 } 1562 } 1563 1564 /// Copy `a` then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point 1565 /// elements) from `b`, at the location specified by `imm8`. 1566 __m256 _mm256_insertf128_ps(int imm8)(__m256 a, __m128 b) pure @trusted 1567 { 1568 static if (GDC_with_AVX) 1569 { 1570 enum ubyte lane = imm8 & 1; 1571 return __builtin_ia32_vinsertf128_ps256(a, b, lane); 1572 } 1573 else 1574 { 1575 __m256 r = a; 1576 enum int index = (imm8 & 1) ? 4 : 0; 1577 r.ptr[index] = b.array[0]; 1578 r.ptr[index+1] = b.array[1]; 1579 r.ptr[index+2] = b.array[2]; 1580 r.ptr[index+3] = b.array[3]; 1581 return r; 1582 } 1583 } 1584 1585 /// Copy `a`, then insert 128 bits from `b` at the location specified by `imm8`. 
1586 __m256i _mm256_insertf128_si256(int imm8)(__m256i a, __m128i b) pure @trusted 1587 { 1588 static if (GDC_with_AVX) 1589 { 1590 enum ubyte lane = imm8 & 1; 1591 return cast(__m256i) __builtin_ia32_vinsertf128_si256 (cast(int8)a, b, lane); 1592 } 1593 else 1594 { 1595 long2 lb = cast(long2)b; 1596 __m256i r = a; 1597 enum int index = (imm8 & 1) ? 2 : 0; 1598 r.ptr[index] = lb.array[0]; 1599 r.ptr[index+1] = lb.array[1]; 1600 return r; 1601 } 1602 } 1603 1604 /// Load 256-bits of integer data from unaligned memory into dst. 1605 /// This intrinsic may run better than `_mm256_loadu_si256` when the data crosses a cache 1606 /// line boundary. 1607 __m256i _mm256_lddqu_si256(const(__m256i)* mem_addr) @trusted 1608 { 1609 // PERF DMD 1610 static if (GDC_or_LDC_with_AVX) 1611 { 1612 return cast(__m256i) __builtin_ia32_lddqu256(cast(const(char)*)mem_addr); 1613 } 1614 else 1615 return _mm256_loadu_si256(mem_addr); 1616 } 1617 unittest 1618 { 1619 int[10] correct = [0, -1, 2, -3, 4, 9, -7, 8, -6, 34]; 1620 int8 A = cast(int8) _mm256_lddqu_si256(cast(__m256i*) &correct[1]); 1621 assert(A.array == correct[1..9]); 1622 } 1623 1624 /// Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) 1625 /// from memory. `mem_addr` must be aligned on a 32-byte boundary or a general-protection 1626 /// exception may be generated. 1627 __m256d _mm256_load_pd (const(double)* mem_addr) pure @trusted 1628 { 1629 return *cast(__m256d*)mem_addr; 1630 } 1631 unittest 1632 { 1633 static immutable align(32) double[4] correct = [1.0, 2.0, 3.5, -42.0]; 1634 __m256d A = _mm256_load_pd(correct.ptr); 1635 assert(A.array == correct); 1636 } 1637 1638 /// Load 256-bits (composed of 8 packed single-precision (32-bit) 1639 /// floating-point elements) from memory. 1640 /// `mem_addr` must be aligned on a 32-byte boundary or a 1641 /// general-protection exception may be generated. 1642 __m256 _mm256_load_ps (const(float)* mem_addr) pure @trusted 1643 { 1644 return *cast(__m256*)mem_addr; 1645 } 1646 unittest 1647 { 1648 static immutable align(32) float[8] correct = 1649 [1.0, 2.0, 3.5, -42.0, 7.43f, 0.0f, 3, 2]; 1650 __m256 A = _mm256_load_ps(correct.ptr); 1651 assert(A.array == correct); 1652 } 1653 1654 /// Load 256-bits of integer data from memory. `mem_addr` does not need to be aligned on 1655 /// any particular boundary. 1656 // See this dlang forum post => https://forum.dlang.org/thread/vymrsngsfibkmqsqffce@forum.dlang.org 1657 __m256i _mm256_loadu_si256 (const(__m256i)* mem_addr) pure @trusted 1658 { 1659 // PERF DMD 1660 static if (GDC_with_AVX) 1661 { 1662 return cast(__m256i) __builtin_ia32_loaddqu256(cast(const(char)*) mem_addr); 1663 } 1664 else static if (LDC_with_optimizations) 1665 { 1666 return loadUnaligned!(__m256i)(cast(long*)mem_addr); 1667 } 1668 else 1669 { 1670 const(long)* p = cast(const(long)*)mem_addr; 1671 long4 r; 1672 r.ptr[0] = p[0]; 1673 r.ptr[1] = p[1]; 1674 r.ptr[2] = p[2]; 1675 r.ptr[3] = p[3]; 1676 return r; 1677 } 1678 } 1679 unittest 1680 { 1681 align(16) int[8] correct = [-1, 2, -3, 4, 9, -7, 8, -6]; 1682 int8 A = cast(int8) _mm256_loadu_si256(cast(__m256i*) correct.ptr); 1683 assert(A.array == correct); 1684 } 1685 1686 /// Load 256-bits of integer data from memory. `mem_addr` must be aligned on a 1687 /// 32-byte boundary or a general-protection exception may be generated. 
1688 __m256i _mm256_load_si256 (const(void)* mem_addr) pure @system 1689 { 1690 return *cast(__m256i*)mem_addr; 1691 } 1692 unittest 1693 { 1694 static immutable align(64) long[4] correct = [1, -2, long.min, long.max]; 1695 __m256i A = _mm256_load_si256(correct.ptr); 1696 assert(A.array == correct); 1697 } 1698 1699 /// Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) 1700 /// from memory. `mem_addr` does not need to be aligned on any particular boundary. 1701 __m256d _mm256_loadu_pd (const(void)* mem_addr) pure @system 1702 { 1703 // PERF DMD 1704 static if (GDC_with_AVX) 1705 { 1706 return __builtin_ia32_loadupd256 ( cast(const(double)*) mem_addr); 1707 } 1708 else static if (LDC_with_optimizations) 1709 { 1710 return loadUnaligned!(__m256d)(cast(double*)mem_addr); 1711 } 1712 else 1713 { 1714 const(double)* p = cast(const(double)*)mem_addr; 1715 double4 r; 1716 r.ptr[0] = p[0]; 1717 r.ptr[1] = p[1]; 1718 r.ptr[2] = p[2]; 1719 r.ptr[3] = p[3]; 1720 return r; 1721 } 1722 } 1723 unittest 1724 { 1725 double[4] correct = [1.0, -2.0, 0.0, 768.5]; 1726 __m256d A = _mm256_loadu_pd(correct.ptr); 1727 assert(A.array == correct); 1728 } 1729 1730 /// Load 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from memory. 1731 /// `mem_addr` does not need to be aligned on any particular boundary. 1732 __m256 _mm256_loadu_ps (const(float)* mem_addr) pure @system 1733 { 1734 // PERF DMD 1735 static if (GDC_with_AVX) 1736 { 1737 return __builtin_ia32_loadups256 ( cast(const(float)*) mem_addr); 1738 } 1739 else static if (LDC_with_optimizations) 1740 { 1741 return loadUnaligned!(__m256)(cast(float*)mem_addr); 1742 } 1743 else 1744 { 1745 const(float)* p = cast(const(float)*)mem_addr; 1746 float8 r = void; 1747 r.ptr[0] = p[0]; 1748 r.ptr[1] = p[1]; 1749 r.ptr[2] = p[2]; 1750 r.ptr[3] = p[3]; 1751 r.ptr[4] = p[4]; 1752 r.ptr[5] = p[5]; 1753 r.ptr[6] = p[6]; 1754 r.ptr[7] = p[7]; 1755 return r; 1756 } 1757 } 1758 unittest 1759 { 1760 align(32) float[10] correct = [0.0f, 1, 2, 3, 4, 5, 6, 7, 8, 9]; 1761 __m256 A = _mm256_loadu_ps(&correct[1]); 1762 assert(A.array == correct[1..9]); 1763 } 1764 1765 /// Load two 128-bit values (composed of 4 packed single-precision (32-bit) floating-point 1766 /// elements) from memory, and combine them into a 256-bit value. 1767 /// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. 1768 __m256 _mm256_loadu2_m128 (const(float)* hiaddr, const(float)* loaddr) pure @system 1769 { 1770 // Note: no particular instruction for this in x86. 1771 return _mm256_set_m128(_mm_loadu_ps(hiaddr), _mm_loadu_ps(loaddr)); 1772 } 1773 unittest 1774 { 1775 align(32) float[6] A = [4.5f, 2, 8, 97, -1, 3]; 1776 align(32) float[6] B = [6.5f, 3, 9, 98, -2, 4]; 1777 __m256 R = _mm256_loadu2_m128(&B[1], &A[1]); 1778 float[8] correct = [2.0f, 8, 97, -1, 3, 9, 98, -2]; 1779 assert(R.array == correct); 1780 } 1781 1782 /// Load two 128-bit values (composed of 2 packed double-precision (64-bit) floating-point 1783 /// elements) from memory, and combine them into a 256-bit value. 1784 /// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. 1785 __m256d _mm256_loadu2_m128d (const(double)* hiaddr, const(double)* loaddr) pure @system 1786 { 1787 // Note: no particular instruction for this in x86. 
1788 return _mm256_set_m128d(_mm_loadu_pd(hiaddr), _mm_loadu_pd(loaddr)); 1789 } 1790 unittest 1791 { 1792 align(32) double[4] A = [4.5f, 2, 8, 97]; 1793 align(32) double[4] B = [6.5f, 3, 9, 98]; 1794 __m256d R = _mm256_loadu2_m128d(&B[1], &A[1]); 1795 double[4] correct = [2.0, 8, 3, 9]; 1796 assert(R.array == correct); 1797 } 1798 1799 /// Load two 128-bit values (composed of integer data) from memory, and combine them into a 1800 /// 256-bit value. `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. 1801 __m256i _mm256_loadu2_m128i (const(__m128i)* hiaddr, const(__m128i)* loaddr) pure @trusted 1802 { 1803 // Note: no particular instruction for this in x86. 1804 return _mm256_set_m128i(_mm_loadu_si128(hiaddr), _mm_loadu_si128(loaddr)); 1805 } 1806 unittest 1807 { 1808 align(32) long[4] A = [5, 2, 8, 97]; 1809 align(32) long[4] B = [6, 3, 9, 98]; 1810 __m256i R = _mm256_loadu2_m128i(cast(const(__m128i)*) &B[1], cast(const(__m128i)*) &A[1]); 1811 long[4] correct = [2, 8, 3, 9]; 1812 assert(R.array == correct); 1813 } 1814 1815 version(DigitalMars) 1816 { 1817 // this avoids a bug with DMD < 2.099 -a x86 -O 1818 private enum bool maskLoadWorkaroundDMD = (__VERSION__ < 2099); 1819 } 1820 else 1821 { 1822 private enum bool maskLoadWorkaroundDMD = false; 1823 } 1824 1825 /// Load packed double-precision (64-bit) floating-point elements from memory using `mask` 1826 /// (elements are zeroed out when the high bit of the corresponding element is not set). 1827 /// Note: emulating that instruction isn't efficient, since it needs to perform memory access 1828 /// only when needed. 1829 /// See: "Note about mask load/store" to know why you must address valid memory only. 1830 __m128d _mm_maskload_pd (const(double)* mem_addr, __m128i mask) /* pure */ @system 1831 { 1832 // PERF DMD 1833 static if (LDC_with_AVX) 1834 { 1835 // MAYDO report that the builtin is impure 1836 return __builtin_ia32_maskloadpd(mem_addr, cast(long2)mask); 1837 } 1838 else static if (GDC_with_AVX) 1839 { 1840 return __builtin_ia32_maskloadpd(cast(double2*)mem_addr, cast(long2)mask); 1841 } 1842 else 1843 { 1844 __m128d a = _mm_loadu_pd(mem_addr); 1845 __m128d zero = _mm_setzero_pd(); 1846 return _mm_blendv_pd(zero, a, cast(double2)mask); 1847 } 1848 } 1849 unittest 1850 { 1851 static if (!maskLoadWorkaroundDMD) 1852 { 1853 double[2] A = [7.5, 1]; 1854 double2 B = _mm_maskload_pd(A.ptr, _mm_setr_epi64(-1, 1)); 1855 double[2] correct = [7.5, 0]; 1856 assert(B.array == correct); 1857 } 1858 } 1859 1860 /// Load packed double-precision (64-bit) floating-point elements from memory using `mask` 1861 /// (elements are zeroed out when the high bit of the corresponding element is not set). 1862 /// See: "Note about mask load/store" to know why you must address valid memory only. 
1863 __m256d _mm256_maskload_pd (const(double)* mem_addr, __m256i mask) /*pure*/ @system 1864 { 1865 // PERF DMD 1866 static if (LDC_with_AVX) 1867 { 1868 // MAYDO that the builtin is impure 1869 return __builtin_ia32_maskloadpd256(mem_addr, mask); 1870 } 1871 else static if (GDC_with_AVX) 1872 { 1873 return __builtin_ia32_maskloadpd256(cast(double4*)mem_addr, mask); 1874 } 1875 else 1876 { 1877 __m256d a = _mm256_loadu_pd(mem_addr); 1878 __m256d zero = _mm256_setzero_pd(); 1879 return _mm256_blendv_pd(zero, a, cast(double4)mask); 1880 } 1881 } 1882 unittest 1883 { 1884 static if (!maskLoadWorkaroundDMD) 1885 { 1886 double[4] A = [7.5, 1, 2, 3]; 1887 double4 B = _mm256_maskload_pd(A.ptr, _mm256_setr_epi64(1, -1, -1, 1)); 1888 double[4] correct = [0.0, 1, 2, 0]; 1889 assert(B.array == correct); 1890 } 1891 } 1892 1893 /// Load packed single-precision (32-bit) floating-point elements from memory using mask (elements 1894 /// are zeroed out when the high bit of the corresponding element is not set). 1895 /// Warning: See "Note about mask load/store" to know why you must address valid memory only. 1896 __m128 _mm_maskload_ps (const(float)* mem_addr, __m128i mask) /* pure */ @system 1897 { 1898 // PERF DMD 1899 static if (LDC_with_AVX) 1900 { 1901 // MAYDO report that the builtin is impure 1902 return __builtin_ia32_maskloadps(mem_addr, mask); 1903 } 1904 else static if (GDC_with_AVX) 1905 { 1906 return __builtin_ia32_maskloadps(cast(float4*)mem_addr, mask); 1907 } 1908 else 1909 { 1910 __m128 a = _mm_loadu_ps(mem_addr); 1911 __m128 zero = _mm_setzero_ps(); 1912 return _mm_blendv_ps(zero, a, cast(float4)mask); 1913 } 1914 } 1915 unittest 1916 { 1917 static if (!maskLoadWorkaroundDMD) 1918 { 1919 float[4] A = [7.5f, 1, 2, 3]; 1920 float4 B = _mm_maskload_ps(A.ptr, _mm_setr_epi32(1, -1, -1, 1)); // can address invalid memory with mask load and writes! 1921 float[4] correct = [0.0f, 1, 2, 0]; 1922 assert(B.array == correct); 1923 } 1924 } 1925 1926 /// Load packed single-precision (32-bit) floating-point elements from memory using `mask` 1927 /// (elements are zeroed out when the high bit of the corresponding element is not set). 1928 /// Note: emulating that instruction isn't efficient, since it needs to perform memory access 1929 /// only when needed. 1930 /// See: "Note about mask load/store" to know why you must address valid memory only. 1931 __m256 _mm256_maskload_ps (const(float)* mem_addr, __m256i mask) /*pure*/ @system 1932 { 1933 // PERF DMD 1934 static if (LDC_with_AVX) 1935 { 1936 // MAYDO that the builtin is impure 1937 return __builtin_ia32_maskloadps256(mem_addr, cast(int8)mask); 1938 } 1939 else static if (GDC_with_AVX) 1940 { 1941 return __builtin_ia32_maskloadps256(cast(float8*)mem_addr, cast(int8)mask); 1942 } 1943 else 1944 { 1945 __m256 a = _mm256_loadu_ps(mem_addr); 1946 __m256 zero = _mm256_setzero_ps(); 1947 return _mm256_blendv_ps(zero, a, cast(float8)mask); 1948 } 1949 } 1950 unittest 1951 { 1952 float[8] A = [1, 7.5f, 1, 2, 3, 4, 5, 6]; 1953 __m256i M = _mm256_setr_epi32(1, -1, 1, -1, 1, -1, -1, 1); 1954 float8 B = _mm256_maskload_ps(A.ptr, M); 1955 float[8] correct = [0.0f, 7.5f, 0, 2, 0, 4, 5, 0]; 1956 assert(B.array == correct); 1957 } 1958 1959 /// Store packed double-precision (64-bit) floating-point elements from `a` into memory using `mask`. 1960 /// Note: emulating that instruction isn't efficient, since it needs to perform memory access 1961 /// only when needed. 1962 /// See: "Note about mask load/store" to know why you must address valid memory only. 
1963 void _mm_maskstore_pd (double * mem_addr, __m128i mask, __m128d a) /* pure */ @system 1964 { 1965 // PERF DMD 1966 static if (LDC_with_AVX) 1967 { 1968 // MAYDO that the builtin is impure 1969 __builtin_ia32_maskstorepd(mem_addr, cast(long2)mask, a); 1970 } 1971 else static if (GDC_with_AVX) 1972 { 1973 __builtin_ia32_maskstorepd(cast(double2*)mem_addr, cast(long2)mask, a); 1974 } 1975 else 1976 { 1977 __m128d source = _mm_loadu_pd(mem_addr); 1978 __m128d r = _mm_blendv_pd(source, a, cast(double2) mask); 1979 _mm_storeu_pd(mem_addr, r); 1980 } 1981 } 1982 unittest 1983 { 1984 double[2] A = [0.0, 1.0]; 1985 __m128i M = _mm_setr_epi64(-1, 0); 1986 __m128d B = _mm_setr_pd(2.0, 3.0); 1987 _mm_maskstore_pd(A.ptr, M, B); 1988 double[2] correct = [2.0, 1.0]; 1989 assert(A == correct); 1990 } 1991 1992 1993 /// Store packed double-precision (64-bit) floating-point elements from `a` into memory using `mask`. 1994 /// See: "Note about mask load/store" to know why you must address valid memory only. 1995 static if (!llvm256BitStackWorkaroundIn32BitX86) 1996 { 1997 void _mm256_maskstore_pd (double * mem_addr, __m256i mask, __m256d a) /* pure */ @system 1998 { 1999 // PERF DMD 2000 static if (LDC_with_AVX) 2001 { 2002 // MAYDO that the builtin is impure 2003 __builtin_ia32_maskstorepd256(mem_addr, cast(long4)mask, a); 2004 } 2005 else static if (GDC_with_AVX) 2006 { 2007 __builtin_ia32_maskstorepd256(cast(double4*)mem_addr, cast(long4)mask, a); 2008 } 2009 else 2010 { 2011 __m256d source = _mm256_loadu_pd(mem_addr); 2012 __m256d r = _mm256_blendv_pd(source, a, cast(double4) mask); 2013 _mm256_storeu_pd(mem_addr, r); 2014 } 2015 } 2016 unittest 2017 { 2018 double[4] A = [0.0, 1, 2, 3]; 2019 __m256i M = _mm256_setr_epi64x(-9, 0, -1, 0); 2020 __m256d B = _mm256_setr_pd(2, 3, 4, 5); 2021 _mm256_maskstore_pd(A.ptr, M, B); 2022 double[4] correct = [2.0, 1, 4, 3]; 2023 assert(A == correct); 2024 } 2025 } 2026 2027 /// Store packed single-precision (32-bit) floating-point elements from `a` into memory using `mask`. 2028 /// Note: emulating that instruction isn't efficient, since it needs to perform memory access 2029 /// only when needed. 2030 /// See: "Note about mask load/store" to know why you must address valid memory only. 2031 void _mm_maskstore_ps (float * mem_addr, __m128i mask, __m128 a) /* pure */ @system 2032 { 2033 // PERF DMD 2034 static if (LDC_with_AVX) 2035 { 2036 // MAYDO report that the builtin is impure 2037 __builtin_ia32_maskstoreps(mem_addr, mask, a); 2038 } 2039 else static if (GDC_with_AVX) 2040 { 2041 __builtin_ia32_maskstoreps(cast(float4*)mem_addr, mask, a); 2042 } 2043 else 2044 { 2045 __m128 source = _mm_loadu_ps(mem_addr); 2046 __m128 r = _mm_blendv_ps(source, a, cast(float4) mask); 2047 _mm_storeu_ps(mem_addr, r); 2048 } 2049 } 2050 unittest 2051 { 2052 float[4] A = [0.0f, 1, 2, 6]; 2053 __m128i M = _mm_setr_epi32(-1, 0, -1, 0); 2054 __m128 B = _mm_setr_ps(2, 3, 4, 5); 2055 _mm_maskstore_ps(A.ptr, M, B); 2056 float[4] correct = [2.0f, 1, 4, 6]; 2057 assert(A == correct); 2058 } 2059 2060 static if (!llvm256BitStackWorkaroundIn32BitX86) 2061 { 2062 /// Store packed single-precision (32-bit) floating-point elements from `a` into memory using `mask`. 2063 /// See: "Note about mask load/store" to know why you must address valid memory only. 
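/// Example, as a minimal sketch: only lanes whose mask element has its sign bit set are written,
/// the other memory locations keep their previous values.
/// ---
/// float[8] buf = [0.0f, 0, 0, 0, 0, 0, 0, 0];
/// _mm256_maskstore_ps(buf.ptr, _mm256_setr_epi32(-1, 0, 0, 0, 0, 0, 0, -1), _mm256_set1_ps(9.0f));
/// // buf is now [9, 0, 0, 0, 0, 0, 0, 9]
/// ---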
    void _mm256_maskstore_ps (float * mem_addr, __m256i mask, __m256 a) /* pure */ @system
    {
        // PERF DMD
        static if (LDC_with_AVX)
        {
            // MAYDO report that the builtin is impure
            __builtin_ia32_maskstoreps256(mem_addr, cast(int8)mask, a);
        }
        else static if (GDC_with_AVX)
        {
            __builtin_ia32_maskstoreps256(cast(float8*)mem_addr, cast(int8)mask, a);
        }
        else
        {
            __m256 source = _mm256_loadu_ps(mem_addr);
            __m256 r = _mm256_blendv_ps(source, a, cast(float8) mask);
            _mm256_storeu_ps(mem_addr, r);
        }
    }
    unittest
    {
        float[8] A = [0.0f, 0, 1, 2, 3, 4, 5, 7];
        __m256i M = _mm256_setr_epi32( 0, -1, 0, -1, 0, -1, -1, 0);
        __m256 B = _mm256_set1_ps(6.0f);
        _mm256_maskstore_ps(A.ptr, M, B);
        float[8] correct = [0.0f, 6, 1, 6, 3, 6, 6, 7];
        assert(A == correct);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return
/// packed maximum values.
__m256d _mm256_max_pd (__m256d a, __m256d b) pure @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_maxpd256(a, b);
    }
    else
    {
        // LDC: becomes good in -O2
        // PERF: GDC without AVX
        a.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0];
        a.ptr[1] = (a.array[1] > b.array[1]) ? a.array[1] : b.array[1];
        a.ptr[2] = (a.array[2] > b.array[2]) ? a.array[2] : b.array[2];
        a.ptr[3] = (a.array[3] > b.array[3]) ? a.array[3] : b.array[3];
        return a;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(4.0, 1.0, -9.0, double.infinity);
    __m256d B = _mm256_setr_pd(1.0, 8.0,  0.0, 100000.0);
    __m256d M = _mm256_max_pd(A, B);
    double[4] correct =       [4.0, 8.0,  0.0, double.infinity];
    assert(M.array == correct);
}

/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b`, and return
/// packed maximum values.
__m256 _mm256_max_ps (__m256 a, __m256 b) pure @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_maxps256(a, b);
    }
    else
    {
        // LDC: becomes good in -O2, but looks brittle.
        // PERF GDC without AVX
        a.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0];
        a.ptr[1] = (a.array[1] > b.array[1]) ? a.array[1] : b.array[1];
        a.ptr[2] = (a.array[2] > b.array[2]) ? a.array[2] : b.array[2];
        a.ptr[3] = (a.array[3] > b.array[3]) ? a.array[3] : b.array[3];
        a.ptr[4] = (a.array[4] > b.array[4]) ? a.array[4] : b.array[4];
        a.ptr[5] = (a.array[5] > b.array[5]) ? a.array[5] : b.array[5];
        a.ptr[6] = (a.array[6] > b.array[6]) ? a.array[6] : b.array[6];
        a.ptr[7] = (a.array[7] > b.array[7]) ? a.array[7] : b.array[7];
        return a;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(4.0, 1.0, -9.0, float.infinity, 1, 2, 3, 4);
    __m256 B = _mm256_setr_ps(1.0, 8.0,  0.0, 100000.0f     , 4, 3, 2, 1);
    __m256 M = _mm256_max_ps(A, B);
    float[8] correct =       [4.0, 8.0,  0.0, float.infinity, 4, 3, 3, 4];
    assert(M.array == correct);
}

/// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return
/// packed minimum values.
__m256d _mm256_min_pd (__m256d a, __m256d b) pure @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_minpd256(a, b);
    }
    else
    {
        // LDC: becomes good in -O2
        // PERF: GDC without AVX
        a.ptr[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
        a.ptr[1] = (a.array[1] < b.array[1]) ? a.array[1] : b.array[1];
        a.ptr[2] = (a.array[2] < b.array[2]) ? a.array[2] : b.array[2];
        a.ptr[3] = (a.array[3] < b.array[3]) ? a.array[3] : b.array[3];
        return a;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(4.0, 1.0, -9.0, double.infinity);
    __m256d B = _mm256_setr_pd(1.0, 8.0,  0.0, 100000.0);
    __m256d M = _mm256_min_pd(A, B);
    double[4] correct =       [1.0, 1.0, -9.0, 100000.0];
    assert(M.array == correct);
}

/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b`, and return
/// packed minimum values.
__m256 _mm256_min_ps (__m256 a, __m256 b) pure @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_minps256(a, b);
    }
    else
    {
        // LDC: becomes good in -O2, but looks brittle.
        // PERF GDC without AVX
        a.ptr[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
        a.ptr[1] = (a.array[1] < b.array[1]) ? a.array[1] : b.array[1];
        a.ptr[2] = (a.array[2] < b.array[2]) ? a.array[2] : b.array[2];
        a.ptr[3] = (a.array[3] < b.array[3]) ? a.array[3] : b.array[3];
        a.ptr[4] = (a.array[4] < b.array[4]) ? a.array[4] : b.array[4];
        a.ptr[5] = (a.array[5] < b.array[5]) ? a.array[5] : b.array[5];
        a.ptr[6] = (a.array[6] < b.array[6]) ? a.array[6] : b.array[6];
        a.ptr[7] = (a.array[7] < b.array[7]) ? a.array[7] : b.array[7];
        return a;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(4.0, 1.0, -9.0, float.infinity, 1, 2, 3, 4);
    __m256 B = _mm256_setr_ps(1.0, 8.0,  0.0, 100000.0f     , 4, 3, 2, 1);
    __m256 M = _mm256_min_ps(A, B);
    float[8] correct =       [1.0, 1.0, -9.0, 100000.0f     , 1, 2, 2, 1];
    assert(M.array == correct);
}

/// Duplicate even-indexed double-precision (64-bit) floating-point elements from `a`.
__m256d _mm256_movedup_pd (__m256d a) @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_movddup256 (a);
    }
    else
    {
        a.ptr[1] = a.array[0];
        a.ptr[3] = a.array[2];
        return a;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(1.0, 2, 3, 4);
    A = _mm256_movedup_pd(A);
    double[4] correct = [1.0, 1, 3, 3];
    assert(A.array == correct);
}

/// Duplicate odd-indexed single-precision (32-bit) floating-point elements from `a`.
__m256 _mm256_movehdup_ps (__m256 a) @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_movshdup256 (a);
    }
    else
    {
        a.ptr[0] = a.array[1];
        a.ptr[2] = a.array[3];
        a.ptr[4] = a.array[5];
        a.ptr[6] = a.array[7];
        return a;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(1.0f, 2, 3, 4, 5, 6, 7, 8);
    A = _mm256_movehdup_ps(A);
    float[8] correct = [2.0, 2, 4, 4, 6, 6, 8, 8];
    assert(A.array == correct);
}

/// Duplicate even-indexed single-precision (32-bit) floating-point elements from `a`.
2263 __m256 _mm256_moveldup_ps (__m256 a) @trusted 2264 { 2265 // PERF DMD 2266 static if (GDC_with_AVX) 2267 { 2268 return __builtin_ia32_movsldup256 (a); 2269 } 2270 else 2271 { 2272 a.ptr[1] = a.array[0]; 2273 a.ptr[3] = a.array[2]; 2274 a.ptr[5] = a.array[4]; 2275 a.ptr[7] = a.array[6]; 2276 return a; 2277 } 2278 } 2279 unittest 2280 { 2281 __m256 A = _mm256_setr_ps(1.0f, 2, 3, 4, 5, 6, 7, 8); 2282 A = _mm256_moveldup_ps(A); 2283 float[8] correct = [1.0, 1, 3, 3, 5, 5, 7, 7]; 2284 assert(A.array == correct); 2285 } 2286 2287 /// Set each bit of result mask based on the most significant bit of the corresponding packed 2288 /// double-precision (64-bit) floating-point element in `a`. 2289 int _mm256_movemask_pd (__m256d a) @safe 2290 { 2291 // PERF DMD 2292 static if (GDC_or_LDC_with_AVX) 2293 { 2294 return __builtin_ia32_movmskpd256(a); 2295 } 2296 else static if (LDC_with_SSE2) 2297 { 2298 // this doesn't benefit GDC, and not clear for arm64. 2299 __m128d A_lo = _mm256_extractf128_pd!0(a); 2300 __m128d A_hi = _mm256_extractf128_pd!1(a); 2301 2302 return (_mm_movemask_pd(A_hi) << 2) | _mm_movemask_pd(A_lo); 2303 } 2304 else 2305 { 2306 // Fortunately, branchless on arm64 2307 long4 lv = cast(long4)a; 2308 int r = 0; 2309 if (lv.array[0] < 0) r += 1; 2310 if (lv.array[1] < 0) r += 2; 2311 if (lv.array[2] < 0) r += 4; 2312 if (lv.array[3] < 0) r += 8; 2313 return r; 2314 } 2315 } 2316 unittest 2317 { 2318 __m256d A = _mm256_setr_pd(-1, -double.infinity, 0, -1); 2319 assert(_mm256_movemask_pd(A) == 1 + 2 + 8); 2320 } 2321 2322 /// Set each bit of mask result based on the most significant bit of the corresponding packed 2323 /// single-precision (32-bit) floating-point element in `a`. 2324 int _mm256_movemask_ps (__m256 a) @system 2325 { 2326 // PERF DMD 2327 // PERF GDC without AVX 2328 static if (GDC_or_LDC_with_AVX) 2329 { 2330 return __builtin_ia32_movmskps256(a); 2331 } 2332 else version(LDC) 2333 { 2334 // this doesn't benefit GDC (unable to inline), but benefits both LDC with SSE2 and ARM64 2335 __m128 A_lo = _mm256_extractf128_ps!0(a); 2336 __m128 A_hi = _mm256_extractf128_ps!1(a); 2337 return (_mm_movemask_ps(A_hi) << 4) | _mm_movemask_ps(A_lo); 2338 } 2339 else 2340 { 2341 int8 lv = cast(int8)a; 2342 int r = 0; 2343 if (lv.array[0] < 0) r += 1; 2344 if (lv.array[1] < 0) r += 2; 2345 if (lv.array[2] < 0) r += 4; 2346 if (lv.array[3] < 0) r += 8; 2347 if (lv.array[4] < 0) r += 16; 2348 if (lv.array[5] < 0) r += 32; 2349 if (lv.array[6] < 0) r += 64; 2350 if (lv.array[7] < 0) r += 128; 2351 return r; 2352 } 2353 } 2354 unittest 2355 { 2356 __m256 A = _mm256_setr_ps(-1, -double.infinity, 0, -1, 1, double.infinity, -2, double.nan); 2357 assert(_mm256_movemask_ps(A) == 1 + 2 + 8 + 64); 2358 } 2359 2360 /// Multiply packed double-precision (64-bit) floating-point elements in `a` and `b`. 2361 __m256d _mm256_mul_pd (__m256d a, __m256d b) pure @safe 2362 { 2363 return a * b; 2364 } 2365 unittest 2366 { 2367 __m256d a = [-2.0, 1.5, -2.0, 1.5]; 2368 a = _mm256_mul_pd(a, a); 2369 assert(a.array == [4.0, 2.25, 4.0, 2.25]); 2370 } 2371 2372 /// Multiply packed single-precision (32-bit) floating-point elements in `a` and `b`. 
2373 __m256 _mm256_mul_ps (__m256 a, __m256 b) pure @safe 2374 { 2375 return a * b; 2376 } 2377 unittest 2378 { 2379 __m256 a = [1.5f, -2.0f, 3.0f, 1.0f, 1.5f, -2.0f, 3.0f, 1.0f]; 2380 a = _mm256_mul_ps(a, a); 2381 float[8] correct = [2.25f, 4.0f, 9.0f, 1.0f, 2.25f, 4.0f, 9.0f, 1.0f]; 2382 assert(a.array == correct); 2383 } 2384 2385 2386 /// Compute the bitwise NOT of 256 bits in `a`. #BONUS 2387 __m256i _mm256_not_si256 (__m256i a) pure @safe 2388 { 2389 return ~a; 2390 } 2391 unittest 2392 { 2393 __m256i A = _mm256_set1_epi64x(-748); 2394 long4 notA = cast(long4) _mm256_not_si256(A); 2395 int[4] correct = [747, 747, 747, 747]; 2396 assert(notA.array == correct); 2397 } 2398 2399 /// Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in `a` and `b`. 2400 __m256d _mm256_or_pd (__m256d a, __m256d b) pure @safe 2401 { 2402 return cast(__m256d)( cast(__m256i)a | cast(__m256i)b ); 2403 } 2404 2405 /// Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in `a` and `b`. 2406 __m256 _mm256_or_ps (__m256 a, __m256 b) pure @safe 2407 { 2408 return cast(__m256)( cast(__m256i)a | cast(__m256i)b ); 2409 } 2410 2411 /// Shuffle double-precision (64-bit) floating-point elements in `a` using the control in `imm8`. 2412 __m128d _mm_permute_pd(int imm8)(__m128d a) pure @trusted 2413 { 2414 static if (GDC_with_AVX) 2415 { 2416 return __builtin_ia32_vpermilpd(a, imm8 & 3); 2417 } 2418 else 2419 { 2420 // Shufflevector not particularly better for LDC here 2421 __m128d r; 2422 r.ptr[0] = a.array[imm8 & 1]; 2423 r.ptr[1] = a.array[(imm8 >> 1) & 1]; 2424 return r; 2425 } 2426 } 2427 unittest 2428 { 2429 __m128d A = _mm_setr_pd(5, 6); 2430 __m128d B = _mm_permute_pd!1(A); 2431 __m128d C = _mm_permute_pd!3(A); 2432 double[2] RB = [6, 5]; 2433 double[2] RC = [6, 6]; 2434 assert(B.array == RB); 2435 assert(C.array == RC); 2436 } 2437 2438 ///ditto 2439 __m256d _mm256_permute_pd(int imm8)(__m256d a) pure @trusted 2440 { 2441 // PERF DMD 2442 static if (GDC_with_AVX) 2443 { 2444 return __builtin_ia32_vpermilpd256(a, imm8 & 15); 2445 } 2446 else version(LDC) 2447 { 2448 return shufflevectorLDC!(double4, 2449 (imm8 >> 0) & 1, 2450 ( (imm8 >> 1) & 1), 2451 2 + ( (imm8 >> 2) & 1), 2452 2 + ( (imm8 >> 3) & 1) )(a, a); 2453 } 2454 else 2455 { 2456 __m256d r; 2457 r.ptr[0] = a.array[ imm8 & 1]; 2458 r.ptr[1] = a.array[(imm8 >> 1) & 1]; 2459 r.ptr[2] = a.array[2 + ((imm8 >> 2) & 1)]; 2460 r.ptr[3] = a.array[2 + ((imm8 >> 3) & 1)]; 2461 return r; 2462 } 2463 } 2464 unittest 2465 { 2466 __m256d A = _mm256_setr_pd(0.0, 1, 2, 3); 2467 __m256d R = _mm256_permute_pd!(1 + 4)(A); 2468 double[4] correct = [1.0, 0, 3, 2]; 2469 assert(R.array == correct); 2470 } 2471 2472 /// Shuffle single-precision (32-bit) floating-point elements in `a` using the control in `imm8`. 
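/// `imm8` packs four 2-bit source indices, one per destination lane (same encoding as `_MM_SHUFFLE`).
/// Example, as a minimal sketch:
/// ---
/// __m128 v = _mm_setr_ps(10.0f, 11, 12, 13);
/// __m128 r = _mm_permute_ps!27(v); // 27 = 0b00_01_10_11, so r = [13, 12, 11, 10]
/// ---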
__m128 _mm_permute_ps(int imm8)(__m128 a) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_vpermilps(a, cast(ubyte)imm8);
    }
    else version(LDC)
    {
        return shufflevectorLDC!(float4, (imm8 >> 0) & 3, (imm8 >> 2) & 3, (imm8 >> 4) & 3,
                                 (imm8 >> 6) & 3)(a, a);
    }
    else
    {
        // PERF: could use _mm_shuffle_ps, which is a superset,
        // when AVX isn't available
        __m128 r;
        r.ptr[0] = a.array[(imm8 >> 0) & 3];
        r.ptr[1] = a.array[(imm8 >> 2) & 3];
        r.ptr[2] = a.array[(imm8 >> 4) & 3];
        r.ptr[3] = a.array[(imm8 >> 6) & 3];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(0.0f, 1, 2, 3);
    __m128 R = _mm_permute_ps!(1 + 4 * 3 + 16 * 0 + 64 * 2)(A);
    float[4] correct = [1.0f, 3, 0, 2];
    assert(R.array == correct);
}

/// Shuffle single-precision (32-bit) floating-point elements in `a` within 128-bit lanes using
/// the control in `imm8`. The same shuffle is applied to the lower and the higher 128-bit lane.
__m256 _mm256_permute_ps(int imm8)(__m256 a) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_vpermilps256(a, cast(ubyte)imm8);
    }
    else version(LDC)
    {
        return shufflevectorLDC!(float8,
            (imm8 >> 0) & 3, (imm8 >> 2) & 3, (imm8 >> 4) & 3, (imm8 >> 6) & 3,
            4 + ((imm8 >> 0) & 3), 4 + ((imm8 >> 2) & 3), 4 + ((imm8 >> 4) & 3),
            4 + ((imm8 >> 6) & 3))(a, a);
    }
    else
    {
        __m256 r;
        r.ptr[0] = a.array[(imm8 >> 0) & 3];
        r.ptr[1] = a.array[(imm8 >> 2) & 3];
        r.ptr[2] = a.array[(imm8 >> 4) & 3];
        r.ptr[3] = a.array[(imm8 >> 6) & 3];
        r.ptr[4] = a.array[4 + ((imm8 >> 0) & 3)];
        r.ptr[5] = a.array[4 + ((imm8 >> 2) & 3)];
        r.ptr[6] = a.array[4 + ((imm8 >> 4) & 3)];
        r.ptr[7] = a.array[4 + ((imm8 >> 6) & 3)];
        return r;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(0.0f, 1, 2, 3, 4, 5, 6, 7);
    __m256 R = _mm256_permute_ps!(1 + 4 * 3 + 16 * 0 + 64 * 2)(A);
    float[8] correct = [1.0f, 3, 0, 2, 5, 7, 4, 6];
    assert(R.array == correct);
}

/// Shuffle 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements)
/// selected by `imm8` from `a` and `b`.
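/// Each 4-bit selector in `imm8` (bits 0-3 for the low half of the result, bits 4-7 for the high
/// half) picks a 128-bit half: values 0/1 take the low/high half of `a`, values 2/3 take the
/// low/high half of `b`, and setting bit 3 of a selector zeroes that half.
/// Example, as a minimal sketch:
/// ---
/// __m256d x = _mm256_setr_pd(0.0, 1, 2, 3);
/// __m256d y = _mm256_setr_pd(4.0, 5, 6, 7);
/// __m256d r = _mm256_permute2f128_pd!0x31(x, y); // r = [2, 3, 6, 7]
/// ---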
__m256d _mm256_permute2f128_pd(int imm8)(__m256d a, __m256d b) pure @safe
{
    return cast(__m256d) _mm256_permute2f128_si256!imm8(cast(__m256i)a, cast(__m256i)b);
}
///ditto
__m256 _mm256_permute2f128_ps(int imm8)(__m256 a, __m256 b) pure @safe
{
    return cast(__m256) _mm256_permute2f128_si256!imm8(cast(__m256i)a, cast(__m256i)b);
}
///ditto
__m256i _mm256_permute2f128_si256(int imm8)(__m256i a, __m256i b) pure @trusted
{
    static if (GDC_with_AVX)
    {
        return cast(__m256i) __builtin_ia32_vperm2f128_si256(cast(int8)a, cast(int8)b, cast(ubyte)imm8);
    }
    else
    {
        static __m128i SELECT4(int imm4)(__m256i a, __m256i b) pure @trusted
        {
            static assert(imm4 >= 0 && imm4 <= 15);
            static if (imm4 & 8)
            {
                return _mm_setzero_si128();
            }
            else static if ((imm4 & 2) == 0)
            {
                long2 r;
                enum int index = 2*(imm4 & 1);
                r.ptr[0] = a.array[index+0];
                r.ptr[1] = a.array[index+1];
                return cast(__m128i)r;
            }
            else
            {
                static assert( (imm4 & 2) != 0);
                long2 r;
                enum int index = 2*(imm4 & 1);
                r.ptr[0] = b.array[index+0];
                r.ptr[1] = b.array[index+1];
                return cast(__m128i)r;
            }
        }

        __m128i lo = SELECT4!(imm8 & 15)(a, b);
        __m128i hi = SELECT4!((imm8 >> 4) & 15)(a, b);
        return _mm256_set_m128i(hi, lo);
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(8.0, 1, 2, 3);
    __m256d B = _mm256_setr_pd(4.0, 5, 6, 7);
    __m256d R = _mm256_permute2f128_pd!(128 + 2)(A, B);
    double[4] correct = [4.0, 5.0, 0.0, 0.0];
    assert(R.array == correct);

    __m256d R2 = _mm256_permute2f128_pd!(3*16 + 1)(A, B);
    double[4] correct2 = [2.0, 3.0, 6.0, 7.0];
    assert(R2.array == correct2);
}

/// Shuffle double-precision (64-bit) floating-point elements in `a` using the control in `b`.
/// Warning: the selector is in bit 1, not bit 0, of each 64-bit element!
/// This is really not intuitive.
__m128d _mm_permutevar_pd(__m128d a, __m128i b) pure @trusted
{
    enum bool implementWithByteShuffle = GDC_with_SSSE3 || LDC_with_SSSE3 || LDC_with_ARM64;

    static if (GDC_or_LDC_with_AVX)
    {
        return cast(__m128d) __builtin_ia32_vpermilvarpd(a, cast(long2)b);
    }
    else static if (implementWithByteShuffle)
    {
        align(16) static immutable byte[16] mmAddBase_u8   = [0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7];
        align(16) static immutable byte[16] mmBroadcast_u8 = [0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8];
        int4 bi = cast(int4)b;
        long2 two;
        two = 2;
        bi = _mm_slli_epi64(cast(__m128i)( (cast(long2)bi) & two), 2);
        bi = _mm_shuffle_epi8(bi, *cast(__m128i*)mmBroadcast_u8.ptr);
        // bi is now [ind0 ind0 ind0 ind0 ind0 ind0 ind0 ind0 ind1 ind1 ind1 ind1 ind1 ind1 ind1 ind1]
        byte16 bytesIndices = cast(byte16)bi;
        bytesIndices = bytesIndices + *cast(byte16*)mmAddBase_u8.ptr;

        // which allows us to make a single _mm_shuffle_epi8
        return cast(__m128d) _mm_shuffle_epi8(cast(__m128i)a, cast(__m128i)bytesIndices);
    }
    else
    {
        // This isn't great on ARM64: TBL and TBX would fit the bill, but they only take 8-bit
        // indices, not 64-bit ones. SVE2 could probably do it with svtbx[_f64].
2640 long2 bl = cast(long2)b; 2641 __m128d r; 2642 r.ptr[0] = a.array[ (bl.array[0] & 2) >> 1]; 2643 r.ptr[1] = a.array[ (bl.array[1] & 2) >> 1]; 2644 return r; 2645 } 2646 } 2647 unittest 2648 { 2649 __m128d A = _mm_setr_pd(5, 6); 2650 __m128d B = _mm_permutevar_pd(A, _mm_setr_epi64(2, 1)); 2651 __m128d C = _mm_permutevar_pd(A, _mm_setr_epi64(1 + 2 + 4, 2)); 2652 // yup, this is super strange, it's actually taking bit 1 and not bit 0 of each 64-bit element 2653 double[2] RB = [6, 5]; 2654 double[2] RC = [6, 6]; 2655 assert(B.array == RB); 2656 assert(C.array == RC); 2657 } 2658 2659 ///ditto 2660 __m256d _mm256_permutevar_pd (__m256d a, __m256i b) pure @trusted 2661 { 2662 // Worth it: for GDC, in SSSE3+ 2663 // for LDC, all the time 2664 version(LDC) 2665 enum bool implementWithByteShuffle = true; 2666 else 2667 enum bool implementWithByteShuffle = GDC_with_SSSE3; 2668 2669 // PERF DMD 2670 static if (GDC_or_LDC_with_AVX) 2671 { 2672 return cast(__m256d) __builtin_ia32_vpermilvarpd256(a, cast(long4)b); 2673 } 2674 else static if (implementWithByteShuffle) 2675 { 2676 // because we don't have 256-bit vectors, split and use _mm_permutevar_ps 2677 __m128d a_lo = _mm256_extractf128_pd!0(a); 2678 __m128d a_hi = _mm256_extractf128_pd!1(a); 2679 __m128i b_lo = _mm256_extractf128_si256!0(b); 2680 __m128i b_hi = _mm256_extractf128_si256!1(b); 2681 __m128d r_lo = _mm_permutevar_pd(a_lo, b_lo); 2682 __m128d r_hi = _mm_permutevar_pd(a_hi, b_hi); 2683 return _mm256_set_m128d(r_hi, r_lo); 2684 } 2685 else 2686 { 2687 long4 bl = cast(long4)b; 2688 __m256d r; 2689 r.ptr[0] = a.array[ (bl.array[0] & 2) >> 1]; 2690 r.ptr[1] = a.array[ (bl.array[1] & 2) >> 1]; 2691 r.ptr[2] = a.array[2 + ((bl.array[2] & 2) >> 1)]; 2692 r.ptr[3] = a.array[2 + ((bl.array[3] & 2) >> 1)]; 2693 return r; 2694 } 2695 } 2696 unittest 2697 { 2698 __m256d A = _mm256_setr_pd(5, 6, 7, 8); 2699 __m256d B = _mm256_permutevar_pd(A, _mm256_setr_epi64(2, 1, 0, 2)); 2700 __m256d C = _mm256_permutevar_pd(A, _mm256_setr_epi64(1 + 2 + 4, 2, 2, 0)); 2701 // yup, this is super strange, it's actually taking bit 1 and not bit 0 of each 64-bit element 2702 double[4] RB = [6, 5, 7, 8]; 2703 double[4] RC = [6, 6, 8, 7]; 2704 assert(B.array == RB); 2705 assert(C.array == RC); 2706 } 2707 2708 /// Shuffle single-precision (32-bit) floating-point elements in `a` using the control in `b`. 
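/// Unlike `_mm_permutevar_pd`, the selector here is simply the low 2 bits of each 32-bit element
/// of `b`. Example, as a minimal sketch:
/// ---
/// __m128 v = _mm_setr_ps(10.0f, 11, 12, 13);
/// __m128 r = _mm_permutevar_ps(v, _mm_setr_epi32(3, 3, 0, 1)); // r = [13, 13, 10, 11]
/// ---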
2709 __m128 _mm_permutevar_ps (__m128 a, __m128i b) @trusted 2710 { 2711 // PERF DMD 2712 2713 enum bool implementWithByteShuffle = GDC_with_SSSE3 || LDC_with_SSSE3 || LDC_with_ARM64; 2714 2715 static if (GDC_or_LDC_with_AVX) 2716 { 2717 return cast(__m128) __builtin_ia32_vpermilvarps(a, cast(int4)b); 2718 } 2719 else static if (implementWithByteShuffle) 2720 { 2721 // This workaround is worth it: in GDC with SSSE3, in LDC with SSSE3, in ARM64 (neon) 2722 int4 bi = cast(int4)b; 2723 int4 three; 2724 three = 3; 2725 bi = _mm_slli_epi32(bi & three, 2); 2726 // bi is [ind0 0 0 0 ind1 0 0 0 ind2 0 0 0 ind3 0 0 0] 2727 bi = bi | _mm_slli_si128!1(bi); 2728 bi = bi | _mm_slli_si128!2(bi); 2729 // bi is now [ind0 ind0 ind0 ind0 ind1 ind1 ind1 ind1 ind2 ind2 ind2 ind2 ind3 ind3 ind3 ind3] 2730 byte16 bytesIndices = cast(byte16)bi; 2731 align(16) static immutable byte[16] mmAddBase_u8 = [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]; 2732 bytesIndices = bytesIndices + *cast(byte16*)mmAddBase_u8.ptr; 2733 2734 // which allows us to make a single _mm_shuffle_epi8 2735 return cast(__m128) _mm_shuffle_epi8(cast(__m128i)a, cast(__m128i)bytesIndices); 2736 } 2737 else 2738 { 2739 2740 int4 bi = cast(int4)b; 2741 __m128 r; 2742 r.ptr[0] = a.array[ (bi.array[0] & 3) ]; 2743 r.ptr[1] = a.array[ (bi.array[1] & 3) ]; 2744 r.ptr[2] = a.array[ (bi.array[2] & 3) ]; 2745 r.ptr[3] = a.array[ (bi.array[3] & 3) ]; 2746 return r; 2747 } 2748 } 2749 unittest 2750 { 2751 __m128 A = _mm_setr_ps(5, 6, 7, 8); 2752 __m128 B = _mm_permutevar_ps(A, _mm_setr_epi32(2, 1, 0, 2 + 4)); 2753 __m128 C = _mm_permutevar_ps(A, _mm_setr_epi32(2, 3 + 8, 1, 0)); 2754 float[4] RB = [7, 6, 5, 7]; 2755 float[4] RC = [7, 8, 6, 5]; 2756 assert(B.array == RB); 2757 assert(C.array == RC); 2758 } 2759 2760 ///ditto 2761 __m256 _mm256_permutevar_ps (__m256 a, __m256i b) @trusted 2762 { 2763 // In order to do those two, it is necessary to use _mm_shuffle_epi8 and reconstruct the integers afterwards. 
2764 enum bool implementWithByteShuffle = GDC_with_SSSE3 || LDC_with_SSSE3 || LDC_with_ARM64; 2765 2766 static if (GDC_or_LDC_with_AVX) 2767 { 2768 return __builtin_ia32_vpermilvarps256(a, cast(int8)b); 2769 } 2770 else static if (implementWithByteShuffle) 2771 { 2772 // because we don't have 256-bit vectors, split and use _mm_permutevar_ps 2773 __m128 a_lo = _mm256_extractf128_ps!0(a); 2774 __m128 a_hi = _mm256_extractf128_ps!1(a); 2775 __m128i b_lo = _mm256_extractf128_si256!0(b); 2776 __m128i b_hi = _mm256_extractf128_si256!1(b); 2777 __m128 r_lo = _mm_permutevar_ps(a_lo, b_lo); 2778 __m128 r_hi = _mm_permutevar_ps(a_hi, b_hi); 2779 return _mm256_set_m128(r_hi, r_lo); 2780 } 2781 else 2782 { 2783 int8 bi = cast(int8)b; 2784 __m256 r; 2785 r.ptr[0] = a.array[ (bi.array[0] & 3) ]; 2786 r.ptr[1] = a.array[ (bi.array[1] & 3) ]; 2787 r.ptr[2] = a.array[ (bi.array[2] & 3) ]; 2788 r.ptr[3] = a.array[ (bi.array[3] & 3) ]; 2789 r.ptr[4] = a.array[ 4 + (bi.array[4] & 3) ]; 2790 r.ptr[5] = a.array[ 4 + (bi.array[5] & 3) ]; 2791 r.ptr[6] = a.array[ 4 + (bi.array[6] & 3) ]; 2792 r.ptr[7] = a.array[ 4 + (bi.array[7] & 3) ]; 2793 return r; 2794 } 2795 } 2796 unittest 2797 { 2798 __m256 A = _mm256_setr_ps(1, 2, 3, 4, 5, 6, 7, 8); 2799 __m256 B = _mm256_permutevar_ps(A, _mm256_setr_epi32(2, 1, 0, 2, 3, 2, 1, 0)); 2800 __m256 C = _mm256_permutevar_ps(A, _mm256_setr_epi32(2, 3 + 8, 1, 0, 2, 3, 0, 1)); 2801 float[8] RB = [3.0f, 2, 1, 3, 8, 7, 6, 5]; 2802 float[8] RC = [3.0f, 4, 2, 1, 7, 8, 5, 6]; 2803 assert(B.array == RB); 2804 assert(C.array == RC); 2805 } 2806 2807 /// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements 2808 /// in `a`. The maximum relative error for this approximation is less than 1.5*2^-12. 2809 __m256 _mm256_rcp_ps (__m256 a) pure @trusted 2810 { 2811 // PERF DMD 2812 static if (GDC_or_LDC_with_AVX) 2813 { 2814 return __builtin_ia32_rcpps256(a); 2815 } 2816 else 2817 { 2818 a.ptr[0] = 1.0f / a.array[0]; 2819 a.ptr[1] = 1.0f / a.array[1]; 2820 a.ptr[2] = 1.0f / a.array[2]; 2821 a.ptr[3] = 1.0f / a.array[3]; 2822 a.ptr[4] = 1.0f / a.array[4]; 2823 a.ptr[5] = 1.0f / a.array[5]; 2824 a.ptr[6] = 1.0f / a.array[6]; 2825 a.ptr[7] = 1.0f / a.array[7]; 2826 return a; 2827 } 2828 } 2829 unittest 2830 { 2831 __m256 A = _mm256_setr_ps(2.34f, -70000.0f, 0.00001f, 345.5f, 9, -46, 1869816, 55583); 2832 __m256 groundTruth = _mm256_set1_ps(1.0f) / A; 2833 __m256 result = _mm256_rcp_ps(A); 2834 foreach(i; 0..8) 2835 { 2836 double relError = (cast(double)(groundTruth.array[i]) / result.array[i]) - 1; 2837 assert(abs_double(relError) < 0.00037); // 1.5*2^-12 is 0.00036621093 2838 } 2839 } 2840 2841 /// Round the packed double-precision (64-bit) floating-point elements in `a` using the 2842 /// rounding parameter, and store the results as packed double-precision floating-point elements. 
2843 /// Rounding is done according to the rounding[3:0] parameter, which can be one of: 2844 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions 2845 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions 2846 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions 2847 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions 2848 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE 2849 __m256d _mm256_round_pd(int rounding)(__m256d a) @trusted 2850 { 2851 // PERF DMD 2852 static if (GDC_with_AVX) 2853 { 2854 return __builtin_ia32_roundpd256(a, rounding); 2855 } 2856 else static if (LDC_with_AVX) 2857 { 2858 return __builtin_ia32_roundpd256(a, rounding); 2859 } 2860 else 2861 { 2862 static if (rounding & _MM_FROUND_CUR_DIRECTION) 2863 { 2864 // PERF: non-AVX x86, would probably be faster to convert those double at once to int64 2865 2866 __m128d A_lo = _mm256_extractf128_pd!0(a); 2867 __m128d A_hi = _mm256_extractf128_pd!1(a); 2868 2869 // Convert to 64-bit integers one by one 2870 long x0 = _mm_cvtsd_si64(A_lo); 2871 long x2 = _mm_cvtsd_si64(A_hi); 2872 A_lo.ptr[0] = A_lo.array[1]; 2873 A_hi.ptr[0] = A_hi.array[1]; 2874 long x1 = _mm_cvtsd_si64(A_lo); 2875 long x3 = _mm_cvtsd_si64(A_hi); 2876 2877 return _mm256_setr_pd(x0, x1, x2, x3); 2878 } 2879 else 2880 { 2881 version(GNU) pragma(inline, false); // this was required for SSE4.1 rounding, let it here 2882 2883 uint old = _MM_GET_ROUNDING_MODE(); 2884 _MM_SET_ROUNDING_MODE((rounding & 3) << 13); 2885 2886 __m128d A_lo = _mm256_extractf128_pd!0(a); 2887 __m128d A_hi = _mm256_extractf128_pd!1(a); 2888 2889 // Convert to 64-bit integers one by one 2890 long x0 = _mm_cvtsd_si64(A_lo); 2891 long x2 = _mm_cvtsd_si64(A_hi); 2892 A_lo.ptr[0] = A_lo.array[1]; 2893 A_hi.ptr[0] = A_hi.array[1]; 2894 long x1 = _mm_cvtsd_si64(A_lo); 2895 long x3 = _mm_cvtsd_si64(A_hi); 2896 2897 // Convert back to double to achieve the rounding 2898 // The problem is that a 64-bit double can't represent all the values 2899 // a 64-bit integer can (and vice-versa). So this function won't work for 2900 // large values. (FUTURE: what range exactly?) 2901 _MM_SET_ROUNDING_MODE(old); 2902 return _mm256_setr_pd(x0, x1, x2, x3); 2903 } 2904 } 2905 } 2906 unittest 2907 { 2908 // tested in other intrinsics 2909 } 2910 2911 /// Round the packed single-precision (32-bit) floating-point elements in `a` using the 2912 /// rounding parameter, and store the results as packed single-precision floating-point elements. 
2913 /// Rounding is done according to the rounding[3:0] parameter, which can be one of: 2914 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions 2915 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions 2916 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions 2917 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions 2918 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE 2919 __m256 _mm256_round_ps(int rounding)(__m256 a) @trusted 2920 { 2921 // PERF DMD 2922 static if (GDC_or_LDC_with_AVX) 2923 { 2924 return __builtin_ia32_roundps256(a, rounding); 2925 } 2926 else static if (GDC_or_LDC_with_SSE41) 2927 { 2928 // we can use _mm_round_ps 2929 __m128 lo = _mm256_extractf128_ps!0(a); 2930 __m128 hi = _mm256_extractf128_ps!1(a); 2931 __m128 ilo = _mm_round_ps!rounding(lo); // unfortunately _mm_round_ps isn't fast for arm64, so we avoid that in that case 2932 __m128 ihi = _mm_round_ps!rounding(hi); 2933 return _mm256_set_m128(ihi, ilo); 2934 } 2935 else 2936 { 2937 static if (rounding & _MM_FROUND_CUR_DIRECTION) 2938 { 2939 __m256i integers = _mm256_cvtps_epi32(a); 2940 return _mm256_cvtepi32_ps(integers); 2941 } 2942 else 2943 { 2944 version(LDC) pragma(inline, false); // else _MM_SET_ROUNDING_MODE and _mm_cvtps_epi32 gets shuffled 2945 uint old = _MM_GET_ROUNDING_MODE(); 2946 _MM_SET_ROUNDING_MODE((rounding & 3) << 13); 2947 scope(exit) _MM_SET_ROUNDING_MODE(old); 2948 2949 // Convert to 32-bit integers 2950 __m256i integers = _mm256_cvtps_epi32(a); 2951 2952 // Convert back to float to achieve the rounding 2953 // The problem is that a 32-float can't represent all the values 2954 // a 32-bit integer can (and vice-versa). So this function won't work for 2955 // large values. (FUTURE: what range exactly?) 2956 __m256 result = _mm256_cvtepi32_ps(integers); 2957 2958 return result; 2959 } 2960 } 2961 } 2962 unittest 2963 { 2964 // tested in other intrinsics 2965 } 2966 2967 2968 /// Compute the approximate reciprocal square root of packed single-precision (32-bit) 2969 /// floating-point elements in `a`. The maximum relative error for this approximation is less than 2970 /// 1.5*2^-12. 
2971 __m256 _mm256_rsqrt_ps (__m256 a) pure @trusted 2972 { 2973 static if (GDC_or_LDC_with_AVX) 2974 { 2975 return __builtin_ia32_rsqrtps256(a); 2976 } 2977 else version(LDC) 2978 { 2979 a[0] = 1.0f / llvm_sqrt(a[0]); 2980 a[1] = 1.0f / llvm_sqrt(a[1]); 2981 a[2] = 1.0f / llvm_sqrt(a[2]); 2982 a[3] = 1.0f / llvm_sqrt(a[3]); 2983 a[4] = 1.0f / llvm_sqrt(a[4]); 2984 a[5] = 1.0f / llvm_sqrt(a[5]); 2985 a[6] = 1.0f / llvm_sqrt(a[6]); 2986 a[7] = 1.0f / llvm_sqrt(a[7]); 2987 return a; 2988 } 2989 else 2990 { 2991 a.ptr[0] = 1.0f / sqrt(a.array[0]); 2992 a.ptr[1] = 1.0f / sqrt(a.array[1]); 2993 a.ptr[2] = 1.0f / sqrt(a.array[2]); 2994 a.ptr[3] = 1.0f / sqrt(a.array[3]); 2995 a.ptr[4] = 1.0f / sqrt(a.array[4]); 2996 a.ptr[5] = 1.0f / sqrt(a.array[5]); 2997 a.ptr[6] = 1.0f / sqrt(a.array[6]); 2998 a.ptr[7] = 1.0f / sqrt(a.array[7]); 2999 return a; 3000 } 3001 } 3002 unittest 3003 { 3004 __m256 A = _mm256_setr_ps(2.34f, 70000.0f, 0.00001f, 345.5f, 2.34f, 70000.0f, 0.00001f, 345.5f); 3005 __m256 groundTruth = _mm256_setr_ps(0.65372045f, 0.00377964473f, 316.227766f, 0.05379921937f, 3006 0.65372045f, 0.00377964473f, 316.227766f, 0.05379921937f); 3007 __m256 result = _mm256_rsqrt_ps(A); 3008 foreach(i; 0..8) 3009 { 3010 double relError = (cast(double)(groundTruth.array[i]) / result.array[i]) - 1; 3011 assert(abs_double(relError) < 0.00037); // 1.5*2^-12 is 0.00036621093 3012 } 3013 } 3014 3015 /// Set packed 16-bit integers with the supplied values. 3016 __m256i _mm256_set_epi16 (short e15, short e14, short e13, short e12, short e11, short e10, short e9, short e8, short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted 3017 { 3018 short16 r; // Note: = void would prevent GDC from inlining a constant short16... 3019 r.ptr[0] = e0; 3020 r.ptr[1] = e1; 3021 r.ptr[2] = e2; 3022 r.ptr[3] = e3; 3023 r.ptr[4] = e4; 3024 r.ptr[5] = e5; 3025 r.ptr[6] = e6; 3026 r.ptr[7] = e7; 3027 r.ptr[8] = e8; 3028 r.ptr[9] = e9; 3029 r.ptr[10] = e10; 3030 r.ptr[11] = e11; 3031 r.ptr[12] = e12; 3032 r.ptr[13] = e13; 3033 r.ptr[14] = e14; 3034 r.ptr[15] = e15; 3035 return cast(__m256i) r; 3036 } 3037 unittest 3038 { 3039 short16 A = cast(short16) _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 3040 7, 6, 5, 4, 3, 2, 1, 0); 3041 foreach(i; 0..16) 3042 assert(A.array[i] == i); 3043 } 3044 3045 /// Set packed 32-bit integers with the supplied values. 3046 __m256i _mm256_set_epi32 (int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0) pure @trusted 3047 { 3048 // Inlines a constant with GCC -O1, LDC -O2 3049 int8 r; // = void would prevent GCC from inlining a constant call 3050 r.ptr[0] = e0; 3051 r.ptr[1] = e1; 3052 r.ptr[2] = e2; 3053 r.ptr[3] = e3; 3054 r.ptr[4] = e4; 3055 r.ptr[5] = e5; 3056 r.ptr[6] = e6; 3057 r.ptr[7] = e7; 3058 return cast(__m256i)r; 3059 } 3060 unittest 3061 { 3062 int8 A = cast(int8) _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); 3063 foreach(i; 0..8) 3064 assert(A.array[i] == i); 3065 } 3066 3067 /// Set packed 64-bit integers with the supplied values. 3068 __m256i _mm256_set_epi64x (long e3, long e2, long e1, long e0) pure @trusted 3069 { 3070 long4 r = void; 3071 r.ptr[0] = e0; 3072 r.ptr[1] = e1; 3073 r.ptr[2] = e2; 3074 r.ptr[3] = e3; 3075 return r; 3076 } 3077 unittest 3078 { 3079 __m256i A = _mm256_set_epi64x(-1, 42, long.min, long.max); 3080 long[4] correct = [long.max, long.min, 42, -1]; 3081 assert(A.array == correct); 3082 } 3083 3084 ///ditto 3085 alias _mm256_set_epi64 = _mm256_set_epi64x; // #BONUS, not sure why this isn't in Intel Intrinsics API. 
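
// Minimal illustrative check that the #BONUS alias behaves exactly like _mm256_set_epi64x:
// the last argument lands in element 0.
unittest
{
    __m256i A = _mm256_set_epi64(3, 2, 1, 0);
    long[4] correct = [0, 1, 2, 3];
    assert(A.array == correct);
}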

/// Set packed 8-bit integers with the supplied values.
__m256i _mm256_set_epi8 (byte e31, byte e30, byte e29, byte e28, byte e27, byte e26, byte e25, byte e24,
                         byte e23, byte e22, byte e21, byte e20, byte e19, byte e18, byte e17, byte e16,
                         byte e15, byte e14, byte e13, byte e12, byte e11, byte e10, byte e9, byte e8,
                         byte e7, byte e6, byte e5, byte e4, byte e3, byte e2, byte e1, byte e0)
{
    // Inline a constant call in GDC -O1 and LDC -O2
    align(32) byte[32] result = [ e0,  e1,  e2,  e3,  e4,  e5,  e6,  e7,
                                  e8,  e9,  e10, e11, e12, e13, e14, e15,
                                  e16, e17, e18, e19, e20, e21, e22, e23,
                                  e24, e25, e26, e27, e28, e29, e30, e31 ];
    return *cast(__m256i*)(result.ptr);
}
unittest
{
    byte32 R = cast(byte32) _mm256_set_epi8(-1, 0, 56, 127, -128, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7);
    byte[32] correct = [7, 6, 5, 4, 7, 6, 5, 4, 3, 2, 1, 0, 3, 2, 1, 0,
                        14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, -128, 127, 56, 0, -1];
    assert(R.array == correct);
}

/// Set packed `__m256` vector with the supplied values.
__m256 _mm256_set_m128 (__m128 hi, __m128 lo) pure @trusted
{
    // DMD PERF
    static if (GDC_with_AVX)
    {
        __m256 r = __builtin_ia32_ps256_ps(lo);
        return __builtin_ia32_vinsertf128_ps256(r, hi, 1);
    }
    else
    {
        __m256 r = void;
        r.ptr[0] = lo.array[0];
        r.ptr[1] = lo.array[1];
        r.ptr[2] = lo.array[2];
        r.ptr[3] = lo.array[3];
        r.ptr[4] = hi.array[0];
        r.ptr[5] = hi.array[1];
        r.ptr[6] = hi.array[2];
        r.ptr[7] = hi.array[3];
        return r;
    }

    /*
    // BUG, doesn't work if AVX vector is emulated, but SSE vector is not
    // See issue #108
    __m256 r = void;
    __m128* p = cast(__m128*)(&r);
    p[0] = lo;
    p[1] = hi;
    return r;
    */
}
unittest
{
    __m128 lo = _mm_setr_ps(1.0f, 2, 3, 4);
    __m128 hi = _mm_setr_ps(3.0f, 4, 5, 6);
    __m256 R = _mm256_set_m128(hi, lo);
    float[8] correct = [1.0f, 2, 3, 4, 3, 4, 5, 6];
    assert(R.array == correct);
}

/// Set packed `__m256d` vector with the supplied values.
__m256d _mm256_set_m128d (__m128d hi, __m128d lo) pure @trusted
{
    __m256d r = void;
    r.ptr[0] = lo.array[0];
    r.ptr[1] = lo.array[1];
    r.ptr[2] = hi.array[0];
    r.ptr[3] = hi.array[1];
    return r;
}
unittest
{
    __m128d lo = _mm_setr_pd(1.0, 2.0);
    __m128d hi = _mm_setr_pd(3.0, 4.0);
    __m256d R = _mm256_set_m128d(hi, lo);
    double[4] correct = [1.0, 2.0, 3.0, 4.0];
    assert(R.array == correct);
}

/// Set packed `__m256i` vector with the supplied values.
3170 __m256i _mm256_set_m128i (__m128i hi, __m128i lo) pure @trusted 3171 { 3172 // DMD PERF 3173 static if (GDC_with_AVX) 3174 { 3175 __m256i r = cast(long4) __builtin_ia32_si256_si (lo); 3176 return cast(long4) __builtin_ia32_vinsertf128_si256(cast(int8)r, hi, 1); 3177 } 3178 else 3179 { 3180 int8 r = void; 3181 r.ptr[0] = lo.array[0]; 3182 r.ptr[1] = lo.array[1]; 3183 r.ptr[2] = lo.array[2]; 3184 r.ptr[3] = lo.array[3]; 3185 r.ptr[4] = hi.array[0]; 3186 r.ptr[5] = hi.array[1]; 3187 r.ptr[6] = hi.array[2]; 3188 r.ptr[7] = hi.array[3]; 3189 return cast(long4)r; 3190 } 3191 } 3192 unittest 3193 { 3194 __m128i lo = _mm_setr_epi32( 1, 2, 3, 4); 3195 __m128i hi = _mm_set_epi32(-3, -4, -5, -6); 3196 int8 R = cast(int8)_mm256_set_m128i(hi, lo); 3197 int[8] correct = [1, 2, 3, 4, -6, -5, -4, -3]; 3198 assert(R.array == correct); 3199 } 3200 3201 /// Set packed double-precision (64-bit) floating-point elements with the supplied values. 3202 __m256d _mm256_set_pd (double e3, double e2, double e1, double e0) pure @trusted 3203 { 3204 __m256d r = void; 3205 r.ptr[0] = e0; 3206 r.ptr[1] = e1; 3207 r.ptr[2] = e2; 3208 r.ptr[3] = e3; 3209 return r; 3210 } 3211 unittest 3212 { 3213 __m256d A = _mm256_set_pd(3, 2, 1, 546); 3214 double[4] correct = [546.0, 1.0, 2.0, 3.0]; 3215 assert(A.array == correct); 3216 } 3217 3218 /// Set packed single-precision (32-bit) floating-point elements with the supplied values. 3219 __m256 _mm256_set_ps (float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0) pure @trusted 3220 { 3221 // PERF: see #102, use = void? 3222 __m256 r; 3223 r.ptr[0] = e0; 3224 r.ptr[1] = e1; 3225 r.ptr[2] = e2; 3226 r.ptr[3] = e3; 3227 r.ptr[4] = e4; 3228 r.ptr[5] = e5; 3229 r.ptr[6] = e6; 3230 r.ptr[7] = e7; 3231 return r; 3232 } 3233 unittest 3234 { 3235 __m256 A = _mm256_set_ps(3, 2, 1, 546.0f, -1.25f, -2, -3, 0); 3236 float[8] correct = [0, -3, -2, -1.25f, 546.0f, 1.0, 2.0, 3.0]; 3237 assert(A.array == correct); 3238 } 3239 3240 /// Broadcast 16-bit integer `a` to all elements of the return value. 3241 __m256i _mm256_set1_epi16 (short a) pure @trusted 3242 { 3243 version(DigitalMars) 3244 { 3245 // workaround https://issues.dlang.org/show_bug.cgi?id=21469 3246 // It used to ICE, after that the codegen was just wrong. 3247 // No issue anymore in DMD 2.101, we can eventually remove that 3248 static if (__VERSION__ < 2101) 3249 { 3250 short16 v = a; 3251 return cast(__m256i) v; 3252 } 3253 else 3254 { 3255 pragma(inline, true); 3256 return cast(__m256i)(short16(a)); 3257 } 3258 } 3259 else 3260 { 3261 pragma(inline, true); 3262 return cast(__m256i)(short16(a)); 3263 } 3264 } 3265 unittest 3266 { 3267 short16 a = cast(short16) _mm256_set1_epi16(31); 3268 for (int i = 0; i < 16; ++i) 3269 assert(a.array[i] == 31); 3270 } 3271 3272 /// Broadcast 32-bit integer `a` to all elements. 3273 __m256i _mm256_set1_epi32 (int a) pure @trusted 3274 { 3275 version(DigitalMars) 3276 { 3277 // No issue anymore in DMD 2.101, we can eventually remove that 3278 static if (__VERSION__ < 2101) 3279 { 3280 int8 v = a; 3281 return cast(__m256i) v; 3282 } 3283 else 3284 { 3285 pragma(inline, true); 3286 return cast(__m256i)(int8(a)); 3287 } 3288 } 3289 else 3290 { 3291 pragma(inline, true); 3292 return cast(__m256i)(int8(a)); 3293 } 3294 } 3295 unittest 3296 { 3297 int8 a = cast(int8) _mm256_set1_epi32(31); 3298 for (int i = 0; i < 8; ++i) 3299 assert(a.array[i] == 31); 3300 } 3301 3302 /// Broadcast 64-bit integer `a` to all elements of the return value. 
3303 __m256i _mm256_set1_epi64x (long a) 3304 { 3305 return cast(__m256i)(long4(a)); 3306 } 3307 unittest 3308 { 3309 long4 a = cast(long4) _mm256_set1_epi64x(-31); 3310 for (int i = 0; i < 4; ++i) 3311 assert(a.array[i] == -31); 3312 } 3313 ///ditto 3314 alias _mm256_set1_epi64 = _mm256_set1_epi64x; // #BONUS, not sure why this isn't in Intel Intrinsics API. 3315 3316 /// Broadcast 8-bit integer `a` to all elements of the return value. 3317 __m256i _mm256_set1_epi8 (byte a) pure @trusted 3318 { 3319 version(DigitalMars) // workaround https://issues.dlang.org/show_bug.cgi?id=21469 3320 { 3321 byte32 v = a; 3322 return cast(__m256i) v; 3323 } 3324 else 3325 { 3326 pragma(inline, true); 3327 return cast(__m256i)(byte32(a)); 3328 } 3329 } 3330 unittest 3331 { 3332 byte32 a = cast(byte32) _mm256_set1_epi8(31); 3333 for (int i = 0; i < 32; ++i) 3334 assert(a.array[i] == 31); 3335 } 3336 3337 /// Broadcast double-precision (64-bit) floating-point value `a` to all elements of the return value. 3338 __m256d _mm256_set1_pd (double a) pure @trusted 3339 { 3340 return __m256d(a); 3341 } 3342 unittest 3343 { 3344 double a = 464.21; 3345 double[4] correct = [a, a, a, a]; 3346 double4 A = cast(double4) _mm256_set1_pd(a); 3347 assert(A.array == correct); 3348 } 3349 3350 /// Broadcast single-precision (32-bit) floating-point value `a` to all elements of the return value. 3351 __m256 _mm256_set1_ps (float a) pure @trusted 3352 { 3353 return __m256(a); 3354 } 3355 unittest 3356 { 3357 float a = 464.21f; 3358 float[8] correct = [a, a, a, a, a, a, a, a]; 3359 float8 A = cast(float8) _mm256_set1_ps(a); 3360 assert(A.array == correct); 3361 } 3362 3363 /// Set packed 16-bit integers with the supplied values in reverse order. 3364 __m256i _mm256_setr_epi16 (short e15, short e14, short e13, short e12, short e11, short e10, short e9, short e8, 3365 short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted 3366 { 3367 short[16] result = [ e15, e14, e13, e12, e11, e10, e9, e8, 3368 e7, e6, e5, e4, e3, e2, e1, e0]; 3369 static if (GDC_with_AVX) 3370 { 3371 return cast(__m256i) __builtin_ia32_loaddqu256(cast(const(char)*) result.ptr); 3372 } 3373 else static if (LDC_with_optimizations) 3374 { 3375 return cast(__m256i)( loadUnaligned!(short16)(result.ptr) ); 3376 } 3377 else 3378 { 3379 short16 r; 3380 for(int n = 0; n < 16; ++n) 3381 r.ptr[n] = result[n]; 3382 return cast(__m256i)r; 3383 } 3384 } 3385 unittest 3386 { 3387 short16 A = cast(short16) _mm256_setr_epi16(-1, 0, -21, 21, 42, 127, -42, -128, 3388 -1, 0, -21, 21, 42, 127, -42, -128); 3389 short[16] correct = [-1, 0, -21, 21, 42, 127, -42, -128, 3390 -1, 0, -21, 21, 42, 127, -42, -128]; 3391 assert(A.array == correct); 3392 } 3393 3394 /// Set packed 32-bit integers with the supplied values in reverse order. 
3395 __m256i _mm256_setr_epi32 (int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0) pure @trusted 3396 { 3397 // Inlines a constant with GCC -O1, LDC -O2 3398 int8 r; // = void would prevent GDC from inlining a constant call 3399 r.ptr[0] = e7; 3400 r.ptr[1] = e6; 3401 r.ptr[2] = e5; 3402 r.ptr[3] = e4; 3403 r.ptr[4] = e3; 3404 r.ptr[5] = e2; 3405 r.ptr[6] = e1; 3406 r.ptr[7] = e0; 3407 return cast(__m256i)r; 3408 } 3409 unittest 3410 { 3411 int8 A = cast(int8) _mm256_setr_epi32(-1, 0, -2147483648, 2147483647, 42, 666, -42, -666); 3412 int[8] correct = [-1, 0, -2147483648, 2147483647, 42, 666, -42, -666]; 3413 assert(A.array == correct); 3414 } 3415 3416 /// Set packed 64-bit integers with the supplied values in reverse order. 3417 __m256i _mm256_setr_epi64x (long e3, long e2, long e1, long e0) pure @trusted 3418 { 3419 long4 r = void; 3420 r.ptr[0] = e3; 3421 r.ptr[1] = e2; 3422 r.ptr[2] = e1; 3423 r.ptr[3] = e0; 3424 return r; 3425 } 3426 unittest 3427 { 3428 __m256i A = _mm256_setr_epi64x(-1, 42, long.min, long.max); 3429 long[4] correct = [-1, 42, long.min, long.max]; 3430 assert(A.array == correct); 3431 } 3432 ///ditto 3433 alias _mm256_setr_epi64 = _mm256_setr_epi64x; // #BONUS, not sure why this isn't in Intel Intrinsics API. 3434 3435 /// Set packed 8-bit integers with the supplied values in reverse order. 3436 __m256i _mm256_setr_epi8 (byte e31, byte e30, byte e29, byte e28, byte e27, byte e26, byte e25, byte e24, 3437 byte e23, byte e22, byte e21, byte e20, byte e19, byte e18, byte e17, byte e16, 3438 byte e15, byte e14, byte e13, byte e12, byte e11, byte e10, byte e9, byte e8, 3439 byte e7, byte e6, byte e5, byte e4, byte e3, byte e2, byte e1, byte e0) pure @trusted 3440 { 3441 // Inline a constant call in GDC -O1 and LDC -O2 3442 align(32) byte[32] result = [ e31, e30, e29, e28, e27, e26, e25, e24, 3443 e23, e22, e21, e20, e19, e18, e17, e16, 3444 e15, e14, e13, e12, e11, e10, e9, e8, 3445 e7, e6, e5, e4, e3, e2, e1, e0]; 3446 return *cast(__m256i*)(result.ptr); 3447 } 3448 unittest 3449 { 3450 byte32 A = cast(byte32) _mm256_setr_epi8( -1, 0, -21, 21, 42, 127, -42, -128, 3451 -1, 0, -21, 21, 42, 127, -42, -128, 3452 -1, 0, -21, 21, 42, 127, -42, -128, 3453 -1, 0, -21, 21, 42, 127, -42, -128); 3454 byte[32] correct = [-1, 0, -21, 21, 42, 127, -42, -128, 3455 -1, 0, -21, 21, 42, 127, -42, -128, 3456 -1, 0, -21, 21, 42, 127, -42, -128, 3457 -1, 0, -21, 21, 42, 127, -42, -128]; 3458 assert(A.array == correct); 3459 } 3460 3461 /// Set packed `__m256` vector with the supplied values. 3462 __m256 _mm256_setr_m128 (__m128 lo, __m128 hi) 3463 { 3464 return _mm256_set_m128(hi, lo); 3465 } 3466 unittest 3467 { 3468 __m128 A = _mm_setr_ps(1.0f, 2, 3, 4); 3469 __m128 B = _mm_setr_ps(3.0f, 4, 5, 6); 3470 __m256 R = _mm256_setr_m128(B, A); 3471 float[8] correct = [3.0f, 4, 5, 6, 1, 2, 3, 4,]; 3472 assert(R.array == correct); 3473 } 3474 3475 /// Set packed `__m256d` vector with the supplied values. 3476 __m256d _mm256_setr_m128d (__m128d lo, __m128d hi) 3477 { 3478 return _mm256_set_m128d(hi, lo); 3479 } 3480 unittest 3481 { 3482 __m128d A = _mm_setr_pd(1.0, 2.0); 3483 __m128d B = _mm_setr_pd(3.0, 4.0); 3484 __m256d R = _mm256_setr_m128d(B, A); 3485 double[4] correct = [3.0, 4.0, 1.0, 2.0]; 3486 assert(R.array == correct); 3487 } 3488 3489 /// Set packed `__m256i` vector with the supplied values. 
3490 __m256i _mm256_setr_m128i (__m128i lo, __m128i hi) 3491 { 3492 return _mm256_set_m128i(hi, lo); 3493 } 3494 unittest 3495 { 3496 __m128i A = _mm_setr_epi32( 1, 2, 3, 4); 3497 __m128i B = _mm_set_epi32(-3, -4, -5, -6); 3498 int8 R = cast(int8)_mm256_setr_m128i(B, A); 3499 int[8] correct = [-6, -5, -4, -3, 1, 2, 3, 4]; 3500 assert(R.array == correct); 3501 } 3502 3503 /// Set packed double-precision (64-bit) floating-point elements with the supplied values in reverse order. 3504 __m256d _mm256_setr_pd (double e3, double e2, double e1, double e0) pure @trusted 3505 { 3506 static if (LDC_with_optimizations) 3507 { 3508 // PERF, probably not the best 3509 double[4] result = [e3, e2, e1, e0]; 3510 return loadUnaligned!(double4)(result.ptr); 3511 } 3512 else 3513 { 3514 __m256d r; 3515 r.ptr[0] = e3; 3516 r.ptr[1] = e2; 3517 r.ptr[2] = e1; 3518 r.ptr[3] = e0; 3519 return r; 3520 } 3521 } 3522 unittest 3523 { 3524 __m256d A = _mm256_setr_pd(3, 2, 1, 546.125); 3525 double[4] correct = [3.0, 2.0, 1.0, 546.125]; 3526 assert(A.array == correct); 3527 } 3528 3529 3530 /// Set packed single-precision (32-bit) floating-point elements with the supplied values in reverse order. 3531 __m256 _mm256_setr_ps (float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0) pure @trusted 3532 { 3533 // PERF DMD 3534 static if (GDC_with_AVX) 3535 { 3536 align(32) float[8] r = [ e7, e6, e5, e4, e3, e2, e1, e0]; 3537 return *cast(__m256*)r; 3538 } 3539 else version(LDC) 3540 { 3541 align(32) float[8] r = [ e7, e6, e5, e4, e3, e2, e1, e0]; 3542 return *cast(__m256*)r; 3543 } 3544 else 3545 { 3546 __m256 r; 3547 r.ptr[0] = e7; 3548 r.ptr[1] = e6; 3549 r.ptr[2] = e5; 3550 r.ptr[3] = e4; 3551 r.ptr[4] = e3; 3552 r.ptr[5] = e2; 3553 r.ptr[6] = e1; 3554 r.ptr[7] = e0; 3555 return r; 3556 } 3557 } 3558 unittest 3559 { 3560 __m256 A = _mm256_setr_ps( 3, 2, 1, 546.125f, 4, 5, 6, 7); 3561 float[8] correct = [3.0f, 2, 1, 546.125f, 4, 5, 6, 7]; 3562 assert(A.array == correct); 3563 } 3564 3565 /// Return vector of type `__m256d` with all elements set to zero. 3566 __m256d _mm256_setzero_pd() pure @safe 3567 { 3568 return double4(0.0); 3569 } 3570 unittest 3571 { 3572 __m256d A = _mm256_setzero_pd(); 3573 double[4] correct = [0.0, 0.0, 0.0, 0.0]; 3574 assert(A.array == correct); 3575 } 3576 3577 /// Return vector of type `__m256` with all elements set to zero. 3578 __m256 _mm256_setzero_ps() pure @safe 3579 { 3580 return float8(0.0f); 3581 } 3582 unittest 3583 { 3584 __m256 A = _mm256_setzero_ps(); 3585 float[8] correct = [0.0f, 0, 0, 0, 0, 0, 0, 0]; 3586 assert(A.array == correct); 3587 } 3588 3589 /// Return vector of type `__m256i` with all elements set to zero. 3590 __m256i _mm256_setzero_si256() pure @trusted 3591 { 3592 return __m256i(0); 3593 } 3594 unittest 3595 { 3596 __m256i A = _mm256_setzero_si256(); 3597 long[4] correct = [0, 0, 0, 0]; 3598 assert(A.array == correct); 3599 } 3600 3601 /// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the 3602 /// control in `imm8`. 
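/// Bits 0 and 2 of `imm8` pick the element taken from `a` (one per 128-bit lane), bits 1 and 3
/// the element taken from `b`. Example, as a minimal sketch:
/// ---
/// __m256d x = _mm256_setr_pd(0.0, 1, 2, 3);
/// __m256d y = _mm256_setr_pd(4.0, 5, 6, 7);
/// __m256d r = _mm256_shuffle_pd!0b0101(x, y); // r = [x[1], y[0], x[3], y[2]] = [1, 4, 3, 6]
/// ---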
3603 __m256d _mm256_shuffle_pd(int imm8)(__m256d a, __m256d b) pure @trusted 3604 { 3605 // PERF DMD 3606 static if (GDC_with_AVX) 3607 { 3608 return __builtin_ia32_shufpd256(a, b, imm8); 3609 } 3610 else version(LDC) 3611 { 3612 return shufflevectorLDC!(double4, 3613 (imm8 >> 0) & 1, 3614 4 + ( (imm8 >> 1) & 1), 3615 2 + ( (imm8 >> 2) & 1), 3616 6 + ( (imm8 >> 3) & 1) )(a, b); 3617 } 3618 else 3619 { 3620 double4 r = void; 3621 r.ptr[0] = a.array[(imm8 >> 0) & 1]; 3622 r.ptr[1] = b.array[(imm8 >> 1) & 1]; 3623 r.ptr[2] = a.array[2 + ( (imm8 >> 2) & 1)]; 3624 r.ptr[3] = b.array[2 + ( (imm8 >> 3) & 1)]; 3625 return r; 3626 } 3627 } 3628 unittest 3629 { 3630 __m256d A = _mm256_setr_pd( 0, 1, 2, 3); 3631 __m256d B = _mm256_setr_pd( 4, 5, 6, 7); 3632 __m256d C = _mm256_shuffle_pd!75 /* 01001011 */(A, B); 3633 double[4] correct = [1.0, 5.0, 2.0, 7.0]; 3634 assert(C.array == correct); 3635 } 3636 3637 /// Shuffle single-precision (32-bit) floating-point elements in `a` within 128-bit lanes using 3638 /// the control in `imm8`. 3639 __m256 _mm256_shuffle_ps(int imm8)(__m256 a, __m256 b) pure @trusted 3640 { 3641 // PERF DMD 3642 static if (GDC_with_AVX) 3643 { 3644 return __builtin_ia32_shufps256(a, b, imm8); 3645 } 3646 else version(LDC) 3647 { 3648 return shufflevectorLDC!(float8, (imm8 >> 0) & 3, 3649 (imm8 >> 2) & 3, 3650 8 + ( (imm8 >> 4) & 3), 3651 8 + ( (imm8 >> 6) & 3), 3652 4 + ( (imm8 >> 0) & 3), 3653 4 + ( (imm8 >> 2) & 3), 3654 12 + ( (imm8 >> 4) & 3), 3655 12 + ( (imm8 >> 6) & 3) )(a, b); 3656 } 3657 else 3658 { 3659 float8 r = void; 3660 r.ptr[0] = a.array[(imm8 >> 0) & 3]; 3661 r.ptr[1] = a.array[(imm8 >> 2) & 3]; 3662 r.ptr[2] = b.array[(imm8 >> 4) & 3]; 3663 r.ptr[3] = b.array[(imm8 >> 6) & 3]; 3664 r.ptr[4] = a.array[4 + ( (imm8 >> 0) & 3 )]; 3665 r.ptr[5] = a.array[4 + ( (imm8 >> 2) & 3 )]; 3666 r.ptr[6] = b.array[4 + ( (imm8 >> 4) & 3 )]; 3667 r.ptr[7] = b.array[4 + ( (imm8 >> 6) & 3 )]; 3668 return r; 3669 } 3670 } 3671 unittest 3672 { 3673 __m256 A = _mm256_setr_ps( 0, 1, 2, 3, 4, 5, 6, 7); 3674 __m256 B = _mm256_setr_ps( 8, 9, 10, 11, 12, 13, 14, 15); 3675 __m256 C = _mm256_shuffle_ps!75 /* 01001011 */(A, B); 3676 float[8] correct = [3.0f, 2, 8, 9, 7, 6, 12, 13]; 3677 assert(C.array == correct); 3678 } 3679 3680 /// Compute the square root of packed double-precision (64-bit) floating-point elements in `a`. 3681 __m256d _mm256_sqrt_pd (__m256d a) pure @trusted 3682 { 3683 static if (GDC_with_AVX) 3684 { 3685 return __builtin_ia32_sqrtpd256(a); 3686 } 3687 else version(LDC) 3688 { 3689 static if (__VERSION__ >= 2084) 3690 return llvm_sqrt(a); // that capability appeared in LDC 1.14 3691 else 3692 { 3693 a.ptr[0] = llvm_sqrt(a.array[0]); 3694 a.ptr[1] = llvm_sqrt(a.array[1]); 3695 a.ptr[2] = llvm_sqrt(a.array[2]); 3696 a.ptr[3] = llvm_sqrt(a.array[3]); 3697 return a; 3698 } 3699 } 3700 else 3701 { 3702 a.ptr[0] = sqrt(a.array[0]); 3703 a.ptr[1] = sqrt(a.array[1]); 3704 a.ptr[2] = sqrt(a.array[2]); 3705 a.ptr[3] = sqrt(a.array[3]); 3706 return a; 3707 } 3708 } 3709 unittest 3710 { 3711 __m256d A = _mm256_sqrt_pd(_mm256_set1_pd(4.0)); 3712 double[4] correct = [2.0, 2, 2, 2]; 3713 assert(A.array == correct); 3714 } 3715 3716 /// Compute the square root of packed single-precision (32-bit) floating-point elements in `a`. 
3717 __m256 _mm256_sqrt_ps (__m256 a) pure @trusted 3718 { 3719 static if (GDC_with_AVX) 3720 { 3721 return __builtin_ia32_sqrtps256(a); 3722 } 3723 else version(LDC) 3724 { 3725 static if (__VERSION__ >= 2084) 3726 return llvm_sqrt(a); // that capability appeared in LDC 1.14 3727 else 3728 { 3729 a.ptr[0] = llvm_sqrt(a.array[0]); 3730 a.ptr[1] = llvm_sqrt(a.array[1]); 3731 a.ptr[2] = llvm_sqrt(a.array[2]); 3732 a.ptr[3] = llvm_sqrt(a.array[3]); 3733 a.ptr[4] = llvm_sqrt(a.array[4]); 3734 a.ptr[5] = llvm_sqrt(a.array[5]); 3735 a.ptr[6] = llvm_sqrt(a.array[6]); 3736 a.ptr[7] = llvm_sqrt(a.array[7]); 3737 return a; 3738 } 3739 } 3740 else 3741 { 3742 a.ptr[0] = sqrt(a.array[0]); 3743 a.ptr[1] = sqrt(a.array[1]); 3744 a.ptr[2] = sqrt(a.array[2]); 3745 a.ptr[3] = sqrt(a.array[3]); 3746 a.ptr[4] = sqrt(a.array[4]); 3747 a.ptr[5] = sqrt(a.array[5]); 3748 a.ptr[6] = sqrt(a.array[6]); 3749 a.ptr[7] = sqrt(a.array[7]); 3750 return a; 3751 } 3752 } 3753 unittest 3754 { 3755 __m256 A = _mm256_sqrt_ps(_mm256_set1_ps(4.0f)); 3756 float[8] correct = [2.0f, 2, 2, 2, 2, 2, 2, 2]; 3757 assert(A.array == correct); 3758 } 3759 3760 /// Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from 3761 /// `a` into memory. `mem_addr` must be aligned on a 32-byte boundary or a general-protection 3762 /// exception may be generated. 3763 void _mm256_store_pd (double* mem_addr, __m256d a) pure @system 3764 { 3765 *cast(__m256d*)mem_addr = a; 3766 } 3767 unittest 3768 { 3769 align(32) double[4] mem; 3770 double[4] correct = [1.0, 2, 3, 4]; 3771 _mm256_store_pd(mem.ptr, _mm256_setr_pd(1.0, 2, 3, 4)); 3772 assert(mem == correct); 3773 } 3774 3775 /// Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from 3776 /// `a` into memory. `mem_addr` must be aligned on a 32-byte boundary or a general-protection 3777 /// exception may be generated. 3778 void _mm256_store_ps (float* mem_addr, __m256 a) pure @system 3779 { 3780 *cast(__m256*)mem_addr = a; 3781 } 3782 unittest 3783 { 3784 align(32) float[8] mem; 3785 float[8] correct = [1.0, 2, 3, 4, 5, 6, 7, 8]; 3786 _mm256_store_ps(mem.ptr, _mm256_set_ps(8.0, 7, 6, 5, 4, 3, 2, 1)); 3787 assert(mem == correct); 3788 } 3789 3790 /// Store 256-bits of integer data from `a` into memory. `mem_addr` must be aligned on a 32-byte 3791 /// boundary or a general-protection exception may be generated. 3792 void _mm256_store_si256 (__m256i * mem_addr, __m256i a) pure @safe 3793 { 3794 *mem_addr = a; 3795 } 3796 unittest 3797 { 3798 align(32) long[4] mem; 3799 long[4] correct = [5, -6, -7, 8]; 3800 _mm256_store_si256(cast(__m256i*)(mem.ptr), _mm256_setr_epi64x(5, -6, -7, 8)); 3801 assert(mem == correct); 3802 } 3803 3804 /// Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from 3805 /// `a` into memory. `mem_addr` does not need to be aligned on any particular boundary. 
3806 void _mm256_storeu_pd (double * mem_addr, __m256d a) pure @system 3807 { 3808 // PERF DMD 3809 static if (GDC_with_AVX) 3810 { 3811 __builtin_ia32_storeupd256(mem_addr, a); 3812 } 3813 else static if (LDC_with_optimizations) 3814 { 3815 storeUnaligned!__m256d(a, mem_addr); 3816 } 3817 else 3818 { 3819 for(int n = 0; n < 4; ++n) 3820 mem_addr[n] = a.array[n]; 3821 } 3822 } 3823 unittest 3824 { 3825 align(32) double[6] arr = [0.0, 0, 0, 0, 0, 0]; 3826 _mm256_storeu_pd(&arr[1], _mm256_set1_pd(4.0)); 3827 double[4] correct = [4.0, 4, 4, 4]; 3828 assert(arr[1..5] == correct); 3829 } 3830 3831 /// Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from 3832 /// `a` into memory. `mem_addr` does not need to be aligned on any particular boundary. 3833 void _mm256_storeu_ps (float* mem_addr, __m256 a) pure @system 3834 { 3835 // PERF DMD 3836 static if (GDC_with_AVX) 3837 { 3838 __builtin_ia32_storeups256(mem_addr, a); 3839 } 3840 else static if (LDC_with_optimizations) 3841 { 3842 storeUnaligned!__m256(a, mem_addr); 3843 } 3844 else 3845 { 3846 for(int n = 0; n < 8; ++n) 3847 mem_addr[n] = a.array[n]; 3848 } 3849 } 3850 unittest 3851 { 3852 align(32) float[10] arr = [0.0f, 0, 0, 0, 0, 0, 0, 0, 0, 0]; 3853 _mm256_storeu_ps(&arr[1], _mm256_set1_ps(4.0f)); 3854 float[8] correct = [4.0f, 4, 4, 4, 4, 4, 4, 4]; 3855 assert(arr[1..9] == correct); 3856 } 3857 3858 3859 /// Store 256-bits of integer data from `a` into memory. `mem_addr` does not need to be aligned 3860 /// on any particular boundary. 3861 void _mm256_storeu_si256 (__m256i* mem_addr, __m256i a) pure @trusted 3862 { 3863 // PERF DMD 3864 static if (GDC_with_AVX) 3865 { 3866 __builtin_ia32_storedqu256(cast(char*)mem_addr, cast(ubyte32) a); 3867 } 3868 else static if (LDC_with_optimizations) 3869 { 3870 storeUnaligned!__m256i(a, cast(long*)mem_addr); 3871 } 3872 else 3873 { 3874 long4 v = cast(long4)a; 3875 long* p = cast(long*)mem_addr; 3876 for(int n = 0; n < 4; ++n) 3877 p[n] = v[n]; 3878 } 3879 } 3880 unittest 3881 { 3882 align(32) long[6] arr = [0, 0, 0, 0, 0, 0]; 3883 _mm256_storeu_si256( cast(__m256i*) &arr[1], _mm256_set1_epi64x(4)); 3884 long[4] correct = [4, 4, 4, 4]; 3885 assert(arr[1..5] == correct); 3886 } 3887 3888 /// Store the high and low 128-bit halves (each composed of 4 packed single-precision (32-bit) 3889 /// floating-point elements) from `a` into memory two different 128-bit locations. 3890 /// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. 3891 void _mm256_storeu2_m128 (float* hiaddr, float* loaddr, __m256 a) pure @system 3892 { 3893 // This is way better on GDC, and similarly in LDC, vs using other intrinsics 3894 loaddr[0] = a.array[0]; 3895 loaddr[1] = a.array[1]; 3896 loaddr[2] = a.array[2]; 3897 loaddr[3] = a.array[3]; 3898 hiaddr[0] = a.array[4]; 3899 hiaddr[1] = a.array[5]; 3900 hiaddr[2] = a.array[6]; 3901 hiaddr[3] = a.array[7]; 3902 } 3903 unittest 3904 { 3905 align(32) float[11] A = [0.0f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; 3906 _mm256_storeu2_m128(&A[1], &A[6], _mm256_set1_ps(2.0f)); 3907 float[11] correct = [0.0f, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0]; 3908 assert(A == correct); 3909 } 3910 3911 /// Store the high and low 128-bit halves (each composed of 2 packed double-precision (64-bit) 3912 /// floating-point elements) from `a` into memory two different 128-bit locations. 3913 /// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. 
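/// As a small illustration of how the halves map: storing `[x0, x1, x2, x3]` writes `[x0, x1]` to `loaddr`
/// and `[x2, x3]` to `hiaddr`.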
3914 void _mm256_storeu2_m128d (double* hiaddr, double* loaddr, __m256d a) pure @system 3915 { 3916 loaddr[0] = a.array[0]; 3917 loaddr[1] = a.array[1]; 3918 hiaddr[0] = a.array[2]; 3919 hiaddr[1] = a.array[3]; 3920 } 3921 unittest 3922 { 3923 double[2] A; 3924 double[2] B; 3925 _mm256_storeu2_m128d(A.ptr, B.ptr, _mm256_set1_pd(-43.0)); 3926 double[2] correct = [-43.0, -43]; 3927 assert(A == correct); 3928 assert(B == correct); 3929 } 3930 3931 /// Store the high and low 128-bit halves (each composed of integer data) from `a` into memory two 3932 /// different 128-bit locations. 3933 /// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. 3934 void _mm256_storeu2_m128i (__m128i* hiaddr, __m128i* loaddr, __m256i a) pure @trusted 3935 { 3936 long* hi = cast(long*)hiaddr; 3937 long* lo = cast(long*)loaddr; 3938 lo[0] = a.array[0]; 3939 lo[1] = a.array[1]; 3940 hi[0] = a.array[2]; 3941 hi[1] = a.array[3]; 3942 } 3943 unittest 3944 { 3945 long[2] A; 3946 long[2] B; 3947 _mm256_storeu2_m128i(cast(__m128i*)A.ptr, cast(__m128i*)B.ptr, _mm256_set1_epi64x(-42)); 3948 long[2] correct = [-42, -42]; 3949 assert(A == correct); 3950 assert(B == correct); 3951 } 3952 3953 /// Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from 3954 /// `a` into memory using a non-temporal memory hint. `mem_addr` must be aligned on a 32-byte 3955 /// boundary or a general-protection exception may be generated. 3956 /// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads. 3957 void _mm256_stream_pd (double* mem_addr, __m256d a) pure @system 3958 { 3959 // PERF DMD 3960 // PERF GDC + SSE2 3961 static if (LDC_with_InlineIREx && LDC_with_optimizations) 3962 { 3963 enum prefix = `!0 = !{ i32 1 }`; 3964 enum ir = ` 3965 store <4 x double> %1, <4 x double>* %0, align 32, !nontemporal !0 3966 ret void`; 3967 LDCInlineIREx!(prefix, ir, "", void, double4*, double4)(cast(double4*)mem_addr, a); 3968 } 3969 else static if (GDC_with_AVX) 3970 { 3971 __builtin_ia32_movntpd256 (mem_addr, a); 3972 } 3973 else 3974 { 3975 // Regular store instead. 3976 __m256d* dest = cast(__m256d*)mem_addr; 3977 *dest = a; 3978 } 3979 } 3980 unittest 3981 { 3982 align(32) double[4] mem; 3983 double[4] correct = [5.0, -6, -7, 8]; 3984 _mm256_stream_pd(mem.ptr, _mm256_setr_pd(5.0, -6, -7, 8)); 3985 assert(mem == correct); 3986 } 3987 3988 /// Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from 3989 /// `a` into memory using a non-temporal memory hint. `mem_addr` must be aligned on a 32-byte 3990 /// boundary or a general-protection exception may be generated. 3991 /// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads. 3992 void _mm256_stream_ps (float* mem_addr, __m256 a) pure @system 3993 { 3994 // PERF DMD 3995 // PERF GDC + SSE2 3996 static if (LDC_with_InlineIREx && LDC_with_optimizations) 3997 { 3998 enum prefix = `!0 = !{ i32 1 }`; 3999 enum ir = ` 4000 store <8 x float> %1, <8 x float>* %0, align 32, !nontemporal !0 4001 ret void`; 4002 LDCInlineIREx!(prefix, ir, "", void, float8*, float8)(cast(float8*)mem_addr, a); 4003 } 4004 else static if (GDC_with_AVX) 4005 { 4006 __builtin_ia32_movntps256 (mem_addr, a); 4007 } 4008 else 4009 { 4010 // Regular store instead.
4011 __m256* dest = cast(__m256*)mem_addr; 4012 *dest = a; 4013 } 4014 } 4015 unittest 4016 { 4017 align(32) float[8] mem; 4018 float[8] correct = [5, -6, -7, 8, 1, 2, 3, 4]; 4019 _mm256_stream_ps(mem.ptr, _mm256_setr_ps(5, -6, -7, 8, 1, 2, 3, 4)); 4020 assert(mem == correct); 4021 } 4022 4023 /// Store 256-bits of integer data from `a` into memory using a non-temporal memory hint. 4024 /// `mem_addr` must be aligned on a 32-byte boundary or a general-protection exception may be 4025 /// generated. 4026 /// Note: depending on the compiler and target, this may defer to two SSE2 non-temporal stores, or a regular store. 4027 /// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads. 4028 void _mm256_stream_si256 (__m256i * mem_addr, __m256i a) pure @trusted 4029 { 4030 // PERF DMD 4031 // PERF GDC 4032 static if (LDC_with_InlineIREx && LDC_with_optimizations) 4033 { 4034 enum prefix = `!0 = !{ i32 1 }`; 4035 enum ir = ` 4036 store <4 x i64> %1, <4 x i64>* %0, align 16, !nontemporal !0 4037 ret void`; 4038 LDCInlineIREx!(prefix, ir, "", void, long4*, long4)(mem_addr, a); 4039 } 4040 else static if (GDC_with_SSE2) // any hope to be non-temporal? Using SSE2 instructions. 4041 { 4042 long2 lo, hi; 4043 lo.ptr[0] = a.array[0]; 4044 lo.ptr[1] = a.array[1]; 4045 hi.ptr[0] = a.array[2]; 4046 hi.ptr[1] = a.array[3]; 4047 _mm_stream_si128(cast(__m128i*)mem_addr, cast(__m128i)lo); 4048 _mm_stream_si128((cast(__m128i*)mem_addr) + 1, cast(__m128i)hi); 4049 } 4050 else 4051 { 4052 // Regular store instead. 4053 __m256i* dest = cast(__m256i*)mem_addr; 4054 *dest = a; 4055 } 4056 } 4057 unittest 4058 { 4059 align(32) long[4] mem; 4060 long[4] correct = [5, -6, -7, 8]; 4061 _mm256_stream_si256(cast(__m256i*)(mem.ptr), _mm256_setr_epi64x(5, -6, -7, 8)); 4062 assert(mem == correct); 4063 } 4064 4065 /// Subtract packed double-precision (64-bit) floating-point elements in `b` from 4066 /// packed double-precision (64-bit) floating-point elements in `a`. 4067 __m256d _mm256_sub_pd (__m256d a, __m256d b) pure @safe 4068 { 4069 return a - b; 4070 } 4071 unittest 4072 { 4073 __m256d a = [1.5, -2.0, 3.0, 200000.0]; 4074 a = _mm256_sub_pd(a, a); 4075 double[4] correct = [0.0, 0, 0, 0]; 4076 assert(a.array == correct); 4077 } 4078 4079 /// Subtract packed single-precision (32-bit) floating-point elements in `b` from 4080 /// packed single-precision (32-bit) floating-point elements in `a`. 4081 __m256 _mm256_sub_ps (__m256 a, __m256 b) pure @safe 4082 { 4083 return a - b; 4084 } 4085 unittest 4086 { 4087 __m256 a = [1.5f, -2.0f, 3.0f, 1.0f, 1.5f, -2000.0f, 3.0f, 1.0f]; 4088 a = _mm256_sub_ps(a, a); 4089 float[8] correct = [0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f]; 4090 assert(a.array == correct); 4091 } 4092 4093 /// Compute the bitwise NOT of `a` and then AND with `b`, producing an intermediate value, and 4094 /// return 1 if the sign bit of each 64-bit element in the intermediate value is zero, 4095 /// otherwise return 0.
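/// Said differently, return 1 when every element whose sign bit is set in `b` also has its sign bit set in `a`.
/// As an illustrative pair of values, `_mm_testc_pd(_mm_setr_pd(-1.0, -1.0), _mm_setr_pd(-1.0, 1.0))` yields 1.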
4096 int _mm_testc_pd (__m128d a, __m128d b) pure @trusted 4097 { 4098 static if (GDC_or_LDC_with_AVX) 4099 { 4100 return __builtin_ia32_vtestcpd(a, b); 4101 } 4102 else 4103 { 4104 // PERF: maybe do the generic version more like simde 4105 long2 la = cast(long2)a; 4106 long2 lb = cast(long2)b; 4107 long2 r = ~la & lb; 4108 return r.array[0] >= 0 && r.array[1] >= 0; 4109 } 4110 } 4111 unittest 4112 { 4113 __m128d A = _mm_setr_pd(-1, 1); 4114 __m128d B = _mm_setr_pd(-1, -1); 4115 __m128d C = _mm_setr_pd(1, -1); 4116 assert(_mm_testc_pd(A, A) == 1); 4117 assert(_mm_testc_pd(A, B) == 0); 4118 assert(_mm_testc_pd(B, A) == 1); 4119 } 4120 4121 ///ditto 4122 int _mm256_testc_pd (__m256d a, __m256d b) pure @safe 4123 { 4124 static if (GDC_or_LDC_with_AVX) 4125 { 4126 return __builtin_ia32_vtestcpd256(a, b); 4127 } 4128 else static if (LDC_with_ARM64) 4129 { 4130 // better to split than do vanilla (down to 10 inst) 4131 __m128d lo_a = _mm256_extractf128_pd!0(a); 4132 __m128d lo_b = _mm256_extractf128_pd!0(b); 4133 __m128d hi_a = _mm256_extractf128_pd!1(a); 4134 __m128d hi_b = _mm256_extractf128_pd!1(b); 4135 return _mm_testc_pd(lo_a, lo_b) & _mm_testc_pd(hi_a, hi_b); 4136 } 4137 else 4138 { 4139 // PERF: do the generic version more like simde, maybe this gets rid of the arm64 version 4140 long4 la = cast(long4)a; 4141 long4 lb = cast(long4)b; 4142 long4 r = ~la & lb; 4143 return r.array[0] >= 0 && r.array[1] >= 0 && r.array[2] >= 0 && r.array[3] >= 0; 4144 } 4145 } 4146 unittest 4147 { 4148 __m256d A = _mm256_setr_pd(-1, 1, -1, 1); 4149 __m256d B = _mm256_setr_pd(-1, -1, -1, -1); 4150 __m256d C = _mm256_setr_pd(1, -1, 1, -1); 4151 assert(_mm256_testc_pd(A, A) == 1); 4152 assert(_mm256_testc_pd(A, B) == 0); 4153 assert(_mm256_testc_pd(B, A) == 1); 4154 } 4155 4156 /// Compute the bitwise NOT of `a` and then AND with `b`, producing an intermediate value, and 4157 /// return 1 if the sign bit of each 32-bit element in the intermediate value is zero, 4158 /// otherwise return 0.
4159 int _mm_testc_ps (__m128 a, __m128 b) pure @safe 4160 { 4161 // PERF DMD 4162 static if (GDC_or_LDC_with_AVX) 4163 { 4164 return __builtin_ia32_vtestcps(a, b); 4165 } 4166 else static if (LDC_with_ARM64) 4167 { 4168 int4 la = cast(int4)a; 4169 int4 lb = cast(int4)b; 4170 int4 r = ~la & lb; 4171 int4 shift; 4172 shift = 31; 4173 r >>= shift; 4174 int[4] zero = [0, 0, 0, 0]; 4175 return r.array == zero; 4176 } 4177 else 4178 { 4179 // PERF: do the generic version more like simde, maybe this gets rid of the arm64 version 4180 int4 la = cast(int4)a; 4181 int4 lb = cast(int4)b; 4182 int4 r = ~la & lb; 4183 return r.array[0] >= 0 && r.array[1] >= 0 && r.array[2] >= 0 && r.array[3] >= 0; 4184 } 4185 } 4186 unittest 4187 { 4188 __m128 A = _mm_setr_ps(-1, 1, -1, 1); 4189 __m128 B = _mm_setr_ps(-1, -1, -1, -1); 4190 __m128 C = _mm_setr_ps(1, -1, 1, -1); 4191 assert(_mm_testc_ps(A, A) == 1); 4192 assert(_mm_testc_ps(A, B) == 0); 4193 assert(_mm_testc_ps(B, A) == 1); 4194 } 4195 4196 ///ditto 4197 int _mm256_testc_ps (__m256 a, __m256 b) pure @safe 4198 { 4199 // PERF DMD 4200 static if (GDC_or_LDC_with_AVX) 4201 { 4202 return __builtin_ia32_vtestcps256(a, b); 4203 } 4204 else static if (LDC_with_ARM64) 4205 { 4206 int8 la = cast(int8)a; 4207 int8 lb = cast(int8)b; 4208 int8 r = ~la & lb; 4209 int8 shift; 4210 shift = 31; 4211 r >>= shift; 4212 int[8] zero = [0, 0, 0, 0, 0, 0, 0, 0]; 4213 return r.array == zero; 4214 } 4215 else 4216 { 4217 // PERF: do the generic version more like simde, maybe this gets rid of the arm64 version 4218 int8 la = cast(int8)a; 4219 int8 lb = cast(int8)b; 4220 int8 r = ~la & lb; 4221 return r.array[0] >= 0 4222 && r.array[1] >= 0 4223 && r.array[2] >= 0 4224 && r.array[3] >= 0 4225 && r.array[4] >= 0 4226 && r.array[5] >= 0 4227 && r.array[6] >= 0 4228 && r.array[7] >= 0; 4229 } 4230 } 4231 unittest 4232 { 4233 __m256 A = _mm256_setr_ps(-1, 1, -1, 1, -1, 1, -1, 1); 4234 __m256 B = _mm256_setr_ps(-1, -1, -1, -1, -1, -1, -1, -1); 4235 __m256 C = _mm256_setr_ps( 1, -1, 1, -1, 1, 1, 1, 1); 4236 assert(_mm256_testc_ps(A, A) == 1); 4237 assert(_mm256_testc_ps(B, B) == 1); 4238 assert(_mm256_testc_ps(A, B) == 0); 4239 assert(_mm256_testc_ps(B, A) == 1); 4240 assert(_mm256_testc_ps(C, B) == 0); 4241 assert(_mm256_testc_ps(B, C) == 1); 4242 } 4243 4244 /// Compute the bitwise NOT of `a` and then AND with `b`, and return 1 if the result is zero, 4245 /// otherwise return 0. 4246 /// In other words, test if all bits masked by `b` are also 1 in `a`.
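/// As a minimal illustration, `_mm256_testc_si256(_mm256_set1_epi64x(-1), b)` is 1 for any `b`, since `a` already has every bit set.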
4247 int _mm256_testc_si256 (__m256i a, __m256i b) pure @trusted 4248 { 4249 static if (GDC_or_LDC_with_AVX) 4250 { 4251 return __builtin_ia32_ptestc256(cast(long4)a, cast(long4)b); 4252 } 4253 else static if (LDC_with_ARM64) 4254 { 4255 // better to split than do vanilla (down to 10 inst) 4256 __m128i lo_a = _mm256_extractf128_si256!0(a); 4257 __m128i lo_b = _mm256_extractf128_si256!0(b); 4258 __m128i hi_a = _mm256_extractf128_si256!1(a); 4259 __m128i hi_b = _mm256_extractf128_si256!1(b); 4260 return _mm_testc_si128(lo_a, lo_b) & _mm_testc_si128(hi_a, hi_b); 4261 } 4262 else 4263 { 4264 __m256i c = ~a & b; 4265 long[4] zero = [0, 0, 0, 0]; 4266 return c.array == zero; 4267 } 4268 } 4269 unittest 4270 { 4271 __m256i A = _mm256_setr_epi64(0x01, 0x02, 0x04, 0xf8); 4272 __m256i M1 = _mm256_setr_epi64(0xfe, 0xfd, 0x00, 0x00); 4273 __m256i M2 = _mm256_setr_epi64(0x00, 0x00, 0x04, 0x00); 4274 assert(_mm256_testc_si256(A, A) == 1); 4275 assert(_mm256_testc_si256(A, M1) == 0); 4276 assert(_mm256_testc_si256(A, M2) == 1); 4277 } 4278 4279 /// Compute the bitwise AND of 128 bits (representing double-precision (64-bit) floating-point 4280 /// elements) in `a` and `b`, producing an intermediate 128-bit value, and set ZF to 1 if the 4281 /// sign bit of each 64-bit element in the intermediate value is zero, otherwise set ZF to 0. 4282 /// Compute the bitwise NOT of a and then AND with b, producing an intermediate value, and set 4283 /// CF to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise 4284 /// set CF to 0. Return 1 if both the ZF and CF values are zero, otherwise return 0. 4285 /// 4286 /// In other words: there is at least one negative number in `b` that corresponds to a positive number in `a`, 4287 /// AND there is at least one negative number in `b` that corresponds to a negative number in `a`. 4288 int _mm_testnzc_pd (__m128d a, __m128d b) pure @safe 4289 { 4290 // PERF DMD 4291 static if (GDC_or_LDC_with_AVX) 4292 { 4293 return __builtin_ia32_vtestnzcpd(a, b); 4294 } 4295 else 4296 { 4297 // ZF = 0 means "there is at least one pair of negative numbers" 4298 // ZF = 1 means "no pairs of negative numbers" 4299 // CF = 0 means "there is a negative number in b at the same position as a positive number in a" 4300 // CF = 1 means "all negative numbers in b are also negative in a" 4301 // Consequently, CF = 0 and ZF = 0 means: 4302 // "There is a pair of matching negative numbers in a and b, 4303 // AND also there is a negative number in b that matches a positive number in a" 4304 // Phew. 4305 4306 // courtesy of simd-everywhere 4307 __m128i m = _mm_and_si128(cast(__m128i)a, cast(__m128i)b); 4308 __m128i m2 = _mm_andnot_si128(cast(__m128i)a, cast(__m128i)b); 4309 m = _mm_srli_epi64(m, 63); 4310 m2 = _mm_srli_epi64(m2, 63); 4311 return cast(int)( m.array[0] | m.array[2]) & (m2.array[0] | m2.array[2]); 4312 } 4313 } 4314 unittest 4315 { 4316 __m128d PM = _mm_setr_pd( 1, -1); 4317 __m128d MP = _mm_setr_pd(-1, 1); 4318 __m128d MM = _mm_setr_pd(-1, -1); 4319 assert(_mm_testnzc_pd(PM, MP) == 0); 4320 assert(_mm_testnzc_pd(PM, MM) == 1); 4321 assert(_mm_testnzc_pd(MP, MP) == 0); 4322 assert(_mm_testnzc_pd(MP, MM) == 1); 4323 assert(_mm_testnzc_pd(MM, MM) == 0); 4324 } 4325 4326 /// Compute the bitwise AND of 256 bits (representing double-precision (64-bit) floating-point 4327 /// elements) in `a` and `b`, producing an intermediate 256-bit value, and set ZF to 1 if the 4328 /// sign bit of each 64-bit element in the intermediate value is zero, otherwise set ZF to 0.
4329 /// Compute the bitwise NOT of a and then AND with b, producing an intermediate value, and set 4330 /// CF to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise 4331 /// set CF to 0. Return 1 if both the ZF and CF values are zero, otherwise return 0. 4332 /// 4333 /// In other words: there is at least one negative number in `b` that corresponds to a positive number in `a`, 4334 /// AND there is at least one negative number in `b` that corresponds to a negative number in `a`. 4335 int _mm256_testnzc_pd (__m256d a, __m256d b) pure @safe 4336 { 4337 // PERF DMD 4338 static if (GDC_or_LDC_with_AVX) 4339 { 4340 return __builtin_ia32_vtestnzcpd256(a, b); 4341 } 4342 else 4343 { 4344 long4 la = cast(long4)a; 4345 long4 lb = cast(long4)b; 4346 long4 r = la & lb; 4347 long m = r.array[0] | r.array[1] | r.array[2] | r.array[3]; 4348 int ZF = (~m >> 63) & 1; 4349 long4 r2 = ~la & lb; 4350 long m2 = r2.array[0] | r2.array[1] | r2.array[2] | r2.array[3]; 4351 int CF = (~m2 >> 63) & 1; 4352 return (CF | ZF) == 0; 4353 } 4354 } 4355 unittest 4356 { 4357 __m256d PM = _mm256_setr_pd( 1, -1, 1, 1); 4358 __m256d MP = _mm256_setr_pd(-1, 1, 1, 1); 4359 __m256d MM = _mm256_setr_pd(-1, -1, 1, 1); 4360 assert(_mm256_testnzc_pd(PM, MP) == 0); 4361 assert(_mm256_testnzc_pd(PM, MM) == 1); 4362 assert(_mm256_testnzc_pd(MP, MP) == 0); 4363 assert(_mm256_testnzc_pd(MP, MM) == 1); 4364 assert(_mm256_testnzc_pd(MM, MM) == 0); 4365 } 4366 4367 /// Compute the bitwise AND of 128 bits (representing single-precision (32-bit) floating-point 4368 /// elements) in `a` and `b`, producing an intermediate 128-bit value, and set ZF to 1 if the 4369 /// sign bit of each 32-bit element in the intermediate value is zero, otherwise set ZF to 0. 4370 /// Compute the bitwise NOT of a and then AND with b, producing an intermediate value, and set 4371 /// CF to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise 4372 /// set CF to 0. Return 1 if both the ZF and CF values are zero, otherwise return 0. 4373 /// 4374 /// In other words: there is at least one negative number in `b` that corresponds to a positive number in `a`, 4375 /// AND there is at least one negative number in `b` that corresponds to a negative number in `a`. 4376 int _mm_testnzc_ps (__m128 a, __m128 b) pure @safe 4377 { 4378 // PERF DMD 4379 static if (GDC_or_LDC_with_AVX) 4380 { 4381 return __builtin_ia32_vtestnzcps(a, b); 4382 } 4383 else 4384 { 4385 int4 la = cast(int4)a; 4386 int4 lb = cast(int4)b; 4387 int4 r = la & lb; 4388 int m = r.array[0] | r.array[1] | r.array[2] | r.array[3]; 4389 int ZF = (~m >> 31) & 1; 4390 int4 r2 = ~la & lb; 4391 int m2 = r2.array[0] | r2.array[1] | r2.array[2] | r2.array[3]; 4392 int CF = (~m2 >> 31) & 1; 4393 return (CF | ZF) == 0; 4394 } 4395 } 4396 unittest 4397 { 4398 __m128 PM = _mm_setr_ps( 1, -1, 1, 1); 4399 __m128 MP = _mm_setr_ps(-1, 1, 1, 1); 4400 __m128 MM = _mm_setr_ps(-1, -1, 1, 1); 4401 assert(_mm_testnzc_ps(PM, MP) == 0); 4402 assert(_mm_testnzc_ps(PM, MM) == 1); 4403 assert(_mm_testnzc_ps(MP, MP) == 0); 4404 assert(_mm_testnzc_ps(MP, MM) == 1); 4405 assert(_mm_testnzc_ps(MM, MM) == 0); 4406 } 4407 4408 /// Compute the bitwise AND of 256 bits (representing single-precision (32-bit) floating-point 4409 /// elements) in `a` and `b`, producing an intermediate 256-bit value, and set ZF to 1 if the 4410 /// sign bit of each 32-bit element in the intermediate value is zero, otherwise set ZF to 0.
4411 /// Compute the bitwise NOT of a and then AND with b, producing an intermediate value, and set 4412 /// CF to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise 4413 /// set CF to 0. Return 1 if both the ZF and CF values are zero, otherwise return 0. 4414 /// 4415 /// In other words: there is at least one negative number in `b` that corresponds to a positive number in `a`, 4416 /// AND there is at least one negative number in `b` that corresponds to a negative number in `a`. 4417 int _mm256_testnzc_ps (__m256 a, __m256 b) pure @safe 4418 { 4419 // PERF DMD 4420 static if (GDC_or_LDC_with_AVX) 4421 { 4422 return __builtin_ia32_vtestnzcps256(a, b); 4423 } 4424 else 4425 { 4426 int8 la = cast(int8)a; 4427 int8 lb = cast(int8)b; 4428 int8 r = la & lb; 4429 int m = r.array[0] | r.array[1] | r.array[2] | r.array[3] 4430 | r.array[4] | r.array[5] | r.array[6] | r.array[7]; 4431 int ZF = (~m >> 31) & 1; 4432 int8 r2 = ~la & lb; 4433 int m2 = r2.array[0] | r2.array[1] | r2.array[2] | r2.array[3] 4434 | r2.array[4] | r2.array[5] | r2.array[6] | r2.array[7]; 4435 int CF = (~m2 >> 31) & 1; 4436 return (CF | ZF) == 0; 4437 } 4438 } 4439 unittest 4440 { 4441 __m256 PM = _mm256_setr_ps(1, 1, 1, 1, 1, -1, 1, 1); 4442 __m256 MP = _mm256_setr_ps(1, 1, 1, 1, -1, 1, 1, 1); 4443 __m256 MM = _mm256_setr_ps(1, 1, 1, 1, -1, -1, 1, 1); 4444 assert(_mm256_testnzc_ps(PM, MP) == 0); 4445 assert(_mm256_testnzc_ps(PM, MM) == 1); 4446 assert(_mm256_testnzc_ps(MP, MP) == 0); 4447 assert(_mm256_testnzc_ps(MP, MM) == 1); 4448 assert(_mm256_testnzc_ps(MM, MM) == 0); 4449 } 4450 4451 /// Compute the bitwise AND of 256 bits (representing integer data) in `a` and `b`, 4452 /// and set ZF to 1 if the result is zero, otherwise set ZF to 0. 4453 /// Compute the bitwise NOT of `a` and then AND with `b`, and set CF to 1 if the 4454 /// result is zero, otherwise set CF to 0. 4455 /// Return 1 if both the ZF and CF values are zero, otherwise return 0. 4456 int _mm256_testnzc_si256 (__m256i a, __m256i b) pure @trusted 4457 { 4458 // PERF ARM64 4459 // PERF DMD 4460 // PERF LDC without AVX 4461 static if (GDC_or_LDC_with_AVX) 4462 { 4463 return __builtin_ia32_ptestnzc256(cast(long4) a, cast(long4) b); 4464 } 4465 else 4466 { 4467 // Need to defer to _mm_testnzc_si128 if possible, for more speed 4468 __m256i c = a & b; 4469 __m256i d = ~a & b; 4470 long m = c.array[0] | c.array[1] | c.array[2] | c.array[3]; 4471 long n = d.array[0] | d.array[1] | d.array[2] | d.array[3]; 4472 return (m != 0) & (n != 0); 4473 } 4474 } 4475 unittest 4476 { 4477 __m256i A = _mm256_setr_epi32(0x01, 0x02, 0x04, 0xf8, 0, 0, 0, 0); 4478 __m256i M = _mm256_setr_epi32(0x01, 0x40, 0x00, 0x00, 0, 0, 0, 0); 4479 __m256i Z = _mm256_setzero_si256(); 4480 assert(_mm256_testnzc_si256(A, Z) == 0); 4481 assert(_mm256_testnzc_si256(A, M) == 1); 4482 assert(_mm256_testnzc_si256(A, A) == 0); 4483 } 4484 4485 /// Compute the bitwise AND of 128 bits (representing double-precision (64-bit) floating-point 4486 /// elements) in `a` and `b`, producing an intermediate 128-bit value, return 1 if the sign bit of 4487 /// each 64-bit element in the intermediate value is zero, otherwise return 0. 4488 /// In other words, return 1 if `a` and `b` don't both have a negative number at the same position.
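/// For example, `_mm_testz_pd(_mm_setr_pd(1.0, -1.0), _mm_setr_pd(-1.0, 1.0))` is 1, since no position is negative in both inputs (illustrative values).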
4489 int _mm_testz_pd (__m128d a, __m128d b) pure @trusted 4490 { 4491 static if (GDC_or_LDC_with_AVX) 4492 { 4493 return __builtin_ia32_vtestzpd(a, b); 4494 } 4495 else 4496 { 4497 long2 la = cast(long2)a; 4498 long2 lb = cast(long2)b; 4499 long2 r = la & lb; 4500 long m = r.array[0] | r.array[1]; 4501 return (~m >> 63) & 1; 4502 } 4503 } 4504 unittest 4505 { 4506 __m128d A = _mm_setr_pd(-1, 1); 4507 __m128d B = _mm_setr_pd(-1, -1); 4508 __m128d C = _mm_setr_pd(1, -1); 4509 assert(_mm_testz_pd(A, A) == 0); 4510 assert(_mm_testz_pd(A, B) == 0); 4511 assert(_mm_testz_pd(C, A) == 1); 4512 } 4513 4514 /// Compute the bitwise AND of 256 bits (representing double-precision (64-bit) floating-point 4515 /// elements) in `a` and `b`, producing an intermediate 256-bit value, return 1 if the sign bit of 4516 /// each 64-bit element in the intermediate value is zero, otherwise return 0. 4517 /// In other words, return 1 if `a` and `b` don't both have a negative number at the same position. 4518 int _mm256_testz_pd (__m256d a, __m256d b) pure @trusted 4519 { 4520 static if (GDC_or_LDC_with_AVX) 4521 { 4522 return __builtin_ia32_vtestzpd256(a, b); 4523 } 4524 else 4525 { 4526 long4 la = cast(long4)a; 4527 long4 lb = cast(long4)b; 4528 long4 r = la & lb; 4529 long r2 = r.array[0] | r.array[1] | r.array[2] | r.array[3]; 4530 return (~r2 >> 63) & 1; 4531 } 4532 } 4533 unittest 4534 { 4535 __m256d A = _mm256_setr_pd(-1, 1, -1, 1); 4536 __m256d B = _mm256_setr_pd(1, 1, -1, 1); 4537 __m256d C = _mm256_setr_pd(1, -1, 1, -1); 4538 assert(_mm256_testz_pd(A, A) == 0); 4539 assert(_mm256_testz_pd(A, B) == 0); 4540 assert(_mm256_testz_pd(C, A) == 1); 4541 } 4542 4543 /// Compute the bitwise AND of 128 bits (representing single-precision (32-bit) floating-point 4544 /// elements) in `a` and `b`, producing an intermediate 128-bit value, return 1 if the sign bit of 4545 /// each 32-bit element in the intermediate value is zero, otherwise return 0. 4546 /// In other words, return 1 if `a` and `b` don't both have a negative number at the same position. 4547 int _mm_testz_ps (__m128 a, __m128 b) pure @safe 4548 { 4549 // PERF DMD 4550 static if (GDC_or_LDC_with_AVX) 4551 { 4552 return __builtin_ia32_vtestzps(a, b); 4553 } 4554 else 4555 { 4556 int4 la = cast(int4)a; 4557 int4 lb = cast(int4)b; 4558 int4 r = la & lb; 4559 int m = r.array[0] | r.array[1] | r.array[2] | r.array[3]; 4560 return (~m >> 31) & 1; 4561 } 4562 } 4563 unittest 4564 { 4565 __m128 A = _mm_setr_ps(-1, 1, -1, 1); 4566 __m128 B = _mm_setr_ps( 1, 1, -1, 1); 4567 __m128 C = _mm_setr_ps( 1, -1, 1, -1); 4568 assert(_mm_testz_ps(A, A) == 0); 4569 assert(_mm_testz_ps(A, B) == 0); 4570 assert(_mm_testz_ps(C, A) == 1); 4571 assert(_mm_testz_ps(C, B) == 1); 4572 } 4573 4574 /// Compute the bitwise AND of 256 bits (representing single-precision (32-bit) floating-point 4575 /// elements) in `a` and `b`, producing an intermediate 256-bit value, return 1 if the sign bit of 4576 /// each 32-bit element in the intermediate value is zero, otherwise return 0. 4577 /// In other words, return 1 if `a` and `b` don't both have a negative number at the same position.
4578 int _mm256_testz_ps (__m256 a, __m256 b) pure @safe 4579 { 4580 // PERF DMD 4581 static if (GDC_or_LDC_with_AVX) 4582 { 4583 return __builtin_ia32_vtestzps256(a, b); 4584 } 4585 else 4586 { 4587 int8 la = cast(int8)a; 4588 int8 lb = cast(int8)b; 4589 int8 r = la & lb; 4590 int m = r.array[0] | r.array[1] | r.array[2] | r.array[3] 4591 | r.array[4] | r.array[5] | r.array[6] | r.array[7]; 4592 return (~m >> 31) & 1; 4593 } 4594 } 4595 4596 /// Compute the bitwise AND of 256 bits (representing integer data) in `a` and `b`, 4597 /// and return 1 if the result is zero, otherwise return 0. 4598 /// In other words, test if all bits masked by `b` are 0 in `a`. 4599 int _mm256_testz_si256 (__m256i a, __m256i b) @trusted 4600 { 4601 // PERF DMD 4602 static if (GDC_with_AVX) 4603 { 4604 return __builtin_ia32_ptestz256(cast(long4)a, cast(long4)b); 4605 } 4606 else static if (LDC_with_AVX) 4607 { 4608 return __builtin_ia32_ptestz256(cast(long4)a, cast(long4)b); 4609 } 4610 else version(LDC) 4611 { 4612 // better to split than do vanilla (down to 8 inst in arm64) 4613 __m128i lo_a = _mm256_extractf128_si256!0(a); 4614 __m128i lo_b = _mm256_extractf128_si256!0(b); 4615 __m128i hi_a = _mm256_extractf128_si256!1(a); 4616 __m128i hi_b = _mm256_extractf128_si256!1(b); 4617 return _mm_testz_si128(lo_a, lo_b) & _mm_testz_si128(hi_a, hi_b); 4618 } 4619 else 4620 { 4621 __m256i c = a & b; 4622 long[4] zero = [0, 0, 0, 0]; 4623 return c.array == zero; 4624 } 4625 } 4626 unittest 4627 { 4628 __m256i A = _mm256_setr_epi32(0x01, 0x02, 0x04, 0xf8, 0x01, 0x02, 0x04, 0xf8); 4629 __m256i M1 = _mm256_setr_epi32(0xfe, 0xfd, 0x00, 0x07, 0xfe, 0xfd, 0x00, 0x07); 4630 __m256i M2 = _mm256_setr_epi32(0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00); 4631 assert(_mm256_testz_si256(A, A) == 0); 4632 assert(_mm256_testz_si256(A, M1) == 1); 4633 assert(_mm256_testz_si256(A, M2) == 0); 4634 } 4635 4636 /// Return vector of type __m256d with undefined elements. 4637 __m256d _mm256_undefined_pd () pure @safe 4638 { 4639 __m256d r = void; 4640 return r; 4641 } 4642 4643 /// Return vector of type __m256 with undefined elements. 4644 __m256 _mm256_undefined_ps () pure @safe 4645 { 4646 __m256 r = void; 4647 return r; 4648 } 4649 4650 /// Return vector of type __m256i with undefined elements. 4651 __m256i _mm256_undefined_si256 () pure @safe 4652 { 4653 __m256i r = void; 4654 return r; 4655 } 4656 4657 /// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of 4658 /// each 128-bit lane in `a` and `b`. 4659 __m256d _mm256_unpackhi_pd (__m256d a, __m256d b) pure @trusted 4660 { 4661 static if (LDC_with_optimizations) 4662 { 4663 enum ir = `%r = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 1, i32 5, i32 3, i32 7> 4664 ret <4 x double> %r`; 4665 return LDCInlineIR!(ir, double4, double4, double4)(a, b); 4666 } 4667 else static if (GDC_with_AVX) 4668 { 4669 return __builtin_ia32_unpckhpd256 (a, b); 4670 } 4671 else 4672 { 4673 __m256d r; 4674 r.ptr[0] = a.array[1]; 4675 r.ptr[1] = b.array[1]; 4676 r.ptr[2] = a.array[3]; 4677 r.ptr[3] = b.array[3]; 4678 return r; 4679 } 4680 } 4681 unittest 4682 { 4683 __m256d A = _mm256_setr_pd(1.0, 2, 3, 4); 4684 __m256d B = _mm256_setr_pd(5.0, 6, 7, 8); 4685 __m256d C = _mm256_unpackhi_pd(A, B); 4686 double[4] correct = [2.0, 6, 4, 8]; 4687 assert(C.array == correct); 4688 } 4689 4690 4691 /// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of 4692 /// each 128-bit lane in `a` and `b`.
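/// The interleaving pattern is `[a2, b2, a3, b3, a6, b6, a7, b7]` (a compact illustration of the per-lane selection).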
4693 __m256 _mm256_unpackhi_ps (__m256 a, __m256 b) pure @trusted 4694 { 4695 static if (LDC_with_optimizations) 4696 { 4697 enum ir = `%r = shufflevector <8 x float> %0, <8 x float> %1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15> 4698 ret <8 x float> %r`; 4699 return LDCInlineIR!(ir, float8, float8, float8)(a, b); 4700 } 4701 else static if (GDC_with_AVX) 4702 { 4703 return __builtin_ia32_unpckhps256 (a, b); 4704 } 4705 else 4706 { 4707 __m256 r; 4708 r.ptr[0] = a.array[2]; 4709 r.ptr[1] = b.array[2]; 4710 r.ptr[2] = a.array[3]; 4711 r.ptr[3] = b.array[3]; 4712 r.ptr[4] = a.array[6]; 4713 r.ptr[5] = b.array[6]; 4714 r.ptr[6] = a.array[7]; 4715 r.ptr[7] = b.array[7]; 4716 return r; 4717 } 4718 } 4719 unittest 4720 { 4721 __m256 A = _mm256_setr_ps(0.0f, 1, 2, 3, 4, 5, 6, 7); 4722 __m256 B = _mm256_setr_ps(8.0f, 9, 10, 11, 12, 13, 14, 15); 4723 __m256 C = _mm256_unpackhi_ps(A, B); 4724 float[8] correct = [2.0f, 10, 3, 11, 6, 14, 7, 15]; 4725 assert(C.array == correct); 4726 } 4727 4728 /// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of 4729 /// each 128-bit lane in `a` and `b`. 4730 __m256d _mm256_unpacklo_pd (__m256d a, __m256d b) 4731 { 4732 static if (LDC_with_optimizations) 4733 { 4734 enum ir = `%r = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 4, i32 2, i32 6> 4735 ret <4 x double> %r`; 4736 return LDCInlineIR!(ir, double4, double4, double4)(a, b); 4737 } 4738 else static if (GDC_with_AVX) 4739 { 4740 return __builtin_ia32_unpcklpd256 (a, b); 4741 } 4742 else 4743 { 4744 __m256d r; 4745 r.ptr[0] = a.array[0]; 4746 r.ptr[1] = b.array[0]; 4747 r.ptr[2] = a.array[2]; 4748 r.ptr[3] = b.array[2]; 4749 return r; 4750 } 4751 } 4752 unittest 4753 { 4754 __m256d A = _mm256_setr_pd(1.0, 2, 3, 4); 4755 __m256d B = _mm256_setr_pd(5.0, 6, 7, 8); 4756 __m256d C = _mm256_unpacklo_pd(A, B); 4757 double[4] correct = [1.0, 5, 3, 7]; 4758 assert(C.array == correct); 4759 } 4760 4761 /// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of 4762 /// each 128-bit lane in `a` and `b`. 4763 __m256 _mm256_unpacklo_ps (__m256 a, __m256 b) 4764 { 4765 static if (LDC_with_optimizations) 4766 { 4767 enum ir = `%r = shufflevector <8 x float> %0, <8 x float> %1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13> 4768 ret <8 x float> %r`; 4769 return LDCInlineIR!(ir, float8, float8, float8)(a, b); 4770 } 4771 else static if (GDC_with_AVX) 4772 { 4773 return __builtin_ia32_unpcklps256 (a, b); 4774 } 4775 else 4776 { 4777 __m256 r; 4778 r.ptr[0] = a.array[0]; 4779 r.ptr[1] = b.array[0]; 4780 r.ptr[2] = a.array[1]; 4781 r.ptr[3] = b.array[1]; 4782 r.ptr[4] = a.array[4]; 4783 r.ptr[5] = b.array[4]; 4784 r.ptr[6] = a.array[5]; 4785 r.ptr[7] = b.array[5]; 4786 return r; 4787 } 4788 } 4789 unittest 4790 { 4791 __m256 A = _mm256_setr_ps(0.0f, 1, 2, 3, 4, 5, 6, 7); 4792 __m256 B = _mm256_setr_ps(8.0f, 9, 10, 11, 12, 13, 14, 15); 4793 __m256 C = _mm256_unpacklo_ps(A, B); 4794 float[8] correct = [0.0f, 8, 1, 9, 4, 12, 5, 13]; 4795 assert(C.array == correct); 4796 } 4797 4798 /// Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in `a` and `b`. 4799 __m256d _mm256_xor_pd (__m256d a, __m256d b) pure @safe 4800 { 4801 return cast(__m256d)( cast(__m256i)a ^ cast(__m256i)b ); 4802 } 4803 4804 /// Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in `a` and `b`. 
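/// For instance, XOR with `_mm256_set1_ps(-0.0f)` flips the sign bit of every element.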
4805 __m256 _mm256_xor_ps (__m256 a, __m256 b) pure @safe 4806 { 4807 return cast(__m256)( cast(__m256i)a ^ cast(__m256i)b ); 4808 } 4809 /// Zero the contents of all XMM or YMM registers. 4810 void _mm256_zeroall () pure @safe 4811 { 4812 // PERF DMD needs to do it explicitly if AVX is ever used one day. 4813 4814 static if (GDC_with_AVX) 4815 { 4816 __builtin_ia32_vzeroall(); 4817 } 4818 else 4819 { 4820 // Do nothing. The transition penalty is supposed to be handled by the backend (e.g. LLVM). 4821 } 4822 } 4823 /// Zero the upper 128 bits of all YMM registers; the lower 128 bits are left unmodified. 4824 void _mm256_zeroupper () pure @safe 4825 { 4826 // PERF DMD needs to do it explicitly if AVX is ever used. 4827 4828 static if (GDC_with_AVX) 4829 { 4830 __builtin_ia32_vzeroupper(); 4831 } 4832 else 4833 { 4834 // Do nothing. The transition penalty is supposed to be handled by the backend (e.g. LLVM). 4835 } 4836 4837 } 4838 4839 /// Cast vector of type `__m128d` to type `__m256d`; the upper 128 bits of the result are zeroed. 4840 __m256d _mm256_zextpd128_pd256 (__m128d a) pure @trusted 4841 { 4842 __m256d r; 4843 r.ptr[0] = a.array[0]; 4844 r.ptr[1] = a.array[1]; 4845 r.ptr[2] = 0; 4846 r.ptr[3] = 0; 4847 return r; 4848 } 4849 unittest 4850 { 4851 __m256d R = _mm256_zextpd128_pd256(_mm_setr_pd(2.0, -3.0)); 4852 double[4] correct = [2.0, -3, 0, 0]; 4853 assert(R.array == correct); 4854 } 4855 4856 /// Cast vector of type `__m128` to type `__m256`; the upper 128 bits of the result are zeroed. 4857 __m256 _mm256_zextps128_ps256 (__m128 a) pure @trusted 4858 { 4859 double2 la = cast(double2)a; 4860 double4 r; 4861 r.ptr[0] = la.array[0]; 4862 r.ptr[1] = la.array[1]; 4863 r.ptr[2] = 0; 4864 r.ptr[3] = 0; 4865 return cast(__m256)r; 4866 } 4867 unittest 4868 { 4869 __m256 R = _mm256_zextps128_ps256(_mm_setr_ps(2.0, -3.0, 4, -5)); 4870 float[8] correct = [2.0, -3, 4, -5, 0, 0, 0, 0]; 4871 assert(R.array == correct); 4872 } 4873 4874 /// Cast vector of type `__m128i` to type `__m256i`; the upper 128 bits of the result are zeroed. 4875 __m256i _mm256_zextsi128_si256 (__m128i a) pure @trusted 4876 { 4877 long2 la = cast(long2)a; 4878 __m256i r; 4879 r.ptr[0] = la.array[0]; 4880 r.ptr[1] = la.array[1]; 4881 r.ptr[2] = 0; 4882 r.ptr[3] = 0; 4883 return r; 4884 } 4885 unittest 4886 { 4887 __m256i R = _mm256_zextsi128_si256(_mm_setr_epi64(-1, 99)); 4888 long[4] correct = [-1, 99, 0, 0]; 4889 assert(R.array == correct); 4890 }
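// A small illustrative check relating the three 256-bit integer tests above (a sketch only,
// not an additional intrinsic): `_mm256_testnzc_si256` is 1 exactly when both
// `_mm256_testz_si256` (the ZF test) and `_mm256_testc_si256` (the CF test) are 0.
unittest
{
    __m256i a = _mm256_setr_epi32(0x01, 0x02, 0x04, 0xf8, 0, 0, 0, 0);
    __m256i b = _mm256_setr_epi32(0x01, 0x40, 0x00, 0x00, 0, 0, 0, 0);
    int zf = _mm256_testz_si256(a, b); // 1 if (a & b) is all zeroes
    int cf = _mm256_testc_si256(a, b); // 1 if (~a & b) is all zeroes
    assert(_mm256_testnzc_si256(a, b) == ((zf == 0) && (cf == 0) ? 1 : 0));
}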