1 /** 2 * Internal stuff only, do not import. 3 * 4 * Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019. 5 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) 6 */ 7 module inteli.internals; 8 9 import inteli.types; 10 11 // The only math functions needed for intel-intrinsics 12 public import core.math: sqrt; // since it's an intrinsics 13 14 package: 15 nothrow: 16 @nogc: 17 18 19 version(GNU) 20 { 21 version (X86) 22 { 23 // For 32-bit x86, disable vector extensions with GDC. 24 // It just doesn't work well. 25 enum GDC_with_x86 = true; 26 enum GDC_with_MMX = false; 27 enum GDC_with_SSE = false; 28 enum GDC_with_SSE2 = false; 29 enum GDC_with_SSE3 = false; 30 enum GDC_with_SSSE3 = false; 31 enum GDC_with_SSE41 = false; 32 enum GDC_with_SSE42 = false; 33 enum GDC_with_AVX = false; 34 enum GDC_with_AVX2 = false; 35 enum GDC_with_SHA = false; 36 enum GDC_with_BMI2 = false; 37 } 38 else version (X86_64) 39 { 40 // GDC support uses extended inline assembly: 41 // https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html (general information and hints) 42 // https://gcc.gnu.org/onlinedocs/gcc/Simple-Constraints.html (binding variables to registers) 43 // https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html (x86 specific register short names) 44 45 public import core.simd: byte16, short8, int4, float4, double2; 46 47 // NOTE: These intrinsics are not available in every i386 and x86_64 CPU. 48 // For more info: https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/X86-Built-in-Functions.html 49 public import gcc.builtins; 50 51 // TODO: SSE and SSE2 should be truly optional instead, in the future, if we 52 // want to support other archs with GDC 53 54 enum GDC_with_x86 = true; 55 enum GDC_with_MMX = true; // We don't have a way to detect that at CT, but we assume it's there 56 enum GDC_with_SSE = true; // We don't have a way to detect that at CT, but we assume it's there 57 enum GDC_with_SSE2 = true; // We don't have a way to detect that at CT, but we assume it's there 58 59 static if (__VERSION__ >= 2100) // Starting at GDC 12.1 60 { 61 enum GDC_with_SSE3 = __traits(compiles, __builtin_ia32_haddps); 62 enum GDC_with_SSSE3 = __traits(compiles, __builtin_ia32_pmulhrsw128); 63 enum GDC_with_SSE41 = __traits(compiles, __builtin_ia32_dpps); 64 enum GDC_with_SSE42 = __traits(compiles, __builtin_ia32_pcmpgtq); 65 enum GDC_with_AVX = __traits(compiles, __builtin_ia32_vbroadcastf128_pd256); 66 enum GDC_with_AVX2 = __traits(compiles, __builtin_ia32_gathersiv2df); 67 enum GDC_with_BMI2 = __traits(compiles, __builtin_ia32_pext_si); 68 69 } 70 else 71 { 72 // Before GCC 11.3, no reliable way to detect instruction sets. 73 // We start above detection at GCC 12, with DMDFE 2.100, which 74 // is more conservative. 75 enum GDC_with_SSE3 = false; 76 enum GDC_with_SSSE3 = false; 77 enum GDC_with_SSE41 = false; 78 enum GDC_with_SSE42 = false; 79 enum GDC_with_AVX = false; 80 enum GDC_with_AVX2 = false; 81 enum GDC_with_BMI2 = false; 82 } 83 84 enum GDC_with_SHA = false; // TODO: detect that 85 } 86 else 87 { 88 enum GDC_with_x86 = false; 89 enum GDC_with_MMX = false; 90 enum GDC_with_SSE = false; 91 enum GDC_with_SSE2 = false; 92 enum GDC_with_SSE3 = false; 93 enum GDC_with_SSSE3 = false; 94 enum GDC_with_SSE41 = false; 95 enum GDC_with_SSE42 = false; 96 enum GDC_with_AVX = false; 97 enum GDC_with_AVX2 = false; 98 enum GDC_with_SHA = false; 99 enum GDC_with_BMI2 = false; 100 } 101 } 102 else 103 { 104 enum GDC_with_x86 = false; 105 enum GDC_with_MMX = false; 106 enum GDC_with_SSE = false; 107 enum GDC_with_SSE2 = false; 108 enum GDC_with_SSE3 = false; 109 enum GDC_with_SSSE3 = false; 110 enum GDC_with_SSE41 = false; 111 enum GDC_with_SSE42 = false; 112 enum GDC_with_AVX = false; 113 enum GDC_with_AVX2 = false; 114 enum GDC_with_SHA = false; 115 enum GDC_with_BMI2 = false; 116 } 117 118 version(LDC) 119 { 120 public import core.simd; 121 public import ldc.simd; 122 public import ldc.intrinsics; 123 public import ldc.llvmasm: __asm; 124 125 version (X86) 126 private enum bool some_x86 = true; 127 else version (X86_64) 128 private enum bool some_x86 = true; 129 else 130 private enum bool some_x86 = false; 131 132 // Since LDC 1.13, using the new ldc.llvmasm.__ir variants instead of inlineIR 133 static if (__VERSION__ >= 2083) 134 { 135 import ldc.llvmasm; 136 alias LDCInlineIR = __ir_pure; 137 138 // A version of inline IR with prefix/suffix didn't exist before LDC 1.13 139 alias LDCInlineIREx = __irEx_pure; 140 141 enum bool LDC_with_InlineIREx = true; 142 } 143 else 144 { 145 alias LDCInlineIR = inlineIR; 146 enum bool LDC_with_InlineIREx = false; 147 } 148 149 // This is used to disable LDC feature that are expensive at compile time: 150 // everything that relies on inline LLVM IR. 151 version(D_Optimized) 152 { 153 enum bool LDC_with_optimizations = true; 154 } 155 else 156 { 157 static if (__VERSION__ < 2101) 158 { 159 // See Issue #136, D_Optimized only appeared in DMDFE 2.101. 160 // Relying on this had terrible consequences. 161 enum bool LDC_with_optimizations = true; 162 } 163 else 164 enum bool LDC_with_optimizations = false; 165 } 166 167 version(ARM) 168 { 169 public import ldc.gccbuiltins_arm; 170 enum LDC_with_ARM32 = true; 171 enum LDC_with_ARM64 = false; 172 enum LDC_with_ARM64_CRC = false; 173 enum LDC_with_SSE = false; 174 enum LDC_with_SSE2 = false; 175 enum LDC_with_SSE3 = false; 176 enum LDC_with_SSSE3 = false; 177 enum LDC_with_SSE41 = false; 178 enum LDC_with_SSE42 = false; 179 enum LDC_with_CRC32 = false; 180 enum LDC_with_AVX = false; 181 enum LDC_with_AVX2 = false; 182 enum LDC_with_SHA = false; 183 enum LDC_with_BMI2 = false; 184 } 185 else version(AArch64) 186 { 187 public import ldc.gccbuiltins_aarch64; 188 enum LDC_with_ARM32 = false; 189 enum LDC_with_ARM64 = true; // implies "has Neon" 190 enum LDC_with_ARM64_CRC = __traits(targetHasFeature, "crc"); 191 enum LDC_with_SSE = false; 192 enum LDC_with_SSE2 = false; 193 enum LDC_with_SSE3 = false; 194 enum LDC_with_SSSE3 = false; 195 enum LDC_with_SSE41 = false; 196 enum LDC_with_SSE42 = false; 197 enum LDC_with_CRC32 = false; 198 enum LDC_with_AVX = false; 199 enum LDC_with_AVX2 = false; 200 enum LDC_with_SHA = false; 201 enum LDC_with_BMI2 = false; 202 } 203 else static if (some_x86) 204 { 205 public import ldc.gccbuiltins_x86; 206 207 // Workaround LDC 1.32.0 having NO builtins at all. 208 // See LDC issue 4347 https://github.com/ldc-developers/ldc/issues/4347 209 enum LDC_has_some_x86_builtins = __traits(compiles, __builtin_ia32_clflush); // This one must be available in all of LDC history. 210 211 static if (!LDC_has_some_x86_builtins) 212 { 213 // in case our __builtin_ia32_clflush workaround breaks 214 pragma(msg, "Warning: LDC v1.32.0 has no SIMD builtins. intel-intrinsics will use slow path. Please avoid LDC 1.32.0"); 215 } 216 217 enum LDC_with_ARM32 = false; 218 enum LDC_with_ARM64 = false; 219 enum LDC_with_ARM64_CRC = false; 220 enum LDC_with_SSE = __traits(targetHasFeature, "sse") && LDC_has_some_x86_builtins; 221 enum LDC_with_SSE2 = __traits(targetHasFeature, "sse2") && LDC_has_some_x86_builtins; 222 enum LDC_with_SSE3 = __traits(targetHasFeature, "sse3") && LDC_has_some_x86_builtins; 223 enum LDC_with_SSSE3 = __traits(targetHasFeature, "ssse3") && LDC_has_some_x86_builtins; 224 enum LDC_with_SSE41 = __traits(targetHasFeature, "sse4.1") && LDC_has_some_x86_builtins; 225 enum LDC_with_SSE42 = __traits(targetHasFeature, "sse4.2") && LDC_has_some_x86_builtins; 226 227 // Since LDC 1.30, crc32 is a separate (and sufficient) attribute from sse4.2 228 // As of Jan 2023, GDC doesn't make that distinction, -msse4.2 includes -mcrc32 for GDC. 229 static if (__VERSION__ >= 2100) 230 { 231 enum LDC_with_CRC32 = __traits(targetHasFeature, "crc32") && LDC_has_some_x86_builtins; 232 } 233 else 234 { 235 enum LDC_with_CRC32 = __traits(targetHasFeature, "sse4.2") && LDC_has_some_x86_builtins; // crc32 used to be included in sse4.2 236 } 237 238 enum LDC_with_AVX = __traits(targetHasFeature, "avx") && LDC_has_some_x86_builtins; 239 enum LDC_with_AVX2 = __traits(targetHasFeature, "avx2") && LDC_has_some_x86_builtins; 240 enum LDC_with_SHA = __traits(targetHasFeature, "sha") && LDC_has_some_x86_builtins; 241 enum LDC_with_BMI2 = __traits(targetHasFeature, "bmi2") && LDC_has_some_x86_builtins; 242 } 243 else 244 { 245 enum LDC_with_ARM32 = false; 246 enum LDC_with_ARM64 = false; 247 enum LDC_with_ARM64_CRC = false; 248 enum LDC_with_SSE = false; 249 enum LDC_with_SSE2 = false; 250 enum LDC_with_SSE3 = false; 251 enum LDC_with_SSSE3 = false; 252 enum LDC_with_SSE41 = false; 253 enum LDC_with_SSE42 = false; 254 enum LDC_with_CRC32 = false; 255 enum LDC_with_AVX = false; 256 enum LDC_with_AVX2 = false; 257 enum LDC_with_SHA = false; 258 enum LDC_with_BMI2 = false; 259 } 260 261 // Should we use inline x86 assembly with DMD syntax, in LDC? 262 version(D_InlineAsm_X86) 263 { 264 enum LDC_with_32b_x86_asm = LDC_with_SSE2; // if no SSE support, disable the x86 asm code path 265 enum LDC_with_64b_x86_asm = false; 266 } 267 else version(D_InlineAsm_X86_64) 268 { 269 enum LDC_with_32b_x86_asm = false; 270 enum LDC_with_64b_x86_asm = LDC_with_SSE2; 271 } 272 else 273 { 274 enum LDC_with_32b_x86_asm = false; 275 enum LDC_with_64b_x86_asm = false; 276 } 277 } 278 else 279 { 280 enum LDC_with_ARM32 = false; 281 enum LDC_with_ARM64 = false; 282 enum LDC_with_ARM64_CRC = false; 283 enum LDC_with_SSE = false; 284 enum LDC_with_SSE2 = false; 285 enum LDC_with_SSE3 = false; 286 enum LDC_with_SSSE3 = false; 287 enum LDC_with_SSE41 = false; 288 enum LDC_with_SSE42 = false; 289 enum LDC_with_CRC32 = false; 290 enum LDC_with_AVX = false; 291 enum LDC_with_AVX2 = false; 292 enum LDC_with_SHA = false; 293 enum LDC_with_BMI2 = false; 294 enum LDC_with_InlineIREx = false; 295 enum bool LDC_with_optimizations = false; 296 enum bool LDC_with_32b_x86_asm = false; 297 enum bool LDC_with_64b_x86_asm = false; 298 } 299 enum LDC_with_x86_asm = LDC_with_32b_x86_asm || LDC_with_64b_x86_asm; 300 301 302 enum LDC_with_ARM = LDC_with_ARM32 | LDC_with_ARM64; 303 304 version(DigitalMars) 305 { 306 version(D_InlineAsm_X86) 307 enum DMD_with_asm = true; 308 else version(D_InlineAsm_X86_64) 309 enum DMD_with_asm = true; 310 else 311 enum DMD_with_asm = false; 312 313 version(D_InlineAsm_X86) 314 enum DMD_with_32bit_asm = DMD_with_asm; // sometimes you want a 32-bit DMD only solution 315 else 316 enum DMD_with_32bit_asm = false; 317 318 version (D_SIMD) 319 { 320 enum DMD_with_DSIMD = !SSESizedVectorsAreEmulated; 321 322 // Going further, does DMD has SSE4.1 through -mcpu? 323 static if (DMD_with_DSIMD) 324 enum bool DMD_with_DSIMD_and_SSE41 = __traits(compiles, int4(0) * int4(0)); 325 else 326 enum bool DMD_with_DSIMD_and_SSE41 = false; 327 328 // No DMD way to detect those instruction sets => pessimize 329 // would be cool to have a way to detect support for this at CT 330 enum DMD_with_DSIMD_and_SSE3 = DMD_with_DSIMD_and_SSE41; 331 enum DMD_with_DSIMD_and_SSSE3 = DMD_with_DSIMD_and_SSE41; 332 333 version(D_AVX) 334 enum DMD_with_DSIMD_and_AVX = true; 335 else 336 enum DMD_with_DSIMD_and_AVX = false; 337 338 version(D_AVX2) 339 enum DMD_with_DSIMD_and_AVX2 = true; 340 else 341 enum DMD_with_DSIMD_and_AVX2 = false; 342 343 enum DMD_with_DSIMD_and_SSE42 = DMD_with_DSIMD_and_AVX; 344 } 345 else 346 { 347 enum DMD_with_DSIMD = false; 348 enum DMD_with_DSIMD_and_SSE3 = false; 349 enum DMD_with_DSIMD_and_SSSE3 = false; 350 enum DMD_with_DSIMD_and_SSE41 = false; 351 enum DMD_with_DSIMD_and_SSE42 = false; 352 enum DMD_with_DSIMD_and_AVX = false; 353 enum DMD_with_DSIMD_and_AVX2 = false; 354 } 355 } 356 else 357 { 358 enum DMD_with_asm = false; 359 enum DMD_with_32bit_asm = false; 360 enum DMD_with_DSIMD = false; 361 enum DMD_with_DSIMD_and_SSE3 = false; 362 enum DMD_with_DSIMD_and_SSSE3 = false; 363 enum DMD_with_DSIMD_and_SSE41 = false; 364 enum DMD_with_DSIMD_and_SSE42 = false; 365 enum DMD_with_DSIMD_and_AVX = false; 366 enum DMD_with_DSIMD_and_AVX2 = false; 367 } 368 369 370 // Sometimes, can be helpful to merge builtin code, however keep in mind that 371 // LDC and GDC builtins often subtly diverge, wrt. unsigned vs signed vectors, 372 // return types, purity... test it in Godbolt! this is safer with float and double intrinsics. 373 enum GDC_or_LDC_with_SSE = GDC_with_SSE || LDC_with_SSE; 374 enum GDC_or_LDC_with_SSE2 = GDC_with_SSE2 || LDC_with_SSE2; 375 enum GDC_or_LDC_with_SSE3 = GDC_with_SSE3 || LDC_with_SSE3; 376 enum GDC_or_LDC_with_SSE41 = GDC_with_SSE41 || LDC_with_SSE41; 377 enum GDC_or_LDC_with_SSE42 = GDC_with_SSE42 || LDC_with_SSE42; 378 379 enum GDC_or_LDC_with_AVX = GDC_with_AVX || LDC_with_AVX; 380 enum GDC_or_LDC_with_AVX2 = GDC_with_AVX2 || LDC_with_AVX2; 381 enum GDC_or_LDC_with_SHA = GDC_with_SHA || LDC_with_SHA; 382 enum GDC_or_LDC_with_BMI2 = GDC_with_BMI2 || LDC_with_BMI2; 383 384 static if (__VERSION__ >= 2102) 385 { 386 enum SIMD_COMPARISON_MASKS_8B = !MMXSizedVectorsAreEmulated; // can do < <= => > == with builtin 8 bytes __vectors. 387 enum SIMD_COMPARISON_MASKS_16B = !SSESizedVectorsAreEmulated; // can do < <= => > == with builtin 16 bytes __vectors. 388 enum SIMD_COMPARISON_MASKS_32B = !AVXSizedVectorsAreEmulated; // can do < <= => > == with builtin 32 bytes __vectors. 389 } 390 else 391 { 392 enum SIMD_COMPARISON_MASKS_8B = false; 393 enum SIMD_COMPARISON_MASKS_16B = false; 394 enum SIMD_COMPARISON_MASKS_32B = false; 395 } 396 397 398 static if (LDC_with_ARM32) 399 { 400 package uint arm_get_fpcr() nothrow @nogc @trusted 401 { 402 return __builtin_arm_get_fpscr(); 403 } 404 405 package void arm_set_fpcr(uint cw) nothrow @nogc @trusted 406 { 407 __builtin_arm_set_fpscr(cw); 408 } 409 } 410 411 static if (LDC_with_ARM64) 412 { 413 pragma(LDC_intrinsic, "llvm.aarch64.get.fpcr") 414 long __builtin_aarch64_get_fpcr() pure nothrow @nogc @safe; 415 416 package uint arm_get_fpcr() pure nothrow @nogc @trusted 417 { 418 // LLVM intrinsic "llvm.aarch64.get.fpcr" seems buggy and doesn't return FPCR 419 return __asm!uint("mrs $0, fpcr", "=r"); 420 } 421 422 package void arm_set_fpcr(uint cw) nothrow @nogc @trusted 423 { 424 // Note: there doesn't seem to be an intrinsic in LLVM to set FPCR. 425 long save_x2; 426 __asm!void("str x2, $1 \n" ~ 427 "ldr w2, $0 \n" ~ 428 "msr fpcr, x2 \n" ~ 429 "ldr x2, $1 " , "m,m", cw, &save_x2); 430 } 431 } 432 433 434 // For internal use only, since public API deals with a x86 semantic emulation 435 enum uint _MM_ROUND_NEAREST_ARM = 0x00000000; 436 enum uint _MM_ROUND_DOWN_ARM = 0x00800000; 437 enum uint _MM_ROUND_UP_ARM = 0x00400000; 438 enum uint _MM_ROUND_TOWARD_ZERO_ARM = 0x00C00000; 439 enum uint _MM_ROUND_MASK_ARM = 0x00C00000; 440 enum uint _MM_FLUSH_ZERO_MASK_ARM = 0x01000000; 441 442 443 // 444 // <ROUNDING> 445 // 446 // Why is that there? For DMD, we cannot use rint because _MM_SET_ROUNDING_MODE 447 // doesn't change the FPU rounding mode, and isn't expected to do so. 448 // So we devised these rounding function to help having consistent rounding between 449 // LDC and DMD. It's important that DMD uses whatever is in MXCSR to round. 450 // 451 // Note: There is no MXCSR in ARM. But there is fpcr/fpscr that implements similar 452 // functionality. 453 // https://developer.arm.com/documentation/dui0068/b/vector-floating-point-programming/vfp-system-registers/fpscr--the-floating-point-status-and-control-register 454 // We use fpcr/fpscr since it's thread-local, so we can emulate those x86 conversion albeit slowly. 455 456 int convertFloatToInt32UsingMXCSR(float value) @trusted 457 { 458 int result; 459 version(GNU) 460 { 461 asm pure nothrow @nogc @trusted 462 { 463 "cvtss2si %1, %0\n": "=r"(result) : "x" (value); 464 } 465 } 466 else static if (LDC_with_ARM32) 467 { 468 result = __asm!int(`vldr s2, $1 469 vcvtr.s32.f32 s2, s2 470 vmov $0, s2`, "=r,m,~{s2}", value); 471 } 472 else static if (LDC_with_ARM64) 473 { 474 // Get current rounding mode. 475 uint fpscr = arm_get_fpcr(); 476 477 switch(fpscr & _MM_ROUND_MASK_ARM) 478 { 479 default: 480 case _MM_ROUND_NEAREST_ARM: result = vcvtns_s32_f32(value); break; 481 case _MM_ROUND_DOWN_ARM: result = vcvtms_s32_f32(value); break; 482 case _MM_ROUND_UP_ARM: result = vcvtps_s32_f32(value); break; 483 case _MM_ROUND_TOWARD_ZERO_ARM: result = vcvts_s32_f32(value); break; 484 } 485 } 486 else 487 { 488 asm pure nothrow @nogc @trusted 489 { 490 cvtss2si EAX, value; 491 mov result, EAX; 492 } 493 } 494 return result; 495 } 496 497 int convertDoubleToInt32UsingMXCSR(double value) @trusted 498 { 499 int result; 500 version(GNU) 501 { 502 asm pure nothrow @nogc @trusted 503 { 504 "cvtsd2si %1, %0\n": "=r"(result) : "x" (value); 505 } 506 } 507 else static if (LDC_with_ARM32) 508 { 509 result = __asm!int(`vldr d2, $1 510 vcvtr.s32.f64 s2, d2 511 vmov $0, s2`, "=r,m,~{s2},~{d2}", value); 512 } 513 else static if (LDC_with_ARM64) 514 { 515 // Get current rounding mode. 516 uint fpscr = arm_get_fpcr(); 517 518 switch(fpscr & _MM_ROUND_MASK_ARM) 519 { 520 default: 521 case _MM_ROUND_NEAREST_ARM: result = vcvtns_s32_f64(value); break; 522 case _MM_ROUND_DOWN_ARM: result = vcvtms_s32_f64(value); break; 523 case _MM_ROUND_UP_ARM: result = vcvtps_s32_f64(value); break; 524 case _MM_ROUND_TOWARD_ZERO_ARM: result = vcvts_s32_f64(value); break; 525 } 526 } 527 else 528 { 529 asm pure nothrow @nogc @trusted 530 { 531 cvtsd2si EAX, value; 532 mov result, EAX; 533 } 534 } 535 return result; 536 } 537 538 long convertFloatToInt64UsingMXCSR(float value) @trusted 539 { 540 static if (LDC_with_ARM32) 541 { 542 // We have to resort to libc since 32-bit ARM 543 // doesn't seem to have 64-bit registers. 544 545 uint fpscr = arm_get_fpcr(); // Get current rounding mode. 546 547 // Note: converting to double precision else rounding could be different for large integers 548 double asDouble = value; 549 550 switch(fpscr & _MM_ROUND_MASK_ARM) 551 { 552 default: 553 case _MM_ROUND_NEAREST_ARM: return cast(long)(llvm_round(asDouble)); 554 case _MM_ROUND_DOWN_ARM: return cast(long)(llvm_floor(asDouble)); 555 case _MM_ROUND_UP_ARM: return cast(long)(llvm_ceil(asDouble)); 556 case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(asDouble); 557 } 558 } 559 else static if (LDC_with_ARM64) 560 { 561 uint fpscr = arm_get_fpcr(); 562 563 switch(fpscr & _MM_ROUND_MASK_ARM) 564 { 565 default: 566 case _MM_ROUND_NEAREST_ARM: return vcvtns_s64_f32(value); 567 case _MM_ROUND_DOWN_ARM: return vcvtms_s64_f32(value); 568 case _MM_ROUND_UP_ARM: return vcvtps_s64_f32(value); 569 case _MM_ROUND_TOWARD_ZERO_ARM: return vcvts_s64_f32(value); 570 } 571 } 572 // 64-bit can use an SSE instruction 573 else version(D_InlineAsm_X86_64) 574 { 575 long result; 576 version(LDC) // work-around for " Data definition directives inside inline asm are not supported yet." 577 { 578 asm pure nothrow @nogc @trusted 579 { 580 movss XMM0, value; 581 cvtss2si RAX, XMM0; 582 mov result, RAX; 583 } 584 } 585 else 586 { 587 asm pure nothrow @nogc @trusted 588 { 589 movss XMM0, value; 590 db 0xf3; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtss2si RAX, XMM0 (DMD refuses to emit) 591 mov result, RAX; 592 } 593 } 594 return result; 595 } 596 else version(D_InlineAsm_X86) 597 { 598 // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int 599 // This leads to an unfortunate FPU sequence in every C++ compiler. 600 // See: https://godbolt.org/z/vZym77 601 602 // Get current MXCSR rounding 603 uint sseRounding; 604 ushort savedFPUCW; 605 ushort newFPUCW; 606 long result; 607 asm pure nothrow @nogc @trusted 608 { 609 stmxcsr sseRounding; 610 fld value; 611 fnstcw savedFPUCW; 612 mov AX, savedFPUCW; 613 and AX, 0xf3ff; // clear FPU rounding bits 614 movzx ECX, word ptr sseRounding; 615 and ECX, 0x6000; // only keep SSE rounding bits 616 shr ECX, 3; 617 or AX, CX; // make a new control word for FPU with SSE bits 618 mov newFPUCW, AX; 619 fldcw newFPUCW; 620 fistp qword ptr result; // convert, respecting MXCSR (but not other control word things) 621 fldcw savedFPUCW; 622 } 623 return result; 624 } 625 else static if (GDC_with_x86) 626 { 627 version(X86_64) // 64-bit can just use the right instruction 628 { 629 static assert(GDC_with_SSE); 630 __m128 A; 631 A.ptr[0] = value; 632 return __builtin_ia32_cvtss2si64 (A); 633 } 634 else version(X86) // 32-bit 635 { 636 // This is untested! 637 uint sseRounding; 638 ushort savedFPUCW; 639 ushort newFPUCW; 640 long result; 641 asm pure nothrow @nogc @trusted 642 { 643 "stmxcsr %1;\n" ~ 644 "fld %2;\n" ~ 645 "fnstcw %3;\n" ~ 646 "movw %3, %%ax;\n" ~ 647 "andw $0xf3ff, %%ax;\n" ~ 648 "movzwl %1, %%ecx;\n" ~ 649 "andl $0x6000, %%ecx;\n" ~ 650 "shrl $3, %%ecx;\n" ~ 651 "orw %%cx, %%ax\n" ~ 652 "movw %%ax, %4;\n" ~ 653 "fldcw %4;\n" ~ 654 "fistpll %0;\n" ~ 655 "fldcw %3;\n" 656 : "=m"(result) // %0 657 : "m" (sseRounding), 658 "f" (value), 659 "m" (savedFPUCW), 660 "m" (newFPUCW) 661 : "eax", "ecx", "st"; 662 } 663 return result; 664 } 665 else 666 static assert(false); 667 } 668 else 669 static assert(false); 670 } 671 672 673 ///ditto 674 long convertDoubleToInt64UsingMXCSR(double value) @trusted 675 { 676 static if (LDC_with_ARM32) 677 { 678 // We have to resort to libc since 32-bit ARM 679 // doesn't seem to have 64-bit registers. 680 uint fpscr = arm_get_fpcr(); // Get current rounding mode. 681 switch(fpscr & _MM_ROUND_MASK_ARM) 682 { 683 default: 684 case _MM_ROUND_NEAREST_ARM: return cast(long)(llvm_round(value)); 685 case _MM_ROUND_DOWN_ARM: return cast(long)(llvm_floor(value)); 686 case _MM_ROUND_UP_ARM: return cast(long)(llvm_ceil(value)); 687 case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(value); 688 } 689 } 690 else static if (LDC_with_ARM64) 691 { 692 // Get current rounding mode. 693 uint fpscr = arm_get_fpcr(); 694 695 switch(fpscr & _MM_ROUND_MASK_ARM) 696 { 697 default: 698 case _MM_ROUND_NEAREST_ARM: return vcvtns_s64_f64(value); 699 case _MM_ROUND_DOWN_ARM: return vcvtms_s64_f64(value); 700 case _MM_ROUND_UP_ARM: return vcvtps_s64_f64(value); 701 case _MM_ROUND_TOWARD_ZERO_ARM: return vcvts_s64_f64(value); 702 } 703 } 704 // 64-bit can use an SSE instruction 705 else version(D_InlineAsm_X86_64) 706 { 707 long result; 708 version(LDC) // work-around for "Data definition directives inside inline asm are not supported yet." 709 { 710 asm pure nothrow @nogc @trusted 711 { 712 movsd XMM0, value; 713 cvtsd2si RAX, XMM0; 714 mov result, RAX; 715 } 716 } 717 else 718 { 719 asm pure nothrow @nogc @trusted 720 { 721 movsd XMM0, value; 722 db 0xf2; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtsd2si RAX, XMM0 (DMD refuses to emit) 723 mov result, RAX; 724 } 725 } 726 return result; 727 } 728 else version(D_InlineAsm_X86) 729 { 730 // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int 731 // This leads to an unfortunate FPU sequence in every C++ compiler. 732 // See: https://godbolt.org/z/vZym77 733 734 // Get current MXCSR rounding 735 uint sseRounding; 736 ushort savedFPUCW; 737 ushort newFPUCW; 738 long result; 739 asm pure nothrow @nogc @trusted 740 { 741 stmxcsr sseRounding; 742 fld value; 743 fnstcw savedFPUCW; 744 mov AX, savedFPUCW; 745 and AX, 0xf3ff; 746 movzx ECX, word ptr sseRounding; 747 and ECX, 0x6000; 748 shr ECX, 3; 749 or AX, CX; 750 mov newFPUCW, AX; 751 fldcw newFPUCW; 752 fistp result; 753 fldcw savedFPUCW; 754 } 755 return result; 756 } 757 else static if (GDC_with_x86) 758 { 759 version(X86_64) 760 { 761 static assert(GDC_with_SSE2); 762 __m128d A; 763 A.ptr[0] = value; 764 return __builtin_ia32_cvtsd2si64 (A); 765 } 766 else 767 { 768 // This is untested! 769 uint sseRounding; 770 ushort savedFPUCW; 771 ushort newFPUCW; 772 long result; 773 asm pure nothrow @nogc @trusted 774 { 775 "stmxcsr %1;\n" ~ 776 "fld %2;\n" ~ 777 "fnstcw %3;\n" ~ 778 "movw %3, %%ax;\n" ~ 779 "andw $0xf3ff, %%ax;\n" ~ 780 "movzwl %1, %%ecx;\n" ~ 781 "andl $0x6000, %%ecx;\n" ~ 782 "shrl $3, %%ecx;\n" ~ 783 "orw %%cx, %%ax\n" ~ 784 "movw %%ax, %4;\n" ~ 785 "fldcw %4;\n" ~ 786 "fistpll %0;\n" ~ 787 "fldcw %3;\n" 788 : "=m"(result) // %0 789 : "m" (sseRounding), 790 "t" (value), 791 "m" (savedFPUCW), 792 "m" (newFPUCW) 793 : "eax", "ecx", "st"; 794 } 795 return result; 796 } 797 } 798 else 799 static assert(false); 800 } 801 802 // 803 // </ROUNDING> 804 // 805 806 807 // using the Intel terminology here 808 809 byte saturateSignedWordToSignedByte(short value) pure @safe 810 { 811 if (value > 127) value = 127; 812 if (value < -128) value = -128; 813 return cast(byte) value; 814 } 815 816 ubyte saturateSignedWordToUnsignedByte(short value) pure @safe 817 { 818 if (value > 255) value = 255; 819 if (value < 0) value = 0; 820 return cast(ubyte) value; 821 } 822 823 short saturateSignedIntToSignedShort(int value) pure @safe 824 { 825 if (value > 32767) value = 32767; 826 if (value < -32768) value = -32768; 827 return cast(short) value; 828 } 829 830 ushort saturateSignedIntToUnsignedShort(int value) pure @safe 831 { 832 if (value > 65535) value = 65535; 833 if (value < 0) value = 0; 834 return cast(ushort) value; 835 } 836 837 unittest // test saturate operations 838 { 839 assert( saturateSignedWordToSignedByte(32000) == 127); 840 assert( saturateSignedWordToUnsignedByte(32000) == 255); 841 assert( saturateSignedWordToSignedByte(-4000) == -128); 842 assert( saturateSignedWordToUnsignedByte(-4000) == 0); 843 assert( saturateSignedIntToSignedShort(32768) == 32767); 844 assert( saturateSignedIntToUnsignedShort(32768) == 32768); 845 assert( saturateSignedIntToSignedShort(-32769) == -32768); 846 assert( saturateSignedIntToUnsignedShort(-32769) == 0); 847 } 848 849 version(unittest) 850 { 851 // This is just for debugging tests 852 import core.stdc.stdio: printf; 853 854 // printing vectors for implementation 855 // Note: you can override `pure` within a `debug` clause 856 857 void _mm_print_pi64(__m64 v) @trusted 858 { 859 long1 vl = cast(long1)v; 860 printf("%lld\n", vl.array[0]); 861 } 862 863 void _mm_print_pi32(__m64 v) @trusted 864 { 865 int[2] C = (cast(int2)v).array; 866 printf("%d %d\n", C[0], C[1]); 867 } 868 869 void _mm_print_pi16(__m64 v) @trusted 870 { 871 short[4] C = (cast(short4)v).array; 872 printf("%d %d %d %d\n", C[0], C[1], C[2], C[3]); 873 } 874 875 void _mm_print_pi8(__m64 v) @trusted 876 { 877 byte[8] C = (cast(byte8)v).array; 878 printf("%d %d %d %d %d %d %d %d\n", 879 C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7]); 880 } 881 882 void _mm_print_epi64(__m128i v) @trusted 883 { 884 long2 vl = cast(long2)v; 885 printf("%lld %lld\n", vl.array[0], vl.array[1]); 886 } 887 888 void _mm_print_epi32(__m128i v) @trusted 889 { 890 printf("%d %d %d %d\n", 891 v.array[0], v.array[1], v.array[2], v.array[3]); 892 } 893 894 void _mm_print_epi16(__m128i v) @trusted 895 { 896 short[8] C = (cast(short8)v).array; 897 printf("%d %d %d %d %d %d %d %d\n", 898 C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7]); 899 } 900 901 void _mm_print_epi8(__m128i v) @trusted 902 { 903 byte[16] C = (cast(byte16)v).array; 904 printf("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n", 905 C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7], C[8], C[9], C[10], C[11], C[12], C[13], C[14], C[15]); 906 } 907 908 void _mm_print_ps(__m128 v) @trusted 909 { 910 // %g because %f can conceal very small numbers and prints zero instead 911 float[4] C = (cast(float4)v).array; 912 printf("%g %g %g %g\n", C[0], C[1], C[2], C[3]); 913 } 914 915 void _mm_print_pd(__m128d v) @trusted 916 { 917 double[2] C = (cast(double2)v).array; 918 printf("%f %f\n", C[0], C[1]); 919 } 920 921 void _mm256_print_pd(__m256d v) @trusted 922 { 923 // %g because %f can conceal very small numbers and prints zero instead 924 printf("%g %g %g %g\n", v.array[0], v.array[1], v.array[2], v.array[3]); 925 } 926 927 void _mm256_print_ps(__m256 v) @trusted 928 { 929 // %g because %f can conceal very small numbers and prints zero instead 930 printf("%g %g %g %g %g %g %g %g\n", 931 v.array[0], v.array[1], v.array[2], v.array[3], 932 v.array[4], v.array[5], v.array[6], v.array[7]); 933 } 934 935 void _mm256_print_epi16(__m256i v) @trusted 936 { 937 short16 vl = cast(short16)v; 938 printf("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n", 939 vl.array[0], vl.array[1], vl.array[2], vl.array[3], 940 vl.array[4], vl.array[5], vl.array[6], vl.array[7], 941 vl.array[8], vl.array[9], vl.array[10], vl.array[11], 942 vl.array[12], vl.array[13], vl.array[14], vl.array[15]); 943 } 944 945 void _mm256_print_epi32(__m256i v) @trusted 946 { 947 int8 vl = cast(int8)v; 948 printf("%d %d %d %d %d %d %d %d\n", vl.array[0], vl.array[1], vl.array[2], vl.array[3], 949 vl.array[4], vl.array[5], vl.array[6], vl.array[7]); 950 } 951 952 void _mm256_print_epi64(__m256i v) @trusted 953 { 954 long4 vl = cast(long4)v; 955 printf("%lld %lld %lld %lld\n", vl.array[0], vl.array[1], vl.array[2], vl.array[3]); 956 } 957 958 void _mm256_print_epi8(__m256i v) @trusted 959 { 960 byte[32] C = (cast(byte32)v).array; 961 printf("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n", 962 C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7], 963 C[8], C[9], C[10], C[11], C[12], C[13], C[14], C[15], 964 C[16], C[17], C[18], C[19], C[20], C[21], C[22], C[23], 965 C[24], C[25], C[26], C[27], C[28], C[29], C[30], C[31]); 966 967 } 968 } 969 970 971 // 972 // <FLOATING-POINT COMPARISONS> 973 // 974 // Note: `ldc.simd` cannot express all nuances of FP comparisons, so we 975 // need different IR generation. 976 977 enum FPComparison 978 { 979 false_,// always false 980 oeq, // ordered and equal 981 ogt, // ordered and greater than 982 oge, // ordered and greater than or equal 983 olt, // ordered and less than 984 ole, // ordered and less than or equal 985 one, // ordered and not equal 986 ord, // ordered (no nans) 987 ueq, // unordered or equal 988 ugt, // unordered or greater than ("nle") 989 uge, // unordered or greater than or equal ("nlt") 990 ult, // unordered or less than ("nge") 991 ule, // unordered or less than or equal ("ngt") 992 une, // unordered or not equal ("neq") 993 uno, // unordered (either nans) 994 true_, // always true 995 } 996 997 private static immutable string[FPComparison.max+1] FPComparisonToString = 998 [ 999 "false", 1000 "oeq", 1001 "ogt", 1002 "oge", 1003 "olt", 1004 "ole", 1005 "one", 1006 "ord", 1007 "ueq", 1008 "ugt", 1009 "uge", 1010 "ult", 1011 "ule", 1012 "une", 1013 "uno", 1014 "true" 1015 ]; 1016 1017 // AVX FP comparison to FPComparison 1018 FPComparison mapAVXFPComparison(int imm8) pure @safe 1019 { 1020 // Always map on non-signalling 1021 static immutable FPComparison[16] mapping = 1022 [ 1023 FPComparison.oeq, // _CMP_EQ_OQ 1024 FPComparison.olt, // _CMP_LT_OS 1025 FPComparison.ole, // _CMP_LE_OS 1026 FPComparison.uno, // _CMP_UNORD_Q 1027 FPComparison.une, // _CMP_NEQ_UQ // TODO does it mean net-equal OR unordered? 1028 FPComparison.uge, // _CMP_NLT_US 1029 FPComparison.ugt, // _CMP_NLE_US 1030 FPComparison.ord, // _CMP_ORD_Q 1031 FPComparison.ueq, // _CMP_EQ_UQ 1032 FPComparison.ult, // _CMP_NGE_US 1033 FPComparison.ule, // _CMP_NGT_US 1034 FPComparison.false_,// _CMP_FALSE_OQ 1035 FPComparison.one, // _CMP_NEQ_OQ 1036 FPComparison.oge, // _CMP_GE_OS 1037 FPComparison.ogt, // _CMP_GT_OS 1038 FPComparison.true_ // _CMP_TRUE_UQ 1039 ]; 1040 1041 return mapping[imm8 & 0x0f]; // note: signalling NaN information is mixed up 1042 } 1043 1044 // Individual float comparison: returns -1 for true or 0 for false. 1045 // Useful for DMD and testing 1046 private bool compareFloat(T)(FPComparison comparison, T a, T b) pure @safe 1047 { 1048 bool unordered = isnan(a) || isnan(b); 1049 final switch(comparison) with(FPComparison) 1050 { 1051 case false_: return false; 1052 case oeq: return a == b; 1053 case ogt: return a > b; 1054 case oge: return a >= b; 1055 case olt: return a < b; 1056 case ole: return a <= b; 1057 case one: return !unordered && (a != b); // NaN with != always yields true 1058 case ord: return !unordered; 1059 case ueq: return unordered || (a == b); 1060 case ugt: return unordered || (a > b); 1061 case uge: return unordered || (a >= b); 1062 case ult: return unordered || (a < b); 1063 case ule: return unordered || (a <= b); 1064 case une: return (a != b); // NaN with != always yields true 1065 case uno: return unordered; 1066 case true_: return true; 1067 } 1068 } 1069 1070 static if (LDC_with_optimizations) // this save time for bigger projects, since LDCInlineIR gets more expensive there. 1071 { 1072 /// Provides packed float comparisons 1073 package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @safe 1074 { 1075 enum ir = ` 1076 %cmp = fcmp `~ FPComparisonToString[comparison] ~` <4 x float> %0, %1 1077 %r = sext <4 x i1> %cmp to <4 x i32> 1078 ret <4 x i32> %r`; 1079 1080 return LDCInlineIR!(ir, int4, float4, float4)(a, b); 1081 } 1082 1083 ///ditto 1084 package int8 cmpps256(FPComparison comparison)(float8 a, float8 b) pure @safe 1085 { 1086 enum ir = ` 1087 %cmp = fcmp `~ FPComparisonToString[comparison] ~` <8 x float> %0, %1 1088 %r = sext <8 x i1> %cmp to <8 x i32> 1089 ret <8 x i32> %r`; 1090 return LDCInlineIR!(ir, int8, float8, float8)(a, b); 1091 } 1092 1093 /// Provides packed double comparisons 1094 package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @safe 1095 { 1096 enum ir = ` 1097 %cmp = fcmp `~ FPComparisonToString[comparison] ~` <2 x double> %0, %1 1098 %r = sext <2 x i1> %cmp to <2 x i64> 1099 ret <2 x i64> %r`; 1100 1101 return LDCInlineIR!(ir, long2, double2, double2)(a, b); 1102 } 1103 1104 ///ditto 1105 package long4 cmppd256(FPComparison comparison)(double4 a, double4 b) pure @safe 1106 { 1107 enum ir = ` 1108 %cmp = fcmp `~ FPComparisonToString[comparison] ~` <4 x double> %0, %1 1109 %r = sext <4 x i1> %cmp to <4 x i64> 1110 ret <4 x i64> %r`; 1111 return LDCInlineIR!(ir, long4, double4, double4)(a, b); 1112 } 1113 1114 /// CMPSS-style comparisons 1115 /// clang implement it through x86 intrinsics, it is possible with IR alone 1116 /// but leads to less optimal code. 1117 /// PERF: try to implement it with __builtin_ia32_cmpss and immediate 0 to 7. 1118 /// Not that simple. 1119 package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @safe 1120 { 1121 /* 1122 enum ubyte predicateNumber = FPComparisonToX86Predicate[comparison]; 1123 enum bool invertOp = (predicateNumber & 0x80) != 0; 1124 static if(invertOp) 1125 return __builtin_ia32_cmpsd(b, a, predicateNumber & 0x7f); 1126 else 1127 return __builtin_ia32_cmpsd(a, b, predicateNumber & 0x7f); 1128 */ 1129 enum ir = ` 1130 %cmp = fcmp `~ FPComparisonToString[comparison] ~` float %0, %1 1131 %r = sext i1 %cmp to i32 1132 %r2 = bitcast i32 %r to float 1133 ret float %r2`; 1134 1135 float4 r = a; 1136 r[0] = LDCInlineIR!(ir, float, float, float)(a[0], b[0]); 1137 return r; 1138 } 1139 1140 /// CMPSD-style comparisons 1141 /// clang implement it through x86 intrinsics, it is possible with IR alone 1142 /// but leads to less optimal code. 1143 /// PERF: try to implement it with __builtin_ia32_cmpsd and immediate 0 to 7. 1144 /// Not that simple. 1145 package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @safe 1146 { 1147 enum ir = ` 1148 %cmp = fcmp `~ FPComparisonToString[comparison] ~` double %0, %1 1149 %r = sext i1 %cmp to i64 1150 %r2 = bitcast i64 %r to double 1151 ret double %r2`; 1152 1153 double2 r = a; 1154 r[0] = LDCInlineIR!(ir, double, double, double)(a[0], b[0]); 1155 return r; 1156 } 1157 } 1158 else 1159 { 1160 /// Provides packed float comparisons 1161 package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @trusted 1162 { 1163 int4 result; 1164 foreach(i; 0..4) 1165 { 1166 result.ptr[i] = compareFloat!float(comparison, a.array[i], b.array[i]) ? -1 : 0; 1167 } 1168 return result; 1169 } 1170 ///ditto 1171 package int8 cmpps256(FPComparison comparison)(float8 a, float8 b) pure @trusted 1172 { 1173 int8 result; 1174 foreach(i; 0..8) 1175 { 1176 result.ptr[i] = compareFloat!float(comparison, a.array[i], b.array[i]) ? -1 : 0; 1177 } 1178 return result; 1179 } 1180 1181 /// Provides packed double comparisons 1182 package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @trusted 1183 { 1184 long2 result; 1185 foreach(i; 0..2) 1186 { 1187 result.ptr[i] = compareFloat!double(comparison, a.array[i], b.array[i]) ? -1 : 0; 1188 } 1189 return result; 1190 } 1191 ///ditto 1192 package long4 cmppd256(FPComparison comparison)(double4 a, double4 b) pure @trusted 1193 { 1194 long4 result; 1195 foreach(i; 0..4) 1196 { 1197 result.ptr[i] = compareFloat!double(comparison, a.array[i], b.array[i]) ? -1 : 0; 1198 } 1199 return result; 1200 } 1201 1202 /// Provides CMPSS-style comparison 1203 package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @trusted 1204 { 1205 int4 result = cast(int4)a; 1206 result.ptr[0] = compareFloat!float(comparison, a.array[0], b.array[0]) ? -1 : 0; 1207 return cast(float4)result; 1208 } 1209 1210 /// Provides CMPSD-style comparison 1211 package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @trusted 1212 { 1213 long2 result = cast(long2)a; 1214 result.ptr[0] = compareFloat!double(comparison, a.array[0], b.array[0]) ? -1 : 0; 1215 return cast(double2)result; 1216 } 1217 } 1218 unittest // cmpps 1219 { 1220 // Check all comparison type is working 1221 float4 A = [1, 3, 5, float.nan]; 1222 float4 B = [2, 3, 4, 5]; 1223 1224 int4 result_oeq = cmpps!(FPComparison.oeq)(A, B); 1225 int4 result_ogt = cmpps!(FPComparison.ogt)(A, B); 1226 int4 result_oge = cmpps!(FPComparison.oge)(A, B); 1227 int4 result_olt = cmpps!(FPComparison.olt)(A, B); 1228 int4 result_ole = cmpps!(FPComparison.ole)(A, B); 1229 int4 result_one = cmpps!(FPComparison.one)(A, B); 1230 int4 result_ord = cmpps!(FPComparison.ord)(A, B); 1231 int4 result_ueq = cmpps!(FPComparison.ueq)(A, B); 1232 int4 result_ugt = cmpps!(FPComparison.ugt)(A, B); 1233 int4 result_uge = cmpps!(FPComparison.uge)(A, B); 1234 int4 result_ult = cmpps!(FPComparison.ult)(A, B); 1235 int4 result_ule = cmpps!(FPComparison.ule)(A, B); 1236 int4 result_une = cmpps!(FPComparison.une)(A, B); 1237 int4 result_uno = cmpps!(FPComparison.uno)(A, B); 1238 1239 static immutable int[4] correct_oeq = [ 0,-1, 0, 0]; 1240 static immutable int[4] correct_ogt = [ 0, 0,-1, 0]; 1241 static immutable int[4] correct_oge = [ 0,-1,-1, 0]; 1242 static immutable int[4] correct_olt = [-1, 0, 0, 0]; 1243 static immutable int[4] correct_ole = [-1,-1, 0, 0]; 1244 static immutable int[4] correct_one = [-1, 0,-1, 0]; 1245 static immutable int[4] correct_ord = [-1,-1,-1, 0]; 1246 static immutable int[4] correct_ueq = [ 0,-1, 0,-1]; 1247 static immutable int[4] correct_ugt = [ 0, 0,-1,-1]; 1248 static immutable int[4] correct_uge = [ 0,-1,-1,-1]; 1249 static immutable int[4] correct_ult = [-1, 0, 0,-1]; 1250 static immutable int[4] correct_ule = [-1,-1, 0,-1]; 1251 static immutable int[4] correct_une = [-1, 0,-1,-1]; 1252 static immutable int[4] correct_uno = [ 0, 0, 0,-1]; 1253 1254 assert(result_oeq.array == correct_oeq); 1255 assert(result_ogt.array == correct_ogt); 1256 assert(result_oge.array == correct_oge); 1257 assert(result_olt.array == correct_olt); 1258 assert(result_ole.array == correct_ole); 1259 assert(result_one.array == correct_one); 1260 assert(result_ord.array == correct_ord); 1261 assert(result_ueq.array == correct_ueq); 1262 assert(result_ugt.array == correct_ugt); 1263 assert(result_uge.array == correct_uge); 1264 assert(result_ult.array == correct_ult); 1265 assert(result_ule.array == correct_ule); 1266 assert(result_une.array == correct_une); 1267 assert(result_uno.array == correct_uno); 1268 } 1269 unittest 1270 { 1271 double2 a = [1, 3]; 1272 double2 b = [2, 3]; 1273 long2 c = cmppd!(FPComparison.ult)(a, b); 1274 static immutable long[2] correct = [cast(long)(-1), 0]; 1275 assert(c.array == correct); 1276 } 1277 unittest // cmpss 1278 { 1279 void testComparison(FPComparison comparison)(float4 A, float4 B) 1280 { 1281 float4 result = cmpss!comparison(A, B); 1282 int4 iresult = cast(int4)result; 1283 int expected = compareFloat!float(comparison, A.array[0], B.array[0]) ? -1 : 0; 1284 assert(iresult.array[0] == expected); 1285 assert(result.array[1] == A.array[1]); 1286 assert(result.array[2] == A.array[2]); 1287 assert(result.array[3] == A.array[3]); 1288 } 1289 1290 // Check all comparison type is working 1291 float4 A = [1, 3, 5, 6]; 1292 float4 B = [2, 3, 4, 5]; 1293 float4 C = [float.nan, 3, 4, 5]; 1294 1295 testComparison!(FPComparison.oeq)(A, B); 1296 testComparison!(FPComparison.oeq)(A, C); 1297 testComparison!(FPComparison.ogt)(A, B); 1298 testComparison!(FPComparison.ogt)(A, C); 1299 testComparison!(FPComparison.oge)(A, B); 1300 testComparison!(FPComparison.oge)(A, C); 1301 testComparison!(FPComparison.olt)(A, B); 1302 testComparison!(FPComparison.olt)(A, C); 1303 testComparison!(FPComparison.ole)(A, B); 1304 testComparison!(FPComparison.ole)(A, C); 1305 testComparison!(FPComparison.one)(A, B); 1306 testComparison!(FPComparison.one)(A, C); 1307 testComparison!(FPComparison.ord)(A, B); 1308 testComparison!(FPComparison.ord)(A, C); 1309 testComparison!(FPComparison.ueq)(A, B); 1310 testComparison!(FPComparison.ueq)(A, C); 1311 testComparison!(FPComparison.ugt)(A, B); 1312 testComparison!(FPComparison.ugt)(A, C); 1313 testComparison!(FPComparison.uge)(A, B); 1314 testComparison!(FPComparison.uge)(A, C); 1315 testComparison!(FPComparison.ult)(A, B); 1316 testComparison!(FPComparison.ult)(A, C); 1317 testComparison!(FPComparison.ule)(A, B); 1318 testComparison!(FPComparison.ule)(A, C); 1319 testComparison!(FPComparison.une)(A, B); 1320 testComparison!(FPComparison.une)(A, C); 1321 testComparison!(FPComparison.uno)(A, B); 1322 testComparison!(FPComparison.uno)(A, C); 1323 } 1324 unittest // cmpsd 1325 { 1326 void testComparison(FPComparison comparison)(double2 A, double2 B) 1327 { 1328 double2 result = cmpsd!comparison(A, B); 1329 long2 iresult = cast(long2)result; 1330 long expected = compareFloat!double(comparison, A.array[0], B.array[0]) ? -1 : 0; 1331 assert(iresult.array[0] == expected); 1332 assert(result.array[1] == A.array[1]); 1333 } 1334 1335 // Check all comparison type is working 1336 double2 A = [1, 3]; 1337 double2 B = [2, 4]; 1338 double2 C = [double.nan, 5]; 1339 1340 testComparison!(FPComparison.oeq)(A, B); 1341 testComparison!(FPComparison.oeq)(A, C); 1342 testComparison!(FPComparison.ogt)(A, B); 1343 testComparison!(FPComparison.ogt)(A, C); 1344 testComparison!(FPComparison.oge)(A, B); 1345 testComparison!(FPComparison.oge)(A, C); 1346 testComparison!(FPComparison.olt)(A, B); 1347 testComparison!(FPComparison.olt)(A, C); 1348 testComparison!(FPComparison.ole)(A, B); 1349 testComparison!(FPComparison.ole)(A, C); 1350 testComparison!(FPComparison.one)(A, B); 1351 testComparison!(FPComparison.one)(A, C); 1352 testComparison!(FPComparison.ord)(A, B); 1353 testComparison!(FPComparison.ord)(A, C); 1354 testComparison!(FPComparison.ueq)(A, B); 1355 testComparison!(FPComparison.ueq)(A, C); 1356 testComparison!(FPComparison.ugt)(A, B); 1357 testComparison!(FPComparison.ugt)(A, C); 1358 testComparison!(FPComparison.uge)(A, B); 1359 testComparison!(FPComparison.uge)(A, C); 1360 testComparison!(FPComparison.ult)(A, B); 1361 testComparison!(FPComparison.ult)(A, C); 1362 testComparison!(FPComparison.ule)(A, B); 1363 testComparison!(FPComparison.ule)(A, C); 1364 testComparison!(FPComparison.une)(A, B); 1365 testComparison!(FPComparison.une)(A, C); 1366 testComparison!(FPComparison.uno)(A, B); 1367 testComparison!(FPComparison.uno)(A, C); 1368 } 1369 1370 // 1371 // </FLOATING-POINT COMPARISONS> 1372 // 1373 1374 1375 __m64 to_m64(__m128i a) pure @trusted 1376 { 1377 long2 la = cast(long2)a; 1378 long1 r = la.array[0]; 1379 return r; 1380 } 1381 1382 __m128i to_m128i(__m64 a) pure @trusted 1383 { 1384 /* Not sufficient to avoid https://issues.dlang.org/show_bug.cgi?id=21474 1385 1386 version(DigitalMars) // Workaround for https://issues.dlang.org/show_bug.cgi?id=21474 1387 { 1388 long2 r = a.array[0]; 1389 r.ptr[1] = 0; 1390 return cast(int4)r; 1391 } 1392 else */ 1393 { 1394 long2 r = [0, 0]; 1395 r.ptr[0] = a.array[0]; 1396 return cast(__m128i)r; 1397 } 1398 } 1399 1400 1401 // ADDITIONAL LLVM INTRINSICS 1402 // Basically LDC didn't add them yet 1403 version(LDC) 1404 { 1405 static if (__VERSION__ >= 2097) // LDC 1.27+ 1406 { 1407 pragma(LDC_intrinsic, "llvm.abs.i#") 1408 T inteli_llvm_abs(T)(T val, bool attrib); 1409 } 1410 1411 static if (__VERSION__ >= 2092) // LDC 1.22+ 1412 { 1413 pragma(LDC_intrinsic, "llvm.sadd.sat.i#") 1414 T inteli_llvm_adds(T)(T a, T b) pure @safe; 1415 pragma(LDC_intrinsic, "llvm.ssub.sat.i#") 1416 T inteli_llvm_subs(T)(T a, T b) pure @safe; 1417 pragma(LDC_intrinsic, "llvm.uadd.sat.i#") 1418 T inteli_llvm_addus(T)(T a, T b) pure @safe; 1419 pragma(LDC_intrinsic, "llvm.usub.sat.i#") 1420 T inteli_llvm_subus(T)(T a, T b) pure @safe; 1421 1422 enum LDC_with_saturated_intrinsics = true; 1423 } 1424 else 1425 enum LDC_with_saturated_intrinsics = false; 1426 } 1427 else 1428 enum LDC_with_saturated_intrinsics = false; 1429 1430 // ADDITIONAL x86 INTRINSICS 1431 // Absent from ldc.gccbuiltins_x86 for some reason, but needed. 1432 // https://github.com/ldc-developers/llvm-project/blob/ldc-release/12.x/llvm/include/llvm/IR/IntrinsicsX86.td 1433 static if (LDC_with_SSE41) 1434 { 1435 pragma(LDC_intrinsic, "llvm.x86.sse41.pblendvb") 1436 byte16 __builtin_ia32_pblendvb(byte16, byte16, byte16) pure @safe; 1437 } 1438 1439 // SOME NEON INTRINSICS 1440 // Emulating some x86 intrinsics needs access to a range of ARM intrinsics. 1441 // Not in the public API but the simde project expose it all for the user to use. 1442 // MAYDO: create a new neon.d module, for internal use only. 1443 // MAYDO: port them to ARM32 so that ARM32 can be as fast as ARM64. 1444 static if (LDC_with_ARM64) 1445 { 1446 // VERY USEFUL LINK 1447 // https://github.com/ldc-developers/llvm-project/blob/ldc-release/11.x/llvm/include/llvm/IR/IntrinsicsAArch64.td 1448 // Also: https://developer.arm.com/architectures/instruction-sets/intrinsics/ 1449 1450 // Note: it is helpful to verify, in case of complex sequence of intrinsics, that the result is actually false. 1451 // Some intrinsics have trouble when inlined inside another, such as vmovl_low_s32. In this case, it's better to use builtins 1452 // from backend to have an inlining that still match the instruction. 1453 1454 pragma(LDC_intrinsic, "llvm.aarch64.crc32cb") 1455 uint __crc32cb(uint a, uint b) pure @safe; 1456 1457 pragma(LDC_intrinsic, "llvm.aarch64.crc32ch") 1458 uint __crc32ch(uint a, uint b) pure @safe; 1459 1460 pragma(LDC_intrinsic, "llvm.aarch64.crc32cw") 1461 uint __crc32cw(uint a, uint b) pure @safe; 1462 1463 pragma(LDC_intrinsic, "llvm.aarch64.crc32cx") 1464 uint __crc32cd(uint a, ulong b) pure @safe; 1465 1466 //pragma(LDC_intrinsic, "llvm.aarch64.dmb") 1467 // uint __dmb(int a) @safe; // didn't found a name in intrinsic list 1468 1469 pragma(LDC_intrinsic, "llvm.aarch64.neon.uabd.v16i8") 1470 byte16 vabdq_u8(byte16 a, byte16 b) pure @safe; 1471 1472 pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v8i16") 1473 short8 vabsq_s16(short8 a) pure @safe; 1474 1475 pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v4i32") 1476 int4 vabsq_s32(int4 a) pure @safe; 1477 1478 pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v16i8") 1479 byte16 vabsq_s8(byte16 a) pure @safe; 1480 1481 byte8 vand_u8(byte8 a, byte8 b) pure @safe 1482 { 1483 return a & b; 1484 } 1485 1486 long2 vandq_s64(long2 a, long2 b) 1487 { 1488 return a & b; 1489 } 1490 1491 long2 vbicq_s64(long2 a, long2 b) pure @safe 1492 { 1493 return a & ~b; 1494 } 1495 1496 int4 vbslq_s32(int4 a, int4 b, int4 c) pure @safe 1497 { 1498 return c ^ ((c ^ b) & a); 1499 } 1500 1501 byte16 vbslq_s8(byte16 a, byte16 b, byte16 c) pure @safe 1502 { 1503 return c ^ ((c ^ b) & a); 1504 } 1505 1506 long2 vbslq_s64(long2 a, long2 b, long2 c) pure @safe 1507 { 1508 return c ^ ((c ^ b) & a); 1509 } 1510 1511 short8 vcombine_s16(short4 lo, short4 hi) pure @trusted 1512 { 1513 short8 r; 1514 r.ptr[0] = lo.array[0]; 1515 r.ptr[1] = lo.array[1]; 1516 r.ptr[2] = lo.array[2]; 1517 r.ptr[3] = lo.array[3]; 1518 r.ptr[4] = hi.array[0]; 1519 r.ptr[5] = hi.array[1]; 1520 r.ptr[6] = hi.array[2]; 1521 r.ptr[7] = hi.array[3]; 1522 return r; 1523 } 1524 1525 int4 vcombine_s32(int2 lo, int2 hi) pure @trusted 1526 { 1527 int4 r; 1528 r.ptr[0] = lo.array[0]; 1529 r.ptr[1] = lo.array[1]; 1530 r.ptr[2] = hi.array[0]; 1531 r.ptr[3] = hi.array[1]; 1532 return r; 1533 } 1534 1535 byte16 vcombine_s8(byte8 lo, byte8 hi) pure @trusted 1536 { 1537 byte16 r; 1538 r.ptr[0] = lo.array[0]; 1539 r.ptr[1] = lo.array[1]; 1540 r.ptr[2] = lo.array[2]; 1541 r.ptr[3] = lo.array[3]; 1542 r.ptr[4] = lo.array[4]; 1543 r.ptr[5] = lo.array[5]; 1544 r.ptr[6] = lo.array[6]; 1545 r.ptr[7] = lo.array[7]; 1546 r.ptr[8] = hi.array[0]; 1547 r.ptr[9] = hi.array[1]; 1548 r.ptr[10] = hi.array[2]; 1549 r.ptr[11] = hi.array[3]; 1550 r.ptr[12] = hi.array[4]; 1551 r.ptr[13] = hi.array[5]; 1552 r.ptr[14] = hi.array[6]; 1553 r.ptr[15] = hi.array[7]; 1554 return r; 1555 } 1556 1557 short8 vcombine_u16(short4 lo, short4 hi) pure @trusted 1558 { 1559 short8 r; 1560 r.ptr[0] = lo.array[0]; 1561 r.ptr[1] = lo.array[1]; 1562 r.ptr[2] = lo.array[2]; 1563 r.ptr[3] = lo.array[3]; 1564 r.ptr[4] = hi.array[0]; 1565 r.ptr[5] = hi.array[1]; 1566 r.ptr[6] = hi.array[2]; 1567 r.ptr[7] = hi.array[3]; 1568 return r; 1569 } 1570 1571 1572 // float4 => int4 1573 1574 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.v4i32.v4f32") 1575 int4 vcvtmq_s32_f32(float4 a) pure @safe; 1576 1577 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.v4i32.v4f32") 1578 int4 vcvtnq_s32_f32(float4 a) pure @safe; 1579 1580 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.v4i32.v4f32") 1581 int4 vcvtpq_s32_f32(float4 a) pure @safe; 1582 1583 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.v4i32.v4f32") 1584 int4 vcvtzq_s32_f32(float4 a) pure @safe; 1585 1586 1587 // double2 => long2 1588 1589 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.v2i64.v2f64") 1590 long2 vcvtmq_s64_f64(double2 a) pure @safe; 1591 1592 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.v2i64.v2f64") 1593 long2 vcvtnq_s64_f64(double2 a) pure @safe; 1594 1595 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.v2i64.v2f64") 1596 long2 vcvtpq_s64_f64(double2 a) pure @safe; 1597 1598 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.v2i64.v2f64") 1599 long2 vcvtzq_s64_f64(double2 a) pure @safe; 1600 1601 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i32.f32") 1602 int vcvtms_s32_f32(float a) pure @safe; 1603 1604 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i32.f32") 1605 int vcvtns_s32_f32(float a) pure @safe; 1606 1607 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i32.f32") 1608 int vcvtps_s32_f32(float a) pure @safe; 1609 1610 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i32.f32") 1611 int vcvts_s32_f32(float a) pure @safe; 1612 1613 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i32.f64") 1614 int vcvtms_s32_f64(double a) pure @safe; 1615 1616 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i32.f64") 1617 int vcvtns_s32_f64(double a) pure @safe; 1618 1619 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i32.f64") 1620 int vcvtps_s32_f64(double a) pure @safe; 1621 1622 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i32.f64") 1623 int vcvts_s32_f64(double a) pure @safe; 1624 1625 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i64.f32") 1626 long vcvtms_s64_f32(float a) pure @safe; 1627 1628 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i64.f32") 1629 long vcvtns_s64_f32(float a) pure @safe; 1630 1631 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i64.f32") 1632 long vcvtps_s64_f32(float a) pure @safe; 1633 1634 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i64.f32") 1635 long vcvts_s64_f32(float a) pure @safe; 1636 1637 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i64.f64") 1638 long vcvtms_s64_f64(double a) pure @safe; 1639 1640 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i64.f64") 1641 long vcvtns_s64_f64(double a) pure @safe; 1642 1643 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i64.f64") 1644 long vcvtps_s64_f64(double a) pure @safe; // Note: technically should be named vcvtpd_s64_f64 1645 1646 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i64.f64") 1647 long vcvts_s64_f64(double a) pure @safe; 1648 1649 long2 vdupq_n_s64(long value) pure @safe 1650 { 1651 long2 r; 1652 r = value; 1653 return r; 1654 } 1655 1656 short4 vget_high_s16(short8 a) pure @trusted 1657 { 1658 short4 r; 1659 r.ptr[0] = a.array[4]; 1660 r.ptr[1] = a.array[5]; 1661 r.ptr[2] = a.array[6]; 1662 r.ptr[3] = a.array[7]; 1663 return r; 1664 } 1665 1666 int2 vget_high_s32(int4 a) pure @trusted 1667 { 1668 int2 r; 1669 r.ptr[0] = a.array[2]; 1670 r.ptr[1] = a.array[3]; 1671 return r; 1672 } 1673 1674 byte8 vget_high_u8(byte16 a) pure @trusted 1675 { 1676 byte8 r; 1677 r.ptr[0] = a.array[8]; 1678 r.ptr[1] = a.array[9]; 1679 r.ptr[2] = a.array[10]; 1680 r.ptr[3] = a.array[11]; 1681 r.ptr[4] = a.array[12]; 1682 r.ptr[5] = a.array[13]; 1683 r.ptr[6] = a.array[14]; 1684 r.ptr[7] = a.array[15]; 1685 return r; 1686 } 1687 1688 short4 vget_low_s16(short8 a) pure @trusted 1689 { 1690 short4 r; 1691 r.ptr[0] = a.array[0]; 1692 r.ptr[1] = a.array[1]; 1693 r.ptr[2] = a.array[2]; 1694 r.ptr[3] = a.array[3]; 1695 return r; 1696 } 1697 1698 int2 vget_low_s32(int4 a) pure @trusted 1699 { 1700 int2 r; 1701 r.ptr[0] = a.array[0]; 1702 r.ptr[1] = a.array[1]; 1703 return r; 1704 } 1705 1706 byte8 vget_low_u8(byte16 a) pure @trusted 1707 { 1708 byte8 r; 1709 r.ptr[0] = a.array[0]; 1710 r.ptr[1] = a.array[1]; 1711 r.ptr[2] = a.array[2]; 1712 r.ptr[3] = a.array[3]; 1713 r.ptr[4] = a.array[4]; 1714 r.ptr[5] = a.array[5]; 1715 r.ptr[6] = a.array[6]; 1716 r.ptr[7] = a.array[7]; 1717 return r; 1718 } 1719 1720 long vgetq_lane_s64(long2 v, const int lane) pure @safe 1721 { 1722 return v.array[lane]; 1723 } 1724 1725 pragma(LDC_intrinsic, "llvm.aarch64.neon.smax.v8i16") 1726 short8 vmaxq_s16(short8 a, short8 b) pure @safe; 1727 1728 int4 vmaxq_s32(int4 a, int4 b) pure @safe 1729 { 1730 int4 r; 1731 r[0] = a[0] >= b[0] ? a[0] : b[0]; 1732 r[1] = a[1] >= b[1] ? a[1] : b[1]; 1733 r[2] = a[2] >= b[2] ? a[2] : b[2]; 1734 r[3] = a[3] >= b[3] ? a[3] : b[3]; 1735 return r; 1736 } 1737 1738 pragma(LDC_intrinsic, "llvm.aarch64.neon.smin.v8i16") 1739 short8 vminq_s16(short8 a, short8 b) pure @safe; 1740 1741 int4 vmovl_u16(short4 a) pure @trusted 1742 { 1743 int4 r; 1744 r.ptr[0] = cast(ushort)a.array[0]; 1745 r.ptr[1] = cast(ushort)a.array[1]; 1746 r.ptr[2] = cast(ushort)a.array[2]; 1747 r.ptr[3] = cast(ushort)a.array[3]; 1748 return r; 1749 } 1750 1751 int2 vmovn_s64(long2 a) pure @trusted 1752 { 1753 int2 r; 1754 r.ptr[0] = cast(int)(a.array[0]); 1755 r.ptr[1] = cast(int)(a.array[1]); 1756 return r; 1757 } 1758 1759 int4 vmull_s16(short4 a, short4 b) pure @trusted 1760 { 1761 int4 r; 1762 r.ptr[0] = a.array[0] * b.array[0]; 1763 r.ptr[1] = a.array[1] * b.array[1]; 1764 r.ptr[2] = a.array[2] * b.array[2]; 1765 r.ptr[3] = a.array[3] * b.array[3]; 1766 return r; 1767 } 1768 1769 pragma(LDC_intrinsic, "llvm.aarch64.neon.smull.v2i64") 1770 long2 vmull_s32(int2 a, int2 b) pure @safe; 1771 1772 pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4i16") 1773 short4 vpadd_s16(short4 a, short4 b) pure @safe; 1774 1775 pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v2i32") 1776 int2 vpadd_s32(int2 a, int2 b) pure @safe; 1777 1778 pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v8i8") 1779 byte8 vpadd_u8(byte8 a, byte8 b) pure @safe; 1780 1781 pragma(LDC_intrinsic, "llvm.aarch64.neon.uaddlp.v8i16.v16i8") 1782 short8 vpaddlq_u8 (byte16 a) pure @safe; 1783 1784 static if(__VERSION__ >= 2088) // LDC 1.18 start using LLVM9 who changes the name of the builtin 1785 { 1786 pragma(LDC_intrinsic, "llvm.aarch64.neon.faddp.v4f32") 1787 float4 vpaddq_f32(float4 a, float4 b) pure @safe; 1788 } 1789 else 1790 { 1791 pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4f32") 1792 float4 vpaddq_f32(float4 a, float4 b) pure @safe; 1793 } 1794 1795 pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v8i16") 1796 short8 vpaddq_s16(short8 a, short8 b) pure @safe; 1797 1798 pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v16i8") 1799 byte16 vpaddq_s8(byte16 a, byte16 b) pure @safe; 1800 1801 pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4i32") 1802 int4 vpaddq_s32(int4 a, int4 b) pure @safe; 1803 1804 pragma(LDC_intrinsic, "llvm.aarch64.neon.sqadd.v4i16") 1805 short4 vqadd_s16(short4 a, short4 b) pure @safe; 1806 1807 pragma(LDC_intrinsic, "llvm.aarch64.neon.sqadd.v8i16") 1808 short8 vqaddq_s16(short8 a, short8 b) pure @safe; 1809 1810 pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtn.v8i8") 1811 byte8 vqmovn_s16(short8 a) pure @safe; 1812 1813 pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtn.v4i16") 1814 short4 vqmovn_s32(int4 a) pure @safe; 1815 1816 pragma(LDC_intrinsic, "llvm.aarch64.neon.uqxtn.v4i16") 1817 short4 vqmovn_u32(int4 a) pure @safe; 1818 1819 pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtun.v8i8") 1820 byte8 vqmovun_s16(short8 a) pure @safe; 1821 1822 pragma(LDC_intrinsic, "llvm.aarch64.neon.sqsub.v4i16") 1823 short4 vqsub_s16(short4 a, short4 b) pure @safe; 1824 1825 pragma(LDC_intrinsic, "llvm.aarch64.neon.sqsub.v8i16") 1826 short8 vqsubq_s16(short8 a, short8 b) pure @safe; 1827 1828 pragma(LDC_intrinsic, "llvm.aarch64.neon.tbl1.v16i8") 1829 byte16 vqtbl1q_s8(byte16 t, byte16 idx) pure @safe; 1830 1831 pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v16i8") 1832 byte16 vrhadd_u8(byte16 a, byte16 b) pure @safe; 1833 1834 pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v8i16") 1835 short8 vrhadd_u16(short8 a, short8 b) pure @safe; 1836 1837 pragma(LDC_intrinsic, "llvm.aarch64.neon.rshrn.v4i16") 1838 short4 vrshrn_n_s32(int4 a, int n) pure @safe; 1839 1840 byte8 vshr_u8(byte8 a, byte8 b) pure @safe 1841 { 1842 return a >>> b; 1843 } 1844 1845 byte16 vshrq_n_s8(byte16 a, byte r) pure @safe 1846 { 1847 a = a >> byte16(cast(byte)r); 1848 return a; 1849 } 1850 1851 pragma(LDC_intrinsic, "llvm.aarch64.neon.tbl1.v8i8") 1852 byte8 vtbl1_s8(byte16 t, byte8 idx) pure @safe; 1853 } 1854 1855 version(unittest) 1856 { 1857 double abs_double(double x) @trusted 1858 { 1859 version(LDC) 1860 return llvm_fabs(x); 1861 else 1862 { 1863 long uf = *cast(long*)(&x); 1864 uf &= 0x7fffffff_ffffffff; 1865 return *cast(double*)(&uf); 1866 } 1867 } 1868 } 1869 1870 // needed because in old GDC from travis, core.stdc.math.isnan isn't pure 1871 1872 bool isnan(float x) pure @trusted 1873 { 1874 uint u = *cast(uint*)(&x); 1875 bool result = ((u & 0x7F800000) == 0x7F800000) && (u & 0x007FFFFF); 1876 return result; 1877 } 1878 unittest 1879 { 1880 float x = float.nan; 1881 assert(isnan(x)); 1882 1883 x = 0; 1884 assert(!isnan(x)); 1885 1886 x = float.infinity; 1887 assert(!isnan(x)); 1888 } 1889 1890 bool isnan(double x) pure @trusted 1891 { 1892 ulong u = *cast(ulong*)(&x); 1893 return ((u & 0x7FF00000_00000000) == 0x7FF00000_00000000) && (u & 0x000FFFFF_FFFFFFFF); 1894 } 1895 unittest 1896 { 1897 double x = double.nan; 1898 assert(isnan(x)); 1899 1900 x = 0; 1901 assert(!isnan(x)); 1902 1903 x = double.infinity; 1904 assert(!isnan(x)); 1905 }