1 /**
2 * Internal stuff only, do not import.
3 *
4 * Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
5 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
6 */
7 module inteli.internals;
8 
9 import inteli.types;
10 
11 // The only math function needed for intel-intrinsics
12 public import core.math: sqrt; // since it's an intrinsic
13 
14 package:
15 nothrow:
16 @nogc:
17 
18 
19 version(GNU)
20 {
21     version (X86)
22     {
23         // For 32-bit x86, disable vector extensions with GDC.
24         // They just don't work well there.
25         enum GDC_with_x86 = true;
26         enum GDC_with_MMX = false;
27         enum GDC_with_SSE = false;
28         enum GDC_with_SSE2 = false;
29         enum GDC_with_SSE3 = false;
30         enum GDC_with_SSSE3 = false;
31         enum GDC_with_SSE41 = false;
32         enum GDC_with_SSE42 = false;
33         enum GDC_with_AVX = false;
34         enum GDC_with_AVX2 = false;
35         enum GDC_with_SHA = false;
36         enum GDC_with_BMI2 = false;
37     }
38     else version (X86_64)
39     {
40         // GDC support uses extended inline assembly:
41         //   https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html        (general information and hints)
42         //   https://gcc.gnu.org/onlinedocs/gcc/Simple-Constraints.html  (binding variables to registers)
43         //   https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html (x86 specific register short names)
44 
45         public import core.simd: byte16, short8, int4, float4, double2;
46 
47         // NOTE: These intrinsics are not available in every i386 and x86_64 CPU.
48         // For more info: https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/X86-Built-in-Functions.html 
49         public import gcc.builtins;
50 
51         // TODO: SSE and SSE2 should eventually become truly optional,
52         // if we want to support other archs with GDC.
53 
54         enum GDC_with_x86 = true;
55         enum GDC_with_MMX = true; // We don't have a way to detect that at CT, but we assume it's there
56         enum GDC_with_SSE = true; // We don't have a way to detect that at CT, but we assume it's there
57         enum GDC_with_SSE2 = true; // We don't have a way to detect that at CT, but we assume it's there
58 
59         static if (__VERSION__ >= 2100) // Starting at GDC 12.1
60         {
61             enum GDC_with_SSE3 = __traits(compiles, __builtin_ia32_haddps);
62             enum GDC_with_SSSE3 = __traits(compiles, __builtin_ia32_pmulhrsw128);
63             enum GDC_with_SSE41 = __traits(compiles, __builtin_ia32_dpps);
64             enum GDC_with_SSE42 = __traits(compiles, __builtin_ia32_pcmpgtq);
65             enum GDC_with_AVX = __traits(compiles, __builtin_ia32_vbroadcastf128_pd256);
66             enum GDC_with_AVX2 = __traits(compiles, __builtin_ia32_gathersiv2df);
67             enum GDC_with_BMI2 = __traits(compiles, __builtin_ia32_pext_si);
68 
69         }
70         else
71         {
72             // Before GCC 11.3, there was no reliable way to detect instruction sets.
73             // We only enable the detection above starting at GCC 12 (DMDFE 2.100),
74             // which is more conservative.
75             enum GDC_with_SSE3 = false;
76             enum GDC_with_SSSE3 = false;
77             enum GDC_with_SSE41 = false;
78             enum GDC_with_SSE42 = false;
79             enum GDC_with_AVX = false;
80             enum GDC_with_AVX2 = false;
81             enum GDC_with_BMI2 = false;
82         }
83 
84         enum GDC_with_SHA = false; // TODO: detect that
85     }
86     else
87     {
88         enum GDC_with_x86 = false;
89         enum GDC_with_MMX = false;
90         enum GDC_with_SSE = false;
91         enum GDC_with_SSE2 = false;
92         enum GDC_with_SSE3 = false;
93         enum GDC_with_SSSE3 = false;
94         enum GDC_with_SSE41 = false;
95         enum GDC_with_SSE42 = false;
96         enum GDC_with_AVX = false;
97         enum GDC_with_AVX2 = false;
98         enum GDC_with_SHA = false;
99         enum GDC_with_BMI2 = false;
100     }
101 }
102 else
103 {
104     enum GDC_with_x86 = false;
105     enum GDC_with_MMX = false;
106     enum GDC_with_SSE = false;
107     enum GDC_with_SSE2 = false;
108     enum GDC_with_SSE3 = false;
109     enum GDC_with_SSSE3 = false;
110     enum GDC_with_SSE41 = false;
111     enum GDC_with_SSE42 = false;
112     enum GDC_with_AVX = false;
113     enum GDC_with_AVX2 = false;
114     enum GDC_with_SHA = false;
115     enum GDC_with_BMI2 = false;
116 }
117 
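// Editor's note (illustrative addition, not in the upstream module): the flags
// above are consumed with `static if` by the intrinsic modules. During
// development, a quick way to see what was detected is a temporary
// pragma(msg), e.g. behind a hypothetical debug version identifier:
//
//     version(PrintIntrinsicsDetection)
//     {
//         pragma(msg, "GDC_with_SSE2 = ", GDC_with_SSE2);
//         pragma(msg, "GDC_with_AVX2 = ", GDC_with_AVX2);
//     }
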
118 version(LDC)
119 {
120     public import core.simd;
121     public import ldc.simd;
122     public import ldc.intrinsics;
123     public import ldc.llvmasm: __asm;
124 
125     version (X86)
126         private enum bool some_x86 = true;
127     else version (X86_64)
128         private enum bool some_x86 = true;
129     else
130         private enum bool some_x86 = false;
131 
132     // Since LDC 1.13, we use the new ldc.llvmasm.__ir variants instead of inlineIR.
133     static if (__VERSION__ >= 2083)
134     {
135         import ldc.llvmasm;
136         alias LDCInlineIR = __ir_pure;
137 
138         // A version of inline IR with prefix/suffix didn't exist before LDC 1.13
139         alias LDCInlineIREx = __irEx_pure; 
140 
141         enum bool LDC_with_InlineIREx = true;
142     }
143     else
144     {
145         alias LDCInlineIR = inlineIR;
146         enum bool LDC_with_InlineIREx = false;
147     }
148 
149     // This is used to disable LDC features that are expensive at compile time:
150     // everything that relies on inline LLVM IR.
151     version(D_Optimized)
152     {
153         enum bool LDC_with_optimizations = true;
154     }
155     else
156     {
157         static if (__VERSION__ < 2101)
158         {
159             // See Issue #136, D_Optimized only appeared in DMDFE 2.101.
160             // Relying on this had terrible consequences.
161             enum bool LDC_with_optimizations = true;
162         }
163         else
164             enum bool LDC_with_optimizations = false;
165     }
166 
167     version(ARM)
168     {
169         public import ldc.gccbuiltins_arm;
170         enum LDC_with_ARM32 = true;
171         enum LDC_with_ARM64 = false;
172         enum LDC_with_ARM64_CRC = false;
173         enum LDC_with_SSE = false;
174         enum LDC_with_SSE2 = false;
175         enum LDC_with_SSE3 = false;
176         enum LDC_with_SSSE3 = false;
177         enum LDC_with_SSE41 = false;
178         enum LDC_with_SSE42 = false;
179         enum LDC_with_CRC32 = false;
180         enum LDC_with_AVX = false;
181         enum LDC_with_AVX2 = false;
182         enum LDC_with_SHA = false;
183         enum LDC_with_BMI2 = false;
184     }
185     else version(AArch64)
186     {
187         public import ldc.gccbuiltins_aarch64;
188         enum LDC_with_ARM32 = false;
189         enum LDC_with_ARM64 = true; // implies "has Neon"
190         enum LDC_with_ARM64_CRC = __traits(targetHasFeature, "crc");
191         enum LDC_with_SSE = false;
192         enum LDC_with_SSE2 = false;
193         enum LDC_with_SSE3 = false;
194         enum LDC_with_SSSE3 = false;
195         enum LDC_with_SSE41 = false;
196         enum LDC_with_SSE42 = false;
197         enum LDC_with_CRC32 = false;
198         enum LDC_with_AVX = false;
199         enum LDC_with_AVX2 = false;
200         enum LDC_with_SHA = false;
201         enum LDC_with_BMI2 = false;
202     }
203     else static if (some_x86)
204     {
205         public import ldc.gccbuiltins_x86;
206 
207         // Workaround LDC 1.32.0 having NO builtins at all.
208         // See LDC issue 4347 https://github.com/ldc-developers/ldc/issues/4347
209         enum LDC_has_some_x86_builtins = __traits(compiles, __builtin_ia32_clflush); // This one must be available in all of LDC history.
210 
211         static if (!LDC_has_some_x86_builtins)
212         {
213             // in case our __builtin_ia32_clflush workaround breaks
214             pragma(msg, "Warning: LDC v1.32.0 has no SIMD builtins. intel-intrinsics will use slow path. Please avoid LDC 1.32.0");
215         }
216 
217         enum LDC_with_ARM32 = false;
218         enum LDC_with_ARM64 = false;
219         enum LDC_with_ARM64_CRC = false;
220         enum LDC_with_SSE = __traits(targetHasFeature, "sse") && LDC_has_some_x86_builtins;
221         enum LDC_with_SSE2 = __traits(targetHasFeature, "sse2") && LDC_has_some_x86_builtins;
222         enum LDC_with_SSE3 = __traits(targetHasFeature, "sse3") && LDC_has_some_x86_builtins;
223         enum LDC_with_SSSE3 = __traits(targetHasFeature, "ssse3") && LDC_has_some_x86_builtins;
224         enum LDC_with_SSE41 = __traits(targetHasFeature, "sse4.1") && LDC_has_some_x86_builtins;
225         enum LDC_with_SSE42 = __traits(targetHasFeature, "sse4.2") && LDC_has_some_x86_builtins;
226 
227         // Since LDC 1.30, crc32 is a separate (and sufficient) attribute from sse4.2
228         // As of Jan 2023, GDC doesn't make that distinction, -msse4.2 includes -mcrc32 for GDC.
229         static if (__VERSION__ >= 2100)
230         {
231             enum LDC_with_CRC32 = __traits(targetHasFeature, "crc32") && LDC_has_some_x86_builtins;
232         }
233         else
234         {
235             enum LDC_with_CRC32 = __traits(targetHasFeature, "sse4.2") && LDC_has_some_x86_builtins; // crc32 used to be included in sse4.2
236         }
237 
238         enum LDC_with_AVX = __traits(targetHasFeature, "avx") && LDC_has_some_x86_builtins;
239         enum LDC_with_AVX2 = __traits(targetHasFeature, "avx2") && LDC_has_some_x86_builtins;
240         enum LDC_with_SHA = __traits(targetHasFeature, "sha") && LDC_has_some_x86_builtins;
241         enum LDC_with_BMI2 = __traits(targetHasFeature, "bmi2") && LDC_has_some_x86_builtins;
242     }
243     else
244     {
245         enum LDC_with_ARM32 = false;
246         enum LDC_with_ARM64 = false;
247         enum LDC_with_ARM64_CRC = false;
248         enum LDC_with_SSE = false;
249         enum LDC_with_SSE2 = false;
250         enum LDC_with_SSE3 = false;
251         enum LDC_with_SSSE3 = false;
252         enum LDC_with_SSE41 = false;
253         enum LDC_with_SSE42 = false;
254         enum LDC_with_CRC32 = false;
255         enum LDC_with_AVX = false;
256         enum LDC_with_AVX2 = false;
257         enum LDC_with_SHA = false;
258         enum LDC_with_BMI2 = false;
259     }
260 
261     // Should we use inline x86 assembly with DMD syntax, in LDC?
262     version(D_InlineAsm_X86)
263     {
264         enum LDC_with_32b_x86_asm = LDC_with_SSE2; // if no SSE support, disable the x86 asm code path
265         enum LDC_with_64b_x86_asm = false;
266     }
267     else version(D_InlineAsm_X86_64)
268     {
269         enum LDC_with_32b_x86_asm = false;
270         enum LDC_with_64b_x86_asm = LDC_with_SSE2;
271     }
272     else
273     {
274         enum LDC_with_32b_x86_asm = false;
275         enum LDC_with_64b_x86_asm = false;
276     }
277 }
278 else
279 {
280     enum LDC_with_ARM32 = false;
281     enum LDC_with_ARM64 = false;
282     enum LDC_with_ARM64_CRC = false;
283     enum LDC_with_SSE = false;
284     enum LDC_with_SSE2 = false;
285     enum LDC_with_SSE3 = false;
286     enum LDC_with_SSSE3 = false;
287     enum LDC_with_SSE41 = false;
288     enum LDC_with_SSE42 = false;
289     enum LDC_with_CRC32 = false;
290     enum LDC_with_AVX = false;
291     enum LDC_with_AVX2 = false;
292     enum LDC_with_SHA = false;
293     enum LDC_with_BMI2 = false;
294     enum LDC_with_InlineIREx = false;
295     enum bool LDC_with_optimizations = false;
296     enum bool LDC_with_32b_x86_asm = false;
297     enum bool LDC_with_64b_x86_asm = false;
298 }
299 enum LDC_with_x86_asm = LDC_with_32b_x86_asm || LDC_with_64b_x86_asm;
300 
301 
302 enum LDC_with_ARM = LDC_with_ARM32 | LDC_with_ARM64;
303 
304 version(DigitalMars)
305 {
306     version(D_InlineAsm_X86)
307         enum DMD_with_asm = true;
308     else version(D_InlineAsm_X86_64)
309         enum DMD_with_asm = true;
310     else
311         enum DMD_with_asm = false;
312 
313     version(D_InlineAsm_X86)
314         enum DMD_with_32bit_asm = DMD_with_asm; // sometimes you want a 32-bit DMD only solution
315     else
316         enum DMD_with_32bit_asm = false;
317 
318     version (D_SIMD)
319     {
320         enum DMD_with_DSIMD = !SSESizedVectorsAreEmulated;
321 
322         // Going further, does DMD have SSE4.1 through -mcpu?
323         static if (DMD_with_DSIMD)
324             enum bool DMD_with_DSIMD_and_SSE41 = __traits(compiles, int4(0) * int4(0));
325         else
326             enum bool DMD_with_DSIMD_and_SSE41 = false;
327 
328         // There is no DMD way to detect those instruction sets => pessimize.
329         // It would be nice to have a way to detect support for them at compile time.
330         enum DMD_with_DSIMD_and_SSE3  = DMD_with_DSIMD_and_SSE41; 
331         enum DMD_with_DSIMD_and_SSSE3 = DMD_with_DSIMD_and_SSE41;
332 
333         version(D_AVX)
334             enum DMD_with_DSIMD_and_AVX   = true;
335         else
336             enum DMD_with_DSIMD_and_AVX   = false;
337 
338         version(D_AVX2)
339             enum DMD_with_DSIMD_and_AVX2  = true;
340         else
341             enum DMD_with_DSIMD_and_AVX2  = false;
342 
343         enum DMD_with_DSIMD_and_SSE42 = DMD_with_DSIMD_and_AVX;
344     }
345     else
346     {
347         enum DMD_with_DSIMD = false;
348         enum DMD_with_DSIMD_and_SSE3  = false;
349         enum DMD_with_DSIMD_and_SSSE3 = false;
350         enum DMD_with_DSIMD_and_SSE41 = false;
351         enum DMD_with_DSIMD_and_SSE42 = false;
352         enum DMD_with_DSIMD_and_AVX   = false;
353         enum DMD_with_DSIMD_and_AVX2  = false;
354     }
355 }
356 else
357 {
358     enum DMD_with_asm = false;
359     enum DMD_with_32bit_asm = false;
360     enum DMD_with_DSIMD = false;
361     enum DMD_with_DSIMD_and_SSE3  = false;
362     enum DMD_with_DSIMD_and_SSSE3 = false;
363     enum DMD_with_DSIMD_and_SSE41 = false;
364     enum DMD_with_DSIMD_and_SSE42 = false;
365     enum DMD_with_DSIMD_and_AVX   = false;
366     enum DMD_with_DSIMD_and_AVX2  = false;
367 }
368 
369 
370 // Sometimes it can be helpful to merge builtin code paths; however, keep in mind that
371 // LDC and GDC builtins often subtly diverge w.r.t. unsigned vs signed vectors,
372 // return types, purity... Test it on Godbolt! This is safer with float and double intrinsics.
373 enum GDC_or_LDC_with_SSE  = GDC_with_SSE  || LDC_with_SSE;
374 enum GDC_or_LDC_with_SSE2 = GDC_with_SSE2 || LDC_with_SSE2;
375 enum GDC_or_LDC_with_SSE3 = GDC_with_SSE3 || LDC_with_SSE3;
376 enum GDC_or_LDC_with_SSE41 = GDC_with_SSE41 || LDC_with_SSE41;
377 enum GDC_or_LDC_with_SSE42 = GDC_with_SSE42 || LDC_with_SSE42;
378 
379 enum GDC_or_LDC_with_AVX  = GDC_with_AVX  || LDC_with_AVX;
380 enum GDC_or_LDC_with_AVX2 = GDC_with_AVX2 || LDC_with_AVX2;
381 enum GDC_or_LDC_with_SHA  = GDC_with_SHA  || LDC_with_SHA;
382 enum GDC_or_LDC_with_BMI2 = GDC_with_BMI2 || LDC_with_BMI2;
383 
384 static if (__VERSION__ >= 2102)
385 {
386     enum SIMD_COMPARISON_MASKS_8B  = !MMXSizedVectorsAreEmulated; // can do < <= >= > == with builtin 8-byte __vector types.
387     enum SIMD_COMPARISON_MASKS_16B = !SSESizedVectorsAreEmulated; // can do < <= >= > == with builtin 16-byte __vector types.
388     enum SIMD_COMPARISON_MASKS_32B = !AVXSizedVectorsAreEmulated; // can do < <= >= > == with builtin 32-byte __vector types.
389 }
390 else
391 {
392     enum SIMD_COMPARISON_MASKS_8B = false;
393     enum SIMD_COMPARISON_MASKS_16B = false;
394     enum SIMD_COMPARISON_MASKS_32B = false;
395 }
396 
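// Editor's illustration (not in the upstream module): with builtin 16-byte
// __vector types, comparison operators yield all-ones / all-zero lanes directly.
static if (SIMD_COMPARISON_MASKS_16B)
{
    unittest
    {
        int4 a = [1, 2, 3, 4];
        int4 b = [4, 3, 2, 1];
        int4 mask = a < b; // lane-wise: -1 where true, 0 where false
        static immutable int[4] expected = [-1, -1, 0, 0];
        assert(mask.array == expected);
    }
}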
397 
398 static if (LDC_with_ARM32)
399 {
400     package uint arm_get_fpcr() nothrow @nogc @trusted
401     {
402         return __builtin_arm_get_fpscr();
403     }
404 
405     package void arm_set_fpcr(uint cw) nothrow @nogc @trusted
406     {
407         __builtin_arm_set_fpscr(cw);
408     }
409 }
410 
411 static if (LDC_with_ARM64)
412 {
413     pragma(LDC_intrinsic, "llvm.aarch64.get.fpcr")
414         long __builtin_aarch64_get_fpcr() pure nothrow @nogc @safe;
415 
416     package uint arm_get_fpcr() pure nothrow @nogc @trusted
417     {
418         // LLVM intrinsic "llvm.aarch64.get.fpcr" seems buggy and doesn't return FPCR
419         return __asm!uint("mrs $0, fpcr", "=r");
420     }
421 
422     package void arm_set_fpcr(uint cw) nothrow @nogc @trusted
423     {
424         // Note: there doesn't seem to be an intrinsic in LLVM to set FPCR.
425         long save_x2;
426         __asm!void("str x2, $1 \n" ~
427                    "ldr w2, $0 \n" ~
428                    "msr fpcr, x2 \n" ~
429                    "ldr x2, $1 "   , "m,m", cw, &save_x2);
430     }
431 }
432 
433 
434 // For internal use only, since the public API deals with x86 semantics emulation.
435 enum uint _MM_ROUND_NEAREST_ARM     = 0x00000000;
436 enum uint _MM_ROUND_DOWN_ARM        = 0x00800000;
437 enum uint _MM_ROUND_UP_ARM          = 0x00400000;
438 enum uint _MM_ROUND_TOWARD_ZERO_ARM = 0x00C00000;
439 enum uint _MM_ROUND_MASK_ARM        = 0x00C00000;
440 enum uint _MM_FLUSH_ZERO_MASK_ARM   = 0x01000000;
441 
442 
443 //
444 //  <ROUNDING>
445 //
446 //  Why is this here? For DMD, we cannot use rint because _MM_SET_ROUNDING_MODE
447 //  doesn't change the FPU rounding mode, and isn't expected to do so.
448 //  So we devised these rounding functions to help achieve consistent rounding between
449 //  LDC and DMD. It's important that DMD uses whatever is in MXCSR to round.
450 //
451 //  Note: There is no MXCSR in ARM. But there is fpcr/fpscr, which implements similar
452 //  functionality.
453 //  https://developer.arm.com/documentation/dui0068/b/vector-floating-point-programming/vfp-system-registers/fpscr--the-floating-point-status-and-control-register
454 //  We use fpcr/fpscr since it's thread-local, so we can emulate those x86 conversions, albeit slowly.
455 
456 int convertFloatToInt32UsingMXCSR(float value) @trusted
457 {
458     int result;
459     version(GNU)
460     {
461         asm pure nothrow @nogc @trusted
462         {
463             "cvtss2si %1, %0\n": "=r"(result) : "x" (value);
464         }
465     }
466     else static if (LDC_with_ARM32)
467     {
468         result = __asm!int(`vldr s2, $1
469                             vcvtr.s32.f32 s2, s2
470                             vmov $0, s2`, "=r,m,~{s2}", value);
471     }
472     else static if (LDC_with_ARM64)
473     {
474         // Get current rounding mode.
475         uint fpscr = arm_get_fpcr();
476 
477         switch(fpscr & _MM_ROUND_MASK_ARM)
478         {
479             default:
480             case _MM_ROUND_NEAREST_ARM:     result = vcvtns_s32_f32(value); break;
481             case _MM_ROUND_DOWN_ARM:        result = vcvtms_s32_f32(value); break;
482             case _MM_ROUND_UP_ARM:          result = vcvtps_s32_f32(value); break;
483             case _MM_ROUND_TOWARD_ZERO_ARM: result = vcvts_s32_f32(value);  break;
484         }
485     }
486     else
487     {
488         asm pure nothrow @nogc @trusted
489         {
490             cvtss2si EAX, value;
491             mov result, EAX;
492         }
493     }
494     return result;
495 }
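
// Editor's illustration (not in the upstream module): assuming the default
// round-to-nearest-even mode is in effect, halfway values go to the even
// integer, which is the CVTSS2SI behaviour this helper emulates.
unittest
{
    assert(convertFloatToInt32UsingMXCSR(1.25f) == 1);
    assert(convertFloatToInt32UsingMXCSR(2.5f)  == 2);  // tie, rounds to even
    assert(convertFloatToInt32UsingMXCSR(-2.5f) == -2); // tie, rounds to even
}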
496 
497 int convertDoubleToInt32UsingMXCSR(double value) @trusted
498 {
499     int result;
500     version(GNU)
501     {
502         asm pure nothrow @nogc @trusted
503         {
504             "cvtsd2si %1, %0\n": "=r"(result) : "x" (value);
505         }
506     }
507     else static if (LDC_with_ARM32)
508     {
509         result = __asm!int(`vldr d2, $1
510                             vcvtr.s32.f64 s2, d2
511                             vmov $0, s2`, "=r,m,~{s2},~{d2}", value);
512     }
513     else static if (LDC_with_ARM64)
514     {
515         // Get current rounding mode.
516         uint fpscr = arm_get_fpcr();
517 
518         switch(fpscr & _MM_ROUND_MASK_ARM)
519         {
520             default:
521             case _MM_ROUND_NEAREST_ARM:     result = vcvtns_s32_f64(value); break;
522             case _MM_ROUND_DOWN_ARM:        result = vcvtms_s32_f64(value); break;
523             case _MM_ROUND_UP_ARM:          result = vcvtps_s32_f64(value); break;
524             case _MM_ROUND_TOWARD_ZERO_ARM: result = vcvts_s32_f64(value);  break;
525         }
526     }
527     else
528     {
529         asm pure nothrow @nogc @trusted
530         {
531             cvtsd2si EAX, value;
532             mov result, EAX;
533         }
534     }
535     return result;
536 }
537 
538 long convertFloatToInt64UsingMXCSR(float value) @trusted
539 {
540     static if (LDC_with_ARM32)
541     {
542         // We have to resort to libc since 32-bit ARM 
543         // doesn't seem to have 64-bit registers.
544         
545         uint fpscr = arm_get_fpcr(); // Get current rounding mode.
546 
547         // Note: convert to double precision, else rounding could differ for large integers.
548         double asDouble = value; 
549 
550         switch(fpscr & _MM_ROUND_MASK_ARM)
551         {
552             default:
553             case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_round(asDouble));
554             case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(asDouble));
555             case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(asDouble));
556             case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(asDouble);
557         }
558     }
559     else static if (LDC_with_ARM64)
560     {
561         uint fpscr = arm_get_fpcr();
562 
563         switch(fpscr & _MM_ROUND_MASK_ARM)
564         {
565             default:
566             case _MM_ROUND_NEAREST_ARM:     return vcvtns_s64_f32(value);
567             case _MM_ROUND_DOWN_ARM:        return vcvtms_s64_f32(value);
568             case _MM_ROUND_UP_ARM:          return vcvtps_s64_f32(value);
569             case _MM_ROUND_TOWARD_ZERO_ARM: return vcvts_s64_f32(value);
570         }
571     }
572     // 64-bit can use an SSE instruction
573     else version(D_InlineAsm_X86_64)
574     {
575         long result;
576         version(LDC) // work-around for "Data definition directives inside inline asm are not supported yet."
577         {
578             asm pure nothrow @nogc @trusted
579             {
580                 movss XMM0, value;
581                 cvtss2si RAX, XMM0;
582                 mov result, RAX;
583             }
584         }
585         else
586         {
587             asm pure nothrow @nogc @trusted
588             {
589                 movss XMM0, value;
590                 db 0xf3; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtss2si RAX, XMM0 (DMD refuses to emit)
591                 mov result, RAX;
592             }
593         }
594         return result;
595     }
596     else version(D_InlineAsm_X86)
597     {
598         // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
599         // This leads to an unfortunate FPU sequence in every C++ compiler.
600         // See: https://godbolt.org/z/vZym77
601 
602         // Get current MXCSR rounding
603         uint sseRounding;
604         ushort savedFPUCW;
605         ushort newFPUCW;
606         long result;
607         asm pure nothrow @nogc @trusted
608         {
609             stmxcsr sseRounding;
610             fld value;
611             fnstcw savedFPUCW;
612             mov AX, savedFPUCW;
613             and AX, 0xf3ff;          // clear FPU rounding bits
614             movzx ECX, word ptr sseRounding;
615             and ECX, 0x6000;         // only keep SSE rounding bits
616             shr ECX, 3;
617             or AX, CX;               // make a new control word for FPU with SSE bits
618             mov newFPUCW, AX;
619             fldcw newFPUCW;
620             fistp qword ptr result;            // convert, respecting MXCSR (but not other control word things)
621             fldcw savedFPUCW;
622         }
623         return result;
624     }
625     else static if (GDC_with_x86)
626     {
627         version(X86_64) // 64-bit can just use the right instruction
628         {
629             static assert(GDC_with_SSE);
630             __m128 A;
631             A.ptr[0] = value;
632             return __builtin_ia32_cvtss2si64 (A);
633         }
634         else version(X86) // 32-bit
635         {
636             // This is untested!
637             uint sseRounding;
638             ushort savedFPUCW;
639             ushort newFPUCW;
640             long result;
641             asm pure nothrow @nogc @trusted
642             {
643                 "stmxcsr %1;\n" ~
644                 "fld %2;\n" ~
645                 "fnstcw %3;\n" ~
646                 "movw %3, %%ax;\n" ~
647                 "andw $0xf3ff, %%ax;\n" ~
648                 "movzwl %1, %%ecx;\n" ~
649                 "andl $0x6000, %%ecx;\n" ~
650                 "shrl $3, %%ecx;\n" ~
651                 "orw %%cx, %%ax\n" ~
652                 "movw %%ax, %4;\n" ~
653                 "fldcw %4;\n" ~
654                 "fistpll %0;\n" ~
655                 "fldcw %3;\n" 
656                   : "=m"(result)    // %0
657                   : "m" (sseRounding),
658                     "f" (value),
659                     "m" (savedFPUCW),
660                     "m" (newFPUCW) 
661                   : "eax", "ecx", "st";
662             }
663             return result;
664         }
665         else
666             static assert(false);
667     }
668     else
669         static assert(false);
670 }
671 
672 
673 ///ditto
674 long convertDoubleToInt64UsingMXCSR(double value) @trusted
675 {
676     static if (LDC_with_ARM32)
677     {
678         // We have to resort to libc since 32-bit ARM 
679         // doesn't seem to have 64-bit registers.
680         uint fpscr = arm_get_fpcr(); // Get current rounding mode.
681         switch(fpscr & _MM_ROUND_MASK_ARM)
682         {
683             default:
684             case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_round(value));
685             case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(value));
686             case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(value));
687             case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(value);
688         }
689     }
690     else static if (LDC_with_ARM64)
691     {
692         // Get current rounding mode.
693         uint fpscr = arm_get_fpcr();
694 
695         switch(fpscr & _MM_ROUND_MASK_ARM)
696         {
697             default:
698             case _MM_ROUND_NEAREST_ARM:     return vcvtns_s64_f64(value);
699             case _MM_ROUND_DOWN_ARM:        return vcvtms_s64_f64(value);
700             case _MM_ROUND_UP_ARM:          return vcvtps_s64_f64(value);
701             case _MM_ROUND_TOWARD_ZERO_ARM: return vcvts_s64_f64(value);
702         }
703     }
704     // 64-bit can use an SSE instruction
705     else version(D_InlineAsm_X86_64)
706     {
707         long result;
708         version(LDC) // work-around for "Data definition directives inside inline asm are not supported yet."
709         {
710             asm pure nothrow @nogc @trusted
711             {
712                 movsd XMM0, value;
713                 cvtsd2si RAX, XMM0;
714                 mov result, RAX;
715             }
716         }
717         else
718         {
719             asm pure nothrow @nogc @trusted
720             {
721                 movsd XMM0, value;
722                 db 0xf2; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtsd2si RAX, XMM0 (DMD refuses to emit)
723                 mov result, RAX;
724             }
725         }
726         return result;
727     }
728     else version(D_InlineAsm_X86)
729     {
730         // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
731         // This leads to an unfortunate FPU sequence in every C++ compiler.
732         // See: https://godbolt.org/z/vZym77
733 
734         // Get current MXCSR rounding
735         uint sseRounding;
736         ushort savedFPUCW;
737         ushort newFPUCW;
738         long result;
739         asm pure nothrow @nogc @trusted
740         {
741             stmxcsr sseRounding;
742             fld value;
743             fnstcw savedFPUCW;
744             mov AX, savedFPUCW;
745             and AX, 0xf3ff;
746             movzx ECX, word ptr sseRounding;
747             and ECX, 0x6000;
748             shr ECX, 3;
749             or AX, CX;
750             mov newFPUCW, AX;
751             fldcw newFPUCW;
752             fistp result;
753             fldcw savedFPUCW;
754         }
755         return result;
756     }
757     else static if (GDC_with_x86)
758     {
759         version(X86_64)
760         {
761             static assert(GDC_with_SSE2);
762             __m128d A;
763             A.ptr[0] = value;
764             return __builtin_ia32_cvtsd2si64 (A);
765         }
766         else
767         {
768             // This is untested!
769             uint sseRounding;
770             ushort savedFPUCW;
771             ushort newFPUCW;
772             long result;
773             asm pure nothrow @nogc @trusted
774             {
775                 "stmxcsr %1;\n" ~
776                 "fld %2;\n" ~
777                 "fnstcw %3;\n" ~
778                 "movw %3, %%ax;\n" ~
779                 "andw $0xf3ff, %%ax;\n" ~
780                 "movzwl %1, %%ecx;\n" ~
781                 "andl $0x6000, %%ecx;\n" ~
782                 "shrl $3, %%ecx;\n" ~
783                 "orw %%cx, %%ax\n" ~
784                 "movw %%ax, %4;\n" ~
785                 "fldcw %4;\n" ~
786                 "fistpll %0;\n" ~
787                 "fldcw %3;\n"         
788                   : "=m"(result)    // %0
789                   : "m" (sseRounding),
790                     "t" (value),
791                     "m" (savedFPUCW),
792                     "m" (newFPUCW) 
793                   : "eax", "ecx", "st";
794             }
795             return result;
796         }
797     }
798     else
799         static assert(false);
800 }
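
// Editor's illustration (not in the upstream module): exactly representable
// inputs convert identically under every rounding mode, so these asserts hold
// regardless of the current MXCSR/FPCR state.
unittest
{
    assert(convertFloatToInt64UsingMXCSR(-1.0f) == -1);
    assert(convertDoubleToInt64UsingMXCSR(0x1p+40) == 1L << 40);
}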
801 
802 //
803 //  </ROUNDING>
804 //
805 
806 
807 // using the Intel terminology here
808 
809 byte saturateSignedWordToSignedByte(short value) pure @safe
810 {
811     if (value > 127) value = 127;
812     if (value < -128) value = -128;
813     return cast(byte) value;
814 }
815 
816 ubyte saturateSignedWordToUnsignedByte(short value) pure @safe
817 {
818     if (value > 255) value = 255;
819     if (value < 0) value = 0;
820     return cast(ubyte) value;
821 }
822 
823 short saturateSignedIntToSignedShort(int value) pure @safe
824 {
825     if (value > 32767) value = 32767;
826     if (value < -32768) value = -32768;
827     return cast(short) value;
828 }
829 
830 ushort saturateSignedIntToUnsignedShort(int value) pure @safe
831 {
832     if (value > 65535) value = 65535;
833     if (value < 0) value = 0;
834     return cast(ushort) value;
835 }
836 
837 unittest // test saturate operations
838 {
839     assert( saturateSignedWordToSignedByte(32000) == 127);
840     assert( saturateSignedWordToUnsignedByte(32000) == 255);
841     assert( saturateSignedWordToSignedByte(-4000) == -128);
842     assert( saturateSignedWordToUnsignedByte(-4000) == 0);
843     assert( saturateSignedIntToSignedShort(32768) == 32767);
844     assert( saturateSignedIntToUnsignedShort(32768) == 32768);
845     assert( saturateSignedIntToSignedShort(-32769) == -32768);
846     assert( saturateSignedIntToUnsignedShort(-32769) == 0);
847 }
848 
849 version(unittest)
850 {
851     // This is just for debugging tests
852     import core.stdc.stdio: printf;
853 
854     // Helpers for printing vectors during implementation and debugging.
855     // Note: you can override `pure` within a `debug` clause
856 
857     void _mm_print_pi64(__m64 v) @trusted
858     {
859         long1 vl = cast(long1)v;
860         printf("%lld\n", vl.array[0]);
861     }
862 
863     void _mm_print_pi32(__m64 v) @trusted
864     {
865         int[2] C = (cast(int2)v).array;
866         printf("%d %d\n", C[0], C[1]);
867     }
868 
869     void _mm_print_pi16(__m64 v) @trusted
870     {
871         short[4] C = (cast(short4)v).array;
872         printf("%d %d %d %d\n", C[0], C[1], C[2], C[3]);
873     }
874 
875     void _mm_print_pi8(__m64 v) @trusted
876     {
877         byte[8] C = (cast(byte8)v).array;
878         printf("%d %d %d %d %d %d %d %d\n",
879         C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7]);
880     }
881 
882     void _mm_print_epi64(__m128i v) @trusted
883     {
884         long2 vl = cast(long2)v;
885         printf("%lld %lld\n", vl.array[0], vl.array[1]);
886     }
887 
888     void _mm_print_epi32(__m128i v) @trusted
889     {
890         printf("%d %d %d %d\n",
891               v.array[0], v.array[1], v.array[2], v.array[3]);
892     }  
893 
894     void _mm_print_epi16(__m128i v) @trusted
895     {
896         short[8] C = (cast(short8)v).array;
897         printf("%d %d %d %d %d %d %d %d\n",
898         C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7]);
899     }
900 
901     void _mm_print_epi8(__m128i v) @trusted
902     {
903         byte[16] C = (cast(byte16)v).array;
904         printf("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n",
905         C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7], C[8], C[9], C[10], C[11], C[12], C[13], C[14], C[15]);
906     }
907 
908     void _mm_print_ps(__m128 v) @trusted
909     {
910         // %g because %f can conceal very small numbers and print zero instead
911         float[4] C = (cast(float4)v).array;
912         printf("%g %g %g %g\n", C[0], C[1], C[2], C[3]);
913     }
914 
915     void _mm_print_pd(__m128d v) @trusted
916     {
917         double[2] C = (cast(double2)v).array;
918         printf("%f %f\n", C[0], C[1]);
919     }
920 
921     void _mm256_print_pd(__m256d v) @trusted
922     {
923         // %g because %f can conceal very small numbers and print zero instead
924         printf("%g %g %g %g\n", v.array[0], v.array[1], v.array[2], v.array[3]); 
925     }
926 
927     void _mm256_print_ps(__m256 v) @trusted
928     {
929         // %g because %f can conceal very small numbers and print zero instead
930         printf("%g %g %g %g %g %g %g %g\n", 
931             v.array[0], v.array[1], v.array[2], v.array[3],
932             v.array[4], v.array[5], v.array[6], v.array[7]); 
933     }
934 
935     void _mm256_print_epi16(__m256i v) @trusted
936     {
937         short16 vl = cast(short16)v;
938         printf("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n", 
939                vl.array[0], vl.array[1], vl.array[2], vl.array[3],
940                vl.array[4], vl.array[5], vl.array[6], vl.array[7],
941                vl.array[8], vl.array[9], vl.array[10], vl.array[11],
942                vl.array[12], vl.array[13], vl.array[14], vl.array[15]);
943     }
944 
945     void _mm256_print_epi32(__m256i v) @trusted
946     {
947         int8 vl = cast(int8)v;
948         printf("%d %d %d %d %d %d %d %d\n", vl.array[0], vl.array[1], vl.array[2], vl.array[3],
949                                             vl.array[4], vl.array[5], vl.array[6], vl.array[7]);
950     }
951 
952     void _mm256_print_epi64(__m256i v) @trusted
953     {
954         long4 vl = cast(long4)v;
955         printf("%lld %lld %lld %lld\n", vl.array[0], vl.array[1], vl.array[2], vl.array[3]);
956     }
957 
958     void _mm256_print_epi8(__m256i v) @trusted
959     {
960         byte[32] C = (cast(byte32)v).array;
961         printf("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n",
962                C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7], 
963                C[8], C[9], C[10], C[11], C[12], C[13], C[14], C[15],
964                C[16], C[17], C[18], C[19], C[20], C[21], C[22], C[23], 
965                C[24], C[25], C[26], C[27], C[28], C[29], C[30], C[31]);
966 
967     }
968 }
969 
970 
971 //
972 //  <FLOATING-POINT COMPARISONS>
973 //
974 // Note: `ldc.simd` cannot express all nuances of FP comparisons, so we
975 //       need different IR generation.
976 
977 enum FPComparison
978 {
979     false_,// always false
980     oeq,   // ordered and equal
981     ogt,   // ordered and greater than
982     oge,   // ordered and greater than or equal
983     olt,   // ordered and less than
984     ole,   // ordered and less than or equal
985     one,   // ordered and not equal
986     ord,   // ordered (no nans)
987     ueq,   // unordered or equal
988     ugt,   // unordered or greater than ("nle")
989     uge,   // unordered or greater than or equal ("nlt")
990     ult,   // unordered or less than ("nge")
991     ule,   // unordered or less than or equal ("ngt")
992     une,   // unordered or not equal ("neq")
993     uno,   // unordered (either nans)
994     true_, // always true
995 }
996 
997 private static immutable string[FPComparison.max+1] FPComparisonToString =
998 [
999     "false",
1000     "oeq",
1001     "ogt",
1002     "oge",
1003     "olt",
1004     "ole",
1005     "one",
1006     "ord",
1007     "ueq",
1008     "ugt",
1009     "uge",
1010     "ult",
1011     "ule",
1012     "une",
1013     "uno",
1014     "true"
1015 ];
1016 
1017 // AVX FP comparison to FPComparison
1018 FPComparison mapAVXFPComparison(int imm8) pure @safe
1019 {
1020     // Always map to the non-signalling variant.
1021     static immutable FPComparison[16] mapping =
1022     [
1023         FPComparison.oeq, // _CMP_EQ_OQ
1024         FPComparison.olt, // _CMP_LT_OS
1025         FPComparison.ole, // _CMP_LE_OS
1026         FPComparison.uno, // _CMP_UNORD_Q
1027         FPComparison.une, // _CMP_NEQ_UQ // TODO does it mean not-equal OR unordered?
1028         FPComparison.uge, // _CMP_NLT_US
1029         FPComparison.ugt, // _CMP_NLE_US
1030         FPComparison.ord, // _CMP_ORD_Q
1031         FPComparison.ueq,   // _CMP_EQ_UQ  
1032         FPComparison.ult,   // _CMP_NGE_US 
1033         FPComparison.ule,   // _CMP_NGT_US 
1034         FPComparison.false_,// _CMP_FALSE_OQ
1035         FPComparison.one,   // _CMP_NEQ_OQ
1036         FPComparison.oge,   // _CMP_GE_OS
1037         FPComparison.ogt,   // _CMP_GT_OS
1038         FPComparison.true_  // _CMP_TRUE_UQ
1039     ];
1040 
1041     return mapping[imm8 & 0x0f]; // note: signalling NaN information is mixed up
1042 }
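
// Editor's illustration (not in the upstream module): a few sanity checks
// against the mapping table above (only the low 4 bits of imm8 are considered).
unittest
{
    assert(mapAVXFPComparison(0)  == FPComparison.oeq);   // _CMP_EQ_OQ
    assert(mapAVXFPComparison(4)  == FPComparison.une);   // _CMP_NEQ_UQ
    assert(mapAVXFPComparison(15) == FPComparison.true_); // _CMP_TRUE_UQ
}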
1043 
1044 // Individual float comparison: returns true or false.
1045 // Useful for DMD and testing.
1046 private bool compareFloat(T)(FPComparison comparison, T a, T b) pure @safe
1047 {
1048     bool unordered = isnan(a) || isnan(b);
1049     final switch(comparison) with(FPComparison)
1050     {
1051         case false_: return false;
1052         case oeq: return a == b;
1053         case ogt: return a > b;
1054         case oge: return a >= b;
1055         case olt: return a < b;
1056         case ole: return a <= b;
1057         case one: return !unordered && (a != b); // NaN with != always yields true
1058         case ord: return !unordered; 
1059         case ueq: return unordered || (a == b);
1060         case ugt: return unordered || (a > b);
1061         case uge: return unordered || (a >= b);
1062         case ult: return unordered || (a < b);
1063         case ule: return unordered || (a <= b);
1064         case une: return (a != b); // NaN with != always yields true
1065         case uno: return unordered;
1066         case true_: return true;
1067     }
1068 }
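
// Editor's illustration (not in the upstream module): ordered predicates are
// false as soon as one operand is NaN, while the unordered ones become true.
unittest
{
    assert(!compareFloat!float(FPComparison.oeq, float.nan, 1.0f));
    assert( compareFloat!float(FPComparison.ueq, float.nan, 1.0f));
    assert( compareFloat!float(FPComparison.ord, 1.0f, 2.0f));
    assert(!compareFloat!float(FPComparison.ord, float.nan, 2.0f));
}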
1069 
1070 static if (LDC_with_optimizations) // this saves time in bigger projects, since LDCInlineIR gets more expensive there.
1071 {
1072     /// Provides packed float comparisons
1073     package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @safe
1074     {
1075         enum ir = `
1076             %cmp = fcmp `~ FPComparisonToString[comparison] ~` <4 x float> %0, %1
1077             %r = sext <4 x i1> %cmp to <4 x i32>
1078             ret <4 x i32> %r`;
1079 
1080         return LDCInlineIR!(ir, int4, float4, float4)(a, b);
1081     }
1082 
1083     ///ditto
1084     package int8 cmpps256(FPComparison comparison)(float8 a, float8 b) pure @safe
1085     {
1086         enum ir = `
1087             %cmp = fcmp `~ FPComparisonToString[comparison] ~` <8 x float> %0, %1
1088             %r = sext <8 x i1> %cmp to <8 x i32>
1089             ret <8 x i32> %r`;
1090         return LDCInlineIR!(ir, int8, float8, float8)(a, b);
1091     }
1092 
1093     /// Provides packed double comparisons
1094     package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @safe
1095     {
1096         enum ir = `
1097             %cmp = fcmp `~ FPComparisonToString[comparison] ~` <2 x double> %0, %1
1098             %r = sext <2 x i1> %cmp to <2 x i64>
1099             ret <2 x i64> %r`;
1100 
1101         return LDCInlineIR!(ir, long2, double2, double2)(a, b);
1102     }
1103 
1104     ///ditto 
1105     package long4 cmppd256(FPComparison comparison)(double4 a, double4 b) pure @safe
1106     {
1107         enum ir = `
1108             %cmp = fcmp `~ FPComparisonToString[comparison] ~` <4 x double> %0, %1
1109             %r = sext <4 x i1> %cmp to <4 x i64>
1110             ret <4 x i64> %r`;
1111         return LDCInlineIR!(ir, long4, double4, double4)(a, b);
1112     }
1113 
1114     /// CMPSS-style comparisons.
1115     /// clang implements it through x86 intrinsics; it is possible with IR alone,
1116     /// but that leads to less optimal code.
1117     /// PERF: try to implement it with __builtin_ia32_cmpss and an immediate from 0 to 7.
1118     /// Not that simple.
1119     package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @safe
1120     {
1121         /*
1122         enum ubyte predicateNumber = FPComparisonToX86Predicate[comparison];
1123         enum bool invertOp = (predicateNumber & 0x80) != 0;
1124         static if(invertOp)
1125             return __builtin_ia32_cmpsd(b, a, predicateNumber & 0x7f);
1126         else
1127             return __builtin_ia32_cmpsd(a, b, predicateNumber & 0x7f);
1128         */
1129         enum ir = `
1130             %cmp = fcmp `~ FPComparisonToString[comparison] ~` float %0, %1
1131             %r = sext i1 %cmp to i32
1132             %r2 = bitcast i32 %r to float
1133             ret float %r2`;
1134 
1135         float4 r = a;
1136         r[0] = LDCInlineIR!(ir, float, float, float)(a[0], b[0]);
1137         return r;
1138     }
1139 
1140     /// CMPSD-style comparisons.
1141     /// clang implements it through x86 intrinsics; it is possible with IR alone,
1142     /// but that leads to less optimal code.
1143     /// PERF: try to implement it with __builtin_ia32_cmpsd and an immediate from 0 to 7.
1144     /// Not that simple.
1145     package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @safe
1146     {
1147         enum ir = `
1148             %cmp = fcmp `~ FPComparisonToString[comparison] ~` double %0, %1
1149             %r = sext i1 %cmp to i64
1150             %r2 = bitcast i64 %r to double
1151             ret double %r2`;
1152 
1153         double2 r = a;
1154         r[0] = LDCInlineIR!(ir, double, double, double)(a[0], b[0]);
1155         return r;
1156     }
1157 }
1158 else
1159 {
1160     /// Provides packed float comparisons
1161     package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @trusted
1162     {
1163         int4 result;
1164         foreach(i; 0..4)
1165         {
1166             result.ptr[i] = compareFloat!float(comparison, a.array[i], b.array[i]) ? -1 : 0;
1167         }
1168         return result;
1169     }
1170     ///ditto
1171     package int8 cmpps256(FPComparison comparison)(float8 a, float8 b) pure @trusted
1172     {
1173         int8 result;
1174         foreach(i; 0..8)
1175         {
1176             result.ptr[i] = compareFloat!float(comparison, a.array[i], b.array[i]) ? -1 : 0;
1177         }
1178         return result;
1179     }
1180 
1181     /// Provides packed double comparisons
1182     package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @trusted
1183     {
1184         long2 result;
1185         foreach(i; 0..2)
1186         {
1187             result.ptr[i] = compareFloat!double(comparison, a.array[i], b.array[i]) ? -1 : 0;
1188         }
1189         return result;
1190     }
1191     ///ditto
1192     package long4 cmppd256(FPComparison comparison)(double4 a, double4 b) pure @trusted
1193     {
1194         long4 result;
1195         foreach(i; 0..4)
1196         {
1197             result.ptr[i] = compareFloat!double(comparison, a.array[i], b.array[i]) ? -1 : 0;
1198         }
1199         return result;
1200     }
1201 
1202     /// Provides CMPSS-style comparison
1203     package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @trusted
1204     {
1205         int4 result = cast(int4)a;
1206         result.ptr[0] = compareFloat!float(comparison, a.array[0], b.array[0]) ? -1 : 0;
1207         return cast(float4)result;
1208     }
1209 
1210     /// Provides CMPSD-style comparison
1211     package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @trusted
1212     {
1213         long2 result = cast(long2)a;
1214         result.ptr[0] = compareFloat!double(comparison, a.array[0], b.array[0]) ? -1 : 0;
1215         return cast(double2)result;
1216     }
1217 }
1218 unittest // cmpps
1219 {
1220     // Check that every comparison type is working
1221     float4 A = [1, 3, 5, float.nan];
1222     float4 B = [2, 3, 4, 5];
1223 
1224     int4 result_oeq = cmpps!(FPComparison.oeq)(A, B);
1225     int4 result_ogt = cmpps!(FPComparison.ogt)(A, B);
1226     int4 result_oge = cmpps!(FPComparison.oge)(A, B);
1227     int4 result_olt = cmpps!(FPComparison.olt)(A, B);
1228     int4 result_ole = cmpps!(FPComparison.ole)(A, B);
1229     int4 result_one = cmpps!(FPComparison.one)(A, B);
1230     int4 result_ord = cmpps!(FPComparison.ord)(A, B);
1231     int4 result_ueq = cmpps!(FPComparison.ueq)(A, B);
1232     int4 result_ugt = cmpps!(FPComparison.ugt)(A, B);
1233     int4 result_uge = cmpps!(FPComparison.uge)(A, B);
1234     int4 result_ult = cmpps!(FPComparison.ult)(A, B);
1235     int4 result_ule = cmpps!(FPComparison.ule)(A, B);
1236     int4 result_une = cmpps!(FPComparison.une)(A, B);
1237     int4 result_uno = cmpps!(FPComparison.uno)(A, B);
1238 
1239     static immutable int[4] correct_oeq    = [ 0,-1, 0, 0];
1240     static immutable int[4] correct_ogt    = [ 0, 0,-1, 0];
1241     static immutable int[4] correct_oge    = [ 0,-1,-1, 0];
1242     static immutable int[4] correct_olt    = [-1, 0, 0, 0];
1243     static immutable int[4] correct_ole    = [-1,-1, 0, 0];
1244     static immutable int[4] correct_one    = [-1, 0,-1, 0];
1245     static immutable int[4] correct_ord    = [-1,-1,-1, 0];
1246     static immutable int[4] correct_ueq    = [ 0,-1, 0,-1];
1247     static immutable int[4] correct_ugt    = [ 0, 0,-1,-1];
1248     static immutable int[4] correct_uge    = [ 0,-1,-1,-1];
1249     static immutable int[4] correct_ult    = [-1, 0, 0,-1];
1250     static immutable int[4] correct_ule    = [-1,-1, 0,-1];
1251     static immutable int[4] correct_une    = [-1, 0,-1,-1];
1252     static immutable int[4] correct_uno    = [ 0, 0, 0,-1];
1253 
1254     assert(result_oeq.array == correct_oeq);
1255     assert(result_ogt.array == correct_ogt);
1256     assert(result_oge.array == correct_oge);
1257     assert(result_olt.array == correct_olt);
1258     assert(result_ole.array == correct_ole);
1259     assert(result_one.array == correct_one);
1260     assert(result_ord.array == correct_ord);
1261     assert(result_ueq.array == correct_ueq);
1262     assert(result_ugt.array == correct_ugt);
1263     assert(result_uge.array == correct_uge);
1264     assert(result_ult.array == correct_ult);
1265     assert(result_ule.array == correct_ule);
1266     assert(result_une.array == correct_une);
1267     assert(result_uno.array == correct_uno);
1268 }
1269 unittest
1270 {
1271     double2 a = [1, 3];
1272     double2 b = [2, 3];
1273     long2 c = cmppd!(FPComparison.ult)(a, b);
1274     static immutable long[2] correct = [cast(long)(-1), 0];
1275     assert(c.array == correct);
1276 }
1277 unittest // cmpss
1278 {
1279     void testComparison(FPComparison comparison)(float4 A, float4 B)
1280     {
1281         float4 result = cmpss!comparison(A, B);
1282         int4 iresult = cast(int4)result;
1283         int expected = compareFloat!float(comparison, A.array[0], B.array[0]) ? -1 : 0;
1284         assert(iresult.array[0] == expected);
1285         assert(result.array[1] == A.array[1]);
1286         assert(result.array[2] == A.array[2]);
1287         assert(result.array[3] == A.array[3]);
1288     }
1289 
1290     // Check that every comparison type is working
1291     float4 A = [1, 3, 5, 6];
1292     float4 B = [2, 3, 4, 5];
1293     float4 C = [float.nan, 3, 4, 5];
1294 
1295     testComparison!(FPComparison.oeq)(A, B);
1296     testComparison!(FPComparison.oeq)(A, C);
1297     testComparison!(FPComparison.ogt)(A, B);
1298     testComparison!(FPComparison.ogt)(A, C);
1299     testComparison!(FPComparison.oge)(A, B);
1300     testComparison!(FPComparison.oge)(A, C);
1301     testComparison!(FPComparison.olt)(A, B);
1302     testComparison!(FPComparison.olt)(A, C);
1303     testComparison!(FPComparison.ole)(A, B);
1304     testComparison!(FPComparison.ole)(A, C);
1305     testComparison!(FPComparison.one)(A, B);
1306     testComparison!(FPComparison.one)(A, C);
1307     testComparison!(FPComparison.ord)(A, B);
1308     testComparison!(FPComparison.ord)(A, C);
1309     testComparison!(FPComparison.ueq)(A, B);
1310     testComparison!(FPComparison.ueq)(A, C);
1311     testComparison!(FPComparison.ugt)(A, B);
1312     testComparison!(FPComparison.ugt)(A, C);
1313     testComparison!(FPComparison.uge)(A, B);
1314     testComparison!(FPComparison.uge)(A, C);
1315     testComparison!(FPComparison.ult)(A, B);
1316     testComparison!(FPComparison.ult)(A, C);
1317     testComparison!(FPComparison.ule)(A, B);
1318     testComparison!(FPComparison.ule)(A, C);
1319     testComparison!(FPComparison.une)(A, B);
1320     testComparison!(FPComparison.une)(A, C);
1321     testComparison!(FPComparison.uno)(A, B);
1322     testComparison!(FPComparison.uno)(A, C);
1323 }
1324 unittest // cmpsd
1325 {
1326     void testComparison(FPComparison comparison)(double2 A, double2 B)
1327     {
1328         double2 result = cmpsd!comparison(A, B);
1329         long2 iresult = cast(long2)result;
1330         long expected = compareFloat!double(comparison, A.array[0], B.array[0]) ? -1 : 0;
1331         assert(iresult.array[0] == expected);
1332         assert(result.array[1] == A.array[1]);
1333     }
1334 
1335     // Check that every comparison type is working
1336     double2 A = [1, 3];
1337     double2 B = [2, 4];
1338     double2 C = [double.nan, 5];
1339 
1340     testComparison!(FPComparison.oeq)(A, B);
1341     testComparison!(FPComparison.oeq)(A, C);
1342     testComparison!(FPComparison.ogt)(A, B);
1343     testComparison!(FPComparison.ogt)(A, C);
1344     testComparison!(FPComparison.oge)(A, B);
1345     testComparison!(FPComparison.oge)(A, C);
1346     testComparison!(FPComparison.olt)(A, B);
1347     testComparison!(FPComparison.olt)(A, C);
1348     testComparison!(FPComparison.ole)(A, B);
1349     testComparison!(FPComparison.ole)(A, C);
1350     testComparison!(FPComparison.one)(A, B);
1351     testComparison!(FPComparison.one)(A, C);
1352     testComparison!(FPComparison.ord)(A, B);
1353     testComparison!(FPComparison.ord)(A, C);
1354     testComparison!(FPComparison.ueq)(A, B);
1355     testComparison!(FPComparison.ueq)(A, C);
1356     testComparison!(FPComparison.ugt)(A, B);
1357     testComparison!(FPComparison.ugt)(A, C);
1358     testComparison!(FPComparison.uge)(A, B);
1359     testComparison!(FPComparison.uge)(A, C);
1360     testComparison!(FPComparison.ult)(A, B);
1361     testComparison!(FPComparison.ult)(A, C);
1362     testComparison!(FPComparison.ule)(A, B);
1363     testComparison!(FPComparison.ule)(A, C);
1364     testComparison!(FPComparison.une)(A, B);
1365     testComparison!(FPComparison.une)(A, C);
1366     testComparison!(FPComparison.uno)(A, B);
1367     testComparison!(FPComparison.uno)(A, C);
1368 }
1369 
1370 //
1371 //  </FLOATING-POINT COMPARISONS>
1372 //
1373 
1374 
1375 __m64 to_m64(__m128i a) pure @trusted
1376 {
1377     long2 la = cast(long2)a;
1378     long1 r = la.array[0];
1379     return r;
1380 }
1381 
1382 __m128i to_m128i(__m64 a) pure @trusted
1383 {
1384   /* Not sufficient to avoid https://issues.dlang.org/show_bug.cgi?id=21474 
1385     
1386     version(DigitalMars) // Workaround for https://issues.dlang.org/show_bug.cgi?id=21474 
1387     {
1388         long2 r = a.array[0];
1389         r.ptr[1] = 0;
1390         return cast(int4)r;
1391     }
1392     else */
1393     {
1394         long2 r = [0, 0];
1395         r.ptr[0] = a.array[0];
1396         return cast(__m128i)r;
1397     }
1398 }
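
// Editor's illustration (not in the upstream module): to_m128i zero-extends into
// the low 64-bit lane, so a round-trip through to_m64 preserves the value.
unittest
{
    __m64 a = 0x0123456789ABCDEF;
    __m128i b = to_m128i(a);
    assert((cast(long2)b).array[0] == 0x0123456789ABCDEF);
    assert((cast(long2)b).array[1] == 0);
    assert(to_m64(b).array[0] == 0x0123456789ABCDEF);
}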
1399 
1400 
1401 // ADDITIONAL LLVM INTRINSICS
1402 // Basically, LDC hasn't added them yet.
1403 version(LDC)
1404 {
1405     static if (__VERSION__ >= 2097) // LDC 1.27+
1406     {
1407         pragma(LDC_intrinsic, "llvm.abs.i#")
1408             T inteli_llvm_abs(T)(T val, bool attrib);
1409     }
1410 
1411     static if (__VERSION__ >= 2092) // LDC 1.22+
1412     {
1413         pragma(LDC_intrinsic, "llvm.sadd.sat.i#")
1414             T inteli_llvm_adds(T)(T a, T b) pure @safe;
1415         pragma(LDC_intrinsic, "llvm.ssub.sat.i#")
1416             T inteli_llvm_subs(T)(T a, T b) pure @safe;
1417         pragma(LDC_intrinsic, "llvm.uadd.sat.i#")
1418             T inteli_llvm_addus(T)(T a, T b) pure @safe;
1419         pragma(LDC_intrinsic, "llvm.usub.sat.i#")
1420             T inteli_llvm_subus(T)(T a, T b) pure @safe;
1421 
1422         enum LDC_with_saturated_intrinsics = true;
1423     }
1424     else
1425         enum LDC_with_saturated_intrinsics = false;
1426 }
1427 else
1428     enum LDC_with_saturated_intrinsics = false;
1429 
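// Editor's illustration (not in the upstream module): these intrinsics clamp
// instead of wrapping; e.g. a signed word addition saturates at short.max.
static if (LDC_with_saturated_intrinsics)
{
    unittest
    {
        short8 a = cast(short)32000;
        short8 b = cast(short)1000;
        short8 r = inteli_llvm_adds!short8(a, b);
        assert(r.array[0] == short.max);
    }
}
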
1430 // ADDITIONAL x86 INTRINSICS
1431 // Absent from ldc.gccbuiltins_x86 for some reason, but needed.
1432 // https://github.com/ldc-developers/llvm-project/blob/ldc-release/12.x/llvm/include/llvm/IR/IntrinsicsX86.td
1433 static if (LDC_with_SSE41)
1434 {
1435     pragma(LDC_intrinsic, "llvm.x86.sse41.pblendvb")
1436         byte16 __builtin_ia32_pblendvb(byte16, byte16, byte16) pure @safe;
1437 }
1438 
1439 // SOME NEON INTRINSICS
1440 // Emulating some x86 intrinsics needs access to a range of ARM intrinsics.
1441 // They are not in the public API, but the simde project exposes them all for users.
1442 // MAYDO: create a new neon.d module, for internal use only.
1443 // MAYDO: port them to ARM32 so that ARM32 can be as fast as ARM64.
1444 static if (LDC_with_ARM64)
1445 {
1446     // VERY USEFUL LINK
1447     // https://github.com/ldc-developers/llvm-project/blob/ldc-release/11.x/llvm/include/llvm/IR/IntrinsicsAArch64.td
1448     // Also: https://developer.arm.com/architectures/instruction-sets/intrinsics/
1449 
1450     // Note: with a complex sequence of intrinsics, it is helpful to verify that the generated code is actually what you expect.
1451     // Some intrinsics have trouble when inlined inside another, such as vmovl_low_s32. In this case, it's better to use builtins
1452     // from the backend, to get an inlining that still matches the intended instruction.
1453 
1454     pragma(LDC_intrinsic, "llvm.aarch64.crc32cb")
1455         uint __crc32cb(uint a, uint b) pure @safe;
1456 
1457     pragma(LDC_intrinsic, "llvm.aarch64.crc32ch")
1458         uint __crc32ch(uint a, uint b) pure @safe;
1459 
1460     pragma(LDC_intrinsic, "llvm.aarch64.crc32cw")
1461         uint __crc32cw(uint a, uint b) pure @safe;
1462 
1463     pragma(LDC_intrinsic, "llvm.aarch64.crc32cx")
1464         uint __crc32cd(uint a, ulong b) pure @safe;
1465 
1466     //pragma(LDC_intrinsic, "llvm.aarch64.dmb")
1467     //    uint __dmb(int a) @safe; // didn't find a name in the intrinsic list
1468 
1469     pragma(LDC_intrinsic, "llvm.aarch64.neon.uabd.v16i8")
1470         byte16 vabdq_u8(byte16 a, byte16 b) pure @safe;
1471 
1472     pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v8i16")
1473         short8 vabsq_s16(short8 a) pure @safe;
1474 
1475     pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v4i32")
1476         int4 vabsq_s32(int4 a) pure @safe;
1477 
1478     pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v16i8")
1479         byte16 vabsq_s8(byte16 a) pure @safe;
1480 
1481     byte8 vand_u8(byte8 a, byte8 b) pure @safe
1482     {
1483         return a & b;
1484     }
1485 
1486     long2 vandq_s64(long2 a, long2 b)
1487     {
1488         return a & b;
1489     }
1490 
1491     long2 vbicq_s64(long2 a, long2 b) pure @safe
1492     {
1493         return a & ~b;
1494     }
1495 
1496     int4 vbslq_s32(int4 a, int4 b, int4 c) pure @safe
1497     {
1498         return c ^ ((c ^ b) & a);
1499     }
1500 
1501     byte16 vbslq_s8(byte16 a, byte16 b, byte16 c) pure @safe
1502     {
1503         return c ^ ((c ^ b) & a);
1504     }
1505 
1506     long2 vbslq_s64(long2 a, long2 b, long2 c) pure @safe
1507     {
1508         return c ^ ((c ^ b) & a);
1509     }
1510 
1511     short8 vcombine_s16(short4 lo, short4 hi) pure @trusted
1512     {
1513         short8 r;
1514         r.ptr[0]  = lo.array[0];
1515         r.ptr[1]  = lo.array[1];
1516         r.ptr[2]  = lo.array[2];
1517         r.ptr[3]  = lo.array[3];
1518         r.ptr[4]  = hi.array[0];
1519         r.ptr[5]  = hi.array[1];
1520         r.ptr[6]  = hi.array[2];
1521         r.ptr[7]  = hi.array[3];
1522         return r;
1523     }
1524 
1525     int4 vcombine_s32(int2 lo, int2 hi) pure @trusted
1526     {
1527         int4 r;
1528         r.ptr[0] = lo.array[0];
1529         r.ptr[1] = lo.array[1];
1530         r.ptr[2] = hi.array[0];
1531         r.ptr[3] = hi.array[1];
1532         return r;
1533     }
1534 
1535     byte16 vcombine_s8(byte8 lo, byte8 hi) pure @trusted
1536     {
1537         byte16 r;
1538         r.ptr[0]  = lo.array[0];
1539         r.ptr[1]  = lo.array[1];
1540         r.ptr[2]  = lo.array[2];
1541         r.ptr[3]  = lo.array[3];
1542         r.ptr[4]  = lo.array[4];
1543         r.ptr[5]  = lo.array[5];
1544         r.ptr[6]  = lo.array[6];
1545         r.ptr[7]  = lo.array[7];
1546         r.ptr[8]  = hi.array[0];
1547         r.ptr[9]  = hi.array[1];
1548         r.ptr[10] = hi.array[2];
1549         r.ptr[11] = hi.array[3];
1550         r.ptr[12] = hi.array[4];
1551         r.ptr[13] = hi.array[5];
1552         r.ptr[14] = hi.array[6];
1553         r.ptr[15] = hi.array[7];
1554         return r;
1555     }
1556 
1557     short8 vcombine_u16(short4 lo, short4 hi) pure @trusted
1558     {
1559         short8 r;
1560         r.ptr[0]  = lo.array[0];
1561         r.ptr[1]  = lo.array[1];
1562         r.ptr[2]  = lo.array[2];
1563         r.ptr[3]  = lo.array[3];
1564         r.ptr[4]  = hi.array[0];
1565         r.ptr[5]  = hi.array[1];
1566         r.ptr[6]  = hi.array[2];
1567         r.ptr[7]  = hi.array[3];
1568         return r;
1569     }
1570 
1571 
1572     // float4 => int4
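    // The suffix after fcvt encodes the rounding mode of the conversion:
    //   fcvtms = toward -infinity (floor), fcvtns = to nearest, ties to even,
    //   fcvtps = toward +infinity (ceil),  fcvtzs = toward zero (truncate).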
1573 
1574     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.v4i32.v4f32")
1575         int4 vcvtmq_s32_f32(float4 a) pure @safe;
1576 
1577     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.v4i32.v4f32")
1578         int4 vcvtnq_s32_f32(float4 a) pure @safe;
1579 
1580     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.v4i32.v4f32")
1581         int4 vcvtpq_s32_f32(float4 a) pure @safe;
1582 
1583     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.v4i32.v4f32")
1584         int4 vcvtzq_s32_f32(float4 a) pure @safe;
1585 
1586 
1587     // double2 => long2
1588 
1589     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.v2i64.v2f64")
1590         long2 vcvtmq_s64_f64(double2 a) pure @safe;
1591 
1592     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.v2i64.v2f64")
1593         long2 vcvtnq_s64_f64(double2 a) pure @safe;
1594 
1595     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.v2i64.v2f64")
1596         long2 vcvtpq_s64_f64(double2 a) pure @safe;
1597 
1598     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.v2i64.v2f64")
1599         long2 vcvtzq_s64_f64(double2 a) pure @safe;
1600 
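    // scalar float/double => int/long
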
1601     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i32.f32")
1602         int vcvtms_s32_f32(float a) pure @safe;
1603 
1604     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i32.f32")
1605         int vcvtns_s32_f32(float a) pure @safe;    
1606 
1607     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i32.f32")
1608         int vcvtps_s32_f32(float a) pure @safe;
1609 
1610     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i32.f32")
1611         int vcvts_s32_f32(float a) pure @safe;
1612      
1613     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i32.f64")
1614         int vcvtms_s32_f64(double a) pure @safe;
1615 
1616     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i32.f64")
1617         int vcvtns_s32_f64(double a) pure @safe;    
1618 
1619     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i32.f64")
1620         int vcvtps_s32_f64(double a) pure @safe;
1621 
1622     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i32.f64")
1623         int vcvts_s32_f64(double a) pure @safe;
1624 
1625     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i64.f32")
1626         long vcvtms_s64_f32(float a) pure @safe;
1627 
1628     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i64.f32")
1629         long vcvtns_s64_f32(float a) pure @safe;    
1630 
1631     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i64.f32")
1632         long vcvtps_s64_f32(float a) pure @safe;
1633 
1634     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i64.f32")
1635         long vcvts_s64_f32(float a) pure @safe;
1636 
1637     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i64.f64")
1638         long vcvtms_s64_f64(double a) pure @safe;
1639 
1640     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i64.f64")
1641         long vcvtns_s64_f64(double a) pure @safe;    
1642 
1643     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i64.f64")
1644         long vcvtps_s64_f64(double a) pure @safe; // Note: technically should be named vcvtpd_s64_f64
1645 
1646     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i64.f64")
1647         long vcvts_s64_f64(double a) pure @safe;
1648 
1649     long2 vdupq_n_s64(long value) pure @safe
1650     {
1651         long2 r;
1652         r = value;
1653         return r;
1654     }
1655 
1656     short4 vget_high_s16(short8 a) pure @trusted
1657     {
1658         short4 r;
1659         r.ptr[0] = a.array[4];
1660         r.ptr[1] = a.array[5];
1661         r.ptr[2] = a.array[6];
1662         r.ptr[3] = a.array[7];
1663         return r;
1664     }
1665 
1666     int2 vget_high_s32(int4 a) pure @trusted
1667     {
1668         int2 r;
1669         r.ptr[0] = a.array[2];
1670         r.ptr[1] = a.array[3];
1671         return r;
1672     }
1673 
1674     byte8 vget_high_u8(byte16 a) pure @trusted
1675     {
1676         byte8 r;
1677         r.ptr[0] = a.array[8];
1678         r.ptr[1] = a.array[9];
1679         r.ptr[2] = a.array[10];
1680         r.ptr[3] = a.array[11];
1681         r.ptr[4] = a.array[12];
1682         r.ptr[5] = a.array[13];
1683         r.ptr[6] = a.array[14];
1684         r.ptr[7] = a.array[15];
1685         return r;
1686     }
1687 
1688     short4 vget_low_s16(short8 a) pure @trusted
1689     {
1690         short4 r;
1691         r.ptr[0] = a.array[0];
1692         r.ptr[1] = a.array[1];
1693         r.ptr[2] = a.array[2];
1694         r.ptr[3] = a.array[3];
1695         return r;
1696     } 
1697 
1698     int2 vget_low_s32(int4 a) pure @trusted
1699     {
1700         int2 r;
1701         r.ptr[0] = a.array[0];
1702         r.ptr[1] = a.array[1];
1703         return r;
1704     }
1705 
1706     byte8 vget_low_u8(byte16 a) pure @trusted
1707     {
1708         byte8 r;
1709         r.ptr[0] = a.array[0];
1710         r.ptr[1] = a.array[1];
1711         r.ptr[2] = a.array[2];
1712         r.ptr[3] = a.array[3];
1713         r.ptr[4] = a.array[4];
1714         r.ptr[5] = a.array[5];
1715         r.ptr[6] = a.array[6];
1716         r.ptr[7] = a.array[7];
1717         return r;
1718     }
1719 
1720     long vgetq_lane_s64(long2 v, const int lane) pure @safe
1721     {
1722         return v.array[lane];
1723     }
1724 
1725     pragma(LDC_intrinsic, "llvm.aarch64.neon.smax.v8i16")
1726         short8 vmaxq_s16(short8 a, short8 b) pure @safe;
1727 
1728     int4 vmaxq_s32(int4 a, int4 b) pure @safe
1729     {
1730         int4 r;
1731         r[0] = a[0] >= b[0] ? a[0] : b[0];
1732         r[1] = a[1] >= b[1] ? a[1] : b[1];
1733         r[2] = a[2] >= b[2] ? a[2] : b[2];
1734         r[3] = a[3] >= b[3] ? a[3] : b[3];
1735         return r;
1736     }
1737 
1738     pragma(LDC_intrinsic, "llvm.aarch64.neon.smin.v8i16")
1739         short8 vminq_s16(short8 a, short8 b) pure @safe;
1740 
1741     int4 vmovl_u16(short4 a) pure @trusted
1742     {
1743         int4 r;
1744         r.ptr[0] = cast(ushort)a.array[0];
1745         r.ptr[1] = cast(ushort)a.array[1];
1746         r.ptr[2] = cast(ushort)a.array[2];
1747         r.ptr[3] = cast(ushort)a.array[3];
1748         return r;
1749     }
1750 
1751     int2 vmovn_s64(long2 a) pure @trusted
1752     {
1753         int2 r;
1754         r.ptr[0] = cast(int)(a.array[0]);
1755         r.ptr[1] = cast(int)(a.array[1]);
1756         return r;
1757     }        
1758 
1759     int4 vmull_s16(short4 a, short4 b) pure @trusted
1760     {
1761         int4 r;
1762         r.ptr[0] = a.array[0] * b.array[0];
1763         r.ptr[1] = a.array[1] * b.array[1];
1764         r.ptr[2] = a.array[2] * b.array[2];
1765         r.ptr[3] = a.array[3] * b.array[3];
1766         return r;
1767     }
1768 
1769     pragma(LDC_intrinsic, "llvm.aarch64.neon.smull.v2i64")
1770         long2 vmull_s32(int2 a, int2 b) pure @safe;
1771 
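    // addp = pairwise add: the lanes of the concatenation of a and b are summed
    // in adjacent pairs; this is the natural NEON counterpart of the SSE3
    // horizontal adds (_mm_hadd_*).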
1772     pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4i16")
1773         short4 vpadd_s16(short4 a, short4 b) pure @safe;
1774 
1775     pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v2i32")
1776         int2 vpadd_s32(int2 a, int2 b) pure @safe;
1777 
1778     pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v8i8")
1779         byte8 vpadd_u8(byte8 a, byte8 b) pure @safe;
1780 
1781     pragma(LDC_intrinsic, "llvm.aarch64.neon.uaddlp.v8i16.v16i8")
1782         short8 vpaddlq_u8 (byte16 a) pure @safe;
1783 
1784     static if (__VERSION__ >= 2088) // LDC 1.18 started using LLVM 9, which changed the name of this builtin
1785     {
1786         pragma(LDC_intrinsic, "llvm.aarch64.neon.faddp.v4f32")
1787             float4 vpaddq_f32(float4 a, float4 b) pure @safe;
1788     }
1789     else
1790     {
1791         pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4f32")
1792             float4 vpaddq_f32(float4 a, float4 b) pure @safe;
1793     }
1794     
1795     pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v8i16")
1796         short8 vpaddq_s16(short8 a, short8 b) pure @safe;
1797 
1798     pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v16i8")
1799         byte16 vpaddq_s8(byte16 a, byte16 b) pure @safe;
1800 
1801     pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4i32")
1802         int4 vpaddq_s32(int4 a, int4 b) pure @safe;
1803 
1804     pragma(LDC_intrinsic, "llvm.aarch64.neon.sqadd.v4i16")
1805         short4 vqadd_s16(short4 a, short4 b) pure @safe;
1806 
1807     pragma(LDC_intrinsic, "llvm.aarch64.neon.sqadd.v8i16")
1808         short8 vqaddq_s16(short8 a, short8 b) pure @safe;
1809 
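    // sqxtn  = signed saturating narrow   (e.g. short8 -> byte8, clamped to [-128, 127])
    // uqxtn  = unsigned saturating narrow (clamped to the unsigned range)
    // sqxtun = signed-to-unsigned saturating narrow (clamped to [0, 255]), the
    //          NEON counterpart of the SSE pack-with-unsigned-saturation ops.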
1810     pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtn.v8i8")
1811         byte8 vqmovn_s16(short8 a) pure @safe;
1812 
1813     pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtn.v4i16")
1814         short4 vqmovn_s32(int4 a) pure @safe;
1815 
1816     pragma(LDC_intrinsic, "llvm.aarch64.neon.uqxtn.v4i16")
1817         short4 vqmovn_u32(int4 a) pure @safe;
1818 
1819     pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtun.v8i8")
1820         byte8 vqmovun_s16(short8 a) pure @safe;
1821 
1822     pragma(LDC_intrinsic, "llvm.aarch64.neon.sqsub.v4i16")
1823         short4 vqsub_s16(short4 a, short4 b) pure @safe;
1824 
1825     pragma(LDC_intrinsic, "llvm.aarch64.neon.sqsub.v8i16")
1826         short8 vqsubq_s16(short8 a, short8 b) pure @safe;
1827 
1828     pragma(LDC_intrinsic, "llvm.aarch64.neon.tbl1.v16i8")
1829         byte16 vqtbl1q_s8(byte16 t, byte16 idx) pure @safe;
1830 
1831     pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v16i8")
1832         byte16 vrhadd_u8(byte16 a, byte16 b) pure @safe;
1833 
1834     pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v8i16")
1835         short8 vrhadd_u16(short8 a, short8 b) pure @safe;
1836 
1837     pragma(LDC_intrinsic, "llvm.aarch64.neon.rshrn.v4i16")
1838         short4 vrshrn_n_s32(int4 a, int n) pure @safe;        
1839 
1840     byte8 vshr_u8(byte8 a, byte8 b) pure @safe
1841     {
1842         return a >>> b;
1843     }
1844 
1845     byte16 vshrq_n_s8(byte16 a, byte r) pure @safe
1846     { 
1847         a = a >> byte16(cast(byte)r);
1848         return a;
1849     }
1850 
1851     pragma(LDC_intrinsic, "llvm.aarch64.neon.tbl1.v8i8")
1852         byte8 vtbl1_s8(byte16 t, byte8 idx) pure @safe;
1853 }
1854 
1855 version(unittest)
1856 {
1857     double abs_double(double x) @trusted
1858     {
1859         version(LDC)
1860             return llvm_fabs(x);
1861         else
1862         {
1863             long uf = *cast(long*)(&x);
1864             uf &= 0x7fffffff_ffffffff;
1865             return *cast(double*)(&uf);
1866         }
1867     }
1868 }
1869 
1870 // Needed because in old GDC versions (as used on Travis CI), core.stdc.math.isnan isn't pure.
1871 
1872 bool isnan(float x) pure @trusted
1873 {
1874     uint u = *cast(uint*)(&x);
1875     bool result = ((u & 0x7F800000) == 0x7F800000) && (u & 0x007FFFFF);
1876     return result;
1877 }
1878 unittest
1879 {
1880     float x = float.nan;
1881     assert(isnan(x));
1882 
1883     x = 0;
1884     assert(!isnan(x));
1885     
1886     x = float.infinity;
1887     assert(!isnan(x));
1888 }
1889 
1890 bool isnan(double x) pure @trusted
1891 {
1892     ulong u = *cast(ulong*)(&x);
1893     return ((u & 0x7FF00000_00000000) == 0x7FF00000_00000000) && (u & 0x000FFFFF_FFFFFFFF);
1894 }
1895 unittest
1896 {
1897     double x = double.nan;
1898     assert(isnan(x));
1899 
1900     x = 0;
1901     assert(!isnan(x));
1902     
1903     x = double.infinity;
1904     assert(!isnan(x));
1905 }