/**
 * SSE3 intrinsics.
 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE3
 *
 * Copyright: Guillaume Piolat 2016-2020.
 *            Charles Gregory 2019.
 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
 */
module inteli.pmmintrin;

public import inteli.types;
import inteli.internals;
public import inteli.emmintrin;


// Note: this header will work whether you have SSE3 enabled or not.
// With LDC, use "dflags-ldc": ["-mattr=+sse3"] or equivalent to actively
// generate SSE3 instructions (they are often enabled with -O1 or greater).
// With GDC, use "dflags-gdc": ["-msse3"] or equivalent to generate SSE3 instructions.
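//
// For instance, a minimal dub.json enabling SSE3 code generation for both
// compilers could look like the following sketch (package name and version
// bound are illustrative, not prescribed by this module):
//
//     {
//         "name": "myapp",
//         "dependencies": { "intel-intrinsics": "~>1.0" },
//         "dflags-ldc": ["-mattr=+sse3"],
//         "dflags-gdc": ["-msse3"]
//     }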

nothrow @nogc:

/// Alternatively add and subtract packed double-precision (64-bit)
/// floating-point elements in `a` to/from packed elements in `b`.
__m128d _mm_addsub_pd (__m128d a, __m128d b) pure @trusted
{
    static if (DMD_with_DSIMD_and_SSE3)
    {
        return cast(__m128d) __simd(XMM.ADDSUBPD, cast(void16)a, cast(void16)b);
    }
    else static if (GDC_with_SSE3)
    {
        return __builtin_ia32_addsubpd(a, b);
    }
    else static if (LDC_with_SSE3)
    {
        return __builtin_ia32_addsubpd(a, b);
    }
    else
    {
        // ARM: well optimized starting with LDC 1.18.0 -O2, not disrupted by LLVM 13+
        a.ptr[0] = a.array[0] - b.array[0];
        a.ptr[1] = a.array[1] + b.array[1];
        return a;
    }
}
unittest
{
    auto v1 = _mm_setr_pd(1.0, 2.0);
    auto v2 = _mm_setr_pd(1.0, 2.0);
    assert(_mm_addsub_pd(v1, v2).array == _mm_setr_pd(0.0, 4.0).array);
}

/// Alternatively add and subtract packed single-precision (32-bit)
/// floating-point elements in `a` to/from packed elements in `b`.
__m128 _mm_addsub_ps (__m128 a, __m128 b) pure @trusted
{
    static if (DMD_with_DSIMD_and_SSE3)
    {
        return cast(__m128) __simd(XMM.ADDSUBPS, cast(void16)a, cast(void16)b);
    }
    else static if (GDC_with_SSE3)
    {
        return __builtin_ia32_addsubps(a, b);
    }
    else static if (LDC_with_SSE3)
    {
        return __builtin_ia32_addsubps(a, b);
    }
    else
    {
        a.ptr[0] -= b.array[0];
        a.ptr[1] += b.array[1];
        a.ptr[2] -= b.array[2];
        a.ptr[3] += b.array[3];
        return a;
    }
}
unittest
{
    auto v1 = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    auto v2 = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    assert(_mm_addsub_ps(v1, v2).array == _mm_setr_ps(0.0f, 4.0f, 0.0f, 8.0f).array);
}

/// Horizontally add adjacent pairs of double-precision (64-bit)
/// floating-point elements in `a` and `b`.
__m128d _mm_hadd_pd (__m128d a, __m128d b) pure @trusted
{
    // PERF: ARM64?
    static if (DMD_with_DSIMD_and_SSE3)
    {
        return cast(__m128d) __simd(XMM.HADDPD, cast(void16)a, cast(void16)b);
    }
    else static if (GDC_or_LDC_with_SSE3)
    {
        return __builtin_ia32_haddpd(a, b);
    }
    else
    {
        __m128d res;
        res.ptr[0] = a.array[1] + a.array[0];
        res.ptr[1] = b.array[1] + b.array[0];
        return res;
    }
}
unittest
{
    auto A = _mm_setr_pd(1.5, 2.0);
    auto B = _mm_setr_pd(1.0, 2.0);
    assert(_mm_hadd_pd(A, B).array == _mm_setr_pd(3.5, 3.0).array);
}

/// Horizontally add adjacent pairs of single-precision (32-bit)
/// floating-point elements in `a` and `b`.
__m128 _mm_hadd_ps (__m128 a, __m128 b) pure @trusted
{
    static if (DMD_with_DSIMD_and_SSE3)
    {
        return cast(__m128) __simd(XMM.HADDPS, cast(void16)a, cast(void16)b);
    }
    else static if (GDC_or_LDC_with_SSE3)
    {
        return __builtin_ia32_haddps(a, b);
    }
    else static if (LDC_with_ARM64)
    {
        return vpaddq_f32(a, b);
    }
    else
    {
        __m128 res;
        res.ptr[0] = a.array[1] + a.array[0];
        res.ptr[1] = a.array[3] + a.array[2];
        res.ptr[2] = b.array[1] + b.array[0];
        res.ptr[3] = b.array[3] + b.array[2];
        return res;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 5.0f);
    __m128 B = _mm_setr_ps(1.5f, 2.0f, 3.5f, 4.0f);
    assert(_mm_hadd_ps(A, B).array == _mm_setr_ps(3.0f, 8.0f, 3.5f, 7.5f).array);
}

/// Horizontally subtract adjacent pairs of double-precision (64-bit)
/// floating-point elements in `a` and `b`.
__m128d _mm_hsub_pd (__m128d a, __m128d b) pure @trusted
{
    static if (DMD_with_DSIMD_and_SSE3)
    {
        return cast(__m128d) __simd(XMM.HSUBPD, cast(void16)a, cast(void16)b);
    }
    else static if (GDC_or_LDC_with_SSE3)
    {
        return __builtin_ia32_hsubpd(a, b);
    }
    else
    {
        // Note: perhaps surprisingly, this scalar version is also optimal on ARM64.
        __m128d res;
        res.ptr[0] = a.array[0] - a.array[1];
        res.ptr[1] = b.array[0] - b.array[1];
        return res;
    }
}
unittest
{
    auto A = _mm_setr_pd(1.5, 2.0);
    auto B = _mm_setr_pd(1.0, 2.0);
    assert(_mm_hsub_pd(A, B).array == _mm_setr_pd(-0.5, -1.0).array);
}

/// Horizontally subtract adjacent pairs of single-precision (32-bit)
/// floating-point elements in `a` and `b`.
__m128 _mm_hsub_ps (__m128 a, __m128 b) pure @trusted
{
    static if (DMD_with_DSIMD_and_SSE3)
    {
        return cast(__m128) __simd(XMM.HSUBPS, cast(void16)a, cast(void16)b);
    }
    else static if (GDC_or_LDC_with_SSE3)
    {
        return __builtin_ia32_hsubps(a, b);
    }
    else static if (LDC_with_ARM64)
    {
        int4 mask = [0, 0x80000000, 0, 0x80000000];
        a = cast(__m128)(cast(int4)a ^ mask);
        b = cast(__m128)(cast(int4)b ^ mask);
        return vpaddq_f32(a, b);
    }
    else
    {
        __m128 res;
        res.ptr[0] = a.array[0] - a.array[1];
        res.ptr[1] = a.array[2] - a.array[3];
        res.ptr[2] = b.array[0] - b.array[1];
        res.ptr[3] = b.array[2] - b.array[3];
        return res;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 5.0f);
    __m128 B = _mm_setr_ps(1.5f, 2.0f, 3.5f, 4.0f);
    assert(_mm_hsub_ps(A, B).array == _mm_setr_ps(-1.0f, -2.0f, -0.5f, -0.5f).array);
}

/// Load 128 bits of integer data from unaligned memory.
// Note: LDDQU is said to have only ever been useful around 2008.
// See_also: https://stackoverflow.com/questions/38370622/a-faster-integer-sse-unalligned-load-thats-rarely-used
alias _mm_lddqu_si128 = _mm_loadu_si128;
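unittest
{
    // Added sketch: the alias must load exactly like _mm_loadu_si128.
    int[4] buf = [1, 2, -3, 4];
    __m128i A = _mm_lddqu_si128(cast(const(__m128i)*) buf.ptr);
    int[4] correct = [1, 2, -3, 4];
    assert(A.array == correct);
}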
/// Load a double-precision (64-bit) floating-point element from memory into both elements of the result.
__m128d _mm_loaddup_pd (const(double)* mem_addr) pure @trusted
{
    // Note: generates movddup since LDC 1.3 with -O1 -mattr=+sse3.
    // Same for GDC with -O1.
    double value = *mem_addr;
    __m128d res;
    res.ptr[0] = value;
    res.ptr[1] = value;
    return res;
}
unittest
{
    double a = 7.5;
    __m128d A = _mm_loaddup_pd(&a);
    double[2] correct = [7.5, 7.5];
    assert(A.array == correct);
}

/// Duplicate the low double-precision (64-bit) floating-point element from `a`.
__m128d _mm_movedup_pd (__m128d a) pure @trusted
{
    // Note: generates movddup since LDC 1.3 with -O1 -mattr=+sse3.
    // GDC also generates something efficient with -O1.
    a.ptr[1] = a.array[0];
    return a;
}
unittest
{
    __m128d A = _mm_setr_pd(7.0, 2.5);
    assert(_mm_movedup_pd(A).array == _mm_set_pd(7.0, 7.0).array);
}

/// Duplicate odd-indexed single-precision (32-bit) floating-point elements from `a`.
__m128 _mm_movehdup_ps (__m128 a) pure @trusted
{
    static if (GDC_with_SSE3)
    {
        return __builtin_ia32_movshdup(a);
    }
    else
    {
        // Generates movshdup since LDC 1.3 with -O1 -mattr=+sse3
        a.ptr[0] = a.array[1];
        a.ptr[2] = a.array[3];
        return a;
    }
}
unittest
{
    __m128 A = _mm_movehdup_ps(_mm_setr_ps(1, 2, 3, 4));
    float[4] correct = [2.0f, 2, 4, 4];
    assert(A.array == correct);
}

/// Duplicate even-indexed single-precision (32-bit) floating-point elements from `a`.
__m128 _mm_moveldup_ps (__m128 a) pure @trusted
{
    static if (GDC_with_SSE3)
    {
        return __builtin_ia32_movsldup(a);
    }
    else
    {
        // Generates movsldup since LDC 1.3 with -O1 -mattr=+sse3
        a.ptr[1] = a.array[0];
        a.ptr[3] = a.array[2];
        return a;
    }
}
unittest
{
    __m128 A = _mm_moveldup_ps(_mm_setr_ps(1, 2, 3, 4));
    float[4] correct = [1.0f, 1, 3, 3];
    assert(A.array == correct);
}
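
// Worked example (an added sketch, not part of the original API): ADDSUBPS and
// the MOVDDUP/MOVSHDUP/MOVSLDUP family were introduced largely to speed up
// complex arithmetic. The helper below multiplies two complex numbers packed
// as [re0, im0, re1, im1]; a production version would swap the lanes of `b`
// with a shuffle rather than the scalar _mm_setr_ps used here for clarity.
unittest
{
    static __m128 complexMul(__m128 a, __m128 b)
    {
        __m128 re = _mm_moveldup_ps(a);  // [a0.re, a0.re, a1.re, a1.re]
        __m128 im = _mm_movehdup_ps(a);  // [a0.im, a0.im, a1.im, a1.im]
        // b with real/imaginary parts swapped in each complex lane
        __m128 bSwap = _mm_setr_ps(b.array[1], b.array[0], b.array[3], b.array[2]);
        // (re*b) -/+ (im*bSwap) = [re*c - im*d, re*d + im*c] per complex lane
        return _mm_addsub_ps(_mm_mul_ps(re, b), _mm_mul_ps(im, bSwap));
    }

    // (1+2i)*(3+4i) == -5+10i  and  (2+1i)*(1+3i) == -1+7i
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 2.0f, 1.0f);
    __m128 B = _mm_setr_ps(3.0f, 4.0f, 1.0f, 3.0f);
    float[4] correct = [-5.0f, 10.0f, -1.0f, 7.0f];
    assert(complexMul(A, B).array == correct);
}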