1 /** 2 * BMI2 intrinsics. 3 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#othertechs=BMI2 4 * 5 * Copyright: Copyright Johan Engelen 2021. 6 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) 7 */ 8 module inteli.bmi2intrin; 9 10 import inteli.internals; 11 12 nothrow @nogc pure @safe: 13 14 /// Copy all bits from unsigned 32-bit integer `a` to dst, and reset (set to 0) the high bits in dst starting at index. 15 uint _bzhi_u32 (uint a, uint index) 16 { 17 static if (GDC_or_LDC_with_BMI2) 18 { 19 if (!__ctfe) 20 return __builtin_ia32_bzhi_si(a, index); 21 else 22 return bzhi!uint(a, index); 23 } 24 else 25 { 26 return bzhi!uint(a, index); 27 } 28 } 29 unittest 30 { 31 static assert (_bzhi_u32(0x1234_5678, 5) == 0x18); 32 assert (_bzhi_u32(0x1234_5678, 5) == 0x18); 33 static assert (_bzhi_u32(0x1234_5678, 10) == 0x278); 34 assert (_bzhi_u32(0x1234_5678, 10) == 0x278); 35 static assert (_bzhi_u32(0x1234_5678, 21) == 0x14_5678); 36 assert (_bzhi_u32(0x1234_5678, 21) == 0x14_5678); 37 } 38 39 /// Copy all bits from unsigned 64-bit integer `a` to dst, and reset (set to 0) the high bits in dst starting at index. 40 ulong _bzhi_u64 (ulong a, uint index) 41 { 42 static if (GDC_or_LDC_with_BMI2) 43 { 44 if (!__ctfe) 45 { 46 version(X86_64) 47 { 48 // This instruction not available in 32-bit x86. 49 return __builtin_ia32_bzhi_di(a, index); 50 } 51 else 52 return bzhi!ulong(a, index); 53 } 54 else 55 return bzhi!ulong(a, index); 56 } 57 else 58 { 59 return bzhi!ulong(a, index); 60 } 61 } 62 unittest 63 { 64 static assert (_bzhi_u64(0x1234_5678, 5) == 0x18); 65 assert (_bzhi_u64(0x1234_5678, 5) == 0x18); 66 static assert (_bzhi_u64(0x1234_5678, 10) == 0x278); 67 assert (_bzhi_u64(0x1234_5678, 10) == 0x278); 68 static assert (_bzhi_u64(0x1234_5678, 21) == 0x14_5678); 69 assert (_bzhi_u64(0x1234_5678, 21) == 0x14_5678); 70 static assert (_bzhi_u64(0x8765_4321_1234_5678, 54) == 0x0025_4321_1234_5678); 71 assert (_bzhi_u64(0x8765_4321_1234_5678, 54) == 0x0025_4321_1234_5678); 72 } 73 74 // Helper function for BZHI 75 private T bzhi(T)(T a, uint index) 76 { 77 /+ 78 n := index[7:0] 79 dst := a 80 IF (n < number of bits) 81 dst[MSB:n] := 0 82 FI 83 +/ 84 enum numbits = T.sizeof*8; 85 T dst = a; 86 if (index < numbits) 87 { 88 T mask = (T(1) << index) - 1; 89 dst &= mask; 90 } 91 return dst; 92 } 93 94 /// Multiply unsigned 32-bit integers `a` and `b`, store the low 32-bits of the result in dst, 95 /// and store the high 32-bits in `hi`. This does not read or write arithmetic flags. 96 /// Note: the implementation _does_ set arithmetic flags, unlike the instruction semantics say. 97 /// But, those particular semantics don't exist at the level of intrinsics. 98 uint _mulx_u32 (uint a, uint b, uint* hi) 99 { 100 // Note: that does NOT generate mulx with LDC, and there seems to be no way to do that for 101 // some reason, even with LLVM IR. 102 // Also same with GDC. 103 ulong result = cast(ulong) a * b; 104 *hi = cast(uint) (result >>> 32); 105 return cast(uint)result; 106 } 107 @system unittest 108 { 109 uint hi; 110 assert (_mulx_u32(0x1234_5678, 0x1234_5678, &hi) == 0x1DF4_D840); 111 assert (hi == 0x014B_66DC); 112 } 113 114 /// Multiply unsigned 64-bit integers `a` and `b`, store the low 64-bits of the result in dst, and 115 /// store the high 64-bits in `hi`. This does not read or write arithmetic flags. 116 /// Note: the implementation _does_ set arithmetic flags, unlike the instruction semantics say. 117 /// But, those particular semantics don't exist at the level of intrinsics. 118 ulong _mulx_u64 (ulong a, ulong b, ulong* hi) 119 { 120 /+ 121 dst[63:0] := (a * b)[63:0] 122 MEM[hi+63:hi] := (a * b)[127:64] 123 +/ 124 125 static if (LDC_with_optimizations) 126 { 127 static if (__VERSION__ >= 2094) 128 enum bool withLDCIR = true; 129 else 130 enum bool withLDCIR = false; 131 } 132 else 133 { 134 enum bool withLDCIR = false; 135 } 136 137 static if (withLDCIR) 138 { 139 // LDC x86: Generates mulx from -O0 140 enum ir = ` 141 %4 = zext i64 %0 to i128 142 %5 = zext i64 %1 to i128 143 %6 = mul nuw i128 %5, %4 144 %7 = lshr i128 %6, 64 145 %8 = trunc i128 %7 to i64 146 store i64 %8, i64* %2, align 8 147 %9 = trunc i128 %6 to i64 148 ret i64 %9`; 149 return LDCInlineIR!(ir, ulong, ulong, ulong, ulong*)(a, b, hi); 150 } 151 else 152 { 153 /+ Straight-forward implementation with `ucent`: 154 ucent result = cast(ucent) a * b; 155 *hi = cast(ulong) ((result >>> 64) & 0xFFFF_FFFF_FFFF_FFFF); 156 return cast(ulong) (result & 0xFFFF_FFFF_FFFF_FFFF); 157 +/ 158 159 /+ 160 Implementation using 64bit math is more complex... 161 a * b = (a_high << 32 + a_low) * (b_high << 32 + b_low) 162 = (a_high << 32)*(b_high << 32) + (a_high << 32)*b_low + a_low* (b_high << 32) + a_low*b_low 163 = (a_high*b_high) << 64 + (a_high*b_low) << 32 + (a_low*b_high) << 32 + a_low*b_low 164 = c2 << 64 + c11 << 32 + c12 << 32 + c0 165 = z1 << 64 + z0 166 // The sums may overflow, so we need to carry the carry (from low 64bits to high 64bits). We can do that 167 // by separately creating the sum to get the high 32 bits of z0 using 64bit math. The high 32 bits of that 168 // intermediate result is then the 'carry' that we need to add when calculating z1's sum. 169 z0 = (c0 & 0xFFFF_FFFF) + (c0 >> 32 + c11 & 0xFFFF_FFFF + c12 & 0xFFFF_FFFF ) << 32 170 The carry part from z0's sum = (c0 >> 32 + c11 & 0xFFFF_FFFF + c12 & 0xFFFF_FFFF ) >> 32 171 z1 = c2 + (c11 >> 32 + c12 >> 32 + (c0 >> 32 + c11 & 0xFFFF_FFFF + c12 & 0xFFFF_FFFF ) >> 32 172 +/ 173 174 const ulong a_low = a & 0xFFFF_FFFF; 175 const ulong a_high = a >>> 32; 176 const ulong b_low = b & 0xFFFF_FFFF; 177 const ulong b_high = b >>> 32; 178 179 const ulong c2 = a_high*b_high; 180 const ulong c11 = a_high*b_low; 181 const ulong c12 = a_low*b_high; 182 const ulong c0 = a_low*b_low; 183 184 const ulong common_term = (c0 >> 32) + (c11 & 0xFFFF_FFFF) + (c12 & 0xFFFF_FFFF); 185 const ulong z0 = (c0 & 0xFFFF_FFFF) + (common_term << 32); 186 const ulong z1 = c2 + (c11 >> 32) + (c12 >> 32) + (common_term >> 32); 187 188 *hi = z1; 189 return z0; 190 } 191 } 192 @system unittest 193 { 194 ulong hi; 195 // 0x1234_5678_9ABC_DEF0 * 0x1234_5678_9ABC_DEF0 == 0x14b_66dc_33f6_acdc_a5e2_0890_f2a5_2100 196 assert (_mulx_u64(0x1234_5678_9ABC_DEF0, 0x1234_5678_9ABC_DEF0, &hi) == 0xa5e2_0890_f2a5_2100); 197 assert (hi == 0x14b_66dc_33f6_acdc); 198 } 199 200 /// Deposit contiguous low bits from unsigned 32-bit integer `a` to dst at the corresponding bit locations specified by `mask`; all other bits in dst are set to zero. 201 uint _pdep_u32 (uint a, uint mask) 202 { 203 static if (GDC_or_LDC_with_BMI2) 204 { 205 if (!__ctfe) 206 return __builtin_ia32_pdep_si(a, mask); 207 else 208 return pdep!uint(a, mask); 209 } 210 else 211 { 212 return pdep!uint(a, mask); 213 } 214 } 215 unittest 216 { 217 static assert (_pdep_u32(0x1234_5678, 0x0F0F_0F0F) == 0x0506_0708); 218 assert (_pdep_u32(0x1234_5678, 0x0F0F_0F0F) == 0x0506_0708); 219 } 220 221 /// Deposit contiguous low bits from unsigned 64-bit integer `a` to dst at the corresponding bit locations specified by `mask`; all other bits in dst are set to zero. 222 ulong _pdep_u64 (ulong a, ulong mask) 223 { 224 static if (GDC_or_LDC_with_BMI2) 225 { 226 if (!__ctfe) 227 { 228 version(X86_64) 229 { 230 // This instruction not available in 32-bit x86. 231 return __builtin_ia32_pdep_di(a, mask); 232 } 233 else 234 return pdep!ulong(a, mask); 235 } 236 else 237 return pdep!ulong(a, mask); 238 } 239 else 240 { 241 return pdep!ulong(a, mask); 242 } 243 } 244 unittest 245 { 246 static assert (_pdep_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x0807_0605_0403_0201); 247 assert (_pdep_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x0807_0605_0403_0201); 248 } 249 250 // Helper function for PDEP 251 private T pdep(T)(T a, T mask) 252 { 253 /+ 254 tmp := a 255 dst := 0 256 m := 0 257 k := 0 258 DO WHILE m < 32 259 IF mask[m] == 1 260 dst[m] := tmp[k] 261 k := k + 1 262 FI 263 m := m + 1 264 OD 265 +/ 266 T dst; 267 T k_bitpos = 1; 268 T m_bitpos = 1; // for each iteration, this has one bit set to 1 in the position probed 269 foreach (m; 0..T.sizeof*8) 270 { 271 if (mask & m_bitpos) 272 { 273 dst |= (a & k_bitpos) ? m_bitpos : 0; 274 k_bitpos <<= 1; 275 } 276 m_bitpos <<= 1; 277 } 278 return dst; 279 } 280 281 282 /// Extract bits from unsigned 32-bit integer `a` at the corresponding bit locations specified by 283 /// `mask` to contiguous low bits in dst; the remaining upper bits in dst are set to zero. 284 uint _pext_u32 (uint a, uint mask) 285 { 286 static if (GDC_or_LDC_with_BMI2) 287 { 288 if (!__ctfe) 289 return __builtin_ia32_pext_si(a, mask); 290 else 291 return pext!uint(a, mask); 292 } 293 else 294 { 295 return pext!uint(a, mask); 296 } 297 } 298 unittest 299 { 300 static assert (_pext_u32(0x1234_5678, 0x0F0F_0F0F) == 0x2468); 301 assert (_pext_u32(0x1234_5678, 0x0F0F_0F0F) == 0x2468); 302 } 303 304 /// Extract bits from unsigned 64-bit integer `a` at the corresponding bit locations specified by 305 /// `mask` to contiguous low bits in dst; the remaining upper bits in dst are set to zero. 306 ulong _pext_u64 (ulong a, ulong mask) 307 { 308 static if (GDC_or_LDC_with_BMI2) 309 { 310 if (!__ctfe) 311 { 312 version(X86_64) 313 { 314 // This instruction not available in 32-bit x86. 315 return __builtin_ia32_pext_di(a, mask); 316 } 317 else 318 return pext!ulong(a, mask); 319 } 320 else 321 return pext!ulong(a, mask); 322 } 323 else 324 { 325 return pext!ulong(a, mask); 326 } 327 } 328 unittest 329 { 330 static assert (_pext_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x2468_7531); 331 assert (_pext_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x2468_7531); 332 } 333 334 // Helper function for PEXT 335 private T pext(T)(T a, T mask) 336 { 337 /+ 338 tmp := a 339 dst := 0 340 m := 0 341 k := 0 342 DO WHILE m < number of bits in T 343 IF mask[m] == 1 344 dst[k] := tmp[m] 345 k := k + 1 346 FI 347 m := m + 1 348 OD 349 +/ 350 T dst; 351 T k_bitpos = 1; 352 T m_bitpos = 1; // for each iteration, this has one bit set to 1 in the position probed 353 foreach (m; 0..T.sizeof*8) 354 { 355 if (mask & m_bitpos) 356 { 357 dst |= (a & m_bitpos) ? k_bitpos : 0; 358 k_bitpos <<= 1; 359 } 360 m_bitpos <<= 1; 361 } 362 return dst; 363 }