The OpenD Programming Language

1 /**
2 * BMI2 intrinsics.
3 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#othertechs=BMI2
4 *
5 * Copyright: Copyright Johan Engelen 2021.
6 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
7 */
8 module inteli.bmi2intrin;
9 
10 import inteli.internals;
11 
12 nothrow @nogc pure @safe:
13 
14 /// Copy all bits from unsigned 32-bit integer `a` to dst, and reset (set to 0) the high bits in dst starting at index.
15 uint _bzhi_u32 (uint a, uint index)
16 {
17     static if (GDC_or_LDC_with_BMI2)
18     {
19         if (!__ctfe)
20             return __builtin_ia32_bzhi_si(a, index);
21         else
22             return bzhi!uint(a, index);
23     }
24     else
25     {
26         return bzhi!uint(a, index);
27     }
28 }
29 unittest
30 {
31     static assert (_bzhi_u32(0x1234_5678, 5) == 0x18);
32            assert (_bzhi_u32(0x1234_5678, 5) == 0x18);
33     static assert (_bzhi_u32(0x1234_5678, 10) == 0x278);
34            assert (_bzhi_u32(0x1234_5678, 10) == 0x278);
35     static assert (_bzhi_u32(0x1234_5678, 21) == 0x14_5678);
36            assert (_bzhi_u32(0x1234_5678, 21) == 0x14_5678);
37 }
38 
39 /// Copy all bits from unsigned 64-bit integer `a` to dst, and reset (set to 0) the high bits in dst starting at index.
40 ulong _bzhi_u64 (ulong a, uint index)
41 {
42     static if (GDC_or_LDC_with_BMI2)
43     {
44         if (!__ctfe)
45         {
46             version(X86_64)
47             {
48                 // This instruction not available in 32-bit x86.
49                 return __builtin_ia32_bzhi_di(a, index);
50             }
51             else
52                 return bzhi!ulong(a, index);
53         }
54         else
55             return bzhi!ulong(a, index);
56     }
57     else
58     {
59         return bzhi!ulong(a, index);
60     }
61 }
62 unittest
63 {
64     static assert (_bzhi_u64(0x1234_5678, 5) == 0x18);
65            assert (_bzhi_u64(0x1234_5678, 5) == 0x18);
66     static assert (_bzhi_u64(0x1234_5678, 10) == 0x278);
67            assert (_bzhi_u64(0x1234_5678, 10) == 0x278);
68     static assert (_bzhi_u64(0x1234_5678, 21) == 0x14_5678);
69            assert (_bzhi_u64(0x1234_5678, 21) == 0x14_5678);
70     static assert (_bzhi_u64(0x8765_4321_1234_5678, 54) == 0x0025_4321_1234_5678);
71            assert (_bzhi_u64(0x8765_4321_1234_5678, 54) == 0x0025_4321_1234_5678);
72 }
73 
74 // Helper function for BZHI
75 private T bzhi(T)(T a, uint index)
76 {
77     /+
78         n := index[7:0]
79         dst := a
80         IF (n < number of bits)
81             dst[MSB:n] := 0
82         FI
83     +/
84     enum numbits = T.sizeof*8;
85     T dst = a;
86     if (index < numbits)
87     {
88         T mask = (T(1) << index) - 1;
89         dst &= mask;
90     }
91     return dst;
92 }
93 
94 /// Multiply unsigned 32-bit integers `a` and `b`, store the low 32-bits of the result in dst, 
95 /// and store the high 32-bits in `hi`. This does not read or write arithmetic flags.
96 /// Note: the implementation _does_ set arithmetic flags, unlike the instruction semantics say.
97 ///       But, those particular semantics don't exist at the level of intrinsics.
98 uint _mulx_u32 (uint a, uint b, uint* hi)
99 {
100     // Note: that does NOT generate mulx with LDC, and there seems to be no way to do that for
101     // some reason, even with LLVM IR.
102     // Also same with GDC.
103     ulong result = cast(ulong) a * b;
104     *hi = cast(uint) (result >>> 32);
105     return cast(uint)result;
106 }
107 @system unittest
108 {
109     uint hi;
110     assert (_mulx_u32(0x1234_5678, 0x1234_5678, &hi) == 0x1DF4_D840);
111     assert (hi == 0x014B_66DC);
112 }
113 
114 /// Multiply unsigned 64-bit integers `a` and `b`, store the low 64-bits of the result in dst, and 
115 /// store the high 64-bits in `hi`. This does not read or write arithmetic flags.
116 /// Note: the implementation _does_ set arithmetic flags, unlike the instruction semantics say.
117 ///       But, those particular semantics don't exist at the level of intrinsics.
118 ulong _mulx_u64 (ulong a, ulong b, ulong* hi)
119 {
120     /+
121         dst[63:0] := (a * b)[63:0]
122         MEM[hi+63:hi]  := (a * b)[127:64]
123     +/
124 
125     static if (LDC_with_optimizations)
126     {
127         static if (__VERSION__ >= 2094)
128             enum bool withLDCIR = true;
129         else
130             enum bool withLDCIR = false;
131     }
132     else
133     {
134         enum bool withLDCIR = false;
135     }
136 
137     static if (withLDCIR)
138     {
139         // LDC x86: Generates mulx from -O0
140         enum ir = `
141             %4 = zext i64 %0 to i128
142             %5 = zext i64 %1 to i128
143             %6 = mul nuw i128 %5, %4
144             %7 = lshr i128 %6, 64
145             %8 = trunc i128 %7 to i64
146             store i64 %8, i64* %2, align 8
147             %9 = trunc i128 %6 to i64
148             ret i64 %9`;
149         return LDCInlineIR!(ir, ulong, ulong, ulong, ulong*)(a, b, hi);
150     }
151     else
152     {
153         /+ Straight-forward implementation with `ucent`:
154         ucent result = cast(ucent) a * b;
155         *hi = cast(ulong) ((result >>> 64) & 0xFFFF_FFFF_FFFF_FFFF);
156         return cast(ulong) (result & 0xFFFF_FFFF_FFFF_FFFF);
157         +/
158 
159         /+
160             Implementation using 64bit math is more complex...
161             a * b = (a_high << 32 + a_low) * (b_high << 32 + b_low)
162                   = (a_high << 32)*(b_high << 32) + (a_high << 32)*b_low + a_low* (b_high << 32) + a_low*b_low
163                   = (a_high*b_high) << 64 + (a_high*b_low) << 32 + (a_low*b_high) << 32 + a_low*b_low
164                   = c2 << 64 + c11 << 32 + c12 << 32 + c0
165                   = z1 << 64  +  z0
166         // The sums may overflow, so we need to carry the carry (from low 64bits to high 64bits). We can do that
167         // by separately creating the sum to get the high 32 bits of z0 using 64bit math. The high 32 bits of that
168         // intermediate result is then the 'carry' that we need to add when calculating z1's sum.
169             z0 = (c0 & 0xFFFF_FFFF) + (c0 >> 32 + c11 & 0xFFFF_FFFF + c12 & 0xFFFF_FFFF ) << 32
170         The carry part from z0's sum = (c0 >> 32 + c11 & 0xFFFF_FFFF + c12 & 0xFFFF_FFFF ) >> 32
171             z1 = c2 + (c11 >> 32 + c12 >> 32 + (c0 >> 32 + c11 & 0xFFFF_FFFF + c12 & 0xFFFF_FFFF ) >> 32
172         +/
173 
174         const ulong a_low = a & 0xFFFF_FFFF;
175         const ulong a_high = a >>> 32;
176         const ulong b_low = b & 0xFFFF_FFFF;
177         const ulong b_high = b >>> 32;
178 
179         const ulong c2 = a_high*b_high;
180         const ulong c11 = a_high*b_low;
181         const ulong c12 = a_low*b_high;
182         const ulong c0 = a_low*b_low;
183 
184         const ulong common_term = (c0 >> 32) + (c11 & 0xFFFF_FFFF) + (c12 & 0xFFFF_FFFF);
185         const ulong z0 = (c0 & 0xFFFF_FFFF) + (common_term << 32);
186         const ulong z1 = c2 + (c11 >> 32) + (c12 >> 32) + (common_term >> 32);
187 
188         *hi = z1;
189         return z0;
190     }
191 }
192 @system unittest
193 {
194     ulong hi;
195     // 0x1234_5678_9ABC_DEF0 * 0x1234_5678_9ABC_DEF0 == 0x14b_66dc_33f6_acdc_a5e2_0890_f2a5_2100
196     assert (_mulx_u64(0x1234_5678_9ABC_DEF0, 0x1234_5678_9ABC_DEF0, &hi) == 0xa5e2_0890_f2a5_2100);
197     assert (hi == 0x14b_66dc_33f6_acdc);
198 }
199 
200 /// Deposit contiguous low bits from unsigned 32-bit integer `a` to dst at the corresponding bit locations specified by `mask`; all other bits in dst are set to zero.
201 uint _pdep_u32 (uint a, uint mask)
202 {
203     static if (GDC_or_LDC_with_BMI2)
204     {
205         if (!__ctfe)
206             return __builtin_ia32_pdep_si(a, mask);
207         else
208             return pdep!uint(a, mask);
209     }
210     else
211     {
212         return pdep!uint(a, mask);
213     }
214 }
215 unittest
216 {
217     static assert (_pdep_u32(0x1234_5678, 0x0F0F_0F0F) == 0x0506_0708);
218            assert (_pdep_u32(0x1234_5678, 0x0F0F_0F0F) == 0x0506_0708);
219 }
220 
221 /// Deposit contiguous low bits from unsigned 64-bit integer `a` to dst at the corresponding bit locations specified by `mask`; all other bits in dst are set to zero.
222 ulong _pdep_u64 (ulong a, ulong mask)
223 {
224     static if (GDC_or_LDC_with_BMI2)
225     {
226         if (!__ctfe)
227         {
228             version(X86_64)
229             {
230                 // This instruction not available in 32-bit x86.
231                 return __builtin_ia32_pdep_di(a, mask);
232             }
233             else
234                 return pdep!ulong(a, mask);
235         }
236         else
237             return pdep!ulong(a, mask);
238     }
239     else
240     {
241         return pdep!ulong(a, mask);
242     }
243 }
244 unittest
245 {
246     static assert (_pdep_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x0807_0605_0403_0201);
247            assert (_pdep_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x0807_0605_0403_0201);
248 }
249 
250 // Helper function for PDEP
251 private T pdep(T)(T a, T mask)
252 {
253     /+
254         tmp := a
255         dst := 0
256         m := 0
257         k := 0
258         DO WHILE m < 32
259             IF mask[m] == 1
260                 dst[m] := tmp[k]
261                 k := k + 1
262             FI
263             m := m + 1
264         OD
265     +/
266     T dst;
267     T k_bitpos = 1;
268     T m_bitpos = 1; // for each iteration, this has one bit set to 1 in the position probed
269     foreach (m; 0..T.sizeof*8)
270     {
271         if (mask & m_bitpos)
272         {
273             dst |= (a & k_bitpos) ? m_bitpos : 0;
274             k_bitpos <<= 1;
275         }
276         m_bitpos <<= 1;
277     }
278     return dst;
279 }
280 
281 
282 /// Extract bits from unsigned 32-bit integer `a` at the corresponding bit locations specified by 
283 /// `mask` to contiguous low bits in dst; the remaining upper bits in dst are set to zero.
284 uint _pext_u32 (uint a, uint mask)
285 {
286     static if (GDC_or_LDC_with_BMI2)
287     {
288         if (!__ctfe)
289             return __builtin_ia32_pext_si(a, mask);
290         else
291             return pext!uint(a, mask);
292     }
293     else
294     {
295         return pext!uint(a, mask);
296     }
297 }
298 unittest
299 {
300     static assert (_pext_u32(0x1234_5678, 0x0F0F_0F0F) == 0x2468);
301            assert (_pext_u32(0x1234_5678, 0x0F0F_0F0F) == 0x2468);
302 }
303 
304 /// Extract bits from unsigned 64-bit integer `a` at the corresponding bit locations specified by 
305 /// `mask` to contiguous low bits in dst; the remaining upper bits in dst are set to zero.
306 ulong _pext_u64 (ulong a, ulong mask)
307 {
308     static if (GDC_or_LDC_with_BMI2)
309     {
310         if (!__ctfe)
311         {
312             version(X86_64)
313             {
314                 // This instruction not available in 32-bit x86.
315                 return __builtin_ia32_pext_di(a, mask);
316             }
317             else
318                 return pext!ulong(a, mask);
319         }
320         else
321             return pext!ulong(a, mask);
322     }
323     else
324     {
325         return pext!ulong(a, mask);
326     }
327 }
328 unittest
329 {
330     static assert (_pext_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x2468_7531);
331            assert (_pext_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x2468_7531);
332 }
333 
334 // Helper function for PEXT
335 private T pext(T)(T a, T mask)
336 {
337     /+
338         tmp := a
339         dst := 0
340         m := 0
341         k := 0
342         DO WHILE m < number of bits in T
343             IF mask[m] == 1
344                 dst[k] := tmp[m]
345                 k := k + 1
346             FI
347             m := m + 1
348         OD
349     +/
350     T dst;
351     T k_bitpos = 1;
352     T m_bitpos = 1; // for each iteration, this has one bit set to 1 in the position probed
353     foreach (m; 0..T.sizeof*8)
354     {
355         if (mask & m_bitpos)
356         {
357             dst |= (a & m_bitpos) ? k_bitpos : 0;
358             k_bitpos <<= 1;
359         }
360         m_bitpos <<= 1;
361     }
362     return dst;
363 }