The OpenD Programming Language

1 /**
2  * This module contains a collection of bit-level operations.
3  *
4  * Copyright: Copyright Don Clugston 2005 - 2013.
5  * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
6  * Authors:   Don Clugston, Sean Kelly, Walter Bright, Alex Rønne Petersen, Thomas Stuart Bockman
7  * Source:    $(DRUNTIMESRC core/_bitop.d)
8  *
9  * Some of the LDC-specific parts came »From GDC ... public domain!«
10  */
11 
12 module core.bitop;
13 
14 nothrow:
15 @safe:
16 @nogc:
17 
18 version (LDC)
19 {
20     import ldc.intrinsics;
21     // Do not use the DMD inline assembler.
22 }
23 else version (D_InlineAsm_X86_64)
24     version = AsmX86;
25 else version (D_InlineAsm_X86)
26     version = AsmX86;
27 
28 version (X86_64)
29     version = AnyX86;
30 else version (X86)
31     version = AnyX86;
32 
33 // Use to implement 64-bit bitops on 32-bit arch.
34 private union Split64
35 {
36     ulong u64;
37     struct
38     {
39         version (LittleEndian)
40         {
41             uint lo;
42             uint hi;
43         }
44         else
45         {
46             uint hi;
47             uint lo;
48         }
49     }
50 
51     pragma(inline, true)
52     this(ulong u64) @safe pure nothrow @nogc
53     {
54         if (__ctfe)
55         {
56             lo = cast(uint) u64;
57             hi = cast(uint) (u64 >>> 32);
58         }
59         else
60             this.u64 = u64;
61     }
62 }
63 
64 unittest
65 {
66     const rt = Split64(1);
67     assert((rt.lo == 1) && (rt.hi == 0));
68 
69     enum ct = Split64(1);
70     assert((ct.lo == rt.lo) && (ct.hi == rt.hi));
71 }
72 
73 /**
74  * Scans the bits in v starting with bit 0, looking
75  * for the first set bit.
76  * Returns:
77  *      The bit number of the first bit set.
78  *      The return value is undefined if v is zero.
79  */
80 pragma(inline, true) // LDC
81 int bsf(uint v) pure
82 {
83   version (LDC)
84   {
85     if (!__ctfe)
86         return cast(int) llvm_cttz(v, true);
87   }
88   else
89   {
90     pragma(inline, false);  // so intrinsic detection will work
91   }
92     return softBsf!uint(v);
93 }
94 
95 /// ditto
96 pragma(inline, true) // LDC
97 int bsf(ulong v) pure
98 {
99     static if (size_t.sizeof == ulong.sizeof)  // 64 bit code gen
100     {
101       version (LDC)
102       {
103         if (!__ctfe)
104             return cast(int) llvm_cttz(v, true);
105       }
106       else
107       {
108         pragma(inline, false);   // so intrinsic detection will work
109       }
110         return softBsf!ulong(v);
111     }
112     else
113     {
114         /* intrinsic not available for 32 bit code,
115          * make do with 32 bit bsf
116          */
117         const sv = Split64(v);
118         return (sv.lo == 0)?
119             bsf(sv.hi) + 32 :
120             bsf(sv.lo);
121     }
122 }
123 
124 ///
125 unittest
126 {
127     assert(bsf(0x21) == 0);
128     assert(bsf(ulong.max << 39) == 39);
129 }
130 
131 unittest
132 {
133     // Make sure bsf() is available at CTFE
134     enum test_ctfe = bsf(ulong.max);
135     assert(test_ctfe == 0);
136 }
137 
138 /**
139  * Scans the bits in v from the most significant bit
140  * to the least significant bit, looking
141  * for the first set bit.
142  * Returns:
143  *      The bit number of the first bit set.
144  *      The return value is undefined if v is zero.
145  */
146 pragma(inline, true) // LDC
147 int bsr(uint v) pure
148 {
149   version (LDC)
150   {
151     if (!__ctfe)
152         return cast(int) (typeof(v).sizeof * 8 - 1 - llvm_ctlz(v, true));
153   }
154   else
155   {
156     pragma(inline, false);  // so intrinsic detection will work
157   }
158     return softBsr!uint(v);
159 }
160 
161 /// ditto
162 pragma(inline, true) // LDC
163 int bsr(ulong v) pure
164 {
165     static if (size_t.sizeof == ulong.sizeof)  // 64 bit code gen
166     {
167       version (LDC)
168       {
169         if (!__ctfe)
170             return cast(int) (size_t.sizeof * 8 - 1 - llvm_ctlz(v, true));
171       }
172       else
173       {
174         pragma(inline, false);   // so intrinsic detection will work
175       }
176         return softBsr!ulong(v);
177     }
178     else
179     {
180         /* intrinsic not available for 32 bit code,
181          * make do with 32 bit bsr
182          */
183         const sv = Split64(v);
184         return (sv.hi == 0)?
185             bsr(sv.lo) :
186             bsr(sv.hi) + 32;
187     }
188 }
189 
190 ///
191 unittest
192 {
193     assert(bsr(0x21) == 5);
194     assert(bsr((ulong.max >> 15) - 1) == 48);
195 }
196 
197 unittest
198 {
199     // Make sure bsr() is available at CTFE
200     enum test_ctfe = bsr(ulong.max);
201     assert(test_ctfe == 63);
202 }
203 
204 private alias softBsf(N) = softScan!(N, true);
205 private alias softBsr(N) = softScan!(N, false);
206 
207 /* Shared software fallback implementation for bit scan foward and reverse.
208 
209 If forward is true, bsf is computed (the index of the first set bit).
210 If forward is false, bsr is computed (the index of the last set bit).
211 
212 -1 is returned if no bits are set (v == 0).
213 */
214 private int softScan(N, bool forward)(N v) pure
215     if (is(N == uint) || is(N == ulong))
216 {
217     // bsf() and bsr() are officially undefined for v == 0.
218     if (!v)
219         return -1;
220 
221     // This is essentially an unrolled binary search:
222     enum mask(ulong lo) = forward ? cast(N) lo : cast(N)~lo;
223     enum inc(int up) = forward ? up : -up;
224 
225     N x;
226     int ret;
227     static if (is(N == ulong))
228     {
229         x = v & mask!0x0000_0000_FFFF_FFFFL;
230         if (x)
231         {
232             v = x;
233             ret = forward ? 0 : 63;
234         }
235         else
236             ret = forward ? 32 : 31;
237 
238         x = v & mask!0x0000_FFFF_0000_FFFFL;
239         if (x)
240             v = x;
241         else
242             ret += inc!16;
243     }
244     else static if (is(N == uint))
245     {
246         x = v & mask!0x0000_FFFF;
247         if (x)
248         {
249             v = x;
250             ret = forward ? 0 : 31;
251         }
252         else
253             ret = forward ? 16 : 15;
254     }
255     else
256         static assert(false);
257 
258     x = v & mask!0x00FF_00FF_00FF_00FFL;
259     if (x)
260         v = x;
261     else
262         ret += inc!8;
263 
264     x = v & mask!0x0F0F_0F0F_0F0F_0F0FL;
265     if (x)
266         v = x;
267     else
268         ret += inc!4;
269 
270     x = v & mask!0x3333_3333_3333_3333L;
271     if (x)
272         v = x;
273     else
274         ret += inc!2;
275 
276     x = v & mask!0x5555_5555_5555_5555L;
277     if (!x)
278         ret += inc!1;
279 
280     return ret;
281 }
282 
283 unittest
284 {
285     assert(softBsf!uint(0u) == -1);
286     assert(softBsr!uint(0u) == -1);
287     assert(softBsf!ulong(0uL) == -1);
288     assert(softBsr!ulong(0uL) == -1);
289 
290     assert(softBsf!uint(0x0031_A000) == 13);
291     assert(softBsr!uint(0x0031_A000) == 21);
292     assert(softBsf!ulong(0x0000_0001_8000_0000L) == 31);
293     assert(softBsr!ulong(0x0000_0001_8000_0000L) == 32);
294 
295     foreach (b; 0 .. 64)
296     {
297         if (b < 32)
298         {
299             assert(softBsf!uint(1u << b) == b);
300             assert(softBsr!uint(1u << b) == b);
301         }
302 
303         assert(softBsf!ulong(1uL << b) == b);
304         assert(softBsr!ulong(1uL << b) == b);
305     }
306 }
307 
308 /**
309  * Tests the bit.
310  * (No longer an intrisic - the compiler recognizes the patterns
311  * in the body.)
312  */
313 version (none)
314 {
315     // Our implementation returns an arbitrary non-zero value if the bit was
316     // set, which is not what std.bitmanip expects.
317     pragma(LDC_intrinsic, "ldc.bitop.bt")
318         int bt(const scope size_t* p, size_t bitnum) pure @system;
319 }
320 else
321 pragma(inline, true) // LDC
322 int bt(const scope size_t* p, size_t bitnum) pure @system
323 {
324     static if (size_t.sizeof == 8)
325         return ((p[bitnum >> 6] & (1L << (bitnum & 63)))) != 0;
326     else static if (size_t.sizeof == 4)
327         return ((p[bitnum >> 5] & (1  << (bitnum & 31)))) != 0;
328     else
329         static assert(0);
330 }
331 ///
332 @system pure unittest
333 {
334     size_t[2] array;
335 
336     array[0] = 2;
337     array[1] = 0x100;
338 
339     assert(bt(array.ptr, 1));
340     assert(array[0] == 2);
341     assert(array[1] == 0x100);
342 }
343 
344 
345 version (LDC)
346 {
347     pragma(LDC_intrinsic, "ldc.bitop.bts")
348         private int __bts(size_t* p, size_t bitnum) pure @system;
349     pragma(LDC_intrinsic, "ldc.bitop.btc")
350         private int __btc(size_t* p, size_t bitnum) pure @system;
351     pragma(LDC_intrinsic, "ldc.bitop.btr")
352         private int __btr(size_t* p, size_t bitnum) pure @system;
353 }
354 
355 private
356 int softBtx(string op)(size_t* p, size_t bitnum) pure @system
357 {
358     size_t indexIntoArray = bitnum / (size_t.sizeof*8);
359     size_t bitmask = size_t(1) << (bitnum & ((size_t.sizeof*8) - 1));
360     size_t original = p[indexIntoArray];
361     mixin("p[indexIntoArray] = original " ~ op ~ " bitmask;");
362     return (original&bitmask) > 0 ? true : false;
363 }
364 
365 /**
366  * Tests and complements the bit.
367  */
368 pragma(inline, true) // LDC
369 int btc(size_t* p, size_t bitnum) pure @system
370 {
371     version (LDC)
372     {
373         if (!__ctfe)
374             return __btc(p, bitnum);
375     }
376     else
377     {
378         pragma(inline, false);  // such that DMD intrinsic detection will work
379     }
380     return softBtx!"^"(p, bitnum);
381 }
382 
383 
384 /**
385  * Tests and resets (sets to 0) the bit.
386  */
387 pragma(inline, true) // LDC
388 int btr(size_t* p, size_t bitnum) pure @system
389 {
390     version (LDC)
391     {
392         if (!__ctfe)
393             return __btr(p, bitnum);
394     }
395     else
396     {
397         pragma(inline, false);  // such that DMD intrinsic detection will work
398     }
399     return softBtx!"& ~"(p, bitnum);
400 }
401 
402 
403 /**
404  * Tests and sets the bit.
405  * Params:
406  * p = a non-NULL pointer to an array of size_ts.
407  * bitnum = a bit number, starting with bit 0 of p[0],
408  * and progressing. It addresses bits like the expression:
409 ---
410 p[index / (size_t.sizeof*8)] & (1 << (index & ((size_t.sizeof*8) - 1)))
411 ---
412  * Returns:
413  *      A non-zero value if the bit was set, and a zero
414  *      if it was clear.
415  */
416 pragma(inline, true) // LDC
417 int bts(size_t* p, size_t bitnum) pure @system
418 {
419     version (LDC)
420     {
421         if (!__ctfe)
422             return __bts(p, bitnum);
423     }
424     else
425     {
426         pragma(inline, false);  // such that DMD intrinsic detection will work
427     }
428     return softBtx!"|"(p, bitnum);
429 }
430 
431 ///
432 @system pure unittest
433 {
434     @system pure bool testbitset()
435     {
436         size_t[2] array;
437 
438         array[0] = 2;
439         array[1] = 0x100;
440 
441         assert(btc(array.ptr, 35) == 0);
442         if (size_t.sizeof == 8)
443         {
444             assert(array[0] == 0x8_0000_0002);
445             assert(array[1] == 0x100);
446         }
447         else
448         {
449             assert(array[0] == 2);
450             assert(array[1] == 0x108);
451         }
452 
453         assert(btc(array.ptr, 35));
454         assert(array[0] == 2);
455         assert(array[1] == 0x100);
456 
457         assert(bts(array.ptr, 35) == 0);
458         if (size_t.sizeof == 8)
459         {
460             assert(array[0] == 0x8_0000_0002);
461             assert(array[1] == 0x100);
462         }
463         else
464         {
465             assert(array[0] == 2);
466             assert(array[1] == 0x108);
467         }
468 
469         assert(btr(array.ptr, 35));
470         assert(array[0] == 2);
471         assert(array[1] == 0x100);
472 
473         return true;
474     }
475 
476     enum b = testbitset(); // CTFE test
477     testbitset(); // runtime test
478 }
479 
480 /**
481  * Range over bit set. Each element is the bit number that is set.
482  *
483  * This is more efficient than testing each bit in a sparsely populated bit
484  * set. Note that the first bit in the bit set would be bit 0.
485  */
486 struct BitRange
487 {
488     /// Number of bits in each size_t
489     enum bitsPerWord = size_t.sizeof * 8;
490 
491     private
492     {
493         const(size_t)*bits; // points at next word of bits to check
494         size_t cur; // needed to allow searching bits using bsf
495         size_t idx; // index of current set bit
496         size_t len; // number of bits in the bit set.
497     }
498     @nogc nothrow pure:
499 
500     /**
501      * Construct a BitRange.
502      *
503      * Params:
504      *   bitarr = The array of bits to iterate over
505      *   numBits = The total number of valid bits in the given bit array
506      */
507     this(const(size_t)* bitarr, size_t numBits) @system
508     {
509         bits = bitarr;
510         len = numBits;
511         if (len)
512         {
513             // prime the first bit
514             cur = *bits++ ^ 1;
515             popFront();
516         }
517     }
518 
519     /// Range functions
520     size_t front()
521     {
522         assert(!empty);
523         return idx;
524     }
525 
526     /// ditto
527     bool empty() const
528     {
529         return idx >= len;
530     }
531 
532     /// ditto
533     void popFront() @system
534     {
535         // clear the current bit
536         auto curbit = idx % bitsPerWord;
537         cur ^= size_t(1) << curbit;
538         if (!cur)
539         {
540             // find next size_t with set bit
541             idx -= curbit;
542             while (!cur)
543             {
544                 if ((idx += bitsPerWord) >= len)
545                     // now empty
546                     return;
547                 cur = *bits++;
548             }
549             idx += bsf(cur);
550         }
551         else
552         {
553             idx += bsf(cur) - curbit;
554         }
555     }
556 }
557 
558 ///
559 @system unittest
560 {
561     import core.stdc.stdlib : malloc, free;
562     import core.stdc.string : memset;
563 
564     // initialize a bit array
565     enum nBytes = (100 + BitRange.bitsPerWord - 1) / 8;
566     size_t *bitArr = cast(size_t *)malloc(nBytes);
567     scope(exit) free(bitArr);
568     memset(bitArr, 0, nBytes);
569 
570     // set some bits
571     bts(bitArr, 48);
572     bts(bitArr, 24);
573     bts(bitArr, 95);
574     bts(bitArr, 78);
575 
576     enum sum = 48 + 24 + 95 + 78;
577 
578     // iterate
579     size_t testSum;
580     size_t nBits;
581     foreach (b; BitRange(bitArr, 100))
582     {
583         testSum += b;
584         ++nBits;
585     }
586 
587     assert(testSum == sum);
588     assert(nBits == 4);
589 }
590 
591 @system unittest
592 {
593     void testIt(size_t numBits, size_t[] bitsToTest...)
594     {
595         import core.stdc.stdlib : malloc, free;
596         import core.stdc.string : memset;
597         immutable numBytes = (numBits + size_t.sizeof * 8 - 1) / 8;
598         size_t* bitArr = cast(size_t *)malloc(numBytes);
599         scope(exit) free(bitArr);
600         memset(bitArr, 0, numBytes);
601         foreach (b; bitsToTest)
602             bts(bitArr, b);
603         auto br = BitRange(bitArr, numBits);
604         foreach (b; bitsToTest)
605         {
606             assert(!br.empty);
607             assert(b == br.front);
608             br.popFront();
609         }
610         assert(br.empty);
611     }
612 
613     testIt(100, 0, 1, 31, 63, 85);
614     testIt(100, 6, 45, 89, 92, 99);
615 }
616 
617 /**
618  * Swaps bytes in a 2 byte ushort.
619  * Params:
620  *      x = value
621  * Returns:
622  *      `x` with bytes swapped
623  */
624 version (LDC)
625 {
626     alias byteswap = llvm_bswap!ushort;
627 }
628 else
629 pragma(inline, false)
630 ushort byteswap(ushort x) pure
631 {
632     /* Calling it bswap(ushort) would break existing code that calls bswap(uint).
633      *
634      * This pattern is meant to be recognized by the dmd code generator.
635      * Don't change it without checking that an XCH instruction is still
636      * used to implement it.
637      * Inlining may also throw it off.
638      */
639     return cast(ushort) (((x >> 8) & 0xFF) | ((x << 8) & 0xFF00u));
640 }
641 
642 ///
643 unittest
644 {
645     assert(byteswap(cast(ushort)0xF234) == 0x34F2);
646     static ushort xx = 0xF234;
647     assert(byteswap(xx) == 0x34F2);
648 }
649 
650 /**
651  * Swaps bytes in a 4 byte uint end-to-end, i.e. byte 0 becomes
652  * byte 3, byte 1 becomes byte 2, byte 2 becomes byte 1, byte 3
653  * becomes byte 0.
654  */
655 version (LDC)
656 {
657     alias bswap = llvm_bswap!uint;
658 }
659 else
660 uint bswap(uint v) pure;
661 
662 ///
663 unittest
664 {
665     assert(bswap(0x01020304u) == 0x04030201u);
666     static uint xx = 0x10203040u;
667     assert(bswap(xx) == 0x40302010u);
668 }
669 
670 /**
671  * Swaps bytes in an 8 byte ulong end-to-end, i.e. byte 0 becomes
672  * byte 7, byte 1 becomes byte 6, etc.
673  * This is meant to be recognized by the compiler as an intrinsic.
674  */
675 version (LDC)
676 {
677     alias bswap = llvm_bswap!ulong;
678 }
679 else
680 ulong bswap(ulong v) pure;
681 
682 ///
683 unittest
684 {
685     assert(bswap(0x01020304_05060708uL) == 0x08070605_04030201uL);
686     static ulong xx = 0x10203040_50607080uL;
687     assert(bswap(xx) == 0x80706050_40302010uL);
688 }
689 
690 version (DigitalMars) version (AnyX86) @system // not pure
691 {
692     /**
693      * Reads I/O port at port_address.
694      */
695     ubyte inp(uint port_address);
696 
697 
698     /**
699      * ditto
700      */
701     ushort inpw(uint port_address);
702 
703 
704     /**
705      * ditto
706      */
707     uint inpl(uint port_address);
708 
709 
710     /**
711      * Writes and returns value to I/O port at port_address.
712      */
713     ubyte outp(uint port_address, ubyte value);
714 
715 
716     /**
717      * ditto
718      */
719     ushort outpw(uint port_address, ushort value);
720 
721 
722     /**
723      * ditto
724      */
725     uint outpl(uint port_address, uint value);
726 }
727 version (LDC) @system // not pure
728 {
729     /**
730      * Reads I/O port at port_address.
731      */
732     uint inp(uint port_address)
733     {
734         assert(false, "inp not yet implemented for LDC.");
735     }
736 
737 
738     /**
739      * ditto
740      */
741     uint inpw(uint port_address)
742     {
743         assert(false, "inpw not yet implemented for LDC.");
744     }
745 
746 
747     /**
748      * ditto
749      */
750     uint inpl(uint port_address)
751     {
752         assert(false, "inpl not yet implemented for LDC.");
753     }
754 
755 
756     /**
757      * Writes and returns value to I/O port at port_address.
758      */
759     ubyte outp(uint port_address, uint value)
760     {
761         assert(false, "outp not yet implemented for LDC.");
762     }
763 
764 
765     /**
766      * ditto
767      */
768     ushort outpw(uint port_address, uint value)
769     {
770         assert(false, "outpw not yet implemented for LDC.");
771     }
772 
773 
774     /**
775      * ditto
776      */
777     uint outpl(uint port_address, uint value)
778     {
779         assert(false, "outpl not yet implemented for LDC.");
780     }
781 }
782 
783 version (LDC)
784 {
785     alias _popcnt = llvm_ctpop!ushort;
786 
787     pragma(inline, true):
788 
789     int _popcnt(uint x) pure
790     {
791         return cast(int) llvm_ctpop(x);
792     }
793 
794     int _popcnt(ulong x) pure
795     {
796         return cast(int) llvm_ctpop(x);
797     }
798 }
799 
800 
801 /**
802  *  Calculates the number of set bits in an integer.
803  */
804 pragma(inline, true) // LDC
805 int popcnt(uint x) pure
806 {
807     // Select the fastest method depending on the compiler and CPU architecture
808     version (LDC)
809     {
810         if (!__ctfe)
811             return _popcnt(x);
812     }
813     else version (DigitalMars)
814     {
815         static if (is(typeof(_popcnt(uint.max))))
816         {
817             import core.cpuid;
818             if (!__ctfe && hasPopcnt)
819                 return _popcnt(x);
820         }
821     }
822 
823     return softPopcnt!uint(x);
824 }
825 
826 unittest
827 {
828     assert( popcnt( 0 ) == 0 );
829     assert( popcnt( 7 ) == 3 );
830     assert( popcnt( 0xAA )== 4 );
831     assert( popcnt( 0x8421_1248 ) == 8 );
832     assert( popcnt( 0xFFFF_FFFF ) == 32 );
833     assert( popcnt( 0xCCCC_CCCC ) == 16 );
834     assert( popcnt( 0x7777_7777 ) == 24 );
835 
836     // Make sure popcnt() is available at CTFE
837     enum test_ctfe = popcnt(uint.max);
838     assert(test_ctfe == 32);
839 }
840 
841 /// ditto
842 pragma(inline, true) // LDC
843 int popcnt(ulong x) pure
844 {
845     // Select the fastest method depending on the compiler and CPU architecture
846     version (LDC)
847     {
848         if (!__ctfe)
849             return _popcnt(x);
850     }
851 
852     import core.cpuid;
853 
854     static if (size_t.sizeof == uint.sizeof)
855     {
856         const sx = Split64(x);
857         version (DigitalMars)
858         {
859             static if (is(typeof(_popcnt(uint.max))))
860             {
861                 if (!__ctfe && hasPopcnt)
862                     return _popcnt(sx.lo) + _popcnt(sx.hi);
863             }
864         }
865 
866         return softPopcnt!uint(sx.lo) + softPopcnt!uint(sx.hi);
867     }
868     else static if (size_t.sizeof == ulong.sizeof)
869     {
870         version (DigitalMars)
871         {
872             static if (is(typeof(_popcnt(ulong.max))))
873             {
874                 if (!__ctfe && hasPopcnt)
875                     return _popcnt(x);
876             }
877         }
878 
879         return softPopcnt!ulong(x);
880     }
881     else
882         static assert(false);
883 }
884 
885 unittest
886 {
887     assert(popcnt(0uL) == 0);
888     assert(popcnt(1uL) == 1);
889     assert(popcnt((1uL << 32) - 1) == 32);
890     assert(popcnt(0x48_65_6C_6C_6F_3F_21_00uL) == 28);
891     assert(popcnt(ulong.max) == 64);
892 
893     // Make sure popcnt() is available at CTFE
894     enum test_ctfe = popcnt(ulong.max);
895     assert(test_ctfe == 64);
896 }
897 
898 private int softPopcnt(N)(N x) pure
899     if (is(N == uint) || is(N == ulong))
900 {
901     // Avoid branches, and the potential for cache misses which
902     // could be incurred with a table lookup.
903 
904     // We need to mask alternate bits to prevent the
905     // sum from overflowing.
906     // add neighbouring bits. Each bit is 0 or 1.
907     enum mask1 = cast(N) 0x5555_5555_5555_5555L;
908     x = x - ((x>>1) & mask1);
909     // now each two bits of x is a number 00,01 or 10.
910     // now add neighbouring pairs
911     enum mask2a = cast(N) 0xCCCC_CCCC_CCCC_CCCCL;
912     enum mask2b = cast(N) 0x3333_3333_3333_3333L;
913     x = ((x & mask2a)>>2) + (x & mask2b);
914     // now each nibble holds 0000-0100. Adding them won't
915     // overflow any more, so we don't need to mask any more
916 
917     enum mask4 = cast(N) 0x0F0F_0F0F_0F0F_0F0FL;
918     x = (x + (x >> 4)) & mask4;
919 
920     enum shiftbits = is(N == uint)? 24 : 56;
921     enum maskMul = cast(N) 0x0101_0101_0101_0101L;
922     x = (x * maskMul) >> shiftbits;
923 
924     return cast(int) x;
925 }
926 
927 version (DigitalMars) version (AnyX86)
928 {
929     /**
930      * Calculates the number of set bits in an integer
931      * using the X86 SSE4 POPCNT instruction.
932      * POPCNT is not available on all X86 CPUs.
933      */
934     ushort _popcnt( ushort x ) pure;
935     /// ditto
936     int _popcnt( uint x ) pure;
937     version (X86_64)
938     {
939         /// ditto
940         int _popcnt( ulong x ) pure;
941     }
942 
943     unittest
944     {
945         // Not everyone has SSE4 instructions
946         import core.cpuid;
947         if (!hasPopcnt)
948             return;
949 
950         static int popcnt_x(ulong u) nothrow @nogc
951         {
952             int c;
953             while (u)
954             {
955                 c += u & 1;
956                 u >>= 1;
957             }
958             return c;
959         }
960 
961         for (uint u = 0; u < 0x1_0000; ++u)
962         {
963             //writefln("%x %x %x", u,   _popcnt(cast(ushort)u), popcnt_x(cast(ushort)u));
964             assert(_popcnt(cast(ushort)u) == popcnt_x(cast(ushort)u));
965 
966             assert(_popcnt(cast(uint)u) == popcnt_x(cast(uint)u));
967             uint ui = u * 0x3_0001;
968             assert(_popcnt(ui) == popcnt_x(ui));
969 
970             version (X86_64)
971             {
972                 assert(_popcnt(cast(ulong)u) == popcnt_x(cast(ulong)u));
973                 ulong ul = u * 0x3_0003_0001;
974                 assert(_popcnt(ul) == popcnt_x(ul));
975             }
976         }
977     }
978 }
979 
980 
981 /**
982  * Reverses the order of bits in a 32-bit integer.
983  */
984 pragma(inline, true)
985 uint bitswap( uint x ) pure
986 {
987     if (!__ctfe)
988     {
989         version (LDC)
990         {
991             return llvm_bitreverse(x);
992         }
993         else
994         static if (is(typeof(asmBitswap32(x))))
995             return asmBitswap32(x);
996     }
997 
998     return softBitswap!uint(x);
999 }
1000 
1001 unittest
1002 {
1003     static void test(alias impl)()
1004     {
1005         assert (impl( 0x8000_0100 ) == 0x0080_0001);
1006         foreach (i; 0 .. 32)
1007             assert (impl(1 << i) == 1 << 32 - i - 1);
1008     }
1009 
1010     test!(bitswap)();
1011     test!(softBitswap!uint)();
1012     static if (is(typeof(asmBitswap32(0u))))
1013         test!(asmBitswap32)();
1014 
1015     // Make sure bitswap() is available at CTFE
1016     enum test_ctfe = bitswap(1U);
1017     assert(test_ctfe == (1U << 31));
1018 }
1019 
1020 /**
1021  * Reverses the order of bits in a 64-bit integer.
1022  */
1023 pragma(inline, true)
1024 ulong bitswap ( ulong x ) pure
1025 {
1026     if (!__ctfe)
1027     {
1028         version (LDC)
1029         {
1030             return llvm_bitreverse(x);
1031         }
1032         else
1033         static if (is(typeof(asmBitswap64(x))))
1034             return asmBitswap64(x);
1035     }
1036 
1037     return softBitswap!ulong(x);
1038 }
1039 
1040 unittest
1041 {
1042     static void test(alias impl)()
1043     {
1044         assert (impl( 0b1000000000000000000000010000000000000000100000000000000000000001)
1045             == 0b1000000000000000000000010000000000000000100000000000000000000001);
1046         assert (impl( 0b1110000000000000000000010000000000000000100000000000000000000001)
1047             == 0b1000000000000000000000010000000000000000100000000000000000000111);
1048         foreach (i; 0 .. 64)
1049             assert (impl(1UL << i) == 1UL << 64 - i - 1);
1050     }
1051 
1052     test!(bitswap)();
1053     test!(softBitswap!ulong)();
1054     static if (is(typeof(asmBitswap64(0uL))))
1055         test!(asmBitswap64)();
1056 
1057     // Make sure bitswap() is available at CTFE
1058     enum test_ctfe = bitswap(1UL);
1059     assert(test_ctfe == (1UL << 63));
1060 }
1061 
1062 private N softBitswap(N)(N x) pure
1063     if (is(N == uint) || is(N == ulong))
1064 {
1065     // swap 1-bit pairs:
1066     enum mask1 = cast(N) 0x5555_5555_5555_5555L;
1067     x = ((x >> 1) & mask1) | ((x & mask1) << 1);
1068     // swap 2-bit pairs:
1069     enum mask2 = cast(N) 0x3333_3333_3333_3333L;
1070     x = ((x >> 2) & mask2) | ((x & mask2) << 2);
1071     // swap 4-bit pairs:
1072     enum mask4 = cast(N) 0x0F0F_0F0F_0F0F_0F0FL;
1073     x = ((x >> 4) & mask4) | ((x & mask4) << 4);
1074 
1075     // reverse the order of all bytes:
1076     x = bswap(x);
1077 
1078     return x;
1079 }
1080 
1081 version (AsmX86)
1082 {
1083     private uint asmBitswap32(uint x) @trusted pure
1084     {
1085         asm pure nothrow @nogc { naked; }
1086 
1087         version (D_InlineAsm_X86_64)
1088         {
1089             version (Win64)
1090                 asm pure nothrow @nogc { mov EAX, ECX; }
1091             else
1092                 asm pure nothrow @nogc { mov EAX, EDI; }
1093         }
1094 
1095         asm pure nothrow @nogc
1096         {
1097             // Author: Tiago Gasiba.
1098             mov EDX, EAX;
1099             shr EAX, 1;
1100             and EDX, 0x5555_5555;
1101             and EAX, 0x5555_5555;
1102             shl EDX, 1;
1103             or  EAX, EDX;
1104             mov EDX, EAX;
1105             shr EAX, 2;
1106             and EDX, 0x3333_3333;
1107             and EAX, 0x3333_3333;
1108             shl EDX, 2;
1109             or  EAX, EDX;
1110             mov EDX, EAX;
1111             shr EAX, 4;
1112             and EDX, 0x0f0f_0f0f;
1113             and EAX, 0x0f0f_0f0f;
1114             shl EDX, 4;
1115             or  EAX, EDX;
1116             bswap EAX;
1117             ret;
1118         }
1119     }
1120 }
1121 
1122 version (LDC) {} else
1123 version (D_InlineAsm_X86_64)
1124 {
1125     private ulong asmBitswap64(ulong x) @trusted pure
1126     {
1127         asm pure nothrow @nogc { naked; }
1128 
1129         version (Win64)
1130             asm pure nothrow @nogc { mov RAX, RCX; }
1131         else
1132             asm pure nothrow @nogc { mov RAX, RDI; }
1133 
1134         asm pure nothrow @nogc
1135         {
1136             // Author: Tiago Gasiba.
1137             mov RDX, RAX;
1138             shr RAX, 1;
1139             mov RCX, 0x5555_5555_5555_5555L;
1140             and RDX, RCX;
1141             and RAX, RCX;
1142             shl RDX, 1;
1143             or  RAX, RDX;
1144 
1145             mov RDX, RAX;
1146             shr RAX, 2;
1147             mov RCX, 0x3333_3333_3333_3333L;
1148             and RDX, RCX;
1149             and RAX, RCX;
1150             shl RDX, 2;
1151             or  RAX, RDX;
1152 
1153             mov RDX, RAX;
1154             shr RAX, 4;
1155             mov RCX, 0x0f0f_0f0f_0f0f_0f0fL;
1156             and RDX, RCX;
1157             and RAX, RCX;
1158             shl RDX, 4;
1159             or  RAX, RDX;
1160             bswap RAX;
1161             ret;
1162         }
1163     }
1164 }
1165 
1166 /**
1167  *  Bitwise rotate `value` left (`rol`) or right (`ror`) by
1168  *  `count` bit positions.
1169  */
1170 pragma(inline, true) // LDC
1171 pure T rol(T)(const T value, const uint count)
1172     if (__traits(isIntegral, T) && __traits(isUnsigned, T))
1173 {
1174     assert(count < 8 * T.sizeof);
1175     if (count == 0)
1176         return cast(T) value;
1177 
1178     return cast(T) ((value << count) | (value >> (T.sizeof * 8 - count)));
1179 }
1180 /// ditto
1181 pragma(inline, true) // LDC
1182 pure T ror(T)(const T value, const uint count)
1183     if (__traits(isIntegral, T) && __traits(isUnsigned, T))
1184 {
1185     assert(count < 8 * T.sizeof);
1186     if (count == 0)
1187         return cast(T) value;
1188 
1189     return cast(T) ((value >> count) | (value << (T.sizeof * 8 - count)));
1190 }
1191 /// ditto
1192 pragma(inline, true) // LDC
1193 pure T rol(uint count, T)(const T value)
1194     if (__traits(isIntegral, T) && __traits(isUnsigned, T))
1195 {
1196     static assert(count < 8 * T.sizeof);
1197     static if (count == 0)
1198         return cast(T) value;
1199 
1200     return cast(T) ((value << count) | (value >> (T.sizeof * 8 - count)));
1201 }
1202 /// ditto
1203 pragma(inline, true) // LDC
1204 pure T ror(uint count, T)(const T value)
1205     if (__traits(isIntegral, T) && __traits(isUnsigned, T))
1206 {
1207     static assert(count < 8 * T.sizeof);
1208     static if (count == 0)
1209         return cast(T) value;
1210 
1211     return cast(T) ((value >> count) | (value << (T.sizeof * 8 - count)));
1212 }
1213 
1214 ///
1215 unittest
1216 {
1217     ubyte a = 0b11110000U;
1218     ulong b = ~1UL;
1219 
1220     assert(rol(a, 1) == 0b11100001);
1221     assert(ror(a, 1) == 0b01111000);
1222     assert(rol(a, 3) == 0b10000111);
1223     assert(ror(a, 3) == 0b00011110);
1224 
1225     assert(rol(a, 0) == a);
1226     assert(ror(a, 0) == a);
1227 
1228     assert(rol(b, 63) == ~(1UL << 63));
1229     assert(ror(b, 63) == ~2UL);
1230 
1231     assert(rol!3(a) == 0b10000111);
1232     assert(ror!3(a) == 0b00011110);
1233 
1234     enum c = rol(uint(1), 0);
1235     enum d = ror(uint(1), 0);
1236     assert(c == uint(1));
1237     assert(d == uint(1));
1238 }