The OpenD Programming Language

1 module gamut.codecs.qoi10b;
2 
3 nothrow @nogc:
4 
5 import core.stdc.stdlib: realloc, malloc, free;
6 import core.stdc.string: memset;
7 
8 import gamut.codecs.qoi2avg;
9 import inteli.emmintrin;
10 
11 /// A QOI-inspired codec for 10-bit images, called "QOI-10b". 
12 /// Input image is 16-bit ushort, but only 10-bits gets encoded making it lossy.
13 ///
14 ///
15 /// Incompatible adaptation of QOI format - https://phoboslab.org
16 ///
17 /// -- LICENSE: The MIT License(MIT)
18 /// Copyright(c) 2021 Dominic Szablewski (original QOI format)
19 /// Copyright(c) 2022 Guillaume Piolat (QOI-10b variant for 10b images, 1/2/3/4 channels).
20 /// Permission is hereby granted, free of charge, to any person obtaining a copy of
21 /// this software and associated documentation files(the "Software"), to deal in
22 /// the Software without restriction, including without limitation the rights to
23 /// use, copy, modify, merge, publish, distribute, sublicense, and / or sell copies
24 /// of the Software, and to permit persons to whom the Software is furnished to do
25 /// so, subject to the following conditions :
26 /// The above copyright notice and this permission notice shall be included in all
27 /// copies or substantial portions of the Software.
28 /// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
29 /// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
30 /// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
31 /// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
32 /// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
33 /// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
34 /// SOFTWARE.
35 
36 /// -- Documentation
37 
38 /// This library provides the following functions;
39 /// - qoi10b_decode  -- decode the raw bytes of a QOI-10b image from memory
40 /// - qoi10b_encode  -- encode an rgba buffer into a QOI-10b image in memory
41 /// 
42 ///
43 /// A QOI-10b file has a 25 byte header, compatible with Gamut QOIX.
44 ///
45 /// struct qoix_header_t {
46 ///     char     magic[4];         // magic bytes "qoix"
47 ///     uint32_t width;            // image width in pixels (BE)
48 ///     uint32_t height;           // image height in pixels (BE)
49 ///     uint8_t  version_;         // Major version of QOIX format.
50 ///     uint8_t  channels;         // 1, 2, 3 or 4
51 ///     uint8_t  bitdepth;         // 10 = this QOI-10b codec is always 10-bit (8 would indicate QOI2AVG or QOI-plane)
52 ///     uint8_t  colorspace;       // 0 = sRGB with linear alpha, 1 = all channels linear
53 ///     uint8_t  compression;      // 0 = none, 1 = LZ4
54 ///     float    pixelAspectRatio; // -1 = unknown, else Pixel Aspect Ratio
55 ///     float    resolutionX;      // -1 = unknown, else physical resolution in DPI
56 /// };
57 ///
58 /// The decoder and encoder start with {r: 0; g: 0; b: 0; a: 0} as the previous
59 /// pixel value. Pixels are either encoded as
60 /// - a run of the previous pixel
61 /// - a difference to the previous pixel value
62 /// - full luminance value
63 ///
64 /// The byte stream's end is marked with 5 0xff bytes.
65 ///
66 /// This codec is simply like QOI2AVG but with extra 2 bits for each components, and it breaks byte alignment.
67 ///
68 /// Optimized?    Opcode       Bits(RGB)   Bits(grey)  Meaning
69 /// [ ]           QOI_OP_LUMA      14           6     0ggggg[rrrrbbbb]  (less g or more g => doesn't work)
70 /// [ ]           QOI_OP_LUMA0     12           6     10gggg[rrrbbb]
71 /// [ ]           QOI_OP_LUMA2     22          10     110ggggggg[rrrrrrbbbbbb]
72 /// [ ]           QOI_OP_LUMA3     30          14     11100ggggggggg[rrrrrrrrbbbbbbbb]
73 /// [ ]           QOI_OP_ADIFF     10          10     11101xxxxx
74 /// [x]           QOI_OP_RUN        8           8     11110xxx
75 ///                                16          16     11110111xxxxxxxx
76 /// [ ]           QOI_OP_ADIFF2    16          16     111110xxxxxxxx 
77 /// [x]           QOI_OP_GRAY      18          18     11111100gggggggggg
78 /// [ ]           QOI_OP_RGB       38          18     11111101rrrrrrrrrr[ggggggggggbbbbbbbbbb]
79 /// [ ]           QOI_OP_RGBA      48          28     11111110rrrrrrrrrr[ggggggggggbbbbbbbbbb]aaaaaaaaaa
80 /// [ ]           QOI_OP_END        8           8     11111111
81 
82 // Note: LOCO-I predictor doesn't really work here, it slowdown quite a bit for 0.5% bitrate improvement.
83 //       Not sure what happens here.
84 
85 enum ubyte QOI_OP_ADIFF2 = 0xf8;
86 
87 enum int WORST_OPCODE_BITS = 48;
88 
89 enum enableAveragePrediction = true; 
90 
91 enum bool newpred = false;//true;
92 
93 static immutable ubyte[5] qoi10b_padding = [255,255,255,255,255];
94 
95 enum qoi10_rgba_t initialPredictor = { r:0, g:0, b:0, a:1023 };
96 
97 struct qoi10_rgba_t 
98 {   
99     // 0 to 1023 values
100     ushort r;
101     ushort g;
102     ushort b;
103     ushort a;
104 }
105 
106 uint QOI10b_COLOR_HASH(qoi10_rgba_t C)
107 {
108     long colorAsLong = *(cast(long*)&C);
109     return (((colorAsLong * 2654435769) >> 22) & 1023);
110 }
111 
112 /* Encode raw RGBA16 pixels into a QOI-10b image in memory.
113 This immediately loosen precision from 16-bit t 10-bit, so this is lossy.
114 The function either returns null on failure (invalid parameters or malloc 
115 failed) or a pointer to the encoded data on success. On success the out_len 
116 is set to the size in bytes of the encoded data.
117 The returned qoi data should be free()d after use. */
118 version(encodeQOIX)
119 ubyte* qoi10b_encode(const(ubyte)* data, const(qoi_desc)* desc, int *out_len) 
120 {
121     if ( (desc.channels != 1 && desc.channels != 2 && desc.channels != 3 && desc.channels != 4) ||
122         desc.height >= QOIX_PIXELS_MAX / desc.width || desc.compression != QOIX_COMPRESSION_NONE
123     ) {
124         return null;
125     }
126 
127     if (desc.bitdepth != 10)
128         return null;
129 
130     int channels = desc.channels;
131 
132     // At worst, each pixel take 38 bit to be encoded.
133     int num_pixels = desc.width * desc.height;
134 
135    int scanline_size = cast(int)(qoi10_rgba_t.sizeof) * desc.width;
136 
137     int max_size = cast(int) (cast(long)num_pixels * WORST_OPCODE_BITS + 7) / 8 + QOIX_HEADER_SIZE + cast(int)(qoi10b_padding.sizeof);
138 
139     ubyte* stream;
140 
141     int p = 0; // write index into output stream
142     ubyte* bytes = cast(ubyte*) QOI_MALLOC(max_size + 2 * scanline_size);
143     if (!bytes) 
144     {
145         return null;
146     }
147 
148     qoi_write_32(bytes, &p, QOIX_MAGIC);
149     qoi_write_32(bytes, &p, desc.width);
150     qoi_write_32(bytes, &p, desc.height);
151     bytes[p++] = 1; // Put a version number :)
152     bytes[p++] = desc.channels; // 1, 2, 3 or 4
153     bytes[p++] = desc.bitdepth; // 10
154     bytes[p++] = desc.colorspace;
155     bytes[p++] = QOIX_COMPRESSION_NONE;
156     qoi_write_32f(bytes, &p, desc.pixelAspectRatio);
157     qoi_write_32f(bytes, &p, desc.resolutionY);
158 
159     int currentBit = 7; // beginning of a byte
160     bytes[p] = 0;
161 
162     // write the nbits last bits of x, starting from the highest one
163     void outputBits(uint x, int nbits) nothrow @nogc
164     {
165         assert(nbits >= 2 && nbits <= 16);
166         assert( (nbits % 2) == 0);
167 
168         for (int b = nbits - 2; b >= 0; b -= 2)
169         {
170             // which bit to write
171             ubyte pairOfBits = (x >>> b) & 3;
172             bytes[p] |= (pairOfBits << (currentBit - 1));
173 
174             currentBit -= 2;
175 
176             if (currentBit == -1)
177             {
178                 p++;
179                 bytes[p] = 0;
180                 currentBit = 7;
181             }
182         }
183     }
184 
185     void outputByte(ubyte b)
186     {
187         outputBits(b, 8);
188     }
189 
190     qoi10_rgba_t px = initialPredictor;
191     qoi10_rgba_t px_ref = initialPredictor;
192 
193     // To serve as predictor
194     qoi10_rgba_t* scanlineConverted     = cast(qoi10_rgba_t*)(&bytes[max_size]);
195     qoi10_rgba_t* lastScanlineConverted = cast(qoi10_rgba_t*)(&bytes[max_size + scanline_size]);
196 
197     ubyte[1024] index_lookup;
198     uint index_pos = 0;
199     
200     
201     index_lookup[] = 0;
202 
203     bool streamIsGrey = (channels == 1 || channels == 2);
204     int run = 0;
205     int encoded_pixels = 0;
206 
207     void encodeRun()
208     {
209         assert(run > 0 && run <= 256);
210         run--;
211         if (run < 7) 
212         {
213             outputByte( cast(ubyte)(QOI_OP_RUN | run) ); // run 1 to 7
214         }
215         else 
216         {
217             outputByte( cast(ubyte)(QOI_OP_RUN | 7) ); // QOI_OP_RUN2 is inside the QOI_OP_RUN
218             outputBits(run - 7, 8);
219         }
220         run = 0;
221     }
222     
223     for (int posy = 0; posy < desc.height; ++posy)
224     {
225         {
226             const(ushort)* line = cast(const(ushort*))(data + desc.pitchBytes * posy);
227    
228             // 1. First convert the scanline to full qoi10_rgba_t to serve as predictor.
229 
230             for (int posx = 0; posx < desc.width; ++posx)
231             {
232                 qoi10_rgba_t pixel;
233 
234                 // Note that we drop six lower bits here. This codec is lossy 
235                 // if you really have more than 10-bits of precision.
236                 // The use case is PBR knob in Dplug, who needs 10-bit (presumably) for the elevation map.
237                 switch(channels)
238                 {
239                     default:
240                     case 4:
241                         pixel.r = line[posx * channels + 0];
242                         pixel.g = line[posx * channels + 1];
243                         pixel.b = line[posx * channels + 2];
244                         pixel.a = line[posx * channels + 3];
245                         break;
246                     case 3:
247                         pixel.r = line[posx * channels + 0];
248                         pixel.g = line[posx * channels + 1];
249                         pixel.b = line[posx * channels + 2];
250                         pixel.a = 65535;
251                         break;
252                     case 2:
253                         pixel.r = line[posx * channels + 0];
254                         pixel.g = pixel.r;
255                         pixel.b = pixel.r;
256                         pixel.a = line[posx * channels + 1];
257                         break;
258                     case 1:
259                         pixel.r = line[posx * channels + 0];
260                         pixel.g = pixel.r;
261                         pixel.b = pixel.r;
262                         pixel.a = 65535;
263                         break;
264                 }
265 
266                 pixel.r = pixel.r >>> 6;
267                 pixel.g = pixel.g >>> 6;
268                 pixel.b = pixel.b >>> 6;
269                 pixel.a = pixel.a >>> 6;
270 
271                 assert(pixel.r <= 1023);
272                 assert(pixel.g <= 1023);
273                 assert(pixel.b <= 1023);
274                 assert(pixel.a <= 1023);
275 
276                 scanlineConverted[posx] = pixel; // Note: if we'd like lossy, this would be the reconstructed buffer.
277             }
278         }
279 
280         for (int posx = 0; posx < desc.width; ++posx)
281         {
282             px_ref = px;
283 
284             px = scanlineConverted[posx];
285 
286             if (px == px_ref) 
287             {
288                 run++;
289                 if (run == 256 || encoded_pixels + 1 == num_pixels)
290                     encodeRun();
291             }
292             else 
293             {
294                 if (run > 0) 
295                     encodeRun();
296 
297                 {
298                     int va = (px.a - px_ref.a) & 1023;
299                     if (va) 
300                     {
301                         if (va < 16 || va >= (1024 - 16)) // does it fit on 5 bits?
302                         {
303                             // it fits on 5 bits
304                             outputBits((0x1d << 5) | (va & 0x1f), 10); // QOI_OP_ADIFF
305                         }
306                         else if (va < 128 || va >= (1024 - 128)) // does it fit on 8 bits?
307                         {
308                             outputBits( (QOI_OP_ADIFF2 >>> 2), 6);
309                             outputBits(va, 8);  
310                         }
311                         else
312                         {
313                             outputByte(QOI_OP_RGBA);
314                             outputBits(px.r, 10);
315                             if (!streamIsGrey)
316                             {
317                                 outputBits(px.g, 10);
318                                 outputBits(px.b, 10);
319                             }
320                             outputBits(px.a, 10);
321                             goto pixel_is_encoded;
322                         }
323                     }
324 
325                     if (posy > 0 && enableAveragePrediction)
326                     {
327                         static if (newpred)
328                         {
329                             if (posx == 0)
330                             {
331                                 // first pixel in the row, take above pixel
332                                 px_ref.r = lastScanlineConverted[posx].r;
333                                 px_ref.g = lastScanlineConverted[posx].g;
334                                 px_ref.b = lastScanlineConverted[posx].b;
335                             }
336                             else 
337                             {
338                                 qoi10_rgba_t pred = locoIntraPredictionSIMD(px_ref, lastScanlineConverted[posx], lastScanlineConverted[posx-1]);
339                                 px_ref.r = pred.r;
340                                 px_ref.g = pred.g;
341                                 px_ref.b = pred.b;
342                             }
343                         }
344                         else
345                         {
346                             px_ref.r = (px_ref.r + lastScanlineConverted[posx].r + 1) >> 1;
347                             px_ref.g = (px_ref.g + lastScanlineConverted[posx].g + 1) >> 1;
348                             px_ref.b = (px_ref.b + lastScanlineConverted[posx].b + 1) >> 1;
349                         }
350                     }
351 
352                     int vg   = (px.g - px_ref.g) & 1023;
353                     int vg_r = (px.r - px_ref.r - vg) & 1023;
354                     int vg_b = (px.b - px_ref.b - vg) & 1023;
355 
356                     if (streamIsGrey)
357                     {
358                         assert(vg_r == 0);
359                         assert(vg_b == 0);
360                     }
361 
362 
363                     if ( ( (vg_r >= (1024- 4)) || (vg_r <  4) ) &&  // fits in 3 bits?
364                          ( (vg   >= (1024-8)) || (vg   < 8) ) &&    // fits in 4 bits?
365                         ( (vg_b >= (1024- 4)) || (vg_b <  4) ) )   // fits in 3 bits?
366                     {
367                         outputBits(0x20 | (vg & 0x0f), 6); // QOI_OP_LUMA0
368                         if (!streamIsGrey)
369                         {
370                             outputBits( (vg_r << 3) | (vg_b & 7), 6);
371                         }
372                     }
373                     else if ( ( (vg_r >= (1024- 8)) || (vg_r <  8) ) &&  // fits in 4 bits?
374                          ( (vg   >= (1024-16)) || (vg   < 16) ) &&  // fits in 5 bits?
375                          ( (vg_b >= (1024- 8)) || (vg_b <  8) ) )   // fits in 4 bits?
376                     {
377                         outputBits(vg & 0x1f, 6); // QOI_OP_LUMA
378                         if (!streamIsGrey)
379                         {
380                             outputBits(vg_r, 4);
381                             outputBits(vg_b, 4);
382                         }
383                     }
384                     else if (!streamIsGrey && px.g == px.r && px.g == px.b) 
385                     {  
386                         // Note: in greyscale, more expensive than QOI_OP_LUMA2
387                         // This opcode should not be used if input is grey.
388                         outputByte(QOI_OP_GRAY);
389                         outputBits(px.g, 10);
390                     }
391                     else
392                     if ( ( (vg_r >= (1024-32)) || (vg_r < 32) ) &&  // fits in 6 bits?
393                          ( (vg   >= (1024-64)) || (vg   < 64) ) &&  // fits in 7 bits?
394                          ( (vg_b >= (1024-32)) || (vg_b < 32) ) )   // fits in 6 bits?
395                     {
396                         outputBits((0x6 << 7) | (vg & 0x7f), 10); // QOI_OP_LUMA2
397                         if (!streamIsGrey)
398                         {
399                             outputBits(vg_r, 6);
400                             outputBits(vg_b, 6);
401                         }
402                     } 
403                     else
404                     if ( ( (vg_r >= (1024-128)) || (vg_r < 128) ) && // fits in 8 bits?
405                          ( (vg   >= (1024-256)) || (vg   < 256) ) && // fits in 9 bits?
406                          ( (vg_b >= (1024-128)) || (vg_b < 128) ) )   // fits in 8 bits?
407                     {
408                         outputBits((0x1c << 9) | (vg & 0x1ff), 14); // QOI_OP_LUMA3
409                         if (!streamIsGrey)
410                         {
411                             outputBits(vg_r, 8);
412                             outputBits(vg_b, 8);
413                         }
414                     } 
415                     else
416                     {
417                         outputByte(QOI_OP_RGB);
418                         outputBits(px.r, 10);
419                         if (!streamIsGrey)
420                         {
421                             outputBits(px.g, 10);
422                             outputBits(px.b, 10);
423                         }
424                     }
425                 }
426             }
427 
428             pixel_is_encoded:
429 
430             encoded_pixels++;
431         }
432 
433         // Exchange scanline pointers
434         {
435             qoi10_rgba_t* temp = scanlineConverted;
436             scanlineConverted = lastScanlineConverted;
437             lastScanlineConverted = temp;
438         }
439     }
440 
441     for (int i = 0; i < cast(int)(qoi10b_padding.sizeof); i++) 
442     {
443         outputByte(qoi10b_padding[i]);
444     }
445 
446     // finish the last byte
447     if (currentBit != 7)
448         outputBits(0xff, currentBit + 1);
449     assert(currentBit == 7); // full byte
450 
451     *out_len = p;
452     return bytes;
453 }
454 
455 /* Decode a QOI-10b image from memory.
456 
457 The function either returns null on failure (invalid parameters or malloc 
458 failed) or a pointer to the decoded 16-bit pixels. On success, the qoi_desc struct 
459 is filled with the description from the file header.
460 
461 The returned pixel data should be free()d after use. */
462 version(decodeQOIX)
463 ubyte* qoi10b_decode(const(void)* data, int size, qoi_desc *desc, int channels) 
464 {
465     const(ubyte)* bytes;
466     uint header_magic;
467 
468     int p = 0, run = 0;
469 
470     if (data == null || desc == null ||
471         (channels < 0 || channels > 4) ||
472             size < QOIX_HEADER_SIZE + cast(int)(qoi10b_padding.sizeof)
473                 )
474     {
475         return null;
476     }
477 
478     bytes = cast(const(ubyte)*)data;
479 
480     header_magic = qoi_read_32(bytes, &p);
481     desc.width = qoi_read_32(bytes, &p);
482     desc.height = qoi_read_32(bytes, &p);
483     int qoix_version = bytes[p++];
484     desc.channels = bytes[p++];
485     desc.bitdepth = bytes[p++];
486     desc.colorspace = bytes[p++];
487     desc.compression = bytes[p++];
488     desc.pixelAspectRatio = qoi_read_32f(bytes, &p);
489     desc.resolutionY = qoi_read_32f(bytes, &p);
490 
491     if (desc.width == 0 || desc.height == 0 || 
492         desc.channels < 1 || desc.channels > 4 ||
493         desc.colorspace > 1 ||
494         desc.bitdepth != 10 ||
495         qoix_version > 1 ||
496         desc.compression != QOIX_COMPRESSION_NONE ||
497         header_magic != QOIX_MAGIC ||
498         desc.height >= QOIX_PIXELS_MAX / desc.width
499         ) 
500     {
501         return null;
502     }
503 
504     int streamChannels = desc.channels;
505 
506     if (channels == 0)
507         channels = streamChannels;
508 
509     int stride = desc.width * channels * 2;
510     desc.pitchBytes = stride;         
511 
512     int pixel_data_size = stride * desc.height;
513     int decoded_scanline_size = desc.width * cast(int)qoi10_rgba_t.sizeof;
514 
515     ubyte* pixels = cast(ubyte*) QOI_MALLOC(pixel_data_size + 2 * decoded_scanline_size);
516     if (!pixels) {
517         return null;
518     }
519 
520     // double-buffered scanline, for correct average predictors
521     // (else we can't decode 4/3 channels to 1/2 with average prediction, the predictors would be wrong
522     //  if taken from the decoded output)
523     qoi10_rgba_t* decodedScanline = cast(qoi10_rgba_t*)(&pixels[pixel_data_size]);
524     qoi10_rgba_t* lastDecodedScanline = cast(qoi10_rgba_t*)(&pixels[pixel_data_size + decoded_scanline_size]);
525 
526     assert(channels >= 1 && channels <= 4);
527 
528     int currentBit = 7;
529 
530     void rewindInputBit() nothrow @nogc
531     {
532         if (currentBit == 7)
533         {
534             p--;
535             currentBit = -1;
536         }
537         currentBit++;
538     }
539 
540     int read2Bits() nothrow @nogc
541     {
542         ubyte bb = bytes[p];
543 
544         int bit = (bytes[p] >>> (currentBit - 1)) & 3;
545 
546         currentBit -= 2;
547         if (currentBit == -1)
548         {
549             currentBit = 7;
550             p++;
551         }
552         return bit;
553     }
554 
555     uint readBits(int nbits) nothrow @nogc
556     {
557         assert(nbits % 2 == 0);
558         uint r = 0;
559         for (int b = 0; b < nbits; b += 2)
560         {
561             r = (r << 2) | read2Bits();
562         }
563         return r;
564     }
565 
566     ubyte readByte() nothrow @nogc
567     {
568         return cast(ubyte) readBits(8);
569     }
570 
571     bool streamIsGrey = (streamChannels == 1 || streamChannels == 2);
572 
573     qoi10_rgba_t px = initialPredictor;
574     qoi10_rgba_t px_ref = initialPredictor;
575 
576     int decoded_pixels = 0;
577     int num_pixels = desc.width * desc.height;
578 
579     for (int posy = 0; posy < desc.height; ++posy)
580     {
581         for (int posx = 0; posx < desc.width; ++posx)
582         {
583             px_ref = px;
584 
585             if (run > 0) 
586             {
587                 run--;
588             }
589             else if (decoded_pixels < num_pixels)
590             {
591                 // Compute averaged predictors then
592 
593                 if (posy > 0 && enableAveragePrediction)
594                 {
595                     static if (newpred)
596                     {
597                         if (posx == 0)
598                         {
599                             // first pixel in the row, take above pixel
600                             px_ref.r = lastDecodedScanline[posx].r;
601                             px_ref.g = lastDecodedScanline[posx].g;
602                             px_ref.b = lastDecodedScanline[posx].b;
603                         }
604                         else 
605                         {
606                             qoi10_rgba_t pred = locoIntraPredictionSIMD(px_ref, lastDecodedScanline[posx], lastDecodedScanline[posx-1]);
607                             px_ref.r = pred.r;
608                             px_ref.g = pred.g;
609                             px_ref.b = pred.b;
610                         }
611                     }
612                     else
613                     {
614                         px_ref.r = (px_ref.r + lastDecodedScanline[posx].r + 1) >> 1;
615                         px_ref.g = (px_ref.g + lastDecodedScanline[posx].g + 1) >> 1;
616                         px_ref.b = (px_ref.b + lastDecodedScanline[posx].b + 1) >> 1;
617                     }
618                 }
619 
620                 decode_next_op:
621 
622                 ubyte op = readByte();
623 
624                 if (op < 0x80)              // QOI_OP_LUMA
625                 {
626                     int vg = (op >> 2) & 31;  // vg is a signed 5-bit number
627                     vg = (vg << 27) >> 27;   // extends sign
628                     px.g = (px_ref.g + vg       ) & 1023;
629 
630                     if (!streamIsGrey)
631                     {
632                         int vg_r = ((op & 3) << 2) | readBits(2); // vg_r and vg_b are signed 4-bit number in the stream
633                         vg_r = (vg_r << 28) >> 28;   // extends sign    
634                         int vg_b = cast(int)(readBits(4) << 28) >> 28;
635                         px.r = (px_ref.r + vg + vg_r) & 1023;
636                         px.b = (px_ref.b + vg + vg_b) & 1023;
637                     }
638                     else
639                     {
640                         // Rewind two bits, this is always possible
641                         rewindInputBit();
642                         rewindInputBit();
643                         px.r = px.g;
644                         px.b = px.g;
645                     }
646                 }
647                 else if (op < 0xc0)    // QOI_OP_LUMA0 10xxxx
648                 {
649                     // 10gggg[rrrggg]
650 
651                     int vg = (op >> 2) & 15;  // vg is a signed 4-bit number
652                     vg = (vg << 28) >> 28;   // extends sign
653                     px.g = (px_ref.g + vg       ) & 1023;
654                     if (!streamIsGrey)
655                     {
656                         uint remain = readBits(4);
657                         int vg_r = ((op & 3) << 1) | (remain >> 3);
658                         int vg_b = remain & 7;
659                         vg_r = (vg_r << 29) >> 29;
660                         vg_b = (vg_b << 29) >> 29;
661                         px.r = (px_ref.r + vg + vg_r) & 1023;
662                         px.b = (px_ref.b + vg + vg_b) & 1023;
663                     }
664                     else
665                     {
666                         // Rewind two bits, this is always possible
667                         rewindInputBit();
668                         rewindInputBit();
669                         px.r = px.g;
670                         px.b = px.g;
671                     }
672                 }
673                 else if (op < 0xe0)    // QOI_OP_LUMA2
674                 {
675                     int vg   = ((op & 31) << 2) | readBits(2); // vg is a signed 7-bit number
676                     vg = (vg << 25) >> 25;                     // extends sign
677                     px.g = (px_ref.g + vg       ) & 1023;
678                     if (!streamIsGrey)
679                     {
680                         int vg_r = cast(int)(readBits(6) << 26) >> 26; // vg_r and vg_b are signed 6-bit number in the stream
681                         int vg_b = cast(int)(readBits(6) << 26) >> 26;
682                         px.r = (px_ref.r + vg + vg_r) & 1023;
683                         px.b = (px_ref.b + vg + vg_b) & 1023;
684                     }
685                     else
686                     {
687                         px.r = px.g;
688                         px.b = px.g;
689                     }
690                 }
691                 else if (op < 0xe8)    // QOI_OP_LUMA3
692                 {
693                     int vg   = ((op & 7) << 6) | readBits(6); // vg is a signed 9-bit number
694                     vg = (vg << 23) >> 23;                    // extends sign
695                     px.g = (px_ref.g + vg       ) & 1023;
696                     if (!streamIsGrey)
697                     {
698                         int vg_r = cast(int)(readBits(8) << 24) >> 24; // vg_r and vg_b are signed 8-bit number in the stream
699                         int vg_b = cast(int)(readBits(8) << 24) >> 24;
700                         px.r = (px_ref.r + vg + vg_r) & 1023;
701                         px.b = (px_ref.b + vg + vg_b) & 1023;
702                     }
703                     else
704                     {
705                         px.r = px.g;
706                         px.b = px.g;
707                     }
708                 }  
709                 else if (op < 0xf0)    // QOI_OP_ADIFF
710                 {
711                     int adiff = ((op & 7) << 2) | readBits(2);
712                     adiff = adiff << 27; // Need sign extension, else the negatives aren't negatives.
713                     adiff = adiff >> 27;
714                     px.a = cast(ushort)((px.a + adiff) & 1023);
715                     goto decode_next_op;
716                 }
717                 else if ((op & 0xfc) == QOI_OP_ADIFF2)
718                 {
719                     int adiff = ((op & 3) << 6) | readBits(6);
720                     adiff = (adiff << 24) >> 24; // sign-extend
721                     px.a = cast(ushort)((px.a + adiff) & 1023);
722                     goto decode_next_op;
723                 }
724                 else if (op < 0xf8) // QOI_OP_RUN
725                 {       
726                     run = op & 7;
727                     if (run == 7)
728                     {
729                         run = readBits(8) + 7;
730                     }
731                 }               
732                 else if (op == QOI_OP_RGB)
733                 {
734                     px.r = cast(ushort) readBits(10);
735                     if (!streamIsGrey)
736                     {
737                         px.g = cast(ushort) readBits(10);
738                         px.b = cast(ushort) readBits(10);
739                     }
740                     else
741                     {
742                         px.g = px.r;
743                         px.b = px.r;
744                     }
745                }
746                 else if (op == QOI_OP_RGBA)
747                 {
748                     px.r = cast(ushort) readBits(10);
749                     if (!streamIsGrey)
750                     {
751                         px.g = cast(ushort) readBits(10);
752                         px.b = cast(ushort) readBits(10);
753                     }
754                     else
755                     {
756                         px.g = px.r;
757                         px.b = px.r;
758                     }
759                     px.a = cast(ushort) readBits(10);
760                 }
761                 else if (op == QOI_OP_GRAY)
762                 {
763                     px.r = cast(ushort) readBits(10);
764                     px.g = px.r;
765                     px.b = px.r;
766                 }                    
767                 else if (op == QOI_OP_END)
768                 {
769                     goto finished;
770                 }
771                 else
772                 {
773                     assert(false);
774                 }
775             }
776 
777             decodedScanline[posx] = px;
778             decoded_pixels++;
779         }
780 
781         // convert just-decoded scanline into output type
782         ushort* line      = cast(ushort*)(pixels + desc.pitchBytes * posy);
783 
784         // PERF: can be 4 different loops here?
785         for (int posx = 0; posx < desc.width; ++posx)
786         {
787             qoi10_rgba_t px16b = decodedScanline[posx]; // 0..1023 components
788             px16b.r = cast(ushort)(px16b.r << 6 | (px16b.r >> 4));
789             px16b.g = cast(ushort)(px16b.g << 6 | (px16b.g >> 4));
790             px16b.b = cast(ushort)(px16b.b << 6 | (px16b.b >> 4));
791             px16b.a = cast(ushort)(px16b.a << 6 | (px16b.a >> 4));
792 
793             // Expand 10 bit to 16-bits
794             switch(channels)
795             {
796                 default:
797                 case 4:
798                     line[posx * channels + 0] = px16b.r;
799                     line[posx * channels + 1] = px16b.g;
800                     line[posx * channels + 2] = px16b.b;
801                     line[posx * channels + 3] = px16b.a;
802                     break;
803                 case 3:
804                     line[posx * channels + 0] = px16b.r;
805                     line[posx * channels + 1] = px16b.g;
806                     line[posx * channels + 2] = px16b.b;
807                     break;
808                 case 2:
809                     line[posx * channels + 0] = px16b.r;
810                     line[posx * channels + 1] = px16b.a;
811                     break;
812                 case 1:
813                     line[posx * channels + 0] = px16b.r;
814                     break;
815             }
816         }
817 
818         // swap decoded scanline buffers
819         {
820             qoi10_rgba_t* temp = decodedScanline;
821             decodedScanline = lastDecodedScanline;
822             lastDecodedScanline = temp;
823         }
824     }
825 
826     finished:
827     return pixels;
828 }
829 
830 static qoi10_rgba_t locoIntraPredictionSIMD(qoi10_rgba_t a, qoi10_rgba_t b, qoi10_rgba_t c)
831 {
832     // load RGBA16 pixels
833     __m128i A = _mm_loadu_si64(&a); 
834     __m128i B = _mm_loadu_si64(&b);
835     __m128i C = _mm_loadu_si64(&c);
836 
837     // Max predictor (A + B - C)
838     __m128i P = _mm_sub_epi16(_mm_add_epi16(A, B), C);
839     __m128i maxAB = _mm_max_epi16(A, B);
840     __m128i minAB = _mm_min_epi16(A, B);
841 
842     // 1111 where we should use max(A, B)
843     __m128i maxMask = _mm_cmple_epi16(C, minAB);
844 
845     // 1111 where we should use min(A, B)
846     __m128i minMask = _mm_cmpge_epi16(C, maxAB);
847 
848     P = (P & (~minMask)) | (minAB & minMask);
849     P = (P & (~maxMask)) | (maxAB & maxMask);
850 
851     // Get back to 10-bit, clip
852 
853     __m128i Z = _mm_setzero_si128();
854     __m128i m1023 = _mm_set1_epi16(1023);
855     P = _mm_max_epi16(P, Z);
856     P = _mm_min_epi16(P, m1023);
857 
858     qoi10_rgba_t r;
859     _mm_storel_epi64(cast(__m128i*)&r, P);
860 
861     return r;
862 }
863 
864 /+
865 static short locoIntraPrediction(short a, short b, short c)
866 {
867     short max_ab = a > b ? a : b;
868     short min_ab = a < b ? a : b;
869     if (c >= max_ab)
870         return cast(short)min_ab;
871     else if (c <= min_ab)
872         return cast(short)max_ab;
873     else
874     {
875         int d = a + b - c;
876         if (d < 0)
877             d = 0;
878         if (d > 1023)
879             d = 1023;
880         return cast(short)d;
881     }
882 }
883 +/
884 
885 private __m128i _mm_cmple_epi16(__m128i a, __m128i b) pure @safe
886 {
887     return _mm_or_si128(_mm_cmplt_epi16(a, b), _mm_cmpeq_epi16(a, b));
888 }
889 
890 private __m128i _mm_cmpge_epi16(__m128i a, __m128i b)
891 {
892     return _mm_or_si128(_mm_cmpgt_epi16(a, b), _mm_cmpeq_epi16(a, b));
893 }