The OpenD Programming Language

1 module gamut.codecs.qoi2avg;
2 
3 nothrow @nogc:
4 
5 import core.stdc.stdlib: realloc, malloc, free;
6 import core.stdc.string: memset, memcpy;
7 
8 import inteli.emmintrin;
9 
10 /// Note: this is a translation of "QOI2" mods by @wbd73
11 /// revealed in https://github.com/nigeltao/qoi2-bikeshed/issues/34
12 /// Called "QOIX" in Gamut, since it has a few extensions again, such as LZ4.
13 
14 /* 
15 
16 QOI2 - Lossless image format inspired by QOI “Quite OK Image” format
17 
18 Incompatible adaptation of QOI format - https://phoboslab.org
19 
20 -- LICENSE: The MIT License(MIT)
21 Copyright(c) 2021 Dominic Szablewski (original QOI format)
22 Copyright(c) 2021 wbd73 @ GitHub (compression improvements)
23 Copyright(c) 2022 Guillaume Piolat (D translation, add pitch support)
24 
25 Permission is hereby granted, free of charge, to any person obtaining a copy of
26 this software and associated documentation files(the "Software"), to deal in
27 the Software without restriction, including without limitation the rights to
28 use, copy, modify, merge, publish, distribute, sublicense, and / or sell copies
29 of the Software, and to permit persons to whom the Software is furnished to do
30 so, subject to the following conditions :
31 The above copyright notice and this permission notice shall be included in all
32 copies or substantial portions of the Software.
33 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
34 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
35 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
36 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
37 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
38 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
39 SOFTWARE.
40 
41 
42 
43 -- Documentation
44 
45 This library provides the following functions;
46 - qoix_decode  -- decode the raw bytes of a QOIX image from memory
47 - qoix_encode  -- encode an rgb or rgba buffer into a QOIX image in memory
48 
49 See the function declaration below for the signature and more information.
50 
51 
52 -- Data Format
53 
54 A QOI2AVG file has a 25 byte header, compatible with Gamut QOIX.
55 Followed by any number of data "chunks" and a 4-byte end marker.
56 
57 struct qoix_header_t {
58     char     magic[4];         // magic bytes "qoix"
59     uint32_t width;            // image width in pixels (BE)
60     uint32_t height;           // image height in pixels (BE)
61     uint8_t  version_;         // Major version of QOIX format.
62     uint8_t  channels;         // 3 = RGB, 4 = RGBA (1 and 2 indicate QOI-plane codec, see qoiplane.d)
63     uint8_t  bitdepth;         // 8 = this qoi2avg codec is always 8-bit (10 indicates QOI-10 codec, see qoi10b.d)
64     uint8_t  colorspace;       // 0 = sRGB with linear alpha, 1 = all channels linear
65     uint8_t  compression;      // 0 = none, 1 = LZ4
66     float    pixelAspectRatio; // -1 = unknown, else Pixel Aspect Ratio
67     float    resolutionY;      // -1 = unknown, else vertical physical resolution in DPI
68 };
69 */
70 
// Byte offsets of individual fields inside the serialized QOIX header
// (magic, width and height take 4 bytes each, the version byte is at offset 12).
enum
{
    QOIX_HEADER_OFFSET_CHANNELS    = 13,
    QOIX_HEADER_OFFSET_BITDEPTH    = 14,
    QOIX_HEADER_OFFSET_COMPRESSION = 16,
}
74 
75 
76 /*
77 
78 The decoder and encoder start with {r: 0, g: 0, b: 0, a: 255} as the previous
79 pixel value. Pixels are either encoded as
80  - a run of the previous pixel
81  - an index into an array of previously seen pixels
82  - a difference to the previous pixel value in r,g,b
83  - full r,g,b or a or gray values
84 
85 The color channels are assumed to not be premultiplied with the alpha channel 
86 ("un-premultiplied alpha").
87 
88 Each chunk starts with a tag, followed by a number of data bits. The bit length
89 of chunks is divisible by 8 - i.e. all chunks are byte aligned. All values
90 encoded in these data bits have the most significant bit on the left.
91 
92 The byte stream's end is marked with 4 0xff bytes.
93 
94 A running FIFO array[64] (zero-initialized) of pixel values is maintained by the
95 encoder and decoder. Every pixel en-/decoded by the QOI_OP_LUMA (and variants),
96 QOI_OP_GRAY and QOI_OP_RGB chunks is written to this array. The write position
97 starts at 0 and is incremented with each pixel written. The position wraps back
98 to 0 when it reaches 64. I.e:
99     index[index_pos % 64] = current_pixel;
100     index_pos = index_pos + 1;
101 
102 An encoder can search this array for the current pixel value and, if a match is
103 found, emit a QOI_OP_INDEX with the position within the array.
104 
105 
106 The possible chunks are:
107 
108 
109 .- QOI_OP_INDEX ----------.
110 |         Byte[0]         |
111 |  7  6  5  4  3  2  1  0 |
112 |-------+-----------------|
113 |  1  0 |     index       |
114 `-------------------------`
115 2-bit tag b10
116 6-bit index into the color index array: 0..63
117 
118 
119 .- QOI_OP_LUMA -----(232)-. 
120 |         Byte[0]         |
121 |  7  6  5  4  3  2  1  0 |
122 |----+--------+-----+-----|
123 |  0 | g diff | drg | dbg |
124 `-------------------------`
125 1-bit tag b0
126 3-bit green channel difference from the reference -4..3
127 2-bit   red channel difference minus green channel difference -1..2 or -2..1
128 2-bit  blue channel difference minus green channel difference -1..2 or -2..1
129 
130 For the first line of pixels the reference is the previous pixel.
131 For the next lines, the reference is the pixel above for the first pixel of a
132 line, otherwise the LOCO-I prediction from the left, above and above-left pixels.
133 The green channel is used to indicate the general direction of change and is 
134 encoded in 3 bits. The red and blue channels (dr and db) base their diffs off
135 of the green channel difference and are encoded in 2 bits. I.e.:
136     dr_dg = (cur_px.r - ref.r) - (cur_px.g - ref.g)
137     db_dg = (cur_px.b - ref.b) - (cur_px.g - ref.g)
138 
139 The difference to the current channel values are using a wraparound operation, 
140 so "1 - 2" will result in 255, while "255 + 1" will result in 0.
141 
142 Values are stored as unsigned integers with a bias of 4 for the green channel 
143 and a bias of 1 or 2 for the red and blue channel depending on the direction
144 (sign bit) of the green channel.
145 
146 
147 .- QOI_OP_LUMA2 ------------------------------(454)-. 
148 |         Byte[0]         |         Byte[1]         |
149 |  7  6  5  4  3  2  1  0 |  7  6  5  4  3  2  1  0 |
150 |----------+--------------+-------------+-----------|
151 |  1  1  0 |  green diff  |   dr - dg   |  db - dg  |
152 `---------------------------------------------------`
153 3-bit tag b110
154 5-bit green channel difference from the reference -16..15
155 4-bit   red channel difference minus green channel difference -8..7
156 4-bit  blue channel difference minus green channel difference -8..7
157 
158 The green channel is used to indicate the general direction of change and is 
159 encoded in 5 bits. The red and blue channels (dr and db) base their diffs off
160 of the green channel difference and are encoded in 4 bits.
161 
162 Values are stored as unsigned integers with a bias of 16 for the green channel 
163 and a bias of 8 for the red and blue channel.
164 
165 
166 .- QOI_OP_LUMA3 ------------------------------------.-------------------(676)-. 
167 |         Byte[0]         |         Byte[1]         |         Byte[2]         |
168 |  7  6  5  4  3  2  1  0 |  7  6  5  4  3  2  1  0 |  7  6  5  4  3  2  1  0 |
169 |----------------+----------------------+-------------------+-----------------|
170 |  1  1  1  0  0 |     green diff       |      dr - dg      |     db - dg     |
171 `-----------------------------------------------------------------------------`
172 5-bit tag b11100
173 7-bit green channel difference from the reference -64..63
174 6-bit   red channel difference minus green channel difference -32..31
175 6-bit  blue channel difference minus green channel difference -32..31
176 
177 The green channel is used to indicate the general direction of change and is 
178 encoded in 7 bits. The red and blue channels (dr and db) base their diffs off
179 of the green channel difference and are encoded in 6 bits.
180 
181 Values are stored as unsigned integers with a bias of 64 for the green channel 
182 and a bias of 32 for the red and blue channel.
183 
184 
185 .- QOI_OP_RUN ------------.
186 |         Byte[0]         |
187 |  7  6  5  4  3  2  1  0 |
188 |----------------+--------|
189 |  1  1  1  1  0 |  run   |
190 `-------------------------`
191 5-bit tag b11110
192 3-bit run-length repeating the previous pixel: 1..8
193 
194 The run-length is stored with a bias of 1.
195 
196 
197 .- QOI_OP_RUN2 ---------------------.
198 |         Byte[0]         | Byte[1] |
199 |  7  6  5  4  3  2  1  0 | 7 .. 0  |
200 |-------------------+-----+---------|
201 |  1  1  1  1  1  0 |      run      |
202 `-----------------------------------`
203 6-bit tag b111110
204 10-bit run-length repeating the previous pixel: 1..1024
205 
206 The run-length is stored with a bias of 1.
207 
208 
209 .- QOI_OP_GRAY ---------------------.
210 |         Byte[0]         | Byte[1] |
211 |  7  6  5  4  3  2  1  0 | 7 .. 0  |
212 |-------------------------+---------|
213 |  1  1  1  1  1  1  0  0 |  gray   |
214 `-----------------------------------`
215 8-bit tag b11111100
216 8-bit gray channel value
217 
218 
219 .- QOI_OP_RGB ------------------------------------------.
220 |         Byte[0]         | Byte[1] | Byte[2] | Byte[3] |
221 |  7  6  5  4  3  2  1  0 | 7 .. 0  | 7 .. 0  | 7 .. 0  |
222 |-------------------------+---------+---------+---------|
223 |  1  1  1  1  1  1  0  1 |   red   |  green  |  blue   |
224 `-------------------------------------------------------`
225 8-bit tag b11111101
226 8-bit   red channel value
227 8-bit green channel value
228 8-bit  blue channel value
229 
230 
231 .- QOI_OP_A ------------------------.
232 |         Byte[0]         | Byte[1] |
233 |  7  6  5  4  3  2  1  0 | 7 .. 0  |
234 |-------------------------+---------|
235 |  1  1  1  1  1  1  1  0 |  alpha  |
236 `-----------------------------------`
237 8-bit tag b11111110
238 8-bit alpha channel value
239 
240 
241 .- QOI_OP_END ------------.
242 |         Byte[0]         |
243 |  7  6  5  4  3  2  1  0 |
244 |-------------------------|
245 |  1  1  1  1  1  1  1  1 |
246 `-------------------------`
247 8-bit tag b11111111
248 
249 
250 The byte stream is padded at the end with four 0xff bytes. Since the longest 
251 legal chunk is 4 bytes (QOI_OP_RGB), with this padding it is possible to check 
252 for an overrun only once per decode loop iteration. These 0xff bytes also mark 
253 the end of the data stream, as an encoder should never produce four consecutive
254 0xff bytes within the stream.
255 
256 */
257 
258 /* A pointer to a qoi_desc struct has to be supplied to all of qoi's functions. 
259 It describes either the input format (for qoi_write and qoi_encode), or is 
260 filled with the description read from the file header (for qoi_read and
261 qoi_decode).
262 
263 The colorspace in this qoi_desc is an enum where 
264     0 = sRGB, i.e. gamma scaled RGB channels and a linear alpha channel
265     1 = all channels are linear
266 You may use the constants QOI_SRGB or QOI_LINEAR. The colorspace is purely 
267 informative. It will be saved to the file header, but does not affect
268 en-/decoding in any way. */
269 
// Values accepted for the `colorspace` field of qoi_desc.
enum
{
    QOI_SRGB   = 0, // gamma scaled RGB channels, linear alpha
    QOI_LINEAR = 1, // every channel is linear
}
272 
/// Image description exchanged with the encoder and the decoder.
/// The encoder reads it; the decoder fills it from the file header.
struct qoi_desc
{
    /// Image width, in pixels.
    uint width;

    /// Image height, in pixels.
    uint height;

    /// Number of bytes between the start of two consecutive lines.
    int pitchBytes;

    /// Number of channels per pixel.
    ubyte channels;

    /// Bits per channel.
    ubyte bitdepth;

    /// QOI_SRGB or QOI_LINEAR.
    ubyte colorspace;

    /// One of the QOIX_COMPRESSION_xxx values.
    ubyte compression;

    /// Pixel Aspect Ratio, in Gamut format.
    float pixelAspectRatio;

    /// Vertical DPI, in Gamut format.
    float resolutionY;
}
285 
// Allocation primitives used by the codec; both map straight to the C runtime.
alias QOI_MALLOC = core.stdc.stdlib.malloc;
alias QOI_FREE   = core.stdc.stdlib.free;
288 
289 
// Chunk tags. The bit pattern next to each value shows the tag bits followed
// by 'x' for the payload bits stored in the same byte.
enum : int
{
    QOI_OP_LUMA  = 0x00, /* 0xxxxxxx */
    QOI_OP_INDEX = 0x80, /* 10xxxxxx */
    QOI_OP_LUMA2 = 0xc0, /* 110xxxxx */
    QOI_OP_LUMA3 = 0xe0, /* 11100xxx */
    QOI_OP_ADIFF = 0xe8, /* 11101xxx */
    QOI_OP_RUN   = 0xf0, /* 11110xxx */
    QOI_OP_RUN2  = 0xf8, /* 111110xx */
    QOI_OP_GRAY  = 0xfc, /* 11111100 */
    QOI_OP_RGB   = 0xfd, /* 11111101 */
    QOI_OP_RGBA  = 0xfe, /* 11111110 */
    QOI_OP_END   = 0xff, /* 11111111 */
}

// "qoix", read as a big-endian 32-bit integer.
enum uint QOIX_MAGIC = 0x71_6F_69_78;

// Serialized header size in bytes.
enum QOIX_HEADER_SIZE =
      4 /* magic */ + 4 /* width */ + 4 /* height */
    + 1 /* version */
    + 1 /* channels */ + 1 /* bitdepth */ + 1 /* colorspace */
    + 1 /* compression */
    + 4 /* PAR */ + 4 /* DPI */;

// Values of the header `compression` byte.
enum : ubyte
{
    QOIX_COMPRESSION_NONE = 0,
    QOIX_COMPRESSION_LZ4  = 1,
}
306 
/* Hash of a pixel value, in 0..1023. The encoder uses it to find a candidate
slot of the color index array without scanning the array linearly. */
uint QOI_COLOR_HASH(qoi_rgba_t C)
{
    // Multiplicative hashing: keep the top 10 bits of the low 32 bits of the product.
    enum uint multiplier = 2654435769U;
    uint product = C.v * multiplier; // wraps modulo 2^32
    uint slot = product >> 22;
    return slot & 1023;
}
313 
/* Upper bound on the number of pixels in an image. This implementation can
safely handle files up to 2GB; with a worst case of 5 bytes per pixel that
gives roughly 400 million pixels, rounded down to a clean value. */
enum uint QOIX_PIXELS_MAX = 400_000_000;
319 
/// One 8-bit RGBA pixel; the channels are stored in r, g, b, a order.
struct RGBA
{
    ubyte r;
    ubyte g;
    ubyte b;
    ubyte a;
}

// The codec loads and stores pixels as single 32-bit words.
static assert(RGBA.sizeof == 4);
325 
/// A pixel that can be accessed per channel (`rgba`) or as one 32-bit word (`v`),
/// the latter being used for fast comparison, copy and hashing.
struct qoi_rgba_t
{
    union
    {
        RGBA rgba; // per-channel view
        uint v;    // same four bytes seen as one integer
    }
}
334 
// End-of-stream marker appended after the last chunk: four 0xff bytes.
static immutable ubyte[4] qoi_padding = [0xff, 0xff, 0xff, 0xff];
336 
/// Stores `v` as 4 big-endian bytes at `bytes[*p]` and advances `*p` by 4.
void qoi_write_32(ubyte* bytes, int *p, uint v) 
{
    // Most significant byte first.
    for (int shift = 24; shift >= 0; shift -= 8)
    {
        bytes[*p] = cast(ubyte)(v >> shift);
        *p += 1;
    }
}
344 
/// Reads a big-endian 32-bit integer from `bytes[*p]` and advances `*p` by 4.
uint qoi_read_32(const(ubyte)* bytes, int *p) 
{
    uint value = 0;
    foreach (_; 0 .. 4)
    {
        // Bytes arrive most significant first.
        value = (value << 8) | bytes[*p];
        *p += 1;
    }
    return value;
}
353 
/// Stores the raw 32-bit pattern of `f` as 4 big-endian bytes at `bytes[*p]`
/// and advances `*p` by 4.
void qoi_write_32f(ubyte* bytes, int *p, float f) 
{
    union Bits
    {
        float asFloat;
        uint  asUint;
    }
    Bits bits;
    bits.asFloat = f;
    qoi_write_32(bytes, p, bits.asUint);
}
358 
/// Reads 4 big-endian bytes from `bytes[*p]`, advances `*p` by 4, and returns
/// them reinterpreted as a 32-bit float.
float qoi_read_32f(const(ubyte)* bytes, int *p) 
{
    union Bits
    {
        uint  asUint;
        float asFloat;
    }
    Bits bits;
    bits.asUint = qoi_read_32(bytes, p);
    return bits.asFloat;
}
364 
/* Encode raw RGB or RGBA pixels into a QOI2AVG image in memory.

`data` points to the first line of 8-bit pixels. Consecutive lines start
`desc.pitchBytes` bytes apart and each line holds `desc.width` pixels of
`desc.channels` (3 or 4) bytes. `desc.bitdepth` must be 8 and
`desc.compression` must be QOIX_COMPRESSION_NONE.

The function either returns null on failure (invalid parameters or malloc 
failed) or a pointer to the encoded data on success. On success the out_len 
is set to the size in bytes of the encoded data.

The returned qoi data should be free()d after use. */
version(encodeQOIX)
ubyte* qoix_encode(const(ubyte)* data, const(qoi_desc)* desc, int *out_len) 
{
    if (
        data == null || out_len == null || desc == null ||
        desc.width == 0 || desc.height == 0 ||
        desc.channels < 3 || desc.channels > 4 ||
        desc.colorspace > 1 ||
        desc.bitdepth != 8 ||
        desc.compression != QOIX_COMPRESSION_NONE ||
        desc.height >= QOIX_PIXELS_MAX / desc.width
    ) {
        return null;
    }

    int channels = desc.channels;
    assert (channels != 1 && channels != 2);

    // Before encoding a scanline, it is converted to RGBA8.
    // This is double buffered, to help with prediction.
    int converted_scanline_size = desc.width * 4;

    // Two RGBA8 scratch scanlines (current and previous) are stored right
    // after the area reserved for the encoded stream.
    int extraAllocSize = converted_scanline_size * 2;

    // Overallocate to make room for everything: at most channels + 1 bytes
    // per pixel, plus the header and the end marker.
    int max_size = desc.width * desc.height * (desc.channels + 1) + QOIX_HEADER_SIZE + cast(int)(qoi_padding.sizeof);

    // The sum is done in size_t: each term fits an int, but for very wide
    // images their sum may not.
    ubyte* bytes = cast(ubyte*) QOI_MALLOC(cast(size_t) max_size + cast(size_t) extraAllocSize);
    if (!bytes) 
    {
        return null;
    }

    // double-buffered scanline, this is intended to speed up encoding
    qoi_rgba_t* inputScanline     = cast(qoi_rgba_t*)(bytes + max_size);
    qoi_rgba_t* lastInputScanline = cast(qoi_rgba_t*)(bytes + max_size + converted_scanline_size);

    // Header
    int p = 0;
    qoi_write_32(bytes, &p, QOIX_MAGIC);
    qoi_write_32(bytes, &p, desc.width);
    qoi_write_32(bytes, &p, desc.height);
    bytes[p++] = 1; // Put a version number :)
    bytes[p++] = desc.channels; // 3, or 4
    bytes[p++] = desc.bitdepth; // 8 (checked above)
    bytes[p++] = desc.colorspace;
    bytes[p++] = QOIX_COMPRESSION_NONE;
    qoi_write_32f(bytes, &p, desc.pixelAspectRatio);
    qoi_write_32f(bytes, &p, desc.resolutionY);

    // Color index array, and hash -> slot lookup table for it.
    qoi_rgba_t[64] index;
    ubyte[1024] index_lookup;
    uint index_pos = 0;
    memset(index.ptr, 0, 64 * qoi_rgba_t.sizeof);
    index_lookup[] = 0;

    // Encoder state: the "previous pixel" starts as opaque black.
    int run = 0;
    qoi_rgba_t px, px_ref;
    px.rgba.r = 0;
    px.rgba.g = 0;
    px.rgba.b = 0;
    px.rgba.a = 255;

    // px_pos counts input samples; px_end is the position of the last pixel,
    // used to flush a run that is still pending at the end of the image.
    int px_pos = 0;
    int px_len = desc.width * desc.height * channels;
    int px_end = px_len - channels;

    for (int posy = 0; posy < desc.height; ++posy)
    {
        const(ubyte)* line = data + desc.pitchBytes * posy;

        // Convert one input scanline at once to rgba8
        if (desc.channels == 4)
        {
            // PERF: replace by pointer swap
            // Copy only the pixels of the line: the pitch may include padding
            // (or be negative) while the scratch line holds exactly width * 4 bytes.
            memcpy(inputScanline, line, converted_scanline_size);
        }
        else
        {
            assert(desc.channels == 3);
            for (int posx = 0; posx < desc.width; ++posx)
            {
                inputScanline[posx].rgba = RGBA(line[posx * 3 + 0], line[posx * 3 + 1], line[posx * 3 + 2], 255);
            }
        }

        for (int posx = 0; posx < desc.width; ++posx)
        {
            px_ref.v = px.v;
            px = inputScanline[posx];

            if (px.v == px_ref.v) {
                // Same as previous pixel: extend the run, flush it when it is
                // full or when this is the very last pixel.
                run++;
                if (run == 1024 || px_pos == px_end) {
                    run--;
                    bytes[p++] = QOI_OP_RUN2 | ((run >> 8) & 3);
                    bytes[p++] = run & 0xff;
                    run = 0;
                }
            }
            else {
                int hash = QOI_COLOR_HASH(px);

                // Flush the pending run, if any (stored with a bias of 1).
                if (run > 0) {
                    run--;
                    if (run < 8) {
                        bytes[p++] = cast(ubyte)(QOI_OP_RUN | run);
                    }
                    else {
                        bytes[p++] = QOI_OP_RUN2 | ((run >> 8) & 3);
                        bytes[p++] = run & 0xff;
                    }
                    run = 0;
                }

                if (index[index_lookup[hash]].v == px.v) {
                    bytes[p++] = QOI_OP_INDEX | index_lookup[hash];
                }
                else {
                    // Not in the index: record it, then encode the pixel itself.
                    index_lookup[hash] = cast(ubyte) index_pos;
                    index[index_pos] = px;
                    index_pos = (index_pos + 1) & 63;

                    byte va = cast(byte)(px.rgba.a - px_ref.rgba.a);

                    if (va) {
                        if (va >= -4 && va <= 3){
                            // Small alpha change: 1-byte prefix, the color chunk follows.
                            bytes[p++] = cast(ubyte)(QOI_OP_ADIFF | (va + 4));
                        } else { 
                            bytes[p++] = QOI_OP_RGBA; // make a grey + alpha opcode?
                            bytes[p++] = px.rgba.r;
                            bytes[p++] = px.rgba.g;
                            bytes[p++] = px.rgba.b;
                            bytes[p++] = px.rgba.a;
                            goto pixel_encoded;
                        }
                    }

                    // Note: computing this predictor for the whole scanline in advance, even with 2x pixels at once, was slower.
                    // because in normal times, you don't compute this predictor all the time.
                    if (posy > 0)
                    {
                        if (posx == 0)
                        {
                            // first pixel in the row, take above pixel
                            RGBA pred = lastInputScanline[posx].rgba;
                            px_ref.rgba.r = pred.r;
                            px_ref.rgba.g = pred.g;
                            px_ref.rgba.b = pred.b;
                        }
                        else 
                        {
                            // predict from the left, above and above-left pixels
                            RGBA pred = locoIntraPredictionSIMD(px_ref.rgba, lastInputScanline[posx].rgba, lastInputScanline[posx-1].rgba);
                            px_ref.rgba.r = pred.r;
                            px_ref.rgba.g = pred.g;
                            px_ref.rgba.b = pred.b;
                        }
                    }

                    // Differences to the reference, with wraparound. Red and
                    // blue are expressed relative to the green difference.
                    byte vg   = cast(byte)(px.rgba.g - px_ref.rgba.g);
                    byte vg_r = cast(byte)(px.rgba.r - px_ref.rgba.r - vg);
                    byte vg_b = cast(byte)(px.rgba.b - px_ref.rgba.b - vg);

                    if (
                        vg   >= -4 && vg   <  0 && 
                        vg_r >= -1 && vg_r <= 2 &&
                        vg_b >= -1 && vg_b <= 2
                    ) {
                        bytes[p++] = cast(ubyte)( QOI_OP_LUMA | (vg + 4) << 4 | (vg_r + 1) << 2 | (vg_b + 1) );
                    }
                    else if (
                        vg   >=  0 && vg   <= 3 && 
                        vg_r >= -2 && vg_r <= 1 &&
                        vg_b >= -2 && vg_b <= 1
                    ) {
                        bytes[p++] = cast(ubyte)( QOI_OP_LUMA | (vg + 4) << 4 | (vg_r + 2) << 2 | (vg_b + 2) );
                    }
                    else if (
                        px.rgba.g == px.rgba.r &&
                        px.rgba.g == px.rgba.b
                    ) {
                        bytes[p++] = QOI_OP_GRAY;
                        bytes[p++] = px.rgba.g;
                    }
                    else if (
                        vg_r >=  -8 && vg_r <=  7 && 
                        vg   >= -16 && vg   <= 15 && 
                        vg_b >=  -8 && vg_b <=  7
                    ) {
                        bytes[p++] = cast(ubyte)( QOI_OP_LUMA2    | (vg   + 16) );
                        bytes[p++] = cast(ubyte)( (vg_r + 8) << 4 | (vg_b +  8) );
                    }
                    else if (
                        vg_r >= -32 && vg_r <= 31 && 
                        vg   >= -64 && vg   <= 63 && 
                        vg_b >= -32 && vg_b <= 31
                    ) {
                        int dv = ((vg + 64) << 12) | ((vg_r + 32) << 6) | (vg_b + 32);
                        bytes[p++] = QOI_OP_LUMA3 | ((dv >> 16) & 31);
                        bytes[p++] = (dv >> 8) & 255;
                        bytes[p++] = dv & 255;
                    } else {
                        bytes[p++] = QOI_OP_RGB;
                        bytes[p++] = px.rgba.r;
                        bytes[p++] = px.rgba.g;
                        bytes[p++] = px.rgba.b;
                    }
                }
            }

            pixel_encoded:

            px_pos += channels;
        }

        // swap input scanline buffers
        {
            qoi_rgba_t* temp = inputScanline;
            inputScanline = lastInputScanline;
            lastInputScanline = temp;
        }
    }

    // End marker
    for (int i = 0; i < cast(int)(qoi_padding.sizeof); i++) 
    {
        bytes[p++] = qoi_padding[i];
    }

    *out_len = p;
    return bytes;
}
613 
/* Decode a QOI2AVG image from memory.

`data` / `size` delimit the encoded file. `channels` selects the output
layout: 3 = RGB, 4 = RGBA, 0 = same number of channels as stored in the file.

The function either returns null on failure (invalid parameters, invalid
header or malloc failed) or a pointer to the decoded pixels. On success, the
qoi_desc struct is filled with the description from the file header, and
`desc.pitchBytes` is set to `desc.width * channels` (lines are tightly packed).

If the stream ends early (end marker or end of input reached before every
pixel was decoded), the remaining pixels repeat the last decoded pixel.

The returned pixel data should be free()d after use. */
version(decodeQOIX)
ubyte* qoix_decode(const(void)* data, int size, qoi_desc *desc, int channels) {
    if (
        data == null || desc == null ||
        (channels != 0 && channels !=  3 && channels !=  4) ||
        size < QOIX_HEADER_SIZE + cast(int)(qoi_padding.sizeof)
    ) {
        return null;
    }

    const(ubyte)* bytes = cast(const(ubyte)*)data;
    int p = 0;

    // Header
    uint header_magic = qoi_read_32(bytes, &p);
    desc.width = qoi_read_32(bytes, &p);
    desc.height = qoi_read_32(bytes, &p);
    int qoix_version = bytes[p++];
    desc.channels = bytes[p++];
    desc.bitdepth = bytes[p++];
    desc.colorspace = bytes[p++];
    desc.compression = bytes[p++];
    desc.pixelAspectRatio = qoi_read_32f(bytes, &p);
    desc.resolutionY = qoi_read_32f(bytes, &p);

    if (
        desc.width == 0 || desc.height == 0 || 
        desc.channels < 3 || desc.channels > 4 ||
        desc.colorspace > 1 ||
        desc.bitdepth != 8 ||
        qoix_version > 1 ||
        desc.compression != QOIX_COMPRESSION_NONE ||
        header_magic != QOIX_MAGIC ||
        desc.height >= QOIX_PIXELS_MAX / desc.width
    ) {
        return null;
    }

    if (channels == 0) {
        channels = desc.channels;
    }
    assert(channels != 1 && channels != 2);

    int samplesPerRow = desc.width * channels;
    desc.pitchBytes = samplesPerRow;

    int pixel_data_size = desc.width * desc.height * channels;
    int decoded_scanline_size = desc.width * 4;

    // Output image followed by two RGBA8 scratch scanlines. The sum is done in
    // size_t: each term fits an int, but for very wide images their sum may not.
    ubyte* pixels = cast(ubyte *) QOI_MALLOC(cast(size_t) pixel_data_size + 2 * cast(size_t) decoded_scanline_size);
    if (!pixels) {
        return null;
    }

    // double-buffered scanline, this is intended to speed up decoding
    qoi_rgba_t* decodedScanline = cast(qoi_rgba_t*)(&pixels[pixel_data_size]);
    qoi_rgba_t* lastDecodedScanline = cast(qoi_rgba_t*)(&pixels[pixel_data_size + decoded_scanline_size]);

    // Decoder state: empty color index, "previous pixel" is opaque black.
    qoi_rgba_t[64] index;
    int index_pos = 0;
    memset(index.ptr, 0, 64 * qoi_rgba_t.sizeof);

    qoi_rgba_t px, px_ref;
    px.rgba.r = 0;
    px.rgba.g = 0;
    px.rgba.b = 0;
    px.rgba.a = 255;

    int run = 0;

    // A chunk may only start before the trailing padding; a chunk starting
    // there can read its (at most 4) payload bytes without leaving the input.
    int chunks_len = size - cast(int)(qoi_padding.sizeof);

    // Set once the end marker (or the end of input in the middle of a pixel)
    // was seen; from then on no more input is consumed.
    bool stream_ended = false;

    for (int posy = 0; posy < desc.height; ++posy)
    {
        for (int posx = 0; posx < desc.width; ++posx)
        {
            if (run > 0) 
            {
                run--;
            }
            else if (!stream_ended && p < chunks_len) 
            {
                px_ref.v = px.v;

                if (posy > 0)
                {
                    if (posx == 0)
                    {
                        // first pixel in the row, take above pixel
                        px_ref.rgba.r = lastDecodedScanline[posx].rgba.r;
                        px_ref.rgba.g = lastDecodedScanline[posx].rgba.g;
                        px_ref.rgba.b = lastDecodedScanline[posx].rgba.b;
                    }
                    else 
                    {
                        // Called I-LOCO intra prediction
                        RGBA pred = locoIntraPredictionSIMD(px.rgba, lastDecodedScanline[posx].rgba, lastDecodedScanline[posx-1].rgba);
                        px_ref.rgba.r = pred.r;
                        px_ref.rgba.g = pred.g;
                        px_ref.rgba.b = pred.b;
                    }
                }

                decode_op:

                int b1 = bytes[p++];
                if (b1 < 0x80) {        /* QOI_OP_LUMA */
                    int vg = ((b1 >> 4) & 7) - 4;
                    px.rgba.g = cast(ubyte)(px_ref.rgba.g + vg);
                    // The bias of the red/blue fields depends on the sign of vg.
                    if (vg < 0) {
                        px.rgba.r = cast(ubyte)( px_ref.rgba.r + vg - 1 + ((b1 >> 2) & 3) );
                        px.rgba.b = cast(ubyte)( px_ref.rgba.b + vg - 1 +  (b1 &  3) );
                    }
                    else {
                        px.rgba.r = cast(ubyte)( px_ref.rgba.r + vg - 2 + ((b1 >> 2) & 3) );
                        px.rgba.b = cast(ubyte)( px_ref.rgba.b + vg - 2 +  (b1 &  3) );
                    }
                    index[index_pos++ & 63] = px;
                }
                else if (b1 < 0xc0) {       /* QOI_OP_INDEX */
                    px = index[b1 & 63];
                }
                else if (b1 < 0xe0) {       /* QOI_OP_LUMA2 */
                    int b2 = bytes[p++];
                    int vg = (b1 & 0x1f) - 16;
                    px.rgba.r = cast(ubyte)( px_ref.rgba.r + vg - 8 + ((b2 >> 4) & 0x0f) );
                    px.rgba.g = cast(ubyte)( px_ref.rgba.g + vg );
                    px.rgba.b = cast(ubyte)( px_ref.rgba.b + vg - 8 +  (b2       & 0x0f) );
                    index[index_pos++ & 63] = px;
                }
                else if (b1 < 0xe8) {       /* QOI_OP_LUMA3 */
                    int dv = (b1 << 8) | bytes[p++];
                    dv = (dv << 8) | bytes[p++];
                    int vg = ((dv >> 12) & 0x7f) - 64;
                    px.rgba.r = cast(ubyte)( px_ref.rgba.r + vg + ((dv >> 6) & 0x3f) - 32 );
                    px.rgba.g = cast(ubyte)( px_ref.rgba.g + vg );
                    px.rgba.b = cast(ubyte)( px_ref.rgba.b + vg + (dv & 0x3f) - 32 );
                    index[index_pos++ & 63] = px;
                }
                else if (b1 < 0xf0) {       /* QOI_OP_ADIFF */
                    px.rgba.a += (b1 & 7) - 4;

                    // The color chunk for this pixel follows. Re-check the
                    // bound before reading it: a crafted chain of ADIFF bytes
                    // must not walk past the end of the input.
                    if (p < chunks_len)
                        goto decode_op;
                    stream_ended = true;
                }
                else if (b1 < 0xf8) {       /* QOI_OP_RUN */
                    run = b1 & 7;
                }
                else if (b1 < 0xfc) {       /* QOI_OP_RUN2 */
                    run = ((b1 & 3) << 8) | bytes[p++];
                }
                else if (b1 == QOI_OP_GRAY) {
                    ubyte vg = bytes[p++];
                    px.rgba.r = vg;
                    px.rgba.g = vg;
                    px.rgba.b = vg;
                    index[index_pos++ & 63] = px;
                }
                else if (b1 == QOI_OP_RGB) {
                    px.rgba.r = bytes[p++];
                    px.rgba.g = bytes[p++];
                    px.rgba.b = bytes[p++];
                    index[index_pos++ & 63] = px;
                }
                else if (b1 == QOI_OP_RGBA) {
                    px.rgba.r = bytes[p++];
                    px.rgba.g = bytes[p++];
                    px.rgba.b = bytes[p++];
                    px.rgba.a = bytes[p++];
                    index[index_pos++ & 63] = px;
                }
                else {              /* QOI_OP_END */
                    // Premature end marker. Keep writing the current pixel
                    // instead of leaving the rest of the scratch scanline
                    // (uninitialized heap memory) to be copied to the output.
                    stream_ended = true;
                }
            }

            decodedScanline[posx] = px;
        }

        // convert just-decoded scanline into output type
        ubyte* line = cast(ubyte*)(pixels + desc.pitchBytes * posy);

        switch(channels)
        {
            case 4:
                // No particular conversion to do
                memcpy(line, &decodedScanline[0], desc.width * 4);
                break;

            case 3:
                // Drop the alpha channel
                for (int posx = 0; posx < desc.width; ++posx)
                {
                    qoi_rgba_t decodedPx = decodedScanline[posx];
                    line[posx * 3 + 0] = decodedPx.rgba.r;
                    line[posx * 3 + 1] = decodedPx.rgba.g;
                    line[posx * 3 + 2] = decodedPx.rgba.b;
                }
                break;
            default:
                assert(false);
        }

        // swap decoded scanline buffers
        {
            qoi_rgba_t* temp = decodedScanline;
            decodedScanline = lastDecodedScanline;
            lastDecodedScanline = temp;
        }
    }

    return pixels;
}
837 
838 private:
839 
/* Perform LOCO-I prediction independently over the 4 channels.

Scalar equivalent, for one channel:

    int hi = a > b ? a : b;
    int lo = a < b ? a : b;
    if (c >= hi)
        return cast(ubyte) lo;
    else if (c <= lo)
        return cast(ubyte) hi;
    else
        return cast(ubyte)(a + b - c);

The callers pass the left pixel as `a`, the above pixel as `b` and the
above-left pixel as `c`.
*/

static RGBA locoIntraPredictionSIMD(RGBA a, RGBA b, RGBA c)
{
    __m128i zero = _mm_setzero_si128();

    // Load each RGBA8 pixel and widen its channels to 16-bit lanes.
    __m128i left      = _mm_unpacklo_epi8(_mm_loadu_si32(&a), zero);
    __m128i above     = _mm_unpacklo_epi8(_mm_loadu_si32(&b), zero);
    __m128i aboveLeft = _mm_unpacklo_epi8(_mm_loadu_si32(&c), zero);

    __m128i lo = _mm_min_epi16(left, above);
    __m128i hi = _mm_max_epi16(left, above);

    // Default prediction: left + above - aboveLeft
    __m128i gradient = _mm_sub_epi16(_mm_add_epi16(left, above), aboveLeft);

    // Lanes set to all ones where min(left, above) must be used instead...
    __m128i pickLo = _mm_cmpge_epi16(aboveLeft, hi);

    // ...and where max(left, above) must be used instead.
    __m128i pickHi = _mm_cmple_epi16(aboveLeft, lo);

    // Blend; the max selection is applied last.
    __m128i blended = (gradient & (~pickLo)) | (lo & pickLo);
    blended = (blended & (~pickHi)) | (hi & pickHi);

    // Narrow back to 8-bit with unsigned saturation and store the 4 channels.
    RGBA prediction;
    _mm_storeu_si32(&prediction, _mm_packus_epi16(blended, zero));
    return prediction;
}
895 
/// Per 16-bit signed lane: all ones where `a <= b`, else zero.
private __m128i _mm_cmple_epi16(__m128i a, __m128i b) pure @safe
{
    __m128i equal = _mm_cmpeq_epi16(a, b);
    __m128i below = _mm_cmplt_epi16(a, b);
    return _mm_or_si128(below, equal);
}
900 
/// Per 16-bit signed lane: all ones where `a >= b`, else zero.
private __m128i _mm_cmpge_epi16(__m128i a, __m128i b)
{
    __m128i equal = _mm_cmpeq_epi16(a, b);
    __m128i above = _mm_cmpgt_epi16(a, b);
    return _mm_or_si128(above, equal);
}