module gamut.codecs.qoi2avg;

nothrow @nogc:

import core.stdc.stdlib: realloc, malloc, free;
import core.stdc.string: memset, memcpy;

import inteli.emmintrin;

/// Note: this is a translation of "QOI2" mods by @wbd73
/// revealed in https://github.com/nigeltao/qoi2-bikeshed/issues/34
/// Called "QOIX" in Gamut, since it has a few extensions again, such as LZ4.

/*

QOI2 - Lossless image format inspired by QOI “Quite OK Image” format

Incompatible adaptation of QOI format - https://phoboslab.org

-- LICENSE: The MIT License(MIT)
Copyright(c) 2021 Dominic Szablewski (original QOI format)
Copyright(c) 2021 wbd73 @ GitHub (compression improvements)
Copyright(c) 2022 Guillaume Piolat (D translation, add pitch support)

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files(the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and / or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions :
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.



-- Documentation

This library provides the following functions;
- qoix_decode  -- decode the raw bytes of a QOIX image from memory
- qoix_encode  -- encode an rgb/rgba buffer into a QOIX image in memory

See the function declaration below for the signature and more information.


-- Data Format

A QOI2AVG file has a 25 byte header, compatible with Gamut QOIX.
Followed by any number of data "chunks" and a 4-byte end marker.

struct qoix_header_t {
    char     magic[4];         // magic bytes "qoix"
    uint32_t width;            // image width in pixels (BE)
    uint32_t height;           // image height in pixels (BE)
    uint8_t  version_;         // Major version of QOIX format.
    uint8_t  channels;         // 3 = RGB, 4 = RGBA (1 and 2 indicate QOI-plane codec, see qoiplane.d)
    uint8_t  bitdepth;         // 8 = this qoi2avg codec is always 8-bit (10 indicates QOI-10 codec, see qoi10b.d)
    uint8_t  colorspace;       // 0 = sRGB with linear alpha, 1 = all channels linear
    uint8_t  compression;      // 0 = none, 1 = LZ4
    float    pixelAspectRatio; // -1 = unknown, else Pixel Aspect Ratio
    float    resolutionY;      // -1 = unknown, else physical (vertical) resolution in DPI
};
*/

// Byte offsets of some single-byte fields inside the header described above.
enum QOIX_HEADER_OFFSET_CHANNELS = 13;
enum QOIX_HEADER_OFFSET_BITDEPTH = 14;
enum QOIX_HEADER_OFFSET_COMPRESSION = 16;


/*

The decoder and encoder start with {r: 0, g: 0, b: 0, a: 255} as the previous
pixel value. Pixels are either encoded as
 - a run of the previous pixel
 - an index into an array of previously seen pixels
 - a difference to a reference (predicted) pixel value in r,g,b
 - a small difference to the previous alpha value
 - full r,g,b or r,g,b,a or gray values

The color channels are assumed to not be premultiplied with the alpha channel
("un-premultiplied alpha").

Each chunk starts with a tag, followed by a number of data bits. The bit length
of chunks is divisible by 8 - i.e. all chunks are byte aligned. All values
encoded in these data bits have the most significant bit on the left.

The byte stream's end is marked with 4 0xff bytes.

A running FIFO array[64] (zero-initialized) of pixel values is maintained by the
encoder and decoder. Every pixel en-/decoded by the QOI_OP_LUMA (and variants),
QOI_OP_GRAY, QOI_OP_RGB and QOI_OP_RGBA chunks is written to this array. The
write position starts at 0 and is incremented with each pixel written. The
position wraps back to 0 when it reaches 64. I.e:
    index[index_pos % 64] = current_pixel;
    index_pos = index_pos + 1;

An encoder can search this array for the current pixel value and, if a match is
found, emit a QOI_OP_INDEX with the position within the array.


The possible chunks are:


.- QOI_OP_INDEX ----------.
|         Byte[0]         |
|  7  6  5  4  3  2  1  0 |
|-------+-----------------|
|  1  0 |     index       |
`-------------------------`
2-bit tag b10
6-bit index into the color index array: 0..63


.- QOI_OP_LUMA -----(232)-.
|         Byte[0]         |
|  7  6  5  4  3  2  1  0 |
|----+--------+-----+-----|
|  0 | g diff | drg | dbg |
`-------------------------`
1-bit tag b0
3-bit green channel difference from the reference -4..3
2-bit   red channel difference minus green channel difference -1..2 or -2..1
2-bit  blue channel difference minus green channel difference -1..2 or -2..1

For the first line of pixels the reference is the previous pixel.
For the next lines of pixels, the reference r,g,b is:
 - the pixel above, for the first pixel of a line,
 - else a LOCO-I style prediction computed from the previous (left) pixel,
   the pixel above and the pixel above-left (see locoIntraPredictionSIMD).
The alpha channel is never predicted, its reference is the previous pixel.

The green channel is used to indicate the general direction of change and is
encoded in 3 bits. The red and blue channels (dr and db) base their diffs off
of the green channel difference and are encoded in 2 bits. I.e.:
    dr_dg = (cur_px.r - ref.r) - (cur_px.g - ref.g)
    db_dg = (cur_px.b - ref.b) - (cur_px.g - ref.g)

The difference to the current channel values are using a wraparound operation,
so "1 - 2" will result in 255, while "255 + 1" will result in 0.

Values are stored as unsigned integers with a bias of 4 for the green channel
and a bias of 1 or 2 for the red and blue channel depending on the direction
(sign bit) of the green channel.


.- QOI_OP_LUMA2 ------------------------------(454)-.
|         Byte[0]         |         Byte[1]         |
|  7  6  5  4  3  2  1  0 |  7  6  5  4  3  2  1  0 |
|----------+--------------+-------------+-----------|
|  1  1  0 |  green diff  |   dr - dg   |  db - dg  |
`---------------------------------------------------`
3-bit tag b110
5-bit green channel difference from the reference -16..15
4-bit   red channel difference minus green channel difference -8..7
4-bit  blue channel difference minus green channel difference -8..7

The green channel is used to indicate the general direction of change and is
encoded in 5 bits. The red and blue channels (dr and db) base their diffs off
of the green channel difference and are encoded in 4 bits.

Values are stored as unsigned integers with a bias of 16 for the green channel
and a bias of 8 for the red and blue channel.


.- QOI_OP_LUMA3 --------------------------------------------------------(676)-.
|         Byte[0]         |         Byte[1]         |         Byte[2]         |
|  7  6  5  4  3  2  1  0 |  7  6  5  4  3  2  1  0 |  7  6  5  4  3  2  1  0 |
|----------------+----------------------+-------------------+-----------------|
|  1  1  1  0  0 |      green diff      |      dr - dg      |     db - dg     |
`-----------------------------------------------------------------------------`
5-bit tag b11100
7-bit green channel difference from the reference -64..63
6-bit   red channel difference minus green channel difference -32..31
6-bit  blue channel difference minus green channel difference -32..31

The green channel is used to indicate the general direction of change and is
encoded in 7 bits. The red and blue channels (dr and db) base their diffs off
of the green channel difference and are encoded in 6 bits.

Values are stored as unsigned integers with a bias of 64 for the green channel
and a bias of 32 for the red and blue channel.


.- QOI_OP_ADIFF ----------.
|         Byte[0]         |
|  7  6  5  4  3  2  1  0 |
|----------------+--------|
|  1  1  1  0  1 | a diff |
`-------------------------`
5-bit tag b11101
3-bit alpha channel difference from the previous pixel -4..3

The difference is stored with a bias of 4. This chunk only changes alpha; it is
always followed by another chunk that gives the r,g,b values of the same pixel.


.- QOI_OP_RUN ------------.
|         Byte[0]         |
|  7  6  5  4  3  2  1  0 |
|----------------+--------|
|  1  1  1  1  0 |  run   |
`-------------------------`
5-bit tag b11110
3-bit run-length repeating the previous pixel: 1..8

The run-length is stored with a bias of 1.


.- QOI_OP_RUN2 ---------------------.
|         Byte[0]         | Byte[1] |
|  7  6  5  4  3  2  1  0 | 7 .. 0  |
|-------------------+-----+---------|
|  1  1  1  1  1  0 |       run     |
`-----------------------------------`
6-bit tag b111110
10-bit run-length repeating the previous pixel: 1..1024

The run-length is stored with a bias of 1.


.- QOI_OP_GRAY ---------------------.
|         Byte[0]         | Byte[1] |
|  7  6  5  4  3  2  1  0 | 7 .. 0  |
|-------------------------+---------|
|  1  1  1  1  1  1  0  0 |  gray   |
`-----------------------------------`
8-bit tag b11111100
8-bit gray channel value


.- QOI_OP_RGB ------------------------------------------.
|         Byte[0]         | Byte[1] | Byte[2] | Byte[3] |
|  7  6  5  4  3  2  1  0 | 7 .. 0  | 7 .. 0  | 7 .. 0  |
|-------------------------+---------+---------+---------|
|  1  1  1  1  1  1  0  1 |   red   |  green  |  blue   |
`-------------------------------------------------------`
8-bit tag b11111101
8-bit   red channel value
8-bit green channel value
8-bit  blue channel value


.- QOI_OP_RGBA ---------------------------------------------------.
|         Byte[0]         | Byte[1] | Byte[2] | Byte[3] | Byte[4] |
|  7  6  5  4  3  2  1  0 | 7 .. 0  | 7 .. 0  | 7 .. 0  | 7 .. 0  |
|-------------------------+---------+---------+---------+---------|
|  1  1  1  1  1  1  1  0 |   red   |  green  |  blue   |  alpha  |
`-----------------------------------------------------------------`
8-bit tag b11111110
8-bit   red channel value
8-bit green channel value
8-bit  blue channel value
8-bit alpha channel value


.- QOI_OP_END ------------.
|         Byte[0]         |
|  7  6  5  4  3  2  1  0 |
|-------------------------|
|  1  1  1  1  1  1  1  1 |
`-------------------------`
8-bit tag b11111111


The byte stream is padded at the end with four 0xff bytes. Since the longest
legal chunk is 5 bytes (QOI_OP_RGBA), with this padding it is possible to check
for an overrun only once per decoded chunk. These 0xff bytes also mark the end
of the data stream, as an encoder never emits 0xff as the first byte of a chunk.

*/

/* A pointer to a qoi_desc struct has to be supplied to all of qoi's functions.
It describes either the input format (for qoix_encode), or is filled with the
description read from the file header (for qoix_decode).

The colorspace in this qoi_desc is an enum where
    0 = sRGB, i.e. gamma scaled RGB channels and a linear alpha channel
    1 = all channels are linear
You may use the constants QOI_SRGB or QOI_LINEAR. The colorspace is purely
informative. It will be saved to the file header, but does not affect
en-/decoding in any way. */

enum QOI_SRGB = 0;
enum QOI_LINEAR = 1;

struct qoi_desc
{
    uint width;             // image width in pixels
    uint height;            // image height in pixels
    int pitchBytes;         // number of bytes between start of lines.
    ubyte channels;         // 3 = RGB, 4 = RGBA
    ubyte bitdepth;         // must be 8 for this codec
    ubyte colorspace;       // QOI_SRGB or QOI_LINEAR
    ubyte compression;      // QOIX_COMPRESSION_xxx
    float pixelAspectRatio; // PAR, in Gamut format
    float resolutionY;      // Vertical DPI, in Gamut format
}

alias QOI_MALLOC = malloc;
alias QOI_FREE = free;


enum int QOI_OP_LUMA  = 0x00; /* 0xxxxxxx */
enum int QOI_OP_INDEX = 0x80; /* 10xxxxxx */
enum int QOI_OP_LUMA2 = 0xc0; /* 110xxxxx */
enum int QOI_OP_LUMA3 = 0xe0; /* 11100xxx */
enum int QOI_OP_ADIFF = 0xe8; /* 11101xxx */
enum int QOI_OP_RUN   = 0xf0; /* 11110xxx */
enum int QOI_OP_RUN2  = 0xf8; /* 111110xx */
enum int QOI_OP_GRAY  = 0xfc; /* 11111100 */
enum int QOI_OP_RGB   = 0xfd; /* 11111101 */
enum int QOI_OP_RGBA  = 0xfe; /* 11111110 */
enum int QOI_OP_END   = 0xff; /* 11111111 */

enum uint QOIX_MAGIC = 0x716F6978; // "qoix"
enum QOIX_HEADER_SIZE = 15 + 1 /* version */ + 4 /* PAR */ + 4 /* DPI */ + 1 /* compression */;
enum ubyte QOIX_COMPRESSION_NONE = 0;
enum ubyte QOIX_COMPRESSION_LZ4 = 1;

/* To not have to linearly search through the color index array, we use a hash
of the color value to quickly lookup the index position in a hash table.
The result is always in 0..1023. */
uint QOI_COLOR_HASH(qoi_rgba_t C)
{
    return (((C.v * 2654435769) >> 22) & 1023);
}

/* 2GB is the max file size that this implementation can safely handle. We guard
against anything larger than that, assuming the worst case with 5 bytes per
pixel, rounded down to a nice clean value. 400 million pixels ought to be
enough for anybody. */
enum uint QOIX_PIXELS_MAX = 400000000;

struct RGBA
{
    ubyte r, g, b, a;
}
static assert(RGBA.sizeof == 4);

/* One pixel, accessible either per channel (rgba) or as a single 32-bit
value (v), which makes comparing and hashing whole pixels cheap. */
struct qoi_rgba_t
{
    union
    {
        RGBA rgba;
        uint v;
    }
}

/* End-of-stream marker appended after the last chunk. */
static immutable ubyte[4] qoi_padding = [255,255,255,255];

/* Write `v` as 4 big-endian bytes at bytes[*p], and advance *p by 4. */
void qoi_write_32(ubyte* bytes, int *p, uint v)
{
    bytes[(*p)++] = (0xff000000 & v) >> 24;
    bytes[(*p)++] = (0x00ff0000 & v) >> 16;
    bytes[(*p)++] = (0x0000ff00 & v) >> 8;
    bytes[(*p)++] = (0x000000ff & v);
}

/* Read 4 big-endian bytes at bytes[*p] as an uint, and advance *p by 4. */
uint qoi_read_32(const(ubyte)* bytes, int *p)
{
    uint a = bytes[(*p)++];
    uint b = bytes[(*p)++];
    uint c = bytes[(*p)++];
    uint d = bytes[(*p)++];
    return a << 24 | b << 16 | c << 8 | d;
}

/* Write the bit pattern of `f` as 4 big-endian bytes, and advance *p by 4. */
void qoi_write_32f(ubyte* bytes, int *p, float f)
{
    qoi_write_32(bytes, p, *cast(uint*)&f);
}

/* Read 4 big-endian bytes and reinterpret them as a float; advance *p by 4. */
float qoi_read_32f(const(ubyte)* bytes, int *p)
{
    uint r = qoi_read_32(bytes, p);
    return *cast(float*)&r;
}

/* Encode raw RGB or RGBA pixels into a QOI2AVG image in memory.

`data` points to the first scanline; consecutive scanlines are separated by
desc.pitchBytes bytes, and each holds desc.width pixels of desc.channels bytes.

The function either returns null on failure (invalid parameters or malloc
failed) or a pointer to the encoded data on success. On success the out_len
is set to the size in bytes of the encoded data.

The returned qoi data should be free()d after use. */
version(encodeQOIX)
ubyte* qoix_encode(const(ubyte)* data, const(qoi_desc)* desc, int *out_len)
{
    if (
        data == null || out_len == null || desc == null ||
        desc.width == 0 || desc.height == 0 ||
        desc.channels < 3 || desc.channels > 4 ||
        desc.colorspace > 1 ||
        desc.bitdepth != 8 ||
        desc.compression != QOIX_COMPRESSION_NONE ||
        desc.height >= QOIX_PIXELS_MAX / desc.width
    ) {
        return null;
    }

    // width * height < QOIX_PIXELS_MAX was checked above, so both fit an int.
    const int width    = cast(int) desc.width;
    const int height   = cast(int) desc.height;
    const int channels = desc.channels;

    assert (channels != 1 && channels != 2);

    // Before encoding a scanline, it is converted to RGBA8.
    // This is double buffered, to help with prediction.
    const size_t converted_scanline_size = cast(size_t) width * 4;

    // 2 rgba8 scanlines are allocated for the need of encoding.
    const size_t extraAllocSize = converted_scanline_size * 2;

    // Overallocate to make room for everything: worst case is (channels + 1)
    // bytes per pixel. Sizes are computed in size_t, since the sum with the
    // scratch scanlines may not fit an int for very wide images.
    const size_t max_size = cast(size_t) width * height * (channels + 1)
                          + QOIX_HEADER_SIZE + qoi_padding.sizeof;

    // The scratch scanlines are accessed as qoi_rgba_t, keep them 4-byte aligned.
    const size_t scratchOffset = (max_size + 3) & ~cast(size_t) 3;
    if (extraAllocSize > size_t.max - scratchOffset)
    {
        return null;
    }

    ubyte* bytes = cast(ubyte*) QOI_MALLOC(scratchOffset + extraAllocSize);
    if (!bytes)
    {
        return null;
    }

    // double-buffered scanline: current row, and previous row (for prediction)
    qoi_rgba_t* inputScanline     = cast(qoi_rgba_t*)(bytes + scratchOffset);
    qoi_rgba_t* lastInputScanline = cast(qoi_rgba_t*)(bytes + scratchOffset + converted_scanline_size);

    int p = 0;
    qoi_write_32(bytes, &p, QOIX_MAGIC);
    qoi_write_32(bytes, &p, desc.width);
    qoi_write_32(bytes, &p, desc.height);
    bytes[p++] = 1; // Put a version number :)
    bytes[p++] = desc.channels; // 3, or 4
    bytes[p++] = desc.bitdepth; // 8 (checked above)
    bytes[p++] = desc.colorspace;
    bytes[p++] = QOIX_COMPRESSION_NONE;
    qoi_write_32f(bytes, &p, desc.pixelAspectRatio);
    qoi_write_32f(bytes, &p, desc.resolutionY);

    // index: FIFO of the last 64 pixels that were written with a "color" opcode.
    // index_lookup: color hash => position in index (may be stale, hence
    // the actual pixel comparison before emitting QOI_OP_INDEX).
    qoi_rgba_t[64] index;
    ubyte[1024] index_lookup = 0;
    uint index_pos = 0;
    memset(index.ptr, 0, 64 * qoi_rgba_t.sizeof);

    int run = 0;
    qoi_rgba_t px, px_ref;
    px.rgba.r = 0;
    px.rgba.g = 0;
    px.rgba.b = 0;
    px.rgba.a = 255;

    for (int posy = 0; posy < height; ++posy)
    {
        const(ubyte)* line = data + cast(ptrdiff_t) desc.pitchBytes * posy;

        // Convert one input scanline at once to rgba8
        if (channels == 4)
        {
            // PERF: replace by pointer swap
            // Only copy the pixels: pitchBytes may be larger than the row
            // (padding) or negative, while the scratch row is width * 4 bytes.
            memcpy(inputScanline, line, converted_scanline_size);
        }
        else
        {
            assert(channels == 3);
            for (int posx = 0; posx < width; ++posx)
            {
                inputScanline[posx].rgba = RGBA(line[posx * 3 + 0], line[posx * 3 + 1], line[posx * 3 + 2], 255);
            }
        }

        const bool lastRow = (posy + 1 == height);

        for (int posx = 0; posx < width; ++posx)
        {
            px_ref.v = px.v;
            px = inputScanline[posx];

            if (px.v == px_ref.v)
            {
                // Same as previous pixel: extend the run. It is flushed when
                // it reaches the longest encodable length, or at end of image.
                run++;
                const bool lastPixel = lastRow && (posx + 1 == width);
                if (run == 1024 || lastPixel)
                {
                    run--;
                    bytes[p++] = QOI_OP_RUN2 | ((run >> 8) & 3);
                    bytes[p++] = run & 0xff;
                    run = 0;
                }
                continue;
            }

            // Pixel differs: flush any pending run first.
            if (run > 0)
            {
                run--;
                if (run < 8)
                {
                    bytes[p++] = cast(ubyte)(QOI_OP_RUN | run);
                }
                else
                {
                    bytes[p++] = QOI_OP_RUN2 | ((run >> 8) & 3);
                    bytes[p++] = run & 0xff;
                }
                run = 0;
            }

            const uint hash = QOI_COLOR_HASH(px);

            if (index[index_lookup[hash]].v == px.v)
            {
                bytes[p++] = QOI_OP_INDEX | index_lookup[hash];
                continue;
            }

            // Every opcode below also stores the pixel in the index (like the decoder).
            index_lookup[hash] = cast(ubyte) index_pos;
            index[index_pos] = px;
            index_pos = (index_pos + 1) & 63;

            // Alpha is coded against the previous pixel, with wraparound.
            byte va = cast(byte)(px.rgba.a - px_ref.rgba.a);

            if (va)
            {
                if (va >= -4 && va <= 3)
                {
                    // Small alpha change, a color opcode follows for r,g,b.
                    bytes[p++] = cast(ubyte)(QOI_OP_ADIFF | (va + 4));
                }
                else
                {
                    bytes[p++] = QOI_OP_RGBA; // make a grey + alpha opcode?
                    bytes[p++] = px.rgba.r;
                    bytes[p++] = px.rgba.g;
                    bytes[p++] = px.rgba.b;
                    bytes[p++] = px.rgba.a;
                    continue;
                }
            }

            // Note: computing this predictor for the whole scanline in advance, even with 2x pixels at once, was slower.
            // because in normal times, you don't compute this predictor all the time.
            if (posy > 0)
            {
                if (posx == 0)
                {
                    // first pixel in the row, take above pixel
                    RGBA pred = lastInputScanline[posx].rgba;
                    px_ref.rgba.r = pred.r;
                    px_ref.rgba.g = pred.g;
                    px_ref.rgba.b = pred.b;
                }
                else
                {
                    // predict from left, above, and above-left pixels
                    RGBA pred = locoIntraPredictionSIMD(px_ref.rgba, lastInputScanline[posx].rgba, lastInputScanline[posx-1].rgba);
                    px_ref.rgba.r = pred.r;
                    px_ref.rgba.g = pred.g;
                    px_ref.rgba.b = pred.b;
                }
            }

            // Differences to the reference, with wraparound.
            byte vg   = cast(byte)(px.rgba.g - px_ref.rgba.g);
            byte vg_r = cast(byte)(px.rgba.r - px_ref.rgba.r - vg);
            byte vg_b = cast(byte)(px.rgba.b - px_ref.rgba.b - vg);

            if (
                vg   >= -4 && vg   <  0 &&
                vg_r >= -1 && vg_r <= 2 &&
                vg_b >= -1 && vg_b <= 2
            ) {
                bytes[p++] = cast(ubyte)( QOI_OP_LUMA | (vg + 4) << 4 | (vg_r + 1) << 2 | (vg_b + 1) );
            }
            else if (
                vg   >=  0 && vg   <= 3 &&
                vg_r >= -2 && vg_r <= 1 &&
                vg_b >= -2 && vg_b <= 1
            ) {
                bytes[p++] = cast(ubyte)( QOI_OP_LUMA | (vg + 4) << 4 | (vg_r + 2) << 2 | (vg_b + 2) );
            }
            else if (
                px.rgba.g == px.rgba.r &&
                px.rgba.g == px.rgba.b
            ) {
                bytes[p++] = QOI_OP_GRAY;
                bytes[p++] = px.rgba.g;
            }
            else if (
                vg_r >=  -8 && vg_r <=  7 &&
                vg   >= -16 && vg   <= 15 &&
                vg_b >=  -8 && vg_b <=  7
            ) {
                bytes[p++] = cast(ubyte)( QOI_OP_LUMA2 | (vg + 16) );
                bytes[p++] = cast(ubyte)( (vg_r + 8) << 4 | (vg_b +  8) );
            }
            else if (
                vg_r >= -32 && vg_r <= 31 &&
                vg   >= -64 && vg   <= 63 &&
                vg_b >= -32 && vg_b <= 31
            ) {
                int dv = ((vg + 64) << 12) | ((vg_r + 32) << 6) | (vg_b + 32);
                bytes[p++] = QOI_OP_LUMA3 | ((dv >> 16) & 31);
                bytes[p++] = (dv >> 8) & 255;
                bytes[p++] = dv & 255;
            }
            else
            {
                bytes[p++] = QOI_OP_RGB;
                bytes[p++] = px.rgba.r;
                bytes[p++] = px.rgba.g;
                bytes[p++] = px.rgba.b;
            }
        }

        // swap input scanline buffers
        {
            qoi_rgba_t* temp = inputScanline;
            inputScanline = lastInputScanline;
            lastInputScanline = temp;
        }
    }

    for (int i = 0; i < cast(int)(qoi_padding.sizeof); i++)
    {
        bytes[p++] = qoi_padding[i];
    }

    *out_len = p;
    return bytes;
}

/* Decode a QOI2AVG image from memory.

`channels` is the number of channels wanted in the output: 3, 4, or 0 to use
the channel count stored in the file header.

The function either returns null on failure (invalid parameters or malloc
failed) or a pointer to the decoded pixels. The qoi_desc struct is filled with
the description from the file header, and desc.pitchBytes is set to the
distance in bytes between two output scanlines (width * channels).

A truncated or corrupted stream does not fail: pixels that cannot be decoded
repeat the last decoded pixel.

The returned pixel data should be free()d after use. */
version(decodeQOIX)
ubyte* qoix_decode(const(void)* data, int size, qoi_desc *desc, int channels)
{
    if (
        data == null || desc == null ||
        (channels != 0 && channels != 3 && channels != 4) ||
        size < QOIX_HEADER_SIZE + cast(int)(qoi_padding.sizeof)
    ) {
        return null;
    }

    const(ubyte)* bytes = cast(const(ubyte)*)data;
    int p = 0;

    uint header_magic = qoi_read_32(bytes, &p);
    desc.width = qoi_read_32(bytes, &p);
    desc.height = qoi_read_32(bytes, &p);
    int qoix_version = bytes[p++];
    desc.channels = bytes[p++];
    desc.bitdepth = bytes[p++];
    desc.colorspace = bytes[p++];
    desc.compression = bytes[p++];
    desc.pixelAspectRatio = qoi_read_32f(bytes, &p);
    desc.resolutionY = qoi_read_32f(bytes, &p);

    if (
        desc.width == 0 || desc.height == 0 ||
        desc.channels < 3 || desc.channels > 4 ||
        desc.colorspace > 1 ||
        desc.bitdepth != 8 ||
        qoix_version > 1 ||
        desc.compression != QOIX_COMPRESSION_NONE ||
        header_magic != QOIX_MAGIC ||
        desc.height >= QOIX_PIXELS_MAX / desc.width
    ) {
        return null;
    }

    if (channels == 0) {
        channels = desc.channels;
    }

    assert(channels != 1 && channels != 2);

    // width * height < QOIX_PIXELS_MAX was checked above, so both fit an int,
    // and so does width * channels.
    const int width  = cast(int) desc.width;
    const int height = cast(int) desc.height;

    const size_t rowBytes = cast(size_t) width * channels;
    desc.pitchBytes = cast(int) rowBytes;

    const size_t pixel_data_size = rowBytes * height;
    const size_t decoded_scanline_size = cast(size_t) width * 4;

    // The scratch scanlines are accessed as qoi_rgba_t, keep them 4-byte aligned.
    const size_t scratchOffset = (pixel_data_size + 3) & ~cast(size_t) 3;

    ubyte* pixels = cast(ubyte *) QOI_MALLOC(scratchOffset + 2 * decoded_scanline_size);
    if (!pixels) {
        return null;
    }

    // double-buffered scanline, this is intended to speed up decoding
    qoi_rgba_t* decodedScanline     = cast(qoi_rgba_t*)(pixels + scratchOffset);
    qoi_rgba_t* lastDecodedScanline = cast(qoi_rgba_t*)(pixels + scratchOffset + decoded_scanline_size);

    qoi_rgba_t[64] index;
    memset(index.ptr, 0, 64 * qoi_rgba_t.sizeof);
    int index_pos = 0;

    qoi_rgba_t px, px_ref;
    px.rgba.r = 0;
    px.rgba.g = 0;
    px.rgba.b = 0;
    px.rgba.a = 255;

    int run = 0;

    // A chunk may only start before this position. Since a chunk is at most
    // 5 bytes long and 4 padding bytes follow, reading a whole chunk that
    // starts at p < chunks_len never goes past bytes[size - 1].
    const int chunks_len = size - cast(int)(qoi_padding.sizeof);

    for (int posy = 0; posy < height; ++posy)
    {
        for (int posx = 0; posx < width; ++posx)
        {
            if (run > 0)
            {
                run--;
            }
            else if (p < chunks_len)
            {
                px_ref.v = px.v;

                if (posy > 0)
                {
                    if (posx == 0)
                    {
                        // first pixel in the row, take above pixel
                        px_ref.rgba.r = lastDecodedScanline[posx].rgba.r;
                        px_ref.rgba.g = lastDecodedScanline[posx].rgba.g;
                        px_ref.rgba.b = lastDecodedScanline[posx].rgba.b;
                    }
                    else
                    {
                        // Called I-LOCO intra prediction
                        RGBA pred = locoIntraPredictionSIMD(px.rgba, lastDecodedScanline[posx].rgba, lastDecodedScanline[posx-1].rgba);
                        px_ref.rgba.r = pred.r;
                        px_ref.rgba.g = pred.g;
                        px_ref.rgba.b = pred.b;
                    }
                }

                int b1 = bytes[p++];

                // QOI_OP_ADIFF only updates alpha, and is followed by the
                // chunk that holds the color of the same pixel. The bound is
                // checked again before reading that chunk, so that a stream
                // made of ADIFF bytes cannot make us read past the input.
                bool truncated = false;
                while (b1 >= QOI_OP_ADIFF && b1 < QOI_OP_RUN)
                {
                    px.rgba.a += (b1 & 7) - 4;
                    if (p >= chunks_len)
                    {
                        truncated = true;
                        break;
                    }
                    b1 = bytes[p++];
                }

                if (truncated) {
                    // stream ended after ADIFF: keep previous color with updated alpha
                }
                else if (b1 < 0x80) {       /* QOI_OP_LUMA */
                    int vg = ((b1 >> 4) & 7) - 4;
                    px.rgba.g = cast(ubyte)(px_ref.rgba.g + vg);
                    if (vg < 0) {
                        px.rgba.r = cast(ubyte)( px_ref.rgba.r + vg - 1 + ((b1 >> 2) & 3) );
                        px.rgba.b = cast(ubyte)( px_ref.rgba.b + vg - 1 +  (b1 & 3) );
                    }
                    else {
                        px.rgba.r = cast(ubyte)( px_ref.rgba.r + vg - 2 + ((b1 >> 2) & 3) );
                        px.rgba.b = cast(ubyte)( px_ref.rgba.b + vg - 2 +  (b1 & 3) );
                    }
                    index[index_pos++ & 63] = px;
                }
                else if (b1 < 0xc0) {       /* QOI_OP_INDEX */
                    px = index[b1 & 63];
                }
                else if (b1 < 0xe0) {       /* QOI_OP_LUMA2 */
                    int b2 = bytes[p++];
                    int vg = (b1 & 0x1f) - 16;
                    px.rgba.r = cast(ubyte)( px_ref.rgba.r + vg - 8 + ((b2 >> 4) & 0x0f) );
                    px.rgba.g = cast(ubyte)( px_ref.rgba.g + vg );
                    px.rgba.b = cast(ubyte)( px_ref.rgba.b + vg - 8 +  (b2       & 0x0f) );
                    index[index_pos++ & 63] = px;
                }
                else if (b1 < 0xe8) {       /* QOI_OP_LUMA3 */
                    int dv = (b1 << 8) | bytes[p++];
                    dv = (dv << 8) | bytes[p++];
                    int vg = ((dv >> 12) & 0x7f) - 64;
                    px.rgba.r = cast(ubyte)( px_ref.rgba.r + vg + ((dv >> 6) & 0x3f) - 32 );
                    px.rgba.g = cast(ubyte)( px_ref.rgba.g + vg );
                    px.rgba.b = cast(ubyte)( px_ref.rgba.b + vg + ( dv       & 0x3f) - 32 );
                    index[index_pos++ & 63] = px;
                }
                else if (b1 < 0xf8) {       /* QOI_OP_RUN (ADIFF was consumed above) */
                    run = b1 & 7;
                }
                else if (b1 < 0xfc) {       /* QOI_OP_RUN2 */
                    run = ((b1 & 3) << 8) | bytes[p++];
                }
                else if (b1 == QOI_OP_GRAY) {
                    ubyte vg = bytes[p++];
                    px.rgba.r = vg;
                    px.rgba.g = vg;
                    px.rgba.b = vg;
                    index[index_pos++ & 63] = px;
                }
                else if (b1 == QOI_OP_RGB) {
                    px.rgba.r = bytes[p++];
                    px.rgba.g = bytes[p++];
                    px.rgba.b = bytes[p++];
                    index[index_pos++ & 63] = px;
                }
                else if (b1 == QOI_OP_RGBA) {
                    px.rgba.r = bytes[p++];
                    px.rgba.g = bytes[p++];
                    px.rgba.b = bytes[p++];
                    px.rgba.a = bytes[p++];
                    index[index_pos++ & 63] = px;
                }
                else {                      /* QOI_OP_END */
                    // End marker before the last pixel: consider the stream
                    // exhausted, remaining pixels repeat the last decoded one
                    // (instead of leaving stale scratch data in the output).
                    p = chunks_len;
                }
            }

            decodedScanline[posx] = px;
        }

        // convert just-decoded scanline into output type
        ubyte* line = pixels + rowBytes * posy;

        switch(channels)
        {
            case 4:
                // No particular conversion to do
                memcpy(line, &decodedScanline[0], decoded_scanline_size);
                break;

            case 3:
                // Drop alpha
                for (int posx = 0; posx < width; ++posx)
                {
                    qoi_rgba_t decodedPx = decodedScanline[posx];
                    line[posx * 3 + 0] = decodedPx.rgba.r;
                    line[posx * 3 + 1] = decodedPx.rgba.g;
                    line[posx * 3 + 2] = decodedPx.rgba.b;
                }
                break;

            default:
                assert(false);
        }

        // swap decoded scanline buffers
        {
            qoi_rgba_t* temp = decodedScanline;
            decodedScanline = lastDecodedScanline;
            lastDecodedScanline = temp;
        }
    }

    return pixels;
}

private:

/* Perform LOCO-I prediction independently over the 4 channels.
   a = left pixel, b = above pixel, c = above-left pixel.
   Per channel, this is equivalent to:

    int max_ab = a > b ? a : b;
    int min_ab = a < b ? a : b;
    if (c >= max_ab)
        return cast(ubyte)min_ab;
    else if (c <= min_ab)
        return cast(ubyte)max_ab;
    else
    {
        int d = a + b - c;
        if (d < 0)
            d = 0;
        if (d > 255)
            d = 255;
        return cast(ubyte)d;
    }
*/
static RGBA locoIntraPredictionSIMD(RGBA a, RGBA b, RGBA c)
{
    // load RGBA8 pixels
    __m128i A = _mm_loadu_si32(&a);
    __m128i B = _mm_loadu_si32(&b);
    __m128i C = _mm_loadu_si32(&c);

    // extend to 16-bits
    __m128i Z = _mm_setzero_si128();
    A = _mm_unpacklo_epi8(A, Z);
    B = _mm_unpacklo_epi8(B, Z);
    C = _mm_unpacklo_epi8(C, Z);

    // Max predictor (A + B - C)
    __m128i P = _mm_sub_epi16(_mm_add_epi16(A, B), C);
    __m128i maxAB = _mm_max_epi16(A, B);
    __m128i minAB = _mm_min_epi16(A, B);

    // 1111 where we should use max(A, B)
    __m128i maxMask = _mm_cmple_epi16(C, minAB);

    // 1111 where we should use min(A, B)
    __m128i minMask = _mm_cmpge_epi16(C, maxAB);

    P = (P & (~minMask)) | (minAB & minMask);
    P = (P & (~maxMask)) | (maxAB & maxMask);

    // Get back to u8 (with unsigned saturation)
    P = _mm_packus_epi16(P, Z);

    RGBA r;
    _mm_storeu_si32(&r, P);

    return r;
}

/* Per 16-bit signed lane: all ones where a <= b, else zero. */
private __m128i _mm_cmple_epi16(__m128i a, __m128i b) pure @safe
{
    return _mm_or_si128(_mm_cmplt_epi16(a, b), _mm_cmpeq_epi16(a, b));
}

/* Per 16-bit signed lane: all ones where a >= b, else zero. */
private __m128i _mm_cmpge_epi16(__m128i a, __m128i b) pure @safe
{
    return _mm_or_si128(_mm_cmpgt_epi16(a, b), _mm_cmpeq_epi16(a, b));
}