1 module gamut.codecs.qoi10b; 2 3 nothrow @nogc: 4 5 import core.stdc.stdlib: realloc, malloc, free; 6 import core.stdc.string: memset; 7 8 import gamut.codecs.qoi2avg; 9 import inteli.emmintrin; 10 11 /// A QOI-inspired codec for 10-bit images, called "QOI-10b". 12 /// Input image is 16-bit ushort, but only 10-bits gets encoded making it lossy. 13 /// 14 /// 15 /// Incompatible adaptation of QOI format - https://phoboslab.org 16 /// 17 /// -- LICENSE: The MIT License(MIT) 18 /// Copyright(c) 2021 Dominic Szablewski (original QOI format) 19 /// Copyright(c) 2022 Guillaume Piolat (QOI-10b variant for 10b images, 1/2/3/4 channels). 20 /// Permission is hereby granted, free of charge, to any person obtaining a copy of 21 /// this software and associated documentation files(the "Software"), to deal in 22 /// the Software without restriction, including without limitation the rights to 23 /// use, copy, modify, merge, publish, distribute, sublicense, and / or sell copies 24 /// of the Software, and to permit persons to whom the Software is furnished to do 25 /// so, subject to the following conditions : 26 /// The above copyright notice and this permission notice shall be included in all 27 /// copies or substantial portions of the Software. 28 /// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 29 /// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 30 /// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE 31 /// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 32 /// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 33 /// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 34 /// SOFTWARE. 35 36 /// -- Documentation 37 38 /// This library provides the following functions; 39 /// - qoi10b_decode -- decode the raw bytes of a QOI-10b image from memory 40 /// - qoi10b_encode -- encode an rgba buffer into a QOI-10b image in memory 41 /// 42 /// 43 /// A QOI-10b file has a 25 byte header, compatible with Gamut QOIX. 44 /// 45 /// struct qoix_header_t { 46 /// char magic[4]; // magic bytes "qoix" 47 /// uint32_t width; // image width in pixels (BE) 48 /// uint32_t height; // image height in pixels (BE) 49 /// uint8_t version_; // Major version of QOIX format. 50 /// uint8_t channels; // 1, 2, 3 or 4 51 /// uint8_t bitdepth; // 10 = this QOI-10b codec is always 10-bit (8 would indicate QOI2AVG or QOI-plane) 52 /// uint8_t colorspace; // 0 = sRGB with linear alpha, 1 = all channels linear 53 /// uint8_t compression; // 0 = none, 1 = LZ4 54 /// float pixelAspectRatio; // -1 = unknown, else Pixel Aspect Ratio 55 /// float resolutionX; // -1 = unknown, else physical resolution in DPI 56 /// }; 57 /// 58 /// The decoder and encoder start with {r: 0; g: 0; b: 0; a: 0} as the previous 59 /// pixel value. Pixels are either encoded as 60 /// - a run of the previous pixel 61 /// - a difference to the previous pixel value 62 /// - full luminance value 63 /// 64 /// The byte stream's end is marked with 5 0xff bytes. 65 /// 66 /// This codec is simply like QOI2AVG but with extra 2 bits for each components, and it breaks byte alignment. 67 /// 68 /// Optimized? Opcode Bits(RGB) Bits(grey) Meaning 69 /// [ ] QOI_OP_LUMA 14 6 0ggggg[rrrrbbbb] (less g or more g => doesn't work) 70 /// [ ] QOI_OP_LUMA0 12 6 10gggg[rrrbbb] 71 /// [ ] QOI_OP_LUMA2 22 10 110ggggggg[rrrrrrbbbbbb] 72 /// [ ] QOI_OP_LUMA3 30 14 11100ggggggggg[rrrrrrrrbbbbbbbb] 73 /// [ ] QOI_OP_ADIFF 10 10 11101xxxxx 74 /// [x] QOI_OP_RUN 8 8 11110xxx 75 /// 16 16 11110111xxxxxxxx 76 /// [ ] QOI_OP_ADIFF2 16 16 111110xxxxxxxx 77 /// [x] QOI_OP_GRAY 18 18 11111100gggggggggg 78 /// [ ] QOI_OP_RGB 38 18 11111101rrrrrrrrrr[ggggggggggbbbbbbbbbb] 79 /// [ ] QOI_OP_RGBA 48 28 11111110rrrrrrrrrr[ggggggggggbbbbbbbbbb]aaaaaaaaaa 80 /// [ ] QOI_OP_END 8 8 11111111 81 82 // Note: LOCO-I predictor doesn't really work here, it slowdown quite a bit for 0.5% bitrate improvement. 83 // Not sure what happens here. 84 85 enum ubyte QOI_OP_ADIFF2 = 0xf8; 86 87 enum int WORST_OPCODE_BITS = 48; 88 89 enum enableAveragePrediction = true; 90 91 enum bool newpred = false;//true; 92 93 static immutable ubyte[5] qoi10b_padding = [255,255,255,255,255]; 94 95 enum qoi10_rgba_t initialPredictor = { r:0, g:0, b:0, a:1023 }; 96 97 struct qoi10_rgba_t 98 { 99 // 0 to 1023 values 100 ushort r; 101 ushort g; 102 ushort b; 103 ushort a; 104 } 105 106 uint QOI10b_COLOR_HASH(qoi10_rgba_t C) 107 { 108 long colorAsLong = *(cast(long*)&C); 109 return (((colorAsLong * 2654435769) >> 22) & 1023); 110 } 111 112 /* Encode raw RGBA16 pixels into a QOI-10b image in memory. 113 This immediately loosen precision from 16-bit t 10-bit, so this is lossy. 114 The function either returns null on failure (invalid parameters or malloc 115 failed) or a pointer to the encoded data on success. On success the out_len 116 is set to the size in bytes of the encoded data. 117 The returned qoi data should be free()d after use. */ 118 version(encodeQOIX) 119 ubyte* qoi10b_encode(const(ubyte)* data, const(qoi_desc)* desc, int *out_len) 120 { 121 if ( (desc.channels != 1 && desc.channels != 2 && desc.channels != 3 && desc.channels != 4) || 122 desc.height >= QOIX_PIXELS_MAX / desc.width || desc.compression != QOIX_COMPRESSION_NONE 123 ) { 124 return null; 125 } 126 127 if (desc.bitdepth != 10) 128 return null; 129 130 int channels = desc.channels; 131 132 // At worst, each pixel take 38 bit to be encoded. 133 int num_pixels = desc.width * desc.height; 134 135 int scanline_size = cast(int)(qoi10_rgba_t.sizeof) * desc.width; 136 137 int max_size = cast(int) (cast(long)num_pixels * WORST_OPCODE_BITS + 7) / 8 + QOIX_HEADER_SIZE + cast(int)(qoi10b_padding.sizeof); 138 139 ubyte* stream; 140 141 int p = 0; // write index into output stream 142 ubyte* bytes = cast(ubyte*) QOI_MALLOC(max_size + 2 * scanline_size); 143 if (!bytes) 144 { 145 return null; 146 } 147 148 qoi_write_32(bytes, &p, QOIX_MAGIC); 149 qoi_write_32(bytes, &p, desc.width); 150 qoi_write_32(bytes, &p, desc.height); 151 bytes[p++] = 1; // Put a version number :) 152 bytes[p++] = desc.channels; // 1, 2, 3 or 4 153 bytes[p++] = desc.bitdepth; // 10 154 bytes[p++] = desc.colorspace; 155 bytes[p++] = QOIX_COMPRESSION_NONE; 156 qoi_write_32f(bytes, &p, desc.pixelAspectRatio); 157 qoi_write_32f(bytes, &p, desc.resolutionY); 158 159 int currentBit = 7; // beginning of a byte 160 bytes[p] = 0; 161 162 // write the nbits last bits of x, starting from the highest one 163 void outputBits(uint x, int nbits) nothrow @nogc 164 { 165 assert(nbits >= 2 && nbits <= 16); 166 assert( (nbits % 2) == 0); 167 168 for (int b = nbits - 2; b >= 0; b -= 2) 169 { 170 // which bit to write 171 ubyte pairOfBits = (x >>> b) & 3; 172 bytes[p] |= (pairOfBits << (currentBit - 1)); 173 174 currentBit -= 2; 175 176 if (currentBit == -1) 177 { 178 p++; 179 bytes[p] = 0; 180 currentBit = 7; 181 } 182 } 183 } 184 185 void outputByte(ubyte b) 186 { 187 outputBits(b, 8); 188 } 189 190 qoi10_rgba_t px = initialPredictor; 191 qoi10_rgba_t px_ref = initialPredictor; 192 193 // To serve as predictor 194 qoi10_rgba_t* scanlineConverted = cast(qoi10_rgba_t*)(&bytes[max_size]); 195 qoi10_rgba_t* lastScanlineConverted = cast(qoi10_rgba_t*)(&bytes[max_size + scanline_size]); 196 197 ubyte[1024] index_lookup; 198 uint index_pos = 0; 199 200 201 index_lookup[] = 0; 202 203 bool streamIsGrey = (channels == 1 || channels == 2); 204 int run = 0; 205 int encoded_pixels = 0; 206 207 void encodeRun() 208 { 209 assert(run > 0 && run <= 256); 210 run--; 211 if (run < 7) 212 { 213 outputByte( cast(ubyte)(QOI_OP_RUN | run) ); // run 1 to 7 214 } 215 else 216 { 217 outputByte( cast(ubyte)(QOI_OP_RUN | 7) ); // QOI_OP_RUN2 is inside the QOI_OP_RUN 218 outputBits(run - 7, 8); 219 } 220 run = 0; 221 } 222 223 for (int posy = 0; posy < desc.height; ++posy) 224 { 225 { 226 const(ushort)* line = cast(const(ushort*))(data + desc.pitchBytes * posy); 227 228 // 1. First convert the scanline to full qoi10_rgba_t to serve as predictor. 229 230 for (int posx = 0; posx < desc.width; ++posx) 231 { 232 qoi10_rgba_t pixel; 233 234 // Note that we drop six lower bits here. This codec is lossy 235 // if you really have more than 10-bits of precision. 236 // The use case is PBR knob in Dplug, who needs 10-bit (presumably) for the elevation map. 237 switch(channels) 238 { 239 default: 240 case 4: 241 pixel.r = line[posx * channels + 0]; 242 pixel.g = line[posx * channels + 1]; 243 pixel.b = line[posx * channels + 2]; 244 pixel.a = line[posx * channels + 3]; 245 break; 246 case 3: 247 pixel.r = line[posx * channels + 0]; 248 pixel.g = line[posx * channels + 1]; 249 pixel.b = line[posx * channels + 2]; 250 pixel.a = 65535; 251 break; 252 case 2: 253 pixel.r = line[posx * channels + 0]; 254 pixel.g = pixel.r; 255 pixel.b = pixel.r; 256 pixel.a = line[posx * channels + 1]; 257 break; 258 case 1: 259 pixel.r = line[posx * channels + 0]; 260 pixel.g = pixel.r; 261 pixel.b = pixel.r; 262 pixel.a = 65535; 263 break; 264 } 265 266 pixel.r = pixel.r >>> 6; 267 pixel.g = pixel.g >>> 6; 268 pixel.b = pixel.b >>> 6; 269 pixel.a = pixel.a >>> 6; 270 271 assert(pixel.r <= 1023); 272 assert(pixel.g <= 1023); 273 assert(pixel.b <= 1023); 274 assert(pixel.a <= 1023); 275 276 scanlineConverted[posx] = pixel; // Note: if we'd like lossy, this would be the reconstructed buffer. 277 } 278 } 279 280 for (int posx = 0; posx < desc.width; ++posx) 281 { 282 px_ref = px; 283 284 px = scanlineConverted[posx]; 285 286 if (px == px_ref) 287 { 288 run++; 289 if (run == 256 || encoded_pixels + 1 == num_pixels) 290 encodeRun(); 291 } 292 else 293 { 294 if (run > 0) 295 encodeRun(); 296 297 { 298 int va = (px.a - px_ref.a) & 1023; 299 if (va) 300 { 301 if (va < 16 || va >= (1024 - 16)) // does it fit on 5 bits? 302 { 303 // it fits on 5 bits 304 outputBits((0x1d << 5) | (va & 0x1f), 10); // QOI_OP_ADIFF 305 } 306 else if (va < 128 || va >= (1024 - 128)) // does it fit on 8 bits? 307 { 308 outputBits( (QOI_OP_ADIFF2 >>> 2), 6); 309 outputBits(va, 8); 310 } 311 else 312 { 313 outputByte(QOI_OP_RGBA); 314 outputBits(px.r, 10); 315 if (!streamIsGrey) 316 { 317 outputBits(px.g, 10); 318 outputBits(px.b, 10); 319 } 320 outputBits(px.a, 10); 321 goto pixel_is_encoded; 322 } 323 } 324 325 if (posy > 0 && enableAveragePrediction) 326 { 327 static if (newpred) 328 { 329 if (posx == 0) 330 { 331 // first pixel in the row, take above pixel 332 px_ref.r = lastScanlineConverted[posx].r; 333 px_ref.g = lastScanlineConverted[posx].g; 334 px_ref.b = lastScanlineConverted[posx].b; 335 } 336 else 337 { 338 qoi10_rgba_t pred = locoIntraPredictionSIMD(px_ref, lastScanlineConverted[posx], lastScanlineConverted[posx-1]); 339 px_ref.r = pred.r; 340 px_ref.g = pred.g; 341 px_ref.b = pred.b; 342 } 343 } 344 else 345 { 346 px_ref.r = (px_ref.r + lastScanlineConverted[posx].r + 1) >> 1; 347 px_ref.g = (px_ref.g + lastScanlineConverted[posx].g + 1) >> 1; 348 px_ref.b = (px_ref.b + lastScanlineConverted[posx].b + 1) >> 1; 349 } 350 } 351 352 int vg = (px.g - px_ref.g) & 1023; 353 int vg_r = (px.r - px_ref.r - vg) & 1023; 354 int vg_b = (px.b - px_ref.b - vg) & 1023; 355 356 if (streamIsGrey) 357 { 358 assert(vg_r == 0); 359 assert(vg_b == 0); 360 } 361 362 363 if ( ( (vg_r >= (1024- 4)) || (vg_r < 4) ) && // fits in 3 bits? 364 ( (vg >= (1024-8)) || (vg < 8) ) && // fits in 4 bits? 365 ( (vg_b >= (1024- 4)) || (vg_b < 4) ) ) // fits in 3 bits? 366 { 367 outputBits(0x20 | (vg & 0x0f), 6); // QOI_OP_LUMA0 368 if (!streamIsGrey) 369 { 370 outputBits( (vg_r << 3) | (vg_b & 7), 6); 371 } 372 } 373 else if ( ( (vg_r >= (1024- 8)) || (vg_r < 8) ) && // fits in 4 bits? 374 ( (vg >= (1024-16)) || (vg < 16) ) && // fits in 5 bits? 375 ( (vg_b >= (1024- 8)) || (vg_b < 8) ) ) // fits in 4 bits? 376 { 377 outputBits(vg & 0x1f, 6); // QOI_OP_LUMA 378 if (!streamIsGrey) 379 { 380 outputBits(vg_r, 4); 381 outputBits(vg_b, 4); 382 } 383 } 384 else if (!streamIsGrey && px.g == px.r && px.g == px.b) 385 { 386 // Note: in greyscale, more expensive than QOI_OP_LUMA2 387 // This opcode should not be used if input is grey. 388 outputByte(QOI_OP_GRAY); 389 outputBits(px.g, 10); 390 } 391 else 392 if ( ( (vg_r >= (1024-32)) || (vg_r < 32) ) && // fits in 6 bits? 393 ( (vg >= (1024-64)) || (vg < 64) ) && // fits in 7 bits? 394 ( (vg_b >= (1024-32)) || (vg_b < 32) ) ) // fits in 6 bits? 395 { 396 outputBits((0x6 << 7) | (vg & 0x7f), 10); // QOI_OP_LUMA2 397 if (!streamIsGrey) 398 { 399 outputBits(vg_r, 6); 400 outputBits(vg_b, 6); 401 } 402 } 403 else 404 if ( ( (vg_r >= (1024-128)) || (vg_r < 128) ) && // fits in 8 bits? 405 ( (vg >= (1024-256)) || (vg < 256) ) && // fits in 9 bits? 406 ( (vg_b >= (1024-128)) || (vg_b < 128) ) ) // fits in 8 bits? 407 { 408 outputBits((0x1c << 9) | (vg & 0x1ff), 14); // QOI_OP_LUMA3 409 if (!streamIsGrey) 410 { 411 outputBits(vg_r, 8); 412 outputBits(vg_b, 8); 413 } 414 } 415 else 416 { 417 outputByte(QOI_OP_RGB); 418 outputBits(px.r, 10); 419 if (!streamIsGrey) 420 { 421 outputBits(px.g, 10); 422 outputBits(px.b, 10); 423 } 424 } 425 } 426 } 427 428 pixel_is_encoded: 429 430 encoded_pixels++; 431 } 432 433 // Exchange scanline pointers 434 { 435 qoi10_rgba_t* temp = scanlineConverted; 436 scanlineConverted = lastScanlineConverted; 437 lastScanlineConverted = temp; 438 } 439 } 440 441 for (int i = 0; i < cast(int)(qoi10b_padding.sizeof); i++) 442 { 443 outputByte(qoi10b_padding[i]); 444 } 445 446 // finish the last byte 447 if (currentBit != 7) 448 outputBits(0xff, currentBit + 1); 449 assert(currentBit == 7); // full byte 450 451 *out_len = p; 452 return bytes; 453 } 454 455 /* Decode a QOI-10b image from memory. 456 457 The function either returns null on failure (invalid parameters or malloc 458 failed) or a pointer to the decoded 16-bit pixels. On success, the qoi_desc struct 459 is filled with the description from the file header. 460 461 The returned pixel data should be free()d after use. */ 462 version(decodeQOIX) 463 ubyte* qoi10b_decode(const(void)* data, int size, qoi_desc *desc, int channels) 464 { 465 const(ubyte)* bytes; 466 uint header_magic; 467 468 int p = 0, run = 0; 469 470 if (data == null || desc == null || 471 (channels < 0 || channels > 4) || 472 size < QOIX_HEADER_SIZE + cast(int)(qoi10b_padding.sizeof) 473 ) 474 { 475 return null; 476 } 477 478 bytes = cast(const(ubyte)*)data; 479 480 header_magic = qoi_read_32(bytes, &p); 481 desc.width = qoi_read_32(bytes, &p); 482 desc.height = qoi_read_32(bytes, &p); 483 int qoix_version = bytes[p++]; 484 desc.channels = bytes[p++]; 485 desc.bitdepth = bytes[p++]; 486 desc.colorspace = bytes[p++]; 487 desc.compression = bytes[p++]; 488 desc.pixelAspectRatio = qoi_read_32f(bytes, &p); 489 desc.resolutionY = qoi_read_32f(bytes, &p); 490 491 if (desc.width == 0 || desc.height == 0 || 492 desc.channels < 1 || desc.channels > 4 || 493 desc.colorspace > 1 || 494 desc.bitdepth != 10 || 495 qoix_version > 1 || 496 desc.compression != QOIX_COMPRESSION_NONE || 497 header_magic != QOIX_MAGIC || 498 desc.height >= QOIX_PIXELS_MAX / desc.width 499 ) 500 { 501 return null; 502 } 503 504 int streamChannels = desc.channels; 505 506 if (channels == 0) 507 channels = streamChannels; 508 509 int stride = desc.width * channels * 2; 510 desc.pitchBytes = stride; 511 512 int pixel_data_size = stride * desc.height; 513 int decoded_scanline_size = desc.width * cast(int)qoi10_rgba_t.sizeof; 514 515 ubyte* pixels = cast(ubyte*) QOI_MALLOC(pixel_data_size + 2 * decoded_scanline_size); 516 if (!pixels) { 517 return null; 518 } 519 520 // double-buffered scanline, for correct average predictors 521 // (else we can't decode 4/3 channels to 1/2 with average prediction, the predictors would be wrong 522 // if taken from the decoded output) 523 qoi10_rgba_t* decodedScanline = cast(qoi10_rgba_t*)(&pixels[pixel_data_size]); 524 qoi10_rgba_t* lastDecodedScanline = cast(qoi10_rgba_t*)(&pixels[pixel_data_size + decoded_scanline_size]); 525 526 assert(channels >= 1 && channels <= 4); 527 528 int currentBit = 7; 529 530 void rewindInputBit() nothrow @nogc 531 { 532 if (currentBit == 7) 533 { 534 p--; 535 currentBit = -1; 536 } 537 currentBit++; 538 } 539 540 int read2Bits() nothrow @nogc 541 { 542 ubyte bb = bytes[p]; 543 544 int bit = (bytes[p] >>> (currentBit - 1)) & 3; 545 546 currentBit -= 2; 547 if (currentBit == -1) 548 { 549 currentBit = 7; 550 p++; 551 } 552 return bit; 553 } 554 555 uint readBits(int nbits) nothrow @nogc 556 { 557 assert(nbits % 2 == 0); 558 uint r = 0; 559 for (int b = 0; b < nbits; b += 2) 560 { 561 r = (r << 2) | read2Bits(); 562 } 563 return r; 564 } 565 566 ubyte readByte() nothrow @nogc 567 { 568 return cast(ubyte) readBits(8); 569 } 570 571 bool streamIsGrey = (streamChannels == 1 || streamChannels == 2); 572 573 qoi10_rgba_t px = initialPredictor; 574 qoi10_rgba_t px_ref = initialPredictor; 575 576 int decoded_pixels = 0; 577 int num_pixels = desc.width * desc.height; 578 579 for (int posy = 0; posy < desc.height; ++posy) 580 { 581 for (int posx = 0; posx < desc.width; ++posx) 582 { 583 px_ref = px; 584 585 if (run > 0) 586 { 587 run--; 588 } 589 else if (decoded_pixels < num_pixels) 590 { 591 // Compute averaged predictors then 592 593 if (posy > 0 && enableAveragePrediction) 594 { 595 static if (newpred) 596 { 597 if (posx == 0) 598 { 599 // first pixel in the row, take above pixel 600 px_ref.r = lastDecodedScanline[posx].r; 601 px_ref.g = lastDecodedScanline[posx].g; 602 px_ref.b = lastDecodedScanline[posx].b; 603 } 604 else 605 { 606 qoi10_rgba_t pred = locoIntraPredictionSIMD(px_ref, lastDecodedScanline[posx], lastDecodedScanline[posx-1]); 607 px_ref.r = pred.r; 608 px_ref.g = pred.g; 609 px_ref.b = pred.b; 610 } 611 } 612 else 613 { 614 px_ref.r = (px_ref.r + lastDecodedScanline[posx].r + 1) >> 1; 615 px_ref.g = (px_ref.g + lastDecodedScanline[posx].g + 1) >> 1; 616 px_ref.b = (px_ref.b + lastDecodedScanline[posx].b + 1) >> 1; 617 } 618 } 619 620 decode_next_op: 621 622 ubyte op = readByte(); 623 624 if (op < 0x80) // QOI_OP_LUMA 625 { 626 int vg = (op >> 2) & 31; // vg is a signed 5-bit number 627 vg = (vg << 27) >> 27; // extends sign 628 px.g = (px_ref.g + vg ) & 1023; 629 630 if (!streamIsGrey) 631 { 632 int vg_r = ((op & 3) << 2) | readBits(2); // vg_r and vg_b are signed 4-bit number in the stream 633 vg_r = (vg_r << 28) >> 28; // extends sign 634 int vg_b = cast(int)(readBits(4) << 28) >> 28; 635 px.r = (px_ref.r + vg + vg_r) & 1023; 636 px.b = (px_ref.b + vg + vg_b) & 1023; 637 } 638 else 639 { 640 // Rewind two bits, this is always possible 641 rewindInputBit(); 642 rewindInputBit(); 643 px.r = px.g; 644 px.b = px.g; 645 } 646 } 647 else if (op < 0xc0) // QOI_OP_LUMA0 10xxxx 648 { 649 // 10gggg[rrrggg] 650 651 int vg = (op >> 2) & 15; // vg is a signed 4-bit number 652 vg = (vg << 28) >> 28; // extends sign 653 px.g = (px_ref.g + vg ) & 1023; 654 if (!streamIsGrey) 655 { 656 uint remain = readBits(4); 657 int vg_r = ((op & 3) << 1) | (remain >> 3); 658 int vg_b = remain & 7; 659 vg_r = (vg_r << 29) >> 29; 660 vg_b = (vg_b << 29) >> 29; 661 px.r = (px_ref.r + vg + vg_r) & 1023; 662 px.b = (px_ref.b + vg + vg_b) & 1023; 663 } 664 else 665 { 666 // Rewind two bits, this is always possible 667 rewindInputBit(); 668 rewindInputBit(); 669 px.r = px.g; 670 px.b = px.g; 671 } 672 } 673 else if (op < 0xe0) // QOI_OP_LUMA2 674 { 675 int vg = ((op & 31) << 2) | readBits(2); // vg is a signed 7-bit number 676 vg = (vg << 25) >> 25; // extends sign 677 px.g = (px_ref.g + vg ) & 1023; 678 if (!streamIsGrey) 679 { 680 int vg_r = cast(int)(readBits(6) << 26) >> 26; // vg_r and vg_b are signed 6-bit number in the stream 681 int vg_b = cast(int)(readBits(6) << 26) >> 26; 682 px.r = (px_ref.r + vg + vg_r) & 1023; 683 px.b = (px_ref.b + vg + vg_b) & 1023; 684 } 685 else 686 { 687 px.r = px.g; 688 px.b = px.g; 689 } 690 } 691 else if (op < 0xe8) // QOI_OP_LUMA3 692 { 693 int vg = ((op & 7) << 6) | readBits(6); // vg is a signed 9-bit number 694 vg = (vg << 23) >> 23; // extends sign 695 px.g = (px_ref.g + vg ) & 1023; 696 if (!streamIsGrey) 697 { 698 int vg_r = cast(int)(readBits(8) << 24) >> 24; // vg_r and vg_b are signed 8-bit number in the stream 699 int vg_b = cast(int)(readBits(8) << 24) >> 24; 700 px.r = (px_ref.r + vg + vg_r) & 1023; 701 px.b = (px_ref.b + vg + vg_b) & 1023; 702 } 703 else 704 { 705 px.r = px.g; 706 px.b = px.g; 707 } 708 } 709 else if (op < 0xf0) // QOI_OP_ADIFF 710 { 711 int adiff = ((op & 7) << 2) | readBits(2); 712 adiff = adiff << 27; // Need sign extension, else the negatives aren't negatives. 713 adiff = adiff >> 27; 714 px.a = cast(ushort)((px.a + adiff) & 1023); 715 goto decode_next_op; 716 } 717 else if ((op & 0xfc) == QOI_OP_ADIFF2) 718 { 719 int adiff = ((op & 3) << 6) | readBits(6); 720 adiff = (adiff << 24) >> 24; // sign-extend 721 px.a = cast(ushort)((px.a + adiff) & 1023); 722 goto decode_next_op; 723 } 724 else if (op < 0xf8) // QOI_OP_RUN 725 { 726 run = op & 7; 727 if (run == 7) 728 { 729 run = readBits(8) + 7; 730 } 731 } 732 else if (op == QOI_OP_RGB) 733 { 734 px.r = cast(ushort) readBits(10); 735 if (!streamIsGrey) 736 { 737 px.g = cast(ushort) readBits(10); 738 px.b = cast(ushort) readBits(10); 739 } 740 else 741 { 742 px.g = px.r; 743 px.b = px.r; 744 } 745 } 746 else if (op == QOI_OP_RGBA) 747 { 748 px.r = cast(ushort) readBits(10); 749 if (!streamIsGrey) 750 { 751 px.g = cast(ushort) readBits(10); 752 px.b = cast(ushort) readBits(10); 753 } 754 else 755 { 756 px.g = px.r; 757 px.b = px.r; 758 } 759 px.a = cast(ushort) readBits(10); 760 } 761 else if (op == QOI_OP_GRAY) 762 { 763 px.r = cast(ushort) readBits(10); 764 px.g = px.r; 765 px.b = px.r; 766 } 767 else if (op == QOI_OP_END) 768 { 769 goto finished; 770 } 771 else 772 { 773 assert(false); 774 } 775 } 776 777 decodedScanline[posx] = px; 778 decoded_pixels++; 779 } 780 781 // convert just-decoded scanline into output type 782 ushort* line = cast(ushort*)(pixels + desc.pitchBytes * posy); 783 784 // PERF: can be 4 different loops here? 785 for (int posx = 0; posx < desc.width; ++posx) 786 { 787 qoi10_rgba_t px16b = decodedScanline[posx]; // 0..1023 components 788 px16b.r = cast(ushort)(px16b.r << 6 | (px16b.r >> 4)); 789 px16b.g = cast(ushort)(px16b.g << 6 | (px16b.g >> 4)); 790 px16b.b = cast(ushort)(px16b.b << 6 | (px16b.b >> 4)); 791 px16b.a = cast(ushort)(px16b.a << 6 | (px16b.a >> 4)); 792 793 // Expand 10 bit to 16-bits 794 switch(channels) 795 { 796 default: 797 case 4: 798 line[posx * channels + 0] = px16b.r; 799 line[posx * channels + 1] = px16b.g; 800 line[posx * channels + 2] = px16b.b; 801 line[posx * channels + 3] = px16b.a; 802 break; 803 case 3: 804 line[posx * channels + 0] = px16b.r; 805 line[posx * channels + 1] = px16b.g; 806 line[posx * channels + 2] = px16b.b; 807 break; 808 case 2: 809 line[posx * channels + 0] = px16b.r; 810 line[posx * channels + 1] = px16b.a; 811 break; 812 case 1: 813 line[posx * channels + 0] = px16b.r; 814 break; 815 } 816 } 817 818 // swap decoded scanline buffers 819 { 820 qoi10_rgba_t* temp = decodedScanline; 821 decodedScanline = lastDecodedScanline; 822 lastDecodedScanline = temp; 823 } 824 } 825 826 finished: 827 return pixels; 828 } 829 830 static qoi10_rgba_t locoIntraPredictionSIMD(qoi10_rgba_t a, qoi10_rgba_t b, qoi10_rgba_t c) 831 { 832 // load RGBA16 pixels 833 __m128i A = _mm_loadu_si64(&a); 834 __m128i B = _mm_loadu_si64(&b); 835 __m128i C = _mm_loadu_si64(&c); 836 837 // Max predictor (A + B - C) 838 __m128i P = _mm_sub_epi16(_mm_add_epi16(A, B), C); 839 __m128i maxAB = _mm_max_epi16(A, B); 840 __m128i minAB = _mm_min_epi16(A, B); 841 842 // 1111 where we should use max(A, B) 843 __m128i maxMask = _mm_cmple_epi16(C, minAB); 844 845 // 1111 where we should use min(A, B) 846 __m128i minMask = _mm_cmpge_epi16(C, maxAB); 847 848 P = (P & (~minMask)) | (minAB & minMask); 849 P = (P & (~maxMask)) | (maxAB & maxMask); 850 851 // Get back to 10-bit, clip 852 853 __m128i Z = _mm_setzero_si128(); 854 __m128i m1023 = _mm_set1_epi16(1023); 855 P = _mm_max_epi16(P, Z); 856 P = _mm_min_epi16(P, m1023); 857 858 qoi10_rgba_t r; 859 _mm_storel_epi64(cast(__m128i*)&r, P); 860 861 return r; 862 } 863 864 /+ 865 static short locoIntraPrediction(short a, short b, short c) 866 { 867 short max_ab = a > b ? a : b; 868 short min_ab = a < b ? a : b; 869 if (c >= max_ab) 870 return cast(short)min_ab; 871 else if (c <= min_ab) 872 return cast(short)max_ab; 873 else 874 { 875 int d = a + b - c; 876 if (d < 0) 877 d = 0; 878 if (d > 1023) 879 d = 1023; 880 return cast(short)d; 881 } 882 } 883 +/ 884 885 private __m128i _mm_cmple_epi16(__m128i a, __m128i b) pure @safe 886 { 887 return _mm_or_si128(_mm_cmplt_epi16(a, b), _mm_cmpeq_epi16(a, b)); 888 } 889 890 private __m128i _mm_cmpge_epi16(__m128i a, __m128i b) 891 { 892 return _mm_or_si128(_mm_cmpgt_epi16(a, b), _mm_cmpeq_epi16(a, b)); 893 }