The OpenD Programming Language

1 // jpgd.h - C++ class for JPEG decompression.
2 // Rich Geldreich <richgel99@gmail.com>
3 // Alex Evans: Linear memory allocator (taken from jpge.h).
4 // v1.04, May. 19, 2012: Code tweaks to fix VS2008 static code analysis warnings (all looked harmless)
5 // D translation by Ketmar // Invisible Vector
6 //
7 // This is free and unencumbered software released into the public domain.
8 //
9 // Anyone is free to copy, modify, publish, use, compile, sell, or
10 // distribute this software, either in source code form or as a compiled
11 // binary, for any purpose, commercial or non-commercial, and by any
12 // means.
13 //
14 // In jurisdictions that recognize copyright laws, the author or authors
15 // of this software dedicate any and all copyright interest in the
16 // software to the public domain. We make this dedication for the benefit
17 // of the public at large and to the detriment of our heirs and
18 // successors. We intend this dedication to be an overt act of
19 // relinquishment in perpetuity of all present and future rights to this
20 // software under copyright law.
21 //
22 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
23 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
24 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
25 // IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
26 // OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
27 // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
28 // OTHER DEALINGS IN THE SOFTWARE.
29 //
30 // For more information, please refer to <http://unlicense.org/>
31 //
32 // Supports progressive and baseline sequential JPEG image files, and the most common chroma subsampling factors: Y, H1V1, H2V1, H1V2, and H2V2.
33 //
34 // Chroma upsampling quality: H2V2 is upsampled in the frequency domain, H2V1 and H1V2 are upsampled using point sampling.
35 // Chroma upsampling reference: "Fast Scheme for Image Size Change in the Compressed Domain"
36 // http://vision.ai.uiuc.edu/~dugad/research/dct/index.html
37 /**
38  * Loads a JPEG image from a memory buffer or a file.
39  * req_comps can be 1 (grayscale), 3 (RGB), or 4 (RGBA).
40  * On return, width/height will be set to the image's dimensions, and actual_comps will be set to either 1 (grayscale) or 3 (RGB).
41  * Requesting an 8 or 32bpp image is currently a little faster than 24bpp because the jpeg_decoder class itself currently always unpacks to either 8 or 32bpp.
42  */
43 /// JPEG image loading.
44 module gamut.codecs.jpegload;
45 
46 import gamut.types;
47 import gamut.internals.binop;
48 import core.stdc.string : memcpy, memset;
49 import core.stdc.stdlib : malloc, free;
50 import inteli.smmintrin;
51 
52 version(decodeJPEG):
53 
54 nothrow:
55 @nogc:
56 
57 // Set to 1 to enable freq. domain chroma upsampling on images using H2V2 subsampling (0=faster nearest neighbor sampling).
58 // This is slower, but results in higher quality on images with highly saturated colors.
59 version = JPGD_SUPPORT_FREQ_DOMAIN_UPSAMPLING;
60 
61 /// Input stream interface.
62 /// This function is called when the internal input buffer is empty.
63 /// Parameters:
64 ///   pBuf - input buffer
65 ///   max_bytes_to_read - maximum bytes that can be written to pBuf
66 ///   pEOF_flag - set this to true if at end of stream (no more bytes remaining)
67 ///   userData - user context for being used as closure.
68 ///   Returns -1 on error, otherwise return the number of bytes actually written to the buffer (which may be 0).
69 ///   Notes: This callback is a plain function pointer (not a delegate); it will be called in a loop until you set *pEOF_flag to true or the internal buffer is full.
70 alias JpegStreamReadFunc = int function(void* pBuf, int max_bytes_to_read, bool* pEOF_flag, void* userData);
71 
72 
73 // ////////////////////////////////////////////////////////////////////////// //
74 private:
75 
/// Allocates `nSize` bytes with the C runtime `malloc`.
/// The pointer is returned exactly as `malloc` produced it (so it may be null).
void *jpgd_malloc (size_t nSize)
{
    void* block = malloc(nSize);
    return block;
}
80 
/// Releases a block with the C runtime `free`.
/// The pointer is forwarded unchanged; no null check is performed here.
void jpgd_free (void *p)
{
    free(p);
}
85 
86 // Success/failure error codes.
87 alias jpgd_status = int;
// JPGD_SUCCESS / JPGD_FAILED / JPGD_DONE are the general results.
// The detailed codes auto-increment from JPGD_BAD_DHT_COUNTS = -256, so every
// one of them is negative (the last member, JPGD_NOTENOUGHMEM, is -224).
88 enum /*jpgd_status*/ {
89   JPGD_SUCCESS = 0, JPGD_FAILED = -1, JPGD_DONE = 1,
90   JPGD_BAD_DHT_COUNTS = -256, JPGD_BAD_DHT_INDEX, JPGD_BAD_DHT_MARKER, JPGD_BAD_DQT_MARKER, JPGD_BAD_DQT_TABLE,
91   JPGD_BAD_PRECISION, JPGD_BAD_HEIGHT, JPGD_BAD_WIDTH, JPGD_TOO_MANY_COMPONENTS,
92   JPGD_BAD_SOF_LENGTH, JPGD_BAD_VARIABLE_MARKER, JPGD_BAD_DRI_LENGTH, JPGD_BAD_SOS_LENGTH,
93   JPGD_BAD_SOS_COMP_ID, JPGD_W_EXTRA_BYTES_BEFORE_MARKER, JPGD_NO_ARITHMITIC_SUPPORT, JPGD_UNEXPECTED_MARKER,
94   JPGD_NOT_JPEG, JPGD_UNSUPPORTED_MARKER, JPGD_BAD_DQT_LENGTH, JPGD_TOO_MANY_BLOCKS,
95   JPGD_UNDEFINED_QUANT_TABLE, JPGD_UNDEFINED_HUFF_TABLE, JPGD_NOT_SINGLE_SCAN, JPGD_UNSUPPORTED_COLORSPACE,
96   JPGD_UNSUPPORTED_SAMP_FACTORS, JPGD_DECODE_ERROR, JPGD_BAD_RESTART_MARKER, JPGD_ASSERTION_ERROR,
97   JPGD_BAD_SOS_SPECTRAL, JPGD_BAD_SOS_SUCCESSIVE, JPGD_STREAM_READ, JPGD_NOTENOUGHMEM,
98 }
99 
// Compile-time limits of the decoder. These size the fixed arrays inside
// jpeg_decoder (e.g. the input buffer is JPGD_IN_BUF_SIZE+128 bytes and the
// per-component arrays have JPGD_MAX_COMPONENTS entries).
100 enum {
101   JPGD_IN_BUF_SIZE = 8192, JPGD_MAX_BLOCKS_PER_MCU = 10, JPGD_MAX_HUFF_TABLES = 8, JPGD_MAX_QUANT_TABLES = 4,
102   JPGD_MAX_COMPONENTS = 4, JPGD_MAX_COMPS_IN_SCAN = 4, JPGD_MAX_BLOCKS_PER_ROW = 8192, JPGD_MAX_HEIGHT = 16384, JPGD_MAX_WIDTH = 16384,
103 }
104 
105 // DCT coefficients are stored in this sequence.
// A permutation of 0..63: entry k is the position inside the 8x8 block
// (presumably row*8 + column) of the k-th coefficient in zigzag order.
106 static immutable int[64] g_ZAG = [  0,1,8,16,9,2,3,10,17,24,32,25,18,11,4,5,12,19,26,33,40,48,41,34,27,20,13,6,7,14,21,28,35,42,49,56,57,50,43,36,29,22,15,23,30,37,44,51,58,59,52,45,38,31,39,46,53,60,61,54,47,55,62,63 ];
107 
108 alias JPEG_MARKER = int;
// Marker codes (the byte that follows 0xFF in the stream, e.g. M_EOI = 0xD9).
// M_ERROR = 0x100 lies outside the byte range, so it can never collide with a
// marker read from the stream. RST0 has the same value as M_RST0.
109 enum /*JPEG_MARKER*/ {
110   M_SOF0  = 0xC0, M_SOF1  = 0xC1, M_SOF2  = 0xC2, M_SOF3  = 0xC3, M_SOF5  = 0xC5, M_SOF6  = 0xC6, M_SOF7  = 0xC7, M_JPG   = 0xC8,
111   M_SOF9  = 0xC9, M_SOF10 = 0xCA, M_SOF11 = 0xCB, M_SOF13 = 0xCD, M_SOF14 = 0xCE, M_SOF15 = 0xCF, M_DHT   = 0xC4, M_DAC   = 0xCC,
112   M_RST0  = 0xD0, M_RST1  = 0xD1, M_RST2  = 0xD2, M_RST3  = 0xD3, M_RST4  = 0xD4, M_RST5  = 0xD5, M_RST6  = 0xD6, M_RST7  = 0xD7,
113   M_SOI   = 0xD8, M_EOI   = 0xD9, M_SOS   = 0xDA, M_DQT   = 0xDB, M_DNL   = 0xDC, M_DRI   = 0xDD, M_DHP   = 0xDE, M_EXP   = 0xDF,
114   M_APP0  = 0xE0, M_APP15 = 0xEF, M_JPG0  = 0xF0, M_JPG13 = 0xFD, M_COM   = 0xFE, M_TEM   = 0x01, M_ERROR = 0x100, RST0   = 0xD0,
115 }
116 
117 alias JPEG_SUBSAMPLING = int;
// Scan layouts, numbered 0..4 in declaration order; jpeg_decoder.decode()
// switches on these values to pick the per-line colour conversion routine.
118 enum /*JPEG_SUBSAMPLING*/ { JPGD_GRAYSCALE = 0, JPGD_YH1V1, JPGD_YH2V1, JPGD_YH1V2, JPGD_YH2V2 };
119 
// Fixed-point parameters of the IDCT below.
// CONST_BITS: number of fractional bits in the FIX_* multipliers.
// PASS1_BITS: extra fractional bits kept in the row-pass output (Row.idct
//             descales by CONST_BITS-PASS1_BITS, Col.idct removes the rest).
// SCALEDONE:  the integer 1, shifted to build the rounding bias in DESCALE.
120 enum CONST_BITS = 13;
121 enum PASS1_BITS = 2;
122 enum SCALEDONE = cast(int)1;
123 
// Each FIX_a_b constant is round(a.b * 2^CONST_BITS), e.g. 0.541196100 * 8192 = 4433.
124 enum FIX_0_298631336 = cast(int)2446;  /* FIX(0.298631336) */
125 enum FIX_0_390180644 = cast(int)3196;  /* FIX(0.390180644) */
126 enum FIX_0_541196100 = cast(int)4433;  /* FIX(0.541196100) */
127 enum FIX_0_765366865 = cast(int)6270;  /* FIX(0.765366865) */
128 enum FIX_0_899976223 = cast(int)7373;  /* FIX(0.899976223) */
129 enum FIX_1_175875602 = cast(int)9633;  /* FIX(1.175875602) */
130 enum FIX_1_501321110 = cast(int)12299; /* FIX(1.501321110) */
131 enum FIX_1_847759065 = cast(int)15137; /* FIX(1.847759065) */
132 enum FIX_1_961570560 = cast(int)16069; /* FIX(1.961570560) */
133 enum FIX_2_053119869 = cast(int)16819; /* FIX(2.053119869) */
134 enum FIX_2_562915447 = cast(int)20995; /* FIX(2.562915447) */
135 enum FIX_3_072711026 = cast(int)25172; /* FIX(3.072711026) */
136 
/// Divides `x` by 2^n with rounding: half of the divisor
/// (`SCALEDONE << (n-1)`) is added before the right shift.
int DESCALE() (int x, int n)
{
    immutable int half = SCALEDONE << (n - 1);
    immutable int biased = x + half;
    return biased >> n;
}
141 
/// Like DESCALE, but additionally adds `128 << n` before shifting, i.e. the
/// result is the rounded value of `x / 2^n` plus 128.
int DESCALE_ZEROSHIFT() (int x, int n)
{
    pragma(inline, true);
    immutable int levelShift = 128 << n;
    immutable int half = SCALEDONE << (n - 1);
    return (x + levelShift + half) >> n;
}
146 
/// Saturates `i` to the range 0..255 and returns it as a byte.
ubyte CLAMP() (int i)
{
    immutable int bounded = (i < 0) ? 0 : (i > 255 ? 255 : i);
    return cast(ubyte)bounded;
}
153 
154 
155 // Compiler creates a fast path 1D IDCT for X non-zero columns
// Row pass of the 2D IDCT. `idct` reads one row of coefficients from
// pSrc[0..7] (only the first NONZERO_COLS entries are actually read; the
// others are replaced by the literal 0 at compile time) and writes eight
// ints to pTemp[0..7]. The results keep PASS1_BITS extra fractional bits.
156 struct Row(int NONZERO_COLS) {
157 pure nothrow @trusted @nogc:
158   static void idct(int* pTemp, const(jpeg_decoder.jpgd_block_t)* pSrc) {
159     static if (NONZERO_COLS == 0) {
160       // nothing: pTemp is left untouched for an all-zero row
161     } else static if (NONZERO_COLS == 1) {
      // Only the first coefficient is non-zero: all eight outputs are equal.
162       immutable int dcval = (pSrc[0] << PASS1_BITS);
163       pTemp[0] = dcval;
164       pTemp[1] = dcval;
165       pTemp[2] = dcval;
166       pTemp[3] = dcval;
167       pTemp[4] = dcval;
168       pTemp[5] = dcval;
169       pTemp[6] = dcval;
170       pTemp[7] = dcval;
171     } else {
172       // ACCESS_COL() will be optimized at compile time to either an array access, or 0.
173       //#define ACCESS_COL(x) (((x) < NONZERO_COLS) ? (int)pSrc[x] : 0)
      // The template yields a source string that is pasted in with mixin().
174       template ACCESS_COL(int x) {
175         static if (x < NONZERO_COLS) enum ACCESS_COL = "cast(int)pSrc["~x.stringof~"]"; else enum ACCESS_COL = "0";
176       }
177 
      // Even part: built from coefficients 0, 2, 4 and 6.
178       immutable int z2 = mixin(ACCESS_COL!2), z3 = mixin(ACCESS_COL!6);
179 
180       immutable int z1 = (z2 + z3)*FIX_0_541196100;
181       immutable int tmp2 = z1 + z3*(-FIX_1_847759065);
182       immutable int tmp3 = z1 + z2*FIX_0_765366865;
183 
184       immutable int tmp0 = (mixin(ACCESS_COL!0) + mixin(ACCESS_COL!4)) << CONST_BITS;
185       immutable int tmp1 = (mixin(ACCESS_COL!0) - mixin(ACCESS_COL!4)) << CONST_BITS;
186 
187       immutable int tmp10 = tmp0 + tmp3, tmp13 = tmp0 - tmp3, tmp11 = tmp1 + tmp2, tmp12 = tmp1 - tmp2;
188 
      // Odd part: built from coefficients 7, 5, 3 and 1.
189       immutable int atmp0 = mixin(ACCESS_COL!7), atmp1 = mixin(ACCESS_COL!5), atmp2 = mixin(ACCESS_COL!3), atmp3 = mixin(ACCESS_COL!1);
190 
191       immutable int bz1 = atmp0 + atmp3, bz2 = atmp1 + atmp2, bz3 = atmp0 + atmp2, bz4 = atmp1 + atmp3;
192       immutable int bz5 = (bz3 + bz4)*FIX_1_175875602;
193 
194       immutable int az1 = bz1*(-FIX_0_899976223);
195       immutable int az2 = bz2*(-FIX_2_562915447);
196       immutable int az3 = bz3*(-FIX_1_961570560) + bz5;
197       immutable int az4 = bz4*(-FIX_0_390180644) + bz5;
198 
199       immutable int btmp0 = atmp0*FIX_0_298631336 + az1 + az3;
200       immutable int btmp1 = atmp1*FIX_2_053119869 + az2 + az4;
201       immutable int btmp2 = atmp2*FIX_3_072711026 + az2 + az3;
202       immutable int btmp3 = atmp3*FIX_1_501321110 + az1 + az4;
203 
      // Combine even and odd parts; outputs k and 7-k share the same pair.
      // Descaling by CONST_BITS-PASS1_BITS leaves PASS1_BITS fractional bits.
204       pTemp[0] = DESCALE(tmp10 + btmp3, CONST_BITS-PASS1_BITS);
205       pTemp[7] = DESCALE(tmp10 - btmp3, CONST_BITS-PASS1_BITS);
206       pTemp[1] = DESCALE(tmp11 + btmp2, CONST_BITS-PASS1_BITS);
207       pTemp[6] = DESCALE(tmp11 - btmp2, CONST_BITS-PASS1_BITS);
208       pTemp[2] = DESCALE(tmp12 + btmp1, CONST_BITS-PASS1_BITS);
209       pTemp[5] = DESCALE(tmp12 - btmp1, CONST_BITS-PASS1_BITS);
210       pTemp[3] = DESCALE(tmp13 + btmp0, CONST_BITS-PASS1_BITS);
211       pTemp[4] = DESCALE(tmp13 - btmp0, CONST_BITS-PASS1_BITS);
212     }
213   }
214 }
215 
216 
217 // Compiler creates a fast path 1D IDCT for X non-zero rows
// Column pass of the 2D IDCT. `idct` reads one column of the row-pass output
// with a stride of 8 ints (pTemp[0], pTemp[8], ...; only the first
// NONZERO_ROWS entries are read, the rest are the literal 0) and writes eight
// clamped bytes with a stride of 8 (pDst_ptr[0], pDst_ptr[8], ...).
218 struct Col (int NONZERO_ROWS) {
219 pure nothrow @trusted @nogc:
220   static void idct(ubyte* pDst_ptr, const(int)* pTemp) {
221     static assert(NONZERO_ROWS > 0);
222     static if (NONZERO_ROWS == 1) {
      // Only the first row is non-zero: the whole column gets one value.
      // DESCALE_ZEROSHIFT also adds the +128 offset before clamping.
223       int dcval = DESCALE_ZEROSHIFT(pTemp[0], PASS1_BITS+3);
224       immutable ubyte dcval_clamped = cast(ubyte)CLAMP(dcval);
225       pDst_ptr[0*8] = dcval_clamped;
226       pDst_ptr[1*8] = dcval_clamped;
227       pDst_ptr[2*8] = dcval_clamped;
228       pDst_ptr[3*8] = dcval_clamped;
229       pDst_ptr[4*8] = dcval_clamped;
230       pDst_ptr[5*8] = dcval_clamped;
231       pDst_ptr[6*8] = dcval_clamped;
232       pDst_ptr[7*8] = dcval_clamped;
233     } else {
234       // ACCESS_ROW() will be optimized at compile time to either an array access, or 0.
235       //#define ACCESS_ROW(x) (((x) < NONZERO_ROWS) ? pTemp[x * 8] : 0)
      // The template yields a source string that is pasted in with mixin().
236       template ACCESS_ROW(int x) {
237         static if (x < NONZERO_ROWS) enum ACCESS_ROW = "pTemp["~(x*8).stringof~"]"; else enum ACCESS_ROW = "0";
238       }
239 
      // Even part: built from rows 0, 2, 4 and 6.
240       immutable int z2 = mixin(ACCESS_ROW!2);
241       immutable int z3 = mixin(ACCESS_ROW!6);
242 
243       immutable int z1 = (z2 + z3)*FIX_0_541196100;
244       immutable int tmp2 = z1 + z3*(-FIX_1_847759065);
245       immutable int tmp3 = z1 + z2*FIX_0_765366865;
246 
247       immutable int tmp0 = (mixin(ACCESS_ROW!0) + mixin(ACCESS_ROW!4)) << CONST_BITS;
248       immutable int tmp1 = (mixin(ACCESS_ROW!0) - mixin(ACCESS_ROW!4)) << CONST_BITS;
249 
250       immutable int tmp10 = tmp0 + tmp3, tmp13 = tmp0 - tmp3, tmp11 = tmp1 + tmp2, tmp12 = tmp1 - tmp2;
251 
      // Odd part: built from rows 7, 5, 3 and 1.
252       immutable int atmp0 = mixin(ACCESS_ROW!7), atmp1 = mixin(ACCESS_ROW!5), atmp2 = mixin(ACCESS_ROW!3), atmp3 = mixin(ACCESS_ROW!1);
253 
254       immutable int bz1 = atmp0 + atmp3, bz2 = atmp1 + atmp2, bz3 = atmp0 + atmp2, bz4 = atmp1 + atmp3;
255       immutable int bz5 = (bz3 + bz4)*FIX_1_175875602;
256 
257       immutable int az1 = bz1*(-FIX_0_899976223);
258       immutable int az2 = bz2*(-FIX_2_562915447);
259       immutable int az3 = bz3*(-FIX_1_961570560) + bz5;
260       immutable int az4 = bz4*(-FIX_0_390180644) + bz5;
261 
262       immutable int btmp0 = atmp0*FIX_0_298631336 + az1 + az3;
263       immutable int btmp1 = atmp1*FIX_2_053119869 + az2 + az4;
264       immutable int btmp2 = atmp2*FIX_3_072711026 + az2 + az3;
265       immutable int btmp3 = atmp3*FIX_1_501321110 + az1 + az4;
266 
      // Final descale removes CONST_BITS+PASS1_BITS+3 bits, adds the +128
      // offset, and every sample is clamped to 0..255 before being stored.
267       int i = DESCALE_ZEROSHIFT(tmp10 + btmp3, CONST_BITS+PASS1_BITS+3);
268       pDst_ptr[8*0] = cast(ubyte)CLAMP(i);
269 
270       i = DESCALE_ZEROSHIFT(tmp10 - btmp3, CONST_BITS+PASS1_BITS+3);
271       pDst_ptr[8*7] = cast(ubyte)CLAMP(i);
272 
273       i = DESCALE_ZEROSHIFT(tmp11 + btmp2, CONST_BITS+PASS1_BITS+3);
274       pDst_ptr[8*1] = cast(ubyte)CLAMP(i);
275 
276       i = DESCALE_ZEROSHIFT(tmp11 - btmp2, CONST_BITS+PASS1_BITS+3);
277       pDst_ptr[8*6] = cast(ubyte)CLAMP(i);
278 
279       i = DESCALE_ZEROSHIFT(tmp12 + btmp1, CONST_BITS+PASS1_BITS+3);
280       pDst_ptr[8*2] = cast(ubyte)CLAMP(i);
281 
282       i = DESCALE_ZEROSHIFT(tmp12 - btmp1, CONST_BITS+PASS1_BITS+3);
283       pDst_ptr[8*5] = cast(ubyte)CLAMP(i);
284 
285       i = DESCALE_ZEROSHIFT(tmp13 + btmp0, CONST_BITS+PASS1_BITS+3);
286       pDst_ptr[8*3] = cast(ubyte)CLAMP(i);
287 
288       i = DESCALE_ZEROSHIFT(tmp13 - btmp0, CONST_BITS+PASS1_BITS+3);
289       pDst_ptr[8*4] = cast(ubyte)CLAMP(i);
290     }
291   }
292 }
293 
294 
// 64 groups of 8 bytes. idct() picks the group at (block_max_zag - 1) * 8;
// byte r of that group selects the Row!(N) specialisation used for row r,
// i.e. how many leading coefficients of that row are treated as non-zero.
295 static immutable ubyte[512] s_idct_row_table = [
296   1,0,0,0,0,0,0,0, 2,0,0,0,0,0,0,0, 2,1,0,0,0,0,0,0, 2,1,1,0,0,0,0,0, 2,2,1,0,0,0,0,0, 3,2,1,0,0,0,0,0, 4,2,1,0,0,0,0,0, 4,3,1,0,0,0,0,0,
297   4,3,2,0,0,0,0,0, 4,3,2,1,0,0,0,0, 4,3,2,1,1,0,0,0, 4,3,2,2,1,0,0,0, 4,3,3,2,1,0,0,0, 4,4,3,2,1,0,0,0, 5,4,3,2,1,0,0,0, 6,4,3,2,1,0,0,0,
298   6,5,3,2,1,0,0,0, 6,5,4,2,1,0,0,0, 6,5,4,3,1,0,0,0, 6,5,4,3,2,0,0,0, 6,5,4,3,2,1,0,0, 6,5,4,3,2,1,1,0, 6,5,4,3,2,2,1,0, 6,5,4,3,3,2,1,0,
299   6,5,4,4,3,2,1,0, 6,5,5,4,3,2,1,0, 6,6,5,4,3,2,1,0, 7,6,5,4,3,2,1,0, 8,6,5,4,3,2,1,0, 8,7,5,4,3,2,1,0, 8,7,6,4,3,2,1,0, 8,7,6,5,3,2,1,0,
300   8,7,6,5,4,2,1,0, 8,7,6,5,4,3,1,0, 8,7,6,5,4,3,2,0, 8,7,6,5,4,3,2,1, 8,7,6,5,4,3,2,2, 8,7,6,5,4,3,3,2, 8,7,6,5,4,4,3,2, 8,7,6,5,5,4,3,2,
301   8,7,6,6,5,4,3,2, 8,7,7,6,5,4,3,2, 8,8,7,6,5,4,3,2, 8,8,8,6,5,4,3,2, 8,8,8,7,5,4,3,2, 8,8,8,7,6,4,3,2, 8,8,8,7,6,5,3,2, 8,8,8,7,6,5,4,2,
302   8,8,8,7,6,5,4,3, 8,8,8,7,6,5,4,4, 8,8,8,7,6,5,5,4, 8,8,8,7,6,6,5,4, 8,8,8,7,7,6,5,4, 8,8,8,8,7,6,5,4, 8,8,8,8,8,6,5,4, 8,8,8,8,8,7,5,4,
303   8,8,8,8,8,7,6,4, 8,8,8,8,8,7,6,5, 8,8,8,8,8,7,6,6, 8,8,8,8,8,7,7,6, 8,8,8,8,8,8,7,6, 8,8,8,8,8,8,8,6, 8,8,8,8,8,8,8,7, 8,8,8,8,8,8,8,8,
304 ];
305 
// Indexed by block_max_zag - 1 in idct(): the number of non-zero rows, which
// selects the Col!(N) specialisation used for all eight columns of a block.
306 static immutable ubyte[64] s_idct_col_table = [ 1, 1, 2, 3, 3, 3, 3, 3, 3, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 ];
307 
/// Inverse DCT of one 8x8 coefficient block.
///
/// Params:
///   pSrc_ptr      = 64 coefficients, 8 per row.
///   pDst_ptr      = receives 64 bytes, 8 per row.
///   block_max_zag = value in 1..64 used to look up, per row and per block,
///                   how many coefficients have to be treated as non-zero.
void idct() (const(jpeg_decoder.jpgd_block_t)* pSrc_ptr, ubyte* pDst_ptr, int block_max_zag) {
  assert(block_max_zag >= 1);
  assert(block_max_zag <= 64);

  // Only the first coefficient is present: the whole block is one flat value.
  if (block_max_zag <= 1) {
    immutable uint flat = CLAMP(((pSrc_ptr[0] + 4) >> 3) + 128);
    // Replicate the byte into all four lanes of a 32-bit word.
    immutable uint packed = flat | (flat << 8) | (flat << 16) | (flat << 24);
    foreach (row; 0 .. 8) {
      uint* pWords = cast(uint*)(pDst_ptr + row * 8);
      pWords[0] = packed;
      pWords[1] = packed;
    }
    return;
  }

  // Intermediate results of the row pass (zero-initialised by D, which the
  // Row!(0) case relies on because it writes nothing).
  int[64] workspace;

  // Pass 1: one 1D IDCT per row, specialised by the per-row table entry.
  const(ubyte)* rowKinds = &s_idct_row_table.ptr[(block_max_zag - 1) * 8];
  foreach (row; 0 .. 8) {
    const(jpeg_decoder.jpgd_block_t)* pIn = pSrc_ptr + row * 8;
    int* pOut = workspace.ptr + row * 8;
    switch (rowKinds[row]) {
      case 0: Row!(0).idct(pOut, pIn); break;
      case 1: Row!(1).idct(pOut, pIn); break;
      case 2: Row!(2).idct(pOut, pIn); break;
      case 3: Row!(3).idct(pOut, pIn); break;
      case 4: Row!(4).idct(pOut, pIn); break;
      case 5: Row!(5).idct(pOut, pIn); break;
      case 6: Row!(6).idct(pOut, pIn); break;
      case 7: Row!(7).idct(pOut, pIn); break;
      case 8: Row!(8).idct(pOut, pIn); break;
      default: assert(0);
    }
  }

  // Pass 2: one 1D IDCT per column; the same specialisation serves all columns.
  immutable int nonzero_rows = s_idct_col_table.ptr[block_max_zag - 1];
  foreach (col; 0 .. 8) {
    ubyte* pOutCol = pDst_ptr + col;
    const(int)* pInCol = workspace.ptr + col;
    switch (nonzero_rows) {
      case 1: Col!(1).idct(pOutCol, pInCol); break;
      case 2: Col!(2).idct(pOutCol, pInCol); break;
      case 3: Col!(3).idct(pOutCol, pInCol); break;
      case 4: Col!(4).idct(pOutCol, pInCol); break;
      case 5: Col!(5).idct(pOutCol, pInCol); break;
      case 6: Col!(6).idct(pOutCol, pInCol); break;
      case 7: Col!(7).idct(pOutCol, pInCol); break;
      case 8: Col!(8).idct(pOutCol, pInCol); break;
      default: assert(0);
    }
  }
}
377 
/// Inverse DCT that uses only the top-left 4x4 coefficients of the block:
/// four row passes with Row!(4), then eight column passes with Col!(4).
/// Writes 64 bytes (8 per row) to pDst_ptr.
void idct_4x4() (const(jpeg_decoder.jpgd_block_t)* pSrc_ptr, ubyte* pDst_ptr) {
  // Rows 4..7 of the workspace are never written; Col!(4) does not read them.
  int[64] workspace;

  foreach (row; 0 .. 4)
    Row!(4).idct(workspace.ptr + row * 8, pSrc_ptr + row * 8);

  foreach (col; 0 .. 8)
    Col!(4).idct(pDst_ptr + col, workspace.ptr + col);
}
398 
399 
400 // ////////////////////////////////////////////////////////////////////////// //
401 struct jpeg_decoder {
402 nothrow:
403 @nogc:
404 
405 private:
/// Returns `a` when `a < b`, otherwise `b`.
static auto JPGD_MIN(T) (T a, T b) pure nothrow @safe @nogc {
  pragma(inline, true);
  if (a < b) return a;
  return b;
}
/// Returns `a` when `a > b`, otherwise `b`.
static auto JPGD_MAX(T) (T a, T b) pure nothrow @safe @nogc {
  pragma(inline, true);
  if (a > b) return a;
  return b;
}
408 
409   alias jpgd_quant_t = short;
410   alias jpgd_block_t = short;
411   alias pDecode_block_func = bool function (ref jpeg_decoder, int, int, int); // return false on input error
412 
  // Decoding tables for one Huffman table, as consumed by huff_decode().
413   static struct huff_tables {
    // NOTE(review): presumably true for AC tables; not read in the code visible here.
414     bool ac_table;
    // Indexed by the top 8 bits of the bit buffer. A value that is negative
    // when read as int means "code longer than 8 bits, walk `tree`".
415     uint[256] look_up;
    // Same indexing; non-negative entries pack the symbol (bits 0-7), a bit
    // count (bits 8-12), the 0x8000 flag and pre-decoded extra bits (bits 16+).
416     uint[256] look_up2;
    // Indexed by symbol: number of bits consumed for that symbol's code.
417     ubyte[256] code_size;
    // Walked one bit at a time for long codes; negative entries are links.
418     uint[512] tree;
419   }
420 
  // Descriptor of a coefficient buffer referenced by m_dc_coeffs/m_ac_coeffs.
  // NOTE(review): field meanings below are inferred from the names only;
  // the code that fills and reads them is outside this view - confirm.
421   static struct coeff_buf {
422     ubyte* pData;                   // presumably the raw storage
423     int block_num_x, block_num_y;   // presumably the number of blocks per axis
424     int block_len_x, block_len_y;   // presumably the block dimensions
425     int block_size;                 // presumably the size of one block
426   }
427 
  // Header of one allocation in the list rooted at m_pMem_blocks.
  // NOTE(review): m_data[1] looks like the start of a variable-length payload
  // that follows the header; the allocator itself is outside this view - confirm.
428   static struct mem_block {
429     mem_block* m_pNext;     // next block in the list
430     size_t m_used_count;
431     size_t m_size;
432     char[1] m_data;
433   }
434 
435   mem_block* m_pMem_blocks;
436   int m_image_x_size;
437   int m_image_y_size;
438   JpegStreamReadFunc readfn;
439   void* userData;
440   int m_progressive_flag;
441   ubyte[JPGD_MAX_HUFF_TABLES] m_huff_ac;
442   ubyte*[JPGD_MAX_HUFF_TABLES] m_huff_num;      // pointer to number of Huffman codes per bit size
443   ubyte*[JPGD_MAX_HUFF_TABLES] m_huff_val;      // pointer to Huffman codes per bit size
444   jpgd_quant_t*[JPGD_MAX_QUANT_TABLES] m_quant; // pointer to quantization tables
445   int m_scan_type;                              // Gray, Yh1v1, Yh1v2, Yh2v1, Yh2v2 (CMYK111, CMYK4114 no longer supported)
446   int m_comps_in_frame;                         // # of components in frame
447   int[JPGD_MAX_COMPONENTS] m_comp_h_samp;       // component's horizontal sampling factor
448   int[JPGD_MAX_COMPONENTS] m_comp_v_samp;       // component's vertical sampling factor
449   int[JPGD_MAX_COMPONENTS] m_comp_quant;        // component's quantization table selector
450   int[JPGD_MAX_COMPONENTS] m_comp_ident;        // component's ID
451   int[JPGD_MAX_COMPONENTS] m_comp_h_blocks;
452   int[JPGD_MAX_COMPONENTS] m_comp_v_blocks;
453   int m_comps_in_scan;                          // # of components in scan
454   int[JPGD_MAX_COMPS_IN_SCAN] m_comp_list;      // components in this scan
455   int[JPGD_MAX_COMPONENTS] m_comp_dc_tab;       // component's DC Huffman coding table selector
456   int[JPGD_MAX_COMPONENTS] m_comp_ac_tab;       // component's AC Huffman coding table selector
457   int m_spectral_start;                         // spectral selection start
458   int m_spectral_end;                           // spectral selection end
459   int m_successive_low;                         // successive approximation low
460   int m_successive_high;                        // successive approximation high
461   int m_max_mcu_x_size;                         // MCU's max. X size in pixels
462   int m_max_mcu_y_size;                         // MCU's max. Y size in pixels
463   int m_blocks_per_mcu;
464   int m_max_blocks_per_row;
465   int m_mcus_per_row, m_mcus_per_col;
466   int[JPGD_MAX_BLOCKS_PER_MCU] m_mcu_org;
467   int m_total_lines_left;                       // total # lines left in image
468   int m_mcu_lines_left;                         // total # lines left in this MCU
469   int m_real_dest_bytes_per_scan_line;
470   int m_dest_bytes_per_scan_line;               // rounded up
471   int m_dest_bytes_per_pixel;                   // 4 (RGB) or 1 (Y)
472   huff_tables*[JPGD_MAX_HUFF_TABLES] m_pHuff_tabs;
473   coeff_buf*[JPGD_MAX_COMPONENTS] m_dc_coeffs;
474   coeff_buf*[JPGD_MAX_COMPONENTS] m_ac_coeffs;
475   int m_eob_run;
476   int[JPGD_MAX_COMPONENTS] m_block_y_mcu;
477   ubyte* m_pIn_buf_ofs;
478   int m_in_buf_left;
479   int m_tem_flag;
480   bool m_eof_flag;
481   ubyte[128] m_in_buf_pad_start;
482   ubyte[JPGD_IN_BUF_SIZE+128] m_in_buf;
483   ubyte[128] m_in_buf_pad_end;
484   int m_bits_left;
485   uint m_bit_buf;
486   int m_restart_interval;
487   int m_restarts_left;
488   int m_next_restart_num;
489   int m_max_mcus_per_row;
490   int m_max_blocks_per_mcu;
491   int m_expanded_blocks_per_mcu;
492   int m_expanded_blocks_per_row;
493   int m_expanded_blocks_per_component;
494   bool m_freq_domain_chroma_upsample;
495   int m_max_mcus_per_col;
496   uint[JPGD_MAX_COMPONENTS] m_last_dc_val;
497   jpgd_block_t* m_pMCU_coefficients;
498   int[JPGD_MAX_BLOCKS_PER_MCU] m_mcu_block_max_zag;
499   ubyte* m_pSample_buf;
500   int[256] m_crr;
501   int[256] m_cbb;
502   int[256] m_crg;
503   int[256] m_cbg;
504   ubyte* m_pScan_line_0;
505   ubyte* m_pScan_line_1;
506   jpgd_status m_error_code;
507   bool m_ready_flag;
508   int m_total_bytes_read;
509 
510   float m_pixelsPerInchX;
511   float m_pixelsPerInchY; // -1 if not available
512   float m_pixelAspectRatio; // -1 if not available
513 
514 public:
515   // Inspect `error_code` after constructing to determine if the stream is valid or not. You may look at the `width`, `height`, etc.
516   // methods after the constructor is called. You may then either destruct the object, or begin decoding the image by calling begin_decoding(), then decode() on each scanline.
/// Initialises the decoder through decode_init() using `rfn` as the input
/// callback and `userData` as its context.
/// `*err` is set to true when decode_init() reports failure, false otherwise.
this (JpegStreamReadFunc rfn, void* userData, bool *err)
{
  // MAYDO: On failure, there is an error code eventually to get more information
  const bool initialised = decode_init(rfn, userData);
  *err = !initialised; // for now, ignore that error code
}
523 
/// Releases everything the decoder allocated by calling free_all_blocks().
~this ()
{
  free_all_blocks();
}
525 
526   @disable this (this); // no copies
527 
528   // Call this method after constructing the object to begin decompression.
529   // If JPGD_SUCCESS is returned you may then call decode() on each scanline.
/// Starts decompression. Returns JPGD_SUCCESS immediately when decoding was
/// already started, JPGD_FAILED when an error code is pending, and otherwise
/// runs decode_start(), marks the decoder ready and returns JPGD_SUCCESS.
int begin_decoding () {
  if (!m_ready_flag) {
    if (m_error_code != 0) return JPGD_FAILED;

    decode_start();
    m_ready_flag = true;
  }
  return JPGD_SUCCESS;
}
538 
539   // Returns the next scan line.
540   // For grayscale images, pScan_line will point to a buffer containing 8-bit pixels (`bytes_per_pixel` will return 1).
541   // Otherwise, it will always point to a buffer containing 32-bit RGBA pixels (A will always be 255, and `bytes_per_pixel` will return 4).
542   // Returns JPGD_SUCCESS if a scan line has been returned.
543   // Returns JPGD_DONE if all scan lines have been returned.
544   // Returns JPGD_FAILED if an error occurred. Inspect `error_code` for a more info.
/// Produces the next scan line.
/// On JPGD_SUCCESS, `*pScan_line` points at the converted line (left unwritten
/// when m_scan_type is none of the known layouts) and `*pScan_line_len` holds
/// m_real_dest_bytes_per_scan_line. Returns JPGD_DONE when no lines are left
/// and JPGD_FAILED on error or when begin_decoding() has not succeeded.
int decode (/*const void** */void** pScan_line, uint* pScan_line_len) {
  if (m_error_code || !m_ready_flag) return JPGD_FAILED;
  if (m_total_lines_left == 0) return JPGD_DONE;

  // The previous MCU row is used up: fetch the next one.
  if (m_mcu_lines_left == 0) {
    if (m_progressive_flag) {
      load_next_row();
    } else if (!decode_next_row()) {
      return JPGD_FAILED;
    }

    // Find the EOI marker if that was the last row.
    if (m_total_lines_left <= m_max_mcu_y_size && !find_eoi())
      return JPGD_FAILED;

    m_mcu_lines_left = m_max_mcu_y_size;
  }

  if (m_freq_domain_chroma_upsample) {
    expanded_convert();
    *pScan_line = m_pScan_line_0;
  } else {
    // The H2V2 and H1V2 converters are only run when an even number of lines
    // remains in the MCU row; on odd counts the second line buffer is handed out.
    immutable bool evenLinesLeft = (m_mcu_lines_left & 1) == 0;

    switch (m_scan_type) {
      case JPGD_YH2V2:
        if (evenLinesLeft) H2V2Convert();
        *pScan_line = evenLinesLeft ? m_pScan_line_0 : m_pScan_line_1;
        break;
      case JPGD_YH2V1:
        H2V1Convert();
        *pScan_line = m_pScan_line_0;
        break;
      case JPGD_YH1V2:
        if (evenLinesLeft) H1V2Convert();
        *pScan_line = evenLinesLeft ? m_pScan_line_0 : m_pScan_line_1;
        break;
      case JPGD_YH1V1:
        H1V1Convert();
        *pScan_line = m_pScan_line_0;
        break;
      case JPGD_GRAYSCALE:
        gray_convert();
        *pScan_line = m_pScan_line_0;
        break;
      default:
        break;
    }
  }

  *pScan_line_len = m_real_dest_bytes_per_scan_line;
  --m_mcu_lines_left;
  --m_total_lines_left;
  return JPGD_SUCCESS;
}
607 
// Read-only views of the decoder state.
@property const pure nothrow @safe @nogc {
  /// Last status recorded by the decoder.
  jpgd_status error_code () { return m_error_code; }

  /// Image width.
  int width () { return m_image_x_size; }
  /// Image height.
  int height () { return m_image_y_size; }

  /// Number of components in the frame.
  int num_components () { return m_comps_in_frame; }

  /// Bytes per output pixel.
  int bytes_per_pixel () { return m_dest_bytes_per_pixel; }
  /// Image width multiplied by the bytes per output pixel.
  int bytes_per_scan_line () { return m_image_x_size * m_dest_bytes_per_pixel; }

  // Returns the total number of bytes actually consumed by the decoder (which should equal the actual size of the JPEG file).
  int total_bytes_read () { return m_total_bytes_read; }
}
622 
623 private:
624   // Retrieve one character from the input stream.
/// Reads one byte from the input buffer, refilling it when it is empty.
/// `*err` is set to true (and 0 is returned) when the refill fails, false
/// otherwise. When the refill succeeds but delivers nothing, the bytes
/// 0xFF and 0xD9 (EOI marker) are returned alternately.
uint get_char (bool* err) {
  *err = false;

  if (m_in_buf_left == 0) {
    // Try to get more bytes.
    if (!prep_in_buffer()) {
      *err = true;
      return 0;
    }

    // Still nothing to get: pad the end of the stream with 0xFF 0xD9.
    if (m_in_buf_left == 0) {
      immutable bool secondHalf = (m_tem_flag != 0);
      m_tem_flag ^= 1;
      return secondHalf ? 0xD9 : 0xFF;
    }
  }

  --m_in_buf_left;
  return *m_pIn_buf_ofs++;
}
647 
648   // Same as previous method, except can indicate if the character is a pad character or not.
/// Reads one byte like the single-argument overload and additionally reports
/// through `*pPadding_flag` whether the byte is end-of-stream padding
/// (0xFF / 0xD9) rather than real input.
/// When the refill fails, `*err` is set to true, 0 is returned and
/// `*pPadding_flag` is left unwritten.
uint get_char (bool* pPadding_flag, bool* err) {
  *err = false;

  if (m_in_buf_left == 0) {
    if (!prep_in_buffer()) {
      *err = true;
      return 0;
    }

    if (m_in_buf_left == 0) {
      *pPadding_flag = true;
      immutable bool secondHalf = (m_tem_flag != 0);
      m_tem_flag ^= 1;
      return secondHalf ? 0xD9 : 0xFF;
    }
  }

  *pPadding_flag = false;
  --m_in_buf_left;
  return *m_pIn_buf_ofs++;
}
669 
670   // Inserts a previously retrieved character back into the input buffer.
/// Pushes `q` back in front of the current read position: the read pointer
/// moves one byte backwards, the byte is stored there and the count of
/// available bytes grows by one.
void stuff_char (ubyte q) {
  m_pIn_buf_ofs -= 1;
  m_pIn_buf_ofs[0] = q;
  m_in_buf_left += 1;
}
675 
676   // Retrieves one character from the input stream, but does not read past markers. Will continue to return 0xFF when a marker is encountered.
/// Reads one byte of entropy-coded data without reading past a marker.
///
/// Returns the byte itself for anything but 0xFF. For 0xFF:
///   - 0xFF followed by 0x00 is a stuffed byte and yields a literal 0xFF;
///   - 0xFF followed by anything else is a marker: both bytes are pushed back
///     and 0xFF is returned, so repeated calls keep returning 0xFF;
///   - end-of-stream padding yields 0xFF without consuming real input.
///
/// The two-argument get_char overload must be used here: a call with a single
/// `bool*` selects get_char(bool* err), which would put the error flag (not
/// the padding flag) into `padding_flag`.
/// This function has no error channel of its own, so a failed read is treated
/// like end of data and yields 0xFF as well.
ubyte get_octet () {
  bool padding_flag;
  bool err;

  int c = get_char(&padding_flag, &err);
  if (err) return 0xFF;
  if (c != 0xFF) return cast(ubyte)(c);

  // 0xFF that is only padding: nothing real follows it.
  if (padding_flag) return 0xFF;

  c = get_char(&padding_flag, &err);
  if (err || padding_flag) {
    // Nothing usable after the 0xFF: put it back and report a marker.
    stuff_char(0xFF);
    return 0xFF;
  }

  // Stuffed zero: the 0xFF was data.
  if (c == 0x00) return 0xFF;

  // A real marker: restore both bytes so it stays in the stream.
  stuff_char(cast(ubyte)(c));
  stuff_char(0xFF);
  return 0xFF;
}
691 
692   // Retrieves a variable number of bits from the input stream. Does not recognize markers.
/// Returns the next `num_bits` bits (taken from the top of the bit buffer)
/// and advances the buffer; markers are not recognised.
/// `*err` is set to true and 0 is returned when a refill read fails; in that
/// case the bit buffer has already been partially advanced.
uint get_bits (int num_bits, bool* err) {
  *err = false;
  if (num_bits == 0) return 0;

  immutable uint result = m_bit_buf >> (32 - num_bits);
  m_bits_left -= num_bits;

  // Enough bits remain buffered: just drop the consumed ones.
  if (m_bits_left > 0) {
    m_bit_buf <<= num_bits;
    return result;
  }

  // Shift out only the bits that were still valid, then load 16 fresh bits
  // into the low half of the buffer.
  num_bits += m_bits_left;
  m_bit_buf <<= num_bits;

  immutable uint hi = get_char(err);
  if (*err) return 0;
  immutable uint lo = get_char(err);
  if (*err) return 0;

  m_bit_buf = (m_bit_buf & 0xFFFF0000) | (hi << 8) | lo;
  // Consume the remainder of the request out of the fresh bits.
  m_bit_buf <<= -m_bits_left;
  m_bits_left += 16;
  assert(m_bits_left >= 0);
  return result;
}
714 
715   // Retrieves a variable number of bits from the input stream. Markers will not be read into the input bit buffer. Instead, an infinite number of all 1's will be returned when a marker is encountered.
/// Returns the next `num_bits` bits like get_bits(), but never pulls a marker
/// into the bit buffer: refill bytes come from get_octet() whenever fewer than
/// two bytes are buffered or one of the next two bytes is 0xFF.
/// Note: `err` is accepted but never written by this function; callers that
/// test it afterwards see whatever value they stored before the call.
uint get_bits_no_markers (int num_bits, bool* err) {
  if (num_bits == 0) return 0;

  immutable uint result = m_bit_buf >> (32 - num_bits);
  m_bits_left -= num_bits;

  // Enough bits remain buffered: just drop the consumed ones.
  if (m_bits_left > 0) {
    m_bit_buf <<= num_bits;
    return result;
  }

  num_bits += m_bits_left;
  m_bit_buf <<= num_bits;

  if (m_in_buf_left >= 2 && m_pIn_buf_ofs[0] != 0xFF && m_pIn_buf_ofs[1] != 0xFF) {
    // Fast path: two plain bytes are available directly in the input buffer.
    m_bit_buf |= (cast(uint)m_pIn_buf_ofs[0] << 8) | m_pIn_buf_ofs[1];
    m_in_buf_left -= 2;
    m_pIn_buf_ofs += 2;
  } else {
    // Slow path: let get_octet() deal with refills, stuffing and markers.
    uint c1 = get_octet();
    uint c2 = get_octet();
    m_bit_buf |= (c1 << 8) | c2;
  }

  m_bit_buf <<= -m_bits_left;
  m_bits_left += 16;
  assert(m_bits_left >= 0);
  return result;
}
738 
739   // Decodes a Huffman encoded symbol.
/// Decodes one Huffman symbol using the tables in `pH` and consumes its code
/// from the bit buffer. `*err` is cleared on entry; 0 is returned if it is
/// set after the bits have been consumed.
int huff_decode (huff_tables *pH, bool* err) {
  *err = false;

  // Check first 8-bits: do we have a complete symbol?
  int symbol = pH.look_up.ptr[m_bit_buf >> 24];
  int bits_used;

  if (symbol >= 0) {
    bits_used = pH.code_size.ptr[symbol];
  } else {
    // Code is longer than 8 bits: follow the tree one bit at a time,
    // starting with bit 23 of the bit buffer.
    int ofs = 23;
    do {
      symbol = pH.tree.ptr[-cast(int)(symbol + ((m_bit_buf >> ofs) & 1))];
      --ofs;
    } while (symbol < 0);
    bits_used = 8 + (23 - ofs);
  }

  get_bits_no_markers(bits_used, err);
  if (*err) return 0;
  return symbol;
}
761 
762   // Decodes a Huffman encoded symbol.
      // Variant that also fetches the extra bits that follow the code: the low nibble of the symbol gives their count
      // and their value is stored in `extra_bits`. Clears *err first; returns the symbol, or 0 if *err is set after a read.
      // look_up2 entries, as decoded below: bits 0-7 symbol, bits 8-12 number of bits to consume,
      // bit 15 (0x8000) set when the extra-bits value is already stored in bits 16 and up.
763   int huff_decode (huff_tables *pH, ref int extra_bits, bool* err) {
764     int symbol;
765     *err = false;
766     // Check first 8-bits: do we have a complete symbol?
767     if ((symbol = pH.look_up2.ptr[m_bit_buf >> 24]) < 0) {
768       // Use a tree traversal to find symbol.
769       int ofs = 23; // bit 23 is the first bit below the 8 bits used for the lookup
770       do {
771         symbol = pH.tree.ptr[-cast(int)(symbol + ((m_bit_buf >> ofs) & 1))]; // negative values are negated tree indices; the next input bit selects one of two adjacent slots
772         --ofs;
773       } while (symbol < 0);
774       get_bits_no_markers(8 + (23 - ofs), err); // consume the 8 lookup bits plus one bit per tree step taken
775       if (*err)
776           return 0;
777       extra_bits = get_bits_no_markers(symbol & 0xF, err); // then read the extra bits separately
778       if (*err)
779           return 0;
780     } else {
          // Sanity check of the packed entry: the bit-count field must equal the code size, plus the extra-bit count when those bits are pre-resolved.
781       assert(((symbol >> 8) & 31) == pH.code_size.ptr[symbol & 255] + ((symbol & 0x8000) ? (symbol & 15) : 0));
782       if (symbol & 0x8000) {
            // Code and extra bits were both covered by the table entry: consume them together and take the stored value.
783         get_bits_no_markers((symbol >> 8) & 31, err);
784         if (*err)
785             return 0;
786         extra_bits = symbol >> 16;
787       } else {
788         int code_size = (symbol >> 8) & 31;
789         int num_extra_bits = symbol & 0xF;
790         int bits = code_size + num_extra_bits;
791         if (bits <= (m_bits_left + 16)) {
              // Code and extra bits are fetched with a single call; the mask keeps only the extra bits.
792           extra_bits = get_bits_no_markers(bits, err) & ((1 << num_extra_bits) - 1);
793           if (*err)
794               return 0;
795         } else {
              // Otherwise consume the code first and read the extra bits with a second call.
796           get_bits_no_markers(code_size, err);
797           if (*err)
798               return 0;
799           extra_bits = get_bits_no_markers(num_extra_bits, err);
800           if (*err)
801               return 0;
802         }
803       }
804       symbol &= 0xFF; // strip the packed fields, leaving only the symbol byte
805     }
806     return symbol;
807   }
808 
809   // Tables and macro used to fully decode the DPCM differences.
810   static immutable int[16] s_extend_test = [ 0, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080, 0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000 ];
811   static immutable int[16] s_extend_offset = [ 0, ((-1)<<1) + 1, ((-1)<<2) + 1, ((-1)<<3) + 1, ((-1)<<4) + 1, ((-1)<<5) + 1, ((-1)<<6) + 1, ((-1)<<7) + 1, ((-1)<<8) + 1, ((-1)<<9) + 1, ((-1)<<10) + 1, ((-1)<<11) + 1, ((-1)<<12) + 1, ((-1)<<13) + 1, ((-1)<<14) + 1, ((-1)<<15) + 1 ];
812   
813   static int JPGD_HUFF_EXTEND (int x, int s) nothrow @trusted @nogc 
814   { 
815       return (((x) < s_extend_test.ptr[s]) ? ((x) + s_extend_offset.ptr[s]) : (x)); 
816   }
817 
818   // Clamps a value between 0-255.
      // `clamp` is only a second name for CLAMP, which is defined outside this section.
819   alias clamp = CLAMP;
820 
821   static struct DCT_Upsample {
822   static:
823     static struct Matrix44 {
824     pure nothrow @trusted @nogc:
825       alias Element_Type = int;
826       enum { NUM_ROWS = 4, NUM_COLS = 4 }
827 
828       Element_Type[NUM_COLS][NUM_ROWS] v;
829 
830       this() (in Matrix44 m) {
831         foreach (immutable r; 0..NUM_ROWS) v[r][] = m.v[r][];
832       }
833 
834       ref inout(Element_Type) at (int r, int c) inout { pragma(inline, true); return v.ptr[r].ptr[c]; }
835 
836       ref Matrix44 opOpAssign(string op:"+") (in Matrix44 a) {
837         foreach (int r; 0..NUM_ROWS) {
838           at(r, 0) += a.at(r, 0);
839           at(r, 1) += a.at(r, 1);
840           at(r, 2) += a.at(r, 2);
841           at(r, 3) += a.at(r, 3);
842         }
843         return this;
844       }
845 
846       ref Matrix44 opOpAssign(string op:"-") (in Matrix44 a) {
847         foreach (int r; 0..NUM_ROWS) {
848           at(r, 0) -= a.at(r, 0);
849           at(r, 1) -= a.at(r, 1);
850           at(r, 2) -= a.at(r, 2);
851           at(r, 3) -= a.at(r, 3);
852         }
853         return this;
854       }
855 
856       Matrix44 opBinary(string op:"+") (in Matrix44 b) const {
857         alias a = this;
858         Matrix44 ret;
859         foreach (int r; 0..NUM_ROWS) {
860           ret.at(r, 0) = a.at(r, 0) + b.at(r, 0);
861           ret.at(r, 1) = a.at(r, 1) + b.at(r, 1);
862           ret.at(r, 2) = a.at(r, 2) + b.at(r, 2);
863           ret.at(r, 3) = a.at(r, 3) + b.at(r, 3);
864         }
865         return ret;
866       }
867 
868       Matrix44 opBinary(string op:"-") (in Matrix44 b) const {
869         alias a = this;
870         Matrix44 ret;
871         foreach (int r; 0..NUM_ROWS) {
872           ret.at(r, 0) = a.at(r, 0) - b.at(r, 0);
873           ret.at(r, 1) = a.at(r, 1) - b.at(r, 1);
874           ret.at(r, 2) = a.at(r, 2) - b.at(r, 2);
875           ret.at(r, 3) = a.at(r, 3) - b.at(r, 3);
876         }
877         return ret;
878       }
879 
880       static void add_and_store() (jpgd_block_t* pDst, in Matrix44 a, in Matrix44 b) {
881         foreach (int r; 0..4) {
882           pDst[0*8 + r] = cast(jpgd_block_t)(a.at(r, 0) + b.at(r, 0));
883           pDst[1*8 + r] = cast(jpgd_block_t)(a.at(r, 1) + b.at(r, 1));
884           pDst[2*8 + r] = cast(jpgd_block_t)(a.at(r, 2) + b.at(r, 2));
885           pDst[3*8 + r] = cast(jpgd_block_t)(a.at(r, 3) + b.at(r, 3));
886         }
887       }
888 
889       static void sub_and_store() (jpgd_block_t* pDst, in Matrix44 a, in Matrix44 b) {
890         foreach (int r; 0..4) {
891           pDst[0*8 + r] = cast(jpgd_block_t)(a.at(r, 0) - b.at(r, 0));
892           pDst[1*8 + r] = cast(jpgd_block_t)(a.at(r, 1) - b.at(r, 1));
893           pDst[2*8 + r] = cast(jpgd_block_t)(a.at(r, 2) - b.at(r, 2));
894           pDst[3*8 + r] = cast(jpgd_block_t)(a.at(r, 3) - b.at(r, 3));
895         }
896       }
897     }
898 
899     enum FRACT_BITS = 10; // number of fractional bits carried by the fixed-point constants below
900     enum SCALE = 1 << FRACT_BITS; // fixed-point value of 1.0
901 
902     alias Temp_Type = int; // type of the intermediate results in P_Q.calc / R_S.calc
903 
904     static int D(T) (T i) { pragma(inline, true); return (((i) + (SCALE >> 1)) >> FRACT_BITS); } // descale: add half of SCALE, then shift the fraction bits out
905     enum F(float i) = (cast(int)((i) * SCALE + 0.5f)); // compile-time float -> fixed point: i * SCALE + 0.5, truncated to int
906 
907     // NUM_ROWS/NUM_COLS = # of non-zero rows/cols in input matrix
        // calc() fills the 4x4 matrices P and Q from the coefficient block at pSrc, which is read as pSrc[c + r*8].
        // All arithmetic is fixed point: F!(x) is a constant scaled by SCALE and D(...) removes the scale again.
908     static struct P_Q(int NUM_ROWS, int NUM_COLS) {
909       static void calc (ref Matrix44 P, ref Matrix44 Q, const(jpgd_block_t)* pSrc) {
910         //auto AT (int c, int r) nothrow @trusted @nogc { return (c >= NUM_COLS || r >= NUM_ROWS ? 0 : pSrc[c+r*8]); }
            // AT!(c, r) yields source text for mixin: the literal "0" for positions at or beyond NUM_COLS/NUM_ROWS,
            // otherwise the pSrc element, so terms known to be zero are dropped at compile time.
911         template AT(int c, int r) {
912           static if (c >= NUM_COLS || r >= NUM_ROWS) enum AT = "0"; else enum AT = "pSrc["~c.stringof~"+"~r.stringof~"*8]";
913         }
914         // 4x8 = 4x8 times 8x8, matrix 0 is constant
915         immutable Temp_Type X000 = mixin(AT!(0, 0));
916         immutable Temp_Type X001 = mixin(AT!(0, 1));
917         immutable Temp_Type X002 = mixin(AT!(0, 2));
918         immutable Temp_Type X003 = mixin(AT!(0, 3));
919         immutable Temp_Type X004 = mixin(AT!(0, 4));
920         immutable Temp_Type X005 = mixin(AT!(0, 5));
921         immutable Temp_Type X006 = mixin(AT!(0, 6));
922         immutable Temp_Type X007 = mixin(AT!(0, 7));
923         immutable Temp_Type X010 = D(F!(0.415735f) * mixin(AT!(1, 0)) + F!(0.791065f) * mixin(AT!(3, 0)) + F!(-0.352443f) * mixin(AT!(5, 0)) + F!(0.277785f) * mixin(AT!(7, 0)));
924         immutable Temp_Type X011 = D(F!(0.415735f) * mixin(AT!(1, 1)) + F!(0.791065f) * mixin(AT!(3, 1)) + F!(-0.352443f) * mixin(AT!(5, 1)) + F!(0.277785f) * mixin(AT!(7, 1)));
925         immutable Temp_Type X012 = D(F!(0.415735f) * mixin(AT!(1, 2)) + F!(0.791065f) * mixin(AT!(3, 2)) + F!(-0.352443f) * mixin(AT!(5, 2)) + F!(0.277785f) * mixin(AT!(7, 2)));
926         immutable Temp_Type X013 = D(F!(0.415735f) * mixin(AT!(1, 3)) + F!(0.791065f) * mixin(AT!(3, 3)) + F!(-0.352443f) * mixin(AT!(5, 3)) + F!(0.277785f) * mixin(AT!(7, 3)));
927         immutable Temp_Type X014 = D(F!(0.415735f) * mixin(AT!(1, 4)) + F!(0.791065f) * mixin(AT!(3, 4)) + F!(-0.352443f) * mixin(AT!(5, 4)) + F!(0.277785f) * mixin(AT!(7, 4)));
928         immutable Temp_Type X015 = D(F!(0.415735f) * mixin(AT!(1, 5)) + F!(0.791065f) * mixin(AT!(3, 5)) + F!(-0.352443f) * mixin(AT!(5, 5)) + F!(0.277785f) * mixin(AT!(7, 5)));
929         immutable Temp_Type X016 = D(F!(0.415735f) * mixin(AT!(1, 6)) + F!(0.791065f) * mixin(AT!(3, 6)) + F!(-0.352443f) * mixin(AT!(5, 6)) + F!(0.277785f) * mixin(AT!(7, 6)));
930         immutable Temp_Type X017 = D(F!(0.415735f) * mixin(AT!(1, 7)) + F!(0.791065f) * mixin(AT!(3, 7)) + F!(-0.352443f) * mixin(AT!(5, 7)) + F!(0.277785f) * mixin(AT!(7, 7)));
931         immutable Temp_Type X020 = mixin(AT!(4, 0));
932         immutable Temp_Type X021 = mixin(AT!(4, 1));
933         immutable Temp_Type X022 = mixin(AT!(4, 2));
934         immutable Temp_Type X023 = mixin(AT!(4, 3));
935         immutable Temp_Type X024 = mixin(AT!(4, 4));
936         immutable Temp_Type X025 = mixin(AT!(4, 5));
937         immutable Temp_Type X026 = mixin(AT!(4, 6));
938         immutable Temp_Type X027 = mixin(AT!(4, 7));
939         immutable Temp_Type X030 = D(F!(0.022887f) * mixin(AT!(1, 0)) + F!(-0.097545f) * mixin(AT!(3, 0)) + F!(0.490393f) * mixin(AT!(5, 0)) + F!(0.865723f) * mixin(AT!(7, 0)));
940         immutable Temp_Type X031 = D(F!(0.022887f) * mixin(AT!(1, 1)) + F!(-0.097545f) * mixin(AT!(3, 1)) + F!(0.490393f) * mixin(AT!(5, 1)) + F!(0.865723f) * mixin(AT!(7, 1)));
941         immutable Temp_Type X032 = D(F!(0.022887f) * mixin(AT!(1, 2)) + F!(-0.097545f) * mixin(AT!(3, 2)) + F!(0.490393f) * mixin(AT!(5, 2)) + F!(0.865723f) * mixin(AT!(7, 2)));
942         immutable Temp_Type X033 = D(F!(0.022887f) * mixin(AT!(1, 3)) + F!(-0.097545f) * mixin(AT!(3, 3)) + F!(0.490393f) * mixin(AT!(5, 3)) + F!(0.865723f) * mixin(AT!(7, 3)));
943         immutable Temp_Type X034 = D(F!(0.022887f) * mixin(AT!(1, 4)) + F!(-0.097545f) * mixin(AT!(3, 4)) + F!(0.490393f) * mixin(AT!(5, 4)) + F!(0.865723f) * mixin(AT!(7, 4)));
944         immutable Temp_Type X035 = D(F!(0.022887f) * mixin(AT!(1, 5)) + F!(-0.097545f) * mixin(AT!(3, 5)) + F!(0.490393f) * mixin(AT!(5, 5)) + F!(0.865723f) * mixin(AT!(7, 5)));
945         immutable Temp_Type X036 = D(F!(0.022887f) * mixin(AT!(1, 6)) + F!(-0.097545f) * mixin(AT!(3, 6)) + F!(0.490393f) * mixin(AT!(5, 6)) + F!(0.865723f) * mixin(AT!(7, 6)));
946         immutable Temp_Type X037 = D(F!(0.022887f) * mixin(AT!(1, 7)) + F!(-0.097545f) * mixin(AT!(3, 7)) + F!(0.490393f) * mixin(AT!(5, 7)) + F!(0.865723f) * mixin(AT!(7, 7)));
947 
948         // 4x4 = 4x8 times 8x4, matrix 1 is constant
            // P takes the even-numbered temporaries directly and the first two weighted sums of the odd-numbered ones.
949         P.at(0, 0) = X000;
950         P.at(0, 1) = D(X001 * F!(0.415735f) + X003 * F!(0.791065f) + X005 * F!(-0.352443f) + X007 * F!(0.277785f));
951         P.at(0, 2) = X004;
952         P.at(0, 3) = D(X001 * F!(0.022887f) + X003 * F!(-0.097545f) + X005 * F!(0.490393f) + X007 * F!(0.865723f));
953         P.at(1, 0) = X010;
954         P.at(1, 1) = D(X011 * F!(0.415735f) + X013 * F!(0.791065f) + X015 * F!(-0.352443f) + X017 * F!(0.277785f));
955         P.at(1, 2) = X014;
956         P.at(1, 3) = D(X011 * F!(0.022887f) + X013 * F!(-0.097545f) + X015 * F!(0.490393f) + X017 * F!(0.865723f));
957         P.at(2, 0) = X020;
958         P.at(2, 1) = D(X021 * F!(0.415735f) + X023 * F!(0.791065f) + X025 * F!(-0.352443f) + X027 * F!(0.277785f));
959         P.at(2, 2) = X024;
960         P.at(2, 3) = D(X021 * F!(0.022887f) + X023 * F!(-0.097545f) + X025 * F!(0.490393f) + X027 * F!(0.865723f));
961         P.at(3, 0) = X030;
962         P.at(3, 1) = D(X031 * F!(0.415735f) + X033 * F!(0.791065f) + X035 * F!(-0.352443f) + X037 * F!(0.277785f));
963         P.at(3, 2) = X034;
964         P.at(3, 3) = D(X031 * F!(0.022887f) + X033 * F!(-0.097545f) + X035 * F!(0.490393f) + X037 * F!(0.865723f));
965         // 40 muls 24 adds
966 
967         // 4x4 = 4x8 times 8x4, matrix 1 is constant
            // Q takes the remaining even-numbered temporaries (2 and 6) and the other two weighted sums.
968         Q.at(0, 0) = D(X001 * F!(0.906127f) + X003 * F!(-0.318190f) + X005 * F!(0.212608f) + X007 * F!(-0.180240f));
969         Q.at(0, 1) = X002;
970         Q.at(0, 2) = D(X001 * F!(-0.074658f) + X003 * F!(0.513280f) + X005 * F!(0.768178f) + X007 * F!(-0.375330f));
971         Q.at(0, 3) = X006;
972         Q.at(1, 0) = D(X011 * F!(0.906127f) + X013 * F!(-0.318190f) + X015 * F!(0.212608f) + X017 * F!(-0.180240f));
973         Q.at(1, 1) = X012;
974         Q.at(1, 2) = D(X011 * F!(-0.074658f) + X013 * F!(0.513280f) + X015 * F!(0.768178f) + X017 * F!(-0.375330f));
975         Q.at(1, 3) = X016;
976         Q.at(2, 0) = D(X021 * F!(0.906127f) + X023 * F!(-0.318190f) + X025 * F!(0.212608f) + X027 * F!(-0.180240f));
977         Q.at(2, 1) = X022;
978         Q.at(2, 2) = D(X021 * F!(-0.074658f) + X023 * F!(0.513280f) + X025 * F!(0.768178f) + X027 * F!(-0.375330f));
979         Q.at(2, 3) = X026;
980         Q.at(3, 0) = D(X031 * F!(0.906127f) + X033 * F!(-0.318190f) + X035 * F!(0.212608f) + X037 * F!(-0.180240f));
981         Q.at(3, 1) = X032;
982         Q.at(3, 2) = D(X031 * F!(-0.074658f) + X033 * F!(0.513280f) + X035 * F!(0.768178f) + X037 * F!(-0.375330f));
983         Q.at(3, 3) = X036;
984         // 40 muls 24 adds
985       }
986     }
987 
988     static struct R_S(int NUM_ROWS, int NUM_COLS) {
          // calc() fills the 4x4 matrices R and S from the coefficient block at pSrc, which is read as pSrc[c + r*8].
          // Same structure as P_Q.calc, with a different set of first-stage rows (the X1xx temporaries).
989       static void calc(ref Matrix44 R, ref Matrix44 S, const(jpgd_block_t)* pSrc) {
990         //auto AT (int c, int r) nothrow @trusted @nogc { return (c >= NUM_COLS || r >= NUM_ROWS ? 0 : pSrc[c+r*8]); }
            // AT!(c, r) yields source text for mixin: the literal "0" for positions at or beyond NUM_COLS/NUM_ROWS,
            // otherwise the pSrc element, so terms known to be zero are dropped at compile time.
991         template AT(int c, int r) {
992           static if (c >= NUM_COLS || r >= NUM_ROWS) enum AT = "0"; else enum AT = "pSrc["~c.stringof~"+"~r.stringof~"*8]";
993         }
994         // 4x8 = 4x8 times 8x8, matrix 0 is constant
995         immutable Temp_Type X100 = D(F!(0.906127f) * mixin(AT!(1, 0)) + F!(-0.318190f) * mixin(AT!(3, 0)) + F!(0.212608f) * mixin(AT!(5, 0)) + F!(-0.180240f) * mixin(AT!(7, 0)));
996         immutable Temp_Type X101 = D(F!(0.906127f) * mixin(AT!(1, 1)) + F!(-0.318190f) * mixin(AT!(3, 1)) + F!(0.212608f) * mixin(AT!(5, 1)) + F!(-0.180240f) * mixin(AT!(7, 1)));
997         immutable Temp_Type X102 = D(F!(0.906127f) * mixin(AT!(1, 2)) + F!(-0.318190f) * mixin(AT!(3, 2)) + F!(0.212608f) * mixin(AT!(5, 2)) + F!(-0.180240f) * mixin(AT!(7, 2)));
998         immutable Temp_Type X103 = D(F!(0.906127f) * mixin(AT!(1, 3)) + F!(-0.318190f) * mixin(AT!(3, 3)) + F!(0.212608f) * mixin(AT!(5, 3)) + F!(-0.180240f) * mixin(AT!(7, 3)));
999         immutable Temp_Type X104 = D(F!(0.906127f) * mixin(AT!(1, 4)) + F!(-0.318190f) * mixin(AT!(3, 4)) + F!(0.212608f) * mixin(AT!(5, 4)) + F!(-0.180240f) * mixin(AT!(7, 4)));
1000         immutable Temp_Type X105 = D(F!(0.906127f) * mixin(AT!(1, 5)) + F!(-0.318190f) * mixin(AT!(3, 5)) + F!(0.212608f) * mixin(AT!(5, 5)) + F!(-0.180240f) * mixin(AT!(7, 5)));
1001         immutable Temp_Type X106 = D(F!(0.906127f) * mixin(AT!(1, 6)) + F!(-0.318190f) * mixin(AT!(3, 6)) + F!(0.212608f) * mixin(AT!(5, 6)) + F!(-0.180240f) * mixin(AT!(7, 6)));
1002         immutable Temp_Type X107 = D(F!(0.906127f) * mixin(AT!(1, 7)) + F!(-0.318190f) * mixin(AT!(3, 7)) + F!(0.212608f) * mixin(AT!(5, 7)) + F!(-0.180240f) * mixin(AT!(7, 7)));
1003         immutable Temp_Type X110 = mixin(AT!(2, 0));
1004         immutable Temp_Type X111 = mixin(AT!(2, 1));
1005         immutable Temp_Type X112 = mixin(AT!(2, 2));
1006         immutable Temp_Type X113 = mixin(AT!(2, 3));
1007         immutable Temp_Type X114 = mixin(AT!(2, 4));
1008         immutable Temp_Type X115 = mixin(AT!(2, 5));
1009         immutable Temp_Type X116 = mixin(AT!(2, 6));
1010         immutable Temp_Type X117 = mixin(AT!(2, 7));
1011         immutable Temp_Type X120 = D(F!(-0.074658f) * mixin(AT!(1, 0)) + F!(0.513280f) * mixin(AT!(3, 0)) + F!(0.768178f) * mixin(AT!(5, 0)) + F!(-0.375330f) * mixin(AT!(7, 0)));
1012         immutable Temp_Type X121 = D(F!(-0.074658f) * mixin(AT!(1, 1)) + F!(0.513280f) * mixin(AT!(3, 1)) + F!(0.768178f) * mixin(AT!(5, 1)) + F!(-0.375330f) * mixin(AT!(7, 1)));
1013         immutable Temp_Type X122 = D(F!(-0.074658f) * mixin(AT!(1, 2)) + F!(0.513280f) * mixin(AT!(3, 2)) + F!(0.768178f) * mixin(AT!(5, 2)) + F!(-0.375330f) * mixin(AT!(7, 2)));
1014         immutable Temp_Type X123 = D(F!(-0.074658f) * mixin(AT!(1, 3)) + F!(0.513280f) * mixin(AT!(3, 3)) + F!(0.768178f) * mixin(AT!(5, 3)) + F!(-0.375330f) * mixin(AT!(7, 3)));
1015         immutable Temp_Type X124 = D(F!(-0.074658f) * mixin(AT!(1, 4)) + F!(0.513280f) * mixin(AT!(3, 4)) + F!(0.768178f) * mixin(AT!(5, 4)) + F!(-0.375330f) * mixin(AT!(7, 4)));
1016         immutable Temp_Type X125 = D(F!(-0.074658f) * mixin(AT!(1, 5)) + F!(0.513280f) * mixin(AT!(3, 5)) + F!(0.768178f) * mixin(AT!(5, 5)) + F!(-0.375330f) * mixin(AT!(7, 5)));
1017         immutable Temp_Type X126 = D(F!(-0.074658f) * mixin(AT!(1, 6)) + F!(0.513280f) * mixin(AT!(3, 6)) + F!(0.768178f) * mixin(AT!(5, 6)) + F!(-0.375330f) * mixin(AT!(7, 6)));
1018         immutable Temp_Type X127 = D(F!(-0.074658f) * mixin(AT!(1, 7)) + F!(0.513280f) * mixin(AT!(3, 7)) + F!(0.768178f) * mixin(AT!(5, 7)) + F!(-0.375330f) * mixin(AT!(7, 7)));
1019         immutable Temp_Type X130 = mixin(AT!(6, 0));
1020         immutable Temp_Type X131 = mixin(AT!(6, 1));
1021         immutable Temp_Type X132 = mixin(AT!(6, 2));
1022         immutable Temp_Type X133 = mixin(AT!(6, 3));
1023         immutable Temp_Type X134 = mixin(AT!(6, 4));
1024         immutable Temp_Type X135 = mixin(AT!(6, 5));
1025         immutable Temp_Type X136 = mixin(AT!(6, 6));
1026         immutable Temp_Type X137 = mixin(AT!(6, 7));
1027         // 80 muls 48 adds
1028 
1029         // 4x4 = 4x8 times 8x4, matrix 1 is constant
             // R takes the even-numbered temporaries directly and the first two weighted sums of the odd-numbered ones.
1030         R.at(0, 0) = X100;
1031         R.at(0, 1) = D(X101 * F!(0.415735f) + X103 * F!(0.791065f) + X105 * F!(-0.352443f) + X107 * F!(0.277785f));
1032         R.at(0, 2) = X104;
1033         R.at(0, 3) = D(X101 * F!(0.022887f) + X103 * F!(-0.097545f) + X105 * F!(0.490393f) + X107 * F!(0.865723f));
1034         R.at(1, 0) = X110;
1035         R.at(1, 1) = D(X111 * F!(0.415735f) + X113 * F!(0.791065f) + X115 * F!(-0.352443f) + X117 * F!(0.277785f));
1036         R.at(1, 2) = X114;
1037         R.at(1, 3) = D(X111 * F!(0.022887f) + X113 * F!(-0.097545f) + X115 * F!(0.490393f) + X117 * F!(0.865723f));
1038         R.at(2, 0) = X120;
1039         R.at(2, 1) = D(X121 * F!(0.415735f) + X123 * F!(0.791065f) + X125 * F!(-0.352443f) + X127 * F!(0.277785f));
1040         R.at(2, 2) = X124;
1041         R.at(2, 3) = D(X121 * F!(0.022887f) + X123 * F!(-0.097545f) + X125 * F!(0.490393f) + X127 * F!(0.865723f));
1042         R.at(3, 0) = X130;
1043         R.at(3, 1) = D(X131 * F!(0.415735f) + X133 * F!(0.791065f) + X135 * F!(-0.352443f) + X137 * F!(0.277785f));
1044         R.at(3, 2) = X134;
1045         R.at(3, 3) = D(X131 * F!(0.022887f) + X133 * F!(-0.097545f) + X135 * F!(0.490393f) + X137 * F!(0.865723f));
1046         // 40 muls 24 adds
1047         // 4x4 = 4x8 times 8x4, matrix 1 is constant
             // S takes the remaining even-numbered temporaries (2 and 6) and the other two weighted sums.
1048         S.at(0, 0) = D(X101 * F!(0.906127f) + X103 * F!(-0.318190f) + X105 * F!(0.212608f) + X107 * F!(-0.180240f));
1049         S.at(0, 1) = X102;
1050         S.at(0, 2) = D(X101 * F!(-0.074658f) + X103 * F!(0.513280f) + X105 * F!(0.768178f) + X107 * F!(-0.375330f));
1051         S.at(0, 3) = X106;
1052         S.at(1, 0) = D(X111 * F!(0.906127f) + X113 * F!(-0.318190f) + X115 * F!(0.212608f) + X117 * F!(-0.180240f));
1053         S.at(1, 1) = X112;
1054         S.at(1, 2) = D(X111 * F!(-0.074658f) + X113 * F!(0.513280f) + X115 * F!(0.768178f) + X117 * F!(-0.375330f));
1055         S.at(1, 3) = X116;
1056         S.at(2, 0) = D(X121 * F!(0.906127f) + X123 * F!(-0.318190f) + X125 * F!(0.212608f) + X127 * F!(-0.180240f));
1057         S.at(2, 1) = X122;
1058         S.at(2, 2) = D(X121 * F!(-0.074658f) + X123 * F!(0.513280f) + X125 * F!(0.768178f) + X127 * F!(-0.375330f));
1059         S.at(2, 3) = X126;
1060         S.at(3, 0) = D(X131 * F!(0.906127f) + X133 * F!(-0.318190f) + X135 * F!(0.212608f) + X137 * F!(-0.180240f));
1061         S.at(3, 1) = X132;
1062         S.at(3, 2) = D(X131 * F!(-0.074658f) + X133 * F!(0.513280f) + X135 * F!(0.768178f) + X137 * F!(-0.375330f));
1063         S.at(3, 3) = X136;
1064         // 40 muls 24 adds
1065       }
1066     }
1067   } // end namespace DCT_Upsample
1068 
1069   // Unconditionally frees all allocated m_blocks.
1070   void free_all_blocks () {
1071     //m_pStream = null;
1072     readfn = null;
1073     for (mem_block *b = m_pMem_blocks; b; ) {
1074       mem_block* n = b.m_pNext;
1075       jpgd_free(b);
1076       b = n;
1077     }
1078     m_pMem_blocks = null;
1079   }
1080 
1081   // This method handles all errors. It will never return.
1082   // It could easily be changed to use C++ exceptions.
       // Records `status` in m_error_code, frees all allocator blocks, then halts through assert(false).
       // Deprecated: set_error does the same bookkeeping but returns, so the caller can fail the decode.
1083   deprecated("use set_error instead and fail the decoding") void stop_decoding (jpgd_status status) {
1084     m_error_code = status;
1085     free_all_blocks();
1086     //longjmp(m_jmp_state, status);
1087     assert(false, "jpeg decoding error"); // control never leaves this function normally
1088   }
1089 
1090   // This method handles all errors. It does return, but the decoder should report an error immediately.
       // Records `status` in m_error_code and releases every allocator block via free_all_blocks (which also clears readfn).
1091   void set_error (jpgd_status status) 
1092   {
1093     m_error_code = status;
1094     free_all_blocks();
1095   }
1096 
1097   // err is true if allocation failed
1098   void* alloc (size_t nSize, bool zero, bool* err, ) {
1099     nSize = (JPGD_MAX(nSize, 1) + 3) & ~3;
1100     char *rv = null;
1101     for (mem_block *b = m_pMem_blocks; b; b = b.m_pNext)
1102     {
1103       if ((b.m_used_count + nSize) <= b.m_size)
1104       {
1105         rv = b.m_data.ptr + b.m_used_count;
1106         b.m_used_count += nSize;
1107         break;
1108       }
1109     }
1110     if (!rv)
1111     {
1112       int capacity = cast(int) JPGD_MAX(32768 - 256, (nSize + 2047) & ~2047);
1113       mem_block *b = cast(mem_block*)jpgd_malloc(mem_block.sizeof + capacity);
1114       if (!b) 
1115       { 
1116         set_error(JPGD_NOTENOUGHMEM);
1117         *err = true;
1118         return null;
1119       }
1120       b.m_pNext = m_pMem_blocks; m_pMem_blocks = b;
1121       b.m_used_count = nSize;
1122       b.m_size = capacity;
1123       rv = b.m_data.ptr;
1124     }
1125     if (zero) memset(rv, 0, nSize);
1126     *err = false;
1127     return rv;
1128   }
1129 
1130   void word_clear (void *p, ushort c, uint n) {
1131     ubyte *pD = cast(ubyte*)p;
1132     immutable ubyte l = c & 0xFF, h = (c >> 8) & 0xFF;
1133     while (n)
1134     {
1135       pD[0] = l; pD[1] = h; pD += 2;
1136       n--;
1137     }
1138   }
1139 
1140   // Refill the input buffer.
1141   // This method will sit in a loop until (A) the buffer is full or (B)
1142   // the stream's read() method reports and end of file condition.
1143   bool prep_in_buffer () {
1144     m_in_buf_left = 0;
1145     m_pIn_buf_ofs = m_in_buf.ptr;
1146 
1147     if (m_eof_flag)
1148       return true;
1149 
1150     do
1151     {
1152       int bytes_read = readfn(m_in_buf.ptr + m_in_buf_left, JPGD_IN_BUF_SIZE - m_in_buf_left, &m_eof_flag, userData);
1153       if (bytes_read == -1)
1154       {
1155         set_error(JPGD_STREAM_READ);
1156         return false;
1157       }
1158 
1159       m_in_buf_left += bytes_read;
1160     } while ((m_in_buf_left < JPGD_IN_BUF_SIZE) && (!m_eof_flag));
1161 
1162     m_total_bytes_read += m_in_buf_left;
1163 
1164     // Pad the end of the block with M_EOI (prevents the decompressor from going off the rails if the stream is invalid).
1165     // (This dates way back to when this decompressor was written in C/asm, and the all-asm Huffman decoder did some fancy things to increase perf.)
1166     word_clear(m_pIn_buf_ofs + m_in_buf_left, 0xD9FF, 64);
1167     return true;
1168   }
1169 
1170   // Read a Huffman code table.
1171   bool read_dht_marker () {
1172     int i, index, count;
1173     ubyte[17] huff_num;
1174     ubyte[256] huff_val;
1175 
1176     bool err;
1177     uint num_left = get_bits(16, &err);
1178     if (err)
1179         return false;
1180 
1181     if (num_left < 2)
1182     {
1183       set_error(JPGD_BAD_DHT_MARKER);
1184       return false;
1185     }
1186 
1187     num_left -= 2;
1188 
1189     while (num_left)
1190     {
1191       index = get_bits(8, &err);
1192       if (err)
1193           return false;
1194 
1195       huff_num.ptr[0] = 0;
1196 
1197       count = 0;
1198 
1199       for (i = 1; i <= 16; i++)
1200       {
1201         huff_num.ptr[i] = cast(ubyte)(get_bits(8, &err));
1202         if (err)
1203             return false;
1204         count += huff_num.ptr[i];
1205       }
1206 
1207       if (count > 255)
1208       {
1209         set_error(JPGD_BAD_DHT_COUNTS);
1210         return false;
1211       }
1212 
1213       for (i = 0; i < count; i++)
1214       {
1215         huff_val.ptr[i] = cast(ubyte)(get_bits(8, &err));
1216         if (err)
1217           return false;
1218       }
1219 
1220       i = 1 + 16 + count;
1221 
1222       if (num_left < cast(uint)i)
1223       {
1224         set_error(JPGD_BAD_DHT_MARKER);
1225         return false;
1226       }
1227 
1228       num_left -= i;
1229 
1230       if ((index & 0x10) > 0x10)
1231       {
1232         set_error(JPGD_BAD_DHT_INDEX);
1233         return false;
1234       }
1235 
1236       index = (index & 0x0F) + ((index & 0x10) >> 4) * (JPGD_MAX_HUFF_TABLES >> 1);
1237 
1238       if (index >= JPGD_MAX_HUFF_TABLES)
1239       {
1240         set_error(JPGD_BAD_DHT_INDEX);
1241         return false;
1242       }
1243 
1244       if (!m_huff_num.ptr[index])
1245       {
1246         m_huff_num.ptr[index] = cast(ubyte*)alloc(17, false, &err);
1247         if (err)
1248             return false;
1249       }
1250 
1251       if (!m_huff_val.ptr[index])
1252       {
1253         m_huff_val.ptr[index] = cast(ubyte*)alloc(256, false, &err);
1254         if (err)
1255             return false;
1256       }
1257 
1258       m_huff_ac.ptr[index] = (index & 0x10) != 0;
1259       memcpy(m_huff_num.ptr[index], huff_num.ptr, 17);
1260       memcpy(m_huff_val.ptr[index], huff_val.ptr, 256);
1261     }
1262     return true;
1263   }
1264 
1265   // Read a quantization table.
1266   bool read_dqt_marker () {
1267     int n, i, prec;
1268     uint num_left;
1269     uint temp;
1270 
1271     bool err;
1272 
1273     num_left = get_bits(16, &err);
1274     if (err)
1275         return false;
1276 
1277     if (num_left < 2)
1278     {
1279       set_error(JPGD_BAD_DQT_MARKER);
1280       return false;
1281     }
1282 
1283     num_left -= 2;
1284 
1285     while (num_left)
1286     {
1287       n = get_bits(8, &err);
1288       if (err)
1289         return false;
1290       prec = n >> 4;
1291       n &= 0x0F;
1292 
1293       if (n >= JPGD_MAX_QUANT_TABLES)
1294       {
1295         set_error(JPGD_BAD_DQT_TABLE);
1296         return false;
1297       }
1298 
1299       if (!m_quant.ptr[n])
1300       {
1301         m_quant.ptr[n] = cast(jpgd_quant_t*)alloc(64 * jpgd_quant_t.sizeof, false, &err);
1302         if (err)
1303             return false;
1304       }
1305 
1306       // read quantization entries, in zag order
1307       for (i = 0; i < 64; i++)
1308       {
1309         temp = get_bits(8, &err);
1310         if (err)
1311             return false;
1312 
1313         if (prec)
1314         {
1315           temp = (temp << 8) + get_bits(8, &err);
1316           if (err)
1317               return false;
1318         }
1319 
1320         m_quant.ptr[n][i] = cast(jpgd_quant_t)(temp);
1321       }
1322 
1323       i = 64 + 1;
1324 
1325       if (prec)
1326         i += 64;
1327 
1328       if (num_left < cast(uint)i)
1329       {
1330         set_error(JPGD_BAD_DQT_LENGTH);
1331         return false;
1332       }
1333 
1334       num_left -= i;
1335     }
1336     return true;
1337   }
1338 
1339   // Read the start of frame (SOF) marker.
1340   bool read_sof_marker () {
1341     int i;
1342     uint num_left;
1343     bool err;
1344 
1345     num_left = get_bits(16, &err);
1346     if (err)
1347         return false;
1348 
1349     if (get_bits(8, &err) != 8)   /* precision: sorry, only 8-bit precision is supported right now */
1350     {
1351       set_error(JPGD_BAD_PRECISION);
1352       return false;
1353     }
1354     if (err)
1355         return false;
1356 
1357     m_image_y_size = get_bits(16, &err);
1358     if (err)
1359         return false;
1360 
1361     if ((m_image_y_size < 1) || (m_image_y_size > JPGD_MAX_HEIGHT))
1362     {
1363       set_error(JPGD_BAD_HEIGHT);
1364       return false;
1365     }
1366 
1367     m_image_x_size = get_bits(16, &err);
1368     if (err)
1369         return false;
1370 
1371     if ((m_image_x_size < 1) || (m_image_x_size > JPGD_MAX_WIDTH))
1372     {
1373       set_error(JPGD_BAD_WIDTH);
1374       return false;
1375     }
1376 
1377     m_comps_in_frame = get_bits(8, &err);
1378     if (err)
1379         return false;
1380 
1381     if (m_comps_in_frame > JPGD_MAX_COMPONENTS)
1382     {
1383       set_error(JPGD_TOO_MANY_COMPONENTS);
1384       return false;
1385     }
1386 
1387     if (num_left != cast(uint)(m_comps_in_frame * 3 + 8))
1388     {
1389       set_error(JPGD_BAD_SOF_LENGTH);
1390       return false;
1391     }
1392 
1393     for (i = 0; i < m_comps_in_frame; i++)
1394     {
1395       m_comp_ident.ptr[i]  = get_bits(8, &err);
1396       if (err)
1397         return false;
1398       m_comp_h_samp.ptr[i] = get_bits(4, &err);
1399       if (err)
1400           return false;
1401       m_comp_v_samp.ptr[i] = get_bits(4, &err);
1402       if (err)
1403           return false;
1404       m_comp_quant.ptr[i]  = get_bits(8, &err);
1405       if (err)
1406           return false;
1407     }
1408     return true;
1409   }
1410 
1411   // Used to skip unrecognized markers.
1412   bool skip_variable_marker () {
1413     uint num_left;
1414 
1415     bool err;
1416     num_left = get_bits(16, &err);
1417     if (err)
1418         return false;
1419 
1420     if (num_left < 2)
1421     {
1422       set_error(JPGD_BAD_VARIABLE_MARKER);
1423       return false;
1424     }
1425 
1426     num_left -= 2;
1427 
1428     while (num_left)
1429     {
1430       get_bits(8, &err);
1431       if (err)
1432           return false;
1433       num_left--;
1434     }
1435     return true;
1436   }
1437 
1438   // Read a define restart interval (DRI) marker.
1439   bool read_dri_marker () 
1440   {
1441     bool err;
1442     int drilen = get_bits(16, &err);
1443     if (err)
1444         return false;
1445     
1446     if (drilen != 4)
1447     {
1448       set_error(JPGD_BAD_DRI_LENGTH);
1449       return false;
1450     }
1451 
1452     m_restart_interval = get_bits(16, &err);
1453     if (err)
1454         return false;
1455     return true;
1456   }
1457 
1458   // Read a start of scan (SOS) marker.
1459   // Return true on success.
1460   bool read_sos_marker () {
1461     bool err;
1462     uint num_left;
1463     int i, ci, n, c, cc;
1464 
1465     num_left = get_bits(16, &err);
1466     if (err)
1467         return false;
1468     n = get_bits(8, &err);
1469     if (err)
1470         return false;
1471 
1472     m_comps_in_scan = n;
1473 
1474     num_left -= 3;
1475 
1476     if ( (num_left != cast(uint)(n * 2 + 3)) || (n < 1) || (n > JPGD_MAX_COMPS_IN_SCAN) )
1477     {
1478         set_error(JPGD_BAD_SOS_LENGTH);
1479         return false;
1480     }
1481 
1482     for (i = 0; i < n; i++)
1483     {
1484       cc = get_bits(8, &err);
1485       if (err)
1486           return false;
1487       c = get_bits(8, &err);
1488       if (err)
1489           return false;
1490       num_left -= 2;
1491 
1492       for (ci = 0; ci < m_comps_in_frame; ci++)
1493         if (cc == m_comp_ident.ptr[ci])
1494           break;
1495 
1496       if (ci >= m_comps_in_frame)
1497       {
1498         set_error(JPGD_BAD_SOS_COMP_ID);
1499         return false;
1500       }
1501 
1502       m_comp_list.ptr[i]    = ci;
1503       m_comp_dc_tab.ptr[ci] = (c >> 4) & 15;
1504       m_comp_ac_tab.ptr[ci] = (c & 15) + (JPGD_MAX_HUFF_TABLES >> 1);
1505     }
1506 
1507     m_spectral_start  = get_bits(8, &err);
1508     if (err)
1509         return false;
1510     m_spectral_end    = get_bits(8, &err);
1511     if (err)
1512         return false;
1513     m_successive_high = get_bits(4, &err);
1514     if (err)
1515         return false;
1516     m_successive_low  = get_bits(4, &err);
1517     if (err)
1518         return false;
1519 
1520     if (!m_progressive_flag)
1521     {
1522       m_spectral_start = 0;
1523       m_spectral_end = 63;
1524     }
1525 
1526     num_left -= 3;
1527 
1528     /* read past whatever is num_left */
1529     while (num_left)
1530     {
1531       get_bits(8, &err);
1532       if (err)
1533           return false;
1534       num_left--;
1535     }
1536     return true;
1537   }
1538 
1539   // Finds the next marker.
1540   int next_marker (bool* err) {
1541     uint c, bytes;
1542     *err = false;
1543     bytes = 0;
1544 
1545     do
1546     {
1547       do
1548       {
1549         bytes++;
1550         c = get_bits(8, err);
1551         if (*err)
1552             return 0;
1553       } while (c != 0xFF);
1554 
1555       do
1556       {
1557         c = get_bits(8, err);
1558         if (*err)
1559             return 0;
1560       } while (c == 0xFF);
1561 
1562     } while (c == 0);
1563 
1564     // If bytes > 0 here, there where extra bytes before the marker (not good).
1565 
1566     return c;
1567   }
1568 
1569   // Process markers. Returns when an SOFx, SOI, EOI, or SOS marker is
1570   // encountered.
1571   // Return true in *err on error, and then the return value is wrong.
1572   int process_markers (bool* err) {
1573     int c;
1574 
1575     for ( ; ; ) {
1576       c = next_marker(err);
1577       if (*err)
1578           return 0;
1579 
1580       switch (c)
1581       {
1582         case M_SOF0:
1583         case M_SOF1:
1584         case M_SOF2:
1585         case M_SOF3:
1586         case M_SOF5:
1587         case M_SOF6:
1588         case M_SOF7:
1589         //case M_JPG:
1590         case M_SOF9:
1591         case M_SOF10:
1592         case M_SOF11:
1593         case M_SOF13:
1594         case M_SOF14:
1595         case M_SOF15:
1596         case M_SOI:
1597         case M_EOI:
1598         case M_SOS:
1599           return c;
1600         case M_DHT:
1601           if (!read_dht_marker())
1602           {
1603             *err = true;
1604             return 0;
1605           }
1606           break;
1607         // No arithmitic support - dumb patents!
1608         case M_DAC:
1609           set_error(JPGD_NO_ARITHMITIC_SUPPORT);
1610           *err = true;
1611           return 0;
1612 
1613         case M_DQT:
1614           if (!read_dqt_marker())
1615           {
1616             *err = true;
1617             return 0;
1618           }
1619           break;
1620         case M_DRI:
1621           if (!read_dri_marker())
1622           {
1623               *err = true;
1624               return 0;
1625           }
1626           break;
1627 
1628         case M_APP0:
1629             uint num_left;
1630 
1631             num_left = get_bits(16, err);
1632             if (*err)
1633                 return 0;
1634             
1635             if (num_left < 7)
1636             {
1637                 *err = true;
1638                 set_error(JPGD_BAD_VARIABLE_MARKER);
1639             }
1640 
1641             num_left -= 2;
1642 
1643             ubyte[5] jfif_id;
1644             foreach(i; 0..5)
1645             {
1646                 jfif_id[i] = cast(ubyte) get_bits(8, err);
1647                 if (*err)
1648                     return 0;
1649             }
1650 
1651             num_left -= 5;
1652             static immutable ubyte[5] JFIF = [0x4A, 0x46, 0x49, 0x46, 0x00];
1653             if (jfif_id == JFIF && num_left >= 7)
1654             {
1655                 // skip version
1656                 get_bits(16, err);
1657                 if (*err) return 0;
1658                 uint units = get_bits(8, err);
1659                 if (*err) return 0;
1660                 int Xdensity = get_bits(16, err);
1661                 if (*err) return 0;
1662                 int Ydensity = get_bits(16, err);
1663                 if (*err) return 0;
1664                 num_left -= 7;
1665 
1666                 m_pixelAspectRatio = (Xdensity/cast(double)Ydensity);
1667 
1668                 switch (units)
1669                 {
1670                     case 0: // no units, just a ratio
1671                         m_pixelsPerInchX = -1;
1672                         m_pixelsPerInchY = -1;
1673                         break;
1674 
1675                     case 1: // dot per inch
1676                         m_pixelsPerInchX = Xdensity;
1677                         m_pixelsPerInchY = Ydensity;
1678                         break;
1679 
1680                     case 2: // dot per cm
1681                         m_pixelsPerInchX = convertInchesToMeters(Xdensity * 100.0f);
1682                         m_pixelsPerInchY = convertInchesToMeters(Ydensity * 100.0f);
1683                         break;
1684                     default:
1685                 }
1686             }
1687 
1688             // skip rests of chunk
1689 
1690             while (num_left)
1691             {
1692                 get_bits(8, err);
1693                 if (*err) return 0;
1694                 num_left--;
1695             }
1696             break;
1697 
1698         case M_APP0+1: // possibly EXIF data
1699          
1700             uint num_left;
1701             num_left = get_bits(16, err);
1702             if (*err) return 0;
1703 
1704             if (num_left < 2)
1705             {
1706                 *err = true;
1707                 set_error(JPGD_BAD_VARIABLE_MARKER);
1708                 return 0;
1709             }
1710             num_left -= 2;
1711 
1712             ubyte[] exifData = (cast(ubyte*) malloc(num_left))[0..num_left];
1713             scope(exit) free(exifData.ptr);
1714 
1715             foreach(i; 0..num_left)
1716             {
1717                 exifData[i] = cast(ubyte)(get_bits(8, err));
1718                 if (*err) 
1719                     return 0;
1720             }
1721 
1722             const(ubyte)* s = exifData.ptr;
1723 
1724             ubyte[6] exif_id;
1725             foreach(i; 0..6)
1726                 exif_id[i] = read_ubyte(s);
1727 
1728             const(ubyte)* remainExifData = s;
1729 
1730             static immutable ubyte[6] ExifIdentifierCode = [0x45, 0x78, 0x69, 0x66, 0x00, 0x00]; // "Exif\0\0"
1731             if (exif_id == ExifIdentifierCode)
1732             {
1733                 // See EXIF specification: http://www.cipa.jp/std/documents/e/DC-008-2012_E.pdf
1734 
1735                 const(ubyte)* tiffFile = s; // save exif chunk from "start of TIFF file"
1736 
1737                 ushort byteOrder = read_ushort_BE(s);
1738                 if (byteOrder != 0x4949 && byteOrder != 0x4D4D)
1739                 {
1740                     set_error(JPGD_DECODE_ERROR);
1741                     *err = true;
1742                     return 0;
1743                 }
1744                 bool littleEndian = (byteOrder == 0x4949);
1745 
1746                 ushort version_ = littleEndian ? read_ushort_LE(s) : read_ushort_BE(s);
1747                 if (version_ != 42)
1748                 {
1749                     *err = true;
1750                     set_error(JPGD_DECODE_ERROR);
1751                     return 0;
1752                 }
1753 
1754                 uint offset = littleEndian ? read_uint_LE(s) : read_uint_BE(s);
1755 
1756                 double resolutionX = 72;
1757                 double resolutionY = 72;
1758                 int unit = 2;
1759 
1760                 // parse all IFDs
1761                 while(offset != 0)
1762                 {
1763                     if (offset > exifData.length)
1764                     {
1765                         *err = true;
1766                         set_error(JPGD_DECODE_ERROR);
1767                         return 0;
1768                     }
1769                     const(ubyte)* pIFD = tiffFile + offset;
1770                     ushort numEntries = littleEndian ? read_ushort_LE(pIFD) : read_ushort_BE(pIFD);
1771 
1772                     foreach(entry; 0..numEntries)
1773                     {
1774                         ushort tag = littleEndian ? read_ushort_LE(pIFD) : read_ushort_BE(pIFD);
1775                         ushort type = littleEndian ? read_ushort_LE(pIFD) : read_ushort_BE(pIFD);
1776                         uint count = littleEndian ? read_uint_LE(pIFD) : read_uint_BE(pIFD);
1777                         uint valueOffset = littleEndian ? read_uint_LE(pIFD) : read_uint_BE(pIFD);
1778 
1779                         if (tag == 282 || tag == 283) // XResolution
1780                         {
1781                             const(ubyte)* tagData = tiffFile + valueOffset;
1782                             double num = littleEndian ? read_uint_LE(tagData) : read_uint_BE(tagData);
1783                             double denom = littleEndian ? read_uint_LE(tagData) : read_uint_BE(tagData);
1784                             double frac = num / denom;
1785                             if (tag == 282)
1786                                 resolutionX = frac;
1787                             else
1788                                 resolutionY = frac;
1789                         }
1790 
1791                         if (tag == 296) // unit
1792                             unit = valueOffset;
1793                     }
1794                     offset = littleEndian ? read_uint_LE(pIFD) : read_uint_BE(pIFD);
1795                 }
1796 
1797                 if (unit == 2) // inches
1798                 {
1799                     m_pixelsPerInchX = resolutionX;
1800                     m_pixelsPerInchY = resolutionY;
1801                     m_pixelAspectRatio = resolutionX / resolutionY;
1802                 }
1803                 else if (unit == 3) // dots per cm
1804                 {
1805                     m_pixelsPerInchX = convertInchesToMeters(resolutionX * 100);
1806                     m_pixelsPerInchY = convertInchesToMeters(resolutionY * 100);
1807                     m_pixelAspectRatio = resolutionX / resolutionY;
1808                 }
1809             }
1810             break;
1811 
1812         case M_JPG:
1813         case M_RST0:    /* no parameters */
1814         case M_RST1:
1815         case M_RST2:
1816         case M_RST3:
1817         case M_RST4:
1818         case M_RST5:
1819         case M_RST6:
1820         case M_RST7:
1821         case M_TEM:
1822           {
1823             *err = true;
1824             return 0;
1825           }
1826         default:    /* must be DNL, DHP, EXP, APPn, JPGn, COM, or RESn or APP0 */
1827           if (!skip_variable_marker())
1828           {
1829             *err = true;
1830             return 0;            
1831           }
1832           break;
1833       }
1834     }
1835   }
1836 
1837   // Finds the start of image (SOI) marker.
1838   // This code is rather defensive: it only checks the first 512 bytes to avoid
1839   // false positives.
1840   // return false on I/O error
1841   bool locate_soi_marker () {
1842     uint lastchar, thischar;
1843     uint bytesleft;
1844 
1845     bool err;
1846     lastchar = get_bits(8, &err);
1847     if (err)
1848         return false;
1849     thischar = get_bits(8, &err);
1850     if (err)
1851         return false;
1852 
1853     /* ok if it's a normal JPEG file without a special header */
1854 
1855     if ((lastchar == 0xFF) && (thischar == M_SOI))
1856       return true;
1857 
1858     bytesleft = 4096; //512;
1859 
1860     for ( ; ; )
1861     {
1862       if (--bytesleft == 0)
1863       {
1864         set_error(JPGD_NOT_JPEG);
1865         return false;
1866       }
1867 
1868       lastchar = thischar;
1869 
1870       thischar = get_bits(8, &err);
1871       if (err)
1872           return false;
1873 
1874       if (lastchar == 0xFF)
1875       {
1876         if (thischar == M_SOI)
1877           break;
1878         else if (thischar == M_EOI) // get_bits will keep returning M_EOI if we read past the end
1879         {
1880           set_error(JPGD_NOT_JPEG);
1881           return false;
1882         }
1883       }
1884     }
1885 
1886     // Check the next character after marker: if it's not 0xFF, it can't be the start of the next marker, so the file is bad.
1887     thischar = (m_bit_buf >> 24) & 0xFF;
1888 
1889     if (thischar != 0xFF)
1890     {
1891       set_error(JPGD_NOT_JPEG);
1892       return false;
1893     }
1894     return true;
1895   }
1896 
1897   // Find a start of frame (SOF) marker.
1898   bool locate_sof_marker () {
1899     if (!locate_soi_marker())
1900         return false;
1901 
1902     bool err;
1903     int c = process_markers(&err);
1904     if (err)
1905         return false;
1906 
1907     switch (c)
1908     {
1909       case M_SOF2:
1910         m_progressive_flag = true;
1911         goto case;
1912       case M_SOF0:  /* baseline DCT */
1913       case M_SOF1:  /* extended sequential DCT */
1914         if (!read_sof_marker())
1915         {
1916             return false;
1917         }
1918         break;
1919       case M_SOF9:  /* Arithmitic coding */
1920         set_error(JPGD_NO_ARITHMITIC_SUPPORT);
1921         return false;
1922 
1923       default:
1924         set_error(JPGD_UNSUPPORTED_MARKER);
1925         return false;
1926     }
1927     return true;
1928   }
1929 
1930   // Find a start of scan (SOS) marker.
1931   int locate_sos_marker (bool* err) {
1932     int c;
1933 
1934     c = process_markers(err);
1935     if (*err)
1936         return false;
1937 
1938     if (c == M_EOI)
1939       return false;
1940     else if (c != M_SOS)
1941     {
1942         *err = true;
1943         set_error(JPGD_UNEXPECTED_MARKER);
1944         return false;
1945     }
1946 
1947     if (!read_sos_marker())
1948     {
1949         *err = true;
1950         return false;
1951     }
1952 
1953     return true;
1954   }
1955 
1956   // Reset everything to default/uninitialized state.
1957   // Return true on success
1958   bool initit (JpegStreamReadFunc rfn, void* userData) 
1959   {
1960     m_pMem_blocks = null;
1961     m_error_code = JPGD_SUCCESS;
1962     m_ready_flag = false;
1963     m_image_x_size = m_image_y_size = 0;
1964     readfn = rfn;
1965     this.userData = userData;
1966     m_progressive_flag = false;
1967 
1968     memset(m_huff_ac.ptr, 0, m_huff_ac.sizeof);
1969     memset(m_huff_num.ptr, 0, m_huff_num.sizeof);
1970     memset(m_huff_val.ptr, 0, m_huff_val.sizeof);
1971     memset(m_quant.ptr, 0, m_quant.sizeof);
1972 
1973     m_scan_type = 0;
1974     m_comps_in_frame = 0;
1975 
1976     memset(m_comp_h_samp.ptr, 0, m_comp_h_samp.sizeof);
1977     memset(m_comp_v_samp.ptr, 0, m_comp_v_samp.sizeof);
1978     memset(m_comp_quant.ptr, 0, m_comp_quant.sizeof);
1979     memset(m_comp_ident.ptr, 0, m_comp_ident.sizeof);
1980     memset(m_comp_h_blocks.ptr, 0, m_comp_h_blocks.sizeof);
1981     memset(m_comp_v_blocks.ptr, 0, m_comp_v_blocks.sizeof);
1982 
1983     m_comps_in_scan = 0;
1984     memset(m_comp_list.ptr, 0, m_comp_list.sizeof);
1985     memset(m_comp_dc_tab.ptr, 0, m_comp_dc_tab.sizeof);
1986     memset(m_comp_ac_tab.ptr, 0, m_comp_ac_tab.sizeof);
1987 
1988     m_spectral_start = 0;
1989     m_spectral_end = 0;
1990     m_successive_low = 0;
1991     m_successive_high = 0;
1992     m_max_mcu_x_size = 0;
1993     m_max_mcu_y_size = 0;
1994     m_blocks_per_mcu = 0;
1995     m_max_blocks_per_row = 0;
1996     m_mcus_per_row = 0;
1997     m_mcus_per_col = 0;
1998     m_expanded_blocks_per_component = 0;
1999     m_expanded_blocks_per_mcu = 0;
2000     m_expanded_blocks_per_row = 0;
2001     m_freq_domain_chroma_upsample = false;
2002 
2003     memset(m_mcu_org.ptr, 0, m_mcu_org.sizeof);
2004 
2005     m_total_lines_left = 0;
2006     m_mcu_lines_left = 0;
2007     m_real_dest_bytes_per_scan_line = 0;
2008     m_dest_bytes_per_scan_line = 0;
2009     m_dest_bytes_per_pixel = 0;
2010 
2011     memset(m_pHuff_tabs.ptr, 0, m_pHuff_tabs.sizeof);
2012 
2013     memset(m_dc_coeffs.ptr, 0, m_dc_coeffs.sizeof);
2014     memset(m_ac_coeffs.ptr, 0, m_ac_coeffs.sizeof);
2015     memset(m_block_y_mcu.ptr, 0, m_block_y_mcu.sizeof);
2016 
2017     m_eob_run = 0;
2018 
2019     memset(m_block_y_mcu.ptr, 0, m_block_y_mcu.sizeof);
2020 
2021     m_pIn_buf_ofs = m_in_buf.ptr;
2022     m_in_buf_left = 0;
2023     m_eof_flag = false;
2024     m_tem_flag = 0;
2025 
2026     memset(m_in_buf_pad_start.ptr, 0, m_in_buf_pad_start.sizeof);
2027     memset(m_in_buf.ptr, 0, m_in_buf.sizeof);
2028     memset(m_in_buf_pad_end.ptr, 0, m_in_buf_pad_end.sizeof);
2029 
2030     m_restart_interval = 0;
2031     m_restarts_left    = 0;
2032     m_next_restart_num = 0;
2033 
2034     m_max_mcus_per_row = 0;
2035     m_max_blocks_per_mcu = 0;
2036     m_max_mcus_per_col = 0;
2037 
2038     memset(m_last_dc_val.ptr, 0, m_last_dc_val.sizeof);
2039     m_pMCU_coefficients = null;
2040     m_pSample_buf = null;
2041 
2042     m_total_bytes_read = 0;
2043 
2044     m_pScan_line_0 = null;
2045     m_pScan_line_1 = null;
2046 
2047     // Ready the input buffer.
2048     if (!prep_in_buffer())
2049         return false;
2050 
2051     // Prime the bit buffer.
2052     m_bits_left = 16;
2053     m_bit_buf = 0;
2054 
2055     bool err;
2056     get_bits(16, &err);
2057     if (err) return false;
2058     get_bits(16, &err);
2059     if (err) return false;
2060 
2061     for (int i = 0; i < JPGD_MAX_BLOCKS_PER_MCU; i++)
2062       m_mcu_block_max_zag.ptr[i] = 64;
2063 
2064     return true;
2065   }
2066 
2067   enum SCALEBITS = 16;
2068   enum ONE_HALF = (cast(int) 1 << (SCALEBITS-1));
2069   enum FIX(float x) = (cast(int)((x) * (1L<<SCALEBITS) + 0.5f));
2070 
2071   // Create a few tables that allow us to quickly convert YCbCr to RGB.
2072   void create_look_ups () {
2073     for (int i = 0; i <= 255; i++)
2074     {
2075       int k = i - 128;
2076       m_crr.ptr[i] = ( FIX!(1.40200f)  * k + ONE_HALF) >> SCALEBITS;
2077       m_cbb.ptr[i] = ( FIX!(1.77200f)  * k + ONE_HALF) >> SCALEBITS;
2078       m_crg.ptr[i] = (-FIX!(0.71414f)) * k;
2079       m_cbg.ptr[i] = (-FIX!(0.34414f)) * k + ONE_HALF;
2080     }
2081   }
2082 
2083   // This method throws back into the stream any bytes that where read
2084   // into the bit buffer during initial marker scanning.
2085   bool fix_in_buffer () {
2086     // In case any 0xFF's where pulled into the buffer during marker scanning.
2087     assert((m_bits_left & 7) == 0);
2088 
2089     if (m_bits_left == 16)
2090       stuff_char(cast(ubyte)(m_bit_buf & 0xFF));
2091 
2092     if (m_bits_left >= 8)
2093       stuff_char(cast(ubyte)((m_bit_buf >> 8) & 0xFF));
2094 
2095     stuff_char(cast(ubyte)((m_bit_buf >> 16) & 0xFF));
2096     stuff_char(cast(ubyte)((m_bit_buf >> 24) & 0xFF));
2097 
2098     m_bits_left = 16;
2099     bool err;
2100     get_bits_no_markers(16, &err);
2101     if (err) return false;
2102     get_bits_no_markers(16, &err);
2103     if (err) return false;
2104     return true;
2105   }
2106 
2107   void transform_mcu (int mcu_row) {
2108     jpgd_block_t* pSrc_ptr = m_pMCU_coefficients;
2109     ubyte* pDst_ptr = m_pSample_buf + mcu_row * m_blocks_per_mcu * 64;
2110 
2111     for (int mcu_block = 0; mcu_block < m_blocks_per_mcu; mcu_block++)
2112     {
2113       idct(pSrc_ptr, pDst_ptr, m_mcu_block_max_zag.ptr[mcu_block]);
2114       pSrc_ptr += 64;
2115       pDst_ptr += 64;
2116     }
2117   }
2118 
2119   static immutable ubyte[64] s_max_rc = [
2120     17, 18, 34, 50, 50, 51, 52, 52, 52, 68, 84, 84, 84, 84, 85, 86, 86, 86, 86, 86,
2121     102, 118, 118, 118, 118, 118, 118, 119, 120, 120, 120, 120, 120, 120, 120, 136,
2122     136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136,
2123     136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136
2124   ];
2125 
2126   void transform_mcu_expand (int mcu_row) {
2127     jpgd_block_t* pSrc_ptr = m_pMCU_coefficients;
2128     ubyte* pDst_ptr = m_pSample_buf + mcu_row * m_expanded_blocks_per_mcu * 64;
2129 
2130     // Y IDCT
2131     int mcu_block;
2132     for (mcu_block = 0; mcu_block < m_expanded_blocks_per_component; mcu_block++)
2133     {
2134       idct(pSrc_ptr, pDst_ptr, m_mcu_block_max_zag.ptr[mcu_block]);
2135       pSrc_ptr += 64;
2136       pDst_ptr += 64;
2137     }
2138 
2139     // Chroma IDCT, with upsampling
2140     jpgd_block_t[64] temp_block;
2141 
2142     for (int i = 0; i < 2; i++)
2143     {
2144       DCT_Upsample.Matrix44 P, Q, R, S;
2145 
2146       assert(m_mcu_block_max_zag.ptr[mcu_block] >= 1);
2147       assert(m_mcu_block_max_zag.ptr[mcu_block] <= 64);
2148 
2149       int max_zag = m_mcu_block_max_zag.ptr[mcu_block++] - 1;
2150       if (max_zag <= 0) max_zag = 0; // should never happen, only here to shut up static analysis
2151       switch (s_max_rc.ptr[max_zag])
2152       {
2153       case 1*16+1:
2154         DCT_Upsample.P_Q!(1, 1).calc(P, Q, pSrc_ptr);
2155         DCT_Upsample.R_S!(1, 1).calc(R, S, pSrc_ptr);
2156         break;
2157       case 1*16+2:
2158         DCT_Upsample.P_Q!(1, 2).calc(P, Q, pSrc_ptr);
2159         DCT_Upsample.R_S!(1, 2).calc(R, S, pSrc_ptr);
2160         break;
2161       case 2*16+2:
2162         DCT_Upsample.P_Q!(2, 2).calc(P, Q, pSrc_ptr);
2163         DCT_Upsample.R_S!(2, 2).calc(R, S, pSrc_ptr);
2164         break;
2165       case 3*16+2:
2166         DCT_Upsample.P_Q!(3, 2).calc(P, Q, pSrc_ptr);
2167         DCT_Upsample.R_S!(3, 2).calc(R, S, pSrc_ptr);
2168         break;
2169       case 3*16+3:
2170         DCT_Upsample.P_Q!(3, 3).calc(P, Q, pSrc_ptr);
2171         DCT_Upsample.R_S!(3, 3).calc(R, S, pSrc_ptr);
2172         break;
2173       case 3*16+4:
2174         DCT_Upsample.P_Q!(3, 4).calc(P, Q, pSrc_ptr);
2175         DCT_Upsample.R_S!(3, 4).calc(R, S, pSrc_ptr);
2176         break;
2177       case 4*16+4:
2178         DCT_Upsample.P_Q!(4, 4).calc(P, Q, pSrc_ptr);
2179         DCT_Upsample.R_S!(4, 4).calc(R, S, pSrc_ptr);
2180         break;
2181       case 5*16+4:
2182         DCT_Upsample.P_Q!(5, 4).calc(P, Q, pSrc_ptr);
2183         DCT_Upsample.R_S!(5, 4).calc(R, S, pSrc_ptr);
2184         break;
2185       case 5*16+5:
2186         DCT_Upsample.P_Q!(5, 5).calc(P, Q, pSrc_ptr);
2187         DCT_Upsample.R_S!(5, 5).calc(R, S, pSrc_ptr);
2188         break;
2189       case 5*16+6:
2190         DCT_Upsample.P_Q!(5, 6).calc(P, Q, pSrc_ptr);
2191         DCT_Upsample.R_S!(5, 6).calc(R, S, pSrc_ptr);
2192         break;
2193       case 6*16+6:
2194         DCT_Upsample.P_Q!(6, 6).calc(P, Q, pSrc_ptr);
2195         DCT_Upsample.R_S!(6, 6).calc(R, S, pSrc_ptr);
2196         break;
2197       case 7*16+6:
2198         DCT_Upsample.P_Q!(7, 6).calc(P, Q, pSrc_ptr);
2199         DCT_Upsample.R_S!(7, 6).calc(R, S, pSrc_ptr);
2200         break;
2201       case 7*16+7:
2202         DCT_Upsample.P_Q!(7, 7).calc(P, Q, pSrc_ptr);
2203         DCT_Upsample.R_S!(7, 7).calc(R, S, pSrc_ptr);
2204         break;
2205       case 7*16+8:
2206         DCT_Upsample.P_Q!(7, 8).calc(P, Q, pSrc_ptr);
2207         DCT_Upsample.R_S!(7, 8).calc(R, S, pSrc_ptr);
2208         break;
2209       case 8*16+8:
2210         DCT_Upsample.P_Q!(8, 8).calc(P, Q, pSrc_ptr);
2211         DCT_Upsample.R_S!(8, 8).calc(R, S, pSrc_ptr);
2212         break;
2213       default:
2214         assert(false);
2215       }
2216 
2217       auto a = DCT_Upsample.Matrix44(P + Q);
2218       P -= Q;
2219       DCT_Upsample.Matrix44* b = &P;
2220       auto c = DCT_Upsample.Matrix44(R + S);
2221       R -= S;
2222       DCT_Upsample.Matrix44* d = &R;
2223 
2224       DCT_Upsample.Matrix44.add_and_store(temp_block.ptr, a, c);
2225       idct_4x4(temp_block.ptr, pDst_ptr);
2226       pDst_ptr += 64;
2227 
2228       DCT_Upsample.Matrix44.sub_and_store(temp_block.ptr, a, c);
2229       idct_4x4(temp_block.ptr, pDst_ptr);
2230       pDst_ptr += 64;
2231 
2232       DCT_Upsample.Matrix44.add_and_store(temp_block.ptr, *b, *d);
2233       idct_4x4(temp_block.ptr, pDst_ptr);
2234       pDst_ptr += 64;
2235 
2236       DCT_Upsample.Matrix44.sub_and_store(temp_block.ptr, *b, *d);
2237       idct_4x4(temp_block.ptr, pDst_ptr);
2238       pDst_ptr += 64;
2239 
2240       pSrc_ptr += 64;
2241     }
2242   }
2243 
2244   // Loads and dequantizes the next row of (already decoded) coefficients.
2245   // Progressive images only.
2246   void load_next_row () {
2247     int i;
2248     jpgd_block_t *p;
2249     jpgd_quant_t *q;
2250     int mcu_row, mcu_block, row_block = 0;
2251     int component_num, component_id;
2252     int[JPGD_MAX_COMPONENTS] block_x_mcu;
2253 
2254     memset(block_x_mcu.ptr, 0, JPGD_MAX_COMPONENTS * int.sizeof);
2255 
2256     for (mcu_row = 0; mcu_row < m_mcus_per_row; mcu_row++)
2257     {
2258       int block_x_mcu_ofs = 0, block_y_mcu_ofs = 0;
2259 
2260       for (mcu_block = 0; mcu_block < m_blocks_per_mcu; mcu_block++)
2261       {
2262         component_id = m_mcu_org.ptr[mcu_block];
2263         q = m_quant.ptr[m_comp_quant.ptr[component_id]];
2264 
2265         p = m_pMCU_coefficients + 64 * mcu_block;
2266 
2267         jpgd_block_t* pAC = coeff_buf_getp(m_ac_coeffs.ptr[component_id], block_x_mcu.ptr[component_id] + block_x_mcu_ofs, m_block_y_mcu.ptr[component_id] + block_y_mcu_ofs);
2268         jpgd_block_t* pDC = coeff_buf_getp(m_dc_coeffs.ptr[component_id], block_x_mcu.ptr[component_id] + block_x_mcu_ofs, m_block_y_mcu.ptr[component_id] + block_y_mcu_ofs);
2269         p[0] = pDC[0];
2270         memcpy(&p[1], &pAC[1], 63 * jpgd_block_t.sizeof);
2271 
2272         for (i = 63; i > 0; i--)
2273           if (p[g_ZAG[i]])
2274             break;
2275 
2276         m_mcu_block_max_zag.ptr[mcu_block] = i + 1;
2277 
2278         for ( ; i >= 0; i--)
2279           if (p[g_ZAG[i]])
2280             p[g_ZAG[i]] = cast(jpgd_block_t)(p[g_ZAG[i]] * q[i]);
2281 
2282         row_block++;
2283 
2284         if (m_comps_in_scan == 1)
2285           block_x_mcu.ptr[component_id]++;
2286         else
2287         {
2288           if (++block_x_mcu_ofs == m_comp_h_samp.ptr[component_id])
2289           {
2290             block_x_mcu_ofs = 0;
2291 
2292             if (++block_y_mcu_ofs == m_comp_v_samp.ptr[component_id])
2293             {
2294               block_y_mcu_ofs = 0;
2295 
2296               block_x_mcu.ptr[component_id] += m_comp_h_samp.ptr[component_id];
2297             }
2298           }
2299         }
2300       }
2301 
2302       if (m_freq_domain_chroma_upsample)
2303         transform_mcu_expand(mcu_row);
2304       else
2305         transform_mcu(mcu_row);
2306     }
2307 
2308     if (m_comps_in_scan == 1)
2309       m_block_y_mcu.ptr[m_comp_list.ptr[0]]++;
2310     else
2311     {
2312       for (component_num = 0; component_num < m_comps_in_scan; component_num++)
2313       {
2314         component_id = m_comp_list.ptr[component_num];
2315 
2316         m_block_y_mcu.ptr[component_id] += m_comp_v_samp.ptr[component_id];
2317       }
2318     }
2319   }
2320 
2321   // Restart interval processing.
2322   bool process_restart () {
2323     int i;
2324     int c = 0;
2325 
2326     // Align to a byte boundry
2327     // FIXME: Is this really necessary? get_bits_no_markers() never reads in markers!
2328     //get_bits_no_markers(m_bits_left & 7);
2329 
2330     bool err;
2331 
2332     // Let's scan a little bit to find the marker, but not _too_ far.
2333     // 1536 is a "fudge factor" that determines how much to scan.
2334     for (i = 1536; i > 0; i--)
2335     {
2336       if (get_char(&err) == 0xFF)
2337           break;
2338       if (err)
2339           return false;
2340     }
2341 
2342     if (i == 0)
2343     {
2344       set_error(JPGD_BAD_RESTART_MARKER);
2345       return false;
2346     }
2347 
2348     for ( ; i > 0; i--)
2349     {
2350       c = get_char(&err);
2351       if (err)
2352         return false;
2353       if (c != 0xFF)
2354         break;
2355     }
2356 
2357     if (i == 0)
2358     {
2359       set_error(JPGD_BAD_RESTART_MARKER);
2360       return false;
2361     }
2362 
2363     // Is it the expected marker? If not, something bad happened.
2364     if (c != (m_next_restart_num + M_RST0))
2365     {
2366       set_error(JPGD_BAD_RESTART_MARKER);
2367       return false;
2368     }
2369 
2370     // Reset each component's DC prediction values.
2371     memset(&m_last_dc_val, 0, m_comps_in_frame * uint.sizeof);
2372 
2373     m_eob_run = 0;
2374 
2375     m_restarts_left = m_restart_interval;
2376 
2377     m_next_restart_num = (m_next_restart_num + 1) & 7;
2378 
2379     // Get the bit buffer going again...
2380 
2381     m_bits_left = 16;
2382     get_bits_no_markers(16, &err);
2383     if (err)
2384         return false;
2385     get_bits_no_markers(16, &err);
2386     if (err)
2387         return false;
2388     return true;
2389   }
2390 
2391   // Decodes and dequantizes the next row of coefficients.
2392   bool decode_next_row () {
2393     int row_block = 0;
2394 
2395     bool err;
2396 
2397     for (int mcu_row = 0; mcu_row < m_mcus_per_row; mcu_row++)
2398     {
2399       if ((m_restart_interval) && (m_restarts_left == 0))
2400       {
2401         if (!process_restart())
2402             return false;
2403       }
2404 
2405       jpgd_block_t* p = m_pMCU_coefficients;
2406       for (int mcu_block = 0; mcu_block < m_blocks_per_mcu; mcu_block++, p += 64)
2407       {
2408         int component_id = m_mcu_org.ptr[mcu_block];
2409         jpgd_quant_t* q = m_quant.ptr[m_comp_quant.ptr[component_id]];
2410 
2411         int r, s;
2412         s = huff_decode(m_pHuff_tabs.ptr[m_comp_dc_tab.ptr[component_id]], r, &err);
2413         if (err)
2414             return false;
2415         s = JPGD_HUFF_EXTEND(r, s);
2416 
2417         m_last_dc_val.ptr[component_id] = (s += m_last_dc_val.ptr[component_id]);
2418 
2419         p[0] = cast(jpgd_block_t)(s * q[0]);
2420 
2421         int prev_num_set = m_mcu_block_max_zag.ptr[mcu_block];
2422 
2423         huff_tables *pH = m_pHuff_tabs.ptr[m_comp_ac_tab.ptr[component_id]];
2424 
2425         int k;
2426         for (k = 1; k < 64; k++)
2427         {
2428           int extra_bits;
2429           s = huff_decode(pH, extra_bits, &err);
2430           if (err)
2431               return false;
2432 
2433           r = s >> 4;
2434           s &= 15;
2435 
2436           if (s)
2437           {
2438             if (r)
2439             {
2440               if ((k + r) > 63)
2441               {
2442                 set_error(JPGD_DECODE_ERROR);
2443                 return false;
2444               }
2445 
2446               if (k < prev_num_set)
2447               {
2448                 int n = JPGD_MIN(r, prev_num_set - k);
2449                 int kt = k;
2450                 while (n--)
2451                   p[g_ZAG[kt++]] = 0;
2452               }
2453 
2454               k += r;
2455             }
2456 
2457             s = JPGD_HUFF_EXTEND(extra_bits, s);
2458 
2459             assert(k < 64);
2460 
2461             p[g_ZAG[k]] = cast(jpgd_block_t)( s * q[k] ); // dequantize
2462           }
2463           else
2464           {
2465             if (r == 15)
2466             {
2467               if ((k + 16) > 64)
2468               {
2469                 set_error(JPGD_DECODE_ERROR);
2470                 return false;
2471               }
2472 
2473               if (k < prev_num_set)
2474               {
2475                 int n = JPGD_MIN(16, prev_num_set - k);
2476                 int kt = k;
2477                 while (n--)
2478                 {
2479                   assert(kt <= 63);
2480                   p[g_ZAG[kt++]] = 0;
2481                 }
2482               }
2483 
2484               k += 16 - 1; // - 1 because the loop counter is k
2485               assert(p[g_ZAG[k]] == 0);
2486             }
2487             else
2488               break;
2489           }
2490         }
2491 
2492         if (k < prev_num_set)
2493         {
2494           int kt = k;
2495           while (kt < prev_num_set)
2496             p[g_ZAG[kt++]] = 0;
2497         }
2498 
2499         m_mcu_block_max_zag.ptr[mcu_block] = k;
2500 
2501         row_block++;
2502       }
2503 
2504       if (m_freq_domain_chroma_upsample)
2505         transform_mcu_expand(mcu_row);
2506       else
2507         transform_mcu(mcu_row);
2508 
2509       m_restarts_left--;
2510     }
2511     return true;
2512   }
2513 
2514   // YCbCr H1V1 (1x1:1:1, 3 m_blocks per MCU) to RGB
2515   // Reads one 8-sample line from every MCU in m_pSample_buf (Y at +0, Cb at +64,
2516   // Cr at +128) and writes 4 bytes per pixel (R, G, B, 255) to m_pScan_line_0.
2517   void H1V1Convert () {
2518     immutable int line = m_max_mcu_y_size - m_mcu_lines_left;
2519     ubyte* dst = m_pScan_line_0;
2520     ubyte* src = m_pSample_buf + line * 8;
2521     __m128i zeroes = _mm_setzero_si128();
2522 
2523     for (int mcu = 0; mcu < m_max_mcus_per_row; ++mcu)
2524     {
2525       for (int x = 0; x < 8; ++x)
2526       {
2527         immutable int luma = src[x];
2528         immutable int cb = src[64 + x];
2529         immutable int cr = src[128 + x];
2530 
2531         // Lanes 0..3 hold R, G, B and the constant 255.
2532         __m128i px = _mm_setr_epi32(luma + m_crr.ptr[cr],
2533                                     luma + ((m_crg.ptr[cr] + m_cbg.ptr[cb]) >> 16),
2534                                     luma + m_cbb.ptr[cb],
2535                                     255);
2536         // Saturating narrow i32 -> i16 -> u8, then store the low 4 bytes.
2537         px = _mm_packus_epi16(_mm_packs_epi32(px, zeroes), zeroes);
2538         _mm_storeu_si32(dst, px);
2539         dst += 4;
2540       }
2541 
2542       src += 64*3; // three 64-byte blocks per MCU
2543     }
2544   }
2543 
2544   // YCbCr H2V1 (2x1:1:1, 4 m_blocks per MCU) to RGB
2545   // Every Cb/Cr sample is applied to two horizontally adjacent luma samples.
2546   // Output is 4 bytes per pixel (R, G, B, 255) in m_pScan_line_0.
2547   void H2V1Convert () 
2548   {
2549     immutable int line = m_max_mcu_y_size - m_mcu_lines_left;
2550     ubyte* dst = m_pScan_line_0;
2551     ubyte* luma = m_pSample_buf + line * 8;
2552     ubyte* chroma = m_pSample_buf + 2*64 + line * 8;
2553 
2554     for (int mcu = 0; mcu < m_max_mcus_per_row; ++mcu)
2555     {
2556       // Two 64-byte luma blocks per MCU, four chroma samples for each.
2557       for (int blk = 0; blk < 2; ++blk)
2558       {
2559         for (int pair = 0; pair < 4; ++pair)
2560         {
2561           immutable int cb = chroma[0];
2562           immutable int cr = chroma[64];
2563 
2564           immutable int rc = m_crr.ptr[cr];
2565           immutable int gc = (m_crg.ptr[cr] + m_cbg.ptr[cb]) >> 16;
2566           immutable int bc = m_cbb.ptr[cb];
2567 
2568           // First, then second pixel sharing this chroma sample.
2569           foreach (px; 0 .. 2)
2570           {
2571             immutable int yy = luma[(pair << 1) + px];
2572             dst[0] = clamp(yy + rc);
2573             dst[1] = clamp(yy + gc);
2574             dst[2] = clamp(yy + bc);
2575             dst[3] = 255;
2576             dst += 4;
2577           }
2578 
2579           ++chroma;
2580         }
2581         luma += 64;
2582       }
2583 
2584       // Step both pointers to the same line of the next MCU.
2585       luma += 64*4 - 64*2;
2586       chroma += 64*4 - 8;
2587     }
2588   }
2588 
2589   // YCbCr H1V2 (1x2:1:1, 4 m_blocks per MCU) to RGB
2590   // Produces two output lines per call: every Cb/Cr sample is applied to two
2591   // vertically adjacent luma samples, written to m_pScan_line_0 and
2592   // m_pScan_line_1 as 4 bytes per pixel (R, G, B, 255).
2593   void H1V2Convert () {
2594     immutable int line = m_max_mcu_y_size - m_mcu_lines_left;
2595     ubyte* upper = m_pScan_line_0;
2596     ubyte* lower = m_pScan_line_1;
2597 
2598     // Lines 8 and up are taken from the second 64-byte luma block.
2599     ubyte* luma = (line < 8)
2600       ? m_pSample_buf + line * 8
2601       : m_pSample_buf + 64*1 + (line & 7) * 8;
2602 
2603     ubyte* chroma = m_pSample_buf + 64*2 + (line >> 1) * 8;
2604 
2605     for (int mcu = 0; mcu < m_max_mcus_per_row; ++mcu)
2606     {
2607       for (int x = 0; x < 8; ++x)
2608       {
2609         immutable int cb = chroma[0 + x];
2610         immutable int cr = chroma[64 + x];
2611 
2612         immutable int rc = m_crr.ptr[cr];
2613         immutable int gc = (m_crg.ptr[cr] + m_cbg.ptr[cb]) >> 16;
2614         immutable int bc = m_cbb.ptr[cb];
2615 
2616         immutable int top = luma[x];
2617         upper[0] = clamp(top + rc);
2618         upper[1] = clamp(top + gc);
2619         upper[2] = clamp(top + bc);
2620         upper[3] = 255;
2621         upper += 4;
2622 
2623         immutable int bottom = luma[8 + x];
2624         lower[0] = clamp(bottom + rc);
2625         lower[1] = clamp(bottom + gc);
2626         lower[2] = clamp(bottom + bc);
2627         lower[3] = 255;
2628         lower += 4;
2629       }
2630 
2631       luma += 64*4;
2632       chroma += 64*4;
2633     }
2634   }
2635 
2636   // YCbCr H2V2 (2x2:1:1, 6 m_blocks per MCU) to RGB
2637   // Produces two output lines per call: every Cb/Cr sample is applied to a
2638   // 2x2 group of luma samples. Pixels are 4 bytes (R, G, B, 255); the upper
2639   // line goes to m_pScan_line_0 and the lower one to m_pScan_line_1.
2640   void H2V2Convert () {
2641     immutable int line = m_max_mcu_y_size - m_mcu_lines_left;
2642     ubyte* upper = m_pScan_line_0;
2643     ubyte* lower = m_pScan_line_1;
2644 
2645     // Lines 8 and up are taken from the luma blocks starting at offset 64*2.
2646     ubyte* luma = (line < 8)
2647       ? m_pSample_buf + line * 8
2648       : m_pSample_buf + 64*2 + (line & 7) * 8;
2649 
2650     ubyte* chroma = m_pSample_buf + 64*4 + (line >> 1) * 8;
2651 
2652     for (int mcu = 0; mcu < m_max_mcus_per_row; ++mcu)
2653     {
2654       for (int blk = 0; blk < 2; ++blk)
2655       {
2656         for (int x = 0; x < 8; x += 2)
2657         {
2658           immutable int cb = chroma[0];
2659           immutable int cr = chroma[64];
2660 
2661           immutable int rc = m_crr.ptr[cr];
2662           immutable int gc = (m_crg.ptr[cr] + m_cbg.ptr[cb]) >> 16;
2663           immutable int bc = m_cbb.ptr[cb];
2664 
2665           // Upper pair of the 2x2 group.
2666           foreach (px; 0 .. 2)
2667           {
2668             immutable int yy = luma[x + px];
2669             upper[px*4 + 0] = clamp(yy + rc);
2670             upper[px*4 + 1] = clamp(yy + gc);
2671             upper[px*4 + 2] = clamp(yy + bc);
2672             upper[px*4 + 3] = 255;
2673           }
2674 
2675           // Lower pair: same columns, next luma row (8 samples further on).
2676           foreach (px; 0 .. 2)
2677           {
2678             immutable int yy = luma[x + 8 + px];
2679             lower[px*4 + 0] = clamp(yy + rc);
2680             lower[px*4 + 1] = clamp(yy + gc);
2681             lower[px*4 + 2] = clamp(yy + bc);
2682             lower[px*4 + 3] = 255;
2683           }
2684 
2685           upper += 8;
2686           lower += 8;
2687 
2688           ++chroma;
2689         }
2690         luma += 64;
2691       }
2692 
2693       // Step both pointers to the same line of the next MCU.
2694       luma += 64*6 - 64*2;
2695       chroma += 64*6 - 8;
2696     }
2697   }
2700 
2701   // Y (1 block per MCU) to 8-bit grayscale
2702   // Copies the current 8-sample line of every 64-byte block in m_pSample_buf
2703   // to m_pScan_line_0, eight bytes per MCU, as two 32-bit words.
2704   void gray_convert () {
2705     immutable int line = m_max_mcu_y_size - m_mcu_lines_left;
2706     uint* dst = cast(uint*)m_pScan_line_0;
2707     uint* src = cast(uint*)(m_pSample_buf + line * 8);
2708 
2709     for (int mcu = 0; mcu < m_max_mcus_per_row; ++mcu)
2710     {
2711       dst[0] = src[0];
2712       dst[1] = src[1];
2713 
2714       src += 64 / uint.sizeof; // advance by one 64-byte block
2715       dst += 2;                // advance by the 8 bytes just written
2716     }
2717   }
2716 
2717 
2718   // Converts one line of the expanded sample buffer to pixels in m_pScan_line_0.
2719   // Y, Cb and Cr are all read at the same resolution here: within each MCU the
2720   // Cb samples sit 64 * m_expanded_blocks_per_component bytes after the Y
2721   // samples, and the Cr samples twice that far.
2722   // Four pixels are converted per SIMD step and stored as 16 bytes
2723   // (R, G, B, 255 for each pixel). The colour terms are computed with vector
2724   // multiplies instead of the m_crr/m_crg/m_cbg/m_cbb lookup tables.
2725   void expanded_convert () 
2726   {
2727     int row = m_max_mcu_y_size - m_mcu_lines_left;
2728 
2729     ubyte* Py = m_pSample_buf + (row / 8) * 64 * m_comp_h_samp.ptr[0] + (row & 7) * 8;
2730 
2731     ubyte* d = m_pScan_line_0;
2732 
2733     // Loop-invariant vector constants, built once instead of once per 4 pixels.
2734     __m128i zero        = _mm_setzero_si128();
2735     __m128i mm_128      = _mm_set1_epi32(128);
2736     __m128i mm_ONE_HALF = _mm_set1_epi32(ONE_HALF);
2737     __m128i mm_255      = _mm_set1_epi32(255);
2738     __m128i mm_k_crr    = _mm_set1_epi32( FIX!(1.40200f) );
2739     __m128i mm_k_crg    = _mm_set1_epi32(-FIX!(0.71414f) );
2740     __m128i mm_k_cbg    = _mm_set1_epi32(-FIX!(0.34414f) );
2741     __m128i mm_k_cbb    = _mm_set1_epi32( FIX!(1.77200f) );
2742 
2743     for (int i = m_max_mcus_per_row; i > 0; i--)
2744     {
2745       for (int k = 0; k < m_max_mcu_x_size; k += 8)
2746       {
2747         immutable int Y_ofs = k * 8;
2748         immutable int Cb_ofs = Y_ofs + 64 * m_expanded_blocks_per_component;
2749         immutable int Cr_ofs = Y_ofs + 64 * m_expanded_blocks_per_component * 2;
2750 
2751         for (int j = 0; j + 3 < 8; j += 4)
2752         {
2753             __m128i mm_y  = _mm_loadu_si32(cast(__m128i*) &Py[Y_ofs + j]);
2754             __m128i mm_cb = _mm_loadu_si32(cast(__m128i*) &Py[Cb_ofs + j]);
2755             __m128i mm_cr = _mm_loadu_si32(cast(__m128i*) &Py[Cr_ofs + j]);
2756 
2757             // Extend u8 to i32
2758             mm_y  = _mm_unpacklo_epi8(mm_y, zero);
2759             mm_cb = _mm_unpacklo_epi8(mm_cb, zero);
2760             mm_cr = _mm_unpacklo_epi8(mm_cr, zero);
2761             mm_y  = _mm_unpacklo_epi16(mm_y, zero);
2762             mm_cb = _mm_unpacklo_epi16(mm_cb, zero);
2763             mm_cr = _mm_unpacklo_epi16(mm_cr, zero);
2764 
2765             // Avoid table here, since we use SIMD. The scalar tables are:
2766             //m_crr.ptr[i] = ( FIX!(1.40200f)  * (i - 128) + ONE_HALF) >> SCALEBITS;
2767             //m_crg.ptr[i] = (-FIX!(0.71414f)) * (i - 128);
2768             //m_cbg.ptr[i] = (-FIX!(0.34414f)) * (i - 128) + ONE_HALF;
2769             //m_cbb.ptr[i] = ( FIX!(1.77200f)  * (i - 128) + ONE_HALF) >> SCALEBITS;
2770 
2771             // Centre the chroma samples around zero once; each is used twice.
2772             __m128i mm_cr_c = mm_cr - mm_128;
2773             __m128i mm_cb_c = mm_cb - mm_128;
2774 
2775             // PERF: would be faster better as short multiplication here, 
2776             // do we need that much precision???
2777             __m128i mm_crr = _mm_mullo_epi32 (mm_cr_c, mm_k_crr);
2778             __m128i mm_crg = _mm_mullo_epi32 (mm_cr_c, mm_k_crg);
2779             __m128i mm_cbg = _mm_mullo_epi32 (mm_cb_c, mm_k_cbg);
2780             __m128i mm_cbb = _mm_mullo_epi32 (mm_cb_c, mm_k_cbb);
2781 
2782             mm_crr += mm_ONE_HALF;
2783             mm_cbg += mm_ONE_HALF;
2784             mm_cbb += mm_ONE_HALF;
2785             mm_crr = _mm_srai_epi32(mm_crr, 16);
2786             mm_cbb = _mm_srai_epi32(mm_cbb, 16);
2787 
2788             mm_crg = _mm_srai_epi32(mm_crg + mm_cbg, 16);
2789             mm_crr += mm_y;
2790             mm_crg += mm_y;
2791             mm_cbb += mm_y;
2792 
2793             // We want to store these bytes (read in cols)
2794             //    |
2795             //    v 
2796             //    A         B          C        D
2797             // mm_crr[0] mm_crr[1] mm_crr[2] mm_crr[3]
2798             // mm_crg[0] mm_crg[1] mm_crg[2] mm_crg[3]
2799             // mm_cbb[0] mm_cbb[1] mm_cbb[2] mm_cbb[3]
2800             //   255       255        255      255
2801 
2802             // 4x4 transpose
2803             __m128 A = cast(__m128) mm_crr;
2804             __m128 B = cast(__m128) mm_crg;
2805             __m128 C = cast(__m128) mm_cbb;
2806             __m128 D = cast(__m128) mm_255;
2807             _MM_TRANSPOSE4_PS(A, B, C, D);
2808 
2809             // Now pack: i32 -> i16 -> u8 with saturation, 16 bytes = 4 pixels.
2810             __m128i Ai = _mm_packs_epi32(cast(__m128i)A, cast(__m128i)B);
2811             __m128i Ci = _mm_packs_epi32(cast(__m128i)C, cast(__m128i)D);
2812             Ai = _mm_packus_epi16(Ai, Ci);
2813             _mm_storeu_si128(cast(__m128i*) &d[0], Ai);
2814             d += 16;
2815         }
2816       }
2817 
2818       Py += 64 * m_expanded_blocks_per_mcu;
2819     }
2820   }
2811 
2812   // Find end of image (EOI) marker, so we can return to the user the exact size of the input stream.
2813   // For non-progressive images the bit buffer is primed with two 16-bit reads
2814   // and the following marker is processed. In every case the bytes still
2815   // unread in the input buffer are subtracted from m_total_bytes_read.
2816   // Returns false as soon as one of the reads or the marker processing fails.
2817   bool find_eoi () {
2818     if (!m_progressive_flag)
2819     {
2820       bool failed;
2821 
2822       // Prime the bit buffer
2823       m_bits_left = 16;
2824       foreach (pass; 0 .. 2)
2825       {
2826         get_bits(16, &failed);
2827         if (failed)
2828           return false;
2829       }
2830 
2831       // The next marker _should_ be EOI
2832       process_markers(&failed);
2833       if (failed)
2834         return false;
2835     }
2836 
2837     m_total_bytes_read -= m_in_buf_left;
2838     return true;
2839   }
2835 
2836   // Creates the tables needed for efficient Huffman decoding.
       //
       // Params:
       //   index = which Huffman table to build; selects m_huff_num[index] (symbol
       //           counts per code length), m_huff_val[index] (symbols) and
       //           m_huff_ac[index].
       //   pH    = destination; its look_up, look_up2, tree and code_size arrays are
       //           cleared and refilled, and ac_table is set.
       //
       // Codes of at most 8 bits are resolved by a direct 256-entry lookup; longer
       // codes use the lookup entry as a negative reference into pH.tree.
2837   void make_huff_table (int index, huff_tables *pH) {
2838     int p, i, l, si;
2839     ubyte[257] huffsize;
2840     uint[257] huffcode;
2841     uint code;
2842     uint subtree;
2843     int code_size;
2844     int lastp;
2845     int nextfreeentry;
2846     int currententry;
2847 
2848     pH.ac_table = m_huff_ac.ptr[index] != 0;
2849 
2850     p = 0;
2851 
         // Expand the per-length counts into huffsize: one entry per symbol holding
         // its code length, in order of increasing length.
2852     for (l = 1; l <= 16; l++)
2853     {
2854       for (i = 1; i <= m_huff_num.ptr[index][l]; i++)
2855         huffsize.ptr[p++] = cast(ubyte)(l);
2856     }
2857 
         // Zero terminator; the code-assignment loop below stops on it.
2858     huffsize.ptr[p] = 0;
2859 
2860     lastp = p;
2861 
2862     code = 0;
2863     si = huffsize.ptr[0];
2864     p = 0;
2865 
         // Assign code values: consecutive values within one length, and one left
         // shift of the running code each time the length grows by a bit.
2866     while (huffsize.ptr[p])
2867     {
2868       while (huffsize.ptr[p] == si)
2869       {
2870         huffcode.ptr[p++] = code;
2871         code++;
2872       }
2873 
2874       code <<= 1;
2875       si++;
2876     }
2877 
2878     memset(pH.look_up.ptr, 0, pH.look_up.sizeof);
2879     memset(pH.look_up2.ptr, 0, pH.look_up2.sizeof);
2880     memset(pH.tree.ptr, 0, pH.tree.sizeof);
2881     memset(pH.code_size.ptr, 0, pH.code_size.sizeof);
2882 
         // Tree slots are referenced by negative numbers: reference -n is stored at
         // tree[n - 1] (see the -currententry - 1 indexing below).
2883     nextfreeentry = -1;
2884 
2885     p = 0;
2886 
2887     while (p < lastp)
2888     {
2889       i = m_huff_val.ptr[index][p];
2890       code = huffcode.ptr[p];
2891       code_size = huffsize.ptr[p];
2892 
2893       pH.code_size.ptr[i] = cast(ubyte)(code_size);
2894 
2895       if (code_size <= 8)
2896       {
             // Short code: left-align it in 8 bits and fill every lookup slot that
             // starts with it, i.e. 1 << (8 - code_size) consecutive slots.
2897         code <<= (8 - code_size);
2898 
2899         for (l = 1 << (8 - code_size); l > 0; l--)
2900         {
2901           assert(i < 256);
2902 
2903           pH.look_up.ptr[code] = i;
2904 
2905           bool has_extrabits = false;
2906           int extra_bits = 0;
               // The low nibble of the symbol is taken as the number of extra bits
               // that follow the code.
2907           int num_extra_bits = i & 15;
2908 
2909           int bits_to_fetch = code_size;
2910           if (num_extra_bits)
2911           {
2912             int total_codesize = code_size + num_extra_bits;
2913             if (total_codesize <= 8)
2914             {
                   // Code plus extra bits fit in the 8-bit slot index, so the
                   // extra bits are extracted from the index itself.
2915               has_extrabits = true;
2916               extra_bits = ((1 << num_extra_bits) - 1) & (code >> (8 - total_codesize));
2917               assert(extra_bits <= 0x7FFF);
2918               bits_to_fetch += num_extra_bits;
2919             }
2920           }
2921 
               // look_up2 layout: bits 0-7 symbol, bits 8 and up the number of bits
               // to consume; when 0x8000 is set, bits 16 and up hold the extra bits.
2922           if (!has_extrabits)
2923             pH.look_up2.ptr[code] = i | (bits_to_fetch << 8);
2924           else
2925             pH.look_up2.ptr[code] = i | 0x8000 | (extra_bits << 16) | (bits_to_fetch << 8);
2926 
2927           code++;
2928         }
2929       }
2930       else
2931       {
             // Long code: its first 8 bits select the lookup slot, which holds a
             // negative tree reference; the remaining bits walk the tree.
2932         subtree = (code >> (code_size - 8)) & 0xFF;
2933 
2934         currententry = pH.look_up.ptr[subtree];
2935 
2936         if (currententry == 0)
2937         {
               // First code with this 8-bit prefix: reserve a pair of tree slots.
2938           pH.look_up.ptr[subtree] = currententry = nextfreeentry;
2939           pH.look_up2.ptr[subtree] = currententry = nextfreeentry;
2940 
2941           nextfreeentry -= 2;
2942         }
2943 
             // Move the bits after the 8-bit prefix up so the next one to examine
             // is at mask 0x8000.
2944         code <<= (16 - (code_size - 8));
2945 
2946         for (l = code_size; l > 9; l--)
2947         {
               // A 0 bit selects the second slot of the current pair.
2948           if ((code & 0x8000) == 0)
2949             currententry--;
2950 
2951           if (pH.tree.ptr[-currententry - 1] == 0)
2952           {
                 // No child yet: link in a freshly reserved pair of slots.
2953             pH.tree.ptr[-currententry - 1] = nextfreeentry;
2954 
2955             currententry = nextfreeentry;
2956 
2957             nextfreeentry -= 2;
2958           }
2959           else
2960             currententry = pH.tree.ptr[-currententry - 1];
2961 
2962           code <<= 1;
2963         }
2964 
             // The last bit of the code picks the slot that stores the symbol.
2965         if ((code & 0x8000) == 0)
2966           currententry--;
2967 
2968         pH.tree.ptr[-currententry - 1] = i;
2969       }
2970 
2971       p++;
2972     }
2973   }
2974 
2975   // Verifies the quantization tables needed for this scan are available.
2976   // Reports JPGD_UNDEFINED_QUANT_TABLE and returns false for the first scan
2977   // component whose quantization table entry is null; returns true otherwise.
2978   bool check_quant_tables () {
2979     int scan_idx = 0;
2980     while (scan_idx < m_comps_in_scan)
2981     {
2982       auto comp = m_comp_list.ptr[scan_idx];
2983       auto table = m_comp_quant.ptr[comp];
2984 
2985       if (m_quant.ptr[table] == null)
2986       {
2987         set_error(JPGD_UNDEFINED_QUANT_TABLE);
2988         return false;
2989       }
2990 
2991       ++scan_idx;
2992     }
2993     return true;
2994   }
2987 
2988   // Verifies that all the Huffman tables needed for this scan are available.
2989   // A DC table is required when m_spectral_start is 0 and an AC table when
2990   // m_spectral_end is above 0. Afterwards the decoding tables are (re)built for
2991   // every defined Huffman table, allocating the huff_tables storage on first use.
2992   // Returns false if a required table is undefined or an allocation fails.
2993   bool check_huff_tables () {
2994     immutable bool needs_dc = (m_spectral_start == 0);
2995     immutable bool needs_ac = (m_spectral_end > 0);
2996 
2997     for (int n = 0; n < m_comps_in_scan; ++n)
2998     {
2999       auto comp = m_comp_list.ptr[n];
3000 
3001       if ((needs_dc && (m_huff_num.ptr[m_comp_dc_tab.ptr[comp]] == null)) ||
3002           (needs_ac && (m_huff_num.ptr[m_comp_ac_tab.ptr[comp]] == null)))
3003       {
3004         set_error(JPGD_UNDEFINED_HUFF_TABLE);
3005         return false;
3006       }
3007     }
3008 
3009     for (int tab = 0; tab < JPGD_MAX_HUFF_TABLES; ++tab)
3010     {
3011       if (!m_huff_num.ptr[tab])
3012         continue;
3013 
3014       if (!m_pHuff_tabs.ptr[tab])
3015       {
3016         bool failed;
3017         m_pHuff_tabs.ptr[tab] = cast(huff_tables*)alloc(huff_tables.sizeof, false, &failed);
3018         if (failed)
3019           return false;
3020       }
3021 
3022       make_huff_table(tab, m_pHuff_tabs.ptr[tab]);
3023     }
3024 
3025     return true;
3026   }
3021 
3022   // Determines the component order inside each MCU.
3023   // Also calcs how many MCU's are on each row, etc.
3024   // Writes m_comp_h_blocks / m_comp_v_blocks for every frame component, then
3025   // m_mcus_per_row, m_mcus_per_col, m_mcu_org and m_blocks_per_mcu for the scan.
3026   void calc_mcu_block_order () {
3027     int h_max = 0;
3028     int v_max = 0;
3029 
3030     // Largest sampling factors over all components of the frame.
3031     for (int comp = 0; comp < m_comps_in_frame; ++comp)
3032     {
3033       if (m_comp_h_samp.ptr[comp] > h_max)
3034         h_max = m_comp_h_samp.ptr[comp];
3035 
3036       if (m_comp_v_samp.ptr[comp] > v_max)
3037         v_max = m_comp_v_samp.ptr[comp];
3038     }
3039 
3040     // Component size in 8x8 blocks: scale the image size by samp/max (rounding
3041     // up), then round up to whole blocks.
3042     for (int comp = 0; comp < m_comps_in_frame; ++comp)
3043     {
3044       immutable int scaled_w = ((m_image_x_size * m_comp_h_samp.ptr[comp]) + (h_max - 1)) / h_max;
3045       immutable int scaled_h = ((m_image_y_size * m_comp_v_samp.ptr[comp]) + (v_max - 1)) / v_max;
3046 
3047       m_comp_h_blocks.ptr[comp] = (scaled_w + 7) / 8;
3048       m_comp_v_blocks.ptr[comp] = (scaled_h + 7) / 8;
3049     }
3050 
3051     if (m_comps_in_scan == 1)
3052     {
3053       // Single-component scan: one block per MCU, MCU grid equals block grid.
3054       immutable int only = m_comp_list.ptr[0];
3055 
3056       m_mcus_per_row = m_comp_h_blocks.ptr[only];
3057       m_mcus_per_col = m_comp_v_blocks.ptr[only];
3058 
3059       m_mcu_org.ptr[0] = only;
3060       m_blocks_per_mcu = 1;
3061       return;
3062     }
3063 
3064     m_mcus_per_row = (((m_image_x_size + 7) / 8) + (h_max - 1)) / h_max;
3065     m_mcus_per_col = (((m_image_y_size + 7) / 8) + (v_max - 1)) / v_max;
3066 
3067     // Each scan component contributes h_samp * v_samp consecutive blocks.
3068     m_blocks_per_mcu = 0;
3069 
3070     for (int n = 0; n < m_comps_in_scan; ++n)
3071     {
3072       immutable int comp = m_comp_list.ptr[n];
3073       immutable int count = m_comp_h_samp.ptr[comp] * m_comp_v_samp.ptr[comp];
3074 
3075       for (int left = count; left != 0; --left)
3076         m_mcu_org.ptr[m_blocks_per_mcu++] = comp;
3077     }
3078   }
3077 
3078   // Starts a new scan.
3079   //
3080   // Params:
3081   //   err = out flag; set to true when scan setup fails. It is also passed to
3082   //         locate_sos_marker(), which may set it.
3083   // Returns: nonzero when a scan was located and set up. Zero when
3084   //   locate_sos_marker() found no scan, or on failure (then *err is true).
3085   int init_scan (bool* err) {
3086 
3087     if (!locate_sos_marker(err))
3088       return false;
3089 
3090     if (*err)
3091         return false;
3092 
3093     calc_mcu_block_order();
3094 
3095     // A missing Huffman table or a failed table allocation must stop the scan:
3096     // decoding would otherwise go on with tables that were never built.
3097     if (!check_huff_tables())
3098     {
3099         *err = true;
3100         return false;
3101     }
3102 
3103     // Flag the failure through *err so the zero return is not mistaken for
3104     // "no scan found".
3105     if (!check_quant_tables())
3106     {
3107         *err = true;
3108         return false;
3109     }
3110 
3111     memset(m_last_dc_val.ptr, 0, m_comps_in_frame * uint.sizeof);
3112 
3113     m_eob_run = 0;
3114 
3115     if (m_restart_interval)
3116     {
3117       m_restarts_left = m_restart_interval;
3118       m_next_restart_num = 0;
3119     }
3120 
3121     if (!fix_in_buffer())
3122     {
3123         *err = true;
3124         return false;
3125     }
3126 
3127     return true;
3128   }
3112 
3113   // Starts a frame. Determines if the number of components or sampling factors
3114   // are supported.
3115   // On success the scan type and MCU geometry are set, the scan-line, coefficient
3116   // and sample buffers are allocated, and the colour look-up tables are created.
3117   // Return true on success.
3118   bool init_frame () 
3119   {
3120     if ((m_comps_in_frame != 1) && (m_comps_in_frame != 3))
3121     {
3122       set_error(JPGD_UNSUPPORTED_COLORSPACE);
3123       return false;
3124     }
3125 
3126     if (m_comps_in_frame == 1)
3127     {
3128       // Grayscale: the single component must be 1x1.
3129       if ((m_comp_h_samp.ptr[0] != 1) || (m_comp_v_samp.ptr[0] != 1))
3130       {
3131         set_error(JPGD_UNSUPPORTED_SAMP_FACTORS);
3132         return false;
3133       }
3134 
3135       m_scan_type = JPGD_GRAYSCALE;
3136       m_max_blocks_per_mcu = 1;
3137       m_max_mcu_x_size = 8;
3138       m_max_mcu_y_size = 8;
3139     }
3140     else
3141     {
3142       // Three components: both chroma components must be 1x1.
3143       immutable bool chroma_ok =
3144         (m_comp_h_samp.ptr[1] == 1) && (m_comp_v_samp.ptr[1] == 1) &&
3145         (m_comp_h_samp.ptr[2] == 1) && (m_comp_v_samp.ptr[2] == 1);
3146 
3147       if (!chroma_ok)
3148       {
3149         set_error(JPGD_UNSUPPORTED_SAMP_FACTORS);
3150         return false;
3151       }
3152 
3153       // The first component may be 1 or 2 in each direction.
3154       immutable int h = m_comp_h_samp.ptr[0];
3155       immutable int v = m_comp_v_samp.ptr[0];
3156 
3157       if (((h != 1) && (h != 2)) || ((v != 1) && (v != 2)))
3158       {
3159         set_error(JPGD_UNSUPPORTED_SAMP_FACTORS);
3160         return false;
3161       }
3162 
3163       if (h == 1)
3164         m_scan_type = (v == 1) ? JPGD_YH1V1 : JPGD_YH1V2;
3165       else
3166         m_scan_type = (v == 1) ? JPGD_YH2V1 : JPGD_YH2V2;
3167 
3168       // h*v blocks of the first component plus one block for each chroma
3169       // component: 3, 4, 4 or 6.
3170       m_max_blocks_per_mcu = h * v + 2;
3171       m_max_mcu_x_size = 8 * h;
3172       m_max_mcu_y_size = 8 * v;
3173     }
3174 
3175     m_max_mcus_per_row = (m_image_x_size + (m_max_mcu_x_size - 1)) / m_max_mcu_x_size;
3176     m_max_mcus_per_col = (m_image_y_size + (m_max_mcu_y_size - 1)) / m_max_mcu_y_size;
3177 
3178     // These values are for the *destination* pixels: after conversion.
3179     m_dest_bytes_per_pixel = (m_scan_type == JPGD_GRAYSCALE) ? 1 : 4;
3180 
3181     m_dest_bytes_per_scan_line = ((m_image_x_size + 15) & 0xFFF0) * m_dest_bytes_per_pixel;
3182 
3183     m_real_dest_bytes_per_scan_line = (m_image_x_size * m_dest_bytes_per_pixel);
3184 
3185     bool failed;
3186 
3187     // Initialize two scan line buffers (the second only for the scan types
3188     // that emit two lines per conversion).
3189     m_pScan_line_0 = cast(ubyte*)alloc(m_dest_bytes_per_scan_line, true, &failed);
3190     if (failed)
3191       return false;
3192 
3193     if ((m_scan_type == JPGD_YH1V2) || (m_scan_type == JPGD_YH2V2))
3194     {
3195       m_pScan_line_1 = cast(ubyte*)alloc(m_dest_bytes_per_scan_line, true, &failed);
3196       if (failed)
3197         return false;
3198     }
3199 
3200     m_max_blocks_per_row = m_max_mcus_per_row * m_max_blocks_per_mcu;
3201 
3202     // Should never happen
3203     if (m_max_blocks_per_row > JPGD_MAX_BLOCKS_PER_ROW)
3204     {
3205       set_error(JPGD_ASSERTION_ERROR);
3206       return false;
3207     }
3208 
3209     // Allocate the coefficient buffer, enough for one MCU
3210     m_pMCU_coefficients = cast(jpgd_block_t*)alloc(m_max_blocks_per_mcu * 64 * jpgd_block_t.sizeof, false, &failed);
3211     if (failed)
3212       return false;
3213 
3214     for (int blk = 0; blk < m_max_blocks_per_mcu; ++blk)
3215       m_mcu_block_max_zag.ptr[blk] = 64;
3216 
3217     m_expanded_blocks_per_component = m_comp_h_samp.ptr[0] * m_comp_v_samp.ptr[0];
3218     m_expanded_blocks_per_mcu = m_expanded_blocks_per_component * m_comps_in_frame;
3219     m_expanded_blocks_per_row = m_max_mcus_per_row * m_expanded_blocks_per_mcu;
3220 
3221     // Freq. domain chroma upsampling is only supported for H2V2 subsampling factor (the most common one I've seen).
3222     m_freq_domain_chroma_upsample = false;
3223     version(JPGD_SUPPORT_FREQ_DOMAIN_UPSAMPLING) {
3224       m_freq_domain_chroma_upsample = (m_expanded_blocks_per_mcu == 4*3);
3225     }
3226 
3227     // The sample buffer holds one row of blocks; the expanded layout is used
3228     // when upsampling in the frequency domain.
3229     if (m_freq_domain_chroma_upsample)
3230       m_pSample_buf = cast(ubyte*)alloc(m_expanded_blocks_per_row * 64, false, &failed);
3231     else
3232       m_pSample_buf = cast(ubyte*)alloc(m_max_blocks_per_row * 64, false, &failed);
3233 
3234     if (failed)
3235       return false;
3236 
3237     m_total_lines_left = m_image_y_size;
3238 
3239     m_mcu_lines_left = 0;
3240 
3241     create_look_ups();    
3242     return true;
3243   }
3255 
3256   // The coeff_buf series of methods originally stored the coefficients
3257   // into a "virtual" file which was located in EMS, XMS, or a disk file. A cache
3258   // was used to make this process more efficient. Now, we can store the entire
3259   // thing in RAM.
3260   //
3261   // Allocates a coeff_buf describing block_num_x * block_num_y blocks of
3262   // block_len_x * block_len_y coefficients each, plus its zeroed data area.
3263   // *err is cleared on entry; on allocation failure it is set by alloc() and
3264   // null is returned.
3265   coeff_buf* coeff_buf_open(int block_num_x, int block_num_y, int block_len_x, int block_len_y, bool* err) 
3266   {
3267     *err = false;
3268 
3269     auto buf = cast(coeff_buf*)alloc(coeff_buf.sizeof, false, err);
3270     if (*err)
3271       return null;
3272 
3273     buf.block_num_x = block_num_x;
3274     buf.block_num_y = block_num_y;
3275     buf.block_len_x = block_len_x;
3276     buf.block_len_y = block_len_y;
3277     buf.block_size = (block_len_x * block_len_y) * cast(int)(jpgd_block_t.sizeof);
3278 
3279     buf.pData = cast(ubyte*)alloc(buf.block_size * block_num_x * block_num_y, true, err);
3280 
3281     // TODO: leak here? (the header allocated above is not released on failure)
3282     return (*err) ? null : buf;
3283   }
3277 
3278   jpgd_block_t* coeff_buf_getp (coeff_buf *cb, int block_x, int block_y) {
3279     // Blocks are stored row by row: block_size bytes per block,
3280     // block_num_x blocks per row.
3281     assert((block_x < cb.block_num_x) && (block_y < cb.block_num_y));
3282     ubyte* in_row = cb.pData + block_x * cb.block_size;
3283     ubyte* where = in_row + block_y * (cb.block_size * cb.block_num_x);
3284     return cast(jpgd_block_t*)where;
3285   }
3282 
3283   // The following methods decode the various types of m_blocks encountered
3284   // in progressively encoded images.
3285   //
3286   // First DC pass: decodes the DC difference for one block, adds it to the
3287   // component's running DC value and stores the result, shifted left by
3288   // m_successive_low, as coefficient 0. Returns false if reading the stream fails.
3289   static bool decode_block_dc_first (ref jpeg_decoder pD, int component_id, int block_x, int block_y) {
3290     jpgd_block_t* coef = pD.coeff_buf_getp(pD.m_dc_coeffs.ptr[component_id], block_x, block_y);
3291 
3292     bool failed;
3293     int diff = pD.huff_decode(pD.m_pHuff_tabs.ptr[pD.m_comp_dc_tab.ptr[component_id]], &failed);
3294     if (failed)
3295       return false;
3296 
3297     // A nonzero symbol is the number of additional bits holding the value.
3298     if (diff != 0)
3299     {
3300       int raw = pD.get_bits_no_markers(diff, &failed);
3301       if (failed)
3302         return false;
3303       diff = JPGD_HUFF_EXTEND(raw, diff);
3304     }
3305 
3306     diff += pD.m_last_dc_val.ptr[component_id];
3307     pD.m_last_dc_val.ptr[component_id] = diff;
3308 
3309     coef[0] = cast(jpgd_block_t)(diff << pD.m_successive_low);
3310     return true;
3311   }
3307 
3308   // DC refinement pass: reads one bit and, when it is set, ORs the bit at
3309   // position m_successive_low into the block's DC coefficient.
3310   // Returns false if reading the bit fails; the coefficient is then left
3311   // untouched.
3312   static bool decode_block_dc_refine (ref jpeg_decoder pD, int component_id, int block_x, int block_y) {
3313     bool err;
3314     auto bit = pD.get_bits_no_markers(1, &err);
3315 
3316     // Check the error flag before acting on the value that was returned.
3317     if (err)
3318         return false;
3319 
3320     if (bit)
3321     {
3322       jpgd_block_t *p = pD.coeff_buf_getp(pD.m_dc_coeffs.ptr[component_id], block_x, block_y);
3323 
3324       p[0] |= (1 << pD.m_successive_low);
3325     }
3326     return true;
3327   }
3320 
3321   // First AC pass for one block over m_spectral_start..m_spectral_end.
3322   // While an end-of-band run is pending the block is skipped and the run is
3323   // decremented. Otherwise each decoded symbol is split into a run (high
3324   // nibble) and a size (low nibble): a nonzero size stores one coefficient
3325   // after skipping `run` positions, run 15 with size 0 skips 16 positions, and
3326   // any other run with size 0 starts an end-of-band run.
3327   // Returns false on a read failure or when the position passes 63
3328   // (JPGD_DECODE_ERROR).
3329   static bool decode_block_ac_first (ref jpeg_decoder pD, int component_id, int block_x, int block_y) 
3330   {
3331     if (pD.m_eob_run)
3332     {
3333       pD.m_eob_run--;
3334       return true;
3335     }
3336 
3337     bool failed;
3338 
3339     jpgd_block_t* coefs = pD.coeff_buf_getp(pD.m_ac_coeffs.ptr[component_id], block_x, block_y);
3340 
3341     for (int k = pD.m_spectral_start; k <= pD.m_spectral_end; k++)
3342     {
3343       int sym = pD.huff_decode(pD.m_pHuff_tabs.ptr[pD.m_comp_ac_tab.ptr[component_id]], &failed);
3344       if (failed)
3345         return false;
3346 
3347       int run = sym >> 4;
3348       int size = sym & 15;
3349 
3350       if (size)
3351       {
3352         k += run;
3353         if (k > 63)
3354         {
3355           pD.set_error(JPGD_DECODE_ERROR);
3356           return false;
3357         }
3358 
3359         int raw = pD.get_bits_no_markers(size, &failed);
3360         if (failed)
3361           return false;
3362 
3363         int value = JPGD_HUFF_EXTEND(raw, size);
3364         coefs[g_ZAG[k]] = cast(jpgd_block_t)(value << pD.m_successive_low);
3365         continue;
3366       }
3367 
3368       if (run == 15)
3369       {
3370         // 15 here plus the loop increment: sixteen positions skipped.
3371         k += 15;
3372         if (k > 63)
3373         {
3374           pD.set_error(JPGD_DECODE_ERROR);
3375           return false;
3376         }
3377         continue;
3378       }
3379 
3380       // End of band: run length is 2^run plus `run` extra bits; the current
3381       // block accounts for one of them.
3382       pD.m_eob_run = 1 << run;
3383 
3384       if (run)
3385       {
3386         pD.m_eob_run += pD.get_bits_no_markers(run, &failed);
3387         if (failed)
3388           return false;
3389       }
3390 
3391       pD.m_eob_run--;
3392       break;
3393     }
3394     return true;
3395   }
3385 
3386   // AC refinement pass for one block over m_spectral_start..m_spectral_end.
3387   //
3388   // With no end-of-band run pending, each decoded symbol either introduces one
3389   // new coefficient of value +p1 or m1 (size nibble 1), skips zero-valued
3390   // positions (run nibble 15), or starts an end-of-band run. While walking
3391   // over positions, every coefficient that is already nonzero reads one
3392   // correction bit. With an end-of-band run pending, only those correction
3393   // bits are read for the rest of the band and the run is decremented.
3394   //
3395   // Returns false on any read failure or on a size nibble other than 0 or 1
3396   // (JPGD_DECODE_ERROR). Every bit read is checked for failure before its
3397   // value is used.
3398   static bool decode_block_ac_refine (ref jpeg_decoder pD, int component_id, int block_x, int block_y) {
3399     int s, k, r;
3400     int p1 = 1 << pD.m_successive_low;    // value added to refine a non-negative coefficient
3401     int m1 = (-1) << pD.m_successive_low; // value added to refine a negative coefficient
3402     jpgd_block_t *p = pD.coeff_buf_getp(pD.m_ac_coeffs.ptr[component_id], block_x, block_y);
3403 
3404     assert(pD.m_spectral_end <= 63);
3405 
3406     k = pD.m_spectral_start;
3407 
3408     bool err;
3409     if (pD.m_eob_run == 0)
3410     {
3411       for ( ; k <= pD.m_spectral_end; k++)
3412       {
3413         s = pD.huff_decode(pD.m_pHuff_tabs.ptr[pD.m_comp_ac_tab.ptr[component_id]], &err);
3414         if (err) return false;
3415 
3416         r = s >> 4;
3417         s &= 15;
3418 
3419         if (s)
3420         {
3421           if (s != 1)
3422           {
3423             pD.set_error(JPGD_DECODE_ERROR);
3424             return false;
3425           }
3426 
3427           // Sign bit of the new coefficient: 1 selects p1, 0 selects m1.
3428           auto sign_bit = pD.get_bits_no_markers(1, &err);
3429           if (err)
3430             return false;
3431           s = sign_bit ? p1 : m1;
3432         }
3433         else
3434         {
3435           if (r != 15)
3436           {
3437             pD.m_eob_run = 1 << r;
3438 
3439             if (r)
3440             {
3441               pD.m_eob_run += pD.get_bits_no_markers(r, &err);
3442               if (err)
3443                 return false;
3444             }
3445 
3446             break;
3447           }
3448         }
3449 
3450         // Advance over the band: refine nonzero coefficients on the way and
3451         // count down r on zero ones; stop on the zero position after r zeros.
3452         do
3453         {
3454           jpgd_block_t *this_coef = p + g_ZAG[k & 63];
3455 
3456           if (*this_coef != 0)
3457           {
3458             auto fix_bit = pD.get_bits_no_markers(1, &err);
3459             if (err)
3460               return false;
3461 
3462             if (fix_bit)
3463             {
3464               if ((*this_coef & p1) == 0)
3465               {
3466                 if (*this_coef >= 0)
3467                   *this_coef = cast(jpgd_block_t)(*this_coef + p1);
3468                 else
3469                   *this_coef = cast(jpgd_block_t)(*this_coef + m1);
3470               }
3471             }
3472           }
3473           else
3474           {
3475             if (--r < 0)
3476               break;
3477           }
3478 
3479           k++;
3480 
3481         } while (k <= pD.m_spectral_end);
3482 
3483         // Store the newly introduced coefficient, if any, at the position reached.
3484         if ((s) && (k < 64))
3485         {
3486           p[g_ZAG[k]] = cast(jpgd_block_t)(s);
3487         }
3488       }
3489     }
3490 
3491     if (pD.m_eob_run > 0)
3492     {
3493       for ( ; k <= pD.m_spectral_end; k++)
3494       {
3495         jpgd_block_t *this_coef = p + g_ZAG[k & 63]; // logical AND to shut up static code analysis
3496 
3497         if (*this_coef != 0)
3498         {
3499           auto fix_bit = pD.get_bits_no_markers(1, &err);
3500           if (err)
3501             return false;
3502 
3503           if (fix_bit)
3504           {
3505             if ((*this_coef & p1) == 0)
3506             {
3507               if (*this_coef >= 0)
3508                 *this_coef = cast(jpgd_block_t)(*this_coef + p1);
3509               else
3510                 *this_coef = cast(jpgd_block_t)(*this_coef + m1);
3511             }
3512           }
3513         }
3514       }
3515 
3516       pD.m_eob_run--;
3517     }
3518     return true;
3519   }
3505 
3506   // Decode a scan in a progressively encoded image.
3507   // Visits every MCU of the scan (m_mcus_per_col rows of m_mcus_per_row MCUs)
3508   // and calls decode_block_func once per block with that block's coordinates
3509   // inside its component. Restart handling runs before an MCU whenever a
3510   // restart interval is set and m_restarts_left has reached zero.
3511   // Returns false as soon as a restart or a block decode fails.
3512   bool decode_scan (pDecode_block_func decode_block_func) {
3513     int[JPGD_MAX_COMPONENTS] block_x; // per component: first block column of the current MCU
3514     int[JPGD_MAX_COMPONENTS] block_y; // per component: first block row of the current MCU row
3515 
3516     block_y[] = 0;
3517 
3518     for (int mcu_y = 0; mcu_y < m_mcus_per_col; ++mcu_y)
3519     {
3520       block_x[] = 0;
3521 
3522       for (int mcu_x = 0; mcu_x < m_mcus_per_row; ++mcu_x)
3523       {
3524         int x_ofs = 0;
3525         int y_ofs = 0;
3526 
3527         if ((m_restart_interval) && (m_restarts_left == 0) && !process_restart())
3528           return false;
3529 
3530         for (int blk = 0; blk < m_blocks_per_mcu; ++blk)
3531         {
3532           int comp = m_mcu_org.ptr[blk];
3533 
3534           if (!decode_block_func(this, comp, block_x.ptr[comp] + x_ofs, block_y.ptr[comp] + y_ofs))
3535             return false;
3536 
3537           if (m_comps_in_scan == 1)
3538           {
3539             block_x.ptr[comp]++;
3540             continue;
3541           }
3542 
3543           // Interleaved scan: step through the component's h_samp x v_samp
3544           // blocks, row by row, then move on to the next MCU's columns.
3545           if (++x_ofs != m_comp_h_samp.ptr[comp])
3546             continue;
3547           x_ofs = 0;
3548 
3549           if (++y_ofs != m_comp_v_samp.ptr[comp])
3550             continue;
3551           y_ofs = 0;
3552 
3553           block_x.ptr[comp] += m_comp_h_samp.ptr[comp];
3554         }
3555 
3556         m_restarts_left--;
3557       }
3558 
3559       // Advance every scanned component to its next row of blocks.
3560       if (m_comps_in_scan == 1)
3561       {
3562         block_y.ptr[m_comp_list.ptr[0]]++;
3563       }
3564       else
3565       {
3566         for (int n = 0; n < m_comps_in_scan; ++n)
3567         {
3568           int comp = m_comp_list.ptr[n];
3569           block_y.ptr[comp] += m_comp_v_samp.ptr[comp];
3570         }
3571       }
3572     }
3573     return true;
3574   }
3571 
3572   // Decode a progressively encoded image. Return true on success.
3573   bool init_progressive () {
3574     int i;
3575 
3576     if (m_comps_in_frame == 4)
3577     {
3578       set_error(JPGD_UNSUPPORTED_COLORSPACE);
3579       return false;
3580     }
3581 
3582     bool err;
3583 
3584     // Allocate the coefficient buffers.
3585     for (i = 0; i < m_comps_in_frame; i++)
3586     {
3587       m_dc_coeffs.ptr[i] = coeff_buf_open(m_max_mcus_per_row * m_comp_h_samp.ptr[i], m_max_mcus_per_col * m_comp_v_samp.ptr[i], 1, 1, &err);
3588       if (err)
3589           return false;
3590       m_ac_coeffs.ptr[i] = coeff_buf_open(m_max_mcus_per_row * m_comp_h_samp.ptr[i], m_max_mcus_per_col * m_comp_v_samp.ptr[i], 8, 8, &err);
3591       if (err)
3592           return false;
3593     }
3594 
3595     for ( ; ; )
3596     {
3597       int dc_only_scan, refinement_scan;
3598       pDecode_block_func decode_block_func;
3599 
3600       int scanInit = init_scan(&err);
3601       if (err)
3602           return false;
3603       if (!scanInit)
3604         break;
3605 
3606       dc_only_scan = (m_spectral_start == 0);
3607       refinement_scan = (m_successive_high != 0);
3608 
3609       if ((m_spectral_start > m_spectral_end) || (m_spectral_end > 63))
3610       {
3611         set_error(JPGD_BAD_SOS_SPECTRAL);
3612         return false;
3613       }
3614 
3615       if (dc_only_scan)
3616       {
3617         if (m_spectral_end)
3618         {
3619           set_error(JPGD_BAD_SOS_SPECTRAL);
3620           return false;
3621         }
3622       }
3623       else if (m_comps_in_scan != 1)  /* AC scans can only contain one component */
3624       {
3625         set_error(JPGD_BAD_SOS_SPECTRAL);
3626         return false;
3627       }
3628 
3629       if ((refinement_scan) && (m_successive_low != m_successive_high - 1))
3630       {
3631         set_error(JPGD_BAD_SOS_SUCCESSIVE);
3632         return false;
3633       }
3634 
3635       if (dc_only_scan)
3636       {
3637         if (refinement_scan)
3638           decode_block_func = &decode_block_dc_refine;
3639         else
3640           decode_block_func = &decode_block_dc_first;
3641       }
3642       else
3643       {
3644         if (refinement_scan)
3645           decode_block_func = &decode_block_ac_refine;
3646         else
3647           decode_block_func = &decode_block_ac_first;
3648       }
3649 
3650       if (!decode_scan(decode_block_func))
3651           return false;
3652 
3653       m_bits_left = 16;
3654       get_bits(16, &err);
3655       if (err)
3656           return false;
3657       get_bits(16, &err);
3658       if (err)
3659           return false;
3660     }
3661 
3662     m_comps_in_scan = m_comps_in_frame;
3663 
3664     for (i = 0; i < m_comps_in_frame; i++)
3665       m_comp_list.ptr[i] = i;
3666 
3667     calc_mcu_block_order();
3668     return true;
3669   }
3670 
3671   bool init_sequential () {
3672     bool err;
3673     if (!init_scan(&err))
3674     {
3675         set_error(JPGD_UNEXPECTED_MARKER);
3676         return false;
3677     }
3678     if (err)
3679         return false;
3680     return true;
3681   }
3682 
3683   bool decode_start () {
3684     bool success = init_frame();
3685     if (!success)
3686         return false;
3687 
3688     if (m_progressive_flag)
3689       return init_progressive();
3690     else
3691       return init_sequential();
3692   }
3693 
3694   bool decode_init (JpegStreamReadFunc rfn, void* userData) {
3695     bool success = initit(rfn, userData);
3696     if (!success)
3697         return false;
3698     return locate_sof_marker();
3699   }
3700 }
3701 
3702 // ////////////////////////////////////////////////////////////////////////// //
3703 /// decompress JPEG image, what else?
3704 /// you can specify required color components in `req_comps` (3 for RGB or 4 for RGBA), or leave it as is to use image value.
3705 /// Returns pixelAspectRatio and dotsPerInchY, -1 if not available.
public ubyte[] decompress_jpeg_image_from_stream(scope JpegStreamReadFunc rfn, void* userData,
                                                 out int width, out int height, out int actual_comps,
                                                 out float pixelAspectRatio, out float dotsPerInchY,
                                                 int req_comps=-1) {
  // Accepted requests: -1 (use the image's own component count), 1, 3 or 4.
  // The returned slice covers a buffer obtained from jpgd_malloc; null is
  // returned for bad arguments, decoder errors and allocation failure.
  if (rfn is null) return null;
  if (req_comps != -1 && req_comps != 1 && req_comps != 3 && req_comps != 4) return null;

  bool ctorFailed;
  auto decoder = jpeg_decoder(rfn, userData, &ctorFailed);
  if (ctorFailed || decoder.error_code != JPGD_SUCCESS) return null;
  version(jpegd_test) scope(exit) { import core.stdc.stdio : printf; printf("%u bytes read.\n", cast(uint)decoder.total_bytes_read); }

  immutable int cols = decoder.width;
  immutable int rows = decoder.height;
  width = cols;
  height = rows;
  // Reported as -1 until the image has been decoded completely.
  pixelAspectRatio = -1;
  dotsPerInchY = -1;
  actual_comps = decoder.num_components;
  if (req_comps < 0) req_comps = decoder.num_components;

  if (decoder.begin_decoding() != JPGD_SUCCESS) return null;

  immutable int rowBytes = cols*req_comps;
  immutable int totalBytes = rowBytes*rows;

  ubyte* pixels = cast(ubyte*)jpgd_malloc(totalBytes);
  if (pixels is null) return null;
  auto result = pixels[0..totalBytes];

  foreach (int y; 0 .. rows) {
    const(ubyte)* src;
    uint srcLen;
    if (decoder.decode(cast(void**)&src, &srcLen) != JPGD_SUCCESS) {
      // Give the partially filled buffer back before bailing out.
      jpgd_free(pixels);
      return null;
    }

    ubyte* dst = pixels + y*rowBytes;
    immutable int srcComps = decoder.num_components;

    if ((req_comps == 1 && srcComps == 1) || (req_comps == 4 && srcComps == 3)) {
      // Requested layout equals what the decoder delivered: copy the row verbatim.
      memcpy(dst, src, rowBytes);
    } else if (srcComps == 1) {
      if (req_comps == 3) {
        // One byte per source pixel -> three identical channel bytes.
        foreach (int x; 0 .. cols) {
          immutable ubyte luma = src[x];
          dst[0] = luma;
          dst[1] = luma;
          dst[2] = luma;
          dst += 3;
        }
      } else {
        // One byte per source pixel -> three identical channel bytes plus 255.
        foreach (int x; 0 .. cols) {
          immutable ubyte luma = src[x];
          dst[0] = luma;
          dst[1] = luma;
          dst[2] = luma;
          dst[3] = 255;
          dst += 4;
        }
      }
    } else if (srcComps == 3) {
      // The source row is read with a stride of four bytes per pixel.
      if (req_comps == 1) {
        // Weighted sum of the first three bytes in 16.16 fixed point, rounded.
        immutable int YR = 19595, YG = 38470, YB = 7471;
        foreach (int x; 0 .. cols) {
          const(ubyte)* px = src + x*4;
          immutable int r = px[0];
          immutable int g = px[1];
          immutable int b = px[2];
          *dst++ = cast(ubyte)((r * YR + g * YG + b * YB + 32768) >> 16);
        }
      } else {
        // Keep the first three bytes of every pixel and skip the fourth.
        foreach (int x; 0 .. cols) {
          const(ubyte)* px = src + x*4;
          dst[0] = px[0];
          dst[1] = px[1];
          dst[2] = px[2];
          dst += 3;
        }
      }
    }
  }

  pixelAspectRatio = decoder.m_pixelAspectRatio;
  dotsPerInchY = decoder.m_pixelsPerInchY;

  return result;
}