The OpenD Programming Language

1 /// BC7 encoding image loading.
2 /// D translation of bc7enc16 d3b037f33b8c6df184177a0ae6a0f4cfec1434ad
3 module gamut.codecs.bc7enc16;
4 
5 version(encodeDDS):
6 
7 
8 import core.stdc.string: memset, memcpy;
9 import std.math: abs, sqrt, floor;
10 import gamut.internals.mutex;
11 
12 // File: bc7enc16.h - Richard Geldreich, Jr. - MIT license or public domain (see end of bc7enc16.c)
13 
// Size of one encoded block (presumably in bytes -- TODO confirm against the block writer).
enum BC7ENC16_BLOCK_SIZE = 16;
// Upper bound documented for bc7enc16_compress_block_params.m_max_partitions_mode1.
enum BC7ENC16_MAX_PARTITIONS1 = 64;
// Upper bound documented for bc7enc16_compress_block_params.m_uber_level.
enum BC7ENC16_MAX_UBER_LEVEL = 4;

// Boolean type used throughout this module; stored as a single ubyte.
alias bc7enc16_bool = ubyte;
// The two values a bc7enc16_bool is expected to hold.
enum BC7ENC16_TRUE = 1;
enum BC7ENC16_FALSE = 0;
21 
22 nothrow @nogc @safe:
23 
// User-facing settings for the block compressor. Fill it with
// bc7enc16_compress_block_params_init() and then adjust individual fields.
struct bc7enc16_compress_block_params
{
    // m_max_partitions_mode1 may range from 0 (disables mode 1) to BC7ENC16_MAX_PARTITIONS1. The higher this value, the slower the compressor, but the higher the quality.
    uint m_max_partitions_mode1;
    
    // Relative RGBA or YCbCrA weights.
    uint[4] m_weights;
    
    // m_uber_level may range from 0 to BC7ENC16_MAX_UBER_LEVEL. The higher this value, the slower the compressor, but the higher the quality.
    uint m_uber_level;

    // If m_perceptual is true, colorspace error is computed in YCbCr space, otherwise RGB.
    bc7enc16_bool m_perceptual;

    // Set m_try_least_squares to false for slightly faster/lower quality compression.
    bc7enc16_bool m_try_least_squares;
    
    // When m_mode1_partition_estimation_filterbank is true, the mode 1 partition estimator skips lesser used partition patterns unless they are strongly predicted to be potentially useful.
    // There's a slight loss in quality with this enabled (around .08 dB RGB PSNR or .05 dB Y PSNR), but up to an 11% gain in speed depending on the other settings.
    bc7enc16_bool m_mode1_partition_estimation_filterbank;
}
45 
/// Selects the non-perceptual error metric: clears `m_perceptual` and gives
/// each of the four entries of `m_weights` a weight of 1.
void bc7enc16_compress_block_params_init_linear_weights(bc7enc16_compress_block_params *p) pure
{
    p.m_weights[] = 1;
    p.m_perceptual = BC7ENC16_FALSE;
}
54 
/// Selects the perceptual error metric: sets `m_perceptual` and loads the
/// four channel weights 128, 64, 16, 32 into `m_weights`.
void bc7enc16_compress_block_params_init_perceptual_weights(bc7enc16_compress_block_params *p) pure
{
    static immutable uint[4] perceptualWeights = [ 128, 64, 16, 32 ];

    p.m_weights = perceptualWeights;
    p.m_perceptual = BC7ENC16_TRUE;
}
63 
/// Fills `p` with the default settings: perceptual weights, the maximum number
/// of mode 1 partitions, least squares and the partition filterbank enabled,
/// and uber level 0.
void bc7enc16_compress_block_params_init(bc7enc16_compress_block_params *p) pure
{
    // Error metric and channel weights.
    bc7enc16_compress_block_params_init_perceptual_weights(p);

    // Search effort.
    p.m_uber_level = 0;
    p.m_max_partitions_mode1 = BC7ENC16_MAX_PARTITIONS1;

    // Optional refinements, both on by default.
    p.m_mode1_partition_estimation_filterbank = BC7ENC16_TRUE;
    p.m_try_least_squares = BC7ENC16_TRUE;
}
72 
73 
74 // File: bc7enc16.c - Richard Geldreich, Jr. 4/2018 - MIT license or public domain (see end of file)
75 
76 // Helpers
/// Clamps `value` to `[low, high]`. The lower bound is tested first, so it
/// wins if the bounds are passed in the wrong order.
int clampi(int value, int low, int high) pure
{
    if (value < low)
        return low;
    return (value > high) ? high : value;
}
85 
/// Clamps `value` to `[low, high]`. The lower bound is tested first; a NaN
/// input fails both comparisons and is returned unchanged.
float clampf(float value, float low, float high) pure
{
    if (value < low)
        return low;
    return (value > high) ? high : value;
}
94 
/// Clamps `value` to the unit interval [0, 1] using clampf.
float saturate(float value) pure
{
    const float clamped = clampf(value, 0, 1.0f);
    return clamped;
}
99 
/// Returns the smaller of two unsigned bytes.
ubyte minimumub(ubyte a, ubyte b) pure
{
    if (a < b)
        return a;
    return b;
}
104 
/// Returns the smaller of two unsigned 32-bit integers.
uint minimumu(uint a, uint b) pure
{
    if (a < b)
        return a;
    return b;
}
109 
/// Returns `a` when `a < b`, otherwise `b`; so `b` is returned whenever
/// either argument is NaN.
float minimumf(float a, float b) pure
{
    if (a < b)
        return a;
    return b;
}
114 
/// Returns the larger of two unsigned bytes.
ubyte maximumub(ubyte a, ubyte b) pure
{
    if (a > b)
        return a;
    return b;
}
119 
/// Returns the larger of two unsigned 32-bit integers.
uint maximumu(uint a, uint b) pure
{
    if (a > b)
        return a;
    return b;
}
124 
/// Returns `a` when `a > b`, otherwise `b`; so `b` is returned whenever
/// either argument is NaN.
float maximumf(float a, float b) pure
{
    if (a > b)
        return a;
    return b;
}
129 
/// Returns `i` multiplied by itself (ordinary wrapping int arithmetic).
int squarei(int i) pure
{
    const int squared = i * i;
    return squared;
}
134 
/// Returns `i` multiplied by itself.
float squaref(float i) pure
{
    const float squared = i * i;
    return squared;
}
139 
// Four 8-bit channel values. The helpers in this module fill the entries from
// (r, g, b, a) arguments in that order, i.e. m_c[0]=r ... m_c[3]=a.
struct color_quad_u8 
{ 
    ubyte[4] m_c; 
}
144 
// Four-component float vector; vec4F_from_color() fills it from the four
// channels of a color_quad_u8.
struct vec4F 
{ 
    float[4] m_c; 
}
149 
/// Stores (r, g, b, a) into `*pRes`, clamping each value to [0, 255] first.
/// Returns `pRes` so calls can be chained.
color_quad_u8 *color_quad_u8_set_clamped(color_quad_u8 *pRes, int r, int g, int b, int a) pure @system
{
    const int[4] channels = [ r, g, b, a ];
    foreach (idx, ch; channels)
        pRes.m_c[idx] = cast(ubyte)clampi(ch, 0, 255);
    return pRes;
}
158 
/// Stores (r, g, b, a) into `*pRes` without clamping. All four values must
/// already be in [0, 255]; this is only checked by an assert.
/// Returns `pRes` so calls can be chained.
color_quad_u8 *color_quad_u8_set(color_quad_u8 *pRes, int r, int g, int b, int a) pure @system
{
    // OR-ing the inputs catches any negative or >255 value in one comparison.
    assert(cast(uint)(r | g | b | a) <= 255);

    const int[4] channels = [ r, g, b, a ];
    foreach (idx, ch; channels)
        pRes.m_c[idx] = cast(ubyte)ch;
    return pRes;
}
168 
/// Returns BC7ENC16_TRUE if any of the four channels differ between the two
/// colors, BC7ENC16_FALSE if they are all equal.
bc7enc16_bool color_quad_u8_notequals(ref const(color_quad_u8) pLHS, ref const(color_quad_u8) pRHS) pure
{
    foreach (ch; 0 .. 4)
    {
        if (pLHS.m_c[ch] != pRHS.m_c[ch])
            return BC7ENC16_TRUE;
    }
    return BC7ENC16_FALSE;
}
176 
/// Sets all four components of `*pV` to `x` and returns `pV`.
vec4F* vec4F_set_scalar(vec4F *pV, float x) pure
{
    pV.m_c[] = x;
    return pV;
}
185 
/// Sets the components of `*pV` to (x, y, z, w) and returns `pV`.
vec4F* vec4F_set(vec4F *pV, float x, float y, float z, float w) pure
{
    const float[4] components = [ x, y, z, w ];
    pV.m_c = components;
    return pV;
}
194 
/// Clamps every component of `pV` to [0, 1], modifying it in place.
void vec4F_saturate_in_place(ref vec4F pV) pure
{
    foreach (ref component; pV.m_c)
        component = saturate(component);
}
202 
/// Returns a copy of `*pV` with every component clamped to [0, 1].
vec4F vec4F_saturate(const(vec4F)* pV) pure
{
    vec4F clamped;
    foreach (idx; 0 .. 4)
        clamped.m_c[idx] = saturate(pV.m_c[idx]);
    return clamped;
}
212 
/// Converts the four 8-bit channels of `*pC` to floats (values 0..255, no
/// normalization).
vec4F vec4F_from_color(const(color_quad_u8)* pC) pure @trusted
{
    vec4F converted;
    foreach (idx; 0 .. 4)
        converted.m_c[idx] = pC.m_c[idx];
    return converted;
}
219 
/// Component-wise sum `*pLHS + *pRHS`.
vec4F vec4F_add(const(vec4F)* pLHS, const(vec4F)* pRHS) pure @trusted
{
    vec4F sum;
    foreach (idx; 0 .. 4)
        sum.m_c[idx] = pLHS.m_c[idx] + pRHS.m_c[idx];
    return sum;
}
227 
/// Component-wise difference `*pLHS - *pRHS`.
vec4F vec4F_sub(const(vec4F)* pLHS, const(vec4F)* pRHS) pure @trusted
{
    vec4F difference;
    foreach (idx; 0 .. 4)
        difference.m_c[idx] = pLHS.m_c[idx] - pRHS.m_c[idx];
    return difference;
}
235 
/// Four-component dot product of `*pLHS` and `*pRHS`.
float vec4F_dot(const(vec4F)* pLHS, const(vec4F)* pRHS) pure
{
    const float[4] a = pLHS.m_c;
    const float[4] b = pRHS.m_c;

    // Kept as a single left-to-right expression so the summation order is unchanged.
    return a[0] * b[0] + a[1] * b[1]
         + a[2] * b[2] + a[3] * b[3];
}
241 
/// Returns `*pLHS` with every component multiplied by the scalar `s`.
vec4F vec4F_mul(const(vec4F)* pLHS, float s) pure @trusted
{
    vec4F scaled;
    foreach (idx; 0 .. 4)
        scaled.m_c[idx] = pLHS.m_c[idx] * s;
    return scaled;
}
248 
/// Scales `*pV` to unit length in place and returns `pV`. A vector whose
/// squared length is exactly zero is left untouched.
vec4F* vec4F_normalize_in_place(vec4F *pV) pure
{
    const float lengthSq = pV.m_c[0] * pV.m_c[0] + pV.m_c[1] * pV.m_c[1] + pV.m_c[2] * pV.m_c[2] + pV.m_c[3] * pV.m_c[3];
    if (lengthSq == 0.0f)
        return pV;

    const float invLength = 1.0f / sqrt(lengthSq);
    foreach (ref component; pV.m_c)
        component *= invLength;
    return pV;
}
262 
// Various BC7 tables

// Interpolation weights on a 0..64 scale: 8 entries (presumably for 3-bit selectors)
// and 16 entries (presumably for 4-bit selectors). They are applied in this module as
// (low * (64 - w) + high * w + 32) >> 6.
static immutable uint[8] g_bc7_weights3 = [ 0, 9, 18, 27, 37, 46, 55, 64 ];
static immutable uint[16] g_bc7_weights4 = [ 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 ];
// Precomputed weight constants used during the least squares fit. With w = weight / 64, each entry
// of g_bc7_weights3/g_bc7_weights4 contributes four floats: w * w, (1.0f - w) * w, (1.0f - w) * (1.0f - w), w
static immutable float[8 * 4] g_bc7_weights3x = 
[ 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.019775f, 0.120850f, 0.738525f, 0.140625f, 
  0.079102f, 0.202148f, 0.516602f, 0.281250f, 0.177979f, 0.243896f, 0.334229f, 0.421875f, 
  0.334229f, 0.243896f, 0.177979f, 0.578125f, 0.516602f, 0.202148f, 0.079102f, 0.718750f, 
  0.738525f, 0.120850f, 0.019775f, 0.859375f, 1.000000f, 0.000000f, 0.000000f, 1.000000f ];

static immutable float[16 * 4] g_bc7_weights4x = 
[ 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.003906f, 0.058594f, 0.878906f, 0.062500f, 
  0.019775f, 0.120850f, 0.738525f, 0.140625f, 0.041260f, 0.161865f, 0.635010f, 0.203125f, 
  0.070557f, 0.195068f, 0.539307f, 0.265625f, 0.107666f, 0.220459f, 0.451416f, 0.328125f, 
  0.165039f, 0.241211f, 0.352539f, 0.406250f, 0.219727f, 0.249023f, 0.282227f, 0.468750f, 
  0.282227f, 0.249023f, 0.219727f, 0.531250f, 0.352539f, 0.241211f, 0.165039f, 0.593750f, 
  0.451416f, 0.220459f, 0.107666f, 0.671875f, 0.539307f, 0.195068f, 0.070557f, 0.734375f,
  0.635010f, 0.161865f, 0.041260f, 0.796875f, 0.738525f, 0.120850f, 0.019775f, 0.859375f, 
  0.878906f, 0.058594f, 0.003906f, 0.937500f, 1.000000f, 0.000000f, 0.000000f, 1.000000f ];
282 
// All-zero subset map (every listed entry is subset 0), presumably one entry per pixel
// of a 4x4 block for the single-subset case.
// NOTE(review): declared with 64 elements but only 16 initializers are listed -- confirm
// whether the intended length is 16.
static immutable ubyte[64] g_bc7_partition1 = [ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 ];
284 static immutable ubyte[64*16] g_bc7_partition2 =
285 [
286     0,0,1,1,0,0,1,1,0,0,1,1,0,0,1,1,        0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,        0,1,1,1,0,1,1,1,0,1,1,1,0,1,1,1,        0,0,0,1,0,0,1,1,0,0,1,1,0,1,1,1,        0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,        0,0,1,1,0,1,1,1,0,1,1,1,1,1,1,1,        0,0,0,1,0,0,1,1,0,1,1,1,1,1,1,1,        0,0,0,0,0,0,0,1,0,0,1,1,0,1,1,1,
287     0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,        0,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1,        0,0,0,0,0,0,0,1,0,1,1,1,1,1,1,1,        0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1,        0,0,0,1,0,1,1,1,1,1,1,1,1,1,1,1,        0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,        0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,        0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,
288     0,0,0,0,1,0,0,0,1,1,1,0,1,1,1,1,        0,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,        0,0,0,0,0,0,0,0,1,0,0,0,1,1,1,0,        0,1,1,1,0,0,1,1,0,0,0,1,0,0,0,0,        0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,        0,0,0,0,1,0,0,0,1,1,0,0,1,1,1,0,        0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,        0,1,1,1,0,0,1,1,0,0,1,1,0,0,0,1,
289     0,0,1,1,0,0,0,1,0,0,0,1,0,0,0,0,        0,0,0,0,1,0,0,0,1,0,0,0,1,1,0,0,        0,1,1,0,0,1,1,0,0,1,1,0,0,1,1,0,        0,0,1,1,0,1,1,0,0,1,1,0,1,1,0,0,        0,0,0,1,0,1,1,1,1,1,1,0,1,0,0,0,        0,0,0,0,1,1,1,1,1,1,1,1,0,0,0,0,        0,1,1,1,0,0,0,1,1,0,0,0,1,1,1,0,        0,0,1,1,1,0,0,1,1,0,0,1,1,1,0,0,
290     0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,        0,0,0,0,1,1,1,1,0,0,0,0,1,1,1,1,        0,1,0,1,1,0,1,0,0,1,0,1,1,0,1,0,        0,0,1,1,0,0,1,1,1,1,0,0,1,1,0,0,        0,0,1,1,1,1,0,0,0,0,1,1,1,1,0,0,        0,1,0,1,0,1,0,1,1,0,1,0,1,0,1,0,        0,1,1,0,1,0,0,1,0,1,1,0,1,0,0,1,        0,1,0,1,1,0,1,0,1,0,1,0,0,1,0,1,
291     0,1,1,1,0,0,1,1,1,1,0,0,1,1,1,0,        0,0,0,1,0,0,1,1,1,1,0,0,1,0,0,0,        0,0,1,1,0,0,1,0,0,1,0,0,1,1,0,0,        0,0,1,1,1,0,1,1,1,1,0,1,1,1,0,0,        0,1,1,0,1,0,0,1,1,0,0,1,0,1,1,0,        0,0,1,1,1,1,0,0,1,1,0,0,0,0,1,1,        0,1,1,0,0,1,1,0,1,0,0,1,1,0,0,1,        0,0,0,0,0,1,1,0,0,1,1,0,0,0,0,0,
292     0,1,0,0,1,1,1,0,0,1,0,0,0,0,0,0,        0,0,1,0,0,1,1,1,0,0,1,0,0,0,0,0,        0,0,0,0,0,0,1,0,0,1,1,1,0,0,1,0,        0,0,0,0,0,1,0,0,1,1,1,0,0,1,0,0,        0,1,1,0,1,1,0,0,1,0,0,1,0,0,1,1,        0,0,1,1,0,1,1,0,1,1,0,0,1,0,0,1,        0,1,1,0,0,0,1,1,1,0,0,1,1,1,0,0,        0,0,1,1,1,0,0,1,1,1,0,0,0,1,1,0,
293     0,1,1,0,1,1,0,0,1,1,0,0,1,0,0,1,        0,1,1,0,0,0,1,1,0,0,1,1,1,0,0,1,        0,1,1,1,1,1,1,0,1,0,0,0,0,0,0,1,        0,0,0,1,1,0,0,0,1,1,1,0,0,1,1,1,        0,0,0,0,1,1,1,1,0,0,1,1,0,0,1,1,        0,0,1,1,0,0,1,1,1,1,1,1,0,0,0,0,        0,0,1,0,0,0,1,0,1,1,1,0,1,1,1,0,        0,1,0,0,0,1,0,0,0,1,1,1,0,1,1,1
294 ];
295 
// One anchor pixel index (0..15) per entry, 64 entries -- presumably indexed by the
// two-subset partition number and giving the anchor pixel of the second subset; TODO confirm
// against the block packing code.
static immutable ubyte[64] g_bc7_table_anchor_index_second_subset = 
    [ 15,15,15,15,15,15,15,15,        15,15,15,15,15,15,15,15,
      15, 2, 8, 2, 2, 8, 8,15,        2, 8, 2, 2, 8, 8, 2, 2,
      15,15, 6, 8, 2, 8,15,15,        2, 8, 2, 2, 2,15,15, 6,
       6, 2, 6, 8,15,15, 2, 2,        15,15,15,15,15, 2, 2,15 ];
301 
// Eight-entry tables; g_bc7_color_index_bitcount is indexed by BC7 mode in
// get_bc7_color_index_size(), and the other two are presumably indexed by mode as well.
static immutable ubyte[8] g_bc7_num_subsets = [ 3, 2, 3, 2, 1, 1, 1, 2 ];
static immutable ubyte[8] g_bc7_partition_bits = [ 4, 6, 6, 6, 0, 0, 0, 6 ];
static immutable ubyte[8] g_bc7_color_index_bitcount = [ 3, 3, 2, 2, 2, 2, 4, 2 ];
305 
/// Returns the color index bit count for `mode` (looked up in
/// g_bc7_color_index_bitcount) plus `index_selection_bit`.
int get_bc7_color_index_size(int mode, int index_selection_bit) pure
{
    const int baseBits = g_bc7_color_index_bitcount[mode];
    return baseBits + index_selection_bit;
}
310 
// Eight-entry property tables, presumably indexed by BC7 mode (0..7): whether the mode
// uses p-bits, whether those p-bits are shared, and the color/alpha endpoint precision.
static immutable ubyte[8] g_bc7_mode_has_p_bits        = [ 1, 1, 0, 1, 0, 0, 1, 1 ];
static immutable ubyte[8] g_bc7_mode_has_shared_p_bits = [ 0, 1, 0, 0, 0, 0, 0, 0 ];
static immutable ubyte[8] g_bc7_color_precision_table  = [ 4, 6, 5, 7, 5, 7, 7, 5 ];
static immutable byte[8] g_bc7_alpha_precision_table   = [ 0, 0, 0, 0, 6, 8, 7, 5 ];
315 
// One entry of the mode 1 single-color lookup table built by
// bc7enc16_compress_block_init().
struct endpoint_err 
{ 
    ushort m_error; // squared difference between the target value and the best reconstruction
    ubyte m_lo;     // low endpoint (0..63) that produced m_error
    ubyte m_hi;     // high endpoint (0..63) that produced m_error
}
322 
// Lookup table filled by bc7enc16_compress_block_init(): for each 8-bit target value c
// and each p-bit, the endpoint pair whose interpolated value is closest to c.
__gshared endpoint_err[2][256] g_bc7_mode_1_optimal_endpoints; // [c][pbit]
// Held by bc7enc16_compress_block_init() while it checks the flag and builds the table.
__gshared Mutex g_tableProtect;
// Set by bc7enc16_compress_block_init() (under g_tableProtect) so the table is built only once.
__gshared bool g_tableInitialized = false;

// Index into g_bc7_weights3 of the interpolation weight used for the single-color table.
enum uint BC7ENC16_MODE_1_OPTIMAL_INDEX = 2;
328 
// Builds the lookup table used for optimal single color compression in mode 1.
// Warning: bc7enc16_compress_block_init() MUST be called before calling bc7enc16_compress_block() (or you'll get artifacts).
// Note: concurrent first calls would race, so the whole body runs under a lazily-initialized mutex.
void bc7enc16_compress_block_init() @trusted
{
    g_tableProtect.lockLazy();
    scope(exit) g_tableProtect.unlock();

    if (g_tableInitialized)
        return;

    g_tableInitialized = true;

    // Fixed interpolation weights (out of 64) applied to the high and low endpoints.
    const uint weightHigh = g_bc7_weights3[BC7ENC16_MODE_1_OPTIMAL_INDEX];
    const uint weightLow = 64 - weightHigh;

    foreach (int target; 0 .. 256)
    {
        foreach (uint pbit; 0 .. 2)
        {
            endpoint_err winner;
            winner.m_error = ushort.max;

            // Exhaustive search over every 6-bit (low, high) endpoint pair; the first
            // pair reaching the smallest error wins because the comparison is strict.
            foreach (uint lo; 0 .. 64)
            {
                // Append the p-bit, then expand the 7-bit value to 8 bits by bit replication.
                uint loExpanded = ((lo << 1) | pbit) << 1;
                loExpanded |= (loExpanded >> 7);

                foreach (uint hi; 0 .. 64)
                {
                    uint hiExpanded = ((hi << 1) | pbit) << 1;
                    hiExpanded |= (hiExpanded >> 7);

                    const int interpolated = (loExpanded * weightLow + hiExpanded * weightHigh + 32) >> 6;
                    const int squaredErr = (interpolated - target) * (interpolated - target);
                    if (squaredErr < winner.m_error)
                    {
                        winner.m_error = cast(ushort)squaredErr;
                        winner.m_lo = cast(ubyte)lo;
                        winner.m_hi = cast(ubyte)hi;
                    }
                }
            }

            g_bc7_mode_1_optimal_endpoints[target][pbit] = winner;
        }
    }
}
370 
/// Least squares fit of the low/high endpoints (`*pXl`, `*pXh`) for `N` RGBA
/// pixels given their selectors, via the 2x2 normal equations
/// (http://www.cs.cornell.edu/~bindel/class/cs3220-s12/notes/lec10.pdf).
///
/// `pSelector_weights[sel]` supplies the precomputed terms for selector
/// `sel`: components 0..2 are accumulated into the normal matrix and
/// component 3 is the interpolation weight. If the determinant is exactly
/// zero it is left at zero, which makes both outputs zero.
void compute_least_squares_endpoints_rgba(uint N, 
                                          const(ubyte)* pSelectors, 
                                          const(vec4F)* pSelector_weights, 
                                          vec4F *pXl, 
                                          vec4F *pXh, 
                                          const(color_quad_u8)* pColors) @system
{
    float z00 = 0.0f, z10 = 0.0f, z11 = 0.0f;
    float[4] q00 = 0.0f;    // per-channel sum of weight * color
    float[4] total = 0.0f;  // per-channel sum of color

    foreach (i; 0 .. N)
    {
        const(vec4F)* terms = &pSelector_weights[pSelectors[i]];
        z00 += terms.m_c[0];
        z10 += terms.m_c[1];
        z11 += terms.m_c[2];

        const float w = terms.m_c[3];
        foreach (ch; 0 .. 4)
        {
            q00[ch] += w * pColors[i].m_c[ch];
            total[ch] += pColors[i].m_c[ch];
        }
    }

    float[4] q10;
    foreach (ch; 0 .. 4)
        q10[ch] = total[ch] - q00[ch];

    // The normal matrix is symmetric.
    const float z01 = z10;

    float det = z00 * z11 - z01 * z10;
    if (det != 0.0f)
        det = 1.0f / det;

    // Inverse of the 2x2 normal matrix.
    const float iz00 = z11 * det;
    const float iz01 = -z01 * det;
    const float iz10 = -z10 * det;
    const float iz11 = z00 * det;

    foreach (ch; 0 .. 4)
    {
        pXl.m_c[ch] = iz00 * q00[ch] + iz01 * q10[ch];
        pXh.m_c[ch] = iz10 * q00[ch] + iz11 * q10[ch];
    }
}
420 
/// RGB-only variant of the least squares endpoint fit: solves the 2x2 normal
/// equations for the first three channels of `N` pixels and forces the alpha
/// component of both `*pXl` and `*pXh` to 255.
///
/// `pSelector_weights[sel]` supplies the precomputed terms for selector
/// `sel`: components 0..2 are accumulated into the normal matrix and
/// component 3 is the interpolation weight. If the determinant is exactly
/// zero it is left at zero, which makes the RGB outputs zero.
void compute_least_squares_endpoints_rgb(uint N, const ubyte *pSelectors, 
                                         const(vec4F)* pSelector_weights, 
                                         vec4F *pXl, vec4F *pXh, const(color_quad_u8)*pColors) @system
{
    float z00 = 0.0f, z10 = 0.0f, z11 = 0.0f;
    float[3] q00 = 0.0f;    // per-channel sum of weight * color
    float[3] total = 0.0f;  // per-channel sum of color

    foreach (i; 0 .. N)
    {
        const(vec4F)* terms = &pSelector_weights[pSelectors[i]];
        z00 += terms.m_c[0];
        z10 += terms.m_c[1];
        z11 += terms.m_c[2];

        const float w = terms.m_c[3];
        foreach (ch; 0 .. 3)
        {
            q00[ch] += w * pColors[i].m_c[ch];
            total[ch] += pColors[i].m_c[ch];
        }
    }

    float[3] q10;
    foreach (ch; 0 .. 3)
        q10[ch] = total[ch] - q00[ch];

    // The normal matrix is symmetric.
    const float z01 = z10;

    float det = z00 * z11 - z01 * z10;
    if (det != 0.0f)
        det = 1.0f / det;

    // Inverse of the 2x2 normal matrix.
    const float iz00 = z11 * det;
    const float iz01 = -z01 * det;
    const float iz10 = -z10 * det;
    const float iz11 = z00 * det;

    foreach (ch; 0 .. 3)
    {
        pXl.m_c[ch] = iz00 * q00[ch] + iz01 * q10[ch];
        pXh.m_c[ch] = iz10 * q00[ch] + iz11 * q10[ch];
    }

    // Alpha is not fitted in the RGB variant.
    pXl.m_c[3] = 255.0f;
    pXh.m_c[3] = 255.0f;
}
462 
// Inputs for compressing one group of pixels (a "color cell").
struct color_cell_compressor_params
{
    uint m_num_pixels;                      // number of entries readable through m_pPixels
    const(color_quad_u8)* m_pPixels;        // source pixels
    uint m_num_selector_weights;            // number of palette entries / entries in m_pSelector_weights
    const(uint)* m_pSelector_weights;       // interpolation weights on a 0..64 scale
    const(vec4F)* m_pSelector_weightsx;     // presumably the matching precomputed float terms (g_bc7_weights3x/4x) -- TODO confirm
    uint m_comp_bits;                       // endpoint bits per component, not counting the p-bit
    uint[4] m_weights;                      // per-channel error weights passed to the distance functions
    bc7enc16_bool m_has_alpha;              // nonzero: the alpha channel takes part in interpolation and error
    bc7enc16_bool m_has_pbits;              // nonzero: endpoints carry an extra p-bit
    bc7enc16_bool m_endpoints_share_pbit;   // nonzero: both endpoints use pbits[0]
    bc7enc16_bool m_perceptual;             // nonzero: use the perceptual color distance
}
477 
// Best solution found so far for one color cell; updated by evaluate_solution()
// whenever a trial beats m_best_overall_err.
struct color_cell_compressor_results
{
    ulong m_best_overall_err;       // total error of the stored solution
    color_quad_u8 m_low_endpoint;   // quantized low endpoint (without p-bit)
    color_quad_u8 m_high_endpoint;  // quantized high endpoint (without p-bit)
    uint[2] m_pbits;                // p-bits of the stored solution
    ubyte *m_pSelectors;            // caller-provided buffer receiving the best selectors (one per pixel)
    ubyte *m_pSelectors_temp;       // caller-provided scratch buffer for the selectors of the current trial
}
487 
/// Expands each quantized channel of `pC` to 8 bits. The source precision is
/// `m_comp_bits` plus one if p-bits are in use (asserted to be 4..8 bits);
/// the value is shifted to the top of the byte and its high bits are
/// replicated into the vacated low bits.
color_quad_u8 scale_color(ref const(color_quad_u8) pC, const(color_cell_compressor_params) *pParams) pure
{
    const uint bits = pParams.m_comp_bits + (pParams.m_has_pbits ? 1 : 0);
    assert((bits >= 4) && (bits <= 8));

    const uint shift = 8 - bits;

    color_quad_u8 expanded;
    foreach (idx, component; pC.m_c)
    {
        uint widened = component << shift;
        widened |= (widened >> bits);
        assert(widened <= 255);
        expanded.m_c[idx] = cast(ubyte)widened;
    }
    return expanded;
}
505 
/// Weighted squared distance between the RGB channels of `*pE1` and `*pE2`.
///
/// Params:
///   pE1, pE2   = colors to compare (alpha is ignored)
///   perceptual = when nonzero the difference is measured on a luma/chroma
///                transform of the colors instead of raw RGB
///   weights    = at least three channel weights; weights[0..3] scale the
///                three squared differences
///
/// Returns: the weighted sum, accumulated in 64 bits so that large weights
/// cannot wrap around in 32-bit arithmetic before being widened.
ulong compute_color_distance_rgb(const(color_quad_u8)* pE1, 
                                 const(color_quad_u8)* pE2, 
                                 bc7enc16_bool perceptual, 
                                 const(uint)* weights) pure @system
{
    int dr, dg, db;

    if (perceptual)
    {
        // Fixed-point luma (coefficients sum to 512) and the two chroma differences.
        const int l1 = pE1.m_c[0] * 109 + pE1.m_c[1] * 366 + pE1.m_c[2] * 37;
        const int cr1 = (cast(int)pE1.m_c[0] << 9) - l1;
        const int cb1 = (cast(int)pE1.m_c[2] << 9) - l1;
        const int l2 = pE2.m_c[0] * 109 + pE2.m_c[1] * 366 + pE2.m_c[2] * 37;
        const int cr2 = (cast(int)pE2.m_c[0] << 9) - l2;
        const int cb2 = (cast(int)pE2.m_c[2] << 9) - l2;
        dr = (l1 - l2) >> 8;
        dg = (cr1 - cr2) >> 8;
        db = (cb1 - cb2) >> 8;
    }
    else
    {
        dr = cast(int)pE1.m_c[0] - cast(int)pE2.m_c[0];
        dg = cast(int)pE1.m_c[1] - cast(int)pE2.m_c[1];
        db = cast(int)pE1.m_c[2] - cast(int)pE2.m_c[2];
    }

    // Widen each weight to 64 bits *before* multiplying: uint * uint would wrap silently.
    return cast(ulong)weights[0] * cast(uint)(dr * dr)
         + cast(ulong)weights[1] * cast(uint)(dg * dg)
         + cast(ulong)weights[2] * cast(uint)(db * db);
}
534 
/// Weighted squared distance between `*pE1` and `*pE2` over all four
/// channels: the RGB distance from compute_color_distance_rgb() plus
/// `weights[3]` times the squared alpha difference.
///
/// The alpha term is computed in 64 bits so that a large `weights[3]` cannot
/// wrap around in 32-bit arithmetic before being added to the ulong result.
ulong compute_color_distance_rgba(const(color_quad_u8)* pE1, const(color_quad_u8)* pE2, bc7enc16_bool perceptual, const(uint)* weights /* [4] */) @system
{
    const int da = cast(int)pE1.m_c[3] - cast(int)pE2.m_c[3];
    const ulong alphaTerm = cast(ulong)weights[3] * cast(uint)(da * da);
    return compute_color_distance_rgb(pE1, pE2, perceptual, weights) + alphaTerm;
}
540 
/// Encodes a solid color (r, g, b) using the precomputed single-color table
/// g_bc7_mode_1_optimal_endpoints.
///
/// Picks the p-bit with the lowest summed table error (ties keep p-bit 0),
/// stores the matching endpoints and p-bit in `*pResults`, fills
/// `pSelectors[0 .. m_num_pixels]` with BC7ENC16_MODE_1_OPTIMAL_INDEX, and
/// returns the total RGB error of the decoded color against all pixels in
/// `pParams` (also written to `pResults.m_best_overall_err`).
/// r, g and b index a 256-entry table, so they must be below 256.
ulong pack_mode1_to_one_color(const(color_cell_compressor_params)* pParams, 
                              color_cell_compressor_results *pResults, 
                              uint r, uint g, uint b, ubyte *pSelectors) @system
{
    uint chosenPbit = 0;
    uint lowestTableErr = uint.max;

    foreach (uint candidate; 0 .. 2)
    {
        const uint tableErr = g_bc7_mode_1_optimal_endpoints[r][candidate].m_error
                            + g_bc7_mode_1_optimal_endpoints[g][candidate].m_error
                            + g_bc7_mode_1_optimal_endpoints[b][candidate].m_error;
        if (tableErr < lowestTableErr)
        {
            lowestTableErr = tableErr;
            chosenPbit = candidate;
        }
    }

    const(endpoint_err)* entryR = &g_bc7_mode_1_optimal_endpoints[r][chosenPbit];
    const(endpoint_err)* entryG = &g_bc7_mode_1_optimal_endpoints[g][chosenPbit];
    const(endpoint_err)* entryB = &g_bc7_mode_1_optimal_endpoints[b][chosenPbit];

    color_quad_u8_set(&pResults.m_low_endpoint, entryR.m_lo, entryG.m_lo, entryB.m_lo, 0);
    color_quad_u8_set(&pResults.m_high_endpoint, entryR.m_hi, entryG.m_hi, entryB.m_hi, 0);
    pResults.m_pbits[0] = chosenPbit;
    pResults.m_pbits[1] = 0;

    // Every pixel uses the same selector.
    memset(pSelectors, BC7ENC16_MODE_1_OPTIMAL_INDEX, pParams.m_num_pixels);

    // Reconstruct the color the chosen endpoints decode to.
    const uint weightHigh = g_bc7_weights3[BC7ENC16_MODE_1_OPTIMAL_INDEX];
    const uint weightLow = 64 - weightHigh;

    color_quad_u8 decoded;
    foreach (ch; 0 .. 3)
    {
        // Append the p-bit, then expand 7 bits to 8 by bit replication.
        uint lo = ((pResults.m_low_endpoint.m_c[ch] << 1) | pResults.m_pbits[0]) << 1;
        lo |= (lo >> 7);

        uint hi = ((pResults.m_high_endpoint.m_c[ch] << 1) | pResults.m_pbits[0]) << 1;
        hi |= (hi >> 7);

        decoded.m_c[ch] = cast(ubyte)((lo * weightLow + hi * weightHigh + 32) >> 6);
    }
    decoded.m_c[3] = 255;

    ulong errorSum = 0;
    foreach (px; 0 .. pParams.m_num_pixels)
        errorSum += compute_color_distance_rgb(&decoded, &pParams.m_pPixels[px], pParams.m_perceptual, pParams.m_weights.ptr);

    pResults.m_best_overall_err = errorSum;
    return errorSum;
}
590 
// Scores one trial endpoint pair against the pixels in pParams.
//
// The quantized endpoints (*pLow, *pHigh) get their p-bits appended when the mode has them,
// are expanded to 8 bits with scale_color(), and an interpolated palette of
// N = m_num_selector_weights colors is built between them. Each pixel is then assigned a
// selector and the per-pixel errors are summed. The selectors of this trial are written to
// pResults.m_pSelectors_temp; if the total beats pResults.m_best_overall_err, the endpoints,
// p-bits and selectors are copied into pResults as the new best solution.
//
// pbits must point at two values. The palette buffer holds 16 entries, so N must not exceed 16
// (and must be at least 1, since entry N - 1 is written unconditionally).
// Returns the total error of this trial, whether or not it became the new best.
ulong evaluate_solution(const(color_quad_u8)* pLow, const(color_quad_u8)* pHigh, 
                        const(uint)* pbits /*[2]*/, const(color_cell_compressor_params)* pParams, 
                        color_cell_compressor_results *pResults) @system
{
    color_quad_u8 quantMinColor = *pLow;
    color_quad_u8 quantMaxColor = *pHigh;

    if (pParams.m_has_pbits)
    {
        uint minPBit, maxPBit;

        if (pParams.m_endpoints_share_pbit)
            maxPBit = minPBit = pbits[0];
        else
        {
            minPBit = pbits[0];
            maxPBit = pbits[1];
        }

        // Append the p-bit as the new least significant bit of every channel.
        quantMinColor.m_c[0] = cast(ubyte)((pLow.m_c[0] << 1) | minPBit);
        quantMinColor.m_c[1] = cast(ubyte)((pLow.m_c[1] << 1) | minPBit);
        quantMinColor.m_c[2] = cast(ubyte)((pLow.m_c[2] << 1) | minPBit);
        quantMinColor.m_c[3] = cast(ubyte)((pLow.m_c[3] << 1) | minPBit);

        quantMaxColor.m_c[0] = cast(ubyte)((pHigh.m_c[0] << 1) | maxPBit);
        quantMaxColor.m_c[1] = cast(ubyte)((pHigh.m_c[1] << 1) | maxPBit);
        quantMaxColor.m_c[2] = cast(ubyte)((pHigh.m_c[2] << 1) | maxPBit);
        quantMaxColor.m_c[3] = cast(ubyte)((pHigh.m_c[3] << 1) | maxPBit);
    }

    // Expand the quantized endpoints to full 8-bit colors.
    color_quad_u8 actualMinColor = scale_color(quantMinColor, pParams);
    color_quad_u8 actualMaxColor = scale_color(quantMaxColor, pParams);

    const uint N = pParams.m_num_selector_weights;

    // Palette: entry 0 is the low endpoint, entry N - 1 the high endpoint, and the
    // entries in between are interpolated with the 0..64 selector weights.
    color_quad_u8[16] weightedColors;
    weightedColors[0] = actualMinColor;
    weightedColors[N - 1] = actualMaxColor;

    // Only the channels that take part in the mode are interpolated.
    const uint nc = pParams.m_has_alpha ? 4 : 3;
    for (uint i = 1; i < (N - 1); i++)
        for (uint j = 0; j < nc; j++)
            weightedColors[i].m_c[j] = cast(ubyte)((actualMinColor.m_c[j] * (64 - pParams.m_pSelector_weights[i]) + actualMaxColor.m_c[j] * pParams.m_pSelector_weights[i] + 32) >> 6);

    // Low endpoint and endpoint delta, used to project pixels onto the endpoint axis.
    const int lr = actualMinColor.m_c[0];
    const int lg = actualMinColor.m_c[1];
    const int lb = actualMinColor.m_c[2];
    const int dr = actualMaxColor.m_c[0] - lr;
    const int dg = actualMaxColor.m_c[1] - lg;
    const int db = actualMaxColor.m_c[2] - lb;

    ulong total_err = 0;

    if (!pParams.m_perceptual)
    {
        // Non-perceptual path: estimate the selector by projecting the pixel onto the
        // endpoint axis, then keep the better of that palette entry and the one below it.
        if (pParams.m_has_alpha)
        {
            const int la = actualMinColor.m_c[3];
            const int da = actualMaxColor.m_c[3] - la;

            // The small constant keeps the division finite when both endpoints coincide.
            const float f = N / cast(float)(squarei(dr) + squarei(dg) + squarei(db) + squarei(da) + .00000125f);

            for (uint i = 0; i < pParams.m_num_pixels; i++)
            {
                const(color_quad_u8)* pC = &pParams.m_pPixels[i];
                int r = pC.m_c[0];
                int g = pC.m_c[1];
                int b = pC.m_c[2];
                int a = pC.m_c[3];

                int best_sel = cast(int)(cast(float)((r - lr) * dr + (g - lg) * dg + (b - lb) * db + (a - la) * da) * f + .5f);
                // Clamp to [1, N - 1] so that best_sel - 1 is always a valid palette index.
                best_sel = clampi(best_sel, 1, N - 1);

                ulong err0 = compute_color_distance_rgba(&weightedColors[best_sel - 1], pC, BC7ENC16_FALSE, pParams.m_weights.ptr);
                ulong err1 = compute_color_distance_rgba(&weightedColors[best_sel], pC, BC7ENC16_FALSE, pParams.m_weights.ptr);

                // Prefer the lower selector only when it is strictly better.
                if (err1 > err0)
                {
                    err1 = err0;
                    --best_sel;
                }
                total_err += err1;

                pResults.m_pSelectors_temp[i] = cast(ubyte)best_sel;
            }
        }
        else
        {
            // Same projection as above, restricted to RGB.
            const float f = N / cast(float)(squarei(dr) + squarei(dg) + squarei(db) + .00000125f);

            for (uint i = 0; i < pParams.m_num_pixels; i++)
            {
                const color_quad_u8 *pC = &pParams.m_pPixels[i];
                int r = pC.m_c[0];
                int g = pC.m_c[1];
                int b = pC.m_c[2];

                int sel = cast(int)(cast(float)((r - lr) * dr + (g - lg) * dg + (b - lb) * db) * f + .5f);
                // Clamp to [1, N - 1] so that sel - 1 is always a valid palette index.
                sel = clampi(sel, 1, N - 1);

                ulong err0 = compute_color_distance_rgb(&weightedColors[sel - 1], pC, BC7ENC16_FALSE, pParams.m_weights.ptr);
                ulong err1 = compute_color_distance_rgb(&weightedColors[sel], pC, BC7ENC16_FALSE, pParams.m_weights.ptr);

                // Prefer the lower selector only when it is strictly better.
                int best_sel = sel;
                ulong best_err = err1;
                if (err0 < best_err)
                {
                    best_err = err0;
                    best_sel = sel - 1;
                }

                total_err += best_err;

                pResults.m_pSelectors_temp[i] = cast(ubyte)best_sel;
            }
        }
    }
    else
    {
        // Perceptual path: exhaustively test every palette entry for every pixel;
        // the first entry reaching the lowest error wins.
        for (uint i = 0; i < pParams.m_num_pixels; i++)
        {
            ulong best_err = ulong.max;
            uint best_sel = 0;

            if (pParams.m_has_alpha)
            {
                for (uint j = 0; j < N; j++)
                {
                    ulong err = compute_color_distance_rgba(&weightedColors[j], &pParams.m_pPixels[i], BC7ENC16_TRUE, pParams.m_weights.ptr);
                    if (err < best_err)
                    {
                        best_err = err;
                        best_sel = j;
                    }
                }
            }
            else
            {
                for (uint j = 0; j < N; j++)
                {
                    ulong err = compute_color_distance_rgb(&weightedColors[j], &pParams.m_pPixels[i], BC7ENC16_TRUE, pParams.m_weights.ptr);
                    if (err < best_err)
                    {
                        best_err = err;
                        best_sel = j;
                    }
                }
            }

            total_err += best_err;

            pResults.m_pSelectors_temp[i] = cast(ubyte)best_sel;
        }
    }

    // Commit this trial only if it strictly improves on the best solution so far.
    if (total_err < pResults.m_best_overall_err)
    {
        pResults.m_best_overall_err = total_err;

        pResults.m_low_endpoint = *pLow;
        pResults.m_high_endpoint = *pHigh;

        pResults.m_pbits[0] = pbits[0];
        pResults.m_pbits[1] = pbits[1];

        memcpy(pResults.m_pSelectors, pResults.m_pSelectors_temp, (pResults.m_pSelectors[0]).sizeof * pParams.m_num_pixels);
    }

    return total_err;
}
761 
/// For mode 1 only: fixes the degenerate case where the input collapses to a
/// single colorspace voxel and we lose all freedom (test with grayscale ramps).
///
/// For each RGB channel whose quantized trial endpoints are equal while the
/// float endpoints `pXl`/`pXh` differ, one endpoint is moved by one step:
/// the minimum is lowered when it sits in the upper half of the range
/// (above `iscale >> 1`), otherwise the maximum is raised, falling back to
/// the other direction when the preferred one is already at its limit
/// (0 or `iscale`). Other modes are left untouched.
void fixDegenerateEndpoints(uint mode, 
                            ref color_quad_u8 pTrialMinColor, 
                            ref color_quad_u8 pTrialMaxColor, 
                            ref const(vec4F) pXl, ref const(vec4F) pXh, uint iscale)
{
    if (mode != 1)
        return;

    foreach (ch; 0 .. 3)
    {
        // Only channels whose quantized endpoints collapsed are of interest...
        if (pTrialMinColor.m_c[ch] != pTrialMaxColor.m_c[ch])
            continue;

        // ...and only when the unquantized endpoints really differ.
        if (!(abs(pXl.m_c[ch] - pXh.m_c[ch]) > 0.0f))
            continue;

        const bool canLowerMin = pTrialMinColor.m_c[ch] > 0;
        const bool canRaiseMax = pTrialMaxColor.m_c[ch] < iscale;
        const bool preferLowering = pTrialMinColor.m_c[ch] > (iscale >> 1);

        if (preferLowering)
        {
            if (canLowerMin)
                pTrialMinColor.m_c[ch]--;
            else if (canRaiseMax)
                pTrialMaxColor.m_c[ch]++;
        }
        else
        {
            if (canRaiseMax)
                pTrialMaxColor.m_c[ch]++;
            else if (canLowerMin)
                pTrialMinColor.m_c[ch]--;
        }
    }
}
796 
/// Quantizes a pair of floating-point endpoints and scores the candidate.
///
/// `xl` / `xh` are received by value, saturated, and rounded to the endpoint
/// precision given by pParams.m_comp_bits. For modes with p-bits both p-bit
/// values are tried and the one with the smaller squared reconstruction error
/// is kept: independently for each endpoint, or jointly when
/// pParams.m_endpoints_share_pbit is set. The quantized candidate is handed to
/// evaluate_solution unless it equals the solution already stored in pResults.
///
/// Returns: pResults.m_best_overall_err as it stands after the call.
static ulong find_optimal_solution(uint mode, vec4F xl, vec4F xh, const color_cell_compressor_params *pParams, color_cell_compressor_results *pResults) @system
{
    vec4F_saturate_in_place(xl);
    vec4F_saturate_in_place(xh);

    if (!pParams.m_has_pbits)
    {
        // No p-bits: round every component straight to the endpoint precision.
        const int iscale = (1 << pParams.m_comp_bits) - 1;
        const float scale = cast(float)iscale;

        color_quad_u8 trialMinColor, trialMaxColor;
        color_quad_u8_set_clamped(&trialMinColor,
                                  cast(int)(xl.m_c[0] * scale + .5f),
                                  cast(int)(xl.m_c[1] * scale + .5f),
                                  cast(int)(xl.m_c[2] * scale + .5f),
                                  cast(int)(xl.m_c[3] * scale + .5f));
        color_quad_u8_set_clamped(&trialMaxColor,
                                  cast(int)(xh.m_c[0] * scale + .5f),
                                  cast(int)(xh.m_c[1] * scale + .5f),
                                  cast(int)(xh.m_c[2] * scale + .5f),
                                  cast(int)(xh.m_c[3] * scale + .5f));

        fixDegenerateEndpoints(mode, trialMinColor, trialMaxColor, xl, xh, iscale);

        // Skip the evaluation when this exact candidate is already recorded.
        const bool isNewCandidate = (pResults.m_best_overall_err == ulong.max)
            || color_quad_u8_notequals(trialMinColor, pResults.m_low_endpoint)
            || color_quad_u8_notequals(trialMaxColor, pResults.m_high_endpoint);
        if (isNewCandidate)
            evaluate_solution(&trialMinColor, &trialMaxColor, pResults.m_pbits.ptr, pParams, pResults);

        return pResults.m_best_overall_err;
    }

    // P-bit modes: each endpoint component carries one extra low bit (the p-bit).
    const int iscalep = (1 << (pParams.m_comp_bits + 1)) - 1;
    const float scalep = cast(float)iscalep;

    const int totalComps = pParams.m_has_alpha ? 4 : 3;

    uint[2] best_pbits;
    color_quad_u8 bestMinColor, bestMaxColor;

    // Notes: The pbit controls which quantization intervals are selected.
    // total_levels=2^(comp_bits+1), where comp_bits=4 for mode 0, etc.
    // pbit 0: v=(b*2)/(total_levels-1), pbit 1: v=(b*2+1)/(total_levels-1) where b is the component bin from [0,total_levels/2-1] and v is the [0,1] component value
    // rearranging you get for pbit 0: b=floor(v*(total_levels-1)/2+.5)
    // rearranging you get for pbit 1: b=floor((v*(total_levels-1)-1)/2+.5)
    if (pParams.m_endpoints_share_pbit)
    {
        // One p-bit for both endpoints: minimize the combined error of the pair.
        float lowestErr = 1e+9;

        foreach (int p; 0 .. 2)
        {
            color_quad_u8 xMinColor, xMaxColor;
            foreach (c; 0 .. 4)
            {
                const int binLow = cast(int)((xl.m_c[c] * scalep - p) / 2.0f + .5f);
                const int binHigh = cast(int)((xh.m_c[c] * scalep - p) / 2.0f + .5f);
                xMinColor.m_c[c] = cast(ubyte)(clampi(binLow * 2 + p, p, iscalep - 1 + p));
                xMaxColor.m_c[c] = cast(ubyte)(clampi(binHigh * 2 + p, p, iscalep - 1 + p));
            }

            color_quad_u8 scaledLow = scale_color(xMinColor, pParams);
            color_quad_u8 scaledHigh = scale_color(xMaxColor, pParams);

            float err = 0;
            foreach (i; 0 .. totalComps)
                err += squaref((scaledLow.m_c[i] / 255.0f) - xl.m_c[i]) + squaref((scaledHigh.m_c[i] / 255.0f) - xh.m_c[i]);

            if (err < lowestErr)
            {
                lowestErr = err;
                best_pbits[0] = p;
                best_pbits[1] = p;
                foreach (c; 0 .. 4)
                {
                    // Drop the p-bit again: the stored endpoint excludes it.
                    bestMinColor.m_c[c] = xMinColor.m_c[c] >> 1;
                    bestMaxColor.m_c[c] = xMaxColor.m_c[c] >> 1;
                }
            }
        }
    }
    else
    {
        // Independent p-bits: pick the better p-bit for each endpoint on its own.
        float lowestErrMin = 1e+9;
        float lowestErrMax = 1e+9;

        foreach (int p; 0 .. 2)
        {
            color_quad_u8 xMinColor, xMaxColor;
            foreach (c; 0 .. 4)
            {
                const int binLow = cast(int)((xl.m_c[c] * scalep - p) / 2.0f + .5f);
                const int binHigh = cast(int)((xh.m_c[c] * scalep - p) / 2.0f + .5f);
                xMinColor.m_c[c] = cast(ubyte)(clampi(binLow * 2 + p, p, iscalep - 1 + p));
                xMaxColor.m_c[c] = cast(ubyte)(clampi(binHigh * 2 + p, p, iscalep - 1 + p));
            }

            color_quad_u8 scaledLow = scale_color(xMinColor, pParams);
            color_quad_u8 scaledHigh = scale_color(xMaxColor, pParams);

            float errMin = 0, errMax = 0;
            foreach (i; 0 .. totalComps)
            {
                errMin += squaref(scaledLow.m_c[i] - xl.m_c[i] * 255.0f);
                errMax += squaref(scaledHigh.m_c[i] - xh.m_c[i] * 255.0f);
            }

            if (errMin < lowestErrMin)
            {
                lowestErrMin = errMin;
                best_pbits[0] = p;
                foreach (c; 0 .. 4)
                    bestMinColor.m_c[c] = xMinColor.m_c[c] >> 1;
            }

            if (errMax < lowestErrMax)
            {
                lowestErrMax = errMax;
                best_pbits[1] = p;
                foreach (c; 0 .. 4)
                    bestMaxColor.m_c[c] = xMaxColor.m_c[c] >> 1;
            }
        }
    }

    fixDegenerateEndpoints(mode, bestMinColor, bestMaxColor, xl, xh, iscalep >> 1);

    // Skip the evaluation when this exact candidate (endpoints and p-bits) is already recorded.
    const bool isNewCandidate = (pResults.m_best_overall_err == ulong.max)
        || color_quad_u8_notequals(bestMinColor, pResults.m_low_endpoint)
        || color_quad_u8_notequals(bestMaxColor, pResults.m_high_endpoint)
        || (best_pbits[0] != pResults.m_pbits[0])
        || (best_pbits[1] != pResults.m_pbits[1]);
    if (isNewCandidate)
        evaluate_solution(&bestMinColor, &bestMaxColor, best_pbits.ptr, pParams, pResults);

    return pResults.m_best_overall_err;
}
928 
/// Searches for the endpoints, p-bits and selectors that best encode one group of pixels.
///
/// The search proceeds in stages. Each stage proposes a pair of floating-point
/// endpoints and passes it to find_optimal_solution, which returns the best
/// error recorded in pResults so far:
///   1. endpoints taken from the pixels' mean color and principal axis;
///   2. if pComp_params.m_try_least_squares: a least-squares refit from the current selectors;
///   3. if pComp_params.m_uber_level > 0: least-squares refits from perturbed selectors,
///      and for m_uber_level >= 2 additionally from rescaled selectors;
///   4. in mode 1 only: a single-color encoding of the mean color.
///
/// Params:
///   mode         = BC7 mode. Alpha (pParams.m_has_alpha) is only accepted in mode 6 (asserted);
///                  mode 1 gets the single-color shortcuts.
///   pParams      = pixels, pixel count, selector weights and flags for this group.
///   pResults     = receives the best solution; m_best_overall_err is reset to ulong.max on entry.
///   pComp_params = encoder settings (m_try_least_squares, m_uber_level).
///
/// Returns: pResults.m_best_overall_err; 0 is returned immediately once any stage
///          reports a zero error. For a mode 1 group whose pixels all share the same
///          RGB, the result of pack_mode1_to_one_color is returned directly.
ulong color_cell_compression(uint mode, 
                             const(color_cell_compressor_params)* pParams, 
                             color_cell_compressor_results *pResults, 
                             const(bc7enc16_compress_block_params)* pComp_params) @system
{
    assert((mode == 6) || (!pParams.m_has_alpha));

    // ulong.max marks "no solution recorded yet" (find_optimal_solution tests for it).
    pResults.m_best_overall_err = ulong.max;

    // If the partition's colors are all the same in mode 1, then just pack them as a single color.
    if (mode == 1)
    {
        const uint cr = pParams.m_pPixels[0].m_c[0], cg = pParams.m_pPixels[0].m_c[1], cb = pParams.m_pPixels[0].m_c[2];

        bc7enc16_bool allSame = BC7ENC16_TRUE;
        for (uint i = 1; i < pParams.m_num_pixels; i++)
        {
            if ((cr != pParams.m_pPixels[i].m_c[0]) || (cg != pParams.m_pPixels[i].m_c[1]) || (cb != pParams.m_pPixels[i].m_c[2]))
            {
                allSame = BC7ENC16_FALSE;
                break;
            }
        }

        if (allSame)
            return pack_mode1_to_one_color(pParams, pResults, cr, cg, cb, pResults.m_pSelectors);
    }

    // Compute partition's mean color and principal axis.
    vec4F meanColor, axis;
    vec4F_set_scalar(&meanColor, 0.0f);

    // Sum all pixels; the sum is turned into two differently scaled means below.
    for (uint i = 0; i < pParams.m_num_pixels; i++)
    {
        vec4F color = vec4F_from_color(&pParams.m_pPixels[i]);
        meanColor = vec4F_add(&meanColor, &color);
    }

    // Mean in the same units as the pixel components (sum / count).
    vec4F meanColorScaled = vec4F_mul(&meanColor, 1.0f / cast(float)(pParams.m_num_pixels));

    // Mean additionally divided by 255, then saturated; this is the one used to build the endpoints.
    meanColor = vec4F_mul(&meanColor, 1.0f / cast(float)(pParams.m_num_pixels * 255.0f));
    vec4F_saturate_in_place(meanColor);

    if (pParams.m_has_alpha)
    {
        // Use incremental PCA for RGBA PCA, because it's simple.
        vec4F_set_scalar(&axis, 0.0f);
        for (uint i = 0; i < pParams.m_num_pixels; i++)
        {
            vec4F color = vec4F_from_color(&pParams.m_pPixels[i]);
            color = vec4F_sub(&color, &meanColorScaled);
            // a..d are the rows of the outer product color * color^T.
            vec4F a = vec4F_mul(&color, color.m_c[0]);
            vec4F b = vec4F_mul(&color, color.m_c[1]);
            vec4F c = vec4F_mul(&color, color.m_c[2]);
            vec4F d = vec4F_mul(&color, color.m_c[3]);
            // Direction estimate: the running axis, or the first centered pixel on the first iteration.
            vec4F n = i ? axis : color;
            vec4F_normalize_in_place(&n);
            axis.m_c[0] += vec4F_dot(&a, &n);
            axis.m_c[1] += vec4F_dot(&b, &n);
            axis.m_c[2] += vec4F_dot(&c, &n);
            axis.m_c[3] += vec4F_dot(&d, &n);
        }
        vec4F_normalize_in_place(&axis);
    }
    else
    {
        // Use covar technique for RGB PCA, because it doesn't require per-pixel normalization.
        // cov holds the upper triangle of the symmetric 3x3 matrix: rr, rg, rb, gg, gb, bb.
        float[6] cov = [ 0, 0, 0, 0, 0, 0 ];

        for (uint i = 0; i < pParams.m_num_pixels; i++)
        {
            const color_quad_u8 *pV = &pParams.m_pPixels[i];
            float r = pV.m_c[0] - meanColorScaled.m_c[0];
            float g = pV.m_c[1] - meanColorScaled.m_c[1];
            float b = pV.m_c[2] - meanColorScaled.m_c[2];
            cov[0] += r*r; cov[1] += r*g; cov[2] += r*b; cov[3] += g*g; cov[4] += g*b; cov[5] += b*b;
        }

        // Three power iterations from a fixed start vector; each step multiplies by the
        // covariance matrix and rescales by the largest absolute component.
        float vfr = .9f, vfg = 1.0f, vfb = .7f;
        for (uint iter = 0; iter < 3; iter++)
        {
            float r = vfr*cov[0] + vfg*cov[1] + vfb*cov[2];
            float g = vfr*cov[1] + vfg*cov[3] + vfb*cov[4];
            float b = vfr*cov[2] + vfg*cov[4] + vfb*cov[5];

            float m = maximumf(maximumf(abs(r), abs(g)), abs(b));
            if (m > 1e-10f)
            {
                m = 1.0f / m;
                r *= m; g *= m; b *= m;
            }

            vfr = r; vfg = g; vfb = b;
        }

        // Normalize to unit length, or fall back to a zero axis when the vector vanished.
        float len = vfr*vfr + vfg*vfg + vfb*vfb;
        if (len < 1e-10f)
            vec4F_set_scalar(&axis, 0.0f);
        else
        {
            len = 1.0f / sqrt(len);
            vfr *= len; vfg *= len; vfb *= len;
            vec4F_set(&axis, vfr, vfg, vfb, 0);
        }
    }

    // Axis too short to be usable: substitute a fixed direction (luma-like weights when
    // perceptual, the gray diagonal otherwise).
    if (vec4F_dot(&axis, &axis) < .5f)
    {
        if (pParams.m_perceptual)
            vec4F_set(&axis, .213f, .715f, .072f, pParams.m_has_alpha ? .715f : 0);
        else
            vec4F_set(&axis, 1.0f, 1.0f, 1.0f, pParams.m_has_alpha ? 1.0f : 0);
        vec4F_normalize_in_place(&axis);
    }

    // Project every centered pixel onto the axis and record the extreme projections.
    float l = 1e+9f, h = -1e+9f;

    for (uint i = 0; i < pParams.m_num_pixels; i++)
    {
        vec4F color = vec4F_from_color(&pParams.m_pPixels[i]);

        vec4F q = vec4F_sub(&color, &meanColorScaled);
        float d = vec4F_dot(&q, &axis);

        l = minimumf(l, d);
        h = maximumf(h, d);
    }

    // Bring the extents to the same scale as meanColor (divided by 255).
    l *= (1.0f / 255.0f);
    h *= (1.0f / 255.0f);

    // Initial endpoints: mean plus the extreme offsets along the axis, saturated.
    vec4F b0 = vec4F_mul(&axis, l);
    vec4F b1 = vec4F_mul(&axis, h);
    vec4F c0 = vec4F_add(&meanColor, &b0);
    vec4F c1 = vec4F_add(&meanColor, &b1);
    vec4F minColor = vec4F_saturate(&c0);
    vec4F maxColor = vec4F_saturate(&c1);

    // Order the endpoints so that minColor has the smaller component sum
    // (the dot product with an all-ones vector).
    vec4F whiteVec;
    vec4F_set_scalar(&whiteVec, 1.0f);
    if (vec4F_dot(&minColor, &whiteVec) > vec4F_dot(&maxColor, &whiteVec))
    {
        vec4F temp = minColor;
        minColor = maxColor;
        maxColor = temp;
    }
    // First find a solution using the block's PCA.
    if (!find_optimal_solution(mode, minColor, maxColor, pParams, pResults))
        return 0;

    if (pComp_params.m_try_least_squares)
    {
        // Now try to refine the solution using least squares by computing the optimal endpoints from the current selectors.
        vec4F xl, xh;
        vec4F_set_scalar(&xl, 0.0f);
        vec4F_set_scalar(&xh, 0.0f);
        if (pParams.m_has_alpha)
            compute_least_squares_endpoints_rgba(pParams.m_num_pixels, pResults.m_pSelectors, pParams.m_pSelector_weightsx, &xl, &xh, pParams.m_pPixels);
        else
            compute_least_squares_endpoints_rgb(pParams.m_num_pixels, pResults.m_pSelectors, pParams.m_pSelector_weightsx, &xl, &xh, pParams.m_pPixels);

        xl = vec4F_mul(&xl, (1.0f / 255.0f));
        xh = vec4F_mul(&xh, (1.0f / 255.0f));

        if (!find_optimal_solution(mode, xl, xh, pParams, pResults))
            return 0;
    }

    if (pComp_params.m_uber_level > 0)
    {
        // In uber level 1, try varying the selectors a little, somewhat like cluster fit would. First try incrementing the minimum selectors,
        // then try decrementing the selectors, then try both.
        // selectors_temp is a snapshot of the best selectors at this point; every variation
        // below is derived from the snapshot, not from whatever pResults holds later.
        ubyte[16] selectors_temp, selectors_temp1;
        memcpy(selectors_temp.ptr, pResults.m_pSelectors, pParams.m_num_pixels);

        const int max_selector = pParams.m_num_selector_weights - 1;

        // Smallest and largest selector value present in the snapshot.
        uint min_sel = 16;
        uint max_sel = 0;
        for (uint i = 0; i < pParams.m_num_pixels; i++)
        {
            uint sel = selectors_temp[i];
            min_sel = minimumu(min_sel, sel);
            max_sel = maximumu(max_sel, sel);
        }

        // Variation 1: raise every pixel that sits on the minimum selector by one.
        for (uint i = 0; i < pParams.m_num_pixels; i++)
        {
            uint sel = selectors_temp[i];
            if ((sel == min_sel) && (sel < (pParams.m_num_selector_weights - 1)))
                sel++;
            selectors_temp1[i] = cast(ubyte)sel;
        }

        vec4F xl, xh;
        vec4F_set_scalar(&xl, 0.0f);
        vec4F_set_scalar(&xh, 0.0f);
        if (pParams.m_has_alpha)
            compute_least_squares_endpoints_rgba(pParams.m_num_pixels, selectors_temp1.ptr, 
                                                 pParams.m_pSelector_weightsx, &xl, &xh, pParams.m_pPixels);
        else
            compute_least_squares_endpoints_rgb(pParams.m_num_pixels, selectors_temp1.ptr, 
                                                pParams.m_pSelector_weightsx, &xl, &xh, pParams.m_pPixels);

        xl = vec4F_mul(&xl, (1.0f / 255.0f));
        xh = vec4F_mul(&xh, (1.0f / 255.0f));

        if (!find_optimal_solution(mode, xl, xh, pParams, pResults))
            return 0;

        // Variation 2: lower every pixel that sits on the maximum selector by one.
        for (uint i = 0; i < pParams.m_num_pixels; i++)
        {
            uint sel = selectors_temp[i];
            if ((sel == max_sel) && (sel > 0))
                sel--;
            selectors_temp1[i] = cast(ubyte)sel;
        }

        // NOTE(review): xl/xh are not reset to zero before this and the next refit; they still hold
        // the previous scaled result. Presumably the compute_least_squares_endpoints_* helpers
        // overwrite every component they are responsible for - confirm against their definitions.
        if (pParams.m_has_alpha)
            compute_least_squares_endpoints_rgba(pParams.m_num_pixels, selectors_temp1.ptr, 
                                                 pParams.m_pSelector_weightsx, &xl, &xh, pParams.m_pPixels);
        else
            compute_least_squares_endpoints_rgb(pParams.m_num_pixels, selectors_temp1.ptr, 
                                                pParams.m_pSelector_weightsx, &xl, &xh, pParams.m_pPixels);

        xl = vec4F_mul(&xl, (1.0f / 255.0f));
        xh = vec4F_mul(&xh, (1.0f / 255.0f));

        if (!find_optimal_solution(mode, xl, xh, pParams, pResults))
            return 0;

        // Variation 3: apply both adjustments at once (the minimum test takes precedence
        // for a pixel that is both the minimum and the maximum).
        for (uint i = 0; i < pParams.m_num_pixels; i++)
        {
            uint sel = selectors_temp[i];
            if ((sel == min_sel) && (sel < (pParams.m_num_selector_weights - 1)))
                sel++;
            else if ((sel == max_sel) && (sel > 0))
                sel--;
            selectors_temp1[i] = cast(ubyte)sel;
        }

        if (pParams.m_has_alpha)
            compute_least_squares_endpoints_rgba(pParams.m_num_pixels, selectors_temp1.ptr, 
                                                 pParams.m_pSelector_weightsx, &xl, &xh, pParams.m_pPixels);
        else
            compute_least_squares_endpoints_rgb(pParams.m_num_pixels, selectors_temp1.ptr, 
                                                pParams.m_pSelector_weightsx, &xl, &xh, pParams.m_pPixels);

        xl = vec4F_mul(&xl, (1.0f / 255.0f));
        xh = vec4F_mul(&xh, (1.0f / 255.0f));

        if (!find_optimal_solution(mode, xl, xh, pParams, pResults))
            return 0;

        // In uber levels 2+, try taking more advantage of endpoint extrapolation by scaling the selectors in one direction or another.
        // The extra search only runs while the best error is still above this per-pixel-scaled threshold.
        const uint uber_err_thresh = (pParams.m_num_pixels * 56) >> 4;
        if ((pComp_params.m_uber_level >= 2) && (pResults.m_best_overall_err > uber_err_thresh))
        {
            // Q widens the range of trial remappings at uber level 4 and above.
            const int Q = (pComp_params.m_uber_level >= 4) ? (pComp_params.m_uber_level - 2) : 1;
            for (int ly = -Q; ly <= 1; ly++)
            {
                for (int hy = max_selector - 1; hy <= (max_selector + Q); hy++)
                {
                    // ly == 0 and hy == max_selector is the identity remapping; skip it.
                    if ((ly == 0) && (hy == max_selector))
                        continue;

                    // Linearly remap the snapshot selectors from [ly, hy] onto [0, max_selector],
                    // rounding to nearest and clamping to the valid selector range.
                    for (uint i = 0; i < pParams.m_num_pixels; i++)
                        selectors_temp1[i] = cast(ubyte)clampf(floor(cast(float)max_selector * (cast(float)selectors_temp[i] - cast(float)ly) / (cast(float)hy - cast(float)ly) + .5f), 0, cast(float)max_selector);

                    //vec4F xl, xh;
                    vec4F_set_scalar(&xl, 0.0f);
                    vec4F_set_scalar(&xh, 0.0f);
                    if (pParams.m_has_alpha)
                        compute_least_squares_endpoints_rgba(pParams.m_num_pixels, selectors_temp1.ptr, pParams.m_pSelector_weightsx, &xl, &xh, pParams.m_pPixels);
                    else
                        compute_least_squares_endpoints_rgb(pParams.m_num_pixels, selectors_temp1.ptr, pParams.m_pSelector_weightsx, &xl, &xh, pParams.m_pPixels);

                    xl = vec4F_mul(&xl, (1.0f / 255.0f));
                    xh = vec4F_mul(&xh, (1.0f / 255.0f));

                    if (!find_optimal_solution(mode, xl, xh, pParams, pResults))
                        return 0;
                }
            }
        }
    }

    if (mode == 1)
    {
        // Try encoding the partition as a single color by using the optimal single color tables to encode the block to its mean.
        // The attempt works on a copy of the results and writes its selectors to m_pSelectors_temp,
        // so the current best solution is only replaced when the single-color error is lower.
        color_cell_compressor_results avg_results = *pResults;
        const uint r = cast(int)(.5f + meanColor.m_c[0] * 255.0f), 
                   g = cast(int)(.5f + meanColor.m_c[1] * 255.0f), 
                   b = cast(int)(.5f + meanColor.m_c[2] * 255.0f);
        ulong avg_err = pack_mode1_to_one_color(pParams, &avg_results, r, g, b, pResults.m_pSelectors_temp);
        if (avg_err < pResults.m_best_overall_err)
        {
            *pResults = avg_results;
            memcpy(pResults.m_pSelectors, pResults.m_pSelectors_temp, (pResults.m_pSelectors[0]).sizeof * pParams.m_num_pixels);
            pResults.m_best_overall_err = avg_err;
        }
    }

    return pResults.m_best_overall_err;
}
1234 
/// Fast, approximate encoding error for a set of pixels (RGB only).
///
/// The endpoints are placed at the corners of the pixels' RGB bounding box,
/// the 8 interpolated colors are built from g_bc7_weights3, and each pixel is
/// assigned to a palette entry by comparing its projection onto the box
/// diagonal against midpoint thresholds. The weighted squared error of that
/// assignment is accumulated, in a luma/chroma-difference space when
/// `perceptual` is set and in plain RGB otherwise.
///
/// Params:
///   num_pixels      = number of entries read from pPixels.
///   pPixels         = pixels to estimate; only components 0..2 are read.
///   perceptual      = selects the luma/chroma error metric.
///   pweights        = per-channel error weights; entries 0..2 are used.
///   best_err_so_far = accumulation stops as soon as the running total exceeds this.
///
/// Returns: the accumulated error (a partial sum if the early exit triggered).
ulong color_cell_compression_est(uint num_pixels, const color_quad_u8 *pPixels, bc7enc16_bool perceptual, uint* pweights/*[4]*/, ulong best_err_so_far) @system
{
    enum uint N = 8;

    // Find RGB bounds as an approximation of the block's principal axis
    uint[3] boxMin = [ 255, 255, 255 ];
    uint[3] boxMax = [ 0, 0, 0 ];
    foreach (px; 0 .. num_pixels)
    {
        foreach (ch; 0 .. 3)
        {
            const uint v = pPixels[px].m_c[ch];
            if (v < boxMin[ch]) boxMin[ch] = v;
            if (v > boxMax[ch]) boxMax[ch] = v;
        }
    }

    color_quad_u8 lowColor;
    color_quad_u8_set(&lowColor, boxMin[0], boxMin[1], boxMin[2], 0);
    color_quad_u8 highColor;
    color_quad_u8_set(&highColor, boxMax[0], boxMax[1], boxMax[2], 0);

    // Place endpoints at bbox diagonals and compute interpolated colors
    color_quad_u8[N] weightedColors;
    weightedColors[0] = lowColor;
    weightedColors[N - 1] = highColor;
    foreach (step; 1 .. N - 1)
    {
        foreach (ch; 0 .. 3)
            weightedColors[step].m_c[ch] = cast(ubyte)((lowColor.m_c[ch] * (64 - g_bc7_weights3[step]) + highColor.m_c[ch] * g_bc7_weights3[step] + 32) >> 6);
    }

    // Compute dots and thresholds: project each palette entry onto the box diagonal,
    // then take the midpoint between neighbouring projections as the decision boundary.
    const int ar = highColor.m_c[0] - lowColor.m_c[0];
    const int ag = highColor.m_c[1] - lowColor.m_c[1];
    const int ab = highColor.m_c[2] - lowColor.m_c[2];

    int[N] dots;
    foreach (k; 0 .. N)
        dots[k] = weightedColors[k].m_c[0] * ar + weightedColors[k].m_c[1] * ag + weightedColors[k].m_c[2] * ab;

    int[N - 1] thresh;
    foreach (k; 0 .. N - 1)
        thresh[k] = (dots[k] + dots[k + 1] + 1) >> 1;

    // Perceptual metric only: transform block's interpolated colors to YCbCr once up front.
    int[N] l1, cr1, cb1;
    if (perceptual)
    {
        foreach (j; 0 .. N)
        {
            const color_quad_u8 *pE1 = &weightedColors[j];
            l1[j] = pE1.m_c[0] * 109 + pE1.m_c[1] * 366 + pE1.m_c[2] * 37;
            cr1[j] = (cast(int)pE1.m_c[0] << 9) - l1[j];
            cb1[j] = (cast(int)pE1.m_c[2] << 9) - l1[j];
        }
    }

    ulong total_err = 0;
    foreach (px; 0 .. num_pixels)
    {
        const color_quad_u8 *pC = &pPixels[px];

        const int d = ar * pC.m_c[0] + ag * pC.m_c[1] + ab * pC.m_c[2];

        // Find approximate selector: the highest palette index whose lower threshold d reaches.
        uint s = 0;
        for (uint t = N - 1; t > 0; t--)
        {
            if (d >= thresh[t - 1])
            {
                s = t;
                break;
            }
        }

        // Compute error
        if (perceptual)
        {
            const int l2 = pC.m_c[0] * 109 + pC.m_c[1] * 366 + pC.m_c[2] * 37;
            const int cr2 = (cast(int)pC.m_c[0] << 9) - l2;
            const int cb2 = (cast(int)pC.m_c[2] << 9) - l2;

            const int dl = (l1[s] - l2) >> 8;
            const int dcr = (cr1[s] - cr2) >> 8;
            const int dcb = (cb1[s] - cb2) >> 8;

            // Kept as a signed int before accumulation.
            int ie = (pweights[0] * dl * dl) + (pweights[1] * dcr * dcr) + (pweights[2] * dcb * dcb);

            total_err += ie;
        }
        else
        {
            const color_quad_u8 *pE1 = &weightedColors[s];

            int dr = cast(int)pE1.m_c[0] - cast(int)pC.m_c[0];
            int dg = cast(int)pE1.m_c[1] - cast(int)pC.m_c[1];
            int db = cast(int)pE1.m_c[2] - cast(int)pC.m_c[2];

            total_err += pweights[0] * (dr * dr) + pweights[1] * (dg * dg) + pweights[2] * (db * db);
        }

        // Early out: this candidate can no longer beat the best one.
        if (total_err > best_err_so_far)
            break;
    }

    return total_err;
}
1372 
// Filter table for the mode 1 partition estimator, indexed by partition pattern (0..34).
// Each entry is a bitmask of "key" partitions: bit (k + 1) set means "evaluate this pattern when key
// partition k ranked best". uint.max means the pattern is always evaluated.
// The estimator first ranks the 14 best/most used partitions (sorted by usefulness), records the best one
// found as the key partition, then consults this table to decide which of the remaining patterns to evaluate.
// The quality loss is ~.08 dB RGB PSNR, the perf gain is up to ~11% (at uber level 0).
static immutable uint[35] g_partition_predictors =
[
    uint.max,                                                                // 0
    uint.max,                                                                // 1
    uint.max,                                                                // 2
    uint.max,                                                                // 3
    uint.max,                                                                // 4
    (1 << 1) | (1 << 2) | (1 << 8),                                          // 5
    (1 << 1) | (1 << 3) | (1 << 7),                                          // 6
    uint.max,                                                                // 7
    uint.max,                                                                // 8
    (1 << 2) | (1 << 8) | (1 << 16),                                         // 9
    (1 << 3) | (1 << 7) | (1 << 15),                                         // 10
    uint.max,                                                                // 11
    (1 << 8) | (1 << 14) | (1 << 16),                                        // 12
    (1 << 7) | (1 << 14) | (1 << 15),                                        // 13
    uint.max,                                                                // 14
    uint.max,                                                                // 15
    uint.max,                                                                // 16
    uint.max,                                                                // 17
    (1 << 14) | (1 << 15),                                                   // 18
    (1 << 14) | (1 << 16) | (1 << 22),                                       // 19
    (1 << 14) | (1 << 17) | (1 << 24),                                       // 20
    (1 << 1) | (1 << 2) | (1 << 14) | (1 << 15),                             // 21
    uint.max,                                                                // 22
    (1 << 1) | (1 << 3) | (1 << 14) | (1 << 16) | (1 << 22),                 // 23
    uint.max,                                                                // 24
    (1 << 1) | (1 << 2) | (1 << 15) | (1 << 17) | (1 << 24),                 // 25
    (1 << 1) | (1 << 3) | (1 << 22),                                         // 26
    uint.max,                                                                // 27
    uint.max,                                                                // 28
    uint.max,                                                                // 29
    (1 << 14) | (1 << 15) | (1 << 16) | (1 << 17),                           // 30
    uint.max,                                                                // 31
    uint.max,                                                                // 32
    (1 << 1) | (1 << 2) | (1 << 3) | (1 << 4) | (1 << 24) | (1 << 27),       // 33
    (1 << 11) | (1 << 14) | (1 << 15) | (1 << 16) | (1 << 17) | (1 << 27)    // 34
];
1414 
// Estimate the partition used by mode 1.
// Candidate partitions are visited in a fixed, usage-sorted order; for each one the 16 pixels are split
// into its two subsets and an approximate error is computed per subset with color_cell_compression_est.
// The partition with the lowest total estimate is returned (0 when at most one partition is allowed).
uint estimate_partition(const(color_quad_u8)* pPixels, 
                        const(bc7enc16_compress_block_params)* pComp_params, 
                        uint* pweights/*[4]*/) @system
{
    const uint total_partitions = minimumu(pComp_params.m_max_partitions_mode1, BC7ENC16_MAX_PARTITIONS1);
    if (total_partitions <= 1)
        return 0;

    // Partition order sorted by usage frequency across a large test corpus. Pattern 34 (checkerboard) must appear in slot 34.
    // Using a sorted order allows the user to decrease the # of partitions to scan with minimal loss in quality.
    // Values are zero-based partition indices.
    static immutable ubyte[64] s_sorted_partition_order =
    [
         0, 13,  1,  2, 15, 14, 10, 16,
         3, 23, 26,  6,  7, 21, 19, 29,
         8,  4,  9, 20,  5, 31, 22, 17,
        18, 11, 12, 30, 24, 25, 28, 27,
        32, 33, 34, 45, 46, 51, 49, 50,
        48, 38, 39, 37, 53, 52, 54, 36,
        57, 58, 55, 41, 40, 42, 43, 59,
        44, 56, 47, 35, 60, 63, 62, 61
    ];

    assert(s_sorted_partition_order[34] == 34);

    ulong best_err = ulong.max;
    uint best_partition = 0;
    int best_key_partition = 0;

    foreach (partition_iter; 0 .. total_partitions)
    {
        // A perfect estimate cannot be improved on.
        if (best_err == 0)
            break;

        const uint partition = s_sorted_partition_order[partition_iter];

        // Check to see if we should bother evaluating this partition at all, depending on the best partition found from the first 14.
        const bool inFilteredSlots = (partition_iter >= 14) && (partition_iter <= 34);
        if (pComp_params.m_mode1_partition_estimation_filterbank && inFilteredSlots)
        {
            const uint best_key_partition_bitmask = 1 << (best_key_partition + 1);
            const bool predicted = (g_partition_predictors[partition] & best_key_partition_bitmask) != 0;
            if (!predicted)
            {
                // Slot 34 is the last filtered slot; skipping it ends the scan.
                if (partition_iter == 34)
                    break;

                continue;
            }
        }

        // Distribute the 16 pixels over the two subsets of this partition pattern.
        const ubyte *pPartition = &g_bc7_partition2[partition * 16];

        color_quad_u8[16][2] subset_colors;
        uint[2] subset_total_colors = [ 0, 0 ];
        foreach (index; 0 .. 16)
        {
            const uint which = pPartition[index];
            subset_colors[which][subset_total_colors[which]] = pPixels[index];
            subset_total_colors[which]++;
        }

        // Sum the per-subset estimates, giving up once the total can no longer win.
        ulong total_subset_err = 0;
        foreach (subset; 0 .. 2)
        {
            if (total_subset_err >= best_err)
                break;
            total_subset_err += color_cell_compression_est(subset_total_colors[subset], &subset_colors[subset][0], pComp_params.m_perceptual, pweights, best_err);
        }

        if (total_subset_err < best_err)
        {
            best_err = total_subset_err;
            best_partition = partition;
        }

        // If the checkerboard pattern doesn't get the highest ranking vs. the previous (lower frequency) patterns, then just stop now because statistically the subsequent patterns won't do well either.
        if ((partition == 34) && (best_partition != 34))
            break;

        // After the first 14 slots, remember the winner as the key partition used by the filter above.
        if (partition_iter == 13)
            best_key_partition = best_partition;
    }

    return best_partition;
}
1493 
/// Appends the low `num_bits` bits of `val` to a bit stream, least significant bit first.
///
/// Bits are OR-ed into pBytes starting at bit offset *pCur_ofs, so the target
/// bits must be zero beforehand. *pCur_ofs is advanced by num_bits.
///
/// Params:
///   pBytes   = destination buffer; the final offset is asserted to stay within 128 bits.
///   val      = value to write; asserted to fit in num_bits bits.
///   num_bits = number of bits to write (asserted to be at most 32).
///   pCur_ofs = in/out bit position within pBytes.
void set_block_bits(ubyte *pBytes, uint val, uint num_bits, uint *pCur_ofs) @system
{
    assert((num_bits <= 32) && (val < (1UL << num_bits)));

    for (uint remaining = num_bits; remaining != 0; )
    {
        // Position inside the current byte, and how many bits still fit into it.
        const uint bit_in_byte = *pCur_ofs & 7;
        const uint chunk = minimumu(8 - bit_in_byte, remaining);

        // The cast discards everything that spills past the current byte.
        pBytes[*pCur_ofs >> 3] |= cast(ubyte)(val << bit_in_byte);

        val >>= chunk;
        remaining -= chunk;
        *pCur_ofs += chunk;
    }

    assert(*pCur_ofs <= 128);
}
1507 
/// Description of one encoded block, as consumed by encode_bc7_block.
struct bc7_optimization_results
{
    /// BC7 mode number; used as the index into the per-mode tables
    /// (subset count, partition bits, endpoint precision, p-bit sharing).
    uint m_mode;
    /// Partition pattern index. Selects the 16-entry slice of g_bc7_partition2
    /// for two-subset modes, and is written to the block as 6 bits when the
    /// mode has more than one partition.
    uint m_partition;
    /// One selector (index) per pixel of the block, in pixel order.
    ubyte[16] m_selectors;
    /// Low endpoint for each subset (index = subset).
    color_quad_u8[2] m_low;
    /// High endpoint for each subset (index = subset).
    color_quad_u8[2] m_high;
    /// P-bits, indexed [subset][endpoint]. For modes with shared p-bits only
    /// [subset][0] is written by encode_bc7_block.
    uint[2][2] m_pbits;
}
1517 
/// Serializes `pResults` into the 16-byte BC7 block at `pBlock`.
///
/// Layout written, in order: the mode marker, the partition index (only if
/// the mode has more than one partition), the endpoints component by
/// component, the p-bits, then the 16 selectors. Each subset's anchor
/// selector is stored with one bit less, so a subset whose anchor selector
/// has its top bit set is first flipped: selectors inverted, endpoints
/// swapped, and (for non-shared p-bit modes) p-bits swapped.
///
/// Params:
///     pBlock   = destination, BC7ENC16_BLOCK_SIZE bytes; fully overwritten
///     pResults = mode, partition, endpoints, p-bits and selectors to encode
static void encode_bc7_block(void *pBlock, const(bc7_optimization_results)* pResults) @system
{
    const uint mode = pResults.m_mode;
    const uint num_subsets = g_bc7_num_subsets[mode];
    const uint num_partitions = 1 << g_bc7_partition_bits[mode];

    // Per-pixel subset assignment for the chosen partition pattern.
    const(ubyte)* pSubset_of_pixel;
    if (num_subsets == 2)
        pSubset_of_pixel = &g_bc7_partition2[pResults.m_partition * 16];
    else
        pSubset_of_pixel = &g_bc7_partition1[0];

    // Work on private copies: the anchor fix-up below mutates them.
    ubyte[16] selectors = pResults.m_selectors;

    color_quad_u8[2] lo_ep, hi_ep;
    memcpy(lo_ep.ptr, pResults.m_low.ptr, lo_ep.sizeof);
    memcpy(hi_ep.ptr, pResults.m_high.ptr, hi_ep.sizeof);

    uint[2][2] pbits = pResults.m_pbits;
    static assert(pbits.sizeof == 16);

    // -1 marks "no anchor" for a subset that is not in use.
    int[2] anchor = [ -1, -1 ];

    foreach (k; 0 .. num_subsets)
    {
        // Subset 0 is always anchored at pixel 0; subset 1 uses the table.
        uint anchor_index = 0;
        if (k != 0)
            anchor_index = g_bc7_table_anchor_index_second_subset[pResults.m_partition];
        anchor[k] = anchor_index;

        const uint index_bits = get_bc7_color_index_size(mode, 0);
        const uint num_indices = 1 << index_bits;

        // Nothing to do when the anchor selector's top bit is already clear.
        if ((selectors[anchor_index] & (num_indices >> 1)) == 0)
            continue;

        // Invert every selector that belongs to this subset...
        foreach (i; 0 .. 16)
        {
            if (pSubset_of_pixel[i] == k)
                selectors[i] = cast(ubyte)((num_indices - 1) - selectors[i]);
        }

        // ...and swap the endpoints so the decoded colors are unchanged.
        color_quad_u8 saved_lo = lo_ep[k];
        lo_ep[k] = hi_ep[k];
        hi_ep[k] = saved_lo;

        // Per-endpoint p-bits follow their endpoints.
        if (!g_bc7_mode_has_shared_p_bits[mode])
        {
            const uint saved_pbit = pbits[k][0];
            pbits[k][0] = pbits[k][1];
            pbits[k][1] = saved_pbit;
        }
    }

    ubyte *pOut = cast(ubyte *)(pBlock);
    memset(pOut, 0, BC7ENC16_BLOCK_SIZE);

    uint bit_pos = 0;

    // Mode marker: a single set bit at position `mode`.
    set_block_bits(pOut, 1 << mode, mode + 1, &bit_pos);

    if (num_partitions > 1)
        set_block_bits(pOut, pResults.m_partition, 6, &bit_pos);

    // Endpoints, grouped by component, then by subset (low before high).
    const uint num_comps = (mode >= 4) ? 4 : 3;
    foreach (comp; 0 .. num_comps)
    {
        foreach (subset; 0 .. num_subsets)
        {
            const uint precision = (comp == 3) ? g_bc7_alpha_precision_table[mode] : g_bc7_color_precision_table[mode];
            set_block_bits(pOut, lo_ep[subset].m_c[comp], precision, &bit_pos);
            set_block_bits(pOut, hi_ep[subset].m_c[comp], precision, &bit_pos);
        }
    }

    // P-bits: one per subset when shared, otherwise one per endpoint.
    foreach (subset; 0 .. num_subsets)
    {
        set_block_bits(pOut, pbits[subset][0], 1, &bit_pos);
        if (!g_bc7_mode_has_shared_p_bits[mode])
            set_block_bits(pOut, pbits[subset][1], 1, &bit_pos);
    }

    // Selectors; anchor pixels are stored with one bit less.
    for (int idx = 0; idx < 16; idx++)
    {
        const uint full_bits = get_bc7_color_index_size(mode, 0);
        const bool is_anchor = (idx == anchor[0]) || (idx == anchor[1]);
        set_block_bits(pOut, selectors[idx], is_anchor ? full_bits - 1 : full_bits, &bit_pos);
    }

    assert(bit_pos == 128);
}
1601 
/// Encodes a block that contains at least one non-opaque pixel, using BC7 mode 6.
///
/// Configures `pParams` for mode 6 (16 selector weights, 7-bit components,
/// one p-bit per endpoint, alpha enabled), runs the color cell compressor on
/// all 16 pixels and writes the resulting block to `pBlock`.
///
/// Params:
///     pBlock       = destination for the encoded block (written by encode_bc7_block)
///     pPixels      = the 16 source pixels
///     pComp_params = user compression settings; m_perceptual is copied into pParams
///     pParams      = compressor parameters; the mode-dependent fields are overwritten here
void handle_alpha_block(void *pBlock, const(color_quad_u8)* pPixels, 
                        const(bc7enc16_compress_block_params)* pComp_params, 
                        color_cell_compressor_params *pParams) @system
{
    color_cell_compressor_results results6;

    pParams.m_pSelector_weights = g_bc7_weights4.ptr;
    pParams.m_pSelector_weightsx = cast(const(vec4F)*) g_bc7_weights4x.ptr;
    pParams.m_num_selector_weights = 16;
    pParams.m_comp_bits = 7;
    pParams.m_has_pbits = BC7ENC16_TRUE;
    // Mode 6 uses a separate p-bit per endpoint (both m_pbits[0] and [1] are
    // consumed below). Set this explicitly, as handle_opaque_block does, rather
    // than relying on the caller handing in a params struct with the flag cleared.
    pParams.m_endpoints_share_pbit = BC7ENC16_FALSE;
    pParams.m_has_alpha = BC7ENC16_TRUE;
    pParams.m_perceptual = pComp_params.m_perceptual;
    pParams.m_num_pixels = 16;
    pParams.m_pPixels = pPixels;

    // The compressor writes the final selectors straight into opt_results.
    bc7_optimization_results opt_results;
    results6.m_pSelectors = opt_results.m_selectors.ptr;

    ubyte[16] selectors_temp;
    results6.m_pSelectors_temp = selectors_temp.ptr;

    color_cell_compression(6, pParams, &results6, pComp_params);

    opt_results.m_mode = 6;
    opt_results.m_partition = 0;
    opt_results.m_low[0] = results6.m_low_endpoint;
    opt_results.m_high[0] = results6.m_high_endpoint;
    opt_results.m_pbits[0][0] = results6.m_pbits[0];
    opt_results.m_pbits[0][1] = results6.m_pbits[1];

    encode_bc7_block(pBlock, &opt_results);
}
1635 
/// Encodes a fully opaque block, choosing between BC7 mode 6 and mode 1.
///
/// Mode 6 is always evaluated first over all 16 pixels. If its error is
/// non-zero and mode 1 is enabled (m_max_partitions_mode1 > 0), a single
/// partition chosen by estimate_partition is tried in mode 1 and adopted
/// only when its total error is strictly lower.
///
/// Params:
///     pBlock       = destination for the encoded block
///     pPixels      = the 16 source pixels
///     pComp_params = user compression settings
///     pParams      = compressor parameters; the mode-dependent fields are overwritten here
static void handle_opaque_block(void *pBlock, 
                                const(color_quad_u8)* pPixels, 
                                const(bc7enc16_compress_block_params)* pComp_params, 
                                color_cell_compressor_params *pParams) @system
{
    ubyte[16] scratch_selectors;
    bc7_optimization_results winner;

    // ---- Mode 6: one subset, 16 selector weights, p-bit per endpoint ----
    pParams.m_pPixels = pPixels;
    pParams.m_num_pixels = 16;
    pParams.m_perceptual = pComp_params.m_perceptual;
    pParams.m_has_alpha = BC7ENC16_FALSE;
    pParams.m_pSelector_weights = g_bc7_weights4.ptr;
    pParams.m_pSelector_weightsx = cast(const vec4F *)g_bc7_weights4x;
    pParams.m_num_selector_weights = 16;
    pParams.m_comp_bits = 7;
    pParams.m_has_pbits = BC7ENC16_TRUE;
    pParams.m_endpoints_share_pbit = BC7ENC16_FALSE;

    // Selectors land directly in `winner`; they are overwritten later if mode 1 wins.
    color_cell_compressor_results mode6;
    mode6.m_pSelectors = winner.m_selectors.ptr;
    mode6.m_pSelectors_temp = scratch_selectors.ptr;

    ulong lowest_err = color_cell_compression(6, pParams, &mode6, pComp_params);

    winner.m_mode = 6;
    winner.m_partition = 0;
    winner.m_low[0] = mode6.m_low_endpoint;
    winner.m_high[0] = mode6.m_high_endpoint;
    winner.m_pbits[0][0] = mode6.m_pbits[0];
    winner.m_pbits[0][1] = mode6.m_pbits[1];

    // Mode 6 is already exact, or mode 1 is disabled: emit mode 6 as is.
    if ((lowest_err == 0) || (pComp_params.m_max_partitions_mode1 == 0))
    {
        encode_bc7_block(pBlock, &winner);
        return;
    }

    // ---- Mode 1: two subsets, 8 selector weights, shared p-bit ----
    const uint candidate_partition = estimate_partition(pPixels, pComp_params, pParams.m_weights.ptr);

    pParams.m_pSelector_weights = g_bc7_weights3.ptr;
    pParams.m_pSelector_weightsx = cast(const vec4F *)g_bc7_weights3x;
    pParams.m_num_selector_weights = 8;
    pParams.m_comp_bits = 6;
    pParams.m_has_pbits = BC7ENC16_TRUE;
    pParams.m_endpoints_share_pbit = BC7ENC16_TRUE;

    const ubyte *pSubset_of_pixel = &g_bc7_partition2[candidate_partition * 16];

    color_quad_u8[16][2] bucket_pixels;          // pixels grouped by subset
    uint[2] bucket_count = [ 0, 0 ];             // pixels collected per subset
    ubyte[16][2] bucket_source_index;            // original pixel index of each bucketed pixel
    ubyte[16][2] bucket_selectors;               // selectors produced per subset
    color_cell_compressor_results[2] mode1;

    // Split the block's pixels into the two subsets of the candidate partition.
    foreach (pixel; 0 .. 16)
    {
        const uint s = pSubset_of_pixel[pixel];
        const uint slot = bucket_count[s];
        bucket_pixels[s][slot] = pPixels[pixel];
        bucket_source_index[s][slot] = cast(ubyte)pixel;
        bucket_count[s] = slot + 1;
    }

    // Compress each subset; stop early once mode 1 can no longer beat mode 6.
    ulong mode1_err = 0;
    foreach (s; 0 .. 2)
    {
        pParams.m_num_pixels = bucket_count[s];
        pParams.m_pPixels = &bucket_pixels[s][0];

        color_cell_compressor_results *pSubset_result = &mode1[s];
        pSubset_result.m_pSelectors = &bucket_selectors[s][0];
        pSubset_result.m_pSelectors_temp = scratch_selectors.ptr;

        mode1_err += color_cell_compression(1, pParams, pSubset_result, pComp_params);
        if (mode1_err > lowest_err)
            break;
    }

    if (mode1_err < lowest_err)
    {
        lowest_err = mode1_err;
        winner.m_mode = 1;
        winner.m_partition = candidate_partition;

        foreach (s; 0 .. 2)
        {
            // Scatter the per-subset selectors back to their original pixel positions.
            foreach (slot; 0 .. bucket_count[s])
                winner.m_selectors[bucket_source_index[s][slot]] = bucket_selectors[s][slot];

            winner.m_low[s] = mode1[s].m_low_endpoint;
            winner.m_high[s] = mode1[s].m_high_endpoint;
            // Shared p-bit: only slot 0 is meaningful for mode 1.
            winner.m_pbits[s][0] = mode1[s].m_pbits[0];
        }
    }

    encode_bc7_block(pBlock, &winner);
}
1733 
// Packs a single 4x4 block of RGBA pixels (16 pixels, R first in memory) to 128-bit BC7 block pBlock, using either mode 1 and/or 6.
// Alpha blocks will always use mode 6, and by default opaque blocks will use either modes 1 or 6.
// Returns BC7ENC16_TRUE if the block had any pixels with alpha < 255, otherwise it returns BC7ENC16_FALSE. (This is not an error code - a block is always encoded.)
bc7enc16_bool bc7enc16_compress_block(void *pBlock, 
                                      const(void)* pPixelsRGBA, 
                                      const(bc7enc16_compress_block_params)* pComp_params) @system
{
    // Sanity check on the mode 1 endpoint table; presumably it is non-zero
    // only after the encoder's one-time table initialisation has run (that
    // code is not visible in this section).
    assert(g_bc7_mode_1_optimal_endpoints[255][0].m_hi != 0);

    const(color_quad_u8)* pPixels = cast(const(color_quad_u8)*)(pPixelsRGBA);

    // Derive the per-channel error weights handed to the cell compressor.
    color_cell_compressor_params params;
    if (!pComp_params.m_perceptual)
    {
        // Linear mode: use the caller's weights unchanged.
        memcpy(params.m_weights.ptr, pComp_params.m_weights.ptr, (params.m_weights).sizeof);
    }
    else
    {
        // Perceptual mode: scale all weights by 4 and additionally scale
        // channels 1 and 2 by the squared BT.709 chroma factors.
        // https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.709_conversion
        const float pr_weight = (.5f / (1.0f - .2126f)) * (.5f / (1.0f - .2126f));
        const float pb_weight = (.5f / (1.0f - .0722f)) * (.5f / (1.0f - .0722f));
        params.m_weights[0] = cast(int)(pComp_params.m_weights[0] * 4.0f);
        params.m_weights[1] = cast(int)(pComp_params.m_weights[1] * 4.0f * pr_weight);
        params.m_weights[2] = cast(int)(pComp_params.m_weights[2] * 4.0f * pb_weight);
        params.m_weights[3] = pComp_params.m_weights[3] * 4;
    }

    // A single pixel with alpha below 255 makes the whole block an alpha block.
    bool any_transparent = false;
    foreach (i; 0 .. 16)
    {
        if (pPixels[i].m_c[3] < 255)
        {
            any_transparent = true;
            break;
        }
    }

    if (any_transparent)
    {
        handle_alpha_block(pBlock, pPixels, pComp_params, &params);
        return BC7ENC16_TRUE;
    }

    handle_opaque_block(pBlock, pPixels, pComp_params, &params);
    return BC7ENC16_FALSE;
}
1770 
1771 /*
1772 ------------------------------------------------------------------------------
1773 This software is available under 2 licenses -- choose whichever you prefer.
1774 ------------------------------------------------------------------------------
1775 ALTERNATIVE A - MIT License
1776 Copyright(c) 2018 Richard Geldreich, Jr.
1777 Permission is hereby granted, free of charge, to any person obtaining a copy of
1778 this software and associated documentation files(the "Software"), to deal in
1779 the Software without restriction, including without limitation the rights to
1780 use, copy, modify, merge, publish, distribute, sublicense, and / or sell copies
1781 of the Software, and to permit persons to whom the Software is furnished to do
1782 so, subject to the following conditions :
1783 The above copyright notice and this permission notice shall be included in all
1784 copies or substantial portions of the Software.
1785 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1786 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1787 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
1788 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
1789 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
1790 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
1791 SOFTWARE.
1792 ------------------------------------------------------------------------------
1793 ALTERNATIVE B - Public Domain(www.unlicense.org)
1794 This is free and unencumbered software released into the public domain.
1795 Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
1796 software, either in source code form or as a compiled binary, for any purpose,
1797 commercial or non - commercial, and by any means.
1798 In jurisdictions that recognize copyright laws, the author or authors of this
1799 software dedicate any and all copyright interest in the software to the public
1800 domain.We make this dedication for the benefit of the public at large and to
1801 the detriment of our heirs and successors.We intend this dedication to be an
1802 overt act of relinquishment in perpetuity of all present and future rights to
1803 this software under copyright law.
1804 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1805 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1806 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
1807 AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
1808 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
1809 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
1810 ------------------------------------------------------------------------------
1811 */