1 /// BC7 encoding image loading. 2 /// D translation of bc7enc16 d3b037f33b8c6df184177a0ae6a0f4cfec1434ad 3 module gamut.codecs.bc7enc16; 4 5 version(encodeDDS): 6 7 8 import core.stdc.string: memset, memcpy; 9 import std.math: abs, sqrt, floor; 10 import gamut.internals.mutex; 11 12 // File: bc7enc16.h - Richard Geldreich, Jr. - MIT license or public domain (see end of bc7enc16.c) 13 14 enum BC7ENC16_BLOCK_SIZE = 16; 15 enum BC7ENC16_MAX_PARTITIONS1 = 64; 16 enum BC7ENC16_MAX_UBER_LEVEL = 4; 17 18 alias bc7enc16_bool = ubyte; 19 enum BC7ENC16_TRUE = 1; 20 enum BC7ENC16_FALSE = 0; 21 22 nothrow @nogc @safe: 23 24 struct bc7enc16_compress_block_params 25 { 26 // m_max_partitions_mode1 may range from 0 (disables mode 1) to BC7ENC16_MAX_PARTITIONS1. The higher this value, the slower the compressor, but the higher the quality. 27 uint m_max_partitions_mode1; 28 29 // Relative RGBA or YCbCrA weights. 30 uint[4] m_weights; 31 32 // m_uber_level may range from 0 to BC7ENC16_MAX_UBER_LEVEL. The higher this value, the slower the compressor, but the higher the quality. 33 uint m_uber_level; 34 35 // If m_perceptual is true, colorspace error is computed in YCbCr space, otherwise RGB. 36 bc7enc16_bool m_perceptual; 37 38 // Set m_try_least_squares to false for slightly faster/lower quality compression. 39 bc7enc16_bool m_try_least_squares; 40 41 // When m_mode1_partition_estimation_filterbank, the mode1 partition estimator skips lesser used partition patterns unless they are strongly predicted to be potentially useful. 42 // There's a slight loss in quality with this enabled (around .08 dB RGB PSNR or .05 dB Y PSNR), but up to a 11% gain in speed depending on the other settings. 43 bc7enc16_bool m_mode1_partition_estimation_filterbank; 44 } 45 46 void bc7enc16_compress_block_params_init_linear_weights(bc7enc16_compress_block_params *p) pure 47 { 48 p.m_perceptual = BC7ENC16_FALSE; 49 p.m_weights[0] = 1; 50 p.m_weights[1] = 1; 51 p.m_weights[2] = 1; 52 p.m_weights[3] = 1; 53 } 54 55 void bc7enc16_compress_block_params_init_perceptual_weights(bc7enc16_compress_block_params *p) pure 56 { 57 p.m_perceptual = BC7ENC16_TRUE; 58 p.m_weights[0] = 128; 59 p.m_weights[1] = 64; 60 p.m_weights[2] = 16; 61 p.m_weights[3] = 32; 62 } 63 64 void bc7enc16_compress_block_params_init(bc7enc16_compress_block_params *p) pure 65 { 66 p.m_max_partitions_mode1 = BC7ENC16_MAX_PARTITIONS1; 67 p.m_try_least_squares = BC7ENC16_TRUE; 68 p.m_mode1_partition_estimation_filterbank = BC7ENC16_TRUE; 69 p.m_uber_level = 0; 70 bc7enc16_compress_block_params_init_perceptual_weights(p); 71 } 72 73 74 // File: bc7enc16.c - Richard Geldreich, Jr. 4/2018 - MIT license or public domain (see end of file) 75 76 // Helpers 77 int clampi(int value, int low, int high) pure 78 { 79 if (value < low) 80 value = low; 81 else if (value > high) 82 value = high; 83 return value; 84 } 85 86 float clampf(float value, float low, float high) pure 87 { 88 if (value < low) 89 value = low; 90 else if (value > high) 91 value = high; 92 return value; 93 } 94 95 float saturate(float value) pure 96 { 97 return clampf(value, 0, 1.0f); 98 } 99 100 ubyte minimumub(ubyte a, ubyte b) pure 101 { 102 return (a < b) ? a : b; 103 } 104 105 uint minimumu(uint a, uint b) pure 106 { 107 return (a < b) ? a : b; 108 } 109 110 float minimumf(float a, float b) pure 111 { 112 return (a < b) ? a : b; 113 } 114 115 ubyte maximumub(ubyte a, ubyte b) pure 116 { 117 return (a > b) ? a : b; 118 } 119 120 uint maximumu(uint a, uint b) pure 121 { 122 return (a > b) ? a : b; 123 } 124 125 float maximumf(float a, float b) pure 126 { 127 return (a > b) ? a : b; 128 } 129 130 int squarei(int i) pure 131 { 132 return i * i; 133 } 134 135 float squaref(float i) pure 136 { 137 return i * i; 138 } 139 140 struct color_quad_u8 141 { 142 ubyte[4] m_c; 143 } 144 145 struct vec4F 146 { 147 float[4] m_c; 148 } 149 150 color_quad_u8 *color_quad_u8_set_clamped(color_quad_u8 *pRes, int r, int g, int b, int a) pure @system 151 { 152 pRes.m_c[0] = cast(ubyte)clampi(r, 0, 255); 153 pRes.m_c[1] = cast(ubyte)clampi(g, 0, 255); 154 pRes.m_c[2] = cast(ubyte)clampi(b, 0, 255); 155 pRes.m_c[3] = cast(ubyte)clampi(a, 0, 255); 156 return pRes; 157 } 158 159 color_quad_u8 *color_quad_u8_set(color_quad_u8 *pRes, int r, int g, int b, int a) pure @system 160 { 161 assert(cast(uint)(r | g | b | a) <= 255); 162 pRes.m_c[0] = cast(ubyte)r; 163 pRes.m_c[1] = cast(ubyte)g; 164 pRes.m_c[2] = cast(ubyte)b; 165 pRes.m_c[3] = cast(ubyte)a; 166 return pRes; 167 } 168 169 bc7enc16_bool color_quad_u8_notequals(ref const(color_quad_u8) pLHS, ref const(color_quad_u8) pRHS) pure 170 { 171 return (pLHS.m_c[0] != pRHS.m_c[0]) 172 || (pLHS.m_c[1] != pRHS.m_c[1]) 173 || (pLHS.m_c[2] != pRHS.m_c[2]) 174 || (pLHS.m_c[3] != pRHS.m_c[3]); 175 } 176 177 vec4F* vec4F_set_scalar(vec4F *pV, float x) pure 178 { 179 pV.m_c[0] = x; 180 pV.m_c[1] = x; 181 pV.m_c[2] = x; 182 pV.m_c[3] = x; 183 return pV; 184 } 185 186 vec4F* vec4F_set(vec4F *pV, float x, float y, float z, float w) pure 187 { 188 pV.m_c[0] = x; 189 pV.m_c[1] = y; 190 pV.m_c[2] = z; 191 pV.m_c[3] = w; 192 return pV; 193 } 194 195 void vec4F_saturate_in_place(ref vec4F pV) pure 196 { 197 pV.m_c[0] = saturate(pV.m_c[0]); 198 pV.m_c[1] = saturate(pV.m_c[1]); 199 pV.m_c[2] = saturate(pV.m_c[2]); 200 pV.m_c[3] = saturate(pV.m_c[3]); 201 } 202 203 vec4F vec4F_saturate(const(vec4F)* pV) pure 204 { 205 vec4F res; 206 res.m_c[0] = saturate(pV.m_c[0]); 207 res.m_c[1] = saturate(pV.m_c[1]); 208 res.m_c[2] = saturate(pV.m_c[2]); 209 res.m_c[3] = saturate(pV.m_c[3]); 210 return res; 211 } 212 213 vec4F vec4F_from_color(const(color_quad_u8)* pC) pure @trusted 214 { 215 vec4F res; 216 vec4F_set(&res, pC.m_c[0], pC.m_c[1], pC.m_c[2], pC.m_c[3]); 217 return res; 218 } 219 220 vec4F vec4F_add(const(vec4F)* pLHS, const(vec4F)* pRHS) pure @trusted 221 { 222 vec4F res; 223 vec4F_set(&res, pLHS.m_c[0] + pRHS.m_c[0], pLHS.m_c[1] + pRHS.m_c[1], 224 pLHS.m_c[2] + pRHS.m_c[2], pLHS.m_c[3] + pRHS.m_c[3]); 225 return res; 226 } 227 228 vec4F vec4F_sub(const(vec4F)* pLHS, const(vec4F)* pRHS) pure @trusted 229 { 230 vec4F res; 231 vec4F_set(&res, pLHS.m_c[0] - pRHS.m_c[0], pLHS.m_c[1] - pRHS.m_c[1], 232 pLHS.m_c[2] - pRHS.m_c[2], pLHS.m_c[3] - pRHS.m_c[3]); 233 return res; 234 } 235 236 float vec4F_dot(const(vec4F)* pLHS, const(vec4F)* pRHS) pure 237 { 238 return pLHS.m_c[0] * pRHS.m_c[0] + pLHS.m_c[1] * pRHS.m_c[1] 239 + pLHS.m_c[2] * pRHS.m_c[2] + pLHS.m_c[3] * pRHS.m_c[3]; 240 } 241 242 vec4F vec4F_mul(const(vec4F)* pLHS, float s) pure @trusted 243 { 244 vec4F res; vec4F_set(&res, pLHS.m_c[0] * s, pLHS.m_c[1] * s, 245 pLHS.m_c[2] * s, pLHS.m_c[3] * s); 246 return res; 247 } 248 249 vec4F* vec4F_normalize_in_place(vec4F *pV) pure 250 { 251 float s = pV.m_c[0] * pV.m_c[0] + pV.m_c[1] * pV.m_c[1] + pV.m_c[2] * pV.m_c[2] + pV.m_c[3] * pV.m_c[3]; 252 if (s != 0.0f) 253 { 254 s = 1.0f / sqrt(s); 255 pV.m_c[0] *= s; 256 pV.m_c[1] *= s; 257 pV.m_c[2] *= s; 258 pV.m_c[3] *= s; 259 } 260 return pV; 261 } 262 263 // Various BC7 tables 264 static immutable uint[8] g_bc7_weights3 = [ 0, 9, 18, 27, 37, 46, 55, 64 ]; 265 static immutable uint[16] g_bc7_weights4 = [ 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 ]; 266 // Precomputed weight constants used during least fit determination. For each entry in g_bc7_weights[]: w * w, (1.0f - w) * w, (1.0f - w) * (1.0f - w), w 267 static immutable float[8 * 4] g_bc7_weights3x = 268 [ 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.019775f, 0.120850f, 0.738525f, 0.140625f, 269 0.079102f, 0.202148f, 0.516602f, 0.281250f, 0.177979f, 0.243896f, 0.334229f, 0.421875f, 270 0.334229f, 0.243896f, 0.177979f, 0.578125f, 0.516602f, 0.202148f, 0.079102f, 0.718750f, 271 0.738525f, 0.120850f, 0.019775f, 0.859375f, 1.000000f, 0.000000f, 0.000000f, 1.000000f ]; 272 273 static immutable float[16 * 4] g_bc7_weights4x = 274 [ 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.003906f, 0.058594f, 0.878906f, 0.062500f, 275 0.019775f, 0.120850f, 0.738525f, 0.140625f, 0.041260f, 0.161865f, 0.635010f, 0.203125f, 276 0.070557f, 0.195068f, 0.539307f, 0.265625f, 0.107666f, 0.220459f, 0.451416f, 0.328125f, 277 0.165039f, 0.241211f, 0.352539f, 0.406250f, 0.219727f, 0.249023f, 0.282227f, 0.468750f, 278 0.282227f, 0.249023f, 0.219727f, 0.531250f, 0.352539f, 0.241211f, 0.165039f, 0.593750f, 279 0.451416f, 0.220459f, 0.107666f, 0.671875f, 0.539307f, 0.195068f, 0.070557f, 0.734375f, 280 0.635010f, 0.161865f, 0.041260f, 0.796875f, 0.738525f, 0.120850f, 0.019775f, 0.859375f, 281 0.878906f, 0.058594f, 0.003906f, 0.937500f, 1.000000f, 0.000000f, 0.000000f, 1.000000f ]; 282 283 static immutable ubyte[64] g_bc7_partition1 = [ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 ]; 284 static immutable ubyte[64*16] g_bc7_partition2 = 285 [ 286 0,0,1,1,0,0,1,1,0,0,1,1,0,0,1,1, 0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1, 0,1,1,1,0,1,1,1,0,1,1,1,0,1,1,1, 0,0,0,1,0,0,1,1,0,0,1,1,0,1,1,1, 0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1, 0,0,1,1,0,1,1,1,0,1,1,1,1,1,1,1, 0,0,0,1,0,0,1,1,0,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,1,0,0,1,1,0,1,1,1, 287 0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1, 0,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,1,0,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1, 0,0,0,1,0,1,1,1,1,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1, 0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1, 288 0,0,0,0,1,0,0,0,1,1,1,0,1,1,1,1, 0,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,1,0,0,0,1,1,1,0, 0,1,1,1,0,0,1,1,0,0,0,1,0,0,0,0, 0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0, 0,0,0,0,1,0,0,0,1,1,0,0,1,1,1,0, 0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0, 0,1,1,1,0,0,1,1,0,0,1,1,0,0,0,1, 289 0,0,1,1,0,0,0,1,0,0,0,1,0,0,0,0, 0,0,0,0,1,0,0,0,1,0,0,0,1,1,0,0, 0,1,1,0,0,1,1,0,0,1,1,0,0,1,1,0, 0,0,1,1,0,1,1,0,0,1,1,0,1,1,0,0, 0,0,0,1,0,1,1,1,1,1,1,0,1,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,0,0,0,0, 0,1,1,1,0,0,0,1,1,0,0,0,1,1,1,0, 0,0,1,1,1,0,0,1,1,0,0,1,1,1,0,0, 290 0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1, 0,0,0,0,1,1,1,1,0,0,0,0,1,1,1,1, 0,1,0,1,1,0,1,0,0,1,0,1,1,0,1,0, 0,0,1,1,0,0,1,1,1,1,0,0,1,1,0,0, 0,0,1,1,1,1,0,0,0,0,1,1,1,1,0,0, 0,1,0,1,0,1,0,1,1,0,1,0,1,0,1,0, 0,1,1,0,1,0,0,1,0,1,1,0,1,0,0,1, 0,1,0,1,1,0,1,0,1,0,1,0,0,1,0,1, 291 0,1,1,1,0,0,1,1,1,1,0,0,1,1,1,0, 0,0,0,1,0,0,1,1,1,1,0,0,1,0,0,0, 0,0,1,1,0,0,1,0,0,1,0,0,1,1,0,0, 0,0,1,1,1,0,1,1,1,1,0,1,1,1,0,0, 0,1,1,0,1,0,0,1,1,0,0,1,0,1,1,0, 0,0,1,1,1,1,0,0,1,1,0,0,0,0,1,1, 0,1,1,0,0,1,1,0,1,0,0,1,1,0,0,1, 0,0,0,0,0,1,1,0,0,1,1,0,0,0,0,0, 292 0,1,0,0,1,1,1,0,0,1,0,0,0,0,0,0, 0,0,1,0,0,1,1,1,0,0,1,0,0,0,0,0, 0,0,0,0,0,0,1,0,0,1,1,1,0,0,1,0, 0,0,0,0,0,1,0,0,1,1,1,0,0,1,0,0, 0,1,1,0,1,1,0,0,1,0,0,1,0,0,1,1, 0,0,1,1,0,1,1,0,1,1,0,0,1,0,0,1, 0,1,1,0,0,0,1,1,1,0,0,1,1,1,0,0, 0,0,1,1,1,0,0,1,1,1,0,0,0,1,1,0, 293 0,1,1,0,1,1,0,0,1,1,0,0,1,0,0,1, 0,1,1,0,0,0,1,1,0,0,1,1,1,0,0,1, 0,1,1,1,1,1,1,0,1,0,0,0,0,0,0,1, 0,0,0,1,1,0,0,0,1,1,1,0,0,1,1,1, 0,0,0,0,1,1,1,1,0,0,1,1,0,0,1,1, 0,0,1,1,0,0,1,1,1,1,1,1,0,0,0,0, 0,0,1,0,0,0,1,0,1,1,1,0,1,1,1,0, 0,1,0,0,0,1,0,0,0,1,1,1,0,1,1,1 294 ]; 295 296 static immutable ubyte[64] g_bc7_table_anchor_index_second_subset = 297 [ 15,15,15,15,15,15,15,15, 15,15,15,15,15,15,15,15, 298 15, 2, 8, 2, 2, 8, 8,15, 2, 8, 2, 2, 8, 8, 2, 2, 299 15,15, 6, 8, 2, 8,15,15, 2, 8, 2, 2, 2,15,15, 6, 300 6, 2, 6, 8,15,15, 2, 2, 15,15,15,15,15, 2, 2,15 ]; 301 302 static immutable ubyte[8] g_bc7_num_subsets = [ 3, 2, 3, 2, 1, 1, 1, 2 ]; 303 static immutable ubyte[8] g_bc7_partition_bits = [ 4, 6, 6, 6, 0, 0, 0, 6 ]; 304 static immutable ubyte[8] g_bc7_color_index_bitcount = [ 3, 3, 2, 2, 2, 2, 4, 2 ]; 305 306 int get_bc7_color_index_size(int mode, int index_selection_bit) pure 307 { 308 return g_bc7_color_index_bitcount[mode] + index_selection_bit; 309 } 310 311 static immutable ubyte[8] g_bc7_mode_has_p_bits = [ 1, 1, 0, 1, 0, 0, 1, 1 ]; 312 static immutable ubyte[8] g_bc7_mode_has_shared_p_bits = [ 0, 1, 0, 0, 0, 0, 0, 0 ]; 313 static immutable ubyte[8] g_bc7_color_precision_table = [ 4, 6, 5, 7, 5, 7, 7, 5 ]; 314 static immutable byte[8] g_bc7_alpha_precision_table = [ 0, 0, 0, 0, 6, 8, 7, 5 ]; 315 316 struct endpoint_err 317 { 318 ushort m_error; 319 ubyte m_lo; 320 ubyte m_hi; 321 } 322 323 __gshared endpoint_err[2][256] g_bc7_mode_1_optimal_endpoints; // [c][pbit] 324 __gshared Mutex g_tableProtect; 325 __gshared bool g_tableInitialized = false; 326 327 enum uint BC7ENC16_MODE_1_OPTIMAL_INDEX = 2; 328 329 // Initialize the lookup table used for optimal single color compression in mode 1 330 // Warning: bc7enc16_compress_block_init() MUST be called before calling bc7enc16_compress_block() (or you'll get artifacts). 331 // Note: this is racey, so we use a self-init mutex. 332 void bc7enc16_compress_block_init() @trusted 333 { 334 g_tableProtect.lockLazy(); 335 scope(exit) g_tableProtect.unlock(); 336 337 if (g_tableInitialized) 338 return; 339 340 g_tableInitialized = true; 341 342 for (int c = 0; c < 256; c++) 343 { 344 for (uint lp = 0; lp < 2; lp++) 345 { 346 endpoint_err best; 347 best.m_error = ushort.max; 348 for (uint l = 0; l < 64; l++) 349 { 350 uint low = ((l << 1) | lp) << 1; 351 low |= (low >> 7); 352 for (uint h = 0; h < 64; h++) 353 { 354 uint high = ((h << 1) | lp) << 1; 355 high |= (high >> 7); 356 const int k = (low * (64 - g_bc7_weights3[BC7ENC16_MODE_1_OPTIMAL_INDEX]) + high * g_bc7_weights3[BC7ENC16_MODE_1_OPTIMAL_INDEX] + 32) >> 6; 357 const int err = (k - c) * (k - c); 358 if (err < best.m_error) 359 { 360 best.m_error = cast(ushort)err; 361 best.m_lo = cast(ubyte)l; 362 best.m_hi = cast(ubyte)h; 363 } 364 } 365 } 366 g_bc7_mode_1_optimal_endpoints[c][lp] = best; 367 } 368 } 369 } 370 371 void compute_least_squares_endpoints_rgba(uint N, 372 const(ubyte)* pSelectors, 373 const(vec4F)* pSelector_weights, 374 vec4F *pXl, 375 vec4F *pXh, 376 const(color_quad_u8)* pColors) @system 377 { 378 // Least squares using normal equations: http://www.cs.cornell.edu/~bindel/class/cs3220-s12/notes/lec10.pdf 379 // I did this in matrix form first, expanded out all the ops, then optimized it a bit. 380 float z00 = 0.0f, z01 = 0.0f, z10 = 0.0f, z11 = 0.0f; 381 float q00_r = 0.0f, q10_r = 0.0f, t_r = 0.0f; 382 float q00_g = 0.0f, q10_g = 0.0f, t_g = 0.0f; 383 float q00_b = 0.0f, q10_b = 0.0f, t_b = 0.0f; 384 float q00_a = 0.0f, q10_a = 0.0f, t_a = 0.0f; 385 for (uint i = 0; i < N; i++) 386 { 387 const uint sel = pSelectors[i]; 388 z00 += pSelector_weights[sel].m_c[0]; 389 z10 += pSelector_weights[sel].m_c[1]; 390 z11 += pSelector_weights[sel].m_c[2]; 391 float w = pSelector_weights[sel].m_c[3]; 392 q00_r += w * pColors[i].m_c[0]; t_r += pColors[i].m_c[0]; 393 q00_g += w * pColors[i].m_c[1]; t_g += pColors[i].m_c[1]; 394 q00_b += w * pColors[i].m_c[2]; t_b += pColors[i].m_c[2]; 395 q00_a += w * pColors[i].m_c[3]; t_a += pColors[i].m_c[3]; 396 } 397 398 q10_r = t_r - q00_r; 399 q10_g = t_g - q00_g; 400 q10_b = t_b - q00_b; 401 q10_a = t_a - q00_a; 402 403 z01 = z10; 404 405 float det = z00 * z11 - z01 * z10; 406 if (det != 0.0f) 407 det = 1.0f / det; 408 409 float iz00, iz01, iz10, iz11; 410 iz00 = z11 * det; 411 iz01 = -z01 * det; 412 iz10 = -z10 * det; 413 iz11 = z00 * det; 414 415 pXl.m_c[0] = cast(float)(iz00 * q00_r + iz01 * q10_r); pXh.m_c[0] = cast(float)(iz10 * q00_r + iz11 * q10_r); 416 pXl.m_c[1] = cast(float)(iz00 * q00_g + iz01 * q10_g); pXh.m_c[1] = cast(float)(iz10 * q00_g + iz11 * q10_g); 417 pXl.m_c[2] = cast(float)(iz00 * q00_b + iz01 * q10_b); pXh.m_c[2] = cast(float)(iz10 * q00_b + iz11 * q10_b); 418 pXl.m_c[3] = cast(float)(iz00 * q00_a + iz01 * q10_a); pXh.m_c[3] = cast(float)(iz10 * q00_a + iz11 * q10_a); 419 } 420 421 void compute_least_squares_endpoints_rgb(uint N, const ubyte *pSelectors, 422 const(vec4F)* pSelector_weights, 423 vec4F *pXl, vec4F *pXh, const(color_quad_u8)*pColors) @system 424 { 425 float z00 = 0.0f, z01 = 0.0f, z10 = 0.0f, z11 = 0.0f; 426 float q00_r = 0.0f, q10_r = 0.0f, t_r = 0.0f; 427 float q00_g = 0.0f, q10_g = 0.0f, t_g = 0.0f; 428 float q00_b = 0.0f, q10_b = 0.0f, t_b = 0.0f; 429 for (uint i = 0; i < N; i++) 430 { 431 const uint sel = pSelectors[i]; 432 z00 += pSelector_weights[sel].m_c[0]; 433 z10 += pSelector_weights[sel].m_c[1]; 434 z11 += pSelector_weights[sel].m_c[2]; 435 float w = pSelector_weights[sel].m_c[3]; 436 q00_r += w * pColors[i].m_c[0]; t_r += pColors[i].m_c[0]; 437 q00_g += w * pColors[i].m_c[1]; t_g += pColors[i].m_c[1]; 438 q00_b += w * pColors[i].m_c[2]; t_b += pColors[i].m_c[2]; 439 } 440 441 q10_r = t_r - q00_r; 442 q10_g = t_g - q00_g; 443 q10_b = t_b - q00_b; 444 445 z01 = z10; 446 447 float det = z00 * z11 - z01 * z10; 448 if (det != 0.0f) 449 det = 1.0f / det; 450 451 float iz00, iz01, iz10, iz11; 452 iz00 = z11 * det; 453 iz01 = -z01 * det; 454 iz10 = -z10 * det; 455 iz11 = z00 * det; 456 457 pXl.m_c[0] = cast(float)(iz00 * q00_r + iz01 * q10_r); pXh.m_c[0] = cast(float)(iz10 * q00_r + iz11 * q10_r); 458 pXl.m_c[1] = cast(float)(iz00 * q00_g + iz01 * q10_g); pXh.m_c[1] = cast(float)(iz10 * q00_g + iz11 * q10_g); 459 pXl.m_c[2] = cast(float)(iz00 * q00_b + iz01 * q10_b); pXh.m_c[2] = cast(float)(iz10 * q00_b + iz11 * q10_b); 460 pXl.m_c[3] = 255.0f; pXh.m_c[3] = 255.0f; 461 } 462 463 struct color_cell_compressor_params 464 { 465 uint m_num_pixels; 466 const(color_quad_u8)* m_pPixels; 467 uint m_num_selector_weights; 468 const(uint)* m_pSelector_weights; 469 const(vec4F)* m_pSelector_weightsx; 470 uint m_comp_bits; 471 uint[4] m_weights; 472 bc7enc16_bool m_has_alpha; 473 bc7enc16_bool m_has_pbits; 474 bc7enc16_bool m_endpoints_share_pbit; 475 bc7enc16_bool m_perceptual; 476 } 477 478 struct color_cell_compressor_results 479 { 480 ulong m_best_overall_err; 481 color_quad_u8 m_low_endpoint; 482 color_quad_u8 m_high_endpoint; 483 uint[2] m_pbits; 484 ubyte *m_pSelectors; 485 ubyte *m_pSelectors_temp; 486 } 487 488 color_quad_u8 scale_color(ref const(color_quad_u8) pC, const(color_cell_compressor_params) *pParams) pure 489 { 490 color_quad_u8 results; 491 492 const uint n = pParams.m_comp_bits + (pParams.m_has_pbits ? 1 : 0); 493 assert((n >= 4) && (n <= 8)); 494 495 for (uint i = 0; i < 4; i++) 496 { 497 uint v = pC.m_c[i] << (8 - n); 498 v |= (v >> n); 499 assert(v <= 255); 500 results.m_c[i] = cast(ubyte)(v); 501 } 502 503 return results; 504 } 505 506 ulong compute_color_distance_rgb(const(color_quad_u8)* pE1, 507 const(color_quad_u8)* pE2, 508 bc7enc16_bool perceptual, 509 const(uint)* weights) pure @system 510 { 511 int dr, dg, db; 512 513 if (perceptual) 514 { 515 const int l1 = pE1.m_c[0] * 109 + pE1.m_c[1] * 366 + pE1.m_c[2] * 37; 516 const int cr1 = (cast(int)pE1.m_c[0] << 9) - l1; 517 const int cb1 = (cast(int)pE1.m_c[2] << 9) - l1; 518 const int l2 = pE2.m_c[0] * 109 + pE2.m_c[1] * 366 + pE2.m_c[2] * 37; 519 const int cr2 = (cast(int)pE2.m_c[0] << 9) - l2; 520 const int cb2 = (cast(int)pE2.m_c[2] << 9) - l2; 521 dr = (l1 - l2) >> 8; 522 dg = (cr1 - cr2) >> 8; 523 db = (cb1 - cb2) >> 8; 524 } 525 else 526 { 527 dr = cast(int)pE1.m_c[0] - cast(int)pE2.m_c[0]; 528 dg = cast(int)pE1.m_c[1] - cast(int)pE2.m_c[1]; 529 db = cast(int)pE1.m_c[2] - cast(int)pE2.m_c[2]; 530 } 531 532 return weights[0] * cast(uint)(dr * dr) + weights[1] * cast(uint)(dg * dg) + weights[2] * cast(uint)(db * db); 533 } 534 535 ulong compute_color_distance_rgba(const(color_quad_u8)* pE1, const(color_quad_u8)* pE2, bc7enc16_bool perceptual, const(uint)* weights /* [4] */) @system 536 { 537 int da = cast(int)pE1.m_c[3] - cast(int)pE2.m_c[3]; 538 return compute_color_distance_rgb(pE1, pE2, perceptual, weights) + (weights[3] * cast(uint)(da * da)); 539 } 540 541 ulong pack_mode1_to_one_color(const(color_cell_compressor_params)* pParams, 542 color_cell_compressor_results *pResults, 543 uint r, uint g, uint b, ubyte *pSelectors) @system 544 { 545 uint best_err = uint.max; 546 uint best_p = 0; 547 548 for (uint p = 0; p < 2; p++) 549 { 550 uint err = g_bc7_mode_1_optimal_endpoints[r][p].m_error + g_bc7_mode_1_optimal_endpoints[g][p].m_error + g_bc7_mode_1_optimal_endpoints[b][p].m_error; 551 if (err < best_err) 552 { 553 best_err = err; 554 best_p = p; 555 } 556 } 557 558 const endpoint_err *pEr = &g_bc7_mode_1_optimal_endpoints[r][best_p]; 559 const endpoint_err *pEg = &g_bc7_mode_1_optimal_endpoints[g][best_p]; 560 const endpoint_err *pEb = &g_bc7_mode_1_optimal_endpoints[b][best_p]; 561 562 color_quad_u8_set(&pResults.m_low_endpoint, pEr.m_lo, pEg.m_lo, pEb.m_lo, 0); 563 color_quad_u8_set(&pResults.m_high_endpoint, pEr.m_hi, pEg.m_hi, pEb.m_hi, 0); 564 pResults.m_pbits[0] = best_p; 565 pResults.m_pbits[1] = 0; 566 567 memset(pSelectors, BC7ENC16_MODE_1_OPTIMAL_INDEX, pParams.m_num_pixels); 568 569 color_quad_u8 p; 570 for (uint i = 0; i < 3; i++) 571 { 572 uint low = ((pResults.m_low_endpoint.m_c[i] << 1) | pResults.m_pbits[0]) << 1; 573 low |= (low >> 7); 574 575 uint high = ((pResults.m_high_endpoint.m_c[i] << 1) | pResults.m_pbits[0]) << 1; 576 high |= (high >> 7); 577 578 p.m_c[i] = cast(ubyte)((low * (64 - g_bc7_weights3[BC7ENC16_MODE_1_OPTIMAL_INDEX]) + high * g_bc7_weights3[BC7ENC16_MODE_1_OPTIMAL_INDEX] + 32) >> 6); 579 } 580 p.m_c[3] = 255; 581 582 ulong total_err = 0; 583 for (uint i = 0; i < pParams.m_num_pixels; i++) 584 total_err += compute_color_distance_rgb(&p, &pParams.m_pPixels[i], pParams.m_perceptual, pParams.m_weights.ptr); 585 586 pResults.m_best_overall_err = total_err; 587 588 return total_err; 589 } 590 591 ulong evaluate_solution(const(color_quad_u8)* pLow, const(color_quad_u8)* pHigh, 592 const(uint)* pbits /*[2]*/, const(color_cell_compressor_params)* pParams, 593 color_cell_compressor_results *pResults) @system 594 { 595 color_quad_u8 quantMinColor = *pLow; 596 color_quad_u8 quantMaxColor = *pHigh; 597 598 if (pParams.m_has_pbits) 599 { 600 uint minPBit, maxPBit; 601 602 if (pParams.m_endpoints_share_pbit) 603 maxPBit = minPBit = pbits[0]; 604 else 605 { 606 minPBit = pbits[0]; 607 maxPBit = pbits[1]; 608 } 609 610 quantMinColor.m_c[0] = cast(ubyte)((pLow.m_c[0] << 1) | minPBit); 611 quantMinColor.m_c[1] = cast(ubyte)((pLow.m_c[1] << 1) | minPBit); 612 quantMinColor.m_c[2] = cast(ubyte)((pLow.m_c[2] << 1) | minPBit); 613 quantMinColor.m_c[3] = cast(ubyte)((pLow.m_c[3] << 1) | minPBit); 614 615 quantMaxColor.m_c[0] = cast(ubyte)((pHigh.m_c[0] << 1) | maxPBit); 616 quantMaxColor.m_c[1] = cast(ubyte)((pHigh.m_c[1] << 1) | maxPBit); 617 quantMaxColor.m_c[2] = cast(ubyte)((pHigh.m_c[2] << 1) | maxPBit); 618 quantMaxColor.m_c[3] = cast(ubyte)((pHigh.m_c[3] << 1) | maxPBit); 619 } 620 621 color_quad_u8 actualMinColor = scale_color(quantMinColor, pParams); 622 color_quad_u8 actualMaxColor = scale_color(quantMaxColor, pParams); 623 624 const uint N = pParams.m_num_selector_weights; 625 626 color_quad_u8[16] weightedColors; 627 weightedColors[0] = actualMinColor; 628 weightedColors[N - 1] = actualMaxColor; 629 630 const uint nc = pParams.m_has_alpha ? 4 : 3; 631 for (uint i = 1; i < (N - 1); i++) 632 for (uint j = 0; j < nc; j++) 633 weightedColors[i].m_c[j] = cast(ubyte)((actualMinColor.m_c[j] * (64 - pParams.m_pSelector_weights[i]) + actualMaxColor.m_c[j] * pParams.m_pSelector_weights[i] + 32) >> 6); 634 635 const int lr = actualMinColor.m_c[0]; 636 const int lg = actualMinColor.m_c[1]; 637 const int lb = actualMinColor.m_c[2]; 638 const int dr = actualMaxColor.m_c[0] - lr; 639 const int dg = actualMaxColor.m_c[1] - lg; 640 const int db = actualMaxColor.m_c[2] - lb; 641 642 ulong total_err = 0; 643 644 if (!pParams.m_perceptual) 645 { 646 if (pParams.m_has_alpha) 647 { 648 const int la = actualMinColor.m_c[3]; 649 const int da = actualMaxColor.m_c[3] - la; 650 651 const float f = N / cast(float)(squarei(dr) + squarei(dg) + squarei(db) + squarei(da) + .00000125f); 652 653 for (uint i = 0; i < pParams.m_num_pixels; i++) 654 { 655 const(color_quad_u8)* pC = &pParams.m_pPixels[i]; 656 int r = pC.m_c[0]; 657 int g = pC.m_c[1]; 658 int b = pC.m_c[2]; 659 int a = pC.m_c[3]; 660 661 int best_sel = cast(int)(cast(float)((r - lr) * dr + (g - lg) * dg + (b - lb) * db + (a - la) * da) * f + .5f); 662 best_sel = clampi(best_sel, 1, N - 1); 663 664 ulong err0 = compute_color_distance_rgba(&weightedColors[best_sel - 1], pC, BC7ENC16_FALSE, pParams.m_weights.ptr); 665 ulong err1 = compute_color_distance_rgba(&weightedColors[best_sel], pC, BC7ENC16_FALSE, pParams.m_weights.ptr); 666 667 if (err1 > err0) 668 { 669 err1 = err0; 670 --best_sel; 671 } 672 total_err += err1; 673 674 pResults.m_pSelectors_temp[i] = cast(ubyte)best_sel; 675 } 676 } 677 else 678 { 679 const float f = N / cast(float)(squarei(dr) + squarei(dg) + squarei(db) + .00000125f); 680 681 for (uint i = 0; i < pParams.m_num_pixels; i++) 682 { 683 const color_quad_u8 *pC = &pParams.m_pPixels[i]; 684 int r = pC.m_c[0]; 685 int g = pC.m_c[1]; 686 int b = pC.m_c[2]; 687 688 int sel = cast(int)(cast(float)((r - lr) * dr + (g - lg) * dg + (b - lb) * db) * f + .5f); 689 sel = clampi(sel, 1, N - 1); 690 691 ulong err0 = compute_color_distance_rgb(&weightedColors[sel - 1], pC, BC7ENC16_FALSE, pParams.m_weights.ptr); 692 ulong err1 = compute_color_distance_rgb(&weightedColors[sel], pC, BC7ENC16_FALSE, pParams.m_weights.ptr); 693 694 int best_sel = sel; 695 ulong best_err = err1; 696 if (err0 < best_err) 697 { 698 best_err = err0; 699 best_sel = sel - 1; 700 } 701 702 total_err += best_err; 703 704 pResults.m_pSelectors_temp[i] = cast(ubyte)best_sel; 705 } 706 } 707 } 708 else 709 { 710 for (uint i = 0; i < pParams.m_num_pixels; i++) 711 { 712 ulong best_err = ulong.max; 713 uint best_sel = 0; 714 715 if (pParams.m_has_alpha) 716 { 717 for (uint j = 0; j < N; j++) 718 { 719 ulong err = compute_color_distance_rgba(&weightedColors[j], &pParams.m_pPixels[i], BC7ENC16_TRUE, pParams.m_weights.ptr); 720 if (err < best_err) 721 { 722 best_err = err; 723 best_sel = j; 724 } 725 } 726 } 727 else 728 { 729 for (uint j = 0; j < N; j++) 730 { 731 ulong err = compute_color_distance_rgb(&weightedColors[j], &pParams.m_pPixels[i], BC7ENC16_TRUE, pParams.m_weights.ptr); 732 if (err < best_err) 733 { 734 best_err = err; 735 best_sel = j; 736 } 737 } 738 } 739 740 total_err += best_err; 741 742 pResults.m_pSelectors_temp[i] = cast(ubyte)best_sel; 743 } 744 } 745 746 if (total_err < pResults.m_best_overall_err) 747 { 748 pResults.m_best_overall_err = total_err; 749 750 pResults.m_low_endpoint = *pLow; 751 pResults.m_high_endpoint = *pHigh; 752 753 pResults.m_pbits[0] = pbits[0]; 754 pResults.m_pbits[1] = pbits[1]; 755 756 memcpy(pResults.m_pSelectors, pResults.m_pSelectors_temp, (pResults.m_pSelectors[0]).sizeof * pParams.m_num_pixels); 757 } 758 759 return total_err; 760 } 761 762 void fixDegenerateEndpoints(uint mode, 763 ref color_quad_u8 pTrialMinColor, 764 ref color_quad_u8 pTrialMaxColor, 765 ref const(vec4F) pXl, ref const(vec4F) pXh, uint iscale) 766 { 767 if (mode == 1) 768 { 769 // fix degenerate case where the input collapses to a single colorspace voxel, and we loose all freedom (test with grayscale ramps) 770 for (uint i = 0; i < 3; i++) 771 { 772 if (pTrialMinColor.m_c[i] == pTrialMaxColor.m_c[i]) 773 { 774 if (abs(pXl.m_c[i] - pXh.m_c[i]) > 0.0f) 775 { 776 if (pTrialMinColor.m_c[i] > (iscale >> 1)) 777 { 778 if (pTrialMinColor.m_c[i] > 0) 779 pTrialMinColor.m_c[i]--; 780 else 781 if (pTrialMaxColor.m_c[i] < iscale) 782 pTrialMaxColor.m_c[i]++; 783 } 784 else 785 { 786 if (pTrialMaxColor.m_c[i] < iscale) 787 pTrialMaxColor.m_c[i]++; 788 else if (pTrialMinColor.m_c[i] > 0) 789 pTrialMinColor.m_c[i]--; 790 } 791 } 792 } 793 } 794 } 795 } 796 797 static ulong find_optimal_solution(uint mode, vec4F xl, vec4F xh, const color_cell_compressor_params *pParams, color_cell_compressor_results *pResults) @system 798 { 799 vec4F_saturate_in_place(xl); 800 vec4F_saturate_in_place(xh); 801 802 if (pParams.m_has_pbits) 803 { 804 const int iscalep = (1 << (pParams.m_comp_bits + 1)) - 1; 805 const float scalep = cast(float)iscalep; 806 807 const int totalComps = pParams.m_has_alpha ? 4 : 3; 808 809 uint[2] best_pbits; 810 color_quad_u8 bestMinColor, bestMaxColor; 811 812 if (!pParams.m_endpoints_share_pbit) 813 { 814 float best_err0 = 1e+9; 815 float best_err1 = 1e+9; 816 817 for (int p = 0; p < 2; p++) 818 { 819 color_quad_u8 xMinColor, xMaxColor; 820 821 // Notes: The pbit controls which quantization intervals are selected. 822 // total_levels=2^(comp_bits+1), where comp_bits=4 for mode 0, etc. 823 // pbit 0: v=(b*2)/(total_levels-1), pbit 1: v=(b*2+1)/(total_levels-1) where b is the component bin from [0,total_levels/2-1] and v is the [0,1] component value 824 // rearranging you get for pbit 0: b=floor(v*(total_levels-1)/2+.5) 825 // rearranging you get for pbit 1: b=floor((v*(total_levels-1)-1)/2+.5) 826 for (uint c = 0; c < 4; c++) 827 { 828 xMinColor.m_c[c] = cast(ubyte)(clampi((cast(int)((xl.m_c[c] * scalep - p) / 2.0f + .5f)) * 2 + p, p, iscalep - 1 + p)); 829 xMaxColor.m_c[c] = cast(ubyte)(clampi((cast(int)((xh.m_c[c] * scalep - p) / 2.0f + .5f)) * 2 + p, p, iscalep - 1 + p)); 830 } 831 832 color_quad_u8 scaledLow = scale_color(xMinColor, pParams); 833 color_quad_u8 scaledHigh = scale_color(xMaxColor, pParams); 834 835 float err0 = 0, err1 = 0; 836 for (int i = 0; i < totalComps; i++) 837 { 838 err0 += squaref(scaledLow.m_c[i] - xl.m_c[i] * 255.0f); 839 err1 += squaref(scaledHigh.m_c[i] - xh.m_c[i] * 255.0f); 840 } 841 842 if (err0 < best_err0) 843 { 844 best_err0 = err0; 845 best_pbits[0] = p; 846 847 bestMinColor.m_c[0] = xMinColor.m_c[0] >> 1; 848 bestMinColor.m_c[1] = xMinColor.m_c[1] >> 1; 849 bestMinColor.m_c[2] = xMinColor.m_c[2] >> 1; 850 bestMinColor.m_c[3] = xMinColor.m_c[3] >> 1; 851 } 852 853 if (err1 < best_err1) 854 { 855 best_err1 = err1; 856 best_pbits[1] = p; 857 858 bestMaxColor.m_c[0] = xMaxColor.m_c[0] >> 1; 859 bestMaxColor.m_c[1] = xMaxColor.m_c[1] >> 1; 860 bestMaxColor.m_c[2] = xMaxColor.m_c[2] >> 1; 861 bestMaxColor.m_c[3] = xMaxColor.m_c[3] >> 1; 862 } 863 } 864 } 865 else 866 { 867 // Endpoints share pbits 868 float best_err = 1e+9; 869 870 for (int p = 0; p < 2; p++) 871 { 872 color_quad_u8 xMinColor, xMaxColor; 873 for (uint c = 0; c < 4; c++) 874 { 875 xMinColor.m_c[c] = cast(ubyte)(clampi((cast(int)((xl.m_c[c] * scalep - p) / 2.0f + .5f)) * 2 + p, p, iscalep - 1 + p)); 876 xMaxColor.m_c[c] = cast(ubyte)(clampi((cast(int)((xh.m_c[c] * scalep - p) / 2.0f + .5f)) * 2 + p, p, iscalep - 1 + p)); 877 } 878 879 color_quad_u8 scaledLow = scale_color(xMinColor, pParams); 880 color_quad_u8 scaledHigh = scale_color(xMaxColor, pParams); 881 882 float err = 0; 883 for (int i = 0; i < totalComps; i++) 884 err += squaref((scaledLow.m_c[i] / 255.0f) - xl.m_c[i]) + squaref((scaledHigh.m_c[i] / 255.0f) - xh.m_c[i]); 885 886 if (err < best_err) 887 { 888 best_err = err; 889 best_pbits[0] = p; 890 best_pbits[1] = p; 891 for (uint j = 0; j < 4; j++) 892 { 893 bestMinColor.m_c[j] = xMinColor.m_c[j] >> 1; 894 bestMaxColor.m_c[j] = xMaxColor.m_c[j] >> 1; 895 } 896 } 897 } 898 } 899 900 fixDegenerateEndpoints(mode, bestMinColor, bestMaxColor, xl, xh, iscalep >> 1); 901 902 if ( (pResults.m_best_overall_err == ulong.max) 903 || color_quad_u8_notequals(bestMinColor, pResults.m_low_endpoint) 904 || color_quad_u8_notequals(bestMaxColor, pResults.m_high_endpoint) 905 || (best_pbits[0] != pResults.m_pbits[0]) 906 || (best_pbits[1] != pResults.m_pbits[1]) ) 907 evaluate_solution(&bestMinColor, &bestMaxColor, best_pbits.ptr, pParams, pResults); 908 } 909 else 910 { 911 const int iscale = (1 << pParams.m_comp_bits) - 1; 912 const float scale = cast(float)iscale; 913 914 color_quad_u8 trialMinColor, trialMaxColor; 915 color_quad_u8_set_clamped(&trialMinColor, cast(int)(xl.m_c[0] * scale + .5f), cast(int)(xl.m_c[1] * scale + .5f), cast(int)(xl.m_c[2] * scale + .5f), cast(int)(xl.m_c[3] * scale + .5f)); 916 color_quad_u8_set_clamped(&trialMaxColor, cast(int)(xh.m_c[0] * scale + .5f), cast(int)(xh.m_c[1] * scale + .5f), cast(int)(xh.m_c[2] * scale + .5f), cast(int)(xh.m_c[3] * scale + .5f)); 917 918 fixDegenerateEndpoints(mode, trialMinColor, trialMaxColor, xl, xh, iscale); 919 920 if ( (pResults.m_best_overall_err == ulong.max) 921 || color_quad_u8_notequals(trialMinColor, pResults.m_low_endpoint) 922 || color_quad_u8_notequals(trialMaxColor, pResults.m_high_endpoint) ) 923 evaluate_solution(&trialMinColor, &trialMaxColor, pResults.m_pbits.ptr, pParams, pResults); 924 } 925 926 return pResults.m_best_overall_err; 927 } 928 929 ulong color_cell_compression(uint mode, 930 const(color_cell_compressor_params)* pParams, 931 color_cell_compressor_results *pResults, 932 const(bc7enc16_compress_block_params)* pComp_params) @system 933 { 934 assert((mode == 6) || (!pParams.m_has_alpha)); 935 936 pResults.m_best_overall_err = ulong.max; 937 938 // If the partition's colors are all the same in mode 1, then just pack them as a single color. 939 if (mode == 1) 940 { 941 const uint cr = pParams.m_pPixels[0].m_c[0], cg = pParams.m_pPixels[0].m_c[1], cb = pParams.m_pPixels[0].m_c[2]; 942 943 bc7enc16_bool allSame = BC7ENC16_TRUE; 944 for (uint i = 1; i < pParams.m_num_pixels; i++) 945 { 946 if ((cr != pParams.m_pPixels[i].m_c[0]) || (cg != pParams.m_pPixels[i].m_c[1]) || (cb != pParams.m_pPixels[i].m_c[2])) 947 { 948 allSame = BC7ENC16_FALSE; 949 break; 950 } 951 } 952 953 if (allSame) 954 return pack_mode1_to_one_color(pParams, pResults, cr, cg, cb, pResults.m_pSelectors); 955 } 956 957 // Compute partition's mean color and principle axis. 958 vec4F meanColor, axis; 959 vec4F_set_scalar(&meanColor, 0.0f); 960 961 for (uint i = 0; i < pParams.m_num_pixels; i++) 962 { 963 vec4F color = vec4F_from_color(&pParams.m_pPixels[i]); 964 meanColor = vec4F_add(&meanColor, &color); 965 } 966 967 vec4F meanColorScaled = vec4F_mul(&meanColor, 1.0f / cast(float)(pParams.m_num_pixels)); 968 969 meanColor = vec4F_mul(&meanColor, 1.0f / cast(float)(pParams.m_num_pixels * 255.0f)); 970 vec4F_saturate_in_place(meanColor); 971 972 if (pParams.m_has_alpha) 973 { 974 // Use incremental PCA for RGBA PCA, because it's simple. 975 vec4F_set_scalar(&axis, 0.0f); 976 for (uint i = 0; i < pParams.m_num_pixels; i++) 977 { 978 vec4F color = vec4F_from_color(&pParams.m_pPixels[i]); 979 color = vec4F_sub(&color, &meanColorScaled); 980 vec4F a = vec4F_mul(&color, color.m_c[0]); 981 vec4F b = vec4F_mul(&color, color.m_c[1]); 982 vec4F c = vec4F_mul(&color, color.m_c[2]); 983 vec4F d = vec4F_mul(&color, color.m_c[3]); 984 vec4F n = i ? axis : color; 985 vec4F_normalize_in_place(&n); 986 axis.m_c[0] += vec4F_dot(&a, &n); 987 axis.m_c[1] += vec4F_dot(&b, &n); 988 axis.m_c[2] += vec4F_dot(&c, &n); 989 axis.m_c[3] += vec4F_dot(&d, &n); 990 } 991 vec4F_normalize_in_place(&axis); 992 } 993 else 994 { 995 // Use covar technique for RGB PCA, because it doesn't require per-pixel normalization. 996 float[6] cov = [ 0, 0, 0, 0, 0, 0 ]; 997 998 for (uint i = 0; i < pParams.m_num_pixels; i++) 999 { 1000 const color_quad_u8 *pV = &pParams.m_pPixels[i]; 1001 float r = pV.m_c[0] - meanColorScaled.m_c[0]; 1002 float g = pV.m_c[1] - meanColorScaled.m_c[1]; 1003 float b = pV.m_c[2] - meanColorScaled.m_c[2]; 1004 cov[0] += r*r; cov[1] += r*g; cov[2] += r*b; cov[3] += g*g; cov[4] += g*b; cov[5] += b*b; 1005 } 1006 1007 float vfr = .9f, vfg = 1.0f, vfb = .7f; 1008 for (uint iter = 0; iter < 3; iter++) 1009 { 1010 float r = vfr*cov[0] + vfg*cov[1] + vfb*cov[2]; 1011 float g = vfr*cov[1] + vfg*cov[3] + vfb*cov[4]; 1012 float b = vfr*cov[2] + vfg*cov[4] + vfb*cov[5]; 1013 1014 float m = maximumf(maximumf(abs(r), abs(g)), abs(b)); 1015 if (m > 1e-10f) 1016 { 1017 m = 1.0f / m; 1018 r *= m; g *= m; b *= m; 1019 } 1020 1021 vfr = r; vfg = g; vfb = b; 1022 } 1023 1024 float len = vfr*vfr + vfg*vfg + vfb*vfb; 1025 if (len < 1e-10f) 1026 vec4F_set_scalar(&axis, 0.0f); 1027 else 1028 { 1029 len = 1.0f / sqrt(len); 1030 vfr *= len; vfg *= len; vfb *= len; 1031 vec4F_set(&axis, vfr, vfg, vfb, 0); 1032 } 1033 } 1034 1035 if (vec4F_dot(&axis, &axis) < .5f) 1036 { 1037 if (pParams.m_perceptual) 1038 vec4F_set(&axis, .213f, .715f, .072f, pParams.m_has_alpha ? .715f : 0); 1039 else 1040 vec4F_set(&axis, 1.0f, 1.0f, 1.0f, pParams.m_has_alpha ? 1.0f : 0); 1041 vec4F_normalize_in_place(&axis); 1042 } 1043 1044 float l = 1e+9f, h = -1e+9f; 1045 1046 for (uint i = 0; i < pParams.m_num_pixels; i++) 1047 { 1048 vec4F color = vec4F_from_color(&pParams.m_pPixels[i]); 1049 1050 vec4F q = vec4F_sub(&color, &meanColorScaled); 1051 float d = vec4F_dot(&q, &axis); 1052 1053 l = minimumf(l, d); 1054 h = maximumf(h, d); 1055 } 1056 1057 l *= (1.0f / 255.0f); 1058 h *= (1.0f / 255.0f); 1059 1060 vec4F b0 = vec4F_mul(&axis, l); 1061 vec4F b1 = vec4F_mul(&axis, h); 1062 vec4F c0 = vec4F_add(&meanColor, &b0); 1063 vec4F c1 = vec4F_add(&meanColor, &b1); 1064 vec4F minColor = vec4F_saturate(&c0); 1065 vec4F maxColor = vec4F_saturate(&c1); 1066 1067 vec4F whiteVec; 1068 vec4F_set_scalar(&whiteVec, 1.0f); 1069 if (vec4F_dot(&minColor, &whiteVec) > vec4F_dot(&maxColor, &whiteVec)) 1070 { 1071 vec4F temp = minColor; 1072 minColor = maxColor; 1073 maxColor = temp; 1074 } 1075 // First find a solution using the block's PCA. 1076 if (!find_optimal_solution(mode, minColor, maxColor, pParams, pResults)) 1077 return 0; 1078 1079 if (pComp_params.m_try_least_squares) 1080 { 1081 // Now try to refine the solution using least squares by computing the optimal endpoints from the current selectors. 1082 vec4F xl, xh; 1083 vec4F_set_scalar(&xl, 0.0f); 1084 vec4F_set_scalar(&xh, 0.0f); 1085 if (pParams.m_has_alpha) 1086 compute_least_squares_endpoints_rgba(pParams.m_num_pixels, pResults.m_pSelectors, pParams.m_pSelector_weightsx, &xl, &xh, pParams.m_pPixels); 1087 else 1088 compute_least_squares_endpoints_rgb(pParams.m_num_pixels, pResults.m_pSelectors, pParams.m_pSelector_weightsx, &xl, &xh, pParams.m_pPixels); 1089 1090 xl = vec4F_mul(&xl, (1.0f / 255.0f)); 1091 xh = vec4F_mul(&xh, (1.0f / 255.0f)); 1092 1093 if (!find_optimal_solution(mode, xl, xh, pParams, pResults)) 1094 return 0; 1095 } 1096 1097 if (pComp_params.m_uber_level > 0) 1098 { 1099 // In uber level 1, try varying the selectors a little, somewhat like cluster fit would. First try incrementing the minimum selectors, 1100 // then try decrementing the selectrors, then try both. 1101 ubyte[16] selectors_temp, selectors_temp1; 1102 memcpy(selectors_temp.ptr, pResults.m_pSelectors, pParams.m_num_pixels); 1103 1104 const int max_selector = pParams.m_num_selector_weights - 1; 1105 1106 uint min_sel = 16; 1107 uint max_sel = 0; 1108 for (uint i = 0; i < pParams.m_num_pixels; i++) 1109 { 1110 uint sel = selectors_temp[i]; 1111 min_sel = minimumu(min_sel, sel); 1112 max_sel = maximumu(max_sel, sel); 1113 } 1114 1115 for (uint i = 0; i < pParams.m_num_pixels; i++) 1116 { 1117 uint sel = selectors_temp[i]; 1118 if ((sel == min_sel) && (sel < (pParams.m_num_selector_weights - 1))) 1119 sel++; 1120 selectors_temp1[i] = cast(ubyte)sel; 1121 } 1122 1123 vec4F xl, xh; 1124 vec4F_set_scalar(&xl, 0.0f); 1125 vec4F_set_scalar(&xh, 0.0f); 1126 if (pParams.m_has_alpha) 1127 compute_least_squares_endpoints_rgba(pParams.m_num_pixels, selectors_temp1.ptr, 1128 pParams.m_pSelector_weightsx, &xl, &xh, pParams.m_pPixels); 1129 else 1130 compute_least_squares_endpoints_rgb(pParams.m_num_pixels, selectors_temp1.ptr, 1131 pParams.m_pSelector_weightsx, &xl, &xh, pParams.m_pPixels); 1132 1133 xl = vec4F_mul(&xl, (1.0f / 255.0f)); 1134 xh = vec4F_mul(&xh, (1.0f / 255.0f)); 1135 1136 if (!find_optimal_solution(mode, xl, xh, pParams, pResults)) 1137 return 0; 1138 1139 for (uint i = 0; i < pParams.m_num_pixels; i++) 1140 { 1141 uint sel = selectors_temp[i]; 1142 if ((sel == max_sel) && (sel > 0)) 1143 sel--; 1144 selectors_temp1[i] = cast(ubyte)sel; 1145 } 1146 1147 if (pParams.m_has_alpha) 1148 compute_least_squares_endpoints_rgba(pParams.m_num_pixels, selectors_temp1.ptr, 1149 pParams.m_pSelector_weightsx, &xl, &xh, pParams.m_pPixels); 1150 else 1151 compute_least_squares_endpoints_rgb(pParams.m_num_pixels, selectors_temp1.ptr, 1152 pParams.m_pSelector_weightsx, &xl, &xh, pParams.m_pPixels); 1153 1154 xl = vec4F_mul(&xl, (1.0f / 255.0f)); 1155 xh = vec4F_mul(&xh, (1.0f / 255.0f)); 1156 1157 if (!find_optimal_solution(mode, xl, xh, pParams, pResults)) 1158 return 0; 1159 1160 for (uint i = 0; i < pParams.m_num_pixels; i++) 1161 { 1162 uint sel = selectors_temp[i]; 1163 if ((sel == min_sel) && (sel < (pParams.m_num_selector_weights - 1))) 1164 sel++; 1165 else if ((sel == max_sel) && (sel > 0)) 1166 sel--; 1167 selectors_temp1[i] = cast(ubyte)sel; 1168 } 1169 1170 if (pParams.m_has_alpha) 1171 compute_least_squares_endpoints_rgba(pParams.m_num_pixels, selectors_temp1.ptr, 1172 pParams.m_pSelector_weightsx, &xl, &xh, pParams.m_pPixels); 1173 else 1174 compute_least_squares_endpoints_rgb(pParams.m_num_pixels, selectors_temp1.ptr, 1175 pParams.m_pSelector_weightsx, &xl, &xh, pParams.m_pPixels); 1176 1177 xl = vec4F_mul(&xl, (1.0f / 255.0f)); 1178 xh = vec4F_mul(&xh, (1.0f / 255.0f)); 1179 1180 if (!find_optimal_solution(mode, xl, xh, pParams, pResults)) 1181 return 0; 1182 1183 // In uber levels 2+, try taking more advantage of endpoint extrapolation by scaling the selectors in one direction or another. 1184 const uint uber_err_thresh = (pParams.m_num_pixels * 56) >> 4; 1185 if ((pComp_params.m_uber_level >= 2) && (pResults.m_best_overall_err > uber_err_thresh)) 1186 { 1187 const int Q = (pComp_params.m_uber_level >= 4) ? (pComp_params.m_uber_level - 2) : 1; 1188 for (int ly = -Q; ly <= 1; ly++) 1189 { 1190 for (int hy = max_selector - 1; hy <= (max_selector + Q); hy++) 1191 { 1192 if ((ly == 0) && (hy == max_selector)) 1193 continue; 1194 1195 for (uint i = 0; i < pParams.m_num_pixels; i++) 1196 selectors_temp1[i] = cast(ubyte)clampf(floor(cast(float)max_selector * (cast(float)selectors_temp[i] - cast(float)ly) / (cast(float)hy - cast(float)ly) + .5f), 0, cast(float)max_selector); 1197 1198 //vec4F xl, xh; 1199 vec4F_set_scalar(&xl, 0.0f); 1200 vec4F_set_scalar(&xh, 0.0f); 1201 if (pParams.m_has_alpha) 1202 compute_least_squares_endpoints_rgba(pParams.m_num_pixels, selectors_temp1.ptr, pParams.m_pSelector_weightsx, &xl, &xh, pParams.m_pPixels); 1203 else 1204 compute_least_squares_endpoints_rgb(pParams.m_num_pixels, selectors_temp1.ptr, pParams.m_pSelector_weightsx, &xl, &xh, pParams.m_pPixels); 1205 1206 xl = vec4F_mul(&xl, (1.0f / 255.0f)); 1207 xh = vec4F_mul(&xh, (1.0f / 255.0f)); 1208 1209 if (!find_optimal_solution(mode, xl, xh, pParams, pResults)) 1210 return 0; 1211 } 1212 } 1213 } 1214 } 1215 1216 if (mode == 1) 1217 { 1218 // Try encoding the partition as a single color by using the optimal singe colors tables to encode the block to its mean. 1219 color_cell_compressor_results avg_results = *pResults; 1220 const uint r = cast(int)(.5f + meanColor.m_c[0] * 255.0f), 1221 g = cast(int)(.5f + meanColor.m_c[1] * 255.0f), 1222 b = cast(int)(.5f + meanColor.m_c[2] * 255.0f); 1223 ulong avg_err = pack_mode1_to_one_color(pParams, &avg_results, r, g, b, pResults.m_pSelectors_temp); 1224 if (avg_err < pResults.m_best_overall_err) 1225 { 1226 *pResults = avg_results; 1227 memcpy(pResults.m_pSelectors, pResults.m_pSelectors_temp, (pResults.m_pSelectors[0]).sizeof * pParams.m_num_pixels); 1228 pResults.m_best_overall_err = avg_err; 1229 } 1230 } 1231 1232 return pResults.m_best_overall_err; 1233 } 1234 1235 ulong color_cell_compression_est(uint num_pixels, const color_quad_u8 *pPixels, bc7enc16_bool perceptual, uint* pweights/*[4]*/, ulong best_err_so_far) @system 1236 { 1237 // Find RGB bounds as an approximation of the block's principle axis 1238 uint lr = 255, lg = 255, lb = 255; 1239 uint hr = 0, hg = 0, hb = 0; 1240 for (uint i = 0; i < num_pixels; i++) 1241 { 1242 const color_quad_u8 *pC = &pPixels[i]; 1243 if (pC.m_c[0] < lr) lr = pC.m_c[0]; 1244 if (pC.m_c[1] < lg) lg = pC.m_c[1]; 1245 if (pC.m_c[2] < lb) lb = pC.m_c[2]; 1246 if (pC.m_c[0] > hr) hr = pC.m_c[0]; 1247 if (pC.m_c[1] > hg) hg = pC.m_c[1]; 1248 if (pC.m_c[2] > hb) hb = pC.m_c[2]; 1249 } 1250 1251 color_quad_u8 lowColor; color_quad_u8_set(&lowColor, lr, lg, lb, 0); 1252 color_quad_u8 highColor; color_quad_u8_set(&highColor, hr, hg, hb, 0); 1253 1254 // Place endpoints at bbox diagonals and compute interpolated colors 1255 const uint N = 8; 1256 color_quad_u8[8] weightedColors; 1257 1258 weightedColors[0] = lowColor; 1259 weightedColors[N - 1] = highColor; 1260 for (uint i = 1; i < (N - 1); i++) 1261 { 1262 weightedColors[i].m_c[0] = cast(ubyte)((lowColor.m_c[0] * (64 - g_bc7_weights3[i]) + highColor.m_c[0] * g_bc7_weights3[i] + 32) >> 6); 1263 weightedColors[i].m_c[1] = cast(ubyte)((lowColor.m_c[1] * (64 - g_bc7_weights3[i]) + highColor.m_c[1] * g_bc7_weights3[i] + 32) >> 6); 1264 weightedColors[i].m_c[2] = cast(ubyte)((lowColor.m_c[2] * (64 - g_bc7_weights3[i]) + highColor.m_c[2] * g_bc7_weights3[i] + 32) >> 6); 1265 } 1266 1267 // Compute dots and thresholds 1268 const int ar = highColor.m_c[0] - lowColor.m_c[0]; 1269 const int ag = highColor.m_c[1] - lowColor.m_c[1]; 1270 const int ab = highColor.m_c[2] - lowColor.m_c[2]; 1271 1272 int[8] dots; 1273 for (uint i = 0; i < N; i++) 1274 dots[i] = weightedColors[i].m_c[0] * ar + weightedColors[i].m_c[1] * ag + weightedColors[i].m_c[2] * ab; 1275 1276 int[8 - 1] thresh; 1277 for (uint i = 0; i < (N - 1); i++) 1278 thresh[i] = (dots[i] + dots[i + 1] + 1) >> 1; 1279 1280 ulong total_err = 0; 1281 if (perceptual) 1282 { 1283 // Transform block's interpolated colors to YCbCr 1284 int[8] l1, cr1, cb1; 1285 for (int j = 0; j < 8; j++) 1286 { 1287 const color_quad_u8 *pE1 = &weightedColors[j]; 1288 l1[j] = pE1.m_c[0] * 109 + pE1.m_c[1] * 366 + pE1.m_c[2] * 37; 1289 cr1[j] = (cast(int)pE1.m_c[0] << 9) - l1[j]; 1290 cb1[j] = (cast(int)pE1.m_c[2] << 9) - l1[j]; 1291 } 1292 1293 for (uint i = 0; i < num_pixels; i++) 1294 { 1295 const color_quad_u8 *pC = &pPixels[i]; 1296 1297 int d = ar * pC.m_c[0] + ag * pC.m_c[1] + ab * pC.m_c[2]; 1298 1299 // Find approximate selector 1300 uint s = 0; 1301 if (d >= thresh[6]) 1302 s = 7; 1303 else if (d >= thresh[5]) 1304 s = 6; 1305 else if (d >= thresh[4]) 1306 s = 5; 1307 else if (d >= thresh[3]) 1308 s = 4; 1309 else if (d >= thresh[2]) 1310 s = 3; 1311 else if (d >= thresh[1]) 1312 s = 2; 1313 else if (d >= thresh[0]) 1314 s = 1; 1315 1316 // Compute error 1317 const int l2 = pC.m_c[0] * 109 + pC.m_c[1] * 366 + pC.m_c[2] * 37; 1318 const int cr2 = (cast(int)pC.m_c[0] << 9) - l2; 1319 const int cb2 = (cast(int)pC.m_c[2] << 9) - l2; 1320 1321 const int dl = (l1[s] - l2) >> 8; 1322 const int dcr = (cr1[s] - cr2) >> 8; 1323 const int dcb = (cb1[s] - cb2) >> 8; 1324 1325 int ie = (pweights[0] * dl * dl) + (pweights[1] * dcr * dcr) + (pweights[2] * dcb * dcb); 1326 1327 total_err += ie; 1328 if (total_err > best_err_so_far) 1329 break; 1330 } 1331 } 1332 else 1333 { 1334 for (uint i = 0; i < num_pixels; i++) 1335 { 1336 const color_quad_u8 *pC = &pPixels[i]; 1337 1338 int d = ar * pC.m_c[0] + ag * pC.m_c[1] + ab * pC.m_c[2]; 1339 1340 // Find approximate selector 1341 uint s = 0; 1342 if (d >= thresh[6]) 1343 s = 7; 1344 else if (d >= thresh[5]) 1345 s = 6; 1346 else if (d >= thresh[4]) 1347 s = 5; 1348 else if (d >= thresh[3]) 1349 s = 4; 1350 else if (d >= thresh[2]) 1351 s = 3; 1352 else if (d >= thresh[1]) 1353 s = 2; 1354 else if (d >= thresh[0]) 1355 s = 1; 1356 1357 // Compute error 1358 const color_quad_u8 *pE1 = &weightedColors[s]; 1359 1360 int dr = cast(int)pE1.m_c[0] - cast(int)pC.m_c[0]; 1361 int dg = cast(int)pE1.m_c[1] - cast(int)pC.m_c[1]; 1362 int db = cast(int)pE1.m_c[2] - cast(int)pC.m_c[2]; 1363 1364 total_err += pweights[0] * (dr * dr) + pweights[1] * (dg * dg) + pweights[2] * (db * db); 1365 if (total_err > best_err_so_far) 1366 break; 1367 } 1368 } 1369 1370 return total_err; 1371 } 1372 1373 // This table contains bitmasks indicating which "key" partitions must be best ranked before this partition is worth evaluating. 1374 // We first rank the best/most used 14 partitions (sorted by usefulness), record the best one found as the key partition, then use 1375 // that to control the other partitions to evaluate. The quality loss is ~.08 dB RGB PSNR, the perf gain is up to ~11% (at uber level 0). 1376 static immutable uint[35] g_partition_predictors = 1377 [ 1378 uint.max, 1379 uint.max, 1380 uint.max, 1381 uint.max, 1382 uint.max, 1383 (1 << 1) | (1 << 2) | (1 << 8), 1384 (1 << 1) | (1 << 3) | (1 << 7), 1385 uint.max, 1386 uint.max, 1387 (1 << 2) | (1 << 8) | (1 << 16), 1388 (1 << 7) | (1 << 3) | (1 << 15), 1389 uint.max, 1390 (1 << 8) | (1 << 14) | (1 << 16), 1391 (1 << 7) | (1 << 14) | (1 << 15), 1392 uint.max, 1393 uint.max, 1394 uint.max, 1395 uint.max, 1396 (1 << 14) | (1 << 15), 1397 (1 << 16) | (1 << 22) | (1 << 14), 1398 (1 << 17) | (1 << 24) | (1 << 14), 1399 (1 << 2) | (1 << 14) | (1 << 15) | (1 << 1), 1400 uint.max, 1401 (1 << 1) | (1 << 3) | (1 << 14) | (1 << 16) | (1 << 22), 1402 uint.max, 1403 (1 << 1) | (1 << 2) | (1 << 15) | (1 << 17) | (1 << 24), 1404 (1 << 1) | (1 << 3) | (1 << 22), 1405 uint.max, 1406 uint.max, 1407 uint.max, 1408 (1 << 14) | (1 << 15) | (1 << 16) | (1 << 17), 1409 uint.max, 1410 uint.max, 1411 (1 << 1) | (1 << 2) | (1 << 3) | (1 << 27) | (1 << 4) | (1 << 24), 1412 (1 << 14) | (1 << 15) | (1 << 16) | (1 << 11) | (1 << 17) | (1 << 27) 1413 ]; 1414 1415 // Estimate the partition used by mode 1. This scans through each partition and computes an approximate error for each. 1416 uint estimate_partition(const(color_quad_u8)* pPixels, 1417 const(bc7enc16_compress_block_params)* pComp_params, 1418 uint* pweights/*[4]*/) @system 1419 { 1420 const uint total_partitions = minimumu(pComp_params.m_max_partitions_mode1, BC7ENC16_MAX_PARTITIONS1); 1421 if (total_partitions <= 1) 1422 return 0; 1423 1424 ulong best_err = ulong.max; 1425 uint best_partition = 0; 1426 1427 // Partition order sorted by usage frequency across a large test corpus. Pattern 34 (checkerboard) must appear in slot 34. 1428 // Using a sorted order allows the user to decrease the # of partitions to scan with minimal loss in quality. 1429 static immutable ubyte[64] s_sorted_partition_order = 1430 [ 1431 1 - 1, 14 - 1, 2 - 1, 3 - 1, 16 - 1, 15 - 1, 11 - 1, 17 - 1, 1432 4 - 1, 24 - 1, 27 - 1, 7 - 1, 8 - 1, 22 - 1, 20 - 1, 30 - 1, 1433 9 - 1, 5 - 1, 10 - 1, 21 - 1, 6 - 1, 32 - 1, 23 - 1, 18 - 1, 1434 19 - 1, 12 - 1, 13 - 1, 31 - 1, 25 - 1, 26 - 1, 29 - 1, 28 - 1, 1435 33 - 1, 34 - 1, 35 - 1, 46 - 1, 47 - 1, 52 - 1, 50 - 1, 51 - 1, 1436 49 - 1, 39 - 1, 40 - 1, 38 - 1, 54 - 1, 53 - 1, 55 - 1, 37 - 1, 1437 58 - 1, 59 - 1, 56 - 1, 42 - 1, 41 - 1, 43 - 1, 44 - 1, 60 - 1, 1438 45 - 1, 57 - 1, 48 - 1, 36 - 1, 61 - 1, 64 - 1, 63 - 1, 62 - 1 1439 ]; 1440 1441 assert(s_sorted_partition_order[34] == 34); 1442 1443 int best_key_partition = 0; 1444 1445 for (uint partition_iter = 0; (partition_iter < total_partitions) && (best_err > 0); partition_iter++) 1446 { 1447 const uint partition = s_sorted_partition_order[partition_iter]; 1448 1449 // Check to see if we should bother evaluating this partition at all, depending on the best partition found from the first 14. 1450 if (pComp_params.m_mode1_partition_estimation_filterbank) 1451 { 1452 if ((partition_iter >= 14) && (partition_iter <= 34)) 1453 { 1454 const uint best_key_partition_bitmask = 1 << (best_key_partition + 1); 1455 if ((g_partition_predictors[partition] & best_key_partition_bitmask) == 0) 1456 { 1457 if (partition_iter == 34) 1458 break; 1459 1460 continue; 1461 } 1462 } 1463 } 1464 1465 const ubyte *pPartition = &g_bc7_partition2[partition * 16]; 1466 1467 color_quad_u8[16][2] subset_colors; 1468 uint[2] subset_total_colors = [ 0, 0 ]; 1469 for (uint index = 0; index < 16; index++) 1470 subset_colors[pPartition[index]][subset_total_colors[pPartition[index]]++] = pPixels[index]; 1471 1472 ulong total_subset_err = 0; 1473 for (uint subset = 0; (subset < 2) && (total_subset_err < best_err); subset++) 1474 total_subset_err += color_cell_compression_est(subset_total_colors[subset], &subset_colors[subset][0], pComp_params.m_perceptual, pweights, best_err); 1475 1476 if (total_subset_err < best_err) 1477 { 1478 best_err = total_subset_err; 1479 best_partition = partition; 1480 } 1481 1482 // If the checkerboard pattern doesn't get the highest ranking vs. the previous (lower frequency) patterns, then just stop now because statistically the subsequent patterns won't do well either. 1483 if ((partition == 34) && (best_partition != 34)) 1484 break; 1485 1486 if (partition_iter == 13) 1487 best_key_partition = best_partition; 1488 1489 } // partition 1490 1491 return best_partition; 1492 } 1493 1494 void set_block_bits(ubyte *pBytes, uint val, uint num_bits, uint *pCur_ofs) @system 1495 { 1496 assert((num_bits <= 32) && (val < (1UL << num_bits))); 1497 while (num_bits) 1498 { 1499 const uint n = minimumu(8 - (*pCur_ofs & 7), num_bits); 1500 pBytes[*pCur_ofs >> 3] |= cast(ubyte)(val << (*pCur_ofs & 7)); 1501 val >>= n; 1502 num_bits -= n; 1503 *pCur_ofs += n; 1504 } 1505 assert(*pCur_ofs <= 128); 1506 } 1507 1508 struct bc7_optimization_results 1509 { 1510 uint m_mode; 1511 uint m_partition; 1512 ubyte[16] m_selectors; 1513 color_quad_u8[2] m_low; 1514 color_quad_u8[2] m_high; 1515 uint[2][2] m_pbits; 1516 } 1517 1518 static void encode_bc7_block(void *pBlock, const(bc7_optimization_results)* pResults) @system 1519 { 1520 const uint best_mode = pResults.m_mode; 1521 const uint total_subsets = g_bc7_num_subsets[best_mode]; 1522 const uint total_partitions = 1 << g_bc7_partition_bits[best_mode]; 1523 const ubyte *pPartition = (total_subsets == 2) ? &g_bc7_partition2[pResults.m_partition * 16] : &g_bc7_partition1[0]; 1524 1525 ubyte[16] color_selectors; 1526 memcpy(color_selectors.ptr, pResults.m_selectors.ptr, 16); 1527 1528 color_quad_u8[2] low, high; 1529 memcpy(low.ptr, pResults.m_low.ptr, low.sizeof); 1530 memcpy(high.ptr, pResults.m_high.ptr, high.sizeof); 1531 1532 uint[2][2] pbits; 1533 static assert(pbits.sizeof == 16); 1534 memcpy(pbits.ptr, pResults.m_pbits.ptr, pbits.sizeof); 1535 1536 int[2] anchor = [ -1, -1 ]; 1537 1538 for (uint k = 0; k < total_subsets; k++) 1539 { 1540 const uint anchor_index = k ? g_bc7_table_anchor_index_second_subset[pResults.m_partition] : 0; 1541 anchor[k] = anchor_index; 1542 1543 const uint color_index_bits = get_bc7_color_index_size(best_mode, 0); 1544 const uint num_color_indices = 1 << color_index_bits; 1545 1546 if (color_selectors[anchor_index] & (num_color_indices >> 1)) 1547 { 1548 for (uint i = 0; i < 16; i++) 1549 if (pPartition[i] == k) 1550 color_selectors[i] = cast(ubyte)((num_color_indices - 1) - color_selectors[i]); 1551 1552 color_quad_u8 tmp = low[k]; 1553 low[k] = high[k]; 1554 high[k] = tmp; 1555 1556 if (!g_bc7_mode_has_shared_p_bits[best_mode]) 1557 { 1558 uint t = pbits[k][0]; 1559 pbits[k][0] = pbits[k][1]; 1560 pbits[k][1] = t; 1561 } 1562 } 1563 } 1564 1565 ubyte *pBlock_bytes = cast(ubyte *)(pBlock); 1566 memset(pBlock_bytes, 0, BC7ENC16_BLOCK_SIZE); 1567 1568 uint cur_bit_ofs = 0; 1569 set_block_bits(pBlock_bytes, 1 << best_mode, best_mode + 1, &cur_bit_ofs); 1570 1571 if (total_partitions > 1) 1572 set_block_bits(pBlock_bytes, pResults.m_partition, 6, &cur_bit_ofs); 1573 1574 const uint total_comps = (best_mode >= 4) ? 4 : 3; 1575 for (uint comp = 0; comp < total_comps; comp++) 1576 { 1577 for (uint subset = 0; subset < total_subsets; subset++) 1578 { 1579 set_block_bits(pBlock_bytes, low[subset].m_c[comp], (comp == 3) ? g_bc7_alpha_precision_table[best_mode] : g_bc7_color_precision_table[best_mode], &cur_bit_ofs); 1580 set_block_bits(pBlock_bytes, high[subset].m_c[comp], (comp == 3) ? g_bc7_alpha_precision_table[best_mode] : g_bc7_color_precision_table[best_mode], &cur_bit_ofs); 1581 } 1582 } 1583 1584 for (uint subset = 0; subset < total_subsets; subset++) 1585 { 1586 set_block_bits(pBlock_bytes, pbits[subset][0], 1, &cur_bit_ofs); 1587 if (!g_bc7_mode_has_shared_p_bits[best_mode]) 1588 set_block_bits(pBlock_bytes, pbits[subset][1], 1, &cur_bit_ofs); 1589 } 1590 1591 for (int idx = 0; idx < 16; idx++) 1592 { 1593 uint n = get_bc7_color_index_size(best_mode, 0); 1594 if ((idx == anchor[0]) || (idx == anchor[1])) 1595 n--; 1596 set_block_bits(pBlock_bytes, color_selectors[idx], n, &cur_bit_ofs); 1597 } 1598 1599 assert(cur_bit_ofs == 128); 1600 } 1601 1602 void handle_alpha_block(void *pBlock, const(color_quad_u8)* pPixels, 1603 const(bc7enc16_compress_block_params)* pComp_params, 1604 color_cell_compressor_params *pParams) @system 1605 { 1606 color_cell_compressor_results results6; 1607 1608 pParams.m_pSelector_weights = g_bc7_weights4.ptr; 1609 pParams.m_pSelector_weightsx = cast(const(vec4F)*) g_bc7_weights4x.ptr; 1610 pParams.m_num_selector_weights = 16; 1611 pParams.m_comp_bits = 7; 1612 pParams.m_has_pbits = BC7ENC16_TRUE; 1613 pParams.m_has_alpha = BC7ENC16_TRUE; 1614 pParams.m_perceptual = pComp_params.m_perceptual; 1615 pParams.m_num_pixels = 16; 1616 pParams.m_pPixels = pPixels; 1617 1618 bc7_optimization_results opt_results; 1619 results6.m_pSelectors = opt_results.m_selectors.ptr; 1620 1621 ubyte[16] selectors_temp; 1622 results6.m_pSelectors_temp = selectors_temp.ptr; 1623 1624 color_cell_compression(6, pParams, &results6, pComp_params); 1625 1626 opt_results.m_mode = 6; 1627 opt_results.m_partition = 0; 1628 opt_results.m_low[0] = results6.m_low_endpoint; 1629 opt_results.m_high[0] = results6.m_high_endpoint; 1630 opt_results.m_pbits[0][0] = results6.m_pbits[0]; 1631 opt_results.m_pbits[0][1] = results6.m_pbits[1]; 1632 1633 encode_bc7_block(pBlock, &opt_results); 1634 } 1635 1636 static void handle_opaque_block(void *pBlock, 1637 const(color_quad_u8)* pPixels, 1638 const(bc7enc16_compress_block_params)* pComp_params, 1639 color_cell_compressor_params *pParams) @system 1640 { 1641 ubyte[16] selectors_temp; 1642 1643 // Mode 6 1644 bc7_optimization_results opt_results; 1645 1646 pParams.m_pSelector_weights = g_bc7_weights4.ptr; 1647 pParams.m_pSelector_weightsx = cast(const vec4F *)g_bc7_weights4x; 1648 pParams.m_num_selector_weights = 16; 1649 pParams.m_comp_bits = 7; 1650 pParams.m_has_pbits = BC7ENC16_TRUE; 1651 pParams.m_endpoints_share_pbit = BC7ENC16_FALSE; 1652 pParams.m_perceptual = pComp_params.m_perceptual; 1653 pParams.m_num_pixels = 16; 1654 pParams.m_pPixels = pPixels; 1655 pParams.m_has_alpha = BC7ENC16_FALSE; 1656 1657 color_cell_compressor_results results6; 1658 results6.m_pSelectors = opt_results.m_selectors.ptr; 1659 results6.m_pSelectors_temp = selectors_temp.ptr; 1660 1661 ulong best_err = color_cell_compression(6, pParams, &results6, pComp_params); 1662 1663 opt_results.m_mode = 6; 1664 opt_results.m_partition = 0; 1665 opt_results.m_low[0] = results6.m_low_endpoint; 1666 opt_results.m_high[0] = results6.m_high_endpoint; 1667 opt_results.m_pbits[0][0] = results6.m_pbits[0]; 1668 opt_results.m_pbits[0][1] = results6.m_pbits[1]; 1669 1670 // Mode 1 1671 if ((best_err > 0) && (pComp_params.m_max_partitions_mode1 > 0)) 1672 { 1673 const uint trial_partition = estimate_partition(pPixels, pComp_params, pParams.m_weights.ptr); 1674 pParams.m_pSelector_weights = g_bc7_weights3.ptr; 1675 pParams.m_pSelector_weightsx = cast(const vec4F *)g_bc7_weights3x; 1676 pParams.m_num_selector_weights = 8; 1677 pParams.m_comp_bits = 6; 1678 pParams.m_has_pbits = BC7ENC16_TRUE; 1679 pParams.m_endpoints_share_pbit = BC7ENC16_TRUE; 1680 1681 const ubyte *pPartition = &g_bc7_partition2[trial_partition * 16]; 1682 1683 color_quad_u8[16][2] subset_colors; 1684 1685 uint[2] subset_total_colors1 = [ 0, 0 ]; 1686 1687 ubyte[16][2] subset_pixel_index1; 1688 ubyte[16][2] subset_selectors1; 1689 color_cell_compressor_results[2] subset_results1; 1690 1691 for (uint idx = 0; idx < 16; idx++) 1692 { 1693 const uint p = pPartition[idx]; 1694 subset_colors[p][subset_total_colors1[p]] = pPixels[idx]; 1695 subset_pixel_index1[p][subset_total_colors1[p]] = cast(ubyte)idx; 1696 subset_total_colors1[p]++; 1697 } 1698 1699 ulong trial_err = 0; 1700 for (uint subset = 0; subset < 2; subset++) 1701 { 1702 pParams.m_num_pixels = subset_total_colors1[subset]; 1703 pParams.m_pPixels = &subset_colors[subset][0]; 1704 1705 color_cell_compressor_results *pResults = &subset_results1[subset]; 1706 pResults.m_pSelectors = &subset_selectors1[subset][0]; 1707 pResults.m_pSelectors_temp = selectors_temp.ptr; 1708 ulong err = color_cell_compression(1, pParams, pResults, pComp_params); 1709 trial_err += err; 1710 if (trial_err > best_err) 1711 break; 1712 1713 } // subset 1714 1715 if (trial_err < best_err) 1716 { 1717 best_err = trial_err; 1718 opt_results.m_mode = 1; 1719 opt_results.m_partition = trial_partition; 1720 for (uint subset = 0; subset < 2; subset++) 1721 { 1722 for (uint i = 0; i < subset_total_colors1[subset]; i++) 1723 opt_results.m_selectors[subset_pixel_index1[subset][i]] = subset_selectors1[subset][i]; 1724 opt_results.m_low[subset] = subset_results1[subset].m_low_endpoint; 1725 opt_results.m_high[subset] = subset_results1[subset].m_high_endpoint; 1726 opt_results.m_pbits[subset][0] = subset_results1[subset].m_pbits[0]; 1727 } 1728 } 1729 } 1730 1731 encode_bc7_block(pBlock, &opt_results); 1732 } 1733 1734 // Packs a single block of 16x16 RGBA pixels (R first in memory) to 128-bit BC7 block pBlock, using either mode 1 and/or 6. 1735 // Alpha blocks will always use mode 6, and by default opaque blocks will use either modes 1 or 6. 1736 // Returns BC7ENC16_TRUE if the block had any pixels with alpha < 255, otherwise it return BC7ENC16_FALSE. (This is not an error code - a block is always encoded.) 1737 bc7enc16_bool bc7enc16_compress_block(void *pBlock, 1738 const(void)* pPixelsRGBA, 1739 const(bc7enc16_compress_block_params)* pComp_params) @system 1740 { 1741 assert(g_bc7_mode_1_optimal_endpoints[255][0].m_hi != 0); 1742 1743 const color_quad_u8 *pPixels = cast(const color_quad_u8 *)(pPixelsRGBA); 1744 1745 color_cell_compressor_params params; 1746 if (pComp_params.m_perceptual) 1747 { 1748 // https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.709_conversion 1749 const float pr_weight = (.5f / (1.0f - .2126f)) * (.5f / (1.0f - .2126f)); 1750 const float pb_weight = (.5f / (1.0f - .0722f)) * (.5f / (1.0f - .0722f)); 1751 params.m_weights[0] = cast(int)(pComp_params.m_weights[0] * 4.0f); 1752 params.m_weights[1] = cast(int)(pComp_params.m_weights[1] * 4.0f * pr_weight); 1753 params.m_weights[2] = cast(int)(pComp_params.m_weights[2] * 4.0f * pb_weight); 1754 params.m_weights[3] = pComp_params.m_weights[3] * 4; 1755 } 1756 else 1757 memcpy(params.m_weights.ptr, pComp_params.m_weights.ptr, (params.m_weights).sizeof); 1758 1759 for (uint i = 0; i < 16; i++) 1760 { 1761 if (pPixels[i].m_c[3] < 255) 1762 { 1763 handle_alpha_block(pBlock, pPixels, pComp_params, ¶ms); 1764 return BC7ENC16_TRUE; 1765 } 1766 } 1767 handle_opaque_block(pBlock, pPixels, pComp_params, ¶ms); 1768 return BC7ENC16_FALSE; 1769 } 1770 1771 /* 1772 ------------------------------------------------------------------------------ 1773 This software is available under 2 licenses -- choose whichever you prefer. 1774 ------------------------------------------------------------------------------ 1775 ALTERNATIVE A - MIT License 1776 Copyright(c) 2018 Richard Geldreich, Jr. 1777 Permission is hereby granted, free of charge, to any person obtaining a copy of 1778 this software and associated documentation files(the "Software"), to deal in 1779 the Software without restriction, including without limitation the rights to 1780 use, copy, modify, merge, publish, distribute, sublicense, and / or sell copies 1781 of the Software, and to permit persons to whom the Software is furnished to do 1782 so, subject to the following conditions : 1783 The above copyright notice and this permission notice shall be included in all 1784 copies or substantial portions of the Software. 1785 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 1786 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 1787 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE 1788 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 1789 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 1790 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 1791 SOFTWARE. 1792 ------------------------------------------------------------------------------ 1793 ALTERNATIVE B - Public Domain(www.unlicense.org) 1794 This is free and unencumbered software released into the public domain. 1795 Anyone is free to copy, modify, publish, use, compile, sell, or distribute this 1796 software, either in source code form or as a compiled binary, for any purpose, 1797 commercial or non - commercial, and by any means. 1798 In jurisdictions that recognize copyright laws, the author or authors of this 1799 software dedicate any and all copyright interest in the software to the public 1800 domain.We make this dedication for the benefit of the public at large and to 1801 the detriment of our heirs and successors.We intend this dedication to be an 1802 overt act of relinquishment in perpetuity of all present and future rights to 1803 this software under copyright law. 1804 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 1805 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 1806 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE 1807 AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 1808 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 1809 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 1810 ------------------------------------------------------------------------------ 1811 */