summary refs log tree commit diff
path: root/src/video_core/host_shaders
diff options
context:
space:
mode:
author: ameerj 2021-03-25 16:53:51 -0400
committer: ameerj 2021-03-25 16:53:51 -0400
commit: 2f83d9a61bca42d9ef24074beb2b11b19bd4cecd (patch)
tree: 514e40eb750280c2e3025f9301befb6f8c9b46e9 /src/video_core/host_shaders
parent: astc_decoder: Reimplement Layers (diff)
download: yuzu-2f83d9a61bca42d9ef24074beb2b11b19bd4cecd.tar.gz
yuzu-2f83d9a61bca42d9ef24074beb2b11b19bd4cecd.tar.xz
yuzu-2f83d9a61bca42d9ef24074beb2b11b19bd4cecd.zip
astc_decoder: Refactor for style and more efficient memory use
Diffstat (limited to 'src/video_core/host_shaders')
-rw-r--r-- src/video_core/host_shaders/astc_decoder.comp 569
1 files changed, 307 insertions, 262 deletions
diff --git a/src/video_core/host_shaders/astc_decoder.comp b/src/video_core/host_shaders/astc_decoder.comp
index b903a2d37..703e34587 100644
--- a/src/video_core/host_shaders/astc_decoder.comp
+++ b/src/video_core/host_shaders/astc_decoder.comp
@@ -9,13 +9,13 @@
9#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants { 9#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
10#define END_PUSH_CONSTANTS }; 10#define END_PUSH_CONSTANTS };
11#define UNIFORM(n) 11#define UNIFORM(n)
12#define BINDING_SWIZZLE_BUFFER 0 12#define BINDING_INPUT_BUFFER 0
13#define BINDING_INPUT_BUFFER 1 13#define BINDING_ENC_BUFFER 1
14#define BINDING_ENC_BUFFER 2 14#define BINDING_6_TO_8_BUFFER 2
15#define BINDING_6_TO_8_BUFFER 3 15#define BINDING_7_TO_8_BUFFER 3
16#define BINDING_7_TO_8_BUFFER 4 16#define BINDING_8_TO_8_BUFFER 4
17#define BINDING_8_TO_8_BUFFER 5 17#define BINDING_BYTE_TO_16_BUFFER 5
18#define BINDING_BYTE_TO_16_BUFFER 6 18#define BINDING_SWIZZLE_BUFFER 6
19#define BINDING_OUTPUT_IMAGE 7 19#define BINDING_OUTPUT_IMAGE 7
20 20
21#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv 21#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv
@@ -37,28 +37,16 @@
37layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in; 37layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in;
38 38
39BEGIN_PUSH_CONSTANTS 39BEGIN_PUSH_CONSTANTS
40UNIFORM(0) uvec2 num_image_blocks;
41UNIFORM(1) uvec2 block_dims; 40UNIFORM(1) uvec2 block_dims;
42 41
43UNIFORM(2) uvec3 origin; 42UNIFORM(2) uint bytes_per_block_log2;
44UNIFORM(3) ivec3 destination; 43UNIFORM(3) uint layer_stride;
45UNIFORM(4) uint bytes_per_block_log2; 44UNIFORM(4) uint block_size;
46UNIFORM(5) uint layer_stride; 45UNIFORM(5) uint x_shift;
47UNIFORM(6) uint block_size; 46UNIFORM(6) uint block_height;
48UNIFORM(7) uint x_shift; 47UNIFORM(7) uint block_height_mask;
49UNIFORM(8) uint block_height;
50UNIFORM(9) uint block_height_mask;
51END_PUSH_CONSTANTS 48END_PUSH_CONSTANTS
52 49
53uint current_index = 0;
54int bitsread = 0;
55uint total_bitsread = 0;
56uint local_buff[16];
57
58const int JustBits = 0;
59const int Quint = 1;
60const int Trit = 2;
61
62struct EncodingData { 50struct EncodingData {
63 uint encoding; 51 uint encoding;
64 uint num_bits; 52 uint num_bits;
@@ -68,11 +56,11 @@ struct EncodingData {
68 56
69struct TexelWeightParams { 57struct TexelWeightParams {
70 uvec2 size; 58 uvec2 size;
71 bool dual_plane;
72 uint max_weight; 59 uint max_weight;
73 bool Error; 60 bool dual_plane;
74 bool VoidExtentLDR; 61 bool error_state;
75 bool VoidExtentHDR; 62 bool void_extent_ldr;
63 bool void_extent_hdr;
76}; 64};
77 65
78// Swizzle data 66// Swizzle data
@@ -116,6 +104,75 @@ const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_SHI
116 104
117const uvec2 SWIZZLE_MASK = uvec2(GOB_SIZE_X - 1, GOB_SIZE_Y - 1); 105const uvec2 SWIZZLE_MASK = uvec2(GOB_SIZE_X - 1, GOB_SIZE_Y - 1);
118 106
107const int BLOCK_SIZE_IN_BYTES = 16;
108
109const int BLOCK_INFO_ERROR = 0;
110const int BLOCK_INFO_VOID_EXTENT_HDR = 1;
111const int BLOCK_INFO_VOID_EXTENT_LDR = 2;
112const int BLOCK_INFO_NORMAL = 3;
113
114const int JUST_BITS = 0;
115const int QUINT = 1;
116const int TRIT = 2;
117
118// The following constants are expanded variants of the Replicate()
119// function calls corresponding to the following arguments:
120// value: index into the generated table
121// num_bits: the integer after "REPLICATE_" in the table name, i.e. 4 is num_bits in REPLICATE_4.
122// to_bit: the integer after "TO_"
123const uint REPLICATE_BIT_TO_7_TABLE[2] = uint[](0, 127);
124const uint REPLICATE_1_BIT_TO_9_TABLE[2] = uint[](0, 511);
125
126const uint REPLICATE_1_BIT_TO_8_TABLE[2] = uint[](0, 255);
127const uint REPLICATE_2_BIT_TO_8_TABLE[4] = uint[](0, 85, 170, 255);
128const uint REPLICATE_3_BIT_TO_8_TABLE[8] = uint[](0, 36, 73, 109, 146, 182, 219, 255);
129const uint REPLICATE_4_BIT_TO_8_TABLE[16] =
130 uint[](0, 17, 34, 51, 68, 85, 102, 119, 136, 153, 170, 187, 204, 221, 238, 255);
131const uint REPLICATE_5_BIT_TO_8_TABLE[32] =
132 uint[](0, 8, 16, 24, 33, 41, 49, 57, 66, 74, 82, 90, 99, 107, 115, 123, 132, 140, 148, 156, 165,
133 173, 181, 189, 198, 206, 214, 222, 231, 239, 247, 255);
134const uint REPLICATE_1_BIT_TO_6_TABLE[2] = uint[](0, 63);
135const uint REPLICATE_2_BIT_TO_6_TABLE[4] = uint[](0, 21, 42, 63);
136const uint REPLICATE_3_BIT_TO_6_TABLE[8] = uint[](0, 9, 18, 27, 36, 45, 54, 63);
137const uint REPLICATE_4_BIT_TO_6_TABLE[16] =
138 uint[](0, 4, 8, 12, 17, 21, 25, 29, 34, 38, 42, 46, 51, 55, 59, 63);
139const uint REPLICATE_5_BIT_TO_6_TABLE[32] =
140 uint[](0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 33, 35, 37, 39, 41, 43, 45,
141 47, 49, 51, 53, 55, 57, 59, 61, 63);
142
143// Input ASTC texture globals
144uint current_index = 0;
145int bitsread = 0;
146uint total_bitsread = 0;
147uint local_buff[16];
148
149// Color data globals
150uint color_endpoint_data[16];
151int color_bitsread = 0;
152uint total_color_bitsread = 0;
153int color_index = 0;
154
155// Four values per endpoint, two endpoints per partition, at most four partitions
156uint color_values[32];
157int colvals_index = 0;
158
159// Weight data globals
160uint texel_weight_data[16];
161int texel_bitsread = 0;
162uint total_texel_bitsread = 0;
163int texel_index = 0;
164
165bool texel_flag = false;
166
167// Global "vectors" to be pushed into when decoding
168EncodingData result_vector[100];
169int result_index = 0;
170
171EncodingData texel_vector[100];
172int texel_vector_index = 0;
173
174uint unquantized_texel_weights[2][144];
175
119uint SwizzleOffset(uvec2 pos) { 176uint SwizzleOffset(uvec2 pos) {
120 pos = pos & SWIZZLE_MASK; 177 pos = pos & SWIZZLE_MASK;
121 return swizzle_table[pos.y * 64 + pos.x]; 178 return swizzle_table[pos.y * 64 + pos.x];
@@ -126,21 +183,10 @@ uint ReadTexel(uint offset) {
126 return bitfieldExtract(astc_data[offset / 4], int((offset * 8) & 24), 8); 183 return bitfieldExtract(astc_data[offset / 4], int((offset * 8) & 24), 8);
127} 184}
128 185
129 186// Replicates low num_bits such that [(to_bit - 1):(to_bit - num_bits)]
130const int BLOCK_SIZE_IN_BYTES = 16; 187// is the same as [(num_bits - 1):0] and repeats all the way down.
131
132const int BLOCK_INFO_ERROR = 0;
133const int BLOCK_INFO_VOID_EXTENT_HDR = 1;
134const int BLOCK_INFO_VOID_EXTENT_LDR = 2;
135const int BLOCK_INFO_NORMAL = 3;
136
137// Replicates low numBits such that [(toBit - 1):(toBit - 1 - fromBit)]
138// is the same as [(numBits - 1):0] and repeats all the way down.
139uint Replicate(uint val, uint num_bits, uint to_bit) { 188uint Replicate(uint val, uint num_bits, uint to_bit) {
140 if (num_bits == 0) { 189 if (num_bits == 0 || to_bit == 0) {
141 return 0;
142 }
143 if (to_bit == 0) {
144 return 0; 190 return 0;
145 } 191 }
146 const uint v = val & uint((1 << num_bits) - 1); 192 const uint v = val & uint((1 << num_bits) - 1);
@@ -165,26 +211,14 @@ uvec4 ReplicateByteTo16(uvec4 value) {
165 REPLICATE_BYTE_TO_16_TABLE[value.z], REPLICATE_BYTE_TO_16_TABLE[value.w]); 211 REPLICATE_BYTE_TO_16_TABLE[value.z], REPLICATE_BYTE_TO_16_TABLE[value.w]);
166} 212}
167 213
168const uint REPLICATE_BIT_TO_7_TABLE[2] = uint[](0, 127);
169uint ReplicateBitTo7(uint value) { 214uint ReplicateBitTo7(uint value) {
170 return REPLICATE_BIT_TO_7_TABLE[value]; 215 return REPLICATE_BIT_TO_7_TABLE[value];
171 ;
172} 216}
173 217
174const uint REPLICATE_1_BIT_TO_9_TABLE[2] = uint[](0, 511);
175uint ReplicateBitTo9(uint value) { 218uint ReplicateBitTo9(uint value) {
176 return REPLICATE_1_BIT_TO_9_TABLE[value]; 219 return REPLICATE_1_BIT_TO_9_TABLE[value];
177} 220}
178 221
179const uint REPLICATE_1_BIT_TO_8_TABLE[2] = uint[](0, 255);
180const uint REPLICATE_2_BIT_TO_8_TABLE[4] = uint[](0, 85, 170, 255);
181const uint REPLICATE_3_BIT_TO_8_TABLE[8] = uint[](0, 36, 73, 109, 146, 182, 219, 255);
182const uint REPLICATE_4_BIT_TO_8_TABLE[16] =
183 uint[](0, 17, 34, 51, 68, 85, 102, 119, 136, 153, 170, 187, 204, 221, 238, 255);
184const uint REPLICATE_5_BIT_TO_8_TABLE[32] =
185 uint[](0, 8, 16, 24, 33, 41, 49, 57, 66, 74, 82, 90, 99, 107, 115, 123, 132, 140, 148, 156, 165,
186 173, 181, 189, 198, 206, 214, 222, 231, 239, 247, 255);
187
188uint FastReplicateTo8(uint value, uint num_bits) { 222uint FastReplicateTo8(uint value, uint num_bits) {
189 switch (num_bits) { 223 switch (num_bits) {
190 case 1: 224 case 1:
@@ -207,15 +241,6 @@ uint FastReplicateTo8(uint value, uint num_bits) {
207 return Replicate(value, num_bits, 8); 241 return Replicate(value, num_bits, 8);
208} 242}
209 243
210const uint REPLICATE_1_BIT_TO_6_TABLE[2] = uint[](0, 63);
211const uint REPLICATE_2_BIT_TO_6_TABLE[4] = uint[](0, 21, 42, 63);
212const uint REPLICATE_3_BIT_TO_6_TABLE[8] = uint[](0, 9, 18, 27, 36, 45, 54, 63);
213const uint REPLICATE_4_BIT_TO_6_TABLE[16] =
214 uint[](0, 4, 8, 12, 17, 21, 25, 29, 34, 38, 42, 46, 51, 55, 59, 63);
215const uint REPLICATE_5_BIT_TO_6_TABLE[32] =
216 uint[](0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 33, 35, 37, 39, 41, 43, 45,
217 47, 49, 51, 53, 55, 57, 59, 61, 63);
218
219uint FastReplicateTo6(uint value, uint num_bits) { 244uint FastReplicateTo6(uint value, uint num_bits) {
220 switch (num_bits) { 245 switch (num_bits) {
221 case 1: 246 case 1:
@@ -232,7 +257,23 @@ uint FastReplicateTo6(uint value, uint num_bits) {
232 return Replicate(value, num_bits, 6); 257 return Replicate(value, num_bits, 6);
233} 258}
234 259
235uint hash52(uint p) { 260uint Div3Floor(uint v) {
261 return (v * 0x5556) >> 16;
262}
263
264uint Div3Ceil(uint v) {
265 return Div3Floor(v + 2);
266}
267
268uint Div5Floor(uint v) {
269 return (v * 0x3334) >> 16;
270}
271
272uint Div5Ceil(uint v) {
273 return Div5Floor(v + 4);
274}
275
276uint Hash52(uint p) {
236 p ^= p >> 15; 277 p ^= p >> 15;
237 p -= p << 17; 278 p -= p << 17;
238 p += p << 7; 279 p += p << 7;
@@ -247,9 +288,9 @@ uint hash52(uint p) {
247} 288}
248 289
249uint SelectPartition(uint seed, uint x, uint y, uint z, uint partition_count, bool small_block) { 290uint SelectPartition(uint seed, uint x, uint y, uint z, uint partition_count, bool small_block) {
250 if (1 == partition_count) 291 if (partition_count == 1) {
251 return 0; 292 return 0;
252 293 }
253 if (small_block) { 294 if (small_block) {
254 x <<= 1; 295 x <<= 1;
255 y <<= 1; 296 y <<= 1;
@@ -258,7 +299,7 @@ uint SelectPartition(uint seed, uint x, uint y, uint z, uint partition_count, bo
258 299
259 seed += (partition_count - 1) * 1024; 300 seed += (partition_count - 1) * 1024;
260 301
261 uint rnum = hash52(uint(seed)); 302 uint rnum = Hash52(uint(seed));
262 uint seed1 = uint(rnum & 0xF); 303 uint seed1 = uint(rnum & 0xF);
263 uint seed2 = uint((rnum >> 4) & 0xF); 304 uint seed2 = uint((rnum >> 4) & 0xF);
264 uint seed3 = uint((rnum >> 8) & 0xF); 305 uint seed3 = uint((rnum >> 8) & 0xF);
@@ -318,18 +359,22 @@ uint SelectPartition(uint seed, uint x, uint y, uint z, uint partition_count, bo
318 c &= 0x3F; 359 c &= 0x3F;
319 d &= 0x3F; 360 d &= 0x3F;
320 361
321 if (partition_count < 4) 362 if (partition_count < 4) {
322 d = 0; 363 d = 0;
323 if (partition_count < 3) 364 }
365 if (partition_count < 3) {
324 c = 0; 366 c = 0;
367 }
325 368
326 if (a >= b && a >= c && a >= d) 369 if (a >= b && a >= c && a >= d) {
327 return 0; 370 return 0;
328 else if (b >= c && b >= d) 371 } else if (b >= c && b >= d) {
329 return 1; 372 return 1;
330 else if (c >= d) 373 } else if (c >= d) {
331 return 2; 374 return 2;
332 return 3; 375 } else {
376 return 3;
377 }
333} 378}
334 379
335uint Select2DPartition(uint seed, uint x, uint y, uint partition_count, bool small_block) { 380uint Select2DPartition(uint seed, uint x, uint y, uint partition_count, bool small_block) {
@@ -341,10 +386,10 @@ uint ReadBit() {
341 return 0; 386 return 0;
342 } 387 }
343 uint bit = bitfieldExtract(local_buff[current_index], bitsread, 1); 388 uint bit = bitfieldExtract(local_buff[current_index], bitsread, 1);
344 bitsread++; 389 ++bitsread;
345 total_bitsread++; 390 ++total_bitsread;
346 if (bitsread == 8) { 391 if (bitsread == 8) {
347 current_index++; 392 ++current_index;
348 bitsread = 0; 393 bitsread = 0;
349 } 394 }
350 return bit; 395 return bit;
@@ -358,36 +403,22 @@ uint StreamBits(uint num_bits) {
358 return ret; 403 return ret;
359} 404}
360 405
361// Define color data.
362uint color_endpoint_data[16];
363int color_bitsread = 0;
364uint total_color_bitsread = 0;
365int color_index = 0;
366
367// Define color data.
368uint texel_weight_data[16];
369int texel_bitsread = 0;
370uint total_texel_bitsread = 0;
371int texel_index = 0;
372
373bool texel_flag = false;
374
375uint ReadColorBit() { 406uint ReadColorBit() {
376 uint bit = 0; 407 uint bit = 0;
377 if (texel_flag) { 408 if (texel_flag) {
378 bit = bitfieldExtract(texel_weight_data[texel_index], texel_bitsread, 1); 409 bit = bitfieldExtract(texel_weight_data[texel_index], texel_bitsread, 1);
379 texel_bitsread++; 410 ++texel_bitsread;
380 total_texel_bitsread++; 411 ++total_texel_bitsread;
381 if (texel_bitsread == 8) { 412 if (texel_bitsread == 8) {
382 texel_index++; 413 ++texel_index;
383 texel_bitsread = 0; 414 texel_bitsread = 0;
384 } 415 }
385 } else { 416 } else {
386 bit = bitfieldExtract(color_endpoint_data[color_index], color_bitsread, 1); 417 bit = bitfieldExtract(color_endpoint_data[color_index], color_bitsread, 1);
387 color_bitsread++; 418 ++color_bitsread;
388 total_color_bitsread++; 419 ++total_color_bitsread;
389 if (color_bitsread == 8) { 420 if (color_bitsread == 8) {
390 color_index++; 421 ++color_index;
391 color_bitsread = 0; 422 color_bitsread = 0;
392 } 423 }
393 } 424 }
@@ -402,31 +433,25 @@ uint StreamColorBits(uint num_bits) {
402 return ret; 433 return ret;
403} 434}
404 435
405EncodingData result_vector[100];
406int result_index = 0;
407
408EncodingData texel_vector[100];
409int texel_vector_index = 0;
410
411void ResultEmplaceBack(EncodingData val) { 436void ResultEmplaceBack(EncodingData val) {
412 if (texel_flag) { 437 if (texel_flag) {
413 texel_vector[texel_vector_index] = val; 438 texel_vector[texel_vector_index] = val;
414 texel_vector_index++; 439 ++texel_vector_index;
415 } else { 440 } else {
416 result_vector[result_index] = val; 441 result_vector[result_index] = val;
417 result_index++; 442 ++result_index;
418 } 443 }
419} 444}
420 445
421// Returns the number of bits required to encode n_vals values. 446// Returns the number of bits required to encode n_vals values.
422uint GetBitLength(uint n_vals, uint encoding_index) { 447uint GetBitLength(uint n_vals, uint encoding_index) {
423 uint totalBits = encoding_values[encoding_index].num_bits * n_vals; 448 uint total_bits = encoding_values[encoding_index].num_bits * n_vals;
424 if (encoding_values[encoding_index].encoding == Trit) { 449 if (encoding_values[encoding_index].encoding == TRIT) {
425 totalBits += (n_vals * 8 + 4) / 5; 450 total_bits += Div5Ceil(n_vals * 8);
426 } else if (encoding_values[encoding_index].encoding == Quint) { 451 } else if (encoding_values[encoding_index].encoding == QUINT) {
427 totalBits += (n_vals * 7 + 2) / 3; 452 total_bits += Div3Ceil(n_vals * 7);
428 } 453 }
429 return totalBits; 454 return total_bits;
430} 455}
431 456
432uint GetNumWeightValues(uvec2 size, bool dual_plane) { 457uint GetNumWeightValues(uvec2 size, bool dual_plane) {
@@ -459,7 +484,7 @@ uint BitsOp(uint bits, uint start, uint end) {
459 return ((bits >> start) & mask); 484 return ((bits >> start) & mask);
460} 485}
461 486
462void DecodeQuintBlock(uint num_bits) { // Value number of bits 487void DecodeQuintBlock(uint num_bits) {
463 uint m[3]; 488 uint m[3];
464 uint q[3]; 489 uint q[3];
465 uint Q; 490 uint Q;
@@ -483,7 +508,6 @@ void DecodeQuintBlock(uint num_bits) { // Value number of bits
483 q[2] = BitsOp(Q, 5, 6); 508 q[2] = BitsOp(Q, 5, 6);
484 C = BitsOp(Q, 0, 4); 509 C = BitsOp(Q, 0, 4);
485 } 510 }
486
487 if (BitsOp(C, 0, 2) == 5) { 511 if (BitsOp(C, 0, 2) == 5) {
488 q[1] = 4; 512 q[1] = 4;
489 q[0] = BitsOp(C, 3, 4); 513 q[0] = BitsOp(C, 3, 4);
@@ -492,10 +516,9 @@ void DecodeQuintBlock(uint num_bits) { // Value number of bits
492 q[0] = BitsOp(C, 0, 2); 516 q[0] = BitsOp(C, 0, 2);
493 } 517 }
494 } 518 }
495
496 for (uint i = 0; i < 3; i++) { 519 for (uint i = 0; i < 3; i++) {
497 EncodingData val; 520 EncodingData val;
498 val.encoding = Quint; 521 val.encoding = QUINT;
499 val.num_bits = num_bits; 522 val.num_bits = num_bits;
500 val.bit_value = m[i]; 523 val.bit_value = m[i];
501 val.quint_trit_value = q[i]; 524 val.quint_trit_value = q[i];
@@ -547,29 +570,28 @@ void DecodeTritBlock(uint num_bits) {
547 } 570 }
548 for (uint i = 0; i < 5; i++) { 571 for (uint i = 0; i < 5; i++) {
549 EncodingData val; 572 EncodingData val;
550 val.encoding = Trit; 573 val.encoding = TRIT;
551 val.num_bits = num_bits; 574 val.num_bits = num_bits;
552 val.bit_value = m[i]; 575 val.bit_value = m[i];
553 val.quint_trit_value = t[i]; 576 val.quint_trit_value = t[i];
554 ResultEmplaceBack(val); 577 ResultEmplaceBack(val);
555 } 578 }
556} 579}
580
557void DecodeIntegerSequence(uint max_range, uint num_values) { 581void DecodeIntegerSequence(uint max_range, uint num_values) {
558 EncodingData val = encoding_values[max_range]; 582 EncodingData val = encoding_values[max_range];
559 uint vals_decoded = 0; 583 uint vals_decoded = 0;
560 while (vals_decoded < num_values) { 584 while (vals_decoded < num_values) {
561 switch (val.encoding) { 585 switch (val.encoding) {
562 case Quint: 586 case QUINT:
563 DecodeQuintBlock(val.num_bits); 587 DecodeQuintBlock(val.num_bits);
564 vals_decoded += 3; 588 vals_decoded += 3;
565 break; 589 break;
566 590 case TRIT:
567 case Trit:
568 DecodeTritBlock(val.num_bits); 591 DecodeTritBlock(val.num_bits);
569 vals_decoded += 5; 592 vals_decoded += 5;
570 break; 593 break;
571 594 case JUST_BITS:
572 case JustBits:
573 val.bit_value = StreamColorBits(val.num_bits); 595 val.bit_value = StreamColorBits(val.num_bits);
574 ResultEmplaceBack(val); 596 ResultEmplaceBack(val);
575 vals_decoded++; 597 vals_decoded++;
@@ -578,8 +600,7 @@ void DecodeIntegerSequence(uint max_range, uint num_values) {
578 } 600 }
579} 601}
580 602
581void DecodeColorValues(out uint color_values[32], uvec4 modes, uint num_partitions, 603void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits) {
582 uint color_data_bits) {
583 uint num_values = 0; 604 uint num_values = 0;
584 for (uint i = 0; i < num_partitions; i++) { 605 for (uint i = 0; i < num_partitions; i++) {
585 num_values += ((modes[i] >> 2) + 1) << 1; 606 num_values += ((modes[i] >> 2) + 1) << 1;
@@ -587,21 +608,21 @@ void DecodeColorValues(out uint color_values[32], uvec4 modes, uint num_partitio
587 int range = 256; 608 int range = 256;
588 while (--range > 0) { 609 while (--range > 0) {
589 EncodingData val = encoding_values[range]; 610 EncodingData val = encoding_values[range];
590 uint bitLength = GetBitLength(num_values, range); 611 uint bit_length = GetBitLength(num_values, range);
591 if (bitLength <= color_data_bits) { 612 if (bit_length <= color_data_bits) {
592 while (--range > 0) { 613 while (--range > 0) {
593 EncodingData newval = encoding_values[range]; 614 EncodingData newval = encoding_values[range];
594 if (newval.encoding != val.encoding && newval.num_bits != val.num_bits) { 615 if (newval.encoding != val.encoding && newval.num_bits != val.num_bits) {
595 break; 616 break;
596 } 617 }
597 } 618 }
598 range++; 619 ++range;
599 break; 620 break;
600 } 621 }
601 } 622 }
602 DecodeIntegerSequence(range, num_values); 623 DecodeIntegerSequence(range, num_values);
603 uint out_index = 0; 624 uint out_index = 0;
604 for (int itr = 0; itr < result_index; itr++) { 625 for (int itr = 0; itr < result_index; ++itr) {
605 if (out_index >= num_values) { 626 if (out_index >= num_values) {
606 break; 627 break;
607 } 628 }
@@ -611,77 +632,83 @@ void DecodeColorValues(out uint color_values[32], uvec4 modes, uint num_partitio
611 uint A = 0, B = 0, C = 0, D = 0; 632 uint A = 0, B = 0, C = 0, D = 0;
612 A = ReplicateBitTo9((bitval & 1)); 633 A = ReplicateBitTo9((bitval & 1));
613 switch (val.encoding) { 634 switch (val.encoding) {
614 case JustBits: 635 case JUST_BITS:
615 color_values[out_index++] = FastReplicateTo8(bitval, bitlen); 636 color_values[out_index++] = FastReplicateTo8(bitval, bitlen);
616 break; 637 break;
617 case Trit: { 638 case TRIT: {
618 D = val.quint_trit_value; 639 D = val.quint_trit_value;
619 switch (bitlen) { 640 switch (bitlen) {
620 case 1: { 641 case 1:
621 C = 204; 642 C = 204;
622 } break; 643 break;
623 case 2: { 644 case 2: {
624 C = 93; 645 C = 93;
625 uint b = (bitval >> 1) & 1; 646 uint b = (bitval >> 1) & 1;
626 B = (b << 8) | (b << 4) | (b << 2) | (b << 1); 647 B = (b << 8) | (b << 4) | (b << 2) | (b << 1);
627 } break; 648 break;
628 649 }
629 case 3: { 650 case 3: {
630 C = 44; 651 C = 44;
631 uint cb = (bitval >> 1) & 3; 652 uint cb = (bitval >> 1) & 3;
632 B = (cb << 7) | (cb << 2) | cb; 653 B = (cb << 7) | (cb << 2) | cb;
633 } break; 654 break;
634 655 }
635 case 4: { 656 case 4: {
636 C = 22; 657 C = 22;
637 uint dcb = (bitval >> 1) & 7; 658 uint dcb = (bitval >> 1) & 7;
638 B = (dcb << 6) | dcb; 659 B = (dcb << 6) | dcb;
639 } break; 660 break;
640 661 }
641 case 5: { 662 case 5: {
642 C = 11; 663 C = 11;
643 uint edcb = (bitval >> 1) & 0xF; 664 uint edcb = (bitval >> 1) & 0xF;
644 B = (edcb << 5) | (edcb >> 2); 665 B = (edcb << 5) | (edcb >> 2);
645 } break; 666 break;
646 667 }
647 case 6: { 668 case 6: {
648 C = 5; 669 C = 5;
649 uint fedcb = (bitval >> 1) & 0x1F; 670 uint fedcb = (bitval >> 1) & 0x1F;
650 B = (fedcb << 4) | (fedcb >> 4); 671 B = (fedcb << 4) | (fedcb >> 4);
651 } break; 672 break;
652 } 673 }
653 } break; 674 }
654 case Quint: { 675 break;
676 }
677 case QUINT: {
655 D = val.quint_trit_value; 678 D = val.quint_trit_value;
656 switch (bitlen) { 679 switch (bitlen) {
657 case 1: { 680 case 1:
658 C = 113; 681 C = 113;
659 } break; 682 break;
660 case 2: { 683 case 2: {
661 C = 54; 684 C = 54;
662 uint b = (bitval >> 1) & 1; 685 uint b = (bitval >> 1) & 1;
663 B = (b << 8) | (b << 3) | (b << 2); 686 B = (b << 8) | (b << 3) | (b << 2);
664 } break; 687 break;
688 }
665 case 3: { 689 case 3: {
666 C = 26; 690 C = 26;
667 uint cb = (bitval >> 1) & 3; 691 uint cb = (bitval >> 1) & 3;
668 B = (cb << 7) | (cb << 1) | (cb >> 1); 692 B = (cb << 7) | (cb << 1) | (cb >> 1);
669 } break; 693 break;
694 }
670 case 4: { 695 case 4: {
671 C = 13; 696 C = 13;
672 uint dcb = (bitval >> 1) & 7; 697 uint dcb = (bitval >> 1) & 7;
673 B = (dcb << 6) | (dcb >> 1); 698 B = (dcb << 6) | (dcb >> 1);
674 } break; 699 break;
700 }
675 case 5: { 701 case 5: {
676 C = 6; 702 C = 6;
677 uint edcb = (bitval >> 1) & 0xF; 703 uint edcb = (bitval >> 1) & 0xF;
678 B = (edcb << 5) | (edcb >> 3); 704 B = (edcb << 5) | (edcb >> 3);
679 } break; 705 break;
680 } 706 }
681 } break; 707 }
708 break;
682 } 709 }
683 710 }
684 if (val.encoding != JustBits) { 711 if (val.encoding != JUST_BITS) {
685 uint T = (D * C) + B; 712 uint T = (D * C) + B;
686 T ^= A; 713 T ^= A;
687 T = (A & 0x80) | (T >> 2); 714 T = (A & 0x80) | (T >> 2);
@@ -689,30 +716,31 @@ void DecodeColorValues(out uint color_values[32], uvec4 modes, uint num_partitio
689 } 716 }
690 } 717 }
691} 718}
719
692ivec2 BitTransferSigned(int a, int b) { 720ivec2 BitTransferSigned(int a, int b) {
693 ivec2 transferred; 721 ivec2 transferred;
694 transferred[1] = b >> 1; 722 transferred.y = b >> 1;
695 transferred[1] |= a & 0x80; 723 transferred.y |= a & 0x80;
696 transferred[0] = a >> 1; 724 transferred.x = a >> 1;
697 transferred[0] &= 0x3F; 725 transferred.x &= 0x3F;
698 if ((transferred[0] & 0x20) > 0) { 726 if ((transferred.x & 0x20) > 0) {
699 transferred[0] -= 0x40; 727 transferred.x -= 0x40;
700 } 728 }
701 return transferred; 729 return transferred;
702} 730}
703 731
704uvec4 ClampByte(ivec4 color) { 732uvec4 ClampByte(ivec4 color) {
705 for (uint i = 0; i < 4; i++) { 733 for (uint i = 0; i < 4; ++i) {
706 color[i] = (color[i] < 0) ? 0 : ((color[i] > 255) ? 255 : color[i]); 734 color[i] = (color[i] < 0) ? 0 : ((color[i] > 255) ? 255 : color[i]);
707 } 735 }
708 return uvec4(color); 736 return uvec4(color);
709} 737}
738
710ivec4 BlueContract(int a, int r, int g, int b) { 739ivec4 BlueContract(int a, int r, int g, int b) {
711 return ivec4(a, (r + b) >> 1, (g + b) >> 1, b); 740 return ivec4(a, (r + b) >> 1, (g + b) >> 1, b);
712} 741}
713int colvals_index = 0; 742
714void ComputeEndpoints(out uvec4 ep1, out uvec4 ep2, uint color_values[32], 743void ComputeEndpoints(out uvec4 ep1, out uvec4 ep2, uint color_endpoint_mode) {
715 uint color_endpoint_mode) {
716#define READ_UINT_VALUES(N) \ 744#define READ_UINT_VALUES(N) \
717 uint v[N]; \ 745 uint v[N]; \
718 for (uint i = 0; i < N; i++) { \ 746 for (uint i = 0; i < N; i++) { \
@@ -730,113 +758,120 @@ void ComputeEndpoints(out uvec4 ep1, out uvec4 ep2, uint color_values[32],
730 READ_UINT_VALUES(2) 758 READ_UINT_VALUES(2)
731 ep1 = uvec4(0xFF, v[0], v[0], v[0]); 759 ep1 = uvec4(0xFF, v[0], v[0], v[0]);
732 ep2 = uvec4(0xFF, v[1], v[1], v[1]); 760 ep2 = uvec4(0xFF, v[1], v[1], v[1]);
733 } break; 761 break;
734 762 }
735 case 1: { 763 case 1: {
736 READ_UINT_VALUES(2) 764 READ_UINT_VALUES(2)
737 uint L0 = (v[0] >> 2) | (v[1] & 0xC0); 765 uint L0 = (v[0] >> 2) | (v[1] & 0xC0);
738 uint L1 = max(L0 + (v[1] & 0x3F), 0xFFU); 766 uint L1 = max(L0 + (v[1] & 0x3F), 0xFFU);
739 ep1 = uvec4(0xFF, L0, L0, L0); 767 ep1 = uvec4(0xFF, L0, L0, L0);
740 ep2 = uvec4(0xFF, L1, L1, L1); 768 ep2 = uvec4(0xFF, L1, L1, L1);
741 } break; 769 break;
742 770 }
743 case 4: { 771 case 4: {
744 READ_UINT_VALUES(4) 772 READ_UINT_VALUES(4)
745 ep1 = uvec4(v[2], v[0], v[0], v[0]); 773 ep1 = uvec4(v[2], v[0], v[0], v[0]);
746 ep2 = uvec4(v[3], v[1], v[1], v[1]); 774 ep2 = uvec4(v[3], v[1], v[1], v[1]);
747 } break; 775 break;
748 776 }
749 case 5: { 777 case 5: {
750 READ_INT_VALUES(4) 778 READ_INT_VALUES(4)
751 ivec2 transferred = BitTransferSigned(v[1], v[0]); 779 ivec2 transferred = BitTransferSigned(v[1], v[0]);
752 v[1] = transferred[0]; 780 v[1] = transferred.x;
753 v[0] = transferred[1]; 781 v[0] = transferred.y;
754 transferred = BitTransferSigned(v[3], v[2]); 782 transferred = BitTransferSigned(v[3], v[2]);
755 v[3] = transferred[0]; 783 v[3] = transferred.x;
756 v[2] = transferred[1]; 784 v[2] = transferred.y;
757 ep1 = ClampByte(ivec4(v[2], v[0], v[0], v[0])); 785 ep1 = ClampByte(ivec4(v[2], v[0], v[0], v[0]));
758 ep2 = ClampByte(ivec4((v[2] + v[3]), v[0] + v[1], v[0] + v[1], v[0] + v[1])); 786 ep2 = ClampByte(ivec4(v[2] + v[3], v[0] + v[1], v[0] + v[1], v[0] + v[1]));
759 } break; 787 break;
760 788 }
761 case 6: { 789 case 6: {
762 READ_UINT_VALUES(4) 790 READ_UINT_VALUES(4)
763 ep1 = uvec4(0xFF, v[0] * v[3] >> 8, v[1] * v[3] >> 8, v[2] * v[3] >> 8); 791 ep1 = uvec4(0xFF, (v[0] * v[3]) >> 8, (v[1] * v[3]) >> 8, (v[2] * v[3]) >> 8);
764 ep2 = uvec4(0xFF, v[0], v[1], v[2]); 792 ep2 = uvec4(0xFF, v[0], v[1], v[2]);
765 } break; 793 break;
766 794 }
767 case 8: { 795 case 8: {
768 READ_UINT_VALUES(6) 796 READ_UINT_VALUES(6)
769 if (v[1] + v[3] + v[5] >= v[0] + v[2] + v[4]) { 797 if ((v[1] + v[3] + v[5]) >= (v[0] + v[2] + v[4])) {
770 ep1 = uvec4(0xFF, v[0], v[2], v[4]); 798 ep1 = uvec4(0xFF, v[0], v[2], v[4]);
771 ep2 = uvec4(0xFF, v[1], v[3], v[5]); 799 ep2 = uvec4(0xFF, v[1], v[3], v[5]);
772 } else { 800 } else {
773 ep1 = uvec4(BlueContract(0xFF, int(v[1]), int(v[3]), int(v[5]))); 801 ep1 = uvec4(BlueContract(0xFF, int(v[1]), int(v[3]), int(v[5])));
774 ep2 = uvec4(BlueContract(0xFF, int(v[0]), int(v[2]), int(v[4]))); 802 ep2 = uvec4(BlueContract(0xFF, int(v[0]), int(v[2]), int(v[4])));
775 } 803 }
776 } break; 804 break;
777 805 }
778 case 9: { 806 case 9: {
779 READ_INT_VALUES(6) 807 READ_INT_VALUES(6)
780 ivec2 transferred = BitTransferSigned(v[1], v[0]); 808 ivec2 transferred = BitTransferSigned(v[1], v[0]);
781 v[1] = transferred[0]; 809 v[1] = transferred.x;
782 v[0] = transferred[1]; 810 v[0] = transferred.y;
783 transferred = BitTransferSigned(v[3], v[2]); 811 transferred = BitTransferSigned(v[3], v[2]);
784 v[3] = transferred[0]; 812 v[3] = transferred.x;
785 v[2] = transferred[1]; 813 v[2] = transferred.y;
786 transferred = BitTransferSigned(v[5], v[4]); 814 transferred = BitTransferSigned(v[5], v[4]);
787 v[5] = transferred[0]; 815 v[5] = transferred.x;
788 v[4] = transferred[1]; 816 v[4] = transferred.y;
789 if (v[1] + v[3] + v[5] >= 0) { 817 if ((v[1] + v[3] + v[5]) >= 0) {
790 ep1 = ClampByte(ivec4(0xFF, v[0], v[2], v[4])); 818 ep1 = ClampByte(ivec4(0xFF, v[0], v[2], v[4]));
791 ep2 = ClampByte(ivec4(0xFF, v[0] + v[1], v[2] + v[3], v[4] + v[5])); 819 ep2 = ClampByte(ivec4(0xFF, v[0] + v[1], v[2] + v[3], v[4] + v[5]));
792 } else { 820 } else {
793 ep1 = ClampByte(BlueContract(0xFF, v[0] + v[1], v[2] + v[3], v[4] + v[5])); 821 ep1 = ClampByte(BlueContract(0xFF, v[0] + v[1], v[2] + v[3], v[4] + v[5]));
794 ep2 = ClampByte(BlueContract(0xFF, v[0], v[2], v[4])); 822 ep2 = ClampByte(BlueContract(0xFF, v[0], v[2], v[4]));
795 } 823 }
796 } break; 824 break;
797 825 }
798 case 10: { 826 case 10: {
799 READ_UINT_VALUES(6) 827 READ_UINT_VALUES(6)
800 ep1 = uvec4(v[4], v[0] * v[3] >> 8, v[1] * v[3] >> 8, v[2] * v[3] >> 8); 828 ep1 = uvec4(v[4], (v[0] * v[3]) >> 8, (v[1] * v[3]) >> 8, (v[2] * v[3]) >> 8);
801 ep2 = uvec4(v[5], v[0], v[1], v[2]); 829 ep2 = uvec4(v[5], v[0], v[1], v[2]);
802 } break; 830 break;
803 831 }
804 case 12: { 832 case 12: {
805 READ_UINT_VALUES(8) 833 READ_UINT_VALUES(8)
806 if (v[1] + v[3] + v[5] >= v[0] + v[2] + v[4]) { 834 if ((v[1] + v[3] + v[5]) >= (v[0] + v[2] + v[4])) {
807 ep1 = uvec4(v[6], v[0], v[2], v[4]); 835 ep1 = uvec4(v[6], v[0], v[2], v[4]);
808 ep2 = uvec4(v[7], v[1], v[3], v[5]); 836 ep2 = uvec4(v[7], v[1], v[3], v[5]);
809 } else { 837 } else {
810 ep1 = uvec4(BlueContract(int(v[7]), int(v[1]), int(v[3]), int(v[5]))); 838 ep1 = uvec4(BlueContract(int(v[7]), int(v[1]), int(v[3]), int(v[5])));
811 ep2 = uvec4(BlueContract(int(v[6]), int(v[0]), int(v[2]), int(v[4]))); 839 ep2 = uvec4(BlueContract(int(v[6]), int(v[0]), int(v[2]), int(v[4])));
812 } 840 }
813 } break; 841 break;
814 842 }
815 case 13: { 843 case 13: {
816 READ_INT_VALUES(8) 844 READ_INT_VALUES(8)
817 ivec2 transferred = BitTransferSigned(v[1], v[0]); 845 ivec2 transferred = BitTransferSigned(v[1], v[0]);
818 v[1] = transferred[0]; 846 v[1] = transferred.x;
819 v[0] = transferred[1]; 847 v[0] = transferred.y;
820 transferred = BitTransferSigned(v[3], v[2]); 848 transferred = BitTransferSigned(v[3], v[2]);
821 v[3] = transferred[0]; 849 v[3] = transferred.x;
822 v[2] = transferred[1]; 850 v[2] = transferred.y;
823 851
824 transferred = BitTransferSigned(v[5], v[4]); 852 transferred = BitTransferSigned(v[5], v[4]);
825 v[5] = transferred[0]; 853 v[5] = transferred.x;
826 v[4] = transferred[1]; 854 v[4] = transferred.y;
827 855
828 transferred = BitTransferSigned(v[7], v[6]); 856 transferred = BitTransferSigned(v[7], v[6]);
829 v[7] = transferred[0]; 857 v[7] = transferred.x;
830 v[6] = transferred[1]; 858 v[6] = transferred.y;
831 859
832 if (v[1] + v[3] + v[5] >= 0) { 860 if ((v[1] + v[3] + v[5]) >= 0) {
833 ep1 = ClampByte(ivec4(v[6], v[0], v[2], v[4])); 861 ep1 = ClampByte(ivec4(v[6], v[0], v[2], v[4]));
834 ep2 = ClampByte(ivec4(v[7] + v[6], v[0] + v[1], v[2] + v[3], v[4] + v[5])); 862 ep2 = ClampByte(ivec4(v[7] + v[6], v[0] + v[1], v[2] + v[3], v[4] + v[5]));
835 } else { 863 } else {
836 ep1 = ClampByte(BlueContract(v[6] + v[7], v[0] + v[1], v[2] + v[3], v[4] + v[5])); 864 ep1 = ClampByte(BlueContract(v[6] + v[7], v[0] + v[1], v[2] + v[3], v[4] + v[5]));
837 ep2 = ClampByte(BlueContract(v[6], v[0], v[2], v[4])); 865 ep2 = ClampByte(BlueContract(v[6], v[0], v[2], v[4]));
838 } 866 }
839 } break; 867 break;
868 }
869 default: {
870 // HDR mode, or more likely a bug computing the color_endpoint_mode
871 ep1 = uvec4(0xFF, 0xFF, 0, 0);
872 ep2 = uvec4(0xFF, 0xFF, 0, 0);
873 break;
874 }
840 } 875 }
841#undef READ_UINT_VALUES 876#undef READ_UINT_VALUES
842#undef READ_INT_VALUES 877#undef READ_INT_VALUES
@@ -849,52 +884,61 @@ uint UnquantizeTexelWeight(EncodingData val) {
849 uint B = 0, C = 0, D = 0; 884 uint B = 0, C = 0, D = 0;
850 uint result = 0; 885 uint result = 0;
851 switch (val.encoding) { 886 switch (val.encoding) {
852 case JustBits: 887 case JUST_BITS:
853 result = FastReplicateTo6(bitval, bitlen); 888 result = FastReplicateTo6(bitval, bitlen);
854 break; 889 break;
855 case Trit: { 890 case TRIT: {
856 D = val.quint_trit_value; 891 D = val.quint_trit_value;
857 switch (bitlen) { 892 switch (bitlen) {
858 case 0: { 893 case 0: {
859 uint results[3] = {0, 32, 63}; 894 uint results[3] = {0, 32, 63};
860 result = results[D]; 895 result = results[D];
861 } break; 896 break;
897 }
862 case 1: { 898 case 1: {
863 C = 50; 899 C = 50;
864 } break; 900 break;
901 }
865 case 2: { 902 case 2: {
866 C = 23; 903 C = 23;
867 uint b = (bitval >> 1) & 1; 904 uint b = (bitval >> 1) & 1;
868 B = (b << 6) | (b << 2) | b; 905 B = (b << 6) | (b << 2) | b;
869 } break; 906 break;
907 }
870 case 3: { 908 case 3: {
871 C = 11; 909 C = 11;
872 uint cb = (bitval >> 1) & 3; 910 uint cb = (bitval >> 1) & 3;
873 B = (cb << 5) | cb; 911 B = (cb << 5) | cb;
874 } break; 912 break;
913 }
875 default: 914 default:
876 break; 915 break;
877 } 916 }
878 } break; 917 break;
879 case Quint: { 918 }
919 case QUINT: {
880 D = val.quint_trit_value; 920 D = val.quint_trit_value;
881 switch (bitlen) { 921 switch (bitlen) {
882 case 0: { 922 case 0: {
883 uint results[5] = {0, 16, 32, 47, 63}; 923 uint results[5] = {0, 16, 32, 47, 63};
884 result = results[D]; 924 result = results[D];
885 } break; 925 break;
926 }
886 case 1: { 927 case 1: {
887 C = 28; 928 C = 28;
888 } break; 929 break;
930 }
889 case 2: { 931 case 2: {
890 C = 13; 932 C = 13;
891 uint b = (bitval >> 1) & 1; 933 uint b = (bitval >> 1) & 1;
892 B = (b << 6) | (b << 1); 934 B = (b << 6) | (b << 1);
893 } break; 935 break;
894 } 936 }
895 } break; 937 }
938 break;
896 } 939 }
897 if (val.encoding != JustBits && bitlen > 0) { 940 }
941 if (val.encoding != JUST_BITS && bitlen > 0) {
898 result = D * C + B; 942 result = D * C + B;
899 result ^= A; 943 result ^= A;
900 result = (A & 0x20) | (result >> 2); 944 result = (A & 0x20) | (result >> 2);
@@ -905,7 +949,7 @@ uint UnquantizeTexelWeight(EncodingData val) {
905 return result; 949 return result;
906} 950}
907 951
908void UnquantizeTexelWeights(out uint outbuffer[2][144], bool dual_plane, uvec2 size) { 952void UnquantizeTexelWeights(bool dual_plane, uvec2 size) {
909 uint weight_idx = 0; 953 uint weight_idx = 0;
910 uint unquantized[2][144]; 954 uint unquantized[2][144];
911 uint area = size.x * size.y; 955 uint area = size.x * size.y;
@@ -921,11 +965,12 @@ void UnquantizeTexelWeights(out uint outbuffer[2][144], bool dual_plane, uvec2 s
921 if (++weight_idx >= (area)) 965 if (++weight_idx >= (area))
922 break; 966 break;
923 } 967 }
924 uint Ds = uint((block_dims.x * 0.5f + 1024) / (block_dims.x - 1)); 968
925 uint Dt = uint((block_dims.y * 0.5f + 1024) / (block_dims.y - 1)); 969 const uint Ds = uint((block_dims.x * 0.5f + 1024) / (block_dims.x - 1));
926 uint kPlaneScale = dual_plane ? 2 : 1; 970 const uint Dt = uint((block_dims.y * 0.5f + 1024) / (block_dims.y - 1));
927 for (uint plane = 0; plane < kPlaneScale; plane++) 971 const uint k_plane_scale = dual_plane ? 2 : 1;
928 for (uint t = 0; t < block_dims.y; t++) 972 for (uint plane = 0; plane < k_plane_scale; plane++) {
973 for (uint t = 0; t < block_dims.y; t++) {
929 for (uint s = 0; s < block_dims.x; s++) { 974 for (uint s = 0; s < block_dims.x; s++) {
930 uint cs = Ds * s; 975 uint cs = Ds * s;
931 uint ct = Dt * t; 976 uint ct = Dt * t;
@@ -955,8 +1000,10 @@ void UnquantizeTexelWeights(out uint outbuffer[2][144], bool dual_plane, uvec2 s
955 if ((v0 + size.x + 1) < (area)) { 1000 if ((v0 + size.x + 1) < (area)) {
956 p.w = unquantized[plane][(v0 + size.x + 1)]; 1001 p.w = unquantized[plane][(v0 + size.x + 1)];
957 } 1002 }
958 outbuffer[plane][t * block_dims.x + s] = (uint(dot(p, w)) + 8) >> 4; 1003 unquantized_texel_weights[plane][t * block_dims.x + s] = (uint(dot(p, w)) + 8) >> 4;
959 } 1004 }
1005 }
1006 }
960} 1007}
961 1008
962int FindLayout(uint mode) { 1009int FindLayout(uint mode) {
@@ -991,25 +1038,25 @@ int FindLayout(uint mode) {
991} 1038}
992 1039
993TexelWeightParams DecodeBlockInfo(uint block_index) { 1040TexelWeightParams DecodeBlockInfo(uint block_index) {
994 TexelWeightParams params = TexelWeightParams(uvec2(0), false, 0, false, false, false); 1041 TexelWeightParams params = TexelWeightParams(uvec2(0), 0, false, false, false, false);
995 uint mode = StreamBits(11); 1042 uint mode = StreamBits(11);
996 if ((mode & 0x1ff) == 0x1fc) { 1043 if ((mode & 0x1ff) == 0x1fc) {
997 if ((mode & 0x200) != 0) { 1044 if ((mode & 0x200) != 0) {
998 params.VoidExtentHDR = true; 1045 params.void_extent_hdr = true;
999 } else { 1046 } else {
1000 params.VoidExtentLDR = true; 1047 params.void_extent_ldr = true;
1001 } 1048 }
1002 if ((mode & 0x400) == 0 || StreamBits(1) == 0) { 1049 if ((mode & 0x400) == 0 || StreamBits(1) == 0) {
1003 params.Error = true; 1050 params.error_state = true;
1004 } 1051 }
1005 return params; 1052 return params;
1006 } 1053 }
1007 if ((mode & 0xf) == 0) { 1054 if ((mode & 0xf) == 0) {
1008 params.Error = true; 1055 params.error_state = true;
1009 return params; 1056 return params;
1010 } 1057 }
1011 if ((mode & 3) == 0 && (mode & 0x1c0) == 0x1c0) { 1058 if ((mode & 3) == 0 && (mode & 0x1c0) == 0x1c0) {
1012 params.Error = true; 1059 params.error_state = true;
1013 return params; 1060 return params;
1014 } 1061 }
1015 uint A, B; 1062 uint A, B;
@@ -1060,7 +1107,7 @@ TexelWeightParams DecodeBlockInfo(uint block_index) {
1060 params.size = uvec2(A + 6, B + 6); 1107 params.size = uvec2(A + 6, B + 6);
1061 break; 1108 break;
1062 default: 1109 default:
1063 params.Error = true; 1110 params.error_state = true;
1064 break; 1111 break;
1065 } 1112 }
1066 params.dual_plane = (mode_layout != 9) && ((mode & 0x400) != 0); 1113 params.dual_plane = (mode_layout != 9) && ((mode & 0x400) != 0);
@@ -1089,11 +1136,8 @@ void FillError(ivec3 coord) {
1089 } 1136 }
1090} 1137}
1091 1138
1092void FillVoidExtentLDR(ivec3 coord, uint block_index) { 1139void FillVoidExtentLDR(ivec3 coord) {
1093 for (int i = 0; i < 4; i++) { 1140 StreamBits(52);
1094 StreamBits(13);
1095 }
1096
1097 uint r_u = StreamBits(16); 1141 uint r_u = StreamBits(16);
1098 uint g_u = StreamBits(16); 1142 uint g_u = StreamBits(16);
1099 uint b_u = StreamBits(16); 1143 uint b_u = StreamBits(16);
@@ -1110,21 +1154,20 @@ void FillVoidExtentLDR(ivec3 coord, uint block_index) {
1110} 1154}
1111 1155
1112void DecompressBlock(ivec3 coord, uint block_index) { 1156void DecompressBlock(ivec3 coord, uint block_index) {
1113 TexelWeightParams params; 1157 TexelWeightParams params = DecodeBlockInfo(block_index);
1114 params = DecodeBlockInfo(block_index); 1158 if (params.error_state) {
1115 if (params.Error) {
1116 FillError(coord); 1159 FillError(coord);
1117 return; 1160 return;
1118 } 1161 }
1119 if (params.VoidExtentHDR) { 1162 if (params.void_extent_hdr) {
1120 FillError(coord); 1163 FillError(coord);
1121 return; 1164 return;
1122 } 1165 }
1123 if (params.VoidExtentLDR) { 1166 if (params.void_extent_ldr) {
1124 FillVoidExtentLDR(coord, block_index); 1167 FillVoidExtentLDR(coord);
1125 return; 1168 return;
1126 } 1169 }
1127 if (params.size.x > block_dims.x || params.size.y > block_dims.y) { 1170 if ((params.size.x > block_dims.x) || (params.size.y > block_dims.y)) {
1128 FillError(coord); 1171 FillError(coord);
1129 return; 1172 return;
1130 } 1173 }
@@ -1139,7 +1182,7 @@ void DecompressBlock(ivec3 coord, uint block_index) {
1139 uint ced_pointer = 0; 1182 uint ced_pointer = 0;
1140 uint base_cem = 0; 1183 uint base_cem = 0;
1141 if (num_partitions == 1) { 1184 if (num_partitions == 1) {
1142 color_endpoint_mode[0] = StreamBits(4); 1185 color_endpoint_mode.x = StreamBits(4);
1143 partition_index = 0; 1186 partition_index = 0;
1144 } else { 1187 } else {
1145 partition_index = StreamBits(10); 1188 partition_index = StreamBits(10);
@@ -1181,7 +1224,7 @@ void DecompressBlock(ivec3 coord, uint block_index) {
1181 int nb = int(min(remaining_bits, 8U)); 1224 int nb = int(min(remaining_bits, 8U));
1182 uint b = StreamBits(nb); 1225 uint b = StreamBits(nb);
1183 color_endpoint_data[ced_pointer] = uint(bitfieldExtract(b, 0, nb)); 1226 color_endpoint_data[ced_pointer] = uint(bitfieldExtract(b, 0, nb));
1184 ced_pointer++; 1227 ++ced_pointer;
1185 remaining_bits -= nb; 1228 remaining_bits -= nb;
1186 } 1229 }
1187 plane_index = int(StreamBits(plane_selector_bits)); 1230 plane_index = int(StreamBits(plane_selector_bits));
@@ -1189,20 +1232,20 @@ void DecompressBlock(ivec3 coord, uint block_index) {
1189 uint extra_cem = StreamBits(extra_cem_bits); 1232 uint extra_cem = StreamBits(extra_cem_bits);
1190 uint cem = (extra_cem << 6) | base_cem; 1233 uint cem = (extra_cem << 6) | base_cem;
1191 cem >>= 2; 1234 cem >>= 2;
1192 uint C[4] = {0, 0, 0, 0}; 1235 uvec4 C = uvec4(0);
1193 for (uint i = 0; i < num_partitions; i++) { 1236 for (uint i = 0; i < num_partitions; i++) {
1194 C[i] = cem & 1; 1237 C[i] = (cem & 1);
1195 cem >>= 1; 1238 cem >>= 1;
1196 } 1239 }
1197 uint M[4] = {0, 0, 0, 0}; 1240 uvec4 M = uvec4(0);
1198 for (uint i = 0; i < num_partitions; i++) { 1241 for (uint i = 0; i < num_partitions; i++) {
1199 M[i] = cem & 3; 1242 M[i] = cem & 3;
1200 cem >>= 2; 1243 cem >>= 2;
1201 } 1244 }
1202 for (uint i = 0; i < num_partitions; i++) { 1245 for (uint i = 0; i < num_partitions; i++) {
1203 color_endpoint_mode[i] = base_mode; 1246 color_endpoint_mode[i] = base_mode;
1204 if ((C[i]) == 0) { 1247 if (C[i] == 0) {
1205 color_endpoint_mode[i] -= 1; 1248 --color_endpoint_mode[i];
1206 } 1249 }
1207 color_endpoint_mode[i] <<= 2; 1250 color_endpoint_mode[i] <<= 2;
1208 color_endpoint_mode[i] |= M[i]; 1251 color_endpoint_mode[i] |= M[i];
@@ -1213,13 +1256,13 @@ void DecompressBlock(ivec3 coord, uint block_index) {
1213 color_endpoint_mode[i] = cem; 1256 color_endpoint_mode[i] = cem;
1214 } 1257 }
1215 } 1258 }
1259 DecodeColorValues(color_endpoint_mode, num_partitions, color_data_bits);
1216 1260
1217 uint color_values[32]; // Four values, two endpoints, four maximum paritions
1218 DecodeColorValues(color_values, color_endpoint_mode, num_partitions, color_data_bits);
1219 uvec4 endpoints[4][2]; 1261 uvec4 endpoints[4][2];
1220 for (uint i = 0; i < num_partitions; i++) { 1262 for (uint i = 0; i < num_partitions; i++) {
1221 ComputeEndpoints(endpoints[i][0], endpoints[i][1], color_values, color_endpoint_mode[i]); 1263 ComputeEndpoints(endpoints[i][0], endpoints[i][1], color_endpoint_mode[i]);
1222 } 1264 }
1265
1223 for (uint i = 0; i < 16; i++) { 1266 for (uint i = 0; i < 16; i++) {
1224 texel_weight_data[i] = local_buff[i]; 1267 texel_weight_data[i] = local_buff[i];
1225 } 1268 }
@@ -1238,12 +1281,13 @@ void DecompressBlock(ivec3 coord, uint block_index) {
1238 uint( 1281 uint(
1239 ((1 << (GetPackedBitSize(params.size, params.dual_plane, params.max_weight) % 8)) - 1)); 1282 ((1 << (GetPackedBitSize(params.size, params.dual_plane, params.max_weight) % 8)) - 1));
1240 for (uint i = 0; i < 16 - clear_byte_start; i++) { 1283 for (uint i = 0; i < 16 - clear_byte_start; i++) {
1241 texel_weight_data[clear_byte_start + i] = uint(0U); 1284 texel_weight_data[clear_byte_start + i] = 0U;
1242 } 1285 }
1243 texel_flag = true; // use texel "vector" and bit stream in integer decoding 1286 texel_flag = true; // use texel "vector" and bit stream in integer decoding
1244 DecodeIntegerSequence(params.max_weight, GetNumWeightValues(params.size, params.dual_plane)); 1287 DecodeIntegerSequence(params.max_weight, GetNumWeightValues(params.size, params.dual_plane));
1245 uint weights[2][144]; 1288
1246 UnquantizeTexelWeights(weights, params.dual_plane, params.size); 1289 UnquantizeTexelWeights(params.dual_plane, params.size);
1290
1247 for (uint j = 0; j < block_dims.y; j++) { 1291 for (uint j = 0; j < block_dims.y; j++) {
1248 for (uint i = 0; i < block_dims.x; i++) { 1292 for (uint i = 0; i < block_dims.x; i++) {
1249 uint local_partition = Select2DPartition(partition_index, i, j, num_partitions, 1293 uint local_partition = Select2DPartition(partition_index, i, j, num_partitions,
@@ -1257,9 +1301,9 @@ void DecompressBlock(ivec3 coord, uint block_index) {
1257 if (params.dual_plane && (((plane_index + 1) & 3) == c)) { 1301 if (params.dual_plane && (((plane_index + 1) & 3) == c)) {
1258 plane_vec[c] = 1; 1302 plane_vec[c] = 1;
1259 } 1303 }
1260 weight_vec[c] = weights[plane_vec[c]][j * block_dims.x + i]; 1304 weight_vec[c] = unquantized_texel_weights[plane_vec[c]][j * block_dims.x + i];
1261 } 1305 }
1262 vec4 Cf = vec4((C0 * (uvec4(64) - weight_vec) + C1 * weight_vec + uvec4(32)) >> 6); 1306 vec4 Cf = vec4((C0 * (uvec4(64) - weight_vec) + C1 * weight_vec + uvec4(32)) / 64);
1263 p = (Cf / 65535.0); 1307 p = (Cf / 65535.0);
1264 imageStore(dest_image, coord + ivec3(i, j, 0), p.gbar); 1308 imageStore(dest_image, coord + ivec3(i, j, 0), p.gbar);
1265 } 1309 }
@@ -1267,7 +1311,7 @@ void DecompressBlock(ivec3 coord, uint block_index) {
1267} 1311}
1268 1312
1269void main() { 1313void main() {
1270 uvec3 pos = gl_GlobalInvocationID + origin; 1314 uvec3 pos = gl_GlobalInvocationID;
1271 pos.x <<= bytes_per_block_log2; 1315 pos.x <<= bytes_per_block_log2;
1272 1316
1273 // Read as soon as possible due to its latency 1317 // Read as soon as possible due to its latency
@@ -1282,9 +1326,10 @@ void main() {
1282 offset += (pos.x >> GOB_SIZE_X_SHIFT) << x_shift; 1326 offset += (pos.x >> GOB_SIZE_X_SHIFT) << x_shift;
1283 offset += swizzle; 1327 offset += swizzle;
1284 1328
1285 const ivec3 coord = ivec3(gl_GlobalInvocationID * uvec3(block_dims, 1.0)); 1329 const ivec3 coord = ivec3(gl_GlobalInvocationID * uvec3(block_dims, 1));
1286 uint block_index = 1330 uint block_index =
1287 pos.z * num_image_blocks.x * num_image_blocks.y + pos.y * num_image_blocks.x + pos.x; 1331 pos.z * gl_WorkGroupSize.x * gl_WorkGroupSize.y + pos.y * gl_WorkGroupSize.x + pos.x;
1332
1288 current_index = 0; 1333 current_index = 0;
1289 bitsread = 0; 1334 bitsread = 0;
1290 for (int i = 0; i < 16; i++) { 1335 for (int i = 0; i < 16; i++) {