summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGravatar liamwhite2023-08-21 16:08:51 -0400
committerGravatar GitHub2023-08-21 16:08:51 -0400
commit18c08cee43ea674738897dff1c869e78016fd7a1 (patch)
tree5192c687b384d7589c9b7e6383bfe05c6a593e10
parentandroid: Use sensor landscape for landscape mode (#11337) (diff)
parentflatten color_values (diff)
downloadyuzu-18c08cee43ea674738897dff1c869e78016fd7a1.tar.gz
yuzu-18c08cee43ea674738897dff1c869e78016fd7a1.tar.xz
yuzu-18c08cee43ea674738897dff1c869e78016fd7a1.zip
Merge pull request #11149 from ameerj/astc-perf-prod
host_shaders: ASTC compute shader optimizations
Diffstat (limited to '')
-rw-r--r--src/video_core/host_shaders/astc_decoder.comp988
-rw-r--r--src/video_core/renderer_opengl/util_shaders.cpp1
2 files changed, 454 insertions, 535 deletions
diff --git a/src/video_core/host_shaders/astc_decoder.comp b/src/video_core/host_shaders/astc_decoder.comp
index bf2693559..5ff17cd0c 100644
--- a/src/video_core/host_shaders/astc_decoder.comp
+++ b/src/video_core/host_shaders/astc_decoder.comp
@@ -33,26 +33,14 @@ UNIFORM(6) uint block_height_mask;
33END_PUSH_CONSTANTS 33END_PUSH_CONSTANTS
34 34
35struct EncodingData { 35struct EncodingData {
36 uint encoding; 36 uint data;
37 uint num_bits;
38 uint bit_value;
39 uint quint_trit_value;
40}; 37};
41 38
42struct TexelWeightParams { 39layout(binding = BINDING_INPUT_BUFFER, std430) readonly restrict buffer InputBufferU32 {
43 uvec2 size;
44 uint max_weight;
45 bool dual_plane;
46 bool error_state;
47 bool void_extent_ldr;
48 bool void_extent_hdr;
49};
50
51layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU32 {
52 uvec4 astc_data[]; 40 uvec4 astc_data[];
53}; 41};
54 42
55layout(binding = BINDING_OUTPUT_IMAGE, rgba8) uniform writeonly image2DArray dest_image; 43layout(binding = BINDING_OUTPUT_IMAGE, rgba8) uniform writeonly restrict image2DArray dest_image;
56 44
57const uint GOB_SIZE_X_SHIFT = 6; 45const uint GOB_SIZE_X_SHIFT = 6;
58const uint GOB_SIZE_Y_SHIFT = 3; 46const uint GOB_SIZE_Y_SHIFT = 3;
@@ -60,64 +48,21 @@ const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT;
60 48
61const uint BYTES_PER_BLOCK_LOG2 = 4; 49const uint BYTES_PER_BLOCK_LOG2 = 4;
62 50
63const int JUST_BITS = 0; 51const uint JUST_BITS = 0u;
64const int QUINT = 1; 52const uint QUINT = 1u;
65const int TRIT = 2; 53const uint TRIT = 2u;
66 54
67// ASTC Encodings data, sorted in ascending order based on their BitLength value 55// ASTC Encodings data, sorted in ascending order based on their BitLength value
68// (see GetBitLength() function) 56// (see GetBitLength() function)
69EncodingData encoding_values[22] = EncodingData[]( 57const uint encoding_values[22] = uint[](
70 EncodingData(JUST_BITS, 0, 0, 0), EncodingData(JUST_BITS, 1, 0, 0), EncodingData(TRIT, 0, 0, 0), 58 (JUST_BITS), (JUST_BITS | (1u << 8u)), (TRIT), (JUST_BITS | (2u << 8u)),
71 EncodingData(JUST_BITS, 2, 0, 0), EncodingData(QUINT, 0, 0, 0), EncodingData(TRIT, 1, 0, 0), 59 (QUINT), (TRIT | (1u << 8u)), (JUST_BITS | (3u << 8u)), (QUINT | (1u << 8u)),
72 EncodingData(JUST_BITS, 3, 0, 0), EncodingData(QUINT, 1, 0, 0), EncodingData(TRIT, 2, 0, 0), 60 (TRIT | (2u << 8u)), (JUST_BITS | (4u << 8u)), (QUINT | (2u << 8u)), (TRIT | (3u << 8u)),
73 EncodingData(JUST_BITS, 4, 0, 0), EncodingData(QUINT, 2, 0, 0), EncodingData(TRIT, 3, 0, 0), 61 (JUST_BITS | (5u << 8u)), (QUINT | (3u << 8u)), (TRIT | (4u << 8u)), (JUST_BITS | (6u << 8u)),
74 EncodingData(JUST_BITS, 5, 0, 0), EncodingData(QUINT, 3, 0, 0), EncodingData(TRIT, 4, 0, 0), 62 (QUINT | (4u << 8u)), (TRIT | (5u << 8u)), (JUST_BITS | (7u << 8u)), (QUINT | (5u << 8u)),
75 EncodingData(JUST_BITS, 6, 0, 0), EncodingData(QUINT, 4, 0, 0), EncodingData(TRIT, 5, 0, 0), 63 (TRIT | (6u << 8u)), (JUST_BITS | (8u << 8u)));
76 EncodingData(JUST_BITS, 7, 0, 0), EncodingData(QUINT, 5, 0, 0), EncodingData(TRIT, 6, 0, 0),
77 EncodingData(JUST_BITS, 8, 0, 0)
78);
79
80// The following constants are expanded variants of the Replicate()
81// function calls corresponding to the following arguments:
82// value: index into the generated table
83// num_bits: the after "REPLICATE" in the table name. i.e. 4 is num_bits in REPLICATE_4.
84// to_bit: the integer after "TO_"
85const uint REPLICATE_BIT_TO_7_TABLE[2] = uint[](0, 127);
86const uint REPLICATE_1_BIT_TO_9_TABLE[2] = uint[](0, 511);
87
88const uint REPLICATE_1_BIT_TO_8_TABLE[2] = uint[](0, 255);
89const uint REPLICATE_2_BIT_TO_8_TABLE[4] = uint[](0, 85, 170, 255);
90const uint REPLICATE_3_BIT_TO_8_TABLE[8] = uint[](0, 36, 73, 109, 146, 182, 219, 255);
91const uint REPLICATE_4_BIT_TO_8_TABLE[16] =
92 uint[](0, 17, 34, 51, 68, 85, 102, 119, 136, 153, 170, 187, 204, 221, 238, 255);
93const uint REPLICATE_5_BIT_TO_8_TABLE[32] =
94 uint[](0, 8, 16, 24, 33, 41, 49, 57, 66, 74, 82, 90, 99, 107, 115, 123, 132, 140, 148, 156, 165,
95 173, 181, 189, 198, 206, 214, 222, 231, 239, 247, 255);
96const uint REPLICATE_1_BIT_TO_6_TABLE[2] = uint[](0, 63);
97const uint REPLICATE_2_BIT_TO_6_TABLE[4] = uint[](0, 21, 42, 63);
98const uint REPLICATE_3_BIT_TO_6_TABLE[8] = uint[](0, 9, 18, 27, 36, 45, 54, 63);
99const uint REPLICATE_4_BIT_TO_6_TABLE[16] =
100 uint[](0, 4, 8, 12, 17, 21, 25, 29, 34, 38, 42, 46, 51, 55, 59, 63);
101const uint REPLICATE_5_BIT_TO_6_TABLE[32] =
102 uint[](0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 33, 35, 37, 39, 41, 43, 45,
103 47, 49, 51, 53, 55, 57, 59, 61, 63);
104const uint REPLICATE_6_BIT_TO_8_TABLE[64] =
105 uint[](0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 65, 69, 73, 77, 81, 85, 89,
106 93, 97, 101, 105, 109, 113, 117, 121, 125, 130, 134, 138, 142, 146, 150, 154, 158, 162,
107 166, 170, 174, 178, 182, 186, 190, 195, 199, 203, 207, 211, 215, 219, 223, 227, 231, 235,
108 239, 243, 247, 251, 255);
109const uint REPLICATE_7_BIT_TO_8_TABLE[128] =
110 uint[](0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44,
111 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88,
112 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126,
113 129, 131, 133, 135, 137, 139, 141, 143, 145, 147, 149, 151, 153, 155, 157, 159, 161, 163,
114 165, 167, 169, 171, 173, 175, 177, 179, 181, 183, 185, 187, 189, 191, 193, 195, 197, 199,
115 201, 203, 205, 207, 209, 211, 213, 215, 217, 219, 221, 223, 225, 227, 229, 231, 233, 235,
116 237, 239, 241, 243, 245, 247, 249, 251, 253, 255);
117 64
118// Input ASTC texture globals 65// Input ASTC texture globals
119uint current_index = 0;
120int bitsread = 0;
121int total_bitsread = 0; 66int total_bitsread = 0;
122uvec4 local_buff; 67uvec4 local_buff;
123 68
@@ -125,50 +70,60 @@ uvec4 local_buff;
125uvec4 color_endpoint_data; 70uvec4 color_endpoint_data;
126int color_bitsread = 0; 71int color_bitsread = 0;
127 72
128// Four values, two endpoints, four maximum partitions 73// Global "vector" to be pushed into when decoding
129uint color_values[32]; 74// At most will require BLOCK_WIDTH x BLOCK_HEIGHT in single plane mode
130int colvals_index = 0; 75// At most will require BLOCK_WIDTH x BLOCK_HEIGHT x 2 in dual plane mode
131 76// So the maximum would be 144 (12 x 12) elements, x 2 for two planes
132// Weight data globals 77#define DIVCEIL(number, divisor) (number + divisor - 1) / divisor
133uvec4 texel_weight_data; 78#define ARRAY_NUM_ELEMENTS 144
134int texel_bitsread = 0; 79#define VECTOR_ARRAY_SIZE DIVCEIL(ARRAY_NUM_ELEMENTS * 2, 4)
80uint result_vector[ARRAY_NUM_ELEMENTS * 2];
135 81
136bool texel_flag = false;
137
138// Global "vectors" to be pushed into when decoding
139EncodingData result_vector[144];
140int result_index = 0; 82int result_index = 0;
83uint result_vector_max_index;
84bool result_limit_reached = false;
141 85
142EncodingData texel_vector[144]; 86// EncodingData helpers
143int texel_vector_index = 0; 87uint Encoding(EncodingData val) {
88 return bitfieldExtract(val.data, 0, 8);
89}
90uint NumBits(EncodingData val) {
91 return bitfieldExtract(val.data, 8, 8);
92}
93uint BitValue(EncodingData val) {
94 return bitfieldExtract(val.data, 16, 8);
95}
96uint QuintTritValue(EncodingData val) {
97 return bitfieldExtract(val.data, 24, 8);
98}
144 99
145uint unquantized_texel_weights[2][144]; 100void Encoding(inout EncodingData val, uint v) {
101 val.data = bitfieldInsert(val.data, v, 0, 8);
102}
103void NumBits(inout EncodingData val, uint v) {
104 val.data = bitfieldInsert(val.data, v, 8, 8);
105}
106void BitValue(inout EncodingData val, uint v) {
107 val.data = bitfieldInsert(val.data, v, 16, 8);
108}
109void QuintTritValue(inout EncodingData val, uint v) {
110 val.data = bitfieldInsert(val.data, v, 24, 8);
111}
146 112
147uint SwizzleOffset(uvec2 pos) { 113EncodingData CreateEncodingData(uint encoding, uint num_bits, uint bit_val, uint quint_trit_val) {
148 uint x = pos.x; 114 return EncodingData(((encoding) << 0u) | ((num_bits) << 8u) |
149 uint y = pos.y; 115 ((bit_val) << 16u) | ((quint_trit_val) << 24u));
150 return ((x % 64) / 32) * 256 + ((y % 8) / 2) * 64 + ((x % 32) / 16) * 32 +
151 (y % 2) * 16 + (x % 16);
152} 116}
153 117
154// Replicates low num_bits such that [(to_bit - 1):(to_bit - 1 - from_bit)] 118
155// is the same as [(num_bits - 1):0] and repeats all the way down. 119void ResultEmplaceBack(EncodingData val) {
156uint Replicate(uint val, uint num_bits, uint to_bit) { 120 if (result_index >= result_vector_max_index) {
157 const uint v = val & uint((1 << num_bits) - 1); 121 // Alert callers to avoid decoding more than needed by this phase
158 uint res = v; 122 result_limit_reached = true;
159 uint reslen = num_bits; 123 return;
160 while (reslen < to_bit) {
161 uint comp = 0;
162 if (num_bits > to_bit - reslen) {
163 uint newshift = to_bit - reslen;
164 comp = num_bits - newshift;
165 num_bits = newshift;
166 }
167 res = uint(res << num_bits);
168 res = uint(res | (v >> comp));
169 reslen += num_bits;
170 } 124 }
171 return res; 125 result_vector[result_index] = val.data;
126 ++result_index;
172} 127}
173 128
174uvec4 ReplicateByteTo16(uvec4 value) { 129uvec4 ReplicateByteTo16(uvec4 value) {
@@ -176,64 +131,40 @@ uvec4 ReplicateByteTo16(uvec4 value) {
176} 131}
177 132
178uint ReplicateBitTo7(uint value) { 133uint ReplicateBitTo7(uint value) {
179 return REPLICATE_BIT_TO_7_TABLE[value]; 134 return value * 127;
180} 135}
181 136
182uint ReplicateBitTo9(uint value) { 137uint ReplicateBitTo9(uint value) {
183 return REPLICATE_1_BIT_TO_9_TABLE[value]; 138 return value * 511;
184} 139}
185 140
186uint FastReplicate(uint value, uint num_bits, uint to_bit) { 141uint ReplicateBits(uint value, uint num_bits, uint to_bit) {
187 if (num_bits == 0) { 142 if (value == 0 || num_bits == 0) {
188 return 0; 143 return 0;
189 } 144 }
190 if (num_bits == to_bit) { 145 if (num_bits >= to_bit) {
191 return value; 146 return value;
192 } 147 }
193 if (to_bit == 6) { 148 const uint v = value & uint((1 << num_bits) - 1);
194 switch (num_bits) { 149 uint res = v;
195 case 1: 150 uint reslen = num_bits;
196 return REPLICATE_1_BIT_TO_6_TABLE[value]; 151 while (reslen < to_bit) {
197 case 2: 152 const uint num_dst_bits_to_shift_up = min(num_bits, to_bit - reslen);
198 return REPLICATE_2_BIT_TO_6_TABLE[value]; 153 const uint num_src_bits_to_shift_down = num_bits - num_dst_bits_to_shift_up;
199 case 3: 154
200 return REPLICATE_3_BIT_TO_6_TABLE[value]; 155 res <<= num_dst_bits_to_shift_up;
201 case 4: 156 res |= (v >> num_src_bits_to_shift_down);
202 return REPLICATE_4_BIT_TO_6_TABLE[value]; 157 reslen += num_bits;
203 case 5:
204 return REPLICATE_5_BIT_TO_6_TABLE[value];
205 default:
206 break;
207 }
208 } else { /* if (to_bit == 8) */
209 switch (num_bits) {
210 case 1:
211 return REPLICATE_1_BIT_TO_8_TABLE[value];
212 case 2:
213 return REPLICATE_2_BIT_TO_8_TABLE[value];
214 case 3:
215 return REPLICATE_3_BIT_TO_8_TABLE[value];
216 case 4:
217 return REPLICATE_4_BIT_TO_8_TABLE[value];
218 case 5:
219 return REPLICATE_5_BIT_TO_8_TABLE[value];
220 case 6:
221 return REPLICATE_6_BIT_TO_8_TABLE[value];
222 case 7:
223 return REPLICATE_7_BIT_TO_8_TABLE[value];
224 default:
225 break;
226 }
227 } 158 }
228 return Replicate(value, num_bits, to_bit); 159 return res;
229} 160}
230 161
231uint FastReplicateTo8(uint value, uint num_bits) { 162uint FastReplicateTo8(uint value, uint num_bits) {
232 return FastReplicate(value, num_bits, 8); 163 return ReplicateBits(value, num_bits, 8);
233} 164}
234 165
235uint FastReplicateTo6(uint value, uint num_bits) { 166uint FastReplicateTo6(uint value, uint num_bits) {
236 return FastReplicate(value, num_bits, 6); 167 return ReplicateBits(value, num_bits, 6);
237} 168}
238 169
239uint Div3Floor(uint v) { 170uint Div3Floor(uint v) {
@@ -266,15 +197,15 @@ uint Hash52(uint p) {
266 return p; 197 return p;
267} 198}
268 199
269uint Select2DPartition(uint seed, uint x, uint y, uint partition_count, bool small_block) { 200uint Select2DPartition(uint seed, uint x, uint y, uint partition_count) {
270 if (small_block) { 201 if ((block_dims.y * block_dims.x) < 32) {
271 x <<= 1; 202 x <<= 1;
272 y <<= 1; 203 y <<= 1;
273 } 204 }
274 205
275 seed += (partition_count - 1) * 1024; 206 seed += (partition_count - 1) * 1024;
276 207
277 uint rnum = Hash52(uint(seed)); 208 const uint rnum = Hash52(uint(seed));
278 uint seed1 = uint(rnum & 0xF); 209 uint seed1 = uint(rnum & 0xF);
279 uint seed2 = uint((rnum >> 4) & 0xF); 210 uint seed2 = uint((rnum >> 4) & 0xF);
280 uint seed3 = uint((rnum >> 8) & 0xF); 211 uint seed3 = uint((rnum >> 8) & 0xF);
@@ -342,53 +273,52 @@ uint ExtractBits(uvec4 payload, int offset, int bits) {
342 if (bits <= 0) { 273 if (bits <= 0) {
343 return 0; 274 return 0;
344 } 275 }
345 int last_offset = offset + bits - 1; 276 if (bits > 32) {
346 int shifted_offset = offset >> 5; 277 return 0;
278 }
279 const int last_offset = offset + bits - 1;
280 const int shifted_offset = offset >> 5;
347 if ((last_offset >> 5) == shifted_offset) { 281 if ((last_offset >> 5) == shifted_offset) {
348 return bitfieldExtract(payload[shifted_offset], offset & 31, bits); 282 return bitfieldExtract(payload[shifted_offset], offset & 31, bits);
349 } 283 }
350 int first_bits = 32 - (offset & 31); 284 const int first_bits = 32 - (offset & 31);
351 int result_first = int(bitfieldExtract(payload[shifted_offset], offset & 31, first_bits)); 285 const int result_first = int(bitfieldExtract(payload[shifted_offset], offset & 31, first_bits));
352 int result_second = int(bitfieldExtract(payload[shifted_offset + 1], 0, bits - first_bits)); 286 const int result_second = int(bitfieldExtract(payload[shifted_offset + 1], 0, bits - first_bits));
353 return result_first | (result_second << first_bits); 287 return result_first | (result_second << first_bits);
354} 288}
355 289
356uint StreamBits(uint num_bits) { 290uint StreamBits(uint num_bits) {
357 int int_bits = int(num_bits); 291 const int int_bits = int(num_bits);
358 uint ret = ExtractBits(local_buff, total_bitsread, int_bits); 292 const uint ret = ExtractBits(local_buff, total_bitsread, int_bits);
359 total_bitsread += int_bits; 293 total_bitsread += int_bits;
360 return ret; 294 return ret;
361} 295}
362 296
297void SkipBits(uint num_bits) {
298 const int int_bits = int(num_bits);
299 total_bitsread += int_bits;
300}
301
363uint StreamColorBits(uint num_bits) { 302uint StreamColorBits(uint num_bits) {
364 uint ret = 0; 303 const int int_bits = int(num_bits);
365 int int_bits = int(num_bits); 304 const uint ret = ExtractBits(color_endpoint_data, color_bitsread, int_bits);
366 if (texel_flag) { 305 color_bitsread += int_bits;
367 ret = ExtractBits(texel_weight_data, texel_bitsread, int_bits);
368 texel_bitsread += int_bits;
369 } else {
370 ret = ExtractBits(color_endpoint_data, color_bitsread, int_bits);
371 color_bitsread += int_bits;
372 }
373 return ret; 306 return ret;
374} 307}
375 308
376void ResultEmplaceBack(EncodingData val) { 309EncodingData GetEncodingFromVector(uint index) {
377 if (texel_flag) { 310 const uint data = result_vector[index];
378 texel_vector[texel_vector_index] = val; 311 return EncodingData(data);
379 ++texel_vector_index;
380 } else {
381 result_vector[result_index] = val;
382 ++result_index;
383 }
384} 312}
385 313
386// Returns the number of bits required to encode n_vals values. 314// Returns the number of bits required to encode n_vals values.
387uint GetBitLength(uint n_vals, uint encoding_index) { 315uint GetBitLength(uint n_vals, uint encoding_index) {
388 uint total_bits = encoding_values[encoding_index].num_bits * n_vals; 316 const EncodingData encoding_value = EncodingData(encoding_values[encoding_index]);
389 if (encoding_values[encoding_index].encoding == TRIT) { 317 const uint encoding = Encoding(encoding_value);
318 uint total_bits = NumBits(encoding_value) * n_vals;
319 if (encoding == TRIT) {
390 total_bits += Div5Ceil(n_vals * 8); 320 total_bits += Div5Ceil(n_vals * 8);
391 } else if (encoding_values[encoding_index].encoding == QUINT) { 321 } else if (encoding == QUINT) {
392 total_bits += Div3Ceil(n_vals * 7); 322 total_bits += Div3Ceil(n_vals * 7);
393 } 323 }
394 return total_bits; 324 return total_bits;
@@ -403,7 +333,7 @@ uint GetNumWeightValues(uvec2 size, bool dual_plane) {
403} 333}
404 334
405uint GetPackedBitSize(uvec2 size, bool dual_plane, uint max_weight) { 335uint GetPackedBitSize(uvec2 size, bool dual_plane, uint max_weight) {
406 uint n_vals = GetNumWeightValues(size, dual_plane); 336 const uint n_vals = GetNumWeightValues(size, dual_plane);
407 return GetBitLength(n_vals, max_weight); 337 return GetBitLength(n_vals, max_weight);
408} 338}
409 339
@@ -412,87 +342,74 @@ uint BitsBracket(uint bits, uint pos) {
412} 342}
413 343
414uint BitsOp(uint bits, uint start, uint end) { 344uint BitsOp(uint bits, uint start, uint end) {
415 if (start == end) { 345 const uint mask = (1 << (end - start + 1)) - 1;
416 return BitsBracket(bits, start);
417 } else if (start > end) {
418 uint t = start;
419 start = end;
420 end = t;
421 }
422
423 uint mask = (1 << (end - start + 1)) - 1;
424 return ((bits >> start) & mask); 346 return ((bits >> start) & mask);
425} 347}
426 348
427void DecodeQuintBlock(uint num_bits) { 349void DecodeQuintBlock(uint num_bits) {
428 uint m[3]; 350 uvec3 m;
429 uint q[3]; 351 uvec4 qQ;
430 uint Q;
431 m[0] = StreamColorBits(num_bits); 352 m[0] = StreamColorBits(num_bits);
432 Q = StreamColorBits(3); 353 qQ.w = StreamColorBits(3);
433 m[1] = StreamColorBits(num_bits); 354 m[1] = StreamColorBits(num_bits);
434 Q |= StreamColorBits(2) << 3; 355 qQ.w |= StreamColorBits(2) << 3;
435 m[2] = StreamColorBits(num_bits); 356 m[2] = StreamColorBits(num_bits);
436 Q |= StreamColorBits(2) << 5; 357 qQ.w |= StreamColorBits(2) << 5;
437 if (BitsOp(Q, 1, 2) == 3 && BitsOp(Q, 5, 6) == 0) { 358 if (BitsOp(qQ.w, 1, 2) == 3 && BitsOp(qQ.w, 5, 6) == 0) {
438 q[0] = 4; 359 qQ.x = 4;
439 q[1] = 4; 360 qQ.y = 4;
440 q[2] = (BitsBracket(Q, 0) << 2) | ((BitsBracket(Q, 4) & ~BitsBracket(Q, 0)) << 1) | 361 qQ.z = (BitsBracket(qQ.w, 0) << 2) | ((BitsBracket(qQ.w, 4) & ~BitsBracket(qQ.w, 0)) << 1) |
441 (BitsBracket(Q, 3) & ~BitsBracket(Q, 0)); 362 (BitsBracket(qQ.w, 3) & ~BitsBracket(qQ.w, 0));
442 } else { 363 } else {
443 uint C = 0; 364 uint C = 0;
444 if (BitsOp(Q, 1, 2) == 3) { 365 if (BitsOp(qQ.w, 1, 2) == 3) {
445 q[2] = 4; 366 qQ.z = 4;
446 C = (BitsOp(Q, 3, 4) << 3) | ((~BitsOp(Q, 5, 6) & 3) << 1) | BitsBracket(Q, 0); 367 C = (BitsOp(qQ.w, 3, 4) << 3) | ((~BitsOp(qQ.w, 5, 6) & 3) << 1) | BitsBracket(qQ.w, 0);
447 } else { 368 } else {
448 q[2] = BitsOp(Q, 5, 6); 369 qQ.z = BitsOp(qQ.w, 5, 6);
449 C = BitsOp(Q, 0, 4); 370 C = BitsOp(qQ.w, 0, 4);
450 } 371 }
451 if (BitsOp(C, 0, 2) == 5) { 372 if (BitsOp(C, 0, 2) == 5) {
452 q[1] = 4; 373 qQ.y = 4;
453 q[0] = BitsOp(C, 3, 4); 374 qQ.x = BitsOp(C, 3, 4);
454 } else { 375 } else {
455 q[1] = BitsOp(C, 3, 4); 376 qQ.y = BitsOp(C, 3, 4);
456 q[0] = BitsOp(C, 0, 2); 377 qQ.x = BitsOp(C, 0, 2);
457 } 378 }
458 } 379 }
459 for (uint i = 0; i < 3; i++) { 380 for (uint i = 0; i < 3; i++) {
460 EncodingData val; 381 const EncodingData val = CreateEncodingData(QUINT, num_bits, m[i], qQ[i]);
461 val.encoding = QUINT;
462 val.num_bits = num_bits;
463 val.bit_value = m[i];
464 val.quint_trit_value = q[i];
465 ResultEmplaceBack(val); 382 ResultEmplaceBack(val);
466 } 383 }
467} 384}
468 385
469void DecodeTritBlock(uint num_bits) { 386void DecodeTritBlock(uint num_bits) {
470 uint m[5]; 387 uvec4 m;
471 uint t[5]; 388 uvec4 t;
472 uint T; 389 uvec3 Tm5t5;
473 m[0] = StreamColorBits(num_bits); 390 m[0] = StreamColorBits(num_bits);
474 T = StreamColorBits(2); 391 Tm5t5.x = StreamColorBits(2);
475 m[1] = StreamColorBits(num_bits); 392 m[1] = StreamColorBits(num_bits);
476 T |= StreamColorBits(2) << 2; 393 Tm5t5.x |= StreamColorBits(2) << 2;
477 m[2] = StreamColorBits(num_bits); 394 m[2] = StreamColorBits(num_bits);
478 T |= StreamColorBits(1) << 4; 395 Tm5t5.x |= StreamColorBits(1) << 4;
479 m[3] = StreamColorBits(num_bits); 396 m[3] = StreamColorBits(num_bits);
480 T |= StreamColorBits(2) << 5; 397 Tm5t5.x |= StreamColorBits(2) << 5;
481 m[4] = StreamColorBits(num_bits); 398 Tm5t5.y = StreamColorBits(num_bits);
482 T |= StreamColorBits(1) << 7; 399 Tm5t5.x |= StreamColorBits(1) << 7;
483 uint C = 0; 400 uint C = 0;
484 if (BitsOp(T, 2, 4) == 7) { 401 if (BitsOp(Tm5t5.x, 2, 4) == 7) {
485 C = (BitsOp(T, 5, 7) << 2) | BitsOp(T, 0, 1); 402 C = (BitsOp(Tm5t5.x, 5, 7) << 2) | BitsOp(Tm5t5.x, 0, 1);
486 t[4] = 2; 403 Tm5t5.z = 2;
487 t[3] = 2; 404 t[3] = 2;
488 } else { 405 } else {
489 C = BitsOp(T, 0, 4); 406 C = BitsOp(Tm5t5.x, 0, 4);
490 if (BitsOp(T, 5, 6) == 3) { 407 if (BitsOp(Tm5t5.x, 5, 6) == 3) {
491 t[4] = 2; 408 Tm5t5.z = 2;
492 t[3] = BitsBracket(T, 7); 409 t[3] = BitsBracket(Tm5t5.x, 7);
493 } else { 410 } else {
494 t[4] = BitsBracket(T, 7); 411 Tm5t5.z = BitsBracket(Tm5t5.x, 7);
495 t[3] = BitsOp(T, 5, 6); 412 t[3] = BitsOp(Tm5t5.x, 5, 6);
496 } 413 }
497 } 414 }
498 if (BitsOp(C, 0, 1) == 3) { 415 if (BitsOp(C, 0, 1) == 3) {
@@ -508,31 +425,31 @@ void DecodeTritBlock(uint num_bits) {
508 t[1] = BitsOp(C, 2, 3); 425 t[1] = BitsOp(C, 2, 3);
509 t[0] = (BitsBracket(C, 1) << 1) | (BitsBracket(C, 0) & ~BitsBracket(C, 1)); 426 t[0] = (BitsBracket(C, 1) << 1) | (BitsBracket(C, 0) & ~BitsBracket(C, 1));
510 } 427 }
511 for (uint i = 0; i < 5; i++) { 428 for (uint i = 0; i < 4; i++) {
512 EncodingData val; 429 const EncodingData val = CreateEncodingData(TRIT, num_bits, m[i], t[i]);
513 val.encoding = TRIT;
514 val.num_bits = num_bits;
515 val.bit_value = m[i];
516 val.quint_trit_value = t[i];
517 ResultEmplaceBack(val); 430 ResultEmplaceBack(val);
518 } 431 }
432 const EncodingData val = CreateEncodingData(TRIT, num_bits, Tm5t5.y, Tm5t5.z);
433 ResultEmplaceBack(val);
519} 434}
520 435
521void DecodeIntegerSequence(uint max_range, uint num_values) { 436void DecodeIntegerSequence(uint max_range, uint num_values) {
522 EncodingData val = encoding_values[max_range]; 437 EncodingData val = EncodingData(encoding_values[max_range]);
438 const uint encoding = Encoding(val);
439 const uint num_bits = NumBits(val);
523 uint vals_decoded = 0; 440 uint vals_decoded = 0;
524 while (vals_decoded < num_values) { 441 while (vals_decoded < num_values && !result_limit_reached) {
525 switch (val.encoding) { 442 switch (encoding) {
526 case QUINT: 443 case QUINT:
527 DecodeQuintBlock(val.num_bits); 444 DecodeQuintBlock(num_bits);
528 vals_decoded += 3; 445 vals_decoded += 3;
529 break; 446 break;
530 case TRIT: 447 case TRIT:
531 DecodeTritBlock(val.num_bits); 448 DecodeTritBlock(num_bits);
532 vals_decoded += 5; 449 vals_decoded += 5;
533 break; 450 break;
534 case JUST_BITS: 451 case JUST_BITS:
535 val.bit_value = StreamColorBits(val.num_bits); 452 BitValue(val, StreamColorBits(num_bits));
536 ResultEmplaceBack(val); 453 ResultEmplaceBack(val);
537 vals_decoded++; 454 vals_decoded++;
538 break; 455 break;
@@ -540,7 +457,7 @@ void DecodeIntegerSequence(uint max_range, uint num_values) {
540 } 457 }
541} 458}
542 459
543void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits) { 460void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits, out uint color_values[32]) {
544 uint num_values = 0; 461 uint num_values = 0;
545 for (uint i = 0; i < num_partitions; i++) { 462 for (uint i = 0; i < num_partitions; i++) {
546 num_values += ((modes[i] >> 2) + 1) << 1; 463 num_values += ((modes[i] >> 2) + 1) << 1;
@@ -549,7 +466,7 @@ void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits) {
549 // TODO(ameerj): profile with binary search 466 // TODO(ameerj): profile with binary search
550 int range = 0; 467 int range = 0;
551 while (++range < encoding_values.length()) { 468 while (++range < encoding_values.length()) {
552 uint bit_length = GetBitLength(num_values, range); 469 const uint bit_length = GetBitLength(num_values, range);
553 if (bit_length > color_data_bits) { 470 if (bit_length > color_data_bits) {
554 break; 471 break;
555 } 472 }
@@ -560,48 +477,49 @@ void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits) {
560 if (out_index >= num_values) { 477 if (out_index >= num_values) {
561 break; 478 break;
562 } 479 }
563 EncodingData val = result_vector[itr]; 480 const EncodingData val = GetEncodingFromVector(itr);
564 uint bitlen = val.num_bits; 481 const uint encoding = Encoding(val);
565 uint bitval = val.bit_value; 482 const uint bitlen = NumBits(val);
483 const uint bitval = BitValue(val);
566 uint A = 0, B = 0, C = 0, D = 0; 484 uint A = 0, B = 0, C = 0, D = 0;
567 A = ReplicateBitTo9((bitval & 1)); 485 A = ReplicateBitTo9((bitval & 1));
568 switch (val.encoding) { 486 switch (encoding) {
569 case JUST_BITS: 487 case JUST_BITS:
570 color_values[out_index++] = FastReplicateTo8(bitval, bitlen); 488 color_values[++out_index] = FastReplicateTo8(bitval, bitlen);
571 break; 489 break;
572 case TRIT: { 490 case TRIT: {
573 D = val.quint_trit_value; 491 D = QuintTritValue(val);
574 switch (bitlen) { 492 switch (bitlen) {
575 case 1: 493 case 1:
576 C = 204; 494 C = 204;
577 break; 495 break;
578 case 2: { 496 case 2: {
579 C = 93; 497 C = 93;
580 uint b = (bitval >> 1) & 1; 498 const uint b = (bitval >> 1) & 1;
581 B = (b << 8) | (b << 4) | (b << 2) | (b << 1); 499 B = (b << 8) | (b << 4) | (b << 2) | (b << 1);
582 break; 500 break;
583 } 501 }
584 case 3: { 502 case 3: {
585 C = 44; 503 C = 44;
586 uint cb = (bitval >> 1) & 3; 504 const uint cb = (bitval >> 1) & 3;
587 B = (cb << 7) | (cb << 2) | cb; 505 B = (cb << 7) | (cb << 2) | cb;
588 break; 506 break;
589 } 507 }
590 case 4: { 508 case 4: {
591 C = 22; 509 C = 22;
592 uint dcb = (bitval >> 1) & 7; 510 const uint dcb = (bitval >> 1) & 7;
593 B = (dcb << 6) | dcb; 511 B = (dcb << 6) | dcb;
594 break; 512 break;
595 } 513 }
596 case 5: { 514 case 5: {
597 C = 11; 515 C = 11;
598 uint edcb = (bitval >> 1) & 0xF; 516 const uint edcb = (bitval >> 1) & 0xF;
599 B = (edcb << 5) | (edcb >> 2); 517 B = (edcb << 5) | (edcb >> 2);
600 break; 518 break;
601 } 519 }
602 case 6: { 520 case 6: {
603 C = 5; 521 C = 5;
604 uint fedcb = (bitval >> 1) & 0x1F; 522 const uint fedcb = (bitval >> 1) & 0x1F;
605 B = (fedcb << 4) | (fedcb >> 4); 523 B = (fedcb << 4) | (fedcb >> 4);
606 break; 524 break;
607 } 525 }
@@ -609,32 +527,32 @@ void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits) {
609 break; 527 break;
610 } 528 }
611 case QUINT: { 529 case QUINT: {
612 D = val.quint_trit_value; 530 D = QuintTritValue(val);
613 switch (bitlen) { 531 switch (bitlen) {
614 case 1: 532 case 1:
615 C = 113; 533 C = 113;
616 break; 534 break;
617 case 2: { 535 case 2: {
618 C = 54; 536 C = 54;
619 uint b = (bitval >> 1) & 1; 537 const uint b = (bitval >> 1) & 1;
620 B = (b << 8) | (b << 3) | (b << 2); 538 B = (b << 8) | (b << 3) | (b << 2);
621 break; 539 break;
622 } 540 }
623 case 3: { 541 case 3: {
624 C = 26; 542 C = 26;
625 uint cb = (bitval >> 1) & 3; 543 const uint cb = (bitval >> 1) & 3;
626 B = (cb << 7) | (cb << 1) | (cb >> 1); 544 B = (cb << 7) | (cb << 1) | (cb >> 1);
627 break; 545 break;
628 } 546 }
629 case 4: { 547 case 4: {
630 C = 13; 548 C = 13;
631 uint dcb = (bitval >> 1) & 7; 549 const uint dcb = (bitval >> 1) & 7;
632 B = (dcb << 6) | (dcb >> 1); 550 B = (dcb << 6) | (dcb >> 1);
633 break; 551 break;
634 } 552 }
635 case 5: { 553 case 5: {
636 C = 6; 554 C = 6;
637 uint edcb = (bitval >> 1) & 0xF; 555 const uint edcb = (bitval >> 1) & 0xF;
638 B = (edcb << 5) | (edcb >> 3); 556 B = (edcb << 5) | (edcb >> 3);
639 break; 557 break;
640 } 558 }
@@ -642,11 +560,11 @@ void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits) {
642 break; 560 break;
643 } 561 }
644 } 562 }
645 if (val.encoding != JUST_BITS) { 563 if (encoding != JUST_BITS) {
646 uint T = (D * C) + B; 564 uint T = (D * C) + B;
647 T ^= A; 565 T ^= A;
648 T = (A & 0x80) | (T >> 2); 566 T = (A & 0x80) | (T >> 2);
649 color_values[out_index++] = T; 567 color_values[++out_index] = T;
650 } 568 }
651 } 569 }
652} 570}
@@ -664,139 +582,136 @@ ivec2 BitTransferSigned(int a, int b) {
664} 582}
665 583
666uvec4 ClampByte(ivec4 color) { 584uvec4 ClampByte(ivec4 color) {
667 for (uint i = 0; i < 4; ++i) { 585 return uvec4(clamp(color, 0, 255));
668 color[i] = (color[i] < 0) ? 0 : ((color[i] > 255) ? 255 : color[i]);
669 }
670 return uvec4(color);
671} 586}
672 587
673ivec4 BlueContract(int a, int r, int g, int b) { 588ivec4 BlueContract(int a, int r, int g, int b) {
674 return ivec4(a, (r + b) >> 1, (g + b) >> 1, b); 589 return ivec4(a, (r + b) >> 1, (g + b) >> 1, b);
675} 590}
676 591
677void ComputeEndpoints(out uvec4 ep1, out uvec4 ep2, uint color_endpoint_mode) { 592void ComputeEndpoints(out uvec4 ep1, out uvec4 ep2, uint color_endpoint_mode, uint color_values[32],
593 inout uint colvals_index) {
678#define READ_UINT_VALUES(N) \ 594#define READ_UINT_VALUES(N) \
679 uint v[N]; \ 595 uvec4 V[2]; \
680 for (uint i = 0; i < N; i++) { \ 596 for (uint i = 0; i < N; i++) { \
681 v[i] = color_values[colvals_index++]; \ 597 V[i / 4][i % 4] = color_values[++colvals_index]; \
682 } 598 }
683
684#define READ_INT_VALUES(N) \ 599#define READ_INT_VALUES(N) \
685 int v[N]; \ 600 ivec4 V[2]; \
686 for (uint i = 0; i < N; i++) { \ 601 for (uint i = 0; i < N; i++) { \
687 v[i] = int(color_values[colvals_index++]); \ 602 V[i / 4][i % 4] = int(color_values[++colvals_index]); \
688 } 603 }
689 604
690 switch (color_endpoint_mode) { 605 switch (color_endpoint_mode) {
691 case 0: { 606 case 0: {
692 READ_UINT_VALUES(2) 607 READ_UINT_VALUES(2)
693 ep1 = uvec4(0xFF, v[0], v[0], v[0]); 608 ep1 = uvec4(0xFF, V[0].x, V[0].x, V[0].x);
694 ep2 = uvec4(0xFF, v[1], v[1], v[1]); 609 ep2 = uvec4(0xFF, V[0].y, V[0].y, V[0].y);
695 break; 610 break;
696 } 611 }
697 case 1: { 612 case 1: {
698 READ_UINT_VALUES(2) 613 READ_UINT_VALUES(2)
699 uint L0 = (v[0] >> 2) | (v[1] & 0xC0); 614 const uint L0 = (V[0].x >> 2) | (V[0].y & 0xC0);
700 uint L1 = min(L0 + (v[1] & 0x3F), 0xFFU); 615 const uint L1 = min(L0 + (V[0].y & 0x3F), 0xFFU);
701 ep1 = uvec4(0xFF, L0, L0, L0); 616 ep1 = uvec4(0xFF, L0, L0, L0);
702 ep2 = uvec4(0xFF, L1, L1, L1); 617 ep2 = uvec4(0xFF, L1, L1, L1);
703 break; 618 break;
704 } 619 }
705 case 4: { 620 case 4: {
706 READ_UINT_VALUES(4) 621 READ_UINT_VALUES(4)
707 ep1 = uvec4(v[2], v[0], v[0], v[0]); 622 ep1 = uvec4(V[0].z, V[0].x, V[0].x, V[0].x);
708 ep2 = uvec4(v[3], v[1], v[1], v[1]); 623 ep2 = uvec4(V[0].w, V[0].y, V[0].y, V[0].y);
709 break; 624 break;
710 } 625 }
711 case 5: { 626 case 5: {
712 READ_INT_VALUES(4) 627 READ_INT_VALUES(4)
713 ivec2 transferred = BitTransferSigned(v[1], v[0]); 628 ivec2 transferred = BitTransferSigned(V[0].y, V[0].x);
714 v[1] = transferred.x; 629 V[0].y = transferred.x;
715 v[0] = transferred.y; 630 V[0].x = transferred.y;
716 transferred = BitTransferSigned(v[3], v[2]); 631 transferred = BitTransferSigned(V[0].w, V[0].z);
717 v[3] = transferred.x; 632 V[0].w = transferred.x;
718 v[2] = transferred.y; 633 V[0].z = transferred.y;
719 ep1 = ClampByte(ivec4(v[2], v[0], v[0], v[0])); 634 ep1 = ClampByte(ivec4(V[0].z, V[0].x, V[0].x, V[0].x));
720 ep2 = ClampByte(ivec4(v[2] + v[3], v[0] + v[1], v[0] + v[1], v[0] + v[1])); 635 ep2 = ClampByte(ivec4(V[0].z + V[0].w, V[0].x + V[0].y, V[0].x + V[0].y, V[0].x + V[0].y));
721 break; 636 break;
722 } 637 }
723 case 6: { 638 case 6: {
724 READ_UINT_VALUES(4) 639 READ_UINT_VALUES(4)
725 ep1 = uvec4(0xFF, (v[0] * v[3]) >> 8, (v[1] * v[3]) >> 8, (v[2] * v[3]) >> 8); 640 ep1 = uvec4(0xFF, (V[0].x * V[0].w) >> 8, (V[0].y * V[0].w) >> 8, (V[0].z * V[0].w) >> 8);
726 ep2 = uvec4(0xFF, v[0], v[1], v[2]); 641 ep2 = uvec4(0xFF, V[0].x, V[0].y, V[0].z);
727 break; 642 break;
728 } 643 }
729 case 8: { 644 case 8: {
730 READ_UINT_VALUES(6) 645 READ_UINT_VALUES(6)
731 if ((v[1] + v[3] + v[5]) >= (v[0] + v[2] + v[4])) { 646 if ((V[0].y + V[0].w + V[1].y) >= (V[0].x + V[0].z + V[1].x)) {
732 ep1 = uvec4(0xFF, v[0], v[2], v[4]); 647 ep1 = uvec4(0xFF, V[0].x, V[0].z, V[1].x);
733 ep2 = uvec4(0xFF, v[1], v[3], v[5]); 648 ep2 = uvec4(0xFF, V[0].y, V[0].w, V[1].y);
734 } else { 649 } else {
735 ep1 = uvec4(BlueContract(0xFF, int(v[1]), int(v[3]), int(v[5]))); 650 ep1 = uvec4(BlueContract(0xFF, int(V[0].y), int(V[0].w), int(V[1].y)));
736 ep2 = uvec4(BlueContract(0xFF, int(v[0]), int(v[2]), int(v[4]))); 651 ep2 = uvec4(BlueContract(0xFF, int(V[0].x), int(V[0].z), int(V[1].x)));
737 } 652 }
738 break; 653 break;
739 } 654 }
740 case 9: { 655 case 9: {
741 READ_INT_VALUES(6) 656 READ_INT_VALUES(6)
742 ivec2 transferred = BitTransferSigned(v[1], v[0]); 657 ivec2 transferred = BitTransferSigned(V[0].y, V[0].x);
743 v[1] = transferred.x; 658 V[0].y = transferred.x;
744 v[0] = transferred.y; 659 V[0].x = transferred.y;
745 transferred = BitTransferSigned(v[3], v[2]); 660 transferred = BitTransferSigned(V[0].w, V[0].z);
746 v[3] = transferred.x; 661 V[0].w = transferred.x;
747 v[2] = transferred.y; 662 V[0].z = transferred.y;
748 transferred = BitTransferSigned(v[5], v[4]); 663 transferred = BitTransferSigned(V[1].y, V[1].x);
749 v[5] = transferred.x; 664 V[1].y = transferred.x;
750 v[4] = transferred.y; 665 V[1].x = transferred.y;
751 if ((v[1] + v[3] + v[5]) >= 0) { 666 if ((V[0].y + V[0].w + V[1].y) >= 0) {
752 ep1 = ClampByte(ivec4(0xFF, v[0], v[2], v[4])); 667 ep1 = ClampByte(ivec4(0xFF, V[0].x, V[0].z, V[1].x));
753 ep2 = ClampByte(ivec4(0xFF, v[0] + v[1], v[2] + v[3], v[4] + v[5])); 668 ep2 = ClampByte(ivec4(0xFF, V[0].x + V[0].y, V[0].z + V[0].w, V[1].x + V[1].y));
754 } else { 669 } else {
755 ep1 = ClampByte(BlueContract(0xFF, v[0] + v[1], v[2] + v[3], v[4] + v[5])); 670 ep1 = ClampByte(BlueContract(0xFF, V[0].x + V[0].y, V[0].z + V[0].w, V[1].x + V[1].y));
756 ep2 = ClampByte(BlueContract(0xFF, v[0], v[2], v[4])); 671 ep2 = ClampByte(BlueContract(0xFF, V[0].x, V[0].z, V[1].x));
757 } 672 }
758 break; 673 break;
759 } 674 }
760 case 10: { 675 case 10: {
761 READ_UINT_VALUES(6) 676 READ_UINT_VALUES(6)
762 ep1 = uvec4(v[4], (v[0] * v[3]) >> 8, (v[1] * v[3]) >> 8, (v[2] * v[3]) >> 8); 677 ep1 = uvec4(V[1].x, (V[0].x * V[0].w) >> 8, (V[0].y * V[0].w) >> 8, (V[0].z * V[0].w) >> 8);
763 ep2 = uvec4(v[5], v[0], v[1], v[2]); 678 ep2 = uvec4(V[1].y, V[0].x, V[0].y, V[0].z);
764 break; 679 break;
765 } 680 }
766 case 12: { 681 case 12: {
767 READ_UINT_VALUES(8) 682 READ_UINT_VALUES(8)
768 if ((v[1] + v[3] + v[5]) >= (v[0] + v[2] + v[4])) { 683 if ((V[0].y + V[0].w + V[1].y) >= (V[0].x + V[0].z + V[1].x)) {
769 ep1 = uvec4(v[6], v[0], v[2], v[4]); 684 ep1 = uvec4(V[1].z, V[0].x, V[0].z, V[1].x);
770 ep2 = uvec4(v[7], v[1], v[3], v[5]); 685 ep2 = uvec4(V[1].w, V[0].y, V[0].w, V[1].y);
771 } else { 686 } else {
772 ep1 = uvec4(BlueContract(int(v[7]), int(v[1]), int(v[3]), int(v[5]))); 687 ep1 = uvec4(BlueContract(int(V[1].w), int(V[0].y), int(V[0].w), int(V[1].y)));
773 ep2 = uvec4(BlueContract(int(v[6]), int(v[0]), int(v[2]), int(v[4]))); 688 ep2 = uvec4(BlueContract(int(V[1].z), int(V[0].x), int(V[0].z), int(V[1].x)));
774 } 689 }
775 break; 690 break;
776 } 691 }
777 case 13: { 692 case 13: {
778 READ_INT_VALUES(8) 693 READ_INT_VALUES(8)
779 ivec2 transferred = BitTransferSigned(v[1], v[0]); 694 ivec2 transferred = BitTransferSigned(V[0].y, V[0].x);
780 v[1] = transferred.x; 695 V[0].y = transferred.x;
781 v[0] = transferred.y; 696 V[0].x = transferred.y;
782 transferred = BitTransferSigned(v[3], v[2]); 697 transferred = BitTransferSigned(V[0].w, V[0].z);
783 v[3] = transferred.x; 698 V[0].w = transferred.x;
784 v[2] = transferred.y; 699 V[0].z = transferred.y;
785 700
786 transferred = BitTransferSigned(v[5], v[4]); 701 transferred = BitTransferSigned(V[1].y, V[1].x);
787 v[5] = transferred.x; 702 V[1].y = transferred.x;
788 v[4] = transferred.y; 703 V[1].x = transferred.y;
789 704
790 transferred = BitTransferSigned(v[7], v[6]); 705 transferred = BitTransferSigned(V[1].w, V[1].z);
791 v[7] = transferred.x; 706 V[1].w = transferred.x;
792 v[6] = transferred.y; 707 V[1].z = transferred.y;
793 708
794 if ((v[1] + v[3] + v[5]) >= 0) { 709 if ((V[0].y + V[0].w + V[1].y) >= 0) {
795 ep1 = ClampByte(ivec4(v[6], v[0], v[2], v[4])); 710 ep1 = ClampByte(ivec4(V[1].z, V[0].x, V[0].z, V[1].x));
796 ep2 = ClampByte(ivec4(v[7] + v[6], v[0] + v[1], v[2] + v[3], v[4] + v[5])); 711 ep2 = ClampByte(ivec4(V[1].w + V[1].z, V[0].x + V[0].y, V[0].z + V[0].w, V[1].x + V[1].y));
797 } else { 712 } else {
798 ep1 = ClampByte(BlueContract(v[6] + v[7], v[0] + v[1], v[2] + v[3], v[4] + v[5])); 713 ep1 = ClampByte(BlueContract(V[1].z + V[1].w, V[0].x + V[0].y, V[0].z + V[0].w, V[1].x + V[1].y));
799 ep2 = ClampByte(BlueContract(v[6], v[0], v[2], v[4])); 714 ep2 = ClampByte(BlueContract(V[1].z, V[0].x, V[0].z, V[1].x));
800 } 715 }
801 break; 716 break;
802 } 717 }
@@ -812,36 +727,34 @@ void ComputeEndpoints(out uvec4 ep1, out uvec4 ep2, uint color_endpoint_mode) {
812} 727}
813 728
814uint UnquantizeTexelWeight(EncodingData val) { 729uint UnquantizeTexelWeight(EncodingData val) {
815 uint bitval = val.bit_value; 730 const uint encoding = Encoding(val);
816 uint bitlen = val.num_bits; 731 const uint bitlen = NumBits(val);
817 uint A = ReplicateBitTo7((bitval & 1)); 732 const uint bitval = BitValue(val);
733 const uint A = ReplicateBitTo7((bitval & 1));
818 uint B = 0, C = 0, D = 0; 734 uint B = 0, C = 0, D = 0;
819 uint result = 0; 735 uint result = 0;
820 switch (val.encoding) { 736 const uint bitlen_0_results[5] = {0, 16, 32, 48, 64};
737 switch (encoding) {
821 case JUST_BITS: 738 case JUST_BITS:
822 result = FastReplicateTo6(bitval, bitlen); 739 return FastReplicateTo6(bitval, bitlen);
823 break;
824 case TRIT: { 740 case TRIT: {
825 D = val.quint_trit_value; 741 D = QuintTritValue(val);
826 switch (bitlen) { 742 switch (bitlen) {
827 case 0: { 743 case 0:
828 uint results[3] = {0, 32, 63}; 744 return bitlen_0_results[D * 2];
829 result = results[D];
830 break;
831 }
832 case 1: { 745 case 1: {
833 C = 50; 746 C = 50;
834 break; 747 break;
835 } 748 }
836 case 2: { 749 case 2: {
837 C = 23; 750 C = 23;
838 uint b = (bitval >> 1) & 1; 751 const uint b = (bitval >> 1) & 1;
839 B = (b << 6) | (b << 2) | b; 752 B = (b << 6) | (b << 2) | b;
840 break; 753 break;
841 } 754 }
842 case 3: { 755 case 3: {
843 C = 11; 756 C = 11;
844 uint cb = (bitval >> 1) & 3; 757 const uint cb = (bitval >> 1) & 3;
845 B = (cb << 5) | cb; 758 B = (cb << 5) | cb;
846 break; 759 break;
847 } 760 }
@@ -851,20 +764,17 @@ uint UnquantizeTexelWeight(EncodingData val) {
851 break; 764 break;
852 } 765 }
853 case QUINT: { 766 case QUINT: {
854 D = val.quint_trit_value; 767 D = QuintTritValue(val);
855 switch (bitlen) { 768 switch (bitlen) {
856 case 0: { 769 case 0:
857 uint results[5] = {0, 16, 32, 47, 63}; 770 return bitlen_0_results[D];
858 result = results[D];
859 break;
860 }
861 case 1: { 771 case 1: {
862 C = 28; 772 C = 28;
863 break; 773 break;
864 } 774 }
865 case 2: { 775 case 2: {
866 C = 13; 776 C = 13;
867 uint b = (bitval >> 1) & 1; 777 const uint b = (bitval >> 1) & 1;
868 B = (b << 6) | (b << 1); 778 B = (b << 6) | (b << 1);
869 break; 779 break;
870 } 780 }
@@ -872,7 +782,7 @@ uint UnquantizeTexelWeight(EncodingData val) {
872 break; 782 break;
873 } 783 }
874 } 784 }
875 if (val.encoding != JUST_BITS && bitlen > 0) { 785 if (encoding != JUST_BITS && bitlen > 0) {
876 result = D * C + B; 786 result = D * C + B;
877 result ^= A; 787 result ^= A;
878 result = (A & 0x20) | (result >> 2); 788 result = (A & 0x20) | (result >> 2);
@@ -883,61 +793,77 @@ uint UnquantizeTexelWeight(EncodingData val) {
883 return result; 793 return result;
884} 794}
885 795
886void UnquantizeTexelWeights(bool dual_plane, uvec2 size) { 796void UnquantizeTexelWeights(uvec2 size, bool is_dual_plane) {
887 uint weight_idx = 0; 797 const uint num_planes = is_dual_plane ? 2 : 1;
888 uint unquantized[2][144]; 798 const uint area = size.x * size.y;
889 uint area = size.x * size.y; 799 const uint loop_count = min(result_index, area * num_planes);
890 for (uint itr = 0; itr < texel_vector_index; itr++) { 800 for (uint itr = 0; itr < loop_count; ++itr) {
891 unquantized[0][weight_idx] = UnquantizeTexelWeight(texel_vector[itr]); 801 result_vector[itr] =
892 if (dual_plane) { 802 UnquantizeTexelWeight(GetEncodingFromVector(itr));
893 ++itr;
894 unquantized[1][weight_idx] = UnquantizeTexelWeight(texel_vector[itr]);
895 if (itr == texel_vector_index) {
896 break;
897 }
898 }
899 if (++weight_idx >= (area))
900 break;
901 } 803 }
804}
805
806uint GetUnquantizedTexelWieght(uint offset_base, uint plane, bool is_dual_plane) {
807 const uint offset = is_dual_plane ? 2 * offset_base + plane : offset_base;
808 return result_vector[offset];
809}
902 810
811uvec4 GetUnquantizedWeightVector(uint t, uint s, uvec2 size, uint plane_index, bool is_dual_plane) {
903 const uint Ds = uint((block_dims.x * 0.5f + 1024) / (block_dims.x - 1)); 812 const uint Ds = uint((block_dims.x * 0.5f + 1024) / (block_dims.x - 1));
904 const uint Dt = uint((block_dims.y * 0.5f + 1024) / (block_dims.y - 1)); 813 const uint Dt = uint((block_dims.y * 0.5f + 1024) / (block_dims.y - 1));
905 const uint k_plane_scale = dual_plane ? 2 : 1; 814 const uint area = size.x * size.y;
906 for (uint plane = 0; plane < k_plane_scale; plane++) { 815
907 for (uint t = 0; t < block_dims.y; t++) { 816 const uint cs = Ds * s;
908 for (uint s = 0; s < block_dims.x; s++) { 817 const uint ct = Dt * t;
909 uint cs = Ds * s; 818 const uint gs = (cs * (size.x - 1) + 32) >> 6;
910 uint ct = Dt * t; 819 const uint gt = (ct * (size.y - 1) + 32) >> 6;
911 uint gs = (cs * (size.x - 1) + 32) >> 6; 820 const uint js = gs >> 4;
912 uint gt = (ct * (size.y - 1) + 32) >> 6; 821 const uint fs = gs & 0xF;
913 uint js = gs >> 4; 822 const uint jt = gt >> 4;
914 uint fs = gs & 0xF; 823 const uint ft = gt & 0x0F;
915 uint jt = gt >> 4; 824 const uint w11 = (fs * ft + 8) >> 4;
916 uint ft = gt & 0x0F; 825 const uint w10 = ft - w11;
917 uint w11 = (fs * ft + 8) >> 4; 826 const uint w01 = fs - w11;
918 uint w10 = ft - w11; 827 const uint w00 = 16 - fs - ft + w11;
919 uint w01 = fs - w11; 828 const uvec4 w = uvec4(w00, w01, w10, w11);
920 uint w00 = 16 - fs - ft + w11; 829 const uint v0 = jt * size.x + js;
921 uvec4 w = uvec4(w00, w01, w10, w11); 830
922 uint v0 = jt * size.x + js; 831 uvec4 p0 = uvec4(0);
923 832 uvec4 p1 = uvec4(0);
924 uvec4 p = uvec4(0); 833
925 if (v0 < area) { 834 if (v0 < area) {
926 p.x = unquantized[plane][v0]; 835 const uint offset_base = v0;
927 } 836 p0.x = GetUnquantizedTexelWieght(offset_base, 0, is_dual_plane);
928 if ((v0 + 1) < (area)) { 837 p1.x = GetUnquantizedTexelWieght(offset_base, 1, is_dual_plane);
929 p.y = unquantized[plane][v0 + 1]; 838 }
930 } 839 if ((v0 + 1) < (area)) {
931 if ((v0 + size.x) < (area)) { 840 const uint offset_base = v0 + 1;
932 p.z = unquantized[plane][(v0 + size.x)]; 841 p0.y = GetUnquantizedTexelWieght(offset_base, 0, is_dual_plane);
933 } 842 p1.y = GetUnquantizedTexelWieght(offset_base, 1, is_dual_plane);
934 if ((v0 + size.x + 1) < (area)) { 843 }
935 p.w = unquantized[plane][(v0 + size.x + 1)]; 844 if ((v0 + size.x) < (area)) {
936 } 845 const uint offset_base = v0 + size.x;
937 unquantized_texel_weights[plane][t * block_dims.x + s] = (uint(dot(p, w)) + 8) >> 4; 846 p0.z = GetUnquantizedTexelWieght(offset_base, 0, is_dual_plane);
938 } 847 p1.z = GetUnquantizedTexelWieght(offset_base, 1, is_dual_plane);
848 }
849 if ((v0 + size.x + 1) < (area)) {
850 const uint offset_base = v0 + size.x + 1;
851 p0.w = GetUnquantizedTexelWieght(offset_base, 0, is_dual_plane);
852 p1.w = GetUnquantizedTexelWieght(offset_base, 1, is_dual_plane);
853 }
854
855 const uint primary_weight = (uint(dot(p0, w)) + 8) >> 4;
856
857 uvec4 weight_vec = uvec4(primary_weight);
858
859 if (is_dual_plane) {
860 const uint secondary_weight = (uint(dot(p1, w)) + 8) >> 4;
861 for (uint c = 0; c < 4; c++) {
862 const bool is_secondary = ((plane_index + 1u) & 3u) == c;
863 weight_vec[c] = is_secondary ? secondary_weight : primary_weight;
939 } 864 }
940 } 865 }
866 return weight_vec;
941} 867}
942 868
943int FindLayout(uint mode) { 869int FindLayout(uint mode) {
@@ -971,80 +897,96 @@ int FindLayout(uint mode) {
971 return 5; 897 return 5;
972} 898}
973 899
974TexelWeightParams DecodeBlockInfo() { 900
975 TexelWeightParams params = TexelWeightParams(uvec2(0), 0, false, false, false, false); 901void FillError(ivec3 coord) {
976 uint mode = StreamBits(11); 902 for (uint j = 0; j < block_dims.y; j++) {
903 for (uint i = 0; i < block_dims.x; i++) {
904 imageStore(dest_image, coord + ivec3(i, j, 0), vec4(0.0, 0.0, 0.0, 0.0));
905 }
906 }
907}
908
909void FillVoidExtentLDR(ivec3 coord) {
910 SkipBits(52);
911 const uint r_u = StreamBits(16);
912 const uint g_u = StreamBits(16);
913 const uint b_u = StreamBits(16);
914 const uint a_u = StreamBits(16);
915 const float a = float(a_u) / 65535.0f;
916 const float r = float(r_u) / 65535.0f;
917 const float g = float(g_u) / 65535.0f;
918 const float b = float(b_u) / 65535.0f;
919 for (uint j = 0; j < block_dims.y; j++) {
920 for (uint i = 0; i < block_dims.x; i++) {
921 imageStore(dest_image, coord + ivec3(i, j, 0), vec4(r, g, b, a));
922 }
923 }
924}
925
926bool IsError(uint mode) {
977 if ((mode & 0x1ff) == 0x1fc) { 927 if ((mode & 0x1ff) == 0x1fc) {
978 if ((mode & 0x200) != 0) { 928 if ((mode & 0x200) != 0) {
979 params.void_extent_hdr = true; 929 // params.void_extent_hdr = true;
980 } else { 930 return true;
981 params.void_extent_ldr = true;
982 } 931 }
983 if ((mode & 0x400) == 0 || StreamBits(1) == 0) { 932 if ((mode & 0x400) == 0 || StreamBits(1) == 0) {
984 params.error_state = true; 933 return true;
985 } 934 }
986 return params; 935 return false;
987 } 936 }
988 if ((mode & 0xf) == 0) { 937 if ((mode & 0xf) == 0) {
989 params.error_state = true; 938 return true;
990 return params;
991 } 939 }
992 if ((mode & 3) == 0 && (mode & 0x1c0) == 0x1c0) { 940 if ((mode & 3) == 0 && (mode & 0x1c0) == 0x1c0) {
993 params.error_state = true; 941 return true;
994 return params;
995 } 942 }
943 return false;
944}
945
946uvec2 DecodeBlockSize(uint mode) {
996 uint A, B; 947 uint A, B;
997 uint mode_layout = FindLayout(mode); 948 switch (FindLayout(mode)) {
998 switch (mode_layout) {
999 case 0: 949 case 0:
1000 A = (mode >> 5) & 0x3; 950 A = (mode >> 5) & 0x3;
1001 B = (mode >> 7) & 0x3; 951 B = (mode >> 7) & 0x3;
1002 params.size = uvec2(B + 4, A + 2); 952 return uvec2(B + 4, A + 2);
1003 break;
1004 case 1: 953 case 1:
1005 A = (mode >> 5) & 0x3; 954 A = (mode >> 5) & 0x3;
1006 B = (mode >> 7) & 0x3; 955 B = (mode >> 7) & 0x3;
1007 params.size = uvec2(B + 8, A + 2); 956 return uvec2(B + 8, A + 2);
1008 break;
1009 case 2: 957 case 2:
1010 A = (mode >> 5) & 0x3; 958 A = (mode >> 5) & 0x3;
1011 B = (mode >> 7) & 0x3; 959 B = (mode >> 7) & 0x3;
1012 params.size = uvec2(A + 2, B + 8); 960 return uvec2(A + 2, B + 8);
1013 break;
1014 case 3: 961 case 3:
1015 A = (mode >> 5) & 0x3; 962 A = (mode >> 5) & 0x3;
1016 B = (mode >> 7) & 0x1; 963 B = (mode >> 7) & 0x1;
1017 params.size = uvec2(A + 2, B + 6); 964 return uvec2(A + 2, B + 6);
1018 break;
1019 case 4: 965 case 4:
1020 A = (mode >> 5) & 0x3; 966 A = (mode >> 5) & 0x3;
1021 B = (mode >> 7) & 0x1; 967 B = (mode >> 7) & 0x1;
1022 params.size = uvec2(B + 2, A + 2); 968 return uvec2(B + 2, A + 2);
1023 break;
1024 case 5: 969 case 5:
1025 A = (mode >> 5) & 0x3; 970 A = (mode >> 5) & 0x3;
1026 params.size = uvec2(12, A + 2); 971 return uvec2(12, A + 2);
1027 break;
1028 case 6: 972 case 6:
1029 A = (mode >> 5) & 0x3; 973 A = (mode >> 5) & 0x3;
1030 params.size = uvec2(A + 2, 12); 974 return uvec2(A + 2, 12);
1031 break;
1032 case 7: 975 case 7:
1033 params.size = uvec2(6, 10); 976 return uvec2(6, 10);
1034 break;
1035 case 8: 977 case 8:
1036 params.size = uvec2(10, 6); 978 return uvec2(10, 6);
1037 break;
1038 case 9: 979 case 9:
1039 A = (mode >> 5) & 0x3; 980 A = (mode >> 5) & 0x3;
1040 B = (mode >> 9) & 0x3; 981 B = (mode >> 9) & 0x3;
1041 params.size = uvec2(A + 6, B + 6); 982 return uvec2(A + 6, B + 6);
1042 break;
1043 default: 983 default:
1044 params.error_state = true; 984 return uvec2(0);
1045 break;
1046 } 985 }
1047 params.dual_plane = (mode_layout != 9) && ((mode & 0x400) != 0); 986}
987
988uint DecodeMaxWeight(uint mode) {
989 const uint mode_layout = FindLayout(mode);
1048 uint weight_index = (mode & 0x10) != 0 ? 1 : 0; 990 uint weight_index = (mode & 0x10) != 0 ? 1 : 0;
1049 if (mode_layout < 5) { 991 if (mode_layout < 5) {
1050 weight_index |= (mode & 0x3) << 1; 992 weight_index |= (mode & 0x3) << 1;
@@ -1053,64 +995,34 @@ TexelWeightParams DecodeBlockInfo() {
1053 } 995 }
1054 weight_index -= 2; 996 weight_index -= 2;
1055 if ((mode_layout != 9) && ((mode & 0x200) != 0)) { 997 if ((mode_layout != 9) && ((mode & 0x200) != 0)) {
1056 const int max_weights[6] = int[6](7, 8, 9, 10, 11, 12); 998 weight_index += 6;
1057 params.max_weight = max_weights[weight_index];
1058 } else {
1059 const int max_weights[6] = int[6](1, 2, 3, 4, 5, 6);
1060 params.max_weight = max_weights[weight_index];
1061 }
1062 return params;
1063}
1064
1065void FillError(ivec3 coord) {
1066 for (uint j = 0; j < block_dims.y; j++) {
1067 for (uint i = 0; i < block_dims.x; i++) {
1068 imageStore(dest_image, coord + ivec3(i, j, 0), vec4(0.0, 0.0, 0.0, 0.0));
1069 }
1070 }
1071}
1072
1073void FillVoidExtentLDR(ivec3 coord) {
1074 StreamBits(52);
1075 uint r_u = StreamBits(16);
1076 uint g_u = StreamBits(16);
1077 uint b_u = StreamBits(16);
1078 uint a_u = StreamBits(16);
1079 float a = float(a_u) / 65535.0f;
1080 float r = float(r_u) / 65535.0f;
1081 float g = float(g_u) / 65535.0f;
1082 float b = float(b_u) / 65535.0f;
1083 for (uint j = 0; j < block_dims.y; j++) {
1084 for (uint i = 0; i < block_dims.x; i++) {
1085 imageStore(dest_image, coord + ivec3(i, j, 0), vec4(r, g, b, a));
1086 }
1087 } 999 }
1000 return weight_index + 1;
1088} 1001}
1089 1002
1090void DecompressBlock(ivec3 coord) { 1003void DecompressBlock(ivec3 coord) {
1091 TexelWeightParams params = DecodeBlockInfo(); 1004 uint mode = StreamBits(11);
1092 if (params.error_state) { 1005 if (IsError(mode)) {
1093 FillError(coord);
1094 return;
1095 }
1096 if (params.void_extent_hdr) {
1097 FillError(coord); 1006 FillError(coord);
1098 return; 1007 return;
1099 } 1008 }
1100 if (params.void_extent_ldr) { 1009 if ((mode & 0x1ff) == 0x1fc) {
1010 // params.void_extent_ldr = true;
1101 FillVoidExtentLDR(coord); 1011 FillVoidExtentLDR(coord);
1102 return; 1012 return;
1103 } 1013 }
1104 if ((params.size.x > block_dims.x) || (params.size.y > block_dims.y)) { 1014 const uvec2 size_params = DecodeBlockSize(mode);
1015 if ((size_params.x > block_dims.x) || (size_params.y > block_dims.y)) {
1105 FillError(coord); 1016 FillError(coord);
1106 return; 1017 return;
1107 } 1018 }
1108 uint num_partitions = StreamBits(2) + 1; 1019 const uint num_partitions = StreamBits(2) + 1;
1109 if (num_partitions > 4 || (num_partitions == 4 && params.dual_plane)) { 1020 const uint mode_layout = FindLayout(mode);
1021 const bool dual_plane = (mode_layout != 9) && ((mode & 0x400) != 0);
1022 if (num_partitions > 4 || (num_partitions == 4 && dual_plane)) {
1110 FillError(coord); 1023 FillError(coord);
1111 return; 1024 return;
1112 } 1025 }
1113 int plane_index = -1;
1114 uint partition_index = 1; 1026 uint partition_index = 1;
1115 uvec4 color_endpoint_mode = uvec4(0); 1027 uvec4 color_endpoint_mode = uvec4(0);
1116 uint ced_pointer = 0; 1028 uint ced_pointer = 0;
@@ -1122,8 +1034,9 @@ void DecompressBlock(ivec3 coord) {
1122 partition_index = StreamBits(10); 1034 partition_index = StreamBits(10);
1123 base_cem = StreamBits(6); 1035 base_cem = StreamBits(6);
1124 } 1036 }
1125 uint base_mode = base_cem & 3; 1037 const uint base_mode = base_cem & 3;
1126 uint weight_bits = GetPackedBitSize(params.size, params.dual_plane, params.max_weight); 1038 const uint max_weight = DecodeMaxWeight(mode);
1039 const uint weight_bits = GetPackedBitSize(size_params, dual_plane, max_weight);
1127 uint remaining_bits = 128 - weight_bits - total_bitsread; 1040 uint remaining_bits = 128 - weight_bits - total_bitsread;
1128 uint extra_cem_bits = 0; 1041 uint extra_cem_bits = 0;
1129 if (base_mode > 0) { 1042 if (base_mode > 0) {
@@ -1142,10 +1055,7 @@ void DecompressBlock(ivec3 coord) {
1142 } 1055 }
1143 } 1056 }
1144 remaining_bits -= extra_cem_bits; 1057 remaining_bits -= extra_cem_bits;
1145 uint plane_selector_bits = 0; 1058 const uint plane_selector_bits = dual_plane ? 2 : 0;
1146 if (params.dual_plane) {
1147 plane_selector_bits = 2;
1148 }
1149 remaining_bits -= plane_selector_bits; 1059 remaining_bits -= plane_selector_bits;
1150 if (remaining_bits > 128) { 1060 if (remaining_bits > 128) {
1151 // Bad data, more remaining bits than 4 bytes 1061 // Bad data, more remaining bits than 4 bytes
@@ -1153,17 +1063,17 @@ void DecompressBlock(ivec3 coord) {
1153 return; 1063 return;
1154 } 1064 }
1155 // Read color data... 1065 // Read color data...
1156 uint color_data_bits = remaining_bits; 1066 const uint color_data_bits = remaining_bits;
1157 while (remaining_bits > 0) { 1067 while (remaining_bits > 0) {
1158 int nb = int(min(remaining_bits, 32U)); 1068 const int nb = int(min(remaining_bits, 32U));
1159 uint b = StreamBits(nb); 1069 const uint b = StreamBits(nb);
1160 color_endpoint_data[ced_pointer] = uint(bitfieldExtract(b, 0, nb)); 1070 color_endpoint_data[ced_pointer] = uint(bitfieldExtract(b, 0, nb));
1161 ++ced_pointer; 1071 ++ced_pointer;
1162 remaining_bits -= nb; 1072 remaining_bits -= nb;
1163 } 1073 }
1164 plane_index = int(StreamBits(plane_selector_bits)); 1074 const uint plane_index = uint(StreamBits(plane_selector_bits));
1165 if (base_mode > 0) { 1075 if (base_mode > 0) {
1166 uint extra_cem = StreamBits(extra_cem_bits); 1076 const uint extra_cem = StreamBits(extra_cem_bits);
1167 uint cem = (extra_cem << 6) | base_cem; 1077 uint cem = (extra_cem << 6) | base_cem;
1168 cem >>= 2; 1078 cem >>= 2;
1169 uvec4 C = uvec4(0); 1079 uvec4 C = uvec4(0);
@@ -1185,70 +1095,80 @@ void DecompressBlock(ivec3 coord) {
1185 color_endpoint_mode[i] |= M[i]; 1095 color_endpoint_mode[i] |= M[i];
1186 } 1096 }
1187 } else if (num_partitions > 1) { 1097 } else if (num_partitions > 1) {
1188 uint cem = base_cem >> 2; 1098 const uint cem = base_cem >> 2;
1189 for (uint i = 0; i < num_partitions; i++) { 1099 for (uint i = 0; i < num_partitions; i++) {
1190 color_endpoint_mode[i] = cem; 1100 color_endpoint_mode[i] = cem;
1191 } 1101 }
1192 } 1102 }
1193 DecodeColorValues(color_endpoint_mode, num_partitions, color_data_bits);
1194 1103
1195 uvec4 endpoints[4][2]; 1104 uvec4 endpoints0[4];
1196 for (uint i = 0; i < num_partitions; i++) { 1105 uvec4 endpoints1[4];
1197 ComputeEndpoints(endpoints[i][0], endpoints[i][1], color_endpoint_mode[i]); 1106 {
1107 // This decode phase should at most push 32 elements into the vector
1108 result_vector_max_index = 32;
1109 uint color_values[32];
1110 uint colvals_index = 0;
1111 DecodeColorValues(color_endpoint_mode, num_partitions, color_data_bits, color_values);
1112 for (uint i = 0; i < num_partitions; i++) {
1113 ComputeEndpoints(endpoints0[i], endpoints1[i], color_endpoint_mode[i], color_values,
1114 colvals_index);
1115 }
1198 } 1116 }
1117 color_endpoint_data = local_buff;
1118 color_endpoint_data = bitfieldReverse(color_endpoint_data).wzyx;
1119 const uint clear_byte_start = (weight_bits >> 3) + 1;
1199 1120
1200 texel_weight_data = local_buff; 1121 const uint byte_insert = ExtractBits(color_endpoint_data, int(clear_byte_start - 1) * 8, 8) &
1201 texel_weight_data = bitfieldReverse(texel_weight_data).wzyx; 1122 uint(((1 << (weight_bits % 8)) - 1));
1202 uint clear_byte_start = 1123 const uint vec_index = (clear_byte_start - 1) >> 2;
1203 (GetPackedBitSize(params.size, params.dual_plane, params.max_weight) >> 3) + 1; 1124 color_endpoint_data[vec_index] = bitfieldInsert(color_endpoint_data[vec_index], byte_insert,
1204 1125 int((clear_byte_start - 1) % 4) * 8, 8);
1205 uint byte_insert = ExtractBits(texel_weight_data, int(clear_byte_start - 1) * 8, 8) &
1206 uint(
1207 ((1 << (GetPackedBitSize(params.size, params.dual_plane, params.max_weight) % 8)) - 1));
1208 uint vec_index = (clear_byte_start - 1) >> 2;
1209 texel_weight_data[vec_index] =
1210 bitfieldInsert(texel_weight_data[vec_index], byte_insert, int((clear_byte_start - 1) % 4) * 8, 8);
1211 for (uint i = clear_byte_start; i < 16; ++i) { 1126 for (uint i = clear_byte_start; i < 16; ++i) {
1212 uint idx = i >> 2; 1127 const uint idx = i >> 2;
1213 texel_weight_data[idx] = bitfieldInsert(texel_weight_data[idx], 0, int(i % 4) * 8, 8); 1128 color_endpoint_data[idx] = bitfieldInsert(color_endpoint_data[idx], 0, int(i % 4) * 8, 8);
1214 } 1129 }
1215 texel_flag = true; // use texel "vector" and bit stream in integer decoding
1216 DecodeIntegerSequence(params.max_weight, GetNumWeightValues(params.size, params.dual_plane));
1217 1130
1218 UnquantizeTexelWeights(params.dual_plane, params.size); 1131 // Re-init vector variables for next decode phase
1132 result_index = 0;
1133 color_bitsread = 0;
1134 result_limit_reached = false;
1219 1135
1136 // The limit for the Unquantize phase, avoids decoding more data than needed.
1137 result_vector_max_index = size_params.x * size_params.y;
1138 if (dual_plane) {
1139 result_vector_max_index *= 2;
1140 }
1141 DecodeIntegerSequence(max_weight, GetNumWeightValues(size_params, dual_plane));
1142
1143 UnquantizeTexelWeights(size_params, dual_plane);
1220 for (uint j = 0; j < block_dims.y; j++) { 1144 for (uint j = 0; j < block_dims.y; j++) {
1221 for (uint i = 0; i < block_dims.x; i++) { 1145 for (uint i = 0; i < block_dims.x; i++) {
1222 uint local_partition = 0; 1146 uint local_partition = 0;
1223 if (num_partitions > 1) { 1147 if (num_partitions > 1) {
1224 local_partition = Select2DPartition(partition_index, i, j, num_partitions, 1148 local_partition = Select2DPartition(partition_index, i, j, num_partitions);
1225 (block_dims.y * block_dims.x) < 32);
1226 }
1227 vec4 p;
1228 uvec4 C0 = ReplicateByteTo16(endpoints[local_partition][0]);
1229 uvec4 C1 = ReplicateByteTo16(endpoints[local_partition][1]);
1230 uvec4 plane_vec = uvec4(0);
1231 uvec4 weight_vec = uvec4(0);
1232 for (uint c = 0; c < 4; c++) {
1233 if (params.dual_plane && (((plane_index + 1) & 3) == c)) {
1234 plane_vec[c] = 1;
1235 }
1236 weight_vec[c] = unquantized_texel_weights[plane_vec[c]][j * block_dims.x + i];
1237 } 1149 }
1238 vec4 Cf = vec4((C0 * (uvec4(64) - weight_vec) + C1 * weight_vec + uvec4(32)) / 64); 1150 const uvec4 C0 = ReplicateByteTo16(endpoints0[local_partition]);
1239 p = (Cf / 65535.0); 1151 const uvec4 C1 = ReplicateByteTo16(endpoints1[local_partition]);
1152 const uvec4 weight_vec = GetUnquantizedWeightVector(j, i, size_params, plane_index, dual_plane);
1153 const vec4 Cf =
1154 vec4((C0 * (uvec4(64) - weight_vec) + C1 * weight_vec + uvec4(32)) / 64);
1155 const vec4 p = (Cf / 65535.0f);
1240 imageStore(dest_image, coord + ivec3(i, j, 0), p.gbar); 1156 imageStore(dest_image, coord + ivec3(i, j, 0), p.gbar);
1241 } 1157 }
1242 } 1158 }
1243} 1159}
1244 1160
1161uint SwizzleOffset(uvec2 pos) {
1162 const uint x = pos.x;
1163 const uint y = pos.y;
1164 return ((x % 64) / 32) * 256 + ((y % 8) / 2) * 64 +
1165 ((x % 32) / 16) * 32 + (y % 2) * 16 + (x % 16);
1166}
1167
1245void main() { 1168void main() {
1246 uvec3 pos = gl_GlobalInvocationID; 1169 uvec3 pos = gl_GlobalInvocationID;
1247 pos.x <<= BYTES_PER_BLOCK_LOG2; 1170 pos.x <<= BYTES_PER_BLOCK_LOG2;
1248
1249 // Read as soon as possible due to its latency
1250 const uint swizzle = SwizzleOffset(pos.xy); 1171 const uint swizzle = SwizzleOffset(pos.xy);
1251
1252 const uint block_y = pos.y >> GOB_SIZE_Y_SHIFT; 1172 const uint block_y = pos.y >> GOB_SIZE_Y_SHIFT;
1253 1173
1254 uint offset = 0; 1174 uint offset = 0;
@@ -1262,8 +1182,6 @@ void main() {
1262 if (any(greaterThanEqual(coord, imageSize(dest_image)))) { 1182 if (any(greaterThanEqual(coord, imageSize(dest_image)))) {
1263 return; 1183 return;
1264 } 1184 }
1265 current_index = 0;
1266 bitsread = 0;
1267 local_buff = astc_data[offset / 16]; 1185 local_buff = astc_data[offset / 16];
1268 DecompressBlock(coord); 1186 DecompressBlock(coord);
1269} 1187}
diff --git a/src/video_core/renderer_opengl/util_shaders.cpp b/src/video_core/renderer_opengl/util_shaders.cpp
index 544982d18..c437013e6 100644
--- a/src/video_core/renderer_opengl/util_shaders.cpp
+++ b/src/video_core/renderer_opengl/util_shaders.cpp
@@ -68,6 +68,7 @@ void UtilShaders::ASTCDecode(Image& image, const StagingBufferMap& map,
68 std::span<const VideoCommon::SwizzleParameters> swizzles) { 68 std::span<const VideoCommon::SwizzleParameters> swizzles) {
69 static constexpr GLuint BINDING_INPUT_BUFFER = 0; 69 static constexpr GLuint BINDING_INPUT_BUFFER = 0;
70 static constexpr GLuint BINDING_OUTPUT_IMAGE = 0; 70 static constexpr GLuint BINDING_OUTPUT_IMAGE = 0;
71 program_manager.LocalMemoryWarmup();
71 72
72 const Extent2D tile_size{ 73 const Extent2D tile_size{
73 .width = VideoCore::Surface::DefaultBlockWidth(image.info.format), 74 .width = VideoCore::Surface::DefaultBlockWidth(image.info.format),