summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorGravatar Ameer J2023-08-01 17:22:03 -0400
committerGravatar Ameer J2023-08-06 14:54:57 -0400
commit5248fa926dd53948b0df4f93c50107dc30ae2305 (patch)
tree3aa35f10a7a17d5a46d5579a914b858a46777551 /src
parentminor redundancy cleanup (diff)
downloadyuzu-5248fa926dd53948b0df4f93c50107dc30ae2305.tar.gz
yuzu-5248fa926dd53948b0df4f93c50107dc30ae2305.tar.xz
yuzu-5248fa926dd53948b0df4f93c50107dc30ae2305.zip
const, pack result_vector and replicate tables,
undo amd opts
Diffstat (limited to 'src')
-rw-r--r--src/video_core/host_shaders/astc_decoder.comp487
1 files changed, 260 insertions, 227 deletions
diff --git a/src/video_core/host_shaders/astc_decoder.comp b/src/video_core/host_shaders/astc_decoder.comp
index 90b40c55f..e8801b0ff 100644
--- a/src/video_core/host_shaders/astc_decoder.comp
+++ b/src/video_core/host_shaders/astc_decoder.comp
@@ -57,20 +57,40 @@ const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT;
57 57
58const uint BYTES_PER_BLOCK_LOG2 = 4; 58const uint BYTES_PER_BLOCK_LOG2 = 4;
59 59
60const int JUST_BITS = 0; 60const uint JUST_BITS = 0u;
61const int QUINT = 1; 61const uint QUINT = 1u;
62const int TRIT = 2; 62const uint TRIT = 2u;
63 63
64// ASTC Encodings data, sorted in ascending order based on their BitLength value 64// ASTC Encodings data, sorted in ascending order based on their BitLength value
65// (see GetBitLength() function) 65// (see GetBitLength() function)
66const EncodingData encoding_values[22] = EncodingData[]( 66const uvec4 encoding_values[6] = uvec4[](
67 EncodingData(JUST_BITS), EncodingData(JUST_BITS | (1u << 8u)), EncodingData(TRIT), EncodingData(JUST_BITS | (2u << 8u)), 67 uvec4((JUST_BITS), (JUST_BITS | (1u << 8u)), (TRIT), (JUST_BITS | (2u << 8u))),
68 EncodingData(QUINT), EncodingData(TRIT | (1u << 8u)), EncodingData(JUST_BITS | (3u << 8u)), EncodingData(QUINT | (1u << 8u)), 68 uvec4((QUINT), (TRIT | (1u << 8u)), (JUST_BITS | (3u << 8u)), (QUINT | (1u << 8u))),
69 EncodingData(TRIT | (2u << 8u)), EncodingData(JUST_BITS | (4u << 8u)), EncodingData(QUINT | (2u << 8u)), EncodingData(TRIT | (3u << 8u)), 69 uvec4((TRIT | (2u << 8u)), (JUST_BITS | (4u << 8u)), (QUINT | (2u << 8u)), (TRIT | (3u << 8u))),
70 EncodingData(JUST_BITS | (5u << 8u)), EncodingData(QUINT | (3u << 8u)), EncodingData(TRIT | (4u << 8u)), EncodingData(JUST_BITS | (6u << 8u)), 70 uvec4((JUST_BITS | (5u << 8u)), (QUINT | (3u << 8u)), (TRIT | (4u << 8u)), (JUST_BITS | (6u << 8u))),
71 EncodingData(QUINT | (4u << 8u)), EncodingData(TRIT | (5u << 8u)), EncodingData(JUST_BITS | (7u << 8u)), EncodingData(QUINT | (5u << 8u)), 71 uvec4((QUINT | (4u << 8u)), (TRIT | (5u << 8u)), (JUST_BITS | (7u << 8u)), (QUINT | (5u << 8u))),
72 EncodingData(TRIT | (6u << 8u)), EncodingData(JUST_BITS | (8u << 8u)) 72 uvec4((TRIT | (6u << 8u)), (JUST_BITS | (8u << 8u)), 0u, 0u));
73); 73
74// Input ASTC texture globals
75int total_bitsread = 0;
76uvec4 local_buff;
77
78// Color data globals
79uvec4 color_endpoint_data;
80int color_bitsread = 0;
81
82// Global "vector" to be pushed into when decoding
83// At most will require BLOCK_WIDTH x BLOCK_HEIGHT in single plane mode
84// At most will require BLOCK_WIDTH x BLOCK_HEIGHT x 2 in dual plane mode
85// So the maximum would be 144 (12 x 12) elements, x 2 for two planes
86#define DIVCEIL(number, divisor) (number + divisor - 1) / divisor
87#define ARRAY_NUM_ELEMENTS 144
88#define VECTOR_ARRAY_SIZE DIVCEIL(ARRAY_NUM_ELEMENTS * 2, 4)
89uvec4 result_vector[VECTOR_ARRAY_SIZE];
90
91int result_index = 0;
92uint result_vector_max_index;
93bool result_limit_reached = false;
74 94
75// EncodingData helpers 95// EncodingData helpers
76uint Encoding(EncodingData val) { 96uint Encoding(EncodingData val) {
@@ -104,78 +124,17 @@ EncodingData CreateEncodingData(uint encoding, uint num_bits, uint bit_val, uint
104 ((bit_val) << 16u) | ((quint_trit_val) << 24u)); 124 ((bit_val) << 16u) | ((quint_trit_val) << 24u));
105} 125}
106 126
107// The following constants are expanded variants of the Replicate()
108// function calls corresponding to the following arguments:
109// value: index into the generated table
110// num_bits: the after "REPLICATE" in the table name. i.e. 4 is num_bits in REPLICATE_4.
111// to_bit: the integer after "TO_"
112const uint REPLICATE_BIT_TO_7_TABLE[2] = uint[](0, 127);
113const uint REPLICATE_1_BIT_TO_9_TABLE[2] = uint[](0, 511);
114
115const uint REPLICATE_1_BIT_TO_8_TABLE[2] = uint[](0, 255);
116const uint REPLICATE_2_BIT_TO_8_TABLE[4] = uint[](0, 85, 170, 255);
117const uint REPLICATE_3_BIT_TO_8_TABLE[8] = uint[](0, 36, 73, 109, 146, 182, 219, 255);
118const uint REPLICATE_4_BIT_TO_8_TABLE[16] =
119 uint[](0, 17, 34, 51, 68, 85, 102, 119, 136, 153, 170, 187, 204, 221, 238, 255);
120const uint REPLICATE_5_BIT_TO_8_TABLE[32] =
121 uint[](0, 8, 16, 24, 33, 41, 49, 57, 66, 74, 82, 90, 99, 107, 115, 123, 132, 140, 148, 156, 165,
122 173, 181, 189, 198, 206, 214, 222, 231, 239, 247, 255);
123const uint REPLICATE_1_BIT_TO_6_TABLE[2] = uint[](0, 63);
124const uint REPLICATE_2_BIT_TO_6_TABLE[4] = uint[](0, 21, 42, 63);
125const uint REPLICATE_3_BIT_TO_6_TABLE[8] = uint[](0, 9, 18, 27, 36, 45, 54, 63);
126const uint REPLICATE_4_BIT_TO_6_TABLE[16] =
127 uint[](0, 4, 8, 12, 17, 21, 25, 29, 34, 38, 42, 46, 51, 55, 59, 63);
128const uint REPLICATE_5_BIT_TO_6_TABLE[32] =
129 uint[](0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 33, 35, 37, 39, 41, 43, 45,
130 47, 49, 51, 53, 55, 57, 59, 61, 63);
131const uint REPLICATE_6_BIT_TO_8_TABLE[64] =
132 uint[](0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 65, 69, 73, 77, 81, 85, 89,
133 93, 97, 101, 105, 109, 113, 117, 121, 125, 130, 134, 138, 142, 146, 150, 154, 158, 162,
134 166, 170, 174, 178, 182, 186, 190, 195, 199, 203, 207, 211, 215, 219, 223, 227, 231, 235,
135 239, 243, 247, 251, 255);
136const uint REPLICATE_7_BIT_TO_8_TABLE[128] =
137 uint[](0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44,
138 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88,
139 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126,
140 129, 131, 133, 135, 137, 139, 141, 143, 145, 147, 149, 151, 153, 155, 157, 159, 161, 163,
141 165, 167, 169, 171, 173, 175, 177, 179, 181, 183, 185, 187, 189, 191, 193, 195, 197, 199,
142 201, 203, 205, 207, 209, 211, 213, 215, 217, 219, 221, 223, 225, 227, 229, 231, 233, 235,
143 237, 239, 241, 243, 245, 247, 249, 251, 253, 255);
144
145// Input ASTC texture globals
146int total_bitsread = 0;
147uvec4 local_buff;
148
149// Color data globals
150uvec4 color_endpoint_data;
151int color_bitsread = 0;
152
153// Four values, two endpoints, four maximum partitions
154uint color_values[32];
155int colvals_index = 0;
156
157// Global "vectors" to be pushed into when decoding
158EncodingData result_vector[144];
159int result_index = 0;
160 127
161// Replicates low num_bits such that [(to_bit - 1):(to_bit - 1 - from_bit)] 128void ResultEmplaceBack(EncodingData val) {
162// is the same as [(num_bits - 1):0] and repeats all the way down. 129 if (result_index >= result_vector_max_index) {
163uint Replicate(uint val, uint num_bits, uint to_bit) { 130 // Alert callers to avoid decoding more than needed by this phase
164 const uint v = val & uint((1 << num_bits) - 1); 131 result_limit_reached = true;
165 uint res = v; 132 return;
166 uint reslen = num_bits;
167 while (reslen < to_bit) {
168 uint comp = 0;
169 if (num_bits > to_bit - reslen) {
170 uint newshift = to_bit - reslen;
171 comp = num_bits - newshift;
172 num_bits = newshift;
173 }
174 res = uint(res << num_bits);
175 res = uint(res | (v >> comp));
176 reslen += num_bits;
177 } 133 }
178 return res; 134 const uint array_index = result_index / 4;
135 const uint vector_index = result_index % 4;
136 result_vector[array_index][vector_index] = val.data;
137 ++result_index;
179} 138}
180 139
181uvec4 ReplicateByteTo16(uvec4 value) { 140uvec4 ReplicateByteTo16(uvec4 value) {
@@ -183,64 +142,105 @@ uvec4 ReplicateByteTo16(uvec4 value) {
183} 142}
184 143
185uint ReplicateBitTo7(uint value) { 144uint ReplicateBitTo7(uint value) {
186 return REPLICATE_BIT_TO_7_TABLE[value]; 145 return value * 127;
187} 146}
188 147
189uint ReplicateBitTo9(uint value) { 148uint ReplicateBitTo9(uint value) {
190 return REPLICATE_1_BIT_TO_9_TABLE[value]; 149 return value * 511;
191} 150}
192 151
193uint FastReplicate(uint value, uint num_bits, uint to_bit) { 152uint FastReplicateTo8(uint value, uint num_bits) {
194 if (num_bits == 0) { 153 if (value == 0) {
195 return 0; 154 return 0;
196 } 155 }
197 if (num_bits == to_bit) { 156 const uint array_index = value / 4;
198 return value; 157 const uint vector_index = value % 4;
158 switch (num_bits) {
159 case 1:
160 return 255;
161 case 2: {
162 const uvec4 REPLICATE_2_BIT_TO_8_TABLE = (uvec4(0, 85, 170, 255));
163 return REPLICATE_2_BIT_TO_8_TABLE[vector_index];
199 } 164 }
200 if (to_bit == 6) { 165 case 3: {
201 switch (num_bits) { 166 const uvec4 REPLICATE_3_BIT_TO_8_TABLE[2] =
202 case 1: 167 uvec4[](uvec4(0, 36, 73, 109), uvec4(146, 182, 219, 255));
203 return REPLICATE_1_BIT_TO_6_TABLE[value]; 168 return REPLICATE_3_BIT_TO_8_TABLE[array_index][vector_index];
204 case 2:
205 return REPLICATE_2_BIT_TO_6_TABLE[value];
206 case 3:
207 return REPLICATE_3_BIT_TO_6_TABLE[value];
208 case 4:
209 return REPLICATE_4_BIT_TO_6_TABLE[value];
210 case 5:
211 return REPLICATE_5_BIT_TO_6_TABLE[value];
212 default:
213 break;
214 }
215 } else { /* if (to_bit == 8) */
216 switch (num_bits) {
217 case 1:
218 return REPLICATE_1_BIT_TO_8_TABLE[value];
219 case 2:
220 return REPLICATE_2_BIT_TO_8_TABLE[value];
221 case 3:
222 return REPLICATE_3_BIT_TO_8_TABLE[value];
223 case 4:
224 return REPLICATE_4_BIT_TO_8_TABLE[value];
225 case 5:
226 return REPLICATE_5_BIT_TO_8_TABLE[value];
227 case 6:
228 return REPLICATE_6_BIT_TO_8_TABLE[value];
229 case 7:
230 return REPLICATE_7_BIT_TO_8_TABLE[value];
231 default:
232 break;
233 }
234 } 169 }
235 return Replicate(value, num_bits, to_bit); 170 case 4: {
236} 171 const uvec4 REPLICATE_4_BIT_TO_8_TABLE[4] =
237 172 uvec4[](uvec4(0, 17, 34, 51), uvec4(68, 85, 102, 119), uvec4(136, 153, 170, 187),
238uint FastReplicateTo8(uint value, uint num_bits) { 173 uvec4(204, 221, 238, 255));
239 return FastReplicate(value, num_bits, 8); 174 return REPLICATE_4_BIT_TO_8_TABLE[array_index][vector_index];
175 }
176 case 5: {
177 const uvec4 REPLICATE_5_BIT_TO_8_TABLE[8] =
178 uvec4[](uvec4(0, 8, 16, 24), uvec4(33, 41, 49, 57), uvec4(66, 74, 82, 90),
179 uvec4(99, 107, 115, 123), uvec4(132, 140, 148, 156), uvec4(165, 173, 181, 189),
180 uvec4(198, 206, 214, 222), uvec4(231, 239, 247, 255));
181 return REPLICATE_5_BIT_TO_8_TABLE[array_index][vector_index];
182 }
183 case 6: {
184 const uvec4 REPLICATE_6_BIT_TO_8_TABLE[16] = uvec4[](
185 uvec4(0, 4, 8, 12), uvec4(16, 20, 24, 28), uvec4(32, 36, 40, 44), uvec4(48, 52, 56, 60),
186 uvec4(65, 69, 73, 77), uvec4(81, 85, 89, 93), uvec4(97, 101, 105, 109),
187 uvec4(113, 117, 121, 125), uvec4(130, 134, 138, 142), uvec4(146, 150, 154, 158),
188 uvec4(162, 166, 170, 174), uvec4(178, 182, 186, 190), uvec4(195, 199, 203, 207),
189 uvec4(211, 215, 219, 223), uvec4(227, 231, 235, 239), uvec4(243, 247, 251, 255));
190 return REPLICATE_6_BIT_TO_8_TABLE[array_index][vector_index];
191 }
192 case 7: {
193 const uvec4 REPLICATE_7_BIT_TO_8_TABLE[32] =
194 uvec4[](uvec4(0, 2, 4, 6), uvec4(8, 10, 12, 14), uvec4(16, 18, 20, 22),
195 uvec4(24, 26, 28, 30), uvec4(32, 34, 36, 38), uvec4(40, 42, 44, 46),
196 uvec4(48, 50, 52, 54), uvec4(56, 58, 60, 62), uvec4(64, 66, 68, 70),
197 uvec4(72, 74, 76, 78), uvec4(80, 82, 84, 86), uvec4(88, 90, 92, 94),
198 uvec4(96, 98, 100, 102), uvec4(104, 106, 108, 110), uvec4(112, 114, 116, 118),
199 uvec4(120, 122, 124, 126), uvec4(129, 131, 133, 135), uvec4(137, 139, 141, 143),
200 uvec4(145, 147, 149, 151), uvec4(153, 155, 157, 159), uvec4(161, 163, 165, 167),
201 uvec4(169, 171, 173, 175), uvec4(177, 179, 181, 183), uvec4(185, 187, 189, 191),
202 uvec4(193, 195, 197, 199), uvec4(201, 203, 205, 207), uvec4(209, 211, 213, 215),
203 uvec4(217, 219, 221, 223), uvec4(225, 227, 229, 231), uvec4(233, 235, 237, 239),
204 uvec4(241, 243, 245, 247), uvec4(249, 251, 253, 255));
205 return REPLICATE_7_BIT_TO_8_TABLE[array_index][vector_index];
206 }
207 }
208 return value;
240} 209}
241 210
242uint FastReplicateTo6(uint value, uint num_bits) { 211uint FastReplicateTo6(uint value, uint num_bits) {
243 return FastReplicate(value, num_bits, 6); 212 if (value == 0) {
213 return 0;
214 }
215 const uint array_index = value / 4;
216 const uint vector_index = value % 4;
217 switch (num_bits) {
218 case 1:
219 return 63;
220 case 2: {
221 const uvec4 REPLICATE_2_BIT_TO_6_TABLE = uvec4(0, 21, 42, 63);
222 return REPLICATE_2_BIT_TO_6_TABLE[vector_index];
223 }
224 case 3: {
225 const uvec4 REPLICATE_3_BIT_TO_6_TABLE[2] =
226 uvec4[](uvec4(0, 9, 18, 27), uvec4(36, 45, 54, 63));
227 return REPLICATE_3_BIT_TO_6_TABLE[array_index][vector_index];
228 }
229 case 4: {
230 const uvec4 REPLICATE_4_BIT_TO_6_TABLE[4] =
231 uvec4[](uvec4(0, 4, 8, 12), uvec4(17, 21, 25, 29), uvec4(34, 38, 42, 46),
232 uvec4(51, 55, 59, 63));
233 return REPLICATE_4_BIT_TO_6_TABLE[array_index][vector_index];
234 }
235 case 5: {
236 const uvec4 REPLICATE_5_BIT_TO_6_TABLE[8] =
237 uvec4[](uvec4(0, 2, 4, 6), uvec4(8, 10, 12, 14), uvec4(16, 18, 20, 22),
238 uvec4(24, 26, 28, 30), uvec4(33, 35, 37, 39), uvec4(41, 43, 45, 47),
239 uvec4(49, 51, 53, 55), uvec4(57, 59, 61, 63));
240 return REPLICATE_5_BIT_TO_6_TABLE[array_index][vector_index];
241 }
242 }
243 return value;
244} 244}
245 245
246uint Div3Floor(uint v) { 246uint Div3Floor(uint v) {
@@ -281,7 +281,7 @@ uint Select2DPartition(uint seed, uint x, uint y, uint partition_count, bool sma
281 281
282 seed += (partition_count - 1) * 1024; 282 seed += (partition_count - 1) * 1024;
283 283
284 uint rnum = Hash52(uint(seed)); 284 const uint rnum = Hash52(uint(seed));
285 uint seed1 = uint(rnum & 0xF); 285 uint seed1 = uint(rnum & 0xF);
286 uint seed2 = uint((rnum >> 4) & 0xF); 286 uint seed2 = uint((rnum >> 4) & 0xF);
287 uint seed3 = uint((rnum >> 8) & 0xF); 287 uint seed3 = uint((rnum >> 8) & 0xF);
@@ -364,8 +364,8 @@ uint ExtractBits(uvec4 payload, int offset, int bits) {
364} 364}
365 365
366uint StreamBits(uint num_bits) { 366uint StreamBits(uint num_bits) {
367 int int_bits = int(num_bits); 367 const int int_bits = int(num_bits);
368 uint ret = ExtractBits(local_buff, total_bitsread, int_bits); 368 const uint ret = ExtractBits(local_buff, total_bitsread, int_bits);
369 total_bitsread += int_bits; 369 total_bitsread += int_bits;
370 return ret; 370 return ret;
371} 371}
@@ -382,14 +382,18 @@ uint StreamColorBits(uint num_bits) {
382 return ret; 382 return ret;
383} 383}
384 384
385void ResultEmplaceBack(EncodingData val) { 385EncodingData GetEncodingFromVector(uint index) {
386 result_vector[result_index] = val; 386 const uint array_index = index / 4;
387 ++result_index; 387 const uint vector_index = index % 4;
388
389 const uint data = result_vector[array_index][vector_index];
390 return EncodingData(data);
388} 391}
389 392
390// Returns the number of bits required to encode n_vals values. 393// Returns the number of bits required to encode n_vals values.
391uint GetBitLength(uint n_vals, uint encoding_index) { 394uint GetBitLength(uint n_vals, uint encoding_index) {
392 const EncodingData encoding_value = encoding_values[encoding_index]; 395 const EncodingData encoding_value =
396 EncodingData(encoding_values[encoding_index / 4][encoding_index % 4]);
393 const uint encoding = Encoding(encoding_value); 397 const uint encoding = Encoding(encoding_value);
394 uint total_bits = NumBits(encoding_value) * n_vals; 398 uint total_bits = NumBits(encoding_value) * n_vals;
395 if (encoding == TRIT) { 399 if (encoding == TRIT) {
@@ -409,7 +413,7 @@ uint GetNumWeightValues(uvec2 size, bool dual_plane) {
409} 413}
410 414
411uint GetPackedBitSize(uvec2 size, bool dual_plane, uint max_weight) { 415uint GetPackedBitSize(uvec2 size, bool dual_plane, uint max_weight) {
412 uint n_vals = GetNumWeightValues(size, dual_plane); 416 const uint n_vals = GetNumWeightValues(size, dual_plane);
413 return GetBitLength(n_vals, max_weight); 417 return GetBitLength(n_vals, max_weight);
414} 418}
415 419
@@ -418,13 +422,13 @@ uint BitsBracket(uint bits, uint pos) {
418} 422}
419 423
420uint BitsOp(uint bits, uint start, uint end) { 424uint BitsOp(uint bits, uint start, uint end) {
421 uint mask = (1 << (end - start + 1)) - 1; 425 const uint mask = (1 << (end - start + 1)) - 1;
422 return ((bits >> start) & mask); 426 return ((bits >> start) & mask);
423} 427}
424 428
425void DecodeQuintBlock(uint num_bits) { 429void DecodeQuintBlock(uint num_bits) {
426 uint m[3]; 430 uvec3 m;
427 uint q[3]; 431 uvec3 q;
428 uint Q; 432 uint Q;
429 m[0] = StreamColorBits(num_bits); 433 m[0] = StreamColorBits(num_bits);
430 Q = StreamColorBits(3); 434 Q = StreamColorBits(3);
@@ -433,25 +437,25 @@ void DecodeQuintBlock(uint num_bits) {
433 m[2] = StreamColorBits(num_bits); 437 m[2] = StreamColorBits(num_bits);
434 Q |= StreamColorBits(2) << 5; 438 Q |= StreamColorBits(2) << 5;
435 if (BitsOp(Q, 1, 2) == 3 && BitsOp(Q, 5, 6) == 0) { 439 if (BitsOp(Q, 1, 2) == 3 && BitsOp(Q, 5, 6) == 0) {
436 q[0] = 4; 440 q.x = 4;
437 q[1] = 4; 441 q.y = 4;
438 q[2] = (BitsBracket(Q, 0) << 2) | ((BitsBracket(Q, 4) & ~BitsBracket(Q, 0)) << 1) | 442 q.z = (BitsBracket(Q, 0) << 2) | ((BitsBracket(Q, 4) & ~BitsBracket(Q, 0)) << 1) |
439 (BitsBracket(Q, 3) & ~BitsBracket(Q, 0)); 443 (BitsBracket(Q, 3) & ~BitsBracket(Q, 0));
440 } else { 444 } else {
441 uint C = 0; 445 uint C = 0;
442 if (BitsOp(Q, 1, 2) == 3) { 446 if (BitsOp(Q, 1, 2) == 3) {
443 q[2] = 4; 447 q.z = 4;
444 C = (BitsOp(Q, 3, 4) << 3) | ((~BitsOp(Q, 5, 6) & 3) << 1) | BitsBracket(Q, 0); 448 C = (BitsOp(Q, 3, 4) << 3) | ((~BitsOp(Q, 5, 6) & 3) << 1) | BitsBracket(Q, 0);
445 } else { 449 } else {
446 q[2] = BitsOp(Q, 5, 6); 450 q.z = BitsOp(Q, 5, 6);
447 C = BitsOp(Q, 0, 4); 451 C = BitsOp(Q, 0, 4);
448 } 452 }
449 if (BitsOp(C, 0, 2) == 5) { 453 if (BitsOp(C, 0, 2) == 5) {
450 q[1] = 4; 454 q.y = 4;
451 q[0] = BitsOp(C, 3, 4); 455 q.x = BitsOp(C, 3, 4);
452 } else { 456 } else {
453 q[1] = BitsOp(C, 3, 4); 457 q.y = BitsOp(C, 3, 4);
454 q[0] = BitsOp(C, 0, 2); 458 q.x = BitsOp(C, 0, 2);
455 } 459 }
456 } 460 }
457 for (uint i = 0; i < 3; i++) { 461 for (uint i = 0; i < 3; i++) {
@@ -509,11 +513,11 @@ void DecodeTritBlock(uint num_bits) {
509} 513}
510 514
511void DecodeIntegerSequence(uint max_range, uint num_values) { 515void DecodeIntegerSequence(uint max_range, uint num_values) {
512 EncodingData val = encoding_values[max_range]; 516 EncodingData val = EncodingData(encoding_values[max_range / 4][max_range % 4]);
513 const uint encoding = Encoding(val); 517 const uint encoding = Encoding(val);
514 const uint num_bits = NumBits(val); 518 const uint num_bits = NumBits(val);
515 uint vals_decoded = 0; 519 uint vals_decoded = 0;
516 while (vals_decoded < num_values) { 520 while (vals_decoded < num_values && !result_limit_reached) {
517 switch (encoding) { 521 switch (encoding) {
518 case QUINT: 522 case QUINT:
519 DecodeQuintBlock(num_bits); 523 DecodeQuintBlock(num_bits);
@@ -532,7 +536,8 @@ void DecodeIntegerSequence(uint max_range, uint num_values) {
532 } 536 }
533} 537}
534 538
535void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits) { 539void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits,
540 out uvec4 color_values[8]) {
536 uint num_values = 0; 541 uint num_values = 0;
537 for (uint i = 0; i < num_partitions; i++) { 542 for (uint i = 0; i < num_partitions; i++) {
538 num_values += ((modes[i] >> 2) + 1) << 1; 543 num_values += ((modes[i] >> 2) + 1) << 1;
@@ -540,8 +545,8 @@ void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits) {
540 // Find the largest encoding that's within color_data_bits 545 // Find the largest encoding that's within color_data_bits
541 // TODO(ameerj): profile with binary search 546 // TODO(ameerj): profile with binary search
542 int range = 0; 547 int range = 0;
543 while (++range < encoding_values.length()) { 548 while (++range < ((encoding_values.length() * 4) - 2)) {
544 uint bit_length = GetBitLength(num_values, range); 549 const uint bit_length = GetBitLength(num_values, range);
545 if (bit_length > color_data_bits) { 550 if (bit_length > color_data_bits) {
546 break; 551 break;
547 } 552 }
@@ -552,7 +557,7 @@ void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits) {
552 if (out_index >= num_values) { 557 if (out_index >= num_values) {
553 break; 558 break;
554 } 559 }
555 const EncodingData val = result_vector[itr]; 560 const EncodingData val = GetEncodingFromVector(itr);
556 const uint encoding = Encoding(val); 561 const uint encoding = Encoding(val);
557 const uint bitlen = NumBits(val); 562 const uint bitlen = NumBits(val);
558 const uint bitval = BitValue(val); 563 const uint bitval = BitValue(val);
@@ -560,7 +565,8 @@ void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits) {
560 A = ReplicateBitTo9((bitval & 1)); 565 A = ReplicateBitTo9((bitval & 1));
561 switch (encoding) { 566 switch (encoding) {
562 case JUST_BITS: 567 case JUST_BITS:
563 color_values[out_index++] = FastReplicateTo8(bitval, bitlen); 568 color_values[out_index / 4][out_index % 4] = FastReplicateTo8(bitval, bitlen);
569 ++out_index;
564 break; 570 break;
565 case TRIT: { 571 case TRIT: {
566 D = QuintTritValue(val); 572 D = QuintTritValue(val);
@@ -570,31 +576,31 @@ void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits) {
570 break; 576 break;
571 case 2: { 577 case 2: {
572 C = 93; 578 C = 93;
573 uint b = (bitval >> 1) & 1; 579 const uint b = (bitval >> 1) & 1;
574 B = (b << 8) | (b << 4) | (b << 2) | (b << 1); 580 B = (b << 8) | (b << 4) | (b << 2) | (b << 1);
575 break; 581 break;
576 } 582 }
577 case 3: { 583 case 3: {
578 C = 44; 584 C = 44;
579 uint cb = (bitval >> 1) & 3; 585 const uint cb = (bitval >> 1) & 3;
580 B = (cb << 7) | (cb << 2) | cb; 586 B = (cb << 7) | (cb << 2) | cb;
581 break; 587 break;
582 } 588 }
583 case 4: { 589 case 4: {
584 C = 22; 590 C = 22;
585 uint dcb = (bitval >> 1) & 7; 591 const uint dcb = (bitval >> 1) & 7;
586 B = (dcb << 6) | dcb; 592 B = (dcb << 6) | dcb;
587 break; 593 break;
588 } 594 }
589 case 5: { 595 case 5: {
590 C = 11; 596 C = 11;
591 uint edcb = (bitval >> 1) & 0xF; 597 const uint edcb = (bitval >> 1) & 0xF;
592 B = (edcb << 5) | (edcb >> 2); 598 B = (edcb << 5) | (edcb >> 2);
593 break; 599 break;
594 } 600 }
595 case 6: { 601 case 6: {
596 C = 5; 602 C = 5;
597 uint fedcb = (bitval >> 1) & 0x1F; 603 const uint fedcb = (bitval >> 1) & 0x1F;
598 B = (fedcb << 4) | (fedcb >> 4); 604 B = (fedcb << 4) | (fedcb >> 4);
599 break; 605 break;
600 } 606 }
@@ -609,25 +615,25 @@ void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits) {
609 break; 615 break;
610 case 2: { 616 case 2: {
611 C = 54; 617 C = 54;
612 uint b = (bitval >> 1) & 1; 618 const uint b = (bitval >> 1) & 1;
613 B = (b << 8) | (b << 3) | (b << 2); 619 B = (b << 8) | (b << 3) | (b << 2);
614 break; 620 break;
615 } 621 }
616 case 3: { 622 case 3: {
617 C = 26; 623 C = 26;
618 uint cb = (bitval >> 1) & 3; 624 const uint cb = (bitval >> 1) & 3;
619 B = (cb << 7) | (cb << 1) | (cb >> 1); 625 B = (cb << 7) | (cb << 1) | (cb >> 1);
620 break; 626 break;
621 } 627 }
622 case 4: { 628 case 4: {
623 C = 13; 629 C = 13;
624 uint dcb = (bitval >> 1) & 7; 630 const uint dcb = (bitval >> 1) & 7;
625 B = (dcb << 6) | (dcb >> 1); 631 B = (dcb << 6) | (dcb >> 1);
626 break; 632 break;
627 } 633 }
628 case 5: { 634 case 5: {
629 C = 6; 635 C = 6;
630 uint edcb = (bitval >> 1) & 0xF; 636 const uint edcb = (bitval >> 1) & 0xF;
631 B = (edcb << 5) | (edcb >> 3); 637 B = (edcb << 5) | (edcb >> 3);
632 break; 638 break;
633 } 639 }
@@ -639,7 +645,8 @@ void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits) {
639 uint T = (D * C) + B; 645 uint T = (D * C) + B;
640 T ^= A; 646 T ^= A;
641 T = (A & 0x80) | (T >> 2); 647 T = (A & 0x80) | (T >> 2);
642 color_values[out_index++] = T; 648 color_values[out_index / 4][out_index % 4] = T;
649 ++out_index;
643 } 650 }
644 } 651 }
645} 652}
@@ -657,25 +664,30 @@ ivec2 BitTransferSigned(int a, int b) {
657} 664}
658 665
659uvec4 ClampByte(ivec4 color) { 666uvec4 ClampByte(ivec4 color) {
660 const uvec4 clamped = uvec4(clamp(color, 0, 255)); 667 for (uint i = 0; i < 4; ++i) {
661 return clamped; 668 color[i] = clamp(color[i], 0, 255);
669 }
670 return uvec4(color);
662} 671}
663 672
664ivec4 BlueContract(int a, int r, int g, int b) { 673ivec4 BlueContract(int a, int r, int g, int b) {
665 return ivec4(a, (r + b) >> 1, (g + b) >> 1, b); 674 return ivec4(a, (r + b) >> 1, (g + b) >> 1, b);
666} 675}
667 676
668void ComputeEndpoints(out uvec4 ep1, out uvec4 ep2, uint color_endpoint_mode) { 677void ComputeEndpoints(out uvec4 ep1, out uvec4 ep2, uint color_endpoint_mode,
678 in uvec4 color_values[8], inout uint colvals_index) {
669#define READ_UINT_VALUES(N) \ 679#define READ_UINT_VALUES(N) \
670 uint v[N]; \ 680 uint v[N]; \
671 for (uint i = 0; i < N; i++) { \ 681 for (uint i = 0; i < N; i++) { \
672 v[i] = color_values[colvals_index++]; \ 682 v[i] = color_values[colvals_index / 4][colvals_index % 4]; \
683 ++colvals_index; \
673 } 684 }
674 685
675#define READ_INT_VALUES(N) \ 686#define READ_INT_VALUES(N) \
676 int v[N]; \ 687 int v[N]; \
677 for (uint i = 0; i < N; i++) { \ 688 for (uint i = 0; i < N; i++) { \
678 v[i] = int(color_values[colvals_index++]); \ 689 v[i] = int(color_values[colvals_index / 4][colvals_index % 4]); \
690 ++colvals_index; \
679 } 691 }
680 692
681 switch (color_endpoint_mode) { 693 switch (color_endpoint_mode) {
@@ -687,8 +699,8 @@ void ComputeEndpoints(out uvec4 ep1, out uvec4 ep2, uint color_endpoint_mode) {
687 } 699 }
688 case 1: { 700 case 1: {
689 READ_UINT_VALUES(2) 701 READ_UINT_VALUES(2)
690 uint L0 = (v[0] >> 2) | (v[1] & 0xC0); 702 const uint L0 = (v[0] >> 2) | (v[1] & 0xC0);
691 uint L1 = min(L0 + (v[1] & 0x3F), 0xFFU); 703 const uint L1 = min(L0 + (v[1] & 0x3F), 0xFFU);
692 ep1 = uvec4(0xFF, L0, L0, L0); 704 ep1 = uvec4(0xFF, L0, L0, L0);
693 ep2 = uvec4(0xFF, L1, L1, L1); 705 ep2 = uvec4(0xFF, L1, L1, L1);
694 break; 706 break;
@@ -817,7 +829,7 @@ uint UnquantizeTexelWeight(EncodingData val) {
817 D = QuintTritValue(val); 829 D = QuintTritValue(val);
818 switch (bitlen) { 830 switch (bitlen) {
819 case 0: { 831 case 0: {
820 uint results[3] = {0, 32, 63}; 832 const uint results[3] = {0, 32, 63};
821 result = results[D]; 833 result = results[D];
822 break; 834 break;
823 } 835 }
@@ -827,13 +839,13 @@ uint UnquantizeTexelWeight(EncodingData val) {
827 } 839 }
828 case 2: { 840 case 2: {
829 C = 23; 841 C = 23;
830 uint b = (bitval >> 1) & 1; 842 const uint b = (bitval >> 1) & 1;
831 B = (b << 6) | (b << 2) | b; 843 B = (b << 6) | (b << 2) | b;
832 break; 844 break;
833 } 845 }
834 case 3: { 846 case 3: {
835 C = 11; 847 C = 11;
836 uint cb = (bitval >> 1) & 3; 848 const uint cb = (bitval >> 1) & 3;
837 B = (cb << 5) | cb; 849 B = (cb << 5) | cb;
838 break; 850 break;
839 } 851 }
@@ -846,7 +858,7 @@ uint UnquantizeTexelWeight(EncodingData val) {
846 D = QuintTritValue(val); 858 D = QuintTritValue(val);
847 switch (bitlen) { 859 switch (bitlen) {
848 case 0: { 860 case 0: {
849 uint results[5] = {0, 16, 32, 47, 63}; 861 const uint results[5] = {0, 16, 32, 47, 63};
850 result = results[D]; 862 result = results[D];
851 break; 863 break;
852 } 864 }
@@ -856,7 +868,7 @@ uint UnquantizeTexelWeight(EncodingData val) {
856 } 868 }
857 case 2: { 869 case 2: {
858 C = 13; 870 C = 13;
859 uint b = (bitval >> 1) & 1; 871 const uint b = (bitval >> 1) & 1;
860 B = (b << 6) | (b << 1); 872 B = (b << 6) | (b << 1);
861 break; 873 break;
862 } 874 }
@@ -875,15 +887,18 @@ uint UnquantizeTexelWeight(EncodingData val) {
875 return result; 887 return result;
876} 888}
877 889
878void UnquantizeTexelWeights(bool is_dual_plane, uvec2 size, out uint unquantized_texel_weights[2 * 144]) { 890void UnquantizeTexelWeights(uvec2 size, bool is_dual_plane,
891 out uvec4 unquantized_texel_weights[VECTOR_ARRAY_SIZE]) {
879 const uint Ds = uint((block_dims.x * 0.5f + 1024) / (block_dims.x - 1)); 892 const uint Ds = uint((block_dims.x * 0.5f + 1024) / (block_dims.x - 1));
880 const uint Dt = uint((block_dims.y * 0.5f + 1024) / (block_dims.y - 1)); 893 const uint Dt = uint((block_dims.y * 0.5f + 1024) / (block_dims.y - 1));
881 const uint num_planes = is_dual_plane ? 2 : 1; 894 const uint num_planes = is_dual_plane ? 2 : 1;
882 const uint area = size.x * size.y; 895 const uint area = size.x * size.y;
883 const uint loop_count = min(result_index, area * num_planes); 896 const uint loop_count = min(result_index, area * num_planes);
884 uint unquantized[2 * 144];
885 for (uint itr = 0; itr < loop_count; ++itr) { 897 for (uint itr = 0; itr < loop_count; ++itr) {
886 unquantized[itr] = UnquantizeTexelWeight(result_vector[itr]); 898 const uint array_index = itr / 4;
899 const uint vector_index = itr % 4;
900 result_vector[array_index][vector_index] =
901 UnquantizeTexelWeight(GetEncodingFromVector(itr));
887 } 902 }
888 for (uint plane = 0; plane < num_planes; ++plane) { 903 for (uint plane = 0; plane < num_planes; ++plane) {
889 for (uint t = 0; t < block_dims.y; t++) { 904 for (uint t = 0; t < block_dims.y; t++) {
@@ -907,28 +922,33 @@ void UnquantizeTexelWeights(bool is_dual_plane, uvec2 size, out uint unquantized
907 922
908#define VectorIndicesFromBase(offset_base) \ 923#define VectorIndicesFromBase(offset_base) \
909 const uint offset = is_dual_plane ? 2 * offset_base + plane : offset_base; \ 924 const uint offset = is_dual_plane ? 2 * offset_base + plane : offset_base; \
925 const uint array_index = offset / 4; \
926 const uint vector_index = offset % 4;
910 927
911 if (v0 < area) { 928 if (v0 < area) {
912 const uint offset_base = v0; 929 const uint offset_base = v0;
913 VectorIndicesFromBase(offset_base); 930 VectorIndicesFromBase(offset_base);
914 p.x = unquantized[offset]; 931 p.x = result_vector[array_index][vector_index];
915 } 932 }
916 if ((v0 + 1) < (area)) { 933 if ((v0 + 1) < (area)) {
917 const uint offset_base = v0 + 1; 934 const uint offset_base = v0 + 1;
918 VectorIndicesFromBase(offset_base); 935 VectorIndicesFromBase(offset_base);
919 p.y = unquantized[offset]; 936 p.y = result_vector[array_index][vector_index];
920 } 937 }
921 if ((v0 + size.x) < (area)) { 938 if ((v0 + size.x) < (area)) {
922 const uint offset_base = v0 + size.x; 939 const uint offset_base = v0 + size.x;
923 VectorIndicesFromBase(offset_base); 940 VectorIndicesFromBase(offset_base);
924 p.z = unquantized[offset]; 941 p.z = result_vector[array_index][vector_index];
925 } 942 }
926 if ((v0 + size.x + 1) < (area)) { 943 if ((v0 + size.x + 1) < (area)) {
927 const uint offset_base = v0 + size.x + 1; 944 const uint offset_base = v0 + size.x + 1;
928 VectorIndicesFromBase(offset_base); 945 VectorIndicesFromBase(offset_base);
929 p.w = unquantized[offset]; 946 p.w = result_vector[array_index][vector_index];
930 } 947 }
931 unquantized_texel_weights[plane * 144 + t * block_dims.x + s] = (uint(dot(p, w)) + 8) >> 4; 948 const uint offset = (t * block_dims.x + s) + ARRAY_NUM_ELEMENTS * plane;
949 const uint array_index = offset / 4;
950 const uint vector_index = offset % 4;
951 unquantized_texel_weights[array_index][vector_index] = (uint(dot(p, w)) + 8) >> 4;
932 } 952 }
933 } 953 }
934 } 954 }
@@ -1050,6 +1070,7 @@ TexelWeightParams DecodeBlockInfo() {
1050 weight_index += 6; 1070 weight_index += 6;
1051 } 1071 }
1052 params.max_weight = weight_index + 1; 1072 params.max_weight = weight_index + 1;
1073
1053 return params; 1074 return params;
1054} 1075}
1055 1076
@@ -1079,7 +1100,7 @@ void FillVoidExtentLDR(ivec3 coord) {
1079} 1100}
1080 1101
1081void DecompressBlock(ivec3 coord) { 1102void DecompressBlock(ivec3 coord) {
1082 TexelWeightParams params = DecodeBlockInfo(); 1103 const TexelWeightParams params = DecodeBlockInfo();
1083 if (params.error_state) { 1104 if (params.error_state) {
1084 FillError(coord); 1105 FillError(coord);
1085 return; 1106 return;
@@ -1096,12 +1117,11 @@ void DecompressBlock(ivec3 coord) {
1096 FillError(coord); 1117 FillError(coord);
1097 return; 1118 return;
1098 } 1119 }
1099 uint num_partitions = StreamBits(2) + 1; 1120 const uint num_partitions = StreamBits(2) + 1;
1100 if (num_partitions > 4 || (num_partitions == 4 && params.dual_plane)) { 1121 if (num_partitions > 4 || (num_partitions == 4 && params.dual_plane)) {
1101 FillError(coord); 1122 FillError(coord);
1102 return; 1123 return;
1103 } 1124 }
1104 int plane_index = -1;
1105 uint partition_index = 1; 1125 uint partition_index = 1;
1106 uvec4 color_endpoint_mode = uvec4(0); 1126 uvec4 color_endpoint_mode = uvec4(0);
1107 uint ced_pointer = 0; 1127 uint ced_pointer = 0;
@@ -1113,8 +1133,8 @@ void DecompressBlock(ivec3 coord) {
1113 partition_index = StreamBits(10); 1133 partition_index = StreamBits(10);
1114 base_cem = StreamBits(6); 1134 base_cem = StreamBits(6);
1115 } 1135 }
1116 uint base_mode = base_cem & 3; 1136 const uint base_mode = base_cem & 3;
1117 uint weight_bits = GetPackedBitSize(params.size, params.dual_plane, params.max_weight); 1137 const uint weight_bits = GetPackedBitSize(params.size, params.dual_plane, params.max_weight);
1118 uint remaining_bits = 128 - weight_bits - total_bitsread; 1138 uint remaining_bits = 128 - weight_bits - total_bitsread;
1119 uint extra_cem_bits = 0; 1139 uint extra_cem_bits = 0;
1120 if (base_mode > 0) { 1140 if (base_mode > 0) {
@@ -1133,10 +1153,7 @@ void DecompressBlock(ivec3 coord) {
1133 } 1153 }
1134 } 1154 }
1135 remaining_bits -= extra_cem_bits; 1155 remaining_bits -= extra_cem_bits;
1136 uint plane_selector_bits = 0; 1156 const uint plane_selector_bits = params.dual_plane ? 2 : 0;
1137 if (params.dual_plane) {
1138 plane_selector_bits = 2;
1139 }
1140 remaining_bits -= plane_selector_bits; 1157 remaining_bits -= plane_selector_bits;
1141 if (remaining_bits > 128) { 1158 if (remaining_bits > 128) {
1142 // Bad data, more remaining bits than 4 bytes 1159 // Bad data, more remaining bits than 4 bytes
@@ -1144,17 +1161,17 @@ void DecompressBlock(ivec3 coord) {
1144 return; 1161 return;
1145 } 1162 }
1146 // Read color data... 1163 // Read color data...
1147 uint color_data_bits = remaining_bits; 1164 const uint color_data_bits = remaining_bits;
1148 while (remaining_bits > 0) { 1165 while (remaining_bits > 0) {
1149 int nb = int(min(remaining_bits, 32U)); 1166 const int nb = int(min(remaining_bits, 32U));
1150 uint b = StreamBits(nb); 1167 const uint b = StreamBits(nb);
1151 color_endpoint_data[ced_pointer] = uint(bitfieldExtract(b, 0, nb)); 1168 color_endpoint_data[ced_pointer] = uint(bitfieldExtract(b, 0, nb));
1152 ++ced_pointer; 1169 ++ced_pointer;
1153 remaining_bits -= nb; 1170 remaining_bits -= nb;
1154 } 1171 }
1155 plane_index = int(StreamBits(plane_selector_bits)); 1172 const uint plane_index = uint(StreamBits(plane_selector_bits));
1156 if (base_mode > 0) { 1173 if (base_mode > 0) {
1157 uint extra_cem = StreamBits(extra_cem_bits); 1174 const uint extra_cem = StreamBits(extra_cem_bits);
1158 uint cem = (extra_cem << 6) | base_cem; 1175 uint cem = (extra_cem << 6) | base_cem;
1159 cem >>= 2; 1176 cem >>= 2;
1160 uvec4 C = uvec4(0); 1177 uvec4 C = uvec4(0);
@@ -1176,43 +1193,54 @@ void DecompressBlock(ivec3 coord) {
1176 color_endpoint_mode[i] |= M[i]; 1193 color_endpoint_mode[i] |= M[i];
1177 } 1194 }
1178 } else if (num_partitions > 1) { 1195 } else if (num_partitions > 1) {
1179 uint cem = base_cem >> 2; 1196 const uint cem = base_cem >> 2;
1180 for (uint i = 0; i < num_partitions; i++) { 1197 for (uint i = 0; i < num_partitions; i++) {
1181 color_endpoint_mode[i] = cem; 1198 color_endpoint_mode[i] = cem;
1182 } 1199 }
1183 } 1200 }
1184 DecodeColorValues(color_endpoint_mode, num_partitions, color_data_bits);
1185 1201
1186 uvec4 endpoints[4][2]; 1202 uvec4 endpoints0[4];
1187 for (uint i = 0; i < num_partitions; i++) { 1203 uvec4 endpoints1[4];
1188 ComputeEndpoints(endpoints[i][0], endpoints[i][1], color_endpoint_mode[i]); 1204 {
1189 } 1205 // This decode phase should at most push 32 elements into the vector
1206 result_vector_max_index = 32;
1190 1207
1208 uvec4 color_values[8];
1209 uint colvals_index = 0;
1210 DecodeColorValues(color_endpoint_mode, num_partitions, color_data_bits, color_values);
1211 for (uint i = 0; i < num_partitions; i++) {
1212 ComputeEndpoints(endpoints0[i], endpoints1[i], color_endpoint_mode[i], color_values,
1213 colvals_index);
1214 }
1215 }
1191 color_endpoint_data = local_buff; 1216 color_endpoint_data = local_buff;
1192 color_endpoint_data = bitfieldReverse(color_endpoint_data).wzyx; 1217 color_endpoint_data = bitfieldReverse(color_endpoint_data).wzyx;
1193 uint clear_byte_start = 1218 const uint clear_byte_start = (weight_bits >> 3) + 1;
1194 (GetPackedBitSize(params.size, params.dual_plane, params.max_weight) >> 3) + 1; 1219
1195 1220 const uint byte_insert = ExtractBits(color_endpoint_data, int(clear_byte_start - 1) * 8, 8) &
1196 uint byte_insert = ExtractBits(color_endpoint_data, int(clear_byte_start - 1) * 8, 8) & 1221 uint(((1 << (weight_bits % 8)) - 1));
1197 uint( 1222 const uint vec_index = (clear_byte_start - 1) >> 2;
1198 ((1 << (GetPackedBitSize(params.size, params.dual_plane, params.max_weight) % 8)) - 1)); 1223 color_endpoint_data[vec_index] = bitfieldInsert(color_endpoint_data[vec_index], byte_insert,
1199 uint vec_index = (clear_byte_start - 1) >> 2; 1224 int((clear_byte_start - 1) % 4) * 8, 8);
1200 color_endpoint_data[vec_index] =
1201 bitfieldInsert(color_endpoint_data[vec_index], byte_insert, int((clear_byte_start - 1) % 4) * 8, 8);
1202 for (uint i = clear_byte_start; i < 16; ++i) { 1225 for (uint i = clear_byte_start; i < 16; ++i) {
1203 uint idx = i >> 2; 1226 const uint idx = i >> 2;
1204 color_endpoint_data[idx] = bitfieldInsert(color_endpoint_data[idx], 0, int(i % 4) * 8, 8); 1227 color_endpoint_data[idx] = bitfieldInsert(color_endpoint_data[idx], 0, int(i % 4) * 8, 8);
1205 } 1228 }
1206 1229
1207 // Re-init vector variables for next decode phase 1230 // Re-init vector variables for next decode phase
1208 result_index = 0; 1231 result_index = 0;
1209 color_bitsread = 0; 1232 color_bitsread = 0;
1233 result_limit_reached = false;
1210 1234
1235 // The limit for the Unquantize phase, avoids decoding more data than needed.
1236 result_vector_max_index = params.size.x * params.size.y;
1237 if (params.dual_plane) {
1238 result_vector_max_index *= 2;
1239 }
1211 DecodeIntegerSequence(params.max_weight, GetNumWeightValues(params.size, params.dual_plane)); 1240 DecodeIntegerSequence(params.max_weight, GetNumWeightValues(params.size, params.dual_plane));
1212 1241
1213 uint unquantized_texel_weights[2 * 144]; 1242 uvec4 unquantized_texel_weights[VECTOR_ARRAY_SIZE];
1214 UnquantizeTexelWeights(params.dual_plane, params.size, unquantized_texel_weights); 1243 UnquantizeTexelWeights(params.size, params.dual_plane, unquantized_texel_weights);
1215
1216 for (uint j = 0; j < block_dims.y; j++) { 1244 for (uint j = 0; j < block_dims.y; j++) {
1217 for (uint i = 0; i < block_dims.x; i++) { 1245 for (uint i = 0; i < block_dims.x; i++) {
1218 uint local_partition = 0; 1246 uint local_partition = 0;
@@ -1220,13 +1248,19 @@ void DecompressBlock(ivec3 coord) {
1220 local_partition = Select2DPartition(partition_index, i, j, num_partitions, 1248 local_partition = Select2DPartition(partition_index, i, j, num_partitions,
1221 (block_dims.y * block_dims.x) < 32); 1249 (block_dims.y * block_dims.x) < 32);
1222 } 1250 }
1223 const uvec4 C0 = ReplicateByteTo16(endpoints[local_partition][0]); 1251 const uvec4 C0 = ReplicateByteTo16(endpoints0[local_partition]);
1224 const uvec4 C1 = ReplicateByteTo16(endpoints[local_partition][1]); 1252 const uvec4 C1 = ReplicateByteTo16(endpoints1[local_partition]);
1225 const uint weight_offset = (j * block_dims.x + i); 1253 const uint weight_offset = (j * block_dims.x + i);
1226 const uint primary_weight = unquantized_texel_weights[weight_offset]; 1254 const uint array_index = weight_offset / 4;
1255 const uint vector_index = weight_offset % 4;
1256 const uint primary_weight = unquantized_texel_weights[array_index][vector_index];
1227 uvec4 weight_vec = uvec4(primary_weight); 1257 uvec4 weight_vec = uvec4(primary_weight);
1228 if (params.dual_plane) { 1258 if (params.dual_plane) {
1229 const uint secondary_weight = unquantized_texel_weights[weight_offset + 144]; 1259 const uint secondary_weight_offset = (j * block_dims.x + i) + ARRAY_NUM_ELEMENTS;
1260 const uint secondary_array_index = secondary_weight_offset / 4;
1261 const uint secondary_vector_index = secondary_weight_offset % 4;
1262 const uint secondary_weight =
1263 unquantized_texel_weights[secondary_array_index][secondary_vector_index];
1230 for (uint c = 0; c < 4; c++) { 1264 for (uint c = 0; c < 4; c++) {
1231 const bool is_secondary = ((plane_index + 1u) & 3u) == c; 1265 const bool is_secondary = ((plane_index + 1u) & 3u) == c;
1232 weight_vec[c] = is_secondary ? secondary_weight : primary_weight; 1266 weight_vec[c] = is_secondary ? secondary_weight : primary_weight;
@@ -1240,12 +1274,11 @@ void DecompressBlock(ivec3 coord) {
1240 } 1274 }
1241} 1275}
1242 1276
1243
1244uint SwizzleOffset(uvec2 pos) { 1277uint SwizzleOffset(uvec2 pos) {
1245 uint x = pos.x; 1278 const uint x = pos.x;
1246 uint y = pos.y; 1279 const uint y = pos.y;
1247 return ((x % 64) / 32) * 256 + ((y % 8) / 2) * 64 + ((x % 32) / 16) * 32 + 1280 return ((x % 64) / 32) * 256 + ((y % 8) / 2) * 64 +
1248 (y % 2) * 16 + (x % 16); 1281 ((x % 32) / 16) * 32 + (y % 2) * 16 + (x % 16);
1249} 1282}
1250 1283
1251void main() { 1284void main() {