summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorGravatar ameerj2021-02-13 15:50:12 -0500
committerGravatar ameerj2021-03-13 12:16:03 -0500
commit2985e5e94c82febcf215feb0023f4184b38bb24a (patch)
tree7b7cd8be3605560707a74a74c281577920a24248 /src
parentMerge pull request #6053 from Morph1984/time-CalculateSpanBetween (diff)
downloadyuzu-2985e5e94c82febcf215feb0023f4184b38bb24a.tar.gz
yuzu-2985e5e94c82febcf215feb0023f4184b38bb24a.tar.xz
yuzu-2985e5e94c82febcf215feb0023f4184b38bb24a.zip
renderer_opengl: Accelerate ASTC texture decoding with a compute shader
ASTC texture decoding is currently handled by a CPU decoder for GPUs without native ASTC decoding support (most desktop GPUs). This is the cause of noticeable performance degradation in titles which use the format extensively. This commit adds support for accelerating ASTC decoding with a compute shader on OpenGL for GPUs without native support.
Diffstat (limited to 'src')
-rw-r--r--src/video_core/host_shaders/astc_decoder.comp1288
-rw-r--r--src/video_core/renderer_opengl/gl_texture_cache.cpp10
-rw-r--r--src/video_core/renderer_opengl/gl_texture_cache.h2
-rw-r--r--src/video_core/renderer_opengl/util_shaders.cpp99
-rw-r--r--src/video_core/renderer_opengl/util_shaders.h11
-rw-r--r--src/video_core/textures/astc.h190
6 files changed, 1598 insertions, 2 deletions
diff --git a/src/video_core/host_shaders/astc_decoder.comp b/src/video_core/host_shaders/astc_decoder.comp
new file mode 100644
index 000000000..070190a5c
--- /dev/null
+++ b/src/video_core/host_shaders/astc_decoder.comp
@@ -0,0 +1,1288 @@
1// Copyright 2021 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#version 450
6
// Back-end specific helpers for declaring the uniform inputs, plus the binding index of
// every resource the shader uses.
//
// VULKAN:  the uniforms live in a push_constant block, so UNIFORM(n) expands to nothing.
// OpenGL:  every uniform is standalone with an explicit location.
#ifdef VULKAN

#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
#define END_PUSH_CONSTANTS };
#define UNIFORM(n)
#define BINDING_SWIZZLE_BUFFER 0
#define BINDING_INPUT_BUFFER 1
#define BINDING_ENC_BUFFER 2
#define BINDING_6_TO_8_BUFFER 3
#define BINDING_7_TO_8_BUFFER 4
#define BINDING_8_TO_8_BUFFER 5
#define BINDING_BYTE_TO_16_BUFFER 6
// The image is declared without a separate set, so it shares the binding numbering of the
// buffers above and needs an index none of them uses (it was 3, clashing with
// BINDING_6_TO_8_BUFFER).
#define BINDING_OUTPUT_IMAGE 7

#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv

#define BEGIN_PUSH_CONSTANTS
#define END_PUSH_CONSTANTS
#define UNIFORM(n) layout(location = n) uniform
#define BINDING_SWIZZLE_BUFFER 0
#define BINDING_INPUT_BUFFER 1
#define BINDING_ENC_BUFFER 2
#define BINDING_6_TO_8_BUFFER 3
#define BINDING_7_TO_8_BUFFER 4
#define BINDING_8_TO_8_BUFFER 5
#define BINDING_BYTE_TO_16_BUFFER 6
// Image units are numbered independently of buffer bindings in OpenGL, so 0 is free here.
#define BINDING_OUTPUT_IMAGE 0

#endif
36
37layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in;
38
// Per-dispatch parameters. In the VULKAN build BEGIN_PUSH_CONSTANTS/END_PUSH_CONSTANTS wrap
// these in a push_constant block and UNIFORM(n) expands to nothing; in the OpenGL build each
// one is a standalone uniform at explicit location n.
39BEGIN_PUSH_CONSTANTS
40UNIFORM(0) uvec2 num_image_blocks;
// Texel footprint of one block; the fill and weight-infill loops iterate over it.
41UNIFORM(1) uvec2 block_dims;
42UNIFORM(2) uint layer;
43
// NOTE(review): the fields below look like block-linear swizzle parameters, but nothing in
// this part of the file reads them - confirm their meaning against the entry point.
44UNIFORM(3) uvec3 origin;
45UNIFORM(4) ivec3 destination;
46UNIFORM(5) uint bytes_per_block_log2;
47UNIFORM(6) uint layer_stride;
48UNIFORM(7) uint block_size;
49UNIFORM(8) uint x_shift;
50UNIFORM(9) uint block_height;
51UNIFORM(10) uint block_height_mask;
52
53END_PUSH_CONSTANTS
54
// State of the block bit reader (ReadBit/StreamBits).
// local_buff is consumed 8 bits per element: current_index selects the element, bitsread the
// bit inside it, and total_bitsread counts every bit consumed so far.
55uint current_index = 0;
56int bitsread = 0;
57uint total_bitsread = 0;
58uint local_buff[16];
59
// Values of EncodingData.encoding.
60const int JustBits = 0;
61const int Quint = 1;
62const int Trit = 2;
63
// One integer-sequence value. The same struct describes an entry of the encoding table
// (encoding_values) and each value decoded from the bit stream.
64struct EncodingData {
// Encoding kind: JustBits, Quint or Trit.
65 uint encoding;
// Number of plain bits stored per value.
66 uint num_bits;
// The plain bits read for this value.
67 uint bit_value;
// The decoded quint or trit digit; not written by the JustBits path.
68 uint quint_trit_value;
69};
70
// Result of decoding the block-mode bits (see DecodeBlockInfo).
71struct TexelWeightParams {
// Weight grid dimensions; size.x * size.y weights are stored per plane.
72 uvec2 size;
// When set, twice as many weights are stored.
73 bool dual_plane;
// Index into encoding_values selecting how the weights are encoded.
74 uint max_weight;
// Set for block modes the decoder rejects.
75 bool Error;
// Set when the mode bits match the void-extent pattern (mode & 0x1ff) == 0x1fc.
76 bool VoidExtentLDR;
77 bool VoidExtentHDR;
78};
79
80// Swizzle data
// Lookup table indexed by (y * 64 + x) inside one GOB; see SwizzleOffset.
81layout(binding = BINDING_SWIZZLE_BUFFER, std430) readonly buffer SwizzleTable {
82 uint swizzle_table[];
83};
84
// Compressed input, stored as 32-bit words; ReadTexel extracts individual bytes from it.
85layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU32 {
86 uint astc_data[];
87};
// Destination image the decoded texels are written to with imageStore.
88layout(binding = BINDING_OUTPUT_IMAGE) uniform writeonly image2D dest_image;
89
// GOB dimensions and the matching log2 shifts (64 = 1 << 6, 8 = 1 << 3, 1 = 1 << 0).
90const uint GOB_SIZE_X = 64;
91const uint GOB_SIZE_Y = 8;
92const uint GOB_SIZE_Z = 1;
93const uint GOB_SIZE = GOB_SIZE_X * GOB_SIZE_Y * GOB_SIZE_Z;
94
95const uint GOB_SIZE_X_SHIFT = 6;
96const uint GOB_SIZE_Y_SHIFT = 3;
97const uint GOB_SIZE_Z_SHIFT = 0;
98const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_SHIFT;
99
// Mask that wraps an (x, y) position into a single GOB.
100const uvec2 SWIZZLE_MASK = uvec2(GOB_SIZE_X - 1, GOB_SIZE_Y - 1);
101
// Returns the swizzle-table entry for a position, after wrapping the position into one GOB.
uint SwizzleOffset(uvec2 pos) {
    const uvec2 gob_pos = pos & SWIZZLE_MASK;
    // The table is laid out row by row with 64 entries per row.
    const uint table_index = gob_pos.y * 64 + gob_pos.x;
    return swizzle_table[table_index];
}
106
// Reads the byte at byte offset `offset` from the 32-bit packed input buffer.
uint ReadTexel(uint offset) {
    const uint packed_word = astc_data[offset / 4];
    // (offset * 8) & 24 is the bit position of the byte inside its word: 0, 8, 16 or 24.
    const int bit_position = int((offset * 8) & 24);
    return bitfieldExtract(packed_word, bit_position, 8);
}
111
112// ASTC Encodings data
// Table indexed by range value; each entry gives the encoding kind and bit count to use.
113layout(binding = BINDING_ENC_BUFFER, std430) readonly buffer EncodingsValues {
114 EncodingData encoding_values[256];
115};
116// ASTC Precompiled tables
// Lookup tables used by FastReplicateTo8 (6/7/8-bit inputs) and ReplicateByteTo16.
117layout(binding = BINDING_6_TO_8_BUFFER, std430) readonly buffer REPLICATE_6_BIT_TO_8 {
118 uint REPLICATE_6_BIT_TO_8_TABLE[];
119};
120layout(binding = BINDING_7_TO_8_BUFFER, std430) readonly buffer REPLICATE_7_BIT_TO_8 {
121 uint REPLICATE_7_BIT_TO_8_TABLE[];
122};
123layout(binding = BINDING_8_TO_8_BUFFER, std430) readonly buffer REPLICATE_8_BIT_TO_8 {
124 uint REPLICATE_8_BIT_TO_8_TABLE[];
125};
126layout(binding = BINDING_BYTE_TO_16_BUFFER, std430) readonly buffer REPLICATE_BYTE_TO_16 {
127 uint REPLICATE_BYTE_TO_16_TABLE[];
128};
129
// Size of one compressed block: 16 bytes, i.e. the 128 bits the block decoder budgets for.
130const int BLOCK_SIZE_IN_BYTES = 16;
131
// NOTE(review): these block classifications are not referenced in this part of the file;
// the decoder reports the same states through the TexelWeightParams flags - confirm usage.
132const int BLOCK_INFO_ERROR = 0;
133const int BLOCK_INFO_VOID_EXTENT_HDR = 1;
134const int BLOCK_INFO_VOID_EXTENT_LDR = 2;
135const int BLOCK_INFO_NORMAL = 3;
136
// Widens the low `num_bits` bits of `val` to `to_bit` bits by repeating the bit pattern from
// the most significant end downwards; the final repetition is truncated to its leading bits
// when the pattern does not divide the target width evenly.
// Returns 0 when either width is 0.
uint Replicate(uint val, uint num_bits, uint to_bit) {
    if (num_bits == 0 || to_bit == 0) {
        return 0;
    }
    const uint pattern = val & uint((1 << num_bits) - 1);
    uint result = pattern;
    for (uint filled = num_bits; filled < to_bit; filled += num_bits) {
        const uint room = to_bit - filled;
        uint dropped = 0;
        if (num_bits > room) {
            // Last, partial copy: only the top `room` bits of the pattern still fit.
            dropped = num_bits - room;
            num_bits = room;
        }
        result = uint(result << num_bits) | (pattern >> dropped);
    }
    return result;
}
162
// Maps each of the four components through the byte-to-16-bit replication table.
uvec4 ReplicateByteTo16(uvec4 value) {
    uvec4 widened;
    for (uint channel = 0; channel < 4; channel++) {
        widened[channel] = REPLICATE_BYTE_TO_16_TABLE[value[channel]];
    }
    return widened;
}
167
// Lookup for widening one bit to seven identical bits.
const uint REPLICATE_BIT_TO_7_TABLE[2] = uint[](0, 127);

// Returns 0 for value 0 and 127 for value 1.
uint ReplicateBitTo7(uint value) {
    const uint widened = REPLICATE_BIT_TO_7_TABLE[value];
    return widened;
}
173
// Lookup for widening one bit to nine identical bits.
const uint REPLICATE_1_BIT_TO_9_TABLE[2] = uint[](0, 511);

// Returns 0 for value 0 and 511 for value 1.
uint ReplicateBitTo9(uint value) {
    const uint widened = REPLICATE_1_BIT_TO_9_TABLE[value];
    return widened;
}
178
// Precomputed n-bit -> 8-bit replication results for n = 1..5.
// The 6, 7 and 8 bit tables are supplied through storage buffers.
const uint REPLICATE_1_BIT_TO_8_TABLE[2] = uint[](0, 255);
const uint REPLICATE_2_BIT_TO_8_TABLE[4] = uint[](0, 85, 170, 255);
const uint REPLICATE_3_BIT_TO_8_TABLE[8] = uint[](0, 36, 73, 109, 146, 182, 219, 255);
const uint REPLICATE_4_BIT_TO_8_TABLE[16] =
    uint[](0, 17, 34, 51, 68, 85, 102, 119, 136, 153, 170, 187, 204, 221, 238, 255);
const uint REPLICATE_5_BIT_TO_8_TABLE[32] =
    uint[](0, 8, 16, 24, 33, 41, 49, 57, 66, 74, 82, 90, 99, 107, 115, 123, 132, 140, 148, 156, 165,
           173, 181, 189, 198, 206, 214, 222, 231, 239, 247, 255);

// Widens a `num_bits`-wide value to 8 bits, using a lookup table for widths 1..8 and the
// generic Replicate() for any other width.
uint FastReplicateTo8(uint value, uint num_bits) {
    uint widened;
    switch (num_bits) {
    case 1:
        widened = REPLICATE_1_BIT_TO_8_TABLE[value];
        break;
    case 2:
        widened = REPLICATE_2_BIT_TO_8_TABLE[value];
        break;
    case 3:
        widened = REPLICATE_3_BIT_TO_8_TABLE[value];
        break;
    case 4:
        widened = REPLICATE_4_BIT_TO_8_TABLE[value];
        break;
    case 5:
        widened = REPLICATE_5_BIT_TO_8_TABLE[value];
        break;
    case 6:
        widened = REPLICATE_6_BIT_TO_8_TABLE[value];
        break;
    case 7:
        widened = REPLICATE_7_BIT_TO_8_TABLE[value];
        break;
    case 8:
        widened = REPLICATE_8_BIT_TO_8_TABLE[value];
        break;
    default:
        widened = Replicate(value, num_bits, 8);
        break;
    }
    return widened;
}
209
// Precomputed n-bit -> 6-bit replication results for n = 1..5.
const uint REPLICATE_1_BIT_TO_6_TABLE[2] = uint[](0, 63);
const uint REPLICATE_2_BIT_TO_6_TABLE[4] = uint[](0, 21, 42, 63);
const uint REPLICATE_3_BIT_TO_6_TABLE[8] = uint[](0, 9, 18, 27, 36, 45, 54, 63);
const uint REPLICATE_4_BIT_TO_6_TABLE[16] =
    uint[](0, 4, 8, 12, 17, 21, 25, 29, 34, 38, 42, 46, 51, 55, 59, 63);
const uint REPLICATE_5_BIT_TO_6_TABLE[32] =
    uint[](0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 33, 35, 37, 39, 41, 43, 45,
           47, 49, 51, 53, 55, 57, 59, 61, 63);

// Widens a `num_bits`-wide value to 6 bits, using a lookup table for widths 1..5 and the
// generic Replicate() for any other width.
uint FastReplicateTo6(uint value, uint num_bits) {
    uint widened;
    switch (num_bits) {
    case 1:
        widened = REPLICATE_1_BIT_TO_6_TABLE[value];
        break;
    case 2:
        widened = REPLICATE_2_BIT_TO_6_TABLE[value];
        break;
    case 3:
        widened = REPLICATE_3_BIT_TO_6_TABLE[value];
        break;
    case 4:
        widened = REPLICATE_4_BIT_TO_6_TABLE[value];
        break;
    case 5:
        widened = REPLICATE_5_BIT_TO_6_TABLE[value];
        break;
    default:
        widened = Replicate(value, num_bits, 6);
        break;
    }
    return widened;
}
234
// 32-bit integer mixing function used to derive the partition seeds in SelectPartition.
// The sequence of shifts and operations is fixed; every step wraps modulo 2^32.
uint hash52(uint p) {
    p = p ^ (p >> 15);
    p = p - (p << 17);
    p = p + (p << 7);
    p = p + (p << 4);
    p = p ^ (p >> 5);
    p = p + (p << 16);
    p = p ^ (p >> 7);
    p = p ^ (p >> 3);
    p = p ^ (p << 6);
    p = p ^ (p >> 17);
    return p;
}
248
// Returns the partition index (0..3) assigned to texel (x, y, z) for the given partition seed.
// Always 0 when partition_count is 1. With small_block set the coordinates are doubled first.
249uint SelectPartition(uint seed, uint x, uint y, uint z, uint partition_count, bool small_block) {
250 if (1 == partition_count)
251 return 0;
252
253 if (small_block) {
254 x <<= 1;
255 y <<= 1;
256 z <<= 1;
257 }
258
// Fold the partition count into the seed so each count yields a different pattern.
259 seed += (partition_count - 1) * 1024;
260
// Twelve 4-bit fields of the hashed seed become the per-axis coefficients.
261 uint rnum = hash52(uint(seed));
262 uint seed1 = uint(rnum & 0xF);
263 uint seed2 = uint((rnum >> 4) & 0xF);
264 uint seed3 = uint((rnum >> 8) & 0xF);
265 uint seed4 = uint((rnum >> 12) & 0xF);
266 uint seed5 = uint((rnum >> 16) & 0xF);
267 uint seed6 = uint((rnum >> 20) & 0xF);
268 uint seed7 = uint((rnum >> 24) & 0xF);
269 uint seed8 = uint((rnum >> 28) & 0xF);
270 uint seed9 = uint((rnum >> 18) & 0xF);
271 uint seed10 = uint((rnum >> 22) & 0xF);
272 uint seed11 = uint((rnum >> 26) & 0xF);
273 uint seed12 = uint(((rnum >> 30) | (rnum << 2)) & 0xF);
274
// Square every coefficient.
275 seed1 = (seed1 * seed1);
276 seed2 = (seed2 * seed2);
277 seed3 = (seed3 * seed3);
278 seed4 = (seed4 * seed4);
279 seed5 = (seed5 * seed5);
280 seed6 = (seed6 * seed6);
281 seed7 = (seed7 * seed7);
282 seed8 = (seed8 * seed8);
283 seed9 = (seed9 * seed9);
284 seed10 = (seed10 * seed10);
285 seed11 = (seed11 * seed11);
286 seed12 = (seed12 * seed12);
287
// Shift amounts depend on the low seed bits and on whether there are three partitions.
288 int sh1, sh2, sh3;
289 if ((seed & 1) > 0) {
290 sh1 = (seed & 2) > 0 ? 4 : 5;
291 sh2 = (partition_count == 3) ? 6 : 5;
292 } else {
293 sh1 = (partition_count == 3) ? 6 : 5;
294 sh2 = (seed & 2) > 0 ? 4 : 5;
295 }
296 sh3 = (seed & 0x10) > 0 ? sh1 : sh2;
297
298 seed1 = (seed1 >> sh1);
299 seed2 = (seed2 >> sh2);
300 seed3 = (seed3 >> sh1);
301 seed4 = (seed4 >> sh2);
302 seed5 = (seed5 >> sh1);
303 seed6 = (seed6 >> sh2);
304 seed7 = (seed7 >> sh1);
305 seed8 = (seed8 >> sh2);
306 seed9 = (seed9 >> sh3);
307 seed10 = (seed10 >> sh3);
308 seed11 = (seed11 >> sh3);
309 seed12 = (seed12 >> sh3);
310
// One 6-bit score per candidate partition; the highest score wins.
311 uint a = seed1 * x + seed2 * y + seed11 * z + (rnum >> 14);
312 uint b = seed3 * x + seed4 * y + seed12 * z + (rnum >> 10);
313 uint c = seed5 * x + seed6 * y + seed9 * z + (rnum >> 6);
314 uint d = seed7 * x + seed8 * y + seed10 * z + (rnum >> 2);
315
316 a &= 0x3F;
317 b &= 0x3F;
318 c &= 0x3F;
319 d &= 0x3F;
320
// Scores of partitions that do not exist are zeroed.
321 if (partition_count < 4)
322 d = 0;
323 if (partition_count < 3)
324 c = 0;
325
// Ties resolve to the lowest partition index.
326 if (a >= b && a >= c && a >= d)
327 return 0;
328 else if (b >= c && b >= d)
329 return 1;
330 else if (c >= d)
331 return 2;
332 return 3;
333}
334
// 2D convenience wrapper around SelectPartition: the z coordinate is fixed to 0.
uint Select2DPartition(uint seed, uint x, uint y, uint partition_count, bool small_block) {
    const uint z = 0;
    return SelectPartition(seed, x, y, z, partition_count, small_block);
}
338
// Returns the next bit of local_buff and advances the reader.
// Each element of local_buff contributes 8 bits, least significant bit first.
uint ReadBit() {
    const uint next_bit = bitfieldExtract(local_buff[current_index], bitsread, 1);
    total_bitsread++;
    if (++bitsread == 8) {
        // Element exhausted, continue with the next one.
        bitsread = 0;
        current_index++;
    }
    return next_bit;
}
349
// Reads `num_bits` bits from the block stream; the first bit read becomes bit 0 of the result.
uint StreamBits(uint num_bits) {
    uint value = 0;
    uint position = 0;
    while (position < num_bits) {
        value |= (ReadBit() & 1) << position;
        position++;
    }
    return value;
}
357
358// Define color data.
359uint color_endpoint_data[16];
360int color_bitsread = 0;
361uint total_color_bitsread = 0;
362int color_index = 0;
363
364// Define texel weight data.
365uint texel_weight_data[16];
366int texel_bitsread = 0;
367uint total_texel_bitsread = 0;
368int texel_index = 0;
369
370bool texel_flag = false;
371
// Returns the next bit of the currently selected secondary stream and advances that stream:
// texel_weight_data while texel_flag is set, color_endpoint_data otherwise.
// Both streams hold 8 bits per array element, least significant bit first.
uint ReadColorBit() {
    if (!texel_flag) {
        const uint color_bit =
            bitfieldExtract(color_endpoint_data[color_index], color_bitsread, 1);
        total_color_bitsread++;
        if (++color_bitsread == 8) {
            color_bitsread = 0;
            color_index++;
        }
        return color_bit;
    }
    const uint weight_bit = bitfieldExtract(texel_weight_data[texel_index], texel_bitsread, 1);
    total_texel_bitsread++;
    if (++texel_bitsread == 8) {
        texel_bitsread = 0;
        texel_index++;
    }
    return weight_bit;
}
393
// Reads `num_bits` bits through ReadColorBit(); the first bit read becomes bit 0 of the result.
uint StreamColorBits(uint num_bits) {
    uint value = 0;
    uint position = 0;
    while (position < num_bits) {
        value |= (ReadColorBit() & 1) << position;
        position++;
    }
    return value;
}
401
// Output lists of the integer-sequence decoder: colour values and texel weights.
EncodingData result_vector[100];
int result_index = 0;

EncodingData texel_vector[100];
int texel_vector_index = 0;

// Appends a decoded value to texel_vector while texel_flag is set, to result_vector otherwise.
// Values that would not fit are dropped: trit/quint groups can over-produce, and a malformed
// block can request more values than the fixed-size lists hold, which previously wrote past
// the end of the array.
void ResultEmplaceBack(EncodingData val) {
    if (texel_flag) {
        if (texel_vector_index < texel_vector.length()) {
            texel_vector[texel_vector_index] = val;
            texel_vector_index++;
        }
    } else {
        if (result_index < result_vector.length()) {
            result_vector[result_index] = val;
            result_index++;
        }
    }
}
417
// Number of bits needed to store `n_vals` values with the encoding at `encoding_index`:
// the plain bits of every value, plus 8 bits per 5 values for trits or 7 bits per 3 values
// for quints (rounded up).
uint GetBitLength(uint n_vals, uint encoding_index) {
    const EncodingData enc = encoding_values[encoding_index];
    uint packed_digit_bits = 0;
    if (enc.encoding == Trit) {
        packed_digit_bits = (n_vals * 8 + 4) / 5;
    } else if (enc.encoding == Quint) {
        packed_digit_bits = (n_vals * 7 + 2) / 3;
    }
    return enc.num_bits * n_vals + packed_digit_bits;
}
428
// Number of stored weights: one per weight-grid cell, doubled in dual-plane mode.
uint GetNumWeightValues(uvec2 size, bool dual_plane) {
    const uint per_plane = size.x * size.y;
    return dual_plane ? per_plane * 2 : per_plane;
}
436
// Number of bits occupied by the weight data for the given grid and encoding index.
uint GetPackedBitSize(uvec2 size, bool dual_plane, uint max_weight) {
    return GetBitLength(GetNumWeightValues(size, dual_plane), max_weight);
}
441
// Returns bit `pos` of `bits` as 0 or 1.
uint BitsBracket(uint bits, uint pos) {
    const uint shifted = bits >> pos;
    return shifted & 1;
}
445
// Extracts the inclusive bit range between `start` and `end` from `bits`, right-aligned.
// The two bounds may be passed in either order; equal bounds select a single bit.
uint BitsOp(uint bits, uint start, uint end) {
    const uint low = (start > end) ? end : start;
    const uint high = (start > end) ? start : end;
    const uint mask = (1 << (high - low + 1)) - 1;
    return (bits >> low) & mask;
}
458
// Decodes one quint group from the current colour/weight stream: three values, each with
// `num_bits` plain bits, sharing 7 bits of packed quint data (read as 3 + 2 + 2 bits
// interleaved with the plain bits). Appends three EncodingData entries via ResultEmplaceBack.
459void DecodeQuintBlock(uint num_bits) { // Value number of bits
// m[i]: plain bits of value i, q[i]: its quint digit, Q: the 7 packed bits.
460 uint m[3];
461 uint q[3];
462 uint Q;
463 m[0] = StreamColorBits(num_bits);
464 Q = StreamColorBits(3);
465 m[1] = StreamColorBits(num_bits);
466 Q |= StreamColorBits(2) << 3;
467 m[2] = StreamColorBits(num_bits);
468 Q |= StreamColorBits(2) << 5;
// Unpack Q into the three digits. The first branch handles the pattern where the first two
// digits are both 4.
469 if (BitsOp(Q, 1, 2) == 3 && BitsOp(Q, 5, 6) == 0) {
470 q[0] = 4;
471 q[1] = 4;
472 q[2] = (BitsBracket(Q, 0) << 2) | ((BitsBracket(Q, 4) & ~BitsBracket(Q, 0)) << 1) |
473 (BitsBracket(Q, 3) & ~BitsBracket(Q, 0));
474 } else {
// C is an intermediate 5-bit code from which q[0] and q[1] are taken.
475 uint C = 0;
476 if (BitsOp(Q, 1, 2) == 3) {
477 q[2] = 4;
478 C = (BitsOp(Q, 3, 4) << 3) | ((~BitsOp(Q, 5, 6) & 3) << 1) | BitsBracket(Q, 0);
479 } else {
480 q[2] = BitsOp(Q, 5, 6);
481 C = BitsOp(Q, 0, 4);
482 }
483
484 if (BitsOp(C, 0, 2) == 5) {
485 q[1] = 4;
486 q[0] = BitsOp(C, 3, 4);
487 } else {
488 q[1] = BitsOp(C, 3, 4);
489 q[0] = BitsOp(C, 0, 2);
490 }
491 }
492
// Emit the three values in stream order.
493 for (uint i = 0; i < 3; i++) {
494 EncodingData val;
495 val.encoding = Quint;
496 val.num_bits = num_bits;
497 val.bit_value = m[i];
498 val.quint_trit_value = q[i];
499 ResultEmplaceBack(val);
500 }
501}
502
// Decodes one trit group from the current colour/weight stream: five values, each with
// `num_bits` plain bits, sharing 8 bits of packed trit data (read as 2 + 2 + 1 + 2 + 1 bits
// interleaved with the plain bits). Appends five EncodingData entries via ResultEmplaceBack.
503void DecodeTritBlock(uint num_bits) {
// m[i]: plain bits of value i, t[i]: its trit digit, T: the 8 packed bits.
504 uint m[5];
505 uint t[5];
506 uint T;
507 m[0] = StreamColorBits(num_bits);
508 T = StreamColorBits(2);
509 m[1] = StreamColorBits(num_bits);
510 T |= StreamColorBits(2) << 2;
511 m[2] = StreamColorBits(num_bits);
512 T |= StreamColorBits(1) << 4;
513 m[3] = StreamColorBits(num_bits);
514 T |= StreamColorBits(2) << 5;
515 m[4] = StreamColorBits(num_bits);
516 T |= StreamColorBits(1) << 7;
// First derive t[4], t[3] and the intermediate 5-bit code C from T ...
517 uint C = 0;
518 if (BitsOp(T, 2, 4) == 7) {
519 C = (BitsOp(T, 5, 7) << 2) | BitsOp(T, 0, 1);
520 t[4] = 2;
521 t[3] = 2;
522 } else {
523 C = BitsOp(T, 0, 4);
524 if (BitsOp(T, 5, 6) == 3) {
525 t[4] = 2;
526 t[3] = BitsBracket(T, 7);
527 } else {
528 t[4] = BitsBracket(T, 7);
529 t[3] = BitsOp(T, 5, 6);
530 }
531 }
// ... then t[2], t[1] and t[0] from C.
532 if (BitsOp(C, 0, 1) == 3) {
533 t[2] = 2;
534 t[1] = BitsBracket(C, 4);
535 t[0] = (BitsBracket(C, 3) << 1) | (BitsBracket(C, 2) & ~BitsBracket(C, 3));
536 } else if (BitsOp(C, 2, 3) == 3) {
537 t[2] = 2;
538 t[1] = 2;
539 t[0] = BitsOp(C, 0, 1);
540 } else {
541 t[2] = BitsBracket(C, 4);
542 t[1] = BitsOp(C, 2, 3);
543 t[0] = (BitsBracket(C, 1) << 1) | (BitsBracket(C, 0) & ~BitsBracket(C, 1));
544 }
// Emit the five values in stream order.
545 for (uint i = 0; i < 5; i++) {
546 EncodingData val;
547 val.encoding = Trit;
548 val.num_bits = num_bits;
549 val.bit_value = m[i];
550 val.quint_trit_value = t[i];
551 ResultEmplaceBack(val);
552 }
553}
// Decodes at least `num_values` values from the current colour/weight stream using the
// encoding stored at encoding_values[max_range]. Quints are decoded three at a time and trits
// five at a time, so more than `num_values` entries may be appended.
void DecodeIntegerSequence(uint max_range, uint num_values) {
    EncodingData val = encoding_values[max_range];
    for (uint decoded = 0; decoded < num_values;) {
        if (val.encoding == Quint) {
            DecodeQuintBlock(val.num_bits);
            decoded += 3;
        } else if (val.encoding == Trit) {
            DecodeTritBlock(val.num_bits);
            decoded += 5;
        } else if (val.encoding == JustBits) {
            // Plain values are stored back to back, one per read.
            val.bit_value = StreamColorBits(val.num_bits);
            ResultEmplaceBack(val);
            decoded++;
        }
    }
}
577
// Decodes and unquantizes the colour endpoint values of a block.
//
// color_values     receives one 8-bit value per decoded entry (up to 32).
// modes            colour endpoint mode of each partition; mode m uses ((m >> 2) + 1) * 2 values.
// num_partitions   number of valid entries in `modes`.
// color_data_bits  number of bits available for the colour data.
//
// The values are read from the colour stream (texel_flag must be false) and are left in
// result_vector as a side effect.
void DecodeColorValues(out uint color_values[32], uvec4 modes, uint num_partitions,
                       uint color_data_bits) {
    uint num_values = 0;
    for (uint i = 0; i < num_partitions; i++) {
        num_values += ((modes[i] >> 2) + 1) << 1;
    }
    // Scan from the largest range downwards for the first one whose encoded size fits.
    int range = 256;
    while (--range > 0) {
        EncodingData val = encoding_values[range];
        uint bitLength = GetBitLength(num_values, range);
        if (bitLength <= color_data_bits) {
            // Walk down to the smallest range index that still uses exactly this encoding.
            // Two entries differ as soon as either the kind or the bit count differs.
            while (--range > 0) {
                EncodingData newval = encoding_values[range];
                if (newval.encoding != val.encoding || newval.num_bits != val.num_bits) {
                    break;
                }
            }
            // Step back onto the last matching entry.
            range++;
            break;
        }
    }
    DecodeIntegerSequence(range, num_values);
    uint out_index = 0;
    for (int itr = 0; itr < result_index; itr++) {
        // Trit/quint groups can decode more entries than needed; ignore the surplus.
        if (out_index >= num_values) {
            break;
        }
        EncodingData val = result_vector[itr];
        uint bitlen = val.num_bits;
        uint bitval = val.bit_value;
        // Unquantization terms: A is the lowest bit replicated to 9 bits, B a bit pattern
        // built from the remaining bits, C a per-width constant and D the trit/quint digit.
        uint A = 0, B = 0, C = 0, D = 0;
        A = ReplicateBitTo9((bitval & 1));
        switch (val.encoding) {
        case JustBits:
            color_values[out_index++] = FastReplicateTo8(bitval, bitlen);
            break;
        case Trit: {
            D = val.quint_trit_value;
            switch (bitlen) {
            case 1: {
                C = 204;
            } break;
            case 2: {
                C = 93;
                uint b = (bitval >> 1) & 1;
                B = (b << 8) | (b << 4) | (b << 2) | (b << 1);
            } break;

            case 3: {
                C = 44;
                uint cb = (bitval >> 1) & 3;
                B = (cb << 7) | (cb << 2) | cb;
            } break;

            case 4: {
                C = 22;
                uint dcb = (bitval >> 1) & 7;
                B = (dcb << 6) | dcb;
            } break;

            case 5: {
                C = 11;
                uint edcb = (bitval >> 1) & 0xF;
                B = (edcb << 5) | (edcb >> 2);
            } break;

            case 6: {
                C = 5;
                uint fedcb = (bitval >> 1) & 0x1F;
                B = (fedcb << 4) | (fedcb >> 4);
            } break;
            }
        } break;
        case Quint: {
            D = val.quint_trit_value;
            switch (bitlen) {
            case 1: {
                C = 113;
            } break;
            case 2: {
                C = 54;
                uint b = (bitval >> 1) & 1;
                B = (b << 8) | (b << 3) | (b << 2);
            } break;
            case 3: {
                C = 26;
                uint cb = (bitval >> 1) & 3;
                B = (cb << 7) | (cb << 1) | (cb >> 1);
            } break;
            case 4: {
                C = 13;
                uint dcb = (bitval >> 1) & 7;
                B = (dcb << 6) | (dcb >> 1);
            } break;
            case 5: {
                C = 6;
                uint edcb = (bitval >> 1) & 0xF;
                B = (edcb << 5) | (edcb >> 3);
            } break;
            }
        } break;
        }

        if (val.encoding != JustBits) {
            // Combine the terms and reduce the 9-bit intermediate to 8 bits.
            uint T = (D * C) + B;
            T ^= A;
            T = (A & 0x80) | (T >> 2);
            color_values[out_index++] = T;
        }
    }
}
// Moves the top bit of `a` into the top bit of `b` and turns the remaining bits of `a` into
// a signed 6-bit offset.
// Returns (new a, new b): new b is (b >> 1) | (a & 0x80); new a is bits 1..6 of the input
// sign-extended from bit 5, i.e. a value in [-32, 31].
ivec2 BitTransferSigned(int a, int b) {
    const int moved_b = (b >> 1) | (a & 0x80);
    int moved_a = (a >> 1) & 0x3F;
    if ((moved_a & 0x20) != 0) {
        moved_a -= 0x40;
    }
    return ivec2(moved_a, moved_b);
}
700
// Clamps every component to the range [0, 255] and converts the result to unsigned.
uvec4 ClampByte(ivec4 color) {
    return uvec4(clamp(color, 0, 255));
}
// Averages the red and green channels with blue; alpha and blue pass through unchanged.
// The result is ordered (a, r, g, b).
ivec4 BlueContract(int a, int r, int g, int b) {
    const int contracted_r = (r + b) >> 1;
    const int contracted_g = (g + b) >> 1;
    return ivec4(a, contracted_r, contracted_g, b);
}
710int colvals_index = 0;
// Builds the two colour endpoints of one partition from the unquantized colour values.
//
// ep1, ep2             receive the endpoints, ordered (A, R, G, B).
// color_values         values produced by DecodeColorValues.
// color_endpoint_mode  endpoint mode of the partition.
//
// Values are consumed starting at the global cursor colvals_index, which advances by the
// number of values the mode uses, so consecutive calls walk the partitions in order.
// Modes the switch does not handle (2, 3, 7, 11, 14, 15) leave ep1 and ep2 unwritten and do
// not advance the cursor.
void ComputeEndpoints(out uvec4 ep1, out uvec4 ep2, uint color_values[32],
                      uint color_endpoint_mode) {
// Fetch the next N values as unsigned / signed integers into a local array `v`.
#define READ_UINT_VALUES(N)                                                                        \
    uint v[N];                                                                                     \
    for (uint i = 0; i < N; i++) {                                                                 \
        v[i] = color_values[colvals_index++];                                                      \
    }

#define READ_INT_VALUES(N)                                                                         \
    int v[N];                                                                                      \
    for (uint i = 0; i < N; i++) {                                                                 \
        v[i] = int(color_values[colvals_index++]);                                                 \
    }

    switch (color_endpoint_mode) {
    case 0: {
        // Luminance, direct.
        READ_UINT_VALUES(2)
        ep1 = uvec4(0xFF, v[0], v[0], v[0]);
        ep2 = uvec4(0xFF, v[1], v[1], v[1]);
    } break;

    case 1: {
        // Luminance, base + offset. The second endpoint saturates at 255, so the sum has to
        // be limited with min(); max() would force it to at least 255.
        READ_UINT_VALUES(2)
        uint L0 = (v[0] >> 2) | (v[1] & 0xC0);
        uint L1 = min(L0 + (v[1] & 0x3F), 0xFFU);
        ep1 = uvec4(0xFF, L0, L0, L0);
        ep2 = uvec4(0xFF, L1, L1, L1);
    } break;

    case 4: {
        // Luminance + alpha, direct.
        READ_UINT_VALUES(4)
        ep1 = uvec4(v[2], v[0], v[0], v[0]);
        ep2 = uvec4(v[3], v[1], v[1], v[1]);
    } break;

    case 5: {
        // Luminance + alpha, base + signed offset.
        READ_INT_VALUES(4)
        ivec2 transferred = BitTransferSigned(v[1], v[0]);
        v[1] = transferred[0];
        v[0] = transferred[1];
        transferred = BitTransferSigned(v[3], v[2]);
        v[3] = transferred[0];
        v[2] = transferred[1];
        ep1 = ClampByte(ivec4(v[2], v[0], v[0], v[0]));
        ep2 = ClampByte(ivec4((v[2] + v[3]), v[0] + v[1], v[0] + v[1], v[0] + v[1]));
    } break;

    case 6: {
        // RGB, base + scale.
        READ_UINT_VALUES(4)
        ep1 = uvec4(0xFF, v[0] * v[3] >> 8, v[1] * v[3] >> 8, v[2] * v[3] >> 8);
        ep2 = uvec4(0xFF, v[0], v[1], v[2]);
    } break;

    case 8: {
        // RGB, direct. When the second colour sums lower than the first, the endpoints are
        // swapped and blue-contracted.
        READ_UINT_VALUES(6)
        if (v[1] + v[3] + v[5] >= v[0] + v[2] + v[4]) {
            ep1 = uvec4(0xFF, v[0], v[2], v[4]);
            ep2 = uvec4(0xFF, v[1], v[3], v[5]);
        } else {
            ep1 = uvec4(BlueContract(0xFF, int(v[1]), int(v[3]), int(v[5])));
            ep2 = uvec4(BlueContract(0xFF, int(v[0]), int(v[2]), int(v[4])));
        }
    } break;

    case 9: {
        // RGB, base + signed offset.
        READ_INT_VALUES(6)
        ivec2 transferred = BitTransferSigned(v[1], v[0]);
        v[1] = transferred[0];
        v[0] = transferred[1];
        transferred = BitTransferSigned(v[3], v[2]);
        v[3] = transferred[0];
        v[2] = transferred[1];
        transferred = BitTransferSigned(v[5], v[4]);
        v[5] = transferred[0];
        v[4] = transferred[1];
        if (v[1] + v[3] + v[5] >= 0) {
            ep1 = ClampByte(ivec4(0xFF, v[0], v[2], v[4]));
            ep2 = ClampByte(ivec4(0xFF, v[0] + v[1], v[2] + v[3], v[4] + v[5]));
        } else {
            ep1 = ClampByte(BlueContract(0xFF, v[0] + v[1], v[2] + v[3], v[4] + v[5]));
            ep2 = ClampByte(BlueContract(0xFF, v[0], v[2], v[4]));
        }
    } break;

    case 10: {
        // RGB base + scale, with two direct alpha values.
        READ_UINT_VALUES(6)
        ep1 = uvec4(v[4], v[0] * v[3] >> 8, v[1] * v[3] >> 8, v[2] * v[3] >> 8);
        ep2 = uvec4(v[5], v[0], v[1], v[2]);
    } break;

    case 12: {
        // RGBA, direct.
        READ_UINT_VALUES(8)
        if (v[1] + v[3] + v[5] >= v[0] + v[2] + v[4]) {
            ep1 = uvec4(v[6], v[0], v[2], v[4]);
            ep2 = uvec4(v[7], v[1], v[3], v[5]);
        } else {
            ep1 = uvec4(BlueContract(int(v[7]), int(v[1]), int(v[3]), int(v[5])));
            ep2 = uvec4(BlueContract(int(v[6]), int(v[0]), int(v[2]), int(v[4])));
        }
    } break;

    case 13: {
        // RGBA, base + signed offset.
        READ_INT_VALUES(8)
        ivec2 transferred = BitTransferSigned(v[1], v[0]);
        v[1] = transferred[0];
        v[0] = transferred[1];
        transferred = BitTransferSigned(v[3], v[2]);
        v[3] = transferred[0];
        v[2] = transferred[1];

        transferred = BitTransferSigned(v[5], v[4]);
        v[5] = transferred[0];
        v[4] = transferred[1];

        transferred = BitTransferSigned(v[7], v[6]);
        v[7] = transferred[0];
        v[6] = transferred[1];

        if (v[1] + v[3] + v[5] >= 0) {
            ep1 = ClampByte(ivec4(v[6], v[0], v[2], v[4]));
            ep2 = ClampByte(ivec4(v[7] + v[6], v[0] + v[1], v[2] + v[3], v[4] + v[5]));
        } else {
            ep1 = ClampByte(BlueContract(v[6] + v[7], v[0] + v[1], v[2] + v[3], v[4] + v[5]));
            ep2 = ClampByte(BlueContract(v[6], v[0], v[2], v[4]));
        }
    } break;
    }
#undef READ_UINT_VALUES
#undef READ_INT_VALUES
}
841
// Converts one decoded weight into its unquantized value.
// Every path first produces a value of at most 63; values above 32 are then incremented by
// one, so the largest result is 64.
842uint UnquantizeTexelWeight(EncodingData val) {
843 uint bitval = val.bit_value;
844 uint bitlen = val.num_bits;
// A: lowest bit replicated to 7 bits. B: pattern built from the remaining bits.
// C: per-width constant. D: the trit/quint digit.
845 uint A = ReplicateBitTo7((bitval & 1));
846 uint B = 0, C = 0, D = 0;
847 uint result = 0;
848 switch (val.encoding) {
849 case JustBits:
850 result = FastReplicateTo6(bitval, bitlen);
851 break;
852 case Trit: {
853 D = val.quint_trit_value;
854 switch (bitlen) {
// No plain bits: the digit alone selects the weight from a fixed table.
855 case 0: {
856 uint results[3] = {0, 32, 63};
857 result = results[D];
858 } break;
859 case 1: {
860 C = 50;
861 } break;
862 case 2: {
863 C = 23;
864 uint b = (bitval >> 1) & 1;
865 B = (b << 6) | (b << 2) | b;
866 } break;
867 case 3: {
868 C = 11;
869 uint cb = (bitval >> 1) & 3;
870 B = (cb << 5) | cb;
871 } break;
872 default:
873 break;
874 }
875 } break;
876 case Quint: {
877 D = val.quint_trit_value;
878 switch (bitlen) {
// No plain bits: the digit alone selects the weight from a fixed table.
879 case 0: {
880 uint results[5] = {0, 16, 32, 47, 63};
881 result = results[D];
882 } break;
883 case 1: {
884 C = 28;
885 } break;
886 case 2: {
887 C = 13;
888 uint b = (bitval >> 1) & 1;
889 B = (b << 6) | (b << 1);
890 } break;
891 }
892 } break;
893 }
// Trit/quint values with plain bits: combine the terms and reduce the result to 6 bits.
894 if (val.encoding != JustBits && bitlen > 0) {
895 result = D * C + B;
896 result ^= A;
897 result = (A & 0x20) | (result >> 2);
898 }
// Skip one step above 32 so the maximum 63 becomes 64.
899 if (result > 32) {
900 result += 1;
901 }
902 return result;
903}
904
// Unquantizes the decoded weights in texel_vector and interpolates them from the weight grid
// up to one weight per texel of the block.
//
// outbuffer   receives the per-texel weights, indexed [plane][t * block_dims.x + s].
// dual_plane  when set, texel_vector interleaves the weights of plane 0 and plane 1.
// size        dimensions of the stored weight grid.
void UnquantizeTexelWeights(out uint outbuffer[2][144], bool dual_plane, uvec2 size) {
    uint weight_idx = 0;
    uint unquantized[2][144];
    uint area = size.x * size.y;
    for (uint itr = 0; itr < texel_vector_index; itr++) {
        unquantized[0][weight_idx] = UnquantizeTexelWeight(texel_vector[itr]);
        if (dual_plane) {
            ++itr;
            // Stop before reading when the plane-1 partner is missing; reading first would
            // use an entry past the end of the decoded weights.
            if (itr == texel_vector_index) {
                break;
            }
            unquantized[1][weight_idx] = UnquantizeTexelWeight(texel_vector[itr]);
        }
        if (++weight_idx >= (area))
            break;
    }
    // Scale factors mapping a texel coordinate inside the block onto a 0..1024 range.
    uint Ds = uint((block_dims.x * 0.5f + 1024) / (block_dims.x - 1));
    uint Dt = uint((block_dims.y * 0.5f + 1024) / (block_dims.y - 1));
    uint kPlaneScale = dual_plane ? 2 : 1;
    for (uint plane = 0; plane < kPlaneScale; plane++)
        for (uint t = 0; t < block_dims.y; t++)
            for (uint s = 0; s < block_dims.x; s++) {
                // Position of the texel on the weight grid in 1/16th steps: (js, jt) is the
                // grid cell, (fs, ft) the fraction inside it.
                uint cs = Ds * s;
                uint ct = Dt * t;
                uint gs = (cs * (size.x - 1) + 32) >> 6;
                uint gt = (ct * (size.y - 1) + 32) >> 6;
                uint js = gs >> 4;
                uint fs = gs & 0xF;
                uint jt = gt >> 4;
                uint ft = gt & 0x0F;
                // Bilinear weights of the four surrounding grid points; they sum to 16.
                uint w11 = (fs * ft + 8) >> 4;
                uint w10 = ft - w11;
                uint w01 = fs - w11;
                uint w00 = 16 - fs - ft + w11;
                uvec4 w = uvec4(w00, w01, w10, w11);
                uint v0 = jt * size.x + js;

                // Grid points outside the stored grid contribute 0.
                uvec4 p = uvec4(0);
                if (v0 < area) {
                    p.x = unquantized[plane][v0];
                }
                if ((v0 + 1) < (area)) {
                    p.y = unquantized[plane][v0 + 1];
                }
                if ((v0 + size.x) < (area)) {
                    p.z = unquantized[plane][(v0 + size.x)];
                }
                if ((v0 + size.x + 1) < (area)) {
                    p.w = unquantized[plane][(v0 + size.x + 1)];
                }
                outbuffer[plane][t * block_dims.x + s] = (uint(dot(p, w)) + 8) >> 4;
            }
}
958
// Classifies the block-mode bits into one of ten layouts (0..9), which DecodeBlockInfo uses
// to pick the weight grid dimensions.
// Layouts 0..4 have at least one of the two low mode bits set; layouts 5..9 have both clear.
int FindLayout(uint mode) {
    const bool bit2 = (mode & 4) != 0;
    const bool bit3 = (mode & 8) != 0;
    const bool bit5 = (mode & 0x20) != 0;
    const bool bit7 = (mode & 0x80) != 0;
    const bool bit8 = (mode & 0x100) != 0;

    if ((mode & 3) != 0) {
        if (!bit3) {
            return bit2 ? 1 : 0;
        }
        if (!bit2) {
            return 2;
        }
        return bit8 ? 4 : 3;
    }
    if (!bit8) {
        return bit7 ? 6 : 5;
    }
    if (!bit7) {
        return 9;
    }
    return bit5 ? 8 : 7;
}
989
// Reads the 11-bit block mode from the block stream and derives the weight parameters.
// Returns early with a void-extent flag (plus Error when the following bits are not set as
// required) for void-extent blocks, and with Error set for rejected mode patterns.
// `block_index` is not used.
// NOTE(review): nothing here limits size.x * size.y or the resulting weight bit count;
// confirm whether the caller validates them.
990TexelWeightParams DecodeBlockInfo(uint block_index) {
991 TexelWeightParams params = TexelWeightParams(uvec2(0), false, 0, false, false, false);
992 uint mode = StreamBits(11);
// Void-extent pattern: low nine bits equal 0x1fc; bit 9 selects HDR.
993 if ((mode & 0x1ff) == 0x1fc) {
994 if ((mode & 0x200) != 0) {
995 params.VoidExtentHDR = true;
996 } else {
997 params.VoidExtentLDR = true;
998 }
// Bit 10 and the next stream bit must both be set. The extra bit is only read when bit 10
// is set, because of short-circuit evaluation.
999 if ((mode & 0x400) == 0 || StreamBits(1) == 0) {
1000 params.Error = true;
1001 }
1002 return params;
1003 }
// Rejected mode patterns.
1004 if ((mode & 0xf) == 0) {
1005 params.Error = true;
1006 return params;
1007 }
1008 if ((mode & 3) == 0 && (mode & 0x1c0) == 0x1c0) {
1009 params.Error = true;
1010 return params;
1011 }
// A and B are small fields of the mode; each layout maps them to a grid width and height.
1012 uint A, B;
1013 uint mode_layout = FindLayout(mode);
1014 switch (mode_layout) {
1015 case 0:
1016 A = (mode >> 5) & 0x3;
1017 B = (mode >> 7) & 0x3;
1018 params.size = uvec2(B + 4, A + 2);
1019 break;
1020 case 1:
1021 A = (mode >> 5) & 0x3;
1022 B = (mode >> 7) & 0x3;
1023 params.size = uvec2(B + 8, A + 2);
1024 break;
1025 case 2:
1026 A = (mode >> 5) & 0x3;
1027 B = (mode >> 7) & 0x3;
1028 params.size = uvec2(A + 2, B + 8);
1029 break;
1030 case 3:
1031 A = (mode >> 5) & 0x3;
1032 B = (mode >> 7) & 0x1;
1033 params.size = uvec2(A + 2, B + 6);
1034 break;
1035 case 4:
1036 A = (mode >> 5) & 0x3;
1037 B = (mode >> 7) & 0x1;
1038 params.size = uvec2(B + 2, A + 2);
1039 break;
1040 case 5:
1041 A = (mode >> 5) & 0x3;
1042 params.size = uvec2(12, A + 2);
1043 break;
1044 case 6:
1045 A = (mode >> 5) & 0x3;
1046 params.size = uvec2(A + 2, 12);
1047 break;
1048 case 7:
1049 params.size = uvec2(6, 10);
1050 break;
1051 case 8:
1052 params.size = uvec2(10, 6);
1053 break;
1054 case 9:
1055 A = (mode >> 5) & 0x3;
1056 B = (mode >> 9) & 0x3;
1057 params.size = uvec2(A + 6, B + 6);
1058 break;
1059 default:
1060 params.Error = true;
1061 break;
1062 }
// Layout 9 uses bits 9 and 10 for B, so it never has dual-plane or the high-range table.
1063 params.dual_plane = (mode_layout != 9) && ((mode & 0x400) != 0);
// Weight range selector: bit 4 is the low bit; the upper two bits come from bits 0-1
// (layouts 0..4) or bits 2-3 (layouts 5..9). The earlier rejections keep those bits
// non-zero, so the selector is at least 2 before the subtraction.
1064 uint weight_index = (mode & 0x10) != 0 ? 1 : 0;
1065 if (mode_layout < 5) {
1066 weight_index |= (mode & 0x3) << 1;
1067 } else {
1068 weight_index |= (mode & 0xc) >> 1;
1069 }
1070 weight_index -= 2;
// Bit 9 selects the table of larger weight ranges.
1071 if ((mode_layout != 9) && ((mode & 0x200) != 0)) {
1072 const int max_weights[6] = int[6](9, 11, 15, 19, 23, 31);
1073 params.max_weight = max_weights[weight_index];
1074 } else {
1075 const int max_weights[6] = int[6](1, 2, 3, 4, 5, 7);
1076 params.max_weight = max_weights[weight_index];
1077 }
1078 return params;
1079}
1080
// Fills the block whose top-left texel is coord.xy with the error colour
// (r, g, b, a) = (1, 1, 0, 1).
void FillError(ivec3 coord) {
    const vec4 error_color = vec4(1.0, 1.0, 0.0, 1.0);
    for (uint row = 0; row < block_dims.y; row++) {
        for (uint column = 0; column < block_dims.x; column++) {
            imageStore(dest_image, coord.xy + ivec2(column, row), error_color);
        }
    }
}
1089
// Fills the block whose top-left texel is coord.xy with the single colour stored in a
// void-extent block. `block_index` is not used.
void FillVoidExtentLDR(ivec3 coord, uint block_index) {
    // Four 13-bit fields precede the colour; they are read and discarded.
    for (int skipped = 0; skipped < 4; skipped++) {
        StreamBits(13);
    }

    // The colour follows as four 16-bit channels in the order R, G, B, A.
    const uint r_u = StreamBits(16);
    const uint g_u = StreamBits(16);
    const uint b_u = StreamBits(16);
    const uint a_u = StreamBits(16);
    const vec4 color = vec4(float(r_u) / 65535.0f, float(g_u) / 65535.0f,
                            float(b_u) / 65535.0f, float(a_u) / 65535.0f);

    for (uint row = 0; row < block_dims.y; row++) {
        for (uint column = 0; column < block_dims.x; column++) {
            imageStore(dest_image, coord.xy + ivec2(column, row), color);
        }
    }
}
1109
// Decodes the ASTC block currently held in local_buff and writes its texels to dest_image,
// starting at coord.xy. Blocks that cannot be decoded (decode error, HDR void extent, weight grid
// larger than the block, invalid partition setup) are filled through FillError instead.
void DecompressBlock(ivec3 coord, uint block_index) {
    TexelWeightParams params;
    params = DecodeBlockInfo(block_index);
    if (params.Error) {
        FillError(coord);
        return;
    }
    if (params.VoidExtentHDR) {
        // HDR void-extent blocks are not decoded by this shader
        FillError(coord);
        return;
    }
    if (params.VoidExtentLDR) {
        FillVoidExtentLDR(coord, block_index);
        return;
    }
    if (params.size.x > block_dims.x || params.size.y > block_dims.y) {
        FillError(coord);
        return;
    }
    uint num_partitions = StreamBits(2) + 1;
    if (num_partitions > 4 || (num_partitions == 4 && params.dual_plane)) {
        FillError(coord);
        return;
    }
    int plane_index = -1;
    uint partition_index = 1;
    uvec4 color_endpoint_mode = uvec4(0);
    uint ced_pointer = 0;
    uint base_cem = 0;
    if (num_partitions == 1) {
        color_endpoint_mode[0] = StreamBits(4);
        partition_index = 0;
    } else {
        partition_index = StreamBits(10);
        base_cem = StreamBits(6);
    }
    uint base_mode = base_cem & 3;
    // Size in bits of the packed texel weights; reused below when isolating the weight data
    uint weight_bits = GetPackedBitSize(params.size, params.dual_plane, params.max_weight);
    uint remaining_bits = 128 - weight_bits - total_bitsread;
    uint extra_cem_bits = 0;
    if (base_mode > 0) {
        switch (num_partitions) {
        case 2:
            extra_cem_bits += 2;
            break;
        case 3:
            extra_cem_bits += 5;
            break;
        case 4:
            extra_cem_bits += 8;
            break;
        default:
            // base_mode is only non-zero when more than one partition was read, so this is not
            // expected to be reached; flag the block instead of leaving its texels unwritten.
            FillError(coord);
            return;
        }
    }
    remaining_bits -= extra_cem_bits;
    uint plane_selector_bits = 0;
    if (params.dual_plane) {
        plane_selector_bits = 2;
    }
    remaining_bits -= plane_selector_bits;
    // Read color data...
    uint color_data_bits = remaining_bits;
    while (remaining_bits > 0) {
        uint nb = min(remaining_bits, 8);
        uint b = StreamBits(nb);
        color_endpoint_data[ced_pointer] = uint(bitfieldExtract(b, 0, 8));
        ced_pointer++;
        remaining_bits -= nb;
    }
    plane_index = int(StreamBits(plane_selector_bits));
    if (base_mode > 0) {
        // Rebuild the full endpoint mode word: the extra bits sit above the 6 base bits
        uint extra_cem = StreamBits(extra_cem_bits);
        uint cem = (extra_cem << 6) | base_cem;
        cem >>= 2;
        // One class-select bit per partition...
        uint C[4] = {0, 0, 0, 0};
        for (uint i = 0; i < num_partitions; i++) {
            C[i] = cem & 1;
            cem >>= 1;
        }
        // ...followed by two mode bits per partition
        uint M[4] = {0, 0, 0, 0};
        for (uint i = 0; i < num_partitions; i++) {
            M[i] = cem & 3;
            cem >>= 2;
        }
        for (uint i = 0; i < num_partitions; i++) {
            color_endpoint_mode[i] = base_mode;
            if ((C[i]) == 0) {
                color_endpoint_mode[i] -= 1;
            }
            color_endpoint_mode[i] <<= 2;
            color_endpoint_mode[i] |= M[i];
        }
    } else if (num_partitions > 1) {
        // All partitions share the same endpoint mode
        uint cem = base_cem >> 2;
        for (uint i = 0; i < num_partitions; i++) {
            color_endpoint_mode[i] = cem;
        }
    }

    uint color_values[32]; // Four values, two endpoints, four maximum partitions
    DecodeColorValues(color_values, color_endpoint_mode, num_partitions, color_data_bits);
    uvec4 endpoints[4][2];
    for (uint i = 0; i < num_partitions; i++) {
        ComputeEndpoints(endpoints[i][0], endpoints[i][1], color_values, color_endpoint_mode[i]);
    }
    for (uint i = 0; i < 16; i++) {
        texel_weight_data[i] = local_buff[i];
    }
    // Reverse the whole 128-bit block: swap byte i with byte 15 - i and reverse the bits inside
    // each byte, so the weight bits can be consumed from the start of texel_weight_data.
    for (uint i = 0; i < 8; i++) {
#define REVERSE_BYTE(b) ((b * 0x0802U & 0x22110U) | (b * 0x8020U & 0x88440U)) * 0x10101U >> 16
        uint a = REVERSE_BYTE(texel_weight_data[i]);
        uint b = REVERSE_BYTE(texel_weight_data[15 - i]);
#undef REVERSE_BYTE
        texel_weight_data[i] = uint(bitfieldExtract(b, 0, 8));
        texel_weight_data[15 - i] = uint(bitfieldExtract(a, 0, 8));
    }
    // Keep only the first weight_bits bits: mask the partially used byte and zero the rest
    uint clear_byte_start = (weight_bits >> 3) + 1;
    texel_weight_data[clear_byte_start - 1] =
        texel_weight_data[clear_byte_start - 1] & uint(((1 << (weight_bits % 8)) - 1));
    for (uint i = 0; i < 16 - clear_byte_start; i++) {
        texel_weight_data[clear_byte_start + i] = uint(0U);
    }
    texel_flag = true; // use texel "vector" and bit stream in integer decoding
    DecodeIntegerSequence(params.max_weight, GetNumWeightValues(params.size, params.dual_plane));
    uint weights[2][144];
    UnquantizeTexelWeights(weights, params.dual_plane, params.size);
    for (uint j = 0; j < block_dims.y; j++) {
        for (uint i = 0; i < block_dims.x; i++) {
            uint local_partition = Select2DPartition(partition_index, i, j, num_partitions,
                                                     (block_dims.y * block_dims.x) < 32);
            vec4 p;
            uvec4 C0 = ReplicateByteTo16(endpoints[local_partition][0]);
            uvec4 C1 = ReplicateByteTo16(endpoints[local_partition][1]);
            uvec4 plane_vec = uvec4(0);
            uvec4 weight_vec = uvec4(0);
            for (uint c = 0; c < 4; c++) {
                // With dual planes, one component takes its weight from the second plane
                if (params.dual_plane && (((plane_index + 1) & 3) == c)) {
                    plane_vec[c] = 1;
                }
                weight_vec[c] = weights[plane_vec[c]][j * block_dims.x + i];
            }
            // Blend the two endpoints with 6-bit weights (rounded), then normalize from 16 bits
            vec4 Cf = vec4((C0 * (uvec4(64) - weight_vec) + C1 * weight_vec + uvec4(32)) >> 6);
            p = (Cf / 65535.0);
            imageStore(dest_image, coord.xy + ivec2(i, j), p.gbar);
        }
    }
}
1261
// Entry point: one invocation decodes one ASTC block. It computes the block's byte offset in the
// block-linear input, copies its 16 bytes into local_buff and decodes them into dest_image.
void main() {
    uvec3 pos = gl_GlobalInvocationID + origin;
    pos.x <<= bytes_per_block_log2;

    // Read as soon as possible due to its latency
    const uint swizzle = SwizzleOffset(pos.xy);

    const uint block_y = pos.y >> GOB_SIZE_Y_SHIFT;

    // Byte offset of this block inside the swizzled input buffer
    const uint layer_offset = layer * layer_stride;
    const uint block_row_offset = (block_y >> block_height) * block_size;
    const uint gob_row_offset = (block_y & block_height_mask) << GOB_SIZE_SHIFT;
    const uint gob_column_offset = (pos.x >> GOB_SIZE_X_SHIFT) << x_shift;
    const uint offset =
        layer_offset + block_row_offset + gob_row_offset + gob_column_offset + swizzle;

    // Top-left texel of the block in the destination image
    const ivec3 invocation_destination = ivec3(gl_GlobalInvocationID + destination);
    const ivec3 coord = ivec3(invocation_destination * uvec3(block_dims, 1.0));

    uint block_index =
        layer * num_image_blocks.x * num_image_blocks.y + pos.y * num_image_blocks.x + pos.x;

    // Reset the bit stream state, then load the 16 bytes of this block
    current_index = 0;
    bitsread = 0;
    for (int byte_index = 0; byte_index < 16; byte_index++) {
        local_buff[byte_index] = ReadTexel(offset + byte_index);
    }
    DecompressBlock(coord, block_index);
}
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp
index e028677e9..29105ecad 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp
@@ -307,7 +307,8 @@ void ApplySwizzle(GLuint handle, PixelFormat format, std::array<SwizzleSource, 4
307 307
308[[nodiscard]] bool CanBeAccelerated(const TextureCacheRuntime& runtime, 308[[nodiscard]] bool CanBeAccelerated(const TextureCacheRuntime& runtime,
309 const VideoCommon::ImageInfo& info) { 309 const VideoCommon::ImageInfo& info) {
310 // Disable accelerated uploads for now as they don't implement swizzled uploads 310 return (!runtime.HasNativeASTC() && IsPixelFormatASTC(info.format));
311 // Disable other accelerated uploads for now as they don't implement swizzled uploads
311 return false; 312 return false;
312 switch (info.type) { 313 switch (info.type) {
313 case ImageType::e2D: 314 case ImageType::e2D:
@@ -567,6 +568,9 @@ void TextureCacheRuntime::BlitFramebuffer(Framebuffer* dst, Framebuffer* src,
567 568
568void TextureCacheRuntime::AccelerateImageUpload(Image& image, const ImageBufferMap& map, 569void TextureCacheRuntime::AccelerateImageUpload(Image& image, const ImageBufferMap& map,
569 std::span<const SwizzleParameters> swizzles) { 570 std::span<const SwizzleParameters> swizzles) {
571 if (IsPixelFormatASTC(image.info.format)) {
572 return util_shaders.ASTCDecode(image, map, swizzles);
573 }
570 switch (image.info.type) { 574 switch (image.info.type) {
571 case ImageType::e2D: 575 case ImageType::e2D:
572 return util_shaders.BlockLinearUpload2D(image, map, swizzles); 576 return util_shaders.BlockLinearUpload2D(image, map, swizzles);
@@ -599,6 +603,10 @@ FormatProperties TextureCacheRuntime::FormatInfo(ImageType type, GLenum internal
599 } 603 }
600} 604}
601 605
/// Returns the value of device.HasASTC(); CanBeAccelerated uses it to send ASTC images to the
/// accelerated (compute shader) upload path only when this is false.
606bool TextureCacheRuntime::HasNativeASTC() const noexcept {
607 return device.HasASTC();
608}
609
602TextureCacheRuntime::StagingBuffers::StagingBuffers(GLenum storage_flags_, GLenum map_flags_) 610TextureCacheRuntime::StagingBuffers::StagingBuffers(GLenum storage_flags_, GLenum map_flags_)
603 : storage_flags{storage_flags_}, map_flags{map_flags_} {} 611 : storage_flags{storage_flags_}, map_flags{map_flags_} {}
604 612
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h
index 3fbaa102f..3c871541b 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.h
+++ b/src/video_core/renderer_opengl/gl_texture_cache.h
@@ -95,6 +95,8 @@ public:
95 return has_broken_texture_view_formats; 95 return has_broken_texture_view_formats;
96 } 96 }
97 97
98 bool HasNativeASTC() const noexcept;
99
98private: 100private:
99 struct StagingBuffers { 101 struct StagingBuffers {
100 explicit StagingBuffers(GLenum storage_flags_, GLenum map_flags_); 102 explicit StagingBuffers(GLenum storage_flags_, GLenum map_flags_);
diff --git a/src/video_core/renderer_opengl/util_shaders.cpp b/src/video_core/renderer_opengl/util_shaders.cpp
index 2fe4799bc..2a4220661 100644
--- a/src/video_core/renderer_opengl/util_shaders.cpp
+++ b/src/video_core/renderer_opengl/util_shaders.cpp
@@ -3,7 +3,10 @@
3// Refer to the license.txt file included. 3// Refer to the license.txt file included.
4 4
5#include <bit> 5#include <bit>
6#include <fstream>
6#include <span> 7#include <span>
8#include <streambuf>
9#include <string>
7#include <string_view> 10#include <string_view>
8 11
9#include <glad/glad.h> 12#include <glad/glad.h>
@@ -24,11 +27,13 @@
24#include "video_core/texture_cache/accelerated_swizzle.h" 27#include "video_core/texture_cache/accelerated_swizzle.h"
25#include "video_core/texture_cache/types.h" 28#include "video_core/texture_cache/types.h"
26#include "video_core/texture_cache/util.h" 29#include "video_core/texture_cache/util.h"
30#include "video_core/textures/astc.h"
27#include "video_core/textures/decoders.h" 31#include "video_core/textures/decoders.h"
28 32
29namespace OpenGL { 33namespace OpenGL {
30 34
31using namespace HostShaders; 35using namespace HostShaders;
36using namespace Tegra::Texture::ASTC;
32 37
33using VideoCommon::Extent3D; 38using VideoCommon::Extent3D;
34using VideoCommon::ImageCopy; 39using VideoCommon::ImageCopy;
@@ -63,12 +68,104 @@ UtilShaders::UtilShaders(ProgramManager& program_manager_)
63 pitch_unswizzle_program(MakeProgram(PITCH_UNSWIZZLE_COMP)), 68 pitch_unswizzle_program(MakeProgram(PITCH_UNSWIZZLE_COMP)),
64 copy_bgra_program(MakeProgram(OPENGL_COPY_BGRA_COMP)), 69 copy_bgra_program(MakeProgram(OPENGL_COPY_BGRA_COMP)),
65 copy_bc4_program(MakeProgram(OPENGL_COPY_BC4_COMP)) { 70 copy_bc4_program(MakeProgram(OPENGL_COPY_BC4_COMP)) {
71 // TODO: Load shader string as a header
72 std::string astc_path = "astc_decoder.comp";
73 std::ifstream t(astc_path);
74 std::string str((std::istreambuf_iterator<char>(t)), std::istreambuf_iterator<char>());
75 astc_decoder_program = MakeProgram(str);
76 MakeBuffers();
77}
78
79UtilShaders::~UtilShaders() = default;
80
81void UtilShaders::MakeBuffers() {
66 const auto swizzle_table = Tegra::Texture::MakeSwizzleTable(); 82 const auto swizzle_table = Tegra::Texture::MakeSwizzleTable();
67 swizzle_table_buffer.Create(); 83 swizzle_table_buffer.Create();
68 glNamedBufferStorage(swizzle_table_buffer.handle, sizeof(swizzle_table), &swizzle_table, 0); 84 glNamedBufferStorage(swizzle_table_buffer.handle, sizeof(swizzle_table), &swizzle_table, 0);
85
86 astc_encodings_buffer.Create();
87 glNamedBufferStorage(astc_encodings_buffer.handle, sizeof(EncodingsValues), &EncodingsValues,
88 0);
89 replicate_6_to_8_buffer.Create();
90 glNamedBufferStorage(replicate_6_to_8_buffer.handle, sizeof(REPLICATE_6_BIT_TO_8_TABLE),
91 &REPLICATE_6_BIT_TO_8_TABLE, 0);
92 replicate_7_to_8_buffer.Create();
93 glNamedBufferStorage(replicate_7_to_8_buffer.handle, sizeof(REPLICATE_7_BIT_TO_8_TABLE),
94 &REPLICATE_7_BIT_TO_8_TABLE, 0);
95 replicate_8_to_8_buffer.Create();
96 glNamedBufferStorage(replicate_8_to_8_buffer.handle, sizeof(REPLICATE_8_BIT_TO_8_TABLE),
97 &REPLICATE_8_BIT_TO_8_TABLE, 0);
98 replicate_byte_to_16_buffer.Create();
99 glNamedBufferStorage(replicate_byte_to_16_buffer.handle, sizeof(REPLICATE_BYTE_TO_16_TABLE),
100 &REPLICATE_BYTE_TO_16_TABLE, 0);
69} 101}
70 102
71UtilShaders::~UtilShaders() = default; 103void UtilShaders::ASTCDecode(Image& image, const ImageBufferMap& map,
104 std::span<const VideoCommon::SwizzleParameters> swizzles) {
105 static constexpr GLuint BINDING_SWIZZLE_BUFFER = 0;
106 static constexpr GLuint BINDING_INPUT_BUFFER = 1;
107 static constexpr GLuint BINDING_ENC_BUFFER = 2;
108
109 static constexpr GLuint BINDING_6_TO_8_BUFFER = 3;
110 static constexpr GLuint BINDING_7_TO_8_BUFFER = 4;
111 static constexpr GLuint BINDING_8_TO_8_BUFFER = 5;
112 static constexpr GLuint BINDING_BYTE_TO_16_BUFFER = 6;
113
114 static constexpr GLuint BINDING_OUTPUT_IMAGE = 0;
115 static constexpr GLuint LOC_NUM_IMAGE_BLOCKS = 0;
116 static constexpr GLuint LOC_BLOCK_DIMS = 1;
117 static constexpr GLuint LOC_LAYER = 2;
118
119 const Extent3D tile_size = {
120 VideoCore::Surface::DefaultBlockWidth(image.info.format),
121 VideoCore::Surface::DefaultBlockHeight(image.info.format),
122 };
123 program_manager.BindHostCompute(astc_decoder_program.handle);
124 glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_SWIZZLE_BUFFER, swizzle_table_buffer.handle);
125 glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_ENC_BUFFER, astc_encodings_buffer.handle);
126 glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_6_TO_8_BUFFER,
127 replicate_6_to_8_buffer.handle);
128 glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_7_TO_8_BUFFER,
129 replicate_7_to_8_buffer.handle);
130 glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_8_TO_8_BUFFER,
131 replicate_8_to_8_buffer.handle);
132 glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_BYTE_TO_16_BUFFER,
133 replicate_byte_to_16_buffer.handle);
134
135 glFlushMappedNamedBufferRange(map.buffer, map.offset, image.guest_size_bytes);
136 glUniform2ui(LOC_BLOCK_DIMS, tile_size.width, tile_size.height);
137
138 for (u32 layer = 0; layer < image.info.resources.layers; layer++) {
139 for (const SwizzleParameters& swizzle : swizzles) {
140 glBindImageTexture(BINDING_OUTPUT_IMAGE, image.StorageHandle(), swizzle.level, GL_FALSE,
141 layer, GL_WRITE_ONLY, GL_RGBA8);
142 const size_t input_offset = swizzle.buffer_offset + map.offset;
143 const auto num_dispatches_x = Common::DivCeil(swizzle.num_tiles.width, 32U);
144 const auto num_dispatches_y = Common::DivCeil(swizzle.num_tiles.height, 32U);
145
146 glUniform2ui(LOC_NUM_IMAGE_BLOCKS, swizzle.num_tiles.width, swizzle.num_tiles.height);
147 glUniform1ui(LOC_LAYER, layer);
148
149 // To unswizzle the ASTC data
150 const auto params = MakeBlockLinearSwizzle2DParams(swizzle, image.info);
151 glUniform3uiv(3, 1, params.origin.data());
152 glUniform3iv(4, 1, params.destination.data());
153 glUniform1ui(5, params.bytes_per_block_log2);
154 glUniform1ui(6, params.layer_stride);
155 glUniform1ui(7, params.block_size);
156 glUniform1ui(8, params.x_shift);
157 glUniform1ui(9, params.block_height);
158 glUniform1ui(10, params.block_height_mask);
159
160 // ASTC texture data
161 glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.buffer,
162 input_offset, image.guest_size_bytes - swizzle.buffer_offset);
163
164 glDispatchCompute(num_dispatches_x, num_dispatches_y, 1);
165 }
166 }
167 program_manager.RestoreGuestCompute();
168}
72 169
73void UtilShaders::BlockLinearUpload2D(Image& image, const ImageBufferMap& map, 170void UtilShaders::BlockLinearUpload2D(Image& image, const ImageBufferMap& map,
74 std::span<const SwizzleParameters> swizzles) { 171 std::span<const SwizzleParameters> swizzles) {
diff --git a/src/video_core/renderer_opengl/util_shaders.h b/src/video_core/renderer_opengl/util_shaders.h
index 93b009743..08a1cb9b2 100644
--- a/src/video_core/renderer_opengl/util_shaders.h
+++ b/src/video_core/renderer_opengl/util_shaders.h
@@ -40,6 +40,11 @@ public:
40 explicit UtilShaders(ProgramManager& program_manager); 40 explicit UtilShaders(ProgramManager& program_manager);
41 ~UtilShaders(); 41 ~UtilShaders();
42 42
43 void MakeBuffers();
44
45 void ASTCDecode(Image& image, const ImageBufferMap& map,
46 std::span<const VideoCommon::SwizzleParameters> swizzles);
47
43 void BlockLinearUpload2D(Image& image, const ImageBufferMap& map, 48 void BlockLinearUpload2D(Image& image, const ImageBufferMap& map,
44 std::span<const VideoCommon::SwizzleParameters> swizzles); 49 std::span<const VideoCommon::SwizzleParameters> swizzles);
45 50
@@ -59,7 +64,13 @@ private:
59 ProgramManager& program_manager; 64 ProgramManager& program_manager;
60 65
61 OGLBuffer swizzle_table_buffer; 66 OGLBuffer swizzle_table_buffer;
67 OGLBuffer astc_encodings_buffer;
68 OGLBuffer replicate_6_to_8_buffer;
69 OGLBuffer replicate_7_to_8_buffer;
70 OGLBuffer replicate_8_to_8_buffer;
71 OGLBuffer replicate_byte_to_16_buffer;
62 72
73 OGLProgram astc_decoder_program;
63 OGLProgram block_linear_unswizzle_2d_program; 74 OGLProgram block_linear_unswizzle_2d_program;
64 OGLProgram block_linear_unswizzle_3d_program; 75 OGLProgram block_linear_unswizzle_3d_program;
65 OGLProgram pitch_unswizzle_program; 76 OGLProgram pitch_unswizzle_program;
diff --git a/src/video_core/textures/astc.h b/src/video_core/textures/astc.h
index 9105119bc..bc8bddaec 100644
--- a/src/video_core/textures/astc.h
+++ b/src/video_core/textures/astc.h
@@ -8,6 +8,196 @@
8 8
9namespace Tegra::Texture::ASTC { 9namespace Tegra::Texture::ASTC {
10 10
11/// Count the number of bits set in a number.
12constexpr u32 Popcnt(u32 n) {
13 u32 c = 0;
14 for (; n; c++) {
15 n &= n - 1;
16 }
17 return c;
18}
19
20enum class IntegerEncoding { JustBits, Qus32, Trit };
21
22struct IntegerEncodedValue {
23 constexpr IntegerEncodedValue() = default;
24
25 constexpr IntegerEncodedValue(IntegerEncoding encoding_, u32 num_bits_)
26 : encoding{encoding_}, num_bits{num_bits_} {}
27
28 constexpr bool MatchesEncoding(const IntegerEncodedValue& other) const {
29 return encoding == other.encoding && num_bits == other.num_bits;
30 }
31
32 // Returns the number of bits required to encode nVals values.
33 u32 GetBitLength(u32 nVals) const {
34 u32 totalBits = num_bits * nVals;
35 if (encoding == IntegerEncoding::Trit) {
36 totalBits += (nVals * 8 + 4) / 5;
37 } else if (encoding == IntegerEncoding::Qus32) {
38 totalBits += (nVals * 7 + 2) / 3;
39 }
40 return totalBits;
41 }
42
43 IntegerEncoding encoding{};
44 u32 num_bits = 0;
45 u32 bit_value = 0;
46 union {
47 u32 qus32_value = 0;
48 u32 trit_value;
49 };
50};
51
52// Returns a new instance of this struct that corresponds to the
53// can take no more than maxval values
54static constexpr IntegerEncodedValue CreateEncoding(u32 maxVal) {
55 while (maxVal > 0) {
56 u32 check = maxVal + 1;
57
58 // Is maxVal a power of two?
59 if (!(check & (check - 1))) {
60 return IntegerEncodedValue(IntegerEncoding::JustBits, Popcnt(maxVal));
61 }
62
63 // Is maxVal of the type 3*2^n - 1?
64 if ((check % 3 == 0) && !((check / 3) & ((check / 3) - 1))) {
65 return IntegerEncodedValue(IntegerEncoding::Trit, Popcnt(check / 3 - 1));
66 }
67
68 // Is maxVal of the type 5*2^n - 1?
69 if ((check % 5 == 0) && !((check / 5) & ((check / 5) - 1))) {
70 return IntegerEncodedValue(IntegerEncoding::Qus32, Popcnt(check / 5 - 1));
71 }
72
73 // Apparently it can't be represented with a bounded integer sequence...
74 // just iterate.
75 maxVal--;
76 }
77 return IntegerEncodedValue(IntegerEncoding::JustBits, 0);
78}
79
80static constexpr std::array<IntegerEncodedValue, 256> MakeEncodedValues() {
81 std::array<IntegerEncodedValue, 256> encodings{};
82 for (std::size_t i = 0; i < encodings.size(); ++i) {
83 encodings[i] = CreateEncoding(static_cast<u32>(i));
84 }
85 return encodings;
86}
87
88static constexpr std::array<IntegerEncodedValue, 256> EncodingsValues = MakeEncodedValues();
89
90// Replicates low numBits such that [(toBit - 1):(toBit - 1 - fromBit)]
91// is the same as [(numBits - 1):0] and repeats all the way down.
92template <typename IntType>
93static constexpr IntType Replicate(IntType val, u32 numBits, u32 toBit) {
94 if (numBits == 0) {
95 return 0;
96 }
97 if (toBit == 0) {
98 return 0;
99 }
100 const IntType v = val & static_cast<IntType>((1 << numBits) - 1);
101 IntType res = v;
102 u32 reslen = numBits;
103 while (reslen < toBit) {
104 u32 comp = 0;
105 if (numBits > toBit - reslen) {
106 u32 newshift = toBit - reslen;
107 comp = numBits - newshift;
108 numBits = newshift;
109 }
110 res = static_cast<IntType>(res << numBits);
111 res = static_cast<IntType>(res | (v >> comp));
112 reslen += numBits;
113 }
114 return res;
115}
116
117static constexpr std::size_t NumReplicateEntries(u32 num_bits) {
118 return std::size_t(1) << num_bits;
119}
120
121template <typename IntType, u32 num_bits, u32 to_bit>
122static constexpr auto MakeReplicateTable() {
123 std::array<IntType, NumReplicateEntries(num_bits)> table{};
124 for (IntType value = 0; value < static_cast<IntType>(std::size(table)); ++value) {
125 table[value] = Replicate(value, num_bits, to_bit);
126 }
127 return table;
128}
129
/// Compile-time table of 8-bit values replicated to 16 bits (256 entries), and its lookup.
/// value indexes the table directly, so it must be below 256.
130static constexpr auto REPLICATE_BYTE_TO_16_TABLE = MakeReplicateTable<u32, 8, 16>();
131static constexpr u32 ReplicateByteTo16(std::size_t value) {
132 return REPLICATE_BYTE_TO_16_TABLE[value];
133}
134
/// Compile-time table of a single bit replicated to 7 bits (2 entries), and its lookup.
/// value indexes the table directly, so it must be 0 or 1.
135static constexpr auto REPLICATE_BIT_TO_7_TABLE = MakeReplicateTable<u32, 1, 7>();
136static constexpr u32 ReplicateBitTo7(std::size_t value) {
137 return REPLICATE_BIT_TO_7_TABLE[value];
138}
139
/// Compile-time table of a single bit replicated to 9 bits (2 entries), and its lookup.
/// value indexes the table directly, so it must be 0 or 1.
140static constexpr auto REPLICATE_BIT_TO_9_TABLE = MakeReplicateTable<u32, 1, 9>();
141static constexpr u32 ReplicateBitTo9(std::size_t value) {
142 return REPLICATE_BIT_TO_9_TABLE[value];
143}
144
/// Compile-time tables replicating 1 to 8 bit wide values to 8 bits. The 6, 7 and 8 bit tables
/// are also uploaded to GPU buffers by UtilShaders::MakeBuffers.
145static constexpr auto REPLICATE_1_BIT_TO_8_TABLE = MakeReplicateTable<u32, 1, 8>();
146static constexpr auto REPLICATE_2_BIT_TO_8_TABLE = MakeReplicateTable<u32, 2, 8>();
147static constexpr auto REPLICATE_3_BIT_TO_8_TABLE = MakeReplicateTable<u32, 3, 8>();
148static constexpr auto REPLICATE_4_BIT_TO_8_TABLE = MakeReplicateTable<u32, 4, 8>();
149static constexpr auto REPLICATE_5_BIT_TO_8_TABLE = MakeReplicateTable<u32, 5, 8>();
150static constexpr auto REPLICATE_6_BIT_TO_8_TABLE = MakeReplicateTable<u32, 6, 8>();
151static constexpr auto REPLICATE_7_BIT_TO_8_TABLE = MakeReplicateTable<u32, 7, 8>();
152static constexpr auto REPLICATE_8_BIT_TO_8_TABLE = MakeReplicateTable<u32, 8, 8>();
/// Replicates the num_bits wide value to 8 bits.
/// For num_bits in [1, 8] the result is read from the matching table, which value indexes
/// directly, so value must be below 2^num_bits. Any other num_bits is computed by Replicate.
153/// Use a precompiled table with the most common usages, if it's not in the expected range, fallback
154/// to the runtime implementation
155static constexpr u32 FastReplicateTo8(u32 value, u32 num_bits) {
156 switch (num_bits) {
157 case 1:
158 return REPLICATE_1_BIT_TO_8_TABLE[value];
159 case 2:
160 return REPLICATE_2_BIT_TO_8_TABLE[value];
161 case 3:
162 return REPLICATE_3_BIT_TO_8_TABLE[value];
163 case 4:
164 return REPLICATE_4_BIT_TO_8_TABLE[value];
165 case 5:
166 return REPLICATE_5_BIT_TO_8_TABLE[value];
167 case 6:
168 return REPLICATE_6_BIT_TO_8_TABLE[value];
169 case 7:
170 return REPLICATE_7_BIT_TO_8_TABLE[value];
171 case 8:
172 return REPLICATE_8_BIT_TO_8_TABLE[value];
173 default:
174 return Replicate(value, num_bits, 8);
175 }
176}
177
/// Compile-time tables replicating 1 to 5 bit wide values to 6 bits.
178static constexpr auto REPLICATE_1_BIT_TO_6_TABLE = MakeReplicateTable<u32, 1, 6>();
179static constexpr auto REPLICATE_2_BIT_TO_6_TABLE = MakeReplicateTable<u32, 2, 6>();
180static constexpr auto REPLICATE_3_BIT_TO_6_TABLE = MakeReplicateTable<u32, 3, 6>();
181static constexpr auto REPLICATE_4_BIT_TO_6_TABLE = MakeReplicateTable<u32, 4, 6>();
182static constexpr auto REPLICATE_5_BIT_TO_6_TABLE = MakeReplicateTable<u32, 5, 6>();
183
/// Replicates the num_bits wide value to 6 bits.
/// For num_bits in [1, 5] the result is read from the matching table, which value indexes
/// directly, so value must be below 2^num_bits. Any other num_bits is computed by Replicate.
184static constexpr u32 FastReplicateTo6(u32 value, u32 num_bits) {
185 switch (num_bits) {
186 case 1:
187 return REPLICATE_1_BIT_TO_6_TABLE[value];
188 case 2:
189 return REPLICATE_2_BIT_TO_6_TABLE[value];
190 case 3:
191 return REPLICATE_3_BIT_TO_6_TABLE[value];
192 case 4:
193 return REPLICATE_4_BIT_TO_6_TABLE[value];
194 case 5:
195 return REPLICATE_5_BIT_TO_6_TABLE[value];
196 default:
197 return Replicate(value, num_bits, 6);
198 }
199}
200
11void Decompress(std::span<const uint8_t> data, uint32_t width, uint32_t height, uint32_t depth, 201void Decompress(std::span<const uint8_t> data, uint32_t width, uint32_t height, uint32_t depth,
12 uint32_t block_width, uint32_t block_height, std::span<uint8_t> output); 202 uint32_t block_width, uint32_t block_height, std::span<uint8_t> output);
13 203