summaryrefslogtreecommitdiff
path: root/src/video_core/host_shaders
diff options
context:
space:
mode:
authorGravatar Rodrigo Locatti2021-03-30 19:31:52 -0300
committerGravatar GitHub2021-03-30 19:31:52 -0300
commit5ee669466fcebd2258229ed6bfe6b5e5529e0200 (patch)
tree6dbf84fb5c2c9656f1d1ef6c46b2527ea1a205ff /src/video_core/host_shaders
parentMerge pull request #6116 from german77/userArgument (diff)
parentastc_decoder: Refactor for style and more efficient memory use (diff)
downloadyuzu-5ee669466fcebd2258229ed6bfe6b5e5529e0200.tar.gz
yuzu-5ee669466fcebd2258229ed6bfe6b5e5529e0200.tar.xz
yuzu-5ee669466fcebd2258229ed6bfe6b5e5529e0200.zip
Merge pull request #5927 from ameerj/astc-compute
video_core: Accelerate ASTC texture decoding using compute shaders
Diffstat (limited to 'src/video_core/host_shaders')
-rw-r--r--src/video_core/host_shaders/CMakeLists.txt1
-rw-r--r--src/video_core/host_shaders/StringShaderHeader.cmake22
-rw-r--r--src/video_core/host_shaders/astc_decoder.comp1339
-rw-r--r--src/video_core/host_shaders/source_shader.h.in4
4 files changed, 1364 insertions, 2 deletions
diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt
index 3494318ca..2208e1922 100644
--- a/src/video_core/host_shaders/CMakeLists.txt
+++ b/src/video_core/host_shaders/CMakeLists.txt
@@ -1,4 +1,5 @@
1set(SHADER_FILES 1set(SHADER_FILES
2 astc_decoder.comp
2 block_linear_unswizzle_2d.comp 3 block_linear_unswizzle_2d.comp
3 block_linear_unswizzle_3d.comp 4 block_linear_unswizzle_3d.comp
4 convert_depth_to_float.frag 5 convert_depth_to_float.frag
diff --git a/src/video_core/host_shaders/StringShaderHeader.cmake b/src/video_core/host_shaders/StringShaderHeader.cmake
index c0fc49768..1b4bc6103 100644
--- a/src/video_core/host_shaders/StringShaderHeader.cmake
+++ b/src/video_core/host_shaders/StringShaderHeader.cmake
@@ -6,7 +6,27 @@ get_filename_component(CONTENTS_NAME ${SOURCE_FILE} NAME)
6string(REPLACE "." "_" CONTENTS_NAME ${CONTENTS_NAME}) 6string(REPLACE "." "_" CONTENTS_NAME ${CONTENTS_NAME})
7string(TOUPPER ${CONTENTS_NAME} CONTENTS_NAME) 7string(TOUPPER ${CONTENTS_NAME} CONTENTS_NAME)
8 8
9file(READ ${SOURCE_FILE} CONTENTS) 9FILE(READ ${SOURCE_FILE} line_contents)
10
11# Replace double quotes with single quotes,
12# as double quotes will be used to wrap the lines
13STRING(REGEX REPLACE "\"" "'" line_contents "${line_contents}")
14
15# CMake separates list elements with semicolons, but semicolons
16# are used extensively in the shader code.
17# Replace with a temporary marker, to be reverted later.
18STRING(REGEX REPLACE ";" "{{SEMICOLON}}" line_contents "${line_contents}")
19
20# Make every line an individual element in the CMake list.
21STRING(REGEX REPLACE "\n" ";" line_contents "${line_contents}")
22
23# Build the shader string, wrapping each line in double quotes.
24foreach(line IN LISTS line_contents)
25 string(CONCAT CONTENTS "${CONTENTS}" \"${line}\\n\"\n)
26endforeach()
27
28# Revert the original semicolons in the source.
29STRING(REGEX REPLACE "{{SEMICOLON}}" ";" CONTENTS "${CONTENTS}")
10 30
11get_filename_component(OUTPUT_DIR ${HEADER_FILE} DIRECTORY) 31get_filename_component(OUTPUT_DIR ${HEADER_FILE} DIRECTORY)
12make_directory(${OUTPUT_DIR}) 32make_directory(${OUTPUT_DIR})
diff --git a/src/video_core/host_shaders/astc_decoder.comp b/src/video_core/host_shaders/astc_decoder.comp
new file mode 100644
index 000000000..703e34587
--- /dev/null
+++ b/src/video_core/host_shaders/astc_decoder.comp
@@ -0,0 +1,1339 @@
1// Copyright 2021 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#version 450
6
// Backend-specific declaration helpers.
// With VULKAN defined, the uniforms declared between BEGIN_PUSH_CONSTANTS and
// END_PUSH_CONSTANTS are wrapped in a push_constant uniform block and UNIFORM(n)
// expands to nothing. Otherwise the wrappers are empty and UNIFORM(n) gives each
// uniform an explicit location. The BINDING_* indices also differ per backend.
7#ifdef VULKAN
8
9#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
10#define END_PUSH_CONSTANTS };
11#define UNIFORM(n)
12#define BINDING_INPUT_BUFFER 0
13#define BINDING_ENC_BUFFER 1
14#define BINDING_6_TO_8_BUFFER 2
15#define BINDING_7_TO_8_BUFFER 3
16#define BINDING_8_TO_8_BUFFER 4
17#define BINDING_BYTE_TO_16_BUFFER 5
18#define BINDING_SWIZZLE_BUFFER 6
19#define BINDING_OUTPUT_IMAGE 7
20
21#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv
22
23#define BEGIN_PUSH_CONSTANTS
24#define END_PUSH_CONSTANTS
25#define UNIFORM(n) layout(location = n) uniform
26#define BINDING_SWIZZLE_BUFFER 0
27#define BINDING_INPUT_BUFFER 1
28#define BINDING_ENC_BUFFER 2
29#define BINDING_6_TO_8_BUFFER 3
30#define BINDING_7_TO_8_BUFFER 4
31#define BINDING_8_TO_8_BUFFER 5
32#define BINDING_BYTE_TO_16_BUFFER 6
33#define BINDING_OUTPUT_IMAGE 0
34
35#endif
36
// Work group size: 32 x 32 x 1 invocations.
37layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in;
38
// Shader parameters, declared through the backend macros (a push-constant block
// on Vulkan, individually located uniforms otherwise).
39BEGIN_PUSH_CONSTANTS
// Dimensions of one block in texels; used as the loop bounds when a block's
// texels are written (FillError) and when weights are expanded to texels.
40UNIFORM(1) uvec2 block_dims;
41
// NOTE(review): the parameters below are not referenced in the visible part of
// the file; presumably they drive the block-linear address computation - confirm.
42UNIFORM(2) uint bytes_per_block_log2;
43UNIFORM(3) uint layer_stride;
44UNIFORM(4) uint block_size;
45UNIFORM(5) uint x_shift;
46UNIFORM(6) uint block_height;
47UNIFORM(7) uint block_height_mask;
48END_PUSH_CONSTANTS
49
// One value of an integer sequence, either as a table entry describing an
// encoding (encoding_values[]) or as a decoded value (result_vector / texel_vector).
50struct EncodingData {
// One of JUST_BITS, QUINT or TRIT.
51 uint encoding;
// Number of plain bits stored per value.
52 uint num_bits;
// The plain bits read from the stream for this value.
53 uint bit_value;
// The decoded quint or trit digit; not written by the JUST_BITS decode path.
54 uint quint_trit_value;
55};
56
// Block-mode information produced by DecodeBlockInfo().
57struct TexelWeightParams {
// Weight grid dimensions (x, y); size.x * size.y weights are stored per plane.
58 uvec2 size;
// Index into encoding_values[] describing how the weights are encoded
// (passed as encoding_index to GetBitLength by GetPackedBitSize).
59 uint max_weight;
// True when two weight planes are present.
60 bool dual_plane;
// True when the block mode is invalid or reserved.
61 bool error_state;
// Exactly one of these is set when the block is a void-extent block.
62 bool void_extent_ldr;
63 bool void_extent_hdr;
64};
65
66// Swizzle data
// Indexed by SwizzleOffset() as swizzle_table[y * 64 + x].
67layout(binding = BINDING_SWIZZLE_BUFFER, std430) readonly buffer SwizzleTable {
68 uint swizzle_table[];
69};
70
// Input data as packed 32-bit words; ReadTexel() extracts single bytes.
71layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU32 {
72 uint astc_data[];
73};
74
75// ASTC Encodings data
// Indexed by a maximum value ("range"); each entry describes the encoding used for it.
76layout(binding = BINDING_ENC_BUFFER, std430) readonly buffer EncodingsValues {
77 EncodingData encoding_values[];
78};
79// ASTC Precompiled tables
// Lookup tables used by FastReplicateTo8() for 6, 7 and 8 input bits, and by
// ReplicateByteTo16(). NOTE(review): contents are provided by the host - confirm.
80layout(binding = BINDING_6_TO_8_BUFFER, std430) readonly buffer REPLICATE_6_BIT_TO_8 {
81 uint REPLICATE_6_BIT_TO_8_TABLE[];
82};
83layout(binding = BINDING_7_TO_8_BUFFER, std430) readonly buffer REPLICATE_7_BIT_TO_8 {
84 uint REPLICATE_7_BIT_TO_8_TABLE[];
85};
86layout(binding = BINDING_8_TO_8_BUFFER, std430) readonly buffer REPLICATE_8_BIT_TO_8 {
87 uint REPLICATE_8_BIT_TO_8_TABLE[];
88};
89layout(binding = BINDING_BYTE_TO_16_BUFFER, std430) readonly buffer REPLICATE_BYTE_TO_16 {
90 uint REPLICATE_BYTE_TO_16_TABLE[];
91};
92
// Decoded output; written with imageStore().
93layout(binding = BINDING_OUTPUT_IMAGE, rgba8) uniform writeonly image2DArray dest_image;
94
// GOB dimensions, their product, and the matching log2 shift amounts.
95const uint GOB_SIZE_X = 64;
96const uint GOB_SIZE_Y = 8;
97const uint GOB_SIZE_Z = 1;
98const uint GOB_SIZE = GOB_SIZE_X * GOB_SIZE_Y * GOB_SIZE_Z;
99
100const uint GOB_SIZE_X_SHIFT = 6;
101const uint GOB_SIZE_Y_SHIFT = 3;
102const uint GOB_SIZE_Z_SHIFT = 0;
103const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_SHIFT;
104
// Mask that wraps a position into a single GOB (see SwizzleOffset).
105const uvec2 SWIZZLE_MASK = uvec2(GOB_SIZE_X - 1, GOB_SIZE_Y - 1);
106
107const int BLOCK_SIZE_IN_BYTES = 16;
108
// NOTE(review): the BLOCK_INFO_* values are not referenced in the visible part
// of the file - confirm whether they are still used.
109const int BLOCK_INFO_ERROR = 0;
110const int BLOCK_INFO_VOID_EXTENT_HDR = 1;
111const int BLOCK_INFO_VOID_EXTENT_LDR = 2;
112const int BLOCK_INFO_NORMAL = 3;
113
// Values stored in EncodingData.encoding.
114const int JUST_BITS = 0;
115const int QUINT = 1;
116const int TRIT = 2;
117
118// The following constants are expanded variants of the Replicate()
119// function calls corresponding to the following arguments:
120// value: index into the generated table
121// num_bits: the integer after "REPLICATE_" in the table name, e.g. 4 is num_bits in REPLICATE_4.
122// to_bit: the integer after "TO_"
123const uint REPLICATE_BIT_TO_7_TABLE[2] = uint[](0, 127);
124const uint REPLICATE_1_BIT_TO_9_TABLE[2] = uint[](0, 511);
125
126const uint REPLICATE_1_BIT_TO_8_TABLE[2] = uint[](0, 255);
127const uint REPLICATE_2_BIT_TO_8_TABLE[4] = uint[](0, 85, 170, 255);
128const uint REPLICATE_3_BIT_TO_8_TABLE[8] = uint[](0, 36, 73, 109, 146, 182, 219, 255);
129const uint REPLICATE_4_BIT_TO_8_TABLE[16] =
130 uint[](0, 17, 34, 51, 68, 85, 102, 119, 136, 153, 170, 187, 204, 221, 238, 255);
131const uint REPLICATE_5_BIT_TO_8_TABLE[32] =
132 uint[](0, 8, 16, 24, 33, 41, 49, 57, 66, 74, 82, 90, 99, 107, 115, 123, 132, 140, 148, 156, 165,
133 173, 181, 189, 198, 206, 214, 222, 231, 239, 247, 255);
134const uint REPLICATE_1_BIT_TO_6_TABLE[2] = uint[](0, 63);
135const uint REPLICATE_2_BIT_TO_6_TABLE[4] = uint[](0, 21, 42, 63);
136const uint REPLICATE_3_BIT_TO_6_TABLE[8] = uint[](0, 9, 18, 27, 36, 45, 54, 63);
137const uint REPLICATE_4_BIT_TO_6_TABLE[16] =
138 uint[](0, 4, 8, 12, 17, 21, 25, 29, 34, 38, 42, 46, 51, 55, 59, 63);
139const uint REPLICATE_5_BIT_TO_6_TABLE[32] =
140 uint[](0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 33, 35, 37, 39, 41, 43, 45,
141 47, 49, 51, 53, 55, 57, 59, 61, 63);
142
143// Input ASTC texture globals
// Bit cursor used by ReadBit(): local_buff is consumed 8 bits per element,
// current_index selects the element and bitsread the bit within it.
144uint current_index = 0;
145int bitsread = 0;
146uint total_bitsread = 0;
147uint local_buff[16];
148
149// Color data globals
// Bit cursor used by ReadColorBit() while texel_flag is false.
150uint color_endpoint_data[16];
151int color_bitsread = 0;
152uint total_color_bitsread = 0;
153int color_index = 0;
154
155// Four values, two endpoints, four maximum partitions
// Written by DecodeColorValues(), consumed in order by ComputeEndpoints().
156uint color_values[32];
157int colvals_index = 0;
158
159// Weight data globals
// Bit cursor used by ReadColorBit() while texel_flag is true.
160uint texel_weight_data[16];
161int texel_bitsread = 0;
162uint total_texel_bitsread = 0;
163int texel_index = 0;
164
// Selects the weight stream (true) or the color stream (false) in
// ReadColorBit() and ResultEmplaceBack().
165bool texel_flag = false;
166
167// Global "vectors" to be pushed into when decoding
168EncodingData result_vector[100];
169int result_index = 0;
170
171EncodingData texel_vector[100];
172int texel_vector_index = 0;
173
// Per-plane, per-texel weights produced by UnquantizeTexelWeights(),
// indexed as [plane][t * block_dims.x + s].
174uint unquantized_texel_weights[2][144];
175
// Returns the swizzle table entry for a position, after wrapping the position
// into a single GOB with SWIZZLE_MASK. The table is laid out 64 entries per row.
uint SwizzleOffset(uvec2 pos) {
    const uvec2 gob_pos = pos & SWIZZLE_MASK;
    const uint table_index = gob_pos.y * 64 + gob_pos.x;
    return swizzle_table[table_index];
}
180
// Reads the byte at byte offset `offset` from the input buffer, which is stored
// as packed 32-bit words.
uint ReadTexel(uint offset) {
    const uint packed_word = astc_data[offset / 4];
    // (offset * 8) & 24 is the bit position of the byte inside its word: 0, 8, 16 or 24.
    const int bit_offset = int((offset * 8) & 24);
    return bitfieldExtract(packed_word, bit_offset, 8);
}
185
// Replicates the low num_bits bits of val until the result is to_bit bits wide:
// the pattern is placed in the most significant bits and repeated downwards,
// the last copy being cut to its top bits if it does not fit entirely.
// Returns 0 when num_bits or to_bit is 0. When num_bits >= to_bit the masked
// value is returned unchanged.
uint Replicate(uint val, uint num_bits, uint to_bit) {
    if (num_bits == 0 || to_bit == 0) {
        return 0;
    }
    const uint pattern = val & uint((1 << num_bits) - 1);
    uint replicated = pattern;
    for (uint filled = num_bits; filled < to_bit; filled += num_bits) {
        const uint remaining = to_bit - filled;
        uint dropped_bits = 0;
        if (num_bits > remaining) {
            // Final partial copy: keep only the top `remaining` bits of the pattern.
            dropped_bits = num_bits - remaining;
            num_bits = remaining;
        }
        replicated = uint(replicated << num_bits) | (pattern >> dropped_bits);
    }
    return replicated;
}
208
// Maps each component of value through REPLICATE_BYTE_TO_16_TABLE.
uvec4 ReplicateByteTo16(uvec4 value) {
    uvec4 widened;
    for (int channel = 0; channel < 4; ++channel) {
        widened[channel] = REPLICATE_BYTE_TO_16_TABLE[value[channel]];
    }
    return widened;
}
213
// Expands a single bit (0 or 1) to 7 bits via table lookup: 0 -> 0, 1 -> 127.
uint ReplicateBitTo7(uint value) {
    const uint replicated = REPLICATE_BIT_TO_7_TABLE[value];
    return replicated;
}
217
// Expands a single bit (0 or 1) to 9 bits via table lookup: 0 -> 0, 1 -> 511.
uint ReplicateBitTo9(uint value) {
    const uint replicated = REPLICATE_1_BIT_TO_9_TABLE[value];
    return replicated;
}
221
// Replicates a num_bits-wide value to 8 bits. Widths 1..8 use the precomputed
// tables; any other width falls back to the generic Replicate().
uint FastReplicateTo8(uint value, uint num_bits) {
    if (num_bits == 1) {
        return REPLICATE_1_BIT_TO_8_TABLE[value];
    } else if (num_bits == 2) {
        return REPLICATE_2_BIT_TO_8_TABLE[value];
    } else if (num_bits == 3) {
        return REPLICATE_3_BIT_TO_8_TABLE[value];
    } else if (num_bits == 4) {
        return REPLICATE_4_BIT_TO_8_TABLE[value];
    } else if (num_bits == 5) {
        return REPLICATE_5_BIT_TO_8_TABLE[value];
    } else if (num_bits == 6) {
        return REPLICATE_6_BIT_TO_8_TABLE[value];
    } else if (num_bits == 7) {
        return REPLICATE_7_BIT_TO_8_TABLE[value];
    } else if (num_bits == 8) {
        return REPLICATE_8_BIT_TO_8_TABLE[value];
    }
    return Replicate(value, num_bits, 8);
}
243
// Replicates a num_bits-wide value to 6 bits. Widths 1..5 use the precomputed
// tables; any other width falls back to the generic Replicate().
uint FastReplicateTo6(uint value, uint num_bits) {
    if (num_bits == 1) {
        return REPLICATE_1_BIT_TO_6_TABLE[value];
    } else if (num_bits == 2) {
        return REPLICATE_2_BIT_TO_6_TABLE[value];
    } else if (num_bits == 3) {
        return REPLICATE_3_BIT_TO_6_TABLE[value];
    } else if (num_bits == 4) {
        return REPLICATE_4_BIT_TO_6_TABLE[value];
    } else if (num_bits == 5) {
        return REPLICATE_5_BIT_TO_6_TABLE[value];
    }
    return Replicate(value, num_bits, 6);
}
259
// floor(v / 3) computed as a 16.16 fixed-point reciprocal multiply.
// 0x5556 is ceil(2^16 / 3); the result is exact for v < 32768.
uint Div3Floor(uint v) {
    const uint scaled = v * 0x5556;
    return scaled >> 16;
}
263
// ceil(v / 3), expressed as floor((v + 2) / 3).
uint Div3Ceil(uint v) {
    const uint biased = v + 2;
    return Div3Floor(biased);
}
267
// floor(v / 5) computed as a 16.16 fixed-point reciprocal multiply.
// 0x3334 is ceil(2^16 / 5); the result is exact for v < 16384.
uint Div5Floor(uint v) {
    const uint scaled = v * 0x3334;
    return scaled >> 16;
}
271
// ceil(v / 5), expressed as floor((v + 4) / 5).
uint Div5Ceil(uint v) {
    const uint biased = v + 4;
    return Div5Floor(biased);
}
275
// 32-bit integer mixing function used to derive the partition seeds in
// SelectPartition(). The order of the steps matters; arithmetic wraps at 32 bits.
uint Hash52(uint p) {
    p = p ^ (p >> 15);
    p = p - (p << 17);
    p = p + (p << 7);
    p = p + (p << 4);
    p = p ^ (p >> 5);
    p = p + (p << 16);
    p = p ^ (p >> 7);
    p = p ^ (p >> 3);
    p = p ^ (p << 6);
    p = p ^ (p >> 17);
    return p;
}
289
// Returns the partition index (0..3) that the texel at (x, y, z) belongs to.
//   seed            - partition seed read from the block
//   partition_count - number of partitions; 1 always yields partition 0
//   small_block     - when true the coordinates are doubled before hashing
uint SelectPartition(uint seed, uint x, uint y, uint z, uint partition_count, bool small_block) {
    if (partition_count == 1) {
        return 0;
    }
    if (small_block) {
        x <<= 1;
        y <<= 1;
        z <<= 1;
    }

    seed += (partition_count - 1) * 1024;
    const uint rnum = Hash52(uint(seed));

    // Twelve 4-bit seeds: the first eight are the consecutive nibbles of rnum,
    // the last four are taken at bit offsets 18, 22, 26 and 30 (wrapping around).
    uint seeds[12];
    for (int i = 0; i < 8; ++i) {
        seeds[i] = (rnum >> (4 * i)) & 0xF;
    }
    seeds[8] = (rnum >> 18) & 0xF;
    seeds[9] = (rnum >> 22) & 0xF;
    seeds[10] = (rnum >> 26) & 0xF;
    seeds[11] = ((rnum >> 30) | (rnum << 2)) & 0xF;

    for (int i = 0; i < 12; ++i) {
        seeds[i] = seeds[i] * seeds[i];
    }

    // Shift amounts depend on the low seed bits and on the partition count.
    const bool swap_shifts = (seed & 1) > 0;
    const int seed_shift = (seed & 2) > 0 ? 4 : 5;
    const int count_shift = (partition_count == 3) ? 6 : 5;
    const int sh1 = swap_shifts ? seed_shift : count_shift;
    const int sh2 = swap_shifts ? count_shift : seed_shift;
    const int sh3 = (seed & 0x10) > 0 ? sh1 : sh2;

    for (int i = 0; i < 8; i += 2) {
        seeds[i] = seeds[i] >> sh1;
        seeds[i + 1] = seeds[i + 1] >> sh2;
    }
    for (int i = 8; i < 12; ++i) {
        seeds[i] = seeds[i] >> sh3;
    }

    // One 6-bit score per candidate partition; the highest score wins.
    uvec4 score = uvec4(seeds[0] * x + seeds[1] * y + seeds[10] * z + (rnum >> 14),
                        seeds[2] * x + seeds[3] * y + seeds[11] * z + (rnum >> 10),
                        seeds[4] * x + seeds[5] * y + seeds[8] * z + (rnum >> 6),
                        seeds[6] * x + seeds[7] * y + seeds[9] * z + (rnum >> 2));
    score &= 0x3Fu;

    // Partitions that do not exist cannot win.
    if (partition_count < 4) {
        score.w = 0;
    }
    if (partition_count < 3) {
        score.z = 0;
    }

    // Ties resolve towards the lower partition index.
    if (score.x >= score.y && score.x >= score.z && score.x >= score.w) {
        return 0;
    }
    if (score.y >= score.z && score.y >= score.w) {
        return 1;
    }
    if (score.z >= score.w) {
        return 2;
    }
    return 3;
}
379
// 2D convenience wrapper around SelectPartition() with z fixed to 0.
uint Select2DPartition(uint seed, uint x, uint y, uint partition_count, bool small_block) {
    const uint z = 0;
    return SelectPartition(seed, x, y, z, partition_count, small_block);
}
383
// Reads the next bit from local_buff and advances the global bit cursor.
// Each element of local_buff contributes 8 bits, least significant first.
// Returns 0 without advancing once the buffer is exhausted.
uint ReadBit() {
    if (current_index >= local_buff.length()) {
        return 0;
    }
    const uint bit = (local_buff[current_index] >> bitsread) & 1u;
    bitsread++;
    total_bitsread++;
    if (bitsread == 8) {
        // Move on to the next element.
        bitsread = 0;
        current_index++;
    }
    return bit;
}
397
// Reads num_bits bits with ReadBit() and packs them least-significant-bit first.
uint StreamBits(uint num_bits) {
    uint value = 0;
    uint shift = 0;
    while (shift < num_bits) {
        value |= (ReadBit() & 1) << shift;
        ++shift;
    }
    return value;
}
405
// Reads the next bit from the weight stream (texel_flag set) or from the color
// endpoint stream (texel_flag clear) and advances that stream's cursor.
// Each array element contributes 8 bits, least significant first.
uint ReadColorBit() {
    if (texel_flag) {
        const uint weight_bit =
            bitfieldExtract(texel_weight_data[texel_index], texel_bitsread, 1);
        texel_bitsread++;
        total_texel_bitsread++;
        if (texel_bitsread == 8) {
            texel_bitsread = 0;
            texel_index++;
        }
        return weight_bit;
    }

    const uint color_bit = bitfieldExtract(color_endpoint_data[color_index], color_bitsread, 1);
    color_bitsread++;
    total_color_bitsread++;
    if (color_bitsread == 8) {
        color_bitsread = 0;
        color_index++;
    }
    return color_bit;
}
427
// Reads num_bits bits with ReadColorBit() and packs them least-significant-bit first.
uint StreamColorBits(uint num_bits) {
    uint value = 0;
    uint shift = 0;
    while (shift < num_bits) {
        value |= (ReadColorBit() & 1) << shift;
        ++shift;
    }
    return value;
}
435
// Appends a decoded value to texel_vector (texel_flag set) or to result_vector
// (texel_flag clear) and bumps the matching element count.
void ResultEmplaceBack(EncodingData val) {
    if (texel_flag) {
        texel_vector[texel_vector_index++] = val;
        return;
    }
    result_vector[result_index++] = val;
}
445
// Returns the number of bits required to encode n_vals values with the encoding
// stored at encoding_values[encoding_index]: the plain bits of every value plus
// the shared trit bits (8 per 5 values) or quint bits (7 per 3 values), rounded up.
uint GetBitLength(uint n_vals, uint encoding_index) {
    const EncodingData enc = encoding_values[encoding_index];
    uint shared_bits = 0;
    if (enc.encoding == TRIT) {
        shared_bits = Div5Ceil(n_vals * 8);
    } else if (enc.encoding == QUINT) {
        shared_bits = Div3Ceil(n_vals * 7);
    }
    return enc.num_bits * n_vals + shared_bits;
}
456
// Number of weights stored for a weight grid of the given size; doubled when
// two planes are present.
uint GetNumWeightValues(uvec2 size, bool dual_plane) {
    const uint plane_count = dual_plane ? 2 : 1;
    return size.x * size.y * plane_count;
}
464
// Number of bits the weight data occupies for the given grid size, plane count
// and weight encoding index.
uint GetPackedBitSize(uvec2 size, bool dual_plane, uint max_weight) {
    return GetBitLength(GetNumWeightValues(size, dual_plane), max_weight);
}
469
// Returns bit number pos of bits (0 or 1).
uint BitsBracket(uint bits, uint pos) {
    return bitfieldExtract(bits, int(pos), 1);
}
473
// Returns the bit field of bits spanning positions start..end inclusive,
// shifted down to bit 0. The two positions may be given in either order.
uint BitsOp(uint bits, uint start, uint end) {
    if (start == end) {
        return BitsBracket(bits, start);
    }
    const uint low = min(start, end);
    const uint high = max(start, end);
    const uint mask = (1 << (high - low + 1)) - 1;
    return (bits >> low) & mask;
}
486
// Decodes one quint block: three values, each with num_bits plain bits, plus
// 7 shared bits that encode the three quint digits. The plain bits and the
// shared bits are interleaved in the stream. Pushes three QUINT entries.
void DecodeQuintBlock(uint num_bits) {
    uint bit_values[3];
    uint quints[3];

    bit_values[0] = StreamColorBits(num_bits);
    uint packed = StreamColorBits(3);
    bit_values[1] = StreamColorBits(num_bits);
    packed |= StreamColorBits(2) << 3;
    bit_values[2] = StreamColorBits(num_bits);
    packed |= StreamColorBits(2) << 5;

    const uint packed_1_2 = BitsOp(packed, 1, 2);
    const uint packed_5_6 = BitsOp(packed, 5, 6);
    if (packed_1_2 == 3 && packed_5_6 == 0) {
        const uint packed_0 = BitsBracket(packed, 0);
        quints[0] = 4;
        quints[1] = 4;
        quints[2] = (packed_0 << 2) | ((BitsBracket(packed, 4) & ~packed_0) << 1) |
                    (BitsBracket(packed, 3) & ~packed_0);
    } else {
        uint C = 0;
        if (packed_1_2 == 3) {
            quints[2] = 4;
            C = (BitsOp(packed, 3, 4) << 3) | ((~packed_5_6 & 3) << 1) | BitsBracket(packed, 0);
        } else {
            quints[2] = packed_5_6;
            C = BitsOp(packed, 0, 4);
        }
        const uint c_low = BitsOp(C, 0, 2);
        const uint c_high = BitsOp(C, 3, 4);
        if (c_low == 5) {
            quints[1] = 4;
            quints[0] = c_high;
        } else {
            quints[1] = c_high;
            quints[0] = c_low;
        }
    }

    for (uint i = 0; i < 3; i++) {
        ResultEmplaceBack(EncodingData(uint(QUINT), num_bits, bit_values[i], quints[i]));
    }
}
528
// Decodes one trit block: five values, each with num_bits plain bits, plus
// 8 shared bits that encode the five trit digits. The plain bits and the
// shared bits are interleaved in the stream. Pushes five TRIT entries.
void DecodeTritBlock(uint num_bits) {
    uint bit_values[5];
    uint trits[5];

    bit_values[0] = StreamColorBits(num_bits);
    uint packed = StreamColorBits(2);
    bit_values[1] = StreamColorBits(num_bits);
    packed |= StreamColorBits(2) << 2;
    bit_values[2] = StreamColorBits(num_bits);
    packed |= StreamColorBits(1) << 4;
    bit_values[3] = StreamColorBits(num_bits);
    packed |= StreamColorBits(2) << 5;
    bit_values[4] = StreamColorBits(num_bits);
    packed |= StreamColorBits(1) << 7;

    // Upper two trits.
    uint C = 0;
    if (BitsOp(packed, 2, 4) == 7) {
        C = (BitsOp(packed, 5, 7) << 2) | BitsOp(packed, 0, 1);
        trits[4] = 2;
        trits[3] = 2;
    } else {
        C = BitsOp(packed, 0, 4);
        const uint packed_5_6 = BitsOp(packed, 5, 6);
        const uint packed_7 = BitsBracket(packed, 7);
        if (packed_5_6 == 3) {
            trits[4] = 2;
            trits[3] = packed_7;
        } else {
            trits[4] = packed_7;
            trits[3] = packed_5_6;
        }
    }

    // Lower three trits.
    const uint c_0_1 = BitsOp(C, 0, 1);
    const uint c_2_3 = BitsOp(C, 2, 3);
    const uint c_4 = BitsBracket(C, 4);
    if (c_0_1 == 3) {
        trits[2] = 2;
        trits[1] = c_4;
        trits[0] = (BitsBracket(C, 3) << 1) | (BitsBracket(C, 2) & ~BitsBracket(C, 3));
    } else if (c_2_3 == 3) {
        trits[2] = 2;
        trits[1] = 2;
        trits[0] = c_0_1;
    } else {
        trits[2] = c_4;
        trits[1] = c_2_3;
        trits[0] = (BitsBracket(C, 1) << 1) | (BitsBracket(C, 0) & ~BitsBracket(C, 1));
    }

    for (uint i = 0; i < 5; i++) {
        ResultEmplaceBack(EncodingData(uint(TRIT), num_bits, bit_values[i], trits[i]));
    }
}
580
// Decodes at least num_values values using the encoding stored at
// encoding_values[max_range]. Quints are decoded three at a time and trits five
// at a time, so more than num_values entries may be pushed.
void DecodeIntegerSequence(uint max_range, uint num_values) {
    EncodingData val = encoding_values[max_range];
    for (uint decoded = 0; decoded < num_values;) {
        if (val.encoding == QUINT) {
            DecodeQuintBlock(val.num_bits);
            decoded += 3;
        } else if (val.encoding == TRIT) {
            DecodeTritBlock(val.num_bits);
            decoded += 5;
        } else if (val.encoding == JUST_BITS) {
            val.bit_value = StreamColorBits(val.num_bits);
            ResultEmplaceBack(val);
            decoded++;
        }
    }
}
602
// Decodes and unquantizes the color endpoint values of a block into color_values[].
//   modes           - color endpoint mode of each partition
//   num_partitions  - number of partitions (entries of modes that are used)
//   color_data_bits - number of bits available for the color endpoint data
// Each mode contributes ((mode >> 2) + 1) * 2 values. The values are read from
// the color stream with the largest encoding that fits into color_data_bits and
// then expanded to 8 bits each.
void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits) {
    uint num_values = 0;
    for (uint i = 0; i < num_partitions; i++) {
        num_values += ((modes[i] >> 2) + 1) << 1;
    }

    // Walk the ranges downwards until the encoded size fits.
    int range = 256;
    while (--range > 0) {
        EncodingData val = encoding_values[range];
        uint bit_length = GetBitLength(num_values, range);
        if (bit_length <= color_data_bits) {
            // Find the smallest range that still uses the same encoding. An entry
            // stops matching as soon as EITHER field differs.
            while (--range > 0) {
                EncodingData newval = encoding_values[range];
                if (newval.encoding != val.encoding || newval.num_bits != val.num_bits) {
                    break;
                }
            }
            // Step back to the last matching range.
            ++range;
            break;
        }
    }

    DecodeIntegerSequence(range, num_values);

    // Unquantize every decoded value to 8 bits. For trits and quints the result
    // is built from A (bit 0 replicated to 9 bits), B (the remaining plain bits
    // spread over 9 bits), C (a per-width constant) and D (the trit/quint digit).
    uint out_index = 0;
    for (int itr = 0; itr < result_index; ++itr) {
        if (out_index >= num_values) {
            break;
        }
        EncodingData val = result_vector[itr];
        uint bitlen = val.num_bits;
        uint bitval = val.bit_value;
        uint A = 0, B = 0, C = 0, D = 0;
        A = ReplicateBitTo9((bitval & 1));
        switch (val.encoding) {
        case JUST_BITS:
            color_values[out_index++] = FastReplicateTo8(bitval, bitlen);
            break;
        case TRIT: {
            D = val.quint_trit_value;
            switch (bitlen) {
            case 1:
                C = 204;
                break;
            case 2: {
                C = 93;
                uint b = (bitval >> 1) & 1;
                B = (b << 8) | (b << 4) | (b << 2) | (b << 1);
                break;
            }
            case 3: {
                C = 44;
                uint cb = (bitval >> 1) & 3;
                B = (cb << 7) | (cb << 2) | cb;
                break;
            }
            case 4: {
                C = 22;
                uint dcb = (bitval >> 1) & 7;
                B = (dcb << 6) | dcb;
                break;
            }
            case 5: {
                C = 11;
                uint edcb = (bitval >> 1) & 0xF;
                B = (edcb << 5) | (edcb >> 2);
                break;
            }
            case 6: {
                C = 5;
                uint fedcb = (bitval >> 1) & 0x1F;
                B = (fedcb << 4) | (fedcb >> 4);
                break;
            }
            }
            break;
        }
        case QUINT: {
            D = val.quint_trit_value;
            switch (bitlen) {
            case 1:
                C = 113;
                break;
            case 2: {
                C = 54;
                uint b = (bitval >> 1) & 1;
                B = (b << 8) | (b << 3) | (b << 2);
                break;
            }
            case 3: {
                C = 26;
                uint cb = (bitval >> 1) & 3;
                B = (cb << 7) | (cb << 1) | (cb >> 1);
                break;
            }
            case 4: {
                C = 13;
                uint dcb = (bitval >> 1) & 7;
                B = (dcb << 6) | (dcb >> 1);
                break;
            }
            case 5: {
                C = 6;
                uint edcb = (bitval >> 1) & 0xF;
                B = (edcb << 5) | (edcb >> 3);
                break;
            }
            }
            break;
        }
        }
        if (val.encoding != JUST_BITS) {
            uint T = (D * C) + B;
            T ^= A;
            T = (A & 0x80) | (T >> 2);
            color_values[out_index++] = T;
        }
    }
}
719
// Moves bit 7 of a into bit 7 of (b >> 1) and turns the remaining bits of a
// into a signed 6-bit value in the range -32..31.
// Returns ivec2(new a, new b).
ivec2 BitTransferSigned(int a, int b) {
    const int new_b = (b >> 1) | (a & 0x80);
    int new_a = (a >> 1) & 0x3F;
    if ((new_a & 0x20) > 0) {
        // Sign-extend from 6 bits.
        new_a -= 0x40;
    }
    return ivec2(new_a, new_b);
}
731
// Clamps every component of color to the range 0..255.
uvec4 ClampByte(ivec4 color) {
    return uvec4(clamp(color, ivec4(0), ivec4(255)));
}
738
// Averages red and green with blue, leaving alpha and blue untouched.
// Returns the color as (a, r', g', b).
ivec4 BlueContract(int a, int r, int g, int b) {
    const int contracted_r = (r + b) >> 1;
    const int contracted_g = (g + b) >> 1;
    return ivec4(a, contracted_r, contracted_g, b);
}
742
// Builds the two color endpoints of a partition from the next values in
// color_values[], advancing colvals_index by the number of values consumed.
//   ep1, ep2            - resulting endpoints, component order (A, R, G, B)
//   color_endpoint_mode - endpoint mode of the partition
// Modes handled: 0, 1 (luminance), 4, 5 (luminance + alpha), 6, 8, 9 (RGB) and
// 10, 12, 13 (RGBA). Any other mode yields the fallback color in the default
// case and consumes no values.
void ComputeEndpoints(out uvec4 ep1, out uvec4 ep2, uint color_endpoint_mode) {
#define READ_UINT_VALUES(N)                                                                        \
    uint v[N];                                                                                     \
    for (uint i = 0; i < N; i++) {                                                                 \
        v[i] = color_values[colvals_index++];                                                      \
    }

#define READ_INT_VALUES(N)                                                                         \
    int v[N];                                                                                      \
    for (uint i = 0; i < N; i++) {                                                                 \
        v[i] = int(color_values[colvals_index++]);                                                 \
    }

    switch (color_endpoint_mode) {
    case 0: {
        // Luminance, direct.
        READ_UINT_VALUES(2)
        ep1 = uvec4(0xFF, v[0], v[0], v[0]);
        ep2 = uvec4(0xFF, v[1], v[1], v[1]);
        break;
    }
    case 1: {
        // Luminance, base + offset. The second endpoint saturates at 0xFF, so
        // the sum has to be limited with min().
        READ_UINT_VALUES(2)
        uint L0 = (v[0] >> 2) | (v[1] & 0xC0);
        uint L1 = min(L0 + (v[1] & 0x3F), 0xFFU);
        ep1 = uvec4(0xFF, L0, L0, L0);
        ep2 = uvec4(0xFF, L1, L1, L1);
        break;
    }
    case 4: {
        // Luminance + alpha, direct.
        READ_UINT_VALUES(4)
        ep1 = uvec4(v[2], v[0], v[0], v[0]);
        ep2 = uvec4(v[3], v[1], v[1], v[1]);
        break;
    }
    case 5: {
        // Luminance + alpha, base + offset.
        READ_INT_VALUES(4)
        ivec2 transferred = BitTransferSigned(v[1], v[0]);
        v[1] = transferred.x;
        v[0] = transferred.y;
        transferred = BitTransferSigned(v[3], v[2]);
        v[3] = transferred.x;
        v[2] = transferred.y;
        ep1 = ClampByte(ivec4(v[2], v[0], v[0], v[0]));
        ep2 = ClampByte(ivec4(v[2] + v[3], v[0] + v[1], v[0] + v[1], v[0] + v[1]));
        break;
    }
    case 6: {
        // RGB, base + scale.
        READ_UINT_VALUES(4)
        ep1 = uvec4(0xFF, (v[0] * v[3]) >> 8, (v[1] * v[3]) >> 8, (v[2] * v[3]) >> 8);
        ep2 = uvec4(0xFF, v[0], v[1], v[2]);
        break;
    }
    case 8: {
        // RGB, direct. The endpoints are swapped and blue-contracted when the
        // second one is darker than the first.
        READ_UINT_VALUES(6)
        if ((v[1] + v[3] + v[5]) >= (v[0] + v[2] + v[4])) {
            ep1 = uvec4(0xFF, v[0], v[2], v[4]);
            ep2 = uvec4(0xFF, v[1], v[3], v[5]);
        } else {
            ep1 = uvec4(BlueContract(0xFF, int(v[1]), int(v[3]), int(v[5])));
            ep2 = uvec4(BlueContract(0xFF, int(v[0]), int(v[2]), int(v[4])));
        }
        break;
    }
    case 9: {
        // RGB, base + offset.
        READ_INT_VALUES(6)
        ivec2 transferred = BitTransferSigned(v[1], v[0]);
        v[1] = transferred.x;
        v[0] = transferred.y;
        transferred = BitTransferSigned(v[3], v[2]);
        v[3] = transferred.x;
        v[2] = transferred.y;
        transferred = BitTransferSigned(v[5], v[4]);
        v[5] = transferred.x;
        v[4] = transferred.y;
        if ((v[1] + v[3] + v[5]) >= 0) {
            ep1 = ClampByte(ivec4(0xFF, v[0], v[2], v[4]));
            ep2 = ClampByte(ivec4(0xFF, v[0] + v[1], v[2] + v[3], v[4] + v[5]));
        } else {
            ep1 = ClampByte(BlueContract(0xFF, v[0] + v[1], v[2] + v[3], v[4] + v[5]));
            ep2 = ClampByte(BlueContract(0xFF, v[0], v[2], v[4]));
        }
        break;
    }
    case 10: {
        // RGB base + scale, plus two alpha values.
        READ_UINT_VALUES(6)
        ep1 = uvec4(v[4], (v[0] * v[3]) >> 8, (v[1] * v[3]) >> 8, (v[2] * v[3]) >> 8);
        ep2 = uvec4(v[5], v[0], v[1], v[2]);
        break;
    }
    case 12: {
        // RGBA, direct.
        READ_UINT_VALUES(8)
        if ((v[1] + v[3] + v[5]) >= (v[0] + v[2] + v[4])) {
            ep1 = uvec4(v[6], v[0], v[2], v[4]);
            ep2 = uvec4(v[7], v[1], v[3], v[5]);
        } else {
            ep1 = uvec4(BlueContract(int(v[7]), int(v[1]), int(v[3]), int(v[5])));
            ep2 = uvec4(BlueContract(int(v[6]), int(v[0]), int(v[2]), int(v[4])));
        }
        break;
    }
    case 13: {
        // RGBA, base + offset.
        READ_INT_VALUES(8)
        ivec2 transferred = BitTransferSigned(v[1], v[0]);
        v[1] = transferred.x;
        v[0] = transferred.y;
        transferred = BitTransferSigned(v[3], v[2]);
        v[3] = transferred.x;
        v[2] = transferred.y;

        transferred = BitTransferSigned(v[5], v[4]);
        v[5] = transferred.x;
        v[4] = transferred.y;

        transferred = BitTransferSigned(v[7], v[6]);
        v[7] = transferred.x;
        v[6] = transferred.y;

        if ((v[1] + v[3] + v[5]) >= 0) {
            ep1 = ClampByte(ivec4(v[6], v[0], v[2], v[4]));
            ep2 = ClampByte(ivec4(v[7] + v[6], v[0] + v[1], v[2] + v[3], v[4] + v[5]));
        } else {
            ep1 = ClampByte(BlueContract(v[6] + v[7], v[0] + v[1], v[2] + v[3], v[4] + v[5]));
            ep2 = ClampByte(BlueContract(v[6], v[0], v[2], v[4]));
        }
        break;
    }
    default: {
        // HDR mode, or more likely a bug computing the color_endpoint_mode
        ep1 = uvec4(0xFF, 0xFF, 0, 0);
        ep2 = uvec4(0xFF, 0xFF, 0, 0);
        break;
    }
    }
#undef READ_UINT_VALUES
#undef READ_INT_VALUES
}
879
// Expands one decoded weight to the range 0..64.
// Plain-bit values are replicated to 6 bits. Trit and quint values without
// plain bits use a small lookup table; those with plain bits are built from
// A (bit 0 replicated to 7 bits), B (the remaining plain bits spread out),
// C (a per-width constant) and D (the trit/quint digit). Results above 32 are
// bumped by one so that the maximum becomes 64.
uint UnquantizeTexelWeight(EncodingData val) {
    const uint bits = val.bit_value;
    const uint width = val.num_bits;
    const uint D = val.quint_trit_value;
    const uint upper_bits = bits >> 1;
    uint B = 0;
    uint C = 0;
    uint weight = 0;

    if (val.encoding == JUST_BITS) {
        weight = FastReplicateTo6(bits, width);
    } else if (val.encoding == TRIT) {
        if (width == 0) {
            const uint trit_weights[3] = uint[](0, 32, 63);
            weight = trit_weights[D];
        } else if (width == 1) {
            C = 50;
        } else if (width == 2) {
            C = 23;
            const uint b = upper_bits & 1;
            B = (b << 6) | (b << 2) | b;
        } else if (width == 3) {
            C = 11;
            const uint cb = upper_bits & 3;
            B = (cb << 5) | cb;
        }
    } else if (val.encoding == QUINT) {
        if (width == 0) {
            const uint quint_weights[5] = uint[](0, 16, 32, 47, 63);
            weight = quint_weights[D];
        } else if (width == 1) {
            C = 28;
        } else if (width == 2) {
            C = 13;
            const uint b = upper_bits & 1;
            B = (b << 6) | (b << 1);
        }
    }

    if (val.encoding != JUST_BITS && width > 0) {
        const uint A = ReplicateBitTo7(bits & 1);
        weight = D * C + B;
        weight ^= A;
        weight = (A & 0x20) | (weight >> 2);
    }
    if (weight > 32) {
        weight += 1;
    }
    return weight;
}
951
// Unquantizes the decoded weights in texel_vector and interpolates the weight
// grid up to one weight per texel of the block, writing the result to
// unquantized_texel_weights[plane][t * block_dims.x + s].
//   dual_plane - when true, texel_vector holds interleaved weights for two planes
//   size       - dimensions of the weight grid
void UnquantizeTexelWeights(bool dual_plane, uvec2 size) {
    uint weight_idx = 0;
    uint unquantized[2][144];
    uint area = size.x * size.y;
    for (uint itr = 0; itr < texel_vector_index; itr++) {
        unquantized[0][weight_idx] = UnquantizeTexelWeight(texel_vector[itr]);
        if (dual_plane) {
            ++itr;
            // Check the bound before reading: an odd number of decoded entries
            // must not make us read past the last one.
            if (itr >= texel_vector_index) {
                unquantized[1][weight_idx] = 0;
                break;
            }
            unquantized[1][weight_idx] = UnquantizeTexelWeight(texel_vector[itr]);
        }
        if (++weight_idx >= (area)) {
            break;
        }
    }

    // Scale factors that map a texel position inside the block to 0..1024.
    const uint Ds = uint((block_dims.x * 0.5f + 1024) / (block_dims.x - 1));
    const uint Dt = uint((block_dims.y * 0.5f + 1024) / (block_dims.y - 1));
    const uint k_plane_scale = dual_plane ? 2 : 1;
    for (uint plane = 0; plane < k_plane_scale; plane++) {
        for (uint t = 0; t < block_dims.y; t++) {
            for (uint s = 0; s < block_dims.x; s++) {
                // Position in the weight grid with 4 fractional bits.
                uint cs = Ds * s;
                uint ct = Dt * t;
                uint gs = (cs * (size.x - 1) + 32) >> 6;
                uint gt = (ct * (size.y - 1) + 32) >> 6;
                uint js = gs >> 4;
                uint fs = gs & 0xF;
                uint jt = gt >> 4;
                uint ft = gt & 0x0F;

                // Bilinear factors of the four surrounding grid weights; they sum to 16.
                uint w11 = (fs * ft + 8) >> 4;
                uint w10 = ft - w11;
                uint w01 = fs - w11;
                uint w00 = 16 - fs - ft + w11;
                uvec4 w = uvec4(w00, w01, w10, w11);
                uint v0 = jt * size.x + js;

                // Neighbours outside the grid contribute 0.
                uvec4 p = uvec4(0);
                if (v0 < area) {
                    p.x = unquantized[plane][v0];
                }
                if ((v0 + 1) < (area)) {
                    p.y = unquantized[plane][v0 + 1];
                }
                if ((v0 + size.x) < (area)) {
                    p.z = unquantized[plane][(v0 + size.x)];
                }
                if ((v0 + size.x + 1) < (area)) {
                    p.w = unquantized[plane][(v0 + size.x + 1)];
                }
                unquantized_texel_weights[plane][t * block_dims.x + s] = (uint(dot(p, w)) + 8) >> 4;
            }
        }
    }
}
1008
// Classifies the block mode bits into one of ten layouts (0..9).
// Layouts 0..4 are selected when either of the two low bits is set and are
// distinguished by bits 2, 3 and 8; layouts 5..9 are distinguished by bits 5, 7 and 8.
int FindLayout(uint mode) {
    const bool bit2 = (mode & 4) != 0;
    const bool bit3 = (mode & 8) != 0;
    const bool bit5 = (mode & 0x20) != 0;
    const bool bit7 = (mode & 0x80) != 0;
    const bool bit8 = (mode & 0x100) != 0;

    if ((mode & 3) != 0) {
        if (!bit3) {
            return bit2 ? 1 : 0;
        }
        if (!bit2) {
            return 2;
        }
        return bit8 ? 4 : 3;
    }
    if (!bit8) {
        return bit7 ? 6 : 5;
    }
    if (!bit7) {
        return 9;
    }
    return bit5 ? 8 : 7;
}
1039
// Reads the 11 block mode bits from the input stream and decodes them into the
// weight grid size, the weight encoding index and the dual-plane flag.
// Void-extent blocks set void_extent_ldr or void_extent_hdr (and read one more
// bit when bit 10 is set); reserved modes set error_state. In both cases the
// function returns early with the remaining fields left at their defaults.
// block_index is not used.
TexelWeightParams DecodeBlockInfo(uint block_index) {
    TexelWeightParams params = TexelWeightParams(uvec2(0), 0, false, false, false, false);
    const uint mode = StreamBits(11);

    // Void-extent block.
    if ((mode & 0x1ff) == 0x1fc) {
        const bool is_hdr = (mode & 0x200) != 0;
        params.void_extent_hdr = is_hdr;
        params.void_extent_ldr = !is_hdr;
        if ((mode & 0x400) == 0 || StreamBits(1) == 0) {
            params.error_state = true;
        }
        return params;
    }

    // Reserved modes.
    const bool low_bits_clear = (mode & 0xf) == 0;
    const bool reserved_pattern = (mode & 3) == 0 && (mode & 0x1c0) == 0x1c0;
    if (low_bits_clear || reserved_pattern) {
        params.error_state = true;
        return params;
    }

    const uint mode_layout = FindLayout(mode);
    const uint a = (mode >> 5) & 0x3;
    const uint b_two_bits = (mode >> 7) & 0x3;
    const uint b_one_bit = (mode >> 7) & 0x1;
    switch (mode_layout) {
    case 0:
        params.size = uvec2(b_two_bits + 4, a + 2);
        break;
    case 1:
        params.size = uvec2(b_two_bits + 8, a + 2);
        break;
    case 2:
        params.size = uvec2(a + 2, b_two_bits + 8);
        break;
    case 3:
        params.size = uvec2(a + 2, b_one_bit + 6);
        break;
    case 4:
        params.size = uvec2(b_one_bit + 2, a + 2);
        break;
    case 5:
        params.size = uvec2(12, a + 2);
        break;
    case 6:
        params.size = uvec2(a + 2, 12);
        break;
    case 7:
        params.size = uvec2(6, 10);
        break;
    case 8:
        params.size = uvec2(10, 6);
        break;
    case 9:
        // Layout 9 takes its second dimension from bits 9..10.
        params.size = uvec2(a + 6, ((mode >> 9) & 0x3) + 6);
        break;
    default:
        params.error_state = true;
        break;
    }

    // Layout 9 uses bits 9 and 10 for the grid size, so it never has a second
    // plane and never selects the high-precision table.
    const bool is_layout_9 = mode_layout == 9;
    params.dual_plane = !is_layout_9 && ((mode & 0x400) != 0);

    uint weight_index = (mode & 0x10) != 0 ? 1 : 0;
    weight_index |= (mode_layout < 5) ? ((mode & 0x3) << 1) : ((mode & 0xc) >> 1);
    weight_index -= 2;

    const int low_precision_max[6] = int[6](1, 2, 3, 4, 5, 7);
    const int high_precision_max[6] = int[6](9, 11, 15, 19, 23, 31);
    const bool high_precision = !is_layout_9 && ((mode & 0x200) != 0);
    if (high_precision) {
        params.max_weight = high_precision_max[weight_index];
    } else {
        params.max_weight = low_precision_max[weight_index];
    }
    return params;
}
1130
// Stores the fixed value vec4(1.0, 1.0, 0.0, 1.0) into every texel of the
// block_dims.x * block_dims.y block of dest_image that starts at `coord`.
void FillError(ivec3 coord) {
    const vec4 error_value = vec4(1.0, 1.0, 0.0, 1.0);
    for (uint row = 0; row < block_dims.y; ++row) {
        for (uint col = 0; col < block_dims.x; ++col) {
            imageStore(dest_image, coord + ivec3(col, row, 0), error_value);
        }
    }
}
1138
// Fills the block at `coord` with the single color of a void-extent block.
//
// Discards the next 52 stream bits, then reads four 16-bit values in the
// order r, g, b, a, scales each by 1 / 65535 and stores the resulting color
// into every texel of the block_dims.x * block_dims.y block of dest_image.
void FillVoidExtentLDR(ivec3 coord) {
    // Result intentionally ignored: these bits are skipped, not decoded.
    StreamBits(52);

    const uint red = StreamBits(16);
    const uint green = StreamBits(16);
    const uint blue = StreamBits(16);
    const uint alpha = StreamBits(16);
    const vec4 color = vec4(float(red), float(green), float(blue), float(alpha)) / 65535.0f;

    for (uint row = 0; row < block_dims.y; ++row) {
        for (uint col = 0; col < block_dims.x; ++col) {
            imageStore(dest_image, coord + ivec3(col, row, 0), color);
        }
    }
}
1155
// Decodes the 128-bit block held in local_buff and stores its
// block_dims.x * block_dims.y texels into dest_image starting at `coord`.
//
// coord:       coordinate of the block's first texel in dest_image.
// block_index: forwarded unchanged to DecodeBlockInfo.
//
// Every block that cannot be decoded is filled through FillError, so each
// path through this function stores to the destination image. That covers:
// error state, HDR void-extent, a weight grid larger than the block, four
// partitions combined with dual-plane, and a bit budget that does not fit
// into 128 bits.
void DecompressBlock(ivec3 coord, uint block_index) {
    TexelWeightParams params = DecodeBlockInfo(block_index);
    if (params.error_state || params.void_extent_hdr) {
        FillError(coord);
        return;
    }
    if (params.void_extent_ldr) {
        FillVoidExtentLDR(coord);
        return;
    }
    if ((params.size.x > block_dims.x) || (params.size.y > block_dims.y)) {
        FillError(coord);
        return;
    }
    uint num_partitions = StreamBits(2) + 1;
    if (num_partitions > 4 || (num_partitions == 4 && params.dual_plane)) {
        FillError(coord);
        return;
    }
    int plane_index = -1;
    uint partition_index = 1;
    uvec4 color_endpoint_mode = uvec4(0);
    uint ced_pointer = 0;
    uint base_cem = 0;
    if (num_partitions == 1) {
        color_endpoint_mode.x = StreamBits(4);
        partition_index = 0;
    } else {
        partition_index = StreamBits(10);
        base_cem = StreamBits(6);
    }
    uint base_mode = base_cem & 3;

    // Bit budget: whatever is not weights, already-read header, extra CEM bits
    // or plane selector is color endpoint data.
    uint weight_bits = GetPackedBitSize(params.size, params.dual_plane, params.max_weight);
    uint remaining_bits = 128 - weight_bits - total_bitsread;
    uint extra_cem_bits = 0;
    if (base_mode > 0) {
        switch (num_partitions) {
        case 2:
            extra_cem_bits += 2;
            break;
        case 3:
            extra_cem_bits += 5;
            break;
        case 4:
            extra_cem_bits += 8;
            break;
        default:
            // A non-zero base mode is only produced by the multi-partition path.
            // Treat anything else as a malformed block instead of leaving the
            // texels unwritten.
            FillError(coord);
            return;
        }
    }
    remaining_bits -= extra_cem_bits;
    uint plane_selector_bits = 0;
    if (params.dual_plane) {
        plane_selector_bits = 2;
    }
    remaining_bits -= plane_selector_bits;
    if (remaining_bits > 128) {
        // remaining_bits is unsigned, so a value above 128 means one of the
        // subtractions above wrapped around: the block claims more bits than
        // the 128 bits (16 bytes) it contains.
        FillError(coord);
        return;
    }

    // Read the color endpoint data, at most 8 bits per array element.
    uint color_data_bits = remaining_bits;
    while (remaining_bits > 0) {
        int nb = int(min(remaining_bits, 8U));
        uint b = StreamBits(nb);
        color_endpoint_data[ced_pointer] = uint(bitfieldExtract(b, 0, nb));
        ++ced_pointer;
        remaining_bits -= nb;
    }

    // The plane selector is read before the extra CEM bits.
    plane_index = int(StreamBits(plane_selector_bits));
    if (base_mode > 0) {
        uint extra_cem = StreamBits(extra_cem_bits);
        uint cem = (extra_cem << 6) | base_cem;
        cem >>= 2;
        // One class bit per partition, followed by two mode bits per partition.
        uvec4 C = uvec4(0);
        for (uint i = 0; i < num_partitions; i++) {
            C[i] = (cem & 1);
            cem >>= 1;
        }
        uvec4 M = uvec4(0);
        for (uint i = 0; i < num_partitions; i++) {
            M[i] = cem & 3;
            cem >>= 2;
        }
        for (uint i = 0; i < num_partitions; i++) {
            color_endpoint_mode[i] = base_mode;
            if (C[i] == 0) {
                --color_endpoint_mode[i];
            }
            color_endpoint_mode[i] <<= 2;
            color_endpoint_mode[i] |= M[i];
        }
    } else if (num_partitions > 1) {
        // All partitions share the same endpoint mode.
        uint cem = base_cem >> 2;
        for (uint i = 0; i < num_partitions; i++) {
            color_endpoint_mode[i] = cem;
        }
    }
    DecodeColorValues(color_endpoint_mode, num_partitions, color_data_bits);

    uvec4 endpoints[4][2];
    for (uint i = 0; i < num_partitions; i++) {
        ComputeEndpoints(endpoints[i][0], endpoints[i][1], color_endpoint_mode[i]);
    }

    // The weight decoder below works on a copy of the block in which both the
    // byte order and the bit order inside each byte are reversed.
    for (uint i = 0; i < 16; i++) {
        texel_weight_data[i] = local_buff[i];
    }
#define REVERSE_BYTE(x)                                                                            \
    ((((x) * 0x0802U & 0x22110U) | ((x) * 0x8020U & 0x88440U)) * 0x10101U >> 16)
    for (uint i = 0; i < 8; i++) {
        uint front_reversed = REVERSE_BYTE(texel_weight_data[i]);
        uint back_reversed = REVERSE_BYTE(texel_weight_data[15 - i]);
        // Only the low 8 bits of the multiply/mask result are the reversed byte.
        texel_weight_data[i] = uint(bitfieldExtract(back_reversed, 0, 8));
        texel_weight_data[15 - i] = uint(bitfieldExtract(front_reversed, 0, 8));
    }
#undef REVERSE_BYTE

    // Keep only the first weight_bits bits: mask the partially used byte and
    // zero every byte after it.
    uint clear_byte_start = (weight_bits >> 3) + 1;
    texel_weight_data[clear_byte_start - 1] =
        texel_weight_data[clear_byte_start - 1] & uint((1 << (weight_bits % 8)) - 1);
    for (uint i = 0; i < 16 - clear_byte_start; i++) {
        texel_weight_data[clear_byte_start + i] = 0U;
    }
    texel_flag = true; // use texel "vector" and bit stream in integer decoding
    DecodeIntegerSequence(params.max_weight, GetNumWeightValues(params.size, params.dual_plane));

    UnquantizeTexelWeights(params.dual_plane, params.size);

    // Both values are the same for every texel of the block, so compute them once.
    bool small_block = (block_dims.y * block_dims.x) < 32;
    // plane_vec[c] == 1 marks the single component that takes its weight from the
    // second plane in dual-plane mode.
    uvec4 plane_vec = uvec4(0);
    for (uint c = 0; c < 4; c++) {
        if (params.dual_plane && (((plane_index + 1) & 3) == c)) {
            plane_vec[c] = 1;
        }
    }

    for (uint j = 0; j < block_dims.y; j++) {
        for (uint i = 0; i < block_dims.x; i++) {
            uint local_partition =
                Select2DPartition(partition_index, i, j, num_partitions, small_block);
            uvec4 C0 = ReplicateByteTo16(endpoints[local_partition][0]);
            uvec4 C1 = ReplicateByteTo16(endpoints[local_partition][1]);
            uvec4 weight_vec = uvec4(0);
            for (uint c = 0; c < 4; c++) {
                weight_vec[c] = unquantized_texel_weights[plane_vec[c]][j * block_dims.x + i];
            }
            // Interpolate between the endpoints with a 0..64 weight, rounding to nearest.
            vec4 Cf = vec4((C0 * (uvec4(64) - weight_vec) + C1 * weight_vec + uvec4(32)) / 64);
            vec4 p = (Cf / 65535.0);
            // NOTE(review): the .gbar swizzle suggests the endpoints are held
            // alpha-first; confirm against ComputeEndpoints.
            imageStore(dest_image, coord + ivec3(i, j, 0), p.gbar);
        }
    }
}
1312
// Entry point: one invocation decodes one compressed block.
//
// Computes the offset of this invocation's block in the input buffer, copies
// 16 values from it into local_buff through ReadTexel, resets the bit-stream
// cursor and hands the block to DecompressBlock.
void main() {
    // The x coordinate is scaled by 2^bytes_per_block_log2; y and z are used as-is.
    const uvec3 pos = uvec3(gl_GlobalInvocationID.x << bytes_per_block_log2,
                            gl_GlobalInvocationID.yz);

    // Read as soon as possible due to its latency
    const uint swizzle = SwizzleOffset(pos.xy);

    const uint block_y = pos.y >> GOB_SIZE_Y_SHIFT;
    const uint layer_offset = pos.z * layer_stride;
    const uint block_row_offset = (block_y >> block_height) * block_size;
    const uint gob_row_offset = (block_y & block_height_mask) << GOB_SIZE_SHIFT;
    const uint gob_column_offset = (pos.x >> GOB_SIZE_X_SHIFT) << x_shift;
    const uint offset =
        layer_offset + block_row_offset + gob_row_offset + gob_column_offset + swizzle;

    // Destination texel of the block's first texel: invocation id scaled by the block size.
    const ivec3 coord = ivec3(gl_GlobalInvocationID * uvec3(block_dims, 1));
    const uint block_index =
        pos.z * gl_WorkGroupSize.x * gl_WorkGroupSize.y + pos.y * gl_WorkGroupSize.x + pos.x;

    // Start bit-stream reads at the beginning of the freshly loaded block.
    current_index = 0;
    bitsread = 0;
    for (uint i = 0; i < 16; ++i) {
        local_buff[i] = ReadTexel(offset + i);
    }
    DecompressBlock(coord, block_index);
}
diff --git a/src/video_core/host_shaders/source_shader.h.in b/src/video_core/host_shaders/source_shader.h.in
index ccdb0d2a9..929dec39b 100644
--- a/src/video_core/host_shaders/source_shader.h.in
+++ b/src/video_core/host_shaders/source_shader.h.in
@@ -4,6 +4,8 @@
4 4
5namespace HostShaders { 5namespace HostShaders {
6 6
7constexpr std::string_view @CONTENTS_NAME@ = R"(@CONTENTS@)"; 7constexpr std::string_view @CONTENTS_NAME@ = {
8@CONTENTS@
9};
8 10
9} // namespace HostShaders 11} // namespace HostShaders