diff options
| author | 2021-03-30 19:31:52 -0300 | |
|---|---|---|
| committer | 2021-03-30 19:31:52 -0300 | |
| commit | 5ee669466fcebd2258229ed6bfe6b5e5529e0200 (patch) | |
| tree | 6dbf84fb5c2c9656f1d1ef6c46b2527ea1a205ff /src/video_core/host_shaders | |
| parent | Merge pull request #6116 from german77/userArgument (diff) | |
| parent | astc_decoder: Refactor for style and more efficient memory use (diff) | |
| download | yuzu-5ee669466fcebd2258229ed6bfe6b5e5529e0200.tar.gz yuzu-5ee669466fcebd2258229ed6bfe6b5e5529e0200.tar.xz yuzu-5ee669466fcebd2258229ed6bfe6b5e5529e0200.zip | |
Merge pull request #5927 from ameerj/astc-compute
video_core: Accelerate ASTC texture decoding using compute shaders
Diffstat (limited to 'src/video_core/host_shaders')
| -rw-r--r-- | src/video_core/host_shaders/CMakeLists.txt | 1 | ||||
| -rw-r--r-- | src/video_core/host_shaders/StringShaderHeader.cmake | 22 | ||||
| -rw-r--r-- | src/video_core/host_shaders/astc_decoder.comp | 1339 | ||||
| -rw-r--r-- | src/video_core/host_shaders/source_shader.h.in | 4 |
4 files changed, 1364 insertions, 2 deletions
diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt index 3494318ca..2208e1922 100644 --- a/src/video_core/host_shaders/CMakeLists.txt +++ b/src/video_core/host_shaders/CMakeLists.txt | |||
| @@ -1,4 +1,5 @@ | |||
| 1 | set(SHADER_FILES | 1 | set(SHADER_FILES |
| 2 | astc_decoder.comp | ||
| 2 | block_linear_unswizzle_2d.comp | 3 | block_linear_unswizzle_2d.comp |
| 3 | block_linear_unswizzle_3d.comp | 4 | block_linear_unswizzle_3d.comp |
| 4 | convert_depth_to_float.frag | 5 | convert_depth_to_float.frag |
diff --git a/src/video_core/host_shaders/StringShaderHeader.cmake b/src/video_core/host_shaders/StringShaderHeader.cmake index c0fc49768..1b4bc6103 100644 --- a/src/video_core/host_shaders/StringShaderHeader.cmake +++ b/src/video_core/host_shaders/StringShaderHeader.cmake | |||
| @@ -6,7 +6,27 @@ get_filename_component(CONTENTS_NAME ${SOURCE_FILE} NAME) | |||
| 6 | string(REPLACE "." "_" CONTENTS_NAME ${CONTENTS_NAME}) | 6 | string(REPLACE "." "_" CONTENTS_NAME ${CONTENTS_NAME}) |
| 7 | string(TOUPPER ${CONTENTS_NAME} CONTENTS_NAME) | 7 | string(TOUPPER ${CONTENTS_NAME} CONTENTS_NAME) |
| 8 | 8 | ||
| 9 | file(READ ${SOURCE_FILE} CONTENTS) | 9 | FILE(READ ${SOURCE_FILE} line_contents) |
| 10 | |||
| 11 | # Replace double quotes with single quotes, | ||
| 12 | # as double quotes will be used to wrap the lines | ||
| 13 | STRING(REGEX REPLACE "\"" "'" line_contents "${line_contents}") | ||
| 14 | |||
| 15 | # CMake separates list elements with semicolons, but semicolons | ||
| 16 | # are used extensively in the shader code. | ||
| 17 | # Replace with a temporary marker, to be reverted later. | ||
| 18 | STRING(REGEX REPLACE ";" "{{SEMICOLON}}" line_contents "${line_contents}") | ||
| 19 | |||
| 20 | # Make every line an individual element in the CMake list. | ||
| 21 | STRING(REGEX REPLACE "\n" ";" line_contents "${line_contents}") | ||
| 22 | |||
| 23 | # Build the shader string, wrapping each line in double quotes. | ||
| 24 | foreach(line IN LISTS line_contents) | ||
| 25 | string(CONCAT CONTENTS "${CONTENTS}" \"${line}\\n\"\n) | ||
| 26 | endforeach() | ||
| 27 | |||
| 28 | # Revert the original semicolons in the source. | ||
| 29 | STRING(REGEX REPLACE "{{SEMICOLON}}" ";" CONTENTS "${CONTENTS}") | ||
| 10 | 30 | ||
| 11 | get_filename_component(OUTPUT_DIR ${HEADER_FILE} DIRECTORY) | 31 | get_filename_component(OUTPUT_DIR ${HEADER_FILE} DIRECTORY) |
| 12 | make_directory(${OUTPUT_DIR}) | 32 | make_directory(${OUTPUT_DIR}) |
diff --git a/src/video_core/host_shaders/astc_decoder.comp b/src/video_core/host_shaders/astc_decoder.comp new file mode 100644 index 000000000..703e34587 --- /dev/null +++ b/src/video_core/host_shaders/astc_decoder.comp | |||
| @@ -0,0 +1,1339 @@ | |||
| 1 | // Copyright 2021 yuzu Emulator Project | ||
| 2 | // Licensed under GPLv2 or any later version | ||
| 3 | // Refer to the license.txt file included. | ||
| 4 | |||
| 5 | #version 450 | ||
| 6 | |||
| 7 | #ifdef VULKAN | ||
| 8 | |||
| 9 | #define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants { | ||
| 10 | #define END_PUSH_CONSTANTS }; | ||
| 11 | #define UNIFORM(n) | ||
| 12 | #define BINDING_INPUT_BUFFER 0 | ||
| 13 | #define BINDING_ENC_BUFFER 1 | ||
| 14 | #define BINDING_6_TO_8_BUFFER 2 | ||
| 15 | #define BINDING_7_TO_8_BUFFER 3 | ||
| 16 | #define BINDING_8_TO_8_BUFFER 4 | ||
| 17 | #define BINDING_BYTE_TO_16_BUFFER 5 | ||
| 18 | #define BINDING_SWIZZLE_BUFFER 6 | ||
| 19 | #define BINDING_OUTPUT_IMAGE 7 | ||
| 20 | |||
| 21 | #else // ^^^ Vulkan ^^^ // vvv OpenGL vvv | ||
| 22 | |||
| 23 | #define BEGIN_PUSH_CONSTANTS | ||
| 24 | #define END_PUSH_CONSTANTS | ||
| 25 | #define UNIFORM(n) layout(location = n) uniform | ||
| 26 | #define BINDING_SWIZZLE_BUFFER 0 | ||
| 27 | #define BINDING_INPUT_BUFFER 1 | ||
| 28 | #define BINDING_ENC_BUFFER 2 | ||
| 29 | #define BINDING_6_TO_8_BUFFER 3 | ||
| 30 | #define BINDING_7_TO_8_BUFFER 4 | ||
| 31 | #define BINDING_8_TO_8_BUFFER 5 | ||
| 32 | #define BINDING_BYTE_TO_16_BUFFER 6 | ||
| 33 | #define BINDING_OUTPUT_IMAGE 0 | ||
| 34 | |||
| 35 | #endif | ||
| 36 | |||
| 37 | layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in; | ||
| 38 | |||
| 39 | BEGIN_PUSH_CONSTANTS | ||
| 40 | UNIFORM(1) uvec2 block_dims; | ||
| 41 | |||
| 42 | UNIFORM(2) uint bytes_per_block_log2; | ||
| 43 | UNIFORM(3) uint layer_stride; | ||
| 44 | UNIFORM(4) uint block_size; | ||
| 45 | UNIFORM(5) uint x_shift; | ||
| 46 | UNIFORM(6) uint block_height; | ||
| 47 | UNIFORM(7) uint block_height_mask; | ||
| 48 | END_PUSH_CONSTANTS | ||
| 49 | |||
| | // Describes one integer-sequence encoding, and doubles as the record for one decoded value. | ||
| 50 | struct EncodingData { | ||
| | // Encoding kind; compared against the JUST_BITS, QUINT and TRIT constants. | ||
| 51 | uint encoding; | ||
| | // Number of plain bits stored per value. | ||
| 52 | uint num_bits; | ||
| | // The plain bits read from the stream for a decoded value. | ||
| 53 | uint bit_value; | ||
| | // The quint or trit digit of a decoded value (set by the quint/trit block decoders). | ||
| 54 | uint quint_trit_value; | ||
| 55 | }; | ||
| 56 | |||
| | // Texel weight parameters of a block. | ||
| | // NOTE(review): no use of this struct is visible in this part of the file; the field notes | ||
| | // below are inferred from the names only - confirm against the code that fills it in. | ||
| 57 | struct TexelWeightParams { | ||
| | // presumably the weight grid dimensions - TODO confirm | ||
| 58 | uvec2 size; | ||
| | // presumably the maximum weight value / encoding range index - TODO confirm | ||
| 59 | uint max_weight; | ||
| 60 | bool dual_plane; | ||
| 61 | bool error_state; | ||
| 62 | bool void_extent_ldr; | ||
| 63 | bool void_extent_hdr; | ||
| 64 | }; | ||
| 65 | |||
| 66 | // Swizzle data | ||
| 67 | layout(binding = BINDING_SWIZZLE_BUFFER, std430) readonly buffer SwizzleTable { | ||
| 68 | uint swizzle_table[]; | ||
| 69 | }; | ||
| 70 | |||
| 71 | layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU32 { | ||
| 72 | uint astc_data[]; | ||
| 73 | }; | ||
| 74 | |||
| 75 | // ASTC Encodings data | ||
| 76 | layout(binding = BINDING_ENC_BUFFER, std430) readonly buffer EncodingsValues { | ||
| 77 | EncodingData encoding_values[]; | ||
| 78 | }; | ||
| 79 | // ASTC Precompiled tables | ||
| 80 | layout(binding = BINDING_6_TO_8_BUFFER, std430) readonly buffer REPLICATE_6_BIT_TO_8 { | ||
| 81 | uint REPLICATE_6_BIT_TO_8_TABLE[]; | ||
| 82 | }; | ||
| 83 | layout(binding = BINDING_7_TO_8_BUFFER, std430) readonly buffer REPLICATE_7_BIT_TO_8 { | ||
| 84 | uint REPLICATE_7_BIT_TO_8_TABLE[]; | ||
| 85 | }; | ||
| 86 | layout(binding = BINDING_8_TO_8_BUFFER, std430) readonly buffer REPLICATE_8_BIT_TO_8 { | ||
| 87 | uint REPLICATE_8_BIT_TO_8_TABLE[]; | ||
| 88 | }; | ||
| 89 | layout(binding = BINDING_BYTE_TO_16_BUFFER, std430) readonly buffer REPLICATE_BYTE_TO_16 { | ||
| 90 | uint REPLICATE_BYTE_TO_16_TABLE[]; | ||
| 91 | }; | ||
| 92 | |||
| 93 | layout(binding = BINDING_OUTPUT_IMAGE, rgba8) uniform writeonly image2DArray dest_image; | ||
| 94 | |||
| 95 | const uint GOB_SIZE_X = 64; | ||
| 96 | const uint GOB_SIZE_Y = 8; | ||
| 97 | const uint GOB_SIZE_Z = 1; | ||
| 98 | const uint GOB_SIZE = GOB_SIZE_X * GOB_SIZE_Y * GOB_SIZE_Z; | ||
| 99 | |||
| 100 | const uint GOB_SIZE_X_SHIFT = 6; | ||
| 101 | const uint GOB_SIZE_Y_SHIFT = 3; | ||
| 102 | const uint GOB_SIZE_Z_SHIFT = 0; | ||
| 103 | const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_SHIFT; | ||
| 104 | |||
| 105 | const uvec2 SWIZZLE_MASK = uvec2(GOB_SIZE_X - 1, GOB_SIZE_Y - 1); | ||
| 106 | |||
| 107 | const int BLOCK_SIZE_IN_BYTES = 16; | ||
| 108 | |||
| 109 | const int BLOCK_INFO_ERROR = 0; | ||
| 110 | const int BLOCK_INFO_VOID_EXTENT_HDR = 1; | ||
| 111 | const int BLOCK_INFO_VOID_EXTENT_LDR = 2; | ||
| 112 | const int BLOCK_INFO_NORMAL = 3; | ||
| 113 | |||
| 114 | const int JUST_BITS = 0; | ||
| 115 | const int QUINT = 1; | ||
| 116 | const int TRIT = 2; | ||
| 117 | |||
| 118 | // The following constants are expanded variants of the Replicate() | ||
| 119 | // function calls corresponding to the following arguments: | ||
| 120 | // value: index into the generated table | ||
| 121 | // num_bits: the integer after "REPLICATE_" in the table name, e.g. 4 is num_bits in REPLICATE_4. | ||
| 122 | // to_bit: the integer after "TO_" | ||
| 123 | const uint REPLICATE_BIT_TO_7_TABLE[2] = uint[](0, 127); | ||
| 124 | const uint REPLICATE_1_BIT_TO_9_TABLE[2] = uint[](0, 511); | ||
| 125 | |||
| 126 | const uint REPLICATE_1_BIT_TO_8_TABLE[2] = uint[](0, 255); | ||
| 127 | const uint REPLICATE_2_BIT_TO_8_TABLE[4] = uint[](0, 85, 170, 255); | ||
| 128 | const uint REPLICATE_3_BIT_TO_8_TABLE[8] = uint[](0, 36, 73, 109, 146, 182, 219, 255); | ||
| 129 | const uint REPLICATE_4_BIT_TO_8_TABLE[16] = | ||
| 130 | uint[](0, 17, 34, 51, 68, 85, 102, 119, 136, 153, 170, 187, 204, 221, 238, 255); | ||
| 131 | const uint REPLICATE_5_BIT_TO_8_TABLE[32] = | ||
| 132 | uint[](0, 8, 16, 24, 33, 41, 49, 57, 66, 74, 82, 90, 99, 107, 115, 123, 132, 140, 148, 156, 165, | ||
| 133 | 173, 181, 189, 198, 206, 214, 222, 231, 239, 247, 255); | ||
| 134 | const uint REPLICATE_1_BIT_TO_6_TABLE[2] = uint[](0, 63); | ||
| 135 | const uint REPLICATE_2_BIT_TO_6_TABLE[4] = uint[](0, 21, 42, 63); | ||
| 136 | const uint REPLICATE_3_BIT_TO_6_TABLE[8] = uint[](0, 9, 18, 27, 36, 45, 54, 63); | ||
| 137 | const uint REPLICATE_4_BIT_TO_6_TABLE[16] = | ||
| 138 | uint[](0, 4, 8, 12, 17, 21, 25, 29, 34, 38, 42, 46, 51, 55, 59, 63); | ||
| 139 | const uint REPLICATE_5_BIT_TO_6_TABLE[32] = | ||
| 140 | uint[](0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 33, 35, 37, 39, 41, 43, 45, | ||
| 141 | 47, 49, 51, 53, 55, 57, 59, 61, 63); | ||
| 142 | |||
| 143 | // Input ASTC texture globals | ||
| 144 | uint current_index = 0; | ||
| 145 | int bitsread = 0; | ||
| 146 | uint total_bitsread = 0; | ||
| 147 | uint local_buff[16]; | ||
| 148 | |||
| 149 | // Color data globals | ||
| 150 | uint color_endpoint_data[16]; | ||
| 151 | int color_bitsread = 0; | ||
| 152 | uint total_color_bitsread = 0; | ||
| 153 | int color_index = 0; | ||
| 154 | |||
| 155 | // Four values, two endpoints, four maximum partitions | ||
| 156 | uint color_values[32]; | ||
| 157 | int colvals_index = 0; | ||
| 158 | |||
| 159 | // Weight data globals | ||
| 160 | uint texel_weight_data[16]; | ||
| 161 | int texel_bitsread = 0; | ||
| 162 | uint total_texel_bitsread = 0; | ||
| 163 | int texel_index = 0; | ||
| 164 | |||
| 165 | bool texel_flag = false; | ||
| 166 | |||
| 167 | // Global "vectors" to be pushed into when decoding | ||
| 168 | EncodingData result_vector[100]; | ||
| 169 | int result_index = 0; | ||
| 170 | |||
| 171 | EncodingData texel_vector[100]; | ||
| 172 | int texel_vector_index = 0; | ||
| 173 | |||
| 174 | uint unquantized_texel_weights[2][144]; | ||
| 175 | |||
| | // Returns the swizzle table entry for pos; only the bits of pos kept by SWIZZLE_MASK are used, | ||
| | // and the table is indexed row-major with a row length of 64. | ||
| 176 | uint SwizzleOffset(uvec2 pos) { | ||
| 177 |     const uvec2 masked_pos = pos & SWIZZLE_MASK; | ||
| 178 |     return swizzle_table[masked_pos.y * 64 + masked_pos.x]; | ||
| 179 | } | ||
| 180 | |||
| | // Returns byte number `offset` of the packed input: picks the 32-bit word astc_data[offset / 4] | ||
| | // and extracts the 8-bit field that starts at bit 8 * (offset % 4). | ||
| 181 | uint ReadTexel(uint offset) { | ||
| 182 |     const int bit_offset = int((offset & 3) * 8); | ||
| 183 |     return bitfieldExtract(astc_data[offset >> 2], bit_offset, 8); | ||
| 184 | } | ||
| 185 | |||
| 186 | // Expands the low num_bits bits of val to a to_bit wide value by repeating the bit pattern | ||
| 187 | // from the most significant end downwards; the last repetition is truncated to its top bits | ||
| | // when to_bit is not a multiple of num_bits. Returns 0 if num_bits or to_bit is 0. | ||
| 188 | uint Replicate(uint val, uint num_bits, uint to_bit) { | ||
| 189 |     if (num_bits == 0 || to_bit == 0) { | ||
| 190 |         return 0; | ||
| 191 |     } | ||
| 192 |     const uint pattern = val & uint((1 << num_bits) - 1); | ||
| 193 |     uint result = pattern; | ||
| 194 |     uint step = num_bits; | ||
| 195 |     for (uint filled = num_bits; filled < to_bit; filled += step) { | ||
| 196 |         const uint room = to_bit - filled; | ||
| 197 |         uint dropped = 0; | ||
| 198 |         if (step > room) { | ||
| 199 |             // Final, partial repetition: keep only the top `room` bits of the pattern. | ||
| 200 |             dropped = step - room; | ||
| 201 |             step = room; | ||
| 202 |         } | ||
| 203 |         result = (result << step) | (pattern >> dropped); | ||
| 204 |     } | ||
| 205 |     return result; | ||
| 206 | } | ||
| 208 | |||
| | // Maps each of the four components of value through REPLICATE_BYTE_TO_16_TABLE. | ||
| 209 | uvec4 ReplicateByteTo16(uvec4 value) { | ||
| 210 |     const uint x16 = REPLICATE_BYTE_TO_16_TABLE[value.x]; | ||
| 211 |     const uint y16 = REPLICATE_BYTE_TO_16_TABLE[value.y]; | ||
| 212 |     const uint z16 = REPLICATE_BYTE_TO_16_TABLE[value.z]; | ||
| |     const uint w16 = REPLICATE_BYTE_TO_16_TABLE[value.w]; | ||
| |     return uvec4(x16, y16, z16, w16); | ||
| | } | ||
| 213 | |||
| | // Looks value up in the two-entry table {0, 127}. | ||
| 214 | uint ReplicateBitTo7(uint value) { | ||
| 215 |     const uint replicated = REPLICATE_BIT_TO_7_TABLE[value]; | ||
| 216 |     return replicated; | ||
| | } | ||
| 217 | |||
| | // Looks value up in the two-entry table {0, 511}. | ||
| 218 | uint ReplicateBitTo9(uint value) { | ||
| 219 |     const uint replicated = REPLICATE_1_BIT_TO_9_TABLE[value]; | ||
| 220 |     return replicated; | ||
| | } | ||
| 221 | |||
| | // Table-driven replication of a num_bits wide value to 8 bits. | ||
| | // Widths 1..8 use the lookup tables; any other width falls back to Replicate(). | ||
| 222 | uint FastReplicateTo8(uint value, uint num_bits) { | ||
| 223 |     if (num_bits == 1) { | ||
| 224 |         return REPLICATE_1_BIT_TO_8_TABLE[value]; | ||
| 225 |     } else if (num_bits == 2) { | ||
| 226 |         return REPLICATE_2_BIT_TO_8_TABLE[value]; | ||
| 227 |     } else if (num_bits == 3) { | ||
| 228 |         return REPLICATE_3_BIT_TO_8_TABLE[value]; | ||
| 229 |     } else if (num_bits == 4) { | ||
| 230 |         return REPLICATE_4_BIT_TO_8_TABLE[value]; | ||
| 231 |     } else if (num_bits == 5) { | ||
| 232 |         return REPLICATE_5_BIT_TO_8_TABLE[value]; | ||
| 233 |     } else if (num_bits == 6) { | ||
| 234 |         return REPLICATE_6_BIT_TO_8_TABLE[value]; | ||
| 235 |     } else if (num_bits == 7) { | ||
| 236 |         return REPLICATE_7_BIT_TO_8_TABLE[value]; | ||
| 237 |     } else if (num_bits == 8) { | ||
| 238 |         return REPLICATE_8_BIT_TO_8_TABLE[value]; | ||
| 239 |     } | ||
| 240 |     return Replicate(value, num_bits, 8); | ||
| 241 | } | ||
| 243 | |||
| | // Table-driven replication of a num_bits wide value to 6 bits. | ||
| | // Widths 1..5 use the lookup tables; any other width falls back to Replicate(). | ||
| 244 | uint FastReplicateTo6(uint value, uint num_bits) { | ||
| 245 |     if (num_bits == 1) { | ||
| 246 |         return REPLICATE_1_BIT_TO_6_TABLE[value]; | ||
| 247 |     } else if (num_bits == 2) { | ||
| 248 |         return REPLICATE_2_BIT_TO_6_TABLE[value]; | ||
| 249 |     } else if (num_bits == 3) { | ||
| 250 |         return REPLICATE_3_BIT_TO_6_TABLE[value]; | ||
| 251 |     } else if (num_bits == 4) { | ||
| 252 |         return REPLICATE_4_BIT_TO_6_TABLE[value]; | ||
| 253 |     } else if (num_bits == 5) { | ||
| 254 |         return REPLICATE_5_BIT_TO_6_TABLE[value]; | ||
| 255 |     } | ||
| 256 |     return Replicate(value, num_bits, 6); | ||
| 257 | } | ||
| 259 | |||
| | // Approximates v / 3 (rounded down) with a multiply by 0x5556 and a 16-bit right shift. | ||
| 260 | uint Div3Floor(uint v) { | ||
| 261 |     const uint scaled = v * 0x5556; | ||
| 262 |     return scaled >> 16; | ||
| | } | ||
| 263 | |||
| | // Approximates v / 3 rounded up: the multiply-and-shift division by 3 applied to v + 2. | ||
| 264 | uint Div3Ceil(uint v) { | ||
| 265 |     return ((v + 2) * 0x5556) >> 16; | ||
| 266 | } | ||
| 267 | |||
| | // Approximates v / 5 (rounded down) with a multiply by 0x3334 and a 16-bit right shift. | ||
| 268 | uint Div5Floor(uint v) { | ||
| 269 |     const uint scaled = v * 0x3334; | ||
| 270 |     return scaled >> 16; | ||
| | } | ||
| 271 | |||
| | // Approximates v / 5 rounded up: the multiply-and-shift division by 5 applied to v + 4. | ||
| 272 | uint Div5Ceil(uint v) { | ||
| 273 |     return ((v + 4) * 0x3334) >> 16; | ||
| 274 | } | ||
| 275 | |||
| | // Scrambles p with a fixed sequence of shift/xor/add/subtract steps (32-bit wrap-around). | ||
| | // The order of the steps is significant. | ||
| 276 | uint Hash52(uint p) { | ||
| 277 |     p = p ^ (p >> 15); | ||
| 278 |     p = p - (p << 17); | ||
| 279 |     p = p + (p << 7); | ||
| 280 |     p = p + (p << 4); | ||
| 281 |     p = p ^ (p >> 5); | ||
| 282 |     p = p + (p << 16); | ||
| 283 |     p = p ^ (p >> 7); | ||
| 284 |     p = p ^ (p >> 3); | ||
| 285 |     p = p ^ (p << 6); | ||
| 286 |     p = p ^ (p >> 17); | ||
| 287 |     return p; | ||
| 288 | } | ||
| 289 | |||
| | // Returns the partition index (0..3) of the texel at (x, y, z). | ||
| | // A single-partition block always yields 0. Otherwise the seed is offset by the partition | ||
| | // count, hashed, and twelve squared-and-shifted 4-bit fields of the hash are combined with | ||
| | // the coordinates into four 6-bit scores; the first largest score (a, b, c, d) wins. | ||
| | // Coordinates are doubled first when small_block is set. | ||
| 290 | uint SelectPartition(uint seed, uint x, uint y, uint z, uint partition_count, bool small_block) { | ||
| 291 |     if (partition_count == 1) { | ||
| 292 |         return 0; | ||
| 293 |     } | ||
| 294 |     if (small_block) { | ||
| 295 |         x <<= 1; | ||
| 296 |         y <<= 1; | ||
| 297 |         z <<= 1; | ||
| 298 |     } | ||
| 299 |     seed += (partition_count - 1) * 1024; | ||
| 300 |     const uint rnum = Hash52(seed); | ||
| 301 |     // Shift amounts, chosen from the (offset) seed bits and the partition count. | ||
| 302 |     const bool seed_is_odd = (seed & 1) != 0; | ||
| 303 |     const int seed_shift = (seed & 2) != 0 ? 4 : 5; | ||
| 304 |     const int count_shift = (partition_count == 3) ? 6 : 5; | ||
| 305 |     const int sh1 = seed_is_odd ? seed_shift : count_shift; | ||
| 306 |     const int sh2 = seed_is_odd ? count_shift : seed_shift; | ||
| 307 |     const int sh3 = (seed & 0x10) != 0 ? sh1 : sh2; | ||
| 308 |     // 4-bit fields of the hash. | ||
| 309 |     const uint n1 = rnum & 0xF; | ||
| 310 |     const uint n2 = (rnum >> 4) & 0xF; | ||
| 311 |     const uint n3 = (rnum >> 8) & 0xF; | ||
| 312 |     const uint n4 = (rnum >> 12) & 0xF; | ||
| 313 |     const uint n5 = (rnum >> 16) & 0xF; | ||
| 314 |     const uint n6 = (rnum >> 20) & 0xF; | ||
| 315 |     const uint n7 = (rnum >> 24) & 0xF; | ||
| 316 |     const uint n8 = (rnum >> 28) & 0xF; | ||
| 317 |     const uint n9 = (rnum >> 18) & 0xF; | ||
| 318 |     const uint n10 = (rnum >> 22) & 0xF; | ||
| 319 |     const uint n11 = (rnum >> 26) & 0xF; | ||
| 320 |     const uint n12 = ((rnum >> 30) | (rnum << 2)) & 0xF; | ||
| 321 |     // Square each field, then scale it down by its shift. | ||
| 322 |     const uint w1 = (n1 * n1) >> sh1; | ||
| 323 |     const uint w2 = (n2 * n2) >> sh2; | ||
| 324 |     const uint w3 = (n3 * n3) >> sh1; | ||
| 325 |     const uint w4 = (n4 * n4) >> sh2; | ||
| 326 |     const uint w5 = (n5 * n5) >> sh1; | ||
| 327 |     const uint w6 = (n6 * n6) >> sh2; | ||
| 328 |     const uint w7 = (n7 * n7) >> sh1; | ||
| 329 |     const uint w8 = (n8 * n8) >> sh2; | ||
| 330 |     const uint w9 = (n9 * n9) >> sh3; | ||
| 331 |     const uint w10 = (n10 * n10) >> sh3; | ||
| 332 |     const uint w11 = (n11 * n11) >> sh3; | ||
| 333 |     const uint w12 = (n12 * n12) >> sh3; | ||
| 334 |     // Per-partition scores; partitions that do not exist score 0. | ||
| 335 |     const uint a = (w1 * x + w2 * y + w11 * z + (rnum >> 14)) & 0x3F; | ||
| 336 |     const uint b = (w3 * x + w4 * y + w12 * z + (rnum >> 10)) & 0x3F; | ||
| 337 |     const uint c = partition_count < 3 ? 0u : (w5 * x + w6 * y + w9 * z + (rnum >> 6)) & 0x3F; | ||
| 338 |     const uint d = partition_count < 4 ? 0u : (w7 * x + w8 * y + w10 * z + (rnum >> 2)) & 0x3F; | ||
| 339 |     if (a >= b && a >= c && a >= d) { | ||
| 340 |         return 0; | ||
| 341 |     } | ||
| 342 |     if (b >= c && b >= d) { | ||
| 343 |         return 1; | ||
| 344 |     } | ||
| 345 |     return c >= d ? 2 : 3; | ||
| 346 | } | ||
| 379 | |||
| | // 2D convenience wrapper: selects the partition with the z coordinate fixed to 0. | ||
| 380 | uint Select2DPartition(uint seed, uint x, uint y, uint partition_count, bool small_block) { | ||
| 381 |     const uint z = 0; | ||
| 382 |     return SelectPartition(seed, x, y, z, partition_count, small_block); | ||
| | } | ||
| 383 | |||
| | // Reads the next bit of local_buff, treating each element as holding 8 bits (lowest first), | ||
| | // and advances the bitsread / total_bitsread / current_index cursors. | ||
| | // Returns 0 without advancing once current_index is past the end of local_buff. | ||
| 384 | uint ReadBit() { | ||
| 385 |     if (current_index >= local_buff.length()) { | ||
| 386 |         return 0; | ||
| 387 |     } | ||
| 388 |     const uint bit = bitfieldExtract(local_buff[current_index], bitsread, 1); | ||
| 389 |     ++total_bitsread; | ||
| 390 |     if (++bitsread == 8) { | ||
| 391 |         bitsread = 0; | ||
| 392 |         ++current_index; | ||
| 393 |     } | ||
| 394 |     return bit; | ||
| 395 | } | ||
| 397 | |||
| | // Reads num_bits bits with ReadBit() and packs them least-significant bit first. | ||
| 398 | uint StreamBits(uint num_bits) { | ||
| 399 |     uint value = 0; | ||
| 400 |     for (uint bit_pos = 0; bit_pos != num_bits; ++bit_pos) { | ||
| 401 |         value = value | ((ReadBit() & 1) << bit_pos); | ||
| 402 |     } | ||
| 403 |     return value; | ||
| 404 | } | ||
| 405 | |||
| | // Reads the next bit from texel_weight_data when texel_flag is set, otherwise from | ||
| | // color_endpoint_data, advancing the cursor set of whichever stream was read. | ||
| | // Each array element is treated as holding 8 bits, lowest bit first. | ||
| 406 | uint ReadColorBit() { | ||
| 407 |     if (texel_flag) { | ||
| 408 |         const uint texel_bit = bitfieldExtract(texel_weight_data[texel_index], texel_bitsread, 1); | ||
| 409 |         ++total_texel_bitsread; | ||
| 410 |         if (++texel_bitsread == 8) { | ||
| 411 |             texel_bitsread = 0; | ||
| 412 |             ++texel_index; | ||
| 413 |         } | ||
| 414 |         return texel_bit; | ||
| 415 |     } | ||
| 416 |     const uint color_bit = bitfieldExtract(color_endpoint_data[color_index], color_bitsread, 1); | ||
| 417 |     ++total_color_bitsread; | ||
| 418 |     if (++color_bitsread == 8) { | ||
| 419 |         color_bitsread = 0; | ||
| 420 |         ++color_index; | ||
| 421 |     } | ||
| 422 |     return color_bit; | ||
| 423 | } | ||
| 427 | |||
| | // Reads num_bits bits with ReadColorBit() and packs them least-significant bit first. | ||
| 428 | uint StreamColorBits(uint num_bits) { | ||
| 429 |     uint value = 0; | ||
| 430 |     for (uint bit_pos = 0; bit_pos != num_bits; ++bit_pos) { | ||
| 431 |         value = value | ((ReadColorBit() & 1) << bit_pos); | ||
| 432 |     } | ||
| 433 |     return value; | ||
| 434 | } | ||
| 435 | |||
| | // Appends val to texel_vector when texel_flag is set, otherwise to result_vector, | ||
| | // and bumps the matching element count. No bounds check is performed. | ||
| 436 | void ResultEmplaceBack(EncodingData val) { | ||
| 437 |     if (!texel_flag) { | ||
| 438 |         result_vector[result_index++] = val; | ||
| 439 |         return; | ||
| 440 |     } | ||
| 441 |     texel_vector[texel_vector_index++] = val; | ||
| 442 | } | ||
| 445 | |||
| 446 | // Returns the number of bits needed to store n_vals values in the encoding found at | ||
| 447 | // encoding_values[encoding_index]: the plain bits of every value plus, for trit and quint | ||
| | // encodings, ceil(8 * n_vals / 5) or ceil(7 * n_vals / 3) extra bits respectively. | ||
| 448 | uint GetBitLength(uint n_vals, uint encoding_index) { | ||
| 449 |     const EncodingData info = encoding_values[encoding_index]; | ||
| 450 |     uint extra_bits = 0; | ||
| 451 |     if (info.encoding == TRIT) { | ||
| 452 |         extra_bits = Div5Ceil(n_vals * 8); | ||
| 453 |     } else if (info.encoding == QUINT) { | ||
| 454 |         extra_bits = Div3Ceil(n_vals * 7); | ||
| 455 |     } | ||
| |     return info.num_bits * n_vals + extra_bits; | ||
| | } | ||
| 456 | |||
| | // Returns size.x * size.y, doubled when dual_plane is set. | ||
| 457 | uint GetNumWeightValues(uvec2 size, bool dual_plane) { | ||
| 458 |     const uint per_plane = size.x * size.y; | ||
| 459 |     return dual_plane ? per_plane * 2 : per_plane; | ||
| 460 | } | ||
| 464 | |||
| | // Returns the bit length of all weight values of a grid, using max_weight as the encoding index. | ||
| 465 | uint GetPackedBitSize(uvec2 size, bool dual_plane, uint max_weight) { | ||
| 466 |     return GetBitLength(GetNumWeightValues(size, dual_plane), max_weight); | ||
| 467 | } | ||
| 469 | |||
| | // Returns bit number pos of bits (0 or 1). | ||
| 470 | uint BitsBracket(uint bits, uint pos) { | ||
| 471 |     const uint shifted = bits >> pos; | ||
| 472 |     return shifted & 1; | ||
| | } | ||
| 473 | |||
| | // Returns the bit field of bits spanning positions start..end inclusive, shifted down to bit 0. | ||
| | // The two positions may be given in either order; equal positions select a single bit. | ||
| 474 | uint BitsOp(uint bits, uint start, uint end) { | ||
| 475 |     const uint low = start < end ? start : end; | ||
| 476 |     const uint high = start < end ? end : start; | ||
| 477 |     const uint width = high - low + 1; | ||
| 478 |     const uint mask = uint((1 << width) - 1); | ||
| 479 |     return (bits >> low) & mask; | ||
| 480 | } | ||
| 486 | |||
| | // Decodes one block of three quint-encoded values from the color/weight bit stream and | ||
| | // appends them with ResultEmplaceBack(). Each value has num_bits plain bits; the three | ||
| | // quint digits are packed together in 7 bits that are interleaved with the plain bits. | ||
| 487 | void DecodeQuintBlock(uint num_bits) { | ||
| 488 |     uint low_bits[3]; | ||
| 489 |     uint quints[3]; | ||
| 490 |     // The reads below follow the stream layout; their order must not change. | ||
| 491 |     low_bits[0] = StreamColorBits(num_bits); | ||
| 492 |     uint packed = StreamColorBits(3); | ||
| 493 |     low_bits[1] = StreamColorBits(num_bits); | ||
| 494 |     packed |= StreamColorBits(2) << 3; | ||
| 495 |     low_bits[2] = StreamColorBits(num_bits); | ||
| 496 |     packed |= StreamColorBits(2) << 5; | ||
| 497 |     const uint bits_1_2 = BitsOp(packed, 1, 2); | ||
| 498 |     const uint bits_5_6 = BitsOp(packed, 5, 6); | ||
| 499 |     const uint bit_0 = BitsBracket(packed, 0); | ||
| 500 |     if (bits_1_2 == 3 && bits_5_6 == 0) { | ||
| 501 |         quints[0] = 4; | ||
| 502 |         quints[1] = 4; | ||
| 503 |         quints[2] = (bit_0 << 2) | ((BitsBracket(packed, 4) & ~bit_0) << 1) | | ||
| 504 |                     (BitsBracket(packed, 3) & ~bit_0); | ||
| 505 |     } else { | ||
| 506 |         uint combined; | ||
| 507 |         if (bits_1_2 == 3) { | ||
| 508 |             quints[2] = 4; | ||
| 509 |             combined = (BitsOp(packed, 3, 4) << 3) | ((~bits_5_6 & 3) << 1) | bit_0; | ||
| 510 |         } else { | ||
| 511 |             quints[2] = bits_5_6; | ||
| 512 |             combined = BitsOp(packed, 0, 4); | ||
| 513 |         } | ||
| 514 |         const uint combined_low = BitsOp(combined, 0, 2); | ||
| 515 |         const uint combined_high = BitsOp(combined, 3, 4); | ||
| 516 |         if (combined_low == 5) { | ||
| 517 |             quints[1] = 4; | ||
| 518 |             quints[0] = combined_high; | ||
| 519 |         } else { | ||
| 520 |             quints[1] = combined_high; | ||
| 521 |             quints[0] = combined_low; | ||
| 522 |         } | ||
| 523 |     } | ||
| 524 |     for (uint i = 0; i < 3; i++) { | ||
| 525 |         ResultEmplaceBack(EncodingData(uint(QUINT), num_bits, low_bits[i], quints[i])); | ||
| 526 |     } | ||
| 527 | } | ||
| 528 | |||
| | // Decodes one block of five trit-encoded values from the color/weight bit stream and | ||
| | // appends them with ResultEmplaceBack(). Each value has num_bits plain bits; the five | ||
| | // trit digits are packed together in 8 bits that are interleaved with the plain bits. | ||
| 529 | void DecodeTritBlock(uint num_bits) { | ||
| 530 |     uint low_bits[5]; | ||
| 531 |     uint trits[5]; | ||
| 532 |     // The reads below follow the stream layout; their order must not change. | ||
| 533 |     low_bits[0] = StreamColorBits(num_bits); | ||
| 534 |     uint packed = StreamColorBits(2); | ||
| 535 |     low_bits[1] = StreamColorBits(num_bits); | ||
| 536 |     packed |= StreamColorBits(2) << 2; | ||
| 537 |     low_bits[2] = StreamColorBits(num_bits); | ||
| 538 |     packed |= StreamColorBits(1) << 4; | ||
| 539 |     low_bits[3] = StreamColorBits(num_bits); | ||
| 540 |     packed |= StreamColorBits(2) << 5; | ||
| 541 |     low_bits[4] = StreamColorBits(num_bits); | ||
| 542 |     packed |= StreamColorBits(1) << 7; | ||
| 543 |     const uint bits_5_6 = BitsOp(packed, 5, 6); | ||
| 544 |     const uint bit_7 = BitsBracket(packed, 7); | ||
| 545 |     uint combined; | ||
| 546 |     if (BitsOp(packed, 2, 4) == 7) { | ||
| 547 |         combined = (BitsOp(packed, 5, 7) << 2) | BitsOp(packed, 0, 1); | ||
| 548 |         trits[4] = 2; | ||
| 549 |         trits[3] = 2; | ||
| 550 |     } else { | ||
| 551 |         combined = BitsOp(packed, 0, 4); | ||
| 552 |         if (bits_5_6 == 3) { | ||
| 553 |             trits[4] = 2; | ||
| 554 |             trits[3] = bit_7; | ||
| 555 |         } else { | ||
| 556 |             trits[4] = bit_7; | ||
| 557 |             trits[3] = bits_5_6; | ||
| 558 |         } | ||
| 559 |     } | ||
| 560 |     const uint combined_0_1 = BitsOp(combined, 0, 1); | ||
| 561 |     const uint combined_2_3 = BitsOp(combined, 2, 3); | ||
| 562 |     const uint combined_4 = BitsBracket(combined, 4); | ||
| 563 |     if (combined_0_1 == 3) { | ||
| 564 |         trits[2] = 2; | ||
| 565 |         trits[1] = combined_4; | ||
| 566 |         trits[0] = (BitsBracket(combined, 3) << 1) | | ||
| 567 |                    (BitsBracket(combined, 2) & ~BitsBracket(combined, 3)); | ||
| 568 |     } else if (combined_2_3 == 3) { | ||
| 569 |         trits[2] = 2; | ||
| 570 |         trits[1] = 2; | ||
| 571 |         trits[0] = combined_0_1; | ||
| 572 |     } else { | ||
| 573 |         trits[2] = combined_4; | ||
| 574 |         trits[1] = combined_2_3; | ||
| 575 |         trits[0] = (BitsBracket(combined, 1) << 1) | | ||
| 576 |                    (BitsBracket(combined, 0) & ~BitsBracket(combined, 1)); | ||
| 577 |     } | ||
| 578 |     for (uint i = 0; i < 5; i++) { | ||
| 579 |         ResultEmplaceBack(EncodingData(uint(TRIT), num_bits, low_bits[i], trits[i])); | ||
| |     } | ||
| | } | ||
| 580 | |||
| | // Decodes at least num_values values using the encoding stored at encoding_values[max_range]. | ||
| | // Quint and trit encodings are decoded a whole block (3 or 5 values) at a time, so more than | ||
| | // num_values entries may be appended; plain-bit values are read and appended one by one. | ||
| 581 | void DecodeIntegerSequence(uint max_range, uint num_values) { | ||
| 582 |     EncodingData val = encoding_values[max_range]; | ||
| 583 |     for (uint decoded = 0; decoded < num_values;) { | ||
| 584 |         if (val.encoding == QUINT) { | ||
| 585 |             DecodeQuintBlock(val.num_bits); | ||
| 586 |             decoded += 3; | ||
| 587 |         } else if (val.encoding == TRIT) { | ||
| 588 |             DecodeTritBlock(val.num_bits); | ||
| 589 |             decoded += 5; | ||
| 590 |         } else if (val.encoding == JUST_BITS) { | ||
| 591 |             val.bit_value = StreamColorBits(val.num_bits); | ||
| 592 |             ResultEmplaceBack(val); | ||
| 593 |             ++decoded; | ||
| 594 |         } | ||
| 595 |     } | ||
| 596 | } | ||
| 602 | |||
| | // Decodes the color endpoint values of a block and stores them, unquantized to 8 bits, | ||
| | // in color_values[0 .. num_values). | ||
| | //   modes:           endpoint mode per partition; partition i contributes | ||
| | //                    ((modes[i] >> 2) + 1) * 2 values. | ||
| | //   num_partitions:  number of entries of modes that are used. | ||
| | //   color_data_bits: number of bits available for the encoded color values. | ||
| | // The encoding is chosen by scanning encoding_values from index 255 downwards for the first | ||
| | // entry whose total bit length fits in color_data_bits, then moving down to the lowest index | ||
| | // that still has that same encoding. | ||
| 603 | void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits) { | ||
| 604 | uint num_values = 0; | ||
| 605 | for (uint i = 0; i < num_partitions; i++) { | ||
| 606 | num_values += ((modes[i] >> 2) + 1) << 1; | ||
| 607 | } | ||
| 608 | int range = 256; | ||
| 609 | while (--range > 0) { | ||
| 610 | EncodingData val = encoding_values[range]; | ||
| 611 | uint bit_length = GetBitLength(num_values, range); | ||
| 612 | if (bit_length <= color_data_bits) { | ||
| | // Walk down while the entries still describe the same encoding. | ||
| 613 | while (--range > 0) { | ||
| 614 | EncodingData newval = encoding_values[range]; | ||
| | // An entry is a different encoding as soon as either the kind or the bit count | ||
| | // differs. (This used to require both to differ, which would have skipped over | ||
| | // an entry that shares only one of the two properties.) | ||
| 615 | if (newval.encoding != val.encoding || newval.num_bits != val.num_bits) { | ||
| 616 | break; | ||
| 617 | } | ||
| 618 | } | ||
| | // Step back to the last index that still matched. | ||
| 619 | ++range; | ||
| 620 | break; | ||
| 621 | } | ||
| 622 | } | ||
| 623 | DecodeIntegerSequence(range, num_values); | ||
| | // Unquantize the decoded values. Block decoding may have produced more entries than | ||
| | // num_values; the surplus is ignored. | ||
| 624 | uint out_index = 0; | ||
| 625 | for (int itr = 0; itr < result_index; ++itr) { | ||
| 626 | if (out_index >= num_values) { | ||
| 627 | break; | ||
| 628 | } | ||
| 629 | EncodingData val = result_vector[itr]; | ||
| 630 | uint bitlen = val.num_bits; | ||
| 631 | uint bitval = val.bit_value; | ||
| | // A: lowest plain bit replicated to 9 bits; B: bit pattern built from the remaining | ||
| | // plain bits; C: scale constant selected by bit count; D: the trit/quint digit. | ||
| | // NOTE(review): the constants appear to follow the ASTC color unquantization tables - | ||
| | // confirm against the specification before changing any of them. | ||
| 632 | uint A = 0, B = 0, C = 0, D = 0; | ||
| 633 | A = ReplicateBitTo9((bitval & 1)); | ||
| 634 | switch (val.encoding) { | ||
| 635 | case JUST_BITS: | ||
| 636 | color_values[out_index++] = FastReplicateTo8(bitval, bitlen); | ||
| 637 | break; | ||
| 638 | case TRIT: { | ||
| 639 | D = val.quint_trit_value; | ||
| 640 | switch (bitlen) { | ||
| 641 | case 1: | ||
| 642 | C = 204; | ||
| 643 | break; | ||
| 644 | case 2: { | ||
| 645 | C = 93; | ||
| 646 | uint b = (bitval >> 1) & 1; | ||
| 647 | B = (b << 8) | (b << 4) | (b << 2) | (b << 1); | ||
| 648 | break; | ||
| 649 | } | ||
| 650 | case 3: { | ||
| 651 | C = 44; | ||
| 652 | uint cb = (bitval >> 1) & 3; | ||
| 653 | B = (cb << 7) | (cb << 2) | cb; | ||
| 654 | break; | ||
| 655 | } | ||
| 656 | case 4: { | ||
| 657 | C = 22; | ||
| 658 | uint dcb = (bitval >> 1) & 7; | ||
| 659 | B = (dcb << 6) | dcb; | ||
| 660 | break; | ||
| 661 | } | ||
| 662 | case 5: { | ||
| 663 | C = 11; | ||
| 664 | uint edcb = (bitval >> 1) & 0xF; | ||
| 665 | B = (edcb << 5) | (edcb >> 2); | ||
| 666 | break; | ||
| 667 | } | ||
| 668 | case 6: { | ||
| 669 | C = 5; | ||
| 670 | uint fedcb = (bitval >> 1) & 0x1F; | ||
| 671 | B = (fedcb << 4) | (fedcb >> 4); | ||
| 672 | break; | ||
| 673 | } | ||
| 674 | } | ||
| 675 | break; | ||
| 676 | } | ||
| 677 | case QUINT: { | ||
| 678 | D = val.quint_trit_value; | ||
| 679 | switch (bitlen) { | ||
| 680 | case 1: | ||
| 681 | C = 113; | ||
| 682 | break; | ||
| 683 | case 2: { | ||
| 684 | C = 54; | ||
| 685 | uint b = (bitval >> 1) & 1; | ||
| 686 | B = (b << 8) | (b << 3) | (b << 2); | ||
| 687 | break; | ||
| 688 | } | ||
| 689 | case 3: { | ||
| 690 | C = 26; | ||
| 691 | uint cb = (bitval >> 1) & 3; | ||
| 692 | B = (cb << 7) | (cb << 1) | (cb >> 1); | ||
| 693 | break; | ||
| 694 | } | ||
| 695 | case 4: { | ||
| 696 | C = 13; | ||
| 697 | uint dcb = (bitval >> 1) & 7; | ||
| 698 | B = (dcb << 6) | (dcb >> 1); | ||
| 699 | break; | ||
| 700 | } | ||
| 701 | case 5: { | ||
| 702 | C = 6; | ||
| 703 | uint edcb = (bitval >> 1) & 0xF; | ||
| 704 | B = (edcb << 5) | (edcb >> 3); | ||
| 705 | break; | ||
| 706 | } | ||
| 707 | } | ||
| 708 | break; | ||
| 709 | } | ||
| 710 | } | ||
| | // Trit/quint values: combine digit, scale and bit pattern, then reduce to 8 bits. | ||
| 711 | if (val.encoding != JUST_BITS) { | ||
| 712 | uint T = (D * C) + B; | ||
| 713 | T ^= A; | ||
| 714 | T = (A & 0x80) | (T >> 2); | ||
| 715 | color_values[out_index++] = T; | ||
| 716 | } | ||
| 717 | } | ||
| 718 | } | ||
| 719 | |||
| | // Moves the top bit (0x80) of a into b and sign-extends what is left of a. | ||
| | // Returns ivec2(new_a, new_b) where new_b = (b >> 1) | (a & 0x80) and new_a is bits 1..6 | ||
| | // of a interpreted as a signed 6-bit number (range -32..31). | ||
| 720 | ivec2 BitTransferSigned(int a, int b) { | ||
| 721 |     const int new_b = (b >> 1) | (a & 0x80); | ||
| 722 |     int new_a = (a >> 1) & 0x3F; | ||
| 723 |     if ((new_a & 0x20) != 0) { | ||
| 724 |         new_a -= 0x40; | ||
| 725 |     } | ||
| 726 |     return ivec2(new_a, new_b); | ||
| 727 | } | ||
| 731 | |||
| | // Clamps every component of color to the range 0..255 and converts the result to unsigned. | ||
| 732 | uvec4 ClampByte(ivec4 color) { | ||
| 733 |     const ivec4 clamped = clamp(color, ivec4(0), ivec4(255)); | ||
| 734 |     return uvec4(clamped); | ||
| 735 | } | ||
| 738 | |||
| | // Returns (a, (r + b) >> 1, (g + b) >> 1, b): red and green are averaged with blue. | ||
| 739 | ivec4 BlueContract(int a, int r, int g, int b) { | ||
| 740 |     const int contracted_r = (r + b) >> 1; | ||
| 741 |     const int contracted_g = (g + b) >> 1; | ||
| |     return ivec4(a, contracted_r, contracted_g, b); | ||
| | } | ||
| 742 | |||
// Builds the two color endpoints of one partition from the decoded color
// values.
//
// Values are consumed sequentially from color_values starting at
// colvals_index, which is advanced by the number of values the mode needs.
// Both endpoints are returned in (a, r, g, b) component order with 8-bit
// components.
//
//   ep1, ep2             - receive the first and second endpoint.
//   color_endpoint_mode  - ASTC color endpoint mode of the partition. Only the
//                          LDR modes are handled; any other value yields a
//                          fixed placeholder color.
void ComputeEndpoints(out uvec4 ep1, out uvec4 ep2, uint color_endpoint_mode) {
// Declares v[N] and fills it with the next N color values (unsigned).
#define READ_UINT_VALUES(N)                                                                        \
    uint v[N];                                                                                     \
    for (uint i = 0; i < N; i++) {                                                                 \
        v[i] = color_values[colvals_index++];                                                      \
    }

// Declares v[N] and fills it with the next N color values (signed, for modes
// that apply signed offsets).
#define READ_INT_VALUES(N)                                                                         \
    int v[N];                                                                                      \
    for (uint i = 0; i < N; i++) {                                                                 \
        v[i] = int(color_values[colvals_index++]);                                                 \
    }

    switch (color_endpoint_mode) {
    case 0: {
        // Luminance, direct
        READ_UINT_VALUES(2)
        ep1 = uvec4(0xFF, v[0], v[0], v[0]);
        ep2 = uvec4(0xFF, v[1], v[1], v[1]);
        break;
    }
    case 1: {
        // Luminance, base + offset
        READ_UINT_VALUES(2)
        uint L0 = (v[0] >> 2) | (v[1] & 0xC0);
        // The second luminance saturates at 255; it must never exceed a byte.
        uint L1 = min(L0 + (v[1] & 0x3F), 0xFFU);
        ep1 = uvec4(0xFF, L0, L0, L0);
        ep2 = uvec4(0xFF, L1, L1, L1);
        break;
    }
    case 4: {
        // Luminance + alpha, direct
        READ_UINT_VALUES(4)
        ep1 = uvec4(v[2], v[0], v[0], v[0]);
        ep2 = uvec4(v[3], v[1], v[1], v[1]);
        break;
    }
    case 5: {
        // Luminance + alpha, base + offset
        READ_INT_VALUES(4)
        ivec2 transferred = BitTransferSigned(v[1], v[0]);
        v[1] = transferred.x;
        v[0] = transferred.y;
        transferred = BitTransferSigned(v[3], v[2]);
        v[3] = transferred.x;
        v[2] = transferred.y;
        ep1 = ClampByte(ivec4(v[2], v[0], v[0], v[0]));
        ep2 = ClampByte(ivec4(v[2] + v[3], v[0] + v[1], v[0] + v[1], v[0] + v[1]));
        break;
    }
    case 6: {
        // RGB, base + scale
        READ_UINT_VALUES(4)
        ep1 = uvec4(0xFF, (v[0] * v[3]) >> 8, (v[1] * v[3]) >> 8, (v[2] * v[3]) >> 8);
        ep2 = uvec4(0xFF, v[0], v[1], v[2]);
        break;
    }
    case 8: {
        // RGB, direct. The endpoints are swapped and blue-contracted when the
        // second triple is darker than the first.
        READ_UINT_VALUES(6)
        if ((v[1] + v[3] + v[5]) >= (v[0] + v[2] + v[4])) {
            ep1 = uvec4(0xFF, v[0], v[2], v[4]);
            ep2 = uvec4(0xFF, v[1], v[3], v[5]);
        } else {
            ep1 = uvec4(BlueContract(0xFF, int(v[1]), int(v[3]), int(v[5])));
            ep2 = uvec4(BlueContract(0xFF, int(v[0]), int(v[2]), int(v[4])));
        }
        break;
    }
    case 9: {
        // RGB, base + offset
        READ_INT_VALUES(6)
        ivec2 transferred = BitTransferSigned(v[1], v[0]);
        v[1] = transferred.x;
        v[0] = transferred.y;
        transferred = BitTransferSigned(v[3], v[2]);
        v[3] = transferred.x;
        v[2] = transferred.y;
        transferred = BitTransferSigned(v[5], v[4]);
        v[5] = transferred.x;
        v[4] = transferred.y;
        if ((v[1] + v[3] + v[5]) >= 0) {
            ep1 = ClampByte(ivec4(0xFF, v[0], v[2], v[4]));
            ep2 = ClampByte(ivec4(0xFF, v[0] + v[1], v[2] + v[3], v[4] + v[5]));
        } else {
            ep1 = ClampByte(BlueContract(0xFF, v[0] + v[1], v[2] + v[3], v[4] + v[5]));
            ep2 = ClampByte(BlueContract(0xFF, v[0], v[2], v[4]));
        }
        break;
    }
    case 10: {
        // RGB base + scale, plus two direct alpha values
        READ_UINT_VALUES(6)
        ep1 = uvec4(v[4], (v[0] * v[3]) >> 8, (v[1] * v[3]) >> 8, (v[2] * v[3]) >> 8);
        ep2 = uvec4(v[5], v[0], v[1], v[2]);
        break;
    }
    case 12: {
        // RGBA, direct
        READ_UINT_VALUES(8)
        if ((v[1] + v[3] + v[5]) >= (v[0] + v[2] + v[4])) {
            ep1 = uvec4(v[6], v[0], v[2], v[4]);
            ep2 = uvec4(v[7], v[1], v[3], v[5]);
        } else {
            ep1 = uvec4(BlueContract(int(v[7]), int(v[1]), int(v[3]), int(v[5])));
            ep2 = uvec4(BlueContract(int(v[6]), int(v[0]), int(v[2]), int(v[4])));
        }
        break;
    }
    case 13: {
        // RGBA, base + offset
        READ_INT_VALUES(8)
        ivec2 transferred = BitTransferSigned(v[1], v[0]);
        v[1] = transferred.x;
        v[0] = transferred.y;
        transferred = BitTransferSigned(v[3], v[2]);
        v[3] = transferred.x;
        v[2] = transferred.y;

        transferred = BitTransferSigned(v[5], v[4]);
        v[5] = transferred.x;
        v[4] = transferred.y;

        transferred = BitTransferSigned(v[7], v[6]);
        v[7] = transferred.x;
        v[6] = transferred.y;

        if ((v[1] + v[3] + v[5]) >= 0) {
            ep1 = ClampByte(ivec4(v[6], v[0], v[2], v[4]));
            ep2 = ClampByte(ivec4(v[7] + v[6], v[0] + v[1], v[2] + v[3], v[4] + v[5]));
        } else {
            ep1 = ClampByte(BlueContract(v[6] + v[7], v[0] + v[1], v[2] + v[3], v[4] + v[5]));
            ep2 = ClampByte(BlueContract(v[6], v[0], v[2], v[4]));
        }
        break;
    }
    default: {
        // HDR mode, or more likely a bug computing the color_endpoint_mode
        ep1 = uvec4(0xFF, 0xFF, 0, 0);
        ep2 = uvec4(0xFF, 0xFF, 0, 0);
        break;
    }
    }
#undef READ_UINT_VALUES
#undef READ_INT_VALUES
}
| 879 | |||
// Expands one decoded weight (plain bits, or a trit/quint plus extra bits)
// to the unquantized weight value used for interpolation.
//
// For trit and quint encodings with at least one extra bit the result is
// assembled from four terms: a_term (lowest bit replicated by
// ReplicateBitTo7), b_term (pattern built from the remaining bits), c_term
// (scale constant) and d_term (the trit/quint itself). Any result above 32 is
// bumped by one.
uint UnquantizeTexelWeight(EncodingData val) {
    const uint bits = val.bit_value;
    const uint num_bits = val.num_bits;
    const uint a_term = ReplicateBitTo7((bits & 1));
    uint b_term = 0;
    uint c_term = 0;
    uint d_term = 0;
    uint weight = 0;

    if (val.encoding == JUST_BITS) {
        weight = FastReplicateTo6(bits, num_bits);
    } else if (val.encoding == TRIT) {
        d_term = val.quint_trit_value;
        if (num_bits == 0) {
            // A bare trit maps straight to its final value.
            const uint trit_weights[3] = uint[3](0U, 32U, 63U);
            weight = trit_weights[d_term];
        } else if (num_bits == 1) {
            c_term = 50;
        } else if (num_bits == 2) {
            c_term = 23;
            const uint hi_bit = (bits >> 1) & 1;
            b_term = (hi_bit << 6) | (hi_bit << 2) | hi_bit;
        } else if (num_bits == 3) {
            c_term = 11;
            const uint hi_bits = (bits >> 1) & 3;
            b_term = (hi_bits << 5) | hi_bits;
        }
    } else if (val.encoding == QUINT) {
        d_term = val.quint_trit_value;
        if (num_bits == 0) {
            // A bare quint maps straight to its final value.
            const uint quint_weights[5] = uint[5](0U, 16U, 32U, 47U, 63U);
            weight = quint_weights[d_term];
        } else if (num_bits == 1) {
            c_term = 28;
        } else if (num_bits == 2) {
            c_term = 13;
            const uint hi_bit = (bits >> 1) & 1;
            b_term = (hi_bit << 6) | (hi_bit << 1);
        }
    }

    if (val.encoding != JUST_BITS && num_bits > 0) {
        weight = d_term * c_term + b_term;
        weight ^= a_term;
        weight = (a_term & 0x20) | (weight >> 2);
    }
    if (weight > 32) {
        weight += 1;
    }
    return weight;
}
| 951 | |||
// Unquantizes the decoded weight grid and resamples it to one weight per
// texel of the block.
//
//   dual_plane - when true, entries of texel_vector alternate between plane 0
//                and plane 1 and both planes are produced.
//   size       - dimensions of the stored weight grid.
//
// Reads texel_vector[0 .. texel_vector_index) and writes
// unquantized_texel_weights[plane][t * block_dims.x + s].
void UnquantizeTexelWeights(bool dual_plane, uvec2 size) {
    const uint area = size.x * size.y;
    uint unquantized[2][144];

    // Step 1: unquantize every stored weight, de-interleaving the planes.
    uint src = 0;
    uint dst = 0;
    while (src < texel_vector_index) {
        unquantized[0][dst] = UnquantizeTexelWeight(texel_vector[src]);
        ++src;
        if (dual_plane) {
            unquantized[1][dst] = UnquantizeTexelWeight(texel_vector[src]);
            if (src == texel_vector_index) {
                break;
            }
            ++src;
        }
        ++dst;
        if (dst >= area) {
            break;
        }
    }

    // Step 2: bilinearly interpolate the grid up to the block footprint.
    // Ds/Dt scale a texel coordinate into the fixed-point grid coordinate space.
    const uint Ds = uint((block_dims.x * 0.5f + 1024) / (block_dims.x - 1));
    const uint Dt = uint((block_dims.y * 0.5f + 1024) / (block_dims.y - 1));
    const uint num_planes = dual_plane ? 2U : 1U;
    for (uint plane = 0; plane < num_planes; plane++) {
        for (uint t = 0; t < block_dims.y; t++) {
            // Row-dependent terms: integer grid row (jt) and 4-bit fraction (ft).
            const uint ct = Dt * t;
            const uint gt = (ct * (size.y - 1) + 32) >> 6;
            const uint jt = gt >> 4;
            const uint ft = gt & 0x0F;
            for (uint s = 0; s < block_dims.x; s++) {
                const uint cs = Ds * s;
                const uint gs = (cs * (size.x - 1) + 32) >> 6;
                const uint js = gs >> 4;
                const uint fs = gs & 0xF;

                // Bilinear weights of the four neighbouring grid points (sum is 16).
                const uint w11 = (fs * ft + 8) >> 4;
                const uvec4 w = uvec4(16 - fs - ft + w11, fs - w11, ft - w11, w11);

                // Grid indices of the neighbours; points outside the grid contribute 0.
                const uint v0 = jt * size.x + js;
                uvec4 neighbours = uvec4(v0, v0 + 1, v0 + size.x, v0 + size.x + 1);
                uvec4 p = uvec4(0);
                for (uint k = 0; k < 4; k++) {
                    if (neighbours[k] < area) {
                        p[k] = unquantized[plane][neighbours[k]];
                    }
                }
                unquantized_texel_weights[plane][t * block_dims.x + s] = (uint(dot(p, w)) + 8) >> 4;
            }
        }
    }
}
| 1008 | |||
// Classifies an 11-bit block mode into one of ten layout ids (0-9) by
// inspecting bits 0-3, 5, 7 and 8. The id selects how the weight grid size is
// encoded in the remaining mode bits.
int FindLayout(uint mode) {
    const bool bit2 = (mode & 4) != 0;
    const bool bit3 = (mode & 8) != 0;
    const bool bit5 = (mode & 0x20) != 0;
    const bool bit7 = (mode & 0x80) != 0;
    const bool bit8 = (mode & 0x100) != 0;

    if ((mode & 3) != 0) {
        // Layouts 0-4: the low two bits are non-zero.
        if (!bit3) {
            return bit2 ? 1 : 0;
        }
        if (!bit2) {
            return 2;
        }
        return bit8 ? 4 : 3;
    }

    // Layouts 5-9: the low two bits are zero.
    if (!bit8) {
        return bit7 ? 6 : 5;
    }
    if (!bit7) {
        return 9;
    }
    return bit5 ? 8 : 7;
}
| 1039 | |||
// Reads the 11-bit block mode from the bit stream and decodes it into the
// weight grid parameters: grid size, dual-plane flag and maximum weight value.
// Void-extent blocks only get their LDR/HDR flag set. Reserved or invalid
// encodings set error_state.
//
// block_index is accepted for the caller's convenience but not used here.
TexelWeightParams DecodeBlockInfo(uint block_index) {
    TexelWeightParams params = TexelWeightParams(uvec2(0), 0, false, false, false, false);
    const uint mode = StreamBits(11);

    // Void-extent block: low nine bits are 1_1111_1100.
    if ((mode & 0x1ff) == 0x1fc) {
        const bool is_hdr = (mode & 0x200) != 0;
        params.void_extent_hdr = is_hdr;
        params.void_extent_ldr = !is_hdr;
        // Bit 10 and the following stream bit must both be set.
        if ((mode & 0x400) == 0 || StreamBits(1) == 0) {
            params.error_state = true;
        }
        return params;
    }

    // Reserved block modes.
    const bool low_nibble_zero = (mode & 0xf) == 0;
    const bool reserved_high = (mode & 3) == 0 && (mode & 0x1c0) == 0x1c0;
    if (low_nibble_zero || reserved_high) {
        params.error_state = true;
        return params;
    }

    // Grid size. Which bits hold the two size fields depends on the layout.
    const uint mode_layout = uint(FindLayout(mode));
    const uint field_a = (mode >> 5) & 0x3;
    const uint field_b2 = (mode >> 7) & 0x3; // two-bit B field
    const uint field_b1 = (mode >> 7) & 0x1; // one-bit B field
    switch (mode_layout) {
    case 0:
        params.size = uvec2(field_b2 + 4, field_a + 2);
        break;
    case 1:
        params.size = uvec2(field_b2 + 8, field_a + 2);
        break;
    case 2:
        params.size = uvec2(field_a + 2, field_b2 + 8);
        break;
    case 3:
        params.size = uvec2(field_a + 2, field_b1 + 6);
        break;
    case 4:
        params.size = uvec2(field_b1 + 2, field_a + 2);
        break;
    case 5:
        params.size = uvec2(12, field_a + 2);
        break;
    case 6:
        params.size = uvec2(field_a + 2, 12);
        break;
    case 7:
        params.size = uvec2(6, 10);
        break;
    case 8:
        params.size = uvec2(10, 6);
        break;
    case 9:
        params.size = uvec2(field_a + 6, ((mode >> 9) & 0x3) + 6);
        break;
    default:
        params.error_state = true;
        break;
    }

    // Layout 9 reuses bits 9 and 10 for the grid size, so it can neither be
    // dual-plane nor use the high-precision weight ranges.
    const bool has_flag_bits = mode_layout != 9;
    params.dual_plane = has_flag_bits && ((mode & 0x400) != 0);

    // Assemble the three-bit weight range selector: bit 4 is the low bit, the
    // upper two bits come from bits 0-1 (layouts 0-4) or bits 2-3 (layouts 5-9).
    uint weight_index = (mode >> 4) & 1;
    if (mode_layout < 5) {
        weight_index |= (mode & 0x3) << 1;
    } else {
        weight_index |= (mode & 0xc) >> 1;
    }
    weight_index -= 2; // selector values 0 and 1 were rejected above

    const int max_weights_high[6] = int[6](9, 11, 15, 19, 23, 31);
    const int max_weights_low[6] = int[6](1, 2, 3, 4, 5, 7);
    if (has_flag_bits && ((mode & 0x200) != 0)) {
        params.max_weight = max_weights_high[weight_index];
    } else {
        params.max_weight = max_weights_low[weight_index];
    }
    return params;
}
| 1130 | |||
// Paints the whole block_dims.x * block_dims.y footprint starting at coord
// with a fixed opaque color (1, 1, 0, 1) to mark a block that was not decoded.
void FillError(ivec3 coord) {
    const vec4 error_color = vec4(1.0, 1.0, 0.0, 1.0);
    for (uint row = 0; row < block_dims.y; row++) {
        for (uint col = 0; col < block_dims.x; col++) {
            imageStore(dest_image, coord + ivec3(col, row, 0), error_color);
        }
    }
}
| 1138 | |||
// Fills the block footprint at coord with the single color of an LDR
// void-extent block. Skips 52 stream bits, then reads four 16-bit channels in
// R, G, B, A order and normalizes them to [0, 1].
void FillVoidExtentLDR(ivec3 coord) {
    StreamBits(52);
    const uint red = StreamBits(16);
    const uint green = StreamBits(16);
    const uint blue = StreamBits(16);
    const uint alpha = StreamBits(16);
    const vec4 color = vec4(float(red) / 65535.0f, float(green) / 65535.0f,
                            float(blue) / 65535.0f, float(alpha) / 65535.0f);
    for (uint row = 0; row < block_dims.y; row++) {
        for (uint col = 0; col < block_dims.x; col++) {
            imageStore(dest_image, coord + ivec3(col, row, 0), color);
        }
    }
}
| 1155 | |||
// Decodes the 128-bit block held in local_buff and stores the resulting
// block_dims.x * block_dims.y texels into dest_image starting at coord.
//
// Steps: decode the block mode, read the partition and color-endpoint-mode
// header, extract and decode the color endpoint data, decode the weights
// (stored bit-reversed from the end of the block), then interpolate each
// texel between its partition's endpoints.
//
// Every malformed or unsupported block (including HDR void extents) is filled
// with the error color so that no texel of the destination is left unwritten.
void DecompressBlock(ivec3 coord, uint block_index) {
    TexelWeightParams params = DecodeBlockInfo(block_index);
    if (params.error_state) {
        FillError(coord);
        return;
    }
    if (params.void_extent_hdr) {
        // HDR is not supported by this decoder.
        FillError(coord);
        return;
    }
    if (params.void_extent_ldr) {
        FillVoidExtentLDR(coord);
        return;
    }
    if ((params.size.x > block_dims.x) || (params.size.y > block_dims.y)) {
        // The weight grid may never be larger than the block footprint.
        FillError(coord);
        return;
    }
    uint num_partitions = StreamBits(2) + 1;
    if (num_partitions > 4 || (num_partitions == 4 && params.dual_plane)) {
        FillError(coord);
        return;
    }
    int plane_index = -1;
    uint partition_index = 1;
    uvec4 color_endpoint_mode = uvec4(0);
    uint ced_pointer = 0;
    uint base_cem = 0;
    if (num_partitions == 1) {
        color_endpoint_mode.x = StreamBits(4);
        partition_index = 0;
    } else {
        partition_index = StreamBits(10);
        base_cem = StreamBits(6);
    }
    uint base_mode = base_cem & 3;
    // Computed once; reused below when isolating the weight bits.
    const uint weight_bits = GetPackedBitSize(params.size, params.dual_plane, params.max_weight);
    uint remaining_bits = 128 - weight_bits - total_bitsread;
    uint extra_cem_bits = 0;
    if (base_mode > 0) {
        switch (num_partitions) {
        case 2:
            extra_cem_bits += 2;
            break;
        case 3:
            extra_cem_bits += 5;
            break;
        case 4:
            extra_cem_bits += 8;
            break;
        default:
            // A single partition never carries a multi-partition CEM.
            FillError(coord);
            return;
        }
    }
    remaining_bits -= extra_cem_bits;
    uint plane_selector_bits = 0;
    if (params.dual_plane) {
        plane_selector_bits = 2;
    }
    remaining_bits -= plane_selector_bits;
    if (remaining_bits > 128) {
        // The unsigned subtractions above wrapped around: header, weights and
        // selectors need more than the 128 bits (16 bytes) of the block.
        FillError(coord);
        return;
    }
    // Read color data...
    uint color_data_bits = remaining_bits;
    while (remaining_bits > 0) {
        int nb = int(min(remaining_bits, 8U));
        uint b = StreamBits(nb);
        color_endpoint_data[ced_pointer] = uint(bitfieldExtract(b, 0, nb));
        ++ced_pointer;
        remaining_bits -= nb;
    }
    plane_index = int(StreamBits(plane_selector_bits));
    if (base_mode > 0) {
        // Per-partition modes: the extra CEM bits sit above the six base bits.
        // After dropping the two base_mode bits, one class bit per partition
        // follows, then two mode bits per partition.
        uint extra_cem = StreamBits(extra_cem_bits);
        uint cem = (extra_cem << 6) | base_cem;
        cem >>= 2;
        uvec4 C = uvec4(0);
        for (uint i = 0; i < num_partitions; i++) {
            C[i] = (cem & 1);
            cem >>= 1;
        }
        uvec4 M = uvec4(0);
        for (uint i = 0; i < num_partitions; i++) {
            M[i] = cem & 3;
            cem >>= 2;
        }
        for (uint i = 0; i < num_partitions; i++) {
            color_endpoint_mode[i] = base_mode;
            if (C[i] == 0) {
                --color_endpoint_mode[i];
            }
            color_endpoint_mode[i] <<= 2;
            color_endpoint_mode[i] |= M[i];
        }
    } else if (num_partitions > 1) {
        // All partitions share the same endpoint mode.
        uint cem = base_cem >> 2;
        for (uint i = 0; i < num_partitions; i++) {
            color_endpoint_mode[i] = cem;
        }
    }
    DecodeColorValues(color_endpoint_mode, num_partitions, color_data_bits);

    uvec4 endpoints[4][2];
    for (uint i = 0; i < num_partitions; i++) {
        ComputeEndpoints(endpoints[i][0], endpoints[i][1], color_endpoint_mode[i]);
    }

    // The weights are stored from the end of the block with their bits
    // reversed: reverse the byte order and the bits inside every byte.
    for (uint i = 0; i < 16; i++) {
        texel_weight_data[i] = local_buff[i];
    }
    for (uint i = 0; i < 8; i++) {
#define REVERSE_BYTE(b) ((b * 0x0802U & 0x22110U) | (b * 0x8020U & 0x88440U)) * 0x10101U >> 16
        uint a = REVERSE_BYTE(texel_weight_data[i]);
        uint b = REVERSE_BYTE(texel_weight_data[15 - i]);
#undef REVERSE_BYTE
        texel_weight_data[i] = uint(bitfieldExtract(b, 0, 8));
        texel_weight_data[15 - i] = uint(bitfieldExtract(a, 0, 8));
    }
    // Zero everything past the last weight bit so that only weight data remains.
    uint clear_byte_start = (weight_bits >> 3) + 1;
    texel_weight_data[clear_byte_start - 1] =
        texel_weight_data[clear_byte_start - 1] & uint(((1 << (weight_bits % 8)) - 1));
    for (uint i = 0; i < 16 - clear_byte_start; i++) {
        texel_weight_data[clear_byte_start + i] = 0U;
    }
    texel_flag = true; // use texel "vector" and bit stream in integer decoding
    DecodeIntegerSequence(params.max_weight, GetNumWeightValues(params.size, params.dual_plane));

    UnquantizeTexelWeights(params.dual_plane, params.size);

    for (uint j = 0; j < block_dims.y; j++) {
        for (uint i = 0; i < block_dims.x; i++) {
            uint local_partition = Select2DPartition(partition_index, i, j, num_partitions,
                                                     (block_dims.y * block_dims.x) < 32);
            uvec4 C0 = ReplicateByteTo16(endpoints[local_partition][0]);
            uvec4 C1 = ReplicateByteTo16(endpoints[local_partition][1]);
            // In dual-plane mode one component takes its weight from plane 1.
            uvec4 plane_vec = uvec4(0);
            uvec4 weight_vec = uvec4(0);
            for (uint c = 0; c < 4; c++) {
                if (params.dual_plane && (((plane_index + 1) & 3) == c)) {
                    plane_vec[c] = 1;
                }
                weight_vec[c] = unquantized_texel_weights[plane_vec[c]][j * block_dims.x + i];
            }
            // Weighted blend of the 16-bit endpoints, rounded, then normalized.
            vec4 Cf = vec4((C0 * (uvec4(64) - weight_vec) + C1 * weight_vec + uvec4(32)) / 64);
            vec4 p = (Cf / 65535.0);
            // Endpoints are kept as (a, r, g, b); reorder to (r, g, b, a) for the store.
            imageStore(dest_image, coord + ivec3(i, j, 0), p.gbar);
        }
    }
}
| 1312 | |||
// Entry point: one invocation decodes one compressed block. It computes the
// byte offset of the block inside the swizzled source buffer, copies the 16
// block bytes into local_buff, resets the bit-reader state and decodes the
// block into dest_image.
void main() {
    uvec3 pos = gl_GlobalInvocationID;
    pos.x <<= bytes_per_block_log2;

    // Read as soon as possible due to its latency
    const uint swizzle = SwizzleOffset(pos.xy);

    const uint block_y = pos.y >> GOB_SIZE_Y_SHIFT;

    // Byte offset of this block: layer, block row, GOB row inside the block,
    // GOB column, then the offset inside the GOB.
    const uint layer_offset = pos.z * layer_stride;
    const uint block_row_offset = (block_y >> block_height) * block_size;
    const uint gob_row_offset = (block_y & block_height_mask) << GOB_SIZE_SHIFT;
    const uint gob_column_offset = (pos.x >> GOB_SIZE_X_SHIFT) << x_shift;
    const uint offset =
        layer_offset + block_row_offset + gob_row_offset + gob_column_offset + swizzle;

    // Destination texel of the block's top-left corner.
    const ivec3 coord = ivec3(gl_GlobalInvocationID * uvec3(block_dims, 1));
    uint block_index =
        pos.z * gl_WorkGroupSize.x * gl_WorkGroupSize.y + pos.y * gl_WorkGroupSize.x + pos.x;

    // Reset the bit-reader state and load the 16 bytes of the block.
    current_index = 0;
    bitsread = 0;
    for (int byte_index = 0; byte_index < 16; byte_index++) {
        local_buff[byte_index] = ReadTexel(offset + byte_index);
    }
    DecompressBlock(coord, block_index);
}
diff --git a/src/video_core/host_shaders/source_shader.h.in b/src/video_core/host_shaders/source_shader.h.in index ccdb0d2a9..929dec39b 100644 --- a/src/video_core/host_shaders/source_shader.h.in +++ b/src/video_core/host_shaders/source_shader.h.in | |||
| @@ -4,6 +4,8 @@ | |||
| 4 | 4 | ||
| 5 | namespace HostShaders { | 5 | namespace HostShaders { |
| 6 | 6 | ||
| 7 | constexpr std::string_view @CONTENTS_NAME@ = R"(@CONTENTS@)"; | 7 | constexpr std::string_view @CONTENTS_NAME@ = { |
| 8 | @CONTENTS@ | ||
| 9 | }; | ||
| 8 | 10 | ||
| 9 | } // namespace HostShaders | 11 | } // namespace HostShaders |