diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/video_core/CMakeLists.txt | 1 | ||||
| -rw-r--r-- | src/video_core/host_shaders/astc_decoder.comp | 569 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/gl_texture_cache.cpp | 11 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/util_shaders.cpp | 96 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/util_shaders.h | 8 | ||||
| -rw-r--r-- | src/video_core/renderer_vulkan/vk_compute_pass.cpp | 175 | ||||
| -rw-r--r-- | src/video_core/texture_cache/util.cpp | 14 | ||||
| -rw-r--r-- | src/video_core/textures/astc.cpp | 1710 | ||||
| -rw-r--r-- | src/video_core/textures/astc.h | 174 |
9 files changed, 502 insertions, 2256 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 9b931976a..47190c464 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt | |||
| @@ -236,7 +236,6 @@ add_library(video_core STATIC | |||
| 236 | texture_cache/types.h | 236 | texture_cache/types.h |
| 237 | texture_cache/util.cpp | 237 | texture_cache/util.cpp |
| 238 | texture_cache/util.h | 238 | texture_cache/util.h |
| 239 | textures/astc.cpp | ||
| 240 | textures/astc.h | 239 | textures/astc.h |
| 241 | textures/decoders.cpp | 240 | textures/decoders.cpp |
| 242 | textures/decoders.h | 241 | textures/decoders.h |
diff --git a/src/video_core/host_shaders/astc_decoder.comp b/src/video_core/host_shaders/astc_decoder.comp index b903a2d37..703e34587 100644 --- a/src/video_core/host_shaders/astc_decoder.comp +++ b/src/video_core/host_shaders/astc_decoder.comp | |||
| @@ -9,13 +9,13 @@ | |||
| 9 | #define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants { | 9 | #define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants { |
| 10 | #define END_PUSH_CONSTANTS }; | 10 | #define END_PUSH_CONSTANTS }; |
| 11 | #define UNIFORM(n) | 11 | #define UNIFORM(n) |
| 12 | #define BINDING_SWIZZLE_BUFFER 0 | 12 | #define BINDING_INPUT_BUFFER 0 |
| 13 | #define BINDING_INPUT_BUFFER 1 | 13 | #define BINDING_ENC_BUFFER 1 |
| 14 | #define BINDING_ENC_BUFFER 2 | 14 | #define BINDING_6_TO_8_BUFFER 2 |
| 15 | #define BINDING_6_TO_8_BUFFER 3 | 15 | #define BINDING_7_TO_8_BUFFER 3 |
| 16 | #define BINDING_7_TO_8_BUFFER 4 | 16 | #define BINDING_8_TO_8_BUFFER 4 |
| 17 | #define BINDING_8_TO_8_BUFFER 5 | 17 | #define BINDING_BYTE_TO_16_BUFFER 5 |
| 18 | #define BINDING_BYTE_TO_16_BUFFER 6 | 18 | #define BINDING_SWIZZLE_BUFFER 6 |
| 19 | #define BINDING_OUTPUT_IMAGE 7 | 19 | #define BINDING_OUTPUT_IMAGE 7 |
| 20 | 20 | ||
| 21 | #else // ^^^ Vulkan ^^^ // vvv OpenGL vvv | 21 | #else // ^^^ Vulkan ^^^ // vvv OpenGL vvv |
| @@ -37,28 +37,16 @@ | |||
| 37 | layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in; | 37 | layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in; |
| 38 | 38 | ||
| 39 | BEGIN_PUSH_CONSTANTS | 39 | BEGIN_PUSH_CONSTANTS |
| 40 | UNIFORM(0) uvec2 num_image_blocks; | ||
| 41 | UNIFORM(1) uvec2 block_dims; | 40 | UNIFORM(1) uvec2 block_dims; |
| 42 | 41 | ||
| 43 | UNIFORM(2) uvec3 origin; | 42 | UNIFORM(2) uint bytes_per_block_log2; |
| 44 | UNIFORM(3) ivec3 destination; | 43 | UNIFORM(3) uint layer_stride; |
| 45 | UNIFORM(4) uint bytes_per_block_log2; | 44 | UNIFORM(4) uint block_size; |
| 46 | UNIFORM(5) uint layer_stride; | 45 | UNIFORM(5) uint x_shift; |
| 47 | UNIFORM(6) uint block_size; | 46 | UNIFORM(6) uint block_height; |
| 48 | UNIFORM(7) uint x_shift; | 47 | UNIFORM(7) uint block_height_mask; |
| 49 | UNIFORM(8) uint block_height; | ||
| 50 | UNIFORM(9) uint block_height_mask; | ||
| 51 | END_PUSH_CONSTANTS | 48 | END_PUSH_CONSTANTS |
| 52 | 49 | ||
| 53 | uint current_index = 0; | ||
| 54 | int bitsread = 0; | ||
| 55 | uint total_bitsread = 0; | ||
| 56 | uint local_buff[16]; | ||
| 57 | |||
| 58 | const int JustBits = 0; | ||
| 59 | const int Quint = 1; | ||
| 60 | const int Trit = 2; | ||
| 61 | |||
| 62 | struct EncodingData { | 50 | struct EncodingData { |
| 63 | uint encoding; | 51 | uint encoding; |
| 64 | uint num_bits; | 52 | uint num_bits; |
| @@ -68,11 +56,11 @@ struct EncodingData { | |||
| 68 | 56 | ||
| 69 | struct TexelWeightParams { | 57 | struct TexelWeightParams { |
| 70 | uvec2 size; | 58 | uvec2 size; |
| 71 | bool dual_plane; | ||
| 72 | uint max_weight; | 59 | uint max_weight; |
| 73 | bool Error; | 60 | bool dual_plane; |
| 74 | bool VoidExtentLDR; | 61 | bool error_state; |
| 75 | bool VoidExtentHDR; | 62 | bool void_extent_ldr; |
| 63 | bool void_extent_hdr; | ||
| 76 | }; | 64 | }; |
| 77 | 65 | ||
| 78 | // Swizzle data | 66 | // Swizzle data |
| @@ -116,6 +104,75 @@ const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_SHI | |||
| 116 | 104 | ||
| 117 | const uvec2 SWIZZLE_MASK = uvec2(GOB_SIZE_X - 1, GOB_SIZE_Y - 1); | 105 | const uvec2 SWIZZLE_MASK = uvec2(GOB_SIZE_X - 1, GOB_SIZE_Y - 1); |
| 118 | 106 | ||
| 107 | const int BLOCK_SIZE_IN_BYTES = 16; | ||
| 108 | |||
| 109 | const int BLOCK_INFO_ERROR = 0; | ||
| 110 | const int BLOCK_INFO_VOID_EXTENT_HDR = 1; | ||
| 111 | const int BLOCK_INFO_VOID_EXTENT_LDR = 2; | ||
| 112 | const int BLOCK_INFO_NORMAL = 3; | ||
| 113 | |||
| 114 | const int JUST_BITS = 0; | ||
| 115 | const int QUINT = 1; | ||
| 116 | const int TRIT = 2; | ||
| 117 | |||
| 118 | // The following constants are expanded variants of the Replicate() | ||
| 119 | // function calls corresponding to the following arguments: | ||
| 120 | // value: index into the generated table | ||
| 121 | // num_bits: the integer after "REPLICATE" in the table name. i.e. 4 is num_bits in REPLICATE_4. | ||
| 122 | // to_bit: the integer after "TO_" | ||
| 123 | const uint REPLICATE_BIT_TO_7_TABLE[2] = uint[](0, 127); | ||
| 124 | const uint REPLICATE_1_BIT_TO_9_TABLE[2] = uint[](0, 511); | ||
| 125 | |||
| 126 | const uint REPLICATE_1_BIT_TO_8_TABLE[2] = uint[](0, 255); | ||
| 127 | const uint REPLICATE_2_BIT_TO_8_TABLE[4] = uint[](0, 85, 170, 255); | ||
| 128 | const uint REPLICATE_3_BIT_TO_8_TABLE[8] = uint[](0, 36, 73, 109, 146, 182, 219, 255); | ||
| 129 | const uint REPLICATE_4_BIT_TO_8_TABLE[16] = | ||
| 130 | uint[](0, 17, 34, 51, 68, 85, 102, 119, 136, 153, 170, 187, 204, 221, 238, 255); | ||
| 131 | const uint REPLICATE_5_BIT_TO_8_TABLE[32] = | ||
| 132 | uint[](0, 8, 16, 24, 33, 41, 49, 57, 66, 74, 82, 90, 99, 107, 115, 123, 132, 140, 148, 156, 165, | ||
| 133 | 173, 181, 189, 198, 206, 214, 222, 231, 239, 247, 255); | ||
| 134 | const uint REPLICATE_1_BIT_TO_6_TABLE[2] = uint[](0, 63); | ||
| 135 | const uint REPLICATE_2_BIT_TO_6_TABLE[4] = uint[](0, 21, 42, 63); | ||
| 136 | const uint REPLICATE_3_BIT_TO_6_TABLE[8] = uint[](0, 9, 18, 27, 36, 45, 54, 63); | ||
| 137 | const uint REPLICATE_4_BIT_TO_6_TABLE[16] = | ||
| 138 | uint[](0, 4, 8, 12, 17, 21, 25, 29, 34, 38, 42, 46, 51, 55, 59, 63); | ||
| 139 | const uint REPLICATE_5_BIT_TO_6_TABLE[32] = | ||
| 140 | uint[](0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 33, 35, 37, 39, 41, 43, 45, | ||
| 141 | 47, 49, 51, 53, 55, 57, 59, 61, 63); | ||
| 142 | |||
| 143 | // Input ASTC texture globals | ||
| 144 | uint current_index = 0; | ||
| 145 | int bitsread = 0; | ||
| 146 | uint total_bitsread = 0; | ||
| 147 | uint local_buff[16]; | ||
| 148 | |||
| 149 | // Color data globals | ||
| 150 | uint color_endpoint_data[16]; | ||
| 151 | int color_bitsread = 0; | ||
| 152 | uint total_color_bitsread = 0; | ||
| 153 | int color_index = 0; | ||
| 154 | |||
| 155 | // Four values, two endpoints, four maximum partitions | ||
| 156 | uint color_values[32]; | ||
| 157 | int colvals_index = 0; | ||
| 158 | |||
| 159 | // Weight data globals | ||
| 160 | uint texel_weight_data[16]; | ||
| 161 | int texel_bitsread = 0; | ||
| 162 | uint total_texel_bitsread = 0; | ||
| 163 | int texel_index = 0; | ||
| 164 | |||
| 165 | bool texel_flag = false; | ||
| 166 | |||
| 167 | // Global "vectors" to be pushed into when decoding | ||
| 168 | EncodingData result_vector[100]; | ||
| 169 | int result_index = 0; | ||
| 170 | |||
| 171 | EncodingData texel_vector[100]; | ||
| 172 | int texel_vector_index = 0; | ||
| 173 | |||
| 174 | uint unquantized_texel_weights[2][144]; | ||
| 175 | |||
| 119 | uint SwizzleOffset(uvec2 pos) { | 176 | uint SwizzleOffset(uvec2 pos) { |
| 120 | pos = pos & SWIZZLE_MASK; | 177 | pos = pos & SWIZZLE_MASK; |
| 121 | return swizzle_table[pos.y * 64 + pos.x]; | 178 | return swizzle_table[pos.y * 64 + pos.x]; |
| @@ -126,21 +183,10 @@ uint ReadTexel(uint offset) { | |||
| 126 | return bitfieldExtract(astc_data[offset / 4], int((offset * 8) & 24), 8); | 183 | return bitfieldExtract(astc_data[offset / 4], int((offset * 8) & 24), 8); |
| 127 | } | 184 | } |
| 128 | 185 | ||
| 129 | 186 | // Replicates low num_bits such that [(to_bit - 1):(to_bit - 1 - from_bit)] | |
| 130 | const int BLOCK_SIZE_IN_BYTES = 16; | 187 | // is the same as [(num_bits - 1):0] and repeats all the way down. |
| 131 | |||
| 132 | const int BLOCK_INFO_ERROR = 0; | ||
| 133 | const int BLOCK_INFO_VOID_EXTENT_HDR = 1; | ||
| 134 | const int BLOCK_INFO_VOID_EXTENT_LDR = 2; | ||
| 135 | const int BLOCK_INFO_NORMAL = 3; | ||
| 136 | |||
| 137 | // Replicates low numBits such that [(toBit - 1):(toBit - 1 - fromBit)] | ||
| 138 | // is the same as [(numBits - 1):0] and repeats all the way down. | ||
| 139 | uint Replicate(uint val, uint num_bits, uint to_bit) { | 188 | uint Replicate(uint val, uint num_bits, uint to_bit) { |
| 140 | if (num_bits == 0) { | 189 | if (num_bits == 0 || to_bit == 0) { |
| 141 | return 0; | ||
| 142 | } | ||
| 143 | if (to_bit == 0) { | ||
| 144 | return 0; | 190 | return 0; |
| 145 | } | 191 | } |
| 146 | const uint v = val & uint((1 << num_bits) - 1); | 192 | const uint v = val & uint((1 << num_bits) - 1); |
| @@ -165,26 +211,14 @@ uvec4 ReplicateByteTo16(uvec4 value) { | |||
| 165 | REPLICATE_BYTE_TO_16_TABLE[value.z], REPLICATE_BYTE_TO_16_TABLE[value.w]); | 211 | REPLICATE_BYTE_TO_16_TABLE[value.z], REPLICATE_BYTE_TO_16_TABLE[value.w]); |
| 166 | } | 212 | } |
| 167 | 213 | ||
| 168 | const uint REPLICATE_BIT_TO_7_TABLE[2] = uint[](0, 127); | ||
| 169 | uint ReplicateBitTo7(uint value) { | 214 | uint ReplicateBitTo7(uint value) { |
| 170 | return REPLICATE_BIT_TO_7_TABLE[value]; | 215 | return REPLICATE_BIT_TO_7_TABLE[value]; |
| 171 | ; | ||
| 172 | } | 216 | } |
| 173 | 217 | ||
| 174 | const uint REPLICATE_1_BIT_TO_9_TABLE[2] = uint[](0, 511); | ||
| 175 | uint ReplicateBitTo9(uint value) { | 218 | uint ReplicateBitTo9(uint value) { |
| 176 | return REPLICATE_1_BIT_TO_9_TABLE[value]; | 219 | return REPLICATE_1_BIT_TO_9_TABLE[value]; |
| 177 | } | 220 | } |
| 178 | 221 | ||
| 179 | const uint REPLICATE_1_BIT_TO_8_TABLE[2] = uint[](0, 255); | ||
| 180 | const uint REPLICATE_2_BIT_TO_8_TABLE[4] = uint[](0, 85, 170, 255); | ||
| 181 | const uint REPLICATE_3_BIT_TO_8_TABLE[8] = uint[](0, 36, 73, 109, 146, 182, 219, 255); | ||
| 182 | const uint REPLICATE_4_BIT_TO_8_TABLE[16] = | ||
| 183 | uint[](0, 17, 34, 51, 68, 85, 102, 119, 136, 153, 170, 187, 204, 221, 238, 255); | ||
| 184 | const uint REPLICATE_5_BIT_TO_8_TABLE[32] = | ||
| 185 | uint[](0, 8, 16, 24, 33, 41, 49, 57, 66, 74, 82, 90, 99, 107, 115, 123, 132, 140, 148, 156, 165, | ||
| 186 | 173, 181, 189, 198, 206, 214, 222, 231, 239, 247, 255); | ||
| 187 | |||
| 188 | uint FastReplicateTo8(uint value, uint num_bits) { | 222 | uint FastReplicateTo8(uint value, uint num_bits) { |
| 189 | switch (num_bits) { | 223 | switch (num_bits) { |
| 190 | case 1: | 224 | case 1: |
| @@ -207,15 +241,6 @@ uint FastReplicateTo8(uint value, uint num_bits) { | |||
| 207 | return Replicate(value, num_bits, 8); | 241 | return Replicate(value, num_bits, 8); |
| 208 | } | 242 | } |
| 209 | 243 | ||
| 210 | const uint REPLICATE_1_BIT_TO_6_TABLE[2] = uint[](0, 63); | ||
| 211 | const uint REPLICATE_2_BIT_TO_6_TABLE[4] = uint[](0, 21, 42, 63); | ||
| 212 | const uint REPLICATE_3_BIT_TO_6_TABLE[8] = uint[](0, 9, 18, 27, 36, 45, 54, 63); | ||
| 213 | const uint REPLICATE_4_BIT_TO_6_TABLE[16] = | ||
| 214 | uint[](0, 4, 8, 12, 17, 21, 25, 29, 34, 38, 42, 46, 51, 55, 59, 63); | ||
| 215 | const uint REPLICATE_5_BIT_TO_6_TABLE[32] = | ||
| 216 | uint[](0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 33, 35, 37, 39, 41, 43, 45, | ||
| 217 | 47, 49, 51, 53, 55, 57, 59, 61, 63); | ||
| 218 | |||
| 219 | uint FastReplicateTo6(uint value, uint num_bits) { | 244 | uint FastReplicateTo6(uint value, uint num_bits) { |
| 220 | switch (num_bits) { | 245 | switch (num_bits) { |
| 221 | case 1: | 246 | case 1: |
| @@ -232,7 +257,23 @@ uint FastReplicateTo6(uint value, uint num_bits) { | |||
| 232 | return Replicate(value, num_bits, 6); | 257 | return Replicate(value, num_bits, 6); |
| 233 | } | 258 | } |
| 234 | 259 | ||
| 235 | uint hash52(uint p) { | 260 | uint Div3Floor(uint v) { |
| 261 | return (v * 0x5556) >> 16; | ||
| 262 | } | ||
| 263 | |||
| 264 | uint Div3Ceil(uint v) { | ||
| 265 | return Div3Floor(v + 2); | ||
| 266 | } | ||
| 267 | |||
| 268 | uint Div5Floor(uint v) { | ||
| 269 | return (v * 0x3334) >> 16; | ||
| 270 | } | ||
| 271 | |||
| 272 | uint Div5Ceil(uint v) { | ||
| 273 | return Div5Floor(v + 4); | ||
| 274 | } | ||
| 275 | |||
| 276 | uint Hash52(uint p) { | ||
| 236 | p ^= p >> 15; | 277 | p ^= p >> 15; |
| 237 | p -= p << 17; | 278 | p -= p << 17; |
| 238 | p += p << 7; | 279 | p += p << 7; |
| @@ -247,9 +288,9 @@ uint hash52(uint p) { | |||
| 247 | } | 288 | } |
| 248 | 289 | ||
| 249 | uint SelectPartition(uint seed, uint x, uint y, uint z, uint partition_count, bool small_block) { | 290 | uint SelectPartition(uint seed, uint x, uint y, uint z, uint partition_count, bool small_block) { |
| 250 | if (1 == partition_count) | 291 | if (partition_count == 1) { |
| 251 | return 0; | 292 | return 0; |
| 252 | 293 | } | |
| 253 | if (small_block) { | 294 | if (small_block) { |
| 254 | x <<= 1; | 295 | x <<= 1; |
| 255 | y <<= 1; | 296 | y <<= 1; |
| @@ -258,7 +299,7 @@ uint SelectPartition(uint seed, uint x, uint y, uint z, uint partition_count, bo | |||
| 258 | 299 | ||
| 259 | seed += (partition_count - 1) * 1024; | 300 | seed += (partition_count - 1) * 1024; |
| 260 | 301 | ||
| 261 | uint rnum = hash52(uint(seed)); | 302 | uint rnum = Hash52(uint(seed)); |
| 262 | uint seed1 = uint(rnum & 0xF); | 303 | uint seed1 = uint(rnum & 0xF); |
| 263 | uint seed2 = uint((rnum >> 4) & 0xF); | 304 | uint seed2 = uint((rnum >> 4) & 0xF); |
| 264 | uint seed3 = uint((rnum >> 8) & 0xF); | 305 | uint seed3 = uint((rnum >> 8) & 0xF); |
| @@ -318,18 +359,22 @@ uint SelectPartition(uint seed, uint x, uint y, uint z, uint partition_count, bo | |||
| 318 | c &= 0x3F; | 359 | c &= 0x3F; |
| 319 | d &= 0x3F; | 360 | d &= 0x3F; |
| 320 | 361 | ||
| 321 | if (partition_count < 4) | 362 | if (partition_count < 4) { |
| 322 | d = 0; | 363 | d = 0; |
| 323 | if (partition_count < 3) | 364 | } |
| 365 | if (partition_count < 3) { | ||
| 324 | c = 0; | 366 | c = 0; |
| 367 | } | ||
| 325 | 368 | ||
| 326 | if (a >= b && a >= c && a >= d) | 369 | if (a >= b && a >= c && a >= d) { |
| 327 | return 0; | 370 | return 0; |
| 328 | else if (b >= c && b >= d) | 371 | } else if (b >= c && b >= d) { |
| 329 | return 1; | 372 | return 1; |
| 330 | else if (c >= d) | 373 | } else if (c >= d) { |
| 331 | return 2; | 374 | return 2; |
| 332 | return 3; | 375 | } else { |
| 376 | return 3; | ||
| 377 | } | ||
| 333 | } | 378 | } |
| 334 | 379 | ||
| 335 | uint Select2DPartition(uint seed, uint x, uint y, uint partition_count, bool small_block) { | 380 | uint Select2DPartition(uint seed, uint x, uint y, uint partition_count, bool small_block) { |
| @@ -341,10 +386,10 @@ uint ReadBit() { | |||
| 341 | return 0; | 386 | return 0; |
| 342 | } | 387 | } |
| 343 | uint bit = bitfieldExtract(local_buff[current_index], bitsread, 1); | 388 | uint bit = bitfieldExtract(local_buff[current_index], bitsread, 1); |
| 344 | bitsread++; | 389 | ++bitsread; |
| 345 | total_bitsread++; | 390 | ++total_bitsread; |
| 346 | if (bitsread == 8) { | 391 | if (bitsread == 8) { |
| 347 | current_index++; | 392 | ++current_index; |
| 348 | bitsread = 0; | 393 | bitsread = 0; |
| 349 | } | 394 | } |
| 350 | return bit; | 395 | return bit; |
| @@ -358,36 +403,22 @@ uint StreamBits(uint num_bits) { | |||
| 358 | return ret; | 403 | return ret; |
| 359 | } | 404 | } |
| 360 | 405 | ||
| 361 | // Define color data. | ||
| 362 | uint color_endpoint_data[16]; | ||
| 363 | int color_bitsread = 0; | ||
| 364 | uint total_color_bitsread = 0; | ||
| 365 | int color_index = 0; | ||
| 366 | |||
| 367 | // Define color data. | ||
| 368 | uint texel_weight_data[16]; | ||
| 369 | int texel_bitsread = 0; | ||
| 370 | uint total_texel_bitsread = 0; | ||
| 371 | int texel_index = 0; | ||
| 372 | |||
| 373 | bool texel_flag = false; | ||
| 374 | |||
| 375 | uint ReadColorBit() { | 406 | uint ReadColorBit() { |
| 376 | uint bit = 0; | 407 | uint bit = 0; |
| 377 | if (texel_flag) { | 408 | if (texel_flag) { |
| 378 | bit = bitfieldExtract(texel_weight_data[texel_index], texel_bitsread, 1); | 409 | bit = bitfieldExtract(texel_weight_data[texel_index], texel_bitsread, 1); |
| 379 | texel_bitsread++; | 410 | ++texel_bitsread; |
| 380 | total_texel_bitsread++; | 411 | ++total_texel_bitsread; |
| 381 | if (texel_bitsread == 8) { | 412 | if (texel_bitsread == 8) { |
| 382 | texel_index++; | 413 | ++texel_index; |
| 383 | texel_bitsread = 0; | 414 | texel_bitsread = 0; |
| 384 | } | 415 | } |
| 385 | } else { | 416 | } else { |
| 386 | bit = bitfieldExtract(color_endpoint_data[color_index], color_bitsread, 1); | 417 | bit = bitfieldExtract(color_endpoint_data[color_index], color_bitsread, 1); |
| 387 | color_bitsread++; | 418 | ++color_bitsread; |
| 388 | total_color_bitsread++; | 419 | ++total_color_bitsread; |
| 389 | if (color_bitsread == 8) { | 420 | if (color_bitsread == 8) { |
| 390 | color_index++; | 421 | ++color_index; |
| 391 | color_bitsread = 0; | 422 | color_bitsread = 0; |
| 392 | } | 423 | } |
| 393 | } | 424 | } |
| @@ -402,31 +433,25 @@ uint StreamColorBits(uint num_bits) { | |||
| 402 | return ret; | 433 | return ret; |
| 403 | } | 434 | } |
| 404 | 435 | ||
| 405 | EncodingData result_vector[100]; | ||
| 406 | int result_index = 0; | ||
| 407 | |||
| 408 | EncodingData texel_vector[100]; | ||
| 409 | int texel_vector_index = 0; | ||
| 410 | |||
| 411 | void ResultEmplaceBack(EncodingData val) { | 436 | void ResultEmplaceBack(EncodingData val) { |
| 412 | if (texel_flag) { | 437 | if (texel_flag) { |
| 413 | texel_vector[texel_vector_index] = val; | 438 | texel_vector[texel_vector_index] = val; |
| 414 | texel_vector_index++; | 439 | ++texel_vector_index; |
| 415 | } else { | 440 | } else { |
| 416 | result_vector[result_index] = val; | 441 | result_vector[result_index] = val; |
| 417 | result_index++; | 442 | ++result_index; |
| 418 | } | 443 | } |
| 419 | } | 444 | } |
| 420 | 445 | ||
| 421 | // Returns the number of bits required to encode n_vals values. | 446 | // Returns the number of bits required to encode n_vals values. |
| 422 | uint GetBitLength(uint n_vals, uint encoding_index) { | 447 | uint GetBitLength(uint n_vals, uint encoding_index) { |
| 423 | uint totalBits = encoding_values[encoding_index].num_bits * n_vals; | 448 | uint total_bits = encoding_values[encoding_index].num_bits * n_vals; |
| 424 | if (encoding_values[encoding_index].encoding == Trit) { | 449 | if (encoding_values[encoding_index].encoding == TRIT) { |
| 425 | totalBits += (n_vals * 8 + 4) / 5; | 450 | total_bits += Div5Ceil(n_vals * 8); |
| 426 | } else if (encoding_values[encoding_index].encoding == Quint) { | 451 | } else if (encoding_values[encoding_index].encoding == QUINT) { |
| 427 | totalBits += (n_vals * 7 + 2) / 3; | 452 | total_bits += Div3Ceil(n_vals * 7); |
| 428 | } | 453 | } |
| 429 | return totalBits; | 454 | return total_bits; |
| 430 | } | 455 | } |
| 431 | 456 | ||
| 432 | uint GetNumWeightValues(uvec2 size, bool dual_plane) { | 457 | uint GetNumWeightValues(uvec2 size, bool dual_plane) { |
| @@ -459,7 +484,7 @@ uint BitsOp(uint bits, uint start, uint end) { | |||
| 459 | return ((bits >> start) & mask); | 484 | return ((bits >> start) & mask); |
| 460 | } | 485 | } |
| 461 | 486 | ||
| 462 | void DecodeQuintBlock(uint num_bits) { // Value number of bits | 487 | void DecodeQuintBlock(uint num_bits) { |
| 463 | uint m[3]; | 488 | uint m[3]; |
| 464 | uint q[3]; | 489 | uint q[3]; |
| 465 | uint Q; | 490 | uint Q; |
| @@ -483,7 +508,6 @@ void DecodeQuintBlock(uint num_bits) { // Value number of bits | |||
| 483 | q[2] = BitsOp(Q, 5, 6); | 508 | q[2] = BitsOp(Q, 5, 6); |
| 484 | C = BitsOp(Q, 0, 4); | 509 | C = BitsOp(Q, 0, 4); |
| 485 | } | 510 | } |
| 486 | |||
| 487 | if (BitsOp(C, 0, 2) == 5) { | 511 | if (BitsOp(C, 0, 2) == 5) { |
| 488 | q[1] = 4; | 512 | q[1] = 4; |
| 489 | q[0] = BitsOp(C, 3, 4); | 513 | q[0] = BitsOp(C, 3, 4); |
| @@ -492,10 +516,9 @@ void DecodeQuintBlock(uint num_bits) { // Value number of bits | |||
| 492 | q[0] = BitsOp(C, 0, 2); | 516 | q[0] = BitsOp(C, 0, 2); |
| 493 | } | 517 | } |
| 494 | } | 518 | } |
| 495 | |||
| 496 | for (uint i = 0; i < 3; i++) { | 519 | for (uint i = 0; i < 3; i++) { |
| 497 | EncodingData val; | 520 | EncodingData val; |
| 498 | val.encoding = Quint; | 521 | val.encoding = QUINT; |
| 499 | val.num_bits = num_bits; | 522 | val.num_bits = num_bits; |
| 500 | val.bit_value = m[i]; | 523 | val.bit_value = m[i]; |
| 501 | val.quint_trit_value = q[i]; | 524 | val.quint_trit_value = q[i]; |
| @@ -547,29 +570,28 @@ void DecodeTritBlock(uint num_bits) { | |||
| 547 | } | 570 | } |
| 548 | for (uint i = 0; i < 5; i++) { | 571 | for (uint i = 0; i < 5; i++) { |
| 549 | EncodingData val; | 572 | EncodingData val; |
| 550 | val.encoding = Trit; | 573 | val.encoding = TRIT; |
| 551 | val.num_bits = num_bits; | 574 | val.num_bits = num_bits; |
| 552 | val.bit_value = m[i]; | 575 | val.bit_value = m[i]; |
| 553 | val.quint_trit_value = t[i]; | 576 | val.quint_trit_value = t[i]; |
| 554 | ResultEmplaceBack(val); | 577 | ResultEmplaceBack(val); |
| 555 | } | 578 | } |
| 556 | } | 579 | } |
| 580 | |||
| 557 | void DecodeIntegerSequence(uint max_range, uint num_values) { | 581 | void DecodeIntegerSequence(uint max_range, uint num_values) { |
| 558 | EncodingData val = encoding_values[max_range]; | 582 | EncodingData val = encoding_values[max_range]; |
| 559 | uint vals_decoded = 0; | 583 | uint vals_decoded = 0; |
| 560 | while (vals_decoded < num_values) { | 584 | while (vals_decoded < num_values) { |
| 561 | switch (val.encoding) { | 585 | switch (val.encoding) { |
| 562 | case Quint: | 586 | case QUINT: |
| 563 | DecodeQuintBlock(val.num_bits); | 587 | DecodeQuintBlock(val.num_bits); |
| 564 | vals_decoded += 3; | 588 | vals_decoded += 3; |
| 565 | break; | 589 | break; |
| 566 | 590 | case TRIT: | |
| 567 | case Trit: | ||
| 568 | DecodeTritBlock(val.num_bits); | 591 | DecodeTritBlock(val.num_bits); |
| 569 | vals_decoded += 5; | 592 | vals_decoded += 5; |
| 570 | break; | 593 | break; |
| 571 | 594 | case JUST_BITS: | |
| 572 | case JustBits: | ||
| 573 | val.bit_value = StreamColorBits(val.num_bits); | 595 | val.bit_value = StreamColorBits(val.num_bits); |
| 574 | ResultEmplaceBack(val); | 596 | ResultEmplaceBack(val); |
| 575 | vals_decoded++; | 597 | vals_decoded++; |
| @@ -578,8 +600,7 @@ void DecodeIntegerSequence(uint max_range, uint num_values) { | |||
| 578 | } | 600 | } |
| 579 | } | 601 | } |
| 580 | 602 | ||
| 581 | void DecodeColorValues(out uint color_values[32], uvec4 modes, uint num_partitions, | 603 | void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits) { |
| 582 | uint color_data_bits) { | ||
| 583 | uint num_values = 0; | 604 | uint num_values = 0; |
| 584 | for (uint i = 0; i < num_partitions; i++) { | 605 | for (uint i = 0; i < num_partitions; i++) { |
| 585 | num_values += ((modes[i] >> 2) + 1) << 1; | 606 | num_values += ((modes[i] >> 2) + 1) << 1; |
| @@ -587,21 +608,21 @@ void DecodeColorValues(out uint color_values[32], uvec4 modes, uint num_partitio | |||
| 587 | int range = 256; | 608 | int range = 256; |
| 588 | while (--range > 0) { | 609 | while (--range > 0) { |
| 589 | EncodingData val = encoding_values[range]; | 610 | EncodingData val = encoding_values[range]; |
| 590 | uint bitLength = GetBitLength(num_values, range); | 611 | uint bit_length = GetBitLength(num_values, range); |
| 591 | if (bitLength <= color_data_bits) { | 612 | if (bit_length <= color_data_bits) { |
| 592 | while (--range > 0) { | 613 | while (--range > 0) { |
| 593 | EncodingData newval = encoding_values[range]; | 614 | EncodingData newval = encoding_values[range]; |
| 594 | if (newval.encoding != val.encoding && newval.num_bits != val.num_bits) { | 615 | if (newval.encoding != val.encoding && newval.num_bits != val.num_bits) { |
| 595 | break; | 616 | break; |
| 596 | } | 617 | } |
| 597 | } | 618 | } |
| 598 | range++; | 619 | ++range; |
| 599 | break; | 620 | break; |
| 600 | } | 621 | } |
| 601 | } | 622 | } |
| 602 | DecodeIntegerSequence(range, num_values); | 623 | DecodeIntegerSequence(range, num_values); |
| 603 | uint out_index = 0; | 624 | uint out_index = 0; |
| 604 | for (int itr = 0; itr < result_index; itr++) { | 625 | for (int itr = 0; itr < result_index; ++itr) { |
| 605 | if (out_index >= num_values) { | 626 | if (out_index >= num_values) { |
| 606 | break; | 627 | break; |
| 607 | } | 628 | } |
| @@ -611,77 +632,83 @@ void DecodeColorValues(out uint color_values[32], uvec4 modes, uint num_partitio | |||
| 611 | uint A = 0, B = 0, C = 0, D = 0; | 632 | uint A = 0, B = 0, C = 0, D = 0; |
| 612 | A = ReplicateBitTo9((bitval & 1)); | 633 | A = ReplicateBitTo9((bitval & 1)); |
| 613 | switch (val.encoding) { | 634 | switch (val.encoding) { |
| 614 | case JustBits: | 635 | case JUST_BITS: |
| 615 | color_values[out_index++] = FastReplicateTo8(bitval, bitlen); | 636 | color_values[out_index++] = FastReplicateTo8(bitval, bitlen); |
| 616 | break; | 637 | break; |
| 617 | case Trit: { | 638 | case TRIT: { |
| 618 | D = val.quint_trit_value; | 639 | D = val.quint_trit_value; |
| 619 | switch (bitlen) { | 640 | switch (bitlen) { |
| 620 | case 1: { | 641 | case 1: |
| 621 | C = 204; | 642 | C = 204; |
| 622 | } break; | 643 | break; |
| 623 | case 2: { | 644 | case 2: { |
| 624 | C = 93; | 645 | C = 93; |
| 625 | uint b = (bitval >> 1) & 1; | 646 | uint b = (bitval >> 1) & 1; |
| 626 | B = (b << 8) | (b << 4) | (b << 2) | (b << 1); | 647 | B = (b << 8) | (b << 4) | (b << 2) | (b << 1); |
| 627 | } break; | 648 | break; |
| 628 | 649 | } | |
| 629 | case 3: { | 650 | case 3: { |
| 630 | C = 44; | 651 | C = 44; |
| 631 | uint cb = (bitval >> 1) & 3; | 652 | uint cb = (bitval >> 1) & 3; |
| 632 | B = (cb << 7) | (cb << 2) | cb; | 653 | B = (cb << 7) | (cb << 2) | cb; |
| 633 | } break; | 654 | break; |
| 634 | 655 | } | |
| 635 | case 4: { | 656 | case 4: { |
| 636 | C = 22; | 657 | C = 22; |
| 637 | uint dcb = (bitval >> 1) & 7; | 658 | uint dcb = (bitval >> 1) & 7; |
| 638 | B = (dcb << 6) | dcb; | 659 | B = (dcb << 6) | dcb; |
| 639 | } break; | 660 | break; |
| 640 | 661 | } | |
| 641 | case 5: { | 662 | case 5: { |
| 642 | C = 11; | 663 | C = 11; |
| 643 | uint edcb = (bitval >> 1) & 0xF; | 664 | uint edcb = (bitval >> 1) & 0xF; |
| 644 | B = (edcb << 5) | (edcb >> 2); | 665 | B = (edcb << 5) | (edcb >> 2); |
| 645 | } break; | 666 | break; |
| 646 | 667 | } | |
| 647 | case 6: { | 668 | case 6: { |
| 648 | C = 5; | 669 | C = 5; |
| 649 | uint fedcb = (bitval >> 1) & 0x1F; | 670 | uint fedcb = (bitval >> 1) & 0x1F; |
| 650 | B = (fedcb << 4) | (fedcb >> 4); | 671 | B = (fedcb << 4) | (fedcb >> 4); |
| 651 | } break; | 672 | break; |
| 652 | } | 673 | } |
| 653 | } break; | 674 | } |
| 654 | case Quint: { | 675 | break; |
| 676 | } | ||
| 677 | case QUINT: { | ||
| 655 | D = val.quint_trit_value; | 678 | D = val.quint_trit_value; |
| 656 | switch (bitlen) { | 679 | switch (bitlen) { |
| 657 | case 1: { | 680 | case 1: |
| 658 | C = 113; | 681 | C = 113; |
| 659 | } break; | 682 | break; |
| 660 | case 2: { | 683 | case 2: { |
| 661 | C = 54; | 684 | C = 54; |
| 662 | uint b = (bitval >> 1) & 1; | 685 | uint b = (bitval >> 1) & 1; |
| 663 | B = (b << 8) | (b << 3) | (b << 2); | 686 | B = (b << 8) | (b << 3) | (b << 2); |
| 664 | } break; | 687 | break; |
| 688 | } | ||
| 665 | case 3: { | 689 | case 3: { |
| 666 | C = 26; | 690 | C = 26; |
| 667 | uint cb = (bitval >> 1) & 3; | 691 | uint cb = (bitval >> 1) & 3; |
| 668 | B = (cb << 7) | (cb << 1) | (cb >> 1); | 692 | B = (cb << 7) | (cb << 1) | (cb >> 1); |
| 669 | } break; | 693 | break; |
| 694 | } | ||
| 670 | case 4: { | 695 | case 4: { |
| 671 | C = 13; | 696 | C = 13; |
| 672 | uint dcb = (bitval >> 1) & 7; | 697 | uint dcb = (bitval >> 1) & 7; |
| 673 | B = (dcb << 6) | (dcb >> 1); | 698 | B = (dcb << 6) | (dcb >> 1); |
| 674 | } break; | 699 | break; |
| 700 | } | ||
| 675 | case 5: { | 701 | case 5: { |
| 676 | C = 6; | 702 | C = 6; |
| 677 | uint edcb = (bitval >> 1) & 0xF; | 703 | uint edcb = (bitval >> 1) & 0xF; |
| 678 | B = (edcb << 5) | (edcb >> 3); | 704 | B = (edcb << 5) | (edcb >> 3); |
| 679 | } break; | 705 | break; |
| 680 | } | 706 | } |
| 681 | } break; | 707 | } |
| 708 | break; | ||
| 682 | } | 709 | } |
| 683 | 710 | } | |
| 684 | if (val.encoding != JustBits) { | 711 | if (val.encoding != JUST_BITS) { |
| 685 | uint T = (D * C) + B; | 712 | uint T = (D * C) + B; |
| 686 | T ^= A; | 713 | T ^= A; |
| 687 | T = (A & 0x80) | (T >> 2); | 714 | T = (A & 0x80) | (T >> 2); |
| @@ -689,30 +716,31 @@ void DecodeColorValues(out uint color_values[32], uvec4 modes, uint num_partitio | |||
| 689 | } | 716 | } |
| 690 | } | 717 | } |
| 691 | } | 718 | } |
| 719 | |||
| 692 | ivec2 BitTransferSigned(int a, int b) { | 720 | ivec2 BitTransferSigned(int a, int b) { |
| 693 | ivec2 transferred; | 721 | ivec2 transferred; |
| 694 | transferred[1] = b >> 1; | 722 | transferred.y = b >> 1; |
| 695 | transferred[1] |= a & 0x80; | 723 | transferred.y |= a & 0x80; |
| 696 | transferred[0] = a >> 1; | 724 | transferred.x = a >> 1; |
| 697 | transferred[0] &= 0x3F; | 725 | transferred.x &= 0x3F; |
| 698 | if ((transferred[0] & 0x20) > 0) { | 726 | if ((transferred.x & 0x20) > 0) { |
| 699 | transferred[0] -= 0x40; | 727 | transferred.x -= 0x40; |
| 700 | } | 728 | } |
| 701 | return transferred; | 729 | return transferred; |
| 702 | } | 730 | } |
| 703 | 731 | ||
| 704 | uvec4 ClampByte(ivec4 color) { | 732 | uvec4 ClampByte(ivec4 color) { |
| 705 | for (uint i = 0; i < 4; i++) { | 733 | for (uint i = 0; i < 4; ++i) { |
| 706 | color[i] = (color[i] < 0) ? 0 : ((color[i] > 255) ? 255 : color[i]); | 734 | color[i] = (color[i] < 0) ? 0 : ((color[i] > 255) ? 255 : color[i]); |
| 707 | } | 735 | } |
| 708 | return uvec4(color); | 736 | return uvec4(color); |
| 709 | } | 737 | } |
| 738 | |||
| 710 | ivec4 BlueContract(int a, int r, int g, int b) { | 739 | ivec4 BlueContract(int a, int r, int g, int b) { |
| 711 | return ivec4(a, (r + b) >> 1, (g + b) >> 1, b); | 740 | return ivec4(a, (r + b) >> 1, (g + b) >> 1, b); |
| 712 | } | 741 | } |
| 713 | int colvals_index = 0; | 742 | |
| 714 | void ComputeEndpoints(out uvec4 ep1, out uvec4 ep2, uint color_values[32], | 743 | void ComputeEndpoints(out uvec4 ep1, out uvec4 ep2, uint color_endpoint_mode) { |
| 715 | uint color_endpoint_mode) { | ||
| 716 | #define READ_UINT_VALUES(N) \ | 744 | #define READ_UINT_VALUES(N) \ |
| 717 | uint v[N]; \ | 745 | uint v[N]; \ |
| 718 | for (uint i = 0; i < N; i++) { \ | 746 | for (uint i = 0; i < N; i++) { \ |
| @@ -730,113 +758,120 @@ void ComputeEndpoints(out uvec4 ep1, out uvec4 ep2, uint color_values[32], | |||
| 730 | READ_UINT_VALUES(2) | 758 | READ_UINT_VALUES(2) |
| 731 | ep1 = uvec4(0xFF, v[0], v[0], v[0]); | 759 | ep1 = uvec4(0xFF, v[0], v[0], v[0]); |
| 732 | ep2 = uvec4(0xFF, v[1], v[1], v[1]); | 760 | ep2 = uvec4(0xFF, v[1], v[1], v[1]); |
| 733 | } break; | 761 | break; |
| 734 | 762 | } | |
| 735 | case 1: { | 763 | case 1: { |
| 736 | READ_UINT_VALUES(2) | 764 | READ_UINT_VALUES(2) |
| 737 | uint L0 = (v[0] >> 2) | (v[1] & 0xC0); | 765 | uint L0 = (v[0] >> 2) | (v[1] & 0xC0); |
| 738 | uint L1 = max(L0 + (v[1] & 0x3F), 0xFFU); | 766 | uint L1 = max(L0 + (v[1] & 0x3F), 0xFFU); |
| 739 | ep1 = uvec4(0xFF, L0, L0, L0); | 767 | ep1 = uvec4(0xFF, L0, L0, L0); |
| 740 | ep2 = uvec4(0xFF, L1, L1, L1); | 768 | ep2 = uvec4(0xFF, L1, L1, L1); |
| 741 | } break; | 769 | break; |
| 742 | 770 | } | |
| 743 | case 4: { | 771 | case 4: { |
| 744 | READ_UINT_VALUES(4) | 772 | READ_UINT_VALUES(4) |
| 745 | ep1 = uvec4(v[2], v[0], v[0], v[0]); | 773 | ep1 = uvec4(v[2], v[0], v[0], v[0]); |
| 746 | ep2 = uvec4(v[3], v[1], v[1], v[1]); | 774 | ep2 = uvec4(v[3], v[1], v[1], v[1]); |
| 747 | } break; | 775 | break; |
| 748 | 776 | } | |
| 749 | case 5: { | 777 | case 5: { |
| 750 | READ_INT_VALUES(4) | 778 | READ_INT_VALUES(4) |
| 751 | ivec2 transferred = BitTransferSigned(v[1], v[0]); | 779 | ivec2 transferred = BitTransferSigned(v[1], v[0]); |
| 752 | v[1] = transferred[0]; | 780 | v[1] = transferred.x; |
| 753 | v[0] = transferred[1]; | 781 | v[0] = transferred.y; |
| 754 | transferred = BitTransferSigned(v[3], v[2]); | 782 | transferred = BitTransferSigned(v[3], v[2]); |
| 755 | v[3] = transferred[0]; | 783 | v[3] = transferred.x; |
| 756 | v[2] = transferred[1]; | 784 | v[2] = transferred.y; |
| 757 | ep1 = ClampByte(ivec4(v[2], v[0], v[0], v[0])); | 785 | ep1 = ClampByte(ivec4(v[2], v[0], v[0], v[0])); |
| 758 | ep2 = ClampByte(ivec4((v[2] + v[3]), v[0] + v[1], v[0] + v[1], v[0] + v[1])); | 786 | ep2 = ClampByte(ivec4(v[2] + v[3], v[0] + v[1], v[0] + v[1], v[0] + v[1])); |
| 759 | } break; | 787 | break; |
| 760 | 788 | } | |
| 761 | case 6: { | 789 | case 6: { |
| 762 | READ_UINT_VALUES(4) | 790 | READ_UINT_VALUES(4) |
| 763 | ep1 = uvec4(0xFF, v[0] * v[3] >> 8, v[1] * v[3] >> 8, v[2] * v[3] >> 8); | 791 | ep1 = uvec4(0xFF, (v[0] * v[3]) >> 8, (v[1] * v[3]) >> 8, (v[2] * v[3]) >> 8); |
| 764 | ep2 = uvec4(0xFF, v[0], v[1], v[2]); | 792 | ep2 = uvec4(0xFF, v[0], v[1], v[2]); |
| 765 | } break; | 793 | break; |
| 766 | 794 | } | |
| 767 | case 8: { | 795 | case 8: { |
| 768 | READ_UINT_VALUES(6) | 796 | READ_UINT_VALUES(6) |
| 769 | if (v[1] + v[3] + v[5] >= v[0] + v[2] + v[4]) { | 797 | if ((v[1] + v[3] + v[5]) >= (v[0] + v[2] + v[4])) { |
| 770 | ep1 = uvec4(0xFF, v[0], v[2], v[4]); | 798 | ep1 = uvec4(0xFF, v[0], v[2], v[4]); |
| 771 | ep2 = uvec4(0xFF, v[1], v[3], v[5]); | 799 | ep2 = uvec4(0xFF, v[1], v[3], v[5]); |
| 772 | } else { | 800 | } else { |
| 773 | ep1 = uvec4(BlueContract(0xFF, int(v[1]), int(v[3]), int(v[5]))); | 801 | ep1 = uvec4(BlueContract(0xFF, int(v[1]), int(v[3]), int(v[5]))); |
| 774 | ep2 = uvec4(BlueContract(0xFF, int(v[0]), int(v[2]), int(v[4]))); | 802 | ep2 = uvec4(BlueContract(0xFF, int(v[0]), int(v[2]), int(v[4]))); |
| 775 | } | 803 | } |
| 776 | } break; | 804 | break; |
| 777 | 805 | } | |
| 778 | case 9: { | 806 | case 9: { |
| 779 | READ_INT_VALUES(6) | 807 | READ_INT_VALUES(6) |
| 780 | ivec2 transferred = BitTransferSigned(v[1], v[0]); | 808 | ivec2 transferred = BitTransferSigned(v[1], v[0]); |
| 781 | v[1] = transferred[0]; | 809 | v[1] = transferred.x; |
| 782 | v[0] = transferred[1]; | 810 | v[0] = transferred.y; |
| 783 | transferred = BitTransferSigned(v[3], v[2]); | 811 | transferred = BitTransferSigned(v[3], v[2]); |
| 784 | v[3] = transferred[0]; | 812 | v[3] = transferred.x; |
| 785 | v[2] = transferred[1]; | 813 | v[2] = transferred.y; |
| 786 | transferred = BitTransferSigned(v[5], v[4]); | 814 | transferred = BitTransferSigned(v[5], v[4]); |
| 787 | v[5] = transferred[0]; | 815 | v[5] = transferred.x; |
| 788 | v[4] = transferred[1]; | 816 | v[4] = transferred.y; |
| 789 | if (v[1] + v[3] + v[5] >= 0) { | 817 | if ((v[1] + v[3] + v[5]) >= 0) { |
| 790 | ep1 = ClampByte(ivec4(0xFF, v[0], v[2], v[4])); | 818 | ep1 = ClampByte(ivec4(0xFF, v[0], v[2], v[4])); |
| 791 | ep2 = ClampByte(ivec4(0xFF, v[0] + v[1], v[2] + v[3], v[4] + v[5])); | 819 | ep2 = ClampByte(ivec4(0xFF, v[0] + v[1], v[2] + v[3], v[4] + v[5])); |
| 792 | } else { | 820 | } else { |
| 793 | ep1 = ClampByte(BlueContract(0xFF, v[0] + v[1], v[2] + v[3], v[4] + v[5])); | 821 | ep1 = ClampByte(BlueContract(0xFF, v[0] + v[1], v[2] + v[3], v[4] + v[5])); |
| 794 | ep2 = ClampByte(BlueContract(0xFF, v[0], v[2], v[4])); | 822 | ep2 = ClampByte(BlueContract(0xFF, v[0], v[2], v[4])); |
| 795 | } | 823 | } |
| 796 | } break; | 824 | break; |
| 797 | 825 | } | |
| 798 | case 10: { | 826 | case 10: { |
| 799 | READ_UINT_VALUES(6) | 827 | READ_UINT_VALUES(6) |
| 800 | ep1 = uvec4(v[4], v[0] * v[3] >> 8, v[1] * v[3] >> 8, v[2] * v[3] >> 8); | 828 | ep1 = uvec4(v[4], (v[0] * v[3]) >> 8, (v[1] * v[3]) >> 8, (v[2] * v[3]) >> 8); |
| 801 | ep2 = uvec4(v[5], v[0], v[1], v[2]); | 829 | ep2 = uvec4(v[5], v[0], v[1], v[2]); |
| 802 | } break; | 830 | break; |
| 803 | 831 | } | |
| 804 | case 12: { | 832 | case 12: { |
| 805 | READ_UINT_VALUES(8) | 833 | READ_UINT_VALUES(8) |
| 806 | if (v[1] + v[3] + v[5] >= v[0] + v[2] + v[4]) { | 834 | if ((v[1] + v[3] + v[5]) >= (v[0] + v[2] + v[4])) { |
| 807 | ep1 = uvec4(v[6], v[0], v[2], v[4]); | 835 | ep1 = uvec4(v[6], v[0], v[2], v[4]); |
| 808 | ep2 = uvec4(v[7], v[1], v[3], v[5]); | 836 | ep2 = uvec4(v[7], v[1], v[3], v[5]); |
| 809 | } else { | 837 | } else { |
| 810 | ep1 = uvec4(BlueContract(int(v[7]), int(v[1]), int(v[3]), int(v[5]))); | 838 | ep1 = uvec4(BlueContract(int(v[7]), int(v[1]), int(v[3]), int(v[5]))); |
| 811 | ep2 = uvec4(BlueContract(int(v[6]), int(v[0]), int(v[2]), int(v[4]))); | 839 | ep2 = uvec4(BlueContract(int(v[6]), int(v[0]), int(v[2]), int(v[4]))); |
| 812 | } | 840 | } |
| 813 | } break; | 841 | break; |
| 814 | 842 | } | |
| 815 | case 13: { | 843 | case 13: { |
| 816 | READ_INT_VALUES(8) | 844 | READ_INT_VALUES(8) |
| 817 | ivec2 transferred = BitTransferSigned(v[1], v[0]); | 845 | ivec2 transferred = BitTransferSigned(v[1], v[0]); |
| 818 | v[1] = transferred[0]; | 846 | v[1] = transferred.x; |
| 819 | v[0] = transferred[1]; | 847 | v[0] = transferred.y; |
| 820 | transferred = BitTransferSigned(v[3], v[2]); | 848 | transferred = BitTransferSigned(v[3], v[2]); |
| 821 | v[3] = transferred[0]; | 849 | v[3] = transferred.x; |
| 822 | v[2] = transferred[1]; | 850 | v[2] = transferred.y; |
| 823 | 851 | ||
| 824 | transferred = BitTransferSigned(v[5], v[4]); | 852 | transferred = BitTransferSigned(v[5], v[4]); |
| 825 | v[5] = transferred[0]; | 853 | v[5] = transferred.x; |
| 826 | v[4] = transferred[1]; | 854 | v[4] = transferred.y; |
| 827 | 855 | ||
| 828 | transferred = BitTransferSigned(v[7], v[6]); | 856 | transferred = BitTransferSigned(v[7], v[6]); |
| 829 | v[7] = transferred[0]; | 857 | v[7] = transferred.x; |
| 830 | v[6] = transferred[1]; | 858 | v[6] = transferred.y; |
| 831 | 859 | ||
| 832 | if (v[1] + v[3] + v[5] >= 0) { | 860 | if ((v[1] + v[3] + v[5]) >= 0) { |
| 833 | ep1 = ClampByte(ivec4(v[6], v[0], v[2], v[4])); | 861 | ep1 = ClampByte(ivec4(v[6], v[0], v[2], v[4])); |
| 834 | ep2 = ClampByte(ivec4(v[7] + v[6], v[0] + v[1], v[2] + v[3], v[4] + v[5])); | 862 | ep2 = ClampByte(ivec4(v[7] + v[6], v[0] + v[1], v[2] + v[3], v[4] + v[5])); |
| 835 | } else { | 863 | } else { |
| 836 | ep1 = ClampByte(BlueContract(v[6] + v[7], v[0] + v[1], v[2] + v[3], v[4] + v[5])); | 864 | ep1 = ClampByte(BlueContract(v[6] + v[7], v[0] + v[1], v[2] + v[3], v[4] + v[5])); |
| 837 | ep2 = ClampByte(BlueContract(v[6], v[0], v[2], v[4])); | 865 | ep2 = ClampByte(BlueContract(v[6], v[0], v[2], v[4])); |
| 838 | } | 866 | } |
| 839 | } break; | 867 | break; |
| 868 | } | ||
| 869 | default: { | ||
| 870 | // HDR mode, or more likely a bug computing the color_endpoint_mode | ||
| 871 | ep1 = uvec4(0xFF, 0xFF, 0, 0); | ||
| 872 | ep2 = uvec4(0xFF, 0xFF, 0, 0); | ||
| 873 | break; | ||
| 874 | } | ||
| 840 | } | 875 | } |
| 841 | #undef READ_UINT_VALUES | 876 | #undef READ_UINT_VALUES |
| 842 | #undef READ_INT_VALUES | 877 | #undef READ_INT_VALUES |
| @@ -849,52 +884,61 @@ uint UnquantizeTexelWeight(EncodingData val) { | |||
| 849 | uint B = 0, C = 0, D = 0; | 884 | uint B = 0, C = 0, D = 0; |
| 850 | uint result = 0; | 885 | uint result = 0; |
| 851 | switch (val.encoding) { | 886 | switch (val.encoding) { |
| 852 | case JustBits: | 887 | case JUST_BITS: |
| 853 | result = FastReplicateTo6(bitval, bitlen); | 888 | result = FastReplicateTo6(bitval, bitlen); |
| 854 | break; | 889 | break; |
| 855 | case Trit: { | 890 | case TRIT: { |
| 856 | D = val.quint_trit_value; | 891 | D = val.quint_trit_value; |
| 857 | switch (bitlen) { | 892 | switch (bitlen) { |
| 858 | case 0: { | 893 | case 0: { |
| 859 | uint results[3] = {0, 32, 63}; | 894 | uint results[3] = {0, 32, 63}; |
| 860 | result = results[D]; | 895 | result = results[D]; |
| 861 | } break; | 896 | break; |
| 897 | } | ||
| 862 | case 1: { | 898 | case 1: { |
| 863 | C = 50; | 899 | C = 50; |
| 864 | } break; | 900 | break; |
| 901 | } | ||
| 865 | case 2: { | 902 | case 2: { |
| 866 | C = 23; | 903 | C = 23; |
| 867 | uint b = (bitval >> 1) & 1; | 904 | uint b = (bitval >> 1) & 1; |
| 868 | B = (b << 6) | (b << 2) | b; | 905 | B = (b << 6) | (b << 2) | b; |
| 869 | } break; | 906 | break; |
| 907 | } | ||
| 870 | case 3: { | 908 | case 3: { |
| 871 | C = 11; | 909 | C = 11; |
| 872 | uint cb = (bitval >> 1) & 3; | 910 | uint cb = (bitval >> 1) & 3; |
| 873 | B = (cb << 5) | cb; | 911 | B = (cb << 5) | cb; |
| 874 | } break; | 912 | break; |
| 913 | } | ||
| 875 | default: | 914 | default: |
| 876 | break; | 915 | break; |
| 877 | } | 916 | } |
| 878 | } break; | 917 | break; |
| 879 | case Quint: { | 918 | } |
| 919 | case QUINT: { | ||
| 880 | D = val.quint_trit_value; | 920 | D = val.quint_trit_value; |
| 881 | switch (bitlen) { | 921 | switch (bitlen) { |
| 882 | case 0: { | 922 | case 0: { |
| 883 | uint results[5] = {0, 16, 32, 47, 63}; | 923 | uint results[5] = {0, 16, 32, 47, 63}; |
| 884 | result = results[D]; | 924 | result = results[D]; |
| 885 | } break; | 925 | break; |
| 926 | } | ||
| 886 | case 1: { | 927 | case 1: { |
| 887 | C = 28; | 928 | C = 28; |
| 888 | } break; | 929 | break; |
| 930 | } | ||
| 889 | case 2: { | 931 | case 2: { |
| 890 | C = 13; | 932 | C = 13; |
| 891 | uint b = (bitval >> 1) & 1; | 933 | uint b = (bitval >> 1) & 1; |
| 892 | B = (b << 6) | (b << 1); | 934 | B = (b << 6) | (b << 1); |
| 893 | } break; | 935 | break; |
| 894 | } | 936 | } |
| 895 | } break; | 937 | } |
| 938 | break; | ||
| 896 | } | 939 | } |
| 897 | if (val.encoding != JustBits && bitlen > 0) { | 940 | } |
| 941 | if (val.encoding != JUST_BITS && bitlen > 0) { | ||
| 898 | result = D * C + B; | 942 | result = D * C + B; |
| 899 | result ^= A; | 943 | result ^= A; |
| 900 | result = (A & 0x20) | (result >> 2); | 944 | result = (A & 0x20) | (result >> 2); |
| @@ -905,7 +949,7 @@ uint UnquantizeTexelWeight(EncodingData val) { | |||
| 905 | return result; | 949 | return result; |
| 906 | } | 950 | } |
| 907 | 951 | ||
| 908 | void UnquantizeTexelWeights(out uint outbuffer[2][144], bool dual_plane, uvec2 size) { | 952 | void UnquantizeTexelWeights(bool dual_plane, uvec2 size) { |
| 909 | uint weight_idx = 0; | 953 | uint weight_idx = 0; |
| 910 | uint unquantized[2][144]; | 954 | uint unquantized[2][144]; |
| 911 | uint area = size.x * size.y; | 955 | uint area = size.x * size.y; |
| @@ -921,11 +965,12 @@ void UnquantizeTexelWeights(out uint outbuffer[2][144], bool dual_plane, uvec2 s | |||
| 921 | if (++weight_idx >= (area)) | 965 | if (++weight_idx >= (area)) |
| 922 | break; | 966 | break; |
| 923 | } | 967 | } |
| 924 | uint Ds = uint((block_dims.x * 0.5f + 1024) / (block_dims.x - 1)); | 968 | |
| 925 | uint Dt = uint((block_dims.y * 0.5f + 1024) / (block_dims.y - 1)); | 969 | const uint Ds = uint((block_dims.x * 0.5f + 1024) / (block_dims.x - 1)); |
| 926 | uint kPlaneScale = dual_plane ? 2 : 1; | 970 | const uint Dt = uint((block_dims.y * 0.5f + 1024) / (block_dims.y - 1)); |
| 927 | for (uint plane = 0; plane < kPlaneScale; plane++) | 971 | const uint k_plane_scale = dual_plane ? 2 : 1; |
| 928 | for (uint t = 0; t < block_dims.y; t++) | 972 | for (uint plane = 0; plane < k_plane_scale; plane++) { |
| 973 | for (uint t = 0; t < block_dims.y; t++) { | ||
| 929 | for (uint s = 0; s < block_dims.x; s++) { | 974 | for (uint s = 0; s < block_dims.x; s++) { |
| 930 | uint cs = Ds * s; | 975 | uint cs = Ds * s; |
| 931 | uint ct = Dt * t; | 976 | uint ct = Dt * t; |
| @@ -955,8 +1000,10 @@ void UnquantizeTexelWeights(out uint outbuffer[2][144], bool dual_plane, uvec2 s | |||
| 955 | if ((v0 + size.x + 1) < (area)) { | 1000 | if ((v0 + size.x + 1) < (area)) { |
| 956 | p.w = unquantized[plane][(v0 + size.x + 1)]; | 1001 | p.w = unquantized[plane][(v0 + size.x + 1)]; |
| 957 | } | 1002 | } |
| 958 | outbuffer[plane][t * block_dims.x + s] = (uint(dot(p, w)) + 8) >> 4; | 1003 | unquantized_texel_weights[plane][t * block_dims.x + s] = (uint(dot(p, w)) + 8) >> 4; |
| 959 | } | 1004 | } |
| 1005 | } | ||
| 1006 | } | ||
| 960 | } | 1007 | } |
| 961 | 1008 | ||
| 962 | int FindLayout(uint mode) { | 1009 | int FindLayout(uint mode) { |
| @@ -991,25 +1038,25 @@ int FindLayout(uint mode) { | |||
| 991 | } | 1038 | } |
| 992 | 1039 | ||
| 993 | TexelWeightParams DecodeBlockInfo(uint block_index) { | 1040 | TexelWeightParams DecodeBlockInfo(uint block_index) { |
| 994 | TexelWeightParams params = TexelWeightParams(uvec2(0), false, 0, false, false, false); | 1041 | TexelWeightParams params = TexelWeightParams(uvec2(0), 0, false, false, false, false); |
| 995 | uint mode = StreamBits(11); | 1042 | uint mode = StreamBits(11); |
| 996 | if ((mode & 0x1ff) == 0x1fc) { | 1043 | if ((mode & 0x1ff) == 0x1fc) { |
| 997 | if ((mode & 0x200) != 0) { | 1044 | if ((mode & 0x200) != 0) { |
| 998 | params.VoidExtentHDR = true; | 1045 | params.void_extent_hdr = true; |
| 999 | } else { | 1046 | } else { |
| 1000 | params.VoidExtentLDR = true; | 1047 | params.void_extent_ldr = true; |
| 1001 | } | 1048 | } |
| 1002 | if ((mode & 0x400) == 0 || StreamBits(1) == 0) { | 1049 | if ((mode & 0x400) == 0 || StreamBits(1) == 0) { |
| 1003 | params.Error = true; | 1050 | params.error_state = true; |
| 1004 | } | 1051 | } |
| 1005 | return params; | 1052 | return params; |
| 1006 | } | 1053 | } |
| 1007 | if ((mode & 0xf) == 0) { | 1054 | if ((mode & 0xf) == 0) { |
| 1008 | params.Error = true; | 1055 | params.error_state = true; |
| 1009 | return params; | 1056 | return params; |
| 1010 | } | 1057 | } |
| 1011 | if ((mode & 3) == 0 && (mode & 0x1c0) == 0x1c0) { | 1058 | if ((mode & 3) == 0 && (mode & 0x1c0) == 0x1c0) { |
| 1012 | params.Error = true; | 1059 | params.error_state = true; |
| 1013 | return params; | 1060 | return params; |
| 1014 | } | 1061 | } |
| 1015 | uint A, B; | 1062 | uint A, B; |
| @@ -1060,7 +1107,7 @@ TexelWeightParams DecodeBlockInfo(uint block_index) { | |||
| 1060 | params.size = uvec2(A + 6, B + 6); | 1107 | params.size = uvec2(A + 6, B + 6); |
| 1061 | break; | 1108 | break; |
| 1062 | default: | 1109 | default: |
| 1063 | params.Error = true; | 1110 | params.error_state = true; |
| 1064 | break; | 1111 | break; |
| 1065 | } | 1112 | } |
| 1066 | params.dual_plane = (mode_layout != 9) && ((mode & 0x400) != 0); | 1113 | params.dual_plane = (mode_layout != 9) && ((mode & 0x400) != 0); |
| @@ -1089,11 +1136,8 @@ void FillError(ivec3 coord) { | |||
| 1089 | } | 1136 | } |
| 1090 | } | 1137 | } |
| 1091 | 1138 | ||
| 1092 | void FillVoidExtentLDR(ivec3 coord, uint block_index) { | 1139 | void FillVoidExtentLDR(ivec3 coord) { |
| 1093 | for (int i = 0; i < 4; i++) { | 1140 | StreamBits(52); |
| 1094 | StreamBits(13); | ||
| 1095 | } | ||
| 1096 | |||
| 1097 | uint r_u = StreamBits(16); | 1141 | uint r_u = StreamBits(16); |
| 1098 | uint g_u = StreamBits(16); | 1142 | uint g_u = StreamBits(16); |
| 1099 | uint b_u = StreamBits(16); | 1143 | uint b_u = StreamBits(16); |
| @@ -1110,21 +1154,20 @@ void FillVoidExtentLDR(ivec3 coord, uint block_index) { | |||
| 1110 | } | 1154 | } |
| 1111 | 1155 | ||
| 1112 | void DecompressBlock(ivec3 coord, uint block_index) { | 1156 | void DecompressBlock(ivec3 coord, uint block_index) { |
| 1113 | TexelWeightParams params; | 1157 | TexelWeightParams params = DecodeBlockInfo(block_index); |
| 1114 | params = DecodeBlockInfo(block_index); | 1158 | if (params.error_state) { |
| 1115 | if (params.Error) { | ||
| 1116 | FillError(coord); | 1159 | FillError(coord); |
| 1117 | return; | 1160 | return; |
| 1118 | } | 1161 | } |
| 1119 | if (params.VoidExtentHDR) { | 1162 | if (params.void_extent_hdr) { |
| 1120 | FillError(coord); | 1163 | FillError(coord); |
| 1121 | return; | 1164 | return; |
| 1122 | } | 1165 | } |
| 1123 | if (params.VoidExtentLDR) { | 1166 | if (params.void_extent_ldr) { |
| 1124 | FillVoidExtentLDR(coord, block_index); | 1167 | FillVoidExtentLDR(coord); |
| 1125 | return; | 1168 | return; |
| 1126 | } | 1169 | } |
| 1127 | if (params.size.x > block_dims.x || params.size.y > block_dims.y) { | 1170 | if ((params.size.x > block_dims.x) || (params.size.y > block_dims.y)) { |
| 1128 | FillError(coord); | 1171 | FillError(coord); |
| 1129 | return; | 1172 | return; |
| 1130 | } | 1173 | } |
| @@ -1139,7 +1182,7 @@ void DecompressBlock(ivec3 coord, uint block_index) { | |||
| 1139 | uint ced_pointer = 0; | 1182 | uint ced_pointer = 0; |
| 1140 | uint base_cem = 0; | 1183 | uint base_cem = 0; |
| 1141 | if (num_partitions == 1) { | 1184 | if (num_partitions == 1) { |
| 1142 | color_endpoint_mode[0] = StreamBits(4); | 1185 | color_endpoint_mode.x = StreamBits(4); |
| 1143 | partition_index = 0; | 1186 | partition_index = 0; |
| 1144 | } else { | 1187 | } else { |
| 1145 | partition_index = StreamBits(10); | 1188 | partition_index = StreamBits(10); |
| @@ -1181,7 +1224,7 @@ void DecompressBlock(ivec3 coord, uint block_index) { | |||
| 1181 | int nb = int(min(remaining_bits, 8U)); | 1224 | int nb = int(min(remaining_bits, 8U)); |
| 1182 | uint b = StreamBits(nb); | 1225 | uint b = StreamBits(nb); |
| 1183 | color_endpoint_data[ced_pointer] = uint(bitfieldExtract(b, 0, nb)); | 1226 | color_endpoint_data[ced_pointer] = uint(bitfieldExtract(b, 0, nb)); |
| 1184 | ced_pointer++; | 1227 | ++ced_pointer; |
| 1185 | remaining_bits -= nb; | 1228 | remaining_bits -= nb; |
| 1186 | } | 1229 | } |
| 1187 | plane_index = int(StreamBits(plane_selector_bits)); | 1230 | plane_index = int(StreamBits(plane_selector_bits)); |
| @@ -1189,20 +1232,20 @@ void DecompressBlock(ivec3 coord, uint block_index) { | |||
| 1189 | uint extra_cem = StreamBits(extra_cem_bits); | 1232 | uint extra_cem = StreamBits(extra_cem_bits); |
| 1190 | uint cem = (extra_cem << 6) | base_cem; | 1233 | uint cem = (extra_cem << 6) | base_cem; |
| 1191 | cem >>= 2; | 1234 | cem >>= 2; |
| 1192 | uint C[4] = {0, 0, 0, 0}; | 1235 | uvec4 C = uvec4(0); |
| 1193 | for (uint i = 0; i < num_partitions; i++) { | 1236 | for (uint i = 0; i < num_partitions; i++) { |
| 1194 | C[i] = cem & 1; | 1237 | C[i] = (cem & 1); |
| 1195 | cem >>= 1; | 1238 | cem >>= 1; |
| 1196 | } | 1239 | } |
| 1197 | uint M[4] = {0, 0, 0, 0}; | 1240 | uvec4 M = uvec4(0); |
| 1198 | for (uint i = 0; i < num_partitions; i++) { | 1241 | for (uint i = 0; i < num_partitions; i++) { |
| 1199 | M[i] = cem & 3; | 1242 | M[i] = cem & 3; |
| 1200 | cem >>= 2; | 1243 | cem >>= 2; |
| 1201 | } | 1244 | } |
| 1202 | for (uint i = 0; i < num_partitions; i++) { | 1245 | for (uint i = 0; i < num_partitions; i++) { |
| 1203 | color_endpoint_mode[i] = base_mode; | 1246 | color_endpoint_mode[i] = base_mode; |
| 1204 | if ((C[i]) == 0) { | 1247 | if (C[i] == 0) { |
| 1205 | color_endpoint_mode[i] -= 1; | 1248 | --color_endpoint_mode[i]; |
| 1206 | } | 1249 | } |
| 1207 | color_endpoint_mode[i] <<= 2; | 1250 | color_endpoint_mode[i] <<= 2; |
| 1208 | color_endpoint_mode[i] |= M[i]; | 1251 | color_endpoint_mode[i] |= M[i]; |
| @@ -1213,13 +1256,13 @@ void DecompressBlock(ivec3 coord, uint block_index) { | |||
| 1213 | color_endpoint_mode[i] = cem; | 1256 | color_endpoint_mode[i] = cem; |
| 1214 | } | 1257 | } |
| 1215 | } | 1258 | } |
| 1259 | DecodeColorValues(color_endpoint_mode, num_partitions, color_data_bits); | ||
| 1216 | 1260 | ||
| 1217 | uint color_values[32]; // Four values, two endpoints, four maximum paritions | ||
| 1218 | DecodeColorValues(color_values, color_endpoint_mode, num_partitions, color_data_bits); | ||
| 1219 | uvec4 endpoints[4][2]; | 1261 | uvec4 endpoints[4][2]; |
| 1220 | for (uint i = 0; i < num_partitions; i++) { | 1262 | for (uint i = 0; i < num_partitions; i++) { |
| 1221 | ComputeEndpoints(endpoints[i][0], endpoints[i][1], color_values, color_endpoint_mode[i]); | 1263 | ComputeEndpoints(endpoints[i][0], endpoints[i][1], color_endpoint_mode[i]); |
| 1222 | } | 1264 | } |
| 1265 | |||
| 1223 | for (uint i = 0; i < 16; i++) { | 1266 | for (uint i = 0; i < 16; i++) { |
| 1224 | texel_weight_data[i] = local_buff[i]; | 1267 | texel_weight_data[i] = local_buff[i]; |
| 1225 | } | 1268 | } |
| @@ -1238,12 +1281,13 @@ void DecompressBlock(ivec3 coord, uint block_index) { | |||
| 1238 | uint( | 1281 | uint( |
| 1239 | ((1 << (GetPackedBitSize(params.size, params.dual_plane, params.max_weight) % 8)) - 1)); | 1282 | ((1 << (GetPackedBitSize(params.size, params.dual_plane, params.max_weight) % 8)) - 1)); |
| 1240 | for (uint i = 0; i < 16 - clear_byte_start; i++) { | 1283 | for (uint i = 0; i < 16 - clear_byte_start; i++) { |
| 1241 | texel_weight_data[clear_byte_start + i] = uint(0U); | 1284 | texel_weight_data[clear_byte_start + i] = 0U; |
| 1242 | } | 1285 | } |
| 1243 | texel_flag = true; // use texel "vector" and bit stream in integer decoding | 1286 | texel_flag = true; // use texel "vector" and bit stream in integer decoding |
| 1244 | DecodeIntegerSequence(params.max_weight, GetNumWeightValues(params.size, params.dual_plane)); | 1287 | DecodeIntegerSequence(params.max_weight, GetNumWeightValues(params.size, params.dual_plane)); |
| 1245 | uint weights[2][144]; | 1288 | |
| 1246 | UnquantizeTexelWeights(weights, params.dual_plane, params.size); | 1289 | UnquantizeTexelWeights(params.dual_plane, params.size); |
| 1290 | |||
| 1247 | for (uint j = 0; j < block_dims.y; j++) { | 1291 | for (uint j = 0; j < block_dims.y; j++) { |
| 1248 | for (uint i = 0; i < block_dims.x; i++) { | 1292 | for (uint i = 0; i < block_dims.x; i++) { |
| 1249 | uint local_partition = Select2DPartition(partition_index, i, j, num_partitions, | 1293 | uint local_partition = Select2DPartition(partition_index, i, j, num_partitions, |
| @@ -1257,9 +1301,9 @@ void DecompressBlock(ivec3 coord, uint block_index) { | |||
| 1257 | if (params.dual_plane && (((plane_index + 1) & 3) == c)) { | 1301 | if (params.dual_plane && (((plane_index + 1) & 3) == c)) { |
| 1258 | plane_vec[c] = 1; | 1302 | plane_vec[c] = 1; |
| 1259 | } | 1303 | } |
| 1260 | weight_vec[c] = weights[plane_vec[c]][j * block_dims.x + i]; | 1304 | weight_vec[c] = unquantized_texel_weights[plane_vec[c]][j * block_dims.x + i]; |
| 1261 | } | 1305 | } |
| 1262 | vec4 Cf = vec4((C0 * (uvec4(64) - weight_vec) + C1 * weight_vec + uvec4(32)) >> 6); | 1306 | vec4 Cf = vec4((C0 * (uvec4(64) - weight_vec) + C1 * weight_vec + uvec4(32)) / 64); |
| 1263 | p = (Cf / 65535.0); | 1307 | p = (Cf / 65535.0); |
| 1264 | imageStore(dest_image, coord + ivec3(i, j, 0), p.gbar); | 1308 | imageStore(dest_image, coord + ivec3(i, j, 0), p.gbar); |
| 1265 | } | 1309 | } |
| @@ -1267,7 +1311,7 @@ void DecompressBlock(ivec3 coord, uint block_index) { | |||
| 1267 | } | 1311 | } |
| 1268 | 1312 | ||
| 1269 | void main() { | 1313 | void main() { |
| 1270 | uvec3 pos = gl_GlobalInvocationID + origin; | 1314 | uvec3 pos = gl_GlobalInvocationID; |
| 1271 | pos.x <<= bytes_per_block_log2; | 1315 | pos.x <<= bytes_per_block_log2; |
| 1272 | 1316 | ||
| 1273 | // Read as soon as possible due to its latency | 1317 | // Read as soon as possible due to its latency |
| @@ -1282,9 +1326,10 @@ void main() { | |||
| 1282 | offset += (pos.x >> GOB_SIZE_X_SHIFT) << x_shift; | 1326 | offset += (pos.x >> GOB_SIZE_X_SHIFT) << x_shift; |
| 1283 | offset += swizzle; | 1327 | offset += swizzle; |
| 1284 | 1328 | ||
| 1285 | const ivec3 coord = ivec3(gl_GlobalInvocationID * uvec3(block_dims, 1.0)); | 1329 | const ivec3 coord = ivec3(gl_GlobalInvocationID * uvec3(block_dims, 1)); |
| 1286 | uint block_index = | 1330 | uint block_index = |
| 1287 | pos.z * num_image_blocks.x * num_image_blocks.y + pos.y * num_image_blocks.x + pos.x; | 1331 | pos.z * gl_WorkGroupSize.x * gl_WorkGroupSize.y + pos.y * gl_WorkGroupSize.x + pos.x; |
| 1332 | |||
| 1288 | current_index = 0; | 1333 | current_index = 0; |
| 1289 | bitsread = 0; | 1334 | bitsread = 0; |
| 1290 | for (int i = 0; i < 16; i++) { | 1335 | for (int i = 0; i < 16; i++) { |
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index 29105ecad..623b43d8a 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp | |||
| @@ -307,7 +307,7 @@ void ApplySwizzle(GLuint handle, PixelFormat format, std::array<SwizzleSource, 4 | |||
| 307 | 307 | ||
| 308 | [[nodiscard]] bool CanBeAccelerated(const TextureCacheRuntime& runtime, | 308 | [[nodiscard]] bool CanBeAccelerated(const TextureCacheRuntime& runtime, |
| 309 | const VideoCommon::ImageInfo& info) { | 309 | const VideoCommon::ImageInfo& info) { |
| 310 | return (!runtime.HasNativeASTC() && IsPixelFormatASTC(info.format)); | 310 | return !runtime.HasNativeASTC() && IsPixelFormatASTC(info.format); |
| 311 | // Disable other accelerated uploads for now as they don't implement swizzled uploads | 311 | // Disable other accelerated uploads for now as they don't implement swizzled uploads |
| 312 | return false; | 312 | return false; |
| 313 | switch (info.type) { | 313 | switch (info.type) { |
| @@ -568,12 +568,13 @@ void TextureCacheRuntime::BlitFramebuffer(Framebuffer* dst, Framebuffer* src, | |||
| 568 | 568 | ||
| 569 | void TextureCacheRuntime::AccelerateImageUpload(Image& image, const ImageBufferMap& map, | 569 | void TextureCacheRuntime::AccelerateImageUpload(Image& image, const ImageBufferMap& map, |
| 570 | std::span<const SwizzleParameters> swizzles) { | 570 | std::span<const SwizzleParameters> swizzles) { |
| 571 | if (IsPixelFormatASTC(image.info.format)) { | ||
| 572 | return util_shaders.ASTCDecode(image, map, swizzles); | ||
| 573 | } | ||
| 574 | switch (image.info.type) { | 571 | switch (image.info.type) { |
| 575 | case ImageType::e2D: | 572 | case ImageType::e2D: |
| 576 | return util_shaders.BlockLinearUpload2D(image, map, swizzles); | 573 | if (IsPixelFormatASTC(image.info.format)) { |
| 574 | return util_shaders.ASTCDecode(image, map, swizzles); | ||
| 575 | } else { | ||
| 576 | return util_shaders.BlockLinearUpload2D(image, map, swizzles); | ||
| 577 | } | ||
| 577 | case ImageType::e3D: | 578 | case ImageType::e3D: |
| 578 | return util_shaders.BlockLinearUpload3D(image, map, swizzles); | 579 | return util_shaders.BlockLinearUpload3D(image, map, swizzles); |
| 579 | case ImageType::Linear: | 580 | case ImageType::Linear: |
diff --git a/src/video_core/renderer_opengl/util_shaders.cpp b/src/video_core/renderer_opengl/util_shaders.cpp index 85722c54a..47fddcb6e 100644 --- a/src/video_core/renderer_opengl/util_shaders.cpp +++ b/src/video_core/renderer_opengl/util_shaders.cpp | |||
| @@ -2,11 +2,7 @@ | |||
| 2 | // Licensed under GPLv2 or any later version | 2 | // Licensed under GPLv2 or any later version |
| 3 | // Refer to the license.txt file included. | 3 | // Refer to the license.txt file included. |
| 4 | 4 | ||
| 5 | #include <bit> | ||
| 6 | #include <fstream> | ||
| 7 | #include <span> | 5 | #include <span> |
| 8 | #include <streambuf> | ||
| 9 | #include <string> | ||
| 10 | #include <string_view> | 6 | #include <string_view> |
| 11 | 7 | ||
| 12 | #include <glad/glad.h> | 8 | #include <glad/glad.h> |
| @@ -24,7 +20,6 @@ | |||
| 24 | #include "video_core/renderer_opengl/gl_shader_manager.h" | 20 | #include "video_core/renderer_opengl/gl_shader_manager.h" |
| 25 | #include "video_core/renderer_opengl/gl_texture_cache.h" | 21 | #include "video_core/renderer_opengl/gl_texture_cache.h" |
| 26 | #include "video_core/renderer_opengl/util_shaders.h" | 22 | #include "video_core/renderer_opengl/util_shaders.h" |
| 27 | #include "video_core/surface.h" | ||
| 28 | #include "video_core/texture_cache/accelerated_swizzle.h" | 23 | #include "video_core/texture_cache/accelerated_swizzle.h" |
| 29 | #include "video_core/texture_cache/types.h" | 24 | #include "video_core/texture_cache/types.h" |
| 30 | #include "video_core/texture_cache/util.h" | 25 | #include "video_core/texture_cache/util.h" |
| @@ -36,6 +31,7 @@ namespace OpenGL { | |||
| 36 | using namespace HostShaders; | 31 | using namespace HostShaders; |
| 37 | using namespace Tegra::Texture::ASTC; | 32 | using namespace Tegra::Texture::ASTC; |
| 38 | 33 | ||
| 34 | using VideoCommon::Extent2D; | ||
| 39 | using VideoCommon::Extent3D; | 35 | using VideoCommon::Extent3D; |
| 40 | using VideoCommon::ImageCopy; | 36 | using VideoCommon::ImageCopy; |
| 41 | using VideoCommon::ImageType; | 37 | using VideoCommon::ImageType; |
| @@ -69,33 +65,15 @@ UtilShaders::UtilShaders(ProgramManager& program_manager_) | |||
| 69 | pitch_unswizzle_program(MakeProgram(PITCH_UNSWIZZLE_COMP)), | 65 | pitch_unswizzle_program(MakeProgram(PITCH_UNSWIZZLE_COMP)), |
| 70 | copy_bgra_program(MakeProgram(OPENGL_COPY_BGRA_COMP)), | 66 | copy_bgra_program(MakeProgram(OPENGL_COPY_BGRA_COMP)), |
| 71 | copy_bc4_program(MakeProgram(OPENGL_COPY_BC4_COMP)) { | 67 | copy_bc4_program(MakeProgram(OPENGL_COPY_BC4_COMP)) { |
| 72 | MakeBuffers(); | ||
| 73 | } | ||
| 74 | |||
| 75 | UtilShaders::~UtilShaders() = default; | ||
| 76 | |||
| 77 | void UtilShaders::MakeBuffers() { | ||
| 78 | const auto swizzle_table = Tegra::Texture::MakeSwizzleTable(); | 68 | const auto swizzle_table = Tegra::Texture::MakeSwizzleTable(); |
| 79 | swizzle_table_buffer.Create(); | 69 | swizzle_table_buffer.Create(); |
| 70 | astc_buffer.Create(); | ||
| 80 | glNamedBufferStorage(swizzle_table_buffer.handle, sizeof(swizzle_table), &swizzle_table, 0); | 71 | glNamedBufferStorage(swizzle_table_buffer.handle, sizeof(swizzle_table), &swizzle_table, 0); |
| 81 | 72 | glNamedBufferStorage(astc_buffer.handle, sizeof(ASTC_BUFFER_DATA), &ASTC_BUFFER_DATA, 0); | |
| 82 | astc_encodings_buffer.Create(); | ||
| 83 | glNamedBufferStorage(astc_encodings_buffer.handle, sizeof(EncodingsValues), &EncodingsValues, | ||
| 84 | 0); | ||
| 85 | replicate_6_to_8_buffer.Create(); | ||
| 86 | glNamedBufferStorage(replicate_6_to_8_buffer.handle, sizeof(REPLICATE_6_BIT_TO_8_TABLE), | ||
| 87 | &REPLICATE_6_BIT_TO_8_TABLE, 0); | ||
| 88 | replicate_7_to_8_buffer.Create(); | ||
| 89 | glNamedBufferStorage(replicate_7_to_8_buffer.handle, sizeof(REPLICATE_7_BIT_TO_8_TABLE), | ||
| 90 | &REPLICATE_7_BIT_TO_8_TABLE, 0); | ||
| 91 | replicate_8_to_8_buffer.Create(); | ||
| 92 | glNamedBufferStorage(replicate_8_to_8_buffer.handle, sizeof(REPLICATE_8_BIT_TO_8_TABLE), | ||
| 93 | &REPLICATE_8_BIT_TO_8_TABLE, 0); | ||
| 94 | replicate_byte_to_16_buffer.Create(); | ||
| 95 | glNamedBufferStorage(replicate_byte_to_16_buffer.handle, sizeof(REPLICATE_BYTE_TO_16_TABLE), | ||
| 96 | &REPLICATE_BYTE_TO_16_TABLE, 0); | ||
| 97 | } | 73 | } |
| 98 | 74 | ||
| 75 | UtilShaders::~UtilShaders() = default; | ||
| 76 | |||
| 99 | void UtilShaders::ASTCDecode(Image& image, const ImageBufferMap& map, | 77 | void UtilShaders::ASTCDecode(Image& image, const ImageBufferMap& map, |
| 100 | std::span<const VideoCommon::SwizzleParameters> swizzles) { | 78 | std::span<const VideoCommon::SwizzleParameters> swizzles) { |
| 101 | static constexpr GLuint BINDING_SWIZZLE_BUFFER = 0; | 79 | static constexpr GLuint BINDING_SWIZZLE_BUFFER = 0; |
| @@ -108,47 +86,51 @@ void UtilShaders::ASTCDecode(Image& image, const ImageBufferMap& map, | |||
| 108 | static constexpr GLuint BINDING_BYTE_TO_16_BUFFER = 6; | 86 | static constexpr GLuint BINDING_BYTE_TO_16_BUFFER = 6; |
| 109 | 87 | ||
| 110 | static constexpr GLuint BINDING_OUTPUT_IMAGE = 0; | 88 | static constexpr GLuint BINDING_OUTPUT_IMAGE = 0; |
| 111 | static constexpr GLuint LOC_NUM_IMAGE_BLOCKS = 0; | ||
| 112 | static constexpr GLuint LOC_BLOCK_DIMS = 1; | ||
| 113 | 89 | ||
| 114 | const Extent3D tile_size = { | 90 | const Extent2D tile_size{ |
| 115 | VideoCore::Surface::DefaultBlockWidth(image.info.format), | 91 | .width = VideoCore::Surface::DefaultBlockWidth(image.info.format), |
| 116 | VideoCore::Surface::DefaultBlockHeight(image.info.format), | 92 | .height = VideoCore::Surface::DefaultBlockHeight(image.info.format), |
| 117 | }; | 93 | }; |
| 118 | program_manager.BindHostCompute(astc_decoder_program.handle); | 94 | program_manager.BindHostCompute(astc_decoder_program.handle); |
| 119 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_SWIZZLE_BUFFER, swizzle_table_buffer.handle); | 95 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_SWIZZLE_BUFFER, swizzle_table_buffer.handle); |
| 120 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_ENC_BUFFER, astc_encodings_buffer.handle); | 96 | glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_ENC_BUFFER, astc_buffer.handle, |
| 121 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_6_TO_8_BUFFER, | 97 | offsetof(AstcBufferData, encoding_values), |
| 122 | replicate_6_to_8_buffer.handle); | 98 | sizeof(AstcBufferData::encoding_values)); |
| 123 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_7_TO_8_BUFFER, | 99 | glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_6_TO_8_BUFFER, astc_buffer.handle, |
| 124 | replicate_7_to_8_buffer.handle); | 100 | offsetof(AstcBufferData, replicate_6_to_8), |
| 125 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_8_TO_8_BUFFER, | 101 | sizeof(AstcBufferData::replicate_6_to_8)); |
| 126 | replicate_8_to_8_buffer.handle); | 102 | glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_7_TO_8_BUFFER, astc_buffer.handle, |
| 127 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_BYTE_TO_16_BUFFER, | 103 | offsetof(AstcBufferData, replicate_7_to_8), |
| 128 | replicate_byte_to_16_buffer.handle); | 104 | sizeof(AstcBufferData::replicate_7_to_8)); |
| 105 | glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_8_TO_8_BUFFER, astc_buffer.handle, | ||
| 106 | offsetof(AstcBufferData, replicate_8_to_8), | ||
| 107 | sizeof(AstcBufferData::replicate_8_to_8)); | ||
| 108 | glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_BYTE_TO_16_BUFFER, astc_buffer.handle, | ||
| 109 | offsetof(AstcBufferData, replicate_byte_to_16), | ||
| 110 | sizeof(AstcBufferData::replicate_byte_to_16)); | ||
| 129 | 111 | ||
| 130 | glFlushMappedNamedBufferRange(map.buffer, map.offset, image.guest_size_bytes); | 112 | glFlushMappedNamedBufferRange(map.buffer, map.offset, image.guest_size_bytes); |
| 131 | glUniform2ui(LOC_BLOCK_DIMS, tile_size.width, tile_size.height); | 113 | glUniform2ui(1, tile_size.width, tile_size.height); |
| 114 | // Ensure buffer data is valid before dispatching | ||
| 115 | glFlush(); | ||
| 132 | for (const SwizzleParameters& swizzle : swizzles) { | 116 | for (const SwizzleParameters& swizzle : swizzles) { |
| 133 | glBindImageTexture(BINDING_OUTPUT_IMAGE, image.StorageHandle(), swizzle.level, GL_TRUE, 0, | ||
| 134 | GL_WRITE_ONLY, GL_RGBA8); | ||
| 135 | const size_t input_offset = swizzle.buffer_offset + map.offset; | 117 | const size_t input_offset = swizzle.buffer_offset + map.offset; |
| 136 | const auto num_dispatches_x = Common::DivCeil(swizzle.num_tiles.width, 32U); | 118 | const u32 num_dispatches_x = Common::DivCeil(swizzle.num_tiles.width, 32U); |
| 137 | const auto num_dispatches_y = Common::DivCeil(swizzle.num_tiles.height, 32U); | 119 | const u32 num_dispatches_y = Common::DivCeil(swizzle.num_tiles.height, 32U); |
| 138 | |||
| 139 | glUniform2ui(LOC_NUM_IMAGE_BLOCKS, swizzle.num_tiles.width, swizzle.num_tiles.height); | ||
| 140 | 120 | ||
| 141 | // To unswizzle the ASTC data | ||
| 142 | const auto params = MakeBlockLinearSwizzle2DParams(swizzle, image.info); | 121 | const auto params = MakeBlockLinearSwizzle2DParams(swizzle, image.info); |
| 143 | glUniform3uiv(2, 1, params.origin.data()); | 122 | ASSERT(params.origin == (std::array<u32, 3>{0, 0, 0})); |
| 144 | glUniform3iv(3, 1, params.destination.data()); | 123 | ASSERT(params.destination == (std::array<s32, 3>{0, 0, 0})); |
| 145 | glUniform1ui(4, params.bytes_per_block_log2); | ||
| 146 | glUniform1ui(5, params.layer_stride); | ||
| 147 | glUniform1ui(6, params.block_size); | ||
| 148 | glUniform1ui(7, params.x_shift); | ||
| 149 | glUniform1ui(8, params.block_height); | ||
| 150 | glUniform1ui(9, params.block_height_mask); | ||
| 151 | 124 | ||
| 125 | glUniform1ui(2, params.bytes_per_block_log2); | ||
| 126 | glUniform1ui(3, params.layer_stride); | ||
| 127 | glUniform1ui(4, params.block_size); | ||
| 128 | glUniform1ui(5, params.x_shift); | ||
| 129 | glUniform1ui(6, params.block_height); | ||
| 130 | glUniform1ui(7, params.block_height_mask); | ||
| 131 | |||
| 132 | glBindImageTexture(BINDING_OUTPUT_IMAGE, image.StorageHandle(), swizzle.level, GL_TRUE, 0, | ||
| 133 | GL_WRITE_ONLY, GL_RGBA8); | ||
| 152 | // ASTC texture data | 134 | // ASTC texture data |
| 153 | glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.buffer, input_offset, | 135 | glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.buffer, input_offset, |
| 154 | image.guest_size_bytes - swizzle.buffer_offset); | 136 | image.guest_size_bytes - swizzle.buffer_offset); |
diff --git a/src/video_core/renderer_opengl/util_shaders.h b/src/video_core/renderer_opengl/util_shaders.h index 08a1cb9b2..53d65f368 100644 --- a/src/video_core/renderer_opengl/util_shaders.h +++ b/src/video_core/renderer_opengl/util_shaders.h | |||
| @@ -40,8 +40,6 @@ public: | |||
| 40 | explicit UtilShaders(ProgramManager& program_manager); | 40 | explicit UtilShaders(ProgramManager& program_manager); |
| 41 | ~UtilShaders(); | 41 | ~UtilShaders(); |
| 42 | 42 | ||
| 43 | void MakeBuffers(); | ||
| 44 | |||
| 45 | void ASTCDecode(Image& image, const ImageBufferMap& map, | 43 | void ASTCDecode(Image& image, const ImageBufferMap& map, |
| 46 | std::span<const VideoCommon::SwizzleParameters> swizzles); | 44 | std::span<const VideoCommon::SwizzleParameters> swizzles); |
| 47 | 45 | ||
| @@ -64,11 +62,7 @@ private: | |||
| 64 | ProgramManager& program_manager; | 62 | ProgramManager& program_manager; |
| 65 | 63 | ||
| 66 | OGLBuffer swizzle_table_buffer; | 64 | OGLBuffer swizzle_table_buffer; |
| 67 | OGLBuffer astc_encodings_buffer; | 65 | OGLBuffer astc_buffer; |
| 68 | OGLBuffer replicate_6_to_8_buffer; | ||
| 69 | OGLBuffer replicate_7_to_8_buffer; | ||
| 70 | OGLBuffer replicate_8_to_8_buffer; | ||
| 71 | OGLBuffer replicate_byte_to_16_buffer; | ||
| 72 | 66 | ||
| 73 | OGLProgram astc_decoder_program; | 67 | OGLProgram astc_decoder_program; |
| 74 | OGLProgram block_linear_unswizzle_2d_program; | 68 | OGLProgram block_linear_unswizzle_2d_program; |
diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp index a0050b68f..e11406e58 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp | |||
| @@ -35,13 +35,13 @@ using namespace Tegra::Texture::ASTC; | |||
| 35 | 35 | ||
| 36 | namespace { | 36 | namespace { |
| 37 | 37 | ||
| 38 | constexpr u32 ASTC_BINDING_SWIZZLE_BUFFER = 0; | 38 | constexpr u32 ASTC_BINDING_INPUT_BUFFER = 0; |
| 39 | constexpr u32 ASTC_BINDING_INPUT_BUFFER = 1; | 39 | constexpr u32 ASTC_BINDING_ENC_BUFFER = 1; |
| 40 | constexpr u32 ASTC_BINDING_ENC_BUFFER = 2; | 40 | constexpr u32 ASTC_BINDING_6_TO_8_BUFFER = 2; |
| 41 | constexpr u32 ASTC_BINDING_6_TO_8_BUFFER = 3; | 41 | constexpr u32 ASTC_BINDING_7_TO_8_BUFFER = 3; |
| 42 | constexpr u32 ASTC_BINDING_7_TO_8_BUFFER = 4; | 42 | constexpr u32 ASTC_BINDING_8_TO_8_BUFFER = 4; |
| 43 | constexpr u32 ASTC_BINDING_8_TO_8_BUFFER = 5; | 43 | constexpr u32 ASTC_BINDING_BYTE_TO_16_BUFFER = 5; |
| 44 | constexpr u32 ASTC_BINDING_BYTE_TO_16_BUFFER = 6; | 44 | constexpr u32 ASTC_BINDING_SWIZZLE_BUFFER = 6; |
| 45 | constexpr u32 ASTC_BINDING_OUTPUT_IMAGE = 7; | 45 | constexpr u32 ASTC_BINDING_OUTPUT_IMAGE = 7; |
| 46 | 46 | ||
| 47 | VkPushConstantRange BuildComputePushConstantRange(std::size_t size) { | 47 | VkPushConstantRange BuildComputePushConstantRange(std::size_t size) { |
| @@ -74,56 +74,56 @@ std::array<VkDescriptorSetLayoutBinding, 2> BuildInputOutputDescriptorSetBinding | |||
| 74 | std::array<VkDescriptorSetLayoutBinding, 8> BuildASTCDescriptorSetBindings() { | 74 | std::array<VkDescriptorSetLayoutBinding, 8> BuildASTCDescriptorSetBindings() { |
| 75 | return {{ | 75 | return {{ |
| 76 | { | 76 | { |
| 77 | .binding = ASTC_BINDING_SWIZZLE_BUFFER, // Swizzle buffer | 77 | .binding = ASTC_BINDING_INPUT_BUFFER, |
| 78 | .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, | 78 | .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, |
| 79 | .descriptorCount = 1, | 79 | .descriptorCount = 1, |
| 80 | .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, | 80 | .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, |
| 81 | .pImmutableSamplers = nullptr, | 81 | .pImmutableSamplers = nullptr, |
| 82 | }, | 82 | }, |
| 83 | { | 83 | { |
| 84 | .binding = ASTC_BINDING_INPUT_BUFFER, // ASTC Img data buffer | 84 | .binding = ASTC_BINDING_ENC_BUFFER, |
| 85 | .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, | 85 | .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, |
| 86 | .descriptorCount = 1, | 86 | .descriptorCount = 1, |
| 87 | .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, | 87 | .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, |
| 88 | .pImmutableSamplers = nullptr, | 88 | .pImmutableSamplers = nullptr, |
| 89 | }, | 89 | }, |
| 90 | { | 90 | { |
| 91 | .binding = ASTC_BINDING_ENC_BUFFER, // Encodings buffer | 91 | .binding = ASTC_BINDING_6_TO_8_BUFFER, |
| 92 | .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, | 92 | .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, |
| 93 | .descriptorCount = 1, | 93 | .descriptorCount = 1, |
| 94 | .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, | 94 | .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, |
| 95 | .pImmutableSamplers = nullptr, | 95 | .pImmutableSamplers = nullptr, |
| 96 | }, | 96 | }, |
| 97 | { | 97 | { |
| 98 | .binding = ASTC_BINDING_6_TO_8_BUFFER, // BINDING_6_TO_8_BUFFER | 98 | .binding = ASTC_BINDING_7_TO_8_BUFFER, |
| 99 | .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, | 99 | .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, |
| 100 | .descriptorCount = 1, | 100 | .descriptorCount = 1, |
| 101 | .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, | 101 | .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, |
| 102 | .pImmutableSamplers = nullptr, | 102 | .pImmutableSamplers = nullptr, |
| 103 | }, | 103 | }, |
| 104 | { | 104 | { |
| 105 | .binding = ASTC_BINDING_7_TO_8_BUFFER, // BINDING_7_TO_8_BUFFER | 105 | .binding = ASTC_BINDING_8_TO_8_BUFFER, |
| 106 | .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, | 106 | .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, |
| 107 | .descriptorCount = 1, | 107 | .descriptorCount = 1, |
| 108 | .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, | 108 | .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, |
| 109 | .pImmutableSamplers = nullptr, | 109 | .pImmutableSamplers = nullptr, |
| 110 | }, | 110 | }, |
| 111 | { | 111 | { |
| 112 | .binding = ASTC_BINDING_8_TO_8_BUFFER, // BINDING_8_TO_8_BUFFER | 112 | .binding = ASTC_BINDING_BYTE_TO_16_BUFFER, |
| 113 | .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, | 113 | .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, |
| 114 | .descriptorCount = 1, | 114 | .descriptorCount = 1, |
| 115 | .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, | 115 | .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, |
| 116 | .pImmutableSamplers = nullptr, | 116 | .pImmutableSamplers = nullptr, |
| 117 | }, | 117 | }, |
| 118 | { | 118 | { |
| 119 | .binding = ASTC_BINDING_BYTE_TO_16_BUFFER, // BINDING_BYTE_TO_16_BUFFER | 119 | .binding = ASTC_BINDING_SWIZZLE_BUFFER, |
| 120 | .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, | 120 | .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, |
| 121 | .descriptorCount = 1, | 121 | .descriptorCount = 1, |
| 122 | .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, | 122 | .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, |
| 123 | .pImmutableSamplers = nullptr, | 123 | .pImmutableSamplers = nullptr, |
| 124 | }, | 124 | }, |
| 125 | { | 125 | { |
| 126 | .binding = ASTC_BINDING_OUTPUT_IMAGE, // Output image | 126 | .binding = ASTC_BINDING_OUTPUT_IMAGE, |
| 127 | .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, | 127 | .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, |
| 128 | .descriptorCount = 1, | 128 | .descriptorCount = 1, |
| 129 | .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, | 129 | .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, |
| @@ -146,19 +146,11 @@ VkDescriptorUpdateTemplateEntryKHR BuildInputOutputDescriptorUpdateTemplate() { | |||
| 146 | std::array<VkDescriptorUpdateTemplateEntryKHR, 8> BuildASTCPassDescriptorUpdateTemplateEntry() { | 146 | std::array<VkDescriptorUpdateTemplateEntryKHR, 8> BuildASTCPassDescriptorUpdateTemplateEntry() { |
| 147 | return {{ | 147 | return {{ |
| 148 | { | 148 | { |
| 149 | .dstBinding = ASTC_BINDING_SWIZZLE_BUFFER, | ||
| 150 | .dstArrayElement = 0, | ||
| 151 | .descriptorCount = 1, | ||
| 152 | .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, | ||
| 153 | .offset = 0 * sizeof(DescriptorUpdateEntry), | ||
| 154 | .stride = sizeof(DescriptorUpdateEntry), | ||
| 155 | }, | ||
| 156 | { | ||
| 157 | .dstBinding = ASTC_BINDING_INPUT_BUFFER, | 149 | .dstBinding = ASTC_BINDING_INPUT_BUFFER, |
| 158 | .dstArrayElement = 0, | 150 | .dstArrayElement = 0, |
| 159 | .descriptorCount = 1, | 151 | .descriptorCount = 1, |
| 160 | .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, | 152 | .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, |
| 161 | .offset = 1 * sizeof(DescriptorUpdateEntry), | 153 | .offset = ASTC_BINDING_INPUT_BUFFER * sizeof(DescriptorUpdateEntry), |
| 162 | .stride = sizeof(DescriptorUpdateEntry), | 154 | .stride = sizeof(DescriptorUpdateEntry), |
| 163 | }, | 155 | }, |
| 164 | { | 156 | { |
| @@ -166,7 +158,7 @@ std::array<VkDescriptorUpdateTemplateEntryKHR, 8> BuildASTCPassDescriptorUpdateT | |||
| 166 | .dstArrayElement = 0, | 158 | .dstArrayElement = 0, |
| 167 | .descriptorCount = 1, | 159 | .descriptorCount = 1, |
| 168 | .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, | 160 | .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, |
| 169 | .offset = 2 * sizeof(DescriptorUpdateEntry), | 161 | .offset = ASTC_BINDING_ENC_BUFFER * sizeof(DescriptorUpdateEntry), |
| 170 | .stride = sizeof(DescriptorUpdateEntry), | 162 | .stride = sizeof(DescriptorUpdateEntry), |
| 171 | }, | 163 | }, |
| 172 | { | 164 | { |
| @@ -174,7 +166,7 @@ std::array<VkDescriptorUpdateTemplateEntryKHR, 8> BuildASTCPassDescriptorUpdateT | |||
| 174 | .dstArrayElement = 0, | 166 | .dstArrayElement = 0, |
| 175 | .descriptorCount = 1, | 167 | .descriptorCount = 1, |
| 176 | .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, | 168 | .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, |
| 177 | .offset = 3 * sizeof(DescriptorUpdateEntry), | 169 | .offset = ASTC_BINDING_6_TO_8_BUFFER * sizeof(DescriptorUpdateEntry), |
| 178 | .stride = sizeof(DescriptorUpdateEntry), | 170 | .stride = sizeof(DescriptorUpdateEntry), |
| 179 | }, | 171 | }, |
| 180 | { | 172 | { |
| @@ -182,7 +174,7 @@ std::array<VkDescriptorUpdateTemplateEntryKHR, 8> BuildASTCPassDescriptorUpdateT | |||
| 182 | .dstArrayElement = 0, | 174 | .dstArrayElement = 0, |
| 183 | .descriptorCount = 1, | 175 | .descriptorCount = 1, |
| 184 | .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, | 176 | .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, |
| 185 | .offset = 4 * sizeof(DescriptorUpdateEntry), | 177 | .offset = ASTC_BINDING_7_TO_8_BUFFER * sizeof(DescriptorUpdateEntry), |
| 186 | .stride = sizeof(DescriptorUpdateEntry), | 178 | .stride = sizeof(DescriptorUpdateEntry), |
| 187 | }, | 179 | }, |
| 188 | { | 180 | { |
| @@ -190,7 +182,7 @@ std::array<VkDescriptorUpdateTemplateEntryKHR, 8> BuildASTCPassDescriptorUpdateT | |||
| 190 | .dstArrayElement = 0, | 182 | .dstArrayElement = 0, |
| 191 | .descriptorCount = 1, | 183 | .descriptorCount = 1, |
| 192 | .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, | 184 | .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, |
| 193 | .offset = 5 * sizeof(DescriptorUpdateEntry), | 185 | .offset = ASTC_BINDING_8_TO_8_BUFFER * sizeof(DescriptorUpdateEntry), |
| 194 | .stride = sizeof(DescriptorUpdateEntry), | 186 | .stride = sizeof(DescriptorUpdateEntry), |
| 195 | }, | 187 | }, |
| 196 | { | 188 | { |
| @@ -198,7 +190,15 @@ std::array<VkDescriptorUpdateTemplateEntryKHR, 8> BuildASTCPassDescriptorUpdateT | |||
| 198 | .dstArrayElement = 0, | 190 | .dstArrayElement = 0, |
| 199 | .descriptorCount = 1, | 191 | .descriptorCount = 1, |
| 200 | .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, | 192 | .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, |
| 201 | .offset = 6 * sizeof(DescriptorUpdateEntry), | 193 | .offset = ASTC_BINDING_BYTE_TO_16_BUFFER * sizeof(DescriptorUpdateEntry), |
| 194 | .stride = sizeof(DescriptorUpdateEntry), | ||
| 195 | }, | ||
| 196 | { | ||
| 197 | .dstBinding = ASTC_BINDING_SWIZZLE_BUFFER, | ||
| 198 | .dstArrayElement = 0, | ||
| 199 | .descriptorCount = 1, | ||
| 200 | .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, | ||
| 201 | .offset = ASTC_BINDING_SWIZZLE_BUFFER * sizeof(DescriptorUpdateEntry), | ||
| 202 | .stride = sizeof(DescriptorUpdateEntry), | 202 | .stride = sizeof(DescriptorUpdateEntry), |
| 203 | }, | 203 | }, |
| 204 | { | 204 | { |
| @@ -206,16 +206,20 @@ std::array<VkDescriptorUpdateTemplateEntryKHR, 8> BuildASTCPassDescriptorUpdateT | |||
| 206 | .dstArrayElement = 0, | 206 | .dstArrayElement = 0, |
| 207 | .descriptorCount = 1, | 207 | .descriptorCount = 1, |
| 208 | .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, | 208 | .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, |
| 209 | .offset = 7 * sizeof(DescriptorUpdateEntry), | 209 | .offset = ASTC_BINDING_OUTPUT_IMAGE * sizeof(DescriptorUpdateEntry), |
| 210 | .stride = sizeof(DescriptorUpdateEntry), | 210 | .stride = sizeof(DescriptorUpdateEntry), |
| 211 | }, | 211 | }, |
| 212 | }}; | 212 | }}; |
| 213 | } | 213 | } |
| 214 | 214 | ||
| 215 | struct AstcPushConstants { | 215 | struct AstcPushConstants { |
| 216 | std::array<u32, 2> num_image_blocks; | ||
| 217 | std::array<u32, 2> blocks_dims; | 216 | std::array<u32, 2> blocks_dims; |
| 218 | VideoCommon::Accelerated::BlockLinearSwizzle2DParams params; | 217 | u32 bytes_per_block_log2; |
| 218 | u32 layer_stride; | ||
| 219 | u32 block_size; | ||
| 220 | u32 x_shift; | ||
| 221 | u32 block_height; | ||
| 222 | u32 block_height_mask; | ||
| 219 | }; | 223 | }; |
| 220 | 224 | ||
| 221 | struct AstcBufferData { | 225 | struct AstcBufferData { |
| @@ -419,11 +423,12 @@ ASTCDecoderPass::ASTCDecoderPass(const Device& device_, VKScheduler& scheduler_, | |||
| 419 | ASTCDecoderPass::~ASTCDecoderPass() = default; | 423 | ASTCDecoderPass::~ASTCDecoderPass() = default; |
| 420 | 424 | ||
| 421 | void ASTCDecoderPass::MakeDataBuffer() { | 425 | void ASTCDecoderPass::MakeDataBuffer() { |
| 426 | constexpr size_t TOTAL_BUFFER_SIZE = sizeof(ASTC_BUFFER_DATA) + sizeof(SWIZZLE_TABLE); | ||
| 422 | data_buffer = device.GetLogical().CreateBuffer(VkBufferCreateInfo{ | 427 | data_buffer = device.GetLogical().CreateBuffer(VkBufferCreateInfo{ |
| 423 | .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, | 428 | .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, |
| 424 | .pNext = nullptr, | 429 | .pNext = nullptr, |
| 425 | .flags = 0, | 430 | .flags = 0, |
| 426 | .size = sizeof(ASTC_BUFFER_DATA), | 431 | .size = TOTAL_BUFFER_SIZE, |
| 427 | .usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, | 432 | .usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, |
| 428 | .sharingMode = VK_SHARING_MODE_EXCLUSIVE, | 433 | .sharingMode = VK_SHARING_MODE_EXCLUSIVE, |
| 429 | .queueFamilyIndexCount = 0, | 434 | .queueFamilyIndexCount = 0, |
| @@ -431,15 +436,19 @@ void ASTCDecoderPass::MakeDataBuffer() { | |||
| 431 | }); | 436 | }); |
| 432 | data_buffer_commit = memory_allocator.Commit(data_buffer, MemoryUsage::Upload); | 437 | data_buffer_commit = memory_allocator.Commit(data_buffer, MemoryUsage::Upload); |
| 433 | 438 | ||
| 434 | const auto staging_ref = | 439 | const auto staging_ref = staging_buffer_pool.Request(TOTAL_BUFFER_SIZE, MemoryUsage::Upload); |
| 435 | staging_buffer_pool.Request(sizeof(ASTC_BUFFER_DATA), MemoryUsage::Upload); | ||
| 436 | std::memcpy(staging_ref.mapped_span.data(), &ASTC_BUFFER_DATA, sizeof(ASTC_BUFFER_DATA)); | 440 | std::memcpy(staging_ref.mapped_span.data(), &ASTC_BUFFER_DATA, sizeof(ASTC_BUFFER_DATA)); |
| 437 | scheduler.Record([src = staging_ref.buffer, dst = *data_buffer](vk::CommandBuffer cmdbuf) { | 441 | // Tack on the swizzle table at the end of the buffer |
| 442 | std::memcpy(staging_ref.mapped_span.data() + sizeof(ASTC_BUFFER_DATA), &SWIZZLE_TABLE, | ||
| 443 | sizeof(SWIZZLE_TABLE)); | ||
| 444 | |||
| 445 | scheduler.Record([src = staging_ref.buffer, offset = staging_ref.offset, dst = *data_buffer, | ||
| 446 | TOTAL_BUFFER_SIZE](vk::CommandBuffer cmdbuf) { | ||
| 438 | cmdbuf.CopyBuffer(src, dst, | 447 | cmdbuf.CopyBuffer(src, dst, |
| 439 | VkBufferCopy{ | 448 | VkBufferCopy{ |
| 440 | .srcOffset = 0, | 449 | .srcOffset = offset, |
| 441 | .dstOffset = 0, | 450 | .dstOffset = 0, |
| 442 | .size = sizeof(ASTC_BUFFER_DATA), | 451 | .size = TOTAL_BUFFER_SIZE, |
| 443 | }); | 452 | }); |
| 444 | cmdbuf.PipelineBarrier( | 453 | cmdbuf.PipelineBarrier( |
| 445 | VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, | 454 | VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, |
| @@ -448,61 +457,58 @@ void ASTCDecoderPass::MakeDataBuffer() { | |||
| 448 | .pNext = nullptr, | 457 | .pNext = nullptr, |
| 449 | .srcAccessMask = 0, | 458 | .srcAccessMask = 0, |
| 450 | .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, | 459 | .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, |
| 451 | }, | 460 | }); |
| 452 | {}, {}); | ||
| 453 | }); | 461 | }); |
| 454 | } | 462 | } |
| 455 | 463 | ||
| 456 | void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map, | 464 | void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map, |
| 457 | std::span<const VideoCommon::SwizzleParameters> swizzles) { | 465 | std::span<const VideoCommon::SwizzleParameters> swizzles) { |
| 458 | using namespace VideoCommon::Accelerated; | 466 | using namespace VideoCommon::Accelerated; |
| 459 | const VideoCommon::Extent2D tile_size{ | 467 | const std::array<u32, 2> block_dims{ |
| 460 | .width = VideoCore::Surface::DefaultBlockWidth(image.info.format), | 468 | VideoCore::Surface::DefaultBlockWidth(image.info.format), |
| 461 | .height = VideoCore::Surface::DefaultBlockHeight(image.info.format), | 469 | VideoCore::Surface::DefaultBlockHeight(image.info.format), |
| 462 | }; | 470 | }; |
| 463 | scheduler.RequestOutsideRenderPassOperationContext(); | 471 | scheduler.RequestOutsideRenderPassOperationContext(); |
| 464 | if (!data_buffer) { | 472 | if (!data_buffer) { |
| 465 | MakeDataBuffer(); | 473 | MakeDataBuffer(); |
| 466 | } | 474 | } |
| 475 | const VkPipeline vk_pipeline = *pipeline; | ||
| 467 | const VkImageAspectFlags aspect_mask = image.AspectMask(); | 476 | const VkImageAspectFlags aspect_mask = image.AspectMask(); |
| 468 | const VkImage vk_image = image.Handle(); | 477 | const VkImage vk_image = image.Handle(); |
| 469 | const bool is_initialized = image.ExchangeInitialization(); | 478 | const bool is_initialized = image.ExchangeInitialization(); |
| 470 | scheduler.Record([vk_image, aspect_mask, is_initialized](vk::CommandBuffer cmdbuf) { | 479 | scheduler.Record( |
| 471 | const VkImageMemoryBarrier image_barrier{ | 480 | [vk_pipeline, vk_image, aspect_mask, is_initialized](vk::CommandBuffer cmdbuf) { |
| 472 | .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, | 481 | const VkImageMemoryBarrier image_barrier{ |
| 473 | .pNext = nullptr, | 482 | .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, |
| 474 | .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT, | 483 | .pNext = nullptr, |
| 475 | .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, | 484 | .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT, |
| 476 | .oldLayout = is_initialized ? VK_IMAGE_LAYOUT_GENERAL : VK_IMAGE_LAYOUT_UNDEFINED, | 485 | .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, |
| 477 | .newLayout = VK_IMAGE_LAYOUT_GENERAL, | 486 | .oldLayout = is_initialized ? VK_IMAGE_LAYOUT_GENERAL : VK_IMAGE_LAYOUT_UNDEFINED, |
| 478 | .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, | 487 | .newLayout = VK_IMAGE_LAYOUT_GENERAL, |
| 479 | .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, | 488 | .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, |
| 480 | .image = vk_image, | 489 | .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, |
| 481 | .subresourceRange{ | 490 | .image = vk_image, |
| 482 | .aspectMask = aspect_mask, | 491 | .subresourceRange{ |
| 483 | .baseMipLevel = 0, | 492 | .aspectMask = aspect_mask, |
| 484 | .levelCount = VK_REMAINING_MIP_LEVELS, | 493 | .baseMipLevel = 0, |
| 485 | .baseArrayLayer = 0, | 494 | .levelCount = VK_REMAINING_MIP_LEVELS, |
| 486 | .layerCount = VK_REMAINING_ARRAY_LAYERS, | 495 | .baseArrayLayer = 0, |
| 487 | }, | 496 | .layerCount = VK_REMAINING_ARRAY_LAYERS, |
| 488 | }; | 497 | }, |
| 489 | cmdbuf.PipelineBarrier(0, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, image_barrier); | 498 | }; |
| 490 | }); | 499 | cmdbuf.PipelineBarrier(is_initialized ? VK_PIPELINE_STAGE_ALL_COMMANDS_BIT : 0, |
| 491 | const std::array<u32, 2> block_dims{tile_size.width, tile_size.height}; | 500 | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, image_barrier); |
| 501 | cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, vk_pipeline); | ||
| 502 | }); | ||
| 492 | for (const VideoCommon::SwizzleParameters& swizzle : swizzles) { | 503 | for (const VideoCommon::SwizzleParameters& swizzle : swizzles) { |
| 493 | const size_t input_offset = swizzle.buffer_offset + map.offset; | 504 | const size_t input_offset = swizzle.buffer_offset + map.offset; |
| 494 | const u32 num_dispatches_x = Common::DivCeil(swizzle.num_tiles.width, 32U); | 505 | const u32 num_dispatches_x = Common::DivCeil(swizzle.num_tiles.width, 32U); |
| 495 | const u32 num_dispatches_y = Common::DivCeil(swizzle.num_tiles.height, 32U); | 506 | const u32 num_dispatches_y = Common::DivCeil(swizzle.num_tiles.height, 32U); |
| 496 | const u32 num_dispatches_z = image.info.resources.layers; | 507 | const u32 num_dispatches_z = image.info.resources.layers; |
| 497 | const std::array num_image_blocks{swizzle.num_tiles.width, swizzle.num_tiles.height}; | ||
| 498 | const u32 layer_image_size = | ||
| 499 | image.guest_size_bytes - static_cast<u32>(swizzle.buffer_offset); | ||
| 500 | 508 | ||
| 501 | update_descriptor_queue.Acquire(); | 509 | update_descriptor_queue.Acquire(); |
| 502 | update_descriptor_queue.AddBuffer(*data_buffer, | 510 | update_descriptor_queue.AddBuffer(map.buffer, input_offset, |
| 503 | offsetof(AstcBufferData, swizzle_table_buffer), | 511 | image.guest_size_bytes - swizzle.buffer_offset); |
| 504 | sizeof(AstcBufferData::swizzle_table_buffer)); | ||
| 505 | update_descriptor_queue.AddBuffer(map.buffer, input_offset, layer_image_size); | ||
| 506 | update_descriptor_queue.AddBuffer(*data_buffer, offsetof(AstcBufferData, encoding_values), | 512 | update_descriptor_queue.AddBuffer(*data_buffer, offsetof(AstcBufferData, encoding_values), |
| 507 | sizeof(AstcBufferData::encoding_values)); | 513 | sizeof(AstcBufferData::encoding_values)); |
| 508 | update_descriptor_queue.AddBuffer(*data_buffer, offsetof(AstcBufferData, replicate_6_to_8), | 514 | update_descriptor_queue.AddBuffer(*data_buffer, offsetof(AstcBufferData, replicate_6_to_8), |
| @@ -514,18 +520,28 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map, | |||
| 514 | update_descriptor_queue.AddBuffer(*data_buffer, | 520 | update_descriptor_queue.AddBuffer(*data_buffer, |
| 515 | offsetof(AstcBufferData, replicate_byte_to_16), | 521 | offsetof(AstcBufferData, replicate_byte_to_16), |
| 516 | sizeof(AstcBufferData::replicate_byte_to_16)); | 522 | sizeof(AstcBufferData::replicate_byte_to_16)); |
| 523 | update_descriptor_queue.AddBuffer(*data_buffer, sizeof(AstcBufferData), | ||
| 524 | sizeof(SWIZZLE_TABLE)); | ||
| 517 | update_descriptor_queue.AddImage(image.StorageImageView(swizzle.level)); | 525 | update_descriptor_queue.AddImage(image.StorageImageView(swizzle.level)); |
| 518 | 526 | ||
| 519 | const VkDescriptorSet set = CommitDescriptorSet(update_descriptor_queue); | 527 | const VkDescriptorSet set = CommitDescriptorSet(update_descriptor_queue); |
| 520 | const VkPipelineLayout vk_layout = *layout; | 528 | const VkPipelineLayout vk_layout = *layout; |
| 521 | const VkPipeline vk_pipeline = *pipeline; | 529 | |
| 522 | // To unswizzle the ASTC data | 530 | // To unswizzle the ASTC data |
| 523 | const auto params = MakeBlockLinearSwizzle2DParams(swizzle, image.info); | 531 | const auto params = MakeBlockLinearSwizzle2DParams(swizzle, image.info); |
| 524 | scheduler.Record([vk_layout, vk_pipeline, buffer = map.buffer, num_dispatches_x, | 532 | ASSERT(params.origin == (std::array<u32, 3>{0, 0, 0})); |
| 525 | num_dispatches_y, num_dispatches_z, num_image_blocks, block_dims, params, | 533 | ASSERT(params.destination == (std::array<s32, 3>{0, 0, 0})); |
| 526 | set, input_offset](vk::CommandBuffer cmdbuf) { | 534 | scheduler.Record([vk_layout, num_dispatches_x, num_dispatches_y, num_dispatches_z, |
| 527 | const AstcPushConstants uniforms{num_image_blocks, block_dims, params}; | 535 | block_dims, params, set](vk::CommandBuffer cmdbuf) { |
| 528 | cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, vk_pipeline); | 536 | const AstcPushConstants uniforms{ |
| 537 | .blocks_dims = block_dims, | ||
| 538 | .bytes_per_block_log2 = params.bytes_per_block_log2, | ||
| 539 | .layer_stride = params.layer_stride, | ||
| 540 | .block_size = params.block_size, | ||
| 541 | .x_shift = params.x_shift, | ||
| 542 | .block_height = params.block_height, | ||
| 543 | .block_height_mask = params.block_height_mask, | ||
| 544 | }; | ||
| 529 | cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, vk_layout, 0, set, {}); | 545 | cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, vk_layout, 0, set, {}); |
| 530 | cmdbuf.PushConstants(vk_layout, VK_SHADER_STAGE_COMPUTE_BIT, uniforms); | 546 | cmdbuf.PushConstants(vk_layout, VK_SHADER_STAGE_COMPUTE_BIT, uniforms); |
| 531 | cmdbuf.Dispatch(num_dispatches_x, num_dispatches_y, num_dispatches_z); | 547 | cmdbuf.Dispatch(num_dispatches_x, num_dispatches_y, num_dispatches_z); |
| @@ -550,7 +566,8 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map, | |||
| 550 | .layerCount = VK_REMAINING_ARRAY_LAYERS, | 566 | .layerCount = VK_REMAINING_ARRAY_LAYERS, |
| 551 | }, | 567 | }, |
| 552 | }; | 568 | }; |
| 553 | cmdbuf.PipelineBarrier(0, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, image_barrier); | 569 | cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, |
| 570 | VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, image_barrier); | ||
| 554 | }); | 571 | }); |
| 555 | } | 572 | } |
| 556 | 573 | ||
diff --git a/src/video_core/texture_cache/util.cpp b/src/video_core/texture_cache/util.cpp index 2c42d1449..c22dd0148 100644 --- a/src/video_core/texture_cache/util.cpp +++ b/src/video_core/texture_cache/util.cpp | |||
| @@ -47,7 +47,6 @@ | |||
| 47 | #include "video_core/texture_cache/formatter.h" | 47 | #include "video_core/texture_cache/formatter.h" |
| 48 | #include "video_core/texture_cache/samples_helper.h" | 48 | #include "video_core/texture_cache/samples_helper.h" |
| 49 | #include "video_core/texture_cache/util.h" | 49 | #include "video_core/texture_cache/util.h" |
| 50 | #include "video_core/textures/astc.h" | ||
| 51 | #include "video_core/textures/decoders.h" | 50 | #include "video_core/textures/decoders.h" |
| 52 | 51 | ||
| 53 | namespace VideoCommon { | 52 | namespace VideoCommon { |
| @@ -879,17 +878,8 @@ void ConvertImage(std::span<const u8> input, const ImageInfo& info, std::span<u8 | |||
| 879 | ASSERT(copy.image_extent == mip_size); | 878 | ASSERT(copy.image_extent == mip_size); |
| 880 | ASSERT(copy.buffer_row_length == Common::AlignUp(mip_size.width, tile_size.width)); | 879 | ASSERT(copy.buffer_row_length == Common::AlignUp(mip_size.width, tile_size.width)); |
| 881 | ASSERT(copy.buffer_image_height == Common::AlignUp(mip_size.height, tile_size.height)); | 880 | ASSERT(copy.buffer_image_height == Common::AlignUp(mip_size.height, tile_size.height)); |
| 882 | 881 | DecompressBC4(input.subspan(copy.buffer_offset), copy.image_extent, | |
| 883 | if (IsPixelFormatASTC(info.format)) { | 882 | output.subspan(output_offset)); |
| 884 | ASSERT(copy.image_extent.depth == 1); | ||
| 885 | Tegra::Texture::ASTC::Decompress(input.subspan(copy.buffer_offset), | ||
| 886 | copy.image_extent.width, copy.image_extent.height, | ||
| 887 | copy.image_subresource.num_layers, tile_size.width, | ||
| 888 | tile_size.height, output.subspan(output_offset)); | ||
| 889 | } else { | ||
| 890 | DecompressBC4(input.subspan(copy.buffer_offset), copy.image_extent, | ||
| 891 | output.subspan(output_offset)); | ||
| 892 | } | ||
| 893 | copy.buffer_offset = output_offset; | 883 | copy.buffer_offset = output_offset; |
| 894 | copy.buffer_row_length = mip_size.width; | 884 | copy.buffer_row_length = mip_size.width; |
| 895 | copy.buffer_image_height = mip_size.height; | 885 | copy.buffer_image_height = mip_size.height; |
diff --git a/src/video_core/textures/astc.cpp b/src/video_core/textures/astc.cpp deleted file mode 100644 index 3625b666c..000000000 --- a/src/video_core/textures/astc.cpp +++ /dev/null | |||
| @@ -1,1710 +0,0 @@ | |||
| 1 | // Copyright 2016 The University of North Carolina at Chapel Hill | ||
| 2 | // | ||
| 3 | // Licensed under the Apache License, Version 2.0 (the "License"); | ||
| 4 | // you may not use this file except in compliance with the License. | ||
| 5 | // You may obtain a copy of the License at | ||
| 6 | // | ||
| 7 | // http://www.apache.org/licenses/LICENSE-2.0 | ||
| 8 | // | ||
| 9 | // Unless required by applicable law or agreed to in writing, software | ||
| 10 | // distributed under the License is distributed on an "AS IS" BASIS, | ||
| 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| 12 | // See the License for the specific language governing permissions and | ||
| 13 | // limitations under the License. | ||
| 14 | // | ||
| 15 | // Please send all BUG REPORTS to <pavel@cs.unc.edu>. | ||
| 16 | // <http://gamma.cs.unc.edu/FasTC/> | ||
| 17 | |||
| 18 | #include <algorithm> | ||
| 19 | #include <cassert> | ||
| 20 | #include <cstring> | ||
| 21 | #include <span> | ||
| 22 | #include <vector> | ||
| 23 | |||
| 24 | #include <boost/container/static_vector.hpp> | ||
| 25 | |||
| 26 | #include "common/common_types.h" | ||
| 27 | |||
| 28 | #include "video_core/textures/astc.h" | ||
| 29 | |||
| 30 | namespace { | ||
| 31 | |||
| 32 | /// Count the number of bits set in a number. | ||
| 33 | constexpr u32 Popcnt(u32 n) { | ||
| 34 | u32 c = 0; | ||
| 35 | for (; n; c++) { | ||
| 36 | n &= n - 1; | ||
| 37 | } | ||
| 38 | return c; | ||
| 39 | } | ||
| 40 | |||
| 41 | } // Anonymous namespace | ||
| 42 | |||
| 43 | class InputBitStream { | ||
| 44 | public: | ||
| 45 | constexpr explicit InputBitStream(std::span<const u8> data, size_t start_offset = 0) | ||
| 46 | : cur_byte{data.data()}, total_bits{data.size()}, next_bit{start_offset % 8} {} | ||
| 47 | |||
| 48 | constexpr size_t GetBitsRead() const { | ||
| 49 | return bits_read; | ||
| 50 | } | ||
| 51 | |||
| 52 | constexpr bool ReadBit() { | ||
| 53 | if (bits_read >= total_bits * 8) { | ||
| 54 | return 0; | ||
| 55 | } | ||
| 56 | const bool bit = ((*cur_byte >> next_bit) & 1) != 0; | ||
| 57 | ++next_bit; | ||
| 58 | while (next_bit >= 8) { | ||
| 59 | next_bit -= 8; | ||
| 60 | ++cur_byte; | ||
| 61 | } | ||
| 62 | ++bits_read; | ||
| 63 | return bit; | ||
| 64 | } | ||
| 65 | |||
| 66 | constexpr u32 ReadBits(std::size_t nBits) { | ||
| 67 | u32 ret = 0; | ||
| 68 | for (std::size_t i = 0; i < nBits; ++i) { | ||
| 69 | ret |= (ReadBit() & 1) << i; | ||
| 70 | } | ||
| 71 | return ret; | ||
| 72 | } | ||
| 73 | |||
| 74 | template <std::size_t nBits> | ||
| 75 | constexpr u32 ReadBits() { | ||
| 76 | u32 ret = 0; | ||
| 77 | for (std::size_t i = 0; i < nBits; ++i) { | ||
| 78 | ret |= (ReadBit() & 1) << i; | ||
| 79 | } | ||
| 80 | return ret; | ||
| 81 | } | ||
| 82 | |||
| 83 | private: | ||
| 84 | const u8* cur_byte; | ||
| 85 | size_t total_bits = 0; | ||
| 86 | size_t next_bit = 0; | ||
| 87 | size_t bits_read = 0; | ||
| 88 | }; | ||
| 89 | |||
| 90 | class OutputBitStream { | ||
| 91 | public: | ||
| 92 | constexpr explicit OutputBitStream(u8* ptr, std::size_t bits = 0, std::size_t start_offset = 0) | ||
| 93 | : cur_byte{ptr}, num_bits{bits}, next_bit{start_offset % 8} {} | ||
| 94 | |||
| 95 | constexpr std::size_t GetBitsWritten() const { | ||
| 96 | return bits_written; | ||
| 97 | } | ||
| 98 | |||
| 99 | constexpr void WriteBitsR(u32 val, u32 nBits) { | ||
| 100 | for (u32 i = 0; i < nBits; i++) { | ||
| 101 | WriteBit((val >> (nBits - i - 1)) & 1); | ||
| 102 | } | ||
| 103 | } | ||
| 104 | |||
| 105 | constexpr void WriteBits(u32 val, u32 nBits) { | ||
| 106 | for (u32 i = 0; i < nBits; i++) { | ||
| 107 | WriteBit((val >> i) & 1); | ||
| 108 | } | ||
| 109 | } | ||
| 110 | |||
| 111 | private: | ||
| 112 | constexpr void WriteBit(bool b) { | ||
| 113 | if (bits_written >= num_bits) { | ||
| 114 | return; | ||
| 115 | } | ||
| 116 | |||
| 117 | const u32 mask = 1 << next_bit++; | ||
| 118 | |||
| 119 | // clear the bit | ||
| 120 | *cur_byte &= static_cast<u8>(~mask); | ||
| 121 | |||
| 122 | // Write the bit, if necessary | ||
| 123 | if (b) | ||
| 124 | *cur_byte |= static_cast<u8>(mask); | ||
| 125 | |||
| 126 | // Next byte? | ||
| 127 | if (next_bit >= 8) { | ||
| 128 | cur_byte += 1; | ||
| 129 | next_bit = 0; | ||
| 130 | } | ||
| 131 | } | ||
| 132 | |||
| 133 | u8* cur_byte; | ||
| 134 | std::size_t num_bits; | ||
| 135 | std::size_t bits_written = 0; | ||
| 136 | std::size_t next_bit = 0; | ||
| 137 | }; | ||
| 138 | |||
| 139 | template <typename IntType> | ||
| 140 | class Bits { | ||
| 141 | public: | ||
| 142 | explicit Bits(const IntType& v) : m_Bits(v) {} | ||
| 143 | |||
| 144 | Bits(const Bits&) = delete; | ||
| 145 | Bits& operator=(const Bits&) = delete; | ||
| 146 | |||
| 147 | u8 operator[](u32 bitPos) const { | ||
| 148 | return static_cast<u8>((m_Bits >> bitPos) & 1); | ||
| 149 | } | ||
| 150 | |||
| 151 | IntType operator()(u32 start, u32 end) const { | ||
| 152 | if (start == end) { | ||
| 153 | return (*this)[start]; | ||
| 154 | } else if (start > end) { | ||
| 155 | u32 t = start; | ||
| 156 | start = end; | ||
| 157 | end = t; | ||
| 158 | } | ||
| 159 | |||
| 160 | u64 mask = (1 << (end - start + 1)) - 1; | ||
| 161 | return (m_Bits >> start) & static_cast<IntType>(mask); | ||
| 162 | } | ||
| 163 | |||
| 164 | private: | ||
| 165 | const IntType& m_Bits; | ||
| 166 | }; | ||
| 167 | |||
| 168 | enum class IntegerEncoding { JustBits, Qus32, Trit }; | ||
| 169 | |||
| 170 | struct IntegerEncodedValue { | ||
| 171 | constexpr IntegerEncodedValue() = default; | ||
| 172 | |||
| 173 | constexpr IntegerEncodedValue(IntegerEncoding encoding_, u32 num_bits_) | ||
| 174 | : encoding{encoding_}, num_bits{num_bits_} {} | ||
| 175 | |||
| 176 | constexpr bool MatchesEncoding(const IntegerEncodedValue& other) const { | ||
| 177 | return encoding == other.encoding && num_bits == other.num_bits; | ||
| 178 | } | ||
| 179 | |||
| 180 | // Returns the number of bits required to encode nVals values. | ||
| 181 | u32 GetBitLength(u32 nVals) const { | ||
| 182 | u32 totalBits = num_bits * nVals; | ||
| 183 | if (encoding == IntegerEncoding::Trit) { | ||
| 184 | totalBits += (nVals * 8 + 4) / 5; | ||
| 185 | } else if (encoding == IntegerEncoding::Qus32) { | ||
| 186 | totalBits += (nVals * 7 + 2) / 3; | ||
| 187 | } | ||
| 188 | return totalBits; | ||
| 189 | } | ||
| 190 | |||
| 191 | IntegerEncoding encoding{}; | ||
| 192 | u32 num_bits = 0; | ||
| 193 | u32 bit_value = 0; | ||
| 194 | union { | ||
| 195 | u32 qus32_value = 0; | ||
| 196 | u32 trit_value; | ||
| 197 | }; | ||
| 198 | }; | ||
| 199 | using IntegerEncodedVector = boost::container::static_vector< | ||
| 200 | IntegerEncodedValue, 256, | ||
| 201 | boost::container::static_vector_options< | ||
| 202 | boost::container::inplace_alignment<alignof(IntegerEncodedValue)>, | ||
| 203 | boost::container::throw_on_overflow<false>>::type>; | ||
| 204 | |||
| 205 | static void DecodeTritBlock(InputBitStream& bits, IntegerEncodedVector& result, u32 nBitsPerValue) { | ||
| 206 | // Implement the algorithm in section C.2.12 | ||
| 207 | std::array<u32, 5> m; | ||
| 208 | std::array<u32, 5> t; | ||
| 209 | u32 T; | ||
| 210 | |||
| 211 | // Read the trit encoded block according to | ||
| 212 | // table C.2.14 | ||
| 213 | m[0] = bits.ReadBits(nBitsPerValue); | ||
| 214 | T = bits.ReadBits<2>(); | ||
| 215 | m[1] = bits.ReadBits(nBitsPerValue); | ||
| 216 | T |= bits.ReadBits<2>() << 2; | ||
| 217 | m[2] = bits.ReadBits(nBitsPerValue); | ||
| 218 | T |= bits.ReadBit() << 4; | ||
| 219 | m[3] = bits.ReadBits(nBitsPerValue); | ||
| 220 | T |= bits.ReadBits<2>() << 5; | ||
| 221 | m[4] = bits.ReadBits(nBitsPerValue); | ||
| 222 | T |= bits.ReadBit() << 7; | ||
| 223 | |||
| 224 | u32 C = 0; | ||
| 225 | |||
| 226 | Bits<u32> Tb(T); | ||
| 227 | if (Tb(2, 4) == 7) { | ||
| 228 | C = (Tb(5, 7) << 2) | Tb(0, 1); | ||
| 229 | t[4] = t[3] = 2; | ||
| 230 | } else { | ||
| 231 | C = Tb(0, 4); | ||
| 232 | if (Tb(5, 6) == 3) { | ||
| 233 | t[4] = 2; | ||
| 234 | t[3] = Tb[7]; | ||
| 235 | } else { | ||
| 236 | t[4] = Tb[7]; | ||
| 237 | t[3] = Tb(5, 6); | ||
| 238 | } | ||
| 239 | } | ||
| 240 | |||
| 241 | Bits<u32> Cb(C); | ||
| 242 | if (Cb(0, 1) == 3) { | ||
| 243 | t[2] = 2; | ||
| 244 | t[1] = Cb[4]; | ||
| 245 | t[0] = (Cb[3] << 1) | (Cb[2] & ~Cb[3]); | ||
| 246 | } else if (Cb(2, 3) == 3) { | ||
| 247 | t[2] = 2; | ||
| 248 | t[1] = 2; | ||
| 249 | t[0] = Cb(0, 1); | ||
| 250 | } else { | ||
| 251 | t[2] = Cb[4]; | ||
| 252 | t[1] = Cb(2, 3); | ||
| 253 | t[0] = (Cb[1] << 1) | (Cb[0] & ~Cb[1]); | ||
| 254 | } | ||
| 255 | |||
| 256 | for (std::size_t i = 0; i < 5; ++i) { | ||
| 257 | IntegerEncodedValue& val = result.emplace_back(IntegerEncoding::Trit, nBitsPerValue); | ||
| 258 | val.bit_value = m[i]; | ||
| 259 | val.trit_value = t[i]; | ||
| 260 | } | ||
| 261 | } | ||
| 262 | |||
| 263 | static void DecodeQus32Block(InputBitStream& bits, IntegerEncodedVector& result, | ||
| 264 | u32 nBitsPerValue) { | ||
| 265 | // Implement the algorithm in section C.2.12 | ||
| 266 | u32 m[3]; | ||
| 267 | u32 q[3]; | ||
| 268 | u32 Q; | ||
| 269 | |||
| 270 | // Read the trit encoded block according to | ||
| 271 | // table C.2.15 | ||
| 272 | m[0] = bits.ReadBits(nBitsPerValue); | ||
| 273 | Q = bits.ReadBits<3>(); | ||
| 274 | m[1] = bits.ReadBits(nBitsPerValue); | ||
| 275 | Q |= bits.ReadBits<2>() << 3; | ||
| 276 | m[2] = bits.ReadBits(nBitsPerValue); | ||
| 277 | Q |= bits.ReadBits<2>() << 5; | ||
| 278 | |||
| 279 | Bits<u32> Qb(Q); | ||
| 280 | if (Qb(1, 2) == 3 && Qb(5, 6) == 0) { | ||
| 281 | q[0] = q[1] = 4; | ||
| 282 | q[2] = (Qb[0] << 2) | ((Qb[4] & ~Qb[0]) << 1) | (Qb[3] & ~Qb[0]); | ||
| 283 | } else { | ||
| 284 | u32 C = 0; | ||
| 285 | if (Qb(1, 2) == 3) { | ||
| 286 | q[2] = 4; | ||
| 287 | C = (Qb(3, 4) << 3) | ((~Qb(5, 6) & 3) << 1) | Qb[0]; | ||
| 288 | } else { | ||
| 289 | q[2] = Qb(5, 6); | ||
| 290 | C = Qb(0, 4); | ||
| 291 | } | ||
| 292 | |||
| 293 | Bits<u32> Cb(C); | ||
| 294 | if (Cb(0, 2) == 5) { | ||
| 295 | q[1] = 4; | ||
| 296 | q[0] = Cb(3, 4); | ||
| 297 | } else { | ||
| 298 | q[1] = Cb(3, 4); | ||
| 299 | q[0] = Cb(0, 2); | ||
| 300 | } | ||
| 301 | } | ||
| 302 | |||
| 303 | for (std::size_t i = 0; i < 3; ++i) { | ||
| 304 | IntegerEncodedValue& val = result.emplace_back(IntegerEncoding::Qus32, nBitsPerValue); | ||
| 305 | val.bit_value = m[i]; | ||
| 306 | val.qus32_value = q[i]; | ||
| 307 | } | ||
| 308 | } | ||
| 309 | |||
| 310 | // Returns a new instance of this struct that corresponds to the | ||
| 311 | // can take no more than maxval values | ||
| 312 | static constexpr IntegerEncodedValue CreateEncoding(u32 maxVal) { | ||
| 313 | while (maxVal > 0) { | ||
| 314 | u32 check = maxVal + 1; | ||
| 315 | |||
| 316 | // Is maxVal a power of two? | ||
| 317 | if (!(check & (check - 1))) { | ||
| 318 | return IntegerEncodedValue(IntegerEncoding::JustBits, Popcnt(maxVal)); | ||
| 319 | } | ||
| 320 | |||
| 321 | // Is maxVal of the type 3*2^n - 1? | ||
| 322 | if ((check % 3 == 0) && !((check / 3) & ((check / 3) - 1))) { | ||
| 323 | return IntegerEncodedValue(IntegerEncoding::Trit, Popcnt(check / 3 - 1)); | ||
| 324 | } | ||
| 325 | |||
| 326 | // Is maxVal of the type 5*2^n - 1? | ||
| 327 | if ((check % 5 == 0) && !((check / 5) & ((check / 5) - 1))) { | ||
| 328 | return IntegerEncodedValue(IntegerEncoding::Qus32, Popcnt(check / 5 - 1)); | ||
| 329 | } | ||
| 330 | |||
| 331 | // Apparently it can't be represented with a bounded integer sequence... | ||
| 332 | // just iterate. | ||
| 333 | maxVal--; | ||
| 334 | } | ||
| 335 | return IntegerEncodedValue(IntegerEncoding::JustBits, 0); | ||
| 336 | } | ||
| 337 | |||
| 338 | static constexpr std::array<IntegerEncodedValue, 256> MakeEncodedValues() { | ||
| 339 | std::array<IntegerEncodedValue, 256> encodings{}; | ||
| 340 | for (std::size_t i = 0; i < encodings.size(); ++i) { | ||
| 341 | encodings[i] = CreateEncoding(static_cast<u32>(i)); | ||
| 342 | } | ||
| 343 | return encodings; | ||
| 344 | } | ||
| 345 | |||
| 346 | static constexpr std::array EncodingsValues = MakeEncodedValues(); | ||
| 347 | |||
| 348 | // Fills result with the values that are encoded in the given | ||
| 349 | // bitstream. We must know beforehand what the maximum possible | ||
| 350 | // value is, and how many values we're decoding. | ||
| 351 | static void DecodeIntegerSequence(IntegerEncodedVector& result, InputBitStream& bits, u32 maxRange, | ||
| 352 | u32 nValues) { | ||
| 353 | // Determine encoding parameters | ||
| 354 | IntegerEncodedValue val = EncodingsValues[maxRange]; | ||
| 355 | |||
| 356 | // Start decoding | ||
| 357 | u32 nValsDecoded = 0; | ||
| 358 | while (nValsDecoded < nValues) { | ||
| 359 | switch (val.encoding) { | ||
| 360 | case IntegerEncoding::Qus32: | ||
| 361 | DecodeQus32Block(bits, result, val.num_bits); | ||
| 362 | nValsDecoded += 3; | ||
| 363 | break; | ||
| 364 | |||
| 365 | case IntegerEncoding::Trit: | ||
| 366 | DecodeTritBlock(bits, result, val.num_bits); | ||
| 367 | nValsDecoded += 5; | ||
| 368 | break; | ||
| 369 | |||
| 370 | case IntegerEncoding::JustBits: | ||
| 371 | val.bit_value = bits.ReadBits(val.num_bits); | ||
| 372 | result.push_back(val); | ||
| 373 | nValsDecoded++; | ||
| 374 | break; | ||
| 375 | } | ||
| 376 | } | ||
| 377 | } | ||
| 378 | |||
| 379 | namespace ASTCC { | ||
| 380 | |||
| 381 | struct TexelWeightParams { | ||
| 382 | u32 m_Width = 0; | ||
| 383 | u32 m_Height = 0; | ||
| 384 | bool m_bDualPlane = false; | ||
| 385 | u32 m_MaxWeight = 0; | ||
| 386 | bool m_bError = false; | ||
| 387 | bool m_bVoidExtentLDR = false; | ||
| 388 | bool m_bVoidExtentHDR = false; | ||
| 389 | |||
| 390 | u32 GetPackedBitSize() const { | ||
| 391 | // How many indices do we have? | ||
| 392 | u32 nIdxs = m_Height * m_Width; | ||
| 393 | if (m_bDualPlane) { | ||
| 394 | nIdxs *= 2; | ||
| 395 | } | ||
| 396 | |||
| 397 | return EncodingsValues[m_MaxWeight].GetBitLength(nIdxs); | ||
| 398 | } | ||
| 399 | |||
| 400 | u32 GetNumWeightValues() const { | ||
| 401 | u32 ret = m_Width * m_Height; | ||
| 402 | if (m_bDualPlane) { | ||
| 403 | ret *= 2; | ||
| 404 | } | ||
| 405 | return ret; | ||
| 406 | } | ||
| 407 | }; | ||
| 408 | |||
| 409 | static TexelWeightParams DecodeBlockInfo(InputBitStream& strm) { | ||
| 410 | TexelWeightParams params; | ||
| 411 | |||
| 412 | // Read the entire block mode all at once | ||
| 413 | u16 modeBits = static_cast<u16>(strm.ReadBits<11>()); | ||
| 414 | |||
| 415 | // Does this match the void extent block mode? | ||
| 416 | if ((modeBits & 0x01FF) == 0x1FC) { | ||
| 417 | if (modeBits & 0x200) { | ||
| 418 | params.m_bVoidExtentHDR = true; | ||
| 419 | } else { | ||
| 420 | params.m_bVoidExtentLDR = true; | ||
| 421 | } | ||
| 422 | |||
| 423 | // Next two bits must be one. | ||
| 424 | if (!(modeBits & 0x400) || !strm.ReadBit()) { | ||
| 425 | params.m_bError = true; | ||
| 426 | } | ||
| 427 | |||
| 428 | return params; | ||
| 429 | } | ||
| 430 | |||
| 431 | // First check if the last four bits are zero | ||
| 432 | if ((modeBits & 0xF) == 0) { | ||
| 433 | params.m_bError = true; | ||
| 434 | return params; | ||
| 435 | } | ||
| 436 | |||
| 437 | // If the last two bits are zero, then if bits | ||
| 438 | // [6-8] are all ones, this is also reserved. | ||
| 439 | if ((modeBits & 0x3) == 0 && (modeBits & 0x1C0) == 0x1C0) { | ||
| 440 | params.m_bError = true; | ||
| 441 | return params; | ||
| 442 | } | ||
| 443 | |||
| 444 | // Otherwise, there is no error... Figure out the layout | ||
| 445 | // of the block mode. Layout is determined by a number | ||
| 446 | // between 0 and 9 corresponding to table C.2.8 of the | ||
| 447 | // ASTC spec. | ||
| 448 | u32 layout = 0; | ||
| 449 | |||
| 450 | if ((modeBits & 0x1) || (modeBits & 0x2)) { | ||
| 451 | // layout is in [0-4] | ||
| 452 | if (modeBits & 0x8) { | ||
| 453 | // layout is in [2-4] | ||
| 454 | if (modeBits & 0x4) { | ||
| 455 | // layout is in [3-4] | ||
| 456 | if (modeBits & 0x100) { | ||
| 457 | layout = 4; | ||
| 458 | } else { | ||
| 459 | layout = 3; | ||
| 460 | } | ||
| 461 | } else { | ||
| 462 | layout = 2; | ||
| 463 | } | ||
| 464 | } else { | ||
| 465 | // layout is in [0-1] | ||
| 466 | if (modeBits & 0x4) { | ||
| 467 | layout = 1; | ||
| 468 | } else { | ||
| 469 | layout = 0; | ||
| 470 | } | ||
| 471 | } | ||
| 472 | } else { | ||
| 473 | // layout is in [5-9] | ||
| 474 | if (modeBits & 0x100) { | ||
| 475 | // layout is in [7-9] | ||
| 476 | if (modeBits & 0x80) { | ||
| 477 | // layout is in [7-8] | ||
| 478 | assert((modeBits & 0x40) == 0U); | ||
| 479 | if (modeBits & 0x20) { | ||
| 480 | layout = 8; | ||
| 481 | } else { | ||
| 482 | layout = 7; | ||
| 483 | } | ||
| 484 | } else { | ||
| 485 | layout = 9; | ||
| 486 | } | ||
| 487 | } else { | ||
| 488 | // layout is in [5-6] | ||
| 489 | if (modeBits & 0x80) { | ||
| 490 | layout = 6; | ||
| 491 | } else { | ||
| 492 | layout = 5; | ||
| 493 | } | ||
| 494 | } | ||
| 495 | } | ||
| 496 | |||
| 497 | assert(layout < 10); | ||
| 498 | |||
| 499 | // Determine R | ||
| 500 | u32 R = !!(modeBits & 0x10); | ||
| 501 | if (layout < 5) { | ||
| 502 | R |= (modeBits & 0x3) << 1; | ||
| 503 | } else { | ||
| 504 | R |= (modeBits & 0xC) >> 1; | ||
| 505 | } | ||
| 506 | assert(2 <= R && R <= 7); | ||
| 507 | |||
| 508 | // Determine width & height | ||
| 509 | switch (layout) { | ||
| 510 | case 0: { | ||
| 511 | u32 A = (modeBits >> 5) & 0x3; | ||
| 512 | u32 B = (modeBits >> 7) & 0x3; | ||
| 513 | params.m_Width = B + 4; | ||
| 514 | params.m_Height = A + 2; | ||
| 515 | break; | ||
| 516 | } | ||
| 517 | |||
| 518 | case 1: { | ||
| 519 | u32 A = (modeBits >> 5) & 0x3; | ||
| 520 | u32 B = (modeBits >> 7) & 0x3; | ||
| 521 | params.m_Width = B + 8; | ||
| 522 | params.m_Height = A + 2; | ||
| 523 | break; | ||
| 524 | } | ||
| 525 | |||
| 526 | case 2: { | ||
| 527 | u32 A = (modeBits >> 5) & 0x3; | ||
| 528 | u32 B = (modeBits >> 7) & 0x3; | ||
| 529 | params.m_Width = A + 2; | ||
| 530 | params.m_Height = B + 8; | ||
| 531 | break; | ||
| 532 | } | ||
| 533 | |||
| 534 | case 3: { | ||
| 535 | u32 A = (modeBits >> 5) & 0x3; | ||
| 536 | u32 B = (modeBits >> 7) & 0x1; | ||
| 537 | params.m_Width = A + 2; | ||
| 538 | params.m_Height = B + 6; | ||
| 539 | break; | ||
| 540 | } | ||
| 541 | |||
| 542 | case 4: { | ||
| 543 | u32 A = (modeBits >> 5) & 0x3; | ||
| 544 | u32 B = (modeBits >> 7) & 0x1; | ||
| 545 | params.m_Width = B + 2; | ||
| 546 | params.m_Height = A + 2; | ||
| 547 | break; | ||
| 548 | } | ||
| 549 | |||
| 550 | case 5: { | ||
| 551 | u32 A = (modeBits >> 5) & 0x3; | ||
| 552 | params.m_Width = 12; | ||
| 553 | params.m_Height = A + 2; | ||
| 554 | break; | ||
| 555 | } | ||
| 556 | |||
| 557 | case 6: { | ||
| 558 | u32 A = (modeBits >> 5) & 0x3; | ||
| 559 | params.m_Width = A + 2; | ||
| 560 | params.m_Height = 12; | ||
| 561 | break; | ||
| 562 | } | ||
| 563 | |||
| 564 | case 7: { | ||
| 565 | params.m_Width = 6; | ||
| 566 | params.m_Height = 10; | ||
| 567 | break; | ||
| 568 | } | ||
| 569 | |||
| 570 | case 8: { | ||
| 571 | params.m_Width = 10; | ||
| 572 | params.m_Height = 6; | ||
| 573 | break; | ||
| 574 | } | ||
| 575 | |||
| 576 | case 9: { | ||
| 577 | u32 A = (modeBits >> 5) & 0x3; | ||
| 578 | u32 B = (modeBits >> 9) & 0x3; | ||
| 579 | params.m_Width = A + 6; | ||
| 580 | params.m_Height = B + 6; | ||
| 581 | break; | ||
| 582 | } | ||
| 583 | |||
| 584 | default: | ||
| 585 | assert(false && "Don't know this layout..."); | ||
| 586 | params.m_bError = true; | ||
| 587 | break; | ||
| 588 | } | ||
| 589 | |||
| 590 | // Determine whether or not we're using dual planes | ||
| 591 | // and/or high precision layouts. | ||
| 592 | bool D = (layout != 9) && (modeBits & 0x400); | ||
| 593 | bool H = (layout != 9) && (modeBits & 0x200); | ||
| 594 | |||
| 595 | if (H) { | ||
| 596 | const u32 maxWeights[6] = {9, 11, 15, 19, 23, 31}; | ||
| 597 | params.m_MaxWeight = maxWeights[R - 2]; | ||
| 598 | } else { | ||
| 599 | const u32 maxWeights[6] = {1, 2, 3, 4, 5, 7}; | ||
| 600 | params.m_MaxWeight = maxWeights[R - 2]; | ||
| 601 | } | ||
| 602 | |||
| 603 | params.m_bDualPlane = D; | ||
| 604 | |||
| 605 | return params; | ||
| 606 | } | ||
| 607 | |||
| 608 | static void FillVoidExtentLDR(InputBitStream& strm, std::span<u32> outBuf, u32 blockWidth, | ||
| 609 | u32 blockHeight) { | ||
| 610 | // Don't actually care about the void extent, just read the bits... | ||
| 611 | for (s32 i = 0; i < 4; ++i) { | ||
| 612 | strm.ReadBits<13>(); | ||
| 613 | } | ||
| 614 | |||
| 615 | // Decode the RGBA components and renormalize them to the range [0, 255] | ||
| 616 | u16 r = static_cast<u16>(strm.ReadBits<16>()); | ||
| 617 | u16 g = static_cast<u16>(strm.ReadBits<16>()); | ||
| 618 | u16 b = static_cast<u16>(strm.ReadBits<16>()); | ||
| 619 | u16 a = static_cast<u16>(strm.ReadBits<16>()); | ||
| 620 | |||
| 621 | u32 rgba = (r >> 8) | (g & 0xFF00) | (static_cast<u32>(b) & 0xFF00) << 8 | | ||
| 622 | (static_cast<u32>(a) & 0xFF00) << 16; | ||
| 623 | |||
| 624 | for (u32 j = 0; j < blockHeight; j++) { | ||
| 625 | for (u32 i = 0; i < blockWidth; i++) { | ||
| 626 | outBuf[j * blockWidth + i] = rgba; | ||
| 627 | } | ||
| 628 | } | ||
| 629 | } | ||
| 630 | |||
| 631 | static void FillError(std::span<u32> outBuf, u32 blockWidth, u32 blockHeight) { | ||
| 632 | for (u32 j = 0; j < blockHeight; j++) { | ||
| 633 | for (u32 i = 0; i < blockWidth; i++) { | ||
| 634 | outBuf[j * blockWidth + i] = 0xFFFF00FF; | ||
| 635 | } | ||
| 636 | } | ||
| 637 | } | ||
| 638 | |||
| 639 | // Replicates low numBits such that [(toBit - 1):(toBit - 1 - fromBit)] | ||
| 640 | // is the same as [(numBits - 1):0] and repeats all the way down. | ||
| 641 | template <typename IntType> | ||
| 642 | static constexpr IntType Replicate(IntType val, u32 numBits, u32 toBit) { | ||
| 643 | if (numBits == 0) { | ||
| 644 | return 0; | ||
| 645 | } | ||
| 646 | if (toBit == 0) { | ||
| 647 | return 0; | ||
| 648 | } | ||
| 649 | const IntType v = val & static_cast<IntType>((1 << numBits) - 1); | ||
| 650 | IntType res = v; | ||
| 651 | u32 reslen = numBits; | ||
| 652 | while (reslen < toBit) { | ||
| 653 | u32 comp = 0; | ||
| 654 | if (numBits > toBit - reslen) { | ||
| 655 | u32 newshift = toBit - reslen; | ||
| 656 | comp = numBits - newshift; | ||
| 657 | numBits = newshift; | ||
| 658 | } | ||
| 659 | res = static_cast<IntType>(res << numBits); | ||
| 660 | res = static_cast<IntType>(res | (v >> comp)); | ||
| 661 | reslen += numBits; | ||
| 662 | } | ||
| 663 | return res; | ||
| 664 | } | ||
| 665 | |||
| 666 | static constexpr std::size_t NumReplicateEntries(u32 num_bits) { | ||
| 667 | return std::size_t(1) << num_bits; | ||
| 668 | } | ||
| 669 | |||
| 670 | template <typename IntType, u32 num_bits, u32 to_bit> | ||
| 671 | static constexpr auto MakeReplicateTable() { | ||
| 672 | std::array<IntType, NumReplicateEntries(num_bits)> table{}; | ||
| 673 | for (IntType value = 0; value < static_cast<IntType>(std::size(table)); ++value) { | ||
| 674 | table[value] = Replicate(value, num_bits, to_bit); | ||
| 675 | } | ||
| 676 | return table; | ||
| 677 | } | ||
| 678 | |||
| 679 | static constexpr auto REPLICATE_BYTE_TO_16_TABLE = MakeReplicateTable<u32, 8, 16>(); | ||
| 680 | static constexpr u32 ReplicateByteTo16(std::size_t value) { | ||
| 681 | return REPLICATE_BYTE_TO_16_TABLE[value]; | ||
| 682 | } | ||
| 683 | |||
| 684 | static constexpr auto REPLICATE_BIT_TO_7_TABLE = MakeReplicateTable<u32, 1, 7>(); | ||
| 685 | static constexpr u32 ReplicateBitTo7(std::size_t value) { | ||
| 686 | return REPLICATE_BIT_TO_7_TABLE[value]; | ||
| 687 | } | ||
| 688 | |||
| 689 | static constexpr auto REPLICATE_BIT_TO_9_TABLE = MakeReplicateTable<u32, 1, 9>(); | ||
| 690 | static constexpr u32 ReplicateBitTo9(std::size_t value) { | ||
| 691 | return REPLICATE_BIT_TO_9_TABLE[value]; | ||
| 692 | } | ||
| 693 | |||
| 694 | static constexpr auto REPLICATE_1_BIT_TO_8_TABLE = MakeReplicateTable<u32, 1, 8>(); | ||
| 695 | static constexpr auto REPLICATE_2_BIT_TO_8_TABLE = MakeReplicateTable<u32, 2, 8>(); | ||
| 696 | static constexpr auto REPLICATE_3_BIT_TO_8_TABLE = MakeReplicateTable<u32, 3, 8>(); | ||
| 697 | static constexpr auto REPLICATE_4_BIT_TO_8_TABLE = MakeReplicateTable<u32, 4, 8>(); | ||
| 698 | static constexpr auto REPLICATE_5_BIT_TO_8_TABLE = MakeReplicateTable<u32, 5, 8>(); | ||
| 699 | static constexpr auto REPLICATE_6_BIT_TO_8_TABLE = MakeReplicateTable<u32, 6, 8>(); | ||
| 700 | static constexpr auto REPLICATE_7_BIT_TO_8_TABLE = MakeReplicateTable<u32, 7, 8>(); | ||
| 701 | static constexpr auto REPLICATE_8_BIT_TO_8_TABLE = MakeReplicateTable<u32, 8, 8>(); | ||
| 702 | /// Use a precompiled table with the most common usages, if it's not in the expected range, fallback | ||
| 703 | /// to the runtime implementation | ||
| 704 | static constexpr u32 FastReplicateTo8(u32 value, u32 num_bits) { | ||
| 705 | switch (num_bits) { | ||
| 706 | case 1: | ||
| 707 | return REPLICATE_1_BIT_TO_8_TABLE[value]; | ||
| 708 | case 2: | ||
| 709 | return REPLICATE_2_BIT_TO_8_TABLE[value]; | ||
| 710 | case 3: | ||
| 711 | return REPLICATE_3_BIT_TO_8_TABLE[value]; | ||
| 712 | case 4: | ||
| 713 | return REPLICATE_4_BIT_TO_8_TABLE[value]; | ||
| 714 | case 5: | ||
| 715 | return REPLICATE_5_BIT_TO_8_TABLE[value]; | ||
| 716 | case 6: | ||
| 717 | return REPLICATE_6_BIT_TO_8_TABLE[value]; | ||
| 718 | case 7: | ||
| 719 | return REPLICATE_7_BIT_TO_8_TABLE[value]; | ||
| 720 | case 8: | ||
| 721 | return REPLICATE_8_BIT_TO_8_TABLE[value]; | ||
| 722 | default: | ||
| 723 | return Replicate(value, num_bits, 8); | ||
| 724 | } | ||
| 725 | } | ||
| 726 | |||
| 727 | static constexpr auto REPLICATE_1_BIT_TO_6_TABLE = MakeReplicateTable<u32, 1, 6>(); | ||
| 728 | static constexpr auto REPLICATE_2_BIT_TO_6_TABLE = MakeReplicateTable<u32, 2, 6>(); | ||
| 729 | static constexpr auto REPLICATE_3_BIT_TO_6_TABLE = MakeReplicateTable<u32, 3, 6>(); | ||
| 730 | static constexpr auto REPLICATE_4_BIT_TO_6_TABLE = MakeReplicateTable<u32, 4, 6>(); | ||
| 731 | static constexpr auto REPLICATE_5_BIT_TO_6_TABLE = MakeReplicateTable<u32, 5, 6>(); | ||
| 732 | static constexpr u32 FastReplicateTo6(u32 value, u32 num_bits) { | ||
| 733 | switch (num_bits) { | ||
| 734 | case 1: | ||
| 735 | return REPLICATE_1_BIT_TO_6_TABLE[value]; | ||
| 736 | case 2: | ||
| 737 | return REPLICATE_2_BIT_TO_6_TABLE[value]; | ||
| 738 | case 3: | ||
| 739 | return REPLICATE_3_BIT_TO_6_TABLE[value]; | ||
| 740 | case 4: | ||
| 741 | return REPLICATE_4_BIT_TO_6_TABLE[value]; | ||
| 742 | case 5: | ||
| 743 | return REPLICATE_5_BIT_TO_6_TABLE[value]; | ||
| 744 | default: | ||
| 745 | return Replicate(value, num_bits, 6); | ||
| 746 | } | ||
| 747 | } | ||
| 748 | |||
| 749 | class Pixel { | ||
| 750 | protected: | ||
| 751 | using ChannelType = s16; | ||
| 752 | u8 m_BitDepth[4] = {8, 8, 8, 8}; | ||
| 753 | s16 color[4] = {}; | ||
| 754 | |||
| 755 | public: | ||
| 756 | Pixel() = default; | ||
| 757 | Pixel(u32 a, u32 r, u32 g, u32 b, u32 bitDepth = 8) | ||
| 758 | : m_BitDepth{u8(bitDepth), u8(bitDepth), u8(bitDepth), u8(bitDepth)}, | ||
| 759 | color{static_cast<ChannelType>(a), static_cast<ChannelType>(r), | ||
| 760 | static_cast<ChannelType>(g), static_cast<ChannelType>(b)} {} | ||
| 761 | |||
| 762 | // Changes the depth of each pixel. This scales the values to | ||
| 763 | // the appropriate bit depth by either truncating the least | ||
| 764 | // significant bits when going from larger to smaller bit depth | ||
| 765 | // or by repeating the most significant bits when going from | ||
| 766 | // smaller to larger bit depths. | ||
| 767 | void ChangeBitDepth() { | ||
| 768 | for (u32 i = 0; i < 4; i++) { | ||
| 769 | Component(i) = ChangeBitDepth(Component(i), m_BitDepth[i]); | ||
| 770 | m_BitDepth[i] = 8; | ||
| 771 | } | ||
| 772 | } | ||
| 773 | |||
| 774 | template <typename IntType> | ||
| 775 | static float ConvertChannelToFloat(IntType channel, u8 bitDepth) { | ||
| 776 | float denominator = static_cast<float>((1 << bitDepth) - 1); | ||
| 777 | return static_cast<float>(channel) / denominator; | ||
| 778 | } | ||
| 779 | |||
| 780 | // Changes the bit depth of a single component. See the comment | ||
| 781 | // above for how we do this. | ||
| 782 | static ChannelType ChangeBitDepth(Pixel::ChannelType val, u8 oldDepth) { | ||
| 783 | assert(oldDepth <= 8); | ||
| 784 | |||
| 785 | if (oldDepth == 8) { | ||
| 786 | // Do nothing | ||
| 787 | return val; | ||
| 788 | } else if (oldDepth == 0) { | ||
| 789 | return static_cast<ChannelType>((1 << 8) - 1); | ||
| 790 | } else if (8 > oldDepth) { | ||
| 791 | return static_cast<ChannelType>(FastReplicateTo8(static_cast<u32>(val), oldDepth)); | ||
| 792 | } else { | ||
| 793 | // oldDepth > newDepth | ||
| 794 | const u8 bitsWasted = static_cast<u8>(oldDepth - 8); | ||
| 795 | u16 v = static_cast<u16>(val); | ||
| 796 | v = static_cast<u16>((v + (1 << (bitsWasted - 1))) >> bitsWasted); | ||
| 797 | v = ::std::min<u16>(::std::max<u16>(0, v), static_cast<u16>((1 << 8) - 1)); | ||
| 798 | return static_cast<u8>(v); | ||
| 799 | } | ||
| 800 | |||
| 801 | assert(false && "We shouldn't get here."); | ||
| 802 | return 0; | ||
| 803 | } | ||
| 804 | |||
| 805 | const ChannelType& A() const { | ||
| 806 | return color[0]; | ||
| 807 | } | ||
| 808 | ChannelType& A() { | ||
| 809 | return color[0]; | ||
| 810 | } | ||
| 811 | const ChannelType& R() const { | ||
| 812 | return color[1]; | ||
| 813 | } | ||
| 814 | ChannelType& R() { | ||
| 815 | return color[1]; | ||
| 816 | } | ||
| 817 | const ChannelType& G() const { | ||
| 818 | return color[2]; | ||
| 819 | } | ||
| 820 | ChannelType& G() { | ||
| 821 | return color[2]; | ||
| 822 | } | ||
| 823 | const ChannelType& B() const { | ||
| 824 | return color[3]; | ||
| 825 | } | ||
| 826 | ChannelType& B() { | ||
| 827 | return color[3]; | ||
| 828 | } | ||
| 829 | const ChannelType& Component(u32 idx) const { | ||
| 830 | return color[idx]; | ||
| 831 | } | ||
| 832 | ChannelType& Component(u32 idx) { | ||
| 833 | return color[idx]; | ||
| 834 | } | ||
| 835 | |||
| 836 | void GetBitDepth(u8 (&outDepth)[4]) const { | ||
| 837 | for (s32 i = 0; i < 4; i++) { | ||
| 838 | outDepth[i] = m_BitDepth[i]; | ||
| 839 | } | ||
| 840 | } | ||
| 841 | |||
| 842 | // Take all of the components, transform them to their 8-bit variants, | ||
| 843 | // and then pack each channel into an R8G8B8A8 32-bit integer. We assume | ||
| 844 | // that the architecture is little-endian, so the alpha channel will end | ||
| 845 | // up in the most-significant byte. | ||
| 846 | u32 Pack() const { | ||
| 847 | Pixel eightBit(*this); | ||
| 848 | eightBit.ChangeBitDepth(); | ||
| 849 | |||
| 850 | u32 r = 0; | ||
| 851 | r |= eightBit.A(); | ||
| 852 | r <<= 8; | ||
| 853 | r |= eightBit.B(); | ||
| 854 | r <<= 8; | ||
| 855 | r |= eightBit.G(); | ||
| 856 | r <<= 8; | ||
| 857 | r |= eightBit.R(); | ||
| 858 | return r; | ||
| 859 | } | ||
| 860 | |||
| 861 | // Clamps the pixel to the range [0,255] | ||
| 862 | void ClampByte() { | ||
| 863 | for (u32 i = 0; i < 4; i++) { | ||
| 864 | color[i] = (color[i] < 0) ? 0 : ((color[i] > 255) ? 255 : color[i]); | ||
| 865 | } | ||
| 866 | } | ||
| 867 | |||
| 868 | void MakeOpaque() { | ||
| 869 | A() = 255; | ||
| 870 | } | ||
| 871 | }; | ||
| 872 | |||
| 873 | static void DecodeColorValues(u32* out, std::span<u8> data, const u32* modes, const u32 nPartitions, | ||
| 874 | const u32 nBitsForColorData) { | ||
| 875 | // First figure out how many color values we have | ||
| 876 | u32 nValues = 0; | ||
| 877 | for (u32 i = 0; i < nPartitions; i++) { | ||
| 878 | nValues += ((modes[i] >> 2) + 1) << 1; | ||
| 879 | } | ||
| 880 | |||
| 881 | // Then based on the number of values and the remaining number of bits, | ||
| 882 | // figure out the max value for each of them... | ||
| 883 | u32 range = 256; | ||
| 884 | while (--range > 0) { | ||
| 885 | IntegerEncodedValue val = EncodingsValues[range]; | ||
| 886 | u32 bitLength = val.GetBitLength(nValues); | ||
| 887 | if (bitLength <= nBitsForColorData) { | ||
| 888 | // Find the smallest possible range that matches the given encoding | ||
| 889 | while (--range > 0) { | ||
| 890 | IntegerEncodedValue newval = EncodingsValues[range]; | ||
| 891 | if (!newval.MatchesEncoding(val)) { | ||
| 892 | break; | ||
| 893 | } | ||
| 894 | } | ||
| 895 | |||
| 896 | // Return to last matching range. | ||
| 897 | range++; | ||
| 898 | break; | ||
| 899 | } | ||
| 900 | } | ||
| 901 | |||
| 902 | // We now have enough to decode our integer sequence. | ||
| 903 | IntegerEncodedVector decodedColorValues; | ||
| 904 | |||
| 905 | InputBitStream colorStream(data, 0); | ||
| 906 | DecodeIntegerSequence(decodedColorValues, colorStream, range, nValues); | ||
| 907 | |||
| 908 | // Once we have the decoded values, we need to dequantize them to the 0-255 range | ||
| 909 | // This procedure is outlined in ASTC spec C.2.13 | ||
| 910 | u32 outIdx = 0; | ||
| 911 | for (auto itr = decodedColorValues.begin(); itr != decodedColorValues.end(); ++itr) { | ||
| 912 | // Have we already decoded all that we need? | ||
| 913 | if (outIdx >= nValues) { | ||
| 914 | break; | ||
| 915 | } | ||
| 916 | |||
| 917 | const IntegerEncodedValue& val = *itr; | ||
| 918 | u32 bitlen = val.num_bits; | ||
| 919 | u32 bitval = val.bit_value; | ||
| 920 | |||
| 921 | assert(bitlen >= 1); | ||
| 922 | |||
| 923 | u32 A = 0, B = 0, C = 0, D = 0; | ||
| 924 | // A is just the lsb replicated 9 times. | ||
| 925 | A = ReplicateBitTo9(bitval & 1); | ||
| 926 | |||
| 927 | switch (val.encoding) { | ||
| 928 | // Replicate bits | ||
| 929 | case IntegerEncoding::JustBits: | ||
| 930 | out[outIdx++] = FastReplicateTo8(bitval, bitlen); | ||
| 931 | break; | ||
| 932 | |||
| 933 | // Use algorithm in C.2.13 | ||
| 934 | case IntegerEncoding::Trit: { | ||
| 935 | |||
| 936 | D = val.trit_value; | ||
| 937 | |||
| 938 | switch (bitlen) { | ||
| 939 | case 1: { | ||
| 940 | C = 204; | ||
| 941 | } break; | ||
| 942 | |||
| 943 | case 2: { | ||
| 944 | C = 93; | ||
| 945 | // B = b000b0bb0 | ||
| 946 | u32 b = (bitval >> 1) & 1; | ||
| 947 | B = (b << 8) | (b << 4) | (b << 2) | (b << 1); | ||
| 948 | } break; | ||
| 949 | |||
| 950 | case 3: { | ||
| 951 | C = 44; | ||
| 952 | // B = cb000cbcb | ||
| 953 | u32 cb = (bitval >> 1) & 3; | ||
| 954 | B = (cb << 7) | (cb << 2) | cb; | ||
| 955 | } break; | ||
| 956 | |||
| 957 | case 4: { | ||
| 958 | C = 22; | ||
| 959 | // B = dcb000dcb | ||
| 960 | u32 dcb = (bitval >> 1) & 7; | ||
| 961 | B = (dcb << 6) | dcb; | ||
| 962 | } break; | ||
| 963 | |||
| 964 | case 5: { | ||
| 965 | C = 11; | ||
| 966 | // B = edcb000ed | ||
| 967 | u32 edcb = (bitval >> 1) & 0xF; | ||
| 968 | B = (edcb << 5) | (edcb >> 2); | ||
| 969 | } break; | ||
| 970 | |||
| 971 | case 6: { | ||
| 972 | C = 5; | ||
| 973 | // B = fedcb000f | ||
| 974 | u32 fedcb = (bitval >> 1) & 0x1F; | ||
| 975 | B = (fedcb << 4) | (fedcb >> 4); | ||
| 976 | } break; | ||
| 977 | |||
| 978 | default: | ||
| 979 | assert(false && "Unsupported trit encoding for color values!"); | ||
| 980 | break; | ||
| 981 | } // switch(bitlen) | ||
| 982 | } // case IntegerEncoding::Trit | ||
| 983 | break; | ||
| 984 | |||
| 985 | case IntegerEncoding::Qus32: { | ||
| 986 | |||
| 987 | D = val.qus32_value; | ||
| 988 | |||
| 989 | switch (bitlen) { | ||
| 990 | case 1: { | ||
| 991 | C = 113; | ||
| 992 | } break; | ||
| 993 | |||
| 994 | case 2: { | ||
| 995 | C = 54; | ||
| 996 | // B = b0000bb00 | ||
| 997 | u32 b = (bitval >> 1) & 1; | ||
| 998 | B = (b << 8) | (b << 3) | (b << 2); | ||
| 999 | } break; | ||
| 1000 | |||
| 1001 | case 3: { | ||
| 1002 | C = 26; | ||
| 1003 | // B = cb0000cbc | ||
| 1004 | u32 cb = (bitval >> 1) & 3; | ||
| 1005 | B = (cb << 7) | (cb << 1) | (cb >> 1); | ||
| 1006 | } break; | ||
| 1007 | |||
| 1008 | case 4: { | ||
| 1009 | C = 13; | ||
| 1010 | // B = dcb0000dc | ||
| 1011 | u32 dcb = (bitval >> 1) & 7; | ||
| 1012 | B = (dcb << 6) | (dcb >> 1); | ||
| 1013 | } break; | ||
| 1014 | |||
| 1015 | case 5: { | ||
| 1016 | C = 6; | ||
| 1017 | // B = edcb0000e | ||
| 1018 | u32 edcb = (bitval >> 1) & 0xF; | ||
| 1019 | B = (edcb << 5) | (edcb >> 3); | ||
| 1020 | } break; | ||
| 1021 | |||
| 1022 | default: | ||
| 1023 | assert(false && "Unsupported quint encoding for color values!"); | ||
| 1024 | break; | ||
| 1025 | } // switch(bitlen) | ||
| 1026 | } // case IntegerEncoding::Qus32 | ||
| 1027 | break; | ||
| 1028 | } // switch(val.encoding) | ||
| 1029 | |||
| 1030 | if (val.encoding != IntegerEncoding::JustBits) { | ||
| 1031 | u32 T = D * C + B; | ||
| 1032 | T ^= A; | ||
| 1033 | T = (A & 0x80) | (T >> 2); | ||
| 1034 | out[outIdx++] = T; | ||
| 1035 | } | ||
| 1036 | } | ||
| 1037 | |||
| 1038 | // Make sure that each of our values is in the proper range... | ||
| 1039 | for (u32 i = 0; i < nValues; i++) { | ||
| 1040 | assert(out[i] <= 255); | ||
| 1041 | } | ||
| 1042 | } | ||
| 1043 | |||
| 1044 | static u32 UnquantizeTexelWeight(const IntegerEncodedValue& val) { | ||
| 1045 | u32 bitval = val.bit_value; | ||
| 1046 | u32 bitlen = val.num_bits; | ||
| 1047 | |||
| 1048 | u32 A = ReplicateBitTo7(bitval & 1); | ||
| 1049 | u32 B = 0, C = 0, D = 0; | ||
| 1050 | |||
| 1051 | u32 result = 0; | ||
| 1052 | switch (val.encoding) { | ||
| 1053 | case IntegerEncoding::JustBits: | ||
| 1054 | result = FastReplicateTo6(bitval, bitlen); | ||
| 1055 | break; | ||
| 1056 | |||
| 1057 | case IntegerEncoding::Trit: { | ||
| 1058 | D = val.trit_value; | ||
| 1059 | assert(D < 3); | ||
| 1060 | |||
| 1061 | switch (bitlen) { | ||
| 1062 | case 0: { | ||
| 1063 | u32 results[3] = {0, 32, 63}; | ||
| 1064 | result = results[D]; | ||
| 1065 | } break; | ||
| 1066 | |||
| 1067 | case 1: { | ||
| 1068 | C = 50; | ||
| 1069 | } break; | ||
| 1070 | |||
| 1071 | case 2: { | ||
| 1072 | C = 23; | ||
| 1073 | u32 b = (bitval >> 1) & 1; | ||
| 1074 | B = (b << 6) | (b << 2) | b; | ||
| 1075 | } break; | ||
| 1076 | |||
| 1077 | case 3: { | ||
| 1078 | C = 11; | ||
| 1079 | u32 cb = (bitval >> 1) & 3; | ||
| 1080 | B = (cb << 5) | cb; | ||
| 1081 | } break; | ||
| 1082 | |||
| 1083 | default: | ||
| 1084 | assert(false && "Invalid trit encoding for texel weight"); | ||
| 1085 | break; | ||
| 1086 | } | ||
| 1087 | } break; | ||
| 1088 | |||
| 1089 | case IntegerEncoding::Qus32: { | ||
| 1090 | D = val.qus32_value; | ||
| 1091 | assert(D < 5); | ||
| 1092 | |||
| 1093 | switch (bitlen) { | ||
| 1094 | case 0: { | ||
| 1095 | u32 results[5] = {0, 16, 32, 47, 63}; | ||
| 1096 | result = results[D]; | ||
| 1097 | } break; | ||
| 1098 | |||
| 1099 | case 1: { | ||
| 1100 | C = 28; | ||
| 1101 | } break; | ||
| 1102 | |||
| 1103 | case 2: { | ||
| 1104 | C = 13; | ||
| 1105 | u32 b = (bitval >> 1) & 1; | ||
| 1106 | B = (b << 6) | (b << 1); | ||
| 1107 | } break; | ||
| 1108 | |||
| 1109 | default: | ||
| 1110 | assert(false && "Invalid quint encoding for texel weight"); | ||
| 1111 | break; | ||
| 1112 | } | ||
| 1113 | } break; | ||
| 1114 | } | ||
| 1115 | |||
| 1116 | if (val.encoding != IntegerEncoding::JustBits && bitlen > 0) { | ||
| 1117 | // Decode the value... | ||
| 1118 | result = D * C + B; | ||
| 1119 | result ^= A; | ||
| 1120 | result = (A & 0x20) | (result >> 2); | ||
| 1121 | } | ||
| 1122 | |||
| 1123 | assert(result < 64); | ||
| 1124 | |||
| 1125 | // Change from [0,63] to [0,64] | ||
| 1126 | if (result > 32) { | ||
| 1127 | result += 1; | ||
| 1128 | } | ||
| 1129 | |||
| 1130 | return result; | ||
| 1131 | } | ||
| 1132 | |||
| 1133 | static void UnquantizeTexelWeights(u32 out[2][144], const IntegerEncodedVector& weights, | ||
| 1134 | const TexelWeightParams& params, const u32 blockWidth, | ||
| 1135 | const u32 blockHeight) { | ||
| 1136 | u32 weightIdx = 0; | ||
| 1137 | u32 unquantized[2][144]; | ||
| 1138 | |||
| 1139 | for (auto itr = weights.begin(); itr != weights.end(); ++itr) { | ||
| 1140 | unquantized[0][weightIdx] = UnquantizeTexelWeight(*itr); | ||
| 1141 | |||
| 1142 | if (params.m_bDualPlane) { | ||
| 1143 | ++itr; | ||
| 1144 | unquantized[1][weightIdx] = UnquantizeTexelWeight(*itr); | ||
| 1145 | if (itr == weights.end()) { | ||
| 1146 | break; | ||
| 1147 | } | ||
| 1148 | } | ||
| 1149 | |||
| 1150 | if (++weightIdx >= (params.m_Width * params.m_Height)) | ||
| 1151 | break; | ||
| 1152 | } | ||
| 1153 | |||
| 1154 | // Do infill if necessary (Section C.2.18) ... | ||
| 1155 | u32 Ds = (1024 + (blockWidth / 2)) / (blockWidth - 1); | ||
| 1156 | u32 Dt = (1024 + (blockHeight / 2)) / (blockHeight - 1); | ||
| 1157 | |||
| 1158 | const u32 kPlaneScale = params.m_bDualPlane ? 2U : 1U; | ||
| 1159 | for (u32 plane = 0; plane < kPlaneScale; plane++) | ||
| 1160 | for (u32 t = 0; t < blockHeight; t++) | ||
| 1161 | for (u32 s = 0; s < blockWidth; s++) { | ||
| 1162 | u32 cs = Ds * s; | ||
| 1163 | u32 ct = Dt * t; | ||
| 1164 | |||
| 1165 | u32 gs = (cs * (params.m_Width - 1) + 32) >> 6; | ||
| 1166 | u32 gt = (ct * (params.m_Height - 1) + 32) >> 6; | ||
| 1167 | |||
| 1168 | u32 js = gs >> 4; | ||
| 1169 | u32 fs = gs & 0xF; | ||
| 1170 | |||
| 1171 | u32 jt = gt >> 4; | ||
| 1172 | u32 ft = gt & 0x0F; | ||
| 1173 | |||
| 1174 | u32 w11 = (fs * ft + 8) >> 4; | ||
| 1175 | u32 w10 = ft - w11; | ||
| 1176 | u32 w01 = fs - w11; | ||
| 1177 | u32 w00 = 16 - fs - ft + w11; | ||
| 1178 | |||
| 1179 | u32 v0 = js + jt * params.m_Width; | ||
| 1180 | |||
| 1181 | #define FIND_TEXEL(tidx, bidx) \ | ||
| 1182 | u32 p##bidx = 0; \ | ||
| 1183 | do { \ | ||
| 1184 | if ((tidx) < (params.m_Width * params.m_Height)) { \ | ||
| 1185 | p##bidx = unquantized[plane][(tidx)]; \ | ||
| 1186 | } \ | ||
| 1187 | } while (0) | ||
| 1188 | |||
| 1189 | FIND_TEXEL(v0, 00); | ||
| 1190 | FIND_TEXEL(v0 + 1, 01); | ||
| 1191 | FIND_TEXEL(v0 + params.m_Width, 10); | ||
| 1192 | FIND_TEXEL(v0 + params.m_Width + 1, 11); | ||
| 1193 | |||
| 1194 | #undef FIND_TEXEL | ||
| 1195 | |||
| 1196 | out[plane][t * blockWidth + s] = | ||
| 1197 | (p00 * w00 + p01 * w01 + p10 * w10 + p11 * w11 + 8) >> 4; | ||
| 1198 | } | ||
| 1199 | } | ||
| 1200 | |||
| 1201 | // Transfers a bit as described in C.2.14 | ||
| 1202 | static inline void BitTransferSigned(s32& a, s32& b) { | ||
| 1203 | b >>= 1; | ||
| 1204 | b |= a & 0x80; | ||
| 1205 | a >>= 1; | ||
| 1206 | a &= 0x3F; | ||
| 1207 | if (a & 0x20) | ||
| 1208 | a -= 0x40; | ||
| 1209 | } | ||
| 1210 | |||
| 1211 | // Adds more precision to the blue channel as described | ||
| 1212 | // in C.2.14 | ||
| 1213 | static inline Pixel BlueContract(s32 a, s32 r, s32 g, s32 b) { | ||
| 1214 | return Pixel(static_cast<s16>(a), static_cast<s16>((r + b) >> 1), | ||
| 1215 | static_cast<s16>((g + b) >> 1), static_cast<s16>(b)); | ||
| 1216 | } | ||
| 1217 | |||
| 1218 | // Partition selection functions as specified in | ||
| 1219 | // C.2.21 | ||
| 1220 | static inline u32 hash52(u32 p) { | ||
| 1221 | p ^= p >> 15; | ||
| 1222 | p -= p << 17; | ||
| 1223 | p += p << 7; | ||
| 1224 | p += p << 4; | ||
| 1225 | p ^= p >> 5; | ||
| 1226 | p += p << 16; | ||
| 1227 | p ^= p >> 7; | ||
| 1228 | p ^= p >> 3; | ||
| 1229 | p ^= p << 6; | ||
| 1230 | p ^= p >> 17; | ||
| 1231 | return p; | ||
| 1232 | } | ||
| 1233 | |||
| 1234 | static u32 SelectPartition(s32 seed, s32 x, s32 y, s32 z, s32 partitionCount, s32 smallBlock) { | ||
| 1235 | if (1 == partitionCount) | ||
| 1236 | return 0; | ||
| 1237 | |||
| 1238 | if (smallBlock) { | ||
| 1239 | x <<= 1; | ||
| 1240 | y <<= 1; | ||
| 1241 | z <<= 1; | ||
| 1242 | } | ||
| 1243 | |||
| 1244 | seed += (partitionCount - 1) * 1024; | ||
| 1245 | |||
| 1246 | u32 rnum = hash52(static_cast<u32>(seed)); | ||
| 1247 | u8 seed1 = static_cast<u8>(rnum & 0xF); | ||
| 1248 | u8 seed2 = static_cast<u8>((rnum >> 4) & 0xF); | ||
| 1249 | u8 seed3 = static_cast<u8>((rnum >> 8) & 0xF); | ||
| 1250 | u8 seed4 = static_cast<u8>((rnum >> 12) & 0xF); | ||
| 1251 | u8 seed5 = static_cast<u8>((rnum >> 16) & 0xF); | ||
| 1252 | u8 seed6 = static_cast<u8>((rnum >> 20) & 0xF); | ||
| 1253 | u8 seed7 = static_cast<u8>((rnum >> 24) & 0xF); | ||
| 1254 | u8 seed8 = static_cast<u8>((rnum >> 28) & 0xF); | ||
| 1255 | u8 seed9 = static_cast<u8>((rnum >> 18) & 0xF); | ||
| 1256 | u8 seed10 = static_cast<u8>((rnum >> 22) & 0xF); | ||
| 1257 | u8 seed11 = static_cast<u8>((rnum >> 26) & 0xF); | ||
| 1258 | u8 seed12 = static_cast<u8>(((rnum >> 30) | (rnum << 2)) & 0xF); | ||
| 1259 | |||
| 1260 | seed1 = static_cast<u8>(seed1 * seed1); | ||
| 1261 | seed2 = static_cast<u8>(seed2 * seed2); | ||
| 1262 | seed3 = static_cast<u8>(seed3 * seed3); | ||
| 1263 | seed4 = static_cast<u8>(seed4 * seed4); | ||
| 1264 | seed5 = static_cast<u8>(seed5 * seed5); | ||
| 1265 | seed6 = static_cast<u8>(seed6 * seed6); | ||
| 1266 | seed7 = static_cast<u8>(seed7 * seed7); | ||
| 1267 | seed8 = static_cast<u8>(seed8 * seed8); | ||
| 1268 | seed9 = static_cast<u8>(seed9 * seed9); | ||
| 1269 | seed10 = static_cast<u8>(seed10 * seed10); | ||
| 1270 | seed11 = static_cast<u8>(seed11 * seed11); | ||
| 1271 | seed12 = static_cast<u8>(seed12 * seed12); | ||
| 1272 | |||
| 1273 | s32 sh1, sh2, sh3; | ||
| 1274 | if (seed & 1) { | ||
| 1275 | sh1 = (seed & 2) ? 4 : 5; | ||
| 1276 | sh2 = (partitionCount == 3) ? 6 : 5; | ||
| 1277 | } else { | ||
| 1278 | sh1 = (partitionCount == 3) ? 6 : 5; | ||
| 1279 | sh2 = (seed & 2) ? 4 : 5; | ||
| 1280 | } | ||
| 1281 | sh3 = (seed & 0x10) ? sh1 : sh2; | ||
| 1282 | |||
| 1283 | seed1 = static_cast<u8>(seed1 >> sh1); | ||
| 1284 | seed2 = static_cast<u8>(seed2 >> sh2); | ||
| 1285 | seed3 = static_cast<u8>(seed3 >> sh1); | ||
| 1286 | seed4 = static_cast<u8>(seed4 >> sh2); | ||
| 1287 | seed5 = static_cast<u8>(seed5 >> sh1); | ||
| 1288 | seed6 = static_cast<u8>(seed6 >> sh2); | ||
| 1289 | seed7 = static_cast<u8>(seed7 >> sh1); | ||
| 1290 | seed8 = static_cast<u8>(seed8 >> sh2); | ||
| 1291 | seed9 = static_cast<u8>(seed9 >> sh3); | ||
| 1292 | seed10 = static_cast<u8>(seed10 >> sh3); | ||
| 1293 | seed11 = static_cast<u8>(seed11 >> sh3); | ||
| 1294 | seed12 = static_cast<u8>(seed12 >> sh3); | ||
| 1295 | |||
| 1296 | s32 a = seed1 * x + seed2 * y + seed11 * z + (rnum >> 14); | ||
| 1297 | s32 b = seed3 * x + seed4 * y + seed12 * z + (rnum >> 10); | ||
| 1298 | s32 c = seed5 * x + seed6 * y + seed9 * z + (rnum >> 6); | ||
| 1299 | s32 d = seed7 * x + seed8 * y + seed10 * z + (rnum >> 2); | ||
| 1300 | |||
| 1301 | a &= 0x3F; | ||
| 1302 | b &= 0x3F; | ||
| 1303 | c &= 0x3F; | ||
| 1304 | d &= 0x3F; | ||
| 1305 | |||
| 1306 | if (partitionCount < 4) | ||
| 1307 | d = 0; | ||
| 1308 | if (partitionCount < 3) | ||
| 1309 | c = 0; | ||
| 1310 | |||
| 1311 | if (a >= b && a >= c && a >= d) | ||
| 1312 | return 0; | ||
| 1313 | else if (b >= c && b >= d) | ||
| 1314 | return 1; | ||
| 1315 | else if (c >= d) | ||
| 1316 | return 2; | ||
| 1317 | return 3; | ||
| 1318 | } | ||
| 1319 | |||
| 1320 | static inline u32 Select2DPartition(s32 seed, s32 x, s32 y, s32 partitionCount, s32 smallBlock) { | ||
| 1321 | return SelectPartition(seed, x, y, 0, partitionCount, smallBlock); | ||
| 1322 | } | ||
| 1323 | |||
| 1324 | // Section C.2.14 | ||
| 1325 | static void ComputeEndpos32s(Pixel& ep1, Pixel& ep2, const u32*& colorValues, | ||
| 1326 | u32 colorEndpos32Mode) { | ||
| 1327 | #define READ_UINT_VALUES(N) \ | ||
| 1328 | u32 v[N]; \ | ||
| 1329 | for (u32 i = 0; i < N; i++) { \ | ||
| 1330 | v[i] = *(colorValues++); \ | ||
| 1331 | } | ||
| 1332 | |||
| 1333 | #define READ_INT_VALUES(N) \ | ||
| 1334 | s32 v[N]; \ | ||
| 1335 | for (u32 i = 0; i < N; i++) { \ | ||
| 1336 | v[i] = static_cast<s32>(*(colorValues++)); \ | ||
| 1337 | } | ||
| 1338 | |||
| 1339 | switch (colorEndpos32Mode) { | ||
| 1340 | case 0: { | ||
| 1341 | READ_UINT_VALUES(2) | ||
| 1342 | ep1 = Pixel(0xFF, v[0], v[0], v[0]); | ||
| 1343 | ep2 = Pixel(0xFF, v[1], v[1], v[1]); | ||
| 1344 | } break; | ||
| 1345 | |||
| 1346 | case 1: { | ||
| 1347 | READ_UINT_VALUES(2) | ||
| 1348 | u32 L0 = (v[0] >> 2) | (v[1] & 0xC0); | ||
| 1349 | u32 L1 = std::max(L0 + (v[1] & 0x3F), 0xFFU); | ||
| 1350 | ep1 = Pixel(0xFF, L0, L0, L0); | ||
| 1351 | ep2 = Pixel(0xFF, L1, L1, L1); | ||
| 1352 | } break; | ||
| 1353 | |||
| 1354 | case 4: { | ||
| 1355 | READ_UINT_VALUES(4) | ||
| 1356 | ep1 = Pixel(v[2], v[0], v[0], v[0]); | ||
| 1357 | ep2 = Pixel(v[3], v[1], v[1], v[1]); | ||
| 1358 | } break; | ||
| 1359 | |||
| 1360 | case 5: { | ||
| 1361 | READ_INT_VALUES(4) | ||
| 1362 | BitTransferSigned(v[1], v[0]); | ||
| 1363 | BitTransferSigned(v[3], v[2]); | ||
| 1364 | ep1 = Pixel(v[2], v[0], v[0], v[0]); | ||
| 1365 | ep2 = Pixel(v[2] + v[3], v[0] + v[1], v[0] + v[1], v[0] + v[1]); | ||
| 1366 | ep1.ClampByte(); | ||
| 1367 | ep2.ClampByte(); | ||
| 1368 | } break; | ||
| 1369 | |||
| 1370 | case 6: { | ||
| 1371 | READ_UINT_VALUES(4) | ||
| 1372 | ep1 = Pixel(0xFF, v[0] * v[3] >> 8, v[1] * v[3] >> 8, v[2] * v[3] >> 8); | ||
| 1373 | ep2 = Pixel(0xFF, v[0], v[1], v[2]); | ||
| 1374 | } break; | ||
| 1375 | |||
| 1376 | case 8: { | ||
| 1377 | READ_UINT_VALUES(6) | ||
| 1378 | if (v[1] + v[3] + v[5] >= v[0] + v[2] + v[4]) { | ||
| 1379 | ep1 = Pixel(0xFF, v[0], v[2], v[4]); | ||
| 1380 | ep2 = Pixel(0xFF, v[1], v[3], v[5]); | ||
| 1381 | } else { | ||
| 1382 | ep1 = BlueContract(0xFF, v[1], v[3], v[5]); | ||
| 1383 | ep2 = BlueContract(0xFF, v[0], v[2], v[4]); | ||
| 1384 | } | ||
| 1385 | } break; | ||
| 1386 | |||
| 1387 | case 9: { | ||
| 1388 | READ_INT_VALUES(6) | ||
| 1389 | BitTransferSigned(v[1], v[0]); | ||
| 1390 | BitTransferSigned(v[3], v[2]); | ||
| 1391 | BitTransferSigned(v[5], v[4]); | ||
| 1392 | if (v[1] + v[3] + v[5] >= 0) { | ||
| 1393 | ep1 = Pixel(0xFF, v[0], v[2], v[4]); | ||
| 1394 | ep2 = Pixel(0xFF, v[0] + v[1], v[2] + v[3], v[4] + v[5]); | ||
| 1395 | } else { | ||
| 1396 | ep1 = BlueContract(0xFF, v[0] + v[1], v[2] + v[3], v[4] + v[5]); | ||
| 1397 | ep2 = BlueContract(0xFF, v[0], v[2], v[4]); | ||
| 1398 | } | ||
| 1399 | ep1.ClampByte(); | ||
| 1400 | ep2.ClampByte(); | ||
| 1401 | } break; | ||
| 1402 | |||
| 1403 | case 10: { | ||
| 1404 | READ_UINT_VALUES(6) | ||
| 1405 | ep1 = Pixel(v[4], v[0] * v[3] >> 8, v[1] * v[3] >> 8, v[2] * v[3] >> 8); | ||
| 1406 | ep2 = Pixel(v[5], v[0], v[1], v[2]); | ||
| 1407 | } break; | ||
| 1408 | |||
| 1409 | case 12: { | ||
| 1410 | READ_UINT_VALUES(8) | ||
| 1411 | if (v[1] + v[3] + v[5] >= v[0] + v[2] + v[4]) { | ||
| 1412 | ep1 = Pixel(v[6], v[0], v[2], v[4]); | ||
| 1413 | ep2 = Pixel(v[7], v[1], v[3], v[5]); | ||
| 1414 | } else { | ||
| 1415 | ep1 = BlueContract(v[7], v[1], v[3], v[5]); | ||
| 1416 | ep2 = BlueContract(v[6], v[0], v[2], v[4]); | ||
| 1417 | } | ||
| 1418 | } break; | ||
| 1419 | |||
| 1420 | case 13: { | ||
| 1421 | READ_INT_VALUES(8) | ||
| 1422 | BitTransferSigned(v[1], v[0]); | ||
| 1423 | BitTransferSigned(v[3], v[2]); | ||
| 1424 | BitTransferSigned(v[5], v[4]); | ||
| 1425 | BitTransferSigned(v[7], v[6]); | ||
| 1426 | if (v[1] + v[3] + v[5] >= 0) { | ||
| 1427 | ep1 = Pixel(v[6], v[0], v[2], v[4]); | ||
| 1428 | ep2 = Pixel(v[7] + v[6], v[0] + v[1], v[2] + v[3], v[4] + v[5]); | ||
| 1429 | } else { | ||
| 1430 | ep1 = BlueContract(v[6] + v[7], v[0] + v[1], v[2] + v[3], v[4] + v[5]); | ||
| 1431 | ep2 = BlueContract(v[6], v[0], v[2], v[4]); | ||
| 1432 | } | ||
| 1433 | ep1.ClampByte(); | ||
| 1434 | ep2.ClampByte(); | ||
| 1435 | } break; | ||
| 1436 | |||
| 1437 | default: | ||
| 1438 | assert(false && "Unsupported color endpoint mode (is it HDR?)"); | ||
| 1439 | break; | ||
| 1440 | } | ||
| 1441 | |||
| 1442 | #undef READ_UINT_VALUES | ||
| 1443 | #undef READ_INT_VALUES | ||
| 1444 | } | ||
| 1445 | |||
| 1446 | static void DecompressBlock(std::span<const u8, 16> inBuf, const u32 blockWidth, | ||
| 1447 | const u32 blockHeight, std::span<u32, 12 * 12> outBuf) { | ||
| 1448 | InputBitStream strm(inBuf); | ||
| 1449 | TexelWeightParams weightParams = DecodeBlockInfo(strm); | ||
| 1450 | |||
| 1451 | // Was there an error? | ||
| 1452 | if (weightParams.m_bError) { | ||
| 1453 | assert(false && "Invalid block mode"); | ||
| 1454 | FillError(outBuf, blockWidth, blockHeight); | ||
| 1455 | return; | ||
| 1456 | } | ||
| 1457 | |||
| 1458 | if (weightParams.m_bVoidExtentLDR) { | ||
| 1459 | FillVoidExtentLDR(strm, outBuf, blockWidth, blockHeight); | ||
| 1460 | return; | ||
| 1461 | } | ||
| 1462 | |||
| 1463 | if (weightParams.m_bVoidExtentHDR) { | ||
| 1464 | assert(false && "HDR void extent blocks are unsupported!"); | ||
| 1465 | FillError(outBuf, blockWidth, blockHeight); | ||
| 1466 | return; | ||
| 1467 | } | ||
| 1468 | |||
| 1469 | if (weightParams.m_Width > blockWidth) { | ||
| 1470 | assert(false && "Texel weight grid width should be smaller than block width"); | ||
| 1471 | FillError(outBuf, blockWidth, blockHeight); | ||
| 1472 | return; | ||
| 1473 | } | ||
| 1474 | |||
| 1475 | if (weightParams.m_Height > blockHeight) { | ||
| 1476 | assert(false && "Texel weight grid height should be smaller than block height"); | ||
| 1477 | FillError(outBuf, blockWidth, blockHeight); | ||
| 1478 | return; | ||
| 1479 | } | ||
| 1480 | |||
| 1481 | // Read num partitions | ||
| 1482 | u32 nPartitions = strm.ReadBits<2>() + 1; | ||
| 1483 | assert(nPartitions <= 4); | ||
| 1484 | |||
| 1485 | if (nPartitions == 4 && weightParams.m_bDualPlane) { | ||
| 1486 | assert(false && "Dual plane mode is incompatible with four partition blocks"); | ||
| 1487 | FillError(outBuf, blockWidth, blockHeight); | ||
| 1488 | return; | ||
| 1489 | } | ||
| 1490 | |||
| 1491 | // Based on the number of partitions, read the color endpos32 mode for | ||
| 1492 | // each partition. | ||
| 1493 | |||
| 1494 | // Determine partitions, partition index, and color endpos32 modes | ||
| 1495 | s32 planeIdx = -1; | ||
| 1496 | u32 partitionIndex; | ||
| 1497 | u32 colorEndpos32Mode[4] = {0, 0, 0, 0}; | ||
| 1498 | |||
| 1499 | // Define color data. | ||
| 1500 | u8 colorEndpos32Data[16]; | ||
| 1501 | memset(colorEndpos32Data, 0, sizeof(colorEndpos32Data)); | ||
| 1502 | OutputBitStream colorEndpos32Stream(colorEndpos32Data, 16 * 8, 0); | ||
| 1503 | |||
| 1504 | // Read extra config data... | ||
| 1505 | u32 baseCEM = 0; | ||
| 1506 | if (nPartitions == 1) { | ||
| 1507 | colorEndpos32Mode[0] = strm.ReadBits<4>(); | ||
| 1508 | partitionIndex = 0; | ||
| 1509 | } else { | ||
| 1510 | partitionIndex = strm.ReadBits<10>(); | ||
| 1511 | baseCEM = strm.ReadBits<6>(); | ||
| 1512 | } | ||
| 1513 | u32 baseMode = (baseCEM & 3); | ||
| 1514 | |||
| 1515 | // Remaining bits are color endpos32 data... | ||
| 1516 | u32 nWeightBits = weightParams.GetPackedBitSize(); | ||
| 1517 | s32 remainingBits = 128 - nWeightBits - static_cast<s32>(strm.GetBitsRead()); | ||
| 1518 | |||
| 1519 | // Consider extra bits prior to texel data... | ||
| 1520 | u32 extraCEMbits = 0; | ||
| 1521 | if (baseMode) { | ||
| 1522 | switch (nPartitions) { | ||
| 1523 | case 2: | ||
| 1524 | extraCEMbits += 2; | ||
| 1525 | break; | ||
| 1526 | case 3: | ||
| 1527 | extraCEMbits += 5; | ||
| 1528 | break; | ||
| 1529 | case 4: | ||
| 1530 | extraCEMbits += 8; | ||
| 1531 | break; | ||
| 1532 | default: | ||
| 1533 | assert(false); | ||
| 1534 | break; | ||
| 1535 | } | ||
| 1536 | } | ||
| 1537 | remainingBits -= extraCEMbits; | ||
| 1538 | |||
| 1539 | // Do we have a dual plane situation? | ||
| 1540 | u32 planeSelectorBits = 0; | ||
| 1541 | if (weightParams.m_bDualPlane) { | ||
| 1542 | planeSelectorBits = 2; | ||
| 1543 | } | ||
| 1544 | remainingBits -= planeSelectorBits; | ||
| 1545 | |||
| 1546 | // Read color data... | ||
| 1547 | u32 colorDataBits = remainingBits; | ||
| 1548 | while (remainingBits > 0) { | ||
| 1549 | u32 nb = std::min(remainingBits, 8); | ||
| 1550 | u32 b = strm.ReadBits(nb); | ||
| 1551 | colorEndpos32Stream.WriteBits(b, nb); | ||
| 1552 | remainingBits -= 8; | ||
| 1553 | } | ||
| 1554 | |||
| 1555 | // Read the plane selection bits | ||
| 1556 | planeIdx = strm.ReadBits(planeSelectorBits); | ||
| 1557 | |||
| 1558 | // Read the rest of the CEM | ||
| 1559 | if (baseMode) { | ||
| 1560 | u32 extraCEM = strm.ReadBits(extraCEMbits); | ||
| 1561 | u32 CEM = (extraCEM << 6) | baseCEM; | ||
| 1562 | CEM >>= 2; | ||
| 1563 | |||
| 1564 | bool C[4] = {0}; | ||
| 1565 | for (u32 i = 0; i < nPartitions; i++) { | ||
| 1566 | C[i] = CEM & 1; | ||
| 1567 | CEM >>= 1; | ||
| 1568 | } | ||
| 1569 | |||
| 1570 | u8 M[4] = {0}; | ||
| 1571 | for (u32 i = 0; i < nPartitions; i++) { | ||
| 1572 | M[i] = CEM & 3; | ||
| 1573 | CEM >>= 2; | ||
| 1574 | assert(M[i] <= 3); | ||
| 1575 | } | ||
| 1576 | |||
| 1577 | for (u32 i = 0; i < nPartitions; i++) { | ||
| 1578 | colorEndpos32Mode[i] = baseMode; | ||
| 1579 | if (!(C[i])) | ||
| 1580 | colorEndpos32Mode[i] -= 1; | ||
| 1581 | colorEndpos32Mode[i] <<= 2; | ||
| 1582 | colorEndpos32Mode[i] |= M[i]; | ||
| 1583 | } | ||
| 1584 | } else if (nPartitions > 1) { | ||
| 1585 | u32 CEM = baseCEM >> 2; | ||
| 1586 | for (u32 i = 0; i < nPartitions; i++) { | ||
| 1587 | colorEndpos32Mode[i] = CEM; | ||
| 1588 | } | ||
| 1589 | } | ||
| 1590 | |||
| 1591 | // Make sure everything up till here is sane. | ||
| 1592 | for (u32 i = 0; i < nPartitions; i++) { | ||
| 1593 | assert(colorEndpos32Mode[i] < 16); | ||
| 1594 | } | ||
| 1595 | assert(strm.GetBitsRead() + weightParams.GetPackedBitSize() == 128); | ||
| 1596 | |||
| 1597 | // Decode both color data and texel weight data | ||
| 1598 | u32 colorValues[32]; // Four values, two endpos32s, four maximum paritions | ||
| 1599 | DecodeColorValues(colorValues, colorEndpos32Data, colorEndpos32Mode, nPartitions, | ||
| 1600 | colorDataBits); | ||
| 1601 | |||
| 1602 | Pixel endpos32s[4][2]; | ||
| 1603 | const u32* colorValuesPtr = colorValues; | ||
| 1604 | for (u32 i = 0; i < nPartitions; i++) { | ||
| 1605 | ComputeEndpos32s(endpos32s[i][0], endpos32s[i][1], colorValuesPtr, colorEndpos32Mode[i]); | ||
| 1606 | } | ||
| 1607 | |||
| 1608 | // Read the texel weight data.. | ||
| 1609 | std::array<u8, 16> texelWeightData; | ||
| 1610 | std::ranges::copy(inBuf, texelWeightData.begin()); | ||
| 1611 | |||
| 1612 | // Reverse everything | ||
| 1613 | for (u32 i = 0; i < 8; i++) { | ||
| 1614 | // Taken from http://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64Bits | ||
| 1615 | #define REVERSE_BYTE(b) (((b)*0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32 | ||
| 1616 | u8 a = static_cast<u8>(REVERSE_BYTE(texelWeightData[i])); | ||
| 1617 | u8 b = static_cast<u8>(REVERSE_BYTE(texelWeightData[15 - i])); | ||
| 1618 | #undef REVERSE_BYTE | ||
| 1619 | |||
| 1620 | texelWeightData[i] = b; | ||
| 1621 | texelWeightData[15 - i] = a; | ||
| 1622 | } | ||
| 1623 | |||
| 1624 | // Make sure that higher non-texel bits are set to zero | ||
| 1625 | const u32 clearByteStart = (weightParams.GetPackedBitSize() >> 3) + 1; | ||
| 1626 | if (clearByteStart > 0 && clearByteStart <= texelWeightData.size()) { | ||
| 1627 | texelWeightData[clearByteStart - 1] &= | ||
| 1628 | static_cast<u8>((1 << (weightParams.GetPackedBitSize() % 8)) - 1); | ||
| 1629 | std::memset(texelWeightData.data() + clearByteStart, 0, | ||
| 1630 | std::min(16U - clearByteStart, 16U)); | ||
| 1631 | } | ||
| 1632 | |||
| 1633 | IntegerEncodedVector texelWeightValues; | ||
| 1634 | |||
| 1635 | InputBitStream weightStream(texelWeightData); | ||
| 1636 | |||
| 1637 | DecodeIntegerSequence(texelWeightValues, weightStream, weightParams.m_MaxWeight, | ||
| 1638 | weightParams.GetNumWeightValues()); | ||
| 1639 | |||
| 1640 | // Blocks can be at most 12x12, so we can have as many as 144 weights | ||
| 1641 | u32 weights[2][144]; | ||
| 1642 | UnquantizeTexelWeights(weights, texelWeightValues, weightParams, blockWidth, blockHeight); | ||
| 1643 | |||
| 1644 | // Now that we have endpos32s and weights, we can s32erpolate and generate | ||
| 1645 | // the proper decoding... | ||
| 1646 | for (u32 j = 0; j < blockHeight; j++) | ||
| 1647 | for (u32 i = 0; i < blockWidth; i++) { | ||
| 1648 | u32 partition = Select2DPartition(partitionIndex, i, j, nPartitions, | ||
| 1649 | (blockHeight * blockWidth) < 32); | ||
| 1650 | assert(partition < nPartitions); | ||
| 1651 | |||
| 1652 | Pixel p; | ||
| 1653 | for (u32 c = 0; c < 4; c++) { | ||
| 1654 | u32 C0 = endpos32s[partition][0].Component(c); | ||
| 1655 | C0 = ReplicateByteTo16(C0); | ||
| 1656 | u32 C1 = endpos32s[partition][1].Component(c); | ||
| 1657 | C1 = ReplicateByteTo16(C1); | ||
| 1658 | |||
| 1659 | u32 plane = 0; | ||
| 1660 | if (weightParams.m_bDualPlane && (((planeIdx + 1) & 3) == c)) { | ||
| 1661 | plane = 1; | ||
| 1662 | } | ||
| 1663 | |||
| 1664 | u32 weight = weights[plane][j * blockWidth + i]; | ||
| 1665 | u32 C = (C0 * (64 - weight) + C1 * weight + 32) / 64; | ||
| 1666 | if (C == 65535) { | ||
| 1667 | p.Component(c) = 255; | ||
| 1668 | } else { | ||
| 1669 | double Cf = static_cast<double>(C); | ||
| 1670 | p.Component(c) = static_cast<u16>(255.0 * (Cf / 65536.0) + 0.5); | ||
| 1671 | } | ||
| 1672 | } | ||
| 1673 | |||
| 1674 | outBuf[j * blockWidth + i] = p.Pack(); | ||
| 1675 | } | ||
| 1676 | } | ||
| 1677 | |||
| 1678 | } // namespace ASTCC | ||
| 1679 | |||
| 1680 | namespace Tegra::Texture::ASTC { | ||
| 1681 | |||
| 1682 | void Decompress(std::span<const uint8_t> data, uint32_t width, uint32_t height, uint32_t depth, | ||
| 1683 | uint32_t block_width, uint32_t block_height, std::span<uint8_t> output) { | ||
| 1684 | u32 block_index = 0; | ||
| 1685 | std::size_t depth_offset = 0; | ||
| 1686 | for (u32 z = 0; z < depth; z++) { | ||
| 1687 | for (u32 y = 0; y < height; y += block_height) { | ||
| 1688 | for (u32 x = 0; x < width; x += block_width) { | ||
| 1689 | const std::span<const u8, 16> blockPtr{data.subspan(block_index * 16, 16)}; | ||
| 1690 | |||
| 1691 | // Blocks can be at most 12x12 | ||
| 1692 | std::array<u32, 12 * 12> uncompData; | ||
| 1693 | ASTCC::DecompressBlock(blockPtr, block_width, block_height, uncompData); | ||
| 1694 | |||
| 1695 | u32 decompWidth = std::min(block_width, width - x); | ||
| 1696 | u32 decompHeight = std::min(block_height, height - y); | ||
| 1697 | |||
| 1698 | const std::span<u8> outRow = output.subspan(depth_offset + (y * width + x) * 4); | ||
| 1699 | for (u32 jj = 0; jj < decompHeight; jj++) { | ||
| 1700 | std::memcpy(outRow.data() + jj * width * 4, | ||
| 1701 | uncompData.data() + jj * block_width, decompWidth * 4); | ||
| 1702 | } | ||
| 1703 | ++block_index; | ||
| 1704 | } | ||
| 1705 | } | ||
| 1706 | depth_offset += height * width * 4; | ||
| 1707 | } | ||
| 1708 | } | ||
| 1709 | |||
| 1710 | } // namespace Tegra::Texture::ASTC | ||
diff --git a/src/video_core/textures/astc.h b/src/video_core/textures/astc.h index bc8bddaec..c1c73fda5 100644 --- a/src/video_core/textures/astc.h +++ b/src/video_core/textures/astc.h | |||
| @@ -4,20 +4,12 @@ | |||
| 4 | 4 | ||
| 5 | #pragma once | 5 | #pragma once |
| 6 | 6 | ||
| 7 | #include <cstdint> | 7 | #include <bit> |
| 8 | #include "common/common_types.h" | ||
| 8 | 9 | ||
| 9 | namespace Tegra::Texture::ASTC { | 10 | namespace Tegra::Texture::ASTC { |
| 10 | 11 | ||
| 11 | /// Count the number of bits set in a number. | 12 | enum class IntegerEncoding { JustBits, Quint, Trit }; |
| 12 | constexpr u32 Popcnt(u32 n) { | ||
| 13 | u32 c = 0; | ||
| 14 | for (; n; c++) { | ||
| 15 | n &= n - 1; | ||
| 16 | } | ||
| 17 | return c; | ||
| 18 | } | ||
| 19 | |||
| 20 | enum class IntegerEncoding { JustBits, Qus32, Trit }; | ||
| 21 | 13 | ||
| 22 | struct IntegerEncodedValue { | 14 | struct IntegerEncodedValue { |
| 23 | constexpr IntegerEncodedValue() = default; | 15 | constexpr IntegerEncodedValue() = default; |
| @@ -29,55 +21,55 @@ struct IntegerEncodedValue { | |||
| 29 | return encoding == other.encoding && num_bits == other.num_bits; | 21 | return encoding == other.encoding && num_bits == other.num_bits; |
| 30 | } | 22 | } |
| 31 | 23 | ||
| 32 | // Returns the number of bits required to encode nVals values. | 24 | // Returns the number of bits required to encode num_vals values. |
| 33 | u32 GetBitLength(u32 nVals) const { | 25 | u32 GetBitLength(u32 num_vals) const { |
| 34 | u32 totalBits = num_bits * nVals; | 26 | u32 total_bits = num_bits * num_vals; |
| 35 | if (encoding == IntegerEncoding::Trit) { | 27 | if (encoding == IntegerEncoding::Trit) { |
| 36 | totalBits += (nVals * 8 + 4) / 5; | 28 | total_bits += (num_vals * 8 + 4) / 5; |
| 37 | } else if (encoding == IntegerEncoding::Qus32) { | 29 | } else if (encoding == IntegerEncoding::Quint) { |
| 38 | totalBits += (nVals * 7 + 2) / 3; | 30 | total_bits += (num_vals * 7 + 2) / 3; |
| 39 | } | 31 | } |
| 40 | return totalBits; | 32 | return total_bits; |
| 41 | } | 33 | } |
| 42 | 34 | ||
| 43 | IntegerEncoding encoding{}; | 35 | IntegerEncoding encoding{}; |
| 44 | u32 num_bits = 0; | 36 | u32 num_bits = 0; |
| 45 | u32 bit_value = 0; | 37 | u32 bit_value = 0; |
| 46 | union { | 38 | union { |
| 47 | u32 qus32_value = 0; | 39 | u32 quint_value = 0; |
| 48 | u32 trit_value; | 40 | u32 trit_value; |
| 49 | }; | 41 | }; |
| 50 | }; | 42 | }; |
| 51 | 43 | ||
| 52 | // Returns a new instance of this struct that corresponds to the encoding that | 44 | // Returns a new instance of this struct that corresponds to the encoding that |
| 53 | // can take no more than maxval values | 45 | // can take no more than mav_value values |
| 54 | static constexpr IntegerEncodedValue CreateEncoding(u32 maxVal) { | 46 | constexpr IntegerEncodedValue CreateEncoding(u32 mav_value) { |
| 55 | while (maxVal > 0) { | 47 | while (mav_value > 0) { |
| 56 | u32 check = maxVal + 1; | 48 | u32 check = mav_value + 1; |
| 57 | 49 | ||
| 58 | // Is maxVal a power of two? | 50 | // Is mav_value a power of two? |
| 59 | if (!(check & (check - 1))) { | 51 | if (!(check & (check - 1))) { |
| 60 | return IntegerEncodedValue(IntegerEncoding::JustBits, Popcnt(maxVal)); | 52 | return IntegerEncodedValue(IntegerEncoding::JustBits, std::popcount(mav_value)); |
| 61 | } | 53 | } |
| 62 | 54 | ||
| 63 | // Is maxVal of the type 3*2^n - 1? | 55 | // Is mav_value of the type 3*2^n - 1? |
| 64 | if ((check % 3 == 0) && !((check / 3) & ((check / 3) - 1))) { | 56 | if ((check % 3 == 0) && !((check / 3) & ((check / 3) - 1))) { |
| 65 | return IntegerEncodedValue(IntegerEncoding::Trit, Popcnt(check / 3 - 1)); | 57 | return IntegerEncodedValue(IntegerEncoding::Trit, std::popcount(check / 3 - 1)); |
| 66 | } | 58 | } |
| 67 | 59 | ||
| 68 | // Is maxVal of the type 5*2^n - 1? | 60 | // Is mav_value of the type 5*2^n - 1? |
| 69 | if ((check % 5 == 0) && !((check / 5) & ((check / 5) - 1))) { | 61 | if ((check % 5 == 0) && !((check / 5) & ((check / 5) - 1))) { |
| 70 | return IntegerEncodedValue(IntegerEncoding::Qus32, Popcnt(check / 5 - 1)); | 62 | return IntegerEncodedValue(IntegerEncoding::Quint, std::popcount(check / 5 - 1)); |
| 71 | } | 63 | } |
| 72 | 64 | ||
| 73 | // Apparently it can't be represented with a bounded integer sequence... | 65 | // Apparently it can't be represented with a bounded integer sequence... |
| 74 | // just iterate. | 66 | // just iterate. |
| 75 | maxVal--; | 67 | mav_value--; |
| 76 | } | 68 | } |
| 77 | return IntegerEncodedValue(IntegerEncoding::JustBits, 0); | 69 | return IntegerEncodedValue(IntegerEncoding::JustBits, 0); |
| 78 | } | 70 | } |
| 79 | 71 | ||
| 80 | static constexpr std::array<IntegerEncodedValue, 256> MakeEncodedValues() { | 72 | constexpr std::array<IntegerEncodedValue, 256> MakeEncodedValues() { |
| 81 | std::array<IntegerEncodedValue, 256> encodings{}; | 73 | std::array<IntegerEncodedValue, 256> encodings{}; |
| 82 | for (std::size_t i = 0; i < encodings.size(); ++i) { | 74 | for (std::size_t i = 0; i < encodings.size(); ++i) { |
| 83 | encodings[i] = CreateEncoding(static_cast<u32>(i)); | 75 | encodings[i] = CreateEncoding(static_cast<u32>(i)); |
| @@ -85,41 +77,38 @@ static constexpr std::array<IntegerEncodedValue, 256> MakeEncodedValues() { | |||
| 85 | return encodings; | 77 | return encodings; |
| 86 | } | 78 | } |
| 87 | 79 | ||
| 88 | static constexpr std::array<IntegerEncodedValue, 256> EncodingsValues = MakeEncodedValues(); | 80 | constexpr std::array<IntegerEncodedValue, 256> EncodingsValues = MakeEncodedValues(); |
| 89 | 81 | ||
| 90 | // Replicates low numBits such that [(toBit - 1):(toBit - 1 - fromBit)] | 82 | // Replicates low num_bits such that [(to_bit - 1):(to_bit - 1 - from_bit)] |
| 91 | // is the same as [(numBits - 1):0] and repeats all the way down. | 83 | // is the same as [(num_bits - 1):0] and repeats all the way down. |
| 92 | template <typename IntType> | 84 | template <typename IntType> |
| 93 | static constexpr IntType Replicate(IntType val, u32 numBits, u32 toBit) { | 85 | constexpr IntType Replicate(IntType val, u32 num_bits, u32 to_bit) { |
| 94 | if (numBits == 0) { | 86 | if (num_bits == 0 || to_bit == 0) { |
| 95 | return 0; | ||
| 96 | } | ||
| 97 | if (toBit == 0) { | ||
| 98 | return 0; | 87 | return 0; |
| 99 | } | 88 | } |
| 100 | const IntType v = val & static_cast<IntType>((1 << numBits) - 1); | 89 | const IntType v = val & static_cast<IntType>((1 << num_bits) - 1); |
| 101 | IntType res = v; | 90 | IntType res = v; |
| 102 | u32 reslen = numBits; | 91 | u32 reslen = num_bits; |
| 103 | while (reslen < toBit) { | 92 | while (reslen < to_bit) { |
| 104 | u32 comp = 0; | 93 | u32 comp = 0; |
| 105 | if (numBits > toBit - reslen) { | 94 | if (num_bits > to_bit - reslen) { |
| 106 | u32 newshift = toBit - reslen; | 95 | u32 newshift = to_bit - reslen; |
| 107 | comp = numBits - newshift; | 96 | comp = num_bits - newshift; |
| 108 | numBits = newshift; | 97 | num_bits = newshift; |
| 109 | } | 98 | } |
| 110 | res = static_cast<IntType>(res << numBits); | 99 | res = static_cast<IntType>(res << num_bits); |
| 111 | res = static_cast<IntType>(res | (v >> comp)); | 100 | res = static_cast<IntType>(res | (v >> comp)); |
| 112 | reslen += numBits; | 101 | reslen += num_bits; |
| 113 | } | 102 | } |
| 114 | return res; | 103 | return res; |
| 115 | } | 104 | } |
| 116 | 105 | ||
| 117 | static constexpr std::size_t NumReplicateEntries(u32 num_bits) { | 106 | constexpr std::size_t NumReplicateEntries(u32 num_bits) { |
| 118 | return std::size_t(1) << num_bits; | 107 | return std::size_t(1) << num_bits; |
| 119 | } | 108 | } |
| 120 | 109 | ||
| 121 | template <typename IntType, u32 num_bits, u32 to_bit> | 110 | template <typename IntType, u32 num_bits, u32 to_bit> |
| 122 | static constexpr auto MakeReplicateTable() { | 111 | constexpr auto MakeReplicateTable() { |
| 123 | std::array<IntType, NumReplicateEntries(num_bits)> table{}; | 112 | std::array<IntType, NumReplicateEntries(num_bits)> table{}; |
| 124 | for (IntType value = 0; value < static_cast<IntType>(std::size(table)); ++value) { | 113 | for (IntType value = 0; value < static_cast<IntType>(std::size(table)); ++value) { |
| 125 | table[value] = Replicate(value, num_bits, to_bit); | 114 | table[value] = Replicate(value, num_bits, to_bit); |
| @@ -127,78 +116,17 @@ static constexpr auto MakeReplicateTable() { | |||
| 127 | return table; | 116 | return table; |
| 128 | } | 117 | } |
| 129 | 118 | ||
| 130 | static constexpr auto REPLICATE_BYTE_TO_16_TABLE = MakeReplicateTable<u32, 8, 16>(); | 119 | constexpr auto REPLICATE_BYTE_TO_16_TABLE = MakeReplicateTable<u32, 8, 16>(); |
| 131 | static constexpr u32 ReplicateByteTo16(std::size_t value) { | 120 | constexpr auto REPLICATE_6_BIT_TO_8_TABLE = MakeReplicateTable<u32, 6, 8>(); |
| 132 | return REPLICATE_BYTE_TO_16_TABLE[value]; | 121 | constexpr auto REPLICATE_7_BIT_TO_8_TABLE = MakeReplicateTable<u32, 7, 8>(); |
| 133 | } | 122 | constexpr auto REPLICATE_8_BIT_TO_8_TABLE = MakeReplicateTable<u32, 8, 8>(); |
| 134 | 123 | ||
| 135 | static constexpr auto REPLICATE_BIT_TO_7_TABLE = MakeReplicateTable<u32, 1, 7>(); | 124 | struct AstcBufferData { |
| 136 | static constexpr u32 ReplicateBitTo7(std::size_t value) { | 125 | decltype(EncodingsValues) encoding_values = EncodingsValues; |
| 137 | return REPLICATE_BIT_TO_7_TABLE[value]; | 126 | decltype(REPLICATE_6_BIT_TO_8_TABLE) replicate_6_to_8 = REPLICATE_6_BIT_TO_8_TABLE; |
| 138 | } | 127 | decltype(REPLICATE_7_BIT_TO_8_TABLE) replicate_7_to_8 = REPLICATE_7_BIT_TO_8_TABLE; |
| 139 | 128 | decltype(REPLICATE_8_BIT_TO_8_TABLE) replicate_8_to_8 = REPLICATE_8_BIT_TO_8_TABLE; | |
| 140 | static constexpr auto REPLICATE_BIT_TO_9_TABLE = MakeReplicateTable<u32, 1, 9>(); | 129 | decltype(REPLICATE_BYTE_TO_16_TABLE) replicate_byte_to_16 = REPLICATE_BYTE_TO_16_TABLE; |
| 141 | static constexpr u32 ReplicateBitTo9(std::size_t value) { | 130 | } constexpr ASTC_BUFFER_DATA; |
| 142 | return REPLICATE_BIT_TO_9_TABLE[value]; | ||
| 143 | } | ||
| 144 | |||
| 145 | static constexpr auto REPLICATE_1_BIT_TO_8_TABLE = MakeReplicateTable<u32, 1, 8>(); | ||
| 146 | static constexpr auto REPLICATE_2_BIT_TO_8_TABLE = MakeReplicateTable<u32, 2, 8>(); | ||
| 147 | static constexpr auto REPLICATE_3_BIT_TO_8_TABLE = MakeReplicateTable<u32, 3, 8>(); | ||
| 148 | static constexpr auto REPLICATE_4_BIT_TO_8_TABLE = MakeReplicateTable<u32, 4, 8>(); | ||
| 149 | static constexpr auto REPLICATE_5_BIT_TO_8_TABLE = MakeReplicateTable<u32, 5, 8>(); | ||
| 150 | static constexpr auto REPLICATE_6_BIT_TO_8_TABLE = MakeReplicateTable<u32, 6, 8>(); | ||
| 151 | static constexpr auto REPLICATE_7_BIT_TO_8_TABLE = MakeReplicateTable<u32, 7, 8>(); | ||
| 152 | static constexpr auto REPLICATE_8_BIT_TO_8_TABLE = MakeReplicateTable<u32, 8, 8>(); | ||
| 153 | /// Use a precompiled table with the most common usages, if it's not in the expected range, fallback | ||
| 154 | /// to the runtime implementation | ||
| 155 | static constexpr u32 FastReplicateTo8(u32 value, u32 num_bits) { | ||
| 156 | switch (num_bits) { | ||
| 157 | case 1: | ||
| 158 | return REPLICATE_1_BIT_TO_8_TABLE[value]; | ||
| 159 | case 2: | ||
| 160 | return REPLICATE_2_BIT_TO_8_TABLE[value]; | ||
| 161 | case 3: | ||
| 162 | return REPLICATE_3_BIT_TO_8_TABLE[value]; | ||
| 163 | case 4: | ||
| 164 | return REPLICATE_4_BIT_TO_8_TABLE[value]; | ||
| 165 | case 5: | ||
| 166 | return REPLICATE_5_BIT_TO_8_TABLE[value]; | ||
| 167 | case 6: | ||
| 168 | return REPLICATE_6_BIT_TO_8_TABLE[value]; | ||
| 169 | case 7: | ||
| 170 | return REPLICATE_7_BIT_TO_8_TABLE[value]; | ||
| 171 | case 8: | ||
| 172 | return REPLICATE_8_BIT_TO_8_TABLE[value]; | ||
| 173 | default: | ||
| 174 | return Replicate(value, num_bits, 8); | ||
| 175 | } | ||
| 176 | } | ||
| 177 | |||
| 178 | static constexpr auto REPLICATE_1_BIT_TO_6_TABLE = MakeReplicateTable<u32, 1, 6>(); | ||
| 179 | static constexpr auto REPLICATE_2_BIT_TO_6_TABLE = MakeReplicateTable<u32, 2, 6>(); | ||
| 180 | static constexpr auto REPLICATE_3_BIT_TO_6_TABLE = MakeReplicateTable<u32, 3, 6>(); | ||
| 181 | static constexpr auto REPLICATE_4_BIT_TO_6_TABLE = MakeReplicateTable<u32, 4, 6>(); | ||
| 182 | static constexpr auto REPLICATE_5_BIT_TO_6_TABLE = MakeReplicateTable<u32, 5, 6>(); | ||
| 183 | |||
| 184 | static constexpr u32 FastReplicateTo6(u32 value, u32 num_bits) { | ||
| 185 | switch (num_bits) { | ||
| 186 | case 1: | ||
| 187 | return REPLICATE_1_BIT_TO_6_TABLE[value]; | ||
| 188 | case 2: | ||
| 189 | return REPLICATE_2_BIT_TO_6_TABLE[value]; | ||
| 190 | case 3: | ||
| 191 | return REPLICATE_3_BIT_TO_6_TABLE[value]; | ||
| 192 | case 4: | ||
| 193 | return REPLICATE_4_BIT_TO_6_TABLE[value]; | ||
| 194 | case 5: | ||
| 195 | return REPLICATE_5_BIT_TO_6_TABLE[value]; | ||
| 196 | default: | ||
| 197 | return Replicate(value, num_bits, 6); | ||
| 198 | } | ||
| 199 | } | ||
| 200 | |||
| 201 | void Decompress(std::span<const uint8_t> data, uint32_t width, uint32_t height, uint32_t depth, | ||
| 202 | uint32_t block_width, uint32_t block_height, std::span<uint8_t> output); | ||
| 203 | 131 | ||
| 204 | } // namespace Tegra::Texture::ASTC | 132 | } // namespace Tegra::Texture::ASTC |