diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/video_core/host_shaders/astc_decoder.comp | 988 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/util_shaders.cpp | 1 |
2 files changed, 454 insertions, 535 deletions
diff --git a/src/video_core/host_shaders/astc_decoder.comp b/src/video_core/host_shaders/astc_decoder.comp index bf2693559..5ff17cd0c 100644 --- a/src/video_core/host_shaders/astc_decoder.comp +++ b/src/video_core/host_shaders/astc_decoder.comp | |||
| @@ -33,26 +33,14 @@ UNIFORM(6) uint block_height_mask; | |||
| 33 | END_PUSH_CONSTANTS | 33 | END_PUSH_CONSTANTS |
| 34 | 34 | ||
| 35 | struct EncodingData { | 35 | struct EncodingData { |
| 36 | uint encoding; | 36 | uint data; |
| 37 | uint num_bits; | ||
| 38 | uint bit_value; | ||
| 39 | uint quint_trit_value; | ||
| 40 | }; | 37 | }; |
| 41 | 38 | ||
| 42 | struct TexelWeightParams { | 39 | layout(binding = BINDING_INPUT_BUFFER, std430) readonly restrict buffer InputBufferU32 { |
| 43 | uvec2 size; | ||
| 44 | uint max_weight; | ||
| 45 | bool dual_plane; | ||
| 46 | bool error_state; | ||
| 47 | bool void_extent_ldr; | ||
| 48 | bool void_extent_hdr; | ||
| 49 | }; | ||
| 50 | |||
| 51 | layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU32 { | ||
| 52 | uvec4 astc_data[]; | 40 | uvec4 astc_data[]; |
| 53 | }; | 41 | }; |
| 54 | 42 | ||
| 55 | layout(binding = BINDING_OUTPUT_IMAGE, rgba8) uniform writeonly image2DArray dest_image; | 43 | layout(binding = BINDING_OUTPUT_IMAGE, rgba8) uniform writeonly restrict image2DArray dest_image; |
| 56 | 44 | ||
| 57 | const uint GOB_SIZE_X_SHIFT = 6; | 45 | const uint GOB_SIZE_X_SHIFT = 6; |
| 58 | const uint GOB_SIZE_Y_SHIFT = 3; | 46 | const uint GOB_SIZE_Y_SHIFT = 3; |
| @@ -60,64 +48,21 @@ const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT; | |||
| 60 | 48 | ||
| 61 | const uint BYTES_PER_BLOCK_LOG2 = 4; | 49 | const uint BYTES_PER_BLOCK_LOG2 = 4; |
| 62 | 50 | ||
| 63 | const int JUST_BITS = 0; | 51 | const uint JUST_BITS = 0u; |
| 64 | const int QUINT = 1; | 52 | const uint QUINT = 1u; |
| 65 | const int TRIT = 2; | 53 | const uint TRIT = 2u; |
| 66 | 54 | ||
| 67 | // ASTC Encodings data, sorted in ascending order based on their BitLength value | 55 | // ASTC Encodings data, sorted in ascending order based on their BitLength value |
| 68 | // (see GetBitLength() function) | 56 | // (see GetBitLength() function) |
| 69 | EncodingData encoding_values[22] = EncodingData[]( | 57 | const uint encoding_values[22] = uint[]( |
| 70 | EncodingData(JUST_BITS, 0, 0, 0), EncodingData(JUST_BITS, 1, 0, 0), EncodingData(TRIT, 0, 0, 0), | 58 | (JUST_BITS), (JUST_BITS | (1u << 8u)), (TRIT), (JUST_BITS | (2u << 8u)), |
| 71 | EncodingData(JUST_BITS, 2, 0, 0), EncodingData(QUINT, 0, 0, 0), EncodingData(TRIT, 1, 0, 0), | 59 | (QUINT), (TRIT | (1u << 8u)), (JUST_BITS | (3u << 8u)), (QUINT | (1u << 8u)), |
| 72 | EncodingData(JUST_BITS, 3, 0, 0), EncodingData(QUINT, 1, 0, 0), EncodingData(TRIT, 2, 0, 0), | 60 | (TRIT | (2u << 8u)), (JUST_BITS | (4u << 8u)), (QUINT | (2u << 8u)), (TRIT | (3u << 8u)), |
| 73 | EncodingData(JUST_BITS, 4, 0, 0), EncodingData(QUINT, 2, 0, 0), EncodingData(TRIT, 3, 0, 0), | 61 | (JUST_BITS | (5u << 8u)), (QUINT | (3u << 8u)), (TRIT | (4u << 8u)), (JUST_BITS | (6u << 8u)), |
| 74 | EncodingData(JUST_BITS, 5, 0, 0), EncodingData(QUINT, 3, 0, 0), EncodingData(TRIT, 4, 0, 0), | 62 | (QUINT | (4u << 8u)), (TRIT | (5u << 8u)), (JUST_BITS | (7u << 8u)), (QUINT | (5u << 8u)), |
| 75 | EncodingData(JUST_BITS, 6, 0, 0), EncodingData(QUINT, 4, 0, 0), EncodingData(TRIT, 5, 0, 0), | 63 | (TRIT | (6u << 8u)), (JUST_BITS | (8u << 8u))); |
| 76 | EncodingData(JUST_BITS, 7, 0, 0), EncodingData(QUINT, 5, 0, 0), EncodingData(TRIT, 6, 0, 0), | ||
| 77 | EncodingData(JUST_BITS, 8, 0, 0) | ||
| 78 | ); | ||
| 79 | |||
| 80 | // The following constants are expanded variants of the Replicate() | ||
| 81 | // function calls corresponding to the following arguments: | ||
| 82 | // value: index into the generated table | ||
| 83 | // num_bits: the after "REPLICATE" in the table name. i.e. 4 is num_bits in REPLICATE_4. | ||
| 84 | // to_bit: the integer after "TO_" | ||
| 85 | const uint REPLICATE_BIT_TO_7_TABLE[2] = uint[](0, 127); | ||
| 86 | const uint REPLICATE_1_BIT_TO_9_TABLE[2] = uint[](0, 511); | ||
| 87 | |||
| 88 | const uint REPLICATE_1_BIT_TO_8_TABLE[2] = uint[](0, 255); | ||
| 89 | const uint REPLICATE_2_BIT_TO_8_TABLE[4] = uint[](0, 85, 170, 255); | ||
| 90 | const uint REPLICATE_3_BIT_TO_8_TABLE[8] = uint[](0, 36, 73, 109, 146, 182, 219, 255); | ||
| 91 | const uint REPLICATE_4_BIT_TO_8_TABLE[16] = | ||
| 92 | uint[](0, 17, 34, 51, 68, 85, 102, 119, 136, 153, 170, 187, 204, 221, 238, 255); | ||
| 93 | const uint REPLICATE_5_BIT_TO_8_TABLE[32] = | ||
| 94 | uint[](0, 8, 16, 24, 33, 41, 49, 57, 66, 74, 82, 90, 99, 107, 115, 123, 132, 140, 148, 156, 165, | ||
| 95 | 173, 181, 189, 198, 206, 214, 222, 231, 239, 247, 255); | ||
| 96 | const uint REPLICATE_1_BIT_TO_6_TABLE[2] = uint[](0, 63); | ||
| 97 | const uint REPLICATE_2_BIT_TO_6_TABLE[4] = uint[](0, 21, 42, 63); | ||
| 98 | const uint REPLICATE_3_BIT_TO_6_TABLE[8] = uint[](0, 9, 18, 27, 36, 45, 54, 63); | ||
| 99 | const uint REPLICATE_4_BIT_TO_6_TABLE[16] = | ||
| 100 | uint[](0, 4, 8, 12, 17, 21, 25, 29, 34, 38, 42, 46, 51, 55, 59, 63); | ||
| 101 | const uint REPLICATE_5_BIT_TO_6_TABLE[32] = | ||
| 102 | uint[](0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 33, 35, 37, 39, 41, 43, 45, | ||
| 103 | 47, 49, 51, 53, 55, 57, 59, 61, 63); | ||
| 104 | const uint REPLICATE_6_BIT_TO_8_TABLE[64] = | ||
| 105 | uint[](0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 65, 69, 73, 77, 81, 85, 89, | ||
| 106 | 93, 97, 101, 105, 109, 113, 117, 121, 125, 130, 134, 138, 142, 146, 150, 154, 158, 162, | ||
| 107 | 166, 170, 174, 178, 182, 186, 190, 195, 199, 203, 207, 211, 215, 219, 223, 227, 231, 235, | ||
| 108 | 239, 243, 247, 251, 255); | ||
| 109 | const uint REPLICATE_7_BIT_TO_8_TABLE[128] = | ||
| 110 | uint[](0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, | ||
| 111 | 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, | ||
| 112 | 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126, | ||
| 113 | 129, 131, 133, 135, 137, 139, 141, 143, 145, 147, 149, 151, 153, 155, 157, 159, 161, 163, | ||
| 114 | 165, 167, 169, 171, 173, 175, 177, 179, 181, 183, 185, 187, 189, 191, 193, 195, 197, 199, | ||
| 115 | 201, 203, 205, 207, 209, 211, 213, 215, 217, 219, 221, 223, 225, 227, 229, 231, 233, 235, | ||
| 116 | 237, 239, 241, 243, 245, 247, 249, 251, 253, 255); | ||
| 117 | 64 | ||
| 118 | // Input ASTC texture globals | 65 | // Input ASTC texture globals |
| 119 | uint current_index = 0; | ||
| 120 | int bitsread = 0; | ||
| 121 | int total_bitsread = 0; | 66 | int total_bitsread = 0; |
| 122 | uvec4 local_buff; | 67 | uvec4 local_buff; |
| 123 | 68 | ||
| @@ -125,50 +70,60 @@ uvec4 local_buff; | |||
| 125 | uvec4 color_endpoint_data; | 70 | uvec4 color_endpoint_data; |
| 126 | int color_bitsread = 0; | 71 | int color_bitsread = 0; |
| 127 | 72 | ||
| 128 | // Four values, two endpoints, four maximum partitions | 73 | // Global "vector" to be pushed into when decoding |
| 129 | uint color_values[32]; | 74 | // At most will require BLOCK_WIDTH x BLOCK_HEIGHT in single plane mode |
| 130 | int colvals_index = 0; | 75 | // At most will require BLOCK_WIDTH x BLOCK_HEIGHT x 2 in dual plane mode |
| 131 | 76 | // So the maximum would be 144 (12 x 12) elements, x 2 for two planes | |
| 132 | // Weight data globals | 77 | #define DIVCEIL(number, divisor) (number + divisor - 1) / divisor |
| 133 | uvec4 texel_weight_data; | 78 | #define ARRAY_NUM_ELEMENTS 144 |
| 134 | int texel_bitsread = 0; | 79 | #define VECTOR_ARRAY_SIZE DIVCEIL(ARRAY_NUM_ELEMENTS * 2, 4) |
| 80 | uint result_vector[ARRAY_NUM_ELEMENTS * 2]; | ||
| 135 | 81 | ||
| 136 | bool texel_flag = false; | ||
| 137 | |||
| 138 | // Global "vectors" to be pushed into when decoding | ||
| 139 | EncodingData result_vector[144]; | ||
| 140 | int result_index = 0; | 82 | int result_index = 0; |
| 83 | uint result_vector_max_index; | ||
| 84 | bool result_limit_reached = false; | ||
| 141 | 85 | ||
| 142 | EncodingData texel_vector[144]; | 86 | // EncodingData helpers |
| 143 | int texel_vector_index = 0; | 87 | uint Encoding(EncodingData val) { |
| 88 | return bitfieldExtract(val.data, 0, 8); | ||
| 89 | } | ||
| 90 | uint NumBits(EncodingData val) { | ||
| 91 | return bitfieldExtract(val.data, 8, 8); | ||
| 92 | } | ||
| 93 | uint BitValue(EncodingData val) { | ||
| 94 | return bitfieldExtract(val.data, 16, 8); | ||
| 95 | } | ||
| 96 | uint QuintTritValue(EncodingData val) { | ||
| 97 | return bitfieldExtract(val.data, 24, 8); | ||
| 98 | } | ||
| 144 | 99 | ||
| 145 | uint unquantized_texel_weights[2][144]; | 100 | void Encoding(inout EncodingData val, uint v) { |
| 101 | val.data = bitfieldInsert(val.data, v, 0, 8); | ||
| 102 | } | ||
| 103 | void NumBits(inout EncodingData val, uint v) { | ||
| 104 | val.data = bitfieldInsert(val.data, v, 8, 8); | ||
| 105 | } | ||
| 106 | void BitValue(inout EncodingData val, uint v) { | ||
| 107 | val.data = bitfieldInsert(val.data, v, 16, 8); | ||
| 108 | } | ||
| 109 | void QuintTritValue(inout EncodingData val, uint v) { | ||
| 110 | val.data = bitfieldInsert(val.data, v, 24, 8); | ||
| 111 | } | ||
| 146 | 112 | ||
| 147 | uint SwizzleOffset(uvec2 pos) { | 113 | EncodingData CreateEncodingData(uint encoding, uint num_bits, uint bit_val, uint quint_trit_val) { |
| 148 | uint x = pos.x; | 114 | return EncodingData(((encoding) << 0u) | ((num_bits) << 8u) | |
| 149 | uint y = pos.y; | 115 | ((bit_val) << 16u) | ((quint_trit_val) << 24u)); |
| 150 | return ((x % 64) / 32) * 256 + ((y % 8) / 2) * 64 + ((x % 32) / 16) * 32 + | ||
| 151 | (y % 2) * 16 + (x % 16); | ||
| 152 | } | 116 | } |
| 153 | 117 | ||
| 154 | // Replicates low num_bits such that [(to_bit - 1):(to_bit - 1 - from_bit)] | 118 | |
| 155 | // is the same as [(num_bits - 1):0] and repeats all the way down. | 119 | void ResultEmplaceBack(EncodingData val) { |
| 156 | uint Replicate(uint val, uint num_bits, uint to_bit) { | 120 | if (result_index >= result_vector_max_index) { |
| 157 | const uint v = val & uint((1 << num_bits) - 1); | 121 | // Alert callers to avoid decoding more than needed by this phase |
| 158 | uint res = v; | 122 | result_limit_reached = true; |
| 159 | uint reslen = num_bits; | 123 | return; |
| 160 | while (reslen < to_bit) { | ||
| 161 | uint comp = 0; | ||
| 162 | if (num_bits > to_bit - reslen) { | ||
| 163 | uint newshift = to_bit - reslen; | ||
| 164 | comp = num_bits - newshift; | ||
| 165 | num_bits = newshift; | ||
| 166 | } | ||
| 167 | res = uint(res << num_bits); | ||
| 168 | res = uint(res | (v >> comp)); | ||
| 169 | reslen += num_bits; | ||
| 170 | } | 124 | } |
| 171 | return res; | 125 | result_vector[result_index] = val.data; |
| 126 | ++result_index; | ||
| 172 | } | 127 | } |
| 173 | 128 | ||
| 174 | uvec4 ReplicateByteTo16(uvec4 value) { | 129 | uvec4 ReplicateByteTo16(uvec4 value) { |
| @@ -176,64 +131,40 @@ uvec4 ReplicateByteTo16(uvec4 value) { | |||
| 176 | } | 131 | } |
| 177 | 132 | ||
| 178 | uint ReplicateBitTo7(uint value) { | 133 | uint ReplicateBitTo7(uint value) { |
| 179 | return REPLICATE_BIT_TO_7_TABLE[value]; | 134 | return value * 127; |
| 180 | } | 135 | } |
| 181 | 136 | ||
| 182 | uint ReplicateBitTo9(uint value) { | 137 | uint ReplicateBitTo9(uint value) { |
| 183 | return REPLICATE_1_BIT_TO_9_TABLE[value]; | 138 | return value * 511; |
| 184 | } | 139 | } |
| 185 | 140 | ||
| 186 | uint FastReplicate(uint value, uint num_bits, uint to_bit) { | 141 | uint ReplicateBits(uint value, uint num_bits, uint to_bit) { |
| 187 | if (num_bits == 0) { | 142 | if (value == 0 || num_bits == 0) { |
| 188 | return 0; | 143 | return 0; |
| 189 | } | 144 | } |
| 190 | if (num_bits == to_bit) { | 145 | if (num_bits >= to_bit) { |
| 191 | return value; | 146 | return value; |
| 192 | } | 147 | } |
| 193 | if (to_bit == 6) { | 148 | const uint v = value & uint((1 << num_bits) - 1); |
| 194 | switch (num_bits) { | 149 | uint res = v; |
| 195 | case 1: | 150 | uint reslen = num_bits; |
| 196 | return REPLICATE_1_BIT_TO_6_TABLE[value]; | 151 | while (reslen < to_bit) { |
| 197 | case 2: | 152 | const uint num_dst_bits_to_shift_up = min(num_bits, to_bit - reslen); |
| 198 | return REPLICATE_2_BIT_TO_6_TABLE[value]; | 153 | const uint num_src_bits_to_shift_down = num_bits - num_dst_bits_to_shift_up; |
| 199 | case 3: | 154 | |
| 200 | return REPLICATE_3_BIT_TO_6_TABLE[value]; | 155 | res <<= num_dst_bits_to_shift_up; |
| 201 | case 4: | 156 | res |= (v >> num_src_bits_to_shift_down); |
| 202 | return REPLICATE_4_BIT_TO_6_TABLE[value]; | 157 | reslen += num_bits; |
| 203 | case 5: | ||
| 204 | return REPLICATE_5_BIT_TO_6_TABLE[value]; | ||
| 205 | default: | ||
| 206 | break; | ||
| 207 | } | ||
| 208 | } else { /* if (to_bit == 8) */ | ||
| 209 | switch (num_bits) { | ||
| 210 | case 1: | ||
| 211 | return REPLICATE_1_BIT_TO_8_TABLE[value]; | ||
| 212 | case 2: | ||
| 213 | return REPLICATE_2_BIT_TO_8_TABLE[value]; | ||
| 214 | case 3: | ||
| 215 | return REPLICATE_3_BIT_TO_8_TABLE[value]; | ||
| 216 | case 4: | ||
| 217 | return REPLICATE_4_BIT_TO_8_TABLE[value]; | ||
| 218 | case 5: | ||
| 219 | return REPLICATE_5_BIT_TO_8_TABLE[value]; | ||
| 220 | case 6: | ||
| 221 | return REPLICATE_6_BIT_TO_8_TABLE[value]; | ||
| 222 | case 7: | ||
| 223 | return REPLICATE_7_BIT_TO_8_TABLE[value]; | ||
| 224 | default: | ||
| 225 | break; | ||
| 226 | } | ||
| 227 | } | 158 | } |
| 228 | return Replicate(value, num_bits, to_bit); | 159 | return res; |
| 229 | } | 160 | } |
| 230 | 161 | ||
| 231 | uint FastReplicateTo8(uint value, uint num_bits) { | 162 | uint FastReplicateTo8(uint value, uint num_bits) { |
| 232 | return FastReplicate(value, num_bits, 8); | 163 | return ReplicateBits(value, num_bits, 8); |
| 233 | } | 164 | } |
| 234 | 165 | ||
| 235 | uint FastReplicateTo6(uint value, uint num_bits) { | 166 | uint FastReplicateTo6(uint value, uint num_bits) { |
| 236 | return FastReplicate(value, num_bits, 6); | 167 | return ReplicateBits(value, num_bits, 6); |
| 237 | } | 168 | } |
| 238 | 169 | ||
| 239 | uint Div3Floor(uint v) { | 170 | uint Div3Floor(uint v) { |
| @@ -266,15 +197,15 @@ uint Hash52(uint p) { | |||
| 266 | return p; | 197 | return p; |
| 267 | } | 198 | } |
| 268 | 199 | ||
| 269 | uint Select2DPartition(uint seed, uint x, uint y, uint partition_count, bool small_block) { | 200 | uint Select2DPartition(uint seed, uint x, uint y, uint partition_count) { |
| 270 | if (small_block) { | 201 | if ((block_dims.y * block_dims.x) < 32) { |
| 271 | x <<= 1; | 202 | x <<= 1; |
| 272 | y <<= 1; | 203 | y <<= 1; |
| 273 | } | 204 | } |
| 274 | 205 | ||
| 275 | seed += (partition_count - 1) * 1024; | 206 | seed += (partition_count - 1) * 1024; |
| 276 | 207 | ||
| 277 | uint rnum = Hash52(uint(seed)); | 208 | const uint rnum = Hash52(uint(seed)); |
| 278 | uint seed1 = uint(rnum & 0xF); | 209 | uint seed1 = uint(rnum & 0xF); |
| 279 | uint seed2 = uint((rnum >> 4) & 0xF); | 210 | uint seed2 = uint((rnum >> 4) & 0xF); |
| 280 | uint seed3 = uint((rnum >> 8) & 0xF); | 211 | uint seed3 = uint((rnum >> 8) & 0xF); |
| @@ -342,53 +273,52 @@ uint ExtractBits(uvec4 payload, int offset, int bits) { | |||
| 342 | if (bits <= 0) { | 273 | if (bits <= 0) { |
| 343 | return 0; | 274 | return 0; |
| 344 | } | 275 | } |
| 345 | int last_offset = offset + bits - 1; | 276 | if (bits > 32) { |
| 346 | int shifted_offset = offset >> 5; | 277 | return 0; |
| 278 | } | ||
| 279 | const int last_offset = offset + bits - 1; | ||
| 280 | const int shifted_offset = offset >> 5; | ||
| 347 | if ((last_offset >> 5) == shifted_offset) { | 281 | if ((last_offset >> 5) == shifted_offset) { |
| 348 | return bitfieldExtract(payload[shifted_offset], offset & 31, bits); | 282 | return bitfieldExtract(payload[shifted_offset], offset & 31, bits); |
| 349 | } | 283 | } |
| 350 | int first_bits = 32 - (offset & 31); | 284 | const int first_bits = 32 - (offset & 31); |
| 351 | int result_first = int(bitfieldExtract(payload[shifted_offset], offset & 31, first_bits)); | 285 | const int result_first = int(bitfieldExtract(payload[shifted_offset], offset & 31, first_bits)); |
| 352 | int result_second = int(bitfieldExtract(payload[shifted_offset + 1], 0, bits - first_bits)); | 286 | const int result_second = int(bitfieldExtract(payload[shifted_offset + 1], 0, bits - first_bits)); |
| 353 | return result_first | (result_second << first_bits); | 287 | return result_first | (result_second << first_bits); |
| 354 | } | 288 | } |
| 355 | 289 | ||
| 356 | uint StreamBits(uint num_bits) { | 290 | uint StreamBits(uint num_bits) { |
| 357 | int int_bits = int(num_bits); | 291 | const int int_bits = int(num_bits); |
| 358 | uint ret = ExtractBits(local_buff, total_bitsread, int_bits); | 292 | const uint ret = ExtractBits(local_buff, total_bitsread, int_bits); |
| 359 | total_bitsread += int_bits; | 293 | total_bitsread += int_bits; |
| 360 | return ret; | 294 | return ret; |
| 361 | } | 295 | } |
| 362 | 296 | ||
| 297 | void SkipBits(uint num_bits) { | ||
| 298 | const int int_bits = int(num_bits); | ||
| 299 | total_bitsread += int_bits; | ||
| 300 | } | ||
| 301 | |||
| 363 | uint StreamColorBits(uint num_bits) { | 302 | uint StreamColorBits(uint num_bits) { |
| 364 | uint ret = 0; | 303 | const int int_bits = int(num_bits); |
| 365 | int int_bits = int(num_bits); | 304 | const uint ret = ExtractBits(color_endpoint_data, color_bitsread, int_bits); |
| 366 | if (texel_flag) { | 305 | color_bitsread += int_bits; |
| 367 | ret = ExtractBits(texel_weight_data, texel_bitsread, int_bits); | ||
| 368 | texel_bitsread += int_bits; | ||
| 369 | } else { | ||
| 370 | ret = ExtractBits(color_endpoint_data, color_bitsread, int_bits); | ||
| 371 | color_bitsread += int_bits; | ||
| 372 | } | ||
| 373 | return ret; | 306 | return ret; |
| 374 | } | 307 | } |
| 375 | 308 | ||
| 376 | void ResultEmplaceBack(EncodingData val) { | 309 | EncodingData GetEncodingFromVector(uint index) { |
| 377 | if (texel_flag) { | 310 | const uint data = result_vector[index]; |
| 378 | texel_vector[texel_vector_index] = val; | 311 | return EncodingData(data); |
| 379 | ++texel_vector_index; | ||
| 380 | } else { | ||
| 381 | result_vector[result_index] = val; | ||
| 382 | ++result_index; | ||
| 383 | } | ||
| 384 | } | 312 | } |
| 385 | 313 | ||
| 386 | // Returns the number of bits required to encode n_vals values. | 314 | // Returns the number of bits required to encode n_vals values. |
| 387 | uint GetBitLength(uint n_vals, uint encoding_index) { | 315 | uint GetBitLength(uint n_vals, uint encoding_index) { |
| 388 | uint total_bits = encoding_values[encoding_index].num_bits * n_vals; | 316 | const EncodingData encoding_value = EncodingData(encoding_values[encoding_index]); |
| 389 | if (encoding_values[encoding_index].encoding == TRIT) { | 317 | const uint encoding = Encoding(encoding_value); |
| 318 | uint total_bits = NumBits(encoding_value) * n_vals; | ||
| 319 | if (encoding == TRIT) { | ||
| 390 | total_bits += Div5Ceil(n_vals * 8); | 320 | total_bits += Div5Ceil(n_vals * 8); |
| 391 | } else if (encoding_values[encoding_index].encoding == QUINT) { | 321 | } else if (encoding == QUINT) { |
| 392 | total_bits += Div3Ceil(n_vals * 7); | 322 | total_bits += Div3Ceil(n_vals * 7); |
| 393 | } | 323 | } |
| 394 | return total_bits; | 324 | return total_bits; |
| @@ -403,7 +333,7 @@ uint GetNumWeightValues(uvec2 size, bool dual_plane) { | |||
| 403 | } | 333 | } |
| 404 | 334 | ||
| 405 | uint GetPackedBitSize(uvec2 size, bool dual_plane, uint max_weight) { | 335 | uint GetPackedBitSize(uvec2 size, bool dual_plane, uint max_weight) { |
| 406 | uint n_vals = GetNumWeightValues(size, dual_plane); | 336 | const uint n_vals = GetNumWeightValues(size, dual_plane); |
| 407 | return GetBitLength(n_vals, max_weight); | 337 | return GetBitLength(n_vals, max_weight); |
| 408 | } | 338 | } |
| 409 | 339 | ||
| @@ -412,87 +342,74 @@ uint BitsBracket(uint bits, uint pos) { | |||
| 412 | } | 342 | } |
| 413 | 343 | ||
| 414 | uint BitsOp(uint bits, uint start, uint end) { | 344 | uint BitsOp(uint bits, uint start, uint end) { |
| 415 | if (start == end) { | 345 | const uint mask = (1 << (end - start + 1)) - 1; |
| 416 | return BitsBracket(bits, start); | ||
| 417 | } else if (start > end) { | ||
| 418 | uint t = start; | ||
| 419 | start = end; | ||
| 420 | end = t; | ||
| 421 | } | ||
| 422 | |||
| 423 | uint mask = (1 << (end - start + 1)) - 1; | ||
| 424 | return ((bits >> start) & mask); | 346 | return ((bits >> start) & mask); |
| 425 | } | 347 | } |
| 426 | 348 | ||
| 427 | void DecodeQuintBlock(uint num_bits) { | 349 | void DecodeQuintBlock(uint num_bits) { |
| 428 | uint m[3]; | 350 | uvec3 m; |
| 429 | uint q[3]; | 351 | uvec4 qQ; |
| 430 | uint Q; | ||
| 431 | m[0] = StreamColorBits(num_bits); | 352 | m[0] = StreamColorBits(num_bits); |
| 432 | Q = StreamColorBits(3); | 353 | qQ.w = StreamColorBits(3); |
| 433 | m[1] = StreamColorBits(num_bits); | 354 | m[1] = StreamColorBits(num_bits); |
| 434 | Q |= StreamColorBits(2) << 3; | 355 | qQ.w |= StreamColorBits(2) << 3; |
| 435 | m[2] = StreamColorBits(num_bits); | 356 | m[2] = StreamColorBits(num_bits); |
| 436 | Q |= StreamColorBits(2) << 5; | 357 | qQ.w |= StreamColorBits(2) << 5; |
| 437 | if (BitsOp(Q, 1, 2) == 3 && BitsOp(Q, 5, 6) == 0) { | 358 | if (BitsOp(qQ.w, 1, 2) == 3 && BitsOp(qQ.w, 5, 6) == 0) { |
| 438 | q[0] = 4; | 359 | qQ.x = 4; |
| 439 | q[1] = 4; | 360 | qQ.y = 4; |
| 440 | q[2] = (BitsBracket(Q, 0) << 2) | ((BitsBracket(Q, 4) & ~BitsBracket(Q, 0)) << 1) | | 361 | qQ.z = (BitsBracket(qQ.w, 0) << 2) | ((BitsBracket(qQ.w, 4) & ~BitsBracket(qQ.w, 0)) << 1) | |
| 441 | (BitsBracket(Q, 3) & ~BitsBracket(Q, 0)); | 362 | (BitsBracket(qQ.w, 3) & ~BitsBracket(qQ.w, 0)); |
| 442 | } else { | 363 | } else { |
| 443 | uint C = 0; | 364 | uint C = 0; |
| 444 | if (BitsOp(Q, 1, 2) == 3) { | 365 | if (BitsOp(qQ.w, 1, 2) == 3) { |
| 445 | q[2] = 4; | 366 | qQ.z = 4; |
| 446 | C = (BitsOp(Q, 3, 4) << 3) | ((~BitsOp(Q, 5, 6) & 3) << 1) | BitsBracket(Q, 0); | 367 | C = (BitsOp(qQ.w, 3, 4) << 3) | ((~BitsOp(qQ.w, 5, 6) & 3) << 1) | BitsBracket(qQ.w, 0); |
| 447 | } else { | 368 | } else { |
| 448 | q[2] = BitsOp(Q, 5, 6); | 369 | qQ.z = BitsOp(qQ.w, 5, 6); |
| 449 | C = BitsOp(Q, 0, 4); | 370 | C = BitsOp(qQ.w, 0, 4); |
| 450 | } | 371 | } |
| 451 | if (BitsOp(C, 0, 2) == 5) { | 372 | if (BitsOp(C, 0, 2) == 5) { |
| 452 | q[1] = 4; | 373 | qQ.y = 4; |
| 453 | q[0] = BitsOp(C, 3, 4); | 374 | qQ.x = BitsOp(C, 3, 4); |
| 454 | } else { | 375 | } else { |
| 455 | q[1] = BitsOp(C, 3, 4); | 376 | qQ.y = BitsOp(C, 3, 4); |
| 456 | q[0] = BitsOp(C, 0, 2); | 377 | qQ.x = BitsOp(C, 0, 2); |
| 457 | } | 378 | } |
| 458 | } | 379 | } |
| 459 | for (uint i = 0; i < 3; i++) { | 380 | for (uint i = 0; i < 3; i++) { |
| 460 | EncodingData val; | 381 | const EncodingData val = CreateEncodingData(QUINT, num_bits, m[i], qQ[i]); |
| 461 | val.encoding = QUINT; | ||
| 462 | val.num_bits = num_bits; | ||
| 463 | val.bit_value = m[i]; | ||
| 464 | val.quint_trit_value = q[i]; | ||
| 465 | ResultEmplaceBack(val); | 382 | ResultEmplaceBack(val); |
| 466 | } | 383 | } |
| 467 | } | 384 | } |
| 468 | 385 | ||
| 469 | void DecodeTritBlock(uint num_bits) { | 386 | void DecodeTritBlock(uint num_bits) { |
| 470 | uint m[5]; | 387 | uvec4 m; |
| 471 | uint t[5]; | 388 | uvec4 t; |
| 472 | uint T; | 389 | uvec3 Tm5t5; |
| 473 | m[0] = StreamColorBits(num_bits); | 390 | m[0] = StreamColorBits(num_bits); |
| 474 | T = StreamColorBits(2); | 391 | Tm5t5.x = StreamColorBits(2); |
| 475 | m[1] = StreamColorBits(num_bits); | 392 | m[1] = StreamColorBits(num_bits); |
| 476 | T |= StreamColorBits(2) << 2; | 393 | Tm5t5.x |= StreamColorBits(2) << 2; |
| 477 | m[2] = StreamColorBits(num_bits); | 394 | m[2] = StreamColorBits(num_bits); |
| 478 | T |= StreamColorBits(1) << 4; | 395 | Tm5t5.x |= StreamColorBits(1) << 4; |
| 479 | m[3] = StreamColorBits(num_bits); | 396 | m[3] = StreamColorBits(num_bits); |
| 480 | T |= StreamColorBits(2) << 5; | 397 | Tm5t5.x |= StreamColorBits(2) << 5; |
| 481 | m[4] = StreamColorBits(num_bits); | 398 | Tm5t5.y = StreamColorBits(num_bits); |
| 482 | T |= StreamColorBits(1) << 7; | 399 | Tm5t5.x |= StreamColorBits(1) << 7; |
| 483 | uint C = 0; | 400 | uint C = 0; |
| 484 | if (BitsOp(T, 2, 4) == 7) { | 401 | if (BitsOp(Tm5t5.x, 2, 4) == 7) { |
| 485 | C = (BitsOp(T, 5, 7) << 2) | BitsOp(T, 0, 1); | 402 | C = (BitsOp(Tm5t5.x, 5, 7) << 2) | BitsOp(Tm5t5.x, 0, 1); |
| 486 | t[4] = 2; | 403 | Tm5t5.z = 2; |
| 487 | t[3] = 2; | 404 | t[3] = 2; |
| 488 | } else { | 405 | } else { |
| 489 | C = BitsOp(T, 0, 4); | 406 | C = BitsOp(Tm5t5.x, 0, 4); |
| 490 | if (BitsOp(T, 5, 6) == 3) { | 407 | if (BitsOp(Tm5t5.x, 5, 6) == 3) { |
| 491 | t[4] = 2; | 408 | Tm5t5.z = 2; |
| 492 | t[3] = BitsBracket(T, 7); | 409 | t[3] = BitsBracket(Tm5t5.x, 7); |
| 493 | } else { | 410 | } else { |
| 494 | t[4] = BitsBracket(T, 7); | 411 | Tm5t5.z = BitsBracket(Tm5t5.x, 7); |
| 495 | t[3] = BitsOp(T, 5, 6); | 412 | t[3] = BitsOp(Tm5t5.x, 5, 6); |
| 496 | } | 413 | } |
| 497 | } | 414 | } |
| 498 | if (BitsOp(C, 0, 1) == 3) { | 415 | if (BitsOp(C, 0, 1) == 3) { |
| @@ -508,31 +425,31 @@ void DecodeTritBlock(uint num_bits) { | |||
| 508 | t[1] = BitsOp(C, 2, 3); | 425 | t[1] = BitsOp(C, 2, 3); |
| 509 | t[0] = (BitsBracket(C, 1) << 1) | (BitsBracket(C, 0) & ~BitsBracket(C, 1)); | 426 | t[0] = (BitsBracket(C, 1) << 1) | (BitsBracket(C, 0) & ~BitsBracket(C, 1)); |
| 510 | } | 427 | } |
| 511 | for (uint i = 0; i < 5; i++) { | 428 | for (uint i = 0; i < 4; i++) { |
| 512 | EncodingData val; | 429 | const EncodingData val = CreateEncodingData(TRIT, num_bits, m[i], t[i]); |
| 513 | val.encoding = TRIT; | ||
| 514 | val.num_bits = num_bits; | ||
| 515 | val.bit_value = m[i]; | ||
| 516 | val.quint_trit_value = t[i]; | ||
| 517 | ResultEmplaceBack(val); | 430 | ResultEmplaceBack(val); |
| 518 | } | 431 | } |
| 432 | const EncodingData val = CreateEncodingData(TRIT, num_bits, Tm5t5.y, Tm5t5.z); | ||
| 433 | ResultEmplaceBack(val); | ||
| 519 | } | 434 | } |
| 520 | 435 | ||
| 521 | void DecodeIntegerSequence(uint max_range, uint num_values) { | 436 | void DecodeIntegerSequence(uint max_range, uint num_values) { |
| 522 | EncodingData val = encoding_values[max_range]; | 437 | EncodingData val = EncodingData(encoding_values[max_range]); |
| 438 | const uint encoding = Encoding(val); | ||
| 439 | const uint num_bits = NumBits(val); | ||
| 523 | uint vals_decoded = 0; | 440 | uint vals_decoded = 0; |
| 524 | while (vals_decoded < num_values) { | 441 | while (vals_decoded < num_values && !result_limit_reached) { |
| 525 | switch (val.encoding) { | 442 | switch (encoding) { |
| 526 | case QUINT: | 443 | case QUINT: |
| 527 | DecodeQuintBlock(val.num_bits); | 444 | DecodeQuintBlock(num_bits); |
| 528 | vals_decoded += 3; | 445 | vals_decoded += 3; |
| 529 | break; | 446 | break; |
| 530 | case TRIT: | 447 | case TRIT: |
| 531 | DecodeTritBlock(val.num_bits); | 448 | DecodeTritBlock(num_bits); |
| 532 | vals_decoded += 5; | 449 | vals_decoded += 5; |
| 533 | break; | 450 | break; |
| 534 | case JUST_BITS: | 451 | case JUST_BITS: |
| 535 | val.bit_value = StreamColorBits(val.num_bits); | 452 | BitValue(val, StreamColorBits(num_bits)); |
| 536 | ResultEmplaceBack(val); | 453 | ResultEmplaceBack(val); |
| 537 | vals_decoded++; | 454 | vals_decoded++; |
| 538 | break; | 455 | break; |
| @@ -540,7 +457,7 @@ void DecodeIntegerSequence(uint max_range, uint num_values) { | |||
| 540 | } | 457 | } |
| 541 | } | 458 | } |
| 542 | 459 | ||
| 543 | void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits) { | 460 | void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits, out uint color_values[32]) { |
| 544 | uint num_values = 0; | 461 | uint num_values = 0; |
| 545 | for (uint i = 0; i < num_partitions; i++) { | 462 | for (uint i = 0; i < num_partitions; i++) { |
| 546 | num_values += ((modes[i] >> 2) + 1) << 1; | 463 | num_values += ((modes[i] >> 2) + 1) << 1; |
| @@ -549,7 +466,7 @@ void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits) { | |||
| 549 | // TODO(ameerj): profile with binary search | 466 | // TODO(ameerj): profile with binary search |
| 550 | int range = 0; | 467 | int range = 0; |
| 551 | while (++range < encoding_values.length()) { | 468 | while (++range < encoding_values.length()) { |
| 552 | uint bit_length = GetBitLength(num_values, range); | 469 | const uint bit_length = GetBitLength(num_values, range); |
| 553 | if (bit_length > color_data_bits) { | 470 | if (bit_length > color_data_bits) { |
| 554 | break; | 471 | break; |
| 555 | } | 472 | } |
| @@ -560,48 +477,49 @@ void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits) { | |||
| 560 | if (out_index >= num_values) { | 477 | if (out_index >= num_values) { |
| 561 | break; | 478 | break; |
| 562 | } | 479 | } |
| 563 | EncodingData val = result_vector[itr]; | 480 | const EncodingData val = GetEncodingFromVector(itr); |
| 564 | uint bitlen = val.num_bits; | 481 | const uint encoding = Encoding(val); |
| 565 | uint bitval = val.bit_value; | 482 | const uint bitlen = NumBits(val); |
| 483 | const uint bitval = BitValue(val); | ||
| 566 | uint A = 0, B = 0, C = 0, D = 0; | 484 | uint A = 0, B = 0, C = 0, D = 0; |
| 567 | A = ReplicateBitTo9((bitval & 1)); | 485 | A = ReplicateBitTo9((bitval & 1)); |
| 568 | switch (val.encoding) { | 486 | switch (encoding) { |
| 569 | case JUST_BITS: | 487 | case JUST_BITS: |
| 570 | color_values[out_index++] = FastReplicateTo8(bitval, bitlen); | 488 | color_values[++out_index] = FastReplicateTo8(bitval, bitlen); |
| 571 | break; | 489 | break; |
| 572 | case TRIT: { | 490 | case TRIT: { |
| 573 | D = val.quint_trit_value; | 491 | D = QuintTritValue(val); |
| 574 | switch (bitlen) { | 492 | switch (bitlen) { |
| 575 | case 1: | 493 | case 1: |
| 576 | C = 204; | 494 | C = 204; |
| 577 | break; | 495 | break; |
| 578 | case 2: { | 496 | case 2: { |
| 579 | C = 93; | 497 | C = 93; |
| 580 | uint b = (bitval >> 1) & 1; | 498 | const uint b = (bitval >> 1) & 1; |
| 581 | B = (b << 8) | (b << 4) | (b << 2) | (b << 1); | 499 | B = (b << 8) | (b << 4) | (b << 2) | (b << 1); |
| 582 | break; | 500 | break; |
| 583 | } | 501 | } |
| 584 | case 3: { | 502 | case 3: { |
| 585 | C = 44; | 503 | C = 44; |
| 586 | uint cb = (bitval >> 1) & 3; | 504 | const uint cb = (bitval >> 1) & 3; |
| 587 | B = (cb << 7) | (cb << 2) | cb; | 505 | B = (cb << 7) | (cb << 2) | cb; |
| 588 | break; | 506 | break; |
| 589 | } | 507 | } |
| 590 | case 4: { | 508 | case 4: { |
| 591 | C = 22; | 509 | C = 22; |
| 592 | uint dcb = (bitval >> 1) & 7; | 510 | const uint dcb = (bitval >> 1) & 7; |
| 593 | B = (dcb << 6) | dcb; | 511 | B = (dcb << 6) | dcb; |
| 594 | break; | 512 | break; |
| 595 | } | 513 | } |
| 596 | case 5: { | 514 | case 5: { |
| 597 | C = 11; | 515 | C = 11; |
| 598 | uint edcb = (bitval >> 1) & 0xF; | 516 | const uint edcb = (bitval >> 1) & 0xF; |
| 599 | B = (edcb << 5) | (edcb >> 2); | 517 | B = (edcb << 5) | (edcb >> 2); |
| 600 | break; | 518 | break; |
| 601 | } | 519 | } |
| 602 | case 6: { | 520 | case 6: { |
| 603 | C = 5; | 521 | C = 5; |
| 604 | uint fedcb = (bitval >> 1) & 0x1F; | 522 | const uint fedcb = (bitval >> 1) & 0x1F; |
| 605 | B = (fedcb << 4) | (fedcb >> 4); | 523 | B = (fedcb << 4) | (fedcb >> 4); |
| 606 | break; | 524 | break; |
| 607 | } | 525 | } |
| @@ -609,32 +527,32 @@ void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits) { | |||
| 609 | break; | 527 | break; |
| 610 | } | 528 | } |
| 611 | case QUINT: { | 529 | case QUINT: { |
| 612 | D = val.quint_trit_value; | 530 | D = QuintTritValue(val); |
| 613 | switch (bitlen) { | 531 | switch (bitlen) { |
| 614 | case 1: | 532 | case 1: |
| 615 | C = 113; | 533 | C = 113; |
| 616 | break; | 534 | break; |
| 617 | case 2: { | 535 | case 2: { |
| 618 | C = 54; | 536 | C = 54; |
| 619 | uint b = (bitval >> 1) & 1; | 537 | const uint b = (bitval >> 1) & 1; |
| 620 | B = (b << 8) | (b << 3) | (b << 2); | 538 | B = (b << 8) | (b << 3) | (b << 2); |
| 621 | break; | 539 | break; |
| 622 | } | 540 | } |
| 623 | case 3: { | 541 | case 3: { |
| 624 | C = 26; | 542 | C = 26; |
| 625 | uint cb = (bitval >> 1) & 3; | 543 | const uint cb = (bitval >> 1) & 3; |
| 626 | B = (cb << 7) | (cb << 1) | (cb >> 1); | 544 | B = (cb << 7) | (cb << 1) | (cb >> 1); |
| 627 | break; | 545 | break; |
| 628 | } | 546 | } |
| 629 | case 4: { | 547 | case 4: { |
| 630 | C = 13; | 548 | C = 13; |
| 631 | uint dcb = (bitval >> 1) & 7; | 549 | const uint dcb = (bitval >> 1) & 7; |
| 632 | B = (dcb << 6) | (dcb >> 1); | 550 | B = (dcb << 6) | (dcb >> 1); |
| 633 | break; | 551 | break; |
| 634 | } | 552 | } |
| 635 | case 5: { | 553 | case 5: { |
| 636 | C = 6; | 554 | C = 6; |
| 637 | uint edcb = (bitval >> 1) & 0xF; | 555 | const uint edcb = (bitval >> 1) & 0xF; |
| 638 | B = (edcb << 5) | (edcb >> 3); | 556 | B = (edcb << 5) | (edcb >> 3); |
| 639 | break; | 557 | break; |
| 640 | } | 558 | } |
| @@ -642,11 +560,11 @@ void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits) { | |||
| 642 | break; | 560 | break; |
| 643 | } | 561 | } |
| 644 | } | 562 | } |
| 645 | if (val.encoding != JUST_BITS) { | 563 | if (encoding != JUST_BITS) { |
| 646 | uint T = (D * C) + B; | 564 | uint T = (D * C) + B; |
| 647 | T ^= A; | 565 | T ^= A; |
| 648 | T = (A & 0x80) | (T >> 2); | 566 | T = (A & 0x80) | (T >> 2); |
| 649 | color_values[out_index++] = T; | 567 | color_values[++out_index] = T; |
| 650 | } | 568 | } |
| 651 | } | 569 | } |
| 652 | } | 570 | } |
| @@ -664,139 +582,136 @@ ivec2 BitTransferSigned(int a, int b) { | |||
| 664 | } | 582 | } |
| 665 | 583 | ||
| 666 | uvec4 ClampByte(ivec4 color) { | 584 | uvec4 ClampByte(ivec4 color) { |
| 667 | for (uint i = 0; i < 4; ++i) { | 585 | return uvec4(clamp(color, 0, 255)); |
| 668 | color[i] = (color[i] < 0) ? 0 : ((color[i] > 255) ? 255 : color[i]); | ||
| 669 | } | ||
| 670 | return uvec4(color); | ||
| 671 | } | 586 | } |
| 672 | 587 | ||
| 673 | ivec4 BlueContract(int a, int r, int g, int b) { | 588 | ivec4 BlueContract(int a, int r, int g, int b) { |
| 674 | return ivec4(a, (r + b) >> 1, (g + b) >> 1, b); | 589 | return ivec4(a, (r + b) >> 1, (g + b) >> 1, b); |
| 675 | } | 590 | } |
| 676 | 591 | ||
| 677 | void ComputeEndpoints(out uvec4 ep1, out uvec4 ep2, uint color_endpoint_mode) { | 592 | void ComputeEndpoints(out uvec4 ep1, out uvec4 ep2, uint color_endpoint_mode, uint color_values[32], |
| 593 | inout uint colvals_index) { | ||
| 678 | #define READ_UINT_VALUES(N) \ | 594 | #define READ_UINT_VALUES(N) \ |
| 679 | uint v[N]; \ | 595 | uvec4 V[2]; \ |
| 680 | for (uint i = 0; i < N; i++) { \ | 596 | for (uint i = 0; i < N; i++) { \ |
| 681 | v[i] = color_values[colvals_index++]; \ | 597 | V[i / 4][i % 4] = color_values[++colvals_index]; \ |
| 682 | } | 598 | } |
| 683 | |||
| 684 | #define READ_INT_VALUES(N) \ | 599 | #define READ_INT_VALUES(N) \ |
| 685 | int v[N]; \ | 600 | ivec4 V[2]; \ |
| 686 | for (uint i = 0; i < N; i++) { \ | 601 | for (uint i = 0; i < N; i++) { \ |
| 687 | v[i] = int(color_values[colvals_index++]); \ | 602 | V[i / 4][i % 4] = int(color_values[++colvals_index]); \ |
| 688 | } | 603 | } |
| 689 | 604 | ||
| 690 | switch (color_endpoint_mode) { | 605 | switch (color_endpoint_mode) { |
| 691 | case 0: { | 606 | case 0: { |
| 692 | READ_UINT_VALUES(2) | 607 | READ_UINT_VALUES(2) |
| 693 | ep1 = uvec4(0xFF, v[0], v[0], v[0]); | 608 | ep1 = uvec4(0xFF, V[0].x, V[0].x, V[0].x); |
| 694 | ep2 = uvec4(0xFF, v[1], v[1], v[1]); | 609 | ep2 = uvec4(0xFF, V[0].y, V[0].y, V[0].y); |
| 695 | break; | 610 | break; |
| 696 | } | 611 | } |
| 697 | case 1: { | 612 | case 1: { |
| 698 | READ_UINT_VALUES(2) | 613 | READ_UINT_VALUES(2) |
| 699 | uint L0 = (v[0] >> 2) | (v[1] & 0xC0); | 614 | const uint L0 = (V[0].x >> 2) | (V[0].y & 0xC0); |
| 700 | uint L1 = min(L0 + (v[1] & 0x3F), 0xFFU); | 615 | const uint L1 = min(L0 + (V[0].y & 0x3F), 0xFFU); |
| 701 | ep1 = uvec4(0xFF, L0, L0, L0); | 616 | ep1 = uvec4(0xFF, L0, L0, L0); |
| 702 | ep2 = uvec4(0xFF, L1, L1, L1); | 617 | ep2 = uvec4(0xFF, L1, L1, L1); |
| 703 | break; | 618 | break; |
| 704 | } | 619 | } |
| 705 | case 4: { | 620 | case 4: { |
| 706 | READ_UINT_VALUES(4) | 621 | READ_UINT_VALUES(4) |
| 707 | ep1 = uvec4(v[2], v[0], v[0], v[0]); | 622 | ep1 = uvec4(V[0].z, V[0].x, V[0].x, V[0].x); |
| 708 | ep2 = uvec4(v[3], v[1], v[1], v[1]); | 623 | ep2 = uvec4(V[0].w, V[0].y, V[0].y, V[0].y); |
| 709 | break; | 624 | break; |
| 710 | } | 625 | } |
| 711 | case 5: { | 626 | case 5: { |
| 712 | READ_INT_VALUES(4) | 627 | READ_INT_VALUES(4) |
| 713 | ivec2 transferred = BitTransferSigned(v[1], v[0]); | 628 | ivec2 transferred = BitTransferSigned(V[0].y, V[0].x); |
| 714 | v[1] = transferred.x; | 629 | V[0].y = transferred.x; |
| 715 | v[0] = transferred.y; | 630 | V[0].x = transferred.y; |
| 716 | transferred = BitTransferSigned(v[3], v[2]); | 631 | transferred = BitTransferSigned(V[0].w, V[0].z); |
| 717 | v[3] = transferred.x; | 632 | V[0].w = transferred.x; |
| 718 | v[2] = transferred.y; | 633 | V[0].z = transferred.y; |
| 719 | ep1 = ClampByte(ivec4(v[2], v[0], v[0], v[0])); | 634 | ep1 = ClampByte(ivec4(V[0].z, V[0].x, V[0].x, V[0].x)); |
| 720 | ep2 = ClampByte(ivec4(v[2] + v[3], v[0] + v[1], v[0] + v[1], v[0] + v[1])); | 635 | ep2 = ClampByte(ivec4(V[0].z + V[0].w, V[0].x + V[0].y, V[0].x + V[0].y, V[0].x + V[0].y)); |
| 721 | break; | 636 | break; |
| 722 | } | 637 | } |
| 723 | case 6: { | 638 | case 6: { |
| 724 | READ_UINT_VALUES(4) | 639 | READ_UINT_VALUES(4) |
| 725 | ep1 = uvec4(0xFF, (v[0] * v[3]) >> 8, (v[1] * v[3]) >> 8, (v[2] * v[3]) >> 8); | 640 | ep1 = uvec4(0xFF, (V[0].x * V[0].w) >> 8, (V[0].y * V[0].w) >> 8, (V[0].z * V[0].w) >> 8); |
| 726 | ep2 = uvec4(0xFF, v[0], v[1], v[2]); | 641 | ep2 = uvec4(0xFF, V[0].x, V[0].y, V[0].z); |
| 727 | break; | 642 | break; |
| 728 | } | 643 | } |
| 729 | case 8: { | 644 | case 8: { |
| 730 | READ_UINT_VALUES(6) | 645 | READ_UINT_VALUES(6) |
| 731 | if ((v[1] + v[3] + v[5]) >= (v[0] + v[2] + v[4])) { | 646 | if ((V[0].y + V[0].w + V[1].y) >= (V[0].x + V[0].z + V[1].x)) { |
| 732 | ep1 = uvec4(0xFF, v[0], v[2], v[4]); | 647 | ep1 = uvec4(0xFF, V[0].x, V[0].z, V[1].x); |
| 733 | ep2 = uvec4(0xFF, v[1], v[3], v[5]); | 648 | ep2 = uvec4(0xFF, V[0].y, V[0].w, V[1].y); |
| 734 | } else { | 649 | } else { |
| 735 | ep1 = uvec4(BlueContract(0xFF, int(v[1]), int(v[3]), int(v[5]))); | 650 | ep1 = uvec4(BlueContract(0xFF, int(V[0].y), int(V[0].w), int(V[1].y))); |
| 736 | ep2 = uvec4(BlueContract(0xFF, int(v[0]), int(v[2]), int(v[4]))); | 651 | ep2 = uvec4(BlueContract(0xFF, int(V[0].x), int(V[0].z), int(V[1].x))); |
| 737 | } | 652 | } |
| 738 | break; | 653 | break; |
| 739 | } | 654 | } |
| 740 | case 9: { | 655 | case 9: { |
| 741 | READ_INT_VALUES(6) | 656 | READ_INT_VALUES(6) |
| 742 | ivec2 transferred = BitTransferSigned(v[1], v[0]); | 657 | ivec2 transferred = BitTransferSigned(V[0].y, V[0].x); |
| 743 | v[1] = transferred.x; | 658 | V[0].y = transferred.x; |
| 744 | v[0] = transferred.y; | 659 | V[0].x = transferred.y; |
| 745 | transferred = BitTransferSigned(v[3], v[2]); | 660 | transferred = BitTransferSigned(V[0].w, V[0].z); |
| 746 | v[3] = transferred.x; | 661 | V[0].w = transferred.x; |
| 747 | v[2] = transferred.y; | 662 | V[0].z = transferred.y; |
| 748 | transferred = BitTransferSigned(v[5], v[4]); | 663 | transferred = BitTransferSigned(V[1].y, V[1].x); |
| 749 | v[5] = transferred.x; | 664 | V[1].y = transferred.x; |
| 750 | v[4] = transferred.y; | 665 | V[1].x = transferred.y; |
| 751 | if ((v[1] + v[3] + v[5]) >= 0) { | 666 | if ((V[0].y + V[0].w + V[1].y) >= 0) { |
| 752 | ep1 = ClampByte(ivec4(0xFF, v[0], v[2], v[4])); | 667 | ep1 = ClampByte(ivec4(0xFF, V[0].x, V[0].z, V[1].x)); |
| 753 | ep2 = ClampByte(ivec4(0xFF, v[0] + v[1], v[2] + v[3], v[4] + v[5])); | 668 | ep2 = ClampByte(ivec4(0xFF, V[0].x + V[0].y, V[0].z + V[0].w, V[1].x + V[1].y)); |
| 754 | } else { | 669 | } else { |
| 755 | ep1 = ClampByte(BlueContract(0xFF, v[0] + v[1], v[2] + v[3], v[4] + v[5])); | 670 | ep1 = ClampByte(BlueContract(0xFF, V[0].x + V[0].y, V[0].z + V[0].w, V[1].x + V[1].y)); |
| 756 | ep2 = ClampByte(BlueContract(0xFF, v[0], v[2], v[4])); | 671 | ep2 = ClampByte(BlueContract(0xFF, V[0].x, V[0].z, V[1].x)); |
| 757 | } | 672 | } |
| 758 | break; | 673 | break; |
| 759 | } | 674 | } |
| 760 | case 10: { | 675 | case 10: { |
| 761 | READ_UINT_VALUES(6) | 676 | READ_UINT_VALUES(6) |
| 762 | ep1 = uvec4(v[4], (v[0] * v[3]) >> 8, (v[1] * v[3]) >> 8, (v[2] * v[3]) >> 8); | 677 | ep1 = uvec4(V[1].x, (V[0].x * V[0].w) >> 8, (V[0].y * V[0].w) >> 8, (V[0].z * V[0].w) >> 8); |
| 763 | ep2 = uvec4(v[5], v[0], v[1], v[2]); | 678 | ep2 = uvec4(V[1].y, V[0].x, V[0].y, V[0].z); |
| 764 | break; | 679 | break; |
| 765 | } | 680 | } |
| 766 | case 12: { | 681 | case 12: { |
| 767 | READ_UINT_VALUES(8) | 682 | READ_UINT_VALUES(8) |
| 768 | if ((v[1] + v[3] + v[5]) >= (v[0] + v[2] + v[4])) { | 683 | if ((V[0].y + V[0].w + V[1].y) >= (V[0].x + V[0].z + V[1].x)) { |
| 769 | ep1 = uvec4(v[6], v[0], v[2], v[4]); | 684 | ep1 = uvec4(V[1].z, V[0].x, V[0].z, V[1].x); |
| 770 | ep2 = uvec4(v[7], v[1], v[3], v[5]); | 685 | ep2 = uvec4(V[1].w, V[0].y, V[0].w, V[1].y); |
| 771 | } else { | 686 | } else { |
| 772 | ep1 = uvec4(BlueContract(int(v[7]), int(v[1]), int(v[3]), int(v[5]))); | 687 | ep1 = uvec4(BlueContract(int(V[1].w), int(V[0].y), int(V[0].w), int(V[1].y))); |
| 773 | ep2 = uvec4(BlueContract(int(v[6]), int(v[0]), int(v[2]), int(v[4]))); | 688 | ep2 = uvec4(BlueContract(int(V[1].z), int(V[0].x), int(V[0].z), int(V[1].x))); |
| 774 | } | 689 | } |
| 775 | break; | 690 | break; |
| 776 | } | 691 | } |
| 777 | case 13: { | 692 | case 13: { |
| 778 | READ_INT_VALUES(8) | 693 | READ_INT_VALUES(8) |
| 779 | ivec2 transferred = BitTransferSigned(v[1], v[0]); | 694 | ivec2 transferred = BitTransferSigned(V[0].y, V[0].x); |
| 780 | v[1] = transferred.x; | 695 | V[0].y = transferred.x; |
| 781 | v[0] = transferred.y; | 696 | V[0].x = transferred.y; |
| 782 | transferred = BitTransferSigned(v[3], v[2]); | 697 | transferred = BitTransferSigned(V[0].w, V[0].z); |
| 783 | v[3] = transferred.x; | 698 | V[0].w = transferred.x; |
| 784 | v[2] = transferred.y; | 699 | V[0].z = transferred.y; |
| 785 | 700 | ||
| 786 | transferred = BitTransferSigned(v[5], v[4]); | 701 | transferred = BitTransferSigned(V[1].y, V[1].x); |
| 787 | v[5] = transferred.x; | 702 | V[1].y = transferred.x; |
| 788 | v[4] = transferred.y; | 703 | V[1].x = transferred.y; |
| 789 | 704 | ||
| 790 | transferred = BitTransferSigned(v[7], v[6]); | 705 | transferred = BitTransferSigned(V[1].w, V[1].z); |
| 791 | v[7] = transferred.x; | 706 | V[1].w = transferred.x; |
| 792 | v[6] = transferred.y; | 707 | V[1].z = transferred.y; |
| 793 | 708 | ||
| 794 | if ((v[1] + v[3] + v[5]) >= 0) { | 709 | if ((V[0].y + V[0].w + V[1].y) >= 0) { |
| 795 | ep1 = ClampByte(ivec4(v[6], v[0], v[2], v[4])); | 710 | ep1 = ClampByte(ivec4(V[1].z, V[0].x, V[0].z, V[1].x)); |
| 796 | ep2 = ClampByte(ivec4(v[7] + v[6], v[0] + v[1], v[2] + v[3], v[4] + v[5])); | 711 | ep2 = ClampByte(ivec4(V[1].w + V[1].z, V[0].x + V[0].y, V[0].z + V[0].w, V[1].x + V[1].y)); |
| 797 | } else { | 712 | } else { |
| 798 | ep1 = ClampByte(BlueContract(v[6] + v[7], v[0] + v[1], v[2] + v[3], v[4] + v[5])); | 713 | ep1 = ClampByte(BlueContract(V[1].z + V[1].w, V[0].x + V[0].y, V[0].z + V[0].w, V[1].x + V[1].y)); |
| 799 | ep2 = ClampByte(BlueContract(v[6], v[0], v[2], v[4])); | 714 | ep2 = ClampByte(BlueContract(V[1].z, V[0].x, V[0].z, V[1].x)); |
| 800 | } | 715 | } |
| 801 | break; | 716 | break; |
| 802 | } | 717 | } |
| @@ -812,36 +727,34 @@ void ComputeEndpoints(out uvec4 ep1, out uvec4 ep2, uint color_endpoint_mode) { | |||
| 812 | } | 727 | } |
| 813 | 728 | ||
| 814 | uint UnquantizeTexelWeight(EncodingData val) { | 729 | uint UnquantizeTexelWeight(EncodingData val) { |
| 815 | uint bitval = val.bit_value; | 730 | const uint encoding = Encoding(val); |
| 816 | uint bitlen = val.num_bits; | 731 | const uint bitlen = NumBits(val); |
| 817 | uint A = ReplicateBitTo7((bitval & 1)); | 732 | const uint bitval = BitValue(val); |
| 733 | const uint A = ReplicateBitTo7((bitval & 1)); | ||
| 818 | uint B = 0, C = 0, D = 0; | 734 | uint B = 0, C = 0, D = 0; |
| 819 | uint result = 0; | 735 | uint result = 0; |
| 820 | switch (val.encoding) { | 736 | const uint bitlen_0_results[5] = {0, 16, 32, 48, 64}; |
| 737 | switch (encoding) { | ||
| 821 | case JUST_BITS: | 738 | case JUST_BITS: |
| 822 | result = FastReplicateTo6(bitval, bitlen); | 739 | return FastReplicateTo6(bitval, bitlen); |
| 823 | break; | ||
| 824 | case TRIT: { | 740 | case TRIT: { |
| 825 | D = val.quint_trit_value; | 741 | D = QuintTritValue(val); |
| 826 | switch (bitlen) { | 742 | switch (bitlen) { |
| 827 | case 0: { | 743 | case 0: |
| 828 | uint results[3] = {0, 32, 63}; | 744 | return bitlen_0_results[D * 2]; |
| 829 | result = results[D]; | ||
| 830 | break; | ||
| 831 | } | ||
| 832 | case 1: { | 745 | case 1: { |
| 833 | C = 50; | 746 | C = 50; |
| 834 | break; | 747 | break; |
| 835 | } | 748 | } |
| 836 | case 2: { | 749 | case 2: { |
| 837 | C = 23; | 750 | C = 23; |
| 838 | uint b = (bitval >> 1) & 1; | 751 | const uint b = (bitval >> 1) & 1; |
| 839 | B = (b << 6) | (b << 2) | b; | 752 | B = (b << 6) | (b << 2) | b; |
| 840 | break; | 753 | break; |
| 841 | } | 754 | } |
| 842 | case 3: { | 755 | case 3: { |
| 843 | C = 11; | 756 | C = 11; |
| 844 | uint cb = (bitval >> 1) & 3; | 757 | const uint cb = (bitval >> 1) & 3; |
| 845 | B = (cb << 5) | cb; | 758 | B = (cb << 5) | cb; |
| 846 | break; | 759 | break; |
| 847 | } | 760 | } |
| @@ -851,20 +764,17 @@ uint UnquantizeTexelWeight(EncodingData val) { | |||
| 851 | break; | 764 | break; |
| 852 | } | 765 | } |
| 853 | case QUINT: { | 766 | case QUINT: { |
| 854 | D = val.quint_trit_value; | 767 | D = QuintTritValue(val); |
| 855 | switch (bitlen) { | 768 | switch (bitlen) { |
| 856 | case 0: { | 769 | case 0: |
| 857 | uint results[5] = {0, 16, 32, 47, 63}; | 770 | return bitlen_0_results[D]; |
| 858 | result = results[D]; | ||
| 859 | break; | ||
| 860 | } | ||
| 861 | case 1: { | 771 | case 1: { |
| 862 | C = 28; | 772 | C = 28; |
| 863 | break; | 773 | break; |
| 864 | } | 774 | } |
| 865 | case 2: { | 775 | case 2: { |
| 866 | C = 13; | 776 | C = 13; |
| 867 | uint b = (bitval >> 1) & 1; | 777 | const uint b = (bitval >> 1) & 1; |
| 868 | B = (b << 6) | (b << 1); | 778 | B = (b << 6) | (b << 1); |
| 869 | break; | 779 | break; |
| 870 | } | 780 | } |
| @@ -872,7 +782,7 @@ uint UnquantizeTexelWeight(EncodingData val) { | |||
| 872 | break; | 782 | break; |
| 873 | } | 783 | } |
| 874 | } | 784 | } |
| 875 | if (val.encoding != JUST_BITS && bitlen > 0) { | 785 | if (encoding != JUST_BITS && bitlen > 0) { |
| 876 | result = D * C + B; | 786 | result = D * C + B; |
| 877 | result ^= A; | 787 | result ^= A; |
| 878 | result = (A & 0x20) | (result >> 2); | 788 | result = (A & 0x20) | (result >> 2); |
| @@ -883,61 +793,77 @@ uint UnquantizeTexelWeight(EncodingData val) { | |||
| 883 | return result; | 793 | return result; |
| 884 | } | 794 | } |
| 885 | 795 | ||
| 886 | void UnquantizeTexelWeights(bool dual_plane, uvec2 size) { | 796 | void UnquantizeTexelWeights(uvec2 size, bool is_dual_plane) { |
| 887 | uint weight_idx = 0; | 797 | const uint num_planes = is_dual_plane ? 2 : 1; |
| 888 | uint unquantized[2][144]; | 798 | const uint area = size.x * size.y; |
| 889 | uint area = size.x * size.y; | 799 | const uint loop_count = min(result_index, area * num_planes); |
| 890 | for (uint itr = 0; itr < texel_vector_index; itr++) { | 800 | for (uint itr = 0; itr < loop_count; ++itr) { |
| 891 | unquantized[0][weight_idx] = UnquantizeTexelWeight(texel_vector[itr]); | 801 | result_vector[itr] = |
| 892 | if (dual_plane) { | 802 | UnquantizeTexelWeight(GetEncodingFromVector(itr)); |
| 893 | ++itr; | ||
| 894 | unquantized[1][weight_idx] = UnquantizeTexelWeight(texel_vector[itr]); | ||
| 895 | if (itr == texel_vector_index) { | ||
| 896 | break; | ||
| 897 | } | ||
| 898 | } | ||
| 899 | if (++weight_idx >= (area)) | ||
| 900 | break; | ||
| 901 | } | 803 | } |
| 804 | } | ||
| 805 | |||
| 806 | uint GetUnquantizedTexelWieght(uint offset_base, uint plane, bool is_dual_plane) { | ||
| 807 | const uint offset = is_dual_plane ? 2 * offset_base + plane : offset_base; | ||
| 808 | return result_vector[offset]; | ||
| 809 | } | ||
| 902 | 810 | ||
| 811 | uvec4 GetUnquantizedWeightVector(uint t, uint s, uvec2 size, uint plane_index, bool is_dual_plane) { | ||
| 903 | const uint Ds = uint((block_dims.x * 0.5f + 1024) / (block_dims.x - 1)); | 812 | const uint Ds = uint((block_dims.x * 0.5f + 1024) / (block_dims.x - 1)); |
| 904 | const uint Dt = uint((block_dims.y * 0.5f + 1024) / (block_dims.y - 1)); | 813 | const uint Dt = uint((block_dims.y * 0.5f + 1024) / (block_dims.y - 1)); |
| 905 | const uint k_plane_scale = dual_plane ? 2 : 1; | 814 | const uint area = size.x * size.y; |
| 906 | for (uint plane = 0; plane < k_plane_scale; plane++) { | 815 | |
| 907 | for (uint t = 0; t < block_dims.y; t++) { | 816 | const uint cs = Ds * s; |
| 908 | for (uint s = 0; s < block_dims.x; s++) { | 817 | const uint ct = Dt * t; |
| 909 | uint cs = Ds * s; | 818 | const uint gs = (cs * (size.x - 1) + 32) >> 6; |
| 910 | uint ct = Dt * t; | 819 | const uint gt = (ct * (size.y - 1) + 32) >> 6; |
| 911 | uint gs = (cs * (size.x - 1) + 32) >> 6; | 820 | const uint js = gs >> 4; |
| 912 | uint gt = (ct * (size.y - 1) + 32) >> 6; | 821 | const uint fs = gs & 0xF; |
| 913 | uint js = gs >> 4; | 822 | const uint jt = gt >> 4; |
| 914 | uint fs = gs & 0xF; | 823 | const uint ft = gt & 0x0F; |
| 915 | uint jt = gt >> 4; | 824 | const uint w11 = (fs * ft + 8) >> 4; |
| 916 | uint ft = gt & 0x0F; | 825 | const uint w10 = ft - w11; |
| 917 | uint w11 = (fs * ft + 8) >> 4; | 826 | const uint w01 = fs - w11; |
| 918 | uint w10 = ft - w11; | 827 | const uint w00 = 16 - fs - ft + w11; |
| 919 | uint w01 = fs - w11; | 828 | const uvec4 w = uvec4(w00, w01, w10, w11); |
| 920 | uint w00 = 16 - fs - ft + w11; | 829 | const uint v0 = jt * size.x + js; |
| 921 | uvec4 w = uvec4(w00, w01, w10, w11); | 830 | |
| 922 | uint v0 = jt * size.x + js; | 831 | uvec4 p0 = uvec4(0); |
| 923 | 832 | uvec4 p1 = uvec4(0); | |
| 924 | uvec4 p = uvec4(0); | 833 | |
| 925 | if (v0 < area) { | 834 | if (v0 < area) { |
| 926 | p.x = unquantized[plane][v0]; | 835 | const uint offset_base = v0; |
| 927 | } | 836 | p0.x = GetUnquantizedTexelWieght(offset_base, 0, is_dual_plane); |
| 928 | if ((v0 + 1) < (area)) { | 837 | p1.x = GetUnquantizedTexelWieght(offset_base, 1, is_dual_plane); |
| 929 | p.y = unquantized[plane][v0 + 1]; | 838 | } |
| 930 | } | 839 | if ((v0 + 1) < (area)) { |
| 931 | if ((v0 + size.x) < (area)) { | 840 | const uint offset_base = v0 + 1; |
| 932 | p.z = unquantized[plane][(v0 + size.x)]; | 841 | p0.y = GetUnquantizedTexelWieght(offset_base, 0, is_dual_plane); |
| 933 | } | 842 | p1.y = GetUnquantizedTexelWieght(offset_base, 1, is_dual_plane); |
| 934 | if ((v0 + size.x + 1) < (area)) { | 843 | } |
| 935 | p.w = unquantized[plane][(v0 + size.x + 1)]; | 844 | if ((v0 + size.x) < (area)) { |
| 936 | } | 845 | const uint offset_base = v0 + size.x; |
| 937 | unquantized_texel_weights[plane][t * block_dims.x + s] = (uint(dot(p, w)) + 8) >> 4; | 846 | p0.z = GetUnquantizedTexelWieght(offset_base, 0, is_dual_plane); |
| 938 | } | 847 | p1.z = GetUnquantizedTexelWieght(offset_base, 1, is_dual_plane); |
| 848 | } | ||
| 849 | if ((v0 + size.x + 1) < (area)) { | ||
| 850 | const uint offset_base = v0 + size.x + 1; | ||
| 851 | p0.w = GetUnquantizedTexelWieght(offset_base, 0, is_dual_plane); | ||
| 852 | p1.w = GetUnquantizedTexelWieght(offset_base, 1, is_dual_plane); | ||
| 853 | } | ||
| 854 | |||
| 855 | const uint primary_weight = (uint(dot(p0, w)) + 8) >> 4; | ||
| 856 | |||
| 857 | uvec4 weight_vec = uvec4(primary_weight); | ||
| 858 | |||
| 859 | if (is_dual_plane) { | ||
| 860 | const uint secondary_weight = (uint(dot(p1, w)) + 8) >> 4; | ||
| 861 | for (uint c = 0; c < 4; c++) { | ||
| 862 | const bool is_secondary = ((plane_index + 1u) & 3u) == c; | ||
| 863 | weight_vec[c] = is_secondary ? secondary_weight : primary_weight; | ||
| 939 | } | 864 | } |
| 940 | } | 865 | } |
| 866 | return weight_vec; | ||
| 941 | } | 867 | } |
| 942 | 868 | ||
| 943 | int FindLayout(uint mode) { | 869 | int FindLayout(uint mode) { |
| @@ -971,80 +897,96 @@ int FindLayout(uint mode) { | |||
| 971 | return 5; | 897 | return 5; |
| 972 | } | 898 | } |
| 973 | 899 | ||
| 974 | TexelWeightParams DecodeBlockInfo() { | 900 | |
| 975 | TexelWeightParams params = TexelWeightParams(uvec2(0), 0, false, false, false, false); | 901 | void FillError(ivec3 coord) { |
| 976 | uint mode = StreamBits(11); | 902 | for (uint j = 0; j < block_dims.y; j++) { |
| 903 | for (uint i = 0; i < block_dims.x; i++) { | ||
| 904 | imageStore(dest_image, coord + ivec3(i, j, 0), vec4(0.0, 0.0, 0.0, 0.0)); | ||
| 905 | } | ||
| 906 | } | ||
| 907 | } | ||
| 908 | |||
| 909 | void FillVoidExtentLDR(ivec3 coord) { | ||
| 910 | SkipBits(52); | ||
| 911 | const uint r_u = StreamBits(16); | ||
| 912 | const uint g_u = StreamBits(16); | ||
| 913 | const uint b_u = StreamBits(16); | ||
| 914 | const uint a_u = StreamBits(16); | ||
| 915 | const float a = float(a_u) / 65535.0f; | ||
| 916 | const float r = float(r_u) / 65535.0f; | ||
| 917 | const float g = float(g_u) / 65535.0f; | ||
| 918 | const float b = float(b_u) / 65535.0f; | ||
| 919 | for (uint j = 0; j < block_dims.y; j++) { | ||
| 920 | for (uint i = 0; i < block_dims.x; i++) { | ||
| 921 | imageStore(dest_image, coord + ivec3(i, j, 0), vec4(r, g, b, a)); | ||
| 922 | } | ||
| 923 | } | ||
| 924 | } | ||
| 925 | |||
| 926 | bool IsError(uint mode) { | ||
| 977 | if ((mode & 0x1ff) == 0x1fc) { | 927 | if ((mode & 0x1ff) == 0x1fc) { |
| 978 | if ((mode & 0x200) != 0) { | 928 | if ((mode & 0x200) != 0) { |
| 979 | params.void_extent_hdr = true; | 929 | // params.void_extent_hdr = true; |
| 980 | } else { | 930 | return true; |
| 981 | params.void_extent_ldr = true; | ||
| 982 | } | 931 | } |
| 983 | if ((mode & 0x400) == 0 || StreamBits(1) == 0) { | 932 | if ((mode & 0x400) == 0 || StreamBits(1) == 0) { |
| 984 | params.error_state = true; | 933 | return true; |
| 985 | } | 934 | } |
| 986 | return params; | 935 | return false; |
| 987 | } | 936 | } |
| 988 | if ((mode & 0xf) == 0) { | 937 | if ((mode & 0xf) == 0) { |
| 989 | params.error_state = true; | 938 | return true; |
| 990 | return params; | ||
| 991 | } | 939 | } |
| 992 | if ((mode & 3) == 0 && (mode & 0x1c0) == 0x1c0) { | 940 | if ((mode & 3) == 0 && (mode & 0x1c0) == 0x1c0) { |
| 993 | params.error_state = true; | 941 | return true; |
| 994 | return params; | ||
| 995 | } | 942 | } |
| 943 | return false; | ||
| 944 | } | ||
| 945 | |||
| 946 | uvec2 DecodeBlockSize(uint mode) { | ||
| 996 | uint A, B; | 947 | uint A, B; |
| 997 | uint mode_layout = FindLayout(mode); | 948 | switch (FindLayout(mode)) { |
| 998 | switch (mode_layout) { | ||
| 999 | case 0: | 949 | case 0: |
| 1000 | A = (mode >> 5) & 0x3; | 950 | A = (mode >> 5) & 0x3; |
| 1001 | B = (mode >> 7) & 0x3; | 951 | B = (mode >> 7) & 0x3; |
| 1002 | params.size = uvec2(B + 4, A + 2); | 952 | return uvec2(B + 4, A + 2); |
| 1003 | break; | ||
| 1004 | case 1: | 953 | case 1: |
| 1005 | A = (mode >> 5) & 0x3; | 954 | A = (mode >> 5) & 0x3; |
| 1006 | B = (mode >> 7) & 0x3; | 955 | B = (mode >> 7) & 0x3; |
| 1007 | params.size = uvec2(B + 8, A + 2); | 956 | return uvec2(B + 8, A + 2); |
| 1008 | break; | ||
| 1009 | case 2: | 957 | case 2: |
| 1010 | A = (mode >> 5) & 0x3; | 958 | A = (mode >> 5) & 0x3; |
| 1011 | B = (mode >> 7) & 0x3; | 959 | B = (mode >> 7) & 0x3; |
| 1012 | params.size = uvec2(A + 2, B + 8); | 960 | return uvec2(A + 2, B + 8); |
| 1013 | break; | ||
| 1014 | case 3: | 961 | case 3: |
| 1015 | A = (mode >> 5) & 0x3; | 962 | A = (mode >> 5) & 0x3; |
| 1016 | B = (mode >> 7) & 0x1; | 963 | B = (mode >> 7) & 0x1; |
| 1017 | params.size = uvec2(A + 2, B + 6); | 964 | return uvec2(A + 2, B + 6); |
| 1018 | break; | ||
| 1019 | case 4: | 965 | case 4: |
| 1020 | A = (mode >> 5) & 0x3; | 966 | A = (mode >> 5) & 0x3; |
| 1021 | B = (mode >> 7) & 0x1; | 967 | B = (mode >> 7) & 0x1; |
| 1022 | params.size = uvec2(B + 2, A + 2); | 968 | return uvec2(B + 2, A + 2); |
| 1023 | break; | ||
| 1024 | case 5: | 969 | case 5: |
| 1025 | A = (mode >> 5) & 0x3; | 970 | A = (mode >> 5) & 0x3; |
| 1026 | params.size = uvec2(12, A + 2); | 971 | return uvec2(12, A + 2); |
| 1027 | break; | ||
| 1028 | case 6: | 972 | case 6: |
| 1029 | A = (mode >> 5) & 0x3; | 973 | A = (mode >> 5) & 0x3; |
| 1030 | params.size = uvec2(A + 2, 12); | 974 | return uvec2(A + 2, 12); |
| 1031 | break; | ||
| 1032 | case 7: | 975 | case 7: |
| 1033 | params.size = uvec2(6, 10); | 976 | return uvec2(6, 10); |
| 1034 | break; | ||
| 1035 | case 8: | 977 | case 8: |
| 1036 | params.size = uvec2(10, 6); | 978 | return uvec2(10, 6); |
| 1037 | break; | ||
| 1038 | case 9: | 979 | case 9: |
| 1039 | A = (mode >> 5) & 0x3; | 980 | A = (mode >> 5) & 0x3; |
| 1040 | B = (mode >> 9) & 0x3; | 981 | B = (mode >> 9) & 0x3; |
| 1041 | params.size = uvec2(A + 6, B + 6); | 982 | return uvec2(A + 6, B + 6); |
| 1042 | break; | ||
| 1043 | default: | 983 | default: |
| 1044 | params.error_state = true; | 984 | return uvec2(0); |
| 1045 | break; | ||
| 1046 | } | 985 | } |
| 1047 | params.dual_plane = (mode_layout != 9) && ((mode & 0x400) != 0); | 986 | } |
| 987 | |||
| 988 | uint DecodeMaxWeight(uint mode) { | ||
| 989 | const uint mode_layout = FindLayout(mode); | ||
| 1048 | uint weight_index = (mode & 0x10) != 0 ? 1 : 0; | 990 | uint weight_index = (mode & 0x10) != 0 ? 1 : 0; |
| 1049 | if (mode_layout < 5) { | 991 | if (mode_layout < 5) { |
| 1050 | weight_index |= (mode & 0x3) << 1; | 992 | weight_index |= (mode & 0x3) << 1; |
| @@ -1053,64 +995,34 @@ TexelWeightParams DecodeBlockInfo() { | |||
| 1053 | } | 995 | } |
| 1054 | weight_index -= 2; | 996 | weight_index -= 2; |
| 1055 | if ((mode_layout != 9) && ((mode & 0x200) != 0)) { | 997 | if ((mode_layout != 9) && ((mode & 0x200) != 0)) { |
| 1056 | const int max_weights[6] = int[6](7, 8, 9, 10, 11, 12); | 998 | weight_index += 6; |
| 1057 | params.max_weight = max_weights[weight_index]; | ||
| 1058 | } else { | ||
| 1059 | const int max_weights[6] = int[6](1, 2, 3, 4, 5, 6); | ||
| 1060 | params.max_weight = max_weights[weight_index]; | ||
| 1061 | } | ||
| 1062 | return params; | ||
| 1063 | } | ||
| 1064 | |||
| 1065 | void FillError(ivec3 coord) { | ||
| 1066 | for (uint j = 0; j < block_dims.y; j++) { | ||
| 1067 | for (uint i = 0; i < block_dims.x; i++) { | ||
| 1068 | imageStore(dest_image, coord + ivec3(i, j, 0), vec4(0.0, 0.0, 0.0, 0.0)); | ||
| 1069 | } | ||
| 1070 | } | ||
| 1071 | } | ||
| 1072 | |||
| 1073 | void FillVoidExtentLDR(ivec3 coord) { | ||
| 1074 | StreamBits(52); | ||
| 1075 | uint r_u = StreamBits(16); | ||
| 1076 | uint g_u = StreamBits(16); | ||
| 1077 | uint b_u = StreamBits(16); | ||
| 1078 | uint a_u = StreamBits(16); | ||
| 1079 | float a = float(a_u) / 65535.0f; | ||
| 1080 | float r = float(r_u) / 65535.0f; | ||
| 1081 | float g = float(g_u) / 65535.0f; | ||
| 1082 | float b = float(b_u) / 65535.0f; | ||
| 1083 | for (uint j = 0; j < block_dims.y; j++) { | ||
| 1084 | for (uint i = 0; i < block_dims.x; i++) { | ||
| 1085 | imageStore(dest_image, coord + ivec3(i, j, 0), vec4(r, g, b, a)); | ||
| 1086 | } | ||
| 1087 | } | 999 | } |
| 1000 | return weight_index + 1; | ||
| 1088 | } | 1001 | } |
| 1089 | 1002 | ||
| 1090 | void DecompressBlock(ivec3 coord) { | 1003 | void DecompressBlock(ivec3 coord) { |
| 1091 | TexelWeightParams params = DecodeBlockInfo(); | 1004 | uint mode = StreamBits(11); |
| 1092 | if (params.error_state) { | 1005 | if (IsError(mode)) { |
| 1093 | FillError(coord); | ||
| 1094 | return; | ||
| 1095 | } | ||
| 1096 | if (params.void_extent_hdr) { | ||
| 1097 | FillError(coord); | 1006 | FillError(coord); |
| 1098 | return; | 1007 | return; |
| 1099 | } | 1008 | } |
| 1100 | if (params.void_extent_ldr) { | 1009 | if ((mode & 0x1ff) == 0x1fc) { |
| 1010 | // params.void_extent_ldr = true; | ||
| 1101 | FillVoidExtentLDR(coord); | 1011 | FillVoidExtentLDR(coord); |
| 1102 | return; | 1012 | return; |
| 1103 | } | 1013 | } |
| 1104 | if ((params.size.x > block_dims.x) || (params.size.y > block_dims.y)) { | 1014 | const uvec2 size_params = DecodeBlockSize(mode); |
| 1015 | if ((size_params.x > block_dims.x) || (size_params.y > block_dims.y)) { | ||
| 1105 | FillError(coord); | 1016 | FillError(coord); |
| 1106 | return; | 1017 | return; |
| 1107 | } | 1018 | } |
| 1108 | uint num_partitions = StreamBits(2) + 1; | 1019 | const uint num_partitions = StreamBits(2) + 1; |
| 1109 | if (num_partitions > 4 || (num_partitions == 4 && params.dual_plane)) { | 1020 | const uint mode_layout = FindLayout(mode); |
| 1021 | const bool dual_plane = (mode_layout != 9) && ((mode & 0x400) != 0); | ||
| 1022 | if (num_partitions > 4 || (num_partitions == 4 && dual_plane)) { | ||
| 1110 | FillError(coord); | 1023 | FillError(coord); |
| 1111 | return; | 1024 | return; |
| 1112 | } | 1025 | } |
| 1113 | int plane_index = -1; | ||
| 1114 | uint partition_index = 1; | 1026 | uint partition_index = 1; |
| 1115 | uvec4 color_endpoint_mode = uvec4(0); | 1027 | uvec4 color_endpoint_mode = uvec4(0); |
| 1116 | uint ced_pointer = 0; | 1028 | uint ced_pointer = 0; |
| @@ -1122,8 +1034,9 @@ void DecompressBlock(ivec3 coord) { | |||
| 1122 | partition_index = StreamBits(10); | 1034 | partition_index = StreamBits(10); |
| 1123 | base_cem = StreamBits(6); | 1035 | base_cem = StreamBits(6); |
| 1124 | } | 1036 | } |
| 1125 | uint base_mode = base_cem & 3; | 1037 | const uint base_mode = base_cem & 3; |
| 1126 | uint weight_bits = GetPackedBitSize(params.size, params.dual_plane, params.max_weight); | 1038 | const uint max_weight = DecodeMaxWeight(mode); |
| 1039 | const uint weight_bits = GetPackedBitSize(size_params, dual_plane, max_weight); | ||
| 1127 | uint remaining_bits = 128 - weight_bits - total_bitsread; | 1040 | uint remaining_bits = 128 - weight_bits - total_bitsread; |
| 1128 | uint extra_cem_bits = 0; | 1041 | uint extra_cem_bits = 0; |
| 1129 | if (base_mode > 0) { | 1042 | if (base_mode > 0) { |
| @@ -1142,10 +1055,7 @@ void DecompressBlock(ivec3 coord) { | |||
| 1142 | } | 1055 | } |
| 1143 | } | 1056 | } |
| 1144 | remaining_bits -= extra_cem_bits; | 1057 | remaining_bits -= extra_cem_bits; |
| 1145 | uint plane_selector_bits = 0; | 1058 | const uint plane_selector_bits = dual_plane ? 2 : 0; |
| 1146 | if (params.dual_plane) { | ||
| 1147 | plane_selector_bits = 2; | ||
| 1148 | } | ||
| 1149 | remaining_bits -= plane_selector_bits; | 1059 | remaining_bits -= plane_selector_bits; |
| 1150 | if (remaining_bits > 128) { | 1060 | if (remaining_bits > 128) { |
| 1151 | // Bad data, more remaining bits than 4 bytes | 1061 | // Bad data, more remaining bits than 4 bytes |
| @@ -1153,17 +1063,17 @@ void DecompressBlock(ivec3 coord) { | |||
| 1153 | return; | 1063 | return; |
| 1154 | } | 1064 | } |
| 1155 | // Read color data... | 1065 | // Read color data... |
| 1156 | uint color_data_bits = remaining_bits; | 1066 | const uint color_data_bits = remaining_bits; |
| 1157 | while (remaining_bits > 0) { | 1067 | while (remaining_bits > 0) { |
| 1158 | int nb = int(min(remaining_bits, 32U)); | 1068 | const int nb = int(min(remaining_bits, 32U)); |
| 1159 | uint b = StreamBits(nb); | 1069 | const uint b = StreamBits(nb); |
| 1160 | color_endpoint_data[ced_pointer] = uint(bitfieldExtract(b, 0, nb)); | 1070 | color_endpoint_data[ced_pointer] = uint(bitfieldExtract(b, 0, nb)); |
| 1161 | ++ced_pointer; | 1071 | ++ced_pointer; |
| 1162 | remaining_bits -= nb; | 1072 | remaining_bits -= nb; |
| 1163 | } | 1073 | } |
| 1164 | plane_index = int(StreamBits(plane_selector_bits)); | 1074 | const uint plane_index = uint(StreamBits(plane_selector_bits)); |
| 1165 | if (base_mode > 0) { | 1075 | if (base_mode > 0) { |
| 1166 | uint extra_cem = StreamBits(extra_cem_bits); | 1076 | const uint extra_cem = StreamBits(extra_cem_bits); |
| 1167 | uint cem = (extra_cem << 6) | base_cem; | 1077 | uint cem = (extra_cem << 6) | base_cem; |
| 1168 | cem >>= 2; | 1078 | cem >>= 2; |
| 1169 | uvec4 C = uvec4(0); | 1079 | uvec4 C = uvec4(0); |
| @@ -1185,70 +1095,80 @@ void DecompressBlock(ivec3 coord) { | |||
| 1185 | color_endpoint_mode[i] |= M[i]; | 1095 | color_endpoint_mode[i] |= M[i]; |
| 1186 | } | 1096 | } |
| 1187 | } else if (num_partitions > 1) { | 1097 | } else if (num_partitions > 1) { |
| 1188 | uint cem = base_cem >> 2; | 1098 | const uint cem = base_cem >> 2; |
| 1189 | for (uint i = 0; i < num_partitions; i++) { | 1099 | for (uint i = 0; i < num_partitions; i++) { |
| 1190 | color_endpoint_mode[i] = cem; | 1100 | color_endpoint_mode[i] = cem; |
| 1191 | } | 1101 | } |
| 1192 | } | 1102 | } |
| 1193 | DecodeColorValues(color_endpoint_mode, num_partitions, color_data_bits); | ||
| 1194 | 1103 | ||
| 1195 | uvec4 endpoints[4][2]; | 1104 | uvec4 endpoints0[4]; |
| 1196 | for (uint i = 0; i < num_partitions; i++) { | 1105 | uvec4 endpoints1[4]; |
| 1197 | ComputeEndpoints(endpoints[i][0], endpoints[i][1], color_endpoint_mode[i]); | 1106 | { |
| 1107 | // This decode phase should at most push 32 elements into the vector | ||
| 1108 | result_vector_max_index = 32; | ||
| 1109 | uint color_values[32]; | ||
| 1110 | uint colvals_index = 0; | ||
| 1111 | DecodeColorValues(color_endpoint_mode, num_partitions, color_data_bits, color_values); | ||
| 1112 | for (uint i = 0; i < num_partitions; i++) { | ||
| 1113 | ComputeEndpoints(endpoints0[i], endpoints1[i], color_endpoint_mode[i], color_values, | ||
| 1114 | colvals_index); | ||
| 1115 | } | ||
| 1198 | } | 1116 | } |
| 1117 | color_endpoint_data = local_buff; | ||
| 1118 | color_endpoint_data = bitfieldReverse(color_endpoint_data).wzyx; | ||
| 1119 | const uint clear_byte_start = (weight_bits >> 3) + 1; | ||
| 1199 | 1120 | ||
| 1200 | texel_weight_data = local_buff; | 1121 | const uint byte_insert = ExtractBits(color_endpoint_data, int(clear_byte_start - 1) * 8, 8) & |
| 1201 | texel_weight_data = bitfieldReverse(texel_weight_data).wzyx; | 1122 | uint(((1 << (weight_bits % 8)) - 1)); |
| 1202 | uint clear_byte_start = | 1123 | const uint vec_index = (clear_byte_start - 1) >> 2; |
| 1203 | (GetPackedBitSize(params.size, params.dual_plane, params.max_weight) >> 3) + 1; | 1124 | color_endpoint_data[vec_index] = bitfieldInsert(color_endpoint_data[vec_index], byte_insert, |
| 1204 | 1125 | int((clear_byte_start - 1) % 4) * 8, 8); | |
| 1205 | uint byte_insert = ExtractBits(texel_weight_data, int(clear_byte_start - 1) * 8, 8) & | ||
| 1206 | uint( | ||
| 1207 | ((1 << (GetPackedBitSize(params.size, params.dual_plane, params.max_weight) % 8)) - 1)); | ||
| 1208 | uint vec_index = (clear_byte_start - 1) >> 2; | ||
| 1209 | texel_weight_data[vec_index] = | ||
| 1210 | bitfieldInsert(texel_weight_data[vec_index], byte_insert, int((clear_byte_start - 1) % 4) * 8, 8); | ||
| 1211 | for (uint i = clear_byte_start; i < 16; ++i) { | 1126 | for (uint i = clear_byte_start; i < 16; ++i) { |
| 1212 | uint idx = i >> 2; | 1127 | const uint idx = i >> 2; |
| 1213 | texel_weight_data[idx] = bitfieldInsert(texel_weight_data[idx], 0, int(i % 4) * 8, 8); | 1128 | color_endpoint_data[idx] = bitfieldInsert(color_endpoint_data[idx], 0, int(i % 4) * 8, 8); |
| 1214 | } | 1129 | } |
| 1215 | texel_flag = true; // use texel "vector" and bit stream in integer decoding | ||
| 1216 | DecodeIntegerSequence(params.max_weight, GetNumWeightValues(params.size, params.dual_plane)); | ||
| 1217 | 1130 | ||
| 1218 | UnquantizeTexelWeights(params.dual_plane, params.size); | 1131 | // Re-init vector variables for next decode phase |
| 1132 | result_index = 0; | ||
| 1133 | color_bitsread = 0; | ||
| 1134 | result_limit_reached = false; | ||
| 1219 | 1135 | ||
| 1136 | // The limit for the Unquantize phase, avoids decoding more data than needed. | ||
| 1137 | result_vector_max_index = size_params.x * size_params.y; | ||
| 1138 | if (dual_plane) { | ||
| 1139 | result_vector_max_index *= 2; | ||
| 1140 | } | ||
| 1141 | DecodeIntegerSequence(max_weight, GetNumWeightValues(size_params, dual_plane)); | ||
| 1142 | |||
| 1143 | UnquantizeTexelWeights(size_params, dual_plane); | ||
| 1220 | for (uint j = 0; j < block_dims.y; j++) { | 1144 | for (uint j = 0; j < block_dims.y; j++) { |
| 1221 | for (uint i = 0; i < block_dims.x; i++) { | 1145 | for (uint i = 0; i < block_dims.x; i++) { |
| 1222 | uint local_partition = 0; | 1146 | uint local_partition = 0; |
| 1223 | if (num_partitions > 1) { | 1147 | if (num_partitions > 1) { |
| 1224 | local_partition = Select2DPartition(partition_index, i, j, num_partitions, | 1148 | local_partition = Select2DPartition(partition_index, i, j, num_partitions); |
| 1225 | (block_dims.y * block_dims.x) < 32); | ||
| 1226 | } | ||
| 1227 | vec4 p; | ||
| 1228 | uvec4 C0 = ReplicateByteTo16(endpoints[local_partition][0]); | ||
| 1229 | uvec4 C1 = ReplicateByteTo16(endpoints[local_partition][1]); | ||
| 1230 | uvec4 plane_vec = uvec4(0); | ||
| 1231 | uvec4 weight_vec = uvec4(0); | ||
| 1232 | for (uint c = 0; c < 4; c++) { | ||
| 1233 | if (params.dual_plane && (((plane_index + 1) & 3) == c)) { | ||
| 1234 | plane_vec[c] = 1; | ||
| 1235 | } | ||
| 1236 | weight_vec[c] = unquantized_texel_weights[plane_vec[c]][j * block_dims.x + i]; | ||
| 1237 | } | 1149 | } |
| 1238 | vec4 Cf = vec4((C0 * (uvec4(64) - weight_vec) + C1 * weight_vec + uvec4(32)) / 64); | 1150 | const uvec4 C0 = ReplicateByteTo16(endpoints0[local_partition]); |
| 1239 | p = (Cf / 65535.0); | 1151 | const uvec4 C1 = ReplicateByteTo16(endpoints1[local_partition]); |
| 1152 | const uvec4 weight_vec = GetUnquantizedWeightVector(j, i, size_params, plane_index, dual_plane); | ||
| 1153 | const vec4 Cf = | ||
| 1154 | vec4((C0 * (uvec4(64) - weight_vec) + C1 * weight_vec + uvec4(32)) / 64); | ||
| 1155 | const vec4 p = (Cf / 65535.0f); | ||
| 1240 | imageStore(dest_image, coord + ivec3(i, j, 0), p.gbar); | 1156 | imageStore(dest_image, coord + ivec3(i, j, 0), p.gbar); |
| 1241 | } | 1157 | } |
| 1242 | } | 1158 | } |
| 1243 | } | 1159 | } |
| 1244 | 1160 | ||
| 1161 | uint SwizzleOffset(uvec2 pos) { | ||
| 1162 | const uint x = pos.x; | ||
| 1163 | const uint y = pos.y; | ||
| 1164 | return ((x % 64) / 32) * 256 + ((y % 8) / 2) * 64 + | ||
| 1165 | ((x % 32) / 16) * 32 + (y % 2) * 16 + (x % 16); | ||
| 1166 | } | ||
| 1167 | |||
| 1245 | void main() { | 1168 | void main() { |
| 1246 | uvec3 pos = gl_GlobalInvocationID; | 1169 | uvec3 pos = gl_GlobalInvocationID; |
| 1247 | pos.x <<= BYTES_PER_BLOCK_LOG2; | 1170 | pos.x <<= BYTES_PER_BLOCK_LOG2; |
| 1248 | |||
| 1249 | // Read as soon as possible due to its latency | ||
| 1250 | const uint swizzle = SwizzleOffset(pos.xy); | 1171 | const uint swizzle = SwizzleOffset(pos.xy); |
| 1251 | |||
| 1252 | const uint block_y = pos.y >> GOB_SIZE_Y_SHIFT; | 1172 | const uint block_y = pos.y >> GOB_SIZE_Y_SHIFT; |
| 1253 | 1173 | ||
| 1254 | uint offset = 0; | 1174 | uint offset = 0; |
| @@ -1262,8 +1182,6 @@ void main() { | |||
| 1262 | if (any(greaterThanEqual(coord, imageSize(dest_image)))) { | 1182 | if (any(greaterThanEqual(coord, imageSize(dest_image)))) { |
| 1263 | return; | 1183 | return; |
| 1264 | } | 1184 | } |
| 1265 | current_index = 0; | ||
| 1266 | bitsread = 0; | ||
| 1267 | local_buff = astc_data[offset / 16]; | 1185 | local_buff = astc_data[offset / 16]; |
| 1268 | DecompressBlock(coord); | 1186 | DecompressBlock(coord); |
| 1269 | } | 1187 | } |
diff --git a/src/video_core/renderer_opengl/util_shaders.cpp b/src/video_core/renderer_opengl/util_shaders.cpp index 544982d18..c437013e6 100644 --- a/src/video_core/renderer_opengl/util_shaders.cpp +++ b/src/video_core/renderer_opengl/util_shaders.cpp | |||
| @@ -68,6 +68,7 @@ void UtilShaders::ASTCDecode(Image& image, const StagingBufferMap& map, | |||
| 68 | std::span<const VideoCommon::SwizzleParameters> swizzles) { | 68 | std::span<const VideoCommon::SwizzleParameters> swizzles) { |
| 69 | static constexpr GLuint BINDING_INPUT_BUFFER = 0; | 69 | static constexpr GLuint BINDING_INPUT_BUFFER = 0; |
| 70 | static constexpr GLuint BINDING_OUTPUT_IMAGE = 0; | 70 | static constexpr GLuint BINDING_OUTPUT_IMAGE = 0; |
| 71 | program_manager.LocalMemoryWarmup(); | ||
| 71 | 72 | ||
| 72 | const Extent2D tile_size{ | 73 | const Extent2D tile_size{ |
| 73 | .width = VideoCore::Surface::DefaultBlockWidth(image.info.format), | 74 | .width = VideoCore::Surface::DefaultBlockWidth(image.info.format), |