diff options
| author | 2023-08-01 17:22:03 -0400 | |
|---|---|---|
| committer | 2023-08-06 14:54:57 -0400 | |
| commit | 5248fa926dd53948b0df4f93c50107dc30ae2305 (patch) | |
| tree | 3aa35f10a7a17d5a46d5579a914b858a46777551 /src | |
| parent | minor redundancy cleanup (diff) | |
| download | yuzu-5248fa926dd53948b0df4f93c50107dc30ae2305.tar.gz yuzu-5248fa926dd53948b0df4f93c50107dc30ae2305.tar.xz yuzu-5248fa926dd53948b0df4f93c50107dc30ae2305.zip | |
const, pack result_vector and replicate tables,
undo amd opts
Diffstat (limited to 'src')
| -rw-r--r-- | src/video_core/host_shaders/astc_decoder.comp | 487 |
1 files changed, 260 insertions, 227 deletions
diff --git a/src/video_core/host_shaders/astc_decoder.comp b/src/video_core/host_shaders/astc_decoder.comp index 90b40c55f..e8801b0ff 100644 --- a/src/video_core/host_shaders/astc_decoder.comp +++ b/src/video_core/host_shaders/astc_decoder.comp | |||
| @@ -57,20 +57,40 @@ const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT; | |||
| 57 | 57 | ||
| 58 | const uint BYTES_PER_BLOCK_LOG2 = 4; | 58 | const uint BYTES_PER_BLOCK_LOG2 = 4; |
| 59 | 59 | ||
| 60 | const int JUST_BITS = 0; | 60 | const uint JUST_BITS = 0u; |
| 61 | const int QUINT = 1; | 61 | const uint QUINT = 1u; |
| 62 | const int TRIT = 2; | 62 | const uint TRIT = 2u; |
| 63 | 63 | ||
| 64 | // ASTC Encodings data, sorted in ascending order based on their BitLength value | 64 | // ASTC Encodings data, sorted in ascending order based on their BitLength value |
| 65 | // (see GetBitLength() function) | 65 | // (see GetBitLength() function) |
| 66 | const EncodingData encoding_values[22] = EncodingData[]( | 66 | const uvec4 encoding_values[6] = uvec4[]( |
| 67 | EncodingData(JUST_BITS), EncodingData(JUST_BITS | (1u << 8u)), EncodingData(TRIT), EncodingData(JUST_BITS | (2u << 8u)), | 67 | uvec4((JUST_BITS), (JUST_BITS | (1u << 8u)), (TRIT), (JUST_BITS | (2u << 8u))), |
| 68 | EncodingData(QUINT), EncodingData(TRIT | (1u << 8u)), EncodingData(JUST_BITS | (3u << 8u)), EncodingData(QUINT | (1u << 8u)), | 68 | uvec4((QUINT), (TRIT | (1u << 8u)), (JUST_BITS | (3u << 8u)), (QUINT | (1u << 8u))), |
| 69 | EncodingData(TRIT | (2u << 8u)), EncodingData(JUST_BITS | (4u << 8u)), EncodingData(QUINT | (2u << 8u)), EncodingData(TRIT | (3u << 8u)), | 69 | uvec4((TRIT | (2u << 8u)), (JUST_BITS | (4u << 8u)), (QUINT | (2u << 8u)), (TRIT | (3u << 8u))), |
| 70 | EncodingData(JUST_BITS | (5u << 8u)), EncodingData(QUINT | (3u << 8u)), EncodingData(TRIT | (4u << 8u)), EncodingData(JUST_BITS | (6u << 8u)), | 70 | uvec4((JUST_BITS | (5u << 8u)), (QUINT | (3u << 8u)), (TRIT | (4u << 8u)), (JUST_BITS | (6u << 8u))), |
| 71 | EncodingData(QUINT | (4u << 8u)), EncodingData(TRIT | (5u << 8u)), EncodingData(JUST_BITS | (7u << 8u)), EncodingData(QUINT | (5u << 8u)), | 71 | uvec4((QUINT | (4u << 8u)), (TRIT | (5u << 8u)), (JUST_BITS | (7u << 8u)), (QUINT | (5u << 8u))), |
| 72 | EncodingData(TRIT | (6u << 8u)), EncodingData(JUST_BITS | (8u << 8u)) | 72 | uvec4((TRIT | (6u << 8u)), (JUST_BITS | (8u << 8u)), 0u, 0u)); |
| 73 | ); | 73 | |
| 74 | // Input ASTC texture globals | ||
| 75 | int total_bitsread = 0; | ||
| 76 | uvec4 local_buff; | ||
| 77 | |||
| 78 | // Color data globals | ||
| 79 | uvec4 color_endpoint_data; | ||
| 80 | int color_bitsread = 0; | ||
| 81 | |||
| 82 | // Global "vector" to be pushed into when decoding | ||
| 83 | // At most will require BLOCK_WIDTH x BLOCK_HEIGHT in single plane mode | ||
| 84 | // At most will require BLOCK_WIDTH x BLOCK_HEIGHT x 2 in dual plane mode | ||
| 85 | // So the maximum would be 144 (12 x 12) elements, x 2 for two planes | ||
| 86 | #define DIVCEIL(number, divisor) (number + divisor - 1) / divisor | ||
| 87 | #define ARRAY_NUM_ELEMENTS 144 | ||
| 88 | #define VECTOR_ARRAY_SIZE DIVCEIL(ARRAY_NUM_ELEMENTS * 2, 4) | ||
| 89 | uvec4 result_vector[VECTOR_ARRAY_SIZE]; | ||
| 90 | |||
| 91 | int result_index = 0; | ||
| 92 | uint result_vector_max_index; | ||
| 93 | bool result_limit_reached = false; | ||
| 74 | 94 | ||
| 75 | // EncodingData helpers | 95 | // EncodingData helpers |
| 76 | uint Encoding(EncodingData val) { | 96 | uint Encoding(EncodingData val) { |
| @@ -104,78 +124,17 @@ EncodingData CreateEncodingData(uint encoding, uint num_bits, uint bit_val, uint | |||
| 104 | ((bit_val) << 16u) | ((quint_trit_val) << 24u)); | 124 | ((bit_val) << 16u) | ((quint_trit_val) << 24u)); |
| 105 | } | 125 | } |
| 106 | 126 | ||
| 107 | // The following constants are expanded variants of the Replicate() | ||
| 108 | // function calls corresponding to the following arguments: | ||
| 109 | // value: index into the generated table | ||
| 110 | // num_bits: the integer after "REPLICATE" in the table name. i.e. 4 is num_bits in REPLICATE_4. | ||
| 111 | // to_bit: the integer after "TO_" | ||
| 112 | const uint REPLICATE_BIT_TO_7_TABLE[2] = uint[](0, 127); | ||
| 113 | const uint REPLICATE_1_BIT_TO_9_TABLE[2] = uint[](0, 511); | ||
| 114 | |||
| 115 | const uint REPLICATE_1_BIT_TO_8_TABLE[2] = uint[](0, 255); | ||
| 116 | const uint REPLICATE_2_BIT_TO_8_TABLE[4] = uint[](0, 85, 170, 255); | ||
| 117 | const uint REPLICATE_3_BIT_TO_8_TABLE[8] = uint[](0, 36, 73, 109, 146, 182, 219, 255); | ||
| 118 | const uint REPLICATE_4_BIT_TO_8_TABLE[16] = | ||
| 119 | uint[](0, 17, 34, 51, 68, 85, 102, 119, 136, 153, 170, 187, 204, 221, 238, 255); | ||
| 120 | const uint REPLICATE_5_BIT_TO_8_TABLE[32] = | ||
| 121 | uint[](0, 8, 16, 24, 33, 41, 49, 57, 66, 74, 82, 90, 99, 107, 115, 123, 132, 140, 148, 156, 165, | ||
| 122 | 173, 181, 189, 198, 206, 214, 222, 231, 239, 247, 255); | ||
| 123 | const uint REPLICATE_1_BIT_TO_6_TABLE[2] = uint[](0, 63); | ||
| 124 | const uint REPLICATE_2_BIT_TO_6_TABLE[4] = uint[](0, 21, 42, 63); | ||
| 125 | const uint REPLICATE_3_BIT_TO_6_TABLE[8] = uint[](0, 9, 18, 27, 36, 45, 54, 63); | ||
| 126 | const uint REPLICATE_4_BIT_TO_6_TABLE[16] = | ||
| 127 | uint[](0, 4, 8, 12, 17, 21, 25, 29, 34, 38, 42, 46, 51, 55, 59, 63); | ||
| 128 | const uint REPLICATE_5_BIT_TO_6_TABLE[32] = | ||
| 129 | uint[](0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 33, 35, 37, 39, 41, 43, 45, | ||
| 130 | 47, 49, 51, 53, 55, 57, 59, 61, 63); | ||
| 131 | const uint REPLICATE_6_BIT_TO_8_TABLE[64] = | ||
| 132 | uint[](0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 65, 69, 73, 77, 81, 85, 89, | ||
| 133 | 93, 97, 101, 105, 109, 113, 117, 121, 125, 130, 134, 138, 142, 146, 150, 154, 158, 162, | ||
| 134 | 166, 170, 174, 178, 182, 186, 190, 195, 199, 203, 207, 211, 215, 219, 223, 227, 231, 235, | ||
| 135 | 239, 243, 247, 251, 255); | ||
| 136 | const uint REPLICATE_7_BIT_TO_8_TABLE[128] = | ||
| 137 | uint[](0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, | ||
| 138 | 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, | ||
| 139 | 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126, | ||
| 140 | 129, 131, 133, 135, 137, 139, 141, 143, 145, 147, 149, 151, 153, 155, 157, 159, 161, 163, | ||
| 141 | 165, 167, 169, 171, 173, 175, 177, 179, 181, 183, 185, 187, 189, 191, 193, 195, 197, 199, | ||
| 142 | 201, 203, 205, 207, 209, 211, 213, 215, 217, 219, 221, 223, 225, 227, 229, 231, 233, 235, | ||
| 143 | 237, 239, 241, 243, 245, 247, 249, 251, 253, 255); | ||
| 144 | |||
| 145 | // Input ASTC texture globals | ||
| 146 | int total_bitsread = 0; | ||
| 147 | uvec4 local_buff; | ||
| 148 | |||
| 149 | // Color data globals | ||
| 150 | uvec4 color_endpoint_data; | ||
| 151 | int color_bitsread = 0; | ||
| 152 | |||
| 153 | // Four values, two endpoints, four maximum partitions | ||
| 154 | uint color_values[32]; | ||
| 155 | int colvals_index = 0; | ||
| 156 | |||
| 157 | // Global "vectors" to be pushed into when decoding | ||
| 158 | EncodingData result_vector[144]; | ||
| 159 | int result_index = 0; | ||
| 160 | 127 | ||
| 161 | // Replicates low num_bits such that [(to_bit - 1):(to_bit - 1 - from_bit)] | 128 | void ResultEmplaceBack(EncodingData val) { |
| 162 | // is the same as [(num_bits - 1):0] and repeats all the way down. | 129 | if (result_index >= result_vector_max_index) { |
| 163 | uint Replicate(uint val, uint num_bits, uint to_bit) { | 130 | // Alert callers to avoid decoding more than needed by this phase |
| 164 | const uint v = val & uint((1 << num_bits) - 1); | 131 | result_limit_reached = true; |
| 165 | uint res = v; | 132 | return; |
| 166 | uint reslen = num_bits; | ||
| 167 | while (reslen < to_bit) { | ||
| 168 | uint comp = 0; | ||
| 169 | if (num_bits > to_bit - reslen) { | ||
| 170 | uint newshift = to_bit - reslen; | ||
| 171 | comp = num_bits - newshift; | ||
| 172 | num_bits = newshift; | ||
| 173 | } | ||
| 174 | res = uint(res << num_bits); | ||
| 175 | res = uint(res | (v >> comp)); | ||
| 176 | reslen += num_bits; | ||
| 177 | } | 133 | } |
| 178 | return res; | 134 | const uint array_index = result_index / 4; |
| 135 | const uint vector_index = result_index % 4; | ||
| 136 | result_vector[array_index][vector_index] = val.data; | ||
| 137 | ++result_index; | ||
| 179 | } | 138 | } |
| 180 | 139 | ||
| 181 | uvec4 ReplicateByteTo16(uvec4 value) { | 140 | uvec4 ReplicateByteTo16(uvec4 value) { |
| @@ -183,64 +142,105 @@ uvec4 ReplicateByteTo16(uvec4 value) { | |||
| 183 | } | 142 | } |
| 184 | 143 | ||
| 185 | uint ReplicateBitTo7(uint value) { | 144 | uint ReplicateBitTo7(uint value) { |
| 186 | return REPLICATE_BIT_TO_7_TABLE[value]; | 145 | return value * 127; |
| 187 | } | 146 | } |
| 188 | 147 | ||
| 189 | uint ReplicateBitTo9(uint value) { | 148 | uint ReplicateBitTo9(uint value) { |
| 190 | return REPLICATE_1_BIT_TO_9_TABLE[value]; | 149 | return value * 511; |
| 191 | } | 150 | } |
| 192 | 151 | ||
| 193 | uint FastReplicate(uint value, uint num_bits, uint to_bit) { | 152 | uint FastReplicateTo8(uint value, uint num_bits) { |
| 194 | if (num_bits == 0) { | 153 | if (value == 0) { |
| 195 | return 0; | 154 | return 0; |
| 196 | } | 155 | } |
| 197 | if (num_bits == to_bit) { | 156 | const uint array_index = value / 4; |
| 198 | return value; | 157 | const uint vector_index = value % 4; |
| 158 | switch (num_bits) { | ||
| 159 | case 1: | ||
| 160 | return 255; | ||
| 161 | case 2: { | ||
| 162 | const uvec4 REPLICATE_2_BIT_TO_8_TABLE = (uvec4(0, 85, 170, 255)); | ||
| 163 | return REPLICATE_2_BIT_TO_8_TABLE[vector_index]; | ||
| 199 | } | 164 | } |
| 200 | if (to_bit == 6) { | 165 | case 3: { |
| 201 | switch (num_bits) { | 166 | const uvec4 REPLICATE_3_BIT_TO_8_TABLE[2] = |
| 202 | case 1: | 167 | uvec4[](uvec4(0, 36, 73, 109), uvec4(146, 182, 219, 255)); |
| 203 | return REPLICATE_1_BIT_TO_6_TABLE[value]; | 168 | return REPLICATE_3_BIT_TO_8_TABLE[array_index][vector_index]; |
| 204 | case 2: | ||
| 205 | return REPLICATE_2_BIT_TO_6_TABLE[value]; | ||
| 206 | case 3: | ||
| 207 | return REPLICATE_3_BIT_TO_6_TABLE[value]; | ||
| 208 | case 4: | ||
| 209 | return REPLICATE_4_BIT_TO_6_TABLE[value]; | ||
| 210 | case 5: | ||
| 211 | return REPLICATE_5_BIT_TO_6_TABLE[value]; | ||
| 212 | default: | ||
| 213 | break; | ||
| 214 | } | ||
| 215 | } else { /* if (to_bit == 8) */ | ||
| 216 | switch (num_bits) { | ||
| 217 | case 1: | ||
| 218 | return REPLICATE_1_BIT_TO_8_TABLE[value]; | ||
| 219 | case 2: | ||
| 220 | return REPLICATE_2_BIT_TO_8_TABLE[value]; | ||
| 221 | case 3: | ||
| 222 | return REPLICATE_3_BIT_TO_8_TABLE[value]; | ||
| 223 | case 4: | ||
| 224 | return REPLICATE_4_BIT_TO_8_TABLE[value]; | ||
| 225 | case 5: | ||
| 226 | return REPLICATE_5_BIT_TO_8_TABLE[value]; | ||
| 227 | case 6: | ||
| 228 | return REPLICATE_6_BIT_TO_8_TABLE[value]; | ||
| 229 | case 7: | ||
| 230 | return REPLICATE_7_BIT_TO_8_TABLE[value]; | ||
| 231 | default: | ||
| 232 | break; | ||
| 233 | } | ||
| 234 | } | 169 | } |
| 235 | return Replicate(value, num_bits, to_bit); | 170 | case 4: { |
| 236 | } | 171 | const uvec4 REPLICATE_4_BIT_TO_8_TABLE[4] = |
| 237 | 172 | uvec4[](uvec4(0, 17, 34, 51), uvec4(68, 85, 102, 119), uvec4(136, 153, 170, 187), | |
| 238 | uint FastReplicateTo8(uint value, uint num_bits) { | 173 | uvec4(204, 221, 238, 255)); |
| 239 | return FastReplicate(value, num_bits, 8); | 174 | return REPLICATE_4_BIT_TO_8_TABLE[array_index][vector_index]; |
| 175 | } | ||
| 176 | case 5: { | ||
| 177 | const uvec4 REPLICATE_5_BIT_TO_8_TABLE[8] = | ||
| 178 | uvec4[](uvec4(0, 8, 16, 24), uvec4(33, 41, 49, 57), uvec4(66, 74, 82, 90), | ||
| 179 | uvec4(99, 107, 115, 123), uvec4(132, 140, 148, 156), uvec4(165, 173, 181, 189), | ||
| 180 | uvec4(198, 206, 214, 222), uvec4(231, 239, 247, 255)); | ||
| 181 | return REPLICATE_5_BIT_TO_8_TABLE[array_index][vector_index]; | ||
| 182 | } | ||
| 183 | case 6: { | ||
| 184 | const uvec4 REPLICATE_6_BIT_TO_8_TABLE[16] = uvec4[]( | ||
| 185 | uvec4(0, 4, 8, 12), uvec4(16, 20, 24, 28), uvec4(32, 36, 40, 44), uvec4(48, 52, 56, 60), | ||
| 186 | uvec4(65, 69, 73, 77), uvec4(81, 85, 89, 93), uvec4(97, 101, 105, 109), | ||
| 187 | uvec4(113, 117, 121, 125), uvec4(130, 134, 138, 142), uvec4(146, 150, 154, 158), | ||
| 188 | uvec4(162, 166, 170, 174), uvec4(178, 182, 186, 190), uvec4(195, 199, 203, 207), | ||
| 189 | uvec4(211, 215, 219, 223), uvec4(227, 231, 235, 239), uvec4(243, 247, 251, 255)); | ||
| 190 | return REPLICATE_6_BIT_TO_8_TABLE[array_index][vector_index]; | ||
| 191 | } | ||
| 192 | case 7: { | ||
| 193 | const uvec4 REPLICATE_7_BIT_TO_8_TABLE[32] = | ||
| 194 | uvec4[](uvec4(0, 2, 4, 6), uvec4(8, 10, 12, 14), uvec4(16, 18, 20, 22), | ||
| 195 | uvec4(24, 26, 28, 30), uvec4(32, 34, 36, 38), uvec4(40, 42, 44, 46), | ||
| 196 | uvec4(48, 50, 52, 54), uvec4(56, 58, 60, 62), uvec4(64, 66, 68, 70), | ||
| 197 | uvec4(72, 74, 76, 78), uvec4(80, 82, 84, 86), uvec4(88, 90, 92, 94), | ||
| 198 | uvec4(96, 98, 100, 102), uvec4(104, 106, 108, 110), uvec4(112, 114, 116, 118), | ||
| 199 | uvec4(120, 122, 124, 126), uvec4(129, 131, 133, 135), uvec4(137, 139, 141, 143), | ||
| 200 | uvec4(145, 147, 149, 151), uvec4(153, 155, 157, 159), uvec4(161, 163, 165, 167), | ||
| 201 | uvec4(169, 171, 173, 175), uvec4(177, 179, 181, 183), uvec4(185, 187, 189, 191), | ||
| 202 | uvec4(193, 195, 197, 199), uvec4(201, 203, 205, 207), uvec4(209, 211, 213, 215), | ||
| 203 | uvec4(217, 219, 221, 223), uvec4(225, 227, 229, 231), uvec4(233, 235, 237, 239), | ||
| 204 | uvec4(241, 243, 245, 247), uvec4(249, 251, 253, 255)); | ||
| 205 | return REPLICATE_7_BIT_TO_8_TABLE[array_index][vector_index]; | ||
| 206 | } | ||
| 207 | } | ||
| 208 | return value; | ||
| 240 | } | 209 | } |
| 241 | 210 | ||
| 242 | uint FastReplicateTo6(uint value, uint num_bits) { | 211 | uint FastReplicateTo6(uint value, uint num_bits) { |
| 243 | return FastReplicate(value, num_bits, 6); | 212 | if (value == 0) { |
| 213 | return 0; | ||
| 214 | } | ||
| 215 | const uint array_index = value / 4; | ||
| 216 | const uint vector_index = value % 4; | ||
| 217 | switch (num_bits) { | ||
| 218 | case 1: | ||
| 219 | return 63; | ||
| 220 | case 2: { | ||
| 221 | const uvec4 REPLICATE_2_BIT_TO_6_TABLE = uvec4(0, 21, 42, 63); | ||
| 222 | return REPLICATE_2_BIT_TO_6_TABLE[vector_index]; | ||
| 223 | } | ||
| 224 | case 3: { | ||
| 225 | const uvec4 REPLICATE_3_BIT_TO_6_TABLE[2] = | ||
| 226 | uvec4[](uvec4(0, 9, 18, 27), uvec4(36, 45, 54, 63)); | ||
| 227 | return REPLICATE_3_BIT_TO_6_TABLE[array_index][vector_index]; | ||
| 228 | } | ||
| 229 | case 4: { | ||
| 230 | const uvec4 REPLICATE_4_BIT_TO_6_TABLE[4] = | ||
| 231 | uvec4[](uvec4(0, 4, 8, 12), uvec4(17, 21, 25, 29), uvec4(34, 38, 42, 46), | ||
| 232 | uvec4(51, 55, 59, 63)); | ||
| 233 | return REPLICATE_4_BIT_TO_6_TABLE[array_index][vector_index]; | ||
| 234 | } | ||
| 235 | case 5: { | ||
| 236 | const uvec4 REPLICATE_5_BIT_TO_6_TABLE[8] = | ||
| 237 | uvec4[](uvec4(0, 2, 4, 6), uvec4(8, 10, 12, 14), uvec4(16, 18, 20, 22), | ||
| 238 | uvec4(24, 26, 28, 30), uvec4(33, 35, 37, 39), uvec4(41, 43, 45, 47), | ||
| 239 | uvec4(49, 51, 53, 55), uvec4(57, 59, 61, 63)); | ||
| 240 | return REPLICATE_5_BIT_TO_6_TABLE[array_index][vector_index]; | ||
| 241 | } | ||
| 242 | } | ||
| 243 | return value; | ||
| 244 | } | 244 | } |
| 245 | 245 | ||
| 246 | uint Div3Floor(uint v) { | 246 | uint Div3Floor(uint v) { |
| @@ -281,7 +281,7 @@ uint Select2DPartition(uint seed, uint x, uint y, uint partition_count, bool sma | |||
| 281 | 281 | ||
| 282 | seed += (partition_count - 1) * 1024; | 282 | seed += (partition_count - 1) * 1024; |
| 283 | 283 | ||
| 284 | uint rnum = Hash52(uint(seed)); | 284 | const uint rnum = Hash52(uint(seed)); |
| 285 | uint seed1 = uint(rnum & 0xF); | 285 | uint seed1 = uint(rnum & 0xF); |
| 286 | uint seed2 = uint((rnum >> 4) & 0xF); | 286 | uint seed2 = uint((rnum >> 4) & 0xF); |
| 287 | uint seed3 = uint((rnum >> 8) & 0xF); | 287 | uint seed3 = uint((rnum >> 8) & 0xF); |
| @@ -364,8 +364,8 @@ uint ExtractBits(uvec4 payload, int offset, int bits) { | |||
| 364 | } | 364 | } |
| 365 | 365 | ||
| 366 | uint StreamBits(uint num_bits) { | 366 | uint StreamBits(uint num_bits) { |
| 367 | int int_bits = int(num_bits); | 367 | const int int_bits = int(num_bits); |
| 368 | uint ret = ExtractBits(local_buff, total_bitsread, int_bits); | 368 | const uint ret = ExtractBits(local_buff, total_bitsread, int_bits); |
| 369 | total_bitsread += int_bits; | 369 | total_bitsread += int_bits; |
| 370 | return ret; | 370 | return ret; |
| 371 | } | 371 | } |
| @@ -382,14 +382,18 @@ uint StreamColorBits(uint num_bits) { | |||
| 382 | return ret; | 382 | return ret; |
| 383 | } | 383 | } |
| 384 | 384 | ||
| 385 | void ResultEmplaceBack(EncodingData val) { | 385 | EncodingData GetEncodingFromVector(uint index) { |
| 386 | result_vector[result_index] = val; | 386 | const uint array_index = index / 4; |
| 387 | ++result_index; | 387 | const uint vector_index = index % 4; |
| 388 | |||
| 389 | const uint data = result_vector[array_index][vector_index]; | ||
| 390 | return EncodingData(data); | ||
| 388 | } | 391 | } |
| 389 | 392 | ||
| 390 | // Returns the number of bits required to encode n_vals values. | 393 | // Returns the number of bits required to encode n_vals values. |
| 391 | uint GetBitLength(uint n_vals, uint encoding_index) { | 394 | uint GetBitLength(uint n_vals, uint encoding_index) { |
| 392 | const EncodingData encoding_value = encoding_values[encoding_index]; | 395 | const EncodingData encoding_value = |
| 396 | EncodingData(encoding_values[encoding_index / 4][encoding_index % 4]); | ||
| 393 | const uint encoding = Encoding(encoding_value); | 397 | const uint encoding = Encoding(encoding_value); |
| 394 | uint total_bits = NumBits(encoding_value) * n_vals; | 398 | uint total_bits = NumBits(encoding_value) * n_vals; |
| 395 | if (encoding == TRIT) { | 399 | if (encoding == TRIT) { |
| @@ -409,7 +413,7 @@ uint GetNumWeightValues(uvec2 size, bool dual_plane) { | |||
| 409 | } | 413 | } |
| 410 | 414 | ||
| 411 | uint GetPackedBitSize(uvec2 size, bool dual_plane, uint max_weight) { | 415 | uint GetPackedBitSize(uvec2 size, bool dual_plane, uint max_weight) { |
| 412 | uint n_vals = GetNumWeightValues(size, dual_plane); | 416 | const uint n_vals = GetNumWeightValues(size, dual_plane); |
| 413 | return GetBitLength(n_vals, max_weight); | 417 | return GetBitLength(n_vals, max_weight); |
| 414 | } | 418 | } |
| 415 | 419 | ||
| @@ -418,13 +422,13 @@ uint BitsBracket(uint bits, uint pos) { | |||
| 418 | } | 422 | } |
| 419 | 423 | ||
| 420 | uint BitsOp(uint bits, uint start, uint end) { | 424 | uint BitsOp(uint bits, uint start, uint end) { |
| 421 | uint mask = (1 << (end - start + 1)) - 1; | 425 | const uint mask = (1 << (end - start + 1)) - 1; |
| 422 | return ((bits >> start) & mask); | 426 | return ((bits >> start) & mask); |
| 423 | } | 427 | } |
| 424 | 428 | ||
| 425 | void DecodeQuintBlock(uint num_bits) { | 429 | void DecodeQuintBlock(uint num_bits) { |
| 426 | uint m[3]; | 430 | uvec3 m; |
| 427 | uint q[3]; | 431 | uvec3 q; |
| 428 | uint Q; | 432 | uint Q; |
| 429 | m[0] = StreamColorBits(num_bits); | 433 | m[0] = StreamColorBits(num_bits); |
| 430 | Q = StreamColorBits(3); | 434 | Q = StreamColorBits(3); |
| @@ -433,25 +437,25 @@ void DecodeQuintBlock(uint num_bits) { | |||
| 433 | m[2] = StreamColorBits(num_bits); | 437 | m[2] = StreamColorBits(num_bits); |
| 434 | Q |= StreamColorBits(2) << 5; | 438 | Q |= StreamColorBits(2) << 5; |
| 435 | if (BitsOp(Q, 1, 2) == 3 && BitsOp(Q, 5, 6) == 0) { | 439 | if (BitsOp(Q, 1, 2) == 3 && BitsOp(Q, 5, 6) == 0) { |
| 436 | q[0] = 4; | 440 | q.x = 4; |
| 437 | q[1] = 4; | 441 | q.y = 4; |
| 438 | q[2] = (BitsBracket(Q, 0) << 2) | ((BitsBracket(Q, 4) & ~BitsBracket(Q, 0)) << 1) | | 442 | q.z = (BitsBracket(Q, 0) << 2) | ((BitsBracket(Q, 4) & ~BitsBracket(Q, 0)) << 1) | |
| 439 | (BitsBracket(Q, 3) & ~BitsBracket(Q, 0)); | 443 | (BitsBracket(Q, 3) & ~BitsBracket(Q, 0)); |
| 440 | } else { | 444 | } else { |
| 441 | uint C = 0; | 445 | uint C = 0; |
| 442 | if (BitsOp(Q, 1, 2) == 3) { | 446 | if (BitsOp(Q, 1, 2) == 3) { |
| 443 | q[2] = 4; | 447 | q.z = 4; |
| 444 | C = (BitsOp(Q, 3, 4) << 3) | ((~BitsOp(Q, 5, 6) & 3) << 1) | BitsBracket(Q, 0); | 448 | C = (BitsOp(Q, 3, 4) << 3) | ((~BitsOp(Q, 5, 6) & 3) << 1) | BitsBracket(Q, 0); |
| 445 | } else { | 449 | } else { |
| 446 | q[2] = BitsOp(Q, 5, 6); | 450 | q.z = BitsOp(Q, 5, 6); |
| 447 | C = BitsOp(Q, 0, 4); | 451 | C = BitsOp(Q, 0, 4); |
| 448 | } | 452 | } |
| 449 | if (BitsOp(C, 0, 2) == 5) { | 453 | if (BitsOp(C, 0, 2) == 5) { |
| 450 | q[1] = 4; | 454 | q.y = 4; |
| 451 | q[0] = BitsOp(C, 3, 4); | 455 | q.x = BitsOp(C, 3, 4); |
| 452 | } else { | 456 | } else { |
| 453 | q[1] = BitsOp(C, 3, 4); | 457 | q.y = BitsOp(C, 3, 4); |
| 454 | q[0] = BitsOp(C, 0, 2); | 458 | q.x = BitsOp(C, 0, 2); |
| 455 | } | 459 | } |
| 456 | } | 460 | } |
| 457 | for (uint i = 0; i < 3; i++) { | 461 | for (uint i = 0; i < 3; i++) { |
| @@ -509,11 +513,11 @@ void DecodeTritBlock(uint num_bits) { | |||
| 509 | } | 513 | } |
| 510 | 514 | ||
| 511 | void DecodeIntegerSequence(uint max_range, uint num_values) { | 515 | void DecodeIntegerSequence(uint max_range, uint num_values) { |
| 512 | EncodingData val = encoding_values[max_range]; | 516 | EncodingData val = EncodingData(encoding_values[max_range / 4][max_range % 4]); |
| 513 | const uint encoding = Encoding(val); | 517 | const uint encoding = Encoding(val); |
| 514 | const uint num_bits = NumBits(val); | 518 | const uint num_bits = NumBits(val); |
| 515 | uint vals_decoded = 0; | 519 | uint vals_decoded = 0; |
| 516 | while (vals_decoded < num_values) { | 520 | while (vals_decoded < num_values && !result_limit_reached) { |
| 517 | switch (encoding) { | 521 | switch (encoding) { |
| 518 | case QUINT: | 522 | case QUINT: |
| 519 | DecodeQuintBlock(num_bits); | 523 | DecodeQuintBlock(num_bits); |
| @@ -532,7 +536,8 @@ void DecodeIntegerSequence(uint max_range, uint num_values) { | |||
| 532 | } | 536 | } |
| 533 | } | 537 | } |
| 534 | 538 | ||
| 535 | void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits) { | 539 | void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits, |
| 540 | out uvec4 color_values[8]) { | ||
| 536 | uint num_values = 0; | 541 | uint num_values = 0; |
| 537 | for (uint i = 0; i < num_partitions; i++) { | 542 | for (uint i = 0; i < num_partitions; i++) { |
| 538 | num_values += ((modes[i] >> 2) + 1) << 1; | 543 | num_values += ((modes[i] >> 2) + 1) << 1; |
| @@ -540,8 +545,8 @@ void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits) { | |||
| 540 | // Find the largest encoding that's within color_data_bits | 545 | // Find the largest encoding that's within color_data_bits |
| 541 | // TODO(ameerj): profile with binary search | 546 | // TODO(ameerj): profile with binary search |
| 542 | int range = 0; | 547 | int range = 0; |
| 543 | while (++range < encoding_values.length()) { | 548 | while (++range < ((encoding_values.length() * 4) - 2)) { |
| 544 | uint bit_length = GetBitLength(num_values, range); | 549 | const uint bit_length = GetBitLength(num_values, range); |
| 545 | if (bit_length > color_data_bits) { | 550 | if (bit_length > color_data_bits) { |
| 546 | break; | 551 | break; |
| 547 | } | 552 | } |
| @@ -552,7 +557,7 @@ void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits) { | |||
| 552 | if (out_index >= num_values) { | 557 | if (out_index >= num_values) { |
| 553 | break; | 558 | break; |
| 554 | } | 559 | } |
| 555 | const EncodingData val = result_vector[itr]; | 560 | const EncodingData val = GetEncodingFromVector(itr); |
| 556 | const uint encoding = Encoding(val); | 561 | const uint encoding = Encoding(val); |
| 557 | const uint bitlen = NumBits(val); | 562 | const uint bitlen = NumBits(val); |
| 558 | const uint bitval = BitValue(val); | 563 | const uint bitval = BitValue(val); |
| @@ -560,7 +565,8 @@ void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits) { | |||
| 560 | A = ReplicateBitTo9((bitval & 1)); | 565 | A = ReplicateBitTo9((bitval & 1)); |
| 561 | switch (encoding) { | 566 | switch (encoding) { |
| 562 | case JUST_BITS: | 567 | case JUST_BITS: |
| 563 | color_values[out_index++] = FastReplicateTo8(bitval, bitlen); | 568 | color_values[out_index / 4][out_index % 4] = FastReplicateTo8(bitval, bitlen); |
| 569 | ++out_index; | ||
| 564 | break; | 570 | break; |
| 565 | case TRIT: { | 571 | case TRIT: { |
| 566 | D = QuintTritValue(val); | 572 | D = QuintTritValue(val); |
| @@ -570,31 +576,31 @@ void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits) { | |||
| 570 | break; | 576 | break; |
| 571 | case 2: { | 577 | case 2: { |
| 572 | C = 93; | 578 | C = 93; |
| 573 | uint b = (bitval >> 1) & 1; | 579 | const uint b = (bitval >> 1) & 1; |
| 574 | B = (b << 8) | (b << 4) | (b << 2) | (b << 1); | 580 | B = (b << 8) | (b << 4) | (b << 2) | (b << 1); |
| 575 | break; | 581 | break; |
| 576 | } | 582 | } |
| 577 | case 3: { | 583 | case 3: { |
| 578 | C = 44; | 584 | C = 44; |
| 579 | uint cb = (bitval >> 1) & 3; | 585 | const uint cb = (bitval >> 1) & 3; |
| 580 | B = (cb << 7) | (cb << 2) | cb; | 586 | B = (cb << 7) | (cb << 2) | cb; |
| 581 | break; | 587 | break; |
| 582 | } | 588 | } |
| 583 | case 4: { | 589 | case 4: { |
| 584 | C = 22; | 590 | C = 22; |
| 585 | uint dcb = (bitval >> 1) & 7; | 591 | const uint dcb = (bitval >> 1) & 7; |
| 586 | B = (dcb << 6) | dcb; | 592 | B = (dcb << 6) | dcb; |
| 587 | break; | 593 | break; |
| 588 | } | 594 | } |
| 589 | case 5: { | 595 | case 5: { |
| 590 | C = 11; | 596 | C = 11; |
| 591 | uint edcb = (bitval >> 1) & 0xF; | 597 | const uint edcb = (bitval >> 1) & 0xF; |
| 592 | B = (edcb << 5) | (edcb >> 2); | 598 | B = (edcb << 5) | (edcb >> 2); |
| 593 | break; | 599 | break; |
| 594 | } | 600 | } |
| 595 | case 6: { | 601 | case 6: { |
| 596 | C = 5; | 602 | C = 5; |
| 597 | uint fedcb = (bitval >> 1) & 0x1F; | 603 | const uint fedcb = (bitval >> 1) & 0x1F; |
| 598 | B = (fedcb << 4) | (fedcb >> 4); | 604 | B = (fedcb << 4) | (fedcb >> 4); |
| 599 | break; | 605 | break; |
| 600 | } | 606 | } |
| @@ -609,25 +615,25 @@ void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits) { | |||
| 609 | break; | 615 | break; |
| 610 | case 2: { | 616 | case 2: { |
| 611 | C = 54; | 617 | C = 54; |
| 612 | uint b = (bitval >> 1) & 1; | 618 | const uint b = (bitval >> 1) & 1; |
| 613 | B = (b << 8) | (b << 3) | (b << 2); | 619 | B = (b << 8) | (b << 3) | (b << 2); |
| 614 | break; | 620 | break; |
| 615 | } | 621 | } |
| 616 | case 3: { | 622 | case 3: { |
| 617 | C = 26; | 623 | C = 26; |
| 618 | uint cb = (bitval >> 1) & 3; | 624 | const uint cb = (bitval >> 1) & 3; |
| 619 | B = (cb << 7) | (cb << 1) | (cb >> 1); | 625 | B = (cb << 7) | (cb << 1) | (cb >> 1); |
| 620 | break; | 626 | break; |
| 621 | } | 627 | } |
| 622 | case 4: { | 628 | case 4: { |
| 623 | C = 13; | 629 | C = 13; |
| 624 | uint dcb = (bitval >> 1) & 7; | 630 | const uint dcb = (bitval >> 1) & 7; |
| 625 | B = (dcb << 6) | (dcb >> 1); | 631 | B = (dcb << 6) | (dcb >> 1); |
| 626 | break; | 632 | break; |
| 627 | } | 633 | } |
| 628 | case 5: { | 634 | case 5: { |
| 629 | C = 6; | 635 | C = 6; |
| 630 | uint edcb = (bitval >> 1) & 0xF; | 636 | const uint edcb = (bitval >> 1) & 0xF; |
| 631 | B = (edcb << 5) | (edcb >> 3); | 637 | B = (edcb << 5) | (edcb >> 3); |
| 632 | break; | 638 | break; |
| 633 | } | 639 | } |
| @@ -639,7 +645,8 @@ void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits) { | |||
| 639 | uint T = (D * C) + B; | 645 | uint T = (D * C) + B; |
| 640 | T ^= A; | 646 | T ^= A; |
| 641 | T = (A & 0x80) | (T >> 2); | 647 | T = (A & 0x80) | (T >> 2); |
| 642 | color_values[out_index++] = T; | 648 | color_values[out_index / 4][out_index % 4] = T; |
| 649 | ++out_index; | ||
| 643 | } | 650 | } |
| 644 | } | 651 | } |
| 645 | } | 652 | } |
| @@ -657,25 +664,30 @@ ivec2 BitTransferSigned(int a, int b) { | |||
| 657 | } | 664 | } |
| 658 | 665 | ||
| 659 | uvec4 ClampByte(ivec4 color) { | 666 | uvec4 ClampByte(ivec4 color) { |
| 660 | const uvec4 clamped = uvec4(clamp(color, 0, 255)); | 667 | for (uint i = 0; i < 4; ++i) { |
| 661 | return clamped; | 668 | color[i] = clamp(color[i], 0, 255); |
| 669 | } | ||
| 670 | return uvec4(color); | ||
| 662 | } | 671 | } |
| 663 | 672 | ||
| 664 | ivec4 BlueContract(int a, int r, int g, int b) { | 673 | ivec4 BlueContract(int a, int r, int g, int b) { |
| 665 | return ivec4(a, (r + b) >> 1, (g + b) >> 1, b); | 674 | return ivec4(a, (r + b) >> 1, (g + b) >> 1, b); |
| 666 | } | 675 | } |
| 667 | 676 | ||
| 668 | void ComputeEndpoints(out uvec4 ep1, out uvec4 ep2, uint color_endpoint_mode) { | 677 | void ComputeEndpoints(out uvec4 ep1, out uvec4 ep2, uint color_endpoint_mode, |
| 678 | in uvec4 color_values[8], inout uint colvals_index) { | ||
| 669 | #define READ_UINT_VALUES(N) \ | 679 | #define READ_UINT_VALUES(N) \ |
| 670 | uint v[N]; \ | 680 | uint v[N]; \ |
| 671 | for (uint i = 0; i < N; i++) { \ | 681 | for (uint i = 0; i < N; i++) { \ |
| 672 | v[i] = color_values[colvals_index++]; \ | 682 | v[i] = color_values[colvals_index / 4][colvals_index % 4]; \ |
| 683 | ++colvals_index; \ | ||
| 673 | } | 684 | } |
| 674 | 685 | ||
| 675 | #define READ_INT_VALUES(N) \ | 686 | #define READ_INT_VALUES(N) \ |
| 676 | int v[N]; \ | 687 | int v[N]; \ |
| 677 | for (uint i = 0; i < N; i++) { \ | 688 | for (uint i = 0; i < N; i++) { \ |
| 678 | v[i] = int(color_values[colvals_index++]); \ | 689 | v[i] = int(color_values[colvals_index / 4][colvals_index % 4]); \ |
| 690 | ++colvals_index; \ | ||
| 679 | } | 691 | } |
| 680 | 692 | ||
| 681 | switch (color_endpoint_mode) { | 693 | switch (color_endpoint_mode) { |
| @@ -687,8 +699,8 @@ void ComputeEndpoints(out uvec4 ep1, out uvec4 ep2, uint color_endpoint_mode) { | |||
| 687 | } | 699 | } |
| 688 | case 1: { | 700 | case 1: { |
| 689 | READ_UINT_VALUES(2) | 701 | READ_UINT_VALUES(2) |
| 690 | uint L0 = (v[0] >> 2) | (v[1] & 0xC0); | 702 | const uint L0 = (v[0] >> 2) | (v[1] & 0xC0); |
| 691 | uint L1 = min(L0 + (v[1] & 0x3F), 0xFFU); | 703 | const uint L1 = min(L0 + (v[1] & 0x3F), 0xFFU); |
| 692 | ep1 = uvec4(0xFF, L0, L0, L0); | 704 | ep1 = uvec4(0xFF, L0, L0, L0); |
| 693 | ep2 = uvec4(0xFF, L1, L1, L1); | 705 | ep2 = uvec4(0xFF, L1, L1, L1); |
| 694 | break; | 706 | break; |
| @@ -817,7 +829,7 @@ uint UnquantizeTexelWeight(EncodingData val) { | |||
| 817 | D = QuintTritValue(val); | 829 | D = QuintTritValue(val); |
| 818 | switch (bitlen) { | 830 | switch (bitlen) { |
| 819 | case 0: { | 831 | case 0: { |
| 820 | uint results[3] = {0, 32, 63}; | 832 | const uint results[3] = {0, 32, 63}; |
| 821 | result = results[D]; | 833 | result = results[D]; |
| 822 | break; | 834 | break; |
| 823 | } | 835 | } |
| @@ -827,13 +839,13 @@ uint UnquantizeTexelWeight(EncodingData val) { | |||
| 827 | } | 839 | } |
| 828 | case 2: { | 840 | case 2: { |
| 829 | C = 23; | 841 | C = 23; |
| 830 | uint b = (bitval >> 1) & 1; | 842 | const uint b = (bitval >> 1) & 1; |
| 831 | B = (b << 6) | (b << 2) | b; | 843 | B = (b << 6) | (b << 2) | b; |
| 832 | break; | 844 | break; |
| 833 | } | 845 | } |
| 834 | case 3: { | 846 | case 3: { |
| 835 | C = 11; | 847 | C = 11; |
| 836 | uint cb = (bitval >> 1) & 3; | 848 | const uint cb = (bitval >> 1) & 3; |
| 837 | B = (cb << 5) | cb; | 849 | B = (cb << 5) | cb; |
| 838 | break; | 850 | break; |
| 839 | } | 851 | } |
| @@ -846,7 +858,7 @@ uint UnquantizeTexelWeight(EncodingData val) { | |||
| 846 | D = QuintTritValue(val); | 858 | D = QuintTritValue(val); |
| 847 | switch (bitlen) { | 859 | switch (bitlen) { |
| 848 | case 0: { | 860 | case 0: { |
| 849 | uint results[5] = {0, 16, 32, 47, 63}; | 861 | const uint results[5] = {0, 16, 32, 47, 63}; |
| 850 | result = results[D]; | 862 | result = results[D]; |
| 851 | break; | 863 | break; |
| 852 | } | 864 | } |
| @@ -856,7 +868,7 @@ uint UnquantizeTexelWeight(EncodingData val) { | |||
| 856 | } | 868 | } |
| 857 | case 2: { | 869 | case 2: { |
| 858 | C = 13; | 870 | C = 13; |
| 859 | uint b = (bitval >> 1) & 1; | 871 | const uint b = (bitval >> 1) & 1; |
| 860 | B = (b << 6) | (b << 1); | 872 | B = (b << 6) | (b << 1); |
| 861 | break; | 873 | break; |
| 862 | } | 874 | } |
| @@ -875,15 +887,18 @@ uint UnquantizeTexelWeight(EncodingData val) { | |||
| 875 | return result; | 887 | return result; |
| 876 | } | 888 | } |
| 877 | 889 | ||
| 878 | void UnquantizeTexelWeights(bool is_dual_plane, uvec2 size, out uint unquantized_texel_weights[2 * 144]) { | 890 | void UnquantizeTexelWeights(uvec2 size, bool is_dual_plane, |
| 891 | out uvec4 unquantized_texel_weights[VECTOR_ARRAY_SIZE]) { | ||
| 879 | const uint Ds = uint((block_dims.x * 0.5f + 1024) / (block_dims.x - 1)); | 892 | const uint Ds = uint((block_dims.x * 0.5f + 1024) / (block_dims.x - 1)); |
| 880 | const uint Dt = uint((block_dims.y * 0.5f + 1024) / (block_dims.y - 1)); | 893 | const uint Dt = uint((block_dims.y * 0.5f + 1024) / (block_dims.y - 1)); |
| 881 | const uint num_planes = is_dual_plane ? 2 : 1; | 894 | const uint num_planes = is_dual_plane ? 2 : 1; |
| 882 | const uint area = size.x * size.y; | 895 | const uint area = size.x * size.y; |
| 883 | const uint loop_count = min(result_index, area * num_planes); | 896 | const uint loop_count = min(result_index, area * num_planes); |
| 884 | uint unquantized[2 * 144]; | ||
| 885 | for (uint itr = 0; itr < loop_count; ++itr) { | 897 | for (uint itr = 0; itr < loop_count; ++itr) { |
| 886 | unquantized[itr] = UnquantizeTexelWeight(result_vector[itr]); | 898 | const uint array_index = itr / 4; |
| 899 | const uint vector_index = itr % 4; | ||
| 900 | result_vector[array_index][vector_index] = | ||
| 901 | UnquantizeTexelWeight(GetEncodingFromVector(itr)); | ||
| 887 | } | 902 | } |
| 888 | for (uint plane = 0; plane < num_planes; ++plane) { | 903 | for (uint plane = 0; plane < num_planes; ++plane) { |
| 889 | for (uint t = 0; t < block_dims.y; t++) { | 904 | for (uint t = 0; t < block_dims.y; t++) { |
| @@ -907,28 +922,33 @@ void UnquantizeTexelWeights(bool is_dual_plane, uvec2 size, out uint unquantized | |||
| 907 | 922 | ||
| 908 | #define VectorIndicesFromBase(offset_base) \ | 923 | #define VectorIndicesFromBase(offset_base) \ |
| 909 | const uint offset = is_dual_plane ? 2 * offset_base + plane : offset_base; \ | 924 | const uint offset = is_dual_plane ? 2 * offset_base + plane : offset_base; \ |
| 925 | const uint array_index = offset / 4; \ | ||
| 926 | const uint vector_index = offset % 4; | ||
| 910 | 927 | ||
| 911 | if (v0 < area) { | 928 | if (v0 < area) { |
| 912 | const uint offset_base = v0; | 929 | const uint offset_base = v0; |
| 913 | VectorIndicesFromBase(offset_base); | 930 | VectorIndicesFromBase(offset_base); |
| 914 | p.x = unquantized[offset]; | 931 | p.x = result_vector[array_index][vector_index]; |
| 915 | } | 932 | } |
| 916 | if ((v0 + 1) < (area)) { | 933 | if ((v0 + 1) < (area)) { |
| 917 | const uint offset_base = v0 + 1; | 934 | const uint offset_base = v0 + 1; |
| 918 | VectorIndicesFromBase(offset_base); | 935 | VectorIndicesFromBase(offset_base); |
| 919 | p.y = unquantized[offset]; | 936 | p.y = result_vector[array_index][vector_index]; |
| 920 | } | 937 | } |
| 921 | if ((v0 + size.x) < (area)) { | 938 | if ((v0 + size.x) < (area)) { |
| 922 | const uint offset_base = v0 + size.x; | 939 | const uint offset_base = v0 + size.x; |
| 923 | VectorIndicesFromBase(offset_base); | 940 | VectorIndicesFromBase(offset_base); |
| 924 | p.z = unquantized[offset]; | 941 | p.z = result_vector[array_index][vector_index]; |
| 925 | } | 942 | } |
| 926 | if ((v0 + size.x + 1) < (area)) { | 943 | if ((v0 + size.x + 1) < (area)) { |
| 927 | const uint offset_base = v0 + size.x + 1; | 944 | const uint offset_base = v0 + size.x + 1; |
| 928 | VectorIndicesFromBase(offset_base); | 945 | VectorIndicesFromBase(offset_base); |
| 929 | p.w = unquantized[offset]; | 946 | p.w = result_vector[array_index][vector_index]; |
| 930 | } | 947 | } |
| 931 | unquantized_texel_weights[plane * 144 + t * block_dims.x + s] = (uint(dot(p, w)) + 8) >> 4; | 948 | const uint offset = (t * block_dims.x + s) + ARRAY_NUM_ELEMENTS * plane; |
| 949 | const uint array_index = offset / 4; | ||
| 950 | const uint vector_index = offset % 4; | ||
| 951 | unquantized_texel_weights[array_index][vector_index] = (uint(dot(p, w)) + 8) >> 4; | ||
| 932 | } | 952 | } |
| 933 | } | 953 | } |
| 934 | } | 954 | } |
| @@ -1050,6 +1070,7 @@ TexelWeightParams DecodeBlockInfo() { | |||
| 1050 | weight_index += 6; | 1070 | weight_index += 6; |
| 1051 | } | 1071 | } |
| 1052 | params.max_weight = weight_index + 1; | 1072 | params.max_weight = weight_index + 1; |
| 1073 | |||
| 1053 | return params; | 1074 | return params; |
| 1054 | } | 1075 | } |
| 1055 | 1076 | ||
| @@ -1079,7 +1100,7 @@ void FillVoidExtentLDR(ivec3 coord) { | |||
| 1079 | } | 1100 | } |
| 1080 | 1101 | ||
| 1081 | void DecompressBlock(ivec3 coord) { | 1102 | void DecompressBlock(ivec3 coord) { |
| 1082 | TexelWeightParams params = DecodeBlockInfo(); | 1103 | const TexelWeightParams params = DecodeBlockInfo(); |
| 1083 | if (params.error_state) { | 1104 | if (params.error_state) { |
| 1084 | FillError(coord); | 1105 | FillError(coord); |
| 1085 | return; | 1106 | return; |
| @@ -1096,12 +1117,11 @@ void DecompressBlock(ivec3 coord) { | |||
| 1096 | FillError(coord); | 1117 | FillError(coord); |
| 1097 | return; | 1118 | return; |
| 1098 | } | 1119 | } |
| 1099 | uint num_partitions = StreamBits(2) + 1; | 1120 | const uint num_partitions = StreamBits(2) + 1; |
| 1100 | if (num_partitions > 4 || (num_partitions == 4 && params.dual_plane)) { | 1121 | if (num_partitions > 4 || (num_partitions == 4 && params.dual_plane)) { |
| 1101 | FillError(coord); | 1122 | FillError(coord); |
| 1102 | return; | 1123 | return; |
| 1103 | } | 1124 | } |
| 1104 | int plane_index = -1; | ||
| 1105 | uint partition_index = 1; | 1125 | uint partition_index = 1; |
| 1106 | uvec4 color_endpoint_mode = uvec4(0); | 1126 | uvec4 color_endpoint_mode = uvec4(0); |
| 1107 | uint ced_pointer = 0; | 1127 | uint ced_pointer = 0; |
| @@ -1113,8 +1133,8 @@ void DecompressBlock(ivec3 coord) { | |||
| 1113 | partition_index = StreamBits(10); | 1133 | partition_index = StreamBits(10); |
| 1114 | base_cem = StreamBits(6); | 1134 | base_cem = StreamBits(6); |
| 1115 | } | 1135 | } |
| 1116 | uint base_mode = base_cem & 3; | 1136 | const uint base_mode = base_cem & 3; |
| 1117 | uint weight_bits = GetPackedBitSize(params.size, params.dual_plane, params.max_weight); | 1137 | const uint weight_bits = GetPackedBitSize(params.size, params.dual_plane, params.max_weight); |
| 1118 | uint remaining_bits = 128 - weight_bits - total_bitsread; | 1138 | uint remaining_bits = 128 - weight_bits - total_bitsread; |
| 1119 | uint extra_cem_bits = 0; | 1139 | uint extra_cem_bits = 0; |
| 1120 | if (base_mode > 0) { | 1140 | if (base_mode > 0) { |
| @@ -1133,10 +1153,7 @@ void DecompressBlock(ivec3 coord) { | |||
| 1133 | } | 1153 | } |
| 1134 | } | 1154 | } |
| 1135 | remaining_bits -= extra_cem_bits; | 1155 | remaining_bits -= extra_cem_bits; |
| 1136 | uint plane_selector_bits = 0; | 1156 | const uint plane_selector_bits = params.dual_plane ? 2 : 0; |
| 1137 | if (params.dual_plane) { | ||
| 1138 | plane_selector_bits = 2; | ||
| 1139 | } | ||
| 1140 | remaining_bits -= plane_selector_bits; | 1157 | remaining_bits -= plane_selector_bits; |
| 1141 | if (remaining_bits > 128) { | 1158 | if (remaining_bits > 128) { |
| 1142 | // Bad data, more remaining bits than 16 bytes (128 bits) | 1159 | // Bad data, more remaining bits than 16 bytes (128 bits) |
| @@ -1144,17 +1161,17 @@ void DecompressBlock(ivec3 coord) { | |||
| 1144 | return; | 1161 | return; |
| 1145 | } | 1162 | } |
| 1146 | // Read color data... | 1163 | // Read color data... |
| 1147 | uint color_data_bits = remaining_bits; | 1164 | const uint color_data_bits = remaining_bits; |
| 1148 | while (remaining_bits > 0) { | 1165 | while (remaining_bits > 0) { |
| 1149 | int nb = int(min(remaining_bits, 32U)); | 1166 | const int nb = int(min(remaining_bits, 32U)); |
| 1150 | uint b = StreamBits(nb); | 1167 | const uint b = StreamBits(nb); |
| 1151 | color_endpoint_data[ced_pointer] = uint(bitfieldExtract(b, 0, nb)); | 1168 | color_endpoint_data[ced_pointer] = uint(bitfieldExtract(b, 0, nb)); |
| 1152 | ++ced_pointer; | 1169 | ++ced_pointer; |
| 1153 | remaining_bits -= nb; | 1170 | remaining_bits -= nb; |
| 1154 | } | 1171 | } |
| 1155 | plane_index = int(StreamBits(plane_selector_bits)); | 1172 | const uint plane_index = uint(StreamBits(plane_selector_bits)); |
| 1156 | if (base_mode > 0) { | 1173 | if (base_mode > 0) { |
| 1157 | uint extra_cem = StreamBits(extra_cem_bits); | 1174 | const uint extra_cem = StreamBits(extra_cem_bits); |
| 1158 | uint cem = (extra_cem << 6) | base_cem; | 1175 | uint cem = (extra_cem << 6) | base_cem; |
| 1159 | cem >>= 2; | 1176 | cem >>= 2; |
| 1160 | uvec4 C = uvec4(0); | 1177 | uvec4 C = uvec4(0); |
| @@ -1176,43 +1193,54 @@ void DecompressBlock(ivec3 coord) { | |||
| 1176 | color_endpoint_mode[i] |= M[i]; | 1193 | color_endpoint_mode[i] |= M[i]; |
| 1177 | } | 1194 | } |
| 1178 | } else if (num_partitions > 1) { | 1195 | } else if (num_partitions > 1) { |
| 1179 | uint cem = base_cem >> 2; | 1196 | const uint cem = base_cem >> 2; |
| 1180 | for (uint i = 0; i < num_partitions; i++) { | 1197 | for (uint i = 0; i < num_partitions; i++) { |
| 1181 | color_endpoint_mode[i] = cem; | 1198 | color_endpoint_mode[i] = cem; |
| 1182 | } | 1199 | } |
| 1183 | } | 1200 | } |
| 1184 | DecodeColorValues(color_endpoint_mode, num_partitions, color_data_bits); | ||
| 1185 | 1201 | ||
| 1186 | uvec4 endpoints[4][2]; | 1202 | uvec4 endpoints0[4]; |
| 1187 | for (uint i = 0; i < num_partitions; i++) { | 1203 | uvec4 endpoints1[4]; |
| 1188 | ComputeEndpoints(endpoints[i][0], endpoints[i][1], color_endpoint_mode[i]); | 1204 | { |
| 1189 | } | 1205 | // This decode phase should at most push 32 elements into the vector |
| 1206 | result_vector_max_index = 32; | ||
| 1190 | 1207 | ||
| 1208 | uvec4 color_values[8]; | ||
| 1209 | uint colvals_index = 0; | ||
| 1210 | DecodeColorValues(color_endpoint_mode, num_partitions, color_data_bits, color_values); | ||
| 1211 | for (uint i = 0; i < num_partitions; i++) { | ||
| 1212 | ComputeEndpoints(endpoints0[i], endpoints1[i], color_endpoint_mode[i], color_values, | ||
| 1213 | colvals_index); | ||
| 1214 | } | ||
| 1215 | } | ||
| 1191 | color_endpoint_data = local_buff; | 1216 | color_endpoint_data = local_buff; |
| 1192 | color_endpoint_data = bitfieldReverse(color_endpoint_data).wzyx; | 1217 | color_endpoint_data = bitfieldReverse(color_endpoint_data).wzyx; |
| 1193 | uint clear_byte_start = | 1218 | const uint clear_byte_start = (weight_bits >> 3) + 1; |
| 1194 | (GetPackedBitSize(params.size, params.dual_plane, params.max_weight) >> 3) + 1; | 1219 | |
| 1195 | 1220 | const uint byte_insert = ExtractBits(color_endpoint_data, int(clear_byte_start - 1) * 8, 8) & | |
| 1196 | uint byte_insert = ExtractBits(color_endpoint_data, int(clear_byte_start - 1) * 8, 8) & | 1221 | uint(((1 << (weight_bits % 8)) - 1)); |
| 1197 | uint( | 1222 | const uint vec_index = (clear_byte_start - 1) >> 2; |
| 1198 | ((1 << (GetPackedBitSize(params.size, params.dual_plane, params.max_weight) % 8)) - 1)); | 1223 | color_endpoint_data[vec_index] = bitfieldInsert(color_endpoint_data[vec_index], byte_insert, |
| 1199 | uint vec_index = (clear_byte_start - 1) >> 2; | 1224 | int((clear_byte_start - 1) % 4) * 8, 8); |
| 1200 | color_endpoint_data[vec_index] = | ||
| 1201 | bitfieldInsert(color_endpoint_data[vec_index], byte_insert, int((clear_byte_start - 1) % 4) * 8, 8); | ||
| 1202 | for (uint i = clear_byte_start; i < 16; ++i) { | 1225 | for (uint i = clear_byte_start; i < 16; ++i) { |
| 1203 | uint idx = i >> 2; | 1226 | const uint idx = i >> 2; |
| 1204 | color_endpoint_data[idx] = bitfieldInsert(color_endpoint_data[idx], 0, int(i % 4) * 8, 8); | 1227 | color_endpoint_data[idx] = bitfieldInsert(color_endpoint_data[idx], 0, int(i % 4) * 8, 8); |
| 1205 | } | 1228 | } |
| 1206 | 1229 | ||
| 1207 | // Re-init vector variables for next decode phase | 1230 | // Re-init vector variables for next decode phase |
| 1208 | result_index = 0; | 1231 | result_index = 0; |
| 1209 | color_bitsread = 0; | 1232 | color_bitsread = 0; |
| 1233 | result_limit_reached = false; | ||
| 1210 | 1234 | ||
| 1235 | // The limit for the Unquantize phase, avoids decoding more data than needed. | ||
| 1236 | result_vector_max_index = params.size.x * params.size.y; | ||
| 1237 | if (params.dual_plane) { | ||
| 1238 | result_vector_max_index *= 2; | ||
| 1239 | } | ||
| 1211 | DecodeIntegerSequence(params.max_weight, GetNumWeightValues(params.size, params.dual_plane)); | 1240 | DecodeIntegerSequence(params.max_weight, GetNumWeightValues(params.size, params.dual_plane)); |
| 1212 | 1241 | ||
| 1213 | uint unquantized_texel_weights[2 * 144]; | 1242 | uvec4 unquantized_texel_weights[VECTOR_ARRAY_SIZE]; |
| 1214 | UnquantizeTexelWeights(params.dual_plane, params.size, unquantized_texel_weights); | 1243 | UnquantizeTexelWeights(params.size, params.dual_plane, unquantized_texel_weights); |
| 1215 | |||
| 1216 | for (uint j = 0; j < block_dims.y; j++) { | 1244 | for (uint j = 0; j < block_dims.y; j++) { |
| 1217 | for (uint i = 0; i < block_dims.x; i++) { | 1245 | for (uint i = 0; i < block_dims.x; i++) { |
| 1218 | uint local_partition = 0; | 1246 | uint local_partition = 0; |
| @@ -1220,13 +1248,19 @@ void DecompressBlock(ivec3 coord) { | |||
| 1220 | local_partition = Select2DPartition(partition_index, i, j, num_partitions, | 1248 | local_partition = Select2DPartition(partition_index, i, j, num_partitions, |
| 1221 | (block_dims.y * block_dims.x) < 32); | 1249 | (block_dims.y * block_dims.x) < 32); |
| 1222 | } | 1250 | } |
| 1223 | const uvec4 C0 = ReplicateByteTo16(endpoints[local_partition][0]); | 1251 | const uvec4 C0 = ReplicateByteTo16(endpoints0[local_partition]); |
| 1224 | const uvec4 C1 = ReplicateByteTo16(endpoints[local_partition][1]); | 1252 | const uvec4 C1 = ReplicateByteTo16(endpoints1[local_partition]); |
| 1225 | const uint weight_offset = (j * block_dims.x + i); | 1253 | const uint weight_offset = (j * block_dims.x + i); |
| 1226 | const uint primary_weight = unquantized_texel_weights[weight_offset]; | 1254 | const uint array_index = weight_offset / 4; |
| 1255 | const uint vector_index = weight_offset % 4; | ||
| 1256 | const uint primary_weight = unquantized_texel_weights[array_index][vector_index]; | ||
| 1227 | uvec4 weight_vec = uvec4(primary_weight); | 1257 | uvec4 weight_vec = uvec4(primary_weight); |
| 1228 | if (params.dual_plane) { | 1258 | if (params.dual_plane) { |
| 1229 | const uint secondary_weight = unquantized_texel_weights[weight_offset + 144]; | 1259 | const uint secondary_weight_offset = (j * block_dims.x + i) + ARRAY_NUM_ELEMENTS; |
| 1260 | const uint secondary_array_index = secondary_weight_offset / 4; | ||
| 1261 | const uint secondary_vector_index = secondary_weight_offset % 4; | ||
| 1262 | const uint secondary_weight = | ||
| 1263 | unquantized_texel_weights[secondary_array_index][secondary_vector_index]; | ||
| 1230 | for (uint c = 0; c < 4; c++) { | 1264 | for (uint c = 0; c < 4; c++) { |
| 1231 | const bool is_secondary = ((plane_index + 1u) & 3u) == c; | 1265 | const bool is_secondary = ((plane_index + 1u) & 3u) == c; |
| 1232 | weight_vec[c] = is_secondary ? secondary_weight : primary_weight; | 1266 | weight_vec[c] = is_secondary ? secondary_weight : primary_weight; |
| @@ -1240,12 +1274,11 @@ void DecompressBlock(ivec3 coord) { | |||
| 1240 | } | 1274 | } |
| 1241 | } | 1275 | } |
| 1242 | 1276 | ||
| 1243 | |||
| 1244 | uint SwizzleOffset(uvec2 pos) { | 1277 | uint SwizzleOffset(uvec2 pos) { |
| 1245 | uint x = pos.x; | 1278 | const uint x = pos.x; |
| 1246 | uint y = pos.y; | 1279 | const uint y = pos.y; |
| 1247 | return ((x % 64) / 32) * 256 + ((y % 8) / 2) * 64 + ((x % 32) / 16) * 32 + | 1280 | return ((x % 64) / 32) * 256 + ((y % 8) / 2) * 64 + |
| 1248 | (y % 2) * 16 + (x % 16); | 1281 | ((x % 32) / 16) * 32 + (y % 2) * 16 + (x % 16); |
| 1249 | } | 1282 | } |
| 1250 | 1283 | ||
| 1251 | void main() { | 1284 | void main() { |