summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGravatar ameerj2021-06-13 15:15:08 -0400
committerGravatar ameerj2021-06-15 20:19:00 -0400
commitc4ff7ecf511edb8adc2f2d8eff9d51212a87dc6b (patch)
tree1d8b154c9e2ab468af6f867be6fe93a632a50428
parentMerge pull request #6470 from ameerj/lm-silence (diff)
downloadyuzu-c4ff7ecf511edb8adc2f2d8eff9d51212a87dc6b.tar.gz
yuzu-c4ff7ecf511edb8adc2f2d8eff9d51212a87dc6b.tar.xz
yuzu-c4ff7ecf511edb8adc2f2d8eff9d51212a87dc6b.zip
textures: Reintroduce CPU ASTC decoder
Users may want to fall back to the CPU ASTC texture decoder due to hangs and crashes that may be caused by keeping the GPU under compute heavy loads for extended periods of time. This is especially the case in games such as Astral Chain which make extensive use of ASTC textures.
Diffstat (limited to '')
-rw-r--r--src/video_core/CMakeLists.txt1
-rw-r--r--src/video_core/texture_cache/util.cpp13
-rw-r--r--src/video_core/textures/astc.cpp1577
-rw-r--r--src/video_core/textures/astc.h3
4 files changed, 1592 insertions, 2 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 47190c464..f9454bbaa 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -237,6 +237,7 @@ add_library(video_core STATIC
237 texture_cache/util.cpp 237 texture_cache/util.cpp
238 texture_cache/util.h 238 texture_cache/util.h
239 textures/astc.h 239 textures/astc.h
240 textures/astc.cpp
240 textures/decoders.cpp 241 textures/decoders.cpp
241 textures/decoders.h 242 textures/decoders.h
242 textures/texture.cpp 243 textures/texture.cpp
diff --git a/src/video_core/texture_cache/util.cpp b/src/video_core/texture_cache/util.cpp
index 906604a39..0d3e0804f 100644
--- a/src/video_core/texture_cache/util.cpp
+++ b/src/video_core/texture_cache/util.cpp
@@ -47,6 +47,7 @@
47#include "video_core/texture_cache/formatter.h" 47#include "video_core/texture_cache/formatter.h"
48#include "video_core/texture_cache/samples_helper.h" 48#include "video_core/texture_cache/samples_helper.h"
49#include "video_core/texture_cache/util.h" 49#include "video_core/texture_cache/util.h"
50#include "video_core/textures/astc.h"
50#include "video_core/textures/decoders.h" 51#include "video_core/textures/decoders.h"
51 52
52namespace VideoCommon { 53namespace VideoCommon {
@@ -884,8 +885,16 @@ void ConvertImage(std::span<const u8> input, const ImageInfo& info, std::span<u8
884 ASSERT(copy.image_extent == mip_size); 885 ASSERT(copy.image_extent == mip_size);
885 ASSERT(copy.buffer_row_length == Common::AlignUp(mip_size.width, tile_size.width)); 886 ASSERT(copy.buffer_row_length == Common::AlignUp(mip_size.width, tile_size.width));
886 ASSERT(copy.buffer_image_height == Common::AlignUp(mip_size.height, tile_size.height)); 887 ASSERT(copy.buffer_image_height == Common::AlignUp(mip_size.height, tile_size.height));
887 DecompressBC4(input.subspan(copy.buffer_offset), copy.image_extent, 888 if (IsPixelFormatASTC(info.format)) {
888 output.subspan(output_offset)); 889 ASSERT(copy.image_extent.depth == 1);
890 Tegra::Texture::ASTC::Decompress(input.subspan(copy.buffer_offset),
891 copy.image_extent.width, copy.image_extent.height,
892 copy.image_subresource.num_layers, tile_size.width,
893 tile_size.height, output.subspan(output_offset));
894 } else {
895 DecompressBC4(input.subspan(copy.buffer_offset), copy.image_extent,
896 output.subspan(output_offset));
897 }
889 copy.buffer_offset = output_offset; 898 copy.buffer_offset = output_offset;
890 copy.buffer_row_length = mip_size.width; 899 copy.buffer_row_length = mip_size.width;
891 copy.buffer_image_height = mip_size.height; 900 copy.buffer_image_height = mip_size.height;
diff --git a/src/video_core/textures/astc.cpp b/src/video_core/textures/astc.cpp
new file mode 100644
index 000000000..6079aa709
--- /dev/null
+++ b/src/video_core/textures/astc.cpp
@@ -0,0 +1,1577 @@
1// Copyright 2016 The University of North Carolina at Chapel Hill
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14//
15// Please send all BUG REPORTS to <pavel@cs.unc.edu>.
16// <http://gamma.cs.unc.edu/FasTC/>
17
18#include <algorithm>
19#include <cassert>
20#include <cstring>
21#include <span>
22#include <vector>
23
24#include <boost/container/static_vector.hpp>
25
26#include "common/common_types.h"
27#include "video_core/textures/astc.h"
28
29class InputBitStream {
30public:
31 constexpr explicit InputBitStream(std::span<const u8> data, size_t start_offset = 0)
32 : cur_byte{data.data()}, total_bits{data.size()}, next_bit{start_offset % 8} {}
33
34 constexpr size_t GetBitsRead() const {
35 return bits_read;
36 }
37
38 constexpr bool ReadBit() {
39 if (bits_read >= total_bits * 8) {
40 return 0;
41 }
42 const bool bit = ((*cur_byte >> next_bit) & 1) != 0;
43 ++next_bit;
44 while (next_bit >= 8) {
45 next_bit -= 8;
46 ++cur_byte;
47 }
48 ++bits_read;
49 return bit;
50 }
51
52 constexpr u32 ReadBits(std::size_t nBits) {
53 u32 ret = 0;
54 for (std::size_t i = 0; i < nBits; ++i) {
55 ret |= (ReadBit() & 1) << i;
56 }
57 return ret;
58 }
59
60 template <std::size_t nBits>
61 constexpr u32 ReadBits() {
62 u32 ret = 0;
63 for (std::size_t i = 0; i < nBits; ++i) {
64 ret |= (ReadBit() & 1) << i;
65 }
66 return ret;
67 }
68
69private:
70 const u8* cur_byte;
71 size_t total_bits = 0;
72 size_t next_bit = 0;
73 size_t bits_read = 0;
74};
75
76class OutputBitStream {
77public:
78 constexpr explicit OutputBitStream(u8* ptr, std::size_t bits = 0, std::size_t start_offset = 0)
79 : cur_byte{ptr}, num_bits{bits}, next_bit{start_offset % 8} {}
80
81 constexpr std::size_t GetBitsWritten() const {
82 return bits_written;
83 }
84
85 constexpr void WriteBitsR(u32 val, u32 nBits) {
86 for (u32 i = 0; i < nBits; i++) {
87 WriteBit((val >> (nBits - i - 1)) & 1);
88 }
89 }
90
91 constexpr void WriteBits(u32 val, u32 nBits) {
92 for (u32 i = 0; i < nBits; i++) {
93 WriteBit((val >> i) & 1);
94 }
95 }
96
97private:
98 constexpr void WriteBit(bool b) {
99 if (bits_written >= num_bits) {
100 return;
101 }
102
103 const u32 mask = 1 << next_bit++;
104
105 // clear the bit
106 *cur_byte &= static_cast<u8>(~mask);
107
108 // Write the bit, if necessary
109 if (b)
110 *cur_byte |= static_cast<u8>(mask);
111
112 // Next byte?
113 if (next_bit >= 8) {
114 cur_byte += 1;
115 next_bit = 0;
116 }
117 }
118
119 u8* cur_byte;
120 std::size_t num_bits;
121 std::size_t bits_written = 0;
122 std::size_t next_bit = 0;
123};
124
125template <typename IntType>
126class Bits {
127public:
128 explicit Bits(const IntType& v) : m_Bits(v) {}
129
130 Bits(const Bits&) = delete;
131 Bits& operator=(const Bits&) = delete;
132
133 u8 operator[](u32 bitPos) const {
134 return static_cast<u8>((m_Bits >> bitPos) & 1);
135 }
136
137 IntType operator()(u32 start, u32 end) const {
138 if (start == end) {
139 return (*this)[start];
140 } else if (start > end) {
141 u32 t = start;
142 start = end;
143 end = t;
144 }
145
146 u64 mask = (1 << (end - start + 1)) - 1;
147 return (m_Bits >> start) & static_cast<IntType>(mask);
148 }
149
150private:
151 const IntType& m_Bits;
152};
153
154namespace Tegra::Texture::ASTC {
155using IntegerEncodedVector = boost::container::static_vector<
156 IntegerEncodedValue, 256,
157 boost::container::static_vector_options<
158 boost::container::inplace_alignment<alignof(IntegerEncodedValue)>,
159 boost::container::throw_on_overflow<false>>::type>;
160
161static void DecodeTritBlock(InputBitStream& bits, IntegerEncodedVector& result, u32 nBitsPerValue) {
162 // Implement the algorithm in section C.2.12
163 std::array<u32, 5> m;
164 std::array<u32, 5> t;
165 u32 T;
166
167 // Read the trit encoded block according to
168 // table C.2.14
169 m[0] = bits.ReadBits(nBitsPerValue);
170 T = bits.ReadBits<2>();
171 m[1] = bits.ReadBits(nBitsPerValue);
172 T |= bits.ReadBits<2>() << 2;
173 m[2] = bits.ReadBits(nBitsPerValue);
174 T |= bits.ReadBit() << 4;
175 m[3] = bits.ReadBits(nBitsPerValue);
176 T |= bits.ReadBits<2>() << 5;
177 m[4] = bits.ReadBits(nBitsPerValue);
178 T |= bits.ReadBit() << 7;
179
180 u32 C = 0;
181
182 Bits<u32> Tb(T);
183 if (Tb(2, 4) == 7) {
184 C = (Tb(5, 7) << 2) | Tb(0, 1);
185 t[4] = t[3] = 2;
186 } else {
187 C = Tb(0, 4);
188 if (Tb(5, 6) == 3) {
189 t[4] = 2;
190 t[3] = Tb[7];
191 } else {
192 t[4] = Tb[7];
193 t[3] = Tb(5, 6);
194 }
195 }
196
197 Bits<u32> Cb(C);
198 if (Cb(0, 1) == 3) {
199 t[2] = 2;
200 t[1] = Cb[4];
201 t[0] = (Cb[3] << 1) | (Cb[2] & ~Cb[3]);
202 } else if (Cb(2, 3) == 3) {
203 t[2] = 2;
204 t[1] = 2;
205 t[0] = Cb(0, 1);
206 } else {
207 t[2] = Cb[4];
208 t[1] = Cb(2, 3);
209 t[0] = (Cb[1] << 1) | (Cb[0] & ~Cb[1]);
210 }
211
212 for (std::size_t i = 0; i < 5; ++i) {
213 IntegerEncodedValue& val = result.emplace_back(IntegerEncoding::Trit, nBitsPerValue);
214 val.bit_value = m[i];
215 val.trit_value = t[i];
216 }
217}
218
219static void DecodeQuintBlock(InputBitStream& bits, IntegerEncodedVector& result,
220 u32 nBitsPerValue) {
221 // Implement the algorithm in section C.2.12
222 u32 m[3];
223 u32 q[3];
224 u32 Q;
225
226 // Read the trit encoded block according to
227 // table C.2.15
228 m[0] = bits.ReadBits(nBitsPerValue);
229 Q = bits.ReadBits<3>();
230 m[1] = bits.ReadBits(nBitsPerValue);
231 Q |= bits.ReadBits<2>() << 3;
232 m[2] = bits.ReadBits(nBitsPerValue);
233 Q |= bits.ReadBits<2>() << 5;
234
235 Bits<u32> Qb(Q);
236 if (Qb(1, 2) == 3 && Qb(5, 6) == 0) {
237 q[0] = q[1] = 4;
238 q[2] = (Qb[0] << 2) | ((Qb[4] & ~Qb[0]) << 1) | (Qb[3] & ~Qb[0]);
239 } else {
240 u32 C = 0;
241 if (Qb(1, 2) == 3) {
242 q[2] = 4;
243 C = (Qb(3, 4) << 3) | ((~Qb(5, 6) & 3) << 1) | Qb[0];
244 } else {
245 q[2] = Qb(5, 6);
246 C = Qb(0, 4);
247 }
248
249 Bits<u32> Cb(C);
250 if (Cb(0, 2) == 5) {
251 q[1] = 4;
252 q[0] = Cb(3, 4);
253 } else {
254 q[1] = Cb(3, 4);
255 q[0] = Cb(0, 2);
256 }
257 }
258
259 for (std::size_t i = 0; i < 3; ++i) {
260 IntegerEncodedValue& val = result.emplace_back(IntegerEncoding::Quint, nBitsPerValue);
261 val.bit_value = m[i];
262 val.quint_value = q[i];
263 }
264}
265
266// Fills result with the values that are encoded in the given
267// bitstream. We must know beforehand what the maximum possible
268// value is, and how many values we're decoding.
269static void DecodeIntegerSequence(IntegerEncodedVector& result, InputBitStream& bits, u32 maxRange,
270 u32 nValues) {
271 // Determine encoding parameters
272 IntegerEncodedValue val = EncodingsValues[maxRange];
273
274 // Start decoding
275 u32 nValsDecoded = 0;
276 while (nValsDecoded < nValues) {
277 switch (val.encoding) {
278 case IntegerEncoding::Quint:
279 DecodeQuintBlock(bits, result, val.num_bits);
280 nValsDecoded += 3;
281 break;
282
283 case IntegerEncoding::Trit:
284 DecodeTritBlock(bits, result, val.num_bits);
285 nValsDecoded += 5;
286 break;
287
288 case IntegerEncoding::JustBits:
289 val.bit_value = bits.ReadBits(val.num_bits);
290 result.push_back(val);
291 nValsDecoded++;
292 break;
293 }
294 }
295}
296
297struct TexelWeightParams {
298 u32 m_Width = 0;
299 u32 m_Height = 0;
300 bool m_bDualPlane = false;
301 u32 m_MaxWeight = 0;
302 bool m_bError = false;
303 bool m_bVoidExtentLDR = false;
304 bool m_bVoidExtentHDR = false;
305
306 u32 GetPackedBitSize() const {
307 // How many indices do we have?
308 u32 nIdxs = m_Height * m_Width;
309 if (m_bDualPlane) {
310 nIdxs *= 2;
311 }
312
313 return EncodingsValues[m_MaxWeight].GetBitLength(nIdxs);
314 }
315
316 u32 GetNumWeightValues() const {
317 u32 ret = m_Width * m_Height;
318 if (m_bDualPlane) {
319 ret *= 2;
320 }
321 return ret;
322 }
323};
324
325static TexelWeightParams DecodeBlockInfo(InputBitStream& strm) {
326 TexelWeightParams params;
327
328 // Read the entire block mode all at once
329 u16 modeBits = static_cast<u16>(strm.ReadBits<11>());
330
331 // Does this match the void extent block mode?
332 if ((modeBits & 0x01FF) == 0x1FC) {
333 if (modeBits & 0x200) {
334 params.m_bVoidExtentHDR = true;
335 } else {
336 params.m_bVoidExtentLDR = true;
337 }
338
339 // Next two bits must be one.
340 if (!(modeBits & 0x400) || !strm.ReadBit()) {
341 params.m_bError = true;
342 }
343
344 return params;
345 }
346
347 // First check if the last four bits are zero
348 if ((modeBits & 0xF) == 0) {
349 params.m_bError = true;
350 return params;
351 }
352
353 // If the last two bits are zero, then if bits
354 // [6-8] are all ones, this is also reserved.
355 if ((modeBits & 0x3) == 0 && (modeBits & 0x1C0) == 0x1C0) {
356 params.m_bError = true;
357 return params;
358 }
359
360 // Otherwise, there is no error... Figure out the layout
361 // of the block mode. Layout is determined by a number
362 // between 0 and 9 corresponding to table C.2.8 of the
363 // ASTC spec.
364 u32 layout = 0;
365
366 if ((modeBits & 0x1) || (modeBits & 0x2)) {
367 // layout is in [0-4]
368 if (modeBits & 0x8) {
369 // layout is in [2-4]
370 if (modeBits & 0x4) {
371 // layout is in [3-4]
372 if (modeBits & 0x100) {
373 layout = 4;
374 } else {
375 layout = 3;
376 }
377 } else {
378 layout = 2;
379 }
380 } else {
381 // layout is in [0-1]
382 if (modeBits & 0x4) {
383 layout = 1;
384 } else {
385 layout = 0;
386 }
387 }
388 } else {
389 // layout is in [5-9]
390 if (modeBits & 0x100) {
391 // layout is in [7-9]
392 if (modeBits & 0x80) {
393 // layout is in [7-8]
394 assert((modeBits & 0x40) == 0U);
395 if (modeBits & 0x20) {
396 layout = 8;
397 } else {
398 layout = 7;
399 }
400 } else {
401 layout = 9;
402 }
403 } else {
404 // layout is in [5-6]
405 if (modeBits & 0x80) {
406 layout = 6;
407 } else {
408 layout = 5;
409 }
410 }
411 }
412
413 assert(layout < 10);
414
415 // Determine R
416 u32 R = !!(modeBits & 0x10);
417 if (layout < 5) {
418 R |= (modeBits & 0x3) << 1;
419 } else {
420 R |= (modeBits & 0xC) >> 1;
421 }
422 assert(2 <= R && R <= 7);
423
424 // Determine width & height
425 switch (layout) {
426 case 0: {
427 u32 A = (modeBits >> 5) & 0x3;
428 u32 B = (modeBits >> 7) & 0x3;
429 params.m_Width = B + 4;
430 params.m_Height = A + 2;
431 break;
432 }
433
434 case 1: {
435 u32 A = (modeBits >> 5) & 0x3;
436 u32 B = (modeBits >> 7) & 0x3;
437 params.m_Width = B + 8;
438 params.m_Height = A + 2;
439 break;
440 }
441
442 case 2: {
443 u32 A = (modeBits >> 5) & 0x3;
444 u32 B = (modeBits >> 7) & 0x3;
445 params.m_Width = A + 2;
446 params.m_Height = B + 8;
447 break;
448 }
449
450 case 3: {
451 u32 A = (modeBits >> 5) & 0x3;
452 u32 B = (modeBits >> 7) & 0x1;
453 params.m_Width = A + 2;
454 params.m_Height = B + 6;
455 break;
456 }
457
458 case 4: {
459 u32 A = (modeBits >> 5) & 0x3;
460 u32 B = (modeBits >> 7) & 0x1;
461 params.m_Width = B + 2;
462 params.m_Height = A + 2;
463 break;
464 }
465
466 case 5: {
467 u32 A = (modeBits >> 5) & 0x3;
468 params.m_Width = 12;
469 params.m_Height = A + 2;
470 break;
471 }
472
473 case 6: {
474 u32 A = (modeBits >> 5) & 0x3;
475 params.m_Width = A + 2;
476 params.m_Height = 12;
477 break;
478 }
479
480 case 7: {
481 params.m_Width = 6;
482 params.m_Height = 10;
483 break;
484 }
485
486 case 8: {
487 params.m_Width = 10;
488 params.m_Height = 6;
489 break;
490 }
491
492 case 9: {
493 u32 A = (modeBits >> 5) & 0x3;
494 u32 B = (modeBits >> 9) & 0x3;
495 params.m_Width = A + 6;
496 params.m_Height = B + 6;
497 break;
498 }
499
500 default:
501 assert(false && "Don't know this layout...");
502 params.m_bError = true;
503 break;
504 }
505
506 // Determine whether or not we're using dual planes
507 // and/or high precision layouts.
508 bool D = (layout != 9) && (modeBits & 0x400);
509 bool H = (layout != 9) && (modeBits & 0x200);
510
511 if (H) {
512 const u32 maxWeights[6] = {9, 11, 15, 19, 23, 31};
513 params.m_MaxWeight = maxWeights[R - 2];
514 } else {
515 const u32 maxWeights[6] = {1, 2, 3, 4, 5, 7};
516 params.m_MaxWeight = maxWeights[R - 2];
517 }
518
519 params.m_bDualPlane = D;
520
521 return params;
522}
523
524static void FillVoidExtentLDR(InputBitStream& strm, std::span<u32> outBuf, u32 blockWidth,
525 u32 blockHeight) {
526 // Don't actually care about the void extent, just read the bits...
527 for (s32 i = 0; i < 4; ++i) {
528 strm.ReadBits<13>();
529 }
530
531 // Decode the RGBA components and renormalize them to the range [0, 255]
532 u16 r = static_cast<u16>(strm.ReadBits<16>());
533 u16 g = static_cast<u16>(strm.ReadBits<16>());
534 u16 b = static_cast<u16>(strm.ReadBits<16>());
535 u16 a = static_cast<u16>(strm.ReadBits<16>());
536
537 u32 rgba = (r >> 8) | (g & 0xFF00) | (static_cast<u32>(b) & 0xFF00) << 8 |
538 (static_cast<u32>(a) & 0xFF00) << 16;
539
540 for (u32 j = 0; j < blockHeight; j++) {
541 for (u32 i = 0; i < blockWidth; i++) {
542 outBuf[j * blockWidth + i] = rgba;
543 }
544 }
545}
546
547static void FillError(std::span<u32> outBuf, u32 blockWidth, u32 blockHeight) {
548 for (u32 j = 0; j < blockHeight; j++) {
549 for (u32 i = 0; i < blockWidth; i++) {
550 outBuf[j * blockWidth + i] = 0xFFFF00FF;
551 }
552 }
553}
554static constexpr u32 ReplicateByteTo16(std::size_t value) {
555 return REPLICATE_BYTE_TO_16_TABLE[value];
556}
557
558static constexpr auto REPLICATE_BIT_TO_7_TABLE = MakeReplicateTable<u32, 1, 7>();
559static constexpr u32 ReplicateBitTo7(std::size_t value) {
560 return REPLICATE_BIT_TO_7_TABLE[value];
561}
562
563static constexpr auto REPLICATE_BIT_TO_9_TABLE = MakeReplicateTable<u32, 1, 9>();
564static constexpr u32 ReplicateBitTo9(std::size_t value) {
565 return REPLICATE_BIT_TO_9_TABLE[value];
566}
567
568static constexpr auto REPLICATE_1_BIT_TO_8_TABLE = MakeReplicateTable<u32, 1, 8>();
569static constexpr auto REPLICATE_2_BIT_TO_8_TABLE = MakeReplicateTable<u32, 2, 8>();
570static constexpr auto REPLICATE_3_BIT_TO_8_TABLE = MakeReplicateTable<u32, 3, 8>();
571static constexpr auto REPLICATE_4_BIT_TO_8_TABLE = MakeReplicateTable<u32, 4, 8>();
572static constexpr auto REPLICATE_5_BIT_TO_8_TABLE = MakeReplicateTable<u32, 5, 8>();
573/// Use a precompiled table with the most common usages, if it's not in the expected range, fallback
574/// to the runtime implementation
575static constexpr u32 FastReplicateTo8(u32 value, u32 num_bits) {
576 switch (num_bits) {
577 case 1:
578 return REPLICATE_1_BIT_TO_8_TABLE[value];
579 case 2:
580 return REPLICATE_2_BIT_TO_8_TABLE[value];
581 case 3:
582 return REPLICATE_3_BIT_TO_8_TABLE[value];
583 case 4:
584 return REPLICATE_4_BIT_TO_8_TABLE[value];
585 case 5:
586 return REPLICATE_5_BIT_TO_8_TABLE[value];
587 case 6:
588 return REPLICATE_6_BIT_TO_8_TABLE[value];
589 case 7:
590 return REPLICATE_7_BIT_TO_8_TABLE[value];
591 case 8:
592 return REPLICATE_8_BIT_TO_8_TABLE[value];
593 default:
594 return Replicate(value, num_bits, 8);
595 }
596}
597
598static constexpr auto REPLICATE_1_BIT_TO_6_TABLE = MakeReplicateTable<u32, 1, 6>();
599static constexpr auto REPLICATE_2_BIT_TO_6_TABLE = MakeReplicateTable<u32, 2, 6>();
600static constexpr auto REPLICATE_3_BIT_TO_6_TABLE = MakeReplicateTable<u32, 3, 6>();
601static constexpr auto REPLICATE_4_BIT_TO_6_TABLE = MakeReplicateTable<u32, 4, 6>();
602static constexpr auto REPLICATE_5_BIT_TO_6_TABLE = MakeReplicateTable<u32, 5, 6>();
603static constexpr u32 FastReplicateTo6(u32 value, u32 num_bits) {
604 switch (num_bits) {
605 case 1:
606 return REPLICATE_1_BIT_TO_6_TABLE[value];
607 case 2:
608 return REPLICATE_2_BIT_TO_6_TABLE[value];
609 case 3:
610 return REPLICATE_3_BIT_TO_6_TABLE[value];
611 case 4:
612 return REPLICATE_4_BIT_TO_6_TABLE[value];
613 case 5:
614 return REPLICATE_5_BIT_TO_6_TABLE[value];
615 default:
616 return Replicate(value, num_bits, 6);
617 }
618}
619
620class Pixel {
621protected:
622 using ChannelType = s16;
623 u8 m_BitDepth[4] = {8, 8, 8, 8};
624 s16 color[4] = {};
625
626public:
627 Pixel() = default;
628 Pixel(u32 a, u32 r, u32 g, u32 b, u32 bitDepth = 8)
629 : m_BitDepth{u8(bitDepth), u8(bitDepth), u8(bitDepth), u8(bitDepth)},
630 color{static_cast<ChannelType>(a), static_cast<ChannelType>(r),
631 static_cast<ChannelType>(g), static_cast<ChannelType>(b)} {}
632
633 // Changes the depth of each pixel. This scales the values to
634 // the appropriate bit depth by either truncating the least
635 // significant bits when going from larger to smaller bit depth
636 // or by repeating the most significant bits when going from
637 // smaller to larger bit depths.
638 void ChangeBitDepth() {
639 for (u32 i = 0; i < 4; i++) {
640 Component(i) = ChangeBitDepth(Component(i), m_BitDepth[i]);
641 m_BitDepth[i] = 8;
642 }
643 }
644
645 template <typename IntType>
646 static float ConvertChannelToFloat(IntType channel, u8 bitDepth) {
647 float denominator = static_cast<float>((1 << bitDepth) - 1);
648 return static_cast<float>(channel) / denominator;
649 }
650
651 // Changes the bit depth of a single component. See the comment
652 // above for how we do this.
653 static ChannelType ChangeBitDepth(Pixel::ChannelType val, u8 oldDepth) {
654 assert(oldDepth <= 8);
655
656 if (oldDepth == 8) {
657 // Do nothing
658 return val;
659 } else if (oldDepth == 0) {
660 return static_cast<ChannelType>((1 << 8) - 1);
661 } else if (8 > oldDepth) {
662 return static_cast<ChannelType>(FastReplicateTo8(static_cast<u32>(val), oldDepth));
663 } else {
664 // oldDepth > newDepth
665 const u8 bitsWasted = static_cast<u8>(oldDepth - 8);
666 u16 v = static_cast<u16>(val);
667 v = static_cast<u16>((v + (1 << (bitsWasted - 1))) >> bitsWasted);
668 v = ::std::min<u16>(::std::max<u16>(0, v), static_cast<u16>((1 << 8) - 1));
669 return static_cast<u8>(v);
670 }
671
672 assert(false && "We shouldn't get here.");
673 return 0;
674 }
675
676 const ChannelType& A() const {
677 return color[0];
678 }
679 ChannelType& A() {
680 return color[0];
681 }
682 const ChannelType& R() const {
683 return color[1];
684 }
685 ChannelType& R() {
686 return color[1];
687 }
688 const ChannelType& G() const {
689 return color[2];
690 }
691 ChannelType& G() {
692 return color[2];
693 }
694 const ChannelType& B() const {
695 return color[3];
696 }
697 ChannelType& B() {
698 return color[3];
699 }
700 const ChannelType& Component(u32 idx) const {
701 return color[idx];
702 }
703 ChannelType& Component(u32 idx) {
704 return color[idx];
705 }
706
707 void GetBitDepth(u8 (&outDepth)[4]) const {
708 for (s32 i = 0; i < 4; i++) {
709 outDepth[i] = m_BitDepth[i];
710 }
711 }
712
713 // Take all of the components, transform them to their 8-bit variants,
714 // and then pack each channel into an R8G8B8A8 32-bit integer. We assume
715 // that the architecture is little-endian, so the alpha channel will end
716 // up in the most-significant byte.
717 u32 Pack() const {
718 Pixel eightBit(*this);
719 eightBit.ChangeBitDepth();
720
721 u32 r = 0;
722 r |= eightBit.A();
723 r <<= 8;
724 r |= eightBit.B();
725 r <<= 8;
726 r |= eightBit.G();
727 r <<= 8;
728 r |= eightBit.R();
729 return r;
730 }
731
732 // Clamps the pixel to the range [0,255]
733 void ClampByte() {
734 for (u32 i = 0; i < 4; i++) {
735 color[i] = (color[i] < 0) ? 0 : ((color[i] > 255) ? 255 : color[i]);
736 }
737 }
738
739 void MakeOpaque() {
740 A() = 255;
741 }
742};
743
744static void DecodeColorValues(u32* out, std::span<u8> data, const u32* modes, const u32 nPartitions,
745 const u32 nBitsForColorData) {
746 // First figure out how many color values we have
747 u32 nValues = 0;
748 for (u32 i = 0; i < nPartitions; i++) {
749 nValues += ((modes[i] >> 2) + 1) << 1;
750 }
751
752 // Then based on the number of values and the remaining number of bits,
753 // figure out the max value for each of them...
754 u32 range = 256;
755 while (--range > 0) {
756 IntegerEncodedValue val = EncodingsValues[range];
757 u32 bitLength = val.GetBitLength(nValues);
758 if (bitLength <= nBitsForColorData) {
759 // Find the smallest possible range that matches the given encoding
760 while (--range > 0) {
761 IntegerEncodedValue newval = EncodingsValues[range];
762 if (!newval.MatchesEncoding(val)) {
763 break;
764 }
765 }
766
767 // Return to last matching range.
768 range++;
769 break;
770 }
771 }
772
773 // We now have enough to decode our integer sequence.
774 IntegerEncodedVector decodedColorValues;
775
776 InputBitStream colorStream(data, 0);
777 DecodeIntegerSequence(decodedColorValues, colorStream, range, nValues);
778
779 // Once we have the decoded values, we need to dequantize them to the 0-255 range
780 // This procedure is outlined in ASTC spec C.2.13
781 u32 outIdx = 0;
782 for (auto itr = decodedColorValues.begin(); itr != decodedColorValues.end(); ++itr) {
783 // Have we already decoded all that we need?
784 if (outIdx >= nValues) {
785 break;
786 }
787
788 const IntegerEncodedValue& val = *itr;
789 u32 bitlen = val.num_bits;
790 u32 bitval = val.bit_value;
791
792 assert(bitlen >= 1);
793
794 u32 A = 0, B = 0, C = 0, D = 0;
795 // A is just the lsb replicated 9 times.
796 A = ReplicateBitTo9(bitval & 1);
797
798 switch (val.encoding) {
799 // Replicate bits
800 case IntegerEncoding::JustBits:
801 out[outIdx++] = FastReplicateTo8(bitval, bitlen);
802 break;
803
804 // Use algorithm in C.2.13
805 case IntegerEncoding::Trit: {
806
807 D = val.trit_value;
808
809 switch (bitlen) {
810 case 1: {
811 C = 204;
812 } break;
813
814 case 2: {
815 C = 93;
816 // B = b000b0bb0
817 u32 b = (bitval >> 1) & 1;
818 B = (b << 8) | (b << 4) | (b << 2) | (b << 1);
819 } break;
820
821 case 3: {
822 C = 44;
823 // B = cb000cbcb
824 u32 cb = (bitval >> 1) & 3;
825 B = (cb << 7) | (cb << 2) | cb;
826 } break;
827
828 case 4: {
829 C = 22;
830 // B = dcb000dcb
831 u32 dcb = (bitval >> 1) & 7;
832 B = (dcb << 6) | dcb;
833 } break;
834
835 case 5: {
836 C = 11;
837 // B = edcb000ed
838 u32 edcb = (bitval >> 1) & 0xF;
839 B = (edcb << 5) | (edcb >> 2);
840 } break;
841
842 case 6: {
843 C = 5;
844 // B = fedcb000f
845 u32 fedcb = (bitval >> 1) & 0x1F;
846 B = (fedcb << 4) | (fedcb >> 4);
847 } break;
848
849 default:
850 assert(false && "Unsupported trit encoding for color values!");
851 break;
852 } // switch(bitlen)
853 } // case IntegerEncoding::Trit
854 break;
855
856 case IntegerEncoding::Quint: {
857
858 D = val.quint_value;
859
860 switch (bitlen) {
861 case 1: {
862 C = 113;
863 } break;
864
865 case 2: {
866 C = 54;
867 // B = b0000bb00
868 u32 b = (bitval >> 1) & 1;
869 B = (b << 8) | (b << 3) | (b << 2);
870 } break;
871
872 case 3: {
873 C = 26;
874 // B = cb0000cbc
875 u32 cb = (bitval >> 1) & 3;
876 B = (cb << 7) | (cb << 1) | (cb >> 1);
877 } break;
878
879 case 4: {
880 C = 13;
881 // B = dcb0000dc
882 u32 dcb = (bitval >> 1) & 7;
883 B = (dcb << 6) | (dcb >> 1);
884 } break;
885
886 case 5: {
887 C = 6;
888 // B = edcb0000e
889 u32 edcb = (bitval >> 1) & 0xF;
890 B = (edcb << 5) | (edcb >> 3);
891 } break;
892
893 default:
894 assert(false && "Unsupported quint encoding for color values!");
895 break;
896 } // switch(bitlen)
897 } // case IntegerEncoding::Quint
898 break;
899 } // switch(val.encoding)
900
901 if (val.encoding != IntegerEncoding::JustBits) {
902 u32 T = D * C + B;
903 T ^= A;
904 T = (A & 0x80) | (T >> 2);
905 out[outIdx++] = T;
906 }
907 }
908
909 // Make sure that each of our values is in the proper range...
910 for (u32 i = 0; i < nValues; i++) {
911 assert(out[i] <= 255);
912 }
913}
914
915static u32 UnquantizeTexelWeight(const IntegerEncodedValue& val) {
916 u32 bitval = val.bit_value;
917 u32 bitlen = val.num_bits;
918
919 u32 A = ReplicateBitTo7(bitval & 1);
920 u32 B = 0, C = 0, D = 0;
921
922 u32 result = 0;
923 switch (val.encoding) {
924 case IntegerEncoding::JustBits:
925 result = FastReplicateTo6(bitval, bitlen);
926 break;
927
928 case IntegerEncoding::Trit: {
929 D = val.trit_value;
930 assert(D < 3);
931
932 switch (bitlen) {
933 case 0: {
934 u32 results[3] = {0, 32, 63};
935 result = results[D];
936 } break;
937
938 case 1: {
939 C = 50;
940 } break;
941
942 case 2: {
943 C = 23;
944 u32 b = (bitval >> 1) & 1;
945 B = (b << 6) | (b << 2) | b;
946 } break;
947
948 case 3: {
949 C = 11;
950 u32 cb = (bitval >> 1) & 3;
951 B = (cb << 5) | cb;
952 } break;
953
954 default:
955 assert(false && "Invalid trit encoding for texel weight");
956 break;
957 }
958 } break;
959
960 case IntegerEncoding::Quint: {
961 D = val.quint_value;
962 assert(D < 5);
963
964 switch (bitlen) {
965 case 0: {
966 u32 results[5] = {0, 16, 32, 47, 63};
967 result = results[D];
968 } break;
969
970 case 1: {
971 C = 28;
972 } break;
973
974 case 2: {
975 C = 13;
976 u32 b = (bitval >> 1) & 1;
977 B = (b << 6) | (b << 1);
978 } break;
979
980 default:
981 assert(false && "Invalid quint encoding for texel weight");
982 break;
983 }
984 } break;
985 }
986
987 if (val.encoding != IntegerEncoding::JustBits && bitlen > 0) {
988 // Decode the value...
989 result = D * C + B;
990 result ^= A;
991 result = (A & 0x20) | (result >> 2);
992 }
993
994 assert(result < 64);
995
996 // Change from [0,63] to [0,64]
997 if (result > 32) {
998 result += 1;
999 }
1000
1001 return result;
1002}
1003
1004static void UnquantizeTexelWeights(u32 out[2][144], const IntegerEncodedVector& weights,
1005 const TexelWeightParams& params, const u32 blockWidth,
1006 const u32 blockHeight) {
1007 u32 weightIdx = 0;
1008 u32 unquantized[2][144];
1009
1010 for (auto itr = weights.begin(); itr != weights.end(); ++itr) {
1011 unquantized[0][weightIdx] = UnquantizeTexelWeight(*itr);
1012
1013 if (params.m_bDualPlane) {
1014 ++itr;
1015 unquantized[1][weightIdx] = UnquantizeTexelWeight(*itr);
1016 if (itr == weights.end()) {
1017 break;
1018 }
1019 }
1020
1021 if (++weightIdx >= (params.m_Width * params.m_Height))
1022 break;
1023 }
1024
1025 // Do infill if necessary (Section C.2.18) ...
1026 u32 Ds = (1024 + (blockWidth / 2)) / (blockWidth - 1);
1027 u32 Dt = (1024 + (blockHeight / 2)) / (blockHeight - 1);
1028
1029 const u32 kPlaneScale = params.m_bDualPlane ? 2U : 1U;
1030 for (u32 plane = 0; plane < kPlaneScale; plane++)
1031 for (u32 t = 0; t < blockHeight; t++)
1032 for (u32 s = 0; s < blockWidth; s++) {
1033 u32 cs = Ds * s;
1034 u32 ct = Dt * t;
1035
1036 u32 gs = (cs * (params.m_Width - 1) + 32) >> 6;
1037 u32 gt = (ct * (params.m_Height - 1) + 32) >> 6;
1038
1039 u32 js = gs >> 4;
1040 u32 fs = gs & 0xF;
1041
1042 u32 jt = gt >> 4;
1043 u32 ft = gt & 0x0F;
1044
1045 u32 w11 = (fs * ft + 8) >> 4;
1046 u32 w10 = ft - w11;
1047 u32 w01 = fs - w11;
1048 u32 w00 = 16 - fs - ft + w11;
1049
1050 u32 v0 = js + jt * params.m_Width;
1051
1052#define FIND_TEXEL(tidx, bidx) \
1053 u32 p##bidx = 0; \
1054 do { \
1055 if ((tidx) < (params.m_Width * params.m_Height)) { \
1056 p##bidx = unquantized[plane][(tidx)]; \
1057 } \
1058 } while (0)
1059
1060 FIND_TEXEL(v0, 00);
1061 FIND_TEXEL(v0 + 1, 01);
1062 FIND_TEXEL(v0 + params.m_Width, 10);
1063 FIND_TEXEL(v0 + params.m_Width + 1, 11);
1064
1065#undef FIND_TEXEL
1066
1067 out[plane][t * blockWidth + s] =
1068 (p00 * w00 + p01 * w01 + p10 * w10 + p11 * w11 + 8) >> 4;
1069 }
1070}
1071
/// Transfers a bit from a to b as described in C.2.14.
///
/// b is shifted right by one and receives bit 7 of a as its new bit 7. a is then reduced to
/// bits 1..6 of its old value, interpreted as a signed 6-bit number.
static inline void BitTransferSigned(int& a, int& b) {
    b = (b >> 1) | (a & 0x80);
    a = (a >> 1) & 0x3F;

    // Sign-extend from bit 5
    if ((a & 0x20) != 0) {
        a -= 0x40;
    }
}
1081
1082// Adds more precision to the blue channel as described
1083// in C.2.14
1084static inline Pixel BlueContract(s32 a, s32 r, s32 g, s32 b) {
1085 return Pixel(static_cast<s16>(a), static_cast<s16>((r + b) >> 1),
1086 static_cast<s16>((g + b) >> 1), static_cast<s16>(b));
1087}
1088
// Partition selection functions as specified in section C.2.21.

// Scrambles a 32-bit value with a fixed sequence of shift/xor/add/subtract steps. All
// arithmetic is unsigned, so it wraps modulo 2^32.
static inline u32 hash52(u32 p) {
    p = p ^ (p >> 15);
    p = p - (p << 17);
    p = p + (p << 7);
    p = p + (p << 4);
    p = p ^ (p >> 5);
    p = p + (p << 16);
    p = p ^ (p >> 7);
    p = p ^ (p >> 3);
    p = p ^ (p << 6);
    p = p ^ (p >> 17);
    return p;
}
1104
1105static u32 SelectPartition(s32 seed, s32 x, s32 y, s32 z, s32 partitionCount, s32 smallBlock) {
1106 if (1 == partitionCount)
1107 return 0;
1108
1109 if (smallBlock) {
1110 x <<= 1;
1111 y <<= 1;
1112 z <<= 1;
1113 }
1114
1115 seed += (partitionCount - 1) * 1024;
1116
1117 u32 rnum = hash52(static_cast<u32>(seed));
1118 u8 seed1 = static_cast<u8>(rnum & 0xF);
1119 u8 seed2 = static_cast<u8>((rnum >> 4) & 0xF);
1120 u8 seed3 = static_cast<u8>((rnum >> 8) & 0xF);
1121 u8 seed4 = static_cast<u8>((rnum >> 12) & 0xF);
1122 u8 seed5 = static_cast<u8>((rnum >> 16) & 0xF);
1123 u8 seed6 = static_cast<u8>((rnum >> 20) & 0xF);
1124 u8 seed7 = static_cast<u8>((rnum >> 24) & 0xF);
1125 u8 seed8 = static_cast<u8>((rnum >> 28) & 0xF);
1126 u8 seed9 = static_cast<u8>((rnum >> 18) & 0xF);
1127 u8 seed10 = static_cast<u8>((rnum >> 22) & 0xF);
1128 u8 seed11 = static_cast<u8>((rnum >> 26) & 0xF);
1129 u8 seed12 = static_cast<u8>(((rnum >> 30) | (rnum << 2)) & 0xF);
1130
1131 seed1 = static_cast<u8>(seed1 * seed1);
1132 seed2 = static_cast<u8>(seed2 * seed2);
1133 seed3 = static_cast<u8>(seed3 * seed3);
1134 seed4 = static_cast<u8>(seed4 * seed4);
1135 seed5 = static_cast<u8>(seed5 * seed5);
1136 seed6 = static_cast<u8>(seed6 * seed6);
1137 seed7 = static_cast<u8>(seed7 * seed7);
1138 seed8 = static_cast<u8>(seed8 * seed8);
1139 seed9 = static_cast<u8>(seed9 * seed9);
1140 seed10 = static_cast<u8>(seed10 * seed10);
1141 seed11 = static_cast<u8>(seed11 * seed11);
1142 seed12 = static_cast<u8>(seed12 * seed12);
1143
1144 s32 sh1, sh2, sh3;
1145 if (seed & 1) {
1146 sh1 = (seed & 2) ? 4 : 5;
1147 sh2 = (partitionCount == 3) ? 6 : 5;
1148 } else {
1149 sh1 = (partitionCount == 3) ? 6 : 5;
1150 sh2 = (seed & 2) ? 4 : 5;
1151 }
1152 sh3 = (seed & 0x10) ? sh1 : sh2;
1153
1154 seed1 = static_cast<u8>(seed1 >> sh1);
1155 seed2 = static_cast<u8>(seed2 >> sh2);
1156 seed3 = static_cast<u8>(seed3 >> sh1);
1157 seed4 = static_cast<u8>(seed4 >> sh2);
1158 seed5 = static_cast<u8>(seed5 >> sh1);
1159 seed6 = static_cast<u8>(seed6 >> sh2);
1160 seed7 = static_cast<u8>(seed7 >> sh1);
1161 seed8 = static_cast<u8>(seed8 >> sh2);
1162 seed9 = static_cast<u8>(seed9 >> sh3);
1163 seed10 = static_cast<u8>(seed10 >> sh3);
1164 seed11 = static_cast<u8>(seed11 >> sh3);
1165 seed12 = static_cast<u8>(seed12 >> sh3);
1166
1167 s32 a = seed1 * x + seed2 * y + seed11 * z + (rnum >> 14);
1168 s32 b = seed3 * x + seed4 * y + seed12 * z + (rnum >> 10);
1169 s32 c = seed5 * x + seed6 * y + seed9 * z + (rnum >> 6);
1170 s32 d = seed7 * x + seed8 * y + seed10 * z + (rnum >> 2);
1171
1172 a &= 0x3F;
1173 b &= 0x3F;
1174 c &= 0x3F;
1175 d &= 0x3F;
1176
1177 if (partitionCount < 4)
1178 d = 0;
1179 if (partitionCount < 3)
1180 c = 0;
1181
1182 if (a >= b && a >= c && a >= d)
1183 return 0;
1184 else if (b >= c && b >= d)
1185 return 1;
1186 else if (c >= d)
1187 return 2;
1188 return 3;
1189}
1190
1191static inline u32 Select2DPartition(s32 seed, s32 x, s32 y, s32 partitionCount, s32 smallBlock) {
1192 return SelectPartition(seed, x, y, 0, partitionCount, smallBlock);
1193}
1194
1195// Section C.2.14
1196static void ComputeEndpoints(Pixel& ep1, Pixel& ep2, const u32*& colorValues,
1197 u32 colorEndpointMode) {
1198#define READ_UINT_VALUES(N) \
1199 u32 v[N]; \
1200 for (u32 i = 0; i < N; i++) { \
1201 v[i] = *(colorValues++); \
1202 }
1203
1204#define READ_INT_VALUES(N) \
1205 s32 v[N]; \
1206 for (u32 i = 0; i < N; i++) { \
1207 v[i] = static_cast<int>(*(colorValues++)); \
1208 }
1209
1210 switch (colorEndpointMode) {
1211 case 0: {
1212 READ_UINT_VALUES(2)
1213 ep1 = Pixel(0xFF, v[0], v[0], v[0]);
1214 ep2 = Pixel(0xFF, v[1], v[1], v[1]);
1215 } break;
1216
1217 case 1: {
1218 READ_UINT_VALUES(2)
1219 u32 L0 = (v[0] >> 2) | (v[1] & 0xC0);
1220 u32 L1 = std::max(L0 + (v[1] & 0x3F), 0xFFU);
1221 ep1 = Pixel(0xFF, L0, L0, L0);
1222 ep2 = Pixel(0xFF, L1, L1, L1);
1223 } break;
1224
1225 case 4: {
1226 READ_UINT_VALUES(4)
1227 ep1 = Pixel(v[2], v[0], v[0], v[0]);
1228 ep2 = Pixel(v[3], v[1], v[1], v[1]);
1229 } break;
1230
1231 case 5: {
1232 READ_INT_VALUES(4)
1233 BitTransferSigned(v[1], v[0]);
1234 BitTransferSigned(v[3], v[2]);
1235 ep1 = Pixel(v[2], v[0], v[0], v[0]);
1236 ep2 = Pixel(v[2] + v[3], v[0] + v[1], v[0] + v[1], v[0] + v[1]);
1237 ep1.ClampByte();
1238 ep2.ClampByte();
1239 } break;
1240
1241 case 6: {
1242 READ_UINT_VALUES(4)
1243 ep1 = Pixel(0xFF, v[0] * v[3] >> 8, v[1] * v[3] >> 8, v[2] * v[3] >> 8);
1244 ep2 = Pixel(0xFF, v[0], v[1], v[2]);
1245 } break;
1246
1247 case 8: {
1248 READ_UINT_VALUES(6)
1249 if (v[1] + v[3] + v[5] >= v[0] + v[2] + v[4]) {
1250 ep1 = Pixel(0xFF, v[0], v[2], v[4]);
1251 ep2 = Pixel(0xFF, v[1], v[3], v[5]);
1252 } else {
1253 ep1 = BlueContract(0xFF, v[1], v[3], v[5]);
1254 ep2 = BlueContract(0xFF, v[0], v[2], v[4]);
1255 }
1256 } break;
1257
1258 case 9: {
1259 READ_INT_VALUES(6)
1260 BitTransferSigned(v[1], v[0]);
1261 BitTransferSigned(v[3], v[2]);
1262 BitTransferSigned(v[5], v[4]);
1263 if (v[1] + v[3] + v[5] >= 0) {
1264 ep1 = Pixel(0xFF, v[0], v[2], v[4]);
1265 ep2 = Pixel(0xFF, v[0] + v[1], v[2] + v[3], v[4] + v[5]);
1266 } else {
1267 ep1 = BlueContract(0xFF, v[0] + v[1], v[2] + v[3], v[4] + v[5]);
1268 ep2 = BlueContract(0xFF, v[0], v[2], v[4]);
1269 }
1270 ep1.ClampByte();
1271 ep2.ClampByte();
1272 } break;
1273
1274 case 10: {
1275 READ_UINT_VALUES(6)
1276 ep1 = Pixel(v[4], v[0] * v[3] >> 8, v[1] * v[3] >> 8, v[2] * v[3] >> 8);
1277 ep2 = Pixel(v[5], v[0], v[1], v[2]);
1278 } break;
1279
1280 case 12: {
1281 READ_UINT_VALUES(8)
1282 if (v[1] + v[3] + v[5] >= v[0] + v[2] + v[4]) {
1283 ep1 = Pixel(v[6], v[0], v[2], v[4]);
1284 ep2 = Pixel(v[7], v[1], v[3], v[5]);
1285 } else {
1286 ep1 = BlueContract(v[7], v[1], v[3], v[5]);
1287 ep2 = BlueContract(v[6], v[0], v[2], v[4]);
1288 }
1289 } break;
1290
1291 case 13: {
1292 READ_INT_VALUES(8)
1293 BitTransferSigned(v[1], v[0]);
1294 BitTransferSigned(v[3], v[2]);
1295 BitTransferSigned(v[5], v[4]);
1296 BitTransferSigned(v[7], v[6]);
1297 if (v[1] + v[3] + v[5] >= 0) {
1298 ep1 = Pixel(v[6], v[0], v[2], v[4]);
1299 ep2 = Pixel(v[7] + v[6], v[0] + v[1], v[2] + v[3], v[4] + v[5]);
1300 } else {
1301 ep1 = BlueContract(v[6] + v[7], v[0] + v[1], v[2] + v[3], v[4] + v[5]);
1302 ep2 = BlueContract(v[6], v[0], v[2], v[4]);
1303 }
1304 ep1.ClampByte();
1305 ep2.ClampByte();
1306 } break;
1307
1308 default:
1309 assert(false && "Unsupported color endpoint mode (is it HDR?)");
1310 break;
1311 }
1312
1313#undef READ_UINT_VALUES
1314#undef READ_INT_VALUES
1315}
1316
1317static void DecompressBlock(std::span<const u8, 16> inBuf, const u32 blockWidth,
1318 const u32 blockHeight, std::span<u32, 12 * 12> outBuf) {
1319 InputBitStream strm(inBuf);
1320 TexelWeightParams weightParams = DecodeBlockInfo(strm);
1321
1322 // Was there an error?
1323 if (weightParams.m_bError) {
1324 assert(false && "Invalid block mode");
1325 FillError(outBuf, blockWidth, blockHeight);
1326 return;
1327 }
1328
1329 if (weightParams.m_bVoidExtentLDR) {
1330 FillVoidExtentLDR(strm, outBuf, blockWidth, blockHeight);
1331 return;
1332 }
1333
1334 if (weightParams.m_bVoidExtentHDR) {
1335 assert(false && "HDR void extent blocks are unsupported!");
1336 FillError(outBuf, blockWidth, blockHeight);
1337 return;
1338 }
1339
1340 if (weightParams.m_Width > blockWidth) {
1341 assert(false && "Texel weight grid width should be smaller than block width");
1342 FillError(outBuf, blockWidth, blockHeight);
1343 return;
1344 }
1345
1346 if (weightParams.m_Height > blockHeight) {
1347 assert(false && "Texel weight grid height should be smaller than block height");
1348 FillError(outBuf, blockWidth, blockHeight);
1349 return;
1350 }
1351
1352 // Read num partitions
1353 u32 nPartitions = strm.ReadBits<2>() + 1;
1354 assert(nPartitions <= 4);
1355
1356 if (nPartitions == 4 && weightParams.m_bDualPlane) {
1357 assert(false && "Dual plane mode is incompatible with four partition blocks");
1358 FillError(outBuf, blockWidth, blockHeight);
1359 return;
1360 }
1361
1362 // Based on the number of partitions, read the color endpoint mode for
1363 // each partition.
1364
1365 // Determine partitions, partition index, and color endpoint modes
1366 s32 planeIdx = -1;
1367 u32 partitionIndex;
1368 u32 colorEndpointMode[4] = {0, 0, 0, 0};
1369
1370 // Define color data.
1371 u8 colorEndpointData[16];
1372 memset(colorEndpointData, 0, sizeof(colorEndpointData));
1373 OutputBitStream colorEndpointStream(colorEndpointData, 16 * 8, 0);
1374
1375 // Read extra config data...
1376 u32 baseCEM = 0;
1377 if (nPartitions == 1) {
1378 colorEndpointMode[0] = strm.ReadBits<4>();
1379 partitionIndex = 0;
1380 } else {
1381 partitionIndex = strm.ReadBits<10>();
1382 baseCEM = strm.ReadBits<6>();
1383 }
1384 u32 baseMode = (baseCEM & 3);
1385
1386 // Remaining bits are color endpoint data...
1387 u32 nWeightBits = weightParams.GetPackedBitSize();
1388 s32 remainingBits = 128 - nWeightBits - static_cast<int>(strm.GetBitsRead());
1389
1390 // Consider extra bits prior to texel data...
1391 u32 extraCEMbits = 0;
1392 if (baseMode) {
1393 switch (nPartitions) {
1394 case 2:
1395 extraCEMbits += 2;
1396 break;
1397 case 3:
1398 extraCEMbits += 5;
1399 break;
1400 case 4:
1401 extraCEMbits += 8;
1402 break;
1403 default:
1404 assert(false);
1405 break;
1406 }
1407 }
1408 remainingBits -= extraCEMbits;
1409
1410 // Do we have a dual plane situation?
1411 u32 planeSelectorBits = 0;
1412 if (weightParams.m_bDualPlane) {
1413 planeSelectorBits = 2;
1414 }
1415 remainingBits -= planeSelectorBits;
1416
1417 // Read color data...
1418 u32 colorDataBits = remainingBits;
1419 while (remainingBits > 0) {
1420 u32 nb = std::min(remainingBits, 8);
1421 u32 b = strm.ReadBits(nb);
1422 colorEndpointStream.WriteBits(b, nb);
1423 remainingBits -= 8;
1424 }
1425
1426 // Read the plane selection bits
1427 planeIdx = strm.ReadBits(planeSelectorBits);
1428
1429 // Read the rest of the CEM
1430 if (baseMode) {
1431 u32 extraCEM = strm.ReadBits(extraCEMbits);
1432 u32 CEM = (extraCEM << 6) | baseCEM;
1433 CEM >>= 2;
1434
1435 bool C[4] = {0};
1436 for (u32 i = 0; i < nPartitions; i++) {
1437 C[i] = CEM & 1;
1438 CEM >>= 1;
1439 }
1440
1441 u8 M[4] = {0};
1442 for (u32 i = 0; i < nPartitions; i++) {
1443 M[i] = CEM & 3;
1444 CEM >>= 2;
1445 assert(M[i] <= 3);
1446 }
1447
1448 for (u32 i = 0; i < nPartitions; i++) {
1449 colorEndpointMode[i] = baseMode;
1450 if (!(C[i]))
1451 colorEndpointMode[i] -= 1;
1452 colorEndpointMode[i] <<= 2;
1453 colorEndpointMode[i] |= M[i];
1454 }
1455 } else if (nPartitions > 1) {
1456 u32 CEM = baseCEM >> 2;
1457 for (u32 i = 0; i < nPartitions; i++) {
1458 colorEndpointMode[i] = CEM;
1459 }
1460 }
1461
1462 // Make sure everything up till here is sane.
1463 for (u32 i = 0; i < nPartitions; i++) {
1464 assert(colorEndpointMode[i] < 16);
1465 }
1466 assert(strm.GetBitsRead() + weightParams.GetPackedBitSize() == 128);
1467
1468 // Decode both color data and texel weight data
1469 u32 colorValues[32]; // Four values, two endpoints, four maximum paritions
1470 DecodeColorValues(colorValues, colorEndpointData, colorEndpointMode, nPartitions,
1471 colorDataBits);
1472
1473 Pixel endpoints[4][2];
1474 const u32* colorValuesPtr = colorValues;
1475 for (u32 i = 0; i < nPartitions; i++) {
1476 ComputeEndpoints(endpoints[i][0], endpoints[i][1], colorValuesPtr, colorEndpointMode[i]);
1477 }
1478
1479 // Read the texel weight data..
1480 std::array<u8, 16> texelWeightData;
1481 std::ranges::copy(inBuf, texelWeightData.begin());
1482
1483 // Reverse everything
1484 for (u32 i = 0; i < 8; i++) {
1485// Taken from http://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64Bits
1486#define REVERSE_BYTE(b) (((b)*0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32
1487 u8 a = static_cast<u8>(REVERSE_BYTE(texelWeightData[i]));
1488 u8 b = static_cast<u8>(REVERSE_BYTE(texelWeightData[15 - i]));
1489#undef REVERSE_BYTE
1490
1491 texelWeightData[i] = b;
1492 texelWeightData[15 - i] = a;
1493 }
1494
1495 // Make sure that higher non-texel bits are set to zero
1496 const u32 clearByteStart = (weightParams.GetPackedBitSize() >> 3) + 1;
1497 if (clearByteStart > 0 && clearByteStart <= texelWeightData.size()) {
1498 texelWeightData[clearByteStart - 1] &=
1499 static_cast<u8>((1 << (weightParams.GetPackedBitSize() % 8)) - 1);
1500 std::memset(texelWeightData.data() + clearByteStart, 0,
1501 std::min(16U - clearByteStart, 16U));
1502 }
1503
1504 IntegerEncodedVector texelWeightValues;
1505
1506 InputBitStream weightStream(texelWeightData);
1507
1508 DecodeIntegerSequence(texelWeightValues, weightStream, weightParams.m_MaxWeight,
1509 weightParams.GetNumWeightValues());
1510
1511 // Blocks can be at most 12x12, so we can have as many as 144 weights
1512 u32 weights[2][144];
1513 UnquantizeTexelWeights(weights, texelWeightValues, weightParams, blockWidth, blockHeight);
1514
1515 // Now that we have endpoints and weights, we can interpolate and generate
1516 // the proper decoding...
1517 for (u32 j = 0; j < blockHeight; j++)
1518 for (u32 i = 0; i < blockWidth; i++) {
1519 u32 partition = Select2DPartition(partitionIndex, i, j, nPartitions,
1520 (blockHeight * blockWidth) < 32);
1521 assert(partition < nPartitions);
1522
1523 Pixel p;
1524 for (u32 c = 0; c < 4; c++) {
1525 u32 C0 = endpoints[partition][0].Component(c);
1526 C0 = ReplicateByteTo16(C0);
1527 u32 C1 = endpoints[partition][1].Component(c);
1528 C1 = ReplicateByteTo16(C1);
1529
1530 u32 plane = 0;
1531 if (weightParams.m_bDualPlane && (((planeIdx + 1) & 3) == c)) {
1532 plane = 1;
1533 }
1534
1535 u32 weight = weights[plane][j * blockWidth + i];
1536 u32 C = (C0 * (64 - weight) + C1 * weight + 32) / 64;
1537 if (C == 65535) {
1538 p.Component(c) = 255;
1539 } else {
1540 double Cf = static_cast<double>(C);
1541 p.Component(c) = static_cast<u16>(255.0 * (Cf / 65536.0) + 0.5);
1542 }
1543 }
1544
1545 outBuf[j * blockWidth + i] = p.Pack();
1546 }
1547}
1548
1549void Decompress(std::span<const uint8_t> data, uint32_t width, uint32_t height, uint32_t depth,
1550 uint32_t block_width, uint32_t block_height, std::span<uint8_t> output) {
1551 u32 block_index = 0;
1552 std::size_t depth_offset = 0;
1553 for (u32 z = 0; z < depth; z++) {
1554 for (u32 y = 0; y < height; y += block_height) {
1555 for (u32 x = 0; x < width; x += block_width) {
1556 const std::span<const u8, 16> blockPtr{data.subspan(block_index * 16, 16)};
1557
1558 // Blocks can be at most 12x12
1559 std::array<u32, 12 * 12> uncompData;
1560 DecompressBlock(blockPtr, block_width, block_height, uncompData);
1561
1562 u32 decompWidth = std::min(block_width, width - x);
1563 u32 decompHeight = std::min(block_height, height - y);
1564
1565 const std::span<u8> outRow = output.subspan(depth_offset + (y * width + x) * 4);
1566 for (u32 jj = 0; jj < decompHeight; jj++) {
1567 std::memcpy(outRow.data() + jj * width * 4,
1568 uncompData.data() + jj * block_width, decompWidth * 4);
1569 }
1570 ++block_index;
1571 }
1572 }
1573 depth_offset += height * width * 4;
1574 }
1575}
1576
1577} // namespace Tegra::Texture::ASTC
diff --git a/src/video_core/textures/astc.h b/src/video_core/textures/astc.h
index c1c73fda5..c1c37dfe7 100644
--- a/src/video_core/textures/astc.h
+++ b/src/video_core/textures/astc.h
@@ -129,4 +129,7 @@ struct AstcBufferData {
129 decltype(REPLICATE_BYTE_TO_16_TABLE) replicate_byte_to_16 = REPLICATE_BYTE_TO_16_TABLE; 129 decltype(REPLICATE_BYTE_TO_16_TABLE) replicate_byte_to_16 = REPLICATE_BYTE_TO_16_TABLE;
130} constexpr ASTC_BUFFER_DATA; 130} constexpr ASTC_BUFFER_DATA;
131 131
132void Decompress(std::span<const uint8_t> data, uint32_t width, uint32_t height, uint32_t depth,
133 uint32_t block_width, uint32_t block_height, std::span<uint8_t> output);
134
132} // namespace Tegra::Texture::ASTC 135} // namespace Tegra::Texture::ASTC