summary | refs | log | tree | commit | diff
path: root/src/video_core/textures/astc.cpp
diff options
context:
space:
mode:
authorGravatar ameerj2021-03-25 16:53:51 -0400
committerGravatar ameerj2021-03-25 16:53:51 -0400
commit2f83d9a61bca42d9ef24074beb2b11b19bd4cecd (patch)
tree514e40eb750280c2e3025f9301befb6f8c9b46e9 /src/video_core/textures/astc.cpp
parentastc_decoder: Reimplement Layers (diff)
downloadyuzu-2f83d9a61bca42d9ef24074beb2b11b19bd4cecd.tar.gz
yuzu-2f83d9a61bca42d9ef24074beb2b11b19bd4cecd.tar.xz
yuzu-2f83d9a61bca42d9ef24074beb2b11b19bd4cecd.zip
astc_decoder: Refactor for style and more efficient memory use
Diffstat (limited to 'src/video_core/textures/astc.cpp')
-rw-r--r--src/video_core/textures/astc.cpp1710
1 files changed, 0 insertions, 1710 deletions
diff --git a/src/video_core/textures/astc.cpp b/src/video_core/textures/astc.cpp
deleted file mode 100644
index 3625b666c..000000000
--- a/src/video_core/textures/astc.cpp
+++ /dev/null
@@ -1,1710 +0,0 @@
1// Copyright 2016 The University of North Carolina at Chapel Hill
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14//
15// Please send all BUG REPORTS to <pavel@cs.unc.edu>.
16// <http://gamma.cs.unc.edu/FasTC/>
17
18#include <algorithm>
19#include <cassert>
20#include <cstring>
21#include <span>
22#include <vector>
23
24#include <boost/container/static_vector.hpp>
25
26#include "common/common_types.h"
27
28#include "video_core/textures/astc.h"
29
30namespace {
31
32/// Count the number of bits set in a number.
33constexpr u32 Popcnt(u32 n) {
34 u32 c = 0;
35 for (; n; c++) {
36 n &= n - 1;
37 }
38 return c;
39}
40
41} // Anonymous namespace
42
43class InputBitStream {
44public:
45 constexpr explicit InputBitStream(std::span<const u8> data, size_t start_offset = 0)
46 : cur_byte{data.data()}, total_bits{data.size()}, next_bit{start_offset % 8} {}
47
48 constexpr size_t GetBitsRead() const {
49 return bits_read;
50 }
51
52 constexpr bool ReadBit() {
53 if (bits_read >= total_bits * 8) {
54 return 0;
55 }
56 const bool bit = ((*cur_byte >> next_bit) & 1) != 0;
57 ++next_bit;
58 while (next_bit >= 8) {
59 next_bit -= 8;
60 ++cur_byte;
61 }
62 ++bits_read;
63 return bit;
64 }
65
66 constexpr u32 ReadBits(std::size_t nBits) {
67 u32 ret = 0;
68 for (std::size_t i = 0; i < nBits; ++i) {
69 ret |= (ReadBit() & 1) << i;
70 }
71 return ret;
72 }
73
74 template <std::size_t nBits>
75 constexpr u32 ReadBits() {
76 u32 ret = 0;
77 for (std::size_t i = 0; i < nBits; ++i) {
78 ret |= (ReadBit() & 1) << i;
79 }
80 return ret;
81 }
82
83private:
84 const u8* cur_byte;
85 size_t total_bits = 0;
86 size_t next_bit = 0;
87 size_t bits_read = 0;
88};
89
90class OutputBitStream {
91public:
92 constexpr explicit OutputBitStream(u8* ptr, std::size_t bits = 0, std::size_t start_offset = 0)
93 : cur_byte{ptr}, num_bits{bits}, next_bit{start_offset % 8} {}
94
95 constexpr std::size_t GetBitsWritten() const {
96 return bits_written;
97 }
98
99 constexpr void WriteBitsR(u32 val, u32 nBits) {
100 for (u32 i = 0; i < nBits; i++) {
101 WriteBit((val >> (nBits - i - 1)) & 1);
102 }
103 }
104
105 constexpr void WriteBits(u32 val, u32 nBits) {
106 for (u32 i = 0; i < nBits; i++) {
107 WriteBit((val >> i) & 1);
108 }
109 }
110
111private:
112 constexpr void WriteBit(bool b) {
113 if (bits_written >= num_bits) {
114 return;
115 }
116
117 const u32 mask = 1 << next_bit++;
118
119 // clear the bit
120 *cur_byte &= static_cast<u8>(~mask);
121
122 // Write the bit, if necessary
123 if (b)
124 *cur_byte |= static_cast<u8>(mask);
125
126 // Next byte?
127 if (next_bit >= 8) {
128 cur_byte += 1;
129 next_bit = 0;
130 }
131 }
132
133 u8* cur_byte;
134 std::size_t num_bits;
135 std::size_t bits_written = 0;
136 std::size_t next_bit = 0;
137};
138
139template <typename IntType>
140class Bits {
141public:
142 explicit Bits(const IntType& v) : m_Bits(v) {}
143
144 Bits(const Bits&) = delete;
145 Bits& operator=(const Bits&) = delete;
146
147 u8 operator[](u32 bitPos) const {
148 return static_cast<u8>((m_Bits >> bitPos) & 1);
149 }
150
151 IntType operator()(u32 start, u32 end) const {
152 if (start == end) {
153 return (*this)[start];
154 } else if (start > end) {
155 u32 t = start;
156 start = end;
157 end = t;
158 }
159
160 u64 mask = (1 << (end - start + 1)) - 1;
161 return (m_Bits >> start) & static_cast<IntType>(mask);
162 }
163
164private:
165 const IntType& m_Bits;
166};
167
/// Encoding of a bounded integer value: plain bits only, or plain bits combined with a quint
/// (spelled `Qus32` in this file) or a trit.
168enum class IntegerEncoding { JustBits, Qus32, Trit };
169
170struct IntegerEncodedValue {
171 constexpr IntegerEncodedValue() = default;
172
173 constexpr IntegerEncodedValue(IntegerEncoding encoding_, u32 num_bits_)
174 : encoding{encoding_}, num_bits{num_bits_} {}
175
176 constexpr bool MatchesEncoding(const IntegerEncodedValue& other) const {
177 return encoding == other.encoding && num_bits == other.num_bits;
178 }
179
180 // Returns the number of bits required to encode nVals values.
181 u32 GetBitLength(u32 nVals) const {
182 u32 totalBits = num_bits * nVals;
183 if (encoding == IntegerEncoding::Trit) {
184 totalBits += (nVals * 8 + 4) / 5;
185 } else if (encoding == IntegerEncoding::Qus32) {
186 totalBits += (nVals * 7 + 2) / 3;
187 }
188 return totalBits;
189 }
190
191 IntegerEncoding encoding{};
192 u32 num_bits = 0;
193 u32 bit_value = 0;
194 union {
195 u32 qus32_value = 0;
196 u32 trit_value;
197 };
198};
// Fixed-capacity container (256 elements) for decoded values. It is configured with
// throw_on_overflow<false>, so callers must not insert more than 256 elements.
// NOTE(review): exact overflow behaviour is defined by Boost.Container - confirm there.
199using IntegerEncodedVector = boost::container::static_vector<
200 IntegerEncodedValue, 256,
201 boost::container::static_vector_options<
202 boost::container::inplace_alignment<alignof(IntegerEncodedValue)>,
203 boost::container::throw_on_overflow<false>>::type>;
204
205static void DecodeTritBlock(InputBitStream& bits, IntegerEncodedVector& result, u32 nBitsPerValue) {
206 // Implement the algorithm in section C.2.12
207 std::array<u32, 5> m;
208 std::array<u32, 5> t;
209 u32 T;
210
211 // Read the trit encoded block according to
212 // table C.2.14
213 m[0] = bits.ReadBits(nBitsPerValue);
214 T = bits.ReadBits<2>();
215 m[1] = bits.ReadBits(nBitsPerValue);
216 T |= bits.ReadBits<2>() << 2;
217 m[2] = bits.ReadBits(nBitsPerValue);
218 T |= bits.ReadBit() << 4;
219 m[3] = bits.ReadBits(nBitsPerValue);
220 T |= bits.ReadBits<2>() << 5;
221 m[4] = bits.ReadBits(nBitsPerValue);
222 T |= bits.ReadBit() << 7;
223
224 u32 C = 0;
225
226 Bits<u32> Tb(T);
227 if (Tb(2, 4) == 7) {
228 C = (Tb(5, 7) << 2) | Tb(0, 1);
229 t[4] = t[3] = 2;
230 } else {
231 C = Tb(0, 4);
232 if (Tb(5, 6) == 3) {
233 t[4] = 2;
234 t[3] = Tb[7];
235 } else {
236 t[4] = Tb[7];
237 t[3] = Tb(5, 6);
238 }
239 }
240
241 Bits<u32> Cb(C);
242 if (Cb(0, 1) == 3) {
243 t[2] = 2;
244 t[1] = Cb[4];
245 t[0] = (Cb[3] << 1) | (Cb[2] & ~Cb[3]);
246 } else if (Cb(2, 3) == 3) {
247 t[2] = 2;
248 t[1] = 2;
249 t[0] = Cb(0, 1);
250 } else {
251 t[2] = Cb[4];
252 t[1] = Cb(2, 3);
253 t[0] = (Cb[1] << 1) | (Cb[0] & ~Cb[1]);
254 }
255
256 for (std::size_t i = 0; i < 5; ++i) {
257 IntegerEncodedValue& val = result.emplace_back(IntegerEncoding::Trit, nBitsPerValue);
258 val.bit_value = m[i];
259 val.trit_value = t[i];
260 }
261}
262
263static void DecodeQus32Block(InputBitStream& bits, IntegerEncodedVector& result,
264 u32 nBitsPerValue) {
265 // Implement the algorithm in section C.2.12
266 u32 m[3];
267 u32 q[3];
268 u32 Q;
269
270 // Read the trit encoded block according to
271 // table C.2.15
272 m[0] = bits.ReadBits(nBitsPerValue);
273 Q = bits.ReadBits<3>();
274 m[1] = bits.ReadBits(nBitsPerValue);
275 Q |= bits.ReadBits<2>() << 3;
276 m[2] = bits.ReadBits(nBitsPerValue);
277 Q |= bits.ReadBits<2>() << 5;
278
279 Bits<u32> Qb(Q);
280 if (Qb(1, 2) == 3 && Qb(5, 6) == 0) {
281 q[0] = q[1] = 4;
282 q[2] = (Qb[0] << 2) | ((Qb[4] & ~Qb[0]) << 1) | (Qb[3] & ~Qb[0]);
283 } else {
284 u32 C = 0;
285 if (Qb(1, 2) == 3) {
286 q[2] = 4;
287 C = (Qb(3, 4) << 3) | ((~Qb(5, 6) & 3) << 1) | Qb[0];
288 } else {
289 q[2] = Qb(5, 6);
290 C = Qb(0, 4);
291 }
292
293 Bits<u32> Cb(C);
294 if (Cb(0, 2) == 5) {
295 q[1] = 4;
296 q[0] = Cb(3, 4);
297 } else {
298 q[1] = Cb(3, 4);
299 q[0] = Cb(0, 2);
300 }
301 }
302
303 for (std::size_t i = 0; i < 3; ++i) {
304 IntegerEncodedValue& val = result.emplace_back(IntegerEncoding::Qus32, nBitsPerValue);
305 val.bit_value = m[i];
306 val.qus32_value = q[i];
307 }
308}
309
310// Returns a new instance of this struct that corresponds to the
311// can take no more than maxval values
312static constexpr IntegerEncodedValue CreateEncoding(u32 maxVal) {
313 while (maxVal > 0) {
314 u32 check = maxVal + 1;
315
316 // Is maxVal a power of two?
317 if (!(check & (check - 1))) {
318 return IntegerEncodedValue(IntegerEncoding::JustBits, Popcnt(maxVal));
319 }
320
321 // Is maxVal of the type 3*2^n - 1?
322 if ((check % 3 == 0) && !((check / 3) & ((check / 3) - 1))) {
323 return IntegerEncodedValue(IntegerEncoding::Trit, Popcnt(check / 3 - 1));
324 }
325
326 // Is maxVal of the type 5*2^n - 1?
327 if ((check % 5 == 0) && !((check / 5) & ((check / 5) - 1))) {
328 return IntegerEncodedValue(IntegerEncoding::Qus32, Popcnt(check / 5 - 1));
329 }
330
331 // Apparently it can't be represented with a bounded integer sequence...
332 // just iterate.
333 maxVal--;
334 }
335 return IntegerEncodedValue(IntegerEncoding::JustBits, 0);
336}
337
338static constexpr std::array<IntegerEncodedValue, 256> MakeEncodedValues() {
339 std::array<IntegerEncodedValue, 256> encodings{};
340 for (std::size_t i = 0; i < encodings.size(); ++i) {
341 encodings[i] = CreateEncoding(static_cast<u32>(i));
342 }
343 return encodings;
344}
345
346static constexpr std::array EncodingsValues = MakeEncodedValues();
347
348// Fills result with the values that are encoded in the given
349// bitstream. We must know beforehand what the maximum possible
350// value is, and how many values we're decoding.
351static void DecodeIntegerSequence(IntegerEncodedVector& result, InputBitStream& bits, u32 maxRange,
352 u32 nValues) {
353 // Determine encoding parameters
354 IntegerEncodedValue val = EncodingsValues[maxRange];
355
356 // Start decoding
357 u32 nValsDecoded = 0;
358 while (nValsDecoded < nValues) {
359 switch (val.encoding) {
360 case IntegerEncoding::Qus32:
361 DecodeQus32Block(bits, result, val.num_bits);
362 nValsDecoded += 3;
363 break;
364
365 case IntegerEncoding::Trit:
366 DecodeTritBlock(bits, result, val.num_bits);
367 nValsDecoded += 5;
368 break;
369
370 case IntegerEncoding::JustBits:
371 val.bit_value = bits.ReadBits(val.num_bits);
372 result.push_back(val);
373 nValsDecoded++;
374 break;
375 }
376 }
377}
378
379namespace ASTCC {
380
381struct TexelWeightParams {
382 u32 m_Width = 0;
383 u32 m_Height = 0;
384 bool m_bDualPlane = false;
385 u32 m_MaxWeight = 0;
386 bool m_bError = false;
387 bool m_bVoidExtentLDR = false;
388 bool m_bVoidExtentHDR = false;
389
390 u32 GetPackedBitSize() const {
391 // How many indices do we have?
392 u32 nIdxs = m_Height * m_Width;
393 if (m_bDualPlane) {
394 nIdxs *= 2;
395 }
396
397 return EncodingsValues[m_MaxWeight].GetBitLength(nIdxs);
398 }
399
400 u32 GetNumWeightValues() const {
401 u32 ret = m_Width * m_Height;
402 if (m_bDualPlane) {
403 ret *= 2;
404 }
405 return ret;
406 }
407};
408
409static TexelWeightParams DecodeBlockInfo(InputBitStream& strm) {
410 TexelWeightParams params;
411
412 // Read the entire block mode all at once
413 u16 modeBits = static_cast<u16>(strm.ReadBits<11>());
414
415 // Does this match the void extent block mode?
416 if ((modeBits & 0x01FF) == 0x1FC) {
417 if (modeBits & 0x200) {
418 params.m_bVoidExtentHDR = true;
419 } else {
420 params.m_bVoidExtentLDR = true;
421 }
422
423 // Next two bits must be one.
424 if (!(modeBits & 0x400) || !strm.ReadBit()) {
425 params.m_bError = true;
426 }
427
428 return params;
429 }
430
431 // First check if the last four bits are zero
432 if ((modeBits & 0xF) == 0) {
433 params.m_bError = true;
434 return params;
435 }
436
437 // If the last two bits are zero, then if bits
438 // [6-8] are all ones, this is also reserved.
439 if ((modeBits & 0x3) == 0 && (modeBits & 0x1C0) == 0x1C0) {
440 params.m_bError = true;
441 return params;
442 }
443
444 // Otherwise, there is no error... Figure out the layout
445 // of the block mode. Layout is determined by a number
446 // between 0 and 9 corresponding to table C.2.8 of the
447 // ASTC spec.
448 u32 layout = 0;
449
450 if ((modeBits & 0x1) || (modeBits & 0x2)) {
451 // layout is in [0-4]
452 if (modeBits & 0x8) {
453 // layout is in [2-4]
454 if (modeBits & 0x4) {
455 // layout is in [3-4]
456 if (modeBits & 0x100) {
457 layout = 4;
458 } else {
459 layout = 3;
460 }
461 } else {
462 layout = 2;
463 }
464 } else {
465 // layout is in [0-1]
466 if (modeBits & 0x4) {
467 layout = 1;
468 } else {
469 layout = 0;
470 }
471 }
472 } else {
473 // layout is in [5-9]
474 if (modeBits & 0x100) {
475 // layout is in [7-9]
476 if (modeBits & 0x80) {
477 // layout is in [7-8]
478 assert((modeBits & 0x40) == 0U);
479 if (modeBits & 0x20) {
480 layout = 8;
481 } else {
482 layout = 7;
483 }
484 } else {
485 layout = 9;
486 }
487 } else {
488 // layout is in [5-6]
489 if (modeBits & 0x80) {
490 layout = 6;
491 } else {
492 layout = 5;
493 }
494 }
495 }
496
497 assert(layout < 10);
498
499 // Determine R
500 u32 R = !!(modeBits & 0x10);
501 if (layout < 5) {
502 R |= (modeBits & 0x3) << 1;
503 } else {
504 R |= (modeBits & 0xC) >> 1;
505 }
506 assert(2 <= R && R <= 7);
507
508 // Determine width & height
509 switch (layout) {
510 case 0: {
511 u32 A = (modeBits >> 5) & 0x3;
512 u32 B = (modeBits >> 7) & 0x3;
513 params.m_Width = B + 4;
514 params.m_Height = A + 2;
515 break;
516 }
517
518 case 1: {
519 u32 A = (modeBits >> 5) & 0x3;
520 u32 B = (modeBits >> 7) & 0x3;
521 params.m_Width = B + 8;
522 params.m_Height = A + 2;
523 break;
524 }
525
526 case 2: {
527 u32 A = (modeBits >> 5) & 0x3;
528 u32 B = (modeBits >> 7) & 0x3;
529 params.m_Width = A + 2;
530 params.m_Height = B + 8;
531 break;
532 }
533
534 case 3: {
535 u32 A = (modeBits >> 5) & 0x3;
536 u32 B = (modeBits >> 7) & 0x1;
537 params.m_Width = A + 2;
538 params.m_Height = B + 6;
539 break;
540 }
541
542 case 4: {
543 u32 A = (modeBits >> 5) & 0x3;
544 u32 B = (modeBits >> 7) & 0x1;
545 params.m_Width = B + 2;
546 params.m_Height = A + 2;
547 break;
548 }
549
550 case 5: {
551 u32 A = (modeBits >> 5) & 0x3;
552 params.m_Width = 12;
553 params.m_Height = A + 2;
554 break;
555 }
556
557 case 6: {
558 u32 A = (modeBits >> 5) & 0x3;
559 params.m_Width = A + 2;
560 params.m_Height = 12;
561 break;
562 }
563
564 case 7: {
565 params.m_Width = 6;
566 params.m_Height = 10;
567 break;
568 }
569
570 case 8: {
571 params.m_Width = 10;
572 params.m_Height = 6;
573 break;
574 }
575
576 case 9: {
577 u32 A = (modeBits >> 5) & 0x3;
578 u32 B = (modeBits >> 9) & 0x3;
579 params.m_Width = A + 6;
580 params.m_Height = B + 6;
581 break;
582 }
583
584 default:
585 assert(false && "Don't know this layout...");
586 params.m_bError = true;
587 break;
588 }
589
590 // Determine whether or not we're using dual planes
591 // and/or high precision layouts.
592 bool D = (layout != 9) && (modeBits & 0x400);
593 bool H = (layout != 9) && (modeBits & 0x200);
594
595 if (H) {
596 const u32 maxWeights[6] = {9, 11, 15, 19, 23, 31};
597 params.m_MaxWeight = maxWeights[R - 2];
598 } else {
599 const u32 maxWeights[6] = {1, 2, 3, 4, 5, 7};
600 params.m_MaxWeight = maxWeights[R - 2];
601 }
602
603 params.m_bDualPlane = D;
604
605 return params;
606}
607
608static void FillVoidExtentLDR(InputBitStream& strm, std::span<u32> outBuf, u32 blockWidth,
609 u32 blockHeight) {
610 // Don't actually care about the void extent, just read the bits...
611 for (s32 i = 0; i < 4; ++i) {
612 strm.ReadBits<13>();
613 }
614
615 // Decode the RGBA components and renormalize them to the range [0, 255]
616 u16 r = static_cast<u16>(strm.ReadBits<16>());
617 u16 g = static_cast<u16>(strm.ReadBits<16>());
618 u16 b = static_cast<u16>(strm.ReadBits<16>());
619 u16 a = static_cast<u16>(strm.ReadBits<16>());
620
621 u32 rgba = (r >> 8) | (g & 0xFF00) | (static_cast<u32>(b) & 0xFF00) << 8 |
622 (static_cast<u32>(a) & 0xFF00) << 16;
623
624 for (u32 j = 0; j < blockHeight; j++) {
625 for (u32 i = 0; i < blockWidth; i++) {
626 outBuf[j * blockWidth + i] = rgba;
627 }
628 }
629}
630
631static void FillError(std::span<u32> outBuf, u32 blockWidth, u32 blockHeight) {
632 for (u32 j = 0; j < blockHeight; j++) {
633 for (u32 i = 0; i < blockWidth; i++) {
634 outBuf[j * blockWidth + i] = 0xFFFF00FF;
635 }
636 }
637}
638
639// Replicates low numBits such that [(toBit - 1):(toBit - 1 - fromBit)]
640// is the same as [(numBits - 1):0] and repeats all the way down.
641template <typename IntType>
642static constexpr IntType Replicate(IntType val, u32 numBits, u32 toBit) {
643 if (numBits == 0) {
644 return 0;
645 }
646 if (toBit == 0) {
647 return 0;
648 }
649 const IntType v = val & static_cast<IntType>((1 << numBits) - 1);
650 IntType res = v;
651 u32 reslen = numBits;
652 while (reslen < toBit) {
653 u32 comp = 0;
654 if (numBits > toBit - reslen) {
655 u32 newshift = toBit - reslen;
656 comp = numBits - newshift;
657 numBits = newshift;
658 }
659 res = static_cast<IntType>(res << numBits);
660 res = static_cast<IntType>(res | (v >> comp));
661 reslen += numBits;
662 }
663 return res;
664}
665
666static constexpr std::size_t NumReplicateEntries(u32 num_bits) {
667 return std::size_t(1) << num_bits;
668}
669
670template <typename IntType, u32 num_bits, u32 to_bit>
671static constexpr auto MakeReplicateTable() {
672 std::array<IntType, NumReplicateEntries(num_bits)> table{};
673 for (IntType value = 0; value < static_cast<IntType>(std::size(table)); ++value) {
674 table[value] = Replicate(value, num_bits, to_bit);
675 }
676 return table;
677}
678
679static constexpr auto REPLICATE_BYTE_TO_16_TABLE = MakeReplicateTable<u32, 8, 16>();
680static constexpr u32 ReplicateByteTo16(std::size_t value) {
681 return REPLICATE_BYTE_TO_16_TABLE[value];
682}
683
684static constexpr auto REPLICATE_BIT_TO_7_TABLE = MakeReplicateTable<u32, 1, 7>();
685static constexpr u32 ReplicateBitTo7(std::size_t value) {
686 return REPLICATE_BIT_TO_7_TABLE[value];
687}
688
689static constexpr auto REPLICATE_BIT_TO_9_TABLE = MakeReplicateTable<u32, 1, 9>();
690static constexpr u32 ReplicateBitTo9(std::size_t value) {
691 return REPLICATE_BIT_TO_9_TABLE[value];
692}
693
694static constexpr auto REPLICATE_1_BIT_TO_8_TABLE = MakeReplicateTable<u32, 1, 8>();
695static constexpr auto REPLICATE_2_BIT_TO_8_TABLE = MakeReplicateTable<u32, 2, 8>();
696static constexpr auto REPLICATE_3_BIT_TO_8_TABLE = MakeReplicateTable<u32, 3, 8>();
697static constexpr auto REPLICATE_4_BIT_TO_8_TABLE = MakeReplicateTable<u32, 4, 8>();
698static constexpr auto REPLICATE_5_BIT_TO_8_TABLE = MakeReplicateTable<u32, 5, 8>();
699static constexpr auto REPLICATE_6_BIT_TO_8_TABLE = MakeReplicateTable<u32, 6, 8>();
700static constexpr auto REPLICATE_7_BIT_TO_8_TABLE = MakeReplicateTable<u32, 7, 8>();
701static constexpr auto REPLICATE_8_BIT_TO_8_TABLE = MakeReplicateTable<u32, 8, 8>();
702/// Use a precompiled table with the most common usages, if it's not in the expected range, fallback
703/// to the runtime implementation
704static constexpr u32 FastReplicateTo8(u32 value, u32 num_bits) {
705 switch (num_bits) {
706 case 1:
707 return REPLICATE_1_BIT_TO_8_TABLE[value];
708 case 2:
709 return REPLICATE_2_BIT_TO_8_TABLE[value];
710 case 3:
711 return REPLICATE_3_BIT_TO_8_TABLE[value];
712 case 4:
713 return REPLICATE_4_BIT_TO_8_TABLE[value];
714 case 5:
715 return REPLICATE_5_BIT_TO_8_TABLE[value];
716 case 6:
717 return REPLICATE_6_BIT_TO_8_TABLE[value];
718 case 7:
719 return REPLICATE_7_BIT_TO_8_TABLE[value];
720 case 8:
721 return REPLICATE_8_BIT_TO_8_TABLE[value];
722 default:
723 return Replicate(value, num_bits, 8);
724 }
725}
726
727static constexpr auto REPLICATE_1_BIT_TO_6_TABLE = MakeReplicateTable<u32, 1, 6>();
728static constexpr auto REPLICATE_2_BIT_TO_6_TABLE = MakeReplicateTable<u32, 2, 6>();
729static constexpr auto REPLICATE_3_BIT_TO_6_TABLE = MakeReplicateTable<u32, 3, 6>();
730static constexpr auto REPLICATE_4_BIT_TO_6_TABLE = MakeReplicateTable<u32, 4, 6>();
731static constexpr auto REPLICATE_5_BIT_TO_6_TABLE = MakeReplicateTable<u32, 5, 6>();
732static constexpr u32 FastReplicateTo6(u32 value, u32 num_bits) {
733 switch (num_bits) {
734 case 1:
735 return REPLICATE_1_BIT_TO_6_TABLE[value];
736 case 2:
737 return REPLICATE_2_BIT_TO_6_TABLE[value];
738 case 3:
739 return REPLICATE_3_BIT_TO_6_TABLE[value];
740 case 4:
741 return REPLICATE_4_BIT_TO_6_TABLE[value];
742 case 5:
743 return REPLICATE_5_BIT_TO_6_TABLE[value];
744 default:
745 return Replicate(value, num_bits, 6);
746 }
747}
748
749class Pixel {
750protected:
751 using ChannelType = s16;
752 u8 m_BitDepth[4] = {8, 8, 8, 8};
753 s16 color[4] = {};
754
755public:
756 Pixel() = default;
757 Pixel(u32 a, u32 r, u32 g, u32 b, u32 bitDepth = 8)
758 : m_BitDepth{u8(bitDepth), u8(bitDepth), u8(bitDepth), u8(bitDepth)},
759 color{static_cast<ChannelType>(a), static_cast<ChannelType>(r),
760 static_cast<ChannelType>(g), static_cast<ChannelType>(b)} {}
761
762 // Changes the depth of each pixel. This scales the values to
763 // the appropriate bit depth by either truncating the least
764 // significant bits when going from larger to smaller bit depth
765 // or by repeating the most significant bits when going from
766 // smaller to larger bit depths.
767 void ChangeBitDepth() {
768 for (u32 i = 0; i < 4; i++) {
769 Component(i) = ChangeBitDepth(Component(i), m_BitDepth[i]);
770 m_BitDepth[i] = 8;
771 }
772 }
773
774 template <typename IntType>
775 static float ConvertChannelToFloat(IntType channel, u8 bitDepth) {
776 float denominator = static_cast<float>((1 << bitDepth) - 1);
777 return static_cast<float>(channel) / denominator;
778 }
779
780 // Changes the bit depth of a single component. See the comment
781 // above for how we do this.
782 static ChannelType ChangeBitDepth(Pixel::ChannelType val, u8 oldDepth) {
783 assert(oldDepth <= 8);
784
785 if (oldDepth == 8) {
786 // Do nothing
787 return val;
788 } else if (oldDepth == 0) {
789 return static_cast<ChannelType>((1 << 8) - 1);
790 } else if (8 > oldDepth) {
791 return static_cast<ChannelType>(FastReplicateTo8(static_cast<u32>(val), oldDepth));
792 } else {
793 // oldDepth > newDepth
794 const u8 bitsWasted = static_cast<u8>(oldDepth - 8);
795 u16 v = static_cast<u16>(val);
796 v = static_cast<u16>((v + (1 << (bitsWasted - 1))) >> bitsWasted);
797 v = ::std::min<u16>(::std::max<u16>(0, v), static_cast<u16>((1 << 8) - 1));
798 return static_cast<u8>(v);
799 }
800
801 assert(false && "We shouldn't get here.");
802 return 0;
803 }
804
805 const ChannelType& A() const {
806 return color[0];
807 }
808 ChannelType& A() {
809 return color[0];
810 }
811 const ChannelType& R() const {
812 return color[1];
813 }
814 ChannelType& R() {
815 return color[1];
816 }
817 const ChannelType& G() const {
818 return color[2];
819 }
820 ChannelType& G() {
821 return color[2];
822 }
823 const ChannelType& B() const {
824 return color[3];
825 }
826 ChannelType& B() {
827 return color[3];
828 }
829 const ChannelType& Component(u32 idx) const {
830 return color[idx];
831 }
832 ChannelType& Component(u32 idx) {
833 return color[idx];
834 }
835
836 void GetBitDepth(u8 (&outDepth)[4]) const {
837 for (s32 i = 0; i < 4; i++) {
838 outDepth[i] = m_BitDepth[i];
839 }
840 }
841
842 // Take all of the components, transform them to their 8-bit variants,
843 // and then pack each channel into an R8G8B8A8 32-bit integer. We assume
844 // that the architecture is little-endian, so the alpha channel will end
845 // up in the most-significant byte.
846 u32 Pack() const {
847 Pixel eightBit(*this);
848 eightBit.ChangeBitDepth();
849
850 u32 r = 0;
851 r |= eightBit.A();
852 r <<= 8;
853 r |= eightBit.B();
854 r <<= 8;
855 r |= eightBit.G();
856 r <<= 8;
857 r |= eightBit.R();
858 return r;
859 }
860
861 // Clamps the pixel to the range [0,255]
862 void ClampByte() {
863 for (u32 i = 0; i < 4; i++) {
864 color[i] = (color[i] < 0) ? 0 : ((color[i] > 255) ? 255 : color[i]);
865 }
866 }
867
868 void MakeOpaque() {
869 A() = 255;
870 }
871};
872
873static void DecodeColorValues(u32* out, std::span<u8> data, const u32* modes, const u32 nPartitions,
874 const u32 nBitsForColorData) {
875 // First figure out how many color values we have
876 u32 nValues = 0;
877 for (u32 i = 0; i < nPartitions; i++) {
878 nValues += ((modes[i] >> 2) + 1) << 1;
879 }
880
881 // Then based on the number of values and the remaining number of bits,
882 // figure out the max value for each of them...
883 u32 range = 256;
884 while (--range > 0) {
885 IntegerEncodedValue val = EncodingsValues[range];
886 u32 bitLength = val.GetBitLength(nValues);
887 if (bitLength <= nBitsForColorData) {
888 // Find the smallest possible range that matches the given encoding
889 while (--range > 0) {
890 IntegerEncodedValue newval = EncodingsValues[range];
891 if (!newval.MatchesEncoding(val)) {
892 break;
893 }
894 }
895
896 // Return to last matching range.
897 range++;
898 break;
899 }
900 }
901
902 // We now have enough to decode our integer sequence.
903 IntegerEncodedVector decodedColorValues;
904
905 InputBitStream colorStream(data, 0);
906 DecodeIntegerSequence(decodedColorValues, colorStream, range, nValues);
907
908 // Once we have the decoded values, we need to dequantize them to the 0-255 range
909 // This procedure is outlined in ASTC spec C.2.13
910 u32 outIdx = 0;
911 for (auto itr = decodedColorValues.begin(); itr != decodedColorValues.end(); ++itr) {
912 // Have we already decoded all that we need?
913 if (outIdx >= nValues) {
914 break;
915 }
916
917 const IntegerEncodedValue& val = *itr;
918 u32 bitlen = val.num_bits;
919 u32 bitval = val.bit_value;
920
921 assert(bitlen >= 1);
922
923 u32 A = 0, B = 0, C = 0, D = 0;
924 // A is just the lsb replicated 9 times.
925 A = ReplicateBitTo9(bitval & 1);
926
927 switch (val.encoding) {
928 // Replicate bits
929 case IntegerEncoding::JustBits:
930 out[outIdx++] = FastReplicateTo8(bitval, bitlen);
931 break;
932
933 // Use algorithm in C.2.13
934 case IntegerEncoding::Trit: {
935
936 D = val.trit_value;
937
938 switch (bitlen) {
939 case 1: {
940 C = 204;
941 } break;
942
943 case 2: {
944 C = 93;
945 // B = b000b0bb0
946 u32 b = (bitval >> 1) & 1;
947 B = (b << 8) | (b << 4) | (b << 2) | (b << 1);
948 } break;
949
950 case 3: {
951 C = 44;
952 // B = cb000cbcb
953 u32 cb = (bitval >> 1) & 3;
954 B = (cb << 7) | (cb << 2) | cb;
955 } break;
956
957 case 4: {
958 C = 22;
959 // B = dcb000dcb
960 u32 dcb = (bitval >> 1) & 7;
961 B = (dcb << 6) | dcb;
962 } break;
963
964 case 5: {
965 C = 11;
966 // B = edcb000ed
967 u32 edcb = (bitval >> 1) & 0xF;
968 B = (edcb << 5) | (edcb >> 2);
969 } break;
970
971 case 6: {
972 C = 5;
973 // B = fedcb000f
974 u32 fedcb = (bitval >> 1) & 0x1F;
975 B = (fedcb << 4) | (fedcb >> 4);
976 } break;
977
978 default:
979 assert(false && "Unsupported trit encoding for color values!");
980 break;
981 } // switch(bitlen)
982 } // case IntegerEncoding::Trit
983 break;
984
985 case IntegerEncoding::Qus32: {
986
987 D = val.qus32_value;
988
989 switch (bitlen) {
990 case 1: {
991 C = 113;
992 } break;
993
994 case 2: {
995 C = 54;
996 // B = b0000bb00
997 u32 b = (bitval >> 1) & 1;
998 B = (b << 8) | (b << 3) | (b << 2);
999 } break;
1000
1001 case 3: {
1002 C = 26;
1003 // B = cb0000cbc
1004 u32 cb = (bitval >> 1) & 3;
1005 B = (cb << 7) | (cb << 1) | (cb >> 1);
1006 } break;
1007
1008 case 4: {
1009 C = 13;
1010 // B = dcb0000dc
1011 u32 dcb = (bitval >> 1) & 7;
1012 B = (dcb << 6) | (dcb >> 1);
1013 } break;
1014
1015 case 5: {
1016 C = 6;
1017 // B = edcb0000e
1018 u32 edcb = (bitval >> 1) & 0xF;
1019 B = (edcb << 5) | (edcb >> 3);
1020 } break;
1021
1022 default:
1023 assert(false && "Unsupported quint encoding for color values!");
1024 break;
1025 } // switch(bitlen)
1026 } // case IntegerEncoding::Qus32
1027 break;
1028 } // switch(val.encoding)
1029
1030 if (val.encoding != IntegerEncoding::JustBits) {
1031 u32 T = D * C + B;
1032 T ^= A;
1033 T = (A & 0x80) | (T >> 2);
1034 out[outIdx++] = T;
1035 }
1036 }
1037
1038 // Make sure that each of our values is in the proper range...
1039 for (u32 i = 0; i < nValues; i++) {
1040 assert(out[i] <= 255);
1041 }
1042}
1043
1044static u32 UnquantizeTexelWeight(const IntegerEncodedValue& val) {
1045 u32 bitval = val.bit_value;
1046 u32 bitlen = val.num_bits;
1047
1048 u32 A = ReplicateBitTo7(bitval & 1);
1049 u32 B = 0, C = 0, D = 0;
1050
1051 u32 result = 0;
1052 switch (val.encoding) {
1053 case IntegerEncoding::JustBits:
1054 result = FastReplicateTo6(bitval, bitlen);
1055 break;
1056
1057 case IntegerEncoding::Trit: {
1058 D = val.trit_value;
1059 assert(D < 3);
1060
1061 switch (bitlen) {
1062 case 0: {
1063 u32 results[3] = {0, 32, 63};
1064 result = results[D];
1065 } break;
1066
1067 case 1: {
1068 C = 50;
1069 } break;
1070
1071 case 2: {
1072 C = 23;
1073 u32 b = (bitval >> 1) & 1;
1074 B = (b << 6) | (b << 2) | b;
1075 } break;
1076
1077 case 3: {
1078 C = 11;
1079 u32 cb = (bitval >> 1) & 3;
1080 B = (cb << 5) | cb;
1081 } break;
1082
1083 default:
1084 assert(false && "Invalid trit encoding for texel weight");
1085 break;
1086 }
1087 } break;
1088
1089 case IntegerEncoding::Qus32: {
1090 D = val.qus32_value;
1091 assert(D < 5);
1092
1093 switch (bitlen) {
1094 case 0: {
1095 u32 results[5] = {0, 16, 32, 47, 63};
1096 result = results[D];
1097 } break;
1098
1099 case 1: {
1100 C = 28;
1101 } break;
1102
1103 case 2: {
1104 C = 13;
1105 u32 b = (bitval >> 1) & 1;
1106 B = (b << 6) | (b << 1);
1107 } break;
1108
1109 default:
1110 assert(false && "Invalid quint encoding for texel weight");
1111 break;
1112 }
1113 } break;
1114 }
1115
1116 if (val.encoding != IntegerEncoding::JustBits && bitlen > 0) {
1117 // Decode the value...
1118 result = D * C + B;
1119 result ^= A;
1120 result = (A & 0x20) | (result >> 2);
1121 }
1122
1123 assert(result < 64);
1124
1125 // Change from [0,63] to [0,64]
1126 if (result > 32) {
1127 result += 1;
1128 }
1129
1130 return result;
1131}
1132
1133static void UnquantizeTexelWeights(u32 out[2][144], const IntegerEncodedVector& weights,
1134 const TexelWeightParams& params, const u32 blockWidth,
1135 const u32 blockHeight) {
1136 u32 weightIdx = 0;
1137 u32 unquantized[2][144];
1138
1139 for (auto itr = weights.begin(); itr != weights.end(); ++itr) {
1140 unquantized[0][weightIdx] = UnquantizeTexelWeight(*itr);
1141
1142 if (params.m_bDualPlane) {
1143 ++itr;
1144 unquantized[1][weightIdx] = UnquantizeTexelWeight(*itr);
1145 if (itr == weights.end()) {
1146 break;
1147 }
1148 }
1149
1150 if (++weightIdx >= (params.m_Width * params.m_Height))
1151 break;
1152 }
1153
1154 // Do infill if necessary (Section C.2.18) ...
1155 u32 Ds = (1024 + (blockWidth / 2)) / (blockWidth - 1);
1156 u32 Dt = (1024 + (blockHeight / 2)) / (blockHeight - 1);
1157
1158 const u32 kPlaneScale = params.m_bDualPlane ? 2U : 1U;
1159 for (u32 plane = 0; plane < kPlaneScale; plane++)
1160 for (u32 t = 0; t < blockHeight; t++)
1161 for (u32 s = 0; s < blockWidth; s++) {
1162 u32 cs = Ds * s;
1163 u32 ct = Dt * t;
1164
1165 u32 gs = (cs * (params.m_Width - 1) + 32) >> 6;
1166 u32 gt = (ct * (params.m_Height - 1) + 32) >> 6;
1167
1168 u32 js = gs >> 4;
1169 u32 fs = gs & 0xF;
1170
1171 u32 jt = gt >> 4;
1172 u32 ft = gt & 0x0F;
1173
1174 u32 w11 = (fs * ft + 8) >> 4;
1175 u32 w10 = ft - w11;
1176 u32 w01 = fs - w11;
1177 u32 w00 = 16 - fs - ft + w11;
1178
1179 u32 v0 = js + jt * params.m_Width;
1180
1181#define FIND_TEXEL(tidx, bidx) \
1182 u32 p##bidx = 0; \
1183 do { \
1184 if ((tidx) < (params.m_Width * params.m_Height)) { \
1185 p##bidx = unquantized[plane][(tidx)]; \
1186 } \
1187 } while (0)
1188
1189 FIND_TEXEL(v0, 00);
1190 FIND_TEXEL(v0 + 1, 01);
1191 FIND_TEXEL(v0 + params.m_Width, 10);
1192 FIND_TEXEL(v0 + params.m_Width + 1, 11);
1193
1194#undef FIND_TEXEL
1195
1196 out[plane][t * blockWidth + s] =
1197 (p00 * w00 + p01 * w01 + p10 * w10 + p11 * w11 + 8) >> 4;
1198 }
1199}
1200
1201// Transfers a bit as described in C.2.14
1202static inline void BitTransferSigned(s32& a, s32& b) {
1203 b >>= 1;
1204 b |= a & 0x80;
1205 a >>= 1;
1206 a &= 0x3F;
1207 if (a & 0x20)
1208 a -= 0x40;
1209}
1210
1211// Adds more precision to the blue channel as described
1212// in C.2.14
1213static inline Pixel BlueContract(s32 a, s32 r, s32 g, s32 b) {
1214 return Pixel(static_cast<s16>(a), static_cast<s16>((r + b) >> 1),
1215 static_cast<s16>((g + b) >> 1), static_cast<s16>(b));
1216}
1217
1218// Partition selection functions as specified in
1219// C.2.21
1220static inline u32 hash52(u32 p) {
1221 p ^= p >> 15;
1222 p -= p << 17;
1223 p += p << 7;
1224 p += p << 4;
1225 p ^= p >> 5;
1226 p += p << 16;
1227 p ^= p >> 7;
1228 p ^= p >> 3;
1229 p ^= p << 6;
1230 p ^= p >> 17;
1231 return p;
1232}
1233
1234static u32 SelectPartition(s32 seed, s32 x, s32 y, s32 z, s32 partitionCount, s32 smallBlock) {
1235 if (1 == partitionCount)
1236 return 0;
1237
1238 if (smallBlock) {
1239 x <<= 1;
1240 y <<= 1;
1241 z <<= 1;
1242 }
1243
1244 seed += (partitionCount - 1) * 1024;
1245
1246 u32 rnum = hash52(static_cast<u32>(seed));
1247 u8 seed1 = static_cast<u8>(rnum & 0xF);
1248 u8 seed2 = static_cast<u8>((rnum >> 4) & 0xF);
1249 u8 seed3 = static_cast<u8>((rnum >> 8) & 0xF);
1250 u8 seed4 = static_cast<u8>((rnum >> 12) & 0xF);
1251 u8 seed5 = static_cast<u8>((rnum >> 16) & 0xF);
1252 u8 seed6 = static_cast<u8>((rnum >> 20) & 0xF);
1253 u8 seed7 = static_cast<u8>((rnum >> 24) & 0xF);
1254 u8 seed8 = static_cast<u8>((rnum >> 28) & 0xF);
1255 u8 seed9 = static_cast<u8>((rnum >> 18) & 0xF);
1256 u8 seed10 = static_cast<u8>((rnum >> 22) & 0xF);
1257 u8 seed11 = static_cast<u8>((rnum >> 26) & 0xF);
1258 u8 seed12 = static_cast<u8>(((rnum >> 30) | (rnum << 2)) & 0xF);
1259
1260 seed1 = static_cast<u8>(seed1 * seed1);
1261 seed2 = static_cast<u8>(seed2 * seed2);
1262 seed3 = static_cast<u8>(seed3 * seed3);
1263 seed4 = static_cast<u8>(seed4 * seed4);
1264 seed5 = static_cast<u8>(seed5 * seed5);
1265 seed6 = static_cast<u8>(seed6 * seed6);
1266 seed7 = static_cast<u8>(seed7 * seed7);
1267 seed8 = static_cast<u8>(seed8 * seed8);
1268 seed9 = static_cast<u8>(seed9 * seed9);
1269 seed10 = static_cast<u8>(seed10 * seed10);
1270 seed11 = static_cast<u8>(seed11 * seed11);
1271 seed12 = static_cast<u8>(seed12 * seed12);
1272
1273 s32 sh1, sh2, sh3;
1274 if (seed & 1) {
1275 sh1 = (seed & 2) ? 4 : 5;
1276 sh2 = (partitionCount == 3) ? 6 : 5;
1277 } else {
1278 sh1 = (partitionCount == 3) ? 6 : 5;
1279 sh2 = (seed & 2) ? 4 : 5;
1280 }
1281 sh3 = (seed & 0x10) ? sh1 : sh2;
1282
1283 seed1 = static_cast<u8>(seed1 >> sh1);
1284 seed2 = static_cast<u8>(seed2 >> sh2);
1285 seed3 = static_cast<u8>(seed3 >> sh1);
1286 seed4 = static_cast<u8>(seed4 >> sh2);
1287 seed5 = static_cast<u8>(seed5 >> sh1);
1288 seed6 = static_cast<u8>(seed6 >> sh2);
1289 seed7 = static_cast<u8>(seed7 >> sh1);
1290 seed8 = static_cast<u8>(seed8 >> sh2);
1291 seed9 = static_cast<u8>(seed9 >> sh3);
1292 seed10 = static_cast<u8>(seed10 >> sh3);
1293 seed11 = static_cast<u8>(seed11 >> sh3);
1294 seed12 = static_cast<u8>(seed12 >> sh3);
1295
1296 s32 a = seed1 * x + seed2 * y + seed11 * z + (rnum >> 14);
1297 s32 b = seed3 * x + seed4 * y + seed12 * z + (rnum >> 10);
1298 s32 c = seed5 * x + seed6 * y + seed9 * z + (rnum >> 6);
1299 s32 d = seed7 * x + seed8 * y + seed10 * z + (rnum >> 2);
1300
1301 a &= 0x3F;
1302 b &= 0x3F;
1303 c &= 0x3F;
1304 d &= 0x3F;
1305
1306 if (partitionCount < 4)
1307 d = 0;
1308 if (partitionCount < 3)
1309 c = 0;
1310
1311 if (a >= b && a >= c && a >= d)
1312 return 0;
1313 else if (b >= c && b >= d)
1314 return 1;
1315 else if (c >= d)
1316 return 2;
1317 return 3;
1318}
1319
1320static inline u32 Select2DPartition(s32 seed, s32 x, s32 y, s32 partitionCount, s32 smallBlock) {
1321 return SelectPartition(seed, x, y, 0, partitionCount, smallBlock);
1322}
1323
1324// Section C.2.14
1325static void ComputeEndpos32s(Pixel& ep1, Pixel& ep2, const u32*& colorValues,
1326 u32 colorEndpos32Mode) {
1327#define READ_UINT_VALUES(N) \
1328 u32 v[N]; \
1329 for (u32 i = 0; i < N; i++) { \
1330 v[i] = *(colorValues++); \
1331 }
1332
1333#define READ_INT_VALUES(N) \
1334 s32 v[N]; \
1335 for (u32 i = 0; i < N; i++) { \
1336 v[i] = static_cast<s32>(*(colorValues++)); \
1337 }
1338
1339 switch (colorEndpos32Mode) {
1340 case 0: {
1341 READ_UINT_VALUES(2)
1342 ep1 = Pixel(0xFF, v[0], v[0], v[0]);
1343 ep2 = Pixel(0xFF, v[1], v[1], v[1]);
1344 } break;
1345
1346 case 1: {
1347 READ_UINT_VALUES(2)
1348 u32 L0 = (v[0] >> 2) | (v[1] & 0xC0);
1349 u32 L1 = std::max(L0 + (v[1] & 0x3F), 0xFFU);
1350 ep1 = Pixel(0xFF, L0, L0, L0);
1351 ep2 = Pixel(0xFF, L1, L1, L1);
1352 } break;
1353
1354 case 4: {
1355 READ_UINT_VALUES(4)
1356 ep1 = Pixel(v[2], v[0], v[0], v[0]);
1357 ep2 = Pixel(v[3], v[1], v[1], v[1]);
1358 } break;
1359
1360 case 5: {
1361 READ_INT_VALUES(4)
1362 BitTransferSigned(v[1], v[0]);
1363 BitTransferSigned(v[3], v[2]);
1364 ep1 = Pixel(v[2], v[0], v[0], v[0]);
1365 ep2 = Pixel(v[2] + v[3], v[0] + v[1], v[0] + v[1], v[0] + v[1]);
1366 ep1.ClampByte();
1367 ep2.ClampByte();
1368 } break;
1369
1370 case 6: {
1371 READ_UINT_VALUES(4)
1372 ep1 = Pixel(0xFF, v[0] * v[3] >> 8, v[1] * v[3] >> 8, v[2] * v[3] >> 8);
1373 ep2 = Pixel(0xFF, v[0], v[1], v[2]);
1374 } break;
1375
1376 case 8: {
1377 READ_UINT_VALUES(6)
1378 if (v[1] + v[3] + v[5] >= v[0] + v[2] + v[4]) {
1379 ep1 = Pixel(0xFF, v[0], v[2], v[4]);
1380 ep2 = Pixel(0xFF, v[1], v[3], v[5]);
1381 } else {
1382 ep1 = BlueContract(0xFF, v[1], v[3], v[5]);
1383 ep2 = BlueContract(0xFF, v[0], v[2], v[4]);
1384 }
1385 } break;
1386
1387 case 9: {
1388 READ_INT_VALUES(6)
1389 BitTransferSigned(v[1], v[0]);
1390 BitTransferSigned(v[3], v[2]);
1391 BitTransferSigned(v[5], v[4]);
1392 if (v[1] + v[3] + v[5] >= 0) {
1393 ep1 = Pixel(0xFF, v[0], v[2], v[4]);
1394 ep2 = Pixel(0xFF, v[0] + v[1], v[2] + v[3], v[4] + v[5]);
1395 } else {
1396 ep1 = BlueContract(0xFF, v[0] + v[1], v[2] + v[3], v[4] + v[5]);
1397 ep2 = BlueContract(0xFF, v[0], v[2], v[4]);
1398 }
1399 ep1.ClampByte();
1400 ep2.ClampByte();
1401 } break;
1402
1403 case 10: {
1404 READ_UINT_VALUES(6)
1405 ep1 = Pixel(v[4], v[0] * v[3] >> 8, v[1] * v[3] >> 8, v[2] * v[3] >> 8);
1406 ep2 = Pixel(v[5], v[0], v[1], v[2]);
1407 } break;
1408
1409 case 12: {
1410 READ_UINT_VALUES(8)
1411 if (v[1] + v[3] + v[5] >= v[0] + v[2] + v[4]) {
1412 ep1 = Pixel(v[6], v[0], v[2], v[4]);
1413 ep2 = Pixel(v[7], v[1], v[3], v[5]);
1414 } else {
1415 ep1 = BlueContract(v[7], v[1], v[3], v[5]);
1416 ep2 = BlueContract(v[6], v[0], v[2], v[4]);
1417 }
1418 } break;
1419
1420 case 13: {
1421 READ_INT_VALUES(8)
1422 BitTransferSigned(v[1], v[0]);
1423 BitTransferSigned(v[3], v[2]);
1424 BitTransferSigned(v[5], v[4]);
1425 BitTransferSigned(v[7], v[6]);
1426 if (v[1] + v[3] + v[5] >= 0) {
1427 ep1 = Pixel(v[6], v[0], v[2], v[4]);
1428 ep2 = Pixel(v[7] + v[6], v[0] + v[1], v[2] + v[3], v[4] + v[5]);
1429 } else {
1430 ep1 = BlueContract(v[6] + v[7], v[0] + v[1], v[2] + v[3], v[4] + v[5]);
1431 ep2 = BlueContract(v[6], v[0], v[2], v[4]);
1432 }
1433 ep1.ClampByte();
1434 ep2.ClampByte();
1435 } break;
1436
1437 default:
1438 assert(false && "Unsupported color endpoint mode (is it HDR?)");
1439 break;
1440 }
1441
1442#undef READ_UINT_VALUES
1443#undef READ_INT_VALUES
1444}
1445
1446static void DecompressBlock(std::span<const u8, 16> inBuf, const u32 blockWidth,
1447 const u32 blockHeight, std::span<u32, 12 * 12> outBuf) {
1448 InputBitStream strm(inBuf);
1449 TexelWeightParams weightParams = DecodeBlockInfo(strm);
1450
1451 // Was there an error?
1452 if (weightParams.m_bError) {
1453 assert(false && "Invalid block mode");
1454 FillError(outBuf, blockWidth, blockHeight);
1455 return;
1456 }
1457
1458 if (weightParams.m_bVoidExtentLDR) {
1459 FillVoidExtentLDR(strm, outBuf, blockWidth, blockHeight);
1460 return;
1461 }
1462
1463 if (weightParams.m_bVoidExtentHDR) {
1464 assert(false && "HDR void extent blocks are unsupported!");
1465 FillError(outBuf, blockWidth, blockHeight);
1466 return;
1467 }
1468
1469 if (weightParams.m_Width > blockWidth) {
1470 assert(false && "Texel weight grid width should be smaller than block width");
1471 FillError(outBuf, blockWidth, blockHeight);
1472 return;
1473 }
1474
1475 if (weightParams.m_Height > blockHeight) {
1476 assert(false && "Texel weight grid height should be smaller than block height");
1477 FillError(outBuf, blockWidth, blockHeight);
1478 return;
1479 }
1480
1481 // Read num partitions
1482 u32 nPartitions = strm.ReadBits<2>() + 1;
1483 assert(nPartitions <= 4);
1484
1485 if (nPartitions == 4 && weightParams.m_bDualPlane) {
1486 assert(false && "Dual plane mode is incompatible with four partition blocks");
1487 FillError(outBuf, blockWidth, blockHeight);
1488 return;
1489 }
1490
1491 // Based on the number of partitions, read the color endpos32 mode for
1492 // each partition.
1493
1494 // Determine partitions, partition index, and color endpos32 modes
1495 s32 planeIdx = -1;
1496 u32 partitionIndex;
1497 u32 colorEndpos32Mode[4] = {0, 0, 0, 0};
1498
1499 // Define color data.
1500 u8 colorEndpos32Data[16];
1501 memset(colorEndpos32Data, 0, sizeof(colorEndpos32Data));
1502 OutputBitStream colorEndpos32Stream(colorEndpos32Data, 16 * 8, 0);
1503
1504 // Read extra config data...
1505 u32 baseCEM = 0;
1506 if (nPartitions == 1) {
1507 colorEndpos32Mode[0] = strm.ReadBits<4>();
1508 partitionIndex = 0;
1509 } else {
1510 partitionIndex = strm.ReadBits<10>();
1511 baseCEM = strm.ReadBits<6>();
1512 }
1513 u32 baseMode = (baseCEM & 3);
1514
1515 // Remaining bits are color endpos32 data...
1516 u32 nWeightBits = weightParams.GetPackedBitSize();
1517 s32 remainingBits = 128 - nWeightBits - static_cast<s32>(strm.GetBitsRead());
1518
1519 // Consider extra bits prior to texel data...
1520 u32 extraCEMbits = 0;
1521 if (baseMode) {
1522 switch (nPartitions) {
1523 case 2:
1524 extraCEMbits += 2;
1525 break;
1526 case 3:
1527 extraCEMbits += 5;
1528 break;
1529 case 4:
1530 extraCEMbits += 8;
1531 break;
1532 default:
1533 assert(false);
1534 break;
1535 }
1536 }
1537 remainingBits -= extraCEMbits;
1538
1539 // Do we have a dual plane situation?
1540 u32 planeSelectorBits = 0;
1541 if (weightParams.m_bDualPlane) {
1542 planeSelectorBits = 2;
1543 }
1544 remainingBits -= planeSelectorBits;
1545
1546 // Read color data...
1547 u32 colorDataBits = remainingBits;
1548 while (remainingBits > 0) {
1549 u32 nb = std::min(remainingBits, 8);
1550 u32 b = strm.ReadBits(nb);
1551 colorEndpos32Stream.WriteBits(b, nb);
1552 remainingBits -= 8;
1553 }
1554
1555 // Read the plane selection bits
1556 planeIdx = strm.ReadBits(planeSelectorBits);
1557
1558 // Read the rest of the CEM
1559 if (baseMode) {
1560 u32 extraCEM = strm.ReadBits(extraCEMbits);
1561 u32 CEM = (extraCEM << 6) | baseCEM;
1562 CEM >>= 2;
1563
1564 bool C[4] = {0};
1565 for (u32 i = 0; i < nPartitions; i++) {
1566 C[i] = CEM & 1;
1567 CEM >>= 1;
1568 }
1569
1570 u8 M[4] = {0};
1571 for (u32 i = 0; i < nPartitions; i++) {
1572 M[i] = CEM & 3;
1573 CEM >>= 2;
1574 assert(M[i] <= 3);
1575 }
1576
1577 for (u32 i = 0; i < nPartitions; i++) {
1578 colorEndpos32Mode[i] = baseMode;
1579 if (!(C[i]))
1580 colorEndpos32Mode[i] -= 1;
1581 colorEndpos32Mode[i] <<= 2;
1582 colorEndpos32Mode[i] |= M[i];
1583 }
1584 } else if (nPartitions > 1) {
1585 u32 CEM = baseCEM >> 2;
1586 for (u32 i = 0; i < nPartitions; i++) {
1587 colorEndpos32Mode[i] = CEM;
1588 }
1589 }
1590
1591 // Make sure everything up till here is sane.
1592 for (u32 i = 0; i < nPartitions; i++) {
1593 assert(colorEndpos32Mode[i] < 16);
1594 }
1595 assert(strm.GetBitsRead() + weightParams.GetPackedBitSize() == 128);
1596
1597 // Decode both color data and texel weight data
1598 u32 colorValues[32]; // Four values, two endpos32s, four maximum paritions
1599 DecodeColorValues(colorValues, colorEndpos32Data, colorEndpos32Mode, nPartitions,
1600 colorDataBits);
1601
1602 Pixel endpos32s[4][2];
1603 const u32* colorValuesPtr = colorValues;
1604 for (u32 i = 0; i < nPartitions; i++) {
1605 ComputeEndpos32s(endpos32s[i][0], endpos32s[i][1], colorValuesPtr, colorEndpos32Mode[i]);
1606 }
1607
1608 // Read the texel weight data..
1609 std::array<u8, 16> texelWeightData;
1610 std::ranges::copy(inBuf, texelWeightData.begin());
1611
1612 // Reverse everything
1613 for (u32 i = 0; i < 8; i++) {
1614// Taken from http://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64Bits
1615#define REVERSE_BYTE(b) (((b)*0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32
1616 u8 a = static_cast<u8>(REVERSE_BYTE(texelWeightData[i]));
1617 u8 b = static_cast<u8>(REVERSE_BYTE(texelWeightData[15 - i]));
1618#undef REVERSE_BYTE
1619
1620 texelWeightData[i] = b;
1621 texelWeightData[15 - i] = a;
1622 }
1623
1624 // Make sure that higher non-texel bits are set to zero
1625 const u32 clearByteStart = (weightParams.GetPackedBitSize() >> 3) + 1;
1626 if (clearByteStart > 0 && clearByteStart <= texelWeightData.size()) {
1627 texelWeightData[clearByteStart - 1] &=
1628 static_cast<u8>((1 << (weightParams.GetPackedBitSize() % 8)) - 1);
1629 std::memset(texelWeightData.data() + clearByteStart, 0,
1630 std::min(16U - clearByteStart, 16U));
1631 }
1632
1633 IntegerEncodedVector texelWeightValues;
1634
1635 InputBitStream weightStream(texelWeightData);
1636
1637 DecodeIntegerSequence(texelWeightValues, weightStream, weightParams.m_MaxWeight,
1638 weightParams.GetNumWeightValues());
1639
1640 // Blocks can be at most 12x12, so we can have as many as 144 weights
1641 u32 weights[2][144];
1642 UnquantizeTexelWeights(weights, texelWeightValues, weightParams, blockWidth, blockHeight);
1643
1644 // Now that we have endpos32s and weights, we can s32erpolate and generate
1645 // the proper decoding...
1646 for (u32 j = 0; j < blockHeight; j++)
1647 for (u32 i = 0; i < blockWidth; i++) {
1648 u32 partition = Select2DPartition(partitionIndex, i, j, nPartitions,
1649 (blockHeight * blockWidth) < 32);
1650 assert(partition < nPartitions);
1651
1652 Pixel p;
1653 for (u32 c = 0; c < 4; c++) {
1654 u32 C0 = endpos32s[partition][0].Component(c);
1655 C0 = ReplicateByteTo16(C0);
1656 u32 C1 = endpos32s[partition][1].Component(c);
1657 C1 = ReplicateByteTo16(C1);
1658
1659 u32 plane = 0;
1660 if (weightParams.m_bDualPlane && (((planeIdx + 1) & 3) == c)) {
1661 plane = 1;
1662 }
1663
1664 u32 weight = weights[plane][j * blockWidth + i];
1665 u32 C = (C0 * (64 - weight) + C1 * weight + 32) / 64;
1666 if (C == 65535) {
1667 p.Component(c) = 255;
1668 } else {
1669 double Cf = static_cast<double>(C);
1670 p.Component(c) = static_cast<u16>(255.0 * (Cf / 65536.0) + 0.5);
1671 }
1672 }
1673
1674 outBuf[j * blockWidth + i] = p.Pack();
1675 }
1676}
1677
1678} // namespace ASTCC
1679
1680namespace Tegra::Texture::ASTC {
1681
1682void Decompress(std::span<const uint8_t> data, uint32_t width, uint32_t height, uint32_t depth,
1683 uint32_t block_width, uint32_t block_height, std::span<uint8_t> output) {
1684 u32 block_index = 0;
1685 std::size_t depth_offset = 0;
1686 for (u32 z = 0; z < depth; z++) {
1687 for (u32 y = 0; y < height; y += block_height) {
1688 for (u32 x = 0; x < width; x += block_width) {
1689 const std::span<const u8, 16> blockPtr{data.subspan(block_index * 16, 16)};
1690
1691 // Blocks can be at most 12x12
1692 std::array<u32, 12 * 12> uncompData;
1693 ASTCC::DecompressBlock(blockPtr, block_width, block_height, uncompData);
1694
1695 u32 decompWidth = std::min(block_width, width - x);
1696 u32 decompHeight = std::min(block_height, height - y);
1697
1698 const std::span<u8> outRow = output.subspan(depth_offset + (y * width + x) * 4);
1699 for (u32 jj = 0; jj < decompHeight; jj++) {
1700 std::memcpy(outRow.data() + jj * width * 4,
1701 uncompData.data() + jj * block_width, decompWidth * 4);
1702 }
1703 ++block_index;
1704 }
1705 }
1706 depth_offset += height * width * 4;
1707 }
1708}
1709
1710} // namespace Tegra::Texture::ASTC