diff options
| author | 2021-08-06 21:45:24 -0700 | |
|---|---|---|
| committer | 2021-08-06 21:45:24 -0700 | |
| commit | 268b5764c70a8300d24c32985dee595046a1e2e1 (patch) | |
| tree | 178317fbc7f34549a93b8e28d9f0b6857aa104c8 /src/video_core/textures | |
| parent | Merge pull request #6799 from ameerj/vp9-fixes (diff) | |
| parent | astc_decoder: Reduce workgroup size (diff) | |
| download | yuzu-268b5764c70a8300d24c32985dee595046a1e2e1.tar.gz yuzu-268b5764c70a8300d24c32985dee595046a1e2e1.tar.xz yuzu-268b5764c70a8300d24c32985dee595046a1e2e1.zip | |
Merge pull request #6791 from ameerj/astc-opt
astc_decoder: Various performance and memory optimizations
Diffstat (limited to 'src/video_core/textures')
| -rw-r--r-- | src/video_core/textures/astc.cpp | 156 | ||||
| -rw-r--r-- | src/video_core/textures/astc.h | 111 |
2 files changed, 133 insertions, 134 deletions
diff --git a/src/video_core/textures/astc.cpp b/src/video_core/textures/astc.cpp index 3ab500760..25161df1f 100644 --- a/src/video_core/textures/astc.cpp +++ b/src/video_core/textures/astc.cpp | |||
| @@ -151,6 +151,76 @@ private: | |||
| 151 | const IntType& m_Bits; | 151 | const IntType& m_Bits; |
| 152 | }; | 152 | }; |
| 153 | 153 | ||
| 154 | enum class IntegerEncoding { JustBits, Quint, Trit }; | ||
| 155 | |||
| 156 | struct IntegerEncodedValue { | ||
| 157 | constexpr IntegerEncodedValue() = default; | ||
| 158 | |||
| 159 | constexpr IntegerEncodedValue(IntegerEncoding encoding_, u32 num_bits_) | ||
| 160 | : encoding{encoding_}, num_bits{num_bits_} {} | ||
| 161 | |||
| 162 | constexpr bool MatchesEncoding(const IntegerEncodedValue& other) const { | ||
| 163 | return encoding == other.encoding && num_bits == other.num_bits; | ||
| 164 | } | ||
| 165 | |||
| 166 | // Returns the number of bits required to encode num_vals values. | ||
| 167 | u32 GetBitLength(u32 num_vals) const { | ||
| 168 | u32 total_bits = num_bits * num_vals; | ||
| 169 | if (encoding == IntegerEncoding::Trit) { | ||
| 170 | total_bits += (num_vals * 8 + 4) / 5; | ||
| 171 | } else if (encoding == IntegerEncoding::Quint) { | ||
| 172 | total_bits += (num_vals * 7 + 2) / 3; | ||
| 173 | } | ||
| 174 | return total_bits; | ||
| 175 | } | ||
| 176 | |||
| 177 | IntegerEncoding encoding{}; | ||
| 178 | u32 num_bits = 0; | ||
| 179 | u32 bit_value = 0; | ||
| 180 | union { | ||
| 181 | u32 quint_value = 0; | ||
| 182 | u32 trit_value; | ||
| 183 | }; | ||
| 184 | }; | ||
| 185 | |||
| 186 | // Returns a new instance of this struct that describes the encoding of a | ||
| 187 | // sequence whose values can be no larger than mav_value | ||
| 188 | static constexpr IntegerEncodedValue CreateEncoding(u32 mav_value) { | ||
| 189 | while (mav_value > 0) { | ||
| 190 | u32 check = mav_value + 1; | ||
| 191 | |||
| 192 | // Is mav_value of the type 2^n - 1 (i.e. is mav_value + 1 a power of two)? | ||
| 193 | if (!(check & (check - 1))) { | ||
| 194 | return IntegerEncodedValue(IntegerEncoding::JustBits, std::popcount(mav_value)); | ||
| 195 | } | ||
| 196 | |||
| 197 | // Is mav_value of the type 3*2^n - 1? | ||
| 198 | if ((check % 3 == 0) && !((check / 3) & ((check / 3) - 1))) { | ||
| 199 | return IntegerEncodedValue(IntegerEncoding::Trit, std::popcount(check / 3 - 1)); | ||
| 200 | } | ||
| 201 | |||
| 202 | // Is mav_value of the type 5*2^n - 1? | ||
| 203 | if ((check % 5 == 0) && !((check / 5) & ((check / 5) - 1))) { | ||
| 204 | return IntegerEncodedValue(IntegerEncoding::Quint, std::popcount(check / 5 - 1)); | ||
| 205 | } | ||
| 206 | |||
| 207 | // Apparently it can't be represented with a bounded integer sequence... | ||
| 208 | // just iterate. | ||
| 209 | mav_value--; | ||
| 210 | } | ||
| 211 | return IntegerEncodedValue(IntegerEncoding::JustBits, 0); | ||
| 212 | } | ||
| 213 | |||
| 214 | static constexpr std::array<IntegerEncodedValue, 256> MakeEncodedValues() { | ||
| 215 | std::array<IntegerEncodedValue, 256> encodings{}; | ||
| 216 | for (std::size_t i = 0; i < encodings.size(); ++i) { | ||
| 217 | encodings[i] = CreateEncoding(static_cast<u32>(i)); | ||
| 218 | } | ||
| 219 | return encodings; | ||
| 220 | } | ||
| 221 | |||
| 222 | static constexpr std::array<IntegerEncodedValue, 256> ASTC_ENCODINGS_VALUES = MakeEncodedValues(); | ||
| 223 | |||
| 154 | namespace Tegra::Texture::ASTC { | 224 | namespace Tegra::Texture::ASTC { |
| 155 | using IntegerEncodedVector = boost::container::static_vector< | 225 | using IntegerEncodedVector = boost::container::static_vector< |
| 156 | IntegerEncodedValue, 256, | 226 | IntegerEncodedValue, 256, |
| @@ -521,35 +591,41 @@ static TexelWeightParams DecodeBlockInfo(InputBitStream& strm) { | |||
| 521 | return params; | 591 | return params; |
| 522 | } | 592 | } |
| 523 | 593 | ||
| 524 | static void FillVoidExtentLDR(InputBitStream& strm, std::span<u32> outBuf, u32 blockWidth, | 594 | // Replicates the low num_bits bits of val such that bits [(to_bit - 1):(to_bit - num_bits)] |
| 525 | u32 blockHeight) { | 595 | // are the same as bits [(num_bits - 1):0], and repeats the pattern all the way down. |
| 526 | // Don't actually care about the void extent, just read the bits... | 596 | template <typename IntType> |
| 527 | for (s32 i = 0; i < 4; ++i) { | 597 | static constexpr IntType Replicate(IntType val, u32 num_bits, u32 to_bit) { |
| 528 | strm.ReadBits<13>(); | 598 | if (num_bits == 0 || to_bit == 0) { |
| 599 | return 0; | ||
| 529 | } | 600 | } |
| 530 | 601 | const IntType v = val & static_cast<IntType>((1 << num_bits) - 1); | |
| 531 | // Decode the RGBA components and renormalize them to the range [0, 255] | 602 | IntType res = v; |
| 532 | u16 r = static_cast<u16>(strm.ReadBits<16>()); | 603 | u32 reslen = num_bits; |
| 533 | u16 g = static_cast<u16>(strm.ReadBits<16>()); | 604 | while (reslen < to_bit) { |
| 534 | u16 b = static_cast<u16>(strm.ReadBits<16>()); | 605 | u32 comp = 0; |
| 535 | u16 a = static_cast<u16>(strm.ReadBits<16>()); | 606 | if (num_bits > to_bit - reslen) { |
| 536 | 607 | u32 newshift = to_bit - reslen; | |
| 537 | u32 rgba = (r >> 8) | (g & 0xFF00) | (static_cast<u32>(b) & 0xFF00) << 8 | | 608 | comp = num_bits - newshift; |
| 538 | (static_cast<u32>(a) & 0xFF00) << 16; | 609 | num_bits = newshift; |
| 539 | |||
| 540 | for (u32 j = 0; j < blockHeight; j++) { | ||
| 541 | for (u32 i = 0; i < blockWidth; i++) { | ||
| 542 | outBuf[j * blockWidth + i] = rgba; | ||
| 543 | } | 610 | } |
| 611 | res = static_cast<IntType>(res << num_bits); | ||
| 612 | res = static_cast<IntType>(res | (v >> comp)); | ||
| 613 | reslen += num_bits; | ||
| 544 | } | 614 | } |
| 615 | return res; | ||
| 545 | } | 616 | } |
| 546 | 617 | ||
| 547 | static void FillError(std::span<u32> outBuf, u32 blockWidth, u32 blockHeight) { | 618 | static constexpr std::size_t NumReplicateEntries(u32 num_bits) { |
| 548 | for (u32 j = 0; j < blockHeight; j++) { | 619 | return std::size_t(1) << num_bits; |
| 549 | for (u32 i = 0; i < blockWidth; i++) { | 620 | } |
| 550 | outBuf[j * blockWidth + i] = 0xFFFF00FF; | 621 | |
| 551 | } | 622 | template <typename IntType, u32 num_bits, u32 to_bit> |
| 623 | static constexpr auto MakeReplicateTable() { | ||
| 624 | std::array<IntType, NumReplicateEntries(num_bits)> table{}; | ||
| 625 | for (IntType value = 0; value < static_cast<IntType>(std::size(table)); ++value) { | ||
| 626 | table[value] = Replicate(value, num_bits, to_bit); | ||
| 552 | } | 627 | } |
| 628 | return table; | ||
| 553 | } | 629 | } |
| 554 | 630 | ||
| 555 | static constexpr auto REPLICATE_BYTE_TO_16_TABLE = MakeReplicateTable<u32, 8, 16>(); | 631 | static constexpr auto REPLICATE_BYTE_TO_16_TABLE = MakeReplicateTable<u32, 8, 16>(); |
| @@ -572,6 +648,9 @@ static constexpr auto REPLICATE_2_BIT_TO_8_TABLE = MakeReplicateTable<u32, 2, 8> | |||
| 572 | static constexpr auto REPLICATE_3_BIT_TO_8_TABLE = MakeReplicateTable<u32, 3, 8>(); | 648 | static constexpr auto REPLICATE_3_BIT_TO_8_TABLE = MakeReplicateTable<u32, 3, 8>(); |
| 573 | static constexpr auto REPLICATE_4_BIT_TO_8_TABLE = MakeReplicateTable<u32, 4, 8>(); | 649 | static constexpr auto REPLICATE_4_BIT_TO_8_TABLE = MakeReplicateTable<u32, 4, 8>(); |
| 574 | static constexpr auto REPLICATE_5_BIT_TO_8_TABLE = MakeReplicateTable<u32, 5, 8>(); | 650 | static constexpr auto REPLICATE_5_BIT_TO_8_TABLE = MakeReplicateTable<u32, 5, 8>(); |
| 651 | static constexpr auto REPLICATE_6_BIT_TO_8_TABLE = MakeReplicateTable<u32, 6, 8>(); | ||
| 652 | static constexpr auto REPLICATE_7_BIT_TO_8_TABLE = MakeReplicateTable<u32, 7, 8>(); | ||
| 653 | static constexpr auto REPLICATE_8_BIT_TO_8_TABLE = MakeReplicateTable<u32, 8, 8>(); | ||
| 575 | /// Use a precompiled table with the most common usages, if it's not in the expected range, fallback | 654 | /// Use a precompiled table with the most common usages, if it's not in the expected range, fallback |
| 576 | /// to the runtime implementation | 655 | /// to the runtime implementation |
| 577 | static constexpr u32 FastReplicateTo8(u32 value, u32 num_bits) { | 656 | static constexpr u32 FastReplicateTo8(u32 value, u32 num_bits) { |
| @@ -1316,6 +1395,37 @@ static void ComputeEndpoints(Pixel& ep1, Pixel& ep2, const u32*& colorValues, | |||
| 1316 | #undef READ_INT_VALUES | 1395 | #undef READ_INT_VALUES |
| 1317 | } | 1396 | } |
| 1318 | 1397 | ||
| 1398 | static void FillVoidExtentLDR(InputBitStream& strm, std::span<u32> outBuf, u32 blockWidth, | ||
| 1399 | u32 blockHeight) { | ||
| 1400 | // Don't actually care about the void extent, just read the bits... | ||
| 1401 | for (s32 i = 0; i < 4; ++i) { | ||
| 1402 | strm.ReadBits<13>(); | ||
| 1403 | } | ||
| 1404 | |||
| 1405 | // Decode the RGBA components and renormalize them to the range [0, 255] | ||
| 1406 | u16 r = static_cast<u16>(strm.ReadBits<16>()); | ||
| 1407 | u16 g = static_cast<u16>(strm.ReadBits<16>()); | ||
| 1408 | u16 b = static_cast<u16>(strm.ReadBits<16>()); | ||
| 1409 | u16 a = static_cast<u16>(strm.ReadBits<16>()); | ||
| 1410 | |||
| 1411 | u32 rgba = (r >> 8) | (g & 0xFF00) | (static_cast<u32>(b) & 0xFF00) << 8 | | ||
| 1412 | (static_cast<u32>(a) & 0xFF00) << 16; | ||
| 1413 | |||
| 1414 | for (u32 j = 0; j < blockHeight; j++) { | ||
| 1415 | for (u32 i = 0; i < blockWidth; i++) { | ||
| 1416 | outBuf[j * blockWidth + i] = rgba; | ||
| 1417 | } | ||
| 1418 | } | ||
| 1419 | } | ||
| 1420 | |||
| 1421 | static void FillError(std::span<u32> outBuf, u32 blockWidth, u32 blockHeight) { | ||
| 1422 | for (u32 j = 0; j < blockHeight; j++) { | ||
| 1423 | for (u32 i = 0; i < blockWidth; i++) { | ||
| 1424 | outBuf[j * blockWidth + i] = 0xFFFF00FF; | ||
| 1425 | } | ||
| 1426 | } | ||
| 1427 | } | ||
| 1428 | |||
| 1319 | static void DecompressBlock(std::span<const u8, 16> inBuf, const u32 blockWidth, | 1429 | static void DecompressBlock(std::span<const u8, 16> inBuf, const u32 blockWidth, |
| 1320 | const u32 blockHeight, std::span<u32, 12 * 12> outBuf) { | 1430 | const u32 blockHeight, std::span<u32, 12 * 12> outBuf) { |
| 1321 | InputBitStream strm(inBuf); | 1431 | InputBitStream strm(inBuf); |
diff --git a/src/video_core/textures/astc.h b/src/video_core/textures/astc.h index 0229ae122..14d2beec0 100644 --- a/src/video_core/textures/astc.h +++ b/src/video_core/textures/astc.h | |||
| @@ -9,117 +9,6 @@ | |||
| 9 | 9 | ||
| 10 | namespace Tegra::Texture::ASTC { | 10 | namespace Tegra::Texture::ASTC { |
| 11 | 11 | ||
| 12 | enum class IntegerEncoding { JustBits, Quint, Trit }; | ||
| 13 | |||
| 14 | struct IntegerEncodedValue { | ||
| 15 | constexpr IntegerEncodedValue() = default; | ||
| 16 | |||
| 17 | constexpr IntegerEncodedValue(IntegerEncoding encoding_, u32 num_bits_) | ||
| 18 | : encoding{encoding_}, num_bits{num_bits_} {} | ||
| 19 | |||
| 20 | constexpr bool MatchesEncoding(const IntegerEncodedValue& other) const { | ||
| 21 | return encoding == other.encoding && num_bits == other.num_bits; | ||
| 22 | } | ||
| 23 | |||
| 24 | // Returns the number of bits required to encode num_vals values. | ||
| 25 | u32 GetBitLength(u32 num_vals) const { | ||
| 26 | u32 total_bits = num_bits * num_vals; | ||
| 27 | if (encoding == IntegerEncoding::Trit) { | ||
| 28 | total_bits += (num_vals * 8 + 4) / 5; | ||
| 29 | } else if (encoding == IntegerEncoding::Quint) { | ||
| 30 | total_bits += (num_vals * 7 + 2) / 3; | ||
| 31 | } | ||
| 32 | return total_bits; | ||
| 33 | } | ||
| 34 | |||
| 35 | IntegerEncoding encoding{}; | ||
| 36 | u32 num_bits = 0; | ||
| 37 | u32 bit_value = 0; | ||
| 38 | union { | ||
| 39 | u32 quint_value = 0; | ||
| 40 | u32 trit_value; | ||
| 41 | }; | ||
| 42 | }; | ||
| 43 | |||
| 44 | // Returns a new instance of this struct that describes the encoding of a | ||
| 45 | // sequence whose values can be no larger than mav_value | ||
| 46 | constexpr IntegerEncodedValue CreateEncoding(u32 mav_value) { | ||
| 47 | while (mav_value > 0) { | ||
| 48 | u32 check = mav_value + 1; | ||
| 49 | |||
| 50 | // Is mav_value of the type 2^n - 1 (i.e. is mav_value + 1 a power of two)? | ||
| 51 | if (!(check & (check - 1))) { | ||
| 52 | return IntegerEncodedValue(IntegerEncoding::JustBits, std::popcount(mav_value)); | ||
| 53 | } | ||
| 54 | |||
| 55 | // Is mav_value of the type 3*2^n - 1? | ||
| 56 | if ((check % 3 == 0) && !((check / 3) & ((check / 3) - 1))) { | ||
| 57 | return IntegerEncodedValue(IntegerEncoding::Trit, std::popcount(check / 3 - 1)); | ||
| 58 | } | ||
| 59 | |||
| 60 | // Is mav_value of the type 5*2^n - 1? | ||
| 61 | if ((check % 5 == 0) && !((check / 5) & ((check / 5) - 1))) { | ||
| 62 | return IntegerEncodedValue(IntegerEncoding::Quint, std::popcount(check / 5 - 1)); | ||
| 63 | } | ||
| 64 | |||
| 65 | // Apparently it can't be represented with a bounded integer sequence... | ||
| 66 | // just iterate. | ||
| 67 | mav_value--; | ||
| 68 | } | ||
| 69 | return IntegerEncodedValue(IntegerEncoding::JustBits, 0); | ||
| 70 | } | ||
| 71 | |||
| 72 | constexpr std::array<IntegerEncodedValue, 256> MakeEncodedValues() { | ||
| 73 | std::array<IntegerEncodedValue, 256> encodings{}; | ||
| 74 | for (std::size_t i = 0; i < encodings.size(); ++i) { | ||
| 75 | encodings[i] = CreateEncoding(static_cast<u32>(i)); | ||
| 76 | } | ||
| 77 | return encodings; | ||
| 78 | } | ||
| 79 | |||
| 80 | constexpr std::array<IntegerEncodedValue, 256> ASTC_ENCODINGS_VALUES = MakeEncodedValues(); | ||
| 81 | |||
| 82 | // Replicates the low num_bits bits of val such that bits [(to_bit - 1):(to_bit - num_bits)] | ||
| 83 | // are the same as bits [(num_bits - 1):0], and repeats the pattern all the way down. | ||
| 84 | template <typename IntType> | ||
| 85 | constexpr IntType Replicate(IntType val, u32 num_bits, u32 to_bit) { | ||
| 86 | if (num_bits == 0 || to_bit == 0) { | ||
| 87 | return 0; | ||
| 88 | } | ||
| 89 | const IntType v = val & static_cast<IntType>((1 << num_bits) - 1); | ||
| 90 | IntType res = v; | ||
| 91 | u32 reslen = num_bits; | ||
| 92 | while (reslen < to_bit) { | ||
| 93 | u32 comp = 0; | ||
| 94 | if (num_bits > to_bit - reslen) { | ||
| 95 | u32 newshift = to_bit - reslen; | ||
| 96 | comp = num_bits - newshift; | ||
| 97 | num_bits = newshift; | ||
| 98 | } | ||
| 99 | res = static_cast<IntType>(res << num_bits); | ||
| 100 | res = static_cast<IntType>(res | (v >> comp)); | ||
| 101 | reslen += num_bits; | ||
| 102 | } | ||
| 103 | return res; | ||
| 104 | } | ||
| 105 | |||
| 106 | constexpr std::size_t NumReplicateEntries(u32 num_bits) { | ||
| 107 | return std::size_t(1) << num_bits; | ||
| 108 | } | ||
| 109 | |||
| 110 | template <typename IntType, u32 num_bits, u32 to_bit> | ||
| 111 | constexpr auto MakeReplicateTable() { | ||
| 112 | std::array<IntType, NumReplicateEntries(num_bits)> table{}; | ||
| 113 | for (IntType value = 0; value < static_cast<IntType>(std::size(table)); ++value) { | ||
| 114 | table[value] = Replicate(value, num_bits, to_bit); | ||
| 115 | } | ||
| 116 | return table; | ||
| 117 | } | ||
| 118 | |||
| 119 | constexpr auto REPLICATE_6_BIT_TO_8_TABLE = MakeReplicateTable<u32, 6, 8>(); | ||
| 120 | constexpr auto REPLICATE_7_BIT_TO_8_TABLE = MakeReplicateTable<u32, 7, 8>(); | ||
| 121 | constexpr auto REPLICATE_8_BIT_TO_8_TABLE = MakeReplicateTable<u32, 8, 8>(); | ||
| 122 | |||
| 123 | void Decompress(std::span<const uint8_t> data, uint32_t width, uint32_t height, uint32_t depth, | 12 | void Decompress(std::span<const uint8_t> data, uint32_t width, uint32_t height, uint32_t depth, |
| 124 | uint32_t block_width, uint32_t block_height, std::span<uint8_t> output); | 13 | uint32_t block_width, uint32_t block_height, std::span<uint8_t> output); |
| 125 | 14 | ||