summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author: Mat M, 2020-04-13 10:17:32 -0400
committer: GitHub, 2020-04-13 10:17:32 -0400
commit: c4001225f64bbea511cf8df0885c77cb6ba70091 (patch)
tree: 850d379792b1018cddd9282da50fade32e38fa4f /src
parent: Merge pull request #3619 from ReinUsesLisp/i2i (diff)
parent: astc: Hard code bit depth changes to 8 and use fast replicate (diff)
download: yuzu-c4001225f64bbea511cf8df0885c77cb6ba70091.tar.gz
download: yuzu-c4001225f64bbea511cf8df0885c77cb6ba70091.tar.xz
download: yuzu-c4001225f64bbea511cf8df0885c77cb6ba70091.zip
Merge pull request #3631 from ReinUsesLisp/more-astc
texture/astc: More small ASTC optimizations
Diffstat (limited to 'src')
-rw-r--r-- src/video_core/textures/astc.cpp 241
1 files changed, 159 insertions, 82 deletions
diff --git a/src/video_core/textures/astc.cpp b/src/video_core/textures/astc.cpp
index 062b4f252..365bde2f1 100644
--- a/src/video_core/textures/astc.cpp
+++ b/src/video_core/textures/astc.cpp
@@ -20,6 +20,8 @@
20#include <cstring> 20#include <cstring>
21#include <vector> 21#include <vector>
22 22
23#include <boost/container/static_vector.hpp>
24
23#include "common/common_types.h" 25#include "common/common_types.h"
24 26
25#include "video_core/textures/astc.h" 27#include "video_core/textures/astc.h"
@@ -39,25 +41,25 @@ constexpr u32 Popcnt(u32 n) {
39 41
40class InputBitStream { 42class InputBitStream {
41public: 43public:
42 explicit InputBitStream(const u8* ptr, std::size_t start_offset = 0) 44 constexpr explicit InputBitStream(const u8* ptr, std::size_t start_offset = 0)
43 : m_CurByte(ptr), m_NextBit(start_offset % 8) {} 45 : cur_byte{ptr}, next_bit{start_offset % 8} {}
44 46
45 std::size_t GetBitsRead() const { 47 constexpr std::size_t GetBitsRead() const {
46 return m_BitsRead; 48 return bits_read;
47 } 49 }
48 50
49 u32 ReadBit() { 51 constexpr bool ReadBit() {
50 u32 bit = *m_CurByte >> m_NextBit++; 52 const bool bit = (*cur_byte >> next_bit++) & 1;
51 while (m_NextBit >= 8) { 53 while (next_bit >= 8) {
52 m_NextBit -= 8; 54 next_bit -= 8;
53 m_CurByte++; 55 cur_byte++;
54 } 56 }
55 57
56 m_BitsRead++; 58 bits_read++;
57 return bit & 1; 59 return bit;
58 } 60 }
59 61
60 u32 ReadBits(std::size_t nBits) { 62 constexpr u32 ReadBits(std::size_t nBits) {
61 u32 ret = 0; 63 u32 ret = 0;
62 for (std::size_t i = 0; i < nBits; ++i) { 64 for (std::size_t i = 0; i < nBits; ++i) {
63 ret |= (ReadBit() & 1) << i; 65 ret |= (ReadBit() & 1) << i;
@@ -66,7 +68,7 @@ public:
66 } 68 }
67 69
68 template <std::size_t nBits> 70 template <std::size_t nBits>
69 u32 ReadBits() { 71 constexpr u32 ReadBits() {
70 u32 ret = 0; 72 u32 ret = 0;
71 for (std::size_t i = 0; i < nBits; ++i) { 73 for (std::size_t i = 0; i < nBits; ++i) {
72 ret |= (ReadBit() & 1) << i; 74 ret |= (ReadBit() & 1) << i;
@@ -75,64 +77,58 @@ public:
75 } 77 }
76 78
77private: 79private:
78 const u8* m_CurByte; 80 const u8* cur_byte;
79 std::size_t m_NextBit = 0; 81 std::size_t next_bit = 0;
80 std::size_t m_BitsRead = 0; 82 std::size_t bits_read = 0;
81}; 83};
82 84
/// Writes single bits into a caller-provided byte buffer, filling each byte from its
/// least-significant bit upwards. At most `bits` bits are stored; any further writes are
/// silently dropped and never touch the buffer.
class OutputBitStream {
public:
    /// @param ptr          First byte of the destination buffer.
    /// @param bits         Upper limit on the number of bits that will be stored.
    /// @param start_offset Bit position to start at; only its value modulo 8 is used.
    constexpr explicit OutputBitStream(u8* ptr, std::size_t bits = 0, std::size_t start_offset = 0)
        : cur_byte{ptr}, num_bits{bits}, next_bit{start_offset % 8} {}

    /// Returns the number of bits that have actually been stored so far.
    constexpr std::size_t GetBitsWritten() const {
        return bits_written;
    }

    /// Writes the low nBits of val in reverse order (most-significant of those bits first).
    constexpr void WriteBitsR(u32 val, u32 nBits) {
        for (u32 i = 0; i < nBits; i++) {
            WriteBit((val >> (nBits - i - 1)) & 1);
        }
    }

    /// Writes the low nBits of val, least-significant bit first.
    constexpr void WriteBits(u32 val, u32 nBits) {
        for (u32 i = 0; i < nBits; i++) {
            WriteBit((val >> i) & 1);
        }
    }

private:
    /// Stores one bit at the current position and advances, unless the limit was reached.
    constexpr void WriteBit(bool b) {
        if (bits_written >= num_bits) {
            return;
        }

        const u32 mask = u32{1} << next_bit++;

        // Clear the target bit first so stale buffer contents cannot leak through.
        *cur_byte &= static_cast<u8>(~mask);

        // Set it again only when a one is being written.
        if (b) {
            *cur_byte |= static_cast<u8>(mask);
        }

        // Move on to the next byte once all eight bits of this one are used.
        if (next_bit >= 8) {
            cur_byte += 1;
            next_bit = 0;
        }

        // Account for the stored bit; without this the limit check above could never
        // trigger and GetBitsWritten() would always report zero.
        ++bits_written;
    }

    u8* cur_byte;
    std::size_t num_bits;
    std::size_t bits_written = 0;
    std::size_t next_bit = 0;
};
137 133
138template <typename IntType> 134template <typename IntType>
@@ -195,9 +191,13 @@ struct IntegerEncodedValue {
195 u32 trit_value; 191 u32 trit_value;
196 }; 192 };
197}; 193};
194using IntegerEncodedVector = boost::container::static_vector<
195 IntegerEncodedValue, 64,
196 boost::container::static_vector_options<
197 boost::container::inplace_alignment<alignof(IntegerEncodedValue)>,
198 boost::container::throw_on_overflow<false>>::type>;
198 199
199static void DecodeTritBlock(InputBitStream& bits, std::vector<IntegerEncodedValue>& result, 200static void DecodeTritBlock(InputBitStream& bits, IntegerEncodedVector& result, u32 nBitsPerValue) {
200 u32 nBitsPerValue) {
201 // Implement the algorithm in section C.2.12 201 // Implement the algorithm in section C.2.12
202 u32 m[5]; 202 u32 m[5];
203 u32 t[5]; 203 u32 t[5];
@@ -255,7 +255,7 @@ static void DecodeTritBlock(InputBitStream& bits, std::vector<IntegerEncodedValu
255 } 255 }
256} 256}
257 257
258static void DecodeQus32Block(InputBitStream& bits, std::vector<IntegerEncodedValue>& result, 258static void DecodeQus32Block(InputBitStream& bits, IntegerEncodedVector& result,
259 u32 nBitsPerValue) { 259 u32 nBitsPerValue) {
260 // Implement the algorithm in section C.2.12 260 // Implement the algorithm in section C.2.12
261 u32 m[3]; 261 u32 m[3];
@@ -343,8 +343,8 @@ static constexpr std::array EncodingsValues = MakeEncodedValues();
343// Fills result with the values that are encoded in the given 343// Fills result with the values that are encoded in the given
344// bitstream. We must know beforehand what the maximum possible 344// bitstream. We must know beforehand what the maximum possible
345// value is, and how many values we're decoding. 345// value is, and how many values we're decoding.
346static void DecodeIntegerSequence(std::vector<IntegerEncodedValue>& result, InputBitStream& bits, 346static void DecodeIntegerSequence(IntegerEncodedVector& result, InputBitStream& bits, u32 maxRange,
347 u32 maxRange, u32 nValues) { 347 u32 nValues) {
348 // Determine encoding parameters 348 // Determine encoding parameters
349 IntegerEncodedValue val = EncodingsValues[maxRange]; 349 IntegerEncodedValue val = EncodingsValues[maxRange];
350 350
@@ -634,12 +634,14 @@ static void FillError(u32* outBuf, u32 blockWidth, u32 blockHeight) {
634// Replicates low numBits such that [(toBit - 1):(toBit - 1 - fromBit)] 634// Replicates low numBits such that [(toBit - 1):(toBit - 1 - fromBit)]
635// is the same as [(numBits - 1):0] and repeats all the way down. 635// is the same as [(numBits - 1):0] and repeats all the way down.
636template <typename IntType> 636template <typename IntType>
637static IntType Replicate(IntType val, u32 numBits, u32 toBit) { 637static constexpr IntType Replicate(IntType val, u32 numBits, u32 toBit) {
638 if (numBits == 0) 638 if (numBits == 0) {
639 return 0; 639 return 0;
640 if (toBit == 0) 640 }
641 if (toBit == 0) {
641 return 0; 642 return 0;
642 IntType v = val & static_cast<IntType>((1 << numBits) - 1); 643 }
644 const IntType v = val & static_cast<IntType>((1 << numBits) - 1);
643 IntType res = v; 645 IntType res = v;
644 u32 reslen = numBits; 646 u32 reslen = numBits;
645 while (reslen < toBit) { 647 while (reslen < toBit) {
@@ -656,6 +658,89 @@ static IntType Replicate(IntType val, u32 numBits, u32 toBit) {
656 return res; 658 return res;
657} 659}
658 660
/// Returns how many distinct values fit in num_bits bits (2^num_bits), i.e. the number of
/// entries a replicate lookup table for that input width needs.
static constexpr std::size_t NumReplicateEntries(u32 num_bits) {
    constexpr std::size_t one = 1;
    return one << num_bits;
}
664
665template <typename IntType, u32 num_bits, u32 to_bit>
666static constexpr auto MakeReplicateTable() {
667 std::array<IntType, NumReplicateEntries(num_bits)> table{};
668 for (IntType value = 0; value < static_cast<IntType>(std::size(table)); ++value) {
669 table[value] = Replicate(value, num_bits, to_bit);
670 }
671 return table;
672}
673
674static constexpr auto REPLICATE_BYTE_TO_16_TABLE = MakeReplicateTable<u32, 8, 16>();
675static constexpr u32 ReplicateByteTo16(std::size_t value) {
676 return REPLICATE_BYTE_TO_16_TABLE[value];
677}
678
679static constexpr auto REPLICATE_BIT_TO_7_TABLE = MakeReplicateTable<u32, 1, 7>();
680static constexpr u32 ReplicateBitTo7(std::size_t value) {
681 return REPLICATE_BIT_TO_7_TABLE[value];
682}
683
684static constexpr auto REPLICATE_BIT_TO_9_TABLE = MakeReplicateTable<u32, 1, 9>();
685static constexpr u32 ReplicateBitTo9(std::size_t value) {
686 return REPLICATE_BIT_TO_9_TABLE[value];
687}
688
689static constexpr auto REPLICATE_1_BIT_TO_8_TABLE = MakeReplicateTable<u32, 1, 8>();
690static constexpr auto REPLICATE_2_BIT_TO_8_TABLE = MakeReplicateTable<u32, 2, 8>();
691static constexpr auto REPLICATE_3_BIT_TO_8_TABLE = MakeReplicateTable<u32, 3, 8>();
692static constexpr auto REPLICATE_4_BIT_TO_8_TABLE = MakeReplicateTable<u32, 4, 8>();
693static constexpr auto REPLICATE_5_BIT_TO_8_TABLE = MakeReplicateTable<u32, 5, 8>();
694static constexpr auto REPLICATE_6_BIT_TO_8_TABLE = MakeReplicateTable<u32, 6, 8>();
695static constexpr auto REPLICATE_7_BIT_TO_8_TABLE = MakeReplicateTable<u32, 7, 8>();
696static constexpr auto REPLICATE_8_BIT_TO_8_TABLE = MakeReplicateTable<u32, 8, 8>();
697/// Use a precompiled table with the most common usages, if it's not in the expected range, fallback
698/// to the runtime implementation
699static constexpr u32 FastReplicateTo8(u32 value, u32 num_bits) {
700 switch (num_bits) {
701 case 1:
702 return REPLICATE_1_BIT_TO_8_TABLE[value];
703 case 2:
704 return REPLICATE_2_BIT_TO_8_TABLE[value];
705 case 3:
706 return REPLICATE_3_BIT_TO_8_TABLE[value];
707 case 4:
708 return REPLICATE_4_BIT_TO_8_TABLE[value];
709 case 5:
710 return REPLICATE_5_BIT_TO_8_TABLE[value];
711 case 6:
712 return REPLICATE_6_BIT_TO_8_TABLE[value];
713 case 7:
714 return REPLICATE_7_BIT_TO_8_TABLE[value];
715 case 8:
716 return REPLICATE_8_BIT_TO_8_TABLE[value];
717 default:
718 return Replicate(value, num_bits, 8);
719 }
720}
721
722static constexpr auto REPLICATE_1_BIT_TO_6_TABLE = MakeReplicateTable<u32, 1, 6>();
723static constexpr auto REPLICATE_2_BIT_TO_6_TABLE = MakeReplicateTable<u32, 2, 6>();
724static constexpr auto REPLICATE_3_BIT_TO_6_TABLE = MakeReplicateTable<u32, 3, 6>();
725static constexpr auto REPLICATE_4_BIT_TO_6_TABLE = MakeReplicateTable<u32, 4, 6>();
726static constexpr auto REPLICATE_5_BIT_TO_6_TABLE = MakeReplicateTable<u32, 5, 6>();
727static constexpr u32 FastReplicateTo6(u32 value, u32 num_bits) {
728 switch (num_bits) {
729 case 1:
730 return REPLICATE_1_BIT_TO_6_TABLE[value];
731 case 2:
732 return REPLICATE_2_BIT_TO_6_TABLE[value];
733 case 3:
734 return REPLICATE_3_BIT_TO_6_TABLE[value];
735 case 4:
736 return REPLICATE_4_BIT_TO_6_TABLE[value];
737 case 5:
738 return REPLICATE_5_BIT_TO_6_TABLE[value];
739 default:
740 return Replicate(value, num_bits, 6);
741 }
742}
743
659class Pixel { 744class Pixel {
660protected: 745protected:
661 using ChannelType = s16; 746 using ChannelType = s16;
@@ -674,10 +759,10 @@ public:
674 // significant bits when going from larger to smaller bit depth 759 // significant bits when going from larger to smaller bit depth
675 // or by repeating the most significant bits when going from 760 // or by repeating the most significant bits when going from
676 // smaller to larger bit depths. 761 // smaller to larger bit depths.
677 void ChangeBitDepth(const u8 (&depth)[4]) { 762 void ChangeBitDepth() {
678 for (u32 i = 0; i < 4; i++) { 763 for (u32 i = 0; i < 4; i++) {
679 Component(i) = ChangeBitDepth(Component(i), m_BitDepth[i], depth[i]); 764 Component(i) = ChangeBitDepth(Component(i), m_BitDepth[i]);
680 m_BitDepth[i] = depth[i]; 765 m_BitDepth[i] = 8;
681 } 766 }
682 } 767 }
683 768
@@ -689,28 +774,23 @@ public:
689 774
690 // Changes the bit depth of a single component. See the comment 775 // Changes the bit depth of a single component. See the comment
691 // above for how we do this. 776 // above for how we do this.
692 static ChannelType ChangeBitDepth(Pixel::ChannelType val, u8 oldDepth, u8 newDepth) { 777 static ChannelType ChangeBitDepth(Pixel::ChannelType val, u8 oldDepth) {
693 assert(newDepth <= 8);
694 assert(oldDepth <= 8); 778 assert(oldDepth <= 8);
695 779
696 if (oldDepth == newDepth) { 780 if (oldDepth == 8) {
697 // Do nothing 781 // Do nothing
698 return val; 782 return val;
699 } else if (oldDepth == 0 && newDepth != 0) { 783 } else if (oldDepth == 0) {
700 return static_cast<ChannelType>((1 << newDepth) - 1); 784 return static_cast<ChannelType>((1 << 8) - 1);
701 } else if (newDepth > oldDepth) { 785 } else if (8 > oldDepth) {
702 return Replicate(val, oldDepth, newDepth); 786 return static_cast<ChannelType>(FastReplicateTo8(static_cast<u32>(val), oldDepth));
703 } else { 787 } else {
704 // oldDepth > newDepth 788 // oldDepth > newDepth
705 if (newDepth == 0) { 789 const u8 bitsWasted = static_cast<u8>(oldDepth - 8);
706 return 0xFF; 790 u16 v = static_cast<u16>(val);
707 } else { 791 v = static_cast<u16>((v + (1 << (bitsWasted - 1))) >> bitsWasted);
708 u8 bitsWasted = static_cast<u8>(oldDepth - newDepth); 792 v = ::std::min<u16>(::std::max<u16>(0, v), static_cast<u16>((1 << 8) - 1));
709 u16 v = static_cast<u16>(val); 793 return static_cast<u8>(v);
710 v = static_cast<u16>((v + (1 << (bitsWasted - 1))) >> bitsWasted);
711 v = ::std::min<u16>(::std::max<u16>(0, v), static_cast<u16>((1 << newDepth) - 1));
712 return static_cast<u8>(v);
713 }
714 } 794 }
715 795
716 assert(false && "We shouldn't get here."); 796 assert(false && "We shouldn't get here.");
@@ -760,8 +840,7 @@ public:
760 // up in the most-significant byte. 840 // up in the most-significant byte.
761 u32 Pack() const { 841 u32 Pack() const {
762 Pixel eightBit(*this); 842 Pixel eightBit(*this);
763 const u8 eightBitDepth[4] = {8, 8, 8, 8}; 843 eightBit.ChangeBitDepth();
764 eightBit.ChangeBitDepth(eightBitDepth);
765 844
766 u32 r = 0; 845 u32 r = 0;
767 r |= eightBit.A(); 846 r |= eightBit.A();
@@ -816,8 +895,7 @@ static void DecodeColorValues(u32* out, u8* data, const u32* modes, const u32 nP
816 } 895 }
817 896
818 // We now have enough to decode our integer sequence. 897 // We now have enough to decode our integer sequence.
819 std::vector<IntegerEncodedValue> decodedColorValues; 898 IntegerEncodedVector decodedColorValues;
820 decodedColorValues.reserve(32);
821 899
822 InputBitStream colorStream(data); 900 InputBitStream colorStream(data);
823 DecodeIntegerSequence(decodedColorValues, colorStream, range, nValues); 901 DecodeIntegerSequence(decodedColorValues, colorStream, range, nValues);
@@ -839,12 +917,12 @@ static void DecodeColorValues(u32* out, u8* data, const u32* modes, const u32 nP
839 917
840 u32 A = 0, B = 0, C = 0, D = 0; 918 u32 A = 0, B = 0, C = 0, D = 0;
841 // A is just the lsb replicated 9 times. 919 // A is just the lsb replicated 9 times.
842 A = Replicate(bitval & 1, 1, 9); 920 A = ReplicateBitTo9(bitval & 1);
843 921
844 switch (val.encoding) { 922 switch (val.encoding) {
845 // Replicate bits 923 // Replicate bits
846 case IntegerEncoding::JustBits: 924 case IntegerEncoding::JustBits:
847 out[outIdx++] = Replicate(bitval, bitlen, 8); 925 out[outIdx++] = FastReplicateTo8(bitval, bitlen);
848 break; 926 break;
849 927
850 // Use algorithm in C.2.13 928 // Use algorithm in C.2.13
@@ -962,13 +1040,13 @@ static u32 UnquantizeTexelWeight(const IntegerEncodedValue& val) {
962 u32 bitval = val.bit_value; 1040 u32 bitval = val.bit_value;
963 u32 bitlen = val.num_bits; 1041 u32 bitlen = val.num_bits;
964 1042
965 u32 A = Replicate(bitval & 1, 1, 7); 1043 u32 A = ReplicateBitTo7(bitval & 1);
966 u32 B = 0, C = 0, D = 0; 1044 u32 B = 0, C = 0, D = 0;
967 1045
968 u32 result = 0; 1046 u32 result = 0;
969 switch (val.encoding) { 1047 switch (val.encoding) {
970 case IntegerEncoding::JustBits: 1048 case IntegerEncoding::JustBits:
971 result = Replicate(bitval, bitlen, 6); 1049 result = FastReplicateTo6(bitval, bitlen);
972 break; 1050 break;
973 1051
974 case IntegerEncoding::Trit: { 1052 case IntegerEncoding::Trit: {
@@ -1047,7 +1125,7 @@ static u32 UnquantizeTexelWeight(const IntegerEncodedValue& val) {
1047 return result; 1125 return result;
1048} 1126}
1049 1127
1050static void UnquantizeTexelWeights(u32 out[2][144], const std::vector<IntegerEncodedValue>& weights, 1128static void UnquantizeTexelWeights(u32 out[2][144], const IntegerEncodedVector& weights,
1051 const TexelWeightParams& params, const u32 blockWidth, 1129 const TexelWeightParams& params, const u32 blockWidth,
1052 const u32 blockHeight) { 1130 const u32 blockHeight) {
1053 u32 weightIdx = 0; 1131 u32 weightIdx = 0;
@@ -1545,8 +1623,7 @@ static void DecompressBlock(const u8 inBuf[16], const u32 blockWidth, const u32
1545 static_cast<u8>((1 << (weightParams.GetPackedBitSize() % 8)) - 1); 1623 static_cast<u8>((1 << (weightParams.GetPackedBitSize() % 8)) - 1);
1546 memset(texelWeightData + clearByteStart, 0, 16 - clearByteStart); 1624 memset(texelWeightData + clearByteStart, 0, 16 - clearByteStart);
1547 1625
1548 std::vector<IntegerEncodedValue> texelWeightValues; 1626 IntegerEncodedVector texelWeightValues;
1549 texelWeightValues.reserve(64);
1550 1627
1551 InputBitStream weightStream(texelWeightData); 1628 InputBitStream weightStream(texelWeightData);
1552 1629
@@ -1568,9 +1645,9 @@ static void DecompressBlock(const u8 inBuf[16], const u32 blockWidth, const u32
1568 Pixel p; 1645 Pixel p;
1569 for (u32 c = 0; c < 4; c++) { 1646 for (u32 c = 0; c < 4; c++) {
1570 u32 C0 = endpos32s[partition][0].Component(c); 1647 u32 C0 = endpos32s[partition][0].Component(c);
1571 C0 = Replicate(C0, 8, 16); 1648 C0 = ReplicateByteTo16(C0);
1572 u32 C1 = endpos32s[partition][1].Component(c); 1649 u32 C1 = endpos32s[partition][1].Component(c);
1573 C1 = Replicate(C1, 8, 16); 1650 C1 = ReplicateByteTo16(C1);
1574 1651
1575 u32 plane = 0; 1652 u32 plane = 0;
1576 if (weightParams.m_bDualPlane && (((planeIdx + 1) & 3) == c)) { 1653 if (weightParams.m_bDualPlane && (((planeIdx + 1) & 3) == c)) {