Merge pull request #3631 from ReinUsesLisp/more-astc

texture/astc: More small ASTC optimizations
author: Mat M 2020-04-13 10:17:32 -0400
committer: GitHub 2020-04-13 10:17:32 -0400
commit: c4001225f64bbea511cf8df0885c77cb6ba70091 (patch)
tree: 850d379792b1018cddd9282da50fade32e38fa4f /src
parent: Merge pull request #3619 from ReinUsesLisp/i2i (diff)
parent: astc: Hard code bit depth changes to 8 and use fast replicate (diff)
download: yuzu-c4001225f64bbea511cf8df0885c77cb6ba70091.tar.gz
yuzu-c4001225f64bbea511cf8df0885c77cb6ba70091.tar.xz
yuzu-c4001225f64bbea511cf8df0885c77cb6ba70091.zip
1 file changed, 159 insertions, 82 deletions
diff --git a/src/video_core/textures/astc.cpp b/src/video_core/textures/astc.cpp
index 062b4f252..365bde2f1 100644
--- a/src/video_core/textures/astc.cpp
+++ b/src/video_core/textures/astc.cpp
@@ -20,6 +20,8 @@
 #include <cstring>
 #include <vector>
+#include <boost/container/static_vector.hpp>
 #include "common/common_types.h"
 #include "video_core/textures/astc.h"
@@ -39,25 +41,25 @@ constexpr u32 Popcnt(u32 n) {
 class InputBitStream {
 public:
-    explicit InputBitStream(const u8* ptr, std::size_t start_offset = 0)
+    constexpr explicit InputBitStream(const u8* ptr, std::size_t start_offset = 0)
-        : m_CurByte(ptr), m_NextBit(start_offset % 8) {}
+        : cur_byte{ptr}, next_bit{start_offset % 8} {}
-    std::size_t GetBitsRead() const {
+    constexpr std::size_t GetBitsRead() const {
-        return m_BitsRead;
+        return bits_read;
    }
-    u32 ReadBit() {
+    constexpr bool ReadBit() {
-        u32 bit = *m_CurByte >> m_NextBit++;
+        const bool bit = (*cur_byte >> next_bit++) & 1;
-        while (m_NextBit >= 8) {
+        while (next_bit >= 8) {
-            m_NextBit -= 8;
+            next_bit -= 8;
-            m_CurByte++;
+            cur_byte++;
        }
-        m_BitsRead++;
+        bits_read++;
-        return bit & 1;
+        return bit;
    }
-    u32 ReadBits(std::size_t nBits) {
+    constexpr u32 ReadBits(std::size_t nBits) {
        u32 ret = 0;
        for (std::size_t i = 0; i < nBits; ++i) {
            ret |= (ReadBit() & 1) << i;
@@ -66,7 +68,7 @@ public:
    }
    template <std::size_t nBits>
-    u32 ReadBits() {
+    constexpr u32 ReadBits() {
        u32 ret = 0;
        for (std::size_t i = 0; i < nBits; ++i) {
            ret |= (ReadBit() & 1) << i;
@@ -75,64 +77,58 @@ public:
    }
 private:
-    const u8* m_CurByte;
+    const u8* cur_byte;
-    std::size_t m_NextBit = 0;
+    std::size_t next_bit = 0;
-    std::size_t m_BitsRead = 0;
+    std::size_t bits_read = 0;
 };
 class OutputBitStream {
 public:
-    explicit OutputBitStream(u8* ptr, s32 nBits = 0, s32 start_offset = 0)
+    constexpr explicit OutputBitStream(u8* ptr, std::size_t bits = 0, std::size_t start_offset = 0)
-        : m_NumBits(nBits), m_CurByte(ptr), m_NextBit(start_offset % 8) {}
+        : cur_byte{ptr}, num_bits{bits}, next_bit{start_offset % 8} {}
-    ~OutputBitStream() = default;
-    s32 GetBitsWritten() const {
+    constexpr std::size_t GetBitsWritten() const {
-        return m_BitsWritten;
+        return bits_written;
    }
-    void WriteBitsR(u32 val, u32 nBits) {
+    constexpr void WriteBitsR(u32 val, u32 nBits) {
        for (u32 i = 0; i < nBits; i++) {
            WriteBit((val >> (nBits - i - 1)) & 1);
        }
    }
-    void WriteBits(u32 val, u32 nBits) {
+    constexpr void WriteBits(u32 val, u32 nBits) {
        for (u32 i = 0; i < nBits; i++) {
            WriteBit((val >> i) & 1);
        }
    }
 private:
-    void WriteBit(s32 b) {
+    constexpr void WriteBit(bool b) {
+        if (bits_written >= num_bits) {
-        if (done)
            return;
+        }
-        const u32 mask = 1 << m_NextBit++;
+        const u32 mask = 1 << next_bit++;
        // clear the bit
-        *m_CurByte &= static_cast<u8>(~mask);
+        *cur_byte &= static_cast<u8>(~mask);
        // Write the bit, if necessary
        if (b)
-            *m_CurByte |= static_cast<u8>(mask);
+            *cur_byte |= static_cast<u8>(mask);
        // Next byte?
-        if (m_NextBit >= 8) {
+        if (next_bit >= 8) {
-            m_CurByte += 1;
+            cur_byte += 1;
-            m_NextBit = 0;
+            next_bit = 0;
        }
-        done = done || ++m_BitsWritten >= m_NumBits;
    }
-    s32 m_BitsWritten = 0;
+    u8* cur_byte;
-    const s32 m_NumBits;
+    std::size_t num_bits;
-    u8* m_CurByte;
+    std::size_t bits_written = 0;
-    s32 m_NextBit = 0;
+    std::size_t next_bit = 0;
-    bool done = false;
 };
 template <typename IntType>
@@ -195,9 +191,13 @@ struct IntegerEncodedValue {
        u32 trit_value;
    };
 };
+using IntegerEncodedVector = boost::container::static_vector<
+    IntegerEncodedValue, 64,
+    boost::container::static_vector_options<
+        boost::container::inplace_alignment<alignof(IntegerEncodedValue)>,
+        boost::container::throw_on_overflow<false>>::type>;
-static void DecodeTritBlock(InputBitStream& bits, std::vector<IntegerEncodedValue>& result,
+static void DecodeTritBlock(InputBitStream& bits, IntegerEncodedVector& result, u32 nBitsPerValue) {
-                            u32 nBitsPerValue) {
    // Implement the algorithm in section C.2.12
    u32 m[5];
    u32 t[5];
@@ -255,7 +255,7 @@ static void DecodeTritBlock(InputBitStream& bits, std::vector<IntegerEncodedValu
    }
 }
-static void DecodeQus32Block(InputBitStream& bits, std::vector<IntegerEncodedValue>& result,
+static void DecodeQus32Block(InputBitStream& bits, IntegerEncodedVector& result,
                             u32 nBitsPerValue) {
    // Implement the algorithm in section C.2.12
    u32 m[3];
@@ -343,8 +343,8 @@ static constexpr std::array EncodingsValues = MakeEncodedValues();
 // Fills result with the values that are encoded in the given
 // bitstream. We must know beforehand what the maximum possible
 // value is, and how many values we're decoding.
-static void DecodeIntegerSequence(std::vector<IntegerEncodedValue>& result, InputBitStream& bits,
+static void DecodeIntegerSequence(IntegerEncodedVector& result, InputBitStream& bits, u32 maxRange,
-                                  u32 maxRange, u32 nValues) {
+                                  u32 nValues) {
    // Determine encoding parameters
    IntegerEncodedValue val = EncodingsValues[maxRange];
@@ -634,12 +634,14 @@ static void FillError(u32* outBuf, u32 blockWidth, u32 blockHeight) {
 // Replicates low numBits such that [(toBit - 1):(toBit - 1 - fromBit)]
 // is the same as [(numBits - 1):0] and repeats all the way down.
 template <typename IntType>
-static IntType Replicate(IntType val, u32 numBits, u32 toBit) {
+static constexpr IntType Replicate(IntType val, u32 numBits, u32 toBit) {
-    if (numBits == 0)
+    if (numBits == 0) {
        return 0;
-    if (toBit == 0)
+    }
+    if (toBit == 0) {
        return 0;
-    IntType v = val & static_cast<IntType>((1 << numBits) - 1);
+    }
+    const IntType v = val & static_cast<IntType>((1 << numBits) - 1);
    IntType res = v;
    u32 reslen = numBits;
    while (reslen < toBit) {
@@ -656,6 +658,89 @@ static IntType Replicate(IntType val, u32 numBits, u32 toBit) {
    return res;
 }
+static constexpr std::size_t NumReplicateEntries(u32 num_bits) {
+    return std::size_t(1) << num_bits;
+}
+template <typename IntType, u32 num_bits, u32 to_bit>
+static constexpr auto MakeReplicateTable() {
+    std::array<IntType, NumReplicateEntries(num_bits)> table{};
+    for (IntType value = 0; value < static_cast<IntType>(std::size(table)); ++value) {
+        table[value] = Replicate(value, num_bits, to_bit);
+    }
+    return table;
+}
+static constexpr auto REPLICATE_BYTE_TO_16_TABLE = MakeReplicateTable<u32, 8, 16>();
+static constexpr u32 ReplicateByteTo16(std::size_t value) {
+    return REPLICATE_BYTE_TO_16_TABLE[value];
+}
+static constexpr auto REPLICATE_BIT_TO_7_TABLE = MakeReplicateTable<u32, 1, 7>();
+static constexpr u32 ReplicateBitTo7(std::size_t value) {
+    return REPLICATE_BIT_TO_7_TABLE[value];
+}
+static constexpr auto REPLICATE_BIT_TO_9_TABLE = MakeReplicateTable<u32, 1, 9>();
+static constexpr u32 ReplicateBitTo9(std::size_t value) {
+    return REPLICATE_BIT_TO_9_TABLE[value];
+}
+static constexpr auto REPLICATE_1_BIT_TO_8_TABLE = MakeReplicateTable<u32, 1, 8>();
+static constexpr auto REPLICATE_2_BIT_TO_8_TABLE = MakeReplicateTable<u32, 2, 8>();
+static constexpr auto REPLICATE_3_BIT_TO_8_TABLE = MakeReplicateTable<u32, 3, 8>();
+static constexpr auto REPLICATE_4_BIT_TO_8_TABLE = MakeReplicateTable<u32, 4, 8>();
+static constexpr auto REPLICATE_5_BIT_TO_8_TABLE = MakeReplicateTable<u32, 5, 8>();
+static constexpr auto REPLICATE_6_BIT_TO_8_TABLE = MakeReplicateTable<u32, 6, 8>();
+static constexpr auto REPLICATE_7_BIT_TO_8_TABLE = MakeReplicateTable<u32, 7, 8>();
+static constexpr auto REPLICATE_8_BIT_TO_8_TABLE = MakeReplicateTable<u32, 8, 8>();
+/// Use a precompiled table with the most common usages, if it's not in the expected range, fallback
+/// to the runtime implementation
+static constexpr u32 FastReplicateTo8(u32 value, u32 num_bits) {
+    switch (num_bits) {
+    case 1:
+        return REPLICATE_1_BIT_TO_8_TABLE[value];
+    case 2:
+        return REPLICATE_2_BIT_TO_8_TABLE[value];
+    case 3:
+        return REPLICATE_3_BIT_TO_8_TABLE[value];
+    case 4:
+        return REPLICATE_4_BIT_TO_8_TABLE[value];
+    case 5:
+        return REPLICATE_5_BIT_TO_8_TABLE[value];
+    case 6:
+        return REPLICATE_6_BIT_TO_8_TABLE[value];
+    case 7:
+        return REPLICATE_7_BIT_TO_8_TABLE[value];
+    case 8:
+        return REPLICATE_8_BIT_TO_8_TABLE[value];
+    default:
+        return Replicate(value, num_bits, 8);
+    }
+}
+static constexpr auto REPLICATE_1_BIT_TO_6_TABLE = MakeReplicateTable<u32, 1, 6>();
+static constexpr auto REPLICATE_2_BIT_TO_6_TABLE = MakeReplicateTable<u32, 2, 6>();
+static constexpr auto REPLICATE_3_BIT_TO_6_TABLE = MakeReplicateTable<u32, 3, 6>();
+static constexpr auto REPLICATE_4_BIT_TO_6_TABLE = MakeReplicateTable<u32, 4, 6>();
+static constexpr auto REPLICATE_5_BIT_TO_6_TABLE = MakeReplicateTable<u32, 5, 6>();
+static constexpr u32 FastReplicateTo6(u32 value, u32 num_bits) {
+    switch (num_bits) {
+    case 1:
+        return REPLICATE_1_BIT_TO_6_TABLE[value];
+    case 2:
+        return REPLICATE_2_BIT_TO_6_TABLE[value];
+    case 3:
+        return REPLICATE_3_BIT_TO_6_TABLE[value];
+    case 4:
+        return REPLICATE_4_BIT_TO_6_TABLE[value];
+    case 5:
+        return REPLICATE_5_BIT_TO_6_TABLE[value];
+    default:
+        return Replicate(value, num_bits, 6);
+    }
+}
 class Pixel {
 protected:
    using ChannelType = s16;
@@ -674,10 +759,10 @@ public:
    // significant bits when going from larger to smaller bit depth
    // or by repeating the most significant bits when going from
    // smaller to larger bit depths.
-    void ChangeBitDepth(const u8 (&depth)[4]) {
+    void ChangeBitDepth() {
        for (u32 i = 0; i < 4; i++) {
-            Component(i) = ChangeBitDepth(Component(i), m_BitDepth[i], depth[i]);
+            Component(i) = ChangeBitDepth(Component(i), m_BitDepth[i]);
-            m_BitDepth[i] = depth[i];
+            m_BitDepth[i] = 8;
        }
    }
@@ -689,28 +774,23 @@ public:
    // Changes the bit depth of a single component. See the comment
    // above for how we do this.
-    static ChannelType ChangeBitDepth(Pixel::ChannelType val, u8 oldDepth, u8 newDepth) {
+    static ChannelType ChangeBitDepth(Pixel::ChannelType val, u8 oldDepth) {
-        assert(newDepth <= 8);
        assert(oldDepth <= 8);
-        if (oldDepth == newDepth) {
+        if (oldDepth == 8) {
            // Do nothing
            return val;
-        } else if (oldDepth == 0 && newDepth != 0) {
+        } else if (oldDepth == 0) {
-            return static_cast<ChannelType>((1 << newDepth) - 1);
+            return static_cast<ChannelType>((1 << 8) - 1);
-        } else if (newDepth > oldDepth) {
+        } else if (8 > oldDepth) {
-            return Replicate(val, oldDepth, newDepth);
+            return static_cast<ChannelType>(FastReplicateTo8(static_cast<u32>(val), oldDepth));
        } else {
            // oldDepth > newDepth
-            if (newDepth == 0) {
+            const u8 bitsWasted = static_cast<u8>(oldDepth - 8);
-                return 0xFF;
+            u16 v = static_cast<u16>(val);
-            } else {
+            v = static_cast<u16>((v + (1 << (bitsWasted - 1))) >> bitsWasted);
-                u8 bitsWasted = static_cast<u8>(oldDepth - newDepth);
+            v = ::std::min<u16>(::std::max<u16>(0, v), static_cast<u16>((1 << 8) - 1));
-                u16 v = static_cast<u16>(val);
+            return static_cast<u8>(v);
-                v = static_cast<u16>((v + (1 << (bitsWasted - 1))) >> bitsWasted);
-                v = ::std::min<u16>(::std::max<u16>(0, v), static_cast<u16>((1 << newDepth) - 1));
-                return static_cast<u8>(v);
-            }
        }
        assert(false && "We shouldn't get here.");
@@ -760,8 +840,7 @@ public:
    // up in the most-significant byte.
    u32 Pack() const {
        Pixel eightBit(*this);
-        const u8 eightBitDepth[4] = {8, 8, 8, 8};
+        eightBit.ChangeBitDepth();
-        eightBit.ChangeBitDepth(eightBitDepth);
        u32 r = 0;
        r |= eightBit.A();
@@ -816,8 +895,7 @@ static void DecodeColorValues(u32* out, u8* data, const u32* modes, const u32 nP
    }
    // We now have enough to decode our integer sequence.
-    std::vector<IntegerEncodedValue> decodedColorValues;
+    IntegerEncodedVector decodedColorValues;
-    decodedColorValues.reserve(32);
    InputBitStream colorStream(data);
    DecodeIntegerSequence(decodedColorValues, colorStream, range, nValues);
@@ -839,12 +917,12 @@ static void DecodeColorValues(u32* out, u8* data, const u32* modes, const u32 nP
        u32 A = 0, B = 0, C = 0, D = 0;
        // A is just the lsb replicated 9 times.
-        A = Replicate(bitval & 1, 1, 9);
+        A = ReplicateBitTo9(bitval & 1);
        switch (val.encoding) {
        // Replicate bits
        case IntegerEncoding::JustBits:
-            out[outIdx++] = Replicate(bitval, bitlen, 8);
+            out[outIdx++] = FastReplicateTo8(bitval, bitlen);
            break;
        // Use algorithm in C.2.13
@@ -962,13 +1040,13 @@ static u32 UnquantizeTexelWeight(const IntegerEncodedValue& val) {
    u32 bitval = val.bit_value;
    u32 bitlen = val.num_bits;
-    u32 A = Replicate(bitval & 1, 1, 7);
+    u32 A = ReplicateBitTo7(bitval & 1);
    u32 B = 0, C = 0, D = 0;
    u32 result = 0;
    switch (val.encoding) {
    case IntegerEncoding::JustBits:
-        result = Replicate(bitval, bitlen, 6);
+        result = FastReplicateTo6(bitval, bitlen);
        break;
    case IntegerEncoding::Trit: {
@@ -1047,7 +1125,7 @@ static u32 UnquantizeTexelWeight(const IntegerEncodedValue& val) {
    return result;
 }
-static void UnquantizeTexelWeights(u32 out[2][144], const std::vector<IntegerEncodedValue>& weights,
+static void UnquantizeTexelWeights(u32 out[2][144], const IntegerEncodedVector& weights,
                                   const TexelWeightParams& params, const u32 blockWidth,
                                   const u32 blockHeight) {
    u32 weightIdx = 0;
@@ -1545,8 +1623,7 @@ static void DecompressBlock(const u8 inBuf[16], const u32 blockWidth, const u32
        static_cast<u8>((1 << (weightParams.GetPackedBitSize() % 8)) - 1);
    memset(texelWeightData + clearByteStart, 0, 16 - clearByteStart);
-    std::vector<IntegerEncodedValue> texelWeightValues;
+    IntegerEncodedVector texelWeightValues;
-    texelWeightValues.reserve(64);
    InputBitStream weightStream(texelWeightData);
@@ -1568,9 +1645,9 @@ static void DecompressBlock(const u8 inBuf[16], const u32 blockWidth, const u32
            Pixel p;
            for (u32 c = 0; c < 4; c++) {
                u32 C0 = endpos32s[partition][0].Component(c);
-                C0 = Replicate(C0, 8, 16);
+                C0 = ReplicateByteTo16(C0);
                u32 C1 = endpos32s[partition][1].Component(c);
-                C1 = Replicate(C1, 8, 16);
+                C1 = ReplicateByteTo16(C1);
                u32 plane = 0;
                if (weightParams.m_bDualPlane && (((planeIdx + 1) & 3) == c)) {
author	Mat M	2020-04-13 10:17:32 -0400
committer	GitHub	2020-04-13 10:17:32 -0400
commit	c4001225f64bbea511cf8df0885c77cb6ba70091 (patch)
tree	850d379792b1018cddd9282da50fade32e38fa4f /src
parent	Merge pull request #3619 from ReinUsesLisp/i2i (diff)
parent	astc: Hard code bit depth changes to 8 and use fast replicate (diff)
download	yuzu-c4001225f64bbea511cf8df0885c77cb6ba70091.tar.gz yuzu-c4001225f64bbea511cf8df0885c77cb6ba70091.tar.xz yuzu-c4001225f64bbea511cf8df0885c77cb6ba70091.zip

diff --git a/src/video_core/textures/astc.cpp b/src/video_core/textures/astc.cpp
index 062b4f252..365bde2f1 100644
--- a/src/video_core/textures/astc.cpp
+++ b/src/video_core/textures/astc.cpp
@@ -20,6 +20,8 @@
20	#include <cstring>	20	#include <cstring>
21	#include <vector>	21	#include <vector>
22		22
		23	#include <boost/container/static_vector.hpp>
		24
23	#include "common/common_types.h"	25	#include "common/common_types.h"
24		26
25	#include "video_core/textures/astc.h"	27	#include "video_core/textures/astc.h"
@@ -39,25 +41,25 @@ constexpr u32 Popcnt(u32 n) {
39		41
40	class InputBitStream {	42	class InputBitStream {
41	public:	43	public:
42	explicit InputBitStream(const u8* ptr, std::size_t start_offset = 0)	44	constexpr explicit InputBitStream(const u8* ptr, std::size_t start_offset = 0)
43	: m_CurByte(ptr), m_NextBit(start_offset % 8) {}	45	: cur_byte{ptr}, next_bit{start_offset % 8} {}
44		46
45	std::size_t GetBitsRead() const {	47	constexpr std::size_t GetBitsRead() const {
46	return m_BitsRead;	48	return bits_read;
47	}	49	}
48		50
49	u32 ReadBit() {	51	constexpr bool ReadBit() {
50	u32 bit = *m_CurByte >> m_NextBit++;	52	const bool bit = (*cur_byte >> next_bit++) & 1;
51	while (m_NextBit >= 8) {	53	while (next_bit >= 8) {
52	m_NextBit -= 8;	54	next_bit -= 8;
53	m_CurByte++;	55	cur_byte++;
54	}	56	}
55		57
56	m_BitsRead++;	58	bits_read++;
57	return bit & 1;	59	return bit;
58	}	60	}
59		61
60	u32 ReadBits(std::size_t nBits) {	62	constexpr u32 ReadBits(std::size_t nBits) {
61	u32 ret = 0;	63	u32 ret = 0;
62	for (std::size_t i = 0; i < nBits; ++i) {	64	for (std::size_t i = 0; i < nBits; ++i) {
63	ret |= (ReadBit() & 1) << i;	65	ret |= (ReadBit() & 1) << i;
@@ -66,7 +68,7 @@ public:
66	}	68	}
67		69
68	template <std::size_t nBits>	70	template <std::size_t nBits>
69	u32 ReadBits() {	71	constexpr u32 ReadBits() {
70	u32 ret = 0;	72	u32 ret = 0;
71	for (std::size_t i = 0; i < nBits; ++i) {	73	for (std::size_t i = 0; i < nBits; ++i) {
71	ret |= (ReadBit() & 1) << i;	73	ret |= (ReadBit() & 1) << i;
@@ -75,64 +77,58 @@ public:
75	}	77	}
76		78
77	private:	79	private:
78	const u8* m_CurByte;	80	const u8* cur_byte;
79	std::size_t m_NextBit = 0;	81	std::size_t next_bit = 0;
80	std::size_t m_BitsRead = 0;	82	std::size_t bits_read = 0;
81	};	83	};
82		84
83	class OutputBitStream {	85	class OutputBitStream {
84	public:	86	public:
85	explicit OutputBitStream(u8* ptr, s32 nBits = 0, s32 start_offset = 0)	87	constexpr explicit OutputBitStream(u8* ptr, std::size_t bits = 0, std::size_t start_offset = 0)
86	: m_NumBits(nBits), m_CurByte(ptr), m_NextBit(start_offset % 8) {}	88	: cur_byte{ptr}, num_bits{bits}, next_bit{start_offset % 8} {}
87
88	~OutputBitStream() = default;
89		89
90	s32 GetBitsWritten() const {	90	constexpr std::size_t GetBitsWritten() const {
91	return m_BitsWritten;	91	return bits_written;
92	}	92	}
93		93
94	void WriteBitsR(u32 val, u32 nBits) {	94	constexpr void WriteBitsR(u32 val, u32 nBits) {
95	for (u32 i = 0; i < nBits; i++) {	95	for (u32 i = 0; i < nBits; i++) {
96	WriteBit((val >> (nBits - i - 1)) & 1);	96	WriteBit((val >> (nBits - i - 1)) & 1);
97	}	97	}
98	}	98	}
99		99
100	void WriteBits(u32 val, u32 nBits) {	100	constexpr void WriteBits(u32 val, u32 nBits) {
101	for (u32 i = 0; i < nBits; i++) {	101	for (u32 i = 0; i < nBits; i++) {
102	WriteBit((val >> i) & 1);	102	WriteBit((val >> i) & 1);
103	}	103	}
104	}	104	}
105		105
106	private:	106	private:
107	void WriteBit(s32 b) {	107	constexpr void WriteBit(bool b) {
108		108	if (bits_written >= num_bits) {
109	if (done)
110	return;	109	return;
		110	}
111		111
112	const u32 mask = 1 << m_NextBit++;	112	const u32 mask = 1 << next_bit++;
113		113
114	// clear the bit	114	// clear the bit
115	*m_CurByte &= static_cast<u8>(~mask);	115	*cur_byte &= static_cast<u8>(~mask);
116		116
117	// Write the bit, if necessary	117	// Write the bit, if necessary
118	if (b)	118	if (b)
119	*m_CurByte |= static_cast<u8>(mask);	119	*cur_byte |= static_cast<u8>(mask);
120		120
121	// Next byte?	121	// Next byte?
122	if (m_NextBit >= 8) {	122	if (next_bit >= 8) {
123	m_CurByte += 1;	123	cur_byte += 1;
124	m_NextBit = 0;	124	next_bit = 0;
125	}	125	}
126
127	done = done || ++m_BitsWritten >= m_NumBits;
128	}	126	}
129		127
130	s32 m_BitsWritten = 0;	128	u8* cur_byte;
131	const s32 m_NumBits;	129	std::size_t num_bits;
132	u8* m_CurByte;	130	std::size_t bits_written = 0;
133	s32 m_NextBit = 0;	131	std::size_t next_bit = 0;
134
135	bool done = false;
136	};	132	};
137		133
138	template <typename IntType>	134	template <typename IntType>
@@ -195,9 +191,13 @@ struct IntegerEncodedValue {
195	u32 trit_value;	191	u32 trit_value;
196	};	192	};
197	};	193	};
		194	using IntegerEncodedVector = boost::container::static_vector<
		195	IntegerEncodedValue, 64,
		196	boost::container::static_vector_options<
		197	boost::container::inplace_alignment<alignof(IntegerEncodedValue)>,
		198	boost::container::throw_on_overflow<false>>::type>;
198		199
199	static void DecodeTritBlock(InputBitStream& bits, std::vector<IntegerEncodedValue>& result,	200	static void DecodeTritBlock(InputBitStream& bits, IntegerEncodedVector& result, u32 nBitsPerValue) {
200	u32 nBitsPerValue) {
201	// Implement the algorithm in section C.2.12	201	// Implement the algorithm in section C.2.12
202	u32 m[5];	202	u32 m[5];
203	u32 t[5];	203	u32 t[5];
@@ -255,7 +255,7 @@ static void DecodeTritBlock(InputBitStream& bits, std::vector<IntegerEncodedValu
255	}	255	}
256	}	256	}
257		257
258	static void DecodeQus32Block(InputBitStream& bits, std::vector<IntegerEncodedValue>& result,	258	static void DecodeQus32Block(InputBitStream& bits, IntegerEncodedVector& result,
259	u32 nBitsPerValue) {	259	u32 nBitsPerValue) {
260	// Implement the algorithm in section C.2.12	260	// Implement the algorithm in section C.2.12
261	u32 m[3];	261	u32 m[3];
@@ -343,8 +343,8 @@ static constexpr std::array EncodingsValues = MakeEncodedValues();
343	// Fills result with the values that are encoded in the given	343	// Fills result with the values that are encoded in the given
344	// bitstream. We must know beforehand what the maximum possible	344	// bitstream. We must know beforehand what the maximum possible
345	// value is, and how many values we're decoding.	345	// value is, and how many values we're decoding.
346	static void DecodeIntegerSequence(std::vector<IntegerEncodedValue>& result, InputBitStream& bits,	346	static void DecodeIntegerSequence(IntegerEncodedVector& result, InputBitStream& bits, u32 maxRange,
347	u32 maxRange, u32 nValues) {	347	u32 nValues) {
348	// Determine encoding parameters	348	// Determine encoding parameters
349	IntegerEncodedValue val = EncodingsValues[maxRange];	349	IntegerEncodedValue val = EncodingsValues[maxRange];
350		350
@@ -634,12 +634,14 @@ static void FillError(u32* outBuf, u32 blockWidth, u32 blockHeight) {
634	// Replicates low numBits such that [(toBit - 1):(toBit - 1 - fromBit)]	634	// Replicates low numBits such that [(toBit - 1):(toBit - 1 - fromBit)]
635	// is the same as [(numBits - 1):0] and repeats all the way down.	635	// is the same as [(numBits - 1):0] and repeats all the way down.
636	template <typename IntType>	636	template <typename IntType>
637	static IntType Replicate(IntType val, u32 numBits, u32 toBit) {	637	static constexpr IntType Replicate(IntType val, u32 numBits, u32 toBit) {
638	if (numBits == 0)	638	if (numBits == 0) {
639	return 0;	639	return 0;
640	if (toBit == 0)	640	}
		641	if (toBit == 0) {
641	return 0;	642	return 0;
642	IntType v = val & static_cast<IntType>((1 << numBits) - 1);	643	}
		644	const IntType v = val & static_cast<IntType>((1 << numBits) - 1);
643	IntType res = v;	645	IntType res = v;
644	u32 reslen = numBits;	646	u32 reslen = numBits;
645	while (reslen < toBit) {	647	while (reslen < toBit) {
@@ -656,6 +658,89 @@ static IntType Replicate(IntType val, u32 numBits, u32 toBit) {
656	return res;	658	return res;
657	}	659	}
658		660
		661	static constexpr std::size_t NumReplicateEntries(u32 num_bits) {
		662	return std::size_t(1) << num_bits;
		663	}
		664
		665	template <typename IntType, u32 num_bits, u32 to_bit>
		666	static constexpr auto MakeReplicateTable() {
		667	std::array<IntType, NumReplicateEntries(num_bits)> table{};
		668	for (IntType value = 0; value < static_cast<IntType>(std::size(table)); ++value) {
		669	table[value] = Replicate(value, num_bits, to_bit);
		670	}
		671	return table;
		672	}
		673
		674	static constexpr auto REPLICATE_BYTE_TO_16_TABLE = MakeReplicateTable<u32, 8, 16>();
		675	static constexpr u32 ReplicateByteTo16(std::size_t value) {
		676	return REPLICATE_BYTE_TO_16_TABLE[value];
		677	}
		678
		679	static constexpr auto REPLICATE_BIT_TO_7_TABLE = MakeReplicateTable<u32, 1, 7>();
		680	static constexpr u32 ReplicateBitTo7(std::size_t value) {
		681	return REPLICATE_BIT_TO_7_TABLE[value];
		682	}
		683
		684	static constexpr auto REPLICATE_BIT_TO_9_TABLE = MakeReplicateTable<u32, 1, 9>();
		685	static constexpr u32 ReplicateBitTo9(std::size_t value) {
		686	return REPLICATE_BIT_TO_9_TABLE[value];
		687	}
		688
		689	static constexpr auto REPLICATE_1_BIT_TO_8_TABLE = MakeReplicateTable<u32, 1, 8>();
		690	static constexpr auto REPLICATE_2_BIT_TO_8_TABLE = MakeReplicateTable<u32, 2, 8>();
		691	static constexpr auto REPLICATE_3_BIT_TO_8_TABLE = MakeReplicateTable<u32, 3, 8>();
		692	static constexpr auto REPLICATE_4_BIT_TO_8_TABLE = MakeReplicateTable<u32, 4, 8>();
		693	static constexpr auto REPLICATE_5_BIT_TO_8_TABLE = MakeReplicateTable<u32, 5, 8>();
		694	static constexpr auto REPLICATE_6_BIT_TO_8_TABLE = MakeReplicateTable<u32, 6, 8>();
		695	static constexpr auto REPLICATE_7_BIT_TO_8_TABLE = MakeReplicateTable<u32, 7, 8>();
		696	static constexpr auto REPLICATE_8_BIT_TO_8_TABLE = MakeReplicateTable<u32, 8, 8>();
		697	/// Use a precompiled table with the most common usages, if it's not in the expected range, fallback
		698	/// to the runtime implementation
		699	static constexpr u32 FastReplicateTo8(u32 value, u32 num_bits) {
		700	switch (num_bits) {
		701	case 1:
		702	return REPLICATE_1_BIT_TO_8_TABLE[value];
		703	case 2:
		704	return REPLICATE_2_BIT_TO_8_TABLE[value];
		705	case 3:
		706	return REPLICATE_3_BIT_TO_8_TABLE[value];
		707	case 4:
		708	return REPLICATE_4_BIT_TO_8_TABLE[value];
		709	case 5:
		710	return REPLICATE_5_BIT_TO_8_TABLE[value];
		711	case 6:
		712	return REPLICATE_6_BIT_TO_8_TABLE[value];
		713	case 7:
		714	return REPLICATE_7_BIT_TO_8_TABLE[value];
		715	case 8:
		716	return REPLICATE_8_BIT_TO_8_TABLE[value];
		717	default:
		718	return Replicate(value, num_bits, 8);
		719	}
		720	}
		721
		722	static constexpr auto REPLICATE_1_BIT_TO_6_TABLE = MakeReplicateTable<u32, 1, 6>();
		723	static constexpr auto REPLICATE_2_BIT_TO_6_TABLE = MakeReplicateTable<u32, 2, 6>();
		724	static constexpr auto REPLICATE_3_BIT_TO_6_TABLE = MakeReplicateTable<u32, 3, 6>();
		725	static constexpr auto REPLICATE_4_BIT_TO_6_TABLE = MakeReplicateTable<u32, 4, 6>();
		726	static constexpr auto REPLICATE_5_BIT_TO_6_TABLE = MakeReplicateTable<u32, 5, 6>();
		727	static constexpr u32 FastReplicateTo6(u32 value, u32 num_bits) {
		728	switch (num_bits) {
		729	case 1:
		730	return REPLICATE_1_BIT_TO_6_TABLE[value];
		731	case 2:
		732	return REPLICATE_2_BIT_TO_6_TABLE[value];
		733	case 3:
		734	return REPLICATE_3_BIT_TO_6_TABLE[value];
		735	case 4:
		736	return REPLICATE_4_BIT_TO_6_TABLE[value];
		737	case 5:
		738	return REPLICATE_5_BIT_TO_6_TABLE[value];
		739	default:
		740	return Replicate(value, num_bits, 6);
		741	}
		742	}
		743
659	class Pixel {	744	class Pixel {
660	protected:	745	protected:
661	using ChannelType = s16;	746	using ChannelType = s16;
@@ -674,10 +759,10 @@ public:
674	// significant bits when going from larger to smaller bit depth	759	// significant bits when going from larger to smaller bit depth
675	// or by repeating the most significant bits when going from	760	// or by repeating the most significant bits when going from
676	// smaller to larger bit depths.	761	// smaller to larger bit depths.
677	void ChangeBitDepth(const u8 (&depth)[4]) {	762	void ChangeBitDepth() {
678	for (u32 i = 0; i < 4; i++) {	763	for (u32 i = 0; i < 4; i++) {
679	Component(i) = ChangeBitDepth(Component(i), m_BitDepth[i], depth[i]);	764	Component(i) = ChangeBitDepth(Component(i), m_BitDepth[i]);
680	m_BitDepth[i] = depth[i];	765	m_BitDepth[i] = 8;
681	}	766	}
682	}	767	}
683		768
@@ -689,28 +774,23 @@ public:
689		774
690	// Changes the bit depth of a single component. See the comment	775	// Changes the bit depth of a single component. See the comment
691	// above for how we do this.	776	// above for how we do this.
692	static ChannelType ChangeBitDepth(Pixel::ChannelType val, u8 oldDepth, u8 newDepth) {	777	static ChannelType ChangeBitDepth(Pixel::ChannelType val, u8 oldDepth) {
693	assert(newDepth <= 8);
694	assert(oldDepth <= 8);	778	assert(oldDepth <= 8);
695		779
696	if (oldDepth == newDepth) {	780	if (oldDepth == 8) {
697	// Do nothing	781	// Do nothing
698	return val;	782	return val;
699	} else if (oldDepth == 0 && newDepth != 0) {	783	} else if (oldDepth == 0) {
700	return static_cast<ChannelType>((1 << newDepth) - 1);	784	return static_cast<ChannelType>((1 << 8) - 1);
701	} else if (newDepth > oldDepth) {	785	} else if (8 > oldDepth) {
702	return Replicate(val, oldDepth, newDepth);	786	return static_cast<ChannelType>(FastReplicateTo8(static_cast<u32>(val), oldDepth));
703	} else {	787	} else {
704	// oldDepth > newDepth	788	// oldDepth > newDepth
705	if (newDepth == 0) {	789	const u8 bitsWasted = static_cast<u8>(oldDepth - 8);
706	return 0xFF;	790	u16 v = static_cast<u16>(val);
707	} else {	791	v = static_cast<u16>((v + (1 << (bitsWasted - 1))) >> bitsWasted);
708	u8 bitsWasted = static_cast<u8>(oldDepth - newDepth);	792	v = ::std::min<u16>(::std::max<u16>(0, v), static_cast<u16>((1 << 8) - 1));
709	u16 v = static_cast<u16>(val);	793	return static_cast<u8>(v);
710	v = static_cast<u16>((v + (1 << (bitsWasted - 1))) >> bitsWasted);
711	v = ::std::min<u16>(::std::max<u16>(0, v), static_cast<u16>((1 << newDepth) - 1));
712	return static_cast<u8>(v);
713	}
714	}	794	}
715		795
716	assert(false && "We shouldn't get here.");	796	assert(false && "We shouldn't get here.");
@@ -760,8 +840,7 @@ public:
760	// up in the most-significant byte.	840	// up in the most-significant byte.
761	u32 Pack() const {	841	u32 Pack() const {
762	Pixel eightBit(*this);	842	Pixel eightBit(*this);
763	const u8 eightBitDepth[4] = {8, 8, 8, 8};	843	eightBit.ChangeBitDepth();
764	eightBit.ChangeBitDepth(eightBitDepth);
765		844
766	u32 r = 0;	845	u32 r = 0;
767	r |= eightBit.A();	846	r |= eightBit.A();
@@ -816,8 +895,7 @@ static void DecodeColorValues(u32* out, u8* data, const u32* modes, const u32 nP
816	}	895	}
817		896
818	// We now have enough to decode our integer sequence.	897	// We now have enough to decode our integer sequence.
819	std::vector<IntegerEncodedValue> decodedColorValues;	898	IntegerEncodedVector decodedColorValues;
820	decodedColorValues.reserve(32);
821		899
822	InputBitStream colorStream(data);	900	InputBitStream colorStream(data);
823	DecodeIntegerSequence(decodedColorValues, colorStream, range, nValues);	901	DecodeIntegerSequence(decodedColorValues, colorStream, range, nValues);
@@ -839,12 +917,12 @@ static void DecodeColorValues(u32* out, u8* data, const u32* modes, const u32 nP
839		917
840	u32 A = 0, B = 0, C = 0, D = 0;	918	u32 A = 0, B = 0, C = 0, D = 0;
841	// A is just the lsb replicated 9 times.	919	// A is just the lsb replicated 9 times.
842	A = Replicate(bitval & 1, 1, 9);	920	A = ReplicateBitTo9(bitval & 1);
843		921
844	switch (val.encoding) {	922	switch (val.encoding) {
845	// Replicate bits	923	// Replicate bits
846	case IntegerEncoding::JustBits:	924	case IntegerEncoding::JustBits:
847	out[outIdx++] = Replicate(bitval, bitlen, 8);	925	out[outIdx++] = FastReplicateTo8(bitval, bitlen);
848	break;	926	break;
849		927
850	// Use algorithm in C.2.13	928	// Use algorithm in C.2.13
@@ -962,13 +1040,13 @@ static u32 UnquantizeTexelWeight(const IntegerEncodedValue& val) {
962	u32 bitval = val.bit_value;	1040	u32 bitval = val.bit_value;
963	u32 bitlen = val.num_bits;	1041	u32 bitlen = val.num_bits;
964		1042
965	u32 A = Replicate(bitval & 1, 1, 7);	1043	u32 A = ReplicateBitTo7(bitval & 1);
966	u32 B = 0, C = 0, D = 0;	1044	u32 B = 0, C = 0, D = 0;
967		1045
968	u32 result = 0;	1046	u32 result = 0;
969	switch (val.encoding) {	1047	switch (val.encoding) {
970	case IntegerEncoding::JustBits:	1048	case IntegerEncoding::JustBits:
971	result = Replicate(bitval, bitlen, 6);	1049	result = FastReplicateTo6(bitval, bitlen);
972	break;	1050	break;
973		1051
974	case IntegerEncoding::Trit: {	1052	case IntegerEncoding::Trit: {
@@ -1047,7 +1125,7 @@ static u32 UnquantizeTexelWeight(const IntegerEncodedValue& val) {
1047	return result;	1125	return result;
1048	}	1126	}
1049		1127
1050	static void UnquantizeTexelWeights(u32 out[2][144], const std::vector<IntegerEncodedValue>& weights,	1128	static void UnquantizeTexelWeights(u32 out[2][144], const IntegerEncodedVector& weights,
1051	const TexelWeightParams& params, const u32 blockWidth,	1129	const TexelWeightParams& params, const u32 blockWidth,
1052	const u32 blockHeight) {	1130	const u32 blockHeight) {
1053	u32 weightIdx = 0;	1131	u32 weightIdx = 0;
@@ -1545,8 +1623,7 @@ static void DecompressBlock(const u8 inBuf[16], const u32 blockWidth, const u32
1545	static_cast<u8>((1 << (weightParams.GetPackedBitSize() % 8)) - 1);	1623	static_cast<u8>((1 << (weightParams.GetPackedBitSize() % 8)) - 1);
1546	memset(texelWeightData + clearByteStart, 0, 16 - clearByteStart);	1624	memset(texelWeightData + clearByteStart, 0, 16 - clearByteStart);
1547		1625
1548	std::vector<IntegerEncodedValue> texelWeightValues;	1626	IntegerEncodedVector texelWeightValues;
1549	texelWeightValues.reserve(64);
1550		1627
1551	InputBitStream weightStream(texelWeightData);	1628	InputBitStream weightStream(texelWeightData);
1552		1629
@@ -1568,9 +1645,9 @@ static void DecompressBlock(const u8 inBuf[16], const u32 blockWidth, const u32
1568	Pixel p;	1645	Pixel p;
1569	for (u32 c = 0; c < 4; c++) {	1646	for (u32 c = 0; c < 4; c++) {
1570	u32 C0 = endpos32s[partition][0].Component(c);	1647	u32 C0 = endpos32s[partition][0].Component(c);
1571	C0 = Replicate(C0, 8, 16);	1648	C0 = ReplicateByteTo16(C0);
1572	u32 C1 = endpos32s[partition][1].Component(c);	1649	u32 C1 = endpos32s[partition][1].Component(c);
1573	C1 = Replicate(C1, 8, 16);	1650	C1 = ReplicateByteTo16(C1);
1574		1651
1575	u32 plane = 0;	1652	u32 plane = 0;
1576	if (weightParams.m_bDualPlane && (((planeIdx + 1) & 3) == c)) {	1653	if (weightParams.m_bDualPlane && (((planeIdx + 1) & 3) == c)) {