renderer_opengl: Accelerate ASTC texture decoding with a compute shader

ASTC texture decoding is currently handled by a CPU decoder for GPUs without native ASTC decoding support (most desktop GPUs). This causes noticeable performance degradation in titles which use the format extensively. This commit adds support for accelerating ASTC decoding with a compute shader on OpenGL for GPUs without native support.
author: ameerj 2021-02-13 15:50:12 -0500
committer: ameerj 2021-03-13 12:16:03 -0500
commit: 2985e5e94c82febcf215feb0023f4184b38bb24a (patch)
tree: 7b7cd8be3605560707a74a74c281577920a24248 /src/video_core/textures/astc.h
parent: Merge pull request #6053 from Morph1984/time-CalculateSpanBetween (diff)
download: yuzu-2985e5e94c82febcf215feb0023f4184b38bb24a.tar.gz
yuzu-2985e5e94c82febcf215feb0023f4184b38bb24a.tar.xz
yuzu-2985e5e94c82febcf215feb0023f4184b38bb24a.zip
1 files changed, 190 insertions, 0 deletions
diff --git a/src/video_core/textures/astc.h b/src/video_core/textures/astc.h
index 9105119bc..bc8bddaec 100644
--- a/src/video_core/textures/astc.h
+++ b/src/video_core/textures/astc.h
@@ -8,6 +8,196 @@
 namespace Tegra::Texture::ASTC {
+/// Count the number of bits set in a number.
+constexpr u32 Popcnt(u32 n) {
+    u32 c = 0;
+    for (; n; c++) {
+        n &= n - 1;
+    }
+    return c;
+}
+enum class IntegerEncoding { JustBits, Qus32, Trit };
+struct IntegerEncodedValue {
+    constexpr IntegerEncodedValue() = default;
+    constexpr IntegerEncodedValue(IntegerEncoding encoding_, u32 num_bits_)
+        : encoding{encoding_}, num_bits{num_bits_} {}
+    constexpr bool MatchesEncoding(const IntegerEncodedValue& other) const {
+        return encoding == other.encoding && num_bits == other.num_bits;
+    }
+    // Returns the number of bits required to encode nVals values.
+    u32 GetBitLength(u32 nVals) const {
+        u32 totalBits = num_bits * nVals;
+        if (encoding == IntegerEncoding::Trit) {
+            totalBits += (nVals * 8 + 4) / 5;
+        } else if (encoding == IntegerEncoding::Qus32) {
+            totalBits += (nVals * 7 + 2) / 3;
+        }
+        return totalBits;
+    }
+    IntegerEncoding encoding{};
+    u32 num_bits = 0;
+    u32 bit_value = 0;
+    union {
+        u32 qus32_value = 0;
+        u32 trit_value;
+    };
+};
+// Returns a new instance of this struct that corresponds to the largest
+// encoding whose values can take no more than maxVal.
+static constexpr IntegerEncodedValue CreateEncoding(u32 maxVal) {
+    while (maxVal > 0) {
+        u32 check = maxVal + 1;
+        // Is maxVal of the type 2^n - 1?
+        if (!(check & (check - 1))) {
+            return IntegerEncodedValue(IntegerEncoding::JustBits, Popcnt(maxVal));
+        }
+        // Is maxVal of the type 3*2^n - 1?
+        if ((check % 3 == 0) && !((check / 3) & ((check / 3) - 1))) {
+            return IntegerEncodedValue(IntegerEncoding::Trit, Popcnt(check / 3 - 1));
+        }
+        // Is maxVal of the type 5*2^n - 1?
+        if ((check % 5 == 0) && !((check / 5) & ((check / 5) - 1))) {
+            return IntegerEncodedValue(IntegerEncoding::Qus32, Popcnt(check / 5 - 1));
+        }
+        // Apparently it can't be represented with a bounded integer sequence...
+        // just iterate.
+        maxVal--;
+    }
+    return IntegerEncodedValue(IntegerEncoding::JustBits, 0);
+}
+static constexpr std::array<IntegerEncodedValue, 256> MakeEncodedValues() {
+    std::array<IntegerEncodedValue, 256> encodings{};
+    for (std::size_t i = 0; i < encodings.size(); ++i) {
+        encodings[i] = CreateEncoding(static_cast<u32>(i));
+    }
+    return encodings;
+}
+static constexpr std::array<IntegerEncodedValue, 256> EncodingsValues = MakeEncodedValues();
+// Replicates the low numBits bits of val such that bits [(toBit - 1):(toBit - numBits)]
+// are the same as bits [(numBits - 1):0], repeating the pattern all the way down.
+template <typename IntType>
+static constexpr IntType Replicate(IntType val, u32 numBits, u32 toBit) {
+    if (numBits == 0) {
+        return 0;
+    }
+    if (toBit == 0) {
+        return 0;
+    }
+    const IntType v = val & static_cast<IntType>((1 << numBits) - 1);
+    IntType res = v;
+    u32 reslen = numBits;
+    while (reslen < toBit) {
+        u32 comp = 0;
+        if (numBits > toBit - reslen) {
+            u32 newshift = toBit - reslen;
+            comp = numBits - newshift;
+            numBits = newshift;
+        }
+        res = static_cast<IntType>(res << numBits);
+        res = static_cast<IntType>(res | (v >> comp));
+        reslen += numBits;
+    }
+    return res;
+}
+static constexpr std::size_t NumReplicateEntries(u32 num_bits) {
+    return std::size_t(1) << num_bits;
+}
+template <typename IntType, u32 num_bits, u32 to_bit>
+static constexpr auto MakeReplicateTable() {
+    std::array<IntType, NumReplicateEntries(num_bits)> table{};
+    for (IntType value = 0; value < static_cast<IntType>(std::size(table)); ++value) {
+        table[value] = Replicate(value, num_bits, to_bit);
+    }
+    return table;
+}
+static constexpr auto REPLICATE_BYTE_TO_16_TABLE = MakeReplicateTable<u32, 8, 16>();
+static constexpr u32 ReplicateByteTo16(std::size_t value) {
+    return REPLICATE_BYTE_TO_16_TABLE[value];
+}
+static constexpr auto REPLICATE_BIT_TO_7_TABLE = MakeReplicateTable<u32, 1, 7>();
+static constexpr u32 ReplicateBitTo7(std::size_t value) {
+    return REPLICATE_BIT_TO_7_TABLE[value];
+}
+static constexpr auto REPLICATE_BIT_TO_9_TABLE = MakeReplicateTable<u32, 1, 9>();
+static constexpr u32 ReplicateBitTo9(std::size_t value) {
+    return REPLICATE_BIT_TO_9_TABLE[value];
+}
+static constexpr auto REPLICATE_1_BIT_TO_8_TABLE = MakeReplicateTable<u32, 1, 8>();
+static constexpr auto REPLICATE_2_BIT_TO_8_TABLE = MakeReplicateTable<u32, 2, 8>();
+static constexpr auto REPLICATE_3_BIT_TO_8_TABLE = MakeReplicateTable<u32, 3, 8>();
+static constexpr auto REPLICATE_4_BIT_TO_8_TABLE = MakeReplicateTable<u32, 4, 8>();
+static constexpr auto REPLICATE_5_BIT_TO_8_TABLE = MakeReplicateTable<u32, 5, 8>();
+static constexpr auto REPLICATE_6_BIT_TO_8_TABLE = MakeReplicateTable<u32, 6, 8>();
+static constexpr auto REPLICATE_7_BIT_TO_8_TABLE = MakeReplicateTable<u32, 7, 8>();
+static constexpr auto REPLICATE_8_BIT_TO_8_TABLE = MakeReplicateTable<u32, 8, 8>();
+/// Uses a precomputed table for the most common bit counts; if num_bits is outside the expected
+/// range, falls back to the runtime implementation.
+static constexpr u32 FastReplicateTo8(u32 value, u32 num_bits) {
+    switch (num_bits) {
+    case 1:
+        return REPLICATE_1_BIT_TO_8_TABLE[value];
+    case 2:
+        return REPLICATE_2_BIT_TO_8_TABLE[value];
+    case 3:
+        return REPLICATE_3_BIT_TO_8_TABLE[value];
+    case 4:
+        return REPLICATE_4_BIT_TO_8_TABLE[value];
+    case 5:
+        return REPLICATE_5_BIT_TO_8_TABLE[value];
+    case 6:
+        return REPLICATE_6_BIT_TO_8_TABLE[value];
+    case 7:
+        return REPLICATE_7_BIT_TO_8_TABLE[value];
+    case 8:
+        return REPLICATE_8_BIT_TO_8_TABLE[value];
+    default:
+        return Replicate(value, num_bits, 8);
+    }
+}
+static constexpr auto REPLICATE_1_BIT_TO_6_TABLE = MakeReplicateTable<u32, 1, 6>();
+static constexpr auto REPLICATE_2_BIT_TO_6_TABLE = MakeReplicateTable<u32, 2, 6>();
+static constexpr auto REPLICATE_3_BIT_TO_6_TABLE = MakeReplicateTable<u32, 3, 6>();
+static constexpr auto REPLICATE_4_BIT_TO_6_TABLE = MakeReplicateTable<u32, 4, 6>();
+static constexpr auto REPLICATE_5_BIT_TO_6_TABLE = MakeReplicateTable<u32, 5, 6>();
+static constexpr u32 FastReplicateTo6(u32 value, u32 num_bits) {
+    switch (num_bits) {
+    case 1:
+        return REPLICATE_1_BIT_TO_6_TABLE[value];
+    case 2:
+        return REPLICATE_2_BIT_TO_6_TABLE[value];
+    case 3:
+        return REPLICATE_3_BIT_TO_6_TABLE[value];
+    case 4:
+        return REPLICATE_4_BIT_TO_6_TABLE[value];
+    case 5:
+        return REPLICATE_5_BIT_TO_6_TABLE[value];
+    default:
+        return Replicate(value, num_bits, 6);
+    }
+}
 void Decompress(std::span<const uint8_t> data, uint32_t width, uint32_t height, uint32_t depth,
                uint32_t block_width, uint32_t block_height, std::span<uint8_t> output);
author	ameerj	2021-02-13 15:50:12 -0500
committer	ameerj	2021-03-13 12:16:03 -0500
commit	2985e5e94c82febcf215feb0023f4184b38bb24a (patch)
tree	7b7cd8be3605560707a74a74c281577920a24248 /src/video_core/textures/astc.h
parent	Merge pull request #6053 from Morph1984/time-CalculateSpanBetween (diff)
download	yuzu-2985e5e94c82febcf215feb0023f4184b38bb24a.tar.gz yuzu-2985e5e94c82febcf215feb0023f4184b38bb24a.tar.xz yuzu-2985e5e94c82febcf215feb0023f4184b38bb24a.zip

diff --git a/src/video_core/textures/astc.h b/src/video_core/textures/astc.h index 9105119bc..bc8bddaec 100644 --- a/src/video_core/textures/astc.h +++ b/src/video_core/textures/astc.h
@@ -8,6 +8,196 @@
8		8
9	namespace Tegra::Texture::ASTC {	9	namespace Tegra::Texture::ASTC {
10		10
		11	/// Count the number of bits set in a number.
		12	constexpr u32 Popcnt(u32 n) {
		13	u32 c = 0;
		14	for (; n; c++) {
		15	n &= n - 1;
		16	}
		17	return c;
		18	}
		19
		20	enum class IntegerEncoding { JustBits, Qus32, Trit };
		21
		22	struct IntegerEncodedValue {
		23	constexpr IntegerEncodedValue() = default;
		24
		25	constexpr IntegerEncodedValue(IntegerEncoding encoding_, u32 num_bits_)
		26	: encoding{encoding_}, num_bits{num_bits_} {}
		27
		28	constexpr bool MatchesEncoding(const IntegerEncodedValue& other) const {
		29	return encoding == other.encoding && num_bits == other.num_bits;
		30	}
		31
		32	// Returns the number of bits required to encode nVals values.
		33	u32 GetBitLength(u32 nVals) const {
		34	u32 totalBits = num_bits * nVals;
		35	if (encoding == IntegerEncoding::Trit) {
		36	totalBits += (nVals * 8 + 4) / 5;
		37	} else if (encoding == IntegerEncoding::Qus32) {
		38	totalBits += (nVals * 7 + 2) / 3;
		39	}
		40	return totalBits;
		41	}
		42
		43	IntegerEncoding encoding{};
		44	u32 num_bits = 0;
		45	u32 bit_value = 0;
		46	union {
		47	u32 qus32_value = 0;
		48	u32 trit_value;
		49	};
		50	};
		51
		52	// Returns a new instance of this struct that corresponds to the largest
		53	// encoding whose values can take no more than maxVal.
		54	static constexpr IntegerEncodedValue CreateEncoding(u32 maxVal) {
		55	while (maxVal > 0) {
		56	u32 check = maxVal + 1;
		57
		58	// Is maxVal of the type 2^n - 1?
		59	if (!(check & (check - 1))) {
		60	return IntegerEncodedValue(IntegerEncoding::JustBits, Popcnt(maxVal));
		61	}
		62
		63	// Is maxVal of the type 3*2^n - 1?
		64	if ((check % 3 == 0) && !((check / 3) & ((check / 3) - 1))) {
		65	return IntegerEncodedValue(IntegerEncoding::Trit, Popcnt(check / 3 - 1));
		66	}
		67
		68	// Is maxVal of the type 5*2^n - 1?
		69	if ((check % 5 == 0) && !((check / 5) & ((check / 5) - 1))) {
		70	return IntegerEncodedValue(IntegerEncoding::Qus32, Popcnt(check / 5 - 1));
		71	}
		72
		73	// Apparently it can't be represented with a bounded integer sequence...
		74	// just iterate.
		75	maxVal--;
		76	}
		77	return IntegerEncodedValue(IntegerEncoding::JustBits, 0);
		78	}
		79
		80	static constexpr std::array<IntegerEncodedValue, 256> MakeEncodedValues() {
		81	std::array<IntegerEncodedValue, 256> encodings{};
		82	for (std::size_t i = 0; i < encodings.size(); ++i) {
		83	encodings[i] = CreateEncoding(static_cast<u32>(i));
		84	}
		85	return encodings;
		86	}
		87
		88	static constexpr std::array<IntegerEncodedValue, 256> EncodingsValues = MakeEncodedValues();
		89
		90	// Replicates the low numBits bits of val such that bits [(toBit - 1):(toBit - numBits)]
		91	// are the same as bits [(numBits - 1):0], repeating the pattern all the way down.
		92	template <typename IntType>
		93	static constexpr IntType Replicate(IntType val, u32 numBits, u32 toBit) {
		94	if (numBits == 0) {
		95	return 0;
		96	}
		97	if (toBit == 0) {
		98	return 0;
		99	}
		100	const IntType v = val & static_cast<IntType>((1 << numBits) - 1);
		101	IntType res = v;
		102	u32 reslen = numBits;
		103	while (reslen < toBit) {
		104	u32 comp = 0;
		105	if (numBits > toBit - reslen) {
		106	u32 newshift = toBit - reslen;
		107	comp = numBits - newshift;
		108	numBits = newshift;
		109	}
		110	res = static_cast<IntType>(res << numBits);
		111	res = static_cast<IntType>(res \| (v >> comp));
		112	reslen += numBits;
		113	}
		114	return res;
		115	}
		116
		117	static constexpr std::size_t NumReplicateEntries(u32 num_bits) {
		118	return std::size_t(1) << num_bits;
		119	}
		120
		121	template <typename IntType, u32 num_bits, u32 to_bit>
		122	static constexpr auto MakeReplicateTable() {
		123	std::array<IntType, NumReplicateEntries(num_bits)> table{};
		124	for (IntType value = 0; value < static_cast<IntType>(std::size(table)); ++value) {
		125	table[value] = Replicate(value, num_bits, to_bit);
		126	}
		127	return table;
		128	}
		129
		130	static constexpr auto REPLICATE_BYTE_TO_16_TABLE = MakeReplicateTable<u32, 8, 16>();
		131	static constexpr u32 ReplicateByteTo16(std::size_t value) {
		132	return REPLICATE_BYTE_TO_16_TABLE[value];
		133	}
		134
		135	static constexpr auto REPLICATE_BIT_TO_7_TABLE = MakeReplicateTable<u32, 1, 7>();
		136	static constexpr u32 ReplicateBitTo7(std::size_t value) {
		137	return REPLICATE_BIT_TO_7_TABLE[value];
		138	}
		139
		140	static constexpr auto REPLICATE_BIT_TO_9_TABLE = MakeReplicateTable<u32, 1, 9>();
		141	static constexpr u32 ReplicateBitTo9(std::size_t value) {
		142	return REPLICATE_BIT_TO_9_TABLE[value];
		143	}
		144
		145	static constexpr auto REPLICATE_1_BIT_TO_8_TABLE = MakeReplicateTable<u32, 1, 8>();
		146	static constexpr auto REPLICATE_2_BIT_TO_8_TABLE = MakeReplicateTable<u32, 2, 8>();
		147	static constexpr auto REPLICATE_3_BIT_TO_8_TABLE = MakeReplicateTable<u32, 3, 8>();
		148	static constexpr auto REPLICATE_4_BIT_TO_8_TABLE = MakeReplicateTable<u32, 4, 8>();
		149	static constexpr auto REPLICATE_5_BIT_TO_8_TABLE = MakeReplicateTable<u32, 5, 8>();
		150	static constexpr auto REPLICATE_6_BIT_TO_8_TABLE = MakeReplicateTable<u32, 6, 8>();
		151	static constexpr auto REPLICATE_7_BIT_TO_8_TABLE = MakeReplicateTable<u32, 7, 8>();
		152	static constexpr auto REPLICATE_8_BIT_TO_8_TABLE = MakeReplicateTable<u32, 8, 8>();
		153	/// Uses a precomputed table for the most common bit counts; if num_bits is outside the expected
		154	/// range, falls back to the runtime implementation.
		155	static constexpr u32 FastReplicateTo8(u32 value, u32 num_bits) {
		156	switch (num_bits) {
		157	case 1:
		158	return REPLICATE_1_BIT_TO_8_TABLE[value];
		159	case 2:
		160	return REPLICATE_2_BIT_TO_8_TABLE[value];
		161	case 3:
		162	return REPLICATE_3_BIT_TO_8_TABLE[value];
		163	case 4:
		164	return REPLICATE_4_BIT_TO_8_TABLE[value];
		165	case 5:
		166	return REPLICATE_5_BIT_TO_8_TABLE[value];
		167	case 6:
		168	return REPLICATE_6_BIT_TO_8_TABLE[value];
		169	case 7:
		170	return REPLICATE_7_BIT_TO_8_TABLE[value];
		171	case 8:
		172	return REPLICATE_8_BIT_TO_8_TABLE[value];
		173	default:
		174	return Replicate(value, num_bits, 8);
		175	}
		176	}
		177
		178	static constexpr auto REPLICATE_1_BIT_TO_6_TABLE = MakeReplicateTable<u32, 1, 6>();
		179	static constexpr auto REPLICATE_2_BIT_TO_6_TABLE = MakeReplicateTable<u32, 2, 6>();
		180	static constexpr auto REPLICATE_3_BIT_TO_6_TABLE = MakeReplicateTable<u32, 3, 6>();
		181	static constexpr auto REPLICATE_4_BIT_TO_6_TABLE = MakeReplicateTable<u32, 4, 6>();
		182	static constexpr auto REPLICATE_5_BIT_TO_6_TABLE = MakeReplicateTable<u32, 5, 6>();
		183
		184	static constexpr u32 FastReplicateTo6(u32 value, u32 num_bits) {
		185	switch (num_bits) {
		186	case 1:
		187	return REPLICATE_1_BIT_TO_6_TABLE[value];
		188	case 2:
		189	return REPLICATE_2_BIT_TO_6_TABLE[value];
		190	case 3:
		191	return REPLICATE_3_BIT_TO_6_TABLE[value];
		192	case 4:
		193	return REPLICATE_4_BIT_TO_6_TABLE[value];
		194	case 5:
		195	return REPLICATE_5_BIT_TO_6_TABLE[value];
		196	default:
		197	return Replicate(value, num_bits, 6);
		198	}
		199	}
		200
11	void Decompress(std::span<const uint8_t> data, uint32_t width, uint32_t height, uint32_t depth,	201	void Decompress(std::span<const uint8_t> data, uint32_t width, uint32_t height, uint32_t depth,
12	uint32_t block_width, uint32_t block_height, std::span<uint8_t> output);	202	uint32_t block_width, uint32_t block_height, std::span<uint8_t> output);
13		203