summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorGravatar bunnei2021-06-16 11:29:10 -0700
committerGravatar GitHub2021-06-16 11:29:10 -0700
commit973bf306edda84e730d56dd73a04c7bbd20d9397 (patch)
tree4d64ac6f6de8e7a2eeeeeba9bd850b7924e46a6b /src
parentMerge pull request #6460 from Morph1984/fs-access-log-fix (diff)
parentastc_decoder: Fix LDR CEM1 endpoint calculation (diff)
downloadyuzu-973bf306edda84e730d56dd73a04c7bbd20d9397.tar.gz
yuzu-973bf306edda84e730d56dd73a04c7bbd20d9397.tar.xz
yuzu-973bf306edda84e730d56dd73a04c7bbd20d9397.zip
Merge pull request #6464 from ameerj/disable-astc
textures: Add a toggle for GPU Accelerated ASTC decoder
Diffstat (limited to '')
-rw-r--r--src/common/settings.cpp2
-rw-r--r--src/common/settings.h1
-rw-r--r--src/core/telemetry_session.cpp1
-rw-r--r--src/video_core/CMakeLists.txt1
-rw-r--r--src/video_core/host_shaders/astc_decoder.comp2
-rw-r--r--src/video_core/renderer_opengl/gl_texture_cache.cpp6
-rw-r--r--src/video_core/renderer_vulkan/vk_texture_cache.cpp7
-rw-r--r--src/video_core/texture_cache/util.cpp13
-rw-r--r--src/video_core/textures/astc.cpp1577
-rw-r--r--src/video_core/textures/astc.h3
-rw-r--r--src/yuzu/configuration/config.cpp2
-rw-r--r--src/yuzu/configuration/configure_graphics.cpp7
-rw-r--r--src/yuzu/configuration/configure_graphics.h1
-rw-r--r--src/yuzu/configuration/configure_graphics.ui7
-rw-r--r--src/yuzu_cmd/config.cpp6
-rw-r--r--src/yuzu_cmd/default_ini.h8
16 files changed, 1637 insertions, 7 deletions
diff --git a/src/common/settings.cpp b/src/common/settings.cpp
index 360e878d6..9ec71eced 100644
--- a/src/common/settings.cpp
+++ b/src/common/settings.cpp
@@ -55,6 +55,7 @@ void LogSettings() {
55 log_setting("Renderer_UseAsynchronousGpuEmulation", 55 log_setting("Renderer_UseAsynchronousGpuEmulation",
56 values.use_asynchronous_gpu_emulation.GetValue()); 56 values.use_asynchronous_gpu_emulation.GetValue());
57 log_setting("Renderer_UseNvdecEmulation", values.use_nvdec_emulation.GetValue()); 57 log_setting("Renderer_UseNvdecEmulation", values.use_nvdec_emulation.GetValue());
58 log_setting("Renderer_AccelerateASTC", values.accelerate_astc.GetValue());
58 log_setting("Renderer_UseVsync", values.use_vsync.GetValue()); 59 log_setting("Renderer_UseVsync", values.use_vsync.GetValue());
59 log_setting("Renderer_UseAssemblyShaders", values.use_assembly_shaders.GetValue()); 60 log_setting("Renderer_UseAssemblyShaders", values.use_assembly_shaders.GetValue());
60 log_setting("Renderer_UseAsynchronousShaders", values.use_asynchronous_shaders.GetValue()); 61 log_setting("Renderer_UseAsynchronousShaders", values.use_asynchronous_shaders.GetValue());
@@ -135,6 +136,7 @@ void RestoreGlobalState(bool is_powered_on) {
135 values.gpu_accuracy.SetGlobal(true); 136 values.gpu_accuracy.SetGlobal(true);
136 values.use_asynchronous_gpu_emulation.SetGlobal(true); 137 values.use_asynchronous_gpu_emulation.SetGlobal(true);
137 values.use_nvdec_emulation.SetGlobal(true); 138 values.use_nvdec_emulation.SetGlobal(true);
139 values.accelerate_astc.SetGlobal(true);
138 values.use_vsync.SetGlobal(true); 140 values.use_vsync.SetGlobal(true);
139 values.use_assembly_shaders.SetGlobal(true); 141 values.use_assembly_shaders.SetGlobal(true);
140 values.use_asynchronous_shaders.SetGlobal(true); 142 values.use_asynchronous_shaders.SetGlobal(true);
diff --git a/src/common/settings.h b/src/common/settings.h
index bf34f2b5b..6198f2d9f 100644
--- a/src/common/settings.h
+++ b/src/common/settings.h
@@ -147,6 +147,7 @@ struct Values {
147 Setting<GPUAccuracy> gpu_accuracy; 147 Setting<GPUAccuracy> gpu_accuracy;
148 Setting<bool> use_asynchronous_gpu_emulation; 148 Setting<bool> use_asynchronous_gpu_emulation;
149 Setting<bool> use_nvdec_emulation; 149 Setting<bool> use_nvdec_emulation;
150 Setting<bool> accelerate_astc;
150 Setting<bool> use_vsync; 151 Setting<bool> use_vsync;
151 Setting<bool> use_assembly_shaders; 152 Setting<bool> use_assembly_shaders;
152 Setting<bool> use_asynchronous_shaders; 153 Setting<bool> use_asynchronous_shaders;
diff --git a/src/core/telemetry_session.cpp b/src/core/telemetry_session.cpp
index ad1a9ffb4..d4c23ced2 100644
--- a/src/core/telemetry_session.cpp
+++ b/src/core/telemetry_session.cpp
@@ -230,6 +230,7 @@ void TelemetrySession::AddInitialInfo(Loader::AppLoader& app_loader,
230 Settings::values.use_asynchronous_gpu_emulation.GetValue()); 230 Settings::values.use_asynchronous_gpu_emulation.GetValue());
231 AddField(field_type, "Renderer_UseNvdecEmulation", 231 AddField(field_type, "Renderer_UseNvdecEmulation",
232 Settings::values.use_nvdec_emulation.GetValue()); 232 Settings::values.use_nvdec_emulation.GetValue());
233 AddField(field_type, "Renderer_AccelerateASTC", Settings::values.accelerate_astc.GetValue());
233 AddField(field_type, "Renderer_UseVsync", Settings::values.use_vsync.GetValue()); 234 AddField(field_type, "Renderer_UseVsync", Settings::values.use_vsync.GetValue());
234 AddField(field_type, "Renderer_UseAssemblyShaders", 235 AddField(field_type, "Renderer_UseAssemblyShaders",
235 Settings::values.use_assembly_shaders.GetValue()); 236 Settings::values.use_assembly_shaders.GetValue());
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 47190c464..f9454bbaa 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -237,6 +237,7 @@ add_library(video_core STATIC
237 texture_cache/util.cpp 237 texture_cache/util.cpp
238 texture_cache/util.h 238 texture_cache/util.h
239 textures/astc.h 239 textures/astc.h
240 textures/astc.cpp
240 textures/decoders.cpp 241 textures/decoders.cpp
241 textures/decoders.h 242 textures/decoders.h
242 textures/texture.cpp 243 textures/texture.cpp
diff --git a/src/video_core/host_shaders/astc_decoder.comp b/src/video_core/host_shaders/astc_decoder.comp
index 703e34587..eaba1b103 100644
--- a/src/video_core/host_shaders/astc_decoder.comp
+++ b/src/video_core/host_shaders/astc_decoder.comp
@@ -763,7 +763,7 @@ void ComputeEndpoints(out uvec4 ep1, out uvec4 ep2, uint color_endpoint_mode) {
763 case 1: { 763 case 1: {
764 READ_UINT_VALUES(2) 764 READ_UINT_VALUES(2)
765 uint L0 = (v[0] >> 2) | (v[1] & 0xC0); 765 uint L0 = (v[0] >> 2) | (v[1] & 0xC0);
766 uint L1 = max(L0 + (v[1] & 0x3F), 0xFFU); 766 uint L1 = min(L0 + (v[1] & 0x3F), 0xFFU);
767 ep1 = uvec4(0xFF, L0, L0, L0); 767 ep1 = uvec4(0xFF, L0, L0, L0);
768 ep2 = uvec4(0xFF, L1, L1, L1); 768 ep2 = uvec4(0xFF, L1, L1, L1);
769 break; 769 break;
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp
index ffe9edc1b..9b4038615 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp
@@ -9,6 +9,8 @@
9 9
10#include <glad/glad.h> 10#include <glad/glad.h>
11 11
12#include "common/settings.h"
13
12#include "video_core/renderer_opengl/gl_device.h" 14#include "video_core/renderer_opengl/gl_device.h"
13#include "video_core/renderer_opengl/gl_shader_manager.h" 15#include "video_core/renderer_opengl/gl_shader_manager.h"
14#include "video_core/renderer_opengl/gl_state_tracker.h" 16#include "video_core/renderer_opengl/gl_state_tracker.h"
@@ -307,7 +309,9 @@ void ApplySwizzle(GLuint handle, PixelFormat format, std::array<SwizzleSource, 4
307 309
308[[nodiscard]] bool CanBeAccelerated(const TextureCacheRuntime& runtime, 310[[nodiscard]] bool CanBeAccelerated(const TextureCacheRuntime& runtime,
309 const VideoCommon::ImageInfo& info) { 311 const VideoCommon::ImageInfo& info) {
310 return !runtime.HasNativeASTC() && IsPixelFormatASTC(info.format); 312 if (IsPixelFormatASTC(info.format)) {
313 return !runtime.HasNativeASTC() && Settings::values.accelerate_astc.GetValue();
314 }
311 // Disable other accelerated uploads for now as they don't implement swizzled uploads 315 // Disable other accelerated uploads for now as they don't implement swizzled uploads
312 return false; 316 return false;
313 switch (info.type) { 317 switch (info.type) {
diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
index bdd0ce8bc..52860b4cf 100644
--- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
@@ -8,6 +8,7 @@
8#include <vector> 8#include <vector>
9 9
10#include "common/bit_cast.h" 10#include "common/bit_cast.h"
11#include "common/settings.h"
11 12
12#include "video_core/engines/fermi_2d.h" 13#include "video_core/engines/fermi_2d.h"
13#include "video_core/renderer_vulkan/blit_image.h" 14#include "video_core/renderer_vulkan/blit_image.h"
@@ -828,7 +829,11 @@ Image::Image(TextureCacheRuntime& runtime, const ImageInfo& info_, GPUVAddr gpu_
828 commit = runtime.memory_allocator.Commit(buffer, MemoryUsage::DeviceLocal); 829 commit = runtime.memory_allocator.Commit(buffer, MemoryUsage::DeviceLocal);
829 } 830 }
830 if (IsPixelFormatASTC(info.format) && !runtime.device.IsOptimalAstcSupported()) { 831 if (IsPixelFormatASTC(info.format) && !runtime.device.IsOptimalAstcSupported()) {
831 flags |= VideoCommon::ImageFlagBits::AcceleratedUpload; 832 if (Settings::values.accelerate_astc.GetValue()) {
833 flags |= VideoCommon::ImageFlagBits::AcceleratedUpload;
834 } else {
835 flags |= VideoCommon::ImageFlagBits::Converted;
836 }
832 } 837 }
833 if (runtime.device.HasDebuggingToolAttached()) { 838 if (runtime.device.HasDebuggingToolAttached()) {
834 if (image) { 839 if (image) {
diff --git a/src/video_core/texture_cache/util.cpp b/src/video_core/texture_cache/util.cpp
index 906604a39..0d3e0804f 100644
--- a/src/video_core/texture_cache/util.cpp
+++ b/src/video_core/texture_cache/util.cpp
@@ -47,6 +47,7 @@
47#include "video_core/texture_cache/formatter.h" 47#include "video_core/texture_cache/formatter.h"
48#include "video_core/texture_cache/samples_helper.h" 48#include "video_core/texture_cache/samples_helper.h"
49#include "video_core/texture_cache/util.h" 49#include "video_core/texture_cache/util.h"
50#include "video_core/textures/astc.h"
50#include "video_core/textures/decoders.h" 51#include "video_core/textures/decoders.h"
51 52
52namespace VideoCommon { 53namespace VideoCommon {
@@ -884,8 +885,16 @@ void ConvertImage(std::span<const u8> input, const ImageInfo& info, std::span<u8
884 ASSERT(copy.image_extent == mip_size); 885 ASSERT(copy.image_extent == mip_size);
885 ASSERT(copy.buffer_row_length == Common::AlignUp(mip_size.width, tile_size.width)); 886 ASSERT(copy.buffer_row_length == Common::AlignUp(mip_size.width, tile_size.width));
886 ASSERT(copy.buffer_image_height == Common::AlignUp(mip_size.height, tile_size.height)); 887 ASSERT(copy.buffer_image_height == Common::AlignUp(mip_size.height, tile_size.height));
887 DecompressBC4(input.subspan(copy.buffer_offset), copy.image_extent, 888 if (IsPixelFormatASTC(info.format)) {
888 output.subspan(output_offset)); 889 ASSERT(copy.image_extent.depth == 1);
890 Tegra::Texture::ASTC::Decompress(input.subspan(copy.buffer_offset),
891 copy.image_extent.width, copy.image_extent.height,
892 copy.image_subresource.num_layers, tile_size.width,
893 tile_size.height, output.subspan(output_offset));
894 } else {
895 DecompressBC4(input.subspan(copy.buffer_offset), copy.image_extent,
896 output.subspan(output_offset));
897 }
889 copy.buffer_offset = output_offset; 898 copy.buffer_offset = output_offset;
890 copy.buffer_row_length = mip_size.width; 899 copy.buffer_row_length = mip_size.width;
891 copy.buffer_image_height = mip_size.height; 900 copy.buffer_image_height = mip_size.height;
diff --git a/src/video_core/textures/astc.cpp b/src/video_core/textures/astc.cpp
new file mode 100644
index 000000000..9b2177ebd
--- /dev/null
+++ b/src/video_core/textures/astc.cpp
@@ -0,0 +1,1577 @@
1// Copyright 2016 The University of North Carolina at Chapel Hill
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14//
15// Please send all BUG REPORTS to <pavel@cs.unc.edu>.
16// <http://gamma.cs.unc.edu/FasTC/>
17
18#include <algorithm>
19#include <cassert>
20#include <cstring>
21#include <span>
22#include <vector>
23
24#include <boost/container/static_vector.hpp>
25
26#include "common/common_types.h"
27#include "video_core/textures/astc.h"
28
29class InputBitStream {
30public:
31 constexpr explicit InputBitStream(std::span<const u8> data, size_t start_offset = 0)
32 : cur_byte{data.data()}, total_bits{data.size()}, next_bit{start_offset % 8} {}
33
34 constexpr size_t GetBitsRead() const {
35 return bits_read;
36 }
37
38 constexpr bool ReadBit() {
39 if (bits_read >= total_bits * 8) {
40 return 0;
41 }
42 const bool bit = ((*cur_byte >> next_bit) & 1) != 0;
43 ++next_bit;
44 while (next_bit >= 8) {
45 next_bit -= 8;
46 ++cur_byte;
47 }
48 ++bits_read;
49 return bit;
50 }
51
52 constexpr u32 ReadBits(std::size_t nBits) {
53 u32 ret = 0;
54 for (std::size_t i = 0; i < nBits; ++i) {
55 ret |= (ReadBit() & 1) << i;
56 }
57 return ret;
58 }
59
60 template <std::size_t nBits>
61 constexpr u32 ReadBits() {
62 u32 ret = 0;
63 for (std::size_t i = 0; i < nBits; ++i) {
64 ret |= (ReadBit() & 1) << i;
65 }
66 return ret;
67 }
68
69private:
70 const u8* cur_byte;
71 size_t total_bits = 0;
72 size_t next_bit = 0;
73 size_t bits_read = 0;
74};
75
76class OutputBitStream {
77public:
78 constexpr explicit OutputBitStream(u8* ptr, std::size_t bits = 0, std::size_t start_offset = 0)
79 : cur_byte{ptr}, num_bits{bits}, next_bit{start_offset % 8} {}
80
81 constexpr std::size_t GetBitsWritten() const {
82 return bits_written;
83 }
84
85 constexpr void WriteBitsR(u32 val, u32 nBits) {
86 for (u32 i = 0; i < nBits; i++) {
87 WriteBit((val >> (nBits - i - 1)) & 1);
88 }
89 }
90
91 constexpr void WriteBits(u32 val, u32 nBits) {
92 for (u32 i = 0; i < nBits; i++) {
93 WriteBit((val >> i) & 1);
94 }
95 }
96
97private:
98 constexpr void WriteBit(bool b) {
99 if (bits_written >= num_bits) {
100 return;
101 }
102
103 const u32 mask = 1 << next_bit++;
104
105 // clear the bit
106 *cur_byte &= static_cast<u8>(~mask);
107
108 // Write the bit, if necessary
109 if (b)
110 *cur_byte |= static_cast<u8>(mask);
111
112 // Next byte?
113 if (next_bit >= 8) {
114 cur_byte += 1;
115 next_bit = 0;
116 }
117 }
118
119 u8* cur_byte;
120 std::size_t num_bits;
121 std::size_t bits_written = 0;
122 std::size_t next_bit = 0;
123};
124
125template <typename IntType>
126class Bits {
127public:
128 explicit Bits(const IntType& v) : m_Bits(v) {}
129
130 Bits(const Bits&) = delete;
131 Bits& operator=(const Bits&) = delete;
132
133 u8 operator[](u32 bitPos) const {
134 return static_cast<u8>((m_Bits >> bitPos) & 1);
135 }
136
137 IntType operator()(u32 start, u32 end) const {
138 if (start == end) {
139 return (*this)[start];
140 } else if (start > end) {
141 u32 t = start;
142 start = end;
143 end = t;
144 }
145
146 u64 mask = (1 << (end - start + 1)) - 1;
147 return (m_Bits >> start) & static_cast<IntType>(mask);
148 }
149
150private:
151 const IntType& m_Bits;
152};
153
154namespace Tegra::Texture::ASTC {
155using IntegerEncodedVector = boost::container::static_vector<
156 IntegerEncodedValue, 256,
157 boost::container::static_vector_options<
158 boost::container::inplace_alignment<alignof(IntegerEncodedValue)>,
159 boost::container::throw_on_overflow<false>>::type>;
160
161static void DecodeTritBlock(InputBitStream& bits, IntegerEncodedVector& result, u32 nBitsPerValue) {
162 // Implement the algorithm in section C.2.12
163 std::array<u32, 5> m;
164 std::array<u32, 5> t;
165 u32 T;
166
167 // Read the trit encoded block according to
168 // table C.2.14
169 m[0] = bits.ReadBits(nBitsPerValue);
170 T = bits.ReadBits<2>();
171 m[1] = bits.ReadBits(nBitsPerValue);
172 T |= bits.ReadBits<2>() << 2;
173 m[2] = bits.ReadBits(nBitsPerValue);
174 T |= bits.ReadBit() << 4;
175 m[3] = bits.ReadBits(nBitsPerValue);
176 T |= bits.ReadBits<2>() << 5;
177 m[4] = bits.ReadBits(nBitsPerValue);
178 T |= bits.ReadBit() << 7;
179
180 u32 C = 0;
181
182 Bits<u32> Tb(T);
183 if (Tb(2, 4) == 7) {
184 C = (Tb(5, 7) << 2) | Tb(0, 1);
185 t[4] = t[3] = 2;
186 } else {
187 C = Tb(0, 4);
188 if (Tb(5, 6) == 3) {
189 t[4] = 2;
190 t[3] = Tb[7];
191 } else {
192 t[4] = Tb[7];
193 t[3] = Tb(5, 6);
194 }
195 }
196
197 Bits<u32> Cb(C);
198 if (Cb(0, 1) == 3) {
199 t[2] = 2;
200 t[1] = Cb[4];
201 t[0] = (Cb[3] << 1) | (Cb[2] & ~Cb[3]);
202 } else if (Cb(2, 3) == 3) {
203 t[2] = 2;
204 t[1] = 2;
205 t[0] = Cb(0, 1);
206 } else {
207 t[2] = Cb[4];
208 t[1] = Cb(2, 3);
209 t[0] = (Cb[1] << 1) | (Cb[0] & ~Cb[1]);
210 }
211
212 for (std::size_t i = 0; i < 5; ++i) {
213 IntegerEncodedValue& val = result.emplace_back(IntegerEncoding::Trit, nBitsPerValue);
214 val.bit_value = m[i];
215 val.trit_value = t[i];
216 }
217}
218
219static void DecodeQuintBlock(InputBitStream& bits, IntegerEncodedVector& result,
220 u32 nBitsPerValue) {
221 // Implement the algorithm in section C.2.12
222 u32 m[3];
223 u32 q[3];
224 u32 Q;
225
226 // Read the trit encoded block according to
227 // table C.2.15
228 m[0] = bits.ReadBits(nBitsPerValue);
229 Q = bits.ReadBits<3>();
230 m[1] = bits.ReadBits(nBitsPerValue);
231 Q |= bits.ReadBits<2>() << 3;
232 m[2] = bits.ReadBits(nBitsPerValue);
233 Q |= bits.ReadBits<2>() << 5;
234
235 Bits<u32> Qb(Q);
236 if (Qb(1, 2) == 3 && Qb(5, 6) == 0) {
237 q[0] = q[1] = 4;
238 q[2] = (Qb[0] << 2) | ((Qb[4] & ~Qb[0]) << 1) | (Qb[3] & ~Qb[0]);
239 } else {
240 u32 C = 0;
241 if (Qb(1, 2) == 3) {
242 q[2] = 4;
243 C = (Qb(3, 4) << 3) | ((~Qb(5, 6) & 3) << 1) | Qb[0];
244 } else {
245 q[2] = Qb(5, 6);
246 C = Qb(0, 4);
247 }
248
249 Bits<u32> Cb(C);
250 if (Cb(0, 2) == 5) {
251 q[1] = 4;
252 q[0] = Cb(3, 4);
253 } else {
254 q[1] = Cb(3, 4);
255 q[0] = Cb(0, 2);
256 }
257 }
258
259 for (std::size_t i = 0; i < 3; ++i) {
260 IntegerEncodedValue& val = result.emplace_back(IntegerEncoding::Quint, nBitsPerValue);
261 val.bit_value = m[i];
262 val.quint_value = q[i];
263 }
264}
265
266// Fills result with the values that are encoded in the given
267// bitstream. We must know beforehand what the maximum possible
268// value is, and how many values we're decoding.
269static void DecodeIntegerSequence(IntegerEncodedVector& result, InputBitStream& bits, u32 maxRange,
270 u32 nValues) {
271 // Determine encoding parameters
272 IntegerEncodedValue val = EncodingsValues[maxRange];
273
274 // Start decoding
275 u32 nValsDecoded = 0;
276 while (nValsDecoded < nValues) {
277 switch (val.encoding) {
278 case IntegerEncoding::Quint:
279 DecodeQuintBlock(bits, result, val.num_bits);
280 nValsDecoded += 3;
281 break;
282
283 case IntegerEncoding::Trit:
284 DecodeTritBlock(bits, result, val.num_bits);
285 nValsDecoded += 5;
286 break;
287
288 case IntegerEncoding::JustBits:
289 val.bit_value = bits.ReadBits(val.num_bits);
290 result.push_back(val);
291 nValsDecoded++;
292 break;
293 }
294 }
295}
296
297struct TexelWeightParams {
298 u32 m_Width = 0;
299 u32 m_Height = 0;
300 bool m_bDualPlane = false;
301 u32 m_MaxWeight = 0;
302 bool m_bError = false;
303 bool m_bVoidExtentLDR = false;
304 bool m_bVoidExtentHDR = false;
305
306 u32 GetPackedBitSize() const {
307 // How many indices do we have?
308 u32 nIdxs = m_Height * m_Width;
309 if (m_bDualPlane) {
310 nIdxs *= 2;
311 }
312
313 return EncodingsValues[m_MaxWeight].GetBitLength(nIdxs);
314 }
315
316 u32 GetNumWeightValues() const {
317 u32 ret = m_Width * m_Height;
318 if (m_bDualPlane) {
319 ret *= 2;
320 }
321 return ret;
322 }
323};
324
325static TexelWeightParams DecodeBlockInfo(InputBitStream& strm) {
326 TexelWeightParams params;
327
328 // Read the entire block mode all at once
329 u16 modeBits = static_cast<u16>(strm.ReadBits<11>());
330
331 // Does this match the void extent block mode?
332 if ((modeBits & 0x01FF) == 0x1FC) {
333 if (modeBits & 0x200) {
334 params.m_bVoidExtentHDR = true;
335 } else {
336 params.m_bVoidExtentLDR = true;
337 }
338
339 // Next two bits must be one.
340 if (!(modeBits & 0x400) || !strm.ReadBit()) {
341 params.m_bError = true;
342 }
343
344 return params;
345 }
346
347 // First check if the last four bits are zero
348 if ((modeBits & 0xF) == 0) {
349 params.m_bError = true;
350 return params;
351 }
352
353 // If the last two bits are zero, then if bits
354 // [6-8] are all ones, this is also reserved.
355 if ((modeBits & 0x3) == 0 && (modeBits & 0x1C0) == 0x1C0) {
356 params.m_bError = true;
357 return params;
358 }
359
360 // Otherwise, there is no error... Figure out the layout
361 // of the block mode. Layout is determined by a number
362 // between 0 and 9 corresponding to table C.2.8 of the
363 // ASTC spec.
364 u32 layout = 0;
365
366 if ((modeBits & 0x1) || (modeBits & 0x2)) {
367 // layout is in [0-4]
368 if (modeBits & 0x8) {
369 // layout is in [2-4]
370 if (modeBits & 0x4) {
371 // layout is in [3-4]
372 if (modeBits & 0x100) {
373 layout = 4;
374 } else {
375 layout = 3;
376 }
377 } else {
378 layout = 2;
379 }
380 } else {
381 // layout is in [0-1]
382 if (modeBits & 0x4) {
383 layout = 1;
384 } else {
385 layout = 0;
386 }
387 }
388 } else {
389 // layout is in [5-9]
390 if (modeBits & 0x100) {
391 // layout is in [7-9]
392 if (modeBits & 0x80) {
393 // layout is in [7-8]
394 assert((modeBits & 0x40) == 0U);
395 if (modeBits & 0x20) {
396 layout = 8;
397 } else {
398 layout = 7;
399 }
400 } else {
401 layout = 9;
402 }
403 } else {
404 // layout is in [5-6]
405 if (modeBits & 0x80) {
406 layout = 6;
407 } else {
408 layout = 5;
409 }
410 }
411 }
412
413 assert(layout < 10);
414
415 // Determine R
416 u32 R = !!(modeBits & 0x10);
417 if (layout < 5) {
418 R |= (modeBits & 0x3) << 1;
419 } else {
420 R |= (modeBits & 0xC) >> 1;
421 }
422 assert(2 <= R && R <= 7);
423
424 // Determine width & height
425 switch (layout) {
426 case 0: {
427 u32 A = (modeBits >> 5) & 0x3;
428 u32 B = (modeBits >> 7) & 0x3;
429 params.m_Width = B + 4;
430 params.m_Height = A + 2;
431 break;
432 }
433
434 case 1: {
435 u32 A = (modeBits >> 5) & 0x3;
436 u32 B = (modeBits >> 7) & 0x3;
437 params.m_Width = B + 8;
438 params.m_Height = A + 2;
439 break;
440 }
441
442 case 2: {
443 u32 A = (modeBits >> 5) & 0x3;
444 u32 B = (modeBits >> 7) & 0x3;
445 params.m_Width = A + 2;
446 params.m_Height = B + 8;
447 break;
448 }
449
450 case 3: {
451 u32 A = (modeBits >> 5) & 0x3;
452 u32 B = (modeBits >> 7) & 0x1;
453 params.m_Width = A + 2;
454 params.m_Height = B + 6;
455 break;
456 }
457
458 case 4: {
459 u32 A = (modeBits >> 5) & 0x3;
460 u32 B = (modeBits >> 7) & 0x1;
461 params.m_Width = B + 2;
462 params.m_Height = A + 2;
463 break;
464 }
465
466 case 5: {
467 u32 A = (modeBits >> 5) & 0x3;
468 params.m_Width = 12;
469 params.m_Height = A + 2;
470 break;
471 }
472
473 case 6: {
474 u32 A = (modeBits >> 5) & 0x3;
475 params.m_Width = A + 2;
476 params.m_Height = 12;
477 break;
478 }
479
480 case 7: {
481 params.m_Width = 6;
482 params.m_Height = 10;
483 break;
484 }
485
486 case 8: {
487 params.m_Width = 10;
488 params.m_Height = 6;
489 break;
490 }
491
492 case 9: {
493 u32 A = (modeBits >> 5) & 0x3;
494 u32 B = (modeBits >> 9) & 0x3;
495 params.m_Width = A + 6;
496 params.m_Height = B + 6;
497 break;
498 }
499
500 default:
501 assert(false && "Don't know this layout...");
502 params.m_bError = true;
503 break;
504 }
505
506 // Determine whether or not we're using dual planes
507 // and/or high precision layouts.
508 bool D = (layout != 9) && (modeBits & 0x400);
509 bool H = (layout != 9) && (modeBits & 0x200);
510
511 if (H) {
512 const u32 maxWeights[6] = {9, 11, 15, 19, 23, 31};
513 params.m_MaxWeight = maxWeights[R - 2];
514 } else {
515 const u32 maxWeights[6] = {1, 2, 3, 4, 5, 7};
516 params.m_MaxWeight = maxWeights[R - 2];
517 }
518
519 params.m_bDualPlane = D;
520
521 return params;
522}
523
524static void FillVoidExtentLDR(InputBitStream& strm, std::span<u32> outBuf, u32 blockWidth,
525 u32 blockHeight) {
526 // Don't actually care about the void extent, just read the bits...
527 for (s32 i = 0; i < 4; ++i) {
528 strm.ReadBits<13>();
529 }
530
531 // Decode the RGBA components and renormalize them to the range [0, 255]
532 u16 r = static_cast<u16>(strm.ReadBits<16>());
533 u16 g = static_cast<u16>(strm.ReadBits<16>());
534 u16 b = static_cast<u16>(strm.ReadBits<16>());
535 u16 a = static_cast<u16>(strm.ReadBits<16>());
536
537 u32 rgba = (r >> 8) | (g & 0xFF00) | (static_cast<u32>(b) & 0xFF00) << 8 |
538 (static_cast<u32>(a) & 0xFF00) << 16;
539
540 for (u32 j = 0; j < blockHeight; j++) {
541 for (u32 i = 0; i < blockWidth; i++) {
542 outBuf[j * blockWidth + i] = rgba;
543 }
544 }
545}
546
547static void FillError(std::span<u32> outBuf, u32 blockWidth, u32 blockHeight) {
548 for (u32 j = 0; j < blockHeight; j++) {
549 for (u32 i = 0; i < blockWidth; i++) {
550 outBuf[j * blockWidth + i] = 0xFFFF00FF;
551 }
552 }
553}
554static constexpr u32 ReplicateByteTo16(std::size_t value) {
555 return REPLICATE_BYTE_TO_16_TABLE[value];
556}
557
558static constexpr auto REPLICATE_BIT_TO_7_TABLE = MakeReplicateTable<u32, 1, 7>();
559static constexpr u32 ReplicateBitTo7(std::size_t value) {
560 return REPLICATE_BIT_TO_7_TABLE[value];
561}
562
563static constexpr auto REPLICATE_BIT_TO_9_TABLE = MakeReplicateTable<u32, 1, 9>();
564static constexpr u32 ReplicateBitTo9(std::size_t value) {
565 return REPLICATE_BIT_TO_9_TABLE[value];
566}
567
568static constexpr auto REPLICATE_1_BIT_TO_8_TABLE = MakeReplicateTable<u32, 1, 8>();
569static constexpr auto REPLICATE_2_BIT_TO_8_TABLE = MakeReplicateTable<u32, 2, 8>();
570static constexpr auto REPLICATE_3_BIT_TO_8_TABLE = MakeReplicateTable<u32, 3, 8>();
571static constexpr auto REPLICATE_4_BIT_TO_8_TABLE = MakeReplicateTable<u32, 4, 8>();
572static constexpr auto REPLICATE_5_BIT_TO_8_TABLE = MakeReplicateTable<u32, 5, 8>();
573/// Use a precompiled table with the most common usages, if it's not in the expected range, fallback
574/// to the runtime implementation
575static constexpr u32 FastReplicateTo8(u32 value, u32 num_bits) {
576 switch (num_bits) {
577 case 1:
578 return REPLICATE_1_BIT_TO_8_TABLE[value];
579 case 2:
580 return REPLICATE_2_BIT_TO_8_TABLE[value];
581 case 3:
582 return REPLICATE_3_BIT_TO_8_TABLE[value];
583 case 4:
584 return REPLICATE_4_BIT_TO_8_TABLE[value];
585 case 5:
586 return REPLICATE_5_BIT_TO_8_TABLE[value];
587 case 6:
588 return REPLICATE_6_BIT_TO_8_TABLE[value];
589 case 7:
590 return REPLICATE_7_BIT_TO_8_TABLE[value];
591 case 8:
592 return REPLICATE_8_BIT_TO_8_TABLE[value];
593 default:
594 return Replicate(value, num_bits, 8);
595 }
596}
597
598static constexpr auto REPLICATE_1_BIT_TO_6_TABLE = MakeReplicateTable<u32, 1, 6>();
599static constexpr auto REPLICATE_2_BIT_TO_6_TABLE = MakeReplicateTable<u32, 2, 6>();
600static constexpr auto REPLICATE_3_BIT_TO_6_TABLE = MakeReplicateTable<u32, 3, 6>();
601static constexpr auto REPLICATE_4_BIT_TO_6_TABLE = MakeReplicateTable<u32, 4, 6>();
602static constexpr auto REPLICATE_5_BIT_TO_6_TABLE = MakeReplicateTable<u32, 5, 6>();
603static constexpr u32 FastReplicateTo6(u32 value, u32 num_bits) {
604 switch (num_bits) {
605 case 1:
606 return REPLICATE_1_BIT_TO_6_TABLE[value];
607 case 2:
608 return REPLICATE_2_BIT_TO_6_TABLE[value];
609 case 3:
610 return REPLICATE_3_BIT_TO_6_TABLE[value];
611 case 4:
612 return REPLICATE_4_BIT_TO_6_TABLE[value];
613 case 5:
614 return REPLICATE_5_BIT_TO_6_TABLE[value];
615 default:
616 return Replicate(value, num_bits, 6);
617 }
618}
619
620class Pixel {
621protected:
622 using ChannelType = s16;
623 u8 m_BitDepth[4] = {8, 8, 8, 8};
624 s16 color[4] = {};
625
626public:
627 Pixel() = default;
628 Pixel(u32 a, u32 r, u32 g, u32 b, u32 bitDepth = 8)
629 : m_BitDepth{u8(bitDepth), u8(bitDepth), u8(bitDepth), u8(bitDepth)},
630 color{static_cast<ChannelType>(a), static_cast<ChannelType>(r),
631 static_cast<ChannelType>(g), static_cast<ChannelType>(b)} {}
632
633 // Changes the depth of each pixel. This scales the values to
634 // the appropriate bit depth by either truncating the least
635 // significant bits when going from larger to smaller bit depth
636 // or by repeating the most significant bits when going from
637 // smaller to larger bit depths.
638 void ChangeBitDepth() {
639 for (u32 i = 0; i < 4; i++) {
640 Component(i) = ChangeBitDepth(Component(i), m_BitDepth[i]);
641 m_BitDepth[i] = 8;
642 }
643 }
644
645 template <typename IntType>
646 static float ConvertChannelToFloat(IntType channel, u8 bitDepth) {
647 float denominator = static_cast<float>((1 << bitDepth) - 1);
648 return static_cast<float>(channel) / denominator;
649 }
650
651 // Changes the bit depth of a single component. See the comment
652 // above for how we do this.
653 static ChannelType ChangeBitDepth(Pixel::ChannelType val, u8 oldDepth) {
654 assert(oldDepth <= 8);
655
656 if (oldDepth == 8) {
657 // Do nothing
658 return val;
659 } else if (oldDepth == 0) {
660 return static_cast<ChannelType>((1 << 8) - 1);
661 } else if (8 > oldDepth) {
662 return static_cast<ChannelType>(FastReplicateTo8(static_cast<u32>(val), oldDepth));
663 } else {
664 // oldDepth > newDepth
665 const u8 bitsWasted = static_cast<u8>(oldDepth - 8);
666 u16 v = static_cast<u16>(val);
667 v = static_cast<u16>((v + (1 << (bitsWasted - 1))) >> bitsWasted);
668 v = ::std::min<u16>(::std::max<u16>(0, v), static_cast<u16>((1 << 8) - 1));
669 return static_cast<u8>(v);
670 }
671
672 assert(false && "We shouldn't get here.");
673 return 0;
674 }
675
676 const ChannelType& A() const {
677 return color[0];
678 }
679 ChannelType& A() {
680 return color[0];
681 }
682 const ChannelType& R() const {
683 return color[1];
684 }
685 ChannelType& R() {
686 return color[1];
687 }
688 const ChannelType& G() const {
689 return color[2];
690 }
691 ChannelType& G() {
692 return color[2];
693 }
694 const ChannelType& B() const {
695 return color[3];
696 }
697 ChannelType& B() {
698 return color[3];
699 }
700 const ChannelType& Component(u32 idx) const {
701 return color[idx];
702 }
703 ChannelType& Component(u32 idx) {
704 return color[idx];
705 }
706
707 void GetBitDepth(u8 (&outDepth)[4]) const {
708 for (s32 i = 0; i < 4; i++) {
709 outDepth[i] = m_BitDepth[i];
710 }
711 }
712
713 // Take all of the components, transform them to their 8-bit variants,
714 // and then pack each channel into an R8G8B8A8 32-bit integer. We assume
715 // that the architecture is little-endian, so the alpha channel will end
716 // up in the most-significant byte.
717 u32 Pack() const {
718 Pixel eightBit(*this);
719 eightBit.ChangeBitDepth();
720
721 u32 r = 0;
722 r |= eightBit.A();
723 r <<= 8;
724 r |= eightBit.B();
725 r <<= 8;
726 r |= eightBit.G();
727 r <<= 8;
728 r |= eightBit.R();
729 return r;
730 }
731
732 // Clamps the pixel to the range [0,255]
733 void ClampByte() {
734 for (u32 i = 0; i < 4; i++) {
735 color[i] = (color[i] < 0) ? 0 : ((color[i] > 255) ? 255 : color[i]);
736 }
737 }
738
739 void MakeOpaque() {
740 A() = 255;
741 }
742};
743
744static void DecodeColorValues(u32* out, std::span<u8> data, const u32* modes, const u32 nPartitions,
745 const u32 nBitsForColorData) {
746 // First figure out how many color values we have
747 u32 nValues = 0;
748 for (u32 i = 0; i < nPartitions; i++) {
749 nValues += ((modes[i] >> 2) + 1) << 1;
750 }
751
752 // Then based on the number of values and the remaining number of bits,
753 // figure out the max value for each of them...
754 u32 range = 256;
755 while (--range > 0) {
756 IntegerEncodedValue val = EncodingsValues[range];
757 u32 bitLength = val.GetBitLength(nValues);
758 if (bitLength <= nBitsForColorData) {
759 // Find the smallest possible range that matches the given encoding
760 while (--range > 0) {
761 IntegerEncodedValue newval = EncodingsValues[range];
762 if (!newval.MatchesEncoding(val)) {
763 break;
764 }
765 }
766
767 // Return to last matching range.
768 range++;
769 break;
770 }
771 }
772
773 // We now have enough to decode our integer sequence.
774 IntegerEncodedVector decodedColorValues;
775
776 InputBitStream colorStream(data, 0);
777 DecodeIntegerSequence(decodedColorValues, colorStream, range, nValues);
778
779 // Once we have the decoded values, we need to dequantize them to the 0-255 range
780 // This procedure is outlined in ASTC spec C.2.13
781 u32 outIdx = 0;
782 for (auto itr = decodedColorValues.begin(); itr != decodedColorValues.end(); ++itr) {
783 // Have we already decoded all that we need?
784 if (outIdx >= nValues) {
785 break;
786 }
787
788 const IntegerEncodedValue& val = *itr;
789 u32 bitlen = val.num_bits;
790 u32 bitval = val.bit_value;
791
792 assert(bitlen >= 1);
793
794 u32 A = 0, B = 0, C = 0, D = 0;
795 // A is just the lsb replicated 9 times.
796 A = ReplicateBitTo9(bitval & 1);
797
798 switch (val.encoding) {
799 // Replicate bits
800 case IntegerEncoding::JustBits:
801 out[outIdx++] = FastReplicateTo8(bitval, bitlen);
802 break;
803
804 // Use algorithm in C.2.13
805 case IntegerEncoding::Trit: {
806
807 D = val.trit_value;
808
809 switch (bitlen) {
810 case 1: {
811 C = 204;
812 } break;
813
814 case 2: {
815 C = 93;
816 // B = b000b0bb0
817 u32 b = (bitval >> 1) & 1;
818 B = (b << 8) | (b << 4) | (b << 2) | (b << 1);
819 } break;
820
821 case 3: {
822 C = 44;
823 // B = cb000cbcb
824 u32 cb = (bitval >> 1) & 3;
825 B = (cb << 7) | (cb << 2) | cb;
826 } break;
827
828 case 4: {
829 C = 22;
830 // B = dcb000dcb
831 u32 dcb = (bitval >> 1) & 7;
832 B = (dcb << 6) | dcb;
833 } break;
834
835 case 5: {
836 C = 11;
837 // B = edcb000ed
838 u32 edcb = (bitval >> 1) & 0xF;
839 B = (edcb << 5) | (edcb >> 2);
840 } break;
841
842 case 6: {
843 C = 5;
844 // B = fedcb000f
845 u32 fedcb = (bitval >> 1) & 0x1F;
846 B = (fedcb << 4) | (fedcb >> 4);
847 } break;
848
849 default:
850 assert(false && "Unsupported trit encoding for color values!");
851 break;
852 } // switch(bitlen)
853 } // case IntegerEncoding::Trit
854 break;
855
856 case IntegerEncoding::Quint: {
857
858 D = val.quint_value;
859
860 switch (bitlen) {
861 case 1: {
862 C = 113;
863 } break;
864
865 case 2: {
866 C = 54;
867 // B = b0000bb00
868 u32 b = (bitval >> 1) & 1;
869 B = (b << 8) | (b << 3) | (b << 2);
870 } break;
871
872 case 3: {
873 C = 26;
874 // B = cb0000cbc
875 u32 cb = (bitval >> 1) & 3;
876 B = (cb << 7) | (cb << 1) | (cb >> 1);
877 } break;
878
879 case 4: {
880 C = 13;
881 // B = dcb0000dc
882 u32 dcb = (bitval >> 1) & 7;
883 B = (dcb << 6) | (dcb >> 1);
884 } break;
885
886 case 5: {
887 C = 6;
888 // B = edcb0000e
889 u32 edcb = (bitval >> 1) & 0xF;
890 B = (edcb << 5) | (edcb >> 3);
891 } break;
892
893 default:
894 assert(false && "Unsupported quint encoding for color values!");
895 break;
896 } // switch(bitlen)
897 } // case IntegerEncoding::Quint
898 break;
899 } // switch(val.encoding)
900
901 if (val.encoding != IntegerEncoding::JustBits) {
902 u32 T = D * C + B;
903 T ^= A;
904 T = (A & 0x80) | (T >> 2);
905 out[outIdx++] = T;
906 }
907 }
908
909 // Make sure that each of our values is in the proper range...
910 for (u32 i = 0; i < nValues; i++) {
911 assert(out[i] <= 255);
912 }
913}
914
915static u32 UnquantizeTexelWeight(const IntegerEncodedValue& val) {
916 u32 bitval = val.bit_value;
917 u32 bitlen = val.num_bits;
918
919 u32 A = ReplicateBitTo7(bitval & 1);
920 u32 B = 0, C = 0, D = 0;
921
922 u32 result = 0;
923 switch (val.encoding) {
924 case IntegerEncoding::JustBits:
925 result = FastReplicateTo6(bitval, bitlen);
926 break;
927
928 case IntegerEncoding::Trit: {
929 D = val.trit_value;
930 assert(D < 3);
931
932 switch (bitlen) {
933 case 0: {
934 u32 results[3] = {0, 32, 63};
935 result = results[D];
936 } break;
937
938 case 1: {
939 C = 50;
940 } break;
941
942 case 2: {
943 C = 23;
944 u32 b = (bitval >> 1) & 1;
945 B = (b << 6) | (b << 2) | b;
946 } break;
947
948 case 3: {
949 C = 11;
950 u32 cb = (bitval >> 1) & 3;
951 B = (cb << 5) | cb;
952 } break;
953
954 default:
955 assert(false && "Invalid trit encoding for texel weight");
956 break;
957 }
958 } break;
959
960 case IntegerEncoding::Quint: {
961 D = val.quint_value;
962 assert(D < 5);
963
964 switch (bitlen) {
965 case 0: {
966 u32 results[5] = {0, 16, 32, 47, 63};
967 result = results[D];
968 } break;
969
970 case 1: {
971 C = 28;
972 } break;
973
974 case 2: {
975 C = 13;
976 u32 b = (bitval >> 1) & 1;
977 B = (b << 6) | (b << 1);
978 } break;
979
980 default:
981 assert(false && "Invalid quint encoding for texel weight");
982 break;
983 }
984 } break;
985 }
986
987 if (val.encoding != IntegerEncoding::JustBits && bitlen > 0) {
988 // Decode the value...
989 result = D * C + B;
990 result ^= A;
991 result = (A & 0x20) | (result >> 2);
992 }
993
994 assert(result < 64);
995
996 // Change from [0,63] to [0,64]
997 if (result > 32) {
998 result += 1;
999 }
1000
1001 return result;
1002}
1003
1004static void UnquantizeTexelWeights(u32 out[2][144], const IntegerEncodedVector& weights,
1005 const TexelWeightParams& params, const u32 blockWidth,
1006 const u32 blockHeight) {
1007 u32 weightIdx = 0;
1008 u32 unquantized[2][144];
1009
1010 for (auto itr = weights.begin(); itr != weights.end(); ++itr) {
1011 unquantized[0][weightIdx] = UnquantizeTexelWeight(*itr);
1012
1013 if (params.m_bDualPlane) {
1014 ++itr;
1015 unquantized[1][weightIdx] = UnquantizeTexelWeight(*itr);
1016 if (itr == weights.end()) {
1017 break;
1018 }
1019 }
1020
1021 if (++weightIdx >= (params.m_Width * params.m_Height))
1022 break;
1023 }
1024
1025 // Do infill if necessary (Section C.2.18) ...
1026 u32 Ds = (1024 + (blockWidth / 2)) / (blockWidth - 1);
1027 u32 Dt = (1024 + (blockHeight / 2)) / (blockHeight - 1);
1028
1029 const u32 kPlaneScale = params.m_bDualPlane ? 2U : 1U;
1030 for (u32 plane = 0; plane < kPlaneScale; plane++)
1031 for (u32 t = 0; t < blockHeight; t++)
1032 for (u32 s = 0; s < blockWidth; s++) {
1033 u32 cs = Ds * s;
1034 u32 ct = Dt * t;
1035
1036 u32 gs = (cs * (params.m_Width - 1) + 32) >> 6;
1037 u32 gt = (ct * (params.m_Height - 1) + 32) >> 6;
1038
1039 u32 js = gs >> 4;
1040 u32 fs = gs & 0xF;
1041
1042 u32 jt = gt >> 4;
1043 u32 ft = gt & 0x0F;
1044
1045 u32 w11 = (fs * ft + 8) >> 4;
1046 u32 w10 = ft - w11;
1047 u32 w01 = fs - w11;
1048 u32 w00 = 16 - fs - ft + w11;
1049
1050 u32 v0 = js + jt * params.m_Width;
1051
1052#define FIND_TEXEL(tidx, bidx) \
1053 u32 p##bidx = 0; \
1054 do { \
1055 if ((tidx) < (params.m_Width * params.m_Height)) { \
1056 p##bidx = unquantized[plane][(tidx)]; \
1057 } \
1058 } while (0)
1059
1060 FIND_TEXEL(v0, 00);
1061 FIND_TEXEL(v0 + 1, 01);
1062 FIND_TEXEL(v0 + params.m_Width, 10);
1063 FIND_TEXEL(v0 + params.m_Width + 1, 11);
1064
1065#undef FIND_TEXEL
1066
1067 out[plane][t * blockWidth + s] =
1068 (p00 * w00 + p01 * w01 + p10 * w10 + p11 * w11 + 8) >> 4;
1069 }
1070}
1071
// Bit transfer as described in ASTC spec C.2.14: bit 7 of `a` becomes bit 7 of
// `b` (after `b` has been shifted right by one), and `a` is reduced to a
// sign-extended 6-bit value.
static inline void BitTransferSigned(int& a, int& b) {
    b = (b >> 1) | (a & 0x80);
    a = (a >> 1) & 0x3F;
    // Bit 5 is the sign bit of the remaining 6-bit value.
    if ((a & 0x20) != 0) {
        a -= 0x40;
    }
}
1081
1082// Adds more precision to the blue channel as described
1083// in C.2.14
1084static inline Pixel BlueContract(s32 a, s32 r, s32 g, s32 b) {
1085 return Pixel(static_cast<s16>(a), static_cast<s16>((r + b) >> 1),
1086 static_cast<s16>((g + b) >> 1), static_cast<s16>(b));
1087}
1088
// Hash used by the partition selection functions, as specified in ASTC spec
// C.2.21. All arithmetic wraps modulo 2^32.
static inline u32 hash52(u32 p) {
    p = p ^ (p >> 15);
    p = p - (p << 17);
    p = p + (p << 7);
    p = p + (p << 4);
    p = p ^ (p >> 5);
    p = p + (p << 16);
    p = p ^ (p >> 7);
    p = p ^ (p >> 3);
    p = p ^ (p << 6);
    p = p ^ (p >> 17);
    return p;
}
1104
1105static u32 SelectPartition(s32 seed, s32 x, s32 y, s32 z, s32 partitionCount, s32 smallBlock) {
1106 if (1 == partitionCount)
1107 return 0;
1108
1109 if (smallBlock) {
1110 x <<= 1;
1111 y <<= 1;
1112 z <<= 1;
1113 }
1114
1115 seed += (partitionCount - 1) * 1024;
1116
1117 u32 rnum = hash52(static_cast<u32>(seed));
1118 u8 seed1 = static_cast<u8>(rnum & 0xF);
1119 u8 seed2 = static_cast<u8>((rnum >> 4) & 0xF);
1120 u8 seed3 = static_cast<u8>((rnum >> 8) & 0xF);
1121 u8 seed4 = static_cast<u8>((rnum >> 12) & 0xF);
1122 u8 seed5 = static_cast<u8>((rnum >> 16) & 0xF);
1123 u8 seed6 = static_cast<u8>((rnum >> 20) & 0xF);
1124 u8 seed7 = static_cast<u8>((rnum >> 24) & 0xF);
1125 u8 seed8 = static_cast<u8>((rnum >> 28) & 0xF);
1126 u8 seed9 = static_cast<u8>((rnum >> 18) & 0xF);
1127 u8 seed10 = static_cast<u8>((rnum >> 22) & 0xF);
1128 u8 seed11 = static_cast<u8>((rnum >> 26) & 0xF);
1129 u8 seed12 = static_cast<u8>(((rnum >> 30) | (rnum << 2)) & 0xF);
1130
1131 seed1 = static_cast<u8>(seed1 * seed1);
1132 seed2 = static_cast<u8>(seed2 * seed2);
1133 seed3 = static_cast<u8>(seed3 * seed3);
1134 seed4 = static_cast<u8>(seed4 * seed4);
1135 seed5 = static_cast<u8>(seed5 * seed5);
1136 seed6 = static_cast<u8>(seed6 * seed6);
1137 seed7 = static_cast<u8>(seed7 * seed7);
1138 seed8 = static_cast<u8>(seed8 * seed8);
1139 seed9 = static_cast<u8>(seed9 * seed9);
1140 seed10 = static_cast<u8>(seed10 * seed10);
1141 seed11 = static_cast<u8>(seed11 * seed11);
1142 seed12 = static_cast<u8>(seed12 * seed12);
1143
1144 s32 sh1, sh2, sh3;
1145 if (seed & 1) {
1146 sh1 = (seed & 2) ? 4 : 5;
1147 sh2 = (partitionCount == 3) ? 6 : 5;
1148 } else {
1149 sh1 = (partitionCount == 3) ? 6 : 5;
1150 sh2 = (seed & 2) ? 4 : 5;
1151 }
1152 sh3 = (seed & 0x10) ? sh1 : sh2;
1153
1154 seed1 = static_cast<u8>(seed1 >> sh1);
1155 seed2 = static_cast<u8>(seed2 >> sh2);
1156 seed3 = static_cast<u8>(seed3 >> sh1);
1157 seed4 = static_cast<u8>(seed4 >> sh2);
1158 seed5 = static_cast<u8>(seed5 >> sh1);
1159 seed6 = static_cast<u8>(seed6 >> sh2);
1160 seed7 = static_cast<u8>(seed7 >> sh1);
1161 seed8 = static_cast<u8>(seed8 >> sh2);
1162 seed9 = static_cast<u8>(seed9 >> sh3);
1163 seed10 = static_cast<u8>(seed10 >> sh3);
1164 seed11 = static_cast<u8>(seed11 >> sh3);
1165 seed12 = static_cast<u8>(seed12 >> sh3);
1166
1167 s32 a = seed1 * x + seed2 * y + seed11 * z + (rnum >> 14);
1168 s32 b = seed3 * x + seed4 * y + seed12 * z + (rnum >> 10);
1169 s32 c = seed5 * x + seed6 * y + seed9 * z + (rnum >> 6);
1170 s32 d = seed7 * x + seed8 * y + seed10 * z + (rnum >> 2);
1171
1172 a &= 0x3F;
1173 b &= 0x3F;
1174 c &= 0x3F;
1175 d &= 0x3F;
1176
1177 if (partitionCount < 4)
1178 d = 0;
1179 if (partitionCount < 3)
1180 c = 0;
1181
1182 if (a >= b && a >= c && a >= d)
1183 return 0;
1184 else if (b >= c && b >= d)
1185 return 1;
1186 else if (c >= d)
1187 return 2;
1188 return 3;
1189}
1190
// 2D convenience wrapper: forwards to SelectPartition with the z coordinate
// fixed to 0 and returns its result.
1191 static inline u32 Select2DPartition(s32 seed, s32 x, s32 y, s32 partitionCount, s32 smallBlock) {
1192 return SelectPartition(seed, x, y, 0, partitionCount, smallBlock);
1193 }
1194
1195// Section C.2.14
1196static void ComputeEndpoints(Pixel& ep1, Pixel& ep2, const u32*& colorValues,
1197 u32 colorEndpointMode) {
1198#define READ_UINT_VALUES(N) \
1199 u32 v[N]; \
1200 for (u32 i = 0; i < N; i++) { \
1201 v[i] = *(colorValues++); \
1202 }
1203
1204#define READ_INT_VALUES(N) \
1205 s32 v[N]; \
1206 for (u32 i = 0; i < N; i++) { \
1207 v[i] = static_cast<int>(*(colorValues++)); \
1208 }
1209
1210 switch (colorEndpointMode) {
1211 case 0: {
1212 READ_UINT_VALUES(2)
1213 ep1 = Pixel(0xFF, v[0], v[0], v[0]);
1214 ep2 = Pixel(0xFF, v[1], v[1], v[1]);
1215 } break;
1216
1217 case 1: {
1218 READ_UINT_VALUES(2)
1219 u32 L0 = (v[0] >> 2) | (v[1] & 0xC0);
1220 u32 L1 = std::min(L0 + (v[1] & 0x3F), 0xFFU);
1221 ep1 = Pixel(0xFF, L0, L0, L0);
1222 ep2 = Pixel(0xFF, L1, L1, L1);
1223 } break;
1224
1225 case 4: {
1226 READ_UINT_VALUES(4)
1227 ep1 = Pixel(v[2], v[0], v[0], v[0]);
1228 ep2 = Pixel(v[3], v[1], v[1], v[1]);
1229 } break;
1230
1231 case 5: {
1232 READ_INT_VALUES(4)
1233 BitTransferSigned(v[1], v[0]);
1234 BitTransferSigned(v[3], v[2]);
1235 ep1 = Pixel(v[2], v[0], v[0], v[0]);
1236 ep2 = Pixel(v[2] + v[3], v[0] + v[1], v[0] + v[1], v[0] + v[1]);
1237 ep1.ClampByte();
1238 ep2.ClampByte();
1239 } break;
1240
1241 case 6: {
1242 READ_UINT_VALUES(4)
1243 ep1 = Pixel(0xFF, v[0] * v[3] >> 8, v[1] * v[3] >> 8, v[2] * v[3] >> 8);
1244 ep2 = Pixel(0xFF, v[0], v[1], v[2]);
1245 } break;
1246
1247 case 8: {
1248 READ_UINT_VALUES(6)
1249 if (v[1] + v[3] + v[5] >= v[0] + v[2] + v[4]) {
1250 ep1 = Pixel(0xFF, v[0], v[2], v[4]);
1251 ep2 = Pixel(0xFF, v[1], v[3], v[5]);
1252 } else {
1253 ep1 = BlueContract(0xFF, v[1], v[3], v[5]);
1254 ep2 = BlueContract(0xFF, v[0], v[2], v[4]);
1255 }
1256 } break;
1257
1258 case 9: {
1259 READ_INT_VALUES(6)
1260 BitTransferSigned(v[1], v[0]);
1261 BitTransferSigned(v[3], v[2]);
1262 BitTransferSigned(v[5], v[4]);
1263 if (v[1] + v[3] + v[5] >= 0) {
1264 ep1 = Pixel(0xFF, v[0], v[2], v[4]);
1265 ep2 = Pixel(0xFF, v[0] + v[1], v[2] + v[3], v[4] + v[5]);
1266 } else {
1267 ep1 = BlueContract(0xFF, v[0] + v[1], v[2] + v[3], v[4] + v[5]);
1268 ep2 = BlueContract(0xFF, v[0], v[2], v[4]);
1269 }
1270 ep1.ClampByte();
1271 ep2.ClampByte();
1272 } break;
1273
1274 case 10: {
1275 READ_UINT_VALUES(6)
1276 ep1 = Pixel(v[4], v[0] * v[3] >> 8, v[1] * v[3] >> 8, v[2] * v[3] >> 8);
1277 ep2 = Pixel(v[5], v[0], v[1], v[2]);
1278 } break;
1279
1280 case 12: {
1281 READ_UINT_VALUES(8)
1282 if (v[1] + v[3] + v[5] >= v[0] + v[2] + v[4]) {
1283 ep1 = Pixel(v[6], v[0], v[2], v[4]);
1284 ep2 = Pixel(v[7], v[1], v[3], v[5]);
1285 } else {
1286 ep1 = BlueContract(v[7], v[1], v[3], v[5]);
1287 ep2 = BlueContract(v[6], v[0], v[2], v[4]);
1288 }
1289 } break;
1290
1291 case 13: {
1292 READ_INT_VALUES(8)
1293 BitTransferSigned(v[1], v[0]);
1294 BitTransferSigned(v[3], v[2]);
1295 BitTransferSigned(v[5], v[4]);
1296 BitTransferSigned(v[7], v[6]);
1297 if (v[1] + v[3] + v[5] >= 0) {
1298 ep1 = Pixel(v[6], v[0], v[2], v[4]);
1299 ep2 = Pixel(v[7] + v[6], v[0] + v[1], v[2] + v[3], v[4] + v[5]);
1300 } else {
1301 ep1 = BlueContract(v[6] + v[7], v[0] + v[1], v[2] + v[3], v[4] + v[5]);
1302 ep2 = BlueContract(v[6], v[0], v[2], v[4]);
1303 }
1304 ep1.ClampByte();
1305 ep2.ClampByte();
1306 } break;
1307
1308 default:
1309 assert(false && "Unsupported color endpoint mode (is it HDR?)");
1310 break;
1311 }
1312
1313#undef READ_UINT_VALUES
1314#undef READ_INT_VALUES
1315}
1316
// Decodes a single 16-byte ASTC block into 32-bit packed pixels.
//   inBuf       - the 16 bytes of the compressed block
//   blockWidth  - block footprint width in texels
//   blockHeight - block footprint height in texels
//   outBuf      - receives the decoded texels row by row with a stride of
//                 blockWidth; each texel is the result of Pixel::Pack()
// Blocks that fail validation (invalid block mode, HDR void extent, weight grid
// larger than the block, dual plane combined with four partitions) trip an
// assert and are filled through FillError instead of being decoded.
1317 static void DecompressBlock(std::span<const u8, 16> inBuf, const u32 blockWidth,
1318 const u32 blockHeight, std::span<u32, 12 * 12> outBuf) {
1319 InputBitStream strm(inBuf);
1320 TexelWeightParams weightParams = DecodeBlockInfo(strm);
1321
1322 // Was there an error?
1323 if (weightParams.m_bError) {
1324 assert(false && "Invalid block mode");
1325 FillError(outBuf, blockWidth, blockHeight);
1326 return;
1327 }
1328
1329 if (weightParams.m_bVoidExtentLDR) {
1330 FillVoidExtentLDR(strm, outBuf, blockWidth, blockHeight);
1331 return;
1332 }
1333
1334 if (weightParams.m_bVoidExtentHDR) {
1335 assert(false && "HDR void extent blocks are unsupported!");
1336 FillError(outBuf, blockWidth, blockHeight);
1337 return;
1338 }
1339
1340 if (weightParams.m_Width > blockWidth) {
1341 assert(false && "Texel weight grid width should be smaller than block width");
1342 FillError(outBuf, blockWidth, blockHeight);
1343 return;
1344 }
1345
1346 if (weightParams.m_Height > blockHeight) {
1347 assert(false && "Texel weight grid height should be smaller than block height");
1348 FillError(outBuf, blockWidth, blockHeight);
1349 return;
1350 }
1351
1352 // Read num partitions
1353 u32 nPartitions = strm.ReadBits<2>() + 1;
1354 assert(nPartitions <= 4);
1355
1356 if (nPartitions == 4 && weightParams.m_bDualPlane) {
1357 assert(false && "Dual plane mode is incompatible with four partition blocks");
1358 FillError(outBuf, blockWidth, blockHeight);
1359 return;
1360 }
1361
1362 // Based on the number of partitions, read the color endpoint mode for
1363 // each partition.
1364
1365 // Determine partitions, partition index, and color endpoint modes
1366 s32 planeIdx = -1;
1367 u32 partitionIndex;
1368 u32 colorEndpointMode[4] = {0, 0, 0, 0};
1369
1370 // Define color data.
1371 u8 colorEndpointData[16];
1372 memset(colorEndpointData, 0, sizeof(colorEndpointData));
1373 OutputBitStream colorEndpointStream(colorEndpointData, 16 * 8, 0);
1374
1375 // Read extra config data...
// A single partition stores its 4-bit endpoint mode directly; otherwise a
// 10-bit partition index and a 6-bit base CEM field follow.
1376 u32 baseCEM = 0;
1377 if (nPartitions == 1) {
1378 colorEndpointMode[0] = strm.ReadBits<4>();
1379 partitionIndex = 0;
1380 } else {
1381 partitionIndex = strm.ReadBits<10>();
1382 baseCEM = strm.ReadBits<6>();
1383 }
1384 u32 baseMode = (baseCEM & 3);
1385
1386 // Remaining bits are color endpoint data...
// NOTE(review): remainingBits is signed and is not checked for being negative
// before it is copied into the unsigned colorDataBits below - confirm that
// malformed blocks cannot reach this point.
1387 u32 nWeightBits = weightParams.GetPackedBitSize();
1388 s32 remainingBits = 128 - nWeightBits - static_cast<int>(strm.GetBitsRead());
1389
1390 // Consider extra bits prior to texel data...
1391 u32 extraCEMbits = 0;
1392 if (baseMode) {
1393 switch (nPartitions) {
1394 case 2:
1395 extraCEMbits += 2;
1396 break;
1397 case 3:
1398 extraCEMbits += 5;
1399 break;
1400 case 4:
1401 extraCEMbits += 8;
1402 break;
1403 default:
1404 assert(false);
1405 break;
1406 }
1407 }
1408 remainingBits -= extraCEMbits;
1409
1410 // Do we have a dual plane situation?
1411 u32 planeSelectorBits = 0;
1412 if (weightParams.m_bDualPlane) {
1413 planeSelectorBits = 2;
1414 }
1415 remainingBits -= planeSelectorBits;
1416
1417 // Read color data...
// The color bits are copied in chunks of at most 8 bits into colorEndpointData.
1418 u32 colorDataBits = remainingBits;
1419 while (remainingBits > 0) {
1420 u32 nb = std::min(remainingBits, 8);
1421 u32 b = strm.ReadBits(nb);
1422 colorEndpointStream.WriteBits(b, nb);
1423 remainingBits -= 8;
1424 }
1425
1426 // Read the plane selection bits
1427 planeIdx = strm.ReadBits(planeSelectorBits);
1428
1429 // Read the rest of the CEM
// With a non-zero base mode every partition gets its own mode, built from the
// base mode, one class bit C[i] and two mode bits M[i]. With a zero base mode
// all partitions share the mode stored in the upper bits of baseCEM.
1430 if (baseMode) {
1431 u32 extraCEM = strm.ReadBits(extraCEMbits);
1432 u32 CEM = (extraCEM << 6) | baseCEM;
1433 CEM >>= 2;
1434
1435 bool C[4] = {0};
1436 for (u32 i = 0; i < nPartitions; i++) {
1437 C[i] = CEM & 1;
1438 CEM >>= 1;
1439 }
1440
1441 u8 M[4] = {0};
1442 for (u32 i = 0; i < nPartitions; i++) {
1443 M[i] = CEM & 3;
1444 CEM >>= 2;
1445 assert(M[i] <= 3);
1446 }
1447
1448 for (u32 i = 0; i < nPartitions; i++) {
1449 colorEndpointMode[i] = baseMode;
1450 if (!(C[i]))
1451 colorEndpointMode[i] -= 1;
1452 colorEndpointMode[i] <<= 2;
1453 colorEndpointMode[i] |= M[i];
1454 }
1455 } else if (nPartitions > 1) {
1456 u32 CEM = baseCEM >> 2;
1457 for (u32 i = 0; i < nPartitions; i++) {
1458 colorEndpointMode[i] = CEM;
1459 }
1460 }
1461
1462 // Make sure everything up till here is sane.
1463 for (u32 i = 0; i < nPartitions; i++) {
1464 assert(colorEndpointMode[i] < 16);
1465 }
1466 assert(strm.GetBitsRead() + weightParams.GetPackedBitSize() == 128);
1467
1468 // Decode both color data and texel weight data
1469 u32 colorValues[32]; // Four values, two endpoints, four maximum partitions
1470 DecodeColorValues(colorValues, colorEndpointData, colorEndpointMode, nPartitions,
1471 colorDataBits);
1472
// ComputeEndpoints advances colorValuesPtr, so each partition continues where
// the previous one stopped.
1473 Pixel endpoints[4][2];
1474 const u32* colorValuesPtr = colorValues;
1475 for (u32 i = 0; i < nPartitions; i++) {
1476 ComputeEndpoints(endpoints[i][0], endpoints[i][1], colorValuesPtr, colorEndpointMode[i]);
1477 }
1478
1479 // Read the texel weight data..
1480 std::array<u8, 16> texelWeightData;
1481 std::ranges::copy(inBuf, texelWeightData.begin());
1482
1483 // Reverse everything
// Byte i is swapped with byte 15 - i and the bits inside each byte are
// reversed, which reverses the bit order of the whole 128-bit block.
1484 for (u32 i = 0; i < 8; i++) {
1485// Taken from http://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64Bits
1486#define REVERSE_BYTE(b) (((b)*0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32
1487 u8 a = static_cast<u8>(REVERSE_BYTE(texelWeightData[i]));
1488 u8 b = static_cast<u8>(REVERSE_BYTE(texelWeightData[15 - i]));
1489#undef REVERSE_BYTE
1490
1491 texelWeightData[i] = b;
1492 texelWeightData[15 - i] = a;
1493 }
1494
1495 // Make sure that higher non-texel bits are set to zero
1496 const u32 clearByteStart = (weightParams.GetPackedBitSize() >> 3) + 1;
1497 if (clearByteStart > 0 && clearByteStart <= texelWeightData.size()) {
1498 texelWeightData[clearByteStart - 1] &=
1499 static_cast<u8>((1 << (weightParams.GetPackedBitSize() % 8)) - 1);
1500 std::memset(texelWeightData.data() + clearByteStart, 0,
1501 std::min(16U - clearByteStart, 16U));
1502 }
1503
1504 IntegerEncodedVector texelWeightValues;
1505
1506 InputBitStream weightStream(texelWeightData);
1507
1508 DecodeIntegerSequence(texelWeightValues, weightStream, weightParams.m_MaxWeight,
1509 weightParams.GetNumWeightValues());
1510
1511 // Blocks can be at most 12x12, so we can have as many as 144 weights
1512 u32 weights[2][144];
1513 UnquantizeTexelWeights(weights, texelWeightValues, weightParams, blockWidth, blockHeight);
1514
1515 // Now that we have endpoints and weights, we can interpolate and generate
1516 // the proper decoding...
1517 for (u32 j = 0; j < blockHeight; j++)
1518 for (u32 i = 0; i < blockWidth; i++) {
// Blocks with fewer than 32 texels are passed as "small" to the partition
// selection.
1519 u32 partition = Select2DPartition(partitionIndex, i, j, nPartitions,
1520 (blockHeight * blockWidth) < 32);
1521 assert(partition < nPartitions);
1522
1523 Pixel p;
1524 for (u32 c = 0; c < 4; c++) {
1525 u32 C0 = endpoints[partition][0].Component(c);
1526 C0 = ReplicateByteTo16(C0);
1527 u32 C1 = endpoints[partition][1].Component(c);
1528 C1 = ReplicateByteTo16(C1);
1529
// In dual-plane mode the component selected by planeIdx takes its weight from
// the second plane; the + 1 maps the selector onto the Component() index.
1530 u32 plane = 0;
1531 if (weightParams.m_bDualPlane && (((planeIdx + 1) & 3) == c)) {
1532 plane = 1;
1533 }
1534
// Interpolate in 16 bits between the endpoints with a weight in [0, 64], then
// scale the result back down to 8 bits.
1535 u32 weight = weights[plane][j * blockWidth + i];
1536 u32 C = (C0 * (64 - weight) + C1 * weight + 32) / 64;
1537 if (C == 65535) {
1538 p.Component(c) = 255;
1539 } else {
1540 double Cf = static_cast<double>(C);
1541 p.Component(c) = static_cast<u16>(255.0 * (Cf / 65536.0) + 0.5);
1542 }
1543 }
1544
1545 outBuf[j * blockWidth + i] = p.Pack();
1546 }
1547 }
1548
1549void Decompress(std::span<const uint8_t> data, uint32_t width, uint32_t height, uint32_t depth,
1550 uint32_t block_width, uint32_t block_height, std::span<uint8_t> output) {
1551 u32 block_index = 0;
1552 std::size_t depth_offset = 0;
1553 for (u32 z = 0; z < depth; z++) {
1554 for (u32 y = 0; y < height; y += block_height) {
1555 for (u32 x = 0; x < width; x += block_width) {
1556 const std::span<const u8, 16> blockPtr{data.subspan(block_index * 16, 16)};
1557
1558 // Blocks can be at most 12x12
1559 std::array<u32, 12 * 12> uncompData;
1560 DecompressBlock(blockPtr, block_width, block_height, uncompData);
1561
1562 u32 decompWidth = std::min(block_width, width - x);
1563 u32 decompHeight = std::min(block_height, height - y);
1564
1565 const std::span<u8> outRow = output.subspan(depth_offset + (y * width + x) * 4);
1566 for (u32 jj = 0; jj < decompHeight; jj++) {
1567 std::memcpy(outRow.data() + jj * width * 4,
1568 uncompData.data() + jj * block_width, decompWidth * 4);
1569 }
1570 ++block_index;
1571 }
1572 }
1573 depth_offset += height * width * 4;
1574 }
1575}
1576
1577} // namespace Tegra::Texture::ASTC
diff --git a/src/video_core/textures/astc.h b/src/video_core/textures/astc.h
index c1c73fda5..c1c37dfe7 100644
--- a/src/video_core/textures/astc.h
+++ b/src/video_core/textures/astc.h
@@ -129,4 +129,7 @@ struct AstcBufferData {
129 decltype(REPLICATE_BYTE_TO_16_TABLE) replicate_byte_to_16 = REPLICATE_BYTE_TO_16_TABLE; 129 decltype(REPLICATE_BYTE_TO_16_TABLE) replicate_byte_to_16 = REPLICATE_BYTE_TO_16_TABLE;
130} constexpr ASTC_BUFFER_DATA; 130} constexpr ASTC_BUFFER_DATA;
131 131
132void Decompress(std::span<const uint8_t> data, uint32_t width, uint32_t height, uint32_t depth,
133 uint32_t block_width, uint32_t block_height, std::span<uint8_t> output);
134
132} // namespace Tegra::Texture::ASTC 135} // namespace Tegra::Texture::ASTC
diff --git a/src/yuzu/configuration/config.cpp b/src/yuzu/configuration/config.cpp
index 712319783..916a22724 100644
--- a/src/yuzu/configuration/config.cpp
+++ b/src/yuzu/configuration/config.cpp
@@ -809,6 +809,7 @@ void Config::ReadRendererValues() {
809 QStringLiteral("use_asynchronous_gpu_emulation"), true); 809 QStringLiteral("use_asynchronous_gpu_emulation"), true);
810 ReadSettingGlobal(Settings::values.use_nvdec_emulation, QStringLiteral("use_nvdec_emulation"), 810 ReadSettingGlobal(Settings::values.use_nvdec_emulation, QStringLiteral("use_nvdec_emulation"),
811 true); 811 true);
812 ReadSettingGlobal(Settings::values.accelerate_astc, QStringLiteral("accelerate_astc"), true);
812 ReadSettingGlobal(Settings::values.use_vsync, QStringLiteral("use_vsync"), true); 813 ReadSettingGlobal(Settings::values.use_vsync, QStringLiteral("use_vsync"), true);
813 ReadSettingGlobal(Settings::values.use_assembly_shaders, QStringLiteral("use_assembly_shaders"), 814 ReadSettingGlobal(Settings::values.use_assembly_shaders, QStringLiteral("use_assembly_shaders"),
814 false); 815 false);
@@ -1392,6 +1393,7 @@ void Config::SaveRendererValues() {
1392 Settings::values.use_asynchronous_gpu_emulation, true); 1393 Settings::values.use_asynchronous_gpu_emulation, true);
1393 WriteSettingGlobal(QStringLiteral("use_nvdec_emulation"), Settings::values.use_nvdec_emulation, 1394 WriteSettingGlobal(QStringLiteral("use_nvdec_emulation"), Settings::values.use_nvdec_emulation,
1394 true); 1395 true);
1396 WriteSettingGlobal(QStringLiteral("accelerate_astc"), Settings::values.accelerate_astc, true);
1395 WriteSettingGlobal(QStringLiteral("use_vsync"), Settings::values.use_vsync, true); 1397 WriteSettingGlobal(QStringLiteral("use_vsync"), Settings::values.use_vsync, true);
1396 WriteSettingGlobal(QStringLiteral("use_assembly_shaders"), 1398 WriteSettingGlobal(QStringLiteral("use_assembly_shaders"),
1397 Settings::values.use_assembly_shaders, false); 1399 Settings::values.use_assembly_shaders, false);
diff --git a/src/yuzu/configuration/configure_graphics.cpp b/src/yuzu/configuration/configure_graphics.cpp
index fb9ec093c..41a69d9b8 100644
--- a/src/yuzu/configuration/configure_graphics.cpp
+++ b/src/yuzu/configuration/configure_graphics.cpp
@@ -70,10 +70,12 @@ void ConfigureGraphics::SetConfiguration() {
70 ui->use_asynchronous_gpu_emulation->setEnabled(runtime_lock); 70 ui->use_asynchronous_gpu_emulation->setEnabled(runtime_lock);
71 ui->use_disk_shader_cache->setEnabled(runtime_lock); 71 ui->use_disk_shader_cache->setEnabled(runtime_lock);
72 ui->use_nvdec_emulation->setEnabled(runtime_lock); 72 ui->use_nvdec_emulation->setEnabled(runtime_lock);
73 ui->accelerate_astc->setEnabled(runtime_lock);
73 ui->use_disk_shader_cache->setChecked(Settings::values.use_disk_shader_cache.GetValue()); 74 ui->use_disk_shader_cache->setChecked(Settings::values.use_disk_shader_cache.GetValue());
74 ui->use_asynchronous_gpu_emulation->setChecked( 75 ui->use_asynchronous_gpu_emulation->setChecked(
75 Settings::values.use_asynchronous_gpu_emulation.GetValue()); 76 Settings::values.use_asynchronous_gpu_emulation.GetValue());
76 ui->use_nvdec_emulation->setChecked(Settings::values.use_nvdec_emulation.GetValue()); 77 ui->use_nvdec_emulation->setChecked(Settings::values.use_nvdec_emulation.GetValue());
78 ui->accelerate_astc->setChecked(Settings::values.accelerate_astc.GetValue());
77 79
78 if (Settings::IsConfiguringGlobal()) { 80 if (Settings::IsConfiguringGlobal()) {
79 ui->api->setCurrentIndex(static_cast<int>(Settings::values.renderer_backend.GetValue())); 81 ui->api->setCurrentIndex(static_cast<int>(Settings::values.renderer_backend.GetValue()));
@@ -118,6 +120,8 @@ void ConfigureGraphics::ApplyConfiguration() {
118 use_asynchronous_gpu_emulation); 120 use_asynchronous_gpu_emulation);
119 ConfigurationShared::ApplyPerGameSetting(&Settings::values.use_nvdec_emulation, 121 ConfigurationShared::ApplyPerGameSetting(&Settings::values.use_nvdec_emulation,
120 ui->use_nvdec_emulation, use_nvdec_emulation); 122 ui->use_nvdec_emulation, use_nvdec_emulation);
123 ConfigurationShared::ApplyPerGameSetting(&Settings::values.accelerate_astc, ui->accelerate_astc,
124 accelerate_astc);
121 125
122 if (Settings::IsConfiguringGlobal()) { 126 if (Settings::IsConfiguringGlobal()) {
123 // Guard if during game and set to game-specific value 127 // Guard if during game and set to game-specific value
@@ -254,6 +258,7 @@ void ConfigureGraphics::SetupPerGameUI() {
254 ui->use_asynchronous_gpu_emulation->setEnabled( 258 ui->use_asynchronous_gpu_emulation->setEnabled(
255 Settings::values.use_asynchronous_gpu_emulation.UsingGlobal()); 259 Settings::values.use_asynchronous_gpu_emulation.UsingGlobal());
256 ui->use_nvdec_emulation->setEnabled(Settings::values.use_nvdec_emulation.UsingGlobal()); 260 ui->use_nvdec_emulation->setEnabled(Settings::values.use_nvdec_emulation.UsingGlobal());
261 ui->accelerate_astc->setEnabled(Settings::values.accelerate_astc.UsingGlobal());
257 ui->use_disk_shader_cache->setEnabled(Settings::values.use_disk_shader_cache.UsingGlobal()); 262 ui->use_disk_shader_cache->setEnabled(Settings::values.use_disk_shader_cache.UsingGlobal());
258 ui->bg_button->setEnabled(Settings::values.bg_red.UsingGlobal()); 263 ui->bg_button->setEnabled(Settings::values.bg_red.UsingGlobal());
259 264
@@ -269,6 +274,8 @@ void ConfigureGraphics::SetupPerGameUI() {
269 ui->use_disk_shader_cache, Settings::values.use_disk_shader_cache, use_disk_shader_cache); 274 ui->use_disk_shader_cache, Settings::values.use_disk_shader_cache, use_disk_shader_cache);
270 ConfigurationShared::SetColoredTristate( 275 ConfigurationShared::SetColoredTristate(
271 ui->use_nvdec_emulation, Settings::values.use_nvdec_emulation, use_nvdec_emulation); 276 ui->use_nvdec_emulation, Settings::values.use_nvdec_emulation, use_nvdec_emulation);
277 ConfigurationShared::SetColoredTristate(ui->accelerate_astc, Settings::values.accelerate_astc,
278 accelerate_astc);
272 ConfigurationShared::SetColoredTristate(ui->use_asynchronous_gpu_emulation, 279 ConfigurationShared::SetColoredTristate(ui->use_asynchronous_gpu_emulation,
273 Settings::values.use_asynchronous_gpu_emulation, 280 Settings::values.use_asynchronous_gpu_emulation,
274 use_asynchronous_gpu_emulation); 281 use_asynchronous_gpu_emulation);
diff --git a/src/yuzu/configuration/configure_graphics.h b/src/yuzu/configuration/configure_graphics.h
index c162048a2..6418115cf 100644
--- a/src/yuzu/configuration/configure_graphics.h
+++ b/src/yuzu/configuration/configure_graphics.h
@@ -47,6 +47,7 @@ private:
47 QColor bg_color; 47 QColor bg_color;
48 48
49 ConfigurationShared::CheckState use_nvdec_emulation; 49 ConfigurationShared::CheckState use_nvdec_emulation;
50 ConfigurationShared::CheckState accelerate_astc;
50 ConfigurationShared::CheckState use_disk_shader_cache; 51 ConfigurationShared::CheckState use_disk_shader_cache;
51 ConfigurationShared::CheckState use_asynchronous_gpu_emulation; 52 ConfigurationShared::CheckState use_asynchronous_gpu_emulation;
52 53
diff --git a/src/yuzu/configuration/configure_graphics.ui b/src/yuzu/configuration/configure_graphics.ui
index ab0bd4d77..5b999d84d 100644
--- a/src/yuzu/configuration/configure_graphics.ui
+++ b/src/yuzu/configuration/configure_graphics.ui
@@ -105,6 +105,13 @@
105 </widget> 105 </widget>
106 </item> 106 </item>
107 <item> 107 <item>
108 <widget class="QCheckBox" name="accelerate_astc">
109 <property name="text">
110 <string>Accelerate ASTC texture decoding</string>
111 </property>
112 </widget>
113 </item>
114 <item>
108 <widget class="QWidget" name="fullscreen_mode_layout" native="true"> 115 <widget class="QWidget" name="fullscreen_mode_layout" native="true">
109 <layout class="QHBoxLayout" name="horizontalLayout_1"> 116 <layout class="QHBoxLayout" name="horizontalLayout_1">
110 <property name="leftMargin"> 117 <property name="leftMargin">
diff --git a/src/yuzu_cmd/config.cpp b/src/yuzu_cmd/config.cpp
index 107f097d0..621b31571 100644
--- a/src/yuzu_cmd/config.cpp
+++ b/src/yuzu_cmd/config.cpp
@@ -447,8 +447,10 @@ void Config::ReadValues() {
447 sdl2_config->GetBoolean("Renderer", "use_assembly_shaders", true)); 447 sdl2_config->GetBoolean("Renderer", "use_assembly_shaders", true));
448 Settings::values.use_asynchronous_shaders.SetValue( 448 Settings::values.use_asynchronous_shaders.SetValue(
449 sdl2_config->GetBoolean("Renderer", "use_asynchronous_shaders", false)); 449 sdl2_config->GetBoolean("Renderer", "use_asynchronous_shaders", false));
450 Settings::values.use_asynchronous_shaders.SetValue( 450 Settings::values.use_nvdec_emulation.SetValue(
451 sdl2_config->GetBoolean("Renderer", "use_asynchronous_shaders", false)); 451 sdl2_config->GetBoolean("Renderer", "use_nvdec_emulation", true));
452 Settings::values.accelerate_astc.SetValue(
453 sdl2_config->GetBoolean("Renderer", "accelerate_astc", true));
452 Settings::values.use_fast_gpu_time.SetValue( 454 Settings::values.use_fast_gpu_time.SetValue(
453 sdl2_config->GetBoolean("Renderer", "use_fast_gpu_time", true)); 455 sdl2_config->GetBoolean("Renderer", "use_fast_gpu_time", true));
454 456
diff --git a/src/yuzu_cmd/default_ini.h b/src/yuzu_cmd/default_ini.h
index c32421671..efa1b1d18 100644
--- a/src/yuzu_cmd/default_ini.h
+++ b/src/yuzu_cmd/default_ini.h
@@ -194,6 +194,14 @@ use_assembly_shaders =
194# 0 (default): Off, 1: On 194# 0 (default): Off, 1: On
195use_asynchronous_shaders = 195use_asynchronous_shaders =
196 196
197# Enable NVDEC emulation.
198# 0: Off, 1 (default): On
199use_nvdec_emulation =
200
201# Accelerate ASTC texture decoding.
202# 0: Off, 1 (default): On
203accelerate_astc =
204
197# Turns on the frame limiter, which will limit frames output to the target game speed 205# Turns on the frame limiter, which will limit frames output to the target game speed
198# 0: Off, 1: On (default) 206# 0: Off, 1: On (default)
199use_frame_limit = 207use_frame_limit =