summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r-- src/video_core/renderer_opengl/gl_rasterizer.cpp 8
-rw-r--r-- src/video_core/renderer_opengl/gl_shader_decompiler.cpp 12
-rw-r--r-- src/video_core/renderer_opengl/gl_texture_cache.cpp 4
-rw-r--r-- src/video_core/renderer_vulkan/vk_rasterizer.cpp 4
-rw-r--r-- src/video_core/shader/decode/conversion.cpp 113
-rw-r--r-- src/video_core/shader/decode/texture.cpp 14
-rw-r--r-- src/video_core/texture_cache/texture_cache.h 66
-rw-r--r-- src/video_core/textures/astc.cpp 241
8 files changed, 298 insertions, 164 deletions
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 75ef8d541..f31d960c7 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -345,7 +345,7 @@ void RasterizerOpenGL::ConfigureFramebuffers() {
345 345
346 texture_cache.GuardRenderTargets(true); 346 texture_cache.GuardRenderTargets(true);
347 347
348 View depth_surface = texture_cache.GetDepthBufferSurface(true); 348 View depth_surface = texture_cache.GetDepthBufferSurface();
349 349
350 const auto& regs = gpu.regs; 350 const auto& regs = gpu.regs;
351 UNIMPLEMENTED_IF(regs.rt_separate_frag_data == 0); 351 UNIMPLEMENTED_IF(regs.rt_separate_frag_data == 0);
@@ -354,7 +354,7 @@ void RasterizerOpenGL::ConfigureFramebuffers() {
354 FramebufferCacheKey key; 354 FramebufferCacheKey key;
355 const auto colors_count = static_cast<std::size_t>(regs.rt_control.count); 355 const auto colors_count = static_cast<std::size_t>(regs.rt_control.count);
356 for (std::size_t index = 0; index < colors_count; ++index) { 356 for (std::size_t index = 0; index < colors_count; ++index) {
357 View color_surface{texture_cache.GetColorBufferSurface(index, true)}; 357 View color_surface{texture_cache.GetColorBufferSurface(index)};
358 if (!color_surface) { 358 if (!color_surface) {
359 continue; 359 continue;
360 } 360 }
@@ -387,12 +387,12 @@ void RasterizerOpenGL::ConfigureClearFramebuffer(bool using_color_fb, bool using
387 View color_surface; 387 View color_surface;
388 if (using_color_fb) { 388 if (using_color_fb) {
389 const std::size_t index = regs.clear_buffers.RT; 389 const std::size_t index = regs.clear_buffers.RT;
390 color_surface = texture_cache.GetColorBufferSurface(index, true); 390 color_surface = texture_cache.GetColorBufferSurface(index);
391 texture_cache.MarkColorBufferInUse(index); 391 texture_cache.MarkColorBufferInUse(index);
392 } 392 }
393 View depth_surface; 393 View depth_surface;
394 if (using_depth_fb || using_stencil_fb) { 394 if (using_depth_fb || using_stencil_fb) {
395 depth_surface = texture_cache.GetDepthBufferSurface(true); 395 depth_surface = texture_cache.GetDepthBufferSurface();
396 texture_cache.MarkDepthBufferInUse(); 396 texture_cache.MarkDepthBufferInUse();
397 } 397 }
398 texture_cache.GuardRenderTargets(false); 398 texture_cache.GuardRenderTargets(false);
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index 160ae4340..1f1f01313 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -1819,15 +1819,15 @@ private:
1819 } 1819 }
1820 1820
1821 Expression HMergeH0(Operation operation) { 1821 Expression HMergeH0(Operation operation) {
1822 std::string dest = VisitOperand(operation, 0).AsUint(); 1822 const std::string dest = VisitOperand(operation, 0).AsUint();
1823 std::string src = VisitOperand(operation, 1).AsUint(); 1823 const std::string src = VisitOperand(operation, 1).AsUint();
1824 return {fmt::format("(({} & 0x0000FFFFU) | ({} & 0xFFFF0000U))", src, dest), Type::Uint}; 1824 return {fmt::format("bitfieldInsert({}, {}, 0, 16)", dest, src), Type::Uint};
1825 } 1825 }
1826 1826
1827 Expression HMergeH1(Operation operation) { 1827 Expression HMergeH1(Operation operation) {
1828 std::string dest = VisitOperand(operation, 0).AsUint(); 1828 const std::string dest = VisitOperand(operation, 0).AsUint();
1829 std::string src = VisitOperand(operation, 1).AsUint(); 1829 const std::string src = VisitOperand(operation, 1).AsUint();
1830 return {fmt::format("(({} & 0x0000FFFFU) | ({} & 0xFFFF0000U))", dest, src), Type::Uint}; 1830 return {fmt::format("bitfieldInsert({}, {}, 16, 16)", dest, src), Type::Uint};
1831 } 1831 }
1832 1832
1833 Expression HPack2(Operation operation) { 1833 Expression HPack2(Operation operation) {
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp
index 36590a6d0..0b4d999d7 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp
@@ -411,14 +411,13 @@ CachedSurfaceView::~CachedSurfaceView() = default;
411 void CachedSurfaceView::Attach(GLenum attachment, GLenum target) const { 411 void CachedSurfaceView::Attach(GLenum attachment, GLenum target) const {
412 ASSERT(params.num_levels == 1); 412 ASSERT(params.num_levels == 1);
413 413
414 const GLuint texture = surface.GetTexture();
415 if (params.num_layers > 1) { 414 if (params.num_layers > 1) {
416 // Layered framebuffer attachments 415 // Layered framebuffer attachments
417 UNIMPLEMENTED_IF(params.base_layer != 0); 416 UNIMPLEMENTED_IF(params.base_layer != 0);
418 417
419 switch (params.target) { 418 switch (params.target) {
420 case SurfaceTarget::Texture2DArray: 419 case SurfaceTarget::Texture2DArray:
421 glFramebufferTexture(target, attachment, texture, params.base_level); 420 glFramebufferTexture(target, attachment, GetTexture(), params.base_level);
422 break; 421 break;
423 default: 422 default:
424 UNIMPLEMENTED(); 423 UNIMPLEMENTED();
@@ -427,6 +426,7 @@ void CachedSurfaceView::Attach(GLenum attachment, GLenum target) const {
427 } 426 }
428 427
429 const GLenum view_target = surface.GetTarget(); 428 const GLenum view_target = surface.GetTarget();
429 const GLuint texture = surface.GetTexture();
430 switch (surface.GetSurfaceParams().target) { 430 switch (surface.GetSurfaceParams().target) {
431 case SurfaceTarget::Texture1D: 431 case SurfaceTarget::Texture1D:
432 glFramebufferTexture1D(target, attachment, view_target, texture, params.base_level); 432 glFramebufferTexture1D(target, attachment, view_target, texture, params.base_level);
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index 0a2ea4fd4..6b99cbbbc 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -599,7 +599,7 @@ RasterizerVulkan::Texceptions RasterizerVulkan::UpdateAttachments() {
599 Texceptions texceptions; 599 Texceptions texceptions;
600 for (std::size_t rt = 0; rt < Maxwell::NumRenderTargets; ++rt) { 600 for (std::size_t rt = 0; rt < Maxwell::NumRenderTargets; ++rt) {
601 if (update_rendertargets) { 601 if (update_rendertargets) {
602 color_attachments[rt] = texture_cache.GetColorBufferSurface(rt, true); 602 color_attachments[rt] = texture_cache.GetColorBufferSurface(rt);
603 } 603 }
604 if (color_attachments[rt] && WalkAttachmentOverlaps(*color_attachments[rt])) { 604 if (color_attachments[rt] && WalkAttachmentOverlaps(*color_attachments[rt])) {
605 texceptions[rt] = true; 605 texceptions[rt] = true;
@@ -607,7 +607,7 @@ RasterizerVulkan::Texceptions RasterizerVulkan::UpdateAttachments() {
607 } 607 }
608 608
609 if (update_rendertargets) { 609 if (update_rendertargets) {
610 zeta_attachment = texture_cache.GetDepthBufferSurface(true); 610 zeta_attachment = texture_cache.GetDepthBufferSurface();
611 } 611 }
612 if (zeta_attachment && WalkAttachmentOverlaps(*zeta_attachment)) { 612 if (zeta_attachment && WalkAttachmentOverlaps(*zeta_attachment)) {
613 texceptions[ZETA_TEXCEPTION_INDEX] = true; 613 texceptions[ZETA_TEXCEPTION_INDEX] = true;
diff --git a/src/video_core/shader/decode/conversion.cpp b/src/video_core/shader/decode/conversion.cpp
index c72690b2b..b9989c88c 100644
--- a/src/video_core/shader/decode/conversion.cpp
+++ b/src/video_core/shader/decode/conversion.cpp
@@ -2,6 +2,10 @@
2 // Licensed under GPLv2 or any later version 2 // Licensed under GPLv2 or any later version
3 // Refer to the license.txt file included. 3 // Refer to the license.txt file included.
4 4
5 #include <limits>
6 #include <optional>
7 #include <utility>
8
5 #include "common/assert.h" 9 #include "common/assert.h"
6 #include "common/common_types.h" 10 #include "common/common_types.h"
7 #include "video_core/engines/shader_bytecode.h" 11 #include "video_core/engines/shader_bytecode.h"
@@ -15,9 +19,49 @@ using Tegra::Shader::OpCode;
15 using Tegra::Shader::Register; 19 using Tegra::Shader::Register;
16 20
17 namespace { 21 namespace {
22
18 constexpr OperationCode GetFloatSelector(u64 selector) { 23 constexpr OperationCode GetFloatSelector(u64 selector) {
19 return selector == 0 ? OperationCode::FCastHalf0 : OperationCode::FCastHalf1; 24 return selector == 0 ? OperationCode::FCastHalf0 : OperationCode::FCastHalf1;
20 } 25 }
26
27 constexpr u32 SizeInBits(Register::Size size) {
28 switch (size) {
29 case Register::Size::Byte:
30 return 8;
31 case Register::Size::Short:
32 return 16;
33 case Register::Size::Word:
34 return 32;
35 case Register::Size::Long:
36 return 64;
37 }
38 return 0;
39 }
40
41 constexpr std::optional<std::pair<s32, s32>> IntegerSaturateBounds(Register::Size src_size,
42 Register::Size dst_size,
43 bool src_signed,
44 bool dst_signed) {
45 const u32 dst_bits = SizeInBits(dst_size);
46 if (src_size == Register::Size::Word && dst_size == Register::Size::Word) {
47 if (src_signed == dst_signed) {
48 return std::nullopt;
49 }
50 return std::make_pair(0, std::numeric_limits<s32>::max());
51 }
52 if (dst_signed) {
53 // Signed destination, clamp to [-128, 127] for instance
54 return std::make_pair(-(1 << (dst_bits - 1)), (1 << (dst_bits - 1)) - 1);
55 } else {
56 // Unsigned destination
57 if (dst_bits == 32) {
58 // Avoid shifting by 32, that is undefined behavior
59 return std::make_pair(0, s32(std::numeric_limits<u32>::max()));
60 }
61 return std::make_pair(0, (1 << dst_bits) - 1);
62 }
63 }
64
21 } // Anonymous namespace 65 } // Anonymous namespace
22 66
23 u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) { 67 u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
@@ -28,14 +72,13 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
28 case OpCode::Id::I2I_R: 72 case OpCode::Id::I2I_R:
29 case OpCode::Id::I2I_C: 73 case OpCode::Id::I2I_C:
30 case OpCode::Id::I2I_IMM: { 74 case OpCode::Id::I2I_IMM: {
31 UNIMPLEMENTED_IF(instr.conversion.int_src.selector != 0); 75 const bool src_signed = instr.conversion.is_input_signed;
32 UNIMPLEMENTED_IF(instr.conversion.dst_size != Register::Size::Word); 76 const bool dst_signed = instr.conversion.is_output_signed;
33 UNIMPLEMENTED_IF(instr.alu.saturate_d); 77 const Register::Size src_size = instr.conversion.src_size;
78 const Register::Size dst_size = instr.conversion.dst_size;
79 const u32 selector = static_cast<u32>(instr.conversion.int_src.selector);
34 80
35 const bool input_signed = instr.conversion.is_input_signed; 81 Node value = [this, instr, opcode] {
36 const bool output_signed = instr.conversion.is_output_signed;
37
38 Node value = [&]() {
39 switch (opcode->get().GetId()) { 82 switch (opcode->get().GetId()) {
40 case OpCode::Id::I2I_R: 83 case OpCode::Id::I2I_R:
41 return GetRegister(instr.gpr20); 84 return GetRegister(instr.gpr20);
@@ -48,16 +91,60 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
48 return Immediate(0); 91 return Immediate(0);
49 } 92 }
50 }(); 93 }();
51 value = ConvertIntegerSize(value, instr.conversion.src_size, input_signed);
52 94
53 value = GetOperandAbsNegInteger(value, instr.conversion.abs_a, instr.conversion.negate_a, 95 // Ensure the source selector is valid
54 input_signed); 96 switch (instr.conversion.src_size) {
55 if (input_signed != output_signed) { 97 case Register::Size::Byte:
56 value = SignedOperation(OperationCode::ICastUnsigned, output_signed, NO_PRECISE, value); 98 break;
99 case Register::Size::Short:
100 ASSERT(selector == 0 || selector == 2);
101 break;
102 default:
103 ASSERT(selector == 0);
104 break;
105 }
106
107 if (src_size != Register::Size::Word || selector != 0) {
108 value = SignedOperation(OperationCode::IBitfieldExtract, src_signed, std::move(value),
109 Immediate(selector * 8), Immediate(SizeInBits(src_size)));
110 }
111
112 value = GetOperandAbsNegInteger(std::move(value), instr.conversion.abs_a,
113 instr.conversion.negate_a, src_signed);
114
115 if (instr.alu.saturate_d) {
116 if (src_signed && !dst_signed) {
117 Node is_negative = Operation(OperationCode::LogicalUGreaterEqual, value,
118 Immediate(1 << (SizeInBits(src_size) - 1)));
119 value = Operation(OperationCode::Select, std::move(is_negative), Immediate(0),
120 std::move(value));
121
122 // Simplify generated expressions, this can be removed without semantic impact
123 SetTemporary(bb, 0, std::move(value));
124 value = GetTemporary(0);
125
126 if (dst_size != Register::Size::Word) {
127 const Node limit = Immediate((1 << SizeInBits(dst_size)) - 1);
128 Node is_large =
129 Operation(OperationCode::LogicalUGreaterThan, std::move(value), limit);
130 value = Operation(OperationCode::Select, std::move(is_large), limit,
131 std::move(value));
132 }
133 } else if (const std::optional bounds =
134 IntegerSaturateBounds(src_size, dst_size, src_signed, dst_signed)) {
135 value = SignedOperation(OperationCode::IMax, src_signed, std::move(value),
136 Immediate(bounds->first));
137 value = SignedOperation(OperationCode::IMin, src_signed, std::move(value),
138 Immediate(bounds->second));
139 }
140 } else if (dst_size != Register::Size::Word) {
141 // No saturation, we only have to mask the result
142 Node mask = Immediate((1 << SizeInBits(dst_size)) - 1);
143 value = Operation(OperationCode::UBitwiseAnd, std::move(value), std::move(mask));
57 } 144 }
58 145
59 SetInternalFlagsFromInteger(bb, value, instr.generates_cc); 146 SetInternalFlagsFromInteger(bb, value, instr.generates_cc);
60 SetRegister(bb, instr.gpr0, value); 147 SetRegister(bb, instr.gpr0, std::move(value));
61 break; 148 break;
62 } 149 }
63 case OpCode::Id::I2F_R: 150 case OpCode::Id::I2F_R:
diff --git a/src/video_core/shader/decode/texture.cpp b/src/video_core/shader/decode/texture.cpp
index 48350e042..6c4a1358b 100644
--- a/src/video_core/shader/decode/texture.cpp
+++ b/src/video_core/shader/decode/texture.cpp
@@ -780,20 +780,6 @@ Node4 ShaderIR::GetTldsCode(Instruction instr, TextureType texture_type, bool is
780 // When lod is used always is in gpr20 780 // When lod is used always is in gpr20
781 const Node lod = lod_enabled ? GetRegister(instr.gpr20) : Immediate(0); 781 const Node lod = lod_enabled ? GetRegister(instr.gpr20) : Immediate(0);
782 782
783 // Fill empty entries from the guest sampler
784 const std::size_t entry_coord_count = GetCoordCount(sampler.GetType());
785 if (type_coord_count != entry_coord_count) {
786 LOG_WARNING(HW_GPU, "Bound and built texture types mismatch");
787
788 // When the size is higher we insert zeroes
789 for (std::size_t i = type_coord_count; i < entry_coord_count; ++i) {
790 coords.push_back(GetRegister(Register::ZeroIndex));
791 }
792
793 // Then we ensure the size matches the number of entries (dropping unused values)
794 coords.resize(entry_coord_count);
795 }
796
797 Node4 values; 783 Node4 values;
798 for (u32 element = 0; element < values.size(); ++element) { 784 for (u32 element = 0; element < values.size(); ++element) {
799 auto coords_copy = coords; 785 auto coords_copy = coords;
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index 88fe3e25f..cfc7fe6e9 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -108,7 +108,7 @@ public:
108 } 108 }
109 109
110 const auto params{SurfaceParams::CreateForTexture(format_lookup_table, tic, entry)}; 110 const auto params{SurfaceParams::CreateForTexture(format_lookup_table, tic, entry)};
111 const auto [surface, view] = GetSurface(gpu_addr, *cpu_addr, params, true, false); 111 const auto [surface, view] = GetSurface(gpu_addr, *cpu_addr, params, false);
112 if (guard_samplers) { 112 if (guard_samplers) {
113 sampled_textures.push_back(surface); 113 sampled_textures.push_back(surface);
114 } 114 }
@@ -128,7 +128,7 @@ public:
128 return GetNullSurface(SurfaceParams::ExpectedTarget(entry)); 128 return GetNullSurface(SurfaceParams::ExpectedTarget(entry));
129 } 129 }
130 const auto params{SurfaceParams::CreateForImage(format_lookup_table, tic, entry)}; 130 const auto params{SurfaceParams::CreateForImage(format_lookup_table, tic, entry)};
131 const auto [surface, view] = GetSurface(gpu_addr, *cpu_addr, params, true, false); 131 const auto [surface, view] = GetSurface(gpu_addr, *cpu_addr, params, false);
132 if (guard_samplers) { 132 if (guard_samplers) {
133 sampled_textures.push_back(surface); 133 sampled_textures.push_back(surface);
134 } 134 }
@@ -143,7 +143,7 @@ public:
143 return any_rt; 143 return any_rt;
144 } 144 }
145 145
146 TView GetDepthBufferSurface(bool preserve_contents) { 146 TView GetDepthBufferSurface() {
147 std::lock_guard lock{mutex}; 147 std::lock_guard lock{mutex};
148 auto& maxwell3d = system.GPU().Maxwell3D(); 148 auto& maxwell3d = system.GPU().Maxwell3D();
149 if (!maxwell3d.dirty.flags[VideoCommon::Dirty::ZetaBuffer]) { 149 if (!maxwell3d.dirty.flags[VideoCommon::Dirty::ZetaBuffer]) {
@@ -164,7 +164,7 @@ public:
164 return {}; 164 return {};
165 } 165 }
166 const auto depth_params{SurfaceParams::CreateForDepthBuffer(system)}; 166 const auto depth_params{SurfaceParams::CreateForDepthBuffer(system)};
167 auto surface_view = GetSurface(gpu_addr, *cpu_addr, depth_params, preserve_contents, true); 167 auto surface_view = GetSurface(gpu_addr, *cpu_addr, depth_params, true);
168 if (depth_buffer.target) 168 if (depth_buffer.target)
169 depth_buffer.target->MarkAsRenderTarget(false, NO_RT); 169 depth_buffer.target->MarkAsRenderTarget(false, NO_RT);
170 depth_buffer.target = surface_view.first; 170 depth_buffer.target = surface_view.first;
@@ -174,7 +174,7 @@ public:
174 return surface_view.second; 174 return surface_view.second;
175 } 175 }
176 176
177 TView GetColorBufferSurface(std::size_t index, bool preserve_contents) { 177 TView GetColorBufferSurface(std::size_t index) {
178 std::lock_guard lock{mutex}; 178 std::lock_guard lock{mutex};
179 ASSERT(index < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets); 179 ASSERT(index < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets);
180 auto& maxwell3d = system.GPU().Maxwell3D(); 180 auto& maxwell3d = system.GPU().Maxwell3D();
@@ -204,9 +204,8 @@ public:
204 return {}; 204 return {};
205 } 205 }
206 206
207 auto surface_view = 207 auto surface_view = GetSurface(gpu_addr, *cpu_addr,
208 GetSurface(gpu_addr, *cpu_addr, SurfaceParams::CreateForFramebuffer(system, index), 208 SurfaceParams::CreateForFramebuffer(system, index), true);
209 preserve_contents, true);
210 if (render_targets[index].target) 209 if (render_targets[index].target)
211 render_targets[index].target->MarkAsRenderTarget(false, NO_RT); 210 render_targets[index].target->MarkAsRenderTarget(false, NO_RT);
212 render_targets[index].target = surface_view.first; 211 render_targets[index].target = surface_view.first;
@@ -260,9 +259,9 @@ public:
260 const std::optional<VAddr> src_cpu_addr = 259 const std::optional<VAddr> src_cpu_addr =
261 system.GPU().MemoryManager().GpuToCpuAddress(src_gpu_addr); 260 system.GPU().MemoryManager().GpuToCpuAddress(src_gpu_addr);
262 std::pair<TSurface, TView> dst_surface = 261 std::pair<TSurface, TView> dst_surface =
263 GetSurface(dst_gpu_addr, *dst_cpu_addr, dst_params, true, false); 262 GetSurface(dst_gpu_addr, *dst_cpu_addr, dst_params, false);
264 std::pair<TSurface, TView> src_surface = 263 std::pair<TSurface, TView> src_surface =
265 GetSurface(src_gpu_addr, *src_cpu_addr, src_params, true, false); 264 GetSurface(src_gpu_addr, *src_cpu_addr, src_params, false);
266 ImageBlit(src_surface.second, dst_surface.second, copy_config); 265 ImageBlit(src_surface.second, dst_surface.second, copy_config);
267 dst_surface.first->MarkAsModified(true, Tick()); 266 dst_surface.first->MarkAsModified(true, Tick());
268 } 267 }
@@ -451,22 +450,18 @@ private:
451 * @param overlaps The overlapping surfaces registered in the cache. 450 * @param overlaps The overlapping surfaces registered in the cache.
452 * @param params The parameters for the new surface. 451 * @param params The parameters for the new surface.
453 * @param gpu_addr The starting address of the new surface. 452 * @param gpu_addr The starting address of the new surface.
454 * @param preserve_contents Indicates that the new surface should be loaded from memory or left
455 * blank.
456 * @param untopological Indicates to the recycler that the texture has no way to match the 453 * @param untopological Indicates to the recycler that the texture has no way to match the
457 * overlaps due to topological reasons. 454 * overlaps due to topological reasons.
458 **/ 455 **/
459 std::pair<TSurface, TView> RecycleSurface(std::vector<TSurface>& overlaps, 456 std::pair<TSurface, TView> RecycleSurface(std::vector<TSurface>& overlaps,
460 const SurfaceParams& params, const GPUVAddr gpu_addr, 457 const SurfaceParams& params, const GPUVAddr gpu_addr,
461 const bool preserve_contents,
462 const MatchTopologyResult untopological) { 458 const MatchTopologyResult untopological) {
463 const bool do_load = preserve_contents && Settings::values.use_accurate_gpu_emulation;
464 for (auto& surface : overlaps) { 459 for (auto& surface : overlaps) {
465 Unregister(surface); 460 Unregister(surface);
466 } 461 }
467 switch (PickStrategy(overlaps, params, gpu_addr, untopological)) { 462 switch (PickStrategy(overlaps, params, gpu_addr, untopological)) {
468 case RecycleStrategy::Ignore: { 463 case RecycleStrategy::Ignore: {
469 return InitializeSurface(gpu_addr, params, do_load); 464 return InitializeSurface(gpu_addr, params, Settings::values.use_accurate_gpu_emulation);
470 } 465 }
471 case RecycleStrategy::Flush: { 466 case RecycleStrategy::Flush: {
472 std::sort(overlaps.begin(), overlaps.end(), 467 std::sort(overlaps.begin(), overlaps.end(),
@@ -476,7 +471,7 @@ private:
476 for (auto& surface : overlaps) { 471 for (auto& surface : overlaps) {
477 FlushSurface(surface); 472 FlushSurface(surface);
478 } 473 }
479 return InitializeSurface(gpu_addr, params, preserve_contents); 474 return InitializeSurface(gpu_addr, params);
480 } 475 }
481 case RecycleStrategy::BufferCopy: { 476 case RecycleStrategy::BufferCopy: {
482 auto new_surface = GetUncachedSurface(gpu_addr, params); 477 auto new_surface = GetUncachedSurface(gpu_addr, params);
@@ -485,7 +480,7 @@ private:
485 } 480 }
486 default: { 481 default: {
487 UNIMPLEMENTED_MSG("Unimplemented Texture Cache Recycling Strategy!"); 482 UNIMPLEMENTED_MSG("Unimplemented Texture Cache Recycling Strategy!");
488 return InitializeSurface(gpu_addr, params, do_load); 483 return InitializeSurface(gpu_addr, params);
489 } 484 }
490 } 485 }
491 } 486 }
@@ -621,14 +616,11 @@ private:
621 * @param params The parameters on the new surface. 616 * @param params The parameters on the new surface.
622 * @param gpu_addr The starting address of the new surface. 617 * @param gpu_addr The starting address of the new surface.
623 * @param cache_addr The starting address of the new surface on physical memory. 618 * @param cache_addr The starting address of the new surface on physical memory.
624 * @param preserve_contents Indicates that the new surface should be loaded from memory or
625 * left blank.
626 */ 619 */
627 std::optional<std::pair<TSurface, TView>> Manage3DSurfaces(std::vector<TSurface>& overlaps, 620 std::optional<std::pair<TSurface, TView>> Manage3DSurfaces(std::vector<TSurface>& overlaps,
628 const SurfaceParams& params, 621 const SurfaceParams& params,
629 const GPUVAddr gpu_addr, 622 const GPUVAddr gpu_addr,
630 const VAddr cpu_addr, 623 const VAddr cpu_addr) {
631 bool preserve_contents) {
632 if (params.target == SurfaceTarget::Texture3D) { 624 if (params.target == SurfaceTarget::Texture3D) {
633 bool failed = false; 625 bool failed = false;
634 if (params.num_levels > 1) { 626 if (params.num_levels > 1) {
@@ -677,7 +669,7 @@ private:
677 return std::nullopt; 669 return std::nullopt;
678 } 670 }
679 Unregister(surface); 671 Unregister(surface);
680 return InitializeSurface(gpu_addr, params, preserve_contents); 672 return InitializeSurface(gpu_addr, params);
681 } 673 }
682 return std::nullopt; 674 return std::nullopt;
683 } 675 }
@@ -688,7 +680,7 @@ private:
688 return {{surface, surface->GetMainView()}}; 680 return {{surface, surface->GetMainView()}};
689 } 681 }
690 } 682 }
691 return InitializeSurface(gpu_addr, params, preserve_contents); 683 return InitializeSurface(gpu_addr, params);
692 } 684 }
693 } 685 }
694 686
@@ -711,13 +703,10 @@ private:
711 * 703 *
712 * @param gpu_addr The starting address of the candidate surface. 704 * @param gpu_addr The starting address of the candidate surface.
713 * @param params The parameters on the candidate surface. 705 * @param params The parameters on the candidate surface.
714 * @param preserve_contents Indicates that the new surface should be loaded from memory or
715 * left blank.
716 * @param is_render Whether or not the surface is a render target. 706 * @param is_render Whether or not the surface is a render target.
717 **/ 707 **/
718 std::pair<TSurface, TView> GetSurface(const GPUVAddr gpu_addr, const VAddr cpu_addr, 708 std::pair<TSurface, TView> GetSurface(const GPUVAddr gpu_addr, const VAddr cpu_addr,
719 const SurfaceParams& params, bool preserve_contents, 709 const SurfaceParams& params, bool is_render) {
720 bool is_render) {
721 // Step 1 710 // Step 1
722 // Check Level 1 Cache for a fast structural match. If candidate surface 711 // Check Level 1 Cache for a fast structural match. If candidate surface
723 // matches at certain level we are pretty much done. 712 // matches at certain level we are pretty much done.
@@ -726,8 +715,7 @@ private:
726 const auto topological_result = current_surface->MatchesTopology(params); 715 const auto topological_result = current_surface->MatchesTopology(params);
727 if (topological_result != MatchTopologyResult::FullMatch) { 716 if (topological_result != MatchTopologyResult::FullMatch) {
728 std::vector<TSurface> overlaps{current_surface}; 717 std::vector<TSurface> overlaps{current_surface};
729 return RecycleSurface(overlaps, params, gpu_addr, preserve_contents, 718 return RecycleSurface(overlaps, params, gpu_addr, topological_result);
730 topological_result);
731 } 719 }
732 720
733 const auto struct_result = current_surface->MatchesStructure(params); 721 const auto struct_result = current_surface->MatchesStructure(params);
@@ -752,7 +740,7 @@ private:
752 740
753 // If none are found, we are done. we just load the surface and create it. 741 // If none are found, we are done. we just load the surface and create it.
754 if (overlaps.empty()) { 742 if (overlaps.empty()) {
755 return InitializeSurface(gpu_addr, params, preserve_contents); 743 return InitializeSurface(gpu_addr, params);
756 } 744 }
757 745
758 // Step 3 746 // Step 3
@@ -762,15 +750,13 @@ private:
762 for (const auto& surface : overlaps) { 750 for (const auto& surface : overlaps) {
763 const auto topological_result = surface->MatchesTopology(params); 751 const auto topological_result = surface->MatchesTopology(params);
764 if (topological_result != MatchTopologyResult::FullMatch) { 752 if (topological_result != MatchTopologyResult::FullMatch) {
765 return RecycleSurface(overlaps, params, gpu_addr, preserve_contents, 753 return RecycleSurface(overlaps, params, gpu_addr, topological_result);
766 topological_result);
767 } 754 }
768 } 755 }
769 756
770 // Check if it's a 3D texture 757 // Check if it's a 3D texture
771 if (params.block_depth > 0) { 758 if (params.block_depth > 0) {
772 auto surface = 759 auto surface = Manage3DSurfaces(overlaps, params, gpu_addr, cpu_addr);
773 Manage3DSurfaces(overlaps, params, gpu_addr, cpu_addr, preserve_contents);
774 if (surface) { 760 if (surface) {
775 return *surface; 761 return *surface;
776 } 762 }
@@ -790,8 +776,7 @@ private:
790 return *view; 776 return *view;
791 } 777 }
792 } 778 }
793 return RecycleSurface(overlaps, params, gpu_addr, preserve_contents, 779 return RecycleSurface(overlaps, params, gpu_addr, MatchTopologyResult::FullMatch);
794 MatchTopologyResult::FullMatch);
795 } 780 }
796 // Now we check if the candidate is a mipmap/layer of the overlap 781 // Now we check if the candidate is a mipmap/layer of the overlap
797 std::optional<TView> view = 782 std::optional<TView> view =
@@ -815,7 +800,7 @@ private:
815 pair.first->EmplaceView(params, gpu_addr, candidate_size); 800 pair.first->EmplaceView(params, gpu_addr, candidate_size);
816 if (mirage_view) 801 if (mirage_view)
817 return {pair.first, *mirage_view}; 802 return {pair.first, *mirage_view};
818 return RecycleSurface(overlaps, params, gpu_addr, preserve_contents, 803 return RecycleSurface(overlaps, params, gpu_addr,
819 MatchTopologyResult::FullMatch); 804 MatchTopologyResult::FullMatch);
820 } 805 }
821 return {current_surface, *view}; 806 return {current_surface, *view};
@@ -831,8 +816,7 @@ private:
831 } 816 }
832 } 817 }
833 // We failed all the tests, recycle the overlaps into a new texture. 818 // We failed all the tests, recycle the overlaps into a new texture.
834 return RecycleSurface(overlaps, params, gpu_addr, preserve_contents, 819 return RecycleSurface(overlaps, params, gpu_addr, MatchTopologyResult::FullMatch);
835 MatchTopologyResult::FullMatch);
836 } 820 }
837 821
838 /** 822 /**
@@ -990,10 +974,10 @@ private:
990 } 974 }
991 975
992 std::pair<TSurface, TView> InitializeSurface(GPUVAddr gpu_addr, const SurfaceParams& params, 976 std::pair<TSurface, TView> InitializeSurface(GPUVAddr gpu_addr, const SurfaceParams& params,
993 bool preserve_contents) { 977 bool do_load = true) {
994 auto new_surface{GetUncachedSurface(gpu_addr, params)}; 978 auto new_surface{GetUncachedSurface(gpu_addr, params)};
995 Register(new_surface); 979 Register(new_surface);
996 if (preserve_contents) { 980 if (do_load) {
997 LoadSurface(new_surface); 981 LoadSurface(new_surface);
998 } 982 }
999 return {new_surface, new_surface->GetMainView()}; 983 return {new_surface, new_surface->GetMainView()};
diff --git a/src/video_core/textures/astc.cpp b/src/video_core/textures/astc.cpp
index 062b4f252..365bde2f1 100644
--- a/src/video_core/textures/astc.cpp
+++ b/src/video_core/textures/astc.cpp
@@ -20,6 +20,8 @@
20 #include <cstring> 20 #include <cstring>
21 #include <vector> 21 #include <vector>
22 22
23 #include <boost/container/static_vector.hpp>
24
23 #include "common/common_types.h" 25 #include "common/common_types.h"
24 26
25 #include "video_core/textures/astc.h" 27 #include "video_core/textures/astc.h"
@@ -39,25 +41,25 @@ constexpr u32 Popcnt(u32 n) {
39 41
40 class InputBitStream { 42 class InputBitStream {
41 public: 43 public:
42 explicit InputBitStream(const u8* ptr, std::size_t start_offset = 0) 44 constexpr explicit InputBitStream(const u8* ptr, std::size_t start_offset = 0)
43 : m_CurByte(ptr), m_NextBit(start_offset % 8) {} 45 : cur_byte{ptr}, next_bit{start_offset % 8} {}
44 46
45 std::size_t GetBitsRead() const { 47 constexpr std::size_t GetBitsRead() const {
46 return m_BitsRead; 48 return bits_read;
47 } 49 }
48 50
49 u32 ReadBit() { 51 constexpr bool ReadBit() {
50 u32 bit = *m_CurByte >> m_NextBit++; 52 const bool bit = (*cur_byte >> next_bit++) & 1;
51 while (m_NextBit >= 8) { 53 while (next_bit >= 8) {
52 m_NextBit -= 8; 54 next_bit -= 8;
53 m_CurByte++; 55 cur_byte++;
54 } 56 }
55 57
56 m_BitsRead++; 58 bits_read++;
57 return bit & 1; 59 return bit;
58 } 60 }
59 61
60 u32 ReadBits(std::size_t nBits) { 62 constexpr u32 ReadBits(std::size_t nBits) {
61 u32 ret = 0; 63 u32 ret = 0;
62 for (std::size_t i = 0; i < nBits; ++i) { 64 for (std::size_t i = 0; i < nBits; ++i) {
63 ret |= (ReadBit() & 1) << i; 65 ret |= (ReadBit() & 1) << i;
@@ -66,7 +68,7 @@ public:
66 } 68 }
67 69
68 template <std::size_t nBits> 70 template <std::size_t nBits>
69 u32 ReadBits() { 71 constexpr u32 ReadBits() {
70 u32 ret = 0; 72 u32 ret = 0;
71 for (std::size_t i = 0; i < nBits; ++i) { 73 for (std::size_t i = 0; i < nBits; ++i) {
72 ret |= (ReadBit() & 1) << i; 74 ret |= (ReadBit() & 1) << i;
@@ -75,64 +77,58 @@ public:
75 } 77 }
76 78
77private: 79private:
78 const u8* m_CurByte; 80 const u8* cur_byte;
79 std::size_t m_NextBit = 0; 81 std::size_t next_bit = 0;
80 std::size_t m_BitsRead = 0; 82 std::size_t bits_read = 0;
81}; 83};
82 84
83class OutputBitStream { 85class OutputBitStream {
84public: 86public:
85 explicit OutputBitStream(u8* ptr, s32 nBits = 0, s32 start_offset = 0) 87 constexpr explicit OutputBitStream(u8* ptr, std::size_t bits = 0, std::size_t start_offset = 0)
86 : m_NumBits(nBits), m_CurByte(ptr), m_NextBit(start_offset % 8) {} 88 : cur_byte{ptr}, num_bits{bits}, next_bit{start_offset % 8} {}
87
88 ~OutputBitStream() = default;
89 89
90 s32 GetBitsWritten() const { 90 constexpr std::size_t GetBitsWritten() const {
91 return m_BitsWritten; 91 return bits_written;
92 } 92 }
93 93
94 void WriteBitsR(u32 val, u32 nBits) { 94 constexpr void WriteBitsR(u32 val, u32 nBits) {
95 for (u32 i = 0; i < nBits; i++) { 95 for (u32 i = 0; i < nBits; i++) {
96 WriteBit((val >> (nBits - i - 1)) & 1); 96 WriteBit((val >> (nBits - i - 1)) & 1);
97 } 97 }
98 } 98 }
99 99
100 void WriteBits(u32 val, u32 nBits) { 100 constexpr void WriteBits(u32 val, u32 nBits) {
101 for (u32 i = 0; i < nBits; i++) { 101 for (u32 i = 0; i < nBits; i++) {
102 WriteBit((val >> i) & 1); 102 WriteBit((val >> i) & 1);
103 } 103 }
104 } 104 }
105 105
106private: 106private:
107 void WriteBit(s32 b) { 107 constexpr void WriteBit(bool b) {
108 108 if (bits_written >= num_bits) {
109 if (done)
110 return; 109 return;
110 }
111 111
112 const u32 mask = 1 << m_NextBit++; 112 const u32 mask = 1 << next_bit++;
113 113
114 // clear the bit 114 // clear the bit
115 *m_CurByte &= static_cast<u8>(~mask); 115 *cur_byte &= static_cast<u8>(~mask);
116 116
117 // Write the bit, if necessary 117 // Write the bit, if necessary
118 if (b) 118 if (b)
119 *m_CurByte |= static_cast<u8>(mask); 119 *cur_byte |= static_cast<u8>(mask);
120 120
121 // Next byte? 121 // Next byte?
122 if (m_NextBit >= 8) { 122 if (next_bit >= 8) {
123 m_CurByte += 1; 123 cur_byte += 1;
124 m_NextBit = 0; 124 next_bit = 0;
125 } 125 }
126
127 done = done || ++m_BitsWritten >= m_NumBits;
128 } 126 }
129 127
130 s32 m_BitsWritten = 0; 128 u8* cur_byte;
131 const s32 m_NumBits; 129 std::size_t num_bits;
132 u8* m_CurByte; 130 std::size_t bits_written = 0;
133 s32 m_NextBit = 0; 131 std::size_t next_bit = 0;
134
135 bool done = false;
136}; 132};
137 133
138template <typename IntType> 134template <typename IntType>
@@ -195,9 +191,13 @@ struct IntegerEncodedValue {
195 u32 trit_value; 191 u32 trit_value;
196 }; 192 };
197}; 193};
194using IntegerEncodedVector = boost::container::static_vector<
195 IntegerEncodedValue, 64,
196 boost::container::static_vector_options<
197 boost::container::inplace_alignment<alignof(IntegerEncodedValue)>,
198 boost::container::throw_on_overflow<false>>::type>;
198 199
199static void DecodeTritBlock(InputBitStream& bits, std::vector<IntegerEncodedValue>& result, 200static void DecodeTritBlock(InputBitStream& bits, IntegerEncodedVector& result, u32 nBitsPerValue) {
200 u32 nBitsPerValue) {
201 // Implement the algorithm in section C.2.12 201 // Implement the algorithm in section C.2.12
202 u32 m[5]; 202 u32 m[5];
203 u32 t[5]; 203 u32 t[5];
@@ -255,7 +255,7 @@ static void DecodeTritBlock(InputBitStream& bits, std::vector<IntegerEncodedValu
255 } 255 }
256} 256}
257 257
258static void DecodeQus32Block(InputBitStream& bits, std::vector<IntegerEncodedValue>& result, 258static void DecodeQus32Block(InputBitStream& bits, IntegerEncodedVector& result,
259 u32 nBitsPerValue) { 259 u32 nBitsPerValue) {
260 // Implement the algorithm in section C.2.12 260 // Implement the algorithm in section C.2.12
261 u32 m[3]; 261 u32 m[3];
@@ -343,8 +343,8 @@ static constexpr std::array EncodingsValues = MakeEncodedValues();
343// Fills result with the values that are encoded in the given 343// Fills result with the values that are encoded in the given
344// bitstream. We must know beforehand what the maximum possible 344// bitstream. We must know beforehand what the maximum possible
345// value is, and how many values we're decoding. 345// value is, and how many values we're decoding.
346static void DecodeIntegerSequence(std::vector<IntegerEncodedValue>& result, InputBitStream& bits, 346static void DecodeIntegerSequence(IntegerEncodedVector& result, InputBitStream& bits, u32 maxRange,
347 u32 maxRange, u32 nValues) { 347 u32 nValues) {
348 // Determine encoding parameters 348 // Determine encoding parameters
349 IntegerEncodedValue val = EncodingsValues[maxRange]; 349 IntegerEncodedValue val = EncodingsValues[maxRange];
350 350
@@ -634,12 +634,14 @@ static void FillError(u32* outBuf, u32 blockWidth, u32 blockHeight) {
634// Replicates low numBits such that [(toBit - 1):(toBit - 1 - fromBit)] 634// Replicates low numBits such that [(toBit - 1):(toBit - 1 - fromBit)]
635// is the same as [(numBits - 1):0] and repeats all the way down. 635// is the same as [(numBits - 1):0] and repeats all the way down.
636template <typename IntType> 636template <typename IntType>
637static IntType Replicate(IntType val, u32 numBits, u32 toBit) { 637static constexpr IntType Replicate(IntType val, u32 numBits, u32 toBit) {
638 if (numBits == 0) 638 if (numBits == 0) {
639 return 0; 639 return 0;
640 if (toBit == 0) 640 }
641 if (toBit == 0) {
641 return 0; 642 return 0;
642 IntType v = val & static_cast<IntType>((1 << numBits) - 1); 643 }
644 const IntType v = val & static_cast<IntType>((1 << numBits) - 1);
643 IntType res = v; 645 IntType res = v;
644 u32 reslen = numBits; 646 u32 reslen = numBits;
645 while (reslen < toBit) { 647 while (reslen < toBit) {
@@ -656,6 +658,89 @@ static IntType Replicate(IntType val, u32 numBits, u32 toBit) {
656 return res; 658 return res;
657} 659}
658 660
661static constexpr std::size_t NumReplicateEntries(u32 num_bits) {
662 return std::size_t(1) << num_bits;
663}
664
665template <typename IntType, u32 num_bits, u32 to_bit>
666static constexpr auto MakeReplicateTable() {
667 std::array<IntType, NumReplicateEntries(num_bits)> table{};
668 for (IntType value = 0; value < static_cast<IntType>(std::size(table)); ++value) {
669 table[value] = Replicate(value, num_bits, to_bit);
670 }
671 return table;
672}
673
674static constexpr auto REPLICATE_BYTE_TO_16_TABLE = MakeReplicateTable<u32, 8, 16>();
675static constexpr u32 ReplicateByteTo16(std::size_t value) {
676 return REPLICATE_BYTE_TO_16_TABLE[value];
677}
678
679static constexpr auto REPLICATE_BIT_TO_7_TABLE = MakeReplicateTable<u32, 1, 7>();
680static constexpr u32 ReplicateBitTo7(std::size_t value) {
681 return REPLICATE_BIT_TO_7_TABLE[value];
682}
683
684static constexpr auto REPLICATE_BIT_TO_9_TABLE = MakeReplicateTable<u32, 1, 9>();
685static constexpr u32 ReplicateBitTo9(std::size_t value) {
686 return REPLICATE_BIT_TO_9_TABLE[value];
687}
688
689static constexpr auto REPLICATE_1_BIT_TO_8_TABLE = MakeReplicateTable<u32, 1, 8>();
690static constexpr auto REPLICATE_2_BIT_TO_8_TABLE = MakeReplicateTable<u32, 2, 8>();
691static constexpr auto REPLICATE_3_BIT_TO_8_TABLE = MakeReplicateTable<u32, 3, 8>();
692static constexpr auto REPLICATE_4_BIT_TO_8_TABLE = MakeReplicateTable<u32, 4, 8>();
693static constexpr auto REPLICATE_5_BIT_TO_8_TABLE = MakeReplicateTable<u32, 5, 8>();
694static constexpr auto REPLICATE_6_BIT_TO_8_TABLE = MakeReplicateTable<u32, 6, 8>();
695static constexpr auto REPLICATE_7_BIT_TO_8_TABLE = MakeReplicateTable<u32, 7, 8>();
696static constexpr auto REPLICATE_8_BIT_TO_8_TABLE = MakeReplicateTable<u32, 8, 8>();
697/// Use a precompiled table with the most common usages, if it's not in the expected range, fallback
698/// to the runtime implementation
699static constexpr u32 FastReplicateTo8(u32 value, u32 num_bits) {
700 switch (num_bits) {
701 case 1:
702 return REPLICATE_1_BIT_TO_8_TABLE[value];
703 case 2:
704 return REPLICATE_2_BIT_TO_8_TABLE[value];
705 case 3:
706 return REPLICATE_3_BIT_TO_8_TABLE[value];
707 case 4:
708 return REPLICATE_4_BIT_TO_8_TABLE[value];
709 case 5:
710 return REPLICATE_5_BIT_TO_8_TABLE[value];
711 case 6:
712 return REPLICATE_6_BIT_TO_8_TABLE[value];
713 case 7:
714 return REPLICATE_7_BIT_TO_8_TABLE[value];
715 case 8:
716 return REPLICATE_8_BIT_TO_8_TABLE[value];
717 default:
718 return Replicate(value, num_bits, 8);
719 }
720}
721
722static constexpr auto REPLICATE_1_BIT_TO_6_TABLE = MakeReplicateTable<u32, 1, 6>();
723static constexpr auto REPLICATE_2_BIT_TO_6_TABLE = MakeReplicateTable<u32, 2, 6>();
724static constexpr auto REPLICATE_3_BIT_TO_6_TABLE = MakeReplicateTable<u32, 3, 6>();
725static constexpr auto REPLICATE_4_BIT_TO_6_TABLE = MakeReplicateTable<u32, 4, 6>();
726static constexpr auto REPLICATE_5_BIT_TO_6_TABLE = MakeReplicateTable<u32, 5, 6>();
727static constexpr u32 FastReplicateTo6(u32 value, u32 num_bits) {
728 switch (num_bits) {
729 case 1:
730 return REPLICATE_1_BIT_TO_6_TABLE[value];
731 case 2:
732 return REPLICATE_2_BIT_TO_6_TABLE[value];
733 case 3:
734 return REPLICATE_3_BIT_TO_6_TABLE[value];
735 case 4:
736 return REPLICATE_4_BIT_TO_6_TABLE[value];
737 case 5:
738 return REPLICATE_5_BIT_TO_6_TABLE[value];
739 default:
740 return Replicate(value, num_bits, 6);
741 }
742}
743
659class Pixel { 744class Pixel {
660protected: 745protected:
661 using ChannelType = s16; 746 using ChannelType = s16;
@@ -674,10 +759,10 @@ public:
674 // significant bits when going from larger to smaller bit depth 759 // significant bits when going from larger to smaller bit depth
675 // or by repeating the most significant bits when going from 760 // or by repeating the most significant bits when going from
676 // smaller to larger bit depths. 761 // smaller to larger bit depths.
677 void ChangeBitDepth(const u8 (&depth)[4]) { 762 void ChangeBitDepth() {
678 for (u32 i = 0; i < 4; i++) { 763 for (u32 i = 0; i < 4; i++) {
679 Component(i) = ChangeBitDepth(Component(i), m_BitDepth[i], depth[i]); 764 Component(i) = ChangeBitDepth(Component(i), m_BitDepth[i]);
680 m_BitDepth[i] = depth[i]; 765 m_BitDepth[i] = 8;
681 } 766 }
682 } 767 }
683 768
@@ -689,28 +774,23 @@ public:
689 774
690 // Changes the bit depth of a single component. See the comment 775 // Changes the bit depth of a single component. See the comment
691 // above for how we do this. 776 // above for how we do this.
692 static ChannelType ChangeBitDepth(Pixel::ChannelType val, u8 oldDepth, u8 newDepth) { 777 static ChannelType ChangeBitDepth(Pixel::ChannelType val, u8 oldDepth) {
693 assert(newDepth <= 8);
694 assert(oldDepth <= 8); 778 assert(oldDepth <= 8);
695 779
696 if (oldDepth == newDepth) { 780 if (oldDepth == 8) {
697 // Do nothing 781 // Do nothing
698 return val; 782 return val;
699 } else if (oldDepth == 0 && newDepth != 0) { 783 } else if (oldDepth == 0) {
700 return static_cast<ChannelType>((1 << newDepth) - 1); 784 return static_cast<ChannelType>((1 << 8) - 1);
701 } else if (newDepth > oldDepth) { 785 } else if (8 > oldDepth) {
702 return Replicate(val, oldDepth, newDepth); 786 return static_cast<ChannelType>(FastReplicateTo8(static_cast<u32>(val), oldDepth));
703 } else { 787 } else {
704 // oldDepth > newDepth 788 // oldDepth > newDepth
705 if (newDepth == 0) { 789 const u8 bitsWasted = static_cast<u8>(oldDepth - 8);
706 return 0xFF; 790 u16 v = static_cast<u16>(val);
707 } else { 791 v = static_cast<u16>((v + (1 << (bitsWasted - 1))) >> bitsWasted);
708 u8 bitsWasted = static_cast<u8>(oldDepth - newDepth); 792 v = ::std::min<u16>(::std::max<u16>(0, v), static_cast<u16>((1 << 8) - 1));
709 u16 v = static_cast<u16>(val); 793 return static_cast<u8>(v);
710 v = static_cast<u16>((v + (1 << (bitsWasted - 1))) >> bitsWasted);
711 v = ::std::min<u16>(::std::max<u16>(0, v), static_cast<u16>((1 << newDepth) - 1));
712 return static_cast<u8>(v);
713 }
714 } 794 }
715 795
716 assert(false && "We shouldn't get here."); 796 assert(false && "We shouldn't get here.");
@@ -760,8 +840,7 @@ public:
760 // up in the most-significant byte. 840 // up in the most-significant byte.
761 u32 Pack() const { 841 u32 Pack() const {
762 Pixel eightBit(*this); 842 Pixel eightBit(*this);
763 const u8 eightBitDepth[4] = {8, 8, 8, 8}; 843 eightBit.ChangeBitDepth();
764 eightBit.ChangeBitDepth(eightBitDepth);
765 844
766 u32 r = 0; 845 u32 r = 0;
767 r |= eightBit.A(); 846 r |= eightBit.A();
@@ -816,8 +895,7 @@ static void DecodeColorValues(u32* out, u8* data, const u32* modes, const u32 nP
816 } 895 }
817 896
818 // We now have enough to decode our integer sequence. 897 // We now have enough to decode our integer sequence.
819 std::vector<IntegerEncodedValue> decodedColorValues; 898 IntegerEncodedVector decodedColorValues;
820 decodedColorValues.reserve(32);
821 899
822 InputBitStream colorStream(data); 900 InputBitStream colorStream(data);
823 DecodeIntegerSequence(decodedColorValues, colorStream, range, nValues); 901 DecodeIntegerSequence(decodedColorValues, colorStream, range, nValues);
@@ -839,12 +917,12 @@ static void DecodeColorValues(u32* out, u8* data, const u32* modes, const u32 nP
839 917
840 u32 A = 0, B = 0, C = 0, D = 0; 918 u32 A = 0, B = 0, C = 0, D = 0;
841 // A is just the lsb replicated 9 times. 919 // A is just the lsb replicated 9 times.
842 A = Replicate(bitval & 1, 1, 9); 920 A = ReplicateBitTo9(bitval & 1);
843 921
844 switch (val.encoding) { 922 switch (val.encoding) {
845 // Replicate bits 923 // Replicate bits
846 case IntegerEncoding::JustBits: 924 case IntegerEncoding::JustBits:
847 out[outIdx++] = Replicate(bitval, bitlen, 8); 925 out[outIdx++] = FastReplicateTo8(bitval, bitlen);
848 break; 926 break;
849 927
850 // Use algorithm in C.2.13 928 // Use algorithm in C.2.13
@@ -962,13 +1040,13 @@ static u32 UnquantizeTexelWeight(const IntegerEncodedValue& val) {
962 u32 bitval = val.bit_value; 1040 u32 bitval = val.bit_value;
963 u32 bitlen = val.num_bits; 1041 u32 bitlen = val.num_bits;
964 1042
965 u32 A = Replicate(bitval & 1, 1, 7); 1043 u32 A = ReplicateBitTo7(bitval & 1);
966 u32 B = 0, C = 0, D = 0; 1044 u32 B = 0, C = 0, D = 0;
967 1045
968 u32 result = 0; 1046 u32 result = 0;
969 switch (val.encoding) { 1047 switch (val.encoding) {
970 case IntegerEncoding::JustBits: 1048 case IntegerEncoding::JustBits:
971 result = Replicate(bitval, bitlen, 6); 1049 result = FastReplicateTo6(bitval, bitlen);
972 break; 1050 break;
973 1051
974 case IntegerEncoding::Trit: { 1052 case IntegerEncoding::Trit: {
@@ -1047,7 +1125,7 @@ static u32 UnquantizeTexelWeight(const IntegerEncodedValue& val) {
1047 return result; 1125 return result;
1048} 1126}
1049 1127
1050static void UnquantizeTexelWeights(u32 out[2][144], const std::vector<IntegerEncodedValue>& weights, 1128static void UnquantizeTexelWeights(u32 out[2][144], const IntegerEncodedVector& weights,
1051 const TexelWeightParams& params, const u32 blockWidth, 1129 const TexelWeightParams& params, const u32 blockWidth,
1052 const u32 blockHeight) { 1130 const u32 blockHeight) {
1053 u32 weightIdx = 0; 1131 u32 weightIdx = 0;
@@ -1545,8 +1623,7 @@ static void DecompressBlock(const u8 inBuf[16], const u32 blockWidth, const u32
1545 static_cast<u8>((1 << (weightParams.GetPackedBitSize() % 8)) - 1); 1623 static_cast<u8>((1 << (weightParams.GetPackedBitSize() % 8)) - 1);
1546 memset(texelWeightData + clearByteStart, 0, 16 - clearByteStart); 1624 memset(texelWeightData + clearByteStart, 0, 16 - clearByteStart);
1547 1625
1548 std::vector<IntegerEncodedValue> texelWeightValues; 1626 IntegerEncodedVector texelWeightValues;
1549 texelWeightValues.reserve(64);
1550 1627
1551 InputBitStream weightStream(texelWeightData); 1628 InputBitStream weightStream(texelWeightData);
1552 1629
@@ -1568,9 +1645,9 @@ static void DecompressBlock(const u8 inBuf[16], const u32 blockWidth, const u32
1568 Pixel p; 1645 Pixel p;
1569 for (u32 c = 0; c < 4; c++) { 1646 for (u32 c = 0; c < 4; c++) {
1570 u32 C0 = endpos32s[partition][0].Component(c); 1647 u32 C0 = endpos32s[partition][0].Component(c);
1571 C0 = Replicate(C0, 8, 16); 1648 C0 = ReplicateByteTo16(C0);
1572 u32 C1 = endpos32s[partition][1].Component(c); 1649 u32 C1 = endpos32s[partition][1].Component(c);
1573 C1 = Replicate(C1, 8, 16); 1650 C1 = ReplicateByteTo16(C1);
1574 1651
1575 u32 plane = 0; 1652 u32 plane = 0;
1576 if (weightParams.m_bDualPlane && (((planeIdx + 1) & 3) == c)) { 1653 if (weightParams.m_bDualPlane && (((planeIdx + 1) & 3) == c)) {