Diffstat (limited to 'src')
-rw-r--r--  src/video_core/renderer_opengl/gl_rasterizer.cpp         |   8
-rw-r--r--  src/video_core/renderer_opengl/gl_shader_decompiler.cpp  |  12
-rw-r--r--  src/video_core/renderer_opengl/gl_texture_cache.cpp      |   4
-rw-r--r--  src/video_core/renderer_vulkan/vk_rasterizer.cpp         |   4
-rw-r--r--  src/video_core/shader/decode/conversion.cpp              | 113
-rw-r--r--  src/video_core/shader/decode/texture.cpp                 |  14
-rw-r--r--  src/video_core/texture_cache/texture_cache.h             |  66
-rw-r--r--  src/video_core/textures/astc.cpp                         | 241
8 files changed, 298 insertions, 164 deletions
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 75ef8d541..f31d960c7 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -345,7 +345,7 @@ void RasterizerOpenGL::ConfigureFramebuffers() {
 
     texture_cache.GuardRenderTargets(true);
 
-    View depth_surface = texture_cache.GetDepthBufferSurface(true);
+    View depth_surface = texture_cache.GetDepthBufferSurface();
 
     const auto& regs = gpu.regs;
     UNIMPLEMENTED_IF(regs.rt_separate_frag_data == 0);
@@ -354,7 +354,7 @@ void RasterizerOpenGL::ConfigureFramebuffers() {
     FramebufferCacheKey key;
     const auto colors_count = static_cast<std::size_t>(regs.rt_control.count);
     for (std::size_t index = 0; index < colors_count; ++index) {
-        View color_surface{texture_cache.GetColorBufferSurface(index, true)};
+        View color_surface{texture_cache.GetColorBufferSurface(index)};
         if (!color_surface) {
             continue;
         }
@@ -387,12 +387,12 @@ void RasterizerOpenGL::ConfigureClearFramebuffer(bool using_color_fb, bool using
     View color_surface;
     if (using_color_fb) {
         const std::size_t index = regs.clear_buffers.RT;
-        color_surface = texture_cache.GetColorBufferSurface(index, true);
+        color_surface = texture_cache.GetColorBufferSurface(index);
         texture_cache.MarkColorBufferInUse(index);
     }
     View depth_surface;
     if (using_depth_fb || using_stencil_fb) {
-        depth_surface = texture_cache.GetDepthBufferSurface(true);
+        depth_surface = texture_cache.GetDepthBufferSurface();
         texture_cache.MarkDepthBufferInUse();
     }
     texture_cache.GuardRenderTargets(false);
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index 160ae4340..1f1f01313 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -1819,15 +1819,15 @@ private:
     }
 
     Expression HMergeH0(Operation operation) {
-        std::string dest = VisitOperand(operation, 0).AsUint();
-        std::string src = VisitOperand(operation, 1).AsUint();
-        return {fmt::format("(({} & 0x0000FFFFU) | ({} & 0xFFFF0000U))", src, dest), Type::Uint};
+        const std::string dest = VisitOperand(operation, 0).AsUint();
+        const std::string src = VisitOperand(operation, 1).AsUint();
+        return {fmt::format("bitfieldInsert({}, {}, 0, 16)", dest, src), Type::Uint};
     }
 
     Expression HMergeH1(Operation operation) {
-        std::string dest = VisitOperand(operation, 0).AsUint();
-        std::string src = VisitOperand(operation, 1).AsUint();
-        return {fmt::format("(({} & 0x0000FFFFU) | ({} & 0xFFFF0000U))", dest, src), Type::Uint};
+        const std::string dest = VisitOperand(operation, 0).AsUint();
+        const std::string src = VisitOperand(operation, 1).AsUint();
+        return {fmt::format("bitfieldInsert({}, {}, 16, 16)", dest, src), Type::Uint};
    }
 
    Expression HPack2(Operation operation) {
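Aside: GLSL's bitfieldInsert(base, insert, offset, bits) writes the low `bits` of `insert` into `base` starting at bit `offset`. A self-contained C++ model of the builtin (our sketch of the GLSL spec, not yuzu code) makes the two merge cases emitted above concrete:

```cpp
#include <cassert>
#include <cstdint>

// C++ model of GLSL bitfieldInsert(base, insert, offset, bits),
// assuming offset + bits <= 32.
static std::uint32_t BitfieldInsert(std::uint32_t base, std::uint32_t insert,
                                    std::uint32_t offset, std::uint32_t bits) {
    const std::uint32_t field = bits >= 32 ? ~0u : ((1u << bits) - 1u);
    return (base & ~(field << offset)) | ((insert & field) << offset);
}

int main() {
    const std::uint32_t dest = 0xAAAABBBBu;
    const std::uint32_t src = 0xCCCCDDDDu;
    // HMergeH0: the low half comes from src, the high half stays from dest.
    assert(BitfieldInsert(dest, src, 0, 16) == 0xAAAADDDDu);
    // HMergeH1: the low 16 bits of src are inserted into the high half of dest.
    assert(BitfieldInsert(dest, src, 16, 16) == 0xDDDDBBBBu);
}
```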
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp
index 36590a6d0..0b4d999d7 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp
@@ -411,14 +411,13 @@ CachedSurfaceView::~CachedSurfaceView() = default;
 void CachedSurfaceView::Attach(GLenum attachment, GLenum target) const {
     ASSERT(params.num_levels == 1);
 
-    const GLuint texture = surface.GetTexture();
     if (params.num_layers > 1) {
         // Layered framebuffer attachments
         UNIMPLEMENTED_IF(params.base_layer != 0);
 
         switch (params.target) {
         case SurfaceTarget::Texture2DArray:
-            glFramebufferTexture(target, attachment, texture, params.base_level);
+            glFramebufferTexture(target, attachment, GetTexture(), params.base_level);
             break;
         default:
             UNIMPLEMENTED();
@@ -427,6 +426,7 @@ void CachedSurfaceView::Attach(GLenum attachment, GLenum target) const {
     }
 
     const GLenum view_target = surface.GetTarget();
+    const GLuint texture = surface.GetTexture();
     switch (surface.GetSurfaceParams().target) {
     case SurfaceTarget::Texture1D:
         glFramebufferTexture1D(target, attachment, view_target, texture, params.base_level);
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index 0a2ea4fd4..6b99cbbbc 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -599,7 +599,7 @@ RasterizerVulkan::Texceptions RasterizerVulkan::UpdateAttachments() {
     Texceptions texceptions;
     for (std::size_t rt = 0; rt < Maxwell::NumRenderTargets; ++rt) {
         if (update_rendertargets) {
-            color_attachments[rt] = texture_cache.GetColorBufferSurface(rt, true);
+            color_attachments[rt] = texture_cache.GetColorBufferSurface(rt);
         }
         if (color_attachments[rt] && WalkAttachmentOverlaps(*color_attachments[rt])) {
             texceptions[rt] = true;
@@ -607,7 +607,7 @@ RasterizerVulkan::Texceptions RasterizerVulkan::UpdateAttachments() {
     }
 
     if (update_rendertargets) {
-        zeta_attachment = texture_cache.GetDepthBufferSurface(true);
+        zeta_attachment = texture_cache.GetDepthBufferSurface();
     }
     if (zeta_attachment && WalkAttachmentOverlaps(*zeta_attachment)) {
         texceptions[ZETA_TEXCEPTION_INDEX] = true;
diff --git a/src/video_core/shader/decode/conversion.cpp b/src/video_core/shader/decode/conversion.cpp
index c72690b2b..b9989c88c 100644
--- a/src/video_core/shader/decode/conversion.cpp
+++ b/src/video_core/shader/decode/conversion.cpp
@@ -2,6 +2,10 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#include <limits>
+#include <optional>
+#include <utility>
+
 #include "common/assert.h"
 #include "common/common_types.h"
 #include "video_core/engines/shader_bytecode.h"
@@ -15,9 +19,49 @@ using Tegra::Shader::OpCode;
 using Tegra::Shader::Register;
 
 namespace {
+
 constexpr OperationCode GetFloatSelector(u64 selector) {
     return selector == 0 ? OperationCode::FCastHalf0 : OperationCode::FCastHalf1;
 }
+
+constexpr u32 SizeInBits(Register::Size size) {
+    switch (size) {
+    case Register::Size::Byte:
+        return 8;
+    case Register::Size::Short:
+        return 16;
+    case Register::Size::Word:
+        return 32;
+    case Register::Size::Long:
+        return 64;
+    }
+    return 0;
+}
+
+constexpr std::optional<std::pair<s32, s32>> IntegerSaturateBounds(Register::Size src_size,
+                                                                   Register::Size dst_size,
+                                                                   bool src_signed,
+                                                                   bool dst_signed) {
+    const u32 dst_bits = SizeInBits(dst_size);
+    if (src_size == Register::Size::Word && dst_size == Register::Size::Word) {
+        if (src_signed == dst_signed) {
+            return std::nullopt;
+        }
+        return std::make_pair(0, std::numeric_limits<s32>::max());
+    }
+    if (dst_signed) {
+        // Signed destination, clamp to [-128, 127] for instance
+        return std::make_pair(-(1 << (dst_bits - 1)), (1 << (dst_bits - 1)) - 1);
+    } else {
+        // Unsigned destination
+        if (dst_bits == 32) {
+            // Avoid shifting by 32, that is undefined behavior
+            return std::make_pair(0, s32(std::numeric_limits<u32>::max()));
+        }
+        return std::make_pair(0, (1 << dst_bits) - 1);
+    }
+}
+
 } // Anonymous namespace
 
 u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
@@ -28,14 +72,13 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
     case OpCode::Id::I2I_R:
     case OpCode::Id::I2I_C:
     case OpCode::Id::I2I_IMM: {
-        UNIMPLEMENTED_IF(instr.conversion.int_src.selector != 0);
-        UNIMPLEMENTED_IF(instr.conversion.dst_size != Register::Size::Word);
-        UNIMPLEMENTED_IF(instr.alu.saturate_d);
+        const bool src_signed = instr.conversion.is_input_signed;
+        const bool dst_signed = instr.conversion.is_output_signed;
+        const Register::Size src_size = instr.conversion.src_size;
+        const Register::Size dst_size = instr.conversion.dst_size;
+        const u32 selector = static_cast<u32>(instr.conversion.int_src.selector);
 
-        const bool input_signed = instr.conversion.is_input_signed;
-        const bool output_signed = instr.conversion.is_output_signed;
-
-        Node value = [&]() {
+        Node value = [this, instr, opcode] {
             switch (opcode->get().GetId()) {
             case OpCode::Id::I2I_R:
                 return GetRegister(instr.gpr20);
@@ -48,16 +91,60 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
                 return Immediate(0);
             }
         }();
-        value = ConvertIntegerSize(value, instr.conversion.src_size, input_signed);
 
-        value = GetOperandAbsNegInteger(value, instr.conversion.abs_a, instr.conversion.negate_a,
-                                        input_signed);
-        if (input_signed != output_signed) {
-            value = SignedOperation(OperationCode::ICastUnsigned, output_signed, NO_PRECISE, value);
+        // Ensure the source selector is valid
+        switch (instr.conversion.src_size) {
+        case Register::Size::Byte:
+            break;
+        case Register::Size::Short:
+            ASSERT(selector == 0 || selector == 2);
+            break;
+        default:
+            ASSERT(selector == 0);
+            break;
+        }
+
+        if (src_size != Register::Size::Word || selector != 0) {
+            value = SignedOperation(OperationCode::IBitfieldExtract, src_signed, std::move(value),
+                                    Immediate(selector * 8), Immediate(SizeInBits(src_size)));
+        }
+
+        value = GetOperandAbsNegInteger(std::move(value), instr.conversion.abs_a,
+                                        instr.conversion.negate_a, src_signed);
+
+        if (instr.alu.saturate_d) {
+            if (src_signed && !dst_signed) {
+                Node is_negative = Operation(OperationCode::LogicalUGreaterEqual, value,
+                                             Immediate(1 << (SizeInBits(src_size) - 1)));
+                value = Operation(OperationCode::Select, std::move(is_negative), Immediate(0),
+                                  std::move(value));
+
+                // Simplify generated expressions, this can be removed without semantic impact
+                SetTemporary(bb, 0, std::move(value));
+                value = GetTemporary(0);
+
+                if (dst_size != Register::Size::Word) {
+                    const Node limit = Immediate((1 << SizeInBits(dst_size)) - 1);
+                    Node is_large =
+                        Operation(OperationCode::LogicalUGreaterThan, std::move(value), limit);
+                    value = Operation(OperationCode::Select, std::move(is_large), limit,
+                                      std::move(value));
+                }
+            } else if (const std::optional bounds =
+                           IntegerSaturateBounds(src_size, dst_size, src_signed, dst_signed)) {
+                value = SignedOperation(OperationCode::IMax, src_signed, std::move(value),
+                                        Immediate(bounds->first));
+                value = SignedOperation(OperationCode::IMin, src_signed, std::move(value),
+                                        Immediate(bounds->second));
+            }
+        } else if (dst_size != Register::Size::Word) {
+            // No saturation, we only have to mask the result
+            Node mask = Immediate((1 << SizeInBits(dst_size)) - 1);
+            value = Operation(OperationCode::UBitwiseAnd, std::move(value), std::move(mask));
         }
 
         SetInternalFlagsFromInteger(bb, value, instr.generates_cc);
-        SetRegister(bb, instr.gpr0, value);
+        SetRegister(bb, instr.gpr0, std::move(value));
         break;
     }
     case OpCode::Id::I2F_R:
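Aside: the clamp ranges produced by the new IntegerSaturateBounds helper are easy to sanity-check in isolation. A stand-alone sketch follows; Size and Bounds are simplified stand-ins for the shader IR types, not the real declarations:

```cpp
#include <cassert>
#include <cstdint>
#include <limits>
#include <optional>
#include <utility>

enum class Size { Byte = 8, Short = 16, Word = 32 };
using Bounds_t = std::pair<std::int32_t, std::int32_t>;

// Simplified stand-in for the IntegerSaturateBounds helper added above.
static std::optional<Bounds_t> Bounds(Size src, Size dst, bool src_signed, bool dst_signed) {
    const int bits = static_cast<int>(dst);
    if (src == Size::Word && dst == Size::Word) {
        if (src_signed == dst_signed) {
            return std::nullopt; // same width and signedness: nothing to clamp
        }
        return Bounds_t{0, std::numeric_limits<std::int32_t>::max()};
    }
    if (dst_signed) {
        // Narrow signed destination (Byte or Short here, so bits - 1 < 31).
        return Bounds_t{-(1 << (bits - 1)), (1 << (bits - 1)) - 1};
    }
    if (bits == 32) {
        // Avoid the undefined 1 << 32.
        return Bounds_t{0, static_cast<std::int32_t>(std::numeric_limits<std::uint32_t>::max())};
    }
    return Bounds_t{0, (1 << bits) - 1};
}

int main() {
    // s32 -> s8 saturates to [-128, 127]; u32 -> u8 saturates to [0, 255].
    assert((Bounds(Size::Word, Size::Byte, true, true) == Bounds_t{-128, 127}));
    assert((Bounds(Size::Word, Size::Byte, false, false) == Bounds_t{0, 255}));
    // u32 -> s32 clamps to [0, INT32_MAX]; s32 -> s32 needs no saturation at all.
    assert((Bounds(Size::Word, Size::Word, false, true) ==
            Bounds_t{0, std::numeric_limits<std::int32_t>::max()}));
    assert(!Bounds(Size::Word, Size::Word, true, true));
}
```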
diff --git a/src/video_core/shader/decode/texture.cpp b/src/video_core/shader/decode/texture.cpp
index 48350e042..6c4a1358b 100644
--- a/src/video_core/shader/decode/texture.cpp
+++ b/src/video_core/shader/decode/texture.cpp
@@ -780,20 +780,6 @@ Node4 ShaderIR::GetTldsCode(Instruction instr, TextureType texture_type, bool is
     // When lod is used always is in gpr20
     const Node lod = lod_enabled ? GetRegister(instr.gpr20) : Immediate(0);
 
-    // Fill empty entries from the guest sampler
-    const std::size_t entry_coord_count = GetCoordCount(sampler.GetType());
-    if (type_coord_count != entry_coord_count) {
-        LOG_WARNING(HW_GPU, "Bound and built texture types mismatch");
-
-        // When the size is higher we insert zeroes
-        for (std::size_t i = type_coord_count; i < entry_coord_count; ++i) {
-            coords.push_back(GetRegister(Register::ZeroIndex));
-        }
-
-        // Then we ensure the size matches the number of entries (dropping unused values)
-        coords.resize(entry_coord_count);
-    }
-
     Node4 values;
     for (u32 element = 0; element < values.size(); ++element) {
         auto coords_copy = coords;
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index 88fe3e25f..cfc7fe6e9 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -108,7 +108,7 @@ public:
         }
 
         const auto params{SurfaceParams::CreateForTexture(format_lookup_table, tic, entry)};
-        const auto [surface, view] = GetSurface(gpu_addr, *cpu_addr, params, true, false);
+        const auto [surface, view] = GetSurface(gpu_addr, *cpu_addr, params, false);
         if (guard_samplers) {
             sampled_textures.push_back(surface);
         }
@@ -128,7 +128,7 @@ public:
            return GetNullSurface(SurfaceParams::ExpectedTarget(entry));
        }
        const auto params{SurfaceParams::CreateForImage(format_lookup_table, tic, entry)};
-        const auto [surface, view] = GetSurface(gpu_addr, *cpu_addr, params, true, false);
+        const auto [surface, view] = GetSurface(gpu_addr, *cpu_addr, params, false);
        if (guard_samplers) {
            sampled_textures.push_back(surface);
        }
@@ -143,7 +143,7 @@ public:
         return any_rt;
     }
 
-    TView GetDepthBufferSurface(bool preserve_contents) {
+    TView GetDepthBufferSurface() {
         std::lock_guard lock{mutex};
         auto& maxwell3d = system.GPU().Maxwell3D();
         if (!maxwell3d.dirty.flags[VideoCommon::Dirty::ZetaBuffer]) {
@@ -164,7 +164,7 @@ public:
             return {};
         }
         const auto depth_params{SurfaceParams::CreateForDepthBuffer(system)};
-        auto surface_view = GetSurface(gpu_addr, *cpu_addr, depth_params, preserve_contents, true);
+        auto surface_view = GetSurface(gpu_addr, *cpu_addr, depth_params, true);
         if (depth_buffer.target)
             depth_buffer.target->MarkAsRenderTarget(false, NO_RT);
         depth_buffer.target = surface_view.first;
@@ -174,7 +174,7 @@ public:
         return surface_view.second;
     }
 
-    TView GetColorBufferSurface(std::size_t index, bool preserve_contents) {
+    TView GetColorBufferSurface(std::size_t index) {
         std::lock_guard lock{mutex};
         ASSERT(index < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets);
         auto& maxwell3d = system.GPU().Maxwell3D();
@@ -204,9 +204,8 @@ public:
             return {};
         }
 
-        auto surface_view =
-            GetSurface(gpu_addr, *cpu_addr, SurfaceParams::CreateForFramebuffer(system, index),
-                       preserve_contents, true);
+        auto surface_view = GetSurface(gpu_addr, *cpu_addr,
+                                       SurfaceParams::CreateForFramebuffer(system, index), true);
         if (render_targets[index].target)
             render_targets[index].target->MarkAsRenderTarget(false, NO_RT);
         render_targets[index].target = surface_view.first;
@@ -260,9 +259,9 @@ public:
         const std::optional<VAddr> src_cpu_addr =
             system.GPU().MemoryManager().GpuToCpuAddress(src_gpu_addr);
         std::pair<TSurface, TView> dst_surface =
-            GetSurface(dst_gpu_addr, *dst_cpu_addr, dst_params, true, false);
+            GetSurface(dst_gpu_addr, *dst_cpu_addr, dst_params, false);
         std::pair<TSurface, TView> src_surface =
-            GetSurface(src_gpu_addr, *src_cpu_addr, src_params, true, false);
+            GetSurface(src_gpu_addr, *src_cpu_addr, src_params, false);
         ImageBlit(src_surface.second, dst_surface.second, copy_config);
         dst_surface.first->MarkAsModified(true, Tick());
     }
@@ -451,22 +450,18 @@ private:
     * @param overlaps The overlapping surfaces registered in the cache.
     * @param params The parameters for the new surface.
     * @param gpu_addr The starting address of the new surface.
-     * @param preserve_contents Indicates that the new surface should be loaded from memory or left
-     * blank.
     * @param untopological Indicates to the recycler that the texture has no way to match the
     * overlaps due to topological reasons.
     **/
    std::pair<TSurface, TView> RecycleSurface(std::vector<TSurface>& overlaps,
                                              const SurfaceParams& params, const GPUVAddr gpu_addr,
-                                              const bool preserve_contents,
                                              const MatchTopologyResult untopological) {
-        const bool do_load = preserve_contents && Settings::values.use_accurate_gpu_emulation;
        for (auto& surface : overlaps) {
            Unregister(surface);
        }
        switch (PickStrategy(overlaps, params, gpu_addr, untopological)) {
        case RecycleStrategy::Ignore: {
-            return InitializeSurface(gpu_addr, params, do_load);
+            return InitializeSurface(gpu_addr, params, Settings::values.use_accurate_gpu_emulation);
        }
        case RecycleStrategy::Flush: {
            std::sort(overlaps.begin(), overlaps.end(),
@@ -476,7 +471,7 @@ private:
            for (auto& surface : overlaps) {
                FlushSurface(surface);
            }
-            return InitializeSurface(gpu_addr, params, preserve_contents);
+            return InitializeSurface(gpu_addr, params);
        }
        case RecycleStrategy::BufferCopy: {
            auto new_surface = GetUncachedSurface(gpu_addr, params);
@@ -485,7 +480,7 @@ private:
        }
        default: {
            UNIMPLEMENTED_MSG("Unimplemented Texture Cache Recycling Strategy!");
-            return InitializeSurface(gpu_addr, params, do_load);
+            return InitializeSurface(gpu_addr, params);
        }
        }
    }
@@ -621,14 +616,11 @@ private:
     * @param params The parameters on the new surface.
     * @param gpu_addr The starting address of the new surface.
     * @param cache_addr The starting address of the new surface on physical memory.
-     * @param preserve_contents Indicates that the new surface should be loaded from memory or
-     * left blank.
     */
    std::optional<std::pair<TSurface, TView>> Manage3DSurfaces(std::vector<TSurface>& overlaps,
                                                               const SurfaceParams& params,
                                                               const GPUVAddr gpu_addr,
-                                                               const VAddr cpu_addr,
-                                                               bool preserve_contents) {
+                                                               const VAddr cpu_addr) {
        if (params.target == SurfaceTarget::Texture3D) {
            bool failed = false;
            if (params.num_levels > 1) {
@@ -677,7 +669,7 @@ private:
                    return std::nullopt;
                }
                Unregister(surface);
-                return InitializeSurface(gpu_addr, params, preserve_contents);
+                return InitializeSurface(gpu_addr, params);
            }
            return std::nullopt;
        }
@@ -688,7 +680,7 @@ private:
                    return {{surface, surface->GetMainView()}};
                }
            }
-            return InitializeSurface(gpu_addr, params, preserve_contents);
+            return InitializeSurface(gpu_addr, params);
        }
    }
 
@@ -711,13 +703,10 @@ private:
     *
     * @param gpu_addr The starting address of the candidate surface.
     * @param params The parameters on the candidate surface.
-     * @param preserve_contents Indicates that the new surface should be loaded from memory or
-     * left blank.
     * @param is_render Whether or not the surface is a render target.
     **/
    std::pair<TSurface, TView> GetSurface(const GPUVAddr gpu_addr, const VAddr cpu_addr,
-                                          const SurfaceParams& params, bool preserve_contents,
-                                          bool is_render) {
+                                          const SurfaceParams& params, bool is_render) {
        // Step 1
        // Check Level 1 Cache for a fast structural match. If candidate surface
        // matches at certain level we are pretty much done.
@@ -726,8 +715,7 @@ private:
        const auto topological_result = current_surface->MatchesTopology(params);
        if (topological_result != MatchTopologyResult::FullMatch) {
            std::vector<TSurface> overlaps{current_surface};
-            return RecycleSurface(overlaps, params, gpu_addr, preserve_contents,
-                                  topological_result);
+            return RecycleSurface(overlaps, params, gpu_addr, topological_result);
        }
 
        const auto struct_result = current_surface->MatchesStructure(params);
@@ -752,7 +740,7 @@ private:
 
        // If none are found, we are done. we just load the surface and create it.
        if (overlaps.empty()) {
-            return InitializeSurface(gpu_addr, params, preserve_contents);
+            return InitializeSurface(gpu_addr, params);
        }
 
        // Step 3
@@ -762,15 +750,13 @@ private:
        for (const auto& surface : overlaps) {
            const auto topological_result = surface->MatchesTopology(params);
            if (topological_result != MatchTopologyResult::FullMatch) {
-                return RecycleSurface(overlaps, params, gpu_addr, preserve_contents,
-                                      topological_result);
+                return RecycleSurface(overlaps, params, gpu_addr, topological_result);
            }
        }
 
        // Check if it's a 3D texture
        if (params.block_depth > 0) {
-            auto surface =
-                Manage3DSurfaces(overlaps, params, gpu_addr, cpu_addr, preserve_contents);
+            auto surface = Manage3DSurfaces(overlaps, params, gpu_addr, cpu_addr);
            if (surface) {
                return *surface;
            }
@@ -790,8 +776,7 @@ private:
                    return *view;
                }
            }
-            return RecycleSurface(overlaps, params, gpu_addr, preserve_contents,
-                                  MatchTopologyResult::FullMatch);
+            return RecycleSurface(overlaps, params, gpu_addr, MatchTopologyResult::FullMatch);
        }
        // Now we check if the candidate is a mipmap/layer of the overlap
        std::optional<TView> view =
@@ -815,7 +800,7 @@ private:
                pair.first->EmplaceView(params, gpu_addr, candidate_size);
                if (mirage_view)
                    return {pair.first, *mirage_view};
-                return RecycleSurface(overlaps, params, gpu_addr, preserve_contents,
-                                      MatchTopologyResult::FullMatch);
+                return RecycleSurface(overlaps, params, gpu_addr,
+                                      MatchTopologyResult::FullMatch);
            }
            return {current_surface, *view};
@@ -831,8 +816,7 @@ private:
            }
        }
        // We failed all the tests, recycle the overlaps into a new texture.
-        return RecycleSurface(overlaps, params, gpu_addr, preserve_contents,
-                              MatchTopologyResult::FullMatch);
+        return RecycleSurface(overlaps, params, gpu_addr, MatchTopologyResult::FullMatch);
    }
 
    /**
@@ -990,10 +974,10 @@ private:
    }
 
    std::pair<TSurface, TView> InitializeSurface(GPUVAddr gpu_addr, const SurfaceParams& params,
-                                                 bool preserve_contents) {
+                                                 bool do_load = true) {
        auto new_surface{GetUncachedSurface(gpu_addr, params)};
        Register(new_surface);
-        if (preserve_contents) {
+        if (do_load) {
            LoadSurface(new_surface);
        }
        return {new_surface, new_surface->GetMainView()};
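Aside: the net effect of this file's changes is that the preserve_contents flag threaded through GetSurface and RecycleSurface collapses into a single defaulted do_load parameter on InitializeSurface; ordinary callers always load, and only the recycler decides based on the accuracy setting. A stripped-down model of the resulting call shape (all names hypothetical, not the real cache API):

```cpp
#include <cassert>

// Hypothetical miniature of the refactor: the defaulted parameter replaces
// a boolean that every caller previously had to pass explicitly.
struct MiniCache {
    bool use_accurate_gpu_emulation = false;
    int loads = 0;

    void InitializeSurface(bool do_load = true) {
        if (do_load) {
            ++loads; // stand-in for LoadSurface()
        }
    }

    void GetSurface() {
        InitializeSurface(); // always loads, like the old preserve_contents = true
    }

    void RecycleSurface() {
        InitializeSurface(use_accurate_gpu_emulation); // skip the load unless accuracy demands it
    }
};

int main() {
    MiniCache cache;
    cache.GetSurface();
    cache.RecycleSurface();
    assert(cache.loads == 1); // the recycled surface skipped its load
}
```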
diff --git a/src/video_core/textures/astc.cpp b/src/video_core/textures/astc.cpp
index 062b4f252..365bde2f1 100644
--- a/src/video_core/textures/astc.cpp
+++ b/src/video_core/textures/astc.cpp
@@ -20,6 +20,8 @@
 #include <cstring>
 #include <vector>
 
+#include <boost/container/static_vector.hpp>
+
 #include "common/common_types.h"
 
 #include "video_core/textures/astc.h"
@@ -39,25 +41,25 @@ constexpr u32 Popcnt(u32 n) {
 
 class InputBitStream {
 public:
-    explicit InputBitStream(const u8* ptr, std::size_t start_offset = 0)
-        : m_CurByte(ptr), m_NextBit(start_offset % 8) {}
+    constexpr explicit InputBitStream(const u8* ptr, std::size_t start_offset = 0)
+        : cur_byte{ptr}, next_bit{start_offset % 8} {}
 
-    std::size_t GetBitsRead() const {
-        return m_BitsRead;
+    constexpr std::size_t GetBitsRead() const {
+        return bits_read;
     }
 
-    u32 ReadBit() {
-        u32 bit = *m_CurByte >> m_NextBit++;
-        while (m_NextBit >= 8) {
-            m_NextBit -= 8;
-            m_CurByte++;
+    constexpr bool ReadBit() {
+        const bool bit = (*cur_byte >> next_bit++) & 1;
+        while (next_bit >= 8) {
+            next_bit -= 8;
+            cur_byte++;
         }
 
-        m_BitsRead++;
-        return bit & 1;
+        bits_read++;
+        return bit;
     }
 
-    u32 ReadBits(std::size_t nBits) {
+    constexpr u32 ReadBits(std::size_t nBits) {
         u32 ret = 0;
         for (std::size_t i = 0; i < nBits; ++i) {
             ret |= (ReadBit() & 1) << i;
@@ -66,7 +68,7 @@ public:
         return ret;
     }
 
     template <std::size_t nBits>
-    u32 ReadBits() {
+    constexpr u32 ReadBits() {
         u32 ret = 0;
         for (std::size_t i = 0; i < nBits; ++i) {
             ret |= (ReadBit() & 1) << i;
@@ -75,64 +77,58 @@ public:
     }
 
 private:
-    const u8* m_CurByte;
-    std::size_t m_NextBit = 0;
-    std::size_t m_BitsRead = 0;
+    const u8* cur_byte;
+    std::size_t next_bit = 0;
+    std::size_t bits_read = 0;
 };
 
 class OutputBitStream {
 public:
-    explicit OutputBitStream(u8* ptr, s32 nBits = 0, s32 start_offset = 0)
-        : m_NumBits(nBits), m_CurByte(ptr), m_NextBit(start_offset % 8) {}
-
-    ~OutputBitStream() = default;
+    constexpr explicit OutputBitStream(u8* ptr, std::size_t bits = 0, std::size_t start_offset = 0)
+        : cur_byte{ptr}, num_bits{bits}, next_bit{start_offset % 8} {}
 
-    s32 GetBitsWritten() const {
-        return m_BitsWritten;
+    constexpr std::size_t GetBitsWritten() const {
+        return bits_written;
     }
 
-    void WriteBitsR(u32 val, u32 nBits) {
+    constexpr void WriteBitsR(u32 val, u32 nBits) {
         for (u32 i = 0; i < nBits; i++) {
             WriteBit((val >> (nBits - i - 1)) & 1);
         }
     }
 
-    void WriteBits(u32 val, u32 nBits) {
+    constexpr void WriteBits(u32 val, u32 nBits) {
         for (u32 i = 0; i < nBits; i++) {
             WriteBit((val >> i) & 1);
         }
     }
 
 private:
-    void WriteBit(s32 b) {
-
-        if (done)
+    constexpr void WriteBit(bool b) {
+        if (bits_written >= num_bits) {
             return;
+        }
 
-        const u32 mask = 1 << m_NextBit++;
+        const u32 mask = 1 << next_bit++;
 
         // clear the bit
-        *m_CurByte &= static_cast<u8>(~mask);
+        *cur_byte &= static_cast<u8>(~mask);
 
         // Write the bit, if necessary
         if (b)
-            *m_CurByte |= static_cast<u8>(mask);
+            *cur_byte |= static_cast<u8>(mask);
 
         // Next byte?
-        if (m_NextBit >= 8) {
-            m_CurByte += 1;
-            m_NextBit = 0;
+        if (next_bit >= 8) {
+            cur_byte += 1;
+            next_bit = 0;
         }
-
-        done = done || ++m_BitsWritten >= m_NumBits;
     }
 
-    s32 m_BitsWritten = 0;
-    const s32 m_NumBits;
-    u8* m_CurByte;
-    s32 m_NextBit = 0;
-
-    bool done = false;
+    u8* cur_byte;
+    std::size_t num_bits;
+    std::size_t bits_written = 0;
+    std::size_t next_bit = 0;
 };
 
 template <typename IntType>
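Aside: the renamed bit-stream members keep the original LSB-first semantics (bits are consumed from the low end of each byte). A compact stand-in for InputBitStream's reading contract (our sketch, not the file's class):

```cpp
#include <cassert>
#include <cstdint>

// LSB-first bit reader over a byte buffer, mirroring the contract of the
// InputBitStream shown above (simplified stand-in).
struct BitReader {
    const std::uint8_t* cur_byte;
    std::size_t next_bit = 0;

    bool ReadBit() {
        const bool bit = (*cur_byte >> next_bit++) & 1;
        if (next_bit == 8) {
            next_bit = 0;
            ++cur_byte;
        }
        return bit;
    }

    std::uint32_t ReadBits(std::size_t n) {
        std::uint32_t ret = 0;
        for (std::size_t i = 0; i < n; ++i) {
            ret |= static_cast<std::uint32_t>(ReadBit()) << i; // LSB first
        }
        return ret;
    }
};

int main() {
    const std::uint8_t data[] = {0xB4, 0x01};
    BitReader reader{data};
    assert(reader.ReadBits(4) == 0x4); // low nibble of 0xB4 comes out first
    assert(reader.ReadBits(4) == 0xB); // then the high nibble
    assert(reader.ReadBit() == true);  // then the LSB of the next byte
}
```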
@@ -195,9 +191,13 @@ struct IntegerEncodedValue {
         u32 trit_value;
     };
 };
+using IntegerEncodedVector = boost::container::static_vector<
+    IntegerEncodedValue, 64,
+    boost::container::static_vector_options<
+        boost::container::inplace_alignment<alignof(IntegerEncodedValue)>,
+        boost::container::throw_on_overflow<false>>::type>;
 
-static void DecodeTritBlock(InputBitStream& bits, std::vector<IntegerEncodedValue>& result,
-                            u32 nBitsPerValue) {
+static void DecodeTritBlock(InputBitStream& bits, IntegerEncodedVector& result, u32 nBitsPerValue) {
     // Implement the algorithm in section C.2.12
     u32 m[5];
     u32 t[5];
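Aside: boost::container::static_vector keeps its elements inline in the object, so swapping it in for std::vector removes a heap allocation per decoded block; with throw_on_overflow<false>, exceeding the fixed capacity of 64 becomes a hard precondition rather than an exception path, which the decoder's bounded sequence lengths make safe. Minimal usage, assuming Boost is available:

```cpp
#include <boost/container/static_vector.hpp>
#include <cassert>

int main() {
    // Capacity is fixed at compile time and the storage lives inside the
    // object, so push_back never allocates (unlike std::vector).
    boost::container::static_vector<int, 64> values;
    for (int i = 0; i < 10; ++i) {
        values.push_back(i);
    }
    assert(values.size() == 10 && values.capacity() == 64);
}
```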
@@ -255,7 +255,7 @@ static void DecodeTritBlock(InputBitStream& bits, IntegerEncodedVector& result,
     }
 }
 
-static void DecodeQus32Block(InputBitStream& bits, std::vector<IntegerEncodedValue>& result,
+static void DecodeQus32Block(InputBitStream& bits, IntegerEncodedVector& result,
                              u32 nBitsPerValue) {
     // Implement the algorithm in section C.2.12
     u32 m[3];
@@ -343,8 +343,8 @@ static constexpr std::array EncodingsValues = MakeEncodedValues();
 // Fills result with the values that are encoded in the given
 // bitstream. We must know beforehand what the maximum possible
 // value is, and how many values we're decoding.
-static void DecodeIntegerSequence(std::vector<IntegerEncodedValue>& result, InputBitStream& bits,
-                                  u32 maxRange, u32 nValues) {
+static void DecodeIntegerSequence(IntegerEncodedVector& result, InputBitStream& bits, u32 maxRange,
+                                  u32 nValues) {
     // Determine encoding parameters
     IntegerEncodedValue val = EncodingsValues[maxRange];
 
@@ -634,12 +634,14 @@ static void FillError(u32* outBuf, u32 blockWidth, u32 blockHeight) {
 // Replicates low numBits such that [(toBit - 1):(toBit - 1 - fromBit)]
 // is the same as [(numBits - 1):0] and repeats all the way down.
 template <typename IntType>
-static IntType Replicate(IntType val, u32 numBits, u32 toBit) {
-    if (numBits == 0)
+static constexpr IntType Replicate(IntType val, u32 numBits, u32 toBit) {
+    if (numBits == 0) {
         return 0;
-    if (toBit == 0)
+    }
+    if (toBit == 0) {
         return 0;
-    IntType v = val & static_cast<IntType>((1 << numBits) - 1);
+    }
+    const IntType v = val & static_cast<IntType>((1 << numBits) - 1);
     IntType res = v;
     u32 reslen = numBits;
     while (reslen < toBit) {
@@ -656,6 +658,89 @@ static IntType Replicate(IntType val, u32 numBits, u32 toBit) {
     return res;
 }
 
+static constexpr std::size_t NumReplicateEntries(u32 num_bits) {
+    return std::size_t(1) << num_bits;
+}
+
+template <typename IntType, u32 num_bits, u32 to_bit>
+static constexpr auto MakeReplicateTable() {
+    std::array<IntType, NumReplicateEntries(num_bits)> table{};
+    for (IntType value = 0; value < static_cast<IntType>(std::size(table)); ++value) {
+        table[value] = Replicate(value, num_bits, to_bit);
+    }
+    return table;
+}
+
+static constexpr auto REPLICATE_BYTE_TO_16_TABLE = MakeReplicateTable<u32, 8, 16>();
+static constexpr u32 ReplicateByteTo16(std::size_t value) {
+    return REPLICATE_BYTE_TO_16_TABLE[value];
+}
+
+static constexpr auto REPLICATE_BIT_TO_7_TABLE = MakeReplicateTable<u32, 1, 7>();
+static constexpr u32 ReplicateBitTo7(std::size_t value) {
+    return REPLICATE_BIT_TO_7_TABLE[value];
+}
+
+static constexpr auto REPLICATE_BIT_TO_9_TABLE = MakeReplicateTable<u32, 1, 9>();
+static constexpr u32 ReplicateBitTo9(std::size_t value) {
+    return REPLICATE_BIT_TO_9_TABLE[value];
+}
+
+static constexpr auto REPLICATE_1_BIT_TO_8_TABLE = MakeReplicateTable<u32, 1, 8>();
+static constexpr auto REPLICATE_2_BIT_TO_8_TABLE = MakeReplicateTable<u32, 2, 8>();
+static constexpr auto REPLICATE_3_BIT_TO_8_TABLE = MakeReplicateTable<u32, 3, 8>();
+static constexpr auto REPLICATE_4_BIT_TO_8_TABLE = MakeReplicateTable<u32, 4, 8>();
+static constexpr auto REPLICATE_5_BIT_TO_8_TABLE = MakeReplicateTable<u32, 5, 8>();
+static constexpr auto REPLICATE_6_BIT_TO_8_TABLE = MakeReplicateTable<u32, 6, 8>();
+static constexpr auto REPLICATE_7_BIT_TO_8_TABLE = MakeReplicateTable<u32, 7, 8>();
+static constexpr auto REPLICATE_8_BIT_TO_8_TABLE = MakeReplicateTable<u32, 8, 8>();
+/// Use a precompiled table with the most common usages, if it's not in the expected range, fallback
+/// to the runtime implementation
+static constexpr u32 FastReplicateTo8(u32 value, u32 num_bits) {
+    switch (num_bits) {
+    case 1:
+        return REPLICATE_1_BIT_TO_8_TABLE[value];
+    case 2:
+        return REPLICATE_2_BIT_TO_8_TABLE[value];
+    case 3:
+        return REPLICATE_3_BIT_TO_8_TABLE[value];
+    case 4:
+        return REPLICATE_4_BIT_TO_8_TABLE[value];
+    case 5:
+        return REPLICATE_5_BIT_TO_8_TABLE[value];
+    case 6:
+        return REPLICATE_6_BIT_TO_8_TABLE[value];
+    case 7:
+        return REPLICATE_7_BIT_TO_8_TABLE[value];
+    case 8:
+        return REPLICATE_8_BIT_TO_8_TABLE[value];
+    default:
+        return Replicate(value, num_bits, 8);
+    }
+}
+
+static constexpr auto REPLICATE_1_BIT_TO_6_TABLE = MakeReplicateTable<u32, 1, 6>();
+static constexpr auto REPLICATE_2_BIT_TO_6_TABLE = MakeReplicateTable<u32, 2, 6>();
+static constexpr auto REPLICATE_3_BIT_TO_6_TABLE = MakeReplicateTable<u32, 3, 6>();
+static constexpr auto REPLICATE_4_BIT_TO_6_TABLE = MakeReplicateTable<u32, 4, 6>();
+static constexpr auto REPLICATE_5_BIT_TO_6_TABLE = MakeReplicateTable<u32, 5, 6>();
+static constexpr u32 FastReplicateTo6(u32 value, u32 num_bits) {
+    switch (num_bits) {
+    case 1:
+        return REPLICATE_1_BIT_TO_6_TABLE[value];
+    case 2:
+        return REPLICATE_2_BIT_TO_6_TABLE[value];
+    case 3:
+        return REPLICATE_3_BIT_TO_6_TABLE[value];
+    case 4:
+        return REPLICATE_4_BIT_TO_6_TABLE[value];
+    case 5:
+        return REPLICATE_5_BIT_TO_6_TABLE[value];
+    default:
+        return Replicate(value, num_bits, 6);
+    }
+}
+
 class Pixel {
 protected:
     using ChannelType = s16;
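Aside: the table machinery above is straightforward to verify against the loop form of Replicate. A self-contained sketch (the loop body is elided in the hunk above, so this is a from-spec rewrite rather than the file's exact code; it assumes num_bits < 32):

```cpp
#include <algorithm>
#include <array>
#include <cassert>
#include <cstdint>

// Bit replication per the comment above: repeat the low num_bits of value
// until to_bit bits are filled.
constexpr std::uint32_t Replicate(std::uint32_t value, std::uint32_t num_bits,
                                  std::uint32_t to_bit) {
    if (num_bits == 0 || to_bit == 0) {
        return 0;
    }
    const std::uint32_t v = value & ((1u << num_bits) - 1u);
    std::uint32_t res = v;
    std::uint32_t len = num_bits;
    while (len < to_bit) {
        const std::uint32_t shift = std::min(num_bits, to_bit - len);
        res = (res << shift) | (v >> (num_bits - shift));
        len += shift;
    }
    return res;
}

template <std::uint32_t num_bits, std::uint32_t to_bit>
constexpr auto MakeTable() {
    // One entry per possible input value, computed entirely at compile time.
    std::array<std::uint32_t, std::size_t{1} << num_bits> table{};
    for (std::uint32_t v = 0; v < table.size(); ++v) {
        table[v] = Replicate(v, num_bits, to_bit);
    }
    return table;
}

int main() {
    constexpr auto table = MakeTable<3, 8>();
    static_assert(table[0b101] == 0b10110110, "0b101 repeats to 0xB6");
    assert(Replicate(1, 1, 8) == 0xFF); // a single set bit fans out to all ones
}
```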
@@ -674,10 +759,10 @@ public:
     // significant bits when going from larger to smaller bit depth
     // or by repeating the most significant bits when going from
     // smaller to larger bit depths.
-    void ChangeBitDepth(const u8 (&depth)[4]) {
+    void ChangeBitDepth() {
         for (u32 i = 0; i < 4; i++) {
-            Component(i) = ChangeBitDepth(Component(i), m_BitDepth[i], depth[i]);
-            m_BitDepth[i] = depth[i];
+            Component(i) = ChangeBitDepth(Component(i), m_BitDepth[i]);
+            m_BitDepth[i] = 8;
         }
     }
 
@@ -689,28 +774,23 @@ public:
 
     // Changes the bit depth of a single component. See the comment
     // above for how we do this.
-    static ChannelType ChangeBitDepth(Pixel::ChannelType val, u8 oldDepth, u8 newDepth) {
-        assert(newDepth <= 8);
+    static ChannelType ChangeBitDepth(Pixel::ChannelType val, u8 oldDepth) {
         assert(oldDepth <= 8);
 
-        if (oldDepth == newDepth) {
+        if (oldDepth == 8) {
             // Do nothing
             return val;
-        } else if (oldDepth == 0 && newDepth != 0) {
-            return static_cast<ChannelType>((1 << newDepth) - 1);
-        } else if (newDepth > oldDepth) {
-            return Replicate(val, oldDepth, newDepth);
+        } else if (oldDepth == 0) {
+            return static_cast<ChannelType>((1 << 8) - 1);
+        } else if (8 > oldDepth) {
+            return static_cast<ChannelType>(FastReplicateTo8(static_cast<u32>(val), oldDepth));
         } else {
             // oldDepth > newDepth
-            if (newDepth == 0) {
-                return 0xFF;
-            } else {
-                u8 bitsWasted = static_cast<u8>(oldDepth - newDepth);
-                u16 v = static_cast<u16>(val);
-                v = static_cast<u16>((v + (1 << (bitsWasted - 1))) >> bitsWasted);
-                v = ::std::min<u16>(::std::max<u16>(0, v), static_cast<u16>((1 << newDepth) - 1));
-                return static_cast<u8>(v);
-            }
+            const u8 bitsWasted = static_cast<u8>(oldDepth - 8);
+            u16 v = static_cast<u16>(val);
+            v = static_cast<u16>((v + (1 << (bitsWasted - 1))) >> bitsWasted);
+            v = ::std::min<u16>(::std::max<u16>(0, v), static_cast<u16>((1 << 8) - 1));
+            return static_cast<u8>(v);
         }
 
         assert(false && "We shouldn't get here.");
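Aside: the final branch's `(v + (1 << (bitsWasted - 1))) >> bitsWasted` adds half of the dropped range before shifting, so depth reduction rounds to nearest instead of truncating; that is also why the result is then clamped with min(). A one-line worked instance of the arithmetic (ours):

```cpp
#include <cassert>
#include <cstdint>

int main() {
    // Reducing a 10-bit channel to 8 bits wastes 2 bits; rounding can
    // overshoot the 8-bit range, which the min() clamp catches.
    const std::uint16_t val = 0x3FF;  // maximum 10-bit value
    const unsigned bits_wasted = 10 - 8;
    const std::uint16_t rounded = (val + (1u << (bits_wasted - 1))) >> bits_wasted;
    assert(rounded == 0x100);         // one past 0xFF, hence the clamp
}
```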
@@ -760,8 +840,7 @@ public:
     // up in the most-significant byte.
     u32 Pack() const {
         Pixel eightBit(*this);
-        const u8 eightBitDepth[4] = {8, 8, 8, 8};
-        eightBit.ChangeBitDepth(eightBitDepth);
+        eightBit.ChangeBitDepth();
 
         u32 r = 0;
         r |= eightBit.A();
@@ -816,8 +895,7 @@ static void DecodeColorValues(u32* out, u8* data, const u32* modes, const u32 nP
     }
 
     // We now have enough to decode our integer sequence.
-    std::vector<IntegerEncodedValue> decodedColorValues;
-    decodedColorValues.reserve(32);
+    IntegerEncodedVector decodedColorValues;
 
     InputBitStream colorStream(data);
     DecodeIntegerSequence(decodedColorValues, colorStream, range, nValues);
@@ -839,12 +917,12 @@ static void DecodeColorValues(u32* out, u8* data, const u32* modes, const u32 nP
 
         u32 A = 0, B = 0, C = 0, D = 0;
         // A is just the lsb replicated 9 times.
-        A = Replicate(bitval & 1, 1, 9);
+        A = ReplicateBitTo9(bitval & 1);
 
         switch (val.encoding) {
         // Replicate bits
         case IntegerEncoding::JustBits:
-            out[outIdx++] = Replicate(bitval, bitlen, 8);
+            out[outIdx++] = FastReplicateTo8(bitval, bitlen);
             break;
 
         // Use algorithm in C.2.13
@@ -962,13 +1040,13 @@ static u32 UnquantizeTexelWeight(const IntegerEncodedValue& val) {
     u32 bitval = val.bit_value;
     u32 bitlen = val.num_bits;
 
-    u32 A = Replicate(bitval & 1, 1, 7);
+    u32 A = ReplicateBitTo7(bitval & 1);
     u32 B = 0, C = 0, D = 0;
 
     u32 result = 0;
     switch (val.encoding) {
     case IntegerEncoding::JustBits:
-        result = Replicate(bitval, bitlen, 6);
+        result = FastReplicateTo6(bitval, bitlen);
         break;
 
     case IntegerEncoding::Trit: {
@@ -1047,7 +1125,7 @@ static u32 UnquantizeTexelWeight(const IntegerEncodedValue& val) {
     return result;
 }
 
-static void UnquantizeTexelWeights(u32 out[2][144], const std::vector<IntegerEncodedValue>& weights,
+static void UnquantizeTexelWeights(u32 out[2][144], const IntegerEncodedVector& weights,
                                    const TexelWeightParams& params, const u32 blockWidth,
                                    const u32 blockHeight) {
     u32 weightIdx = 0;
@@ -1545,8 +1623,7 @@ static void DecompressBlock(const u8 inBuf[16], const u32 blockWidth, const u32
         static_cast<u8>((1 << (weightParams.GetPackedBitSize() % 8)) - 1);
     memset(texelWeightData + clearByteStart, 0, 16 - clearByteStart);
 
-    std::vector<IntegerEncodedValue> texelWeightValues;
-    texelWeightValues.reserve(64);
+    IntegerEncodedVector texelWeightValues;
 
     InputBitStream weightStream(texelWeightData);
 
@@ -1568,9 +1645,9 @@ static void DecompressBlock(const u8 inBuf[16], const u32 blockWidth, const u32
             Pixel p;
             for (u32 c = 0; c < 4; c++) {
                 u32 C0 = endpos32s[partition][0].Component(c);
-                C0 = Replicate(C0, 8, 16);
+                C0 = ReplicateByteTo16(C0);
                 u32 C1 = endpos32s[partition][1].Component(c);
-                C1 = Replicate(C1, 8, 16);
+                C1 = ReplicateByteTo16(C1);
 
                 u32 plane = 0;
                 if (weightParams.m_bDualPlane && (((planeIdx + 1) & 3) == c)) {