summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/common/algorithm.h8
-rw-r--r--src/video_core/buffer_cache/buffer_cache.h4
-rw-r--r--src/video_core/engines/engine_upload.cpp46
-rw-r--r--src/video_core/engines/engine_upload.h6
-rw-r--r--src/video_core/engines/kepler_compute.cpp13
-rw-r--r--src/video_core/engines/kepler_memory.cpp13
-rw-r--r--src/video_core/engines/maxwell_3d.cpp5
-rw-r--r--src/video_core/engines/maxwell_dma.cpp91
-rw-r--r--src/video_core/engines/maxwell_dma.h6
-rw-r--r--src/video_core/host1x/vic.cpp5
-rw-r--r--src/video_core/memory_manager.cpp91
-rw-r--r--src/video_core/memory_manager.h6
-rw-r--r--src/video_core/rasterizer_interface.h2
-rw-r--r--src/video_core/renderer_opengl/gl_rasterizer.cpp2
-rw-r--r--src/video_core/renderer_opengl/gl_rasterizer.h2
-rw-r--r--src/video_core/renderer_vulkan/vk_compute_pass.cpp2
-rw-r--r--src/video_core/renderer_vulkan/vk_rasterizer.cpp2
-rw-r--r--src/video_core/renderer_vulkan/vk_rasterizer.h2
-rw-r--r--src/video_core/texture_cache/util.cpp1
-rw-r--r--src/video_core/textures/decoders.cpp225
-rw-r--r--src/video_core/textures/decoders.h33
21 files changed, 323 insertions, 242 deletions
diff --git a/src/common/algorithm.h b/src/common/algorithm.h
index 9ddfd637b..055dca142 100644
--- a/src/common/algorithm.h
+++ b/src/common/algorithm.h
@@ -24,4 +24,12 @@ template <class ForwardIt, class T, class Compare = std::less<>>
24 return first != last && !comp(value, *first) ? first : last; 24 return first != last && !comp(value, *first) ? first : last;
25} 25}
26 26
27template <typename T, typename Func, typename... Args>
28T FoldRight(T initial_value, Func&& func, Args&&... args) {
29 T value{initial_value};
30 const auto high_func = [&value, &func]<typename T>(T x) { value = func(value, x); };
31 (std::invoke(high_func, std::forward<Args>(args)), ...);
32 return value;
33}
34
27} // namespace Common 35} // namespace Common
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index e55cac0d6..359c11d6f 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -126,7 +126,7 @@ public:
126 126
127 void DownloadMemory(VAddr cpu_addr, u64 size); 127 void DownloadMemory(VAddr cpu_addr, u64 size);
128 128
129 bool InlineMemory(VAddr dest_address, size_t copy_size, std::span<u8> inlined_buffer); 129 bool InlineMemory(VAddr dest_address, size_t copy_size, std::span<const u8> inlined_buffer);
130 130
131 void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size); 131 void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size);
132 132
@@ -1685,7 +1685,7 @@ void BufferCache<P>::MappedUploadMemory(Buffer& buffer, u64 total_size_bytes,
1685 1685
1686template <class P> 1686template <class P>
1687bool BufferCache<P>::InlineMemory(VAddr dest_address, size_t copy_size, 1687bool BufferCache<P>::InlineMemory(VAddr dest_address, size_t copy_size,
1688 std::span<u8> inlined_buffer) { 1688 std::span<const u8> inlined_buffer) {
1689 const bool is_dirty = IsRegionRegistered(dest_address, copy_size); 1689 const bool is_dirty = IsRegionRegistered(dest_address, copy_size);
1690 if (!is_dirty) { 1690 if (!is_dirty) {
1691 return false; 1691 return false;
diff --git a/src/video_core/engines/engine_upload.cpp b/src/video_core/engines/engine_upload.cpp
index 6ff5b1eca..a34819234 100644
--- a/src/video_core/engines/engine_upload.cpp
+++ b/src/video_core/engines/engine_upload.cpp
@@ -3,6 +3,7 @@
3 3
4#include <cstring> 4#include <cstring>
5 5
6#include "common/algorithm.h"
6#include "common/assert.h" 7#include "common/assert.h"
7#include "video_core/engines/engine_upload.h" 8#include "video_core/engines/engine_upload.h"
8#include "video_core/memory_manager.h" 9#include "video_core/memory_manager.h"
@@ -34,21 +35,48 @@ void State::ProcessData(const u32 data, const bool is_last_call) {
34 if (!is_last_call) { 35 if (!is_last_call) {
35 return; 36 return;
36 } 37 }
38 ProcessData(inner_buffer);
39}
40
41void State::ProcessData(const u32* data, size_t num_data) {
42 std::span<const u8> read_buffer(reinterpret_cast<const u8*>(data), num_data * sizeof(u32));
43 ProcessData(read_buffer);
44}
45
46void State::ProcessData(std::span<const u8> read_buffer) {
37 const GPUVAddr address{regs.dest.Address()}; 47 const GPUVAddr address{regs.dest.Address()};
38 if (is_linear) { 48 if (is_linear) {
39 rasterizer->AccelerateInlineToMemory(address, copy_size, inner_buffer); 49 if (regs.line_count == 1) {
50 rasterizer->AccelerateInlineToMemory(address, copy_size, read_buffer);
51 } else {
52 for (u32 line = 0; line < regs.line_count; ++line) {
53 const GPUVAddr dest_line = address + static_cast<size_t>(line) * regs.dest.pitch;
54 memory_manager.WriteBlockUnsafe(
55 dest_line, read_buffer.data() + static_cast<size_t>(line) * regs.line_length_in,
56 regs.line_length_in);
57 }
58 memory_manager.InvalidateRegion(address, regs.dest.pitch * regs.line_count);
59 }
40 } else { 60 } else {
41 UNIMPLEMENTED_IF(regs.dest.z != 0); 61 u32 width = regs.dest.width;
42 UNIMPLEMENTED_IF(regs.dest.depth != 1); 62 u32 x_elements = regs.line_length_in;
43 UNIMPLEMENTED_IF(regs.dest.BlockWidth() != 0); 63 u32 x_offset = regs.dest.x;
44 UNIMPLEMENTED_IF(regs.dest.BlockDepth() != 0); 64 const u32 bpp_shift = Common::FoldRight(
65 4U, [](u32 x, u32 y) { return std::min(x, static_cast<u32>(std::countr_zero(y))); },
66 width, x_elements, x_offset, static_cast<u32>(address));
67 width >>= bpp_shift;
68 x_elements >>= bpp_shift;
69 x_offset >>= bpp_shift;
70 const u32 bytes_per_pixel = 1U << bpp_shift;
45 const std::size_t dst_size = Tegra::Texture::CalculateSize( 71 const std::size_t dst_size = Tegra::Texture::CalculateSize(
46 true, 1, regs.dest.width, regs.dest.height, 1, regs.dest.BlockHeight(), 0); 72 true, bytes_per_pixel, width, regs.dest.height, regs.dest.depth,
73 regs.dest.BlockHeight(), regs.dest.BlockDepth());
47 tmp_buffer.resize(dst_size); 74 tmp_buffer.resize(dst_size);
48 memory_manager.ReadBlock(address, tmp_buffer.data(), dst_size); 75 memory_manager.ReadBlock(address, tmp_buffer.data(), dst_size);
49 Tegra::Texture::SwizzleKepler(regs.dest.width, regs.dest.height, regs.dest.x, regs.dest.y, 76 Tegra::Texture::SwizzleSubrect(tmp_buffer, read_buffer, bytes_per_pixel, width,
50 regs.dest.BlockHeight(), copy_size, inner_buffer.data(), 77 regs.dest.height, regs.dest.depth, x_offset, regs.dest.y,
51 tmp_buffer.data()); 78 x_elements, regs.line_count, regs.dest.BlockHeight(),
79 regs.dest.BlockDepth(), regs.line_length_in);
52 memory_manager.WriteBlock(address, tmp_buffer.data(), dst_size); 80 memory_manager.WriteBlock(address, tmp_buffer.data(), dst_size);
53 } 81 }
54} 82}
diff --git a/src/video_core/engines/engine_upload.h b/src/video_core/engines/engine_upload.h
index 94ff3314a..f08f6e36a 100644
--- a/src/video_core/engines/engine_upload.h
+++ b/src/video_core/engines/engine_upload.h
@@ -3,6 +3,7 @@
3 3
4#pragma once 4#pragma once
5 5
6#include <span>
6#include <vector> 7#include <vector>
7#include "common/bit_field.h" 8#include "common/bit_field.h"
8#include "common/common_types.h" 9#include "common/common_types.h"
@@ -33,7 +34,7 @@ struct Registers {
33 u32 width; 34 u32 width;
34 u32 height; 35 u32 height;
35 u32 depth; 36 u32 depth;
36 u32 z; 37 u32 layer;
37 u32 x; 38 u32 x;
38 u32 y; 39 u32 y;
39 40
@@ -62,11 +63,14 @@ public:
62 63
63 void ProcessExec(bool is_linear_); 64 void ProcessExec(bool is_linear_);
64 void ProcessData(u32 data, bool is_last_call); 65 void ProcessData(u32 data, bool is_last_call);
66 void ProcessData(const u32* data, size_t num_data);
65 67
66 /// Binds a rasterizer to this engine. 68 /// Binds a rasterizer to this engine.
67 void BindRasterizer(VideoCore::RasterizerInterface* rasterizer); 69 void BindRasterizer(VideoCore::RasterizerInterface* rasterizer);
68 70
69private: 71private:
72 void ProcessData(std::span<const u8> read_buffer);
73
70 u32 write_offset = 0; 74 u32 write_offset = 0;
71 u32 copy_size = 0; 75 u32 copy_size = 0;
72 std::vector<u8> inner_buffer; 76 std::vector<u8> inner_buffer;
diff --git a/src/video_core/engines/kepler_compute.cpp b/src/video_core/engines/kepler_compute.cpp
index 5db254d94..7c50bdbe0 100644
--- a/src/video_core/engines/kepler_compute.cpp
+++ b/src/video_core/engines/kepler_compute.cpp
@@ -36,8 +36,6 @@ void KeplerCompute::CallMethod(u32 method, u32 method_argument, bool is_last_cal
36 } 36 }
37 case KEPLER_COMPUTE_REG_INDEX(data_upload): { 37 case KEPLER_COMPUTE_REG_INDEX(data_upload): {
38 upload_state.ProcessData(method_argument, is_last_call); 38 upload_state.ProcessData(method_argument, is_last_call);
39 if (is_last_call) {
40 }
41 break; 39 break;
42 } 40 }
43 case KEPLER_COMPUTE_REG_INDEX(launch): 41 case KEPLER_COMPUTE_REG_INDEX(launch):
@@ -50,8 +48,15 @@ void KeplerCompute::CallMethod(u32 method, u32 method_argument, bool is_last_cal
50 48
51void KeplerCompute::CallMultiMethod(u32 method, const u32* base_start, u32 amount, 49void KeplerCompute::CallMultiMethod(u32 method, const u32* base_start, u32 amount,
52 u32 methods_pending) { 50 u32 methods_pending) {
53 for (std::size_t i = 0; i < amount; i++) { 51 switch (method) {
54 CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1); 52 case KEPLER_COMPUTE_REG_INDEX(data_upload):
53 upload_state.ProcessData(base_start, static_cast<size_t>(amount));
54 return;
55 default:
56 for (std::size_t i = 0; i < amount; i++) {
57 CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1);
58 }
59 break;
55 } 60 }
56} 61}
57 62
diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp
index e2b029542..a3fbab1e5 100644
--- a/src/video_core/engines/kepler_memory.cpp
+++ b/src/video_core/engines/kepler_memory.cpp
@@ -33,8 +33,6 @@ void KeplerMemory::CallMethod(u32 method, u32 method_argument, bool is_last_call
33 } 33 }
34 case KEPLERMEMORY_REG_INDEX(data): { 34 case KEPLERMEMORY_REG_INDEX(data): {
35 upload_state.ProcessData(method_argument, is_last_call); 35 upload_state.ProcessData(method_argument, is_last_call);
36 if (is_last_call) {
37 }
38 break; 36 break;
39 } 37 }
40 } 38 }
@@ -42,8 +40,15 @@ void KeplerMemory::CallMethod(u32 method, u32 method_argument, bool is_last_call
42 40
43void KeplerMemory::CallMultiMethod(u32 method, const u32* base_start, u32 amount, 41void KeplerMemory::CallMultiMethod(u32 method, const u32* base_start, u32 amount,
44 u32 methods_pending) { 42 u32 methods_pending) {
45 for (std::size_t i = 0; i < amount; i++) { 43 switch (method) {
46 CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1); 44 case KEPLERMEMORY_REG_INDEX(data):
45 upload_state.ProcessData(base_start, static_cast<size_t>(amount));
46 return;
47 default:
48 for (std::size_t i = 0; i < amount; i++) {
49 CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1);
50 }
51 break;
47 } 52 }
48} 53}
49 54
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index add1ccebe..632052c53 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -239,8 +239,6 @@ void Maxwell3D::ProcessMethodCall(u32 method, u32 argument, u32 nonshadow_argume
239 return upload_state.ProcessExec(regs.exec_upload.linear != 0); 239 return upload_state.ProcessExec(regs.exec_upload.linear != 0);
240 case MAXWELL3D_REG_INDEX(data_upload): 240 case MAXWELL3D_REG_INDEX(data_upload):
241 upload_state.ProcessData(argument, is_last_call); 241 upload_state.ProcessData(argument, is_last_call);
242 if (is_last_call) {
243 }
244 return; 242 return;
245 case MAXWELL3D_REG_INDEX(fragment_barrier): 243 case MAXWELL3D_REG_INDEX(fragment_barrier):
246 return rasterizer->FragmentBarrier(); 244 return rasterizer->FragmentBarrier();
@@ -316,6 +314,9 @@ void Maxwell3D::CallMultiMethod(u32 method, const u32* base_start, u32 amount,
316 case MAXWELL3D_REG_INDEX(const_buffer.cb_data) + 15: 314 case MAXWELL3D_REG_INDEX(const_buffer.cb_data) + 15:
317 ProcessCBMultiData(base_start, amount); 315 ProcessCBMultiData(base_start, amount);
318 break; 316 break;
317 case MAXWELL3D_REG_INDEX(data_upload):
318 upload_state.ProcessData(base_start, static_cast<size_t>(amount));
319 return;
319 default: 320 default:
320 for (std::size_t i = 0; i < amount; i++) { 321 for (std::size_t i = 0; i < amount; i++) {
321 CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1); 322 CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1);
diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp
index 0efe58282..a12a95ce2 100644
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -1,6 +1,7 @@
1// SPDX-FileCopyrightText: Copyright 2018 yuzu Emulator Project 1// SPDX-FileCopyrightText: Copyright 2018 yuzu Emulator Project
2// SPDX-License-Identifier: GPL-2.0-or-later 2// SPDX-License-Identifier: GPL-2.0-or-later
3 3
4#include "common/algorithm.h"
4#include "common/assert.h" 5#include "common/assert.h"
5#include "common/logging/log.h" 6#include "common/logging/log.h"
6#include "common/microprofile.h" 7#include "common/microprofile.h"
@@ -54,8 +55,6 @@ void MaxwellDMA::Launch() {
54 const LaunchDMA& launch = regs.launch_dma; 55 const LaunchDMA& launch = regs.launch_dma;
55 ASSERT(launch.interrupt_type == LaunchDMA::InterruptType::NONE); 56 ASSERT(launch.interrupt_type == LaunchDMA::InterruptType::NONE);
56 ASSERT(launch.data_transfer_type == LaunchDMA::DataTransferType::NON_PIPELINED); 57 ASSERT(launch.data_transfer_type == LaunchDMA::DataTransferType::NON_PIPELINED);
57 ASSERT(regs.dst_params.origin.x == 0);
58 ASSERT(regs.dst_params.origin.y == 0);
59 58
60 const bool is_src_pitch = launch.src_memory_layout == LaunchDMA::MemoryLayout::PITCH; 59 const bool is_src_pitch = launch.src_memory_layout == LaunchDMA::MemoryLayout::PITCH;
61 const bool is_dst_pitch = launch.dst_memory_layout == LaunchDMA::MemoryLayout::PITCH; 60 const bool is_dst_pitch = launch.dst_memory_layout == LaunchDMA::MemoryLayout::PITCH;
@@ -121,12 +120,13 @@ void MaxwellDMA::CopyPitchToPitch() {
121 120
122void MaxwellDMA::CopyBlockLinearToPitch() { 121void MaxwellDMA::CopyBlockLinearToPitch() {
123 UNIMPLEMENTED_IF(regs.src_params.block_size.width != 0); 122 UNIMPLEMENTED_IF(regs.src_params.block_size.width != 0);
124 UNIMPLEMENTED_IF(regs.src_params.block_size.depth != 0);
125 UNIMPLEMENTED_IF(regs.src_params.layer != 0); 123 UNIMPLEMENTED_IF(regs.src_params.layer != 0);
126 124
125 const bool is_remapping = regs.launch_dma.remap_enable != 0;
126
127 // Optimized path for micro copies. 127 // Optimized path for micro copies.
128 const size_t dst_size = static_cast<size_t>(regs.pitch_out) * regs.line_count; 128 const size_t dst_size = static_cast<size_t>(regs.pitch_out) * regs.line_count;
129 if (dst_size < GOB_SIZE && regs.pitch_out <= GOB_SIZE_X && 129 if (!is_remapping && dst_size < GOB_SIZE && regs.pitch_out <= GOB_SIZE_X &&
130 regs.src_params.height > GOB_SIZE_Y) { 130 regs.src_params.height > GOB_SIZE_Y) {
131 FastCopyBlockLinearToPitch(); 131 FastCopyBlockLinearToPitch();
132 return; 132 return;
@@ -134,10 +134,27 @@ void MaxwellDMA::CopyBlockLinearToPitch() {
134 134
135 // Deswizzle the input and copy it over. 135 // Deswizzle the input and copy it over.
136 UNIMPLEMENTED_IF(regs.launch_dma.remap_enable != 0); 136 UNIMPLEMENTED_IF(regs.launch_dma.remap_enable != 0);
137 const u32 bytes_per_pixel =
138 regs.launch_dma.remap_enable ? regs.pitch_out / regs.line_length_in : 1;
139 const Parameters& src_params = regs.src_params; 137 const Parameters& src_params = regs.src_params;
140 const u32 width = src_params.width; 138
139 const u32 num_remap_components = regs.remap_const.num_dst_components_minus_one + 1;
140 const u32 remap_components_size = regs.remap_const.component_size_minus_one + 1;
141
142 const u32 base_bpp = !is_remapping ? 1U : num_remap_components * remap_components_size;
143
144 u32 width = src_params.width;
145 u32 x_elements = regs.line_length_in;
146 u32 x_offset = src_params.origin.x;
147 u32 bpp_shift = 0U;
148 if (!is_remapping) {
149 bpp_shift = Common::FoldRight(
150 4U, [](u32 x, u32 y) { return std::min(x, static_cast<u32>(std::countr_zero(y))); },
151 width, x_elements, x_offset, static_cast<u32>(regs.offset_in));
152 width >>= bpp_shift;
153 x_elements >>= bpp_shift;
154 x_offset >>= bpp_shift;
155 }
156
157 const u32 bytes_per_pixel = base_bpp << bpp_shift;
141 const u32 height = src_params.height; 158 const u32 height = src_params.height;
142 const u32 depth = src_params.depth; 159 const u32 depth = src_params.depth;
143 const u32 block_height = src_params.block_size.height; 160 const u32 block_height = src_params.block_size.height;
@@ -155,30 +172,46 @@ void MaxwellDMA::CopyBlockLinearToPitch() {
155 memory_manager.ReadBlock(regs.offset_in, read_buffer.data(), src_size); 172 memory_manager.ReadBlock(regs.offset_in, read_buffer.data(), src_size);
156 memory_manager.ReadBlock(regs.offset_out, write_buffer.data(), dst_size); 173 memory_manager.ReadBlock(regs.offset_out, write_buffer.data(), dst_size);
157 174
158 UnswizzleSubrect(regs.line_length_in, regs.line_count, regs.pitch_out, width, bytes_per_pixel, 175 UnswizzleSubrect(write_buffer, read_buffer, bytes_per_pixel, width, height, depth, x_offset,
159 block_height, src_params.origin.x, src_params.origin.y, write_buffer.data(), 176 src_params.origin.y, x_elements, regs.line_count, block_height, block_depth,
160 read_buffer.data()); 177 regs.pitch_out);
161 178
162 memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size); 179 memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
163} 180}
164 181
165void MaxwellDMA::CopyPitchToBlockLinear() { 182void MaxwellDMA::CopyPitchToBlockLinear() {
166 UNIMPLEMENTED_IF_MSG(regs.dst_params.block_size.width != 0, "Block width is not one"); 183 UNIMPLEMENTED_IF_MSG(regs.dst_params.block_size.width != 0, "Block width is not one");
184 UNIMPLEMENTED_IF(regs.dst_params.layer != 0);
167 UNIMPLEMENTED_IF(regs.launch_dma.remap_enable != 0); 185 UNIMPLEMENTED_IF(regs.launch_dma.remap_enable != 0);
168 186
187 const bool is_remapping = regs.launch_dma.remap_enable != 0;
188 const u32 num_remap_components = regs.remap_const.num_dst_components_minus_one + 1;
189 const u32 remap_components_size = regs.remap_const.component_size_minus_one + 1;
190
169 const auto& dst_params = regs.dst_params; 191 const auto& dst_params = regs.dst_params;
170 const u32 bytes_per_pixel = 192
171 regs.launch_dma.remap_enable ? regs.pitch_in / regs.line_length_in : 1; 193 const u32 base_bpp = !is_remapping ? 1U : num_remap_components * remap_components_size;
172 const u32 width = dst_params.width; 194
195 u32 width = dst_params.width;
196 u32 x_elements = regs.line_length_in;
197 u32 x_offset = dst_params.origin.x;
198 u32 bpp_shift = 0U;
199 if (!is_remapping) {
200 bpp_shift = Common::FoldRight(
201 4U, [](u32 x, u32 y) { return std::min(x, static_cast<u32>(std::countr_zero(y))); },
202 width, x_elements, x_offset, static_cast<u32>(regs.offset_out));
203 width >>= bpp_shift;
204 x_elements >>= bpp_shift;
205 x_offset >>= bpp_shift;
206 }
207
208 const u32 bytes_per_pixel = base_bpp << bpp_shift;
173 const u32 height = dst_params.height; 209 const u32 height = dst_params.height;
174 const u32 depth = dst_params.depth; 210 const u32 depth = dst_params.depth;
175 const u32 block_height = dst_params.block_size.height; 211 const u32 block_height = dst_params.block_size.height;
176 const u32 block_depth = dst_params.block_size.depth; 212 const u32 block_depth = dst_params.block_size.depth;
177 const size_t dst_size = 213 const size_t dst_size =
178 CalculateSize(true, bytes_per_pixel, width, height, depth, block_height, block_depth); 214 CalculateSize(true, bytes_per_pixel, width, height, depth, block_height, block_depth);
179 const size_t dst_layer_size =
180 CalculateSize(true, bytes_per_pixel, width, height, 1, block_height, block_depth);
181
182 const size_t src_size = static_cast<size_t>(regs.pitch_in) * regs.line_count; 215 const size_t src_size = static_cast<size_t>(regs.pitch_in) * regs.line_count;
183 216
184 if (read_buffer.size() < src_size) { 217 if (read_buffer.size() < src_size) {
@@ -188,32 +221,23 @@ void MaxwellDMA::CopyPitchToBlockLinear() {
188 write_buffer.resize(dst_size); 221 write_buffer.resize(dst_size);
189 } 222 }
190 223
224 memory_manager.ReadBlock(regs.offset_in, read_buffer.data(), src_size);
191 if (Settings::IsGPULevelExtreme()) { 225 if (Settings::IsGPULevelExtreme()) {
192 memory_manager.ReadBlock(regs.offset_in, read_buffer.data(), src_size);
193 memory_manager.ReadBlock(regs.offset_out, write_buffer.data(), dst_size); 226 memory_manager.ReadBlock(regs.offset_out, write_buffer.data(), dst_size);
194 } else { 227 } else {
195 memory_manager.ReadBlockUnsafe(regs.offset_in, read_buffer.data(), src_size);
196 memory_manager.ReadBlockUnsafe(regs.offset_out, write_buffer.data(), dst_size); 228 memory_manager.ReadBlockUnsafe(regs.offset_out, write_buffer.data(), dst_size);
197 } 229 }
198 230
199 // If the input is linear and the output is tiled, swizzle the input and copy it over. 231 // If the input is linear and the output is tiled, swizzle the input and copy it over.
200 if (regs.dst_params.block_size.depth > 0) { 232 SwizzleSubrect(write_buffer, read_buffer, bytes_per_pixel, width, height, depth, x_offset,
201 ASSERT(dst_params.layer == 0); 233 dst_params.origin.y, x_elements, regs.line_count, block_height, block_depth,
202 SwizzleSliceToVoxel(regs.line_length_in, regs.line_count, regs.pitch_in, width, height, 234 regs.pitch_in);
203 bytes_per_pixel, block_height, block_depth, dst_params.origin.x,
204 dst_params.origin.y, write_buffer.data(), read_buffer.data());
205 } else {
206 SwizzleSubrect(regs.line_length_in, regs.line_count, regs.pitch_in, width, bytes_per_pixel,
207 write_buffer.data() + dst_layer_size * dst_params.layer, read_buffer.data(),
208 block_height, dst_params.origin.x, dst_params.origin.y);
209 }
210 235
211 memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size); 236 memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
212} 237}
213 238
214void MaxwellDMA::FastCopyBlockLinearToPitch() { 239void MaxwellDMA::FastCopyBlockLinearToPitch() {
215 const u32 bytes_per_pixel = 240 const u32 bytes_per_pixel = 1U;
216 regs.launch_dma.remap_enable ? regs.pitch_out / regs.line_length_in : 1;
217 const size_t src_size = GOB_SIZE; 241 const size_t src_size = GOB_SIZE;
218 const size_t dst_size = static_cast<size_t>(regs.pitch_out) * regs.line_count; 242 const size_t dst_size = static_cast<size_t>(regs.pitch_out) * regs.line_count;
219 u32 pos_x = regs.src_params.origin.x; 243 u32 pos_x = regs.src_params.origin.x;
@@ -239,9 +263,10 @@ void MaxwellDMA::FastCopyBlockLinearToPitch() {
239 memory_manager.ReadBlockUnsafe(regs.offset_out, write_buffer.data(), dst_size); 263 memory_manager.ReadBlockUnsafe(regs.offset_out, write_buffer.data(), dst_size);
240 } 264 }
241 265
242 UnswizzleSubrect(regs.line_length_in, regs.line_count, regs.pitch_out, regs.src_params.width, 266 UnswizzleSubrect(write_buffer, read_buffer, bytes_per_pixel, regs.src_params.width,
243 bytes_per_pixel, regs.src_params.block_size.height, pos_x, pos_y, 267 regs.src_params.height, 1, pos_x, pos_y, regs.line_length_in, regs.line_count,
244 write_buffer.data(), read_buffer.data()); 268 regs.src_params.block_size.height, regs.src_params.block_size.depth,
269 regs.pitch_out);
245 270
246 memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size); 271 memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
247} 272}
diff --git a/src/video_core/engines/maxwell_dma.h b/src/video_core/engines/maxwell_dma.h
index 074bac92c..9c5d567a6 100644
--- a/src/video_core/engines/maxwell_dma.h
+++ b/src/video_core/engines/maxwell_dma.h
@@ -189,10 +189,16 @@ public:
189 BitField<4, 3, Swizzle> dst_y; 189 BitField<4, 3, Swizzle> dst_y;
190 BitField<8, 3, Swizzle> dst_z; 190 BitField<8, 3, Swizzle> dst_z;
191 BitField<12, 3, Swizzle> dst_w; 191 BitField<12, 3, Swizzle> dst_w;
192 BitField<0, 12, u32> dst_components_raw;
192 BitField<16, 2, u32> component_size_minus_one; 193 BitField<16, 2, u32> component_size_minus_one;
193 BitField<20, 2, u32> num_src_components_minus_one; 194 BitField<20, 2, u32> num_src_components_minus_one;
194 BitField<24, 2, u32> num_dst_components_minus_one; 195 BitField<24, 2, u32> num_dst_components_minus_one;
195 }; 196 };
197
198 Swizzle GetComponent(size_t i) {
199 const u32 raw = dst_components_raw;
200 return static_cast<Swizzle>((raw >> (i * 3)) & 0x7);
201 }
196 }; 202 };
197 static_assert(sizeof(RemapConst) == 12); 203 static_assert(sizeof(RemapConst) == 12);
198 204
diff --git a/src/video_core/host1x/vic.cpp b/src/video_core/host1x/vic.cpp
index 5d8039841..b9ac41529 100644
--- a/src/video_core/host1x/vic.cpp
+++ b/src/video_core/host1x/vic.cpp
@@ -156,8 +156,9 @@ void Vic::WriteRGBFrame(const AVFrame* frame, const VicConfig& config) {
156 const u32 block_height = static_cast<u32>(config.block_linear_height_log2); 156 const u32 block_height = static_cast<u32>(config.block_linear_height_log2);
157 const auto size = Texture::CalculateSize(true, 4, width, height, 1, block_height, 0); 157 const auto size = Texture::CalculateSize(true, 4, width, height, 1, block_height, 0);
158 luma_buffer.resize(size); 158 luma_buffer.resize(size);
159 Texture::SwizzleSubrect(width, height, width * 4, width, 4, luma_buffer.data(), 159 std::span<const u8> frame_buff(converted_frame_buf_addr, 4 * width * height);
160 converted_frame_buf_addr, block_height, 0, 0); 160 Texture::SwizzleSubrect(luma_buffer, frame_buff, 4, width, height, 1,
161 0, 0, width, height, block_height, 0, width * 4);
161 162
162 host1x.MemoryManager().WriteBlock(output_surface_luma_address, luma_buffer.data(), size); 163 host1x.MemoryManager().WriteBlock(output_surface_luma_address, luma_buffer.data(), size);
163 } else { 164 } else {
diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp
index 4e52ce0fd..4a692448e 100644
--- a/src/video_core/memory_manager.cpp
+++ b/src/video_core/memory_manager.cpp
@@ -462,6 +462,97 @@ void MemoryManager::FlushRegion(GPUVAddr gpu_addr, size_t size) const {
462 MemoryOperation<true>(gpu_addr, size, mapped_big, do_nothing, flush_short_pages); 462 MemoryOperation<true>(gpu_addr, size, mapped_big, do_nothing, flush_short_pages);
463} 463}
464 464
465bool MemoryManager::IsMemoryDirty(GPUVAddr gpu_addr, size_t size) const {
466 bool result = false;
467 auto do_nothing = [&]([[maybe_unused]] std::size_t page_index,
468 [[maybe_unused]] std::size_t offset,
469 [[maybe_unused]] std::size_t copy_amount) { return false; };
470
471 auto mapped_normal = [&](std::size_t page_index, std::size_t offset, std::size_t copy_amount) {
472 const VAddr cpu_addr_base =
473 (static_cast<VAddr>(page_table[page_index]) << cpu_page_bits) + offset;
474 result |= rasterizer->MustFlushRegion(cpu_addr_base, copy_amount);
475 return result;
476 };
477 auto mapped_big = [&](std::size_t page_index, std::size_t offset, std::size_t copy_amount) {
478 const VAddr cpu_addr_base =
479 (static_cast<VAddr>(big_page_table_cpu[page_index]) << cpu_page_bits) + offset;
480 result |= rasterizer->MustFlushRegion(cpu_addr_base, copy_amount);
481 return result;
482 };
483 auto check_short_pages = [&](std::size_t page_index, std::size_t offset,
484 std::size_t copy_amount) {
485 GPUVAddr base = (page_index << big_page_bits) + offset;
486 MemoryOperation<false>(base, copy_amount, mapped_normal, do_nothing, do_nothing);
487 return result;
488 };
489 MemoryOperation<true>(gpu_addr, size, mapped_big, do_nothing, check_short_pages);
490 return result;
491}
492
493size_t MemoryManager::MaxContinousRange(GPUVAddr gpu_addr, size_t size) const {
494 std::optional<VAddr> old_page_addr{};
495 size_t range_so_far = 0;
496 bool result{false};
497 auto fail = [&]([[maybe_unused]] std::size_t page_index, [[maybe_unused]] std::size_t offset,
498 std::size_t copy_amount) {
499 result = true;
500 return true;
501 };
502 auto short_check = [&](std::size_t page_index, std::size_t offset, std::size_t copy_amount) {
503 const VAddr cpu_addr_base =
504 (static_cast<VAddr>(page_table[page_index]) << cpu_page_bits) + offset;
505 if (old_page_addr && *old_page_addr != cpu_addr_base) {
506 result = true;
507 return true;
508 }
509 range_so_far += copy_amount;
510 old_page_addr = {cpu_addr_base + copy_amount};
511 return false;
512 };
513 auto big_check = [&](std::size_t page_index, std::size_t offset, std::size_t copy_amount) {
514 const VAddr cpu_addr_base =
515 (static_cast<VAddr>(big_page_table_cpu[page_index]) << cpu_page_bits) + offset;
516 if (old_page_addr && *old_page_addr != cpu_addr_base) {
517 return true;
518 }
519 range_so_far += copy_amount;
520 old_page_addr = {cpu_addr_base + copy_amount};
521 return false;
522 };
523 auto check_short_pages = [&](std::size_t page_index, std::size_t offset,
524 std::size_t copy_amount) {
525 GPUVAddr base = (page_index << big_page_bits) + offset;
526 MemoryOperation<false>(base, copy_amount, short_check, fail, fail);
527 return result;
528 };
529 MemoryOperation<true>(gpu_addr, size, big_check, fail, check_short_pages);
530 return range_so_far;
531}
532
533void MemoryManager::InvalidateRegion(GPUVAddr gpu_addr, size_t size) const {
534 auto do_nothing = [&]([[maybe_unused]] std::size_t page_index,
535 [[maybe_unused]] std::size_t offset,
536 [[maybe_unused]] std::size_t copy_amount) {};
537
538 auto mapped_normal = [&](std::size_t page_index, std::size_t offset, std::size_t copy_amount) {
539 const VAddr cpu_addr_base =
540 (static_cast<VAddr>(page_table[page_index]) << cpu_page_bits) + offset;
541 rasterizer->InvalidateRegion(cpu_addr_base, copy_amount);
542 };
543 auto mapped_big = [&](std::size_t page_index, std::size_t offset, std::size_t copy_amount) {
544 const VAddr cpu_addr_base =
545 (static_cast<VAddr>(big_page_table_cpu[page_index]) << cpu_page_bits) + offset;
546 rasterizer->InvalidateRegion(cpu_addr_base, copy_amount);
547 };
548 auto invalidate_short_pages = [&](std::size_t page_index, std::size_t offset,
549 std::size_t copy_amount) {
550 GPUVAddr base = (page_index << big_page_bits) + offset;
551 MemoryOperation<false>(base, copy_amount, mapped_normal, do_nothing, do_nothing);
552 };
553 MemoryOperation<true>(gpu_addr, size, mapped_big, do_nothing, invalidate_short_pages);
554}
555
465void MemoryManager::CopyBlock(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, std::size_t size) { 556void MemoryManager::CopyBlock(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, std::size_t size) {
466 std::vector<u8> tmp_buffer(size); 557 std::vector<u8> tmp_buffer(size);
467 ReadBlock(gpu_src_addr, tmp_buffer.data(), size); 558 ReadBlock(gpu_src_addr, tmp_buffer.data(), size);
diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h
index 8f8877a92..9c08edc20 100644
--- a/src/video_core/memory_manager.h
+++ b/src/video_core/memory_manager.h
@@ -104,6 +104,12 @@ public:
104 104
105 void FlushRegion(GPUVAddr gpu_addr, size_t size) const; 105 void FlushRegion(GPUVAddr gpu_addr, size_t size) const;
106 106
107 void InvalidateRegion(GPUVAddr gpu_addr, size_t size) const;
108
109 bool IsMemoryDirty(GPUVAddr gpu_addr, size_t size) const;
110
111 size_t MaxContinousRange(GPUVAddr gpu_addr, size_t size) const;
112
107private: 113private:
108 template <bool is_big_pages, typename FuncMapped, typename FuncReserved, typename FuncUnmapped> 114 template <bool is_big_pages, typename FuncMapped, typename FuncReserved, typename FuncUnmapped>
109 inline void MemoryOperation(GPUVAddr gpu_src_addr, std::size_t size, FuncMapped&& func_mapped, 115 inline void MemoryOperation(GPUVAddr gpu_src_addr, std::size_t size, FuncMapped&& func_mapped,
diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h
index cb07f3d38..d2d40884c 100644
--- a/src/video_core/rasterizer_interface.h
+++ b/src/video_core/rasterizer_interface.h
@@ -129,7 +129,7 @@ public:
129 [[nodiscard]] virtual Tegra::Engines::AccelerateDMAInterface& AccessAccelerateDMA() = 0; 129 [[nodiscard]] virtual Tegra::Engines::AccelerateDMAInterface& AccessAccelerateDMA() = 0;
130 130
131 virtual void AccelerateInlineToMemory(GPUVAddr address, size_t copy_size, 131 virtual void AccelerateInlineToMemory(GPUVAddr address, size_t copy_size,
132 std::span<u8> memory) = 0; 132 std::span<const u8> memory) = 0;
133 133
134 /// Attempt to use a faster method to display the framebuffer to screen 134 /// Attempt to use a faster method to display the framebuffer to screen
135 [[nodiscard]] virtual bool AccelerateDisplay(const Tegra::FramebufferConfig& config, 135 [[nodiscard]] virtual bool AccelerateDisplay(const Tegra::FramebufferConfig& config,
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 02bb17715..c2d80605d 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -476,7 +476,7 @@ Tegra::Engines::AccelerateDMAInterface& RasterizerOpenGL::AccessAccelerateDMA()
476} 476}
477 477
478void RasterizerOpenGL::AccelerateInlineToMemory(GPUVAddr address, size_t copy_size, 478void RasterizerOpenGL::AccelerateInlineToMemory(GPUVAddr address, size_t copy_size,
479 std::span<u8> memory) { 479 std::span<const u8> memory) {
480 auto cpu_addr = gpu_memory->GpuToCpuAddress(address); 480 auto cpu_addr = gpu_memory->GpuToCpuAddress(address);
481 if (!cpu_addr) [[unlikely]] { 481 if (!cpu_addr) [[unlikely]] {
482 gpu_memory->WriteBlock(address, memory.data(), copy_size); 482 gpu_memory->WriteBlock(address, memory.data(), copy_size);
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index fe0ba979a..45131b785 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -99,7 +99,7 @@ public:
99 const Tegra::Engines::Fermi2D::Config& copy_config) override; 99 const Tegra::Engines::Fermi2D::Config& copy_config) override;
100 Tegra::Engines::AccelerateDMAInterface& AccessAccelerateDMA() override; 100 Tegra::Engines::AccelerateDMAInterface& AccessAccelerateDMA() override;
101 void AccelerateInlineToMemory(GPUVAddr address, size_t copy_size, 101 void AccelerateInlineToMemory(GPUVAddr address, size_t copy_size,
102 std::span<u8> memory) override; 102 std::span<const u8> memory) override;
103 bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr, 103 bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr,
104 u32 pixel_stride) override; 104 u32 pixel_stride) override;
105 void LoadDiskResources(u64 title_id, std::stop_token stop_loading, 105 void LoadDiskResources(u64 title_id, std::stop_token stop_loading,
diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp
index f17a5ccd6..241d7573e 100644
--- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp
+++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp
@@ -26,8 +26,6 @@
26 26
27namespace Vulkan { 27namespace Vulkan {
28 28
29using Tegra::Texture::SWIZZLE_TABLE;
30
31namespace { 29namespace {
32 30
33constexpr u32 ASTC_BINDING_INPUT_BUFFER = 0; 31constexpr u32 ASTC_BINDING_INPUT_BUFFER = 0;
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index a35e41199..acfd5da7d 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -548,7 +548,7 @@ Tegra::Engines::AccelerateDMAInterface& RasterizerVulkan::AccessAccelerateDMA()
548} 548}
549 549
550void RasterizerVulkan::AccelerateInlineToMemory(GPUVAddr address, size_t copy_size, 550void RasterizerVulkan::AccelerateInlineToMemory(GPUVAddr address, size_t copy_size,
551 std::span<u8> memory) { 551 std::span<const u8> memory) {
552 auto cpu_addr = gpu_memory->GpuToCpuAddress(address); 552 auto cpu_addr = gpu_memory->GpuToCpuAddress(address);
553 if (!cpu_addr) [[unlikely]] { 553 if (!cpu_addr) [[unlikely]] {
554 gpu_memory->WriteBlock(address, memory.data(), copy_size); 554 gpu_memory->WriteBlock(address, memory.data(), copy_size);
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h
index fb9e83e8f..4cde3c983 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.h
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.h
@@ -95,7 +95,7 @@ public:
95 const Tegra::Engines::Fermi2D::Config& copy_config) override; 95 const Tegra::Engines::Fermi2D::Config& copy_config) override;
96 Tegra::Engines::AccelerateDMAInterface& AccessAccelerateDMA() override; 96 Tegra::Engines::AccelerateDMAInterface& AccessAccelerateDMA() override;
97 void AccelerateInlineToMemory(GPUVAddr address, size_t copy_size, 97 void AccelerateInlineToMemory(GPUVAddr address, size_t copy_size,
98 std::span<u8> memory) override; 98 std::span<const u8> memory) override;
99 bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr, 99 bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr,
100 u32 pixel_stride) override; 100 u32 pixel_stride) override;
101 void LoadDiskResources(u64 title_id, std::stop_token stop_loading, 101 void LoadDiskResources(u64 title_id, std::stop_token stop_loading,
diff --git a/src/video_core/texture_cache/util.cpp b/src/video_core/texture_cache/util.cpp
index bea1c27d0..1223df5a0 100644
--- a/src/video_core/texture_cache/util.cpp
+++ b/src/video_core/texture_cache/util.cpp
@@ -517,7 +517,6 @@ void SwizzleBlockLinearImage(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr
517 const u32 host_bytes_per_layer = num_blocks_per_layer * bytes_per_block; 517 const u32 host_bytes_per_layer = num_blocks_per_layer * bytes_per_block;
518 518
519 UNIMPLEMENTED_IF(info.tile_width_spacing > 0); 519 UNIMPLEMENTED_IF(info.tile_width_spacing > 0);
520
521 UNIMPLEMENTED_IF(copy.image_offset.x != 0); 520 UNIMPLEMENTED_IF(copy.image_offset.x != 0);
522 UNIMPLEMENTED_IF(copy.image_offset.y != 0); 521 UNIMPLEMENTED_IF(copy.image_offset.y != 0);
523 UNIMPLEMENTED_IF(copy.image_offset.z != 0); 522 UNIMPLEMENTED_IF(copy.image_offset.z != 0);
diff --git a/src/video_core/textures/decoders.cpp b/src/video_core/textures/decoders.cpp
index 913f8ebcb..fcc636e0b 100644
--- a/src/video_core/textures/decoders.cpp
+++ b/src/video_core/textures/decoders.cpp
@@ -89,6 +89,69 @@ void SwizzleImpl(std::span<u8> output, std::span<const u8> input, u32 width, u32
89 } 89 }
90} 90}
91 91
92template <bool TO_LINEAR, u32 BYTES_PER_PIXEL>
93void SwizzleSubrectImpl(std::span<u8> output, std::span<const u8> input, u32 width, u32 height,
94 u32 depth, u32 origin_x, u32 origin_y, u32 extent_x, u32 num_lines,
95 u32 block_height, u32 block_depth, u32 pitch_linear) {
96 // The origin of the transformation can be configured here; leave it as zero, since the
97 // current API doesn't expose it.
98 static constexpr u32 origin_z = 0;
99
100 // A custom pitch can be configured here.
101 // Since it's not exposed, 'width * BYTES_PER_PIXEL' is the expected pitch.
102 const u32 pitch = pitch_linear;
103 const u32 stride = Common::AlignUpLog2(width * BYTES_PER_PIXEL, GOB_SIZE_X_SHIFT);
104
105 const u32 gobs_in_x = Common::DivCeilLog2(stride, GOB_SIZE_X_SHIFT);
106 const u32 block_size = gobs_in_x << (GOB_SIZE_SHIFT + block_height + block_depth);
107 const u32 slice_size =
108 Common::DivCeilLog2(height, block_height + GOB_SIZE_Y_SHIFT) * block_size;
109
110 const u32 block_height_mask = (1U << block_height) - 1;
111 const u32 block_depth_mask = (1U << block_depth) - 1;
112 const u32 x_shift = GOB_SIZE_SHIFT + block_height + block_depth;
113
114 u32 unprocessed_lines = num_lines;
115 u32 extent_y = std::min(num_lines, height - origin_y);
116
117 for (u32 slice = 0; slice < depth; ++slice) {
118 const u32 z = slice + origin_z;
119 const u32 offset_z = (z >> block_depth) * slice_size +
120 ((z & block_depth_mask) << (GOB_SIZE_SHIFT + block_height));
121 const u32 lines_in_y = std::min(unprocessed_lines, extent_y);
122 for (u32 line = 0; line < lines_in_y; ++line) {
123 const u32 y = line + origin_y;
124 const u32 swizzled_y = pdep<SWIZZLE_Y_BITS>(y);
125
126 const u32 block_y = y >> GOB_SIZE_Y_SHIFT;
127 const u32 offset_y = (block_y >> block_height) * block_size +
128 ((block_y & block_height_mask) << GOB_SIZE_SHIFT);
129
130 u32 swizzled_x = pdep<SWIZZLE_X_BITS>(origin_x * BYTES_PER_PIXEL);
131 for (u32 column = 0; column < extent_x;
132 ++column, incrpdep<SWIZZLE_X_BITS, BYTES_PER_PIXEL>(swizzled_x)) {
133 const u32 x = (column + origin_x) * BYTES_PER_PIXEL;
134 const u32 offset_x = (x >> GOB_SIZE_X_SHIFT) << x_shift;
135
136 const u32 base_swizzled_offset = offset_z + offset_y + offset_x;
137 const u32 swizzled_offset = base_swizzled_offset + (swizzled_x | swizzled_y);
138
139 const u32 unswizzled_offset =
140 slice * pitch * height + line * pitch + column * BYTES_PER_PIXEL;
141
142 u8* const dst = &output[TO_LINEAR ? swizzled_offset : unswizzled_offset];
143 const u8* const src = &input[TO_LINEAR ? unswizzled_offset : swizzled_offset];
144
145 std::memcpy(dst, src, BYTES_PER_PIXEL);
146 }
147 }
148 unprocessed_lines -= lines_in_y;
149 if (unprocessed_lines == 0) {
150 return;
151 }
152 }
153}
154
92template <bool TO_LINEAR> 155template <bool TO_LINEAR>
93void Swizzle(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, u32 width, 156void Swizzle(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, u32 width,
94 u32 height, u32 depth, u32 block_height, u32 block_depth, u32 stride_alignment) { 157 u32 height, u32 depth, u32 block_height, u32 block_depth, u32 stride_alignment) {
@@ -111,97 +174,6 @@ void Swizzle(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixe
111 } 174 }
112} 175}
113 176
114template <u32 BYTES_PER_PIXEL>
115void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width,
116 u8* swizzled_data, const u8* unswizzled_data, u32 block_height_bit,
117 u32 offset_x, u32 offset_y) {
118 const u32 block_height = 1U << block_height_bit;
119 const u32 image_width_in_gobs =
120 (swizzled_width * BYTES_PER_PIXEL + (GOB_SIZE_X - 1)) / GOB_SIZE_X;
121 for (u32 line = 0; line < subrect_height; ++line) {
122 const u32 dst_y = line + offset_y;
123 const u32 gob_address_y =
124 (dst_y / (GOB_SIZE_Y * block_height)) * GOB_SIZE * block_height * image_width_in_gobs +
125 ((dst_y % (GOB_SIZE_Y * block_height)) / GOB_SIZE_Y) * GOB_SIZE;
126
127 const u32 swizzled_y = pdep<SWIZZLE_Y_BITS>(dst_y);
128 u32 swizzled_x = pdep<SWIZZLE_X_BITS>(offset_x * BYTES_PER_PIXEL);
129 for (u32 x = 0; x < subrect_width;
130 ++x, incrpdep<SWIZZLE_X_BITS, BYTES_PER_PIXEL>(swizzled_x)) {
131 const u32 dst_x = x + offset_x;
132 const u32 gob_address =
133 gob_address_y + (dst_x * BYTES_PER_PIXEL / GOB_SIZE_X) * GOB_SIZE * block_height;
134 const u32 swizzled_offset = gob_address + (swizzled_x | swizzled_y);
135 const u32 unswizzled_offset = line * source_pitch + x * BYTES_PER_PIXEL;
136
137 const u8* const source_line = unswizzled_data + unswizzled_offset;
138 u8* const dest_addr = swizzled_data + swizzled_offset;
139 std::memcpy(dest_addr, source_line, BYTES_PER_PIXEL);
140 }
141 }
142}
143
144template <u32 BYTES_PER_PIXEL>
145void UnswizzleSubrect(u32 line_length_in, u32 line_count, u32 pitch, u32 width, u32 block_height,
146 u32 origin_x, u32 origin_y, u8* output, const u8* input) {
147 const u32 stride = width * BYTES_PER_PIXEL;
148 const u32 gobs_in_x = (stride + GOB_SIZE_X - 1) / GOB_SIZE_X;
149 const u32 block_size = gobs_in_x << (GOB_SIZE_SHIFT + block_height);
150
151 const u32 block_height_mask = (1U << block_height) - 1;
152 const u32 x_shift = GOB_SIZE_SHIFT + block_height;
153
154 for (u32 line = 0; line < line_count; ++line) {
155 const u32 src_y = line + origin_y;
156 const u32 swizzled_y = pdep<SWIZZLE_Y_BITS>(src_y);
157
158 const u32 block_y = src_y >> GOB_SIZE_Y_SHIFT;
159 const u32 src_offset_y = (block_y >> block_height) * block_size +
160 ((block_y & block_height_mask) << GOB_SIZE_SHIFT);
161
162 u32 swizzled_x = pdep<SWIZZLE_X_BITS>(origin_x * BYTES_PER_PIXEL);
163 for (u32 column = 0; column < line_length_in;
164 ++column, incrpdep<SWIZZLE_X_BITS, BYTES_PER_PIXEL>(swizzled_x)) {
165 const u32 src_x = (column + origin_x) * BYTES_PER_PIXEL;
166 const u32 src_offset_x = (src_x >> GOB_SIZE_X_SHIFT) << x_shift;
167
168 const u32 swizzled_offset = src_offset_y + src_offset_x + (swizzled_x | swizzled_y);
169 const u32 unswizzled_offset = line * pitch + column * BYTES_PER_PIXEL;
170
171 std::memcpy(output + unswizzled_offset, input + swizzled_offset, BYTES_PER_PIXEL);
172 }
173 }
174}
175
176template <u32 BYTES_PER_PIXEL>
177void SwizzleSliceToVoxel(u32 line_length_in, u32 line_count, u32 pitch, u32 width, u32 height,
178 u32 block_height, u32 block_depth, u32 origin_x, u32 origin_y, u8* output,
179 const u8* input) {
180 UNIMPLEMENTED_IF(origin_x > 0);
181 UNIMPLEMENTED_IF(origin_y > 0);
182
183 const u32 stride = width * BYTES_PER_PIXEL;
184 const u32 gobs_in_x = (stride + GOB_SIZE_X - 1) / GOB_SIZE_X;
185 const u32 block_size = gobs_in_x << (GOB_SIZE_SHIFT + block_height + block_depth);
186
187 const u32 block_height_mask = (1U << block_height) - 1;
188 const u32 x_shift = static_cast<u32>(GOB_SIZE_SHIFT) + block_height + block_depth;
189
190 for (u32 line = 0; line < line_count; ++line) {
191 const u32 swizzled_y = pdep<SWIZZLE_Y_BITS>(line);
192 const u32 block_y = line / GOB_SIZE_Y;
193 const u32 dst_offset_y =
194 (block_y >> block_height) * block_size + (block_y & block_height_mask) * GOB_SIZE;
195
196 u32 swizzled_x = 0;
197 for (u32 x = 0; x < line_length_in; ++x, incrpdep<SWIZZLE_X_BITS, 1>(swizzled_x)) {
198 const u32 dst_offset =
199 ((x / GOB_SIZE_X) << x_shift) + dst_offset_y + (swizzled_x | swizzled_y);
200 const u32 src_offset = x * BYTES_PER_PIXEL + line * pitch;
201 std::memcpy(output + dst_offset, input + src_offset, BYTES_PER_PIXEL);
202 }
203 }
204}
205} // Anonymous namespace 177} // Anonymous namespace
206 178
207void UnswizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, 179void UnswizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel,
@@ -218,15 +190,15 @@ void SwizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_p
218 stride_alignment); 190 stride_alignment);
219} 191}
220 192
221void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width, 193void SwizzleSubrect(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, u32 width,
222 u32 bytes_per_pixel, u8* swizzled_data, const u8* unswizzled_data, 194 u32 height, u32 depth, u32 origin_x, u32 origin_y, u32 extent_x, u32 extent_y,
223 u32 block_height_bit, u32 offset_x, u32 offset_y) { 195 u32 block_height, u32 block_depth, u32 pitch_linear) {
224 switch (bytes_per_pixel) { 196 switch (bytes_per_pixel) {
225#define BPP_CASE(x) \ 197#define BPP_CASE(x) \
226 case x: \ 198 case x: \
227 return SwizzleSubrect<x>(subrect_width, subrect_height, source_pitch, swizzled_width, \ 199 return SwizzleSubrectImpl<true, x>(output, input, width, height, depth, origin_x, \
228 swizzled_data, unswizzled_data, block_height_bit, offset_x, \ 200 origin_y, extent_x, extent_y, block_height, \
229 offset_y); 201 block_depth, pitch_linear);
230 BPP_CASE(1) 202 BPP_CASE(1)
231 BPP_CASE(2) 203 BPP_CASE(2)
232 BPP_CASE(3) 204 BPP_CASE(3)
@@ -241,13 +213,15 @@ void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32
241 } 213 }
242} 214}
243 215
244void UnswizzleSubrect(u32 line_length_in, u32 line_count, u32 pitch, u32 width, u32 bytes_per_pixel, 216void UnswizzleSubrect(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel,
245 u32 block_height, u32 origin_x, u32 origin_y, u8* output, const u8* input) { 217 u32 width, u32 height, u32 depth, u32 origin_x, u32 origin_y, u32 extent_x,
218 u32 extent_y, u32 block_height, u32 block_depth, u32 pitch_linear) {
246 switch (bytes_per_pixel) { 219 switch (bytes_per_pixel) {
247#define BPP_CASE(x) \ 220#define BPP_CASE(x) \
248 case x: \ 221 case x: \
249 return UnswizzleSubrect<x>(line_length_in, line_count, pitch, width, block_height, \ 222 return SwizzleSubrectImpl<false, x>(output, input, width, height, depth, origin_x, \
250 origin_x, origin_y, output, input); 223 origin_y, extent_x, extent_y, block_height, \
224 block_depth, pitch_linear);
251 BPP_CASE(1) 225 BPP_CASE(1)
252 BPP_CASE(2) 226 BPP_CASE(2)
253 BPP_CASE(3) 227 BPP_CASE(3)
@@ -262,55 +236,6 @@ void UnswizzleSubrect(u32 line_length_in, u32 line_count, u32 pitch, u32 width,
262 } 236 }
263} 237}
264 238
265void SwizzleSliceToVoxel(u32 line_length_in, u32 line_count, u32 pitch, u32 width, u32 height,
266 u32 bytes_per_pixel, u32 block_height, u32 block_depth, u32 origin_x,
267 u32 origin_y, u8* output, const u8* input) {
268 switch (bytes_per_pixel) {
269#define BPP_CASE(x) \
270 case x: \
271 return SwizzleSliceToVoxel<x>(line_length_in, line_count, pitch, width, height, \
272 block_height, block_depth, origin_x, origin_y, output, \
273 input);
274 BPP_CASE(1)
275 BPP_CASE(2)
276 BPP_CASE(3)
277 BPP_CASE(4)
278 BPP_CASE(6)
279 BPP_CASE(8)
280 BPP_CASE(12)
281 BPP_CASE(16)
282#undef BPP_CASE
283 default:
284 ASSERT_MSG(false, "Invalid bytes_per_pixel={}", bytes_per_pixel);
285 }
286}
287
288void SwizzleKepler(const u32 width, const u32 height, const u32 dst_x, const u32 dst_y,
289 const u32 block_height_bit, const std::size_t copy_size, const u8* source_data,
290 u8* swizzle_data) {
291 const u32 block_height = 1U << block_height_bit;
292 const u32 image_width_in_gobs{(width + GOB_SIZE_X - 1) / GOB_SIZE_X};
293 std::size_t count = 0;
294 for (std::size_t y = dst_y; y < height && count < copy_size; ++y) {
295 const std::size_t gob_address_y =
296 (y / (GOB_SIZE_Y * block_height)) * GOB_SIZE * block_height * image_width_in_gobs +
297 ((y % (GOB_SIZE_Y * block_height)) / GOB_SIZE_Y) * GOB_SIZE;
298 const u32 swizzled_y = pdep<SWIZZLE_Y_BITS>(static_cast<u32>(y));
299 u32 swizzled_x = pdep<SWIZZLE_X_BITS>(dst_x);
300 for (std::size_t x = dst_x; x < width && count < copy_size;
301 ++x, incrpdep<SWIZZLE_X_BITS, 1>(swizzled_x)) {
302 const std::size_t gob_address =
303 gob_address_y + (x / GOB_SIZE_X) * GOB_SIZE * block_height;
304 const std::size_t swizzled_offset = gob_address + (swizzled_x | swizzled_y);
305 const u8* source_line = source_data + count;
306 u8* dest_addr = swizzle_data + swizzled_offset;
307 count++;
308
309 *dest_addr = *source_line;
310 }
311 }
312}
313
314std::size_t CalculateSize(bool tiled, u32 bytes_per_pixel, u32 width, u32 height, u32 depth, 239std::size_t CalculateSize(bool tiled, u32 bytes_per_pixel, u32 width, u32 height, u32 depth,
315 u32 block_height, u32 block_depth) { 240 u32 block_height, u32 block_depth) {
316 if (tiled) { 241 if (tiled) {
diff --git a/src/video_core/textures/decoders.h b/src/video_core/textures/decoders.h
index 31a11708f..e70407692 100644
--- a/src/video_core/textures/decoders.h
+++ b/src/video_core/textures/decoders.h
@@ -40,7 +40,6 @@ constexpr SwizzleTable MakeSwizzleTable() {
40 } 40 }
41 return table; 41 return table;
42} 42}
43constexpr SwizzleTable SWIZZLE_TABLE = MakeSwizzleTable();
44 43
45/// Unswizzles a block linear texture into linear memory. 44/// Unswizzles a block linear texture into linear memory.
46void UnswizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, 45void UnswizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel,
@@ -57,34 +56,14 @@ std::size_t CalculateSize(bool tiled, u32 bytes_per_pixel, u32 width, u32 height
57 u32 block_height, u32 block_depth); 56 u32 block_height, u32 block_depth);
58 57
59/// Copies an untiled subrectangle into a tiled surface. 58/// Copies an untiled subrectangle into a tiled surface.
60void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width, 59void SwizzleSubrect(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, u32 width,
61 u32 bytes_per_pixel, u8* swizzled_data, const u8* unswizzled_data, 60 u32 height, u32 depth, u32 origin_x, u32 origin_y, u32 extent_x, u32 extent_y,
62 u32 block_height_bit, u32 offset_x, u32 offset_y); 61 u32 block_height, u32 block_depth, u32 pitch_linear);
63 62
64/// Copies a tiled subrectangle into a linear surface. 63/// Copies a tiled subrectangle into a linear surface.
65void UnswizzleSubrect(u32 line_length_in, u32 line_count, u32 pitch, u32 width, u32 bytes_per_pixel, 64void UnswizzleSubrect(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel,
66 u32 block_height, u32 origin_x, u32 origin_y, u8* output, const u8* input); 65 u32 width, u32 height, u32 depth, u32 origin_x, u32 origin_y, u32 extent_x,
67 66 u32 extent_y, u32 block_height, u32 block_depth, u32 pitch_linear);
68/// @brief Swizzles a 2D array of pixels into a 3D texture
69/// @param line_length_in Number of pixels per line
70/// @param line_count Number of lines
71/// @param pitch Number of bytes per line
72/// @param width Width of the swizzled texture
73/// @param height Height of the swizzled texture
74/// @param bytes_per_pixel Number of bytes used per pixel
75/// @param block_height Block height shift
76/// @param block_depth Block depth shift
77/// @param origin_x Column offset in pixels of the swizzled texture
78/// @param origin_y Row offset in pixels of the swizzled texture
79/// @param output Pointer to the pixels of the swizzled texture
80/// @param input Pointer to the 2D array of pixels used as input
81/// @pre input and output points to an array large enough to hold the number of bytes used
82void SwizzleSliceToVoxel(u32 line_length_in, u32 line_count, u32 pitch, u32 width, u32 height,
83 u32 bytes_per_pixel, u32 block_height, u32 block_depth, u32 origin_x,
84 u32 origin_y, u8* output, const u8* input);
85
86void SwizzleKepler(u32 width, u32 height, u32 dst_x, u32 dst_y, u32 block_height,
87 std::size_t copy_size, const u8* source_data, u8* swizzle_data);
88 67
89/// Obtains the offset of the gob for positions 'dst_x' & 'dst_y' 68/// Obtains the offset of the gob for positions 'dst_x' & 'dst_y'
90u64 GetGOBOffset(u32 width, u32 height, u32 dst_x, u32 dst_y, u32 block_height, 69u64 GetGOBOffset(u32 width, u32 height, u32 dst_x, u32 dst_y, u32 block_height,