summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author Gravatar yzct12345 2021-08-02 15:18:58 +0000
committer Gravatar GitHub 2021-08-02 11:18:58 -0400
commit f56d0db5bd9b352bbd33aa4524d642a13905a28c (patch)
tree f9559c8b1cda9dafb717f23691cedc1778319936 /src
parent game_list: Make game list folder icons smaller (#6762) (diff)
download yuzu-f56d0db5bd9b352bbd33aa4524d642a13905a28c.tar.gz
yuzu-f56d0db5bd9b352bbd33aa4524d642a13905a28c.tar.xz
yuzu-f56d0db5bd9b352bbd33aa4524d642a13905a28c.zip
decoders: Optimize swizzle copy performance (#6790)
This makes UnswizzleTexture up to two times faster. It is the main bottleneck in NVDEC video decoding.
Diffstat (limited to 'src')
-rw-r--r-- src/video_core/textures/decoders.cpp 52
1 files changed, 43 insertions, 9 deletions
diff --git a/src/video_core/textures/decoders.cpp b/src/video_core/textures/decoders.cpp
index f1f523ad1..c32ae956a 100644
--- a/src/video_core/textures/decoders.cpp
+++ b/src/video_core/textures/decoders.cpp
@@ -18,9 +18,9 @@
18 18
19namespace Tegra::Texture { 19namespace Tegra::Texture {
20namespace { 20namespace {
21template <bool TO_LINEAR> 21template <bool TO_LINEAR, u32 BYTES_PER_PIXEL>
22void Swizzle(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, u32 width, 22void SwizzleImpl(std::span<u8> output, std::span<const u8> input, u32 width, u32 height, u32 depth,
23 u32 height, u32 depth, u32 block_height, u32 block_depth, u32 stride_alignment) { 23 u32 block_height, u32 block_depth, u32 stride_alignment) {
24 // The origin of the transformation can be configured here, leave it as zero as the current API 24 // The origin of the transformation can be configured here, leave it as zero as the current API
25 // doesn't expose it. 25 // doesn't expose it.
26 static constexpr u32 origin_x = 0; 26 static constexpr u32 origin_x = 0;
@@ -28,9 +28,9 @@ void Swizzle(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixe
28 static constexpr u32 origin_z = 0; 28 static constexpr u32 origin_z = 0;
29 29
30 // We can configure here a custom pitch 30 // We can configure here a custom pitch
31 // As it's not exposed 'width * bpp' will be the expected pitch. 31 // As it's not exposed 'width * BYTES_PER_PIXEL' will be the expected pitch.
32 const u32 pitch = width * bytes_per_pixel; 32 const u32 pitch = width * BYTES_PER_PIXEL;
33 const u32 stride = Common::AlignUpLog2(width, stride_alignment) * bytes_per_pixel; 33 const u32 stride = Common::AlignUpLog2(width, stride_alignment) * BYTES_PER_PIXEL;
34 34
35 const u32 gobs_in_x = Common::DivCeilLog2(stride, GOB_SIZE_X_SHIFT); 35 const u32 gobs_in_x = Common::DivCeilLog2(stride, GOB_SIZE_X_SHIFT);
36 const u32 block_size = gobs_in_x << (GOB_SIZE_SHIFT + block_height + block_depth); 36 const u32 block_size = gobs_in_x << (GOB_SIZE_SHIFT + block_height + block_depth);
@@ -54,14 +54,14 @@ void Swizzle(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixe
54 ((block_y & block_height_mask) << GOB_SIZE_SHIFT); 54 ((block_y & block_height_mask) << GOB_SIZE_SHIFT);
55 55
56 for (u32 column = 0; column < width; ++column) { 56 for (u32 column = 0; column < width; ++column) {
57 const u32 x = (column + origin_x) * bytes_per_pixel; 57 const u32 x = (column + origin_x) * BYTES_PER_PIXEL;
58 const u32 offset_x = (x >> GOB_SIZE_X_SHIFT) << x_shift; 58 const u32 offset_x = (x >> GOB_SIZE_X_SHIFT) << x_shift;
59 59
60 const u32 base_swizzled_offset = offset_z + offset_y + offset_x; 60 const u32 base_swizzled_offset = offset_z + offset_y + offset_x;
61 const u32 swizzled_offset = base_swizzled_offset + table[x % GOB_SIZE_X]; 61 const u32 swizzled_offset = base_swizzled_offset + table[x % GOB_SIZE_X];
62 62
63 const u32 unswizzled_offset = 63 const u32 unswizzled_offset =
64 slice * pitch * height + line * pitch + column * bytes_per_pixel; 64 slice * pitch * height + line * pitch + column * BYTES_PER_PIXEL;
65 65
66 if (const auto offset = (TO_LINEAR ? unswizzled_offset : swizzled_offset); 66 if (const auto offset = (TO_LINEAR ? unswizzled_offset : swizzled_offset);
67 offset >= input.size()) { 67 offset >= input.size()) {
@@ -73,11 +73,45 @@ void Swizzle(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixe
73 73
74 u8* const dst = &output[TO_LINEAR ? swizzled_offset : unswizzled_offset]; 74 u8* const dst = &output[TO_LINEAR ? swizzled_offset : unswizzled_offset];
75 const u8* const src = &input[TO_LINEAR ? unswizzled_offset : swizzled_offset]; 75 const u8* const src = &input[TO_LINEAR ? unswizzled_offset : swizzled_offset];
76 std::memcpy(dst, src, bytes_per_pixel); 76
77 std::memcpy(dst, src, BYTES_PER_PIXEL);
77 } 78 }
78 } 79 }
79 } 80 }
80} 81}
82
83template <bool TO_LINEAR>
84void Swizzle(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, u32 width,
85 u32 height, u32 depth, u32 block_height, u32 block_depth, u32 stride_alignment) {
86 switch (bytes_per_pixel) {
87 case 1:
88 return SwizzleImpl<TO_LINEAR, 1>(output, input, width, height, depth, block_height,
89 block_depth, stride_alignment);
90 case 2:
91 return SwizzleImpl<TO_LINEAR, 2>(output, input, width, height, depth, block_height,
92 block_depth, stride_alignment);
93 case 3:
94 return SwizzleImpl<TO_LINEAR, 3>(output, input, width, height, depth, block_height,
95 block_depth, stride_alignment);
96 case 4:
97 return SwizzleImpl<TO_LINEAR, 4>(output, input, width, height, depth, block_height,
98 block_depth, stride_alignment);
99 case 6:
100 return SwizzleImpl<TO_LINEAR, 6>(output, input, width, height, depth, block_height,
101 block_depth, stride_alignment);
102 case 8:
103 return SwizzleImpl<TO_LINEAR, 8>(output, input, width, height, depth, block_height,
104 block_depth, stride_alignment);
105 case 12:
106 return SwizzleImpl<TO_LINEAR, 12>(output, input, width, height, depth, block_height,
107 block_depth, stride_alignment);
108 case 16:
109 return SwizzleImpl<TO_LINEAR, 16>(output, input, width, height, depth, block_height,
110 block_depth, stride_alignment);
111 default:
112 UNREACHABLE_MSG("Invalid bytes_per_pixel={}", bytes_per_pixel);
113 }
114}
81} // Anonymous namespace 115} // Anonymous namespace
82 116
83void UnswizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, 117void UnswizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel,