video_core: Rewrite the texture cache

The current texture cache has several points that hurt maintainability and performance. It's easy to break unrelated parts of the cache when doing minor changes. The cache can easily forget valuable information about the cached textures by CPU writes or simply by its normal usage. This commit aims to address those issues.
author: ReinUsesLisp 2020-12-30 02:25:23 -0300
committer: ReinUsesLisp 2020-12-30 03:38:50 -0300
commit: 9764c13d6d2977903f407761b27d847c0056e1c4 (patch)
tree: f6f5d6d6379b0404147969e7d1f548ed3d49ca01 /src/video_core/textures/decoders.cpp
parent: video_core: Add a delayed destruction ring abstraction (diff)
download: yuzu-9764c13d6d2977903f407761b27d847c0056e1c4.tar.gz
yuzu-9764c13d6d2977903f407761b27d847c0056e1c4.tar.xz
yuzu-9764c13d6d2977903f407761b27d847c0056e1c4.zip
1 files changed, 78 insertions, 171 deletions
diff --git a/src/video_core/textures/decoders.cpp b/src/video_core/textures/decoders.cpp
index 16d46a018..9f5181318 100644
--- a/src/video_core/textures/decoders.cpp
+++ b/src/video_core/textures/decoders.cpp
@@ -2,204 +2,111 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
+#include <array>
 #include <cmath>
 #include <cstring>
+#include <span>
+#include <utility>
 #include "common/alignment.h"
 #include "common/assert.h"
 #include "common/bit_util.h"
+#include "common/div_ceil.h"
 #include "video_core/gpu.h"
 #include "video_core/textures/decoders.h"
 #include "video_core/textures/texture.h"
 namespace Tegra::Texture {
-namespace {
+namespace {
 /**
- * This table represents the internal swizzle of a gob,
+ * This table represents the internal swizzle of a gob, in format 16 bytes x 2 sector packing.
- * in format 16 bytes x 2 sector packing.
 * Calculates the offset of an (x, y) position within a swizzled texture.
 * Taken from the Tegra X1 Technical Reference Manual. pages 1187-1188
 */
-template <std::size_t N, std::size_t M, u32 Align>
+constexpr SwizzleTable MakeSwizzleTableConst() {
-struct alignas(64) SwizzleTable {
+    SwizzleTable table{};
-    static_assert(M * Align == 64, "Swizzle Table does not align to GOB");
+    for (u32 y = 0; y < table.size(); ++y) {
-    constexpr SwizzleTable() {
+        for (u32 x = 0; x < table[0].size(); ++x) {
-        for (u32 y = 0; y < N; ++y) {
+            table[y][x] = ((x % 64) / 32) * 256 + ((y % 8) / 2) * 64 + ((x % 32) / 16) * 32 +
-            for (u32 x = 0; x < M; ++x) {
+                          (y % 2) * 16 + (x % 16);
-                const u32 x2 = x * Align;
-                values[y][x] = static_cast<u16>(((x2 % 64) / 32) * 256 + ((y % 8) / 2) * 64 +
-                                                ((x2 % 32) / 16) * 32 + (y % 2) * 16 + (x2 % 16));
-            }
        }
    }
-    const std::array<u16, M>& operator[](std::size_t index) const {
+    return table;
-        return values[index];
+}
-    }
-    std::array<std::array<u16, M>, N> values{};
-};
-constexpr u32 FAST_SWIZZLE_ALIGN = 16;
+constexpr SwizzleTable SWIZZLE_TABLE = MakeSwizzleTableConst();
-constexpr auto LEGACY_SWIZZLE_TABLE = SwizzleTable<GOB_SIZE_X, GOB_SIZE_X, GOB_SIZE_Z>();
+template <bool TO_LINEAR>
-constexpr auto FAST_SWIZZLE_TABLE = SwizzleTable<GOB_SIZE_Y, 4, FAST_SWIZZLE_ALIGN>();
+void Swizzle(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, u32 width,
+             u32 height, u32 depth, u32 block_height, u32 block_depth, u32 stride_alignment) {
+    // The origin of the transformation can be configured here, leave it as zero as the current API
+    // doesn't expose it.
+    static constexpr u32 origin_x = 0;
+    static constexpr u32 origin_y = 0;
+    static constexpr u32 origin_z = 0;
-/**
+    // We can configure here a custom pitch
- * This function manages ALL the GOBs(Group of Bytes) Inside a single block.
+    // As it's not exposed 'width * bpp' will be the expected pitch.
- * Instead of going gob by gob, we map the coordinates inside a block and manage from
+    const u32 pitch = width * bytes_per_pixel;
- * those. Block_Width is assumed to be 1.
+    const u32 stride = Common::AlignBits(width, stride_alignment) * bytes_per_pixel;
- */
-void PreciseProcessBlock(u8* const swizzled_data, u8* const unswizzled_data, const bool unswizzle,
-                         const u32 x_start, const u32 y_start, const u32 z_start, const u32 x_end,
-                         const u32 y_end, const u32 z_end, const u32 tile_offset,
-                         const u32 xy_block_size, const u32 layer_z, const u32 stride_x,
-                         const u32 bytes_per_pixel, const u32 out_bytes_per_pixel) {
-    std::array<u8*, 2> data_ptrs;
-    u32 z_address = tile_offset;
-    for (u32 z = z_start; z < z_end; z++) {
-        u32 y_address = z_address;
-        u32 pixel_base = layer_z * z + y_start * stride_x;
-        for (u32 y = y_start; y < y_end; y++) {
-            const auto& table = LEGACY_SWIZZLE_TABLE[y % GOB_SIZE_Y];
-            for (u32 x = x_start; x < x_end; x++) {
-                const u32 swizzle_offset{y_address + table[x * bytes_per_pixel % GOB_SIZE_X]};
-                const u32 pixel_index{x * out_bytes_per_pixel + pixel_base};
-                data_ptrs[unswizzle] = swizzled_data + swizzle_offset;
-                data_ptrs[!unswizzle] = unswizzled_data + pixel_index;
-                std::memcpy(data_ptrs[0], data_ptrs[1], bytes_per_pixel);
-            }
-            pixel_base += stride_x;
-            if ((y + 1) % GOB_SIZE_Y == 0)
-                y_address += GOB_SIZE;
-        }
-        z_address += xy_block_size;
-    }
-}
-/**
+    const u32 gobs_in_x = Common::DivCeilLog2(stride, GOB_SIZE_X_SHIFT);
- * This function manages ALL the GOBs(Group of Bytes) Inside a single block.
+    const u32 block_size = gobs_in_x << (GOB_SIZE_SHIFT + block_height + block_depth);
- * Instead of going gob by gob, we map the coordinates inside a block and manage from
+    const u32 slice_size =
- * those. Block_Width is assumed to be 1.
+        Common::DivCeilLog2(height, block_height + GOB_SIZE_Y_SHIFT) * block_size;
- */
-void FastProcessBlock(u8* const swizzled_data, u8* const unswizzled_data, const bool unswizzle,
-                      const u32 x_start, const u32 y_start, const u32 z_start, const u32 x_end,
-                      const u32 y_end, const u32 z_end, const u32 tile_offset,
-                      const u32 xy_block_size, const u32 layer_z, const u32 stride_x,
-                      const u32 bytes_per_pixel, const u32 out_bytes_per_pixel) {
-    std::array<u8*, 2> data_ptrs;
-    u32 z_address = tile_offset;
-    const u32 x_startb = x_start * bytes_per_pixel;
-    const u32 x_endb = x_end * bytes_per_pixel;
-    for (u32 z = z_start; z < z_end; z++) {
-        u32 y_address = z_address;
-        u32 pixel_base = layer_z * z + y_start * stride_x;
-        for (u32 y = y_start; y < y_end; y++) {
-            const auto& table = FAST_SWIZZLE_TABLE[y % GOB_SIZE_Y];
-            for (u32 xb = x_startb; xb < x_endb; xb += FAST_SWIZZLE_ALIGN) {
-                const u32 swizzle_offset{y_address + table[(xb / FAST_SWIZZLE_ALIGN) % 4]};
-                const u32 out_x = xb * out_bytes_per_pixel / bytes_per_pixel;
-                const u32 pixel_index{out_x + pixel_base};
-                data_ptrs[unswizzle ? 1 : 0] = swizzled_data + swizzle_offset;
-                data_ptrs[unswizzle ? 0 : 1] = unswizzled_data + pixel_index;
-                std::memcpy(data_ptrs[0], data_ptrs[1], FAST_SWIZZLE_ALIGN);
-            }
-            pixel_base += stride_x;
-            if ((y + 1) % GOB_SIZE_Y == 0)
-                y_address += GOB_SIZE;
-        }
-        z_address += xy_block_size;
-    }
-}
-/**
+    const u32 block_height_mask = (1U << block_height) - 1;
- * This function unswizzles or swizzles a texture by mapping Linear to BlockLinear Textue.
+    const u32 block_depth_mask = (1U << block_depth) - 1;
- * The body of this function takes care of splitting the swizzled texture into blocks,
+    const u32 x_shift = GOB_SIZE_SHIFT + block_height + block_depth;
- * and managing the extents of it. Once all the parameters of a single block are obtained,
- * the function calls 'ProcessBlock' to process that particular Block.
+    for (u32 slice = 0; slice < depth; ++slice) {
- *
+        const u32 z = slice + origin_z;
- * Documentation for the memory layout and decoding can be found at:
+        const u32 offset_z = (z >> block_depth) * slice_size +
- *  https://envytools.readthedocs.io/en/latest/hw/memory/g80-surface.html#blocklinear-surfaces
+                             ((z & block_depth_mask) << (GOB_SIZE_SHIFT + block_height));
- */
+        for (u32 line = 0; line < height; ++line) {
-template <bool fast>
+            const u32 y = line + origin_y;
-void SwizzledData(u8* const swizzled_data, u8* const unswizzled_data, const bool unswizzle,
+            const auto& table = SWIZZLE_TABLE[y % GOB_SIZE_Y];
-                  const u32 width, const u32 height, const u32 depth, const u32 bytes_per_pixel,
-                  const u32 out_bytes_per_pixel, const u32 block_height, const u32 block_depth,
+            const u32 block_y = y >> GOB_SIZE_Y_SHIFT;
-                  const u32 width_spacing) {
+            const u32 offset_y = (block_y >> block_height) * block_size +
-    auto div_ceil = [](const u32 x, const u32 y) { return ((x + y - 1) / y); };
+                                 ((block_y & block_height_mask) << GOB_SIZE_SHIFT);
-    const u32 stride_x = width * out_bytes_per_pixel;
-    const u32 layer_z = height * stride_x;
+            for (u32 column = 0; column < width; ++column) {
-    const u32 gob_elements_x = GOB_SIZE_X / bytes_per_pixel;
+                const u32 x = (column + origin_x) * bytes_per_pixel;
-    constexpr u32 gob_elements_y = GOB_SIZE_Y;
+                const u32 offset_x = (x >> GOB_SIZE_X_SHIFT) << x_shift;
-    constexpr u32 gob_elements_z = GOB_SIZE_Z;
-    const u32 block_x_elements = gob_elements_x;
+                const u32 base_swizzled_offset = offset_z + offset_y + offset_x;
-    const u32 block_y_elements = gob_elements_y * block_height;
+                const u32 swizzled_offset = base_swizzled_offset + table[x % GOB_SIZE_X];
-    const u32 block_z_elements = gob_elements_z * block_depth;
-    const u32 aligned_width = Common::AlignUp(width, gob_elements_x * width_spacing);
+                const u32 unswizzled_offset =
-    const u32 blocks_on_x = div_ceil(aligned_width, block_x_elements);
+                    slice * pitch * height + line * pitch + column * bytes_per_pixel;
-    const u32 blocks_on_y = div_ceil(height, block_y_elements);
-    const u32 blocks_on_z = div_ceil(depth, block_z_elements);
+                u8* const dst = &output[TO_LINEAR ? swizzled_offset : unswizzled_offset];
-    const u32 xy_block_size = GOB_SIZE * block_height;
+                const u8* const src = &input[TO_LINEAR ? unswizzled_offset : swizzled_offset];
-    const u32 block_size = xy_block_size * block_depth;
+                std::memcpy(dst, src, bytes_per_pixel);
-    u32 tile_offset = 0;
-    for (u32 zb = 0; zb < blocks_on_z; zb++) {
-        const u32 z_start = zb * block_z_elements;
-        const u32 z_end = std::min(depth, z_start + block_z_elements);
-        for (u32 yb = 0; yb < blocks_on_y; yb++) {
-            const u32 y_start = yb * block_y_elements;
-            const u32 y_end = std::min(height, y_start + block_y_elements);
-            for (u32 xb = 0; xb < blocks_on_x; xb++) {
-                const u32 x_start = xb * block_x_elements;
-                const u32 x_end = std::min(width, x_start + block_x_elements);
-                if constexpr (fast) {
-                    FastProcessBlock(swizzled_data, unswizzled_data, unswizzle, x_start, y_start,
-                                     z_start, x_end, y_end, z_end, tile_offset, xy_block_size,
-                                     layer_z, stride_x, bytes_per_pixel, out_bytes_per_pixel);
-                } else {
-                    PreciseProcessBlock(swizzled_data, unswizzled_data, unswizzle, x_start, y_start,
-                                        z_start, x_end, y_end, z_end, tile_offset, xy_block_size,
-                                        layer_z, stride_x, bytes_per_pixel, out_bytes_per_pixel);
-                }
-                tile_offset += block_size;
            }
        }
    }
 }
 } // Anonymous namespace
-void CopySwizzledData(u32 width, u32 height, u32 depth, u32 bytes_per_pixel,
+SwizzleTable MakeSwizzleTable() {
-                      u32 out_bytes_per_pixel, u8* const swizzled_data, u8* const unswizzled_data,
+    return SWIZZLE_TABLE;
-                      bool unswizzle, u32 block_height, u32 block_depth, u32 width_spacing) {
-    const u32 block_height_size{1U << block_height};
-    const u32 block_depth_size{1U << block_depth};
-    if (bytes_per_pixel % 3 != 0 && (width * bytes_per_pixel) % FAST_SWIZZLE_ALIGN == 0) {
-        SwizzledData<true>(swizzled_data, unswizzled_data, unswizzle, width, height, depth,
-                           bytes_per_pixel, out_bytes_per_pixel, block_height_size,
-                           block_depth_size, width_spacing);
-    } else {
-        SwizzledData<false>(swizzled_data, unswizzled_data, unswizzle, width, height, depth,
-                            bytes_per_pixel, out_bytes_per_pixel, block_height_size,
-                            block_depth_size, width_spacing);
-    }
 }
-void UnswizzleTexture(u8* const unswizzled_data, u8* address, u32 tile_size_x, u32 tile_size_y,
+void UnswizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel,
-                      u32 bytes_per_pixel, u32 width, u32 height, u32 depth, u32 block_height,
+                      u32 width, u32 height, u32 depth, u32 block_height, u32 block_depth,
-                      u32 block_depth, u32 width_spacing) {
+                      u32 stride_alignment) {
-    CopySwizzledData((width + tile_size_x - 1) / tile_size_x,
+    Swizzle<false>(output, input, bytes_per_pixel, width, height, depth, block_height, block_depth,
-                     (height + tile_size_y - 1) / tile_size_y, depth, bytes_per_pixel,
+                   stride_alignment);
-                     bytes_per_pixel, address, unswizzled_data, true, block_height, block_depth,
-                     width_spacing);
 }
-std::vector<u8> UnswizzleTexture(u8* address, u32 tile_size_x, u32 tile_size_y, u32 bytes_per_pixel,
+void SwizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, u32 width,
-                                 u32 width, u32 height, u32 depth, u32 block_height,
+                    u32 height, u32 depth, u32 block_height, u32 block_depth,
-                                 u32 block_depth, u32 width_spacing) {
+                    u32 stride_alignment) {
-    std::vector<u8> unswizzled_data(width * height * depth * bytes_per_pixel);
+    Swizzle<true>(output, input, bytes_per_pixel, width, height, depth, block_height, block_depth,
-    UnswizzleTexture(unswizzled_data.data(), address, tile_size_x, tile_size_y, bytes_per_pixel,
+                  stride_alignment);
-                     width, height, depth, block_height, block_depth, width_spacing);
-    return unswizzled_data;
 }
 void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width,
@@ -213,7 +120,7 @@ void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32
        const u32 gob_address_y =
            (dst_y / (GOB_SIZE_Y * block_height)) * GOB_SIZE * block_height * image_width_in_gobs +
            ((dst_y % (GOB_SIZE_Y * block_height)) / GOB_SIZE_Y) * GOB_SIZE;
-        const auto& table = LEGACY_SWIZZLE_TABLE[dst_y % GOB_SIZE_Y];
+        const auto& table = SWIZZLE_TABLE[dst_y % GOB_SIZE_Y];
        for (u32 x = 0; x < subrect_width; ++x) {
            const u32 dst_x = x + offset_x;
            const u32 gob_address =
@@ -235,11 +142,11 @@ void UnswizzleSubrect(u32 line_length_in, u32 line_count, u32 pitch, u32 width,
    const u32 block_size = gobs_in_x << (GOB_SIZE_SHIFT + block_height);
    const u32 block_height_mask = (1U << block_height) - 1;
-    const u32 x_shift = static_cast<u32>(GOB_SIZE_SHIFT) + block_height;
+    const u32 x_shift = GOB_SIZE_SHIFT + block_height;
    for (u32 line = 0; line < line_count; ++line) {
        const u32 src_y = line + origin_y;
-        const auto& table = LEGACY_SWIZZLE_TABLE[src_y % GOB_SIZE_Y];
+        const auto& table = SWIZZLE_TABLE[src_y % GOB_SIZE_Y];
        const u32 block_y = src_y >> GOB_SIZE_Y_SHIFT;
        const u32 src_offset_y = (block_y >> block_height) * block_size +
@@ -270,7 +177,7 @@ void SwizzleSliceToVoxel(u32 line_length_in, u32 line_count, u32 pitch, u32 widt
    const u32 x_shift = static_cast<u32>(GOB_SIZE_SHIFT) + block_height + block_depth;
    for (u32 line = 0; line < line_count; ++line) {
-        const auto& table = LEGACY_SWIZZLE_TABLE[line % GOB_SIZE_Y];
+        const auto& table = SWIZZLE_TABLE[line % GOB_SIZE_Y];
        const u32 block_y = line / GOB_SIZE_Y;
        const u32 dst_offset_y =
            (block_y >> block_height) * block_size + (block_y & block_height_mask) * GOB_SIZE;
@@ -293,7 +200,7 @@ void SwizzleKepler(const u32 width, const u32 height, const u32 dst_x, const u32
        const std::size_t gob_address_y =
            (y / (GOB_SIZE_Y * block_height)) * GOB_SIZE * block_height * image_width_in_gobs +
            ((y % (GOB_SIZE_Y * block_height)) / GOB_SIZE_Y) * GOB_SIZE;
-        const auto& table = LEGACY_SWIZZLE_TABLE[y % GOB_SIZE_Y];
+        const auto& table = SWIZZLE_TABLE[y % GOB_SIZE_Y];
        for (std::size_t x = dst_x; x < width && count < copy_size; ++x) {
            const std::size_t gob_address =
                gob_address_y + (x / GOB_SIZE_X) * GOB_SIZE * block_height;
author	ReinUsesLisp	2020-12-30 02:25:23 -0300
committer	ReinUsesLisp	2020-12-30 03:38:50 -0300
commit	9764c13d6d2977903f407761b27d847c0056e1c4 (patch)
tree	f6f5d6d6379b0404147969e7d1f548ed3d49ca01 /src/video_core/textures/decoders.cpp
parent	video_core: Add a delayed destruction ring abstraction (diff)
download	yuzu-9764c13d6d2977903f407761b27d847c0056e1c4.tar.gz yuzu-9764c13d6d2977903f407761b27d847c0056e1c4.tar.xz yuzu-9764c13d6d2977903f407761b27d847c0056e1c4.zip

diff --git a/src/video_core/textures/decoders.cpp b/src/video_core/textures/decoders.cpp index 16d46a018..9f5181318 100644 --- a/src/video_core/textures/decoders.cpp +++ b/src/video_core/textures/decoders.cpp
@@ -2,204 +2,111 @@
2	// Licensed under GPLv2 or any later version	2	// Licensed under GPLv2 or any later version
3	// Refer to the license.txt file included.	3	// Refer to the license.txt file included.
4		4
		5	#include <array>
5	#include <cmath>	6	#include <cmath>
6	#include <cstring>	7	#include <cstring>
		8	#include <span>
		9	#include <utility>
		10
7	#include "common/alignment.h"	11	#include "common/alignment.h"
8	#include "common/assert.h"	12	#include "common/assert.h"
9	#include "common/bit_util.h"	13	#include "common/bit_util.h"
		14	#include "common/div_ceil.h"
10	#include "video_core/gpu.h"	15	#include "video_core/gpu.h"
11	#include "video_core/textures/decoders.h"	16	#include "video_core/textures/decoders.h"
12	#include "video_core/textures/texture.h"	17	#include "video_core/textures/texture.h"
13		18
14	namespace Tegra::Texture {	19	namespace Tegra::Texture {
15	namespace {
16		20
		21	namespace {
17	/**	22	/**
18	* This table represents the internal swizzle of a gob,	23	* This table represents the internal swizzle of a gob, in format 16 bytes x 2 sector packing.
19	* in format 16 bytes x 2 sector packing.
20	* Calculates the offset of an (x, y) position within a swizzled texture.	24	* Calculates the offset of an (x, y) position within a swizzled texture.
21	* Taken from the Tegra X1 Technical Reference Manual. pages 1187-1188	25	* Taken from the Tegra X1 Technical Reference Manual. pages 1187-1188
22	*/	26	*/
23	template <std::size_t N, std::size_t M, u32 Align>	27	constexpr SwizzleTable MakeSwizzleTableConst() {
24	struct alignas(64) SwizzleTable {	28	SwizzleTable table{};
25	static_assert(M * Align == 64, "Swizzle Table does not align to GOB");	29	for (u32 y = 0; y < table.size(); ++y) {
26	constexpr SwizzleTable() {	30	for (u32 x = 0; x < table[0].size(); ++x) {
27	for (u32 y = 0; y < N; ++y) {	31	table[y][x] = ((x % 64) / 32) * 256 + ((y % 8) / 2) * 64 + ((x % 32) / 16) * 32 +
28	for (u32 x = 0; x < M; ++x) {	32	(y % 2) * 16 + (x % 16);
29	const u32 x2 = x * Align;
30	values[y][x] = static_cast<u16>(((x2 % 64) / 32) * 256 + ((y % 8) / 2) * 64 +
31	((x2 % 32) / 16) * 32 + (y % 2) * 16 + (x2 % 16));
32	}
33	}	33	}
34	}	34	}
35	const std::array<u16, M>& operator[](std::size_t index) const {	35	return table;
36	return values[index];	36	}
37	}
38	std::array<std::array<u16, M>, N> values{};
39	};
40		37
41	constexpr u32 FAST_SWIZZLE_ALIGN = 16;	38	constexpr SwizzleTable SWIZZLE_TABLE = MakeSwizzleTableConst();
42		39
43	constexpr auto LEGACY_SWIZZLE_TABLE = SwizzleTable<GOB_SIZE_X, GOB_SIZE_X, GOB_SIZE_Z>();	40	template <bool TO_LINEAR>
44	constexpr auto FAST_SWIZZLE_TABLE = SwizzleTable<GOB_SIZE_Y, 4, FAST_SWIZZLE_ALIGN>();	41	void Swizzle(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, u32 width,
		42	u32 height, u32 depth, u32 block_height, u32 block_depth, u32 stride_alignment) {
		43	// The origin of the transformation can be configured here, leave it as zero as the current API
		44	// doesn't expose it.
		45	static constexpr u32 origin_x = 0;
		46	static constexpr u32 origin_y = 0;
		47	static constexpr u32 origin_z = 0;
45		48
46	/**	49	// We can configure here a custom pitch
47	* This function manages ALL the GOBs(Group of Bytes) Inside a single block.	50	// As it's not exposed 'width * bpp' will be the expected pitch.
48	* Instead of going gob by gob, we map the coordinates inside a block and manage from	51	const u32 pitch = width * bytes_per_pixel;
49	* those. Block_Width is assumed to be 1.	52	const u32 stride = Common::AlignBits(width, stride_alignment) * bytes_per_pixel;
50	*/
51	void PreciseProcessBlock(u8* const swizzled_data, u8* const unswizzled_data, const bool unswizzle,
52	const u32 x_start, const u32 y_start, const u32 z_start, const u32 x_end,
53	const u32 y_end, const u32 z_end, const u32 tile_offset,
54	const u32 xy_block_size, const u32 layer_z, const u32 stride_x,
55	const u32 bytes_per_pixel, const u32 out_bytes_per_pixel) {
56	std::array<u8*, 2> data_ptrs;
57	u32 z_address = tile_offset;
58
59	for (u32 z = z_start; z < z_end; z++) {
60	u32 y_address = z_address;
61	u32 pixel_base = layer_z * z + y_start * stride_x;
62	for (u32 y = y_start; y < y_end; y++) {
63	const auto& table = LEGACY_SWIZZLE_TABLE[y % GOB_SIZE_Y];
64	for (u32 x = x_start; x < x_end; x++) {
65	const u32 swizzle_offset{y_address + table[x * bytes_per_pixel % GOB_SIZE_X]};
66	const u32 pixel_index{x * out_bytes_per_pixel + pixel_base};
67	data_ptrs[unswizzle] = swizzled_data + swizzle_offset;
68	data_ptrs[!unswizzle] = unswizzled_data + pixel_index;
69	std::memcpy(data_ptrs[0], data_ptrs[1], bytes_per_pixel);
70	}
71	pixel_base += stride_x;
72	if ((y + 1) % GOB_SIZE_Y == 0)
73	y_address += GOB_SIZE;
74	}
75	z_address += xy_block_size;
76	}
77	}
78		53
79	/**	54	const u32 gobs_in_x = Common::DivCeilLog2(stride, GOB_SIZE_X_SHIFT);
80	* This function manages ALL the GOBs(Group of Bytes) Inside a single block.	55	const u32 block_size = gobs_in_x << (GOB_SIZE_SHIFT + block_height + block_depth);
81	* Instead of going gob by gob, we map the coordinates inside a block and manage from	56	const u32 slice_size =
82	* those. Block_Width is assumed to be 1.	57	Common::DivCeilLog2(height, block_height + GOB_SIZE_Y_SHIFT) * block_size;
83	*/
84	void FastProcessBlock(u8* const swizzled_data, u8* const unswizzled_data, const bool unswizzle,
85	const u32 x_start, const u32 y_start, const u32 z_start, const u32 x_end,
86	const u32 y_end, const u32 z_end, const u32 tile_offset,
87	const u32 xy_block_size, const u32 layer_z, const u32 stride_x,
88	const u32 bytes_per_pixel, const u32 out_bytes_per_pixel) {
89	std::array<u8*, 2> data_ptrs;
90	u32 z_address = tile_offset;
91	const u32 x_startb = x_start * bytes_per_pixel;
92	const u32 x_endb = x_end * bytes_per_pixel;
93
94	for (u32 z = z_start; z < z_end; z++) {
95	u32 y_address = z_address;
96	u32 pixel_base = layer_z * z + y_start * stride_x;
97	for (u32 y = y_start; y < y_end; y++) {
98	const auto& table = FAST_SWIZZLE_TABLE[y % GOB_SIZE_Y];
99	for (u32 xb = x_startb; xb < x_endb; xb += FAST_SWIZZLE_ALIGN) {
100	const u32 swizzle_offset{y_address + table[(xb / FAST_SWIZZLE_ALIGN) % 4]};
101	const u32 out_x = xb * out_bytes_per_pixel / bytes_per_pixel;
102	const u32 pixel_index{out_x + pixel_base};
103	data_ptrs[unswizzle ? 1 : 0] = swizzled_data + swizzle_offset;
104	data_ptrs[unswizzle ? 0 : 1] = unswizzled_data + pixel_index;
105	std::memcpy(data_ptrs[0], data_ptrs[1], FAST_SWIZZLE_ALIGN);
106	}
107	pixel_base += stride_x;
108	if ((y + 1) % GOB_SIZE_Y == 0)
109	y_address += GOB_SIZE;
110	}
111	z_address += xy_block_size;
112	}
113	}
114		58
115	/**	59	const u32 block_height_mask = (1U << block_height) - 1;
116	* This function unswizzles or swizzles a texture by mapping Linear to BlockLinear Textue.	60	const u32 block_depth_mask = (1U << block_depth) - 1;
117	* The body of this function takes care of splitting the swizzled texture into blocks,	61	const u32 x_shift = GOB_SIZE_SHIFT + block_height + block_depth;
118	* and managing the extents of it. Once all the parameters of a single block are obtained,	62
119	* the function calls 'ProcessBlock' to process that particular Block.	63	for (u32 slice = 0; slice < depth; ++slice) {
120	*	64	const u32 z = slice + origin_z;
121	* Documentation for the memory layout and decoding can be found at:	65	const u32 offset_z = (z >> block_depth) * slice_size +
122	* https://envytools.readthedocs.io/en/latest/hw/memory/g80-surface.html#blocklinear-surfaces	66	((z & block_depth_mask) << (GOB_SIZE_SHIFT + block_height));
123	*/	67	for (u32 line = 0; line < height; ++line) {
124	template <bool fast>	68	const u32 y = line + origin_y;
125	void SwizzledData(u8* const swizzled_data, u8* const unswizzled_data, const bool unswizzle,	69	const auto& table = SWIZZLE_TABLE[y % GOB_SIZE_Y];
126	const u32 width, const u32 height, const u32 depth, const u32 bytes_per_pixel,	70
127	const u32 out_bytes_per_pixel, const u32 block_height, const u32 block_depth,	71	const u32 block_y = y >> GOB_SIZE_Y_SHIFT;
128	const u32 width_spacing) {	72	const u32 offset_y = (block_y >> block_height) * block_size +
129	auto div_ceil = [](const u32 x, const u32 y) { return ((x + y - 1) / y); };	73	((block_y & block_height_mask) << GOB_SIZE_SHIFT);
130	const u32 stride_x = width * out_bytes_per_pixel;	74
131	const u32 layer_z = height * stride_x;	75	for (u32 column = 0; column < width; ++column) {
132	const u32 gob_elements_x = GOB_SIZE_X / bytes_per_pixel;	76	const u32 x = (column + origin_x) * bytes_per_pixel;
133	constexpr u32 gob_elements_y = GOB_SIZE_Y;	77	const u32 offset_x = (x >> GOB_SIZE_X_SHIFT) << x_shift;
134	constexpr u32 gob_elements_z = GOB_SIZE_Z;	78
135	const u32 block_x_elements = gob_elements_x;	79	const u32 base_swizzled_offset = offset_z + offset_y + offset_x;
136	const u32 block_y_elements = gob_elements_y * block_height;	80	const u32 swizzled_offset = base_swizzled_offset + table[x % GOB_SIZE_X];
137	const u32 block_z_elements = gob_elements_z * block_depth;	81
138	const u32 aligned_width = Common::AlignUp(width, gob_elements_x * width_spacing);	82	const u32 unswizzled_offset =
139	const u32 blocks_on_x = div_ceil(aligned_width, block_x_elements);	83	slice * pitch * height + line * pitch + column * bytes_per_pixel;
140	const u32 blocks_on_y = div_ceil(height, block_y_elements);	84
141	const u32 blocks_on_z = div_ceil(depth, block_z_elements);	85	u8* const dst = &output[TO_LINEAR ? swizzled_offset : unswizzled_offset];
142	const u32 xy_block_size = GOB_SIZE * block_height;	86	const u8* const src = &input[TO_LINEAR ? unswizzled_offset : swizzled_offset];
143	const u32 block_size = xy_block_size * block_depth;	87	std::memcpy(dst, src, bytes_per_pixel);
144	u32 tile_offset = 0;
145	for (u32 zb = 0; zb < blocks_on_z; zb++) {
146	const u32 z_start = zb * block_z_elements;
147	const u32 z_end = std::min(depth, z_start + block_z_elements);
148	for (u32 yb = 0; yb < blocks_on_y; yb++) {
149	const u32 y_start = yb * block_y_elements;
150	const u32 y_end = std::min(height, y_start + block_y_elements);
151	for (u32 xb = 0; xb < blocks_on_x; xb++) {
152	const u32 x_start = xb * block_x_elements;
153	const u32 x_end = std::min(width, x_start + block_x_elements);
154	if constexpr (fast) {
155	FastProcessBlock(swizzled_data, unswizzled_data, unswizzle, x_start, y_start,
156	z_start, x_end, y_end, z_end, tile_offset, xy_block_size,
157	layer_z, stride_x, bytes_per_pixel, out_bytes_per_pixel);
158	} else {
159	PreciseProcessBlock(swizzled_data, unswizzled_data, unswizzle, x_start, y_start,
160	z_start, x_end, y_end, z_end, tile_offset, xy_block_size,
161	layer_z, stride_x, bytes_per_pixel, out_bytes_per_pixel);
162	}
163	tile_offset += block_size;
164	}	88	}
165	}	89	}
166	}	90	}
167	}	91	}
168
169	} // Anonymous namespace	92	} // Anonymous namespace
170		93
171	void CopySwizzledData(u32 width, u32 height, u32 depth, u32 bytes_per_pixel,	94	SwizzleTable MakeSwizzleTable() {
172	u32 out_bytes_per_pixel, u8* const swizzled_data, u8* const unswizzled_data,	95	return SWIZZLE_TABLE;
173	bool unswizzle, u32 block_height, u32 block_depth, u32 width_spacing) {
174	const u32 block_height_size{1U << block_height};
175	const u32 block_depth_size{1U << block_depth};
176	if (bytes_per_pixel % 3 != 0 && (width * bytes_per_pixel) % FAST_SWIZZLE_ALIGN == 0) {
177	SwizzledData<true>(swizzled_data, unswizzled_data, unswizzle, width, height, depth,
178	bytes_per_pixel, out_bytes_per_pixel, block_height_size,
179	block_depth_size, width_spacing);
180	} else {
181	SwizzledData<false>(swizzled_data, unswizzled_data, unswizzle, width, height, depth,
182	bytes_per_pixel, out_bytes_per_pixel, block_height_size,
183	block_depth_size, width_spacing);
184	}
185	}	96	}
186		97
187	void UnswizzleTexture(u8* const unswizzled_data, u8* address, u32 tile_size_x, u32 tile_size_y,	98	void UnswizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel,
188	u32 bytes_per_pixel, u32 width, u32 height, u32 depth, u32 block_height,	99	u32 width, u32 height, u32 depth, u32 block_height, u32 block_depth,
189	u32 block_depth, u32 width_spacing) {	100	u32 stride_alignment) {
190	CopySwizzledData((width + tile_size_x - 1) / tile_size_x,	101	Swizzle<false>(output, input, bytes_per_pixel, width, height, depth, block_height, block_depth,
191	(height + tile_size_y - 1) / tile_size_y, depth, bytes_per_pixel,	102	stride_alignment);
192	bytes_per_pixel, address, unswizzled_data, true, block_height, block_depth,
193	width_spacing);
194	}	103	}
195		104
196	std::vector<u8> UnswizzleTexture(u8* address, u32 tile_size_x, u32 tile_size_y, u32 bytes_per_pixel,	105	void SwizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, u32 width,
197	u32 width, u32 height, u32 depth, u32 block_height,	106	u32 height, u32 depth, u32 block_height, u32 block_depth,
198	u32 block_depth, u32 width_spacing) {	107	u32 stride_alignment) {
199	std::vector<u8> unswizzled_data(width * height * depth * bytes_per_pixel);	108	Swizzle<true>(output, input, bytes_per_pixel, width, height, depth, block_height, block_depth,
200	UnswizzleTexture(unswizzled_data.data(), address, tile_size_x, tile_size_y, bytes_per_pixel,	109	stride_alignment);
201	width, height, depth, block_height, block_depth, width_spacing);
202	return unswizzled_data;
203	}	110	}
204		111
205	void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width,	112	void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width,
@@ -213,7 +120,7 @@ void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32
213	const u32 gob_address_y =	120	const u32 gob_address_y =
214	(dst_y / (GOB_SIZE_Y * block_height)) * GOB_SIZE * block_height * image_width_in_gobs +	121	(dst_y / (GOB_SIZE_Y * block_height)) * GOB_SIZE * block_height * image_width_in_gobs +
215	((dst_y % (GOB_SIZE_Y * block_height)) / GOB_SIZE_Y) * GOB_SIZE;	122	((dst_y % (GOB_SIZE_Y * block_height)) / GOB_SIZE_Y) * GOB_SIZE;
216	const auto& table = LEGACY_SWIZZLE_TABLE[dst_y % GOB_SIZE_Y];	123	const auto& table = SWIZZLE_TABLE[dst_y % GOB_SIZE_Y];
217	for (u32 x = 0; x < subrect_width; ++x) {	124	for (u32 x = 0; x < subrect_width; ++x) {
218	const u32 dst_x = x + offset_x;	125	const u32 dst_x = x + offset_x;
219	const u32 gob_address =	126	const u32 gob_address =
@@ -235,11 +142,11 @@ void UnswizzleSubrect(u32 line_length_in, u32 line_count, u32 pitch, u32 width,
235	const u32 block_size = gobs_in_x << (GOB_SIZE_SHIFT + block_height);	142	const u32 block_size = gobs_in_x << (GOB_SIZE_SHIFT + block_height);
236		143
237	const u32 block_height_mask = (1U << block_height) - 1;	144	const u32 block_height_mask = (1U << block_height) - 1;
238	const u32 x_shift = static_cast<u32>(GOB_SIZE_SHIFT) + block_height;	145	const u32 x_shift = GOB_SIZE_SHIFT + block_height;
239		146
240	for (u32 line = 0; line < line_count; ++line) {	147	for (u32 line = 0; line < line_count; ++line) {
241	const u32 src_y = line + origin_y;	148	const u32 src_y = line + origin_y;
242	const auto& table = LEGACY_SWIZZLE_TABLE[src_y % GOB_SIZE_Y];	149	const auto& table = SWIZZLE_TABLE[src_y % GOB_SIZE_Y];
243		150
244	const u32 block_y = src_y >> GOB_SIZE_Y_SHIFT;	151	const u32 block_y = src_y >> GOB_SIZE_Y_SHIFT;
245	const u32 src_offset_y = (block_y >> block_height) * block_size +	152	const u32 src_offset_y = (block_y >> block_height) * block_size +
@@ -270,7 +177,7 @@ void SwizzleSliceToVoxel(u32 line_length_in, u32 line_count, u32 pitch, u32 widt
270	const u32 x_shift = static_cast<u32>(GOB_SIZE_SHIFT) + block_height + block_depth;	177	const u32 x_shift = static_cast<u32>(GOB_SIZE_SHIFT) + block_height + block_depth;
271		178
272	for (u32 line = 0; line < line_count; ++line) {	179	for (u32 line = 0; line < line_count; ++line) {
273	const auto& table = LEGACY_SWIZZLE_TABLE[line % GOB_SIZE_Y];	180	const auto& table = SWIZZLE_TABLE[line % GOB_SIZE_Y];
274	const u32 block_y = line / GOB_SIZE_Y;	181	const u32 block_y = line / GOB_SIZE_Y;
275	const u32 dst_offset_y =	182	const u32 dst_offset_y =
276	(block_y >> block_height) * block_size + (block_y & block_height_mask) * GOB_SIZE;	183	(block_y >> block_height) * block_size + (block_y & block_height_mask) * GOB_SIZE;
@@ -293,7 +200,7 @@ void SwizzleKepler(const u32 width, const u32 height, const u32 dst_x, const u32
293	const std::size_t gob_address_y =	200	const std::size_t gob_address_y =
294	(y / (GOB_SIZE_Y * block_height)) * GOB_SIZE * block_height * image_width_in_gobs +	201	(y / (GOB_SIZE_Y * block_height)) * GOB_SIZE * block_height * image_width_in_gobs +
295	((y % (GOB_SIZE_Y * block_height)) / GOB_SIZE_Y) * GOB_SIZE;	202	((y % (GOB_SIZE_Y * block_height)) / GOB_SIZE_Y) * GOB_SIZE;
296	const auto& table = LEGACY_SWIZZLE_TABLE[y % GOB_SIZE_Y];	203	const auto& table = SWIZZLE_TABLE[y % GOB_SIZE_Y];
297	for (std::size_t x = dst_x; x < width && count < copy_size; ++x) {	204	for (std::size_t x = dst_x; x < width && count < copy_size; ++x) {
298	const std::size_t gob_address =	205	const std::size_t gob_address =
299	gob_address_y + (x / GOB_SIZE_X) * GOB_SIZE * block_height;	206	gob_address_y + (x / GOB_SIZE_X) * GOB_SIZE * block_height;