Remove old Swizzle algorithms and use 3d Swizzle

author: FernandoS27 2018-10-11 17:08:48 -0400
committer: FernandoS27 2018-10-13 15:25:17 -0400
commit: d4ae43f9c1dd1b366cf71520841d5f2f051ce69d (patch)
tree: 177872073377d252d33c27857d9ac48394f1c757 /src
parent: Implement Precise 3D Swizzle (diff)
download: yuzu-d4ae43f9c1dd1b366cf71520841d5f2f051ce69d.tar.gz
yuzu-d4ae43f9c1dd1b366cf71520841d5f2f051ce69d.tar.xz
yuzu-d4ae43f9c1dd1b366cf71520841d5f2f051ce69d.zip
1 files changed, 69 insertions, 93 deletions
diff --git a/src/video_core/textures/decoders.cpp b/src/video_core/textures/decoders.cpp
index d6750b174..5e2d3ac32 100644
--- a/src/video_core/textures/decoders.cpp
+++ b/src/video_core/textures/decoders.cpp
@@ -40,97 +40,56 @@ struct alignas(64) SwizzleTable {
 constexpr auto legacy_swizzle_table = SwizzleTable<8, 64, 1>();
 constexpr auto fast_swizzle_table = SwizzleTable<8, 4, 16>();
-static void LegacySwizzleData(u32 width, u32 height, u32 bytes_per_pixel, u32 out_bytes_per_pixel,
+/**
-                              u8* swizzled_data, u8* unswizzled_data, bool unswizzle,
+ * This function manages ALL the GOBs(Group of Bytes) Inside a single block.
-                              u32 block_height) {
+ * Instead of going gob by gob, we map the coordinates inside a block and manage from
-    std::array<u8*, 2> data_ptrs;
+ * those. Block_Width is assumed to be 1.
-    const std::size_t stride = width * bytes_per_pixel;
+ */
-    const std::size_t gobs_in_x = 64;
+void Precise3DProcessBlock(u8* swizzled_data, u8* unswizzled_data, const bool unswizzle,
-    const std::size_t gobs_in_y = 8;
+                           const u32 x_start, const u32 y_start, const u32 z_start, const u32 x_end,
-    const std::size_t gobs_size = gobs_in_x * gobs_in_y;
+                           const u32 y_end, const u32 z_end, const u32 tile_offset,
-    const std::size_t image_width_in_gobs{(stride + gobs_in_x - 1) / gobs_in_x};
+                           const u32 xy_block_size, const u32 layer_z, const u32 stride_x,
-    for (std::size_t y = 0; y < height; ++y) {
+                           const u32 bytes_per_pixel, const u32 out_bytes_per_pixel) {
-        const std::size_t gob_y_address =
-            (y / (gobs_in_y * block_height)) * gobs_size * block_height * image_width_in_gobs +
-            (y % (gobs_in_y * block_height) / gobs_in_y) * gobs_size;
-        const auto& table = legacy_swizzle_table[y % gobs_in_y];
-        for (std::size_t x = 0; x < width; ++x) {
-            const std::size_t gob_address =
-                gob_y_address + (x * bytes_per_pixel / gobs_in_x) * gobs_size * block_height;
-            const std::size_t x2 = x * bytes_per_pixel;
-            const std::size_t swizzle_offset = gob_address + table[x2 % gobs_in_x];
-            const std::size_t pixel_index = (x + y * width) * out_bytes_per_pixel;
-            data_ptrs[unswizzle] = swizzled_data + swizzle_offset;
-            data_ptrs[!unswizzle] = unswizzled_data + pixel_index;
-            std::memcpy(data_ptrs[0], data_ptrs[1], bytes_per_pixel);
-        }
-    }
-}
-static void FastSwizzleData(u32 width, u32 height, u32 bytes_per_pixel, u32 out_bytes_per_pixel,
-                            u8* swizzled_data, u8* unswizzled_data, bool unswizzle,
-                            u32 block_height) {
-    std::array<u8*, 2> data_ptrs;
-    const std::size_t stride{width * bytes_per_pixel};
-    const std::size_t gobs_in_x = 64;
-    const std::size_t gobs_in_y = 8;
-    const std::size_t gobs_size = gobs_in_x * gobs_in_y;
-    const std::size_t image_width_in_gobs{(stride + gobs_in_x - 1) / gobs_in_x};
-    const std::size_t copy_size{16};
-    for (std::size_t y = 0; y < height; ++y) {
-        const std::size_t initial_gob =
-            (y / (gobs_in_y * block_height)) * gobs_size * block_height * image_width_in_gobs +
-            (y % (gobs_in_y * block_height) / gobs_in_y) * gobs_size;
-        const std::size_t pixel_base{y * width * out_bytes_per_pixel};
-        const auto& table = fast_swizzle_table[y % gobs_in_y];
-        for (std::size_t xb = 0; xb < stride; xb += copy_size) {
-            const std::size_t gob_address{initial_gob +
-                                          (xb / gobs_in_x) * gobs_size * block_height};
-            const std::size_t swizzle_offset{gob_address + table[(xb / 16) % 4]};
-            const std::size_t out_x = xb * out_bytes_per_pixel / bytes_per_pixel;
-            const std::size_t pixel_index{out_x + pixel_base};
-            data_ptrs[unswizzle] = swizzled_data + swizzle_offset;
-            data_ptrs[!unswizzle] = unswizzled_data + pixel_index;
-            std::memcpy(data_ptrs[0], data_ptrs[1], copy_size);
-        }
-    }
-}
-void Precise3DProcessGobs(u8* swizzled_data, u8* unswizzled_data, bool unswizzle, const u32 x_start,
-                       const u32 y_start, const u32 z_start, const u32 x_end, const u32 y_end,
-                       const u32 z_end, const u32 tile_offset, const u32 xy_block_size,
-                       const u32 layer_z, const u32 stride_x, const u32 bytes_per_pixel,
-                       const u32 out_bytes_per_pixel) {
    std::array<u8*, 2> data_ptrs;
    u32 z_adress = tile_offset;
-    const u32 gob_size = 64 * 8 * 1;
+    const u32 gob_size_x = 64;
+    const u32 gob_size_y = 8;
+    const u32 gob_size_z = 1;
+    const u32 gob_size = gob_size_x * gob_size_y * gob_size_z;
    for (u32 z = z_start; z < z_end; z++) {
        u32 y_adress = z_adress;
        u32 pixel_base = layer_z * z + y_start * stride_x;
        for (u32 y = y_start; y < y_end; y++) {
-            const auto& table = legacy_swizzle_table[y % 8];
+            const auto& table = legacy_swizzle_table[y % gob_size_y];
            for (u32 x = x_start; x < x_end; x++) {
-                const u32 swizzle_offset{y_adress + table[x * bytes_per_pixel % 64]};
+                const u32 swizzle_offset{y_adress + table[x * bytes_per_pixel % gob_size_x]};
                const u32 pixel_index{x * out_bytes_per_pixel + pixel_base};
                data_ptrs[unswizzle] = swizzled_data + swizzle_offset;
                data_ptrs[!unswizzle] = unswizzled_data + pixel_index;
                std::memcpy(data_ptrs[0], data_ptrs[1], bytes_per_pixel);
            }
            pixel_base += stride_x;
-            if ((y + 1) % 8 == 0)
+            if ((y + 1) % gob_size_y == 0)
                y_adress += gob_size;
        }
        z_adress += xy_block_size;
    }
 }
-void Precise3DSwizzledData(u8* swizzled_data, u8* unswizzled_data, bool unswizzle, u32 width,
+/**
-                        u32 height, u32 depth, u32 bytes_per_pixel, u32 out_bytes_per_pixel,
+ * This function unswizzles or swizzles a texture by mapping Linear to BlockLinear Texture.
-                        u32 block_height, u32 block_depth) {
+ * The body of this function takes care of splitting the swizzled texture into blocks,
-    auto div_ceil = [](u32 x, u32 y) { return ((x + y - 1) / y); };
+ * and managing the extents of it. Once all the parameters of a single block are obtained,
+ * the function calls 'Precise3DProcessBlock' to process that particular Block.
+ *
+ * Documentation for the memory layout and decoding can be found at:
+ *  https://envytools.readthedocs.io/en/latest/hw/memory/g80-surface.html#blocklinear-surfaces
+ */
+void Precise3DSwizzledData(u8* swizzled_data, u8* unswizzled_data, const bool unswizzle,
+                           const u32 width, const u32 height, const u32 depth,
+                           const u32 bytes_per_pixel, const u32 out_bytes_per_pixel,
+                           const u32 block_height, const u32 block_depth) {
+    auto div_ceil = [](const u32 x, const u32 y) { return ((x + y - 1) / y); };
    const u32 stride_x = width * out_bytes_per_pixel;
    const u32 layer_z = height * stride_x;
    const u32 gob_x_bytes = 64;
@@ -157,33 +116,41 @@ void Precise3DSwizzledData(u8* swizzled_data, u8* unswizzled_data, bool unswizzl
            for (u32 xb = 0; xb < blocks_on_x; xb++) {
                const u32 x_start = xb * block_x_elements;
                const u32 x_end = std::min(width, x_start + block_x_elements);
-                Precise3DProcessGobs(swizzled_data, unswizzled_data, unswizzle, x_start, y_start,
+                Precise3DProcessBlock(swizzled_data, unswizzled_data, unswizzle, x_start, y_start,
-                                  z_start, x_end, y_end, z_end, tile_offset, xy_block_size, layer_z,
+                                      z_start, x_end, y_end, z_end, tile_offset, xy_block_size,
-                                  stride_x, bytes_per_pixel, out_bytes_per_pixel);
+                                      layer_z, stride_x, bytes_per_pixel, out_bytes_per_pixel);
                tile_offset += block_size;
            }
        }
    }
 }
-void Fast3DProcessGobs(u8* swizzled_data, u8* unswizzled_data, bool unswizzle, const u32 x_start,
+/**
-                       const u32 y_start, const u32 z_start, const u32 x_end, const u32 y_end,
+ * This function manages ALL the GOBs(Group of Bytes) Inside a single block.
-                       const u32 z_end, const u32 tile_offset, const u32 xy_block_size,
+ * Instead of going gob by gob, we map the coordinates inside a block and manage from
-                       const u32 layer_z, const u32 stride_x, const u32 bytes_per_pixel,
+ * those. Block_Width is assumed to be 1.
-                       const u32 out_bytes_per_pixel) {
+ */
+void Fast3DProcessBlock(u8* swizzled_data, u8* unswizzled_data, const bool unswizzle,
+                        const u32 x_start, const u32 y_start, const u32 z_start, const u32 x_end,
+                        const u32 y_end, const u32 z_end, const u32 tile_offset,
+                        const u32 xy_block_size, const u32 layer_z, const u32 stride_x,
+                        const u32 bytes_per_pixel, const u32 out_bytes_per_pixel) {
    std::array<u8*, 2> data_ptrs;
    u32 z_adress = tile_offset;
    const u32 x_startb = x_start * bytes_per_pixel;
    const u32 x_endb = x_end * bytes_per_pixel;
    const u32 copy_size = 16;
-    const u32 gob_size = 64 * 8 * 1;
+    const u32 gob_size_x = 64;
+    const u32 gob_size_y = 8;
+    const u32 gob_size_z = 1;
+    const u32 gob_size = gob_size_x * gob_size_y * gob_size_z;
    for (u32 z = z_start; z < z_end; z++) {
        u32 y_adress = z_adress;
        u32 pixel_base = layer_z * z + y_start * stride_x;
        for (u32 y = y_start; y < y_end; y++) {
-            const auto& table = fast_swizzle_table[y % 8];
+            const auto& table = fast_swizzle_table[y % gob_size_y];
            for (u32 xb = x_startb; xb < x_endb; xb += copy_size) {
-                const u32 swizzle_offset{y_adress + table[(xb / 16) % 4]};
+                const u32 swizzle_offset{y_adress + table[(xb / copy_size) % 4]};
                const u32 out_x = xb * out_bytes_per_pixel / bytes_per_pixel;
                const u32 pixel_index{out_x + pixel_base};
                data_ptrs[unswizzle] = swizzled_data + swizzle_offset;
@@ -191,18 +158,27 @@ void Fast3DProcessGobs(u8* swizzled_data, u8* unswizzled_data, bool unswizzle, c
                std::memcpy(data_ptrs[0], data_ptrs[1], copy_size);
            }
            pixel_base += stride_x;
-            if ((y + 1) % 8 == 0)
+            if ((y + 1) % gob_size_y == 0)
                y_adress += gob_size;
        }
        z_adress += xy_block_size;
    }
 }
-void Fast3DSwizzledData(u8* swizzled_data, u8* unswizzled_data, bool unswizzle, u32 width,
+/**
-                        u32 height, u32 depth, u32 bytes_per_pixel, u32 out_bytes_per_pixel,
+ * This function unswizzles or swizzles a texture by mapping Linear to BlockLinear Texture.
-                        u32 block_height, u32 block_depth) {
+ * The body of this function takes care of splitting the swizzled texture into blocks,
-    auto div_ceil = [](u32 x, u32 y) { return ((x + y - 1) / y); };
+ * and managing the extents of it. Once all the parameters of a single block are obtained,
+ * the function calls 'Fast3DProcessBlock' to process that particular Block.
+ *
+ * Documentation for the memory layout and decoding can be found at:
+ *  https://envytools.readthedocs.io/en/latest/hw/memory/g80-surface.html#blocklinear-surfaces
+ */
+void Fast3DSwizzledData(u8* swizzled_data, u8* unswizzled_data, const bool unswizzle,
+                        const u32 width, const u32 height, const u32 depth,
+                        const u32 bytes_per_pixel, const u32 out_bytes_per_pixel,
+                        const u32 block_height, const u32 block_depth) {
+    auto div_ceil = [](const u32 x, const u32 y) { return ((x + y - 1) / y); };
    const u32 stride_x = width * out_bytes_per_pixel;
    const u32 layer_z = height * stride_x;
    const u32 gob_x_bytes = 64;
@@ -229,9 +205,9 @@ void Fast3DSwizzledData(u8* swizzled_data, u8* unswizzled_data, bool unswizzle,
            for (u32 xb = 0; xb < blocks_on_x; xb++) {
                const u32 x_start = xb * block_x_elements;
                const u32 x_end = std::min(width, x_start + block_x_elements);
-                Fast3DProcessGobs(swizzled_data, unswizzled_data, unswizzle, x_start, y_start,
+                Fast3DProcessBlock(swizzled_data, unswizzled_data, unswizzle, x_start, y_start,
-                                  z_start, x_end, y_end, z_end, tile_offset, xy_block_size, layer_z,
+                                   z_start, x_end, y_end, z_end, tile_offset, xy_block_size,
-                                  stride_x, bytes_per_pixel, out_bytes_per_pixel);
+                                   layer_z, stride_x, bytes_per_pixel, out_bytes_per_pixel);
                tile_offset += block_size;
            }
        }
@@ -245,7 +221,7 @@ void CopySwizzledData(u32 width, u32 height, u32 bytes_per_pixel, u32 out_bytes_
                           bytes_per_pixel, out_bytes_per_pixel, block_height, 1U);
    } else {
        Precise3DSwizzledData(swizzled_data, unswizzled_data, unswizzle, width, height, 1U,
-                           bytes_per_pixel, out_bytes_per_pixel, block_height, 1U);
+                              bytes_per_pixel, out_bytes_per_pixel, block_height, 1U);
    }
 }
author	FernandoS27	2018-10-11 17:08:48 -0400
committer	FernandoS27	2018-10-13 15:25:17 -0400
commit	d4ae43f9c1dd1b366cf71520841d5f2f051ce69d (patch)
tree	177872073377d252d33c27857d9ac48394f1c757 /src
parent	Implement Precise 3D Swizzle (diff)
download	yuzu-d4ae43f9c1dd1b366cf71520841d5f2f051ce69d.tar.gz yuzu-d4ae43f9c1dd1b366cf71520841d5f2f051ce69d.tar.xz yuzu-d4ae43f9c1dd1b366cf71520841d5f2f051ce69d.zip

diff --git a/src/video_core/textures/decoders.cpp b/src/video_core/textures/decoders.cpp index d6750b174..5e2d3ac32 100644 --- a/src/video_core/textures/decoders.cpp +++ b/src/video_core/textures/decoders.cpp
@@ -40,97 +40,56 @@ struct alignas(64) SwizzleTable {
40	constexpr auto legacy_swizzle_table = SwizzleTable<8, 64, 1>();	40	constexpr auto legacy_swizzle_table = SwizzleTable<8, 64, 1>();
41	constexpr auto fast_swizzle_table = SwizzleTable<8, 4, 16>();	41	constexpr auto fast_swizzle_table = SwizzleTable<8, 4, 16>();
42		42
43	static void LegacySwizzleData(u32 width, u32 height, u32 bytes_per_pixel, u32 out_bytes_per_pixel,	43	/**
44	u8* swizzled_data, u8* unswizzled_data, bool unswizzle,	44	* This function manages ALL the GOBs(Group of Bytes) Inside a single block.
45	u32 block_height) {	45	* Instead of going gob by gob, we map the coordinates inside a block and manage from
46	std::array<u8*, 2> data_ptrs;	46	* those. Block_Width is assumed to be 1.
47	const std::size_t stride = width * bytes_per_pixel;	47	*/
48	const std::size_t gobs_in_x = 64;	48	void Precise3DProcessBlock(u8* swizzled_data, u8* unswizzled_data, const bool unswizzle,
49	const std::size_t gobs_in_y = 8;	49	const u32 x_start, const u32 y_start, const u32 z_start, const u32 x_end,
50	const std::size_t gobs_size = gobs_in_x * gobs_in_y;	50	const u32 y_end, const u32 z_end, const u32 tile_offset,
51	const std::size_t image_width_in_gobs{(stride + gobs_in_x - 1) / gobs_in_x};	51	const u32 xy_block_size, const u32 layer_z, const u32 stride_x,
52	for (std::size_t y = 0; y < height; ++y) {	52	const u32 bytes_per_pixel, const u32 out_bytes_per_pixel) {
53	const std::size_t gob_y_address =
54	(y / (gobs_in_y * block_height)) * gobs_size * block_height * image_width_in_gobs +
55	(y % (gobs_in_y * block_height) / gobs_in_y) * gobs_size;
56	const auto& table = legacy_swizzle_table[y % gobs_in_y];
57	for (std::size_t x = 0; x < width; ++x) {
58	const std::size_t gob_address =
59	gob_y_address + (x * bytes_per_pixel / gobs_in_x) * gobs_size * block_height;
60	const std::size_t x2 = x * bytes_per_pixel;
61	const std::size_t swizzle_offset = gob_address + table[x2 % gobs_in_x];
62	const std::size_t pixel_index = (x + y * width) * out_bytes_per_pixel;
63
64	data_ptrs[unswizzle] = swizzled_data + swizzle_offset;
65	data_ptrs[!unswizzle] = unswizzled_data + pixel_index;
66
67	std::memcpy(data_ptrs[0], data_ptrs[1], bytes_per_pixel);
68	}
69	}
70	}
71
72	static void FastSwizzleData(u32 width, u32 height, u32 bytes_per_pixel, u32 out_bytes_per_pixel,
73	u8* swizzled_data, u8* unswizzled_data, bool unswizzle,
74	u32 block_height) {
75	std::array<u8*, 2> data_ptrs;
76	const std::size_t stride{width * bytes_per_pixel};
77	const std::size_t gobs_in_x = 64;
78	const std::size_t gobs_in_y = 8;
79	const std::size_t gobs_size = gobs_in_x * gobs_in_y;
80	const std::size_t image_width_in_gobs{(stride + gobs_in_x - 1) / gobs_in_x};
81	const std::size_t copy_size{16};
82	for (std::size_t y = 0; y < height; ++y) {
83	const std::size_t initial_gob =
84	(y / (gobs_in_y * block_height)) * gobs_size * block_height * image_width_in_gobs +
85	(y % (gobs_in_y * block_height) / gobs_in_y) * gobs_size;
86	const std::size_t pixel_base{y * width * out_bytes_per_pixel};
87	const auto& table = fast_swizzle_table[y % gobs_in_y];
88	for (std::size_t xb = 0; xb < stride; xb += copy_size) {
89	const std::size_t gob_address{initial_gob +
90	(xb / gobs_in_x) * gobs_size * block_height};
91	const std::size_t swizzle_offset{gob_address + table[(xb / 16) % 4]};
92	const std::size_t out_x = xb * out_bytes_per_pixel / bytes_per_pixel;
93	const std::size_t pixel_index{out_x + pixel_base};
94	data_ptrs[unswizzle] = swizzled_data + swizzle_offset;
95	data_ptrs[!unswizzle] = unswizzled_data + pixel_index;
96	std::memcpy(data_ptrs[0], data_ptrs[1], copy_size);
97	}
98	}
99	}
100
101	void Precise3DProcessGobs(u8* swizzled_data, u8* unswizzled_data, bool unswizzle, const u32 x_start,
102	const u32 y_start, const u32 z_start, const u32 x_end, const u32 y_end,
103	const u32 z_end, const u32 tile_offset, const u32 xy_block_size,
104	const u32 layer_z, const u32 stride_x, const u32 bytes_per_pixel,
105	const u32 out_bytes_per_pixel) {
106	std::array<u8*, 2> data_ptrs;	53	std::array<u8*, 2> data_ptrs;
107	u32 z_adress = tile_offset;	54	u32 z_adress = tile_offset;
108	const u32 gob_size = 64 * 8 * 1;	55	const u32 gob_size_x = 64;
		56	const u32 gob_size_y = 8;
		57	const u32 gob_size_z = 1;
		58	const u32 gob_size = gob_size_x * gob_size_y * gob_size_z;
109	for (u32 z = z_start; z < z_end; z++) {	59	for (u32 z = z_start; z < z_end; z++) {
110	u32 y_adress = z_adress;	60	u32 y_adress = z_adress;
111	u32 pixel_base = layer_z * z + y_start * stride_x;	61	u32 pixel_base = layer_z * z + y_start * stride_x;
112	for (u32 y = y_start; y < y_end; y++) {	62	for (u32 y = y_start; y < y_end; y++) {
113	const auto& table = legacy_swizzle_table[y % 8];	63	const auto& table = legacy_swizzle_table[y % gob_size_y];
114	for (u32 x = x_start; x < x_end; x++) {	64	for (u32 x = x_start; x < x_end; x++) {
115	const u32 swizzle_offset{y_adress + table[x * bytes_per_pixel % 64]};	65	const u32 swizzle_offset{y_adress + table[x * bytes_per_pixel % gob_size_x]};
116	const u32 pixel_index{x * out_bytes_per_pixel + pixel_base};	66	const u32 pixel_index{x * out_bytes_per_pixel + pixel_base};
117	data_ptrs[unswizzle] = swizzled_data + swizzle_offset;	67	data_ptrs[unswizzle] = swizzled_data + swizzle_offset;
118	data_ptrs[!unswizzle] = unswizzled_data + pixel_index;	68	data_ptrs[!unswizzle] = unswizzled_data + pixel_index;
119	std::memcpy(data_ptrs[0], data_ptrs[1], bytes_per_pixel);	69	std::memcpy(data_ptrs[0], data_ptrs[1], bytes_per_pixel);
120	}	70	}
121	pixel_base += stride_x;	71	pixel_base += stride_x;
122	if ((y + 1) % 8 == 0)	72	if ((y + 1) % gob_size_y == 0)
123	y_adress += gob_size;	73	y_adress += gob_size;
124	}	74	}
125	z_adress += xy_block_size;	75	z_adress += xy_block_size;
126	}	76	}
127	}	77	}
128		78
129	void Precise3DSwizzledData(u8* swizzled_data, u8* unswizzled_data, bool unswizzle, u32 width,	79	/**
130	u32 height, u32 depth, u32 bytes_per_pixel, u32 out_bytes_per_pixel,	80	* This function unswizzles or swizzles a texture by mapping Linear to BlockLinear Texture.
131	u32 block_height, u32 block_depth) {	81	* The body of this function takes care of splitting the swizzled texture into blocks,
132	auto div_ceil = [](u32 x, u32 y) { return ((x + y - 1) / y); };	82	* and managing the extents of it. Once all the parameters of a single block are obtained,
133		83	* the function calls 'Precise3DProcessBlock' to process that particular Block.
		84	*
		85	* Documentation for the memory layout and decoding can be found at:
		86	* https://envytools.readthedocs.io/en/latest/hw/memory/g80-surface.html#blocklinear-surfaces
		87	*/
		88	void Precise3DSwizzledData(u8* swizzled_data, u8* unswizzled_data, const bool unswizzle,
		89	const u32 width, const u32 height, const u32 depth,
		90	const u32 bytes_per_pixel, const u32 out_bytes_per_pixel,
		91	const u32 block_height, const u32 block_depth) {
		92	auto div_ceil = [](const u32 x, const u32 y) { return ((x + y - 1) / y); };
134	const u32 stride_x = width * out_bytes_per_pixel;	93	const u32 stride_x = width * out_bytes_per_pixel;
135	const u32 layer_z = height * stride_x;	94	const u32 layer_z = height * stride_x;
136	const u32 gob_x_bytes = 64;	95	const u32 gob_x_bytes = 64;
@@ -157,33 +116,41 @@ void Precise3DSwizzledData(u8* swizzled_data, u8* unswizzled_data, bool unswizzl
157	for (u32 xb = 0; xb < blocks_on_x; xb++) {	116	for (u32 xb = 0; xb < blocks_on_x; xb++) {
158	const u32 x_start = xb * block_x_elements;	117	const u32 x_start = xb * block_x_elements;
159	const u32 x_end = std::min(width, x_start + block_x_elements);	118	const u32 x_end = std::min(width, x_start + block_x_elements);
160	Precise3DProcessGobs(swizzled_data, unswizzled_data, unswizzle, x_start, y_start,	119	Precise3DProcessBlock(swizzled_data, unswizzled_data, unswizzle, x_start, y_start,
161	z_start, x_end, y_end, z_end, tile_offset, xy_block_size, layer_z,	120	z_start, x_end, y_end, z_end, tile_offset, xy_block_size,
162	stride_x, bytes_per_pixel, out_bytes_per_pixel);	121	layer_z, stride_x, bytes_per_pixel, out_bytes_per_pixel);
163	tile_offset += block_size;	122	tile_offset += block_size;
164	}	123	}
165	}	124	}
166	}	125	}
167	}	126	}
168		127
169	void Fast3DProcessGobs(u8* swizzled_data, u8* unswizzled_data, bool unswizzle, const u32 x_start,	128	/**
170	const u32 y_start, const u32 z_start, const u32 x_end, const u32 y_end,	129	* This function manages ALL the GOBs(Group of Bytes) Inside a single block.
171	const u32 z_end, const u32 tile_offset, const u32 xy_block_size,	130	* Instead of going gob by gob, we map the coordinates inside a block and manage from
172	const u32 layer_z, const u32 stride_x, const u32 bytes_per_pixel,	131	* those. Block_Width is assumed to be 1.
173	const u32 out_bytes_per_pixel) {	132	*/
		133	void Fast3DProcessBlock(u8* swizzled_data, u8* unswizzled_data, const bool unswizzle,
		134	const u32 x_start, const u32 y_start, const u32 z_start, const u32 x_end,
		135	const u32 y_end, const u32 z_end, const u32 tile_offset,
		136	const u32 xy_block_size, const u32 layer_z, const u32 stride_x,
		137	const u32 bytes_per_pixel, const u32 out_bytes_per_pixel) {
174	std::array<u8*, 2> data_ptrs;	138	std::array<u8*, 2> data_ptrs;
175	u32 z_adress = tile_offset;	139	u32 z_adress = tile_offset;
176	const u32 x_startb = x_start * bytes_per_pixel;	140	const u32 x_startb = x_start * bytes_per_pixel;
177	const u32 x_endb = x_end * bytes_per_pixel;	141	const u32 x_endb = x_end * bytes_per_pixel;
178	const u32 copy_size = 16;	142	const u32 copy_size = 16;
179	const u32 gob_size = 64 * 8 * 1;	143	const u32 gob_size_x = 64;
		144	const u32 gob_size_y = 8;
		145	const u32 gob_size_z = 1;
		146	const u32 gob_size = gob_size_x * gob_size_y * gob_size_z;
180	for (u32 z = z_start; z < z_end; z++) {	147	for (u32 z = z_start; z < z_end; z++) {
181	u32 y_adress = z_adress;	148	u32 y_adress = z_adress;
182	u32 pixel_base = layer_z * z + y_start * stride_x;	149	u32 pixel_base = layer_z * z + y_start * stride_x;
183	for (u32 y = y_start; y < y_end; y++) {	150	for (u32 y = y_start; y < y_end; y++) {
184	const auto& table = fast_swizzle_table[y % 8];	151	const auto& table = fast_swizzle_table[y % gob_size_y];
185	for (u32 xb = x_startb; xb < x_endb; xb += copy_size) {	152	for (u32 xb = x_startb; xb < x_endb; xb += copy_size) {
186	const u32 swizzle_offset{y_adress + table[(xb / 16) % 4]};	153	const u32 swizzle_offset{y_adress + table[(xb / copy_size) % 4]};
187	const u32 out_x = xb * out_bytes_per_pixel / bytes_per_pixel;	154	const u32 out_x = xb * out_bytes_per_pixel / bytes_per_pixel;
188	const u32 pixel_index{out_x + pixel_base};	155	const u32 pixel_index{out_x + pixel_base};
189	data_ptrs[unswizzle] = swizzled_data + swizzle_offset;	156	data_ptrs[unswizzle] = swizzled_data + swizzle_offset;
@@ -191,18 +158,27 @@ void Fast3DProcessGobs(u8* swizzled_data, u8* unswizzled_data, bool unswizzle, c
191	std::memcpy(data_ptrs[0], data_ptrs[1], copy_size);	158	std::memcpy(data_ptrs[0], data_ptrs[1], copy_size);
192	}	159	}
193	pixel_base += stride_x;	160	pixel_base += stride_x;
194	if ((y + 1) % 8 == 0)	161	if ((y + 1) % gob_size_y == 0)
195	y_adress += gob_size;	162	y_adress += gob_size;
196	}	163	}
197	z_adress += xy_block_size;	164	z_adress += xy_block_size;
198	}	165	}
199	}	166	}
200		167
201	void Fast3DSwizzledData(u8* swizzled_data, u8* unswizzled_data, bool unswizzle, u32 width,	168	/**
202	u32 height, u32 depth, u32 bytes_per_pixel, u32 out_bytes_per_pixel,	169	* This function unswizzles or swizzles a texture by mapping Linear to BlockLinear Texture.
203	u32 block_height, u32 block_depth) {	170	* The body of this function takes care of splitting the swizzled texture into blocks,
204	auto div_ceil = [](u32 x, u32 y) { return ((x + y - 1) / y); };	171	* and managing the extents of it. Once all the parameters of a single block are obtained,
205		172	* the function calls 'Fast3DProcessBlock' to process that particular Block.
		173	*
		174	* Documentation for the memory layout and decoding can be found at:
		175	* https://envytools.readthedocs.io/en/latest/hw/memory/g80-surface.html#blocklinear-surfaces
		176	*/
		177	void Fast3DSwizzledData(u8* swizzled_data, u8* unswizzled_data, const bool unswizzle,
		178	const u32 width, const u32 height, const u32 depth,
		179	const u32 bytes_per_pixel, const u32 out_bytes_per_pixel,
		180	const u32 block_height, const u32 block_depth) {
		181	auto div_ceil = [](const u32 x, const u32 y) { return ((x + y - 1) / y); };
206	const u32 stride_x = width * out_bytes_per_pixel;	182	const u32 stride_x = width * out_bytes_per_pixel;
207	const u32 layer_z = height * stride_x;	183	const u32 layer_z = height * stride_x;
208	const u32 gob_x_bytes = 64;	184	const u32 gob_x_bytes = 64;
@@ -229,9 +205,9 @@ void Fast3DSwizzledData(u8* swizzled_data, u8* unswizzled_data, bool unswizzle,
229	for (u32 xb = 0; xb < blocks_on_x; xb++) {	205	for (u32 xb = 0; xb < blocks_on_x; xb++) {
230	const u32 x_start = xb * block_x_elements;	206	const u32 x_start = xb * block_x_elements;
231	const u32 x_end = std::min(width, x_start + block_x_elements);	207	const u32 x_end = std::min(width, x_start + block_x_elements);
232	Fast3DProcessGobs(swizzled_data, unswizzled_data, unswizzle, x_start, y_start,	208	Fast3DProcessBlock(swizzled_data, unswizzled_data, unswizzle, x_start, y_start,
233	z_start, x_end, y_end, z_end, tile_offset, xy_block_size, layer_z,	209	z_start, x_end, y_end, z_end, tile_offset, xy_block_size,
234	stride_x, bytes_per_pixel, out_bytes_per_pixel);	210	layer_z, stride_x, bytes_per_pixel, out_bytes_per_pixel);
235	tile_offset += block_size;	211	tile_offset += block_size;
236	}	212	}
237	}	213	}
@@ -245,7 +221,7 @@ void CopySwizzledData(u32 width, u32 height, u32 bytes_per_pixel, u32 out_bytes_
245	bytes_per_pixel, out_bytes_per_pixel, block_height, 1U);	221	bytes_per_pixel, out_bytes_per_pixel, block_height, 1U);
246	} else {	222	} else {
247	Precise3DSwizzledData(swizzled_data, unswizzled_data, unswizzle, width, height, 1U,	223	Precise3DSwizzledData(swizzled_data, unswizzled_data, unswizzle, width, height, 1U,
248	bytes_per_pixel, out_bytes_per_pixel, block_height, 1U);	224	bytes_per_pixel, out_bytes_per_pixel, block_height, 1U);
249	}	225	}
250	}	226	}
251		227