diff options
| author | 2018-10-11 17:08:48 -0400 | |
|---|---|---|
| committer | 2018-10-13 15:25:17 -0400 | |
| commit | d4ae43f9c1dd1b366cf71520841d5f2f051ce69d (patch) | |
| tree | 177872073377d252d33c27857d9ac48394f1c757 /src | |
| parent | Implement Precise 3D Swizzle (diff) | |
| download | yuzu-d4ae43f9c1dd1b366cf71520841d5f2f051ce69d.tar.gz yuzu-d4ae43f9c1dd1b366cf71520841d5f2f051ce69d.tar.xz yuzu-d4ae43f9c1dd1b366cf71520841d5f2f051ce69d.zip | |
Remove old Swizzle algorithms and use 3d Swizzle
Diffstat (limited to 'src')
| -rw-r--r-- | src/video_core/textures/decoders.cpp | 162 |
1 file changed, 69 insertions, 93 deletions
diff --git a/src/video_core/textures/decoders.cpp b/src/video_core/textures/decoders.cpp index d6750b174..5e2d3ac32 100644 --- a/src/video_core/textures/decoders.cpp +++ b/src/video_core/textures/decoders.cpp | |||
| @@ -40,97 +40,56 @@ struct alignas(64) SwizzleTable { | |||
| 40 | constexpr auto legacy_swizzle_table = SwizzleTable<8, 64, 1>(); | 40 | constexpr auto legacy_swizzle_table = SwizzleTable<8, 64, 1>(); |
| 41 | constexpr auto fast_swizzle_table = SwizzleTable<8, 4, 16>(); | 41 | constexpr auto fast_swizzle_table = SwizzleTable<8, 4, 16>(); |
| 42 | 42 | ||
| 43 | static void LegacySwizzleData(u32 width, u32 height, u32 bytes_per_pixel, u32 out_bytes_per_pixel, | 43 | /** |
| 44 | u8* swizzled_data, u8* unswizzled_data, bool unswizzle, | 44 | * This function manages ALL the GOBs(Group of Bytes) Inside a single block. |
| 45 | u32 block_height) { | 45 | * Instead of going gob by gob, we map the coordinates inside a block and manage from |
| 46 | std::array<u8*, 2> data_ptrs; | 46 | * those. Block_Width is assumed to be 1. |
| 47 | const std::size_t stride = width * bytes_per_pixel; | 47 | */ |
| 48 | const std::size_t gobs_in_x = 64; | 48 | void Precise3DProcessBlock(u8* swizzled_data, u8* unswizzled_data, const bool unswizzle, |
| 49 | const std::size_t gobs_in_y = 8; | 49 | const u32 x_start, const u32 y_start, const u32 z_start, const u32 x_end, |
| 50 | const std::size_t gobs_size = gobs_in_x * gobs_in_y; | 50 | const u32 y_end, const u32 z_end, const u32 tile_offset, |
| 51 | const std::size_t image_width_in_gobs{(stride + gobs_in_x - 1) / gobs_in_x}; | 51 | const u32 xy_block_size, const u32 layer_z, const u32 stride_x, |
| 52 | for (std::size_t y = 0; y < height; ++y) { | 52 | const u32 bytes_per_pixel, const u32 out_bytes_per_pixel) { |
| 53 | const std::size_t gob_y_address = | ||
| 54 | (y / (gobs_in_y * block_height)) * gobs_size * block_height * image_width_in_gobs + | ||
| 55 | (y % (gobs_in_y * block_height) / gobs_in_y) * gobs_size; | ||
| 56 | const auto& table = legacy_swizzle_table[y % gobs_in_y]; | ||
| 57 | for (std::size_t x = 0; x < width; ++x) { | ||
| 58 | const std::size_t gob_address = | ||
| 59 | gob_y_address + (x * bytes_per_pixel / gobs_in_x) * gobs_size * block_height; | ||
| 60 | const std::size_t x2 = x * bytes_per_pixel; | ||
| 61 | const std::size_t swizzle_offset = gob_address + table[x2 % gobs_in_x]; | ||
| 62 | const std::size_t pixel_index = (x + y * width) * out_bytes_per_pixel; | ||
| 63 | |||
| 64 | data_ptrs[unswizzle] = swizzled_data + swizzle_offset; | ||
| 65 | data_ptrs[!unswizzle] = unswizzled_data + pixel_index; | ||
| 66 | |||
| 67 | std::memcpy(data_ptrs[0], data_ptrs[1], bytes_per_pixel); | ||
| 68 | } | ||
| 69 | } | ||
| 70 | } | ||
| 71 | |||
| 72 | static void FastSwizzleData(u32 width, u32 height, u32 bytes_per_pixel, u32 out_bytes_per_pixel, | ||
| 73 | u8* swizzled_data, u8* unswizzled_data, bool unswizzle, | ||
| 74 | u32 block_height) { | ||
| 75 | std::array<u8*, 2> data_ptrs; | ||
| 76 | const std::size_t stride{width * bytes_per_pixel}; | ||
| 77 | const std::size_t gobs_in_x = 64; | ||
| 78 | const std::size_t gobs_in_y = 8; | ||
| 79 | const std::size_t gobs_size = gobs_in_x * gobs_in_y; | ||
| 80 | const std::size_t image_width_in_gobs{(stride + gobs_in_x - 1) / gobs_in_x}; | ||
| 81 | const std::size_t copy_size{16}; | ||
| 82 | for (std::size_t y = 0; y < height; ++y) { | ||
| 83 | const std::size_t initial_gob = | ||
| 84 | (y / (gobs_in_y * block_height)) * gobs_size * block_height * image_width_in_gobs + | ||
| 85 | (y % (gobs_in_y * block_height) / gobs_in_y) * gobs_size; | ||
| 86 | const std::size_t pixel_base{y * width * out_bytes_per_pixel}; | ||
| 87 | const auto& table = fast_swizzle_table[y % gobs_in_y]; | ||
| 88 | for (std::size_t xb = 0; xb < stride; xb += copy_size) { | ||
| 89 | const std::size_t gob_address{initial_gob + | ||
| 90 | (xb / gobs_in_x) * gobs_size * block_height}; | ||
| 91 | const std::size_t swizzle_offset{gob_address + table[(xb / 16) % 4]}; | ||
| 92 | const std::size_t out_x = xb * out_bytes_per_pixel / bytes_per_pixel; | ||
| 93 | const std::size_t pixel_index{out_x + pixel_base}; | ||
| 94 | data_ptrs[unswizzle] = swizzled_data + swizzle_offset; | ||
| 95 | data_ptrs[!unswizzle] = unswizzled_data + pixel_index; | ||
| 96 | std::memcpy(data_ptrs[0], data_ptrs[1], copy_size); | ||
| 97 | } | ||
| 98 | } | ||
| 99 | } | ||
| 100 | |||
| 101 | void Precise3DProcessGobs(u8* swizzled_data, u8* unswizzled_data, bool unswizzle, const u32 x_start, | ||
| 102 | const u32 y_start, const u32 z_start, const u32 x_end, const u32 y_end, | ||
| 103 | const u32 z_end, const u32 tile_offset, const u32 xy_block_size, | ||
| 104 | const u32 layer_z, const u32 stride_x, const u32 bytes_per_pixel, | ||
| 105 | const u32 out_bytes_per_pixel) { | ||
| 106 | std::array<u8*, 2> data_ptrs; | 53 | std::array<u8*, 2> data_ptrs; |
| 107 | u32 z_adress = tile_offset; | 54 | u32 z_adress = tile_offset; |
| 108 | const u32 gob_size = 64 * 8 * 1; | 55 | const u32 gob_size_x = 64; |
| 56 | const u32 gob_size_y = 8; | ||
| 57 | const u32 gob_size_z = 1; | ||
| 58 | const u32 gob_size = gob_size_x * gob_size_y * gob_size_z; | ||
| 109 | for (u32 z = z_start; z < z_end; z++) { | 59 | for (u32 z = z_start; z < z_end; z++) { |
| 110 | u32 y_adress = z_adress; | 60 | u32 y_adress = z_adress; |
| 111 | u32 pixel_base = layer_z * z + y_start * stride_x; | 61 | u32 pixel_base = layer_z * z + y_start * stride_x; |
| 112 | for (u32 y = y_start; y < y_end; y++) { | 62 | for (u32 y = y_start; y < y_end; y++) { |
| 113 | const auto& table = legacy_swizzle_table[y % 8]; | 63 | const auto& table = legacy_swizzle_table[y % gob_size_y]; |
| 114 | for (u32 x = x_start; x < x_end; x++) { | 64 | for (u32 x = x_start; x < x_end; x++) { |
| 115 | const u32 swizzle_offset{y_adress + table[x * bytes_per_pixel % 64]}; | 65 | const u32 swizzle_offset{y_adress + table[x * bytes_per_pixel % gob_size_x]}; |
| 116 | const u32 pixel_index{x * out_bytes_per_pixel + pixel_base}; | 66 | const u32 pixel_index{x * out_bytes_per_pixel + pixel_base}; |
| 117 | data_ptrs[unswizzle] = swizzled_data + swizzle_offset; | 67 | data_ptrs[unswizzle] = swizzled_data + swizzle_offset; |
| 118 | data_ptrs[!unswizzle] = unswizzled_data + pixel_index; | 68 | data_ptrs[!unswizzle] = unswizzled_data + pixel_index; |
| 119 | std::memcpy(data_ptrs[0], data_ptrs[1], bytes_per_pixel); | 69 | std::memcpy(data_ptrs[0], data_ptrs[1], bytes_per_pixel); |
| 120 | } | 70 | } |
| 121 | pixel_base += stride_x; | 71 | pixel_base += stride_x; |
| 122 | if ((y + 1) % 8 == 0) | 72 | if ((y + 1) % gob_size_y == 0) |
| 123 | y_adress += gob_size; | 73 | y_adress += gob_size; |
| 124 | } | 74 | } |
| 125 | z_adress += xy_block_size; | 75 | z_adress += xy_block_size; |
| 126 | } | 76 | } |
| 127 | } | 77 | } |
| 128 | 78 | ||
| 129 | void Precise3DSwizzledData(u8* swizzled_data, u8* unswizzled_data, bool unswizzle, u32 width, | 79 | /** |
| 130 | u32 height, u32 depth, u32 bytes_per_pixel, u32 out_bytes_per_pixel, | 80 | * This function unswizzles or swizzles a texture by mapping Linear to BlockLinear Texture. |
| 131 | u32 block_height, u32 block_depth) { | 81 | * The body of this function takes care of splitting the swizzled texture into blocks, |
| 132 | auto div_ceil = [](u32 x, u32 y) { return ((x + y - 1) / y); }; | 82 | * and managing the extents of it. Once all the parameters of a single block are obtained, |
| 133 | 83 | * the function calls '3DProcessBlock' to process that particular Block. | |
| 84 | * | ||
| 85 | * Documentation for the memory layout and decoding can be found at: | ||
| 86 | * https://envytools.readthedocs.io/en/latest/hw/memory/g80-surface.html#blocklinear-surfaces | ||
| 87 | */ | ||
| 88 | void Precise3DSwizzledData(u8* swizzled_data, u8* unswizzled_data, const bool unswizzle, | ||
| 89 | const u32 width, const u32 height, const u32 depth, | ||
| 90 | const u32 bytes_per_pixel, const u32 out_bytes_per_pixel, | ||
| 91 | const u32 block_height, const u32 block_depth) { | ||
| 92 | auto div_ceil = [](const u32 x, const u32 y) { return ((x + y - 1) / y); }; | ||
| 134 | const u32 stride_x = width * out_bytes_per_pixel; | 93 | const u32 stride_x = width * out_bytes_per_pixel; |
| 135 | const u32 layer_z = height * stride_x; | 94 | const u32 layer_z = height * stride_x; |
| 136 | const u32 gob_x_bytes = 64; | 95 | const u32 gob_x_bytes = 64; |
| @@ -157,33 +116,41 @@ void Precise3DSwizzledData(u8* swizzled_data, u8* unswizzled_data, bool unswizzl | |||
| 157 | for (u32 xb = 0; xb < blocks_on_x; xb++) { | 116 | for (u32 xb = 0; xb < blocks_on_x; xb++) { |
| 158 | const u32 x_start = xb * block_x_elements; | 117 | const u32 x_start = xb * block_x_elements; |
| 159 | const u32 x_end = std::min(width, x_start + block_x_elements); | 118 | const u32 x_end = std::min(width, x_start + block_x_elements); |
| 160 | Precise3DProcessGobs(swizzled_data, unswizzled_data, unswizzle, x_start, y_start, | 119 | Precise3DProcessBlock(swizzled_data, unswizzled_data, unswizzle, x_start, y_start, |
| 161 | z_start, x_end, y_end, z_end, tile_offset, xy_block_size, layer_z, | 120 | z_start, x_end, y_end, z_end, tile_offset, xy_block_size, |
| 162 | stride_x, bytes_per_pixel, out_bytes_per_pixel); | 121 | layer_z, stride_x, bytes_per_pixel, out_bytes_per_pixel); |
| 163 | tile_offset += block_size; | 122 | tile_offset += block_size; |
| 164 | } | 123 | } |
| 165 | } | 124 | } |
| 166 | } | 125 | } |
| 167 | } | 126 | } |
| 168 | 127 | ||
| 169 | void Fast3DProcessGobs(u8* swizzled_data, u8* unswizzled_data, bool unswizzle, const u32 x_start, | 128 | /** |
| 170 | const u32 y_start, const u32 z_start, const u32 x_end, const u32 y_end, | 129 | * This function manages ALL the GOBs(Group of Bytes) Inside a single block. |
| 171 | const u32 z_end, const u32 tile_offset, const u32 xy_block_size, | 130 | * Instead of going gob by gob, we map the coordinates inside a block and manage from |
| 172 | const u32 layer_z, const u32 stride_x, const u32 bytes_per_pixel, | 131 | * those. Block_Width is assumed to be 1. |
| 173 | const u32 out_bytes_per_pixel) { | 132 | */ |
| 133 | void Fast3DProcessBlock(u8* swizzled_data, u8* unswizzled_data, const bool unswizzle, | ||
| 134 | const u32 x_start, const u32 y_start, const u32 z_start, const u32 x_end, | ||
| 135 | const u32 y_end, const u32 z_end, const u32 tile_offset, | ||
| 136 | const u32 xy_block_size, const u32 layer_z, const u32 stride_x, | ||
| 137 | const u32 bytes_per_pixel, const u32 out_bytes_per_pixel) { | ||
| 174 | std::array<u8*, 2> data_ptrs; | 138 | std::array<u8*, 2> data_ptrs; |
| 175 | u32 z_adress = tile_offset; | 139 | u32 z_adress = tile_offset; |
| 176 | const u32 x_startb = x_start * bytes_per_pixel; | 140 | const u32 x_startb = x_start * bytes_per_pixel; |
| 177 | const u32 x_endb = x_end * bytes_per_pixel; | 141 | const u32 x_endb = x_end * bytes_per_pixel; |
| 178 | const u32 copy_size = 16; | 142 | const u32 copy_size = 16; |
| 179 | const u32 gob_size = 64 * 8 * 1; | 143 | const u32 gob_size_x = 64; |
| 144 | const u32 gob_size_y = 8; | ||
| 145 | const u32 gob_size_z = 1; | ||
| 146 | const u32 gob_size = gob_size_x * gob_size_y * gob_size_z; | ||
| 180 | for (u32 z = z_start; z < z_end; z++) { | 147 | for (u32 z = z_start; z < z_end; z++) { |
| 181 | u32 y_adress = z_adress; | 148 | u32 y_adress = z_adress; |
| 182 | u32 pixel_base = layer_z * z + y_start * stride_x; | 149 | u32 pixel_base = layer_z * z + y_start * stride_x; |
| 183 | for (u32 y = y_start; y < y_end; y++) { | 150 | for (u32 y = y_start; y < y_end; y++) { |
| 184 | const auto& table = fast_swizzle_table[y % 8]; | 151 | const auto& table = fast_swizzle_table[y % gob_size_y]; |
| 185 | for (u32 xb = x_startb; xb < x_endb; xb += copy_size) { | 152 | for (u32 xb = x_startb; xb < x_endb; xb += copy_size) { |
| 186 | const u32 swizzle_offset{y_adress + table[(xb / 16) % 4]}; | 153 | const u32 swizzle_offset{y_adress + table[(xb / copy_size) % 4]}; |
| 187 | const u32 out_x = xb * out_bytes_per_pixel / bytes_per_pixel; | 154 | const u32 out_x = xb * out_bytes_per_pixel / bytes_per_pixel; |
| 188 | const u32 pixel_index{out_x + pixel_base}; | 155 | const u32 pixel_index{out_x + pixel_base}; |
| 189 | data_ptrs[unswizzle] = swizzled_data + swizzle_offset; | 156 | data_ptrs[unswizzle] = swizzled_data + swizzle_offset; |
| @@ -191,18 +158,27 @@ void Fast3DProcessGobs(u8* swizzled_data, u8* unswizzled_data, bool unswizzle, c | |||
| 191 | std::memcpy(data_ptrs[0], data_ptrs[1], copy_size); | 158 | std::memcpy(data_ptrs[0], data_ptrs[1], copy_size); |
| 192 | } | 159 | } |
| 193 | pixel_base += stride_x; | 160 | pixel_base += stride_x; |
| 194 | if ((y + 1) % 8 == 0) | 161 | if ((y + 1) % gob_size_y == 0) |
| 195 | y_adress += gob_size; | 162 | y_adress += gob_size; |
| 196 | } | 163 | } |
| 197 | z_adress += xy_block_size; | 164 | z_adress += xy_block_size; |
| 198 | } | 165 | } |
| 199 | } | 166 | } |
| 200 | 167 | ||
| 201 | void Fast3DSwizzledData(u8* swizzled_data, u8* unswizzled_data, bool unswizzle, u32 width, | 168 | /** |
| 202 | u32 height, u32 depth, u32 bytes_per_pixel, u32 out_bytes_per_pixel, | 169 | * This function unswizzles or swizzles a texture by mapping Linear to BlockLinear Texture. |
| 203 | u32 block_height, u32 block_depth) { | 170 | * The body of this function takes care of splitting the swizzled texture into blocks, |
| 204 | auto div_ceil = [](u32 x, u32 y) { return ((x + y - 1) / y); }; | 171 | * and managing the extents of it. Once all the parameters of a single block are obtained, |
| 205 | 172 | * the function calls '3DProcessBlock' to process that particular Block. | |
| 173 | * | ||
| 174 | * Documentation for the memory layout and decoding can be found at: | ||
| 175 | * https://envytools.readthedocs.io/en/latest/hw/memory/g80-surface.html#blocklinear-surfaces | ||
| 176 | */ | ||
| 177 | void Fast3DSwizzledData(u8* swizzled_data, u8* unswizzled_data, const bool unswizzle, | ||
| 178 | const u32 width, const u32 height, const u32 depth, | ||
| 179 | const u32 bytes_per_pixel, const u32 out_bytes_per_pixel, | ||
| 180 | const u32 block_height, const u32 block_depth) { | ||
| 181 | auto div_ceil = [](const u32 x, const u32 y) { return ((x + y - 1) / y); }; | ||
| 206 | const u32 stride_x = width * out_bytes_per_pixel; | 182 | const u32 stride_x = width * out_bytes_per_pixel; |
| 207 | const u32 layer_z = height * stride_x; | 183 | const u32 layer_z = height * stride_x; |
| 208 | const u32 gob_x_bytes = 64; | 184 | const u32 gob_x_bytes = 64; |
| @@ -229,9 +205,9 @@ void Fast3DSwizzledData(u8* swizzled_data, u8* unswizzled_data, bool unswizzle, | |||
| 229 | for (u32 xb = 0; xb < blocks_on_x; xb++) { | 205 | for (u32 xb = 0; xb < blocks_on_x; xb++) { |
| 230 | const u32 x_start = xb * block_x_elements; | 206 | const u32 x_start = xb * block_x_elements; |
| 231 | const u32 x_end = std::min(width, x_start + block_x_elements); | 207 | const u32 x_end = std::min(width, x_start + block_x_elements); |
| 232 | Fast3DProcessGobs(swizzled_data, unswizzled_data, unswizzle, x_start, y_start, | 208 | Fast3DProcessBlock(swizzled_data, unswizzled_data, unswizzle, x_start, y_start, |
| 233 | z_start, x_end, y_end, z_end, tile_offset, xy_block_size, layer_z, | 209 | z_start, x_end, y_end, z_end, tile_offset, xy_block_size, |
| 234 | stride_x, bytes_per_pixel, out_bytes_per_pixel); | 210 | layer_z, stride_x, bytes_per_pixel, out_bytes_per_pixel); |
| 235 | tile_offset += block_size; | 211 | tile_offset += block_size; |
| 236 | } | 212 | } |
| 237 | } | 213 | } |
| @@ -245,7 +221,7 @@ void CopySwizzledData(u32 width, u32 height, u32 bytes_per_pixel, u32 out_bytes_ | |||
| 245 | bytes_per_pixel, out_bytes_per_pixel, block_height, 1U); | 221 | bytes_per_pixel, out_bytes_per_pixel, block_height, 1U); |
| 246 | } else { | 222 | } else { |
| 247 | Precise3DSwizzledData(swizzled_data, unswizzled_data, unswizzle, width, height, 1U, | 223 | Precise3DSwizzledData(swizzled_data, unswizzled_data, unswizzle, width, height, 1U, |
| 248 | bytes_per_pixel, out_bytes_per_pixel, block_height, 1U); | 224 | bytes_per_pixel, out_bytes_per_pixel, block_height, 1U); |
| 249 | } | 225 | } |
| 250 | } | 226 | } |
| 251 | 227 | ||