summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: ameerj, 2021-02-13 16:49:24 -0500
committer: ameerj, 2021-03-13 12:16:03 -0500
commit: 20eb368e147e1c27f05d6923c51596f8dfe24e89 (patch)
tree: a8b1c8eb79eb55e189a10dfd43b8b6bb1449220f
parent: host_shaders: Modify shader cmake integration to allow for larger shaders (diff)
download: yuzu-20eb368e147e1c27f05d6923c51596f8dfe24e89.tar.gz
yuzu-20eb368e147e1c27f05d6923c51596f8dfe24e89.tar.xz
yuzu-20eb368e147e1c27f05d6923c51596f8dfe24e89.zip
renderer_vulkan: Accelerate ASTC decoding
Co-Authored-By: Rodrigo Locatti <reinuseslisp@airmail.cc>
-rw-r--r-- src/video_core/host_shaders/astc_decoder.comp | 43
-rw-r--r-- src/video_core/renderer_vulkan/maxwell_to_vk.cpp | 2
-rw-r--r-- src/video_core/renderer_vulkan/vk_compute_pass.cpp | 298
-rw-r--r-- src/video_core/renderer_vulkan/vk_compute_pass.h | 32
-rw-r--r-- src/video_core/renderer_vulkan/vk_rasterizer.cpp | 5
-rw-r--r-- src/video_core/renderer_vulkan/vk_rasterizer.h | 1
-rw-r--r-- src/video_core/renderer_vulkan/vk_texture_cache.cpp | 45
-rw-r--r-- src/video_core/renderer_vulkan/vk_texture_cache.h | 12
-rw-r--r-- src/video_core/texture_cache/accelerated_swizzle.h | 4
-rw-r--r-- src/video_core/textures/decoders.cpp | 23
-rw-r--r-- src/video_core/textures/decoders.h | 18
11 files changed, 426 insertions, 57 deletions
diff --git a/src/video_core/host_shaders/astc_decoder.comp b/src/video_core/host_shaders/astc_decoder.comp
index 070190a5c..2ddac2e1d 100644
--- a/src/video_core/host_shaders/astc_decoder.comp
+++ b/src/video_core/host_shaders/astc_decoder.comp
@@ -16,7 +16,7 @@
16#define BINDING_7_TO_8_BUFFER 4 16#define BINDING_7_TO_8_BUFFER 4
17#define BINDING_8_TO_8_BUFFER 5 17#define BINDING_8_TO_8_BUFFER 5
18#define BINDING_BYTE_TO_16_BUFFER 6 18#define BINDING_BYTE_TO_16_BUFFER 6
19#define BINDING_OUTPUT_IMAGE 3 19#define BINDING_OUTPUT_IMAGE 7
20 20
21#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv 21#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv
22 22
@@ -85,7 +85,26 @@ layout(binding = BINDING_SWIZZLE_BUFFER, std430) readonly buffer SwizzleTable {
85layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU32 { 85layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU32 {
86 uint astc_data[]; 86 uint astc_data[];
87}; 87};
88layout(binding = BINDING_OUTPUT_IMAGE) uniform writeonly image2D dest_image; 88
89// ASTC Encodings data
90layout(binding = BINDING_ENC_BUFFER, std430) readonly buffer EncodingsValues {
91 EncodingData encoding_values[];
92};
93// ASTC Precompiled tables
94layout(binding = BINDING_6_TO_8_BUFFER, std430) readonly buffer REPLICATE_6_BIT_TO_8 {
95 uint REPLICATE_6_BIT_TO_8_TABLE[];
96};
97layout(binding = BINDING_7_TO_8_BUFFER, std430) readonly buffer REPLICATE_7_BIT_TO_8 {
98 uint REPLICATE_7_BIT_TO_8_TABLE[];
99};
100layout(binding = BINDING_8_TO_8_BUFFER, std430) readonly buffer REPLICATE_8_BIT_TO_8 {
101 uint REPLICATE_8_BIT_TO_8_TABLE[];
102};
103layout(binding = BINDING_BYTE_TO_16_BUFFER, std430) readonly buffer REPLICATE_BYTE_TO_16 {
104 uint REPLICATE_BYTE_TO_16_TABLE[];
105};
106
107layout(binding = BINDING_OUTPUT_IMAGE, rgba8) uniform writeonly image2D dest_image;
89 108
90const uint GOB_SIZE_X = 64; 109const uint GOB_SIZE_X = 64;
91const uint GOB_SIZE_Y = 8; 110const uint GOB_SIZE_Y = 8;
@@ -109,23 +128,6 @@ uint ReadTexel(uint offset) {
109 return bitfieldExtract(astc_data[offset / 4], int((offset * 8) & 24), 8); 128 return bitfieldExtract(astc_data[offset / 4], int((offset * 8) & 24), 8);
110} 129}
111 130
112// ASTC Encodings data
113layout(binding = BINDING_ENC_BUFFER, std430) readonly buffer EncodingsValues {
114 EncodingData encoding_values[256];
115};
116// ASTC Precompiled tables
117layout(binding = BINDING_6_TO_8_BUFFER, std430) readonly buffer REPLICATE_6_BIT_TO_8 {
118 uint REPLICATE_6_BIT_TO_8_TABLE[];
119};
120layout(binding = BINDING_7_TO_8_BUFFER, std430) readonly buffer REPLICATE_7_BIT_TO_8 {
121 uint REPLICATE_7_BIT_TO_8_TABLE[];
122};
123layout(binding = BINDING_8_TO_8_BUFFER, std430) readonly buffer REPLICATE_8_BIT_TO_8 {
124 uint REPLICATE_8_BIT_TO_8_TABLE[];
125};
126layout(binding = BINDING_BYTE_TO_16_BUFFER, std430) readonly buffer REPLICATE_BYTE_TO_16 {
127 uint REPLICATE_BYTE_TO_16_TABLE[];
128};
129 131
130const int BLOCK_SIZE_IN_BYTES = 16; 132const int BLOCK_SIZE_IN_BYTES = 16;
131 133
@@ -1275,8 +1277,7 @@ void main() {
1275 offset += (pos.x >> GOB_SIZE_X_SHIFT) << x_shift; 1277 offset += (pos.x >> GOB_SIZE_X_SHIFT) << x_shift;
1276 offset += swizzle; 1278 offset += swizzle;
1277 1279
1278 const ivec3 invocation_destination = ivec3(gl_GlobalInvocationID + destination); 1280 const ivec3 coord = ivec3(gl_GlobalInvocationID * uvec3(block_dims, 1.0));
1279 const ivec3 coord = ivec3(invocation_destination * uvec3(block_dims, 1.0));
1280 uint block_index = 1281 uint block_index =
1281 layer * num_image_blocks.x * num_image_blocks.y + pos.y * num_image_blocks.x + pos.x; 1282 layer * num_image_blocks.x * num_image_blocks.y + pos.y * num_image_blocks.x + pos.x;
1282 current_index = 0; 1283 current_index = 0;
diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
index 19aaf034f..f088447e9 100644
--- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
+++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
@@ -166,7 +166,7 @@ struct FormatTuple {
166 {VK_FORMAT_R16G16_SINT, Attachable | Storage}, // R16G16_SINT 166 {VK_FORMAT_R16G16_SINT, Attachable | Storage}, // R16G16_SINT
167 {VK_FORMAT_R16G16_SNORM, Attachable | Storage}, // R16G16_SNORM 167 {VK_FORMAT_R16G16_SNORM, Attachable | Storage}, // R16G16_SNORM
168 {VK_FORMAT_UNDEFINED}, // R32G32B32_FLOAT 168 {VK_FORMAT_UNDEFINED}, // R32G32B32_FLOAT
169 {VK_FORMAT_R8G8B8A8_SRGB, Attachable}, // A8B8G8R8_SRGB 169 {VK_FORMAT_A8B8G8R8_SRGB_PACK32, Attachable}, // A8B8G8R8_SRGB
170 {VK_FORMAT_R8G8_UNORM, Attachable | Storage}, // R8G8_UNORM 170 {VK_FORMAT_R8G8_UNORM, Attachable | Storage}, // R8G8_UNORM
171 {VK_FORMAT_R8G8_SNORM, Attachable | Storage}, // R8G8_SNORM 171 {VK_FORMAT_R8G8_SNORM, Attachable | Storage}, // R8G8_SNORM
172 {VK_FORMAT_R8G8_SINT, Attachable | Storage}, // R8G8_SINT 172 {VK_FORMAT_R8G8_SINT, Attachable | Storage}, // R8G8_SINT
diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp
index 2f9a7b028..7587ab1e0 100644
--- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp
+++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp
@@ -11,18 +11,38 @@
11#include "common/assert.h" 11#include "common/assert.h"
12#include "common/common_types.h" 12#include "common/common_types.h"
13#include "common/div_ceil.h" 13#include "common/div_ceil.h"
14#include "video_core/host_shaders/astc_decoder_comp_spv.h"
14#include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h" 15#include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h"
15#include "video_core/host_shaders/vulkan_uint8_comp_spv.h" 16#include "video_core/host_shaders/vulkan_uint8_comp_spv.h"
16#include "video_core/renderer_vulkan/vk_compute_pass.h" 17#include "video_core/renderer_vulkan/vk_compute_pass.h"
17#include "video_core/renderer_vulkan/vk_descriptor_pool.h" 18#include "video_core/renderer_vulkan/vk_descriptor_pool.h"
18#include "video_core/renderer_vulkan/vk_scheduler.h" 19#include "video_core/renderer_vulkan/vk_scheduler.h"
19#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" 20#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h"
21#include "video_core/renderer_vulkan/vk_texture_cache.h"
20#include "video_core/renderer_vulkan/vk_update_descriptor.h" 22#include "video_core/renderer_vulkan/vk_update_descriptor.h"
23#include "video_core/texture_cache/accelerated_swizzle.h"
24#include "video_core/texture_cache/types.h"
25#include "video_core/textures/astc.h"
26#include "video_core/textures/decoders.h"
21#include "video_core/vulkan_common/vulkan_device.h" 27#include "video_core/vulkan_common/vulkan_device.h"
22#include "video_core/vulkan_common/vulkan_wrapper.h" 28#include "video_core/vulkan_common/vulkan_wrapper.h"
23 29
24namespace Vulkan { 30namespace Vulkan {
31
32using Tegra::Texture::SWIZZLE_TABLE;
33using Tegra::Texture::ASTC::EncodingsValues;
34
25namespace { 35namespace {
36
37constexpr u32 ASTC_BINDING_SWIZZLE_BUFFER = 0;
38constexpr u32 ASTC_BINDING_INPUT_BUFFER = 1;
39constexpr u32 ASTC_BINDING_ENC_BUFFER = 2;
40constexpr u32 ASTC_BINDING_6_TO_8_BUFFER = 3;
41constexpr u32 ASTC_BINDING_7_TO_8_BUFFER = 4;
42constexpr u32 ASTC_BINDING_8_TO_8_BUFFER = 5;
43constexpr u32 ASTC_BINDING_BYTE_TO_16_BUFFER = 6;
44constexpr u32 ASTC_BINDING_OUTPUT_IMAGE = 7;
45
26VkPushConstantRange BuildComputePushConstantRange(std::size_t size) { 46VkPushConstantRange BuildComputePushConstantRange(std::size_t size) {
27 return { 47 return {
28 .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, 48 .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
@@ -50,6 +70,67 @@ std::array<VkDescriptorSetLayoutBinding, 2> BuildInputOutputDescriptorSetBinding
50 }}; 70 }};
51} 71}
52 72
73std::array<VkDescriptorSetLayoutBinding, 8> BuildASTCDescriptorSetBindings() {
74 return {{
75 {
76 .binding = ASTC_BINDING_SWIZZLE_BUFFER, // Swizzle buffer
77 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
78 .descriptorCount = 1,
79 .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
80 .pImmutableSamplers = nullptr,
81 },
82 {
83 .binding = ASTC_BINDING_INPUT_BUFFER, // ASTC Img data buffer
84 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
85 .descriptorCount = 1,
86 .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
87 .pImmutableSamplers = nullptr,
88 },
89 {
90 .binding = ASTC_BINDING_ENC_BUFFER, // Encodings buffer
91 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
92 .descriptorCount = 1,
93 .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
94 .pImmutableSamplers = nullptr,
95 },
96 {
97 .binding = ASTC_BINDING_6_TO_8_BUFFER, // BINDING_6_TO_8_BUFFER
98 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
99 .descriptorCount = 1,
100 .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
101 .pImmutableSamplers = nullptr,
102 },
103 {
104 .binding = ASTC_BINDING_7_TO_8_BUFFER, // BINDING_7_TO_8_BUFFER
105 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
106 .descriptorCount = 1,
107 .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
108 .pImmutableSamplers = nullptr,
109 },
110 {
111 .binding = ASTC_BINDING_8_TO_8_BUFFER, // BINDING_8_TO_8_BUFFER
112 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
113 .descriptorCount = 1,
114 .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
115 .pImmutableSamplers = nullptr,
116 },
117 {
118 .binding = ASTC_BINDING_BYTE_TO_16_BUFFER, // BINDING_BYTE_TO_16_BUFFER
119 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
120 .descriptorCount = 1,
121 .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
122 .pImmutableSamplers = nullptr,
123 },
124 {
125 .binding = ASTC_BINDING_OUTPUT_IMAGE, // Output image
126 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
127 .descriptorCount = 1,
128 .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
129 .pImmutableSamplers = nullptr,
130 },
131 }};
132}
133
53VkDescriptorUpdateTemplateEntryKHR BuildInputOutputDescriptorUpdateTemplate() { 134VkDescriptorUpdateTemplateEntryKHR BuildInputOutputDescriptorUpdateTemplate() {
54 return { 135 return {
55 .dstBinding = 0, 136 .dstBinding = 0,
@@ -61,6 +142,90 @@ VkDescriptorUpdateTemplateEntryKHR BuildInputOutputDescriptorUpdateTemplate() {
61 }; 142 };
62} 143}
63 144
145std::array<VkDescriptorUpdateTemplateEntryKHR, 8> BuildASTCPassDescriptorUpdateTemplateEntry() {
146 return {{
147 {
148 .dstBinding = ASTC_BINDING_SWIZZLE_BUFFER,
149 .dstArrayElement = 0,
150 .descriptorCount = 1,
151 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
152 .offset = 0 * sizeof(DescriptorUpdateEntry),
153 .stride = sizeof(DescriptorUpdateEntry),
154 },
155 {
156 .dstBinding = ASTC_BINDING_INPUT_BUFFER,
157 .dstArrayElement = 0,
158 .descriptorCount = 1,
159 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
160 .offset = 1 * sizeof(DescriptorUpdateEntry),
161 .stride = sizeof(DescriptorUpdateEntry),
162 },
163 {
164 .dstBinding = ASTC_BINDING_ENC_BUFFER,
165 .dstArrayElement = 0,
166 .descriptorCount = 1,
167 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
168 .offset = 2 * sizeof(DescriptorUpdateEntry),
169 .stride = sizeof(DescriptorUpdateEntry),
170 },
171 {
172 .dstBinding = ASTC_BINDING_6_TO_8_BUFFER,
173 .dstArrayElement = 0,
174 .descriptorCount = 1,
175 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
176 .offset = 3 * sizeof(DescriptorUpdateEntry),
177 .stride = sizeof(DescriptorUpdateEntry),
178 },
179 {
180 .dstBinding = ASTC_BINDING_7_TO_8_BUFFER,
181 .dstArrayElement = 0,
182 .descriptorCount = 1,
183 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
184 .offset = 4 * sizeof(DescriptorUpdateEntry),
185 .stride = sizeof(DescriptorUpdateEntry),
186 },
187 {
188 .dstBinding = ASTC_BINDING_8_TO_8_BUFFER,
189 .dstArrayElement = 0,
190 .descriptorCount = 1,
191 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
192 .offset = 5 * sizeof(DescriptorUpdateEntry),
193 .stride = sizeof(DescriptorUpdateEntry),
194 },
195 {
196 .dstBinding = ASTC_BINDING_BYTE_TO_16_BUFFER,
197 .dstArrayElement = 0,
198 .descriptorCount = 1,
199 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
200 .offset = 6 * sizeof(DescriptorUpdateEntry),
201 .stride = sizeof(DescriptorUpdateEntry),
202 },
203 {
204 .dstBinding = ASTC_BINDING_OUTPUT_IMAGE,
205 .dstArrayElement = 0,
206 .descriptorCount = 1,
207 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
208 .offset = 7 * sizeof(DescriptorUpdateEntry),
209 .stride = sizeof(DescriptorUpdateEntry),
210 },
211 }};
212}
213
214struct AstcPushConstants {
215 std::array<u32, 2> num_image_blocks;
216 std::array<u32, 2> blocks_dims;
217 u32 layer;
218 VideoCommon::Accelerated::BlockLinearSwizzle2DParams params;
219};
220
221struct AstcBufferData {
222 decltype(SWIZZLE_TABLE) swizzle_table_buffer = SWIZZLE_TABLE;
223 decltype(EncodingsValues) encoding_values = EncodingsValues;
224 decltype(REPLICATE_6_BIT_TO_8_TABLE) replicate_6_to_8 = REPLICATE_6_BIT_TO_8_TABLE;
225 decltype(REPLICATE_7_BIT_TO_8_TABLE) replicate_7_to_8 = REPLICATE_7_BIT_TO_8_TABLE;
226 decltype(REPLICATE_8_BIT_TO_8_TABLE) replicate_8_to_8 = REPLICATE_8_BIT_TO_8_TABLE;
227 decltype(REPLICATE_BYTE_TO_16_TABLE) replicate_byte_to_16 = REPLICATE_BYTE_TO_16_TABLE;
228} constexpr ASTC_BUFFER_DATA;
64} // Anonymous namespace 229} // Anonymous namespace
65 230
66VKComputePass::VKComputePass(const Device& device, VKDescriptorPool& descriptor_pool, 231VKComputePass::VKComputePass(const Device& device, VKDescriptorPool& descriptor_pool,
@@ -238,4 +403,137 @@ std::pair<VkBuffer, VkDeviceSize> QuadIndexedPass::Assemble(
238 return {staging.buffer, staging.offset}; 403 return {staging.buffer, staging.offset};
239} 404}
240 405
406using namespace Tegra::Texture::ASTC;
407ASTCDecoderPass::ASTCDecoderPass(const Device& device_, VKScheduler& scheduler_,
408 VKDescriptorPool& descriptor_pool_,
409 StagingBufferPool& staging_buffer_pool_,
410 VKUpdateDescriptorQueue& update_descriptor_queue_,
411 MemoryAllocator& memory_allocator_)
412 : VKComputePass(device_, descriptor_pool_, BuildASTCDescriptorSetBindings(),
413 BuildASTCPassDescriptorUpdateTemplateEntry(),
414 BuildComputePushConstantRange(sizeof(AstcPushConstants)),
415 ASTC_DECODER_COMP_SPV),
416 device{device_}, scheduler{scheduler_}, staging_buffer_pool{staging_buffer_pool_},
417 update_descriptor_queue{update_descriptor_queue_}, memory_allocator{memory_allocator_} {}
418
419ASTCDecoderPass::~ASTCDecoderPass() = default;
420
421void ASTCDecoderPass::MakeDataBuffer() {
422 data_buffer = device.GetLogical().CreateBuffer(VkBufferCreateInfo{
423 .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
424 .pNext = nullptr,
425 .flags = 0,
426 .size = sizeof(ASTC_BUFFER_DATA),
427 .usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT,
428 .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
429 .queueFamilyIndexCount = 0,
430 .pQueueFamilyIndices = nullptr,
431 });
432 data_buffer_commit = memory_allocator.Commit(data_buffer, MemoryUsage::Upload);
433
434 const auto staging_ref =
435 staging_buffer_pool.Request(sizeof(ASTC_BUFFER_DATA), MemoryUsage::Upload);
436 std::memcpy(staging_ref.mapped_span.data(), &ASTC_BUFFER_DATA, sizeof(ASTC_BUFFER_DATA));
437 scheduler.Record([src = staging_ref.buffer, dst = *data_buffer](vk::CommandBuffer cmdbuf) {
438 cmdbuf.CopyBuffer(src, dst,
439 VkBufferCopy{
440 .srcOffset = 0,
441 .dstOffset = 0,
442 .size = sizeof(ASTC_BUFFER_DATA),
443 });
444 cmdbuf.PipelineBarrier(
445 VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0,
446 VkMemoryBarrier{
447 .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
448 .pNext = nullptr,
449 .srcAccessMask = 0,
450 .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT,
451 },
452 {}, {});
453 });
454}
455
456void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map,
457 std::span<const VideoCommon::SwizzleParameters> swizzles) {
458 using namespace VideoCommon::Accelerated;
459 const VideoCommon::Extent2D tile_size{
460 .width = VideoCore::Surface::DefaultBlockWidth(image.info.format),
461 .height = VideoCore::Surface::DefaultBlockHeight(image.info.format),
462 };
463 scheduler.RequestOutsideRenderPassOperationContext();
464 if (!data_buffer) {
465 MakeDataBuffer();
466 }
467 const std::array<u32, 2> block_dims{tile_size.width, tile_size.height};
468 for (s32 layer = 0; layer < image.info.resources.layers; layer++) {
469 for (const VideoCommon::SwizzleParameters& swizzle : swizzles) {
470 const size_t input_offset = swizzle.buffer_offset + map.offset;
471 const auto num_dispatches_x = Common::DivCeil(swizzle.num_tiles.width, 32U);
472 const auto num_dispatches_y = Common::DivCeil(swizzle.num_tiles.height, 32U);
473 const std::array num_image_blocks{swizzle.num_tiles.width, swizzle.num_tiles.height};
474 const u32 layer_image_size =
475 image.guest_size_bytes - static_cast<u32>(swizzle.buffer_offset);
476
477 update_descriptor_queue.Acquire();
478 update_descriptor_queue.AddBuffer(*data_buffer,
479 offsetof(AstcBufferData, swizzle_table_buffer),
480 sizeof(AstcBufferData::swizzle_table_buffer));
481 update_descriptor_queue.AddBuffer(map.buffer, input_offset, image.guest_size_bytes);
482 update_descriptor_queue.AddBuffer(*data_buffer,
483 offsetof(AstcBufferData, encoding_values),
484 sizeof(AstcBufferData::encoding_values));
485 update_descriptor_queue.AddBuffer(*data_buffer,
486 offsetof(AstcBufferData, replicate_6_to_8),
487 sizeof(AstcBufferData::replicate_6_to_8));
488 update_descriptor_queue.AddBuffer(*data_buffer,
489 offsetof(AstcBufferData, replicate_7_to_8),
490 sizeof(AstcBufferData::replicate_7_to_8));
491 update_descriptor_queue.AddBuffer(*data_buffer,
492 offsetof(AstcBufferData, replicate_8_to_8),
493 sizeof(AstcBufferData::replicate_8_to_8));
494 update_descriptor_queue.AddBuffer(*data_buffer,
495 offsetof(AstcBufferData, replicate_byte_to_16),
496 sizeof(AstcBufferData::replicate_byte_to_16));
497 update_descriptor_queue.AddImage(image.StorageImageView());
498
499 const VkDescriptorSet set = CommitDescriptorSet(update_descriptor_queue);
500 // To unswizzle the ASTC data
501 const auto params = MakeBlockLinearSwizzle2DParams(swizzle, image.info);
502 scheduler.Record([layout = *layout, pipeline = *pipeline, buffer = map.buffer,
503 num_dispatches_x, num_dispatches_y, layer_image_size,
504 num_image_blocks, block_dims, layer, params, set,
505 image = image.Handle(), input_offset,
506 aspect_mask = image.AspectMask()](vk::CommandBuffer cmdbuf) {
507 const AstcPushConstants uniforms{num_image_blocks, block_dims, layer, params};
508
509 cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, pipeline);
510 cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, layout, 0, set, {});
511 cmdbuf.PushConstants(layout, VK_SHADER_STAGE_COMPUTE_BIT, uniforms);
512 cmdbuf.Dispatch(num_dispatches_x, num_dispatches_y, 1);
513
514 const VkImageMemoryBarrier image_barrier{
515 .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
516 .pNext = nullptr,
517 .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
518 .dstAccessMask = VK_ACCESS_SHADER_READ_BIT,
519 .oldLayout = VK_IMAGE_LAYOUT_UNDEFINED,
520 .newLayout = VK_IMAGE_LAYOUT_GENERAL,
521 .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
522 .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
523 .image = image,
524 .subresourceRange{
525 .aspectMask = aspect_mask,
526 .baseMipLevel = 0,
527 .levelCount = VK_REMAINING_MIP_LEVELS,
528 .baseArrayLayer = 0,
529 .layerCount = VK_REMAINING_ARRAY_LAYERS,
530 },
531 };
532 cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
533 VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, 0, image_barrier);
534 });
535 }
536 }
537}
538
241} // namespace Vulkan 539} // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.h b/src/video_core/renderer_vulkan/vk_compute_pass.h
index 17d781d99..5ea187c30 100644
--- a/src/video_core/renderer_vulkan/vk_compute_pass.h
+++ b/src/video_core/renderer_vulkan/vk_compute_pass.h
@@ -11,14 +11,21 @@
11#include "common/common_types.h" 11#include "common/common_types.h"
12#include "video_core/engines/maxwell_3d.h" 12#include "video_core/engines/maxwell_3d.h"
13#include "video_core/renderer_vulkan/vk_descriptor_pool.h" 13#include "video_core/renderer_vulkan/vk_descriptor_pool.h"
14#include "video_core/vulkan_common/vulkan_memory_allocator.h"
14#include "video_core/vulkan_common/vulkan_wrapper.h" 15#include "video_core/vulkan_common/vulkan_wrapper.h"
15 16
17namespace VideoCommon {
18struct SwizzleParameters;
19}
20
16namespace Vulkan { 21namespace Vulkan {
17 22
18class Device; 23class Device;
19class StagingBufferPool; 24class StagingBufferPool;
20class VKScheduler; 25class VKScheduler;
21class VKUpdateDescriptorQueue; 26class VKUpdateDescriptorQueue;
27class Image;
28struct StagingBufferRef;
22 29
23class VKComputePass { 30class VKComputePass {
24public: 31public:
@@ -77,4 +84,29 @@ private:
77 VKUpdateDescriptorQueue& update_descriptor_queue; 84 VKUpdateDescriptorQueue& update_descriptor_queue;
78}; 85};
79 86
87class ASTCDecoderPass final : public VKComputePass {
88public:
89 explicit ASTCDecoderPass(const Device& device_, VKScheduler& scheduler_,
90 VKDescriptorPool& descriptor_pool_,
91 StagingBufferPool& staging_buffer_pool_,
92 VKUpdateDescriptorQueue& update_descriptor_queue_,
93 MemoryAllocator& memory_allocator_);
94 ~ASTCDecoderPass();
95
96 void Assemble(Image& image, const StagingBufferRef& map,
97 std::span<const VideoCommon::SwizzleParameters> swizzles);
98
99private:
100 void MakeDataBuffer();
101
102 const Device& device;
103 VKScheduler& scheduler;
104 StagingBufferPool& staging_buffer_pool;
105 VKUpdateDescriptorQueue& update_descriptor_queue;
106 MemoryAllocator& memory_allocator;
107
108 vk::Buffer data_buffer;
109 MemoryCommit data_buffer_commit;
110};
111
80} // namespace Vulkan 112} // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index dfd38f575..df5b7b172 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -241,7 +241,10 @@ RasterizerVulkan::RasterizerVulkan(Core::Frontend::EmuWindow& emu_window_, Tegra
241 staging_pool(device, memory_allocator, scheduler), descriptor_pool(device, scheduler), 241 staging_pool(device, memory_allocator, scheduler), descriptor_pool(device, scheduler),
242 update_descriptor_queue(device, scheduler), 242 update_descriptor_queue(device, scheduler),
243 blit_image(device, scheduler, state_tracker, descriptor_pool), 243 blit_image(device, scheduler, state_tracker, descriptor_pool),
244 texture_cache_runtime{device, scheduler, memory_allocator, staging_pool, blit_image}, 244 astc_decoder_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue,
245 memory_allocator),
246 texture_cache_runtime{device, scheduler, memory_allocator,
247 staging_pool, blit_image, astc_decoder_pass},
245 texture_cache(texture_cache_runtime, *this, maxwell3d, kepler_compute, gpu_memory), 248 texture_cache(texture_cache_runtime, *this, maxwell3d, kepler_compute, gpu_memory),
246 buffer_cache_runtime(device, memory_allocator, scheduler, staging_pool, 249 buffer_cache_runtime(device, memory_allocator, scheduler, staging_pool,
247 update_descriptor_queue, descriptor_pool), 250 update_descriptor_queue, descriptor_pool),
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h
index acea1ba2d..235afc6f3 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.h
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.h
@@ -173,6 +173,7 @@ private:
173 VKDescriptorPool descriptor_pool; 173 VKDescriptorPool descriptor_pool;
174 VKUpdateDescriptorQueue update_descriptor_queue; 174 VKUpdateDescriptorQueue update_descriptor_queue;
175 BlitImageHelper blit_image; 175 BlitImageHelper blit_image;
176 ASTCDecoderPass astc_decoder_pass;
176 177
177 GraphicsPipelineCacheKey graphics_key; 178 GraphicsPipelineCacheKey graphics_key;
178 179
diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
index 22a1014a9..f7f744587 100644
--- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
@@ -10,6 +10,7 @@
10#include "video_core/engines/fermi_2d.h" 10#include "video_core/engines/fermi_2d.h"
11#include "video_core/renderer_vulkan/blit_image.h" 11#include "video_core/renderer_vulkan/blit_image.h"
12#include "video_core/renderer_vulkan/maxwell_to_vk.h" 12#include "video_core/renderer_vulkan/maxwell_to_vk.h"
13#include "video_core/renderer_vulkan/vk_compute_pass.h"
13#include "video_core/renderer_vulkan/vk_rasterizer.h" 14#include "video_core/renderer_vulkan/vk_rasterizer.h"
14#include "video_core/renderer_vulkan/vk_scheduler.h" 15#include "video_core/renderer_vulkan/vk_scheduler.h"
15#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" 16#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h"
@@ -807,7 +808,7 @@ Image::Image(TextureCacheRuntime& runtime, const ImageInfo& info_, GPUVAddr gpu_
807 commit = runtime.memory_allocator.Commit(buffer, MemoryUsage::DeviceLocal); 808 commit = runtime.memory_allocator.Commit(buffer, MemoryUsage::DeviceLocal);
808 } 809 }
809 if (IsPixelFormatASTC(info.format) && !runtime.device.IsOptimalAstcSupported()) { 810 if (IsPixelFormatASTC(info.format) && !runtime.device.IsOptimalAstcSupported()) {
810 flags |= VideoCommon::ImageFlagBits::Converted; 811 flags |= VideoCommon::ImageFlagBits::AcceleratedUpload;
811 } 812 }
812 if (runtime.device.HasDebuggingToolAttached()) { 813 if (runtime.device.HasDebuggingToolAttached()) {
813 if (image) { 814 if (image) {
@@ -816,6 +817,34 @@ Image::Image(TextureCacheRuntime& runtime, const ImageInfo& info_, GPUVAddr gpu_
816 buffer.SetObjectNameEXT(VideoCommon::Name(*this).c_str()); 817 buffer.SetObjectNameEXT(VideoCommon::Name(*this).c_str());
817 } 818 }
818 } 819 }
820 static constexpr VkImageViewUsageCreateInfo storage_image_view_usage_create_info{
821 .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_USAGE_CREATE_INFO,
822 .pNext = nullptr,
823 .usage = VK_IMAGE_USAGE_STORAGE_BIT,
824 };
825 if (IsPixelFormatASTC(info.format) && !runtime.device.IsOptimalAstcSupported()) {
826 storage_image_view = runtime.device.GetLogical().CreateImageView(VkImageViewCreateInfo{
827 .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
828 .pNext = &storage_image_view_usage_create_info,
829 .flags = 0,
830 .image = *image,
831 .viewType = VK_IMAGE_VIEW_TYPE_2D,
832 .format = VK_FORMAT_A8B8G8R8_UNORM_PACK32,
833 .components{
834 .r = VK_COMPONENT_SWIZZLE_IDENTITY,
835 .g = VK_COMPONENT_SWIZZLE_IDENTITY,
836 .b = VK_COMPONENT_SWIZZLE_IDENTITY,
837 .a = VK_COMPONENT_SWIZZLE_IDENTITY,
838 },
839 .subresourceRange{
840 .aspectMask = aspect_mask,
841 .baseMipLevel = 0,
842 .levelCount = VK_REMAINING_MIP_LEVELS,
843 .baseArrayLayer = 0,
844 .layerCount = VK_REMAINING_ARRAY_LAYERS,
845 },
846 });
847 }
819} 848}
820 849
821void Image::UploadMemory(const StagingBufferRef& map, std::span<const BufferImageCopy> copies) { 850void Image::UploadMemory(const StagingBufferRef& map, std::span<const BufferImageCopy> copies) {
@@ -918,7 +947,6 @@ ImageView::ImageView(TextureCacheRuntime& runtime, const VideoCommon::ImageViewI
918 } 947 }
919 } 948 }
920 const auto format_info = MaxwellToVK::SurfaceFormat(*device, FormatType::Optimal, true, format); 949 const auto format_info = MaxwellToVK::SurfaceFormat(*device, FormatType::Optimal, true, format);
921 const VkFormat vk_format = format_info.format;
922 const VkImageViewUsageCreateInfo image_view_usage{ 950 const VkImageViewUsageCreateInfo image_view_usage{
923 .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_USAGE_CREATE_INFO, 951 .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_USAGE_CREATE_INFO,
924 .pNext = nullptr, 952 .pNext = nullptr,
@@ -930,7 +958,7 @@ ImageView::ImageView(TextureCacheRuntime& runtime, const VideoCommon::ImageViewI
930 .flags = 0, 958 .flags = 0,
931 .image = image.Handle(), 959 .image = image.Handle(),
932 .viewType = VkImageViewType{}, 960 .viewType = VkImageViewType{},
933 .format = vk_format, 961 .format = format_info.format,
934 .components{ 962 .components{
935 .r = ComponentSwizzle(swizzle[0]), 963 .r = ComponentSwizzle(swizzle[0]),
936 .g = ComponentSwizzle(swizzle[1]), 964 .g = ComponentSwizzle(swizzle[1]),
@@ -982,7 +1010,7 @@ ImageView::ImageView(TextureCacheRuntime& runtime, const VideoCommon::ImageViewI
982 .pNext = nullptr, 1010 .pNext = nullptr,
983 .flags = 0, 1011 .flags = 0,
984 .buffer = image.Buffer(), 1012 .buffer = image.Buffer(),
985 .format = vk_format, 1013 .format = format_info.format,
986 .offset = 0, // TODO: Redesign buffer cache to support this 1014 .offset = 0, // TODO: Redesign buffer cache to support this
987 .range = image.guest_size_bytes, 1015 .range = image.guest_size_bytes,
988 }); 1016 });
@@ -1167,4 +1195,13 @@ Framebuffer::Framebuffer(TextureCacheRuntime& runtime, std::span<ImageView*, NUM
1167 } 1195 }
1168} 1196}
1169 1197
1198void TextureCacheRuntime::AccelerateImageUpload(
1199 Image& image, const StagingBufferRef& map,
1200 std::span<const VideoCommon::SwizzleParameters> swizzles) {
1201 if (IsPixelFormatASTC(image.info.format)) {
1202 return astc_decoder_pass.Assemble(image, map, swizzles);
1203 }
1204 UNREACHABLE();
1205}
1206
1170} // namespace Vulkan 1207} // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h
index 3aee27ce0..51705eccb 100644
--- a/src/video_core/renderer_vulkan/vk_texture_cache.h
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.h
@@ -20,6 +20,7 @@ using VideoCommon::Offset2D;
20using VideoCommon::RenderTargets; 20using VideoCommon::RenderTargets;
21using VideoCore::Surface::PixelFormat; 21using VideoCore::Surface::PixelFormat;
22 22
23class ASTCDecoderPass;
23class BlitImageHelper; 24class BlitImageHelper;
24class Device; 25class Device;
25class Image; 26class Image;
@@ -60,6 +61,7 @@ struct TextureCacheRuntime {
60 MemoryAllocator& memory_allocator; 61 MemoryAllocator& memory_allocator;
61 StagingBufferPool& staging_buffer_pool; 62 StagingBufferPool& staging_buffer_pool;
62 BlitImageHelper& blit_image_helper; 63 BlitImageHelper& blit_image_helper;
64 ASTCDecoderPass& astc_decoder_pass;
63 std::unordered_map<RenderPassKey, vk::RenderPass> renderpass_cache{}; 65 std::unordered_map<RenderPassKey, vk::RenderPass> renderpass_cache{};
64 66
65 void Finish(); 67 void Finish();
@@ -83,9 +85,7 @@ struct TextureCacheRuntime {
83 } 85 }
84 86
85 void AccelerateImageUpload(Image&, const StagingBufferRef&, 87 void AccelerateImageUpload(Image&, const StagingBufferRef&,
86 std::span<const VideoCommon::SwizzleParameters>) { 88 std::span<const VideoCommon::SwizzleParameters>);
87 UNREACHABLE();
88 }
89 89
90 void InsertUploadMemoryBarrier() {} 90 void InsertUploadMemoryBarrier() {}
91 91
@@ -125,11 +125,17 @@ public:
125 return aspect_mask; 125 return aspect_mask;
126 } 126 }
127 127
128 [[nodiscard]] VkImageView StorageImageView() const noexcept {
129 return *storage_image_view;
130 }
131
128private: 132private:
129 VKScheduler* scheduler; 133 VKScheduler* scheduler;
130 vk::Image image; 134 vk::Image image;
131 vk::Buffer buffer; 135 vk::Buffer buffer;
132 MemoryCommit commit; 136 MemoryCommit commit;
137 vk::ImageView image_view;
138 vk::ImageView storage_image_view;
133 VkImageAspectFlags aspect_mask = 0; 139 VkImageAspectFlags aspect_mask = 0;
134 bool initialized = false; 140 bool initialized = false;
135}; 141};
diff --git a/src/video_core/texture_cache/accelerated_swizzle.h b/src/video_core/texture_cache/accelerated_swizzle.h
index 6ec5c78c4..a11c924e1 100644
--- a/src/video_core/texture_cache/accelerated_swizzle.h
+++ b/src/video_core/texture_cache/accelerated_swizzle.h
@@ -13,8 +13,8 @@
13namespace VideoCommon::Accelerated { 13namespace VideoCommon::Accelerated {
14 14
15struct BlockLinearSwizzle2DParams { 15struct BlockLinearSwizzle2DParams {
16 std::array<u32, 3> origin; 16 alignas(16) std::array<u32, 3> origin;
17 std::array<s32, 3> destination; 17 alignas(16) std::array<s32, 3> destination;
18 u32 bytes_per_block_log2; 18 u32 bytes_per_block_log2;
19 u32 layer_stride; 19 u32 layer_stride;
20 u32 block_size; 20 u32 block_size;
diff --git a/src/video_core/textures/decoders.cpp b/src/video_core/textures/decoders.cpp
index 62685a183..3a463d5db 100644
--- a/src/video_core/textures/decoders.cpp
+++ b/src/video_core/textures/decoders.cpp
@@ -17,26 +17,7 @@
17#include "video_core/textures/texture.h" 17#include "video_core/textures/texture.h"
18 18
19namespace Tegra::Texture { 19namespace Tegra::Texture {
20
21namespace { 20namespace {
22/**
23 * This table represents the internal swizzle of a gob, in format 16 bytes x 2 sector packing.
24 * Calculates the offset of an (x, y) position within a swizzled texture.
25 * Taken from the Tegra X1 Technical Reference Manual. pages 1187-1188
26 */
27constexpr SwizzleTable MakeSwizzleTableConst() {
28 SwizzleTable table{};
29 for (u32 y = 0; y < table.size(); ++y) {
30 for (u32 x = 0; x < table[0].size(); ++x) {
31 table[y][x] = ((x % 64) / 32) * 256 + ((y % 8) / 2) * 64 + ((x % 32) / 16) * 32 +
32 (y % 2) * 16 + (x % 16);
33 }
34 }
35 return table;
36}
37
38constexpr SwizzleTable SWIZZLE_TABLE = MakeSwizzleTableConst();
39
40template <bool TO_LINEAR> 21template <bool TO_LINEAR>
41void Swizzle(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, u32 width, 22void Swizzle(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, u32 width,
42 u32 height, u32 depth, u32 block_height, u32 block_depth, u32 stride_alignment) { 23 u32 height, u32 depth, u32 block_height, u32 block_depth, u32 stride_alignment) {
@@ -91,10 +72,6 @@ void Swizzle(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixe
91} 72}
92} // Anonymous namespace 73} // Anonymous namespace
93 74
94SwizzleTable MakeSwizzleTable() {
95 return SWIZZLE_TABLE;
96}
97
98void UnswizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, 75void UnswizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel,
99 u32 width, u32 height, u32 depth, u32 block_height, u32 block_depth, 76 u32 width, u32 height, u32 depth, u32 block_height, u32 block_depth,
100 u32 stride_alignment) { 77 u32 stride_alignment) {
diff --git a/src/video_core/textures/decoders.h b/src/video_core/textures/decoders.h
index d7cdc81e8..4c14cefbf 100644
--- a/src/video_core/textures/decoders.h
+++ b/src/video_core/textures/decoders.h
@@ -23,8 +23,22 @@ constexpr u32 GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_
23 23
24using SwizzleTable = std::array<std::array<u32, GOB_SIZE_X>, GOB_SIZE_Y>; 24using SwizzleTable = std::array<std::array<u32, GOB_SIZE_X>, GOB_SIZE_Y>;
25 25
26/// Returns a z-order swizzle table 26/**
27SwizzleTable MakeSwizzleTable(); 27 * This table represents the internal swizzle of a gob, in format 16 bytes x 2 sector packing.
28 * Calculates the offset of an (x, y) position within a swizzled texture.
29 * Taken from the Tegra X1 Technical Reference Manual. pages 1187-1188
30 */
31constexpr SwizzleTable MakeSwizzleTable() {
32 SwizzleTable table{};
33 for (u32 y = 0; y < table.size(); ++y) {
34 for (u32 x = 0; x < table[0].size(); ++x) {
35 table[y][x] = ((x % 64) / 32) * 256 + ((y % 8) / 2) * 64 + ((x % 32) / 16) * 32 +
36 (y % 2) * 16 + (x % 16);
37 }
38 }
39 return table;
40}
41constexpr SwizzleTable SWIZZLE_TABLE = MakeSwizzleTable();
28 42
29/// Unswizzles a block linear texture into linear memory. 43/// Unswizzles a block linear texture into linear memory.
30void UnswizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, 44void UnswizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel,