summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGravatar Morph2022-09-01 21:29:22 -0400
committerGravatar Morph2022-09-16 10:16:42 -0400
commit809126c94a0ed8e7964d5a550abf7b3731d00512 (patch)
treec0b1554f05ea0863e826d8ca77b7380b78fc2a74
parentMerge pull request #8891 from Kelebek1/pragma (diff)
downloadyuzu-809126c94a0ed8e7964d5a550abf7b3731d00512.tar.gz
yuzu-809126c94a0ed8e7964d5a550abf7b3731d00512.tar.xz
yuzu-809126c94a0ed8e7964d5a550abf7b3731d00512.zip
astc: Enable parallel CPU astc decoding
Given the issues with GPU-accelerated ASTC decoding on NVIDIA's latest drivers, parallelize ASTC decoding on the CPU. Uses half of the available hardware threads in the system (with a minimum of one) for ASTC decoding.
Diffstat (limited to '')
-rw-r--r--src/video_core/textures/astc.cpp56
1 file changed, 35 insertions, 21 deletions
diff --git a/src/video_core/textures/astc.cpp b/src/video_core/textures/astc.cpp
index e3f3d3c5d..b159494c5 100644
--- a/src/video_core/textures/astc.cpp
+++ b/src/video_core/textures/astc.cpp
@@ -13,7 +13,9 @@
13 13
14#include <boost/container/static_vector.hpp> 14#include <boost/container/static_vector.hpp>
15 15
16#include "common/alignment.h"
16#include "common/common_types.h" 17#include "common/common_types.h"
18#include "common/thread_worker.h"
17#include "video_core/textures/astc.h" 19#include "video_core/textures/astc.h"
18 20
19class InputBitStream { 21class InputBitStream {
@@ -1650,29 +1652,41 @@ static void DecompressBlock(std::span<const u8, 16> inBuf, const u32 blockWidth,
1650 1652
1651void Decompress(std::span<const uint8_t> data, uint32_t width, uint32_t height, uint32_t depth, 1653void Decompress(std::span<const uint8_t> data, uint32_t width, uint32_t height, uint32_t depth,
1652 uint32_t block_width, uint32_t block_height, std::span<uint8_t> output) { 1654 uint32_t block_width, uint32_t block_height, std::span<uint8_t> output) {
1653 u32 block_index = 0; 1655 const u32 rows = Common::DivideUp(height, block_height);
1654 std::size_t depth_offset = 0; 1656 const u32 cols = Common::DivideUp(width, block_width);
1655 for (u32 z = 0; z < depth; z++) { 1657
1656 for (u32 y = 0; y < height; y += block_height) { 1658 Common::ThreadWorker workers{std::max(std::thread::hardware_concurrency(), 2U) / 2,
1657 for (u32 x = 0; x < width; x += block_width) { 1659 "yuzu:ASTCDecompress"};
1658 const std::span<const u8, 16> blockPtr{data.subspan(block_index * 16, 16)}; 1660
1659 1661 for (u32 z = 0; z < depth; ++z) {
1660 // Blocks can be at most 12x12 1662 const u32 depth_offset = z * height * width * 4;
1661 std::array<u32, 12 * 12> uncompData; 1663 for (u32 y_index = 0; y_index < rows; ++y_index) {
1662 DecompressBlock(blockPtr, block_width, block_height, uncompData); 1664 auto decompress_stride = [data, width, height, depth, block_width, block_height, output,
1663 1665 rows, cols, z, depth_offset, y_index] {
1664 u32 decompWidth = std::min(block_width, width - x); 1666 const u32 y = y_index * block_height;
1665 u32 decompHeight = std::min(block_height, height - y); 1667 for (u32 x_index = 0; x_index < cols; ++x_index) {
1666 1668 const u32 block_index = (z * rows * cols) + (y_index * cols) + x_index;
1667 const std::span<u8> outRow = output.subspan(depth_offset + (y * width + x) * 4); 1669 const u32 x = x_index * block_width;
1668 for (u32 jj = 0; jj < decompHeight; jj++) { 1670
1669 std::memcpy(outRow.data() + jj * width * 4, 1671 const std::span<const u8, 16> blockPtr{data.subspan(block_index * 16, 16)};
1670 uncompData.data() + jj * block_width, decompWidth * 4); 1672
1673 // Blocks can be at most 12x12
1674 std::array<u32, 12 * 12> uncompData;
1675 DecompressBlock(blockPtr, block_width, block_height, uncompData);
1676
1677 u32 decompWidth = std::min(block_width, width - x);
1678 u32 decompHeight = std::min(block_height, height - y);
1679
1680 const std::span<u8> outRow = output.subspan(depth_offset + (y * width + x) * 4);
1681 for (u32 h = 0; h < decompHeight; ++h) {
1682 std::memcpy(outRow.data() + h * width * 4,
1683 uncompData.data() + h * block_width, decompWidth * 4);
1684 }
1671 } 1685 }
1672 ++block_index; 1686 };
1673 } 1687 workers.QueueWork(std::move(decompress_stride));
1674 } 1688 }
1675 depth_offset += height * width * 4; 1689 workers.WaitForRequests();
1676 } 1690 }
1677} 1691}
1678 1692