astc: Enable parallel CPU ASTC decoding

Given the issues with GPU-accelerated ASTC decoding on NVIDIA's latest drivers, parallelize ASTC decoding on the CPU. Decoding uses half of the system's hardware threads (always at least one), with each row of blocks queued as a separate work item.
author: Morph 2022-09-01 21:29:22 -0400
committer: Morph 2022-09-16 10:16:42 -0400
commit: 809126c94a0ed8e7964d5a550abf7b3731d00512 (patch)
tree: c0b1554f05ea0863e826d8ca77b7380b78fc2a74
parent: Merge pull request #8891 from Kelebek1/pragma (diff)
download: yuzu-809126c94a0ed8e7964d5a550abf7b3731d00512.tar.gz
yuzu-809126c94a0ed8e7964d5a550abf7b3731d00512.tar.xz
yuzu-809126c94a0ed8e7964d5a550abf7b3731d00512.zip
1 files changed, 35 insertions, 21 deletions
diff --git a/src/video_core/textures/astc.cpp b/src/video_core/textures/astc.cpp
index e3f3d3c5d..b159494c5 100644
--- a/src/video_core/textures/astc.cpp
+++ b/src/video_core/textures/astc.cpp
@@ -13,7 +13,9 @@
 #include <boost/container/static_vector.hpp>
+#include "common/alignment.h"
 #include "common/common_types.h"
+#include "common/thread_worker.h"
 #include "video_core/textures/astc.h"
 class InputBitStream {
@@ -1650,29 +1652,41 @@ static void DecompressBlock(std::span<const u8, 16> inBuf, const u32 blockWidth,
 void Decompress(std::span<const uint8_t> data, uint32_t width, uint32_t height, uint32_t depth,
                uint32_t block_width, uint32_t block_height, std::span<uint8_t> output) {
-    u32 block_index = 0;
+    const u32 rows = Common::DivideUp(height, block_height);
-    std::size_t depth_offset = 0;
+    const u32 cols = Common::DivideUp(width, block_width);
-    for (u32 z = 0; z < depth; z++) {
-        for (u32 y = 0; y < height; y += block_height) {
+    Common::ThreadWorker workers{std::max(std::thread::hardware_concurrency(), 2U) / 2,
-            for (u32 x = 0; x < width; x += block_width) {
+                                 "yuzu:ASTCDecompress"};
-                const std::span<const u8, 16> blockPtr{data.subspan(block_index * 16, 16)};
+    for (u32 z = 0; z < depth; ++z) {
-                // Blocks can be at most 12x12
+        const u32 depth_offset = z * height * width * 4;
-                std::array<u32, 12 * 12> uncompData;
+        for (u32 y_index = 0; y_index < rows; ++y_index) {
-                DecompressBlock(blockPtr, block_width, block_height, uncompData);
+            auto decompress_stride = [data, width, height, depth, block_width, block_height, output,
+                                      rows, cols, z, depth_offset, y_index] {
-                u32 decompWidth = std::min(block_width, width - x);
+                const u32 y = y_index * block_height;
-                u32 decompHeight = std::min(block_height, height - y);
+                for (u32 x_index = 0; x_index < cols; ++x_index) {
+                    const u32 block_index = (z * rows * cols) + (y_index * cols) + x_index;
-                const std::span<u8> outRow = output.subspan(depth_offset + (y * width + x) * 4);
+                    const u32 x = x_index * block_width;
-                for (u32 jj = 0; jj < decompHeight; jj++) {
-                    std::memcpy(outRow.data() + jj * width * 4,
+                    const std::span<const u8, 16> blockPtr{data.subspan(block_index * 16, 16)};
-                                uncompData.data() + jj * block_width, decompWidth * 4);
+                    // Blocks can be at most 12x12
+                    std::array<u32, 12 * 12> uncompData;
+                    DecompressBlock(blockPtr, block_width, block_height, uncompData);
+                    u32 decompWidth = std::min(block_width, width - x);
+                    u32 decompHeight = std::min(block_height, height - y);
+                    const std::span<u8> outRow = output.subspan(depth_offset + (y * width + x) * 4);
+                    for (u32 h = 0; h < decompHeight; ++h) {
+                        std::memcpy(outRow.data() + h * width * 4,
+                                    uncompData.data() + h * block_width, decompWidth * 4);
+                    }
                }
-                ++block_index;
+            };
-            }
+            workers.QueueWork(std::move(decompress_stride));
        }
-        depth_offset += height * width * 4;
+        workers.WaitForRequests();
    }
 }
author	Morph	2022-09-01 21:29:22 -0400
committer	Morph	2022-09-16 10:16:42 -0400
commit	809126c94a0ed8e7964d5a550abf7b3731d00512 (patch)
tree	c0b1554f05ea0863e826d8ca77b7380b78fc2a74
parent	Merge pull request #8891 from Kelebek1/pragma (diff)
download	yuzu-809126c94a0ed8e7964d5a550abf7b3731d00512.tar.gz yuzu-809126c94a0ed8e7964d5a550abf7b3731d00512.tar.xz yuzu-809126c94a0ed8e7964d5a550abf7b3731d00512.zip

diff --git a/src/video_core/textures/astc.cpp b/src/video_core/textures/astc.cpp index e3f3d3c5d..b159494c5 100644 --- a/src/video_core/textures/astc.cpp +++ b/src/video_core/textures/astc.cpp
@@ -13,7 +13,9 @@
13		13
14	#include <boost/container/static_vector.hpp>	14	#include <boost/container/static_vector.hpp>
15		15
		16	#include "common/alignment.h"
16	#include "common/common_types.h"	17	#include "common/common_types.h"
		18	#include "common/thread_worker.h"
17	#include "video_core/textures/astc.h"	19	#include "video_core/textures/astc.h"
18		20
19	class InputBitStream {	21	class InputBitStream {
@@ -1650,29 +1652,41 @@ static void DecompressBlock(std::span<const u8, 16> inBuf, const u32 blockWidth,
1650		1652
1651	void Decompress(std::span<const uint8_t> data, uint32_t width, uint32_t height, uint32_t depth,	1653	void Decompress(std::span<const uint8_t> data, uint32_t width, uint32_t height, uint32_t depth,
1652	uint32_t block_width, uint32_t block_height, std::span<uint8_t> output) {	1654	uint32_t block_width, uint32_t block_height, std::span<uint8_t> output) {
1653	u32 block_index = 0;	1655	const u32 rows = Common::DivideUp(height, block_height);
1654	std::size_t depth_offset = 0;	1656	const u32 cols = Common::DivideUp(width, block_width);
1655	for (u32 z = 0; z < depth; z++) {	1657
1656	for (u32 y = 0; y < height; y += block_height) {	1658	Common::ThreadWorker workers{std::max(std::thread::hardware_concurrency(), 2U) / 2,
1657	for (u32 x = 0; x < width; x += block_width) {	1659	"yuzu:ASTCDecompress"};
1658	const std::span<const u8, 16> blockPtr{data.subspan(block_index * 16, 16)};	1660
1659		1661	for (u32 z = 0; z < depth; ++z) {
1660	// Blocks can be at most 12x12	1662	const u32 depth_offset = z * height * width * 4;
1661	std::array<u32, 12 * 12> uncompData;	1663	for (u32 y_index = 0; y_index < rows; ++y_index) {
1662	DecompressBlock(blockPtr, block_width, block_height, uncompData);	1664	auto decompress_stride = [data, width, height, depth, block_width, block_height, output,
1663		1665	rows, cols, z, depth_offset, y_index] {
1664	u32 decompWidth = std::min(block_width, width - x);	1666	const u32 y = y_index * block_height;
1665	u32 decompHeight = std::min(block_height, height - y);	1667	for (u32 x_index = 0; x_index < cols; ++x_index) {
1666		1668	const u32 block_index = (z * rows * cols) + (y_index * cols) + x_index;
1667	const std::span<u8> outRow = output.subspan(depth_offset + (y * width + x) * 4);	1669	const u32 x = x_index * block_width;
1668	for (u32 jj = 0; jj < decompHeight; jj++) {	1670
1669	std::memcpy(outRow.data() + jj * width * 4,	1671	const std::span<const u8, 16> blockPtr{data.subspan(block_index * 16, 16)};
1670	uncompData.data() + jj * block_width, decompWidth * 4);	1672
		1673	// Blocks can be at most 12x12
		1674	std::array<u32, 12 * 12> uncompData;
		1675	DecompressBlock(blockPtr, block_width, block_height, uncompData);
		1676
		1677	u32 decompWidth = std::min(block_width, width - x);
		1678	u32 decompHeight = std::min(block_height, height - y);
		1679
		1680	const std::span<u8> outRow = output.subspan(depth_offset + (y * width + x) * 4);
		1681	for (u32 h = 0; h < decompHeight; ++h) {
		1682	std::memcpy(outRow.data() + h * width * 4,
		1683	uncompData.data() + h * block_width, decompWidth * 4);
		1684	}
1671	}	1685	}
1672	++block_index;	1686	};
1673	}	1687	workers.QueueWork(std::move(decompress_stride));
1674	}	1688	}
1675	depth_offset += height * width * 4;	1689	workers.WaitForRequests();
1676	}	1690	}
1677	}	1691	}
1678		1692