Query Cache: Implement host side sample counting.

author: Fernando Sahmkow 2023-08-20 17:53:08 +0200
committer: Fernando Sahmkow 2023-09-23 23:05:30 +0200
commit: c8237d5c312485394389b2520451ef720604ea9a (patch)
tree: 1a1064ed38a7a53bd61e4c04bf4571cdebfce2ec /src/video_core/host_shaders
parent: Query Cache: Fix guest side sample counting (diff)
download: yuzu-c8237d5c312485394389b2520451ef720604ea9a.tar.gz
yuzu-c8237d5c312485394389b2520451ef720604ea9a.tar.xz
yuzu-c8237d5c312485394389b2520451ef720604ea9a.zip
2 files changed, 125 insertions, 0 deletions
diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt
index fb24b6532..8218ec4c8 100644
--- a/src/video_core/host_shaders/CMakeLists.txt
+++ b/src/video_core/host_shaders/CMakeLists.txt
@@ -41,6 +41,7 @@ set(SHADER_FILES
    pitch_unswizzle.comp
    present_bicubic.frag
    present_gaussian.frag
+    queries_prefix_scan_sum.comp
    resolve_conditional_render.comp
    smaa_edge_detection.vert
    smaa_edge_detection.frag
diff --git a/src/video_core/host_shaders/queries_prefix_scan_sum.comp b/src/video_core/host_shaders/queries_prefix_scan_sum.comp
new file mode 100644
index 000000000..dce1279fe
--- /dev/null
+++ b/src/video_core/host_shaders/queries_prefix_scan_sum.comp
@@ -0,0 +1,124 @@
+// SPDX-FileCopyrightText: Copyright 2015 Graham Sellers, Richard Wright Jr. and Nicholas Haemel
+// SPDX-License-Identifier: MIT
+// Code obtained from OpenGL SuperBible, Seventh Edition by Graham Sellers, Richard Wright Jr. and
+// Nicholas Haemel. Modified to suit needs and optimize for subgroup
+#version 460 core
+// Settings that are identical for both backends.
+#extension GL_KHR_shader_subgroup_arithmetic : enable
+#define BINDING_INPUT_BUFFER 0
+#ifdef VULKAN
+// VULKAN defined: the parameters are wrapped in a push_constant block, so the BEGIN/END macros
+// open and close that block and UNIFORM(n) expands to nothing.
+#define HAS_EXTENDED_TYPES 1
+#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
+#define END_PUSH_CONSTANTS };
+#define UNIFORM(n)
+#define BINDING_OUTPUT_IMAGE 1
+#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv
+// VULKAN not defined: each parameter is a standalone uniform at an explicit location, so the
+// BEGIN/END macros expand to nothing.
+#extension GL_NV_gpu_shader5 : enable
+// HAS_EXTENDED_TYPES follows whether the GL_NV_gpu_shader5 macro is defined.
+#ifdef GL_NV_gpu_shader5
+#define HAS_EXTENDED_TYPES 1
+#else
+#define HAS_EXTENDED_TYPES 0
+#endif
+#define BEGIN_PUSH_CONSTANTS
+#define END_PUSH_CONSTANTS
+#define UNIFORM(n) layout(location = n) uniform
+#define BINDING_OUTPUT_IMAGE 0
+#endif
+// Dispatch parameters. Through the macros defined earlier these become members of a
+// push_constant block when VULKAN is defined, and plain uniforms at locations 0 and 1 otherwise.
+BEGIN_PUSH_CONSTANTS
+// Elements whose index is below this value get the previous accumulated_data added in main().
+UNIFORM(0) uint max_accumulation_base;
+// Index into the scan results that invocation 0 uses when it rewrites accumulated_data.
+UNIFORM(1) uint accumulation_limit;
+END_PUSH_CONSTANTS
+// Work groups of 32 invocations; main() has every invocation process two elements.
+layout(local_size_x = 32) in;
+// Values to scan. Each uvec2 holds one 64-bit quantity (.x = low word, .y = high word).
+// The array is runtime-sized because main() indexes it up to 2 * gl_WorkGroupSize.x - 1; a
+// declared length of gl_WorkGroupSize.x would put the upper half of those accesses out of
+// bounds. The bound buffer must provide at least 2 * gl_WorkGroupSize.x elements.
+layout(std430, binding = 0) readonly buffer block1 {
+    uvec2 input_data[];
+};
+// Scan results, indexed exactly like input_data (same size requirement).
+layout(std430, binding = 1) writeonly coherent buffer block2 {
+    uvec2 output_data[];
+};
+// Running total: read at the start of main() and rewritten by invocation 0 at the end.
+layout(std430, binding = 2) coherent buffer block3 {
+    uvec2 accumulated_data;
+};
+// Work-group scratch space for the scan, two elements per invocation.
+shared uvec2 shared_data[gl_WorkGroupSize.x * 2];
+// Returns value_1 + value_2, treating each uvec2 as a 64-bit unsigned integer with the low
+// 32 bits in .x and the high 32 bits in .y. A carry out of the high word is discarded.
+uvec2 AddUint64(uvec2 value_1, uvec2 value_2) {
+    uint low_carry;
+    uint low_sum = uaddCarry(value_1.x, value_2.x, low_carry);
+    uint high_sum = value_1.y + value_2.y + low_carry;
+    return uvec2(low_sum, high_sum);
+}
+// Entry point. Computes an inclusive 64-bit prefix sum over the 2 * gl_WorkGroupSize.x values of
+// input_data, adds the previous accumulated_data to every element whose index is below
+// max_accumulation_base, writes the results to output_data, and finally has invocation 0
+// rewrite accumulated_data.
+void main(void) {
+    uint id = gl_LocalInvocationID.x;
+    // Read the carried-in total first: invocation 0 overwrites accumulated_data at the end.
+    uvec2 base_value_1 = (id * 2) < max_accumulation_base ? accumulated_data : uvec2(0);
+    uvec2 base_value_2 = (id * 2 + 1) < max_accumulation_base ? accumulated_data : uvec2(0);
+    uint work_size = gl_WorkGroupSize.x;
+    uint rd_id;
+    uint wr_id;
+    uint mask;
+    uvec2 input_1 = input_data[id * 2];
+    uvec2 input_2 = input_data[id * 2 + 1];
+    // The number of steps is floor(log2(work group size)) + 1; the work group size should be a
+    // power of 2. findMSB computes this with integers only, so unlike a float log2 followed by
+    // truncation it cannot lose a step to rounding error.
+    const uint steps = uint(findMSB(work_size)) + 1;
+    // Each invocation is responsible for the content of
+    // two elements of the output array
+    shared_data[id * 2] = input_1;
+    shared_data[id * 2 + 1] = input_2;
+    // Synchronize to make sure that everyone has initialized
+    // their elements of shared_data[] with data loaded from
+    // the input arrays
+    barrier();
+    memoryBarrierShared();
+    // For each step...
+    for (uint step = 0; step < steps; step++) {
+        // At this step the array is viewed as chunks of 2^(step + 1) elements. rd_id is the last
+        // element of the chunk's lower half (the sum of that half); wr_id is this invocation's
+        // element in the upper half.
+        mask = (1 << step) - 1;
+        rd_id = ((id >> step) << (step + 1)) + mask;
+        wr_id = rd_id + 1 + (id & mask);
+        // Accumulate the read data into our element
+        shared_data[wr_id] = AddUint64(shared_data[rd_id], shared_data[wr_id]);
+        // Synchronize again so the next step only reads completed partial sums
+        barrier();
+        memoryBarrierShared();
+    }
+    // Add the accumulation
+    shared_data[id * 2] = AddUint64(shared_data[id * 2], base_value_1);
+    shared_data[id * 2 + 1] = AddUint64(shared_data[id * 2 + 1], base_value_2);
+    // Invocation 0 reads elements owned by other invocations below, so wait for all of them.
+    barrier();
+    memoryBarrierShared();
+    // Finally write our data back to the output buffer
+    output_data[id * 2] = shared_data[id * 2];
+    output_data[id * 2 + 1] = shared_data[id * 2 + 1];
+    if (id == 0) {
+        if (max_accumulation_base >= accumulation_limit + 1) {
+            accumulated_data = shared_data[accumulation_limit];
+            return;
+        }
+        uvec2 value_1 = shared_data[max_accumulation_base];
+        uvec2 value_2 = shared_data[accumulation_limit];
+        // value_1 - value_2 as a 64-bit subtraction. Unary minus on a uvec2 negates each 32-bit
+        // word on its own, which is not a 64-bit negation, so build the two's complement
+        // explicitly: -v == ~v + 1, with the +1 carried across words by AddUint64.
+        uvec2 negated_value_2 = AddUint64(~value_2, uvec2(1, 0));
+        accumulated_data = AddUint64(value_1, negated_value_2);
+    }
+}
+\ No newline at end of file
author	Fernando Sahmkow	2023-08-20 17:53:08 +0200
committer	Fernando Sahmkow	2023-09-23 23:05:30 +0200
commit	c8237d5c312485394389b2520451ef720604ea9a (patch)
tree	1a1064ed38a7a53bd61e4c04bf4571cdebfce2ec /src/video_core/host_shaders
parent	Query Cache: Fix guest side sample counting (diff)
download	yuzu-c8237d5c312485394389b2520451ef720604ea9a.tar.gz yuzu-c8237d5c312485394389b2520451ef720604ea9a.tar.xz yuzu-c8237d5c312485394389b2520451ef720604ea9a.zip

diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt index fb24b6532..8218ec4c8 100644 --- a/src/video_core/host_shaders/CMakeLists.txt +++ b/src/video_core/host_shaders/CMakeLists.txt
@@ -41,6 +41,7 @@ set(SHADER_FILES
41	pitch_unswizzle.comp	41	pitch_unswizzle.comp
42	present_bicubic.frag	42	present_bicubic.frag
43	present_gaussian.frag	43	present_gaussian.frag
		44	queries_prefix_scan_sum.comp
44	resolve_conditional_render.comp	45	resolve_conditional_render.comp
45	smaa_edge_detection.vert	46	smaa_edge_detection.vert
46	smaa_edge_detection.frag	47	smaa_edge_detection.frag


diff --git a/src/video_core/host_shaders/queries_prefix_scan_sum.comp b/src/video_core/host_shaders/queries_prefix_scan_sum.comp new file mode 100644 index 000000000..dce1279fe --- /dev/null +++ b/src/video_core/host_shaders/queries_prefix_scan_sum.comp
@@ -0,0 +1,124 @@
		1	// SPDX-FileCopyrightText: Copyright 2015 Graham Sellers, Richard Wright Jr. and Nicholas Haemel
		2	// SPDX-License-Identifier: MIT
		3
		4	// Code obtained from OpenGL SuperBible, Seventh Edition by Graham Sellers, Richard Wright Jr. and
		5	// Nicholas Haemel. Modified to suit needs and optimize for subgroup
		6
		7	#version 460 core
		8
		9	#ifdef VULKAN
		10	// VULKAN defined: parameters go in a push_constant block, so UNIFORM(n) expands to nothing
		11	#extension GL_KHR_shader_subgroup_arithmetic : enable
		12	#define HAS_EXTENDED_TYPES 1
		13	#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
		14	#define END_PUSH_CONSTANTS \
		15	} \
		16	;
		17	#define UNIFORM(n)
		18	#define BINDING_INPUT_BUFFER 0
		19	#define BINDING_OUTPUT_IMAGE 1
		20
		21	#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv
		22	// VULKAN not defined: parameters are standalone uniforms, so BEGIN/END expand to nothing
		23	#extension GL_KHR_shader_subgroup_arithmetic : enable
		24	#extension GL_NV_gpu_shader5 : enable
		25	#ifdef GL_NV_gpu_shader5
		26	#define HAS_EXTENDED_TYPES 1
		27	#else
		28	#define HAS_EXTENDED_TYPES 0
		29	#endif
		30	#define BEGIN_PUSH_CONSTANTS
		31	#define END_PUSH_CONSTANTS
		32	#define UNIFORM(n) layout(location = n) uniform
		33	#define BINDING_INPUT_BUFFER 0
		34	#define BINDING_OUTPUT_IMAGE 0
		35
		36	#endif
		37
		38	BEGIN_PUSH_CONSTANTS
		39	UNIFORM(0) uint max_accumulation_base; // elements with an index below this get accumulated_data added
		40	UNIFORM(1) uint accumulation_limit; // scan-result index used when accumulated_data is rewritten
		41	END_PUSH_CONSTANTS
		42
		43	layout(local_size_x = 32) in;
		44	// Runtime-sized: main() indexes both arrays up to 2 * gl_WorkGroupSize.x - 1, which a length of gl_WorkGroupSize.x would not cover
		45	layout(std430, binding = 0) readonly buffer block1 {
		46	uvec2 input_data[];
		47	};
		48	// Scan results, indexed exactly like input_data
		49	layout(std430, binding = 1) writeonly coherent buffer block2 {
		50	uvec2 output_data[];
		51	};
		52	// Running total: read at the start of main() and rewritten by invocation 0 at the end
		53	layout(std430, binding = 2) coherent buffer block3 {
		54	uvec2 accumulated_data;
		55	};
		56	// Work-group scratch space for the scan, two elements per invocation
		57	shared uvec2 shared_data[gl_WorkGroupSize.x * 2];
		58
		59	uvec2 AddUint64(uvec2 value_1, uvec2 value_2) {
		60	// 64-bit add on uvec2 pairs (.x = low word, .y = high word); a carry out of .y is discarded
		61	uint low_carry;
		62	uint low_sum = uaddCarry(value_1.x, value_2.x, low_carry);
		63	uint high_sum = value_1.y + value_2.y + low_carry;
		64	return uvec2(low_sum, high_sum);
		65	}
		66
		67	void main(void) { // inclusive 64-bit prefix sum over input_data, written to output_data
		68	uint id = gl_LocalInvocationID.x;
		69	uvec2 base_value_1 = (id * 2) < max_accumulation_base ? accumulated_data : uvec2(0);
		70	uvec2 base_value_2 = (id * 2 + 1) < max_accumulation_base ? accumulated_data : uvec2(0);
		71	uint work_size = gl_WorkGroupSize.x;
		72	uint rd_id;
		73	uint wr_id;
		74	uint mask;
		75	uvec2 input_1 = input_data[id * 2];
		76	uvec2 input_2 = input_data[id * 2 + 1];
		77	// The number of steps is floor(log2(work group size)) + 1; the size should be a power of 2.
		78	// findMSB is integer-exact, so no step can be lost to float log2 rounding
		79	const uint steps = uint(findMSB(work_size)) + 1;
		80	uint step = 0;
		81
		82	// Each invocation is responsible for the content of
		83	// two elements of the output array
		84	shared_data[id * 2] = input_1;
		85	shared_data[id * 2 + 1] = input_2;
		86	// Synchronize to make sure that everyone has initialized
		87	// their elements of shared_data[] with data loaded from
		88	// the input arrays
		89	barrier();
		90	memoryBarrierShared();
		91	// For each step...
		92	for (step = 0; step < steps; step++) {
		93	// rd_id: last element of the lower half of this step's 2^(step + 1) chunk;
		94	// wr_id: this invocation's element in the upper half
		95	mask = (1 << step) - 1;
		96	rd_id = ((id >> step) << (step + 1)) + mask;
		97	wr_id = rd_id + 1 + (id & mask);
		98	// Accumulate the read data into our element
		99
		100	shared_data[wr_id] = AddUint64(shared_data[rd_id], shared_data[wr_id]);
		101	// Synchronize again so the next step only reads
		102	// completed partial sums
		103	barrier();
		104	memoryBarrierShared();
		105	}
		106	// Add the accumulation
		107	shared_data[id * 2] = AddUint64(shared_data[id * 2], base_value_1);
		108	shared_data[id * 2 + 1] = AddUint64(shared_data[id * 2 + 1], base_value_2);
		109	barrier();
		110	memoryBarrierShared();
		111
		112	// Finally write our data back to the output buffer
		113	output_data[id * 2] = shared_data[id * 2];
		114	output_data[id * 2 + 1] = shared_data[id * 2 + 1];
		115	if (id == 0) {
		116	if (max_accumulation_base >= accumulation_limit + 1) {
		117	accumulated_data = shared_data[accumulation_limit];
		118	return;
		119	}
		120	uvec2 value_1 = shared_data[max_accumulation_base];
		121	uvec2 value_2 = shared_data[accumulation_limit];
		122	accumulated_data = AddUint64(value_1, AddUint64(~value_2, uvec2(1, 0))); // value_1 - value_2; ~v + 1 is the 64-bit negation, unary minus would negate each word separately
		123	}
		124	} \ No newline at end of file