path: root/src/video_core/host_shaders
author     Fernando Sahmkow    2023-08-22 12:28:25 +0200
committer  Fernando Sahmkow    2023-09-23 23:05:30 +0200
commit     a07c88e686fb9b65924876d472a8184f1f1849df (patch)
tree       1eb5015652f00ba728217d16a71ecbed67faa24b /src/video_core/host_shaders
parent     Query Cache: Implement host side sample counting. (diff)
Query Cache: Simplify Prefix Sum compute shader
Diffstat (limited to 'src/video_core/host_shaders')
-rw-r--r--  src/video_core/host_shaders/CMakeLists.txt                           |   4
-rw-r--r--  src/video_core/host_shaders/queries_prefix_scan_sum.comp             | 168
-rw-r--r--  src/video_core/host_shaders/queries_prefix_scan_sum_nosubgroups.comp | 120
3 files changed, 227 insertions, 65 deletions
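
The change replaces the workgroup shared-memory scan with a subgroup-based inclusive prefix sum over 64-bit query counters, carrying a running total forward through `accumulated_data`. As a rough CPU reference for what the pass is expected to produce (a sketch only; the function and variable names below are illustrative and not code from yuzu):

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Reference semantics of the prefix-scan pass, sketched on the CPU.
// Entries whose index is below max_accumulation_base also receive the
// previously accumulated total, mirroring the shader's base_data logic.
std::vector<uint64_t> PrefixScanQueries(const std::vector<uint64_t>& counts,
                                        uint64_t accumulated_total,
                                        std::size_t max_accumulation_base) {
    std::vector<uint64_t> out(counts.size());
    uint64_t running = 0;
    for (std::size_t i = 0; i < counts.size(); ++i) {
        running += counts[i]; // inclusive scan of the query counters
        const uint64_t base = i < max_accumulation_base ? accumulated_total : 0;
        out[i] = running + base;
    }
    return out;
}
```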
diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt
index 8218ec4c8..6b912027f 100644
--- a/src/video_core/host_shaders/CMakeLists.txt
+++ b/src/video_core/host_shaders/CMakeLists.txt
@@ -42,6 +42,7 @@ set(SHADER_FILES
     present_bicubic.frag
     present_gaussian.frag
     queries_prefix_scan_sum.comp
+    queries_prefix_scan_sum_nosubgroups.comp
     resolve_conditional_render.comp
     smaa_edge_detection.vert
     smaa_edge_detection.frag
@@ -72,6 +73,7 @@ if ("${GLSLANGVALIDATOR}" STREQUAL "GLSLANGVALIDATOR-NOTFOUND")
 endif()
 
 set(GLSL_FLAGS "")
+set(SPIR_V_VERSION "spirv1.3")
 set(QUIET_FLAG "--quiet")
 
 set(SHADER_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/include)
@@ -125,7 +127,7 @@ foreach(FILENAME IN ITEMS ${SHADER_FILES})
         OUTPUT
             ${SPIRV_HEADER_FILE}
         COMMAND
-            ${GLSLANGVALIDATOR} -V ${QUIET_FLAG} -I"${FIDELITYFX_INCLUDE_DIR}" ${GLSL_FLAGS} --variable-name ${SPIRV_VARIABLE_NAME} -o ${SPIRV_HEADER_FILE} ${SOURCE_FILE}
+            ${GLSLANGVALIDATOR} -V ${QUIET_FLAG} -I"${FIDELITYFX_INCLUDE_DIR}" ${GLSL_FLAGS} --variable-name ${SPIRV_VARIABLE_NAME} -o ${SPIRV_HEADER_FILE} ${SOURCE_FILE} --target-env ${SPIR_V_VERSION}
         MAIN_DEPENDENCY
             ${SOURCE_FILE}
     )
diff --git a/src/video_core/host_shaders/queries_prefix_scan_sum.comp b/src/video_core/host_shaders/queries_prefix_scan_sum.comp
index dce1279fe..8f10e248e 100644
--- a/src/video_core/host_shaders/queries_prefix_scan_sum.comp
+++ b/src/video_core/host_shaders/queries_prefix_scan_sum.comp
@@ -1,26 +1,24 @@
-// SPDX-FileCopyrightText: Copyright 2015 Graham Sellers, Richard Wright Jr. and Nicholas Haemel
-// SPDX-License-Identifier: MIT
-
-// Code obtained from OpenGL SuperBible, Seventh Edition by Graham Sellers, Richard Wright Jr. and
-// Nicholas Haemel. Modified to suit needs and optimize for subgroup
+// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
 
 #version 460 core
 
+#extension GL_KHR_shader_subgroup_basic : require
+#extension GL_KHR_shader_subgroup_shuffle : require
+#extension GL_KHR_shader_subgroup_shuffle_relative : require
+#extension GL_KHR_shader_subgroup_arithmetic : require
+
 #ifdef VULKAN
 
-#extension GL_KHR_shader_subgroup_arithmetic : enable
 #define HAS_EXTENDED_TYPES 1
 #define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
-#define END_PUSH_CONSTANTS \
-    } \
-    ;
+#define END_PUSH_CONSTANTS };
 #define UNIFORM(n)
 #define BINDING_INPUT_BUFFER 0
 #define BINDING_OUTPUT_IMAGE 1
 
 #else // ^^^ Vulkan ^^^ // vvv OpenGL vvv
 
-#extension GL_KHR_shader_subgroup_arithmetic : enable
 #extension GL_NV_gpu_shader5 : enable
 #ifdef GL_NV_gpu_shader5
 #define HAS_EXTENDED_TYPES 1
@@ -43,19 +41,20 @@ END_PUSH_CONSTANTS
 layout(local_size_x = 32) in;
 
 layout(std430, binding = 0) readonly buffer block1 {
-    uvec2 input_data[gl_WorkGroupSize.x];
+    uvec2 input_data[];
 };
 
-layout(std430, binding = 1) writeonly coherent buffer block2 {
-    uvec2 output_data[gl_WorkGroupSize.x];
+layout(std430, binding = 1) coherent buffer block2 {
+    uvec2 output_data[];
 };
 
 layout(std430, binding = 2) coherent buffer block3 {
     uvec2 accumulated_data;
 };
 
-shared uvec2 shared_data[gl_WorkGroupSize.x * 2];
+shared uvec2 shared_data[2];
 
+// Simple Uint64 add that uses 2 uint variables for GPUs that don't support uint64
 uvec2 AddUint64(uvec2 value_1, uvec2 value_2) {
     uint carry = 0;
     uvec2 result;
@@ -64,61 +63,102 @@ uvec2 AddUint64(uvec2 value_1, uvec2 value_2) {
     return result;
 }
 
-void main(void) {
-    uint id = gl_LocalInvocationID.x;
-    uvec2 base_value_1 = (id * 2) < max_accumulation_base ? accumulated_data : uvec2(0);
-    uvec2 base_value_2 = (id * 2 + 1) < max_accumulation_base ? accumulated_data : uvec2(0);
-    uint work_size = gl_WorkGroupSize.x;
-    uint rd_id;
-    uint wr_id;
-    uint mask;
-    uvec2 input_1 = input_data[id * 2];
-    uvec2 input_2 = input_data[id * 2 + 1];
-    // The number of steps is the log base 2 of the
-    // work group size, which should be a power of 2
-    const uint steps = uint(log2(work_size)) + 1;
-    uint step = 0;
-
-    // Each invocation is responsible for the content of
-    // two elements of the output array
-    shared_data[id * 2] = input_1;
-    shared_data[id * 2 + 1] = input_2;
-    // Synchronize to make sure that everyone has initialized
-    // their elements of shared_data[] with data loaded from
-    // the input arrays
+// do subgroup Prefix Sum using Hillis and Steele's algorithm
+uvec2 subgroupInclusiveAddUint64(uvec2 value) {
+    uvec2 result = value;
+    for (uint i = 1; i < gl_SubgroupSize; i *= 2) {
+        if (i <= gl_SubgroupInvocationID) {
+            uvec2 other = subgroupShuffleUp(result, i); // get value from subgroup_inv_id - i;
+            result = AddUint64(result, other);
+        }
+    }
+    return result;
+}
+
+// Writes down the results to the output buffer and to the accumulation buffer
+void WriteResults(uvec2 result) {
+    uint current_global_id = gl_GlobalInvocationID.x;
+    uvec2 base_data = current_global_id < max_accumulation_base ? accumulated_data : uvec2(0);
+    output_data[current_global_id] = result + base_data;
+    if (max_accumulation_base >= accumulation_limit + 1) {
+        if (current_global_id == accumulation_limit) {
+            accumulated_data = result;
+        }
+        return;
+    }
+    // We have the ugly case in which the accumulation data is reset somewhere in the middle.
     barrier();
-    memoryBarrierShared();
-    // For each step...
-    for (step = 0; step < steps; step++) {
-        // Calculate the read and write index in the
-        // shared array
-        mask = (1 << step) - 1;
-        rd_id = ((id >> step) << (step + 1)) + mask;
-        wr_id = rd_id + 1 + (id & mask);
-        // Accumulate the read data into our element
-
-        shared_data[wr_id] = AddUint64(shared_data[rd_id], shared_data[wr_id]);
-        // Synchronize again to make sure that everyone
-        // has caught up with us
-        barrier();
-        memoryBarrierShared();
+    groupMemoryBarrier();
+    if (current_global_id == accumulation_limit) {
+        uvec2 value_1 = output_data[max_accumulation_base];
+        accumulated_data = AddUint64(result, -value_1);
     }
-    // Add the accumulation
-    shared_data[id * 2] = AddUint64(shared_data[id * 2], base_value_1);
-    shared_data[id * 2 + 1] = AddUint64(shared_data[id * 2 + 1], base_value_2);
+}
+
+void main() {
+    uint subgroup_inv_id = gl_SubgroupInvocationID;
+    uint subgroup_id = gl_SubgroupID;
+    uint last_subgroup_id = subgroupMax(subgroup_inv_id);
+    uint current_global_id = gl_GlobalInvocationID.x;
+    uint total_work = gl_NumWorkGroups.x * gl_WorkGroupSize.x;
+    uvec2 data = input_data[current_global_id];
+    // make sure all input data has been loaded
+    subgroupBarrier();
+    subgroupMemoryBarrier();
+
+    uvec2 result = subgroupInclusiveAddUint64(data);
+
+    // if we have fewer queries than our subgroup, just write down the results.
+    if (total_work <= gl_SubgroupSize) { // This condition is constant per dispatch.
+        WriteResults(result);
+        return;
+    }
+
+    // We now have more, so let's write the last result into shared memory.
+    // Only pick the last subgroup.
+    if (subgroup_inv_id == last_subgroup_id) {
+        shared_data[subgroup_id] = result;
+    }
+    // wait until everyone has loaded their data
     barrier();
     memoryBarrierShared();
 
-    // Finally write our data back to the output buffer
-    output_data[id * 2] = shared_data[id * 2];
-    output_data[id * 2 + 1] = shared_data[id * 2 + 1];
-    if (id == 0) {
-        if (max_accumulation_base >= accumulation_limit + 1) {
-            accumulated_data = shared_data[accumulation_limit];
-            return;
+    // Case 1: the total work for the grouped results can be calculated in a single subgroup
+    // operation (about 1024 queries).
+    uint total_extra_work = gl_NumSubgroups * gl_NumWorkGroups.x;
+    if (total_extra_work <= gl_SubgroupSize) { // This condition is constant per dispatch.
+        if (subgroup_id != 0) {
+            uvec2 tmp = shared_data[subgroup_inv_id];
+            subgroupBarrier();
+            subgroupMemoryBarrierShared();
+            tmp = subgroupInclusiveAddUint64(tmp);
+            result = AddUint64(result, subgroupShuffle(tmp, subgroup_id - 1));
+        }
+
+        WriteResults(result);
+        return;
+    }
+
+    // Case 2: our work amount is huge, so let's do it in O(log n) steps.
+    const uint extra = (total_extra_work ^ (total_extra_work - 1)) != 0 ? 1 : 0;
+    const uint steps = 1 << (findMSB(total_extra_work) + extra);
+    uint step;
+    // Hillis and Steele's algorithm
+    for (step = 1; step < steps; step *= 2) {
+        if (current_global_id < steps && current_global_id >= step) {
+            uvec2 current = shared_data[current_global_id];
+            uvec2 other = shared_data[current_global_id - step];
+            shared_data[current_global_id] = AddUint64(current, other);
         }
-        uvec2 value_1 = shared_data[max_accumulation_base];
-        uvec2 value_2 = shared_data[accumulation_limit];
-        accumulated_data = AddUint64(value_1, -value_2);
+        // steps is constant, so this will always execute in every workgroup's thread.
+        barrier();
+        memoryBarrierShared();
+    }
+    // Only add results for groups higher than 0
+    if (subgroup_id != 0) {
+        result = AddUint64(result, shared_data[subgroup_id - 1]);
     }
+
+    // Just write the final results. We are done
+    WriteResults(result);
 }
\ No newline at end of file
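
The rewritten shader builds the scan in two levels: subgroupInclusiveAddUint64 performs a Hillis-Steele inclusive scan within each subgroup using subgroupShuffleUp, and the per-subgroup totals stored in shared_data are then scanned again, either in a single subgroup operation for small dispatches (Case 1) or in the O(log n) loop for large ones (Case 2). A minimal CPU sketch of that inner Hillis-Steele step, assuming a fixed 32-wide "subgroup" (illustrative only; on the GPU all lanes of the subgroup run this loop in lockstep):

```cpp
#include <array>
#include <cstddef>
#include <cstdint>

// Hillis-Steele inclusive scan over one emulated 32-lane subgroup.
// Lane i adds the pre-iteration value of lane i - step, doubling the
// reach each iteration, which mirrors the subgroupShuffleUp() usage above.
constexpr std::size_t kLanes = 32; // assumption; the shader uses gl_SubgroupSize

std::array<uint64_t, kLanes> InclusiveScan(std::array<uint64_t, kLanes> lanes) {
    for (std::size_t step = 1; step < kLanes; step *= 2) {
        const auto snapshot = lanes; // values as they were before this iteration
        for (std::size_t lane = step; lane < kLanes; ++lane) {
            lanes[lane] = snapshot[lane] + snapshot[lane - step];
        }
    }
    return lanes;
}
```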
diff --git a/src/video_core/host_shaders/queries_prefix_scan_sum_nosubgroups.comp b/src/video_core/host_shaders/queries_prefix_scan_sum_nosubgroups.comp
new file mode 100644
index 000000000..8021476ed
--- /dev/null
+++ b/src/video_core/host_shaders/queries_prefix_scan_sum_nosubgroups.comp
@@ -0,0 +1,120 @@
+// SPDX-FileCopyrightText: Copyright 2015 Graham Sellers, Richard Wright Jr. and Nicholas Haemel
+// SPDX-License-Identifier: MIT
+
+// Code obtained from OpenGL SuperBible, Seventh Edition by Graham Sellers, Richard Wright Jr. and
+// Nicholas Haemel. Modified to suit needs.
+
+#version 460 core
+
+#ifdef VULKAN
+
+#define HAS_EXTENDED_TYPES 1
+#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
+#define END_PUSH_CONSTANTS };
+#define UNIFORM(n)
+#define BINDING_INPUT_BUFFER 0
+#define BINDING_OUTPUT_IMAGE 1
+
+#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv
+
+#extension GL_NV_gpu_shader5 : enable
+#ifdef GL_NV_gpu_shader5
+#define HAS_EXTENDED_TYPES 1
+#else
+#define HAS_EXTENDED_TYPES 0
+#endif
+#define BEGIN_PUSH_CONSTANTS
+#define END_PUSH_CONSTANTS
+#define UNIFORM(n) layout(location = n) uniform
+#define BINDING_INPUT_BUFFER 0
+#define BINDING_OUTPUT_IMAGE 0
+
+#endif
+
+BEGIN_PUSH_CONSTANTS
+UNIFORM(0) uint max_accumulation_base;
+UNIFORM(1) uint accumulation_limit;
+END_PUSH_CONSTANTS
+
+layout(local_size_x = 32) in;
+
+layout(std430, binding = 0) readonly buffer block1 {
+    uvec2 input_data[gl_WorkGroupSize.x];
+};
+
+layout(std430, binding = 1) writeonly coherent buffer block2 {
+    uvec2 output_data[gl_WorkGroupSize.x];
+};
+
+layout(std430, binding = 2) coherent buffer block3 {
+    uvec2 accumulated_data;
+};
+
+shared uvec2 shared_data[gl_WorkGroupSize.x * 2];
+
+uvec2 AddUint64(uvec2 value_1, uvec2 value_2) {
+    uint carry = 0;
+    uvec2 result;
+    result.x = uaddCarry(value_1.x, value_2.x, carry);
+    result.y = value_1.y + value_2.y + carry;
+    return result;
+}
+
+void main(void) {
+    uint id = gl_LocalInvocationID.x;
+    uvec2 base_value_1 = (id * 2) < max_accumulation_base ? accumulated_data : uvec2(0);
+    uvec2 base_value_2 = (id * 2 + 1) < max_accumulation_base ? accumulated_data : uvec2(0);
+    uint work_size = gl_WorkGroupSize.x;
+    uint rd_id;
+    uint wr_id;
+    uint mask;
+    uvec2 input_1 = input_data[id * 2];
+    uvec2 input_2 = input_data[id * 2 + 1];
+    // The number of steps is the log base 2 of the
+    // work group size, which should be a power of 2
+    const uint steps = uint(log2(work_size)) + 1;
+    uint step = 0;
+
+    // Each invocation is responsible for the content of
+    // two elements of the output array
+    shared_data[id * 2] = input_1;
+    shared_data[id * 2 + 1] = input_2;
+    // Synchronize to make sure that everyone has initialized
+    // their elements of shared_data[] with data loaded from
+    // the input arrays
+    barrier();
+    memoryBarrierShared();
+    // For each step...
+    for (step = 0; step < steps; step++) {
+        // Calculate the read and write index in the
+        // shared array
+        mask = (1 << step) - 1;
+        rd_id = ((id >> step) << (step + 1)) + mask;
+        wr_id = rd_id + 1 + (id & mask);
+        // Accumulate the read data into our element
+
+        shared_data[wr_id] = AddUint64(shared_data[rd_id], shared_data[wr_id]);
+        // Synchronize again to make sure that everyone
+        // has caught up with us
+        barrier();
+        memoryBarrierShared();
+    }
+    // Add the accumulation
+    shared_data[id * 2] = AddUint64(shared_data[id * 2], base_value_1);
+    shared_data[id * 2 + 1] = AddUint64(shared_data[id * 2 + 1], base_value_2);
+    barrier();
+    memoryBarrierShared();
+
+    // Finally write our data back to the output buffer
+    output_data[id * 2] = shared_data[id * 2];
+    output_data[id * 2 + 1] = shared_data[id * 2 + 1];
+    if (id == 0) {
+        if (max_accumulation_base >= accumulation_limit + 1) {
+            accumulated_data = shared_data[accumulation_limit];
+            return;
+        }
+        uvec2 value_1 = shared_data[max_accumulation_base];
+        uvec2 value_2 = shared_data[accumulation_limit];
+        accumulated_data = AddUint64(value_1, -value_2);
+    }
+}
\ No newline at end of file
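
Both shader variants emulate 64-bit addition with a uvec2 holding the low and high words, because not every target GPU exposes native 64-bit integers; AddUint64 uses uaddCarry to propagate the carry instead. A hedged C++ equivalent of that carry trick, using an illustrative pair layout of {low, high}:

```cpp
#include <cstdint>
#include <utility>

// CPU equivalent of the shader's AddUint64: add the low words, extract the
// carry out of the 32-bit sum (what uaddCarry reports), then fold it into
// the high-word sum. The pair layout {low, high} matches the uvec2 usage.
std::pair<uint32_t, uint32_t> AddUint64(std::pair<uint32_t, uint32_t> a,
                                        std::pair<uint32_t, uint32_t> b) {
    const uint64_t low_sum = static_cast<uint64_t>(a.first) + b.first;
    const uint32_t carry = static_cast<uint32_t>(low_sum >> 32);
    return {static_cast<uint32_t>(low_sum), a.second + b.second + carry};
}
```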