| author | 2023-08-22 12:28:25 +0200 |
|---|---|
| committer | 2023-09-23 23:05:30 +0200 |
| commit | a07c88e686fb9b65924876d472a8184f1f1849df (patch) |
| tree | 1eb5015652f00ba728217d16a71ecbed67faa24b /src/video_core/host_shaders |
| parent | Query Cache: Implement host side sample counting. (diff) |
| download | yuzu-a07c88e686fb9b65924876d472a8184f1f1849df.tar.gz yuzu-a07c88e686fb9b65924876d472a8184f1f1849df.tar.xz yuzu-a07c88e686fb9b65924876d472a8184f1f1849df.zip |
Query Cache: Simplify Prefix Sum compute shader
Diffstat (limited to 'src/video_core/host_shaders')
3 files changed, 227 insertions, 65 deletions
diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt
index 8218ec4c8..6b912027f 100644
--- a/src/video_core/host_shaders/CMakeLists.txt
+++ b/src/video_core/host_shaders/CMakeLists.txt
| @@ -42,6 +42,7 @@ set(SHADER_FILES | |||
| 42 | present_bicubic.frag | 42 | present_bicubic.frag |
| 43 | present_gaussian.frag | 43 | present_gaussian.frag |
| 44 | queries_prefix_scan_sum.comp | 44 | queries_prefix_scan_sum.comp |
| 45 | queries_prefix_scan_sum_nosubgroups.comp | ||
| 45 | resolve_conditional_render.comp | 46 | resolve_conditional_render.comp |
| 46 | smaa_edge_detection.vert | 47 | smaa_edge_detection.vert |
| 47 | smaa_edge_detection.frag | 48 | smaa_edge_detection.frag |
| @@ -72,6 +73,7 @@ if ("${GLSLANGVALIDATOR}" STREQUAL "GLSLANGVALIDATOR-NOTFOUND") | |||
| 72 | endif() | 73 | endif() |
| 73 | 74 | ||
| 74 | set(GLSL_FLAGS "") | 75 | set(GLSL_FLAGS "") |
| 76 | set(SPIR_V_VERSION "spirv1.3") | ||
| 75 | set(QUIET_FLAG "--quiet") | 77 | set(QUIET_FLAG "--quiet") |
| 76 | 78 | ||
| 77 | set(SHADER_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/include) | 79 | set(SHADER_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/include) |
| @@ -125,7 +127,7 @@ foreach(FILENAME IN ITEMS ${SHADER_FILES}) | |||
| 125 | OUTPUT | 127 | OUTPUT |
| 126 | ${SPIRV_HEADER_FILE} | 128 | ${SPIRV_HEADER_FILE} |
| 127 | COMMAND | 129 | COMMAND |
| 128 | ${GLSLANGVALIDATOR} -V ${QUIET_FLAG} -I"${FIDELITYFX_INCLUDE_DIR}" ${GLSL_FLAGS} --variable-name ${SPIRV_VARIABLE_NAME} -o ${SPIRV_HEADER_FILE} ${SOURCE_FILE} | 130 | ${GLSLANGVALIDATOR} -V ${QUIET_FLAG} -I"${FIDELITYFX_INCLUDE_DIR}" ${GLSL_FLAGS} --variable-name ${SPIRV_VARIABLE_NAME} -o ${SPIRV_HEADER_FILE} ${SOURCE_FILE} --target-env ${SPIR_V_VERSION} |
| 129 | MAIN_DEPENDENCY | 131 | MAIN_DEPENDENCY |
| 130 | ${SOURCE_FILE} | 132 | ${SOURCE_FILE} |
| 131 | ) | 133 | ) |
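The new `--target-env ${SPIR_V_VERSION}` argument exists because the `GL_KHR_shader_subgroup_*` extensions used by the reworked shader require SPIR-V 1.3 or newer when compiling for Vulkan. The sketch below is illustrative only (not part of this commit; the buffer and file names are made up) and shows the kind of shader that needs the flag:

```glsl
// Illustrative sketch only: subgroup built-ins such as subgroupInclusiveAdd need
// SPIR-V 1.3+, so compiling this for Vulkan requires something like:
//     glslangValidator -V --target-env spirv1.3 example.comp
#version 460 core
#extension GL_KHR_shader_subgroup_arithmetic : require

layout(local_size_x = 32) in;

layout(std430, binding = 0) buffer Data {
    uint values[];
};

void main() {
    // Each invocation ends up with the inclusive prefix sum of its subgroup's values.
    values[gl_GlobalInvocationID.x] = subgroupInclusiveAdd(values[gl_GlobalInvocationID.x]);
}
```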
diff --git a/src/video_core/host_shaders/queries_prefix_scan_sum.comp b/src/video_core/host_shaders/queries_prefix_scan_sum.comp
index dce1279fe..8f10e248e 100644
--- a/src/video_core/host_shaders/queries_prefix_scan_sum.comp
+++ b/src/video_core/host_shaders/queries_prefix_scan_sum.comp
| @@ -1,26 +1,24 @@ | |||
| 1 | // SPDX-FileCopyrightText: Copyright 2015 Graham Sellers, Richard Wright Jr. and Nicholas Haemel | 1 | // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project |
| 2 | // SPDX-License-Identifier: MIT | 2 | // SPDX-License-Identifier: GPL-3.0-or-later |
| 3 | |||
| 4 | // Code obtained from OpenGL SuperBible, Seventh Edition by Graham Sellers, Richard Wright Jr. and | ||
| 5 | // Nicholas Haemel. Modified to suit needs and optimize for subgroup | ||
| 6 | 3 | ||
| 7 | #version 460 core | 4 | #version 460 core |
| 8 | 5 | ||
| 6 | #extension GL_KHR_shader_subgroup_basic : require | ||
| 7 | #extension GL_KHR_shader_subgroup_shuffle : require | ||
| 8 | #extension GL_KHR_shader_subgroup_shuffle_relative : require | ||
| 9 | #extension GL_KHR_shader_subgroup_arithmetic : require | ||
| 10 | |||
| 9 | #ifdef VULKAN | 11 | #ifdef VULKAN |
| 10 | 12 | ||
| 11 | #extension GL_KHR_shader_subgroup_arithmetic : enable | ||
| 12 | #define HAS_EXTENDED_TYPES 1 | 13 | #define HAS_EXTENDED_TYPES 1 |
| 13 | #define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants { | 14 | #define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants { |
| 14 | #define END_PUSH_CONSTANTS \ | 15 | #define END_PUSH_CONSTANTS }; |
| 15 | } \ | ||
| 16 | ; | ||
| 17 | #define UNIFORM(n) | 16 | #define UNIFORM(n) |
| 18 | #define BINDING_INPUT_BUFFER 0 | 17 | #define BINDING_INPUT_BUFFER 0 |
| 19 | #define BINDING_OUTPUT_IMAGE 1 | 18 | #define BINDING_OUTPUT_IMAGE 1 |
| 20 | 19 | ||
| 21 | #else // ^^^ Vulkan ^^^ // vvv OpenGL vvv | 20 | #else // ^^^ Vulkan ^^^ // vvv OpenGL vvv |
| 22 | 21 | ||
| 23 | #extension GL_KHR_shader_subgroup_arithmetic : enable | ||
| 24 | #extension GL_NV_gpu_shader5 : enable | 22 | #extension GL_NV_gpu_shader5 : enable |
| 25 | #ifdef GL_NV_gpu_shader5 | 23 | #ifdef GL_NV_gpu_shader5 |
| 26 | #define HAS_EXTENDED_TYPES 1 | 24 | #define HAS_EXTENDED_TYPES 1 |
| @@ -43,19 +41,20 @@ END_PUSH_CONSTANTS | |||
| 43 | layout(local_size_x = 32) in; | 41 | layout(local_size_x = 32) in; |
| 44 | 42 | ||
| 45 | layout(std430, binding = 0) readonly buffer block1 { | 43 | layout(std430, binding = 0) readonly buffer block1 { |
| 46 | uvec2 input_data[gl_WorkGroupSize.x]; | 44 | uvec2 input_data[]; |
| 47 | }; | 45 | }; |
| 48 | 46 | ||
| 49 | layout(std430, binding = 1) writeonly coherent buffer block2 { | 47 | layout(std430, binding = 1) coherent buffer block2 { |
| 50 | uvec2 output_data[gl_WorkGroupSize.x]; | 48 | uvec2 output_data[]; |
| 51 | }; | 49 | }; |
| 52 | 50 | ||
| 53 | layout(std430, binding = 2) coherent buffer block3 { | 51 | layout(std430, binding = 2) coherent buffer block3 { |
| 54 | uvec2 accumulated_data; | 52 | uvec2 accumulated_data; |
| 55 | }; | 53 | }; |
| 56 | 54 | ||
| 57 | shared uvec2 shared_data[gl_WorkGroupSize.x * 2]; | 55 | shared uvec2 shared_data[2]; |
| 58 | 56 | ||
| 57 | // Simple Uint64 add that uses 2 uint variables for GPUs that don't support uint64 | ||
| 59 | uvec2 AddUint64(uvec2 value_1, uvec2 value_2) { | 58 | uvec2 AddUint64(uvec2 value_1, uvec2 value_2) { |
| 60 | uint carry = 0; | 59 | uint carry = 0; |
| 61 | uvec2 result; | 60 | uvec2 result; |
| @@ -64,61 +63,102 @@ uvec2 AddUint64(uvec2 value_1, uvec2 value_2) { | |||
| 64 | return result; | 63 | return result; |
| 65 | } | 64 | } |
| 66 | 65 | ||
| 67 | void main(void) { | 66 | // Perform a subgroup prefix sum using Hillis and Steele's algorithm |
| 68 | uint id = gl_LocalInvocationID.x; | 67 | uvec2 subgroupInclusiveAddUint64(uvec2 value) { |
| 69 | uvec2 base_value_1 = (id * 2) < max_accumulation_base ? accumulated_data : uvec2(0); | 68 | uvec2 result = value; |
| 70 | uvec2 base_value_2 = (id * 2 + 1) < max_accumulation_base ? accumulated_data : uvec2(0); | 69 | for (uint i = 1; i < gl_SubgroupSize; i *= 2) { |
| 71 | uint work_size = gl_WorkGroupSize.x; | 70 | if (i <= gl_SubgroupInvocationID) { |
| 72 | uint rd_id; | 71 | uvec2 other = subgroupShuffleUp(result, i); // get value from subgroup_inv_id - i; |
| 73 | uint wr_id; | 72 | result = AddUint64(result, other); |
| 74 | uint mask; | 73 | } |
| 75 | uvec2 input_1 = input_data[id * 2]; | 74 | } |
| 76 | uvec2 input_2 = input_data[id * 2 + 1]; | 75 | return result; |
| 77 | // The number of steps is the log base 2 of the | 76 | } |
| 78 | // work group size, which should be a power of 2 | 77 | |
| 79 | const uint steps = uint(log2(work_size)) + 1; | 78 | // Write the results to the output buffer and to the accumulation buffer |
| 80 | uint step = 0; | 79 | void WriteResults(uvec2 result) { |
| 81 | 80 | uint current_global_id = gl_GlobalInvocationID.x; | |
| 82 | // Each invocation is responsible for the content of | 81 | uvec2 base_data = current_global_id < max_accumulation_base ? accumulated_data : uvec2(0); |
| 83 | // two elements of the output array | 82 | output_data[current_global_id] = result + base_data; |
| 84 | shared_data[id * 2] = input_1; | 83 | if (max_accumulation_base >= accumulation_limit + 1) { |
| 85 | shared_data[id * 2 + 1] = input_2; | 84 | if (current_global_id == accumulation_limit) { |
| 86 | // Synchronize to make sure that everyone has initialized | 85 | accumulated_data = result; |
| 87 | // their elements of shared_data[] with data loaded from | 86 | } |
| 88 | // the input arrays | 87 | return; |
| 88 | } | ||
| 89 | // Handle the case in which the accumulation data is reset somewhere in the middle. | ||
| 89 | barrier(); | 90 | barrier(); |
| 90 | memoryBarrierShared(); | 91 | groupMemoryBarrier(); |
| 91 | // For each step... | 92 | if (current_global_id == accumulation_limit) { |
| 92 | for (step = 0; step < steps; step++) { | 93 | uvec2 value_1 = output_data[max_accumulation_base]; |
| 93 | // Calculate the read and write index in the | 94 | accumulated_data = AddUint64(result, -value_1); |
| 94 | // shared array | ||
| 95 | mask = (1 << step) - 1; | ||
| 96 | rd_id = ((id >> step) << (step + 1)) + mask; | ||
| 97 | wr_id = rd_id + 1 + (id & mask); | ||
| 98 | // Accumulate the read data into our element | ||
| 99 | |||
| 100 | shared_data[wr_id] = AddUint64(shared_data[rd_id], shared_data[wr_id]); | ||
| 101 | // Synchronize again to make sure that everyone | ||
| 102 | // has caught up with us | ||
| 103 | barrier(); | ||
| 104 | memoryBarrierShared(); | ||
| 105 | } | 95 | } |
| 106 | // Add the accumulation | 96 | } |
| 107 | shared_data[id * 2] = AddUint64(shared_data[id * 2], base_value_1); | 97 | |
| 108 | shared_data[id * 2 + 1] = AddUint64(shared_data[id * 2 + 1], base_value_2); | 98 | void main() { |
| 99 | uint subgroup_inv_id = gl_SubgroupInvocationID; | ||
| 100 | uint subgroup_id = gl_SubgroupID; | ||
| 101 | uint last_subgroup_id = subgroupMax(subgroup_inv_id); | ||
| 102 | uint current_global_id = gl_GlobalInvocationID.x; | ||
| 103 | uint total_work = gl_NumWorkGroups.x * gl_WorkGroupSize.x; | ||
| 104 | uvec2 data = input_data[current_global_id]; | ||
| 105 | // make sure all input data has been loaded | ||
| 106 | subgroupBarrier(); | ||
| 107 | subgroupMemoryBarrier(); | ||
| 108 | |||
| 109 | uvec2 result = subgroupInclusiveAddUint64(data); | ||
| 110 | |||
| 111 | // If we have fewer queries than our subgroup size, just write the results. | ||
| 112 | if (total_work <= gl_SubgroupSize) { // This condition is constant per dispatch. | ||
| 113 | WriteResults(result); | ||
| 114 | return; | ||
| 115 | } | ||
| 116 | |||
| 117 | // We have more work than one subgroup, so let's write the last result into shared memory. | ||
| 118 | // Only the last invocation of each subgroup does the store. | ||
| 119 | if (subgroup_inv_id == last_subgroup_id) { | ||
| 120 | shared_data[subgroup_id] = result; | ||
| 121 | } | ||
| 122 | // Wait until every subgroup has stored its result. | ||
| 109 | barrier(); | 123 | barrier(); |
| 110 | memoryBarrierShared(); | 124 | memoryBarrierShared(); |
| 111 | 125 | ||
| 112 | // Finally write our data back to the output buffer | 126 | // Case 1: the total work for the grouped results can be calculated in a single subgroup |
| 113 | output_data[id * 2] = shared_data[id * 2]; | 127 | // operation (about 1024 queries). |
| 114 | output_data[id * 2 + 1] = shared_data[id * 2 + 1]; | 128 | uint total_extra_work = gl_NumSubgroups * gl_NumWorkGroups.x; |
| 115 | if (id == 0) { | 129 | if (total_extra_work <= gl_SubgroupSize) { // This condition is constant per dispatch. |
| 116 | if (max_accumulation_base >= accumulation_limit + 1) { | 130 | if (subgroup_id != 0) { |
| 117 | accumulated_data = shared_data[accumulation_limit]; | 131 | uvec2 tmp = shared_data[subgroup_inv_id]; |
| 118 | return; | 132 | subgroupBarrier(); |
| 133 | subgroupMemoryBarrierShared(); | ||
| 134 | tmp = subgroupInclusiveAddUint64(tmp); | ||
| 135 | result = AddUint64(result, subgroupShuffle(tmp, subgroup_id - 1)); | ||
| 136 | } | ||
| 137 | |||
| 138 | WriteResults(result); | ||
| 139 | return; | ||
| 140 | } | ||
| 141 | |||
| 142 | // Case 2: the amount of work is large, so let's do it in O(log n) steps. | ||
| 143 | const uint extra = (total_extra_work ^ (total_extra_work - 1)) != 0 ? 1 : 0; | ||
| 144 | const uint steps = 1 << (findMSB(total_extra_work) + extra); | ||
| 145 | uint step; | ||
| 146 | // Hillis and Steele's algorithm | ||
| 147 | for (step = 1; step < steps; step *= 2) { | ||
| 148 | if (current_global_id < steps && current_global_id >= step) { | ||
| 149 | uvec2 current = shared_data[current_global_id]; | ||
| 150 | uvec2 other = shared_data[current_global_id - step]; | ||
| 151 | shared_data[current_global_id] = AddUint64(current, other); | ||
| 119 | } | 152 | } |
| 120 | uvec2 value_1 = shared_data[max_accumulation_base]; | 153 | // steps is constant, so this will always execute in ever workgroup's thread. |
| 121 | uvec2 value_2 = shared_data[accumulation_limit]; | 154 | barrier(); |
| 122 | accumulated_data = AddUint64(value_1, -value_2); | 155 | memoryBarrierShared(); |
| 156 | } | ||
| 157 | // Only add results for subgroups higher than 0 | ||
| 158 | if (subgroup_id != 0) { | ||
| 159 | result = AddUint64(result, shared_data[subgroup_id - 1]); | ||
| 123 | } | 160 | } |
| 161 | |||
| 162 | // Just write the final results. We are done. | ||
| 163 | WriteResults(result); | ||
| 124 | } \ No newline at end of file | 164 | } \ No newline at end of file |
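The rewritten shader scans in two levels: each subgroup computes an inclusive prefix sum with `subgroupShuffleUp` (Hillis and Steele), and the per-subgroup totals are then combined through `shared_data`. Because the query counters are 64-bit values emulated as `uvec2`, the shader cannot simply call `subgroupInclusiveAdd` and instead carries the low word with `uaddCarry` in `AddUint64`. The sketch below is illustrative only (not part of the commit; buffer names are made up) and shows the same shuffle pattern on a plain `uint`:

```glsl
// Illustrative sketch: Hillis-Steele inclusive scan within a subgroup on plain uints.
#version 460 core
#extension GL_KHR_shader_subgroup_basic : require
#extension GL_KHR_shader_subgroup_shuffle_relative : require

layout(local_size_x = 32) in;

layout(std430, binding = 0) buffer Data {
    uint values[];
};

void main() {
    uint value = values[gl_GlobalInvocationID.x];
    for (uint offset = 1; offset < gl_SubgroupSize; offset *= 2) {
        // Fetch the partial sum from the invocation `offset` lanes below us.
        uint other = subgroupShuffleUp(value, offset);
        if (gl_SubgroupInvocationID >= offset) {
            value += other;
        }
    }
    // value now holds the sum of all lanes up to and including this one.
    values[gl_GlobalInvocationID.x] = value;
}
```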
diff --git a/src/video_core/host_shaders/queries_prefix_scan_sum_nosubgroups.comp b/src/video_core/host_shaders/queries_prefix_scan_sum_nosubgroups.comp
new file mode 100644
index 000000000..8021476ed
--- /dev/null
+++ b/src/video_core/host_shaders/queries_prefix_scan_sum_nosubgroups.comp
| @@ -0,0 +1,120 @@ | |||
| 1 | // SPDX-FileCopyrightText: Copyright 2015 Graham Sellers, Richard Wright Jr. and Nicholas Haemel | ||
| 2 | // SPDX-License-Identifier: MIT | ||
| 3 | |||
| 4 | // Code obtained from OpenGL SuperBible, Seventh Edition by Graham Sellers, Richard Wright Jr. and | ||
| 5 | // Nicholas Haemel. Modified to suit needs. | ||
| 6 | |||
| 7 | #version 460 core | ||
| 8 | |||
| 9 | #ifdef VULKAN | ||
| 10 | |||
| 11 | #define HAS_EXTENDED_TYPES 1 | ||
| 12 | #define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants { | ||
| 13 | #define END_PUSH_CONSTANTS }; | ||
| 14 | #define UNIFORM(n) | ||
| 15 | #define BINDING_INPUT_BUFFER 0 | ||
| 16 | #define BINDING_OUTPUT_IMAGE 1 | ||
| 17 | |||
| 18 | #else // ^^^ Vulkan ^^^ // vvv OpenGL vvv | ||
| 19 | |||
| 20 | #extension GL_NV_gpu_shader5 : enable | ||
| 21 | #ifdef GL_NV_gpu_shader5 | ||
| 22 | #define HAS_EXTENDED_TYPES 1 | ||
| 23 | #else | ||
| 24 | #define HAS_EXTENDED_TYPES 0 | ||
| 25 | #endif | ||
| 26 | #define BEGIN_PUSH_CONSTANTS | ||
| 27 | #define END_PUSH_CONSTANTS | ||
| 28 | #define UNIFORM(n) layout(location = n) uniform | ||
| 29 | #define BINDING_INPUT_BUFFER 0 | ||
| 30 | #define BINDING_OUTPUT_IMAGE 0 | ||
| 31 | |||
| 32 | #endif | ||
| 33 | |||
| 34 | BEGIN_PUSH_CONSTANTS | ||
| 35 | UNIFORM(0) uint max_accumulation_base; | ||
| 36 | UNIFORM(1) uint accumulation_limit; | ||
| 37 | END_PUSH_CONSTANTS | ||
| 38 | |||
| 39 | layout(local_size_x = 32) in; | ||
| 40 | |||
| 41 | layout(std430, binding = 0) readonly buffer block1 { | ||
| 42 | uvec2 input_data[gl_WorkGroupSize.x]; | ||
| 43 | }; | ||
| 44 | |||
| 45 | layout(std430, binding = 1) writeonly coherent buffer block2 { | ||
| 46 | uvec2 output_data[gl_WorkGroupSize.x]; | ||
| 47 | }; | ||
| 48 | |||
| 49 | layout(std430, binding = 2) coherent buffer block3 { | ||
| 50 | uvec2 accumulated_data; | ||
| 51 | }; | ||
| 52 | |||
| 53 | shared uvec2 shared_data[gl_WorkGroupSize.x * 2]; | ||
| 54 | |||
| 55 | uvec2 AddUint64(uvec2 value_1, uvec2 value_2) { | ||
| 56 | uint carry = 0; | ||
| 57 | uvec2 result; | ||
| 58 | result.x = uaddCarry(value_1.x, value_2.x, carry); | ||
| 59 | result.y = value_1.y + value_2.y + carry; | ||
| 60 | return result; | ||
| 61 | } | ||
| 62 | |||
| 63 | void main(void) { | ||
| 64 | uint id = gl_LocalInvocationID.x; | ||
| 65 | uvec2 base_value_1 = (id * 2) < max_accumulation_base ? accumulated_data : uvec2(0); | ||
| 66 | uvec2 base_value_2 = (id * 2 + 1) < max_accumulation_base ? accumulated_data : uvec2(0); | ||
| 67 | uint work_size = gl_WorkGroupSize.x; | ||
| 68 | uint rd_id; | ||
| 69 | uint wr_id; | ||
| 70 | uint mask; | ||
| 71 | uvec2 input_1 = input_data[id * 2]; | ||
| 72 | uvec2 input_2 = input_data[id * 2 + 1]; | ||
| 73 | // The number of steps is the log base 2 of the | ||
| 74 | // work group size, which should be a power of 2 | ||
| 75 | const uint steps = uint(log2(work_size)) + 1; | ||
| 76 | uint step = 0; | ||
| 77 | |||
| 78 | // Each invocation is responsible for the content of | ||
| 79 | // two elements of the output array | ||
| 80 | shared_data[id * 2] = input_1; | ||
| 81 | shared_data[id * 2 + 1] = input_2; | ||
| 82 | // Synchronize to make sure that everyone has initialized | ||
| 83 | // their elements of shared_data[] with data loaded from | ||
| 84 | // the input arrays | ||
| 85 | barrier(); | ||
| 86 | memoryBarrierShared(); | ||
| 87 | // For each step... | ||
| 88 | for (step = 0; step < steps; step++) { | ||
| 89 | // Calculate the read and write index in the | ||
| 90 | // shared array | ||
| 91 | mask = (1 << step) - 1; | ||
| 92 | rd_id = ((id >> step) << (step + 1)) + mask; | ||
| 93 | wr_id = rd_id + 1 + (id & mask); | ||
| 94 | // Accumulate the read data into our element | ||
| 95 | |||
| 96 | shared_data[wr_id] = AddUint64(shared_data[rd_id], shared_data[wr_id]); | ||
| 97 | // Synchronize again to make sure that everyone | ||
| 98 | // has caught up with us | ||
| 99 | barrier(); | ||
| 100 | memoryBarrierShared(); | ||
| 101 | } | ||
| 102 | // Add the accumulation | ||
| 103 | shared_data[id * 2] = AddUint64(shared_data[id * 2], base_value_1); | ||
| 104 | shared_data[id * 2 + 1] = AddUint64(shared_data[id * 2 + 1], base_value_2); | ||
| 105 | barrier(); | ||
| 106 | memoryBarrierShared(); | ||
| 107 | |||
| 108 | // Finally write our data back to the output buffer | ||
| 109 | output_data[id * 2] = shared_data[id * 2]; | ||
| 110 | output_data[id * 2 + 1] = shared_data[id * 2 + 1]; | ||
| 111 | if (id == 0) { | ||
| 112 | if (max_accumulation_base >= accumulation_limit + 1) { | ||
| 113 | accumulated_data = shared_data[accumulation_limit]; | ||
| 114 | return; | ||
| 115 | } | ||
| 116 | uvec2 value_1 = shared_data[max_accumulation_base]; | ||
| 117 | uvec2 value_2 = shared_data[accumulation_limit]; | ||
| 118 | accumulated_data = AddUint64(value_1, -value_2); | ||
| 119 | } | ||
| 120 | } \ No newline at end of file | ||
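The fallback shader keeps the OpenGL SuperBible workgroup scan for drivers without usable subgroup support: each invocation owns two elements of shared memory and the loop folds blocks of doubling size together. The `rd_id`/`wr_id` arithmetic is easiest to see on a tiny workgroup; the sketch below is illustrative only (not part of this commit, plain `uint`, local size 4, made-up buffer names) and traces the three passes:

```glsl
// Illustrative sketch only: the same SuperBible indexing with local_size_x = 4,
// i.e. 8 elements a0..a7 in shared memory.
//
// step 0 (mask 0): id adds scratch[2*id]   into scratch[2*id + 1]        -> pair sums
// step 1 (mask 1): ids 0,1 add scratch[1]  into scratch[2], scratch[3]
//                  ids 2,3 add scratch[5]  into scratch[6], scratch[7]
// step 2 (mask 3): ids 0..3 add scratch[3] into scratch[4]..scratch[7]   -> full inclusive scan
#version 460 core

layout(local_size_x = 4) in;

layout(std430, binding = 0) readonly buffer InBuf { uint in_data[]; };
layout(std430, binding = 1) writeonly buffer OutBuf { uint out_data[]; };

shared uint scratch[gl_WorkGroupSize.x * 2];

void main() {
    uint id = gl_LocalInvocationID.x;
    scratch[id * 2] = in_data[id * 2];
    scratch[id * 2 + 1] = in_data[id * 2 + 1];
    barrier();
    memoryBarrierShared();
    const uint steps = uint(log2(gl_WorkGroupSize.x)) + 1; // 3 passes for 8 elements
    for (uint step = 0; step < steps; step++) {
        uint mask = (1 << step) - 1;
        uint rd_id = ((id >> step) << (step + 1)) + mask; // last element of the finished block
        uint wr_id = rd_id + 1 + (id & mask);             // element that absorbs it
        scratch[wr_id] += scratch[rd_id];
        barrier();
        memoryBarrierShared();
    }
    out_data[id * 2] = scratch[id * 2];
    out_data[id * 2 + 1] = scratch[id * 2 + 1];
}
```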