diff options
| author | 2023-08-20 17:53:08 +0200 | |
|---|---|---|
| committer | 2023-09-23 23:05:30 +0200 | |
| commit | c8237d5c312485394389b2520451ef720604ea9a (patch) | |
| tree | 1a1064ed38a7a53bd61e4c04bf4571cdebfce2ec /src/video_core/host_shaders | |
| parent | Query Cache: Fix guest side sample counting (diff) | |
| download | yuzu-c8237d5c312485394389b2520451ef720604ea9a.tar.gz yuzu-c8237d5c312485394389b2520451ef720604ea9a.tar.xz yuzu-c8237d5c312485394389b2520451ef720604ea9a.zip | |
Query Cache: Implement host side sample counting.
Diffstat (limited to 'src/video_core/host_shaders')
| -rw-r--r-- | src/video_core/host_shaders/CMakeLists.txt | 1 | ||||
| -rw-r--r-- | src/video_core/host_shaders/queries_prefix_scan_sum.comp | 124 |
2 files changed, 125 insertions, 0 deletions
diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt index fb24b6532..8218ec4c8 100644 --- a/src/video_core/host_shaders/CMakeLists.txt +++ b/src/video_core/host_shaders/CMakeLists.txt | |||
| @@ -41,6 +41,7 @@ set(SHADER_FILES | |||
| 41 | pitch_unswizzle.comp | 41 | pitch_unswizzle.comp |
| 42 | present_bicubic.frag | 42 | present_bicubic.frag |
| 43 | present_gaussian.frag | 43 | present_gaussian.frag |
| 44 | queries_prefix_scan_sum.comp | ||
| 44 | resolve_conditional_render.comp | 45 | resolve_conditional_render.comp |
| 45 | smaa_edge_detection.vert | 46 | smaa_edge_detection.vert |
| 46 | smaa_edge_detection.frag | 47 | smaa_edge_detection.frag |
diff --git a/src/video_core/host_shaders/queries_prefix_scan_sum.comp b/src/video_core/host_shaders/queries_prefix_scan_sum.comp new file mode 100644 index 000000000..dce1279fe --- /dev/null +++ b/src/video_core/host_shaders/queries_prefix_scan_sum.comp | |||
| @@ -0,0 +1,124 @@ | |||
| 1 | // SPDX-FileCopyrightText: Copyright 2015 Graham Sellers, Richard Wright Jr. and Nicholas Haemel | ||
| 2 | // SPDX-License-Identifier: MIT | ||
| 3 | |||
| 4 | // Code obtained from OpenGL SuperBible, Seventh Edition by Graham Sellers, Richard Wright Jr. and | ||
| 5 | // Nicholas Haemel. Modified to suit needs and optimize for subgroup | ||
| 6 | |||
| 7 | #version 460 core | ||
| 8 | |||
| 9 | #ifdef VULKAN | ||
| 10 | // Vulkan: BEGIN/END_PUSH_CONSTANTS wrap the uniforms in a push_constant block and UNIFORM(n) expands to nothing | ||
| 11 | #extension GL_KHR_shader_subgroup_arithmetic : enable | ||
| 12 | #define HAS_EXTENDED_TYPES 1 | ||
| 13 | #define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants { | ||
| 14 | #define END_PUSH_CONSTANTS \ | ||
| 15 | } \ | ||
| 16 | ; | ||
| 17 | #define UNIFORM(n) | ||
| 18 | #define BINDING_INPUT_BUFFER 0 | ||
| 19 | #define BINDING_OUTPUT_IMAGE 1 | ||
| 20 | |||
| 21 | #else // ^^^ Vulkan ^^^ // vvv OpenGL vvv | ||
| 22 | // OpenGL: BEGIN/END_PUSH_CONSTANTS are empty, UNIFORM(n) declares a uniform at location n, HAS_EXTENDED_TYPES follows GL_NV_gpu_shader5 | ||
| 23 | #extension GL_KHR_shader_subgroup_arithmetic : enable | ||
| 24 | #extension GL_NV_gpu_shader5 : enable | ||
| 25 | #ifdef GL_NV_gpu_shader5 | ||
| 26 | #define HAS_EXTENDED_TYPES 1 | ||
| 27 | #else | ||
| 28 | #define HAS_EXTENDED_TYPES 0 | ||
| 29 | #endif | ||
| 30 | #define BEGIN_PUSH_CONSTANTS | ||
| 31 | #define END_PUSH_CONSTANTS | ||
| 32 | #define UNIFORM(n) layout(location = n) uniform | ||
| 33 | #define BINDING_INPUT_BUFFER 0 | ||
| 34 | #define BINDING_OUTPUT_IMAGE 0 | ||
| 35 | |||
| 36 | #endif | ||
| 37 | |||
| 38 | BEGIN_PUSH_CONSTANTS | ||
| 39 | UNIFORM(0) uint max_accumulation_base; // results below this index also receive accumulated_data | ||
| 40 | UNIFORM(1) uint accumulation_limit; // index of the result used to update accumulated_data | ||
| 41 | END_PUSH_CONSTANTS | ||
| 42 | |||
| 43 | layout(local_size_x = 32) in; | ||
| 44 | |||
| 45 | layout(std430, binding = 0) readonly buffer block1 { | ||
| 46 | uvec2 input_data[]; // unsized: main() reads indices up to 2 * gl_WorkGroupSize.x - 1 | ||
| 47 | }; | ||
| 48 | |||
| 49 | layout(std430, binding = 1) writeonly coherent buffer block2 { | ||
| 50 | uvec2 output_data[]; // unsized: main() writes indices up to 2 * gl_WorkGroupSize.x - 1 | ||
| 51 | }; | ||
| 52 | |||
| 53 | layout(std430, binding = 2) coherent buffer block3 { | ||
| 54 | uvec2 accumulated_data; // 64-bit value as (low, high) words, read and rewritten by main() | ||
| 55 | }; | ||
| 56 | |||
| 57 | shared uvec2 shared_data[gl_WorkGroupSize.x * 2]; // scan scratch space, two entries per invocation | ||
| 58 | |||
| 59 | uvec2 AddUint64(uvec2 value_1, uvec2 value_2) { | ||
| 60 | // Adds two 64-bit integers stored as uvec2: .x is the low word, .y the high word. | ||
| 61 | uint overflow = 0; | ||
| 62 | const uint low_word = uaddCarry(value_1.x, value_2.x, overflow); | ||
| 63 | const uint high_word = value_1.y + value_2.y + overflow; | ||
| 64 | return uvec2(low_word, high_word); | ||
| 65 | } | ||
| 66 | |||
| 67 | void main(void) { | ||
| 68 | // Inclusive prefix sum over 2 * gl_WorkGroupSize.x 64-bit values stored as (low, high) uvec2 pairs. | ||
| 69 | // Each invocation owns the two elements id * 2 and id * 2 + 1. | ||
| 70 | const uint id = gl_LocalInvocationID.x; | ||
| 71 | const uint first = id * 2u; | ||
| 72 | const uint second = first + 1u; | ||
| 73 | // Results below max_accumulation_base additionally receive the value currently in accumulated_data. | ||
| 74 | const uvec2 base_value_1 = first < max_accumulation_base ? accumulated_data : uvec2(0); | ||
| 75 | const uvec2 base_value_2 = second < max_accumulation_base ? accumulated_data : uvec2(0); | ||
| 76 | // log2(2 * gl_WorkGroupSize.x) passes are needed. findMSB is exact, whereas truncating a | ||
| 77 | // floating-point log2() loses a pass if the result comes out just below an integer. | ||
| 78 | const uint steps = uint(findMSB(gl_WorkGroupSize.x)) + 1u; | ||
| 79 | |||
| 80 | // Stage this invocation's two inputs in shared memory. | ||
| 81 | shared_data[first] = input_data[first]; | ||
| 82 | shared_data[second] = input_data[second]; | ||
| 83 | // Make the stores visible, then wait until every invocation has staged its pair. | ||
| 84 | memoryBarrierShared(); | ||
| 85 | barrier(); | ||
| 86 | |||
| 87 | for (uint level = 0u; level < steps; level++) { | ||
| 88 | // At this level the array is viewed as segments of (2 << level) elements; the last element | ||
| 89 | // of a segment's lower half is added to element (id & mask) of its upper half. | ||
| 90 | const uint mask = (1u << level) - 1u; | ||
| 91 | const uint rd_id = ((id >> level) << (level + 1u)) + mask; | ||
| 92 | const uint wr_id = rd_id + 1u + (id & mask); | ||
| 93 | shared_data[wr_id] = AddUint64(shared_data[rd_id], shared_data[wr_id]); | ||
| 94 | // The next level reads values written by other invocations at this one. | ||
| 95 | memoryBarrierShared(); | ||
| 96 | barrier(); | ||
| 97 | } | ||
| 98 | |||
| 99 | // Apply the carried-in base to the elements selected above. | ||
| 100 | shared_data[first] = AddUint64(shared_data[first], base_value_1); | ||
| 101 | shared_data[second] = AddUint64(shared_data[second], base_value_2); | ||
| 102 | // Invocation 0 reads elements owned by other invocations below. | ||
| 103 | memoryBarrierShared(); | ||
| 104 | barrier(); | ||
| 105 | |||
| 106 | // Write this invocation's two results to the output buffer. | ||
| 107 | output_data[first] = shared_data[first]; | ||
| 108 | output_data[second] = shared_data[second]; | ||
| 109 | // Only invocation 0 updates accumulated_data. | ||
| 110 | if (id != 0u) { | ||
| 111 | return; | ||
| 112 | } | ||
| 113 | if (max_accumulation_base >= accumulation_limit + 1) { | ||
| 114 | accumulated_data = shared_data[accumulation_limit]; | ||
| 115 | return; | ||
| 116 | } | ||
| 117 | const uvec2 value_1 = shared_data[max_accumulation_base]; | ||
| 118 | const uvec2 value_2 = shared_data[accumulation_limit]; | ||
| 119 | // 64-bit value_1 - value_2 with the borrow carried into the high word. Unary minus on a uvec2 negates | ||
| 120 | // each word separately, so AddUint64(value_1, -value_2) would be 2^32 too large whenever value_2.x != 0. | ||
| 121 | uint borrow = 0u; | ||
| 122 | const uint low_word = usubBorrow(value_1.x, value_2.x, borrow); | ||
| 123 | accumulated_data = uvec2(low_word, value_1.y - value_2.y - borrow); | ||
| 124 | } \ No newline at end of file | ||