diff options
6 files changed, 253 insertions, 74 deletions
diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt index 8218ec4c8..6b912027f 100644 --- a/src/video_core/host_shaders/CMakeLists.txt +++ b/src/video_core/host_shaders/CMakeLists.txt | |||
| @@ -42,6 +42,7 @@ set(SHADER_FILES | |||
| 42 | present_bicubic.frag | 42 | present_bicubic.frag |
| 43 | present_gaussian.frag | 43 | present_gaussian.frag |
| 44 | queries_prefix_scan_sum.comp | 44 | queries_prefix_scan_sum.comp |
| 45 | queries_prefix_scan_sum_nosubgroups.comp | ||
| 45 | resolve_conditional_render.comp | 46 | resolve_conditional_render.comp |
| 46 | smaa_edge_detection.vert | 47 | smaa_edge_detection.vert |
| 47 | smaa_edge_detection.frag | 48 | smaa_edge_detection.frag |
| @@ -72,6 +73,7 @@ if ("${GLSLANGVALIDATOR}" STREQUAL "GLSLANGVALIDATOR-NOTFOUND") | |||
| 72 | endif() | 73 | endif() |
| 73 | 74 | ||
| 74 | set(GLSL_FLAGS "") | 75 | set(GLSL_FLAGS "") |
| 76 | set(SPIR_V_VERSION "spirv1.3") | ||
| 75 | set(QUIET_FLAG "--quiet") | 77 | set(QUIET_FLAG "--quiet") |
| 76 | 78 | ||
| 77 | set(SHADER_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/include) | 79 | set(SHADER_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/include) |
| @@ -125,7 +127,7 @@ foreach(FILENAME IN ITEMS ${SHADER_FILES}) | |||
| 125 | OUTPUT | 127 | OUTPUT |
| 126 | ${SPIRV_HEADER_FILE} | 128 | ${SPIRV_HEADER_FILE} |
| 127 | COMMAND | 129 | COMMAND |
| 128 | ${GLSLANGVALIDATOR} -V ${QUIET_FLAG} -I"${FIDELITYFX_INCLUDE_DIR}" ${GLSL_FLAGS} --variable-name ${SPIRV_VARIABLE_NAME} -o ${SPIRV_HEADER_FILE} ${SOURCE_FILE} | 130 | ${GLSLANGVALIDATOR} -V ${QUIET_FLAG} -I"${FIDELITYFX_INCLUDE_DIR}" ${GLSL_FLAGS} --variable-name ${SPIRV_VARIABLE_NAME} -o ${SPIRV_HEADER_FILE} ${SOURCE_FILE} --target-env ${SPIR_V_VERSION} |
| 129 | MAIN_DEPENDENCY | 131 | MAIN_DEPENDENCY |
| 130 | ${SOURCE_FILE} | 132 | ${SOURCE_FILE} |
| 131 | ) | 133 | ) |
diff --git a/src/video_core/host_shaders/queries_prefix_scan_sum.comp b/src/video_core/host_shaders/queries_prefix_scan_sum.comp index dce1279fe..8f10e248e 100644 --- a/src/video_core/host_shaders/queries_prefix_scan_sum.comp +++ b/src/video_core/host_shaders/queries_prefix_scan_sum.comp | |||
| @@ -1,26 +1,24 @@ | |||
| 1 | // SPDX-FileCopyrightText: Copyright 2015 Graham Sellers, Richard Wright Jr. and Nicholas Haemel | 1 | // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project |
| 2 | // SPDX-License-Identifier: MIT | 2 | // SPDX-License-Identifier: GPL-3.0-or-later |
| 3 | |||
| 4 | // Code obtained from OpenGL SuperBible, Seventh Edition by Graham Sellers, Richard Wright Jr. and | ||
| 5 | // Nicholas Haemel. Modified to suit needs and optimize for subgroup | ||
| 6 | 3 | ||
| 7 | #version 460 core | 4 | #version 460 core |
| 8 | 5 | ||
| 6 | #extension GL_KHR_shader_subgroup_basic : require | ||
| 7 | #extension GL_KHR_shader_subgroup_shuffle : require | ||
| 8 | #extension GL_KHR_shader_subgroup_shuffle_relative : require | ||
| 9 | #extension GL_KHR_shader_subgroup_arithmetic : require | ||
| 10 | |||
| 9 | #ifdef VULKAN | 11 | #ifdef VULKAN |
| 10 | 12 | ||
| 11 | #extension GL_KHR_shader_subgroup_arithmetic : enable | ||
| 12 | #define HAS_EXTENDED_TYPES 1 | 13 | #define HAS_EXTENDED_TYPES 1 |
| 13 | #define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants { | 14 | #define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants { |
| 14 | #define END_PUSH_CONSTANTS \ | 15 | #define END_PUSH_CONSTANTS }; |
| 15 | } \ | ||
| 16 | ; | ||
| 17 | #define UNIFORM(n) | 16 | #define UNIFORM(n) |
| 18 | #define BINDING_INPUT_BUFFER 0 | 17 | #define BINDING_INPUT_BUFFER 0 |
| 19 | #define BINDING_OUTPUT_IMAGE 1 | 18 | #define BINDING_OUTPUT_IMAGE 1 |
| 20 | 19 | ||
| 21 | #else // ^^^ Vulkan ^^^ // vvv OpenGL vvv | 20 | #else // ^^^ Vulkan ^^^ // vvv OpenGL vvv |
| 22 | 21 | ||
| 23 | #extension GL_KHR_shader_subgroup_arithmetic : enable | ||
| 24 | #extension GL_NV_gpu_shader5 : enable | 22 | #extension GL_NV_gpu_shader5 : enable |
| 25 | #ifdef GL_NV_gpu_shader5 | 23 | #ifdef GL_NV_gpu_shader5 |
| 26 | #define HAS_EXTENDED_TYPES 1 | 24 | #define HAS_EXTENDED_TYPES 1 |
| @@ -43,19 +41,20 @@ END_PUSH_CONSTANTS | |||
| 43 | layout(local_size_x = 32) in; | 41 | layout(local_size_x = 32) in; |
| 44 | 42 | ||
| 45 | layout(std430, binding = 0) readonly buffer block1 { | 43 | layout(std430, binding = 0) readonly buffer block1 { |
| 46 | uvec2 input_data[gl_WorkGroupSize.x]; | 44 | uvec2 input_data[]; |
| 47 | }; | 45 | }; |
| 48 | 46 | ||
| 49 | layout(std430, binding = 1) writeonly coherent buffer block2 { | 47 | layout(std430, binding = 1) coherent buffer block2 { |
| 50 | uvec2 output_data[gl_WorkGroupSize.x]; | 48 | uvec2 output_data[]; |
| 51 | }; | 49 | }; |
| 52 | 50 | ||
| 53 | layout(std430, binding = 2) coherent buffer block3 { | 51 | layout(std430, binding = 2) coherent buffer block3 { |
| 54 | uvec2 accumulated_data; | 52 | uvec2 accumulated_data; |
| 55 | }; | 53 | }; |
| 56 | 54 | ||
| 57 | shared uvec2 shared_data[gl_WorkGroupSize.x * 2]; | 55 | shared uvec2 shared_data[2]; |
| 58 | 56 | ||
| 57 | // Simple Uint64 add that uses 2 uint variables for GPUs that don't support uint64 | ||
| 59 | uvec2 AddUint64(uvec2 value_1, uvec2 value_2) { | 58 | uvec2 AddUint64(uvec2 value_1, uvec2 value_2) { |
| 60 | uint carry = 0; | 59 | uint carry = 0; |
| 61 | uvec2 result; | 60 | uvec2 result; |
| @@ -64,61 +63,102 @@ uvec2 AddUint64(uvec2 value_1, uvec2 value_2) { | |||
| 64 | return result; | 63 | return result; |
| 65 | } | 64 | } |
| 66 | 65 | ||
| 67 | void main(void) { | 66 | // do subgroup Prefix Sum using Hillis and Steele's algorithm |
| 68 | uint id = gl_LocalInvocationID.x; | 67 | uvec2 subgroupInclusiveAddUint64(uvec2 value) { |
| 69 | uvec2 base_value_1 = (id * 2) < max_accumulation_base ? accumulated_data : uvec2(0); | 68 | uvec2 result = value; |
| 70 | uvec2 base_value_2 = (id * 2 + 1) < max_accumulation_base ? accumulated_data : uvec2(0); | 69 | for (uint i = 1; i < gl_SubgroupSize; i *= 2) { |
| 71 | uint work_size = gl_WorkGroupSize.x; | 70 | if (i <= gl_SubgroupInvocationID) { |
| 72 | uint rd_id; | 71 | uvec2 other = subgroupShuffleUp(result, i); // get value from subgroup_inv_id - i; |
| 73 | uint wr_id; | 72 | result = AddUint64(result, other); |
| 74 | uint mask; | 73 | } |
| 75 | uvec2 input_1 = input_data[id * 2]; | 74 | } |
| 76 | uvec2 input_2 = input_data[id * 2 + 1]; | 75 | return result; |
| 77 | // The number of steps is the log base 2 of the | 76 | } |
| 78 | // work group size, which should be a power of 2 | 77 | |
| 79 | const uint steps = uint(log2(work_size)) + 1; | 78 | // Writes down the results to the output buffer and to the accumulation buffer |
| 80 | uint step = 0; | 79 | void WriteResults(uvec2 result) { |
| 81 | 80 | uint current_global_id = gl_GlobalInvocationID.x; | |
| 82 | // Each invocation is responsible for the content of | 81 | uvec2 base_data = current_global_id < max_accumulation_base ? accumulated_data : uvec2(0); |
| 83 | // two elements of the output array | 82 | output_data[current_global_id] = result + base_data; |
| 84 | shared_data[id * 2] = input_1; | 83 | if (max_accumulation_base >= accumulation_limit + 1) { |
| 85 | shared_data[id * 2 + 1] = input_2; | 84 | if (current_global_id == accumulation_limit) { |
| 86 | // Synchronize to make sure that everyone has initialized | 85 | accumulated_data = result; |
| 87 | // their elements of shared_data[] with data loaded from | 86 | } |
| 88 | // the input arrays | 87 | return; |
| 88 | } | ||
| 89 | // We have that ugly case in which the accumulation data is reset in the middle somewhere. | ||
| 89 | barrier(); | 90 | barrier(); |
| 90 | memoryBarrierShared(); | 91 | groupMemoryBarrier(); |
| 91 | // For each step... | 92 | if (current_global_id == accumulation_limit) { |
| 92 | for (step = 0; step < steps; step++) { | 93 | uvec2 value_1 = output_data[max_accumulation_base]; |
| 93 | // Calculate the read and write index in the | 94 | accumulated_data = AddUint64(result, -value_1); |
| 94 | // shared array | ||
| 95 | mask = (1 << step) - 1; | ||
| 96 | rd_id = ((id >> step) << (step + 1)) + mask; | ||
| 97 | wr_id = rd_id + 1 + (id & mask); | ||
| 98 | // Accumulate the read data into our element | ||
| 99 | |||
| 100 | shared_data[wr_id] = AddUint64(shared_data[rd_id], shared_data[wr_id]); | ||
| 101 | // Synchronize again to make sure that everyone | ||
| 102 | // has caught up with us | ||
| 103 | barrier(); | ||
| 104 | memoryBarrierShared(); | ||
| 105 | } | 95 | } |
| 106 | // Add the accumulation | 96 | } |
| 107 | shared_data[id * 2] = AddUint64(shared_data[id * 2], base_value_1); | 97 | |
| 108 | shared_data[id * 2 + 1] = AddUint64(shared_data[id * 2 + 1], base_value_2); | 98 | void main() { |
| 99 | uint subgroup_inv_id = gl_SubgroupInvocationID; | ||
| 100 | uint subgroup_id = gl_SubgroupID; | ||
| 101 | uint last_subgroup_id = subgroupMax(subgroup_inv_id); | ||
| 102 | uint current_global_id = gl_GlobalInvocationID.x; | ||
| 103 | uint total_work = gl_NumWorkGroups.x * gl_WorkGroupSize.x; | ||
| 104 | uvec2 data = input_data[current_global_id]; | ||
| 105 | // make sure all input data has been loaded | ||
| 106 | subgroupBarrier(); | ||
| 107 | subgroupMemoryBarrier(); | ||
| 108 | |||
| 109 | uvec2 result = subgroupInclusiveAddUint64(data); | ||
| 110 | |||
| 111 | // if we had fewer queries than our subgroup, just write down the results. | ||
| 112 | if (total_work <= gl_SubgroupSize) { // This condition is constant per dispatch. | ||
| 113 | WriteResults(result); | ||
| 114 | return; | ||
| 115 | } | ||
| 116 | |||
| 117 | // We now have more, so lets write the last result into shared memory. | ||
| 118 | // Only pick the last subgroup. | ||
| 119 | if (subgroup_inv_id == last_subgroup_id) { | ||
| 120 | shared_data[subgroup_id] = result; | ||
| 121 | } | ||
| 122 | // wait until everyone loaded their stuff | ||
| 109 | barrier(); | 123 | barrier(); |
| 110 | memoryBarrierShared(); | 124 | memoryBarrierShared(); |
| 111 | 125 | ||
| 112 | // Finally write our data back to the output buffer | 126 | // Case 1: the total work for the grouped results can be calculated in a single subgroup |
| 113 | output_data[id * 2] = shared_data[id * 2]; | 127 | // operation (about 1024 queries). |
| 114 | output_data[id * 2 + 1] = shared_data[id * 2 + 1]; | 128 | uint total_extra_work = gl_NumSubgroups * gl_NumWorkGroups.x; |
| 115 | if (id == 0) { | 129 | if (total_extra_work <= gl_SubgroupSize) { // This condition is constant per dispatch. |
| 116 | if (max_accumulation_base >= accumulation_limit + 1) { | 130 | if (subgroup_id != 0) { |
| 117 | accumulated_data = shared_data[accumulation_limit]; | 131 | uvec2 tmp = shared_data[subgroup_inv_id]; |
| 118 | return; | 132 | subgroupBarrier(); |
| 133 | subgroupMemoryBarrierShared(); | ||
| 134 | tmp = subgroupInclusiveAddUint64(tmp); | ||
| 135 | result = AddUint64(result, subgroupShuffle(tmp, subgroup_id - 1)); | ||
| 136 | } | ||
| 137 | |||
| 138 | WriteResults(result); | ||
| 139 | return; | ||
| 140 | } | ||
| 141 | |||
| 142 | // Case 2: our work amount is huge, so lets do it in O(log n) steps. | ||
| 143 | const uint extra = (total_extra_work ^ (total_extra_work - 1)) != 0 ? 1 : 0; | ||
| 144 | const uint steps = 1 << (findMSB(total_extra_work) + extra); | ||
| 145 | uint step; | ||
| 146 | // Hillis and Steele's algorithm | ||
| 147 | for (step = 1; step < steps; step *= 2) { | ||
| 148 | if (current_global_id < steps && current_global_id >= step) { | ||
| 149 | uvec2 current = shared_data[current_global_id]; | ||
| 150 | uvec2 other = shared_data[current_global_id - step]; | ||
| 151 | shared_data[current_global_id] = AddUint64(current, other); | ||
| 119 | } | 152 | } |
| 120 | uvec2 value_1 = shared_data[max_accumulation_base]; | 153 | // steps is constant, so this will always execute in every workgroup's thread. |
| 121 | uvec2 value_2 = shared_data[accumulation_limit]; | 154 | barrier(); |
| 122 | accumulated_data = AddUint64(value_1, -value_2); | 155 | memoryBarrierShared(); |
| 156 | } | ||
| 157 | // Only add results for groups higher than 0 | ||
| 158 | if (subgroup_id != 0) { | ||
| 159 | result = AddUint64(result, shared_data[subgroup_id - 1]); | ||
| 123 | } | 160 | } |
| 161 | |||
| 162 | // Just write the final results. We are done | ||
| 163 | WriteResults(result); | ||
| 124 | } \ No newline at end of file | 164 | } \ No newline at end of file |
diff --git a/src/video_core/host_shaders/queries_prefix_scan_sum_nosubgroups.comp b/src/video_core/host_shaders/queries_prefix_scan_sum_nosubgroups.comp new file mode 100644 index 000000000..8021476ed --- /dev/null +++ b/src/video_core/host_shaders/queries_prefix_scan_sum_nosubgroups.comp | |||
| @@ -0,0 +1,120 @@ | |||
| 1 | // SPDX-FileCopyrightText: Copyright 2015 Graham Sellers, Richard Wright Jr. and Nicholas Haemel | ||
| 2 | // SPDX-License-Identifier: MIT | ||
| 3 | |||
| 4 | // Code obtained from OpenGL SuperBible, Seventh Edition by Graham Sellers, Richard Wright Jr. and | ||
| 5 | // Nicholas Haemel. Modified to suit needs. | ||
| 6 | |||
| 7 | #version 460 core | ||
| 8 | |||
| 9 | #ifdef VULKAN | ||
| 10 | |||
| 11 | #define HAS_EXTENDED_TYPES 1 | ||
| 12 | #define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants { | ||
| 13 | #define END_PUSH_CONSTANTS }; | ||
| 14 | #define UNIFORM(n) | ||
| 15 | #define BINDING_INPUT_BUFFER 0 | ||
| 16 | #define BINDING_OUTPUT_IMAGE 1 | ||
| 17 | |||
| 18 | #else // ^^^ Vulkan ^^^ // vvv OpenGL vvv | ||
| 19 | |||
| 20 | #extension GL_NV_gpu_shader5 : enable | ||
| 21 | #ifdef GL_NV_gpu_shader5 | ||
| 22 | #define HAS_EXTENDED_TYPES 1 | ||
| 23 | #else | ||
| 24 | #define HAS_EXTENDED_TYPES 0 | ||
| 25 | #endif | ||
| 26 | #define BEGIN_PUSH_CONSTANTS | ||
| 27 | #define END_PUSH_CONSTANTS | ||
| 28 | #define UNIFORM(n) layout(location = n) uniform | ||
| 29 | #define BINDING_INPUT_BUFFER 0 | ||
| 30 | #define BINDING_OUTPUT_IMAGE 0 | ||
| 31 | |||
| 32 | #endif | ||
| 33 | |||
| 34 | BEGIN_PUSH_CONSTANTS | ||
| 35 | UNIFORM(0) uint max_accumulation_base; | ||
| 36 | UNIFORM(1) uint accumulation_limit; | ||
| 37 | END_PUSH_CONSTANTS | ||
| 38 | |||
| 39 | layout(local_size_x = 32) in; | ||
| 40 | |||
| 41 | layout(std430, binding = 0) readonly buffer block1 { | ||
| 42 | uvec2 input_data[gl_WorkGroupSize.x]; | ||
| 43 | }; | ||
| 44 | |||
| 45 | layout(std430, binding = 1) writeonly coherent buffer block2 { | ||
| 46 | uvec2 output_data[gl_WorkGroupSize.x]; | ||
| 47 | }; | ||
| 48 | |||
| 49 | layout(std430, binding = 2) coherent buffer block3 { | ||
| 50 | uvec2 accumulated_data; | ||
| 51 | }; | ||
| 52 | |||
| 53 | shared uvec2 shared_data[gl_WorkGroupSize.x * 2]; | ||
| 54 | |||
| 55 | uvec2 AddUint64(uvec2 value_1, uvec2 value_2) { | ||
| 56 | uint carry = 0; | ||
| 57 | uvec2 result; | ||
| 58 | result.x = uaddCarry(value_1.x, value_2.x, carry); | ||
| 59 | result.y = value_1.y + value_2.y + carry; | ||
| 60 | return result; | ||
| 61 | } | ||
| 62 | |||
| 63 | void main(void) { | ||
| 64 | uint id = gl_LocalInvocationID.x; | ||
| 65 | uvec2 base_value_1 = (id * 2) < max_accumulation_base ? accumulated_data : uvec2(0); | ||
| 66 | uvec2 base_value_2 = (id * 2 + 1) < max_accumulation_base ? accumulated_data : uvec2(0); | ||
| 67 | uint work_size = gl_WorkGroupSize.x; | ||
| 68 | uint rd_id; | ||
| 69 | uint wr_id; | ||
| 70 | uint mask; | ||
| 71 | uvec2 input_1 = input_data[id * 2]; | ||
| 72 | uvec2 input_2 = input_data[id * 2 + 1]; | ||
| 73 | // The number of steps is the log base 2 of the | ||
| 74 | // work group size, which should be a power of 2 | ||
| 75 | const uint steps = uint(log2(work_size)) + 1; | ||
| 76 | uint step = 0; | ||
| 77 | |||
| 78 | // Each invocation is responsible for the content of | ||
| 79 | // two elements of the output array | ||
| 80 | shared_data[id * 2] = input_1; | ||
| 81 | shared_data[id * 2 + 1] = input_2; | ||
| 82 | // Synchronize to make sure that everyone has initialized | ||
| 83 | // their elements of shared_data[] with data loaded from | ||
| 84 | // the input arrays | ||
| 85 | barrier(); | ||
| 86 | memoryBarrierShared(); | ||
| 87 | // For each step... | ||
| 88 | for (step = 0; step < steps; step++) { | ||
| 89 | // Calculate the read and write index in the | ||
| 90 | // shared array | ||
| 91 | mask = (1 << step) - 1; | ||
| 92 | rd_id = ((id >> step) << (step + 1)) + mask; | ||
| 93 | wr_id = rd_id + 1 + (id & mask); | ||
| 94 | // Accumulate the read data into our element | ||
| 95 | |||
| 96 | shared_data[wr_id] = AddUint64(shared_data[rd_id], shared_data[wr_id]); | ||
| 97 | // Synchronize again to make sure that everyone | ||
| 98 | // has caught up with us | ||
| 99 | barrier(); | ||
| 100 | memoryBarrierShared(); | ||
| 101 | } | ||
| 102 | // Add the accumulation | ||
| 103 | shared_data[id * 2] = AddUint64(shared_data[id * 2], base_value_1); | ||
| 104 | shared_data[id * 2 + 1] = AddUint64(shared_data[id * 2 + 1], base_value_2); | ||
| 105 | barrier(); | ||
| 106 | memoryBarrierShared(); | ||
| 107 | |||
| 108 | // Finally write our data back to the output buffer | ||
| 109 | output_data[id * 2] = shared_data[id * 2]; | ||
| 110 | output_data[id * 2 + 1] = shared_data[id * 2 + 1]; | ||
| 111 | if (id == 0) { | ||
| 112 | if (max_accumulation_base >= accumulation_limit + 1) { | ||
| 113 | accumulated_data = shared_data[accumulation_limit]; | ||
| 114 | return; | ||
| 115 | } | ||
| 116 | uvec2 value_1 = shared_data[max_accumulation_base]; | ||
| 117 | uvec2 value_2 = shared_data[accumulation_limit]; | ||
| 118 | accumulated_data = AddUint64(value_1, -value_2); | ||
| 119 | } | ||
| 120 | } \ No newline at end of file | ||
diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp index a1af08cda..44ec5a032 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp | |||
| @@ -13,6 +13,7 @@ | |||
| 13 | #include "common/div_ceil.h" | 13 | #include "common/div_ceil.h" |
| 14 | #include "video_core/host_shaders/astc_decoder_comp_spv.h" | 14 | #include "video_core/host_shaders/astc_decoder_comp_spv.h" |
| 15 | #include "video_core/host_shaders/queries_prefix_scan_sum_comp_spv.h" | 15 | #include "video_core/host_shaders/queries_prefix_scan_sum_comp_spv.h" |
| 16 | #include "video_core/host_shaders/queries_prefix_scan_sum_nosubgroups_comp_spv.h" | ||
| 16 | #include "video_core/host_shaders/resolve_conditional_render_comp_spv.h" | 17 | #include "video_core/host_shaders/resolve_conditional_render_comp_spv.h" |
| 17 | #include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h" | 18 | #include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h" |
| 18 | #include "video_core/host_shaders/vulkan_uint8_comp_spv.h" | 19 | #include "video_core/host_shaders/vulkan_uint8_comp_spv.h" |
| @@ -187,7 +188,8 @@ ComputePass::ComputePass(const Device& device_, DescriptorPool& descriptor_pool, | |||
| 187 | vk::Span<VkDescriptorSetLayoutBinding> bindings, | 188 | vk::Span<VkDescriptorSetLayoutBinding> bindings, |
| 188 | vk::Span<VkDescriptorUpdateTemplateEntry> templates, | 189 | vk::Span<VkDescriptorUpdateTemplateEntry> templates, |
| 189 | const DescriptorBankInfo& bank_info, | 190 | const DescriptorBankInfo& bank_info, |
| 190 | vk::Span<VkPushConstantRange> push_constants, std::span<const u32> code) | 191 | vk::Span<VkPushConstantRange> push_constants, std::span<const u32> code, |
| 192 | std::optional<u32> optional_subgroup_size) | ||
| 191 | : device{device_} { | 193 | : device{device_} { |
| 192 | descriptor_set_layout = device.GetLogical().CreateDescriptorSetLayout({ | 194 | descriptor_set_layout = device.GetLogical().CreateDescriptorSetLayout({ |
| 193 | .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, | 195 | .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, |
| @@ -228,13 +230,19 @@ ComputePass::ComputePass(const Device& device_, DescriptorPool& descriptor_pool, | |||
| 228 | .pCode = code.data(), | 230 | .pCode = code.data(), |
| 229 | }); | 231 | }); |
| 230 | device.SaveShader(code); | 232 | device.SaveShader(code); |
| 233 | const VkPipelineShaderStageRequiredSubgroupSizeCreateInfoEXT subgroup_size_ci{ | ||
| 234 | .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO_EXT, | ||
| 235 | .pNext = nullptr, | ||
| 236 | .requiredSubgroupSize = optional_subgroup_size ? *optional_subgroup_size : 32U, | ||
| 237 | }; | ||
| 238 | bool use_setup_size = device.IsExtSubgroupSizeControlSupported() && optional_subgroup_size; | ||
| 231 | pipeline = device.GetLogical().CreateComputePipeline({ | 239 | pipeline = device.GetLogical().CreateComputePipeline({ |
| 232 | .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, | 240 | .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, |
| 233 | .pNext = nullptr, | 241 | .pNext = nullptr, |
| 234 | .flags = 0, | 242 | .flags = 0, |
| 235 | .stage{ | 243 | .stage{ |
| 236 | .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, | 244 | .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, |
| 237 | .pNext = nullptr, | 245 | .pNext = use_setup_size ? &subgroup_size_ci : nullptr, |
| 238 | .flags = 0, | 246 | .flags = 0, |
| 239 | .stage = VK_SHADER_STAGE_COMPUTE_BIT, | 247 | .stage = VK_SHADER_STAGE_COMPUTE_BIT, |
| 240 | .module = *module, | 248 | .module = *module, |
| @@ -399,10 +407,17 @@ void ConditionalRenderingResolvePass::Resolve(VkBuffer dst_buffer, VkBuffer src_ | |||
| 399 | QueriesPrefixScanPass::QueriesPrefixScanPass( | 407 | QueriesPrefixScanPass::QueriesPrefixScanPass( |
| 400 | const Device& device_, Scheduler& scheduler_, DescriptorPool& descriptor_pool_, | 408 | const Device& device_, Scheduler& scheduler_, DescriptorPool& descriptor_pool_, |
| 401 | ComputePassDescriptorQueue& compute_pass_descriptor_queue_) | 409 | ComputePassDescriptorQueue& compute_pass_descriptor_queue_) |
| 402 | : ComputePass(device_, descriptor_pool_, QUERIES_SCAN_DESCRIPTOR_SET_BINDINGS, | 410 | : ComputePass( |
| 403 | QUERIES_SCAN_DESCRIPTOR_UPDATE_TEMPLATE, QUERIES_SCAN_BANK_INFO, | 411 | device_, descriptor_pool_, QUERIES_SCAN_DESCRIPTOR_SET_BINDINGS, |
| 404 | COMPUTE_PUSH_CONSTANT_RANGE<sizeof(QueriesPrefixScanPushConstants)>, | 412 | QUERIES_SCAN_DESCRIPTOR_UPDATE_TEMPLATE, QUERIES_SCAN_BANK_INFO, |
| 405 | QUERIES_PREFIX_SCAN_SUM_COMP_SPV), | 413 | COMPUTE_PUSH_CONSTANT_RANGE<sizeof(QueriesPrefixScanPushConstants)>, |
| 414 | device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_BASIC_BIT) && | ||
| 415 | device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_ARITHMETIC_BIT) && | ||
| 416 | device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_SHUFFLE_BIT) && | ||
| 417 | device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT) | ||
| 418 | ? std::span<const u32>(QUERIES_PREFIX_SCAN_SUM_COMP_SPV) | ||
| 419 | : std::span<const u32>(QUERIES_PREFIX_SCAN_SUM_NOSUBGROUPS_COMP_SPV), | ||
| 420 | {32}), | ||
| 406 | scheduler{scheduler_}, compute_pass_descriptor_queue{compute_pass_descriptor_queue_} {} | 421 | scheduler{scheduler_}, compute_pass_descriptor_queue{compute_pass_descriptor_queue_} {} |
| 407 | 422 | ||
| 408 | void QueriesPrefixScanPass::Run(VkBuffer accumulation_buffer, VkBuffer dst_buffer, | 423 | void QueriesPrefixScanPass::Run(VkBuffer accumulation_buffer, VkBuffer dst_buffer, |
diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.h b/src/video_core/renderer_vulkan/vk_compute_pass.h index e6ff86e9a..68ffb1b82 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.h +++ b/src/video_core/renderer_vulkan/vk_compute_pass.h | |||
| @@ -3,6 +3,7 @@ | |||
| 3 | 3 | ||
| 4 | #pragma once | 4 | #pragma once |
| 5 | 5 | ||
| 6 | #include <optional> | ||
| 6 | #include <span> | 7 | #include <span> |
| 7 | #include <utility> | 8 | #include <utility> |
| 8 | 9 | ||
| @@ -31,7 +32,8 @@ public: | |||
| 31 | vk::Span<VkDescriptorSetLayoutBinding> bindings, | 32 | vk::Span<VkDescriptorSetLayoutBinding> bindings, |
| 32 | vk::Span<VkDescriptorUpdateTemplateEntry> templates, | 33 | vk::Span<VkDescriptorUpdateTemplateEntry> templates, |
| 33 | const DescriptorBankInfo& bank_info, | 34 | const DescriptorBankInfo& bank_info, |
| 34 | vk::Span<VkPushConstantRange> push_constants, std::span<const u32> code); | 35 | vk::Span<VkPushConstantRange> push_constants, std::span<const u32> code, |
| 36 | std::optional<u32> optional_subgroup_size = std::nullopt); | ||
| 35 | ~ComputePass(); | 37 | ~ComputePass(); |
| 36 | 38 | ||
| 37 | protected: | 39 | protected: |
diff --git a/src/video_core/renderer_vulkan/vk_query_cache.cpp b/src/video_core/renderer_vulkan/vk_query_cache.cpp index ded190ae0..825e1a72e 100644 --- a/src/video_core/renderer_vulkan/vk_query_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_query_cache.cpp | |||
| @@ -1376,10 +1376,10 @@ bool QueryCacheRuntime::HostConditionalRenderingCompareValues(VideoCommon::Looku | |||
| 1376 | return true; | 1376 | return true; |
| 1377 | } | 1377 | } |
| 1378 | } | 1378 | } |
| 1379 | /*if (!is_in_bc[0] && !is_in_bc[1]) { | 1379 | if (!is_in_bc[0] && !is_in_bc[1]) { |
| 1380 | // Both queries are in query cache, it's best to just flush. | 1380 | // Both queries are in query cache, it's best to just flush. |
| 1381 | return true; | 1381 | return true; |
| 1382 | }*/ | 1382 | } |
| 1383 | HostConditionalRenderingCompareBCImpl(object_1.address, equal_check); | 1383 | HostConditionalRenderingCompareBCImpl(object_1.address, equal_check); |
| 1384 | return true; | 1384 | return true; |
| 1385 | } | 1385 | } |