summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/video_core/host_shaders/CMakeLists.txt4
-rw-r--r--src/video_core/host_shaders/queries_prefix_scan_sum.comp168
-rw-r--r--src/video_core/host_shaders/queries_prefix_scan_sum_nosubgroups.comp120
-rw-r--r--src/video_core/renderer_vulkan/vk_compute_pass.cpp27
-rw-r--r--src/video_core/renderer_vulkan/vk_compute_pass.h4
-rw-r--r--src/video_core/renderer_vulkan/vk_query_cache.cpp4
6 files changed, 253 insertions, 74 deletions
diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt
index 8218ec4c8..6b912027f 100644
--- a/src/video_core/host_shaders/CMakeLists.txt
+++ b/src/video_core/host_shaders/CMakeLists.txt
@@ -42,6 +42,7 @@ set(SHADER_FILES
42 present_bicubic.frag 42 present_bicubic.frag
43 present_gaussian.frag 43 present_gaussian.frag
44 queries_prefix_scan_sum.comp 44 queries_prefix_scan_sum.comp
45 queries_prefix_scan_sum_nosubgroups.comp
45 resolve_conditional_render.comp 46 resolve_conditional_render.comp
46 smaa_edge_detection.vert 47 smaa_edge_detection.vert
47 smaa_edge_detection.frag 48 smaa_edge_detection.frag
@@ -72,6 +73,7 @@ if ("${GLSLANGVALIDATOR}" STREQUAL "GLSLANGVALIDATOR-NOTFOUND")
72endif() 73endif()
73 74
74set(GLSL_FLAGS "") 75set(GLSL_FLAGS "")
76set(SPIR_V_VERSION "spirv1.3")
75set(QUIET_FLAG "--quiet") 77set(QUIET_FLAG "--quiet")
76 78
77set(SHADER_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/include) 79set(SHADER_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/include)
@@ -125,7 +127,7 @@ foreach(FILENAME IN ITEMS ${SHADER_FILES})
125 OUTPUT 127 OUTPUT
126 ${SPIRV_HEADER_FILE} 128 ${SPIRV_HEADER_FILE}
127 COMMAND 129 COMMAND
128 ${GLSLANGVALIDATOR} -V ${QUIET_FLAG} -I"${FIDELITYFX_INCLUDE_DIR}" ${GLSL_FLAGS} --variable-name ${SPIRV_VARIABLE_NAME} -o ${SPIRV_HEADER_FILE} ${SOURCE_FILE} 130 ${GLSLANGVALIDATOR} -V ${QUIET_FLAG} -I"${FIDELITYFX_INCLUDE_DIR}" ${GLSL_FLAGS} --variable-name ${SPIRV_VARIABLE_NAME} -o ${SPIRV_HEADER_FILE} ${SOURCE_FILE} --target-env ${SPIR_V_VERSION}
129 MAIN_DEPENDENCY 131 MAIN_DEPENDENCY
130 ${SOURCE_FILE} 132 ${SOURCE_FILE}
131 ) 133 )
diff --git a/src/video_core/host_shaders/queries_prefix_scan_sum.comp b/src/video_core/host_shaders/queries_prefix_scan_sum.comp
index dce1279fe..8f10e248e 100644
--- a/src/video_core/host_shaders/queries_prefix_scan_sum.comp
+++ b/src/video_core/host_shaders/queries_prefix_scan_sum.comp
@@ -1,26 +1,24 @@
1// SPDX-FileCopyrightText: Copyright 2015 Graham Sellers, Richard Wright Jr. and Nicholas Haemel 1// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
2// SPDX-License-Identifier: MIT 2// SPDX-License-Identifier: GPL-3.0-or-later
3
4// Code obtained from OpenGL SuperBible, Seventh Edition by Graham Sellers, Richard Wright Jr. and
5// Nicholas Haemel. Modified to suit needs and optimize for subgroup
6 3
7#version 460 core 4#version 460 core
8 5
6#extension GL_KHR_shader_subgroup_basic : require
7#extension GL_KHR_shader_subgroup_shuffle : require
8#extension GL_KHR_shader_subgroup_shuffle_relative : require
9#extension GL_KHR_shader_subgroup_arithmetic : require
10
9#ifdef VULKAN 11#ifdef VULKAN
10 12
11#extension GL_KHR_shader_subgroup_arithmetic : enable
12#define HAS_EXTENDED_TYPES 1 13#define HAS_EXTENDED_TYPES 1
13#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants { 14#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
14#define END_PUSH_CONSTANTS \ 15#define END_PUSH_CONSTANTS };
15 } \
16 ;
17#define UNIFORM(n) 16#define UNIFORM(n)
18#define BINDING_INPUT_BUFFER 0 17#define BINDING_INPUT_BUFFER 0
19#define BINDING_OUTPUT_IMAGE 1 18#define BINDING_OUTPUT_IMAGE 1
20 19
21#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv 20#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv
22 21
23#extension GL_KHR_shader_subgroup_arithmetic : enable
24#extension GL_NV_gpu_shader5 : enable 22#extension GL_NV_gpu_shader5 : enable
25#ifdef GL_NV_gpu_shader5 23#ifdef GL_NV_gpu_shader5
26#define HAS_EXTENDED_TYPES 1 24#define HAS_EXTENDED_TYPES 1
@@ -43,19 +41,20 @@ END_PUSH_CONSTANTS
43layout(local_size_x = 32) in; 41layout(local_size_x = 32) in;
44 42
45layout(std430, binding = 0) readonly buffer block1 { 43layout(std430, binding = 0) readonly buffer block1 {
46 uvec2 input_data[gl_WorkGroupSize.x]; 44 uvec2 input_data[];
47}; 45};
48 46
49layout(std430, binding = 1) writeonly coherent buffer block2 { 47layout(std430, binding = 1) coherent buffer block2 {
50 uvec2 output_data[gl_WorkGroupSize.x]; 48 uvec2 output_data[];
51}; 49};
52 50
53layout(std430, binding = 2) coherent buffer block3 { 51layout(std430, binding = 2) coherent buffer block3 {
54 uvec2 accumulated_data; 52 uvec2 accumulated_data;
55}; 53};
56 54
57shared uvec2 shared_data[gl_WorkGroupSize.x * 2]; 55shared uvec2 shared_data[2];
58 56
57// Simple Uint64 add that uses 2 uint variables for GPUs that don't support uint64
59uvec2 AddUint64(uvec2 value_1, uvec2 value_2) { 58uvec2 AddUint64(uvec2 value_1, uvec2 value_2) {
60 uint carry = 0; 59 uint carry = 0;
61 uvec2 result; 60 uvec2 result;
@@ -64,61 +63,102 @@ uvec2 AddUint64(uvec2 value_1, uvec2 value_2) {
64 return result; 63 return result;
65} 64}
66 65
67void main(void) { 66// do subgroup Prefix Sum using Hillis and Steele's algorithm
68 uint id = gl_LocalInvocationID.x; 67uvec2 subgroupInclusiveAddUint64(uvec2 value) {
69 uvec2 base_value_1 = (id * 2) < max_accumulation_base ? accumulated_data : uvec2(0); 68 uvec2 result = value;
70 uvec2 base_value_2 = (id * 2 + 1) < max_accumulation_base ? accumulated_data : uvec2(0); 69 for (uint i = 1; i < gl_SubgroupSize; i *= 2) {
71 uint work_size = gl_WorkGroupSize.x; 70 if (i <= gl_SubgroupInvocationID) {
72 uint rd_id; 71 uvec2 other = subgroupShuffleUp(result, i); // get value from subgroup_inv_id - i;
73 uint wr_id; 72 result = AddUint64(result, other);
74 uint mask; 73 }
75 uvec2 input_1 = input_data[id * 2]; 74 }
76 uvec2 input_2 = input_data[id * 2 + 1]; 75 return result;
77 // The number of steps is the log base 2 of the 76}
78 // work group size, which should be a power of 2 77
79 const uint steps = uint(log2(work_size)) + 1; 78// Writes down the results to the output buffer and to the accumulation buffer
80 uint step = 0; 79void WriteResults(uvec2 result) {
81 80 uint current_global_id = gl_GlobalInvocationID.x;
82 // Each invocation is responsible for the content of 81 uvec2 base_data = current_global_id < max_accumulation_base ? accumulated_data : uvec2(0);
83 // two elements of the output array 82 output_data[current_global_id] = result + base_data;
84 shared_data[id * 2] = input_1; 83 if (max_accumulation_base >= accumulation_limit + 1) {
85 shared_data[id * 2 + 1] = input_2; 84 if (current_global_id == accumulation_limit) {
86 // Synchronize to make sure that everyone has initialized 85 accumulated_data = result;
87 // their elements of shared_data[] with data loaded from 86 }
88 // the input arrays 87 return;
88 }
89 // We have that ugly case in which the accumulation data is reset in the middle somewhere.
89 barrier(); 90 barrier();
90 memoryBarrierShared(); 91 groupMemoryBarrier();
91 // For each step... 92 if (current_global_id == accumulation_limit) {
92 for (step = 0; step < steps; step++) { 93 uvec2 value_1 = output_data[max_accumulation_base];
93 // Calculate the read and write index in the 94 accumulated_data = AddUint64(result, -value_1);
94 // shared array
95 mask = (1 << step) - 1;
96 rd_id = ((id >> step) << (step + 1)) + mask;
97 wr_id = rd_id + 1 + (id & mask);
98 // Accumulate the read data into our element
99
100 shared_data[wr_id] = AddUint64(shared_data[rd_id], shared_data[wr_id]);
101 // Synchronize again to make sure that everyone
102 // has caught up with us
103 barrier();
104 memoryBarrierShared();
105 } 95 }
106 // Add the accumulation 96}
107 shared_data[id * 2] = AddUint64(shared_data[id * 2], base_value_1); 97
108 shared_data[id * 2 + 1] = AddUint64(shared_data[id * 2 + 1], base_value_2); 98void main() {
99 uint subgroup_inv_id = gl_SubgroupInvocationID;
100 uint subgroup_id = gl_SubgroupID;
101 uint last_subgroup_id = subgroupMax(subgroup_inv_id);
102 uint current_global_id = gl_GlobalInvocationID.x;
103 uint total_work = gl_NumWorkGroups.x * gl_WorkGroupSize.x;
104 uvec2 data = input_data[current_global_id];
105 // make sure all input data has been loaded
106 subgroupBarrier();
107 subgroupMemoryBarrier();
108
109 uvec2 result = subgroupInclusiveAddUint64(data);
110
111 // if we had fewer queries than our subgroup, just write down the results.
112 if (total_work <= gl_SubgroupSize) { // This condition is constant per dispatch.
113 WriteResults(result);
114 return;
115 }
116
117 // We now have more, so let's write the last result into shared memory.
118 // Only pick the last subgroup.
119 if (subgroup_inv_id == last_subgroup_id) {
120 shared_data[subgroup_id] = result;
121 }
122 // wait until everyone has loaded their data
109 barrier(); 123 barrier();
110 memoryBarrierShared(); 124 memoryBarrierShared();
111 125
112 // Finally write our data back to the output buffer 126 // Case 1: the total work for the grouped results can be calculated in a single subgroup
113 output_data[id * 2] = shared_data[id * 2]; 127 // operation (about 1024 queries).
114 output_data[id * 2 + 1] = shared_data[id * 2 + 1]; 128 uint total_extra_work = gl_NumSubgroups * gl_NumWorkGroups.x;
115 if (id == 0) { 129 if (total_extra_work <= gl_SubgroupSize) { // This condition is constant per dispatch.
116 if (max_accumulation_base >= accumulation_limit + 1) { 130 if (subgroup_id != 0) {
117 accumulated_data = shared_data[accumulation_limit]; 131 uvec2 tmp = shared_data[subgroup_inv_id];
118 return; 132 subgroupBarrier();
133 subgroupMemoryBarrierShared();
134 tmp = subgroupInclusiveAddUint64(tmp);
135 result = AddUint64(result, subgroupShuffle(tmp, subgroup_id - 1));
136 }
137
138 WriteResults(result);
139 return;
140 }
141
142 // Case 2: our work amount is huge, so let's do it in O(log n) steps.
143 const uint extra = (total_extra_work ^ (total_extra_work - 1)) != 0 ? 1 : 0;
144 const uint steps = 1 << (findMSB(total_extra_work) + extra);
145 uint step;
146 // Hillis and Steele's algorithm
147 for (step = 1; step < steps; step *= 2) {
148 if (current_global_id < steps && current_global_id >= step) {
149 uvec2 current = shared_data[current_global_id];
150 uvec2 other = shared_data[current_global_id - step];
151 shared_data[current_global_id] = AddUint64(current, other);
119 } 152 }
120 uvec2 value_1 = shared_data[max_accumulation_base]; 153 // steps is constant, so this will always execute in every workgroup's thread.
121 uvec2 value_2 = shared_data[accumulation_limit]; 154 barrier();
122 accumulated_data = AddUint64(value_1, -value_2); 155 memoryBarrierShared();
156 }
157 // Only add results for groups higher than 0
158 if (subgroup_id != 0) {
159 result = AddUint64(result, shared_data[subgroup_id - 1]);
123 } 160 }
161
162 // Just write the final results. We are done
163 WriteResults(result);
124} \ No newline at end of file 164} \ No newline at end of file
diff --git a/src/video_core/host_shaders/queries_prefix_scan_sum_nosubgroups.comp b/src/video_core/host_shaders/queries_prefix_scan_sum_nosubgroups.comp
new file mode 100644
index 000000000..8021476ed
--- /dev/null
+++ b/src/video_core/host_shaders/queries_prefix_scan_sum_nosubgroups.comp
@@ -0,0 +1,120 @@
1// SPDX-FileCopyrightText: Copyright 2015 Graham Sellers, Richard Wright Jr. and Nicholas Haemel
2// SPDX-License-Identifier: MIT
3
4// Code obtained from OpenGL SuperBible, Seventh Edition by Graham Sellers, Richard Wright Jr. and
5// Nicholas Haemel. Modified to suit needs.
6
7#version 460 core
8
9#ifdef VULKAN
10
11#define HAS_EXTENDED_TYPES 1
12#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
13#define END_PUSH_CONSTANTS };
14#define UNIFORM(n)
15#define BINDING_INPUT_BUFFER 0
16#define BINDING_OUTPUT_IMAGE 1
17
18#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv
19
20#extension GL_NV_gpu_shader5 : enable
21#ifdef GL_NV_gpu_shader5
22#define HAS_EXTENDED_TYPES 1
23#else
24#define HAS_EXTENDED_TYPES 0
25#endif
26#define BEGIN_PUSH_CONSTANTS
27#define END_PUSH_CONSTANTS
28#define UNIFORM(n) layout(location = n) uniform
29#define BINDING_INPUT_BUFFER 0
30#define BINDING_OUTPUT_IMAGE 0
31
32#endif
33
34BEGIN_PUSH_CONSTANTS
35UNIFORM(0) uint max_accumulation_base;
36UNIFORM(1) uint accumulation_limit;
37END_PUSH_CONSTANTS
38
39layout(local_size_x = 32) in;
40
41layout(std430, binding = 0) readonly buffer block1 {
42 uvec2 input_data[gl_WorkGroupSize.x];
43};
44
45layout(std430, binding = 1) writeonly coherent buffer block2 {
46 uvec2 output_data[gl_WorkGroupSize.x];
47};
48
49layout(std430, binding = 2) coherent buffer block3 {
50 uvec2 accumulated_data;
51};
52
53shared uvec2 shared_data[gl_WorkGroupSize.x * 2];
54
55uvec2 AddUint64(uvec2 value_1, uvec2 value_2) {
56 uint carry = 0;
57 uvec2 result;
58 result.x = uaddCarry(value_1.x, value_2.x, carry);
59 result.y = value_1.y + value_2.y + carry;
60 return result;
61}
62
63void main(void) {
64 uint id = gl_LocalInvocationID.x;
65 uvec2 base_value_1 = (id * 2) < max_accumulation_base ? accumulated_data : uvec2(0);
66 uvec2 base_value_2 = (id * 2 + 1) < max_accumulation_base ? accumulated_data : uvec2(0);
67 uint work_size = gl_WorkGroupSize.x;
68 uint rd_id;
69 uint wr_id;
70 uint mask;
71 uvec2 input_1 = input_data[id * 2];
72 uvec2 input_2 = input_data[id * 2 + 1];
73 // The number of steps is the log base 2 of the
74 // work group size, which should be a power of 2
75 const uint steps = uint(log2(work_size)) + 1;
76 uint step = 0;
77
78 // Each invocation is responsible for the content of
79 // two elements of the output array
80 shared_data[id * 2] = input_1;
81 shared_data[id * 2 + 1] = input_2;
82 // Synchronize to make sure that everyone has initialized
83 // their elements of shared_data[] with data loaded from
84 // the input arrays
85 barrier();
86 memoryBarrierShared();
87 // For each step...
88 for (step = 0; step < steps; step++) {
89 // Calculate the read and write index in the
90 // shared array
91 mask = (1 << step) - 1;
92 rd_id = ((id >> step) << (step + 1)) + mask;
93 wr_id = rd_id + 1 + (id & mask);
94 // Accumulate the read data into our element
95
96 shared_data[wr_id] = AddUint64(shared_data[rd_id], shared_data[wr_id]);
97 // Synchronize again to make sure that everyone
98 // has caught up with us
99 barrier();
100 memoryBarrierShared();
101 }
102 // Add the accumulation
103 shared_data[id * 2] = AddUint64(shared_data[id * 2], base_value_1);
104 shared_data[id * 2 + 1] = AddUint64(shared_data[id * 2 + 1], base_value_2);
105 barrier();
106 memoryBarrierShared();
107
108 // Finally write our data back to the output buffer
109 output_data[id * 2] = shared_data[id * 2];
110 output_data[id * 2 + 1] = shared_data[id * 2 + 1];
111 if (id == 0) {
112 if (max_accumulation_base >= accumulation_limit + 1) {
113 accumulated_data = shared_data[accumulation_limit];
114 return;
115 }
116 uvec2 value_1 = shared_data[max_accumulation_base];
117 uvec2 value_2 = shared_data[accumulation_limit];
118 accumulated_data = AddUint64(value_1, -value_2);
119 }
120} \ No newline at end of file
diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp
index a1af08cda..44ec5a032 100644
--- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp
+++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp
@@ -13,6 +13,7 @@
13#include "common/div_ceil.h" 13#include "common/div_ceil.h"
14#include "video_core/host_shaders/astc_decoder_comp_spv.h" 14#include "video_core/host_shaders/astc_decoder_comp_spv.h"
15#include "video_core/host_shaders/queries_prefix_scan_sum_comp_spv.h" 15#include "video_core/host_shaders/queries_prefix_scan_sum_comp_spv.h"
16#include "video_core/host_shaders/queries_prefix_scan_sum_nosubgroups_comp_spv.h"
16#include "video_core/host_shaders/resolve_conditional_render_comp_spv.h" 17#include "video_core/host_shaders/resolve_conditional_render_comp_spv.h"
17#include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h" 18#include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h"
18#include "video_core/host_shaders/vulkan_uint8_comp_spv.h" 19#include "video_core/host_shaders/vulkan_uint8_comp_spv.h"
@@ -187,7 +188,8 @@ ComputePass::ComputePass(const Device& device_, DescriptorPool& descriptor_pool,
187 vk::Span<VkDescriptorSetLayoutBinding> bindings, 188 vk::Span<VkDescriptorSetLayoutBinding> bindings,
188 vk::Span<VkDescriptorUpdateTemplateEntry> templates, 189 vk::Span<VkDescriptorUpdateTemplateEntry> templates,
189 const DescriptorBankInfo& bank_info, 190 const DescriptorBankInfo& bank_info,
190 vk::Span<VkPushConstantRange> push_constants, std::span<const u32> code) 191 vk::Span<VkPushConstantRange> push_constants, std::span<const u32> code,
192 std::optional<u32> optional_subgroup_size)
191 : device{device_} { 193 : device{device_} {
192 descriptor_set_layout = device.GetLogical().CreateDescriptorSetLayout({ 194 descriptor_set_layout = device.GetLogical().CreateDescriptorSetLayout({
193 .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, 195 .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
@@ -228,13 +230,19 @@ ComputePass::ComputePass(const Device& device_, DescriptorPool& descriptor_pool,
228 .pCode = code.data(), 230 .pCode = code.data(),
229 }); 231 });
230 device.SaveShader(code); 232 device.SaveShader(code);
233 const VkPipelineShaderStageRequiredSubgroupSizeCreateInfoEXT subgroup_size_ci{
234 .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO_EXT,
235 .pNext = nullptr,
236 .requiredSubgroupSize = optional_subgroup_size ? *optional_subgroup_size : 32U,
237 };
238 bool use_setup_size = device.IsExtSubgroupSizeControlSupported() && optional_subgroup_size;
231 pipeline = device.GetLogical().CreateComputePipeline({ 239 pipeline = device.GetLogical().CreateComputePipeline({
232 .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, 240 .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
233 .pNext = nullptr, 241 .pNext = nullptr,
234 .flags = 0, 242 .flags = 0,
235 .stage{ 243 .stage{
236 .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, 244 .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
237 .pNext = nullptr, 245 .pNext = use_setup_size ? &subgroup_size_ci : nullptr,
238 .flags = 0, 246 .flags = 0,
239 .stage = VK_SHADER_STAGE_COMPUTE_BIT, 247 .stage = VK_SHADER_STAGE_COMPUTE_BIT,
240 .module = *module, 248 .module = *module,
@@ -399,10 +407,17 @@ void ConditionalRenderingResolvePass::Resolve(VkBuffer dst_buffer, VkBuffer src_
399QueriesPrefixScanPass::QueriesPrefixScanPass( 407QueriesPrefixScanPass::QueriesPrefixScanPass(
400 const Device& device_, Scheduler& scheduler_, DescriptorPool& descriptor_pool_, 408 const Device& device_, Scheduler& scheduler_, DescriptorPool& descriptor_pool_,
401 ComputePassDescriptorQueue& compute_pass_descriptor_queue_) 409 ComputePassDescriptorQueue& compute_pass_descriptor_queue_)
402 : ComputePass(device_, descriptor_pool_, QUERIES_SCAN_DESCRIPTOR_SET_BINDINGS, 410 : ComputePass(
403 QUERIES_SCAN_DESCRIPTOR_UPDATE_TEMPLATE, QUERIES_SCAN_BANK_INFO, 411 device_, descriptor_pool_, QUERIES_SCAN_DESCRIPTOR_SET_BINDINGS,
404 COMPUTE_PUSH_CONSTANT_RANGE<sizeof(QueriesPrefixScanPushConstants)>, 412 QUERIES_SCAN_DESCRIPTOR_UPDATE_TEMPLATE, QUERIES_SCAN_BANK_INFO,
405 QUERIES_PREFIX_SCAN_SUM_COMP_SPV), 413 COMPUTE_PUSH_CONSTANT_RANGE<sizeof(QueriesPrefixScanPushConstants)>,
414 device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_BASIC_BIT) &&
415 device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_ARITHMETIC_BIT) &&
416 device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_SHUFFLE_BIT) &&
417 device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT)
418 ? std::span<const u32>(QUERIES_PREFIX_SCAN_SUM_COMP_SPV)
419 : std::span<const u32>(QUERIES_PREFIX_SCAN_SUM_NOSUBGROUPS_COMP_SPV),
420 {32}),
406 scheduler{scheduler_}, compute_pass_descriptor_queue{compute_pass_descriptor_queue_} {} 421 scheduler{scheduler_}, compute_pass_descriptor_queue{compute_pass_descriptor_queue_} {}
407 422
408void QueriesPrefixScanPass::Run(VkBuffer accumulation_buffer, VkBuffer dst_buffer, 423void QueriesPrefixScanPass::Run(VkBuffer accumulation_buffer, VkBuffer dst_buffer,
diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.h b/src/video_core/renderer_vulkan/vk_compute_pass.h
index e6ff86e9a..68ffb1b82 100644
--- a/src/video_core/renderer_vulkan/vk_compute_pass.h
+++ b/src/video_core/renderer_vulkan/vk_compute_pass.h
@@ -3,6 +3,7 @@
3 3
4#pragma once 4#pragma once
5 5
6#include <optional>
6#include <span> 7#include <span>
7#include <utility> 8#include <utility>
8 9
@@ -31,7 +32,8 @@ public:
31 vk::Span<VkDescriptorSetLayoutBinding> bindings, 32 vk::Span<VkDescriptorSetLayoutBinding> bindings,
32 vk::Span<VkDescriptorUpdateTemplateEntry> templates, 33 vk::Span<VkDescriptorUpdateTemplateEntry> templates,
33 const DescriptorBankInfo& bank_info, 34 const DescriptorBankInfo& bank_info,
34 vk::Span<VkPushConstantRange> push_constants, std::span<const u32> code); 35 vk::Span<VkPushConstantRange> push_constants, std::span<const u32> code,
36 std::optional<u32> optional_subgroup_size = std::nullopt);
35 ~ComputePass(); 37 ~ComputePass();
36 38
37protected: 39protected:
diff --git a/src/video_core/renderer_vulkan/vk_query_cache.cpp b/src/video_core/renderer_vulkan/vk_query_cache.cpp
index ded190ae0..825e1a72e 100644
--- a/src/video_core/renderer_vulkan/vk_query_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_query_cache.cpp
@@ -1376,10 +1376,10 @@ bool QueryCacheRuntime::HostConditionalRenderingCompareValues(VideoCommon::Looku
1376 return true; 1376 return true;
1377 } 1377 }
1378 } 1378 }
1379 /*if (!is_in_bc[0] && !is_in_bc[1]) { 1379 if (!is_in_bc[0] && !is_in_bc[1]) {
1380 // Both queries are in query cache, it's best to just flush. 1380 // Both queries are in query cache, it's best to just flush.
1381 return true; 1381 return true;
1382 }*/ 1382 }
1383 HostConditionalRenderingCompareBCImpl(object_1.address, equal_check); 1383 HostConditionalRenderingCompareBCImpl(object_1.address, equal_check);
1384 return true; 1384 return true;
1385} 1385}