summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorGravatar Fernando Sahmkow2023-08-20 17:53:08 +0200
committerGravatar Fernando Sahmkow2023-09-23 23:05:30 +0200
commitc8237d5c312485394389b2520451ef720604ea9a (patch)
tree1a1064ed38a7a53bd61e4c04bf4571cdebfce2ec /src
parentQuery Cache: Fix guest side sample counting (diff)
downloadyuzu-c8237d5c312485394389b2520451ef720604ea9a.tar.gz
yuzu-c8237d5c312485394389b2520451ef720604ea9a.tar.xz
yuzu-c8237d5c312485394389b2520451ef720604ea9a.zip
Query Cache: Implement host side sample counting.
Diffstat (limited to 'src')
-rw-r--r--src/video_core/host_shaders/CMakeLists.txt1
-rw-r--r--src/video_core/host_shaders/queries_prefix_scan_sum.comp124
-rw-r--r--src/video_core/renderer_vulkan/vk_compute_pass.cpp110
-rw-r--r--src/video_core/renderer_vulkan/vk_compute_pass.h14
-rw-r--r--src/video_core/renderer_vulkan/vk_query_cache.cpp147
5 files changed, 348 insertions, 48 deletions
diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt
index fb24b6532..8218ec4c8 100644
--- a/src/video_core/host_shaders/CMakeLists.txt
+++ b/src/video_core/host_shaders/CMakeLists.txt
@@ -41,6 +41,7 @@ set(SHADER_FILES
41 pitch_unswizzle.comp 41 pitch_unswizzle.comp
42 present_bicubic.frag 42 present_bicubic.frag
43 present_gaussian.frag 43 present_gaussian.frag
44 queries_prefix_scan_sum.comp
44 resolve_conditional_render.comp 45 resolve_conditional_render.comp
45 smaa_edge_detection.vert 46 smaa_edge_detection.vert
46 smaa_edge_detection.frag 47 smaa_edge_detection.frag
diff --git a/src/video_core/host_shaders/queries_prefix_scan_sum.comp b/src/video_core/host_shaders/queries_prefix_scan_sum.comp
new file mode 100644
index 000000000..dce1279fe
--- /dev/null
+++ b/src/video_core/host_shaders/queries_prefix_scan_sum.comp
@@ -0,0 +1,124 @@
1// SPDX-FileCopyrightText: Copyright 2015 Graham Sellers, Richard Wright Jr. and Nicholas Haemel
2// SPDX-License-Identifier: MIT
3
4// Code obtained from OpenGL SuperBible, Seventh Edition by Graham Sellers, Richard Wright Jr. and
5// Nicholas Haemel. Modified to suit this project's needs and to optimize for subgroup operations.
6
7#version 460 core
8
9#ifdef VULKAN
10
11#extension GL_KHR_shader_subgroup_arithmetic : enable
12#define HAS_EXTENDED_TYPES 1
13#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
14#define END_PUSH_CONSTANTS \
15 } \
16 ;
17#define UNIFORM(n)
18#define BINDING_INPUT_BUFFER 0
19#define BINDING_OUTPUT_IMAGE 1
20
21#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv
22
23#extension GL_KHR_shader_subgroup_arithmetic : enable
24#extension GL_NV_gpu_shader5 : enable
25#ifdef GL_NV_gpu_shader5
26#define HAS_EXTENDED_TYPES 1
27#else
28#define HAS_EXTENDED_TYPES 0
29#endif
30#define BEGIN_PUSH_CONSTANTS
31#define END_PUSH_CONSTANTS
32#define UNIFORM(n) layout(location = n) uniform
33#define BINDING_INPUT_BUFFER 0
34#define BINDING_OUTPUT_IMAGE 0
35
36#endif
37
38BEGIN_PUSH_CONSTANTS
39UNIFORM(0) uint max_accumulation_base;
40UNIFORM(1) uint accumulation_limit;
41END_PUSH_CONSTANTS
42
43layout(local_size_x = 32) in;
44
// Source values of the scan, one packed 64-bit value (low word in .x, high word in .y) per
// element. Each invocation reads elements [id * 2] and [id * 2 + 1], which reaches past
// gl_WorkGroupSize.x entries, so the array is runtime-sized instead of being capped at that size.
layout(std430, binding = 0) readonly buffer block1 {
    uvec2 input_data[];
};

// Destination of the scan; written with the same indexing scheme as input_data.
layout(std430, binding = 1) writeonly coherent buffer block2 {
    uvec2 output_data[];
};
52
53layout(std430, binding = 2) coherent buffer block3 {
54 uvec2 accumulated_data;
55};
56
57shared uvec2 shared_data[gl_WorkGroupSize.x * 2];
58
// Adds two 64-bit unsigned integers stored as uvec2(low word, high word).
// The carry out of the low-word addition is folded into the high word; any overflow out of the
// high word is discarded, so the result wraps modulo 2^64.
uvec2 AddUint64(uvec2 value_1, uvec2 value_2) {
    uint low_carry;
    const uint low_word = uaddCarry(value_1.x, value_2.x, low_carry);
    const uint high_word = value_1.y + value_2.y + low_carry;
    return uvec2(low_word, high_word);
}
66
// Returns value_1 - value_2 for 64-bit unsigned integers stored as uvec2(low word, high word),
// wrapping modulo 2^64. The borrow out of the low-word subtraction is propagated into the high
// word; negating each 32-bit word independently and adding does not do this correctly.
uvec2 SubUint64(uvec2 value_1, uvec2 value_2) {
    uint borrow;
    uvec2 result;
    result.x = usubBorrow(value_1.x, value_2.x, borrow);
    result.y = value_1.y - value_2.y - borrow;
    return result;
}

// Prefix scan (running sum) over 2 * gl_WorkGroupSize.x packed 64-bit values.
// Elements whose index is below max_accumulation_base additionally receive the value currently
// stored in accumulated_data. Invocation 0 finally stores the new accumulation value.
// NOTE(review): indexing only uses gl_LocalInvocationID, so every dispatched work group operates
// on the same elements - confirm this is intended when more than one group is dispatched.
void main(void) {
    const uint id = gl_LocalInvocationID.x;
    // Each invocation is responsible for two consecutive elements.
    const uint first = id * 2;
    const uint second = first + 1;
    const uvec2 base_value_1 = first < max_accumulation_base ? accumulated_data : uvec2(0);
    const uvec2 base_value_2 = second < max_accumulation_base ? accumulated_data : uvec2(0);
    // The number of steps is log2 of the element count (twice the work group size, which
    // should be a power of 2). findMSB is exact, unlike a float log2 followed by truncation.
    const uint steps = uint(findMSB(gl_WorkGroupSize.x)) + 1;

    shared_data[first] = input_data[first];
    shared_data[second] = input_data[second];
    // Synchronize to make sure that everyone has initialized their elements of shared_data[]
    // with data loaded from the input array.
    barrier();
    memoryBarrierShared();

    for (uint step = 0; step < steps; step++) {
        // Calculate the read and write index in the shared array.
        const uint mask = (1u << step) - 1u;
        const uint rd_id = ((id >> step) << (step + 1)) + mask;
        const uint wr_id = rd_id + 1 + (id & mask);
        // Accumulate the read data into our element.
        shared_data[wr_id] = AddUint64(shared_data[rd_id], shared_data[wr_id]);
        // Synchronize again to make sure that everyone has caught up with us.
        barrier();
        memoryBarrierShared();
    }

    // Add the accumulation carried in from accumulated_data.
    shared_data[first] = AddUint64(shared_data[first], base_value_1);
    shared_data[second] = AddUint64(shared_data[second], base_value_2);
    // Invocation 0 reads elements written by other invocations below.
    barrier();
    memoryBarrierShared();

    // Finally write our data back to the output buffer.
    output_data[first] = shared_data[first];
    output_data[second] = shared_data[second];
    if (id == 0) {
        if (max_accumulation_base >= accumulation_limit + 1) {
            accumulated_data = shared_data[accumulation_limit];
            return;
        }
        const uvec2 value_1 = shared_data[max_accumulation_base];
        const uvec2 value_2 = shared_data[accumulation_limit];
        // Full 64-bit subtraction with borrow (value_1 - value_2).
        accumulated_data = SubUint64(value_1, value_2);
    }
}
diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp
index 039dc95e1..a1af08cda 100644
--- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp
+++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp
@@ -12,6 +12,7 @@
12#include "common/common_types.h" 12#include "common/common_types.h"
13#include "common/div_ceil.h" 13#include "common/div_ceil.h"
14#include "video_core/host_shaders/astc_decoder_comp_spv.h" 14#include "video_core/host_shaders/astc_decoder_comp_spv.h"
15#include "video_core/host_shaders/queries_prefix_scan_sum_comp_spv.h"
15#include "video_core/host_shaders/resolve_conditional_render_comp_spv.h" 16#include "video_core/host_shaders/resolve_conditional_render_comp_spv.h"
16#include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h" 17#include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h"
17#include "video_core/host_shaders/vulkan_uint8_comp_spv.h" 18#include "video_core/host_shaders/vulkan_uint8_comp_spv.h"
@@ -58,6 +59,30 @@ constexpr std::array<VkDescriptorSetLayoutBinding, 2> INPUT_OUTPUT_DESCRIPTOR_SE
58 }, 59 },
59}}; 60}};
60 61
62constexpr std::array<VkDescriptorSetLayoutBinding, 3> QUERIES_SCAN_DESCRIPTOR_SET_BINDINGS{{
63 {
64 .binding = 0,
65 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
66 .descriptorCount = 1,
67 .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
68 .pImmutableSamplers = nullptr,
69 },
70 {
71 .binding = 1,
72 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
73 .descriptorCount = 1,
74 .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
75 .pImmutableSamplers = nullptr,
76 },
77 {
78 .binding = 2,
79 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
80 .descriptorCount = 1,
81 .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
82 .pImmutableSamplers = nullptr,
83 },
84}};
85
61constexpr DescriptorBankInfo INPUT_OUTPUT_BANK_INFO{ 86constexpr DescriptorBankInfo INPUT_OUTPUT_BANK_INFO{
62 .uniform_buffers = 0, 87 .uniform_buffers = 0,
63 .storage_buffers = 2, 88 .storage_buffers = 2,
@@ -68,6 +93,16 @@ constexpr DescriptorBankInfo INPUT_OUTPUT_BANK_INFO{
68 .score = 2, 93 .score = 2,
69}; 94};
70 95
96constexpr DescriptorBankInfo QUERIES_SCAN_BANK_INFO{
97 .uniform_buffers = 0,
98 .storage_buffers = 3,
99 .texture_buffers = 0,
100 .image_buffers = 0,
101 .textures = 0,
102 .images = 0,
103 .score = 3,
104};
105
71constexpr std::array<VkDescriptorSetLayoutBinding, ASTC_NUM_BINDINGS> ASTC_DESCRIPTOR_SET_BINDINGS{{ 106constexpr std::array<VkDescriptorSetLayoutBinding, ASTC_NUM_BINDINGS> ASTC_DESCRIPTOR_SET_BINDINGS{{
72 { 107 {
73 .binding = ASTC_BINDING_INPUT_BUFFER, 108 .binding = ASTC_BINDING_INPUT_BUFFER,
@@ -104,6 +139,15 @@ constexpr VkDescriptorUpdateTemplateEntry INPUT_OUTPUT_DESCRIPTOR_UPDATE_TEMPLAT
104 .stride = sizeof(DescriptorUpdateEntry), 139 .stride = sizeof(DescriptorUpdateEntry),
105}; 140};
106 141
142constexpr VkDescriptorUpdateTemplateEntry QUERIES_SCAN_DESCRIPTOR_UPDATE_TEMPLATE{
143 .dstBinding = 0,
144 .dstArrayElement = 0,
145 .descriptorCount = 3,
146 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
147 .offset = 0,
148 .stride = sizeof(DescriptorUpdateEntry),
149};
150
107constexpr std::array<VkDescriptorUpdateTemplateEntry, ASTC_NUM_BINDINGS> 151constexpr std::array<VkDescriptorUpdateTemplateEntry, ASTC_NUM_BINDINGS>
108 ASTC_PASS_DESCRIPTOR_UPDATE_TEMPLATE_ENTRY{{ 152 ASTC_PASS_DESCRIPTOR_UPDATE_TEMPLATE_ENTRY{{
109 { 153 {
@@ -132,6 +176,11 @@ struct AstcPushConstants {
132 u32 block_height; 176 u32 block_height;
133 u32 block_height_mask; 177 u32 block_height_mask;
134}; 178};
179
180struct QueriesPrefixScanPushConstants {
181 u32 max_accumulation_base;
182 u32 accumulation_limit;
183};
135} // Anonymous namespace 184} // Anonymous namespace
136 185
137ComputePass::ComputePass(const Device& device_, DescriptorPool& descriptor_pool, 186ComputePass::ComputePass(const Device& device_, DescriptorPool& descriptor_pool,
@@ -313,8 +362,6 @@ ConditionalRenderingResolvePass::ConditionalRenderingResolvePass(
313 362
314void ConditionalRenderingResolvePass::Resolve(VkBuffer dst_buffer, VkBuffer src_buffer, 363void ConditionalRenderingResolvePass::Resolve(VkBuffer dst_buffer, VkBuffer src_buffer,
315 u32 src_offset, bool compare_to_zero) { 364 u32 src_offset, bool compare_to_zero) {
316 scheduler.RequestOutsideRenderPassOperationContext();
317
318 const size_t compare_size = compare_to_zero ? 8 : 24; 365 const size_t compare_size = compare_to_zero ? 8 : 24;
319 366
320 compute_pass_descriptor_queue.Acquire(); 367 compute_pass_descriptor_queue.Acquire();
@@ -327,7 +374,7 @@ void ConditionalRenderingResolvePass::Resolve(VkBuffer dst_buffer, VkBuffer src_
327 static constexpr VkMemoryBarrier read_barrier{ 374 static constexpr VkMemoryBarrier read_barrier{
328 .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, 375 .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
329 .pNext = nullptr, 376 .pNext = nullptr,
330 .srcAccessMask = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 377 .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_SHADER_WRITE_BIT,
331 .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, 378 .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT,
332 }; 379 };
333 static constexpr VkMemoryBarrier write_barrier{ 380 static constexpr VkMemoryBarrier write_barrier{
@@ -349,6 +396,63 @@ void ConditionalRenderingResolvePass::Resolve(VkBuffer dst_buffer, VkBuffer src_
349 }); 396 });
350} 397}
351 398
399QueriesPrefixScanPass::QueriesPrefixScanPass(
400 const Device& device_, Scheduler& scheduler_, DescriptorPool& descriptor_pool_,
401 ComputePassDescriptorQueue& compute_pass_descriptor_queue_)
402 : ComputePass(device_, descriptor_pool_, QUERIES_SCAN_DESCRIPTOR_SET_BINDINGS,
403 QUERIES_SCAN_DESCRIPTOR_UPDATE_TEMPLATE, QUERIES_SCAN_BANK_INFO,
404 COMPUTE_PUSH_CONSTANT_RANGE<sizeof(QueriesPrefixScanPushConstants)>,
405 QUERIES_PREFIX_SCAN_SUM_COMP_SPV),
406 scheduler{scheduler_}, compute_pass_descriptor_queue{compute_pass_descriptor_queue_} {}
407
408void QueriesPrefixScanPass::Run(VkBuffer accumulation_buffer, VkBuffer dst_buffer,
409 VkBuffer src_buffer, size_t number_of_sums,
410 size_t max_accumulation_limit) {
411 size_t aligned_runs = Common::AlignUp(number_of_sums, 32);
412
413 compute_pass_descriptor_queue.Acquire();
414 compute_pass_descriptor_queue.AddBuffer(src_buffer, 0, aligned_runs * sizeof(u64));
415 compute_pass_descriptor_queue.AddBuffer(dst_buffer, 0, aligned_runs * sizeof(u64));
416 compute_pass_descriptor_queue.AddBuffer(accumulation_buffer, 0, sizeof(u64));
417 const void* const descriptor_data{compute_pass_descriptor_queue.UpdateData()};
418
419 scheduler.RequestOutsideRenderPassOperationContext();
420 scheduler.Record([this, descriptor_data, max_accumulation_limit, number_of_sums,
421 aligned_runs](vk::CommandBuffer cmdbuf) {
422 static constexpr VkMemoryBarrier read_barrier{
423 .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
424 .pNext = nullptr,
425 .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
426 .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT,
427 };
428 static constexpr VkMemoryBarrier write_barrier{
429 .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
430 .pNext = nullptr,
431 .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
432 .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_TRANSFER_READ_BIT |
433 VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT |
434 VK_ACCESS_INDIRECT_COMMAND_READ_BIT | VK_ACCESS_INDEX_READ_BIT |
435 VK_ACCESS_UNIFORM_READ_BIT |
436 VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT,
437 };
438 const QueriesPrefixScanPushConstants uniforms{
439 .max_accumulation_base = static_cast<u32>(max_accumulation_limit),
440 .accumulation_limit = static_cast<u32>(number_of_sums - 1),
441 };
442 const VkDescriptorSet set = descriptor_allocator.Commit();
443 device.GetLogical().UpdateDescriptorSet(set, *descriptor_template, descriptor_data);
444
445 cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
446 VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, read_barrier);
447 cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
448 cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *layout, 0, set, {});
449 cmdbuf.PushConstants(*layout, VK_SHADER_STAGE_COMPUTE_BIT, uniforms);
450 cmdbuf.Dispatch(static_cast<u32>(aligned_runs / 32U), 1, 1);
451 cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
452 VK_PIPELINE_STAGE_CONDITIONAL_RENDERING_BIT_EXT, 0, write_barrier);
453 });
454}
455
352ASTCDecoderPass::ASTCDecoderPass(const Device& device_, Scheduler& scheduler_, 456ASTCDecoderPass::ASTCDecoderPass(const Device& device_, Scheduler& scheduler_,
353 DescriptorPool& descriptor_pool_, 457 DescriptorPool& descriptor_pool_,
354 StagingBufferPool& staging_buffer_pool_, 458 StagingBufferPool& staging_buffer_pool_,
diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.h b/src/video_core/renderer_vulkan/vk_compute_pass.h
index c62f30d30..e6ff86e9a 100644
--- a/src/video_core/renderer_vulkan/vk_compute_pass.h
+++ b/src/video_core/renderer_vulkan/vk_compute_pass.h
@@ -95,6 +95,20 @@ private:
95 ComputePassDescriptorQueue& compute_pass_descriptor_queue; 95 ComputePassDescriptorQueue& compute_pass_descriptor_queue;
96}; 96};
97 97
98class QueriesPrefixScanPass final : public ComputePass {
99public:
100 explicit QueriesPrefixScanPass(const Device& device_, Scheduler& scheduler_,
101 DescriptorPool& descriptor_pool_,
102 ComputePassDescriptorQueue& compute_pass_descriptor_queue_);
103
104 void Run(VkBuffer accumulation_buffer, VkBuffer dst_buffer, VkBuffer src_buffer,
105 size_t number_of_sums, size_t max_accumulation_limit);
106
107private:
108 Scheduler& scheduler;
109 ComputePassDescriptorQueue& compute_pass_descriptor_queue;
110};
111
98class ASTCDecoderPass final : public ComputePass { 112class ASTCDecoderPass final : public ComputePass {
99public: 113public:
100 explicit ASTCDecoderPass(const Device& device_, Scheduler& scheduler_, 114 explicit ASTCDecoderPass(const Device& device_, Scheduler& scheduler_,
diff --git a/src/video_core/renderer_vulkan/vk_query_cache.cpp b/src/video_core/renderer_vulkan/vk_query_cache.cpp
index 2147776f8..ded190ae0 100644
--- a/src/video_core/renderer_vulkan/vk_query_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_query_cache.cpp
@@ -11,6 +11,7 @@
11#include <utility> 11#include <utility>
12#include <vector> 12#include <vector>
13 13
14#include "common/bit_util.h"
14#include "common/common_types.h" 15#include "common/common_types.h"
15#include "core/memory.h" 16#include "core/memory.h"
16#include "video_core/engines/draw_manager.h" 17#include "video_core/engines/draw_manager.h"
@@ -112,14 +113,34 @@ class SamplesStreamer : public BaseStreamer {
112public: 113public:
113 explicit SamplesStreamer(size_t id_, QueryCacheRuntime& runtime_, 114 explicit SamplesStreamer(size_t id_, QueryCacheRuntime& runtime_,
114 VideoCore::RasterizerInterface* rasterizer_, const Device& device_, 115 VideoCore::RasterizerInterface* rasterizer_, const Device& device_,
115 Scheduler& scheduler_, const MemoryAllocator& memory_allocator_) 116 Scheduler& scheduler_, const MemoryAllocator& memory_allocator_,
117 ComputePassDescriptorQueue& compute_pass_descriptor_queue,
118 DescriptorPool& descriptor_pool)
116 : BaseStreamer(id_), runtime{runtime_}, rasterizer{rasterizer_}, device{device_}, 119 : BaseStreamer(id_), runtime{runtime_}, rasterizer{rasterizer_}, device{device_},
117 scheduler{scheduler_}, memory_allocator{memory_allocator_} { 120 scheduler{scheduler_}, memory_allocator{memory_allocator_} {
118 BuildResolveBuffer();
119 current_bank = nullptr; 121 current_bank = nullptr;
120 current_query = nullptr; 122 current_query = nullptr;
121 ammend_value = 0; 123 ammend_value = 0;
122 acumulation_value = 0; 124 acumulation_value = 0;
125 queries_prefix_scan_pass = std::make_unique<QueriesPrefixScanPass>(
126 device, scheduler, descriptor_pool, compute_pass_descriptor_queue);
127
128 const VkBufferCreateInfo buffer_ci = {
129 .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
130 .pNext = nullptr,
131 .flags = 0,
132 .size = 8,
133 .usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT |
134 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
135 .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
136 .queueFamilyIndexCount = 0,
137 .pQueueFamilyIndices = nullptr,
138 };
139 accumulation_buffer = memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal);
140 scheduler.RequestOutsideRenderPassOperationContext();
141 scheduler.Record([buffer = *accumulation_buffer](vk::CommandBuffer cmdbuf) {
142 cmdbuf.FillBuffer(buffer, 0, 8, 0);
143 });
123 } 144 }
124 145
125 ~SamplesStreamer() = default; 146 ~SamplesStreamer() = default;
@@ -159,6 +180,8 @@ public:
159 acumulation_value = 0; 180 acumulation_value = 0;
160 }); 181 });
161 rasterizer->SyncOperation(std::move(func)); 182 rasterizer->SyncOperation(std::move(func));
183 accumulation_since_last_sync = false;
184 last_accumulation_checkpoint = std::min(last_accumulation_checkpoint, num_slots_used);
162 } 185 }
163 186
164 void CloseCounter() override { 187 void CloseCounter() override {
@@ -175,7 +198,8 @@ public:
175 } 198 }
176 199
177 for (size_t i = 0; i < sync_values_stash.size(); i++) { 200 for (size_t i = 0; i < sync_values_stash.size(); i++) {
178 runtime.template SyncValues<HostSyncValues>(sync_values_stash[i], *resolve_buffers[i]); 201 runtime.template SyncValues<HostSyncValues>(sync_values_stash[i],
202 *buffers[resolve_buffers[i]]);
179 } 203 }
180 204
181 sync_values_stash.clear(); 205 sync_values_stash.clear();
@@ -189,36 +213,21 @@ public:
189 sync_values_stash.clear(); 213 sync_values_stash.clear();
190 sync_values_stash.emplace_back(); 214 sync_values_stash.emplace_back();
191 std::vector<HostSyncValues>* sync_values = &sync_values_stash.back(); 215 std::vector<HostSyncValues>* sync_values = &sync_values_stash.back();
192 sync_values->reserve(resolve_slots * SamplesQueryBank::BANK_SIZE); 216 sync_values->reserve(num_slots_used);
193 std::unordered_map<size_t, std::pair<size_t, size_t>> offsets; 217 std::unordered_map<size_t, std::pair<size_t, size_t>> offsets;
194 size_t this_bank_slot = std::numeric_limits<size_t>::max(); 218 resolve_buffers.clear();
195 size_t resolve_slots_remaining = resolve_slots; 219 size_t resolve_buffer_index = ObtainBuffer<true>(num_slots_used);
196 size_t resolve_buffer_index = 0; 220 resolve_buffers.push_back(resolve_buffer_index);
221 size_t base_offset = 0;
222
197 ApplyBanksWideOp<true>(pending_sync, [&](SamplesQueryBank* bank, size_t start, 223 ApplyBanksWideOp<true>(pending_sync, [&](SamplesQueryBank* bank, size_t start,
198 size_t amount) { 224 size_t amount) {
199 size_t bank_id = bank->GetIndex(); 225 size_t bank_id = bank->GetIndex();
200 if (this_bank_slot != bank_id) { 226 auto& resolve_buffer = buffers[resolve_buffer_index];
201 this_bank_slot = bank_id;
202 if (resolve_slots_remaining == 0) {
203 resolve_buffer_index++;
204 if (resolve_buffer_index >= resolve_buffers.size()) {
205 BuildResolveBuffer();
206 }
207 resolve_slots_remaining = resolve_slots;
208 sync_values_stash.emplace_back();
209 sync_values = &sync_values_stash.back();
210 sync_values->reserve(resolve_slots * SamplesQueryBank::BANK_SIZE);
211 }
212 resolve_slots_remaining--;
213 }
214 auto& resolve_buffer = resolve_buffers[resolve_buffer_index];
215 const size_t base_offset = SamplesQueryBank::QUERY_SIZE * SamplesQueryBank::BANK_SIZE *
216 (resolve_slots - resolve_slots_remaining - 1);
217 VkQueryPool query_pool = bank->GetInnerPool(); 227 VkQueryPool query_pool = bank->GetInnerPool();
218 scheduler.RequestOutsideRenderPassOperationContext(); 228 scheduler.RequestOutsideRenderPassOperationContext();
219 scheduler.Record([start, amount, base_offset, query_pool, 229 scheduler.Record([start, amount, base_offset, query_pool,
220 buffer = *resolve_buffer](vk::CommandBuffer cmdbuf) { 230 buffer = *resolve_buffer](vk::CommandBuffer cmdbuf) {
221 size_t final_offset = base_offset + start * SamplesQueryBank::QUERY_SIZE;
222 const VkBufferMemoryBarrier copy_query_pool_barrier{ 231 const VkBufferMemoryBarrier copy_query_pool_barrier{
223 .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, 232 .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
224 .pNext = nullptr, 233 .pNext = nullptr,
@@ -227,39 +236,60 @@ public:
227 .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, 236 .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
228 .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, 237 .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
229 .buffer = buffer, 238 .buffer = buffer,
230 .offset = final_offset, 239 .offset = base_offset,
231 .size = amount * SamplesQueryBank::QUERY_SIZE, 240 .size = amount * SamplesQueryBank::QUERY_SIZE,
232 }; 241 };
233 242
234 cmdbuf.CopyQueryPoolResults( 243 cmdbuf.CopyQueryPoolResults(
235 query_pool, static_cast<u32>(start), static_cast<u32>(amount), buffer, 244 query_pool, static_cast<u32>(start), static_cast<u32>(amount), buffer,
236 static_cast<u32>(final_offset), SamplesQueryBank::QUERY_SIZE, 245 static_cast<u32>(base_offset), SamplesQueryBank::QUERY_SIZE,
237 VK_QUERY_RESULT_WAIT_BIT | VK_QUERY_RESULT_64_BIT); 246 VK_QUERY_RESULT_WAIT_BIT | VK_QUERY_RESULT_64_BIT);
238 cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, 247 cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT,
239 VK_PIPELINE_STAGE_TRANSFER_BIT, 0, copy_query_pool_barrier); 248 VK_PIPELINE_STAGE_TRANSFER_BIT, 0, copy_query_pool_barrier);
240 }); 249 });
241 offsets[bank_id] = {sync_values_stash.size() - 1, base_offset}; 250 offsets[bank_id] = {start, base_offset};
251 base_offset += amount * SamplesQueryBank::QUERY_SIZE;
242 }); 252 });
243 253
244 // Convert queries 254 // Convert queries
255 bool has_multi_queries = false;
245 for (auto q : pending_sync) { 256 for (auto q : pending_sync) {
246 auto* query = GetQuery(q); 257 auto* query = GetQuery(q);
258 size_t sync_value_slot = 0;
247 if (True(query->flags & VideoCommon::QueryFlagBits::IsRewritten)) { 259 if (True(query->flags & VideoCommon::QueryFlagBits::IsRewritten)) {
248 continue; 260 continue;
249 } 261 }
250 if (True(query->flags & VideoCommon::QueryFlagBits::IsInvalidated)) { 262 if (True(query->flags & VideoCommon::QueryFlagBits::IsInvalidated)) {
251 continue; 263 continue;
252 } 264 }
253 if (query->size_slots > 1) { 265 if (accumulation_since_last_sync || query->size_slots > 1) {
254 // This is problematic. 266 if (!has_multi_queries) {
255 // UNIMPLEMENTED(); 267 has_multi_queries = true;
268 sync_values_stash.emplace_back();
269 }
270 sync_value_slot = 1;
256 } 271 }
257 query->flags |= VideoCommon::QueryFlagBits::IsHostSynced; 272 query->flags |= VideoCommon::QueryFlagBits::IsHostSynced;
258 auto loc_data = offsets[query->start_bank_id]; 273 auto loc_data = offsets[query->start_bank_id];
259 sync_values_stash[loc_data.first].emplace_back(HostSyncValues{ 274 sync_values_stash[sync_value_slot].emplace_back(HostSyncValues{
260 .address = query->guest_address, 275 .address = query->guest_address,
261 .size = SamplesQueryBank::QUERY_SIZE, 276 .size = SamplesQueryBank::QUERY_SIZE,
262 .offset = loc_data.second + query->start_slot * SamplesQueryBank::QUERY_SIZE, 277 .offset =
278 loc_data.second + (query->start_slot - loc_data.first + query->size_slots - 1) *
279 SamplesQueryBank::QUERY_SIZE,
280 });
281 }
282
283 if (has_multi_queries) {
284 size_t intermediary_buffer_index = ObtainBuffer<false>(num_slots_used);
285 resolve_buffers.push_back(intermediary_buffer_index);
286 queries_prefix_scan_pass->Run(*accumulation_buffer, *buffers[intermediary_buffer_index],
287 *buffers[resolve_buffer_index], num_slots_used,
288 std::min(last_accumulation_checkpoint, num_slots_used));
289 } else {
290 scheduler.RequestOutsideRenderPassOperationContext();
291 scheduler.Record([buffer = *accumulation_buffer](vk::CommandBuffer cmdbuf) {
292 cmdbuf.FillBuffer(buffer, 0, 8, 0);
263 }); 293 });
264 } 294 }
265 295
@@ -267,6 +297,9 @@ public:
267 std::function<void()> func([this] { ammend_value = acumulation_value; }); 297 std::function<void()> func([this] { ammend_value = acumulation_value; });
268 rasterizer->SyncOperation(std::move(func)); 298 rasterizer->SyncOperation(std::move(func));
269 AbandonCurrentQuery(); 299 AbandonCurrentQuery();
300 num_slots_used = 0;
301 last_accumulation_checkpoint = std::numeric_limits<size_t>::max();
302 accumulation_since_last_sync = has_multi_queries;
270 pending_sync.clear(); 303 pending_sync.clear();
271 } 304 }
272 305
@@ -400,6 +433,7 @@ private:
400 void ReserveHostQuery() { 433 void ReserveHostQuery() {
401 size_t new_slot = ReserveBankSlot(); 434 size_t new_slot = ReserveBankSlot();
402 current_bank->AddReference(1); 435 current_bank->AddReference(1);
436 num_slots_used++;
403 if (current_query) { 437 if (current_query) {
404 size_t bank_id = current_query->start_bank_id; 438 size_t bank_id = current_query->start_bank_id;
405 size_t banks_set = current_query->size_banks - 1; 439 size_t banks_set = current_query->size_banks - 1;
@@ -470,32 +504,50 @@ private:
470 }); 504 });
471 } 505 }
472 506
473 void BuildResolveBuffer() { 507 template <bool is_resolve>
508 size_t ObtainBuffer(size_t num_needed) {
509 const size_t log_2 = std::max<size_t>(6U, Common::Log2Ceil64(num_needed));
510 if constexpr (is_resolve) {
511 if (resolve_table[log_2] != 0) {
512 return resolve_table[log_2] - 1;
513 }
514 } else {
515 if (intermediary_table[log_2] != 0) {
516 return intermediary_table[log_2] - 1;
517 }
518 }
474 const VkBufferCreateInfo buffer_ci = { 519 const VkBufferCreateInfo buffer_ci = {
475 .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, 520 .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
476 .pNext = nullptr, 521 .pNext = nullptr,
477 .flags = 0, 522 .flags = 0,
478 .size = SamplesQueryBank::QUERY_SIZE * SamplesQueryBank::BANK_SIZE * resolve_slots, 523 .size = SamplesQueryBank::QUERY_SIZE * (1ULL << log_2),
479 .usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | 524 .usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT |
480 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, 525 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
481 .sharingMode = VK_SHARING_MODE_EXCLUSIVE, 526 .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
482 .queueFamilyIndexCount = 0, 527 .queueFamilyIndexCount = 0,
483 .pQueueFamilyIndices = nullptr, 528 .pQueueFamilyIndices = nullptr,
484 }; 529 };
485 resolve_buffers.emplace_back( 530 buffers.emplace_back(memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal));
486 memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal)); 531 if constexpr (is_resolve) {
532 resolve_table[log_2] = buffers.size();
533 } else {
534 intermediary_table[log_2] = buffers.size();
535 }
536 return buffers.size() - 1;
487 } 537 }
488 538
489 static constexpr size_t resolve_slots = 8;
490
491 QueryCacheRuntime& runtime; 539 QueryCacheRuntime& runtime;
492 VideoCore::RasterizerInterface* rasterizer; 540 VideoCore::RasterizerInterface* rasterizer;
493 const Device& device; 541 const Device& device;
494 Scheduler& scheduler; 542 Scheduler& scheduler;
495 const MemoryAllocator& memory_allocator; 543 const MemoryAllocator& memory_allocator;
496 VideoCommon::BankPool<SamplesQueryBank> bank_pool; 544 VideoCommon::BankPool<SamplesQueryBank> bank_pool;
497 std::deque<vk::Buffer> resolve_buffers; 545 std::deque<vk::Buffer> buffers;
546 std::array<size_t, 32> resolve_table{};
547 std::array<size_t, 32> intermediary_table{};
548 vk::Buffer accumulation_buffer;
498 std::deque<std::vector<HostSyncValues>> sync_values_stash; 549 std::deque<std::vector<HostSyncValues>> sync_values_stash;
550 std::vector<size_t> resolve_buffers;
499 551
500 // syncing queue 552 // syncing queue
501 std::vector<size_t> pending_sync; 553 std::vector<size_t> pending_sync;
@@ -510,10 +562,14 @@ private:
510 SamplesQueryBank* current_bank; 562 SamplesQueryBank* current_bank;
511 VkQueryPool current_query_pool; 563 VkQueryPool current_query_pool;
512 size_t current_query_id; 564 size_t current_query_id;
565 size_t num_slots_used{};
566 size_t last_accumulation_checkpoint{};
567 bool accumulation_since_last_sync{};
513 VideoCommon::HostQueryBase* current_query; 568 VideoCommon::HostQueryBase* current_query;
514 bool has_started{}; 569 bool has_started{};
515 bool current_unset{};
516 std::mutex flush_guard; 570 std::mutex flush_guard;
571
572 std::unique_ptr<QueriesPrefixScanPass> queries_prefix_scan_pass;
517}; 573};
518 574
519// Transform feedback queries 575// Transform feedback queries
@@ -1090,7 +1146,8 @@ struct QueryCacheRuntimeImpl {
1090 memory_allocator{memory_allocator_}, scheduler{scheduler_}, staging_pool{staging_pool_}, 1146 memory_allocator{memory_allocator_}, scheduler{scheduler_}, staging_pool{staging_pool_},
1091 guest_streamer(0, runtime), 1147 guest_streamer(0, runtime),
1092 sample_streamer(static_cast<size_t>(QueryType::ZPassPixelCount64), runtime, rasterizer, 1148 sample_streamer(static_cast<size_t>(QueryType::ZPassPixelCount64), runtime, rasterizer,
1093 device, scheduler, memory_allocator), 1149 device, scheduler, memory_allocator, compute_pass_descriptor_queue,
1150 descriptor_pool),
1094 tfb_streamer(static_cast<size_t>(QueryType::StreamingByteCount), runtime, device, 1151 tfb_streamer(static_cast<size_t>(QueryType::StreamingByteCount), runtime, device,
1095 scheduler, memory_allocator, staging_pool), 1152 scheduler, memory_allocator, staging_pool),
1096 primitives_succeeded_streamer( 1153 primitives_succeeded_streamer(
@@ -1319,10 +1376,10 @@ bool QueryCacheRuntime::HostConditionalRenderingCompareValues(VideoCommon::Looku
1319 return true; 1376 return true;
1320 } 1377 }
1321 } 1378 }
1322 if (!is_in_bc[0] && !is_in_bc[1]) { 1379 /*if (!is_in_bc[0] && !is_in_bc[1]) {
1323 // Both queries are in query cache, it's best to just flush. 1380 // Both queries are in query cache, it's best to just flush.
1324 return false; 1381 return true;
1325 } 1382 }*/
1326 HostConditionalRenderingCompareBCImpl(object_1.address, equal_check); 1383 HostConditionalRenderingCompareBCImpl(object_1.address, equal_check);
1327 return true; 1384 return true;
1328} 1385}