author     liamwhite  2023-09-25 09:18:29 -0400
committer  GitHub     2023-09-25 09:18:29 -0400
commit     854457a392b6d38168f7f9d19d1fa8c43fad653c (patch)
tree       3bc1007b5776f1ce82c057875609105de0a1ca44 /src
parent     Merge pull request #11569 from german77/lle_applet (diff)
parent     Query Cache: Fix Prefix Sums (diff)
Merge pull request #11225 from FernandoS27/no-laxatives-in-santas-cookies
Y.F.C: Rework the Query Cache.
Diffstat (limited to 'src')
-rw-r--r--  src/common/settings.cpp | 10
-rw-r--r--  src/common/settings.h | 2
-rw-r--r--  src/video_core/CMakeLists.txt | 6
-rw-r--r--  src/video_core/buffer_cache/buffer_cache.h | 18
-rw-r--r--  src/video_core/buffer_cache/buffer_cache_base.h | 12
-rw-r--r--  src/video_core/control/channel_state_cache.h | 2
-rw-r--r--  src/video_core/engines/draw_manager.h | 1
-rw-r--r--  src/video_core/engines/maxwell_3d.cpp | 74
-rw-r--r--  src/video_core/engines/maxwell_3d.h | 3
-rw-r--r--  src/video_core/engines/maxwell_dma.cpp | 12
-rw-r--r--  src/video_core/engines/puller.cpp | 13
-rw-r--r--  src/video_core/fence_manager.h | 21
-rw-r--r--  src/video_core/gpu.cpp | 4
-rw-r--r--  src/video_core/host_shaders/CMakeLists.txt | 6
-rw-r--r--  src/video_core/host_shaders/queries_prefix_scan_sum.comp | 173
-rw-r--r--  src/video_core/host_shaders/queries_prefix_scan_sum_nosubgroups.comp | 138
-rw-r--r--  src/video_core/host_shaders/resolve_conditional_render.comp | 20
-rw-r--r--  src/video_core/macro/macro_hle.cpp | 49
-rw-r--r--  src/video_core/query_cache.h | 13
-rw-r--r--  src/video_core/query_cache/bank_base.h | 104
-rw-r--r--  src/video_core/query_cache/query_base.h | 70
-rw-r--r--  src/video_core/query_cache/query_cache.h | 580
-rw-r--r--  src/video_core/query_cache/query_cache_base.h | 181
-rw-r--r--  src/video_core/query_cache/query_stream.h | 149
-rw-r--r--  src/video_core/query_cache/types.h | 74
-rw-r--r--  src/video_core/rasterizer_interface.h | 13
-rw-r--r--  src/video_core/renderer_null/null_rasterizer.cpp | 18
-rw-r--r--  src/video_core/renderer_null/null_rasterizer.h | 7
-rw-r--r--  src/video_core/renderer_opengl/gl_query_cache.cpp | 2
-rw-r--r--  src/video_core/renderer_opengl/gl_query_cache.h | 2
-rw-r--r--  src/video_core/renderer_opengl/gl_rasterizer.cpp | 40
-rw-r--r--  src/video_core/renderer_opengl/gl_rasterizer.h | 7
-rw-r--r--  src/video_core/renderer_vulkan/vk_buffer_cache.cpp | 3
-rw-r--r--  src/video_core/renderer_vulkan/vk_compute_pass.cpp | 181
-rw-r--r--  src/video_core/renderer_vulkan/vk_compute_pass.h | 31
-rw-r--r--  src/video_core/renderer_vulkan/vk_fence_manager.h | 2
-rw-r--r--  src/video_core/renderer_vulkan/vk_query_cache.cpp | 1593
-rw-r--r--  src/video_core/renderer_vulkan/vk_query_cache.h | 106
-rw-r--r--  src/video_core/renderer_vulkan/vk_rasterizer.cpp | 107
-rw-r--r--  src/video_core/renderer_vulkan/vk_rasterizer.h | 14
-rw-r--r--  src/video_core/renderer_vulkan/vk_scheduler.cpp | 9
-rw-r--r--  src/video_core/renderer_vulkan/vk_scheduler.h | 12
-rw-r--r--  src/video_core/vulkan_common/vulkan_device.h | 6
-rw-r--r--  src/video_core/vulkan_common/vulkan_wrapper.cpp | 4
-rw-r--r--  src/video_core/vulkan_common/vulkan_wrapper.h | 27
45 files changed, 3553 insertions(+), 366 deletions(-)
diff --git a/src/common/settings.cpp b/src/common/settings.cpp
index 4ecaf550b..3fde3cae6 100644
--- a/src/common/settings.cpp
+++ b/src/common/settings.cpp
@@ -130,13 +130,17 @@ void LogSettings() {
     log_path("DataStorage_SDMCDir", Common::FS::GetYuzuPath(Common::FS::YuzuPath::SDMCDir));
 }
 
+void UpdateGPUAccuracy() {
+    values.current_gpu_accuracy = values.gpu_accuracy.GetValue();
+}
+
 bool IsGPULevelExtreme() {
-    return values.gpu_accuracy.GetValue() == GpuAccuracy::Extreme;
+    return values.current_gpu_accuracy == GpuAccuracy::Extreme;
 }
 
 bool IsGPULevelHigh() {
-    return values.gpu_accuracy.GetValue() == GpuAccuracy::Extreme ||
-           values.gpu_accuracy.GetValue() == GpuAccuracy::High;
+    return values.current_gpu_accuracy == GpuAccuracy::Extreme ||
+           values.current_gpu_accuracy == GpuAccuracy::High;
 }
 
 bool IsFastmemEnabled() {
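
Note on the hunk above: the accuracy checks sit on the per-query hot path, so the commit snapshots the switchable setting into a plain member that later hunks refresh only at safe points (GPU::Start and OnCommandListEnd). A minimal standalone sketch of the pattern; Setting and GpuAccuracy here are simplified stand-ins, not yuzu's real types:

    #include <atomic>

    enum class GpuAccuracy { Normal, High, Extreme };

    struct Setting {
        std::atomic<GpuAccuracy> stored{GpuAccuracy::High};
        GpuAccuracy GetValue() const { return stored.load(); } // potentially contended read
    };

    struct Values {
        Setting gpu_accuracy;                                // the switchable setting
        GpuAccuracy current_gpu_accuracy{GpuAccuracy::High}; // cached hot-path copy
    } values;

    // Refreshed only at safe points, mirroring Settings::UpdateGPUAccuracy().
    void UpdateGPUAccuracy() {
        values.current_gpu_accuracy = values.gpu_accuracy.GetValue();
    }

    bool IsGPULevelHigh() {
        return values.current_gpu_accuracy == GpuAccuracy::Extreme ||
               values.current_gpu_accuracy == GpuAccuracy::High;
    }

    int main() {
        UpdateGPUAccuracy();
        return IsGPULevelHigh() ? 0 : 1;
    }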
diff --git a/src/common/settings.h b/src/common/settings.h
index 82ec9077e..ae5e5d2b8 100644
--- a/src/common/settings.h
+++ b/src/common/settings.h
@@ -307,6 +307,7 @@ struct Values {
         Specialization::Default,
         true,
         true};
+    GpuAccuracy current_gpu_accuracy{GpuAccuracy::High};
     SwitchableSetting<AnisotropyMode, true> max_anisotropy{
         linkage, AnisotropyMode::Automatic, AnisotropyMode::Automatic, AnisotropyMode::X16,
         "max_anisotropy", Category::RendererAdvanced};
@@ -522,6 +523,7 @@ struct Values {
 
 extern Values values;
 
+void UpdateGPUAccuracy();
 bool IsGPULevelExtreme();
 bool IsGPULevelHigh();
 
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 9b13ccbab..cf9266d54 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -95,6 +95,12 @@ add_library(video_core STATIC
     memory_manager.h
     precompiled_headers.h
     pte_kind.h
+    query_cache/bank_base.h
+    query_cache/query_base.h
+    query_cache/query_cache_base.h
+    query_cache/query_cache.h
+    query_cache/query_stream.h
+    query_cache/types.h
     query_cache.h
     rasterizer_accelerated.cpp
     rasterizer_accelerated.h
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index 8be7bd594..9e90c587c 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -272,13 +272,19 @@ std::pair<typename P::Buffer*, u32> BufferCache<P>::ObtainBuffer(GPUVAddr gpu_ad
     if (!cpu_addr) {
         return {&slot_buffers[NULL_BUFFER_ID], 0};
     }
-    const BufferId buffer_id = FindBuffer(*cpu_addr, size);
+    return ObtainCPUBuffer(*cpu_addr, size, sync_info, post_op);
+}
+
+template <class P>
+std::pair<typename P::Buffer*, u32> BufferCache<P>::ObtainCPUBuffer(
+    VAddr cpu_addr, u32 size, ObtainBufferSynchronize sync_info, ObtainBufferOperation post_op) {
+    const BufferId buffer_id = FindBuffer(cpu_addr, size);
     Buffer& buffer = slot_buffers[buffer_id];
 
     // synchronize op
     switch (sync_info) {
     case ObtainBufferSynchronize::FullSynchronize:
-        SynchronizeBuffer(buffer, *cpu_addr, size);
+        SynchronizeBuffer(buffer, cpu_addr, size);
         break;
     default:
         break;
@@ -286,11 +292,11 @@ std::pair<typename P::Buffer*, u32> BufferCache<P>::ObtainBuffer(GPUVAddr gpu_ad
 
     switch (post_op) {
     case ObtainBufferOperation::MarkAsWritten:
-        MarkWrittenBuffer(buffer_id, *cpu_addr, size);
+        MarkWrittenBuffer(buffer_id, cpu_addr, size);
         break;
     case ObtainBufferOperation::DiscardWrite: {
-        VAddr cpu_addr_start = Common::AlignDown(*cpu_addr, 64);
-        VAddr cpu_addr_end = Common::AlignUp(*cpu_addr + size, 64);
+        VAddr cpu_addr_start = Common::AlignDown(cpu_addr, 64);
+        VAddr cpu_addr_end = Common::AlignUp(cpu_addr + size, 64);
         IntervalType interval{cpu_addr_start, cpu_addr_end};
         ClearDownload(interval);
         common_ranges.subtract(interval);
@@ -300,7 +306,7 @@ std::pair<typename P::Buffer*, u32> BufferCache<P>::ObtainBuffer(GPUVAddr gpu_ad
         break;
     }
 
-    return {&buffer, buffer.Offset(*cpu_addr)};
+    return {&buffer, buffer.Offset(cpu_addr)};
 }
 
 template <class P>
diff --git a/src/video_core/buffer_cache/buffer_cache_base.h b/src/video_core/buffer_cache/buffer_cache_base.h
index 0b7135d49..c4f6e8d12 100644
--- a/src/video_core/buffer_cache/buffer_cache_base.h
+++ b/src/video_core/buffer_cache/buffer_cache_base.h
@@ -295,6 +295,10 @@ public:
     [[nodiscard]] std::pair<Buffer*, u32> ObtainBuffer(GPUVAddr gpu_addr, u32 size,
                                                        ObtainBufferSynchronize sync_info,
                                                        ObtainBufferOperation post_op);
+
+    [[nodiscard]] std::pair<Buffer*, u32> ObtainCPUBuffer(VAddr cpu_addr, u32 size,
+                                                          ObtainBufferSynchronize sync_info,
+                                                          ObtainBufferOperation post_op);
     void FlushCachedWrites();
 
     /// Return true when there are uncommitted buffers to be downloaded
@@ -335,6 +339,14 @@ public:
 
     [[nodiscard]] std::pair<Buffer*, u32> GetDrawIndirectBuffer();
 
+    template <typename Func>
+    void BufferOperations(Func&& func) {
+        do {
+            channel_state->has_deleted_buffers = false;
+            func();
+        } while (channel_state->has_deleted_buffers);
+    }
+
     std::recursive_mutex mutex;
     Runtime& runtime;
 
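
The BufferOperations helper added above encodes a retry loop: if the wrapped operation deletes buffers mid-flight (setting has_deleted_buffers), it is re-run so the caller always completes against a stable buffer set. A hedged, self-contained sketch of the idiom; ChannelState and the global pointer are stand-ins, not yuzu's real channel plumbing:

    #include <cstdio>
    #include <utility>

    struct ChannelState {
        bool has_deleted_buffers = false;
    };

    ChannelState* channel_state = new ChannelState{};

    // Re-run the operation whenever it invalidated buffers mid-flight.
    template <typename Func>
    void BufferOperations(Func&& func) {
        do {
            channel_state->has_deleted_buffers = false;
            func();
        } while (channel_state->has_deleted_buffers);
    }

    int main() {
        int attempts = 0;
        BufferOperations([&] {
            ++attempts;
            // Simulate a deletion on the first pass, forcing one retry.
            channel_state->has_deleted_buffers = (attempts == 1);
        });
        std::printf("ran %d time(s)\n", attempts); // prints: ran 2 time(s)
    }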
diff --git a/src/video_core/control/channel_state_cache.h b/src/video_core/control/channel_state_cache.h
index 46bc9e322..5574e1fba 100644
--- a/src/video_core/control/channel_state_cache.h
+++ b/src/video_core/control/channel_state_cache.h
@@ -51,7 +51,7 @@ public:
     virtual void CreateChannel(Tegra::Control::ChannelState& channel);
 
     /// Bind a channel for execution.
-    void BindToChannel(s32 id);
+    virtual void BindToChannel(s32 id);
 
     /// Erase channel's state.
     void EraseChannel(s32 id);
diff --git a/src/video_core/engines/draw_manager.h b/src/video_core/engines/draw_manager.h
index 7c22c49f1..18d959143 100644
--- a/src/video_core/engines/draw_manager.h
+++ b/src/video_core/engines/draw_manager.h
@@ -46,6 +46,7 @@ public:
     };
 
     struct IndirectParams {
+        bool is_byte_count;
         bool is_indexed;
         bool include_count;
         GPUVAddr count_start_address;
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index 06e349e43..32d767d85 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -20,8 +20,6 @@
 
 namespace Tegra::Engines {
 
-using VideoCore::QueryType;
-
 /// First register id that is actually a Macro call.
 constexpr u32 MacroRegistersStart = 0xE00;
 
@@ -500,27 +498,21 @@ void Maxwell3D::StampQueryResult(u64 payload, bool long_query) {
 }
 
 void Maxwell3D::ProcessQueryGet() {
+    VideoCommon::QueryPropertiesFlags flags{};
+    if (regs.report_semaphore.query.short_query == 0) {
+        flags |= VideoCommon::QueryPropertiesFlags::HasTimeout;
+    }
+    const GPUVAddr sequence_address{regs.report_semaphore.Address()};
+    const VideoCommon::QueryType query_type =
+        static_cast<VideoCommon::QueryType>(regs.report_semaphore.query.report.Value());
+    const u32 payload = regs.report_semaphore.payload;
+    const u32 subreport = regs.report_semaphore.query.sub_report;
     switch (regs.report_semaphore.query.operation) {
     case Regs::ReportSemaphore::Operation::Release:
         if (regs.report_semaphore.query.short_query != 0) {
-            const GPUVAddr sequence_address{regs.report_semaphore.Address()};
-            const u32 payload = regs.report_semaphore.payload;
-            std::function<void()> operation([this, sequence_address, payload] {
-                memory_manager.Write<u32>(sequence_address, payload);
-            });
-            rasterizer->SignalFence(std::move(operation));
-        } else {
-            struct LongQueryResult {
-                u64_le value;
-                u64_le timestamp;
-            };
-            const GPUVAddr sequence_address{regs.report_semaphore.Address()};
-            const u32 payload = regs.report_semaphore.payload;
-            [this, sequence_address, payload] {
-                memory_manager.Write<u64>(sequence_address + sizeof(u64), system.GPU().GetTicks());
-                memory_manager.Write<u64>(sequence_address, payload);
-            }();
+            flags |= VideoCommon::QueryPropertiesFlags::IsAFence;
         }
+        rasterizer->Query(sequence_address, query_type, flags, payload, subreport);
         break;
     case Regs::ReportSemaphore::Operation::Acquire:
         // TODO(Blinkhawk): Under this operation, the GPU waits for the CPU to write a value that
@@ -528,11 +520,7 @@ void Maxwell3D::ProcessQueryGet() {
         UNIMPLEMENTED_MSG("Unimplemented query operation ACQUIRE");
         break;
     case Regs::ReportSemaphore::Operation::ReportOnly:
-        if (const std::optional<u64> result = GetQueryResult()) {
-            // If the query returns an empty optional it means it's cached and deferred.
-            // In this case we have a non-empty result, so we stamp it immediately.
-            StampQueryResult(*result, regs.report_semaphore.query.short_query == 0);
-        }
+        rasterizer->Query(sequence_address, query_type, flags, payload, subreport);
         break;
     case Regs::ReportSemaphore::Operation::Trap:
         UNIMPLEMENTED_MSG("Unimplemented query operation TRAP");
@@ -544,6 +532,10 @@ void Maxwell3D::ProcessQueryGet() {
 }
 
 void Maxwell3D::ProcessQueryCondition() {
+    if (rasterizer->AccelerateConditionalRendering()) {
+        execute_on = true;
+        return;
+    }
     const GPUVAddr condition_address{regs.render_enable.Address()};
     switch (regs.render_enable_override) {
     case Regs::RenderEnable::Override::AlwaysRender:
@@ -553,10 +545,6 @@ void Maxwell3D::ProcessQueryCondition() {
         execute_on = false;
         break;
     case Regs::RenderEnable::Override::UseRenderEnable: {
-        if (rasterizer->AccelerateConditionalRendering()) {
-            execute_on = true;
-            return;
-        }
         switch (regs.render_enable.mode) {
         case Regs::RenderEnable::Mode::True: {
             execute_on = true;
@@ -598,15 +586,9 @@ void Maxwell3D::ProcessQueryCondition() {
 }
 
 void Maxwell3D::ProcessCounterReset() {
-#if ANDROID
-    if (!Settings::IsGPULevelHigh()) {
-        // This is problematic on Android, disable on GPU Normal.
-        return;
-    }
-#endif
     switch (regs.clear_report_value) {
     case Regs::ClearReport::ZPassPixelCount:
-        rasterizer->ResetCounter(QueryType::SamplesPassed);
+        rasterizer->ResetCounter(VideoCommon::QueryType::ZPassPixelCount64);
         break;
     default:
         LOG_DEBUG(Render_OpenGL, "Unimplemented counter reset={}", regs.clear_report_value);
@@ -620,28 +602,6 @@ void Maxwell3D::ProcessSyncPoint() {
     rasterizer->SignalSyncPoint(sync_point);
 }
 
-std::optional<u64> Maxwell3D::GetQueryResult() {
-    switch (regs.report_semaphore.query.report) {
-    case Regs::ReportSemaphore::Report::Payload:
-        return regs.report_semaphore.payload;
-    case Regs::ReportSemaphore::Report::ZPassPixelCount64:
-#if ANDROID
-        if (!Settings::IsGPULevelHigh()) {
-            // This is problematic on Android, disable on GPU Normal.
-            return 120;
-        }
-#endif
-        // Deferred.
-        rasterizer->Query(regs.report_semaphore.Address(), QueryType::SamplesPassed,
-                          system.GPU().GetTicks());
-        return std::nullopt;
-    default:
-        LOG_DEBUG(HW_GPU, "Unimplemented query report type {}",
-                  regs.report_semaphore.query.report.Value());
-        return 1;
-    }
-}
-
 void Maxwell3D::ProcessCBBind(size_t stage_index) {
     // Bind the buffer currently in CB_ADDRESS to the specified index in the desired shader
     // stage.
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index 6c19354e1..17faacc37 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -3182,9 +3182,6 @@ private:
     /// Handles writes to syncing register.
     void ProcessSyncPoint();
 
-    /// Returns a query's value or an empty object if the value will be deferred through a cache.
-    std::optional<u64> GetQueryResult();
-
     void RefreshParametersImpl();
 
     bool IsMethodExecutable(u32 method);
diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp
index 279f0daa1..422d4d859 100644
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -362,21 +362,17 @@ void MaxwellDMA::ReleaseSemaphore() {
     const auto type = regs.launch_dma.semaphore_type;
     const GPUVAddr address = regs.semaphore.address;
     const u32 payload = regs.semaphore.payload;
+    VideoCommon::QueryPropertiesFlags flags{VideoCommon::QueryPropertiesFlags::IsAFence};
     switch (type) {
     case LaunchDMA::SemaphoreType::NONE:
         break;
     case LaunchDMA::SemaphoreType::RELEASE_ONE_WORD_SEMAPHORE: {
-        std::function<void()> operation(
-            [this, address, payload] { memory_manager.Write<u32>(address, payload); });
-        rasterizer->SignalFence(std::move(operation));
+        rasterizer->Query(address, VideoCommon::QueryType::Payload, flags, payload, 0);
         break;
     }
     case LaunchDMA::SemaphoreType::RELEASE_FOUR_WORD_SEMAPHORE: {
-        std::function<void()> operation([this, address, payload] {
-            memory_manager.Write<u64>(address + sizeof(u64), system.GPU().GetTicks());
-            memory_manager.Write<u64>(address, payload);
-        });
-        rasterizer->SignalFence(std::move(operation));
+        rasterizer->Query(address, VideoCommon::QueryType::Payload,
+                          flags | VideoCommon::QueryPropertiesFlags::HasTimeout, payload, 0);
         break;
     }
     default:
diff --git a/src/video_core/engines/puller.cpp b/src/video_core/engines/puller.cpp
index 6de2543b7..8dd34c04a 100644
--- a/src/video_core/engines/puller.cpp
+++ b/src/video_core/engines/puller.cpp
@@ -82,10 +82,8 @@ void Puller::ProcessSemaphoreTriggerMethod() {
     if (op == GpuSemaphoreOperation::WriteLong) {
         const GPUVAddr sequence_address{regs.semaphore_address.SemaphoreAddress()};
         const u32 payload = regs.semaphore_sequence;
-        [this, sequence_address, payload] {
-            memory_manager.Write<u64>(sequence_address + sizeof(u64), gpu.GetTicks());
-            memory_manager.Write<u64>(sequence_address, payload);
-        }();
+        rasterizer->Query(sequence_address, VideoCommon::QueryType::Payload,
+                          VideoCommon::QueryPropertiesFlags::HasTimeout, payload, 0);
     } else {
         do {
             const u32 word{memory_manager.Read<u32>(regs.semaphore_address.SemaphoreAddress())};
@@ -120,10 +118,8 @@ void Puller::ProcessSemaphoreTriggerMethod() {
 void Puller::ProcessSemaphoreRelease() {
     const GPUVAddr sequence_address{regs.semaphore_address.SemaphoreAddress()};
     const u32 payload = regs.semaphore_release;
-    std::function<void()> operation([this, sequence_address, payload] {
-        memory_manager.Write<u32>(sequence_address, payload);
-    });
-    rasterizer->SignalFence(std::move(operation));
+    rasterizer->Query(sequence_address, VideoCommon::QueryType::Payload,
+                      VideoCommon::QueryPropertiesFlags::IsAFence, payload, 0);
 }
 
 void Puller::ProcessSemaphoreAcquire() {
@@ -132,7 +128,6 @@ void Puller::ProcessSemaphoreAcquire() {
     while (word != value) {
         regs.acquire_active = true;
         regs.acquire_value = value;
-        std::this_thread::sleep_for(std::chrono::milliseconds(1));
         rasterizer->ReleaseFences();
         word = memory_manager.Read<u32>(regs.semaphore_address.SemaphoreAddress());
         // TODO(kemathe73) figure out how to do the acquire_timeout
diff --git a/src/video_core/fence_manager.h b/src/video_core/fence_manager.h
index ab20ff30f..805a89900 100644
--- a/src/video_core/fence_manager.h
+++ b/src/video_core/fence_manager.h
@@ -55,6 +55,9 @@ public:
 
     // Unlike other fences, this one doesn't
     void SignalOrdering() {
+        if constexpr (!can_async_check) {
+            TryReleasePendingFences<false>();
+        }
         std::scoped_lock lock{buffer_cache.mutex};
         buffer_cache.AccumulateFlushes();
     }
@@ -104,9 +107,25 @@ public:
         SignalFence(std::move(func));
     }
 
-    void WaitPendingFences() {
+    void WaitPendingFences([[maybe_unused]] bool force) {
         if constexpr (!can_async_check) {
             TryReleasePendingFences<true>();
+        } else {
+            if (!force) {
+                return;
+            }
+            std::mutex wait_mutex;
+            std::condition_variable wait_cv;
+            std::atomic<bool> wait_finished{};
+            std::function<void()> func([&] {
+                std::scoped_lock lk(wait_mutex);
+                wait_finished.store(true, std::memory_order_relaxed);
+                wait_cv.notify_all();
+            });
+            SignalFence(std::move(func));
+            std::unique_lock lk(wait_mutex);
+            wait_cv.wait(
+                lk, [&wait_finished] { return wait_finished.load(std::memory_order_relaxed); });
         }
     }
 
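
WaitPendingFences(true) on the async path above blocks by queueing a fence whose completion callback wakes the caller. A reduced sketch of that handshake; SignalFence here is a stand-in that runs the callback on another thread (the real one defers to the GPU worker):

    #include <condition_variable>
    #include <functional>
    #include <mutex>
    #include <thread>

    void SignalFence(std::function<void()> func) {
        std::thread([f = std::move(func)] { f(); }).detach(); // stand-in executor
    }

    void WaitPendingFences() {
        std::mutex wait_mutex;
        std::condition_variable wait_cv;
        bool wait_finished = false; // plain bool is enough under the mutex
        SignalFence([&] {
            std::scoped_lock lk{wait_mutex};
            wait_finished = true;
            wait_cv.notify_all();
        });
        std::unique_lock lk{wait_mutex};
        wait_cv.wait(lk, [&] { return wait_finished; });
    }

    int main() {
        WaitPendingFences(); // returns once the queued fence callback has run
    }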
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index c192e33b2..11549d448 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -102,7 +102,8 @@ struct GPU::Impl {
 
     /// Signal the ending of command list.
     void OnCommandListEnd() {
-        rasterizer->ReleaseFences();
+        rasterizer->ReleaseFences(false);
+        Settings::UpdateGPUAccuracy();
     }
 
     /// Request a host GPU memory flush from the CPU.
@@ -220,6 +221,7 @@ struct GPU::Impl {
     /// This can be used to launch any necessary threads and register any necessary
     /// core timing events.
     void Start() {
+        Settings::UpdateGPUAccuracy();
         gpu_thread.StartThread(*renderer, renderer->Context(), *scheduler);
     }
 
diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt
index c4d459077..6b912027f 100644
--- a/src/video_core/host_shaders/CMakeLists.txt
+++ b/src/video_core/host_shaders/CMakeLists.txt
@@ -41,6 +41,9 @@ set(SHADER_FILES
     pitch_unswizzle.comp
     present_bicubic.frag
     present_gaussian.frag
+    queries_prefix_scan_sum.comp
+    queries_prefix_scan_sum_nosubgroups.comp
+    resolve_conditional_render.comp
     smaa_edge_detection.vert
     smaa_edge_detection.frag
     smaa_blending_weight_calculation.vert
@@ -70,6 +73,7 @@ if ("${GLSLANGVALIDATOR}" STREQUAL "GLSLANGVALIDATOR-NOTFOUND")
 endif()
 
 set(GLSL_FLAGS "")
+set(SPIR_V_VERSION "spirv1.3")
 set(QUIET_FLAG "--quiet")
 
 set(SHADER_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/include)
@@ -123,7 +127,7 @@ foreach(FILENAME IN ITEMS ${SHADER_FILES})
         OUTPUT
             ${SPIRV_HEADER_FILE}
         COMMAND
-            ${GLSLANGVALIDATOR} -V ${QUIET_FLAG} -I"${FIDELITYFX_INCLUDE_DIR}" ${GLSL_FLAGS} --variable-name ${SPIRV_VARIABLE_NAME} -o ${SPIRV_HEADER_FILE} ${SOURCE_FILE}
+            ${GLSLANGVALIDATOR} -V ${QUIET_FLAG} -I"${FIDELITYFX_INCLUDE_DIR}" ${GLSL_FLAGS} --variable-name ${SPIRV_VARIABLE_NAME} -o ${SPIRV_HEADER_FILE} ${SOURCE_FILE} --target-env ${SPIR_V_VERSION}
         MAIN_DEPENDENCY
             ${SOURCE_FILE}
     )
diff --git a/src/video_core/host_shaders/queries_prefix_scan_sum.comp b/src/video_core/host_shaders/queries_prefix_scan_sum.comp
new file mode 100644
index 000000000..6faa8981f
--- /dev/null
+++ b/src/video_core/host_shaders/queries_prefix_scan_sum.comp
@@ -0,0 +1,173 @@
+// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#version 460 core
+
+#extension GL_KHR_shader_subgroup_basic : require
+#extension GL_KHR_shader_subgroup_shuffle : require
+#extension GL_KHR_shader_subgroup_shuffle_relative : require
+#extension GL_KHR_shader_subgroup_arithmetic : require
+
+#ifdef VULKAN
+
+#define HAS_EXTENDED_TYPES 1
+#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
+#define END_PUSH_CONSTANTS };
+#define UNIFORM(n)
+#define BINDING_INPUT_BUFFER 0
+#define BINDING_OUTPUT_IMAGE 1
+
+#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv
+
+#extension GL_NV_gpu_shader5 : enable
+#ifdef GL_NV_gpu_shader5
+#define HAS_EXTENDED_TYPES 1
+#else
+#define HAS_EXTENDED_TYPES 0
+#endif
+#define BEGIN_PUSH_CONSTANTS
+#define END_PUSH_CONSTANTS
+#define UNIFORM(n) layout(location = n) uniform
+#define BINDING_INPUT_BUFFER 0
+#define BINDING_OUTPUT_IMAGE 0
+
+#endif
+
+BEGIN_PUSH_CONSTANTS
+UNIFORM(0) uint min_accumulation_base;
+UNIFORM(1) uint max_accumulation_base;
+UNIFORM(2) uint accumulation_limit;
+UNIFORM(3) uint buffer_offset;
+END_PUSH_CONSTANTS
+
+#define LOCAL_RESULTS 8
+#define QUERIES_PER_INVOC 2048
+
+layout(local_size_x = QUERIES_PER_INVOC / LOCAL_RESULTS) in;
+
+layout(std430, binding = 0) readonly buffer block1 {
+    uvec2 input_data[];
+};
+
+layout(std430, binding = 1) coherent buffer block2 {
+    uvec2 output_data[];
+};
+
+layout(std430, binding = 2) coherent buffer block3 {
+    uvec2 accumulated_data;
+};
+
+shared uvec2 shared_data[128];
+
+// Simple uint64 add that uses two uint variables for GPUs that don't support uint64
+uvec2 AddUint64(uvec2 value_1, uvec2 value_2) {
+    uint carry = 0;
+    uvec2 result;
+    result.x = uaddCarry(value_1.x, value_2.x, carry);
+    result.y = value_1.y + value_2.y + carry;
+    return result;
+}
+
+// Do a subgroup prefix sum using Hillis and Steele's algorithm
+uvec2 subgroupInclusiveAddUint64(uvec2 value) {
+    uvec2 result = value;
+    for (uint i = 1; i < gl_SubgroupSize; i *= 2) {
+        uvec2 other = subgroupShuffleUp(result, i); // get value from subgroup_inv_id - i
+        if (i <= gl_SubgroupInvocationID) {
+            result = AddUint64(result, other);
+        }
+    }
+    return result;
+}
+
+// Write the results to the output buffer and to the accumulation buffer
+void WriteResults(uvec2 results[LOCAL_RESULTS]) {
+    const uint current_id = gl_LocalInvocationID.x;
+    const uvec2 accum = accumulated_data;
+    for (uint i = 0; i < LOCAL_RESULTS; i++) {
+        uvec2 base_data = current_id * LOCAL_RESULTS + i < min_accumulation_base ? accum : uvec2(0, 0);
+        results[i] = AddUint64(results[i], base_data);
+    }
+    for (uint i = 0; i < LOCAL_RESULTS; i++) {
+        output_data[buffer_offset + current_id * LOCAL_RESULTS + i] = results[i];
+    }
+    uint index = accumulation_limit % LOCAL_RESULTS;
+    uint base_id = accumulation_limit / LOCAL_RESULTS;
+    if (min_accumulation_base >= accumulation_limit + 1) {
+        if (current_id == base_id) {
+            accumulated_data = results[index];
+        }
+        return;
+    }
+    // We have that ugly case in which the accumulation data is reset in the middle somewhere.
+    barrier();
+    groupMemoryBarrier();
+
+    if (current_id == base_id) {
+        uvec2 reset_value = output_data[max_accumulation_base - 1];
+        // Calculate two's complement / negate manually
+        reset_value = AddUint64(uvec2(1, 0), ~reset_value);
+        accumulated_data = AddUint64(results[index], reset_value);
+    }
+}
+
+void main() {
+    const uint subgroup_inv_id = gl_SubgroupInvocationID;
+    const uint subgroup_id = gl_SubgroupID + gl_WorkGroupID.x * gl_NumSubgroups;
+    const uint last_subgroup_id = subgroupMax(subgroup_inv_id);
+    const uint current_id = gl_LocalInvocationID.x;
+    const uint total_work = accumulation_limit;
+    const uint last_result_id = LOCAL_RESULTS - 1;
+    uvec2 data[LOCAL_RESULTS];
+    for (uint i = 0; i < LOCAL_RESULTS; i++) {
+        data[i] = input_data[buffer_offset + current_id * LOCAL_RESULTS + i];
+    }
+    uvec2 results[LOCAL_RESULTS];
+    results[0] = data[0];
+    for (uint i = 1; i < LOCAL_RESULTS; i++) {
+        results[i] = AddUint64(data[i], results[i - 1]);
+    }
+    // make sure all input data has been loaded
+    subgroupBarrier();
+    subgroupMemoryBarrier();
+
+    // on the last local result, do a subgroup inclusive scan sum
+    results[last_result_id] = subgroupInclusiveAddUint64(results[last_result_id]);
+    // get the last local result from the subgroup behind the current
+    uvec2 result_behind = subgroupShuffleUp(results[last_result_id], 1);
+    if (subgroup_inv_id != 0) {
+        for (uint i = 1; i < LOCAL_RESULTS; i++) {
+            results[i - 1] = AddUint64(results[i - 1], result_behind);
+        }
+    }
+
+    // if we had fewer queries than our subgroup, just write down the results.
+    if (total_work <= gl_SubgroupSize * LOCAL_RESULTS) { // This condition is constant per dispatch.
+        WriteResults(results);
+        return;
+    }
+
+    // We now have more, so let's write the last result into shared memory.
+    // Only pick the last subgroup.
+    if (subgroup_inv_id == last_subgroup_id) {
+        shared_data[subgroup_id] = results[last_result_id];
+    }
+    // wait until every subgroup has stored its result
+    barrier();
+    memoryBarrierShared();
+
+    // only if it's not the first subgroup
+    if (subgroup_id != 0) {
+        // get the results from some previous invocation
+        uvec2 tmp = shared_data[subgroup_inv_id];
+        subgroupBarrier();
+        subgroupMemoryBarrierShared();
+        tmp = subgroupInclusiveAddUint64(tmp);
+        // obtain the result that would be equivalent to the previous result
+        uvec2 shuffled_result = subgroupShuffle(tmp, subgroup_id - 1);
+        for (uint i = 0; i < LOCAL_RESULTS; i++) {
+            results[i] = AddUint64(results[i], shuffled_result);
+        }
+    }
+    WriteResults(results);
+}
\ No newline at end of file
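
For reference, the core of the shader above in host-side form (not part of the commit): a 64-bit inclusive prefix sum over (lo, hi) 32-bit pairs, with the manual carry that AddUint64 performs via uaddCarry. The sequential loop below computes the same result the shader reaches through subgroup shuffles:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    struct U64Pair {
        std::uint32_t lo, hi;
    };

    U64Pair AddUint64(U64Pair a, U64Pair b) {
        U64Pair r;
        r.lo = a.lo + b.lo;
        const std::uint32_t carry = r.lo < a.lo ? 1u : 0u; // uaddCarry equivalent
        r.hi = a.hi + b.hi + carry;
        return r;
    }

    // Sequential reference of the inclusive scan; the shader distributes this
    // across invocations, but the per-element result is identical.
    void InclusivePrefixSum(std::vector<U64Pair>& values) {
        for (std::size_t i = 1; i < values.size(); ++i) {
            values[i] = AddUint64(values[i], values[i - 1]);
        }
    }

    int main() {
        std::vector<U64Pair> v{{0xFFFFFFFFu, 0}, {1, 0}, {2, 0}};
        InclusivePrefixSum(v);
        // 0xFFFFFFFF + 1 carries into the high word: expect lo=0 hi=1, then lo=2 hi=1.
        std::printf("%u %u | %u %u\n", v[1].lo, v[1].hi, v[2].lo, v[2].hi);
    }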
diff --git a/src/video_core/host_shaders/queries_prefix_scan_sum_nosubgroups.comp b/src/video_core/host_shaders/queries_prefix_scan_sum_nosubgroups.comp
new file mode 100644
index 000000000..559a213b9
--- /dev/null
+++ b/src/video_core/host_shaders/queries_prefix_scan_sum_nosubgroups.comp
@@ -0,0 +1,138 @@
+// SPDX-FileCopyrightText: Copyright 2015 Graham Sellers, Richard Wright Jr. and Nicholas Haemel
+// SPDX-License-Identifier: MIT
+
+// Code obtained from OpenGL SuperBible, Seventh Edition by Graham Sellers, Richard Wright Jr. and
+// Nicholas Haemel. Modified to suit needs.
+
+#version 460 core
+
+#ifdef VULKAN
+
+#define HAS_EXTENDED_TYPES 1
+#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
+#define END_PUSH_CONSTANTS };
+#define UNIFORM(n)
+#define BINDING_INPUT_BUFFER 0
+#define BINDING_OUTPUT_IMAGE 1
+
+#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv
+
+#extension GL_NV_gpu_shader5 : enable
+#ifdef GL_NV_gpu_shader5
+#define HAS_EXTENDED_TYPES 1
+#else
+#define HAS_EXTENDED_TYPES 0
+#endif
+#define BEGIN_PUSH_CONSTANTS
+#define END_PUSH_CONSTANTS
+#define UNIFORM(n) layout(location = n) uniform
+#define BINDING_INPUT_BUFFER 0
+#define BINDING_OUTPUT_IMAGE 0
+
+#endif
+
+BEGIN_PUSH_CONSTANTS
+UNIFORM(0) uint min_accumulation_base;
+UNIFORM(1) uint max_accumulation_base;
+UNIFORM(2) uint accumulation_limit;
+UNIFORM(3) uint buffer_offset;
+END_PUSH_CONSTANTS
+
+#define LOCAL_RESULTS 4
+#define QUERIES_PER_INVOC 2048
+
+layout(local_size_x = QUERIES_PER_INVOC / LOCAL_RESULTS) in;
+
+layout(std430, binding = 0) readonly buffer block1 {
+    uvec2 input_data[gl_WorkGroupSize.x * LOCAL_RESULTS];
+};
+
+layout(std430, binding = 1) writeonly coherent buffer block2 {
+    uvec2 output_data[gl_WorkGroupSize.x * LOCAL_RESULTS];
+};
+
+layout(std430, binding = 2) coherent buffer block3 {
+    uvec2 accumulated_data;
+};
+
+shared uvec2 shared_data[gl_WorkGroupSize.x * LOCAL_RESULTS];
+
+uvec2 AddUint64(uvec2 value_1, uvec2 value_2) {
+    uint carry = 0;
+    uvec2 result;
+    result.x = uaddCarry(value_1.x, value_2.x, carry);
+    result.y = value_1.y + value_2.y + carry;
+    return result;
+}
+
+void main(void) {
+    uint id = gl_LocalInvocationID.x;
+    uvec2 base_value[LOCAL_RESULTS];
+    const uvec2 accum = accumulated_data;
+    for (uint i = 0; i < LOCAL_RESULTS; i++) {
+        base_value[i] = (buffer_offset + id * LOCAL_RESULTS + i) < min_accumulation_base
+                            ? accumulated_data
+                            : uvec2(0);
+    }
+    uint work_size = gl_WorkGroupSize.x;
+    uint rd_id;
+    uint wr_id;
+    uint mask;
+    uvec2 inputs[LOCAL_RESULTS];
+    for (uint i = 0; i < LOCAL_RESULTS; i++) {
+        inputs[i] = input_data[buffer_offset + id * LOCAL_RESULTS + i];
+    }
+    // The number of steps is the log base 2 of the
+    // work group size, which should be a power of 2
+    const uint steps = uint(log2(work_size)) + uint(log2(LOCAL_RESULTS));
+    uint step = 0;
+
+    // Each invocation is responsible for the content of
+    // two elements of the output array
+    for (uint i = 0; i < LOCAL_RESULTS; i++) {
+        shared_data[id * LOCAL_RESULTS + i] = inputs[i];
+    }
+    // Synchronize to make sure that everyone has initialized
+    // their elements of shared_data[] with data loaded from
+    // the input arrays
+    barrier();
+    memoryBarrierShared();
+    // For each step...
+    for (step = 0; step < steps; step++) {
+        // Calculate the read and write index in the
+        // shared array
+        mask = (1 << step) - 1;
+        rd_id = ((id >> step) << (step + 1)) + mask;
+        wr_id = rd_id + 1 + (id & mask);
+        // Accumulate the read data into our element
+        shared_data[wr_id] = AddUint64(shared_data[rd_id], shared_data[wr_id]);
+        // Synchronize again to make sure that everyone
+        // has caught up with us
+        barrier();
+        memoryBarrierShared();
+    }
+    // Add the accumulation
+    for (uint i = 0; i < LOCAL_RESULTS; i++) {
+        shared_data[id * LOCAL_RESULTS + i] =
+            AddUint64(shared_data[id * LOCAL_RESULTS + i], base_value[i]);
+    }
+    barrier();
+    memoryBarrierShared();
+
+    // Finally write our data back to the output buffer
+    for (uint i = 0; i < LOCAL_RESULTS; i++) {
+        output_data[buffer_offset + id * LOCAL_RESULTS + i] = shared_data[id * LOCAL_RESULTS + i];
+    }
+    if (id == 0) {
+        if (min_accumulation_base >= accumulation_limit + 1) {
+            accumulated_data = shared_data[accumulation_limit];
+            return;
+        }
+        uvec2 reset_value = shared_data[max_accumulation_base - 1];
+        uvec2 final_value = shared_data[accumulation_limit];
+        // Two's complement
+        reset_value = AddUint64(uvec2(1, 0), ~reset_value);
+        accumulated_data = AddUint64(final_value, reset_value);
+    }
+}
\ No newline at end of file
diff --git a/src/video_core/host_shaders/resolve_conditional_render.comp b/src/video_core/host_shaders/resolve_conditional_render.comp
new file mode 100644
index 000000000..307e77d1a
--- /dev/null
+++ b/src/video_core/host_shaders/resolve_conditional_render.comp
@@ -0,0 +1,20 @@
+// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#version 450
+
+layout(local_size_x = 1) in;
+
+layout(std430, binding = 0) buffer Query {
+    uvec2 initial;
+    uvec2 unknown;
+    uvec2 current;
+};
+
+layout(std430, binding = 1) buffer Result {
+    uint result;
+};
+
+void main() {
+    result = all(equal(initial, current)) ? 1 : 0;
+}
diff --git a/src/video_core/macro/macro_hle.cpp b/src/video_core/macro/macro_hle.cpp
index 6272a4652..046c8085e 100644
--- a/src/video_core/macro/macro_hle.cpp
+++ b/src/video_core/macro/macro_hle.cpp
@@ -67,6 +67,7 @@ public:
     }
 
     auto& params = maxwell3d.draw_manager->GetIndirectParams();
+    params.is_byte_count = false;
     params.is_indexed = false;
     params.include_count = false;
     params.count_start_address = 0;
@@ -161,6 +162,7 @@ public:
             0, 0x644, Maxwell3D::HLEReplacementAttributeType::BaseInstance);
     }
     auto& params = maxwell3d.draw_manager->GetIndirectParams();
+    params.is_byte_count = false;
     params.is_indexed = true;
     params.include_count = false;
     params.count_start_address = 0;
@@ -256,6 +258,7 @@ public:
         const u32 estimate = static_cast<u32>(maxwell3d.EstimateIndexBufferSize());
         maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true;
         auto& params = maxwell3d.draw_manager->GetIndirectParams();
+        params.is_byte_count = false;
         params.is_indexed = true;
         params.include_count = true;
         params.count_start_address = maxwell3d.GetMacroAddress(4);
@@ -319,6 +322,47 @@ private:
     }
 };
 
+class HLE_DrawIndirectByteCount final : public HLEMacroImpl {
+public:
+    explicit HLE_DrawIndirectByteCount(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {}
+
+    void Execute(const std::vector<u32>& parameters, [[maybe_unused]] u32 method) override {
+        auto topology = static_cast<Maxwell3D::Regs::PrimitiveTopology>(parameters[0] & 0xFFFFU);
+        if (!maxwell3d.AnyParametersDirty() || !IsTopologySafe(topology)) {
+            Fallback(parameters);
+            return;
+        }
+
+        auto& params = maxwell3d.draw_manager->GetIndirectParams();
+        params.is_byte_count = true;
+        params.is_indexed = false;
+        params.include_count = false;
+        params.count_start_address = 0;
+        params.indirect_start_address = maxwell3d.GetMacroAddress(2);
+        params.buffer_size = 4;
+        params.max_draw_counts = 1;
+        params.stride = parameters[1];
+        maxwell3d.regs.draw.begin = parameters[0];
+        maxwell3d.regs.draw_auto_stride = parameters[1];
+        maxwell3d.regs.draw_auto_byte_count = parameters[2];
+
+        maxwell3d.draw_manager->DrawArrayIndirect(topology);
+    }
+
+private:
+    void Fallback(const std::vector<u32>& parameters) {
+        maxwell3d.RefreshParameters();
+
+        maxwell3d.regs.draw.begin = parameters[0];
+        maxwell3d.regs.draw_auto_stride = parameters[1];
+        maxwell3d.regs.draw_auto_byte_count = parameters[2];
+
+        maxwell3d.draw_manager->DrawArray(
+            maxwell3d.regs.draw.topology, 0,
+            maxwell3d.regs.draw_auto_byte_count / maxwell3d.regs.draw_auto_stride, 0, 1);
+    }
+};
+
 class HLE_C713C83D8F63CCF3 final : public HLEMacroImpl {
 public:
     explicit HLE_C713C83D8F63CCF3(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {}
@@ -536,6 +580,11 @@ HLEMacro::HLEMacro(Maxwell3D& maxwell3d_) : maxwell3d{maxwell3d_} {
                          [](Maxwell3D& maxwell3d__) -> std::unique_ptr<CachedMacro> {
                              return std::make_unique<HLE_TransformFeedbackSetup>(maxwell3d__);
                          }));
+    builders.emplace(0xB5F74EDB717278ECULL,
+                     std::function<std::unique_ptr<CachedMacro>(Maxwell3D&)>(
+                         [](Maxwell3D& maxwell3d__) -> std::unique_ptr<CachedMacro> {
+                             return std::make_unique<HLE_DrawIndirectByteCount>(maxwell3d__);
+                         }));
 }
 
 HLEMacro::~HLEMacro() = default;
diff --git a/src/video_core/query_cache.h b/src/video_core/query_cache.h
index 7047e2e63..9fcaeeac7 100644
--- a/src/video_core/query_cache.h
+++ b/src/video_core/query_cache.h
@@ -25,6 +25,13 @@
 #include "video_core/rasterizer_interface.h"
 #include "video_core/texture_cache/slot_vector.h"
 
+namespace VideoCore {
+enum class QueryType {
+    SamplesPassed,
+};
+constexpr std::size_t NumQueryTypes = 1;
+} // namespace VideoCore
+
 namespace VideoCommon {
 
 using AsyncJobId = SlotId;
@@ -98,10 +105,10 @@ private:
 };
 
 template <class QueryCache, class CachedQuery, class CounterStream, class HostCounter>
-class QueryCacheBase : public VideoCommon::ChannelSetupCaches<VideoCommon::ChannelInfo> {
+class QueryCacheLegacy : public VideoCommon::ChannelSetupCaches<VideoCommon::ChannelInfo> {
 public:
-    explicit QueryCacheBase(VideoCore::RasterizerInterface& rasterizer_,
-                            Core::Memory::Memory& cpu_memory_)
+    explicit QueryCacheLegacy(VideoCore::RasterizerInterface& rasterizer_,
+                              Core::Memory::Memory& cpu_memory_)
         : rasterizer{rasterizer_},
           // Use reinterpret_cast instead of static_cast as workaround for
           // UBSan bug (https://github.com/llvm/llvm-project/issues/59060)
diff --git a/src/video_core/query_cache/bank_base.h b/src/video_core/query_cache/bank_base.h
new file mode 100644
index 000000000..420927091
--- /dev/null
+++ b/src/video_core/query_cache/bank_base.h
@@ -0,0 +1,104 @@
+// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#pragma once
+
+#include <atomic>
+#include <deque>
+#include <utility>
+
+#include "common/common_types.h"
+
+namespace VideoCommon {
+
+class BankBase {
+protected:
+    const size_t base_bank_size{};
+    size_t bank_size{};
+    std::atomic<size_t> references{};
+    size_t current_slot{};
+
+public:
+    explicit BankBase(size_t bank_size_) : base_bank_size{bank_size_}, bank_size(bank_size_) {}
+
+    virtual ~BankBase() = default;
+
+    virtual std::pair<bool, size_t> Reserve() {
+        if (IsClosed()) {
+            return {false, bank_size};
+        }
+        const size_t result = current_slot++;
+        return {true, result};
+    }
+
+    virtual void Reset() {
+        current_slot = 0;
+        references = 0;
+        bank_size = base_bank_size;
+    }
+
+    size_t Size() const {
+        return bank_size;
+    }
+
+    void AddReference(size_t how_many = 1) {
+        references.fetch_add(how_many, std::memory_order_relaxed);
+    }
+
+    void CloseReference(size_t how_many = 1) {
+        if (how_many > references.load(std::memory_order_relaxed)) {
+            UNREACHABLE();
+        }
+        references.fetch_sub(how_many, std::memory_order_relaxed);
+    }
+
+    void Close() {
+        bank_size = current_slot;
+    }
+
+    bool IsClosed() const {
+        return current_slot >= bank_size;
+    }
+
+    bool IsDead() const {
+        return IsClosed() && references == 0;
+    }
+};
+
+template <typename BankType>
+class BankPool {
+private:
+    std::deque<BankType> bank_pool;
+    std::deque<size_t> bank_indices;
+
+public:
+    BankPool() = default;
+    ~BankPool() = default;
+
+    // Reserve a bank from the pool and return its index
+    template <typename Func>
+    size_t ReserveBank(Func&& builder) {
+        if (!bank_indices.empty() && bank_pool[bank_indices.front()].IsDead()) {
+            size_t new_index = bank_indices.front();
+            bank_indices.pop_front();
+            bank_pool[new_index].Reset();
+            return new_index;
+        }
+        size_t new_index = bank_pool.size();
+        builder(bank_pool, new_index);
+        bank_indices.push_back(new_index);
+        return new_index;
+    }
+
+    // Get a reference to a bank using its index
+    BankType& GetBank(size_t index) {
+        return bank_pool[index];
+    }
+
+    // Get the total number of banks in the pool
+    size_t BankCount() const {
+        return bank_pool.size();
+    }
+};
+
+} // namespace VideoCommon
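
A hedged usage sketch for the bank pool above. QueryBank and its 1024-slot size are hypothetical, and the include path assumes the yuzu source tree; the builder callback must emplace the new bank at new_index, while ReserveBank recycles a dead (closed, unreferenced) bank when one is available:

    #include <cstdio>
    #include <deque>

    #include "video_core/query_cache/bank_base.h"

    struct QueryBank : VideoCommon::BankBase {
        QueryBank() : BankBase(1024) {} // example bank capacity
    };

    int main() {
        VideoCommon::BankPool<QueryBank> pool;
        const std::size_t index = pool.ReserveBank(
            [](std::deque<QueryBank>& banks, std::size_t new_index) {
                banks.emplace_back(); // construct the bank in place at new_index
            });
        auto& bank = pool.GetBank(index);
        auto [ok, slot] = bank.Reserve(); // hand out the next free slot
        std::printf("bank %zu slot %zu ok=%d\n", index, slot, int(ok));
    }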
diff --git a/src/video_core/query_cache/query_base.h b/src/video_core/query_cache/query_base.h
new file mode 100644
index 000000000..1d786b3a7
--- /dev/null
+++ b/src/video_core/query_cache/query_base.h
@@ -0,0 +1,70 @@
+// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#pragma once
+
+#include "common/common_funcs.h"
+#include "common/common_types.h"
+
+namespace VideoCommon {
+
+enum class QueryFlagBits : u32 {
+    HasTimestamp = 1 << 0,       ///< Indicates if this query has a timestamp.
+    IsFinalValueSynced = 1 << 1, ///< Indicates if the query's final value is known.
+    IsHostSynced = 1 << 2,       ///< Indicates if the query has been synced in the host.
+    IsGuestSynced = 1 << 3,      ///< Indicates if the query has been synced with the guest.
+    IsHostManaged = 1 << 4,      ///< Indicates if this query points to a host query.
+    IsRewritten = 1 << 5,        ///< Indicates if this query was rewritten by another query.
+    IsInvalidated = 1 << 6,      ///< Indicates the value of the query has been nullified.
+    IsOrphan = 1 << 7,           ///< Indicates the query has not been set by a guest query.
+    IsFence = 1 << 8,            ///< Indicates the query is a fence.
+};
+DECLARE_ENUM_FLAG_OPERATORS(QueryFlagBits)
+
+class QueryBase {
+public:
+    VAddr guest_address{};
+    QueryFlagBits flags{};
+    u64 value{};
+
+protected:
+    // Default constructor
+    QueryBase() = default;
+
+    // Parameterized constructor
+    QueryBase(VAddr address, QueryFlagBits flags_, u64 value_)
+        : guest_address(address), flags(flags_), value{value_} {}
+};
+
+class GuestQuery : public QueryBase {
+public:
+    // Parameterized constructor
+    GuestQuery(bool is_long, VAddr address, u64 query_value)
+        : QueryBase(address, QueryFlagBits::IsFinalValueSynced, query_value) {
+        if (is_long) {
+            flags |= QueryFlagBits::HasTimestamp;
+        }
+    }
+};
+
+class HostQueryBase : public QueryBase {
+public:
+    // Default constructor
+    HostQueryBase() : QueryBase(0, QueryFlagBits::IsHostManaged | QueryFlagBits::IsOrphan, 0) {}
+
+    // Parameterized constructor
+    HostQueryBase(bool has_timestamp, VAddr address)
+        : QueryBase(address, QueryFlagBits::IsHostManaged, 0), start_bank_id{}, size_banks{},
+          start_slot{}, size_slots{} {
+        if (has_timestamp) {
+            flags |= QueryFlagBits::HasTimestamp;
+        }
+    }
+
+    u32 start_bank_id{};
+    u32 size_banks{};
+    size_t start_slot{};
+    size_t size_slots{};
+};
+
+} // namespace VideoCommon
\ No newline at end of file
diff --git a/src/video_core/query_cache/query_cache.h b/src/video_core/query_cache/query_cache.h
new file mode 100644
index 000000000..78b42b518
--- /dev/null
+++ b/src/video_core/query_cache/query_cache.h
@@ -0,0 +1,580 @@
+// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#pragma once
+
+#include <array>
+#include <deque>
+#include <memory>
+#include <mutex>
+#include <unordered_map>
+#include <utility>
+
+#include "common/assert.h"
+#include "common/common_types.h"
+#include "common/logging/log.h"
+#include "common/scope_exit.h"
+#include "common/settings.h"
+#include "core/memory.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/gpu.h"
+#include "video_core/memory_manager.h"
+#include "video_core/query_cache/bank_base.h"
+#include "video_core/query_cache/query_base.h"
+#include "video_core/query_cache/query_cache_base.h"
+#include "video_core/query_cache/query_stream.h"
+#include "video_core/query_cache/types.h"
+
+namespace VideoCommon {
+
+using Maxwell = Tegra::Engines::Maxwell3D;
+
+struct SyncValuesStruct {
+    VAddr address;
+    u64 value;
+    u64 size;
+
+    static constexpr bool GeneratesBaseBuffer = true;
+};
+
+template <typename Traits>
+class GuestStreamer : public SimpleStreamer<GuestQuery> {
+public:
+    using RuntimeType = typename Traits::RuntimeType;
+
+    GuestStreamer(size_t id_, RuntimeType& runtime_)
+        : SimpleStreamer<GuestQuery>(id_), runtime{runtime_} {}
+
+    virtual ~GuestStreamer() = default;
+
+    size_t WriteCounter(VAddr address, bool has_timestamp, u32 value,
+                        std::optional<u32> subreport = std::nullopt) override {
+        auto new_id = BuildQuery(has_timestamp, address, static_cast<u64>(value));
+        pending_sync.push_back(new_id);
+        return new_id;
+    }
+
+    bool HasPendingSync() const override {
+        return !pending_sync.empty();
+    }
+
+    void SyncWrites() override {
+        if (pending_sync.empty()) {
+            return;
+        }
+        std::vector<SyncValuesStruct> sync_values;
+        sync_values.reserve(pending_sync.size());
+        for (size_t pending_id : pending_sync) {
+            auto& query = slot_queries[pending_id];
+            if (True(query.flags & QueryFlagBits::IsRewritten) ||
+                True(query.flags & QueryFlagBits::IsInvalidated)) {
+                continue;
+            }
+            query.flags |= QueryFlagBits::IsHostSynced;
+            sync_values.emplace_back(SyncValuesStruct{
+                .address = query.guest_address,
+                .value = query.value,
+                .size = static_cast<u64>(True(query.flags & QueryFlagBits::HasTimestamp) ? 8 : 4)});
+        }
+        pending_sync.clear();
+        if (sync_values.size() > 0) {
+            runtime.template SyncValues<SyncValuesStruct>(sync_values);
+        }
+    }
+
+private:
+    RuntimeType& runtime;
+    std::deque<size_t> pending_sync;
+};
+
+template <typename Traits>
+class StubStreamer : public GuestStreamer<Traits> {
+public:
+    using RuntimeType = typename Traits::RuntimeType;
+
+    StubStreamer(size_t id_, RuntimeType& runtime_, u32 stub_value_)
+        : GuestStreamer<Traits>(id_, runtime_), stub_value{stub_value_} {}
+
+    ~StubStreamer() override = default;
+
+    size_t WriteCounter(VAddr address, bool has_timestamp, [[maybe_unused]] u32 value,
+                        std::optional<u32> subreport = std::nullopt) override {
+        size_t new_id =
+            GuestStreamer<Traits>::WriteCounter(address, has_timestamp, stub_value, subreport);
+        return new_id;
+    }
+
+private:
+    u32 stub_value;
+};
+
+template <typename Traits>
+struct QueryCacheBase<Traits>::QueryCacheBaseImpl {
+    using RuntimeType = typename Traits::RuntimeType;
+
+    QueryCacheBaseImpl(QueryCacheBase<Traits>* owner_, VideoCore::RasterizerInterface& rasterizer_,
+                       Core::Memory::Memory& cpu_memory_, RuntimeType& runtime_, Tegra::GPU& gpu_)
+        : owner{owner_}, rasterizer{rasterizer_},
+          cpu_memory{cpu_memory_}, runtime{runtime_}, gpu{gpu_} {
+        streamer_mask = 0;
+        for (size_t i = 0; i < static_cast<size_t>(QueryType::MaxQueryTypes); i++) {
+            streamers[i] = runtime.GetStreamerInterface(static_cast<QueryType>(i));
+            if (streamers[i]) {
+                streamer_mask |= 1ULL << streamers[i]->GetId();
+            }
+        }
+    }
+
+    template <typename Func>
+    void ForEachStreamerIn(u64 mask, Func&& func) {
+        static constexpr bool RETURNS_BOOL =
+            std::is_same_v<std::invoke_result_t<Func, StreamerInterface*>, bool>;
+        while (mask != 0) {
+            size_t position = std::countr_zero(mask);
+            mask &= ~(1ULL << position);
+            if constexpr (RETURNS_BOOL) {
+                if (func(streamers[position])) {
+                    return;
+                }
+            } else {
+                func(streamers[position]);
+            }
+        }
+    }
+
+    template <typename Func>
+    void ForEachStreamer(Func&& func) {
+        ForEachStreamerIn(streamer_mask, func);
+    }
+
+    QueryBase* ObtainQuery(QueryCacheBase<Traits>::QueryLocation location) {
+        size_t which_stream = location.stream_id.Value();
+        auto* streamer = streamers[which_stream];
+        if (!streamer) {
+            return nullptr;
+        }
+        return streamer->GetQuery(location.query_id.Value());
+    }
+
+    QueryCacheBase<Traits>* owner;
+    VideoCore::RasterizerInterface& rasterizer;
+    Core::Memory::Memory& cpu_memory;
+    RuntimeType& runtime;
+    Tegra::GPU& gpu;
+    std::array<StreamerInterface*, static_cast<size_t>(QueryType::MaxQueryTypes)> streamers;
+    u64 streamer_mask;
+    std::mutex flush_guard;
+    std::deque<u64> flushes_pending;
+    std::vector<QueryCacheBase<Traits>::QueryLocation> pending_unregister;
+};
+
+template <typename Traits>
+QueryCacheBase<Traits>::QueryCacheBase(Tegra::GPU& gpu_,
+                                       VideoCore::RasterizerInterface& rasterizer_,
+                                       Core::Memory::Memory& cpu_memory_, RuntimeType& runtime_)
+    : cached_queries{} {
+    impl = std::make_unique<QueryCacheBase<Traits>::QueryCacheBaseImpl>(
+        this, rasterizer_, cpu_memory_, runtime_, gpu_);
+}
+
+template <typename Traits>
+QueryCacheBase<Traits>::~QueryCacheBase() = default;
+
+template <typename Traits>
+void QueryCacheBase<Traits>::CounterEnable(QueryType counter_type, bool is_enabled) {
+    size_t index = static_cast<size_t>(counter_type);
+    StreamerInterface* streamer = impl->streamers[index];
+    if (!streamer) [[unlikely]] {
+        UNREACHABLE();
+        return;
+    }
+    if (is_enabled) {
+        streamer->StartCounter();
+    } else {
+        streamer->PauseCounter();
+    }
+}
+
+template <typename Traits>
+void QueryCacheBase<Traits>::CounterClose(QueryType counter_type) {
+    size_t index = static_cast<size_t>(counter_type);
+    StreamerInterface* streamer = impl->streamers[index];
+    if (!streamer) [[unlikely]] {
+        UNREACHABLE();
+        return;
+    }
+    streamer->CloseCounter();
+}
+
+template <typename Traits>
+void QueryCacheBase<Traits>::CounterReset(QueryType counter_type) {
+    size_t index = static_cast<size_t>(counter_type);
+    StreamerInterface* streamer = impl->streamers[index];
+    if (!streamer) [[unlikely]] {
+        UNIMPLEMENTED();
+        return;
+    }
+    streamer->ResetCounter();
+}
+
+template <typename Traits>
+void QueryCacheBase<Traits>::BindToChannel(s32 id) {
+    VideoCommon::ChannelSetupCaches<VideoCommon::ChannelInfo>::BindToChannel(id);
+    impl->runtime.Bind3DEngine(maxwell3d);
+}
+
+template <typename Traits>
+void QueryCacheBase<Traits>::CounterReport(GPUVAddr addr, QueryType counter_type,
+                                           QueryPropertiesFlags flags, u32 payload, u32 subreport) {
+    const bool has_timestamp = True(flags & QueryPropertiesFlags::HasTimeout);
+    const bool is_fence = True(flags & QueryPropertiesFlags::IsAFence);
+    size_t streamer_id = static_cast<size_t>(counter_type);
+    auto* streamer = impl->streamers[streamer_id];
+    if (streamer == nullptr) [[unlikely]] {
+        counter_type = QueryType::Payload;
+        payload = 1U;
+        streamer_id = static_cast<size_t>(counter_type);
+        streamer = impl->streamers[streamer_id];
+    }
+    auto cpu_addr_opt = gpu_memory->GpuToCpuAddress(addr);
+    if (!cpu_addr_opt) [[unlikely]] {
+        return;
+    }
+    VAddr cpu_addr = *cpu_addr_opt;
+    const size_t new_query_id = streamer->WriteCounter(cpu_addr, has_timestamp, payload, subreport);
+    auto* query = streamer->GetQuery(new_query_id);
+    if (is_fence) {
+        query->flags |= QueryFlagBits::IsFence;
+    }
+    QueryLocation query_location{};
+    query_location.stream_id.Assign(static_cast<u32>(streamer_id));
+    query_location.query_id.Assign(static_cast<u32>(new_query_id));
+    const auto gen_caching_indexing = [](VAddr cur_addr) {
+        return std::make_pair<u64, u32>(cur_addr >> Core::Memory::YUZU_PAGEBITS,
+                                        static_cast<u32>(cur_addr & Core::Memory::YUZU_PAGEMASK));
+    };
+    u8* pointer = impl->cpu_memory.GetPointer(cpu_addr);
+    u8* pointer_timestamp = impl->cpu_memory.GetPointer(cpu_addr + 8);
+    bool is_synced = !Settings::IsGPULevelHigh() && is_fence;
+
+    std::function<void()> operation([this, is_synced, streamer, query_base = query, query_location,
+                                     pointer, pointer_timestamp] {
+        if (True(query_base->flags & QueryFlagBits::IsInvalidated)) {
+            if (!is_synced) [[likely]] {
+                impl->pending_unregister.push_back(query_location);
+            }
+            return;
+        }
+        if (False(query_base->flags & QueryFlagBits::IsFinalValueSynced)) [[unlikely]] {
+            UNREACHABLE();
+            return;
+        }
+        query_base->value += streamer->GetAmmendValue();
+        streamer->SetAccumulationValue(query_base->value);
+        if (True(query_base->flags & QueryFlagBits::HasTimestamp)) {
+            u64 timestamp = impl->gpu.GetTicks();
+            std::memcpy(pointer_timestamp, &timestamp, sizeof(timestamp));
+            std::memcpy(pointer, &query_base->value, sizeof(query_base->value));
+        } else {
+            u32 value = static_cast<u32>(query_base->value);
+            std::memcpy(pointer, &value, sizeof(value));
+        }
+        if (!is_synced) [[likely]] {
+            impl->pending_unregister.push_back(query_location);
+        }
+    });
+    if (is_fence) {
+        impl->rasterizer.SignalFence(std::move(operation));
+    } else {
+        if (!Settings::IsGPULevelHigh() && counter_type == QueryType::Payload) {
+            if (has_timestamp) {
+                u64 timestamp = impl->gpu.GetTicks();
+                u64 value = static_cast<u64>(payload);
+                std::memcpy(pointer_timestamp, &timestamp, sizeof(timestamp));
+                std::memcpy(pointer, &value, sizeof(value));
+            } else {
+                std::memcpy(pointer, &payload, sizeof(payload));
+            }
+            streamer->Free(new_query_id);
+            return;
300 }
301 impl->rasterizer.SyncOperation(std::move(operation));
302 }
303 if (is_synced) {
304 streamer->Free(new_query_id);
305 return;
306 }
307 auto [cont_addr, base] = gen_caching_indexing(cpu_addr);
308 {
309 std::scoped_lock lock(cache_mutex);
310 auto it1 = cached_queries.try_emplace(cont_addr);
311 auto& sub_container = it1.first->second;
312 auto it_current = sub_container.find(base);
313 if (it_current == sub_container.end()) {
314 sub_container.insert_or_assign(base, query_location);
315 return;
316 }
317 auto* old_query = impl->ObtainQuery(it_current->second);
318 old_query->flags |= QueryFlagBits::IsRewritten;
319 sub_container.insert_or_assign(base, query_location);
320 }
321}
322
323template <typename Traits>
324void QueryCacheBase<Traits>::UnregisterPending() {
325 const auto gen_caching_indexing = [](VAddr cur_addr) {
326 return std::make_pair<u64, u32>(cur_addr >> Core::Memory::YUZU_PAGEBITS,
327 static_cast<u32>(cur_addr & Core::Memory::YUZU_PAGEMASK));
328 };
329 std::scoped_lock lock(cache_mutex);
330 for (QueryLocation loc : impl->pending_unregister) {
331 const auto [streamer_id, query_id] = loc.unpack();
332 auto* streamer = impl->streamers[streamer_id];
333 if (!streamer) [[unlikely]] {
334 continue;
335 }
336 auto* query = streamer->GetQuery(query_id);
337 auto [cont_addr, base] = gen_caching_indexing(query->guest_address);
338 auto it1 = cached_queries.find(cont_addr);
339 if (it1 != cached_queries.end()) {
340 auto it2 = it1->second.find(base);
341 if (it2 != it1->second.end()) {
342 if (it2->second.raw == loc.raw) {
343 it1->second.erase(it2);
344 }
345 }
346 }
347 streamer->Free(query_id);
348 }
349 impl->pending_unregister.clear();
350}
351
352template <typename Traits>
353void QueryCacheBase<Traits>::NotifyWFI() {
354 bool should_sync = false;
355 impl->ForEachStreamer(
356 [&should_sync](StreamerInterface* streamer) { should_sync |= streamer->HasPendingSync(); });
357 if (!should_sync) {
358 return;
359 }
360
361 impl->ForEachStreamer([](StreamerInterface* streamer) { streamer->PresyncWrites(); });
362 impl->runtime.Barriers(true);
363 impl->ForEachStreamer([](StreamerInterface* streamer) { streamer->SyncWrites(); });
364 impl->runtime.Barriers(false);
365}
366
367template <typename Traits>
368void QueryCacheBase<Traits>::NotifySegment(bool resume) {
369 if (resume) {
370 impl->runtime.ResumeHostConditionalRendering();
371 } else {
372 CounterClose(VideoCommon::QueryType::ZPassPixelCount64);
373 CounterClose(VideoCommon::QueryType::StreamingByteCount);
374 impl->runtime.PauseHostConditionalRendering();
375 }
376}
377
378template <typename Traits>
379bool QueryCacheBase<Traits>::AccelerateHostConditionalRendering() {
380 bool qc_dirty = false;
381 const auto gen_lookup = [this, &qc_dirty](GPUVAddr address) -> VideoCommon::LookupData {
382 auto cpu_addr_opt = gpu_memory->GpuToCpuAddress(address);
383 if (!cpu_addr_opt) [[unlikely]] {
384 return VideoCommon::LookupData{
385 .address = 0,
386 .found_query = nullptr,
387 };
388 }
389 VAddr cpu_addr = *cpu_addr_opt;
390 std::scoped_lock lock(cache_mutex);
391 auto it1 = cached_queries.find(cpu_addr >> Core::Memory::YUZU_PAGEBITS);
392 if (it1 == cached_queries.end()) {
393 return VideoCommon::LookupData{
394 .address = cpu_addr,
395 .found_query = nullptr,
396 };
397 }
398 auto& sub_container = it1->second;
399 auto it_current = sub_container.find(cpu_addr & Core::Memory::YUZU_PAGEMASK);
400
401 if (it_current == sub_container.end()) {
402 auto it_current_2 = sub_container.find((cpu_addr & Core::Memory::YUZU_PAGEMASK) + 4);
403 if (it_current_2 == sub_container.end()) {
404 return VideoCommon::LookupData{
405 .address = cpu_addr,
406 .found_query = nullptr,
407 };
408 }
 it_current = it_current_2;
409 }
410 auto* query = impl->ObtainQuery(it_current->second);
411 qc_dirty |= True(query->flags & QueryFlagBits::IsHostManaged) &&
412 False(query->flags & QueryFlagBits::IsGuestSynced);
413 return VideoCommon::LookupData{
414 .address = cpu_addr,
415 .found_query = query,
416 };
417 };
418
419 auto& regs = maxwell3d->regs;
420 if (regs.render_enable_override != Maxwell::Regs::RenderEnable::Override::UseRenderEnable) {
421 impl->runtime.EndHostConditionalRendering();
422 return false;
423 }
424 const ComparisonMode mode = static_cast<ComparisonMode>(regs.render_enable.mode);
425 const GPUVAddr address = regs.render_enable.Address();
426 switch (mode) {
427 case ComparisonMode::True:
428 impl->runtime.EndHostConditionalRendering();
429 return false;
430 case ComparisonMode::False:
431 impl->runtime.EndHostConditionalRendering();
432 return false;
433 case ComparisonMode::Conditional: {
434 VideoCommon::LookupData object_1{gen_lookup(address)};
435 return impl->runtime.HostConditionalRenderingCompareValue(object_1, qc_dirty);
436 }
437 case ComparisonMode::IfEqual: {
438 VideoCommon::LookupData object_1{gen_lookup(address)};
439 VideoCommon::LookupData object_2{gen_lookup(address + 16)};
440 return impl->runtime.HostConditionalRenderingCompareValues(object_1, object_2, qc_dirty,
441 true);
442 }
443 case ComparisonMode::IfNotEqual: {
444 VideoCommon::LookupData object_1{gen_lookup(address)};
445 VideoCommon::LookupData object_2{gen_lookup(address + 16)};
446 return impl->runtime.HostConditionalRenderingCompareValues(object_1, object_2, qc_dirty,
447 false);
448 }
449 default:
450 return false;
451 }
452}
453
454// Async downloads
455template <typename Traits>
456void QueryCacheBase<Traits>::CommitAsyncFlushes() {
457 // Make sure the results are synced on the host.
458 NotifyWFI();
459
460 u64 mask{};
461 {
462 std::scoped_lock lk(impl->flush_guard);
463 impl->ForEachStreamer([&mask](StreamerInterface* streamer) {
464 bool local_result = streamer->HasUnsyncedQueries();
465 if (local_result) {
466 mask |= 1ULL << streamer->GetId();
467 }
468 });
469 impl->flushes_pending.push_back(mask);
470 }
471 std::function<void()> func([this] { UnregisterPending(); });
472 impl->rasterizer.SyncOperation(std::move(func));
473 if (mask == 0) {
474 return;
475 }
476 u64 ran_mask = ~mask;
477 while (mask) {
478 impl->ForEachStreamerIn(mask, [&mask, &ran_mask](StreamerInterface* streamer) {
479 u64 dep_mask = streamer->GetDependenceMask();
480 if ((dep_mask & ~ran_mask) != 0) {
481 return;
482 }
483 u64 index = streamer->GetId();
484 ran_mask |= (1ULL << index);
485 mask &= ~(1ULL << index);
486 streamer->PushUnsyncedQueries();
487 });
488 }
489}
490
491template <typename Traits>
492bool QueryCacheBase<Traits>::HasUncommittedFlushes() const {
493 bool result = false;
494 impl->ForEachStreamer([&result](StreamerInterface* streamer) {
495 result |= streamer->HasUnsyncedQueries();
496 return result;
497 });
498 return result;
499}
500
501template <typename Traits>
502bool QueryCacheBase<Traits>::ShouldWaitAsyncFlushes() {
503 std::scoped_lock lk(impl->flush_guard);
504 return !impl->flushes_pending.empty() && impl->flushes_pending.front() != 0ULL;
505}
506
507template <typename Traits>
508void QueryCacheBase<Traits>::PopAsyncFlushes() {
509 u64 mask;
510 {
511 std::scoped_lock lk(impl->flush_guard);
512 mask = impl->flushes_pending.front();
513 impl->flushes_pending.pop_front();
514 }
515 if (mask == 0) {
516 return;
517 }
518 u64 ran_mask = ~mask;
519 while (mask) {
520 impl->ForEachStreamerIn(mask, [&mask, &ran_mask](StreamerInterface* streamer) {
521 u64 dep_mask = streamer->GetDependenceMask();
522 if ((dep_mask & ~ran_mask) != 0) {
523 return;
524 }
525 u64 index = streamer->GetId();
526 ran_mask |= (1ULL << index);
527 mask &= ~(1ULL << index);
528 streamer->PopUnsyncedQueries();
529 });
530 }
531}
532
533// Invalidation
534
535template <typename Traits>
536void QueryCacheBase<Traits>::InvalidateQuery(QueryCacheBase<Traits>::QueryLocation location) {
537 auto* query_base = impl->ObtainQuery(location);
538 if (!query_base) {
539 return;
540 }
541 query_base->flags |= QueryFlagBits::IsInvalidated;
542}
543
544template <typename Traits>
545bool QueryCacheBase<Traits>::IsQueryDirty(QueryCacheBase<Traits>::QueryLocation location) {
546 auto* query_base = impl->ObtainQuery(location);
547 if (!query_base) {
548 return false;
549 }
550 return True(query_base->flags & QueryFlagBits::IsHostManaged) &&
551 False(query_base->flags & QueryFlagBits::IsGuestSynced);
552}
553
554template <typename Traits>
555bool QueryCacheBase<Traits>::SemiFlushQueryDirty(QueryCacheBase<Traits>::QueryLocation location) {
556 auto* query_base = impl->ObtainQuery(location);
557 if (!query_base) {
558 return false;
559 }
560 if (True(query_base->flags & QueryFlagBits::IsFinalValueSynced) &&
561 False(query_base->flags & QueryFlagBits::IsGuestSynced)) {
562 auto* ptr = impl->cpu_memory.GetPointer(query_base->guest_address);
563 if (True(query_base->flags & QueryFlagBits::HasTimestamp)) {
564 std::memcpy(ptr, &query_base->value, sizeof(query_base->value));
565 return false;
566 }
567 u32 value_l = static_cast<u32>(query_base->value);
568 std::memcpy(ptr, &value_l, sizeof(value_l));
569 return false;
570 }
571 return True(query_base->flags & QueryFlagBits::IsHostManaged) &&
572 False(query_base->flags & QueryFlagBits::IsGuestSynced);
573}
574
575template <typename Traits>
576void QueryCacheBase<Traits>::RequestGuestHostSync() {
577 impl->rasterizer.ReleaseFences();
578}
579
580} // namespace VideoCommon
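The two flush passes above (CommitAsyncFlushes and PopAsyncFlushes) resolve streamer ordering as a small topological pass over bitmasks: a streamer is pushed or popped only once every streamer in its dependence mask has already run. A minimal standalone sketch of that loop, with plain integers standing in for streamers and a hypothetical, acyclic dep table:

// Standalone sketch of the dependency-mask pass; dep is a hypothetical
// dependence table (bit i set in dep[j] means streamer j waits on streamer i).
#include <array>
#include <bit>
#include <cstdint>
#include <cstdio>

int main() {
    constexpr std::array<std::uint64_t, 3> dep{0b000, 0b001, 0b011};
    std::uint64_t mask = 0b111;     // streamers with unsynced queries
    std::uint64_t ran_mask = ~mask; // anything outside the mask counts as already run
    while (mask != 0) {
        std::uint64_t pending = mask;
        while (pending != 0) {
            const int id = std::countr_zero(pending);
            pending &= pending - 1; // clear the bit we just visited
            if ((dep[id] & ~ran_mask) != 0) {
                continue; // a dependency has not run yet; retry on a later outer pass
            }
            ran_mask |= 1ULL << id;
            mask &= ~(1ULL << id);
            std::printf("push streamer %d\n", id); // prints 0, 1, 2 in dependency order
        }
    }
}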
diff --git a/src/video_core/query_cache/query_cache_base.h b/src/video_core/query_cache/query_cache_base.h
new file mode 100644
index 000000000..07be421c6
--- /dev/null
+++ b/src/video_core/query_cache/query_cache_base.h
@@ -0,0 +1,181 @@
1// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
2// SPDX-License-Identifier: GPL-3.0-or-later
3
4#pragma once
5
6#include <functional>
7#include <mutex>
8#include <optional>
9#include <span>
10#include <unordered_map>
11#include <utility>
12
13#include "common/assert.h"
14#include "common/bit_field.h"
15#include "common/common_types.h"
16#include "core/memory.h"
17#include "video_core/control/channel_state_cache.h"
18#include "video_core/query_cache/query_base.h"
19#include "video_core/query_cache/types.h"
20
21namespace Core::Memory {
22class Memory;
23}
24
25namespace VideoCore {
26class RasterizerInterface;
27}
28
29namespace Tegra {
30class GPU;
31}
32
33namespace VideoCommon {
34
35struct LookupData {
36 VAddr address;
37 QueryBase* found_query;
38};
39
40template <typename Traits>
41class QueryCacheBase : public VideoCommon::ChannelSetupCaches<VideoCommon::ChannelInfo> {
42 using RuntimeType = typename Traits::RuntimeType;
43
44public:
45 union QueryLocation {
46 BitField<27, 5, u32> stream_id;
47 BitField<0, 27, u32> query_id;
48 u32 raw;
49
50 std::pair<size_t, size_t> unpack() const {
51 return {static_cast<size_t>(stream_id.Value()), static_cast<size_t>(query_id.Value())};
52 }
53 };
54
55 explicit QueryCacheBase(Tegra::GPU& gpu_, VideoCore::RasterizerInterface& rasterizer_,
56 Core::Memory::Memory& cpu_memory_, RuntimeType& runtime_);
57
58 ~QueryCacheBase();
59
60 void InvalidateRegion(VAddr addr, std::size_t size) {
61 IterateCache<true>(addr, size,
62 [this](QueryLocation location) { InvalidateQuery(location); });
63 }
64
65 void FlushRegion(VAddr addr, std::size_t size) {
66 bool result = false;
67 IterateCache<false>(addr, size, [this, &result](QueryLocation location) {
68 result |= SemiFlushQueryDirty(location);
69 return result;
70 });
71 if (result) {
72 RequestGuestHostSync();
73 }
74 }
75
76 static u64 BuildMask(std::span<const QueryType> types) {
77 u64 mask = 0;
78 for (auto query_type : types) {
79 mask |= 1ULL << (static_cast<u64>(query_type));
80 }
81 return mask;
82 }
83
84 /// Return true when a CPU region is modified from the GPU
85 [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size) {
86 bool result = false;
87 IterateCache<false>(addr, size, [this, &result](QueryLocation location) {
88 result |= IsQueryDirty(location);
89 return result;
90 });
91 return result;
92 }
93
94 void CounterEnable(QueryType counter_type, bool is_enabled);
95
96 void CounterReset(QueryType counter_type);
97
98 void CounterClose(QueryType counter_type);
99
100 void CounterReport(GPUVAddr addr, QueryType counter_type, QueryPropertiesFlags flags,
101 u32 payload, u32 subreport);
102
103 void NotifyWFI();
104
105 bool AccelerateHostConditionalRendering();
106
107 // Async downloads
108 void CommitAsyncFlushes();
109
110 bool HasUncommittedFlushes() const;
111
112 bool ShouldWaitAsyncFlushes();
113
114 void PopAsyncFlushes();
115
116 void NotifySegment(bool resume);
117
118 void BindToChannel(s32 id) override;
119
120protected:
121 template <bool remove_from_cache, typename Func>
122 void IterateCache(VAddr addr, std::size_t size, Func&& func) {
123 static constexpr bool RETURNS_BOOL =
124 std::is_same_v<std::invoke_result_t<Func, QueryLocation>, bool>;
125 const u64 addr_begin = addr;
126 const u64 addr_end = addr_begin + size;
127
128 const u64 page_end = addr_end >> Core::Memory::YUZU_PAGEBITS;
129 std::scoped_lock lock(cache_mutex);
130 for (u64 page = addr_begin >> Core::Memory::YUZU_PAGEBITS; page <= page_end; ++page) {
131 const u64 page_start = page << Core::Memory::YUZU_PAGEBITS;
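 // A cached query occupies sizeof(u32) bytes at its in-page offset; two
 // half-open ranges overlap iff each one begins before the other ends.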
132 const auto in_range = [page_start, addr_begin, addr_end](const u32 query_location) {
133 const u64 cache_begin = page_start + query_location;
134 const u64 cache_end = cache_begin + sizeof(u32);
135 return cache_begin < addr_end && addr_begin < cache_end;
136 };
137 const auto& it = cached_queries.find(page);
138 if (it == std::end(cached_queries)) {
139 continue;
140 }
141 auto& contents = it->second;
142 for (auto& query : contents) {
143 if (!in_range(query.first)) {
144 continue;
145 }
146 if constexpr (RETURNS_BOOL) {
147 if (func(query.second)) {
148 return;
149 }
150 } else {
151 func(query.second);
152 }
153 }
154 if constexpr (remove_from_cache) {
155 const auto in_range2 = [&](const std::pair<u32, QueryLocation>& pair) {
156 return in_range(pair.first);
157 };
158 std::erase_if(contents, in_range2);
159 }
160 }
161 }
162
163 using ContentCache = std::unordered_map<u64, std::unordered_map<u32, QueryLocation>>;
164
165 void InvalidateQuery(QueryLocation location);
166 bool IsQueryDirty(QueryLocation location);
167 bool SemiFlushQueryDirty(QueryLocation location);
168 void RequestGuestHostSync();
169 void UnregisterPending();
170
171 ContentCache cached_queries;
172 std::mutex cache_mutex;
173
174 struct QueryCacheBaseImpl;
175 friend struct QueryCacheBaseImpl;
176 friend RuntimeType;
177
178 std::unique_ptr<QueryCacheBaseImpl> impl;
179};
180
181} // namespace VideoCommon
\ No newline at end of file
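QueryLocation packs a streamer index and a per-streamer query index into a single u32 through the BitFields above: the five high bits select the stream, the low 27 bits index the query. The same layout written with explicit shifts, as a self-contained sketch:

// Same 5/27-bit split as QueryLocation, with explicit shifts instead of BitField.
#include <cstdint>
#include <cstdio>
#include <utility>

constexpr std::uint32_t Pack(std::uint32_t stream_id, std::uint32_t query_id) {
    return (stream_id << 27) | (query_id & 0x07FF'FFFFu);
}
constexpr std::pair<std::uint32_t, std::uint32_t> Unpack(std::uint32_t raw) {
    return {raw >> 27, raw & 0x07FF'FFFFu};
}

int main() {
    const auto [stream, query] = Unpack(Pack(3, 4096));
    std::printf("stream=%u query=%u\n", stream, query); // stream=3 query=4096
}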
diff --git a/src/video_core/query_cache/query_stream.h b/src/video_core/query_cache/query_stream.h
new file mode 100644
index 000000000..39da6ac07
--- /dev/null
+++ b/src/video_core/query_cache/query_stream.h
@@ -0,0 +1,149 @@
1// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
2// SPDX-License-Identifier: GPL-3.0-or-later
3
4#pragma once
5
6#include <deque>
7#include <optional>
8#include <vector>
9
10#include "common/assert.h"
11#include "common/common_types.h"
12#include "video_core/query_cache/bank_base.h"
13#include "video_core/query_cache/query_base.h"
14
15namespace VideoCommon {
16
17class StreamerInterface {
18public:
19 explicit StreamerInterface(size_t id_) : id{id_}, dependence_mask{}, dependent_mask{} {}
20 virtual ~StreamerInterface() = default;
21
22 virtual QueryBase* GetQuery(size_t id) = 0;
23
24 virtual void StartCounter() {
25 /* Do Nothing */
26 }
27
28 virtual void PauseCounter() {
29 /* Do Nothing */
30 }
31
32 virtual void ResetCounter() {
33 /* Do Nothing */
34 }
35
36 virtual void CloseCounter() {
37 /* Do Nothing */
38 }
39
40 virtual bool HasPendingSync() const {
41 return false;
42 }
43
44 virtual void PresyncWrites() {
45 /* Do Nothing */
46 }
47
48 virtual void SyncWrites() {
49 /* Do Nothing */
50 }
51
52 virtual size_t WriteCounter(VAddr address, bool has_timestamp, u32 value,
53 std::optional<u32> subreport = std::nullopt) = 0;
54
55 virtual bool HasUnsyncedQueries() const {
56 return false;
57 }
58
59 virtual void PushUnsyncedQueries() {
60 /* Do Nothing */
61 }
62
63 virtual void PopUnsyncedQueries() {
64 /* Do Nothing */
65 }
66
67 virtual void Free(size_t query_id) = 0;
68
69 size_t GetId() const {
70 return id;
71 }
72
73 u64 GetDependenceMask() const {
74 return dependence_mask;
75 }
76
77 u64 GetDependentMask() const {
78 return dependent_mask;
79 }
80
81 u64 GetAmmendValue() const {
82 return ammend_value;
83 }
84
85 void SetAccumulationValue(u64 new_value) {
86 acumulation_value = new_value;
87 }
88
89protected:
90 void MakeDependent(StreamerInterface* depend_on) {
91 dependence_mask |= 1ULL << depend_on->id;
92 depend_on->dependent_mask |= 1ULL << id;
93 }
94
95 const size_t id;
96 u64 dependence_mask;
97 u64 dependent_mask;
98 u64 ammend_value{};
99 u64 acumulation_value{};
100};
101
102template <typename QueryType>
103class SimpleStreamer : public StreamerInterface {
104public:
105 explicit SimpleStreamer(size_t id_) : StreamerInterface{id_} {}
106 virtual ~SimpleStreamer() = default;
107
108protected:
109 virtual QueryType* GetQuery(size_t query_id) override {
110 if (query_id < slot_queries.size()) {
111 return &slot_queries[query_id];
112 }
113 return nullptr;
114 }
115
116 virtual void Free(size_t query_id) override {
117 std::scoped_lock lk(guard);
118 ReleaseQuery(query_id);
119 }
120
121 template <typename... Args, typename = decltype(QueryType(std::declval<Args>()...))>
122 size_t BuildQuery(Args&&... args) {
123 std::scoped_lock lk(guard);
124 if (!old_queries.empty()) {
125 size_t new_id = old_queries.front();
126 old_queries.pop_front();
127 new (&slot_queries[new_id]) QueryType(std::forward<Args>(args)...);
128 return new_id;
129 }
130 size_t new_id = slot_queries.size();
131 slot_queries.emplace_back(std::forward<Args>(args)...);
132 return new_id;
133 }
134
135 void ReleaseQuery(size_t query_id) {
136
137 if (query_id < slot_queries.size()) {
138 old_queries.push_back(query_id);
139 return;
140 }
141 UNREACHABLE();
142 }
143
144 std::mutex guard;
145 std::deque<QueryType> slot_queries;
146 std::deque<size_t> old_queries;
147};
148
149} // namespace VideoCommon
\ No newline at end of file
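SimpleStreamer recycles query slots: Free hands the index back through ReleaseQuery, and BuildQuery placement-constructs into a freed slot before growing the deque. A reduced sketch of that free list, with a stand-in query type:

// Reduced sketch of SimpleStreamer's slot recycling; DummyQuery is a stand-in.
#include <cstddef>
#include <cstdio>
#include <deque>

struct DummyQuery {
    int payload;
};

std::deque<DummyQuery> slot_queries;
std::deque<std::size_t> old_queries;

std::size_t BuildQuery(int payload) {
    if (!old_queries.empty()) {
        const std::size_t id = old_queries.front();
        old_queries.pop_front();
        slot_queries[id] = DummyQuery{payload}; // the real code placement-news in place
        return id;
    }
    slot_queries.push_back(DummyQuery{payload});
    return slot_queries.size() - 1;
}

void ReleaseQuery(std::size_t id) {
    old_queries.push_back(id);
}

int main() {
    const std::size_t a = BuildQuery(1);
    ReleaseQuery(a);
    const std::size_t b = BuildQuery(2); // reuses slot 0 instead of growing
    std::printf("a=%zu b=%zu slots=%zu\n", a, b, slot_queries.size()); // a=0 b=0 slots=1
}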
diff --git a/src/video_core/query_cache/types.h b/src/video_core/query_cache/types.h
new file mode 100644
index 000000000..e9226bbfc
--- /dev/null
+++ b/src/video_core/query_cache/types.h
@@ -0,0 +1,74 @@
1// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
2// SPDX-License-Identifier: GPL-3.0-or-later
3
4#pragma once
5
6#include "common/common_funcs.h"
7#include "common/common_types.h"
8
9namespace VideoCommon {
10
11enum class QueryPropertiesFlags : u32 {
12 HasTimeout = 1 << 0,
13 IsAFence = 1 << 1,
14};
15DECLARE_ENUM_FLAG_OPERATORS(QueryPropertiesFlags)
16
17// This must be kept in sync with the Maxwell3D semaphore report type encoding
18enum class QueryType : u32 {
19 Payload = 0, // "None" in docs, but confirmed via hardware to return the payload
20 VerticesGenerated = 1,
21 ZPassPixelCount = 2,
22 PrimitivesGenerated = 3,
23 AlphaBetaClocks = 4,
24 VertexShaderInvocations = 5,
25 StreamingPrimitivesNeededMinusSucceeded = 6,
26 GeometryShaderInvocations = 7,
27 GeometryShaderPrimitivesGenerated = 9,
28 ZCullStats0 = 10,
29 StreamingPrimitivesSucceeded = 11,
30 ZCullStats1 = 12,
31 StreamingPrimitivesNeeded = 13,
32 ZCullStats2 = 14,
33 ClipperInvocations = 15,
34 ZCullStats3 = 16,
35 ClipperPrimitivesGenerated = 17,
36 VtgPrimitivesOut = 18,
37 PixelShaderInvocations = 19,
38 ZPassPixelCount64 = 21,
39 IEEECleanColorTarget = 24,
40 IEEECleanZetaTarget = 25,
41 StreamingByteCount = 26,
42 TessellationInitInvocations = 27,
43 BoundingRectangle = 28,
44 TessellationShaderInvocations = 29,
45 TotalStreamingPrimitivesNeededMinusSucceeded = 30,
46 TessellationShaderPrimitivesGenerated = 31,
47 // max.
48 MaxQueryTypes,
49};
50
51// Comparison modes for Host Conditional Rendering
52enum class ComparisonMode : u32 {
53 False = 0,
54 True = 1,
55 Conditional = 2,
56 IfEqual = 3,
57 IfNotEqual = 4,
58 MaxComparisonMode,
59};
60
61// Reduction ops.
62enum class ReductionOp : u32 {
63 RedAdd = 0,
64 RedMin = 1,
65 RedMax = 2,
66 RedInc = 3,
67 RedDec = 4,
68 RedAnd = 5,
69 RedOr = 6,
70 RedXor = 7,
71 MaxReductionOp,
72};
73
74} // namespace VideoCommon
\ No newline at end of file
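QueryPropertiesFlags is tested with the True()/False() helpers from common/common_funcs.h throughout the cache. A self-contained sketch of that pattern, with the operator and helper re-declared locally only so the example compiles on its own:

// The real operator& and True() come from DECLARE_ENUM_FLAG_OPERATORS and
// common/common_funcs.h; they are reimplemented here to keep this standalone.
#include <cstdio>

enum class QueryPropertiesFlags : unsigned {
    HasTimeout = 1u << 0,
    IsAFence = 1u << 1,
};
constexpr QueryPropertiesFlags operator&(QueryPropertiesFlags a, QueryPropertiesFlags b) {
    return static_cast<QueryPropertiesFlags>(static_cast<unsigned>(a) & static_cast<unsigned>(b));
}
constexpr bool True(QueryPropertiesFlags f) {
    return static_cast<unsigned>(f) != 0;
}

int main() {
    const auto flags = QueryPropertiesFlags::HasTimeout;
    std::printf("timestamped=%d fence=%d\n",
                True(flags & QueryPropertiesFlags::HasTimeout) ? 1 : 0,
                True(flags & QueryPropertiesFlags::IsAFence) ? 1 : 0); // 1, 0
}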
diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h
index cb8029a4f..af1469147 100644
--- a/src/video_core/rasterizer_interface.h
+++ b/src/video_core/rasterizer_interface.h
@@ -12,6 +12,7 @@
12#include "video_core/cache_types.h" 12#include "video_core/cache_types.h"
13#include "video_core/engines/fermi_2d.h" 13#include "video_core/engines/fermi_2d.h"
14#include "video_core/gpu.h" 14#include "video_core/gpu.h"
15#include "video_core/query_cache/types.h"
15#include "video_core/rasterizer_download_area.h" 16#include "video_core/rasterizer_download_area.h"
16 17
17namespace Tegra { 18namespace Tegra {
@@ -26,11 +27,6 @@ struct ChannelState;
26 27
27namespace VideoCore { 28namespace VideoCore {
28 29
29enum class QueryType {
30 SamplesPassed,
31};
32constexpr std::size_t NumQueryTypes = 1;
33
34enum class LoadCallbackStage { 30enum class LoadCallbackStage {
35 Prepare, 31 Prepare,
36 Build, 32 Build,
@@ -58,10 +54,11 @@ public:
58 virtual void DispatchCompute() = 0; 54 virtual void DispatchCompute() = 0;
59 55
60 /// Resets the counter of a query 56 /// Resets the counter of a query
61 virtual void ResetCounter(QueryType type) = 0; 57 virtual void ResetCounter(VideoCommon::QueryType type) = 0;
62 58
63 /// Records a GPU query and caches it 59 /// Records a GPU query and caches it
64 virtual void Query(GPUVAddr gpu_addr, QueryType type, std::optional<u64> timestamp) = 0; 60 virtual void Query(GPUVAddr gpu_addr, VideoCommon::QueryType type,
61 VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) = 0;
65 62
66 /// Signal an uniform buffer binding 63 /// Signal an uniform buffer binding
67 virtual void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, 64 virtual void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr,
@@ -83,7 +80,7 @@ public:
83 virtual void SignalReference() = 0; 80 virtual void SignalReference() = 0;
84 81
85 /// Release all pending fences. 82 /// Release all pending fences.
86 virtual void ReleaseFences() = 0; 83 virtual void ReleaseFences(bool force = true) = 0;
87 84
88 /// Notify rasterizer that all caches should be flushed to Switch memory 85 /// Notify rasterizer that all caches should be flushed to Switch memory
89 virtual void FlushAll() = 0; 86 virtual void FlushAll() = 0;
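The reworked Query entry point takes the raw semaphore payload plus property flags instead of a pre-resolved timestamp. A hypothetical call site, assuming only the interface declared above (ReportSemaphore and its arguments are illustrative, not part of this change):

// Hypothetical caller; only the Query signature is taken from this change.
#include "video_core/query_cache/types.h"
#include "video_core/rasterizer_interface.h"

void ReportSemaphore(VideoCore::RasterizerInterface& rasterizer, GPUVAddr gpu_addr) {
    using namespace VideoCommon;
    rasterizer.Query(gpu_addr, QueryType::Payload,
                     QueryPropertiesFlags::HasTimeout | QueryPropertiesFlags::IsAFence,
                     /*payload=*/1U, /*subreport=*/0U);
}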
diff --git a/src/video_core/renderer_null/null_rasterizer.cpp b/src/video_core/renderer_null/null_rasterizer.cpp
index 92ecf6682..65cd5aa06 100644
--- a/src/video_core/renderer_null/null_rasterizer.cpp
+++ b/src/video_core/renderer_null/null_rasterizer.cpp
@@ -26,16 +26,18 @@ void RasterizerNull::Draw(bool is_indexed, u32 instance_count) {}
26void RasterizerNull::DrawTexture() {} 26void RasterizerNull::DrawTexture() {}
27void RasterizerNull::Clear(u32 layer_count) {} 27void RasterizerNull::Clear(u32 layer_count) {}
28void RasterizerNull::DispatchCompute() {} 28void RasterizerNull::DispatchCompute() {}
29void RasterizerNull::ResetCounter(VideoCore::QueryType type) {} 29void RasterizerNull::ResetCounter(VideoCommon::QueryType type) {}
30void RasterizerNull::Query(GPUVAddr gpu_addr, VideoCore::QueryType type, 30void RasterizerNull::Query(GPUVAddr gpu_addr, VideoCommon::QueryType type,
31 std::optional<u64> timestamp) { 31 VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) {
32 if (!gpu_memory) { 32 if (!gpu_memory) {
33 return; 33 return;
34 } 34 }
35 35 if (True(flags & VideoCommon::QueryPropertiesFlags::HasTimeout)) {
36 gpu_memory->Write(gpu_addr, u64{0}); 36 u64 ticks = m_gpu.GetTicks();
37 if (timestamp) { 37 gpu_memory->Write<u64>(gpu_addr + 8, ticks);
38 gpu_memory->Write(gpu_addr + 8, *timestamp); 38 gpu_memory->Write<u64>(gpu_addr, static_cast<u64>(payload));
39 } else {
40 gpu_memory->Write<u32>(gpu_addr, payload);
39 } 41 }
40} 42}
41void RasterizerNull::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, 43void RasterizerNull::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr,
@@ -74,7 +76,7 @@ void RasterizerNull::SignalSyncPoint(u32 value) {
74 syncpoint_manager.IncrementHost(value); 76 syncpoint_manager.IncrementHost(value);
75} 77}
76void RasterizerNull::SignalReference() {} 78void RasterizerNull::SignalReference() {}
77void RasterizerNull::ReleaseFences() {} 79void RasterizerNull::ReleaseFences(bool) {}
78void RasterizerNull::FlushAndInvalidateRegion(VAddr addr, u64 size, VideoCommon::CacheType) {} 80void RasterizerNull::FlushAndInvalidateRegion(VAddr addr, u64 size, VideoCommon::CacheType) {}
79void RasterizerNull::WaitForIdle() {} 81void RasterizerNull::WaitForIdle() {}
80void RasterizerNull::FragmentBarrier() {} 82void RasterizerNull::FragmentBarrier() {}
diff --git a/src/video_core/renderer_null/null_rasterizer.h b/src/video_core/renderer_null/null_rasterizer.h
index 93b9a6971..23001eeb8 100644
--- a/src/video_core/renderer_null/null_rasterizer.h
+++ b/src/video_core/renderer_null/null_rasterizer.h
@@ -42,8 +42,9 @@ public:
42 void DrawTexture() override; 42 void DrawTexture() override;
43 void Clear(u32 layer_count) override; 43 void Clear(u32 layer_count) override;
44 void DispatchCompute() override; 44 void DispatchCompute() override;
45 void ResetCounter(VideoCore::QueryType type) override; 45 void ResetCounter(VideoCommon::QueryType type) override;
46 void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) override; 46 void Query(GPUVAddr gpu_addr, VideoCommon::QueryType type,
47 VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) override;
47 void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override; 48 void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override;
48 void DisableGraphicsUniformBuffer(size_t stage, u32 index) override; 49 void DisableGraphicsUniformBuffer(size_t stage, u32 index) override;
49 void FlushAll() override; 50 void FlushAll() override;
@@ -63,7 +64,7 @@ public:
63 void SyncOperation(std::function<void()>&& func) override; 64 void SyncOperation(std::function<void()>&& func) override;
64 void SignalSyncPoint(u32 value) override; 65 void SignalSyncPoint(u32 value) override;
65 void SignalReference() override; 66 void SignalReference() override;
66 void ReleaseFences() override; 67 void ReleaseFences(bool force) override;
67 void FlushAndInvalidateRegion( 68 void FlushAndInvalidateRegion(
68 VAddr addr, u64 size, VideoCommon::CacheType which = VideoCommon::CacheType::All) override; 69 VAddr addr, u64 size, VideoCommon::CacheType which = VideoCommon::CacheType::All) override;
69 void WaitForIdle() override; 70 void WaitForIdle() override;
diff --git a/src/video_core/renderer_opengl/gl_query_cache.cpp b/src/video_core/renderer_opengl/gl_query_cache.cpp
index 99d7347f5..ec142d48e 100644
--- a/src/video_core/renderer_opengl/gl_query_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_query_cache.cpp
@@ -27,7 +27,7 @@ constexpr GLenum GetTarget(VideoCore::QueryType type) {
27} // Anonymous namespace 27} // Anonymous namespace
28 28
29QueryCache::QueryCache(RasterizerOpenGL& rasterizer_, Core::Memory::Memory& cpu_memory_) 29QueryCache::QueryCache(RasterizerOpenGL& rasterizer_, Core::Memory::Memory& cpu_memory_)
30 : QueryCacheBase(rasterizer_, cpu_memory_), gl_rasterizer{rasterizer_} {} 30 : QueryCacheLegacy(rasterizer_, cpu_memory_), gl_rasterizer{rasterizer_} {}
31 31
32QueryCache::~QueryCache() = default; 32QueryCache::~QueryCache() = default;
33 33
diff --git a/src/video_core/renderer_opengl/gl_query_cache.h b/src/video_core/renderer_opengl/gl_query_cache.h
index 872513f22..0721e0b3d 100644
--- a/src/video_core/renderer_opengl/gl_query_cache.h
+++ b/src/video_core/renderer_opengl/gl_query_cache.h
@@ -26,7 +26,7 @@ class RasterizerOpenGL;
26using CounterStream = VideoCommon::CounterStreamBase<QueryCache, HostCounter>; 26using CounterStream = VideoCommon::CounterStreamBase<QueryCache, HostCounter>;
27 27
28class QueryCache final 28class QueryCache final
29 : public VideoCommon::QueryCacheBase<QueryCache, CachedQuery, CounterStream, HostCounter> { 29 : public VideoCommon::QueryCacheLegacy<QueryCache, CachedQuery, CounterStream, HostCounter> {
30public: 30public:
31 explicit QueryCache(RasterizerOpenGL& rasterizer_, Core::Memory::Memory& cpu_memory_); 31 explicit QueryCache(RasterizerOpenGL& rasterizer_, Core::Memory::Memory& cpu_memory_);
32 ~QueryCache(); 32 ~QueryCache();
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index dd03efecd..27e2de1bf 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -396,13 +396,39 @@ void RasterizerOpenGL::DispatchCompute() {
396 has_written_global_memory |= pipeline->WritesGlobalMemory(); 396 has_written_global_memory |= pipeline->WritesGlobalMemory();
397} 397}
398 398
399void RasterizerOpenGL::ResetCounter(VideoCore::QueryType type) { 399void RasterizerOpenGL::ResetCounter(VideoCommon::QueryType type) {
400 query_cache.ResetCounter(type); 400 if (type == VideoCommon::QueryType::ZPassPixelCount64) {
401 query_cache.ResetCounter(VideoCore::QueryType::SamplesPassed);
402 }
401} 403}
402 404
403void RasterizerOpenGL::Query(GPUVAddr gpu_addr, VideoCore::QueryType type, 405void RasterizerOpenGL::Query(GPUVAddr gpu_addr, VideoCommon::QueryType type,
404 std::optional<u64> timestamp) { 406 VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) {
405 query_cache.Query(gpu_addr, type, timestamp); 407 if (type == VideoCommon::QueryType::ZPassPixelCount64) {
408 if (True(flags & VideoCommon::QueryPropertiesFlags::HasTimeout)) {
409 query_cache.Query(gpu_addr, VideoCore::QueryType::SamplesPassed, {gpu.GetTicks()});
410 } else {
411 query_cache.Query(gpu_addr, VideoCore::QueryType::SamplesPassed, std::nullopt);
412 }
413 return;
414 }
415 if (type != VideoCommon::QueryType::Payload) {
416 payload = 1u;
417 }
418 std::function<void()> func([this, gpu_addr, flags, memory_manager = gpu_memory, payload]() {
419 if (True(flags & VideoCommon::QueryPropertiesFlags::HasTimeout)) {
420 u64 ticks = gpu.GetTicks();
421 memory_manager->Write<u64>(gpu_addr + 8, ticks);
422 memory_manager->Write<u64>(gpu_addr, static_cast<u64>(payload));
423 } else {
424 memory_manager->Write<u32>(gpu_addr, payload);
425 }
426 });
427 if (True(flags & VideoCommon::QueryPropertiesFlags::IsAFence)) {
428 SignalFence(std::move(func));
429 return;
430 }
431 func();
406} 432}
407 433
408void RasterizerOpenGL::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, 434void RasterizerOpenGL::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr,
@@ -573,8 +599,8 @@ void RasterizerOpenGL::SignalReference() {
573 fence_manager.SignalOrdering(); 599 fence_manager.SignalOrdering();
574} 600}
575 601
576void RasterizerOpenGL::ReleaseFences() { 602void RasterizerOpenGL::ReleaseFences(bool force) {
577 fence_manager.WaitPendingFences(); 603 fence_manager.WaitPendingFences(force);
578} 604}
579 605
580void RasterizerOpenGL::FlushAndInvalidateRegion(VAddr addr, u64 size, 606void RasterizerOpenGL::FlushAndInvalidateRegion(VAddr addr, u64 size,
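Both fallback paths above write the guest report in place: a 64-bit result at offset 0 plus a 64-bit timestamp at offset 8 when HasTimeout is set, otherwise a single 32-bit payload. The layout in isolation:

// The guest-visible report layout written by the fallback paths above.
#include <cstdint>
#include <cstring>

void WriteReport(std::uint8_t* guest, std::uint64_t value, std::uint64_t ticks,
                 bool timestamped) {
    if (timestamped) {
        std::memcpy(guest, &value, sizeof(value));     // 64-bit result at +0
        std::memcpy(guest + 8, &ticks, sizeof(ticks)); // 64-bit timestamp at +8
    } else {
        const std::uint32_t small = static_cast<std::uint32_t>(value);
        std::memcpy(guest, &small, sizeof(small)); // 32-bit payload at +0
    }
}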
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index 8eda2ddba..ceffe1f1e 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -86,8 +86,9 @@ public:
86 void DrawTexture() override; 86 void DrawTexture() override;
87 void Clear(u32 layer_count) override; 87 void Clear(u32 layer_count) override;
88 void DispatchCompute() override; 88 void DispatchCompute() override;
89 void ResetCounter(VideoCore::QueryType type) override; 89 void ResetCounter(VideoCommon::QueryType type) override;
90 void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) override; 90 void Query(GPUVAddr gpu_addr, VideoCommon::QueryType type,
91 VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) override;
91 void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override; 92 void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override;
92 void DisableGraphicsUniformBuffer(size_t stage, u32 index) override; 93 void DisableGraphicsUniformBuffer(size_t stage, u32 index) override;
93 void FlushAll() override; 94 void FlushAll() override;
@@ -107,7 +108,7 @@ public:
107 void SyncOperation(std::function<void()>&& func) override; 108 void SyncOperation(std::function<void()>&& func) override;
108 void SignalSyncPoint(u32 value) override; 109 void SignalSyncPoint(u32 value) override;
109 void SignalReference() override; 110 void SignalReference() override;
110 void ReleaseFences() override; 111 void ReleaseFences(bool force = true) override;
111 void FlushAndInvalidateRegion( 112 void FlushAndInvalidateRegion(
112 VAddr addr, u64 size, VideoCommon::CacheType which = VideoCommon::CacheType::All) override; 113 VAddr addr, u64 size, VideoCommon::CacheType which = VideoCommon::CacheType::All) override;
113 void WaitForIdle() override; 114 void WaitForIdle() override;
diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
index e15865d16..d8148e89a 100644
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
@@ -61,6 +61,9 @@ vk::Buffer CreateBuffer(const Device& device, const MemoryAllocator& memory_allo
61 if (device.IsExtTransformFeedbackSupported()) { 61 if (device.IsExtTransformFeedbackSupported()) {
62 flags |= VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_BUFFER_BIT_EXT; 62 flags |= VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_BUFFER_BIT_EXT;
63 } 63 }
64 if (device.IsExtConditionalRendering()) {
65 flags |= VK_BUFFER_USAGE_CONDITIONAL_RENDERING_BIT_EXT;
66 }
64 const VkBufferCreateInfo buffer_ci = { 67 const VkBufferCreateInfo buffer_ci = {
65 .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, 68 .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
66 .pNext = nullptr, 69 .pNext = nullptr,
diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp
index 54ee030ce..289d5b25c 100644
--- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp
+++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp
@@ -12,6 +12,9 @@
12#include "common/common_types.h" 12#include "common/common_types.h"
13#include "common/div_ceil.h" 13#include "common/div_ceil.h"
14#include "video_core/host_shaders/astc_decoder_comp_spv.h" 14#include "video_core/host_shaders/astc_decoder_comp_spv.h"
15#include "video_core/host_shaders/queries_prefix_scan_sum_comp_spv.h"
16#include "video_core/host_shaders/queries_prefix_scan_sum_nosubgroups_comp_spv.h"
17#include "video_core/host_shaders/resolve_conditional_render_comp_spv.h"
15#include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h" 18#include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h"
16#include "video_core/host_shaders/vulkan_uint8_comp_spv.h" 19#include "video_core/host_shaders/vulkan_uint8_comp_spv.h"
17#include "video_core/renderer_vulkan/vk_compute_pass.h" 20#include "video_core/renderer_vulkan/vk_compute_pass.h"
@@ -57,6 +60,30 @@ constexpr std::array<VkDescriptorSetLayoutBinding, 2> INPUT_OUTPUT_DESCRIPTOR_SE
57 }, 60 },
58}}; 61}};
59 62
63constexpr std::array<VkDescriptorSetLayoutBinding, 3> QUERIES_SCAN_DESCRIPTOR_SET_BINDINGS{{
64 {
65 .binding = 0,
66 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
67 .descriptorCount = 1,
68 .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
69 .pImmutableSamplers = nullptr,
70 },
71 {
72 .binding = 1,
73 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
74 .descriptorCount = 1,
75 .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
76 .pImmutableSamplers = nullptr,
77 },
78 {
79 .binding = 2,
80 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
81 .descriptorCount = 1,
82 .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
83 .pImmutableSamplers = nullptr,
84 },
85}};
86
60constexpr DescriptorBankInfo INPUT_OUTPUT_BANK_INFO{ 87constexpr DescriptorBankInfo INPUT_OUTPUT_BANK_INFO{
61 .uniform_buffers = 0, 88 .uniform_buffers = 0,
62 .storage_buffers = 2, 89 .storage_buffers = 2,
@@ -67,6 +94,16 @@ constexpr DescriptorBankInfo INPUT_OUTPUT_BANK_INFO{
67 .score = 2, 94 .score = 2,
68}; 95};
69 96
97constexpr DescriptorBankInfo QUERIES_SCAN_BANK_INFO{
98 .uniform_buffers = 0,
99 .storage_buffers = 3,
100 .texture_buffers = 0,
101 .image_buffers = 0,
102 .textures = 0,
103 .images = 0,
104 .score = 3,
105};
106
70constexpr std::array<VkDescriptorSetLayoutBinding, ASTC_NUM_BINDINGS> ASTC_DESCRIPTOR_SET_BINDINGS{{ 107constexpr std::array<VkDescriptorSetLayoutBinding, ASTC_NUM_BINDINGS> ASTC_DESCRIPTOR_SET_BINDINGS{{
71 { 108 {
72 .binding = ASTC_BINDING_INPUT_BUFFER, 109 .binding = ASTC_BINDING_INPUT_BUFFER,
@@ -103,6 +140,15 @@ constexpr VkDescriptorUpdateTemplateEntry INPUT_OUTPUT_DESCRIPTOR_UPDATE_TEMPLAT
103 .stride = sizeof(DescriptorUpdateEntry), 140 .stride = sizeof(DescriptorUpdateEntry),
104}; 141};
105 142
143constexpr VkDescriptorUpdateTemplateEntry QUERIES_SCAN_DESCRIPTOR_UPDATE_TEMPLATE{
144 .dstBinding = 0,
145 .dstArrayElement = 0,
146 .descriptorCount = 3,
147 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
148 .offset = 0,
149 .stride = sizeof(DescriptorUpdateEntry),
150};
151
106constexpr std::array<VkDescriptorUpdateTemplateEntry, ASTC_NUM_BINDINGS> 152constexpr std::array<VkDescriptorUpdateTemplateEntry, ASTC_NUM_BINDINGS>
107 ASTC_PASS_DESCRIPTOR_UPDATE_TEMPLATE_ENTRY{{ 153 ASTC_PASS_DESCRIPTOR_UPDATE_TEMPLATE_ENTRY{{
108 { 154 {
@@ -131,13 +177,21 @@ struct AstcPushConstants {
131 u32 block_height; 177 u32 block_height;
132 u32 block_height_mask; 178 u32 block_height_mask;
133}; 179};
180
181struct QueriesPrefixScanPushConstants {
182 u32 min_accumulation_base;
183 u32 max_accumulation_base;
184 u32 accumulation_limit;
185 u32 buffer_offset;
186};
134} // Anonymous namespace 187} // Anonymous namespace
135 188
136ComputePass::ComputePass(const Device& device_, DescriptorPool& descriptor_pool, 189ComputePass::ComputePass(const Device& device_, DescriptorPool& descriptor_pool,
137 vk::Span<VkDescriptorSetLayoutBinding> bindings, 190 vk::Span<VkDescriptorSetLayoutBinding> bindings,
138 vk::Span<VkDescriptorUpdateTemplateEntry> templates, 191 vk::Span<VkDescriptorUpdateTemplateEntry> templates,
139 const DescriptorBankInfo& bank_info, 192 const DescriptorBankInfo& bank_info,
140 vk::Span<VkPushConstantRange> push_constants, std::span<const u32> code) 193 vk::Span<VkPushConstantRange> push_constants, std::span<const u32> code,
194 std::optional<u32> optional_subgroup_size)
141 : device{device_} { 195 : device{device_} {
142 descriptor_set_layout = device.GetLogical().CreateDescriptorSetLayout({ 196 descriptor_set_layout = device.GetLogical().CreateDescriptorSetLayout({
143 .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, 197 .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
@@ -178,13 +232,19 @@ ComputePass::ComputePass(const Device& device_, DescriptorPool& descriptor_pool,
178 .pCode = code.data(), 232 .pCode = code.data(),
179 }); 233 });
180 device.SaveShader(code); 234 device.SaveShader(code);
235 const VkPipelineShaderStageRequiredSubgroupSizeCreateInfoEXT subgroup_size_ci{
236 .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO_EXT,
237 .pNext = nullptr,
238 .requiredSubgroupSize = optional_subgroup_size ? *optional_subgroup_size : 32U,
239 };
240 bool use_setup_size = device.IsExtSubgroupSizeControlSupported() && optional_subgroup_size;
181 pipeline = device.GetLogical().CreateComputePipeline({ 241 pipeline = device.GetLogical().CreateComputePipeline({
182 .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, 242 .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
183 .pNext = nullptr, 243 .pNext = nullptr,
184 .flags = 0, 244 .flags = 0,
185 .stage{ 245 .stage{
186 .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, 246 .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
187 .pNext = nullptr, 247 .pNext = use_setup_size ? &subgroup_size_ci : nullptr,
188 .flags = 0, 248 .flags = 0,
189 .stage = VK_SHADER_STAGE_COMPUTE_BIT, 249 .stage = VK_SHADER_STAGE_COMPUTE_BIT,
190 .module = *module, 250 .module = *module,
@@ -302,6 +362,123 @@ std::pair<VkBuffer, VkDeviceSize> QuadIndexedPass::Assemble(
302 return {staging.buffer, staging.offset}; 362 return {staging.buffer, staging.offset};
303} 363}
304 364
365ConditionalRenderingResolvePass::ConditionalRenderingResolvePass(
366 const Device& device_, Scheduler& scheduler_, DescriptorPool& descriptor_pool_,
367 ComputePassDescriptorQueue& compute_pass_descriptor_queue_)
368 : ComputePass(device_, descriptor_pool_, INPUT_OUTPUT_DESCRIPTOR_SET_BINDINGS,
369 INPUT_OUTPUT_DESCRIPTOR_UPDATE_TEMPLATE, INPUT_OUTPUT_BANK_INFO, nullptr,
370 RESOLVE_CONDITIONAL_RENDER_COMP_SPV),
371 scheduler{scheduler_}, compute_pass_descriptor_queue{compute_pass_descriptor_queue_} {}
372
373void ConditionalRenderingResolvePass::Resolve(VkBuffer dst_buffer, VkBuffer src_buffer,
374 u32 src_offset, bool compare_to_zero) {
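 // Comparing against zero reads a single 64-bit result; the equality modes
 // read two results spaced 16 bytes apart, so 24 bytes covers both.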
375 const size_t compare_size = compare_to_zero ? 8 : 24;
376
377 compute_pass_descriptor_queue.Acquire();
378 compute_pass_descriptor_queue.AddBuffer(src_buffer, src_offset, compare_size);
379 compute_pass_descriptor_queue.AddBuffer(dst_buffer, 0, sizeof(u32));
380 const void* const descriptor_data{compute_pass_descriptor_queue.UpdateData()};
381
382 scheduler.RequestOutsideRenderPassOperationContext();
383 scheduler.Record([this, descriptor_data](vk::CommandBuffer cmdbuf) {
384 static constexpr VkMemoryBarrier read_barrier{
385 .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
386 .pNext = nullptr,
387 .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_SHADER_WRITE_BIT,
388 .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT,
389 };
390 static constexpr VkMemoryBarrier write_barrier{
391 .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
392 .pNext = nullptr,
393 .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
394 .dstAccessMask = VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT,
395 };
396 const VkDescriptorSet set = descriptor_allocator.Commit();
397 device.GetLogical().UpdateDescriptorSet(set, *descriptor_template, descriptor_data);
398
399 cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
400 VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, read_barrier);
401 cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
402 cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *layout, 0, set, {});
403 cmdbuf.Dispatch(1, 1, 1);
404 cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
405 VK_PIPELINE_STAGE_CONDITIONAL_RENDERING_BIT_EXT, 0, write_barrier);
406 });
407}
408
409QueriesPrefixScanPass::QueriesPrefixScanPass(
410 const Device& device_, Scheduler& scheduler_, DescriptorPool& descriptor_pool_,
411 ComputePassDescriptorQueue& compute_pass_descriptor_queue_)
412 : ComputePass(
413 device_, descriptor_pool_, QUERIES_SCAN_DESCRIPTOR_SET_BINDINGS,
414 QUERIES_SCAN_DESCRIPTOR_UPDATE_TEMPLATE, QUERIES_SCAN_BANK_INFO,
415 COMPUTE_PUSH_CONSTANT_RANGE<sizeof(QueriesPrefixScanPushConstants)>,
416 device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_BASIC_BIT) &&
417 device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_ARITHMETIC_BIT) &&
418 device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_SHUFFLE_BIT) &&
419 device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT)
420 ? std::span<const u32>(QUERIES_PREFIX_SCAN_SUM_COMP_SPV)
421 : std::span<const u32>(QUERIES_PREFIX_SCAN_SUM_NOSUBGROUPS_COMP_SPV)),
422 scheduler{scheduler_}, compute_pass_descriptor_queue{compute_pass_descriptor_queue_} {}
423
424void QueriesPrefixScanPass::Run(VkBuffer accumulation_buffer, VkBuffer dst_buffer,
425 VkBuffer src_buffer, size_t number_of_sums,
426 size_t min_accumulation_limit, size_t max_accumulation_limit) {
427 size_t current_runs = number_of_sums;
428 size_t offset = 0;
429 while (current_runs != 0) {
430 static constexpr size_t DISPATCH_SIZE = 2048U;
431 size_t runs_to_do = std::min<size_t>(current_runs, DISPATCH_SIZE);
432 current_runs -= runs_to_do;
433 compute_pass_descriptor_queue.Acquire();
434 compute_pass_descriptor_queue.AddBuffer(src_buffer, 0, number_of_sums * sizeof(u64));
435 compute_pass_descriptor_queue.AddBuffer(dst_buffer, 0, number_of_sums * sizeof(u64));
436 compute_pass_descriptor_queue.AddBuffer(accumulation_buffer, 0, sizeof(u64));
437 const void* const descriptor_data{compute_pass_descriptor_queue.UpdateData()};
438 size_t used_offset = offset;
439 offset += runs_to_do;
440
441 scheduler.RequestOutsideRenderPassOperationContext();
442 scheduler.Record([this, descriptor_data, min_accumulation_limit, max_accumulation_limit,
443 runs_to_do, used_offset](vk::CommandBuffer cmdbuf) {
444 static constexpr VkMemoryBarrier read_barrier{
445 .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
446 .pNext = nullptr,
447 .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
448 .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT,
449 };
450 static constexpr VkMemoryBarrier write_barrier{
451 .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
452 .pNext = nullptr,
453 .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
454 .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_TRANSFER_READ_BIT |
455 VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT |
456 VK_ACCESS_INDIRECT_COMMAND_READ_BIT | VK_ACCESS_INDEX_READ_BIT |
457 VK_ACCESS_UNIFORM_READ_BIT |
458 VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT,
459 };
460 const QueriesPrefixScanPushConstants uniforms{
461 .min_accumulation_base = static_cast<u32>(min_accumulation_limit),
462 .max_accumulation_base = static_cast<u32>(max_accumulation_limit),
463 .accumulation_limit = static_cast<u32>(runs_to_do - 1),
464 .buffer_offset = static_cast<u32>(used_offset),
465 };
466 const VkDescriptorSet set = descriptor_allocator.Commit();
467 device.GetLogical().UpdateDescriptorSet(set, *descriptor_template, descriptor_data);
468
469 cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
470 VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, read_barrier);
471 cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
472 cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *layout, 0, set, {});
473 cmdbuf.PushConstants(*layout, VK_SHADER_STAGE_COMPUTE_BIT, uniforms);
474 cmdbuf.Dispatch(1, 1, 1);
475 cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
476 VK_PIPELINE_STAGE_CONDITIONAL_RENDERING_BIT_EXT, 0,
477 write_barrier);
478 });
479 }
480}
481
305ASTCDecoderPass::ASTCDecoderPass(const Device& device_, Scheduler& scheduler_, 482ASTCDecoderPass::ASTCDecoderPass(const Device& device_, Scheduler& scheduler_,
306 DescriptorPool& descriptor_pool_, 483 DescriptorPool& descriptor_pool_,
307 StagingBufferPool& staging_buffer_pool_, 484 StagingBufferPool& staging_buffer_pool_,
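QueriesPrefixScanPass::Run above splits the work into dispatches of at most DISPATCH_SIZE sums, advancing buffer_offset between rounds. The batching arithmetic in isolation:

// The batching arithmetic from QueriesPrefixScanPass::Run, nothing else.
#include <algorithm>
#include <cstddef>
#include <cstdio>

int main() {
    constexpr std::size_t DISPATCH_SIZE = 2048;
    std::size_t remaining = 5000; // hypothetical number_of_sums
    std::size_t offset = 0;
    while (remaining != 0) {
        const std::size_t runs_to_do = std::min(remaining, DISPATCH_SIZE);
        remaining -= runs_to_do;
        std::printf("dispatch %zu sums at offset %zu\n", runs_to_do, offset);
        offset += runs_to_do;
    }
    // -> 2048 at 0, 2048 at 2048, 904 at 4096
}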
diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.h b/src/video_core/renderer_vulkan/vk_compute_pass.h
index dd3927376..3ff935639 100644
--- a/src/video_core/renderer_vulkan/vk_compute_pass.h
+++ b/src/video_core/renderer_vulkan/vk_compute_pass.h
@@ -3,6 +3,7 @@
3 3
4#pragma once 4#pragma once
5 5
6#include <optional>
6#include <span> 7#include <span>
7#include <utility> 8#include <utility>
8 9
@@ -31,7 +32,8 @@ public:
31 vk::Span<VkDescriptorSetLayoutBinding> bindings, 32 vk::Span<VkDescriptorSetLayoutBinding> bindings,
32 vk::Span<VkDescriptorUpdateTemplateEntry> templates, 33 vk::Span<VkDescriptorUpdateTemplateEntry> templates,
33 const DescriptorBankInfo& bank_info, 34 const DescriptorBankInfo& bank_info,
34 vk::Span<VkPushConstantRange> push_constants, std::span<const u32> code); 35 vk::Span<VkPushConstantRange> push_constants, std::span<const u32> code,
36 std::optional<u32> optional_subgroup_size = std::nullopt);
35 ~ComputePass(); 37 ~ComputePass();
36 38
37protected: 39protected:
@@ -82,6 +84,33 @@ private:
82 ComputePassDescriptorQueue& compute_pass_descriptor_queue; 84 ComputePassDescriptorQueue& compute_pass_descriptor_queue;
83}; 85};
84 86
87class ConditionalRenderingResolvePass final : public ComputePass {
88public:
89 explicit ConditionalRenderingResolvePass(
90 const Device& device_, Scheduler& scheduler_, DescriptorPool& descriptor_pool_,
91 ComputePassDescriptorQueue& compute_pass_descriptor_queue_);
92
93 void Resolve(VkBuffer dst_buffer, VkBuffer src_buffer, u32 src_offset, bool compare_to_zero);
94
95private:
96 Scheduler& scheduler;
97 ComputePassDescriptorQueue& compute_pass_descriptor_queue;
98};
99
100class QueriesPrefixScanPass final : public ComputePass {
101public:
102 explicit QueriesPrefixScanPass(const Device& device_, Scheduler& scheduler_,
103 DescriptorPool& descriptor_pool_,
104 ComputePassDescriptorQueue& compute_pass_descriptor_queue_);
105
106 void Run(VkBuffer accumulation_buffer, VkBuffer dst_buffer, VkBuffer src_buffer,
107 size_t number_of_sums, size_t min_accumulation_limit, size_t max_accumulation_limit);
108
109private:
110 Scheduler& scheduler;
111 ComputePassDescriptorQueue& compute_pass_descriptor_queue;
112};
113
85class ASTCDecoderPass final : public ComputePass { 114class ASTCDecoderPass final : public ComputePass {
86public: 115public:
87 explicit ASTCDecoderPass(const Device& device_, Scheduler& scheduler_, 116 explicit ASTCDecoderPass(const Device& device_, Scheduler& scheduler_,
diff --git a/src/video_core/renderer_vulkan/vk_fence_manager.h b/src/video_core/renderer_vulkan/vk_fence_manager.h
index 145359d4e..336573574 100644
--- a/src/video_core/renderer_vulkan/vk_fence_manager.h
+++ b/src/video_core/renderer_vulkan/vk_fence_manager.h
@@ -7,6 +7,7 @@
7 7
8#include "video_core/fence_manager.h" 8#include "video_core/fence_manager.h"
9#include "video_core/renderer_vulkan/vk_buffer_cache.h" 9#include "video_core/renderer_vulkan/vk_buffer_cache.h"
10#include "video_core/renderer_vulkan/vk_query_cache.h"
10#include "video_core/renderer_vulkan/vk_texture_cache.h" 11#include "video_core/renderer_vulkan/vk_texture_cache.h"
11 12
12namespace Core { 13namespace Core {
@@ -20,7 +21,6 @@ class RasterizerInterface;
20namespace Vulkan { 21namespace Vulkan {
21 22
22class Device; 23class Device;
23class QueryCache;
24class Scheduler; 24class Scheduler;
25 25
26class InnerFence : public VideoCommon::FenceBase { 26class InnerFence : public VideoCommon::FenceBase {
diff --git a/src/video_core/renderer_vulkan/vk_query_cache.cpp b/src/video_core/renderer_vulkan/vk_query_cache.cpp
index 29e0b797b..a32da3ba3 100644
--- a/src/video_core/renderer_vulkan/vk_query_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_query_cache.cpp
@@ -1,139 +1,1552 @@
1// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project 1// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
2// SPDX-License-Identifier: GPL-2.0-or-later 2// SPDX-License-Identifier: GPL-3.0-or-later
3 3
4#include <algorithm>
5#include <cstddef> 4#include <cstddef>
5#include <limits>
6#include <map>
7#include <memory>
8#include <span>
9#include <type_traits>
10#include <unordered_map>
6#include <utility> 11#include <utility>
7#include <vector> 12#include <vector>
8 13
14#include "common/bit_util.h"
15#include "common/common_types.h"
16#include "core/memory.h"
17#include "video_core/engines/draw_manager.h"
18#include "video_core/query_cache/query_cache.h"
19#include "video_core/renderer_vulkan/vk_buffer_cache.h"
20#include "video_core/renderer_vulkan/vk_compute_pass.h"
9#include "video_core/renderer_vulkan/vk_query_cache.h" 21#include "video_core/renderer_vulkan/vk_query_cache.h"
10#include "video_core/renderer_vulkan/vk_resource_pool.h" 22#include "video_core/renderer_vulkan/vk_resource_pool.h"
11#include "video_core/renderer_vulkan/vk_scheduler.h" 23#include "video_core/renderer_vulkan/vk_scheduler.h"
24#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h"
25#include "video_core/renderer_vulkan/vk_update_descriptor.h"
12#include "video_core/vulkan_common/vulkan_device.h" 26#include "video_core/vulkan_common/vulkan_device.h"
27#include "video_core/vulkan_common/vulkan_memory_allocator.h"
13#include "video_core/vulkan_common/vulkan_wrapper.h" 28#include "video_core/vulkan_common/vulkan_wrapper.h"
14 29
15namespace Vulkan { 30namespace Vulkan {
16 31
17using VideoCore::QueryType; 32using Tegra::Engines::Maxwell3D;
33using VideoCommon::QueryType;
18 34
19namespace { 35namespace {
36class SamplesQueryBank : public VideoCommon::BankBase {
37public:
38 static constexpr size_t BANK_SIZE = 256;
39 static constexpr size_t QUERY_SIZE = 8;
40 explicit SamplesQueryBank(const Device& device_, size_t index_)
41 : BankBase(BANK_SIZE), device{device_}, index{index_} {
42 const auto& dev = device.GetLogical();
43 query_pool = dev.CreateQueryPool({
44 .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO,
45 .pNext = nullptr,
46 .flags = 0,
47 .queryType = VK_QUERY_TYPE_OCCLUSION,
48 .queryCount = BANK_SIZE,
49 .pipelineStatistics = 0,
50 });
51 Reset();
52 }
20 53
21constexpr std::array QUERY_TARGETS = {VK_QUERY_TYPE_OCCLUSION}; 54 ~SamplesQueryBank() = default;
22 55
23constexpr VkQueryType GetTarget(QueryType type) { 56 void Reset() override {
24 return QUERY_TARGETS[static_cast<std::size_t>(type)]; 57 ASSERT(references == 0);
25} 58 VideoCommon::BankBase::Reset();
59 const auto& dev = device.GetLogical();
60 dev.ResetQueryPool(*query_pool, 0, BANK_SIZE);
61 host_results.fill(0ULL);
62 next_bank = 0;
63 }
64
65 void Sync(size_t start, size_t size) {
66 const auto& dev = device.GetLogical();
67 const VkResult query_result = dev.GetQueryResults(
68 *query_pool, static_cast<u32>(start), static_cast<u32>(size), sizeof(u64) * size,
69 &host_results[start], sizeof(u64), VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT);
70 switch (query_result) {
71 case VK_SUCCESS:
72 return;
73 case VK_ERROR_DEVICE_LOST:
74 device.ReportLoss();
75 [[fallthrough]];
76 default:
77 throw vk::Exception(query_result);
78 }
79 }
80
81 VkQueryPool GetInnerPool() {
82 return *query_pool;
83 }
84
85 size_t GetIndex() const {
86 return index;
87 }
88
89 const std::array<u64, BANK_SIZE>& GetResults() const {
90 return host_results;
91 }
92
93 size_t next_bank;
94
95private:
96 const Device& device;
97 const size_t index;
98 vk::QueryPool query_pool;
99 std::array<u64, BANK_SIZE> host_results;
100};
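
The bank above layers a reference count over a fixed-size VkQueryPool so whole pools are recycled at once instead of individual queries. A minimal host-side sketch of that reservation discipline, with illustrative names (the real interface is VideoCommon::BankBase, defined elsewhere in this patch):

    // A minimal sketch, not part of the patch: a fixed-size bank of query
    // slots, handed out linearly and recycled only once every outstanding
    // reference has been closed.
    #include <cassert>
    #include <cstddef>
    #include <utility>

    class SketchBank {
    public:
        static constexpr std::size_t BANK_SIZE = 256;

        // Mirrors Reserve() above: returns {bank_is_now_full, slot}.
        std::pair<bool, std::size_t> Reserve() {
            assert(used < BANK_SIZE);
            const std::size_t slot = used++;
            return {used == BANK_SIZE, slot};
        }

        void AddReference(std::size_t count = 1) { references += count; }
        void CloseReference(std::size_t count = 1) { references -= count; }
        bool IsClosed() const { return used == BANK_SIZE; }

        // The underlying VkQueryPool may only be reset once no live query
        // still points into this bank.
        void Reset() {
            assert(references == 0);
            used = 0;
        }

    private:
        std::size_t used = 0;
        std::size_t references = 0;
    };
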
101
102using BaseStreamer = VideoCommon::SimpleStreamer<VideoCommon::HostQueryBase>;
103
104struct HostSyncValues {
105 VAddr address;
106 size_t size;
107 size_t offset;
108
109 static constexpr bool GeneratesBaseBuffer = false;
110};
111
112class SamplesStreamer : public BaseStreamer {
113public:
114 explicit SamplesStreamer(size_t id_, QueryCacheRuntime& runtime_,
115 VideoCore::RasterizerInterface* rasterizer_, const Device& device_,
116 Scheduler& scheduler_, const MemoryAllocator& memory_allocator_,
117 ComputePassDescriptorQueue& compute_pass_descriptor_queue,
118 DescriptorPool& descriptor_pool)
119 : BaseStreamer(id_), runtime{runtime_}, rasterizer{rasterizer_}, device{device_},
120 scheduler{scheduler_}, memory_allocator{memory_allocator_} {
121 current_bank = nullptr;
122 current_query = nullptr;
123 ammend_value = 0;
124 acumulation_value = 0;
125 queries_prefix_scan_pass = std::make_unique<QueriesPrefixScanPass>(
126 device, scheduler, descriptor_pool, compute_pass_descriptor_queue);
127
128 const VkBufferCreateInfo buffer_ci = {
129 .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
130 .pNext = nullptr,
131 .flags = 0,
132 .size = 8,
133 .usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT |
134 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
135 .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
136 .queueFamilyIndexCount = 0,
137 .pQueueFamilyIndices = nullptr,
138 };
139 accumulation_buffer = memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal);
140 scheduler.RequestOutsideRenderPassOperationContext();
141 scheduler.Record([buffer = *accumulation_buffer](vk::CommandBuffer cmdbuf) {
142 cmdbuf.FillBuffer(buffer, 0, 8, 0);
143 });
144 }
145
146 ~SamplesStreamer() = default;
147
148 void StartCounter() override {
149 if (has_started) {
150 return;
151 }
152 ReserveHostQuery();
153 scheduler.Record([query_pool = current_query_pool,
154 query_index = current_bank_slot](vk::CommandBuffer cmdbuf) {
155 const bool use_precise = Settings::IsGPULevelHigh();
156 cmdbuf.BeginQuery(query_pool, static_cast<u32>(query_index),
157 use_precise ? VK_QUERY_CONTROL_PRECISE_BIT : 0);
158 });
159 has_started = true;
160 }
161
162 void PauseCounter() override {
163 if (!has_started) {
164 return;
165 }
166 scheduler.Record([query_pool = current_query_pool,
167 query_index = current_bank_slot](vk::CommandBuffer cmdbuf) {
168 cmdbuf.EndQuery(query_pool, static_cast<u32>(query_index));
169 });
170 has_started = false;
171 }
172
173 void ResetCounter() override {
174 if (has_started) {
175 PauseCounter();
176 }
177 AbandonCurrentQuery();
178 std::function<void()> func([this, counts = pending_flush_queries.size()] {
179 ammend_value = 0;
180 acumulation_value = 0;
181 });
182 rasterizer->SyncOperation(std::move(func));
183 accumulation_since_last_sync = false;
184 first_accumulation_checkpoint = std::min(first_accumulation_checkpoint, num_slots_used);
185 last_accumulation_checkpoint = std::max(last_accumulation_checkpoint, num_slots_used);
186 }
187
188 void CloseCounter() override {
189 PauseCounter();
190 }
191
192 bool HasPendingSync() const override {
193 return !pending_sync.empty();
194 }
195
196 void SyncWrites() override {
197 if (sync_values_stash.empty()) {
198 return;
199 }
200
201 for (size_t i = 0; i < sync_values_stash.size(); i++) {
202 runtime.template SyncValues<HostSyncValues>(sync_values_stash[i],
203 *buffers[resolve_buffers[i]]);
204 }
205
206 sync_values_stash.clear();
207 }
208
209 void PresyncWrites() override {
210 if (pending_sync.empty()) {
211 return;
212 }
213 PauseCounter();
214 sync_values_stash.clear();
215 sync_values_stash.emplace_back();
216 std::vector<HostSyncValues>* sync_values = &sync_values_stash.back();
217 sync_values->reserve(num_slots_used);
218 std::unordered_map<size_t, std::pair<size_t, size_t>> offsets;
219 resolve_buffers.clear();
220 size_t resolve_buffer_index = ObtainBuffer<true>(num_slots_used);
221 resolve_buffers.push_back(resolve_buffer_index);
222 size_t base_offset = 0;
223
224 ApplyBanksWideOp<true>(pending_sync, [&](SamplesQueryBank* bank, size_t start,
225 size_t amount) {
226 size_t bank_id = bank->GetIndex();
227 auto& resolve_buffer = buffers[resolve_buffer_index];
228 VkQueryPool query_pool = bank->GetInnerPool();
229 scheduler.RequestOutsideRenderPassOperationContext();
230 scheduler.Record([start, amount, base_offset, query_pool,
231 buffer = *resolve_buffer](vk::CommandBuffer cmdbuf) {
232 const VkBufferMemoryBarrier copy_query_pool_barrier{
233 .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
234 .pNext = nullptr,
235 .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
236 .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT,
237 .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
238 .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
239 .buffer = buffer,
240 .offset = base_offset,
241 .size = amount * SamplesQueryBank::QUERY_SIZE,
242 };
243
244 cmdbuf.CopyQueryPoolResults(
245 query_pool, static_cast<u32>(start), static_cast<u32>(amount), buffer,
246 static_cast<u32>(base_offset), SamplesQueryBank::QUERY_SIZE,
247 VK_QUERY_RESULT_WAIT_BIT | VK_QUERY_RESULT_64_BIT);
248 cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT,
249 VK_PIPELINE_STAGE_TRANSFER_BIT, 0, copy_query_pool_barrier);
250 });
251 offsets[bank_id] = {start, base_offset};
252 base_offset += amount * SamplesQueryBank::QUERY_SIZE;
253 });
254
255 // Convert queries
256 bool has_multi_queries = false;
257 for (auto q : pending_sync) {
258 auto* query = GetQuery(q);
259 size_t sync_value_slot = 0;
260 if (True(query->flags & VideoCommon::QueryFlagBits::IsRewritten)) {
261 continue;
262 }
263 if (True(query->flags & VideoCommon::QueryFlagBits::IsInvalidated)) {
264 continue;
265 }
266 if (accumulation_since_last_sync || query->size_slots > 1) {
267 if (!has_multi_queries) {
268 has_multi_queries = true;
269 sync_values_stash.emplace_back();
270 }
271 sync_value_slot = 1;
272 }
273 query->flags |= VideoCommon::QueryFlagBits::IsHostSynced;
274 auto loc_data = offsets[query->start_bank_id];
275 sync_values_stash[sync_value_slot].emplace_back(HostSyncValues{
276 .address = query->guest_address,
277 .size = SamplesQueryBank::QUERY_SIZE,
278 .offset =
279 loc_data.second + (query->start_slot - loc_data.first + query->size_slots - 1) *
280 SamplesQueryBank::QUERY_SIZE,
281 });
282 }
283
284 if (has_multi_queries) {
285 size_t intermediary_buffer_index = ObtainBuffer<false>(num_slots_used);
286 resolve_buffers.push_back(intermediary_buffer_index);
287 queries_prefix_scan_pass->Run(*accumulation_buffer, *buffers[intermediary_buffer_index],
288 *buffers[resolve_buffer_index], num_slots_used,
289 std::min(first_accumulation_checkpoint, num_slots_used),
290 last_accumulation_checkpoint);
291
292 } else {
293 scheduler.RequestOutsideRenderPassOperationContext();
294 scheduler.Record([buffer = *accumulation_buffer](vk::CommandBuffer cmdbuf) {
295 cmdbuf.FillBuffer(buffer, 0, 8, 0);
296 });
297 }
298
299 ReplicateCurrentQueryIfNeeded();
300 std::function<void()> func([this] { ammend_value = acumulation_value; });
301 rasterizer->SyncOperation(std::move(func));
302 AbandonCurrentQuery();
303 num_slots_used = 0;
304 first_accumulation_checkpoint = std::numeric_limits<size_t>::max();
305 last_accumulation_checkpoint = 0;
306 accumulation_since_last_sync = has_multi_queries;
307 pending_sync.clear();
308 }
309
310 size_t WriteCounter(VAddr address, bool has_timestamp, u32 value,
311 [[maybe_unused]] std::optional<u32> subreport) override {
312 PauseCounter();
313 auto index = BuildQuery();
314 auto* new_query = GetQuery(index);
315 new_query->guest_address = address;
316 new_query->value = 0;
317 new_query->flags &= ~VideoCommon::QueryFlagBits::IsOrphan;
318 if (has_timestamp) {
319 new_query->flags |= VideoCommon::QueryFlagBits::HasTimestamp;
320 }
321 if (!current_query) {
322 new_query->flags |= VideoCommon::QueryFlagBits::IsFinalValueSynced;
323 return index;
324 }
325 new_query->start_bank_id = current_query->start_bank_id;
326 new_query->size_banks = current_query->size_banks;
327 new_query->start_slot = current_query->start_slot;
328 new_query->size_slots = current_query->size_slots;
329 ApplyBankOp(new_query, [](SamplesQueryBank* bank, size_t start, size_t amount) {
330 bank->AddReference(amount);
331 });
332 pending_sync.push_back(index);
333 pending_flush_queries.push_back(index);
334 return index;
335 }
336
337 bool HasUnsyncedQueries() const override {
338 return !pending_flush_queries.empty();
339 }
340
341 void PushUnsyncedQueries() override {
342 PauseCounter();
343 current_bank->Close();
344 {
345 std::scoped_lock lk(flush_guard);
346 pending_flush_sets.emplace_back(std::move(pending_flush_queries));
347 }
348 }
349
350 void PopUnsyncedQueries() override {
351 std::vector<size_t> current_flush_queries;
352 {
353 std::scoped_lock lk(flush_guard);
354 current_flush_queries = std::move(pending_flush_sets.front());
355 pending_flush_sets.pop_front();
356 }
357 ApplyBanksWideOp<false>(
358 current_flush_queries,
359 [](SamplesQueryBank* bank, size_t start, size_t amount) { bank->Sync(start, amount); });
360 for (auto q : current_flush_queries) {
361 auto* query = GetQuery(q);
362 u64 total = 0;
363 ApplyBankOp(query, [&total](SamplesQueryBank* bank, size_t start, size_t amount) {
364 const auto& results = bank->GetResults();
365 for (size_t i = 0; i < amount; i++) {
366 total += results[start + i];
367 }
368 });
369 query->value = total;
370 query->flags |= VideoCommon::QueryFlagBits::IsFinalValueSynced;
371 }
372 }
373
374private:
375 template <typename Func>
376 void ApplyBankOp(VideoCommon::HostQueryBase* query, Func&& func) {
377 size_t size_slots = query->size_slots;
378 if (size_slots == 0) {
379 return;
380 }
381 size_t bank_id = query->start_bank_id;
382 size_t banks_set = query->size_banks;
383 size_t start_slot = query->start_slot;
384 for (size_t i = 0; i < banks_set; i++) {
385 auto& the_bank = bank_pool.GetBank(bank_id);
386 size_t amount = std::min(the_bank.Size() - start_slot, size_slots);
387 func(&the_bank, start_slot, amount);
388 bank_id = the_bank.next_bank - 1;
389 start_slot = 0;
390 size_slots -= amount;
391 }
392 }
393
394 template <bool is_ordered, typename Func>
395 void ApplyBanksWideOp(std::vector<size_t>& queries, Func&& func) {
396 std::conditional_t<is_ordered, std::map<size_t, std::pair<size_t, size_t>>,
397 std::unordered_map<size_t, std::pair<size_t, size_t>>>
398 indexer;
399 for (auto q : queries) {
400 auto* query = GetQuery(q);
401 ApplyBankOp(query, [&indexer](SamplesQueryBank* bank, size_t start, size_t amount) {
402 auto id_ = bank->GetIndex();
403 auto pair = indexer.try_emplace(id_, std::numeric_limits<size_t>::max(),
404 std::numeric_limits<size_t>::min());
405 auto& current_pair = pair.first->second;
406 current_pair.first = std::min(current_pair.first, start);
407 current_pair.second = std::max(current_pair.second, amount + start);
408 });
409 }
410 for (auto& cont : indexer) {
411 func(&bank_pool.GetBank(cont.first), cont.second.first,
412 cont.second.second - cont.second.first);
413 }
414 }
415
416 void ReserveBank() {
417 current_bank_id =
418 bank_pool.ReserveBank([this](std::deque<SamplesQueryBank>& queue, size_t index) {
419 queue.emplace_back(device, index);
420 });
421 if (current_bank) {
422 current_bank->next_bank = current_bank_id + 1;
423 }
424 current_bank = &bank_pool.GetBank(current_bank_id);
425 current_query_pool = current_bank->GetInnerPool();
426 }
427
428 size_t ReserveBankSlot() {
429 if (!current_bank || current_bank->IsClosed()) {
430 ReserveBank();
431 }
432 auto [built, index] = current_bank->Reserve();
433 current_bank_slot = index;
434 return index;
435 }
436
437 void ReserveHostQuery() {
438 size_t new_slot = ReserveBankSlot();
439 current_bank->AddReference(1);
440 num_slots_used++;
441 if (current_query) {
442 size_t bank_id = current_query->start_bank_id;
443 size_t banks_set = current_query->size_banks - 1;
444 bool found = bank_id == current_bank_id;
445 while (!found && banks_set > 0) {
446 SamplesQueryBank& some_bank = bank_pool.GetBank(bank_id);
447 bank_id = some_bank.next_bank - 1;
448 found = bank_id == current_bank_id;
449 banks_set--;
450 }
451 if (!found) {
452 current_query->size_banks++;
453 }
454 current_query->size_slots++;
455 } else {
456 current_query_id = BuildQuery();
457 current_query = GetQuery(current_query_id);
458 current_query->start_bank_id = static_cast<u32>(current_bank_id);
459 current_query->size_banks = 1;
460 current_query->start_slot = new_slot;
461 current_query->size_slots = 1;
462 }
463 }
464
465 void Free(size_t query_id) override {
466 std::scoped_lock lk(guard);
467 auto* query = GetQuery(query_id);
468 ApplyBankOp(query, [](SamplesQueryBank* bank, size_t start, size_t amount) {
469 bank->CloseReference(amount);
470 });
471 ReleaseQuery(query_id);
472 }
473
474 void AbandonCurrentQuery() {
475 if (!current_query) {
476 return;
477 }
478 Free(current_query_id);
479 current_query = nullptr;
480 current_query_id = 0;
481 }
482
483 void ReplicateCurrentQueryIfNeeded() {
484 if (pending_sync.empty()) {
485 return;
486 }
487 if (!current_query) {
488 return;
489 }
490 auto index = BuildQuery();
491 auto* new_query = GetQuery(index);
492 new_query->guest_address = 0;
493 new_query->value = 0;
494 new_query->flags &= ~VideoCommon::QueryFlagBits::IsOrphan;
495 new_query->start_bank_id = current_query->start_bank_id;
496 new_query->size_banks = current_query->size_banks;
497 new_query->start_slot = current_query->start_slot;
498 new_query->size_slots = current_query->size_slots;
499 ApplyBankOp(new_query, [](SamplesQueryBank* bank, size_t start, size_t amount) {
500 bank->AddReference(amount);
501 });
502 pending_flush_queries.push_back(index);
503 std::function<void()> func([this, index] {
504 auto* query = GetQuery(index);
505 query->value += GetAmmendValue();
506 SetAccumulationValue(query->value);
507 Free(index);
508 });
509 }
510
511 template <bool is_resolve>
512 size_t ObtainBuffer(size_t num_needed) {
513 const size_t log_2 = std::max<size_t>(11U, Common::Log2Ceil64(num_needed));
514 if constexpr (is_resolve) {
515 if (resolve_table[log_2] != 0) {
516 return resolve_table[log_2] - 1;
517 }
518 } else {
519 if (intermediary_table[log_2] != 0) {
520 return intermediary_table[log_2] - 1;
521 }
522 }
523 const VkBufferCreateInfo buffer_ci = {
524 .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
525 .pNext = nullptr,
526 .flags = 0,
527 .size = SamplesQueryBank::QUERY_SIZE * (1ULL << log_2),
528 .usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT |
529 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
530 .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
531 .queueFamilyIndexCount = 0,
532 .pQueueFamilyIndices = nullptr,
533 };
534 buffers.emplace_back(memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal));
535 if constexpr (is_resolve) {
536 resolve_table[log_2] = buffers.size();
537 } else {
538 intermediary_table[log_2] = buffers.size();
539 }
540 return buffers.size() - 1;
541 }
542
543 QueryCacheRuntime& runtime;
544 VideoCore::RasterizerInterface* rasterizer;
545 const Device& device;
546 Scheduler& scheduler;
547 const MemoryAllocator& memory_allocator;
548 VideoCommon::BankPool<SamplesQueryBank> bank_pool;
549 std::deque<vk::Buffer> buffers;
550 std::array<size_t, 32> resolve_table{};
551 std::array<size_t, 32> intermediary_table{};
552 vk::Buffer accumulation_buffer;
553 std::deque<std::vector<HostSyncValues>> sync_values_stash;
554 std::vector<size_t> resolve_buffers;
555
556 // syncing queue
557 std::vector<size_t> pending_sync;
558
559 // flush levels
560 std::vector<size_t> pending_flush_queries;
561 std::deque<std::vector<size_t>> pending_flush_sets;
562
563 // state machine
564 size_t current_bank_slot;
565 size_t current_bank_id;
566 SamplesQueryBank* current_bank;
567 VkQueryPool current_query_pool;
568 size_t current_query_id;
569 size_t num_slots_used{};
570 size_t first_accumulation_checkpoint{};
571 size_t last_accumulation_checkpoint{};
572 bool accumulation_since_last_sync{};
573 VideoCommon::HostQueryBase* current_query;
574 bool has_started{};
575 std::mutex flush_guard;
576
577 std::unique_ptr<QueriesPrefixScanPass> queries_prefix_scan_pass;
578};
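
PresyncWrites above only copies raw per-slot occlusion counts into a resolve buffer; when a report spans several slots, or accumulation carried over from a previous sync, QueriesPrefixScanPass turns those counts into running totals on the GPU before they are written back to guest memory. A simplified host-side model of that output (an inclusive scan seeded with the carried accumulation value; the real work happens in a compute shader):

    // Sketch only: models the summed buffer the prefix-scan pass produces.
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    std::vector<std::uint64_t> PrefixScanSamples(
        const std::vector<std::uint64_t>& slot_counts, std::uint64_t accumulated) {
        std::vector<std::uint64_t> summed(slot_counts.size());
        std::uint64_t running = accumulated;
        for (std::size_t i = 0; i < slot_counts.size(); ++i) {
            running += slot_counts[i];
            summed[i] = running; // slot i now holds the total up to and including i
        }
        return summed;
    }

A report occupying slots [first, last] then reads its value at slot last, which is why PresyncWrites points each HostSyncValues offset at the last slot of the query.
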
579
580// Transform feedback queries
581class TFBQueryBank : public VideoCommon::BankBase {
582public:
583 static constexpr size_t BANK_SIZE = 1024;
584 static constexpr size_t QUERY_SIZE = 4;
585 explicit TFBQueryBank(Scheduler& scheduler_, const MemoryAllocator& memory_allocator,
586 size_t index_)
587 : BankBase(BANK_SIZE), scheduler{scheduler_}, index{index_} {
588 const VkBufferCreateInfo buffer_ci = {
589 .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
590 .pNext = nullptr,
591 .flags = 0,
592 .size = QUERY_SIZE * BANK_SIZE,
593 .usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT,
594 .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
595 .queueFamilyIndexCount = 0,
596 .pQueueFamilyIndices = nullptr,
597 };
598 buffer = memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal);
599 }
600
601 ~TFBQueryBank() = default;
602
603 void Reset() override {
604 ASSERT(references == 0);
605 VideoCommon::BankBase::Reset();
606 }
607
608 void Sync(StagingBufferRef& staging_buffer, size_t extra_offset, size_t start, size_t size) {
609 scheduler.RequestOutsideRenderPassOperationContext();
610 scheduler.Record([this, dst_buffer = staging_buffer.buffer, extra_offset, start,
611 size](vk::CommandBuffer cmdbuf) {
612 std::array<VkBufferCopy, 1> copy{VkBufferCopy{
613 .srcOffset = start * QUERY_SIZE,
614 .dstOffset = extra_offset,
615 .size = size * QUERY_SIZE,
616 }};
617 cmdbuf.CopyBuffer(*buffer, dst_buffer, copy);
618 });
619 }
620
621 size_t GetIndex() const {
622 return index;
623 }
624
625 VkBuffer GetBuffer() const {
626 return *buffer;
627 }
628
629private:
630 Scheduler& scheduler;
631 const size_t index;
632 vk::Buffer buffer;
633};
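
Unlike the occlusion banks, a TFB bank is a plain device-local buffer of 4-byte counters rather than a VkQueryPool, so flushing is an ordinary vkCmdCopyBuffer into a mapped download allocation followed by a host-side read (see PushUnsyncedQueries/PopUnsyncedQueries below). A sketch of that final read, assuming a mapped span like the one StagingBufferRef exposes:

    // Sketch: widen one 4-byte counter slot from a mapped download buffer to
    // the 64-bit value guest reports carry. memcpy avoids alignment traps.
    #include <cstddef>
    #include <cstdint>
    #include <cstring>
    #include <span>

    std::uint64_t ReadCounterSlot(std::span<const std::uint8_t> mapped, std::size_t offset) {
        std::uint32_t raw = 0;
        std::memcpy(&raw, mapped.data() + offset, sizeof(raw));
        return static_cast<std::uint64_t>(raw);
    }
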
634
635class PrimitivesSucceededStreamer;
636
637class TFBCounterStreamer : public BaseStreamer {
638public:
639 explicit TFBCounterStreamer(size_t id_, QueryCacheRuntime& runtime_, const Device& device_,
640 Scheduler& scheduler_, const MemoryAllocator& memory_allocator_,
641 StagingBufferPool& staging_pool_)
642 : BaseStreamer(id_), runtime{runtime_}, device{device_}, scheduler{scheduler_},
643 memory_allocator{memory_allocator_}, staging_pool{staging_pool_} {
644 buffers_count = 0;
645 current_bank = nullptr;
646 counter_buffers.fill(VK_NULL_HANDLE);
647 offsets.fill(0);
648 last_queries.fill(0);
649 last_queries_stride.fill(1);
650 const VkBufferCreateInfo buffer_ci = {
651 .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
652 .pNext = nullptr,
653 .flags = 0,
654 .size = TFBQueryBank::QUERY_SIZE * NUM_STREAMS,
655 .usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT |
656 VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_COUNTER_BUFFER_BIT_EXT,
657 .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
658 .queueFamilyIndexCount = 0,
659 .pQueueFamilyIndices = nullptr,
660 };
661
662 counters_buffer = memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal);
663 for (auto& c : counter_buffers) {
664 c = *counters_buffer;
665 }
666 size_t base_offset = 0;
667 for (auto& o : offsets) {
668 o = base_offset;
669 base_offset += TFBQueryBank::QUERY_SIZE;
670 }
671 }
672
673 ~TFBCounterStreamer() = default;
674
675 void StartCounter() override {
676 FlushBeginTFB();
677 has_started = true;
678 }
679
680 void PauseCounter() override {
681 CloseCounter();
682 }
683
684 void ResetCounter() override {
685 CloseCounter();
686 }
687
688 void CloseCounter() override {
689 if (has_flushed_end_pending) {
690 FlushEndTFB();
691 }
692 runtime.View3DRegs([this](Maxwell3D& maxwell3d) {
693 if (maxwell3d.regs.transform_feedback_enabled == 0) {
694 streams_mask = 0;
695 has_started = false;
696 }
697 });
698 }
699
700 bool HasPendingSync() const override {
701 return !pending_sync.empty();
702 }
703
704 void SyncWrites() override {
705 CloseCounter();
706 std::unordered_map<size_t, std::vector<HostSyncValues>> sync_values_stash;
707 for (auto q : pending_sync) {
708 auto* query = GetQuery(q);
709 if (True(query->flags & VideoCommon::QueryFlagBits::IsRewritten)) {
710 continue;
711 }
712 if (True(query->flags & VideoCommon::QueryFlagBits::IsInvalidated)) {
713 continue;
714 }
715 query->flags |= VideoCommon::QueryFlagBits::IsHostSynced;
716 sync_values_stash.try_emplace(query->start_bank_id);
717 sync_values_stash[query->start_bank_id].emplace_back(HostSyncValues{
718 .address = query->guest_address,
719 .size = TFBQueryBank::QUERY_SIZE,
720 .offset = query->start_slot * TFBQueryBank::QUERY_SIZE,
721 });
722 }
723 for (auto& p : sync_values_stash) {
724 auto& bank = bank_pool.GetBank(p.first);
725 runtime.template SyncValues<HostSyncValues>(p.second, bank.GetBuffer());
726 }
727 pending_sync.clear();
728 }
26 729
27} // Anonymous namespace 730 size_t WriteCounter(VAddr address, bool has_timestamp, u32 value,
731 std::optional<u32> subreport_) override {
732 auto index = BuildQuery();
733 auto* new_query = GetQuery(index);
734 new_query->guest_address = address;
735 new_query->value = 0;
736 new_query->flags &= ~VideoCommon::QueryFlagBits::IsOrphan;
737 if (has_timestamp) {
738 new_query->flags |= VideoCommon::QueryFlagBits::HasTimestamp;
739 }
740 if (!subreport_) {
741 new_query->flags |= VideoCommon::QueryFlagBits::IsFinalValueSynced;
742 return index;
743 }
744 const size_t subreport = static_cast<size_t>(*subreport_);
745 last_queries[subreport] = address;
746 if ((streams_mask & (1ULL << subreport)) == 0) {
747 new_query->flags |= VideoCommon::QueryFlagBits::IsFinalValueSynced;
748 return index;
749 }
750 CloseCounter();
751 auto [bank_slot, data_slot] = ProduceCounterBuffer(subreport);
752 new_query->start_bank_id = static_cast<u32>(bank_slot);
753 new_query->size_banks = 1;
754 new_query->start_slot = static_cast<u32>(data_slot);
755 new_query->size_slots = 1;
756 pending_sync.push_back(index);
757 pending_flush_queries.push_back(index);
758 return index;
759 }
760
761 std::optional<std::pair<VAddr, size_t>> GetLastQueryStream(size_t stream) {
762 if (last_queries[stream] != 0) {
763 std::pair<VAddr, size_t> result(last_queries[stream], last_queries_stride[stream]);
764 return result;
765 }
766 return std::nullopt;
767 }
768
769 Maxwell3D::Regs::PrimitiveTopology GetOutputTopology() const {
770 return out_topology;
771 }
772
773 bool HasUnsyncedQueries() const override {
774 return !pending_flush_queries.empty();
775 }
776
777 void PushUnsyncedQueries() override {
778 CloseCounter();
779 auto staging_ref = staging_pool.Request(
780 pending_flush_queries.size() * TFBQueryBank::QUERY_SIZE, MemoryUsage::Download, true);
781 size_t offset_base = staging_ref.offset;
782 for (auto q : pending_flush_queries) {
783 auto* query = GetQuery(q);
784 auto& bank = bank_pool.GetBank(query->start_bank_id);
785 bank.Sync(staging_ref, offset_base, query->start_slot, 1);
786 offset_base += TFBQueryBank::QUERY_SIZE;
787 bank.CloseReference();
788 }
789 static constexpr VkMemoryBarrier WRITE_BARRIER{
790 .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
791 .pNext = nullptr,
792 .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
793 .dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT,
794 };
795 scheduler.RequestOutsideRenderPassOperationContext();
796 scheduler.Record([](vk::CommandBuffer cmdbuf) {
797 cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT,
798 VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, WRITE_BARRIER);
799 });
800
801 std::scoped_lock lk(flush_guard);
802 for (auto& str : free_queue) {
803 staging_pool.FreeDeferred(str);
804 }
805 free_queue.clear();
806 download_buffers.emplace_back(staging_ref);
807 pending_flush_sets.emplace_back(std::move(pending_flush_queries));
808 }
809
810 void PopUnsyncedQueries() override {
811 StagingBufferRef staging_ref;
812 std::vector<size_t> flushed_queries;
813 {
814 std::scoped_lock lk(flush_guard);
815 staging_ref = download_buffers.front();
816 flushed_queries = std::move(pending_flush_sets.front());
817 download_buffers.pop_front();
818 pending_flush_sets.pop_front();
819 }
820
821 size_t offset_base = staging_ref.offset;
822 for (auto q : flushed_queries) {
823 auto* query = GetQuery(q);
824 u32 result = 0;
825 std::memcpy(&result, staging_ref.mapped_span.data() + offset_base, sizeof(u32));
826 query->value = static_cast<u64>(result);
827 query->flags |= VideoCommon::QueryFlagBits::IsFinalValueSynced;
828 offset_base += TFBQueryBank::QUERY_SIZE;
829 }
830
831 {
832 std::scoped_lock lk(flush_guard);
833 free_queue.emplace_back(staging_ref);
834 }
835 }
836
837private:
838 void FlushBeginTFB() {
839 if (has_flushed_end_pending) [[unlikely]] {
840 return;
841 }
842 has_flushed_end_pending = true;
843 if (!has_started || buffers_count == 0) {
844 scheduler.Record([](vk::CommandBuffer cmdbuf) {
845 cmdbuf.BeginTransformFeedbackEXT(0, 0, nullptr, nullptr);
846 });
847 UpdateBuffers();
848 return;
849 }
850 scheduler.Record([this, total = static_cast<u32>(buffers_count)](vk::CommandBuffer cmdbuf) {
851 cmdbuf.BeginTransformFeedbackEXT(0, total, counter_buffers.data(), offsets.data());
852 });
853 UpdateBuffers();
854 }
855
856 void FlushEndTFB() {
857 if (!has_flushed_end_pending) [[unlikely]] {
858 UNREACHABLE();
859 return;
860 }
861 has_flushed_end_pending = false;
862
863 if (buffers_count == 0) {
864 scheduler.Record([](vk::CommandBuffer cmdbuf) {
865 cmdbuf.EndTransformFeedbackEXT(0, 0, nullptr, nullptr);
866 });
867 } else {
868 scheduler.Record([this,
869 total = static_cast<u32>(buffers_count)](vk::CommandBuffer cmdbuf) {
870 cmdbuf.EndTransformFeedbackEXT(0, total, counter_buffers.data(), offsets.data());
871 });
872 }
873 }
874
875 void UpdateBuffers() {
876 last_queries.fill(0);
877 last_queries_stride.fill(1);
878 runtime.View3DRegs([this](Maxwell3D& maxwell3d) {
879 buffers_count = 0;
880 out_topology = maxwell3d.draw_manager->GetDrawState().topology;
881 for (size_t i = 0; i < Maxwell3D::Regs::NumTransformFeedbackBuffers; i++) {
882 const auto& tf = maxwell3d.regs.transform_feedback;
883 if (tf.buffers[i].enable == 0) {
884 continue;
885 }
886 const size_t stream = tf.controls[i].stream;
887 last_queries_stride[stream] = tf.controls[i].stride;
888 streams_mask |= 1ULL << stream;
889 buffers_count = std::max<size_t>(buffers_count, stream + 1);
890 }
891 });
892 }
893
894 std::pair<size_t, size_t> ProduceCounterBuffer(size_t stream) {
895 if (current_bank == nullptr || current_bank->IsClosed()) {
896 current_bank_id =
897 bank_pool.ReserveBank([this](std::deque<TFBQueryBank>& queue, size_t index) {
898 queue.emplace_back(scheduler, memory_allocator, index);
899 });
900 current_bank = &bank_pool.GetBank(current_bank_id);
901 }
902 auto [dont_care, other] = current_bank->Reserve();
903 const size_t slot = other; // workaround for a compiler bug.

904 current_bank->AddReference();
905
906 static constexpr VkMemoryBarrier READ_BARRIER{
907 .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
908 .pNext = nullptr,
909 .srcAccessMask = VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT,
910 .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT,
911 };
912 static constexpr VkMemoryBarrier WRITE_BARRIER{
913 .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
914 .pNext = nullptr,
915 .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
916 .dstAccessMask = VK_ACCESS_MEMORY_READ_BIT,
917 };
918 scheduler.RequestOutsideRenderPassOperationContext();
919 scheduler.Record([dst_buffer = current_bank->GetBuffer(),
920 src_buffer = counter_buffers[stream], src_offset = offsets[stream],
921 slot](vk::CommandBuffer cmdbuf) {
922 cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT,
923 VK_PIPELINE_STAGE_TRANSFER_BIT, 0, READ_BARRIER);
924 std::array<VkBufferCopy, 1> copy{VkBufferCopy{
925 .srcOffset = src_offset,
926 .dstOffset = slot * TFBQueryBank::QUERY_SIZE,
927 .size = TFBQueryBank::QUERY_SIZE,
928 }};
929 cmdbuf.CopyBuffer(src_buffer, dst_buffer, copy);
930 cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
931 0, WRITE_BARRIER);
932 });
933 return {current_bank_id, slot};
934 }
935
936 friend class PrimitivesSucceededStreamer;
937
938 static constexpr size_t NUM_STREAMS = 4;
939
940 QueryCacheRuntime& runtime;
941 const Device& device;
942 Scheduler& scheduler;
943 const MemoryAllocator& memory_allocator;
944 StagingBufferPool& staging_pool;
945 VideoCommon::BankPool<TFBQueryBank> bank_pool;
946 size_t current_bank_id;
947 TFBQueryBank* current_bank;
948 vk::Buffer counters_buffer;
949
950 // syncing queue
951 std::vector<size_t> pending_sync;
952
953 // flush levels
954 std::vector<size_t> pending_flush_queries;
955 std::deque<StagingBufferRef> download_buffers;
956 std::deque<std::vector<size_t>> pending_flush_sets;
957 std::vector<StagingBufferRef> free_queue;
958 std::mutex flush_guard;
959
960 // state machine
961 bool has_started{};
962 bool has_flushed_end_pending{};
963 size_t buffers_count{};
964 std::array<VkBuffer, NUM_STREAMS> counter_buffers{};
965 std::array<VkDeviceSize, NUM_STREAMS> offsets{};
966 std::array<VAddr, NUM_STREAMS> last_queries;
967 std::array<size_t, NUM_STREAMS> last_queries_stride;
968 Maxwell3D::Regs::PrimitiveTopology out_topology{};
969 u64 streams_mask{};
970};
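
UpdateBuffers above is what keeps the EXT_transform_feedback bookkeeping honest: it walks the guest's transform feedback registers, records the stride per stream, and derives both the live-stream bitmask and how many counter buffers Begin/EndTransformFeedbackEXT must cover. The same reduction as a standalone function, with the Maxwell3D register layout paraphrased into a plain struct (field names are illustrative):

    #include <algorithm>
    #include <array>
    #include <cstddef>
    #include <cstdint>

    struct TfbBufferReg {
        bool enable;
        std::uint32_t stream; // which of the four streams this buffer captures
        std::uint32_t stride; // bytes written per captured vertex
    };

    struct TfbScan {
        std::uint64_t streams_mask = 0;
        std::size_t buffers_count = 0;
        std::array<std::uint32_t, 4> stride_per_stream{};
    };

    TfbScan ScanTfbRegs(const std::array<TfbBufferReg, 4>& regs) {
        TfbScan out{};
        for (const TfbBufferReg& reg : regs) {
            if (!reg.enable) {
                continue;
            }
            out.stride_per_stream[reg.stream] = reg.stride;
            out.streams_mask |= 1ULL << reg.stream;
            out.buffers_count = std::max<std::size_t>(out.buffers_count, reg.stream + 1);
        }
        return out;
    }
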
971
972class PrimitivesQueryBase : public VideoCommon::QueryBase {
973public:
974 // Default constructor
975 PrimitivesQueryBase()
976 : VideoCommon::QueryBase(0, VideoCommon::QueryFlagBits::IsHostManaged, 0) {}
977
978 // Parameterized constructor
979 PrimitivesQueryBase(bool has_timestamp, VAddr address)
980 : VideoCommon::QueryBase(address, VideoCommon::QueryFlagBits::IsHostManaged, 0) {
981 if (has_timestamp) {
982 flags |= VideoCommon::QueryFlagBits::HasTimestamp;
983 }
984 }
985
986 u64 stride{};
987 VAddr dependant_address{};
988 Maxwell3D::Regs::PrimitiveTopology topology{Maxwell3D::Regs::PrimitiveTopology::Points};
989 size_t dependant_index{};
990 bool dependant_manage{};
991};
992
993class PrimitivesSucceededStreamer : public VideoCommon::SimpleStreamer<PrimitivesQueryBase> {
994public:
995 explicit PrimitivesSucceededStreamer(size_t id_, QueryCacheRuntime& runtime_,
996 TFBCounterStreamer& tfb_streamer_,
997 Core::Memory::Memory& cpu_memory_)
998 : VideoCommon::SimpleStreamer<PrimitivesQueryBase>(id_), runtime{runtime_},
999 tfb_streamer{tfb_streamer_}, cpu_memory{cpu_memory_} {
1000 MakeDependent(&tfb_streamer);
1001 }
1002
1003 ~PrimitivesSucceededStreamer() = default;
1004
1005 size_t WriteCounter(VAddr address, bool has_timestamp, u32 value,
1006 std::optional<u32> subreport_) override {
1007 auto index = BuildQuery();
1008 auto* new_query = GetQuery(index);
1009 new_query->guest_address = address;
1010 new_query->value = 0;
1011 if (has_timestamp) {
1012 new_query->flags |= VideoCommon::QueryFlagBits::HasTimestamp;
1013 }
1014 if (!subreport_) {
1015 new_query->flags |= VideoCommon::QueryFlagBits::IsFinalValueSynced;
1016 return index;
1017 }
1018 const size_t subreport = static_cast<size_t>(*subreport_);
1019 auto dependant_address_opt = tfb_streamer.GetLastQueryStream(subreport);
1020 bool must_manage_dependance = false;
1021 new_query->topology = tfb_streamer.GetOutputTopology();
1022 if (dependant_address_opt) {
1023 auto [dep_address, stride] = *dependant_address_opt;
1024 new_query->dependant_address = dep_address;
1025 new_query->stride = stride;
1026 } else {
1027 new_query->dependant_index =
1028 tfb_streamer.WriteCounter(address, has_timestamp, value, subreport_);
1029 auto* dependant_query = tfb_streamer.GetQuery(new_query->dependant_index);
1030 dependant_query->flags |= VideoCommon::QueryFlagBits::IsInvalidated;
1031 must_manage_dependance = true;
1032 if (True(dependant_query->flags & VideoCommon::QueryFlagBits::IsFinalValueSynced)) {
1033 new_query->value = 0;
1034 new_query->flags |= VideoCommon::QueryFlagBits::IsFinalValueSynced;
1035 if (must_manage_dependance) {
1036 tfb_streamer.Free(new_query->dependant_index);
1037 }
1038 return index;
1039 }
1040 new_query->stride = 1;
1041 runtime.View3DRegs([new_query, subreport](Maxwell3D& maxwell3d) {
1042 for (size_t i = 0; i < Maxwell3D::Regs::NumTransformFeedbackBuffers; i++) {
1043 const auto& tf = maxwell3d.regs.transform_feedback;
1044 if (tf.buffers[i].enable == 0) {
1045 continue;
1046 }
1047 if (tf.controls[i].stream != subreport) {
1048 continue;
1049 }
1050 new_query->stride = tf.controls[i].stride;
1051 break;
1052 }
1053 });
1054 }
1055
1056 new_query->dependant_manage = must_manage_dependance;
1057 pending_flush_queries.push_back(index);
1058 return index;
1059 }
1060
1061 bool HasUnsyncedQueries() const override {
1062 return !pending_flush_queries.empty();
1063 }
1064
1065 void PushUnsyncedQueries() override {
1066 std::scoped_lock lk(flush_guard);
1067 pending_flush_sets.emplace_back(std::move(pending_flush_queries));
1068 pending_flush_queries.clear();
1069 }
1070
1071 void PopUnsyncedQueries() override {
1072 std::vector<size_t> flushed_queries;
1073 {
1074 std::scoped_lock lk(flush_guard);
1075 flushed_queries = std::move(pending_flush_sets.front());
1076 pending_flush_sets.pop_front();
1077 }
1078
1079 for (auto q : flushed_queries) {
1080 auto* query = GetQuery(q);
1081 if (True(query->flags & VideoCommon::QueryFlagBits::IsFinalValueSynced)) {
1082 continue;
1083 }
1084
1085 query->flags |= VideoCommon::QueryFlagBits::IsFinalValueSynced;
1086 u64 num_vertices = 0;
1087 if (query->dependant_manage) {
1088 auto* dependant_query = tfb_streamer.GetQuery(query->dependant_index);
1089 num_vertices = dependant_query->value / query->stride;
1090 tfb_streamer.Free(query->dependant_index);
1091 } else {
1092 u8* pointer = cpu_memory.GetPointer(query->dependant_address);
1093 u32 result;
1094 std::memcpy(&result, pointer, sizeof(u32));
1095 num_vertices = static_cast<u64>(result) / query->stride;
1096 }
1097 query->value = [&]() -> u64 {
1098 switch (query->topology) {
1099 case Maxwell3D::Regs::PrimitiveTopology::Points:
1100 return num_vertices;
1101 case Maxwell3D::Regs::PrimitiveTopology::Lines:
1102 return num_vertices / 2;
1103 case Maxwell3D::Regs::PrimitiveTopology::LineLoop:
1104 return (num_vertices / 2) + 1;
1105 case Maxwell3D::Regs::PrimitiveTopology::LineStrip:
1106 return num_vertices - 1;
1107 case Maxwell3D::Regs::PrimitiveTopology::Patches:
1108 case Maxwell3D::Regs::PrimitiveTopology::Triangles:
1109 case Maxwell3D::Regs::PrimitiveTopology::TrianglesAdjacency:
1110 return num_vertices / 3;
1111 case Maxwell3D::Regs::PrimitiveTopology::TriangleFan:
1112 case Maxwell3D::Regs::PrimitiveTopology::TriangleStrip:
1113 case Maxwell3D::Regs::PrimitiveTopology::TriangleStripAdjacency:
1114 return num_vertices - 2;
1115 case Maxwell3D::Regs::PrimitiveTopology::Quads:
1116 return num_vertices / 4;
1117 case Maxwell3D::Regs::PrimitiveTopology::Polygon:
1118 return 1U;
1119 default:
1120 return num_vertices;
1121 }
1122 }();
1123 }
1124 }
1125
1126private:
1127 QueryCacheRuntime& runtime;
1128 TFBCounterStreamer& tfb_streamer;
1129 Core::Memory::Memory& cpu_memory;
1130
1131 // syncing queue
1132 std::vector<size_t> pending_sync;
1133
1134 // flush levels
1135 std::vector<size_t> pending_flush_queries;
1136 std::deque<std::vector<size_t>> pending_flush_sets;
1137 std::mutex flush_guard;
1138};
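
PRIMITIVES_SUCCEEDED is never read from a dedicated query here: the streamer divides the byte counter captured by the TFB streamer by the stream's declared vertex stride, then applies the topology switch above. A worked example with made-up numbers:

    // Illustrative numbers only; the real values come from the counter buffer
    // and the transform feedback control registers.
    #include <cassert>
    #include <cstdint>

    int main() {
        const std::uint64_t counter_bytes = 1440; // bytes the stream wrote
        const std::uint64_t stride = 48;          // bytes per captured vertex
        const std::uint64_t num_vertices = counter_bytes / stride; // 30
        // Triangles topology: three vertices per primitive.
        const std::uint64_t primitives_succeeded = num_vertices / 3;
        assert(primitives_succeeded == 10);
        return 0;
    }
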
1139
1140} // namespace
1141
1142struct QueryCacheRuntimeImpl {
1143 QueryCacheRuntimeImpl(QueryCacheRuntime& runtime, VideoCore::RasterizerInterface* rasterizer_,
1144 Core::Memory::Memory& cpu_memory_, Vulkan::BufferCache& buffer_cache_,
1145 const Device& device_, const MemoryAllocator& memory_allocator_,
1146 Scheduler& scheduler_, StagingBufferPool& staging_pool_,
1147 ComputePassDescriptorQueue& compute_pass_descriptor_queue,
1148 DescriptorPool& descriptor_pool)
1149 : rasterizer{rasterizer_}, cpu_memory{cpu_memory_},
1150 buffer_cache{buffer_cache_}, device{device_},
1151 memory_allocator{memory_allocator_}, scheduler{scheduler_}, staging_pool{staging_pool_},
1152 guest_streamer(0, runtime),
1153 sample_streamer(static_cast<size_t>(QueryType::ZPassPixelCount64), runtime, rasterizer,
1154 device, scheduler, memory_allocator, compute_pass_descriptor_queue,
1155 descriptor_pool),
1156 tfb_streamer(static_cast<size_t>(QueryType::StreamingByteCount), runtime, device,
1157 scheduler, memory_allocator, staging_pool),
1158 primitives_succeeded_streamer(
1159 static_cast<size_t>(QueryType::StreamingPrimitivesSucceeded), runtime, tfb_streamer,
1160 cpu_memory_),
1161 primitives_needed_minus_succeeded_streamer(
1162 static_cast<size_t>(QueryType::StreamingPrimitivesNeededMinusSucceeded), runtime, 0u),
1163 hcr_setup{}, hcr_is_set{}, is_hcr_running{} {
28 1164
29QueryPool::QueryPool(const Device& device_, Scheduler& scheduler, QueryType type_) 1165 hcr_setup.sType = VK_STRUCTURE_TYPE_CONDITIONAL_RENDERING_BEGIN_INFO_EXT;
30 : ResourcePool{scheduler.GetMasterSemaphore(), GROW_STEP}, device{device_}, type{type_} {} 1166 hcr_setup.pNext = nullptr;
1167 hcr_setup.flags = 0;
31 1168
32QueryPool::~QueryPool() = default; 1169 conditional_resolve_pass = std::make_unique<ConditionalRenderingResolvePass>(
1170 device, scheduler, descriptor_pool, compute_pass_descriptor_queue);
33 1171
34std::pair<VkQueryPool, u32> QueryPool::Commit() { 1172 const VkBufferCreateInfo buffer_ci = {
35 std::size_t index; 1173 .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
36 do { 1174 .pNext = nullptr,
37 index = CommitResource(); 1175 .flags = 0,
38 } while (usage[index]); 1176 .size = sizeof(u32),
39 usage[index] = true; 1177 .usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
1178 VK_BUFFER_USAGE_CONDITIONAL_RENDERING_BIT_EXT,
1179 .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
1180 .queueFamilyIndexCount = 0,
1181 .pQueueFamilyIndices = nullptr,
1182 };
1183 hcr_resolve_buffer = memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal);
1184 }
1185
1186 VideoCore::RasterizerInterface* rasterizer;
1187 Core::Memory::Memory& cpu_memory;
1188 Vulkan::BufferCache& buffer_cache;
1189
1190 const Device& device;
1191 const MemoryAllocator& memory_allocator;
1192 Scheduler& scheduler;
1193 StagingBufferPool& staging_pool;
1194
1195 // Streamers
1196 VideoCommon::GuestStreamer<QueryCacheParams> guest_streamer;
1197 SamplesStreamer sample_streamer;
1198 TFBCounterStreamer tfb_streamer;
1199 PrimitivesSucceededStreamer primitives_succeeded_streamer;
1200 VideoCommon::StubStreamer<QueryCacheParams> primitives_needed_minus_succeeded_streamer;
40 1201
41 return {*pools[index / GROW_STEP], static_cast<u32>(index % GROW_STEP)}; 1202 std::vector<std::pair<VAddr, VAddr>> little_cache;
1203 std::vector<std::pair<VkBuffer, VkDeviceSize>> buffers_to_upload_to;
1204 std::vector<size_t> redirect_cache;
1205 std::vector<std::vector<VkBufferCopy>> copies_setup;
1206
1207 // Host conditional rendering data
1208 std::unique_ptr<ConditionalRenderingResolvePass> conditional_resolve_pass;
1209 vk::Buffer hcr_resolve_buffer;
1210 VkConditionalRenderingBeginInfoEXT hcr_setup;
1211 VkBuffer hcr_buffer;
1212 size_t hcr_offset;
1213 bool hcr_is_set;
1214 bool is_hcr_running;
1215
1216 // maxwell3d
1217 Maxwell3D* maxwell3d;
1218};
1219
1220QueryCacheRuntime::QueryCacheRuntime(VideoCore::RasterizerInterface* rasterizer,
1221 Core::Memory::Memory& cpu_memory_,
1222 Vulkan::BufferCache& buffer_cache_, const Device& device_,
1223 const MemoryAllocator& memory_allocator_,
1224 Scheduler& scheduler_, StagingBufferPool& staging_pool_,
1225 ComputePassDescriptorQueue& compute_pass_descriptor_queue,
1226 DescriptorPool& descriptor_pool) {
1227 impl = std::make_unique<QueryCacheRuntimeImpl>(
1228 *this, rasterizer, cpu_memory_, buffer_cache_, device_, memory_allocator_, scheduler_,
1229 staging_pool_, compute_pass_descriptor_queue, descriptor_pool);
42} 1230}
43 1231
44void QueryPool::Allocate(std::size_t begin, std::size_t end) { 1232void QueryCacheRuntime::Bind3DEngine(Maxwell3D* maxwell3d) {
45 usage.resize(end); 1233 impl->maxwell3d = maxwell3d;
1234}
46 1235
47 pools.push_back(device.GetLogical().CreateQueryPool({ 1236template <typename Func>
48 .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO, 1237void QueryCacheRuntime::View3DRegs(Func&& func) {
49 .pNext = nullptr, 1238 func(*impl->maxwell3d);
50 .flags = 0, 1239}
51 .queryType = GetTarget(type), 1240
52 .queryCount = static_cast<u32>(end - begin), 1241void QueryCacheRuntime::EndHostConditionalRendering() {
53 .pipelineStatistics = 0, 1242 PauseHostConditionalRendering();
54 })); 1243 impl->hcr_is_set = false;
1244 impl->is_hcr_running = false;
1245 impl->hcr_buffer = nullptr;
1246 impl->hcr_offset = 0;
1247}
1248
1249void QueryCacheRuntime::PauseHostConditionalRendering() {
1250 if (!impl->hcr_is_set) {
1251 return;
1252 }
1253 if (impl->is_hcr_running) {
1254 impl->scheduler.Record(
1255 [](vk::CommandBuffer cmdbuf) { cmdbuf.EndConditionalRenderingEXT(); });
1256 }
1257 impl->is_hcr_running = false;
55} 1258}
56 1259
57void QueryPool::Reserve(std::pair<VkQueryPool, u32> query) { 1260void QueryCacheRuntime::ResumeHostConditionalRendering() {
58 const auto it = 1261 if (!impl->hcr_is_set) {
59 std::find_if(pools.begin(), pools.end(), [query_pool = query.first](vk::QueryPool& pool) { 1262 return;
60 return query_pool == *pool; 1263 }
1264 if (!impl->is_hcr_running) {
1265 impl->scheduler.Record([hcr_setup = impl->hcr_setup](vk::CommandBuffer cmdbuf) {
1266 cmdbuf.BeginConditionalRenderingEXT(hcr_setup);
61 }); 1267 });
1268 }
1269 impl->is_hcr_running = true;
1270}
62 1271
63 if (it != std::end(pools)) { 1272void QueryCacheRuntime::HostConditionalRenderingCompareValueImpl(VideoCommon::LookupData object,
64 const std::ptrdiff_t pool_index = std::distance(std::begin(pools), it); 1273 bool is_equal) {
65 usage[pool_index * GROW_STEP + static_cast<std::ptrdiff_t>(query.second)] = false; 1274 {
1275 std::scoped_lock lk(impl->buffer_cache.mutex);
1276 static constexpr auto sync_info = VideoCommon::ObtainBufferSynchronize::FullSynchronize;
1277 const auto post_op = VideoCommon::ObtainBufferOperation::DoNothing;
1278 const auto [buffer, offset] =
1279 impl->buffer_cache.ObtainCPUBuffer(object.address, 8, sync_info, post_op);
1280 impl->hcr_buffer = buffer->Handle();
1281 impl->hcr_offset = offset;
1282 }
1283 if (impl->hcr_is_set) {
1284 if (impl->hcr_setup.buffer == impl->hcr_buffer &&
1285 impl->hcr_setup.offset == impl->hcr_offset) {
1286 ResumeHostConditionalRendering();
1287 return;
1288 }
1289 PauseHostConditionalRendering();
66 } 1290 }
1291 impl->hcr_setup.buffer = impl->hcr_buffer;
1292 impl->hcr_setup.offset = impl->hcr_offset;
1293 impl->hcr_setup.flags = is_equal ? VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT : 0;
1294 impl->hcr_is_set = true;
1295 impl->is_hcr_running = false;
1296 ResumeHostConditionalRendering();
67} 1297}
68 1298
69QueryCache::QueryCache(VideoCore::RasterizerInterface& rasterizer_, 1299void QueryCacheRuntime::HostConditionalRenderingCompareBCImpl(VAddr address, bool is_equal) {
70 Core::Memory::Memory& cpu_memory_, const Device& device_, 1300 VkBuffer to_resolve;
71 Scheduler& scheduler_) 1301 u32 to_resolve_offset;
72 : QueryCacheBase{rasterizer_, cpu_memory_}, device{device_}, scheduler{scheduler_}, 1302 {
73 query_pools{ 1303 std::scoped_lock lk(impl->buffer_cache.mutex);
74 QueryPool{device_, scheduler_, QueryType::SamplesPassed}, 1304 static constexpr auto sync_info = VideoCommon::ObtainBufferSynchronize::NoSynchronize;
75 } {} 1305 const auto post_op = VideoCommon::ObtainBufferOperation::DoNothing;
76 1306 const auto [buffer, offset] =
77QueryCache::~QueryCache() { 1307 impl->buffer_cache.ObtainCPUBuffer(address, 24, sync_info, post_op);
78 // TODO(Rodrigo): This is a hack to destroy all HostCounter instances before the base class 1308 to_resolve = buffer->Handle();
79 // destructor is called. The query cache should be redesigned to have a proper ownership model 1309 to_resolve_offset = static_cast<u32>(offset);
80 // instead of using shared pointers.
81 for (size_t query_type = 0; query_type < VideoCore::NumQueryTypes; ++query_type) {
82 auto& stream = Stream(static_cast<QueryType>(query_type));
83 stream.Update(false);
84 stream.Reset();
85 } 1310 }
1311 if (impl->is_hcr_running) {
1312 PauseHostConditionalRendering();
1313 }
1314 impl->conditional_resolve_pass->Resolve(*impl->hcr_resolve_buffer, to_resolve,
1315 to_resolve_offset, false);
1316 impl->hcr_setup.buffer = *impl->hcr_resolve_buffer;
1317 impl->hcr_setup.offset = 0;
1318 impl->hcr_setup.flags = is_equal ? 0 : VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT;
1319 impl->hcr_is_set = true;
1320 impl->is_hcr_running = false;
1321 ResumeHostConditionalRendering();
86} 1322}
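
Both compare paths above end up filling the same VkConditionalRenderingBeginInfoEXT: the GPU discards the conditional draws when the 32-bit value at (buffer, offset) is zero, and VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT flips that test, which is why the direct-value path and the compute-resolve path choose the inverted flag on opposite branches. A minimal helper showing the struct's shape (hypothetical; assumes the stock Vulkan headers):

    #include <vulkan/vulkan.h>

    VkConditionalRenderingBeginInfoEXT MakeHcrBegin(VkBuffer buffer, VkDeviceSize offset,
                                                    bool inverted) {
        return VkConditionalRenderingBeginInfoEXT{
            .sType = VK_STRUCTURE_TYPE_CONDITIONAL_RENDERING_BEGIN_INFO_EXT,
            .pNext = nullptr,
            .buffer = buffer, // needs CONDITIONAL_RENDERING_BIT_EXT usage
            .offset = offset, // must be 4-byte aligned per the extension
            .flags = inverted ? static_cast<VkConditionalRenderingFlagsEXT>(
                                    VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT)
                              : VkConditionalRenderingFlagsEXT{},
        };
    }
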
87 1323
88std::pair<VkQueryPool, u32> QueryCache::AllocateQuery(QueryType type) { 1324bool QueryCacheRuntime::HostConditionalRenderingCompareValue(VideoCommon::LookupData object_1,
89 return query_pools[static_cast<std::size_t>(type)].Commit(); 1325 [[maybe_unused]] bool qc_dirty) {
1326 if (!impl->device.IsExtConditionalRendering()) {
1327 return false;
1328 }
1329 HostConditionalRenderingCompareValueImpl(object_1, false);
1330 return true;
90} 1331}
91 1332
92void QueryCache::Reserve(QueryType type, std::pair<VkQueryPool, u32> query) { 1333bool QueryCacheRuntime::HostConditionalRenderingCompareValues(VideoCommon::LookupData object_1,
93 query_pools[static_cast<std::size_t>(type)].Reserve(query); 1334 VideoCommon::LookupData object_2,
1335 bool qc_dirty, bool equal_check) {
1336 if (!impl->device.IsExtConditionalRendering()) {
1337 return false;
1338 }
1339
1340 const auto check_in_bc = [&](VAddr address) {
1341 return impl->buffer_cache.IsRegionGpuModified(address, 8);
1342 };
1343 const auto check_value = [&](VAddr address) {
1344 u8* ptr = impl->cpu_memory.GetPointer(address);
1345 u64 value{};
1346 std::memcpy(&value, ptr, sizeof(value));
1347 return value == 0;
1348 };
1349 std::array<VideoCommon::LookupData*, 2> objects{&object_1, &object_2};
1350 std::array<bool, 2> is_in_bc{};
1351 std::array<bool, 2> is_in_qc{};
1352 std::array<bool, 2> is_in_ac{};
1353 std::array<bool, 2> is_null{};
1354 {
1355 std::scoped_lock lk(impl->buffer_cache.mutex);
1356 for (size_t i = 0; i < 2; i++) {
1357 is_in_qc[i] = objects[i]->found_query != nullptr;
1358 is_in_bc[i] = !is_in_qc[i] && check_in_bc(objects[i]->address);
1359 is_in_ac[i] = is_in_qc[i] || is_in_bc[i];
1360 }
1361 }
1362
1363 if (!is_in_ac[0] && !is_in_ac[1]) {
1364 EndHostConditionalRendering();
1365 return false;
1366 }
1367
1368 if (!qc_dirty && !is_in_bc[0] && !is_in_bc[1]) {
1369 EndHostConditionalRendering();
1370 return false;
1371 }
1372
1373 const bool is_gpu_high = Settings::IsGPULevelHigh();
1374 if (!is_gpu_high && impl->device.GetDriverID() == VK_DRIVER_ID_INTEL_PROPRIETARY_WINDOWS) {
1375 return true;
1376 }
1377
1378 for (size_t i = 0; i < 2; i++) {
1379 is_null[i] = !is_in_ac[i] && check_value(objects[i]->address);
1380 }
1381
1382 for (size_t i = 0; i < 2; i++) {
1383 if (is_null[i]) {
1384 size_t j = (i + 1) % 2;
1385 HostConditionalRenderingCompareValueImpl(*objects[j], equal_check);
1386 return true;
1387 }
1388 }
1389
1390 if (!is_gpu_high) {
1391 return true;
1392 }
1393
1394 if (!is_in_bc[0] && !is_in_bc[1]) {
1395 // Both queries are in the query cache; it's best to just flush.
1396 return true;
1397 }
1398 HostConditionalRenderingCompareBCImpl(object_1.address, equal_check);
1399 return true;
94} 1400}
95 1401
96HostCounter::HostCounter(QueryCache& cache_, std::shared_ptr<HostCounter> dependency_, 1402QueryCacheRuntime::~QueryCacheRuntime() = default;
97 QueryType type_) 1403
98 : HostCounterBase{std::move(dependency_)}, cache{cache_}, type{type_}, 1404VideoCommon::StreamerInterface* QueryCacheRuntime::GetStreamerInterface(QueryType query_type) {
99 query{cache_.AllocateQuery(type_)}, tick{cache_.GetScheduler().CurrentTick()} { 1405 switch (query_type) {
100 const vk::Device* logical = &cache.GetDevice().GetLogical(); 1406 case QueryType::Payload:
101 cache.GetScheduler().Record([logical, query_ = query](vk::CommandBuffer cmdbuf) { 1407 return &impl->guest_streamer;
102 const bool use_precise = Settings::IsGPULevelHigh(); 1408 case QueryType::ZPassPixelCount64:
103 logical->ResetQueryPool(query_.first, query_.second, 1); 1409 return &impl->sample_streamer;
104 cmdbuf.BeginQuery(query_.first, query_.second, 1410 case QueryType::StreamingByteCount:
105 use_precise ? VK_QUERY_CONTROL_PRECISE_BIT : 0); 1411 return &impl->tfb_streamer;
106 }); 1412 case QueryType::StreamingPrimitivesNeeded:
1413 case QueryType::VtgPrimitivesOut:
1414 case QueryType::StreamingPrimitivesSucceeded:
1415 return &impl->primitives_succeeded_streamer;
1416 case QueryType::StreamingPrimitivesNeededMinusSucceeded:
1417 return &impl->primitives_needed_minus_succeeded_streamer;
1418 default:
1419 return nullptr;
1420 }
107} 1421}
108 1422
109HostCounter::~HostCounter() { 1423void QueryCacheRuntime::Barriers(bool is_prebarrier) {
110 cache.Reserve(type, query); 1424 static constexpr VkMemoryBarrier READ_BARRIER{
1425 .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
1426 .pNext = nullptr,
1427 .srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT,
1428 .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT,
1429 };
1430 static constexpr VkMemoryBarrier WRITE_BARRIER{
1431 .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
1432 .pNext = nullptr,
1433 .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
1434 .dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT,
1435 };
1436 if (is_prebarrier) {
1437 impl->scheduler.Record([](vk::CommandBuffer cmdbuf) {
1438 cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
1439 VK_PIPELINE_STAGE_TRANSFER_BIT, 0, READ_BARRIER);
1440 });
1441 } else {
1442 impl->scheduler.Record([](vk::CommandBuffer cmdbuf) {
1443 cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT,
1444 VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, WRITE_BARRIER);
1445 });
1446 }
111} 1447}
112 1448
113void HostCounter::EndQuery() { 1449template <typename SyncValuesType>
114 cache.GetScheduler().Record([query_ = query](vk::CommandBuffer cmdbuf) { 1450void QueryCacheRuntime::SyncValues(std::span<SyncValuesType> values, VkBuffer base_src_buffer) {
115 cmdbuf.EndQuery(query_.first, query_.second); 1451 if (values.size() == 0) {
1452 return;
1453 }
1454 impl->redirect_cache.clear();
1455 impl->little_cache.clear();
1456 size_t total_size = 0;
1457 for (auto& sync_val : values) {
1458 total_size += sync_val.size;
1459 bool found = false;
1460 VAddr base = Common::AlignDown(sync_val.address, Core::Memory::YUZU_PAGESIZE);
1461 VAddr base_end = base + Core::Memory::YUZU_PAGESIZE;
1462 for (size_t i = 0; i < impl->little_cache.size(); i++) {
1463 const auto set_found = [&] {
1464 impl->redirect_cache.push_back(i);
1465 found = true;
1466 };
1467 auto& loc = impl->little_cache[i];
1468 if (base < loc.second && loc.first < base_end) {
1469 set_found();
1470 break;
1471 }
1472 if (loc.first == base_end) {
1473 loc.first = base;
1474 set_found();
1475 break;
1476 }
1477 if (loc.second == base) {
1478 loc.second = base_end;
1479 set_found();
1480 break;
1481 }
1482 }
1483 if (!found) {
1484 impl->redirect_cache.push_back(impl->little_cache.size());
1485 impl->little_cache.emplace_back(base, base_end);
1486 }
1487 }
1488
1489 // Vulkan part.
1490 std::scoped_lock lk(impl->buffer_cache.mutex);
1491 impl->buffer_cache.BufferOperations([&] {
1492 impl->buffers_to_upload_to.clear();
1493 for (auto& pair : impl->little_cache) {
1494 static constexpr auto sync_info = VideoCommon::ObtainBufferSynchronize::FullSynchronize;
1495 const auto post_op = VideoCommon::ObtainBufferOperation::DoNothing;
1496 const auto [buffer, offset] = impl->buffer_cache.ObtainCPUBuffer(
1497 pair.first, static_cast<u32>(pair.second - pair.first), sync_info, post_op);
1498 impl->buffers_to_upload_to.emplace_back(buffer->Handle(), offset);
1499 }
116 }); 1500 });
117}
118 1501
119u64 HostCounter::BlockingQuery(bool async) const { 1502 VkBuffer src_buffer;
120 if (!async) { 1503 [[maybe_unused]] StagingBufferRef ref;
121 cache.GetScheduler().Wait(tick); 1504 impl->copies_setup.clear();
122 } 1505 impl->copies_setup.resize(impl->little_cache.size());
123 u64 data; 1506 if constexpr (SyncValuesType::GeneratesBaseBuffer) {
124 const VkResult query_result = cache.GetDevice().GetLogical().GetQueryResults( 1507 ref = impl->staging_pool.Request(total_size, MemoryUsage::Upload);
125 query.first, query.second, 1, sizeof(data), &data, sizeof(data), 1508 size_t current_offset = ref.offset;
126 VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT); 1509 size_t accumulated_size = 0;
127 1510 for (size_t i = 0; i < values.size(); i++) {
128 switch (query_result) { 1511 size_t which_copy = impl->redirect_cache[i];
129 case VK_SUCCESS: 1512 impl->copies_setup[which_copy].emplace_back(VkBufferCopy{
130 return data; 1513 .srcOffset = current_offset + accumulated_size,
131 case VK_ERROR_DEVICE_LOST: 1514 .dstOffset = impl->buffers_to_upload_to[which_copy].second + values[i].address -
132 cache.GetDevice().ReportLoss(); 1515 impl->little_cache[which_copy].first,
133 [[fallthrough]]; 1516 .size = values[i].size,
134 default: 1517 });
135 throw vk::Exception(query_result); 1518 std::memcpy(ref.mapped_span.data() + accumulated_size, &values[i].value,
1519 values[i].size);
1520 accumulated_size += values[i].size;
1521 }
1522 src_buffer = ref.buffer;
1523 } else {
1524 for (size_t i = 0; i < values.size(); i++) {
1525 size_t which_copy = impl->redirect_cache[i];
1526 impl->copies_setup[which_copy].emplace_back(VkBufferCopy{
1527 .srcOffset = values[i].offset,
1528 .dstOffset = impl->buffers_to_upload_to[which_copy].second + values[i].address -
1529 impl->little_cache[which_copy].first,
1530 .size = values[i].size,
1531 });
1532 }
1533 src_buffer = base_src_buffer;
136 } 1534 }
1535
1536 impl->scheduler.RequestOutsideRenderPassOperationContext();
1537 impl->scheduler.Record([src_buffer, dst_buffers = std::move(impl->buffers_to_upload_to),
1538 vk_copies = std::move(impl->copies_setup)](vk::CommandBuffer cmdbuf) {
1539 size_t size = dst_buffers.size();
1540 for (size_t i = 0; i < size; i++) {
1541 cmdbuf.CopyBuffer(src_buffer, dst_buffers[i].first, vk_copies[i]);
1542 }
1543 });
137} 1544}
138 1545
139} // namespace Vulkan 1546} // namespace Vulkan
1547
1548namespace VideoCommon {
1549
1550template class QueryCacheBase<Vulkan::QueryCacheParams>;
1551
1552} // namespace VideoCommon
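
SyncValues above is the write-combining heart of the reworked cache: every guest-visible query write is widened to its host page, touching pages are merged, and each merged range then costs a single ObtainCPUBuffer plus one CopyBuffer carrying many VkBufferCopy regions. The merging step restated as a standalone function (PAGE stands in for Core::Memory::YUZU_PAGESIZE; names are illustrative):

    #include <cstddef>
    #include <cstdint>
    #include <utility>
    #include <vector>

    constexpr std::uint64_t PAGE = 0x1000;

    // For each address, returns the index of the merged [begin, end) page
    // range it landed in, appending new ranges as needed -- the same roles
    // played by redirect_cache and little_cache above.
    std::vector<std::size_t> CoalescePages(
        const std::vector<std::uint64_t>& addresses,
        std::vector<std::pair<std::uint64_t, std::uint64_t>>& ranges) {
        std::vector<std::size_t> redirect;
        for (const std::uint64_t address : addresses) {
            const std::uint64_t base = address & ~(PAGE - 1); // AlignDown
            const std::uint64_t base_end = base + PAGE;
            bool found = false;
            for (std::size_t i = 0; i < ranges.size(); ++i) {
                auto& range = ranges[i];
                if (base < range.second && range.first < base_end) { // overlaps
                    redirect.push_back(i);
                    found = true;
                    break;
                }
                if (range.first == base_end) { // page sits just below the range
                    range.first = base;
                    redirect.push_back(i);
                    found = true;
                    break;
                }
                if (range.second == base) { // page sits just above the range
                    range.second = base_end;
                    redirect.push_back(i);
                    found = true;
                    break;
                }
            }
            if (!found) {
                redirect.push_back(ranges.size());
                ranges.emplace_back(base, base_end);
            }
        }
        return redirect;
    }
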
diff --git a/src/video_core/renderer_vulkan/vk_query_cache.h b/src/video_core/renderer_vulkan/vk_query_cache.h
index c1b9552eb..e9a1ea169 100644
--- a/src/video_core/renderer_vulkan/vk_query_cache.h
+++ b/src/video_core/renderer_vulkan/vk_query_cache.h
@@ -1,101 +1,75 @@
1// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project 1// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
2// SPDX-License-Identifier: GPL-2.0-or-later 2// SPDX-License-Identifier: GPL-3.0-or-later
3 3
4#pragma once 4#pragma once
5 5
6#include <cstddef>
7#include <memory> 6#include <memory>
8#include <utility>
9#include <vector>
10 7
11#include "common/common_types.h" 8#include "video_core/query_cache/query_cache_base.h"
12#include "video_core/query_cache.h" 9#include "video_core/renderer_vulkan/vk_buffer_cache.h"
13#include "video_core/renderer_vulkan/vk_resource_pool.h"
14#include "video_core/vulkan_common/vulkan_wrapper.h"
15 10
16namespace VideoCore { 11namespace VideoCore {
17class RasterizerInterface; 12class RasterizerInterface;
18} 13}
19 14
15namespace VideoCommon {
16class StreamerInterface;
17}
18
20namespace Vulkan { 19namespace Vulkan {
21 20
22class CachedQuery;
23class Device; 21class Device;
24class HostCounter;
25class QueryCache;
26class Scheduler; 22class Scheduler;
23class StagingBufferPool;
27 24
28using CounterStream = VideoCommon::CounterStreamBase<QueryCache, HostCounter>; 25struct QueryCacheRuntimeImpl;
29 26
30class QueryPool final : public ResourcePool { 27class QueryCacheRuntime {
31public: 28public:
32 explicit QueryPool(const Device& device, Scheduler& scheduler, VideoCore::QueryType type); 29 explicit QueryCacheRuntime(VideoCore::RasterizerInterface* rasterizer,
33 ~QueryPool() override; 30 Core::Memory::Memory& cpu_memory_,
31 Vulkan::BufferCache& buffer_cache_, const Device& device_,
32 const MemoryAllocator& memory_allocator_, Scheduler& scheduler_,
33 StagingBufferPool& staging_pool_,
34 ComputePassDescriptorQueue& compute_pass_descriptor_queue,
35 DescriptorPool& descriptor_pool);
36 ~QueryCacheRuntime();
34 37
35 std::pair<VkQueryPool, u32> Commit(); 38 template <typename SyncValuesType>
39 void SyncValues(std::span<SyncValuesType> values, VkBuffer base_src_buffer = nullptr);
36 40
37 void Reserve(std::pair<VkQueryPool, u32> query); 41 void Barriers(bool is_prebarrier);
38 42
39protected: 43 void EndHostConditionalRendering();
40 void Allocate(std::size_t begin, std::size_t end) override;
41 44
42private: 45 void PauseHostConditionalRendering();
43 static constexpr std::size_t GROW_STEP = 512;
44 46
45 const Device& device; 47 void ResumeHostConditionalRendering();
46 const VideoCore::QueryType type;
47 48
48 std::vector<vk::QueryPool> pools; 49 bool HostConditionalRenderingCompareValue(VideoCommon::LookupData object_1, bool qc_dirty);
49 std::vector<bool> usage;
50};
51 50
52class QueryCache final 51 bool HostConditionalRenderingCompareValues(VideoCommon::LookupData object_1,
53 : public VideoCommon::QueryCacheBase<QueryCache, CachedQuery, CounterStream, HostCounter> { 52 VideoCommon::LookupData object_2, bool qc_dirty,
54public: 53 bool equal_check);
55 explicit QueryCache(VideoCore::RasterizerInterface& rasterizer_,
56 Core::Memory::Memory& cpu_memory_, const Device& device_,
57 Scheduler& scheduler_);
58 ~QueryCache();
59
60 std::pair<VkQueryPool, u32> AllocateQuery(VideoCore::QueryType type);
61 54
62 void Reserve(VideoCore::QueryType type, std::pair<VkQueryPool, u32> query); 55 VideoCommon::StreamerInterface* GetStreamerInterface(VideoCommon::QueryType query_type);
63 56
64 const Device& GetDevice() const noexcept { 57 void Bind3DEngine(Tegra::Engines::Maxwell3D* maxwell3d);
65 return device;
66 }
67 58
68 Scheduler& GetScheduler() const noexcept { 59 template <typename Func>
69 return scheduler; 60 void View3DRegs(Func&& func);
70 }
71 61
72private: 62private:
73 const Device& device; 63 void HostConditionalRenderingCompareValueImpl(VideoCommon::LookupData object, bool is_equal);
74 Scheduler& scheduler; 64 void HostConditionalRenderingCompareBCImpl(VAddr address, bool is_equal);
75 std::array<QueryPool, VideoCore::NumQueryTypes> query_pools; 65 friend struct QueryCacheRuntimeImpl;
66 std::unique_ptr<QueryCacheRuntimeImpl> impl;
76}; 67};
77 68
78class HostCounter final : public VideoCommon::HostCounterBase<QueryCache, HostCounter> { 69struct QueryCacheParams {
79public: 70 using RuntimeType = typename Vulkan::QueryCacheRuntime;
80 explicit HostCounter(QueryCache& cache_, std::shared_ptr<HostCounter> dependency_,
81 VideoCore::QueryType type_);
82 ~HostCounter();
83
84 void EndQuery();
85
86private:
87 u64 BlockingQuery(bool async = false) const override;
88
89 QueryCache& cache;
90 const VideoCore::QueryType type;
91 const std::pair<VkQueryPool, u32> query;
92 const u64 tick;
93}; 71};
94 72
95class CachedQuery : public VideoCommon::CachedQueryBase<HostCounter> { 73using QueryCache = VideoCommon::QueryCacheBase<QueryCacheParams>;
96public:
97 explicit CachedQuery(QueryCache&, VideoCore::QueryType, VAddr cpu_addr_, u8* host_ptr_)
98 : CachedQueryBase{cpu_addr_, host_ptr_} {}
99};
100 74
101} // namespace Vulkan 75} // namespace Vulkan
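
The rewritten header does two structural things: QueryCacheRuntime hides all of its Vulkan state behind the QueryCacheRuntimeImpl pimpl, which is why its destructor is only declared here, and the backend is selected through the QueryCacheParams trait consumed by VideoCommon::QueryCacheBase rather than the old CRTP hierarchy. A reduced sketch of the pimpl constraint (Widget and Impl are stand-in names):

    #include <memory>

    class Widget {
    public:
        Widget();
        ~Widget(); // must be defined where Impl is a complete type
    private:
        struct Impl;                // forward declaration only
        std::unique_ptr<Impl> impl; // unique_ptr's deleter needs the full type
    };

    // widget.cpp
    struct Widget::Impl {
        int state = 0;
    };
    Widget::Widget() : impl{std::make_unique<Impl>()} {}
    Widget::~Widget() = default; // Impl is complete here, so this compiles

Defaulting the destructor inside the class instead would be ill-formed, because destroying a std::unique_ptr to an incomplete type is a compile-time error.
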
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index 01e76a82c..c7ce7c312 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -24,6 +24,7 @@
24#include "video_core/renderer_vulkan/vk_compute_pipeline.h" 24#include "video_core/renderer_vulkan/vk_compute_pipeline.h"
25#include "video_core/renderer_vulkan/vk_descriptor_pool.h" 25#include "video_core/renderer_vulkan/vk_descriptor_pool.h"
26#include "video_core/renderer_vulkan/vk_pipeline_cache.h" 26#include "video_core/renderer_vulkan/vk_pipeline_cache.h"
27#include "video_core/renderer_vulkan/vk_query_cache.h"
27#include "video_core/renderer_vulkan/vk_rasterizer.h" 28#include "video_core/renderer_vulkan/vk_rasterizer.h"
28#include "video_core/renderer_vulkan/vk_scheduler.h" 29#include "video_core/renderer_vulkan/vk_scheduler.h"
29#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" 30#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h"
@@ -170,9 +171,11 @@ RasterizerVulkan::RasterizerVulkan(Core::Frontend::EmuWindow& emu_window_, Tegra
170 buffer_cache_runtime(device, memory_allocator, scheduler, staging_pool, 171 buffer_cache_runtime(device, memory_allocator, scheduler, staging_pool,
171 guest_descriptor_queue, compute_pass_descriptor_queue, descriptor_pool), 172 guest_descriptor_queue, compute_pass_descriptor_queue, descriptor_pool),
172 buffer_cache(*this, cpu_memory_, buffer_cache_runtime), 173 buffer_cache(*this, cpu_memory_, buffer_cache_runtime),
174 query_cache_runtime(this, cpu_memory_, buffer_cache, device, memory_allocator, scheduler,
175 staging_pool, compute_pass_descriptor_queue, descriptor_pool),
176 query_cache(gpu, *this, cpu_memory_, query_cache_runtime),
173 pipeline_cache(*this, device, scheduler, descriptor_pool, guest_descriptor_queue, 177 pipeline_cache(*this, device, scheduler, descriptor_pool, guest_descriptor_queue,
174 render_pass_cache, buffer_cache, texture_cache, gpu.ShaderNotify()), 178 render_pass_cache, buffer_cache, texture_cache, gpu.ShaderNotify()),
175 query_cache{*this, cpu_memory_, device, scheduler},
176 accelerate_dma(buffer_cache, texture_cache, scheduler), 179 accelerate_dma(buffer_cache, texture_cache, scheduler),
177 fence_manager(*this, gpu, texture_cache, buffer_cache, query_cache, device, scheduler), 180 fence_manager(*this, gpu, texture_cache, buffer_cache, query_cache, device, scheduler),
178 wfi_event(device.GetLogical().CreateEvent()) { 181 wfi_event(device.GetLogical().CreateEvent()) {
@@ -189,14 +192,7 @@ void RasterizerVulkan::PrepareDraw(bool is_indexed, Func&& draw_func) {
189 FlushWork(); 192 FlushWork();
190 gpu_memory->FlushCaching(); 193 gpu_memory->FlushCaching();
191 194
192#if ANDROID 195 query_cache.NotifySegment(true);
193 if (Settings::IsGPULevelHigh()) {
194 // This is problematic on Android, disable on GPU Normal.
195 query_cache.UpdateCounters();
196 }
197#else
198 query_cache.UpdateCounters();
199#endif
200 196
201 GraphicsPipeline* const pipeline{pipeline_cache.CurrentGraphicsPipeline()}; 197 GraphicsPipeline* const pipeline{pipeline_cache.CurrentGraphicsPipeline()};
202 if (!pipeline) { 198 if (!pipeline) {
@@ -207,13 +203,12 @@ void RasterizerVulkan::PrepareDraw(bool is_indexed, Func&& draw_func) {
207 pipeline->SetEngine(maxwell3d, gpu_memory); 203 pipeline->SetEngine(maxwell3d, gpu_memory);
208 pipeline->Configure(is_indexed); 204 pipeline->Configure(is_indexed);
209 205
210 BeginTransformFeedback();
211
212 UpdateDynamicStates(); 206 UpdateDynamicStates();
213 207
208 HandleTransformFeedback();
209 query_cache.CounterEnable(VideoCommon::QueryType::ZPassPixelCount64,
210 maxwell3d->regs.zpass_pixel_count_enable);
214 draw_func(); 211 draw_func();
215
216 EndTransformFeedback();
217} 212}
218 213
219void RasterizerVulkan::Draw(bool is_indexed, u32 instance_count) { 214void RasterizerVulkan::Draw(bool is_indexed, u32 instance_count) {
@@ -241,6 +236,14 @@ void RasterizerVulkan::DrawIndirect() {
241 const auto indirect_buffer = buffer_cache.GetDrawIndirectBuffer(); 236 const auto indirect_buffer = buffer_cache.GetDrawIndirectBuffer();
242 const auto& buffer = indirect_buffer.first; 237 const auto& buffer = indirect_buffer.first;
243 const auto& offset = indirect_buffer.second; 238 const auto& offset = indirect_buffer.second;
239 if (params.is_byte_count) {
240 scheduler.Record([buffer_obj = buffer->Handle(), offset,
241 stride = params.stride](vk::CommandBuffer cmdbuf) {
242 cmdbuf.DrawIndirectByteCountEXT(1, 0, buffer_obj, offset, 0,
243 static_cast<u32>(stride));
244 });
245 return;
246 }
244 if (params.include_count) { 247 if (params.include_count) {
245 const auto count = buffer_cache.GetDrawIndirectCount(); 248 const auto count = buffer_cache.GetDrawIndirectCount();
246 const auto& draw_buffer = count.first; 249 const auto& draw_buffer = count.first;
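
The new is_byte_count branch covers draws whose vertex count lives in a transform feedback counter buffer: rather than reading the counter back to the host, it hands the buffer to the driver through VK_EXT_transform_feedback, which computes the vertex count as (counter - counter_offset) / stride. A hedged usage sketch; the buffer names are placeholders:

    cmdbuf.DrawIndirectByteCountEXT(
        /*instance_count=*/1,
        /*first_instance=*/0,
        counter_buffer,        // VkBuffer holding the byte count written by XFB
        counter_buffer_offset, // offset of the 32-bit counter value
        /*counter_offset=*/0,  // bytes subtracted before the division
        vertex_stride);        // size in bytes of one captured vertex
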
@@ -280,20 +283,15 @@ void RasterizerVulkan::DrawTexture() {
280 SCOPE_EXIT({ gpu.TickWork(); }); 283 SCOPE_EXIT({ gpu.TickWork(); });
281 FlushWork(); 284 FlushWork();
282 285
283#if ANDROID 286 query_cache.NotifySegment(true);
284 if (Settings::IsGPULevelHigh()) {
285 // This is problematic on Android, disable on GPU Normal.
286 query_cache.UpdateCounters();
287 }
288#else
289 query_cache.UpdateCounters();
290#endif
291 287
292 texture_cache.SynchronizeGraphicsDescriptors(); 288 texture_cache.SynchronizeGraphicsDescriptors();
293 texture_cache.UpdateRenderTargets(false); 289 texture_cache.UpdateRenderTargets(false);
294 290
295 UpdateDynamicStates(); 291 UpdateDynamicStates();
296 292
293 query_cache.CounterEnable(VideoCommon::QueryType::ZPassPixelCount64,
294 maxwell3d->regs.zpass_pixel_count_enable);
297 const auto& draw_texture_state = maxwell3d->draw_manager->GetDrawTextureState(); 295 const auto& draw_texture_state = maxwell3d->draw_manager->GetDrawTextureState();
298 const auto& sampler = texture_cache.GetGraphicsSampler(draw_texture_state.src_sampler); 296 const auto& sampler = texture_cache.GetGraphicsSampler(draw_texture_state.src_sampler);
299 const auto& texture = texture_cache.GetImageView(draw_texture_state.src_texture); 297 const auto& texture = texture_cache.GetImageView(draw_texture_state.src_texture);
@@ -316,14 +314,9 @@ void RasterizerVulkan::Clear(u32 layer_count) {
316 FlushWork(); 314 FlushWork();
317 gpu_memory->FlushCaching(); 315 gpu_memory->FlushCaching();
318 316
319#if ANDROID 317 query_cache.NotifySegment(true);
320 if (Settings::IsGPULevelHigh()) { 318 query_cache.CounterEnable(VideoCommon::QueryType::ZPassPixelCount64,
321 // This is problematic on Android, disable on GPU Normal. 319 maxwell3d->regs.zpass_pixel_count_enable);
322 query_cache.UpdateCounters();
323 }
324#else
325 query_cache.UpdateCounters();
326#endif
327 320
328 auto& regs = maxwell3d->regs; 321 auto& regs = maxwell3d->regs;
329 const bool use_color = regs.clear_surface.R || regs.clear_surface.G || regs.clear_surface.B || 322 const bool use_color = regs.clear_surface.R || regs.clear_surface.G || regs.clear_surface.B ||
@@ -482,13 +475,13 @@ void RasterizerVulkan::DispatchCompute() {
482 scheduler.Record([dim](vk::CommandBuffer cmdbuf) { cmdbuf.Dispatch(dim[0], dim[1], dim[2]); }); 475 scheduler.Record([dim](vk::CommandBuffer cmdbuf) { cmdbuf.Dispatch(dim[0], dim[1], dim[2]); });
483} 476}
484 477
485void RasterizerVulkan::ResetCounter(VideoCore::QueryType type) { 478void RasterizerVulkan::ResetCounter(VideoCommon::QueryType type) {
486 query_cache.ResetCounter(type); 479 query_cache.CounterReset(type);
487} 480}
488 481
489void RasterizerVulkan::Query(GPUVAddr gpu_addr, VideoCore::QueryType type, 482void RasterizerVulkan::Query(GPUVAddr gpu_addr, VideoCommon::QueryType type,
490 std::optional<u64> timestamp) { 483 VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) {
491 query_cache.Query(gpu_addr, type, timestamp); 484 query_cache.CounterReport(gpu_addr, type, flags, payload, subreport);
492} 485}
493 486
494void RasterizerVulkan::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, 487void RasterizerVulkan::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr,
@@ -669,8 +662,8 @@ void RasterizerVulkan::SignalReference() {
669 fence_manager.SignalReference(); 662 fence_manager.SignalReference();
670} 663}
671 664
672void RasterizerVulkan::ReleaseFences() { 665void RasterizerVulkan::ReleaseFences(bool force) {
673 fence_manager.WaitPendingFences(); 666 fence_manager.WaitPendingFences(force);
674} 667}
675 668
676void RasterizerVulkan::FlushAndInvalidateRegion(VAddr addr, u64 size, 669void RasterizerVulkan::FlushAndInvalidateRegion(VAddr addr, u64 size,
@@ -694,6 +687,8 @@ void RasterizerVulkan::WaitForIdle() {
694 flags |= VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT; 687 flags |= VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT;
695 } 688 }
696 689
690 query_cache.NotifyWFI();
691
697 scheduler.RequestOutsideRenderPassOperationContext(); 692 scheduler.RequestOutsideRenderPassOperationContext();
698 scheduler.Record([event = *wfi_event, flags](vk::CommandBuffer cmdbuf) { 693 scheduler.Record([event = *wfi_event, flags](vk::CommandBuffer cmdbuf) {
699 cmdbuf.SetEvent(event, flags); 694 cmdbuf.SetEvent(event, flags);
@@ -737,19 +732,7 @@ void RasterizerVulkan::TickFrame() {
737 732
738bool RasterizerVulkan::AccelerateConditionalRendering() { 733bool RasterizerVulkan::AccelerateConditionalRendering() {
739 gpu_memory->FlushCaching(); 734 gpu_memory->FlushCaching();
740 if (Settings::IsGPULevelHigh()) { 735 return query_cache.AccelerateHostConditionalRendering();
741 // TODO(Blinkhawk): Reimplement Host conditional rendering.
742 return false;
743 }
744 // Medium / Low Hack: stub any checks on queries written into the buffer cache.
745 const GPUVAddr condition_address{maxwell3d->regs.render_enable.Address()};
746 Maxwell::ReportSemaphore::Compare cmp;
747 if (gpu_memory->IsMemoryDirty(condition_address, sizeof(cmp),
748 VideoCommon::CacheType::BufferCache |
749 VideoCommon::CacheType::QueryCache)) {
750 return true;
751 }
752 return false;
753} 736}
754 737
755bool RasterizerVulkan::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surface& src, 738bool RasterizerVulkan::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surface& src,
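
AccelerateConditionalRendering previously worked only on GPU Medium/Low, approximating the check by treating dirty semaphore memory as a passing condition; it now defers to the query cache, which can emit real predication through VK_EXT_conditional_rendering. A sketch of what that predication looks like at the command buffer level, assuming the extension is present (predicate_buffer and predicate_offset are placeholders):

    const VkConditionalRenderingBeginInfoEXT info{
        .sType = VK_STRUCTURE_TYPE_CONDITIONAL_RENDERING_BEGIN_INFO_EXT,
        .pNext = nullptr,
        .buffer = predicate_buffer, // 32-bit predicate stored in this buffer
        .offset = predicate_offset, // must be a multiple of 4
        .flags = 0,                 // VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT inverts
    };
    cmdbuf.BeginConditionalRenderingEXT(info);
    // Draws recorded here are skipped while the predicate reads zero.
    cmdbuf.EndConditionalRenderingEXT();
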
@@ -795,6 +778,7 @@ bool RasterizerVulkan::AccelerateDisplay(const Tegra::FramebufferConfig& config,
795 if (!image_view) { 778 if (!image_view) {
796 return false; 779 return false;
797 } 780 }
781 query_cache.NotifySegment(false);
798 screen_info.image = image_view->ImageHandle(); 782 screen_info.image = image_view->ImageHandle();
799 screen_info.image_view = image_view->Handle(Shader::TextureType::Color2D); 783 screen_info.image_view = image_view->Handle(Shader::TextureType::Color2D);
800 screen_info.width = image_view->size.width; 784 screen_info.width = image_view->size.width;
@@ -933,31 +917,18 @@ void RasterizerVulkan::UpdateDynamicStates() {
933 } 917 }
934} 918}
935 919
936void RasterizerVulkan::BeginTransformFeedback() { 920void RasterizerVulkan::HandleTransformFeedback() {
937 const auto& regs = maxwell3d->regs; 921 const auto& regs = maxwell3d->regs;
938 if (regs.transform_feedback_enabled == 0) {
939 return;
940 }
941 if (!device.IsExtTransformFeedbackSupported()) { 922 if (!device.IsExtTransformFeedbackSupported()) {
942 LOG_ERROR(Render_Vulkan, "Transform feedbacks used but not supported"); 923 LOG_ERROR(Render_Vulkan, "Transform feedbacks used but not supported");
943 return; 924 return;
944 } 925 }
945 UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderType::TessellationInit) || 926 query_cache.CounterEnable(VideoCommon::QueryType::StreamingByteCount,
946 regs.IsShaderConfigEnabled(Maxwell::ShaderType::Tessellation)); 927 regs.transform_feedback_enabled);
947 scheduler.Record( 928 if (regs.transform_feedback_enabled != 0) {
948 [](vk::CommandBuffer cmdbuf) { cmdbuf.BeginTransformFeedbackEXT(0, 0, nullptr, nullptr); }); 929 UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderType::TessellationInit) ||
949} 930 regs.IsShaderConfigEnabled(Maxwell::ShaderType::Tessellation));
950
951void RasterizerVulkan::EndTransformFeedback() {
952 const auto& regs = maxwell3d->regs;
953 if (regs.transform_feedback_enabled == 0) {
954 return;
955 }
956 if (!device.IsExtTransformFeedbackSupported()) {
957 return;
958 } 931 }
959 scheduler.Record(
960 [](vk::CommandBuffer cmdbuf) { cmdbuf.EndTransformFeedbackEXT(0, 0, nullptr, nullptr); });
961} 932}
962 933
963void RasterizerVulkan::UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs) { 934void RasterizerVulkan::UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs) {
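
BeginTransformFeedback/EndTransformFeedback collapse into HandleTransformFeedback because the StreamingByteCount streamer now owns the BeginTransformFeedbackEXT/EndTransformFeedbackEXT recording; the rasterizer only reports the register value through CounterEnable. One way such an edge-triggered enable can work, sketched with hypothetical streamer methods:

    void CounterEnable(QueryType type, bool enabled) {
        const size_t index = static_cast<size_t>(type);
        if (enabled_flags[index] == enabled) {
            return; // no transition, nothing to record
        }
        enabled_flags[index] = enabled;
        if (enabled) {
            streamers[index]->StartCounter(); // e.g. BeginTransformFeedbackEXT
        } else {
            streamers[index]->PauseCounter(); // e.g. EndTransformFeedbackEXT
        }
    }
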
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h
index b31982485..ad069556c 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.h
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.h
@@ -84,8 +84,9 @@ public:
84 void DrawTexture() override; 84 void DrawTexture() override;
85 void Clear(u32 layer_count) override; 85 void Clear(u32 layer_count) override;
86 void DispatchCompute() override; 86 void DispatchCompute() override;
87 void ResetCounter(VideoCore::QueryType type) override; 87 void ResetCounter(VideoCommon::QueryType type) override;
88 void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) override; 88 void Query(GPUVAddr gpu_addr, VideoCommon::QueryType type,
89 VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) override;
89 void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override; 90 void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override;
90 void DisableGraphicsUniformBuffer(size_t stage, u32 index) override; 91 void DisableGraphicsUniformBuffer(size_t stage, u32 index) override;
91 void FlushAll() override; 92 void FlushAll() override;
@@ -106,7 +107,7 @@ public:
106 void SyncOperation(std::function<void()>&& func) override; 107 void SyncOperation(std::function<void()>&& func) override;
107 void SignalSyncPoint(u32 value) override; 108 void SignalSyncPoint(u32 value) override;
108 void SignalReference() override; 109 void SignalReference() override;
109 void ReleaseFences() override; 110 void ReleaseFences(bool force = true) override;
110 void FlushAndInvalidateRegion( 111 void FlushAndInvalidateRegion(
111 VAddr addr, u64 size, VideoCommon::CacheType which = VideoCommon::CacheType::All) override; 112 VAddr addr, u64 size, VideoCommon::CacheType which = VideoCommon::CacheType::All) override;
112 void WaitForIdle() override; 113 void WaitForIdle() override;
@@ -146,9 +147,7 @@ private:
146 147
147 void UpdateDynamicStates(); 148 void UpdateDynamicStates();
148 149
149 void BeginTransformFeedback(); 150 void HandleTransformFeedback();
150
151 void EndTransformFeedback();
152 151
153 void UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs); 152 void UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs);
154 void UpdateScissorsState(Tegra::Engines::Maxwell3D::Regs& regs); 153 void UpdateScissorsState(Tegra::Engines::Maxwell3D::Regs& regs);
@@ -195,8 +194,9 @@ private:
195 TextureCache texture_cache; 194 TextureCache texture_cache;
196 BufferCacheRuntime buffer_cache_runtime; 195 BufferCacheRuntime buffer_cache_runtime;
197 BufferCache buffer_cache; 196 BufferCache buffer_cache;
198 PipelineCache pipeline_cache; 197 QueryCacheRuntime query_cache_runtime;
199 QueryCache query_cache; 198 QueryCache query_cache;
199 PipelineCache pipeline_cache;
200 AccelerateDMA accelerate_dma; 200 AccelerateDMA accelerate_dma;
201 FenceManager fence_manager; 201 FenceManager fence_manager;
202 202
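
The member reordering at the bottom of this header is load-bearing: C++ constructs non-static members in declaration order, not in mem-initializer order, and the constructor shown earlier passes query_cache_runtime into objects built after it. Reduced illustration of the rule:

    struct Runtime {};
    struct Cache {
        explicit Cache(Runtime& runtime_) : runtime{runtime_} {}
        Runtime& runtime;
    };
    struct Rasterizer {
        Runtime runtime; // constructed first: declaration order governs
        Cache cache;     // constructed second, so it may safely use runtime
        Rasterizer() : cache{runtime} {}
    };
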
diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp
index 89fd31b4f..3be7837f4 100644
--- a/src/video_core/renderer_vulkan/vk_scheduler.cpp
+++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp
@@ -243,10 +243,10 @@ void Scheduler::AllocateNewContext() {
243#if ANDROID 243#if ANDROID
244 if (Settings::IsGPULevelHigh()) { 244 if (Settings::IsGPULevelHigh()) {
245 // This is problematic on Android, disable on GPU Normal. 245 // This is problematic on Android, disable on GPU Normal.
246 query_cache->UpdateCounters(); 246 query_cache->NotifySegment(true);
247 } 247 }
248#else 248#else
249 query_cache->UpdateCounters(); 249 query_cache->NotifySegment(true);
250#endif 250#endif
251 } 251 }
252} 252}
@@ -261,11 +261,12 @@ void Scheduler::EndPendingOperations() {
261#if ANDROID 261#if ANDROID
262 if (Settings::IsGPULevelHigh()) { 262 if (Settings::IsGPULevelHigh()) {
263 // This is problematic on Android, disable on GPU Normal. 263 // This is problematic on Android, disable on GPU Normal.
264 query_cache->DisableStreams(); 264 // query_cache->DisableStreams();
265 } 265 }
266#else 266#else
267 query_cache->DisableStreams(); 267 // query_cache->DisableStreams();
268#endif 268#endif
269 query_cache->NotifySegment(false);
269 EndRenderPass(); 270 EndRenderPass();
270} 271}
271 272
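
The scheduler keeps its Android GPU-level guard but swaps counter updates for segment notifications: a segment spans one command buffer, so counters are closed before submission and reopened when the next buffer starts recording. A sketch of the protocol as the scheduler uses it here:

    // Before the current command buffer is submitted:
    query_cache->NotifySegment(false); // end pending queries in this cmdbuf
    // ... submit ...
    // When the next command buffer begins recording:
    query_cache->NotifySegment(true);  // reopen the counters in the new cmdbuf
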
diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h
index 475c682eb..da03803aa 100644
--- a/src/video_core/renderer_vulkan/vk_scheduler.h
+++ b/src/video_core/renderer_vulkan/vk_scheduler.h
@@ -17,6 +17,11 @@
17#include "video_core/renderer_vulkan/vk_master_semaphore.h" 17#include "video_core/renderer_vulkan/vk_master_semaphore.h"
18#include "video_core/vulkan_common/vulkan_wrapper.h" 18#include "video_core/vulkan_common/vulkan_wrapper.h"
19 19
20namespace VideoCommon {
21template <typename Trait>
22class QueryCacheBase;
23}
24
20namespace Vulkan { 25namespace Vulkan {
21 26
22class CommandPool; 27class CommandPool;
@@ -24,7 +29,8 @@ class Device;
24class Framebuffer; 29class Framebuffer;
25class GraphicsPipeline; 30class GraphicsPipeline;
26class StateTracker; 31class StateTracker;
27class QueryCache; 32
33struct QueryCacheParams;
28 34
29/// The scheduler abstracts command buffer and fence management with an interface that's able to do 35/// The scheduler abstracts command buffer and fence management with an interface that's able to do
30/// OpenGL-like operations on Vulkan command buffers. 36/// OpenGL-like operations on Vulkan command buffers.
@@ -63,7 +69,7 @@ public:
63 void InvalidateState(); 69 void InvalidateState();
64 70
65 /// Assigns the query cache. 71 /// Assigns the query cache.
66 void SetQueryCache(QueryCache& query_cache_) { 72 void SetQueryCache(VideoCommon::QueryCacheBase<QueryCacheParams>& query_cache_) {
67 query_cache = &query_cache_; 73 query_cache = &query_cache_;
68 } 74 }
69 75
@@ -219,7 +225,7 @@ private:
219 std::unique_ptr<MasterSemaphore> master_semaphore; 225 std::unique_ptr<MasterSemaphore> master_semaphore;
220 std::unique_ptr<CommandPool> command_pool; 226 std::unique_ptr<CommandPool> command_pool;
221 227
222 QueryCache* query_cache = nullptr; 228 VideoCommon::QueryCacheBase<QueryCacheParams>* query_cache = nullptr;
223 229
224 vk::CommandBuffer current_cmdbuf; 230 vk::CommandBuffer current_cmdbuf;
225 231
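
Forward-declaring the class template keeps query_cache_base.h out of the scheduler header entirely: pointers and references to an incomplete specialization are legal, and only vk_scheduler.cpp, which actually calls into the cache, needs the full definition. Minimal demonstration:

    namespace VideoCommon {
    template <typename Trait>
    class QueryCacheBase; // a declaration alone supports pointers/references
    }
    namespace Vulkan {
    struct QueryCacheParams; // the concrete Trait, also incomplete here
    }

    // Legal without any definition being visible:
    VideoCommon::QueryCacheBase<Vulkan::QueryCacheParams>* query_cache = nullptr;
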
diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h
index 8c5355a28..94f41266d 100644
--- a/src/video_core/vulkan_common/vulkan_device.h
+++ b/src/video_core/vulkan_common/vulkan_device.h
@@ -61,6 +61,7 @@ VK_DEFINE_HANDLE(VmaAllocator)
61 61
62// Define miscellaneous extensions which may be used by the implementation here. 62// Define miscellaneous extensions which may be used by the implementation here.
63#define FOR_EACH_VK_EXTENSION(EXTENSION) \ 63#define FOR_EACH_VK_EXTENSION(EXTENSION) \
64 EXTENSION(EXT, CONDITIONAL_RENDERING, conditional_rendering) \
64 EXTENSION(EXT, CONSERVATIVE_RASTERIZATION, conservative_rasterization) \ 65 EXTENSION(EXT, CONSERVATIVE_RASTERIZATION, conservative_rasterization) \
65 EXTENSION(EXT, DEPTH_RANGE_UNRESTRICTED, depth_range_unrestricted) \ 66 EXTENSION(EXT, DEPTH_RANGE_UNRESTRICTED, depth_range_unrestricted) \
66 EXTENSION(EXT, MEMORY_BUDGET, memory_budget) \ 67 EXTENSION(EXT, MEMORY_BUDGET, memory_budget) \
@@ -93,6 +94,7 @@ VK_DEFINE_HANDLE(VmaAllocator)
93 94
94// Define extensions where the absence of the extension may result in a degraded experience. 95// Define extensions where the absence of the extension may result in a degraded experience.
95#define FOR_EACH_VK_RECOMMENDED_EXTENSION(EXTENSION_NAME) \ 96#define FOR_EACH_VK_RECOMMENDED_EXTENSION(EXTENSION_NAME) \
97 EXTENSION_NAME(VK_EXT_CONDITIONAL_RENDERING_EXTENSION_NAME) \
96 EXTENSION_NAME(VK_EXT_CONSERVATIVE_RASTERIZATION_EXTENSION_NAME) \ 98 EXTENSION_NAME(VK_EXT_CONSERVATIVE_RASTERIZATION_EXTENSION_NAME) \
97 EXTENSION_NAME(VK_EXT_DEPTH_RANGE_UNRESTRICTED_EXTENSION_NAME) \ 99 EXTENSION_NAME(VK_EXT_DEPTH_RANGE_UNRESTRICTED_EXTENSION_NAME) \
98 EXTENSION_NAME(VK_EXT_EXTENDED_DYNAMIC_STATE_EXTENSION_NAME) \ 100 EXTENSION_NAME(VK_EXT_EXTENDED_DYNAMIC_STATE_EXTENSION_NAME) \
@@ -541,6 +543,10 @@ public:
541 return extensions.shader_atomic_int64; 543 return extensions.shader_atomic_int64;
542 } 544 }
543 545
546 bool IsExtConditionalRendering() const {
547 return extensions.conditional_rendering;
548 }
549
544 bool HasTimelineSemaphore() const; 550 bool HasTimelineSemaphore() const;
545 551
546 /// Returns the minimum supported version of SPIR-V. 552 /// Returns the minimum supported version of SPIR-V.
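
Adding one EXTENSION and one EXTENSION_NAME line is all the plumbing the device needs, because both lists are X-macros re-expanded wherever extension state is declared or enumerated. A sketch of the usual consumption pattern (the exact expansion sites in vulkan_device.h/.cpp may differ):

    struct Extensions {
    #define EXTENSION(prefix, macro_name, var_name) bool var_name{};
        FOR_EACH_VK_EXTENSION(EXTENSION)
    #undef EXTENSION
    };
    // After expansion this contains, among others:
    //   bool conditional_rendering{};
    // which IsExtConditionalRendering() above then exposes.
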
diff --git a/src/video_core/vulkan_common/vulkan_wrapper.cpp b/src/video_core/vulkan_common/vulkan_wrapper.cpp
index c3f388d89..5afba365c 100644
--- a/src/video_core/vulkan_common/vulkan_wrapper.cpp
+++ b/src/video_core/vulkan_common/vulkan_wrapper.cpp
@@ -75,6 +75,7 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept {
75 X(vkBeginCommandBuffer); 75 X(vkBeginCommandBuffer);
76 X(vkBindBufferMemory); 76 X(vkBindBufferMemory);
77 X(vkBindImageMemory); 77 X(vkBindImageMemory);
78 X(vkCmdBeginConditionalRenderingEXT);
78 X(vkCmdBeginQuery); 79 X(vkCmdBeginQuery);
79 X(vkCmdBeginRenderPass); 80 X(vkCmdBeginRenderPass);
80 X(vkCmdBeginTransformFeedbackEXT); 81 X(vkCmdBeginTransformFeedbackEXT);
@@ -91,6 +92,7 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept {
91 X(vkCmdCopyBufferToImage); 92 X(vkCmdCopyBufferToImage);
92 X(vkCmdCopyImage); 93 X(vkCmdCopyImage);
93 X(vkCmdCopyImageToBuffer); 94 X(vkCmdCopyImageToBuffer);
95 X(vkCmdCopyQueryPoolResults);
94 X(vkCmdDispatch); 96 X(vkCmdDispatch);
95 X(vkCmdDispatchIndirect); 97 X(vkCmdDispatchIndirect);
96 X(vkCmdDraw); 98 X(vkCmdDraw);
@@ -99,6 +101,8 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept {
99 X(vkCmdDrawIndexedIndirect); 101 X(vkCmdDrawIndexedIndirect);
100 X(vkCmdDrawIndirectCount); 102 X(vkCmdDrawIndirectCount);
101 X(vkCmdDrawIndexedIndirectCount); 103 X(vkCmdDrawIndexedIndirectCount);
104 X(vkCmdDrawIndirectByteCountEXT);
105 X(vkCmdEndConditionalRenderingEXT);
102 X(vkCmdEndQuery); 106 X(vkCmdEndQuery);
103 X(vkCmdEndRenderPass); 107 X(vkCmdEndRenderPass);
104 X(vkCmdEndTransformFeedbackEXT); 108 X(vkCmdEndTransformFeedbackEXT);
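
Each X(...) line in Load registers one more device-level entry point in the dispatch table. A reduced sketch of the expansion behind it (yuzu routes this through a small Proc helper, simplified away here):

    #define X(name) \
        dld.name = reinterpret_cast<PFN_##name>(dld.vkGetDeviceProcAddr(device, #name))
    X(vkCmdBeginConditionalRenderingEXT);
    X(vkCmdCopyQueryPoolResults);
    X(vkCmdDrawIndirectByteCountEXT);
    X(vkCmdEndConditionalRenderingEXT);
    #undef X
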
diff --git a/src/video_core/vulkan_common/vulkan_wrapper.h b/src/video_core/vulkan_common/vulkan_wrapper.h
index 049fa8038..0d4bbe7f7 100644
--- a/src/video_core/vulkan_common/vulkan_wrapper.h
+++ b/src/video_core/vulkan_common/vulkan_wrapper.h
@@ -185,6 +185,7 @@ struct DeviceDispatch : InstanceDispatch {
185 PFN_vkBeginCommandBuffer vkBeginCommandBuffer{}; 185 PFN_vkBeginCommandBuffer vkBeginCommandBuffer{};
186 PFN_vkBindBufferMemory vkBindBufferMemory{}; 186 PFN_vkBindBufferMemory vkBindBufferMemory{};
187 PFN_vkBindImageMemory vkBindImageMemory{}; 187 PFN_vkBindImageMemory vkBindImageMemory{};
188 PFN_vkCmdBeginConditionalRenderingEXT vkCmdBeginConditionalRenderingEXT{};
188 PFN_vkCmdBeginDebugUtilsLabelEXT vkCmdBeginDebugUtilsLabelEXT{}; 189 PFN_vkCmdBeginDebugUtilsLabelEXT vkCmdBeginDebugUtilsLabelEXT{};
189 PFN_vkCmdBeginQuery vkCmdBeginQuery{}; 190 PFN_vkCmdBeginQuery vkCmdBeginQuery{};
190 PFN_vkCmdBeginRenderPass vkCmdBeginRenderPass{}; 191 PFN_vkCmdBeginRenderPass vkCmdBeginRenderPass{};
@@ -202,6 +203,7 @@ struct DeviceDispatch : InstanceDispatch {
202 PFN_vkCmdCopyBufferToImage vkCmdCopyBufferToImage{}; 203 PFN_vkCmdCopyBufferToImage vkCmdCopyBufferToImage{};
203 PFN_vkCmdCopyImage vkCmdCopyImage{}; 204 PFN_vkCmdCopyImage vkCmdCopyImage{};
204 PFN_vkCmdCopyImageToBuffer vkCmdCopyImageToBuffer{}; 205 PFN_vkCmdCopyImageToBuffer vkCmdCopyImageToBuffer{};
206 PFN_vkCmdCopyQueryPoolResults vkCmdCopyQueryPoolResults{};
205 PFN_vkCmdDispatch vkCmdDispatch{}; 207 PFN_vkCmdDispatch vkCmdDispatch{};
206 PFN_vkCmdDispatchIndirect vkCmdDispatchIndirect{}; 208 PFN_vkCmdDispatchIndirect vkCmdDispatchIndirect{};
207 PFN_vkCmdDraw vkCmdDraw{}; 209 PFN_vkCmdDraw vkCmdDraw{};
@@ -210,6 +212,8 @@ struct DeviceDispatch : InstanceDispatch {
210 PFN_vkCmdDrawIndexedIndirect vkCmdDrawIndexedIndirect{}; 212 PFN_vkCmdDrawIndexedIndirect vkCmdDrawIndexedIndirect{};
211 PFN_vkCmdDrawIndirectCount vkCmdDrawIndirectCount{}; 213 PFN_vkCmdDrawIndirectCount vkCmdDrawIndirectCount{};
212 PFN_vkCmdDrawIndexedIndirectCount vkCmdDrawIndexedIndirectCount{}; 214 PFN_vkCmdDrawIndexedIndirectCount vkCmdDrawIndexedIndirectCount{};
215 PFN_vkCmdDrawIndirectByteCountEXT vkCmdDrawIndirectByteCountEXT{};
216 PFN_vkCmdEndConditionalRenderingEXT vkCmdEndConditionalRenderingEXT{};
213 PFN_vkCmdEndDebugUtilsLabelEXT vkCmdEndDebugUtilsLabelEXT{}; 217 PFN_vkCmdEndDebugUtilsLabelEXT vkCmdEndDebugUtilsLabelEXT{};
214 PFN_vkCmdEndQuery vkCmdEndQuery{}; 218 PFN_vkCmdEndQuery vkCmdEndQuery{};
215 PFN_vkCmdEndRenderPass vkCmdEndRenderPass{}; 219 PFN_vkCmdEndRenderPass vkCmdEndRenderPass{};
@@ -1182,6 +1186,13 @@ public:
1182 count_offset, draw_count, stride); 1186 count_offset, draw_count, stride);
1183 } 1187 }
1184 1188
1189 void DrawIndirectByteCountEXT(u32 instance_count, u32 first_instance, VkBuffer counter_buffer,
1190 VkDeviceSize counter_buffer_offset, u32 counter_offset,
1191 u32 stride) {
1192 dld->vkCmdDrawIndirectByteCountEXT(handle, instance_count, first_instance, counter_buffer,
1193 counter_buffer_offset, counter_offset, stride);
1194 }
1195
1185 void ClearAttachments(Span<VkClearAttachment> attachments, 1196 void ClearAttachments(Span<VkClearAttachment> attachments,
1186 Span<VkClearRect> rects) const noexcept { 1197 Span<VkClearRect> rects) const noexcept {
1187 dld->vkCmdClearAttachments(handle, attachments.size(), attachments.data(), rects.size(), 1198 dld->vkCmdClearAttachments(handle, attachments.size(), attachments.data(), rects.size(),
@@ -1270,6 +1281,13 @@ public:
1270 regions.data()); 1281 regions.data());
1271 } 1282 }
1272 1283
1284 void CopyQueryPoolResults(VkQueryPool query_pool, u32 first_query, u32 query_count,
1285 VkBuffer dst_buffer, VkDeviceSize dst_offset, VkDeviceSize stride,
1286 VkQueryResultFlags flags) const noexcept {
1287 dld->vkCmdCopyQueryPoolResults(handle, query_pool, first_query, query_count, dst_buffer,
1288 dst_offset, stride, flags);
1289 }
1290
1273 void FillBuffer(VkBuffer dst_buffer, VkDeviceSize dst_offset, VkDeviceSize size, 1291 void FillBuffer(VkBuffer dst_buffer, VkDeviceSize dst_offset, VkDeviceSize size,
1274 u32 data) const noexcept { 1292 u32 data) const noexcept {
1275 dld->vkCmdFillBuffer(handle, dst_buffer, dst_offset, size, data); 1293 dld->vkCmdFillBuffer(handle, dst_buffer, dst_offset, size, data);
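
CopyQueryPoolResults is what lets the reworked cache resolve whole query runs on the GPU instead of stalling in vkGetQueryPoolResults on the host. A hedged usage sketch resolving count consecutive 64-bit results into a device buffer:

    cmdbuf.CopyQueryPoolResults(
        query_pool,
        /*first_query=*/0,
        /*query_count=*/count,
        dst_buffer,
        /*dst_offset=*/0,
        /*stride=*/sizeof(u64),
        VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT);

VK_QUERY_RESULT_WAIT_BIT makes the copy wait on the GPU timeline, so no CPU synchronization is needed before later commands consume dst_buffer.
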
@@ -1448,6 +1466,15 @@ public:
1448 counter_buffers, counter_buffer_offsets); 1466 counter_buffers, counter_buffer_offsets);
1449 } 1467 }
1450 1468
1469 void BeginConditionalRenderingEXT(
1470 const VkConditionalRenderingBeginInfoEXT& info) const noexcept {
1471 dld->vkCmdBeginConditionalRenderingEXT(handle, &info);
1472 }
1473
1474 void EndConditionalRenderingEXT() const noexcept {
1475 dld->vkCmdEndConditionalRenderingEXT(handle);
1476 }
1477
1451 void BeginDebugUtilsLabelEXT(const char* label, std::span<float, 4> color) const noexcept { 1478 void BeginDebugUtilsLabelEXT(const char* label, std::span<float, 4> color) const noexcept {
1452 const VkDebugUtilsLabelEXT label_info{ 1479 const VkDebugUtilsLabelEXT label_info{
1453 .sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT, 1480 .sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT,