diff options
| author | 2023-09-25 09:18:29 -0400 | |
|---|---|---|
| committer | 2023-09-25 09:18:29 -0400 | |
| commit | 854457a392b6d38168f7f9d19d1fa8c43fad653c (patch) | |
| tree | 3bc1007b5776f1ce82c057875609105de0a1ca44 /src | |
| parent | Merge pull request #11569 from german77/lle_applet (diff) | |
| parent | Query Cache: Fix Prefix Sums (diff) | |
| download | yuzu-854457a392b6d38168f7f9d19d1fa8c43fad653c.tar.gz yuzu-854457a392b6d38168f7f9d19d1fa8c43fad653c.tar.xz yuzu-854457a392b6d38168f7f9d19d1fa8c43fad653c.zip | |
Merge pull request #11225 from FernandoS27/no-laxatives-in-santas-cookies
Y.F.C: Rework the Query Cache.
Diffstat (limited to 'src')
45 files changed, 3553 insertions, 366 deletions
diff --git a/src/common/settings.cpp b/src/common/settings.cpp index 4ecaf550b..3fde3cae6 100644 --- a/src/common/settings.cpp +++ b/src/common/settings.cpp | |||
| @@ -130,13 +130,17 @@ void LogSettings() { | |||
| 130 | log_path("DataStorage_SDMCDir", Common::FS::GetYuzuPath(Common::FS::YuzuPath::SDMCDir)); | 130 | log_path("DataStorage_SDMCDir", Common::FS::GetYuzuPath(Common::FS::YuzuPath::SDMCDir)); |
| 131 | } | 131 | } |
| 132 | 132 | ||
| 133 | void UpdateGPUAccuracy() { | ||
| 134 | values.current_gpu_accuracy = values.gpu_accuracy.GetValue(); | ||
| 135 | } | ||
| 136 | |||
| 133 | bool IsGPULevelExtreme() { | 137 | bool IsGPULevelExtreme() { |
| 134 | return values.gpu_accuracy.GetValue() == GpuAccuracy::Extreme; | 138 | return values.current_gpu_accuracy == GpuAccuracy::Extreme; |
| 135 | } | 139 | } |
| 136 | 140 | ||
| 137 | bool IsGPULevelHigh() { | 141 | bool IsGPULevelHigh() { |
| 138 | return values.gpu_accuracy.GetValue() == GpuAccuracy::Extreme || | 142 | return values.current_gpu_accuracy == GpuAccuracy::Extreme || |
| 139 | values.gpu_accuracy.GetValue() == GpuAccuracy::High; | 143 | values.current_gpu_accuracy == GpuAccuracy::High; |
| 140 | } | 144 | } |
| 141 | 145 | ||
| 142 | bool IsFastmemEnabled() { | 146 | bool IsFastmemEnabled() { |
diff --git a/src/common/settings.h b/src/common/settings.h index 82ec9077e..ae5e5d2b8 100644 --- a/src/common/settings.h +++ b/src/common/settings.h | |||
| @@ -307,6 +307,7 @@ struct Values { | |||
| 307 | Specialization::Default, | 307 | Specialization::Default, |
| 308 | true, | 308 | true, |
| 309 | true}; | 309 | true}; |
| 310 | GpuAccuracy current_gpu_accuracy{GpuAccuracy::High}; | ||
| 310 | SwitchableSetting<AnisotropyMode, true> max_anisotropy{ | 311 | SwitchableSetting<AnisotropyMode, true> max_anisotropy{ |
| 311 | linkage, AnisotropyMode::Automatic, AnisotropyMode::Automatic, AnisotropyMode::X16, | 312 | linkage, AnisotropyMode::Automatic, AnisotropyMode::Automatic, AnisotropyMode::X16, |
| 312 | "max_anisotropy", Category::RendererAdvanced}; | 313 | "max_anisotropy", Category::RendererAdvanced}; |
| @@ -522,6 +523,7 @@ struct Values { | |||
| 522 | 523 | ||
| 523 | extern Values values; | 524 | extern Values values; |
| 524 | 525 | ||
| 526 | void UpdateGPUAccuracy(); | ||
| 525 | bool IsGPULevelExtreme(); | 527 | bool IsGPULevelExtreme(); |
| 526 | bool IsGPULevelHigh(); | 528 | bool IsGPULevelHigh(); |
| 527 | 529 | ||
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 9b13ccbab..cf9266d54 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt | |||
| @@ -95,6 +95,12 @@ add_library(video_core STATIC | |||
| 95 | memory_manager.h | 95 | memory_manager.h |
| 96 | precompiled_headers.h | 96 | precompiled_headers.h |
| 97 | pte_kind.h | 97 | pte_kind.h |
| 98 | query_cache/bank_base.h | ||
| 99 | query_cache/query_base.h | ||
| 100 | query_cache/query_cache_base.h | ||
| 101 | query_cache/query_cache.h | ||
| 102 | query_cache/query_stream.h | ||
| 103 | query_cache/types.h | ||
| 98 | query_cache.h | 104 | query_cache.h |
| 99 | rasterizer_accelerated.cpp | 105 | rasterizer_accelerated.cpp |
| 100 | rasterizer_accelerated.h | 106 | rasterizer_accelerated.h |
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index 8be7bd594..9e90c587c 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h | |||
| @@ -272,13 +272,19 @@ std::pair<typename P::Buffer*, u32> BufferCache<P>::ObtainBuffer(GPUVAddr gpu_ad | |||
| 272 | if (!cpu_addr) { | 272 | if (!cpu_addr) { |
| 273 | return {&slot_buffers[NULL_BUFFER_ID], 0}; | 273 | return {&slot_buffers[NULL_BUFFER_ID], 0}; |
| 274 | } | 274 | } |
| 275 | const BufferId buffer_id = FindBuffer(*cpu_addr, size); | 275 | return ObtainCPUBuffer(*cpu_addr, size, sync_info, post_op); |
| 276 | } | ||
| 277 | |||
| 278 | template <class P> | ||
| 279 | std::pair<typename P::Buffer*, u32> BufferCache<P>::ObtainCPUBuffer( | ||
| 280 | VAddr cpu_addr, u32 size, ObtainBufferSynchronize sync_info, ObtainBufferOperation post_op) { | ||
| 281 | const BufferId buffer_id = FindBuffer(cpu_addr, size); | ||
| 276 | Buffer& buffer = slot_buffers[buffer_id]; | 282 | Buffer& buffer = slot_buffers[buffer_id]; |
| 277 | 283 | ||
| 278 | // synchronize op | 284 | // synchronize op |
| 279 | switch (sync_info) { | 285 | switch (sync_info) { |
| 280 | case ObtainBufferSynchronize::FullSynchronize: | 286 | case ObtainBufferSynchronize::FullSynchronize: |
| 281 | SynchronizeBuffer(buffer, *cpu_addr, size); | 287 | SynchronizeBuffer(buffer, cpu_addr, size); |
| 282 | break; | 288 | break; |
| 283 | default: | 289 | default: |
| 284 | break; | 290 | break; |
| @@ -286,11 +292,11 @@ std::pair<typename P::Buffer*, u32> BufferCache<P>::ObtainBuffer(GPUVAddr gpu_ad | |||
| 286 | 292 | ||
| 287 | switch (post_op) { | 293 | switch (post_op) { |
| 288 | case ObtainBufferOperation::MarkAsWritten: | 294 | case ObtainBufferOperation::MarkAsWritten: |
| 289 | MarkWrittenBuffer(buffer_id, *cpu_addr, size); | 295 | MarkWrittenBuffer(buffer_id, cpu_addr, size); |
| 290 | break; | 296 | break; |
| 291 | case ObtainBufferOperation::DiscardWrite: { | 297 | case ObtainBufferOperation::DiscardWrite: { |
| 292 | VAddr cpu_addr_start = Common::AlignDown(*cpu_addr, 64); | 298 | VAddr cpu_addr_start = Common::AlignDown(cpu_addr, 64); |
| 293 | VAddr cpu_addr_end = Common::AlignUp(*cpu_addr + size, 64); | 299 | VAddr cpu_addr_end = Common::AlignUp(cpu_addr + size, 64); |
| 294 | IntervalType interval{cpu_addr_start, cpu_addr_end}; | 300 | IntervalType interval{cpu_addr_start, cpu_addr_end}; |
| 295 | ClearDownload(interval); | 301 | ClearDownload(interval); |
| 296 | common_ranges.subtract(interval); | 302 | common_ranges.subtract(interval); |
| @@ -300,7 +306,7 @@ std::pair<typename P::Buffer*, u32> BufferCache<P>::ObtainBuffer(GPUVAddr gpu_ad | |||
| 300 | break; | 306 | break; |
| 301 | } | 307 | } |
| 302 | 308 | ||
| 303 | return {&buffer, buffer.Offset(*cpu_addr)}; | 309 | return {&buffer, buffer.Offset(cpu_addr)}; |
| 304 | } | 310 | } |
| 305 | 311 | ||
| 306 | template <class P> | 312 | template <class P> |
diff --git a/src/video_core/buffer_cache/buffer_cache_base.h b/src/video_core/buffer_cache/buffer_cache_base.h index 0b7135d49..c4f6e8d12 100644 --- a/src/video_core/buffer_cache/buffer_cache_base.h +++ b/src/video_core/buffer_cache/buffer_cache_base.h | |||
| @@ -295,6 +295,10 @@ public: | |||
| 295 | [[nodiscard]] std::pair<Buffer*, u32> ObtainBuffer(GPUVAddr gpu_addr, u32 size, | 295 | [[nodiscard]] std::pair<Buffer*, u32> ObtainBuffer(GPUVAddr gpu_addr, u32 size, |
| 296 | ObtainBufferSynchronize sync_info, | 296 | ObtainBufferSynchronize sync_info, |
| 297 | ObtainBufferOperation post_op); | 297 | ObtainBufferOperation post_op); |
| 298 | |||
| 299 | [[nodiscard]] std::pair<Buffer*, u32> ObtainCPUBuffer(VAddr gpu_addr, u32 size, | ||
| 300 | ObtainBufferSynchronize sync_info, | ||
| 301 | ObtainBufferOperation post_op); | ||
| 298 | void FlushCachedWrites(); | 302 | void FlushCachedWrites(); |
| 299 | 303 | ||
| 300 | /// Return true when there are uncommitted buffers to be downloaded | 304 | /// Return true when there are uncommitted buffers to be downloaded |
| @@ -335,6 +339,14 @@ public: | |||
| 335 | 339 | ||
| 336 | [[nodiscard]] std::pair<Buffer*, u32> GetDrawIndirectBuffer(); | 340 | [[nodiscard]] std::pair<Buffer*, u32> GetDrawIndirectBuffer(); |
| 337 | 341 | ||
| 342 | template <typename Func> | ||
| 343 | void BufferOperations(Func&& func) { | ||
| 344 | do { | ||
| 345 | channel_state->has_deleted_buffers = false; | ||
| 346 | func(); | ||
| 347 | } while (channel_state->has_deleted_buffers); | ||
| 348 | } | ||
| 349 | |||
| 338 | std::recursive_mutex mutex; | 350 | std::recursive_mutex mutex; |
| 339 | Runtime& runtime; | 351 | Runtime& runtime; |
| 340 | 352 | ||
diff --git a/src/video_core/control/channel_state_cache.h b/src/video_core/control/channel_state_cache.h index 46bc9e322..5574e1fba 100644 --- a/src/video_core/control/channel_state_cache.h +++ b/src/video_core/control/channel_state_cache.h | |||
| @@ -51,7 +51,7 @@ public: | |||
| 51 | virtual void CreateChannel(Tegra::Control::ChannelState& channel); | 51 | virtual void CreateChannel(Tegra::Control::ChannelState& channel); |
| 52 | 52 | ||
| 53 | /// Bind a channel for execution. | 53 | /// Bind a channel for execution. |
| 54 | void BindToChannel(s32 id); | 54 | virtual void BindToChannel(s32 id); |
| 55 | 55 | ||
| 56 | /// Erase channel's state. | 56 | /// Erase channel's state. |
| 57 | void EraseChannel(s32 id); | 57 | void EraseChannel(s32 id); |
diff --git a/src/video_core/engines/draw_manager.h b/src/video_core/engines/draw_manager.h index 7c22c49f1..18d959143 100644 --- a/src/video_core/engines/draw_manager.h +++ b/src/video_core/engines/draw_manager.h | |||
| @@ -46,6 +46,7 @@ public: | |||
| 46 | }; | 46 | }; |
| 47 | 47 | ||
| 48 | struct IndirectParams { | 48 | struct IndirectParams { |
| 49 | bool is_byte_count; | ||
| 49 | bool is_indexed; | 50 | bool is_indexed; |
| 50 | bool include_count; | 51 | bool include_count; |
| 51 | GPUVAddr count_start_address; | 52 | GPUVAddr count_start_address; |
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index 06e349e43..32d767d85 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp | |||
| @@ -20,8 +20,6 @@ | |||
| 20 | 20 | ||
| 21 | namespace Tegra::Engines { | 21 | namespace Tegra::Engines { |
| 22 | 22 | ||
| 23 | using VideoCore::QueryType; | ||
| 24 | |||
| 25 | /// First register id that is actually a Macro call. | 23 | /// First register id that is actually a Macro call. |
| 26 | constexpr u32 MacroRegistersStart = 0xE00; | 24 | constexpr u32 MacroRegistersStart = 0xE00; |
| 27 | 25 | ||
| @@ -500,27 +498,21 @@ void Maxwell3D::StampQueryResult(u64 payload, bool long_query) { | |||
| 500 | } | 498 | } |
| 501 | 499 | ||
| 502 | void Maxwell3D::ProcessQueryGet() { | 500 | void Maxwell3D::ProcessQueryGet() { |
| 501 | VideoCommon::QueryPropertiesFlags flags{}; | ||
| 502 | if (regs.report_semaphore.query.short_query == 0) { | ||
| 503 | flags |= VideoCommon::QueryPropertiesFlags::HasTimeout; | ||
| 504 | } | ||
| 505 | const GPUVAddr sequence_address{regs.report_semaphore.Address()}; | ||
| 506 | const VideoCommon::QueryType query_type = | ||
| 507 | static_cast<VideoCommon::QueryType>(regs.report_semaphore.query.report.Value()); | ||
| 508 | const u32 payload = regs.report_semaphore.payload; | ||
| 509 | const u32 subreport = regs.report_semaphore.query.sub_report; | ||
| 503 | switch (regs.report_semaphore.query.operation) { | 510 | switch (regs.report_semaphore.query.operation) { |
| 504 | case Regs::ReportSemaphore::Operation::Release: | 511 | case Regs::ReportSemaphore::Operation::Release: |
| 505 | if (regs.report_semaphore.query.short_query != 0) { | 512 | if (regs.report_semaphore.query.short_query != 0) { |
| 506 | const GPUVAddr sequence_address{regs.report_semaphore.Address()}; | 513 | flags |= VideoCommon::QueryPropertiesFlags::IsAFence; |
| 507 | const u32 payload = regs.report_semaphore.payload; | ||
| 508 | std::function<void()> operation([this, sequence_address, payload] { | ||
| 509 | memory_manager.Write<u32>(sequence_address, payload); | ||
| 510 | }); | ||
| 511 | rasterizer->SignalFence(std::move(operation)); | ||
| 512 | } else { | ||
| 513 | struct LongQueryResult { | ||
| 514 | u64_le value; | ||
| 515 | u64_le timestamp; | ||
| 516 | }; | ||
| 517 | const GPUVAddr sequence_address{regs.report_semaphore.Address()}; | ||
| 518 | const u32 payload = regs.report_semaphore.payload; | ||
| 519 | [this, sequence_address, payload] { | ||
| 520 | memory_manager.Write<u64>(sequence_address + sizeof(u64), system.GPU().GetTicks()); | ||
| 521 | memory_manager.Write<u64>(sequence_address, payload); | ||
| 522 | }(); | ||
| 523 | } | 514 | } |
| 515 | rasterizer->Query(sequence_address, query_type, flags, payload, subreport); | ||
| 524 | break; | 516 | break; |
| 525 | case Regs::ReportSemaphore::Operation::Acquire: | 517 | case Regs::ReportSemaphore::Operation::Acquire: |
| 526 | // TODO(Blinkhawk): Under this operation, the GPU waits for the CPU to write a value that | 518 | // TODO(Blinkhawk): Under this operation, the GPU waits for the CPU to write a value that |
| @@ -528,11 +520,7 @@ void Maxwell3D::ProcessQueryGet() { | |||
| 528 | UNIMPLEMENTED_MSG("Unimplemented query operation ACQUIRE"); | 520 | UNIMPLEMENTED_MSG("Unimplemented query operation ACQUIRE"); |
| 529 | break; | 521 | break; |
| 530 | case Regs::ReportSemaphore::Operation::ReportOnly: | 522 | case Regs::ReportSemaphore::Operation::ReportOnly: |
| 531 | if (const std::optional<u64> result = GetQueryResult()) { | 523 | rasterizer->Query(sequence_address, query_type, flags, payload, subreport); |
| 532 | // If the query returns an empty optional it means it's cached and deferred. | ||
| 533 | // In this case we have a non-empty result, so we stamp it immediately. | ||
| 534 | StampQueryResult(*result, regs.report_semaphore.query.short_query == 0); | ||
| 535 | } | ||
| 536 | break; | 524 | break; |
| 537 | case Regs::ReportSemaphore::Operation::Trap: | 525 | case Regs::ReportSemaphore::Operation::Trap: |
| 538 | UNIMPLEMENTED_MSG("Unimplemented query operation TRAP"); | 526 | UNIMPLEMENTED_MSG("Unimplemented query operation TRAP"); |
| @@ -544,6 +532,10 @@ void Maxwell3D::ProcessQueryGet() { | |||
| 544 | } | 532 | } |
| 545 | 533 | ||
| 546 | void Maxwell3D::ProcessQueryCondition() { | 534 | void Maxwell3D::ProcessQueryCondition() { |
| 535 | if (rasterizer->AccelerateConditionalRendering()) { | ||
| 536 | execute_on = true; | ||
| 537 | return; | ||
| 538 | } | ||
| 547 | const GPUVAddr condition_address{regs.render_enable.Address()}; | 539 | const GPUVAddr condition_address{regs.render_enable.Address()}; |
| 548 | switch (regs.render_enable_override) { | 540 | switch (regs.render_enable_override) { |
| 549 | case Regs::RenderEnable::Override::AlwaysRender: | 541 | case Regs::RenderEnable::Override::AlwaysRender: |
| @@ -553,10 +545,6 @@ void Maxwell3D::ProcessQueryCondition() { | |||
| 553 | execute_on = false; | 545 | execute_on = false; |
| 554 | break; | 546 | break; |
| 555 | case Regs::RenderEnable::Override::UseRenderEnable: { | 547 | case Regs::RenderEnable::Override::UseRenderEnable: { |
| 556 | if (rasterizer->AccelerateConditionalRendering()) { | ||
| 557 | execute_on = true; | ||
| 558 | return; | ||
| 559 | } | ||
| 560 | switch (regs.render_enable.mode) { | 548 | switch (regs.render_enable.mode) { |
| 561 | case Regs::RenderEnable::Mode::True: { | 549 | case Regs::RenderEnable::Mode::True: { |
| 562 | execute_on = true; | 550 | execute_on = true; |
| @@ -598,15 +586,9 @@ void Maxwell3D::ProcessQueryCondition() { | |||
| 598 | } | 586 | } |
| 599 | 587 | ||
| 600 | void Maxwell3D::ProcessCounterReset() { | 588 | void Maxwell3D::ProcessCounterReset() { |
| 601 | #if ANDROID | ||
| 602 | if (!Settings::IsGPULevelHigh()) { | ||
| 603 | // This is problematic on Android, disable on GPU Normal. | ||
| 604 | return; | ||
| 605 | } | ||
| 606 | #endif | ||
| 607 | switch (regs.clear_report_value) { | 589 | switch (regs.clear_report_value) { |
| 608 | case Regs::ClearReport::ZPassPixelCount: | 590 | case Regs::ClearReport::ZPassPixelCount: |
| 609 | rasterizer->ResetCounter(QueryType::SamplesPassed); | 591 | rasterizer->ResetCounter(VideoCommon::QueryType::ZPassPixelCount64); |
| 610 | break; | 592 | break; |
| 611 | default: | 593 | default: |
| 612 | LOG_DEBUG(Render_OpenGL, "Unimplemented counter reset={}", regs.clear_report_value); | 594 | LOG_DEBUG(Render_OpenGL, "Unimplemented counter reset={}", regs.clear_report_value); |
| @@ -620,28 +602,6 @@ void Maxwell3D::ProcessSyncPoint() { | |||
| 620 | rasterizer->SignalSyncPoint(sync_point); | 602 | rasterizer->SignalSyncPoint(sync_point); |
| 621 | } | 603 | } |
| 622 | 604 | ||
| 623 | std::optional<u64> Maxwell3D::GetQueryResult() { | ||
| 624 | switch (regs.report_semaphore.query.report) { | ||
| 625 | case Regs::ReportSemaphore::Report::Payload: | ||
| 626 | return regs.report_semaphore.payload; | ||
| 627 | case Regs::ReportSemaphore::Report::ZPassPixelCount64: | ||
| 628 | #if ANDROID | ||
| 629 | if (!Settings::IsGPULevelHigh()) { | ||
| 630 | // This is problematic on Android, disable on GPU Normal. | ||
| 631 | return 120; | ||
| 632 | } | ||
| 633 | #endif | ||
| 634 | // Deferred. | ||
| 635 | rasterizer->Query(regs.report_semaphore.Address(), QueryType::SamplesPassed, | ||
| 636 | system.GPU().GetTicks()); | ||
| 637 | return std::nullopt; | ||
| 638 | default: | ||
| 639 | LOG_DEBUG(HW_GPU, "Unimplemented query report type {}", | ||
| 640 | regs.report_semaphore.query.report.Value()); | ||
| 641 | return 1; | ||
| 642 | } | ||
| 643 | } | ||
| 644 | |||
| 645 | void Maxwell3D::ProcessCBBind(size_t stage_index) { | 605 | void Maxwell3D::ProcessCBBind(size_t stage_index) { |
| 646 | // Bind the buffer currently in CB_ADDRESS to the specified index in the desired shader | 606 | // Bind the buffer currently in CB_ADDRESS to the specified index in the desired shader |
| 647 | // stage. | 607 | // stage. |
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index 6c19354e1..17faacc37 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h | |||
| @@ -3182,9 +3182,6 @@ private: | |||
| 3182 | /// Handles writes to syncing register. | 3182 | /// Handles writes to syncing register. |
| 3183 | void ProcessSyncPoint(); | 3183 | void ProcessSyncPoint(); |
| 3184 | 3184 | ||
| 3185 | /// Returns a query's value or an empty object if the value will be deferred through a cache. | ||
| 3186 | std::optional<u64> GetQueryResult(); | ||
| 3187 | |||
| 3188 | void RefreshParametersImpl(); | 3185 | void RefreshParametersImpl(); |
| 3189 | 3186 | ||
| 3190 | bool IsMethodExecutable(u32 method); | 3187 | bool IsMethodExecutable(u32 method); |
diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp index 279f0daa1..422d4d859 100644 --- a/src/video_core/engines/maxwell_dma.cpp +++ b/src/video_core/engines/maxwell_dma.cpp | |||
| @@ -362,21 +362,17 @@ void MaxwellDMA::ReleaseSemaphore() { | |||
| 362 | const auto type = regs.launch_dma.semaphore_type; | 362 | const auto type = regs.launch_dma.semaphore_type; |
| 363 | const GPUVAddr address = regs.semaphore.address; | 363 | const GPUVAddr address = regs.semaphore.address; |
| 364 | const u32 payload = regs.semaphore.payload; | 364 | const u32 payload = regs.semaphore.payload; |
| 365 | VideoCommon::QueryPropertiesFlags flags{VideoCommon::QueryPropertiesFlags::IsAFence}; | ||
| 365 | switch (type) { | 366 | switch (type) { |
| 366 | case LaunchDMA::SemaphoreType::NONE: | 367 | case LaunchDMA::SemaphoreType::NONE: |
| 367 | break; | 368 | break; |
| 368 | case LaunchDMA::SemaphoreType::RELEASE_ONE_WORD_SEMAPHORE: { | 369 | case LaunchDMA::SemaphoreType::RELEASE_ONE_WORD_SEMAPHORE: { |
| 369 | std::function<void()> operation( | 370 | rasterizer->Query(address, VideoCommon::QueryType::Payload, flags, payload, 0); |
| 370 | [this, address, payload] { memory_manager.Write<u32>(address, payload); }); | ||
| 371 | rasterizer->SignalFence(std::move(operation)); | ||
| 372 | break; | 371 | break; |
| 373 | } | 372 | } |
| 374 | case LaunchDMA::SemaphoreType::RELEASE_FOUR_WORD_SEMAPHORE: { | 373 | case LaunchDMA::SemaphoreType::RELEASE_FOUR_WORD_SEMAPHORE: { |
| 375 | std::function<void()> operation([this, address, payload] { | 374 | rasterizer->Query(address, VideoCommon::QueryType::Payload, |
| 376 | memory_manager.Write<u64>(address + sizeof(u64), system.GPU().GetTicks()); | 375 | flags | VideoCommon::QueryPropertiesFlags::HasTimeout, payload, 0); |
| 377 | memory_manager.Write<u64>(address, payload); | ||
| 378 | }); | ||
| 379 | rasterizer->SignalFence(std::move(operation)); | ||
| 380 | break; | 376 | break; |
| 381 | } | 377 | } |
| 382 | default: | 378 | default: |
diff --git a/src/video_core/engines/puller.cpp b/src/video_core/engines/puller.cpp index 6de2543b7..8dd34c04a 100644 --- a/src/video_core/engines/puller.cpp +++ b/src/video_core/engines/puller.cpp | |||
| @@ -82,10 +82,8 @@ void Puller::ProcessSemaphoreTriggerMethod() { | |||
| 82 | if (op == GpuSemaphoreOperation::WriteLong) { | 82 | if (op == GpuSemaphoreOperation::WriteLong) { |
| 83 | const GPUVAddr sequence_address{regs.semaphore_address.SemaphoreAddress()}; | 83 | const GPUVAddr sequence_address{regs.semaphore_address.SemaphoreAddress()}; |
| 84 | const u32 payload = regs.semaphore_sequence; | 84 | const u32 payload = regs.semaphore_sequence; |
| 85 | [this, sequence_address, payload] { | 85 | rasterizer->Query(sequence_address, VideoCommon::QueryType::Payload, |
| 86 | memory_manager.Write<u64>(sequence_address + sizeof(u64), gpu.GetTicks()); | 86 | VideoCommon::QueryPropertiesFlags::HasTimeout, payload, 0); |
| 87 | memory_manager.Write<u64>(sequence_address, payload); | ||
| 88 | }(); | ||
| 89 | } else { | 87 | } else { |
| 90 | do { | 88 | do { |
| 91 | const u32 word{memory_manager.Read<u32>(regs.semaphore_address.SemaphoreAddress())}; | 89 | const u32 word{memory_manager.Read<u32>(regs.semaphore_address.SemaphoreAddress())}; |
| @@ -120,10 +118,8 @@ void Puller::ProcessSemaphoreTriggerMethod() { | |||
| 120 | void Puller::ProcessSemaphoreRelease() { | 118 | void Puller::ProcessSemaphoreRelease() { |
| 121 | const GPUVAddr sequence_address{regs.semaphore_address.SemaphoreAddress()}; | 119 | const GPUVAddr sequence_address{regs.semaphore_address.SemaphoreAddress()}; |
| 122 | const u32 payload = regs.semaphore_release; | 120 | const u32 payload = regs.semaphore_release; |
| 123 | std::function<void()> operation([this, sequence_address, payload] { | 121 | rasterizer->Query(sequence_address, VideoCommon::QueryType::Payload, |
| 124 | memory_manager.Write<u32>(sequence_address, payload); | 122 | VideoCommon::QueryPropertiesFlags::IsAFence, payload, 0); |
| 125 | }); | ||
| 126 | rasterizer->SignalFence(std::move(operation)); | ||
| 127 | } | 123 | } |
| 128 | 124 | ||
| 129 | void Puller::ProcessSemaphoreAcquire() { | 125 | void Puller::ProcessSemaphoreAcquire() { |
| @@ -132,7 +128,6 @@ void Puller::ProcessSemaphoreAcquire() { | |||
| 132 | while (word != value) { | 128 | while (word != value) { |
| 133 | regs.acquire_active = true; | 129 | regs.acquire_active = true; |
| 134 | regs.acquire_value = value; | 130 | regs.acquire_value = value; |
| 135 | std::this_thread::sleep_for(std::chrono::milliseconds(1)); | ||
| 136 | rasterizer->ReleaseFences(); | 131 | rasterizer->ReleaseFences(); |
| 137 | word = memory_manager.Read<u32>(regs.semaphore_address.SemaphoreAddress()); | 132 | word = memory_manager.Read<u32>(regs.semaphore_address.SemaphoreAddress()); |
| 138 | // TODO(kemathe73) figure out how to do the acquire_timeout | 133 | // TODO(kemathe73) figure out how to do the acquire_timeout |
diff --git a/src/video_core/fence_manager.h b/src/video_core/fence_manager.h index ab20ff30f..805a89900 100644 --- a/src/video_core/fence_manager.h +++ b/src/video_core/fence_manager.h | |||
| @@ -55,6 +55,9 @@ public: | |||
| 55 | 55 | ||
| 56 | // Unlike other fences, this one doesn't | 56 | // Unlike other fences, this one doesn't |
| 57 | void SignalOrdering() { | 57 | void SignalOrdering() { |
| 58 | if constexpr (!can_async_check) { | ||
| 59 | TryReleasePendingFences<false>(); | ||
| 60 | } | ||
| 58 | std::scoped_lock lock{buffer_cache.mutex}; | 61 | std::scoped_lock lock{buffer_cache.mutex}; |
| 59 | buffer_cache.AccumulateFlushes(); | 62 | buffer_cache.AccumulateFlushes(); |
| 60 | } | 63 | } |
| @@ -104,9 +107,25 @@ public: | |||
| 104 | SignalFence(std::move(func)); | 107 | SignalFence(std::move(func)); |
| 105 | } | 108 | } |
| 106 | 109 | ||
| 107 | void WaitPendingFences() { | 110 | void WaitPendingFences([[maybe_unused]] bool force) { |
| 108 | if constexpr (!can_async_check) { | 111 | if constexpr (!can_async_check) { |
| 109 | TryReleasePendingFences<true>(); | 112 | TryReleasePendingFences<true>(); |
| 113 | } else { | ||
| 114 | if (!force) { | ||
| 115 | return; | ||
| 116 | } | ||
| 117 | std::mutex wait_mutex; | ||
| 118 | std::condition_variable wait_cv; | ||
| 119 | std::atomic<bool> wait_finished{}; | ||
| 120 | std::function<void()> func([&] { | ||
| 121 | std::scoped_lock lk(wait_mutex); | ||
| 122 | wait_finished.store(true, std::memory_order_relaxed); | ||
| 123 | wait_cv.notify_all(); | ||
| 124 | }); | ||
| 125 | SignalFence(std::move(func)); | ||
| 126 | std::unique_lock lk(wait_mutex); | ||
| 127 | wait_cv.wait( | ||
| 128 | lk, [&wait_finished] { return wait_finished.load(std::memory_order_relaxed); }); | ||
| 110 | } | 129 | } |
| 111 | } | 130 | } |
| 112 | 131 | ||
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index c192e33b2..11549d448 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp | |||
| @@ -102,7 +102,8 @@ struct GPU::Impl { | |||
| 102 | 102 | ||
| 103 | /// Signal the ending of command list. | 103 | /// Signal the ending of command list. |
| 104 | void OnCommandListEnd() { | 104 | void OnCommandListEnd() { |
| 105 | rasterizer->ReleaseFences(); | 105 | rasterizer->ReleaseFences(false); |
| 106 | Settings::UpdateGPUAccuracy(); | ||
| 106 | } | 107 | } |
| 107 | 108 | ||
| 108 | /// Request a host GPU memory flush from the CPU. | 109 | /// Request a host GPU memory flush from the CPU. |
| @@ -220,6 +221,7 @@ struct GPU::Impl { | |||
| 220 | /// This can be used to launch any necessary threads and register any necessary | 221 | /// This can be used to launch any necessary threads and register any necessary |
| 221 | /// core timing events. | 222 | /// core timing events. |
| 222 | void Start() { | 223 | void Start() { |
| 224 | Settings::UpdateGPUAccuracy(); | ||
| 223 | gpu_thread.StartThread(*renderer, renderer->Context(), *scheduler); | 225 | gpu_thread.StartThread(*renderer, renderer->Context(), *scheduler); |
| 224 | } | 226 | } |
| 225 | 227 | ||
diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt index c4d459077..6b912027f 100644 --- a/src/video_core/host_shaders/CMakeLists.txt +++ b/src/video_core/host_shaders/CMakeLists.txt | |||
| @@ -41,6 +41,9 @@ set(SHADER_FILES | |||
| 41 | pitch_unswizzle.comp | 41 | pitch_unswizzle.comp |
| 42 | present_bicubic.frag | 42 | present_bicubic.frag |
| 43 | present_gaussian.frag | 43 | present_gaussian.frag |
| 44 | queries_prefix_scan_sum.comp | ||
| 45 | queries_prefix_scan_sum_nosubgroups.comp | ||
| 46 | resolve_conditional_render.comp | ||
| 44 | smaa_edge_detection.vert | 47 | smaa_edge_detection.vert |
| 45 | smaa_edge_detection.frag | 48 | smaa_edge_detection.frag |
| 46 | smaa_blending_weight_calculation.vert | 49 | smaa_blending_weight_calculation.vert |
| @@ -70,6 +73,7 @@ if ("${GLSLANGVALIDATOR}" STREQUAL "GLSLANGVALIDATOR-NOTFOUND") | |||
| 70 | endif() | 73 | endif() |
| 71 | 74 | ||
| 72 | set(GLSL_FLAGS "") | 75 | set(GLSL_FLAGS "") |
| 76 | set(SPIR_V_VERSION "spirv1.3") | ||
| 73 | set(QUIET_FLAG "--quiet") | 77 | set(QUIET_FLAG "--quiet") |
| 74 | 78 | ||
| 75 | set(SHADER_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/include) | 79 | set(SHADER_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/include) |
| @@ -123,7 +127,7 @@ foreach(FILENAME IN ITEMS ${SHADER_FILES}) | |||
| 123 | OUTPUT | 127 | OUTPUT |
| 124 | ${SPIRV_HEADER_FILE} | 128 | ${SPIRV_HEADER_FILE} |
| 125 | COMMAND | 129 | COMMAND |
| 126 | ${GLSLANGVALIDATOR} -V ${QUIET_FLAG} -I"${FIDELITYFX_INCLUDE_DIR}" ${GLSL_FLAGS} --variable-name ${SPIRV_VARIABLE_NAME} -o ${SPIRV_HEADER_FILE} ${SOURCE_FILE} | 130 | ${GLSLANGVALIDATOR} -V ${QUIET_FLAG} -I"${FIDELITYFX_INCLUDE_DIR}" ${GLSL_FLAGS} --variable-name ${SPIRV_VARIABLE_NAME} -o ${SPIRV_HEADER_FILE} ${SOURCE_FILE} --target-env ${SPIR_V_VERSION} |
| 127 | MAIN_DEPENDENCY | 131 | MAIN_DEPENDENCY |
| 128 | ${SOURCE_FILE} | 132 | ${SOURCE_FILE} |
| 129 | ) | 133 | ) |
diff --git a/src/video_core/host_shaders/queries_prefix_scan_sum.comp b/src/video_core/host_shaders/queries_prefix_scan_sum.comp new file mode 100644 index 000000000..6faa8981f --- /dev/null +++ b/src/video_core/host_shaders/queries_prefix_scan_sum.comp | |||
| @@ -0,0 +1,173 @@ | |||
| 1 | // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project | ||
| 2 | // SPDX-License-Identifier: GPL-3.0-or-later | ||
| 3 | |||
| 4 | #version 460 core | ||
| 5 | |||
| 6 | #extension GL_KHR_shader_subgroup_basic : require | ||
| 7 | #extension GL_KHR_shader_subgroup_shuffle : require | ||
| 8 | #extension GL_KHR_shader_subgroup_shuffle_relative : require | ||
| 9 | #extension GL_KHR_shader_subgroup_arithmetic : require | ||
| 10 | |||
| 11 | #ifdef VULKAN | ||
| 12 | |||
| 13 | #define HAS_EXTENDED_TYPES 1 | ||
| 14 | #define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants { | ||
| 15 | #define END_PUSH_CONSTANTS }; | ||
| 16 | #define UNIFORM(n) | ||
| 17 | #define BINDING_INPUT_BUFFER 0 | ||
| 18 | #define BINDING_OUTPUT_IMAGE 1 | ||
| 19 | |||
| 20 | #else // ^^^ Vulkan ^^^ // vvv OpenGL vvv | ||
| 21 | |||
| 22 | #extension GL_NV_gpu_shader5 : enable | ||
| 23 | #ifdef GL_NV_gpu_shader5 | ||
| 24 | #define HAS_EXTENDED_TYPES 1 | ||
| 25 | #else | ||
| 26 | #define HAS_EXTENDED_TYPES 0 | ||
| 27 | #endif | ||
| 28 | #define BEGIN_PUSH_CONSTANTS | ||
| 29 | #define END_PUSH_CONSTANTS | ||
| 30 | #define UNIFORM(n) layout(location = n) uniform | ||
| 31 | #define BINDING_INPUT_BUFFER 0 | ||
| 32 | #define BINDING_OUTPUT_IMAGE 0 | ||
| 33 | |||
| 34 | #endif | ||
| 35 | |||
| 36 | BEGIN_PUSH_CONSTANTS | ||
| 37 | UNIFORM(0) uint min_accumulation_base; | ||
| 38 | UNIFORM(1) uint max_accumulation_base; | ||
| 39 | UNIFORM(2) uint accumulation_limit; | ||
| 40 | UNIFORM(3) uint buffer_offset; | ||
| 41 | END_PUSH_CONSTANTS | ||
| 42 | |||
| 43 | #define LOCAL_RESULTS 8 | ||
| 44 | #define QUERIES_PER_INVOC 2048 | ||
| 45 | |||
| 46 | layout(local_size_x = QUERIES_PER_INVOC / LOCAL_RESULTS) in; | ||
| 47 | |||
| 48 | layout(std430, binding = 0) readonly buffer block1 { | ||
| 49 | uvec2 input_data[]; | ||
| 50 | }; | ||
| 51 | |||
| 52 | layout(std430, binding = 1) coherent buffer block2 { | ||
| 53 | uvec2 output_data[]; | ||
| 54 | }; | ||
| 55 | |||
| 56 | layout(std430, binding = 2) coherent buffer block3 { | ||
| 57 | uvec2 accumulated_data; | ||
| 58 | }; | ||
| 59 | |||
| 60 | shared uvec2 shared_data[128]; | ||
| 61 | |||
| 62 | // Simple Uint64 add that uses 2 uint variables for GPUs that don't support uint64 | ||
| 63 | uvec2 AddUint64(uvec2 value_1, uvec2 value_2) { | ||
| 64 | uint carry = 0; | ||
| 65 | uvec2 result; | ||
| 66 | result.x = uaddCarry(value_1.x, value_2.x, carry); | ||
| 67 | result.y = value_1.y + value_2.y + carry; | ||
| 68 | return result; | ||
| 69 | } | ||
| 70 | |||
// Inclusive prefix sum of emulated-uint64 values across the current subgroup,
// using Hillis and Steele's scan: on each pass every lane adds the partial sum
// held by the lane `i` positions below it, with `i` doubling each pass.
uvec2 subgroupInclusiveAddUint64(uvec2 value) {
    uvec2 result = value;
    for (uint i = 1; i < gl_SubgroupSize; i *= 2) {
        uvec2 other = subgroupShuffleUp(result, i); // get value from subgroup_inv_id - i;
        // Lanes with index < i have no lane i positions below them; the shuffled
        // value is undefined there, so those lanes keep their partial sum as-is.
        if (i <= gl_SubgroupInvocationID) {
            result = AddUint64(result, other);
        }
    }
    return result;
}
| 82 | |||
// Writes down the results to the output buffer and to the accumulation buffer.
// Queries below min_accumulation_base continue the accumulated counter from
// previous dispatches; the invocation owning the query at accumulation_limit
// publishes the next accumulator value.
void WriteResults(uvec2 results[LOCAL_RESULTS]) {
    const uint current_id = gl_LocalInvocationID.x;
    const uvec2 accum = accumulated_data;
    for (uint i = 0; i < LOCAL_RESULTS; i++) {
        uvec2 base_data = current_id * LOCAL_RESULTS + i < min_accumulation_base ? accum : uvec2(0, 0);
        // BUGFIX: the sum was previously discarded (bare call to AddUint64), so
        // the accumulation base was never applied. The nosubgroups variant of
        // this shader assigns the sum, which is the intended behavior.
        results[i] = AddUint64(results[i], base_data);
    }
    for (uint i = 0; i < LOCAL_RESULTS; i++) {
        output_data[buffer_offset + current_id * LOCAL_RESULTS + i] = results[i];
    }
    // Locate the invocation (base_id) and local slot (index) that own the query
    // at accumulation_limit.
    uint index = accumulation_limit % LOCAL_RESULTS;
    uint base_id = accumulation_limit / LOCAL_RESULTS;
    if (min_accumulation_base >= accumulation_limit + 1) {
        if (current_id == base_id) {
            accumulated_data = results[index];
        }
        return;
    }
    // We have that ugly case in which the accumulation data is reset in the middle somewhere.
    barrier();
    groupMemoryBarrier();

    if (current_id == base_id) {
        uvec2 reset_value = output_data[max_accumulation_base - 1];
        // Calculate two complement / negate manually
        reset_value = AddUint64(uvec2(1,0), ~reset_value);
        accumulated_data = AddUint64(results[index], reset_value);
    }
}
| 113 | |||
// Entry point: three-level inclusive prefix sum over the query buffer.
// Level 1: each invocation serially scans its own LOCAL_RESULTS inputs.
// Level 2: the per-invocation totals are scanned across the subgroup.
// Level 3: per-subgroup totals are combined through shared memory.
void main() {
    const uint subgroup_inv_id = gl_SubgroupInvocationID;
    const uint subgroup_id = gl_SubgroupID + gl_WorkGroupID.x * gl_NumSubgroups;
    // Highest active invocation index in this subgroup.
    const uint last_subgroup_id = subgroupMax(subgroup_inv_id);
    const uint current_id = gl_LocalInvocationID.x;
    const uint total_work = accumulation_limit;
    const uint last_result_id = LOCAL_RESULTS - 1;
    uvec2 data[LOCAL_RESULTS];
    for (uint i = 0; i < LOCAL_RESULTS; i++) {
        data[i] = input_data[buffer_offset + current_id * LOCAL_RESULTS + i];
    }
    // Level 1: serial inclusive scan of this invocation's values.
    uvec2 results[LOCAL_RESULTS];
    results[0] = data[0];
    for (uint i = 1; i < LOCAL_RESULTS; i++) {
        results[i] = AddUint64(data[i], results[i - 1]);
    }
    // make sure all input data has been loaded
    subgroupBarrier();
    subgroupMemoryBarrier();

    // on the last local result, do a subgroup inclusive scan sum
    results[last_result_id] = subgroupInclusiveAddUint64(results[last_result_id]);
    // get the last local result from the subgroup behind the current
    uvec2 result_behind = subgroupShuffleUp(results[last_result_id], 1);
    if (subgroup_inv_id != 0) {
        // Fold the previous lane's running total into all but the last local
        // result; the last one already includes it via the subgroup scan above.
        for (uint i = 1; i < LOCAL_RESULTS; i++) {
            results[i - 1] = AddUint64(results[i - 1], result_behind);
        }
    }

    // if we had less queries than our subgroup, just write down the results.
    if (total_work <= gl_SubgroupSize * LOCAL_RESULTS) { // This condition is constant per dispatch.
        WriteResults(results);
        return;
    }

    // We now have more, so lets write the last result into shared memory.
    // Only pick the last subgroup.
    if (subgroup_inv_id == last_subgroup_id) {
        shared_data[subgroup_id] = results[last_result_id];
    }
    // wait until everyone loaded their stuffs
    barrier();
    memoryBarrierShared();

    // only if it's not the first subgroup
    if (subgroup_id != 0) {
        // get the results from some previous invocation
        // NOTE(review): lane i reads subgroup i's total here, so this appears to
        // assume gl_NumSubgroups <= gl_SubgroupSize — confirm for devices with
        // small subgroup sizes.
        uvec2 tmp = shared_data[subgroup_inv_id];
        subgroupBarrier();
        subgroupMemoryBarrierShared();
        tmp = subgroupInclusiveAddUint64(tmp);
        // obtain the result that would be equivalent to the previous result
        uvec2 shuffled_result = subgroupShuffle(tmp, subgroup_id - 1);
        for (uint i = 0; i < LOCAL_RESULTS; i++) {
            results[i] = AddUint64(results[i], shuffled_result);
        }
    }
    WriteResults(results);
}
diff --git a/src/video_core/host_shaders/queries_prefix_scan_sum_nosubgroups.comp b/src/video_core/host_shaders/queries_prefix_scan_sum_nosubgroups.comp new file mode 100644 index 000000000..559a213b9 --- /dev/null +++ b/src/video_core/host_shaders/queries_prefix_scan_sum_nosubgroups.comp | |||
| @@ -0,0 +1,138 @@ | |||
| 1 | // SPDX-FileCopyrightText: Copyright 2015 Graham Sellers, Richard Wright Jr. and Nicholas Haemel | ||
| 2 | // SPDX-License-Identifier: MIT | ||
| 3 | |||
| 4 | // Code obtained from OpenGL SuperBible, Seventh Edition by Graham Sellers, Richard Wright Jr. and | ||
| 5 | // Nicholas Haemel. Modified to suit needs. | ||
| 6 | |||
| 7 | #version 460 core | ||
| 8 | |||
| 9 | #ifdef VULKAN | ||
| 10 | |||
| 11 | #define HAS_EXTENDED_TYPES 1 | ||
| 12 | #define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants { | ||
| 13 | #define END_PUSH_CONSTANTS }; | ||
| 14 | #define UNIFORM(n) | ||
| 15 | #define BINDING_INPUT_BUFFER 0 | ||
| 16 | #define BINDING_OUTPUT_IMAGE 1 | ||
| 17 | |||
| 18 | #else // ^^^ Vulkan ^^^ // vvv OpenGL vvv | ||
| 19 | |||
| 20 | #extension GL_NV_gpu_shader5 : enable | ||
| 21 | #ifdef GL_NV_gpu_shader5 | ||
| 22 | #define HAS_EXTENDED_TYPES 1 | ||
| 23 | #else | ||
| 24 | #define HAS_EXTENDED_TYPES 0 | ||
| 25 | #endif | ||
| 26 | #define BEGIN_PUSH_CONSTANTS | ||
| 27 | #define END_PUSH_CONSTANTS | ||
| 28 | #define UNIFORM(n) layout(location = n) uniform | ||
| 29 | #define BINDING_INPUT_BUFFER 0 | ||
| 30 | #define BINDING_OUTPUT_IMAGE 0 | ||
| 31 | |||
| 32 | #endif | ||
| 33 | |||
| 34 | BEGIN_PUSH_CONSTANTS | ||
| 35 | UNIFORM(0) uint min_accumulation_base; | ||
| 36 | UNIFORM(1) uint max_accumulation_base; | ||
| 37 | UNIFORM(2) uint accumulation_limit; | ||
| 38 | UNIFORM(3) uint buffer_offset; | ||
| 39 | END_PUSH_CONSTANTS | ||
| 40 | |||
| 41 | #define LOCAL_RESULTS 4 | ||
| 42 | #define QUERIES_PER_INVOC 2048 | ||
| 43 | |||
| 44 | layout(local_size_x = QUERIES_PER_INVOC / LOCAL_RESULTS) in; | ||
| 45 | |||
| 46 | layout(std430, binding = 0) readonly buffer block1 { | ||
| 47 | uvec2 input_data[gl_WorkGroupSize.x * LOCAL_RESULTS]; | ||
| 48 | }; | ||
| 49 | |||
| 50 | layout(std430, binding = 1) writeonly coherent buffer block2 { | ||
| 51 | uvec2 output_data[gl_WorkGroupSize.x * LOCAL_RESULTS]; | ||
| 52 | }; | ||
| 53 | |||
| 54 | layout(std430, binding = 2) coherent buffer block3 { | ||
| 55 | uvec2 accumulated_data; | ||
| 56 | }; | ||
| 57 | |||
| 58 | shared uvec2 shared_data[gl_WorkGroupSize.x * LOCAL_RESULTS]; | ||
| 59 | |||
// 64-bit add emulated on a uvec2 (x = low word, y = high word); the carry out
// of the low word is propagated into the high word.
uvec2 AddUint64(uvec2 lhs, uvec2 rhs) {
    uint low_carry;
    uvec2 sum;
    sum.x = uaddCarry(lhs.x, rhs.x, low_carry);
    sum.y = lhs.y + rhs.y + low_carry;
    return sum;
}
| 67 | |||
// Entry point: Hillis-Steele scan in shared memory (adapted from the OpenGL
// SuperBible), extended so each invocation owns LOCAL_RESULTS consecutive
// elements. Fallback path for hosts without subgroup support.
void main(void) {
    uint id = gl_LocalInvocationID.x;
    uvec2 base_value[LOCAL_RESULTS];
    // Snapshot the accumulator once. The previous code cached it in `accum`
    // but then re-read the coherent SSBO on every loop iteration anyway; use
    // the cached copy so the base is read once and is consistent for the loop.
    const uvec2 accum = accumulated_data;
    for (uint i = 0; i < LOCAL_RESULTS; i++) {
        // Queries below min_accumulation_base continue from the accumulated
        // counter of previous dispatches; the rest start from zero.
        base_value[i] = (buffer_offset + id * LOCAL_RESULTS + i) < min_accumulation_base
                            ? accum
                            : uvec2(0);
    }
    uint work_size = gl_WorkGroupSize.x;
    uint rd_id;
    uint wr_id;
    uint mask;
    uvec2 inputs[LOCAL_RESULTS];
    for (uint i = 0; i < LOCAL_RESULTS; i++) {
        inputs[i] = input_data[buffer_offset + id * LOCAL_RESULTS + i];
    }
    // The number of steps is the log base 2 of the
    // work group size, which should be a power of 2
    const uint steps = uint(log2(work_size)) + uint(log2(LOCAL_RESULTS));
    uint step = 0;

    // Each invocation seeds its LOCAL_RESULTS elements of the shared array.
    for (uint i = 0; i < LOCAL_RESULTS; i++) {
        shared_data[id * LOCAL_RESULTS + i] = inputs[i];
    }
    // Synchronize to make sure that everyone has initialized
    // their elements of shared_data[] with data loaded from
    // the input arrays
    barrier();
    memoryBarrierShared();
    // For each step...
    for (step = 0; step < steps; step++) {
        // Calculate the read and write index in the
        // shared array
        mask = (1 << step) - 1;
        rd_id = ((id >> step) << (step + 1)) + mask;
        wr_id = rd_id + 1 + (id & mask);
        // Accumulate the read data into our element

        shared_data[wr_id] = AddUint64(shared_data[rd_id], shared_data[wr_id]);
        // Synchronize again to make sure that everyone
        // has caught up with us
        barrier();
        memoryBarrierShared();
    }
    // Add the accumulation
    for (uint i = 0; i < LOCAL_RESULTS; i++) {
        shared_data[id * LOCAL_RESULTS + i] =
            AddUint64(shared_data[id * LOCAL_RESULTS + i], base_value[i]);
    }
    barrier();
    memoryBarrierShared();

    // Finally write our data back to the output buffer
    for (uint i = 0; i < LOCAL_RESULTS; i++) {
        output_data[buffer_offset + id * LOCAL_RESULTS + i] = shared_data[id * LOCAL_RESULTS + i];
    }
    if (id == 0) {
        if (min_accumulation_base >= accumulation_limit + 1) {
            accumulated_data = shared_data[accumulation_limit];
            return;
        }
        // Accumulation was reset mid-buffer: rebase the final value by adding
        // the two's complement (i.e. subtracting) of the value at the reset point.
        uvec2 reset_value = shared_data[max_accumulation_base - 1];
        uvec2 final_value = shared_data[accumulation_limit];
        // Two complements
        reset_value = AddUint64(uvec2(1, 0), ~reset_value);
        accumulated_data = AddUint64(final_value, reset_value);
    }
}
diff --git a/src/video_core/host_shaders/resolve_conditional_render.comp b/src/video_core/host_shaders/resolve_conditional_render.comp new file mode 100644 index 000000000..307e77d1a --- /dev/null +++ b/src/video_core/host_shaders/resolve_conditional_render.comp | |||
| @@ -0,0 +1,20 @@ | |||
| 1 | // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project | ||
| 2 | // SPDX-License-Identifier: GPL-3.0-or-later | ||
| 3 | |||
#version 450

layout(local_size_x = 1) in;

layout(std430, binding = 0) buffer Query {
    uvec2 initial;
    uvec2 unknown;
    uvec2 current;
};

layout(std430, binding = 1) buffer Result {
    uint result;
};

// Conditional rendering passes (result = 1) only while the query value has not
// changed between the two snapshots; any difference resolves to 0.
void main() {
    // Vector equality yields a single bool in GLSL; uint(bool) is 1 or 0.
    result = uint(initial == current);
}
diff --git a/src/video_core/macro/macro_hle.cpp b/src/video_core/macro/macro_hle.cpp index 6272a4652..046c8085e 100644 --- a/src/video_core/macro/macro_hle.cpp +++ b/src/video_core/macro/macro_hle.cpp | |||
| @@ -67,6 +67,7 @@ public: | |||
| 67 | } | 67 | } |
| 68 | 68 | ||
| 69 | auto& params = maxwell3d.draw_manager->GetIndirectParams(); | 69 | auto& params = maxwell3d.draw_manager->GetIndirectParams(); |
| 70 | params.is_byte_count = false; | ||
| 70 | params.is_indexed = false; | 71 | params.is_indexed = false; |
| 71 | params.include_count = false; | 72 | params.include_count = false; |
| 72 | params.count_start_address = 0; | 73 | params.count_start_address = 0; |
| @@ -161,6 +162,7 @@ public: | |||
| 161 | 0, 0x644, Maxwell3D::HLEReplacementAttributeType::BaseInstance); | 162 | 0, 0x644, Maxwell3D::HLEReplacementAttributeType::BaseInstance); |
| 162 | } | 163 | } |
| 163 | auto& params = maxwell3d.draw_manager->GetIndirectParams(); | 164 | auto& params = maxwell3d.draw_manager->GetIndirectParams(); |
| 165 | params.is_byte_count = false; | ||
| 164 | params.is_indexed = true; | 166 | params.is_indexed = true; |
| 165 | params.include_count = false; | 167 | params.include_count = false; |
| 166 | params.count_start_address = 0; | 168 | params.count_start_address = 0; |
| @@ -256,6 +258,7 @@ public: | |||
| 256 | const u32 estimate = static_cast<u32>(maxwell3d.EstimateIndexBufferSize()); | 258 | const u32 estimate = static_cast<u32>(maxwell3d.EstimateIndexBufferSize()); |
| 257 | maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true; | 259 | maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true; |
| 258 | auto& params = maxwell3d.draw_manager->GetIndirectParams(); | 260 | auto& params = maxwell3d.draw_manager->GetIndirectParams(); |
| 261 | params.is_byte_count = false; | ||
| 259 | params.is_indexed = true; | 262 | params.is_indexed = true; |
| 260 | params.include_count = true; | 263 | params.include_count = true; |
| 261 | params.count_start_address = maxwell3d.GetMacroAddress(4); | 264 | params.count_start_address = maxwell3d.GetMacroAddress(4); |
| @@ -319,6 +322,47 @@ private: | |||
| 319 | } | 322 | } |
| 320 | }; | 323 | }; |
| 321 | 324 | ||
// HLE replacement for the draw-auto style macro: issues a non-indexed draw
// whose vertex count is derived from a byte count (presumably written by a
// previous transform feedback pass — the registers written here are the
// draw_auto_* family; confirm against the guest macro).
class HLE_DrawIndirectByteCount final : public HLEMacroImpl {
public:
    explicit HLE_DrawIndirectByteCount(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {}

    // parameters[0]: draw begin value; low 16 bits hold the primitive topology.
    // parameters[1]: vertex stride in bytes.
    // parameters[2]: byte count to draw.
    void Execute(const std::vector<u32>& parameters, [[maybe_unused]] u32 method) override {
        auto topology = static_cast<Maxwell3D::Regs::PrimitiveTopology>(parameters[0] & 0xFFFFU);
        // Only take the fast indirect path when the parameters still live in
        // GPU memory unmodified and the topology needs no host-side massaging.
        if (!maxwell3d.AnyParametersDirty() || !IsTopologySafe(topology)) {
            Fallback(parameters);
            return;
        }

        auto& params = maxwell3d.draw_manager->GetIndirectParams();
        params.is_byte_count = true;
        params.is_indexed = false;
        params.include_count = false;
        params.count_start_address = 0;
        // The byte count is read from macro parameter slot 2 in GPU memory.
        params.indirect_start_address = maxwell3d.GetMacroAddress(2);
        params.buffer_size = 4;
        params.max_draw_counts = 1;
        params.stride = parameters[1];
        maxwell3d.regs.draw.begin = parameters[0];
        maxwell3d.regs.draw_auto_stride = parameters[1];
        maxwell3d.regs.draw_auto_byte_count = parameters[2];

        maxwell3d.draw_manager->DrawArrayIndirect(topology);
    }

private:
    // Slow path: refresh macro parameters from guest memory and issue a direct
    // draw with the vertex count computed on the CPU as byte_count / stride.
    void Fallback(const std::vector<u32>& parameters) {
        maxwell3d.RefreshParameters();

        maxwell3d.regs.draw.begin = parameters[0];
        maxwell3d.regs.draw_auto_stride = parameters[1];
        maxwell3d.regs.draw_auto_byte_count = parameters[2];

        // NOTE(review): divides by draw_auto_stride — assumes the guest never
        // submits a zero stride through this macro; confirm.
        maxwell3d.draw_manager->DrawArray(
            maxwell3d.regs.draw.topology, 0,
            maxwell3d.regs.draw_auto_byte_count / maxwell3d.regs.draw_auto_stride, 0, 1);
    }
};
| 365 | |||
| 322 | class HLE_C713C83D8F63CCF3 final : public HLEMacroImpl { | 366 | class HLE_C713C83D8F63CCF3 final : public HLEMacroImpl { |
| 323 | public: | 367 | public: |
| 324 | explicit HLE_C713C83D8F63CCF3(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {} | 368 | explicit HLE_C713C83D8F63CCF3(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {} |
| @@ -536,6 +580,11 @@ HLEMacro::HLEMacro(Maxwell3D& maxwell3d_) : maxwell3d{maxwell3d_} { | |||
| 536 | [](Maxwell3D& maxwell3d__) -> std::unique_ptr<CachedMacro> { | 580 | [](Maxwell3D& maxwell3d__) -> std::unique_ptr<CachedMacro> { |
| 537 | return std::make_unique<HLE_TransformFeedbackSetup>(maxwell3d__); | 581 | return std::make_unique<HLE_TransformFeedbackSetup>(maxwell3d__); |
| 538 | })); | 582 | })); |
| 583 | builders.emplace(0xB5F74EDB717278ECULL, | ||
| 584 | std::function<std::unique_ptr<CachedMacro>(Maxwell3D&)>( | ||
| 585 | [](Maxwell3D& maxwell3d__) -> std::unique_ptr<CachedMacro> { | ||
| 586 | return std::make_unique<HLE_DrawIndirectByteCount>(maxwell3d__); | ||
| 587 | })); | ||
| 539 | } | 588 | } |
| 540 | 589 | ||
| 541 | HLEMacro::~HLEMacro() = default; | 590 | HLEMacro::~HLEMacro() = default; |
diff --git a/src/video_core/query_cache.h b/src/video_core/query_cache.h index 7047e2e63..9fcaeeac7 100644 --- a/src/video_core/query_cache.h +++ b/src/video_core/query_cache.h | |||
| @@ -25,6 +25,13 @@ | |||
| 25 | #include "video_core/rasterizer_interface.h" | 25 | #include "video_core/rasterizer_interface.h" |
| 26 | #include "video_core/texture_cache/slot_vector.h" | 26 | #include "video_core/texture_cache/slot_vector.h" |
| 27 | 27 | ||
| 28 | namespace VideoCore { | ||
| 29 | enum class QueryType { | ||
| 30 | SamplesPassed, | ||
| 31 | }; | ||
| 32 | constexpr std::size_t NumQueryTypes = 1; | ||
| 33 | } // namespace VideoCore | ||
| 34 | |||
| 28 | namespace VideoCommon { | 35 | namespace VideoCommon { |
| 29 | 36 | ||
| 30 | using AsyncJobId = SlotId; | 37 | using AsyncJobId = SlotId; |
| @@ -98,10 +105,10 @@ private: | |||
| 98 | }; | 105 | }; |
| 99 | 106 | ||
| 100 | template <class QueryCache, class CachedQuery, class CounterStream, class HostCounter> | 107 | template <class QueryCache, class CachedQuery, class CounterStream, class HostCounter> |
| 101 | class QueryCacheBase : public VideoCommon::ChannelSetupCaches<VideoCommon::ChannelInfo> { | 108 | class QueryCacheLegacy : public VideoCommon::ChannelSetupCaches<VideoCommon::ChannelInfo> { |
| 102 | public: | 109 | public: |
| 103 | explicit QueryCacheBase(VideoCore::RasterizerInterface& rasterizer_, | 110 | explicit QueryCacheLegacy(VideoCore::RasterizerInterface& rasterizer_, |
| 104 | Core::Memory::Memory& cpu_memory_) | 111 | Core::Memory::Memory& cpu_memory_) |
| 105 | : rasterizer{rasterizer_}, | 112 | : rasterizer{rasterizer_}, |
| 106 | // Use reinterpret_cast instead of static_cast as workaround for | 113 | // Use reinterpret_cast instead of static_cast as workaround for |
| 107 | // UBSan bug (https://github.com/llvm/llvm-project/issues/59060) | 114 | // UBSan bug (https://github.com/llvm/llvm-project/issues/59060) |
diff --git a/src/video_core/query_cache/bank_base.h b/src/video_core/query_cache/bank_base.h new file mode 100644 index 000000000..420927091 --- /dev/null +++ b/src/video_core/query_cache/bank_base.h | |||
| @@ -0,0 +1,104 @@ | |||
| 1 | // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project | ||
| 2 | // SPDX-License-Identifier: GPL-3.0-or-later | ||
| 3 | |||
| 4 | #pragma once | ||
| 5 | |||
| 6 | #include <atomic> | ||
| 7 | #include <deque> | ||
| 8 | #include <utility> | ||
| 9 | |||
| 10 | #include "common/common_types.h" | ||
| 11 | |||
| 12 | namespace VideoCommon { | ||
| 13 | |||
// Base class for a fixed-capacity bank of query slots. Slots are handed out
// sequentially by Reserve(); a bank can be closed early, and is considered
// dead (recyclable) once it is closed and no references remain.
class BankBase {
protected:
    const size_t base_bank_size{}; // Capacity the bank is (re)created with.
    size_t bank_size{};            // Current capacity; Close() shrinks it to current_slot.
    std::atomic<size_t> references{}; // Outstanding users of slots in this bank.
    size_t current_slot{};         // Next slot index to hand out.

public:
    explicit BankBase(size_t bank_size_) : base_bank_size{bank_size_}, bank_size(bank_size_) {}

    virtual ~BankBase() = default;

    // Hands out the next free slot. Returns {false, bank_size} when the bank
    // is full or has been closed.
    virtual std::pair<bool, size_t> Reserve() {
        if (IsClosed()) {
            return {false, bank_size};
        }
        const size_t result = current_slot++;
        return {true, result};
    }

    // Restores the bank to its freshly-constructed state so it can be reused.
    virtual void Reset() {
        current_slot = 0;
        references = 0;
        bank_size = base_bank_size;
    }

    size_t Size() const {
        return bank_size;
    }

    void AddReference(size_t how_many = 1) {
        references.fetch_add(how_many, std::memory_order_relaxed);
    }

    // Drops references. Releasing more than are held is a logic error.
    // NOTE(review): after UNREACHABLE() fires, the subtraction still happens,
    // which underflows in builds where UNREACHABLE is a no-op — confirm intended.
    void CloseReference(size_t how_many = 1) {
        if (how_many > references.load(std::memory_order_relaxed)) {
            UNREACHABLE();
        }
        references.fetch_sub(how_many, std::memory_order_relaxed);
    }

    // Caps the bank at the slots handed out so far; no further Reserve() succeeds.
    void Close() {
        bank_size = current_slot;
    }

    bool IsClosed() const {
        return current_slot >= bank_size;
    }

    // Dead banks (closed and unreferenced) are eligible for recycling.
    bool IsDead() const {
        return IsClosed() && references == 0;
    }
};
| 67 | |||
// Grow-only pool of banks. Dead banks waiting at the front of the pending
// queue are recycled before a new bank is constructed.
template <typename BankType>
class BankPool {
public:
    BankPool() = default;
    ~BankPool() = default;

    // Reserve a bank from the pool and return its index. `builder` is invoked
    // as builder(banks, new_index) only when a fresh bank has to be built.
    // NOTE(review): a recycled index is not re-queued, so it cannot be
    // recycled a second time — confirm this is intended.
    template <typename Func>
    size_t ReserveBank(Func&& builder) {
        if (!bank_indices.empty()) {
            const size_t candidate = bank_indices.front();
            if (banks[candidate].IsDead()) {
                bank_indices.pop_front();
                banks[candidate].Reset();
                return candidate;
            }
        }
        const size_t fresh_index = banks.size();
        builder(banks, fresh_index);
        bank_indices.push_back(fresh_index);
        return fresh_index;
    }

    // Get a reference to a bank using its index.
    BankType& GetBank(size_t index) {
        return banks[index];
    }

    // Get the total number of banks ever created in the pool.
    size_t BankCount() const {
        return banks.size();
    }

private:
    std::deque<BankType> banks;      // All banks, indexed by id; deque keeps references stable.
    std::deque<size_t> bank_indices; // FIFO of indices eligible for recycling.
};
| 103 | |||
| 104 | } // namespace VideoCommon | ||
diff --git a/src/video_core/query_cache/query_base.h b/src/video_core/query_cache/query_base.h new file mode 100644 index 000000000..1d786b3a7 --- /dev/null +++ b/src/video_core/query_cache/query_base.h | |||
| @@ -0,0 +1,70 @@ | |||
| 1 | // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project | ||
| 2 | // SPDX-License-Identifier: GPL-3.0-or-later | ||
| 3 | |||
| 4 | #pragma once | ||
| 5 | |||
| 6 | #include "common/common_funcs.h" | ||
| 7 | #include "common/common_types.h" | ||
| 8 | |||
| 9 | namespace VideoCommon { | ||
| 10 | |||
// Per-query state flags; combined as a bitmask via DECLARE_ENUM_FLAG_OPERATORS.
enum class QueryFlagBits : u32 {
    HasTimestamp = 1 << 0,       ///< Indicates if this query has a timestamp.
    IsFinalValueSynced = 1 << 1, ///< Indicates the final value of the query is already known (set by GuestQuery).
    IsHostSynced = 1 << 2,       ///< Indicates if the query has been synced in the host.
    IsGuestSynced = 1 << 3,      ///< Indicates if the query has been synced with the guest.
    IsHostManaged = 1 << 4,      ///< Indicates if this query points to a host query.
    IsRewritten = 1 << 5,        ///< Indicates if this query was rewritten by another query.
    IsInvalidated = 1 << 6,      ///< Indicates the value of the query has been nullified.
    IsOrphan = 1 << 7,           ///< Indicates the query has not been set by a guest query.
    IsFence = 1 << 8,            ///< Indicates the query is a fence.
};
DECLARE_ENUM_FLAG_OPERATORS(QueryFlagBits)
| 23 | |||
// Common state shared by every query: the guest address it reports to, its
// flags, and its (possibly still pending) value. Constructors are protected:
// only the concrete GuestQuery/HostQueryBase subclasses may be instantiated.
class QueryBase {
public:
    VAddr guest_address{}; ///< Guest virtual address the query result is written to.
    QueryFlagBits flags{}; ///< Current state of the query (see QueryFlagBits).
    u64 value{};           ///< Query value, valid once synced/final.

protected:
    // Default constructor
    QueryBase() = default;

    // Parameterized constructor
    QueryBase(VAddr address, QueryFlagBits flags_, u64 value_)
        : guest_address(address), flags(flags_), value{value_} {}
};
| 38 | |||
// A query resolved entirely on the CPU: its value is final at construction
// time, so it is created already marked IsFinalValueSynced.
class GuestQuery : public QueryBase {
public:
    // `isLong` selects the 8-byte timestamped layout over the plain 4-byte one.
    GuestQuery(bool isLong, VAddr address, u64 queryValue)
        : QueryBase(address, QueryFlagBits::IsFinalValueSynced, queryValue) {
        if (isLong) {
            flags |= QueryFlagBits::HasTimestamp;
        }
    }
};
| 49 | |||
// A query backed by host GPU counters. Tracks which bank(s) and slot(s) inside
// the streamer's bank pool hold its raw counter samples.
class HostQueryBase : public QueryBase {
public:
    // Default constructor: an orphan host query not yet bound to a guest write.
    HostQueryBase() : QueryBase(0, QueryFlagBits::IsHostManaged | QueryFlagBits::IsOrphan, 0) {}

    // Parameterized constructor.
    // NOTE(review): the init-list zeroes members that already have default
    // member initializers below — redundant but harmless.
    HostQueryBase(bool has_timestamp, VAddr address)
        : QueryBase(address, QueryFlagBits::IsHostManaged, 0), start_bank_id{}, size_banks{},
          start_slot{}, size_slots{} {
        if (has_timestamp) {
            flags |= QueryFlagBits::HasTimestamp;
        }
    }

    u32 start_bank_id{}; ///< First bank holding this query's samples.
    u32 size_banks{};    ///< Number of consecutive banks used.
    size_t start_slot{}; ///< First slot within the starting bank.
    size_t size_slots{}; ///< Number of consecutive slots used.
};
| 69 | |||
| 70 | } // namespace VideoCommon \ No newline at end of file | ||
diff --git a/src/video_core/query_cache/query_cache.h b/src/video_core/query_cache/query_cache.h new file mode 100644 index 000000000..78b42b518 --- /dev/null +++ b/src/video_core/query_cache/query_cache.h | |||
| @@ -0,0 +1,580 @@ | |||
| 1 | // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project | ||
| 2 | // SPDX-License-Identifier: GPL-3.0-or-later | ||
| 3 | |||
| 4 | #pragma once | ||
| 5 | |||
| 6 | #include <array> | ||
| 7 | #include <deque> | ||
| 8 | #include <memory> | ||
| 9 | #include <mutex> | ||
| 10 | #include <unordered_map> | ||
| 11 | #include <utility> | ||
| 12 | |||
| 13 | #include "common/assert.h" | ||
| 14 | #include "common/common_types.h" | ||
| 15 | #include "common/logging/log.h" | ||
| 16 | #include "common/scope_exit.h" | ||
| 17 | #include "common/settings.h" | ||
| 18 | #include "core/memory.h" | ||
| 19 | #include "video_core/engines/maxwell_3d.h" | ||
| 20 | #include "video_core/gpu.h" | ||
| 21 | #include "video_core/memory_manager.h" | ||
| 22 | #include "video_core/query_cache/bank_base.h" | ||
| 23 | #include "video_core/query_cache/query_base.h" | ||
| 24 | #include "video_core/query_cache/query_cache_base.h" | ||
| 25 | #include "video_core/query_cache/query_stream.h" | ||
| 26 | #include "video_core/query_cache/types.h" | ||
| 27 | |||
| 28 | namespace VideoCommon { | ||
| 29 | |||
| 30 | using Maxwell = Tegra::Engines::Maxwell3D; | ||
| 31 | |||
// One pending guest-memory write: the runtime flushes `value` (truncated to
// `size` bytes) to guest virtual address `address`.
struct SyncValuesStruct {
    VAddr address; ///< Guest virtual address to write to.
    u64 value;     ///< Final query value to store.
    u64 size;      ///< Write size in bytes: 8 with a timestamp, 4 without.

    // Consumed by the runtime's SyncValues<> implementation; presumably selects
    // the staging-buffer upload path — confirm in the backend runtime.
    static constexpr bool GeneratesBaseBuffer = true;
};
| 39 | |||
| 40 | template <typename Traits> | ||
| 41 | class GuestStreamer : public SimpleStreamer<GuestQuery> { | ||
| 42 | public: | ||
| 43 | using RuntimeType = typename Traits::RuntimeType; | ||
| 44 | |||
| 45 | GuestStreamer(size_t id_, RuntimeType& runtime_) | ||
| 46 | : SimpleStreamer<GuestQuery>(id_), runtime{runtime_} {} | ||
| 47 | |||
| 48 | virtual ~GuestStreamer() = default; | ||
| 49 | |||
| 50 | size_t WriteCounter(VAddr address, bool has_timestamp, u32 value, | ||
| 51 | std::optional<u32> subreport = std::nullopt) override { | ||
| 52 | auto new_id = BuildQuery(has_timestamp, address, static_cast<u64>(value)); | ||
| 53 | pending_sync.push_back(new_id); | ||
| 54 | return new_id; | ||
| 55 | } | ||
| 56 | |||
| 57 | bool HasPendingSync() const override { | ||
| 58 | return !pending_sync.empty(); | ||
| 59 | } | ||
| 60 | |||
| 61 | void SyncWrites() override { | ||
| 62 | if (pending_sync.empty()) { | ||
| 63 | return; | ||
| 64 | } | ||
| 65 | std::vector<SyncValuesStruct> sync_values; | ||
| 66 | sync_values.reserve(pending_sync.size()); | ||
| 67 | for (size_t pending_id : pending_sync) { | ||
| 68 | auto& query = slot_queries[pending_id]; | ||
| 69 | if (True(query.flags & QueryFlagBits::IsRewritten) || | ||
| 70 | True(query.flags & QueryFlagBits::IsInvalidated)) { | ||
| 71 | continue; | ||
| 72 | } | ||
| 73 | query.flags |= QueryFlagBits::IsHostSynced; | ||
| 74 | sync_values.emplace_back(SyncValuesStruct{ | ||
| 75 | .address = query.guest_address, | ||
| 76 | .value = query.value, | ||
| 77 | .size = static_cast<u64>(True(query.flags & QueryFlagBits::HasTimestamp) ? 8 : 4)}); | ||
| 78 | } | ||
| 79 | pending_sync.clear(); | ||
| 80 | if (sync_values.size() > 0) { | ||
| 81 | runtime.template SyncValues<SyncValuesStruct>(sync_values); | ||
| 82 | } | ||
| 83 | } | ||
| 84 | |||
| 85 | private: | ||
| 86 | RuntimeType& runtime; | ||
| 87 | std::deque<size_t> pending_sync; | ||
| 88 | }; | ||
| 89 | |||
| 90 | template <typename Traits> | ||
| 91 | class StubStreamer : public GuestStreamer<Traits> { | ||
| 92 | public: | ||
| 93 | using RuntimeType = typename Traits::RuntimeType; | ||
| 94 | |||
| 95 | StubStreamer(size_t id_, RuntimeType& runtime_, u32 stub_value_) | ||
| 96 | : GuestStreamer<Traits>(id_, runtime_), stub_value{stub_value_} {} | ||
| 97 | |||
| 98 | ~StubStreamer() override = default; | ||
| 99 | |||
| 100 | size_t WriteCounter(VAddr address, bool has_timestamp, [[maybe_unused]] u32 value, | ||
| 101 | std::optional<u32> subreport = std::nullopt) override { | ||
| 102 | size_t new_id = | ||
| 103 | GuestStreamer<Traits>::WriteCounter(address, has_timestamp, stub_value, subreport); | ||
| 104 | return new_id; | ||
| 105 | } | ||
| 106 | |||
| 107 | private: | ||
| 108 | u32 stub_value; | ||
| 109 | }; | ||
| 110 | |||
| 111 | template <typename Traits> | ||
| 112 | struct QueryCacheBase<Traits>::QueryCacheBaseImpl { | ||
| 113 | using RuntimeType = typename Traits::RuntimeType; | ||
| 114 | |||
| 115 | QueryCacheBaseImpl(QueryCacheBase<Traits>* owner_, VideoCore::RasterizerInterface& rasterizer_, | ||
| 116 | Core::Memory::Memory& cpu_memory_, RuntimeType& runtime_, Tegra::GPU& gpu_) | ||
| 117 | : owner{owner_}, rasterizer{rasterizer_}, | ||
| 118 | cpu_memory{cpu_memory_}, runtime{runtime_}, gpu{gpu_} { | ||
| 119 | streamer_mask = 0; | ||
| 120 | for (size_t i = 0; i < static_cast<size_t>(QueryType::MaxQueryTypes); i++) { | ||
| 121 | streamers[i] = runtime.GetStreamerInterface(static_cast<QueryType>(i)); | ||
| 122 | if (streamers[i]) { | ||
| 123 | streamer_mask |= 1ULL << streamers[i]->GetId(); | ||
| 124 | } | ||
| 125 | } | ||
| 126 | } | ||
| 127 | |||
| 128 | template <typename Func> | ||
| 129 | void ForEachStreamerIn(u64 mask, Func&& func) { | ||
| 130 | static constexpr bool RETURNS_BOOL = | ||
| 131 | std::is_same_v<std::invoke_result<Func, StreamerInterface*>, bool>; | ||
| 132 | while (mask != 0) { | ||
| 133 | size_t position = std::countr_zero(mask); | ||
| 134 | mask &= ~(1ULL << position); | ||
| 135 | if constexpr (RETURNS_BOOL) { | ||
| 136 | if (func(streamers[position])) { | ||
| 137 | return; | ||
| 138 | } | ||
| 139 | } else { | ||
| 140 | func(streamers[position]); | ||
| 141 | } | ||
| 142 | } | ||
| 143 | } | ||
| 144 | |||
| 145 | template <typename Func> | ||
| 146 | void ForEachStreamer(Func&& func) { | ||
| 147 | ForEachStreamerIn(streamer_mask, func); | ||
| 148 | } | ||
| 149 | |||
| 150 | QueryBase* ObtainQuery(QueryCacheBase<Traits>::QueryLocation location) { | ||
| 151 | size_t which_stream = location.stream_id.Value(); | ||
| 152 | auto* streamer = streamers[which_stream]; | ||
| 153 | if (!streamer) { | ||
| 154 | return nullptr; | ||
| 155 | } | ||
| 156 | return streamer->GetQuery(location.query_id.Value()); | ||
| 157 | } | ||
| 158 | |||
    QueryCacheBase<Traits>* owner;               // Back-pointer to the owning cache (pimpl)
    VideoCore::RasterizerInterface& rasterizer;  // Used for fence/sync operation scheduling
    Core::Memory::Memory& cpu_memory;            // Guest CPU memory for direct result writes
    RuntimeType& runtime;                        // Backend-specific runtime (Traits-provided)
    Tegra::GPU& gpu;                             // Used for GPU tick timestamps
    // One streamer per query type; entries may be null for unimplemented types.
    std::array<StreamerInterface*, static_cast<size_t>(QueryType::MaxQueryTypes)> streamers;
    u64 streamer_mask;                           // Bitmask of valid streamer ids
    std::mutex flush_guard;                      // Protects flushes_pending
    std::deque<u64> flushes_pending;             // Streamer masks of committed async flushes
    // Locations queued for removal from the cache on the next UnregisterPending.
    std::vector<QueryCacheBase<Traits>::QueryLocation> pending_unregister;
};
| 170 | |||
template <typename Traits>
QueryCacheBase<Traits>::QueryCacheBase(Tegra::GPU& gpu_,
                                       VideoCore::RasterizerInterface& rasterizer_,
                                       Core::Memory::Memory& cpu_memory_, RuntimeType& runtime_)
    : cached_queries{} {
    // All state lives in the pimpl so the header stays light.
    impl = std::make_unique<QueryCacheBase<Traits>::QueryCacheBaseImpl>(
        this, rasterizer_, cpu_memory_, runtime_, gpu_);
}
| 179 | |||
// Defaulted out-of-line: QueryCacheBaseImpl is incomplete in the header, so the
// unique_ptr destructor must be instantiated here where the type is complete.
template <typename Traits>
QueryCacheBase<Traits>::~QueryCacheBase() = default;
| 182 | |||
| 183 | template <typename Traits> | ||
| 184 | void QueryCacheBase<Traits>::CounterEnable(QueryType counter_type, bool is_enabled) { | ||
| 185 | size_t index = static_cast<size_t>(counter_type); | ||
| 186 | StreamerInterface* streamer = impl->streamers[index]; | ||
| 187 | if (!streamer) [[unlikely]] { | ||
| 188 | UNREACHABLE(); | ||
| 189 | return; | ||
| 190 | } | ||
| 191 | if (is_enabled) { | ||
| 192 | streamer->StartCounter(); | ||
| 193 | } else { | ||
| 194 | streamer->PauseCounter(); | ||
| 195 | } | ||
| 196 | } | ||
| 197 | |||
| 198 | template <typename Traits> | ||
| 199 | void QueryCacheBase<Traits>::CounterClose(QueryType counter_type) { | ||
| 200 | size_t index = static_cast<size_t>(counter_type); | ||
| 201 | StreamerInterface* streamer = impl->streamers[index]; | ||
| 202 | if (!streamer) [[unlikely]] { | ||
| 203 | UNREACHABLE(); | ||
| 204 | return; | ||
| 205 | } | ||
| 206 | streamer->CloseCounter(); | ||
| 207 | } | ||
| 208 | |||
| 209 | template <typename Traits> | ||
| 210 | void QueryCacheBase<Traits>::CounterReset(QueryType counter_type) { | ||
| 211 | size_t index = static_cast<size_t>(counter_type); | ||
| 212 | StreamerInterface* streamer = impl->streamers[index]; | ||
| 213 | if (!streamer) [[unlikely]] { | ||
| 214 | UNIMPLEMENTED(); | ||
| 215 | return; | ||
| 216 | } | ||
| 217 | streamer->ResetCounter(); | ||
| 218 | } | ||
| 219 | |||
template <typename Traits>
void QueryCacheBase<Traits>::BindToChannel(s32 id) {
    // Switch channel state, then point the runtime at this channel's 3D engine.
    VideoCommon::ChannelSetupCaches<VideoCommon::ChannelInfo>::BindToChannel(id);
    impl->runtime.Bind3DEngine(maxwell3d);
}
| 225 | |||
// Records a semaphore/report request: allocates a query slot in the matching
// streamer, schedules the deferred write of its result to guest memory, and
// registers the location in the page-indexed cache.
template <typename Traits>
void QueryCacheBase<Traits>::CounterReport(GPUVAddr addr, QueryType counter_type,
                                           QueryPropertiesFlags flags, u32 payload, u32 subreport) {
    const bool has_timestamp = True(flags & QueryPropertiesFlags::HasTimeout);
    const bool is_fence = True(flags & QueryPropertiesFlags::IsAFence);
    size_t streamer_id = static_cast<size_t>(counter_type);
    auto* streamer = impl->streamers[streamer_id];
    if (streamer == nullptr) [[unlikely]] {
        // Unbacked counter types degrade to a payload write of 1.
        counter_type = QueryType::Payload;
        payload = 1U;
        streamer_id = static_cast<size_t>(counter_type);
        streamer = impl->streamers[streamer_id];
    }
    auto cpu_addr_opt = gpu_memory->GpuToCpuAddress(addr);
    if (!cpu_addr_opt) [[unlikely]] {
        // Unmapped target address: drop the report.
        return;
    }
    VAddr cpu_addr = *cpu_addr_opt;
    const size_t new_query_id = streamer->WriteCounter(cpu_addr, has_timestamp, payload, subreport);
    auto* query = streamer->GetQuery(new_query_id);
    if (is_fence) {
        query->flags |= QueryFlagBits::IsFence;
    }
    QueryLocation query_location{};
    query_location.stream_id.Assign(static_cast<u32>(streamer_id));
    query_location.query_id.Assign(static_cast<u32>(new_query_id));
    // Splits an address into (page, offset-within-page) for the cache map.
    const auto gen_caching_indexing = [](VAddr cur_addr) {
        return std::make_pair<u64, u32>(cur_addr >> Core::Memory::YUZU_PAGEBITS,
                                        static_cast<u32>(cur_addr & Core::Memory::YUZU_PAGEMASK));
    };
    u8* pointer = impl->cpu_memory.GetPointer(cpu_addr);
    u8* pointer_timestamp = impl->cpu_memory.GetPointer(cpu_addr + 8);
    // Below High accuracy, fences resolve synchronously and skip caching.
    bool is_synced = !Settings::IsGPULevelHigh() && is_fence;

    // Deferred write of the final query value (and timestamp) to guest memory;
    // runs once the host result is available.
    std::function<void()> operation([this, is_synced, streamer, query_base = query, query_location,
                                     pointer, pointer_timestamp] {
        if (True(query_base->flags & QueryFlagBits::IsInvalidated)) {
            // Region was invalidated meanwhile; just schedule slot reclamation.
            if (!is_synced) [[likely]] {
                impl->pending_unregister.push_back(query_location);
            }
            return;
        }
        if (False(query_base->flags & QueryFlagBits::IsFinalValueSynced)) [[unlikely]] {
            // The operation must only run after the final value is synced.
            UNREACHABLE();
            return;
        }
        // Fold in the streamer's pending amend and feed the accumulator back.
        query_base->value += streamer->GetAmmendValue();
        streamer->SetAccumulationValue(query_base->value);
        if (True(query_base->flags & QueryFlagBits::HasTimestamp)) {
            // 128-bit report: 64-bit value followed by 64-bit GPU tick.
            u64 timestamp = impl->gpu.GetTicks();
            std::memcpy(pointer_timestamp, &timestamp, sizeof(timestamp));
            std::memcpy(pointer, &query_base->value, sizeof(query_base->value));
        } else {
            // 32-bit report: truncated value only.
            u32 value = static_cast<u32>(query_base->value);
            std::memcpy(pointer, &value, sizeof(value));
        }
        if (!is_synced) [[likely]] {
            impl->pending_unregister.push_back(query_location);
        }
    });
    if (is_fence) {
        impl->rasterizer.SignalFence(std::move(operation));
    } else {
        if (!Settings::IsGPULevelHigh() && counter_type == QueryType::Payload) {
            // Fast path: plain payload writes can complete immediately.
            if (has_timestamp) {
                u64 timestamp = impl->gpu.GetTicks();
                u64 value = static_cast<u64>(payload);
                std::memcpy(pointer_timestamp, &timestamp, sizeof(timestamp));
                std::memcpy(pointer, &value, sizeof(value));
            } else {
                std::memcpy(pointer, &payload, sizeof(payload));
            }
            streamer->Free(new_query_id);
            return;
        }
        impl->rasterizer.SyncOperation(std::move(operation));
    }
    if (is_synced) {
        // Synchronous fences are done at this point; no caching needed.
        streamer->Free(new_query_id);
        return;
    }
    auto [cont_addr, base] = gen_caching_indexing(cpu_addr);
    {
        std::scoped_lock lock(cache_mutex);
        auto it1 = cached_queries.try_emplace(cont_addr);
        auto& sub_container = it1.first->second;
        auto it_current = sub_container.find(base);
        if (it_current == sub_container.end()) {
            sub_container.insert_or_assign(base, query_location);
            return;
        }
        // A query already targets this address: mark it rewritten and replace it.
        auto* old_query = impl->ObtainQuery(it_current->second);
        old_query->flags |= QueryFlagBits::IsRewritten;
        sub_container.insert_or_assign(base, query_location);
    }
}
| 322 | |||
// Drains pending_unregister: removes each location from the page-indexed cache
// (only if it is still the current occupant of its address) and frees the slot.
template <typename Traits>
void QueryCacheBase<Traits>::UnregisterPending() {
    const auto gen_caching_indexing = [](VAddr cur_addr) {
        return std::make_pair<u64, u32>(cur_addr >> Core::Memory::YUZU_PAGEBITS,
                                        static_cast<u32>(cur_addr & Core::Memory::YUZU_PAGEMASK));
    };
    std::scoped_lock lock(cache_mutex);
    for (QueryLocation loc : impl->pending_unregister) {
        const auto [streamer_id, query_id] = loc.unpack();
        auto* streamer = impl->streamers[streamer_id];
        if (!streamer) [[unlikely]] {
            continue;
        }
        auto* query = streamer->GetQuery(query_id);
        auto [cont_addr, base] = gen_caching_indexing(query->guest_address);
        auto it1 = cached_queries.find(cont_addr);
        if (it1 != cached_queries.end()) {
            auto it2 = it1->second.find(base);
            if (it2 != it1->second.end()) {
                // Only erase if the entry still refers to this exact query;
                // a newer query may have overwritten the slot meanwhile.
                if (it2->second.raw == loc.raw) {
                    it1->second.erase(it2);
                }
            }
        }
        streamer->Free(query_id);
    }
    impl->pending_unregister.clear();
}
| 351 | |||
| 352 | template <typename Traits> | ||
| 353 | void QueryCacheBase<Traits>::NotifyWFI() { | ||
| 354 | bool should_sync = false; | ||
| 355 | impl->ForEachStreamer( | ||
| 356 | [&should_sync](StreamerInterface* streamer) { should_sync |= streamer->HasPendingSync(); }); | ||
| 357 | if (!should_sync) { | ||
| 358 | return; | ||
| 359 | } | ||
| 360 | |||
| 361 | impl->ForEachStreamer([](StreamerInterface* streamer) { streamer->PresyncWrites(); }); | ||
| 362 | impl->runtime.Barriers(true); | ||
| 363 | impl->ForEachStreamer([](StreamerInterface* streamer) { streamer->SyncWrites(); }); | ||
| 364 | impl->runtime.Barriers(false); | ||
| 365 | } | ||
| 366 | |||
| 367 | template <typename Traits> | ||
| 368 | void QueryCacheBase<Traits>::NotifySegment(bool resume) { | ||
| 369 | if (resume) { | ||
| 370 | impl->runtime.ResumeHostConditionalRendering(); | ||
| 371 | } else { | ||
| 372 | CounterClose(VideoCommon::QueryType::ZPassPixelCount64); | ||
| 373 | CounterClose(VideoCommon::QueryType::StreamingByteCount); | ||
| 374 | impl->runtime.PauseHostConditionalRendering(); | ||
| 375 | } | ||
| 376 | } | ||
| 377 | |||
| 378 | template <typename Traits> | ||
| 379 | bool QueryCacheBase<Traits>::AccelerateHostConditionalRendering() { | ||
| 380 | bool qc_dirty = false; | ||
| 381 | const auto gen_lookup = [this, &qc_dirty](GPUVAddr address) -> VideoCommon::LookupData { | ||
| 382 | auto cpu_addr_opt = gpu_memory->GpuToCpuAddress(address); | ||
| 383 | if (!cpu_addr_opt) [[unlikely]] { | ||
| 384 | return VideoCommon::LookupData{ | ||
| 385 | .address = 0, | ||
| 386 | .found_query = nullptr, | ||
| 387 | }; | ||
| 388 | } | ||
| 389 | VAddr cpu_addr = *cpu_addr_opt; | ||
| 390 | std::scoped_lock lock(cache_mutex); | ||
| 391 | auto it1 = cached_queries.find(cpu_addr >> Core::Memory::YUZU_PAGEBITS); | ||
| 392 | if (it1 == cached_queries.end()) { | ||
| 393 | return VideoCommon::LookupData{ | ||
| 394 | .address = cpu_addr, | ||
| 395 | .found_query = nullptr, | ||
| 396 | }; | ||
| 397 | } | ||
| 398 | auto& sub_container = it1->second; | ||
| 399 | auto it_current = sub_container.find(cpu_addr & Core::Memory::YUZU_PAGEMASK); | ||
| 400 | |||
| 401 | if (it_current == sub_container.end()) { | ||
| 402 | auto it_current_2 = sub_container.find((cpu_addr & Core::Memory::YUZU_PAGEMASK) + 4); | ||
| 403 | if (it_current_2 == sub_container.end()) { | ||
| 404 | return VideoCommon::LookupData{ | ||
| 405 | .address = cpu_addr, | ||
| 406 | .found_query = nullptr, | ||
| 407 | }; | ||
| 408 | } | ||
| 409 | } | ||
| 410 | auto* query = impl->ObtainQuery(it_current->second); | ||
| 411 | qc_dirty |= True(query->flags & QueryFlagBits::IsHostManaged) && | ||
| 412 | False(query->flags & QueryFlagBits::IsGuestSynced); | ||
| 413 | return VideoCommon::LookupData{ | ||
| 414 | .address = cpu_addr, | ||
| 415 | .found_query = query, | ||
| 416 | }; | ||
| 417 | }; | ||
| 418 | |||
| 419 | auto& regs = maxwell3d->regs; | ||
| 420 | if (regs.render_enable_override != Maxwell::Regs::RenderEnable::Override::UseRenderEnable) { | ||
| 421 | impl->runtime.EndHostConditionalRendering(); | ||
| 422 | return false; | ||
| 423 | } | ||
| 424 | const ComparisonMode mode = static_cast<ComparisonMode>(regs.render_enable.mode); | ||
| 425 | const GPUVAddr address = regs.render_enable.Address(); | ||
| 426 | switch (mode) { | ||
| 427 | case ComparisonMode::True: | ||
| 428 | impl->runtime.EndHostConditionalRendering(); | ||
| 429 | return false; | ||
| 430 | case ComparisonMode::False: | ||
| 431 | impl->runtime.EndHostConditionalRendering(); | ||
| 432 | return false; | ||
| 433 | case ComparisonMode::Conditional: { | ||
| 434 | VideoCommon::LookupData object_1{gen_lookup(address)}; | ||
| 435 | return impl->runtime.HostConditionalRenderingCompareValue(object_1, qc_dirty); | ||
| 436 | } | ||
| 437 | case ComparisonMode::IfEqual: { | ||
| 438 | VideoCommon::LookupData object_1{gen_lookup(address)}; | ||
| 439 | VideoCommon::LookupData object_2{gen_lookup(address + 16)}; | ||
| 440 | return impl->runtime.HostConditionalRenderingCompareValues(object_1, object_2, qc_dirty, | ||
| 441 | true); | ||
| 442 | } | ||
| 443 | case ComparisonMode::IfNotEqual: { | ||
| 444 | VideoCommon::LookupData object_1{gen_lookup(address)}; | ||
| 445 | VideoCommon::LookupData object_2{gen_lookup(address + 16)}; | ||
| 446 | return impl->runtime.HostConditionalRenderingCompareValues(object_1, object_2, qc_dirty, | ||
| 447 | false); | ||
| 448 | } | ||
| 449 | default: | ||
| 450 | return false; | ||
| 451 | } | ||
| 452 | } | ||
| 453 | |||
| 454 | // Async downloads | ||
// Commits the current set of unsynced queries as one async-flush batch: records
// which streamers participate, then pushes each streamer's unsynced queries in
// dependency order.
template <typename Traits>
void QueryCacheBase<Traits>::CommitAsyncFlushes() {
    // Make sure to have the results synced in Host.
    NotifyWFI();

    u64 mask{};
    {
        std::scoped_lock lk(impl->flush_guard);
        // Collect a bitmask of every streamer with unsynced queries.
        impl->ForEachStreamer([&mask](StreamerInterface* streamer) {
            bool local_result = streamer->HasUnsyncedQueries();
            if (local_result) {
                mask |= 1ULL << streamer->GetId();
            }
        });
        impl->flushes_pending.push_back(mask);
    }
    // Schedule slot reclamation after the flush is processed.
    std::function<void()> func([this] { UnregisterPending(); });
    impl->rasterizer.SyncOperation(std::move(func));
    if (mask == 0) {
        return;
    }
    // Iterate until fixpoint: a streamer is pushed only after everything in its
    // dependent mask has already been pushed (topological order).
    u64 ran_mask = ~mask;
    while (mask) {
        impl->ForEachStreamerIn(mask, [&mask, &ran_mask](StreamerInterface* streamer) {
            u64 dep_mask = streamer->GetDependentMask();
            if ((dep_mask & ~ran_mask) != 0) {
                // A dependency has not run yet; retry on a later pass.
                return;
            }
            u64 index = streamer->GetId();
            ran_mask |= (1ULL << index);
            mask &= ~(1ULL << index);
            streamer->PushUnsyncedQueries();
        });
    }
}
| 490 | |||
| 491 | template <typename Traits> | ||
| 492 | bool QueryCacheBase<Traits>::HasUncommittedFlushes() const { | ||
| 493 | bool result = false; | ||
| 494 | impl->ForEachStreamer([&result](StreamerInterface* streamer) { | ||
| 495 | result |= streamer->HasUnsyncedQueries(); | ||
| 496 | return result; | ||
| 497 | }); | ||
| 498 | return result; | ||
| 499 | } | ||
| 500 | |||
// True when the oldest committed flush batch actually contains work (mask != 0).
template <typename Traits>
bool QueryCacheBase<Traits>::ShouldWaitAsyncFlushes() {
    std::scoped_lock lk(impl->flush_guard);
    return !impl->flushes_pending.empty() && impl->flushes_pending.front() != 0ULL;
}
| 506 | |||
// Retires the oldest committed flush batch, popping each participating
// streamer's unsynced queries in dependency order (mirrors CommitAsyncFlushes).
template <typename Traits>
void QueryCacheBase<Traits>::PopAsyncFlushes() {
    u64 mask;
    {
        std::scoped_lock lk(impl->flush_guard);
        mask = impl->flushes_pending.front();
        impl->flushes_pending.pop_front();
    }
    if (mask == 0) {
        return;
    }
    // Same fixpoint scheme as CommitAsyncFlushes, but gated on the dependence
    // mask (what this streamer depends on) rather than the dependent mask.
    u64 ran_mask = ~mask;
    while (mask) {
        impl->ForEachStreamerIn(mask, [&mask, &ran_mask](StreamerInterface* streamer) {
            u64 dep_mask = streamer->GetDependenceMask();
            if ((dep_mask & ~ran_mask) != 0) {
                return;
            }
            u64 index = streamer->GetId();
            ran_mask |= (1ULL << index);
            mask &= ~(1ULL << index);
            streamer->PopUnsyncedQueries();
        });
    }
}
| 532 | |||
| 533 | // Invalidation | ||
| 534 | |||
| 535 | template <typename Traits> | ||
| 536 | void QueryCacheBase<Traits>::InvalidateQuery(QueryCacheBase<Traits>::QueryLocation location) { | ||
| 537 | auto* query_base = impl->ObtainQuery(location); | ||
| 538 | if (!query_base) { | ||
| 539 | return; | ||
| 540 | } | ||
| 541 | query_base->flags |= QueryFlagBits::IsInvalidated; | ||
| 542 | } | ||
| 543 | |||
| 544 | template <typename Traits> | ||
| 545 | bool QueryCacheBase<Traits>::IsQueryDirty(QueryCacheBase<Traits>::QueryLocation location) { | ||
| 546 | auto* query_base = impl->ObtainQuery(location); | ||
| 547 | if (!query_base) { | ||
| 548 | return false; | ||
| 549 | } | ||
| 550 | return True(query_base->flags & QueryFlagBits::IsHostManaged) && | ||
| 551 | False(query_base->flags & QueryFlagBits::IsGuestSynced); | ||
| 552 | } | ||
| 553 | |||
// If the query's final value is already known but not yet visible to the guest,
// writes it out directly and reports "not dirty"; otherwise reports whether a
// full guest/host sync is still required.
template <typename Traits>
bool QueryCacheBase<Traits>::SemiFlushQueryDirty(QueryCacheBase<Traits>::QueryLocation location) {
    auto* query_base = impl->ObtainQuery(location);
    if (!query_base) {
        return false;
    }
    if (True(query_base->flags & QueryFlagBits::IsFinalValueSynced) &&
        False(query_base->flags & QueryFlagBits::IsGuestSynced)) {
        // Final value is resolved: write it to guest memory here and now.
        auto* ptr = impl->cpu_memory.GetPointer(query_base->guest_address);
        if (True(query_base->flags & QueryFlagBits::HasTimestamp)) {
            // 64-bit value for timestamped queries.
            std::memcpy(ptr, &query_base->value, sizeof(query_base->value));
            return false;
        }
        // 32-bit truncated value otherwise.
        u32 value_l = static_cast<u32>(query_base->value);
        std::memcpy(ptr, &value_l, sizeof(value_l));
        return false;
    }
    return True(query_base->flags & QueryFlagBits::IsHostManaged) &&
           False(query_base->flags & QueryFlagBits::IsGuestSynced);
}
| 574 | |||
// Forces outstanding fences to resolve so guest-visible query results catch up.
template <typename Traits>
void QueryCacheBase<Traits>::RequestGuestHostSync() {
    impl->rasterizer.ReleaseFences();
}
| 579 | |||
| 580 | } // namespace VideoCommon | ||
diff --git a/src/video_core/query_cache/query_cache_base.h b/src/video_core/query_cache/query_cache_base.h new file mode 100644 index 000000000..07be421c6 --- /dev/null +++ b/src/video_core/query_cache/query_cache_base.h | |||
| @@ -0,0 +1,181 @@ | |||
| 1 | // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project | ||
| 2 | // SPDX-License-Identifier: GPL-3.0-or-later | ||
| 3 | |||
| 4 | #pragma once | ||
| 5 | |||
#include <functional>
#include <memory>
#include <mutex>
#include <optional>
#include <span>
#include <type_traits>
#include <unordered_map>
#include <utility>
| 12 | |||
| 13 | #include "common/assert.h" | ||
| 14 | #include "common/bit_field.h" | ||
| 15 | #include "common/common_types.h" | ||
| 16 | #include "core/memory.h" | ||
| 17 | #include "video_core/control/channel_state_cache.h" | ||
| 18 | #include "video_core/query_cache/query_base.h" | ||
| 19 | #include "video_core/query_cache/types.h" | ||
| 20 | |||
| 21 | namespace Core::Memory { | ||
| 22 | class Memory; | ||
| 23 | } | ||
| 24 | |||
| 25 | namespace VideoCore { | ||
| 26 | class RasterizerInterface; | ||
| 27 | } | ||
| 28 | |||
| 29 | namespace Tegra { | ||
| 30 | class GPU; | ||
| 31 | } | ||
| 32 | |||
| 33 | namespace VideoCommon { | ||
| 34 | |||
| 35 | struct LookupData { | ||
| 36 | VAddr address; | ||
| 37 | QueryBase* found_query; | ||
| 38 | }; | ||
| 39 | |||
| 40 | template <typename Traits> | ||
| 41 | class QueryCacheBase : public VideoCommon::ChannelSetupCaches<VideoCommon::ChannelInfo> { | ||
| 42 | using RuntimeType = typename Traits::RuntimeType; | ||
| 43 | |||
| 44 | public: | ||
| 45 | union QueryLocation { | ||
| 46 | BitField<27, 5, u32> stream_id; | ||
| 47 | BitField<0, 27, u32> query_id; | ||
| 48 | u32 raw; | ||
| 49 | |||
| 50 | std::pair<size_t, size_t> unpack() const { | ||
| 51 | return {static_cast<size_t>(stream_id.Value()), static_cast<size_t>(query_id.Value())}; | ||
| 52 | } | ||
| 53 | }; | ||
| 54 | |||
| 55 | explicit QueryCacheBase(Tegra::GPU& gpu, VideoCore::RasterizerInterface& rasterizer_, | ||
| 56 | Core::Memory::Memory& cpu_memory_, RuntimeType& runtime_); | ||
| 57 | |||
| 58 | ~QueryCacheBase(); | ||
| 59 | |||
| 60 | void InvalidateRegion(VAddr addr, std::size_t size) { | ||
| 61 | IterateCache<true>(addr, size, | ||
| 62 | [this](QueryLocation location) { InvalidateQuery(location); }); | ||
| 63 | } | ||
| 64 | |||
| 65 | void FlushRegion(VAddr addr, std::size_t size) { | ||
| 66 | bool result = false; | ||
| 67 | IterateCache<false>(addr, size, [this, &result](QueryLocation location) { | ||
| 68 | result |= SemiFlushQueryDirty(location); | ||
| 69 | return result; | ||
| 70 | }); | ||
| 71 | if (result) { | ||
| 72 | RequestGuestHostSync(); | ||
| 73 | } | ||
| 74 | } | ||
| 75 | |||
| 76 | static u64 BuildMask(std::span<const QueryType> types) { | ||
| 77 | u64 mask = 0; | ||
| 78 | for (auto query_type : types) { | ||
| 79 | mask |= 1ULL << (static_cast<u64>(query_type)); | ||
| 80 | } | ||
| 81 | return mask; | ||
| 82 | } | ||
| 83 | |||
| 84 | /// Return true when a CPU region is modified from the GPU | ||
| 85 | [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size) { | ||
| 86 | bool result = false; | ||
| 87 | IterateCache<false>(addr, size, [this, &result](QueryLocation location) { | ||
| 88 | result |= IsQueryDirty(location); | ||
| 89 | return result; | ||
| 90 | }); | ||
| 91 | return result; | ||
| 92 | } | ||
| 93 | |||
| 94 | void CounterEnable(QueryType counter_type, bool is_enabled); | ||
| 95 | |||
| 96 | void CounterReset(QueryType counter_type); | ||
| 97 | |||
| 98 | void CounterClose(QueryType counter_type); | ||
| 99 | |||
| 100 | void CounterReport(GPUVAddr addr, QueryType counter_type, QueryPropertiesFlags flags, | ||
| 101 | u32 payload, u32 subreport); | ||
| 102 | |||
| 103 | void NotifyWFI(); | ||
| 104 | |||
| 105 | bool AccelerateHostConditionalRendering(); | ||
| 106 | |||
| 107 | // Async downloads | ||
| 108 | void CommitAsyncFlushes(); | ||
| 109 | |||
| 110 | bool HasUncommittedFlushes() const; | ||
| 111 | |||
| 112 | bool ShouldWaitAsyncFlushes(); | ||
| 113 | |||
| 114 | void PopAsyncFlushes(); | ||
| 115 | |||
| 116 | void NotifySegment(bool resume); | ||
| 117 | |||
| 118 | void BindToChannel(s32 id) override; | ||
| 119 | |||
| 120 | protected: | ||
| 121 | template <bool remove_from_cache, typename Func> | ||
| 122 | void IterateCache(VAddr addr, std::size_t size, Func&& func) { | ||
| 123 | static constexpr bool RETURNS_BOOL = | ||
| 124 | std::is_same_v<std::invoke_result<Func, QueryLocation>, bool>; | ||
| 125 | const u64 addr_begin = addr; | ||
| 126 | const u64 addr_end = addr_begin + size; | ||
| 127 | |||
| 128 | const u64 page_end = addr_end >> Core::Memory::YUZU_PAGEBITS; | ||
| 129 | std::scoped_lock lock(cache_mutex); | ||
| 130 | for (u64 page = addr_begin >> Core::Memory::YUZU_PAGEBITS; page <= page_end; ++page) { | ||
| 131 | const u64 page_start = page << Core::Memory::YUZU_PAGEBITS; | ||
| 132 | const auto in_range = [page_start, addr_begin, addr_end](const u32 query_location) { | ||
| 133 | const u64 cache_begin = page_start + query_location; | ||
| 134 | const u64 cache_end = cache_begin + sizeof(u32); | ||
| 135 | return cache_begin < addr_end && addr_begin < cache_end; | ||
| 136 | }; | ||
| 137 | const auto& it = cached_queries.find(page); | ||
| 138 | if (it == std::end(cached_queries)) { | ||
| 139 | continue; | ||
| 140 | } | ||
| 141 | auto& contents = it->second; | ||
| 142 | for (auto& query : contents) { | ||
| 143 | if (!in_range(query.first)) { | ||
| 144 | continue; | ||
| 145 | } | ||
| 146 | if constexpr (RETURNS_BOOL) { | ||
| 147 | if (func(query.second)) { | ||
| 148 | return; | ||
| 149 | } | ||
| 150 | } else { | ||
| 151 | func(query.second); | ||
| 152 | } | ||
| 153 | } | ||
| 154 | if constexpr (remove_from_cache) { | ||
| 155 | const auto in_range2 = [&](const std::pair<u32, QueryLocation>& pair) { | ||
| 156 | return in_range(pair.first); | ||
| 157 | }; | ||
| 158 | std::erase_if(contents, in_range2); | ||
| 159 | } | ||
| 160 | } | ||
| 161 | } | ||
| 162 | |||
| 163 | using ContentCache = std::unordered_map<u64, std::unordered_map<u32, QueryLocation>>; | ||
| 164 | |||
| 165 | void InvalidateQuery(QueryLocation location); | ||
| 166 | bool IsQueryDirty(QueryLocation location); | ||
| 167 | bool SemiFlushQueryDirty(QueryLocation location); | ||
| 168 | void RequestGuestHostSync(); | ||
| 169 | void UnregisterPending(); | ||
| 170 | |||
| 171 | std::unordered_map<u64, std::unordered_map<u32, QueryLocation>> cached_queries; | ||
| 172 | std::mutex cache_mutex; | ||
| 173 | |||
| 174 | struct QueryCacheBaseImpl; | ||
| 175 | friend struct QueryCacheBaseImpl; | ||
| 176 | friend RuntimeType; | ||
| 177 | |||
| 178 | std::unique_ptr<QueryCacheBaseImpl> impl; | ||
| 179 | }; | ||
| 180 | |||
| 181 | } // namespace VideoCommon \ No newline at end of file | ||
diff --git a/src/video_core/query_cache/query_stream.h b/src/video_core/query_cache/query_stream.h new file mode 100644 index 000000000..39da6ac07 --- /dev/null +++ b/src/video_core/query_cache/query_stream.h | |||
| @@ -0,0 +1,149 @@ | |||
| 1 | // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project | ||
| 2 | // SPDX-License-Identifier: GPL-3.0-or-later | ||
| 3 | |||
| 4 | #pragma once | ||
| 5 | |||
| 6 | #include <deque> | ||
| 7 | #include <optional> | ||
| 8 | #include <vector> | ||
| 9 | |||
| 10 | #include "common/assert.h" | ||
| 11 | #include "common/common_types.h" | ||
| 12 | #include "video_core/query_cache/bank_base.h" | ||
| 13 | #include "video_core/query_cache/query_base.h" | ||
| 14 | |||
| 15 | namespace VideoCommon { | ||
| 16 | |||
| 17 | class StreamerInterface { | ||
| 18 | public: | ||
| 19 | explicit StreamerInterface(size_t id_) : id{id_}, dependence_mask{}, dependent_mask{} {} | ||
| 20 | virtual ~StreamerInterface() = default; | ||
| 21 | |||
| 22 | virtual QueryBase* GetQuery(size_t id) = 0; | ||
| 23 | |||
| 24 | virtual void StartCounter() { | ||
| 25 | /* Do Nothing */ | ||
| 26 | } | ||
| 27 | |||
| 28 | virtual void PauseCounter() { | ||
| 29 | /* Do Nothing */ | ||
| 30 | } | ||
| 31 | |||
| 32 | virtual void ResetCounter() { | ||
| 33 | /* Do Nothing */ | ||
| 34 | } | ||
| 35 | |||
| 36 | virtual void CloseCounter() { | ||
| 37 | /* Do Nothing */ | ||
| 38 | } | ||
| 39 | |||
| 40 | virtual bool HasPendingSync() const { | ||
| 41 | return false; | ||
| 42 | } | ||
| 43 | |||
| 44 | virtual void PresyncWrites() { | ||
| 45 | /* Do Nothing */ | ||
| 46 | } | ||
| 47 | |||
| 48 | virtual void SyncWrites() { | ||
| 49 | /* Do Nothing */ | ||
| 50 | } | ||
| 51 | |||
| 52 | virtual size_t WriteCounter(VAddr address, bool has_timestamp, u32 value, | ||
| 53 | std::optional<u32> subreport = std::nullopt) = 0; | ||
| 54 | |||
| 55 | virtual bool HasUnsyncedQueries() const { | ||
| 56 | return false; | ||
| 57 | } | ||
| 58 | |||
| 59 | virtual void PushUnsyncedQueries() { | ||
| 60 | /* Do Nothing */ | ||
| 61 | } | ||
| 62 | |||
| 63 | virtual void PopUnsyncedQueries() { | ||
| 64 | /* Do Nothing */ | ||
| 65 | } | ||
| 66 | |||
| 67 | virtual void Free(size_t query_id) = 0; | ||
| 68 | |||
| 69 | size_t GetId() const { | ||
| 70 | return id; | ||
| 71 | } | ||
| 72 | |||
| 73 | u64 GetDependenceMask() const { | ||
| 74 | return dependence_mask; | ||
| 75 | } | ||
| 76 | |||
| 77 | u64 GetDependentMask() const { | ||
| 78 | return dependence_mask; | ||
| 79 | } | ||
| 80 | |||
| 81 | u64 GetAmmendValue() const { | ||
| 82 | return ammend_value; | ||
| 83 | } | ||
| 84 | |||
| 85 | void SetAccumulationValue(u64 new_value) { | ||
| 86 | acumulation_value = new_value; | ||
| 87 | } | ||
| 88 | |||
| 89 | protected: | ||
| 90 | void MakeDependent(StreamerInterface* depend_on) { | ||
| 91 | dependence_mask |= 1ULL << depend_on->id; | ||
| 92 | depend_on->dependent_mask |= 1ULL << id; | ||
| 93 | } | ||
| 94 | |||
| 95 | const size_t id; | ||
| 96 | u64 dependence_mask; | ||
| 97 | u64 dependent_mask; | ||
| 98 | u64 ammend_value{}; | ||
| 99 | u64 acumulation_value{}; | ||
| 100 | }; | ||
| 101 | |||
| 102 | template <typename QueryType> | ||
| 103 | class SimpleStreamer : public StreamerInterface { | ||
| 104 | public: | ||
| 105 | explicit SimpleStreamer(size_t id_) : StreamerInterface{id_} {} | ||
| 106 | virtual ~SimpleStreamer() = default; | ||
| 107 | |||
| 108 | protected: | ||
| 109 | virtual QueryType* GetQuery(size_t query_id) override { | ||
| 110 | if (query_id < slot_queries.size()) { | ||
| 111 | return &slot_queries[query_id]; | ||
| 112 | } | ||
| 113 | return nullptr; | ||
| 114 | } | ||
| 115 | |||
| 116 | virtual void Free(size_t query_id) override { | ||
| 117 | std::scoped_lock lk(guard); | ||
| 118 | ReleaseQuery(query_id); | ||
| 119 | } | ||
| 120 | |||
| 121 | template <typename... Args, typename = decltype(QueryType(std::declval<Args>()...))> | ||
| 122 | size_t BuildQuery(Args&&... args) { | ||
| 123 | std::scoped_lock lk(guard); | ||
| 124 | if (!old_queries.empty()) { | ||
| 125 | size_t new_id = old_queries.front(); | ||
| 126 | old_queries.pop_front(); | ||
| 127 | new (&slot_queries[new_id]) QueryType(std::forward<Args>(args)...); | ||
| 128 | return new_id; | ||
| 129 | } | ||
| 130 | size_t new_id = slot_queries.size(); | ||
| 131 | slot_queries.emplace_back(std::forward<Args>(args)...); | ||
| 132 | return new_id; | ||
| 133 | } | ||
| 134 | |||
| 135 | void ReleaseQuery(size_t query_id) { | ||
| 136 | |||
| 137 | if (query_id < slot_queries.size()) { | ||
| 138 | old_queries.push_back(query_id); | ||
| 139 | return; | ||
| 140 | } | ||
| 141 | UNREACHABLE(); | ||
| 142 | } | ||
| 143 | |||
| 144 | std::mutex guard; | ||
| 145 | std::deque<QueryType> slot_queries; | ||
| 146 | std::deque<size_t> old_queries; | ||
| 147 | }; | ||
| 148 | |||
| 149 | } // namespace VideoCommon \ No newline at end of file | ||
diff --git a/src/video_core/query_cache/types.h b/src/video_core/query_cache/types.h new file mode 100644 index 000000000..e9226bbfc --- /dev/null +++ b/src/video_core/query_cache/types.h | |||
| @@ -0,0 +1,74 @@ | |||
| 1 | // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project | ||
| 2 | // SPDX-License-Identifier: GPL-3.0-or-later | ||
| 3 | |||
| 4 | #pragma once | ||
| 5 | |||
| 6 | #include "common/common_funcs.h" | ||
| 7 | #include "common/common_types.h" | ||
| 8 | |||
| 9 | namespace VideoCommon { | ||
| 10 | |||
// Properties of a counter report request.
enum class QueryPropertiesFlags : u32 {
    // NOTE(review): consumed as `has_timestamp` by CounterReport — the name
    // "HasTimeout" may be a misnomer; confirm intended semantics.
    HasTimeout = 1 << 0,
    IsAFence = 1 << 1,
};
DECLARE_ENUM_FLAG_OPERATORS(QueryPropertiesFlags)
| 16 | |||
// This should always be equivalent to maxwell3d Report Semaphore Reports
// Gaps in the numbering (8, 20, 22, 23) are intentional: they mirror the
// hardware report encoding, so values must not be renumbered.
enum class QueryType : u32 {
    Payload = 0, // "None" in docs, but confirmed via hardware to return the payload
    VerticesGenerated = 1,
    ZPassPixelCount = 2,
    PrimitivesGenerated = 3,
    AlphaBetaClocks = 4,
    VertexShaderInvocations = 5,
    StreamingPrimitivesNeededMinusSucceeded = 6,
    GeometryShaderInvocations = 7,
    GeometryShaderPrimitivesGenerated = 9,
    ZCullStats0 = 10,
    StreamingPrimitivesSucceeded = 11,
    ZCullStats1 = 12,
    StreamingPrimitivesNeeded = 13,
    ZCullStats2 = 14,
    ClipperInvocations = 15,
    ZCullStats3 = 16,
    ClipperPrimitivesGenerated = 17,
    VtgPrimitivesOut = 18,
    PixelShaderInvocations = 19,
    ZPassPixelCount64 = 21,
    IEEECleanColorTarget = 24,
    IEEECleanZetaTarget = 25,
    StreamingByteCount = 26,
    TessellationInitInvocations = 27,
    BoundingRectangle = 28,
    TessellationShaderInvocations = 29,
    TotalStreamingPrimitivesNeededMinusSucceeded = 30,
    TessellationShaderPrimitivesGenerated = 31,
    // max.
    MaxQueryTypes,
};
| 50 | |||
| 51 | // Comparison modes for Host Conditional Rendering | ||
| 52 | enum class ComparisonMode : u32 { | ||
| 53 | False = 0, | ||
| 54 | True = 1, | ||
| 55 | Conditional = 2, | ||
| 56 | IfEqual = 3, | ||
| 57 | IfNotEqual = 4, | ||
| 58 | MaxComparisonMode, | ||
| 59 | }; | ||
| 60 | |||
| 61 | // Reduction ops. | ||
| 62 | enum class ReductionOp : u32 { | ||
| 63 | RedAdd = 0, | ||
| 64 | RedMin = 1, | ||
| 65 | RedMax = 2, | ||
| 66 | RedInc = 3, | ||
| 67 | RedDec = 4, | ||
| 68 | RedAnd = 5, | ||
| 69 | RedOr = 6, | ||
| 70 | RedXor = 7, | ||
| 71 | MaxReductionOp, | ||
| 72 | }; | ||
| 73 | |||
| 74 | } // namespace VideoCommon \ No newline at end of file | ||
diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h index cb8029a4f..af1469147 100644 --- a/src/video_core/rasterizer_interface.h +++ b/src/video_core/rasterizer_interface.h | |||
| @@ -12,6 +12,7 @@ | |||
| 12 | #include "video_core/cache_types.h" | 12 | #include "video_core/cache_types.h" |
| 13 | #include "video_core/engines/fermi_2d.h" | 13 | #include "video_core/engines/fermi_2d.h" |
| 14 | #include "video_core/gpu.h" | 14 | #include "video_core/gpu.h" |
| 15 | #include "video_core/query_cache/types.h" | ||
| 15 | #include "video_core/rasterizer_download_area.h" | 16 | #include "video_core/rasterizer_download_area.h" |
| 16 | 17 | ||
| 17 | namespace Tegra { | 18 | namespace Tegra { |
| @@ -26,11 +27,6 @@ struct ChannelState; | |||
| 26 | 27 | ||
| 27 | namespace VideoCore { | 28 | namespace VideoCore { |
| 28 | 29 | ||
| 29 | enum class QueryType { | ||
| 30 | SamplesPassed, | ||
| 31 | }; | ||
| 32 | constexpr std::size_t NumQueryTypes = 1; | ||
| 33 | |||
| 34 | enum class LoadCallbackStage { | 30 | enum class LoadCallbackStage { |
| 35 | Prepare, | 31 | Prepare, |
| 36 | Build, | 32 | Build, |
| @@ -58,10 +54,11 @@ public: | |||
| 58 | virtual void DispatchCompute() = 0; | 54 | virtual void DispatchCompute() = 0; |
| 59 | 55 | ||
| 60 | /// Resets the counter of a query | 56 | /// Resets the counter of a query |
| 61 | virtual void ResetCounter(QueryType type) = 0; | 57 | virtual void ResetCounter(VideoCommon::QueryType type) = 0; |
| 62 | 58 | ||
| 63 | /// Records a GPU query and caches it | 59 | /// Records a GPU query and caches it |
| 64 | virtual void Query(GPUVAddr gpu_addr, QueryType type, std::optional<u64> timestamp) = 0; | 60 | virtual void Query(GPUVAddr gpu_addr, VideoCommon::QueryType type, |
| 61 | VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) = 0; | ||
| 65 | 62 | ||
| 66 | /// Signal an uniform buffer binding | 63 | /// Signal an uniform buffer binding |
| 67 | virtual void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, | 64 | virtual void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, |
| @@ -83,7 +80,7 @@ public: | |||
| 83 | virtual void SignalReference() = 0; | 80 | virtual void SignalReference() = 0; |
| 84 | 81 | ||
| 85 | /// Release all pending fences. | 82 | /// Release all pending fences. |
| 86 | virtual void ReleaseFences() = 0; | 83 | virtual void ReleaseFences(bool force = true) = 0; |
| 87 | 84 | ||
| 88 | /// Notify rasterizer that all caches should be flushed to Switch memory | 85 | /// Notify rasterizer that all caches should be flushed to Switch memory |
| 89 | virtual void FlushAll() = 0; | 86 | virtual void FlushAll() = 0; |
diff --git a/src/video_core/renderer_null/null_rasterizer.cpp b/src/video_core/renderer_null/null_rasterizer.cpp index 92ecf6682..65cd5aa06 100644 --- a/src/video_core/renderer_null/null_rasterizer.cpp +++ b/src/video_core/renderer_null/null_rasterizer.cpp | |||
| @@ -26,16 +26,18 @@ void RasterizerNull::Draw(bool is_indexed, u32 instance_count) {} | |||
| 26 | void RasterizerNull::DrawTexture() {} | 26 | void RasterizerNull::DrawTexture() {} |
| 27 | void RasterizerNull::Clear(u32 layer_count) {} | 27 | void RasterizerNull::Clear(u32 layer_count) {} |
| 28 | void RasterizerNull::DispatchCompute() {} | 28 | void RasterizerNull::DispatchCompute() {} |
| 29 | void RasterizerNull::ResetCounter(VideoCore::QueryType type) {} | 29 | void RasterizerNull::ResetCounter(VideoCommon::QueryType type) {} |
| 30 | void RasterizerNull::Query(GPUVAddr gpu_addr, VideoCore::QueryType type, | 30 | void RasterizerNull::Query(GPUVAddr gpu_addr, VideoCommon::QueryType type, |
| 31 | std::optional<u64> timestamp) { | 31 | VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) { |
| 32 | if (!gpu_memory) { | 32 | if (!gpu_memory) { |
| 33 | return; | 33 | return; |
| 34 | } | 34 | } |
| 35 | 35 | if (True(flags & VideoCommon::QueryPropertiesFlags::HasTimeout)) { | |
| 36 | gpu_memory->Write(gpu_addr, u64{0}); | 36 | u64 ticks = m_gpu.GetTicks(); |
| 37 | if (timestamp) { | 37 | gpu_memory->Write<u64>(gpu_addr + 8, ticks); |
| 38 | gpu_memory->Write(gpu_addr + 8, *timestamp); | 38 | gpu_memory->Write<u64>(gpu_addr, static_cast<u64>(payload)); |
| 39 | } else { | ||
| 40 | gpu_memory->Write<u32>(gpu_addr, payload); | ||
| 39 | } | 41 | } |
| 40 | } | 42 | } |
| 41 | void RasterizerNull::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, | 43 | void RasterizerNull::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, |
| @@ -74,7 +76,7 @@ void RasterizerNull::SignalSyncPoint(u32 value) { | |||
| 74 | syncpoint_manager.IncrementHost(value); | 76 | syncpoint_manager.IncrementHost(value); |
| 75 | } | 77 | } |
| 76 | void RasterizerNull::SignalReference() {} | 78 | void RasterizerNull::SignalReference() {} |
| 77 | void RasterizerNull::ReleaseFences() {} | 79 | void RasterizerNull::ReleaseFences(bool) {} |
| 78 | void RasterizerNull::FlushAndInvalidateRegion(VAddr addr, u64 size, VideoCommon::CacheType) {} | 80 | void RasterizerNull::FlushAndInvalidateRegion(VAddr addr, u64 size, VideoCommon::CacheType) {} |
| 79 | void RasterizerNull::WaitForIdle() {} | 81 | void RasterizerNull::WaitForIdle() {} |
| 80 | void RasterizerNull::FragmentBarrier() {} | 82 | void RasterizerNull::FragmentBarrier() {} |
diff --git a/src/video_core/renderer_null/null_rasterizer.h b/src/video_core/renderer_null/null_rasterizer.h index 93b9a6971..23001eeb8 100644 --- a/src/video_core/renderer_null/null_rasterizer.h +++ b/src/video_core/renderer_null/null_rasterizer.h | |||
| @@ -42,8 +42,9 @@ public: | |||
| 42 | void DrawTexture() override; | 42 | void DrawTexture() override; |
| 43 | void Clear(u32 layer_count) override; | 43 | void Clear(u32 layer_count) override; |
| 44 | void DispatchCompute() override; | 44 | void DispatchCompute() override; |
| 45 | void ResetCounter(VideoCore::QueryType type) override; | 45 | void ResetCounter(VideoCommon::QueryType type) override; |
| 46 | void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) override; | 46 | void Query(GPUVAddr gpu_addr, VideoCommon::QueryType type, |
| 47 | VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) override; | ||
| 47 | void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override; | 48 | void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override; |
| 48 | void DisableGraphicsUniformBuffer(size_t stage, u32 index) override; | 49 | void DisableGraphicsUniformBuffer(size_t stage, u32 index) override; |
| 49 | void FlushAll() override; | 50 | void FlushAll() override; |
| @@ -63,7 +64,7 @@ public: | |||
| 63 | void SyncOperation(std::function<void()>&& func) override; | 64 | void SyncOperation(std::function<void()>&& func) override; |
| 64 | void SignalSyncPoint(u32 value) override; | 65 | void SignalSyncPoint(u32 value) override; |
| 65 | void SignalReference() override; | 66 | void SignalReference() override; |
| 66 | void ReleaseFences() override; | 67 | void ReleaseFences(bool force) override; |
| 67 | void FlushAndInvalidateRegion( | 68 | void FlushAndInvalidateRegion( |
| 68 | VAddr addr, u64 size, VideoCommon::CacheType which = VideoCommon::CacheType::All) override; | 69 | VAddr addr, u64 size, VideoCommon::CacheType which = VideoCommon::CacheType::All) override; |
| 69 | void WaitForIdle() override; | 70 | void WaitForIdle() override; |
diff --git a/src/video_core/renderer_opengl/gl_query_cache.cpp b/src/video_core/renderer_opengl/gl_query_cache.cpp index 99d7347f5..ec142d48e 100644 --- a/src/video_core/renderer_opengl/gl_query_cache.cpp +++ b/src/video_core/renderer_opengl/gl_query_cache.cpp | |||
| @@ -27,7 +27,7 @@ constexpr GLenum GetTarget(VideoCore::QueryType type) { | |||
| 27 | } // Anonymous namespace | 27 | } // Anonymous namespace |
| 28 | 28 | ||
| 29 | QueryCache::QueryCache(RasterizerOpenGL& rasterizer_, Core::Memory::Memory& cpu_memory_) | 29 | QueryCache::QueryCache(RasterizerOpenGL& rasterizer_, Core::Memory::Memory& cpu_memory_) |
| 30 | : QueryCacheBase(rasterizer_, cpu_memory_), gl_rasterizer{rasterizer_} {} | 30 | : QueryCacheLegacy(rasterizer_, cpu_memory_), gl_rasterizer{rasterizer_} {} |
| 31 | 31 | ||
| 32 | QueryCache::~QueryCache() = default; | 32 | QueryCache::~QueryCache() = default; |
| 33 | 33 | ||
diff --git a/src/video_core/renderer_opengl/gl_query_cache.h b/src/video_core/renderer_opengl/gl_query_cache.h index 872513f22..0721e0b3d 100644 --- a/src/video_core/renderer_opengl/gl_query_cache.h +++ b/src/video_core/renderer_opengl/gl_query_cache.h | |||
| @@ -26,7 +26,7 @@ class RasterizerOpenGL; | |||
| 26 | using CounterStream = VideoCommon::CounterStreamBase<QueryCache, HostCounter>; | 26 | using CounterStream = VideoCommon::CounterStreamBase<QueryCache, HostCounter>; |
| 27 | 27 | ||
| 28 | class QueryCache final | 28 | class QueryCache final |
| 29 | : public VideoCommon::QueryCacheBase<QueryCache, CachedQuery, CounterStream, HostCounter> { | 29 | : public VideoCommon::QueryCacheLegacy<QueryCache, CachedQuery, CounterStream, HostCounter> { |
| 30 | public: | 30 | public: |
| 31 | explicit QueryCache(RasterizerOpenGL& rasterizer_, Core::Memory::Memory& cpu_memory_); | 31 | explicit QueryCache(RasterizerOpenGL& rasterizer_, Core::Memory::Memory& cpu_memory_); |
| 32 | ~QueryCache(); | 32 | ~QueryCache(); |
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index dd03efecd..27e2de1bf 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp | |||
| @@ -396,13 +396,39 @@ void RasterizerOpenGL::DispatchCompute() { | |||
| 396 | has_written_global_memory |= pipeline->WritesGlobalMemory(); | 396 | has_written_global_memory |= pipeline->WritesGlobalMemory(); |
| 397 | } | 397 | } |
| 398 | 398 | ||
| 399 | void RasterizerOpenGL::ResetCounter(VideoCore::QueryType type) { | 399 | void RasterizerOpenGL::ResetCounter(VideoCommon::QueryType type) { |
| 400 | query_cache.ResetCounter(type); | 400 | if (type == VideoCommon::QueryType::ZPassPixelCount64) { |
| 401 | query_cache.ResetCounter(VideoCore::QueryType::SamplesPassed); | ||
| 402 | } | ||
| 401 | } | 403 | } |
| 402 | 404 | ||
| 403 | void RasterizerOpenGL::Query(GPUVAddr gpu_addr, VideoCore::QueryType type, | 405 | void RasterizerOpenGL::Query(GPUVAddr gpu_addr, VideoCommon::QueryType type, |
| 404 | std::optional<u64> timestamp) { | 406 | VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) { |
| 405 | query_cache.Query(gpu_addr, type, timestamp); | 407 | if (type == VideoCommon::QueryType::ZPassPixelCount64) { |
| 408 | if (True(flags & VideoCommon::QueryPropertiesFlags::HasTimeout)) { | ||
| 409 | query_cache.Query(gpu_addr, VideoCore::QueryType::SamplesPassed, {gpu.GetTicks()}); | ||
| 410 | } else { | ||
| 411 | query_cache.Query(gpu_addr, VideoCore::QueryType::SamplesPassed, std::nullopt); | ||
| 412 | } | ||
| 413 | return; | ||
| 414 | } | ||
| 415 | if (type != VideoCommon::QueryType::Payload) { | ||
| 416 | payload = 1u; | ||
| 417 | } | ||
| 418 | std::function<void()> func([this, gpu_addr, flags, memory_manager = gpu_memory, payload]() { | ||
| 419 | if (True(flags & VideoCommon::QueryPropertiesFlags::HasTimeout)) { | ||
| 420 | u64 ticks = gpu.GetTicks(); | ||
| 421 | memory_manager->Write<u64>(gpu_addr + 8, ticks); | ||
| 422 | memory_manager->Write<u64>(gpu_addr, static_cast<u64>(payload)); | ||
| 423 | } else { | ||
| 424 | memory_manager->Write<u32>(gpu_addr, payload); | ||
| 425 | } | ||
| 426 | }); | ||
| 427 | if (True(flags & VideoCommon::QueryPropertiesFlags::IsAFence)) { | ||
| 428 | SignalFence(std::move(func)); | ||
| 429 | return; | ||
| 430 | } | ||
| 431 | func(); | ||
| 406 | } | 432 | } |
| 407 | 433 | ||
| 408 | void RasterizerOpenGL::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, | 434 | void RasterizerOpenGL::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, |
| @@ -573,8 +599,8 @@ void RasterizerOpenGL::SignalReference() { | |||
| 573 | fence_manager.SignalOrdering(); | 599 | fence_manager.SignalOrdering(); |
| 574 | } | 600 | } |
| 575 | 601 | ||
| 576 | void RasterizerOpenGL::ReleaseFences() { | 602 | void RasterizerOpenGL::ReleaseFences(bool force) { |
| 577 | fence_manager.WaitPendingFences(); | 603 | fence_manager.WaitPendingFences(force); |
| 578 | } | 604 | } |
| 579 | 605 | ||
| 580 | void RasterizerOpenGL::FlushAndInvalidateRegion(VAddr addr, u64 size, | 606 | void RasterizerOpenGL::FlushAndInvalidateRegion(VAddr addr, u64 size, |
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index 8eda2ddba..ceffe1f1e 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h | |||
| @@ -86,8 +86,9 @@ public: | |||
| 86 | void DrawTexture() override; | 86 | void DrawTexture() override; |
| 87 | void Clear(u32 layer_count) override; | 87 | void Clear(u32 layer_count) override; |
| 88 | void DispatchCompute() override; | 88 | void DispatchCompute() override; |
| 89 | void ResetCounter(VideoCore::QueryType type) override; | 89 | void ResetCounter(VideoCommon::QueryType type) override; |
| 90 | void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) override; | 90 | void Query(GPUVAddr gpu_addr, VideoCommon::QueryType type, |
| 91 | VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) override; | ||
| 91 | void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override; | 92 | void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override; |
| 92 | void DisableGraphicsUniformBuffer(size_t stage, u32 index) override; | 93 | void DisableGraphicsUniformBuffer(size_t stage, u32 index) override; |
| 93 | void FlushAll() override; | 94 | void FlushAll() override; |
| @@ -107,7 +108,7 @@ public: | |||
| 107 | void SyncOperation(std::function<void()>&& func) override; | 108 | void SyncOperation(std::function<void()>&& func) override; |
| 108 | void SignalSyncPoint(u32 value) override; | 109 | void SignalSyncPoint(u32 value) override; |
| 109 | void SignalReference() override; | 110 | void SignalReference() override; |
| 110 | void ReleaseFences() override; | 111 | void ReleaseFences(bool force = true) override; |
| 111 | void FlushAndInvalidateRegion( | 112 | void FlushAndInvalidateRegion( |
| 112 | VAddr addr, u64 size, VideoCommon::CacheType which = VideoCommon::CacheType::All) override; | 113 | VAddr addr, u64 size, VideoCommon::CacheType which = VideoCommon::CacheType::All) override; |
| 113 | void WaitForIdle() override; | 114 | void WaitForIdle() override; |
diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp index e15865d16..d8148e89a 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp | |||
| @@ -61,6 +61,9 @@ vk::Buffer CreateBuffer(const Device& device, const MemoryAllocator& memory_allo | |||
| 61 | if (device.IsExtTransformFeedbackSupported()) { | 61 | if (device.IsExtTransformFeedbackSupported()) { |
| 62 | flags |= VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_BUFFER_BIT_EXT; | 62 | flags |= VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_BUFFER_BIT_EXT; |
| 63 | } | 63 | } |
| 64 | if (device.IsExtConditionalRendering()) { | ||
| 65 | flags |= VK_BUFFER_USAGE_CONDITIONAL_RENDERING_BIT_EXT; | ||
| 66 | } | ||
| 64 | const VkBufferCreateInfo buffer_ci = { | 67 | const VkBufferCreateInfo buffer_ci = { |
| 65 | .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, | 68 | .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, |
| 66 | .pNext = nullptr, | 69 | .pNext = nullptr, |
diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp index 54ee030ce..289d5b25c 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp | |||
| @@ -12,6 +12,9 @@ | |||
| 12 | #include "common/common_types.h" | 12 | #include "common/common_types.h" |
| 13 | #include "common/div_ceil.h" | 13 | #include "common/div_ceil.h" |
| 14 | #include "video_core/host_shaders/astc_decoder_comp_spv.h" | 14 | #include "video_core/host_shaders/astc_decoder_comp_spv.h" |
| 15 | #include "video_core/host_shaders/queries_prefix_scan_sum_comp_spv.h" | ||
| 16 | #include "video_core/host_shaders/queries_prefix_scan_sum_nosubgroups_comp_spv.h" | ||
| 17 | #include "video_core/host_shaders/resolve_conditional_render_comp_spv.h" | ||
| 15 | #include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h" | 18 | #include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h" |
| 16 | #include "video_core/host_shaders/vulkan_uint8_comp_spv.h" | 19 | #include "video_core/host_shaders/vulkan_uint8_comp_spv.h" |
| 17 | #include "video_core/renderer_vulkan/vk_compute_pass.h" | 20 | #include "video_core/renderer_vulkan/vk_compute_pass.h" |
| @@ -57,6 +60,30 @@ constexpr std::array<VkDescriptorSetLayoutBinding, 2> INPUT_OUTPUT_DESCRIPTOR_SE | |||
| 57 | }, | 60 | }, |
| 58 | }}; | 61 | }}; |
| 59 | 62 | ||
| 63 | constexpr std::array<VkDescriptorSetLayoutBinding, 3> QUERIES_SCAN_DESCRIPTOR_SET_BINDINGS{{ | ||
| 64 | { | ||
| 65 | .binding = 0, | ||
| 66 | .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, | ||
| 67 | .descriptorCount = 1, | ||
| 68 | .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, | ||
| 69 | .pImmutableSamplers = nullptr, | ||
| 70 | }, | ||
| 71 | { | ||
| 72 | .binding = 1, | ||
| 73 | .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, | ||
| 74 | .descriptorCount = 1, | ||
| 75 | .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, | ||
| 76 | .pImmutableSamplers = nullptr, | ||
| 77 | }, | ||
| 78 | { | ||
| 79 | .binding = 2, | ||
| 80 | .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, | ||
| 81 | .descriptorCount = 1, | ||
| 82 | .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, | ||
| 83 | .pImmutableSamplers = nullptr, | ||
| 84 | }, | ||
| 85 | }}; | ||
| 86 | |||
| 60 | constexpr DescriptorBankInfo INPUT_OUTPUT_BANK_INFO{ | 87 | constexpr DescriptorBankInfo INPUT_OUTPUT_BANK_INFO{ |
| 61 | .uniform_buffers = 0, | 88 | .uniform_buffers = 0, |
| 62 | .storage_buffers = 2, | 89 | .storage_buffers = 2, |
| @@ -67,6 +94,16 @@ constexpr DescriptorBankInfo INPUT_OUTPUT_BANK_INFO{ | |||
| 67 | .score = 2, | 94 | .score = 2, |
| 68 | }; | 95 | }; |
| 69 | 96 | ||
| 97 | constexpr DescriptorBankInfo QUERIES_SCAN_BANK_INFO{ | ||
| 98 | .uniform_buffers = 0, | ||
| 99 | .storage_buffers = 3, | ||
| 100 | .texture_buffers = 0, | ||
| 101 | .image_buffers = 0, | ||
| 102 | .textures = 0, | ||
| 103 | .images = 0, | ||
| 104 | .score = 3, | ||
| 105 | }; | ||
| 106 | |||
| 70 | constexpr std::array<VkDescriptorSetLayoutBinding, ASTC_NUM_BINDINGS> ASTC_DESCRIPTOR_SET_BINDINGS{{ | 107 | constexpr std::array<VkDescriptorSetLayoutBinding, ASTC_NUM_BINDINGS> ASTC_DESCRIPTOR_SET_BINDINGS{{ |
| 71 | { | 108 | { |
| 72 | .binding = ASTC_BINDING_INPUT_BUFFER, | 109 | .binding = ASTC_BINDING_INPUT_BUFFER, |
| @@ -103,6 +140,15 @@ constexpr VkDescriptorUpdateTemplateEntry INPUT_OUTPUT_DESCRIPTOR_UPDATE_TEMPLAT | |||
| 103 | .stride = sizeof(DescriptorUpdateEntry), | 140 | .stride = sizeof(DescriptorUpdateEntry), |
| 104 | }; | 141 | }; |
| 105 | 142 | ||
| 143 | constexpr VkDescriptorUpdateTemplateEntry QUERIES_SCAN_DESCRIPTOR_UPDATE_TEMPLATE{ | ||
| 144 | .dstBinding = 0, | ||
| 145 | .dstArrayElement = 0, | ||
| 146 | .descriptorCount = 3, | ||
| 147 | .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, | ||
| 148 | .offset = 0, | ||
| 149 | .stride = sizeof(DescriptorUpdateEntry), | ||
| 150 | }; | ||
| 151 | |||
| 106 | constexpr std::array<VkDescriptorUpdateTemplateEntry, ASTC_NUM_BINDINGS> | 152 | constexpr std::array<VkDescriptorUpdateTemplateEntry, ASTC_NUM_BINDINGS> |
| 107 | ASTC_PASS_DESCRIPTOR_UPDATE_TEMPLATE_ENTRY{{ | 153 | ASTC_PASS_DESCRIPTOR_UPDATE_TEMPLATE_ENTRY{{ |
| 108 | { | 154 | { |
| @@ -131,13 +177,21 @@ struct AstcPushConstants { | |||
| 131 | u32 block_height; | 177 | u32 block_height; |
| 132 | u32 block_height_mask; | 178 | u32 block_height_mask; |
| 133 | }; | 179 | }; |
| 180 | |||
| 181 | struct QueriesPrefixScanPushConstants { | ||
| 182 | u32 min_accumulation_base; | ||
| 183 | u32 max_accumulation_base; | ||
| 184 | u32 accumulation_limit; | ||
| 185 | u32 buffer_offset; | ||
| 186 | }; | ||
| 134 | } // Anonymous namespace | 187 | } // Anonymous namespace |
| 135 | 188 | ||
| 136 | ComputePass::ComputePass(const Device& device_, DescriptorPool& descriptor_pool, | 189 | ComputePass::ComputePass(const Device& device_, DescriptorPool& descriptor_pool, |
| 137 | vk::Span<VkDescriptorSetLayoutBinding> bindings, | 190 | vk::Span<VkDescriptorSetLayoutBinding> bindings, |
| 138 | vk::Span<VkDescriptorUpdateTemplateEntry> templates, | 191 | vk::Span<VkDescriptorUpdateTemplateEntry> templates, |
| 139 | const DescriptorBankInfo& bank_info, | 192 | const DescriptorBankInfo& bank_info, |
| 140 | vk::Span<VkPushConstantRange> push_constants, std::span<const u32> code) | 193 | vk::Span<VkPushConstantRange> push_constants, std::span<const u32> code, |
| 194 | std::optional<u32> optional_subgroup_size) | ||
| 141 | : device{device_} { | 195 | : device{device_} { |
| 142 | descriptor_set_layout = device.GetLogical().CreateDescriptorSetLayout({ | 196 | descriptor_set_layout = device.GetLogical().CreateDescriptorSetLayout({ |
| 143 | .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, | 197 | .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, |
| @@ -178,13 +232,19 @@ ComputePass::ComputePass(const Device& device_, DescriptorPool& descriptor_pool, | |||
| 178 | .pCode = code.data(), | 232 | .pCode = code.data(), |
| 179 | }); | 233 | }); |
| 180 | device.SaveShader(code); | 234 | device.SaveShader(code); |
| 235 | const VkPipelineShaderStageRequiredSubgroupSizeCreateInfoEXT subgroup_size_ci{ | ||
| 236 | .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO_EXT, | ||
| 237 | .pNext = nullptr, | ||
| 238 | .requiredSubgroupSize = optional_subgroup_size ? *optional_subgroup_size : 32U, | ||
| 239 | }; | ||
| 240 | bool use_setup_size = device.IsExtSubgroupSizeControlSupported() && optional_subgroup_size; | ||
| 181 | pipeline = device.GetLogical().CreateComputePipeline({ | 241 | pipeline = device.GetLogical().CreateComputePipeline({ |
| 182 | .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, | 242 | .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, |
| 183 | .pNext = nullptr, | 243 | .pNext = nullptr, |
| 184 | .flags = 0, | 244 | .flags = 0, |
| 185 | .stage{ | 245 | .stage{ |
| 186 | .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, | 246 | .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, |
| 187 | .pNext = nullptr, | 247 | .pNext = use_setup_size ? &subgroup_size_ci : nullptr, |
| 188 | .flags = 0, | 248 | .flags = 0, |
| 189 | .stage = VK_SHADER_STAGE_COMPUTE_BIT, | 249 | .stage = VK_SHADER_STAGE_COMPUTE_BIT, |
| 190 | .module = *module, | 250 | .module = *module, |
| @@ -302,6 +362,123 @@ std::pair<VkBuffer, VkDeviceSize> QuadIndexedPass::Assemble( | |||
| 302 | return {staging.buffer, staging.offset}; | 362 | return {staging.buffer, staging.offset}; |
| 303 | } | 363 | } |
| 304 | 364 | ||
| 365 | ConditionalRenderingResolvePass::ConditionalRenderingResolvePass( | ||
| 366 | const Device& device_, Scheduler& scheduler_, DescriptorPool& descriptor_pool_, | ||
| 367 | ComputePassDescriptorQueue& compute_pass_descriptor_queue_) | ||
| 368 | : ComputePass(device_, descriptor_pool_, INPUT_OUTPUT_DESCRIPTOR_SET_BINDINGS, | ||
| 369 | INPUT_OUTPUT_DESCRIPTOR_UPDATE_TEMPLATE, INPUT_OUTPUT_BANK_INFO, nullptr, | ||
| 370 | RESOLVE_CONDITIONAL_RENDER_COMP_SPV), | ||
| 371 | scheduler{scheduler_}, compute_pass_descriptor_queue{compute_pass_descriptor_queue_} {} | ||
| 372 | |||
| 373 | void ConditionalRenderingResolvePass::Resolve(VkBuffer dst_buffer, VkBuffer src_buffer, | ||
| 374 | u32 src_offset, bool compare_to_zero) { | ||
| 375 | const size_t compare_size = compare_to_zero ? 8 : 24; | ||
| 376 | |||
| 377 | compute_pass_descriptor_queue.Acquire(); | ||
| 378 | compute_pass_descriptor_queue.AddBuffer(src_buffer, src_offset, compare_size); | ||
| 379 | compute_pass_descriptor_queue.AddBuffer(dst_buffer, 0, sizeof(u32)); | ||
| 380 | const void* const descriptor_data{compute_pass_descriptor_queue.UpdateData()}; | ||
| 381 | |||
| 382 | scheduler.RequestOutsideRenderPassOperationContext(); | ||
| 383 | scheduler.Record([this, descriptor_data](vk::CommandBuffer cmdbuf) { | ||
| 384 | static constexpr VkMemoryBarrier read_barrier{ | ||
| 385 | .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, | ||
| 386 | .pNext = nullptr, | ||
| 387 | .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_SHADER_WRITE_BIT, | ||
| 388 | .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, | ||
| 389 | }; | ||
| 390 | static constexpr VkMemoryBarrier write_barrier{ | ||
| 391 | .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, | ||
| 392 | .pNext = nullptr, | ||
| 393 | .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT, | ||
| 394 | .dstAccessMask = VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT, | ||
| 395 | }; | ||
| 396 | const VkDescriptorSet set = descriptor_allocator.Commit(); | ||
| 397 | device.GetLogical().UpdateDescriptorSet(set, *descriptor_template, descriptor_data); | ||
| 398 | |||
| 399 | cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, | ||
| 400 | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, read_barrier); | ||
| 401 | cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline); | ||
| 402 | cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *layout, 0, set, {}); | ||
| 403 | cmdbuf.Dispatch(1, 1, 1); | ||
| 404 | cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, | ||
| 405 | VK_PIPELINE_STAGE_CONDITIONAL_RENDERING_BIT_EXT, 0, write_barrier); | ||
| 406 | }); | ||
| 407 | } | ||
| 408 | |||
| 409 | QueriesPrefixScanPass::QueriesPrefixScanPass( | ||
| 410 | const Device& device_, Scheduler& scheduler_, DescriptorPool& descriptor_pool_, | ||
| 411 | ComputePassDescriptorQueue& compute_pass_descriptor_queue_) | ||
| 412 | : ComputePass( | ||
| 413 | device_, descriptor_pool_, QUERIES_SCAN_DESCRIPTOR_SET_BINDINGS, | ||
| 414 | QUERIES_SCAN_DESCRIPTOR_UPDATE_TEMPLATE, QUERIES_SCAN_BANK_INFO, | ||
| 415 | COMPUTE_PUSH_CONSTANT_RANGE<sizeof(QueriesPrefixScanPushConstants)>, | ||
| 416 | device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_BASIC_BIT) && | ||
| 417 | device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_ARITHMETIC_BIT) && | ||
| 418 | device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_SHUFFLE_BIT) && | ||
| 419 | device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT) | ||
| 420 | ? std::span<const u32>(QUERIES_PREFIX_SCAN_SUM_COMP_SPV) | ||
| 421 | : std::span<const u32>(QUERIES_PREFIX_SCAN_SUM_NOSUBGROUPS_COMP_SPV)), | ||
| 422 | scheduler{scheduler_}, compute_pass_descriptor_queue{compute_pass_descriptor_queue_} {} | ||
| 423 | |||
| 424 | void QueriesPrefixScanPass::Run(VkBuffer accumulation_buffer, VkBuffer dst_buffer, | ||
| 425 | VkBuffer src_buffer, size_t number_of_sums, | ||
| 426 | size_t min_accumulation_limit, size_t max_accumulation_limit) { | ||
| 427 | size_t current_runs = number_of_sums; | ||
| 428 | size_t offset = 0; | ||
| 429 | while (current_runs != 0) { | ||
| 430 | static constexpr size_t DISPATCH_SIZE = 2048U; | ||
| 431 | size_t runs_to_do = std::min<size_t>(current_runs, DISPATCH_SIZE); | ||
| 432 | current_runs -= runs_to_do; | ||
| 433 | compute_pass_descriptor_queue.Acquire(); | ||
| 434 | compute_pass_descriptor_queue.AddBuffer(src_buffer, 0, number_of_sums * sizeof(u64)); | ||
| 435 | compute_pass_descriptor_queue.AddBuffer(dst_buffer, 0, number_of_sums * sizeof(u64)); | ||
| 436 | compute_pass_descriptor_queue.AddBuffer(accumulation_buffer, 0, sizeof(u64)); | ||
| 437 | const void* const descriptor_data{compute_pass_descriptor_queue.UpdateData()}; | ||
| 438 | size_t used_offset = offset; | ||
| 439 | offset += runs_to_do; | ||
| 440 | |||
| 441 | scheduler.RequestOutsideRenderPassOperationContext(); | ||
| 442 | scheduler.Record([this, descriptor_data, min_accumulation_limit, max_accumulation_limit, | ||
| 443 | runs_to_do, used_offset](vk::CommandBuffer cmdbuf) { | ||
| 444 | static constexpr VkMemoryBarrier read_barrier{ | ||
| 445 | .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, | ||
| 446 | .pNext = nullptr, | ||
| 447 | .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, | ||
| 448 | .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, | ||
| 449 | }; | ||
| 450 | static constexpr VkMemoryBarrier write_barrier{ | ||
| 451 | .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, | ||
| 452 | .pNext = nullptr, | ||
| 453 | .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT, | ||
| 454 | .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_TRANSFER_READ_BIT | | ||
| 455 | VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT | | ||
| 456 | VK_ACCESS_INDIRECT_COMMAND_READ_BIT | VK_ACCESS_INDEX_READ_BIT | | ||
| 457 | VK_ACCESS_UNIFORM_READ_BIT | | ||
| 458 | VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT, | ||
| 459 | }; | ||
| 460 | const QueriesPrefixScanPushConstants uniforms{ | ||
| 461 | .min_accumulation_base = static_cast<u32>(min_accumulation_limit), | ||
| 462 | .max_accumulation_base = static_cast<u32>(max_accumulation_limit), | ||
| 463 | .accumulation_limit = static_cast<u32>(runs_to_do - 1), | ||
| 464 | .buffer_offset = static_cast<u32>(used_offset), | ||
| 465 | }; | ||
| 466 | const VkDescriptorSet set = descriptor_allocator.Commit(); | ||
| 467 | device.GetLogical().UpdateDescriptorSet(set, *descriptor_template, descriptor_data); | ||
| 468 | |||
| 469 | cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, | ||
| 470 | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, read_barrier); | ||
| 471 | cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline); | ||
| 472 | cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *layout, 0, set, {}); | ||
| 473 | cmdbuf.PushConstants(*layout, VK_SHADER_STAGE_COMPUTE_BIT, uniforms); | ||
| 474 | cmdbuf.Dispatch(1, 1, 1); | ||
| 475 | cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, | ||
| 476 | VK_PIPELINE_STAGE_CONDITIONAL_RENDERING_BIT_EXT, 0, | ||
| 477 | write_barrier); | ||
| 478 | }); | ||
| 479 | } | ||
| 480 | } | ||
| 481 | |||
| 305 | ASTCDecoderPass::ASTCDecoderPass(const Device& device_, Scheduler& scheduler_, | 482 | ASTCDecoderPass::ASTCDecoderPass(const Device& device_, Scheduler& scheduler_, |
| 306 | DescriptorPool& descriptor_pool_, | 483 | DescriptorPool& descriptor_pool_, |
| 307 | StagingBufferPool& staging_buffer_pool_, | 484 | StagingBufferPool& staging_buffer_pool_, |
diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.h b/src/video_core/renderer_vulkan/vk_compute_pass.h index dd3927376..3ff935639 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.h +++ b/src/video_core/renderer_vulkan/vk_compute_pass.h | |||
| @@ -3,6 +3,7 @@ | |||
| 3 | 3 | ||
| 4 | #pragma once | 4 | #pragma once |
| 5 | 5 | ||
| 6 | #include <optional> | ||
| 6 | #include <span> | 7 | #include <span> |
| 7 | #include <utility> | 8 | #include <utility> |
| 8 | 9 | ||
| @@ -31,7 +32,8 @@ public: | |||
| 31 | vk::Span<VkDescriptorSetLayoutBinding> bindings, | 32 | vk::Span<VkDescriptorSetLayoutBinding> bindings, |
| 32 | vk::Span<VkDescriptorUpdateTemplateEntry> templates, | 33 | vk::Span<VkDescriptorUpdateTemplateEntry> templates, |
| 33 | const DescriptorBankInfo& bank_info, | 34 | const DescriptorBankInfo& bank_info, |
| 34 | vk::Span<VkPushConstantRange> push_constants, std::span<const u32> code); | 35 | vk::Span<VkPushConstantRange> push_constants, std::span<const u32> code, |
| 36 | std::optional<u32> optional_subgroup_size = std::nullopt); | ||
| 35 | ~ComputePass(); | 37 | ~ComputePass(); |
| 36 | 38 | ||
| 37 | protected: | 39 | protected: |
| @@ -82,6 +84,33 @@ private: | |||
| 82 | ComputePassDescriptorQueue& compute_pass_descriptor_queue; | 84 | ComputePassDescriptorQueue& compute_pass_descriptor_queue; |
| 83 | }; | 85 | }; |
| 84 | 86 | ||
| 87 | class ConditionalRenderingResolvePass final : public ComputePass { | ||
| 88 | public: | ||
| 89 | explicit ConditionalRenderingResolvePass( | ||
| 90 | const Device& device_, Scheduler& scheduler_, DescriptorPool& descriptor_pool_, | ||
| 91 | ComputePassDescriptorQueue& compute_pass_descriptor_queue_); | ||
| 92 | |||
| 93 | void Resolve(VkBuffer dst_buffer, VkBuffer src_buffer, u32 src_offset, bool compare_to_zero); | ||
| 94 | |||
| 95 | private: | ||
| 96 | Scheduler& scheduler; | ||
| 97 | ComputePassDescriptorQueue& compute_pass_descriptor_queue; | ||
| 98 | }; | ||
| 99 | |||
| 100 | class QueriesPrefixScanPass final : public ComputePass { | ||
| 101 | public: | ||
| 102 | explicit QueriesPrefixScanPass(const Device& device_, Scheduler& scheduler_, | ||
| 103 | DescriptorPool& descriptor_pool_, | ||
| 104 | ComputePassDescriptorQueue& compute_pass_descriptor_queue_); | ||
| 105 | |||
| 106 | void Run(VkBuffer accumulation_buffer, VkBuffer dst_buffer, VkBuffer src_buffer, | ||
| 107 | size_t number_of_sums, size_t min_accumulation_limit, size_t max_accumulation_limit); | ||
| 108 | |||
| 109 | private: | ||
| 110 | Scheduler& scheduler; | ||
| 111 | ComputePassDescriptorQueue& compute_pass_descriptor_queue; | ||
| 112 | }; | ||
| 113 | |||
| 85 | class ASTCDecoderPass final : public ComputePass { | 114 | class ASTCDecoderPass final : public ComputePass { |
| 86 | public: | 115 | public: |
| 87 | explicit ASTCDecoderPass(const Device& device_, Scheduler& scheduler_, | 116 | explicit ASTCDecoderPass(const Device& device_, Scheduler& scheduler_, |
diff --git a/src/video_core/renderer_vulkan/vk_fence_manager.h b/src/video_core/renderer_vulkan/vk_fence_manager.h index 145359d4e..336573574 100644 --- a/src/video_core/renderer_vulkan/vk_fence_manager.h +++ b/src/video_core/renderer_vulkan/vk_fence_manager.h | |||
| @@ -7,6 +7,7 @@ | |||
| 7 | 7 | ||
| 8 | #include "video_core/fence_manager.h" | 8 | #include "video_core/fence_manager.h" |
| 9 | #include "video_core/renderer_vulkan/vk_buffer_cache.h" | 9 | #include "video_core/renderer_vulkan/vk_buffer_cache.h" |
| 10 | #include "video_core/renderer_vulkan/vk_query_cache.h" | ||
| 10 | #include "video_core/renderer_vulkan/vk_texture_cache.h" | 11 | #include "video_core/renderer_vulkan/vk_texture_cache.h" |
| 11 | 12 | ||
| 12 | namespace Core { | 13 | namespace Core { |
| @@ -20,7 +21,6 @@ class RasterizerInterface; | |||
| 20 | namespace Vulkan { | 21 | namespace Vulkan { |
| 21 | 22 | ||
| 22 | class Device; | 23 | class Device; |
| 23 | class QueryCache; | ||
| 24 | class Scheduler; | 24 | class Scheduler; |
| 25 | 25 | ||
| 26 | class InnerFence : public VideoCommon::FenceBase { | 26 | class InnerFence : public VideoCommon::FenceBase { |
diff --git a/src/video_core/renderer_vulkan/vk_query_cache.cpp b/src/video_core/renderer_vulkan/vk_query_cache.cpp index 29e0b797b..a32da3ba3 100644 --- a/src/video_core/renderer_vulkan/vk_query_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_query_cache.cpp | |||
| @@ -1,139 +1,1552 @@ | |||
| 1 | // SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project | 1 | // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project |
| 2 | // SPDX-License-Identifier: GPL-2.0-or-later | 2 | // SPDX-License-Identifier: GPL-3.0-or-later |
| 3 | 3 | ||
| 4 | #include <algorithm> | ||
| 5 | #include <cstddef> | 4 | #include <cstddef> |
| 5 | #include <limits> | ||
| 6 | #include <map> | ||
| 7 | #include <memory> | ||
| 8 | #include <span> | ||
| 9 | #include <type_traits> | ||
| 10 | #include <unordered_map> | ||
| 6 | #include <utility> | 11 | #include <utility> |
| 7 | #include <vector> | 12 | #include <vector> |
| 8 | 13 | ||
| 14 | #include "common/bit_util.h" | ||
| 15 | #include "common/common_types.h" | ||
| 16 | #include "core/memory.h" | ||
| 17 | #include "video_core/engines/draw_manager.h" | ||
| 18 | #include "video_core/query_cache/query_cache.h" | ||
| 19 | #include "video_core/renderer_vulkan/vk_buffer_cache.h" | ||
| 20 | #include "video_core/renderer_vulkan/vk_compute_pass.h" | ||
| 9 | #include "video_core/renderer_vulkan/vk_query_cache.h" | 21 | #include "video_core/renderer_vulkan/vk_query_cache.h" |
| 10 | #include "video_core/renderer_vulkan/vk_resource_pool.h" | 22 | #include "video_core/renderer_vulkan/vk_resource_pool.h" |
| 11 | #include "video_core/renderer_vulkan/vk_scheduler.h" | 23 | #include "video_core/renderer_vulkan/vk_scheduler.h" |
| 24 | #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" | ||
| 25 | #include "video_core/renderer_vulkan/vk_update_descriptor.h" | ||
| 12 | #include "video_core/vulkan_common/vulkan_device.h" | 26 | #include "video_core/vulkan_common/vulkan_device.h" |
| 27 | #include "video_core/vulkan_common/vulkan_memory_allocator.h" | ||
| 13 | #include "video_core/vulkan_common/vulkan_wrapper.h" | 28 | #include "video_core/vulkan_common/vulkan_wrapper.h" |
| 14 | 29 | ||
| 15 | namespace Vulkan { | 30 | namespace Vulkan { |
| 16 | 31 | ||
| 17 | using VideoCore::QueryType; | 32 | using Tegra::Engines::Maxwell3D; |
| 33 | using VideoCommon::QueryType; | ||
| 18 | 34 | ||
| 19 | namespace { | 35 | namespace { |
| 36 | class SamplesQueryBank : public VideoCommon::BankBase { | ||
| 37 | public: | ||
| 38 | static constexpr size_t BANK_SIZE = 256; | ||
| 39 | static constexpr size_t QUERY_SIZE = 8; | ||
| 40 | explicit SamplesQueryBank(const Device& device_, size_t index_) | ||
| 41 | : BankBase(BANK_SIZE), device{device_}, index{index_} { | ||
| 42 | const auto& dev = device.GetLogical(); | ||
| 43 | query_pool = dev.CreateQueryPool({ | ||
| 44 | .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO, | ||
| 45 | .pNext = nullptr, | ||
| 46 | .flags = 0, | ||
| 47 | .queryType = VK_QUERY_TYPE_OCCLUSION, | ||
| 48 | .queryCount = BANK_SIZE, | ||
| 49 | .pipelineStatistics = 0, | ||
| 50 | }); | ||
| 51 | Reset(); | ||
| 52 | } | ||
| 20 | 53 | ||
| 21 | constexpr std::array QUERY_TARGETS = {VK_QUERY_TYPE_OCCLUSION}; | 54 | ~SamplesQueryBank() = default; |
| 22 | 55 | ||
| 23 | constexpr VkQueryType GetTarget(QueryType type) { | 56 | void Reset() override { |
| 24 | return QUERY_TARGETS[static_cast<std::size_t>(type)]; | 57 | ASSERT(references == 0); |
| 25 | } | 58 | VideoCommon::BankBase::Reset(); |
| 59 | const auto& dev = device.GetLogical(); | ||
| 60 | dev.ResetQueryPool(*query_pool, 0, BANK_SIZE); | ||
| 61 | host_results.fill(0ULL); | ||
| 62 | next_bank = 0; | ||
| 63 | } | ||
| 64 | |||
| 65 | void Sync(size_t start, size_t size) { | ||
| 66 | const auto& dev = device.GetLogical(); | ||
| 67 | const VkResult query_result = dev.GetQueryResults( | ||
| 68 | *query_pool, static_cast<u32>(start), static_cast<u32>(size), sizeof(u64) * size, | ||
| 69 | &host_results[start], sizeof(u64), VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT); | ||
| 70 | switch (query_result) { | ||
| 71 | case VK_SUCCESS: | ||
| 72 | return; | ||
| 73 | case VK_ERROR_DEVICE_LOST: | ||
| 74 | device.ReportLoss(); | ||
| 75 | [[fallthrough]]; | ||
| 76 | default: | ||
| 77 | throw vk::Exception(query_result); | ||
| 78 | } | ||
| 79 | } | ||
| 80 | |||
| 81 | VkQueryPool GetInnerPool() { | ||
| 82 | return *query_pool; | ||
| 83 | } | ||
| 84 | |||
| 85 | size_t GetIndex() const { | ||
| 86 | return index; | ||
| 87 | } | ||
| 88 | |||
| 89 | const std::array<u64, BANK_SIZE>& GetResults() const { | ||
| 90 | return host_results; | ||
| 91 | } | ||
| 92 | |||
| 93 | size_t next_bank; | ||
| 94 | |||
| 95 | private: | ||
| 96 | const Device& device; | ||
| 97 | const size_t index; | ||
| 98 | vk::QueryPool query_pool; | ||
| 99 | std::array<u64, BANK_SIZE> host_results; | ||
| 100 | }; | ||
| 101 | |||
| 102 | using BaseStreamer = VideoCommon::SimpleStreamer<VideoCommon::HostQueryBase>; | ||
| 103 | |||
| 104 | struct HostSyncValues { | ||
| 105 | VAddr address; | ||
| 106 | size_t size; | ||
| 107 | size_t offset; | ||
| 108 | |||
| 109 | static constexpr bool GeneratesBaseBuffer = false; | ||
| 110 | }; | ||
| 111 | |||
| 112 | class SamplesStreamer : public BaseStreamer { | ||
| 113 | public: | ||
| 114 | explicit SamplesStreamer(size_t id_, QueryCacheRuntime& runtime_, | ||
| 115 | VideoCore::RasterizerInterface* rasterizer_, const Device& device_, | ||
| 116 | Scheduler& scheduler_, const MemoryAllocator& memory_allocator_, | ||
| 117 | ComputePassDescriptorQueue& compute_pass_descriptor_queue, | ||
| 118 | DescriptorPool& descriptor_pool) | ||
| 119 | : BaseStreamer(id_), runtime{runtime_}, rasterizer{rasterizer_}, device{device_}, | ||
| 120 | scheduler{scheduler_}, memory_allocator{memory_allocator_} { | ||
| 121 | current_bank = nullptr; | ||
| 122 | current_query = nullptr; | ||
| 123 | ammend_value = 0; | ||
| 124 | acumulation_value = 0; | ||
| 125 | queries_prefix_scan_pass = std::make_unique<QueriesPrefixScanPass>( | ||
| 126 | device, scheduler, descriptor_pool, compute_pass_descriptor_queue); | ||
| 127 | |||
| 128 | const VkBufferCreateInfo buffer_ci = { | ||
| 129 | .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, | ||
| 130 | .pNext = nullptr, | ||
| 131 | .flags = 0, | ||
| 132 | .size = 8, | ||
| 133 | .usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | | ||
| 134 | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, | ||
| 135 | .sharingMode = VK_SHARING_MODE_EXCLUSIVE, | ||
| 136 | .queueFamilyIndexCount = 0, | ||
| 137 | .pQueueFamilyIndices = nullptr, | ||
| 138 | }; | ||
| 139 | accumulation_buffer = memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal); | ||
| 140 | scheduler.RequestOutsideRenderPassOperationContext(); | ||
| 141 | scheduler.Record([buffer = *accumulation_buffer](vk::CommandBuffer cmdbuf) { | ||
| 142 | cmdbuf.FillBuffer(buffer, 0, 8, 0); | ||
| 143 | }); | ||
| 144 | } | ||
| 145 | |||
| 146 | ~SamplesStreamer() = default; | ||
| 147 | |||
| 148 | void StartCounter() override { | ||
| 149 | if (has_started) { | ||
| 150 | return; | ||
| 151 | } | ||
| 152 | ReserveHostQuery(); | ||
| 153 | scheduler.Record([query_pool = current_query_pool, | ||
| 154 | query_index = current_bank_slot](vk::CommandBuffer cmdbuf) { | ||
| 155 | const bool use_precise = Settings::IsGPULevelHigh(); | ||
| 156 | cmdbuf.BeginQuery(query_pool, static_cast<u32>(query_index), | ||
| 157 | use_precise ? VK_QUERY_CONTROL_PRECISE_BIT : 0); | ||
| 158 | }); | ||
| 159 | has_started = true; | ||
| 160 | } | ||
| 161 | |||
| 162 | void PauseCounter() override { | ||
| 163 | if (!has_started) { | ||
| 164 | return; | ||
| 165 | } | ||
| 166 | scheduler.Record([query_pool = current_query_pool, | ||
| 167 | query_index = current_bank_slot](vk::CommandBuffer cmdbuf) { | ||
| 168 | cmdbuf.EndQuery(query_pool, static_cast<u32>(query_index)); | ||
| 169 | }); | ||
| 170 | has_started = false; | ||
| 171 | } | ||
| 172 | |||
| 173 | void ResetCounter() override { | ||
| 174 | if (has_started) { | ||
| 175 | PauseCounter(); | ||
| 176 | } | ||
| 177 | AbandonCurrentQuery(); | ||
| 178 | std::function<void()> func([this, counts = pending_flush_queries.size()] { | ||
| 179 | ammend_value = 0; | ||
| 180 | acumulation_value = 0; | ||
| 181 | }); | ||
| 182 | rasterizer->SyncOperation(std::move(func)); | ||
| 183 | accumulation_since_last_sync = false; | ||
| 184 | first_accumulation_checkpoint = std::min(first_accumulation_checkpoint, num_slots_used); | ||
| 185 | last_accumulation_checkpoint = std::max(last_accumulation_checkpoint, num_slots_used); | ||
| 186 | } | ||
| 187 | |||
| 188 | void CloseCounter() override { | ||
| 189 | PauseCounter(); | ||
| 190 | } | ||
| 191 | |||
| 192 | bool HasPendingSync() const override { | ||
| 193 | return !pending_sync.empty(); | ||
| 194 | } | ||
| 195 | |||
| 196 | void SyncWrites() override { | ||
| 197 | if (sync_values_stash.empty()) { | ||
| 198 | return; | ||
| 199 | } | ||
| 200 | |||
| 201 | for (size_t i = 0; i < sync_values_stash.size(); i++) { | ||
| 202 | runtime.template SyncValues<HostSyncValues>(sync_values_stash[i], | ||
| 203 | *buffers[resolve_buffers[i]]); | ||
| 204 | } | ||
| 205 | |||
| 206 | sync_values_stash.clear(); | ||
| 207 | } | ||
| 208 | |||
| 209 | void PresyncWrites() override { | ||
| 210 | if (pending_sync.empty()) { | ||
| 211 | return; | ||
| 212 | } | ||
| 213 | PauseCounter(); | ||
| 214 | sync_values_stash.clear(); | ||
| 215 | sync_values_stash.emplace_back(); | ||
| 216 | std::vector<HostSyncValues>* sync_values = &sync_values_stash.back(); | ||
| 217 | sync_values->reserve(num_slots_used); | ||
| 218 | std::unordered_map<size_t, std::pair<size_t, size_t>> offsets; | ||
| 219 | resolve_buffers.clear(); | ||
| 220 | size_t resolve_buffer_index = ObtainBuffer<true>(num_slots_used); | ||
| 221 | resolve_buffers.push_back(resolve_buffer_index); | ||
| 222 | size_t base_offset = 0; | ||
| 223 | |||
| 224 | ApplyBanksWideOp<true>(pending_sync, [&](SamplesQueryBank* bank, size_t start, | ||
| 225 | size_t amount) { | ||
| 226 | size_t bank_id = bank->GetIndex(); | ||
| 227 | auto& resolve_buffer = buffers[resolve_buffer_index]; | ||
| 228 | VkQueryPool query_pool = bank->GetInnerPool(); | ||
| 229 | scheduler.RequestOutsideRenderPassOperationContext(); | ||
| 230 | scheduler.Record([start, amount, base_offset, query_pool, | ||
| 231 | buffer = *resolve_buffer](vk::CommandBuffer cmdbuf) { | ||
| 232 | const VkBufferMemoryBarrier copy_query_pool_barrier{ | ||
| 233 | .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, | ||
| 234 | .pNext = nullptr, | ||
| 235 | .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, | ||
| 236 | .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT, | ||
| 237 | .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, | ||
| 238 | .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, | ||
| 239 | .buffer = buffer, | ||
| 240 | .offset = base_offset, | ||
| 241 | .size = amount * SamplesQueryBank::QUERY_SIZE, | ||
| 242 | }; | ||
| 243 | |||
| 244 | cmdbuf.CopyQueryPoolResults( | ||
| 245 | query_pool, static_cast<u32>(start), static_cast<u32>(amount), buffer, | ||
| 246 | static_cast<u32>(base_offset), SamplesQueryBank::QUERY_SIZE, | ||
| 247 | VK_QUERY_RESULT_WAIT_BIT | VK_QUERY_RESULT_64_BIT); | ||
| 248 | cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, | ||
| 249 | VK_PIPELINE_STAGE_TRANSFER_BIT, 0, copy_query_pool_barrier); | ||
| 250 | }); | ||
| 251 | offsets[bank_id] = {start, base_offset}; | ||
| 252 | base_offset += amount * SamplesQueryBank::QUERY_SIZE; | ||
| 253 | }); | ||
| 254 | |||
| 255 | // Convert queries | ||
| 256 | bool has_multi_queries = false; | ||
| 257 | for (auto q : pending_sync) { | ||
| 258 | auto* query = GetQuery(q); | ||
| 259 | size_t sync_value_slot = 0; | ||
| 260 | if (True(query->flags & VideoCommon::QueryFlagBits::IsRewritten)) { | ||
| 261 | continue; | ||
| 262 | } | ||
| 263 | if (True(query->flags & VideoCommon::QueryFlagBits::IsInvalidated)) { | ||
| 264 | continue; | ||
| 265 | } | ||
| 266 | if (accumulation_since_last_sync || query->size_slots > 1) { | ||
| 267 | if (!has_multi_queries) { | ||
| 268 | has_multi_queries = true; | ||
| 269 | sync_values_stash.emplace_back(); | ||
| 270 | } | ||
| 271 | sync_value_slot = 1; | ||
| 272 | } | ||
| 273 | query->flags |= VideoCommon::QueryFlagBits::IsHostSynced; | ||
| 274 | auto loc_data = offsets[query->start_bank_id]; | ||
| 275 | sync_values_stash[sync_value_slot].emplace_back(HostSyncValues{ | ||
| 276 | .address = query->guest_address, | ||
| 277 | .size = SamplesQueryBank::QUERY_SIZE, | ||
| 278 | .offset = | ||
| 279 | loc_data.second + (query->start_slot - loc_data.first + query->size_slots - 1) * | ||
| 280 | SamplesQueryBank::QUERY_SIZE, | ||
| 281 | }); | ||
| 282 | } | ||
| 283 | |||
| 284 | if (has_multi_queries) { | ||
| 285 | size_t intermediary_buffer_index = ObtainBuffer<false>(num_slots_used); | ||
| 286 | resolve_buffers.push_back(intermediary_buffer_index); | ||
| 287 | queries_prefix_scan_pass->Run(*accumulation_buffer, *buffers[intermediary_buffer_index], | ||
| 288 | *buffers[resolve_buffer_index], num_slots_used, | ||
| 289 | std::min(first_accumulation_checkpoint, num_slots_used), | ||
| 290 | last_accumulation_checkpoint); | ||
| 291 | |||
| 292 | } else { | ||
| 293 | scheduler.RequestOutsideRenderPassOperationContext(); | ||
| 294 | scheduler.Record([buffer = *accumulation_buffer](vk::CommandBuffer cmdbuf) { | ||
| 295 | cmdbuf.FillBuffer(buffer, 0, 8, 0); | ||
| 296 | }); | ||
| 297 | } | ||
| 298 | |||
| 299 | ReplicateCurrentQueryIfNeeded(); | ||
| 300 | std::function<void()> func([this] { ammend_value = acumulation_value; }); | ||
| 301 | rasterizer->SyncOperation(std::move(func)); | ||
| 302 | AbandonCurrentQuery(); | ||
| 303 | num_slots_used = 0; | ||
| 304 | first_accumulation_checkpoint = std::numeric_limits<size_t>::max(); | ||
| 305 | last_accumulation_checkpoint = 0; | ||
| 306 | accumulation_since_last_sync = has_multi_queries; | ||
| 307 | pending_sync.clear(); | ||
| 308 | } | ||
| 309 | |||
| 310 | size_t WriteCounter(VAddr address, bool has_timestamp, u32 value, | ||
| 311 | [[maybe_unused]] std::optional<u32> subreport) override { | ||
| 312 | PauseCounter(); | ||
| 313 | auto index = BuildQuery(); | ||
| 314 | auto* new_query = GetQuery(index); | ||
| 315 | new_query->guest_address = address; | ||
| 316 | new_query->value = 0; | ||
| 317 | new_query->flags &= ~VideoCommon::QueryFlagBits::IsOrphan; | ||
| 318 | if (has_timestamp) { | ||
| 319 | new_query->flags |= VideoCommon::QueryFlagBits::HasTimestamp; | ||
| 320 | } | ||
| 321 | if (!current_query) { | ||
| 322 | new_query->flags |= VideoCommon::QueryFlagBits::IsFinalValueSynced; | ||
| 323 | return index; | ||
| 324 | } | ||
| 325 | new_query->start_bank_id = current_query->start_bank_id; | ||
| 326 | new_query->size_banks = current_query->size_banks; | ||
| 327 | new_query->start_slot = current_query->start_slot; | ||
| 328 | new_query->size_slots = current_query->size_slots; | ||
| 329 | ApplyBankOp(new_query, [](SamplesQueryBank* bank, size_t start, size_t amount) { | ||
| 330 | bank->AddReference(amount); | ||
| 331 | }); | ||
| 332 | pending_sync.push_back(index); | ||
| 333 | pending_flush_queries.push_back(index); | ||
| 334 | return index; | ||
| 335 | } | ||
| 336 | |||
| 337 | bool HasUnsyncedQueries() const override { | ||
| 338 | return !pending_flush_queries.empty(); | ||
| 339 | } | ||
| 340 | |||
| 341 | void PushUnsyncedQueries() override { | ||
| 342 | PauseCounter(); | ||
| 343 | current_bank->Close(); | ||
| 344 | { | ||
| 345 | std::scoped_lock lk(flush_guard); | ||
| 346 | pending_flush_sets.emplace_back(std::move(pending_flush_queries)); | ||
| 347 | } | ||
| 348 | } | ||
| 349 | |||
| 350 | void PopUnsyncedQueries() override { | ||
| 351 | std::vector<size_t> current_flush_queries; | ||
| 352 | { | ||
| 353 | std::scoped_lock lk(flush_guard); | ||
| 354 | current_flush_queries = std::move(pending_flush_sets.front()); | ||
| 355 | pending_flush_sets.pop_front(); | ||
| 356 | } | ||
| 357 | ApplyBanksWideOp<false>( | ||
| 358 | current_flush_queries, | ||
| 359 | [](SamplesQueryBank* bank, size_t start, size_t amount) { bank->Sync(start, amount); }); | ||
| 360 | for (auto q : current_flush_queries) { | ||
| 361 | auto* query = GetQuery(q); | ||
| 362 | u64 total = 0; | ||
| 363 | ApplyBankOp(query, [&total](SamplesQueryBank* bank, size_t start, size_t amount) { | ||
| 364 | const auto& results = bank->GetResults(); | ||
| 365 | for (size_t i = 0; i < amount; i++) { | ||
| 366 | total += results[start + i]; | ||
| 367 | } | ||
| 368 | }); | ||
| 369 | query->value = total; | ||
| 370 | query->flags |= VideoCommon::QueryFlagBits::IsFinalValueSynced; | ||
| 371 | } | ||
| 372 | } | ||
| 373 | |||
| 374 | private: | ||
| 375 | template <typename Func> | ||
| 376 | void ApplyBankOp(VideoCommon::HostQueryBase* query, Func&& func) { | ||
| 377 | size_t size_slots = query->size_slots; | ||
| 378 | if (size_slots == 0) { | ||
| 379 | return; | ||
| 380 | } | ||
| 381 | size_t bank_id = query->start_bank_id; | ||
| 382 | size_t banks_set = query->size_banks; | ||
| 383 | size_t start_slot = query->start_slot; | ||
| 384 | for (size_t i = 0; i < banks_set; i++) { | ||
| 385 | auto& the_bank = bank_pool.GetBank(bank_id); | ||
| 386 | size_t amount = std::min(the_bank.Size() - start_slot, size_slots); | ||
| 387 | func(&the_bank, start_slot, amount); | ||
| 388 | bank_id = the_bank.next_bank - 1; | ||
| 389 | start_slot = 0; | ||
| 390 | size_slots -= amount; | ||
| 391 | } | ||
| 392 | } | ||
| 393 | |||
| 394 | template <bool is_ordered, typename Func> | ||
| 395 | void ApplyBanksWideOp(std::vector<size_t>& queries, Func&& func) { | ||
| 396 | std::conditional_t<is_ordered, std::map<size_t, std::pair<size_t, size_t>>, | ||
| 397 | std::unordered_map<size_t, std::pair<size_t, size_t>>> | ||
| 398 | indexer; | ||
| 399 | for (auto q : queries) { | ||
| 400 | auto* query = GetQuery(q); | ||
| 401 | ApplyBankOp(query, [&indexer](SamplesQueryBank* bank, size_t start, size_t amount) { | ||
| 402 | auto id_ = bank->GetIndex(); | ||
| 403 | auto pair = indexer.try_emplace(id_, std::numeric_limits<size_t>::max(), | ||
| 404 | std::numeric_limits<size_t>::min()); | ||
| 405 | auto& current_pair = pair.first->second; | ||
| 406 | current_pair.first = std::min(current_pair.first, start); | ||
| 407 | current_pair.second = std::max(current_pair.second, amount + start); | ||
| 408 | }); | ||
| 409 | } | ||
| 410 | for (auto& cont : indexer) { | ||
| 411 | func(&bank_pool.GetBank(cont.first), cont.second.first, | ||
| 412 | cont.second.second - cont.second.first); | ||
| 413 | } | ||
| 414 | } | ||
| 415 | |||
| 416 | void ReserveBank() { | ||
| 417 | current_bank_id = | ||
| 418 | bank_pool.ReserveBank([this](std::deque<SamplesQueryBank>& queue, size_t index) { | ||
| 419 | queue.emplace_back(device, index); | ||
| 420 | }); | ||
| 421 | if (current_bank) { | ||
| 422 | current_bank->next_bank = current_bank_id + 1; | ||
| 423 | } | ||
| 424 | current_bank = &bank_pool.GetBank(current_bank_id); | ||
| 425 | current_query_pool = current_bank->GetInnerPool(); | ||
| 426 | } | ||
| 427 | |||
| 428 | size_t ReserveBankSlot() { | ||
| 429 | if (!current_bank || current_bank->IsClosed()) { | ||
| 430 | ReserveBank(); | ||
| 431 | } | ||
| 432 | auto [built, index] = current_bank->Reserve(); | ||
| 433 | current_bank_slot = index; | ||
| 434 | return index; | ||
| 435 | } | ||
| 436 | |||
| 437 | void ReserveHostQuery() { | ||
| 438 | size_t new_slot = ReserveBankSlot(); | ||
| 439 | current_bank->AddReference(1); | ||
| 440 | num_slots_used++; | ||
| 441 | if (current_query) { | ||
| 442 | size_t bank_id = current_query->start_bank_id; | ||
| 443 | size_t banks_set = current_query->size_banks - 1; | ||
| 444 | bool found = bank_id == current_bank_id; | ||
| 445 | while (!found && banks_set > 0) { | ||
| 446 | SamplesQueryBank& some_bank = bank_pool.GetBank(bank_id); | ||
| 447 | bank_id = some_bank.next_bank - 1; | ||
| 448 | found = bank_id == current_bank_id; | ||
| 449 | banks_set--; | ||
| 450 | } | ||
| 451 | if (!found) { | ||
| 452 | current_query->size_banks++; | ||
| 453 | } | ||
| 454 | current_query->size_slots++; | ||
| 455 | } else { | ||
| 456 | current_query_id = BuildQuery(); | ||
| 457 | current_query = GetQuery(current_query_id); | ||
| 458 | current_query->start_bank_id = static_cast<u32>(current_bank_id); | ||
| 459 | current_query->size_banks = 1; | ||
| 460 | current_query->start_slot = new_slot; | ||
| 461 | current_query->size_slots = 1; | ||
| 462 | } | ||
| 463 | } | ||
| 464 | |||
| 465 | void Free(size_t query_id) override { | ||
| 466 | std::scoped_lock lk(guard); | ||
| 467 | auto* query = GetQuery(query_id); | ||
| 468 | ApplyBankOp(query, [](SamplesQueryBank* bank, size_t start, size_t amount) { | ||
| 469 | bank->CloseReference(amount); | ||
| 470 | }); | ||
| 471 | ReleaseQuery(query_id); | ||
| 472 | } | ||
| 473 | |||
| 474 | void AbandonCurrentQuery() { | ||
| 475 | if (!current_query) { | ||
| 476 | return; | ||
| 477 | } | ||
| 478 | Free(current_query_id); | ||
| 479 | current_query = nullptr; | ||
| 480 | current_query_id = 0; | ||
| 481 | } | ||
| 482 | |||
| 483 | void ReplicateCurrentQueryIfNeeded() { | ||
| 484 | if (pending_sync.empty()) { | ||
| 485 | return; | ||
| 486 | } | ||
| 487 | if (!current_query) { | ||
| 488 | return; | ||
| 489 | } | ||
| 490 | auto index = BuildQuery(); | ||
| 491 | auto* new_query = GetQuery(index); | ||
| 492 | new_query->guest_address = 0; | ||
| 493 | new_query->value = 0; | ||
| 494 | new_query->flags &= ~VideoCommon::QueryFlagBits::IsOrphan; | ||
| 495 | new_query->start_bank_id = current_query->start_bank_id; | ||
| 496 | new_query->size_banks = current_query->size_banks; | ||
| 497 | new_query->start_slot = current_query->start_slot; | ||
| 498 | new_query->size_slots = current_query->size_slots; | ||
| 499 | ApplyBankOp(new_query, [](SamplesQueryBank* bank, size_t start, size_t amount) { | ||
| 500 | bank->AddReference(amount); | ||
| 501 | }); | ||
| 502 | pending_flush_queries.push_back(index); | ||
| 503 | std::function<void()> func([this, index] { | ||
| 504 | auto* query = GetQuery(index); | ||
| 505 | query->value += GetAmmendValue(); | ||
| 506 | SetAccumulationValue(query->value); | ||
| 507 | Free(index); | ||
| 508 | }); | ||
| 509 | } | ||
| 510 | |||
| 511 | template <bool is_resolve> | ||
| 512 | size_t ObtainBuffer(size_t num_needed) { | ||
| 513 | const size_t log_2 = std::max<size_t>(11U, Common::Log2Ceil64(num_needed)); | ||
| 514 | if constexpr (is_resolve) { | ||
| 515 | if (resolve_table[log_2] != 0) { | ||
| 516 | return resolve_table[log_2] - 1; | ||
| 517 | } | ||
| 518 | } else { | ||
| 519 | if (intermediary_table[log_2] != 0) { | ||
| 520 | return intermediary_table[log_2] - 1; | ||
| 521 | } | ||
| 522 | } | ||
| 523 | const VkBufferCreateInfo buffer_ci = { | ||
| 524 | .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, | ||
| 525 | .pNext = nullptr, | ||
| 526 | .flags = 0, | ||
| 527 | .size = SamplesQueryBank::QUERY_SIZE * (1ULL << log_2), | ||
| 528 | .usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | | ||
| 529 | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, | ||
| 530 | .sharingMode = VK_SHARING_MODE_EXCLUSIVE, | ||
| 531 | .queueFamilyIndexCount = 0, | ||
| 532 | .pQueueFamilyIndices = nullptr, | ||
| 533 | }; | ||
| 534 | buffers.emplace_back(memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal)); | ||
| 535 | if constexpr (is_resolve) { | ||
| 536 | resolve_table[log_2] = buffers.size(); | ||
| 537 | } else { | ||
| 538 | intermediary_table[log_2] = buffers.size(); | ||
| 539 | } | ||
| 540 | return buffers.size() - 1; | ||
| 541 | } | ||
| 542 | |||
| 543 | QueryCacheRuntime& runtime; | ||
| 544 | VideoCore::RasterizerInterface* rasterizer; | ||
| 545 | const Device& device; | ||
| 546 | Scheduler& scheduler; | ||
| 547 | const MemoryAllocator& memory_allocator; | ||
| 548 | VideoCommon::BankPool<SamplesQueryBank> bank_pool; | ||
| 549 | std::deque<vk::Buffer> buffers; | ||
| 550 | std::array<size_t, 32> resolve_table{}; | ||
| 551 | std::array<size_t, 32> intermediary_table{}; | ||
| 552 | vk::Buffer accumulation_buffer; | ||
| 553 | std::deque<std::vector<HostSyncValues>> sync_values_stash; | ||
| 554 | std::vector<size_t> resolve_buffers; | ||
| 555 | |||
| 556 | // syncing queue | ||
| 557 | std::vector<size_t> pending_sync; | ||
| 558 | |||
| 559 | // flush levels | ||
| 560 | std::vector<size_t> pending_flush_queries; | ||
| 561 | std::deque<std::vector<size_t>> pending_flush_sets; | ||
| 562 | |||
| 563 | // State Machine | ||
| 564 | size_t current_bank_slot; | ||
| 565 | size_t current_bank_id; | ||
| 566 | SamplesQueryBank* current_bank; | ||
| 567 | VkQueryPool current_query_pool; | ||
| 568 | size_t current_query_id; | ||
| 569 | size_t num_slots_used{}; | ||
| 570 | size_t first_accumulation_checkpoint{}; | ||
| 571 | size_t last_accumulation_checkpoint{}; | ||
| 572 | bool accumulation_since_last_sync{}; | ||
| 573 | VideoCommon::HostQueryBase* current_query; | ||
| 574 | bool has_started{}; | ||
| 575 | std::mutex flush_guard; | ||
| 576 | |||
| 577 | std::unique_ptr<QueriesPrefixScanPass> queries_prefix_scan_pass; | ||
| 578 | }; | ||
| 579 | |||
| 580 | // Transform feedback queries | ||
| 581 | class TFBQueryBank : public VideoCommon::BankBase { | ||
| 582 | public: | ||
| 583 | static constexpr size_t BANK_SIZE = 1024; | ||
| 584 | static constexpr size_t QUERY_SIZE = 4; | ||
| 585 | explicit TFBQueryBank(Scheduler& scheduler_, const MemoryAllocator& memory_allocator, | ||
| 586 | size_t index_) | ||
| 587 | : BankBase(BANK_SIZE), scheduler{scheduler_}, index{index_} { | ||
| 588 | const VkBufferCreateInfo buffer_ci = { | ||
| 589 | .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, | ||
| 590 | .pNext = nullptr, | ||
| 591 | .flags = 0, | ||
| 592 | .size = QUERY_SIZE * BANK_SIZE, | ||
| 593 | .usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, | ||
| 594 | .sharingMode = VK_SHARING_MODE_EXCLUSIVE, | ||
| 595 | .queueFamilyIndexCount = 0, | ||
| 596 | .pQueueFamilyIndices = nullptr, | ||
| 597 | }; | ||
| 598 | buffer = memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal); | ||
| 599 | } | ||
| 600 | |||
| 601 | ~TFBQueryBank() = default; | ||
| 602 | |||
| 603 | void Reset() override { | ||
| 604 | ASSERT(references == 0); | ||
| 605 | VideoCommon::BankBase::Reset(); | ||
| 606 | } | ||
| 607 | |||
| 608 | void Sync(StagingBufferRef& stagging_buffer, size_t extra_offset, size_t start, size_t size) { | ||
| 609 | scheduler.RequestOutsideRenderPassOperationContext(); | ||
| 610 | scheduler.Record([this, dst_buffer = stagging_buffer.buffer, extra_offset, start, | ||
| 611 | size](vk::CommandBuffer cmdbuf) { | ||
| 612 | std::array<VkBufferCopy, 1> copy{VkBufferCopy{ | ||
| 613 | .srcOffset = start * QUERY_SIZE, | ||
| 614 | .dstOffset = extra_offset, | ||
| 615 | .size = size * QUERY_SIZE, | ||
| 616 | }}; | ||
| 617 | cmdbuf.CopyBuffer(*buffer, dst_buffer, copy); | ||
| 618 | }); | ||
| 619 | } | ||
| 620 | |||
| 621 | size_t GetIndex() const { | ||
| 622 | return index; | ||
| 623 | } | ||
| 624 | |||
| 625 | VkBuffer GetBuffer() const { | ||
| 626 | return *buffer; | ||
| 627 | } | ||
| 628 | |||
| 629 | private: | ||
| 630 | Scheduler& scheduler; | ||
| 631 | const size_t index; | ||
| 632 | vk::Buffer buffer; | ||
| 633 | }; | ||
| 634 | |||
| 635 | class PrimitivesSucceededStreamer; | ||
| 636 | |||
| 637 | class TFBCounterStreamer : public BaseStreamer { | ||
| 638 | public: | ||
| 639 | explicit TFBCounterStreamer(size_t id_, QueryCacheRuntime& runtime_, const Device& device_, | ||
| 640 | Scheduler& scheduler_, const MemoryAllocator& memory_allocator_, | ||
| 641 | StagingBufferPool& staging_pool_) | ||
| 642 | : BaseStreamer(id_), runtime{runtime_}, device{device_}, scheduler{scheduler_}, | ||
| 643 | memory_allocator{memory_allocator_}, staging_pool{staging_pool_} { | ||
| 644 | buffers_count = 0; | ||
| 645 | current_bank = nullptr; | ||
| 646 | counter_buffers.fill(VK_NULL_HANDLE); | ||
| 647 | offsets.fill(0); | ||
| 648 | last_queries.fill(0); | ||
| 649 | last_queries_stride.fill(1); | ||
| 650 | const VkBufferCreateInfo buffer_ci = { | ||
| 651 | .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, | ||
| 652 | .pNext = nullptr, | ||
| 653 | .flags = 0, | ||
| 654 | .size = TFBQueryBank::QUERY_SIZE * NUM_STREAMS, | ||
| 655 | .usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | | ||
| 656 | VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_COUNTER_BUFFER_BIT_EXT, | ||
| 657 | .sharingMode = VK_SHARING_MODE_EXCLUSIVE, | ||
| 658 | .queueFamilyIndexCount = 0, | ||
| 659 | .pQueueFamilyIndices = nullptr, | ||
| 660 | }; | ||
| 661 | |||
| 662 | counters_buffer = memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal); | ||
| 663 | for (auto& c : counter_buffers) { | ||
| 664 | c = *counters_buffer; | ||
| 665 | } | ||
| 666 | size_t base_offset = 0; | ||
| 667 | for (auto& o : offsets) { | ||
| 668 | o = base_offset; | ||
| 669 | base_offset += TFBQueryBank::QUERY_SIZE; | ||
| 670 | } | ||
| 671 | } | ||
| 672 | |||
| 673 | ~TFBCounterStreamer() = default; | ||
| 674 | |||
| 675 | void StartCounter() override { | ||
| 676 | FlushBeginTFB(); | ||
| 677 | has_started = true; | ||
| 678 | } | ||
| 679 | |||
| 680 | void PauseCounter() override { | ||
| 681 | CloseCounter(); | ||
| 682 | } | ||
| 683 | |||
| 684 | void ResetCounter() override { | ||
| 685 | CloseCounter(); | ||
| 686 | } | ||
| 687 | |||
| 688 | void CloseCounter() override { | ||
| 689 | if (has_flushed_end_pending) { | ||
| 690 | FlushEndTFB(); | ||
| 691 | } | ||
| 692 | runtime.View3DRegs([this](Maxwell3D& maxwell3d) { | ||
| 693 | if (maxwell3d.regs.transform_feedback_enabled == 0) { | ||
| 694 | streams_mask = 0; | ||
| 695 | has_started = false; | ||
| 696 | } | ||
| 697 | }); | ||
| 698 | } | ||
| 699 | |||
| 700 | bool HasPendingSync() const override { | ||
| 701 | return !pending_sync.empty(); | ||
| 702 | } | ||
| 703 | |||
| 704 | void SyncWrites() override { | ||
| 705 | CloseCounter(); | ||
| 706 | std::unordered_map<size_t, std::vector<HostSyncValues>> sync_values_stash; | ||
| 707 | for (auto q : pending_sync) { | ||
| 708 | auto* query = GetQuery(q); | ||
| 709 | if (True(query->flags & VideoCommon::QueryFlagBits::IsRewritten)) { | ||
| 710 | continue; | ||
| 711 | } | ||
| 712 | if (True(query->flags & VideoCommon::QueryFlagBits::IsInvalidated)) { | ||
| 713 | continue; | ||
| 714 | } | ||
| 715 | query->flags |= VideoCommon::QueryFlagBits::IsHostSynced; | ||
| 716 | sync_values_stash.try_emplace(query->start_bank_id); | ||
| 717 | sync_values_stash[query->start_bank_id].emplace_back(HostSyncValues{ | ||
| 718 | .address = query->guest_address, | ||
| 719 | .size = TFBQueryBank::QUERY_SIZE, | ||
| 720 | .offset = query->start_slot * TFBQueryBank::QUERY_SIZE, | ||
| 721 | }); | ||
| 722 | } | ||
| 723 | for (auto& p : sync_values_stash) { | ||
| 724 | auto& bank = bank_pool.GetBank(p.first); | ||
| 725 | runtime.template SyncValues<HostSyncValues>(p.second, bank.GetBuffer()); | ||
| 726 | } | ||
| 727 | pending_sync.clear(); | ||
| 728 | } | ||
| 26 | 729 | ||
| 27 | } // Anonymous namespace | 730 | size_t WriteCounter(VAddr address, bool has_timestamp, u32 value, |
| 731 | std::optional<u32> subreport_) override { | ||
| 732 | auto index = BuildQuery(); | ||
| 733 | auto* new_query = GetQuery(index); | ||
| 734 | new_query->guest_address = address; | ||
| 735 | new_query->value = 0; | ||
| 736 | new_query->flags &= ~VideoCommon::QueryFlagBits::IsOrphan; | ||
| 737 | if (has_timestamp) { | ||
| 738 | new_query->flags |= VideoCommon::QueryFlagBits::HasTimestamp; | ||
| 739 | } | ||
| 740 | if (!subreport_) { | ||
| 741 | new_query->flags |= VideoCommon::QueryFlagBits::IsFinalValueSynced; | ||
| 742 | return index; | ||
| 743 | } | ||
| 744 | const size_t subreport = static_cast<size_t>(*subreport_); | ||
| 745 | last_queries[subreport] = address; | ||
| 746 | if ((streams_mask & (1ULL << subreport)) == 0) { | ||
| 747 | new_query->flags |= VideoCommon::QueryFlagBits::IsFinalValueSynced; | ||
| 748 | return index; | ||
| 749 | } | ||
| 750 | CloseCounter(); | ||
| 751 | auto [bank_slot, data_slot] = ProduceCounterBuffer(subreport); | ||
| 752 | new_query->start_bank_id = static_cast<u32>(bank_slot); | ||
| 753 | new_query->size_banks = 1; | ||
| 754 | new_query->start_slot = static_cast<u32>(data_slot); | ||
| 755 | new_query->size_slots = 1; | ||
| 756 | pending_sync.push_back(index); | ||
| 757 | pending_flush_queries.push_back(index); | ||
| 758 | return index; | ||
| 759 | } | ||
| 760 | |||
| 761 | std::optional<std::pair<VAddr, size_t>> GetLastQueryStream(size_t stream) { | ||
| 762 | if (last_queries[stream] != 0) { | ||
| 763 | std::pair<VAddr, size_t> result(last_queries[stream], last_queries_stride[stream]); | ||
| 764 | return result; | ||
| 765 | } | ||
| 766 | return std::nullopt; | ||
| 767 | } | ||
| 768 | |||
| 769 | Maxwell3D::Regs::PrimitiveTopology GetOutputTopology() const { | ||
| 770 | return out_topology; | ||
| 771 | } | ||
| 772 | |||
| 773 | bool HasUnsyncedQueries() const override { | ||
| 774 | return !pending_flush_queries.empty(); | ||
| 775 | } | ||
| 776 | |||
| 777 | void PushUnsyncedQueries() override { | ||
| 778 | CloseCounter(); | ||
| 779 | auto staging_ref = staging_pool.Request( | ||
| 780 | pending_flush_queries.size() * TFBQueryBank::QUERY_SIZE, MemoryUsage::Download, true); | ||
| 781 | size_t offset_base = staging_ref.offset; | ||
| 782 | for (auto q : pending_flush_queries) { | ||
| 783 | auto* query = GetQuery(q); | ||
| 784 | auto& bank = bank_pool.GetBank(query->start_bank_id); | ||
| 785 | bank.Sync(staging_ref, offset_base, query->start_slot, 1); | ||
| 786 | offset_base += TFBQueryBank::QUERY_SIZE; | ||
| 787 | bank.CloseReference(); | ||
| 788 | } | ||
| 789 | static constexpr VkMemoryBarrier WRITE_BARRIER{ | ||
| 790 | .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, | ||
| 791 | .pNext = nullptr, | ||
| 792 | .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, | ||
| 793 | .dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT, | ||
| 794 | }; | ||
| 795 | scheduler.RequestOutsideRenderPassOperationContext(); | ||
| 796 | scheduler.Record([](vk::CommandBuffer cmdbuf) { | ||
| 797 | cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, | ||
| 798 | VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, WRITE_BARRIER); | ||
| 799 | }); | ||
| 800 | |||
| 801 | std::scoped_lock lk(flush_guard); | ||
| 802 | for (auto& str : free_queue) { | ||
| 803 | staging_pool.FreeDeferred(str); | ||
| 804 | } | ||
| 805 | free_queue.clear(); | ||
| 806 | download_buffers.emplace_back(staging_ref); | ||
| 807 | pending_flush_sets.emplace_back(std::move(pending_flush_queries)); | ||
| 808 | } | ||
| 809 | |||
| 810 | void PopUnsyncedQueries() override { | ||
| 811 | StagingBufferRef staging_ref; | ||
| 812 | std::vector<size_t> flushed_queries; | ||
| 813 | { | ||
| 814 | std::scoped_lock lk(flush_guard); | ||
| 815 | staging_ref = download_buffers.front(); | ||
| 816 | flushed_queries = std::move(pending_flush_sets.front()); | ||
| 817 | download_buffers.pop_front(); | ||
| 818 | pending_flush_sets.pop_front(); | ||
| 819 | } | ||
| 820 | |||
| 821 | size_t offset_base = staging_ref.offset; | ||
| 822 | for (auto q : flushed_queries) { | ||
| 823 | auto* query = GetQuery(q); | ||
| 824 | u32 result = 0; | ||
| 825 | std::memcpy(&result, staging_ref.mapped_span.data() + offset_base, sizeof(u32)); | ||
| 826 | query->value = static_cast<u64>(result); | ||
| 827 | query->flags |= VideoCommon::QueryFlagBits::IsFinalValueSynced; | ||
| 828 | offset_base += TFBQueryBank::QUERY_SIZE; | ||
| 829 | } | ||
| 830 | |||
| 831 | { | ||
| 832 | std::scoped_lock lk(flush_guard); | ||
| 833 | free_queue.emplace_back(staging_ref); | ||
| 834 | } | ||
| 835 | } | ||
| 836 | |||
| 837 | private: | ||
| 838 | void FlushBeginTFB() { | ||
| 839 | if (has_flushed_end_pending) [[unlikely]] { | ||
| 840 | return; | ||
| 841 | } | ||
| 842 | has_flushed_end_pending = true; | ||
| 843 | if (!has_started || buffers_count == 0) { | ||
| 844 | scheduler.Record([](vk::CommandBuffer cmdbuf) { | ||
| 845 | cmdbuf.BeginTransformFeedbackEXT(0, 0, nullptr, nullptr); | ||
| 846 | }); | ||
| 847 | UpdateBuffers(); | ||
| 848 | return; | ||
| 849 | } | ||
| 850 | scheduler.Record([this, total = static_cast<u32>(buffers_count)](vk::CommandBuffer cmdbuf) { | ||
| 851 | cmdbuf.BeginTransformFeedbackEXT(0, total, counter_buffers.data(), offsets.data()); | ||
| 852 | }); | ||
| 853 | UpdateBuffers(); | ||
| 854 | } | ||
| 855 | |||
| 856 | void FlushEndTFB() { | ||
| 857 | if (!has_flushed_end_pending) [[unlikely]] { | ||
| 858 | UNREACHABLE(); | ||
| 859 | return; | ||
| 860 | } | ||
| 861 | has_flushed_end_pending = false; | ||
| 862 | |||
| 863 | if (buffers_count == 0) { | ||
| 864 | scheduler.Record([](vk::CommandBuffer cmdbuf) { | ||
| 865 | cmdbuf.EndTransformFeedbackEXT(0, 0, nullptr, nullptr); | ||
| 866 | }); | ||
| 867 | } else { | ||
| 868 | scheduler.Record([this, | ||
| 869 | total = static_cast<u32>(buffers_count)](vk::CommandBuffer cmdbuf) { | ||
| 870 | cmdbuf.EndTransformFeedbackEXT(0, total, counter_buffers.data(), offsets.data()); | ||
| 871 | }); | ||
| 872 | } | ||
| 873 | } | ||
| 874 | |||
| 875 | void UpdateBuffers() { | ||
| 876 | last_queries.fill(0); | ||
| 877 | last_queries_stride.fill(1); | ||
| 878 | runtime.View3DRegs([this](Maxwell3D& maxwell3d) { | ||
| 879 | buffers_count = 0; | ||
| 880 | out_topology = maxwell3d.draw_manager->GetDrawState().topology; | ||
| 881 | for (size_t i = 0; i < Maxwell3D::Regs::NumTransformFeedbackBuffers; i++) { | ||
| 882 | const auto& tf = maxwell3d.regs.transform_feedback; | ||
| 883 | if (tf.buffers[i].enable == 0) { | ||
| 884 | continue; | ||
| 885 | } | ||
| 886 | const size_t stream = tf.controls[i].stream; | ||
| 887 | last_queries_stride[stream] = tf.controls[i].stride; | ||
| 888 | streams_mask |= 1ULL << stream; | ||
| 889 | buffers_count = std::max<size_t>(buffers_count, stream + 1); | ||
| 890 | } | ||
| 891 | }); | ||
| 892 | } | ||
| 893 | |||
| 894 | std::pair<size_t, size_t> ProduceCounterBuffer(size_t stream) { | ||
| 895 | if (current_bank == nullptr || current_bank->IsClosed()) { | ||
| 896 | current_bank_id = | ||
| 897 | bank_pool.ReserveBank([this](std::deque<TFBQueryBank>& queue, size_t index) { | ||
| 898 | queue.emplace_back(scheduler, memory_allocator, index); | ||
| 899 | }); | ||
| 900 | current_bank = &bank_pool.GetBank(current_bank_id); | ||
| 901 | } | ||
| 902 | auto [dont_care, other] = current_bank->Reserve(); | ||
| 903 | const size_t slot = other; // workaround to compile bug. | ||
| 904 | current_bank->AddReference(); | ||
| 905 | |||
| 906 | static constexpr VkMemoryBarrier READ_BARRIER{ | ||
| 907 | .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, | ||
| 908 | .pNext = nullptr, | ||
| 909 | .srcAccessMask = VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT, | ||
| 910 | .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT, | ||
| 911 | }; | ||
| 912 | static constexpr VkMemoryBarrier WRITE_BARRIER{ | ||
| 913 | .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, | ||
| 914 | .pNext = nullptr, | ||
| 915 | .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, | ||
| 916 | .dstAccessMask = VK_ACCESS_MEMORY_READ_BIT, | ||
| 917 | }; | ||
| 918 | scheduler.RequestOutsideRenderPassOperationContext(); | ||
| 919 | scheduler.Record([dst_buffer = current_bank->GetBuffer(), | ||
| 920 | src_buffer = counter_buffers[stream], src_offset = offsets[stream], | ||
| 921 | slot](vk::CommandBuffer cmdbuf) { | ||
| 922 | cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT, | ||
| 923 | VK_PIPELINE_STAGE_TRANSFER_BIT, 0, READ_BARRIER); | ||
| 924 | std::array<VkBufferCopy, 1> copy{VkBufferCopy{ | ||
| 925 | .srcOffset = src_offset, | ||
| 926 | .dstOffset = slot * TFBQueryBank::QUERY_SIZE, | ||
| 927 | .size = TFBQueryBank::QUERY_SIZE, | ||
| 928 | }}; | ||
| 929 | cmdbuf.CopyBuffer(src_buffer, dst_buffer, copy); | ||
| 930 | cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, | ||
| 931 | 0, WRITE_BARRIER); | ||
| 932 | }); | ||
| 933 | return {current_bank_id, slot}; | ||
| 934 | } | ||
| 935 | |||
| 936 | friend class PrimitivesSucceededStreamer; | ||
| 937 | |||
| 938 | static constexpr size_t NUM_STREAMS = 4; | ||
| 939 | |||
| 940 | QueryCacheRuntime& runtime; | ||
| 941 | const Device& device; | ||
| 942 | Scheduler& scheduler; | ||
| 943 | const MemoryAllocator& memory_allocator; | ||
| 944 | StagingBufferPool& staging_pool; | ||
| 945 | VideoCommon::BankPool<TFBQueryBank> bank_pool; | ||
| 946 | size_t current_bank_id; | ||
| 947 | TFBQueryBank* current_bank; | ||
| 948 | vk::Buffer counters_buffer; | ||
| 949 | |||
| 950 | // syncing queue | ||
| 951 | std::vector<size_t> pending_sync; | ||
| 952 | |||
| 953 | // flush levels | ||
| 954 | std::vector<size_t> pending_flush_queries; | ||
| 955 | std::deque<StagingBufferRef> download_buffers; | ||
| 956 | std::deque<std::vector<size_t>> pending_flush_sets; | ||
| 957 | std::vector<StagingBufferRef> free_queue; | ||
| 958 | std::mutex flush_guard; | ||
| 959 | |||
| 960 | // state machine | ||
| 961 | bool has_started{}; | ||
| 962 | bool has_flushed_end_pending{}; | ||
| 963 | size_t buffers_count{}; | ||
| 964 | std::array<VkBuffer, NUM_STREAMS> counter_buffers{}; | ||
| 965 | std::array<VkDeviceSize, NUM_STREAMS> offsets{}; | ||
| 966 | std::array<VAddr, NUM_STREAMS> last_queries; | ||
| 967 | std::array<size_t, NUM_STREAMS> last_queries_stride; | ||
| 968 | Maxwell3D::Regs::PrimitiveTopology out_topology; | ||
| 969 | u64 streams_mask; | ||
| 970 | }; | ||
| 971 | |||
| 972 | class PrimitivesQueryBase : public VideoCommon::QueryBase { | ||
| 973 | public: | ||
| 974 | // Default constructor | ||
| 975 | PrimitivesQueryBase() | ||
| 976 | : VideoCommon::QueryBase(0, VideoCommon::QueryFlagBits::IsHostManaged, 0) {} | ||
| 977 | |||
| 978 | // Parameterized constructor | ||
| 979 | PrimitivesQueryBase(bool has_timestamp, VAddr address) | ||
| 980 | : VideoCommon::QueryBase(address, VideoCommon::QueryFlagBits::IsHostManaged, 0) { | ||
| 981 | if (has_timestamp) { | ||
| 982 | flags |= VideoCommon::QueryFlagBits::HasTimestamp; | ||
| 983 | } | ||
| 984 | } | ||
| 985 | |||
| 986 | u64 stride{}; | ||
| 987 | VAddr dependant_address{}; | ||
| 988 | Maxwell3D::Regs::PrimitiveTopology topology{Maxwell3D::Regs::PrimitiveTopology::Points}; | ||
| 989 | size_t dependant_index{}; | ||
| 990 | bool dependant_manage{}; | ||
| 991 | }; | ||
| 992 | |||
| 993 | class PrimitivesSucceededStreamer : public VideoCommon::SimpleStreamer<PrimitivesQueryBase> { | ||
| 994 | public: | ||
| 995 | explicit PrimitivesSucceededStreamer(size_t id_, QueryCacheRuntime& runtime_, | ||
| 996 | TFBCounterStreamer& tfb_streamer_, | ||
| 997 | Core::Memory::Memory& cpu_memory_) | ||
| 998 | : VideoCommon::SimpleStreamer<PrimitivesQueryBase>(id_), runtime{runtime_}, | ||
| 999 | tfb_streamer{tfb_streamer_}, cpu_memory{cpu_memory_} { | ||
| 1000 | MakeDependent(&tfb_streamer); | ||
| 1001 | } | ||
| 1002 | |||
| 1003 | ~PrimitivesSucceededStreamer() = default; | ||
| 1004 | |||
| 1005 | size_t WriteCounter(VAddr address, bool has_timestamp, u32 value, | ||
| 1006 | std::optional<u32> subreport_) override { | ||
| 1007 | auto index = BuildQuery(); | ||
| 1008 | auto* new_query = GetQuery(index); | ||
| 1009 | new_query->guest_address = address; | ||
| 1010 | new_query->value = 0; | ||
| 1011 | if (has_timestamp) { | ||
| 1012 | new_query->flags |= VideoCommon::QueryFlagBits::HasTimestamp; | ||
| 1013 | } | ||
| 1014 | if (!subreport_) { | ||
| 1015 | new_query->flags |= VideoCommon::QueryFlagBits::IsFinalValueSynced; | ||
| 1016 | return index; | ||
| 1017 | } | ||
| 1018 | const size_t subreport = static_cast<size_t>(*subreport_); | ||
| 1019 | auto dependant_address_opt = tfb_streamer.GetLastQueryStream(subreport); | ||
| 1020 | bool must_manage_dependance = false; | ||
| 1021 | new_query->topology = tfb_streamer.GetOutputTopology(); | ||
| 1022 | if (dependant_address_opt) { | ||
| 1023 | auto [dep_address, stride] = *dependant_address_opt; | ||
| 1024 | new_query->dependant_address = dep_address; | ||
| 1025 | new_query->stride = stride; | ||
| 1026 | } else { | ||
| 1027 | new_query->dependant_index = | ||
| 1028 | tfb_streamer.WriteCounter(address, has_timestamp, value, subreport_); | ||
| 1029 | auto* dependant_query = tfb_streamer.GetQuery(new_query->dependant_index); | ||
| 1030 | dependant_query->flags |= VideoCommon::QueryFlagBits::IsInvalidated; | ||
| 1031 | must_manage_dependance = true; | ||
| 1032 | if (True(dependant_query->flags & VideoCommon::QueryFlagBits::IsFinalValueSynced)) { | ||
| 1033 | new_query->value = 0; | ||
| 1034 | new_query->flags |= VideoCommon::QueryFlagBits::IsFinalValueSynced; | ||
| 1035 | if (must_manage_dependance) { | ||
| 1036 | tfb_streamer.Free(new_query->dependant_index); | ||
| 1037 | } | ||
| 1038 | return index; | ||
| 1039 | } | ||
| 1040 | new_query->stride = 1; | ||
| 1041 | runtime.View3DRegs([new_query, subreport](Maxwell3D& maxwell3d) { | ||
| 1042 | for (size_t i = 0; i < Maxwell3D::Regs::NumTransformFeedbackBuffers; i++) { | ||
| 1043 | const auto& tf = maxwell3d.regs.transform_feedback; | ||
| 1044 | if (tf.buffers[i].enable == 0) { | ||
| 1045 | continue; | ||
| 1046 | } | ||
| 1047 | if (tf.controls[i].stream != subreport) { | ||
| 1048 | continue; | ||
| 1049 | } | ||
| 1050 | new_query->stride = tf.controls[i].stride; | ||
| 1051 | break; | ||
| 1052 | } | ||
| 1053 | }); | ||
| 1054 | } | ||
| 1055 | |||
| 1056 | new_query->dependant_manage = must_manage_dependance; | ||
| 1057 | pending_flush_queries.push_back(index); | ||
| 1058 | return index; | ||
| 1059 | } | ||
| 1060 | |||
| 1061 | bool HasUnsyncedQueries() const override { | ||
| 1062 | return !pending_flush_queries.empty(); | ||
| 1063 | } | ||
| 1064 | |||
| 1065 | void PushUnsyncedQueries() override { | ||
| 1066 | std::scoped_lock lk(flush_guard); | ||
| 1067 | pending_flush_sets.emplace_back(std::move(pending_flush_queries)); | ||
| 1068 | pending_flush_queries.clear(); | ||
| 1069 | } | ||
| 1070 | |||
| 1071 | void PopUnsyncedQueries() override { | ||
| 1072 | std::vector<size_t> flushed_queries; | ||
| 1073 | { | ||
| 1074 | std::scoped_lock lk(flush_guard); | ||
| 1075 | flushed_queries = std::move(pending_flush_sets.front()); | ||
| 1076 | pending_flush_sets.pop_front(); | ||
| 1077 | } | ||
| 1078 | |||
| 1079 | for (auto q : flushed_queries) { | ||
| 1080 | auto* query = GetQuery(q); | ||
| 1081 | if (True(query->flags & VideoCommon::QueryFlagBits::IsFinalValueSynced)) { | ||
| 1082 | continue; | ||
| 1083 | } | ||
| 1084 | |||
| 1085 | query->flags |= VideoCommon::QueryFlagBits::IsFinalValueSynced; | ||
| 1086 | u64 num_vertices = 0; | ||
| 1087 | if (query->dependant_manage) { | ||
| 1088 | auto* dependant_query = tfb_streamer.GetQuery(query->dependant_index); | ||
| 1089 | num_vertices = dependant_query->value / query->stride; | ||
| 1090 | tfb_streamer.Free(query->dependant_index); | ||
| 1091 | } else { | ||
| 1092 | u8* pointer = cpu_memory.GetPointer(query->dependant_address); | ||
| 1093 | u32 result; | ||
| 1094 | std::memcpy(&result, pointer, sizeof(u32)); | ||
| 1095 | num_vertices = static_cast<u64>(result) / query->stride; | ||
| 1096 | } | ||
| 1097 | query->value = [&]() -> u64 { | ||
| 1098 | switch (query->topology) { | ||
| 1099 | case Maxwell3D::Regs::PrimitiveTopology::Points: | ||
| 1100 | return num_vertices; | ||
| 1101 | case Maxwell3D::Regs::PrimitiveTopology::Lines: | ||
| 1102 | return num_vertices / 2; | ||
| 1103 | case Maxwell3D::Regs::PrimitiveTopology::LineLoop: | ||
| 1104 | return (num_vertices / 2) + 1; | ||
| 1105 | case Maxwell3D::Regs::PrimitiveTopology::LineStrip: | ||
| 1106 | return num_vertices - 1; | ||
| 1107 | case Maxwell3D::Regs::PrimitiveTopology::Patches: | ||
| 1108 | case Maxwell3D::Regs::PrimitiveTopology::Triangles: | ||
| 1109 | case Maxwell3D::Regs::PrimitiveTopology::TrianglesAdjacency: | ||
| 1110 | return num_vertices / 3; | ||
| 1111 | case Maxwell3D::Regs::PrimitiveTopology::TriangleFan: | ||
| 1112 | case Maxwell3D::Regs::PrimitiveTopology::TriangleStrip: | ||
| 1113 | case Maxwell3D::Regs::PrimitiveTopology::TriangleStripAdjacency: | ||
| 1114 | return num_vertices - 2; | ||
| 1115 | case Maxwell3D::Regs::PrimitiveTopology::Quads: | ||
| 1116 | return num_vertices / 4; | ||
| 1117 | case Maxwell3D::Regs::PrimitiveTopology::Polygon: | ||
| 1118 | return 1U; | ||
| 1119 | default: | ||
| 1120 | return num_vertices; | ||
| 1121 | } | ||
| 1122 | }(); | ||
| 1123 | } | ||
| 1124 | } | ||
| 1125 | |||
| 1126 | private: | ||
| 1127 | QueryCacheRuntime& runtime; | ||
| 1128 | TFBCounterStreamer& tfb_streamer; | ||
| 1129 | Core::Memory::Memory& cpu_memory; | ||
| 1130 | |||
| 1131 | // syncing queue | ||
| 1132 | std::vector<size_t> pending_sync; | ||
| 1133 | |||
| 1134 | // flush levels | ||
| 1135 | std::vector<size_t> pending_flush_queries; | ||
| 1136 | std::deque<std::vector<size_t>> pending_flush_sets; | ||
| 1137 | std::mutex flush_guard; | ||
| 1138 | }; | ||
| 1139 | |||
| 1140 | } // namespace | ||
| 1141 | |||
| 1142 | struct QueryCacheRuntimeImpl { | ||
| 1143 | QueryCacheRuntimeImpl(QueryCacheRuntime& runtime, VideoCore::RasterizerInterface* rasterizer_, | ||
| 1144 | Core::Memory::Memory& cpu_memory_, Vulkan::BufferCache& buffer_cache_, | ||
| 1145 | const Device& device_, const MemoryAllocator& memory_allocator_, | ||
| 1146 | Scheduler& scheduler_, StagingBufferPool& staging_pool_, | ||
| 1147 | ComputePassDescriptorQueue& compute_pass_descriptor_queue, | ||
| 1148 | DescriptorPool& descriptor_pool) | ||
| 1149 | : rasterizer{rasterizer_}, cpu_memory{cpu_memory_}, | ||
| 1150 | buffer_cache{buffer_cache_}, device{device_}, | ||
| 1151 | memory_allocator{memory_allocator_}, scheduler{scheduler_}, staging_pool{staging_pool_}, | ||
| 1152 | guest_streamer(0, runtime), | ||
| 1153 | sample_streamer(static_cast<size_t>(QueryType::ZPassPixelCount64), runtime, rasterizer, | ||
| 1154 | device, scheduler, memory_allocator, compute_pass_descriptor_queue, | ||
| 1155 | descriptor_pool), | ||
| 1156 | tfb_streamer(static_cast<size_t>(QueryType::StreamingByteCount), runtime, device, | ||
| 1157 | scheduler, memory_allocator, staging_pool), | ||
| 1158 | primitives_succeeded_streamer( | ||
| 1159 | static_cast<size_t>(QueryType::StreamingPrimitivesSucceeded), runtime, tfb_streamer, | ||
| 1160 | cpu_memory_), | ||
| 1161 | primitives_needed_minus_suceeded_streamer( | ||
| 1162 | static_cast<size_t>(QueryType::StreamingPrimitivesNeededMinusSucceeded), runtime, 0u), | ||
| 1163 | hcr_setup{}, hcr_is_set{}, is_hcr_running{} { | ||
| 28 | 1164 | ||
| 29 | QueryPool::QueryPool(const Device& device_, Scheduler& scheduler, QueryType type_) | 1165 | hcr_setup.sType = VK_STRUCTURE_TYPE_CONDITIONAL_RENDERING_BEGIN_INFO_EXT; |
| 30 | : ResourcePool{scheduler.GetMasterSemaphore(), GROW_STEP}, device{device_}, type{type_} {} | 1166 | hcr_setup.pNext = nullptr; |
| 1167 | hcr_setup.flags = 0; | ||
| 31 | 1168 | ||
| 32 | QueryPool::~QueryPool() = default; | 1169 | conditional_resolve_pass = std::make_unique<ConditionalRenderingResolvePass>( |
| 1170 | device, scheduler, descriptor_pool, compute_pass_descriptor_queue); | ||
| 33 | 1171 | ||
| 34 | std::pair<VkQueryPool, u32> QueryPool::Commit() { | 1172 | const VkBufferCreateInfo buffer_ci = { |
| 35 | std::size_t index; | 1173 | .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, |
| 36 | do { | 1174 | .pNext = nullptr, |
| 37 | index = CommitResource(); | 1175 | .flags = 0, |
| 38 | } while (usage[index]); | 1176 | .size = sizeof(u32), |
| 39 | usage[index] = true; | 1177 | .usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | |
| 1178 | VK_BUFFER_USAGE_CONDITIONAL_RENDERING_BIT_EXT, | ||
| 1179 | .sharingMode = VK_SHARING_MODE_EXCLUSIVE, | ||
| 1180 | .queueFamilyIndexCount = 0, | ||
| 1181 | .pQueueFamilyIndices = nullptr, | ||
| 1182 | }; | ||
| 1183 | hcr_resolve_buffer = memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal); | ||
| 1184 | } | ||
| 1185 | |||
| 1186 | VideoCore::RasterizerInterface* rasterizer; | ||
| 1187 | Core::Memory::Memory& cpu_memory; | ||
| 1188 | Vulkan::BufferCache& buffer_cache; | ||
| 1189 | |||
| 1190 | const Device& device; | ||
| 1191 | const MemoryAllocator& memory_allocator; | ||
| 1192 | Scheduler& scheduler; | ||
| 1193 | StagingBufferPool& staging_pool; | ||
| 1194 | |||
| 1195 | // Streamers | ||
| 1196 | VideoCommon::GuestStreamer<QueryCacheParams> guest_streamer; | ||
| 1197 | SamplesStreamer sample_streamer; | ||
| 1198 | TFBCounterStreamer tfb_streamer; | ||
| 1199 | PrimitivesSucceededStreamer primitives_succeeded_streamer; | ||
| 1200 | VideoCommon::StubStreamer<QueryCacheParams> primitives_needed_minus_suceeded_streamer; | ||
| 40 | 1201 | ||
| 41 | return {*pools[index / GROW_STEP], static_cast<u32>(index % GROW_STEP)}; | 1202 | std::vector<std::pair<VAddr, VAddr>> little_cache; |
| 1203 | std::vector<std::pair<VkBuffer, VkDeviceSize>> buffers_to_upload_to; | ||
| 1204 | std::vector<size_t> redirect_cache; | ||
| 1205 | std::vector<std::vector<VkBufferCopy>> copies_setup; | ||
| 1206 | |||
| 1207 | // Host conditional rendering data | ||
| 1208 | std::unique_ptr<ConditionalRenderingResolvePass> conditional_resolve_pass; | ||
| 1209 | vk::Buffer hcr_resolve_buffer; | ||
| 1210 | VkConditionalRenderingBeginInfoEXT hcr_setup; | ||
| 1211 | VkBuffer hcr_buffer; | ||
| 1212 | size_t hcr_offset; | ||
| 1213 | bool hcr_is_set; | ||
| 1214 | bool is_hcr_running; | ||
| 1215 | |||
| 1216 | // maxwell3d | ||
| 1217 | Maxwell3D* maxwell3d; | ||
| 1218 | }; | ||
| 1219 | |||
| 1220 | QueryCacheRuntime::QueryCacheRuntime(VideoCore::RasterizerInterface* rasterizer, | ||
| 1221 | Core::Memory::Memory& cpu_memory_, | ||
| 1222 | Vulkan::BufferCache& buffer_cache_, const Device& device_, | ||
| 1223 | const MemoryAllocator& memory_allocator_, | ||
| 1224 | Scheduler& scheduler_, StagingBufferPool& staging_pool_, | ||
| 1225 | ComputePassDescriptorQueue& compute_pass_descriptor_queue, | ||
| 1226 | DescriptorPool& descriptor_pool) { | ||
| 1227 | impl = std::make_unique<QueryCacheRuntimeImpl>( | ||
| 1228 | *this, rasterizer, cpu_memory_, buffer_cache_, device_, memory_allocator_, scheduler_, | ||
| 1229 | staging_pool_, compute_pass_descriptor_queue, descriptor_pool); | ||
| 42 | } | 1230 | } |
| 43 | 1231 | ||
| 44 | void QueryPool::Allocate(std::size_t begin, std::size_t end) { | 1232 | void QueryCacheRuntime::Bind3DEngine(Maxwell3D* maxwell3d) { |
| 45 | usage.resize(end); | 1233 | impl->maxwell3d = maxwell3d; |
| 1234 | } | ||
| 46 | 1235 | ||
| 47 | pools.push_back(device.GetLogical().CreateQueryPool({ | 1236 | template <typename Func> |
| 48 | .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO, | 1237 | void QueryCacheRuntime::View3DRegs(Func&& func) { |
| 49 | .pNext = nullptr, | 1238 | func(*impl->maxwell3d); |
| 50 | .flags = 0, | 1239 | } |
| 51 | .queryType = GetTarget(type), | 1240 | |
| 52 | .queryCount = static_cast<u32>(end - begin), | 1241 | void QueryCacheRuntime::EndHostConditionalRendering() { |
| 53 | .pipelineStatistics = 0, | 1242 | PauseHostConditionalRendering(); |
| 54 | })); | 1243 | impl->hcr_is_set = false; |
| 1244 | impl->is_hcr_running = false; | ||
| 1245 | impl->hcr_buffer = nullptr; | ||
| 1246 | impl->hcr_offset = 0; | ||
| 1247 | } | ||
| 1248 | |||
| 1249 | void QueryCacheRuntime::PauseHostConditionalRendering() { | ||
| 1250 | if (!impl->hcr_is_set) { | ||
| 1251 | return; | ||
| 1252 | } | ||
| 1253 | if (impl->is_hcr_running) { | ||
| 1254 | impl->scheduler.Record( | ||
| 1255 | [](vk::CommandBuffer cmdbuf) { cmdbuf.EndConditionalRenderingEXT(); }); | ||
| 1256 | } | ||
| 1257 | impl->is_hcr_running = false; | ||
| 55 | } | 1258 | } |
| 56 | 1259 | ||
| 57 | void QueryPool::Reserve(std::pair<VkQueryPool, u32> query) { | 1260 | void QueryCacheRuntime::ResumeHostConditionalRendering() { |
| 58 | const auto it = | 1261 | if (!impl->hcr_is_set) { |
| 59 | std::find_if(pools.begin(), pools.end(), [query_pool = query.first](vk::QueryPool& pool) { | 1262 | return; |
| 60 | return query_pool == *pool; | 1263 | } |
| 1264 | if (!impl->is_hcr_running) { | ||
| 1265 | impl->scheduler.Record([hcr_setup = impl->hcr_setup](vk::CommandBuffer cmdbuf) { | ||
| 1266 | cmdbuf.BeginConditionalRenderingEXT(hcr_setup); | ||
| 61 | }); | 1267 | }); |
| 1268 | } | ||
| 1269 | impl->is_hcr_running = true; | ||
| 1270 | } | ||
| 62 | 1271 | ||
| 63 | if (it != std::end(pools)) { | 1272 | void QueryCacheRuntime::HostConditionalRenderingCompareValueImpl(VideoCommon::LookupData object, |
| 64 | const std::ptrdiff_t pool_index = std::distance(std::begin(pools), it); | 1273 | bool is_equal) { |
| 65 | usage[pool_index * GROW_STEP + static_cast<std::ptrdiff_t>(query.second)] = false; | 1274 | { |
| 1275 | std::scoped_lock lk(impl->buffer_cache.mutex); | ||
| 1276 | static constexpr auto sync_info = VideoCommon::ObtainBufferSynchronize::FullSynchronize; | ||
| 1277 | const auto post_op = VideoCommon::ObtainBufferOperation::DoNothing; | ||
| 1278 | const auto [buffer, offset] = | ||
| 1279 | impl->buffer_cache.ObtainCPUBuffer(object.address, 8, sync_info, post_op); | ||
| 1280 | impl->hcr_buffer = buffer->Handle(); | ||
| 1281 | impl->hcr_offset = offset; | ||
| 1282 | } | ||
| 1283 | if (impl->hcr_is_set) { | ||
| 1284 | if (impl->hcr_setup.buffer == impl->hcr_buffer && | ||
| 1285 | impl->hcr_setup.offset == impl->hcr_offset) { | ||
| 1286 | ResumeHostConditionalRendering(); | ||
| 1287 | return; | ||
| 1288 | } | ||
| 1289 | PauseHostConditionalRendering(); | ||
| 66 | } | 1290 | } |
| 1291 | impl->hcr_setup.buffer = impl->hcr_buffer; | ||
| 1292 | impl->hcr_setup.offset = impl->hcr_offset; | ||
| 1293 | impl->hcr_setup.flags = is_equal ? VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT : 0; | ||
| 1294 | impl->hcr_is_set = true; | ||
| 1295 | impl->is_hcr_running = false; | ||
| 1296 | ResumeHostConditionalRendering(); | ||
| 67 | } | 1297 | } |
| 68 | 1298 | ||
| 69 | QueryCache::QueryCache(VideoCore::RasterizerInterface& rasterizer_, | 1299 | void QueryCacheRuntime::HostConditionalRenderingCompareBCImpl(VAddr address, bool is_equal) { |
| 70 | Core::Memory::Memory& cpu_memory_, const Device& device_, | 1300 | VkBuffer to_resolve; |
| 71 | Scheduler& scheduler_) | 1301 | u32 to_resolve_offset; |
| 72 | : QueryCacheBase{rasterizer_, cpu_memory_}, device{device_}, scheduler{scheduler_}, | 1302 | { |
| 73 | query_pools{ | 1303 | std::scoped_lock lk(impl->buffer_cache.mutex); |
| 74 | QueryPool{device_, scheduler_, QueryType::SamplesPassed}, | 1304 | static constexpr auto sync_info = VideoCommon::ObtainBufferSynchronize::NoSynchronize; |
| 75 | } {} | 1305 | const auto post_op = VideoCommon::ObtainBufferOperation::DoNothing; |
| 76 | 1306 | const auto [buffer, offset] = | |
| 77 | QueryCache::~QueryCache() { | 1307 | impl->buffer_cache.ObtainCPUBuffer(address, 24, sync_info, post_op); |
| 78 | // TODO(Rodrigo): This is a hack to destroy all HostCounter instances before the base class | 1308 | to_resolve = buffer->Handle(); |
| 79 | // destructor is called. The query cache should be redesigned to have a proper ownership model | 1309 | to_resolve_offset = static_cast<u32>(offset); |
| 80 | // instead of using shared pointers. | ||
| 81 | for (size_t query_type = 0; query_type < VideoCore::NumQueryTypes; ++query_type) { | ||
| 82 | auto& stream = Stream(static_cast<QueryType>(query_type)); | ||
| 83 | stream.Update(false); | ||
| 84 | stream.Reset(); | ||
| 85 | } | 1310 | } |
| 1311 | if (impl->is_hcr_running) { | ||
| 1312 | PauseHostConditionalRendering(); | ||
| 1313 | } | ||
| 1314 | impl->conditional_resolve_pass->Resolve(*impl->hcr_resolve_buffer, to_resolve, | ||
| 1315 | to_resolve_offset, false); | ||
| 1316 | impl->hcr_setup.buffer = *impl->hcr_resolve_buffer; | ||
| 1317 | impl->hcr_setup.offset = 0; | ||
| 1318 | impl->hcr_setup.flags = is_equal ? 0 : VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT; | ||
| 1319 | impl->hcr_is_set = true; | ||
| 1320 | impl->is_hcr_running = false; | ||
| 1321 | ResumeHostConditionalRendering(); | ||
| 86 | } | 1322 | } |
| 87 | 1323 | ||
| 88 | std::pair<VkQueryPool, u32> QueryCache::AllocateQuery(QueryType type) { | 1324 | bool QueryCacheRuntime::HostConditionalRenderingCompareValue(VideoCommon::LookupData object_1, |
| 89 | return query_pools[static_cast<std::size_t>(type)].Commit(); | 1325 | [[maybe_unused]] bool qc_dirty) { |
| 1326 | if (!impl->device.IsExtConditionalRendering()) { | ||
| 1327 | return false; | ||
| 1328 | } | ||
| 1329 | HostConditionalRenderingCompareValueImpl(object_1, false); | ||
| 1330 | return true; | ||
| 90 | } | 1331 | } |
| 91 | 1332 | ||
| 92 | void QueryCache::Reserve(QueryType type, std::pair<VkQueryPool, u32> query) { | 1333 | bool QueryCacheRuntime::HostConditionalRenderingCompareValues(VideoCommon::LookupData object_1, |
| 93 | query_pools[static_cast<std::size_t>(type)].Reserve(query); | 1334 | VideoCommon::LookupData object_2, |
| 1335 | bool qc_dirty, bool equal_check) { | ||
| 1336 | if (!impl->device.IsExtConditionalRendering()) { | ||
| 1337 | return false; | ||
| 1338 | } | ||
| 1339 | |||
| 1340 | const auto check_in_bc = [&](VAddr address) { | ||
| 1341 | return impl->buffer_cache.IsRegionGpuModified(address, 8); | ||
| 1342 | }; | ||
| 1343 | const auto check_value = [&](VAddr address) { | ||
| 1344 | u8* ptr = impl->cpu_memory.GetPointer(address); | ||
| 1345 | u64 value{}; | ||
| 1346 | std::memcpy(&value, ptr, sizeof(value)); | ||
| 1347 | return value == 0; | ||
| 1348 | }; | ||
| 1349 | std::array<VideoCommon::LookupData*, 2> objects{&object_1, &object_2}; | ||
| 1350 | std::array<bool, 2> is_in_bc{}; | ||
| 1351 | std::array<bool, 2> is_in_qc{}; | ||
| 1352 | std::array<bool, 2> is_in_ac{}; | ||
| 1353 | std::array<bool, 2> is_null{}; | ||
| 1354 | { | ||
| 1355 | std::scoped_lock lk(impl->buffer_cache.mutex); | ||
| 1356 | for (size_t i = 0; i < 2; i++) { | ||
| 1357 | is_in_qc[i] = objects[i]->found_query != nullptr; | ||
| 1358 | is_in_bc[i] = !is_in_qc[i] && check_in_bc(objects[i]->address); | ||
| 1359 | is_in_ac[i] = is_in_qc[i] || is_in_bc[i]; | ||
| 1360 | } | ||
| 1361 | } | ||
| 1362 | |||
| 1363 | if (!is_in_ac[0] && !is_in_ac[1]) { | ||
| 1364 | EndHostConditionalRendering(); | ||
| 1365 | return false; | ||
| 1366 | } | ||
| 1367 | |||
| 1368 | if (!qc_dirty && !is_in_bc[0] && !is_in_bc[1]) { | ||
| 1369 | EndHostConditionalRendering(); | ||
| 1370 | return false; | ||
| 1371 | } | ||
| 1372 | |||
| 1373 | const bool is_gpu_high = Settings::IsGPULevelHigh(); | ||
| 1374 | if (!is_gpu_high && impl->device.GetDriverID() == VK_DRIVER_ID_INTEL_PROPRIETARY_WINDOWS) { | ||
| 1375 | return true; | ||
| 1376 | } | ||
| 1377 | |||
| 1378 | for (size_t i = 0; i < 2; i++) { | ||
| 1379 | is_null[i] = !is_in_ac[i] && check_value(objects[i]->address); | ||
| 1380 | } | ||
| 1381 | |||
| 1382 | for (size_t i = 0; i < 2; i++) { | ||
| 1383 | if (is_null[i]) { | ||
| 1384 | size_t j = (i + 1) % 2; | ||
| 1385 | HostConditionalRenderingCompareValueImpl(*objects[j], equal_check); | ||
| 1386 | return true; | ||
| 1387 | } | ||
| 1388 | } | ||
| 1389 | |||
| 1390 | if (!is_gpu_high) { | ||
| 1391 | return true; | ||
| 1392 | } | ||
| 1393 | |||
| 1394 | if (!is_in_bc[0] && !is_in_bc[1]) { | ||
| 1395 | // Both queries are in query cache, it's best to just flush. | ||
| 1396 | return true; | ||
| 1397 | } | ||
| 1398 | HostConditionalRenderingCompareBCImpl(object_1.address, equal_check); | ||
| 1399 | return true; | ||
| 94 | } | 1400 | } |
| 95 | 1401 | ||
| 96 | HostCounter::HostCounter(QueryCache& cache_, std::shared_ptr<HostCounter> dependency_, | 1402 | QueryCacheRuntime::~QueryCacheRuntime() = default; |
| 97 | QueryType type_) | 1403 | |
| 98 | : HostCounterBase{std::move(dependency_)}, cache{cache_}, type{type_}, | 1404 | VideoCommon::StreamerInterface* QueryCacheRuntime::GetStreamerInterface(QueryType query_type) { |
| 99 | query{cache_.AllocateQuery(type_)}, tick{cache_.GetScheduler().CurrentTick()} { | 1405 | switch (query_type) { |
| 100 | const vk::Device* logical = &cache.GetDevice().GetLogical(); | 1406 | case QueryType::Payload: |
| 101 | cache.GetScheduler().Record([logical, query_ = query](vk::CommandBuffer cmdbuf) { | 1407 | return &impl->guest_streamer; |
| 102 | const bool use_precise = Settings::IsGPULevelHigh(); | 1408 | case QueryType::ZPassPixelCount64: |
| 103 | logical->ResetQueryPool(query_.first, query_.second, 1); | 1409 | return &impl->sample_streamer; |
| 104 | cmdbuf.BeginQuery(query_.first, query_.second, | 1410 | case QueryType::StreamingByteCount: |
| 105 | use_precise ? VK_QUERY_CONTROL_PRECISE_BIT : 0); | 1411 | return &impl->tfb_streamer; |
| 106 | }); | 1412 | case QueryType::StreamingPrimitivesNeeded: |
| 1413 | case QueryType::VtgPrimitivesOut: | ||
| 1414 | case QueryType::StreamingPrimitivesSucceeded: | ||
| 1415 | return &impl->primitives_succeeded_streamer; | ||
| 1416 | case QueryType::StreamingPrimitivesNeededMinusSucceeded: | ||
| 1417 | return &impl->primitives_needed_minus_suceeded_streamer; | ||
| 1418 | default: | ||
| 1419 | return nullptr; | ||
| 1420 | } | ||
| 107 | } | 1421 | } |
| 108 | 1422 | ||
| 109 | HostCounter::~HostCounter() { | 1423 | void QueryCacheRuntime::Barriers(bool is_prebarrier) { |
| 110 | cache.Reserve(type, query); | 1424 | static constexpr VkMemoryBarrier READ_BARRIER{ |
| 1425 | .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, | ||
| 1426 | .pNext = nullptr, | ||
| 1427 | .srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT, | ||
| 1428 | .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, | ||
| 1429 | }; | ||
| 1430 | static constexpr VkMemoryBarrier WRITE_BARRIER{ | ||
| 1431 | .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, | ||
| 1432 | .pNext = nullptr, | ||
| 1433 | .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, | ||
| 1434 | .dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT, | ||
| 1435 | }; | ||
| 1436 | if (is_prebarrier) { | ||
| 1437 | impl->scheduler.Record([](vk::CommandBuffer cmdbuf) { | ||
| 1438 | cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, | ||
| 1439 | VK_PIPELINE_STAGE_TRANSFER_BIT, 0, READ_BARRIER); | ||
| 1440 | }); | ||
| 1441 | } else { | ||
| 1442 | impl->scheduler.Record([](vk::CommandBuffer cmdbuf) { | ||
| 1443 | cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, | ||
| 1444 | VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, WRITE_BARRIER); | ||
| 1445 | }); | ||
| 1446 | } | ||
| 111 | } | 1447 | } |
| 112 | 1448 | ||
| 113 | void HostCounter::EndQuery() { | 1449 | template <typename SyncValuesType> |
| 114 | cache.GetScheduler().Record([query_ = query](vk::CommandBuffer cmdbuf) { | 1450 | void QueryCacheRuntime::SyncValues(std::span<SyncValuesType> values, VkBuffer base_src_buffer) { |
| 115 | cmdbuf.EndQuery(query_.first, query_.second); | 1451 | if (values.size() == 0) { |
| 1452 | return; | ||
| 1453 | } | ||
| 1454 | impl->redirect_cache.clear(); | ||
| 1455 | impl->little_cache.clear(); | ||
| 1456 | size_t total_size = 0; | ||
| 1457 | for (auto& sync_val : values) { | ||
| 1458 | total_size += sync_val.size; | ||
| 1459 | bool found = false; | ||
| 1460 | VAddr base = Common::AlignDown(sync_val.address, Core::Memory::YUZU_PAGESIZE); | ||
| 1461 | VAddr base_end = base + Core::Memory::YUZU_PAGESIZE; | ||
| 1462 | for (size_t i = 0; i < impl->little_cache.size(); i++) { | ||
| 1463 | const auto set_found = [&] { | ||
| 1464 | impl->redirect_cache.push_back(i); | ||
| 1465 | found = true; | ||
| 1466 | }; | ||
| 1467 | auto& loc = impl->little_cache[i]; | ||
| 1468 | if (base < loc.second && loc.first < base_end) { | ||
| 1469 | set_found(); | ||
| 1470 | break; | ||
| 1471 | } | ||
| 1472 | if (loc.first == base_end) { | ||
| 1473 | loc.first = base; | ||
| 1474 | set_found(); | ||
| 1475 | break; | ||
| 1476 | } | ||
| 1477 | if (loc.second == base) { | ||
| 1478 | loc.second = base_end; | ||
| 1479 | set_found(); | ||
| 1480 | break; | ||
| 1481 | } | ||
| 1482 | } | ||
| 1483 | if (!found) { | ||
| 1484 | impl->redirect_cache.push_back(impl->little_cache.size()); | ||
| 1485 | impl->little_cache.emplace_back(base, base_end); | ||
| 1486 | } | ||
| 1487 | } | ||
| 1488 | |||
| 1489 | // Vulkan part. | ||
| 1490 | std::scoped_lock lk(impl->buffer_cache.mutex); | ||
| 1491 | impl->buffer_cache.BufferOperations([&] { | ||
| 1492 | impl->buffers_to_upload_to.clear(); | ||
| 1493 | for (auto& pair : impl->little_cache) { | ||
| 1494 | static constexpr auto sync_info = VideoCommon::ObtainBufferSynchronize::FullSynchronize; | ||
| 1495 | const auto post_op = VideoCommon::ObtainBufferOperation::DoNothing; | ||
| 1496 | const auto [buffer, offset] = impl->buffer_cache.ObtainCPUBuffer( | ||
| 1497 | pair.first, static_cast<u32>(pair.second - pair.first), sync_info, post_op); | ||
| 1498 | impl->buffers_to_upload_to.emplace_back(buffer->Handle(), offset); | ||
| 1499 | } | ||
| 116 | }); | 1500 | }); |
| 117 | } | ||
| 118 | 1501 | ||
| 119 | u64 HostCounter::BlockingQuery(bool async) const { | 1502 | VkBuffer src_buffer; |
| 120 | if (!async) { | 1503 | [[maybe_unused]] StagingBufferRef ref; |
| 121 | cache.GetScheduler().Wait(tick); | 1504 | impl->copies_setup.clear(); |
| 122 | } | 1505 | impl->copies_setup.resize(impl->little_cache.size()); |
| 123 | u64 data; | 1506 | if constexpr (SyncValuesType::GeneratesBaseBuffer) { |
| 124 | const VkResult query_result = cache.GetDevice().GetLogical().GetQueryResults( | 1507 | ref = impl->staging_pool.Request(total_size, MemoryUsage::Upload); |
| 125 | query.first, query.second, 1, sizeof(data), &data, sizeof(data), | 1508 | size_t current_offset = ref.offset; |
| 126 | VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT); | 1509 | size_t accumulated_size = 0; |
| 127 | 1510 | for (size_t i = 0; i < values.size(); i++) { | |
| 128 | switch (query_result) { | 1511 | size_t which_copy = impl->redirect_cache[i]; |
| 129 | case VK_SUCCESS: | 1512 | impl->copies_setup[which_copy].emplace_back(VkBufferCopy{ |
| 130 | return data; | 1513 | .srcOffset = current_offset + accumulated_size, |
| 131 | case VK_ERROR_DEVICE_LOST: | 1514 | .dstOffset = impl->buffers_to_upload_to[which_copy].second + values[i].address - |
| 132 | cache.GetDevice().ReportLoss(); | 1515 | impl->little_cache[which_copy].first, |
| 133 | [[fallthrough]]; | 1516 | .size = values[i].size, |
| 134 | default: | 1517 | }); |
| 135 | throw vk::Exception(query_result); | 1518 | std::memcpy(ref.mapped_span.data() + accumulated_size, &values[i].value, |
| 1519 | values[i].size); | ||
| 1520 | accumulated_size += values[i].size; | ||
| 1521 | } | ||
| 1522 | src_buffer = ref.buffer; | ||
| 1523 | } else { | ||
| 1524 | for (size_t i = 0; i < values.size(); i++) { | ||
| 1525 | size_t which_copy = impl->redirect_cache[i]; | ||
| 1526 | impl->copies_setup[which_copy].emplace_back(VkBufferCopy{ | ||
| 1527 | .srcOffset = values[i].offset, | ||
| 1528 | .dstOffset = impl->buffers_to_upload_to[which_copy].second + values[i].address - | ||
| 1529 | impl->little_cache[which_copy].first, | ||
| 1530 | .size = values[i].size, | ||
| 1531 | }); | ||
| 1532 | } | ||
| 1533 | src_buffer = base_src_buffer; | ||
| 136 | } | 1534 | } |
| 1535 | |||
| 1536 | impl->scheduler.RequestOutsideRenderPassOperationContext(); | ||
| 1537 | impl->scheduler.Record([src_buffer, dst_buffers = std::move(impl->buffers_to_upload_to), | ||
| 1538 | vk_copies = std::move(impl->copies_setup)](vk::CommandBuffer cmdbuf) { | ||
| 1539 | size_t size = dst_buffers.size(); | ||
| 1540 | for (size_t i = 0; i < size; i++) { | ||
| 1541 | cmdbuf.CopyBuffer(src_buffer, dst_buffers[i].first, vk_copies[i]); | ||
| 1542 | } | ||
| 1543 | }); | ||
| 137 | } | 1544 | } |
| 138 | 1545 | ||
| 139 | } // namespace Vulkan | 1546 | } // namespace Vulkan |
| 1547 | |||
| 1548 | namespace VideoCommon { | ||
| 1549 | |||
| 1550 | template class QueryCacheBase<Vulkan::QueryCacheParams>; | ||
| 1551 | |||
| 1552 | } // namespace VideoCommon | ||
diff --git a/src/video_core/renderer_vulkan/vk_query_cache.h b/src/video_core/renderer_vulkan/vk_query_cache.h index c1b9552eb..e9a1ea169 100644 --- a/src/video_core/renderer_vulkan/vk_query_cache.h +++ b/src/video_core/renderer_vulkan/vk_query_cache.h | |||
| @@ -1,101 +1,75 @@ | |||
| 1 | // SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project | 1 | // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project |
| 2 | // SPDX-License-Identifier: GPL-2.0-or-later | 2 | // SPDX-License-Identifier: GPL-3.0-or-later |
| 3 | 3 | ||
| 4 | #pragma once | 4 | #pragma once |
| 5 | 5 | ||
| 6 | #include <cstddef> | ||
| 7 | #include <memory> | 6 | #include <memory> |
| 8 | #include <utility> | ||
| 9 | #include <vector> | ||
| 10 | 7 | ||
| 11 | #include "common/common_types.h" | 8 | #include "video_core/query_cache/query_cache_base.h" |
| 12 | #include "video_core/query_cache.h" | 9 | #include "video_core/renderer_vulkan/vk_buffer_cache.h" |
| 13 | #include "video_core/renderer_vulkan/vk_resource_pool.h" | ||
| 14 | #include "video_core/vulkan_common/vulkan_wrapper.h" | ||
| 15 | 10 | ||
| 16 | namespace VideoCore { | 11 | namespace VideoCore { |
| 17 | class RasterizerInterface; | 12 | class RasterizerInterface; |
| 18 | } | 13 | } |
| 19 | 14 | ||
| 15 | namespace VideoCommon { | ||
| 16 | class StreamerInterface; | ||
| 17 | } | ||
| 18 | |||
| 20 | namespace Vulkan { | 19 | namespace Vulkan { |
| 21 | 20 | ||
| 22 | class CachedQuery; | ||
| 23 | class Device; | 21 | class Device; |
| 24 | class HostCounter; | ||
| 25 | class QueryCache; | ||
| 26 | class Scheduler; | 22 | class Scheduler; |
| 23 | class StagingBufferPool; | ||
| 27 | 24 | ||
| 28 | using CounterStream = VideoCommon::CounterStreamBase<QueryCache, HostCounter>; | 25 | struct QueryCacheRuntimeImpl; |
| 29 | 26 | ||
| 30 | class QueryPool final : public ResourcePool { | 27 | class QueryCacheRuntime { |
| 31 | public: | 28 | public: |
| 32 | explicit QueryPool(const Device& device, Scheduler& scheduler, VideoCore::QueryType type); | 29 | explicit QueryCacheRuntime(VideoCore::RasterizerInterface* rasterizer, |
| 33 | ~QueryPool() override; | 30 | Core::Memory::Memory& cpu_memory_, |
| 31 | Vulkan::BufferCache& buffer_cache_, const Device& device_, | ||
| 32 | const MemoryAllocator& memory_allocator_, Scheduler& scheduler_, | ||
| 33 | StagingBufferPool& staging_pool_, | ||
| 34 | ComputePassDescriptorQueue& compute_pass_descriptor_queue, | ||
| 35 | DescriptorPool& descriptor_pool); | ||
| 36 | ~QueryCacheRuntime(); | ||
| 34 | 37 | ||
| 35 | std::pair<VkQueryPool, u32> Commit(); | 38 | template <typename SyncValuesType> |
| 39 | void SyncValues(std::span<SyncValuesType> values, VkBuffer base_src_buffer = nullptr); | ||
| 36 | 40 | ||
| 37 | void Reserve(std::pair<VkQueryPool, u32> query); | 41 | void Barriers(bool is_prebarrier); |
| 38 | 42 | ||
| 39 | protected: | 43 | void EndHostConditionalRendering(); |
| 40 | void Allocate(std::size_t begin, std::size_t end) override; | ||
| 41 | 44 | ||
| 42 | private: | 45 | void PauseHostConditionalRendering(); |
| 43 | static constexpr std::size_t GROW_STEP = 512; | ||
| 44 | 46 | ||
| 45 | const Device& device; | 47 | void ResumeHostConditionalRendering(); |
| 46 | const VideoCore::QueryType type; | ||
| 47 | 48 | ||
| 48 | std::vector<vk::QueryPool> pools; | 49 | bool HostConditionalRenderingCompareValue(VideoCommon::LookupData object_1, bool qc_dirty); |
| 49 | std::vector<bool> usage; | ||
| 50 | }; | ||
| 51 | 50 | ||
| 52 | class QueryCache final | 51 | bool HostConditionalRenderingCompareValues(VideoCommon::LookupData object_1, |
| 53 | : public VideoCommon::QueryCacheBase<QueryCache, CachedQuery, CounterStream, HostCounter> { | 52 | VideoCommon::LookupData object_2, bool qc_dirty, |
| 54 | public: | 53 | bool equal_check); |
| 55 | explicit QueryCache(VideoCore::RasterizerInterface& rasterizer_, | ||
| 56 | Core::Memory::Memory& cpu_memory_, const Device& device_, | ||
| 57 | Scheduler& scheduler_); | ||
| 58 | ~QueryCache(); | ||
| 59 | |||
| 60 | std::pair<VkQueryPool, u32> AllocateQuery(VideoCore::QueryType type); | ||
| 61 | 54 | ||
| 62 | void Reserve(VideoCore::QueryType type, std::pair<VkQueryPool, u32> query); | 55 | VideoCommon::StreamerInterface* GetStreamerInterface(VideoCommon::QueryType query_type); |
| 63 | 56 | ||
| 64 | const Device& GetDevice() const noexcept { | 57 | void Bind3DEngine(Tegra::Engines::Maxwell3D* maxwell3d); |
| 65 | return device; | ||
| 66 | } | ||
| 67 | 58 | ||
| 68 | Scheduler& GetScheduler() const noexcept { | 59 | template <typename Func> |
| 69 | return scheduler; | 60 | void View3DRegs(Func&& func); |
| 70 | } | ||
| 71 | 61 | ||
| 72 | private: | 62 | private: |
| 73 | const Device& device; | 63 | void HostConditionalRenderingCompareValueImpl(VideoCommon::LookupData object, bool is_equal); |
| 74 | Scheduler& scheduler; | 64 | void HostConditionalRenderingCompareBCImpl(VAddr address, bool is_equal); |
| 75 | std::array<QueryPool, VideoCore::NumQueryTypes> query_pools; | 65 | friend struct QueryCacheRuntimeImpl; |
| 66 | std::unique_ptr<QueryCacheRuntimeImpl> impl; | ||
| 76 | }; | 67 | }; |
| 77 | 68 | ||
| 78 | class HostCounter final : public VideoCommon::HostCounterBase<QueryCache, HostCounter> { | 69 | struct QueryCacheParams { |
| 79 | public: | 70 | using RuntimeType = typename Vulkan::QueryCacheRuntime; |
| 80 | explicit HostCounter(QueryCache& cache_, std::shared_ptr<HostCounter> dependency_, | ||
| 81 | VideoCore::QueryType type_); | ||
| 82 | ~HostCounter(); | ||
| 83 | |||
| 84 | void EndQuery(); | ||
| 85 | |||
| 86 | private: | ||
| 87 | u64 BlockingQuery(bool async = false) const override; | ||
| 88 | |||
| 89 | QueryCache& cache; | ||
| 90 | const VideoCore::QueryType type; | ||
| 91 | const std::pair<VkQueryPool, u32> query; | ||
| 92 | const u64 tick; | ||
| 93 | }; | 71 | }; |
| 94 | 72 | ||
| 95 | class CachedQuery : public VideoCommon::CachedQueryBase<HostCounter> { | 73 | using QueryCache = VideoCommon::QueryCacheBase<QueryCacheParams>; |
| 96 | public: | ||
| 97 | explicit CachedQuery(QueryCache&, VideoCore::QueryType, VAddr cpu_addr_, u8* host_ptr_) | ||
| 98 | : CachedQueryBase{cpu_addr_, host_ptr_} {} | ||
| 99 | }; | ||
| 100 | 74 | ||
| 101 | } // namespace Vulkan | 75 | } // namespace Vulkan |
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 01e76a82c..c7ce7c312 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp | |||
| @@ -24,6 +24,7 @@ | |||
| 24 | #include "video_core/renderer_vulkan/vk_compute_pipeline.h" | 24 | #include "video_core/renderer_vulkan/vk_compute_pipeline.h" |
| 25 | #include "video_core/renderer_vulkan/vk_descriptor_pool.h" | 25 | #include "video_core/renderer_vulkan/vk_descriptor_pool.h" |
| 26 | #include "video_core/renderer_vulkan/vk_pipeline_cache.h" | 26 | #include "video_core/renderer_vulkan/vk_pipeline_cache.h" |
| 27 | #include "video_core/renderer_vulkan/vk_query_cache.h" | ||
| 27 | #include "video_core/renderer_vulkan/vk_rasterizer.h" | 28 | #include "video_core/renderer_vulkan/vk_rasterizer.h" |
| 28 | #include "video_core/renderer_vulkan/vk_scheduler.h" | 29 | #include "video_core/renderer_vulkan/vk_scheduler.h" |
| 29 | #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" | 30 | #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" |
| @@ -170,9 +171,11 @@ RasterizerVulkan::RasterizerVulkan(Core::Frontend::EmuWindow& emu_window_, Tegra | |||
| 170 | buffer_cache_runtime(device, memory_allocator, scheduler, staging_pool, | 171 | buffer_cache_runtime(device, memory_allocator, scheduler, staging_pool, |
| 171 | guest_descriptor_queue, compute_pass_descriptor_queue, descriptor_pool), | 172 | guest_descriptor_queue, compute_pass_descriptor_queue, descriptor_pool), |
| 172 | buffer_cache(*this, cpu_memory_, buffer_cache_runtime), | 173 | buffer_cache(*this, cpu_memory_, buffer_cache_runtime), |
| 174 | query_cache_runtime(this, cpu_memory_, buffer_cache, device, memory_allocator, scheduler, | ||
| 175 | staging_pool, compute_pass_descriptor_queue, descriptor_pool), | ||
| 176 | query_cache(gpu, *this, cpu_memory_, query_cache_runtime), | ||
| 173 | pipeline_cache(*this, device, scheduler, descriptor_pool, guest_descriptor_queue, | 177 | pipeline_cache(*this, device, scheduler, descriptor_pool, guest_descriptor_queue, |
| 174 | render_pass_cache, buffer_cache, texture_cache, gpu.ShaderNotify()), | 178 | render_pass_cache, buffer_cache, texture_cache, gpu.ShaderNotify()), |
| 175 | query_cache{*this, cpu_memory_, device, scheduler}, | ||
| 176 | accelerate_dma(buffer_cache, texture_cache, scheduler), | 179 | accelerate_dma(buffer_cache, texture_cache, scheduler), |
| 177 | fence_manager(*this, gpu, texture_cache, buffer_cache, query_cache, device, scheduler), | 180 | fence_manager(*this, gpu, texture_cache, buffer_cache, query_cache, device, scheduler), |
| 178 | wfi_event(device.GetLogical().CreateEvent()) { | 181 | wfi_event(device.GetLogical().CreateEvent()) { |
| @@ -189,14 +192,7 @@ void RasterizerVulkan::PrepareDraw(bool is_indexed, Func&& draw_func) { | |||
| 189 | FlushWork(); | 192 | FlushWork(); |
| 190 | gpu_memory->FlushCaching(); | 193 | gpu_memory->FlushCaching(); |
| 191 | 194 | ||
| 192 | #if ANDROID | 195 | query_cache.NotifySegment(true); |
| 193 | if (Settings::IsGPULevelHigh()) { | ||
| 194 | // This is problematic on Android, disable on GPU Normal. | ||
| 195 | query_cache.UpdateCounters(); | ||
| 196 | } | ||
| 197 | #else | ||
| 198 | query_cache.UpdateCounters(); | ||
| 199 | #endif | ||
| 200 | 196 | ||
| 201 | GraphicsPipeline* const pipeline{pipeline_cache.CurrentGraphicsPipeline()}; | 197 | GraphicsPipeline* const pipeline{pipeline_cache.CurrentGraphicsPipeline()}; |
| 202 | if (!pipeline) { | 198 | if (!pipeline) { |
| @@ -207,13 +203,12 @@ void RasterizerVulkan::PrepareDraw(bool is_indexed, Func&& draw_func) { | |||
| 207 | pipeline->SetEngine(maxwell3d, gpu_memory); | 203 | pipeline->SetEngine(maxwell3d, gpu_memory); |
| 208 | pipeline->Configure(is_indexed); | 204 | pipeline->Configure(is_indexed); |
| 209 | 205 | ||
| 210 | BeginTransformFeedback(); | ||
| 211 | |||
| 212 | UpdateDynamicStates(); | 206 | UpdateDynamicStates(); |
| 213 | 207 | ||
| 208 | HandleTransformFeedback(); | ||
| 209 | query_cache.CounterEnable(VideoCommon::QueryType::ZPassPixelCount64, | ||
| 210 | maxwell3d->regs.zpass_pixel_count_enable); | ||
| 214 | draw_func(); | 211 | draw_func(); |
| 215 | |||
| 216 | EndTransformFeedback(); | ||
| 217 | } | 212 | } |
| 218 | 213 | ||
| 219 | void RasterizerVulkan::Draw(bool is_indexed, u32 instance_count) { | 214 | void RasterizerVulkan::Draw(bool is_indexed, u32 instance_count) { |
| @@ -241,6 +236,14 @@ void RasterizerVulkan::DrawIndirect() { | |||
| 241 | const auto indirect_buffer = buffer_cache.GetDrawIndirectBuffer(); | 236 | const auto indirect_buffer = buffer_cache.GetDrawIndirectBuffer(); |
| 242 | const auto& buffer = indirect_buffer.first; | 237 | const auto& buffer = indirect_buffer.first; |
| 243 | const auto& offset = indirect_buffer.second; | 238 | const auto& offset = indirect_buffer.second; |
| 239 | if (params.is_byte_count) { | ||
| 240 | scheduler.Record([buffer_obj = buffer->Handle(), offset, | ||
| 241 | stride = params.stride](vk::CommandBuffer cmdbuf) { | ||
| 242 | cmdbuf.DrawIndirectByteCountEXT(1, 0, buffer_obj, offset, 0, | ||
| 243 | static_cast<u32>(stride)); | ||
| 244 | }); | ||
| 245 | return; | ||
| 246 | } | ||
| 244 | if (params.include_count) { | 247 | if (params.include_count) { |
| 245 | const auto count = buffer_cache.GetDrawIndirectCount(); | 248 | const auto count = buffer_cache.GetDrawIndirectCount(); |
| 246 | const auto& draw_buffer = count.first; | 249 | const auto& draw_buffer = count.first; |
| @@ -280,20 +283,15 @@ void RasterizerVulkan::DrawTexture() { | |||
| 280 | SCOPE_EXIT({ gpu.TickWork(); }); | 283 | SCOPE_EXIT({ gpu.TickWork(); }); |
| 281 | FlushWork(); | 284 | FlushWork(); |
| 282 | 285 | ||
| 283 | #if ANDROID | 286 | query_cache.NotifySegment(true); |
| 284 | if (Settings::IsGPULevelHigh()) { | ||
| 285 | // This is problematic on Android, disable on GPU Normal. | ||
| 286 | query_cache.UpdateCounters(); | ||
| 287 | } | ||
| 288 | #else | ||
| 289 | query_cache.UpdateCounters(); | ||
| 290 | #endif | ||
| 291 | 287 | ||
| 292 | texture_cache.SynchronizeGraphicsDescriptors(); | 288 | texture_cache.SynchronizeGraphicsDescriptors(); |
| 293 | texture_cache.UpdateRenderTargets(false); | 289 | texture_cache.UpdateRenderTargets(false); |
| 294 | 290 | ||
| 295 | UpdateDynamicStates(); | 291 | UpdateDynamicStates(); |
| 296 | 292 | ||
| 293 | query_cache.CounterEnable(VideoCommon::QueryType::ZPassPixelCount64, | ||
| 294 | maxwell3d->regs.zpass_pixel_count_enable); | ||
| 297 | const auto& draw_texture_state = maxwell3d->draw_manager->GetDrawTextureState(); | 295 | const auto& draw_texture_state = maxwell3d->draw_manager->GetDrawTextureState(); |
| 298 | const auto& sampler = texture_cache.GetGraphicsSampler(draw_texture_state.src_sampler); | 296 | const auto& sampler = texture_cache.GetGraphicsSampler(draw_texture_state.src_sampler); |
| 299 | const auto& texture = texture_cache.GetImageView(draw_texture_state.src_texture); | 297 | const auto& texture = texture_cache.GetImageView(draw_texture_state.src_texture); |
| @@ -316,14 +314,9 @@ void RasterizerVulkan::Clear(u32 layer_count) { | |||
| 316 | FlushWork(); | 314 | FlushWork(); |
| 317 | gpu_memory->FlushCaching(); | 315 | gpu_memory->FlushCaching(); |
| 318 | 316 | ||
| 319 | #if ANDROID | 317 | query_cache.NotifySegment(true); |
| 320 | if (Settings::IsGPULevelHigh()) { | 318 | query_cache.CounterEnable(VideoCommon::QueryType::ZPassPixelCount64, |
| 321 | // This is problematic on Android, disable on GPU Normal. | 319 | maxwell3d->regs.zpass_pixel_count_enable); |
| 322 | query_cache.UpdateCounters(); | ||
| 323 | } | ||
| 324 | #else | ||
| 325 | query_cache.UpdateCounters(); | ||
| 326 | #endif | ||
| 327 | 320 | ||
| 328 | auto& regs = maxwell3d->regs; | 321 | auto& regs = maxwell3d->regs; |
| 329 | const bool use_color = regs.clear_surface.R || regs.clear_surface.G || regs.clear_surface.B || | 322 | const bool use_color = regs.clear_surface.R || regs.clear_surface.G || regs.clear_surface.B || |
| @@ -482,13 +475,13 @@ void RasterizerVulkan::DispatchCompute() { | |||
| 482 | scheduler.Record([dim](vk::CommandBuffer cmdbuf) { cmdbuf.Dispatch(dim[0], dim[1], dim[2]); }); | 475 | scheduler.Record([dim](vk::CommandBuffer cmdbuf) { cmdbuf.Dispatch(dim[0], dim[1], dim[2]); }); |
| 483 | } | 476 | } |
| 484 | 477 | ||
| 485 | void RasterizerVulkan::ResetCounter(VideoCore::QueryType type) { | 478 | void RasterizerVulkan::ResetCounter(VideoCommon::QueryType type) { |
| 486 | query_cache.ResetCounter(type); | 479 | query_cache.CounterReset(type); |
| 487 | } | 480 | } |
| 488 | 481 | ||
| 489 | void RasterizerVulkan::Query(GPUVAddr gpu_addr, VideoCore::QueryType type, | 482 | void RasterizerVulkan::Query(GPUVAddr gpu_addr, VideoCommon::QueryType type, |
| 490 | std::optional<u64> timestamp) { | 483 | VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) { |
| 491 | query_cache.Query(gpu_addr, type, timestamp); | 484 | query_cache.CounterReport(gpu_addr, type, flags, payload, subreport); |
| 492 | } | 485 | } |
| 493 | 486 | ||
| 494 | void RasterizerVulkan::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, | 487 | void RasterizerVulkan::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, |
| @@ -669,8 +662,8 @@ void RasterizerVulkan::SignalReference() { | |||
| 669 | fence_manager.SignalReference(); | 662 | fence_manager.SignalReference(); |
| 670 | } | 663 | } |
| 671 | 664 | ||
| 672 | void RasterizerVulkan::ReleaseFences() { | 665 | void RasterizerVulkan::ReleaseFences(bool force) { |
| 673 | fence_manager.WaitPendingFences(); | 666 | fence_manager.WaitPendingFences(force); |
| 674 | } | 667 | } |
| 675 | 668 | ||
| 676 | void RasterizerVulkan::FlushAndInvalidateRegion(VAddr addr, u64 size, | 669 | void RasterizerVulkan::FlushAndInvalidateRegion(VAddr addr, u64 size, |
| @@ -694,6 +687,8 @@ void RasterizerVulkan::WaitForIdle() { | |||
| 694 | flags |= VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT; | 687 | flags |= VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT; |
| 695 | } | 688 | } |
| 696 | 689 | ||
| 690 | query_cache.NotifyWFI(); | ||
| 691 | |||
| 697 | scheduler.RequestOutsideRenderPassOperationContext(); | 692 | scheduler.RequestOutsideRenderPassOperationContext(); |
| 698 | scheduler.Record([event = *wfi_event, flags](vk::CommandBuffer cmdbuf) { | 693 | scheduler.Record([event = *wfi_event, flags](vk::CommandBuffer cmdbuf) { |
| 699 | cmdbuf.SetEvent(event, flags); | 694 | cmdbuf.SetEvent(event, flags); |
| @@ -737,19 +732,7 @@ void RasterizerVulkan::TickFrame() { | |||
| 737 | 732 | ||
| 738 | bool RasterizerVulkan::AccelerateConditionalRendering() { | 733 | bool RasterizerVulkan::AccelerateConditionalRendering() { |
| 739 | gpu_memory->FlushCaching(); | 734 | gpu_memory->FlushCaching(); |
| 740 | if (Settings::IsGPULevelHigh()) { | 735 | return query_cache.AccelerateHostConditionalRendering(); |
| 741 | // TODO(Blinkhawk): Reimplement Host conditional rendering. | ||
| 742 | return false; | ||
| 743 | } | ||
| 744 | // Medium / Low Hack: stub any checks on queries written into the buffer cache. | ||
| 745 | const GPUVAddr condition_address{maxwell3d->regs.render_enable.Address()}; | ||
| 746 | Maxwell::ReportSemaphore::Compare cmp; | ||
| 747 | if (gpu_memory->IsMemoryDirty(condition_address, sizeof(cmp), | ||
| 748 | VideoCommon::CacheType::BufferCache | | ||
| 749 | VideoCommon::CacheType::QueryCache)) { | ||
| 750 | return true; | ||
| 751 | } | ||
| 752 | return false; | ||
| 753 | } | 736 | } |
| 754 | 737 | ||
| 755 | bool RasterizerVulkan::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surface& src, | 738 | bool RasterizerVulkan::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surface& src, |
| @@ -795,6 +778,7 @@ bool RasterizerVulkan::AccelerateDisplay(const Tegra::FramebufferConfig& config, | |||
| 795 | if (!image_view) { | 778 | if (!image_view) { |
| 796 | return false; | 779 | return false; |
| 797 | } | 780 | } |
| 781 | query_cache.NotifySegment(false); | ||
| 798 | screen_info.image = image_view->ImageHandle(); | 782 | screen_info.image = image_view->ImageHandle(); |
| 799 | screen_info.image_view = image_view->Handle(Shader::TextureType::Color2D); | 783 | screen_info.image_view = image_view->Handle(Shader::TextureType::Color2D); |
| 800 | screen_info.width = image_view->size.width; | 784 | screen_info.width = image_view->size.width; |
| @@ -933,31 +917,18 @@ void RasterizerVulkan::UpdateDynamicStates() { | |||
| 933 | } | 917 | } |
| 934 | } | 918 | } |
| 935 | 919 | ||
| 936 | void RasterizerVulkan::BeginTransformFeedback() { | 920 | void RasterizerVulkan::HandleTransformFeedback() { |
| 937 | const auto& regs = maxwell3d->regs; | 921 | const auto& regs = maxwell3d->regs; |
| 938 | if (regs.transform_feedback_enabled == 0) { | ||
| 939 | return; | ||
| 940 | } | ||
| 941 | if (!device.IsExtTransformFeedbackSupported()) { | 922 | if (!device.IsExtTransformFeedbackSupported()) { |
| 942 | LOG_ERROR(Render_Vulkan, "Transform feedbacks used but not supported"); | 923 | LOG_ERROR(Render_Vulkan, "Transform feedbacks used but not supported"); |
| 943 | return; | 924 | return; |
| 944 | } | 925 | } |
| 945 | UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderType::TessellationInit) || | 926 | query_cache.CounterEnable(VideoCommon::QueryType::StreamingByteCount, |
| 946 | regs.IsShaderConfigEnabled(Maxwell::ShaderType::Tessellation)); | 927 | regs.transform_feedback_enabled); |
| 947 | scheduler.Record( | 928 | if (regs.transform_feedback_enabled != 0) { |
| 948 | [](vk::CommandBuffer cmdbuf) { cmdbuf.BeginTransformFeedbackEXT(0, 0, nullptr, nullptr); }); | 929 | UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderType::TessellationInit) || |
| 949 | } | 930 | regs.IsShaderConfigEnabled(Maxwell::ShaderType::Tessellation)); |
| 950 | |||
| 951 | void RasterizerVulkan::EndTransformFeedback() { | ||
| 952 | const auto& regs = maxwell3d->regs; | ||
| 953 | if (regs.transform_feedback_enabled == 0) { | ||
| 954 | return; | ||
| 955 | } | ||
| 956 | if (!device.IsExtTransformFeedbackSupported()) { | ||
| 957 | return; | ||
| 958 | } | 931 | } |
| 959 | scheduler.Record( | ||
| 960 | [](vk::CommandBuffer cmdbuf) { cmdbuf.EndTransformFeedbackEXT(0, 0, nullptr, nullptr); }); | ||
| 961 | } | 932 | } |
| 962 | 933 | ||
| 963 | void RasterizerVulkan::UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs) { | 934 | void RasterizerVulkan::UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs) { |
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index b31982485..ad069556c 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h | |||
| @@ -84,8 +84,9 @@ public: | |||
| 84 | void DrawTexture() override; | 84 | void DrawTexture() override; |
| 85 | void Clear(u32 layer_count) override; | 85 | void Clear(u32 layer_count) override; |
| 86 | void DispatchCompute() override; | 86 | void DispatchCompute() override; |
| 87 | void ResetCounter(VideoCore::QueryType type) override; | 87 | void ResetCounter(VideoCommon::QueryType type) override; |
| 88 | void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) override; | 88 | void Query(GPUVAddr gpu_addr, VideoCommon::QueryType type, |
| 89 | VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) override; | ||
| 89 | void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override; | 90 | void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override; |
| 90 | void DisableGraphicsUniformBuffer(size_t stage, u32 index) override; | 91 | void DisableGraphicsUniformBuffer(size_t stage, u32 index) override; |
| 91 | void FlushAll() override; | 92 | void FlushAll() override; |
| @@ -106,7 +107,7 @@ public: | |||
| 106 | void SyncOperation(std::function<void()>&& func) override; | 107 | void SyncOperation(std::function<void()>&& func) override; |
| 107 | void SignalSyncPoint(u32 value) override; | 108 | void SignalSyncPoint(u32 value) override; |
| 108 | void SignalReference() override; | 109 | void SignalReference() override; |
| 109 | void ReleaseFences() override; | 110 | void ReleaseFences(bool force = true) override; |
| 110 | void FlushAndInvalidateRegion( | 111 | void FlushAndInvalidateRegion( |
| 111 | VAddr addr, u64 size, VideoCommon::CacheType which = VideoCommon::CacheType::All) override; | 112 | VAddr addr, u64 size, VideoCommon::CacheType which = VideoCommon::CacheType::All) override; |
| 112 | void WaitForIdle() override; | 113 | void WaitForIdle() override; |
| @@ -146,9 +147,7 @@ private: | |||
| 146 | 147 | ||
| 147 | void UpdateDynamicStates(); | 148 | void UpdateDynamicStates(); |
| 148 | 149 | ||
| 149 | void BeginTransformFeedback(); | 150 | void HandleTransformFeedback(); |
| 150 | |||
| 151 | void EndTransformFeedback(); | ||
| 152 | 151 | ||
| 153 | void UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs); | 152 | void UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs); |
| 154 | void UpdateScissorsState(Tegra::Engines::Maxwell3D::Regs& regs); | 153 | void UpdateScissorsState(Tegra::Engines::Maxwell3D::Regs& regs); |
| @@ -195,8 +194,9 @@ private: | |||
| 195 | TextureCache texture_cache; | 194 | TextureCache texture_cache; |
| 196 | BufferCacheRuntime buffer_cache_runtime; | 195 | BufferCacheRuntime buffer_cache_runtime; |
| 197 | BufferCache buffer_cache; | 196 | BufferCache buffer_cache; |
| 198 | PipelineCache pipeline_cache; | 197 | QueryCacheRuntime query_cache_runtime; |
| 199 | QueryCache query_cache; | 198 | QueryCache query_cache; |
| 199 | PipelineCache pipeline_cache; | ||
| 200 | AccelerateDMA accelerate_dma; | 200 | AccelerateDMA accelerate_dma; |
| 201 | FenceManager fence_manager; | 201 | FenceManager fence_manager; |
| 202 | 202 | ||
diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp index 89fd31b4f..3be7837f4 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.cpp +++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp | |||
| @@ -243,10 +243,10 @@ void Scheduler::AllocateNewContext() { | |||
| 243 | #if ANDROID | 243 | #if ANDROID |
| 244 | if (Settings::IsGPULevelHigh()) { | 244 | if (Settings::IsGPULevelHigh()) { |
| 245 | // This is problematic on Android, disable on GPU Normal. | 245 | // This is problematic on Android, disable on GPU Normal. |
| 246 | query_cache->UpdateCounters(); | 246 | query_cache->NotifySegment(true); |
| 247 | } | 247 | } |
| 248 | #else | 248 | #else |
| 249 | query_cache->UpdateCounters(); | 249 | query_cache->NotifySegment(true); |
| 250 | #endif | 250 | #endif |
| 251 | } | 251 | } |
| 252 | } | 252 | } |
| @@ -261,11 +261,12 @@ void Scheduler::EndPendingOperations() { | |||
| 261 | #if ANDROID | 261 | #if ANDROID |
| 262 | if (Settings::IsGPULevelHigh()) { | 262 | if (Settings::IsGPULevelHigh()) { |
| 263 | // This is problematic on Android, disable on GPU Normal. | 263 | // This is problematic on Android, disable on GPU Normal. |
| 264 | query_cache->DisableStreams(); | 264 | // query_cache->DisableStreams(); |
| 265 | } | 265 | } |
| 266 | #else | 266 | #else |
| 267 | query_cache->DisableStreams(); | 267 | // query_cache->DisableStreams(); |
| 268 | #endif | 268 | #endif |
| 269 | query_cache->NotifySegment(false); | ||
| 269 | EndRenderPass(); | 270 | EndRenderPass(); |
| 270 | } | 271 | } |
| 271 | 272 | ||
diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h index 475c682eb..da03803aa 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.h +++ b/src/video_core/renderer_vulkan/vk_scheduler.h | |||
| @@ -17,6 +17,11 @@ | |||
| 17 | #include "video_core/renderer_vulkan/vk_master_semaphore.h" | 17 | #include "video_core/renderer_vulkan/vk_master_semaphore.h" |
| 18 | #include "video_core/vulkan_common/vulkan_wrapper.h" | 18 | #include "video_core/vulkan_common/vulkan_wrapper.h" |
| 19 | 19 | ||
| 20 | namespace VideoCommon { | ||
| 21 | template <typename Trait> | ||
| 22 | class QueryCacheBase; | ||
| 23 | } | ||
| 24 | |||
| 20 | namespace Vulkan { | 25 | namespace Vulkan { |
| 21 | 26 | ||
| 22 | class CommandPool; | 27 | class CommandPool; |
| @@ -24,7 +29,8 @@ class Device; | |||
| 24 | class Framebuffer; | 29 | class Framebuffer; |
| 25 | class GraphicsPipeline; | 30 | class GraphicsPipeline; |
| 26 | class StateTracker; | 31 | class StateTracker; |
| 27 | class QueryCache; | 32 | |
| 33 | struct QueryCacheParams; | ||
| 28 | 34 | ||
| 29 | /// The scheduler abstracts command buffer and fence management with an interface that's able to do | 35 | /// The scheduler abstracts command buffer and fence management with an interface that's able to do |
| 30 | /// OpenGL-like operations on Vulkan command buffers. | 36 | /// OpenGL-like operations on Vulkan command buffers. |
| @@ -63,7 +69,7 @@ public: | |||
| 63 | void InvalidateState(); | 69 | void InvalidateState(); |
| 64 | 70 | ||
| 65 | /// Assigns the query cache. | 71 | /// Assigns the query cache. |
| 66 | void SetQueryCache(QueryCache& query_cache_) { | 72 | void SetQueryCache(VideoCommon::QueryCacheBase<QueryCacheParams>& query_cache_) { |
| 67 | query_cache = &query_cache_; | 73 | query_cache = &query_cache_; |
| 68 | } | 74 | } |
| 69 | 75 | ||
| @@ -219,7 +225,7 @@ private: | |||
| 219 | std::unique_ptr<MasterSemaphore> master_semaphore; | 225 | std::unique_ptr<MasterSemaphore> master_semaphore; |
| 220 | std::unique_ptr<CommandPool> command_pool; | 226 | std::unique_ptr<CommandPool> command_pool; |
| 221 | 227 | ||
| 222 | QueryCache* query_cache = nullptr; | 228 | VideoCommon::QueryCacheBase<QueryCacheParams>* query_cache = nullptr; |
| 223 | 229 | ||
| 224 | vk::CommandBuffer current_cmdbuf; | 230 | vk::CommandBuffer current_cmdbuf; |
| 225 | 231 | ||
diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h index 8c5355a28..94f41266d 100644 --- a/src/video_core/vulkan_common/vulkan_device.h +++ b/src/video_core/vulkan_common/vulkan_device.h | |||
| @@ -61,6 +61,7 @@ VK_DEFINE_HANDLE(VmaAllocator) | |||
| 61 | 61 | ||
| 62 | // Define miscellaneous extensions which may be used by the implementation here. | 62 | // Define miscellaneous extensions which may be used by the implementation here. |
| 63 | #define FOR_EACH_VK_EXTENSION(EXTENSION) \ | 63 | #define FOR_EACH_VK_EXTENSION(EXTENSION) \ |
| 64 | EXTENSION(EXT, CONDITIONAL_RENDERING, conditional_rendering) \ | ||
| 64 | EXTENSION(EXT, CONSERVATIVE_RASTERIZATION, conservative_rasterization) \ | 65 | EXTENSION(EXT, CONSERVATIVE_RASTERIZATION, conservative_rasterization) \ |
| 65 | EXTENSION(EXT, DEPTH_RANGE_UNRESTRICTED, depth_range_unrestricted) \ | 66 | EXTENSION(EXT, DEPTH_RANGE_UNRESTRICTED, depth_range_unrestricted) \ |
| 66 | EXTENSION(EXT, MEMORY_BUDGET, memory_budget) \ | 67 | EXTENSION(EXT, MEMORY_BUDGET, memory_budget) \ |
| @@ -93,6 +94,7 @@ VK_DEFINE_HANDLE(VmaAllocator) | |||
| 93 | 94 | ||
| 94 | // Define extensions where the absence of the extension may result in a degraded experience. | 95 | // Define extensions where the absence of the extension may result in a degraded experience. |
| 95 | #define FOR_EACH_VK_RECOMMENDED_EXTENSION(EXTENSION_NAME) \ | 96 | #define FOR_EACH_VK_RECOMMENDED_EXTENSION(EXTENSION_NAME) \ |
| 97 | EXTENSION_NAME(VK_EXT_CONDITIONAL_RENDERING_EXTENSION_NAME) \ | ||
| 96 | EXTENSION_NAME(VK_EXT_CONSERVATIVE_RASTERIZATION_EXTENSION_NAME) \ | 98 | EXTENSION_NAME(VK_EXT_CONSERVATIVE_RASTERIZATION_EXTENSION_NAME) \ |
| 97 | EXTENSION_NAME(VK_EXT_DEPTH_RANGE_UNRESTRICTED_EXTENSION_NAME) \ | 99 | EXTENSION_NAME(VK_EXT_DEPTH_RANGE_UNRESTRICTED_EXTENSION_NAME) \ |
| 98 | EXTENSION_NAME(VK_EXT_EXTENDED_DYNAMIC_STATE_EXTENSION_NAME) \ | 100 | EXTENSION_NAME(VK_EXT_EXTENDED_DYNAMIC_STATE_EXTENSION_NAME) \ |
| @@ -541,6 +543,10 @@ public: | |||
| 541 | return extensions.shader_atomic_int64; | 543 | return extensions.shader_atomic_int64; |
| 542 | } | 544 | } |
| 543 | 545 | ||
| 546 | bool IsExtConditionalRendering() const { | ||
| 547 | return extensions.conditional_rendering; | ||
| 548 | } | ||
| 549 | |||
| 544 | bool HasTimelineSemaphore() const; | 550 | bool HasTimelineSemaphore() const; |
| 545 | 551 | ||
| 546 | /// Returns the minimum supported version of SPIR-V. | 552 | /// Returns the minimum supported version of SPIR-V. |
diff --git a/src/video_core/vulkan_common/vulkan_wrapper.cpp b/src/video_core/vulkan_common/vulkan_wrapper.cpp index c3f388d89..5afba365c 100644 --- a/src/video_core/vulkan_common/vulkan_wrapper.cpp +++ b/src/video_core/vulkan_common/vulkan_wrapper.cpp | |||
| @@ -75,6 +75,7 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept { | |||
| 75 | X(vkBeginCommandBuffer); | 75 | X(vkBeginCommandBuffer); |
| 76 | X(vkBindBufferMemory); | 76 | X(vkBindBufferMemory); |
| 77 | X(vkBindImageMemory); | 77 | X(vkBindImageMemory); |
| 78 | X(vkCmdBeginConditionalRenderingEXT); | ||
| 78 | X(vkCmdBeginQuery); | 79 | X(vkCmdBeginQuery); |
| 79 | X(vkCmdBeginRenderPass); | 80 | X(vkCmdBeginRenderPass); |
| 80 | X(vkCmdBeginTransformFeedbackEXT); | 81 | X(vkCmdBeginTransformFeedbackEXT); |
| @@ -91,6 +92,7 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept { | |||
| 91 | X(vkCmdCopyBufferToImage); | 92 | X(vkCmdCopyBufferToImage); |
| 92 | X(vkCmdCopyImage); | 93 | X(vkCmdCopyImage); |
| 93 | X(vkCmdCopyImageToBuffer); | 94 | X(vkCmdCopyImageToBuffer); |
| 95 | X(vkCmdCopyQueryPoolResults); | ||
| 94 | X(vkCmdDispatch); | 96 | X(vkCmdDispatch); |
| 95 | X(vkCmdDispatchIndirect); | 97 | X(vkCmdDispatchIndirect); |
| 96 | X(vkCmdDraw); | 98 | X(vkCmdDraw); |
| @@ -99,6 +101,8 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept { | |||
| 99 | X(vkCmdDrawIndexedIndirect); | 101 | X(vkCmdDrawIndexedIndirect); |
| 100 | X(vkCmdDrawIndirectCount); | 102 | X(vkCmdDrawIndirectCount); |
| 101 | X(vkCmdDrawIndexedIndirectCount); | 103 | X(vkCmdDrawIndexedIndirectCount); |
| 104 | X(vkCmdDrawIndirectByteCountEXT); | ||
| 105 | X(vkCmdEndConditionalRenderingEXT); | ||
| 102 | X(vkCmdEndQuery); | 106 | X(vkCmdEndQuery); |
| 103 | X(vkCmdEndRenderPass); | 107 | X(vkCmdEndRenderPass); |
| 104 | X(vkCmdEndTransformFeedbackEXT); | 108 | X(vkCmdEndTransformFeedbackEXT); |
diff --git a/src/video_core/vulkan_common/vulkan_wrapper.h b/src/video_core/vulkan_common/vulkan_wrapper.h index 049fa8038..0d4bbe7f7 100644 --- a/src/video_core/vulkan_common/vulkan_wrapper.h +++ b/src/video_core/vulkan_common/vulkan_wrapper.h | |||
| @@ -185,6 +185,7 @@ struct DeviceDispatch : InstanceDispatch { | |||
| 185 | PFN_vkBeginCommandBuffer vkBeginCommandBuffer{}; | 185 | PFN_vkBeginCommandBuffer vkBeginCommandBuffer{}; |
| 186 | PFN_vkBindBufferMemory vkBindBufferMemory{}; | 186 | PFN_vkBindBufferMemory vkBindBufferMemory{}; |
| 187 | PFN_vkBindImageMemory vkBindImageMemory{}; | 187 | PFN_vkBindImageMemory vkBindImageMemory{}; |
| 188 | PFN_vkCmdBeginConditionalRenderingEXT vkCmdBeginConditionalRenderingEXT{}; | ||
| 188 | PFN_vkCmdBeginDebugUtilsLabelEXT vkCmdBeginDebugUtilsLabelEXT{}; | 189 | PFN_vkCmdBeginDebugUtilsLabelEXT vkCmdBeginDebugUtilsLabelEXT{}; |
| 189 | PFN_vkCmdBeginQuery vkCmdBeginQuery{}; | 190 | PFN_vkCmdBeginQuery vkCmdBeginQuery{}; |
| 190 | PFN_vkCmdBeginRenderPass vkCmdBeginRenderPass{}; | 191 | PFN_vkCmdBeginRenderPass vkCmdBeginRenderPass{}; |
| @@ -202,6 +203,7 @@ struct DeviceDispatch : InstanceDispatch { | |||
| 202 | PFN_vkCmdCopyBufferToImage vkCmdCopyBufferToImage{}; | 203 | PFN_vkCmdCopyBufferToImage vkCmdCopyBufferToImage{}; |
| 203 | PFN_vkCmdCopyImage vkCmdCopyImage{}; | 204 | PFN_vkCmdCopyImage vkCmdCopyImage{}; |
| 204 | PFN_vkCmdCopyImageToBuffer vkCmdCopyImageToBuffer{}; | 205 | PFN_vkCmdCopyImageToBuffer vkCmdCopyImageToBuffer{}; |
| 206 | PFN_vkCmdCopyQueryPoolResults vkCmdCopyQueryPoolResults{}; | ||
| 205 | PFN_vkCmdDispatch vkCmdDispatch{}; | 207 | PFN_vkCmdDispatch vkCmdDispatch{}; |
| 206 | PFN_vkCmdDispatchIndirect vkCmdDispatchIndirect{}; | 208 | PFN_vkCmdDispatchIndirect vkCmdDispatchIndirect{}; |
| 207 | PFN_vkCmdDraw vkCmdDraw{}; | 209 | PFN_vkCmdDraw vkCmdDraw{}; |
| @@ -210,6 +212,8 @@ struct DeviceDispatch : InstanceDispatch { | |||
| 210 | PFN_vkCmdDrawIndexedIndirect vkCmdDrawIndexedIndirect{}; | 212 | PFN_vkCmdDrawIndexedIndirect vkCmdDrawIndexedIndirect{}; |
| 211 | PFN_vkCmdDrawIndirectCount vkCmdDrawIndirectCount{}; | 213 | PFN_vkCmdDrawIndirectCount vkCmdDrawIndirectCount{}; |
| 212 | PFN_vkCmdDrawIndexedIndirectCount vkCmdDrawIndexedIndirectCount{}; | 214 | PFN_vkCmdDrawIndexedIndirectCount vkCmdDrawIndexedIndirectCount{}; |
| 215 | PFN_vkCmdDrawIndirectByteCountEXT vkCmdDrawIndirectByteCountEXT{}; | ||
| 216 | PFN_vkCmdEndConditionalRenderingEXT vkCmdEndConditionalRenderingEXT{}; | ||
| 213 | PFN_vkCmdEndDebugUtilsLabelEXT vkCmdEndDebugUtilsLabelEXT{}; | 217 | PFN_vkCmdEndDebugUtilsLabelEXT vkCmdEndDebugUtilsLabelEXT{}; |
| 214 | PFN_vkCmdEndQuery vkCmdEndQuery{}; | 218 | PFN_vkCmdEndQuery vkCmdEndQuery{}; |
| 215 | PFN_vkCmdEndRenderPass vkCmdEndRenderPass{}; | 219 | PFN_vkCmdEndRenderPass vkCmdEndRenderPass{}; |
| @@ -1182,6 +1186,13 @@ public: | |||
| 1182 | count_offset, draw_count, stride); | 1186 | count_offset, draw_count, stride); |
| 1183 | } | 1187 | } |
| 1184 | 1188 | ||
| 1189 | void DrawIndirectByteCountEXT(u32 instance_count, u32 first_instance, VkBuffer counter_buffer, | ||
| 1190 | VkDeviceSize counter_buffer_offset, u32 counter_offset, | ||
| 1191 | u32 stride) { | ||
| 1192 | dld->vkCmdDrawIndirectByteCountEXT(handle, instance_count, first_instance, counter_buffer, | ||
| 1193 | counter_buffer_offset, counter_offset, stride); | ||
| 1194 | } | ||
| 1195 | |||
| 1185 | void ClearAttachments(Span<VkClearAttachment> attachments, | 1196 | void ClearAttachments(Span<VkClearAttachment> attachments, |
| 1186 | Span<VkClearRect> rects) const noexcept { | 1197 | Span<VkClearRect> rects) const noexcept { |
| 1187 | dld->vkCmdClearAttachments(handle, attachments.size(), attachments.data(), rects.size(), | 1198 | dld->vkCmdClearAttachments(handle, attachments.size(), attachments.data(), rects.size(), |
| @@ -1270,6 +1281,13 @@ public: | |||
| 1270 | regions.data()); | 1281 | regions.data()); |
| 1271 | } | 1282 | } |
| 1272 | 1283 | ||
| 1284 | void CopyQueryPoolResults(VkQueryPool query_pool, u32 first_query, u32 query_count, | ||
| 1285 | VkBuffer dst_buffer, VkDeviceSize dst_offset, VkDeviceSize stride, | ||
| 1286 | VkQueryResultFlags flags) const noexcept { | ||
| 1287 | dld->vkCmdCopyQueryPoolResults(handle, query_pool, first_query, query_count, dst_buffer, | ||
| 1288 | dst_offset, stride, flags); | ||
| 1289 | } | ||
| 1290 | |||
| 1273 | void FillBuffer(VkBuffer dst_buffer, VkDeviceSize dst_offset, VkDeviceSize size, | 1291 | void FillBuffer(VkBuffer dst_buffer, VkDeviceSize dst_offset, VkDeviceSize size, |
| 1274 | u32 data) const noexcept { | 1292 | u32 data) const noexcept { |
| 1275 | dld->vkCmdFillBuffer(handle, dst_buffer, dst_offset, size, data); | 1293 | dld->vkCmdFillBuffer(handle, dst_buffer, dst_offset, size, data); |
| @@ -1448,6 +1466,15 @@ public: | |||
| 1448 | counter_buffers, counter_buffer_offsets); | 1466 | counter_buffers, counter_buffer_offsets); |
| 1449 | } | 1467 | } |
| 1450 | 1468 | ||
| 1469 | void BeginConditionalRenderingEXT( | ||
| 1470 | const VkConditionalRenderingBeginInfoEXT& info) const noexcept { | ||
| 1471 | dld->vkCmdBeginConditionalRenderingEXT(handle, &info); | ||
| 1472 | } | ||
| 1473 | |||
| 1474 | void EndConditionalRenderingEXT() const noexcept { | ||
| 1475 | dld->vkCmdEndConditionalRenderingEXT(handle); | ||
| 1476 | } | ||
| 1477 | |||
| 1451 | void BeginDebugUtilsLabelEXT(const char* label, std::span<float, 4> color) const noexcept { | 1478 | void BeginDebugUtilsLabelEXT(const char* label, std::span<float, 4> color) const noexcept { |
| 1452 | const VkDebugUtilsLabelEXT label_info{ | 1479 | const VkDebugUtilsLabelEXT label_info{ |
| 1453 | .sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT, | 1480 | .sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT, |