diff options
| author | 2023-08-24 03:58:59 +0200 | |
|---|---|---|
| committer | 2023-09-23 23:05:30 +0200 | |
| commit | 57d8cd6c40bbadeb30e7a4792267061cbad4d446 (patch) | |
| tree | 4ed8c078eee5983e875e5104cf0ff61242964185 /src/video_core/renderer_vulkan | |
| parent | Query Cache: Fix behavior in Normal Accuracy (diff) | |
| download | yuzu-57d8cd6c40bbadeb30e7a4792267061cbad4d446.tar.gz yuzu-57d8cd6c40bbadeb30e7a4792267061cbad4d446.tar.xz yuzu-57d8cd6c40bbadeb30e7a4792267061cbad4d446.zip | |
Query Cache: Fix Prefix Sums
Diffstat (limited to 'src/video_core/renderer_vulkan')
| -rw-r--r-- | src/video_core/renderer_vulkan/vk_compute_pass.cpp | 101 | ||||
| -rw-r--r-- | src/video_core/renderer_vulkan/vk_compute_pass.h | 2 | ||||
| -rw-r--r-- | src/video_core/renderer_vulkan/vk_query_cache.cpp | 13 |
3 files changed, 66 insertions, 50 deletions
diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp index 44ec5a032..289d5b25c 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp | |||
| @@ -179,8 +179,10 @@ struct AstcPushConstants { | |||
| 179 | }; | 179 | }; |
| 180 | 180 | ||
| 181 | struct QueriesPrefixScanPushConstants { | 181 | struct QueriesPrefixScanPushConstants { |
| 182 | u32 min_accumulation_base; | ||
| 182 | u32 max_accumulation_base; | 183 | u32 max_accumulation_base; |
| 183 | u32 accumulation_limit; | 184 | u32 accumulation_limit; |
| 185 | u32 buffer_offset; | ||
| 184 | }; | 186 | }; |
| 185 | } // Anonymous namespace | 187 | } // Anonymous namespace |
| 186 | 188 | ||
| @@ -416,56 +418,65 @@ QueriesPrefixScanPass::QueriesPrefixScanPass( | |||
| 416 | device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_SHUFFLE_BIT) && | 418 | device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_SHUFFLE_BIT) && |
| 417 | device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT) | 419 | device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT) |
| 418 | ? std::span<const u32>(QUERIES_PREFIX_SCAN_SUM_COMP_SPV) | 420 | ? std::span<const u32>(QUERIES_PREFIX_SCAN_SUM_COMP_SPV) |
| 419 | : std::span<const u32>(QUERIES_PREFIX_SCAN_SUM_NOSUBGROUPS_COMP_SPV), | 421 | : std::span<const u32>(QUERIES_PREFIX_SCAN_SUM_NOSUBGROUPS_COMP_SPV)), |
| 420 | {32}), | ||
| 421 | scheduler{scheduler_}, compute_pass_descriptor_queue{compute_pass_descriptor_queue_} {} | 422 | scheduler{scheduler_}, compute_pass_descriptor_queue{compute_pass_descriptor_queue_} {} |
| 422 | 423 | ||
| 423 | void QueriesPrefixScanPass::Run(VkBuffer accumulation_buffer, VkBuffer dst_buffer, | 424 | void QueriesPrefixScanPass::Run(VkBuffer accumulation_buffer, VkBuffer dst_buffer, |
| 424 | VkBuffer src_buffer, size_t number_of_sums, | 425 | VkBuffer src_buffer, size_t number_of_sums, |
| 425 | size_t max_accumulation_limit) { | 426 | size_t min_accumulation_limit, size_t max_accumulation_limit) { |
| 426 | size_t aligned_runs = Common::AlignUp(number_of_sums, 32); | 427 | size_t current_runs = number_of_sums; |
| 427 | 428 | size_t offset = 0; | |
| 428 | compute_pass_descriptor_queue.Acquire(); | 429 | while (current_runs != 0) { |
| 429 | compute_pass_descriptor_queue.AddBuffer(src_buffer, 0, aligned_runs * sizeof(u64)); | 430 | static constexpr size_t DISPATCH_SIZE = 2048U; |
| 430 | compute_pass_descriptor_queue.AddBuffer(dst_buffer, 0, aligned_runs * sizeof(u64)); | 431 | size_t runs_to_do = std::min<size_t>(current_runs, DISPATCH_SIZE); |
| 431 | compute_pass_descriptor_queue.AddBuffer(accumulation_buffer, 0, sizeof(u64)); | 432 | current_runs -= runs_to_do; |
| 432 | const void* const descriptor_data{compute_pass_descriptor_queue.UpdateData()}; | 433 | compute_pass_descriptor_queue.Acquire(); |
| 433 | 434 | compute_pass_descriptor_queue.AddBuffer(src_buffer, 0, number_of_sums * sizeof(u64)); | |
| 434 | scheduler.RequestOutsideRenderPassOperationContext(); | 435 | compute_pass_descriptor_queue.AddBuffer(dst_buffer, 0, number_of_sums * sizeof(u64)); |
| 435 | scheduler.Record([this, descriptor_data, max_accumulation_limit, number_of_sums, | 436 | compute_pass_descriptor_queue.AddBuffer(accumulation_buffer, 0, sizeof(u64)); |
| 436 | aligned_runs](vk::CommandBuffer cmdbuf) { | 437 | const void* const descriptor_data{compute_pass_descriptor_queue.UpdateData()}; |
| 437 | static constexpr VkMemoryBarrier read_barrier{ | 438 | size_t used_offset = offset; |
| 438 | .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, | 439 | offset += runs_to_do; |
| 439 | .pNext = nullptr, | 440 | |
| 440 | .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, | 441 | scheduler.RequestOutsideRenderPassOperationContext(); |
| 441 | .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, | 442 | scheduler.Record([this, descriptor_data, min_accumulation_limit, max_accumulation_limit, |
| 442 | }; | 443 | runs_to_do, used_offset](vk::CommandBuffer cmdbuf) { |
| 443 | static constexpr VkMemoryBarrier write_barrier{ | 444 | static constexpr VkMemoryBarrier read_barrier{ |
| 444 | .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, | 445 | .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, |
| 445 | .pNext = nullptr, | 446 | .pNext = nullptr, |
| 446 | .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT, | 447 | .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, |
| 447 | .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_TRANSFER_READ_BIT | | 448 | .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, |
| 448 | VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT | | 449 | }; |
| 449 | VK_ACCESS_INDIRECT_COMMAND_READ_BIT | VK_ACCESS_INDEX_READ_BIT | | 450 | static constexpr VkMemoryBarrier write_barrier{ |
| 450 | VK_ACCESS_UNIFORM_READ_BIT | | 451 | .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, |
| 451 | VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT, | 452 | .pNext = nullptr, |
| 452 | }; | 453 | .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT, |
| 453 | const QueriesPrefixScanPushConstants uniforms{ | 454 | .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_TRANSFER_READ_BIT | |
| 454 | .max_accumulation_base = static_cast<u32>(max_accumulation_limit), | 455 | VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT | |
| 455 | .accumulation_limit = static_cast<u32>(number_of_sums - 1), | 456 | VK_ACCESS_INDIRECT_COMMAND_READ_BIT | VK_ACCESS_INDEX_READ_BIT | |
| 456 | }; | 457 | VK_ACCESS_UNIFORM_READ_BIT | |
| 457 | const VkDescriptorSet set = descriptor_allocator.Commit(); | 458 | VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT, |
| 458 | device.GetLogical().UpdateDescriptorSet(set, *descriptor_template, descriptor_data); | 459 | }; |
| 460 | const QueriesPrefixScanPushConstants uniforms{ | ||
| 461 | .min_accumulation_base = static_cast<u32>(min_accumulation_limit), | ||
| 462 | .max_accumulation_base = static_cast<u32>(max_accumulation_limit), | ||
| 463 | .accumulation_limit = static_cast<u32>(runs_to_do - 1), | ||
| 464 | .buffer_offset = static_cast<u32>(used_offset), | ||
| 465 | }; | ||
| 466 | const VkDescriptorSet set = descriptor_allocator.Commit(); | ||
| 467 | device.GetLogical().UpdateDescriptorSet(set, *descriptor_template, descriptor_data); | ||
| 459 | 468 | ||
| 460 | cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, | 469 | cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, |
| 461 | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, read_barrier); | 470 | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, read_barrier); |
| 462 | cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline); | 471 | cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline); |
| 463 | cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *layout, 0, set, {}); | 472 | cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *layout, 0, set, {}); |
| 464 | cmdbuf.PushConstants(*layout, VK_SHADER_STAGE_COMPUTE_BIT, uniforms); | 473 | cmdbuf.PushConstants(*layout, VK_SHADER_STAGE_COMPUTE_BIT, uniforms); |
| 465 | cmdbuf.Dispatch(static_cast<u32>(aligned_runs / 32U), 1, 1); | 474 | cmdbuf.Dispatch(1, 1, 1); |
| 466 | cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, | 475 | cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, |
| 467 | VK_PIPELINE_STAGE_CONDITIONAL_RENDERING_BIT_EXT, 0, write_barrier); | 476 | VK_PIPELINE_STAGE_CONDITIONAL_RENDERING_BIT_EXT, 0, |
| 468 | }); | 477 | write_barrier); |
| 478 | }); | ||
| 479 | } | ||
| 469 | } | 480 | } |
| 470 | 481 | ||
| 471 | ASTCDecoderPass::ASTCDecoderPass(const Device& device_, Scheduler& scheduler_, | 482 | ASTCDecoderPass::ASTCDecoderPass(const Device& device_, Scheduler& scheduler_, |
diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.h b/src/video_core/renderer_vulkan/vk_compute_pass.h index 68ffb1b82..3ff935639 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.h +++ b/src/video_core/renderer_vulkan/vk_compute_pass.h | |||
| @@ -104,7 +104,7 @@ public: | |||
| 104 | ComputePassDescriptorQueue& compute_pass_descriptor_queue_); | 104 | ComputePassDescriptorQueue& compute_pass_descriptor_queue_); |
| 105 | 105 | ||
| 106 | void Run(VkBuffer accumulation_buffer, VkBuffer dst_buffer, VkBuffer src_buffer, | 106 | void Run(VkBuffer accumulation_buffer, VkBuffer dst_buffer, VkBuffer src_buffer, |
| 107 | size_t number_of_sums, size_t max_accumulation_limit); | 107 | size_t number_of_sums, size_t min_accumulation_limit, size_t max_accumulation_limit); |
| 108 | 108 | ||
| 109 | private: | 109 | private: |
| 110 | Scheduler& scheduler; | 110 | Scheduler& scheduler; |
diff --git a/src/video_core/renderer_vulkan/vk_query_cache.cpp b/src/video_core/renderer_vulkan/vk_query_cache.cpp index 2cc007716..a32da3ba3 100644 --- a/src/video_core/renderer_vulkan/vk_query_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_query_cache.cpp | |||
| @@ -181,7 +181,8 @@ public: | |||
| 181 | }); | 181 | }); |
| 182 | rasterizer->SyncOperation(std::move(func)); | 182 | rasterizer->SyncOperation(std::move(func)); |
| 183 | accumulation_since_last_sync = false; | 183 | accumulation_since_last_sync = false; |
| 184 | last_accumulation_checkpoint = std::min(last_accumulation_checkpoint, num_slots_used); | 184 | first_accumulation_checkpoint = std::min(first_accumulation_checkpoint, num_slots_used); |
| 185 | last_accumulation_checkpoint = std::max(last_accumulation_checkpoint, num_slots_used); | ||
| 185 | } | 186 | } |
| 186 | 187 | ||
| 187 | void CloseCounter() override { | 188 | void CloseCounter() override { |
| @@ -285,7 +286,9 @@ public: | |||
| 285 | resolve_buffers.push_back(intermediary_buffer_index); | 286 | resolve_buffers.push_back(intermediary_buffer_index); |
| 286 | queries_prefix_scan_pass->Run(*accumulation_buffer, *buffers[intermediary_buffer_index], | 287 | queries_prefix_scan_pass->Run(*accumulation_buffer, *buffers[intermediary_buffer_index], |
| 287 | *buffers[resolve_buffer_index], num_slots_used, | 288 | *buffers[resolve_buffer_index], num_slots_used, |
| 288 | std::min(last_accumulation_checkpoint, num_slots_used)); | 289 | std::min(first_accumulation_checkpoint, num_slots_used), |
| 290 | last_accumulation_checkpoint); | ||
| 291 | |||
| 289 | } else { | 292 | } else { |
| 290 | scheduler.RequestOutsideRenderPassOperationContext(); | 293 | scheduler.RequestOutsideRenderPassOperationContext(); |
| 291 | scheduler.Record([buffer = *accumulation_buffer](vk::CommandBuffer cmdbuf) { | 294 | scheduler.Record([buffer = *accumulation_buffer](vk::CommandBuffer cmdbuf) { |
| @@ -298,7 +301,8 @@ public: | |||
| 298 | rasterizer->SyncOperation(std::move(func)); | 301 | rasterizer->SyncOperation(std::move(func)); |
| 299 | AbandonCurrentQuery(); | 302 | AbandonCurrentQuery(); |
| 300 | num_slots_used = 0; | 303 | num_slots_used = 0; |
| 301 | last_accumulation_checkpoint = std::numeric_limits<size_t>::max(); | 304 | first_accumulation_checkpoint = std::numeric_limits<size_t>::max(); |
| 305 | last_accumulation_checkpoint = 0; | ||
| 302 | accumulation_since_last_sync = has_multi_queries; | 306 | accumulation_since_last_sync = has_multi_queries; |
| 303 | pending_sync.clear(); | 307 | pending_sync.clear(); |
| 304 | } | 308 | } |
| @@ -506,7 +510,7 @@ private: | |||
| 506 | 510 | ||
| 507 | template <bool is_resolve> | 511 | template <bool is_resolve> |
| 508 | size_t ObtainBuffer(size_t num_needed) { | 512 | size_t ObtainBuffer(size_t num_needed) { |
| 509 | const size_t log_2 = std::max<size_t>(6U, Common::Log2Ceil64(num_needed)); | 513 | const size_t log_2 = std::max<size_t>(11U, Common::Log2Ceil64(num_needed)); |
| 510 | if constexpr (is_resolve) { | 514 | if constexpr (is_resolve) { |
| 511 | if (resolve_table[log_2] != 0) { | 515 | if (resolve_table[log_2] != 0) { |
| 512 | return resolve_table[log_2] - 1; | 516 | return resolve_table[log_2] - 1; |
| @@ -563,6 +567,7 @@ private: | |||
| 563 | VkQueryPool current_query_pool; | 567 | VkQueryPool current_query_pool; |
| 564 | size_t current_query_id; | 568 | size_t current_query_id; |
| 565 | size_t num_slots_used{}; | 569 | size_t num_slots_used{}; |
| 570 | size_t first_accumulation_checkpoint{}; | ||
| 566 | size_t last_accumulation_checkpoint{}; | 571 | size_t last_accumulation_checkpoint{}; |
| 567 | bool accumulation_since_last_sync{}; | 572 | bool accumulation_since_last_sync{}; |
| 568 | VideoCommon::HostQueryBase* current_query; | 573 | VideoCommon::HostQueryBase* current_query; |