diff options
| author | 2021-07-10 16:06:19 -0400 | |
|---|---|---|
| committer | 2021-07-10 16:06:19 -0400 | |
| commit | 907b2324d3a570ff726fe064ba425924d6dc0426 (patch) | |
| tree | 1b3a16691ab5835900a5eee0a05a871b6b774a71 /src | |
| parent | Merge pull request #6573 from lat9nq/cpu-settings-cleanup-2 (diff) | |
| parent | Buffer Cache: Address Feedback. (diff) | |
| download | yuzu-907b2324d3a570ff726fe064ba425924d6dc0426.tar.gz yuzu-907b2324d3a570ff726fe064ba425924d6dc0426.tar.xz yuzu-907b2324d3a570ff726fe064ba425924d6dc0426.zip | |
Merge pull request #6557 from FernandoS27/staceys-mom-has-got-it-goin-on
Buffer Cache: Fix High downloads / Fence manager: Improve fence checking.
Diffstat (limited to 'src')
| -rw-r--r-- | src/tests/video_core/buffer_base.cpp | 2 | ||||
| -rw-r--r-- | src/video_core/buffer_cache/buffer_base.h | 19 | ||||
| -rw-r--r-- | src/video_core/buffer_cache/buffer_cache.h | 238 | ||||
| -rw-r--r-- | src/video_core/dma_pusher.cpp | 10 | ||||
| -rw-r--r-- | src/video_core/fence_manager.h | 7 | ||||
| -rw-r--r-- | src/video_core/gpu.cpp | 4 | ||||
| -rw-r--r-- | src/video_core/rasterizer_interface.h | 3 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/gl_rasterizer.cpp | 8 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/gl_rasterizer.h | 1 | ||||
| -rw-r--r-- | src/video_core/renderer_vulkan/vk_rasterizer.cpp | 8 | ||||
| -rw-r--r-- | src/video_core/renderer_vulkan/vk_rasterizer.h | 1 | ||||
| -rw-r--r-- | src/video_core/texture_cache/types.h | 4 |
12 files changed, 227 insertions, 78 deletions
diff --git a/src/tests/video_core/buffer_base.cpp b/src/tests/video_core/buffer_base.cpp index edced69bb..9f5a54de4 100644 --- a/src/tests/video_core/buffer_base.cpp +++ b/src/tests/video_core/buffer_base.cpp | |||
| @@ -536,7 +536,7 @@ TEST_CASE("BufferBase: Cached write downloads") { | |||
| 536 | REQUIRE(rasterizer.Count() == 63); | 536 | REQUIRE(rasterizer.Count() == 63); |
| 537 | buffer.MarkRegionAsGpuModified(c + PAGE, PAGE); | 537 | buffer.MarkRegionAsGpuModified(c + PAGE, PAGE); |
| 538 | int num = 0; | 538 | int num = 0; |
| 539 | buffer.ForEachDownloadRange(c, WORD, [&](u64 offset, u64 size) { ++num; }); | 539 | buffer.ForEachDownloadRangeAndClear(c, WORD, [&](u64 offset, u64 size) { ++num; }); |
| 540 | buffer.ForEachUploadRange(c, WORD, [&](u64 offset, u64 size) { ++num; }); | 540 | buffer.ForEachUploadRange(c, WORD, [&](u64 offset, u64 size) { ++num; }); |
| 541 | REQUIRE(num == 0); | 541 | REQUIRE(num == 0); |
| 542 | REQUIRE(!buffer.IsRegionCpuModified(c + PAGE, PAGE)); | 542 | REQUIRE(!buffer.IsRegionCpuModified(c + PAGE, PAGE)); |
diff --git a/src/video_core/buffer_cache/buffer_base.h b/src/video_core/buffer_cache/buffer_base.h index b121d36a3..c3318095c 100644 --- a/src/video_core/buffer_cache/buffer_base.h +++ b/src/video_core/buffer_cache/buffer_base.h | |||
| @@ -226,19 +226,24 @@ public: | |||
| 226 | /// Call 'func' for each CPU modified range and unmark those pages as CPU modified | 226 | /// Call 'func' for each CPU modified range and unmark those pages as CPU modified |
| 227 | template <typename Func> | 227 | template <typename Func> |
| 228 | void ForEachUploadRange(VAddr query_cpu_range, u64 size, Func&& func) { | 228 | void ForEachUploadRange(VAddr query_cpu_range, u64 size, Func&& func) { |
| 229 | ForEachModifiedRange<Type::CPU>(query_cpu_range, size, func); | 229 | ForEachModifiedRange<Type::CPU>(query_cpu_range, size, true, func); |
| 230 | } | 230 | } |
| 231 | 231 | ||
| 232 | /// Call 'func' for each GPU modified range and unmark those pages as GPU modified | 232 | /// Call 'func' for each GPU modified range and unmark those pages as GPU modified |
| 233 | template <typename Func> | 233 | template <typename Func> |
| 234 | void ForEachDownloadRange(VAddr query_cpu_range, u64 size, Func&& func) { | 234 | void ForEachDownloadRange(VAddr query_cpu_range, u64 size, bool clear, Func&& func) { |
| 235 | ForEachModifiedRange<Type::GPU>(query_cpu_range, size, func); | 235 | ForEachModifiedRange<Type::GPU>(query_cpu_range, size, clear, func); |
| 236 | } | ||
| 237 | |||
| 238 | template <typename Func> | ||
| 239 | void ForEachDownloadRangeAndClear(VAddr query_cpu_range, u64 size, Func&& func) { | ||
| 240 | ForEachModifiedRange<Type::GPU>(query_cpu_range, size, true, func); | ||
| 236 | } | 241 | } |
| 237 | 242 | ||
| 238 | /// Call 'func' for each GPU modified range and unmark those pages as GPU modified | 243 | /// Call 'func' for each GPU modified range and unmark those pages as GPU modified |
| 239 | template <typename Func> | 244 | template <typename Func> |
| 240 | void ForEachDownloadRange(Func&& func) { | 245 | void ForEachDownloadRange(Func&& func) { |
| 241 | ForEachModifiedRange<Type::GPU>(cpu_addr, SizeBytes(), func); | 246 | ForEachModifiedRange<Type::GPU>(cpu_addr, SizeBytes(), true, func); |
| 242 | } | 247 | } |
| 243 | 248 | ||
| 244 | /// Mark buffer as picked | 249 | /// Mark buffer as picked |
| @@ -415,7 +420,7 @@ private: | |||
| 415 | * @param func Function to call for each turned off region | 420 | * @param func Function to call for each turned off region |
| 416 | */ | 421 | */ |
| 417 | template <Type type, typename Func> | 422 | template <Type type, typename Func> |
| 418 | void ForEachModifiedRange(VAddr query_cpu_range, s64 size, Func&& func) { | 423 | void ForEachModifiedRange(VAddr query_cpu_range, s64 size, bool clear, Func&& func) { |
| 419 | static_assert(type != Type::Untracked); | 424 | static_assert(type != Type::Untracked); |
| 420 | 425 | ||
| 421 | const s64 difference = query_cpu_range - cpu_addr; | 426 | const s64 difference = query_cpu_range - cpu_addr; |
| @@ -467,7 +472,9 @@ private: | |||
| 467 | bits = (bits << left_offset) >> left_offset; | 472 | bits = (bits << left_offset) >> left_offset; |
| 468 | 473 | ||
| 469 | const u64 current_word = state_words[word_index] & bits; | 474 | const u64 current_word = state_words[word_index] & bits; |
| 470 | state_words[word_index] &= ~bits; | 475 | if (clear) { |
| 476 | state_words[word_index] &= ~bits; | ||
| 477 | } | ||
| 471 | 478 | ||
| 472 | if constexpr (type == Type::CPU) { | 479 | if constexpr (type == Type::CPU) { |
| 473 | const u64 current_bits = untracked_words[word_index] & bits; | 480 | const u64 current_bits = untracked_words[word_index] & bits; |
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index cad7f902d..502feddba 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h | |||
| @@ -15,6 +15,7 @@ | |||
| 15 | #include <vector> | 15 | #include <vector> |
| 16 | 16 | ||
| 17 | #include <boost/container/small_vector.hpp> | 17 | #include <boost/container/small_vector.hpp> |
| 18 | #include <boost/icl/interval_set.hpp> | ||
| 18 | 19 | ||
| 19 | #include "common/common_types.h" | 20 | #include "common/common_types.h" |
| 20 | #include "common/div_ceil.h" | 21 | #include "common/div_ceil.h" |
| @@ -77,6 +78,9 @@ class BufferCache { | |||
| 77 | using Runtime = typename P::Runtime; | 78 | using Runtime = typename P::Runtime; |
| 78 | using Buffer = typename P::Buffer; | 79 | using Buffer = typename P::Buffer; |
| 79 | 80 | ||
| 81 | using IntervalSet = boost::icl::interval_set<VAddr>; | ||
| 82 | using IntervalType = typename IntervalSet::interval_type; | ||
| 83 | |||
| 80 | struct Empty {}; | 84 | struct Empty {}; |
| 81 | 85 | ||
| 82 | struct OverlapResult { | 86 | struct OverlapResult { |
| @@ -148,11 +152,14 @@ public: | |||
| 148 | /// Return true when there are uncommitted buffers to be downloaded | 152 | /// Return true when there are uncommitted buffers to be downloaded |
| 149 | [[nodiscard]] bool HasUncommittedFlushes() const noexcept; | 153 | [[nodiscard]] bool HasUncommittedFlushes() const noexcept; |
| 150 | 154 | ||
| 155 | void AccumulateFlushes(); | ||
| 156 | |||
| 151 | /// Return true when the caller should wait for async downloads | 157 | /// Return true when the caller should wait for async downloads |
| 152 | [[nodiscard]] bool ShouldWaitAsyncFlushes() const noexcept; | 158 | [[nodiscard]] bool ShouldWaitAsyncFlushes() const noexcept; |
| 153 | 159 | ||
| 154 | /// Commit asynchronous downloads | 160 | /// Commit asynchronous downloads |
| 155 | void CommitAsyncFlushes(); | 161 | void CommitAsyncFlushes(); |
| 162 | void CommitAsyncFlushesHigh(); | ||
| 156 | 163 | ||
| 157 | /// Pop asynchronous downloads | 164 | /// Pop asynchronous downloads |
| 158 | void PopAsyncFlushes(); | 165 | void PopAsyncFlushes(); |
| @@ -160,6 +167,9 @@ public: | |||
| 160 | /// Return true when a CPU region is modified from the GPU | 167 | /// Return true when a CPU region is modified from the GPU |
| 161 | [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size); | 168 | [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size); |
| 162 | 169 | ||
| 170 | /// Return true when a CPU region is modified from the CPU | ||
| 171 | [[nodiscard]] bool IsRegionCpuModified(VAddr addr, size_t size); | ||
| 172 | |||
| 163 | std::mutex mutex; | 173 | std::mutex mutex; |
| 164 | 174 | ||
| 165 | private: | 175 | private: |
| @@ -272,8 +282,6 @@ private: | |||
| 272 | 282 | ||
| 273 | void DeleteBuffer(BufferId buffer_id); | 283 | void DeleteBuffer(BufferId buffer_id); |
| 274 | 284 | ||
| 275 | void ReplaceBufferDownloads(BufferId old_buffer_id, BufferId new_buffer_id); | ||
| 276 | |||
| 277 | void NotifyBufferDeletion(); | 285 | void NotifyBufferDeletion(); |
| 278 | 286 | ||
| 279 | [[nodiscard]] Binding StorageBufferBinding(GPUVAddr ssbo_addr) const; | 287 | [[nodiscard]] Binding StorageBufferBinding(GPUVAddr ssbo_addr) const; |
| @@ -327,9 +335,9 @@ private: | |||
| 327 | 335 | ||
| 328 | std::vector<BufferId> cached_write_buffer_ids; | 336 | std::vector<BufferId> cached_write_buffer_ids; |
| 329 | 337 | ||
| 330 | // TODO: This data structure is not optimal and it should be reworked | 338 | IntervalSet uncommitted_ranges; |
| 331 | std::vector<BufferId> uncommitted_downloads; | 339 | IntervalSet common_ranges; |
| 332 | std::deque<std::vector<BufferId>> committed_downloads; | 340 | std::deque<IntervalSet> committed_ranges; |
| 333 | 341 | ||
| 334 | size_t immediate_buffer_capacity = 0; | 342 | size_t immediate_buffer_capacity = 0; |
| 335 | std::unique_ptr<u8[]> immediate_buffer_alloc; | 343 | std::unique_ptr<u8[]> immediate_buffer_alloc; |
| @@ -352,6 +360,7 @@ BufferCache<P>::BufferCache(VideoCore::RasterizerInterface& rasterizer_, | |||
| 352 | // Ensure the first slot is used for the null buffer | 360 | // Ensure the first slot is used for the null buffer |
| 353 | void(slot_buffers.insert(runtime, NullBufferParams{})); | 361 | void(slot_buffers.insert(runtime, NullBufferParams{})); |
| 354 | deletion_iterator = slot_buffers.end(); | 362 | deletion_iterator = slot_buffers.end(); |
| 363 | common_ranges.clear(); | ||
| 355 | } | 364 | } |
| 356 | 365 | ||
| 357 | template <class P> | 366 | template <class P> |
| @@ -547,29 +556,30 @@ void BufferCache<P>::FlushCachedWrites() { | |||
| 547 | 556 | ||
| 548 | template <class P> | 557 | template <class P> |
| 549 | bool BufferCache<P>::HasUncommittedFlushes() const noexcept { | 558 | bool BufferCache<P>::HasUncommittedFlushes() const noexcept { |
| 550 | return !uncommitted_downloads.empty(); | 559 | return !uncommitted_ranges.empty() || !committed_ranges.empty(); |
| 551 | } | 560 | } |
| 552 | 561 | ||
| 553 | template <class P> | 562 | template <class P> |
| 554 | bool BufferCache<P>::ShouldWaitAsyncFlushes() const noexcept { | 563 | void BufferCache<P>::AccumulateFlushes() { |
| 555 | return !committed_downloads.empty() && !committed_downloads.front().empty(); | 564 | if (Settings::values.gpu_accuracy.GetValue() != Settings::GPUAccuracy::High) { |
| 565 | uncommitted_ranges.clear(); | ||
| 566 | return; | ||
| 567 | } | ||
| 568 | if (uncommitted_ranges.empty()) { | ||
| 569 | return; | ||
| 570 | } | ||
| 571 | committed_ranges.emplace_back(std::move(uncommitted_ranges)); | ||
| 556 | } | 572 | } |
| 557 | 573 | ||
| 558 | template <class P> | 574 | template <class P> |
| 559 | void BufferCache<P>::CommitAsyncFlushes() { | 575 | bool BufferCache<P>::ShouldWaitAsyncFlushes() const noexcept { |
| 560 | // This is intentionally passing the value by copy | 576 | return false; |
| 561 | committed_downloads.push_front(uncommitted_downloads); | ||
| 562 | uncommitted_downloads.clear(); | ||
| 563 | } | 577 | } |
| 564 | 578 | ||
| 565 | template <class P> | 579 | template <class P> |
| 566 | void BufferCache<P>::PopAsyncFlushes() { | 580 | void BufferCache<P>::CommitAsyncFlushesHigh() { |
| 567 | if (committed_downloads.empty()) { | 581 | AccumulateFlushes(); |
| 568 | return; | 582 | if (committed_ranges.empty()) { |
| 569 | } | ||
| 570 | auto scope_exit_pop_download = detail::ScopeExit([this] { committed_downloads.pop_back(); }); | ||
| 571 | const std::span<const BufferId> download_ids = committed_downloads.back(); | ||
| 572 | if (download_ids.empty()) { | ||
| 573 | return; | 583 | return; |
| 574 | } | 584 | } |
| 575 | MICROPROFILE_SCOPE(GPU_DownloadMemory); | 585 | MICROPROFILE_SCOPE(GPU_DownloadMemory); |
| @@ -577,20 +587,66 @@ void BufferCache<P>::PopAsyncFlushes() { | |||
| 577 | boost::container::small_vector<std::pair<BufferCopy, BufferId>, 1> downloads; | 587 | boost::container::small_vector<std::pair<BufferCopy, BufferId>, 1> downloads; |
| 578 | u64 total_size_bytes = 0; | 588 | u64 total_size_bytes = 0; |
| 579 | u64 largest_copy = 0; | 589 | u64 largest_copy = 0; |
| 580 | for (const BufferId buffer_id : download_ids) { | 590 | for (const IntervalSet& intervals : committed_ranges) { |
| 581 | slot_buffers[buffer_id].ForEachDownloadRange([&](u64 range_offset, u64 range_size) { | 591 | for (auto& interval : intervals) { |
| 582 | downloads.push_back({ | 592 | const std::size_t size = interval.upper() - interval.lower(); |
| 583 | BufferCopy{ | 593 | const VAddr cpu_addr = interval.lower(); |
| 584 | .src_offset = range_offset, | 594 | ForEachBufferInRange(cpu_addr, size, [&](BufferId buffer_id, Buffer& buffer) { |
| 585 | .dst_offset = total_size_bytes, | 595 | boost::container::small_vector<BufferCopy, 1> copies; |
| 586 | .size = range_size, | 596 | buffer.ForEachDownloadRangeAndClear( |
| 587 | }, | 597 | cpu_addr, size, [&](u64 range_offset, u64 range_size) { |
| 588 | buffer_id, | 598 | const VAddr buffer_addr = buffer.CpuAddr(); |
| 599 | const auto add_download = [&](VAddr start, VAddr end) { | ||
| 600 | const u64 new_offset = start - buffer_addr; | ||
| 601 | const u64 new_size = end - start; | ||
| 602 | downloads.push_back({ | ||
| 603 | BufferCopy{ | ||
| 604 | .src_offset = new_offset, | ||
| 605 | .dst_offset = total_size_bytes, | ||
| 606 | .size = new_size, | ||
| 607 | }, | ||
| 608 | buffer_id, | ||
| 609 | }); | ||
| 610 | // Align up to avoid cache conflicts | ||
| 611 | constexpr u64 align = 256ULL; | ||
| 612 | constexpr u64 mask = ~(align - 1ULL); | ||
| 613 | total_size_bytes += (new_size + align - 1) & mask; | ||
| 614 | largest_copy = std::max(largest_copy, new_size); | ||
| 615 | }; | ||
| 616 | |||
| 617 | const VAddr start_address = buffer_addr + range_offset; | ||
| 618 | const VAddr end_address = start_address + range_size; | ||
| 619 | const IntervalType search_interval{cpu_addr, 1}; | ||
| 620 | auto it = common_ranges.lower_bound(search_interval); | ||
| 621 | if (it == common_ranges.end()) { | ||
| 622 | it = common_ranges.begin(); | ||
| 623 | } | ||
| 624 | while (it != common_ranges.end()) { | ||
| 625 | VAddr inter_addr_end = it->upper(); | ||
| 626 | VAddr inter_addr = it->lower(); | ||
| 627 | if (inter_addr >= end_address) { | ||
| 628 | break; | ||
| 629 | } | ||
| 630 | if (inter_addr_end <= start_address) { | ||
| 631 | it++; | ||
| 632 | continue; | ||
| 633 | } | ||
| 634 | if (inter_addr_end > end_address) { | ||
| 635 | inter_addr_end = end_address; | ||
| 636 | } | ||
| 637 | if (inter_addr < start_address) { | ||
| 638 | inter_addr = start_address; | ||
| 639 | } | ||
| 640 | add_download(inter_addr, inter_addr_end); | ||
| 641 | it++; | ||
| 642 | } | ||
| 643 | const IntervalType subtract_interval{start_address, end_address}; | ||
| 644 | common_ranges.subtract(subtract_interval); | ||
| 645 | }); | ||
| 589 | }); | 646 | }); |
| 590 | total_size_bytes += range_size; | 647 | } |
| 591 | largest_copy = std::max(largest_copy, range_size); | ||
| 592 | }); | ||
| 593 | } | 648 | } |
| 649 | committed_ranges.clear(); | ||
| 594 | if (downloads.empty()) { | 650 | if (downloads.empty()) { |
| 595 | return; | 651 | return; |
| 596 | } | 652 | } |
| @@ -623,6 +679,19 @@ void BufferCache<P>::PopAsyncFlushes() { | |||
| 623 | } | 679 | } |
| 624 | 680 | ||
| 625 | template <class P> | 681 | template <class P> |
| 682 | void BufferCache<P>::CommitAsyncFlushes() { | ||
| 683 | if (Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::High) { | ||
| 684 | CommitAsyncFlushesHigh(); | ||
| 685 | } else { | ||
| 686 | uncommitted_ranges.clear(); | ||
| 687 | committed_ranges.clear(); | ||
| 688 | } | ||
| 689 | } | ||
| 690 | |||
| 691 | template <class P> | ||
| 692 | void BufferCache<P>::PopAsyncFlushes() {} | ||
| 693 | |||
| 694 | template <class P> | ||
| 626 | bool BufferCache<P>::IsRegionGpuModified(VAddr addr, size_t size) { | 695 | bool BufferCache<P>::IsRegionGpuModified(VAddr addr, size_t size) { |
| 627 | const u64 page_end = Common::DivCeil(addr + size, PAGE_SIZE); | 696 | const u64 page_end = Common::DivCeil(addr + size, PAGE_SIZE); |
| 628 | for (u64 page = addr >> PAGE_BITS; page < page_end;) { | 697 | for (u64 page = addr >> PAGE_BITS; page < page_end;) { |
| @@ -642,6 +711,25 @@ bool BufferCache<P>::IsRegionGpuModified(VAddr addr, size_t size) { | |||
| 642 | } | 711 | } |
| 643 | 712 | ||
| 644 | template <class P> | 713 | template <class P> |
| 714 | bool BufferCache<P>::IsRegionCpuModified(VAddr addr, size_t size) { | ||
| 715 | const u64 page_end = Common::DivCeil(addr + size, PAGE_SIZE); | ||
| 716 | for (u64 page = addr >> PAGE_BITS; page < page_end;) { | ||
| 717 | const BufferId image_id = page_table[page]; | ||
| 718 | if (!image_id) { | ||
| 719 | ++page; | ||
| 720 | continue; | ||
| 721 | } | ||
| 722 | Buffer& buffer = slot_buffers[image_id]; | ||
| 723 | if (buffer.IsRegionCpuModified(addr, size)) { | ||
| 724 | return true; | ||
| 725 | } | ||
| 726 | const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes(); | ||
| 727 | page = Common::DivCeil(end_addr, PAGE_SIZE); | ||
| 728 | } | ||
| 729 | return false; | ||
| 730 | } | ||
| 731 | |||
| 732 | template <class P> | ||
| 645 | void BufferCache<P>::BindHostIndexBuffer() { | 733 | void BufferCache<P>::BindHostIndexBuffer() { |
| 646 | Buffer& buffer = slot_buffers[index_buffer.buffer_id]; | 734 | Buffer& buffer = slot_buffers[index_buffer.buffer_id]; |
| 647 | TouchBuffer(buffer); | 735 | TouchBuffer(buffer); |
| @@ -1010,16 +1098,16 @@ void BufferCache<P>::MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 s | |||
| 1010 | Buffer& buffer = slot_buffers[buffer_id]; | 1098 | Buffer& buffer = slot_buffers[buffer_id]; |
| 1011 | buffer.MarkRegionAsGpuModified(cpu_addr, size); | 1099 | buffer.MarkRegionAsGpuModified(cpu_addr, size); |
| 1012 | 1100 | ||
| 1013 | const bool is_accuracy_high = Settings::IsGPULevelHigh(); | 1101 | const IntervalType base_interval{cpu_addr, cpu_addr + size}; |
| 1102 | common_ranges.add(base_interval); | ||
| 1103 | |||
| 1104 | const bool is_accuracy_high = | ||
| 1105 | Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::High; | ||
| 1014 | const bool is_async = Settings::values.use_asynchronous_gpu_emulation.GetValue(); | 1106 | const bool is_async = Settings::values.use_asynchronous_gpu_emulation.GetValue(); |
| 1015 | if (!is_accuracy_high || !is_async) { | 1107 | if (!is_async && !is_accuracy_high) { |
| 1016 | return; | ||
| 1017 | } | ||
| 1018 | if (std::ranges::find(uncommitted_downloads, buffer_id) != uncommitted_downloads.end()) { | ||
| 1019 | // Already inserted | ||
| 1020 | return; | 1108 | return; |
| 1021 | } | 1109 | } |
| 1022 | uncommitted_downloads.push_back(buffer_id); | 1110 | uncommitted_ranges.add(base_interval); |
| 1023 | } | 1111 | } |
| 1024 | 1112 | ||
| 1025 | template <class P> | 1113 | template <class P> |
| @@ -1103,7 +1191,6 @@ void BufferCache<P>::JoinOverlap(BufferId new_buffer_id, BufferId overlap_id, | |||
| 1103 | if (!copies.empty()) { | 1191 | if (!copies.empty()) { |
| 1104 | runtime.CopyBuffer(slot_buffers[new_buffer_id], overlap, copies); | 1192 | runtime.CopyBuffer(slot_buffers[new_buffer_id], overlap, copies); |
| 1105 | } | 1193 | } |
| 1106 | ReplaceBufferDownloads(overlap_id, new_buffer_id); | ||
| 1107 | DeleteBuffer(overlap_id); | 1194 | DeleteBuffer(overlap_id); |
| 1108 | } | 1195 | } |
| 1109 | 1196 | ||
| @@ -1244,14 +1331,51 @@ void BufferCache<P>::DownloadBufferMemory(Buffer& buffer, VAddr cpu_addr, u64 si | |||
| 1244 | boost::container::small_vector<BufferCopy, 1> copies; | 1331 | boost::container::small_vector<BufferCopy, 1> copies; |
| 1245 | u64 total_size_bytes = 0; | 1332 | u64 total_size_bytes = 0; |
| 1246 | u64 largest_copy = 0; | 1333 | u64 largest_copy = 0; |
| 1247 | buffer.ForEachDownloadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) { | 1334 | buffer.ForEachDownloadRangeAndClear(cpu_addr, size, [&](u64 range_offset, u64 range_size) { |
| 1248 | copies.push_back(BufferCopy{ | 1335 | const VAddr buffer_addr = buffer.CpuAddr(); |
| 1249 | .src_offset = range_offset, | 1336 | const auto add_download = [&](VAddr start, VAddr end) { |
| 1250 | .dst_offset = total_size_bytes, | 1337 | const u64 new_offset = start - buffer_addr; |
| 1251 | .size = range_size, | 1338 | const u64 new_size = end - start; |
| 1252 | }); | 1339 | copies.push_back(BufferCopy{ |
| 1253 | total_size_bytes += range_size; | 1340 | .src_offset = new_offset, |
| 1254 | largest_copy = std::max(largest_copy, range_size); | 1341 | .dst_offset = total_size_bytes, |
| 1342 | .size = new_size, | ||
| 1343 | }); | ||
| 1344 | // Align up to avoid cache conflicts | ||
| 1345 | constexpr u64 align = 256ULL; | ||
| 1346 | constexpr u64 mask = ~(align - 1ULL); | ||
| 1347 | total_size_bytes += (new_size + align - 1) & mask; | ||
| 1348 | largest_copy = std::max(largest_copy, new_size); | ||
| 1349 | }; | ||
| 1350 | |||
| 1351 | const VAddr start_address = buffer_addr + range_offset; | ||
| 1352 | const VAddr end_address = start_address + range_size; | ||
| 1353 | const IntervalType search_interval{start_address - range_size, 1}; | ||
| 1354 | auto it = common_ranges.lower_bound(search_interval); | ||
| 1355 | if (it == common_ranges.end()) { | ||
| 1356 | it = common_ranges.begin(); | ||
| 1357 | } | ||
| 1358 | while (it != common_ranges.end()) { | ||
| 1359 | VAddr inter_addr_end = it->upper(); | ||
| 1360 | VAddr inter_addr = it->lower(); | ||
| 1361 | if (inter_addr >= end_address) { | ||
| 1362 | break; | ||
| 1363 | } | ||
| 1364 | if (inter_addr_end <= start_address) { | ||
| 1365 | it++; | ||
| 1366 | continue; | ||
| 1367 | } | ||
| 1368 | if (inter_addr_end > end_address) { | ||
| 1369 | inter_addr_end = end_address; | ||
| 1370 | } | ||
| 1371 | if (inter_addr < start_address) { | ||
| 1372 | inter_addr = start_address; | ||
| 1373 | } | ||
| 1374 | add_download(inter_addr, inter_addr_end); | ||
| 1375 | it++; | ||
| 1376 | } | ||
| 1377 | const IntervalType subtract_interval{start_address, end_address}; | ||
| 1378 | common_ranges.subtract(subtract_interval); | ||
| 1255 | }); | 1379 | }); |
| 1256 | if (total_size_bytes == 0) { | 1380 | if (total_size_bytes == 0) { |
| 1257 | return; | 1381 | return; |
| @@ -1316,18 +1440,6 @@ void BufferCache<P>::DeleteBuffer(BufferId buffer_id) { | |||
| 1316 | } | 1440 | } |
| 1317 | 1441 | ||
| 1318 | template <class P> | 1442 | template <class P> |
| 1319 | void BufferCache<P>::ReplaceBufferDownloads(BufferId old_buffer_id, BufferId new_buffer_id) { | ||
| 1320 | const auto replace = [old_buffer_id, new_buffer_id](std::vector<BufferId>& buffers) { | ||
| 1321 | std::ranges::replace(buffers, old_buffer_id, new_buffer_id); | ||
| 1322 | if (auto it = std::ranges::find(buffers, new_buffer_id); it != buffers.end()) { | ||
| 1323 | buffers.erase(std::remove(it + 1, buffers.end(), new_buffer_id), buffers.end()); | ||
| 1324 | } | ||
| 1325 | }; | ||
| 1326 | replace(uncommitted_downloads); | ||
| 1327 | std::ranges::for_each(committed_downloads, replace); | ||
| 1328 | } | ||
| 1329 | |||
| 1330 | template <class P> | ||
| 1331 | void BufferCache<P>::NotifyBufferDeletion() { | 1443 | void BufferCache<P>::NotifyBufferDeletion() { |
| 1332 | if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) { | 1444 | if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) { |
| 1333 | dirty_uniform_buffers.fill(~u32{0}); | 1445 | dirty_uniform_buffers.fill(~u32{0}); |
| @@ -1349,15 +1461,9 @@ typename BufferCache<P>::Binding BufferCache<P>::StorageBufferBinding(GPUVAddr s | |||
| 1349 | if (!cpu_addr || size == 0) { | 1461 | if (!cpu_addr || size == 0) { |
| 1350 | return NULL_BINDING; | 1462 | return NULL_BINDING; |
| 1351 | } | 1463 | } |
| 1352 | // HACK(Rodrigo): This is the number of bytes bound in host beyond the guest API's range. | ||
| 1353 | // It exists due to some games like Astral Chain operate out of bounds. | ||
| 1354 | // Binding the whole map range would be technically correct, but games have large maps that make | ||
| 1355 | // this approach unaffordable for now. | ||
| 1356 | static constexpr u32 arbitrary_extra_bytes = 0xc000; | ||
| 1357 | const u32 bytes_to_map_end = static_cast<u32>(gpu_memory.BytesToMapEnd(gpu_addr)); | ||
| 1358 | const Binding binding{ | 1464 | const Binding binding{ |
| 1359 | .cpu_addr = *cpu_addr, | 1465 | .cpu_addr = *cpu_addr, |
| 1360 | .size = std::min(size + arbitrary_extra_bytes, bytes_to_map_end), | 1466 | .size = size, |
| 1361 | .buffer_id = BufferId{}, | 1467 | .buffer_id = BufferId{}, |
| 1362 | }; | 1468 | }; |
| 1363 | return binding; | 1469 | return binding; |
diff --git a/src/video_core/dma_pusher.cpp b/src/video_core/dma_pusher.cpp index 8b33c04ab..8d28bd884 100644 --- a/src/video_core/dma_pusher.cpp +++ b/src/video_core/dma_pusher.cpp | |||
| @@ -4,6 +4,7 @@ | |||
| 4 | 4 | ||
| 5 | #include "common/cityhash.h" | 5 | #include "common/cityhash.h" |
| 6 | #include "common/microprofile.h" | 6 | #include "common/microprofile.h" |
| 7 | #include "common/settings.h" | ||
| 7 | #include "core/core.h" | 8 | #include "core/core.h" |
| 8 | #include "core/memory.h" | 9 | #include "core/memory.h" |
| 9 | #include "video_core/dma_pusher.h" | 10 | #include "video_core/dma_pusher.h" |
| @@ -76,8 +77,13 @@ bool DmaPusher::Step() { | |||
| 76 | 77 | ||
| 77 | // Push buffer non-empty, read a word | 78 | // Push buffer non-empty, read a word |
| 78 | command_headers.resize(command_list_header.size); | 79 | command_headers.resize(command_list_header.size); |
| 79 | gpu.MemoryManager().ReadBlockUnsafe(dma_get, command_headers.data(), | 80 | if (Settings::IsGPULevelHigh()) { |
| 80 | command_list_header.size * sizeof(u32)); | 81 | gpu.MemoryManager().ReadBlock(dma_get, command_headers.data(), |
| 82 | command_list_header.size * sizeof(u32)); | ||
| 83 | } else { | ||
| 84 | gpu.MemoryManager().ReadBlockUnsafe(dma_get, command_headers.data(), | ||
| 85 | command_list_header.size * sizeof(u32)); | ||
| 86 | } | ||
| 81 | } | 87 | } |
| 82 | for (std::size_t index = 0; index < command_headers.size();) { | 88 | for (std::size_t index = 0; index < command_headers.size();) { |
| 83 | const CommandHeader& command_header = command_headers[index]; | 89 | const CommandHeader& command_header = command_headers[index]; |
diff --git a/src/video_core/fence_manager.h b/src/video_core/fence_manager.h index f055b61e9..34dc6c596 100644 --- a/src/video_core/fence_manager.h +++ b/src/video_core/fence_manager.h | |||
| @@ -8,6 +8,7 @@ | |||
| 8 | #include <queue> | 8 | #include <queue> |
| 9 | 9 | ||
| 10 | #include "common/common_types.h" | 10 | #include "common/common_types.h" |
| 11 | #include "common/settings.h" | ||
| 11 | #include "core/core.h" | 12 | #include "core/core.h" |
| 12 | #include "video_core/delayed_destruction_ring.h" | 13 | #include "video_core/delayed_destruction_ring.h" |
| 13 | #include "video_core/gpu.h" | 14 | #include "video_core/gpu.h" |
| @@ -53,6 +54,12 @@ public: | |||
| 53 | delayed_destruction_ring.Tick(); | 54 | delayed_destruction_ring.Tick(); |
| 54 | } | 55 | } |
| 55 | 56 | ||
| 57 | // Unlike other fences, this one doesn't wait on a value; it only accumulates pending flushes | ||
| 58 | void SignalOrdering() { | ||
| 59 | std::scoped_lock lock{buffer_cache.mutex}; | ||
| 60 | buffer_cache.AccumulateFlushes(); | ||
| 61 | } | ||
| 62 | |||
| 56 | void SignalSemaphore(GPUVAddr addr, u32 value) { | 63 | void SignalSemaphore(GPUVAddr addr, u32 value) { |
| 57 | TryReleasePendingFences(); | 64 | TryReleasePendingFences(); |
| 58 | const bool should_flush = ShouldFlush(); | 65 | const bool should_flush = ShouldFlush(); |
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index 35cc561be..f317ddc2b 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp | |||
| @@ -268,11 +268,13 @@ void GPU::CallPullerMethod(const MethodCall& method_call) { | |||
| 268 | case BufferMethods::SemaphoreAddressHigh: | 268 | case BufferMethods::SemaphoreAddressHigh: |
| 269 | case BufferMethods::SemaphoreAddressLow: | 269 | case BufferMethods::SemaphoreAddressLow: |
| 270 | case BufferMethods::SemaphoreSequence: | 270 | case BufferMethods::SemaphoreSequence: |
| 271 | case BufferMethods::RefCnt: | ||
| 272 | case BufferMethods::UnkCacheFlush: | 271 | case BufferMethods::UnkCacheFlush: |
| 273 | case BufferMethods::WrcacheFlush: | 272 | case BufferMethods::WrcacheFlush: |
| 274 | case BufferMethods::FenceValue: | 273 | case BufferMethods::FenceValue: |
| 275 | break; | 274 | break; |
| 275 | case BufferMethods::RefCnt: | ||
| 276 | rasterizer->SignalReference(); | ||
| 277 | break; | ||
| 276 | case BufferMethods::FenceAction: | 278 | case BufferMethods::FenceAction: |
| 277 | ProcessFenceActionMethod(); | 279 | ProcessFenceActionMethod(); |
| 278 | break; | 280 | break; |
diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h index 0cec4225b..67aef6000 100644 --- a/src/video_core/rasterizer_interface.h +++ b/src/video_core/rasterizer_interface.h | |||
| @@ -63,6 +63,9 @@ public: | |||
| 63 | /// Signal a GPU based syncpoint as a fence | 63 | /// Signal a GPU based syncpoint as a fence |
| 64 | virtual void SignalSyncPoint(u32 value) = 0; | 64 | virtual void SignalSyncPoint(u32 value) = 0; |
| 65 | 65 | ||
| 66 | /// Signal a GPU based reference as point | ||
| 67 | virtual void SignalReference() = 0; | ||
| 68 | |||
| 66 | /// Release all pending fences. | 69 | /// Release all pending fences. |
| 67 | virtual void ReleaseFences() = 0; | 70 | virtual void ReleaseFences() = 0; |
| 68 | 71 | ||
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 07ad0e205..a4ed8f68f 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp | |||
| @@ -634,6 +634,13 @@ void RasterizerOpenGL::SignalSyncPoint(u32 value) { | |||
| 634 | fence_manager.SignalSyncPoint(value); | 634 | fence_manager.SignalSyncPoint(value); |
| 635 | } | 635 | } |
| 636 | 636 | ||
| 637 | void RasterizerOpenGL::SignalReference() { | ||
| 638 | if (!gpu.IsAsync()) { | ||
| 639 | return; | ||
| 640 | } | ||
| 641 | fence_manager.SignalOrdering(); | ||
| 642 | } | ||
| 643 | |||
| 637 | void RasterizerOpenGL::ReleaseFences() { | 644 | void RasterizerOpenGL::ReleaseFences() { |
| 638 | if (!gpu.IsAsync()) { | 645 | if (!gpu.IsAsync()) { |
| 639 | return; | 646 | return; |
| @@ -650,6 +657,7 @@ void RasterizerOpenGL::FlushAndInvalidateRegion(VAddr addr, u64 size) { | |||
| 650 | 657 | ||
| 651 | void RasterizerOpenGL::WaitForIdle() { | 658 | void RasterizerOpenGL::WaitForIdle() { |
| 652 | glMemoryBarrier(GL_ALL_BARRIER_BITS); | 659 | glMemoryBarrier(GL_ALL_BARRIER_BITS); |
| 660 | SignalReference(); | ||
| 653 | } | 661 | } |
| 654 | 662 | ||
| 655 | void RasterizerOpenGL::FragmentBarrier() { | 663 | void RasterizerOpenGL::FragmentBarrier() { |
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index 482efed7a..d8df71962 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h | |||
| @@ -83,6 +83,7 @@ public: | |||
| 83 | void ModifyGPUMemory(GPUVAddr addr, u64 size) override; | 83 | void ModifyGPUMemory(GPUVAddr addr, u64 size) override; |
| 84 | void SignalSemaphore(GPUVAddr addr, u32 value) override; | 84 | void SignalSemaphore(GPUVAddr addr, u32 value) override; |
| 85 | void SignalSyncPoint(u32 value) override; | 85 | void SignalSyncPoint(u32 value) override; |
| 86 | void SignalReference() override; | ||
| 86 | void ReleaseFences() override; | 87 | void ReleaseFences() override; |
| 87 | void FlushAndInvalidateRegion(VAddr addr, u64 size) override; | 88 | void FlushAndInvalidateRegion(VAddr addr, u64 size) override; |
| 88 | void WaitForIdle() override; | 89 | void WaitForIdle() override; |
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index bd4d649cc..9ea4b6653 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp | |||
| @@ -580,6 +580,13 @@ void RasterizerVulkan::SignalSyncPoint(u32 value) { | |||
| 580 | fence_manager.SignalSyncPoint(value); | 580 | fence_manager.SignalSyncPoint(value); |
| 581 | } | 581 | } |
| 582 | 582 | ||
| 583 | void RasterizerVulkan::SignalReference() { | ||
| 584 | if (!gpu.IsAsync()) { | ||
| 585 | return; | ||
| 586 | } | ||
| 587 | fence_manager.SignalOrdering(); | ||
| 588 | } | ||
| 589 | |||
| 583 | void RasterizerVulkan::ReleaseFences() { | 590 | void RasterizerVulkan::ReleaseFences() { |
| 584 | if (!gpu.IsAsync()) { | 591 | if (!gpu.IsAsync()) { |
| 585 | return; | 592 | return; |
| @@ -612,6 +619,7 @@ void RasterizerVulkan::WaitForIdle() { | |||
| 612 | cmdbuf.SetEvent(event, flags); | 619 | cmdbuf.SetEvent(event, flags); |
| 613 | cmdbuf.WaitEvents(event, flags, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, {}, {}, {}); | 620 | cmdbuf.WaitEvents(event, flags, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, {}, {}, {}); |
| 614 | }); | 621 | }); |
| 622 | SignalReference(); | ||
| 615 | } | 623 | } |
| 616 | 624 | ||
| 617 | void RasterizerVulkan::FragmentBarrier() { | 625 | void RasterizerVulkan::FragmentBarrier() { |
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index 41459c5c5..5450ccfb5 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h | |||
| @@ -75,6 +75,7 @@ public: | |||
| 75 | void ModifyGPUMemory(GPUVAddr addr, u64 size) override; | 75 | void ModifyGPUMemory(GPUVAddr addr, u64 size) override; |
| 76 | void SignalSemaphore(GPUVAddr addr, u32 value) override; | 76 | void SignalSemaphore(GPUVAddr addr, u32 value) override; |
| 77 | void SignalSyncPoint(u32 value) override; | 77 | void SignalSyncPoint(u32 value) override; |
| 78 | void SignalReference() override; | ||
| 78 | void ReleaseFences() override; | 79 | void ReleaseFences() override; |
| 79 | void FlushAndInvalidateRegion(VAddr addr, u64 size) override; | 80 | void FlushAndInvalidateRegion(VAddr addr, u64 size) override; |
| 80 | void WaitForIdle() override; | 81 | void WaitForIdle() override; |
diff --git a/src/video_core/texture_cache/types.h b/src/video_core/texture_cache/types.h index 9fbdc1ac6..47a11cb2f 100644 --- a/src/video_core/texture_cache/types.h +++ b/src/video_core/texture_cache/types.h | |||
| @@ -133,8 +133,8 @@ struct BufferImageCopy { | |||
| 133 | }; | 133 | }; |
| 134 | 134 | ||
| 135 | struct BufferCopy { | 135 | struct BufferCopy { |
| 136 | size_t src_offset; | 136 | u64 src_offset; |
| 137 | size_t dst_offset; | 137 | u64 dst_offset; |
| 138 | size_t size; | 138 | size_t size; |
| 139 | }; | 139 | }; |
| 140 | 140 | ||