| author | 2023-04-22 20:10:40 +0200 |
|---|---|
| committer | 2023-04-29 00:46:31 +0200 |
| commit | ed4553806a08e4130fcea36230985cb74d1b326a (patch) |
| tree | e127bdb4c9c1258e294997537c4c03e52c816aba |
| parent | Buffer Cache rework: Setup async downloads. (diff) |
Implement async downloads in normal GPU accuracy and fix a few issues.
| mode | file | changes |
|---|---|---|
| -rw-r--r-- | src/video_core/buffer_cache/buffer_cache.h | 69 |
| -rw-r--r-- | src/video_core/buffer_cache/buffer_cache_base.h | 25 |
| -rw-r--r-- | src/video_core/buffer_cache/word_manager.h | 6 |
3 files changed, 61 insertions, 39 deletions
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index 43fe5b080..faa48a678 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -22,6 +22,8 @@ BufferCache<P>::BufferCache(VideoCore::RasterizerInterface& rasterizer_,
     void(slot_buffers.insert(runtime, NullBufferParams{}));
     common_ranges.clear();
 
+    active_async_buffers = IMPLEMENTS_ASYNC_DOWNLOADS && !Settings::IsGPULevelHigh();
+
     if (!runtime.CanReportMemoryUsage()) {
         minimum_memory = DEFAULT_EXPECTED_MEMORY;
         critical_memory = DEFAULT_CRITICAL_MEMORY;
@@ -72,6 +74,8 @@ void BufferCache<P>::TickFrame() {
     uniform_cache_hits[0] = 0;
     uniform_cache_shots[0] = 0;
 
+    active_async_buffers = IMPLEMENTS_ASYNC_DOWNLOADS && !Settings::IsGPULevelHigh();
+
     const bool skip_preferred = hits * 256 < shots * 251;
     uniform_buffer_skip_cache_size = skip_preferred ? DEFAULT_SKIP_CACHE_SIZE : 0;
 
@@ -130,7 +134,7 @@ void BufferCache<P>::WaitOnAsyncFlushes(VAddr cpu_addr, u64 size) {
 
 template <class P>
 void BufferCache<P>::ClearDownload(IntervalType subtract_interval) {
-    async_downloads -= std::make_pair(subtract_interval, std::numeric_limits<int>::max());
+    RemoveEachInOverlapCounter(async_downloads, subtract_interval, -1024);
     uncommitted_ranges.subtract(subtract_interval);
     pending_ranges.subtract(subtract_interval);
     for (auto& interval_set : committed_ranges) {
@@ -173,18 +177,14 @@ bool BufferCache<P>::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am
     }};
 
     boost::container::small_vector<IntervalType, 4> tmp_intervals;
-    const bool is_high_accuracy =
-        Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::High;
     auto mirror = [&](VAddr base_address, VAddr base_address_end) {
         const u64 size = base_address_end - base_address;
         const VAddr diff = base_address - *cpu_src_address;
         const VAddr new_base_address = *cpu_dest_address + diff;
         const IntervalType add_interval{new_base_address, new_base_address + size};
         tmp_intervals.push_back(add_interval);
-        if (is_high_accuracy) {
-            uncommitted_ranges.add(add_interval);
-            pending_ranges.add(add_interval);
-        }
+        uncommitted_ranges.add(add_interval);
+        pending_ranges.add(add_interval);
     };
     ForEachInRangeSet(common_ranges, *cpu_src_address, amount, mirror);
     // This subtraction in this order is important for overlapping copies.
@@ -468,7 +468,7 @@ void BufferCache<P>::CommitAsyncFlushesHigh() {
     AccumulateFlushes();
 
     if (committed_ranges.empty()) {
-        if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) {
+        if (active_async_buffers) {
            async_buffers.emplace_back(std::optional<Async_Buffer>{});
        }
        return;
@@ -529,31 +529,33 @@
     }
     committed_ranges.clear();
     if (downloads.empty()) {
-        if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) {
+        if (active_async_buffers) {
             async_buffers.emplace_back(std::optional<Async_Buffer>{});
         }
         return;
     }
-    if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) {
-        auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes, true);
-        boost::container::small_vector<BufferCopy, 4> normalized_copies;
-        IntervalSet new_async_range{};
-        runtime.PreCopyBarrier();
-        for (auto& [copy, buffer_id] : downloads) {
-            copy.dst_offset += download_staging.offset;
-            const std::array copies{copy};
-            BufferCopy second_copy{copy};
-            Buffer& buffer = slot_buffers[buffer_id];
-            second_copy.src_offset = static_cast<size_t>(buffer.CpuAddr()) + copy.src_offset;
-            VAddr orig_cpu_addr = static_cast<VAddr>(second_copy.src_offset);
-            const IntervalType base_interval{orig_cpu_addr, orig_cpu_addr + copy.size};
-            async_downloads += std::make_pair(base_interval, 1);
-            runtime.CopyBuffer(download_staging.buffer, buffer, copies, false);
-            normalized_copies.push_back(second_copy);
+    if (active_async_buffers) {
+        if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) {
+            auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes, true);
+            boost::container::small_vector<BufferCopy, 4> normalized_copies;
+            IntervalSet new_async_range{};
+            runtime.PreCopyBarrier();
+            for (auto& [copy, buffer_id] : downloads) {
+                copy.dst_offset += download_staging.offset;
+                const std::array copies{copy};
+                BufferCopy second_copy{copy};
+                Buffer& buffer = slot_buffers[buffer_id];
+                second_copy.src_offset = static_cast<size_t>(buffer.CpuAddr()) + copy.src_offset;
+                VAddr orig_cpu_addr = static_cast<VAddr>(second_copy.src_offset);
+                const IntervalType base_interval{orig_cpu_addr, orig_cpu_addr + copy.size};
+                async_downloads += std::make_pair(base_interval, 1);
+                runtime.CopyBuffer(download_staging.buffer, buffer, copies, false);
+                normalized_copies.push_back(second_copy);
+            }
+            runtime.PostCopyBarrier();
+            pending_downloads.emplace_back(std::move(normalized_copies));
+            async_buffers.emplace_back(download_staging);
         }
-        runtime.PostCopyBarrier();
-        pending_downloads.emplace_back(std::move(normalized_copies));
-        async_buffers.emplace_back(download_staging);
     } else {
         if constexpr (USE_MEMORY_MAPS) {
             auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes);
@@ -624,7 +626,8 @@ void BufferCache<P>::PopAsyncBuffers() {
                 common_ranges.subtract(base_interval);
             }
         });
-        async_downloads -= std::make_pair(IntervalType(cpu_addr, cpu_addr + copy.size), 1);
+        const IntervalType subtract_interval{cpu_addr, cpu_addr + copy.size};
+        RemoveEachInOverlapCounter(async_downloads, subtract_interval, -1);
     }
     runtime.FreeDeferredStagingBuffer(*async_buffer);
    async_buffers.pop_front();
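The async_downloads hunks above change the bookkeeping from a one-shot subtraction to a per-range reference count: CommitAsyncFlushesHigh adds +1 over each range it queues, PopAsyncBuffers subtracts 1 once a range has been written back, and ClearDownload applies -1024 to force every overlapping segment non-positive. Below is a minimal sketch of how such an interval counter behaves, assuming the cache's OverlapCounter is a boost::icl::split_interval_map<VAddr, int>; the aliases are illustrative, not taken from this patch.

```cpp
// Illustrative sketch only: models the async_downloads overlap counter with
// boost::icl. The aliases below are assumptions, not code from this commit.
#include <cstdint>
#include <iostream>
#include <utility>

#include <boost/icl/split_interval_map.hpp>

using VAddr = std::uint64_t;
using OverlapCounter = boost::icl::split_interval_map<VAddr, int>;

int main() {
    OverlapCounter async_downloads;
    const auto range = [](VAddr lo, VAddr hi) {
        return boost::icl::interval<VAddr>::right_open(lo, hi);
    };

    // Two in-flight downloads overlap in [0x2000, 0x3000): that segment counts 2.
    async_downloads += std::make_pair(range(0x1000, 0x3000), 1);
    async_downloads += std::make_pair(range(0x2000, 0x4000), 1);

    // One download completes: subtract 1 over its range. Segments that drop to
    // zero are dropped by the map itself; segments pushed below zero (e.g. by an
    // oversized -1024 subtraction) are what RemoveEachInOverlapCounter erases.
    async_downloads += std::make_pair(range(0x1000, 0x3000), -1);

    for (const auto& [interval, count] : async_downloads) {
        std::cout << std::hex << interval.lower() << "-" << interval.upper()
                  << " refs=" << std::dec << count << '\n';
    }
    // Expected: [0x2000, 0x3000) refs=1 and [0x3000, 0x4000) refs=1.
}
```

Note how the overlapped segment keeps a count of 2 until both downloads have decremented it, which is what lets overlapping pending downloads coexist without clobbering each other's tracking.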
@@ -1198,10 +1201,8 @@ void BufferCache<P>::MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 s
 
     const IntervalType base_interval{cpu_addr, cpu_addr + size};
     common_ranges.add(base_interval);
-    if (Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::High) {
-        uncommitted_ranges.add(base_interval);
-        pending_ranges.add(base_interval);
-    }
+    uncommitted_ranges.add(base_interval);
+    pending_ranges.add(base_interval);
 }
 
 template <class P>
@@ -1542,7 +1543,7 @@ void BufferCache<P>::DownloadBufferMemory(Buffer& buffer, VAddr cpu_addr, u64 si
             .size = new_size,
         });
         // Align up to avoid cache conflicts
-        constexpr u64 align = 8ULL;
+        constexpr u64 align = 64ULL;
         constexpr u64 mask = ~(align - 1ULL);
         total_size_bytes += (new_size + align - 1) & mask;
         largest_copy = std::max(largest_copy, new_size);
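The last hunk widens the staging-copy alignment from 8 to 64 bytes so each queued copy starts on its own (typically 64-byte) cache line in the staging buffer. A quick worked example of the align-up expression used there, with hypothetical copy sizes:

```cpp
// Worked example (not from the patch): the align-up idiom used in
// DownloadBufferMemory, shown with hypothetical copy sizes.
#include <cassert>
#include <cstdint>

constexpr std::uint64_t AlignUp(std::uint64_t size, std::uint64_t align) {
    // align must be a power of two; the mask clears the low log2(align) bits.
    const std::uint64_t mask = ~(align - 1ULL);
    return (size + align - 1ULL) & mask;
}

int main() {
    static_assert(AlignUp(1, 64) == 64);
    static_assert(AlignUp(64, 64) == 64);
    static_assert(AlignUp(65, 64) == 128);
    // With the previous align = 8, a 65-byte copy rounded up to only 72 bytes,
    // so neighbouring copies in the staging buffer could share a cache line.
    static_assert(AlignUp(65, 8) == 72);
    assert(AlignUp(100, 64) == 128);
    return 0;
}
```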
diff --git a/src/video_core/buffer_cache/buffer_cache_base.h b/src/video_core/buffer_cache/buffer_cache_base.h
index 6f29cba25..d4914a8f5 100644
--- a/src/video_core/buffer_cache/buffer_cache_base.h
+++ b/src/video_core/buffer_cache/buffer_cache_base.h
@@ -345,13 +345,30 @@ private:
             if (inter_addr < start_address) {
                 inter_addr = start_address;
             }
-            if (it->second <= 0) {
-                __debugbreak();
-            }
             func(inter_addr, inter_addr_end, it->second);
         }
     }
 
+    void RemoveEachInOverlapCounter(OverlapCounter& current_range, const IntervalType search_interval, int subtract_value) {
+        bool any_removals = false;
+        current_range.add(std::make_pair(search_interval, subtract_value));
+        do {
+            any_removals = false;
+            auto it = current_range.lower_bound(search_interval);
+            if (it == current_range.end()) {
+                return;
+            }
+            auto end_it = current_range.upper_bound(search_interval);
+            for (; it != end_it; it++) {
+                if (it->second <= 0) {
+                    any_removals = true;
+                    current_range.erase(it);
+                    break;
+                }
+            }
+        } while (any_removals);
+    }
+
     static bool IsRangeGranular(VAddr cpu_addr, size_t size) {
         return (cpu_addr & ~Core::Memory::YUZU_PAGEMASK) ==
                ((cpu_addr + size) & ~Core::Memory::YUZU_PAGEMASK);
@@ -554,6 +571,8 @@ private:
     u64 minimum_memory = 0;
     u64 critical_memory = 0;
 
+    bool active_async_buffers = false;
+
     std::array<BufferId, ((1ULL << 39) >> PAGE_BITS)> page_table;
 };
 
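RemoveEachInOverlapCounter, added above, applies the subtraction and then erases every segment inside the searched interval that ends up at or below zero, restarting the scan after each erase because erasing invalidates iterators into the interval map. Below is a compact sketch of the same pattern plus the two call styles seen in buffer_cache.h (-1 from PopAsyncBuffers, -1024 from ClearDownload); the type aliases and helper name are assumptions for the example, not code from this commit.

```cpp
// Sketch (illustrative, not yuzu code) of the erase-and-restart pattern used by
// RemoveEachInOverlapCounter. The aliases and helper name are assumptions.
#include <cstdint>
#include <utility>

#include <boost/icl/split_interval_map.hpp>

using VAddr = std::uint64_t;
using OverlapCounter = boost::icl::split_interval_map<VAddr, int>;
using IntervalType = boost::icl::interval<VAddr>::type;

void SubtractAndPrune(OverlapCounter& counter, const IntervalType search_interval,
                      int subtract_value) {
    counter.add(std::make_pair(search_interval, subtract_value));
    bool any_removals = false;
    do {
        any_removals = false;
        auto it = counter.lower_bound(search_interval);
        const auto end_it = counter.upper_bound(search_interval);
        for (; it != end_it; ++it) {
            if (it->second <= 0) {
                // erase() invalidates iterators into the map, so restart the scan.
                counter.erase(it);
                any_removals = true;
                break;
            }
        }
    } while (any_removals);
}

int main() {
    const auto range = [](VAddr lo, VAddr hi) {
        return boost::icl::interval<VAddr>::right_open(lo, hi);
    };
    OverlapCounter counter;
    counter += std::make_pair(range(0x1000, 0x2000), 1);
    counter += std::make_pair(range(0x1800, 0x2800), 1);

    // PopAsyncBuffers-style: one finished download decrements its range by 1.
    SubtractAndPrune(counter, range(0x1000, 0x2000), -1);
    // ClearDownload-style: -1024 drives every remaining overlapping segment
    // non-positive, so the prune loop drops the whole range.
    SubtractAndPrune(counter, range(0x0000, 0x3000), -1024);
    return static_cast<int>(counter.iterative_size()); // 0: everything was pruned
}
```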
diff --git a/src/video_core/buffer_cache/word_manager.h b/src/video_core/buffer_cache/word_manager.h
index 782951fe7..21729752b 100644
--- a/src/video_core/buffer_cache/word_manager.h
+++ b/src/video_core/buffer_cache/word_manager.h
@@ -273,7 +273,7 @@ public:
            untracked_words[word_index] &= ~bits;
            NotifyRasterizer<true>(word_index, current_bits, ~u64{0});
        }
-        const u64 word = current_word;
+        const u64 word = current_word & ~(type == Type::GPU ? untracked_words[word_index] : 0);
        u64 page = page_begin;
        page_begin = 0;
 
@@ -321,6 +321,7 @@ public:
    [[nodiscard]] bool IsRegionModified(u64 offset, u64 size) const noexcept {
        static_assert(type != Type::Untracked);
 
+        const u64* const untracked_words = Array<Type::Untracked>();
        const u64* const state_words = Array<type>();
        const u64 num_query_words = size / BYTES_PER_WORD + 1;
        const u64 word_begin = offset / BYTES_PER_WORD;
@@ -328,7 +329,8 @@ public:
        const u64 page_limit = Common::DivCeil(offset + size, BYTES_PER_PAGE);
        u64 page_index = (offset / BYTES_PER_PAGE) % PAGES_PER_WORD;
        for (u64 word_index = word_begin; word_index < word_end; ++word_index, page_index = 0) {
-            const u64 word = state_words[word_index];
+            const u64 off_word = type == Type::GPU ? untracked_words[word_index] : 0;
+            const u64 word = state_words[word_index] & ~off_word;
            if (word == 0) {
                continue;
            }
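Both word_manager.h hunks apply the same filter: when scanning GPU-modified pages, bits that are still set in the untracked words are cleared first (state_words[i] & ~untracked_words[i]), so pages the rasterizer is not currently tracking are not reported as GPU-dirty. A tiny self-contained illustration of that bit masking, with made-up values:

```cpp
// Illustrative only: masking GPU-modified page bits with the untracked bits,
// mirroring `state_words[i] & ~untracked_words[i]` in the hunks above.
#include <cstdint>
#include <cstdio>

int main() {
    // Each bit is one page inside a 64-page tracking word.
    const std::uint64_t gpu_modified_word = 0xF0; // pages 4-7 touched by the GPU
    const std::uint64_t untracked_word = 0x30;    // pages 4-5 currently untracked

    // Only pages that are GPU-modified *and* tracked count for the query.
    const std::uint64_t effective = gpu_modified_word & ~untracked_word;
    std::printf("effective dirty bits: 0x%llx\n",
                static_cast<unsigned long long>(effective)); // 0xC0 -> pages 6-7
    return 0;
}
```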