summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author: Ameer J, 2021-07-10 16:06:19 -0400
committer: GitHub, 2021-07-10 16:06:19 -0400
commit: 907b2324d3a570ff726fe064ba425924d6dc0426 (patch)
tree: 1b3a16691ab5835900a5eee0a05a871b6b774a71 /src
parent: Merge pull request #6573 from lat9nq/cpu-settings-cleanup-2 (diff)
parent: Buffer Cache: Address Feedback. (diff)
download: yuzu-907b2324d3a570ff726fe064ba425924d6dc0426.tar.gz
yuzu-907b2324d3a570ff726fe064ba425924d6dc0426.tar.xz
yuzu-907b2324d3a570ff726fe064ba425924d6dc0426.zip
Merge pull request #6557 from FernandoS27/staceys-mom-has-got-it-goin-on
Buffer Cache: Fix High downloads / Fence manager: Improve fence checking.
Diffstat (limited to 'src')
-rw-r--r-- src/tests/video_core/buffer_base.cpp | 2
-rw-r--r-- src/video_core/buffer_cache/buffer_base.h | 19
-rw-r--r-- src/video_core/buffer_cache/buffer_cache.h | 238
-rw-r--r-- src/video_core/dma_pusher.cpp | 10
-rw-r--r-- src/video_core/fence_manager.h | 7
-rw-r--r-- src/video_core/gpu.cpp | 4
-rw-r--r-- src/video_core/rasterizer_interface.h | 3
-rw-r--r-- src/video_core/renderer_opengl/gl_rasterizer.cpp | 8
-rw-r--r-- src/video_core/renderer_opengl/gl_rasterizer.h | 1
-rw-r--r-- src/video_core/renderer_vulkan/vk_rasterizer.cpp | 8
-rw-r--r-- src/video_core/renderer_vulkan/vk_rasterizer.h | 1
-rw-r--r-- src/video_core/texture_cache/types.h | 4
12 files changed, 227 insertions, 78 deletions
diff --git a/src/tests/video_core/buffer_base.cpp b/src/tests/video_core/buffer_base.cpp
index edced69bb..9f5a54de4 100644
--- a/src/tests/video_core/buffer_base.cpp
+++ b/src/tests/video_core/buffer_base.cpp
@@ -536,7 +536,7 @@ TEST_CASE("BufferBase: Cached write downloads") {
536 REQUIRE(rasterizer.Count() == 63); 536 REQUIRE(rasterizer.Count() == 63);
537 buffer.MarkRegionAsGpuModified(c + PAGE, PAGE); 537 buffer.MarkRegionAsGpuModified(c + PAGE, PAGE);
538 int num = 0; 538 int num = 0;
539 buffer.ForEachDownloadRange(c, WORD, [&](u64 offset, u64 size) { ++num; }); 539 buffer.ForEachDownloadRangeAndClear(c, WORD, [&](u64 offset, u64 size) { ++num; });
540 buffer.ForEachUploadRange(c, WORD, [&](u64 offset, u64 size) { ++num; }); 540 buffer.ForEachUploadRange(c, WORD, [&](u64 offset, u64 size) { ++num; });
541 REQUIRE(num == 0); 541 REQUIRE(num == 0);
542 REQUIRE(!buffer.IsRegionCpuModified(c + PAGE, PAGE)); 542 REQUIRE(!buffer.IsRegionCpuModified(c + PAGE, PAGE));
diff --git a/src/video_core/buffer_cache/buffer_base.h b/src/video_core/buffer_cache/buffer_base.h
index b121d36a3..c3318095c 100644
--- a/src/video_core/buffer_cache/buffer_base.h
+++ b/src/video_core/buffer_cache/buffer_base.h
@@ -226,19 +226,24 @@ public:
226 /// Call 'func' for each CPU modified range and unmark those pages as CPU modified 226 /// Call 'func' for each CPU modified range and unmark those pages as CPU modified
227 template <typename Func> 227 template <typename Func>
228 void ForEachUploadRange(VAddr query_cpu_range, u64 size, Func&& func) { 228 void ForEachUploadRange(VAddr query_cpu_range, u64 size, Func&& func) {
229 ForEachModifiedRange<Type::CPU>(query_cpu_range, size, func); 229 ForEachModifiedRange<Type::CPU>(query_cpu_range, size, true, func);
230 } 230 }
231 231
232 /// Call 'func' for each GPU modified range and unmark those pages as GPU modified 232 /// Call 'func' for each GPU modified range and unmark those pages as GPU modified
233 template <typename Func> 233 template <typename Func>
234 void ForEachDownloadRange(VAddr query_cpu_range, u64 size, Func&& func) { 234 void ForEachDownloadRange(VAddr query_cpu_range, u64 size, bool clear, Func&& func) {
235 ForEachModifiedRange<Type::GPU>(query_cpu_range, size, func); 235 ForEachModifiedRange<Type::GPU>(query_cpu_range, size, clear, func);
236 }
237
238 template <typename Func>
239 void ForEachDownloadRangeAndClear(VAddr query_cpu_range, u64 size, Func&& func) {
240 ForEachModifiedRange<Type::GPU>(query_cpu_range, size, true, func);
236 } 241 }
237 242
238 /// Call 'func' for each GPU modified range and unmark those pages as GPU modified 243 /// Call 'func' for each GPU modified range and unmark those pages as GPU modified
239 template <typename Func> 244 template <typename Func>
240 void ForEachDownloadRange(Func&& func) { 245 void ForEachDownloadRange(Func&& func) {
241 ForEachModifiedRange<Type::GPU>(cpu_addr, SizeBytes(), func); 246 ForEachModifiedRange<Type::GPU>(cpu_addr, SizeBytes(), true, func);
242 } 247 }
243 248
244 /// Mark buffer as picked 249 /// Mark buffer as picked
@@ -415,7 +420,7 @@ private:
415 * @param func Function to call for each turned off region 420 * @param func Function to call for each turned off region
416 */ 421 */
417 template <Type type, typename Func> 422 template <Type type, typename Func>
418 void ForEachModifiedRange(VAddr query_cpu_range, s64 size, Func&& func) { 423 void ForEachModifiedRange(VAddr query_cpu_range, s64 size, bool clear, Func&& func) {
419 static_assert(type != Type::Untracked); 424 static_assert(type != Type::Untracked);
420 425
421 const s64 difference = query_cpu_range - cpu_addr; 426 const s64 difference = query_cpu_range - cpu_addr;
@@ -467,7 +472,9 @@ private:
467 bits = (bits << left_offset) >> left_offset; 472 bits = (bits << left_offset) >> left_offset;
468 473
469 const u64 current_word = state_words[word_index] & bits; 474 const u64 current_word = state_words[word_index] & bits;
470 state_words[word_index] &= ~bits; 475 if (clear) {
476 state_words[word_index] &= ~bits;
477 }
471 478
472 if constexpr (type == Type::CPU) { 479 if constexpr (type == Type::CPU) {
473 const u64 current_bits = untracked_words[word_index] & bits; 480 const u64 current_bits = untracked_words[word_index] & bits;
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index cad7f902d..502feddba 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -15,6 +15,7 @@
15#include <vector> 15#include <vector>
16 16
17#include <boost/container/small_vector.hpp> 17#include <boost/container/small_vector.hpp>
18#include <boost/icl/interval_set.hpp>
18 19
19#include "common/common_types.h" 20#include "common/common_types.h"
20#include "common/div_ceil.h" 21#include "common/div_ceil.h"
@@ -77,6 +78,9 @@ class BufferCache {
77 using Runtime = typename P::Runtime; 78 using Runtime = typename P::Runtime;
78 using Buffer = typename P::Buffer; 79 using Buffer = typename P::Buffer;
79 80
81 using IntervalSet = boost::icl::interval_set<VAddr>;
82 using IntervalType = typename IntervalSet::interval_type;
83
80 struct Empty {}; 84 struct Empty {};
81 85
82 struct OverlapResult { 86 struct OverlapResult {
@@ -148,11 +152,14 @@ public:
148 /// Return true when there are uncommitted buffers to be downloaded 152 /// Return true when there are uncommitted buffers to be downloaded
149 [[nodiscard]] bool HasUncommittedFlushes() const noexcept; 153 [[nodiscard]] bool HasUncommittedFlushes() const noexcept;
150 154
155 void AccumulateFlushes();
156
151 /// Return true when the caller should wait for async downloads 157 /// Return true when the caller should wait for async downloads
152 [[nodiscard]] bool ShouldWaitAsyncFlushes() const noexcept; 158 [[nodiscard]] bool ShouldWaitAsyncFlushes() const noexcept;
153 159
154 /// Commit asynchronous downloads 160 /// Commit asynchronous downloads
155 void CommitAsyncFlushes(); 161 void CommitAsyncFlushes();
162 void CommitAsyncFlushesHigh();
156 163
157 /// Pop asynchronous downloads 164 /// Pop asynchronous downloads
158 void PopAsyncFlushes(); 165 void PopAsyncFlushes();
@@ -160,6 +167,9 @@ public:
160 /// Return true when a CPU region is modified from the GPU 167 /// Return true when a CPU region is modified from the GPU
161 [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size); 168 [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size);
162 169
170 /// Return true when a CPU region is modified from the CPU
171 [[nodiscard]] bool IsRegionCpuModified(VAddr addr, size_t size);
172
163 std::mutex mutex; 173 std::mutex mutex;
164 174
165private: 175private:
@@ -272,8 +282,6 @@ private:
272 282
273 void DeleteBuffer(BufferId buffer_id); 283 void DeleteBuffer(BufferId buffer_id);
274 284
275 void ReplaceBufferDownloads(BufferId old_buffer_id, BufferId new_buffer_id);
276
277 void NotifyBufferDeletion(); 285 void NotifyBufferDeletion();
278 286
279 [[nodiscard]] Binding StorageBufferBinding(GPUVAddr ssbo_addr) const; 287 [[nodiscard]] Binding StorageBufferBinding(GPUVAddr ssbo_addr) const;
@@ -327,9 +335,9 @@ private:
327 335
328 std::vector<BufferId> cached_write_buffer_ids; 336 std::vector<BufferId> cached_write_buffer_ids;
329 337
330 // TODO: This data structure is not optimal and it should be reworked 338 IntervalSet uncommitted_ranges;
331 std::vector<BufferId> uncommitted_downloads; 339 IntervalSet common_ranges;
332 std::deque<std::vector<BufferId>> committed_downloads; 340 std::deque<IntervalSet> committed_ranges;
333 341
334 size_t immediate_buffer_capacity = 0; 342 size_t immediate_buffer_capacity = 0;
335 std::unique_ptr<u8[]> immediate_buffer_alloc; 343 std::unique_ptr<u8[]> immediate_buffer_alloc;
@@ -352,6 +360,7 @@ BufferCache<P>::BufferCache(VideoCore::RasterizerInterface& rasterizer_,
352 // Ensure the first slot is used for the null buffer 360 // Ensure the first slot is used for the null buffer
353 void(slot_buffers.insert(runtime, NullBufferParams{})); 361 void(slot_buffers.insert(runtime, NullBufferParams{}));
354 deletion_iterator = slot_buffers.end(); 362 deletion_iterator = slot_buffers.end();
363 common_ranges.clear();
355} 364}
356 365
357template <class P> 366template <class P>
@@ -547,29 +556,30 @@ void BufferCache<P>::FlushCachedWrites() {
547 556
548template <class P> 557template <class P>
549bool BufferCache<P>::HasUncommittedFlushes() const noexcept { 558bool BufferCache<P>::HasUncommittedFlushes() const noexcept {
550 return !uncommitted_downloads.empty(); 559 return !uncommitted_ranges.empty() || !committed_ranges.empty();
551} 560}
552 561
553template <class P> 562template <class P>
554bool BufferCache<P>::ShouldWaitAsyncFlushes() const noexcept { 563void BufferCache<P>::AccumulateFlushes() {
555 return !committed_downloads.empty() && !committed_downloads.front().empty(); 564 if (Settings::values.gpu_accuracy.GetValue() != Settings::GPUAccuracy::High) {
565 uncommitted_ranges.clear();
566 return;
567 }
568 if (uncommitted_ranges.empty()) {
569 return;
570 }
571 committed_ranges.emplace_back(std::move(uncommitted_ranges));
556} 572}
557 573
558template <class P> 574template <class P>
559void BufferCache<P>::CommitAsyncFlushes() { 575bool BufferCache<P>::ShouldWaitAsyncFlushes() const noexcept {
560 // This is intentionally passing the value by copy 576 return false;
561 committed_downloads.push_front(uncommitted_downloads);
562 uncommitted_downloads.clear();
563} 577}
564 578
565template <class P> 579template <class P>
566void BufferCache<P>::PopAsyncFlushes() { 580void BufferCache<P>::CommitAsyncFlushesHigh() {
567 if (committed_downloads.empty()) { 581 AccumulateFlushes();
568 return; 582 if (committed_ranges.empty()) {
569 }
570 auto scope_exit_pop_download = detail::ScopeExit([this] { committed_downloads.pop_back(); });
571 const std::span<const BufferId> download_ids = committed_downloads.back();
572 if (download_ids.empty()) {
573 return; 583 return;
574 } 584 }
575 MICROPROFILE_SCOPE(GPU_DownloadMemory); 585 MICROPROFILE_SCOPE(GPU_DownloadMemory);
@@ -577,20 +587,66 @@ void BufferCache<P>::PopAsyncFlushes() {
577 boost::container::small_vector<std::pair<BufferCopy, BufferId>, 1> downloads; 587 boost::container::small_vector<std::pair<BufferCopy, BufferId>, 1> downloads;
578 u64 total_size_bytes = 0; 588 u64 total_size_bytes = 0;
579 u64 largest_copy = 0; 589 u64 largest_copy = 0;
580 for (const BufferId buffer_id : download_ids) { 590 for (const IntervalSet& intervals : committed_ranges) {
581 slot_buffers[buffer_id].ForEachDownloadRange([&](u64 range_offset, u64 range_size) { 591 for (auto& interval : intervals) {
582 downloads.push_back({ 592 const std::size_t size = interval.upper() - interval.lower();
583 BufferCopy{ 593 const VAddr cpu_addr = interval.lower();
584 .src_offset = range_offset, 594 ForEachBufferInRange(cpu_addr, size, [&](BufferId buffer_id, Buffer& buffer) {
585 .dst_offset = total_size_bytes, 595 boost::container::small_vector<BufferCopy, 1> copies;
586 .size = range_size, 596 buffer.ForEachDownloadRangeAndClear(
587 }, 597 cpu_addr, size, [&](u64 range_offset, u64 range_size) {
588 buffer_id, 598 const VAddr buffer_addr = buffer.CpuAddr();
599 const auto add_download = [&](VAddr start, VAddr end) {
600 const u64 new_offset = start - buffer_addr;
601 const u64 new_size = end - start;
602 downloads.push_back({
603 BufferCopy{
604 .src_offset = new_offset,
605 .dst_offset = total_size_bytes,
606 .size = new_size,
607 },
608 buffer_id,
609 });
610 // Align up to avoid cache conflicts
611 constexpr u64 align = 256ULL;
612 constexpr u64 mask = ~(align - 1ULL);
613 total_size_bytes += (new_size + align - 1) & mask;
614 largest_copy = std::max(largest_copy, new_size);
615 };
616
617 const VAddr start_address = buffer_addr + range_offset;
618 const VAddr end_address = start_address + range_size;
619 const IntervalType search_interval{cpu_addr, 1};
620 auto it = common_ranges.lower_bound(search_interval);
621 if (it == common_ranges.end()) {
622 it = common_ranges.begin();
623 }
624 while (it != common_ranges.end()) {
625 VAddr inter_addr_end = it->upper();
626 VAddr inter_addr = it->lower();
627 if (inter_addr >= end_address) {
628 break;
629 }
630 if (inter_addr_end <= start_address) {
631 it++;
632 continue;
633 }
634 if (inter_addr_end > end_address) {
635 inter_addr_end = end_address;
636 }
637 if (inter_addr < start_address) {
638 inter_addr = start_address;
639 }
640 add_download(inter_addr, inter_addr_end);
641 it++;
642 }
643 const IntervalType subtract_interval{start_address, end_address};
644 common_ranges.subtract(subtract_interval);
645 });
589 }); 646 });
590 total_size_bytes += range_size; 647 }
591 largest_copy = std::max(largest_copy, range_size);
592 });
593 } 648 }
649 committed_ranges.clear();
594 if (downloads.empty()) { 650 if (downloads.empty()) {
595 return; 651 return;
596 } 652 }
@@ -623,6 +679,19 @@ void BufferCache<P>::PopAsyncFlushes() {
623} 679}
624 680
625template <class P> 681template <class P>
682void BufferCache<P>::CommitAsyncFlushes() {
683 if (Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::High) {
684 CommitAsyncFlushesHigh();
685 } else {
686 uncommitted_ranges.clear();
687 committed_ranges.clear();
688 }
689}
690
691template <class P>
692void BufferCache<P>::PopAsyncFlushes() {}
693
694template <class P>
626bool BufferCache<P>::IsRegionGpuModified(VAddr addr, size_t size) { 695bool BufferCache<P>::IsRegionGpuModified(VAddr addr, size_t size) {
627 const u64 page_end = Common::DivCeil(addr + size, PAGE_SIZE); 696 const u64 page_end = Common::DivCeil(addr + size, PAGE_SIZE);
628 for (u64 page = addr >> PAGE_BITS; page < page_end;) { 697 for (u64 page = addr >> PAGE_BITS; page < page_end;) {
@@ -642,6 +711,25 @@ bool BufferCache<P>::IsRegionGpuModified(VAddr addr, size_t size) {
642} 711}
643 712
644template <class P> 713template <class P>
714bool BufferCache<P>::IsRegionCpuModified(VAddr addr, size_t size) {
715 const u64 page_end = Common::DivCeil(addr + size, PAGE_SIZE);
716 for (u64 page = addr >> PAGE_BITS; page < page_end;) {
717 const BufferId image_id = page_table[page];
718 if (!image_id) {
719 ++page;
720 continue;
721 }
722 Buffer& buffer = slot_buffers[image_id];
723 if (buffer.IsRegionCpuModified(addr, size)) {
724 return true;
725 }
726 const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes();
727 page = Common::DivCeil(end_addr, PAGE_SIZE);
728 }
729 return false;
730}
731
732template <class P>
645void BufferCache<P>::BindHostIndexBuffer() { 733void BufferCache<P>::BindHostIndexBuffer() {
646 Buffer& buffer = slot_buffers[index_buffer.buffer_id]; 734 Buffer& buffer = slot_buffers[index_buffer.buffer_id];
647 TouchBuffer(buffer); 735 TouchBuffer(buffer);
@@ -1010,16 +1098,16 @@ void BufferCache<P>::MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 s
1010 Buffer& buffer = slot_buffers[buffer_id]; 1098 Buffer& buffer = slot_buffers[buffer_id];
1011 buffer.MarkRegionAsGpuModified(cpu_addr, size); 1099 buffer.MarkRegionAsGpuModified(cpu_addr, size);
1012 1100
1013 const bool is_accuracy_high = Settings::IsGPULevelHigh(); 1101 const IntervalType base_interval{cpu_addr, cpu_addr + size};
1102 common_ranges.add(base_interval);
1103
1104 const bool is_accuracy_high =
1105 Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::High;
1014 const bool is_async = Settings::values.use_asynchronous_gpu_emulation.GetValue(); 1106 const bool is_async = Settings::values.use_asynchronous_gpu_emulation.GetValue();
1015 if (!is_accuracy_high || !is_async) { 1107 if (!is_async && !is_accuracy_high) {
1016 return;
1017 }
1018 if (std::ranges::find(uncommitted_downloads, buffer_id) != uncommitted_downloads.end()) {
1019 // Already inserted
1020 return; 1108 return;
1021 } 1109 }
1022 uncommitted_downloads.push_back(buffer_id); 1110 uncommitted_ranges.add(base_interval);
1023} 1111}
1024 1112
1025template <class P> 1113template <class P>
@@ -1103,7 +1191,6 @@ void BufferCache<P>::JoinOverlap(BufferId new_buffer_id, BufferId overlap_id,
1103 if (!copies.empty()) { 1191 if (!copies.empty()) {
1104 runtime.CopyBuffer(slot_buffers[new_buffer_id], overlap, copies); 1192 runtime.CopyBuffer(slot_buffers[new_buffer_id], overlap, copies);
1105 } 1193 }
1106 ReplaceBufferDownloads(overlap_id, new_buffer_id);
1107 DeleteBuffer(overlap_id); 1194 DeleteBuffer(overlap_id);
1108} 1195}
1109 1196
@@ -1244,14 +1331,51 @@ void BufferCache<P>::DownloadBufferMemory(Buffer& buffer, VAddr cpu_addr, u64 si
1244 boost::container::small_vector<BufferCopy, 1> copies; 1331 boost::container::small_vector<BufferCopy, 1> copies;
1245 u64 total_size_bytes = 0; 1332 u64 total_size_bytes = 0;
1246 u64 largest_copy = 0; 1333 u64 largest_copy = 0;
1247 buffer.ForEachDownloadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) { 1334 buffer.ForEachDownloadRangeAndClear(cpu_addr, size, [&](u64 range_offset, u64 range_size) {
1248 copies.push_back(BufferCopy{ 1335 const VAddr buffer_addr = buffer.CpuAddr();
1249 .src_offset = range_offset, 1336 const auto add_download = [&](VAddr start, VAddr end) {
1250 .dst_offset = total_size_bytes, 1337 const u64 new_offset = start - buffer_addr;
1251 .size = range_size, 1338 const u64 new_size = end - start;
1252 }); 1339 copies.push_back(BufferCopy{
1253 total_size_bytes += range_size; 1340 .src_offset = new_offset,
1254 largest_copy = std::max(largest_copy, range_size); 1341 .dst_offset = total_size_bytes,
1342 .size = new_size,
1343 });
1344 // Align up to avoid cache conflicts
1345 constexpr u64 align = 256ULL;
1346 constexpr u64 mask = ~(align - 1ULL);
1347 total_size_bytes += (new_size + align - 1) & mask;
1348 largest_copy = std::max(largest_copy, new_size);
1349 };
1350
1351 const VAddr start_address = buffer_addr + range_offset;
1352 const VAddr end_address = start_address + range_size;
1353 const IntervalType search_interval{start_address - range_size, 1};
1354 auto it = common_ranges.lower_bound(search_interval);
1355 if (it == common_ranges.end()) {
1356 it = common_ranges.begin();
1357 }
1358 while (it != common_ranges.end()) {
1359 VAddr inter_addr_end = it->upper();
1360 VAddr inter_addr = it->lower();
1361 if (inter_addr >= end_address) {
1362 break;
1363 }
1364 if (inter_addr_end <= start_address) {
1365 it++;
1366 continue;
1367 }
1368 if (inter_addr_end > end_address) {
1369 inter_addr_end = end_address;
1370 }
1371 if (inter_addr < start_address) {
1372 inter_addr = start_address;
1373 }
1374 add_download(inter_addr, inter_addr_end);
1375 it++;
1376 }
1377 const IntervalType subtract_interval{start_address, end_address};
1378 common_ranges.subtract(subtract_interval);
1255 }); 1379 });
1256 if (total_size_bytes == 0) { 1380 if (total_size_bytes == 0) {
1257 return; 1381 return;
@@ -1316,18 +1440,6 @@ void BufferCache<P>::DeleteBuffer(BufferId buffer_id) {
1316} 1440}
1317 1441
1318template <class P> 1442template <class P>
1319void BufferCache<P>::ReplaceBufferDownloads(BufferId old_buffer_id, BufferId new_buffer_id) {
1320 const auto replace = [old_buffer_id, new_buffer_id](std::vector<BufferId>& buffers) {
1321 std::ranges::replace(buffers, old_buffer_id, new_buffer_id);
1322 if (auto it = std::ranges::find(buffers, new_buffer_id); it != buffers.end()) {
1323 buffers.erase(std::remove(it + 1, buffers.end(), new_buffer_id), buffers.end());
1324 }
1325 };
1326 replace(uncommitted_downloads);
1327 std::ranges::for_each(committed_downloads, replace);
1328}
1329
1330template <class P>
1331void BufferCache<P>::NotifyBufferDeletion() { 1443void BufferCache<P>::NotifyBufferDeletion() {
1332 if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) { 1444 if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) {
1333 dirty_uniform_buffers.fill(~u32{0}); 1445 dirty_uniform_buffers.fill(~u32{0});
@@ -1349,15 +1461,9 @@ typename BufferCache<P>::Binding BufferCache<P>::StorageBufferBinding(GPUVAddr s
1349 if (!cpu_addr || size == 0) { 1461 if (!cpu_addr || size == 0) {
1350 return NULL_BINDING; 1462 return NULL_BINDING;
1351 } 1463 }
1352 // HACK(Rodrigo): This is the number of bytes bound in host beyond the guest API's range.
1353 // It exists due to some games like Astral Chain operate out of bounds.
1354 // Binding the whole map range would be technically correct, but games have large maps that make
1355 // this approach unaffordable for now.
1356 static constexpr u32 arbitrary_extra_bytes = 0xc000;
1357 const u32 bytes_to_map_end = static_cast<u32>(gpu_memory.BytesToMapEnd(gpu_addr));
1358 const Binding binding{ 1464 const Binding binding{
1359 .cpu_addr = *cpu_addr, 1465 .cpu_addr = *cpu_addr,
1360 .size = std::min(size + arbitrary_extra_bytes, bytes_to_map_end), 1466 .size = size,
1361 .buffer_id = BufferId{}, 1467 .buffer_id = BufferId{},
1362 }; 1468 };
1363 return binding; 1469 return binding;
diff --git a/src/video_core/dma_pusher.cpp b/src/video_core/dma_pusher.cpp
index 8b33c04ab..8d28bd884 100644
--- a/src/video_core/dma_pusher.cpp
+++ b/src/video_core/dma_pusher.cpp
@@ -4,6 +4,7 @@
4 4
5#include "common/cityhash.h" 5#include "common/cityhash.h"
6#include "common/microprofile.h" 6#include "common/microprofile.h"
7#include "common/settings.h"
7#include "core/core.h" 8#include "core/core.h"
8#include "core/memory.h" 9#include "core/memory.h"
9#include "video_core/dma_pusher.h" 10#include "video_core/dma_pusher.h"
@@ -76,8 +77,13 @@ bool DmaPusher::Step() {
76 77
77 // Push buffer non-empty, read a word 78 // Push buffer non-empty, read a word
78 command_headers.resize(command_list_header.size); 79 command_headers.resize(command_list_header.size);
79 gpu.MemoryManager().ReadBlockUnsafe(dma_get, command_headers.data(), 80 if (Settings::IsGPULevelHigh()) {
80 command_list_header.size * sizeof(u32)); 81 gpu.MemoryManager().ReadBlock(dma_get, command_headers.data(),
82 command_list_header.size * sizeof(u32));
83 } else {
84 gpu.MemoryManager().ReadBlockUnsafe(dma_get, command_headers.data(),
85 command_list_header.size * sizeof(u32));
86 }
81 } 87 }
82 for (std::size_t index = 0; index < command_headers.size();) { 88 for (std::size_t index = 0; index < command_headers.size();) {
83 const CommandHeader& command_header = command_headers[index]; 89 const CommandHeader& command_header = command_headers[index];
diff --git a/src/video_core/fence_manager.h b/src/video_core/fence_manager.h
index f055b61e9..34dc6c596 100644
--- a/src/video_core/fence_manager.h
+++ b/src/video_core/fence_manager.h
@@ -8,6 +8,7 @@
8#include <queue> 8#include <queue>
9 9
10#include "common/common_types.h" 10#include "common/common_types.h"
11#include "common/settings.h"
11#include "core/core.h" 12#include "core/core.h"
12#include "video_core/delayed_destruction_ring.h" 13#include "video_core/delayed_destruction_ring.h"
13#include "video_core/gpu.h" 14#include "video_core/gpu.h"
@@ -53,6 +54,12 @@ public:
53 delayed_destruction_ring.Tick(); 54 delayed_destruction_ring.Tick();
54 } 55 }
55 56
57 // Unlike other fences, this one doesn't
58 void SignalOrdering() {
59 std::scoped_lock lock{buffer_cache.mutex};
60 buffer_cache.AccumulateFlushes();
61 }
62
56 void SignalSemaphore(GPUVAddr addr, u32 value) { 63 void SignalSemaphore(GPUVAddr addr, u32 value) {
57 TryReleasePendingFences(); 64 TryReleasePendingFences();
58 const bool should_flush = ShouldFlush(); 65 const bool should_flush = ShouldFlush();
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index 35cc561be..f317ddc2b 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -268,11 +268,13 @@ void GPU::CallPullerMethod(const MethodCall& method_call) {
268 case BufferMethods::SemaphoreAddressHigh: 268 case BufferMethods::SemaphoreAddressHigh:
269 case BufferMethods::SemaphoreAddressLow: 269 case BufferMethods::SemaphoreAddressLow:
270 case BufferMethods::SemaphoreSequence: 270 case BufferMethods::SemaphoreSequence:
271 case BufferMethods::RefCnt:
272 case BufferMethods::UnkCacheFlush: 271 case BufferMethods::UnkCacheFlush:
273 case BufferMethods::WrcacheFlush: 272 case BufferMethods::WrcacheFlush:
274 case BufferMethods::FenceValue: 273 case BufferMethods::FenceValue:
275 break; 274 break;
275 case BufferMethods::RefCnt:
276 rasterizer->SignalReference();
277 break;
276 case BufferMethods::FenceAction: 278 case BufferMethods::FenceAction:
277 ProcessFenceActionMethod(); 279 ProcessFenceActionMethod();
278 break; 280 break;
diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h
index 0cec4225b..67aef6000 100644
--- a/src/video_core/rasterizer_interface.h
+++ b/src/video_core/rasterizer_interface.h
@@ -63,6 +63,9 @@ public:
63 /// Signal a GPU based syncpoint as a fence 63 /// Signal a GPU based syncpoint as a fence
64 virtual void SignalSyncPoint(u32 value) = 0; 64 virtual void SignalSyncPoint(u32 value) = 0;
65 65
66 /// Signal a GPU based reference as point
67 virtual void SignalReference() = 0;
68
66 /// Release all pending fences. 69 /// Release all pending fences.
67 virtual void ReleaseFences() = 0; 70 virtual void ReleaseFences() = 0;
68 71
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 07ad0e205..a4ed8f68f 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -634,6 +634,13 @@ void RasterizerOpenGL::SignalSyncPoint(u32 value) {
634 fence_manager.SignalSyncPoint(value); 634 fence_manager.SignalSyncPoint(value);
635} 635}
636 636
637void RasterizerOpenGL::SignalReference() {
638 if (!gpu.IsAsync()) {
639 return;
640 }
641 fence_manager.SignalOrdering();
642}
643
637void RasterizerOpenGL::ReleaseFences() { 644void RasterizerOpenGL::ReleaseFences() {
638 if (!gpu.IsAsync()) { 645 if (!gpu.IsAsync()) {
639 return; 646 return;
@@ -650,6 +657,7 @@ void RasterizerOpenGL::FlushAndInvalidateRegion(VAddr addr, u64 size) {
650 657
651void RasterizerOpenGL::WaitForIdle() { 658void RasterizerOpenGL::WaitForIdle() {
652 glMemoryBarrier(GL_ALL_BARRIER_BITS); 659 glMemoryBarrier(GL_ALL_BARRIER_BITS);
660 SignalReference();
653} 661}
654 662
655void RasterizerOpenGL::FragmentBarrier() { 663void RasterizerOpenGL::FragmentBarrier() {
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index 482efed7a..d8df71962 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -83,6 +83,7 @@ public:
83 void ModifyGPUMemory(GPUVAddr addr, u64 size) override; 83 void ModifyGPUMemory(GPUVAddr addr, u64 size) override;
84 void SignalSemaphore(GPUVAddr addr, u32 value) override; 84 void SignalSemaphore(GPUVAddr addr, u32 value) override;
85 void SignalSyncPoint(u32 value) override; 85 void SignalSyncPoint(u32 value) override;
86 void SignalReference() override;
86 void ReleaseFences() override; 87 void ReleaseFences() override;
87 void FlushAndInvalidateRegion(VAddr addr, u64 size) override; 88 void FlushAndInvalidateRegion(VAddr addr, u64 size) override;
88 void WaitForIdle() override; 89 void WaitForIdle() override;
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index bd4d649cc..9ea4b6653 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -580,6 +580,13 @@ void RasterizerVulkan::SignalSyncPoint(u32 value) {
580 fence_manager.SignalSyncPoint(value); 580 fence_manager.SignalSyncPoint(value);
581} 581}
582 582
583void RasterizerVulkan::SignalReference() {
584 if (!gpu.IsAsync()) {
585 return;
586 }
587 fence_manager.SignalOrdering();
588}
589
583void RasterizerVulkan::ReleaseFences() { 590void RasterizerVulkan::ReleaseFences() {
584 if (!gpu.IsAsync()) { 591 if (!gpu.IsAsync()) {
585 return; 592 return;
@@ -612,6 +619,7 @@ void RasterizerVulkan::WaitForIdle() {
612 cmdbuf.SetEvent(event, flags); 619 cmdbuf.SetEvent(event, flags);
613 cmdbuf.WaitEvents(event, flags, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, {}, {}, {}); 620 cmdbuf.WaitEvents(event, flags, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, {}, {}, {});
614 }); 621 });
622 SignalReference();
615} 623}
616 624
617void RasterizerVulkan::FragmentBarrier() { 625void RasterizerVulkan::FragmentBarrier() {
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h
index 41459c5c5..5450ccfb5 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.h
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.h
@@ -75,6 +75,7 @@ public:
75 void ModifyGPUMemory(GPUVAddr addr, u64 size) override; 75 void ModifyGPUMemory(GPUVAddr addr, u64 size) override;
76 void SignalSemaphore(GPUVAddr addr, u32 value) override; 76 void SignalSemaphore(GPUVAddr addr, u32 value) override;
77 void SignalSyncPoint(u32 value) override; 77 void SignalSyncPoint(u32 value) override;
78 void SignalReference() override;
78 void ReleaseFences() override; 79 void ReleaseFences() override;
79 void FlushAndInvalidateRegion(VAddr addr, u64 size) override; 80 void FlushAndInvalidateRegion(VAddr addr, u64 size) override;
80 void WaitForIdle() override; 81 void WaitForIdle() override;
diff --git a/src/video_core/texture_cache/types.h b/src/video_core/texture_cache/types.h
index 9fbdc1ac6..47a11cb2f 100644
--- a/src/video_core/texture_cache/types.h
+++ b/src/video_core/texture_cache/types.h
@@ -133,8 +133,8 @@ struct BufferImageCopy {
133}; 133};
134 134
135struct BufferCopy { 135struct BufferCopy {
136 size_t src_offset; 136 u64 src_offset;
137 size_t dst_offset; 137 u64 dst_offset;
138 size_t size; 138 size_t size;
139}; 139};
140 140