summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGravatar Fernando Sahmkow2023-04-28 23:54:54 +0200
committerGravatar Fernando Sahmkow2023-05-01 11:43:26 +0200
commitd6f565e5da22ec6a6a77ffabd88e59f3a25bcc96 (patch)
tree4abb992e6ae3dc1f9b4614b3d78a0d43a0e74e39
parentBuffer Cache: Release staging buffers on tick frame (diff)
downloadyuzu-d6f565e5da22ec6a6a77ffabd88e59f3a25bcc96.tar.gz
yuzu-d6f565e5da22ec6a6a77ffabd88e59f3a25bcc96.tar.xz
yuzu-d6f565e5da22ec6a6a77ffabd88e59f3a25bcc96.zip
BufferCache: Fixes and address feedback
Diffstat (limited to '')
-rw-r--r--src/tests/video_core/memory_tracker.cpp4
-rw-r--r--src/video_core/buffer_cache/buffer_base.h99
-rw-r--r--src/video_core/buffer_cache/buffer_cache.h42
-rw-r--r--src/video_core/buffer_cache/buffer_cache_base.h19
-rw-r--r--src/video_core/buffer_cache/memory_tracker_base.h17
-rw-r--r--src/video_core/buffer_cache/word_manager.h384
6 files changed, 243 insertions, 322 deletions
diff --git a/src/tests/video_core/memory_tracker.cpp b/src/tests/video_core/memory_tracker.cpp
index 77d391f15..3981907a2 100644
--- a/src/tests/video_core/memory_tracker.cpp
+++ b/src/tests/video_core/memory_tracker.cpp
@@ -427,7 +427,7 @@ TEST_CASE("MemoryTracker: Single page in large region", "[video_core]") {
427 427
428 memory_track->MarkRegionAsCpuModified(c + WORD * 12 + PAGE * 8, PAGE); 428 memory_track->MarkRegionAsCpuModified(c + WORD * 12 + PAGE * 8, PAGE);
429 REQUIRE(memory_track->IsRegionCpuModified(c, WORD * 16)); 429 REQUIRE(memory_track->IsRegionCpuModified(c, WORD * 16));
430 REQUIRE(memory_track->IsRegionCpuModified(c + WORD * 10, WORD * 2)); 430 REQUIRE(!memory_track->IsRegionCpuModified(c + WORD * 10, WORD * 2));
431 REQUIRE(memory_track->IsRegionCpuModified(c + WORD * 11, WORD * 2)); 431 REQUIRE(memory_track->IsRegionCpuModified(c + WORD * 11, WORD * 2));
432 REQUIRE(memory_track->IsRegionCpuModified(c + WORD * 12, WORD * 2)); 432 REQUIRE(memory_track->IsRegionCpuModified(c + WORD * 12, WORD * 2));
433 REQUIRE(memory_track->IsRegionCpuModified(c + WORD * 12 + PAGE * 4, PAGE * 8)); 433 REQUIRE(memory_track->IsRegionCpuModified(c + WORD * 12 + PAGE * 4, PAGE * 8));
@@ -535,6 +535,8 @@ TEST_CASE("MemoryTracker: Cached write downloads") {
535 memory_track->MarkRegionAsGpuModified(c + PAGE, PAGE); 535 memory_track->MarkRegionAsGpuModified(c + PAGE, PAGE);
536 int num = 0; 536 int num = 0;
537 memory_track->ForEachDownloadRangeAndClear(c, WORD, [&](u64 offset, u64 size) { ++num; }); 537 memory_track->ForEachDownloadRangeAndClear(c, WORD, [&](u64 offset, u64 size) { ++num; });
538 REQUIRE(num == 1);
539 num = 0;
538 memory_track->ForEachUploadRange(c, WORD, [&](u64 offset, u64 size) { ++num; }); 540 memory_track->ForEachUploadRange(c, WORD, [&](u64 offset, u64 size) { ++num; });
539 REQUIRE(num == 0); 541 REQUIRE(num == 0);
540 REQUIRE(!memory_track->IsRegionCpuModified(c + PAGE, PAGE)); 542 REQUIRE(!memory_track->IsRegionCpuModified(c + PAGE, PAGE));
diff --git a/src/video_core/buffer_cache/buffer_base.h b/src/video_core/buffer_cache/buffer_base.h
index 095f79387..9cbd95c4b 100644
--- a/src/video_core/buffer_cache/buffer_base.h
+++ b/src/video_core/buffer_cache/buffer_base.h
@@ -38,10 +38,8 @@ public:
38 static constexpr u64 BASE_PAGE_BITS = 16; 38 static constexpr u64 BASE_PAGE_BITS = 16;
39 static constexpr u64 BASE_PAGE_SIZE = 1ULL << BASE_PAGE_BITS; 39 static constexpr u64 BASE_PAGE_SIZE = 1ULL << BASE_PAGE_BITS;
40 40
41 explicit BufferBase(RasterizerInterface& rasterizer_, VAddr cpu_addr_, u64 size_bytes) 41 explicit BufferBase(RasterizerInterface& rasterizer_, VAddr cpu_addr_, u64 size_bytes_)
42 : cpu_addr{Common::AlignDown(cpu_addr_, BASE_PAGE_SIZE)}, 42 : cpu_addr{cpu_addr_}, size_bytes{size_bytes_} {}
43 word_manager(cpu_addr, rasterizer_,
44 Common::AlignUp(size_bytes + (cpu_addr_ - cpu_addr), BASE_PAGE_SIZE)) {}
45 43
46 explicit BufferBase(NullBufferParams) {} 44 explicit BufferBase(NullBufferParams) {}
47 45
@@ -51,88 +49,6 @@ public:
51 BufferBase& operator=(BufferBase&&) = default; 49 BufferBase& operator=(BufferBase&&) = default;
52 BufferBase(BufferBase&&) = default; 50 BufferBase(BufferBase&&) = default;
53 51
54 /// Returns the inclusive CPU modified range in a begin end pair
55 [[nodiscard]] std::pair<u64, u64> ModifiedCpuRegion(VAddr query_cpu_addr,
56 u64 query_size) const noexcept {
57 const u64 offset = query_cpu_addr - cpu_addr;
58 return word_manager.template ModifiedRegion<Type::CPU>(offset, query_size);
59 }
60
61 /// Returns the inclusive GPU modified range in a begin end pair
62 [[nodiscard]] std::pair<u64, u64> ModifiedGpuRegion(VAddr query_cpu_addr,
63 u64 query_size) const noexcept {
64 const u64 offset = query_cpu_addr - cpu_addr;
65 return word_manager.template ModifiedRegion<Type::GPU>(offset, query_size);
66 }
67
68 /// Returns true if a region has been modified from the CPU
69 [[nodiscard]] bool IsRegionCpuModified(VAddr query_cpu_addr, u64 query_size) const noexcept {
70 const u64 offset = query_cpu_addr - cpu_addr;
71 return word_manager.template IsRegionModified<Type::CPU>(offset, query_size);
72 }
73
74 /// Returns true if a region has been modified from the GPU
75 [[nodiscard]] bool IsRegionGpuModified(VAddr query_cpu_addr, u64 query_size) const noexcept {
76 const u64 offset = query_cpu_addr - cpu_addr;
77 return word_manager.template IsRegionModified<Type::GPU>(offset, query_size);
78 }
79
80 /// Mark region as CPU modified, notifying the rasterizer about this change
81 void MarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 size) {
82 word_manager.template ChangeRegionState<Type::CPU, true>(dirty_cpu_addr, size);
83 }
84
85 /// Unmark region as CPU modified, notifying the rasterizer about this change
86 void UnmarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 size) {
87 word_manager.template ChangeRegionState<Type::CPU, false>(dirty_cpu_addr, size);
88 }
89
90 /// Mark region as modified from the host GPU
91 void MarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 size) noexcept {
92 word_manager.template ChangeRegionState<Type::GPU, true>(dirty_cpu_addr, size);
93 }
94
95 /// Unmark region as modified from the host GPU
96 void UnmarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 size) noexcept {
97 word_manager.template ChangeRegionState<Type::GPU, false>(dirty_cpu_addr, size);
98 }
99
100 /// Mark region as modified from the CPU
101 /// but don't mark it as modified until FlusHCachedWrites is called.
102 void CachedCpuWrite(VAddr dirty_cpu_addr, u64 size) {
103 flags |= BufferFlagBits::CachedWrites;
104 word_manager.template ChangeRegionState<Type::CachedCPU, true>(dirty_cpu_addr, size);
105 }
106
107 /// Flushes cached CPU writes, and notify the rasterizer about the deltas
108 void FlushCachedWrites() noexcept {
109 flags &= ~BufferFlagBits::CachedWrites;
110 word_manager.FlushCachedWrites();
111 }
112
113 /// Call 'func' for each CPU modified range and unmark those pages as CPU modified
114 template <typename Func>
115 void ForEachUploadRange(VAddr query_cpu_range, u64 size, Func&& func) {
116 word_manager.template ForEachModifiedRange<Type::CPU>(query_cpu_range, size, true, func);
117 }
118
119 /// Call 'func' for each GPU modified range and unmark those pages as GPU modified
120 template <typename Func>
121 void ForEachDownloadRange(VAddr query_cpu_range, u64 size, bool clear, Func&& func) {
122 word_manager.template ForEachModifiedRange<Type::GPU>(query_cpu_range, size, clear, func);
123 }
124
125 template <typename Func>
126 void ForEachDownloadRangeAndClear(VAddr query_cpu_range, u64 size, Func&& func) {
127 word_manager.template ForEachModifiedRange<Type::GPU>(query_cpu_range, size, true, func);
128 }
129
130 /// Call 'func' for each GPU modified range and unmark those pages as GPU modified
131 template <typename Func>
132 void ForEachDownloadRange(Func&& func) {
133 word_manager.template ForEachModifiedRange<Type::GPU>(cpu_addr, SizeBytes(), true, func);
134 }
135
136 /// Mark buffer as picked 52 /// Mark buffer as picked
137 void Pick() noexcept { 53 void Pick() noexcept {
138 flags |= BufferFlagBits::Picked; 54 flags |= BufferFlagBits::Picked;
@@ -179,11 +95,6 @@ public:
179 return static_cast<u32>(other_cpu_addr - cpu_addr); 95 return static_cast<u32>(other_cpu_addr - cpu_addr);
180 } 96 }
181 97
182 /// Returns the size in bytes of the buffer
183 [[nodiscard]] u64 SizeBytes() const noexcept {
184 return word_manager.SizeBytes();
185 }
186
187 size_t getLRUID() const noexcept { 98 size_t getLRUID() const noexcept {
188 return lru_id; 99 return lru_id;
189 } 100 }
@@ -192,12 +103,16 @@ public:
192 lru_id = lru_id_; 103 lru_id = lru_id_;
193 } 104 }
194 105
106 size_t SizeBytes() const {
107 return size_bytes;
108 }
109
195private: 110private:
196 VAddr cpu_addr = 0; 111 VAddr cpu_addr = 0;
197 WordManager<RasterizerInterface> word_manager;
198 BufferFlagBits flags{}; 112 BufferFlagBits flags{};
199 int stream_score = 0; 113 int stream_score = 0;
200 size_t lru_id = SIZE_MAX; 114 size_t lru_id = SIZE_MAX;
115 size_t size_bytes = 0;
201}; 116};
202 117
203} // namespace VideoCommon 118} // namespace VideoCommon
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index e5c626c36..7975564b5 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -21,6 +21,7 @@ BufferCache<P>::BufferCache(VideoCore::RasterizerInterface& rasterizer_,
21 // Ensure the first slot is used for the null buffer 21 // Ensure the first slot is used for the null buffer
22 void(slot_buffers.insert(runtime, NullBufferParams{})); 22 void(slot_buffers.insert(runtime, NullBufferParams{}));
23 common_ranges.clear(); 23 common_ranges.clear();
24 inline_buffer_id = NULL_BUFFER_ID;
24 25
25 active_async_buffers = !Settings::IsGPULevelHigh(); 26 active_async_buffers = !Settings::IsGPULevelHigh();
26 27
@@ -442,9 +443,6 @@ template <class P>
442void BufferCache<P>::FlushCachedWrites() { 443void BufferCache<P>::FlushCachedWrites() {
443 cached_write_buffer_ids.clear(); 444 cached_write_buffer_ids.clear();
444 memory_tracker.FlushCachedWrites(); 445 memory_tracker.FlushCachedWrites();
445 for (auto& interval : cached_ranges) {
446 ClearDownload(interval);
447 }
448 cached_ranges.clear(); 446 cached_ranges.clear();
449} 447}
450 448
@@ -659,8 +657,8 @@ bool BufferCache<P>::IsRegionGpuModified(VAddr addr, size_t size) {
659template <class P> 657template <class P>
660bool BufferCache<P>::IsRegionRegistered(VAddr addr, size_t size) { 658bool BufferCache<P>::IsRegionRegistered(VAddr addr, size_t size) {
661 const VAddr end_addr = addr + size; 659 const VAddr end_addr = addr + size;
662 const u64 page_end = Common::DivCeil(end_addr, PAGE_SIZE); 660 const u64 page_end = Common::DivCeil(end_addr, CACHING_PAGESIZE);
663 for (u64 page = addr >> PAGE_BITS; page < page_end;) { 661 for (u64 page = addr >> CACHING_PAGEBITS; page < page_end;) {
664 const BufferId buffer_id = page_table[page]; 662 const BufferId buffer_id = page_table[page];
665 if (!buffer_id) { 663 if (!buffer_id) {
666 ++page; 664 ++page;
@@ -672,7 +670,7 @@ bool BufferCache<P>::IsRegionRegistered(VAddr addr, size_t size) {
672 if (buf_start_addr < end_addr && addr < buf_end_addr) { 670 if (buf_start_addr < end_addr && addr < buf_end_addr) {
673 return true; 671 return true;
674 } 672 }
675 page = Common::DivCeil(end_addr, PAGE_SIZE); 673 page = Common::DivCeil(end_addr, CACHING_PAGESIZE);
676 } 674 }
677 return false; 675 return false;
678} 676}
@@ -689,7 +687,7 @@ void BufferCache<P>::BindHostIndexBuffer() {
689 const u32 offset = buffer.Offset(index_buffer.cpu_addr); 687 const u32 offset = buffer.Offset(index_buffer.cpu_addr);
690 const u32 size = index_buffer.size; 688 const u32 size = index_buffer.size;
691 const auto& draw_state = maxwell3d->draw_manager->GetDrawState(); 689 const auto& draw_state = maxwell3d->draw_manager->GetDrawState();
692 if (!draw_state.inline_index_draw_indexes.empty()) { 690 if (!draw_state.inline_index_draw_indexes.empty()) [[unlikely]] {
693 if constexpr (USE_MEMORY_MAPS) { 691 if constexpr (USE_MEMORY_MAPS) {
694 auto upload_staging = runtime.UploadStagingBuffer(size); 692 auto upload_staging = runtime.UploadStagingBuffer(size);
695 std::array<BufferCopy, 1> copies{ 693 std::array<BufferCopy, 1> copies{
@@ -1001,12 +999,20 @@ void BufferCache<P>::UpdateIndexBuffer() {
1001 return; 999 return;
1002 } 1000 }
1003 flags[Dirty::IndexBuffer] = false; 1001 flags[Dirty::IndexBuffer] = false;
1004 if (!draw_state.inline_index_draw_indexes.empty()) { 1002 if (!draw_state.inline_index_draw_indexes.empty()) [[unlikely]] {
1005 auto inline_index_size = static_cast<u32>(draw_state.inline_index_draw_indexes.size()); 1003 auto inline_index_size = static_cast<u32>(draw_state.inline_index_draw_indexes.size());
1004 u32 buffer_size = Common::AlignUp(inline_index_size, CACHING_PAGESIZE);
1005 if (inline_buffer_id == NULL_BUFFER_ID) [[unlikely]] {
1006 inline_buffer_id = CreateBuffer(0, buffer_size);
1007 }
1008 if (slot_buffers[inline_buffer_id].SizeBytes() < buffer_size) [[unlikely]] {
1009 slot_buffers.erase(inline_buffer_id);
1010 inline_buffer_id = CreateBuffer(0, buffer_size);
1011 }
1006 index_buffer = Binding{ 1012 index_buffer = Binding{
1007 .cpu_addr = 0, 1013 .cpu_addr = 0,
1008 .size = inline_index_size, 1014 .size = inline_index_size,
1009 .buffer_id = FindBuffer(0, inline_index_size), 1015 .buffer_id = inline_buffer_id,
1010 }; 1016 };
1011 return; 1017 return;
1012 } 1018 }
@@ -1224,7 +1230,7 @@ BufferId BufferCache<P>::FindBuffer(VAddr cpu_addr, u32 size) {
1224 if (cpu_addr == 0) { 1230 if (cpu_addr == 0) {
1225 return NULL_BUFFER_ID; 1231 return NULL_BUFFER_ID;
1226 } 1232 }
1227 const u64 page = cpu_addr >> PAGE_BITS; 1233 const u64 page = cpu_addr >> CACHING_PAGEBITS;
1228 const BufferId buffer_id = page_table[page]; 1234 const BufferId buffer_id = page_table[page];
1229 if (!buffer_id) { 1235 if (!buffer_id) {
1230 return CreateBuffer(cpu_addr, size); 1236 return CreateBuffer(cpu_addr, size);
@@ -1253,8 +1259,9 @@ typename BufferCache<P>::OverlapResult BufferCache<P>::ResolveOverlaps(VAddr cpu
1253 .has_stream_leap = has_stream_leap, 1259 .has_stream_leap = has_stream_leap,
1254 }; 1260 };
1255 } 1261 }
1256 for (; cpu_addr >> PAGE_BITS < Common::DivCeil(end, PAGE_SIZE); cpu_addr += PAGE_SIZE) { 1262 for (; cpu_addr >> CACHING_PAGEBITS < Common::DivCeil(end, CACHING_PAGESIZE);
1257 const BufferId overlap_id = page_table[cpu_addr >> PAGE_BITS]; 1263 cpu_addr += CACHING_PAGESIZE) {
1264 const BufferId overlap_id = page_table[cpu_addr >> CACHING_PAGEBITS];
1258 if (!overlap_id) { 1265 if (!overlap_id) {
1259 continue; 1266 continue;
1260 } 1267 }
@@ -1280,11 +1287,11 @@ typename BufferCache<P>::OverlapResult BufferCache<P>::ResolveOverlaps(VAddr cpu
1280 // as a stream buffer. Increase the size to skip constantly recreating buffers. 1287 // as a stream buffer. Increase the size to skip constantly recreating buffers.
1281 has_stream_leap = true; 1288 has_stream_leap = true;
1282 if (expands_right) { 1289 if (expands_right) {
1283 begin -= PAGE_SIZE * 256; 1290 begin -= CACHING_PAGESIZE * 256;
1284 cpu_addr = begin; 1291 cpu_addr = begin;
1285 } 1292 }
1286 if (expands_left) { 1293 if (expands_left) {
1287 end += PAGE_SIZE * 256; 1294 end += CACHING_PAGESIZE * 256;
1288 } 1295 }
1289 } 1296 }
1290 } 1297 }
@@ -1317,6 +1324,9 @@ void BufferCache<P>::JoinOverlap(BufferId new_buffer_id, BufferId overlap_id,
1317 1324
1318template <class P> 1325template <class P>
1319BufferId BufferCache<P>::CreateBuffer(VAddr cpu_addr, u32 wanted_size) { 1326BufferId BufferCache<P>::CreateBuffer(VAddr cpu_addr, u32 wanted_size) {
1327 VAddr cpu_addr_end = Common::AlignUp(cpu_addr + wanted_size, CACHING_PAGESIZE);
1328 cpu_addr = Common::AlignDown(cpu_addr, CACHING_PAGESIZE);
1329 wanted_size = static_cast<u32>(cpu_addr_end - cpu_addr);
1320 const OverlapResult overlap = ResolveOverlaps(cpu_addr, wanted_size); 1330 const OverlapResult overlap = ResolveOverlaps(cpu_addr, wanted_size);
1321 const u32 size = static_cast<u32>(overlap.end - overlap.begin); 1331 const u32 size = static_cast<u32>(overlap.end - overlap.begin);
1322 const BufferId new_buffer_id = slot_buffers.insert(runtime, rasterizer, overlap.begin, size); 1332 const BufferId new_buffer_id = slot_buffers.insert(runtime, rasterizer, overlap.begin, size);
@@ -1354,8 +1364,8 @@ void BufferCache<P>::ChangeRegister(BufferId buffer_id) {
1354 } 1364 }
1355 const VAddr cpu_addr_begin = buffer.CpuAddr(); 1365 const VAddr cpu_addr_begin = buffer.CpuAddr();
1356 const VAddr cpu_addr_end = cpu_addr_begin + size; 1366 const VAddr cpu_addr_end = cpu_addr_begin + size;
1357 const u64 page_begin = cpu_addr_begin / PAGE_SIZE; 1367 const u64 page_begin = cpu_addr_begin / CACHING_PAGESIZE;
1358 const u64 page_end = Common::DivCeil(cpu_addr_end, PAGE_SIZE); 1368 const u64 page_end = Common::DivCeil(cpu_addr_end, CACHING_PAGESIZE);
1359 for (u64 page = page_begin; page != page_end; ++page) { 1369 for (u64 page = page_begin; page != page_end; ++page) {
1360 if constexpr (insert) { 1370 if constexpr (insert) {
1361 page_table[page] = buffer_id; 1371 page_table[page] = buffer_id;
diff --git a/src/video_core/buffer_cache/buffer_cache_base.h b/src/video_core/buffer_cache/buffer_cache_base.h
index 75cb98ba3..656baa550 100644
--- a/src/video_core/buffer_cache/buffer_cache_base.h
+++ b/src/video_core/buffer_cache/buffer_cache_base.h
@@ -90,10 +90,8 @@ template <typename P>
90class BufferCache : public VideoCommon::ChannelSetupCaches<VideoCommon::ChannelInfo> { 90class BufferCache : public VideoCommon::ChannelSetupCaches<VideoCommon::ChannelInfo> {
91 // Page size for caching purposes. 91 // Page size for caching purposes.
92 // This is unrelated to the CPU page size and it can be changed as it seems optimal. 92 // This is unrelated to the CPU page size and it can be changed as it seems optimal.
93 static constexpr u32 PAGE_BITS = 16; 93 static constexpr u32 CACHING_PAGEBITS = 16;
94 static constexpr u64 PAGE_SIZE = u64{1} << PAGE_BITS; 94 static constexpr u64 CACHING_PAGESIZE = u64{1} << CACHING_PAGEBITS;
95 static constexpr u32 CPU_PAGE_BITS = 12;
96 static constexpr u64 CPU_PAGE_SIZE = u64{1} << CPU_PAGE_BITS;
97 95
98 static constexpr bool IS_OPENGL = P::IS_OPENGL; 96 static constexpr bool IS_OPENGL = P::IS_OPENGL;
99 static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS = 97 static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS =
@@ -112,6 +110,10 @@ class BufferCache : public VideoCommon::ChannelSetupCaches<VideoCommon::ChannelI
112 static constexpr s64 DEFAULT_CRITICAL_MEMORY = 1_GiB; 110 static constexpr s64 DEFAULT_CRITICAL_MEMORY = 1_GiB;
113 static constexpr s64 TARGET_THRESHOLD = 4_GiB; 111 static constexpr s64 TARGET_THRESHOLD = 4_GiB;
114 112
113 // Debug Flags.
114
115 static constexpr bool DISABLE_DOWNLOADS = true;
116
115 using Maxwell = Tegra::Engines::Maxwell3D::Regs; 117 using Maxwell = Tegra::Engines::Maxwell3D::Regs;
116 118
117 using Runtime = typename P::Runtime; 119 using Runtime = typename P::Runtime;
@@ -286,8 +288,8 @@ private:
286 288
287 template <typename Func> 289 template <typename Func>
288 void ForEachBufferInRange(VAddr cpu_addr, u64 size, Func&& func) { 290 void ForEachBufferInRange(VAddr cpu_addr, u64 size, Func&& func) {
289 const u64 page_end = Common::DivCeil(cpu_addr + size, PAGE_SIZE); 291 const u64 page_end = Common::DivCeil(cpu_addr + size, CACHING_PAGESIZE);
290 for (u64 page = cpu_addr >> PAGE_BITS; page < page_end;) { 292 for (u64 page = cpu_addr >> CACHING_PAGEBITS; page < page_end;) {
291 const BufferId buffer_id = page_table[page]; 293 const BufferId buffer_id = page_table[page];
292 if (!buffer_id) { 294 if (!buffer_id) {
293 ++page; 295 ++page;
@@ -297,7 +299,7 @@ private:
297 func(buffer_id, buffer); 299 func(buffer_id, buffer);
298 300
299 const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes(); 301 const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes();
300 page = Common::DivCeil(end_addr, PAGE_SIZE); 302 page = Common::DivCeil(end_addr, CACHING_PAGESIZE);
301 } 303 }
302 } 304 }
303 305
@@ -568,10 +570,11 @@ private:
568 u64 total_used_memory = 0; 570 u64 total_used_memory = 0;
569 u64 minimum_memory = 0; 571 u64 minimum_memory = 0;
570 u64 critical_memory = 0; 572 u64 critical_memory = 0;
573 BufferId inline_buffer_id;
571 574
572 bool active_async_buffers = false; 575 bool active_async_buffers = false;
573 576
574 std::array<BufferId, ((1ULL << 39) >> PAGE_BITS)> page_table; 577 std::array<BufferId, ((1ULL << 39) >> CACHING_PAGEBITS)> page_table;
575}; 578};
576 579
577} // namespace VideoCommon 580} // namespace VideoCommon
diff --git a/src/video_core/buffer_cache/memory_tracker_base.h b/src/video_core/buffer_cache/memory_tracker_base.h
index 016d8430f..4bc59017f 100644
--- a/src/video_core/buffer_cache/memory_tracker_base.h
+++ b/src/video_core/buffer_cache/memory_tracker_base.h
@@ -132,8 +132,8 @@ public:
132 void ForEachUploadRange(VAddr query_cpu_range, u64 query_size, Func&& func) { 132 void ForEachUploadRange(VAddr query_cpu_range, u64 query_size, Func&& func) {
133 IteratePages<true>(query_cpu_range, query_size, 133 IteratePages<true>(query_cpu_range, query_size,
134 [&func](Manager* manager, u64 offset, size_t size) { 134 [&func](Manager* manager, u64 offset, size_t size) {
135 manager->template ForEachModifiedRange<Type::CPU>( 135 manager->template ForEachModifiedRange<Type::CPU, true>(
136 manager->GetCpuAddr() + offset, size, true, func); 136 manager->GetCpuAddr() + offset, size, func);
137 }); 137 });
138 } 138 }
139 139
@@ -142,8 +142,13 @@ public:
142 void ForEachDownloadRange(VAddr query_cpu_range, u64 query_size, bool clear, Func&& func) { 142 void ForEachDownloadRange(VAddr query_cpu_range, u64 query_size, bool clear, Func&& func) {
143 IteratePages<false>(query_cpu_range, query_size, 143 IteratePages<false>(query_cpu_range, query_size,
144 [&func, clear](Manager* manager, u64 offset, size_t size) { 144 [&func, clear](Manager* manager, u64 offset, size_t size) {
145 manager->template ForEachModifiedRange<Type::GPU>( 145 if (clear) {
146 manager->GetCpuAddr() + offset, size, clear, func); 146 manager->template ForEachModifiedRange<Type::GPU, true>(
147 manager->GetCpuAddr() + offset, size, func);
148 } else {
149 manager->template ForEachModifiedRange<Type::GPU, false>(
150 manager->GetCpuAddr() + offset, size, func);
151 }
147 }); 152 });
148 } 153 }
149 154
@@ -151,8 +156,8 @@ public:
151 void ForEachDownloadRangeAndClear(VAddr query_cpu_range, u64 query_size, Func&& func) { 156 void ForEachDownloadRangeAndClear(VAddr query_cpu_range, u64 query_size, Func&& func) {
152 IteratePages<false>(query_cpu_range, query_size, 157 IteratePages<false>(query_cpu_range, query_size,
153 [&func](Manager* manager, u64 offset, size_t size) { 158 [&func](Manager* manager, u64 offset, size_t size) {
154 manager->template ForEachModifiedRange<Type::GPU>( 159 manager->template ForEachModifiedRange<Type::GPU, true>(
155 manager->GetCpuAddr() + offset, size, true, func); 160 manager->GetCpuAddr() + offset, size, func);
156 }); 161 });
157 } 162 }
158 163
diff --git a/src/video_core/buffer_cache/word_manager.h b/src/video_core/buffer_cache/word_manager.h
index 21729752b..a42455045 100644
--- a/src/video_core/buffer_cache/word_manager.h
+++ b/src/video_core/buffer_cache/word_manager.h
@@ -6,6 +6,7 @@
6#include <algorithm> 6#include <algorithm>
7#include <bit> 7#include <bit>
8#include <limits> 8#include <limits>
9#include <span>
9#include <utility> 10#include <utility>
10 11
11#include "common/alignment.h" 12#include "common/alignment.h"
@@ -20,9 +21,16 @@ constexpr u64 PAGES_PER_WORD = 64;
20constexpr u64 BYTES_PER_PAGE = Core::Memory::YUZU_PAGESIZE; 21constexpr u64 BYTES_PER_PAGE = Core::Memory::YUZU_PAGESIZE;
21constexpr u64 BYTES_PER_WORD = PAGES_PER_WORD * BYTES_PER_PAGE; 22constexpr u64 BYTES_PER_WORD = PAGES_PER_WORD * BYTES_PER_PAGE;
22 23
24enum class Type {
25 CPU,
26 GPU,
27 CachedCPU,
28 Untracked,
29};
30
23/// Vector tracking modified pages tightly packed with small vector optimization 31/// Vector tracking modified pages tightly packed with small vector optimization
24template <size_t stack_words = 1> 32template <size_t stack_words = 1>
25union WordsArray { 33struct WordsArray {
26 /// Returns the pointer to the words state 34 /// Returns the pointer to the words state
27 [[nodiscard]] const u64* Pointer(bool is_short) const noexcept { 35 [[nodiscard]] const u64* Pointer(bool is_short) const noexcept {
28 return is_short ? stack.data() : heap; 36 return is_short ? stack.data() : heap;
@@ -41,13 +49,13 @@ template <size_t stack_words = 1>
41struct Words { 49struct Words {
42 explicit Words() = default; 50 explicit Words() = default;
43 explicit Words(u64 size_bytes_) : size_bytes{size_bytes_} { 51 explicit Words(u64 size_bytes_) : size_bytes{size_bytes_} {
52 num_words = Common::DivCeil(size_bytes, BYTES_PER_WORD);
44 if (IsShort()) { 53 if (IsShort()) {
45 cpu.stack.fill(~u64{0}); 54 cpu.stack.fill(~u64{0});
46 gpu.stack.fill(0); 55 gpu.stack.fill(0);
47 cached_cpu.stack.fill(0); 56 cached_cpu.stack.fill(0);
48 untracked.stack.fill(~u64{0}); 57 untracked.stack.fill(~u64{0});
49 } else { 58 } else {
50 const size_t num_words = NumWords();
51 // Share allocation between CPU and GPU pages and set their default values 59 // Share allocation between CPU and GPU pages and set their default values
52 u64* const alloc = new u64[num_words * 4]; 60 u64* const alloc = new u64[num_words * 4];
53 cpu.heap = alloc; 61 cpu.heap = alloc;
@@ -75,6 +83,7 @@ struct Words {
75 Words& operator=(Words&& rhs) noexcept { 83 Words& operator=(Words&& rhs) noexcept {
76 Release(); 84 Release();
77 size_bytes = rhs.size_bytes; 85 size_bytes = rhs.size_bytes;
86 num_words = rhs.num_words;
78 cpu = rhs.cpu; 87 cpu = rhs.cpu;
79 gpu = rhs.gpu; 88 gpu = rhs.gpu;
80 cached_cpu = rhs.cached_cpu; 89 cached_cpu = rhs.cached_cpu;
@@ -84,7 +93,7 @@ struct Words {
84 } 93 }
85 94
86 Words(Words&& rhs) noexcept 95 Words(Words&& rhs) noexcept
87 : size_bytes{rhs.size_bytes}, cpu{rhs.cpu}, gpu{rhs.gpu}, 96 : size_bytes{rhs.size_bytes}, num_words{rhs.num_words}, cpu{rhs.cpu}, gpu{rhs.gpu},
88 cached_cpu{rhs.cached_cpu}, untracked{rhs.untracked} { 97 cached_cpu{rhs.cached_cpu}, untracked{rhs.untracked} {
89 rhs.cpu.heap = nullptr; 98 rhs.cpu.heap = nullptr;
90 } 99 }
@@ -94,12 +103,12 @@ struct Words {
94 103
95 /// Returns true when the buffer fits in the small vector optimization 104 /// Returns true when the buffer fits in the small vector optimization
96 [[nodiscard]] bool IsShort() const noexcept { 105 [[nodiscard]] bool IsShort() const noexcept {
97 return size_bytes <= stack_words * BYTES_PER_WORD; 106 return num_words <= stack_words;
98 } 107 }
99 108
100 /// Returns the number of words of the buffer 109 /// Returns the number of words of the buffer
101 [[nodiscard]] size_t NumWords() const noexcept { 110 [[nodiscard]] size_t NumWords() const noexcept {
102 return Common::DivCeil(size_bytes, BYTES_PER_WORD); 111 return num_words;
103 } 112 }
104 113
105 /// Release buffer resources 114 /// Release buffer resources
@@ -110,20 +119,40 @@ struct Words {
110 } 119 }
111 } 120 }
112 121
122 template <Type type>
123 std::span<u64> Span() noexcept {
124 if constexpr (type == Type::CPU) {
125 return std::span<u64>(cpu.Pointer(IsShort()), num_words);
126 } else if constexpr (type == Type::GPU) {
127 return std::span<u64>(gpu.Pointer(IsShort()), num_words);
128 } else if constexpr (type == Type::CachedCPU) {
129 return std::span<u64>(cached_cpu.Pointer(IsShort()), num_words);
130 } else if constexpr (type == Type::Untracked) {
131 return std::span<u64>(untracked.Pointer(IsShort()), num_words);
132 }
133 }
134
135 template <Type type>
136 std::span<const u64> Span() const noexcept {
137 if constexpr (type == Type::CPU) {
138 return std::span<const u64>(cpu.Pointer(IsShort()), num_words);
139 } else if constexpr (type == Type::GPU) {
140 return std::span<const u64>(gpu.Pointer(IsShort()), num_words);
141 } else if constexpr (type == Type::CachedCPU) {
142 return std::span<const u64>(cached_cpu.Pointer(IsShort()), num_words);
143 } else if constexpr (type == Type::Untracked) {
144 return std::span<const u64>(untracked.Pointer(IsShort()), num_words);
145 }
146 }
147
113 u64 size_bytes = 0; 148 u64 size_bytes = 0;
149 size_t num_words = 0;
114 WordsArray<stack_words> cpu; 150 WordsArray<stack_words> cpu;
115 WordsArray<stack_words> gpu; 151 WordsArray<stack_words> gpu;
116 WordsArray<stack_words> cached_cpu; 152 WordsArray<stack_words> cached_cpu;
117 WordsArray<stack_words> untracked; 153 WordsArray<stack_words> untracked;
118}; 154};
119 155
120enum class Type {
121 CPU,
122 GPU,
123 CachedCPU,
124 Untracked,
125};
126
127template <class RasterizerInterface, size_t stack_words = 1> 156template <class RasterizerInterface, size_t stack_words = 1>
128class WordManager { 157class WordManager {
129public: 158public:
@@ -140,6 +169,69 @@ public:
140 return cpu_addr; 169 return cpu_addr;
141 } 170 }
142 171
172 static u64 ExtractBits(u64 word, size_t page_start, size_t page_end) {
173 constexpr size_t number_bits = sizeof(u64) * 8;
174 const size_t limit_page_end = number_bits - std::min(page_end, number_bits);
175 u64 bits = (word >> page_start) << page_start;
176 bits = (bits << limit_page_end) >> limit_page_end;
177 return bits;
178 }
179
180 static std::pair<size_t, size_t> GetWordPage(VAddr address) {
181 const size_t converted_address = static_cast<size_t>(address);
182 const size_t word_number = converted_address / BYTES_PER_WORD;
183 const size_t amount_pages = converted_address % BYTES_PER_WORD;
184 return std::make_pair(word_number, amount_pages / BYTES_PER_PAGE);
185 }
186
187 template <typename Func>
188 void IterateWords(size_t offset, size_t size, Func&& func) const {
189 using FuncReturn = std::invoke_result_t<Func, std::size_t, u64>;
190 static constexpr bool BOOL_BREAK = std::is_same_v<FuncReturn, bool>;
191 const size_t start = static_cast<size_t>(std::max<s64>(static_cast<s64>(offset), 0LL));
192 const size_t end = static_cast<size_t>(std::max<s64>(static_cast<s64>(offset + size), 0LL));
193 if (start >= SizeBytes() || end <= start) {
194 return;
195 }
196 auto [start_word, start_page] = GetWordPage(start);
197 auto [end_word, end_page] = GetWordPage(end + BYTES_PER_PAGE - 1ULL);
198 const size_t num_words = NumWords();
199 start_word = std::min(start_word, num_words);
200 end_word = std::min(end_word, num_words);
201 const size_t diff = end_word - start_word;
202 end_word += (end_page + PAGES_PER_WORD - 1ULL) / PAGES_PER_WORD;
203 end_word = std::min(end_word, num_words);
204 end_page += diff * PAGES_PER_WORD;
205 constexpr u64 base_mask{~0ULL};
206 for (size_t word_index = start_word; word_index < end_word; word_index++) {
207 const u64 mask = ExtractBits(base_mask, start_page, end_page);
208 start_page = 0;
209 end_page -= PAGES_PER_WORD;
210 if constexpr (BOOL_BREAK) {
211 if (func(word_index, mask)) {
212 return;
213 }
214 } else {
215 func(word_index, mask);
216 }
217 }
218 }
219
220 template <typename Func>
221 void IteratePages(u64 mask, Func&& func) const {
222 size_t offset = 0;
223 while (mask != 0) {
224 const size_t empty_bits = std::countr_zero(mask);
225 offset += empty_bits;
226 mask = mask >> empty_bits;
227
228 const size_t continuous_bits = std::countr_one(mask);
229 func(offset, continuous_bits);
230 mask = continuous_bits < PAGES_PER_WORD ? (mask >> continuous_bits) : 0;
231 offset += continuous_bits;
232 }
233 }
234
143 /** 235 /**
144 * Change the state of a range of pages 236 * Change the state of a range of pages
145 * 237 *
@@ -147,47 +239,33 @@ public:
147 * @param size Size in bytes to mark or unmark as modified 239 * @param size Size in bytes to mark or unmark as modified
148 */ 240 */
149 template <Type type, bool enable> 241 template <Type type, bool enable>
150 void ChangeRegionState(u64 dirty_addr, s64 size) noexcept(type == Type::GPU) { 242 void ChangeRegionState(u64 dirty_addr, u64 size) noexcept(type == Type::GPU) {
151 const s64 difference = dirty_addr - cpu_addr; 243 std::span<u64> state_words = words.template Span<type>();
152 const u64 offset = std::max<s64>(difference, 0); 244 [[maybe_unused]] std::span<u64> untracked_words = words.template Span<Type::Untracked>();
153 size += std::min<s64>(difference, 0); 245 [[maybe_unused]] std::span<u64> cached_words = words.template Span<Type::CachedCPU>();
154 if (offset >= SizeBytes() || size < 0) { 246 IterateWords(dirty_addr - cpu_addr, size, [&](size_t index, u64 mask) {
155 return;
156 }
157 u64* const untracked_words = Array<Type::Untracked>();
158 u64* const state_words = Array<type>();
159 const u64 offset_end = std::min(offset + size, SizeBytes());
160 const u64 begin_page_index = offset / BYTES_PER_PAGE;
161 const u64 begin_word_index = begin_page_index / PAGES_PER_WORD;
162 const u64 end_page_index = Common::DivCeil(offset_end, BYTES_PER_PAGE);
163 const u64 end_word_index = Common::DivCeil(end_page_index, PAGES_PER_WORD);
164 u64 page_index = begin_page_index % PAGES_PER_WORD;
165 u64 word_index = begin_word_index;
166 while (word_index < end_word_index) {
167 const u64 next_word_first_page = (word_index + 1) * PAGES_PER_WORD;
168 const u64 left_offset =
169 std::min(next_word_first_page - end_page_index, PAGES_PER_WORD) % PAGES_PER_WORD;
170 const u64 right_offset = page_index;
171 u64 bits = ~u64{0};
172 bits = (bits >> right_offset) << right_offset;
173 bits = (bits << left_offset) >> left_offset;
174 if constexpr (type == Type::CPU || type == Type::CachedCPU) { 247 if constexpr (type == Type::CPU || type == Type::CachedCPU) {
175 NotifyRasterizer<!enable>(word_index, untracked_words[word_index], bits); 248 NotifyRasterizer<!enable>(index, untracked_words[index], mask);
176 } 249 }
177 if constexpr (enable) { 250 if constexpr (enable) {
178 state_words[word_index] |= bits; 251 state_words[index] |= mask;
179 if constexpr (type == Type::CPU || type == Type::CachedCPU) { 252 if constexpr (type == Type::CPU || type == Type::CachedCPU) {
180 untracked_words[word_index] |= bits; 253 untracked_words[index] |= mask;
254 }
255 if constexpr (type == Type::CPU) {
256 cached_words[index] &= ~mask;
181 } 257 }
182 } else { 258 } else {
183 state_words[word_index] &= ~bits; 259 if constexpr (type == Type::CPU) {
260 const u64 word = state_words[index] & mask;
261 cached_words[index] &= ~word;
262 }
263 state_words[index] &= ~mask;
184 if constexpr (type == Type::CPU || type == Type::CachedCPU) { 264 if constexpr (type == Type::CPU || type == Type::CachedCPU) {
185 untracked_words[word_index] &= ~bits; 265 untracked_words[index] &= ~mask;
186 } 266 }
187 } 267 }
188 page_index = 0; 268 });
189 ++word_index;
190 }
191 } 269 }
192 270
193 /** 271 /**
@@ -198,119 +276,59 @@ public:
198 * @param size Size in bytes of the CPU range to loop over 276 * @param size Size in bytes of the CPU range to loop over
199 * @param func Function to call for each turned off region 277 * @param func Function to call for each turned off region
200 */ 278 */
201 template <Type type, typename Func> 279 template <Type type, bool clear, typename Func>
202 void ForEachModifiedRange(VAddr query_cpu_range, s64 size, bool clear, Func&& func) { 280 void ForEachModifiedRange(VAddr query_cpu_range, s64 size, Func&& func) {
203 static_assert(type != Type::Untracked); 281 static_assert(type != Type::Untracked);
204 282
205 const s64 difference = query_cpu_range - cpu_addr; 283 std::span<u64> state_words = words.template Span<type>();
206 const u64 query_begin = std::max<s64>(difference, 0); 284 [[maybe_unused]] std::span<u64> untracked_words = words.template Span<Type::Untracked>();
207 size += std::min<s64>(difference, 0); 285 [[maybe_unused]] std::span<u64> cached_words = words.template Span<Type::CachedCPU>();
208 if (query_begin >= SizeBytes() || size < 0) { 286 const size_t offset = query_cpu_range - cpu_addr;
209 return; 287 bool pending = false;
210 } 288 size_t pending_offset{};
211 [[maybe_unused]] u64* const untracked_words = Array<Type::Untracked>(); 289 size_t pending_pointer{};
212 [[maybe_unused]] u64* const cpu_words = Array<Type::CPU>(); 290 const auto release = [&]() {
213 u64* const state_words = Array<type>(); 291 func(cpu_addr + pending_offset * BYTES_PER_PAGE,
214 const u64 query_end = query_begin + std::min(static_cast<u64>(size), SizeBytes()); 292 (pending_pointer - pending_offset) * BYTES_PER_PAGE);
215 u64* const words_begin = state_words + query_begin / BYTES_PER_WORD; 293 };
216 u64* const words_end = state_words + Common::DivCeil(query_end, BYTES_PER_WORD); 294 IterateWords(offset, size, [&](size_t index, u64 mask) {
217 u64 first_page = (query_begin / BYTES_PER_PAGE) % PAGES_PER_WORD; 295 const u64 word = state_words[index] & mask;
218 296 if constexpr (clear) {
219 const auto modified = [](u64 word) { return word != 0; }; 297 if constexpr (type == Type::CPU || type == Type::CachedCPU) {
220 const auto first_modified_word = std::find_if(words_begin, words_end, modified); 298 NotifyRasterizer<true>(index, untracked_words[index], mask);
221 if (first_modified_word == words_end) {
222 // Exit early when the buffer is not modified
223 return;
224 }
225 if (first_modified_word != words_begin) {
226 first_page = 0;
227 }
228 std::reverse_iterator<u64*> first_word_reverse(first_modified_word);
229 std::reverse_iterator<u64*> last_word_iterator(words_end);
230 auto last_word_result = std::find_if(last_word_iterator, first_word_reverse, modified);
231 u64* const last_modified_word = &(*last_word_result) + 1;
232
233 const u64 word_index_begin = std::distance(state_words, first_modified_word);
234 const u64 word_index_end = std::distance(state_words, last_modified_word);
235 const unsigned local_page_begin = std::countr_zero(*first_modified_word);
236 const unsigned local_page_end =
237 static_cast<unsigned>(PAGES_PER_WORD) - std::countl_zero(last_modified_word[-1]);
238 const u64 word_page_begin = word_index_begin * PAGES_PER_WORD;
239 const u64 word_page_end = (word_index_end - 1) * PAGES_PER_WORD;
240 const u64 query_page_begin = query_begin / BYTES_PER_PAGE;
241 const u64 query_page_end = Common::DivCeil(query_end, BYTES_PER_PAGE);
242 const u64 page_index_begin = std::max(word_page_begin + local_page_begin, query_page_begin);
243 const u64 page_index_end = std::min(word_page_end + local_page_end, query_page_end);
244 const u64 first_word_page_begin = page_index_begin % PAGES_PER_WORD;
245 const u64 last_word_page_end = (page_index_end - 1) % PAGES_PER_WORD + 1;
246
247 u64 page_begin = std::max(first_word_page_begin, first_page);
248 u64 current_base = 0;
249 u64 current_size = 0;
250 bool on_going = false;
251 for (u64 word_index = word_index_begin; word_index < word_index_end; ++word_index) {
252 const bool is_last_word = word_index + 1 == word_index_end;
253 const u64 page_end = is_last_word ? last_word_page_end : PAGES_PER_WORD;
254 const u64 right_offset = page_begin;
255 const u64 left_offset = PAGES_PER_WORD - page_end;
256 u64 bits = ~u64{0};
257 bits = (bits >> right_offset) << right_offset;
258 bits = (bits << left_offset) >> left_offset;
259
260 const u64 current_word = state_words[word_index] & bits;
261 if (clear) {
262 state_words[word_index] &= ~bits;
263 }
264
265 if constexpr (type == Type::CachedCPU) {
266 NotifyRasterizer<false>(word_index, untracked_words[word_index], current_word);
267 untracked_words[word_index] |= current_word;
268 cpu_words[word_index] |= current_word;
269 }
270
271 if constexpr (type == Type::CPU) {
272 const u64 current_bits = untracked_words[word_index] & bits;
273 untracked_words[word_index] &= ~bits;
274 NotifyRasterizer<true>(word_index, current_bits, ~u64{0});
275 }
276 const u64 word = current_word & ~(type == Type::GPU ? untracked_words[word_index] : 0);
277 u64 page = page_begin;
278 page_begin = 0;
279
280 while (page < page_end) {
281 const int empty_bits = std::countr_zero(word >> page);
282 if (on_going && empty_bits != 0) {
283 InvokeModifiedRange(func, current_size, current_base);
284 current_size = 0;
285 on_going = false;
286 } 299 }
287 if (empty_bits == PAGES_PER_WORD) { 300 state_words[index] &= ~mask;
288 break; 301 if constexpr (type == Type::CPU || type == Type::CachedCPU) {
302 untracked_words[index] &= ~mask;
289 } 303 }
290 page += empty_bits; 304 if constexpr (type == Type::CPU) {
291 305 cached_words[index] &= ~word;
292 const int continuous_bits = std::countr_one(word >> page);
293 if (!on_going && continuous_bits != 0) {
294 current_base = word_index * PAGES_PER_WORD + page;
295 on_going = true;
296 } 306 }
297 current_size += continuous_bits;
298 page += continuous_bits;
299 } 307 }
300 } 308 const size_t base_offset = index * PAGES_PER_WORD;
301 if (on_going && current_size > 0) { 309 IteratePages(word, [&](size_t pages_offset, size_t pages_size) {
302 InvokeModifiedRange(func, current_size, current_base); 310 const auto reset = [&]() {
311 pending_offset = base_offset + pages_offset;
312 pending_pointer = base_offset + pages_offset + pages_size;
313 };
314 if (!pending) {
315 reset();
316 pending = true;
317 return;
318 }
319 if (pending_pointer == base_offset + pages_offset) {
320 pending_pointer += pages_size;
321 return;
322 }
323 release();
324 reset();
325 });
326 });
327 if (pending) {
328 release();
303 } 329 }
304 } 330 }
305 331
306 template <typename Func>
307 void InvokeModifiedRange(Func&& func, u64 current_size, u64 current_base) {
308 const u64 current_size_bytes = current_size * BYTES_PER_PAGE;
309 const u64 offset_begin = current_base * BYTES_PER_PAGE;
310 const u64 offset_end = std::min(offset_begin + current_size_bytes, SizeBytes());
311 func(cpu_addr + offset_begin, offset_end - offset_begin);
312 }
313
314 /** 332 /**
315 * Returns true when a region has been modified 333 * Returns true when a region has been modified
316 * 334 *
@@ -321,27 +339,17 @@ public:
321 [[nodiscard]] bool IsRegionModified(u64 offset, u64 size) const noexcept { 339 [[nodiscard]] bool IsRegionModified(u64 offset, u64 size) const noexcept {
322 static_assert(type != Type::Untracked); 340 static_assert(type != Type::Untracked);
323 341
324 const u64* const untracked_words = Array<Type::Untracked>(); 342 const std::span<const u64> state_words = words.template Span<type>();
325 const u64* const state_words = Array<type>(); 343 bool result = false;
326 const u64 num_query_words = size / BYTES_PER_WORD + 1; 344 IterateWords(offset, size, [&](size_t index, u64 mask) {
327 const u64 word_begin = offset / BYTES_PER_WORD; 345 const u64 word = state_words[index] & mask;
328 const u64 word_end = std::min(word_begin + num_query_words, NumWords()); 346 if (word != 0) {
329 const u64 page_limit = Common::DivCeil(offset + size, BYTES_PER_PAGE); 347 result = true;
330 u64 page_index = (offset / BYTES_PER_PAGE) % PAGES_PER_WORD;
331 for (u64 word_index = word_begin; word_index < word_end; ++word_index, page_index = 0) {
332 const u64 off_word = type == Type::GPU ? untracked_words[word_index] : 0;
333 const u64 word = state_words[word_index] & ~off_word;
334 if (word == 0) {
335 continue;
336 }
337 const u64 page_end = std::min((word_index + 1) * PAGES_PER_WORD, page_limit);
338 const u64 local_page_end = page_end % PAGES_PER_WORD;
339 const u64 page_end_shift = (PAGES_PER_WORD - local_page_end) % PAGES_PER_WORD;
340 if (((word >> page_index) << page_index) << page_end_shift != 0) {
341 return true; 348 return true;
342 } 349 }
343 } 350 return false;
344 return false; 351 });
352 return result;
345 } 353 }
346 354
347 /** 355 /**
@@ -353,34 +361,20 @@ public:
353 template <Type type> 361 template <Type type>
354 [[nodiscard]] std::pair<u64, u64> ModifiedRegion(u64 offset, u64 size) const noexcept { 362 [[nodiscard]] std::pair<u64, u64> ModifiedRegion(u64 offset, u64 size) const noexcept {
355 static_assert(type != Type::Untracked); 363 static_assert(type != Type::Untracked);
356 const u64* const state_words = Array<type>(); 364 const std::span<const u64> state_words = words.template Span<type>();
357 const u64 num_query_words = size / BYTES_PER_WORD + 1;
358 const u64 word_begin = offset / BYTES_PER_WORD;
359 const u64 word_end = std::min(word_begin + num_query_words, NumWords());
360 const u64 page_base = offset / BYTES_PER_PAGE;
361 u64 page_begin = page_base & (PAGES_PER_WORD - 1);
362 u64 page_end =
363 Common::DivCeil(offset + size, BYTES_PER_PAGE) - (page_base & ~(PAGES_PER_WORD - 1));
364 u64 begin = std::numeric_limits<u64>::max(); 365 u64 begin = std::numeric_limits<u64>::max();
365 u64 end = 0; 366 u64 end = 0;
366 for (u64 word_index = word_begin; word_index < word_end; ++word_index) { 367 IterateWords(offset, size, [&](size_t index, u64 mask) {
367 const u64 base_mask = (1ULL << page_begin) - 1ULL; 368 const u64 word = state_words[index] & mask;
368 const u64 end_mask = page_end >= PAGES_PER_WORD ? 0ULL : ~((1ULL << page_end) - 1ULL);
369 const u64 off_word = end_mask | base_mask;
370 const u64 word = state_words[word_index] & ~off_word;
371 if (word == 0) { 369 if (word == 0) {
372 page_begin = 0; 370 return;
373 page_end -= PAGES_PER_WORD;
374 continue;
375 } 371 }
376 const u64 local_page_begin = std::countr_zero(word); 372 const u64 local_page_begin = std::countr_zero(word);
377 const u64 local_page_end = PAGES_PER_WORD - std::countl_zero(word); 373 const u64 local_page_end = PAGES_PER_WORD - std::countl_zero(word);
378 const u64 page_index = word_index * PAGES_PER_WORD; 374 const u64 page_index = index * PAGES_PER_WORD;
379 begin = std::min(begin, page_index + local_page_begin); 375 begin = std::min(begin, page_index + local_page_begin);
380 end = page_index + local_page_end; 376 end = page_index + local_page_end;
381 page_begin = 0; 377 });
382 page_end -= PAGES_PER_WORD;
383 }
384 static constexpr std::pair<u64, u64> EMPTY{0, 0}; 378 static constexpr std::pair<u64, u64> EMPTY{0, 0};
385 return begin < end ? std::make_pair(begin * BYTES_PER_PAGE, end * BYTES_PER_PAGE) : EMPTY; 379 return begin < end ? std::make_pair(begin * BYTES_PER_PAGE, end * BYTES_PER_PAGE) : EMPTY;
386 } 380 }
@@ -454,18 +448,10 @@ private:
454 void NotifyRasterizer(u64 word_index, u64 current_bits, u64 new_bits) const { 448 void NotifyRasterizer(u64 word_index, u64 current_bits, u64 new_bits) const {
455 u64 changed_bits = (add_to_rasterizer ? current_bits : ~current_bits) & new_bits; 449 u64 changed_bits = (add_to_rasterizer ? current_bits : ~current_bits) & new_bits;
456 VAddr addr = cpu_addr + word_index * BYTES_PER_WORD; 450 VAddr addr = cpu_addr + word_index * BYTES_PER_WORD;
457 while (changed_bits != 0) { 451 IteratePages(changed_bits, [&](size_t offset, size_t size) {
458 const int empty_bits = std::countr_zero(changed_bits); 452 rasterizer->UpdatePagesCachedCount(addr + offset * BYTES_PER_PAGE,
459 addr += empty_bits * BYTES_PER_PAGE; 453 size * BYTES_PER_PAGE, add_to_rasterizer ? 1 : -1);
460 changed_bits >>= empty_bits; 454 });
461
462 const u32 continuous_bits = std::countr_one(changed_bits);
463 const u64 size = continuous_bits * BYTES_PER_PAGE;
464 const VAddr begin_addr = addr;
465 addr += size;
466 changed_bits = continuous_bits < PAGES_PER_WORD ? (changed_bits >> continuous_bits) : 0;
467 rasterizer->UpdatePagesCachedCount(begin_addr, size, add_to_rasterizer ? 1 : -1);
468 }
469 } 455 }
470 456
471 VAddr cpu_addr = 0; 457 VAddr cpu_addr = 0;