diff options
| author | 2023-04-28 23:54:54 +0200 | |
|---|---|---|
| committer | 2023-05-01 11:43:26 +0200 | |
| commit | d6f565e5da22ec6a6a77ffabd88e59f3a25bcc96 (patch) | |
| tree | 4abb992e6ae3dc1f9b4614b3d78a0d43a0e74e39 | |
| parent | Buffer Cache: Release staging buffers on tick frame (diff) | |
| download | yuzu-d6f565e5da22ec6a6a77ffabd88e59f3a25bcc96.tar.gz yuzu-d6f565e5da22ec6a6a77ffabd88e59f3a25bcc96.tar.xz yuzu-d6f565e5da22ec6a6a77ffabd88e59f3a25bcc96.zip | |
BufferCache: Fixes and address feedback
Diffstat (limited to '')
| -rw-r--r-- | src/tests/video_core/memory_tracker.cpp | 4 | ||||
| -rw-r--r-- | src/video_core/buffer_cache/buffer_base.h | 99 | ||||
| -rw-r--r-- | src/video_core/buffer_cache/buffer_cache.h | 42 | ||||
| -rw-r--r-- | src/video_core/buffer_cache/buffer_cache_base.h | 19 | ||||
| -rw-r--r-- | src/video_core/buffer_cache/memory_tracker_base.h | 17 | ||||
| -rw-r--r-- | src/video_core/buffer_cache/word_manager.h | 384 |
6 files changed, 243 insertions, 322 deletions
diff --git a/src/tests/video_core/memory_tracker.cpp b/src/tests/video_core/memory_tracker.cpp index 77d391f15..3981907a2 100644 --- a/src/tests/video_core/memory_tracker.cpp +++ b/src/tests/video_core/memory_tracker.cpp | |||
| @@ -427,7 +427,7 @@ TEST_CASE("MemoryTracker: Single page in large region", "[video_core]") { | |||
| 427 | 427 | ||
| 428 | memory_track->MarkRegionAsCpuModified(c + WORD * 12 + PAGE * 8, PAGE); | 428 | memory_track->MarkRegionAsCpuModified(c + WORD * 12 + PAGE * 8, PAGE); |
| 429 | REQUIRE(memory_track->IsRegionCpuModified(c, WORD * 16)); | 429 | REQUIRE(memory_track->IsRegionCpuModified(c, WORD * 16)); |
| 430 | REQUIRE(memory_track->IsRegionCpuModified(c + WORD * 10, WORD * 2)); | 430 | REQUIRE(!memory_track->IsRegionCpuModified(c + WORD * 10, WORD * 2)); |
| 431 | REQUIRE(memory_track->IsRegionCpuModified(c + WORD * 11, WORD * 2)); | 431 | REQUIRE(memory_track->IsRegionCpuModified(c + WORD * 11, WORD * 2)); |
| 432 | REQUIRE(memory_track->IsRegionCpuModified(c + WORD * 12, WORD * 2)); | 432 | REQUIRE(memory_track->IsRegionCpuModified(c + WORD * 12, WORD * 2)); |
| 433 | REQUIRE(memory_track->IsRegionCpuModified(c + WORD * 12 + PAGE * 4, PAGE * 8)); | 433 | REQUIRE(memory_track->IsRegionCpuModified(c + WORD * 12 + PAGE * 4, PAGE * 8)); |
| @@ -535,6 +535,8 @@ TEST_CASE("MemoryTracker: Cached write downloads") { | |||
| 535 | memory_track->MarkRegionAsGpuModified(c + PAGE, PAGE); | 535 | memory_track->MarkRegionAsGpuModified(c + PAGE, PAGE); |
| 536 | int num = 0; | 536 | int num = 0; |
| 537 | memory_track->ForEachDownloadRangeAndClear(c, WORD, [&](u64 offset, u64 size) { ++num; }); | 537 | memory_track->ForEachDownloadRangeAndClear(c, WORD, [&](u64 offset, u64 size) { ++num; }); |
| 538 | REQUIRE(num == 1); | ||
| 539 | num = 0; | ||
| 538 | memory_track->ForEachUploadRange(c, WORD, [&](u64 offset, u64 size) { ++num; }); | 540 | memory_track->ForEachUploadRange(c, WORD, [&](u64 offset, u64 size) { ++num; }); |
| 539 | REQUIRE(num == 0); | 541 | REQUIRE(num == 0); |
| 540 | REQUIRE(!memory_track->IsRegionCpuModified(c + PAGE, PAGE)); | 542 | REQUIRE(!memory_track->IsRegionCpuModified(c + PAGE, PAGE)); |
diff --git a/src/video_core/buffer_cache/buffer_base.h b/src/video_core/buffer_cache/buffer_base.h index 095f79387..9cbd95c4b 100644 --- a/src/video_core/buffer_cache/buffer_base.h +++ b/src/video_core/buffer_cache/buffer_base.h | |||
| @@ -38,10 +38,8 @@ public: | |||
| 38 | static constexpr u64 BASE_PAGE_BITS = 16; | 38 | static constexpr u64 BASE_PAGE_BITS = 16; |
| 39 | static constexpr u64 BASE_PAGE_SIZE = 1ULL << BASE_PAGE_BITS; | 39 | static constexpr u64 BASE_PAGE_SIZE = 1ULL << BASE_PAGE_BITS; |
| 40 | 40 | ||
| 41 | explicit BufferBase(RasterizerInterface& rasterizer_, VAddr cpu_addr_, u64 size_bytes) | 41 | explicit BufferBase(RasterizerInterface& rasterizer_, VAddr cpu_addr_, u64 size_bytes_) |
| 42 | : cpu_addr{Common::AlignDown(cpu_addr_, BASE_PAGE_SIZE)}, | 42 | : cpu_addr{cpu_addr_}, size_bytes{size_bytes_} {} |
| 43 | word_manager(cpu_addr, rasterizer_, | ||
| 44 | Common::AlignUp(size_bytes + (cpu_addr_ - cpu_addr), BASE_PAGE_SIZE)) {} | ||
| 45 | 43 | ||
| 46 | explicit BufferBase(NullBufferParams) {} | 44 | explicit BufferBase(NullBufferParams) {} |
| 47 | 45 | ||
| @@ -51,88 +49,6 @@ public: | |||
| 51 | BufferBase& operator=(BufferBase&&) = default; | 49 | BufferBase& operator=(BufferBase&&) = default; |
| 52 | BufferBase(BufferBase&&) = default; | 50 | BufferBase(BufferBase&&) = default; |
| 53 | 51 | ||
| 54 | /// Returns the inclusive CPU modified range in a begin end pair | ||
| 55 | [[nodiscard]] std::pair<u64, u64> ModifiedCpuRegion(VAddr query_cpu_addr, | ||
| 56 | u64 query_size) const noexcept { | ||
| 57 | const u64 offset = query_cpu_addr - cpu_addr; | ||
| 58 | return word_manager.template ModifiedRegion<Type::CPU>(offset, query_size); | ||
| 59 | } | ||
| 60 | |||
| 61 | /// Returns the inclusive GPU modified range in a begin end pair | ||
| 62 | [[nodiscard]] std::pair<u64, u64> ModifiedGpuRegion(VAddr query_cpu_addr, | ||
| 63 | u64 query_size) const noexcept { | ||
| 64 | const u64 offset = query_cpu_addr - cpu_addr; | ||
| 65 | return word_manager.template ModifiedRegion<Type::GPU>(offset, query_size); | ||
| 66 | } | ||
| 67 | |||
| 68 | /// Returns true if a region has been modified from the CPU | ||
| 69 | [[nodiscard]] bool IsRegionCpuModified(VAddr query_cpu_addr, u64 query_size) const noexcept { | ||
| 70 | const u64 offset = query_cpu_addr - cpu_addr; | ||
| 71 | return word_manager.template IsRegionModified<Type::CPU>(offset, query_size); | ||
| 72 | } | ||
| 73 | |||
| 74 | /// Returns true if a region has been modified from the GPU | ||
| 75 | [[nodiscard]] bool IsRegionGpuModified(VAddr query_cpu_addr, u64 query_size) const noexcept { | ||
| 76 | const u64 offset = query_cpu_addr - cpu_addr; | ||
| 77 | return word_manager.template IsRegionModified<Type::GPU>(offset, query_size); | ||
| 78 | } | ||
| 79 | |||
| 80 | /// Mark region as CPU modified, notifying the rasterizer about this change | ||
| 81 | void MarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 size) { | ||
| 82 | word_manager.template ChangeRegionState<Type::CPU, true>(dirty_cpu_addr, size); | ||
| 83 | } | ||
| 84 | |||
| 85 | /// Unmark region as CPU modified, notifying the rasterizer about this change | ||
| 86 | void UnmarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 size) { | ||
| 87 | word_manager.template ChangeRegionState<Type::CPU, false>(dirty_cpu_addr, size); | ||
| 88 | } | ||
| 89 | |||
| 90 | /// Mark region as modified from the host GPU | ||
| 91 | void MarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 size) noexcept { | ||
| 92 | word_manager.template ChangeRegionState<Type::GPU, true>(dirty_cpu_addr, size); | ||
| 93 | } | ||
| 94 | |||
| 95 | /// Unmark region as modified from the host GPU | ||
| 96 | void UnmarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 size) noexcept { | ||
| 97 | word_manager.template ChangeRegionState<Type::GPU, false>(dirty_cpu_addr, size); | ||
| 98 | } | ||
| 99 | |||
| 100 | /// Mark region as modified from the CPU | ||
| 101 | /// but don't mark it as modified until FlusHCachedWrites is called. | ||
| 102 | void CachedCpuWrite(VAddr dirty_cpu_addr, u64 size) { | ||
| 103 | flags |= BufferFlagBits::CachedWrites; | ||
| 104 | word_manager.template ChangeRegionState<Type::CachedCPU, true>(dirty_cpu_addr, size); | ||
| 105 | } | ||
| 106 | |||
| 107 | /// Flushes cached CPU writes, and notify the rasterizer about the deltas | ||
| 108 | void FlushCachedWrites() noexcept { | ||
| 109 | flags &= ~BufferFlagBits::CachedWrites; | ||
| 110 | word_manager.FlushCachedWrites(); | ||
| 111 | } | ||
| 112 | |||
| 113 | /// Call 'func' for each CPU modified range and unmark those pages as CPU modified | ||
| 114 | template <typename Func> | ||
| 115 | void ForEachUploadRange(VAddr query_cpu_range, u64 size, Func&& func) { | ||
| 116 | word_manager.template ForEachModifiedRange<Type::CPU>(query_cpu_range, size, true, func); | ||
| 117 | } | ||
| 118 | |||
| 119 | /// Call 'func' for each GPU modified range and unmark those pages as GPU modified | ||
| 120 | template <typename Func> | ||
| 121 | void ForEachDownloadRange(VAddr query_cpu_range, u64 size, bool clear, Func&& func) { | ||
| 122 | word_manager.template ForEachModifiedRange<Type::GPU>(query_cpu_range, size, clear, func); | ||
| 123 | } | ||
| 124 | |||
| 125 | template <typename Func> | ||
| 126 | void ForEachDownloadRangeAndClear(VAddr query_cpu_range, u64 size, Func&& func) { | ||
| 127 | word_manager.template ForEachModifiedRange<Type::GPU>(query_cpu_range, size, true, func); | ||
| 128 | } | ||
| 129 | |||
| 130 | /// Call 'func' for each GPU modified range and unmark those pages as GPU modified | ||
| 131 | template <typename Func> | ||
| 132 | void ForEachDownloadRange(Func&& func) { | ||
| 133 | word_manager.template ForEachModifiedRange<Type::GPU>(cpu_addr, SizeBytes(), true, func); | ||
| 134 | } | ||
| 135 | |||
| 136 | /// Mark buffer as picked | 52 | /// Mark buffer as picked |
| 137 | void Pick() noexcept { | 53 | void Pick() noexcept { |
| 138 | flags |= BufferFlagBits::Picked; | 54 | flags |= BufferFlagBits::Picked; |
| @@ -179,11 +95,6 @@ public: | |||
| 179 | return static_cast<u32>(other_cpu_addr - cpu_addr); | 95 | return static_cast<u32>(other_cpu_addr - cpu_addr); |
| 180 | } | 96 | } |
| 181 | 97 | ||
| 182 | /// Returns the size in bytes of the buffer | ||
| 183 | [[nodiscard]] u64 SizeBytes() const noexcept { | ||
| 184 | return word_manager.SizeBytes(); | ||
| 185 | } | ||
| 186 | |||
| 187 | size_t getLRUID() const noexcept { | 98 | size_t getLRUID() const noexcept { |
| 188 | return lru_id; | 99 | return lru_id; |
| 189 | } | 100 | } |
| @@ -192,12 +103,16 @@ public: | |||
| 192 | lru_id = lru_id_; | 103 | lru_id = lru_id_; |
| 193 | } | 104 | } |
| 194 | 105 | ||
| 106 | size_t SizeBytes() const { | ||
| 107 | return size_bytes; | ||
| 108 | } | ||
| 109 | |||
| 195 | private: | 110 | private: |
| 196 | VAddr cpu_addr = 0; | 111 | VAddr cpu_addr = 0; |
| 197 | WordManager<RasterizerInterface> word_manager; | ||
| 198 | BufferFlagBits flags{}; | 112 | BufferFlagBits flags{}; |
| 199 | int stream_score = 0; | 113 | int stream_score = 0; |
| 200 | size_t lru_id = SIZE_MAX; | 114 | size_t lru_id = SIZE_MAX; |
| 115 | size_t size_bytes = 0; | ||
| 201 | }; | 116 | }; |
| 202 | 117 | ||
| 203 | } // namespace VideoCommon | 118 | } // namespace VideoCommon |
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index e5c626c36..7975564b5 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h | |||
| @@ -21,6 +21,7 @@ BufferCache<P>::BufferCache(VideoCore::RasterizerInterface& rasterizer_, | |||
| 21 | // Ensure the first slot is used for the null buffer | 21 | // Ensure the first slot is used for the null buffer |
| 22 | void(slot_buffers.insert(runtime, NullBufferParams{})); | 22 | void(slot_buffers.insert(runtime, NullBufferParams{})); |
| 23 | common_ranges.clear(); | 23 | common_ranges.clear(); |
| 24 | inline_buffer_id = NULL_BUFFER_ID; | ||
| 24 | 25 | ||
| 25 | active_async_buffers = !Settings::IsGPULevelHigh(); | 26 | active_async_buffers = !Settings::IsGPULevelHigh(); |
| 26 | 27 | ||
| @@ -442,9 +443,6 @@ template <class P> | |||
| 442 | void BufferCache<P>::FlushCachedWrites() { | 443 | void BufferCache<P>::FlushCachedWrites() { |
| 443 | cached_write_buffer_ids.clear(); | 444 | cached_write_buffer_ids.clear(); |
| 444 | memory_tracker.FlushCachedWrites(); | 445 | memory_tracker.FlushCachedWrites(); |
| 445 | for (auto& interval : cached_ranges) { | ||
| 446 | ClearDownload(interval); | ||
| 447 | } | ||
| 448 | cached_ranges.clear(); | 446 | cached_ranges.clear(); |
| 449 | } | 447 | } |
| 450 | 448 | ||
| @@ -659,8 +657,8 @@ bool BufferCache<P>::IsRegionGpuModified(VAddr addr, size_t size) { | |||
| 659 | template <class P> | 657 | template <class P> |
| 660 | bool BufferCache<P>::IsRegionRegistered(VAddr addr, size_t size) { | 658 | bool BufferCache<P>::IsRegionRegistered(VAddr addr, size_t size) { |
| 661 | const VAddr end_addr = addr + size; | 659 | const VAddr end_addr = addr + size; |
| 662 | const u64 page_end = Common::DivCeil(end_addr, PAGE_SIZE); | 660 | const u64 page_end = Common::DivCeil(end_addr, CACHING_PAGESIZE); |
| 663 | for (u64 page = addr >> PAGE_BITS; page < page_end;) { | 661 | for (u64 page = addr >> CACHING_PAGEBITS; page < page_end;) { |
| 664 | const BufferId buffer_id = page_table[page]; | 662 | const BufferId buffer_id = page_table[page]; |
| 665 | if (!buffer_id) { | 663 | if (!buffer_id) { |
| 666 | ++page; | 664 | ++page; |
| @@ -672,7 +670,7 @@ bool BufferCache<P>::IsRegionRegistered(VAddr addr, size_t size) { | |||
| 672 | if (buf_start_addr < end_addr && addr < buf_end_addr) { | 670 | if (buf_start_addr < end_addr && addr < buf_end_addr) { |
| 673 | return true; | 671 | return true; |
| 674 | } | 672 | } |
| 675 | page = Common::DivCeil(end_addr, PAGE_SIZE); | 673 | page = Common::DivCeil(end_addr, CACHING_PAGESIZE); |
| 676 | } | 674 | } |
| 677 | return false; | 675 | return false; |
| 678 | } | 676 | } |
| @@ -689,7 +687,7 @@ void BufferCache<P>::BindHostIndexBuffer() { | |||
| 689 | const u32 offset = buffer.Offset(index_buffer.cpu_addr); | 687 | const u32 offset = buffer.Offset(index_buffer.cpu_addr); |
| 690 | const u32 size = index_buffer.size; | 688 | const u32 size = index_buffer.size; |
| 691 | const auto& draw_state = maxwell3d->draw_manager->GetDrawState(); | 689 | const auto& draw_state = maxwell3d->draw_manager->GetDrawState(); |
| 692 | if (!draw_state.inline_index_draw_indexes.empty()) { | 690 | if (!draw_state.inline_index_draw_indexes.empty()) [[unlikely]] { |
| 693 | if constexpr (USE_MEMORY_MAPS) { | 691 | if constexpr (USE_MEMORY_MAPS) { |
| 694 | auto upload_staging = runtime.UploadStagingBuffer(size); | 692 | auto upload_staging = runtime.UploadStagingBuffer(size); |
| 695 | std::array<BufferCopy, 1> copies{ | 693 | std::array<BufferCopy, 1> copies{ |
| @@ -1001,12 +999,20 @@ void BufferCache<P>::UpdateIndexBuffer() { | |||
| 1001 | return; | 999 | return; |
| 1002 | } | 1000 | } |
| 1003 | flags[Dirty::IndexBuffer] = false; | 1001 | flags[Dirty::IndexBuffer] = false; |
| 1004 | if (!draw_state.inline_index_draw_indexes.empty()) { | 1002 | if (!draw_state.inline_index_draw_indexes.empty()) [[unlikely]] { |
| 1005 | auto inline_index_size = static_cast<u32>(draw_state.inline_index_draw_indexes.size()); | 1003 | auto inline_index_size = static_cast<u32>(draw_state.inline_index_draw_indexes.size()); |
| 1004 | u32 buffer_size = Common::AlignUp(inline_index_size, CACHING_PAGESIZE); | ||
| 1005 | if (inline_buffer_id == NULL_BUFFER_ID) [[unlikely]] { | ||
| 1006 | inline_buffer_id = CreateBuffer(0, buffer_size); | ||
| 1007 | } | ||
| 1008 | if (slot_buffers[inline_buffer_id].SizeBytes() < buffer_size) [[unlikely]] { | ||
| 1009 | slot_buffers.erase(inline_buffer_id); | ||
| 1010 | inline_buffer_id = CreateBuffer(0, buffer_size); | ||
| 1011 | } | ||
| 1006 | index_buffer = Binding{ | 1012 | index_buffer = Binding{ |
| 1007 | .cpu_addr = 0, | 1013 | .cpu_addr = 0, |
| 1008 | .size = inline_index_size, | 1014 | .size = inline_index_size, |
| 1009 | .buffer_id = FindBuffer(0, inline_index_size), | 1015 | .buffer_id = inline_buffer_id, |
| 1010 | }; | 1016 | }; |
| 1011 | return; | 1017 | return; |
| 1012 | } | 1018 | } |
| @@ -1224,7 +1230,7 @@ BufferId BufferCache<P>::FindBuffer(VAddr cpu_addr, u32 size) { | |||
| 1224 | if (cpu_addr == 0) { | 1230 | if (cpu_addr == 0) { |
| 1225 | return NULL_BUFFER_ID; | 1231 | return NULL_BUFFER_ID; |
| 1226 | } | 1232 | } |
| 1227 | const u64 page = cpu_addr >> PAGE_BITS; | 1233 | const u64 page = cpu_addr >> CACHING_PAGEBITS; |
| 1228 | const BufferId buffer_id = page_table[page]; | 1234 | const BufferId buffer_id = page_table[page]; |
| 1229 | if (!buffer_id) { | 1235 | if (!buffer_id) { |
| 1230 | return CreateBuffer(cpu_addr, size); | 1236 | return CreateBuffer(cpu_addr, size); |
| @@ -1253,8 +1259,9 @@ typename BufferCache<P>::OverlapResult BufferCache<P>::ResolveOverlaps(VAddr cpu | |||
| 1253 | .has_stream_leap = has_stream_leap, | 1259 | .has_stream_leap = has_stream_leap, |
| 1254 | }; | 1260 | }; |
| 1255 | } | 1261 | } |
| 1256 | for (; cpu_addr >> PAGE_BITS < Common::DivCeil(end, PAGE_SIZE); cpu_addr += PAGE_SIZE) { | 1262 | for (; cpu_addr >> CACHING_PAGEBITS < Common::DivCeil(end, CACHING_PAGESIZE); |
| 1257 | const BufferId overlap_id = page_table[cpu_addr >> PAGE_BITS]; | 1263 | cpu_addr += CACHING_PAGESIZE) { |
| 1264 | const BufferId overlap_id = page_table[cpu_addr >> CACHING_PAGEBITS]; | ||
| 1258 | if (!overlap_id) { | 1265 | if (!overlap_id) { |
| 1259 | continue; | 1266 | continue; |
| 1260 | } | 1267 | } |
| @@ -1280,11 +1287,11 @@ typename BufferCache<P>::OverlapResult BufferCache<P>::ResolveOverlaps(VAddr cpu | |||
| 1280 | // as a stream buffer. Increase the size to skip constantly recreating buffers. | 1287 | // as a stream buffer. Increase the size to skip constantly recreating buffers. |
| 1281 | has_stream_leap = true; | 1288 | has_stream_leap = true; |
| 1282 | if (expands_right) { | 1289 | if (expands_right) { |
| 1283 | begin -= PAGE_SIZE * 256; | 1290 | begin -= CACHING_PAGESIZE * 256; |
| 1284 | cpu_addr = begin; | 1291 | cpu_addr = begin; |
| 1285 | } | 1292 | } |
| 1286 | if (expands_left) { | 1293 | if (expands_left) { |
| 1287 | end += PAGE_SIZE * 256; | 1294 | end += CACHING_PAGESIZE * 256; |
| 1288 | } | 1295 | } |
| 1289 | } | 1296 | } |
| 1290 | } | 1297 | } |
| @@ -1317,6 +1324,9 @@ void BufferCache<P>::JoinOverlap(BufferId new_buffer_id, BufferId overlap_id, | |||
| 1317 | 1324 | ||
| 1318 | template <class P> | 1325 | template <class P> |
| 1319 | BufferId BufferCache<P>::CreateBuffer(VAddr cpu_addr, u32 wanted_size) { | 1326 | BufferId BufferCache<P>::CreateBuffer(VAddr cpu_addr, u32 wanted_size) { |
| 1327 | VAddr cpu_addr_end = Common::AlignUp(cpu_addr + wanted_size, CACHING_PAGESIZE); | ||
| 1328 | cpu_addr = Common::AlignDown(cpu_addr, CACHING_PAGESIZE); | ||
| 1329 | wanted_size = static_cast<u32>(cpu_addr_end - cpu_addr); | ||
| 1320 | const OverlapResult overlap = ResolveOverlaps(cpu_addr, wanted_size); | 1330 | const OverlapResult overlap = ResolveOverlaps(cpu_addr, wanted_size); |
| 1321 | const u32 size = static_cast<u32>(overlap.end - overlap.begin); | 1331 | const u32 size = static_cast<u32>(overlap.end - overlap.begin); |
| 1322 | const BufferId new_buffer_id = slot_buffers.insert(runtime, rasterizer, overlap.begin, size); | 1332 | const BufferId new_buffer_id = slot_buffers.insert(runtime, rasterizer, overlap.begin, size); |
| @@ -1354,8 +1364,8 @@ void BufferCache<P>::ChangeRegister(BufferId buffer_id) { | |||
| 1354 | } | 1364 | } |
| 1355 | const VAddr cpu_addr_begin = buffer.CpuAddr(); | 1365 | const VAddr cpu_addr_begin = buffer.CpuAddr(); |
| 1356 | const VAddr cpu_addr_end = cpu_addr_begin + size; | 1366 | const VAddr cpu_addr_end = cpu_addr_begin + size; |
| 1357 | const u64 page_begin = cpu_addr_begin / PAGE_SIZE; | 1367 | const u64 page_begin = cpu_addr_begin / CACHING_PAGESIZE; |
| 1358 | const u64 page_end = Common::DivCeil(cpu_addr_end, PAGE_SIZE); | 1368 | const u64 page_end = Common::DivCeil(cpu_addr_end, CACHING_PAGESIZE); |
| 1359 | for (u64 page = page_begin; page != page_end; ++page) { | 1369 | for (u64 page = page_begin; page != page_end; ++page) { |
| 1360 | if constexpr (insert) { | 1370 | if constexpr (insert) { |
| 1361 | page_table[page] = buffer_id; | 1371 | page_table[page] = buffer_id; |
diff --git a/src/video_core/buffer_cache/buffer_cache_base.h b/src/video_core/buffer_cache/buffer_cache_base.h index 75cb98ba3..656baa550 100644 --- a/src/video_core/buffer_cache/buffer_cache_base.h +++ b/src/video_core/buffer_cache/buffer_cache_base.h | |||
| @@ -90,10 +90,8 @@ template <typename P> | |||
| 90 | class BufferCache : public VideoCommon::ChannelSetupCaches<VideoCommon::ChannelInfo> { | 90 | class BufferCache : public VideoCommon::ChannelSetupCaches<VideoCommon::ChannelInfo> { |
| 91 | // Page size for caching purposes. | 91 | // Page size for caching purposes. |
| 92 | // This is unrelated to the CPU page size and it can be changed as it seems optimal. | 92 | // This is unrelated to the CPU page size and it can be changed as it seems optimal. |
| 93 | static constexpr u32 PAGE_BITS = 16; | 93 | static constexpr u32 CACHING_PAGEBITS = 16; |
| 94 | static constexpr u64 PAGE_SIZE = u64{1} << PAGE_BITS; | 94 | static constexpr u64 CACHING_PAGESIZE = u64{1} << CACHING_PAGEBITS; |
| 95 | static constexpr u32 CPU_PAGE_BITS = 12; | ||
| 96 | static constexpr u64 CPU_PAGE_SIZE = u64{1} << CPU_PAGE_BITS; | ||
| 97 | 95 | ||
| 98 | static constexpr bool IS_OPENGL = P::IS_OPENGL; | 96 | static constexpr bool IS_OPENGL = P::IS_OPENGL; |
| 99 | static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS = | 97 | static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS = |
| @@ -112,6 +110,10 @@ class BufferCache : public VideoCommon::ChannelSetupCaches<VideoCommon::ChannelI | |||
| 112 | static constexpr s64 DEFAULT_CRITICAL_MEMORY = 1_GiB; | 110 | static constexpr s64 DEFAULT_CRITICAL_MEMORY = 1_GiB; |
| 113 | static constexpr s64 TARGET_THRESHOLD = 4_GiB; | 111 | static constexpr s64 TARGET_THRESHOLD = 4_GiB; |
| 114 | 112 | ||
| 113 | // Debug Flags. | ||
| 114 | |||
| 115 | static constexpr bool DISABLE_DOWNLOADS = true; | ||
| 116 | |||
| 115 | using Maxwell = Tegra::Engines::Maxwell3D::Regs; | 117 | using Maxwell = Tegra::Engines::Maxwell3D::Regs; |
| 116 | 118 | ||
| 117 | using Runtime = typename P::Runtime; | 119 | using Runtime = typename P::Runtime; |
| @@ -286,8 +288,8 @@ private: | |||
| 286 | 288 | ||
| 287 | template <typename Func> | 289 | template <typename Func> |
| 288 | void ForEachBufferInRange(VAddr cpu_addr, u64 size, Func&& func) { | 290 | void ForEachBufferInRange(VAddr cpu_addr, u64 size, Func&& func) { |
| 289 | const u64 page_end = Common::DivCeil(cpu_addr + size, PAGE_SIZE); | 291 | const u64 page_end = Common::DivCeil(cpu_addr + size, CACHING_PAGESIZE); |
| 290 | for (u64 page = cpu_addr >> PAGE_BITS; page < page_end;) { | 292 | for (u64 page = cpu_addr >> CACHING_PAGEBITS; page < page_end;) { |
| 291 | const BufferId buffer_id = page_table[page]; | 293 | const BufferId buffer_id = page_table[page]; |
| 292 | if (!buffer_id) { | 294 | if (!buffer_id) { |
| 293 | ++page; | 295 | ++page; |
| @@ -297,7 +299,7 @@ private: | |||
| 297 | func(buffer_id, buffer); | 299 | func(buffer_id, buffer); |
| 298 | 300 | ||
| 299 | const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes(); | 301 | const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes(); |
| 300 | page = Common::DivCeil(end_addr, PAGE_SIZE); | 302 | page = Common::DivCeil(end_addr, CACHING_PAGESIZE); |
| 301 | } | 303 | } |
| 302 | } | 304 | } |
| 303 | 305 | ||
| @@ -568,10 +570,11 @@ private: | |||
| 568 | u64 total_used_memory = 0; | 570 | u64 total_used_memory = 0; |
| 569 | u64 minimum_memory = 0; | 571 | u64 minimum_memory = 0; |
| 570 | u64 critical_memory = 0; | 572 | u64 critical_memory = 0; |
| 573 | BufferId inline_buffer_id; | ||
| 571 | 574 | ||
| 572 | bool active_async_buffers = false; | 575 | bool active_async_buffers = false; |
| 573 | 576 | ||
| 574 | std::array<BufferId, ((1ULL << 39) >> PAGE_BITS)> page_table; | 577 | std::array<BufferId, ((1ULL << 39) >> CACHING_PAGEBITS)> page_table; |
| 575 | }; | 578 | }; |
| 576 | 579 | ||
| 577 | } // namespace VideoCommon | 580 | } // namespace VideoCommon |
diff --git a/src/video_core/buffer_cache/memory_tracker_base.h b/src/video_core/buffer_cache/memory_tracker_base.h index 016d8430f..4bc59017f 100644 --- a/src/video_core/buffer_cache/memory_tracker_base.h +++ b/src/video_core/buffer_cache/memory_tracker_base.h | |||
| @@ -132,8 +132,8 @@ public: | |||
| 132 | void ForEachUploadRange(VAddr query_cpu_range, u64 query_size, Func&& func) { | 132 | void ForEachUploadRange(VAddr query_cpu_range, u64 query_size, Func&& func) { |
| 133 | IteratePages<true>(query_cpu_range, query_size, | 133 | IteratePages<true>(query_cpu_range, query_size, |
| 134 | [&func](Manager* manager, u64 offset, size_t size) { | 134 | [&func](Manager* manager, u64 offset, size_t size) { |
| 135 | manager->template ForEachModifiedRange<Type::CPU>( | 135 | manager->template ForEachModifiedRange<Type::CPU, true>( |
| 136 | manager->GetCpuAddr() + offset, size, true, func); | 136 | manager->GetCpuAddr() + offset, size, func); |
| 137 | }); | 137 | }); |
| 138 | } | 138 | } |
| 139 | 139 | ||
| @@ -142,8 +142,13 @@ public: | |||
| 142 | void ForEachDownloadRange(VAddr query_cpu_range, u64 query_size, bool clear, Func&& func) { | 142 | void ForEachDownloadRange(VAddr query_cpu_range, u64 query_size, bool clear, Func&& func) { |
| 143 | IteratePages<false>(query_cpu_range, query_size, | 143 | IteratePages<false>(query_cpu_range, query_size, |
| 144 | [&func, clear](Manager* manager, u64 offset, size_t size) { | 144 | [&func, clear](Manager* manager, u64 offset, size_t size) { |
| 145 | manager->template ForEachModifiedRange<Type::GPU>( | 145 | if (clear) { |
| 146 | manager->GetCpuAddr() + offset, size, clear, func); | 146 | manager->template ForEachModifiedRange<Type::GPU, true>( |
| 147 | manager->GetCpuAddr() + offset, size, func); | ||
| 148 | } else { | ||
| 149 | manager->template ForEachModifiedRange<Type::GPU, false>( | ||
| 150 | manager->GetCpuAddr() + offset, size, func); | ||
| 151 | } | ||
| 147 | }); | 152 | }); |
| 148 | } | 153 | } |
| 149 | 154 | ||
| @@ -151,8 +156,8 @@ public: | |||
| 151 | void ForEachDownloadRangeAndClear(VAddr query_cpu_range, u64 query_size, Func&& func) { | 156 | void ForEachDownloadRangeAndClear(VAddr query_cpu_range, u64 query_size, Func&& func) { |
| 152 | IteratePages<false>(query_cpu_range, query_size, | 157 | IteratePages<false>(query_cpu_range, query_size, |
| 153 | [&func](Manager* manager, u64 offset, size_t size) { | 158 | [&func](Manager* manager, u64 offset, size_t size) { |
| 154 | manager->template ForEachModifiedRange<Type::GPU>( | 159 | manager->template ForEachModifiedRange<Type::GPU, true>( |
| 155 | manager->GetCpuAddr() + offset, size, true, func); | 160 | manager->GetCpuAddr() + offset, size, func); |
| 156 | }); | 161 | }); |
| 157 | } | 162 | } |
| 158 | 163 | ||
diff --git a/src/video_core/buffer_cache/word_manager.h b/src/video_core/buffer_cache/word_manager.h index 21729752b..a42455045 100644 --- a/src/video_core/buffer_cache/word_manager.h +++ b/src/video_core/buffer_cache/word_manager.h | |||
| @@ -6,6 +6,7 @@ | |||
| 6 | #include <algorithm> | 6 | #include <algorithm> |
| 7 | #include <bit> | 7 | #include <bit> |
| 8 | #include <limits> | 8 | #include <limits> |
| 9 | #include <span> | ||
| 9 | #include <utility> | 10 | #include <utility> |
| 10 | 11 | ||
| 11 | #include "common/alignment.h" | 12 | #include "common/alignment.h" |
| @@ -20,9 +21,16 @@ constexpr u64 PAGES_PER_WORD = 64; | |||
| 20 | constexpr u64 BYTES_PER_PAGE = Core::Memory::YUZU_PAGESIZE; | 21 | constexpr u64 BYTES_PER_PAGE = Core::Memory::YUZU_PAGESIZE; |
| 21 | constexpr u64 BYTES_PER_WORD = PAGES_PER_WORD * BYTES_PER_PAGE; | 22 | constexpr u64 BYTES_PER_WORD = PAGES_PER_WORD * BYTES_PER_PAGE; |
| 22 | 23 | ||
| 24 | enum class Type { | ||
| 25 | CPU, | ||
| 26 | GPU, | ||
| 27 | CachedCPU, | ||
| 28 | Untracked, | ||
| 29 | }; | ||
| 30 | |||
| 23 | /// Vector tracking modified pages tightly packed with small vector optimization | 31 | /// Vector tracking modified pages tightly packed with small vector optimization |
| 24 | template <size_t stack_words = 1> | 32 | template <size_t stack_words = 1> |
| 25 | union WordsArray { | 33 | struct WordsArray { |
| 26 | /// Returns the pointer to the words state | 34 | /// Returns the pointer to the words state |
| 27 | [[nodiscard]] const u64* Pointer(bool is_short) const noexcept { | 35 | [[nodiscard]] const u64* Pointer(bool is_short) const noexcept { |
| 28 | return is_short ? stack.data() : heap; | 36 | return is_short ? stack.data() : heap; |
| @@ -41,13 +49,13 @@ template <size_t stack_words = 1> | |||
| 41 | struct Words { | 49 | struct Words { |
| 42 | explicit Words() = default; | 50 | explicit Words() = default; |
| 43 | explicit Words(u64 size_bytes_) : size_bytes{size_bytes_} { | 51 | explicit Words(u64 size_bytes_) : size_bytes{size_bytes_} { |
| 52 | num_words = Common::DivCeil(size_bytes, BYTES_PER_WORD); | ||
| 44 | if (IsShort()) { | 53 | if (IsShort()) { |
| 45 | cpu.stack.fill(~u64{0}); | 54 | cpu.stack.fill(~u64{0}); |
| 46 | gpu.stack.fill(0); | 55 | gpu.stack.fill(0); |
| 47 | cached_cpu.stack.fill(0); | 56 | cached_cpu.stack.fill(0); |
| 48 | untracked.stack.fill(~u64{0}); | 57 | untracked.stack.fill(~u64{0}); |
| 49 | } else { | 58 | } else { |
| 50 | const size_t num_words = NumWords(); | ||
| 51 | // Share allocation between CPU and GPU pages and set their default values | 59 | // Share allocation between CPU and GPU pages and set their default values |
| 52 | u64* const alloc = new u64[num_words * 4]; | 60 | u64* const alloc = new u64[num_words * 4]; |
| 53 | cpu.heap = alloc; | 61 | cpu.heap = alloc; |
| @@ -75,6 +83,7 @@ struct Words { | |||
| 75 | Words& operator=(Words&& rhs) noexcept { | 83 | Words& operator=(Words&& rhs) noexcept { |
| 76 | Release(); | 84 | Release(); |
| 77 | size_bytes = rhs.size_bytes; | 85 | size_bytes = rhs.size_bytes; |
| 86 | num_words = rhs.num_words; | ||
| 78 | cpu = rhs.cpu; | 87 | cpu = rhs.cpu; |
| 79 | gpu = rhs.gpu; | 88 | gpu = rhs.gpu; |
| 80 | cached_cpu = rhs.cached_cpu; | 89 | cached_cpu = rhs.cached_cpu; |
| @@ -84,7 +93,7 @@ struct Words { | |||
| 84 | } | 93 | } |
| 85 | 94 | ||
| 86 | Words(Words&& rhs) noexcept | 95 | Words(Words&& rhs) noexcept |
| 87 | : size_bytes{rhs.size_bytes}, cpu{rhs.cpu}, gpu{rhs.gpu}, | 96 | : size_bytes{rhs.size_bytes}, num_words{rhs.num_words}, cpu{rhs.cpu}, gpu{rhs.gpu}, |
| 88 | cached_cpu{rhs.cached_cpu}, untracked{rhs.untracked} { | 97 | cached_cpu{rhs.cached_cpu}, untracked{rhs.untracked} { |
| 89 | rhs.cpu.heap = nullptr; | 98 | rhs.cpu.heap = nullptr; |
| 90 | } | 99 | } |
| @@ -94,12 +103,12 @@ struct Words { | |||
| 94 | 103 | ||
| 95 | /// Returns true when the buffer fits in the small vector optimization | 104 | /// Returns true when the buffer fits in the small vector optimization |
| 96 | [[nodiscard]] bool IsShort() const noexcept { | 105 | [[nodiscard]] bool IsShort() const noexcept { |
| 97 | return size_bytes <= stack_words * BYTES_PER_WORD; | 106 | return num_words <= stack_words; |
| 98 | } | 107 | } |
| 99 | 108 | ||
| 100 | /// Returns the number of words of the buffer | 109 | /// Returns the number of words of the buffer |
| 101 | [[nodiscard]] size_t NumWords() const noexcept { | 110 | [[nodiscard]] size_t NumWords() const noexcept { |
| 102 | return Common::DivCeil(size_bytes, BYTES_PER_WORD); | 111 | return num_words; |
| 103 | } | 112 | } |
| 104 | 113 | ||
| 105 | /// Release buffer resources | 114 | /// Release buffer resources |
| @@ -110,20 +119,40 @@ struct Words { | |||
| 110 | } | 119 | } |
| 111 | } | 120 | } |
| 112 | 121 | ||
| 122 | template <Type type> | ||
| 123 | std::span<u64> Span() noexcept { | ||
| 124 | if constexpr (type == Type::CPU) { | ||
| 125 | return std::span<u64>(cpu.Pointer(IsShort()), num_words); | ||
| 126 | } else if constexpr (type == Type::GPU) { | ||
| 127 | return std::span<u64>(gpu.Pointer(IsShort()), num_words); | ||
| 128 | } else if constexpr (type == Type::CachedCPU) { | ||
| 129 | return std::span<u64>(cached_cpu.Pointer(IsShort()), num_words); | ||
| 130 | } else if constexpr (type == Type::Untracked) { | ||
| 131 | return std::span<u64>(untracked.Pointer(IsShort()), num_words); | ||
| 132 | } | ||
| 133 | } | ||
| 134 | |||
| 135 | template <Type type> | ||
| 136 | std::span<const u64> Span() const noexcept { | ||
| 137 | if constexpr (type == Type::CPU) { | ||
| 138 | return std::span<const u64>(cpu.Pointer(IsShort()), num_words); | ||
| 139 | } else if constexpr (type == Type::GPU) { | ||
| 140 | return std::span<const u64>(gpu.Pointer(IsShort()), num_words); | ||
| 141 | } else if constexpr (type == Type::CachedCPU) { | ||
| 142 | return std::span<const u64>(cached_cpu.Pointer(IsShort()), num_words); | ||
| 143 | } else if constexpr (type == Type::Untracked) { | ||
| 144 | return std::span<const u64>(untracked.Pointer(IsShort()), num_words); | ||
| 145 | } | ||
| 146 | } | ||
| 147 | |||
| 113 | u64 size_bytes = 0; | 148 | u64 size_bytes = 0; |
| 149 | size_t num_words = 0; | ||
| 114 | WordsArray<stack_words> cpu; | 150 | WordsArray<stack_words> cpu; |
| 115 | WordsArray<stack_words> gpu; | 151 | WordsArray<stack_words> gpu; |
| 116 | WordsArray<stack_words> cached_cpu; | 152 | WordsArray<stack_words> cached_cpu; |
| 117 | WordsArray<stack_words> untracked; | 153 | WordsArray<stack_words> untracked; |
| 118 | }; | 154 | }; |
| 119 | 155 | ||
| 120 | enum class Type { | ||
| 121 | CPU, | ||
| 122 | GPU, | ||
| 123 | CachedCPU, | ||
| 124 | Untracked, | ||
| 125 | }; | ||
| 126 | |||
| 127 | template <class RasterizerInterface, size_t stack_words = 1> | 156 | template <class RasterizerInterface, size_t stack_words = 1> |
| 128 | class WordManager { | 157 | class WordManager { |
| 129 | public: | 158 | public: |
| @@ -140,6 +169,69 @@ public: | |||
| 140 | return cpu_addr; | 169 | return cpu_addr; |
| 141 | } | 170 | } |
| 142 | 171 | ||
| 172 | static u64 ExtractBits(u64 word, size_t page_start, size_t page_end) { | ||
| 173 | constexpr size_t number_bits = sizeof(u64) * 8; | ||
| 174 | const size_t limit_page_end = number_bits - std::min(page_end, number_bits); | ||
| 175 | u64 bits = (word >> page_start) << page_start; | ||
| 176 | bits = (bits << limit_page_end) >> limit_page_end; | ||
| 177 | return bits; | ||
| 178 | } | ||
| 179 | |||
| 180 | static std::pair<size_t, size_t> GetWordPage(VAddr address) { | ||
| 181 | const size_t converted_address = static_cast<size_t>(address); | ||
| 182 | const size_t word_number = converted_address / BYTES_PER_WORD; | ||
| 183 | const size_t amount_pages = converted_address % BYTES_PER_WORD; | ||
| 184 | return std::make_pair(word_number, amount_pages / BYTES_PER_PAGE); | ||
| 185 | } | ||
| 186 | |||
| 187 | template <typename Func> | ||
| 188 | void IterateWords(size_t offset, size_t size, Func&& func) const { | ||
| 189 | using FuncReturn = std::invoke_result_t<Func, std::size_t, u64>; | ||
| 190 | static constexpr bool BOOL_BREAK = std::is_same_v<FuncReturn, bool>; | ||
| 191 | const size_t start = static_cast<size_t>(std::max<s64>(static_cast<s64>(offset), 0LL)); | ||
| 192 | const size_t end = static_cast<size_t>(std::max<s64>(static_cast<s64>(offset + size), 0LL)); | ||
| 193 | if (start >= SizeBytes() || end <= start) { | ||
| 194 | return; | ||
| 195 | } | ||
| 196 | auto [start_word, start_page] = GetWordPage(start); | ||
| 197 | auto [end_word, end_page] = GetWordPage(end + BYTES_PER_PAGE - 1ULL); | ||
| 198 | const size_t num_words = NumWords(); | ||
| 199 | start_word = std::min(start_word, num_words); | ||
| 200 | end_word = std::min(end_word, num_words); | ||
| 201 | const size_t diff = end_word - start_word; | ||
| 202 | end_word += (end_page + PAGES_PER_WORD - 1ULL) / PAGES_PER_WORD; | ||
| 203 | end_word = std::min(end_word, num_words); | ||
| 204 | end_page += diff * PAGES_PER_WORD; | ||
| 205 | constexpr u64 base_mask{~0ULL}; | ||
| 206 | for (size_t word_index = start_word; word_index < end_word; word_index++) { | ||
| 207 | const u64 mask = ExtractBits(base_mask, start_page, end_page); | ||
| 208 | start_page = 0; | ||
| 209 | end_page -= PAGES_PER_WORD; | ||
| 210 | if constexpr (BOOL_BREAK) { | ||
| 211 | if (func(word_index, mask)) { | ||
| 212 | return; | ||
| 213 | } | ||
| 214 | } else { | ||
| 215 | func(word_index, mask); | ||
| 216 | } | ||
| 217 | } | ||
| 218 | } | ||
| 219 | |||
| 220 | template <typename Func> | ||
| 221 | void IteratePages(u64 mask, Func&& func) const { | ||
| 222 | size_t offset = 0; | ||
| 223 | while (mask != 0) { | ||
| 224 | const size_t empty_bits = std::countr_zero(mask); | ||
| 225 | offset += empty_bits; | ||
| 226 | mask = mask >> empty_bits; | ||
| 227 | |||
| 228 | const size_t continuous_bits = std::countr_one(mask); | ||
| 229 | func(offset, continuous_bits); | ||
| 230 | mask = continuous_bits < PAGES_PER_WORD ? (mask >> continuous_bits) : 0; | ||
| 231 | offset += continuous_bits; | ||
| 232 | } | ||
| 233 | } | ||
| 234 | |||
| 143 | /** | 235 | /** |
| 144 | * Change the state of a range of pages | 236 | * Change the state of a range of pages |
| 145 | * | 237 | * |
| @@ -147,47 +239,33 @@ public: | |||
| 147 | * @param size Size in bytes to mark or unmark as modified | 239 | * @param size Size in bytes to mark or unmark as modified |
| 148 | */ | 240 | */ |
| 149 | template <Type type, bool enable> | 241 | template <Type type, bool enable> |
| 150 | void ChangeRegionState(u64 dirty_addr, s64 size) noexcept(type == Type::GPU) { | 242 | void ChangeRegionState(u64 dirty_addr, u64 size) noexcept(type == Type::GPU) { |
| 151 | const s64 difference = dirty_addr - cpu_addr; | 243 | std::span<u64> state_words = words.template Span<type>(); |
| 152 | const u64 offset = std::max<s64>(difference, 0); | 244 | [[maybe_unused]] std::span<u64> untracked_words = words.template Span<Type::Untracked>(); |
| 153 | size += std::min<s64>(difference, 0); | 245 | [[maybe_unused]] std::span<u64> cached_words = words.template Span<Type::CachedCPU>(); |
| 154 | if (offset >= SizeBytes() || size < 0) { | 246 | IterateWords(dirty_addr - cpu_addr, size, [&](size_t index, u64 mask) { |
| 155 | return; | ||
| 156 | } | ||
| 157 | u64* const untracked_words = Array<Type::Untracked>(); | ||
| 158 | u64* const state_words = Array<type>(); | ||
| 159 | const u64 offset_end = std::min(offset + size, SizeBytes()); | ||
| 160 | const u64 begin_page_index = offset / BYTES_PER_PAGE; | ||
| 161 | const u64 begin_word_index = begin_page_index / PAGES_PER_WORD; | ||
| 162 | const u64 end_page_index = Common::DivCeil(offset_end, BYTES_PER_PAGE); | ||
| 163 | const u64 end_word_index = Common::DivCeil(end_page_index, PAGES_PER_WORD); | ||
| 164 | u64 page_index = begin_page_index % PAGES_PER_WORD; | ||
| 165 | u64 word_index = begin_word_index; | ||
| 166 | while (word_index < end_word_index) { | ||
| 167 | const u64 next_word_first_page = (word_index + 1) * PAGES_PER_WORD; | ||
| 168 | const u64 left_offset = | ||
| 169 | std::min(next_word_first_page - end_page_index, PAGES_PER_WORD) % PAGES_PER_WORD; | ||
| 170 | const u64 right_offset = page_index; | ||
| 171 | u64 bits = ~u64{0}; | ||
| 172 | bits = (bits >> right_offset) << right_offset; | ||
| 173 | bits = (bits << left_offset) >> left_offset; | ||
| 174 | if constexpr (type == Type::CPU || type == Type::CachedCPU) { | 247 | if constexpr (type == Type::CPU || type == Type::CachedCPU) { |
| 175 | NotifyRasterizer<!enable>(word_index, untracked_words[word_index], bits); | 248 | NotifyRasterizer<!enable>(index, untracked_words[index], mask); |
| 176 | } | 249 | } |
| 177 | if constexpr (enable) { | 250 | if constexpr (enable) { |
| 178 | state_words[word_index] |= bits; | 251 | state_words[index] |= mask; |
| 179 | if constexpr (type == Type::CPU || type == Type::CachedCPU) { | 252 | if constexpr (type == Type::CPU || type == Type::CachedCPU) { |
| 180 | untracked_words[word_index] |= bits; | 253 | untracked_words[index] |= mask; |
| 254 | } | ||
| 255 | if constexpr (type == Type::CPU) { | ||
| 256 | cached_words[index] &= ~mask; | ||
| 181 | } | 257 | } |
| 182 | } else { | 258 | } else { |
| 183 | state_words[word_index] &= ~bits; | 259 | if constexpr (type == Type::CPU) { |
| 260 | const u64 word = state_words[index] & mask; | ||
| 261 | cached_words[index] &= ~word; | ||
| 262 | } | ||
| 263 | state_words[index] &= ~mask; | ||
| 184 | if constexpr (type == Type::CPU || type == Type::CachedCPU) { | 264 | if constexpr (type == Type::CPU || type == Type::CachedCPU) { |
| 185 | untracked_words[word_index] &= ~bits; | 265 | untracked_words[index] &= ~mask; |
| 186 | } | 266 | } |
| 187 | } | 267 | } |
| 188 | page_index = 0; | 268 | }); |
| 189 | ++word_index; | ||
| 190 | } | ||
| 191 | } | 269 | } |
| 192 | 270 | ||
| 193 | /** | 271 | /** |
| @@ -198,119 +276,59 @@ public: | |||
| 198 | * @param size Size in bytes of the CPU range to loop over | 276 | * @param size Size in bytes of the CPU range to loop over |
| 199 | * @param func Function to call for each turned off region | 277 | * @param func Function to call for each turned off region |
| 200 | */ | 278 | */ |
| 201 | template <Type type, typename Func> | 279 | template <Type type, bool clear, typename Func> |
| 202 | void ForEachModifiedRange(VAddr query_cpu_range, s64 size, bool clear, Func&& func) { | 280 | void ForEachModifiedRange(VAddr query_cpu_range, s64 size, Func&& func) { |
| 203 | static_assert(type != Type::Untracked); | 281 | static_assert(type != Type::Untracked); |
| 204 | 282 | ||
| 205 | const s64 difference = query_cpu_range - cpu_addr; | 283 | std::span<u64> state_words = words.template Span<type>(); |
| 206 | const u64 query_begin = std::max<s64>(difference, 0); | 284 | [[maybe_unused]] std::span<u64> untracked_words = words.template Span<Type::Untracked>(); |
| 207 | size += std::min<s64>(difference, 0); | 285 | [[maybe_unused]] std::span<u64> cached_words = words.template Span<Type::CachedCPU>(); |
| 208 | if (query_begin >= SizeBytes() || size < 0) { | 286 | const size_t offset = query_cpu_range - cpu_addr; |
| 209 | return; | 287 | bool pending = false; |
| 210 | } | 288 | size_t pending_offset{}; |
| 211 | [[maybe_unused]] u64* const untracked_words = Array<Type::Untracked>(); | 289 | size_t pending_pointer{}; |
| 212 | [[maybe_unused]] u64* const cpu_words = Array<Type::CPU>(); | 290 | const auto release = [&]() { |
| 213 | u64* const state_words = Array<type>(); | 291 | func(cpu_addr + pending_offset * BYTES_PER_PAGE, |
| 214 | const u64 query_end = query_begin + std::min(static_cast<u64>(size), SizeBytes()); | 292 | (pending_pointer - pending_offset) * BYTES_PER_PAGE); |
| 215 | u64* const words_begin = state_words + query_begin / BYTES_PER_WORD; | 293 | }; |
| 216 | u64* const words_end = state_words + Common::DivCeil(query_end, BYTES_PER_WORD); | 294 | IterateWords(offset, size, [&](size_t index, u64 mask) { |
| 217 | u64 first_page = (query_begin / BYTES_PER_PAGE) % PAGES_PER_WORD; | 295 | const u64 word = state_words[index] & mask; |
| 218 | 296 | if constexpr (clear) { | |
| 219 | const auto modified = [](u64 word) { return word != 0; }; | 297 | if constexpr (type == Type::CPU || type == Type::CachedCPU) { |
| 220 | const auto first_modified_word = std::find_if(words_begin, words_end, modified); | 298 | NotifyRasterizer<true>(index, untracked_words[index], mask); |
| 221 | if (first_modified_word == words_end) { | ||
| 222 | // Exit early when the buffer is not modified | ||
| 223 | return; | ||
| 224 | } | ||
| 225 | if (first_modified_word != words_begin) { | ||
| 226 | first_page = 0; | ||
| 227 | } | ||
| 228 | std::reverse_iterator<u64*> first_word_reverse(first_modified_word); | ||
| 229 | std::reverse_iterator<u64*> last_word_iterator(words_end); | ||
| 230 | auto last_word_result = std::find_if(last_word_iterator, first_word_reverse, modified); | ||
| 231 | u64* const last_modified_word = &(*last_word_result) + 1; | ||
| 232 | |||
| 233 | const u64 word_index_begin = std::distance(state_words, first_modified_word); | ||
| 234 | const u64 word_index_end = std::distance(state_words, last_modified_word); | ||
| 235 | const unsigned local_page_begin = std::countr_zero(*first_modified_word); | ||
| 236 | const unsigned local_page_end = | ||
| 237 | static_cast<unsigned>(PAGES_PER_WORD) - std::countl_zero(last_modified_word[-1]); | ||
| 238 | const u64 word_page_begin = word_index_begin * PAGES_PER_WORD; | ||
| 239 | const u64 word_page_end = (word_index_end - 1) * PAGES_PER_WORD; | ||
| 240 | const u64 query_page_begin = query_begin / BYTES_PER_PAGE; | ||
| 241 | const u64 query_page_end = Common::DivCeil(query_end, BYTES_PER_PAGE); | ||
| 242 | const u64 page_index_begin = std::max(word_page_begin + local_page_begin, query_page_begin); | ||
| 243 | const u64 page_index_end = std::min(word_page_end + local_page_end, query_page_end); | ||
| 244 | const u64 first_word_page_begin = page_index_begin % PAGES_PER_WORD; | ||
| 245 | const u64 last_word_page_end = (page_index_end - 1) % PAGES_PER_WORD + 1; | ||
| 246 | |||
| 247 | u64 page_begin = std::max(first_word_page_begin, first_page); | ||
| 248 | u64 current_base = 0; | ||
| 249 | u64 current_size = 0; | ||
| 250 | bool on_going = false; | ||
| 251 | for (u64 word_index = word_index_begin; word_index < word_index_end; ++word_index) { | ||
| 252 | const bool is_last_word = word_index + 1 == word_index_end; | ||
| 253 | const u64 page_end = is_last_word ? last_word_page_end : PAGES_PER_WORD; | ||
| 254 | const u64 right_offset = page_begin; | ||
| 255 | const u64 left_offset = PAGES_PER_WORD - page_end; | ||
| 256 | u64 bits = ~u64{0}; | ||
| 257 | bits = (bits >> right_offset) << right_offset; | ||
| 258 | bits = (bits << left_offset) >> left_offset; | ||
| 259 | |||
| 260 | const u64 current_word = state_words[word_index] & bits; | ||
| 261 | if (clear) { | ||
| 262 | state_words[word_index] &= ~bits; | ||
| 263 | } | ||
| 264 | |||
| 265 | if constexpr (type == Type::CachedCPU) { | ||
| 266 | NotifyRasterizer<false>(word_index, untracked_words[word_index], current_word); | ||
| 267 | untracked_words[word_index] |= current_word; | ||
| 268 | cpu_words[word_index] |= current_word; | ||
| 269 | } | ||
| 270 | |||
| 271 | if constexpr (type == Type::CPU) { | ||
| 272 | const u64 current_bits = untracked_words[word_index] & bits; | ||
| 273 | untracked_words[word_index] &= ~bits; | ||
| 274 | NotifyRasterizer<true>(word_index, current_bits, ~u64{0}); | ||
| 275 | } | ||
| 276 | const u64 word = current_word & ~(type == Type::GPU ? untracked_words[word_index] : 0); | ||
| 277 | u64 page = page_begin; | ||
| 278 | page_begin = 0; | ||
| 279 | |||
| 280 | while (page < page_end) { | ||
| 281 | const int empty_bits = std::countr_zero(word >> page); | ||
| 282 | if (on_going && empty_bits != 0) { | ||
| 283 | InvokeModifiedRange(func, current_size, current_base); | ||
| 284 | current_size = 0; | ||
| 285 | on_going = false; | ||
| 286 | } | 299 | } |
| 287 | if (empty_bits == PAGES_PER_WORD) { | 300 | state_words[index] &= ~mask; |
| 288 | break; | 301 | if constexpr (type == Type::CPU || type == Type::CachedCPU) { |
| 302 | untracked_words[index] &= ~mask; | ||
| 289 | } | 303 | } |
| 290 | page += empty_bits; | 304 | if constexpr (type == Type::CPU) { |
| 291 | 305 | cached_words[index] &= ~word; | |
| 292 | const int continuous_bits = std::countr_one(word >> page); | ||
| 293 | if (!on_going && continuous_bits != 0) { | ||
| 294 | current_base = word_index * PAGES_PER_WORD + page; | ||
| 295 | on_going = true; | ||
| 296 | } | 306 | } |
| 297 | current_size += continuous_bits; | ||
| 298 | page += continuous_bits; | ||
| 299 | } | 307 | } |
| 300 | } | 308 | const size_t base_offset = index * PAGES_PER_WORD; |
| 301 | if (on_going && current_size > 0) { | 309 | IteratePages(word, [&](size_t pages_offset, size_t pages_size) { |
| 302 | InvokeModifiedRange(func, current_size, current_base); | 310 | const auto reset = [&]() { |
| 311 | pending_offset = base_offset + pages_offset; | ||
| 312 | pending_pointer = base_offset + pages_offset + pages_size; | ||
| 313 | }; | ||
| 314 | if (!pending) { | ||
| 315 | reset(); | ||
| 316 | pending = true; | ||
| 317 | return; | ||
| 318 | } | ||
| 319 | if (pending_pointer == base_offset + pages_offset) { | ||
| 320 | pending_pointer += pages_size; | ||
| 321 | return; | ||
| 322 | } | ||
| 323 | release(); | ||
| 324 | reset(); | ||
| 325 | }); | ||
| 326 | }); | ||
| 327 | if (pending) { | ||
| 328 | release(); | ||
| 303 | } | 329 | } |
| 304 | } | 330 | } |
| 305 | 331 | ||
| 306 | template <typename Func> | ||
| 307 | void InvokeModifiedRange(Func&& func, u64 current_size, u64 current_base) { | ||
| 308 | const u64 current_size_bytes = current_size * BYTES_PER_PAGE; | ||
| 309 | const u64 offset_begin = current_base * BYTES_PER_PAGE; | ||
| 310 | const u64 offset_end = std::min(offset_begin + current_size_bytes, SizeBytes()); | ||
| 311 | func(cpu_addr + offset_begin, offset_end - offset_begin); | ||
| 312 | } | ||
| 313 | |||
| 314 | /** | 332 | /** |
| 315 | * Returns true when a region has been modified | 333 | * Returns true when a region has been modified |
| 316 | * | 334 | * |
| @@ -321,27 +339,17 @@ public: | |||
| 321 | [[nodiscard]] bool IsRegionModified(u64 offset, u64 size) const noexcept { | 339 | [[nodiscard]] bool IsRegionModified(u64 offset, u64 size) const noexcept { |
| 322 | static_assert(type != Type::Untracked); | 340 | static_assert(type != Type::Untracked); |
| 323 | 341 | ||
| 324 | const u64* const untracked_words = Array<Type::Untracked>(); | 342 | const std::span<const u64> state_words = words.template Span<type>(); |
| 325 | const u64* const state_words = Array<type>(); | 343 | bool result = false; |
| 326 | const u64 num_query_words = size / BYTES_PER_WORD + 1; | 344 | IterateWords(offset, size, [&](size_t index, u64 mask) { |
| 327 | const u64 word_begin = offset / BYTES_PER_WORD; | 345 | const u64 word = state_words[index] & mask; |
| 328 | const u64 word_end = std::min(word_begin + num_query_words, NumWords()); | 346 | if (word != 0) { |
| 329 | const u64 page_limit = Common::DivCeil(offset + size, BYTES_PER_PAGE); | 347 | result = true; |
| 330 | u64 page_index = (offset / BYTES_PER_PAGE) % PAGES_PER_WORD; | ||
| 331 | for (u64 word_index = word_begin; word_index < word_end; ++word_index, page_index = 0) { | ||
| 332 | const u64 off_word = type == Type::GPU ? untracked_words[word_index] : 0; | ||
| 333 | const u64 word = state_words[word_index] & ~off_word; | ||
| 334 | if (word == 0) { | ||
| 335 | continue; | ||
| 336 | } | ||
| 337 | const u64 page_end = std::min((word_index + 1) * PAGES_PER_WORD, page_limit); | ||
| 338 | const u64 local_page_end = page_end % PAGES_PER_WORD; | ||
| 339 | const u64 page_end_shift = (PAGES_PER_WORD - local_page_end) % PAGES_PER_WORD; | ||
| 340 | if (((word >> page_index) << page_index) << page_end_shift != 0) { | ||
| 341 | return true; | 348 | return true; |
| 342 | } | 349 | } |
| 343 | } | 350 | return false; |
| 344 | return false; | 351 | }); |
| 352 | return result; | ||
| 345 | } | 353 | } |
| 346 | 354 | ||
| 347 | /** | 355 | /** |
| @@ -353,34 +361,20 @@ public: | |||
| 353 | template <Type type> | 361 | template <Type type> |
| 354 | [[nodiscard]] std::pair<u64, u64> ModifiedRegion(u64 offset, u64 size) const noexcept { | 362 | [[nodiscard]] std::pair<u64, u64> ModifiedRegion(u64 offset, u64 size) const noexcept { |
| 355 | static_assert(type != Type::Untracked); | 363 | static_assert(type != Type::Untracked); |
| 356 | const u64* const state_words = Array<type>(); | 364 | const std::span<const u64> state_words = words.template Span<type>(); |
| 357 | const u64 num_query_words = size / BYTES_PER_WORD + 1; | ||
| 358 | const u64 word_begin = offset / BYTES_PER_WORD; | ||
| 359 | const u64 word_end = std::min(word_begin + num_query_words, NumWords()); | ||
| 360 | const u64 page_base = offset / BYTES_PER_PAGE; | ||
| 361 | u64 page_begin = page_base & (PAGES_PER_WORD - 1); | ||
| 362 | u64 page_end = | ||
| 363 | Common::DivCeil(offset + size, BYTES_PER_PAGE) - (page_base & ~(PAGES_PER_WORD - 1)); | ||
| 364 | u64 begin = std::numeric_limits<u64>::max(); | 365 | u64 begin = std::numeric_limits<u64>::max(); |
| 365 | u64 end = 0; | 366 | u64 end = 0; |
| 366 | for (u64 word_index = word_begin; word_index < word_end; ++word_index) { | 367 | IterateWords(offset, size, [&](size_t index, u64 mask) { |
| 367 | const u64 base_mask = (1ULL << page_begin) - 1ULL; | 368 | const u64 word = state_words[index] & mask; |
| 368 | const u64 end_mask = page_end >= PAGES_PER_WORD ? 0ULL : ~((1ULL << page_end) - 1ULL); | ||
| 369 | const u64 off_word = end_mask | base_mask; | ||
| 370 | const u64 word = state_words[word_index] & ~off_word; | ||
| 371 | if (word == 0) { | 369 | if (word == 0) { |
| 372 | page_begin = 0; | 370 | return; |
| 373 | page_end -= PAGES_PER_WORD; | ||
| 374 | continue; | ||
| 375 | } | 371 | } |
| 376 | const u64 local_page_begin = std::countr_zero(word); | 372 | const u64 local_page_begin = std::countr_zero(word); |
| 377 | const u64 local_page_end = PAGES_PER_WORD - std::countl_zero(word); | 373 | const u64 local_page_end = PAGES_PER_WORD - std::countl_zero(word); |
| 378 | const u64 page_index = word_index * PAGES_PER_WORD; | 374 | const u64 page_index = index * PAGES_PER_WORD; |
| 379 | begin = std::min(begin, page_index + local_page_begin); | 375 | begin = std::min(begin, page_index + local_page_begin); |
| 380 | end = page_index + local_page_end; | 376 | end = page_index + local_page_end; |
| 381 | page_begin = 0; | 377 | }); |
| 382 | page_end -= PAGES_PER_WORD; | ||
| 383 | } | ||
| 384 | static constexpr std::pair<u64, u64> EMPTY{0, 0}; | 378 | static constexpr std::pair<u64, u64> EMPTY{0, 0}; |
| 385 | return begin < end ? std::make_pair(begin * BYTES_PER_PAGE, end * BYTES_PER_PAGE) : EMPTY; | 379 | return begin < end ? std::make_pair(begin * BYTES_PER_PAGE, end * BYTES_PER_PAGE) : EMPTY; |
| 386 | } | 380 | } |
| @@ -454,18 +448,10 @@ private: | |||
| 454 | void NotifyRasterizer(u64 word_index, u64 current_bits, u64 new_bits) const { | 448 | void NotifyRasterizer(u64 word_index, u64 current_bits, u64 new_bits) const { |
| 455 | u64 changed_bits = (add_to_rasterizer ? current_bits : ~current_bits) & new_bits; | 449 | u64 changed_bits = (add_to_rasterizer ? current_bits : ~current_bits) & new_bits; |
| 456 | VAddr addr = cpu_addr + word_index * BYTES_PER_WORD; | 450 | VAddr addr = cpu_addr + word_index * BYTES_PER_WORD; |
| 457 | while (changed_bits != 0) { | 451 | IteratePages(changed_bits, [&](size_t offset, size_t size) { |
| 458 | const int empty_bits = std::countr_zero(changed_bits); | 452 | rasterizer->UpdatePagesCachedCount(addr + offset * BYTES_PER_PAGE, |
| 459 | addr += empty_bits * BYTES_PER_PAGE; | 453 | size * BYTES_PER_PAGE, add_to_rasterizer ? 1 : -1); |
| 460 | changed_bits >>= empty_bits; | 454 | }); |
| 461 | |||
| 462 | const u32 continuous_bits = std::countr_one(changed_bits); | ||
| 463 | const u64 size = continuous_bits * BYTES_PER_PAGE; | ||
| 464 | const VAddr begin_addr = addr; | ||
| 465 | addr += size; | ||
| 466 | changed_bits = continuous_bits < PAGES_PER_WORD ? (changed_bits >> continuous_bits) : 0; | ||
| 467 | rasterizer->UpdatePagesCachedCount(begin_addr, size, add_to_rasterizer ? 1 : -1); | ||
| 468 | } | ||
| 469 | } | 455 | } |
| 470 | 456 | ||
| 471 | VAddr cpu_addr = 0; | 457 | VAddr cpu_addr = 0; |