| author | 2022-11-20 00:09:56 +0100 |
|---|---|
| committer | 2023-04-29 00:46:31 +0200 |
| commit | a16c2611316e534bda310f99319f4e8c74c49c92 (patch) |
| tree | 906e44bc1bfcd358ecf7510a99adff05394c2846 /src |
| parent | Merge pull request #10051 from liamwhite/surface-capabilities (diff) |
| download | yuzu-a16c2611316e534bda310f99319f4e8c74c49c92.tar.gz yuzu-a16c2611316e534bda310f99319f4e8c74c49c92.tar.xz yuzu-a16c2611316e534bda310f99319f4e8c74c49c92.zip |
Buffer Cache: Fully rework the buffer cache.
Diffstat (limited to 'src')
| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | src/video_core/CMakeLists.txt | 5 |
| -rw-r--r-- | src/video_core/buffer_cache/buffer_base.h | 459 |
| -rw-r--r-- | src/video_core/buffer_cache/buffer_cache.cpp | 4 |
| -rw-r--r-- | src/video_core/buffer_cache/buffer_cache.h | 990 |
| -rw-r--r-- | src/video_core/buffer_cache/buffer_cache_base.h | 507 |
| -rw-r--r-- | src/video_core/buffer_cache/memory_tracker_base.h | 258 |
| -rw-r--r-- | src/video_core/buffer_cache/word_manager.h | 474 |
| -rw-r--r-- | src/video_core/renderer_opengl/gl_buffer_cache.h | 4 |
| -rw-r--r-- | src/video_core/renderer_opengl/gl_buffer_cache_base.cpp | 9 |
| -rw-r--r-- | src/video_core/renderer_vulkan/vk_buffer_cache.cpp | 8 |
| -rw-r--r-- | src/video_core/renderer_vulkan/vk_buffer_cache.h | 8 |
| -rw-r--r-- | src/video_core/renderer_vulkan/vk_buffer_cache_base.cpp | 9 |

12 files changed, 1644 insertions(+), 1091 deletions(-)
```diff
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index e904573d7..92cab93f3 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -11,8 +11,11 @@ endif()
 
 add_library(video_core STATIC
     buffer_cache/buffer_base.h
+    buffer_cache/buffer_cache_base.h
     buffer_cache/buffer_cache.cpp
     buffer_cache/buffer_cache.h
+    buffer_cache/memory_tracker_base.h
+    buffer_cache/word_manager.h
     cache_types.h
     cdma_pusher.cpp
     cdma_pusher.h
@@ -104,6 +107,7 @@ add_library(video_core STATIC
     renderer_null/renderer_null.h
     renderer_opengl/blit_image.cpp
     renderer_opengl/blit_image.h
+    renderer_opengl/gl_buffer_cache_base.cpp
    renderer_opengl/gl_buffer_cache.cpp
     renderer_opengl/gl_buffer_cache.h
     renderer_opengl/gl_compute_pipeline.cpp
@@ -154,6 +158,7 @@ add_library(video_core STATIC
     renderer_vulkan/renderer_vulkan.cpp
     renderer_vulkan/vk_blit_screen.cpp
     renderer_vulkan/vk_blit_screen.h
+    renderer_vulkan/vk_buffer_cache_base.cpp
     renderer_vulkan/vk_buffer_cache.cpp
     renderer_vulkan/vk_buffer_cache.h
     renderer_vulkan/vk_command_pool.cpp
```
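The two new `*_buffer_cache_base.cpp` translation units are tiny (9 lines each per the diffstat). Splitting the template's declarations (`buffer_cache_base.h`) from its definitions (`buffer_cache.h`) suggests the classic explicit-instantiation pattern, where a single .cpp per backend pays the template's compile cost once. A minimal sketch of that pattern, assuming the backend's `BufferCacheParams` traits type from `gl_buffer_cache.h` (the file's exact contents are not shown in this diff):

```cpp
// Hypothetical reconstruction of renderer_opengl/gl_buffer_cache_base.cpp:
// instantiate the shared BufferCache template once for the OpenGL backend.
#include "video_core/buffer_cache/buffer_cache.h"
#include "video_core/renderer_opengl/gl_buffer_cache.h"

namespace VideoCommon {
template class VideoCommon::BufferCache<OpenGL::BufferCacheParams>;
}
```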
```diff
diff --git a/src/video_core/buffer_cache/buffer_base.h b/src/video_core/buffer_cache/buffer_base.h
index 1b4d63616..66d8bb43c 100644
--- a/src/video_core/buffer_cache/buffer_base.h
+++ b/src/video_core/buffer_cache/buffer_base.h
@@ -1,5 +1,5 @@
-// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project
-// SPDX-License-Identifier: GPL-2.0-or-later
+// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
 
 #pragma once
 
@@ -11,9 +11,7 @@
 #include "common/alignment.h"
 #include "common/common_funcs.h"
 #include "common/common_types.h"
-#include "common/div_ceil.h"
-#include "common/settings.h"
-#include "core/memory.h"
+#include "video_core/buffer_cache/word_manager.h"
 
 namespace VideoCommon {
 
@@ -36,116 +34,14 @@ struct NullBufferParams {};
  */
 template <class RasterizerInterface>
 class BufferBase {
-    static constexpr u64 PAGES_PER_WORD = 64;
-    static constexpr u64 BYTES_PER_PAGE = Core::Memory::YUZU_PAGESIZE;
-    static constexpr u64 BYTES_PER_WORD = PAGES_PER_WORD * BYTES_PER_PAGE;
-
-    /// Vector tracking modified pages tightly packed with small vector optimization
-    union WordsArray {
-        /// Returns the pointer to the words state
-        [[nodiscard]] const u64* Pointer(bool is_short) const noexcept {
-            return is_short ? &stack : heap;
-        }
-
-        /// Returns the pointer to the words state
-        [[nodiscard]] u64* Pointer(bool is_short) noexcept {
-            return is_short ? &stack : heap;
-        }
-
-        u64 stack = 0; ///< Small buffers storage
-        u64* heap;     ///< Not-small buffers pointer to the storage
-    };
-
-    struct Words {
-        explicit Words() = default;
-        explicit Words(u64 size_bytes_) : size_bytes{size_bytes_} {
-            if (IsShort()) {
-                cpu.stack = ~u64{0};
-                gpu.stack = 0;
-                cached_cpu.stack = 0;
-                untracked.stack = ~u64{0};
-            } else {
-                // Share allocation between CPU and GPU pages and set their default values
-                const size_t num_words = NumWords();
-                u64* const alloc = new u64[num_words * 4];
-                cpu.heap = alloc;
-                gpu.heap = alloc + num_words;
-                cached_cpu.heap = alloc + num_words * 2;
-                untracked.heap = alloc + num_words * 3;
-                std::fill_n(cpu.heap, num_words, ~u64{0});
-                std::fill_n(gpu.heap, num_words, 0);
-                std::fill_n(cached_cpu.heap, num_words, 0);
-                std::fill_n(untracked.heap, num_words, ~u64{0});
-            }
-            // Clean up tailing bits
-            const u64 last_word_size = size_bytes % BYTES_PER_WORD;
-            const u64 last_local_page = Common::DivCeil(last_word_size, BYTES_PER_PAGE);
-            const u64 shift = (PAGES_PER_WORD - last_local_page) % PAGES_PER_WORD;
-            const u64 last_word = (~u64{0} << shift) >> shift;
-            cpu.Pointer(IsShort())[NumWords() - 1] = last_word;
-            untracked.Pointer(IsShort())[NumWords() - 1] = last_word;
-        }
-
-        ~Words() {
-            Release();
-        }
-
-        Words& operator=(Words&& rhs) noexcept {
-            Release();
-            size_bytes = rhs.size_bytes;
-            cpu = rhs.cpu;
-            gpu = rhs.gpu;
-            cached_cpu = rhs.cached_cpu;
-            untracked = rhs.untracked;
-            rhs.cpu.heap = nullptr;
-            return *this;
-        }
-
-        Words(Words&& rhs) noexcept
-            : size_bytes{rhs.size_bytes}, cpu{rhs.cpu}, gpu{rhs.gpu},
-              cached_cpu{rhs.cached_cpu}, untracked{rhs.untracked} {
-            rhs.cpu.heap = nullptr;
-        }
-
-        Words& operator=(const Words&) = delete;
-        Words(const Words&) = delete;
-
-        /// Returns true when the buffer fits in the small vector optimization
-        [[nodiscard]] bool IsShort() const noexcept {
-            return size_bytes <= BYTES_PER_WORD;
-        }
-
-        /// Returns the number of words of the buffer
-        [[nodiscard]] size_t NumWords() const noexcept {
-            return Common::DivCeil(size_bytes, BYTES_PER_WORD);
-        }
-
-        /// Release buffer resources
-        void Release() {
-            if (!IsShort()) {
-                // CPU written words is the base for the heap allocation
-                delete[] cpu.heap;
-            }
-        }
-
-        u64 size_bytes = 0;
-        WordsArray cpu;
-        WordsArray gpu;
-        WordsArray cached_cpu;
-        WordsArray untracked;
-    };
-
-    enum class Type {
-        CPU,
-        GPU,
-        CachedCPU,
-        Untracked,
-    };
-
 public:
+    static constexpr u64 BASE_PAGE_BITS = 16;
+    static constexpr u64 BASE_PAGE_SIZE = 1ULL << BASE_PAGE_BITS;
+
     explicit BufferBase(RasterizerInterface& rasterizer_, VAddr cpu_addr_, u64 size_bytes)
-        : rasterizer{&rasterizer_}, cpu_addr{Common::AlignDown(cpu_addr_, BYTES_PER_PAGE)},
-          words(Common::AlignUp(size_bytes + (cpu_addr_ - cpu_addr), BYTES_PER_PAGE)) {}
+        : cpu_addr{Common::AlignDown(cpu_addr_, BASE_PAGE_SIZE)},
+          word_manager(cpu_addr, rasterizer_,
+                       Common::AlignUp(size_bytes + (cpu_addr_ - cpu_addr), BASE_PAGE_SIZE)) {}
 
     explicit BufferBase(NullBufferParams) {}
 
@@ -159,94 +55,82 @@ public:
     [[nodiscard]] std::pair<u64, u64> ModifiedCpuRegion(VAddr query_cpu_addr,
                                                         u64 query_size) const noexcept {
         const u64 offset = query_cpu_addr - cpu_addr;
-        return ModifiedRegion<Type::CPU>(offset, query_size);
+        return word_manager.ModifiedRegion<Type::CPU>(offset, query_size);
     }
 
     /// Returns the inclusive GPU modified range in a begin end pair
     [[nodiscard]] std::pair<u64, u64> ModifiedGpuRegion(VAddr query_cpu_addr,
                                                         u64 query_size) const noexcept {
         const u64 offset = query_cpu_addr - cpu_addr;
-        return ModifiedRegion<Type::GPU>(offset, query_size);
+        return word_manager.ModifiedRegion<Type::GPU>(offset, query_size);
     }
 
     /// Returns true if a region has been modified from the CPU
     [[nodiscard]] bool IsRegionCpuModified(VAddr query_cpu_addr, u64 query_size) const noexcept {
         const u64 offset = query_cpu_addr - cpu_addr;
-        return IsRegionModified<Type::CPU>(offset, query_size);
+        return word_manager.IsRegionModified<Type::CPU>(offset, query_size);
     }
 
     /// Returns true if a region has been modified from the GPU
     [[nodiscard]] bool IsRegionGpuModified(VAddr query_cpu_addr, u64 query_size) const noexcept {
         const u64 offset = query_cpu_addr - cpu_addr;
-        return IsRegionModified<Type::GPU>(offset, query_size);
+        return word_manager.IsRegionModified<Type::GPU>(offset, query_size);
     }
 
     /// Mark region as CPU modified, notifying the rasterizer about this change
     void MarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 size) {
-        ChangeRegionState<Type::CPU, true>(dirty_cpu_addr, size);
+        word_manager.ChangeRegionState<Type::CPU, true>(dirty_cpu_addr, size);
     }
 
     /// Unmark region as CPU modified, notifying the rasterizer about this change
     void UnmarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 size) {
-        ChangeRegionState<Type::CPU, false>(dirty_cpu_addr, size);
+        word_manager.ChangeRegionState<Type::CPU, false>(dirty_cpu_addr, size);
     }
 
     /// Mark region as modified from the host GPU
     void MarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 size) noexcept {
-        ChangeRegionState<Type::GPU, true>(dirty_cpu_addr, size);
+        word_manager.ChangeRegionState<Type::GPU, true>(dirty_cpu_addr, size);
     }
 
     /// Unmark region as modified from the host GPU
     void UnmarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 size) noexcept {
-        ChangeRegionState<Type::GPU, false>(dirty_cpu_addr, size);
+        word_manager.ChangeRegionState<Type::GPU, false>(dirty_cpu_addr, size);
     }
 
     /// Mark region as modified from the CPU
     /// but don't mark it as modified until FlushCachedWrites is called.
     void CachedCpuWrite(VAddr dirty_cpu_addr, u64 size) {
         flags |= BufferFlagBits::CachedWrites;
-        ChangeRegionState<Type::CachedCPU, true>(dirty_cpu_addr, size);
+        word_manager.ChangeRegionState<Type::CachedCPU, true>(dirty_cpu_addr, size);
     }
 
     /// Flushes cached CPU writes, and notifies the rasterizer about the deltas
     void FlushCachedWrites() noexcept {
         flags &= ~BufferFlagBits::CachedWrites;
-        const u64 num_words = NumWords();
-        u64* const cached_words = Array<Type::CachedCPU>();
-        u64* const untracked_words = Array<Type::Untracked>();
-        u64* const cpu_words = Array<Type::CPU>();
-        for (u64 word_index = 0; word_index < num_words; ++word_index) {
-            const u64 cached_bits = cached_words[word_index];
-            NotifyRasterizer<false>(word_index, untracked_words[word_index], cached_bits);
-            untracked_words[word_index] |= cached_bits;
-            cpu_words[word_index] |= cached_bits;
-            if (!Settings::values.use_pessimistic_flushes) {
-                cached_words[word_index] = 0;
-            }
-        }
+        word_manager.FlushCachedWrites();
     }
 
     /// Call 'func' for each CPU modified range and unmark those pages as CPU modified
     template <typename Func>
     void ForEachUploadRange(VAddr query_cpu_range, u64 size, Func&& func) {
-        ForEachModifiedRange<Type::CPU>(query_cpu_range, size, true, func);
+        word_manager.ForEachModifiedRange<Type::CPU>(query_cpu_range, size, true, func);
     }
 
     /// Call 'func' for each GPU modified range and unmark those pages as GPU modified
     template <typename Func>
     void ForEachDownloadRange(VAddr query_cpu_range, u64 size, bool clear, Func&& func) {
-        ForEachModifiedRange<Type::GPU>(query_cpu_range, size, clear, func);
+        word_manager.ForEachModifiedRange<Type::GPU>(query_cpu_range, size, clear, func);
     }
 
     template <typename Func>
     void ForEachDownloadRangeAndClear(VAddr query_cpu_range, u64 size, Func&& func) {
-        ForEachModifiedRange<Type::GPU>(query_cpu_range, size, true, func);
+        word_manager.ForEachModifiedRange<Type::GPU>(query_cpu_range, size, true, func);
     }
 
     /// Call 'func' for each GPU modified range and unmark those pages as GPU modified
     template <typename Func>
     void ForEachDownloadRange(Func&& func) {
-        ForEachModifiedRange<Type::GPU>(cpu_addr, SizeBytes(), true, func);
+        word_manager.ForEachModifiedRange<Type::GPU>(cpu_addr, SizeBytes(), true, func);
     }
 
     /// Mark buffer as picked
@@ -297,7 +181,7 @@ public:
 
     /// Returns the size in bytes of the buffer
     [[nodiscard]] u64 SizeBytes() const noexcept {
-        return words.size_bytes;
+        return word_manager.SizeBytes();
     }
 
     size_t getLRUID() const noexcept {
@@ -309,301 +193,8 @@ public:
     }
 
 private:
-    template <Type type>
-    u64* Array() noexcept {
-        if constexpr (type == Type::CPU) {
-            return words.cpu.Pointer(IsShort());
-        } else if constexpr (type == Type::GPU) {
-            return words.gpu.Pointer(IsShort());
-        } else if constexpr (type == Type::CachedCPU) {
-            return words.cached_cpu.Pointer(IsShort());
-        } else if constexpr (type == Type::Untracked) {
-            return words.untracked.Pointer(IsShort());
-        }
-    }
-
-    template <Type type>
-    const u64* Array() const noexcept {
-        if constexpr (type == Type::CPU) {
-            return words.cpu.Pointer(IsShort());
-        } else if constexpr (type == Type::GPU) {
-            return words.gpu.Pointer(IsShort());
-        } else if constexpr (type == Type::CachedCPU) {
-            return words.cached_cpu.Pointer(IsShort());
-        } else if constexpr (type == Type::Untracked) {
-            return words.untracked.Pointer(IsShort());
-        }
-    }
-
-    /**
-     * Change the state of a range of pages
-     *
-     * @param dirty_addr Base address to mark or unmark as modified
-     * @param size       Size in bytes to mark or unmark as modified
-     */
-    template <Type type, bool enable>
-    void ChangeRegionState(u64 dirty_addr, s64 size) noexcept(type == Type::GPU) {
-        const s64 difference = dirty_addr - cpu_addr;
-        const u64 offset = std::max<s64>(difference, 0);
-        size += std::min<s64>(difference, 0);
-        if (offset >= SizeBytes() || size < 0) {
-            return;
-        }
-        u64* const untracked_words = Array<Type::Untracked>();
-        u64* const state_words = Array<type>();
-        const u64 offset_end = std::min(offset + size, SizeBytes());
-        const u64 begin_page_index = offset / BYTES_PER_PAGE;
-        const u64 begin_word_index = begin_page_index / PAGES_PER_WORD;
-        const u64 end_page_index = Common::DivCeil(offset_end, BYTES_PER_PAGE);
-        const u64 end_word_index = Common::DivCeil(end_page_index, PAGES_PER_WORD);
-        u64 page_index = begin_page_index % PAGES_PER_WORD;
-        u64 word_index = begin_word_index;
-        while (word_index < end_word_index) {
-            const u64 next_word_first_page = (word_index + 1) * PAGES_PER_WORD;
-            const u64 left_offset =
-                std::min(next_word_first_page - end_page_index, PAGES_PER_WORD) % PAGES_PER_WORD;
-            const u64 right_offset = page_index;
-            u64 bits = ~u64{0};
-            bits = (bits >> right_offset) << right_offset;
-            bits = (bits << left_offset) >> left_offset;
-            if constexpr (type == Type::CPU || type == Type::CachedCPU) {
-                NotifyRasterizer<!enable>(word_index, untracked_words[word_index], bits);
-            }
-            if constexpr (enable) {
-                state_words[word_index] |= bits;
-                if constexpr (type == Type::CPU || type == Type::CachedCPU) {
-                    untracked_words[word_index] |= bits;
-                }
-            } else {
-                state_words[word_index] &= ~bits;
-                if constexpr (type == Type::CPU || type == Type::CachedCPU) {
-                    untracked_words[word_index] &= ~bits;
-                }
-            }
-            page_index = 0;
-            ++word_index;
-        }
-    }
-
-    /**
-     * Notify rasterizer about changes in the CPU tracking state of a word in the buffer
-     *
-     * @param word_index   Index to the word to notify to the rasterizer
-     * @param current_bits Current state of the word
-     * @param new_bits     New state of the word
-     *
-     * @tparam add_to_rasterizer True when the rasterizer should start tracking the new pages
-     */
-    template <bool add_to_rasterizer>
-    void NotifyRasterizer(u64 word_index, u64 current_bits, u64 new_bits) const {
-        u64 changed_bits = (add_to_rasterizer ? current_bits : ~current_bits) & new_bits;
-        VAddr addr = cpu_addr + word_index * BYTES_PER_WORD;
-        while (changed_bits != 0) {
-            const int empty_bits = std::countr_zero(changed_bits);
-            addr += empty_bits * BYTES_PER_PAGE;
-            changed_bits >>= empty_bits;
-
-            const u32 continuous_bits = std::countr_one(changed_bits);
-            const u64 size = continuous_bits * BYTES_PER_PAGE;
-            const VAddr begin_addr = addr;
-            addr += size;
-            changed_bits = continuous_bits < PAGES_PER_WORD ? (changed_bits >> continuous_bits) : 0;
-            rasterizer->UpdatePagesCachedCount(begin_addr, size, add_to_rasterizer ? 1 : -1);
-        }
-    }
-
-    /**
-     * Loop over each page in the given range, turn off those bits and notify the rasterizer if
-     * needed. Call the given function on each turned off range.
-     *
-     * @param query_cpu_range Base CPU address to loop over
-     * @param size            Size in bytes of the CPU range to loop over
-     * @param func            Function to call for each turned off region
-     */
-    template <Type type, typename Func>
-    void ForEachModifiedRange(VAddr query_cpu_range, s64 size, bool clear, Func&& func) {
-        static_assert(type != Type::Untracked);
-
-        const s64 difference = query_cpu_range - cpu_addr;
-        const u64 query_begin = std::max<s64>(difference, 0);
-        size += std::min<s64>(difference, 0);
-        if (query_begin >= SizeBytes() || size < 0) {
-            return;
-        }
-        u64* const untracked_words = Array<Type::Untracked>();
-        u64* const state_words = Array<type>();
-        const u64 query_end = query_begin + std::min(static_cast<u64>(size), SizeBytes());
-        u64* const words_begin = state_words + query_begin / BYTES_PER_WORD;
-        u64* const words_end = state_words + Common::DivCeil(query_end, BYTES_PER_WORD);
-
-        const auto modified = [](u64 word) { return word != 0; };
-        const auto first_modified_word = std::find_if(words_begin, words_end, modified);
-        if (first_modified_word == words_end) {
-            // Exit early when the buffer is not modified
-            return;
-        }
-        const auto last_modified_word = std::find_if_not(first_modified_word, words_end, modified);
-
-        const u64 word_index_begin = std::distance(state_words, first_modified_word);
-        const u64 word_index_end = std::distance(state_words, last_modified_word);
-
-        const unsigned local_page_begin = std::countr_zero(*first_modified_word);
-        const unsigned local_page_end =
-            static_cast<unsigned>(PAGES_PER_WORD) - std::countl_zero(last_modified_word[-1]);
-        const u64 word_page_begin = word_index_begin * PAGES_PER_WORD;
-        const u64 word_page_end = (word_index_end - 1) * PAGES_PER_WORD;
-        const u64 query_page_begin = query_begin / BYTES_PER_PAGE;
-        const u64 query_page_end = Common::DivCeil(query_end, BYTES_PER_PAGE);
-        const u64 page_index_begin = std::max(word_page_begin + local_page_begin, query_page_begin);
-        const u64 page_index_end = std::min(word_page_end + local_page_end, query_page_end);
-        const u64 first_word_page_begin = page_index_begin % PAGES_PER_WORD;
-        const u64 last_word_page_end = (page_index_end - 1) % PAGES_PER_WORD + 1;
-
-        u64 page_begin = first_word_page_begin;
-        u64 current_base = 0;
-        u64 current_size = 0;
-        bool on_going = false;
-        for (u64 word_index = word_index_begin; word_index < word_index_end; ++word_index) {
-            const bool is_last_word = word_index + 1 == word_index_end;
-            const u64 page_end = is_last_word ? last_word_page_end : PAGES_PER_WORD;
-            const u64 right_offset = page_begin;
-            const u64 left_offset = PAGES_PER_WORD - page_end;
-            u64 bits = ~u64{0};
-            bits = (bits >> right_offset) << right_offset;
-            bits = (bits << left_offset) >> left_offset;
-
-            const u64 current_word = state_words[word_index] & bits;
-            if (clear) {
-                state_words[word_index] &= ~bits;
-            }
-
-            if constexpr (type == Type::CPU) {
-                const u64 current_bits = untracked_words[word_index] & bits;
-                untracked_words[word_index] &= ~bits;
-                NotifyRasterizer<true>(word_index, current_bits, ~u64{0});
-            }
-            // Exclude CPU modified pages when visiting GPU pages
-            const u64 word = current_word & ~(type == Type::GPU ? untracked_words[word_index] : 0);
-            u64 page = page_begin;
-            page_begin = 0;
-
-            while (page < page_end) {
-                const int empty_bits = std::countr_zero(word >> page);
-                if (on_going && empty_bits != 0) {
-                    InvokeModifiedRange(func, current_size, current_base);
-                    current_size = 0;
-                    on_going = false;
-                }
-                if (empty_bits == PAGES_PER_WORD) {
-                    break;
-                }
-                page += empty_bits;
-
-                const int continuous_bits = std::countr_one(word >> page);
-                if (!on_going && continuous_bits != 0) {
-                    current_base = word_index * PAGES_PER_WORD + page;
-                    on_going = true;
-                }
-                current_size += continuous_bits;
-                page += continuous_bits;
-            }
-        }
-        if (on_going && current_size > 0) {
-            InvokeModifiedRange(func, current_size, current_base);
-        }
-    }
-
-    template <typename Func>
-    void InvokeModifiedRange(Func&& func, u64 current_size, u64 current_base) {
-        const u64 current_size_bytes = current_size * BYTES_PER_PAGE;
-        const u64 offset_begin = current_base * BYTES_PER_PAGE;
-        const u64 offset_end = std::min(offset_begin + current_size_bytes, SizeBytes());
-        func(offset_begin, offset_end - offset_begin);
-    }
-
-    /**
-     * Returns true when a region has been modified
-     *
-     * @param offset Offset in bytes from the start of the buffer
-     * @param size   Size in bytes of the region to query for modifications
-     */
-    template <Type type>
-    [[nodiscard]] bool IsRegionModified(u64 offset, u64 size) const noexcept {
-        static_assert(type != Type::Untracked);
-
-        const u64* const untracked_words = Array<Type::Untracked>();
-        const u64* const state_words = Array<type>();
-        const u64 num_query_words = size / BYTES_PER_WORD + 1;
-        const u64 word_begin = offset / BYTES_PER_WORD;
-        const u64 word_end = std::min<u64>(word_begin + num_query_words, NumWords());
-        const u64 page_limit = Common::DivCeil(offset + size, BYTES_PER_PAGE);
-        u64 page_index = (offset / BYTES_PER_PAGE) % PAGES_PER_WORD;
-        for (u64 word_index = word_begin; word_index < word_end; ++word_index, page_index = 0) {
-            const u64 off_word = type == Type::GPU ? untracked_words[word_index] : 0;
-            const u64 word = state_words[word_index] & ~off_word;
-            if (word == 0) {
-                continue;
-            }
-            const u64 page_end = std::min((word_index + 1) * PAGES_PER_WORD, page_limit);
-            const u64 local_page_end = page_end % PAGES_PER_WORD;
-            const u64 page_end_shift = (PAGES_PER_WORD - local_page_end) % PAGES_PER_WORD;
-            if (((word >> page_index) << page_index) << page_end_shift != 0) {
-                return true;
-            }
-        }
-        return false;
-    }
-
-    /**
-     * Returns a begin end pair with the inclusive modified region
-     *
-     * @param offset Offset in bytes from the start of the buffer
-     * @param size   Size in bytes of the region to query for modifications
-     */
-    template <Type type>
-    [[nodiscard]] std::pair<u64, u64> ModifiedRegion(u64 offset, u64 size) const noexcept {
-        static_assert(type != Type::Untracked);
-
-        const u64* const untracked_words = Array<Type::Untracked>();
-        const u64* const state_words = Array<type>();
-        const u64 num_query_words = size / BYTES_PER_WORD + 1;
-        const u64 word_begin = offset / BYTES_PER_WORD;
-        const u64 word_end = std::min<u64>(word_begin + num_query_words, NumWords());
-        const u64 page_base = offset / BYTES_PER_PAGE;
-        const u64 page_limit = Common::DivCeil(offset + size, BYTES_PER_PAGE);
-        u64 begin = std::numeric_limits<u64>::max();
-        u64 end = 0;
-        for (u64 word_index = word_begin; word_index < word_end; ++word_index) {
-            const u64 off_word = type == Type::GPU ? untracked_words[word_index] : 0;
-            const u64 word = state_words[word_index] & ~off_word;
-            if (word == 0) {
-                continue;
-            }
-            const u64 local_page_begin = std::countr_zero(word);
-            const u64 local_page_end = PAGES_PER_WORD - std::countl_zero(word);
-            const u64 page_index = word_index * PAGES_PER_WORD;
-            const u64 page_begin = std::max(page_index + local_page_begin, page_base);
-            const u64 page_end = std::min(page_index + local_page_end, page_limit);
-            begin = std::min(begin, page_begin);
-            end = std::max(end, page_end);
-        }
-        static constexpr std::pair<u64, u64> EMPTY{0, 0};
-        return begin < end ? std::make_pair(begin * BYTES_PER_PAGE, end * BYTES_PER_PAGE) : EMPTY;
-    }
-
-    /// Returns the number of words of the buffer
-    [[nodiscard]] size_t NumWords() const noexcept {
-        return words.NumWords();
-    }
-
-    /// Returns true when the buffer fits in the small vector optimization
-    [[nodiscard]] bool IsShort() const noexcept {
-        return words.IsShort();
-    }
-
-    RasterizerInterface* rasterizer = nullptr;
     VAddr cpu_addr = 0;
-    Words words;
+    WordManager<RasterizerInterface> word_manager;
     BufferFlagBits flags{};
     int stream_score = 0;
     size_t lru_id = SIZE_MAX;
```
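All of the word/page bookkeeping deleted above survives in the new `WordManager` (`word_manager.h`); `BufferBase` keeps its public interface and forwards to it. The core trick is unchanged: one `u64` word tracks 64 pages, so marking a byte range dirty reduces to building a bit mask per word. A standalone sketch of that mask math, simplified from the deleted `ChangeRegionState` (the 4 KiB page size here is illustrative, not yuzu's constant):

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

// One u64 word tracks 64 pages, one bit per page.
constexpr std::uint64_t PAGES_PER_WORD = 64;
constexpr std::uint64_t PAGE_SIZE = 4096;

// 'words' must hold ceil(num_pages / PAGES_PER_WORD) entries.
void MarkRegion(std::vector<std::uint64_t>& words, std::uint64_t offset, std::uint64_t size) {
    const std::uint64_t begin_page = offset / PAGE_SIZE;
    const std::uint64_t end_page = (offset + size + PAGE_SIZE - 1) / PAGE_SIZE;
    for (std::uint64_t page = begin_page; page < end_page;) {
        const std::uint64_t word_index = page / PAGES_PER_WORD;
        const std::uint64_t first = page % PAGES_PER_WORD; // first dirty bit in this word
        const std::uint64_t last =
            std::min(end_page - word_index * PAGES_PER_WORD, PAGES_PER_WORD);
        // Start from all ones, then trim the bits below 'first' and at or above 'last'.
        std::uint64_t bits = ~std::uint64_t{0};
        bits = (bits >> first) << first;
        const std::uint64_t shift = PAGES_PER_WORD - last;
        bits = (bits << shift) >> shift;
        words[word_index] |= bits; // unmarking uses &= ~bits instead
        page = (word_index + 1) * PAGES_PER_WORD; // jump to the next word boundary
    }
}
```

The double-shift pattern (`(bits >> first) << first`, then `(bits << shift) >> shift`) is the same one the deleted code used to trim a mask at both ends without branching.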
```diff
diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp
index a16308b60..40db243d2 100644
--- a/src/video_core/buffer_cache/buffer_cache.cpp
+++ b/src/video_core/buffer_cache/buffer_cache.cpp
@@ -1,5 +1,5 @@
-// SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project
-// SPDX-License-Identifier: GPL-2.0-or-later
+// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
 
 #include "common/microprofile.h"
 
```
```diff
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index abdc593df..a0701ce4e 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -1,482 +1,21 @@
-// SPDX-FileCopyrightText: Copyright 2019 yuzu Emulator Project
-// SPDX-License-Identifier: GPL-2.0-or-later
+// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
 
 #pragma once
 
 #include <algorithm>
-#include <array>
 #include <memory>
-#include <mutex>
 #include <numeric>
-#include <span>
-#include <vector>
-
-#include <boost/container/small_vector.hpp>
-#include <boost/icl/interval_set.hpp>
-
-#include "common/common_types.h"
-#include "common/div_ceil.h"
-#include "common/literals.h"
-#include "common/lru_cache.h"
-#include "common/microprofile.h"
-#include "common/polyfill_ranges.h"
-#include "common/scratch_buffer.h"
-#include "common/settings.h"
-#include "core/memory.h"
-#include "video_core/buffer_cache/buffer_base.h"
-#include "video_core/control/channel_state_cache.h"
-#include "video_core/delayed_destruction_ring.h"
-#include "video_core/dirty_flags.h"
-#include "video_core/engines/draw_manager.h"
-#include "video_core/engines/kepler_compute.h"
-#include "video_core/engines/maxwell_3d.h"
-#include "video_core/memory_manager.h"
-#include "video_core/rasterizer_interface.h"
-#include "video_core/surface.h"
-#include "video_core/texture_cache/slot_vector.h"
-#include "video_core/texture_cache/types.h"
 
-namespace VideoCommon {
-
-MICROPROFILE_DECLARE(GPU_PrepareBuffers);
-MICROPROFILE_DECLARE(GPU_BindUploadBuffers);
-MICROPROFILE_DECLARE(GPU_DownloadMemory);
-
-using BufferId = SlotId;
-
-using VideoCore::Surface::PixelFormat;
-using namespace Common::Literals;
-
-constexpr u32 NUM_VERTEX_BUFFERS = 32;
-constexpr u32 NUM_TRANSFORM_FEEDBACK_BUFFERS = 4;
-constexpr u32 NUM_GRAPHICS_UNIFORM_BUFFERS = 18;
-constexpr u32 NUM_COMPUTE_UNIFORM_BUFFERS = 8;
-constexpr u32 NUM_STORAGE_BUFFERS = 16;
-constexpr u32 NUM_TEXTURE_BUFFERS = 16;
-constexpr u32 NUM_STAGES = 5;
-
-enum class ObtainBufferSynchronize : u32 {
-    NoSynchronize = 0,
-    FullSynchronize = 1,
-    SynchronizeNoDirty = 2,
-};
-
-enum class ObtainBufferOperation : u32 {
-    DoNothing = 0,
-    MarkAsWritten = 1,
-    DiscardWrite = 2,
-    MarkQuery = 3,
-};
-
-using UniformBufferSizes = std::array<std::array<u32, NUM_GRAPHICS_UNIFORM_BUFFERS>, NUM_STAGES>;
-using ComputeUniformBufferSizes = std::array<u32, NUM_COMPUTE_UNIFORM_BUFFERS>;
-
-template <typename P>
-class BufferCache : public VideoCommon::ChannelSetupCaches<VideoCommon::ChannelInfo> {
-
-    // Page size for caching purposes.
-    // This is unrelated to the CPU page size and it can be changed as it seems optimal.
-    static constexpr u32 YUZU_PAGEBITS = 16;
-    static constexpr u64 YUZU_PAGESIZE = u64{1} << YUZU_PAGEBITS;
-
-    static constexpr bool IS_OPENGL = P::IS_OPENGL;
-    static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS =
-        P::HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS;
-    static constexpr bool HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT =
-        P::HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT;
-    static constexpr bool NEEDS_BIND_UNIFORM_INDEX = P::NEEDS_BIND_UNIFORM_INDEX;
-    static constexpr bool NEEDS_BIND_STORAGE_INDEX = P::NEEDS_BIND_STORAGE_INDEX;
-    static constexpr bool USE_MEMORY_MAPS = P::USE_MEMORY_MAPS;
-    static constexpr bool SEPARATE_IMAGE_BUFFERS_BINDINGS = P::SEPARATE_IMAGE_BUFFER_BINDINGS;
-
-    static constexpr BufferId NULL_BUFFER_ID{0};
-
-    static constexpr s64 DEFAULT_EXPECTED_MEMORY = 512_MiB;
-    static constexpr s64 DEFAULT_CRITICAL_MEMORY = 1_GiB;
-    static constexpr s64 TARGET_THRESHOLD = 4_GiB;
-
-    using Maxwell = Tegra::Engines::Maxwell3D::Regs;
-
-    using Runtime = typename P::Runtime;
-    using Buffer = typename P::Buffer;
-
-    using IntervalSet = boost::icl::interval_set<VAddr>;
-    using IntervalType = typename IntervalSet::interval_type;
-
-    struct Empty {};
-
-    struct OverlapResult {
-        std::vector<BufferId> ids;
-        VAddr begin;
-        VAddr end;
-        bool has_stream_leap = false;
-    };
-
-    struct Binding {
-        VAddr cpu_addr{};
-        u32 size{};
-        BufferId buffer_id;
-    };
-
-    struct TextureBufferBinding : Binding {
-        PixelFormat format;
-    };
-
-    static constexpr Binding NULL_BINDING{
-        .cpu_addr = 0,
-        .size = 0,
-        .buffer_id = NULL_BUFFER_ID,
-    };
-
-public:
-    static constexpr u32 DEFAULT_SKIP_CACHE_SIZE = static_cast<u32>(4_KiB);
-
-    explicit BufferCache(VideoCore::RasterizerInterface& rasterizer_,
-                         Core::Memory::Memory& cpu_memory_, Runtime& runtime_);
-
-    void TickFrame();
-
-    void WriteMemory(VAddr cpu_addr, u64 size);
-
-    void CachedWriteMemory(VAddr cpu_addr, u64 size);
-
-    void DownloadMemory(VAddr cpu_addr, u64 size);
-
-    bool InlineMemory(VAddr dest_address, size_t copy_size, std::span<const u8> inlined_buffer);
-
-    void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size);
-
-    void DisableGraphicsUniformBuffer(size_t stage, u32 index);
-
-    void UpdateGraphicsBuffers(bool is_indexed);
-
-    void UpdateComputeBuffers();
-
-    void BindHostGeometryBuffers(bool is_indexed);
-
-    void BindHostStageBuffers(size_t stage);
-
-    void BindHostComputeBuffers();
-
-    void SetUniformBuffersState(const std::array<u32, NUM_STAGES>& mask,
-                                const UniformBufferSizes* sizes);
-
-    void SetComputeUniformBufferState(u32 mask, const ComputeUniformBufferSizes* sizes);
-
-    void UnbindGraphicsStorageBuffers(size_t stage);
-
-    void BindGraphicsStorageBuffer(size_t stage, size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset,
-                                   bool is_written);
-
-    void UnbindGraphicsTextureBuffers(size_t stage);
-
-    void BindGraphicsTextureBuffer(size_t stage, size_t tbo_index, GPUVAddr gpu_addr, u32 size,
-                                   PixelFormat format, bool is_written, bool is_image);
-
-    void UnbindComputeStorageBuffers();
-
-    void BindComputeStorageBuffer(size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset,
-                                  bool is_written);
-
-    void UnbindComputeTextureBuffers();
-
-    void BindComputeTextureBuffer(size_t tbo_index, GPUVAddr gpu_addr, u32 size, PixelFormat format,
-                                  bool is_written, bool is_image);
-
-    void FlushCachedWrites();
-
-    /// Return true when there are uncommitted buffers to be downloaded
-    [[nodiscard]] bool HasUncommittedFlushes() const noexcept;
-
-    void AccumulateFlushes();
-
-    /// Return true when the caller should wait for async downloads
-    [[nodiscard]] bool ShouldWaitAsyncFlushes() const noexcept;
-
-    /// Commit asynchronous downloads
-    void CommitAsyncFlushes();
-    void CommitAsyncFlushesHigh();
-
-    /// Pop asynchronous downloads
-    void PopAsyncFlushes();
-
-    bool DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 amount);
-
-    bool DMAClear(GPUVAddr src_address, u64 amount, u32 value);
-
-    [[nodiscard]] std::pair<Buffer*, u32> ObtainBuffer(GPUVAddr gpu_addr, u32 size,
-                                                       ObtainBufferSynchronize sync_info,
-                                                       ObtainBufferOperation post_op);
-
-    /// Return true when a CPU region is modified from the GPU
-    [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size);
-
-    /// Return true when a region is registered on the cache
-    [[nodiscard]] bool IsRegionRegistered(VAddr addr, size_t size);
-
-    /// Return true when a CPU region is modified from the CPU
-    [[nodiscard]] bool IsRegionCpuModified(VAddr addr, size_t size);
-
-    void SetDrawIndirect(
-        const Tegra::Engines::DrawManager::IndirectParams* current_draw_indirect_) {
-        current_draw_indirect = current_draw_indirect_;
-    }
-
-    [[nodiscard]] std::pair<Buffer*, u32> GetDrawIndirectCount();
-
-    [[nodiscard]] std::pair<Buffer*, u32> GetDrawIndirectBuffer();
-
-    std::recursive_mutex mutex;
-    Runtime& runtime;
-
-private:
-    template <typename Func>
-    static void ForEachEnabledBit(u32 enabled_mask, Func&& func) {
-        for (u32 index = 0; enabled_mask != 0; ++index, enabled_mask >>= 1) {
-            const int disabled_bits = std::countr_zero(enabled_mask);
-            index += disabled_bits;
-            enabled_mask >>= disabled_bits;
-            func(index);
-        }
-    }
-
-    template <typename Func>
-    void ForEachBufferInRange(VAddr cpu_addr, u64 size, Func&& func) {
-        const u64 page_end = Common::DivCeil(cpu_addr + size, YUZU_PAGESIZE);
-        for (u64 page = cpu_addr >> YUZU_PAGEBITS; page < page_end;) {
-            const BufferId buffer_id = page_table[page];
-            if (!buffer_id) {
-                ++page;
-                continue;
-            }
-            Buffer& buffer = slot_buffers[buffer_id];
-            func(buffer_id, buffer);
-
-            const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes();
-            page = Common::DivCeil(end_addr, YUZU_PAGESIZE);
-        }
-    }
-
-    template <typename Func>
-    void ForEachWrittenRange(VAddr cpu_addr, u64 size, Func&& func) {
-        const VAddr start_address = cpu_addr;
-        const VAddr end_address = start_address + size;
-        const VAddr search_base =
-            static_cast<VAddr>(std::min<s64>(0LL, static_cast<s64>(start_address - size)));
-        const IntervalType search_interval{search_base, search_base + 1};
-        auto it = common_ranges.lower_bound(search_interval);
-        if (it == common_ranges.end()) {
-            it = common_ranges.begin();
-        }
-        for (; it != common_ranges.end(); it++) {
-            VAddr inter_addr_end = it->upper();
-            VAddr inter_addr = it->lower();
-            if (inter_addr >= end_address) {
-                break;
-            }
-            if (inter_addr_end <= start_address) {
-                continue;
-            }
-            if (inter_addr_end > end_address) {
-                inter_addr_end = end_address;
-            }
-            if (inter_addr < start_address) {
-                inter_addr = start_address;
-            }
-            func(inter_addr, inter_addr_end);
-        }
-    }
-
-    static bool IsRangeGranular(VAddr cpu_addr, size_t size) {
-        return (cpu_addr & ~Core::Memory::YUZU_PAGEMASK) ==
-               ((cpu_addr + size) & ~Core::Memory::YUZU_PAGEMASK);
-    }
-
-    void RunGarbageCollector();
-
-    void BindHostIndexBuffer();
-
-    void BindHostVertexBuffers();
-
-    void BindHostDrawIndirectBuffers();
-
-    void BindHostGraphicsUniformBuffers(size_t stage);
-
-    void BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 binding_index, bool needs_bind);
-
-    void BindHostGraphicsStorageBuffers(size_t stage);
-
-    void BindHostGraphicsTextureBuffers(size_t stage);
-
-    void BindHostTransformFeedbackBuffers();
-
-    void BindHostComputeUniformBuffers();
-
-    void BindHostComputeStorageBuffers();
-
-    void BindHostComputeTextureBuffers();
-
-    void DoUpdateGraphicsBuffers(bool is_indexed);
-
-    void DoUpdateComputeBuffers();
-
-    void UpdateIndexBuffer();
-
-    void UpdateVertexBuffers();
-
-    void UpdateVertexBuffer(u32 index);
-
-    void UpdateDrawIndirect();
-
-    void UpdateUniformBuffers(size_t stage);
-
-    void UpdateStorageBuffers(size_t stage);
-
-    void UpdateTextureBuffers(size_t stage);
-
-    void UpdateTransformFeedbackBuffers();
-
-    void UpdateTransformFeedbackBuffer(u32 index);
-
-    void UpdateComputeUniformBuffers();
-
-    void UpdateComputeStorageBuffers();
-
-    void UpdateComputeTextureBuffers();
-
-    void MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 size);
-
-    [[nodiscard]] BufferId FindBuffer(VAddr cpu_addr, u32 size);
-
-    [[nodiscard]] OverlapResult ResolveOverlaps(VAddr cpu_addr, u32 wanted_size);
-
-    void JoinOverlap(BufferId new_buffer_id, BufferId overlap_id, bool accumulate_stream_score);
-
-    [[nodiscard]] BufferId CreateBuffer(VAddr cpu_addr, u32 wanted_size);
-
-    void Register(BufferId buffer_id);
-
-    void Unregister(BufferId buffer_id);
-
-    template <bool insert>
-    void ChangeRegister(BufferId buffer_id);
-
-    void TouchBuffer(Buffer& buffer, BufferId buffer_id) noexcept;
-
-    bool SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size);
-
-    bool SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size);
-
-    void UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy,
-                      std::span<BufferCopy> copies);
-
-    void ImmediateUploadMemory(Buffer& buffer, u64 largest_copy,
-                               std::span<const BufferCopy> copies);
-
-    void MappedUploadMemory(Buffer& buffer, u64 total_size_bytes, std::span<BufferCopy> copies);
-
-    void DownloadBufferMemory(Buffer& buffer_id);
-
-    void DownloadBufferMemory(Buffer& buffer_id, VAddr cpu_addr, u64 size);
-
-    void DeleteBuffer(BufferId buffer_id);
-
-    void NotifyBufferDeletion();
-
-    [[nodiscard]] Binding StorageBufferBinding(GPUVAddr ssbo_addr, u32 cbuf_index,
-                                               bool is_written = false) const;
-
-    [[nodiscard]] TextureBufferBinding GetTextureBufferBinding(GPUVAddr gpu_addr, u32 size,
-                                                               PixelFormat format);
-
-    [[nodiscard]] std::span<const u8> ImmediateBufferWithData(VAddr cpu_addr, size_t size);
-
-    [[nodiscard]] std::span<u8> ImmediateBuffer(size_t wanted_capacity);
-
-    [[nodiscard]] bool HasFastUniformBufferBound(size_t stage, u32 binding_index) const noexcept;
-
-    void ClearDownload(IntervalType subtract_interval);
-
-    VideoCore::RasterizerInterface& rasterizer;
-    Core::Memory::Memory& cpu_memory;
-
-    SlotVector<Buffer> slot_buffers;
-    DelayedDestructionRing<Buffer, 8> delayed_destruction_ring;
-
-    const Tegra::Engines::DrawManager::IndirectParams* current_draw_indirect{};
-
-    u32 last_index_count = 0;
-
-    Binding index_buffer;
-    std::array<Binding, NUM_VERTEX_BUFFERS> vertex_buffers;
-    std::array<std::array<Binding, NUM_GRAPHICS_UNIFORM_BUFFERS>, NUM_STAGES> uniform_buffers;
-    std::array<std::array<Binding, NUM_STORAGE_BUFFERS>, NUM_STAGES> storage_buffers;
-    std::array<std::array<TextureBufferBinding, NUM_TEXTURE_BUFFERS>, NUM_STAGES> texture_buffers;
-    std::array<Binding, NUM_TRANSFORM_FEEDBACK_BUFFERS> transform_feedback_buffers;
-    Binding count_buffer_binding;
-    Binding indirect_buffer_binding;
-
-    std::array<Binding, NUM_COMPUTE_UNIFORM_BUFFERS> compute_uniform_buffers;
-    std::array<Binding, NUM_STORAGE_BUFFERS> compute_storage_buffers;
-    std::array<TextureBufferBinding, NUM_TEXTURE_BUFFERS> compute_texture_buffers;
-
-    std::array<u32, NUM_STAGES> enabled_uniform_buffer_masks{};
-    u32 enabled_compute_uniform_buffer_mask = 0;
-
-    const UniformBufferSizes* uniform_buffer_sizes{};
-    const ComputeUniformBufferSizes* compute_uniform_buffer_sizes{};
-
-    std::array<u32, NUM_STAGES> enabled_storage_buffers{};
-    std::array<u32, NUM_STAGES> written_storage_buffers{};
-    u32 enabled_compute_storage_buffers = 0;
-    u32 written_compute_storage_buffers = 0;
-
-    std::array<u32, NUM_STAGES> enabled_texture_buffers{};
-    std::array<u32, NUM_STAGES> written_texture_buffers{};
-    std::array<u32, NUM_STAGES> image_texture_buffers{};
-    u32 enabled_compute_texture_buffers = 0;
-    u32 written_compute_texture_buffers = 0;
-    u32 image_compute_texture_buffers = 0;
-
-    std::array<u32, 16> uniform_cache_hits{};
-    std::array<u32, 16> uniform_cache_shots{};
-
-    u32 uniform_buffer_skip_cache_size = DEFAULT_SKIP_CACHE_SIZE;
-
-    bool has_deleted_buffers = false;
-
-    std::conditional_t<HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS, std::array<u32, NUM_STAGES>, Empty>
-        dirty_uniform_buffers{};
-    std::conditional_t<IS_OPENGL, std::array<u32, NUM_STAGES>, Empty> fast_bound_uniform_buffers{};
-    std::conditional_t<HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS,
-                       std::array<std::array<u32, NUM_GRAPHICS_UNIFORM_BUFFERS>, NUM_STAGES>, Empty>
-        uniform_buffer_binding_sizes{};
-
-    std::vector<BufferId> cached_write_buffer_ids;
-
-    IntervalSet uncommitted_ranges;
-    IntervalSet common_ranges;
-    std::deque<IntervalSet> committed_ranges;
-
-    Common::ScratchBuffer<u8> immediate_buffer_alloc;
-
-    struct LRUItemParams {
-        using ObjectType = BufferId;
-        using TickType = u64;
-    };
-    Common::LeastRecentlyUsedCache<LRUItemParams> lru_cache;
-    u64 frame_tick = 0;
-    u64 total_used_memory = 0;
-    u64 minimum_memory = 0;
-    u64 critical_memory = 0;
-
-    std::array<BufferId, ((1ULL << 39) >> YUZU_PAGEBITS)> page_table;
-};
+#include "video_core/buffer_cache/buffer_cache_base.h"
+
+namespace VideoCommon {
 
 template <class P>
 BufferCache<P>::BufferCache(VideoCore::RasterizerInterface& rasterizer_,
                             Core::Memory::Memory& cpu_memory_, Runtime& runtime_)
-    : runtime{runtime_}, rasterizer{rasterizer_}, cpu_memory{cpu_memory_} {
+    : runtime{runtime_}, rasterizer{rasterizer_}, cpu_memory{cpu_memory_}, memory_tracker{
+                                                                               rasterizer} {
     // Ensure the first slot is used for the null buffer
     void(slot_buffers.insert(runtime, NullBufferParams{}));
     common_ranges.clear();
| @@ -547,19 +86,18 @@ void BufferCache<P>::TickFrame() { | |||
| 547 | 86 | ||
| 548 | template <class P> | 87 | template <class P> |
| 549 | void BufferCache<P>::WriteMemory(VAddr cpu_addr, u64 size) { | 88 | void BufferCache<P>::WriteMemory(VAddr cpu_addr, u64 size) { |
| 550 | ForEachBufferInRange(cpu_addr, size, [&](BufferId, Buffer& buffer) { | 89 | memory_tracker.MarkRegionAsCpuModified(cpu_addr, size); |
| 551 | buffer.MarkRegionAsCpuModified(cpu_addr, size); | 90 | const IntervalType subtract_interval{cpu_addr, cpu_addr + size}; |
| 552 | }); | 91 | ClearDownload(subtract_interval); |
| 92 | common_ranges.subtract(subtract_interval); | ||
| 553 | } | 93 | } |
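WriteMemory now delegates page tracking to the memory tracker and, since the CPU overwrote the range, drops it from every set of pending GPU downloads. A minimal standalone sketch of the boost::icl::interval_set bookkeeping this relies on (VAddr and the aliases are assumed stand-ins for the ones declared in buffer_cache_base.h below):

    #include <boost/icl/interval_set.hpp>
    #include <cstdint>
    #include <iostream>

    using VAddr = std::uint64_t;
    using IntervalSet = boost::icl::interval_set<VAddr>;
    using IntervalType = IntervalSet::interval_type;

    int main() {
        IntervalSet common_ranges;
        common_ranges.add(IntervalType{0x1000, 0x3000}); // GPU-written, pending download
        // The CPU overwrites [0x1800, 0x2000): that sub-span must not be downloaded.
        common_ranges.subtract(IntervalType{0x1800, 0x2000});
        for (const auto& interval : common_ranges) {
            std::cout << std::hex << interval.lower() << '-' << interval.upper() << '\n';
        }
        // Prints "1000-1800" and "2000-3000": the set splits around the CPU write.
    }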
| 554 | 94 | ||
| 555 | template <class P> | 95 | template <class P> |
| 556 | void BufferCache<P>::CachedWriteMemory(VAddr cpu_addr, u64 size) { | 96 | void BufferCache<P>::CachedWriteMemory(VAddr cpu_addr, u64 size) { |
| 557 | ForEachBufferInRange(cpu_addr, size, [&](BufferId buffer_id, Buffer& buffer) { | 97 | memory_tracker.CachedCpuWrite(cpu_addr, size); |
| 558 | if (!buffer.HasCachedWrites()) { | 98 | const IntervalType add_interval{Common::AlignDown(cpu_addr, YUZU_PAGESIZE), |
| 559 | cached_write_buffer_ids.push_back(buffer_id); | 99 | Common::AlignUp(cpu_addr + size, YUZU_PAGESIZE)}; |
| 560 | } | 100 | cached_ranges.add(add_interval); |
| 561 | buffer.CachedCpuWrite(cpu_addr, size); | ||
| 562 | }); | ||
| 563 | } | 101 | } |
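The cached range is widened to whole pages before being remembered, matching the granularity of the write tracking. A sketch of the rounding with the helpers written out (this is how Common::AlignDown/AlignUp behave for power-of-two alignments):

    #include <cstdint>

    constexpr std::uint64_t PAGE_SIZE = 0x1000; // YUZU_PAGESIZE is 4 KiB

    constexpr std::uint64_t AlignDown(std::uint64_t value, std::uint64_t align) {
        return value & ~(align - 1);
    }

    constexpr std::uint64_t AlignUp(std::uint64_t value, std::uint64_t align) {
        return (value + align - 1) & ~(align - 1);
    }

    // A 16-byte write at 0x1234 is remembered as the whole page [0x1000, 0x2000).
    static_assert(AlignDown(0x1234, PAGE_SIZE) == 0x1000);
    static_assert(AlignUp(0x1234 + 0x10, PAGE_SIZE) == 0x2000);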
| 564 | 102 | ||
| 565 | template <class P> | 103 | template <class P> |
| @@ -572,6 +110,9 @@ void BufferCache<P>::DownloadMemory(VAddr cpu_addr, u64 size) { | |||
| 572 | template <class P> | 110 | template <class P> |
| 573 | void BufferCache<P>::ClearDownload(IntervalType subtract_interval) { | 111 | void BufferCache<P>::ClearDownload(IntervalType subtract_interval) { |
| 574 | uncommitted_ranges.subtract(subtract_interval); | 112 | uncommitted_ranges.subtract(subtract_interval); |
| 113 | for (auto& interval_set : async_downloads) { | ||
| 114 | interval_set.subtract(subtract_interval); | ||
| 115 | } | ||
| 575 | for (auto& interval_set : committed_ranges) { | 116 | for (auto& interval_set : committed_ranges) { |
| 576 | interval_set.subtract(subtract_interval); | 117 | interval_set.subtract(subtract_interval); |
| 577 | } | 118 | } |
| @@ -611,15 +152,19 @@ bool BufferCache<P>::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am | |||
| 611 | }}; | 152 | }}; |
| 612 | 153 | ||
| 613 | boost::container::small_vector<IntervalType, 4> tmp_intervals; | 154 | boost::container::small_vector<IntervalType, 4> tmp_intervals; |
| 155 | const bool is_high_accuracy = | ||
| 156 | Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::High; | ||
| 614 | auto mirror = [&](VAddr base_address, VAddr base_address_end) { | 157 | auto mirror = [&](VAddr base_address, VAddr base_address_end) { |
| 615 | const u64 size = base_address_end - base_address; | 158 | const u64 size = base_address_end - base_address; |
| 616 | const VAddr diff = base_address - *cpu_src_address; | 159 | const VAddr diff = base_address - *cpu_src_address; |
| 617 | const VAddr new_base_address = *cpu_dest_address + diff; | 160 | const VAddr new_base_address = *cpu_dest_address + diff; |
| 618 | const IntervalType add_interval{new_base_address, new_base_address + size}; | 161 | const IntervalType add_interval{new_base_address, new_base_address + size}; |
| 619 | uncommitted_ranges.add(add_interval); | ||
| 620 | tmp_intervals.push_back(add_interval); | 162 | tmp_intervals.push_back(add_interval); |
| 163 | if (is_high_accuracy) { | ||
| 164 | uncommitted_ranges.add(add_interval); | ||
| 165 | } | ||
| 621 | }; | 166 | }; |
| 622 | ForEachWrittenRange(*cpu_src_address, amount, mirror); | 167 | ForEachInRangeSet(common_ranges, *cpu_src_address, amount, mirror); |
| 623 | // This subtraction in this order is important for overlapping copies. | 168 | // This subtraction in this order is important for overlapping copies. |
| 624 | common_ranges.subtract(subtract_interval); | 169 | common_ranges.subtract(subtract_interval); |
| 625 | const bool has_new_downloads = tmp_intervals.size() != 0; | 170 | const bool has_new_downloads = tmp_intervals.size() != 0; |
| @@ -628,7 +173,7 @@ bool BufferCache<P>::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am | |||
| 628 | } | 173 | } |
| 629 | runtime.CopyBuffer(dest_buffer, src_buffer, copies); | 174 | runtime.CopyBuffer(dest_buffer, src_buffer, copies); |
| 630 | if (has_new_downloads) { | 175 | if (has_new_downloads) { |
| 631 | dest_buffer.MarkRegionAsGpuModified(*cpu_dest_address, amount); | 176 | memory_tracker.MarkRegionAsGpuModified(*cpu_dest_address, amount); |
| 632 | } | 177 | } |
| 633 | std::vector<u8> tmp_buffer(amount); | 178 | std::vector<u8> tmp_buffer(amount); |
| 634 | cpu_memory.ReadBlockUnsafe(*cpu_src_address, tmp_buffer.data(), amount); | 179 | cpu_memory.ReadBlockUnsafe(*cpu_src_address, tmp_buffer.data(), amount); |
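The mirror lambda re-expresses every GPU-written range intersecting the copy source in the destination's address space, so the destination inherits the pending-download status of those bytes; under High accuracy the mirrored range is also queued for flush-back. The address translation in isolation (a constexpr sketch, names assumed):

    #include <cstdint>
    #include <utility>

    using VAddr = std::uint64_t;

    // Maps [base_address, base_address_end), a GPU-written range inside the DMA
    // source window, to the equivalent range inside the destination window.
    constexpr std::pair<VAddr, VAddr> MirrorRange(VAddr cpu_src_address, VAddr cpu_dest_address,
                                                  VAddr base_address, VAddr base_address_end) {
        const VAddr diff = base_address - cpu_src_address;
        const VAddr size = base_address_end - base_address;
        const VAddr new_base_address = cpu_dest_address + diff;
        return {new_base_address, new_base_address + size};
    }

    static_assert(MirrorRange(0x1000, 0x9000, 0x1200, 0x1300) ==
                  std::pair<VAddr, VAddr>{0x9200, 0x9300});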
| @@ -866,23 +411,24 @@ void BufferCache<P>::BindComputeTextureBuffer(size_t tbo_index, GPUVAddr gpu_add | |||
| 866 | 411 | ||
| 867 | template <class P> | 412 | template <class P> |
| 868 | void BufferCache<P>::FlushCachedWrites() { | 413 | void BufferCache<P>::FlushCachedWrites() { |
| 869 | for (const BufferId buffer_id : cached_write_buffer_ids) { | ||
| 870 | slot_buffers[buffer_id].FlushCachedWrites(); | ||
| 871 | } | ||
| 872 | cached_write_buffer_ids.clear(); | 414 | cached_write_buffer_ids.clear(); |
| 415 | memory_tracker.FlushCachedWrites(); | ||
| 416 | /*for (auto& interval : cached_ranges) { | ||
| 417 | VAddr cpu_addr = interval.lower(); | ||
| 418 | const std::size_t size = interval.upper() - interval.lower(); | ||
| 419 | memory_tracker.FlushCachedWrites(cpu_addr, size); | ||
| 420 | // common_ranges.subtract(interval); | ||
| 421 | }*/ | ||
| 422 | cached_ranges.clear(); | ||
| 873 | } | 423 | } |
| 874 | 424 | ||
| 875 | template <class P> | 425 | template <class P> |
| 876 | bool BufferCache<P>::HasUncommittedFlushes() const noexcept { | 426 | bool BufferCache<P>::HasUncommittedFlushes() const noexcept { |
| 877 | return !uncommitted_ranges.empty() || !committed_ranges.empty(); | 427 | return !uncommitted_ranges.empty() || !committed_ranges.empty() || !pending_queries.empty(); |
| 878 | } | 428 | } |
| 879 | 429 | ||
| 880 | template <class P> | 430 | template <class P> |
| 881 | void BufferCache<P>::AccumulateFlushes() { | 431 | void BufferCache<P>::AccumulateFlushes() { |
| 882 | if (Settings::values.gpu_accuracy.GetValue() != Settings::GPUAccuracy::High) { | ||
| 883 | uncommitted_ranges.clear(); | ||
| 884 | return; | ||
| 885 | } | ||
| 886 | if (uncommitted_ranges.empty()) { | 432 | if (uncommitted_ranges.empty()) { |
| 887 | return; | 433 | return; |
| 888 | } | 434 | } |
| @@ -891,7 +437,8 @@ void BufferCache<P>::AccumulateFlushes() { | |||
| 891 | 437 | ||
| 892 | template <class P> | 438 | template <class P> |
| 893 | bool BufferCache<P>::ShouldWaitAsyncFlushes() const noexcept { | 439 | bool BufferCache<P>::ShouldWaitAsyncFlushes() const noexcept { |
| 894 | return false; | 440 | return (!async_buffers.empty() && async_buffers.front().has_value()) || |
| 441 | (!query_async_buffers.empty() && query_async_buffers.front().has_value()); | ||
| 895 | } | 442 | } |
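Each commit now pushes exactly one entry per batch: a staging buffer when there was work, an empty optional when there was none, so the front of each deque doubles as a completion marker. The pattern in isolation (StagingBuffer is an assumed stand-in for the backend's Async_Buffer):

    #include <deque>
    #include <optional>

    struct StagingBuffer {}; // stand-in for the backend's Async_Buffer

    std::deque<std::optional<StagingBuffer>> async_buffers;

    // Mirrors ShouldWaitAsyncFlushes: a wait is only needed when the oldest
    // batch actually produced a staging buffer to read back.
    bool ShouldWait() {
        return !async_buffers.empty() && async_buffers.front().has_value();
    }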
| 896 | 443 | ||
| 897 | template <class P> | 444 | template <class P> |
| @@ -899,11 +446,10 @@ void BufferCache<P>::CommitAsyncFlushesHigh() { | |||
| 899 | AccumulateFlushes(); | 446 | AccumulateFlushes(); |
| 900 | 447 | ||
| 901 | if (committed_ranges.empty()) { | 448 | if (committed_ranges.empty()) { |
| 449 | async_buffers.emplace_back(std::optional<Async_Buffer>{}); | ||
| 902 | return; | 450 | return; |
| 903 | } | 451 | } |
| 904 | MICROPROFILE_SCOPE(GPU_DownloadMemory); | 452 | MICROPROFILE_SCOPE(GPU_DownloadMemory); |
| 905 | const bool is_accuracy_normal = | ||
| 906 | Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::Normal; | ||
| 907 | 453 | ||
| 908 | auto it = committed_ranges.begin(); | 454 | auto it = committed_ranges.begin(); |
| 909 | while (it != committed_ranges.end()) { | 455 | while (it != committed_ranges.end()) { |
| @@ -926,11 +472,12 @@ void BufferCache<P>::CommitAsyncFlushesHigh() { | |||
| 926 | const std::size_t size = interval.upper() - interval.lower(); | 472 | const std::size_t size = interval.upper() - interval.lower(); |
| 927 | const VAddr cpu_addr = interval.lower(); | 473 | const VAddr cpu_addr = interval.lower(); |
| 928 | ForEachBufferInRange(cpu_addr, size, [&](BufferId buffer_id, Buffer& buffer) { | 474 | ForEachBufferInRange(cpu_addr, size, [&](BufferId buffer_id, Buffer& buffer) { |
| 929 | buffer.ForEachDownloadRangeAndClear( | 475 | const VAddr buffer_start = buffer.CpuAddr(); |
| 930 | cpu_addr, size, [&](u64 range_offset, u64 range_size) { | 476 | const VAddr buffer_end = buffer_start + buffer.SizeBytes(); |
| 931 | if (is_accuracy_normal) { | 477 | const VAddr new_start = std::max(buffer_start, cpu_addr); |
| 932 | return; | 478 | const VAddr new_end = std::min(buffer_end, cpu_addr + size); |
| 933 | } | 479 | memory_tracker.ForEachDownloadRange( |
| 480 | new_start, new_end - new_start, false, [&](u64 cpu_addr_out, u64 range_size) { | ||
| 934 | const VAddr buffer_addr = buffer.CpuAddr(); | 481 | const VAddr buffer_addr = buffer.CpuAddr(); |
| 935 | const auto add_download = [&](VAddr start, VAddr end) { | 482 | const auto add_download = [&](VAddr start, VAddr end) { |
| 936 | const u64 new_offset = start - buffer_addr; | 483 | const u64 new_offset = start - buffer_addr; |
| @@ -950,38 +497,36 @@ void BufferCache<P>::CommitAsyncFlushesHigh() { | |||
| 950 | largest_copy = std::max(largest_copy, new_size); | 497 | largest_copy = std::max(largest_copy, new_size); |
| 951 | }; | 498 | }; |
| 952 | 499 | ||
| 953 | const VAddr start_address = buffer_addr + range_offset; | 500 | ForEachInRangeSet(common_ranges, cpu_addr_out, range_size, add_download); |
| 954 | const VAddr end_address = start_address + range_size; | ||
| 955 | ForEachWrittenRange(start_address, range_size, add_download); | ||
| 956 | const IntervalType subtract_interval{start_address, end_address}; | ||
| 957 | common_ranges.subtract(subtract_interval); | ||
| 958 | }); | 501 | }); |
| 959 | }); | 502 | }); |
| 960 | } | 503 | } |
| 961 | } | 504 | } |
| 962 | committed_ranges.clear(); | 505 | committed_ranges.clear(); |
| 963 | if (downloads.empty()) { | 506 | if (downloads.empty()) { |
| 507 | async_buffers.emplace_back(std::optional<Async_Buffer>{}); | ||
| 964 | return; | 508 | return; |
| 965 | } | 509 | } |
| 966 | if constexpr (USE_MEMORY_MAPS) { | 510 | if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { |
| 967 | auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes); | 511 | auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes, true); |
| 512 | boost::container::small_vector<BufferCopy, 4> normalized_copies; | ||
| 513 | IntervalSet new_async_range{}; | ||
| 968 | runtime.PreCopyBarrier(); | 514 | runtime.PreCopyBarrier(); |
| 969 | for (auto& [copy, buffer_id] : downloads) { | 515 | for (auto& [copy, buffer_id] : downloads) { |
| 970 | // Account for the staging buffer offset in the copy | ||
| 971 | copy.dst_offset += download_staging.offset; | 516 | copy.dst_offset += download_staging.offset; |
| 972 | const std::array copies{copy}; | 517 | const std::array copies{copy}; |
| 973 | runtime.CopyBuffer(download_staging.buffer, slot_buffers[buffer_id], copies, false); | 518 | BufferCopy second_copy{copy}; |
| 974 | } | 519 | Buffer& buffer = slot_buffers[buffer_id]; |
| 975 | runtime.PostCopyBarrier(); | 520 | second_copy.src_offset = static_cast<size_t>(buffer.CpuAddr()) + copy.src_offset; |
| 976 | runtime.Finish(); | 521 | VAddr orig_cpu_addr = static_cast<VAddr>(second_copy.src_offset); |
| 977 | for (const auto& [copy, buffer_id] : downloads) { | 522 | const IntervalType base_interval{orig_cpu_addr, orig_cpu_addr + copy.size}; |
| 978 | const Buffer& buffer = slot_buffers[buffer_id]; | 523 | new_async_range.add(base_interval); |
| 979 | const VAddr cpu_addr = buffer.CpuAddr() + copy.src_offset; | 524 | runtime.CopyBuffer(download_staging.buffer, buffer, copies, false); |
| 980 | // Undo the modified offset | 525 | normalized_copies.push_back(second_copy); |
| 981 | const u64 dst_offset = copy.dst_offset - download_staging.offset; | ||
| 982 | const u8* read_mapped_memory = download_staging.mapped_span.data() + dst_offset; | ||
| 983 | cpu_memory.WriteBlockUnsafe(cpu_addr, read_mapped_memory, copy.size); | ||
| 984 | } | 526 | } |
| 527 | async_downloads.emplace_back(std::move(new_async_range)); | ||
| 528 | pending_downloads.emplace_back(std::move(normalized_copies)); | ||
| 529 | async_buffers.emplace_back(download_staging); | ||
| 985 | } else { | 530 | } else { |
| 986 | const std::span<u8> immediate_buffer = ImmediateBuffer(largest_copy); | 531 | const std::span<u8> immediate_buffer = ImmediateBuffer(largest_copy); |
| 987 | for (const auto& [copy, buffer_id] : downloads) { | 532 | for (const auto& [copy, buffer_id] : downloads) { |
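Before a download is deferred, each copy is normalized: its src_offset, relative to the source buffer, is rebased to an absolute CPU address, so PopAsyncBuffers can apply the data later even if the buffer has been merged or deleted in the meantime. A sketch of the rebasing (types assumed):

    #include <cstdint>

    struct BufferCopy {
        std::uint64_t src_offset;
        std::uint64_t dst_offset;
        std::uint64_t size;
    };

    // Turns a buffer-relative copy into one keyed by guest virtual address.
    constexpr BufferCopy Normalize(BufferCopy copy, std::uint64_t buffer_cpu_addr) {
        copy.src_offset += buffer_cpu_addr; // now an absolute VAddr, not an offset
        return copy;
    }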
| @@ -994,42 +539,154 @@ void BufferCache<P>::CommitAsyncFlushesHigh() { | |||
| 994 | } | 539 | } |
| 995 | 540 | ||
| 996 | template <class P> | 541 | template <class P> |
| 997 | void BufferCache<P>::CommitAsyncFlushes() { | 542 | void BufferCache<P>::CommitAsyncQueries() { |
| 998 | if (Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::High) { | 543 | if (pending_queries.empty()) { |
| 999 | CommitAsyncFlushesHigh(); | 544 | query_async_buffers.emplace_back(std::optional<Async_Buffer>{}); |
| 545 | return; | ||
| 546 | } | ||
| 547 | |||
| 548 | MICROPROFILE_SCOPE(GPU_DownloadMemory); | ||
| 549 | boost::container::small_vector<std::pair<BufferCopy, BufferId>, 8> downloads; | ||
| 550 | u64 total_size_bytes = 0; | ||
| 551 | u64 largest_copy = 0; | ||
| 552 | do { | ||
| 553 | has_deleted_buffers = false; | ||
| 554 | downloads.clear(); | ||
| 555 | total_size_bytes = 0; | ||
| 556 | largest_copy = 0; | ||
| 557 | for (const auto& query_info : pending_queries) { | ||
| 558 | const std::size_t size = query_info.second; | ||
| 559 | const VAddr cpu_addr = query_info.first; | ||
| 560 | const BufferId buffer_id = FindBuffer(cpu_addr, static_cast<u32>(size)); | ||
| 561 | Buffer& buffer = slot_buffers[buffer_id]; | ||
| 562 | if (has_deleted_buffers) { | ||
| 563 | break; | ||
| 564 | } | ||
| 565 | downloads.push_back({ | ||
| 566 | BufferCopy{ | ||
| 567 | .src_offset = buffer.Offset(cpu_addr), | ||
| 568 | .dst_offset = total_size_bytes, | ||
| 569 | .size = size, | ||
| 570 | }, | ||
| 571 | buffer_id, | ||
| 572 | }); | ||
| 573 | constexpr u64 align = 8ULL; | ||
| 574 | constexpr u64 mask = ~(align - 1ULL); | ||
| 575 | total_size_bytes += (size + align - 1) & mask; | ||
| 576 | largest_copy = std::max(largest_copy, size); | ||
| 577 | } | ||
| 578 | } while (has_deleted_buffers); | ||
| 579 | pending_queries.clear(); | ||
| 580 | if (downloads.empty()) { | ||
| 581 | query_async_buffers.push_back(std::optional<Async_Buffer>{}); | ||
| 582 | return; | ||
| 583 | } | ||
| 584 | if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { | ||
| 585 | auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes, true); | ||
| 586 | boost::container::small_vector<BufferCopy, 8> normalized_copies; | ||
| 587 | runtime.PreCopyBarrier(); | ||
| 588 | for (auto& [copy, buffer_id] : downloads) { | ||
| 589 | // Account for the staging buffer offset in the copy | ||
| 590 | copy.dst_offset += download_staging.offset; | ||
| 591 | const std::array copies{copy}; | ||
| 592 | const Buffer& buffer = slot_buffers[buffer_id]; | ||
| 593 | BufferCopy second_copy{copy}; | ||
| 594 | second_copy.src_offset = static_cast<size_t>(buffer.CpuAddr()) + second_copy.src_offset; | ||
| 595 | runtime.CopyBuffer(download_staging.buffer, buffer, copies, false); | ||
| 596 | normalized_copies.push_back(second_copy); | ||
| 597 | } | ||
| 598 | committed_queries.emplace_back(std::move(normalized_copies)); | ||
| 599 | query_async_buffers.emplace_back(download_staging); | ||
| 1000 | } else { | 600 | } else { |
| 1001 | uncommitted_ranges.clear(); | 601 | query_async_buffers.push_back(std::optional<Async_Buffer>{}); |
| 1002 | committed_ranges.clear(); | ||
| 1003 | } | 602 | } |
| 1004 | } | 603 | } |
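FindBuffer may create a new buffer and delete the overlaps it swallows, invalidating Buffer references gathered earlier in the same pass, so the gathering above restarts until a pass completes without a deletion. The shape of that retry loop in isolation (the simulated deletion is purely illustrative):

    #include <cstdlib>

    int main() {
        bool has_deleted_buffers = false;
        int passes = 0;
        const auto gather_once = [&] {
            ++passes;
            // Pretend the first pass deleted a buffer mid-iteration.
            has_deleted_buffers = (passes == 1);
        };
        do {
            has_deleted_buffers = false;
            gather_once();
        } while (has_deleted_buffers);
        return passes == 2 ? EXIT_SUCCESS : EXIT_FAILURE; // second pass was clean
    }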
| 1005 | 604 | ||
| 1006 | template <class P> | 605 | template <class P> |
| 1007 | void BufferCache<P>::PopAsyncFlushes() {} | 606 | void BufferCache<P>::CommitAsyncFlushes() { |
| 607 | CommitAsyncFlushesHigh(); | ||
| 608 | CommitAsyncQueries(); | ||
| 609 | } | ||
| 1008 | 610 | ||
| 1009 | template <class P> | 611 | template <class P> |
| 1010 | bool BufferCache<P>::IsRegionGpuModified(VAddr addr, size_t size) { | 612 | void BufferCache<P>::PopAsyncFlushes() { |
| 1011 | const u64 page_end = Common::DivCeil(addr + size, YUZU_PAGESIZE); | 613 | MICROPROFILE_SCOPE(GPU_DownloadMemory); |
| 1012 | for (u64 page = addr >> YUZU_PAGEBITS; page < page_end;) { | 614 | PopAsyncBuffers(); |
| 1013 | const BufferId image_id = page_table[page]; | 615 | PopAsyncQueries(); |
| 1014 | if (!image_id) { | 616 | } |
| 1015 | ++page; | 617 | |
| 1016 | continue; | 618 | template <class P> |
| 619 | void BufferCache<P>::PopAsyncBuffers() { | ||
| 620 | if (async_buffers.empty()) { | ||
| 621 | return; | ||
| 622 | } | ||
| 623 | if (!async_buffers.front().has_value()) { | ||
| 624 | async_buffers.pop_front(); | ||
| 625 | return; | ||
| 626 | } | ||
| 627 | if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { | ||
| 628 | auto& downloads = pending_downloads.front(); | ||
| 629 | auto& async_buffer = async_buffers.front(); | ||
| 630 | auto& async_range = async_downloads.front(); | ||
| 631 | u8* base = async_buffer->mapped_span.data(); | ||
| 632 | const size_t base_offset = async_buffer->offset; | ||
| 633 | for (const auto& copy : downloads) { | ||
| 634 | const VAddr cpu_addr = static_cast<VAddr>(copy.src_offset); | ||
| 635 | const u64 dst_offset = copy.dst_offset - base_offset; | ||
| 636 | const u8* read_mapped_memory = base + dst_offset; | ||
| 637 | ForEachInRangeSet(async_range, cpu_addr, copy.size, [&](VAddr start, VAddr end) { | ||
| 638 | const size_t diff = start - cpu_addr; | ||
| 639 | const size_t new_size = end - start; | ||
| 640 | cpu_memory.WriteBlockUnsafe(start, &read_mapped_memory[diff], new_size); | ||
| 641 | const IntervalType base_interval{start, end}; | ||
| 642 | common_ranges.subtract(base_interval); | ||
| 643 | }); | ||
| 1017 | } | 644 | } |
| 1018 | Buffer& buffer = slot_buffers[image_id]; | 645 | runtime.FreeDeferredStagingBuffer(*async_buffer); |
| 1019 | if (buffer.IsRegionGpuModified(addr, size)) { | 646 | async_buffers.pop_front(); |
| 1020 | return true; | 647 | pending_downloads.pop_front(); |
| 648 | async_downloads.pop_front(); | ||
| 649 | } | ||
| 650 | } | ||
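When a deferred download completes, only the sub-ranges still present in its async interval set are written back; anything the CPU touched in the meantime was already subtracted and is skipped. A standalone sketch of that guarded write-back, using set-and-interval intersection in place of ForEachInRangeSet (guest and staging are stand-ins for cpu_memory and the mapped span):

    #include <boost/icl/interval_set.hpp>
    #include <cstdint>
    #include <cstring>
    #include <vector>

    using VAddr = std::uint64_t;
    using IntervalSet = boost::icl::interval_set<VAddr>;
    using IntervalType = IntervalSet::interval_type;

    void ApplyDownload(std::vector<std::uint8_t>& guest, const std::uint8_t* staging,
                       VAddr cpu_addr, std::uint64_t size, const IntervalSet& async_range) {
        const IntervalSet still_valid = async_range & IntervalType{cpu_addr, cpu_addr + size};
        for (const auto& interval : still_valid) {
            const VAddr start = interval.lower();
            const std::uint64_t diff = start - cpu_addr; // offset into this copy's data
            const std::uint64_t new_size = interval.upper() - start;
            std::memcpy(guest.data() + start, staging + diff, new_size);
        }
    }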
| 651 | |||
| 652 | template <class P> | ||
| 653 | void BufferCache<P>::PopAsyncQueries() { | ||
| 654 | if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { | ||
| 655 | if (query_async_buffers.empty()) { | ||
| 656 | return; | ||
| 657 | } | ||
| 658 | if (!query_async_buffers.front().has_value()) { | ||
| 659 | query_async_buffers.pop_front(); | ||
| 660 | return; | ||
| 1021 | } | 661 | } |
| 1022 | const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes(); | 662 | auto& downloads = committed_queries.front(); |
| 1023 | page = Common::DivCeil(end_addr, YUZU_PAGESIZE); | 663 | auto& async_buffer = query_async_buffers.front(); |
| 664 | flushed_queries.clear(); | ||
| 665 | u8* base = async_buffer->mapped_span.data(); | ||
| 666 | const size_t base_offset = async_buffer->offset; | ||
| 667 | for (const auto& copy : downloads) { | ||
| 668 | const size_t dst_offset = copy.dst_offset - base_offset; | ||
| 669 | const u8* read_mapped_memory = base + dst_offset; | ||
| 670 | u64 new_value{}; | ||
| 671 | std::memcpy(&new_value, read_mapped_memory, copy.size); | ||
| 672 | flushed_queries.push_back(new_value); | ||
| 673 | } | ||
| 674 | runtime.FreeDeferredStagingBuffer(*async_buffer); | ||
| 675 | committed_queries.pop_front(); | ||
| 676 | query_async_buffers.pop_front(); | ||
| 1024 | } | 677 | } |
| 1025 | return false; | 678 | } |
| 679 | |||
| 680 | template <class P> | ||
| 681 | bool BufferCache<P>::IsRegionGpuModified(VAddr addr, size_t size) { | ||
| 682 | return memory_tracker.IsRegionGpuModified(addr, size); | ||
| 1026 | } | 683 | } |
| 1027 | 684 | ||
| 1028 | template <class P> | 685 | template <class P> |
| 1029 | bool BufferCache<P>::IsRegionRegistered(VAddr addr, size_t size) { | 686 | bool BufferCache<P>::IsRegionRegistered(VAddr addr, size_t size) { |
| 1030 | const VAddr end_addr = addr + size; | 687 | const VAddr end_addr = addr + size; |
| 1031 | const u64 page_end = Common::DivCeil(end_addr, YUZU_PAGESIZE); | 688 | const u64 page_end = Common::DivCeil(end_addr, PAGE_SIZE); |
| 1032 | for (u64 page = addr >> YUZU_PAGEBITS; page < page_end;) { | 689 | for (u64 page = addr >> PAGE_BITS; page < page_end;) { |
| 1033 | const BufferId buffer_id = page_table[page]; | 690 | const BufferId buffer_id = page_table[page]; |
| 1034 | if (!buffer_id) { | 691 | if (!buffer_id) { |
| 1035 | ++page; | 692 | ++page; |
| @@ -1041,28 +698,14 @@ bool BufferCache<P>::IsRegionRegistered(VAddr addr, size_t size) { | |||
| 1041 | if (buf_start_addr < end_addr && addr < buf_end_addr) { | 698 | if (buf_start_addr < end_addr && addr < buf_end_addr) { |
| 1042 | return true; | 699 | return true; |
| 1043 | } | 700 | } |
| 1044 | page = Common::DivCeil(end_addr, YUZU_PAGESIZE); | 701 | page = Common::DivCeil(end_addr, PAGE_SIZE); |
| 1045 | } | 702 | } |
| 1046 | return false; | 703 | return false; |
| 1047 | } | 704 | } |
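IsRegionRegistered keeps the old page-table walk: an empty page advances one slot, while a hit jumps straight past the buffer's end, so a large buffer is visited once rather than once per page. The pattern in isolation (assumed standalone types; buffer id 0 plays the null-buffer role, as in the cache):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    constexpr std::uint64_t PAGE_BITS = 16; // the cache's caching page size
    constexpr std::uint64_t PAGE_SIZE = 1ULL << PAGE_BITS;

    constexpr std::uint64_t DivCeil(std::uint64_t num, std::uint64_t den) {
        return (num + den - 1) / den;
    }

    struct Buffer {
        std::uint64_t cpu_addr;
        std::uint64_t size_bytes;
    };

    std::size_t CountBuffersInRange(const std::vector<std::uint32_t>& page_table,
                                    const std::vector<Buffer>& buffers,
                                    std::uint64_t addr, std::uint64_t size) {
        std::size_t count = 0;
        const std::uint64_t page_end = DivCeil(addr + size, PAGE_SIZE);
        for (std::uint64_t page = addr >> PAGE_BITS; page < page_end;) {
            const std::uint32_t buffer_id = page_table[page];
            if (buffer_id == 0) {
                ++page; // unmapped page: step to the next one
                continue;
            }
            const Buffer& buffer = buffers[buffer_id];
            ++count;
            // Skip every remaining page the buffer covers in one jump.
            page = DivCeil(buffer.cpu_addr + buffer.size_bytes, PAGE_SIZE);
        }
        return count;
    }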
| 1048 | 705 | ||
| 1049 | template <class P> | 706 | template <class P> |
| 1050 | bool BufferCache<P>::IsRegionCpuModified(VAddr addr, size_t size) { | 707 | bool BufferCache<P>::IsRegionCpuModified(VAddr addr, size_t size) { |
| 1051 | const u64 page_end = Common::DivCeil(addr + size, YUZU_PAGESIZE); | 708 | return memory_tracker.IsRegionCpuModified(addr, size); |
| 1052 | for (u64 page = addr >> YUZU_PAGEBITS; page < page_end;) { | ||
| 1053 | const BufferId image_id = page_table[page]; | ||
| 1054 | if (!image_id) { | ||
| 1055 | ++page; | ||
| 1056 | continue; | ||
| 1057 | } | ||
| 1058 | Buffer& buffer = slot_buffers[image_id]; | ||
| 1059 | if (buffer.IsRegionCpuModified(addr, size)) { | ||
| 1060 | return true; | ||
| 1061 | } | ||
| 1062 | const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes(); | ||
| 1063 | page = Common::DivCeil(end_addr, YUZU_PAGESIZE); | ||
| 1064 | } | ||
| 1065 | return false; | ||
| 1066 | } | 709 | } |
| 1067 | 710 | ||
| 1068 | template <class P> | 711 | template <class P> |
| @@ -1155,7 +798,7 @@ void BufferCache<P>::BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 | |||
| 1155 | TouchBuffer(buffer, binding.buffer_id); | 798 | TouchBuffer(buffer, binding.buffer_id); |
| 1156 | const bool use_fast_buffer = binding.buffer_id != NULL_BUFFER_ID && | 799 | const bool use_fast_buffer = binding.buffer_id != NULL_BUFFER_ID && |
| 1157 | size <= uniform_buffer_skip_cache_size && | 800 | size <= uniform_buffer_skip_cache_size && |
| 1158 | !buffer.IsRegionGpuModified(cpu_addr, size); | 801 | !memory_tracker.IsRegionGpuModified(cpu_addr, size); |
| 1159 | if (use_fast_buffer) { | 802 | if (use_fast_buffer) { |
| 1160 | if constexpr (IS_OPENGL) { | 803 | if constexpr (IS_OPENGL) { |
| 1161 | if (runtime.HasFastBufferSubData()) { | 804 | if (runtime.HasFastBufferSubData()) { |
| @@ -1378,27 +1021,28 @@ void BufferCache<P>::UpdateIndexBuffer() { | |||
| 1378 | // We have to check for the dirty flags and index count | 1021 | // We have to check for the dirty flags and index count |
| 1379 | // The index count is currently changed without updating the dirty flags | 1022 | // The index count is currently changed without updating the dirty flags |
| 1380 | const auto& draw_state = maxwell3d->draw_manager->GetDrawState(); | 1023 | const auto& draw_state = maxwell3d->draw_manager->GetDrawState(); |
| 1381 | const auto& index_array = draw_state.index_buffer; | 1024 | const auto& index_buffer_ref = draw_state.index_buffer; |
| 1382 | auto& flags = maxwell3d->dirty.flags; | 1025 | auto& flags = maxwell3d->dirty.flags; |
| 1383 | if (!flags[Dirty::IndexBuffer]) { | 1026 | if (!flags[Dirty::IndexBuffer]) { |
| 1384 | return; | 1027 | return; |
| 1385 | } | 1028 | } |
| 1386 | flags[Dirty::IndexBuffer] = false; | 1029 | flags[Dirty::IndexBuffer] = false; |
| 1387 | last_index_count = index_array.count; | ||
| 1388 | if (!draw_state.inline_index_draw_indexes.empty()) { | 1030 | if (!draw_state.inline_index_draw_indexes.empty()) { |
| 1389 | auto inline_index_size = static_cast<u32>(draw_state.inline_index_draw_indexes.size()); | 1031 | auto inline_index_size = static_cast<u32>(draw_state.inline_index_draw_indexes.size()); |
| 1390 | index_buffer = Binding{ | 1032 | index_buffer = Binding{ |
| 1391 | .cpu_addr = 0, | 1033 | .cpu_addr = 0, |
| 1392 | .size = inline_index_size, | 1034 | .size = inline_index_size, |
| 1393 | .buffer_id = CreateBuffer(0, inline_index_size), | 1035 | .buffer_id = FindBuffer(0, inline_index_size), |
| 1394 | }; | 1036 | }; |
| 1395 | return; | 1037 | return; |
| 1396 | } | 1038 | } |
| 1397 | const GPUVAddr gpu_addr_begin = index_array.StartAddress(); | 1039 | |
| 1398 | const GPUVAddr gpu_addr_end = index_array.EndAddress(); | 1040 | const GPUVAddr gpu_addr_begin = index_buffer_ref.StartAddress(); |
| 1041 | const GPUVAddr gpu_addr_end = index_buffer_ref.EndAddress(); | ||
| 1399 | const std::optional<VAddr> cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr_begin); | 1042 | const std::optional<VAddr> cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr_begin); |
| 1400 | const u32 address_size = static_cast<u32>(gpu_addr_end - gpu_addr_begin); | 1043 | const u32 address_size = static_cast<u32>(gpu_addr_end - gpu_addr_begin); |
| 1401 | const u32 draw_size = (index_array.count + index_array.first) * index_array.FormatSizeInBytes(); | 1044 | const u32 draw_size = |
| 1045 | (index_buffer_ref.count + index_buffer_ref.first) * index_buffer_ref.FormatSizeInBytes(); | ||
| 1402 | const u32 size = std::min(address_size, draw_size); | 1046 | const u32 size = std::min(address_size, draw_size); |
| 1403 | if (size == 0 || !cpu_addr) { | 1047 | if (size == 0 || !cpu_addr) { |
| 1404 | index_buffer = NULL_BINDING; | 1048 | index_buffer = NULL_BINDING; |
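The bound index-buffer size is the smaller of what the address registers span and what the draw actually consumes, so a draw can never read past the mapped range. The clamp on its own (constexpr so the example checks itself):

    #include <algorithm>
    #include <cstdint>

    constexpr std::uint32_t ClampIndexSize(std::uint64_t gpu_addr_begin,
                                           std::uint64_t gpu_addr_end, std::uint32_t count,
                                           std::uint32_t first, std::uint32_t format_size) {
        const auto address_size = static_cast<std::uint32_t>(gpu_addr_end - gpu_addr_begin);
        const std::uint32_t draw_size = (count + first) * format_size;
        return std::min(address_size, draw_size);
    }

    // A draw of 100 u16 indices starting at index 4 inside a 64 KiB mapping:
    static_assert(ClampIndexSize(0x10000, 0x20000, 100, 4, 2) == 208);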
| @@ -1434,17 +1078,15 @@ void BufferCache<P>::UpdateVertexBuffer(u32 index) { | |||
| 1434 | const GPUVAddr gpu_addr_begin = array.Address(); | 1078 | const GPUVAddr gpu_addr_begin = array.Address(); |
| 1435 | const GPUVAddr gpu_addr_end = limit.Address() + 1; | 1079 | const GPUVAddr gpu_addr_end = limit.Address() + 1; |
| 1436 | const std::optional<VAddr> cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr_begin); | 1080 | const std::optional<VAddr> cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr_begin); |
| 1437 | u32 address_size = static_cast<u32>( | 1081 | const u32 address_size = static_cast<u32>(gpu_addr_end - gpu_addr_begin); |
| 1438 | std::min(gpu_addr_end - gpu_addr_begin, static_cast<u64>(std::numeric_limits<u32>::max()))); | 1082 | u32 size = address_size; // TODO: Analyze stride and number of vertices |
| 1439 | if (array.enable == 0 || address_size == 0 || !cpu_addr) { | 1083 | if (array.enable == 0 || size == 0 || !cpu_addr) { |
| 1440 | vertex_buffers[index] = NULL_BINDING; | 1084 | vertex_buffers[index] = NULL_BINDING; |
| 1441 | return; | 1085 | return; |
| 1442 | } | 1086 | } |
| 1443 | if (!gpu_memory->IsWithinGPUAddressRange(gpu_addr_end)) { | 1087 | if (!gpu_memory->IsWithinGPUAddressRange(gpu_addr_end)) { |
| 1444 | address_size = | 1088 | size = static_cast<u32>(gpu_memory->MaxContinuousRange(gpu_addr_begin, size)); |
| 1445 | static_cast<u32>(gpu_memory->MaxContinuousRange(gpu_addr_begin, address_size)); | ||
| 1446 | } | 1089 | } |
| 1447 | const u32 size = address_size; // TODO: Analyze stride and number of vertices | ||
| 1448 | vertex_buffers[index] = Binding{ | 1090 | vertex_buffers[index] = Binding{ |
| 1449 | .cpu_addr = *cpu_addr, | 1091 | .cpu_addr = *cpu_addr, |
| 1450 | .size = size, | 1092 | .size = size, |
| @@ -1590,18 +1232,17 @@ void BufferCache<P>::UpdateComputeTextureBuffers() { | |||
| 1590 | } | 1232 | } |
| 1591 | 1233 | ||
| 1592 | template <class P> | 1234 | template <class P> |
| 1593 | void BufferCache<P>::MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 size) { | 1235 | void BufferCache<P>::MarkWrittenBuffer(BufferId, VAddr cpu_addr, u32 size) { |
| 1594 | Buffer& buffer = slot_buffers[buffer_id]; | 1236 | memory_tracker.MarkRegionAsGpuModified(cpu_addr, size); |
| 1595 | buffer.MarkRegionAsGpuModified(cpu_addr, size); | ||
| 1596 | 1237 | ||
| 1597 | const IntervalType base_interval{cpu_addr, cpu_addr + size}; | 1238 | const IntervalType base_interval{cpu_addr, cpu_addr + size}; |
| 1598 | common_ranges.add(base_interval); | 1239 | common_ranges.add(base_interval); |
| 1599 | 1240 | for (auto& interval_set : async_downloads) { | |
| 1600 | const bool is_async = Settings::values.use_asynchronous_gpu_emulation.GetValue(); | 1241 | interval_set.subtract(base_interval); |
| 1601 | if (!is_async) { | 1242 | } |
| 1602 | return; | 1243 | if (Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::High) { |
| 1244 | uncommitted_ranges.add(base_interval); | ||
| 1603 | } | 1245 | } |
| 1604 | uncommitted_ranges.add(base_interval); | ||
| 1605 | } | 1246 | } |
| 1606 | 1247 | ||
| 1607 | template <class P> | 1248 | template <class P> |
| @@ -1609,7 +1250,7 @@ BufferId BufferCache<P>::FindBuffer(VAddr cpu_addr, u32 size) { | |||
| 1609 | if (cpu_addr == 0) { | 1250 | if (cpu_addr == 0) { |
| 1610 | return NULL_BUFFER_ID; | 1251 | return NULL_BUFFER_ID; |
| 1611 | } | 1252 | } |
| 1612 | const u64 page = cpu_addr >> YUZU_PAGEBITS; | 1253 | const u64 page = cpu_addr >> PAGE_BITS; |
| 1613 | const BufferId buffer_id = page_table[page]; | 1254 | const BufferId buffer_id = page_table[page]; |
| 1614 | if (!buffer_id) { | 1255 | if (!buffer_id) { |
| 1615 | return CreateBuffer(cpu_addr, size); | 1256 | return CreateBuffer(cpu_addr, size); |
| @@ -1638,9 +1279,8 @@ typename BufferCache<P>::OverlapResult BufferCache<P>::ResolveOverlaps(VAddr cpu | |||
| 1638 | .has_stream_leap = has_stream_leap, | 1279 | .has_stream_leap = has_stream_leap, |
| 1639 | }; | 1280 | }; |
| 1640 | } | 1281 | } |
| 1641 | for (; cpu_addr >> YUZU_PAGEBITS < Common::DivCeil(end, YUZU_PAGESIZE); | 1282 | for (; cpu_addr >> PAGE_BITS < Common::DivCeil(end, PAGE_SIZE); cpu_addr += PAGE_SIZE) { |
| 1642 | cpu_addr += YUZU_PAGESIZE) { | 1283 | const BufferId overlap_id = page_table[cpu_addr >> PAGE_BITS]; |
| 1643 | const BufferId overlap_id = page_table[cpu_addr >> YUZU_PAGEBITS]; | ||
| 1644 | if (!overlap_id) { | 1284 | if (!overlap_id) { |
| 1645 | continue; | 1285 | continue; |
| 1646 | } | 1286 | } |
| @@ -1666,11 +1306,11 @@ typename BufferCache<P>::OverlapResult BufferCache<P>::ResolveOverlaps(VAddr cpu | |||
| 1666 | // as a stream buffer. Increase the size to skip constantly recreating buffers. | 1306 | // as a stream buffer. Increase the size to skip constantly recreating buffers. |
| 1667 | has_stream_leap = true; | 1307 | has_stream_leap = true; |
| 1668 | if (expands_right) { | 1308 | if (expands_right) { |
| 1669 | begin -= YUZU_PAGESIZE * 256; | 1309 | begin -= PAGE_SIZE * 256; |
| 1670 | cpu_addr = begin; | 1310 | cpu_addr = begin; |
| 1671 | } | 1311 | } |
| 1672 | if (expands_left) { | 1312 | if (expands_left) { |
| 1673 | end += YUZU_PAGESIZE * 256; | 1313 | end += PAGE_SIZE * 256; |
| 1674 | } | 1314 | } |
| 1675 | } | 1315 | } |
| 1676 | } | 1316 | } |
| @@ -1690,21 +1330,15 @@ void BufferCache<P>::JoinOverlap(BufferId new_buffer_id, BufferId overlap_id, | |||
| 1690 | if (accumulate_stream_score) { | 1330 | if (accumulate_stream_score) { |
| 1691 | new_buffer.IncreaseStreamScore(overlap.StreamScore() + 1); | 1331 | new_buffer.IncreaseStreamScore(overlap.StreamScore() + 1); |
| 1692 | } | 1332 | } |
| 1693 | std::vector<BufferCopy> copies; | 1333 | boost::container::small_vector<BufferCopy, 1> copies; |
| 1694 | const size_t dst_base_offset = overlap.CpuAddr() - new_buffer.CpuAddr(); | 1334 | const size_t dst_base_offset = overlap.CpuAddr() - new_buffer.CpuAddr(); |
| 1695 | overlap.ForEachDownloadRange([&](u64 begin, u64 range_size) { | 1335 | copies.push_back(BufferCopy{ |
| 1696 | copies.push_back(BufferCopy{ | 1336 | .src_offset = 0, |
| 1697 | .src_offset = begin, | 1337 | .dst_offset = dst_base_offset, |
| 1698 | .dst_offset = dst_base_offset + begin, | 1338 | .size = overlap.SizeBytes(), |
| 1699 | .size = range_size, | ||
| 1700 | }); | ||
| 1701 | new_buffer.UnmarkRegionAsCpuModified(begin, range_size); | ||
| 1702 | new_buffer.MarkRegionAsGpuModified(begin, range_size); | ||
| 1703 | }); | 1339 | }); |
| 1704 | if (!copies.empty()) { | 1340 | runtime.CopyBuffer(new_buffer, overlap, copies); |
| 1705 | runtime.CopyBuffer(slot_buffers[new_buffer_id], overlap, copies); | 1341 | DeleteBuffer(overlap_id, true); |
| 1706 | } | ||
| 1707 | DeleteBuffer(overlap_id); | ||
| 1708 | } | 1342 | } |
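With dirtiness now tracked globally by the memory tracker rather than per buffer, merging an overlapped buffer into its replacement no longer needs per-range copies: a single whole-buffer copy at the right offset suffices, and DeleteBuffer is told not to re-mark the pages. The copy computation in isolation (types assumed):

    #include <cstdint>

    struct BufferCopy {
        std::uint64_t src_offset;
        std::uint64_t dst_offset;
        std::uint64_t size;
    };

    // The merged-in buffer starts somewhere inside the new, larger buffer.
    constexpr BufferCopy WholeBufferCopy(std::uint64_t overlap_cpu_addr,
                                         std::uint64_t overlap_size_bytes,
                                         std::uint64_t new_buffer_cpu_addr) {
        return BufferCopy{
            .src_offset = 0,
            .dst_offset = overlap_cpu_addr - new_buffer_cpu_addr,
            .size = overlap_size_bytes,
        };
    }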
| 1709 | 1343 | ||
| 1710 | template <class P> | 1344 | template <class P> |
| @@ -1718,7 +1352,7 @@ BufferId BufferCache<P>::CreateBuffer(VAddr cpu_addr, u32 wanted_size) { | |||
| 1718 | JoinOverlap(new_buffer_id, overlap_id, !overlap.has_stream_leap); | 1352 | JoinOverlap(new_buffer_id, overlap_id, !overlap.has_stream_leap); |
| 1719 | } | 1353 | } |
| 1720 | Register(new_buffer_id); | 1354 | Register(new_buffer_id); |
| 1721 | TouchBuffer(slot_buffers[new_buffer_id], new_buffer_id); | 1355 | TouchBuffer(new_buffer, new_buffer_id); |
| 1722 | return new_buffer_id; | 1356 | return new_buffer_id; |
| 1723 | } | 1357 | } |
| 1724 | 1358 | ||
| @@ -1746,8 +1380,8 @@ void BufferCache<P>::ChangeRegister(BufferId buffer_id) { | |||
| 1746 | } | 1380 | } |
| 1747 | const VAddr cpu_addr_begin = buffer.CpuAddr(); | 1381 | const VAddr cpu_addr_begin = buffer.CpuAddr(); |
| 1748 | const VAddr cpu_addr_end = cpu_addr_begin + size; | 1382 | const VAddr cpu_addr_end = cpu_addr_begin + size; |
| 1749 | const u64 page_begin = cpu_addr_begin / YUZU_PAGESIZE; | 1383 | const u64 page_begin = cpu_addr_begin / PAGE_SIZE; |
| 1750 | const u64 page_end = Common::DivCeil(cpu_addr_end, YUZU_PAGESIZE); | 1384 | const u64 page_end = Common::DivCeil(cpu_addr_end, PAGE_SIZE); |
| 1751 | for (u64 page = page_begin; page != page_end; ++page) { | 1385 | for (u64 page = page_begin; page != page_end; ++page) { |
| 1752 | if constexpr (insert) { | 1386 | if constexpr (insert) { |
| 1753 | page_table[page] = buffer_id; | 1387 | page_table[page] = buffer_id; |
| @@ -1766,9 +1400,6 @@ void BufferCache<P>::TouchBuffer(Buffer& buffer, BufferId buffer_id) noexcept { | |||
| 1766 | 1400 | ||
| 1767 | template <class P> | 1401 | template <class P> |
| 1768 | bool BufferCache<P>::SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size) { | 1402 | bool BufferCache<P>::SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size) { |
| 1769 | if (buffer.CpuAddr() == 0) { | ||
| 1770 | return true; | ||
| 1771 | } | ||
| 1772 | return SynchronizeBufferImpl(buffer, cpu_addr, size); | 1403 | return SynchronizeBufferImpl(buffer, cpu_addr, size); |
| 1773 | } | 1404 | } |
| 1774 | 1405 | ||
| @@ -1777,10 +1408,11 @@ bool BufferCache<P>::SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 s | |||
| 1777 | boost::container::small_vector<BufferCopy, 4> copies; | 1408 | boost::container::small_vector<BufferCopy, 4> copies; |
| 1778 | u64 total_size_bytes = 0; | 1409 | u64 total_size_bytes = 0; |
| 1779 | u64 largest_copy = 0; | 1410 | u64 largest_copy = 0; |
| 1780 | buffer.ForEachUploadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) { | 1411 | VAddr buffer_start = buffer.CpuAddr(); |
| 1412 | memory_tracker.ForEachUploadRange(cpu_addr, size, [&](u64 cpu_addr_out, u64 range_size) { | ||
| 1781 | copies.push_back(BufferCopy{ | 1413 | copies.push_back(BufferCopy{ |
| 1782 | .src_offset = total_size_bytes, | 1414 | .src_offset = total_size_bytes, |
| 1783 | .dst_offset = range_offset, | 1415 | .dst_offset = cpu_addr_out - buffer_start, |
| 1784 | .size = range_size, | 1416 | .size = range_size, |
| 1785 | }); | 1417 | }); |
| 1786 | total_size_bytes += range_size; | 1418 | total_size_bytes += range_size; |
| @@ -1795,6 +1427,51 @@ bool BufferCache<P>::SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 s | |||
| 1795 | } | 1427 | } |
| 1796 | 1428 | ||
| 1797 | template <class P> | 1429 | template <class P> |
| 1430 | bool BufferCache<P>::SynchronizeBufferNoModified(Buffer& buffer, VAddr cpu_addr, u32 size) { | ||
| 1431 | boost::container::small_vector<BufferCopy, 4> copies; | ||
| 1432 | u64 total_size_bytes = 0; | ||
| 1433 | u64 largest_copy = 0; | ||
| 1434 | IntervalSet found_sets{}; | ||
| 1435 | auto make_copies = [&] { | ||
| 1436 | for (auto& interval : found_sets) { | ||
| 1437 | const std::size_t sub_size = interval.upper() - interval.lower(); | ||
| 1438 | const VAddr cpu_addr = interval.lower(); | ||
| 1439 | copies.push_back(BufferCopy{ | ||
| 1440 | .src_offset = total_size_bytes, | ||
| 1441 | .dst_offset = cpu_addr - buffer.CpuAddr(), | ||
| 1442 | .size = sub_size, | ||
| 1443 | }); | ||
| 1444 | total_size_bytes += sub_size; | ||
| 1445 | largest_copy = std::max(largest_copy, sub_size); | ||
| 1446 | } | ||
| 1447 | const std::span<BufferCopy> copies_span(copies.data(), copies.size()); | ||
| 1448 | UploadMemory(buffer, total_size_bytes, largest_copy, copies_span); | ||
| 1449 | }; | ||
| 1450 | memory_tracker.ForEachUploadRange(cpu_addr, size, [&](u64 cpu_addr_out, u64 range_size) { | ||
| 1451 | const VAddr base_adr = cpu_addr_out; | ||
| 1452 | const VAddr end_adr = base_adr + range_size; | ||
| 1453 | const IntervalType add_interval{base_adr, end_adr}; | ||
| 1454 | found_sets.add(add_interval); | ||
| 1455 | }); | ||
| 1456 | if (found_sets.empty()) { | ||
| 1457 | return true; | ||
| 1458 | } | ||
| 1459 | const IntervalType search_interval{cpu_addr, cpu_addr + size}; | ||
| 1460 | auto it = common_ranges.lower_bound(search_interval); | ||
| 1461 | auto it_end = common_ranges.upper_bound(search_interval); | ||
| 1462 | if (it == common_ranges.end()) { | ||
| 1463 | make_copies(); | ||
| 1464 | return false; | ||
| 1465 | } | ||
| 1466 | while (it != it_end) { | ||
| 1467 | found_sets.subtract(*it); | ||
| 1468 | it++; | ||
| 1469 | } | ||
| 1470 | make_copies(); | ||
| 1471 | return false; | ||
| 1472 | } | ||
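SynchronizeBufferNoModified gathers the CPU-dirty ranges that would normally be uploaded, then subtracts every GPU-written range overlapping the request, so the upload cannot clobber GPU results that have not been flushed back yet. The set arithmetic at its core, reduced to a sketch (assumed aliases as above):

    #include <boost/icl/interval_set.hpp>
    #include <cstdint>

    using VAddr = std::uint64_t;
    using IntervalSet = boost::icl::interval_set<VAddr>;

    // cpu_dirty: ranges the CPU wrote that the GPU copy lacks.
    // gpu_written: ranges the GPU wrote (common_ranges); uploading over them
    // would overwrite newer GPU data, so they are removed from the upload set.
    IntervalSet FilterUploads(const IntervalSet& cpu_dirty, const IntervalSet& gpu_written) {
        IntervalSet uploads = cpu_dirty;
        for (const auto& interval : gpu_written) {
            uploads.subtract(interval);
        }
        return uploads;
    }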
| 1473 | |||
| 1474 | template <class P> | ||
| 1798 | void BufferCache<P>::UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy, | 1475 | void BufferCache<P>::UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy, |
| 1799 | std::span<BufferCopy> copies) { | 1476 | std::span<BufferCopy> copies) { |
| 1800 | if constexpr (USE_MEMORY_MAPS) { | 1477 | if constexpr (USE_MEMORY_MAPS) { |
| @@ -1805,39 +1482,45 @@ void BufferCache<P>::UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 larg | |||
| 1805 | } | 1482 | } |
| 1806 | 1483 | ||
| 1807 | template <class P> | 1484 | template <class P> |
| 1808 | void BufferCache<P>::ImmediateUploadMemory(Buffer& buffer, u64 largest_copy, | 1485 | void BufferCache<P>::ImmediateUploadMemory([[maybe_unused]] Buffer& buffer, |
| 1809 | std::span<const BufferCopy> copies) { | 1486 | [[maybe_unused]] u64 largest_copy, |
| 1810 | std::span<u8> immediate_buffer; | 1487 | [[maybe_unused]] std::span<const BufferCopy> copies) { |
| 1811 | for (const BufferCopy& copy : copies) { | 1488 | if constexpr (!USE_MEMORY_MAPS) { |
| 1812 | std::span<const u8> upload_span; | 1489 | std::span<u8> immediate_buffer; |
| 1813 | const VAddr cpu_addr = buffer.CpuAddr() + copy.dst_offset; | 1490 | for (const BufferCopy& copy : copies) { |
| 1814 | if (IsRangeGranular(cpu_addr, copy.size)) { | 1491 | std::span<const u8> upload_span; |
| 1815 | upload_span = std::span(cpu_memory.GetPointer(cpu_addr), copy.size); | 1492 | const VAddr cpu_addr = buffer.CpuAddr() + copy.dst_offset; |
| 1816 | } else { | 1493 | if (IsRangeGranular(cpu_addr, copy.size)) { |
| 1817 | if (immediate_buffer.empty()) { | 1494 | upload_span = std::span(cpu_memory.GetPointer(cpu_addr), copy.size); |
| 1818 | immediate_buffer = ImmediateBuffer(largest_copy); | 1495 | } else { |
| 1496 | if (immediate_buffer.empty()) { | ||
| 1497 | immediate_buffer = ImmediateBuffer(largest_copy); | ||
| 1498 | } | ||
| 1499 | cpu_memory.ReadBlockUnsafe(cpu_addr, immediate_buffer.data(), copy.size); | ||
| 1500 | upload_span = immediate_buffer.subspan(0, copy.size); | ||
| 1819 | } | 1501 | } |
| 1820 | cpu_memory.ReadBlockUnsafe(cpu_addr, immediate_buffer.data(), copy.size); | 1502 | buffer.ImmediateUpload(copy.dst_offset, upload_span); |
| 1821 | upload_span = immediate_buffer.subspan(0, copy.size); | ||
| 1822 | } | 1503 | } |
| 1823 | buffer.ImmediateUpload(copy.dst_offset, upload_span); | ||
| 1824 | } | 1504 | } |
| 1825 | } | 1505 | } |
| 1826 | 1506 | ||
| 1827 | template <class P> | 1507 | template <class P> |
| 1828 | void BufferCache<P>::MappedUploadMemory(Buffer& buffer, u64 total_size_bytes, | 1508 | void BufferCache<P>::MappedUploadMemory([[maybe_unused]] Buffer& buffer, |
| 1829 | std::span<BufferCopy> copies) { | 1509 | [[maybe_unused]] u64 total_size_bytes, |
| 1830 | auto upload_staging = runtime.UploadStagingBuffer(total_size_bytes); | 1510 | [[maybe_unused]] std::span<BufferCopy> copies) { |
| 1831 | const std::span<u8> staging_pointer = upload_staging.mapped_span; | 1511 | if constexpr (USE_MEMORY_MAPS) { |
| 1832 | for (BufferCopy& copy : copies) { | 1512 | auto upload_staging = runtime.UploadStagingBuffer(total_size_bytes); |
| 1833 | u8* const src_pointer = staging_pointer.data() + copy.src_offset; | 1513 | const std::span<u8> staging_pointer = upload_staging.mapped_span; |
| 1834 | const VAddr cpu_addr = buffer.CpuAddr() + copy.dst_offset; | 1514 | for (BufferCopy& copy : copies) { |
| 1835 | cpu_memory.ReadBlockUnsafe(cpu_addr, src_pointer, copy.size); | 1515 | u8* const src_pointer = staging_pointer.data() + copy.src_offset; |
| 1516 | const VAddr cpu_addr = buffer.CpuAddr() + copy.dst_offset; | ||
| 1517 | cpu_memory.ReadBlockUnsafe(cpu_addr, src_pointer, copy.size); | ||
| 1836 | 1518 | ||
| 1837 | // Apply the staging offset | 1519 | // Apply the staging offset |
| 1838 | copy.src_offset += upload_staging.offset; | 1520 | copy.src_offset += upload_staging.offset; |
| 1521 | } | ||
| 1522 | runtime.CopyBuffer(buffer, upload_staging.buffer, copies); | ||
| 1839 | } | 1523 | } |
| 1840 | runtime.CopyBuffer(buffer, upload_staging.buffer, copies); | ||
| 1841 | } | 1524 | } |
| 1842 | 1525 | ||
| 1843 | template <class P> | 1526 | template <class P> |
| @@ -1886,30 +1569,31 @@ void BufferCache<P>::DownloadBufferMemory(Buffer& buffer, VAddr cpu_addr, u64 si | |||
| 1886 | boost::container::small_vector<BufferCopy, 1> copies; | 1569 | boost::container::small_vector<BufferCopy, 1> copies; |
| 1887 | u64 total_size_bytes = 0; | 1570 | u64 total_size_bytes = 0; |
| 1888 | u64 largest_copy = 0; | 1571 | u64 largest_copy = 0; |
| 1889 | buffer.ForEachDownloadRangeAndClear(cpu_addr, size, [&](u64 range_offset, u64 range_size) { | 1572 | memory_tracker.ForEachDownloadRangeAndClear( |
| 1890 | const VAddr buffer_addr = buffer.CpuAddr(); | 1573 | cpu_addr, size, [&](u64 cpu_addr_out, u64 range_size) { |
| 1891 | const auto add_download = [&](VAddr start, VAddr end) { | 1574 | const VAddr buffer_addr = buffer.CpuAddr(); |
| 1892 | const u64 new_offset = start - buffer_addr; | 1575 | const auto add_download = [&](VAddr start, VAddr end) { |
| 1893 | const u64 new_size = end - start; | 1576 | const u64 new_offset = start - buffer_addr; |
| 1894 | copies.push_back(BufferCopy{ | 1577 | const u64 new_size = end - start; |
| 1895 | .src_offset = new_offset, | 1578 | copies.push_back(BufferCopy{ |
| 1896 | .dst_offset = total_size_bytes, | 1579 | .src_offset = new_offset, |
| 1897 | .size = new_size, | 1580 | .dst_offset = total_size_bytes, |
| 1898 | }); | 1581 | .size = new_size, |
| 1899 | // Align up to avoid cache conflicts | 1582 | }); |
| 1900 | constexpr u64 align = 256ULL; | 1583 | // Align up to avoid cache conflicts |
| 1901 | constexpr u64 mask = ~(align - 1ULL); | 1584 | constexpr u64 align = 8ULL; |
| 1902 | total_size_bytes += (new_size + align - 1) & mask; | 1585 | constexpr u64 mask = ~(align - 1ULL); |
| 1903 | largest_copy = std::max(largest_copy, new_size); | 1586 | total_size_bytes += (new_size + align - 1) & mask; |
| 1904 | }; | 1587 | largest_copy = std::max(largest_copy, new_size); |
| 1905 | 1588 | }; | |
| 1906 | const VAddr start_address = buffer_addr + range_offset; | 1589 | |
| 1907 | const VAddr end_address = start_address + range_size; | 1590 | const VAddr start_address = cpu_addr_out; |
| 1908 | ForEachWrittenRange(start_address, range_size, add_download); | 1591 | const VAddr end_address = start_address + range_size; |
| 1909 | const IntervalType subtract_interval{start_address, end_address}; | 1592 | ForEachInRangeSet(common_ranges, start_address, range_size, add_download); |
| 1910 | ClearDownload(subtract_interval); | 1593 | const IntervalType subtract_interval{start_address, end_address}; |
| 1911 | common_ranges.subtract(subtract_interval); | 1594 | ClearDownload(subtract_interval); |
| 1912 | }); | 1595 | common_ranges.subtract(subtract_interval); |
| 1596 | }); | ||
| 1913 | if (total_size_bytes == 0) { | 1597 | if (total_size_bytes == 0) { |
| 1914 | return; | 1598 | return; |
| 1915 | } | 1599 | } |
| @@ -1943,7 +1627,7 @@ void BufferCache<P>::DownloadBufferMemory(Buffer& buffer, VAddr cpu_addr, u64 si | |||
| 1943 | } | 1627 | } |
| 1944 | 1628 | ||
| 1945 | template <class P> | 1629 | template <class P> |
| 1946 | void BufferCache<P>::DeleteBuffer(BufferId buffer_id) { | 1630 | void BufferCache<P>::DeleteBuffer(BufferId buffer_id, bool do_not_mark) { |
| 1947 | const auto scalar_replace = [buffer_id](Binding& binding) { | 1631 | const auto scalar_replace = [buffer_id](Binding& binding) { |
| 1948 | if (binding.buffer_id == buffer_id) { | 1632 | if (binding.buffer_id == buffer_id) { |
| 1949 | binding.buffer_id = BufferId{}; | 1633 | binding.buffer_id = BufferId{}; |
| @@ -1962,8 +1646,10 @@ void BufferCache<P>::DeleteBuffer(BufferId buffer_id) { | |||
| 1962 | std::erase(cached_write_buffer_ids, buffer_id); | 1646 | std::erase(cached_write_buffer_ids, buffer_id); |
| 1963 | 1647 | ||
| 1964 | // Mark the whole buffer as CPU written to stop tracking CPU writes | 1648 | // Mark the whole buffer as CPU written to stop tracking CPU writes |
| 1965 | Buffer& buffer = slot_buffers[buffer_id]; | 1649 | if (!do_not_mark) { |
| 1966 | buffer.MarkRegionAsCpuModified(buffer.CpuAddr(), buffer.SizeBytes()); | 1650 | Buffer& buffer = slot_buffers[buffer_id]; |
| 1651 | memory_tracker.MarkRegionAsCpuModified(buffer.CpuAddr(), buffer.SizeBytes()); | ||
| 1652 | } | ||
| 1967 | 1653 | ||
| 1968 | Unregister(buffer_id); | 1654 | Unregister(buffer_id); |
| 1969 | delayed_destruction_ring.Push(std::move(slot_buffers[buffer_id])); | 1655 | delayed_destruction_ring.Push(std::move(slot_buffers[buffer_id])); |
| @@ -2011,7 +1697,7 @@ typename BufferCache<P>::Binding BufferCache<P>::StorageBufferBinding(GPUVAddr s | |||
| 2011 | LOG_WARNING(HW_GPU, "Failed to find storage buffer for cbuf index {}", cbuf_index); | 1697 | LOG_WARNING(HW_GPU, "Failed to find storage buffer for cbuf index {}", cbuf_index); |
| 2012 | return NULL_BINDING; | 1698 | return NULL_BINDING; |
| 2013 | } | 1699 | } |
| 2014 | const VAddr cpu_end = Common::AlignUp(*cpu_addr + size, Core::Memory::YUZU_PAGESIZE); | 1700 | const VAddr cpu_end = Common::AlignUp(*cpu_addr + size, YUZU_PAGESIZE); |
| 2015 | const Binding binding{ | 1701 | const Binding binding{ |
| 2016 | .cpu_addr = *cpu_addr, | 1702 | .cpu_addr = *cpu_addr, |
| 2017 | .size = is_written ? size : static_cast<u32>(cpu_end - *cpu_addr), | 1703 | .size = is_written ? size : static_cast<u32>(cpu_end - *cpu_addr), |
diff --git a/src/video_core/buffer_cache/buffer_cache_base.h b/src/video_core/buffer_cache/buffer_cache_base.h new file mode 100644 index 000000000..4b3677da3 --- /dev/null +++ b/src/video_core/buffer_cache/buffer_cache_base.h | |||
| @@ -0,0 +1,507 @@ | |||
| 1 | // SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project | ||
| 2 | // SPDX-License-Identifier: GPL-3.0-or-later | ||
| 3 | |||
| 4 | #pragma once | ||
| 5 | |||
| 6 | #include <algorithm> | ||
| 7 | #include <array> | ||
| 8 | #include <memory> | ||
| 9 | #include <mutex> | ||
| 10 | #include <numeric> | ||
| 11 | #include <span> | ||
| 12 | #include <unordered_map> | ||
| 13 | #include <vector> | ||
| 14 | |||
| 15 | #include <boost/container/small_vector.hpp> | ||
| 16 | #define BOOST_NO_MT | ||
| 17 | #include <boost/pool/detail/mutex.hpp> | ||
| 18 | #undef BOOST_NO_MT | ||
| 19 | #include <boost/icl/interval_set.hpp> | ||
| 20 | #include <boost/pool/pool.hpp> | ||
| 21 | #include <boost/pool/pool_alloc.hpp> | ||
| 22 | |||
| 23 | #include "common/common_types.h" | ||
| 24 | #include "common/div_ceil.h" | ||
| 25 | #include "common/literals.h" | ||
| 26 | #include "common/lru_cache.h" | ||
| 27 | #include "common/microprofile.h" | ||
| 28 | #include "common/scope_exit.h" | ||
| 29 | #include "common/settings.h" | ||
| 30 | #include "core/memory.h" | ||
| 31 | #include "video_core/buffer_cache/buffer_base.h" | ||
| 32 | #include "video_core/control/channel_state_cache.h" | ||
| 33 | #include "video_core/delayed_destruction_ring.h" | ||
| 34 | #include "video_core/dirty_flags.h" | ||
| 35 | #include "video_core/engines/draw_manager.h" | ||
| 36 | #include "video_core/engines/kepler_compute.h" | ||
| 37 | #include "video_core/engines/maxwell_3d.h" | ||
| 38 | #include "video_core/memory_manager.h" | ||
| 39 | #include "video_core/rasterizer_interface.h" | ||
| 40 | #include "video_core/surface.h" | ||
| 41 | #include "video_core/texture_cache/slot_vector.h" | ||
| 42 | #include "video_core/texture_cache/types.h" | ||
| 43 | |||
| 44 | |||
| 45 | namespace boost { | ||
| 46 | template <typename T> | ||
| 47 | class fast_pool_allocator<T, default_user_allocator_new_delete, details::pool::default_mutex, 4096, | ||
| 48 | 0>; | ||
| 49 | } | ||
| 50 | |||
| 51 | namespace VideoCommon { | ||
| 52 | |||
| 53 | MICROPROFILE_DECLARE(GPU_PrepareBuffers); | ||
| 54 | MICROPROFILE_DECLARE(GPU_BindUploadBuffers); | ||
| 55 | MICROPROFILE_DECLARE(GPU_DownloadMemory); | ||
| 56 | |||
| 57 | using BufferId = SlotId; | ||
| 58 | |||
| 59 | using VideoCore::Surface::PixelFormat; | ||
| 60 | using namespace Common::Literals; | ||
| 61 | |||
| 62 | constexpr u32 NUM_VERTEX_BUFFERS = 32; | ||
| 63 | constexpr u32 NUM_TRANSFORM_FEEDBACK_BUFFERS = 4; | ||
| 64 | constexpr u32 NUM_GRAPHICS_UNIFORM_BUFFERS = 18; | ||
| 65 | constexpr u32 NUM_COMPUTE_UNIFORM_BUFFERS = 8; | ||
| 66 | constexpr u32 NUM_STORAGE_BUFFERS = 16; | ||
| 67 | constexpr u32 NUM_TEXTURE_BUFFERS = 16; | ||
| 68 | constexpr u32 NUM_STAGES = 5; | ||
| 69 | |||
| 70 | using UniformBufferSizes = std::array<std::array<u32, NUM_GRAPHICS_UNIFORM_BUFFERS>, NUM_STAGES>; | ||
| 71 | using ComputeUniformBufferSizes = std::array<u32, NUM_COMPUTE_UNIFORM_BUFFERS>; | ||
| 72 | |||
| 73 | enum class ObtainBufferSynchronize : u32 { | ||
| 74 | NoSynchronize = 0, | ||
| 75 | FullSynchronize = 1, | ||
| 76 | SynchronizeNoDirty = 2, | ||
| 77 | }; | ||
| 78 | |||
| 79 | enum class ObtainBufferOperation : u32 { | ||
| 80 | DoNothing = 0, | ||
| 81 | MarkAsWritten = 1, | ||
| 82 | DiscardWrite = 2, | ||
| 83 | MarkQuery = 3, | ||
| 84 | }; | ||
| 85 | |||
| 86 | template <typename P> | ||
| 87 | class BufferCache : public VideoCommon::ChannelSetupCaches<VideoCommon::ChannelInfo> { | ||
| 88 | // Page size for caching purposes. | ||
| 89 | // This is unrelated to the CPU page size, and it can be tuned to whatever seems optimal. | ||
| 90 | static constexpr u32 PAGE_BITS = 16; | ||
| 91 | static constexpr u64 PAGE_SIZE = u64{1} << PAGE_BITS; | ||
| 92 | static constexpr u32 CPU_PAGE_BITS = 12; | ||
| 93 | static constexpr u64 CPU_PAGE_SIZE = u64{1} << CPU_PAGE_BITS; | ||
| 94 | |||
| 95 | static constexpr bool IS_OPENGL = P::IS_OPENGL; | ||
| 96 | static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS = | ||
| 97 | P::HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS; | ||
| 98 | static constexpr bool HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT = | ||
| 99 | P::HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT; | ||
| 100 | static constexpr bool NEEDS_BIND_UNIFORM_INDEX = P::NEEDS_BIND_UNIFORM_INDEX; | ||
| 101 | static constexpr bool NEEDS_BIND_STORAGE_INDEX = P::NEEDS_BIND_STORAGE_INDEX; | ||
| 102 | static constexpr bool USE_MEMORY_MAPS = P::USE_MEMORY_MAPS; | ||
| 103 | static constexpr bool SEPARATE_IMAGE_BUFFERS_BINDINGS = P::SEPARATE_IMAGE_BUFFER_BINDINGS; | ||
| 104 | static constexpr bool IMPLEMENTS_ASYNC_DOWNLOADS = P::IMPLEMENTS_ASYNC_DOWNLOADS; | ||
| 105 | |||
| 106 | static constexpr BufferId NULL_BUFFER_ID{0}; | ||
| 107 | |||
| 108 | static constexpr s64 DEFAULT_EXPECTED_MEMORY = 512_MiB; | ||
| 109 | static constexpr s64 DEFAULT_CRITICAL_MEMORY = 1_GiB; | ||
| 110 | static constexpr s64 TARGET_THRESHOLD = 4_GiB; | ||
| 111 | |||
| 112 | using Maxwell = Tegra::Engines::Maxwell3D::Regs; | ||
| 113 | |||
| 114 | using Runtime = typename P::Runtime; | ||
| 115 | using Buffer = typename P::Buffer; | ||
| 116 | using Async_Buffer = typename P::Async_Buffer; | ||
| 117 | using MemoryTracker = typename P::MemoryTracker; | ||
| 118 | |||
| 119 | using IntervalCompare = ICL_COMPARE_INSTANCE(ICL_COMPARE_DEFAULT, VAddr); | ||
| 120 | using IntervalInstance = ICL_INTERVAL_INSTANCE(ICL_INTERVAL_DEFAULT, VAddr, IntervalCompare); | ||
| 121 | using IntervalAllocator = boost::fast_pool_allocator; | ||
| 122 | using IntervalSet = | ||
| 123 | boost::icl::interval_set<VAddr, IntervalCompare, IntervalInstance, IntervalAllocator>; | ||
| 124 | using IntervalType = typename IntervalSet::interval_type; | ||
| 125 | |||
| 126 | struct Empty {}; | ||
| 127 | |||
| 128 | struct OverlapResult { | ||
| 129 | std::vector<BufferId> ids; | ||
| 130 | VAddr begin; | ||
| 131 | VAddr end; | ||
| 132 | bool has_stream_leap = false; | ||
| 133 | }; | ||
| 134 | |||
| 135 | struct Binding { | ||
| 136 | VAddr cpu_addr{}; | ||
| 137 | u32 size{}; | ||
| 138 | BufferId buffer_id; | ||
| 139 | }; | ||
| 140 | |||
| 141 | struct TextureBufferBinding : Binding { | ||
| 142 | PixelFormat format; | ||
| 143 | }; | ||
| 144 | |||
| 145 | static constexpr Binding NULL_BINDING{ | ||
| 146 | .cpu_addr = 0, | ||
| 147 | .size = 0, | ||
| 148 | .buffer_id = NULL_BUFFER_ID, | ||
| 149 | }; | ||
| 150 | |||
| 151 | public: | ||
| 152 | static constexpr u32 DEFAULT_SKIP_CACHE_SIZE = static_cast<u32>(4_KiB); | ||
| 153 | |||
| 154 | explicit BufferCache(VideoCore::RasterizerInterface& rasterizer_, | ||
| 155 | Core::Memory::Memory& cpu_memory_, Runtime& runtime_); | ||
| 156 | |||
| 157 | void TickFrame(); | ||
| 158 | |||
| 159 | void WriteMemory(VAddr cpu_addr, u64 size); | ||
| 160 | |||
| 161 | void CachedWriteMemory(VAddr cpu_addr, u64 size); | ||
| 162 | |||
| 163 | void DownloadMemory(VAddr cpu_addr, u64 size); | ||
| 164 | |||
| 165 | bool InlineMemory(VAddr dest_address, size_t copy_size, std::span<const u8> inlined_buffer); | ||
| 166 | |||
| 167 | void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size); | ||
| 168 | |||
| 169 | void DisableGraphicsUniformBuffer(size_t stage, u32 index); | ||
| 170 | |||
| 171 | void UpdateGraphicsBuffers(bool is_indexed); | ||
| 172 | |||
| 173 | void UpdateComputeBuffers(); | ||
| 174 | |||
| 175 | void BindHostGeometryBuffers(bool is_indexed); | ||
| 176 | |||
| 177 | void BindHostStageBuffers(size_t stage); | ||
| 178 | |||
| 179 | void BindHostComputeBuffers(); | ||
| 180 | |||
| 181 | void SetUniformBuffersState(const std::array<u32, NUM_STAGES>& mask, | ||
| 182 | const UniformBufferSizes* sizes); | ||
| 183 | |||
| 184 | void SetComputeUniformBufferState(u32 mask, const ComputeUniformBufferSizes* sizes); | ||
| 185 | |||
| 186 | void UnbindGraphicsStorageBuffers(size_t stage); | ||
| 187 | |||
| 188 | void BindGraphicsStorageBuffer(size_t stage, size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset, | ||
| 189 | bool is_written); | ||
| 190 | |||
| 191 | void UnbindGraphicsTextureBuffers(size_t stage); | ||
| 192 | |||
| 193 | void BindGraphicsTextureBuffer(size_t stage, size_t tbo_index, GPUVAddr gpu_addr, u32 size, | ||
| 194 | PixelFormat format, bool is_written, bool is_image); | ||
| 195 | |||
| 196 | void UnbindComputeStorageBuffers(); | ||
| 197 | |||
| 198 | void BindComputeStorageBuffer(size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset, | ||
| 199 | bool is_written); | ||
| 200 | |||
| 201 | void UnbindComputeTextureBuffers(); | ||
| 202 | |||
| 203 | void BindComputeTextureBuffer(size_t tbo_index, GPUVAddr gpu_addr, u32 size, PixelFormat format, | ||
| 204 | bool is_written, bool is_image); | ||
| 205 | |||
| 206 | [[nodiscard]] std::pair<Buffer*, u32> ObtainBuffer(GPUVAddr gpu_addr, u32 size, | ||
| 207 | ObtainBufferSynchronize sync_info, | ||
| 208 | ObtainBufferOperation post_op); | ||
| 209 | void FlushCachedWrites(); | ||
| 210 | |||
| 211 | /// Return true when there are uncommitted buffers to be downloaded | ||
| 212 | [[nodiscard]] bool HasUncommittedFlushes() const noexcept; | ||
| 213 | |||
| 214 | void AccumulateFlushes(); | ||
| 215 | |||
| 216 | /// Return true when the caller should wait for async downloads | ||
| 217 | [[nodiscard]] bool ShouldWaitAsyncFlushes() const noexcept; | ||
| 218 | |||
| 219 | /// Commit asynchronous downloads | ||
| 220 | void CommitAsyncFlushes(); | ||
| 221 | void CommitAsyncFlushesHigh(); | ||
| 222 | void CommitAsyncQueries(); | ||
| 223 | |||
| 224 | /// Pop asynchronous downloads | ||
| 225 | void PopAsyncFlushes(); | ||
| 226 | |||
| 227 | void PopAsyncQueries(); | ||
| 228 | void PopAsyncBuffers(); | ||
| 229 | |||
| 230 | bool DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 amount); | ||
| 231 | |||
| 232 | bool DMAClear(GPUVAddr src_address, u64 amount, u32 value); | ||
| 233 | |||
| 234 | /// Return true when a CPU region is modified from the GPU | ||
| 235 | [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size); | ||
| 236 | |||
| 237 | /// Return true when a region is registered on the cache | ||
| 238 | [[nodiscard]] bool IsRegionRegistered(VAddr addr, size_t size); | ||
| 239 | |||
| 240 | /// Return true when a CPU region is modified from the CPU | ||
| 241 | [[nodiscard]] bool IsRegionCpuModified(VAddr addr, size_t size); | ||
| 242 | |||
| 243 | void SetDrawIndirect(const Tegra::Engines::DrawManager::IndirectParams* current_draw_indirect_) { | ||
| 244 | current_draw_indirect = current_draw_indirect_; | ||
| 245 | } | ||
| 246 | |||
| 247 | [[nodiscard]] std::pair<Buffer*, u32> GetDrawIndirectCount(); | ||
| 248 | |||
| 249 | [[nodiscard]] std::pair<Buffer*, u32> GetDrawIndirectBuffer(); | ||
| 250 | |||
| 251 | std::recursive_mutex mutex; | ||
| 252 | Runtime& runtime; | ||
| 253 | |||
| 254 | private: | ||
| 255 | template <typename Func> | ||
| 256 | static void ForEachEnabledBit(u32 enabled_mask, Func&& func) { | ||
| 257 | for (u32 index = 0; enabled_mask != 0; ++index, enabled_mask >>= 1) { | ||
| 258 | const int disabled_bits = std::countr_zero(enabled_mask); | ||
| 259 | index += disabled_bits; | ||
| 260 | enabled_mask >>= disabled_bits; | ||
| 261 | func(index); | ||
| 262 | } | ||
| 263 | } | ||
| 264 | |||
| 265 | template <typename Func> | ||
| 266 | void ForEachBufferInRange(VAddr cpu_addr, u64 size, Func&& func) { | ||
| 267 | const u64 page_end = Common::DivCeil(cpu_addr + size, PAGE_SIZE); | ||
| 268 | for (u64 page = cpu_addr >> PAGE_BITS; page < page_end;) { | ||
| 269 | const BufferId buffer_id = page_table[page]; | ||
| 270 | if (!buffer_id) { | ||
| 271 | ++page; | ||
| 272 | continue; | ||
| 273 | } | ||
| 274 | Buffer& buffer = slot_buffers[buffer_id]; | ||
| 275 | func(buffer_id, buffer); | ||
| 276 | |||
| 277 | const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes(); | ||
| 278 | page = Common::DivCeil(end_addr, PAGE_SIZE); | ||
| 279 | } | ||
| 280 | } | ||
| 281 | |||
| 282 | template <typename Func> | ||
| 283 | void ForEachInRangeSet(IntervalSet& current_range, VAddr cpu_addr, u64 size, Func&& func) { | ||
| 284 | const VAddr start_address = cpu_addr; | ||
| 285 | const VAddr end_address = start_address + size; | ||
| 286 | const IntervalType search_interval{start_address, end_address}; | ||
| 287 | auto it = current_range.lower_bound(search_interval); | ||
| 288 | if (it == current_range.end()) { | ||
| 289 | return; | ||
| 290 | } | ||
| 291 | auto end_it = current_range.upper_bound(search_interval); | ||
| 292 | for (; it != end_it; it++) { | ||
| 293 | VAddr inter_addr_end = it->upper(); | ||
| 294 | VAddr inter_addr = it->lower(); | ||
| 295 | if (inter_addr_end > end_address) { | ||
| 296 | inter_addr_end = end_address; | ||
| 297 | } | ||
| 298 | if (inter_addr < start_address) { | ||
| 299 | inter_addr = start_address; | ||
| 300 | } | ||
| 301 | func(inter_addr, inter_addr_end); | ||
| 302 | } | ||
| 303 | } | ||
| 304 | |||
| 305 | static bool IsRangeGranular(VAddr cpu_addr, size_t size) { | ||
| 306 | return (cpu_addr & ~Core::Memory::YUZU_PAGEMASK) == | ||
| 307 | ((cpu_addr + size) & ~Core::Memory::YUZU_PAGEMASK); | ||
| 308 | } | ||
| 309 | |||
| 310 | void RunGarbageCollector(); | ||
| 311 | |||
| 312 | void BindHostIndexBuffer(); | ||
| 313 | |||
| 314 | void BindHostVertexBuffers(); | ||
| 315 | |||
| 316 | void BindHostDrawIndirectBuffers(); | ||
| 317 | |||
| 318 | void BindHostGraphicsUniformBuffers(size_t stage); | ||
| 319 | |||
| 320 | void BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 binding_index, bool needs_bind); | ||
| 321 | |||
| 322 | void BindHostGraphicsStorageBuffers(size_t stage); | ||
| 323 | |||
| 324 | void BindHostGraphicsTextureBuffers(size_t stage); | ||
| 325 | |||
| 326 | void BindHostTransformFeedbackBuffers(); | ||
| 327 | |||
| 328 | void BindHostComputeUniformBuffers(); | ||
| 329 | |||
| 330 | void BindHostComputeStorageBuffers(); | ||
| 331 | |||
| 332 | void BindHostComputeTextureBuffers(); | ||
| 333 | |||
| 334 | void DoUpdateGraphicsBuffers(bool is_indexed); | ||
| 335 | |||
| 336 | void DoUpdateComputeBuffers(); | ||
| 337 | |||
| 338 | void UpdateIndexBuffer(); | ||
| 339 | |||
| 340 | void UpdateVertexBuffers(); | ||
| 341 | |||
| 342 | void UpdateVertexBuffer(u32 index); | ||
| 343 | |||
| 344 | void UpdateDrawIndirect(); | ||
| 345 | |||
| 346 | void UpdateUniformBuffers(size_t stage); | ||
| 347 | |||
| 348 | void UpdateStorageBuffers(size_t stage); | ||
| 349 | |||
| 350 | void UpdateTextureBuffers(size_t stage); | ||
| 351 | |||
| 352 | void UpdateTransformFeedbackBuffers(); | ||
| 353 | |||
| 354 | void UpdateTransformFeedbackBuffer(u32 index); | ||
| 355 | |||
| 356 | void UpdateComputeUniformBuffers(); | ||
| 357 | |||
| 358 | void UpdateComputeStorageBuffers(); | ||
| 359 | |||
| 360 | void UpdateComputeTextureBuffers(); | ||
| 361 | |||
| 362 | void MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 size); | ||
| 363 | |||
| 364 | [[nodiscard]] BufferId FindBuffer(VAddr cpu_addr, u32 size); | ||
| 365 | |||
| 366 | [[nodiscard]] OverlapResult ResolveOverlaps(VAddr cpu_addr, u32 wanted_size); | ||
| 367 | |||
| 368 | void JoinOverlap(BufferId new_buffer_id, BufferId overlap_id, bool accumulate_stream_score); | ||
| 369 | |||
| 370 | [[nodiscard]] BufferId CreateBuffer(VAddr cpu_addr, u32 wanted_size); | ||
| 371 | |||
| 372 | void Register(BufferId buffer_id); | ||
| 373 | |||
| 374 | void Unregister(BufferId buffer_id); | ||
| 375 | |||
| 376 | template <bool insert> | ||
| 377 | void ChangeRegister(BufferId buffer_id); | ||
| 378 | |||
| 379 | void TouchBuffer(Buffer& buffer, BufferId buffer_id) noexcept; | ||
| 380 | |||
| 381 | bool SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size); | ||
| 382 | |||
| 383 | bool SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size); | ||
| 384 | |||
| 385 | bool SynchronizeBufferNoModified(Buffer& buffer, VAddr cpu_addr, u32 size); | ||
| 386 | |||
| 387 | void UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy, | ||
| 388 | std::span<BufferCopy> copies); | ||
| 389 | |||
| 390 | void ImmediateUploadMemory(Buffer& buffer, u64 largest_copy, | ||
| 391 | std::span<const BufferCopy> copies); | ||
| 392 | |||
| 393 | void MappedUploadMemory(Buffer& buffer, u64 total_size_bytes, std::span<BufferCopy> copies); | ||
| 394 | |||
| 395 | void DownloadBufferMemory(Buffer& buffer); | ||
| 396 | |||
| 397 | void DownloadBufferMemory(Buffer& buffer, VAddr cpu_addr, u64 size); | ||
| 398 | |||
| 399 | void DeleteBuffer(BufferId buffer_id, bool do_not_mark = false); | ||
| 400 | |||
| 401 | void NotifyBufferDeletion(); | ||
| 402 | |||
| 403 | [[nodiscard]] Binding StorageBufferBinding(GPUVAddr ssbo_addr, u32 cbuf_index, bool is_written) const; | ||
| 404 | |||
| 405 | [[nodiscard]] TextureBufferBinding GetTextureBufferBinding(GPUVAddr gpu_addr, u32 size, | ||
| 406 | PixelFormat format); | ||
| 407 | |||
| 408 | [[nodiscard]] std::span<const u8> ImmediateBufferWithData(VAddr cpu_addr, size_t size); | ||
| 409 | |||
| 410 | [[nodiscard]] std::span<u8> ImmediateBuffer(size_t wanted_capacity); | ||
| 411 | |||
| 412 | [[nodiscard]] bool HasFastUniformBufferBound(size_t stage, u32 binding_index) const noexcept; | ||
| 413 | |||
| 414 | void ClearDownload(IntervalType subtract_interval); | ||
| 415 | |||
| 416 | VideoCore::RasterizerInterface& rasterizer; | ||
| 417 | Core::Memory::Memory& cpu_memory; | ||
| 418 | |||
| 419 | SlotVector<Buffer> slot_buffers; | ||
| 420 | DelayedDestructionRing<Buffer, 8> delayed_destruction_ring; | ||
| 421 | |||
| 422 | const Tegra::Engines::DrawManager::IndirectParams* current_draw_indirect{}; | ||
| 423 | |||
| 424 | u32 last_index_count = 0; | ||
| 425 | |||
| 426 | Binding index_buffer; | ||
| 427 | std::array<Binding, NUM_VERTEX_BUFFERS> vertex_buffers; | ||
| 428 | std::array<std::array<Binding, NUM_GRAPHICS_UNIFORM_BUFFERS>, NUM_STAGES> uniform_buffers; | ||
| 429 | std::array<std::array<Binding, NUM_STORAGE_BUFFERS>, NUM_STAGES> storage_buffers; | ||
| 430 | std::array<std::array<TextureBufferBinding, NUM_TEXTURE_BUFFERS>, NUM_STAGES> texture_buffers; | ||
| 431 | std::array<Binding, NUM_TRANSFORM_FEEDBACK_BUFFERS> transform_feedback_buffers; | ||
| 432 | Binding count_buffer_binding; | ||
| 433 | Binding indirect_buffer_binding; | ||
| 434 | |||
| 435 | std::array<Binding, NUM_COMPUTE_UNIFORM_BUFFERS> compute_uniform_buffers; | ||
| 436 | std::array<Binding, NUM_STORAGE_BUFFERS> compute_storage_buffers; | ||
| 437 | std::array<TextureBufferBinding, NUM_TEXTURE_BUFFERS> compute_texture_buffers; | ||
| 438 | |||
| 439 | std::array<u32, NUM_STAGES> enabled_uniform_buffer_masks{}; | ||
| 440 | u32 enabled_compute_uniform_buffer_mask = 0; | ||
| 441 | |||
| 442 | const UniformBufferSizes* uniform_buffer_sizes{}; | ||
| 443 | const ComputeUniformBufferSizes* compute_uniform_buffer_sizes{}; | ||
| 444 | |||
| 445 | std::array<u32, NUM_STAGES> enabled_storage_buffers{}; | ||
| 446 | std::array<u32, NUM_STAGES> written_storage_buffers{}; | ||
| 447 | u32 enabled_compute_storage_buffers = 0; | ||
| 448 | u32 written_compute_storage_buffers = 0; | ||
| 449 | |||
| 450 | std::array<u32, NUM_STAGES> enabled_texture_buffers{}; | ||
| 451 | std::array<u32, NUM_STAGES> written_texture_buffers{}; | ||
| 452 | std::array<u32, NUM_STAGES> image_texture_buffers{}; | ||
| 453 | u32 enabled_compute_texture_buffers = 0; | ||
| 454 | u32 written_compute_texture_buffers = 0; | ||
| 455 | u32 image_compute_texture_buffers = 0; | ||
| 456 | |||
| 457 | std::array<u32, 16> uniform_cache_hits{}; | ||
| 458 | std::array<u32, 16> uniform_cache_shots{}; | ||
| 459 | |||
| 460 | u32 uniform_buffer_skip_cache_size = DEFAULT_SKIP_CACHE_SIZE; | ||
| 461 | |||
| 462 | bool has_deleted_buffers = false; | ||
| 463 | |||
| 464 | std::conditional_t<HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS, std::array<u32, NUM_STAGES>, Empty> | ||
| 465 | dirty_uniform_buffers{}; | ||
| 466 | std::conditional_t<IS_OPENGL, std::array<u32, NUM_STAGES>, Empty> fast_bound_uniform_buffers{}; | ||
| 467 | std::conditional_t<HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS, | ||
| 468 | std::array<std::array<u32, NUM_GRAPHICS_UNIFORM_BUFFERS>, NUM_STAGES>, Empty> | ||
| 469 | uniform_buffer_binding_sizes{}; | ||
| 470 | |||
| 471 | std::vector<BufferId> cached_write_buffer_ids; | ||
| 472 | |||
| 473 | MemoryTracker memory_tracker; | ||
| 474 | IntervalSet uncommitted_ranges; | ||
| 475 | IntervalSet common_ranges; | ||
| 476 | IntervalSet cached_ranges; | ||
| 477 | std::deque<IntervalSet> committed_ranges; | ||
| 478 | |||
| 479 | // Async Buffers | ||
| 480 | std::deque<IntervalSet> async_downloads; | ||
| 481 | std::deque<std::optional<Async_Buffer>> async_buffers; | ||
| 482 | std::deque<boost::container::small_vector<BufferCopy, 4>> pending_downloads; | ||
| 483 | std::optional<Async_Buffer> current_buffer; | ||
| 484 | |||
| 485 | // queries | ||
| 486 | boost::container::small_vector<std::pair<VAddr, size_t>, 8> pending_queries; | ||
| 487 | std::deque<boost::container::small_vector<BufferCopy, 8>> committed_queries; | ||
| 488 | boost::container::small_vector<u64, 8> flushed_queries; | ||
| 489 | std::deque<std::optional<Async_Buffer>> query_async_buffers; | ||
| 490 | |||
| 491 | size_t immediate_buffer_capacity = 0; | ||
| 492 | Common::ScratchBuffer<u8> immediate_buffer_alloc; | ||
| 493 | |||
| 494 | struct LRUItemParams { | ||
| 495 | using ObjectType = BufferId; | ||
| 496 | using TickType = u64; | ||
| 497 | }; | ||
| 498 | Common::LeastRecentlyUsedCache<LRUItemParams> lru_cache; | ||
| 499 | u64 frame_tick = 0; | ||
| 500 | u64 total_used_memory = 0; | ||
| 501 | u64 minimum_memory = 0; | ||
| 502 | u64 critical_memory = 0; | ||
| 503 | |||
| 504 | std::array<BufferId, ((1ULL << 39) >> PAGE_BITS)> page_table; | ||
| 505 | }; | ||
| 506 | |||
| 507 | } // namespace VideoCommon | ||
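Note: the ForEachEnabledBit helper above is how the cache visits only the bound slots of a 32-bit enable mask, skipping disabled runs with std::countr_zero instead of probing every bit. A minimal standalone sketch of the same scan (C++20; names outside the cache are illustrative):

    #include <bit>
    #include <cstdint>
    #include <cstdio>

    // Same countr_zero scan as BufferCache::ForEachEnabledBit: jump over
    // runs of zero bits rather than testing each position.
    template <typename Func>
    void ForEachEnabledBit(std::uint32_t enabled_mask, Func&& func) {
        for (std::uint32_t index = 0; enabled_mask != 0; ++index, enabled_mask >>= 1) {
            const int disabled_bits = std::countr_zero(enabled_mask);
            index += disabled_bits;
            enabled_mask >>= disabled_bits;
            func(index);
        }
    }

    int main() {
        // Bits 0, 3 and 7 set -> prints 0, 3, 7
        ForEachEnabledBit(0b1000'1001u, [](std::uint32_t index) { std::printf("%u\n", index); });
    }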
diff --git a/src/video_core/buffer_cache/memory_tracker_base.h b/src/video_core/buffer_cache/memory_tracker_base.h new file mode 100644 index 000000000..93bd779c9 --- /dev/null +++ b/src/video_core/buffer_cache/memory_tracker_base.h | |||
| @@ -0,0 +1,258 @@ | |||
| 1 | // SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project | ||
| 2 | // SPDX-License-Identifier: GPL-3.0-or-later | ||
| 3 | |||
| 4 | #pragma once | ||
| 5 | |||
| 6 | #include <algorithm> | ||
| 7 | #include <bit> | ||
| 8 | #include <deque> | ||
| 9 | #include <limits> | ||
| 10 | #include <type_traits> | ||
| 11 | #include <unordered_set> | ||
| 12 | #include <utility> | ||
| 13 | |||
| 14 | #include "common/alignment.h" | ||
| 15 | #include "common/common_types.h" | ||
| 16 | #include "video_core/buffer_cache/word_manager.h" | ||
| 17 | |||
| 18 | namespace VideoCommon { | ||
| 19 | |||
| 20 | template <class RasterizerInterface> | ||
| 21 | class MemoryTrackerBase { | ||
| 22 | static constexpr size_t MAX_CPU_PAGE_BITS = 39; | ||
| 23 | static constexpr size_t HIGHER_PAGE_BITS = 22; | ||
| 24 | static constexpr size_t HIGHER_PAGE_SIZE = 1ULL << HIGHER_PAGE_BITS; | ||
| 25 | static constexpr size_t HIGHER_PAGE_MASK = HIGHER_PAGE_SIZE - 1ULL; | ||
| 26 | static constexpr size_t NUM_HIGH_PAGES = 1ULL << (MAX_CPU_PAGE_BITS - HIGHER_PAGE_BITS); | ||
| 27 | static constexpr size_t MANAGER_POOL_SIZE = 32; | ||
| 28 | static constexpr size_t WORDS_STACK_NEEDED = HIGHER_PAGE_SIZE / BYTES_PER_WORD; | ||
| 29 | using Manager = WordManager<RasterizerInterface, WORDS_STACK_NEEDED>; | ||
| 30 | |||
| 31 | public: | ||
| 32 | MemoryTrackerBase(RasterizerInterface& rasterizer_) : rasterizer{&rasterizer_} {} | ||
| 33 | ~MemoryTrackerBase() = default; | ||
| 34 | |||
| 35 | /// Returns the inclusive CPU modified range as a [begin, end] pair | ||
| 36 | [[nodiscard]] std::pair<u64, u64> ModifiedCpuRegion(VAddr query_cpu_addr, | ||
| 37 | u64 query_size) noexcept { | ||
| 38 | return IteratePairs<true>(query_cpu_addr, query_size, | ||
| 39 | [](Manager* manager, u64 offset, size_t size) { | ||
| 40 | return manager->ModifiedRegion<Type::CPU>(offset, size); | ||
| 41 | }); | ||
| 42 | } | ||
| 43 | |||
| 44 | /// Returns the inclusive GPU modified range as a [begin, end] pair | ||
| 45 | [[nodiscard]] std::pair<u64, u64> ModifiedGpuRegion(VAddr query_cpu_addr, | ||
| 46 | u64 query_size) noexcept { | ||
| 47 | return IteratePairs<false>(query_cpu_addr, query_size, | ||
| 48 | [](Manager* manager, u64 offset, size_t size) { | ||
| 49 | return manager->ModifiedRegion<Type::GPU>(offset, size); | ||
| 50 | }); | ||
| 51 | } | ||
| 52 | |||
| 53 | /// Returns true if a region has been modified from the CPU | ||
| 54 | [[nodiscard]] bool IsRegionCpuModified(VAddr query_cpu_addr, u64 query_size) noexcept { | ||
| 55 | return IteratePages<true>(query_cpu_addr, query_size, | ||
| 56 | [](Manager* manager, u64 offset, size_t size) { | ||
| 57 | return manager->IsRegionModified<Type::CPU>(offset, size); | ||
| 58 | }); | ||
| 59 | } | ||
| 60 | |||
| 61 | /// Returns true if a region has been modified from the GPU | ||
| 62 | [[nodiscard]] bool IsRegionGpuModified(VAddr query_cpu_addr, u64 query_size) noexcept { | ||
| 63 | return IteratePages<false>(query_cpu_addr, query_size, | ||
| 64 | [](Manager* manager, u64 offset, size_t size) { | ||
| 65 | return manager->IsRegionModified<Type::GPU>(offset, size); | ||
| 66 | }); | ||
| 67 | } | ||
| 68 | |||
| 69 | /// Mark region as CPU modified, notifying the rasterizer about this change | ||
| 70 | void MarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 query_size) { | ||
| 71 | IteratePages<true>( | ||
| 72 | dirty_cpu_addr, query_size, [](Manager* manager, u64 offset, size_t size) { | ||
| 73 | manager->ChangeRegionState<Type::CPU, true>(manager->GetCpuAddr() + offset, size); | ||
| 74 | }); | ||
| 75 | } | ||
| 76 | |||
| 77 | /// Unmark region as CPU modified, notifying the rasterizer about this change | ||
| 78 | void UnmarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 query_size) { | ||
| 79 | IteratePages<true>( | ||
| 80 | dirty_cpu_addr, query_size, [](Manager* manager, u64 offset, size_t size) { | ||
| 81 | manager->ChangeRegionState<Type::CPU, false>(manager->GetCpuAddr() + offset, size); | ||
| 82 | }); | ||
| 83 | } | ||
| 84 | |||
| 85 | /// Mark region as modified from the host GPU | ||
| 86 | void MarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 query_size) noexcept { | ||
| 87 | IteratePages<true>( | ||
| 88 | dirty_cpu_addr, query_size, [](Manager* manager, u64 offset, size_t size) { | ||
| 89 | manager->ChangeRegionState<Type::GPU, true>(manager->GetCpuAddr() + offset, size); | ||
| 90 | }); | ||
| 91 | } | ||
| 92 | |||
| 93 | /// Unmark region as modified from the host GPU | ||
| 94 | void UnmarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 query_size) noexcept { | ||
| 95 | IteratePages<true>( | ||
| 96 | dirty_cpu_addr, query_size, [](Manager* manager, u64 offset, size_t size) { | ||
| 97 | manager->ChangeRegionState<Type::GPU, false>(manager->GetCpuAddr() + offset, size); | ||
| 98 | }); | ||
| 99 | } | ||
| 100 | |||
| 101 | /// Mark region as modified from the CPU, deferring the state change | ||
| 102 | /// and rasterizer notification until FlushCachedWrites is called. | ||
| 103 | void CachedCpuWrite(VAddr dirty_cpu_addr, u64 query_size) { | ||
| 104 | IteratePages<true>( | ||
| 105 | dirty_cpu_addr, query_size, [this](Manager* manager, u64 offset, size_t size) { | ||
| 106 | const VAddr cpu_address = manager->GetCpuAddr() + offset; | ||
| 107 | manager->ChangeRegionState<Type::CachedCPU, true>(cpu_address, size); | ||
| 108 | cached_pages.insert(static_cast<u32>(cpu_address >> HIGHER_PAGE_BITS)); | ||
| 109 | }); | ||
| 110 | } | ||
| 111 | |||
| 112 | /// Flushes cached CPU writes and notifies the rasterizer about the deltas | ||
| 113 | void FlushCachedWrites(VAddr query_cpu_addr, u64 query_size) noexcept { | ||
| 114 | IteratePages<false>(query_cpu_addr, query_size, | ||
| 115 | [](Manager* manager, [[maybe_unused]] u64 offset, | ||
| 116 | [[maybe_unused]] size_t size) { manager->FlushCachedWrites(); }); | ||
| 117 | } | ||
| 118 | |||
| 119 | void FlushCachedWrites() noexcept { | ||
| 120 | for (auto id : cached_pages) { | ||
| 121 | top_tier[id]->FlushCachedWrites(); | ||
| 122 | } | ||
| 123 | cached_pages.clear(); | ||
| 124 | } | ||
| 125 | |||
| 126 | /// Call 'func' for each CPU modified range and unmark those pages as CPU modified | ||
| 127 | template <typename Func> | ||
| 128 | void ForEachUploadRange(VAddr query_cpu_range, u64 query_size, Func&& func) { | ||
| 129 | IteratePages<true>(query_cpu_range, query_size, | ||
| 130 | [&func](Manager* manager, u64 offset, size_t size) { | ||
| 131 | manager->ForEachModifiedRange<Type::CPU>( | ||
| 132 | manager->GetCpuAddr() + offset, size, true, func); | ||
| 133 | }); | ||
| 134 | } | ||
| 135 | |||
| 136 | /// Call 'func' for each GPU modified range; unmark the pages when 'clear' is true | ||
| 137 | template <typename Func> | ||
| 138 | void ForEachDownloadRange(VAddr query_cpu_range, u64 query_size, bool clear, Func&& func) { | ||
| 139 | IteratePages<false>(query_cpu_range, query_size, | ||
| 140 | [&func, clear](Manager* manager, u64 offset, size_t size) { | ||
| 141 | manager->ForEachModifiedRange<Type::GPU>( | ||
| 142 | manager->GetCpuAddr() + offset, size, clear, func); | ||
| 143 | }); | ||
| 144 | } | ||
| 145 | |||
| 146 | template <typename Func> | ||
| 147 | void ForEachDownloadRangeAndClear(VAddr query_cpu_range, u64 query_size, Func&& func) { | ||
| 148 | IteratePages<false>(query_cpu_range, query_size, | ||
| 149 | [&func](Manager* manager, u64 offset, size_t size) { | ||
| 150 | manager->ForEachModifiedRange<Type::GPU>( | ||
| 151 | manager->GetCpuAddr() + offset, size, true, func); | ||
| 152 | }); | ||
| 153 | } | ||
| 154 | |||
| 155 | private: | ||
| 156 | template <bool create_region_on_fail, typename Func> | ||
| 157 | bool IteratePages(VAddr cpu_address, size_t size, Func&& func) { | ||
| 158 | using FuncReturn = typename std::invoke_result<Func, Manager*, u64, size_t>::type; | ||
| 159 | static constexpr bool BOOL_BREAK = std::is_same_v<FuncReturn, bool>; | ||
| 160 | std::size_t remaining_size{size}; | ||
| 161 | std::size_t page_index{cpu_address >> HIGHER_PAGE_BITS}; | ||
| 162 | u64 page_offset{cpu_address & HIGHER_PAGE_MASK}; | ||
| 163 | while (remaining_size > 0) { | ||
| 164 | const std::size_t copy_amount{std::min(HIGHER_PAGE_SIZE - page_offset, remaining_size)}; | ||
| 165 | auto* manager{top_tier[page_index]}; | ||
| 166 | if (manager) { | ||
| 167 | if constexpr (BOOL_BREAK) { | ||
| 168 | if (func(manager, page_offset, copy_amount)) { | ||
| 169 | return true; | ||
| 170 | } | ||
| 171 | } else { | ||
| 172 | func(manager, page_offset, copy_amount); | ||
| 173 | } | ||
| 174 | } else if constexpr (create_region_on_fail) { | ||
| 175 | CreateRegion(page_index); | ||
| 176 | manager = top_tier[page_index]; | ||
| 177 | if constexpr (BOOL_BREAK) { | ||
| 178 | if (func(manager, page_offset, copy_amount)) { | ||
| 179 | return true; | ||
| 180 | } | ||
| 181 | } else { | ||
| 182 | func(manager, page_offset, copy_amount); | ||
| 183 | } | ||
| 184 | } | ||
| 185 | page_index++; | ||
| 186 | page_offset = 0; | ||
| 187 | remaining_size -= copy_amount; | ||
| 188 | } | ||
| 189 | return false; | ||
| 190 | } | ||
| 191 | |||
| 192 | template <bool create_region_on_fail, typename Func> | ||
| 193 | std::pair<u64, u64> IteratePairs(VAddr cpu_address, size_t size, Func&& func) { | ||
| 194 | std::size_t remaining_size{size}; | ||
| 195 | std::size_t page_index{cpu_address >> HIGHER_PAGE_BITS}; | ||
| 196 | u64 page_offset{cpu_address & HIGHER_PAGE_MASK}; | ||
| 197 | u64 begin = std::numeric_limits<u64>::max(); | ||
| 198 | u64 end = 0; | ||
| 199 | while (remaining_size > 0) { | ||
| 200 | const std::size_t copy_amount{std::min(HIGHER_PAGE_SIZE - page_offset, remaining_size)}; | ||
| 201 | auto* manager{top_tier[page_index]}; | ||
| 202 | const auto execute = [&] { | ||
| 203 | auto [new_begin, new_end] = func(manager, page_offset, copy_amount); | ||
| 204 | if (new_begin != 0 || new_end != 0) { | ||
| 205 | const u64 base_address = page_index << HIGHER_PAGE_BITS; | ||
| 206 | begin = std::min(new_begin + base_address, begin); | ||
| 207 | end = std::max(new_end + base_address, end); | ||
| 208 | } | ||
| 209 | }; | ||
| 210 | if (manager) { | ||
| 211 | execute(); | ||
| 212 | } else if constexpr (create_region_on_fail) { | ||
| 213 | CreateRegion(page_index); | ||
| 214 | manager = top_tier[page_index]; | ||
| 215 | execute(); | ||
| 216 | } | ||
| 217 | page_index++; | ||
| 218 | page_offset = 0; | ||
| 219 | remaining_size -= copy_amount; | ||
| 220 | } | ||
| 221 | return begin < end ? std::make_pair(begin, end) : std::make_pair(0ULL, 0ULL); | ||
| 222 | } | ||
| 223 | |||
| 224 | void CreateRegion(std::size_t page_index) { | ||
| 225 | const VAddr base_cpu_addr = page_index << HIGHER_PAGE_BITS; | ||
| 226 | top_tier[page_index] = GetNewManager(base_cpu_addr); | ||
| 227 | } | ||
| 228 | |||
| 229 | Manager* GetNewManager(VAddr base_cpu_address) { | ||
| 230 | const auto on_return = [&] { | ||
| 231 | auto* new_manager = free_managers.front(); | ||
| 232 | new_manager->SetCpuAddress(base_cpu_address); | ||
| 233 | free_managers.pop_front(); | ||
| 234 | return new_manager; | ||
| 235 | }; | ||
| 236 | if (!free_managers.empty()) { | ||
| 237 | return on_return(); | ||
| 238 | } | ||
| 239 | manager_pool.emplace_back(); | ||
| 240 | auto& last_pool = manager_pool.back(); | ||
| 241 | for (size_t i = 0; i < MANAGER_POOL_SIZE; i++) { | ||
| 242 | new (&last_pool[i]) Manager(0, *rasterizer, HIGHER_PAGE_SIZE); | ||
| 243 | free_managers.push_back(&last_pool[i]); | ||
| 244 | } | ||
| 245 | return on_return(); | ||
| 246 | } | ||
| 247 | |||
| 248 | std::deque<std::array<Manager, MANAGER_POOL_SIZE>> manager_pool; | ||
| 249 | std::deque<Manager*> free_managers; | ||
| 250 | |||
| 251 | std::array<Manager*, NUM_HIGH_PAGES> top_tier{}; | ||
| 252 | |||
| 253 | std::unordered_set<u32> cached_pages; | ||
| 254 | |||
| 255 | RasterizerInterface* rasterizer = nullptr; | ||
| 256 | }; | ||
| 257 | |||
| 258 | } // namespace VideoCommon | ||
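Note: MemoryTrackerBase splits the 39-bit guest address space into 4 MiB high pages (HIGHER_PAGE_BITS = 22) and lazily allocates one WordManager per touched page from a pooled free list. A hedged standalone sketch of the address split IteratePages performs (example addresses and sizes are made up):

    #include <algorithm>
    #include <cinttypes>
    #include <cstdint>
    #include <cstdio>

    // Two-level split: the top bits select a lazily created WordManager
    // ("high page"), the low bits are the byte offset inside it; queries
    // are clamped to high-page boundaries.
    constexpr std::uint64_t HIGHER_PAGE_BITS = 22; // 4 MiB high pages
    constexpr std::uint64_t HIGHER_PAGE_SIZE = 1ULL << HIGHER_PAGE_BITS;
    constexpr std::uint64_t HIGHER_PAGE_MASK = HIGHER_PAGE_SIZE - 1ULL;

    int main() {
        const std::uint64_t cpu_address = 0x1234'5678ULL; // arbitrary example
        std::uint64_t remaining = 0x80'0000;              // 8 MiB query
        std::uint64_t page_index = cpu_address >> HIGHER_PAGE_BITS;
        std::uint64_t page_offset = cpu_address & HIGHER_PAGE_MASK;
        while (remaining > 0) {
            // Clamp each chunk to the end of the current high page, as
            // IteratePages does before invoking its callback.
            const std::uint64_t chunk = std::min(HIGHER_PAGE_SIZE - page_offset, remaining);
            std::printf("manager %" PRIu64 ": offset 0x%" PRIx64 ", size 0x%" PRIx64 "\n",
                        page_index, page_offset, chunk);
            ++page_index;
            page_offset = 0;
            remaining -= chunk;
        }
    }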
diff --git a/src/video_core/buffer_cache/word_manager.h b/src/video_core/buffer_cache/word_manager.h new file mode 100644 index 000000000..782951fe7 --- /dev/null +++ b/src/video_core/buffer_cache/word_manager.h | |||
| @@ -0,0 +1,474 @@ | |||
| 1 | // SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project | ||
| 2 | // SPDX-License-Identifier: GPL-3.0-or-later | ||
| 3 | |||
| 4 | #pragma once | ||
| 5 | |||
| 6 | #include <algorithm> | ||
| 7 | #include <bit> | ||
| 8 | #include <limits> | ||
| 9 | #include <utility> | ||
| 10 | |||
| 11 | #include "common/alignment.h" | ||
| 12 | #include "common/common_funcs.h" | ||
| 13 | #include "common/common_types.h" | ||
| 14 | #include "common/div_ceil.h" | ||
| 15 | #include "core/memory.h" | ||
| 16 | |||
| 17 | namespace VideoCommon { | ||
| 18 | |||
| 19 | constexpr u64 PAGES_PER_WORD = 64; | ||
| 20 | constexpr u64 BYTES_PER_PAGE = Core::Memory::YUZU_PAGESIZE; | ||
| 21 | constexpr u64 BYTES_PER_WORD = PAGES_PER_WORD * BYTES_PER_PAGE; | ||
| 22 | |||
| 23 | /// Tightly packed bit vector tracking modified pages, with a small-vector optimization | ||
| 24 | template <size_t stack_words = 1> | ||
| 25 | union WordsArray { | ||
| 26 | /// Returns the pointer to the words state | ||
| 27 | [[nodiscard]] const u64* Pointer(bool is_short) const noexcept { | ||
| 28 | return is_short ? stack.data() : heap; | ||
| 29 | } | ||
| 30 | |||
| 31 | /// Returns the pointer to the words state | ||
| 32 | [[nodiscard]] u64* Pointer(bool is_short) noexcept { | ||
| 33 | return is_short ? stack.data() : heap; | ||
| 34 | } | ||
| 35 | |||
| 36 | std::array<u64, stack_words> stack{}; ///< Storage for small buffers | ||
| 37 | u64* heap; ///< Pointer to the storage for larger buffers | ||
| 38 | }; | ||
| 39 | |||
| 40 | template <size_t stack_words = 1> | ||
| 41 | struct Words { | ||
| 42 | explicit Words() = default; | ||
| 43 | explicit Words(u64 size_bytes_) : size_bytes{size_bytes_} { | ||
| 44 | if (IsShort()) { | ||
| 45 | cpu.stack.fill(~u64{0}); | ||
| 46 | gpu.stack.fill(0); | ||
| 47 | cached_cpu.stack.fill(0); | ||
| 48 | untracked.stack.fill(~u64{0}); | ||
| 49 | } else { | ||
| 50 | const size_t num_words = NumWords(); | ||
| 51 | // Share allocation between CPU and GPU pages and set their default values | ||
| 52 | u64* const alloc = new u64[num_words * 4]; | ||
| 53 | cpu.heap = alloc; | ||
| 54 | gpu.heap = alloc + num_words; | ||
| 55 | cached_cpu.heap = alloc + num_words * 2; | ||
| 56 | untracked.heap = alloc + num_words * 3; | ||
| 57 | std::fill_n(cpu.heap, num_words, ~u64{0}); | ||
| 58 | std::fill_n(gpu.heap, num_words, 0); | ||
| 59 | std::fill_n(cached_cpu.heap, num_words, 0); | ||
| 60 | std::fill_n(untracked.heap, num_words, ~u64{0}); | ||
| 61 | } | ||
| 62 | // Clean up trailing bits | ||
| 63 | const u64 last_word_size = size_bytes % BYTES_PER_WORD; | ||
| 64 | const u64 last_local_page = Common::DivCeil(last_word_size, BYTES_PER_PAGE); | ||
| 65 | const u64 shift = (PAGES_PER_WORD - last_local_page) % PAGES_PER_WORD; | ||
| 66 | const u64 last_word = (~u64{0} << shift) >> shift; | ||
| 67 | cpu.Pointer(IsShort())[NumWords() - 1] = last_word; | ||
| 68 | untracked.Pointer(IsShort())[NumWords() - 1] = last_word; | ||
| 69 | } | ||
| 70 | |||
| 71 | ~Words() { | ||
| 72 | Release(); | ||
| 73 | } | ||
| 74 | |||
| 75 | Words& operator=(Words&& rhs) noexcept { | ||
| 76 | Release(); | ||
| 77 | size_bytes = rhs.size_bytes; | ||
| 78 | cpu = rhs.cpu; | ||
| 79 | gpu = rhs.gpu; | ||
| 80 | cached_cpu = rhs.cached_cpu; | ||
| 81 | untracked = rhs.untracked; | ||
| 82 | rhs.cpu.heap = nullptr; | ||
| 83 | return *this; | ||
| 84 | } | ||
| 85 | |||
| 86 | Words(Words&& rhs) noexcept | ||
| 87 | : size_bytes{rhs.size_bytes}, cpu{rhs.cpu}, gpu{rhs.gpu}, | ||
| 88 | cached_cpu{rhs.cached_cpu}, untracked{rhs.untracked} { | ||
| 89 | rhs.cpu.heap = nullptr; | ||
| 90 | } | ||
| 91 | |||
| 92 | Words& operator=(const Words&) = delete; | ||
| 93 | Words(const Words&) = delete; | ||
| 94 | |||
| 95 | /// Returns true when the buffer fits in the small vector optimization | ||
| 96 | [[nodiscard]] bool IsShort() const noexcept { | ||
| 97 | return size_bytes <= stack_words * BYTES_PER_WORD; | ||
| 98 | } | ||
| 99 | |||
| 100 | /// Returns the number of words of the buffer | ||
| 101 | [[nodiscard]] size_t NumWords() const noexcept { | ||
| 102 | return Common::DivCeil(size_bytes, BYTES_PER_WORD); | ||
| 103 | } | ||
| 104 | |||
| 105 | /// Release buffer resources | ||
| 106 | void Release() { | ||
| 107 | if (!IsShort()) { | ||
| 108 | // The CPU word array is the base of the shared heap allocation | ||
| 109 | delete[] cpu.heap; | ||
| 110 | } | ||
| 111 | } | ||
| 112 | |||
| 113 | u64 size_bytes = 0; | ||
| 114 | WordsArray<stack_words> cpu; | ||
| 115 | WordsArray<stack_words> gpu; | ||
| 116 | WordsArray<stack_words> cached_cpu; | ||
| 117 | WordsArray<stack_words> untracked; | ||
| 118 | }; | ||
| 119 | |||
| 120 | enum class Type { | ||
| 121 | CPU, | ||
| 122 | GPU, | ||
| 123 | CachedCPU, | ||
| 124 | Untracked, | ||
| 125 | }; | ||
| 126 | |||
| 127 | template <class RasterizerInterface, size_t stack_words = 1> | ||
| 128 | class WordManager { | ||
| 129 | public: | ||
| 130 | explicit WordManager(VAddr cpu_addr_, RasterizerInterface& rasterizer_, u64 size_bytes) | ||
| 131 | : cpu_addr{cpu_addr_}, rasterizer{&rasterizer_}, words{size_bytes} {} | ||
| 132 | |||
| 133 | explicit WordManager() = default; | ||
| 134 | |||
| 135 | void SetCpuAddress(VAddr new_cpu_addr) { | ||
| 136 | cpu_addr = new_cpu_addr; | ||
| 137 | } | ||
| 138 | |||
| 139 | VAddr GetCpuAddr() const { | ||
| 140 | return cpu_addr; | ||
| 141 | } | ||
| 142 | |||
| 143 | /** | ||
| 144 | * Change the state of a range of pages | ||
| 145 | * | ||
| 146 | * @param dirty_addr Base address to mark or unmark as modified | ||
| 147 | * @param size Size in bytes to mark or unmark as modified | ||
| 148 | */ | ||
| 149 | template <Type type, bool enable> | ||
| 150 | void ChangeRegionState(u64 dirty_addr, s64 size) noexcept(type == Type::GPU) { | ||
| 151 | const s64 difference = dirty_addr - cpu_addr; | ||
| 152 | const u64 offset = std::max<s64>(difference, 0); | ||
| 153 | size += std::min<s64>(difference, 0); | ||
| 154 | if (offset >= SizeBytes() || size < 0) { | ||
| 155 | return; | ||
| 156 | } | ||
| 157 | u64* const untracked_words = Array<Type::Untracked>(); | ||
| 158 | u64* const state_words = Array<type>(); | ||
| 159 | const u64 offset_end = std::min(offset + size, SizeBytes()); | ||
| 160 | const u64 begin_page_index = offset / BYTES_PER_PAGE; | ||
| 161 | const u64 begin_word_index = begin_page_index / PAGES_PER_WORD; | ||
| 162 | const u64 end_page_index = Common::DivCeil(offset_end, BYTES_PER_PAGE); | ||
| 163 | const u64 end_word_index = Common::DivCeil(end_page_index, PAGES_PER_WORD); | ||
| 164 | u64 page_index = begin_page_index % PAGES_PER_WORD; | ||
| 165 | u64 word_index = begin_word_index; | ||
| 166 | while (word_index < end_word_index) { | ||
| 167 | const u64 next_word_first_page = (word_index + 1) * PAGES_PER_WORD; | ||
| 168 | const u64 left_offset = | ||
| 169 | std::min(next_word_first_page - end_page_index, PAGES_PER_WORD) % PAGES_PER_WORD; | ||
| 170 | const u64 right_offset = page_index; | ||
| 171 | u64 bits = ~u64{0}; | ||
| 172 | bits = (bits >> right_offset) << right_offset; | ||
| 173 | bits = (bits << left_offset) >> left_offset; | ||
| 174 | if constexpr (type == Type::CPU || type == Type::CachedCPU) { | ||
| 175 | NotifyRasterizer<!enable>(word_index, untracked_words[word_index], bits); | ||
| 176 | } | ||
| 177 | if constexpr (enable) { | ||
| 178 | state_words[word_index] |= bits; | ||
| 179 | if constexpr (type == Type::CPU || type == Type::CachedCPU) { | ||
| 180 | untracked_words[word_index] |= bits; | ||
| 181 | } | ||
| 182 | } else { | ||
| 183 | state_words[word_index] &= ~bits; | ||
| 184 | if constexpr (type == Type::CPU || type == Type::CachedCPU) { | ||
| 185 | untracked_words[word_index] &= ~bits; | ||
| 186 | } | ||
| 187 | } | ||
| 188 | page_index = 0; | ||
| 189 | ++word_index; | ||
| 190 | } | ||
| 191 | } | ||
| 192 | |||
| 193 | /** | ||
| 194 | * Loop over each modified page in the given range, notify the rasterizer when the | ||
| 195 | * tracking state changes, and call the given function on each modified range. When | ||
| 196 | * 'clear' is true, the visited pages are also unmarked as modified. | ||
| 197 | * @param query_cpu_range Base CPU address to loop over | ||
| 198 | * @param size Size in bytes of the CPU range to loop over | ||
| 199 | * @param func Function to call for each modified region | ||
| 200 | */ | ||
| 201 | template <Type type, typename Func> | ||
| 202 | void ForEachModifiedRange(VAddr query_cpu_range, s64 size, bool clear, Func&& func) { | ||
| 203 | static_assert(type != Type::Untracked); | ||
| 204 | |||
| 205 | const s64 difference = query_cpu_range - cpu_addr; | ||
| 206 | const u64 query_begin = std::max<s64>(difference, 0); | ||
| 207 | size += std::min<s64>(difference, 0); | ||
| 208 | if (query_begin >= SizeBytes() || size < 0) { | ||
| 209 | return; | ||
| 210 | } | ||
| 211 | [[maybe_unused]] u64* const untracked_words = Array<Type::Untracked>(); | ||
| 212 | [[maybe_unused]] u64* const cpu_words = Array<Type::CPU>(); | ||
| 213 | u64* const state_words = Array<type>(); | ||
| 214 | const u64 query_end = query_begin + std::min(static_cast<u64>(size), SizeBytes()); | ||
| 215 | u64* const words_begin = state_words + query_begin / BYTES_PER_WORD; | ||
| 216 | u64* const words_end = state_words + Common::DivCeil(query_end, BYTES_PER_WORD); | ||
| 217 | u64 first_page = (query_begin / BYTES_PER_PAGE) % PAGES_PER_WORD; | ||
| 218 | |||
| 219 | const auto modified = [](u64 word) { return word != 0; }; | ||
| 220 | const auto first_modified_word = std::find_if(words_begin, words_end, modified); | ||
| 221 | if (first_modified_word == words_end) { | ||
| 222 | // Exit early when the buffer is not modified | ||
| 223 | return; | ||
| 224 | } | ||
| 225 | if (first_modified_word != words_begin) { | ||
| 226 | first_page = 0; | ||
| 227 | } | ||
| 228 | std::reverse_iterator<u64*> first_word_reverse(first_modified_word); | ||
| 229 | std::reverse_iterator<u64*> last_word_iterator(words_end); | ||
| 230 | auto last_word_result = std::find_if(last_word_iterator, first_word_reverse, modified); | ||
| 231 | u64* const last_modified_word = &(*last_word_result) + 1; | ||
| 232 | |||
| 233 | const u64 word_index_begin = std::distance(state_words, first_modified_word); | ||
| 234 | const u64 word_index_end = std::distance(state_words, last_modified_word); | ||
| 235 | const unsigned local_page_begin = std::countr_zero(*first_modified_word); | ||
| 236 | const unsigned local_page_end = | ||
| 237 | static_cast<unsigned>(PAGES_PER_WORD) - std::countl_zero(last_modified_word[-1]); | ||
| 238 | const u64 word_page_begin = word_index_begin * PAGES_PER_WORD; | ||
| 239 | const u64 word_page_end = (word_index_end - 1) * PAGES_PER_WORD; | ||
| 240 | const u64 query_page_begin = query_begin / BYTES_PER_PAGE; | ||
| 241 | const u64 query_page_end = Common::DivCeil(query_end, BYTES_PER_PAGE); | ||
| 242 | const u64 page_index_begin = std::max(word_page_begin + local_page_begin, query_page_begin); | ||
| 243 | const u64 page_index_end = std::min(word_page_end + local_page_end, query_page_end); | ||
| 244 | const u64 first_word_page_begin = page_index_begin % PAGES_PER_WORD; | ||
| 245 | const u64 last_word_page_end = (page_index_end - 1) % PAGES_PER_WORD + 1; | ||
| 246 | |||
| 247 | u64 page_begin = std::max(first_word_page_begin, first_page); | ||
| 248 | u64 current_base = 0; | ||
| 249 | u64 current_size = 0; | ||
| 250 | bool on_going = false; | ||
| 251 | for (u64 word_index = word_index_begin; word_index < word_index_end; ++word_index) { | ||
| 252 | const bool is_last_word = word_index + 1 == word_index_end; | ||
| 253 | const u64 page_end = is_last_word ? last_word_page_end : PAGES_PER_WORD; | ||
| 254 | const u64 right_offset = page_begin; | ||
| 255 | const u64 left_offset = PAGES_PER_WORD - page_end; | ||
| 256 | u64 bits = ~u64{0}; | ||
| 257 | bits = (bits >> right_offset) << right_offset; | ||
| 258 | bits = (bits << left_offset) >> left_offset; | ||
| 259 | |||
| 260 | const u64 current_word = state_words[word_index] & bits; | ||
| 261 | if (clear) { | ||
| 262 | state_words[word_index] &= ~bits; | ||
| 263 | } | ||
| 264 | |||
| 265 | if constexpr (type == Type::CachedCPU) { | ||
| 266 | NotifyRasterizer<false>(word_index, untracked_words[word_index], current_word); | ||
| 267 | untracked_words[word_index] |= current_word; | ||
| 268 | cpu_words[word_index] |= current_word; | ||
| 269 | } | ||
| 270 | |||
| 271 | if constexpr (type == Type::CPU) { | ||
| 272 | const u64 current_bits = untracked_words[word_index] & bits; | ||
| 273 | untracked_words[word_index] &= ~bits; | ||
| 274 | NotifyRasterizer<true>(word_index, current_bits, ~u64{0}); | ||
| 275 | } | ||
| 276 | const u64 word = current_word; | ||
| 277 | u64 page = page_begin; | ||
| 278 | page_begin = 0; | ||
| 279 | |||
| 280 | while (page < page_end) { | ||
| 281 | const int empty_bits = std::countr_zero(word >> page); | ||
| 282 | if (on_going && empty_bits != 0) { | ||
| 283 | InvokeModifiedRange(func, current_size, current_base); | ||
| 284 | current_size = 0; | ||
| 285 | on_going = false; | ||
| 286 | } | ||
| 287 | if (empty_bits == PAGES_PER_WORD) { | ||
| 288 | break; | ||
| 289 | } | ||
| 290 | page += empty_bits; | ||
| 291 | |||
| 292 | const int continuous_bits = std::countr_one(word >> page); | ||
| 293 | if (!on_going && continuous_bits != 0) { | ||
| 294 | current_base = word_index * PAGES_PER_WORD + page; | ||
| 295 | on_going = true; | ||
| 296 | } | ||
| 297 | current_size += continuous_bits; | ||
| 298 | page += continuous_bits; | ||
| 299 | } | ||
| 300 | } | ||
| 301 | if (on_going && current_size > 0) { | ||
| 302 | InvokeModifiedRange(func, current_size, current_base); | ||
| 303 | } | ||
| 304 | } | ||
| 305 | |||
| 306 | template <typename Func> | ||
| 307 | void InvokeModifiedRange(Func&& func, u64 current_size, u64 current_base) { | ||
| 308 | const u64 current_size_bytes = current_size * BYTES_PER_PAGE; | ||
| 309 | const u64 offset_begin = current_base * BYTES_PER_PAGE; | ||
| 310 | const u64 offset_end = std::min(offset_begin + current_size_bytes, SizeBytes()); | ||
| 311 | func(cpu_addr + offset_begin, offset_end - offset_begin); | ||
| 312 | } | ||
| 313 | |||
| 314 | /** | ||
| 315 | * Returns true when a region has been modified | ||
| 316 | * | ||
| 317 | * @param offset Offset in bytes from the start of the buffer | ||
| 318 | * @param size Size in bytes of the region to query for modifications | ||
| 319 | */ | ||
| 320 | template <Type type> | ||
| 321 | [[nodiscard]] bool IsRegionModified(u64 offset, u64 size) const noexcept { | ||
| 322 | static_assert(type != Type::Untracked); | ||
| 323 | |||
| 324 | const u64* const state_words = Array<type>(); | ||
| 325 | const u64 num_query_words = size / BYTES_PER_WORD + 1; | ||
| 326 | const u64 word_begin = offset / BYTES_PER_WORD; | ||
| 327 | const u64 word_end = std::min(word_begin + num_query_words, NumWords()); | ||
| 328 | const u64 page_limit = Common::DivCeil(offset + size, BYTES_PER_PAGE); | ||
| 329 | u64 page_index = (offset / BYTES_PER_PAGE) % PAGES_PER_WORD; | ||
| 330 | for (u64 word_index = word_begin; word_index < word_end; ++word_index, page_index = 0) { | ||
| 331 | const u64 word = state_words[word_index]; | ||
| 332 | if (word == 0) { | ||
| 333 | continue; | ||
| 334 | } | ||
| 335 | const u64 page_end = std::min((word_index + 1) * PAGES_PER_WORD, page_limit); | ||
| 336 | const u64 local_page_end = page_end % PAGES_PER_WORD; | ||
| 337 | const u64 page_end_shift = (PAGES_PER_WORD - local_page_end) % PAGES_PER_WORD; | ||
| 338 | if (((word >> page_index) << page_index) << page_end_shift != 0) { | ||
| 339 | return true; | ||
| 340 | } | ||
| 341 | } | ||
| 342 | return false; | ||
| 343 | } | ||
| 344 | |||
| 345 | /** | ||
| 346 | * Returns a [begin, end] pair with the inclusive modified region | ||
| 347 | * | ||
| 348 | * @param offset Offset in bytes from the start of the buffer | ||
| 349 | * @param size Size in bytes of the region to query for modifications | ||
| 350 | */ | ||
| 351 | template <Type type> | ||
| 352 | [[nodiscard]] std::pair<u64, u64> ModifiedRegion(u64 offset, u64 size) const noexcept { | ||
| 353 | static_assert(type != Type::Untracked); | ||
| 354 | const u64* const state_words = Array<type>(); | ||
| 355 | const u64 num_query_words = size / BYTES_PER_WORD + 1; | ||
| 356 | const u64 word_begin = offset / BYTES_PER_WORD; | ||
| 357 | const u64 word_end = std::min(word_begin + num_query_words, NumWords()); | ||
| 358 | const u64 page_base = offset / BYTES_PER_PAGE; | ||
| 359 | u64 page_begin = page_base & (PAGES_PER_WORD - 1); | ||
| 360 | u64 page_end = | ||
| 361 | Common::DivCeil(offset + size, BYTES_PER_PAGE) - (page_base & ~(PAGES_PER_WORD - 1)); | ||
| 362 | u64 begin = std::numeric_limits<u64>::max(); | ||
| 363 | u64 end = 0; | ||
| 364 | for (u64 word_index = word_begin; word_index < word_end; ++word_index) { | ||
| 365 | const u64 base_mask = (1ULL << page_begin) - 1ULL; | ||
| 366 | const u64 end_mask = page_end >= PAGES_PER_WORD ? 0ULL : ~((1ULL << page_end) - 1ULL); | ||
| 367 | const u64 off_word = end_mask | base_mask; | ||
| 368 | const u64 word = state_words[word_index] & ~off_word; | ||
| 369 | if (word == 0) { | ||
| 370 | page_begin = 0; | ||
| 371 | page_end -= PAGES_PER_WORD; | ||
| 372 | continue; | ||
| 373 | } | ||
| 374 | const u64 local_page_begin = std::countr_zero(word); | ||
| 375 | const u64 local_page_end = PAGES_PER_WORD - std::countl_zero(word); | ||
| 376 | const u64 page_index = word_index * PAGES_PER_WORD; | ||
| 377 | begin = std::min(begin, page_index + local_page_begin); | ||
| 378 | end = page_index + local_page_end; | ||
| 379 | page_begin = 0; | ||
| 380 | page_end -= PAGES_PER_WORD; | ||
| 381 | } | ||
| 382 | static constexpr std::pair<u64, u64> EMPTY{0, 0}; | ||
| 383 | return begin < end ? std::make_pair(begin * BYTES_PER_PAGE, end * BYTES_PER_PAGE) : EMPTY; | ||
| 384 | } | ||
| 385 | |||
| 386 | /// Returns the number of words of the manager | ||
| 387 | [[nodiscard]] size_t NumWords() const noexcept { | ||
| 388 | return words.NumWords(); | ||
| 389 | } | ||
| 390 | |||
| 391 | /// Returns the size in bytes of the manager | ||
| 392 | [[nodiscard]] u64 SizeBytes() const noexcept { | ||
| 393 | return words.size_bytes; | ||
| 394 | } | ||
| 395 | |||
| 396 | /// Returns true when the buffer fits in the small vector optimization | ||
| 397 | [[nodiscard]] bool IsShort() const noexcept { | ||
| 398 | return words.IsShort(); | ||
| 399 | } | ||
| 400 | |||
| 401 | void FlushCachedWrites() noexcept { | ||
| 402 | const u64 num_words = NumWords(); | ||
| 403 | u64* const cached_words = Array<Type::CachedCPU>(); | ||
| 404 | u64* const untracked_words = Array<Type::Untracked>(); | ||
| 405 | u64* const cpu_words = Array<Type::CPU>(); | ||
| 406 | for (u64 word_index = 0; word_index < num_words; ++word_index) { | ||
| 407 | const u64 cached_bits = cached_words[word_index]; | ||
| 408 | NotifyRasterizer<false>(word_index, untracked_words[word_index], cached_bits); | ||
| 409 | untracked_words[word_index] |= cached_bits; | ||
| 410 | cpu_words[word_index] |= cached_bits; | ||
| 411 | cached_words[word_index] = 0; | ||
| 412 | } | ||
| 413 | } | ||
| 414 | |||
| 415 | private: | ||
| 416 | template <Type type> | ||
| 417 | u64* Array() noexcept { | ||
| 418 | if constexpr (type == Type::CPU) { | ||
| 419 | return words.cpu.Pointer(IsShort()); | ||
| 420 | } else if constexpr (type == Type::GPU) { | ||
| 421 | return words.gpu.Pointer(IsShort()); | ||
| 422 | } else if constexpr (type == Type::CachedCPU) { | ||
| 423 | return words.cached_cpu.Pointer(IsShort()); | ||
| 424 | } else if constexpr (type == Type::Untracked) { | ||
| 425 | return words.untracked.Pointer(IsShort()); | ||
| 426 | } | ||
| 427 | } | ||
| 428 | |||
| 429 | template <Type type> | ||
| 430 | const u64* Array() const noexcept { | ||
| 431 | if constexpr (type == Type::CPU) { | ||
| 432 | return words.cpu.Pointer(IsShort()); | ||
| 433 | } else if constexpr (type == Type::GPU) { | ||
| 434 | return words.gpu.Pointer(IsShort()); | ||
| 435 | } else if constexpr (type == Type::CachedCPU) { | ||
| 436 | return words.cached_cpu.Pointer(IsShort()); | ||
| 437 | } else if constexpr (type == Type::Untracked) { | ||
| 438 | return words.untracked.Pointer(IsShort()); | ||
| 439 | } | ||
| 440 | } | ||
| 441 | |||
| 442 | /** | ||
| 443 | * Notify rasterizer about changes in the CPU tracking state of a word in the buffer | ||
| 444 | * | ||
| 445 | * @param word_index Index to the word to notify to the rasterizer | ||
| 446 | * @param current_bits Current state of the word | ||
| 447 | * @param new_bits New state of the word | ||
| 448 | * | ||
| 449 | * @tparam add_to_rasterizer True when the rasterizer should start tracking the new pages | ||
| 450 | */ | ||
| 451 | template <bool add_to_rasterizer> | ||
| 452 | void NotifyRasterizer(u64 word_index, u64 current_bits, u64 new_bits) const { | ||
| 453 | u64 changed_bits = (add_to_rasterizer ? current_bits : ~current_bits) & new_bits; | ||
| 454 | VAddr addr = cpu_addr + word_index * BYTES_PER_WORD; | ||
| 455 | while (changed_bits != 0) { | ||
| 456 | const int empty_bits = std::countr_zero(changed_bits); | ||
| 457 | addr += empty_bits * BYTES_PER_PAGE; | ||
| 458 | changed_bits >>= empty_bits; | ||
| 459 | |||
| 460 | const u32 continuous_bits = std::countr_one(changed_bits); | ||
| 461 | const u64 size = continuous_bits * BYTES_PER_PAGE; | ||
| 462 | const VAddr begin_addr = addr; | ||
| 463 | addr += size; | ||
| 464 | changed_bits = continuous_bits < PAGES_PER_WORD ? (changed_bits >> continuous_bits) : 0; | ||
| 465 | rasterizer->UpdatePagesCachedCount(begin_addr, size, add_to_rasterizer ? 1 : -1); | ||
| 466 | } | ||
| 467 | } | ||
| 468 | |||
| 469 | VAddr cpu_addr = 0; | ||
| 470 | RasterizerInterface* rasterizer = nullptr; | ||
| 471 | Words<stack_words> words; | ||
| 472 | }; | ||
| 473 | |||
| 474 | } // namespace VideoCommon | ||
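Note: both ChangeRegionState and ForEachModifiedRange carve a page range out of one 64-bit tracking word with the same double-shift trick. A small standalone sketch of that mask construction (assumes 0 < page_end <= 64; names are illustrative):

    #include <cstdint>
    #include <cstdio>

    // Builds a mask with bits [page_begin, page_end) set, as WordManager
    // does: shift off the low pages, then the high pages.
    std::uint64_t PageRangeMask(std::uint64_t page_begin, std::uint64_t page_end) {
        const std::uint64_t right_offset = page_begin;   // pages below the range
        const std::uint64_t left_offset = 64 - page_end; // pages above the range
        std::uint64_t bits = ~std::uint64_t{0};
        bits = (bits >> right_offset) << right_offset;   // clear [0, page_begin)
        bits = (bits << left_offset) >> left_offset;     // clear [page_end, 64)
        return bits;
    }

    int main() {
        // Pages 4..7 of the word -> prints 0xf0
        std::printf("0x%llx\n", static_cast<unsigned long long>(PageRangeMask(4, 8)));
    }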
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h index a8c3f8b67..18d3c3ac0 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.h +++ b/src/video_core/renderer_opengl/gl_buffer_cache.h | |||
| @@ -8,6 +8,7 @@ | |||
| 8 | 8 | ||
| 9 | #include "common/common_types.h" | 9 | #include "common/common_types.h" |
| 10 | #include "video_core/buffer_cache/buffer_cache.h" | 10 | #include "video_core/buffer_cache/buffer_cache.h" |
| 11 | #include "video_core/buffer_cache/memory_tracker_base.h" | ||
| 11 | #include "video_core/rasterizer_interface.h" | 12 | #include "video_core/rasterizer_interface.h" |
| 12 | #include "video_core/renderer_opengl/gl_device.h" | 13 | #include "video_core/renderer_opengl/gl_device.h" |
| 13 | #include "video_core/renderer_opengl/gl_resource_manager.h" | 14 | #include "video_core/renderer_opengl/gl_resource_manager.h" |
| @@ -200,6 +201,8 @@ private: | |||
| 200 | struct BufferCacheParams { | 201 | struct BufferCacheParams { |
| 201 | using Runtime = OpenGL::BufferCacheRuntime; | 202 | using Runtime = OpenGL::BufferCacheRuntime; |
| 202 | using Buffer = OpenGL::Buffer; | 203 | using Buffer = OpenGL::Buffer; |
| 204 | using Async_Buffer = u32; | ||
| 205 | using MemoryTracker = VideoCommon::MemoryTrackerBase<VideoCore::RasterizerInterface>; | ||
| 203 | 206 | ||
| 204 | static constexpr bool IS_OPENGL = true; | 207 | static constexpr bool IS_OPENGL = true; |
| 205 | static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS = true; | 208 | static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS = true; |
| @@ -208,6 +211,7 @@ struct BufferCacheParams { | |||
| 208 | static constexpr bool NEEDS_BIND_STORAGE_INDEX = true; | 211 | static constexpr bool NEEDS_BIND_STORAGE_INDEX = true; |
| 209 | static constexpr bool USE_MEMORY_MAPS = false; | 212 | static constexpr bool USE_MEMORY_MAPS = false; |
| 210 | static constexpr bool SEPARATE_IMAGE_BUFFER_BINDINGS = true; | 213 | static constexpr bool SEPARATE_IMAGE_BUFFER_BINDINGS = true; |
| 214 | static constexpr bool IMPLEMENTS_ASYNC_DOWNLOADS = false; | ||
| 211 | }; | 215 | }; |
| 212 | 216 | ||
| 213 | using BufferCache = VideoCommon::BufferCache<BufferCacheParams>; | 217 | using BufferCache = VideoCommon::BufferCache<BufferCacheParams>; |
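Note: BufferCacheParams is a traits struct; each backend hands the shared template its buffer types plus constexpr feature flags, so backend differences resolve at compile time via if constexpr. A toy sketch of the pattern (all names here are illustrative, not the real cache):

    #include <cstdio>

    // The shared template branches on the params' constexpr flags at
    // compile time, with no runtime dispatch.
    template <class P>
    class Cache {
    public:
        void Download() {
            if constexpr (P::IMPLEMENTS_ASYNC_DOWNLOADS) {
                std::printf("queue async download\n");
            } else {
                std::printf("synchronous readback\n");
            }
        }
    };

    struct GlLikeParams {
        static constexpr bool IMPLEMENTS_ASYNC_DOWNLOADS = false;
    };
    struct VkLikeParams {
        static constexpr bool IMPLEMENTS_ASYNC_DOWNLOADS = true;
    };

    int main() {
        Cache<GlLikeParams>{}.Download(); // synchronous readback
        Cache<VkLikeParams>{}.Download(); // queue async download
    }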
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache_base.cpp b/src/video_core/renderer_opengl/gl_buffer_cache_base.cpp new file mode 100644 index 000000000..f15ae8e25 --- /dev/null +++ b/src/video_core/renderer_opengl/gl_buffer_cache_base.cpp | |||
| @@ -0,0 +1,9 @@ | |||
| 1 | // SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project | ||
| 2 | // SPDX-License-Identifier: GPL-3.0-or-later | ||
| 3 | |||
| 4 | #include "video_core/buffer_cache/buffer_cache.h" | ||
| 5 | #include "video_core/renderer_opengl/gl_buffer_cache.h" | ||
| 6 | |||
| 7 | namespace VideoCommon { | ||
| 8 | template class VideoCommon::BufferCache<OpenGL::BufferCacheParams>; | ||
| 9 | } | ||
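Note: the new *_base.cpp files pin the large BufferCache template to a single explicit instantiation per backend, so other translation units that include the header link against it instead of re-instantiating the body. A minimal single-file sketch of the pattern (placeholder names; in the real layout the extern template declaration lives in a header):

    // In the header: declare the instantiation to suppress implicit ones.
    template <typename Params>
    class Cache {
    public:
        int Value() const { return Params::VALUE; }
    };
    struct GlParams { static constexpr int VALUE = 1; };
    extern template class Cache<GlParams>;

    // In exactly one .cpp: the explicit instantiation definition.
    template class Cache<GlParams>;

    int main() {
        return Cache<GlParams>{}.Value();
    }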
diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp index 9cbcb3c8f..510602e8e 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp | |||
| @@ -314,8 +314,12 @@ StagingBufferRef BufferCacheRuntime::UploadStagingBuffer(size_t size) { | |||
| 314 | return staging_pool.Request(size, MemoryUsage::Upload); | 314 | return staging_pool.Request(size, MemoryUsage::Upload); |
| 315 | } | 315 | } |
| 316 | 316 | ||
| 317 | StagingBufferRef BufferCacheRuntime::DownloadStagingBuffer(size_t size) { | 317 | StagingBufferRef BufferCacheRuntime::DownloadStagingBuffer(size_t size, bool deferred) { |
| 318 | return staging_pool.Request(size, MemoryUsage::Download); | 318 | return staging_pool.Request(size, MemoryUsage::Download, deferred); |
| 319 | } | ||
| 320 | |||
| 321 | void BufferCacheRuntime::FreeDeferredStagingBuffer(StagingBufferRef& ref) { | ||
| 322 | staging_pool.FreeDeferred(ref); | ||
| 319 | } | 323 | } |
| 320 | 324 | ||
| 321 | u64 BufferCacheRuntime::GetDeviceLocalMemory() const { | 325 | u64 BufferCacheRuntime::GetDeviceLocalMemory() const { |
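Note: the new deferred flag keeps a download staging buffer out of the normal per-fence recycling, so an async download can hold it until the data has been copied back to guest memory and the cache frees it explicitly. A hedged sketch of that lifetime using a stand-in pool (not yuzu's actual StagingBufferPool):

    #include <cstddef>
    #include <cstdio>

    struct StagingBufferRef {
        std::size_t id = 0;
    };

    class StagingPool {
    public:
        StagingBufferRef Request(std::size_t size, bool deferred) {
            // A deferred request is skipped by the per-frame recycling a
            // normal download buffer would get at the next fence.
            std::printf("request %zu bytes (deferred=%d)\n", size, static_cast<int>(deferred));
            return StagingBufferRef{next_id++};
        }
        void FreeDeferred(StagingBufferRef& ref) {
            std::printf("buffer %zu returned to the pool\n", ref.id);
        }

    private:
        std::size_t next_id = 1;
    };

    int main() {
        StagingPool pool;
        // 1. Request a buffer that must outlive the current frame's fence.
        StagingBufferRef ref = pool.Request(4096, /*deferred=*/true);
        // 2. ... record the GPU->staging copy, finish the frame ...
        // 3. After the data is copied back to guest memory, release it.
        pool.FreeDeferred(ref);
    }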
diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.h b/src/video_core/renderer_vulkan/vk_buffer_cache.h index 183b33632..05968e6a6 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.h +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h | |||
| @@ -4,6 +4,7 @@ | |||
| 4 | #pragma once | 4 | #pragma once |
| 5 | 5 | ||
| 6 | #include "video_core/buffer_cache/buffer_cache.h" | 6 | #include "video_core/buffer_cache/buffer_cache.h" |
| 7 | #include "video_core/buffer_cache/memory_tracker_base.h" | ||
| 7 | #include "video_core/engines/maxwell_3d.h" | 8 | #include "video_core/engines/maxwell_3d.h" |
| 8 | #include "video_core/renderer_vulkan/vk_compute_pass.h" | 9 | #include "video_core/renderer_vulkan/vk_compute_pass.h" |
| 9 | #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" | 10 | #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" |
| @@ -75,7 +76,9 @@ public: | |||
| 75 | 76 | ||
| 76 | [[nodiscard]] StagingBufferRef UploadStagingBuffer(size_t size); | 77 | [[nodiscard]] StagingBufferRef UploadStagingBuffer(size_t size); |
| 77 | 78 | ||
| 78 | [[nodiscard]] StagingBufferRef DownloadStagingBuffer(size_t size); | 79 | [[nodiscard]] StagingBufferRef DownloadStagingBuffer(size_t size, bool deferred = false); |
| 80 | |||
| 81 | void FreeDeferredStagingBuffer(StagingBufferRef& ref); | ||
| 79 | 82 | ||
| 80 | void PreCopyBarrier(); | 83 | void PreCopyBarrier(); |
| 81 | 84 | ||
| @@ -142,6 +145,8 @@ private: | |||
| 142 | struct BufferCacheParams { | 145 | struct BufferCacheParams { |
| 143 | using Runtime = Vulkan::BufferCacheRuntime; | 146 | using Runtime = Vulkan::BufferCacheRuntime; |
| 144 | using Buffer = Vulkan::Buffer; | 147 | using Buffer = Vulkan::Buffer; |
| 148 | using Async_Buffer = Vulkan::StagingBufferRef; | ||
| 149 | using MemoryTracker = VideoCommon::MemoryTrackerBase<VideoCore::RasterizerInterface>; | ||
| 145 | 150 | ||
| 146 | static constexpr bool IS_OPENGL = false; | 151 | static constexpr bool IS_OPENGL = false; |
| 147 | static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS = false; | 152 | static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS = false; |
| @@ -150,6 +155,7 @@ struct BufferCacheParams { | |||
| 150 | static constexpr bool NEEDS_BIND_STORAGE_INDEX = false; | 155 | static constexpr bool NEEDS_BIND_STORAGE_INDEX = false; |
| 151 | static constexpr bool USE_MEMORY_MAPS = true; | 156 | static constexpr bool USE_MEMORY_MAPS = true; |
| 152 | static constexpr bool SEPARATE_IMAGE_BUFFER_BINDINGS = false; | 157 | static constexpr bool SEPARATE_IMAGE_BUFFER_BINDINGS = false; |
| 158 | static constexpr bool IMPLEMENTS_ASYNC_DOWNLOADS = true; | ||
| 153 | }; | 159 | }; |
| 154 | 160 | ||
| 155 | using BufferCache = VideoCommon::BufferCache<BufferCacheParams>; | 161 | using BufferCache = VideoCommon::BufferCache<BufferCacheParams>; |
diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache_base.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache_base.cpp new file mode 100644 index 000000000..f9e271507 --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_buffer_cache_base.cpp | |||
| @@ -0,0 +1,9 @@ | |||
| 1 | // SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project | ||
| 2 | // SPDX-License-Identifier: GPL-3.0-or-later | ||
| 3 | |||
| 4 | #include "video_core/buffer_cache/buffer_cache.h" | ||
| 5 | #include "video_core/renderer_vulkan/vk_buffer_cache.h" | ||
| 6 | |||
| 7 | namespace VideoCommon { | ||
| 8 | template class VideoCommon::BufferCache<Vulkan::BufferCacheParams>; | ||
| 9 | } | ||