| author | 2021-02-12 22:22:18 -0800 |
|---|---|
| committer | 2021-02-12 22:22:18 -0800 |
| commit | d3c7a7e7cf4bcabb171c98fe55e6e0291f8ee980 |
| tree | 5c900d10847e1768a4951c1e6bec35f2618b5991 /src/video_core/buffer_cache |
| parent | Merge pull request #5877 from ameerj/res-limit-usage |
| parent | config: Make high GPU accuracy the default |
Merge pull request #5741 from ReinUsesLisp/new-bufcache
video_core: Reimplement the buffer cache
Diffstat (limited to 'src/video_core/buffer_cache')
| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | src/video_core/buffer_cache/buffer_base.h | 217 |
| -rw-r--r-- | src/video_core/buffer_cache/buffer_block.h | 62 |
| -rw-r--r-- | src/video_core/buffer_cache/buffer_cache.cpp | 13 |
| -rw-r--r-- | src/video_core/buffer_cache/buffer_cache.h | 1656 |
| -rw-r--r-- | src/video_core/buffer_cache/map_interval.cpp | 33 |
| -rw-r--r-- | src/video_core/buffer_cache/map_interval.h | 93 |
6 files changed, 1346 insertions, 728 deletions
diff --git a/src/video_core/buffer_cache/buffer_base.h b/src/video_core/buffer_cache/buffer_base.h
index ee8602ce9..0c00ae280 100644
--- a/src/video_core/buffer_cache/buffer_base.h
+++ b/src/video_core/buffer_cache/buffer_base.h
| @@ -19,6 +19,7 @@ namespace VideoCommon { | |||
| 19 | 19 | ||
| 20 | enum class BufferFlagBits { | 20 | enum class BufferFlagBits { |
| 21 | Picked = 1 << 0, | 21 | Picked = 1 << 0, |
| 22 | CachedWrites = 1 << 1, | ||
| 22 | }; | 23 | }; |
| 23 | DECLARE_ENUM_FLAG_OPERATORS(BufferFlagBits) | 24 | DECLARE_ENUM_FLAG_OPERATORS(BufferFlagBits) |
| 24 | 25 | ||
| @@ -40,7 +41,7 @@ class BufferBase { | |||
| 40 | static constexpr u64 BYTES_PER_WORD = PAGES_PER_WORD * BYTES_PER_PAGE; | 41 | static constexpr u64 BYTES_PER_WORD = PAGES_PER_WORD * BYTES_PER_PAGE; |
| 41 | 42 | ||
| 42 | /// Vector tracking modified pages tightly packed with small vector optimization | 43 | /// Vector tracking modified pages tightly packed with small vector optimization |
| 43 | union WrittenWords { | 44 | union WordsArray { |
| 44 | /// Returns the pointer to the words state | 45 | /// Returns the pointer to the words state |
| 45 | [[nodiscard]] const u64* Pointer(bool is_short) const noexcept { | 46 | [[nodiscard]] const u64* Pointer(bool is_short) const noexcept { |
| 46 | return is_short ? &stack : heap; | 47 | return is_short ? &stack : heap; |
| @@ -55,49 +56,59 @@ class BufferBase { | |||
| 55 | u64* heap; ///< Not-small buffers pointer to the storage | 56 | u64* heap; ///< Not-small buffers pointer to the storage |
| 56 | }; | 57 | }; |
| 57 | 58 | ||
| 58 | struct GpuCpuWords { | 59 | struct Words { |
| 59 | explicit GpuCpuWords() = default; | 60 | explicit Words() = default; |
| 60 | explicit GpuCpuWords(u64 size_bytes_) : size_bytes{size_bytes_} { | 61 | explicit Words(u64 size_bytes_) : size_bytes{size_bytes_} { |
| 61 | if (IsShort()) { | 62 | if (IsShort()) { |
| 62 | cpu.stack = ~u64{0}; | 63 | cpu.stack = ~u64{0}; |
| 63 | gpu.stack = 0; | 64 | gpu.stack = 0; |
| 65 | cached_cpu.stack = 0; | ||
| 66 | untracked.stack = ~u64{0}; | ||
| 64 | } else { | 67 | } else { |
| 65 | // Share allocation between CPU and GPU pages and set their default values | 68 | // Share allocation between CPU and GPU pages and set their default values |
| 66 | const size_t num_words = NumWords(); | 69 | const size_t num_words = NumWords(); |
| 67 | u64* const alloc = new u64[num_words * 2]; | 70 | u64* const alloc = new u64[num_words * 4]; |
| 68 | cpu.heap = alloc; | 71 | cpu.heap = alloc; |
| 69 | gpu.heap = alloc + num_words; | 72 | gpu.heap = alloc + num_words; |
| 73 | cached_cpu.heap = alloc + num_words * 2; | ||
| 74 | untracked.heap = alloc + num_words * 3; | ||
| 70 | std::fill_n(cpu.heap, num_words, ~u64{0}); | 75 | std::fill_n(cpu.heap, num_words, ~u64{0}); |
| 71 | std::fill_n(gpu.heap, num_words, 0); | 76 | std::fill_n(gpu.heap, num_words, 0); |
| 77 | std::fill_n(cached_cpu.heap, num_words, 0); | ||
| 78 | std::fill_n(untracked.heap, num_words, ~u64{0}); | ||
| 72 | } | 79 | } |
| 73 | // Clean up trailing bits | 80 | // Clean up trailing bits |
| 74 | const u64 last_local_page = | 81 | const u64 last_word_size = size_bytes % BYTES_PER_WORD; |
| 75 | Common::DivCeil(size_bytes % BYTES_PER_WORD, BYTES_PER_PAGE); | 82 | const u64 last_local_page = Common::DivCeil(last_word_size, BYTES_PER_PAGE); |
| 76 | const u64 shift = (PAGES_PER_WORD - last_local_page) % PAGES_PER_WORD; | 83 | const u64 shift = (PAGES_PER_WORD - last_local_page) % PAGES_PER_WORD; |
| 77 | u64& last_word = cpu.Pointer(IsShort())[NumWords() - 1]; | 84 | const u64 last_word = (~u64{0} << shift) >> shift; |
| 78 | last_word = (last_word << shift) >> shift; | 85 | cpu.Pointer(IsShort())[NumWords() - 1] = last_word; |
| 86 | untracked.Pointer(IsShort())[NumWords() - 1] = last_word; | ||
| 79 | } | 87 | } |
| 80 | 88 | ||
| 81 | ~GpuCpuWords() { | 89 | ~Words() { |
| 82 | Release(); | 90 | Release(); |
| 83 | } | 91 | } |
| 84 | 92 | ||
| 85 | GpuCpuWords& operator=(GpuCpuWords&& rhs) noexcept { | 93 | Words& operator=(Words&& rhs) noexcept { |
| 86 | Release(); | 94 | Release(); |
| 87 | size_bytes = rhs.size_bytes; | 95 | size_bytes = rhs.size_bytes; |
| 88 | cpu = rhs.cpu; | 96 | cpu = rhs.cpu; |
| 89 | gpu = rhs.gpu; | 97 | gpu = rhs.gpu; |
| 98 | cached_cpu = rhs.cached_cpu; | ||
| 99 | untracked = rhs.untracked; | ||
| 90 | rhs.cpu.heap = nullptr; | 100 | rhs.cpu.heap = nullptr; |
| 91 | return *this; | 101 | return *this; |
| 92 | } | 102 | } |
| 93 | 103 | ||
| 94 | GpuCpuWords(GpuCpuWords&& rhs) noexcept | 104 | Words(Words&& rhs) noexcept |
| 95 | : size_bytes{rhs.size_bytes}, cpu{rhs.cpu}, gpu{rhs.gpu} { | 105 | : size_bytes{rhs.size_bytes}, cpu{rhs.cpu}, gpu{rhs.gpu}, |
| 106 | cached_cpu{rhs.cached_cpu}, untracked{rhs.untracked} { | ||
| 96 | rhs.cpu.heap = nullptr; | 107 | rhs.cpu.heap = nullptr; |
| 97 | } | 108 | } |
| 98 | 109 | ||
| 99 | GpuCpuWords& operator=(const GpuCpuWords&) = delete; | 110 | Words& operator=(const Words&) = delete; |
| 100 | GpuCpuWords(const GpuCpuWords&) = delete; | 111 | Words(const Words&) = delete; |
| 101 | 112 | ||
| 102 | /// Returns true when the buffer fits in the small vector optimization | 113 | /// Returns true when the buffer fits in the small vector optimization |
| 103 | [[nodiscard]] bool IsShort() const noexcept { | 114 | [[nodiscard]] bool IsShort() const noexcept { |
| @@ -118,8 +129,17 @@ class BufferBase { | |||
| 118 | } | 129 | } |
| 119 | 130 | ||
| 120 | u64 size_bytes = 0; | 131 | u64 size_bytes = 0; |
| 121 | WrittenWords cpu; | 132 | WordsArray cpu; |
| 122 | WrittenWords gpu; | 133 | WordsArray gpu; |
| 134 | WordsArray cached_cpu; | ||
| 135 | WordsArray untracked; | ||
| 136 | }; | ||
| 137 | |||
| 138 | enum class Type { | ||
| 139 | CPU, | ||
| 140 | GPU, | ||
| 141 | CachedCPU, | ||
| 142 | Untracked, | ||
| 123 | }; | 143 | }; |
| 124 | 144 | ||
| 125 | public: | 145 | public: |
| @@ -132,68 +152,93 @@ public: | |||
| 132 | BufferBase& operator=(const BufferBase&) = delete; | 152 | BufferBase& operator=(const BufferBase&) = delete; |
| 133 | BufferBase(const BufferBase&) = delete; | 153 | BufferBase(const BufferBase&) = delete; |
| 134 | 154 | ||
| 155 | BufferBase& operator=(BufferBase&&) = default; | ||
| 156 | BufferBase(BufferBase&&) = default; | ||
| 157 | |||
| 135 | /// Returns the inclusive CPU modified range in a begin end pair | 158 | /// Returns the inclusive CPU modified range in a begin end pair |
| 136 | [[nodiscard]] std::pair<u64, u64> ModifiedCpuRegion(VAddr query_cpu_addr, | 159 | [[nodiscard]] std::pair<u64, u64> ModifiedCpuRegion(VAddr query_cpu_addr, |
| 137 | u64 query_size) const noexcept { | 160 | u64 query_size) const noexcept { |
| 138 | const u64 offset = query_cpu_addr - cpu_addr; | 161 | const u64 offset = query_cpu_addr - cpu_addr; |
| 139 | return ModifiedRegion<false>(offset, query_size); | 162 | return ModifiedRegion<Type::CPU>(offset, query_size); |
| 140 | } | 163 | } |
| 141 | 164 | ||
| 142 | /// Returns the inclusive GPU modified range in a begin end pair | 165 | /// Returns the inclusive GPU modified range in a begin end pair |
| 143 | [[nodiscard]] std::pair<u64, u64> ModifiedGpuRegion(VAddr query_cpu_addr, | 166 | [[nodiscard]] std::pair<u64, u64> ModifiedGpuRegion(VAddr query_cpu_addr, |
| 144 | u64 query_size) const noexcept { | 167 | u64 query_size) const noexcept { |
| 145 | const u64 offset = query_cpu_addr - cpu_addr; | 168 | const u64 offset = query_cpu_addr - cpu_addr; |
| 146 | return ModifiedRegion<true>(offset, query_size); | 169 | return ModifiedRegion<Type::GPU>(offset, query_size); |
| 147 | } | 170 | } |
| 148 | 171 | ||
| 149 | /// Returns true if a region has been modified from the CPU | 172 | /// Returns true if a region has been modified from the CPU |
| 150 | [[nodiscard]] bool IsRegionCpuModified(VAddr query_cpu_addr, u64 query_size) const noexcept { | 173 | [[nodiscard]] bool IsRegionCpuModified(VAddr query_cpu_addr, u64 query_size) const noexcept { |
| 151 | const u64 offset = query_cpu_addr - cpu_addr; | 174 | const u64 offset = query_cpu_addr - cpu_addr; |
| 152 | return IsRegionModified<false>(offset, query_size); | 175 | return IsRegionModified<Type::CPU>(offset, query_size); |
| 153 | } | 176 | } |
| 154 | 177 | ||
| 155 | /// Returns true if a region has been modified from the GPU | 178 | /// Returns true if a region has been modified from the GPU |
| 156 | [[nodiscard]] bool IsRegionGpuModified(VAddr query_cpu_addr, u64 query_size) const noexcept { | 179 | [[nodiscard]] bool IsRegionGpuModified(VAddr query_cpu_addr, u64 query_size) const noexcept { |
| 157 | const u64 offset = query_cpu_addr - cpu_addr; | 180 | const u64 offset = query_cpu_addr - cpu_addr; |
| 158 | return IsRegionModified<true>(offset, query_size); | 181 | return IsRegionModified<Type::GPU>(offset, query_size); |
| 159 | } | 182 | } |
| 160 | 183 | ||
| 161 | /// Mark region as CPU modified, notifying the rasterizer about this change | 184 | /// Mark region as CPU modified, notifying the rasterizer about this change |
| 162 | void MarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 size) { | 185 | void MarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 size) { |
| 163 | ChangeRegionState<true, true>(words.cpu, dirty_cpu_addr, size); | 186 | ChangeRegionState<Type::CPU, true>(dirty_cpu_addr, size); |
| 164 | } | 187 | } |
| 165 | 188 | ||
| 166 | /// Unmark region as CPU modified, notifying the rasterizer about this change | 189 | /// Unmark region as CPU modified, notifying the rasterizer about this change |
| 167 | void UnmarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 size) { | 190 | void UnmarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 size) { |
| 168 | ChangeRegionState<false, true>(words.cpu, dirty_cpu_addr, size); | 191 | ChangeRegionState<Type::CPU, false>(dirty_cpu_addr, size); |
| 169 | } | 192 | } |
| 170 | 193 | ||
| 171 | /// Mark region as modified from the host GPU | 194 | /// Mark region as modified from the host GPU |
| 172 | void MarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 size) noexcept { | 195 | void MarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 size) noexcept { |
| 173 | ChangeRegionState<true, false>(words.gpu, dirty_cpu_addr, size); | 196 | ChangeRegionState<Type::GPU, true>(dirty_cpu_addr, size); |
| 174 | } | 197 | } |
| 175 | 198 | ||
| 176 | /// Unmark region as modified from the host GPU | 199 | /// Unmark region as modified from the host GPU |
| 177 | void UnmarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 size) noexcept { | 200 | void UnmarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 size) noexcept { |
| 178 | ChangeRegionState<false, false>(words.gpu, dirty_cpu_addr, size); | 201 | ChangeRegionState<Type::GPU, false>(dirty_cpu_addr, size); |
| 202 | } | ||
| 203 | |||
| 204 | /// Mark region as modified from the CPU | ||
| 205 | /// but don't mark it as CPU modified until FlushCachedWrites is called. | ||
| 206 | void CachedCpuWrite(VAddr dirty_cpu_addr, u64 size) { | ||
| 207 | flags |= BufferFlagBits::CachedWrites; | ||
| 208 | ChangeRegionState<Type::CachedCPU, true>(dirty_cpu_addr, size); | ||
| 209 | } | ||
| 210 | |||
| 211 | /// Flushes cached CPU writes and notifies the rasterizer about the deltas | ||
| 212 | void FlushCachedWrites() noexcept { | ||
| 213 | flags &= ~BufferFlagBits::CachedWrites; | ||
| 214 | const u64 num_words = NumWords(); | ||
| 215 | const u64* const cached_words = Array<Type::CachedCPU>(); | ||
| 216 | u64* const untracked_words = Array<Type::Untracked>(); | ||
| 217 | u64* const cpu_words = Array<Type::CPU>(); | ||
| 218 | for (u64 word_index = 0; word_index < num_words; ++word_index) { | ||
| 219 | const u64 cached_bits = cached_words[word_index]; | ||
| 220 | NotifyRasterizer<false>(word_index, untracked_words[word_index], cached_bits); | ||
| 221 | untracked_words[word_index] |= cached_bits; | ||
| 222 | cpu_words[word_index] |= cached_bits; | ||
| 223 | } | ||
| 179 | } | 224 | } |
| 180 | 225 | ||
| 181 | /// Call 'func' for each CPU modified range and unmark those pages as CPU modified | 226 | /// Call 'func' for each CPU modified range and unmark those pages as CPU modified |
| 182 | template <typename Func> | 227 | template <typename Func> |
| 183 | void ForEachUploadRange(VAddr query_cpu_range, u64 size, Func&& func) { | 228 | void ForEachUploadRange(VAddr query_cpu_range, u64 size, Func&& func) { |
| 184 | ForEachModifiedRange<false, true>(query_cpu_range, size, func); | 229 | ForEachModifiedRange<Type::CPU>(query_cpu_range, size, func); |
| 185 | } | 230 | } |
| 186 | 231 | ||
| 187 | /// Call 'func' for each GPU modified range and unmark those pages as GPU modified | 232 | /// Call 'func' for each GPU modified range and unmark those pages as GPU modified |
| 188 | template <typename Func> | 233 | template <typename Func> |
| 189 | void ForEachDownloadRange(VAddr query_cpu_range, u64 size, Func&& func) { | 234 | void ForEachDownloadRange(VAddr query_cpu_range, u64 size, Func&& func) { |
| 190 | ForEachModifiedRange<true, false>(query_cpu_range, size, func); | 235 | ForEachModifiedRange<Type::GPU>(query_cpu_range, size, func); |
| 191 | } | 236 | } |
| 192 | 237 | ||
| 193 | /// Call 'func' for each GPU modified range and unmark those pages as GPU modified | 238 | /// Call 'func' for each GPU modified range and unmark those pages as GPU modified |
| 194 | template <typename Func> | 239 | template <typename Func> |
| 195 | void ForEachDownloadRange(Func&& func) { | 240 | void ForEachDownloadRange(Func&& func) { |
| 196 | ForEachModifiedRange<true, false>(cpu_addr, SizeBytes(), func); | 241 | ForEachModifiedRange<Type::GPU>(cpu_addr, SizeBytes(), func); |
| 197 | } | 242 | } |
| 198 | 243 | ||
| 199 | /// Mark buffer as picked | 244 | /// Mark buffer as picked |
| @@ -206,6 +251,16 @@ public: | |||
| 206 | flags &= ~BufferFlagBits::Picked; | 251 | flags &= ~BufferFlagBits::Picked; |
| 207 | } | 252 | } |
| 208 | 253 | ||
| 254 | /// Increases the likeliness of this being a stream buffer | ||
| 255 | void IncreaseStreamScore(int score) noexcept { | ||
| 256 | stream_score += score; | ||
| 257 | } | ||
| 258 | |||
| 259 | /// Returns the likeliness of this being a stream buffer | ||
| 260 | [[nodiscard]] int StreamScore() const noexcept { | ||
| 261 | return stream_score; | ||
| 262 | } | ||
| 263 | |||
| 209 | /// Returns true when vaddr -> vaddr+size is fully contained in the buffer | 264 | /// Returns true when vaddr -> vaddr+size is fully contained in the buffer |
| 210 | [[nodiscard]] bool IsInBounds(VAddr addr, u64 size) const noexcept { | 265 | [[nodiscard]] bool IsInBounds(VAddr addr, u64 size) const noexcept { |
| 211 | return addr >= cpu_addr && addr + size <= cpu_addr + SizeBytes(); | 266 | return addr >= cpu_addr && addr + size <= cpu_addr + SizeBytes(); |
| @@ -216,6 +271,11 @@ public: | |||
| 216 | return True(flags & BufferFlagBits::Picked); | 271 | return True(flags & BufferFlagBits::Picked); |
| 217 | } | 272 | } |
| 218 | 273 | ||
| 274 | /// Returns true when the buffer has pending cached writes | ||
| 275 | [[nodiscard]] bool HasCachedWrites() const noexcept { | ||
| 276 | return True(flags & BufferFlagBits::CachedWrites); | ||
| 277 | } | ||
| 278 | |||
| 219 | /// Returns the base CPU address of the buffer | 279 | /// Returns the base CPU address of the buffer |
| 220 | [[nodiscard]] VAddr CpuAddr() const noexcept { | 280 | [[nodiscard]] VAddr CpuAddr() const noexcept { |
| 221 | return cpu_addr; | 281 | return cpu_addr; |
| @@ -233,26 +293,48 @@ public: | |||
| 233 | } | 293 | } |
| 234 | 294 | ||
| 235 | private: | 295 | private: |
| 296 | template <Type type> | ||
| 297 | u64* Array() noexcept { | ||
| 298 | if constexpr (type == Type::CPU) { | ||
| 299 | return words.cpu.Pointer(IsShort()); | ||
| 300 | } else if constexpr (type == Type::GPU) { | ||
| 301 | return words.gpu.Pointer(IsShort()); | ||
| 302 | } else if constexpr (type == Type::CachedCPU) { | ||
| 303 | return words.cached_cpu.Pointer(IsShort()); | ||
| 304 | } else if constexpr (type == Type::Untracked) { | ||
| 305 | return words.untracked.Pointer(IsShort()); | ||
| 306 | } | ||
| 307 | } | ||
| 308 | |||
| 309 | template <Type type> | ||
| 310 | const u64* Array() const noexcept { | ||
| 311 | if constexpr (type == Type::CPU) { | ||
| 312 | return words.cpu.Pointer(IsShort()); | ||
| 313 | } else if constexpr (type == Type::GPU) { | ||
| 314 | return words.gpu.Pointer(IsShort()); | ||
| 315 | } else if constexpr (type == Type::CachedCPU) { | ||
| 316 | return words.cached_cpu.Pointer(IsShort()); | ||
| 317 | } else if constexpr (type == Type::Untracked) { | ||
| 318 | return words.untracked.Pointer(IsShort()); | ||
| 319 | } | ||
| 320 | } | ||
| 321 | |||
| 236 | /** | 322 | /** |
| 237 | * Change the state of a range of pages | 323 | * Change the state of a range of pages |
| 238 | * | 324 | * |
| 239 | * @param written_words Pages to be marked or unmarked as modified | ||
| 240 | * @param dirty_addr Base address to mark or unmark as modified | 325 | * @param dirty_addr Base address to mark or unmark as modified |
| 241 | * @param size Size in bytes to mark or unmark as modified | 326 | * @param size Size in bytes to mark or unmark as modified |
| 242 | * | ||
| 243 | * @tparam enable True when the bits will be set to one, false for zero | ||
| 244 | * @tparam notify_rasterizer True when the rasterizer has to be notified about the changes | ||
| 245 | */ | 327 | */ |
| 246 | template <bool enable, bool notify_rasterizer> | 328 | template <Type type, bool enable> |
| 247 | void ChangeRegionState(WrittenWords& written_words, u64 dirty_addr, | 329 | void ChangeRegionState(u64 dirty_addr, s64 size) noexcept(type == Type::GPU) { |
| 248 | s64 size) noexcept(!notify_rasterizer) { | ||
| 249 | const s64 difference = dirty_addr - cpu_addr; | 330 | const s64 difference = dirty_addr - cpu_addr; |
| 250 | const u64 offset = std::max<s64>(difference, 0); | 331 | const u64 offset = std::max<s64>(difference, 0); |
| 251 | size += std::min<s64>(difference, 0); | 332 | size += std::min<s64>(difference, 0); |
| 252 | if (offset >= SizeBytes() || size < 0) { | 333 | if (offset >= SizeBytes() || size < 0) { |
| 253 | return; | 334 | return; |
| 254 | } | 335 | } |
| 255 | u64* const state_words = written_words.Pointer(IsShort()); | 336 | u64* const untracked_words = Array<Type::Untracked>(); |
| 337 | u64* const state_words = Array<type>(); | ||
| 256 | const u64 offset_end = std::min(offset + size, SizeBytes()); | 338 | const u64 offset_end = std::min(offset + size, SizeBytes()); |
| 257 | const u64 begin_page_index = offset / BYTES_PER_PAGE; | 339 | const u64 begin_page_index = offset / BYTES_PER_PAGE; |
| 258 | const u64 begin_word_index = begin_page_index / PAGES_PER_WORD; | 340 | const u64 begin_word_index = begin_page_index / PAGES_PER_WORD; |
| @@ -268,13 +350,19 @@ private: | |||
| 268 | u64 bits = ~u64{0}; | 350 | u64 bits = ~u64{0}; |
| 269 | bits = (bits >> right_offset) << right_offset; | 351 | bits = (bits >> right_offset) << right_offset; |
| 270 | bits = (bits << left_offset) >> left_offset; | 352 | bits = (bits << left_offset) >> left_offset; |
| 271 | if constexpr (notify_rasterizer) { | 353 | if constexpr (type == Type::CPU || type == Type::CachedCPU) { |
| 272 | NotifyRasterizer<!enable>(word_index, state_words[word_index], bits); | 354 | NotifyRasterizer<!enable>(word_index, untracked_words[word_index], bits); |
| 273 | } | 355 | } |
| 274 | if constexpr (enable) { | 356 | if constexpr (enable) { |
| 275 | state_words[word_index] |= bits; | 357 | state_words[word_index] |= bits; |
| 358 | if constexpr (type == Type::CPU || type == Type::CachedCPU) { | ||
| 359 | untracked_words[word_index] |= bits; | ||
| 360 | } | ||
| 276 | } else { | 361 | } else { |
| 277 | state_words[word_index] &= ~bits; | 362 | state_words[word_index] &= ~bits; |
| 363 | if constexpr (type == Type::CPU || type == Type::CachedCPU) { | ||
| 364 | untracked_words[word_index] &= ~bits; | ||
| 365 | } | ||
| 278 | } | 366 | } |
| 279 | page_index = 0; | 367 | page_index = 0; |
| 280 | ++word_index; | 368 | ++word_index; |
| @@ -291,7 +379,7 @@ private: | |||
| 291 | * @tparam add_to_rasterizer True when the rasterizer should start tracking the new pages | 379 | * @tparam add_to_rasterizer True when the rasterizer should start tracking the new pages |
| 292 | */ | 380 | */ |
| 293 | template <bool add_to_rasterizer> | 381 | template <bool add_to_rasterizer> |
| 294 | void NotifyRasterizer(u64 word_index, u64 current_bits, u64 new_bits) { | 382 | void NotifyRasterizer(u64 word_index, u64 current_bits, u64 new_bits) const { |
| 295 | u64 changed_bits = (add_to_rasterizer ? current_bits : ~current_bits) & new_bits; | 383 | u64 changed_bits = (add_to_rasterizer ? current_bits : ~current_bits) & new_bits; |
| 296 | VAddr addr = cpu_addr + word_index * BYTES_PER_WORD; | 384 | VAddr addr = cpu_addr + word_index * BYTES_PER_WORD; |
| 297 | while (changed_bits != 0) { | 385 | while (changed_bits != 0) { |
| @@ -315,21 +403,20 @@ private: | |||
| 315 | * @param query_cpu_range Base CPU address to loop over | 403 | * @param query_cpu_range Base CPU address to loop over |
| 316 | * @param size Size in bytes of the CPU range to loop over | 404 | * @param size Size in bytes of the CPU range to loop over |
| 317 | * @param func Function to call for each turned off region | 405 | * @param func Function to call for each turned off region |
| 318 | * | ||
| 319 | * @tparam gpu True for host GPU pages, false for CPU pages | ||
| 320 | * @tparam notify_rasterizer True when the rasterizer should be notified about state changes | ||
| 321 | */ | 406 | */ |
| 322 | template <bool gpu, bool notify_rasterizer, typename Func> | 407 | template <Type type, typename Func> |
| 323 | void ForEachModifiedRange(VAddr query_cpu_range, s64 size, Func&& func) { | 408 | void ForEachModifiedRange(VAddr query_cpu_range, s64 size, Func&& func) { |
| 409 | static_assert(type != Type::Untracked); | ||
| 410 | |||
| 324 | const s64 difference = query_cpu_range - cpu_addr; | 411 | const s64 difference = query_cpu_range - cpu_addr; |
| 325 | const u64 query_begin = std::max<s64>(difference, 0); | 412 | const u64 query_begin = std::max<s64>(difference, 0); |
| 326 | size += std::min<s64>(difference, 0); | 413 | size += std::min<s64>(difference, 0); |
| 327 | if (query_begin >= SizeBytes() || size < 0) { | 414 | if (query_begin >= SizeBytes() || size < 0) { |
| 328 | return; | 415 | return; |
| 329 | } | 416 | } |
| 330 | const u64* const cpu_words = words.cpu.Pointer(IsShort()); | 417 | u64* const untracked_words = Array<Type::Untracked>(); |
| 418 | u64* const state_words = Array<type>(); | ||
| 331 | const u64 query_end = query_begin + std::min(static_cast<u64>(size), SizeBytes()); | 419 | const u64 query_end = query_begin + std::min(static_cast<u64>(size), SizeBytes()); |
| 332 | u64* const state_words = (gpu ? words.gpu : words.cpu).Pointer(IsShort()); | ||
| 333 | u64* const words_begin = state_words + query_begin / BYTES_PER_WORD; | 420 | u64* const words_begin = state_words + query_begin / BYTES_PER_WORD; |
| 334 | u64* const words_end = state_words + Common::DivCeil(query_end, BYTES_PER_WORD); | 421 | u64* const words_end = state_words + Common::DivCeil(query_end, BYTES_PER_WORD); |
| 335 | 422 | ||
| @@ -345,7 +432,8 @@ private: | |||
| 345 | const u64 word_index_end = std::distance(state_words, last_modified_word); | 432 | const u64 word_index_end = std::distance(state_words, last_modified_word); |
| 346 | 433 | ||
| 347 | const unsigned local_page_begin = std::countr_zero(*first_modified_word); | 434 | const unsigned local_page_begin = std::countr_zero(*first_modified_word); |
| 348 | const unsigned local_page_end = PAGES_PER_WORD - std::countl_zero(last_modified_word[-1]); | 435 | const unsigned local_page_end = |
| 436 | static_cast<unsigned>(PAGES_PER_WORD) - std::countl_zero(last_modified_word[-1]); | ||
| 349 | const u64 word_page_begin = word_index_begin * PAGES_PER_WORD; | 437 | const u64 word_page_begin = word_index_begin * PAGES_PER_WORD; |
| 350 | const u64 word_page_end = (word_index_end - 1) * PAGES_PER_WORD; | 438 | const u64 word_page_end = (word_index_end - 1) * PAGES_PER_WORD; |
| 351 | const u64 query_page_begin = query_begin / BYTES_PER_PAGE; | 439 | const u64 query_page_begin = query_begin / BYTES_PER_PAGE; |
| @@ -371,11 +459,13 @@ private: | |||
| 371 | const u64 current_word = state_words[word_index] & bits; | 459 | const u64 current_word = state_words[word_index] & bits; |
| 372 | state_words[word_index] &= ~bits; | 460 | state_words[word_index] &= ~bits; |
| 373 | 461 | ||
| 374 | // Exclude CPU modified pages when visiting GPU pages | 462 | if constexpr (type == Type::CPU) { |
| 375 | const u64 word = current_word & ~(gpu ? cpu_words[word_index] : 0); | 463 | const u64 current_bits = untracked_words[word_index] & bits; |
| 376 | if constexpr (notify_rasterizer) { | 464 | untracked_words[word_index] &= ~bits; |
| 377 | NotifyRasterizer<true>(word_index, word, ~u64{0}); | 465 | NotifyRasterizer<true>(word_index, current_bits, ~u64{0}); |
| 378 | } | 466 | } |
| 467 | // Exclude CPU modified pages when visiting GPU pages | ||
| 468 | const u64 word = current_word & ~(type == Type::GPU ? untracked_words[word_index] : 0); | ||
| 379 | u64 page = page_begin; | 469 | u64 page = page_begin; |
| 380 | page_begin = 0; | 470 | page_begin = 0; |
| 381 | 471 | ||
| @@ -416,17 +506,20 @@ private: | |||
| 416 | * @param offset Offset in bytes from the start of the buffer | 506 | * @param offset Offset in bytes from the start of the buffer |
| 417 | * @param size Size in bytes of the region to query for modifications | 507 | * @param size Size in bytes of the region to query for modifications |
| 418 | */ | 508 | */ |
| 419 | template <bool gpu> | 509 | template <Type type> |
| 420 | [[nodiscard]] bool IsRegionModified(u64 offset, u64 size) const noexcept { | 510 | [[nodiscard]] bool IsRegionModified(u64 offset, u64 size) const noexcept { |
| 421 | const u64* const cpu_words = words.cpu.Pointer(IsShort()); | 511 | static_assert(type != Type::Untracked); |
| 422 | const u64* const state_words = (gpu ? words.gpu : words.cpu).Pointer(IsShort()); | 512 | |
| 513 | const u64* const untracked_words = Array<Type::Untracked>(); | ||
| 514 | const u64* const state_words = Array<type>(); | ||
| 423 | const u64 num_query_words = size / BYTES_PER_WORD + 1; | 515 | const u64 num_query_words = size / BYTES_PER_WORD + 1; |
| 424 | const u64 word_begin = offset / BYTES_PER_WORD; | 516 | const u64 word_begin = offset / BYTES_PER_WORD; |
| 425 | const u64 word_end = std::min(word_begin + num_query_words, NumWords()); | 517 | const u64 word_end = std::min(word_begin + num_query_words, NumWords()); |
| 426 | const u64 page_limit = Common::DivCeil(offset + size, BYTES_PER_PAGE); | 518 | const u64 page_limit = Common::DivCeil(offset + size, BYTES_PER_PAGE); |
| 427 | u64 page_index = (offset / BYTES_PER_PAGE) % PAGES_PER_WORD; | 519 | u64 page_index = (offset / BYTES_PER_PAGE) % PAGES_PER_WORD; |
| 428 | for (u64 word_index = word_begin; word_index < word_end; ++word_index, page_index = 0) { | 520 | for (u64 word_index = word_begin; word_index < word_end; ++word_index, page_index = 0) { |
| 429 | const u64 word = state_words[word_index] & ~(gpu ? cpu_words[word_index] : 0); | 521 | const u64 off_word = type == Type::GPU ? untracked_words[word_index] : 0; |
| 522 | const u64 word = state_words[word_index] & ~off_word; | ||
| 430 | if (word == 0) { | 523 | if (word == 0) { |
| 431 | continue; | 524 | continue; |
| 432 | } | 525 | } |
| @@ -445,13 +538,13 @@ private: | |||
| 445 | * | 538 | * |
| 446 | * @param offset Offset in bytes from the start of the buffer | 539 | * @param offset Offset in bytes from the start of the buffer |
| 447 | * @param size Size in bytes of the region to query for modifications | 540 | * @param size Size in bytes of the region to query for modifications |
| 448 | * | ||
| 449 | * @tparam gpu True to query GPU modified pages, false for CPU pages | ||
| 450 | */ | 541 | */ |
| 451 | template <bool gpu> | 542 | template <Type type> |
| 452 | [[nodiscard]] std::pair<u64, u64> ModifiedRegion(u64 offset, u64 size) const noexcept { | 543 | [[nodiscard]] std::pair<u64, u64> ModifiedRegion(u64 offset, u64 size) const noexcept { |
| 453 | const u64* const cpu_words = words.cpu.Pointer(IsShort()); | 544 | static_assert(type != Type::Untracked); |
| 454 | const u64* const state_words = (gpu ? words.gpu : words.cpu).Pointer(IsShort()); | 545 | |
| 546 | const u64* const untracked_words = Array<Type::Untracked>(); | ||
| 547 | const u64* const state_words = Array<type>(); | ||
| 455 | const u64 num_query_words = size / BYTES_PER_WORD + 1; | 548 | const u64 num_query_words = size / BYTES_PER_WORD + 1; |
| 456 | const u64 word_begin = offset / BYTES_PER_WORD; | 549 | const u64 word_begin = offset / BYTES_PER_WORD; |
| 457 | const u64 word_end = std::min(word_begin + num_query_words, NumWords()); | 550 | const u64 word_end = std::min(word_begin + num_query_words, NumWords()); |
| @@ -460,7 +553,8 @@ private: | |||
| 460 | u64 begin = std::numeric_limits<u64>::max(); | 553 | u64 begin = std::numeric_limits<u64>::max(); |
| 461 | u64 end = 0; | 554 | u64 end = 0; |
| 462 | for (u64 word_index = word_begin; word_index < word_end; ++word_index) { | 555 | for (u64 word_index = word_begin; word_index < word_end; ++word_index) { |
| 463 | const u64 word = state_words[word_index] & ~(gpu ? cpu_words[word_index] : 0); | 556 | const u64 off_word = type == Type::GPU ? untracked_words[word_index] : 0; |
| 557 | const u64 word = state_words[word_index] & ~off_word; | ||
| 464 | if (word == 0) { | 558 | if (word == 0) { |
| 465 | continue; | 559 | continue; |
| 466 | } | 560 | } |
| @@ -488,8 +582,9 @@ private: | |||
| 488 | 582 | ||
| 489 | RasterizerInterface* rasterizer = nullptr; | 583 | RasterizerInterface* rasterizer = nullptr; |
| 490 | VAddr cpu_addr = 0; | 584 | VAddr cpu_addr = 0; |
| 491 | GpuCpuWords words; | 585 | Words words; |
| 492 | BufferFlagBits flags{}; | 586 | BufferFlagBits flags{}; |
| 587 | int stream_score = 0; | ||
| 493 | }; | 588 | }; |
| 494 | 589 | ||
| 495 | } // namespace VideoCommon | 590 | } // namespace VideoCommon |
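
The core of the new `buffer_base.h` is page-granular state tracking: every `u64` word stores one dirty bit per page, and the four word arrays (`cpu`, `gpu`, `cached_cpu`, `untracked`) share one allocation, or a single inline word when the buffer is small. The following standalone sketch illustrates that bitmask scheme only; the class name and the 4 KiB page size are assumptions, and this is not yuzu's API.

```cpp
// Simplified sketch of word-based dirty page tracking (illustrative only).
// One bit per page, 64 pages per u64 word; offsets are assumed to lie inside the buffer.
#include <algorithm>
#include <cstdint>
#include <vector>

class DirtyBitmapSketch {
public:
    static constexpr std::uint64_t BYTES_PER_PAGE = 4096; // assumption for the sketch
    static constexpr std::uint64_t PAGES_PER_WORD = 64;
    static constexpr std::uint64_t BYTES_PER_WORD = BYTES_PER_PAGE * PAGES_PER_WORD;

    explicit DirtyBitmapSketch(std::uint64_t size_bytes)
        : words((size_bytes + BYTES_PER_WORD - 1) / BYTES_PER_WORD, 0) {}

    // Set every page bit covering [offset, offset + size), mirroring ChangeRegionState<type, true>.
    void MarkRegion(std::uint64_t offset, std::uint64_t size) {
        if (size == 0) {
            return;
        }
        const std::uint64_t page_begin = offset / BYTES_PER_PAGE;
        const std::uint64_t page_end = (offset + size + BYTES_PER_PAGE - 1) / BYTES_PER_PAGE;
        for (std::uint64_t word = page_begin / PAGES_PER_WORD;
             word <= (page_end - 1) / PAGES_PER_WORD; ++word) {
            const std::uint64_t first = std::max(page_begin, word * PAGES_PER_WORD);
            const std::uint64_t last = std::min(page_end, (word + 1) * PAGES_PER_WORD);
            // Build a mask with bits [first, last) of this word set, using the same shift trick.
            const std::uint64_t right = first % PAGES_PER_WORD;
            const std::uint64_t left = PAGES_PER_WORD - (last - word * PAGES_PER_WORD);
            std::uint64_t bits = ~std::uint64_t{0};
            bits = (bits >> right) << right;
            bits = (bits << left) >> left;
            words[word] |= bits;
        }
    }

    // Returns true if any page overlapping [offset, offset + size) is marked.
    // The real code scans whole words with std::countr_zero instead of per-page checks.
    bool IsRegionMarked(std::uint64_t offset, std::uint64_t size) const {
        const std::uint64_t page_begin = offset / BYTES_PER_PAGE;
        const std::uint64_t page_end = (offset + size + BYTES_PER_PAGE - 1) / BYTES_PER_PAGE;
        for (std::uint64_t page = page_begin; page < page_end; ++page) {
            if ((words[page / PAGES_PER_WORD] >> (page % PAGES_PER_WORD)) & 1) {
                return true;
            }
        }
        return false;
    }

private:
    std::vector<std::uint64_t> words; // the real BufferBase keeps four of these arrays
};
```

Packing 64 pages per word is what lets `ForEachModifiedRange`, `IsRegionModified`, and `ModifiedRegion` skip clean regions a whole word at a time using `std::countr_zero` and `std::countl_zero`.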
diff --git a/src/video_core/buffer_cache/buffer_block.h b/src/video_core/buffer_cache/buffer_block.h
deleted file mode 100644
index e9306194a..000000000
--- a/src/video_core/buffer_cache/buffer_block.h
+++ /dev/null
| @@ -1,62 +0,0 @@ | |||
| 1 | // Copyright 2019 yuzu Emulator Project | ||
| 2 | // Licensed under GPLv2 or any later version | ||
| 3 | // Refer to the license.txt file included. | ||
| 4 | |||
| 5 | #pragma once | ||
| 6 | |||
| 7 | #include "common/common_types.h" | ||
| 8 | |||
| 9 | namespace VideoCommon { | ||
| 10 | |||
| 11 | class BufferBlock { | ||
| 12 | public: | ||
| 13 | [[nodiscard]] bool Overlaps(VAddr start, VAddr end) const { | ||
| 14 | return (cpu_addr < end) && (cpu_addr_end > start); | ||
| 15 | } | ||
| 16 | |||
| 17 | [[nodiscard]] bool IsInside(VAddr other_start, VAddr other_end) const { | ||
| 18 | return cpu_addr <= other_start && other_end <= cpu_addr_end; | ||
| 19 | } | ||
| 20 | |||
| 21 | [[nodiscard]] std::size_t Offset(VAddr in_addr) const { | ||
| 22 | return static_cast<std::size_t>(in_addr - cpu_addr); | ||
| 23 | } | ||
| 24 | |||
| 25 | [[nodiscard]] VAddr CpuAddr() const { | ||
| 26 | return cpu_addr; | ||
| 27 | } | ||
| 28 | |||
| 29 | [[nodiscard]] VAddr CpuAddrEnd() const { | ||
| 30 | return cpu_addr_end; | ||
| 31 | } | ||
| 32 | |||
| 33 | void SetCpuAddr(VAddr new_addr) { | ||
| 34 | cpu_addr = new_addr; | ||
| 35 | cpu_addr_end = new_addr + size; | ||
| 36 | } | ||
| 37 | |||
| 38 | [[nodiscard]] std::size_t Size() const { | ||
| 39 | return size; | ||
| 40 | } | ||
| 41 | |||
| 42 | [[nodiscard]] u64 Epoch() const { | ||
| 43 | return epoch; | ||
| 44 | } | ||
| 45 | |||
| 46 | void SetEpoch(u64 new_epoch) { | ||
| 47 | epoch = new_epoch; | ||
| 48 | } | ||
| 49 | |||
| 50 | protected: | ||
| 51 | explicit BufferBlock(VAddr cpu_addr_, std::size_t size_) : size{size_} { | ||
| 52 | SetCpuAddr(cpu_addr_); | ||
| 53 | } | ||
| 54 | |||
| 55 | private: | ||
| 56 | VAddr cpu_addr{}; | ||
| 57 | VAddr cpu_addr_end{}; | ||
| 58 | std::size_t size{}; | ||
| 59 | u64 epoch{}; | ||
| 60 | }; | ||
| 61 | |||
| 62 | } // namespace VideoCommon | ||
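
With `BufferBlock` removed, the reimplemented cache in `buffer_cache.h` below locates buffers through a flat page table instead of interval bookkeeping: `PAGE_BITS = 16` gives 64 KiB cache pages, and each page of the 39-bit guest address space has a `BufferId` slot that `ForEachBufferInRange` indexes directly. The sketch below only illustrates that lookup; `FakeBuffer`, the `std::vector` storage, and the helper names are stand-ins, not the real implementation.

```cpp
// Illustrative sketch of the flat page-table lookup (not the real implementation).
// BufferId 0 stands for "no buffer", mirroring NULL_BUFFER_ID in buffer_cache.h.
#include <cstdint>
#include <vector>

using BufferId = std::uint32_t;

constexpr std::uint32_t PAGE_BITS = 16;                      // 64 KiB cache pages
constexpr std::uint64_t PAGE_SIZE = std::uint64_t{1} << PAGE_BITS;

// One slot per page of a 39-bit guest address space (about 8 million entries).
std::vector<BufferId> page_table((std::uint64_t{1} << 39) >> PAGE_BITS, 0);

struct FakeBuffer {
    std::uint64_t cpu_addr;
    std::uint64_t size_bytes;
};

// Point every page covered by the buffer back at its id, which is what registration
// in the real cache amounts to.
void RegisterSketch(BufferId id, const FakeBuffer& buffer) {
    const std::uint64_t page_begin = buffer.cpu_addr >> PAGE_BITS;
    const std::uint64_t page_end =
        (buffer.cpu_addr + buffer.size_bytes + PAGE_SIZE - 1) >> PAGE_BITS;
    for (std::uint64_t page = page_begin; page < page_end; ++page) {
        page_table[page] = id;
    }
}

// Constant-time lookup: the owning buffer (if any) of a CPU address.
BufferId FindBufferSketch(std::uint64_t cpu_addr) {
    return page_table[cpu_addr >> PAGE_BITS];
}
```

The trade-off is a fixed-size table (about 8 million slots at these sizes) in exchange for constant-time lookups with no interval search.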
diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp
new file mode 100644
index 000000000..ab32294c8
--- /dev/null
+++ b/src/video_core/buffer_cache/buffer_cache.cpp
| @@ -0,0 +1,13 @@ | |||
| 1 | // Copyright 2021 yuzu Emulator Project | ||
| 2 | // Licensed under GPLv2 or any later version | ||
| 3 | // Refer to the license.txt file included. | ||
| 4 | |||
| 5 | #include "common/microprofile.h" | ||
| 6 | |||
| 7 | namespace VideoCommon { | ||
| 8 | |||
| 9 | MICROPROFILE_DEFINE(GPU_PrepareBuffers, "GPU", "Prepare buffers", MP_RGB(224, 128, 128)); | ||
| 10 | MICROPROFILE_DEFINE(GPU_BindUploadBuffers, "GPU", "Bind and upload buffers", MP_RGB(224, 128, 128)); | ||
| 11 | MICROPROFILE_DEFINE(GPU_DownloadMemory, "GPU", "Download buffers", MP_RGB(224, 128, 128)); | ||
| 12 | |||
| 13 | } // namespace VideoCommon | ||
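
This new translation unit only holds the `MICROPROFILE_DEFINE` instances; the header re-declares the same tokens with `MICROPROFILE_DECLARE` so the templated cache can time its hot paths. A minimal usage sketch, assuming the stock MicroProfile macros; `HypotheticalUpdate` is an illustrative caller, not a function from this patch.

```cpp
// Hedged sketch of how code in the header would typically reference the timers
// defined in buffer_cache.cpp (assumes the standard MicroProfile macro set).
#include "common/microprofile.h"

MICROPROFILE_DECLARE(GPU_PrepareBuffers); // token defined once in buffer_cache.cpp

void HypotheticalUpdate() {
    // Everything in this scope is attributed to GPU / "Prepare buffers" in the profiler.
    MICROPROFILE_SCOPE(GPU_PrepareBuffers);
    // ... buffer preparation work ...
}
```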
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index 83b9ee871..2a6844ab1 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
| @@ -4,591 +4,1289 @@ | |||
| 4 | 4 | ||
| 5 | #pragma once | 5 | #pragma once |
| 6 | 6 | ||
| 7 | #include <list> | 7 | #include <algorithm> |
| 8 | #include <array> | ||
| 9 | #include <deque> | ||
| 8 | #include <memory> | 10 | #include <memory> |
| 9 | #include <mutex> | 11 | #include <mutex> |
| 12 | #include <span> | ||
| 10 | #include <unordered_map> | 13 | #include <unordered_map> |
| 11 | #include <unordered_set> | ||
| 12 | #include <utility> | ||
| 13 | #include <vector> | 14 | #include <vector> |
| 14 | 15 | ||
| 15 | #include <boost/container/small_vector.hpp> | 16 | #include <boost/container/small_vector.hpp> |
| 16 | #include <boost/icl/interval_set.hpp> | ||
| 17 | #include <boost/intrusive/set.hpp> | ||
| 18 | 17 | ||
| 19 | #include "common/alignment.h" | ||
| 20 | #include "common/assert.h" | ||
| 21 | #include "common/common_types.h" | 18 | #include "common/common_types.h" |
| 22 | #include "common/logging/log.h" | 19 | #include "common/div_ceil.h" |
| 23 | #include "core/core.h" | 20 | #include "common/microprofile.h" |
| 21 | #include "common/scope_exit.h" | ||
| 24 | #include "core/memory.h" | 22 | #include "core/memory.h" |
| 25 | #include "core/settings.h" | 23 | #include "core/settings.h" |
| 26 | #include "video_core/buffer_cache/buffer_block.h" | 24 | #include "video_core/buffer_cache/buffer_base.h" |
| 27 | #include "video_core/buffer_cache/map_interval.h" | 25 | #include "video_core/delayed_destruction_ring.h" |
| 26 | #include "video_core/dirty_flags.h" | ||
| 27 | #include "video_core/engines/kepler_compute.h" | ||
| 28 | #include "video_core/engines/maxwell_3d.h" | ||
| 28 | #include "video_core/memory_manager.h" | 29 | #include "video_core/memory_manager.h" |
| 29 | #include "video_core/rasterizer_interface.h" | 30 | #include "video_core/rasterizer_interface.h" |
| 31 | #include "video_core/texture_cache/slot_vector.h" | ||
| 32 | #include "video_core/texture_cache/types.h" | ||
| 30 | 33 | ||
| 31 | namespace VideoCommon { | 34 | namespace VideoCommon { |
| 32 | 35 | ||
| 33 | template <typename Buffer, typename BufferType, typename StreamBuffer> | 36 | MICROPROFILE_DECLARE(GPU_PrepareBuffers); |
| 37 | MICROPROFILE_DECLARE(GPU_BindUploadBuffers); | ||
| 38 | MICROPROFILE_DECLARE(GPU_DownloadMemory); | ||
| 39 | |||
| 40 | using BufferId = SlotId; | ||
| 41 | |||
| 42 | constexpr u32 NUM_VERTEX_BUFFERS = 32; | ||
| 43 | constexpr u32 NUM_TRANSFORM_FEEDBACK_BUFFERS = 4; | ||
| 44 | constexpr u32 NUM_GRAPHICS_UNIFORM_BUFFERS = 18; | ||
| 45 | constexpr u32 NUM_COMPUTE_UNIFORM_BUFFERS = 8; | ||
| 46 | constexpr u32 NUM_STORAGE_BUFFERS = 16; | ||
| 47 | constexpr u32 NUM_STAGES = 5; | ||
| 48 | |||
| 49 | template <typename P> | ||
| 34 | class BufferCache { | 50 | class BufferCache { |
| 35 | using IntervalSet = boost::icl::interval_set<VAddr>; | 51 | // Page size for caching purposes. |
| 36 | using IntervalType = typename IntervalSet::interval_type; | 52 | // This is unrelated to the CPU page size and it can be changed as it seems optimal. |
| 37 | using VectorMapInterval = boost::container::small_vector<MapInterval*, 1>; | 53 | static constexpr u32 PAGE_BITS = 16; |
| 54 | static constexpr u64 PAGE_SIZE = u64{1} << PAGE_BITS; | ||
| 55 | |||
| 56 | static constexpr bool IS_OPENGL = P::IS_OPENGL; | ||
| 57 | static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS = | ||
| 58 | P::HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS; | ||
| 59 | static constexpr bool HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT = | ||
| 60 | P::HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT; | ||
| 61 | static constexpr bool NEEDS_BIND_UNIFORM_INDEX = P::NEEDS_BIND_UNIFORM_INDEX; | ||
| 62 | static constexpr bool NEEDS_BIND_STORAGE_INDEX = P::NEEDS_BIND_STORAGE_INDEX; | ||
| 63 | static constexpr bool USE_MEMORY_MAPS = P::USE_MEMORY_MAPS; | ||
| 64 | |||
| 65 | static constexpr BufferId NULL_BUFFER_ID{0}; | ||
| 66 | |||
| 67 | using Maxwell = Tegra::Engines::Maxwell3D::Regs; | ||
| 68 | |||
| 69 | using Runtime = typename P::Runtime; | ||
| 70 | using Buffer = typename P::Buffer; | ||
| 71 | |||
| 72 | struct Empty {}; | ||
| 73 | |||
| 74 | struct OverlapResult { | ||
| 75 | std::vector<BufferId> ids; | ||
| 76 | VAddr begin; | ||
| 77 | VAddr end; | ||
| 78 | bool has_stream_leap = false; | ||
| 79 | }; | ||
| 38 | 80 | ||
| 39 | static constexpr u64 WRITE_PAGE_BIT = 11; | 81 | struct Binding { |
| 40 | static constexpr u64 BLOCK_PAGE_BITS = 21; | 82 | VAddr cpu_addr{}; |
| 41 | static constexpr u64 BLOCK_PAGE_SIZE = 1ULL << BLOCK_PAGE_BITS; | 83 | u32 size{}; |
| 84 | BufferId buffer_id; | ||
| 85 | }; | ||
| 42 | 86 | ||
| 43 | public: | 87 | static constexpr Binding NULL_BINDING{ |
| 44 | struct BufferInfo { | 88 | .cpu_addr = 0, |
| 45 | BufferType handle; | 89 | .size = 0, |
| 46 | u64 offset; | 90 | .buffer_id = NULL_BUFFER_ID, |
| 47 | u64 address; | ||
| 48 | }; | 91 | }; |
| 49 | 92 | ||
| 50 | BufferInfo UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4, | 93 | public: |
| 51 | bool is_written = false, bool use_fast_cbuf = false) { | 94 | static constexpr u32 SKIP_CACHE_SIZE = 4096; |
| 52 | std::lock_guard lock{mutex}; | ||
| 53 | 95 | ||
| 54 | const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); | 96 | explicit BufferCache(VideoCore::RasterizerInterface& rasterizer_, |
| 55 | if (!cpu_addr) { | 97 | Tegra::Engines::Maxwell3D& maxwell3d_, |
| 56 | return GetEmptyBuffer(size); | 98 | Tegra::Engines::KeplerCompute& kepler_compute_, |
| 57 | } | 99 | Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_, |
| 100 | Runtime& runtime_); | ||
| 58 | 101 | ||
| 59 | // Cache management is a big overhead, so only cache entries with a given size. | 102 | void TickFrame(); |
| 60 | // TODO: Figure out which size is the best for given games. | ||
| 61 | constexpr std::size_t max_stream_size = 0x800; | ||
| 62 | if (use_fast_cbuf || size < max_stream_size) { | ||
| 63 | if (!is_written && !IsRegionWritten(*cpu_addr, *cpu_addr + size - 1)) { | ||
| 64 | const bool is_granular = gpu_memory.IsGranularRange(gpu_addr, size); | ||
| 65 | if (use_fast_cbuf) { | ||
| 66 | u8* dest; | ||
| 67 | if (is_granular) { | ||
| 68 | dest = gpu_memory.GetPointer(gpu_addr); | ||
| 69 | } else { | ||
| 70 | staging_buffer.resize(size); | ||
| 71 | dest = staging_buffer.data(); | ||
| 72 | gpu_memory.ReadBlockUnsafe(gpu_addr, dest, size); | ||
| 73 | } | ||
| 74 | return ConstBufferUpload(dest, size); | ||
| 75 | } | ||
| 76 | if (is_granular) { | ||
| 77 | u8* const host_ptr = gpu_memory.GetPointer(gpu_addr); | ||
| 78 | return StreamBufferUpload(size, alignment, [host_ptr, size](u8* dest) { | ||
| 79 | std::memcpy(dest, host_ptr, size); | ||
| 80 | }); | ||
| 81 | } else { | ||
| 82 | return StreamBufferUpload(size, alignment, [this, gpu_addr, size](u8* dest) { | ||
| 83 | gpu_memory.ReadBlockUnsafe(gpu_addr, dest, size); | ||
| 84 | }); | ||
| 85 | } | ||
| 86 | } | ||
| 87 | } | ||
| 88 | 103 | ||
| 89 | Buffer* const block = GetBlock(*cpu_addr, size); | 104 | void WriteMemory(VAddr cpu_addr, u64 size); |
| 90 | MapInterval* const map = MapAddress(block, gpu_addr, *cpu_addr, size); | ||
| 91 | if (!map) { | ||
| 92 | return GetEmptyBuffer(size); | ||
| 93 | } | ||
| 94 | if (is_written) { | ||
| 95 | map->MarkAsModified(true, GetModifiedTicks()); | ||
| 96 | if (Settings::IsGPULevelHigh() && | ||
| 97 | Settings::values.use_asynchronous_gpu_emulation.GetValue()) { | ||
| 98 | MarkForAsyncFlush(map); | ||
| 99 | } | ||
| 100 | if (!map->is_written) { | ||
| 101 | map->is_written = true; | ||
| 102 | MarkRegionAsWritten(map->start, map->end - 1); | ||
| 103 | } | ||
| 104 | } | ||
| 105 | 105 | ||
| 106 | return BufferInfo{block->Handle(), block->Offset(*cpu_addr), block->Address()}; | 106 | void CachedWriteMemory(VAddr cpu_addr, u64 size); |
| 107 | } | ||
| 108 | 107 | ||
| 109 | /// Uploads from a host memory. Returns the OpenGL buffer where it's located and its offset. | 108 | void DownloadMemory(VAddr cpu_addr, u64 size); |
| 110 | BufferInfo UploadHostMemory(const void* raw_pointer, std::size_t size, | ||
| 111 | std::size_t alignment = 4) { | ||
| 112 | std::lock_guard lock{mutex}; | ||
| 113 | return StreamBufferUpload(size, alignment, [raw_pointer, size](u8* dest) { | ||
| 114 | std::memcpy(dest, raw_pointer, size); | ||
| 115 | }); | ||
| 116 | } | ||
| 117 | 109 | ||
| 118 | /// Prepares the buffer cache for data uploading | 110 | void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size); |
| 119 | /// @param max_size Maximum number of bytes that will be uploaded | ||
| 120 | /// @return True when a stream buffer invalidation was required, false otherwise | ||
| 121 | void Map(std::size_t max_size) { | ||
| 122 | std::lock_guard lock{mutex}; | ||
| 123 | 111 | ||
| 124 | std::tie(buffer_ptr, buffer_offset_base) = stream_buffer.Map(max_size, 4); | 112 | void UpdateGraphicsBuffers(bool is_indexed); |
| 125 | buffer_offset = buffer_offset_base; | ||
| 126 | } | ||
| 127 | 113 | ||
| 128 | /// Finishes the upload stream | 114 | void UpdateComputeBuffers(); |
| 129 | void Unmap() { | ||
| 130 | std::lock_guard lock{mutex}; | ||
| 131 | stream_buffer.Unmap(buffer_offset - buffer_offset_base); | ||
| 132 | } | ||
| 133 | 115 | ||
| 134 | /// Function called at the end of each frame, intended for deferred operations | 116 | void BindHostGeometryBuffers(bool is_indexed); |
| 135 | void TickFrame() { | ||
| 136 | ++epoch; | ||
| 137 | 117 | ||
| 138 | while (!pending_destruction.empty()) { | 118 | void BindHostStageBuffers(size_t stage); |
| 139 | // Delay at least 4 frames before destruction. | ||
| 140 | // This is due to triple buffering happening on some drivers. | ||
| 141 | static constexpr u64 epochs_to_destroy = 5; | ||
| 142 | if (pending_destruction.front()->Epoch() + epochs_to_destroy > epoch) { | ||
| 143 | break; | ||
| 144 | } | ||
| 145 | pending_destruction.pop(); | ||
| 146 | } | ||
| 147 | } | ||
| 148 | 119 | ||
| 149 | /// Write any cached resources overlapping the specified region back to memory | 120 | void BindHostComputeBuffers(); |
| 150 | void FlushRegion(VAddr addr, std::size_t size) { | ||
| 151 | std::lock_guard lock{mutex}; | ||
| 152 | 121 | ||
| 153 | VectorMapInterval objects = GetMapsInRange(addr, size); | 122 | void SetEnabledUniformBuffers(size_t stage, u32 enabled); |
| 154 | std::sort(objects.begin(), objects.end(), | ||
| 155 | [](MapInterval* lhs, MapInterval* rhs) { return lhs->ticks < rhs->ticks; }); | ||
| 156 | for (MapInterval* object : objects) { | ||
| 157 | if (object->is_modified && object->is_registered) { | ||
| 158 | mutex.unlock(); | ||
| 159 | FlushMap(object); | ||
| 160 | mutex.lock(); | ||
| 161 | } | ||
| 162 | } | ||
| 163 | } | ||
| 164 | 123 | ||
| 165 | bool MustFlushRegion(VAddr addr, std::size_t size) { | 124 | void SetEnabledComputeUniformBuffers(u32 enabled); |
| 166 | std::lock_guard lock{mutex}; | ||
| 167 | 125 | ||
| 168 | const VectorMapInterval objects = GetMapsInRange(addr, size); | 126 | void UnbindGraphicsStorageBuffers(size_t stage); |
| 169 | return std::any_of(objects.cbegin(), objects.cend(), [](const MapInterval* map) { | ||
| 170 | return map->is_modified && map->is_registered; | ||
| 171 | }); | ||
| 172 | } | ||
| 173 | 127 | ||
| 174 | /// Mark the specified region as being invalidated | 128 | void BindGraphicsStorageBuffer(size_t stage, size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset, |
| 175 | void InvalidateRegion(VAddr addr, u64 size) { | 129 | bool is_written); |
| 176 | std::lock_guard lock{mutex}; | ||
| 177 | 130 | ||
| 178 | for (auto& object : GetMapsInRange(addr, size)) { | 131 | void UnbindComputeStorageBuffers(); |
| 179 | if (object->is_registered) { | ||
| 180 | Unregister(object); | ||
| 181 | } | ||
| 182 | } | ||
| 183 | } | ||
| 184 | 132 | ||
| 185 | void OnCPUWrite(VAddr addr, std::size_t size) { | 133 | void BindComputeStorageBuffer(size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset, |
| 186 | std::lock_guard lock{mutex}; | 134 | bool is_written); |
| 187 | 135 | ||
| 188 | for (MapInterval* object : GetMapsInRange(addr, size)) { | 136 | void FlushCachedWrites(); |
| 189 | if (object->is_memory_marked && object->is_registered) { | ||
| 190 | UnmarkMemory(object); | ||
| 191 | object->is_sync_pending = true; | ||
| 192 | marked_for_unregister.emplace_back(object); | ||
| 193 | } | ||
| 194 | } | ||
| 195 | } | ||
| 196 | 137 | ||
| 197 | void SyncGuestHost() { | 138 | /// Return true when there are uncommitted buffers to be downloaded |
| 198 | std::lock_guard lock{mutex}; | 139 | [[nodiscard]] bool HasUncommittedFlushes() const noexcept; |
| 199 | 140 | ||
| 200 | for (auto& object : marked_for_unregister) { | 141 | /// Return true when the caller should wait for async downloads |
| 201 | if (object->is_registered) { | 142 | [[nodiscard]] bool ShouldWaitAsyncFlushes() const noexcept; |
| 202 | object->is_sync_pending = false; | 143 | |
| 203 | Unregister(object); | 144 | /// Commit asynchronous downloads |
| 204 | } | 145 | void CommitAsyncFlushes(); |
| 146 | |||
| 147 | /// Pop asynchronous downloads | ||
| 148 | void PopAsyncFlushes(); | ||
| 149 | |||
| 150 | /// Return true when a CPU region is modified from the GPU | ||
| 151 | [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size); | ||
| 152 | |||
| 153 | std::mutex mutex; | ||
| 154 | |||
| 155 | private: | ||
| 156 | template <typename Func> | ||
| 157 | static void ForEachEnabledBit(u32 enabled_mask, Func&& func) { | ||
| 158 | for (u32 index = 0; enabled_mask != 0; ++index, enabled_mask >>= 1) { | ||
| 159 | const int disabled_bits = std::countr_zero(enabled_mask); | ||
| 160 | index += disabled_bits; | ||
| 161 | enabled_mask >>= disabled_bits; | ||
| 162 | func(index); | ||
| 205 | } | 163 | } |
| 206 | marked_for_unregister.clear(); | ||
| 207 | } | 164 | } |
| 208 | 165 | ||
| 209 | void CommitAsyncFlushes() { | 166 | template <typename Func> |
| 210 | if (uncommitted_flushes) { | 167 | void ForEachBufferInRange(VAddr cpu_addr, u64 size, Func&& func) { |
| 211 | auto commit_list = std::make_shared<std::list<MapInterval*>>(); | 168 | const u64 page_end = Common::DivCeil(cpu_addr + size, PAGE_SIZE); |
| 212 | for (MapInterval* map : *uncommitted_flushes) { | 169 | for (u64 page = cpu_addr >> PAGE_BITS; page < page_end;) { |
| 213 | if (map->is_registered && map->is_modified) { | 170 | const BufferId buffer_id = page_table[page]; |
| 214 | // TODO(Blinkhawk): Implement backend asynchronous flushing | 171 | if (!buffer_id) { |
| 215 | // AsyncFlushMap(map) | 172 | ++page; |
| 216 | commit_list->push_back(map); | 173 | continue; |
| 217 | } | ||
| 218 | } | ||
| 219 | if (!commit_list->empty()) { | ||
| 220 | committed_flushes.push_back(commit_list); | ||
| 221 | } else { | ||
| 222 | committed_flushes.emplace_back(); | ||
| 223 | } | 174 | } |
| 224 | } else { | 175 | Buffer& buffer = slot_buffers[buffer_id]; |
| 225 | committed_flushes.emplace_back(); | 176 | func(buffer_id, buffer); |
| 177 | |||
| 178 | const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes(); | ||
| 179 | page = Common::DivCeil(end_addr, PAGE_SIZE); | ||
| 226 | } | 180 | } |
| 227 | uncommitted_flushes.reset(); | ||
| 228 | } | 181 | } |
| 229 | 182 | ||
| 230 | bool ShouldWaitAsyncFlushes() const { | 183 | static bool IsRangeGranular(VAddr cpu_addr, size_t size) { |
| 231 | return !committed_flushes.empty() && committed_flushes.front() != nullptr; | 184 | return (cpu_addr & ~Core::Memory::PAGE_MASK) == |
| 185 | ((cpu_addr + size) & ~Core::Memory::PAGE_MASK); | ||
| 232 | } | 186 | } |
| 233 | 187 | ||
| 234 | bool HasUncommittedFlushes() const { | 188 | void BindHostIndexBuffer(); |
| 235 | return uncommitted_flushes != nullptr; | ||
| 236 | } | ||
| 237 | 189 | ||
| 238 | void PopAsyncFlushes() { | 190 | void BindHostVertexBuffers(); |
| 239 | if (committed_flushes.empty()) { | ||
| 240 | return; | ||
| 241 | } | ||
| 242 | auto& flush_list = committed_flushes.front(); | ||
| 243 | if (!flush_list) { | ||
| 244 | committed_flushes.pop_front(); | ||
| 245 | return; | ||
| 246 | } | ||
| 247 | for (MapInterval* map : *flush_list) { | ||
| 248 | if (map->is_registered) { | ||
| 249 | // TODO(Blinkhawk): Replace this for reading the asynchronous flush | ||
| 250 | FlushMap(map); | ||
| 251 | } | ||
| 252 | } | ||
| 253 | committed_flushes.pop_front(); | ||
| 254 | } | ||
| 255 | 191 | ||
| 256 | virtual BufferInfo GetEmptyBuffer(std::size_t size) = 0; | 192 | void BindHostGraphicsUniformBuffers(size_t stage); |
| 257 | 193 | ||
| 258 | protected: | 194 | void BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 binding_index, bool needs_bind); |
| 259 | explicit BufferCache(VideoCore::RasterizerInterface& rasterizer_, | ||
| 260 | Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_, | ||
| 261 | StreamBuffer& stream_buffer_) | ||
| 262 | : rasterizer{rasterizer_}, gpu_memory{gpu_memory_}, cpu_memory{cpu_memory_}, | ||
| 263 | stream_buffer{stream_buffer_} {} | ||
| 264 | 195 | ||
| 265 | ~BufferCache() = default; | 196 | void BindHostGraphicsStorageBuffers(size_t stage); |
| 266 | 197 | ||
| 267 | virtual std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) = 0; | 198 | void BindHostTransformFeedbackBuffers(); |
| 268 | 199 | ||
| 269 | virtual BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) { | 200 | void BindHostComputeUniformBuffers(); |
| 270 | return {}; | ||
| 271 | } | ||
| 272 | 201 | ||
| 273 | /// Register an object into the cache | 202 | void BindHostComputeStorageBuffers(); |
| 274 | MapInterval* Register(MapInterval new_map, bool inherit_written = false) { | ||
| 275 | const VAddr cpu_addr = new_map.start; | ||
| 276 | if (!cpu_addr) { | ||
| 277 | LOG_CRITICAL(HW_GPU, "Failed to register buffer with unmapped gpu_address 0x{:016x}", | ||
| 278 | new_map.gpu_addr); | ||
| 279 | return nullptr; | ||
| 280 | } | ||
| 281 | const std::size_t size = new_map.end - new_map.start; | ||
| 282 | new_map.is_registered = true; | ||
| 283 | rasterizer.UpdatePagesCachedCount(cpu_addr, size, 1); | ||
| 284 | new_map.is_memory_marked = true; | ||
| 285 | if (inherit_written) { | ||
| 286 | MarkRegionAsWritten(new_map.start, new_map.end - 1); | ||
| 287 | new_map.is_written = true; | ||
| 288 | } | ||
| 289 | MapInterval* const storage = mapped_addresses_allocator.Allocate(); | ||
| 290 | *storage = new_map; | ||
| 291 | mapped_addresses.insert(*storage); | ||
| 292 | return storage; | ||
| 293 | } | ||
| 294 | 203 | ||
| 295 | void UnmarkMemory(MapInterval* map) { | 204 | void DoUpdateGraphicsBuffers(bool is_indexed); |
| 296 | if (!map->is_memory_marked) { | 205 | |
| 297 | return; | 206 | void DoUpdateComputeBuffers(); |
| 298 | } | 207 | |
| 299 | const std::size_t size = map->end - map->start; | 208 | void UpdateIndexBuffer(); |
| 300 | rasterizer.UpdatePagesCachedCount(map->start, size, -1); | 209 | |
| 301 | map->is_memory_marked = false; | 210 | void UpdateVertexBuffers(); |
| 302 | } | 211 | |
| 303 | 212 | void UpdateVertexBuffer(u32 index); | |
| 304 | /// Unregisters an object from the cache | 213 | |
| 305 | void Unregister(MapInterval* map) { | 214 | void UpdateUniformBuffers(size_t stage); |
| 306 | UnmarkMemory(map); | 215 | |
| 307 | map->is_registered = false; | 216 | void UpdateStorageBuffers(size_t stage); |
| 308 | if (map->is_sync_pending) { | 217 | |
| 309 | map->is_sync_pending = false; | 218 | void UpdateTransformFeedbackBuffers(); |
| 310 | marked_for_unregister.remove(map); | 219 | |
| 220 | void UpdateTransformFeedbackBuffer(u32 index); | ||
| 221 | |||
| 222 | void UpdateComputeUniformBuffers(); | ||
| 223 | |||
| 224 | void UpdateComputeStorageBuffers(); | ||
| 225 | |||
| 226 | void MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 size); | ||
| 227 | |||
| 228 | [[nodiscard]] BufferId FindBuffer(VAddr cpu_addr, u32 size); | ||
| 229 | |||
| 230 | [[nodiscard]] OverlapResult ResolveOverlaps(VAddr cpu_addr, u32 wanted_size); | ||
| 231 | |||
| 232 | void JoinOverlap(BufferId new_buffer_id, BufferId overlap_id, bool accumulate_stream_score); | ||
| 233 | |||
| 234 | [[nodiscard]] BufferId CreateBuffer(VAddr cpu_addr, u32 wanted_size); | ||
| 235 | |||
| 236 | void Register(BufferId buffer_id); | ||
| 237 | |||
| 238 | void Unregister(BufferId buffer_id); | ||
| 239 | |||
| 240 | template <bool insert> | ||
| 241 | void ChangeRegister(BufferId buffer_id); | ||
| 242 | |||
| 243 | void SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size); | ||
| 244 | |||
| 245 | void SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size); | ||
| 246 | |||
| 247 | void UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy, | ||
| 248 | std::span<BufferCopy> copies); | ||
| 249 | |||
| 250 | void ImmediateUploadMemory(Buffer& buffer, u64 largest_copy, | ||
| 251 | std::span<const BufferCopy> copies); | ||
| 252 | |||
| 253 | void MappedUploadMemory(Buffer& buffer, u64 total_size_bytes, std::span<BufferCopy> copies); | ||
| 254 | |||
| 255 | void DeleteBuffer(BufferId buffer_id); | ||
| 256 | |||
| 257 | void ReplaceBufferDownloads(BufferId old_buffer_id, BufferId new_buffer_id); | ||
| 258 | |||
| 259 | void NotifyBufferDeletion(); | ||
| 260 | |||
| 261 | [[nodiscard]] Binding StorageBufferBinding(GPUVAddr ssbo_addr) const; | ||
| 262 | |||
| 263 | [[nodiscard]] std::span<const u8> ImmediateBufferWithData(VAddr cpu_addr, size_t size); | ||
| 264 | |||
| 265 | [[nodiscard]] std::span<u8> ImmediateBuffer(size_t wanted_capacity); | ||
| 266 | |||
| 267 | [[nodiscard]] bool HasFastUniformBufferBound(size_t stage, u32 binding_index) const noexcept; | ||
| 268 | |||
| 269 | VideoCore::RasterizerInterface& rasterizer; | ||
| 270 | Tegra::Engines::Maxwell3D& maxwell3d; | ||
| 271 | Tegra::Engines::KeplerCompute& kepler_compute; | ||
| 272 | Tegra::MemoryManager& gpu_memory; | ||
| 273 | Core::Memory::Memory& cpu_memory; | ||
| 274 | Runtime& runtime; | ||
| 275 | |||
| 276 | SlotVector<Buffer> slot_buffers; | ||
| 277 | DelayedDestructionRing<Buffer, 8> delayed_destruction_ring; | ||
| 278 | |||
| 279 | u32 last_index_count = 0; | ||
| 280 | |||
| 281 | Binding index_buffer; | ||
| 282 | std::array<Binding, NUM_VERTEX_BUFFERS> vertex_buffers; | ||
| 283 | std::array<std::array<Binding, NUM_GRAPHICS_UNIFORM_BUFFERS>, NUM_STAGES> uniform_buffers; | ||
| 284 | std::array<std::array<Binding, NUM_STORAGE_BUFFERS>, NUM_STAGES> storage_buffers; | ||
| 285 | std::array<Binding, NUM_TRANSFORM_FEEDBACK_BUFFERS> transform_feedback_buffers; | ||
| 286 | |||
| 287 | std::array<Binding, NUM_COMPUTE_UNIFORM_BUFFERS> compute_uniform_buffers; | ||
| 288 | std::array<Binding, NUM_STORAGE_BUFFERS> compute_storage_buffers; | ||
| 289 | |||
| 290 | std::array<u32, NUM_STAGES> enabled_uniform_buffers{}; | ||
| 291 | u32 enabled_compute_uniform_buffers = 0; | ||
| 292 | |||
| 293 | std::array<u32, NUM_STAGES> enabled_storage_buffers{}; | ||
| 294 | std::array<u32, NUM_STAGES> written_storage_buffers{}; | ||
| 295 | u32 enabled_compute_storage_buffers = 0; | ||
| 296 | u32 written_compute_storage_buffers = 0; | ||
| 297 | |||
| 298 | std::array<u32, NUM_STAGES> fast_bound_uniform_buffers{}; | ||
| 299 | |||
| 300 | bool has_deleted_buffers = false; | ||
| 301 | |||
| 302 | std::conditional_t<HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS, std::array<u32, NUM_STAGES>, Empty> | ||
| 303 | dirty_uniform_buffers{}; | ||
| 304 | |||
| 305 | std::vector<BufferId> cached_write_buffer_ids; | ||
| 306 | |||
| 307 | // TODO: This data structure is not optimal and it should be reworked | ||
| 308 | std::vector<BufferId> uncommitted_downloads; | ||
| 309 | std::deque<std::vector<BufferId>> committed_downloads; | ||
| 310 | |||
| 311 | size_t immediate_buffer_capacity = 0; | ||
| 312 | std::unique_ptr<u8[]> immediate_buffer_alloc; | ||
| 313 | |||
| 314 | std::array<BufferId, ((1ULL << 39) >> PAGE_BITS)> page_table; | ||
| 315 | }; | ||
| 316 | |||
| 317 | template <class P> | ||
| 318 | BufferCache<P>::BufferCache(VideoCore::RasterizerInterface& rasterizer_, | ||
| 319 | Tegra::Engines::Maxwell3D& maxwell3d_, | ||
| 320 | Tegra::Engines::KeplerCompute& kepler_compute_, | ||
| 321 | Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_, | ||
| 322 | Runtime& runtime_) | ||
| 323 | : rasterizer{rasterizer_}, maxwell3d{maxwell3d_}, kepler_compute{kepler_compute_}, | ||
| 324 | gpu_memory{gpu_memory_}, cpu_memory{cpu_memory_}, runtime{runtime_} { | ||
| 325 | // Ensure the first slot is used for the null buffer | ||
| 326 | void(slot_buffers.insert(runtime, NullBufferParams{})); | ||
| 327 | } | ||
| 328 | |||
| 329 | template <class P> | ||
| 330 | void BufferCache<P>::TickFrame() { | ||
| 331 | delayed_destruction_ring.Tick(); | ||
| 332 | } | ||
| 333 | |||
| 334 | template <class P> | ||
| 335 | void BufferCache<P>::WriteMemory(VAddr cpu_addr, u64 size) { | ||
| 336 | ForEachBufferInRange(cpu_addr, size, [&](BufferId, Buffer& buffer) { | ||
| 337 | buffer.MarkRegionAsCpuModified(cpu_addr, size); | ||
| 338 | }); | ||
| 339 | } | ||
| 340 | |||
| 341 | template <class P> | ||
| 342 | void BufferCache<P>::CachedWriteMemory(VAddr cpu_addr, u64 size) { | ||
| 343 | ForEachBufferInRange(cpu_addr, size, [&](BufferId buffer_id, Buffer& buffer) { | ||
| 344 | if (!buffer.HasCachedWrites()) { | ||
| 345 | cached_write_buffer_ids.push_back(buffer_id); | ||
| 311 | } | 346 | } |
| 312 | if (map->is_written) { | 347 | buffer.CachedCpuWrite(cpu_addr, size); |
| 313 | UnmarkRegionAsWritten(map->start, map->end - 1); | 348 | }); |
| 349 | } | ||
| 350 | |||
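The two functions above defer CPU writes: a buffer that receives its first cached write is queued once in cached_write_buffer_ids and the write itself is only recorded, to be applied when FlushCachedWrites runs later in this file. The following standalone sketch shows that pattern in isolation; it is not code from this patch, and names such as FakeBuffer and FakeCache are purely illustrative.

    #include <cstdint>
    #include <unordered_map>
    #include <utility>
    #include <vector>

    // Minimal stand-in for a cached buffer; only the parts needed for the pattern.
    struct FakeBuffer {
        bool has_cached_writes = false;
        std::vector<std::pair<std::uintptr_t, std::uint64_t>> pending; // (addr, size)

        void CachedCpuWrite(std::uintptr_t addr, std::uint64_t size) {
            has_cached_writes = true;
            pending.emplace_back(addr, size); // remember the range, apply it later
        }
        void FlushCachedWrites() {
            // A real buffer would mark these ranges as CPU-modified here.
            pending.clear();
            has_cached_writes = false;
        }
    };

    struct FakeCache {
        std::unordered_map<int, FakeBuffer> buffers; // id -> buffer
        std::vector<int> cached_write_buffer_ids;    // each buffer queued at most once

        void CachedWriteMemory(int id, std::uintptr_t addr, std::uint64_t size) {
            FakeBuffer& buffer = buffers[id];
            if (!buffer.has_cached_writes) {
                cached_write_buffer_ids.push_back(id); // first cached write queues the buffer
            }
            buffer.CachedCpuWrite(addr, size);
        }
        void FlushCachedWrites() {
            for (const int id : cached_write_buffer_ids) {
                buffers[id].FlushCachedWrites();
            }
            cached_write_buffer_ids.clear();
        }
    };

    int main() {
        FakeCache cache;
        cache.CachedWriteMemory(7, 0x1000, 32);
        cache.CachedWriteMemory(7, 0x1040, 16); // same buffer, queued only once
        cache.FlushCachedWrites();
    }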
| 351 | template <class P> | ||
| 352 | void BufferCache<P>::DownloadMemory(VAddr cpu_addr, u64 size) { | ||
| 353 | ForEachBufferInRange(cpu_addr, size, [&](BufferId, Buffer& buffer) { | ||
| 354 | boost::container::small_vector<BufferCopy, 1> copies; | ||
| 355 | u64 total_size_bytes = 0; | ||
| 356 | u64 largest_copy = 0; | ||
| 357 | buffer.ForEachDownloadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) { | ||
| 358 | copies.push_back(BufferCopy{ | ||
| 359 | .src_offset = range_offset, | ||
| 360 | .dst_offset = total_size_bytes, | ||
| 361 | .size = range_size, | ||
| 362 | }); | ||
| 363 | total_size_bytes += range_size; | ||
| 364 | largest_copy = std::max(largest_copy, range_size); | ||
| 365 | }); | ||
| 366 | if (total_size_bytes == 0) { | ||
| 367 | return; | ||
| 314 | } | 368 | } |
| 315 | const auto it = mapped_addresses.find(*map); | 369 | MICROPROFILE_SCOPE(GPU_DownloadMemory); |
| 316 | ASSERT(it != mapped_addresses.end()); | 370 | |
| 317 | mapped_addresses.erase(it); | 371 | if constexpr (USE_MEMORY_MAPS) { |
| 318 | mapped_addresses_allocator.Release(map); | 372 | auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes); |
| 319 | } | 373 | const u8* const mapped_memory = download_staging.mapped_span.data(); |
| 320 | 374 | const std::span<BufferCopy> copies_span(copies.data(), copies.data() + copies.size()); | |
| 321 | private: | 375 | for (BufferCopy& copy : copies) { |
| 322 | MapInterval* MapAddress(Buffer* block, GPUVAddr gpu_addr, VAddr cpu_addr, std::size_t size) { | 376 | // Adjust the copies to account for the staging buffer offset |
| 323 | const VectorMapInterval overlaps = GetMapsInRange(cpu_addr, size); | 377 | copy.dst_offset += download_staging.offset; |
| 324 | if (overlaps.empty()) { | ||
| 325 | const VAddr cpu_addr_end = cpu_addr + size; | ||
| 326 | if (gpu_memory.IsGranularRange(gpu_addr, size)) { | ||
| 327 | u8* const host_ptr = gpu_memory.GetPointer(gpu_addr); | ||
| 328 | block->Upload(block->Offset(cpu_addr), size, host_ptr); | ||
| 329 | } else { | ||
| 330 | staging_buffer.resize(size); | ||
| 331 | gpu_memory.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size); | ||
| 332 | block->Upload(block->Offset(cpu_addr), size, staging_buffer.data()); | ||
| 333 | } | 378 | } |
| 334 | return Register(MapInterval(cpu_addr, cpu_addr_end, gpu_addr)); | 379 | runtime.CopyBuffer(download_staging.buffer, buffer, copies_span); |
| 335 | } | 380 | runtime.Finish(); |
| 336 | 381 | for (const BufferCopy& copy : copies) { | |
| 337 | const VAddr cpu_addr_end = cpu_addr + size; | 382 | const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset; |
| 338 | if (overlaps.size() == 1) { | 383 | // Undo the modified offset |
| 339 | MapInterval* const current_map = overlaps[0]; | 384 | const u64 dst_offset = copy.dst_offset - download_staging.offset; |
| 340 | if (current_map->IsInside(cpu_addr, cpu_addr_end)) { | 385 | const u8* copy_mapped_memory = mapped_memory + dst_offset; |
| 341 | return current_map; | 386 | cpu_memory.WriteBlockUnsafe(copy_cpu_addr, copy_mapped_memory, copy.size); |
| 387 | } | ||
| 388 | } else { | ||
| 389 | const std::span<u8> immediate_buffer = ImmediateBuffer(largest_copy); | ||
| 390 | for (const BufferCopy& copy : copies) { | ||
| 391 | buffer.ImmediateDownload(copy.src_offset, immediate_buffer.subspan(0, copy.size)); | ||
| 392 | const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset; | ||
| 393 | cpu_memory.WriteBlockUnsafe(copy_cpu_addr, immediate_buffer.data(), copy.size); | ||
| 342 | } | 394 | } |
| 343 | } | 395 | } |
| 344 | VAddr new_start = cpu_addr; | 396 | }); |
| 345 | VAddr new_end = cpu_addr_end; | 397 | } |
| 346 | bool write_inheritance = false; | 398 | |
| 347 | bool modified_inheritance = false; | 399 | template <class P> |
| 348 | // Calculate new buffer parameters | 400 | void BufferCache<P>::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, |
| 349 | for (MapInterval* overlap : overlaps) { | 401 | u32 size) { |
| 350 | new_start = std::min(overlap->start, new_start); | 402 | const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); |
| 351 | new_end = std::max(overlap->end, new_end); | 403 | if (!cpu_addr) { |
| 352 | write_inheritance |= overlap->is_written; | 404 | uniform_buffers[stage][index] = NULL_BINDING; |
| 353 | modified_inheritance |= overlap->is_modified; | 405 | return; |
| 406 | } | ||
| 407 | const Binding binding{ | ||
| 408 | .cpu_addr = *cpu_addr, | ||
| 409 | .size = size, | ||
| 410 | .buffer_id = BufferId{}, | ||
| 411 | }; | ||
| 412 | uniform_buffers[stage][index] = binding; | ||
| 413 | } | ||
| 414 | |||
| 415 | template <class P> | ||
| 416 | void BufferCache<P>::UpdateGraphicsBuffers(bool is_indexed) { | ||
| 417 | MICROPROFILE_SCOPE(GPU_PrepareBuffers); | ||
| 418 | do { | ||
| 419 | has_deleted_buffers = false; | ||
| 420 | DoUpdateGraphicsBuffers(is_indexed); | ||
| 421 | } while (has_deleted_buffers); | ||
| 422 | } | ||
| 423 | |||
| 424 | template <class P> | ||
| 425 | void BufferCache<P>::UpdateComputeBuffers() { | ||
| 426 | MICROPROFILE_SCOPE(GPU_PrepareBuffers); | ||
| 427 | do { | ||
| 428 | has_deleted_buffers = false; | ||
| 429 | DoUpdateComputeBuffers(); | ||
| 430 | } while (has_deleted_buffers); | ||
| 431 | } | ||
| 432 | |||
| 433 | template <class P> | ||
| 434 | void BufferCache<P>::BindHostGeometryBuffers(bool is_indexed) { | ||
| 435 | MICROPROFILE_SCOPE(GPU_BindUploadBuffers); | ||
| 436 | if (is_indexed) { | ||
| 437 | BindHostIndexBuffer(); | ||
| 438 | } else if constexpr (!HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT) { | ||
| 439 | const auto& regs = maxwell3d.regs; | ||
| 440 | if (regs.draw.topology == Maxwell::PrimitiveTopology::Quads) { | ||
| 441 | runtime.BindQuadArrayIndexBuffer(regs.vertex_buffer.first, regs.vertex_buffer.count); | ||
| 354 | } | 442 | } |
| 355 | GPUVAddr new_gpu_addr = gpu_addr + new_start - cpu_addr; | 443 | } |
| 356 | for (auto& overlap : overlaps) { | 444 | BindHostVertexBuffers(); |
| 357 | Unregister(overlap); | 445 | BindHostTransformFeedbackBuffers(); |
| 446 | } | ||
| 447 | |||
| 448 | template <class P> | ||
| 449 | void BufferCache<P>::BindHostStageBuffers(size_t stage) { | ||
| 450 | MICROPROFILE_SCOPE(GPU_BindUploadBuffers); | ||
| 451 | BindHostGraphicsUniformBuffers(stage); | ||
| 452 | BindHostGraphicsStorageBuffers(stage); | ||
| 453 | } | ||
| 454 | |||
| 455 | template <class P> | ||
| 456 | void BufferCache<P>::BindHostComputeBuffers() { | ||
| 457 | MICROPROFILE_SCOPE(GPU_BindUploadBuffers); | ||
| 458 | BindHostComputeUniformBuffers(); | ||
| 459 | BindHostComputeStorageBuffers(); | ||
| 460 | } | ||
| 461 | |||
| 462 | template <class P> | ||
| 463 | void BufferCache<P>::SetEnabledUniformBuffers(size_t stage, u32 enabled) { | ||
| 464 | if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) { | ||
| 465 | if (enabled_uniform_buffers[stage] != enabled) { | ||
| 466 | dirty_uniform_buffers[stage] = ~u32{0}; | ||
| 358 | } | 467 | } |
| 359 | UpdateBlock(block, new_start, new_end, overlaps); | 468 | } |
| 360 | 469 | enabled_uniform_buffers[stage] = enabled; | |
| 361 | const MapInterval new_map{new_start, new_end, new_gpu_addr}; | 470 | } |
| 362 | MapInterval* const map = Register(new_map, write_inheritance); | 471 | |
| 363 | if (!map) { | 472 | template <class P> |
| 364 | return nullptr; | 473 | void BufferCache<P>::SetEnabledComputeUniformBuffers(u32 enabled) { |
| 474 | enabled_compute_uniform_buffers = enabled; | ||
| 475 | } | ||
| 476 | |||
| 477 | template <class P> | ||
| 478 | void BufferCache<P>::UnbindGraphicsStorageBuffers(size_t stage) { | ||
| 479 | enabled_storage_buffers[stage] = 0; | ||
| 480 | written_storage_buffers[stage] = 0; | ||
| 481 | } | ||
| 482 | |||
| 483 | template <class P> | ||
| 484 | void BufferCache<P>::BindGraphicsStorageBuffer(size_t stage, size_t ssbo_index, u32 cbuf_index, | ||
| 485 | u32 cbuf_offset, bool is_written) { | ||
| 486 | enabled_storage_buffers[stage] |= 1U << ssbo_index; | ||
| 487 | written_storage_buffers[stage] |= (is_written ? 1U : 0U) << ssbo_index; | ||
| 488 | |||
| 489 | const auto& cbufs = maxwell3d.state.shader_stages[stage]; | ||
| 490 | const GPUVAddr ssbo_addr = cbufs.const_buffers[cbuf_index].address + cbuf_offset; | ||
| 491 | storage_buffers[stage][ssbo_index] = StorageBufferBinding(ssbo_addr); | ||
| 492 | } | ||
| 493 | |||
| 494 | template <class P> | ||
| 495 | void BufferCache<P>::UnbindComputeStorageBuffers() { | ||
| 496 | enabled_compute_storage_buffers = 0; | ||
| 497 | written_compute_storage_buffers = 0; | ||
| 498 | } | ||
| 499 | |||
| 500 | template <class P> | ||
| 501 | void BufferCache<P>::BindComputeStorageBuffer(size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset, | ||
| 502 | bool is_written) { | ||
| 503 | enabled_compute_storage_buffers |= 1U << ssbo_index; | ||
| 504 | written_compute_storage_buffers |= (is_written ? 1U : 0U) << ssbo_index; | ||
| 505 | |||
| 506 | const auto& launch_desc = kepler_compute.launch_description; | ||
| 507 | ASSERT(((launch_desc.const_buffer_enable_mask >> cbuf_index) & 1) != 0); | ||
| 508 | |||
| 509 | const auto& cbufs = launch_desc.const_buffer_config; | ||
| 510 | const GPUVAddr ssbo_addr = cbufs[cbuf_index].Address() + cbuf_offset; | ||
| 511 | compute_storage_buffers[ssbo_index] = StorageBufferBinding(ssbo_addr); | ||
| 512 | } | ||
| 513 | |||
| 514 | template <class P> | ||
| 515 | void BufferCache<P>::FlushCachedWrites() { | ||
| 516 | for (const BufferId buffer_id : cached_write_buffer_ids) { | ||
| 517 | slot_buffers[buffer_id].FlushCachedWrites(); | ||
| 518 | } | ||
| 519 | cached_write_buffer_ids.clear(); | ||
| 520 | } | ||
| 521 | |||
| 522 | template <class P> | ||
| 523 | bool BufferCache<P>::HasUncommittedFlushes() const noexcept { | ||
| 524 | return !uncommitted_downloads.empty(); | ||
| 525 | } | ||
| 526 | |||
| 527 | template <class P> | ||
| 528 | bool BufferCache<P>::ShouldWaitAsyncFlushes() const noexcept { | ||
| 529 | return !committed_downloads.empty() && !committed_downloads.front().empty(); | ||
| 530 | } | ||
| 531 | |||
| 532 | template <class P> | ||
| 533 | void BufferCache<P>::CommitAsyncFlushes() { | ||
| 534 | // The value is intentionally passed by copy | ||
| 535 | committed_downloads.push_front(uncommitted_downloads); | ||
| 536 | uncommitted_downloads.clear(); | ||
| 537 | } | ||
| 538 | |||
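CommitAsyncFlushes and PopAsyncFlushes above manage pending downloads as batches: committing copies the uncommitted ids into the front of a deque, and popping consumes the oldest batch from the back. Below is a minimal standalone sketch of that queue discipline; the names are illustrative and the actual GPU download is replaced by a print.

    #include <cstdio>
    #include <deque>
    #include <vector>

    using BufferId = int;

    struct AsyncDownloadQueue {
        std::vector<BufferId> uncommitted;
        std::deque<std::vector<BufferId>> committed;

        void Commit() {
            committed.push_front(uncommitted); // newest batch at the front (copied on purpose)
            uncommitted.clear();
        }
        void Pop() {
            if (committed.empty()) {
                return;
            }
            const std::vector<BufferId> batch = committed.back(); // oldest batch
            committed.pop_back();
            for (const BufferId id : batch) {
                std::printf("download buffer %d\n", id);
            }
        }
    };

    int main() {
        AsyncDownloadQueue queue;
        queue.uncommitted = {1, 2};
        queue.Commit(); // batch {1, 2} becomes the oldest committed entry
        queue.uncommitted = {3};
        queue.Commit();
        queue.Pop();    // processes {1, 2} first
    }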
| 539 | template <class P> | ||
| 540 | void BufferCache<P>::PopAsyncFlushes() { | ||
| 541 | if (committed_downloads.empty()) { | ||
| 542 | return; | ||
| 543 | } | ||
| 544 | auto scope_exit_pop_download = detail::ScopeExit([this] { committed_downloads.pop_back(); }); | ||
| 545 | const std::span<const BufferId> download_ids = committed_downloads.back(); | ||
| 546 | if (download_ids.empty()) { | ||
| 547 | return; | ||
| 548 | } | ||
| 549 | MICROPROFILE_SCOPE(GPU_DownloadMemory); | ||
| 550 | |||
| 551 | boost::container::small_vector<std::pair<BufferCopy, BufferId>, 1> downloads; | ||
| 552 | u64 total_size_bytes = 0; | ||
| 553 | u64 largest_copy = 0; | ||
| 554 | for (const BufferId buffer_id : download_ids) { | ||
| 555 | slot_buffers[buffer_id].ForEachDownloadRange([&](u64 range_offset, u64 range_size) { | ||
| 556 | downloads.push_back({ | ||
| 557 | BufferCopy{ | ||
| 558 | .src_offset = range_offset, | ||
| 559 | .dst_offset = total_size_bytes, | ||
| 560 | .size = range_size, | ||
| 561 | }, | ||
| 562 | buffer_id, | ||
| 563 | }); | ||
| 564 | total_size_bytes += range_size; | ||
| 565 | largest_copy = std::max(largest_copy, range_size); | ||
| 566 | }); | ||
| 567 | } | ||
| 568 | if (downloads.empty()) { | ||
| 569 | return; | ||
| 570 | } | ||
| 571 | if constexpr (USE_MEMORY_MAPS) { | ||
| 572 | auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes); | ||
| 573 | for (auto& [copy, buffer_id] : downloads) { | ||
| 574 | // Account for the staging buffer offset in this copy | ||
| 575 | copy.dst_offset += download_staging.offset; | ||
| 576 | const std::array copies{copy}; | ||
| 577 | runtime.CopyBuffer(download_staging.buffer, slot_buffers[buffer_id], copies); | ||
| 365 | } | 578 | } |
| 366 | if (modified_inheritance) { | 579 | runtime.Finish(); |
| 367 | map->MarkAsModified(true, GetModifiedTicks()); | 580 | for (const auto [copy, buffer_id] : downloads) { |
| 368 | if (Settings::IsGPULevelHigh() && | 581 | const Buffer& buffer = slot_buffers[buffer_id]; |
| 369 | Settings::values.use_asynchronous_gpu_emulation.GetValue()) { | 582 | const VAddr cpu_addr = buffer.CpuAddr() + copy.src_offset; |
| 370 | MarkForAsyncFlush(map); | 583 | // Undo the modified offset |
| 371 | } | 584 | const u64 dst_offset = copy.dst_offset - download_staging.offset; |
| 585 | const u8* read_mapped_memory = download_staging.mapped_span.data() + dst_offset; | ||
| 586 | cpu_memory.WriteBlockUnsafe(cpu_addr, read_mapped_memory, copy.size); | ||
| 587 | } | ||
| 588 | } else { | ||
| 589 | const std::span<u8> immediate_buffer = ImmediateBuffer(largest_copy); | ||
| 590 | for (const auto [copy, buffer_id] : downloads) { | ||
| 591 | Buffer& buffer = slot_buffers[buffer_id]; | ||
| 592 | buffer.ImmediateDownload(copy.src_offset, immediate_buffer.subspan(0, copy.size)); | ||
| 593 | const VAddr cpu_addr = buffer.CpuAddr() + copy.src_offset; | ||
| 594 | cpu_memory.WriteBlockUnsafe(cpu_addr, immediate_buffer.data(), copy.size); | ||
| 372 | } | 595 | } |
| 373 | return map; | ||
| 374 | } | 596 | } |
| 375 | 597 | } | |
| 376 | void UpdateBlock(Buffer* block, VAddr start, VAddr end, const VectorMapInterval& overlaps) { | 598 | |
| 377 | const IntervalType base_interval{start, end}; | 599 | template <class P> |
| 378 | IntervalSet interval_set{}; | 600 | bool BufferCache<P>::IsRegionGpuModified(VAddr addr, size_t size) { |
| 379 | interval_set.add(base_interval); | 601 | const u64 page_end = Common::DivCeil(addr + size, PAGE_SIZE); |
| 380 | for (auto& overlap : overlaps) { | 602 | for (u64 page = addr >> PAGE_BITS; page < page_end;) { |
| 381 | const IntervalType subtract{overlap->start, overlap->end}; | 603 | const BufferId image_id = page_table[page]; |
| 382 | interval_set.subtract(subtract); | 604 | if (!image_id) { |
| 605 | ++page; | ||
| 606 | continue; | ||
| 383 | } | 607 | } |
| 384 | for (auto& interval : interval_set) { | 608 | Buffer& buffer = slot_buffers[image_id]; |
| 385 | const std::size_t size = interval.upper() - interval.lower(); | 609 | if (buffer.IsRegionGpuModified(addr, size)) { |
| 386 | if (size == 0) { | 610 | return true; |
| 387 | continue; | ||
| 388 | } | ||
| 389 | staging_buffer.resize(size); | ||
| 390 | cpu_memory.ReadBlockUnsafe(interval.lower(), staging_buffer.data(), size); | ||
| 391 | block->Upload(block->Offset(interval.lower()), size, staging_buffer.data()); | ||
| 392 | } | 611 | } |
| 612 | const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes(); | ||
| 613 | page = Common::DivCeil(end_addr, PAGE_SIZE); | ||
| 393 | } | 614 | } |
| 394 | 615 | return false; | |
| 395 | VectorMapInterval GetMapsInRange(VAddr addr, std::size_t size) { | 616 | } |
| 396 | VectorMapInterval result; | 617 | |
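IsRegionGpuModified above walks the flat page table and, once it lands on a buffer, jumps directly past that buffer's last page instead of re-testing every page it covers. A standalone sketch of that walk follows; the toy page table, the single dirty flag standing in for per-page GPU-modified tracking, and the page size are all illustrative, not the patch's types.

    #include <array>
    #include <cstdint>

    constexpr std::uint64_t PAGE_BITS = 16; // arbitrary page size for the sketch
    constexpr std::uint64_t PAGE_SIZE = 1ULL << PAGE_BITS;

    constexpr std::uint64_t DivCeil(std::uint64_t num, std::uint64_t den) {
        return (num + den - 1) / den;
    }

    // Toy buffer record: base address, size and one dirty flag.
    struct ToyBuffer {
        std::uint64_t cpu_addr = 0;
        std::uint64_t size_bytes = 0;
        bool gpu_modified = false;
    };

    // page -> index into a buffer array; 0 means "no buffer on this page".
    std::array<std::uint32_t, 1024> page_table{};
    std::array<ToyBuffer, 16> buffers{};

    bool IsRegionGpuModified(std::uint64_t addr, std::uint64_t size) {
        const std::uint64_t page_end = DivCeil(addr + size, PAGE_SIZE);
        for (std::uint64_t page = addr >> PAGE_BITS; page < page_end;) {
            const std::uint32_t id = page_table[page];
            if (id == 0) {
                ++page; // empty page: advance one page at a time
                continue;
            }
            const ToyBuffer& buffer = buffers[id];
            if (buffer.gpu_modified) {
                return true;
            }
            // Skip every page covered by this buffer instead of re-testing it.
            page = DivCeil(buffer.cpu_addr + buffer.size_bytes, PAGE_SIZE);
        }
        return false;
    }

    int main() {
        buffers[1] = ToyBuffer{.cpu_addr = 0x20000, .size_bytes = 0x30000, .gpu_modified = true};
        for (std::uint64_t page = 2; page < 5; ++page) {
            page_table[page] = 1;
        }
        return IsRegionGpuModified(0x25000, 0x100) ? 0 : 1;
    }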
| 397 | if (size == 0) { | 618 | template <class P> |
| 398 | return result; | 619 | void BufferCache<P>::BindHostIndexBuffer() { |
| 620 | Buffer& buffer = slot_buffers[index_buffer.buffer_id]; | ||
| 621 | const u32 offset = buffer.Offset(index_buffer.cpu_addr); | ||
| 622 | const u32 size = index_buffer.size; | ||
| 623 | SynchronizeBuffer(buffer, index_buffer.cpu_addr, size); | ||
| 624 | if constexpr (HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT) { | ||
| 625 | runtime.BindIndexBuffer(buffer, offset, size); | ||
| 626 | } else { | ||
| 627 | runtime.BindIndexBuffer(maxwell3d.regs.draw.topology, maxwell3d.regs.index_array.format, | ||
| 628 | maxwell3d.regs.index_array.first, maxwell3d.regs.index_array.count, | ||
| 629 | buffer, offset, size); | ||
| 630 | } | ||
| 631 | } | ||
| 632 | |||
| 633 | template <class P> | ||
| 634 | void BufferCache<P>::BindHostVertexBuffers() { | ||
| 635 | auto& flags = maxwell3d.dirty.flags; | ||
| 636 | for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) { | ||
| 637 | const Binding& binding = vertex_buffers[index]; | ||
| 638 | Buffer& buffer = slot_buffers[binding.buffer_id]; | ||
| 639 | SynchronizeBuffer(buffer, binding.cpu_addr, binding.size); | ||
| 640 | if (!flags[Dirty::VertexBuffer0 + index]) { | ||
| 641 | continue; | ||
| 399 | } | 642 | } |
| 643 | flags[Dirty::VertexBuffer0 + index] = false; | ||
| 400 | 644 | ||
| 401 | const VAddr addr_end = addr + size; | 645 | const u32 stride = maxwell3d.regs.vertex_array[index].stride; |
| 402 | auto it = mapped_addresses.lower_bound(addr); | 646 | const u32 offset = buffer.Offset(binding.cpu_addr); |
| 403 | if (it != mapped_addresses.begin()) { | 647 | runtime.BindVertexBuffer(index, buffer, offset, binding.size, stride); |
| 404 | --it; | 648 | } |
| 649 | } | ||
| 650 | |||
| 651 | template <class P> | ||
| 652 | void BufferCache<P>::BindHostGraphicsUniformBuffers(size_t stage) { | ||
| 653 | u32 dirty = ~0U; | ||
| 654 | if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) { | ||
| 655 | dirty = std::exchange(dirty_uniform_buffers[stage], 0); | ||
| 656 | } | ||
| 657 | u32 binding_index = 0; | ||
| 658 | ForEachEnabledBit(enabled_uniform_buffers[stage], [&](u32 index) { | ||
| 659 | const bool needs_bind = ((dirty >> index) & 1) != 0; | ||
| 660 | BindHostGraphicsUniformBuffer(stage, index, binding_index, needs_bind); | ||
| 661 | if constexpr (NEEDS_BIND_UNIFORM_INDEX) { | ||
| 662 | ++binding_index; | ||
| 405 | } | 663 | } |
| 406 | while (it != mapped_addresses.end() && it->start < addr_end) { | 664 | }); |
| 407 | if (it->Overlaps(addr, addr_end)) { | 665 | } |
| 408 | result.push_back(&*it); | 666 | |
| 667 | template <class P> | ||
| 668 | void BufferCache<P>::BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 binding_index, | ||
| 669 | bool needs_bind) { | ||
| 670 | const Binding& binding = uniform_buffers[stage][index]; | ||
| 671 | const VAddr cpu_addr = binding.cpu_addr; | ||
| 672 | const u32 size = binding.size; | ||
| 673 | Buffer& buffer = slot_buffers[binding.buffer_id]; | ||
| 674 | if (size <= SKIP_CACHE_SIZE && !buffer.IsRegionGpuModified(cpu_addr, size)) { | ||
| 675 | if constexpr (IS_OPENGL) { | ||
| 676 | if (runtime.HasFastBufferSubData()) { | ||
| 677 | // Fast path for Nvidia | ||
| 678 | if (!HasFastUniformBufferBound(stage, binding_index)) { | ||
| 679 | // We only have to bind when the currently bound buffer is not the fast version | ||
| 680 | runtime.BindFastUniformBuffer(stage, binding_index, size); | ||
| 681 | } | ||
| 682 | const auto span = ImmediateBufferWithData(cpu_addr, size); | ||
| 683 | runtime.PushFastUniformBuffer(stage, binding_index, span); | ||
| 684 | return; | ||
| 409 | } | 685 | } |
| 410 | ++it; | ||
| 411 | } | 686 | } |
| 412 | return result; | 687 | fast_bound_uniform_buffers[stage] |= 1U << binding_index; |
| 413 | } | ||
| 414 | 688 | ||
| 415 | /// Returns a ticks counter used for tracking when cached objects were last modified | 689 | // Stream buffer path to avoid stalling on non-Nvidia drivers or Vulkan |
| 416 | u64 GetModifiedTicks() { | 690 | const std::span<u8> span = runtime.BindMappedUniformBuffer(stage, binding_index, size); |
| 417 | return ++modified_ticks; | 691 | cpu_memory.ReadBlockUnsafe(cpu_addr, span.data(), size); |
| 692 | return; | ||
| 418 | } | 693 | } |
| 419 | 694 | // Classic cached path | |
| 420 | void FlushMap(MapInterval* map) { | 695 | SynchronizeBuffer(buffer, cpu_addr, size); |
| 421 | const auto it = blocks.find(map->start >> BLOCK_PAGE_BITS); | 696 | if (!needs_bind && !HasFastUniformBufferBound(stage, binding_index)) { |
| 422 | ASSERT_OR_EXECUTE(it != blocks.end(), return;); | 697 | // Skip binding if it's not needed and if the bound buffer is not the fast version |
| 423 | 698 | // This exists to avoid instances where the fast buffer is bound and a GPU write happens | |
| 424 | std::shared_ptr<Buffer> block = it->second; | 699 | return; |
| 425 | |||
| 426 | const std::size_t size = map->end - map->start; | ||
| 427 | staging_buffer.resize(size); | ||
| 428 | block->Download(block->Offset(map->start), size, staging_buffer.data()); | ||
| 429 | cpu_memory.WriteBlockUnsafe(map->start, staging_buffer.data(), size); | ||
| 430 | map->MarkAsModified(false, 0); | ||
| 431 | } | 700 | } |
| 701 | fast_bound_uniform_buffers[stage] &= ~(1U << binding_index); | ||
| 432 | 702 | ||
| 433 | template <typename Callable> | 703 | const u32 offset = buffer.Offset(cpu_addr); |
| 434 | BufferInfo StreamBufferUpload(std::size_t size, std::size_t alignment, Callable&& callable) { | 704 | if constexpr (NEEDS_BIND_UNIFORM_INDEX) { |
| 435 | AlignBuffer(alignment); | 705 | runtime.BindUniformBuffer(stage, binding_index, buffer, offset, size); |
| 436 | const std::size_t uploaded_offset = buffer_offset; | 706 | } else { |
| 437 | callable(buffer_ptr); | 707 | runtime.BindUniformBuffer(buffer, offset, size); |
| 438 | |||
| 439 | buffer_ptr += size; | ||
| 440 | buffer_offset += size; | ||
| 441 | return BufferInfo{stream_buffer.Handle(), uploaded_offset, stream_buffer.Address()}; | ||
| 442 | } | 708 | } |
| 709 | } | ||
| 710 | |||
| 711 | template <class P> | ||
| 712 | void BufferCache<P>::BindHostGraphicsStorageBuffers(size_t stage) { | ||
| 713 | u32 binding_index = 0; | ||
| 714 | ForEachEnabledBit(enabled_storage_buffers[stage], [&](u32 index) { | ||
| 715 | const Binding& binding = storage_buffers[stage][index]; | ||
| 716 | Buffer& buffer = slot_buffers[binding.buffer_id]; | ||
| 717 | const u32 size = binding.size; | ||
| 718 | SynchronizeBuffer(buffer, binding.cpu_addr, size); | ||
| 719 | |||
| 720 | const u32 offset = buffer.Offset(binding.cpu_addr); | ||
| 721 | const bool is_written = ((written_storage_buffers[stage] >> index) & 1) != 0; | ||
| 722 | if constexpr (NEEDS_BIND_STORAGE_INDEX) { | ||
| 723 | runtime.BindStorageBuffer(stage, binding_index, buffer, offset, size, is_written); | ||
| 724 | ++binding_index; | ||
| 725 | } else { | ||
| 726 | runtime.BindStorageBuffer(buffer, offset, size, is_written); | ||
| 727 | } | ||
| 728 | }); | ||
| 729 | } | ||
| 443 | 730 | ||
| 444 | void AlignBuffer(std::size_t alignment) { | 731 | template <class P> |
| 445 | // Align the offset, not the mapped pointer | 732 | void BufferCache<P>::BindHostTransformFeedbackBuffers() { |
| 446 | const std::size_t offset_aligned = Common::AlignUp(buffer_offset, alignment); | 733 | if (maxwell3d.regs.tfb_enabled == 0) { |
| 447 | buffer_ptr += offset_aligned - buffer_offset; | 734 | return; |
| 448 | buffer_offset = offset_aligned; | ||
| 449 | } | 735 | } |
| 736 | for (u32 index = 0; index < NUM_TRANSFORM_FEEDBACK_BUFFERS; ++index) { | ||
| 737 | const Binding& binding = transform_feedback_buffers[index]; | ||
| 738 | Buffer& buffer = slot_buffers[binding.buffer_id]; | ||
| 739 | const u32 size = binding.size; | ||
| 740 | SynchronizeBuffer(buffer, binding.cpu_addr, size); | ||
| 741 | |||
| 742 | const u32 offset = buffer.Offset(binding.cpu_addr); | ||
| 743 | runtime.BindTransformFeedbackBuffer(index, buffer, offset, size); | ||
| 744 | } | ||
| 745 | } | ||
| 450 | 746 | ||
| 451 | std::shared_ptr<Buffer> EnlargeBlock(std::shared_ptr<Buffer> buffer) { | 747 | template <class P> |
| 452 | const std::size_t old_size = buffer->Size(); | 748 | void BufferCache<P>::BindHostComputeUniformBuffers() { |
| 453 | const std::size_t new_size = old_size + BLOCK_PAGE_SIZE; | 749 | if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) { |
| 454 | const VAddr cpu_addr = buffer->CpuAddr(); | 750 | // Mark all uniform buffers as dirty |
| 455 | std::shared_ptr<Buffer> new_buffer = CreateBlock(cpu_addr, new_size); | 751 | dirty_uniform_buffers.fill(~u32{0}); |
| 456 | new_buffer->CopyFrom(*buffer, 0, 0, old_size); | 752 | } |
| 457 | QueueDestruction(std::move(buffer)); | 753 | u32 binding_index = 0; |
| 458 | 754 | ForEachEnabledBit(enabled_compute_uniform_buffers, [&](u32 index) { | |
| 459 | const VAddr cpu_addr_end = cpu_addr + new_size - 1; | 755 | const Binding& binding = compute_uniform_buffers[index]; |
| 460 | const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS; | 756 | Buffer& buffer = slot_buffers[binding.buffer_id]; |
| 461 | for (u64 page_start = cpu_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) { | 757 | const u32 size = binding.size; |
| 462 | blocks.insert_or_assign(page_start, new_buffer); | 758 | SynchronizeBuffer(buffer, binding.cpu_addr, size); |
| 759 | |||
| 760 | const u32 offset = buffer.Offset(binding.cpu_addr); | ||
| 761 | if constexpr (NEEDS_BIND_UNIFORM_INDEX) { | ||
| 762 | runtime.BindComputeUniformBuffer(binding_index, buffer, offset, size); | ||
| 763 | ++binding_index; | ||
| 764 | } else { | ||
| 765 | runtime.BindUniformBuffer(buffer, offset, size); | ||
| 463 | } | 766 | } |
| 767 | }); | ||
| 768 | } | ||
| 769 | |||
| 770 | template <class P> | ||
| 771 | void BufferCache<P>::BindHostComputeStorageBuffers() { | ||
| 772 | u32 binding_index = 0; | ||
| 773 | ForEachEnabledBit(enabled_compute_storage_buffers, [&](u32 index) { | ||
| 774 | const Binding& binding = compute_storage_buffers[index]; | ||
| 775 | Buffer& buffer = slot_buffers[binding.buffer_id]; | ||
| 776 | const u32 size = binding.size; | ||
| 777 | SynchronizeBuffer(buffer, binding.cpu_addr, size); | ||
| 778 | |||
| 779 | const u32 offset = buffer.Offset(binding.cpu_addr); | ||
| 780 | const bool is_written = ((written_compute_storage_buffers >> index) & 1) != 0; | ||
| 781 | if constexpr (NEEDS_BIND_STORAGE_INDEX) { | ||
| 782 | runtime.BindComputeStorageBuffer(binding_index, buffer, offset, size, is_written); | ||
| 783 | ++binding_index; | ||
| 784 | } else { | ||
| 785 | runtime.BindStorageBuffer(buffer, offset, size, is_written); | ||
| 786 | } | ||
| 787 | }); | ||
| 788 | } | ||
| 464 | 789 | ||
| 465 | return new_buffer; | 790 | template <class P> |
| 791 | void BufferCache<P>::DoUpdateGraphicsBuffers(bool is_indexed) { | ||
| 792 | if (is_indexed) { | ||
| 793 | UpdateIndexBuffer(); | ||
| 466 | } | 794 | } |
| 795 | UpdateVertexBuffers(); | ||
| 796 | UpdateTransformFeedbackBuffers(); | ||
| 797 | for (size_t stage = 0; stage < NUM_STAGES; ++stage) { | ||
| 798 | UpdateUniformBuffers(stage); | ||
| 799 | UpdateStorageBuffers(stage); | ||
| 800 | } | ||
| 801 | } | ||
| 802 | |||
| 803 | template <class P> | ||
| 804 | void BufferCache<P>::DoUpdateComputeBuffers() { | ||
| 805 | UpdateComputeUniformBuffers(); | ||
| 806 | UpdateComputeStorageBuffers(); | ||
| 807 | } | ||
| 808 | |||
| 809 | template <class P> | ||
| 810 | void BufferCache<P>::UpdateIndexBuffer() { | ||
| 811 | // We have to check for the dirty flags and index count | ||
| 812 | // The index count is currently changed without updating the dirty flags | ||
| 813 | const auto& index_array = maxwell3d.regs.index_array; | ||
| 814 | auto& flags = maxwell3d.dirty.flags; | ||
| 815 | if (!flags[Dirty::IndexBuffer] && last_index_count == index_array.count) { | ||
| 816 | return; | ||
| 817 | } | ||
| 818 | flags[Dirty::IndexBuffer] = false; | ||
| 819 | last_index_count = index_array.count; | ||
| 820 | |||
| 821 | const GPUVAddr gpu_addr_begin = index_array.StartAddress(); | ||
| 822 | const GPUVAddr gpu_addr_end = index_array.EndAddress(); | ||
| 823 | const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr_begin); | ||
| 824 | const u32 address_size = static_cast<u32>(gpu_addr_end - gpu_addr_begin); | ||
| 825 | const u32 draw_size = index_array.count * index_array.FormatSizeInBytes(); | ||
| 826 | const u32 size = std::min(address_size, draw_size); | ||
| 827 | if (size == 0 || !cpu_addr) { | ||
| 828 | index_buffer = NULL_BINDING; | ||
| 829 | return; | ||
| 830 | } | ||
| 831 | index_buffer = Binding{ | ||
| 832 | .cpu_addr = *cpu_addr, | ||
| 833 | .size = size, | ||
| 834 | .buffer_id = FindBuffer(*cpu_addr, size), | ||
| 835 | }; | ||
| 836 | } | ||
| 467 | 837 | ||
| 468 | std::shared_ptr<Buffer> MergeBlocks(std::shared_ptr<Buffer> first, | 838 | template <class P> |
| 469 | std::shared_ptr<Buffer> second) { | 839 | void BufferCache<P>::UpdateVertexBuffers() { |
| 470 | const std::size_t size_1 = first->Size(); | 840 | auto& flags = maxwell3d.dirty.flags; |
| 471 | const std::size_t size_2 = second->Size(); | 841 | if (!maxwell3d.dirty.flags[Dirty::VertexBuffers]) { |
| 472 | const VAddr first_addr = first->CpuAddr(); | 842 | return; |
| 473 | const VAddr second_addr = second->CpuAddr(); | 843 | } |
| 474 | const VAddr new_addr = std::min(first_addr, second_addr); | 844 | flags[Dirty::VertexBuffers] = false; |
| 475 | const std::size_t new_size = size_1 + size_2; | ||
| 476 | |||
| 477 | std::shared_ptr<Buffer> new_buffer = CreateBlock(new_addr, new_size); | ||
| 478 | new_buffer->CopyFrom(*first, 0, new_buffer->Offset(first_addr), size_1); | ||
| 479 | new_buffer->CopyFrom(*second, 0, new_buffer->Offset(second_addr), size_2); | ||
| 480 | QueueDestruction(std::move(first)); | ||
| 481 | QueueDestruction(std::move(second)); | ||
| 482 | 845 | ||
| 483 | const VAddr cpu_addr_end = new_addr + new_size - 1; | 846 | for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) { |
| 484 | const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS; | 847 | UpdateVertexBuffer(index); |
| 485 | for (u64 page_start = new_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) { | ||
| 486 | blocks.insert_or_assign(page_start, new_buffer); | ||
| 487 | } | ||
| 488 | return new_buffer; | ||
| 489 | } | 848 | } |
| 849 | } | ||
| 490 | 850 | ||
| 491 | Buffer* GetBlock(VAddr cpu_addr, std::size_t size) { | 851 | template <class P> |
| 492 | std::shared_ptr<Buffer> found; | 852 | void BufferCache<P>::UpdateVertexBuffer(u32 index) { |
| 853 | if (!maxwell3d.dirty.flags[Dirty::VertexBuffer0 + index]) { | ||
| 854 | return; | ||
| 855 | } | ||
| 856 | const auto& array = maxwell3d.regs.vertex_array[index]; | ||
| 857 | const auto& limit = maxwell3d.regs.vertex_array_limit[index]; | ||
| 858 | const GPUVAddr gpu_addr_begin = array.StartAddress(); | ||
| 859 | const GPUVAddr gpu_addr_end = limit.LimitAddress() + 1; | ||
| 860 | const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr_begin); | ||
| 861 | const u32 address_size = static_cast<u32>(gpu_addr_end - gpu_addr_begin); | ||
| 862 | const u32 size = address_size; // TODO: Analyze stride and number of vertices | ||
| 863 | if (array.enable == 0 || size == 0 || !cpu_addr) { | ||
| 864 | vertex_buffers[index] = NULL_BINDING; | ||
| 865 | return; | ||
| 866 | } | ||
| 867 | vertex_buffers[index] = Binding{ | ||
| 868 | .cpu_addr = *cpu_addr, | ||
| 869 | .size = size, | ||
| 870 | .buffer_id = FindBuffer(*cpu_addr, size), | ||
| 871 | }; | ||
| 872 | } | ||
| 873 | |||
| 874 | template <class P> | ||
| 875 | void BufferCache<P>::UpdateUniformBuffers(size_t stage) { | ||
| 876 | ForEachEnabledBit(enabled_uniform_buffers[stage], [&](u32 index) { | ||
| 877 | Binding& binding = uniform_buffers[stage][index]; | ||
| 878 | if (binding.buffer_id) { | ||
| 879 | // Already updated | ||
| 880 | return; | ||
| 881 | } | ||
| 882 | // Mark as dirty | ||
| 883 | if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) { | ||
| 884 | dirty_uniform_buffers[stage] |= 1U << index; | ||
| 885 | } | ||
| 886 | // Resolve buffer | ||
| 887 | binding.buffer_id = FindBuffer(binding.cpu_addr, binding.size); | ||
| 888 | }); | ||
| 889 | } | ||
| 890 | |||
| 891 | template <class P> | ||
| 892 | void BufferCache<P>::UpdateStorageBuffers(size_t stage) { | ||
| 893 | const u32 written_mask = written_storage_buffers[stage]; | ||
| 894 | ForEachEnabledBit(enabled_storage_buffers[stage], [&](u32 index) { | ||
| 895 | // Resolve buffer | ||
| 896 | Binding& binding = storage_buffers[stage][index]; | ||
| 897 | const BufferId buffer_id = FindBuffer(binding.cpu_addr, binding.size); | ||
| 898 | binding.buffer_id = buffer_id; | ||
| 899 | // Mark buffer as written if needed | ||
| 900 | if (((written_mask >> index) & 1) != 0) { | ||
| 901 | MarkWrittenBuffer(buffer_id, binding.cpu_addr, binding.size); | ||
| 902 | } | ||
| 903 | }); | ||
| 904 | } | ||
| 493 | 905 | ||
| 494 | const VAddr cpu_addr_end = cpu_addr + size - 1; | 906 | template <class P> |
| 495 | const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS; | 907 | void BufferCache<P>::UpdateTransformFeedbackBuffers() { |
| 496 | for (u64 page_start = cpu_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) { | 908 | if (maxwell3d.regs.tfb_enabled == 0) { |
| 497 | auto it = blocks.find(page_start); | 909 | return; |
| 498 | if (it == blocks.end()) { | 910 | } |
| 499 | if (found) { | 911 | for (u32 index = 0; index < NUM_TRANSFORM_FEEDBACK_BUFFERS; ++index) { |
| 500 | found = EnlargeBlock(found); | 912 | UpdateTransformFeedbackBuffer(index); |
| 501 | continue; | 913 | } |
| 502 | } | 914 | } |
| 503 | const VAddr start_addr = page_start << BLOCK_PAGE_BITS; | 915 | |
| 504 | found = CreateBlock(start_addr, BLOCK_PAGE_SIZE); | 916 | template <class P> |
| 505 | blocks.insert_or_assign(page_start, found); | 917 | void BufferCache<P>::UpdateTransformFeedbackBuffer(u32 index) { |
| 506 | continue; | 918 | const auto& binding = maxwell3d.regs.tfb_bindings[index]; |
| 507 | } | 919 | const GPUVAddr gpu_addr = binding.Address() + binding.buffer_offset; |
| 508 | if (!found) { | 920 | const u32 size = binding.buffer_size; |
| 509 | found = it->second; | 921 | const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); |
| 510 | continue; | 922 | if (binding.buffer_enable == 0 || size == 0 || !cpu_addr) { |
| 511 | } | 923 | transform_feedback_buffers[index] = NULL_BINDING; |
| 512 | if (found != it->second) { | 924 | return; |
| 513 | found = MergeBlocks(std::move(found), it->second); | 925 | } |
| 926 | const BufferId buffer_id = FindBuffer(*cpu_addr, size); | ||
| 927 | transform_feedback_buffers[index] = Binding{ | ||
| 928 | .cpu_addr = *cpu_addr, | ||
| 929 | .size = size, | ||
| 930 | .buffer_id = buffer_id, | ||
| 931 | }; | ||
| 932 | MarkWrittenBuffer(buffer_id, *cpu_addr, size); | ||
| 933 | } | ||
| 934 | |||
| 935 | template <class P> | ||
| 936 | void BufferCache<P>::UpdateComputeUniformBuffers() { | ||
| 937 | ForEachEnabledBit(enabled_compute_uniform_buffers, [&](u32 index) { | ||
| 938 | Binding& binding = compute_uniform_buffers[index]; | ||
| 939 | binding = NULL_BINDING; | ||
| 940 | const auto& launch_desc = kepler_compute.launch_description; | ||
| 941 | if (((launch_desc.const_buffer_enable_mask >> index) & 1) != 0) { | ||
| 942 | const auto& cbuf = launch_desc.const_buffer_config[index]; | ||
| 943 | const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(cbuf.Address()); | ||
| 944 | if (cpu_addr) { | ||
| 945 | binding.cpu_addr = *cpu_addr; | ||
| 946 | binding.size = cbuf.size; | ||
| 514 | } | 947 | } |
| 515 | } | 948 | } |
| 516 | return found.get(); | 949 | binding.buffer_id = FindBuffer(binding.cpu_addr, binding.size); |
| 950 | }); | ||
| 951 | } | ||
| 952 | |||
| 953 | template <class P> | ||
| 954 | void BufferCache<P>::UpdateComputeStorageBuffers() { | ||
| 955 | ForEachEnabledBit(enabled_compute_storage_buffers, [&](u32 index) { | ||
| 956 | // Resolve buffer | ||
| 957 | Binding& binding = compute_storage_buffers[index]; | ||
| 958 | const BufferId buffer_id = FindBuffer(binding.cpu_addr, binding.size); | ||
| 959 | binding.buffer_id = buffer_id; | ||
| 960 | // Mark as written if needed | ||
| 961 | if (((written_compute_storage_buffers >> index) & 1) != 0) { | ||
| 962 | MarkWrittenBuffer(buffer_id, binding.cpu_addr, binding.size); | ||
| 963 | } | ||
| 964 | }); | ||
| 965 | } | ||
| 966 | |||
| 967 | template <class P> | ||
| 968 | void BufferCache<P>::MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 size) { | ||
| 969 | Buffer& buffer = slot_buffers[buffer_id]; | ||
| 970 | buffer.MarkRegionAsGpuModified(cpu_addr, size); | ||
| 971 | |||
| 972 | const bool is_accuracy_high = Settings::IsGPULevelHigh(); | ||
| 973 | const bool is_async = Settings::values.use_asynchronous_gpu_emulation.GetValue(); | ||
| 974 | if (!is_accuracy_high || !is_async) { | ||
| 975 | return; | ||
| 976 | } | ||
| 977 | if (std::ranges::find(uncommitted_downloads, buffer_id) != uncommitted_downloads.end()) { | ||
| 978 | // Already inserted | ||
| 979 | return; | ||
| 517 | } | 980 | } |
| 981 | uncommitted_downloads.push_back(buffer_id); | ||
| 982 | } | ||
| 518 | 983 | ||
| 519 | void MarkRegionAsWritten(VAddr start, VAddr end) { | 984 | template <class P> |
| 520 | const u64 page_end = end >> WRITE_PAGE_BIT; | 985 | BufferId BufferCache<P>::FindBuffer(VAddr cpu_addr, u32 size) { |
| 521 | for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) { | 986 | if (cpu_addr == 0) { |
| 522 | if (const auto [it, inserted] = written_pages.emplace(page_start, 1); !inserted) { | 987 | return NULL_BUFFER_ID; |
| 523 | ++it->second; | 988 | } |
| 524 | } | 989 | const u64 page = cpu_addr >> PAGE_BITS; |
| 990 | const BufferId buffer_id = page_table[page]; | ||
| 991 | if (!buffer_id) { | ||
| 992 | return CreateBuffer(cpu_addr, size); | ||
| 993 | } | ||
| 994 | const Buffer& buffer = slot_buffers[buffer_id]; | ||
| 995 | if (buffer.IsInBounds(cpu_addr, size)) { | ||
| 996 | return buffer_id; | ||
| 997 | } | ||
| 998 | return CreateBuffer(cpu_addr, size); | ||
| 999 | } | ||
| 1000 | |||
| 1001 | template <class P> | ||
| 1002 | typename BufferCache<P>::OverlapResult BufferCache<P>::ResolveOverlaps(VAddr cpu_addr, | ||
| 1003 | u32 wanted_size) { | ||
| 1004 | static constexpr int STREAM_LEAP_THRESHOLD = 16; | ||
| 1005 | std::vector<BufferId> overlap_ids; | ||
| 1006 | VAddr begin = cpu_addr; | ||
| 1007 | VAddr end = cpu_addr + wanted_size; | ||
| 1008 | int stream_score = 0; | ||
| 1009 | bool has_stream_leap = false; | ||
| 1010 | for (; cpu_addr >> PAGE_BITS < Common::DivCeil(end, PAGE_SIZE); cpu_addr += PAGE_SIZE) { | ||
| 1011 | const BufferId overlap_id = page_table[cpu_addr >> PAGE_BITS]; | ||
| 1012 | if (!overlap_id) { | ||
| 1013 | continue; | ||
| 1014 | } | ||
| 1015 | Buffer& overlap = slot_buffers[overlap_id]; | ||
| 1016 | if (overlap.IsPicked()) { | ||
| 1017 | continue; | ||
| 1018 | } | ||
| 1019 | overlap_ids.push_back(overlap_id); | ||
| 1020 | overlap.Pick(); | ||
| 1021 | const VAddr overlap_cpu_addr = overlap.CpuAddr(); | ||
| 1022 | if (overlap_cpu_addr < begin) { | ||
| 1023 | cpu_addr = begin = overlap_cpu_addr; | ||
| 1024 | } | ||
| 1025 | end = std::max(end, overlap_cpu_addr + overlap.SizeBytes()); | ||
| 1026 | |||
| 1027 | stream_score += overlap.StreamScore(); | ||
| 1028 | if (stream_score > STREAM_LEAP_THRESHOLD && !has_stream_leap) { | ||
| 1029 | // When this memory region has been joined many times, we assume it's being used | ||
| 1030 | // as a stream buffer. Increase the size to skip constantly recreating buffers. | ||
| 1031 | has_stream_leap = true; | ||
| 1032 | end += PAGE_SIZE * 256; | ||
| 525 | } | 1033 | } |
| 526 | } | 1034 | } |
| 527 | 1035 | return OverlapResult{ | |
| 528 | void UnmarkRegionAsWritten(VAddr start, VAddr end) { | 1036 | .ids = std::move(overlap_ids), |
| 529 | const u64 page_end = end >> WRITE_PAGE_BIT; | 1037 | .begin = begin, |
| 530 | for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) { | 1038 | .end = end, |
| 531 | auto it = written_pages.find(page_start); | 1039 | .has_stream_leap = has_stream_leap, |
| 532 | if (it != written_pages.end()) { | 1040 | }; |
| 533 | if (it->second > 1) { | 1041 | } |
| 534 | --it->second; | 1042 | |
| 535 | } else { | 1043 | template <class P> |
| 536 | written_pages.erase(it); | 1044 | void BufferCache<P>::JoinOverlap(BufferId new_buffer_id, BufferId overlap_id, |
| 537 | } | 1045 | bool accumulate_stream_score) { |
| 538 | } | 1046 | Buffer& new_buffer = slot_buffers[new_buffer_id]; |
| 1047 | Buffer& overlap = slot_buffers[overlap_id]; | ||
| 1048 | if (accumulate_stream_score) { | ||
| 1049 | new_buffer.IncreaseStreamScore(overlap.StreamScore() + 1); | ||
| 1050 | } | ||
| 1051 | std::vector<BufferCopy> copies; | ||
| 1052 | const size_t dst_base_offset = overlap.CpuAddr() - new_buffer.CpuAddr(); | ||
| 1053 | overlap.ForEachDownloadRange([&](u64 begin, u64 range_size) { | ||
| 1054 | copies.push_back(BufferCopy{ | ||
| 1055 | .src_offset = begin, | ||
| 1056 | .dst_offset = dst_base_offset + begin, | ||
| 1057 | .size = range_size, | ||
| 1058 | }); | ||
| 1059 | new_buffer.UnmarkRegionAsCpuModified(begin, range_size); | ||
| 1060 | new_buffer.MarkRegionAsGpuModified(begin, range_size); | ||
| 1061 | }); | ||
| 1062 | if (!copies.empty()) { | ||
| 1063 | runtime.CopyBuffer(slot_buffers[new_buffer_id], overlap, copies); | ||
| 1064 | } | ||
| 1065 | ReplaceBufferDownloads(overlap_id, new_buffer_id); | ||
| 1066 | DeleteBuffer(overlap_id); | ||
| 1067 | } | ||
| 1068 | |||
| 1069 | template <class P> | ||
| 1070 | BufferId BufferCache<P>::CreateBuffer(VAddr cpu_addr, u32 wanted_size) { | ||
| 1071 | const OverlapResult overlap = ResolveOverlaps(cpu_addr, wanted_size); | ||
| 1072 | const u32 size = static_cast<u32>(overlap.end - overlap.begin); | ||
| 1073 | const BufferId new_buffer_id = slot_buffers.insert(runtime, rasterizer, overlap.begin, size); | ||
| 1074 | for (const BufferId overlap_id : overlap.ids) { | ||
| 1075 | JoinOverlap(new_buffer_id, overlap_id, !overlap.has_stream_leap); | ||
| 1076 | } | ||
| 1077 | Register(new_buffer_id); | ||
| 1078 | return new_buffer_id; | ||
| 1079 | } | ||
| 1080 | |||
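ResolveOverlaps and CreateBuffer above grow a new buffer to cover every overlapping buffer and, once the accumulated stream score passes STREAM_LEAP_THRESHOLD, extend the range further so stream-like regions stop being recreated. The following is a simplified standalone sketch of that heuristic; the map of pages to overlaps and the bookkeeping are illustrative, not the patch's data structures.

    #include <algorithm>
    #include <cstdint>
    #include <unordered_map>

    constexpr std::uint64_t PAGE_BITS = 16; // arbitrary page size for the sketch
    constexpr std::uint64_t PAGE_SIZE = 1ULL << PAGE_BITS;

    constexpr std::uint64_t DivCeil(std::uint64_t num, std::uint64_t den) {
        return (num + den - 1) / den;
    }

    // Simplified overlap record: an address range plus a "stream score"
    // counting how often the region has been joined before.
    struct ToyOverlap {
        std::uint64_t cpu_addr;
        std::uint64_t size_bytes;
        int stream_score;
    };

    struct ToyResult {
        std::uint64_t begin;
        std::uint64_t end;
        bool has_stream_leap;
    };

    // page -> overlap, standing in for the page table of buffer ids.
    ToyResult ResolveOverlaps(const std::unordered_map<std::uint64_t, ToyOverlap>& pages,
                              std::uint64_t cpu_addr, std::uint64_t wanted_size) {
        static constexpr int STREAM_LEAP_THRESHOLD = 16;
        std::uint64_t begin = cpu_addr;
        std::uint64_t end = cpu_addr + wanted_size;
        int stream_score = 0;
        bool has_stream_leap = false;
        for (; cpu_addr >> PAGE_BITS < DivCeil(end, PAGE_SIZE); cpu_addr += PAGE_SIZE) {
            const auto it = pages.find(cpu_addr >> PAGE_BITS);
            if (it == pages.end()) {
                continue; // no existing buffer on this page
            }
            const ToyOverlap& overlap = it->second;
            begin = std::min(begin, overlap.cpu_addr);
            end = std::max(end, overlap.cpu_addr + overlap.size_bytes);
            stream_score += overlap.stream_score;
            if (stream_score > STREAM_LEAP_THRESHOLD && !has_stream_leap) {
                // Joined many times before: treat it as a stream buffer and grow
                // the new allocation so it is not immediately recreated again.
                has_stream_leap = true;
                end += PAGE_SIZE * 256;
            }
        }
        return ToyResult{begin, end, has_stream_leap};
    }

    int main() {
        std::unordered_map<std::uint64_t, ToyOverlap> pages;
        pages[2] = ToyOverlap{0x20000, PAGE_SIZE, 20}; // heavily re-joined region
        const ToyResult result = ResolveOverlaps(pages, 0x20000, 0x100);
        return result.has_stream_leap ? 0 : 1;
    }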
| 1081 | template <class P> | ||
| 1082 | void BufferCache<P>::Register(BufferId buffer_id) { | ||
| 1083 | ChangeRegister<true>(buffer_id); | ||
| 1084 | } | ||
| 1085 | |||
| 1086 | template <class P> | ||
| 1087 | void BufferCache<P>::Unregister(BufferId buffer_id) { | ||
| 1088 | ChangeRegister<false>(buffer_id); | ||
| 1089 | } | ||
| 1090 | |||
| 1091 | template <class P> | ||
| 1092 | template <bool insert> | ||
| 1093 | void BufferCache<P>::ChangeRegister(BufferId buffer_id) { | ||
| 1094 | const Buffer& buffer = slot_buffers[buffer_id]; | ||
| 1095 | const VAddr cpu_addr_begin = buffer.CpuAddr(); | ||
| 1096 | const VAddr cpu_addr_end = cpu_addr_begin + buffer.SizeBytes(); | ||
| 1097 | const u64 page_begin = cpu_addr_begin / PAGE_SIZE; | ||
| 1098 | const u64 page_end = Common::DivCeil(cpu_addr_end, PAGE_SIZE); | ||
| 1099 | for (u64 page = page_begin; page != page_end; ++page) { | ||
| 1100 | if constexpr (insert) { | ||
| 1101 | page_table[page] = buffer_id; | ||
| 1102 | } else { | ||
| 1103 | page_table[page] = BufferId{}; | ||
| 539 | } | 1104 | } |
| 540 | } | 1105 | } |
| 1106 | } | ||
| 541 | 1107 | ||
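Register, Unregister and FindBuffer cooperate through the flat page table used above: registration stamps the buffer id on every page from the start page up to DivCeil(end, PAGE_SIZE), and lookup reads the start page and reuses the registered buffer only if the requested range is in bounds. Here is a standalone sketch under those assumptions, with toy types and an arbitrary page size; it is not the patch's code.

    #include <cstdint>
    #include <vector>

    constexpr std::uint64_t PAGE_BITS = 16;
    constexpr std::uint64_t PAGE_SIZE = 1ULL << PAGE_BITS;

    constexpr std::uint64_t DivCeil(std::uint64_t num, std::uint64_t den) {
        return (num + den - 1) / den;
    }

    struct ToyBuffer {
        std::uint64_t cpu_addr = 0;
        std::uint64_t size_bytes = 0;
        bool IsInBounds(std::uint64_t addr, std::uint64_t size) const {
            return addr >= cpu_addr && addr + size <= cpu_addr + size_bytes;
        }
    };

    std::vector<std::uint32_t> page_table(1 << 12); // page -> buffer id, 0 = none
    std::vector<ToyBuffer> buffers(16);

    // Mirrors the ChangeRegister<insert> idea: mark (or clear) every page the buffer covers.
    void ChangeRegister(std::uint32_t buffer_id, bool insert) {
        const ToyBuffer& buffer = buffers[buffer_id];
        const std::uint64_t page_begin = buffer.cpu_addr / PAGE_SIZE;
        const std::uint64_t page_end = DivCeil(buffer.cpu_addr + buffer.size_bytes, PAGE_SIZE);
        for (std::uint64_t page = page_begin; page != page_end; ++page) {
            page_table[page] = insert ? buffer_id : 0u;
        }
    }

    // Mirrors the FindBuffer idea: look at the page of the start address and reuse the
    // registered buffer when the whole range fits; otherwise a new buffer would be
    // created and registered (omitted here).
    std::uint32_t FindBuffer(std::uint64_t cpu_addr, std::uint64_t size) {
        const std::uint32_t buffer_id = page_table[cpu_addr >> PAGE_BITS];
        if (buffer_id != 0 && buffers[buffer_id].IsInBounds(cpu_addr, size)) {
            return buffer_id;
        }
        return 0; // caller would create and register a new buffer here
    }

    int main() {
        buffers[3] = ToyBuffer{0x40000, 2 * PAGE_SIZE};
        ChangeRegister(3, true); // register: pages 4 and 5 now map to buffer 3
        return FindBuffer(0x41000, 0x80) == 3 ? 0 : 1;
    }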
| 542 | bool IsRegionWritten(VAddr start, VAddr end) const { | 1108 | template <class P> |
| 543 | const u64 page_end = end >> WRITE_PAGE_BIT; | 1109 | void BufferCache<P>::SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size) { |
| 544 | for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) { | 1110 | if (buffer.CpuAddr() == 0) { |
| 545 | if (written_pages.contains(page_start)) { | 1111 | return; |
| 546 | return true; | 1112 | } |
| 1113 | SynchronizeBufferImpl(buffer, cpu_addr, size); | ||
| 1114 | } | ||
| 1115 | |||
| 1116 | template <class P> | ||
| 1117 | void BufferCache<P>::SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size) { | ||
| 1118 | boost::container::small_vector<BufferCopy, 4> copies; | ||
| 1119 | u64 total_size_bytes = 0; | ||
| 1120 | u64 largest_copy = 0; | ||
| 1121 | buffer.ForEachUploadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) { | ||
| 1122 | copies.push_back(BufferCopy{ | ||
| 1123 | .src_offset = total_size_bytes, | ||
| 1124 | .dst_offset = range_offset, | ||
| 1125 | .size = range_size, | ||
| 1126 | }); | ||
| 1127 | total_size_bytes += range_size; | ||
| 1128 | largest_copy = std::max(largest_copy, range_size); | ||
| 1129 | }); | ||
| 1130 | if (total_size_bytes == 0) { | ||
| 1131 | return; | ||
| 1132 | } | ||
| 1133 | const std::span<BufferCopy> copies_span(copies.data(), copies.size()); | ||
| 1134 | UploadMemory(buffer, total_size_bytes, largest_copy, copies_span); | ||
| 1135 | } | ||
| 1136 | |||
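SynchronizeBufferImpl above packs every pending upload range into a single staging allocation: each range becomes a BufferCopy whose source offset is the running total, while total_size_bytes and largest_copy size the mapped-staging and immediate paths respectively. A standalone sketch of that accumulation follows, with a hard-coded range list in place of ForEachUploadRange; the ranges and sizes are made up for illustration.

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <utility>
    #include <vector>

    struct BufferCopy {
        std::uint64_t src_offset; // offset inside the staging allocation
        std::uint64_t dst_offset; // offset inside the destination buffer
        std::uint64_t size;
    };

    int main() {
        // Pretend these are the (offset, size) ranges a ForEachUploadRange-style
        // visitor would report for the modified parts of the buffer.
        const std::vector<std::pair<std::uint64_t, std::uint64_t>> ranges{
            {0x000, 0x40}, {0x100, 0x20}, {0x400, 0x80},
        };
        std::vector<BufferCopy> copies;
        std::uint64_t total_size_bytes = 0;
        std::uint64_t largest_copy = 0;
        for (const auto& [offset, size] : ranges) {
            copies.push_back(BufferCopy{
                .src_offset = total_size_bytes, // ranges are packed back to back in staging
                .dst_offset = offset,
                .size = size,
            });
            total_size_bytes += size;
            largest_copy = std::max(largest_copy, size);
        }
        // total_size_bytes sizes the staging buffer; largest_copy sizes the
        // fallback immediate buffer used when memory maps are unavailable.
        std::printf("staging bytes: %llu, largest copy: %llu\n",
                    static_cast<unsigned long long>(total_size_bytes),
                    static_cast<unsigned long long>(largest_copy));
    }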
| 1137 | template <class P> | ||
| 1138 | void BufferCache<P>::UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy, | ||
| 1139 | std::span<BufferCopy> copies) { | ||
| 1140 | if constexpr (USE_MEMORY_MAPS) { | ||
| 1141 | MappedUploadMemory(buffer, total_size_bytes, copies); | ||
| 1142 | } else { | ||
| 1143 | ImmediateUploadMemory(buffer, largest_copy, copies); | ||
| 1144 | } | ||
| 1145 | } | ||
| 1146 | |||
| 1147 | template <class P> | ||
| 1148 | void BufferCache<P>::ImmediateUploadMemory(Buffer& buffer, u64 largest_copy, | ||
| 1149 | std::span<const BufferCopy> copies) { | ||
| 1150 | std::span<u8> immediate_buffer; | ||
| 1151 | for (const BufferCopy& copy : copies) { | ||
| 1152 | std::span<const u8> upload_span; | ||
| 1153 | const VAddr cpu_addr = buffer.CpuAddr() + copy.dst_offset; | ||
| 1154 | if (IsRangeGranular(cpu_addr, copy.size)) { | ||
| 1155 | upload_span = std::span(cpu_memory.GetPointer(cpu_addr), copy.size); | ||
| 1156 | } else { | ||
| 1157 | if (immediate_buffer.empty()) { | ||
| 1158 | immediate_buffer = ImmediateBuffer(largest_copy); | ||
| 547 | } | 1159 | } |
| 1160 | cpu_memory.ReadBlockUnsafe(cpu_addr, immediate_buffer.data(), copy.size); | ||
| 1161 | upload_span = immediate_buffer.subspan(0, copy.size); | ||
| 548 | } | 1162 | } |
| 549 | return false; | 1163 | buffer.ImmediateUpload(copy.dst_offset, upload_span); |
| 550 | } | 1164 | } |
| 551 | 1165 | } | |
| 552 | void QueueDestruction(std::shared_ptr<Buffer> buffer) { | 1166 | |
| 553 | buffer->SetEpoch(epoch); | 1167 | template <class P> |
| 554 | pending_destruction.push(std::move(buffer)); | 1168 | void BufferCache<P>::MappedUploadMemory(Buffer& buffer, u64 total_size_bytes, |
| 1169 | std::span<BufferCopy> copies) { | ||
| 1170 | auto upload_staging = runtime.UploadStagingBuffer(total_size_bytes); | ||
| 1171 | const std::span<u8> staging_pointer = upload_staging.mapped_span; | ||
| 1172 | for (BufferCopy& copy : copies) { | ||
| 1173 | u8* const src_pointer = staging_pointer.data() + copy.src_offset; | ||
| 1174 | const VAddr cpu_addr = buffer.CpuAddr() + copy.dst_offset; | ||
| 1175 | cpu_memory.ReadBlockUnsafe(cpu_addr, src_pointer, copy.size); | ||
| 1176 | |||
| 1177 | // Apply the staging offset | ||
| 1178 | copy.src_offset += upload_staging.offset; | ||
| 555 | } | 1179 | } |
| 556 | 1180 | runtime.CopyBuffer(buffer, upload_staging.buffer, copies); | |
| 557 | void MarkForAsyncFlush(MapInterval* map) { | 1181 | } |
| 558 | if (!uncommitted_flushes) { | 1182 | |
| 559 | uncommitted_flushes = std::make_shared<std::unordered_set<MapInterval*>>(); | 1183 | template <class P> |
| 1184 | void BufferCache<P>::DeleteBuffer(BufferId buffer_id) { | ||
| 1185 | const auto scalar_replace = [buffer_id](Binding& binding) { | ||
| 1186 | if (binding.buffer_id == buffer_id) { | ||
| 1187 | binding.buffer_id = BufferId{}; | ||
| 1188 | } | ||
| 1189 | }; | ||
| 1190 | const auto replace = [scalar_replace](std::span<Binding> bindings) { | ||
| 1191 | std::ranges::for_each(bindings, scalar_replace); | ||
| 1192 | }; | ||
| 1193 | scalar_replace(index_buffer); | ||
| 1194 | replace(vertex_buffers); | ||
| 1195 | std::ranges::for_each(uniform_buffers, replace); | ||
| 1196 | std::ranges::for_each(storage_buffers, replace); | ||
| 1197 | replace(transform_feedback_buffers); | ||
| 1198 | replace(compute_uniform_buffers); | ||
| 1199 | replace(compute_storage_buffers); | ||
| 1200 | std::erase(cached_write_buffer_ids, buffer_id); | ||
| 1201 | |||
| 1202 | // Mark the whole buffer as CPU written to stop tracking CPU writes | ||
| 1203 | Buffer& buffer = slot_buffers[buffer_id]; | ||
| 1204 | buffer.MarkRegionAsCpuModified(buffer.CpuAddr(), buffer.SizeBytes()); | ||
| 1205 | |||
| 1206 | Unregister(buffer_id); | ||
| 1207 | delayed_destruction_ring.Push(std::move(slot_buffers[buffer_id])); | ||
| 1208 | |||
| 1209 | NotifyBufferDeletion(); | ||
| 1210 | } | ||
| 1211 | |||
| 1212 | template <class P> | ||
| 1213 | void BufferCache<P>::ReplaceBufferDownloads(BufferId old_buffer_id, BufferId new_buffer_id) { | ||
| 1214 | const auto replace = [old_buffer_id, new_buffer_id](std::vector<BufferId>& buffers) { | ||
| 1215 | std::ranges::replace(buffers, old_buffer_id, new_buffer_id); | ||
| 1216 | if (auto it = std::ranges::find(buffers, new_buffer_id); it != buffers.end()) { | ||
| 1217 | buffers.erase(std::remove(it + 1, buffers.end(), new_buffer_id), buffers.end()); | ||
| 560 | } | 1218 | } |
| 561 | uncommitted_flushes->insert(map); | 1219 | }; |
| 1220 | replace(uncommitted_downloads); | ||
| 1221 | std::ranges::for_each(committed_downloads, replace); | ||
| 1222 | } | ||
| 1223 | |||
| 1224 | template <class P> | ||
| 1225 | void BufferCache<P>::NotifyBufferDeletion() { | ||
| 1226 | if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) { | ||
| 1227 | dirty_uniform_buffers.fill(~u32{0}); | ||
| 562 | } | 1228 | } |
| 1229 | auto& flags = maxwell3d.dirty.flags; | ||
| 1230 | flags[Dirty::IndexBuffer] = true; | ||
| 1231 | flags[Dirty::VertexBuffers] = true; | ||
| 1232 | for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) { | ||
| 1233 | flags[Dirty::VertexBuffer0 + index] = true; | ||
| 1234 | } | ||
| 1235 | has_deleted_buffers = true; | ||
| 1236 | } | ||
| 1237 | |||
| 1238 | template <class P> | ||
| 1239 | typename BufferCache<P>::Binding BufferCache<P>::StorageBufferBinding(GPUVAddr ssbo_addr) const { | ||
| 1240 | const GPUVAddr gpu_addr = gpu_memory.Read<u64>(ssbo_addr); | ||
| 1241 | const u32 size = gpu_memory.Read<u32>(ssbo_addr + 8); | ||
| 1242 | const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); | ||
| 1243 | if (!cpu_addr || size == 0) { | ||
| 1244 | return NULL_BINDING; | ||
| 1245 | } | ||
| 1246 | // HACK(Rodrigo): This is the number of bytes bound on the host beyond the guest API's range. | ||
| 1247 | // It exists because some games, such as Astral Chain, operate out of bounds. | ||
| 1248 | // Binding the whole map range would be technically correct, but games have large maps that make | ||
| 1249 | // this approach unaffordable for now. | ||
| 1250 | static constexpr u32 arbitrary_extra_bytes = 0xc000; | ||
| 1251 | const u32 bytes_to_map_end = static_cast<u32>(gpu_memory.BytesToMapEnd(gpu_addr)); | ||
| 1252 | const Binding binding{ | ||
| 1253 | .cpu_addr = *cpu_addr, | ||
| 1254 | .size = std::min(size + arbitrary_extra_bytes, bytes_to_map_end), | ||
| 1255 | .buffer_id = BufferId{}, | ||
| 1256 | }; | ||
| 1257 | return binding; | ||
| 1258 | } | ||
| 1259 | |||
| 1260 | template <class P> | ||
| 1261 | std::span<const u8> BufferCache<P>::ImmediateBufferWithData(VAddr cpu_addr, size_t size) { | ||
| 1262 | u8* const base_pointer = cpu_memory.GetPointer(cpu_addr); | ||
| 1263 | if (IsRangeGranular(cpu_addr, size) || | ||
| 1264 | base_pointer + size == cpu_memory.GetPointer(cpu_addr + size)) { | ||
| 1265 | return std::span(base_pointer, size); | ||
| 1266 | } else { | ||
| 1267 | const std::span<u8> span = ImmediateBuffer(size); | ||
| 1268 | cpu_memory.ReadBlockUnsafe(cpu_addr, span.data(), size); | ||
| 1269 | return span; | ||
| 1270 | } | ||
| 1271 | } | ||
| 563 | 1272 | ||
| 564 | VideoCore::RasterizerInterface& rasterizer; | 1273 | template <class P> |
| 565 | Tegra::MemoryManager& gpu_memory; | 1274 | std::span<u8> BufferCache<P>::ImmediateBuffer(size_t wanted_capacity) { |
| 566 | Core::Memory::Memory& cpu_memory; | 1275 | if (wanted_capacity > immediate_buffer_capacity) { |
| 567 | StreamBuffer& stream_buffer; | 1276 | immediate_buffer_capacity = wanted_capacity; |
| 568 | 1277 | immediate_buffer_alloc = std::make_unique<u8[]>(wanted_capacity); | |
| 569 | u8* buffer_ptr = nullptr; | 1278 | } |
| 570 | u64 buffer_offset = 0; | 1279 | return std::span<u8>(immediate_buffer_alloc.get(), wanted_capacity); |
| 571 | u64 buffer_offset_base = 0; | 1280 | } |
| 572 | 1281 | ||
| 573 | MapIntervalAllocator mapped_addresses_allocator; | 1282 | template <class P> |
| 574 | boost::intrusive::set<MapInterval, boost::intrusive::compare<MapIntervalCompare>> | 1283 | bool BufferCache<P>::HasFastUniformBufferBound(size_t stage, u32 binding_index) const noexcept { |
| 575 | mapped_addresses; | 1284 | if constexpr (IS_OPENGL) { |
| 576 | 1285 | return ((fast_bound_uniform_buffers[stage] >> binding_index) & 1) != 0; | |
| 577 | std::unordered_map<u64, u32> written_pages; | 1286 | } else { |
| 578 | std::unordered_map<u64, std::shared_ptr<Buffer>> blocks; | 1287 | // Only OpenGL has fast uniform buffers |
| 579 | 1288 | return false; | |
| 580 | std::queue<std::shared_ptr<Buffer>> pending_destruction; | 1289 | } |
| 581 | u64 epoch = 0; | 1290 | } |
| 582 | u64 modified_ticks = 0; | ||
| 583 | |||
| 584 | std::vector<u8> staging_buffer; | ||
| 585 | |||
| 586 | std::list<MapInterval*> marked_for_unregister; | ||
| 587 | |||
| 588 | std::shared_ptr<std::unordered_set<MapInterval*>> uncommitted_flushes; | ||
| 589 | std::list<std::shared_ptr<std::list<MapInterval*>>> committed_flushes; | ||
| 590 | |||
| 591 | std::recursive_mutex mutex; | ||
| 592 | }; | ||
| 593 | 1291 | ||
| 594 | } // namespace VideoCommon | 1292 | } // namespace VideoCommon |
diff --git a/src/video_core/buffer_cache/map_interval.cpp b/src/video_core/buffer_cache/map_interval.cpp deleted file mode 100644 index 62587e18a..000000000 --- a/src/video_core/buffer_cache/map_interval.cpp +++ /dev/null | |||
| @@ -1,33 +0,0 @@ | |||
| 1 | // Copyright 2020 yuzu Emulator Project | ||
| 2 | // Licensed under GPLv2 or any later version | ||
| 3 | // Refer to the license.txt file included. | ||
| 4 | |||
| 5 | #include <algorithm> | ||
| 6 | #include <array> | ||
| 7 | #include <cstddef> | ||
| 8 | #include <memory> | ||
| 9 | |||
| 10 | #include "video_core/buffer_cache/map_interval.h" | ||
| 11 | |||
| 12 | namespace VideoCommon { | ||
| 13 | |||
| 14 | MapIntervalAllocator::MapIntervalAllocator() { | ||
| 15 | FillFreeList(first_chunk); | ||
| 16 | } | ||
| 17 | |||
| 18 | MapIntervalAllocator::~MapIntervalAllocator() = default; | ||
| 19 | |||
| 20 | void MapIntervalAllocator::AllocateNewChunk() { | ||
| 21 | *new_chunk = std::make_unique<Chunk>(); | ||
| 22 | FillFreeList(**new_chunk); | ||
| 23 | new_chunk = &(*new_chunk)->next; | ||
| 24 | } | ||
| 25 | |||
| 26 | void MapIntervalAllocator::FillFreeList(Chunk& chunk) { | ||
| 27 | const std::size_t old_size = free_list.size(); | ||
| 28 | free_list.resize(old_size + chunk.data.size()); | ||
| 29 | std::transform(chunk.data.rbegin(), chunk.data.rend(), free_list.begin() + old_size, | ||
| 30 | [](MapInterval& interval) { return &interval; }); | ||
| 31 | } | ||
| 32 | |||
| 33 | } // namespace VideoCommon | ||
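The least obvious part of the deleted allocator is how AllocateNewChunk() grows the chunk list: new_chunk always points at the std::unique_ptr that will own the next chunk, so appending never has to walk the list. A minimal standalone sketch of that tail-pointer idiom (illustrative Chunk/tail names, not the removed yuzu code):

    #include <cstdio>
    #include <memory>

    // Each chunk owns the next one; `tail` always names the unique_ptr that will
    // receive the next chunk, so appending is O(1) and no list traversal is needed.
    struct Chunk {
        std::unique_ptr<Chunk> next;
        int id = 0;
    };

    int main() {
        Chunk first_chunk;                                // analogous to first_chunk above
        std::unique_ptr<Chunk>* tail = &first_chunk.next; // analogous to new_chunk above
        for (int id = 1; id <= 3; ++id) {
            *tail = std::make_unique<Chunk>(); // append a chunk at the current tail
            (*tail)->id = id;
            tail = &(*tail)->next;             // advance the tail to the fresh chunk's `next`
        }
        for (const Chunk* chunk = &first_chunk; chunk != nullptr; chunk = chunk->next.get()) {
            std::printf("chunk %d\n", chunk->id); // prints chunk 0 through chunk 3, in append order
        }
    }

Note also that FillFreeList() walks each chunk in reverse, so Allocate(), which pops from the back of the free list, hands out intervals in ascending address order within a chunk, presumably for locality.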
diff --git a/src/video_core/buffer_cache/map_interval.h b/src/video_core/buffer_cache/map_interval.h deleted file mode 100644 index ef974b08a..000000000 --- a/src/video_core/buffer_cache/map_interval.h +++ /dev/null | |||
| @@ -1,93 +0,0 @@ | |||
| 1 | // Copyright 2019 yuzu Emulator Project | ||
| 2 | // Licensed under GPLv2 or any later version | ||
| 3 | // Refer to the license.txt file included. | ||
| 4 | |||
| 5 | #pragma once | ||
| 6 | |||
| 7 | #include <array> | ||
| 8 | #include <cstddef> | ||
| 9 | #include <memory> | ||
| 10 | #include <vector> | ||
| 11 | |||
| 12 | #include <boost/intrusive/set_hook.hpp> | ||
| 13 | |||
| 14 | #include "common/common_types.h" | ||
| 15 | #include "video_core/gpu.h" | ||
| 16 | |||
| 17 | namespace VideoCommon { | ||
| 18 | |||
| 19 | struct MapInterval : public boost::intrusive::set_base_hook<boost::intrusive::optimize_size<true>> { | ||
| 20 | MapInterval() = default; | ||
| 21 | |||
| 22 | /*implicit*/ MapInterval(VAddr start_) noexcept : start{start_} {} | ||
| 23 | |||
| 24 | explicit MapInterval(VAddr start_, VAddr end_, GPUVAddr gpu_addr_) noexcept | ||
| 25 | : start{start_}, end{end_}, gpu_addr{gpu_addr_} {} | ||
| 26 | |||
| 27 | bool IsInside(VAddr other_start, VAddr other_end) const noexcept { | ||
| 28 | return start <= other_start && other_end <= end; | ||
| 29 | } | ||
| 30 | |||
| 31 | bool Overlaps(VAddr other_start, VAddr other_end) const noexcept { | ||
| 32 | return start < other_end && other_start < end; | ||
| 33 | } | ||
| 34 | |||
| 35 | void MarkAsModified(bool is_modified_, u64 ticks_) noexcept { | ||
| 36 | is_modified = is_modified_; | ||
| 37 | ticks = ticks_; | ||
| 38 | } | ||
| 39 | |||
| 40 | boost::intrusive::set_member_hook<> member_hook_; | ||
| 41 | VAddr start = 0; | ||
| 42 | VAddr end = 0; | ||
| 43 | GPUVAddr gpu_addr = 0; | ||
| 44 | u64 ticks = 0; | ||
| 45 | bool is_written = false; | ||
| 46 | bool is_modified = false; | ||
| 47 | bool is_registered = false; | ||
| 48 | bool is_memory_marked = false; | ||
| 49 | bool is_sync_pending = false; | ||
| 50 | }; | ||
| 51 | |||
| 52 | struct MapIntervalCompare { | ||
| 53 | constexpr bool operator()(const MapInterval& lhs, const MapInterval& rhs) const noexcept { | ||
| 54 | return lhs.start < rhs.start; | ||
| 55 | } | ||
| 56 | }; | ||
| 57 | |||
| 58 | class MapIntervalAllocator { | ||
| 59 | public: | ||
| 60 | MapIntervalAllocator(); | ||
| 61 | ~MapIntervalAllocator(); | ||
| 62 | |||
| 63 | MapInterval* Allocate() { | ||
| 64 | if (free_list.empty()) { | ||
| 65 | AllocateNewChunk(); | ||
| 66 | } | ||
| 67 | MapInterval* const interval = free_list.back(); | ||
| 68 | free_list.pop_back(); | ||
| 69 | return interval; | ||
| 70 | } | ||
| 71 | |||
| 72 | void Release(MapInterval* interval) { | ||
| 73 | free_list.push_back(interval); | ||
| 74 | } | ||
| 75 | |||
| 76 | private: | ||
| 77 | struct Chunk { | ||
| 78 | std::unique_ptr<Chunk> next; | ||
| 79 | std::array<MapInterval, 0x8000> data; | ||
| 80 | }; | ||
| 81 | |||
| 82 | void AllocateNewChunk(); | ||
| 83 | |||
| 84 | void FillFreeList(Chunk& chunk); | ||
| 85 | |||
| 86 | std::vector<MapInterval*> free_list; | ||
| 87 | |||
| 88 | Chunk first_chunk; | ||
| 89 | |||
| 90 | std::unique_ptr<Chunk>* new_chunk = &first_chunk.next; | ||
| 91 | }; | ||
| 92 | |||
| 93 | } // namespace VideoCommon | ||
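The deleted MapInterval derived from boost::intrusive::set_base_hook so the cache could keep intervals in an intrusive set (the mapped_addresses member removed from buffer_cache.h above) without any per-node allocation. A minimal standalone sketch of that pattern, assuming Boost.Intrusive is available; Node and NodeCompare are illustrative stand-ins for the removed MapInterval and MapIntervalCompare, not yuzu code:

    #include <cstdio>

    #include <boost/intrusive/set.hpp>

    // The hook is a base class, so every Node carries its own tree linkage and
    // inserting into the set performs no allocation at all.
    struct Node : boost::intrusive::set_base_hook<boost::intrusive::optimize_size<true>> {
        explicit Node(unsigned start_) : start{start_} {}
        unsigned start = 0;
    };

    struct NodeCompare {
        bool operator()(const Node& lhs, const Node& rhs) const noexcept {
            return lhs.start < rhs.start;
        }
    };

    int main() {
        boost::intrusive::set<Node, boost::intrusive::compare<NodeCompare>> intervals;
        Node a{0x3000}, b{0x1000}, c{0x2000};
        intervals.insert(a); // stores a reference; `a` must outlive its membership
        intervals.insert(b);
        intervals.insert(c);
        for (const Node& node : intervals) {
            std::printf("0x%x\n", node.start); // 0x1000, 0x2000, 0x3000
        }
        intervals.clear(); // unlink every node before the objects are destroyed
    }

Because the hook lives inside each object, registering and unregistering intervals never touches the heap, which is presumably why this set was paired with the pooled MapIntervalAllocator above.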