author     Fernando Sahmkow   2022-11-20 00:09:56 +0100
committer  Fernando Sahmkow   2023-04-29 00:46:31 +0200
commit     a16c2611316e534bda310f99319f4e8c74c49c92 (patch)
tree       906e44bc1bfcd358ecf7510a99adff05394c2846
parent     Merge pull request #10051 from liamwhite/surface-capabilities (diff)
download   yuzu-a16c2611316e534bda310f99319f4e8c74c49c92.tar.gz
           yuzu-a16c2611316e534bda310f99319f4e8c74c49c92.tar.xz
           yuzu-a16c2611316e534bda310f99319f4e8c74c49c92.zip
Buffer Cache: Fully rework the buffer cache.
-rw-r--r--  src/video_core/CMakeLists.txt                             |   5
-rw-r--r--  src/video_core/buffer_cache/buffer_base.h                 | 459
-rw-r--r--  src/video_core/buffer_cache/buffer_cache.cpp              |   4
-rw-r--r--  src/video_core/buffer_cache/buffer_cache.h                | 990
-rw-r--r--  src/video_core/buffer_cache/buffer_cache_base.h           | 507
-rw-r--r--  src/video_core/buffer_cache/memory_tracker_base.h         | 258
-rw-r--r--  src/video_core/buffer_cache/word_manager.h                | 474
-rw-r--r--  src/video_core/renderer_opengl/gl_buffer_cache.h          |   4
-rw-r--r--  src/video_core/renderer_opengl/gl_buffer_cache_base.cpp   |   9
-rw-r--r--  src/video_core/renderer_vulkan/vk_buffer_cache.cpp        |   8
-rw-r--r--  src/video_core/renderer_vulkan/vk_buffer_cache.h          |   8
-rw-r--r--  src/video_core/renderer_vulkan/vk_buffer_cache_base.cpp   |   9
12 files changed, 1644 insertions, 1091 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index e904573d7..92cab93f3 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -11,8 +11,11 @@ endif()
 
 add_library(video_core STATIC
     buffer_cache/buffer_base.h
+    buffer_cache/buffer_cache_base.h
     buffer_cache/buffer_cache.cpp
     buffer_cache/buffer_cache.h
+    buffer_cache/memory_tracker_base.h
+    buffer_cache/word_manager.h
     cache_types.h
     cdma_pusher.cpp
     cdma_pusher.h
@@ -104,6 +107,7 @@ add_library(video_core STATIC
     renderer_null/renderer_null.h
     renderer_opengl/blit_image.cpp
     renderer_opengl/blit_image.h
+    renderer_opengl/gl_buffer_cache_base.cpp
     renderer_opengl/gl_buffer_cache.cpp
     renderer_opengl/gl_buffer_cache.h
     renderer_opengl/gl_compute_pipeline.cpp
@@ -154,6 +158,7 @@ add_library(video_core STATIC
     renderer_vulkan/renderer_vulkan.cpp
     renderer_vulkan/vk_blit_screen.cpp
     renderer_vulkan/vk_blit_screen.h
+    renderer_vulkan/vk_buffer_cache_base.cpp
     renderer_vulkan/vk_buffer_cache.cpp
     renderer_vulkan/vk_buffer_cache.h
     renderer_vulkan/vk_command_pool.cpp
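
The gl_buffer_cache_base.cpp and vk_buffer_cache_base.cpp entries added above are tiny translation units (9 lines each in the diffstat). Their contents are not shown in this excerpt; a common reason for such files is to explicitly instantiate the templated BufferCache once per backend so the template body that now lives in buffer_cache.h is compiled in a single place. A hypothetical sketch under that assumption (the parameter type name OpenGL::BufferCacheParams is assumed, not confirmed by this diff):

    // Hypothetical sketch of renderer_opengl/gl_buffer_cache_base.cpp (real contents
    // not shown in this excerpt): explicitly instantiate the templated cache for OpenGL.
    #include "video_core/buffer_cache/buffer_cache.h"
    #include "video_core/renderer_opengl/gl_buffer_cache.h"

    namespace VideoCommon {
    template class BufferCache<OpenGL::BufferCacheParams>;
    } // namespace VideoCommon
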
diff --git a/src/video_core/buffer_cache/buffer_base.h b/src/video_core/buffer_cache/buffer_base.h
index 1b4d63616..66d8bb43c 100644
--- a/src/video_core/buffer_cache/buffer_base.h
+++ b/src/video_core/buffer_cache/buffer_base.h
@@ -1,5 +1,5 @@
-// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project
-// SPDX-License-Identifier: GPL-2.0-or-later
+// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
 
 #pragma once
 
@@ -11,9 +11,7 @@
11#include "common/alignment.h" 11#include "common/alignment.h"
12#include "common/common_funcs.h" 12#include "common/common_funcs.h"
13#include "common/common_types.h" 13#include "common/common_types.h"
14#include "common/div_ceil.h" 14#include "video_core/buffer_cache/word_manager.h"
15#include "common/settings.h"
16#include "core/memory.h"
17 15
18namespace VideoCommon { 16namespace VideoCommon {
19 17
@@ -36,116 +34,14 @@ struct NullBufferParams {};
36 */ 34 */
37template <class RasterizerInterface> 35template <class RasterizerInterface>
38class BufferBase { 36class BufferBase {
39 static constexpr u64 PAGES_PER_WORD = 64;
40 static constexpr u64 BYTES_PER_PAGE = Core::Memory::YUZU_PAGESIZE;
41 static constexpr u64 BYTES_PER_WORD = PAGES_PER_WORD * BYTES_PER_PAGE;
42
43 /// Vector tracking modified pages tightly packed with small vector optimization
44 union WordsArray {
45 /// Returns the pointer to the words state
46 [[nodiscard]] const u64* Pointer(bool is_short) const noexcept {
47 return is_short ? &stack : heap;
48 }
49
50 /// Returns the pointer to the words state
51 [[nodiscard]] u64* Pointer(bool is_short) noexcept {
52 return is_short ? &stack : heap;
53 }
54
55 u64 stack = 0; ///< Small buffers storage
56 u64* heap; ///< Not-small buffers pointer to the storage
57 };
58
59 struct Words {
60 explicit Words() = default;
61 explicit Words(u64 size_bytes_) : size_bytes{size_bytes_} {
62 if (IsShort()) {
63 cpu.stack = ~u64{0};
64 gpu.stack = 0;
65 cached_cpu.stack = 0;
66 untracked.stack = ~u64{0};
67 } else {
68 // Share allocation between CPU and GPU pages and set their default values
69 const size_t num_words = NumWords();
70 u64* const alloc = new u64[num_words * 4];
71 cpu.heap = alloc;
72 gpu.heap = alloc + num_words;
73 cached_cpu.heap = alloc + num_words * 2;
74 untracked.heap = alloc + num_words * 3;
75 std::fill_n(cpu.heap, num_words, ~u64{0});
76 std::fill_n(gpu.heap, num_words, 0);
77 std::fill_n(cached_cpu.heap, num_words, 0);
78 std::fill_n(untracked.heap, num_words, ~u64{0});
79 }
80 // Clean up tailing bits
81 const u64 last_word_size = size_bytes % BYTES_PER_WORD;
82 const u64 last_local_page = Common::DivCeil(last_word_size, BYTES_PER_PAGE);
83 const u64 shift = (PAGES_PER_WORD - last_local_page) % PAGES_PER_WORD;
84 const u64 last_word = (~u64{0} << shift) >> shift;
85 cpu.Pointer(IsShort())[NumWords() - 1] = last_word;
86 untracked.Pointer(IsShort())[NumWords() - 1] = last_word;
87 }
88
89 ~Words() {
90 Release();
91 }
92
93 Words& operator=(Words&& rhs) noexcept {
94 Release();
95 size_bytes = rhs.size_bytes;
96 cpu = rhs.cpu;
97 gpu = rhs.gpu;
98 cached_cpu = rhs.cached_cpu;
99 untracked = rhs.untracked;
100 rhs.cpu.heap = nullptr;
101 return *this;
102 }
103
104 Words(Words&& rhs) noexcept
105 : size_bytes{rhs.size_bytes}, cpu{rhs.cpu}, gpu{rhs.gpu},
106 cached_cpu{rhs.cached_cpu}, untracked{rhs.untracked} {
107 rhs.cpu.heap = nullptr;
108 }
109
110 Words& operator=(const Words&) = delete;
111 Words(const Words&) = delete;
112
113 /// Returns true when the buffer fits in the small vector optimization
114 [[nodiscard]] bool IsShort() const noexcept {
115 return size_bytes <= BYTES_PER_WORD;
116 }
117
118 /// Returns the number of words of the buffer
119 [[nodiscard]] size_t NumWords() const noexcept {
120 return Common::DivCeil(size_bytes, BYTES_PER_WORD);
121 }
122
123 /// Release buffer resources
124 void Release() {
125 if (!IsShort()) {
126 // CPU written words is the base for the heap allocation
127 delete[] cpu.heap;
128 }
129 }
130
131 u64 size_bytes = 0;
132 WordsArray cpu;
133 WordsArray gpu;
134 WordsArray cached_cpu;
135 WordsArray untracked;
136 };
137
138 enum class Type {
139 CPU,
140 GPU,
141 CachedCPU,
142 Untracked,
143 };
144
145public: 37public:
38 static constexpr u64 BASE_PAGE_BITS = 16;
39 static constexpr u64 BASE_PAGE_SIZE = 1ULL << BASE_PAGE_BITS;
40
146 explicit BufferBase(RasterizerInterface& rasterizer_, VAddr cpu_addr_, u64 size_bytes) 41 explicit BufferBase(RasterizerInterface& rasterizer_, VAddr cpu_addr_, u64 size_bytes)
147 : rasterizer{&rasterizer_}, cpu_addr{Common::AlignDown(cpu_addr_, BYTES_PER_PAGE)}, 42 : cpu_addr{Common::AlignDown(cpu_addr_, BASE_PAGE_SIZE)},
148 words(Common::AlignUp(size_bytes + (cpu_addr_ - cpu_addr), BYTES_PER_PAGE)) {} 43 word_manager(cpu_addr, rasterizer_,
44 Common::AlignUp(size_bytes + (cpu_addr_ - cpu_addr), BASE_PAGE_SIZE)) {}
149 45
150 explicit BufferBase(NullBufferParams) {} 46 explicit BufferBase(NullBufferParams) {}
151 47
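
The reworked constructor above aligns the buffer's base address down to BASE_PAGE_SIZE (64 KiB, from BASE_PAGE_BITS = 16) and rounds the tracked size up by the leftover slack, so the WordManager always covers whole tracking pages. A small standalone example of that arithmetic, with invented values and local stand-ins for Common::AlignDown/Common::AlignUp:

    #include <cstdint>
    #include <cstdio>

    constexpr std::uint64_t BASE_PAGE_BITS = 16;          // as in the new BufferBase
    constexpr std::uint64_t BASE_PAGE_SIZE = 1ULL << BASE_PAGE_BITS;

    // Local stand-ins for Common::AlignDown / Common::AlignUp (power-of-two alignment).
    constexpr std::uint64_t AlignDown(std::uint64_t value, std::uint64_t align) {
        return value & ~(align - 1);
    }
    constexpr std::uint64_t AlignUp(std::uint64_t value, std::uint64_t align) {
        return AlignDown(value + align - 1, align);
    }

    int main() {
        const std::uint64_t cpu_addr_ = 0x12345678;       // requested base (example value)
        const std::uint64_t size_bytes = 0x3000;          // requested size (example value)
        const std::uint64_t cpu_addr = AlignDown(cpu_addr_, BASE_PAGE_SIZE);
        const std::uint64_t tracked = AlignUp(size_bytes + (cpu_addr_ - cpu_addr), BASE_PAGE_SIZE);
        // Prints base=0x12340000 tracked=0x10000: the slack before the original
        // address is folded into the tracked size.
        std::printf("base=%#llx tracked=%#llx\n", static_cast<unsigned long long>(cpu_addr),
                    static_cast<unsigned long long>(tracked));
    }
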
@@ -159,94 +55,82 @@ public:
159 [[nodiscard]] std::pair<u64, u64> ModifiedCpuRegion(VAddr query_cpu_addr, 55 [[nodiscard]] std::pair<u64, u64> ModifiedCpuRegion(VAddr query_cpu_addr,
160 u64 query_size) const noexcept { 56 u64 query_size) const noexcept {
161 const u64 offset = query_cpu_addr - cpu_addr; 57 const u64 offset = query_cpu_addr - cpu_addr;
162 return ModifiedRegion<Type::CPU>(offset, query_size); 58 return word_manager.ModifiedRegion<Type::CPU>(offset, query_size);
163 } 59 }
164 60
165 /// Returns the inclusive GPU modified range in a begin end pair 61 /// Returns the inclusive GPU modified range in a begin end pair
166 [[nodiscard]] std::pair<u64, u64> ModifiedGpuRegion(VAddr query_cpu_addr, 62 [[nodiscard]] std::pair<u64, u64> ModifiedGpuRegion(VAddr query_cpu_addr,
167 u64 query_size) const noexcept { 63 u64 query_size) const noexcept {
168 const u64 offset = query_cpu_addr - cpu_addr; 64 const u64 offset = query_cpu_addr - cpu_addr;
169 return ModifiedRegion<Type::GPU>(offset, query_size); 65 return word_manager.ModifiedRegion<Type::GPU>(offset, query_size);
170 } 66 }
171 67
172 /// Returns true if a region has been modified from the CPU 68 /// Returns true if a region has been modified from the CPU
173 [[nodiscard]] bool IsRegionCpuModified(VAddr query_cpu_addr, u64 query_size) const noexcept { 69 [[nodiscard]] bool IsRegionCpuModified(VAddr query_cpu_addr, u64 query_size) const noexcept {
174 const u64 offset = query_cpu_addr - cpu_addr; 70 const u64 offset = query_cpu_addr - cpu_addr;
175 return IsRegionModified<Type::CPU>(offset, query_size); 71 return word_manager.IsRegionModified<Type::CPU>(offset, query_size);
176 } 72 }
177 73
178 /// Returns true if a region has been modified from the GPU 74 /// Returns true if a region has been modified from the GPU
179 [[nodiscard]] bool IsRegionGpuModified(VAddr query_cpu_addr, u64 query_size) const noexcept { 75 [[nodiscard]] bool IsRegionGpuModified(VAddr query_cpu_addr, u64 query_size) const noexcept {
180 const u64 offset = query_cpu_addr - cpu_addr; 76 const u64 offset = query_cpu_addr - cpu_addr;
181 return IsRegionModified<Type::GPU>(offset, query_size); 77 return word_manager.IsRegionModified<Type::GPU>(offset, query_size);
182 } 78 }
183 79
184 /// Mark region as CPU modified, notifying the rasterizer about this change 80 /// Mark region as CPU modified, notifying the rasterizer about this change
185 void MarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 size) { 81 void MarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 size) {
186 ChangeRegionState<Type::CPU, true>(dirty_cpu_addr, size); 82 word_manager.ChangeRegionState<Type::CPU, true>(dirty_cpu_addr, size);
187 } 83 }
188 84
189 /// Unmark region as CPU modified, notifying the rasterizer about this change 85 /// Unmark region as CPU modified, notifying the rasterizer about this change
190 void UnmarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 size) { 86 void UnmarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 size) {
191 ChangeRegionState<Type::CPU, false>(dirty_cpu_addr, size); 87 word_manager.ChangeRegionState<Type::CPU, false>(dirty_cpu_addr, size);
192 } 88 }
193 89
194 /// Mark region as modified from the host GPU 90 /// Mark region as modified from the host GPU
195 void MarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 size) noexcept { 91 void MarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 size) noexcept {
196 ChangeRegionState<Type::GPU, true>(dirty_cpu_addr, size); 92 word_manager.ChangeRegionState<Type::GPU, true>(dirty_cpu_addr, size);
197 } 93 }
198 94
199 /// Unmark region as modified from the host GPU 95 /// Unmark region as modified from the host GPU
200 void UnmarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 size) noexcept { 96 void UnmarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 size) noexcept {
201 ChangeRegionState<Type::GPU, false>(dirty_cpu_addr, size); 97 word_manager.ChangeRegionState<Type::GPU, false>(dirty_cpu_addr, size);
202 } 98 }
203 99
204 /// Mark region as modified from the CPU 100 /// Mark region as modified from the CPU
205 /// but don't mark it as modified until FlusHCachedWrites is called. 101 /// but don't mark it as modified until FlusHCachedWrites is called.
206 void CachedCpuWrite(VAddr dirty_cpu_addr, u64 size) { 102 void CachedCpuWrite(VAddr dirty_cpu_addr, u64 size) {
207 flags |= BufferFlagBits::CachedWrites; 103 flags |= BufferFlagBits::CachedWrites;
208 ChangeRegionState<Type::CachedCPU, true>(dirty_cpu_addr, size); 104 word_manager.ChangeRegionState<Type::CachedCPU, true>(dirty_cpu_addr, size);
209 } 105 }
210 106
211 /// Flushes cached CPU writes, and notify the rasterizer about the deltas 107 /// Flushes cached CPU writes, and notify the rasterizer about the deltas
212 void FlushCachedWrites() noexcept { 108 void FlushCachedWrites() noexcept {
213 flags &= ~BufferFlagBits::CachedWrites; 109 flags &= ~BufferFlagBits::CachedWrites;
214 const u64 num_words = NumWords(); 110 word_manager.FlushCachedWrites();
215 u64* const cached_words = Array<Type::CachedCPU>();
216 u64* const untracked_words = Array<Type::Untracked>();
217 u64* const cpu_words = Array<Type::CPU>();
218 for (u64 word_index = 0; word_index < num_words; ++word_index) {
219 const u64 cached_bits = cached_words[word_index];
220 NotifyRasterizer<false>(word_index, untracked_words[word_index], cached_bits);
221 untracked_words[word_index] |= cached_bits;
222 cpu_words[word_index] |= cached_bits;
223 if (!Settings::values.use_pessimistic_flushes) {
224 cached_words[word_index] = 0;
225 }
226 }
227 } 111 }
228 112
229 /// Call 'func' for each CPU modified range and unmark those pages as CPU modified 113 /// Call 'func' for each CPU modified range and unmark those pages as CPU modified
230 template <typename Func> 114 template <typename Func>
231 void ForEachUploadRange(VAddr query_cpu_range, u64 size, Func&& func) { 115 void ForEachUploadRange(VAddr query_cpu_range, u64 size, Func&& func) {
232 ForEachModifiedRange<Type::CPU>(query_cpu_range, size, true, func); 116 word_manager.ForEachModifiedRange<Type::CPU>(query_cpu_range, size, true, func);
233 } 117 }
234 118
235 /// Call 'func' for each GPU modified range and unmark those pages as GPU modified 119 /// Call 'func' for each GPU modified range and unmark those pages as GPU modified
236 template <typename Func> 120 template <typename Func>
237 void ForEachDownloadRange(VAddr query_cpu_range, u64 size, bool clear, Func&& func) { 121 void ForEachDownloadRange(VAddr query_cpu_range, u64 size, bool clear, Func&& func) {
238 ForEachModifiedRange<Type::GPU>(query_cpu_range, size, clear, func); 122 word_manager.ForEachModifiedRange<Type::GPU>(query_cpu_range, size, clear, func);
239 } 123 }
240 124
241 template <typename Func> 125 template <typename Func>
242 void ForEachDownloadRangeAndClear(VAddr query_cpu_range, u64 size, Func&& func) { 126 void ForEachDownloadRangeAndClear(VAddr query_cpu_range, u64 size, Func&& func) {
243 ForEachModifiedRange<Type::GPU>(query_cpu_range, size, true, func); 127 word_manager.ForEachModifiedRange<Type::GPU>(query_cpu_range, size, true, func);
244 } 128 }
245 129
246 /// Call 'func' for each GPU modified range and unmark those pages as GPU modified 130 /// Call 'func' for each GPU modified range and unmark those pages as GPU modified
247 template <typename Func> 131 template <typename Func>
248 void ForEachDownloadRange(Func&& func) { 132 void ForEachDownloadRange(Func&& func) {
249 ForEachModifiedRange<Type::GPU>(cpu_addr, SizeBytes(), true, func); 133 word_manager.ForEachModifiedRange<Type::GPU>(cpu_addr, SizeBytes(), true, func);
250 } 134 }
251 135
252 /// Mark buffer as picked 136 /// Mark buffer as picked
@@ -297,7 +181,7 @@ public:
297 181
298 /// Returns the size in bytes of the buffer 182 /// Returns the size in bytes of the buffer
299 [[nodiscard]] u64 SizeBytes() const noexcept { 183 [[nodiscard]] u64 SizeBytes() const noexcept {
300 return words.size_bytes; 184 return word_manager.SizeBytes();
301 } 185 }
302 186
303 size_t getLRUID() const noexcept { 187 size_t getLRUID() const noexcept {
@@ -309,301 +193,8 @@ public:
309 } 193 }
310 194
311private: 195private:
312 template <Type type>
313 u64* Array() noexcept {
314 if constexpr (type == Type::CPU) {
315 return words.cpu.Pointer(IsShort());
316 } else if constexpr (type == Type::GPU) {
317 return words.gpu.Pointer(IsShort());
318 } else if constexpr (type == Type::CachedCPU) {
319 return words.cached_cpu.Pointer(IsShort());
320 } else if constexpr (type == Type::Untracked) {
321 return words.untracked.Pointer(IsShort());
322 }
323 }
324
325 template <Type type>
326 const u64* Array() const noexcept {
327 if constexpr (type == Type::CPU) {
328 return words.cpu.Pointer(IsShort());
329 } else if constexpr (type == Type::GPU) {
330 return words.gpu.Pointer(IsShort());
331 } else if constexpr (type == Type::CachedCPU) {
332 return words.cached_cpu.Pointer(IsShort());
333 } else if constexpr (type == Type::Untracked) {
334 return words.untracked.Pointer(IsShort());
335 }
336 }
337
338 /**
339 * Change the state of a range of pages
340 *
341 * @param dirty_addr Base address to mark or unmark as modified
342 * @param size Size in bytes to mark or unmark as modified
343 */
344 template <Type type, bool enable>
345 void ChangeRegionState(u64 dirty_addr, s64 size) noexcept(type == Type::GPU) {
346 const s64 difference = dirty_addr - cpu_addr;
347 const u64 offset = std::max<s64>(difference, 0);
348 size += std::min<s64>(difference, 0);
349 if (offset >= SizeBytes() || size < 0) {
350 return;
351 }
352 u64* const untracked_words = Array<Type::Untracked>();
353 u64* const state_words = Array<type>();
354 const u64 offset_end = std::min(offset + size, SizeBytes());
355 const u64 begin_page_index = offset / BYTES_PER_PAGE;
356 const u64 begin_word_index = begin_page_index / PAGES_PER_WORD;
357 const u64 end_page_index = Common::DivCeil(offset_end, BYTES_PER_PAGE);
358 const u64 end_word_index = Common::DivCeil(end_page_index, PAGES_PER_WORD);
359 u64 page_index = begin_page_index % PAGES_PER_WORD;
360 u64 word_index = begin_word_index;
361 while (word_index < end_word_index) {
362 const u64 next_word_first_page = (word_index + 1) * PAGES_PER_WORD;
363 const u64 left_offset =
364 std::min(next_word_first_page - end_page_index, PAGES_PER_WORD) % PAGES_PER_WORD;
365 const u64 right_offset = page_index;
366 u64 bits = ~u64{0};
367 bits = (bits >> right_offset) << right_offset;
368 bits = (bits << left_offset) >> left_offset;
369 if constexpr (type == Type::CPU || type == Type::CachedCPU) {
370 NotifyRasterizer<!enable>(word_index, untracked_words[word_index], bits);
371 }
372 if constexpr (enable) {
373 state_words[word_index] |= bits;
374 if constexpr (type == Type::CPU || type == Type::CachedCPU) {
375 untracked_words[word_index] |= bits;
376 }
377 } else {
378 state_words[word_index] &= ~bits;
379 if constexpr (type == Type::CPU || type == Type::CachedCPU) {
380 untracked_words[word_index] &= ~bits;
381 }
382 }
383 page_index = 0;
384 ++word_index;
385 }
386 }
387
388 /**
389 * Notify rasterizer about changes in the CPU tracking state of a word in the buffer
390 *
391 * @param word_index Index to the word to notify to the rasterizer
392 * @param current_bits Current state of the word
393 * @param new_bits New state of the word
394 *
395 * @tparam add_to_rasterizer True when the rasterizer should start tracking the new pages
396 */
397 template <bool add_to_rasterizer>
398 void NotifyRasterizer(u64 word_index, u64 current_bits, u64 new_bits) const {
399 u64 changed_bits = (add_to_rasterizer ? current_bits : ~current_bits) & new_bits;
400 VAddr addr = cpu_addr + word_index * BYTES_PER_WORD;
401 while (changed_bits != 0) {
402 const int empty_bits = std::countr_zero(changed_bits);
403 addr += empty_bits * BYTES_PER_PAGE;
404 changed_bits >>= empty_bits;
405
406 const u32 continuous_bits = std::countr_one(changed_bits);
407 const u64 size = continuous_bits * BYTES_PER_PAGE;
408 const VAddr begin_addr = addr;
409 addr += size;
410 changed_bits = continuous_bits < PAGES_PER_WORD ? (changed_bits >> continuous_bits) : 0;
411 rasterizer->UpdatePagesCachedCount(begin_addr, size, add_to_rasterizer ? 1 : -1);
412 }
413 }
414
415 /**
416 * Loop over each page in the given range, turn off those bits and notify the rasterizer if
417 * needed. Call the given function on each turned off range.
418 *
419 * @param query_cpu_range Base CPU address to loop over
420 * @param size Size in bytes of the CPU range to loop over
421 * @param func Function to call for each turned off region
422 */
423 template <Type type, typename Func>
424 void ForEachModifiedRange(VAddr query_cpu_range, s64 size, bool clear, Func&& func) {
425 static_assert(type != Type::Untracked);
426
427 const s64 difference = query_cpu_range - cpu_addr;
428 const u64 query_begin = std::max<s64>(difference, 0);
429 size += std::min<s64>(difference, 0);
430 if (query_begin >= SizeBytes() || size < 0) {
431 return;
432 }
433 u64* const untracked_words = Array<Type::Untracked>();
434 u64* const state_words = Array<type>();
435 const u64 query_end = query_begin + std::min(static_cast<u64>(size), SizeBytes());
436 u64* const words_begin = state_words + query_begin / BYTES_PER_WORD;
437 u64* const words_end = state_words + Common::DivCeil(query_end, BYTES_PER_WORD);
438
439 const auto modified = [](u64 word) { return word != 0; };
440 const auto first_modified_word = std::find_if(words_begin, words_end, modified);
441 if (first_modified_word == words_end) {
442 // Exit early when the buffer is not modified
443 return;
444 }
445 const auto last_modified_word = std::find_if_not(first_modified_word, words_end, modified);
446
447 const u64 word_index_begin = std::distance(state_words, first_modified_word);
448 const u64 word_index_end = std::distance(state_words, last_modified_word);
449
450 const unsigned local_page_begin = std::countr_zero(*first_modified_word);
451 const unsigned local_page_end =
452 static_cast<unsigned>(PAGES_PER_WORD) - std::countl_zero(last_modified_word[-1]);
453 const u64 word_page_begin = word_index_begin * PAGES_PER_WORD;
454 const u64 word_page_end = (word_index_end - 1) * PAGES_PER_WORD;
455 const u64 query_page_begin = query_begin / BYTES_PER_PAGE;
456 const u64 query_page_end = Common::DivCeil(query_end, BYTES_PER_PAGE);
457 const u64 page_index_begin = std::max(word_page_begin + local_page_begin, query_page_begin);
458 const u64 page_index_end = std::min(word_page_end + local_page_end, query_page_end);
459 const u64 first_word_page_begin = page_index_begin % PAGES_PER_WORD;
460 const u64 last_word_page_end = (page_index_end - 1) % PAGES_PER_WORD + 1;
461
462 u64 page_begin = first_word_page_begin;
463 u64 current_base = 0;
464 u64 current_size = 0;
465 bool on_going = false;
466 for (u64 word_index = word_index_begin; word_index < word_index_end; ++word_index) {
467 const bool is_last_word = word_index + 1 == word_index_end;
468 const u64 page_end = is_last_word ? last_word_page_end : PAGES_PER_WORD;
469 const u64 right_offset = page_begin;
470 const u64 left_offset = PAGES_PER_WORD - page_end;
471 u64 bits = ~u64{0};
472 bits = (bits >> right_offset) << right_offset;
473 bits = (bits << left_offset) >> left_offset;
474
475 const u64 current_word = state_words[word_index] & bits;
476 if (clear) {
477 state_words[word_index] &= ~bits;
478 }
479
480 if constexpr (type == Type::CPU) {
481 const u64 current_bits = untracked_words[word_index] & bits;
482 untracked_words[word_index] &= ~bits;
483 NotifyRasterizer<true>(word_index, current_bits, ~u64{0});
484 }
485 // Exclude CPU modified pages when visiting GPU pages
486 const u64 word = current_word & ~(type == Type::GPU ? untracked_words[word_index] : 0);
487 u64 page = page_begin;
488 page_begin = 0;
489
490 while (page < page_end) {
491 const int empty_bits = std::countr_zero(word >> page);
492 if (on_going && empty_bits != 0) {
493 InvokeModifiedRange(func, current_size, current_base);
494 current_size = 0;
495 on_going = false;
496 }
497 if (empty_bits == PAGES_PER_WORD) {
498 break;
499 }
500 page += empty_bits;
501
502 const int continuous_bits = std::countr_one(word >> page);
503 if (!on_going && continuous_bits != 0) {
504 current_base = word_index * PAGES_PER_WORD + page;
505 on_going = true;
506 }
507 current_size += continuous_bits;
508 page += continuous_bits;
509 }
510 }
511 if (on_going && current_size > 0) {
512 InvokeModifiedRange(func, current_size, current_base);
513 }
514 }
515
516 template <typename Func>
517 void InvokeModifiedRange(Func&& func, u64 current_size, u64 current_base) {
518 const u64 current_size_bytes = current_size * BYTES_PER_PAGE;
519 const u64 offset_begin = current_base * BYTES_PER_PAGE;
520 const u64 offset_end = std::min(offset_begin + current_size_bytes, SizeBytes());
521 func(offset_begin, offset_end - offset_begin);
522 }
523
524 /**
525 * Returns true when a region has been modified
526 *
527 * @param offset Offset in bytes from the start of the buffer
528 * @param size Size in bytes of the region to query for modifications
529 */
530 template <Type type>
531 [[nodiscard]] bool IsRegionModified(u64 offset, u64 size) const noexcept {
532 static_assert(type != Type::Untracked);
533
534 const u64* const untracked_words = Array<Type::Untracked>();
535 const u64* const state_words = Array<type>();
536 const u64 num_query_words = size / BYTES_PER_WORD + 1;
537 const u64 word_begin = offset / BYTES_PER_WORD;
538 const u64 word_end = std::min<u64>(word_begin + num_query_words, NumWords());
539 const u64 page_limit = Common::DivCeil(offset + size, BYTES_PER_PAGE);
540 u64 page_index = (offset / BYTES_PER_PAGE) % PAGES_PER_WORD;
541 for (u64 word_index = word_begin; word_index < word_end; ++word_index, page_index = 0) {
542 const u64 off_word = type == Type::GPU ? untracked_words[word_index] : 0;
543 const u64 word = state_words[word_index] & ~off_word;
544 if (word == 0) {
545 continue;
546 }
547 const u64 page_end = std::min((word_index + 1) * PAGES_PER_WORD, page_limit);
548 const u64 local_page_end = page_end % PAGES_PER_WORD;
549 const u64 page_end_shift = (PAGES_PER_WORD - local_page_end) % PAGES_PER_WORD;
550 if (((word >> page_index) << page_index) << page_end_shift != 0) {
551 return true;
552 }
553 }
554 return false;
555 }
556
557 /**
558 * Returns a begin end pair with the inclusive modified region
559 *
560 * @param offset Offset in bytes from the start of the buffer
561 * @param size Size in bytes of the region to query for modifications
562 */
563 template <Type type>
564 [[nodiscard]] std::pair<u64, u64> ModifiedRegion(u64 offset, u64 size) const noexcept {
565 static_assert(type != Type::Untracked);
566
567 const u64* const untracked_words = Array<Type::Untracked>();
568 const u64* const state_words = Array<type>();
569 const u64 num_query_words = size / BYTES_PER_WORD + 1;
570 const u64 word_begin = offset / BYTES_PER_WORD;
571 const u64 word_end = std::min<u64>(word_begin + num_query_words, NumWords());
572 const u64 page_base = offset / BYTES_PER_PAGE;
573 const u64 page_limit = Common::DivCeil(offset + size, BYTES_PER_PAGE);
574 u64 begin = std::numeric_limits<u64>::max();
575 u64 end = 0;
576 for (u64 word_index = word_begin; word_index < word_end; ++word_index) {
577 const u64 off_word = type == Type::GPU ? untracked_words[word_index] : 0;
578 const u64 word = state_words[word_index] & ~off_word;
579 if (word == 0) {
580 continue;
581 }
582 const u64 local_page_begin = std::countr_zero(word);
583 const u64 local_page_end = PAGES_PER_WORD - std::countl_zero(word);
584 const u64 page_index = word_index * PAGES_PER_WORD;
585 const u64 page_begin = std::max(page_index + local_page_begin, page_base);
586 const u64 page_end = std::min(page_index + local_page_end, page_limit);
587 begin = std::min(begin, page_begin);
588 end = std::max(end, page_end);
589 }
590 static constexpr std::pair<u64, u64> EMPTY{0, 0};
591 return begin < end ? std::make_pair(begin * BYTES_PER_PAGE, end * BYTES_PER_PAGE) : EMPTY;
592 }
593
594 /// Returns the number of words of the buffer
595 [[nodiscard]] size_t NumWords() const noexcept {
596 return words.NumWords();
597 }
598
599 /// Returns true when the buffer fits in the small vector optimization
600 [[nodiscard]] bool IsShort() const noexcept {
601 return words.IsShort();
602 }
603
604 RasterizerInterface* rasterizer = nullptr;
605 VAddr cpu_addr = 0; 196 VAddr cpu_addr = 0;
606 Words words; 197 WordManager<RasterizerInterface> word_manager;
607 BufferFlagBits flags{}; 198 BufferFlagBits flags{};
608 int stream_score = 0; 199 int stream_score = 0;
609 size_t lru_id = SIZE_MAX; 200 size_t lru_id = SIZE_MAX;
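
Everything deleted above is the per-buffer dirty tracking: one bit per BYTES_PER_PAGE page, 64 pages per u64 word, kept in separate CPU/GPU/cached/untracked planes. That bookkeeping now sits behind the WordManager member (word_manager.h, added by this commit but not included in this excerpt). For reference, a minimal standalone sketch of the range-to-mask arithmetic the removed ChangeRegionState relied on, using the constants from the removed code (the 4 KiB value standing in for Core::Memory::YUZU_PAGESIZE is an assumption here):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    // Constants as in the removed BufferBase code; the page size is assumed to be 4 KiB.
    constexpr std::uint64_t PAGES_PER_WORD = 64;
    constexpr std::uint64_t BYTES_PER_PAGE = 4096;

    // Bit mask covering pages [begin_page, end_page) within one word, mirroring the
    // (bits >> right) << right and (bits << left) >> left trick used above.
    std::uint64_t PageMask(std::uint64_t begin_page, std::uint64_t end_page) {
        const std::uint64_t right = begin_page;               // clear pages below the range
        const std::uint64_t left = PAGES_PER_WORD - end_page; // clear pages above the range
        std::uint64_t bits = ~std::uint64_t{0};
        bits = (bits >> right) << right;
        bits = (bits << left) >> left;
        return bits;
    }

    int main() {
        // Marking bytes [0x1800, 0x5000) dirty touches pages 1 through 4 of the first word.
        const std::uint64_t offset = 0x1800;
        const std::uint64_t size = 0x5000 - 0x1800;
        const std::uint64_t begin_page = (offset / BYTES_PER_PAGE) % PAGES_PER_WORD;
        const std::uint64_t end_page =
            std::min((offset + size + BYTES_PER_PAGE - 1) / BYTES_PER_PAGE, PAGES_PER_WORD);
        // Prints mask = 0x1e, i.e. bits 1 through 4 set.
        std::printf("mask = %#llx\n",
                    static_cast<unsigned long long>(PageMask(begin_page, end_page)));
    }
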
diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp
index a16308b60..40db243d2 100644
--- a/src/video_core/buffer_cache/buffer_cache.cpp
+++ b/src/video_core/buffer_cache/buffer_cache.cpp
@@ -1,5 +1,5 @@
-// SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project
-// SPDX-License-Identifier: GPL-2.0-or-later
+// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
 
 #include "common/microprofile.h"
 
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index abdc593df..a0701ce4e 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -1,482 +1,21 @@
1// SPDX-FileCopyrightText: Copyright 2019 yuzu Emulator Project 1// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
2// SPDX-License-Identifier: GPL-2.0-or-later 2// SPDX-License-Identifier: GPL-3.0-or-later
3 3
4#pragma once 4#pragma once
5 5
6#include <algorithm> 6#include <algorithm>
7#include <array>
8#include <memory> 7#include <memory>
9#include <mutex>
10#include <numeric> 8#include <numeric>
11#include <span>
12#include <vector>
13
14#include <boost/container/small_vector.hpp>
15#include <boost/icl/interval_set.hpp>
16
17#include "common/common_types.h"
18#include "common/div_ceil.h"
19#include "common/literals.h"
20#include "common/lru_cache.h"
21#include "common/microprofile.h"
22#include "common/polyfill_ranges.h"
23#include "common/scratch_buffer.h"
24#include "common/settings.h"
25#include "core/memory.h"
26#include "video_core/buffer_cache/buffer_base.h"
27#include "video_core/control/channel_state_cache.h"
28#include "video_core/delayed_destruction_ring.h"
29#include "video_core/dirty_flags.h"
30#include "video_core/engines/draw_manager.h"
31#include "video_core/engines/kepler_compute.h"
32#include "video_core/engines/maxwell_3d.h"
33#include "video_core/memory_manager.h"
34#include "video_core/rasterizer_interface.h"
35#include "video_core/surface.h"
36#include "video_core/texture_cache/slot_vector.h"
37#include "video_core/texture_cache/types.h"
38 9
39namespace VideoCommon { 10#include "video_core/buffer_cache/buffer_cache_base.h"
40
41MICROPROFILE_DECLARE(GPU_PrepareBuffers);
42MICROPROFILE_DECLARE(GPU_BindUploadBuffers);
43MICROPROFILE_DECLARE(GPU_DownloadMemory);
44
45using BufferId = SlotId;
46
47using VideoCore::Surface::PixelFormat;
48using namespace Common::Literals;
49
50constexpr u32 NUM_VERTEX_BUFFERS = 32;
51constexpr u32 NUM_TRANSFORM_FEEDBACK_BUFFERS = 4;
52constexpr u32 NUM_GRAPHICS_UNIFORM_BUFFERS = 18;
53constexpr u32 NUM_COMPUTE_UNIFORM_BUFFERS = 8;
54constexpr u32 NUM_STORAGE_BUFFERS = 16;
55constexpr u32 NUM_TEXTURE_BUFFERS = 16;
56constexpr u32 NUM_STAGES = 5;
57
58enum class ObtainBufferSynchronize : u32 {
59 NoSynchronize = 0,
60 FullSynchronize = 1,
61 SynchronizeNoDirty = 2,
62};
63
64enum class ObtainBufferOperation : u32 {
65 DoNothing = 0,
66 MarkAsWritten = 1,
67 DiscardWrite = 2,
68 MarkQuery = 3,
69};
70
71using UniformBufferSizes = std::array<std::array<u32, NUM_GRAPHICS_UNIFORM_BUFFERS>, NUM_STAGES>;
72using ComputeUniformBufferSizes = std::array<u32, NUM_COMPUTE_UNIFORM_BUFFERS>;
73
74template <typename P>
75class BufferCache : public VideoCommon::ChannelSetupCaches<VideoCommon::ChannelInfo> {
76
77 // Page size for caching purposes.
78 // This is unrelated to the CPU page size and it can be changed as it seems optimal.
79 static constexpr u32 YUZU_PAGEBITS = 16;
80 static constexpr u64 YUZU_PAGESIZE = u64{1} << YUZU_PAGEBITS;
81
82 static constexpr bool IS_OPENGL = P::IS_OPENGL;
83 static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS =
84 P::HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS;
85 static constexpr bool HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT =
86 P::HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT;
87 static constexpr bool NEEDS_BIND_UNIFORM_INDEX = P::NEEDS_BIND_UNIFORM_INDEX;
88 static constexpr bool NEEDS_BIND_STORAGE_INDEX = P::NEEDS_BIND_STORAGE_INDEX;
89 static constexpr bool USE_MEMORY_MAPS = P::USE_MEMORY_MAPS;
90 static constexpr bool SEPARATE_IMAGE_BUFFERS_BINDINGS = P::SEPARATE_IMAGE_BUFFER_BINDINGS;
91
92 static constexpr BufferId NULL_BUFFER_ID{0};
93
94 static constexpr s64 DEFAULT_EXPECTED_MEMORY = 512_MiB;
95 static constexpr s64 DEFAULT_CRITICAL_MEMORY = 1_GiB;
96 static constexpr s64 TARGET_THRESHOLD = 4_GiB;
97
98 using Maxwell = Tegra::Engines::Maxwell3D::Regs;
99
100 using Runtime = typename P::Runtime;
101 using Buffer = typename P::Buffer;
102
103 using IntervalSet = boost::icl::interval_set<VAddr>;
104 using IntervalType = typename IntervalSet::interval_type;
105
106 struct Empty {};
107
108 struct OverlapResult {
109 std::vector<BufferId> ids;
110 VAddr begin;
111 VAddr end;
112 bool has_stream_leap = false;
113 };
114
115 struct Binding {
116 VAddr cpu_addr{};
117 u32 size{};
118 BufferId buffer_id;
119 };
120
121 struct TextureBufferBinding : Binding {
122 PixelFormat format;
123 };
124
125 static constexpr Binding NULL_BINDING{
126 .cpu_addr = 0,
127 .size = 0,
128 .buffer_id = NULL_BUFFER_ID,
129 };
130
131public:
132 static constexpr u32 DEFAULT_SKIP_CACHE_SIZE = static_cast<u32>(4_KiB);
133
134 explicit BufferCache(VideoCore::RasterizerInterface& rasterizer_,
135 Core::Memory::Memory& cpu_memory_, Runtime& runtime_);
136
137 void TickFrame();
138
139 void WriteMemory(VAddr cpu_addr, u64 size);
140
141 void CachedWriteMemory(VAddr cpu_addr, u64 size);
142
143 void DownloadMemory(VAddr cpu_addr, u64 size);
144
145 bool InlineMemory(VAddr dest_address, size_t copy_size, std::span<const u8> inlined_buffer);
146
147 void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size);
148
149 void DisableGraphicsUniformBuffer(size_t stage, u32 index);
150
151 void UpdateGraphicsBuffers(bool is_indexed);
152
153 void UpdateComputeBuffers();
154
155 void BindHostGeometryBuffers(bool is_indexed);
156
157 void BindHostStageBuffers(size_t stage);
158
159 void BindHostComputeBuffers();
160
161 void SetUniformBuffersState(const std::array<u32, NUM_STAGES>& mask,
162 const UniformBufferSizes* sizes);
163
164 void SetComputeUniformBufferState(u32 mask, const ComputeUniformBufferSizes* sizes);
165
166 void UnbindGraphicsStorageBuffers(size_t stage);
167
168 void BindGraphicsStorageBuffer(size_t stage, size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset,
169 bool is_written);
170
171 void UnbindGraphicsTextureBuffers(size_t stage);
172
173 void BindGraphicsTextureBuffer(size_t stage, size_t tbo_index, GPUVAddr gpu_addr, u32 size,
174 PixelFormat format, bool is_written, bool is_image);
175
176 void UnbindComputeStorageBuffers();
177
178 void BindComputeStorageBuffer(size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset,
179 bool is_written);
180
181 void UnbindComputeTextureBuffers();
182
183 void BindComputeTextureBuffer(size_t tbo_index, GPUVAddr gpu_addr, u32 size, PixelFormat format,
184 bool is_written, bool is_image);
185
186 void FlushCachedWrites();
187
188 /// Return true when there are uncommitted buffers to be downloaded
189 [[nodiscard]] bool HasUncommittedFlushes() const noexcept;
190
191 void AccumulateFlushes();
192
193 /// Return true when the caller should wait for async downloads
194 [[nodiscard]] bool ShouldWaitAsyncFlushes() const noexcept;
195
196 /// Commit asynchronous downloads
197 void CommitAsyncFlushes();
198 void CommitAsyncFlushesHigh();
199
200 /// Pop asynchronous downloads
201 void PopAsyncFlushes();
202
203 bool DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 amount);
204
205 bool DMAClear(GPUVAddr src_address, u64 amount, u32 value);
206
207 [[nodiscard]] std::pair<Buffer*, u32> ObtainBuffer(GPUVAddr gpu_addr, u32 size,
208 ObtainBufferSynchronize sync_info,
209 ObtainBufferOperation post_op);
210
211 /// Return true when a CPU region is modified from the GPU
212 [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size);
213
214 /// Return true when a region is registered on the cache
215 [[nodiscard]] bool IsRegionRegistered(VAddr addr, size_t size);
216
217 /// Return true when a CPU region is modified from the CPU
218 [[nodiscard]] bool IsRegionCpuModified(VAddr addr, size_t size);
219
220 void SetDrawIndirect(
221 const Tegra::Engines::DrawManager::IndirectParams* current_draw_indirect_) {
222 current_draw_indirect = current_draw_indirect_;
223 }
224
225 [[nodiscard]] std::pair<Buffer*, u32> GetDrawIndirectCount();
226
227 [[nodiscard]] std::pair<Buffer*, u32> GetDrawIndirectBuffer();
228
229 std::recursive_mutex mutex;
230 Runtime& runtime;
231
232private:
233 template <typename Func>
234 static void ForEachEnabledBit(u32 enabled_mask, Func&& func) {
235 for (u32 index = 0; enabled_mask != 0; ++index, enabled_mask >>= 1) {
236 const int disabled_bits = std::countr_zero(enabled_mask);
237 index += disabled_bits;
238 enabled_mask >>= disabled_bits;
239 func(index);
240 }
241 }
242
243 template <typename Func>
244 void ForEachBufferInRange(VAddr cpu_addr, u64 size, Func&& func) {
245 const u64 page_end = Common::DivCeil(cpu_addr + size, YUZU_PAGESIZE);
246 for (u64 page = cpu_addr >> YUZU_PAGEBITS; page < page_end;) {
247 const BufferId buffer_id = page_table[page];
248 if (!buffer_id) {
249 ++page;
250 continue;
251 }
252 Buffer& buffer = slot_buffers[buffer_id];
253 func(buffer_id, buffer);
254
255 const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes();
256 page = Common::DivCeil(end_addr, YUZU_PAGESIZE);
257 }
258 }
259
260 template <typename Func>
261 void ForEachWrittenRange(VAddr cpu_addr, u64 size, Func&& func) {
262 const VAddr start_address = cpu_addr;
263 const VAddr end_address = start_address + size;
264 const VAddr search_base =
265 static_cast<VAddr>(std::min<s64>(0LL, static_cast<s64>(start_address - size)));
266 const IntervalType search_interval{search_base, search_base + 1};
267 auto it = common_ranges.lower_bound(search_interval);
268 if (it == common_ranges.end()) {
269 it = common_ranges.begin();
270 }
271 for (; it != common_ranges.end(); it++) {
272 VAddr inter_addr_end = it->upper();
273 VAddr inter_addr = it->lower();
274 if (inter_addr >= end_address) {
275 break;
276 }
277 if (inter_addr_end <= start_address) {
278 continue;
279 }
280 if (inter_addr_end > end_address) {
281 inter_addr_end = end_address;
282 }
283 if (inter_addr < start_address) {
284 inter_addr = start_address;
285 }
286 func(inter_addr, inter_addr_end);
287 }
288 }
289
290 static bool IsRangeGranular(VAddr cpu_addr, size_t size) {
291 return (cpu_addr & ~Core::Memory::YUZU_PAGEMASK) ==
292 ((cpu_addr + size) & ~Core::Memory::YUZU_PAGEMASK);
293 }
294
295 void RunGarbageCollector();
296
297 void BindHostIndexBuffer();
298
299 void BindHostVertexBuffers();
300
301 void BindHostDrawIndirectBuffers();
302
303 void BindHostGraphicsUniformBuffers(size_t stage);
304
305 void BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 binding_index, bool needs_bind);
306
307 void BindHostGraphicsStorageBuffers(size_t stage);
308
309 void BindHostGraphicsTextureBuffers(size_t stage);
310
311 void BindHostTransformFeedbackBuffers();
312
313 void BindHostComputeUniformBuffers();
314
315 void BindHostComputeStorageBuffers();
316
317 void BindHostComputeTextureBuffers();
318
319 void DoUpdateGraphicsBuffers(bool is_indexed);
320
321 void DoUpdateComputeBuffers();
322
323 void UpdateIndexBuffer();
324
325 void UpdateVertexBuffers();
326
327 void UpdateVertexBuffer(u32 index);
328
329 void UpdateDrawIndirect();
330
331 void UpdateUniformBuffers(size_t stage);
332
333 void UpdateStorageBuffers(size_t stage);
334
335 void UpdateTextureBuffers(size_t stage);
336
337 void UpdateTransformFeedbackBuffers();
338
339 void UpdateTransformFeedbackBuffer(u32 index);
340
341 void UpdateComputeUniformBuffers();
342
343 void UpdateComputeStorageBuffers();
344
345 void UpdateComputeTextureBuffers();
346
347 void MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 size);
348
349 [[nodiscard]] BufferId FindBuffer(VAddr cpu_addr, u32 size);
350
351 [[nodiscard]] OverlapResult ResolveOverlaps(VAddr cpu_addr, u32 wanted_size);
352
353 void JoinOverlap(BufferId new_buffer_id, BufferId overlap_id, bool accumulate_stream_score);
354
355 [[nodiscard]] BufferId CreateBuffer(VAddr cpu_addr, u32 wanted_size);
356
357 void Register(BufferId buffer_id);
358
359 void Unregister(BufferId buffer_id);
360
361 template <bool insert>
362 void ChangeRegister(BufferId buffer_id);
363
364 void TouchBuffer(Buffer& buffer, BufferId buffer_id) noexcept;
365
366 bool SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size);
367
368 bool SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size);
369
370 void UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy,
371 std::span<BufferCopy> copies);
372
373 void ImmediateUploadMemory(Buffer& buffer, u64 largest_copy,
374 std::span<const BufferCopy> copies);
375
376 void MappedUploadMemory(Buffer& buffer, u64 total_size_bytes, std::span<BufferCopy> copies);
377
378 void DownloadBufferMemory(Buffer& buffer_id);
379
380 void DownloadBufferMemory(Buffer& buffer_id, VAddr cpu_addr, u64 size);
381
382 void DeleteBuffer(BufferId buffer_id);
383
384 void NotifyBufferDeletion();
385
386 [[nodiscard]] Binding StorageBufferBinding(GPUVAddr ssbo_addr, u32 cbuf_index,
387 bool is_written = false) const;
388
389 [[nodiscard]] TextureBufferBinding GetTextureBufferBinding(GPUVAddr gpu_addr, u32 size,
390 PixelFormat format);
391
392 [[nodiscard]] std::span<const u8> ImmediateBufferWithData(VAddr cpu_addr, size_t size);
393
394 [[nodiscard]] std::span<u8> ImmediateBuffer(size_t wanted_capacity);
395
396 [[nodiscard]] bool HasFastUniformBufferBound(size_t stage, u32 binding_index) const noexcept;
397
398 void ClearDownload(IntervalType subtract_interval);
399
400 VideoCore::RasterizerInterface& rasterizer;
401 Core::Memory::Memory& cpu_memory;
402
403 SlotVector<Buffer> slot_buffers;
404 DelayedDestructionRing<Buffer, 8> delayed_destruction_ring;
405
406 const Tegra::Engines::DrawManager::IndirectParams* current_draw_indirect{};
407
408 u32 last_index_count = 0;
409
410 Binding index_buffer;
411 std::array<Binding, NUM_VERTEX_BUFFERS> vertex_buffers;
412 std::array<std::array<Binding, NUM_GRAPHICS_UNIFORM_BUFFERS>, NUM_STAGES> uniform_buffers;
413 std::array<std::array<Binding, NUM_STORAGE_BUFFERS>, NUM_STAGES> storage_buffers;
414 std::array<std::array<TextureBufferBinding, NUM_TEXTURE_BUFFERS>, NUM_STAGES> texture_buffers;
415 std::array<Binding, NUM_TRANSFORM_FEEDBACK_BUFFERS> transform_feedback_buffers;
416 Binding count_buffer_binding;
417 Binding indirect_buffer_binding;
418
419 std::array<Binding, NUM_COMPUTE_UNIFORM_BUFFERS> compute_uniform_buffers;
420 std::array<Binding, NUM_STORAGE_BUFFERS> compute_storage_buffers;
421 std::array<TextureBufferBinding, NUM_TEXTURE_BUFFERS> compute_texture_buffers;
422
423 std::array<u32, NUM_STAGES> enabled_uniform_buffer_masks{};
424 u32 enabled_compute_uniform_buffer_mask = 0;
425
426 const UniformBufferSizes* uniform_buffer_sizes{};
427 const ComputeUniformBufferSizes* compute_uniform_buffer_sizes{};
428
429 std::array<u32, NUM_STAGES> enabled_storage_buffers{};
430 std::array<u32, NUM_STAGES> written_storage_buffers{};
431 u32 enabled_compute_storage_buffers = 0;
432 u32 written_compute_storage_buffers = 0;
433
434 std::array<u32, NUM_STAGES> enabled_texture_buffers{};
435 std::array<u32, NUM_STAGES> written_texture_buffers{};
436 std::array<u32, NUM_STAGES> image_texture_buffers{};
437 u32 enabled_compute_texture_buffers = 0;
438 u32 written_compute_texture_buffers = 0;
439 u32 image_compute_texture_buffers = 0;
440
441 std::array<u32, 16> uniform_cache_hits{};
442 std::array<u32, 16> uniform_cache_shots{};
443
444 u32 uniform_buffer_skip_cache_size = DEFAULT_SKIP_CACHE_SIZE;
445
446 bool has_deleted_buffers = false;
447
448 std::conditional_t<HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS, std::array<u32, NUM_STAGES>, Empty>
449 dirty_uniform_buffers{};
450 std::conditional_t<IS_OPENGL, std::array<u32, NUM_STAGES>, Empty> fast_bound_uniform_buffers{};
451 std::conditional_t<HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS,
452 std::array<std::array<u32, NUM_GRAPHICS_UNIFORM_BUFFERS>, NUM_STAGES>, Empty>
453 uniform_buffer_binding_sizes{};
454
455 std::vector<BufferId> cached_write_buffer_ids;
456
457 IntervalSet uncommitted_ranges;
458 IntervalSet common_ranges;
459 std::deque<IntervalSet> committed_ranges;
460
461 Common::ScratchBuffer<u8> immediate_buffer_alloc;
462
463 struct LRUItemParams {
464 using ObjectType = BufferId;
465 using TickType = u64;
466 };
467 Common::LeastRecentlyUsedCache<LRUItemParams> lru_cache;
468 u64 frame_tick = 0;
469 u64 total_used_memory = 0;
470 u64 minimum_memory = 0;
471 u64 critical_memory = 0;
472 11
473 std::array<BufferId, ((1ULL << 39) >> YUZU_PAGEBITS)> page_table; 12namespace VideoCommon {
474};
475 13
476template <class P> 14template <class P>
477BufferCache<P>::BufferCache(VideoCore::RasterizerInterface& rasterizer_, 15BufferCache<P>::BufferCache(VideoCore::RasterizerInterface& rasterizer_,
478 Core::Memory::Memory& cpu_memory_, Runtime& runtime_) 16 Core::Memory::Memory& cpu_memory_, Runtime& runtime_)
479 : runtime{runtime_}, rasterizer{rasterizer_}, cpu_memory{cpu_memory_} { 17 : runtime{runtime_}, rasterizer{rasterizer_}, cpu_memory{cpu_memory_}, memory_tracker{
18 rasterizer} {
480 // Ensure the first slot is used for the null buffer 19 // Ensure the first slot is used for the null buffer
481 void(slot_buffers.insert(runtime, NullBufferParams{})); 20 void(slot_buffers.insert(runtime, NullBufferParams{}));
482 common_ranges.clear(); 21 common_ranges.clear();
@@ -547,19 +86,18 @@ void BufferCache<P>::TickFrame() {
547 86
548template <class P> 87template <class P>
549void BufferCache<P>::WriteMemory(VAddr cpu_addr, u64 size) { 88void BufferCache<P>::WriteMemory(VAddr cpu_addr, u64 size) {
550 ForEachBufferInRange(cpu_addr, size, [&](BufferId, Buffer& buffer) { 89 memory_tracker.MarkRegionAsCpuModified(cpu_addr, size);
551 buffer.MarkRegionAsCpuModified(cpu_addr, size); 90 const IntervalType subtract_interval{cpu_addr, cpu_addr + size};
552 }); 91 ClearDownload(subtract_interval);
92 common_ranges.subtract(subtract_interval);
553} 93}
554 94
555template <class P> 95template <class P>
556void BufferCache<P>::CachedWriteMemory(VAddr cpu_addr, u64 size) { 96void BufferCache<P>::CachedWriteMemory(VAddr cpu_addr, u64 size) {
557 ForEachBufferInRange(cpu_addr, size, [&](BufferId buffer_id, Buffer& buffer) { 97 memory_tracker.CachedCpuWrite(cpu_addr, size);
558 if (!buffer.HasCachedWrites()) { 98 const IntervalType add_interval{Common::AlignDown(cpu_addr, YUZU_PAGESIZE),
559 cached_write_buffer_ids.push_back(buffer_id); 99 Common::AlignUp(cpu_addr + size, YUZU_PAGESIZE)};
560 } 100 cached_ranges.add(add_interval);
561 buffer.CachedCpuWrite(cpu_addr, size);
562 });
563} 101}
564 102
565template <class P> 103template <class P>
@@ -572,6 +110,9 @@ void BufferCache<P>::DownloadMemory(VAddr cpu_addr, u64 size) {
572template <class P> 110template <class P>
573void BufferCache<P>::ClearDownload(IntervalType subtract_interval) { 111void BufferCache<P>::ClearDownload(IntervalType subtract_interval) {
574 uncommitted_ranges.subtract(subtract_interval); 112 uncommitted_ranges.subtract(subtract_interval);
113 for (auto& interval_set : async_downloads) {
114 interval_set.subtract(subtract_interval);
115 }
575 for (auto& interval_set : committed_ranges) { 116 for (auto& interval_set : committed_ranges) {
576 interval_set.subtract(subtract_interval); 117 interval_set.subtract(subtract_interval);
577 } 118 }
@@ -611,15 +152,19 @@ bool BufferCache<P>::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am
611 }}; 152 }};
612 153
613 boost::container::small_vector<IntervalType, 4> tmp_intervals; 154 boost::container::small_vector<IntervalType, 4> tmp_intervals;
155 const bool is_high_accuracy =
156 Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::High;
614 auto mirror = [&](VAddr base_address, VAddr base_address_end) { 157 auto mirror = [&](VAddr base_address, VAddr base_address_end) {
615 const u64 size = base_address_end - base_address; 158 const u64 size = base_address_end - base_address;
616 const VAddr diff = base_address - *cpu_src_address; 159 const VAddr diff = base_address - *cpu_src_address;
617 const VAddr new_base_address = *cpu_dest_address + diff; 160 const VAddr new_base_address = *cpu_dest_address + diff;
618 const IntervalType add_interval{new_base_address, new_base_address + size}; 161 const IntervalType add_interval{new_base_address, new_base_address + size};
619 uncommitted_ranges.add(add_interval);
620 tmp_intervals.push_back(add_interval); 162 tmp_intervals.push_back(add_interval);
163 if (is_high_accuracy) {
164 uncommitted_ranges.add(add_interval);
165 }
621 }; 166 };
622 ForEachWrittenRange(*cpu_src_address, amount, mirror); 167 ForEachInRangeSet(common_ranges, *cpu_src_address, amount, mirror);
623 // This subtraction in this order is important for overlapping copies. 168 // This subtraction in this order is important for overlapping copies.
624 common_ranges.subtract(subtract_interval); 169 common_ranges.subtract(subtract_interval);
625 const bool has_new_downloads = tmp_intervals.size() != 0; 170 const bool has_new_downloads = tmp_intervals.size() != 0;
@@ -628,7 +173,7 @@ bool BufferCache<P>::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am
628 } 173 }
629 runtime.CopyBuffer(dest_buffer, src_buffer, copies); 174 runtime.CopyBuffer(dest_buffer, src_buffer, copies);
630 if (has_new_downloads) { 175 if (has_new_downloads) {
631 dest_buffer.MarkRegionAsGpuModified(*cpu_dest_address, amount); 176 memory_tracker.MarkRegionAsGpuModified(*cpu_dest_address, amount);
632 } 177 }
633 std::vector<u8> tmp_buffer(amount); 178 std::vector<u8> tmp_buffer(amount);
634 cpu_memory.ReadBlockUnsafe(*cpu_src_address, tmp_buffer.data(), amount); 179 cpu_memory.ReadBlockUnsafe(*cpu_src_address, tmp_buffer.data(), amount);
@@ -866,23 +411,24 @@ void BufferCache<P>::BindComputeTextureBuffer(size_t tbo_index, GPUVAddr gpu_add
866 411
867template <class P> 412template <class P>
868void BufferCache<P>::FlushCachedWrites() { 413void BufferCache<P>::FlushCachedWrites() {
869 for (const BufferId buffer_id : cached_write_buffer_ids) {
870 slot_buffers[buffer_id].FlushCachedWrites();
871 }
872 cached_write_buffer_ids.clear(); 414 cached_write_buffer_ids.clear();
415 memory_tracker.FlushCachedWrites();
416 /*for (auto& interval : cached_ranges) {
417 VAddr cpu_addr = interval.lower();
418 const std::size_t size = interval.upper() - interval.lower();
419 memory_tracker.FlushCachedWrites(cpu_addr, size);
420 // common_ranges.subtract(interval);
421 }*/
422 cached_ranges.clear();
873} 423}
874 424
875template <class P> 425template <class P>
876bool BufferCache<P>::HasUncommittedFlushes() const noexcept { 426bool BufferCache<P>::HasUncommittedFlushes() const noexcept {
877 return !uncommitted_ranges.empty() || !committed_ranges.empty(); 427 return !uncommitted_ranges.empty() || !committed_ranges.empty() || !pending_queries.empty();
878} 428}
879 429
880template <class P> 430template <class P>
881void BufferCache<P>::AccumulateFlushes() { 431void BufferCache<P>::AccumulateFlushes() {
882 if (Settings::values.gpu_accuracy.GetValue() != Settings::GPUAccuracy::High) {
883 uncommitted_ranges.clear();
884 return;
885 }
886 if (uncommitted_ranges.empty()) { 432 if (uncommitted_ranges.empty()) {
887 return; 433 return;
888 } 434 }
@@ -891,7 +437,8 @@ void BufferCache<P>::AccumulateFlushes() {
891 437
892template <class P> 438template <class P>
893bool BufferCache<P>::ShouldWaitAsyncFlushes() const noexcept { 439bool BufferCache<P>::ShouldWaitAsyncFlushes() const noexcept {
894 return false; 440 return (!async_buffers.empty() && async_buffers.front().has_value()) ||
441 (!query_async_buffers.empty() && query_async_buffers.front().has_value());
895} 442}
896 443
897template <class P> 444template <class P>
@@ -899,11 +446,10 @@ void BufferCache<P>::CommitAsyncFlushesHigh() {
899 AccumulateFlushes(); 446 AccumulateFlushes();
900 447
901 if (committed_ranges.empty()) { 448 if (committed_ranges.empty()) {
449 async_buffers.emplace_back(std::optional<Async_Buffer>{});
902 return; 450 return;
903 } 451 }
904 MICROPROFILE_SCOPE(GPU_DownloadMemory); 452 MICROPROFILE_SCOPE(GPU_DownloadMemory);
905 const bool is_accuracy_normal =
906 Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::Normal;
907 453
908 auto it = committed_ranges.begin(); 454 auto it = committed_ranges.begin();
909 while (it != committed_ranges.end()) { 455 while (it != committed_ranges.end()) {
@@ -926,11 +472,12 @@ void BufferCache<P>::CommitAsyncFlushesHigh() {
926 const std::size_t size = interval.upper() - interval.lower(); 472 const std::size_t size = interval.upper() - interval.lower();
927 const VAddr cpu_addr = interval.lower(); 473 const VAddr cpu_addr = interval.lower();
928 ForEachBufferInRange(cpu_addr, size, [&](BufferId buffer_id, Buffer& buffer) { 474 ForEachBufferInRange(cpu_addr, size, [&](BufferId buffer_id, Buffer& buffer) {
929 buffer.ForEachDownloadRangeAndClear( 475 const VAddr buffer_start = buffer.CpuAddr();
930 cpu_addr, size, [&](u64 range_offset, u64 range_size) { 476 const VAddr buffer_end = buffer_start + buffer.SizeBytes();
931 if (is_accuracy_normal) { 477 const VAddr new_start = std::max(buffer_start, cpu_addr);
932 return; 478 const VAddr new_end = std::min(buffer_end, cpu_addr + size);
933 } 479 memory_tracker.ForEachDownloadRange(
480 new_start, new_end - new_start, false, [&](u64 cpu_addr_out, u64 range_size) {
934 const VAddr buffer_addr = buffer.CpuAddr(); 481 const VAddr buffer_addr = buffer.CpuAddr();
935 const auto add_download = [&](VAddr start, VAddr end) { 482 const auto add_download = [&](VAddr start, VAddr end) {
936 const u64 new_offset = start - buffer_addr; 483 const u64 new_offset = start - buffer_addr;
@@ -950,38 +497,36 @@ void BufferCache<P>::CommitAsyncFlushesHigh() {
950 largest_copy = std::max(largest_copy, new_size); 497 largest_copy = std::max(largest_copy, new_size);
951 }; 498 };
952 499
953 const VAddr start_address = buffer_addr + range_offset; 500 ForEachInRangeSet(common_ranges, cpu_addr_out, range_size, add_download);
954 const VAddr end_address = start_address + range_size;
955 ForEachWrittenRange(start_address, range_size, add_download);
956 const IntervalType subtract_interval{start_address, end_address};
957 common_ranges.subtract(subtract_interval);
958 }); 501 });
959 }); 502 });
960 } 503 }
961 } 504 }
962 committed_ranges.clear(); 505 committed_ranges.clear();
963 if (downloads.empty()) { 506 if (downloads.empty()) {
507 async_buffers.emplace_back(std::optional<Async_Buffer>{});
964 return; 508 return;
965 } 509 }
966 if constexpr (USE_MEMORY_MAPS) { 510 if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) {
967 auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes); 511 auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes, true);
512 boost::container::small_vector<BufferCopy, 4> normalized_copies;
513 IntervalSet new_async_range{};
968 runtime.PreCopyBarrier(); 514 runtime.PreCopyBarrier();
969 for (auto& [copy, buffer_id] : downloads) { 515 for (auto& [copy, buffer_id] : downloads) {
970 // Have in mind the staging buffer offset for the copy
971 copy.dst_offset += download_staging.offset; 516 copy.dst_offset += download_staging.offset;
972 const std::array copies{copy}; 517 const std::array copies{copy};
973 runtime.CopyBuffer(download_staging.buffer, slot_buffers[buffer_id], copies, false); 518 BufferCopy second_copy{copy};
974 } 519 Buffer& buffer = slot_buffers[buffer_id];
975 runtime.PostCopyBarrier(); 520 second_copy.src_offset = static_cast<size_t>(buffer.CpuAddr()) + copy.src_offset;
976 runtime.Finish(); 521 VAddr orig_cpu_addr = static_cast<VAddr>(second_copy.src_offset);
977 for (const auto& [copy, buffer_id] : downloads) { 522 const IntervalType base_interval{orig_cpu_addr, orig_cpu_addr + copy.size};
978 const Buffer& buffer = slot_buffers[buffer_id]; 523 new_async_range.add(base_interval);
979 const VAddr cpu_addr = buffer.CpuAddr() + copy.src_offset; 524 runtime.CopyBuffer(download_staging.buffer, buffer, copies, false);
980 // Undo the modified offset 525 normalized_copies.push_back(second_copy);
981 const u64 dst_offset = copy.dst_offset - download_staging.offset;
982 const u8* read_mapped_memory = download_staging.mapped_span.data() + dst_offset;
983 cpu_memory.WriteBlockUnsafe(cpu_addr, read_mapped_memory, copy.size);
984 } 526 }
527 async_downloads.emplace_back(std::move(new_async_range));
528 pending_downloads.emplace_back(std::move(normalized_copies));
529 async_buffers.emplace_back(download_staging);
985 } else { 530 } else {
986 const std::span<u8> immediate_buffer = ImmediateBuffer(largest_copy); 531 const std::span<u8> immediate_buffer = ImmediateBuffer(largest_copy);
987 for (const auto& [copy, buffer_id] : downloads) { 532 for (const auto& [copy, buffer_id] : downloads) {
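The hunk above carries the core behavioural change of this commit: rather than calling runtime.Finish() and writing results back immediately, CommitAsyncFlushesHigh now records the deferred staging buffer, a list of normalized copies whose src_offset is rewritten to the guest CPU address, and an interval set of the downloaded ranges; PopAsyncBuffers later writes back only the intervals that are still valid. Below is a minimal standalone sketch of that deferred write-back. It assumes a plain vector of [begin, end) pairs in place of the boost::icl interval set, a byte array in place of cpu_memory, and hypothetical names (Copy, PendingDownload, pop_async_buffers) rather than the cache's own types.

#include <algorithm>
#include <cstdint>
#include <cstring>
#include <deque>
#include <utility>
#include <vector>

struct Copy {
    uint64_t src_offset; // rewritten to the guest CPU address at commit time
    uint64_t dst_offset; // offset into the staging buffer
    uint64_t size;
};

struct PendingDownload {
    std::vector<Copy> copies;
    std::vector<std::pair<uint64_t, uint64_t>> valid_ranges; // [begin, end) still worth writing back
    std::vector<uint8_t> staging;                            // stand-in for the mapped staging buffer
};

// Stand-in for PopAsyncBuffers: write back only the intersection of each copy
// with the ranges that were not invalidated since the commit.
void pop_async_buffers(std::deque<PendingDownload>& queue, std::vector<uint8_t>& guest_memory) {
    if (queue.empty()) {
        return;
    }
    const PendingDownload& download = queue.front();
    for (const Copy& copy : download.copies) {
        for (const auto& [begin, end] : download.valid_ranges) {
            const uint64_t clamped_begin = std::max(begin, copy.src_offset);
            const uint64_t clamped_end = std::min(end, copy.src_offset + copy.size);
            if (clamped_begin >= clamped_end) {
                continue;
            }
            const uint64_t staging_offset = copy.dst_offset + (clamped_begin - copy.src_offset);
            std::memcpy(guest_memory.data() + clamped_begin,
                        download.staging.data() + staging_offset,
                        clamped_end - clamped_begin);
        }
    }
    queue.pop_front();
}

int main() {
    std::vector<uint8_t> guest_memory(0x100, 0);
    std::deque<PendingDownload> queue;
    queue.push_back(PendingDownload{
        .copies = {{.src_offset = 0x10, .dst_offset = 0, .size = 0x20}},
        .valid_ranges = {{0x18, 0x28}}, // part of the copy was overwritten by the CPU meanwhile
        .staging = std::vector<uint8_t>(0x20, 0xAB),
    });
    pop_async_buffers(queue, guest_memory);
    return (guest_memory[0x18] == 0xAB && guest_memory[0x10] == 0) ? 0 : 1;
}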
@@ -994,42 +539,154 @@ void BufferCache<P>::CommitAsyncFlushesHigh() {
994} 539}
995 540
996template <class P> 541template <class P>
997void BufferCache<P>::CommitAsyncFlushes() { 542void BufferCache<P>::CommitAsyncQueries() {
998 if (Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::High) { 543 if (pending_queries.empty()) {
999 CommitAsyncFlushesHigh(); 544 query_async_buffers.emplace_back(std::optional<Async_Buffer>{});
545 return;
546 }
547
548 MICROPROFILE_SCOPE(GPU_DownloadMemory);
549 boost::container::small_vector<std::pair<BufferCopy, BufferId>, 8> downloads;
550 u64 total_size_bytes = 0;
551 u64 largest_copy = 0;
552 do {
553 has_deleted_buffers = false;
554 downloads.clear();
555 total_size_bytes = 0;
556 largest_copy = 0;
557 for (const auto& query_info : pending_queries) {
558 const std::size_t size = query_info.second;
559 const VAddr cpu_addr = query_info.first;
560 const BufferId buffer_id = FindBuffer(cpu_addr, static_cast<u32>(size));
561 Buffer& buffer = slot_buffers[buffer_id];
562 if (has_deleted_buffers) {
563 break;
564 }
565 downloads.push_back({
566 BufferCopy{
567 .src_offset = buffer.Offset(cpu_addr),
568 .dst_offset = total_size_bytes,
569 .size = size,
570 },
571 buffer_id,
572 });
573 constexpr u64 align = 8ULL;
574 constexpr u64 mask = ~(align - 1ULL);
575 total_size_bytes += (size + align - 1) & mask;
576 largest_copy = std::max(largest_copy, size);
577 }
578 } while (has_deleted_buffers);
579 pending_queries.clear();
580 if (downloads.empty()) {
581 query_async_buffers.push_back(std::optional<Async_Buffer>{});
582 return;
583 }
584 if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) {
585 auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes, true);
586 boost::container::small_vector<BufferCopy, 8> normalized_copies;
587 runtime.PreCopyBarrier();
588 for (auto& [copy, buffer_id] : downloads) {
589 // Account for the staging buffer offset in the copy
590 copy.dst_offset += download_staging.offset;
591 const std::array copies{copy};
592 const Buffer& buffer = slot_buffers[buffer_id];
593 BufferCopy second_copy{copy};
594 second_copy.src_offset = static_cast<size_t>(buffer.CpuAddr()) + second_copy.src_offset;
595 runtime.CopyBuffer(download_staging.buffer, buffer, copies, false);
596 normalized_copies.push_back(second_copy);
597 }
598 committed_queries.emplace_back(std::move(normalized_copies));
599 query_async_buffers.emplace_back(download_staging);
1000 } else { 600 } else {
1001 uncommitted_ranges.clear(); 601 query_async_buffers.push_back(std::optional<Async_Buffer>{});
1002 committed_ranges.clear();
1003 } 602 }
1004} 603}
1005 604
1006template <class P> 605template <class P>
1007void BufferCache<P>::PopAsyncFlushes() {} 606void BufferCache<P>::CommitAsyncFlushes() {
607 CommitAsyncFlushesHigh();
608 CommitAsyncQueries();
609}
1008 610
1009template <class P> 611template <class P>
1010bool BufferCache<P>::IsRegionGpuModified(VAddr addr, size_t size) { 612void BufferCache<P>::PopAsyncFlushes() {
1011 const u64 page_end = Common::DivCeil(addr + size, YUZU_PAGESIZE); 613 MICROPROFILE_SCOPE(GPU_DownloadMemory);
1012 for (u64 page = addr >> YUZU_PAGEBITS; page < page_end;) { 614 PopAsyncBuffers();
1013 const BufferId image_id = page_table[page]; 615 PopAsyncQueries();
1014 if (!image_id) { 616}
1015 ++page; 617
1016 continue; 618template <class P>
619void BufferCache<P>::PopAsyncBuffers() {
620 if (async_buffers.empty()) {
621 return;
622 }
623 if (!async_buffers.front().has_value()) {
624 async_buffers.pop_front();
625 return;
626 }
627 if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) {
628 auto& downloads = pending_downloads.front();
629 auto& async_buffer = async_buffers.front();
630 auto& async_range = async_downloads.front();
631 u8* base = async_buffer->mapped_span.data();
632 const size_t base_offset = async_buffer->offset;
633 for (const auto& copy : downloads) {
634 const VAddr cpu_addr = static_cast<VAddr>(copy.src_offset);
635 const u64 dst_offset = copy.dst_offset - base_offset;
636 const u8* read_mapped_memory = base + dst_offset;
637 ForEachInRangeSet(async_range, cpu_addr, copy.size, [&](VAddr start, VAddr end) {
638 const size_t diff = start - cpu_addr;
639 const size_t new_size = end - start;
640 cpu_memory.WriteBlockUnsafe(start, &read_mapped_memory[diff], new_size);
641 const IntervalType base_interval{start, end};
642 common_ranges.subtract(base_interval);
643 });
1017 } 644 }
1018 Buffer& buffer = slot_buffers[image_id]; 645 runtime.FreeDeferredStagingBuffer(*async_buffer);
1019 if (buffer.IsRegionGpuModified(addr, size)) { 646 async_buffers.pop_front();
1020 return true; 647 pending_downloads.pop_front();
648 async_downloads.pop_front();
649 }
650}
651
652template <class P>
653void BufferCache<P>::PopAsyncQueries() {
654 if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) {
655 if (query_async_buffers.empty()) {
656 return;
657 }
658 if (!query_async_buffers.front().has_value()) {
659 query_async_buffers.pop_front();
660 return;
1021 } 661 }
1022 const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes(); 662 auto& downloads = committed_queries.front();
1023 page = Common::DivCeil(end_addr, YUZU_PAGESIZE); 663 auto& async_buffer = query_async_buffers.front();
664 flushed_queries.clear();
665 u8* base = async_buffer->mapped_span.data();
666 const size_t base_offset = async_buffer->offset;
667 for (const auto& copy : downloads) {
668 const size_t dst_offset = copy.dst_offset - base_offset;
669 const u8* read_mapped_memory = base + dst_offset;
670 u64 new_value{};
671 std::memcpy(&new_value, read_mapped_memory, copy.size);
672 flushed_queries.push_back(new_value);
673 }
674 runtime.FreeDeferredStagingBuffer(*async_buffer);
675 committed_queries.pop_front();
676 query_async_buffers.pop_front();
1024 } 677 }
1025 return false; 678}
679
680template <class P>
681bool BufferCache<P>::IsRegionGpuModified(VAddr addr, size_t size) {
682 return memory_tracker.IsRegionGpuModified(addr, size);
1026} 683}
1027 684
1028template <class P> 685template <class P>
1029bool BufferCache<P>::IsRegionRegistered(VAddr addr, size_t size) { 686bool BufferCache<P>::IsRegionRegistered(VAddr addr, size_t size) {
1030 const VAddr end_addr = addr + size; 687 const VAddr end_addr = addr + size;
1031 const u64 page_end = Common::DivCeil(end_addr, YUZU_PAGESIZE); 688 const u64 page_end = Common::DivCeil(end_addr, PAGE_SIZE);
1032 for (u64 page = addr >> YUZU_PAGEBITS; page < page_end;) { 689 for (u64 page = addr >> PAGE_BITS; page < page_end;) {
1033 const BufferId buffer_id = page_table[page]; 690 const BufferId buffer_id = page_table[page];
1034 if (!buffer_id) { 691 if (!buffer_id) {
1035 ++page; 692 ++page;
@@ -1041,28 +698,14 @@ bool BufferCache<P>::IsRegionRegistered(VAddr addr, size_t size) {
1041 if (buf_start_addr < end_addr && addr < buf_end_addr) { 698 if (buf_start_addr < end_addr && addr < buf_end_addr) {
1042 return true; 699 return true;
1043 } 700 }
1044 page = Common::DivCeil(end_addr, YUZU_PAGESIZE); 701 page = Common::DivCeil(end_addr, PAGE_SIZE);
1045 } 702 }
1046 return false; 703 return false;
1047} 704}
1048 705
1049template <class P> 706template <class P>
1050bool BufferCache<P>::IsRegionCpuModified(VAddr addr, size_t size) { 707bool BufferCache<P>::IsRegionCpuModified(VAddr addr, size_t size) {
1051 const u64 page_end = Common::DivCeil(addr + size, YUZU_PAGESIZE); 708 return memory_tracker.IsRegionCpuModified(addr, size);
1052 for (u64 page = addr >> YUZU_PAGEBITS; page < page_end;) {
1053 const BufferId image_id = page_table[page];
1054 if (!image_id) {
1055 ++page;
1056 continue;
1057 }
1058 Buffer& buffer = slot_buffers[image_id];
1059 if (buffer.IsRegionCpuModified(addr, size)) {
1060 return true;
1061 }
1062 const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes();
1063 page = Common::DivCeil(end_addr, YUZU_PAGESIZE);
1064 }
1065 return false;
1066} 709}
1067 710
1068template <class P> 711template <class P>
@@ -1155,7 +798,7 @@ void BufferCache<P>::BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32
1155 TouchBuffer(buffer, binding.buffer_id); 798 TouchBuffer(buffer, binding.buffer_id);
1156 const bool use_fast_buffer = binding.buffer_id != NULL_BUFFER_ID && 799 const bool use_fast_buffer = binding.buffer_id != NULL_BUFFER_ID &&
1157 size <= uniform_buffer_skip_cache_size && 800 size <= uniform_buffer_skip_cache_size &&
1158 !buffer.IsRegionGpuModified(cpu_addr, size); 801 !memory_tracker.IsRegionGpuModified(cpu_addr, size);
1159 if (use_fast_buffer) { 802 if (use_fast_buffer) {
1160 if constexpr (IS_OPENGL) { 803 if constexpr (IS_OPENGL) {
1161 if (runtime.HasFastBufferSubData()) { 804 if (runtime.HasFastBufferSubData()) {
@@ -1378,27 +1021,28 @@ void BufferCache<P>::UpdateIndexBuffer() {
1378 // We have to check for the dirty flags and index count 1021 // We have to check for the dirty flags and index count
1379 // The index count is currently changed without updating the dirty flags 1022 // The index count is currently changed without updating the dirty flags
1380 const auto& draw_state = maxwell3d->draw_manager->GetDrawState(); 1023 const auto& draw_state = maxwell3d->draw_manager->GetDrawState();
1381 const auto& index_array = draw_state.index_buffer; 1024 const auto& index_buffer_ref = draw_state.index_buffer;
1382 auto& flags = maxwell3d->dirty.flags; 1025 auto& flags = maxwell3d->dirty.flags;
1383 if (!flags[Dirty::IndexBuffer]) { 1026 if (!flags[Dirty::IndexBuffer]) {
1384 return; 1027 return;
1385 } 1028 }
1386 flags[Dirty::IndexBuffer] = false; 1029 flags[Dirty::IndexBuffer] = false;
1387 last_index_count = index_array.count;
1388 if (!draw_state.inline_index_draw_indexes.empty()) { 1030 if (!draw_state.inline_index_draw_indexes.empty()) {
1389 auto inline_index_size = static_cast<u32>(draw_state.inline_index_draw_indexes.size()); 1031 auto inline_index_size = static_cast<u32>(draw_state.inline_index_draw_indexes.size());
1390 index_buffer = Binding{ 1032 index_buffer = Binding{
1391 .cpu_addr = 0, 1033 .cpu_addr = 0,
1392 .size = inline_index_size, 1034 .size = inline_index_size,
1393 .buffer_id = CreateBuffer(0, inline_index_size), 1035 .buffer_id = FindBuffer(0, inline_index_size),
1394 }; 1036 };
1395 return; 1037 return;
1396 } 1038 }
1397 const GPUVAddr gpu_addr_begin = index_array.StartAddress(); 1039
1398 const GPUVAddr gpu_addr_end = index_array.EndAddress(); 1040 const GPUVAddr gpu_addr_begin = index_buffer_ref.StartAddress();
1041 const GPUVAddr gpu_addr_end = index_buffer_ref.EndAddress();
1399 const std::optional<VAddr> cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr_begin); 1042 const std::optional<VAddr> cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr_begin);
1400 const u32 address_size = static_cast<u32>(gpu_addr_end - gpu_addr_begin); 1043 const u32 address_size = static_cast<u32>(gpu_addr_end - gpu_addr_begin);
1401 const u32 draw_size = (index_array.count + index_array.first) * index_array.FormatSizeInBytes(); 1044 const u32 draw_size =
1045 (index_buffer_ref.count + index_buffer_ref.first) * index_buffer_ref.FormatSizeInBytes();
1402 const u32 size = std::min(address_size, draw_size); 1046 const u32 size = std::min(address_size, draw_size);
1403 if (size == 0 || !cpu_addr) { 1047 if (size == 0 || !cpu_addr) {
1404 index_buffer = NULL_BINDING; 1048 index_buffer = NULL_BINDING;
@@ -1434,17 +1078,15 @@ void BufferCache<P>::UpdateVertexBuffer(u32 index) {
1434 const GPUVAddr gpu_addr_begin = array.Address(); 1078 const GPUVAddr gpu_addr_begin = array.Address();
1435 const GPUVAddr gpu_addr_end = limit.Address() + 1; 1079 const GPUVAddr gpu_addr_end = limit.Address() + 1;
1436 const std::optional<VAddr> cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr_begin); 1080 const std::optional<VAddr> cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr_begin);
1437 u32 address_size = static_cast<u32>( 1081 const u32 address_size = static_cast<u32>(gpu_addr_end - gpu_addr_begin);
1438 std::min(gpu_addr_end - gpu_addr_begin, static_cast<u64>(std::numeric_limits<u32>::max()))); 1082 u32 size = address_size; // TODO: Analyze stride and number of vertices
1439 if (array.enable == 0 || address_size == 0 || !cpu_addr) { 1083 if (array.enable == 0 || size == 0 || !cpu_addr) {
1440 vertex_buffers[index] = NULL_BINDING; 1084 vertex_buffers[index] = NULL_BINDING;
1441 return; 1085 return;
1442 } 1086 }
1443 if (!gpu_memory->IsWithinGPUAddressRange(gpu_addr_end)) { 1087 if (!gpu_memory->IsWithinGPUAddressRange(gpu_addr_end)) {
1444 address_size = 1088 size = static_cast<u32>(gpu_memory->MaxContinuousRange(gpu_addr_begin, size));
1445 static_cast<u32>(gpu_memory->MaxContinuousRange(gpu_addr_begin, address_size));
1446 } 1089 }
1447 const u32 size = address_size; // TODO: Analyze stride and number of vertices
1448 vertex_buffers[index] = Binding{ 1090 vertex_buffers[index] = Binding{
1449 .cpu_addr = *cpu_addr, 1091 .cpu_addr = *cpu_addr,
1450 .size = size, 1092 .size = size,
@@ -1590,18 +1232,17 @@ void BufferCache<P>::UpdateComputeTextureBuffers() {
1590} 1232}
1591 1233
1592template <class P> 1234template <class P>
1593void BufferCache<P>::MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 size) { 1235void BufferCache<P>::MarkWrittenBuffer(BufferId, VAddr cpu_addr, u32 size) {
1594 Buffer& buffer = slot_buffers[buffer_id]; 1236 memory_tracker.MarkRegionAsGpuModified(cpu_addr, size);
1595 buffer.MarkRegionAsGpuModified(cpu_addr, size);
1596 1237
1597 const IntervalType base_interval{cpu_addr, cpu_addr + size}; 1238 const IntervalType base_interval{cpu_addr, cpu_addr + size};
1598 common_ranges.add(base_interval); 1239 common_ranges.add(base_interval);
1599 1240 for (auto& interval_set : async_downloads) {
1600 const bool is_async = Settings::values.use_asynchronous_gpu_emulation.GetValue(); 1241 interval_set.subtract(base_interval);
1601 if (!is_async) { 1242 }
1602 return; 1243 if (Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::High) {
1244 uncommitted_ranges.add(base_interval);
1603 } 1245 }
1604 uncommitted_ranges.add(base_interval);
1605} 1246}
1606 1247
1607template <class P> 1248template <class P>
@@ -1609,7 +1250,7 @@ BufferId BufferCache<P>::FindBuffer(VAddr cpu_addr, u32 size) {
1609 if (cpu_addr == 0) { 1250 if (cpu_addr == 0) {
1610 return NULL_BUFFER_ID; 1251 return NULL_BUFFER_ID;
1611 } 1252 }
1612 const u64 page = cpu_addr >> YUZU_PAGEBITS; 1253 const u64 page = cpu_addr >> PAGE_BITS;
1613 const BufferId buffer_id = page_table[page]; 1254 const BufferId buffer_id = page_table[page];
1614 if (!buffer_id) { 1255 if (!buffer_id) {
1615 return CreateBuffer(cpu_addr, size); 1256 return CreateBuffer(cpu_addr, size);
@@ -1638,9 +1279,8 @@ typename BufferCache<P>::OverlapResult BufferCache<P>::ResolveOverlaps(VAddr cpu
1638 .has_stream_leap = has_stream_leap, 1279 .has_stream_leap = has_stream_leap,
1639 }; 1280 };
1640 } 1281 }
1641 for (; cpu_addr >> YUZU_PAGEBITS < Common::DivCeil(end, YUZU_PAGESIZE); 1282 for (; cpu_addr >> PAGE_BITS < Common::DivCeil(end, PAGE_SIZE); cpu_addr += PAGE_SIZE) {
1642 cpu_addr += YUZU_PAGESIZE) { 1283 const BufferId overlap_id = page_table[cpu_addr >> PAGE_BITS];
1643 const BufferId overlap_id = page_table[cpu_addr >> YUZU_PAGEBITS];
1644 if (!overlap_id) { 1284 if (!overlap_id) {
1645 continue; 1285 continue;
1646 } 1286 }
@@ -1666,11 +1306,11 @@ typename BufferCache<P>::OverlapResult BufferCache<P>::ResolveOverlaps(VAddr cpu
1666 // as a stream buffer. Increase the size to skip constantly recreating buffers. 1306 // as a stream buffer. Increase the size to skip constantly recreating buffers.
1667 has_stream_leap = true; 1307 has_stream_leap = true;
1668 if (expands_right) { 1308 if (expands_right) {
1669 begin -= YUZU_PAGESIZE * 256; 1309 begin -= PAGE_SIZE * 256;
1670 cpu_addr = begin; 1310 cpu_addr = begin;
1671 } 1311 }
1672 if (expands_left) { 1312 if (expands_left) {
1673 end += YUZU_PAGESIZE * 256; 1313 end += PAGE_SIZE * 256;
1674 } 1314 }
1675 } 1315 }
1676 } 1316 }
@@ -1690,21 +1330,15 @@ void BufferCache<P>::JoinOverlap(BufferId new_buffer_id, BufferId overlap_id,
1690 if (accumulate_stream_score) { 1330 if (accumulate_stream_score) {
1691 new_buffer.IncreaseStreamScore(overlap.StreamScore() + 1); 1331 new_buffer.IncreaseStreamScore(overlap.StreamScore() + 1);
1692 } 1332 }
1693 std::vector<BufferCopy> copies; 1333 boost::container::small_vector<BufferCopy, 1> copies;
1694 const size_t dst_base_offset = overlap.CpuAddr() - new_buffer.CpuAddr(); 1334 const size_t dst_base_offset = overlap.CpuAddr() - new_buffer.CpuAddr();
1695 overlap.ForEachDownloadRange([&](u64 begin, u64 range_size) { 1335 copies.push_back(BufferCopy{
1696 copies.push_back(BufferCopy{ 1336 .src_offset = 0,
1697 .src_offset = begin, 1337 .dst_offset = dst_base_offset,
1698 .dst_offset = dst_base_offset + begin, 1338 .size = overlap.SizeBytes(),
1699 .size = range_size,
1700 });
1701 new_buffer.UnmarkRegionAsCpuModified(begin, range_size);
1702 new_buffer.MarkRegionAsGpuModified(begin, range_size);
1703 }); 1339 });
1704 if (!copies.empty()) { 1340 runtime.CopyBuffer(new_buffer, overlap, copies);
1705 runtime.CopyBuffer(slot_buffers[new_buffer_id], overlap, copies); 1341 DeleteBuffer(overlap_id, true);
1706 }
1707 DeleteBuffer(overlap_id);
1708} 1342}
1709 1343
1710template <class P> 1344template <class P>
@@ -1718,7 +1352,7 @@ BufferId BufferCache<P>::CreateBuffer(VAddr cpu_addr, u32 wanted_size) {
1718 JoinOverlap(new_buffer_id, overlap_id, !overlap.has_stream_leap); 1352 JoinOverlap(new_buffer_id, overlap_id, !overlap.has_stream_leap);
1719 } 1353 }
1720 Register(new_buffer_id); 1354 Register(new_buffer_id);
1721 TouchBuffer(slot_buffers[new_buffer_id], new_buffer_id); 1355 TouchBuffer(new_buffer, new_buffer_id);
1722 return new_buffer_id; 1356 return new_buffer_id;
1723} 1357}
1724 1358
@@ -1746,8 +1380,8 @@ void BufferCache<P>::ChangeRegister(BufferId buffer_id) {
1746 } 1380 }
1747 const VAddr cpu_addr_begin = buffer.CpuAddr(); 1381 const VAddr cpu_addr_begin = buffer.CpuAddr();
1748 const VAddr cpu_addr_end = cpu_addr_begin + size; 1382 const VAddr cpu_addr_end = cpu_addr_begin + size;
1749 const u64 page_begin = cpu_addr_begin / YUZU_PAGESIZE; 1383 const u64 page_begin = cpu_addr_begin / PAGE_SIZE;
1750 const u64 page_end = Common::DivCeil(cpu_addr_end, YUZU_PAGESIZE); 1384 const u64 page_end = Common::DivCeil(cpu_addr_end, PAGE_SIZE);
1751 for (u64 page = page_begin; page != page_end; ++page) { 1385 for (u64 page = page_begin; page != page_end; ++page) {
1752 if constexpr (insert) { 1386 if constexpr (insert) {
1753 page_table[page] = buffer_id; 1387 page_table[page] = buffer_id;
@@ -1766,9 +1400,6 @@ void BufferCache<P>::TouchBuffer(Buffer& buffer, BufferId buffer_id) noexcept {
1766 1400
1767template <class P> 1401template <class P>
1768bool BufferCache<P>::SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size) { 1402bool BufferCache<P>::SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size) {
1769 if (buffer.CpuAddr() == 0) {
1770 return true;
1771 }
1772 return SynchronizeBufferImpl(buffer, cpu_addr, size); 1403 return SynchronizeBufferImpl(buffer, cpu_addr, size);
1773} 1404}
1774 1405
@@ -1777,10 +1408,11 @@ bool BufferCache<P>::SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 s
1777 boost::container::small_vector<BufferCopy, 4> copies; 1408 boost::container::small_vector<BufferCopy, 4> copies;
1778 u64 total_size_bytes = 0; 1409 u64 total_size_bytes = 0;
1779 u64 largest_copy = 0; 1410 u64 largest_copy = 0;
1780 buffer.ForEachUploadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) { 1411 VAddr buffer_start = buffer.CpuAddr();
1412 memory_tracker.ForEachUploadRange(cpu_addr, size, [&](u64 cpu_addr_out, u64 range_size) {
1781 copies.push_back(BufferCopy{ 1413 copies.push_back(BufferCopy{
1782 .src_offset = total_size_bytes, 1414 .src_offset = total_size_bytes,
1783 .dst_offset = range_offset, 1415 .dst_offset = cpu_addr_out - buffer_start,
1784 .size = range_size, 1416 .size = range_size,
1785 }); 1417 });
1786 total_size_bytes += range_size; 1418 total_size_bytes += range_size;
@@ -1795,6 +1427,51 @@ bool BufferCache<P>::SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 s
1795} 1427}
1796 1428
1797template <class P> 1429template <class P>
1430bool BufferCache<P>::SynchronizeBufferNoModified(Buffer& buffer, VAddr cpu_addr, u32 size) {
1431 boost::container::small_vector<BufferCopy, 4> copies;
1432 u64 total_size_bytes = 0;
1433 u64 largest_copy = 0;
1434 IntervalSet found_sets{};
1435 auto make_copies = [&] {
1436 for (auto& interval : found_sets) {
1437 const std::size_t sub_size = interval.upper() - interval.lower();
1438 const VAddr cpu_addr = interval.lower();
1439 copies.push_back(BufferCopy{
1440 .src_offset = total_size_bytes,
1441 .dst_offset = cpu_addr - buffer.CpuAddr(),
1442 .size = sub_size,
1443 });
1444 total_size_bytes += sub_size;
1445 largest_copy = std::max(largest_copy, sub_size);
1446 }
1447 const std::span<BufferCopy> copies_span(copies.data(), copies.size());
1448 UploadMemory(buffer, total_size_bytes, largest_copy, copies_span);
1449 };
1450 memory_tracker.ForEachUploadRange(cpu_addr, size, [&](u64 cpu_addr_out, u64 range_size) {
1451 const VAddr base_adr = cpu_addr_out;
1452 const VAddr end_adr = base_adr + range_size;
1453 const IntervalType add_interval{base_adr, end_adr};
1454 found_sets.add(add_interval);
1455 });
1456 if (found_sets.empty()) {
1457 return true;
1458 }
1459 const IntervalType search_interval{cpu_addr, cpu_addr + size};
1460 auto it = common_ranges.lower_bound(search_interval);
1461 auto it_end = common_ranges.upper_bound(search_interval);
1462 if (it == common_ranges.end()) {
1463 make_copies();
1464 return false;
1465 }
1466 while (it != it_end) {
1467 found_sets.subtract(*it);
1468 it++;
1469 }
1470 make_copies();
1471 return false;
1472}
1473
1474template <class P>
1798void BufferCache<P>::UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy, 1475void BufferCache<P>::UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy,
1799 std::span<BufferCopy> copies) { 1476 std::span<BufferCopy> copies) {
1800 if constexpr (USE_MEMORY_MAPS) { 1477 if constexpr (USE_MEMORY_MAPS) {
@@ -1805,39 +1482,45 @@ void BufferCache<P>::UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 larg
1805} 1482}
1806 1483
1807template <class P> 1484template <class P>
1808void BufferCache<P>::ImmediateUploadMemory(Buffer& buffer, u64 largest_copy, 1485void BufferCache<P>::ImmediateUploadMemory([[maybe_unused]] Buffer& buffer,
1809 std::span<const BufferCopy> copies) { 1486 [[maybe_unused]] u64 largest_copy,
1810 std::span<u8> immediate_buffer; 1487 [[maybe_unused]] std::span<const BufferCopy> copies) {
1811 for (const BufferCopy& copy : copies) { 1488 if constexpr (!USE_MEMORY_MAPS) {
1812 std::span<const u8> upload_span; 1489 std::span<u8> immediate_buffer;
1813 const VAddr cpu_addr = buffer.CpuAddr() + copy.dst_offset; 1490 for (const BufferCopy& copy : copies) {
1814 if (IsRangeGranular(cpu_addr, copy.size)) { 1491 std::span<const u8> upload_span;
1815 upload_span = std::span(cpu_memory.GetPointer(cpu_addr), copy.size); 1492 const VAddr cpu_addr = buffer.CpuAddr() + copy.dst_offset;
1816 } else { 1493 if (IsRangeGranular(cpu_addr, copy.size)) {
1817 if (immediate_buffer.empty()) { 1494 upload_span = std::span(cpu_memory.GetPointer(cpu_addr), copy.size);
1818 immediate_buffer = ImmediateBuffer(largest_copy); 1495 } else {
1496 if (immediate_buffer.empty()) {
1497 immediate_buffer = ImmediateBuffer(largest_copy);
1498 }
1499 cpu_memory.ReadBlockUnsafe(cpu_addr, immediate_buffer.data(), copy.size);
1500 upload_span = immediate_buffer.subspan(0, copy.size);
1819 } 1501 }
1820 cpu_memory.ReadBlockUnsafe(cpu_addr, immediate_buffer.data(), copy.size); 1502 buffer.ImmediateUpload(copy.dst_offset, upload_span);
1821 upload_span = immediate_buffer.subspan(0, copy.size);
1822 } 1503 }
1823 buffer.ImmediateUpload(copy.dst_offset, upload_span);
1824 } 1504 }
1825} 1505}
1826 1506
1827template <class P> 1507template <class P>
1828void BufferCache<P>::MappedUploadMemory(Buffer& buffer, u64 total_size_bytes, 1508void BufferCache<P>::MappedUploadMemory([[maybe_unused]] Buffer& buffer,
1829 std::span<BufferCopy> copies) { 1509 [[maybe_unused]] u64 total_size_bytes,
1830 auto upload_staging = runtime.UploadStagingBuffer(total_size_bytes); 1510 [[maybe_unused]] std::span<BufferCopy> copies) {
1831 const std::span<u8> staging_pointer = upload_staging.mapped_span; 1511 if constexpr (USE_MEMORY_MAPS) {
1832 for (BufferCopy& copy : copies) { 1512 auto upload_staging = runtime.UploadStagingBuffer(total_size_bytes);
1833 u8* const src_pointer = staging_pointer.data() + copy.src_offset; 1513 const std::span<u8> staging_pointer = upload_staging.mapped_span;
1834 const VAddr cpu_addr = buffer.CpuAddr() + copy.dst_offset; 1514 for (BufferCopy& copy : copies) {
1835 cpu_memory.ReadBlockUnsafe(cpu_addr, src_pointer, copy.size); 1515 u8* const src_pointer = staging_pointer.data() + copy.src_offset;
1516 const VAddr cpu_addr = buffer.CpuAddr() + copy.dst_offset;
1517 cpu_memory.ReadBlockUnsafe(cpu_addr, src_pointer, copy.size);
1836 1518
1837 // Apply the staging offset 1519 // Apply the staging offset
1838 copy.src_offset += upload_staging.offset; 1520 copy.src_offset += upload_staging.offset;
1521 }
1522 runtime.CopyBuffer(buffer, upload_staging.buffer, copies);
1839 } 1523 }
1840 runtime.CopyBuffer(buffer, upload_staging.buffer, copies);
1841} 1524}
1842 1525
1843template <class P> 1526template <class P>
@@ -1886,30 +1569,31 @@ void BufferCache<P>::DownloadBufferMemory(Buffer& buffer, VAddr cpu_addr, u64 si
1886 boost::container::small_vector<BufferCopy, 1> copies; 1569 boost::container::small_vector<BufferCopy, 1> copies;
1887 u64 total_size_bytes = 0; 1570 u64 total_size_bytes = 0;
1888 u64 largest_copy = 0; 1571 u64 largest_copy = 0;
1889 buffer.ForEachDownloadRangeAndClear(cpu_addr, size, [&](u64 range_offset, u64 range_size) { 1572 memory_tracker.ForEachDownloadRangeAndClear(
1890 const VAddr buffer_addr = buffer.CpuAddr(); 1573 cpu_addr, size, [&](u64 cpu_addr_out, u64 range_size) {
1891 const auto add_download = [&](VAddr start, VAddr end) { 1574 const VAddr buffer_addr = buffer.CpuAddr();
1892 const u64 new_offset = start - buffer_addr; 1575 const auto add_download = [&](VAddr start, VAddr end) {
1893 const u64 new_size = end - start; 1576 const u64 new_offset = start - buffer_addr;
1894 copies.push_back(BufferCopy{ 1577 const u64 new_size = end - start;
1895 .src_offset = new_offset, 1578 copies.push_back(BufferCopy{
1896 .dst_offset = total_size_bytes, 1579 .src_offset = new_offset,
1897 .size = new_size, 1580 .dst_offset = total_size_bytes,
1898 }); 1581 .size = new_size,
1899 // Align up to avoid cache conflicts 1582 });
1900 constexpr u64 align = 256ULL; 1583 // Align up to avoid cache conflicts
1901 constexpr u64 mask = ~(align - 1ULL); 1584 constexpr u64 align = 8ULL;
1902 total_size_bytes += (new_size + align - 1) & mask; 1585 constexpr u64 mask = ~(align - 1ULL);
1903 largest_copy = std::max(largest_copy, new_size); 1586 total_size_bytes += (new_size + align - 1) & mask;
1904 }; 1587 largest_copy = std::max(largest_copy, new_size);
1905 1588 };
1906 const VAddr start_address = buffer_addr + range_offset; 1589
1907 const VAddr end_address = start_address + range_size; 1590 const VAddr start_address = cpu_addr_out;
1908 ForEachWrittenRange(start_address, range_size, add_download); 1591 const VAddr end_address = start_address + range_size;
1909 const IntervalType subtract_interval{start_address, end_address}; 1592 ForEachInRangeSet(common_ranges, start_address, range_size, add_download);
1910 ClearDownload(subtract_interval); 1593 const IntervalType subtract_interval{start_address, end_address};
1911 common_ranges.subtract(subtract_interval); 1594 ClearDownload(subtract_interval);
1912 }); 1595 common_ranges.subtract(subtract_interval);
1596 });
1913 if (total_size_bytes == 0) { 1597 if (total_size_bytes == 0) {
1914 return; 1598 return;
1915 } 1599 }
@@ -1943,7 +1627,7 @@ void BufferCache<P>::DownloadBufferMemory(Buffer& buffer, VAddr cpu_addr, u64 si
1943} 1627}
1944 1628
1945template <class P> 1629template <class P>
1946void BufferCache<P>::DeleteBuffer(BufferId buffer_id) { 1630void BufferCache<P>::DeleteBuffer(BufferId buffer_id, bool do_not_mark) {
1947 const auto scalar_replace = [buffer_id](Binding& binding) { 1631 const auto scalar_replace = [buffer_id](Binding& binding) {
1948 if (binding.buffer_id == buffer_id) { 1632 if (binding.buffer_id == buffer_id) {
1949 binding.buffer_id = BufferId{}; 1633 binding.buffer_id = BufferId{};
@@ -1962,8 +1646,10 @@ void BufferCache<P>::DeleteBuffer(BufferId buffer_id) {
1962 std::erase(cached_write_buffer_ids, buffer_id); 1646 std::erase(cached_write_buffer_ids, buffer_id);
1963 1647
1964 // Mark the whole buffer as CPU written to stop tracking CPU writes 1648 // Mark the whole buffer as CPU written to stop tracking CPU writes
1965 Buffer& buffer = slot_buffers[buffer_id]; 1649 if (!do_not_mark) {
1966 buffer.MarkRegionAsCpuModified(buffer.CpuAddr(), buffer.SizeBytes()); 1650 Buffer& buffer = slot_buffers[buffer_id];
1651 memory_tracker.MarkRegionAsCpuModified(buffer.CpuAddr(), buffer.SizeBytes());
1652 }
1967 1653
1968 Unregister(buffer_id); 1654 Unregister(buffer_id);
1969 delayed_destruction_ring.Push(std::move(slot_buffers[buffer_id])); 1655 delayed_destruction_ring.Push(std::move(slot_buffers[buffer_id]));
@@ -2011,7 +1697,7 @@ typename BufferCache<P>::Binding BufferCache<P>::StorageBufferBinding(GPUVAddr s
2011 LOG_WARNING(HW_GPU, "Failed to find storage buffer for cbuf index {}", cbuf_index); 1697 LOG_WARNING(HW_GPU, "Failed to find storage buffer for cbuf index {}", cbuf_index);
2012 return NULL_BINDING; 1698 return NULL_BINDING;
2013 } 1699 }
2014 const VAddr cpu_end = Common::AlignUp(*cpu_addr + size, Core::Memory::YUZU_PAGESIZE); 1700 const VAddr cpu_end = Common::AlignUp(*cpu_addr + size, YUZU_PAGESIZE);
2015 const Binding binding{ 1701 const Binding binding{
2016 .cpu_addr = *cpu_addr, 1702 .cpu_addr = *cpu_addr,
2017 .size = is_written ? size : static_cast<u32>(cpu_end - *cpu_addr), 1703 .size = is_written ? size : static_cast<u32>(cpu_end - *cpu_addr),
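Throughout the hunks above, the per-buffer dirty tracking (buffer.ForEachUploadRange, buffer.IsRegionGpuModified and friends) is replaced by queries against the shared memory_tracker, and upload copies are now built from CPU addresses converted into buffer-relative offsets. The following is a minimal sketch of that conversion, mirroring the shape of SynchronizeBufferImpl; Copy and for_each_modified_range are hypothetical stand-ins for BufferCopy and memory_tracker.ForEachUploadRange.

#include <algorithm>
#include <cstdint>
#include <vector>

struct Copy {
    uint64_t src_offset; // packed position in the staging upload
    uint64_t dst_offset; // offset into the destination buffer
    uint64_t size;
};

// Stand-in for memory_tracker.ForEachUploadRange: report the whole queried
// range as CPU modified.
template <typename Func>
void for_each_modified_range(uint64_t cpu_addr, uint64_t size, Func&& func) {
    func(cpu_addr, size);
}

int main() {
    const uint64_t buffer_start = 0x10000; // assumed CPU base address of the buffer
    std::vector<Copy> copies;
    uint64_t total_size = 0;
    uint64_t largest_copy = 0;
    for_each_modified_range(0x10400, 0x200, [&](uint64_t cpu_addr, uint64_t range_size) {
        copies.push_back(Copy{
            .src_offset = total_size,              // where the data lands in the staging buffer
            .dst_offset = cpu_addr - buffer_start, // CPU address -> buffer offset
            .size = range_size,
        });
        total_size += range_size;
        largest_copy = std::max(largest_copy, range_size);
    });
    return (copies.size() == 1 && largest_copy == 0x200) ? 0 : 1;
}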
diff --git a/src/video_core/buffer_cache/buffer_cache_base.h b/src/video_core/buffer_cache/buffer_cache_base.h
new file mode 100644
index 000000000..4b3677da3
--- /dev/null
+++ b/src/video_core/buffer_cache/buffer_cache_base.h
@@ -0,0 +1,507 @@
1// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
2// SPDX-License-Identifier: GPL-3.0-or-later
3
4#pragma once
5
6#include <algorithm>
7#include <array>
8#include <memory>
9#include <mutex>
10#include <numeric>
11#include <span>
12#include <unordered_map>
13#include <vector>
14
15#include <boost/container/small_vector.hpp>
16#define BOOST_NO_MT
17#include <boost/pool/detail/mutex.hpp>
18#undef BOOST_NO_MT
19#include <boost/icl/interval_set.hpp>
20#include <boost/pool/pool.hpp>
21#include <boost/pool/pool_alloc.hpp>
22
23#include "common/common_types.h"
24#include "common/div_ceil.h"
25#include "common/literals.h"
26#include "common/lru_cache.h"
27#include "common/microprofile.h"
28#include "common/scope_exit.h"
29#include "common/settings.h"
30#include "core/memory.h"
31#include "video_core/buffer_cache/buffer_base.h"
32#include "video_core/control/channel_state_cache.h"
33#include "video_core/delayed_destruction_ring.h"
34#include "video_core/dirty_flags.h"
35#include "video_core/engines/draw_manager.h"
36#include "video_core/engines/kepler_compute.h"
37#include "video_core/engines/maxwell_3d.h"
38#include "video_core/memory_manager.h"
39#include "video_core/rasterizer_interface.h"
40#include "video_core/surface.h"
41#include "video_core/texture_cache/slot_vector.h"
42#include "video_core/texture_cache/types.h"
43
44
45namespace boost {
46template <typename T>
47class fast_pool_allocator<T, default_user_allocator_new_delete, details::pool::default_mutex, 4096,
48 0>;
49}
50
51namespace VideoCommon {
52
53MICROPROFILE_DECLARE(GPU_PrepareBuffers);
54MICROPROFILE_DECLARE(GPU_BindUploadBuffers);
55MICROPROFILE_DECLARE(GPU_DownloadMemory);
56
57using BufferId = SlotId;
58
59using VideoCore::Surface::PixelFormat;
60using namespace Common::Literals;
61
62constexpr u32 NUM_VERTEX_BUFFERS = 32;
63constexpr u32 NUM_TRANSFORM_FEEDBACK_BUFFERS = 4;
64constexpr u32 NUM_GRAPHICS_UNIFORM_BUFFERS = 18;
65constexpr u32 NUM_COMPUTE_UNIFORM_BUFFERS = 8;
66constexpr u32 NUM_STORAGE_BUFFERS = 16;
67constexpr u32 NUM_TEXTURE_BUFFERS = 16;
68constexpr u32 NUM_STAGES = 5;
69
70using UniformBufferSizes = std::array<std::array<u32, NUM_GRAPHICS_UNIFORM_BUFFERS>, NUM_STAGES>;
71using ComputeUniformBufferSizes = std::array<u32, NUM_COMPUTE_UNIFORM_BUFFERS>;
72
73enum class ObtainBufferSynchronize : u32 {
74 NoSynchronize = 0,
75 FullSynchronize = 1,
76 SynchronizeNoDirty = 2,
77};
78
79enum class ObtainBufferOperation : u32 {
80 DoNothing = 0,
81 MarkAsWritten = 1,
82 DiscardWrite = 2,
83 MarkQuery = 3,
84};
85
86template <typename P>
87class BufferCache : public VideoCommon::ChannelSetupCaches<VideoCommon::ChannelInfo> {
88 // Page size for caching purposes.
89 // This is unrelated to the CPU page size and it can be changed as it seems optimal.
90 static constexpr u32 PAGE_BITS = 16;
91 static constexpr u64 PAGE_SIZE = u64{1} << PAGE_BITS;
92 static constexpr u32 CPU_PAGE_BITS = 12;
93 static constexpr u64 CPU_PAGE_SIZE = u64{1} << CPU_PAGE_BITS;
94
95 static constexpr bool IS_OPENGL = P::IS_OPENGL;
96 static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS =
97 P::HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS;
98 static constexpr bool HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT =
99 P::HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT;
100 static constexpr bool NEEDS_BIND_UNIFORM_INDEX = P::NEEDS_BIND_UNIFORM_INDEX;
101 static constexpr bool NEEDS_BIND_STORAGE_INDEX = P::NEEDS_BIND_STORAGE_INDEX;
102 static constexpr bool USE_MEMORY_MAPS = P::USE_MEMORY_MAPS;
103 static constexpr bool SEPARATE_IMAGE_BUFFERS_BINDINGS = P::SEPARATE_IMAGE_BUFFER_BINDINGS;
104 static constexpr bool IMPLEMENTS_ASYNC_DOWNLOADS = P::IMPLEMENTS_ASYNC_DOWNLOADS;
105
106 static constexpr BufferId NULL_BUFFER_ID{0};
107
108 static constexpr s64 DEFAULT_EXPECTED_MEMORY = 512_MiB;
109 static constexpr s64 DEFAULT_CRITICAL_MEMORY = 1_GiB;
110 static constexpr s64 TARGET_THRESHOLD = 4_GiB;
111
112 using Maxwell = Tegra::Engines::Maxwell3D::Regs;
113
114 using Runtime = typename P::Runtime;
115 using Buffer = typename P::Buffer;
116 using Async_Buffer = typename P::Async_Buffer;
117 using MemoryTracker = typename P::MemoryTracker;
118
119 using IntervalCompare = ICL_COMPARE_INSTANCE(ICL_COMPARE_DEFAULT, VAddr);
120 using IntervalInstance = ICL_INTERVAL_INSTANCE(ICL_INTERVAL_DEFAULT, VAddr, IntervalCompare);
121 using IntervalAllocator = boost::fast_pool_allocator;
122 using IntervalSet =
123 boost::icl::interval_set<VAddr, IntervalCompare, IntervalInstance, IntervalAllocator>;
124 using IntervalType = typename IntervalSet::interval_type;
125
126 struct Empty {};
127
128 struct OverlapResult {
129 std::vector<BufferId> ids;
130 VAddr begin;
131 VAddr end;
132 bool has_stream_leap = false;
133 };
134
135 struct Binding {
136 VAddr cpu_addr{};
137 u32 size{};
138 BufferId buffer_id;
139 };
140
141 struct TextureBufferBinding : Binding {
142 PixelFormat format;
143 };
144
145 static constexpr Binding NULL_BINDING{
146 .cpu_addr = 0,
147 .size = 0,
148 .buffer_id = NULL_BUFFER_ID,
149 };
150
151public:
152 static constexpr u32 DEFAULT_SKIP_CACHE_SIZE = static_cast<u32>(4_KiB);
153
154 explicit BufferCache(VideoCore::RasterizerInterface& rasterizer_,
155 Core::Memory::Memory& cpu_memory_, Runtime& runtime_);
156
157 void TickFrame();
158
159 void WriteMemory(VAddr cpu_addr, u64 size);
160
161 void CachedWriteMemory(VAddr cpu_addr, u64 size);
162
163 void DownloadMemory(VAddr cpu_addr, u64 size);
164
165 bool InlineMemory(VAddr dest_address, size_t copy_size, std::span<const u8> inlined_buffer);
166
167 void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size);
168
169 void DisableGraphicsUniformBuffer(size_t stage, u32 index);
170
171 void UpdateGraphicsBuffers(bool is_indexed);
172
173 void UpdateComputeBuffers();
174
175 void BindHostGeometryBuffers(bool is_indexed);
176
177 void BindHostStageBuffers(size_t stage);
178
179 void BindHostComputeBuffers();
180
181 void SetUniformBuffersState(const std::array<u32, NUM_STAGES>& mask,
182 const UniformBufferSizes* sizes);
183
184 void SetComputeUniformBufferState(u32 mask, const ComputeUniformBufferSizes* sizes);
185
186 void UnbindGraphicsStorageBuffers(size_t stage);
187
188 void BindGraphicsStorageBuffer(size_t stage, size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset,
189 bool is_written);
190
191 void UnbindGraphicsTextureBuffers(size_t stage);
192
193 void BindGraphicsTextureBuffer(size_t stage, size_t tbo_index, GPUVAddr gpu_addr, u32 size,
194 PixelFormat format, bool is_written, bool is_image);
195
196 void UnbindComputeStorageBuffers();
197
198 void BindComputeStorageBuffer(size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset,
199 bool is_written);
200
201 void UnbindComputeTextureBuffers();
202
203 void BindComputeTextureBuffer(size_t tbo_index, GPUVAddr gpu_addr, u32 size, PixelFormat format,
204 bool is_written, bool is_image);
205
206 [[nodiscard]] std::pair<Buffer*, u32> ObtainBuffer(GPUVAddr gpu_addr, u32 size,
207 ObtainBufferSynchronize sync_info,
208 ObtainBufferOperation post_op);
209 void FlushCachedWrites();
210
211 /// Return true when there are uncommitted buffers to be downloaded
212 [[nodiscard]] bool HasUncommittedFlushes() const noexcept;
213
214 void AccumulateFlushes();
215
216 /// Return true when the caller should wait for async downloads
217 [[nodiscard]] bool ShouldWaitAsyncFlushes() const noexcept;
218
219 /// Commit asynchronous downloads
220 void CommitAsyncFlushes();
221 void CommitAsyncFlushesHigh();
222 void CommitAsyncQueries();
223
224 /// Pop asynchronous downloads
225 void PopAsyncFlushes();
226
227 void PopAsyncQueries();
228 void PopAsyncBuffers();
229
230 bool DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 amount);
231
232 bool DMAClear(GPUVAddr src_address, u64 amount, u32 value);
233
234 /// Return true when a CPU region is modified from the GPU
235 [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size);
236
237 /// Return true when a region is registered on the cache
238 [[nodiscard]] bool IsRegionRegistered(VAddr addr, size_t size);
239
240 /// Return true when a CPU region is modified from the CPU
241 [[nodiscard]] bool IsRegionCpuModified(VAddr addr, size_t size);
242
243 void SetDrawIndirect(const Tegra::Engines::DrawManager::IndirectParams* current_draw_indirect_) {
244 current_draw_indirect = current_draw_indirect_;
245 }
246
247 [[nodiscard]] std::pair<Buffer*, u32> GetDrawIndirectCount();
248
249 [[nodiscard]] std::pair<Buffer*, u32> GetDrawIndirectBuffer();
250
251 std::recursive_mutex mutex;
252 Runtime& runtime;
253
254private:
255 template <typename Func>
256 static void ForEachEnabledBit(u32 enabled_mask, Func&& func) {
257 for (u32 index = 0; enabled_mask != 0; ++index, enabled_mask >>= 1) {
258 const int disabled_bits = std::countr_zero(enabled_mask);
259 index += disabled_bits;
260 enabled_mask >>= disabled_bits;
261 func(index);
262 }
263 }
264
265 template <typename Func>
266 void ForEachBufferInRange(VAddr cpu_addr, u64 size, Func&& func) {
267 const u64 page_end = Common::DivCeil(cpu_addr + size, PAGE_SIZE);
268 for (u64 page = cpu_addr >> PAGE_BITS; page < page_end;) {
269 const BufferId buffer_id = page_table[page];
270 if (!buffer_id) {
271 ++page;
272 continue;
273 }
274 Buffer& buffer = slot_buffers[buffer_id];
275 func(buffer_id, buffer);
276
277 const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes();
278 page = Common::DivCeil(end_addr, PAGE_SIZE);
279 }
280 }
281
282 template <typename Func>
283 void ForEachInRangeSet(IntervalSet& current_range, VAddr cpu_addr, u64 size, Func&& func) {
284 const VAddr start_address = cpu_addr;
285 const VAddr end_address = start_address + size;
286 const IntervalType search_interval{start_address, end_address};
287 auto it = current_range.lower_bound(search_interval);
288 if (it == current_range.end()) {
289 return;
290 }
291 auto end_it = current_range.upper_bound(search_interval);
292 for (; it != end_it; it++) {
293 VAddr inter_addr_end = it->upper();
294 VAddr inter_addr = it->lower();
295 if (inter_addr_end > end_address) {
296 inter_addr_end = end_address;
297 }
298 if (inter_addr < start_address) {
299 inter_addr = start_address;
300 }
301 func(inter_addr, inter_addr_end);
302 }
303 }
304
305 static bool IsRangeGranular(VAddr cpu_addr, size_t size) {
306 return (cpu_addr & ~Core::Memory::YUZU_PAGEMASK) ==
307 ((cpu_addr + size) & ~Core::Memory::YUZU_PAGEMASK);
308 }
309
310 void RunGarbageCollector();
311
312 void BindHostIndexBuffer();
313
314 void BindHostVertexBuffers();
315
316 void BindHostDrawIndirectBuffers();
317
318 void BindHostGraphicsUniformBuffers(size_t stage);
319
320 void BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 binding_index, bool needs_bind);
321
322 void BindHostGraphicsStorageBuffers(size_t stage);
323
324 void BindHostGraphicsTextureBuffers(size_t stage);
325
326 void BindHostTransformFeedbackBuffers();
327
328 void BindHostComputeUniformBuffers();
329
330 void BindHostComputeStorageBuffers();
331
332 void BindHostComputeTextureBuffers();
333
334 void DoUpdateGraphicsBuffers(bool is_indexed);
335
336 void DoUpdateComputeBuffers();
337
338 void UpdateIndexBuffer();
339
340 void UpdateVertexBuffers();
341
342 void UpdateVertexBuffer(u32 index);
343
344 void UpdateDrawIndirect();
345
346 void UpdateUniformBuffers(size_t stage);
347
348 void UpdateStorageBuffers(size_t stage);
349
350 void UpdateTextureBuffers(size_t stage);
351
352 void UpdateTransformFeedbackBuffers();
353
354 void UpdateTransformFeedbackBuffer(u32 index);
355
356 void UpdateComputeUniformBuffers();
357
358 void UpdateComputeStorageBuffers();
359
360 void UpdateComputeTextureBuffers();
361
362 void MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 size);
363
364 [[nodiscard]] BufferId FindBuffer(VAddr cpu_addr, u32 size);
365
366 [[nodiscard]] OverlapResult ResolveOverlaps(VAddr cpu_addr, u32 wanted_size);
367
368 void JoinOverlap(BufferId new_buffer_id, BufferId overlap_id, bool accumulate_stream_score);
369
370 [[nodiscard]] BufferId CreateBuffer(VAddr cpu_addr, u32 wanted_size);
371
372 void Register(BufferId buffer_id);
373
374 void Unregister(BufferId buffer_id);
375
376 template <bool insert>
377 void ChangeRegister(BufferId buffer_id);
378
379 void TouchBuffer(Buffer& buffer, BufferId buffer_id) noexcept;
380
381 bool SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size);
382
383 bool SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size);
384
385 bool SynchronizeBufferNoModified(Buffer& buffer, VAddr cpu_addr, u32 size);
386
387 void UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy,
388 std::span<BufferCopy> copies);
389
390 void ImmediateUploadMemory(Buffer& buffer, u64 largest_copy,
391 std::span<const BufferCopy> copies);
392
393 void MappedUploadMemory(Buffer& buffer, u64 total_size_bytes, std::span<BufferCopy> copies);
394
395 void DownloadBufferMemory(Buffer& buffer_id);
396
397 void DownloadBufferMemory(Buffer& buffer_id, VAddr cpu_addr, u64 size);
398
399 void DeleteBuffer(BufferId buffer_id, bool do_not_mark = false);
400
401 void NotifyBufferDeletion();
402
403 [[nodiscard]] Binding StorageBufferBinding(GPUVAddr ssbo_addr, u32 cbuf_index, bool is_written) const;
404
405 [[nodiscard]] TextureBufferBinding GetTextureBufferBinding(GPUVAddr gpu_addr, u32 size,
406 PixelFormat format);
407
408 [[nodiscard]] std::span<const u8> ImmediateBufferWithData(VAddr cpu_addr, size_t size);
409
410 [[nodiscard]] std::span<u8> ImmediateBuffer(size_t wanted_capacity);
411
412 [[nodiscard]] bool HasFastUniformBufferBound(size_t stage, u32 binding_index) const noexcept;
413
414 void ClearDownload(IntervalType subtract_interval);
415
416 VideoCore::RasterizerInterface& rasterizer;
417 Core::Memory::Memory& cpu_memory;
418
419 SlotVector<Buffer> slot_buffers;
420 DelayedDestructionRing<Buffer, 8> delayed_destruction_ring;
421
422 const Tegra::Engines::DrawManager::IndirectParams* current_draw_indirect{};
423
424 u32 last_index_count = 0;
425
426 Binding index_buffer;
427 std::array<Binding, NUM_VERTEX_BUFFERS> vertex_buffers;
428 std::array<std::array<Binding, NUM_GRAPHICS_UNIFORM_BUFFERS>, NUM_STAGES> uniform_buffers;
429 std::array<std::array<Binding, NUM_STORAGE_BUFFERS>, NUM_STAGES> storage_buffers;
430 std::array<std::array<TextureBufferBinding, NUM_TEXTURE_BUFFERS>, NUM_STAGES> texture_buffers;
431 std::array<Binding, NUM_TRANSFORM_FEEDBACK_BUFFERS> transform_feedback_buffers;
432 Binding count_buffer_binding;
433 Binding indirect_buffer_binding;
434
435 std::array<Binding, NUM_COMPUTE_UNIFORM_BUFFERS> compute_uniform_buffers;
436 std::array<Binding, NUM_STORAGE_BUFFERS> compute_storage_buffers;
437 std::array<TextureBufferBinding, NUM_TEXTURE_BUFFERS> compute_texture_buffers;
438
439 std::array<u32, NUM_STAGES> enabled_uniform_buffer_masks{};
440 u32 enabled_compute_uniform_buffer_mask = 0;
441
442 const UniformBufferSizes* uniform_buffer_sizes{};
443 const ComputeUniformBufferSizes* compute_uniform_buffer_sizes{};
444
445 std::array<u32, NUM_STAGES> enabled_storage_buffers{};
446 std::array<u32, NUM_STAGES> written_storage_buffers{};
447 u32 enabled_compute_storage_buffers = 0;
448 u32 written_compute_storage_buffers = 0;
449
450 std::array<u32, NUM_STAGES> enabled_texture_buffers{};
451 std::array<u32, NUM_STAGES> written_texture_buffers{};
452 std::array<u32, NUM_STAGES> image_texture_buffers{};
453 u32 enabled_compute_texture_buffers = 0;
454 u32 written_compute_texture_buffers = 0;
455 u32 image_compute_texture_buffers = 0;
456
457 std::array<u32, 16> uniform_cache_hits{};
458 std::array<u32, 16> uniform_cache_shots{};
459
460 u32 uniform_buffer_skip_cache_size = DEFAULT_SKIP_CACHE_SIZE;
461
462 bool has_deleted_buffers = false;
463
464 std::conditional_t<HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS, std::array<u32, NUM_STAGES>, Empty>
465 dirty_uniform_buffers{};
466 std::conditional_t<IS_OPENGL, std::array<u32, NUM_STAGES>, Empty> fast_bound_uniform_buffers{};
467 std::conditional_t<HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS,
468 std::array<std::array<u32, NUM_GRAPHICS_UNIFORM_BUFFERS>, NUM_STAGES>, Empty>
469 uniform_buffer_binding_sizes{};
470
471 std::vector<BufferId> cached_write_buffer_ids;
472
473 MemoryTracker memory_tracker;
474 IntervalSet uncommitted_ranges;
475 IntervalSet common_ranges;
476 IntervalSet cached_ranges;
477 std::deque<IntervalSet> committed_ranges;
478
479 // Async Buffers
480 std::deque<IntervalSet> async_downloads;
481 std::deque<std::optional<Async_Buffer>> async_buffers;
482 std::deque<boost::container::small_vector<BufferCopy, 4>> pending_downloads;
483 std::optional<Async_Buffer> current_buffer;
484
485 // queries
486 boost::container::small_vector<std::pair<VAddr, size_t>, 8> pending_queries;
487 std::deque<boost::container::small_vector<BufferCopy, 8>> committed_queries;
488 boost::container::small_vector<u64, 8> flushed_queries;
489 std::deque<std::optional<Async_Buffer>> query_async_buffers;
490
491 size_t immediate_buffer_capacity = 0;
492 Common::ScratchBuffer<u8> immediate_buffer_alloc;
493
494 struct LRUItemParams {
495 using ObjectType = BufferId;
496 using TickType = u64;
497 };
498 Common::LeastRecentlyUsedCache<LRUItemParams> lru_cache;
499 u64 frame_tick = 0;
500 u64 total_used_memory = 0;
501 u64 minimum_memory = 0;
502 u64 critical_memory = 0;
503
504 std::array<BufferId, ((1ULL << 39) >> PAGE_BITS)> page_table;
505};
506
507} // namespace VideoCommon
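The PAGE_BITS/PAGE_SIZE constants near the top of this header define the cache's own 64 KiB page granularity; page_table maps every such page of the 39-bit address space to a BufferId, and helpers like ForEachBufferInRange walk it with Common::DivCeil to bound the iteration. A minimal sketch of that indexing follows, with DivCeil written out locally as an assumption rather than taken from yuzu's headers.

#include <cstdint>
#include <cstdio>

constexpr uint32_t PAGE_BITS = 16;
constexpr uint64_t PAGE_SIZE = uint64_t{1} << PAGE_BITS;

constexpr uint64_t DivCeil(uint64_t value, uint64_t divisor) {
    return (value + divisor - 1) / divisor;
}

int main() {
    const uint64_t cpu_addr = 0x1234'5678;
    const uint64_t size = 0x2'0000; // 128 KiB starting mid-page spans three 64 KiB cache pages
    const uint64_t first_page = cpu_addr >> PAGE_BITS;           // slot of the first page
    const uint64_t page_end = DivCeil(cpu_addr + size, PAGE_SIZE); // one past the last slot
    std::printf("pages %llu..%llu\n",
                static_cast<unsigned long long>(first_page),
                static_cast<unsigned long long>(page_end - 1));
}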
diff --git a/src/video_core/buffer_cache/memory_tracker_base.h b/src/video_core/buffer_cache/memory_tracker_base.h
new file mode 100644
index 000000000..93bd779c9
--- /dev/null
+++ b/src/video_core/buffer_cache/memory_tracker_base.h
@@ -0,0 +1,258 @@
1// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
2// SPDX-License-Identifier: GPL-3.0-or-later
3
4#pragma once
5
6#include <algorithm>
7#include <bit>
8#include <deque>
9#include <limits>
10#include <type_traits>
11#include <unordered_set>
12#include <utility>
13
14#include "common/alignment.h"
15#include "common/common_types.h"
16#include "video_core/buffer_cache/word_manager.h"
17
18namespace VideoCommon {
19
20template <class RasterizerInterface>
21class MemoryTrackerBase {
22 static constexpr size_t MAX_CPU_PAGE_BITS = 39;
23 static constexpr size_t HIGHER_PAGE_BITS = 22;
24 static constexpr size_t HIGHER_PAGE_SIZE = 1ULL << HIGHER_PAGE_BITS;
25 static constexpr size_t HIGHER_PAGE_MASK = HIGHER_PAGE_SIZE - 1ULL;
26 static constexpr size_t NUM_HIGH_PAGES = 1ULL << (MAX_CPU_PAGE_BITS - HIGHER_PAGE_BITS);
27 static constexpr size_t MANAGER_POOL_SIZE = 32;
28 static constexpr size_t WORDS_STACK_NEEDED = HIGHER_PAGE_SIZE / BYTES_PER_WORD;
29 using Manager = WordManager<RasterizerInterface, WORDS_STACK_NEEDED>;
30
31public:
32 MemoryTrackerBase(RasterizerInterface& rasterizer_) : rasterizer{&rasterizer_} {}
33 ~MemoryTrackerBase() = default;
34
35 /// Returns the inclusive CPU modified range as a begin-end pair
36 [[nodiscard]] std::pair<u64, u64> ModifiedCpuRegion(VAddr query_cpu_addr,
37 u64 query_size) noexcept {
38 return IteratePairs<true>(query_cpu_addr, query_size,
39 [](Manager* manager, u64 offset, size_t size) {
40 return manager->ModifiedRegion<Type::CPU>(offset, size);
41 });
42 }
43
44 /// Returns the inclusive GPU modified range as a begin-end pair
45 [[nodiscard]] std::pair<u64, u64> ModifiedGpuRegion(VAddr query_cpu_addr,
46 u64 query_size) noexcept {
47 return IteratePairs<false>(query_cpu_addr, query_size,
48 [](Manager* manager, u64 offset, size_t size) {
49 return manager->ModifiedRegion<Type::GPU>(offset, size);
50 });
51 }
52
53 /// Returns true if a region has been modified from the CPU
54 [[nodiscard]] bool IsRegionCpuModified(VAddr query_cpu_addr, u64 query_size) noexcept {
55 return IteratePages<true>(query_cpu_addr, query_size,
56 [](Manager* manager, u64 offset, size_t size) {
57 return manager->IsRegionModified<Type::CPU>(offset, size);
58 });
59 }
60
61 /// Returns true if a region has been modified from the GPU
62 [[nodiscard]] bool IsRegionGpuModified(VAddr query_cpu_addr, u64 query_size) noexcept {
63 return IteratePages<false>(query_cpu_addr, query_size,
64 [](Manager* manager, u64 offset, size_t size) {
65 return manager->IsRegionModified<Type::GPU>(offset, size);
66 });
67 }
68
69 /// Mark region as CPU modified, notifying the rasterizer about this change
70 void MarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 query_size) {
71 IteratePages<true>(
72 dirty_cpu_addr, query_size, [](Manager* manager, u64 offset, size_t size) {
73 manager->ChangeRegionState<Type::CPU, true>(manager->GetCpuAddr() + offset, size);
74 });
75 }
76
77 /// Unmark region as CPU modified, notifying the rasterizer about this change
78 void UnmarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 query_size) {
79 IteratePages<true>(
80 dirty_cpu_addr, query_size, [](Manager* manager, u64 offset, size_t size) {
81 manager->ChangeRegionState<Type::CPU, false>(manager->GetCpuAddr() + offset, size);
82 });
83 }
84
85 /// Mark region as modified from the host GPU
86 void MarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 query_size) noexcept {
87 IteratePages<true>(
88 dirty_cpu_addr, query_size, [](Manager* manager, u64 offset, size_t size) {
89 manager->ChangeRegionState<Type::GPU, true>(manager->GetCpuAddr() + offset, size);
90 });
91 }
92
93 /// Unmark region as modified from the host GPU
94 void UnmarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 query_size) noexcept {
95 IteratePages<true>(
96 dirty_cpu_addr, query_size, [](Manager* manager, u64 offset, size_t size) {
97 manager->ChangeRegionState<Type::GPU, false>(manager->GetCpuAddr() + offset, size);
98 });
99 }
100
101 /// Mark region as modified from the CPU
102 /// but don't mark it as modified until FlushCachedWrites is called.
103 void CachedCpuWrite(VAddr dirty_cpu_addr, u64 query_size) {
104 IteratePages<true>(
105 dirty_cpu_addr, query_size, [this](Manager* manager, u64 offset, size_t size) {
106 const VAddr cpu_address = manager->GetCpuAddr() + offset;
107 manager->ChangeRegionState<Type::CachedCPU, true>(cpu_address, size);
108 cached_pages.insert(static_cast<u32>(cpu_address >> HIGHER_PAGE_BITS));
109 });
110 }
111
112 /// Flushes cached CPU writes and notifies the rasterizer about the deltas
113 void FlushCachedWrites(VAddr query_cpu_addr, u64 query_size) noexcept {
114 IteratePages<false>(query_cpu_addr, query_size,
115 [](Manager* manager, [[maybe_unused]] u64 offset,
116 [[maybe_unused]] size_t size) { manager->FlushCachedWrites(); });
117 }
118
119 void FlushCachedWrites() noexcept {
120 for (auto id : cached_pages) {
121 top_tier[id]->FlushCachedWrites();
122 }
123 cached_pages.clear();
124 }
125
126 /// Call 'func' for each CPU modified range and unmark those pages as CPU modified
127 template <typename Func>
128 void ForEachUploadRange(VAddr query_cpu_range, u64 query_size, Func&& func) {
129 IteratePages<true>(query_cpu_range, query_size,
130 [&func](Manager* manager, u64 offset, size_t size) {
131 manager->ForEachModifiedRange<Type::CPU>(
132 manager->GetCpuAddr() + offset, size, true, func);
133 });
134 }
135
136 /// Call 'func' for each GPU modified range and unmark those pages as GPU modified
137 template <typename Func>
138 void ForEachDownloadRange(VAddr query_cpu_range, u64 query_size, bool clear, Func&& func) {
139 IteratePages<false>(query_cpu_range, query_size,
140 [&func, clear](Manager* manager, u64 offset, size_t size) {
141 manager->ForEachModifiedRange<Type::GPU>(
142 manager->GetCpuAddr() + offset, size, clear, func);
143 });
144 }
145
146 template <typename Func>
147 void ForEachDownloadRangeAndClear(VAddr query_cpu_range, u64 query_size, Func&& func) {
148 IteratePages<false>(query_cpu_range, query_size,
149 [&func](Manager* manager, u64 offset, size_t size) {
150 manager->ForEachModifiedRange<Type::GPU>(
151 manager->GetCpuAddr() + offset, size, true, func);
152 });
153 }
154
155private:
156 template <bool create_region_on_fail, typename Func>
157 bool IteratePages(VAddr cpu_address, size_t size, Func&& func) {
158 using FuncReturn = typename std::invoke_result<Func, Manager*, u64, size_t>::type;
159 static constexpr bool BOOL_BREAK = std::is_same_v<FuncReturn, bool>;
160 std::size_t remaining_size{size};
161 std::size_t page_index{cpu_address >> HIGHER_PAGE_BITS};
162 u64 page_offset{cpu_address & HIGHER_PAGE_MASK};
163 while (remaining_size > 0) {
164 const std::size_t copy_amount{std::min(HIGHER_PAGE_SIZE - page_offset, remaining_size)};
165 auto* manager{top_tier[page_index]};
166 if (manager) {
167 if constexpr (BOOL_BREAK) {
168 if (func(manager, page_offset, copy_amount)) {
169 return true;
170 }
171 } else {
172 func(manager, page_offset, copy_amount);
173 }
174 } else if constexpr (create_region_on_fail) {
175 CreateRegion(page_index);
176 manager = top_tier[page_index];
177 if constexpr (BOOL_BREAK) {
178 if (func(manager, page_offset, copy_amount)) {
179 return true;
180 }
181 } else {
182 func(manager, page_offset, copy_amount);
183 }
184 }
185 page_index++;
186 page_offset = 0;
187 remaining_size -= copy_amount;
188 }
189 return false;
190 }
191
192 template <bool create_region_on_fail, typename Func>
193 std::pair<u64, u64> IteratePairs(VAddr cpu_address, size_t size, Func&& func) {
194 std::size_t remaining_size{size};
195 std::size_t page_index{cpu_address >> HIGHER_PAGE_BITS};
196 u64 page_offset{cpu_address & HIGHER_PAGE_MASK};
197 u64 begin = std::numeric_limits<u64>::max();
198 u64 end = 0;
199 while (remaining_size > 0) {
200 const std::size_t copy_amount{std::min(HIGHER_PAGE_SIZE - page_offset, remaining_size)};
201 auto* manager{top_tier[page_index]};
202 const auto execute = [&] {
203 auto [new_begin, new_end] = func(manager, page_offset, copy_amount);
204 if (new_begin != 0 || new_end != 0) {
205 const u64 base_address = page_index << HIGHER_PAGE_BITS;
206 begin = std::min(new_begin + base_address, begin);
207 end = std::max(new_end + base_address, end);
208 }
209 };
210 if (manager) {
211 execute();
212 } else if constexpr (create_region_on_fail) {
213 CreateRegion(page_index);
214 manager = top_tier[page_index];
215 execute();
216 }
217 page_index++;
218 page_offset = 0;
219 remaining_size -= copy_amount;
220 }
221 return begin < end ? std::make_pair(begin, end) : std::make_pair(0ULL, 0ULL);
222 }
223
224 void CreateRegion(std::size_t page_index) {
225 const VAddr base_cpu_addr = page_index << HIGHER_PAGE_BITS;
226 top_tier[page_index] = GetNewManager(base_cpu_addr);
227 }
228
229 Manager* GetNewManager(VAddr base_cpu_address) {
230 const auto on_return = [&] {
231 auto* new_manager = free_managers.front();
232 new_manager->SetCpuAddress(base_cpu_address);
233 free_managers.pop_front();
234 return new_manager;
235 };
236 if (!free_managers.empty()) {
237 return on_return();
238 }
239 manager_pool.emplace_back();
240 auto& last_pool = manager_pool.back();
241 for (size_t i = 0; i < MANAGER_POOL_SIZE; i++) {
242 new (&last_pool[i]) Manager(0, *rasterizer, HIGHER_PAGE_SIZE);
243 free_managers.push_back(&last_pool[i]);
244 }
245 return on_return();
246 }
247
248 std::deque<std::array<Manager, MANAGER_POOL_SIZE>> manager_pool;
249 std::deque<Manager*> free_managers;
250
251 std::array<Manager*, NUM_HIGH_PAGES> top_tier{};
252
253 std::unordered_set<u32> cached_pages;
254
255 RasterizerInterface* rasterizer = nullptr;
256};
257
258} // namespace VideoCommon
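
MemoryTrackerBase splits the guest address space into large regions: the high bits of a CPU address select a slot in top_tier, the low bits are the byte offset inside that region's manager, and managers are handed out lazily from pooled storage. A minimal standalone sketch of that address split follows; it is not part of the commit, and the HIGHER_PAGE_BITS value is an assumption (the real constants are defined earlier in memory_tracker_base.h):

#include <algorithm>
#include <cstddef>
#include <cstdint>

constexpr std::uint64_t HIGHER_PAGE_BITS = 22;  // assumed value for illustration only
constexpr std::uint64_t HIGHER_PAGE_SIZE = 1ULL << HIGHER_PAGE_BITS;
constexpr std::uint64_t HIGHER_PAGE_MASK = HIGHER_PAGE_SIZE - 1ULL;

struct Chunk {
    std::size_t page_index;     // index into top_tier
    std::uint64_t page_offset;  // byte offset inside that manager's region
    std::uint64_t size;         // bytes of the query served by this manager
};

// Mirrors the first iteration of IteratePages: which manager serves the start of
// a query and how many bytes of it fall inside that manager's region.
Chunk FirstChunk(std::uint64_t cpu_address, std::uint64_t query_size) {
    const auto page_index = static_cast<std::size_t>(cpu_address >> HIGHER_PAGE_BITS);
    const std::uint64_t page_offset = cpu_address & HIGHER_PAGE_MASK;
    const std::uint64_t size = std::min(HIGHER_PAGE_SIZE - page_offset, query_size);
    return {page_index, page_offset, size};
}

int main() {
    const Chunk c = FirstChunk(0x01234567ULL, 64);
    return c.page_offset == (0x01234567ULL & HIGHER_PAGE_MASK) ? 0 : 1;
}

Subsequent iterations of IteratePages simply advance page_index, reset page_offset to zero and subtract the consumed size, which is why only the first chunk needs the masked offset.
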
diff --git a/src/video_core/buffer_cache/word_manager.h b/src/video_core/buffer_cache/word_manager.h
new file mode 100644
index 000000000..782951fe7
--- /dev/null
+++ b/src/video_core/buffer_cache/word_manager.h
@@ -0,0 +1,474 @@
1// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
2// SPDX-License-Identifier: GPL-3.0-or-later
3
4#pragma once
5
6#include <algorithm>
7#include <bit>
8#include <limits>
9#include <utility>
10
11#include "common/alignment.h"
12#include "common/common_funcs.h"
13#include "common/common_types.h"
14#include "common/div_ceil.h"
15#include "core/memory.h"
16
17namespace VideoCommon {
18
19constexpr u64 PAGES_PER_WORD = 64;
20constexpr u64 BYTES_PER_PAGE = Core::Memory::YUZU_PAGESIZE;
21constexpr u64 BYTES_PER_WORD = PAGES_PER_WORD * BYTES_PER_PAGE;
22
23/// Tracks modified pages in tightly packed words, using a small-vector optimization
24template <size_t stack_words = 1>
25union WordsArray {
26 /// Returns the pointer to the words state
27 [[nodiscard]] const u64* Pointer(bool is_short) const noexcept {
28 return is_short ? stack.data() : heap;
29 }
30
31 /// Returns the pointer to the words state
32 [[nodiscard]] u64* Pointer(bool is_short) noexcept {
33 return is_short ? stack.data() : heap;
34 }
35
36 std::array<u64, stack_words> stack{}; ///< Storage for buffers small enough for the stack words
37 u64* heap; ///< Pointer to heap storage for larger buffers
38};
39
40template <size_t stack_words = 1>
41struct Words {
42 explicit Words() = default;
43 explicit Words(u64 size_bytes_) : size_bytes{size_bytes_} {
44 if (IsShort()) {
45 cpu.stack.fill(~u64{0});
46 gpu.stack.fill(0);
47 cached_cpu.stack.fill(0);
48 untracked.stack.fill(~u64{0});
49 } else {
50 const size_t num_words = NumWords();
51 // Share one allocation between the CPU, GPU, cached-CPU and untracked words and set their defaults
52 u64* const alloc = new u64[num_words * 4];
53 cpu.heap = alloc;
54 gpu.heap = alloc + num_words;
55 cached_cpu.heap = alloc + num_words * 2;
56 untracked.heap = alloc + num_words * 3;
57 std::fill_n(cpu.heap, num_words, ~u64{0});
58 std::fill_n(gpu.heap, num_words, 0);
59 std::fill_n(cached_cpu.heap, num_words, 0);
60 std::fill_n(untracked.heap, num_words, ~u64{0});
61 }
62 // Clear the trailing bits past the end of the buffer
63 const u64 last_word_size = size_bytes % BYTES_PER_WORD;
64 const u64 last_local_page = Common::DivCeil(last_word_size, BYTES_PER_PAGE);
65 const u64 shift = (PAGES_PER_WORD - last_local_page) % PAGES_PER_WORD;
66 const u64 last_word = (~u64{0} << shift) >> shift;
67 cpu.Pointer(IsShort())[NumWords() - 1] = last_word;
68 untracked.Pointer(IsShort())[NumWords() - 1] = last_word;
69 }
70
71 ~Words() {
72 Release();
73 }
74
75 Words& operator=(Words&& rhs) noexcept {
76 Release();
77 size_bytes = rhs.size_bytes;
78 cpu = rhs.cpu;
79 gpu = rhs.gpu;
80 cached_cpu = rhs.cached_cpu;
81 untracked = rhs.untracked;
82 rhs.cpu.heap = nullptr;
83 return *this;
84 }
85
86 Words(Words&& rhs) noexcept
87 : size_bytes{rhs.size_bytes}, cpu{rhs.cpu}, gpu{rhs.gpu},
88 cached_cpu{rhs.cached_cpu}, untracked{rhs.untracked} {
89 rhs.cpu.heap = nullptr;
90 }
91
92 Words& operator=(const Words&) = delete;
93 Words(const Words&) = delete;
94
95 /// Returns true when the buffer fits in the small vector optimization
96 [[nodiscard]] bool IsShort() const noexcept {
97 return size_bytes <= stack_words * BYTES_PER_WORD;
98 }
99
100 /// Returns the number of words of the buffer
101 [[nodiscard]] size_t NumWords() const noexcept {
102 return Common::DivCeil(size_bytes, BYTES_PER_WORD);
103 }
104
105 /// Release buffer resources
106 void Release() {
107 if (!IsShort()) {
108 // The CPU words array is the base of the shared heap allocation
109 delete[] cpu.heap;
110 }
111 }
112
113 u64 size_bytes = 0;
114 WordsArray<stack_words> cpu;
115 WordsArray<stack_words> gpu;
116 WordsArray<stack_words> cached_cpu;
117 WordsArray<stack_words> untracked;
118};
119
120enum class Type {
121 CPU,
122 GPU,
123 CachedCPU,
124 Untracked,
125};
126
127template <class RasterizerInterface, size_t stack_words = 1>
128class WordManager {
129public:
130 explicit WordManager(VAddr cpu_addr_, RasterizerInterface& rasterizer_, u64 size_bytes)
131 : cpu_addr{cpu_addr_}, rasterizer{&rasterizer_}, words{size_bytes} {}
132
133 explicit WordManager() = default;
134
135 void SetCpuAddress(VAddr new_cpu_addr) {
136 cpu_addr = new_cpu_addr;
137 }
138
139 VAddr GetCpuAddr() const {
140 return cpu_addr;
141 }
142
143 /**
144 * Change the state of a range of pages
145 *
146 * @param dirty_addr Base address to mark or unmark as modified
147 * @param size Size in bytes to mark or unmark as modified
148 */
149 template <Type type, bool enable>
150 void ChangeRegionState(u64 dirty_addr, s64 size) noexcept(type == Type::GPU) {
151 const s64 difference = dirty_addr - cpu_addr;
152 const u64 offset = std::max<s64>(difference, 0);
153 size += std::min<s64>(difference, 0);
154 if (offset >= SizeBytes() || size < 0) {
155 return;
156 }
157 u64* const untracked_words = Array<Type::Untracked>();
158 u64* const state_words = Array<type>();
159 const u64 offset_end = std::min(offset + size, SizeBytes());
160 const u64 begin_page_index = offset / BYTES_PER_PAGE;
161 const u64 begin_word_index = begin_page_index / PAGES_PER_WORD;
162 const u64 end_page_index = Common::DivCeil(offset_end, BYTES_PER_PAGE);
163 const u64 end_word_index = Common::DivCeil(end_page_index, PAGES_PER_WORD);
164 u64 page_index = begin_page_index % PAGES_PER_WORD;
165 u64 word_index = begin_word_index;
166 while (word_index < end_word_index) {
167 const u64 next_word_first_page = (word_index + 1) * PAGES_PER_WORD;
168 const u64 left_offset =
169 std::min(next_word_first_page - end_page_index, PAGES_PER_WORD) % PAGES_PER_WORD;
170 const u64 right_offset = page_index;
171 u64 bits = ~u64{0};
172 bits = (bits >> right_offset) << right_offset;
173 bits = (bits << left_offset) >> left_offset;
174 if constexpr (type == Type::CPU || type == Type::CachedCPU) {
175 NotifyRasterizer<!enable>(word_index, untracked_words[word_index], bits);
176 }
177 if constexpr (enable) {
178 state_words[word_index] |= bits;
179 if constexpr (type == Type::CPU || type == Type::CachedCPU) {
180 untracked_words[word_index] |= bits;
181 }
182 } else {
183 state_words[word_index] &= ~bits;
184 if constexpr (type == Type::CPU || type == Type::CachedCPU) {
185 untracked_words[word_index] &= ~bits;
186 }
187 }
188 page_index = 0;
189 ++word_index;
190 }
191 }
192
193 /**
194 * Loop over each page in the given range, turn off those bits when 'clear' is set and notify
195 * the rasterizer if needed. Call the given function on each modified range.
196 *
197 * @param query_cpu_range Base CPU address to loop over
198 * @param size Size in bytes of the CPU range to loop over
199 * @param func Function to call for each turned off region
200 */
201 template <Type type, typename Func>
202 void ForEachModifiedRange(VAddr query_cpu_range, s64 size, bool clear, Func&& func) {
203 static_assert(type != Type::Untracked);
204
205 const s64 difference = query_cpu_range - cpu_addr;
206 const u64 query_begin = std::max<s64>(difference, 0);
207 size += std::min<s64>(difference, 0);
208 if (query_begin >= SizeBytes() || size < 0) {
209 return;
210 }
211 [[maybe_unused]] u64* const untracked_words = Array<Type::Untracked>();
212 [[maybe_unused]] u64* const cpu_words = Array<Type::CPU>();
213 u64* const state_words = Array<type>();
214 const u64 query_end = query_begin + std::min(static_cast<u64>(size), SizeBytes());
215 u64* const words_begin = state_words + query_begin / BYTES_PER_WORD;
216 u64* const words_end = state_words + Common::DivCeil(query_end, BYTES_PER_WORD);
217 u64 first_page = (query_begin / BYTES_PER_PAGE) % PAGES_PER_WORD;
218
219 const auto modified = [](u64 word) { return word != 0; };
220 const auto first_modified_word = std::find_if(words_begin, words_end, modified);
221 if (first_modified_word == words_end) {
222 // Exit early when the buffer is not modified
223 return;
224 }
225 if (first_modified_word != words_begin) {
226 first_page = 0;
227 }
228 std::reverse_iterator<u64*> first_word_reverse(first_modified_word);
229 std::reverse_iterator<u64*> last_word_iterator(words_end);
230 auto last_word_result = std::find_if(last_word_iterator, first_word_reverse, modified);
231 u64* const last_modified_word = &(*last_word_result) + 1;
232
233 const u64 word_index_begin = std::distance(state_words, first_modified_word);
234 const u64 word_index_end = std::distance(state_words, last_modified_word);
235 const unsigned local_page_begin = std::countr_zero(*first_modified_word);
236 const unsigned local_page_end =
237 static_cast<unsigned>(PAGES_PER_WORD) - std::countl_zero(last_modified_word[-1]);
238 const u64 word_page_begin = word_index_begin * PAGES_PER_WORD;
239 const u64 word_page_end = (word_index_end - 1) * PAGES_PER_WORD;
240 const u64 query_page_begin = query_begin / BYTES_PER_PAGE;
241 const u64 query_page_end = Common::DivCeil(query_end, BYTES_PER_PAGE);
242 const u64 page_index_begin = std::max(word_page_begin + local_page_begin, query_page_begin);
243 const u64 page_index_end = std::min(word_page_end + local_page_end, query_page_end);
244 const u64 first_word_page_begin = page_index_begin % PAGES_PER_WORD;
245 const u64 last_word_page_end = (page_index_end - 1) % PAGES_PER_WORD + 1;
246
247 u64 page_begin = std::max(first_word_page_begin, first_page);
248 u64 current_base = 0;
249 u64 current_size = 0;
250 bool on_going = false;
251 for (u64 word_index = word_index_begin; word_index < word_index_end; ++word_index) {
252 const bool is_last_word = word_index + 1 == word_index_end;
253 const u64 page_end = is_last_word ? last_word_page_end : PAGES_PER_WORD;
254 const u64 right_offset = page_begin;
255 const u64 left_offset = PAGES_PER_WORD - page_end;
256 u64 bits = ~u64{0};
257 bits = (bits >> right_offset) << right_offset;
258 bits = (bits << left_offset) >> left_offset;
259
260 const u64 current_word = state_words[word_index] & bits;
261 if (clear) {
262 state_words[word_index] &= ~bits;
263 }
264
265 if constexpr (type == Type::CachedCPU) {
266 NotifyRasterizer<false>(word_index, untracked_words[word_index], current_word);
267 untracked_words[word_index] |= current_word;
268 cpu_words[word_index] |= current_word;
269 }
270
271 if constexpr (type == Type::CPU) {
272 const u64 current_bits = untracked_words[word_index] & bits;
273 untracked_words[word_index] &= ~bits;
274 NotifyRasterizer<true>(word_index, current_bits, ~u64{0});
275 }
276 const u64 word = current_word;
277 u64 page = page_begin;
278 page_begin = 0;
279
280 while (page < page_end) {
281 const int empty_bits = std::countr_zero(word >> page);
282 if (on_going && empty_bits != 0) {
283 InvokeModifiedRange(func, current_size, current_base);
284 current_size = 0;
285 on_going = false;
286 }
287 if (empty_bits == PAGES_PER_WORD) {
288 break;
289 }
290 page += empty_bits;
291
292 const int continuous_bits = std::countr_one(word >> page);
293 if (!on_going && continuous_bits != 0) {
294 current_base = word_index * PAGES_PER_WORD + page;
295 on_going = true;
296 }
297 current_size += continuous_bits;
298 page += continuous_bits;
299 }
300 }
301 if (on_going && current_size > 0) {
302 InvokeModifiedRange(func, current_size, current_base);
303 }
304 }
305
306 template <typename Func>
307 void InvokeModifiedRange(Func&& func, u64 current_size, u64 current_base) {
308 const u64 current_size_bytes = current_size * BYTES_PER_PAGE;
309 const u64 offset_begin = current_base * BYTES_PER_PAGE;
310 const u64 offset_end = std::min(offset_begin + current_size_bytes, SizeBytes());
311 func(cpu_addr + offset_begin, offset_end - offset_begin);
312 }
313
314 /**
315 * Returns true when a region has been modified
316 *
317 * @param offset Offset in bytes from the start of the buffer
318 * @param size Size in bytes of the region to query for modifications
319 */
320 template <Type type>
321 [[nodiscard]] bool IsRegionModified(u64 offset, u64 size) const noexcept {
322 static_assert(type != Type::Untracked);
323
324 const u64* const state_words = Array<type>();
325 const u64 num_query_words = size / BYTES_PER_WORD + 1;
326 const u64 word_begin = offset / BYTES_PER_WORD;
327 const u64 word_end = std::min(word_begin + num_query_words, NumWords());
328 const u64 page_limit = Common::DivCeil(offset + size, BYTES_PER_PAGE);
329 u64 page_index = (offset / BYTES_PER_PAGE) % PAGES_PER_WORD;
330 for (u64 word_index = word_begin; word_index < word_end; ++word_index, page_index = 0) {
331 const u64 word = state_words[word_index];
332 if (word == 0) {
333 continue;
334 }
335 const u64 page_end = std::min((word_index + 1) * PAGES_PER_WORD, page_limit);
336 const u64 local_page_end = page_end % PAGES_PER_WORD;
337 const u64 page_end_shift = (PAGES_PER_WORD - local_page_end) % PAGES_PER_WORD;
338 if (((word >> page_index) << page_index) << page_end_shift != 0) {
339 return true;
340 }
341 }
342 return false;
343 }
344
345 /**
346 * Returns a begin/end pair of byte offsets covering the modified region
347 *
348 * @param offset Offset in bytes from the start of the buffer
349 * @param size Size in bytes of the region to query for modifications
350 */
351 template <Type type>
352 [[nodiscard]] std::pair<u64, u64> ModifiedRegion(u64 offset, u64 size) const noexcept {
353 static_assert(type != Type::Untracked);
354 const u64* const state_words = Array<type>();
355 const u64 num_query_words = size / BYTES_PER_WORD + 1;
356 const u64 word_begin = offset / BYTES_PER_WORD;
357 const u64 word_end = std::min(word_begin + num_query_words, NumWords());
358 const u64 page_base = offset / BYTES_PER_PAGE;
359 u64 page_begin = page_base & (PAGES_PER_WORD - 1);
360 u64 page_end =
361 Common::DivCeil(offset + size, BYTES_PER_PAGE) - (page_base & ~(PAGES_PER_WORD - 1));
362 u64 begin = std::numeric_limits<u64>::max();
363 u64 end = 0;
364 for (u64 word_index = word_begin; word_index < word_end; ++word_index) {
365 const u64 base_mask = (1ULL << page_begin) - 1ULL;
366 const u64 end_mask = page_end >= PAGES_PER_WORD ? 0ULL : ~((1ULL << page_end) - 1ULL);
367 const u64 off_word = end_mask | base_mask;
368 const u64 word = state_words[word_index] & ~off_word;
369 if (word == 0) {
370 page_begin = 0;
371 page_end -= PAGES_PER_WORD;
372 continue;
373 }
374 const u64 local_page_begin = std::countr_zero(word);
375 const u64 local_page_end = PAGES_PER_WORD - std::countl_zero(word);
376 const u64 page_index = word_index * PAGES_PER_WORD;
377 begin = std::min(begin, page_index + local_page_begin);
378 end = page_index + local_page_end;
379 page_begin = 0;
380 page_end -= PAGES_PER_WORD;
381 }
382 static constexpr std::pair<u64, u64> EMPTY{0, 0};
383 return begin < end ? std::make_pair(begin * BYTES_PER_PAGE, end * BYTES_PER_PAGE) : EMPTY;
384 }
385
386 /// Returns the number of words of the manager
387 [[nodiscard]] size_t NumWords() const noexcept {
388 return words.NumWords();
389 }
390
391 /// Returns the size in bytes of the manager
392 [[nodiscard]] u64 SizeBytes() const noexcept {
393 return words.size_bytes;
394 }
395
396 /// Returns true when the buffer fits in the small vector optimization
397 [[nodiscard]] bool IsShort() const noexcept {
398 return words.IsShort();
399 }
400
401 void FlushCachedWrites() noexcept {
402 const u64 num_words = NumWords();
403 u64* const cached_words = Array<Type::CachedCPU>();
404 u64* const untracked_words = Array<Type::Untracked>();
405 u64* const cpu_words = Array<Type::CPU>();
406 for (u64 word_index = 0; word_index < num_words; ++word_index) {
407 const u64 cached_bits = cached_words[word_index];
408 NotifyRasterizer<false>(word_index, untracked_words[word_index], cached_bits);
409 untracked_words[word_index] |= cached_bits;
410 cpu_words[word_index] |= cached_bits;
411 cached_words[word_index] = 0;
412 }
413 }
414
415private:
416 template <Type type>
417 u64* Array() noexcept {
418 if constexpr (type == Type::CPU) {
419 return words.cpu.Pointer(IsShort());
420 } else if constexpr (type == Type::GPU) {
421 return words.gpu.Pointer(IsShort());
422 } else if constexpr (type == Type::CachedCPU) {
423 return words.cached_cpu.Pointer(IsShort());
424 } else if constexpr (type == Type::Untracked) {
425 return words.untracked.Pointer(IsShort());
426 }
427 }
428
429 template <Type type>
430 const u64* Array() const noexcept {
431 if constexpr (type == Type::CPU) {
432 return words.cpu.Pointer(IsShort());
433 } else if constexpr (type == Type::GPU) {
434 return words.gpu.Pointer(IsShort());
435 } else if constexpr (type == Type::CachedCPU) {
436 return words.cached_cpu.Pointer(IsShort());
437 } else if constexpr (type == Type::Untracked) {
438 return words.untracked.Pointer(IsShort());
439 }
440 }
441
442 /**
443 * Notify rasterizer about changes in the CPU tracking state of a word in the buffer
444 *
445 * @param word_index Index to the word to notify to the rasterizer
446 * @param current_bits Current state of the word
447 * @param new_bits New state of the word
448 *
449 * @tparam add_to_rasterizer True when the rasterizer should start tracking the new pages
450 */
451 template <bool add_to_rasterizer>
452 void NotifyRasterizer(u64 word_index, u64 current_bits, u64 new_bits) const {
453 u64 changed_bits = (add_to_rasterizer ? current_bits : ~current_bits) & new_bits;
454 VAddr addr = cpu_addr + word_index * BYTES_PER_WORD;
455 while (changed_bits != 0) {
456 const int empty_bits = std::countr_zero(changed_bits);
457 addr += empty_bits * BYTES_PER_PAGE;
458 changed_bits >>= empty_bits;
459
460 const u32 continuous_bits = std::countr_one(changed_bits);
461 const u64 size = continuous_bits * BYTES_PER_PAGE;
462 const VAddr begin_addr = addr;
463 addr += size;
464 changed_bits = continuous_bits < PAGES_PER_WORD ? (changed_bits >> continuous_bits) : 0;
465 rasterizer->UpdatePagesCachedCount(begin_addr, size, add_to_rasterizer ? 1 : -1);
466 }
467 }
468
469 VAddr cpu_addr = 0;
470 RasterizerInterface* rasterizer = nullptr;
471 Words<stack_words> words;
472};
473
474} // namespace VideoCommon
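
Both ChangeRegionState and ForEachModifiedRange carve their per-word page mask out of ~0 with two shift round-trips. The standalone helper below (an illustrative sketch, not code from the commit) builds the same mask for pages [first_page, last_page) of one 64-page word:

#include <cstdint>

// Returns a mask with bit i set for every page i in [first_page, last_page),
// using the same double shift as ChangeRegionState. Requires
// 0 <= first_page < last_page <= 64 so that no shift amount reaches 64.
std::uint64_t PageRangeMask(std::uint64_t first_page, std::uint64_t last_page) {
    const std::uint64_t right_offset = first_page;     // pages below the range
    const std::uint64_t left_offset = 64 - last_page;  // pages at or above the range
    std::uint64_t bits = ~std::uint64_t{0};
    bits = (bits >> right_offset) << right_offset;     // clear the bits below first_page
    bits = (bits << left_offset) >> left_offset;       // clear the bits at and above last_page
    return bits;
}

int main() {
    // Pages 2, 3 and 4 of the word are selected: 0b11100.
    return PageRangeMask(2, 5) == 0b11100 ? 0 : 1;
}

The word manager then ORs this mask into, or ANDs it out of, the selected state words, and hands the affected bits to NotifyRasterizer whenever the untracked state of a page changes.
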
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h
index a8c3f8b67..18d3c3ac0 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.h
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.h
@@ -8,6 +8,7 @@
8 8
9#include "common/common_types.h" 9#include "common/common_types.h"
10#include "video_core/buffer_cache/buffer_cache.h" 10#include "video_core/buffer_cache/buffer_cache.h"
11#include "video_core/buffer_cache/memory_tracker_base.h"
11#include "video_core/rasterizer_interface.h" 12#include "video_core/rasterizer_interface.h"
12#include "video_core/renderer_opengl/gl_device.h" 13#include "video_core/renderer_opengl/gl_device.h"
13#include "video_core/renderer_opengl/gl_resource_manager.h" 14#include "video_core/renderer_opengl/gl_resource_manager.h"
@@ -200,6 +201,8 @@ private:
200struct BufferCacheParams { 201struct BufferCacheParams {
201 using Runtime = OpenGL::BufferCacheRuntime; 202 using Runtime = OpenGL::BufferCacheRuntime;
202 using Buffer = OpenGL::Buffer; 203 using Buffer = OpenGL::Buffer;
204 using Async_Buffer = u32;
205 using MemoryTracker = VideoCommon::MemoryTrackerBase<VideoCore::RasterizerInterface>;
203 206
204 static constexpr bool IS_OPENGL = true; 207 static constexpr bool IS_OPENGL = true;
205 static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS = true; 208 static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS = true;
@@ -208,6 +211,7 @@ struct BufferCacheParams {
208 static constexpr bool NEEDS_BIND_STORAGE_INDEX = true; 211 static constexpr bool NEEDS_BIND_STORAGE_INDEX = true;
209 static constexpr bool USE_MEMORY_MAPS = false; 212 static constexpr bool USE_MEMORY_MAPS = false;
210 static constexpr bool SEPARATE_IMAGE_BUFFER_BINDINGS = true; 213 static constexpr bool SEPARATE_IMAGE_BUFFER_BINDINGS = true;
214 static constexpr bool IMPLEMENTS_ASYNC_DOWNLOADS = false;
211}; 215};
212 216
213using BufferCache = VideoCommon::BufferCache<BufferCacheParams>; 217using BufferCache = VideoCommon::BufferCache<BufferCacheParams>;
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache_base.cpp b/src/video_core/renderer_opengl/gl_buffer_cache_base.cpp
new file mode 100644
index 000000000..f15ae8e25
--- /dev/null
+++ b/src/video_core/renderer_opengl/gl_buffer_cache_base.cpp
@@ -0,0 +1,9 @@
1// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
2// SPDX-License-Identifier: GPL-3.0-or-later
3
4#include "video_core/buffer_cache/buffer_cache.h"
5#include "video_core/renderer_opengl/gl_buffer_cache.h"
6
7namespace VideoCommon {
8template class VideoCommon::BufferCache<OpenGL::BufferCacheParams>;
9}
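
gl_buffer_cache_base.cpp and its Vulkan counterpart exist so the large BufferCache<Params> template is instantiated once per backend instead of in every translation unit that includes buffer_cache.h. The generic sketch below shows the explicit-instantiation pattern they follow; Widget and the file names are made-up examples, and whether the yuzu headers also carry matching extern template declarations is not visible in this hunk:

#include <iostream>

// widget.h -- the template definition plus an explicit instantiation declaration,
// so translation units that include it do not instantiate Widget<int> themselves.
template <typename T>
struct Widget {
    T value{};
    T Get() const { return value; }
};
extern template struct Widget<int>;

// widget_int.cpp -- the one place the instantiation is actually emitted,
// mirroring gl_buffer_cache_base.cpp / vk_buffer_cache_base.cpp.
template struct Widget<int>;

int main() {
    Widget<int> w{42};
    std::cout << w.Get() << '\n';  // prints 42
}

Splitting the instantiation out this way mainly buys shorter rebuilds: editing a renderer source file no longer re-instantiates the whole buffer cache template.
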
diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
index 9cbcb3c8f..510602e8e 100644
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
@@ -314,8 +314,12 @@ StagingBufferRef BufferCacheRuntime::UploadStagingBuffer(size_t size) {
314 return staging_pool.Request(size, MemoryUsage::Upload); 314 return staging_pool.Request(size, MemoryUsage::Upload);
315} 315}
316 316
317StagingBufferRef BufferCacheRuntime::DownloadStagingBuffer(size_t size) { 317StagingBufferRef BufferCacheRuntime::DownloadStagingBuffer(size_t size, bool deferred) {
318 return staging_pool.Request(size, MemoryUsage::Download); 318 return staging_pool.Request(size, MemoryUsage::Download, deferred);
319}
320
321void BufferCacheRuntime::FreeDeferredStagingBuffer(StagingBufferRef& ref) {
322 staging_pool.FreeDeferred(ref);
319} 323}
320 324
321u64 BufferCacheRuntime::GetDeviceLocalMemory() const { 325u64 BufferCacheRuntime::GetDeviceLocalMemory() const {
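
The new deferred flag pairs DownloadStagingBuffer with FreeDeferredStagingBuffer: a buffer requested for an asynchronous readback is no longer recycled automatically by the pool, but only when the caller releases it. A toy, non-Vulkan sketch of that ownership handshake (every name below is illustrative; only the Request/FreeDeferred split mirrors the API added in this diff):

#include <cstddef>
#include <vector>

struct Ref {
    std::size_t index = 0;  // identifies the pool slot backing this request
};

class ToyStagingPool {
public:
    // deferred == false: the slot is recycled automatically on the next Tick().
    // deferred == true : the slot stays reserved until FreeDeferred() is called.
    Ref Request(std::size_t size, bool deferred) {
        buffers.emplace_back(size);
        const Ref ref{buffers.size() - 1};
        if (!deferred) {
            pending_recycle.push_back(ref.index);
        }
        return ref;
    }

    void FreeDeferred(const Ref& ref) {
        recycled.push_back(ref.index);
    }

    void Tick() {  // recycle everything that was not requested as deferred
        recycled.insert(recycled.end(), pending_recycle.begin(), pending_recycle.end());
        pending_recycle.clear();
    }

private:
    std::vector<std::vector<std::byte>> buffers;
    std::vector<std::size_t> pending_recycle;
    std::vector<std::size_t> recycled;
};

int main() {
    ToyStagingPool pool;
    const Ref download = pool.Request(4096, /*deferred=*/true);  // asynchronous readback
    pool.Tick();                  // the deferred slot survives the tick
    pool.FreeDeferred(download);  // released only once the readback has completed
}

This is presumably what lets the Vulkan backend, which sets IMPLEMENTS_ASYNC_DOWNLOADS, keep download staging memory alive until the buffer cache has copied the results back to guest memory.
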
diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.h b/src/video_core/renderer_vulkan/vk_buffer_cache.h
index 183b33632..05968e6a6 100644
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.h
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h
@@ -4,6 +4,7 @@
4#pragma once 4#pragma once
5 5
6#include "video_core/buffer_cache/buffer_cache.h" 6#include "video_core/buffer_cache/buffer_cache.h"
7#include "video_core/buffer_cache/memory_tracker_base.h"
7#include "video_core/engines/maxwell_3d.h" 8#include "video_core/engines/maxwell_3d.h"
8#include "video_core/renderer_vulkan/vk_compute_pass.h" 9#include "video_core/renderer_vulkan/vk_compute_pass.h"
9#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" 10#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h"
@@ -75,7 +76,9 @@ public:
75 76
76 [[nodiscard]] StagingBufferRef UploadStagingBuffer(size_t size); 77 [[nodiscard]] StagingBufferRef UploadStagingBuffer(size_t size);
77 78
78 [[nodiscard]] StagingBufferRef DownloadStagingBuffer(size_t size); 79 [[nodiscard]] StagingBufferRef DownloadStagingBuffer(size_t size, bool deferred = false);
80
81 void FreeDeferredStagingBuffer(StagingBufferRef& ref);
79 82
80 void PreCopyBarrier(); 83 void PreCopyBarrier();
81 84
@@ -142,6 +145,8 @@ private:
142struct BufferCacheParams { 145struct BufferCacheParams {
143 using Runtime = Vulkan::BufferCacheRuntime; 146 using Runtime = Vulkan::BufferCacheRuntime;
144 using Buffer = Vulkan::Buffer; 147 using Buffer = Vulkan::Buffer;
148 using Async_Buffer = Vulkan::StagingBufferRef;
149 using MemoryTracker = VideoCommon::MemoryTrackerBase<VideoCore::RasterizerInterface>;
145 150
146 static constexpr bool IS_OPENGL = false; 151 static constexpr bool IS_OPENGL = false;
147 static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS = false; 152 static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS = false;
@@ -150,6 +155,7 @@ struct BufferCacheParams {
150 static constexpr bool NEEDS_BIND_STORAGE_INDEX = false; 155 static constexpr bool NEEDS_BIND_STORAGE_INDEX = false;
151 static constexpr bool USE_MEMORY_MAPS = true; 156 static constexpr bool USE_MEMORY_MAPS = true;
152 static constexpr bool SEPARATE_IMAGE_BUFFER_BINDINGS = false; 157 static constexpr bool SEPARATE_IMAGE_BUFFER_BINDINGS = false;
158 static constexpr bool IMPLEMENTS_ASYNC_DOWNLOADS = true;
153}; 159};
154 160
155using BufferCache = VideoCommon::BufferCache<BufferCacheParams>; 161using BufferCache = VideoCommon::BufferCache<BufferCacheParams>;
diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache_base.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache_base.cpp
new file mode 100644
index 000000000..f9e271507
--- /dev/null
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache_base.cpp
@@ -0,0 +1,9 @@
1// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
2// SPDX-License-Identifier: GPL-2.0-or-later
3
4#include "video_core/buffer_cache/buffer_cache.h"
5#include "video_core/renderer_vulkan/vk_buffer_cache.h"
6
7namespace VideoCommon {
8template class VideoCommon::BufferCache<Vulkan::BufferCacheParams>;
9}