path: root/src/video_core/buffer_cache
author      bunnei      2021-02-12 22:22:18 -0800
committer   GitHub      2021-02-12 22:22:18 -0800
commit      d3c7a7e7cf4bcabb171c98fe55e6e0291f8ee980 (patch)
tree        5c900d10847e1768a4951c1e6bec35f2618b5991 /src/video_core/buffer_cache
parent      Merge pull request #5877 from ameerj/res-limit-usage (diff)
parent      config: Make high GPU accuracy the default (diff)
Merge pull request #5741 from ReinUsesLisp/new-bufcache
video_core: Reimplement the buffer cache
Diffstat (limited to 'src/video_core/buffer_cache')
-rw-r--r--   src/video_core/buffer_cache/buffer_base.h     217
-rw-r--r--   src/video_core/buffer_cache/buffer_block.h     62
-rw-r--r--   src/video_core/buffer_cache/buffer_cache.cpp   13
-rw-r--r--   src/video_core/buffer_cache/buffer_cache.h   1656
-rw-r--r--   src/video_core/buffer_cache/map_interval.cpp   33
-rw-r--r--   src/video_core/buffer_cache/map_interval.h     93
6 files changed, 1346 insertions, 728 deletions
diff --git a/src/video_core/buffer_cache/buffer_base.h b/src/video_core/buffer_cache/buffer_base.h
index ee8602ce9..0c00ae280 100644
--- a/src/video_core/buffer_cache/buffer_base.h
+++ b/src/video_core/buffer_cache/buffer_base.h
@@ -19,6 +19,7 @@ namespace VideoCommon {
19 19
20enum class BufferFlagBits { 20enum class BufferFlagBits {
21 Picked = 1 << 0, 21 Picked = 1 << 0,
22 CachedWrites = 1 << 1,
22}; 23};
23DECLARE_ENUM_FLAG_OPERATORS(BufferFlagBits) 24DECLARE_ENUM_FLAG_OPERATORS(BufferFlagBits)
24 25
@@ -40,7 +41,7 @@ class BufferBase {
40 static constexpr u64 BYTES_PER_WORD = PAGES_PER_WORD * BYTES_PER_PAGE; 41 static constexpr u64 BYTES_PER_WORD = PAGES_PER_WORD * BYTES_PER_PAGE;
41 42
42 /// Vector tracking modified pages tightly packed with small vector optimization 43 /// Vector tracking modified pages tightly packed with small vector optimization
43 union WrittenWords { 44 union WordsArray {
44 /// Returns the pointer to the words state 45 /// Returns the pointer to the words state
45 [[nodiscard]] const u64* Pointer(bool is_short) const noexcept { 46 [[nodiscard]] const u64* Pointer(bool is_short) const noexcept {
46 return is_short ? &stack : heap; 47 return is_short ? &stack : heap;
@@ -55,49 +56,59 @@ class BufferBase {
55 u64* heap; ///< Not-small buffers pointer to the storage 56 u64* heap; ///< Not-small buffers pointer to the storage
56 }; 57 };
57 58
58 struct GpuCpuWords { 59 struct Words {
59 explicit GpuCpuWords() = default; 60 explicit Words() = default;
60 explicit GpuCpuWords(u64 size_bytes_) : size_bytes{size_bytes_} { 61 explicit Words(u64 size_bytes_) : size_bytes{size_bytes_} {
61 if (IsShort()) { 62 if (IsShort()) {
62 cpu.stack = ~u64{0}; 63 cpu.stack = ~u64{0};
63 gpu.stack = 0; 64 gpu.stack = 0;
65 cached_cpu.stack = 0;
66 untracked.stack = ~u64{0};
64 } else { 67 } else {
65 // Share allocation between CPU and GPU pages and set their default values 68 // Share allocation between CPU and GPU pages and set their default values
66 const size_t num_words = NumWords(); 69 const size_t num_words = NumWords();
67 u64* const alloc = new u64[num_words * 2]; 70 u64* const alloc = new u64[num_words * 4];
68 cpu.heap = alloc; 71 cpu.heap = alloc;
69 gpu.heap = alloc + num_words; 72 gpu.heap = alloc + num_words;
73 cached_cpu.heap = alloc + num_words * 2;
74 untracked.heap = alloc + num_words * 3;
70 std::fill_n(cpu.heap, num_words, ~u64{0}); 75 std::fill_n(cpu.heap, num_words, ~u64{0});
71 std::fill_n(gpu.heap, num_words, 0); 76 std::fill_n(gpu.heap, num_words, 0);
77 std::fill_n(cached_cpu.heap, num_words, 0);
78 std::fill_n(untracked.heap, num_words, ~u64{0});
72 } 79 }
 73 // Clean up trailing bits 80 // Clean up trailing bits
74 const u64 last_local_page = 81 const u64 last_word_size = size_bytes % BYTES_PER_WORD;
75 Common::DivCeil(size_bytes % BYTES_PER_WORD, BYTES_PER_PAGE); 82 const u64 last_local_page = Common::DivCeil(last_word_size, BYTES_PER_PAGE);
76 const u64 shift = (PAGES_PER_WORD - last_local_page) % PAGES_PER_WORD; 83 const u64 shift = (PAGES_PER_WORD - last_local_page) % PAGES_PER_WORD;
77 u64& last_word = cpu.Pointer(IsShort())[NumWords() - 1]; 84 const u64 last_word = (~u64{0} << shift) >> shift;
78 last_word = (last_word << shift) >> shift; 85 cpu.Pointer(IsShort())[NumWords() - 1] = last_word;
86 untracked.Pointer(IsShort())[NumWords() - 1] = last_word;
79 } 87 }
80 88
81 ~GpuCpuWords() { 89 ~Words() {
82 Release(); 90 Release();
83 } 91 }
84 92
85 GpuCpuWords& operator=(GpuCpuWords&& rhs) noexcept { 93 Words& operator=(Words&& rhs) noexcept {
86 Release(); 94 Release();
87 size_bytes = rhs.size_bytes; 95 size_bytes = rhs.size_bytes;
88 cpu = rhs.cpu; 96 cpu = rhs.cpu;
89 gpu = rhs.gpu; 97 gpu = rhs.gpu;
98 cached_cpu = rhs.cached_cpu;
99 untracked = rhs.untracked;
90 rhs.cpu.heap = nullptr; 100 rhs.cpu.heap = nullptr;
91 return *this; 101 return *this;
92 } 102 }
93 103
94 GpuCpuWords(GpuCpuWords&& rhs) noexcept 104 Words(Words&& rhs) noexcept
95 : size_bytes{rhs.size_bytes}, cpu{rhs.cpu}, gpu{rhs.gpu} { 105 : size_bytes{rhs.size_bytes}, cpu{rhs.cpu}, gpu{rhs.gpu},
106 cached_cpu{rhs.cached_cpu}, untracked{rhs.untracked} {
96 rhs.cpu.heap = nullptr; 107 rhs.cpu.heap = nullptr;
97 } 108 }
98 109
99 GpuCpuWords& operator=(const GpuCpuWords&) = delete; 110 Words& operator=(const Words&) = delete;
100 GpuCpuWords(const GpuCpuWords&) = delete; 111 Words(const Words&) = delete;
101 112
102 /// Returns true when the buffer fits in the small vector optimization 113 /// Returns true when the buffer fits in the small vector optimization
103 [[nodiscard]] bool IsShort() const noexcept { 114 [[nodiscard]] bool IsShort() const noexcept {
@@ -118,8 +129,17 @@ class BufferBase {
118 } 129 }
119 130
120 u64 size_bytes = 0; 131 u64 size_bytes = 0;
121 WrittenWords cpu; 132 WordsArray cpu;
122 WrittenWords gpu; 133 WordsArray gpu;
134 WordsArray cached_cpu;
135 WordsArray untracked;
136 };
137
138 enum class Type {
139 CPU,
140 GPU,
141 CachedCPU,
142 Untracked,
123 }; 143 };
124 144
125public: 145public:
@@ -132,68 +152,93 @@ public:
132 BufferBase& operator=(const BufferBase&) = delete; 152 BufferBase& operator=(const BufferBase&) = delete;
133 BufferBase(const BufferBase&) = delete; 153 BufferBase(const BufferBase&) = delete;
134 154
155 BufferBase& operator=(BufferBase&&) = default;
156 BufferBase(BufferBase&&) = default;
157
135 /// Returns the inclusive CPU modified range in a begin end pair 158 /// Returns the inclusive CPU modified range in a begin end pair
136 [[nodiscard]] std::pair<u64, u64> ModifiedCpuRegion(VAddr query_cpu_addr, 159 [[nodiscard]] std::pair<u64, u64> ModifiedCpuRegion(VAddr query_cpu_addr,
137 u64 query_size) const noexcept { 160 u64 query_size) const noexcept {
138 const u64 offset = query_cpu_addr - cpu_addr; 161 const u64 offset = query_cpu_addr - cpu_addr;
139 return ModifiedRegion<false>(offset, query_size); 162 return ModifiedRegion<Type::CPU>(offset, query_size);
140 } 163 }
141 164
142 /// Returns the inclusive GPU modified range in a begin end pair 165 /// Returns the inclusive GPU modified range in a begin end pair
143 [[nodiscard]] std::pair<u64, u64> ModifiedGpuRegion(VAddr query_cpu_addr, 166 [[nodiscard]] std::pair<u64, u64> ModifiedGpuRegion(VAddr query_cpu_addr,
144 u64 query_size) const noexcept { 167 u64 query_size) const noexcept {
145 const u64 offset = query_cpu_addr - cpu_addr; 168 const u64 offset = query_cpu_addr - cpu_addr;
146 return ModifiedRegion<true>(offset, query_size); 169 return ModifiedRegion<Type::GPU>(offset, query_size);
147 } 170 }
148 171
149 /// Returns true if a region has been modified from the CPU 172 /// Returns true if a region has been modified from the CPU
150 [[nodiscard]] bool IsRegionCpuModified(VAddr query_cpu_addr, u64 query_size) const noexcept { 173 [[nodiscard]] bool IsRegionCpuModified(VAddr query_cpu_addr, u64 query_size) const noexcept {
151 const u64 offset = query_cpu_addr - cpu_addr; 174 const u64 offset = query_cpu_addr - cpu_addr;
152 return IsRegionModified<false>(offset, query_size); 175 return IsRegionModified<Type::CPU>(offset, query_size);
153 } 176 }
154 177
155 /// Returns true if a region has been modified from the GPU 178 /// Returns true if a region has been modified from the GPU
156 [[nodiscard]] bool IsRegionGpuModified(VAddr query_cpu_addr, u64 query_size) const noexcept { 179 [[nodiscard]] bool IsRegionGpuModified(VAddr query_cpu_addr, u64 query_size) const noexcept {
157 const u64 offset = query_cpu_addr - cpu_addr; 180 const u64 offset = query_cpu_addr - cpu_addr;
158 return IsRegionModified<true>(offset, query_size); 181 return IsRegionModified<Type::GPU>(offset, query_size);
159 } 182 }
160 183
161 /// Mark region as CPU modified, notifying the rasterizer about this change 184 /// Mark region as CPU modified, notifying the rasterizer about this change
162 void MarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 size) { 185 void MarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 size) {
163 ChangeRegionState<true, true>(words.cpu, dirty_cpu_addr, size); 186 ChangeRegionState<Type::CPU, true>(dirty_cpu_addr, size);
164 } 187 }
165 188
166 /// Unmark region as CPU modified, notifying the rasterizer about this change 189 /// Unmark region as CPU modified, notifying the rasterizer about this change
167 void UnmarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 size) { 190 void UnmarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 size) {
168 ChangeRegionState<false, true>(words.cpu, dirty_cpu_addr, size); 191 ChangeRegionState<Type::CPU, false>(dirty_cpu_addr, size);
169 } 192 }
170 193
171 /// Mark region as modified from the host GPU 194 /// Mark region as modified from the host GPU
172 void MarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 size) noexcept { 195 void MarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 size) noexcept {
173 ChangeRegionState<true, false>(words.gpu, dirty_cpu_addr, size); 196 ChangeRegionState<Type::GPU, true>(dirty_cpu_addr, size);
174 } 197 }
175 198
176 /// Unmark region as modified from the host GPU 199 /// Unmark region as modified from the host GPU
177 void UnmarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 size) noexcept { 200 void UnmarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 size) noexcept {
178 ChangeRegionState<false, false>(words.gpu, dirty_cpu_addr, size); 201 ChangeRegionState<Type::GPU, false>(dirty_cpu_addr, size);
202 }
203
204 /// Mark region as modified from the CPU
 205 /// but don't mark it as modified until FlushCachedWrites is called.
206 void CachedCpuWrite(VAddr dirty_cpu_addr, u64 size) {
207 flags |= BufferFlagBits::CachedWrites;
208 ChangeRegionState<Type::CachedCPU, true>(dirty_cpu_addr, size);
209 }
210
 211 /// Flushes cached CPU writes and notifies the rasterizer about the deltas
212 void FlushCachedWrites() noexcept {
213 flags &= ~BufferFlagBits::CachedWrites;
214 const u64 num_words = NumWords();
215 const u64* const cached_words = Array<Type::CachedCPU>();
216 u64* const untracked_words = Array<Type::Untracked>();
217 u64* const cpu_words = Array<Type::CPU>();
218 for (u64 word_index = 0; word_index < num_words; ++word_index) {
219 const u64 cached_bits = cached_words[word_index];
220 NotifyRasterizer<false>(word_index, untracked_words[word_index], cached_bits);
221 untracked_words[word_index] |= cached_bits;
222 cpu_words[word_index] |= cached_bits;
223 }
179 } 224 }
180 225
181 /// Call 'func' for each CPU modified range and unmark those pages as CPU modified 226 /// Call 'func' for each CPU modified range and unmark those pages as CPU modified
182 template <typename Func> 227 template <typename Func>
183 void ForEachUploadRange(VAddr query_cpu_range, u64 size, Func&& func) { 228 void ForEachUploadRange(VAddr query_cpu_range, u64 size, Func&& func) {
184 ForEachModifiedRange<false, true>(query_cpu_range, size, func); 229 ForEachModifiedRange<Type::CPU>(query_cpu_range, size, func);
185 } 230 }
186 231
187 /// Call 'func' for each GPU modified range and unmark those pages as GPU modified 232 /// Call 'func' for each GPU modified range and unmark those pages as GPU modified
188 template <typename Func> 233 template <typename Func>
189 void ForEachDownloadRange(VAddr query_cpu_range, u64 size, Func&& func) { 234 void ForEachDownloadRange(VAddr query_cpu_range, u64 size, Func&& func) {
190 ForEachModifiedRange<true, false>(query_cpu_range, size, func); 235 ForEachModifiedRange<Type::GPU>(query_cpu_range, size, func);
191 } 236 }
192 237
193 /// Call 'func' for each GPU modified range and unmark those pages as GPU modified 238 /// Call 'func' for each GPU modified range and unmark those pages as GPU modified
194 template <typename Func> 239 template <typename Func>
195 void ForEachDownloadRange(Func&& func) { 240 void ForEachDownloadRange(Func&& func) {
196 ForEachModifiedRange<true, false>(cpu_addr, SizeBytes(), func); 241 ForEachModifiedRange<Type::GPU>(cpu_addr, SizeBytes(), func);
197 } 242 }
198 243
199 /// Mark buffer as picked 244 /// Mark buffer as picked
@@ -206,6 +251,16 @@ public:
206 flags &= ~BufferFlagBits::Picked; 251 flags &= ~BufferFlagBits::Picked;
207 } 252 }
208 253
254 /// Increases the likeliness of this being a stream buffer
255 void IncreaseStreamScore(int score) noexcept {
256 stream_score += score;
257 }
258
259 /// Returns the likeliness of this being a stream buffer
260 [[nodiscard]] int StreamScore() const noexcept {
261 return stream_score;
262 }
263
209 /// Returns true when vaddr -> vaddr+size is fully contained in the buffer 264 /// Returns true when vaddr -> vaddr+size is fully contained in the buffer
210 [[nodiscard]] bool IsInBounds(VAddr addr, u64 size) const noexcept { 265 [[nodiscard]] bool IsInBounds(VAddr addr, u64 size) const noexcept {
211 return addr >= cpu_addr && addr + size <= cpu_addr + SizeBytes(); 266 return addr >= cpu_addr && addr + size <= cpu_addr + SizeBytes();
@@ -216,6 +271,11 @@ public:
216 return True(flags & BufferFlagBits::Picked); 271 return True(flags & BufferFlagBits::Picked);
217 } 272 }
218 273
274 /// Returns true when the buffer has pending cached writes
275 [[nodiscard]] bool HasCachedWrites() const noexcept {
276 return True(flags & BufferFlagBits::CachedWrites);
277 }
278
219 /// Returns the base CPU address of the buffer 279 /// Returns the base CPU address of the buffer
220 [[nodiscard]] VAddr CpuAddr() const noexcept { 280 [[nodiscard]] VAddr CpuAddr() const noexcept {
221 return cpu_addr; 281 return cpu_addr;
@@ -233,26 +293,48 @@ public:
233 } 293 }
234 294
235private: 295private:
296 template <Type type>
297 u64* Array() noexcept {
298 if constexpr (type == Type::CPU) {
299 return words.cpu.Pointer(IsShort());
300 } else if constexpr (type == Type::GPU) {
301 return words.gpu.Pointer(IsShort());
302 } else if constexpr (type == Type::CachedCPU) {
303 return words.cached_cpu.Pointer(IsShort());
304 } else if constexpr (type == Type::Untracked) {
305 return words.untracked.Pointer(IsShort());
306 }
307 }
308
309 template <Type type>
310 const u64* Array() const noexcept {
311 if constexpr (type == Type::CPU) {
312 return words.cpu.Pointer(IsShort());
313 } else if constexpr (type == Type::GPU) {
314 return words.gpu.Pointer(IsShort());
315 } else if constexpr (type == Type::CachedCPU) {
316 return words.cached_cpu.Pointer(IsShort());
317 } else if constexpr (type == Type::Untracked) {
318 return words.untracked.Pointer(IsShort());
319 }
320 }
321
236 /** 322 /**
237 * Change the state of a range of pages 323 * Change the state of a range of pages
238 * 324 *
239 * @param written_words Pages to be marked or unmarked as modified
240 * @param dirty_addr Base address to mark or unmark as modified 325 * @param dirty_addr Base address to mark or unmark as modified
241 * @param size Size in bytes to mark or unmark as modified 326 * @param size Size in bytes to mark or unmark as modified
242 *
243 * @tparam enable True when the bits will be set to one, false for zero
244 * @tparam notify_rasterizer True when the rasterizer has to be notified about the changes
245 */ 327 */
246 template <bool enable, bool notify_rasterizer> 328 template <Type type, bool enable>
247 void ChangeRegionState(WrittenWords& written_words, u64 dirty_addr, 329 void ChangeRegionState(u64 dirty_addr, s64 size) noexcept(type == Type::GPU) {
248 s64 size) noexcept(!notify_rasterizer) {
249 const s64 difference = dirty_addr - cpu_addr; 330 const s64 difference = dirty_addr - cpu_addr;
250 const u64 offset = std::max<s64>(difference, 0); 331 const u64 offset = std::max<s64>(difference, 0);
251 size += std::min<s64>(difference, 0); 332 size += std::min<s64>(difference, 0);
252 if (offset >= SizeBytes() || size < 0) { 333 if (offset >= SizeBytes() || size < 0) {
253 return; 334 return;
254 } 335 }
255 u64* const state_words = written_words.Pointer(IsShort()); 336 u64* const untracked_words = Array<Type::Untracked>();
337 u64* const state_words = Array<type>();
256 const u64 offset_end = std::min(offset + size, SizeBytes()); 338 const u64 offset_end = std::min(offset + size, SizeBytes());
257 const u64 begin_page_index = offset / BYTES_PER_PAGE; 339 const u64 begin_page_index = offset / BYTES_PER_PAGE;
258 const u64 begin_word_index = begin_page_index / PAGES_PER_WORD; 340 const u64 begin_word_index = begin_page_index / PAGES_PER_WORD;
@@ -268,13 +350,19 @@ private:
268 u64 bits = ~u64{0}; 350 u64 bits = ~u64{0};
269 bits = (bits >> right_offset) << right_offset; 351 bits = (bits >> right_offset) << right_offset;
270 bits = (bits << left_offset) >> left_offset; 352 bits = (bits << left_offset) >> left_offset;
271 if constexpr (notify_rasterizer) { 353 if constexpr (type == Type::CPU || type == Type::CachedCPU) {
272 NotifyRasterizer<!enable>(word_index, state_words[word_index], bits); 354 NotifyRasterizer<!enable>(word_index, untracked_words[word_index], bits);
273 } 355 }
274 if constexpr (enable) { 356 if constexpr (enable) {
275 state_words[word_index] |= bits; 357 state_words[word_index] |= bits;
358 if constexpr (type == Type::CPU || type == Type::CachedCPU) {
359 untracked_words[word_index] |= bits;
360 }
276 } else { 361 } else {
277 state_words[word_index] &= ~bits; 362 state_words[word_index] &= ~bits;
363 if constexpr (type == Type::CPU || type == Type::CachedCPU) {
364 untracked_words[word_index] &= ~bits;
365 }
278 } 366 }
279 page_index = 0; 367 page_index = 0;
280 ++word_index; 368 ++word_index;
@@ -291,7 +379,7 @@ private:
291 * @tparam add_to_rasterizer True when the rasterizer should start tracking the new pages 379 * @tparam add_to_rasterizer True when the rasterizer should start tracking the new pages
292 */ 380 */
293 template <bool add_to_rasterizer> 381 template <bool add_to_rasterizer>
294 void NotifyRasterizer(u64 word_index, u64 current_bits, u64 new_bits) { 382 void NotifyRasterizer(u64 word_index, u64 current_bits, u64 new_bits) const {
295 u64 changed_bits = (add_to_rasterizer ? current_bits : ~current_bits) & new_bits; 383 u64 changed_bits = (add_to_rasterizer ? current_bits : ~current_bits) & new_bits;
296 VAddr addr = cpu_addr + word_index * BYTES_PER_WORD; 384 VAddr addr = cpu_addr + word_index * BYTES_PER_WORD;
297 while (changed_bits != 0) { 385 while (changed_bits != 0) {
@@ -315,21 +403,20 @@ private:
315 * @param query_cpu_range Base CPU address to loop over 403 * @param query_cpu_range Base CPU address to loop over
316 * @param size Size in bytes of the CPU range to loop over 404 * @param size Size in bytes of the CPU range to loop over
317 * @param func Function to call for each turned off region 405 * @param func Function to call for each turned off region
318 *
319 * @tparam gpu True for host GPU pages, false for CPU pages
320 * @tparam notify_rasterizer True when the rasterizer should be notified about state changes
321 */ 406 */
322 template <bool gpu, bool notify_rasterizer, typename Func> 407 template <Type type, typename Func>
323 void ForEachModifiedRange(VAddr query_cpu_range, s64 size, Func&& func) { 408 void ForEachModifiedRange(VAddr query_cpu_range, s64 size, Func&& func) {
409 static_assert(type != Type::Untracked);
410
324 const s64 difference = query_cpu_range - cpu_addr; 411 const s64 difference = query_cpu_range - cpu_addr;
325 const u64 query_begin = std::max<s64>(difference, 0); 412 const u64 query_begin = std::max<s64>(difference, 0);
326 size += std::min<s64>(difference, 0); 413 size += std::min<s64>(difference, 0);
327 if (query_begin >= SizeBytes() || size < 0) { 414 if (query_begin >= SizeBytes() || size < 0) {
328 return; 415 return;
329 } 416 }
330 const u64* const cpu_words = words.cpu.Pointer(IsShort()); 417 u64* const untracked_words = Array<Type::Untracked>();
418 u64* const state_words = Array<type>();
331 const u64 query_end = query_begin + std::min(static_cast<u64>(size), SizeBytes()); 419 const u64 query_end = query_begin + std::min(static_cast<u64>(size), SizeBytes());
332 u64* const state_words = (gpu ? words.gpu : words.cpu).Pointer(IsShort());
333 u64* const words_begin = state_words + query_begin / BYTES_PER_WORD; 420 u64* const words_begin = state_words + query_begin / BYTES_PER_WORD;
334 u64* const words_end = state_words + Common::DivCeil(query_end, BYTES_PER_WORD); 421 u64* const words_end = state_words + Common::DivCeil(query_end, BYTES_PER_WORD);
335 422
@@ -345,7 +432,8 @@ private:
345 const u64 word_index_end = std::distance(state_words, last_modified_word); 432 const u64 word_index_end = std::distance(state_words, last_modified_word);
346 433
347 const unsigned local_page_begin = std::countr_zero(*first_modified_word); 434 const unsigned local_page_begin = std::countr_zero(*first_modified_word);
348 const unsigned local_page_end = PAGES_PER_WORD - std::countl_zero(last_modified_word[-1]); 435 const unsigned local_page_end =
436 static_cast<unsigned>(PAGES_PER_WORD) - std::countl_zero(last_modified_word[-1]);
349 const u64 word_page_begin = word_index_begin * PAGES_PER_WORD; 437 const u64 word_page_begin = word_index_begin * PAGES_PER_WORD;
350 const u64 word_page_end = (word_index_end - 1) * PAGES_PER_WORD; 438 const u64 word_page_end = (word_index_end - 1) * PAGES_PER_WORD;
351 const u64 query_page_begin = query_begin / BYTES_PER_PAGE; 439 const u64 query_page_begin = query_begin / BYTES_PER_PAGE;
@@ -371,11 +459,13 @@ private:
371 const u64 current_word = state_words[word_index] & bits; 459 const u64 current_word = state_words[word_index] & bits;
372 state_words[word_index] &= ~bits; 460 state_words[word_index] &= ~bits;
373 461
374 // Exclude CPU modified pages when visiting GPU pages 462 if constexpr (type == Type::CPU) {
375 const u64 word = current_word & ~(gpu ? cpu_words[word_index] : 0); 463 const u64 current_bits = untracked_words[word_index] & bits;
376 if constexpr (notify_rasterizer) { 464 untracked_words[word_index] &= ~bits;
377 NotifyRasterizer<true>(word_index, word, ~u64{0}); 465 NotifyRasterizer<true>(word_index, current_bits, ~u64{0});
378 } 466 }
467 // Exclude CPU modified pages when visiting GPU pages
468 const u64 word = current_word & ~(type == Type::GPU ? untracked_words[word_index] : 0);
379 u64 page = page_begin; 469 u64 page = page_begin;
380 page_begin = 0; 470 page_begin = 0;
381 471
@@ -416,17 +506,20 @@ private:
416 * @param offset Offset in bytes from the start of the buffer 506 * @param offset Offset in bytes from the start of the buffer
417 * @param size Size in bytes of the region to query for modifications 507 * @param size Size in bytes of the region to query for modifications
418 */ 508 */
419 template <bool gpu> 509 template <Type type>
420 [[nodiscard]] bool IsRegionModified(u64 offset, u64 size) const noexcept { 510 [[nodiscard]] bool IsRegionModified(u64 offset, u64 size) const noexcept {
421 const u64* const cpu_words = words.cpu.Pointer(IsShort()); 511 static_assert(type != Type::Untracked);
422 const u64* const state_words = (gpu ? words.gpu : words.cpu).Pointer(IsShort()); 512
513 const u64* const untracked_words = Array<Type::Untracked>();
514 const u64* const state_words = Array<type>();
423 const u64 num_query_words = size / BYTES_PER_WORD + 1; 515 const u64 num_query_words = size / BYTES_PER_WORD + 1;
424 const u64 word_begin = offset / BYTES_PER_WORD; 516 const u64 word_begin = offset / BYTES_PER_WORD;
425 const u64 word_end = std::min(word_begin + num_query_words, NumWords()); 517 const u64 word_end = std::min(word_begin + num_query_words, NumWords());
426 const u64 page_limit = Common::DivCeil(offset + size, BYTES_PER_PAGE); 518 const u64 page_limit = Common::DivCeil(offset + size, BYTES_PER_PAGE);
427 u64 page_index = (offset / BYTES_PER_PAGE) % PAGES_PER_WORD; 519 u64 page_index = (offset / BYTES_PER_PAGE) % PAGES_PER_WORD;
428 for (u64 word_index = word_begin; word_index < word_end; ++word_index, page_index = 0) { 520 for (u64 word_index = word_begin; word_index < word_end; ++word_index, page_index = 0) {
429 const u64 word = state_words[word_index] & ~(gpu ? cpu_words[word_index] : 0); 521 const u64 off_word = type == Type::GPU ? untracked_words[word_index] : 0;
522 const u64 word = state_words[word_index] & ~off_word;
430 if (word == 0) { 523 if (word == 0) {
431 continue; 524 continue;
432 } 525 }
@@ -445,13 +538,13 @@ private:
445 * 538 *
446 * @param offset Offset in bytes from the start of the buffer 539 * @param offset Offset in bytes from the start of the buffer
447 * @param size Size in bytes of the region to query for modifications 540 * @param size Size in bytes of the region to query for modifications
448 *
449 * @tparam gpu True to query GPU modified pages, false for CPU pages
450 */ 541 */
451 template <bool gpu> 542 template <Type type>
452 [[nodiscard]] std::pair<u64, u64> ModifiedRegion(u64 offset, u64 size) const noexcept { 543 [[nodiscard]] std::pair<u64, u64> ModifiedRegion(u64 offset, u64 size) const noexcept {
453 const u64* const cpu_words = words.cpu.Pointer(IsShort()); 544 static_assert(type != Type::Untracked);
454 const u64* const state_words = (gpu ? words.gpu : words.cpu).Pointer(IsShort()); 545
546 const u64* const untracked_words = Array<Type::Untracked>();
547 const u64* const state_words = Array<type>();
455 const u64 num_query_words = size / BYTES_PER_WORD + 1; 548 const u64 num_query_words = size / BYTES_PER_WORD + 1;
456 const u64 word_begin = offset / BYTES_PER_WORD; 549 const u64 word_begin = offset / BYTES_PER_WORD;
457 const u64 word_end = std::min(word_begin + num_query_words, NumWords()); 550 const u64 word_end = std::min(word_begin + num_query_words, NumWords());
@@ -460,7 +553,8 @@ private:
460 u64 begin = std::numeric_limits<u64>::max(); 553 u64 begin = std::numeric_limits<u64>::max();
461 u64 end = 0; 554 u64 end = 0;
462 for (u64 word_index = word_begin; word_index < word_end; ++word_index) { 555 for (u64 word_index = word_begin; word_index < word_end; ++word_index) {
463 const u64 word = state_words[word_index] & ~(gpu ? cpu_words[word_index] : 0); 556 const u64 off_word = type == Type::GPU ? untracked_words[word_index] : 0;
557 const u64 word = state_words[word_index] & ~off_word;
464 if (word == 0) { 558 if (word == 0) {
465 continue; 559 continue;
466 } 560 }
@@ -488,8 +582,9 @@ private:
488 582
489 RasterizerInterface* rasterizer = nullptr; 583 RasterizerInterface* rasterizer = nullptr;
490 VAddr cpu_addr = 0; 584 VAddr cpu_addr = 0;
491 GpuCpuWords words; 585 Words words;
492 BufferFlagBits flags{}; 586 BufferFlagBits flags{};
587 int stream_score = 0;
493}; 588};
494 589
495} // namespace VideoCommon 590} // namespace VideoCommon
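
The tracking scheme introduced above is easier to see outside the diff: each u64 word holds one dirty bit per page, and marking or querying a byte range reduces to bit operations on the words that range spans. The sketch below is a simplified standalone illustration of that idea, not the yuzu implementation: the 4 KiB page size and the 64-pages-per-word packing are assumed values (the real BYTES_PER_PAGE and PAGES_PER_WORD constants are defined earlier in buffer_base.h, outside the visible hunks), it loops page by page where the real code builds per-word masks, and it keeps a single bitmap where BufferBase shares one allocation between its cpu, gpu, cached_cpu and untracked words.

#include <cstdint>
#include <vector>

// Minimal sketch of page-granular dirty tracking with 64-bit words.
class DirtyBitmap {
public:
    static constexpr std::uint64_t BYTES_PER_PAGE = 4096; // assumed value for the sketch
    static constexpr std::uint64_t PAGES_PER_WORD = 64;   // one bit per page in a u64
    static constexpr std::uint64_t BYTES_PER_WORD = PAGES_PER_WORD * BYTES_PER_PAGE;

    explicit DirtyBitmap(std::uint64_t size_bytes)
        : words((size_bytes + BYTES_PER_WORD - 1) / BYTES_PER_WORD, 0) {}

    // Mark every page touched by [offset, offset + size) as modified.
    void MarkRegion(std::uint64_t offset, std::uint64_t size) {
        const std::uint64_t page_begin = offset / BYTES_PER_PAGE;
        const std::uint64_t page_end = (offset + size + BYTES_PER_PAGE - 1) / BYTES_PER_PAGE;
        for (std::uint64_t page = page_begin; page < page_end; ++page) {
            words[page / PAGES_PER_WORD] |= std::uint64_t{1} << (page % PAGES_PER_WORD);
        }
    }

    // Returns true when any page in [offset, offset + size) is marked.
    bool IsRegionModified(std::uint64_t offset, std::uint64_t size) const {
        const std::uint64_t page_begin = offset / BYTES_PER_PAGE;
        const std::uint64_t page_end = (offset + size + BYTES_PER_PAGE - 1) / BYTES_PER_PAGE;
        for (std::uint64_t page = page_begin; page < page_end; ++page) {
            if ((words[page / PAGES_PER_WORD] >> (page % PAGES_PER_WORD)) & 1) {
                return true;
            }
        }
        return false;
    }

private:
    std::vector<std::uint64_t> words;
};

Keeping four such bitmaps side by side is what lets the class answer "CPU modified", "GPU modified", "pending cached write" and "untracked by the rasterizer" per page without storing interval objects.
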
diff --git a/src/video_core/buffer_cache/buffer_block.h b/src/video_core/buffer_cache/buffer_block.h
deleted file mode 100644
index e9306194a..000000000
--- a/src/video_core/buffer_cache/buffer_block.h
+++ /dev/null
@@ -1,62 +0,0 @@
-// Copyright 2019 yuzu Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#pragma once
-
-#include "common/common_types.h"
-
-namespace VideoCommon {
-
-class BufferBlock {
-public:
-    [[nodiscard]] bool Overlaps(VAddr start, VAddr end) const {
-        return (cpu_addr < end) && (cpu_addr_end > start);
-    }
-
-    [[nodiscard]] bool IsInside(VAddr other_start, VAddr other_end) const {
-        return cpu_addr <= other_start && other_end <= cpu_addr_end;
-    }
-
-    [[nodiscard]] std::size_t Offset(VAddr in_addr) const {
-        return static_cast<std::size_t>(in_addr - cpu_addr);
-    }
-
-    [[nodiscard]] VAddr CpuAddr() const {
-        return cpu_addr;
-    }
-
-    [[nodiscard]] VAddr CpuAddrEnd() const {
-        return cpu_addr_end;
-    }
-
-    void SetCpuAddr(VAddr new_addr) {
-        cpu_addr = new_addr;
-        cpu_addr_end = new_addr + size;
-    }
-
-    [[nodiscard]] std::size_t Size() const {
-        return size;
-    }
-
-    [[nodiscard]] u64 Epoch() const {
-        return epoch;
-    }
-
-    void SetEpoch(u64 new_epoch) {
-        epoch = new_epoch;
-    }
-
-protected:
-    explicit BufferBlock(VAddr cpu_addr_, std::size_t size_) : size{size_} {
-        SetCpuAddr(cpu_addr_);
-    }
-
-private:
-    VAddr cpu_addr{};
-    VAddr cpu_addr_end{};
-    std::size_t size{};
-    u64 epoch{};
-};
-
-} // namespace VideoCommon
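
The Epoch()/SetEpoch() members removed here fed the old cache's deferred deletion: TickFrame (removed further down in buffer_cache.h) waited several frames before destroying a block so the driver could not still be using it, and the new cache keeps the same idea behind DelayedDestructionRing. Below is a minimal sketch of epoch-based deferred destruction; it assumes nothing about the real DelayedDestructionRing beyond its name, and the value 5 is taken from the removed code.

#include <cstdint>
#include <deque>
#include <memory>

// Objects retired in frame N are destroyed only once enough frames have passed,
// mirroring the epoch counter the removed BufferBlock carried.
template <typename T>
class DelayedDestroyer {
public:
    void Retire(std::unique_ptr<T> object) {
        pending.push_back({current_epoch, std::move(object)});
    }

    // Call once per frame, like TickFrame in the diff.
    void Tick() {
        ++current_epoch;
        constexpr std::uint64_t epochs_to_destroy = 5;
        while (!pending.empty() && pending.front().epoch + epochs_to_destroy <= current_epoch) {
            pending.pop_front(); // the unique_ptr destroys the object here
        }
    }

private:
    struct Entry {
        std::uint64_t epoch;
        std::unique_ptr<T> object;
    };
    std::deque<Entry> pending;
    std::uint64_t current_epoch = 0;
};
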
diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp
new file mode 100644
index 000000000..ab32294c8
--- /dev/null
+++ b/src/video_core/buffer_cache/buffer_cache.cpp
@@ -0,0 +1,13 @@
+// Copyright 2021 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/microprofile.h"
+
+namespace VideoCommon {
+
+MICROPROFILE_DEFINE(GPU_PrepareBuffers, "GPU", "Prepare buffers", MP_RGB(224, 128, 128));
+MICROPROFILE_DEFINE(GPU_BindUploadBuffers, "GPU", "Bind and upload buffers", MP_RGB(224, 128, 128));
+MICROPROFILE_DEFINE(GPU_DownloadMemory, "GPU", "Download buffers", MP_RGB(224, 128, 128));
+
+} // namespace VideoCommon
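
This small translation unit appears to exist so that the MicroProfile timers have exactly one definition: buffer_cache.h below is a class template included by every backend, so the header can only declare the timers (MICROPROFILE_DECLARE near its top) and open scoped measurements at call sites (MICROPROFILE_SCOPE). A minimal sketch of that split as it is used in this diff, with placeholder file and function names:

// some_cache.h (placeholder) -- declare the timer so header/templated code can use it.
#include "common/microprofile.h"

MICROPROFILE_DECLARE(GPU_PrepareBuffers);

inline void PrepareBuffersExample() { // placeholder for e.g. BufferCache<P>::UpdateGraphicsBuffers
    MICROPROFILE_SCOPE(GPU_PrepareBuffers); // times the enclosing scope
    // ... work to be profiled ...
}

// some_cache.cpp (placeholder) -- define the timer exactly once for the whole program.
#include "common/microprofile.h"

MICROPROFILE_DEFINE(GPU_PrepareBuffers, "GPU", "Prepare buffers", MP_RGB(224, 128, 128));
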
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index 83b9ee871..2a6844ab1 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -4,591 +4,1289 @@
4 4
5#pragma once 5#pragma once
6 6
7#include <list> 7#include <algorithm>
8#include <array>
9#include <deque>
8#include <memory> 10#include <memory>
9#include <mutex> 11#include <mutex>
12#include <span>
10#include <unordered_map> 13#include <unordered_map>
11#include <unordered_set>
12#include <utility>
13#include <vector> 14#include <vector>
14 15
15#include <boost/container/small_vector.hpp> 16#include <boost/container/small_vector.hpp>
16#include <boost/icl/interval_set.hpp>
17#include <boost/intrusive/set.hpp>
18 17
19#include "common/alignment.h"
20#include "common/assert.h"
21#include "common/common_types.h" 18#include "common/common_types.h"
22#include "common/logging/log.h" 19#include "common/div_ceil.h"
23#include "core/core.h" 20#include "common/microprofile.h"
21#include "common/scope_exit.h"
24#include "core/memory.h" 22#include "core/memory.h"
25#include "core/settings.h" 23#include "core/settings.h"
26#include "video_core/buffer_cache/buffer_block.h" 24#include "video_core/buffer_cache/buffer_base.h"
27#include "video_core/buffer_cache/map_interval.h" 25#include "video_core/delayed_destruction_ring.h"
26#include "video_core/dirty_flags.h"
27#include "video_core/engines/kepler_compute.h"
28#include "video_core/engines/maxwell_3d.h"
28#include "video_core/memory_manager.h" 29#include "video_core/memory_manager.h"
29#include "video_core/rasterizer_interface.h" 30#include "video_core/rasterizer_interface.h"
31#include "video_core/texture_cache/slot_vector.h"
32#include "video_core/texture_cache/types.h"
30 33
31namespace VideoCommon { 34namespace VideoCommon {
32 35
33template <typename Buffer, typename BufferType, typename StreamBuffer> 36MICROPROFILE_DECLARE(GPU_PrepareBuffers);
37MICROPROFILE_DECLARE(GPU_BindUploadBuffers);
38MICROPROFILE_DECLARE(GPU_DownloadMemory);
39
40using BufferId = SlotId;
41
42constexpr u32 NUM_VERTEX_BUFFERS = 32;
43constexpr u32 NUM_TRANSFORM_FEEDBACK_BUFFERS = 4;
44constexpr u32 NUM_GRAPHICS_UNIFORM_BUFFERS = 18;
45constexpr u32 NUM_COMPUTE_UNIFORM_BUFFERS = 8;
46constexpr u32 NUM_STORAGE_BUFFERS = 16;
47constexpr u32 NUM_STAGES = 5;
48
49template <typename P>
34class BufferCache { 50class BufferCache {
35 using IntervalSet = boost::icl::interval_set<VAddr>; 51 // Page size for caching purposes.
36 using IntervalType = typename IntervalSet::interval_type; 52 // This is unrelated to the CPU page size and it can be changed as it seems optimal.
37 using VectorMapInterval = boost::container::small_vector<MapInterval*, 1>; 53 static constexpr u32 PAGE_BITS = 16;
54 static constexpr u64 PAGE_SIZE = u64{1} << PAGE_BITS;
55
56 static constexpr bool IS_OPENGL = P::IS_OPENGL;
57 static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS =
58 P::HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS;
59 static constexpr bool HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT =
60 P::HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT;
61 static constexpr bool NEEDS_BIND_UNIFORM_INDEX = P::NEEDS_BIND_UNIFORM_INDEX;
62 static constexpr bool NEEDS_BIND_STORAGE_INDEX = P::NEEDS_BIND_STORAGE_INDEX;
63 static constexpr bool USE_MEMORY_MAPS = P::USE_MEMORY_MAPS;
64
65 static constexpr BufferId NULL_BUFFER_ID{0};
66
67 using Maxwell = Tegra::Engines::Maxwell3D::Regs;
68
69 using Runtime = typename P::Runtime;
70 using Buffer = typename P::Buffer;
71
72 struct Empty {};
73
74 struct OverlapResult {
75 std::vector<BufferId> ids;
76 VAddr begin;
77 VAddr end;
78 bool has_stream_leap = false;
79 };
38 80
39 static constexpr u64 WRITE_PAGE_BIT = 11; 81 struct Binding {
40 static constexpr u64 BLOCK_PAGE_BITS = 21; 82 VAddr cpu_addr{};
41 static constexpr u64 BLOCK_PAGE_SIZE = 1ULL << BLOCK_PAGE_BITS; 83 u32 size{};
84 BufferId buffer_id;
85 };
42 86
43public: 87 static constexpr Binding NULL_BINDING{
44 struct BufferInfo { 88 .cpu_addr = 0,
45 BufferType handle; 89 .size = 0,
46 u64 offset; 90 .buffer_id = NULL_BUFFER_ID,
47 u64 address;
48 }; 91 };
49 92
50 BufferInfo UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4, 93public:
51 bool is_written = false, bool use_fast_cbuf = false) { 94 static constexpr u32 SKIP_CACHE_SIZE = 4096;
52 std::lock_guard lock{mutex};
53 95
54 const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); 96 explicit BufferCache(VideoCore::RasterizerInterface& rasterizer_,
55 if (!cpu_addr) { 97 Tegra::Engines::Maxwell3D& maxwell3d_,
56 return GetEmptyBuffer(size); 98 Tegra::Engines::KeplerCompute& kepler_compute_,
57 } 99 Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_,
100 Runtime& runtime_);
58 101
59 // Cache management is a big overhead, so only cache entries with a given size. 102 void TickFrame();
60 // TODO: Figure out which size is the best for given games.
61 constexpr std::size_t max_stream_size = 0x800;
62 if (use_fast_cbuf || size < max_stream_size) {
63 if (!is_written && !IsRegionWritten(*cpu_addr, *cpu_addr + size - 1)) {
64 const bool is_granular = gpu_memory.IsGranularRange(gpu_addr, size);
65 if (use_fast_cbuf) {
66 u8* dest;
67 if (is_granular) {
68 dest = gpu_memory.GetPointer(gpu_addr);
69 } else {
70 staging_buffer.resize(size);
71 dest = staging_buffer.data();
72 gpu_memory.ReadBlockUnsafe(gpu_addr, dest, size);
73 }
74 return ConstBufferUpload(dest, size);
75 }
76 if (is_granular) {
77 u8* const host_ptr = gpu_memory.GetPointer(gpu_addr);
78 return StreamBufferUpload(size, alignment, [host_ptr, size](u8* dest) {
79 std::memcpy(dest, host_ptr, size);
80 });
81 } else {
82 return StreamBufferUpload(size, alignment, [this, gpu_addr, size](u8* dest) {
83 gpu_memory.ReadBlockUnsafe(gpu_addr, dest, size);
84 });
85 }
86 }
87 }
88 103
89 Buffer* const block = GetBlock(*cpu_addr, size); 104 void WriteMemory(VAddr cpu_addr, u64 size);
90 MapInterval* const map = MapAddress(block, gpu_addr, *cpu_addr, size);
91 if (!map) {
92 return GetEmptyBuffer(size);
93 }
94 if (is_written) {
95 map->MarkAsModified(true, GetModifiedTicks());
96 if (Settings::IsGPULevelHigh() &&
97 Settings::values.use_asynchronous_gpu_emulation.GetValue()) {
98 MarkForAsyncFlush(map);
99 }
100 if (!map->is_written) {
101 map->is_written = true;
102 MarkRegionAsWritten(map->start, map->end - 1);
103 }
104 }
105 105
106 return BufferInfo{block->Handle(), block->Offset(*cpu_addr), block->Address()}; 106 void CachedWriteMemory(VAddr cpu_addr, u64 size);
107 }
108 107
109 /// Uploads from a host memory. Returns the OpenGL buffer where it's located and its offset. 108 void DownloadMemory(VAddr cpu_addr, u64 size);
110 BufferInfo UploadHostMemory(const void* raw_pointer, std::size_t size,
111 std::size_t alignment = 4) {
112 std::lock_guard lock{mutex};
113 return StreamBufferUpload(size, alignment, [raw_pointer, size](u8* dest) {
114 std::memcpy(dest, raw_pointer, size);
115 });
116 }
117 109
118 /// Prepares the buffer cache for data uploading 110 void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size);
119 /// @param max_size Maximum number of bytes that will be uploaded
120 /// @return True when a stream buffer invalidation was required, false otherwise
121 void Map(std::size_t max_size) {
122 std::lock_guard lock{mutex};
123 111
124 std::tie(buffer_ptr, buffer_offset_base) = stream_buffer.Map(max_size, 4); 112 void UpdateGraphicsBuffers(bool is_indexed);
125 buffer_offset = buffer_offset_base;
126 }
127 113
128 /// Finishes the upload stream 114 void UpdateComputeBuffers();
129 void Unmap() {
130 std::lock_guard lock{mutex};
131 stream_buffer.Unmap(buffer_offset - buffer_offset_base);
132 }
133 115
134 /// Function called at the end of each frame, inteded for deferred operations 116 void BindHostGeometryBuffers(bool is_indexed);
135 void TickFrame() {
136 ++epoch;
137 117
138 while (!pending_destruction.empty()) { 118 void BindHostStageBuffers(size_t stage);
139 // Delay at least 4 frames before destruction.
140 // This is due to triple buffering happening on some drivers.
141 static constexpr u64 epochs_to_destroy = 5;
142 if (pending_destruction.front()->Epoch() + epochs_to_destroy > epoch) {
143 break;
144 }
145 pending_destruction.pop();
146 }
147 }
148 119
149 /// Write any cached resources overlapping the specified region back to memory 120 void BindHostComputeBuffers();
150 void FlushRegion(VAddr addr, std::size_t size) {
151 std::lock_guard lock{mutex};
152 121
153 VectorMapInterval objects = GetMapsInRange(addr, size); 122 void SetEnabledUniformBuffers(size_t stage, u32 enabled);
154 std::sort(objects.begin(), objects.end(),
155 [](MapInterval* lhs, MapInterval* rhs) { return lhs->ticks < rhs->ticks; });
156 for (MapInterval* object : objects) {
157 if (object->is_modified && object->is_registered) {
158 mutex.unlock();
159 FlushMap(object);
160 mutex.lock();
161 }
162 }
163 }
164 123
165 bool MustFlushRegion(VAddr addr, std::size_t size) { 124 void SetEnabledComputeUniformBuffers(u32 enabled);
166 std::lock_guard lock{mutex};
167 125
168 const VectorMapInterval objects = GetMapsInRange(addr, size); 126 void UnbindGraphicsStorageBuffers(size_t stage);
169 return std::any_of(objects.cbegin(), objects.cend(), [](const MapInterval* map) {
170 return map->is_modified && map->is_registered;
171 });
172 }
173 127
174 /// Mark the specified region as being invalidated 128 void BindGraphicsStorageBuffer(size_t stage, size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset,
175 void InvalidateRegion(VAddr addr, u64 size) { 129 bool is_written);
176 std::lock_guard lock{mutex};
177 130
178 for (auto& object : GetMapsInRange(addr, size)) { 131 void UnbindComputeStorageBuffers();
179 if (object->is_registered) {
180 Unregister(object);
181 }
182 }
183 }
184 132
185 void OnCPUWrite(VAddr addr, std::size_t size) { 133 void BindComputeStorageBuffer(size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset,
186 std::lock_guard lock{mutex}; 134 bool is_written);
187 135
188 for (MapInterval* object : GetMapsInRange(addr, size)) { 136 void FlushCachedWrites();
189 if (object->is_memory_marked && object->is_registered) {
190 UnmarkMemory(object);
191 object->is_sync_pending = true;
192 marked_for_unregister.emplace_back(object);
193 }
194 }
195 }
196 137
197 void SyncGuestHost() { 138 /// Return true when there are uncommitted buffers to be downloaded
198 std::lock_guard lock{mutex}; 139 [[nodiscard]] bool HasUncommittedFlushes() const noexcept;
199 140
200 for (auto& object : marked_for_unregister) { 141 /// Return true when the caller should wait for async downloads
201 if (object->is_registered) { 142 [[nodiscard]] bool ShouldWaitAsyncFlushes() const noexcept;
202 object->is_sync_pending = false; 143
203 Unregister(object); 144 /// Commit asynchronous downloads
204 } 145 void CommitAsyncFlushes();
146
147 /// Pop asynchronous downloads
148 void PopAsyncFlushes();
149
150 /// Return true when a CPU region is modified from the GPU
151 [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size);
152
153 std::mutex mutex;
154
155private:
156 template <typename Func>
157 static void ForEachEnabledBit(u32 enabled_mask, Func&& func) {
158 for (u32 index = 0; enabled_mask != 0; ++index, enabled_mask >>= 1) {
159 const int disabled_bits = std::countr_zero(enabled_mask);
160 index += disabled_bits;
161 enabled_mask >>= disabled_bits;
162 func(index);
205 } 163 }
206 marked_for_unregister.clear();
207 } 164 }
208 165
209 void CommitAsyncFlushes() { 166 template <typename Func>
210 if (uncommitted_flushes) { 167 void ForEachBufferInRange(VAddr cpu_addr, u64 size, Func&& func) {
211 auto commit_list = std::make_shared<std::list<MapInterval*>>(); 168 const u64 page_end = Common::DivCeil(cpu_addr + size, PAGE_SIZE);
212 for (MapInterval* map : *uncommitted_flushes) { 169 for (u64 page = cpu_addr >> PAGE_BITS; page < page_end;) {
213 if (map->is_registered && map->is_modified) { 170 const BufferId buffer_id = page_table[page];
214 // TODO(Blinkhawk): Implement backend asynchronous flushing 171 if (!buffer_id) {
215 // AsyncFlushMap(map) 172 ++page;
216 commit_list->push_back(map); 173 continue;
217 }
218 }
219 if (!commit_list->empty()) {
220 committed_flushes.push_back(commit_list);
221 } else {
222 committed_flushes.emplace_back();
223 } 174 }
224 } else { 175 Buffer& buffer = slot_buffers[buffer_id];
225 committed_flushes.emplace_back(); 176 func(buffer_id, buffer);
177
178 const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes();
179 page = Common::DivCeil(end_addr, PAGE_SIZE);
226 } 180 }
227 uncommitted_flushes.reset();
228 } 181 }
229 182
230 bool ShouldWaitAsyncFlushes() const { 183 static bool IsRangeGranular(VAddr cpu_addr, size_t size) {
231 return !committed_flushes.empty() && committed_flushes.front() != nullptr; 184 return (cpu_addr & ~Core::Memory::PAGE_MASK) ==
185 ((cpu_addr + size) & ~Core::Memory::PAGE_MASK);
232 } 186 }
233 187
234 bool HasUncommittedFlushes() const { 188 void BindHostIndexBuffer();
235 return uncommitted_flushes != nullptr;
236 }
237 189
238 void PopAsyncFlushes() { 190 void BindHostVertexBuffers();
239 if (committed_flushes.empty()) {
240 return;
241 }
242 auto& flush_list = committed_flushes.front();
243 if (!flush_list) {
244 committed_flushes.pop_front();
245 return;
246 }
247 for (MapInterval* map : *flush_list) {
248 if (map->is_registered) {
249 // TODO(Blinkhawk): Replace this for reading the asynchronous flush
250 FlushMap(map);
251 }
252 }
253 committed_flushes.pop_front();
254 }
255 191
256 virtual BufferInfo GetEmptyBuffer(std::size_t size) = 0; 192 void BindHostGraphicsUniformBuffers(size_t stage);
257 193
258protected: 194 void BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 binding_index, bool needs_bind);
259 explicit BufferCache(VideoCore::RasterizerInterface& rasterizer_,
260 Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_,
261 StreamBuffer& stream_buffer_)
262 : rasterizer{rasterizer_}, gpu_memory{gpu_memory_}, cpu_memory{cpu_memory_},
263 stream_buffer{stream_buffer_} {}
264 195
265 ~BufferCache() = default; 196 void BindHostGraphicsStorageBuffers(size_t stage);
266 197
267 virtual std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) = 0; 198 void BindHostTransformFeedbackBuffers();
268 199
269 virtual BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) { 200 void BindHostComputeUniformBuffers();
270 return {};
271 }
272 201
273 /// Register an object into the cache 202 void BindHostComputeStorageBuffers();
274 MapInterval* Register(MapInterval new_map, bool inherit_written = false) {
275 const VAddr cpu_addr = new_map.start;
276 if (!cpu_addr) {
277 LOG_CRITICAL(HW_GPU, "Failed to register buffer with unmapped gpu_address 0x{:016x}",
278 new_map.gpu_addr);
279 return nullptr;
280 }
281 const std::size_t size = new_map.end - new_map.start;
282 new_map.is_registered = true;
283 rasterizer.UpdatePagesCachedCount(cpu_addr, size, 1);
284 new_map.is_memory_marked = true;
285 if (inherit_written) {
286 MarkRegionAsWritten(new_map.start, new_map.end - 1);
287 new_map.is_written = true;
288 }
289 MapInterval* const storage = mapped_addresses_allocator.Allocate();
290 *storage = new_map;
291 mapped_addresses.insert(*storage);
292 return storage;
293 }
294 203
295 void UnmarkMemory(MapInterval* map) { 204 void DoUpdateGraphicsBuffers(bool is_indexed);
296 if (!map->is_memory_marked) { 205
297 return; 206 void DoUpdateComputeBuffers();
298 } 207
299 const std::size_t size = map->end - map->start; 208 void UpdateIndexBuffer();
300 rasterizer.UpdatePagesCachedCount(map->start, size, -1); 209
301 map->is_memory_marked = false; 210 void UpdateVertexBuffers();
302 } 211
303 212 void UpdateVertexBuffer(u32 index);
304 /// Unregisters an object from the cache 213
305 void Unregister(MapInterval* map) { 214 void UpdateUniformBuffers(size_t stage);
306 UnmarkMemory(map); 215
307 map->is_registered = false; 216 void UpdateStorageBuffers(size_t stage);
308 if (map->is_sync_pending) { 217
309 map->is_sync_pending = false; 218 void UpdateTransformFeedbackBuffers();
310 marked_for_unregister.remove(map); 219
220 void UpdateTransformFeedbackBuffer(u32 index);
221
222 void UpdateComputeUniformBuffers();
223
224 void UpdateComputeStorageBuffers();
225
226 void MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 size);
227
228 [[nodiscard]] BufferId FindBuffer(VAddr cpu_addr, u32 size);
229
230 [[nodiscard]] OverlapResult ResolveOverlaps(VAddr cpu_addr, u32 wanted_size);
231
232 void JoinOverlap(BufferId new_buffer_id, BufferId overlap_id, bool accumulate_stream_score);
233
234 [[nodiscard]] BufferId CreateBuffer(VAddr cpu_addr, u32 wanted_size);
235
236 void Register(BufferId buffer_id);
237
238 void Unregister(BufferId buffer_id);
239
240 template <bool insert>
241 void ChangeRegister(BufferId buffer_id);
242
243 void SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size);
244
245 void SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size);
246
247 void UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy,
248 std::span<BufferCopy> copies);
249
250 void ImmediateUploadMemory(Buffer& buffer, u64 largest_copy,
251 std::span<const BufferCopy> copies);
252
253 void MappedUploadMemory(Buffer& buffer, u64 total_size_bytes, std::span<BufferCopy> copies);
254
255 void DeleteBuffer(BufferId buffer_id);
256
257 void ReplaceBufferDownloads(BufferId old_buffer_id, BufferId new_buffer_id);
258
259 void NotifyBufferDeletion();
260
261 [[nodiscard]] Binding StorageBufferBinding(GPUVAddr ssbo_addr) const;
262
263 [[nodiscard]] std::span<const u8> ImmediateBufferWithData(VAddr cpu_addr, size_t size);
264
265 [[nodiscard]] std::span<u8> ImmediateBuffer(size_t wanted_capacity);
266
267 [[nodiscard]] bool HasFastUniformBufferBound(size_t stage, u32 binding_index) const noexcept;
268
269 VideoCore::RasterizerInterface& rasterizer;
270 Tegra::Engines::Maxwell3D& maxwell3d;
271 Tegra::Engines::KeplerCompute& kepler_compute;
272 Tegra::MemoryManager& gpu_memory;
273 Core::Memory::Memory& cpu_memory;
274 Runtime& runtime;
275
276 SlotVector<Buffer> slot_buffers;
277 DelayedDestructionRing<Buffer, 8> delayed_destruction_ring;
278
279 u32 last_index_count = 0;
280
281 Binding index_buffer;
282 std::array<Binding, NUM_VERTEX_BUFFERS> vertex_buffers;
283 std::array<std::array<Binding, NUM_GRAPHICS_UNIFORM_BUFFERS>, NUM_STAGES> uniform_buffers;
284 std::array<std::array<Binding, NUM_STORAGE_BUFFERS>, NUM_STAGES> storage_buffers;
285 std::array<Binding, NUM_TRANSFORM_FEEDBACK_BUFFERS> transform_feedback_buffers;
286
287 std::array<Binding, NUM_COMPUTE_UNIFORM_BUFFERS> compute_uniform_buffers;
288 std::array<Binding, NUM_STORAGE_BUFFERS> compute_storage_buffers;
289
290 std::array<u32, NUM_STAGES> enabled_uniform_buffers{};
291 u32 enabled_compute_uniform_buffers = 0;
292
293 std::array<u32, NUM_STAGES> enabled_storage_buffers{};
294 std::array<u32, NUM_STAGES> written_storage_buffers{};
295 u32 enabled_compute_storage_buffers = 0;
296 u32 written_compute_storage_buffers = 0;
297
298 std::array<u32, NUM_STAGES> fast_bound_uniform_buffers{};
299
300 bool has_deleted_buffers = false;
301
302 std::conditional_t<HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS, std::array<u32, NUM_STAGES>, Empty>
303 dirty_uniform_buffers{};
304
305 std::vector<BufferId> cached_write_buffer_ids;
306
307 // TODO: This data structure is not optimal and it should be reworked
308 std::vector<BufferId> uncommitted_downloads;
309 std::deque<std::vector<BufferId>> committed_downloads;
310
311 size_t immediate_buffer_capacity = 0;
312 std::unique_ptr<u8[]> immediate_buffer_alloc;
313
314 std::array<BufferId, ((1ULL << 39) >> PAGE_BITS)> page_table;
315};
316
317template <class P>
318BufferCache<P>::BufferCache(VideoCore::RasterizerInterface& rasterizer_,
319 Tegra::Engines::Maxwell3D& maxwell3d_,
320 Tegra::Engines::KeplerCompute& kepler_compute_,
321 Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_,
322 Runtime& runtime_)
323 : rasterizer{rasterizer_}, maxwell3d{maxwell3d_}, kepler_compute{kepler_compute_},
324 gpu_memory{gpu_memory_}, cpu_memory{cpu_memory_}, runtime{runtime_} {
325 // Ensure the first slot is used for the null buffer
326 void(slot_buffers.insert(runtime, NullBufferParams{}));
327}
328
329template <class P>
330void BufferCache<P>::TickFrame() {
331 delayed_destruction_ring.Tick();
332}
333
334template <class P>
335void BufferCache<P>::WriteMemory(VAddr cpu_addr, u64 size) {
336 ForEachBufferInRange(cpu_addr, size, [&](BufferId, Buffer& buffer) {
337 buffer.MarkRegionAsCpuModified(cpu_addr, size);
338 });
339}
340
341template <class P>
342void BufferCache<P>::CachedWriteMemory(VAddr cpu_addr, u64 size) {
343 ForEachBufferInRange(cpu_addr, size, [&](BufferId buffer_id, Buffer& buffer) {
344 if (!buffer.HasCachedWrites()) {
345 cached_write_buffer_ids.push_back(buffer_id);
311 } 346 }
312 if (map->is_written) { 347 buffer.CachedCpuWrite(cpu_addr, size);
313 UnmarkRegionAsWritten(map->start, map->end - 1); 348 });
349}
350
351template <class P>
352void BufferCache<P>::DownloadMemory(VAddr cpu_addr, u64 size) {
353 ForEachBufferInRange(cpu_addr, size, [&](BufferId, Buffer& buffer) {
354 boost::container::small_vector<BufferCopy, 1> copies;
355 u64 total_size_bytes = 0;
356 u64 largest_copy = 0;
357 buffer.ForEachDownloadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) {
358 copies.push_back(BufferCopy{
359 .src_offset = range_offset,
360 .dst_offset = total_size_bytes,
361 .size = range_size,
362 });
363 total_size_bytes += range_size;
364 largest_copy = std::max(largest_copy, range_size);
365 });
366 if (total_size_bytes == 0) {
367 return;
314 } 368 }
315 const auto it = mapped_addresses.find(*map); 369 MICROPROFILE_SCOPE(GPU_DownloadMemory);
316 ASSERT(it != mapped_addresses.end()); 370
317 mapped_addresses.erase(it); 371 if constexpr (USE_MEMORY_MAPS) {
318 mapped_addresses_allocator.Release(map); 372 auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes);
319 } 373 const u8* const mapped_memory = download_staging.mapped_span.data();
320 374 const std::span<BufferCopy> copies_span(copies.data(), copies.data() + copies.size());
321private: 375 for (BufferCopy& copy : copies) {
322 MapInterval* MapAddress(Buffer* block, GPUVAddr gpu_addr, VAddr cpu_addr, std::size_t size) { 376 // Modify copies to have the staging offset in mind
323 const VectorMapInterval overlaps = GetMapsInRange(cpu_addr, size); 377 copy.dst_offset += download_staging.offset;
324 if (overlaps.empty()) {
325 const VAddr cpu_addr_end = cpu_addr + size;
326 if (gpu_memory.IsGranularRange(gpu_addr, size)) {
327 u8* const host_ptr = gpu_memory.GetPointer(gpu_addr);
328 block->Upload(block->Offset(cpu_addr), size, host_ptr);
329 } else {
330 staging_buffer.resize(size);
331 gpu_memory.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size);
332 block->Upload(block->Offset(cpu_addr), size, staging_buffer.data());
333 } 378 }
334 return Register(MapInterval(cpu_addr, cpu_addr_end, gpu_addr)); 379 runtime.CopyBuffer(download_staging.buffer, buffer, copies_span);
335 } 380 runtime.Finish();
336 381 for (const BufferCopy& copy : copies) {
337 const VAddr cpu_addr_end = cpu_addr + size; 382 const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset;
338 if (overlaps.size() == 1) { 383 // Undo the modified offset
339 MapInterval* const current_map = overlaps[0]; 384 const u64 dst_offset = copy.dst_offset - download_staging.offset;
340 if (current_map->IsInside(cpu_addr, cpu_addr_end)) { 385 const u8* copy_mapped_memory = mapped_memory + dst_offset;
341 return current_map; 386 cpu_memory.WriteBlockUnsafe(copy_cpu_addr, copy_mapped_memory, copy.size);
387 }
388 } else {
389 const std::span<u8> immediate_buffer = ImmediateBuffer(largest_copy);
390 for (const BufferCopy& copy : copies) {
391 buffer.ImmediateDownload(copy.src_offset, immediate_buffer.subspan(0, copy.size));
392 const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset;
393 cpu_memory.WriteBlockUnsafe(copy_cpu_addr, immediate_buffer.data(), copy.size);
342 } 394 }
343 } 395 }
344 VAddr new_start = cpu_addr; 396 });
345 VAddr new_end = cpu_addr_end; 397}
346 bool write_inheritance = false; 398
347 bool modified_inheritance = false; 399template <class P>
348 // Calculate new buffer parameters 400void BufferCache<P>::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr,
349 for (MapInterval* overlap : overlaps) { 401 u32 size) {
350 new_start = std::min(overlap->start, new_start); 402 const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr);
351 new_end = std::max(overlap->end, new_end); 403 if (!cpu_addr) {
352 write_inheritance |= overlap->is_written; 404 uniform_buffers[stage][index] = NULL_BINDING;
353 modified_inheritance |= overlap->is_modified; 405 return;
406 }
407 const Binding binding{
408 .cpu_addr = *cpu_addr,
409 .size = size,
410 .buffer_id = BufferId{},
411 };
412 uniform_buffers[stage][index] = binding;
413}
414
415template <class P>
416void BufferCache<P>::UpdateGraphicsBuffers(bool is_indexed) {
417 MICROPROFILE_SCOPE(GPU_PrepareBuffers);
418 do {
419 has_deleted_buffers = false;
420 DoUpdateGraphicsBuffers(is_indexed);
421 } while (has_deleted_buffers);
422}
423
424template <class P>
425void BufferCache<P>::UpdateComputeBuffers() {
426 MICROPROFILE_SCOPE(GPU_PrepareBuffers);
427 do {
428 has_deleted_buffers = false;
429 DoUpdateComputeBuffers();
430 } while (has_deleted_buffers);
431}
432
433template <class P>
434void BufferCache<P>::BindHostGeometryBuffers(bool is_indexed) {
435 MICROPROFILE_SCOPE(GPU_BindUploadBuffers);
436 if (is_indexed) {
437 BindHostIndexBuffer();
438 } else if constexpr (!HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT) {
439 const auto& regs = maxwell3d.regs;
440 if (regs.draw.topology == Maxwell::PrimitiveTopology::Quads) {
441 runtime.BindQuadArrayIndexBuffer(regs.vertex_buffer.first, regs.vertex_buffer.count);
354 } 442 }
355 GPUVAddr new_gpu_addr = gpu_addr + new_start - cpu_addr; 443 }
356 for (auto& overlap : overlaps) { 444 BindHostVertexBuffers();
357 Unregister(overlap); 445 BindHostTransformFeedbackBuffers();
446}
447
448template <class P>
449void BufferCache<P>::BindHostStageBuffers(size_t stage) {
450 MICROPROFILE_SCOPE(GPU_BindUploadBuffers);
451 BindHostGraphicsUniformBuffers(stage);
452 BindHostGraphicsStorageBuffers(stage);
453}
454
455template <class P>
456void BufferCache<P>::BindHostComputeBuffers() {
457 MICROPROFILE_SCOPE(GPU_BindUploadBuffers);
458 BindHostComputeUniformBuffers();
459 BindHostComputeStorageBuffers();
460}
461
462template <class P>
463void BufferCache<P>::SetEnabledUniformBuffers(size_t stage, u32 enabled) {
464 if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) {
465 if (enabled_uniform_buffers[stage] != enabled) {
466 dirty_uniform_buffers[stage] = ~u32{0};
358 } 467 }
359 UpdateBlock(block, new_start, new_end, overlaps); 468 }
360 469 enabled_uniform_buffers[stage] = enabled;
361 const MapInterval new_map{new_start, new_end, new_gpu_addr}; 470}
362 MapInterval* const map = Register(new_map, write_inheritance); 471
363 if (!map) { 472template <class P>
364 return nullptr; 473void BufferCache<P>::SetEnabledComputeUniformBuffers(u32 enabled) {
474 enabled_compute_uniform_buffers = enabled;
475}
476
477template <class P>
478void BufferCache<P>::UnbindGraphicsStorageBuffers(size_t stage) {
479 enabled_storage_buffers[stage] = 0;
480 written_storage_buffers[stage] = 0;
481}
482
483template <class P>
484void BufferCache<P>::BindGraphicsStorageBuffer(size_t stage, size_t ssbo_index, u32 cbuf_index,
485 u32 cbuf_offset, bool is_written) {
486 enabled_storage_buffers[stage] |= 1U << ssbo_index;
487 written_storage_buffers[stage] |= (is_written ? 1U : 0U) << ssbo_index;
488
489 const auto& cbufs = maxwell3d.state.shader_stages[stage];
490 const GPUVAddr ssbo_addr = cbufs.const_buffers[cbuf_index].address + cbuf_offset;
491 storage_buffers[stage][ssbo_index] = StorageBufferBinding(ssbo_addr);
492}
493
494template <class P>
495void BufferCache<P>::UnbindComputeStorageBuffers() {
496 enabled_compute_storage_buffers = 0;
497 written_compute_storage_buffers = 0;
498}
499
500template <class P>
501void BufferCache<P>::BindComputeStorageBuffer(size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset,
502 bool is_written) {
503 enabled_compute_storage_buffers |= 1U << ssbo_index;
504 written_compute_storage_buffers |= (is_written ? 1U : 0U) << ssbo_index;
505
506 const auto& launch_desc = kepler_compute.launch_description;
507 ASSERT(((launch_desc.const_buffer_enable_mask >> cbuf_index) & 1) != 0);
508
509 const auto& cbufs = launch_desc.const_buffer_config;
510 const GPUVAddr ssbo_addr = cbufs[cbuf_index].Address() + cbuf_offset;
511 compute_storage_buffers[ssbo_index] = StorageBufferBinding(ssbo_addr);
512}
513
514template <class P>
515void BufferCache<P>::FlushCachedWrites() {
516 for (const BufferId buffer_id : cached_write_buffer_ids) {
517 slot_buffers[buffer_id].FlushCachedWrites();
518 }
519 cached_write_buffer_ids.clear();
520}
521
522template <class P>
523bool BufferCache<P>::HasUncommittedFlushes() const noexcept {
524 return !uncommitted_downloads.empty();
525}
526
527template <class P>
528bool BufferCache<P>::ShouldWaitAsyncFlushes() const noexcept {
529 return !committed_downloads.empty() && !committed_downloads.front().empty();
530}
531
532template <class P>
533void BufferCache<P>::CommitAsyncFlushes() {
 534 // The value is intentionally passed by copy
535 committed_downloads.push_front(uncommitted_downloads);
536 uncommitted_downloads.clear();
537}
538
539template <class P>
540void BufferCache<P>::PopAsyncFlushes() {
541 if (committed_downloads.empty()) {
542 return;
543 }
544 auto scope_exit_pop_download = detail::ScopeExit([this] { committed_downloads.pop_back(); });
545 const std::span<const BufferId> download_ids = committed_downloads.back();
546 if (download_ids.empty()) {
547 return;
548 }
549 MICROPROFILE_SCOPE(GPU_DownloadMemory);
550
551 boost::container::small_vector<std::pair<BufferCopy, BufferId>, 1> downloads;
552 u64 total_size_bytes = 0;
553 u64 largest_copy = 0;
554 for (const BufferId buffer_id : download_ids) {
555 slot_buffers[buffer_id].ForEachDownloadRange([&](u64 range_offset, u64 range_size) {
556 downloads.push_back({
557 BufferCopy{
558 .src_offset = range_offset,
559 .dst_offset = total_size_bytes,
560 .size = range_size,
561 },
562 buffer_id,
563 });
564 total_size_bytes += range_size;
565 largest_copy = std::max(largest_copy, range_size);
566 });
567 }
568 if (downloads.empty()) {
569 return;
570 }
571 if constexpr (USE_MEMORY_MAPS) {
572 auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes);
573 for (auto& [copy, buffer_id] : downloads) {
 574 // Account for the staging buffer offset in the copy
575 copy.dst_offset += download_staging.offset;
576 const std::array copies{copy};
577 runtime.CopyBuffer(download_staging.buffer, slot_buffers[buffer_id], copies);
365 } 578 }
366 if (modified_inheritance) { 579 runtime.Finish();
367 map->MarkAsModified(true, GetModifiedTicks()); 580 for (const auto [copy, buffer_id] : downloads) {
368 if (Settings::IsGPULevelHigh() && 581 const Buffer& buffer = slot_buffers[buffer_id];
369 Settings::values.use_asynchronous_gpu_emulation.GetValue()) { 582 const VAddr cpu_addr = buffer.CpuAddr() + copy.src_offset;
370 MarkForAsyncFlush(map); 583 // Undo the modified offset
371 } 584 const u64 dst_offset = copy.dst_offset - download_staging.offset;
585 const u8* read_mapped_memory = download_staging.mapped_span.data() + dst_offset;
586 cpu_memory.WriteBlockUnsafe(cpu_addr, read_mapped_memory, copy.size);
587 }
588 } else {
589 const std::span<u8> immediate_buffer = ImmediateBuffer(largest_copy);
590 for (const auto [copy, buffer_id] : downloads) {
591 Buffer& buffer = slot_buffers[buffer_id];
592 buffer.ImmediateDownload(copy.src_offset, immediate_buffer.subspan(0, copy.size));
593 const VAddr cpu_addr = buffer.CpuAddr() + copy.src_offset;
594 cpu_memory.WriteBlockUnsafe(cpu_addr, immediate_buffer.data(), copy.size);
372 } 595 }
373 return map;
374 } 596 }
375 597}
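The functions above implement the async-download handshake: buffer ids gathered during a frame land in uncommitted_downloads, CommitAsyncFlushes snapshots them into committed_downloads, and PopAsyncFlushes later drains the oldest snapshot. A minimal, self-contained sketch of that double-buffered queue; AsyncDownloadQueue and its members are illustrative stand-ins, not part of the patch:

    #include <cstdint>
    #include <deque>
    #include <vector>

    using BufferId = std::uint32_t; // stand-in for the cache's slot id type

    struct AsyncDownloadQueue {
        std::vector<BufferId> uncommitted;           // filled while recording a frame
        std::deque<std::vector<BufferId>> committed; // snapshots awaiting the GPU

        void MarkForDownload(BufferId id) {
            uncommitted.push_back(id);
        }
        // Like CommitAsyncFlushes: snapshot the pending ids (copied on purpose) and reset.
        void Commit() {
            committed.push_front(uncommitted);
            uncommitted.clear();
        }
        // Like PopAsyncFlushes: hand back the oldest snapshot and drop it from the queue.
        std::vector<BufferId> Pop() {
            if (committed.empty()) {
                return {};
            }
            std::vector<BufferId> oldest = std::move(committed.back());
            committed.pop_back();
            return oldest;
        }
    };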
376 void UpdateBlock(Buffer* block, VAddr start, VAddr end, const VectorMapInterval& overlaps) { 598
377 const IntervalType base_interval{start, end}; 599template <class P>
378 IntervalSet interval_set{}; 600bool BufferCache<P>::IsRegionGpuModified(VAddr addr, size_t size) {
379 interval_set.add(base_interval); 601 const u64 page_end = Common::DivCeil(addr + size, PAGE_SIZE);
380 for (auto& overlap : overlaps) { 602 for (u64 page = addr >> PAGE_BITS; page < page_end;) {
381 const IntervalType subtract{overlap->start, overlap->end}; 603 const BufferId image_id = page_table[page];
382 interval_set.subtract(subtract); 604 if (!image_id) {
605 ++page;
606 continue;
383 } 607 }
384 for (auto& interval : interval_set) { 608 Buffer& buffer = slot_buffers[image_id];
385 const std::size_t size = interval.upper() - interval.lower(); 609 if (buffer.IsRegionGpuModified(addr, size)) {
386 if (size == 0) { 610 return true;
387 continue;
388 }
389 staging_buffer.resize(size);
390 cpu_memory.ReadBlockUnsafe(interval.lower(), staging_buffer.data(), size);
391 block->Upload(block->Offset(interval.lower()), size, staging_buffer.data());
392 } 611 }
612 const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes();
613 page = Common::DivCeil(end_addr, PAGE_SIZE);
393 } 614 }
394 615 return false;
395 VectorMapInterval GetMapsInRange(VAddr addr, std::size_t size) { 616}
396 VectorMapInterval result; 617
397 if (size == 0) { 618template <class P>
398 return result; 619void BufferCache<P>::BindHostIndexBuffer() {
620 Buffer& buffer = slot_buffers[index_buffer.buffer_id];
621 const u32 offset = buffer.Offset(index_buffer.cpu_addr);
622 const u32 size = index_buffer.size;
623 SynchronizeBuffer(buffer, index_buffer.cpu_addr, size);
624 if constexpr (HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT) {
625 runtime.BindIndexBuffer(buffer, offset, size);
626 } else {
627 runtime.BindIndexBuffer(maxwell3d.regs.draw.topology, maxwell3d.regs.index_array.format,
628 maxwell3d.regs.index_array.first, maxwell3d.regs.index_array.count,
629 buffer, offset, size);
630 }
631}
632
633template <class P>
634void BufferCache<P>::BindHostVertexBuffers() {
635 auto& flags = maxwell3d.dirty.flags;
636 for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) {
637 const Binding& binding = vertex_buffers[index];
638 Buffer& buffer = slot_buffers[binding.buffer_id];
639 SynchronizeBuffer(buffer, binding.cpu_addr, binding.size);
640 if (!flags[Dirty::VertexBuffer0 + index]) {
641 continue;
399 } 642 }
643 flags[Dirty::VertexBuffer0 + index] = false;
400 644
401 const VAddr addr_end = addr + size; 645 const u32 stride = maxwell3d.regs.vertex_array[index].stride;
402 auto it = mapped_addresses.lower_bound(addr); 646 const u32 offset = buffer.Offset(binding.cpu_addr);
403 if (it != mapped_addresses.begin()) { 647 runtime.BindVertexBuffer(index, buffer, offset, binding.size, stride);
404 --it; 648 }
649}
650
651template <class P>
652void BufferCache<P>::BindHostGraphicsUniformBuffers(size_t stage) {
653 u32 dirty = ~0U;
654 if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) {
655 dirty = std::exchange(dirty_uniform_buffers[stage], 0);
656 }
657 u32 binding_index = 0;
658 ForEachEnabledBit(enabled_uniform_buffers[stage], [&](u32 index) {
659 const bool needs_bind = ((dirty >> index) & 1) != 0;
660 BindHostGraphicsUniformBuffer(stage, index, binding_index, needs_bind);
661 if constexpr (NEEDS_BIND_UNIFORM_INDEX) {
662 ++binding_index;
405 } 663 }
406 while (it != mapped_addresses.end() && it->start < addr_end) { 664 });
407 if (it->Overlaps(addr, addr_end)) { 665}
408 result.push_back(&*it); 666
667template <class P>
668void BufferCache<P>::BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 binding_index,
669 bool needs_bind) {
670 const Binding& binding = uniform_buffers[stage][index];
671 const VAddr cpu_addr = binding.cpu_addr;
672 const u32 size = binding.size;
673 Buffer& buffer = slot_buffers[binding.buffer_id];
674 if (size <= SKIP_CACHE_SIZE && !buffer.IsRegionGpuModified(cpu_addr, size)) {
675 if constexpr (IS_OPENGL) {
676 if (runtime.HasFastBufferSubData()) {
677 // Fast path for Nvidia
678 if (!HasFastUniformBufferBound(stage, binding_index)) {
679 // We only have to bind when the currently bound buffer is not the fast version
680 runtime.BindFastUniformBuffer(stage, binding_index, size);
681 }
682 const auto span = ImmediateBufferWithData(cpu_addr, size);
683 runtime.PushFastUniformBuffer(stage, binding_index, span);
684 return;
409 } 685 }
410 ++it;
411 } 686 }
412 return result; 687 fast_bound_uniform_buffers[stage] |= 1U << binding_index;
413 }
414 688
415 /// Returns a ticks counter used for tracking when cached objects were last modified 689 // Stream buffer path to avoid stalling on non-Nvidia drivers or Vulkan
416 u64 GetModifiedTicks() { 690 const std::span<u8> span = runtime.BindMappedUniformBuffer(stage, binding_index, size);
417 return ++modified_ticks; 691 cpu_memory.ReadBlockUnsafe(cpu_addr, span.data(), size);
692 return;
418 } 693 }
419 694 // Classic cached path
420 void FlushMap(MapInterval* map) { 695 SynchronizeBuffer(buffer, cpu_addr, size);
421 const auto it = blocks.find(map->start >> BLOCK_PAGE_BITS); 696 if (!needs_bind && !HasFastUniformBufferBound(stage, binding_index)) {
422 ASSERT_OR_EXECUTE(it != blocks.end(), return;); 697 // Skip binding if it's not needed and if the bound buffer is not the fast version
423 698 // This exists to avoid instances where the fast buffer is bound and a GPU write happens
424 std::shared_ptr<Buffer> block = it->second; 699 return;
425
426 const std::size_t size = map->end - map->start;
427 staging_buffer.resize(size);
428 block->Download(block->Offset(map->start), size, staging_buffer.data());
429 cpu_memory.WriteBlockUnsafe(map->start, staging_buffer.data(), size);
430 map->MarkAsModified(false, 0);
431 } 700 }
701 fast_bound_uniform_buffers[stage] &= ~(1U << binding_index);
432 702
433 template <typename Callable> 703 const u32 offset = buffer.Offset(cpu_addr);
434 BufferInfo StreamBufferUpload(std::size_t size, std::size_t alignment, Callable&& callable) { 704 if constexpr (NEEDS_BIND_UNIFORM_INDEX) {
435 AlignBuffer(alignment); 705 runtime.BindUniformBuffer(stage, binding_index, buffer, offset, size);
436 const std::size_t uploaded_offset = buffer_offset; 706 } else {
437 callable(buffer_ptr); 707 runtime.BindUniformBuffer(buffer, offset, size);
438
439 buffer_ptr += size;
440 buffer_offset += size;
441 return BufferInfo{stream_buffer.Handle(), uploaded_offset, stream_buffer.Address()};
442 } 708 }
709}
710
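The binding loops in this area walk their enabled_*_buffers bitmasks through ForEachEnabledBit, a helper declared elsewhere in this header and not visible in this hunk. As an illustration only, one plausible C++20 implementation that visits set bits from lowest to highest:

    #include <bit>
    #include <cstdint>

    // Calls func(bit_index) once for every set bit in mask, lowest bit first.
    // Example: ForEachEnabledBit(0b1010u, f) invokes f(1) and then f(3).
    template <typename Func>
    void ForEachEnabledBit(std::uint32_t mask, Func&& func) {
        while (mask != 0) {
            const int index = std::countr_zero(mask); // position of the lowest set bit
            func(static_cast<std::uint32_t>(index));
            mask &= mask - 1; // clear the lowest set bit
        }
    }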
711template <class P>
712void BufferCache<P>::BindHostGraphicsStorageBuffers(size_t stage) {
713 u32 binding_index = 0;
714 ForEachEnabledBit(enabled_storage_buffers[stage], [&](u32 index) {
715 const Binding& binding = storage_buffers[stage][index];
716 Buffer& buffer = slot_buffers[binding.buffer_id];
717 const u32 size = binding.size;
718 SynchronizeBuffer(buffer, binding.cpu_addr, size);
719
720 const u32 offset = buffer.Offset(binding.cpu_addr);
721 const bool is_written = ((written_storage_buffers[stage] >> index) & 1) != 0;
722 if constexpr (NEEDS_BIND_STORAGE_INDEX) {
723 runtime.BindStorageBuffer(stage, binding_index, buffer, offset, size, is_written);
724 ++binding_index;
725 } else {
726 runtime.BindStorageBuffer(buffer, offset, size, is_written);
727 }
728 });
729}
443 730
444 void AlignBuffer(std::size_t alignment) { 731template <class P>
445 // Align the offset, not the mapped pointer 732void BufferCache<P>::BindHostTransformFeedbackBuffers() {
446 const std::size_t offset_aligned = Common::AlignUp(buffer_offset, alignment); 733 if (maxwell3d.regs.tfb_enabled == 0) {
447 buffer_ptr += offset_aligned - buffer_offset; 734 return;
448 buffer_offset = offset_aligned;
449 } 735 }
736 for (u32 index = 0; index < NUM_TRANSFORM_FEEDBACK_BUFFERS; ++index) {
737 const Binding& binding = transform_feedback_buffers[index];
738 Buffer& buffer = slot_buffers[binding.buffer_id];
739 const u32 size = binding.size;
740 SynchronizeBuffer(buffer, binding.cpu_addr, size);
741
742 const u32 offset = buffer.Offset(binding.cpu_addr);
743 runtime.BindTransformFeedbackBuffer(index, buffer, offset, size);
744 }
745}
450 746
451 std::shared_ptr<Buffer> EnlargeBlock(std::shared_ptr<Buffer> buffer) { 747template <class P>
452 const std::size_t old_size = buffer->Size(); 748void BufferCache<P>::BindHostComputeUniformBuffers() {
453 const std::size_t new_size = old_size + BLOCK_PAGE_SIZE; 749 if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) {
454 const VAddr cpu_addr = buffer->CpuAddr(); 750 // Mark all uniform buffers as dirty
455 std::shared_ptr<Buffer> new_buffer = CreateBlock(cpu_addr, new_size); 751 dirty_uniform_buffers.fill(~u32{0});
456 new_buffer->CopyFrom(*buffer, 0, 0, old_size); 752 }
457 QueueDestruction(std::move(buffer)); 753 u32 binding_index = 0;
458 754 ForEachEnabledBit(enabled_compute_uniform_buffers, [&](u32 index) {
459 const VAddr cpu_addr_end = cpu_addr + new_size - 1; 755 const Binding& binding = compute_uniform_buffers[index];
460 const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS; 756 Buffer& buffer = slot_buffers[binding.buffer_id];
461 for (u64 page_start = cpu_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) { 757 const u32 size = binding.size;
462 blocks.insert_or_assign(page_start, new_buffer); 758 SynchronizeBuffer(buffer, binding.cpu_addr, size);
759
760 const u32 offset = buffer.Offset(binding.cpu_addr);
761 if constexpr (NEEDS_BIND_UNIFORM_INDEX) {
762 runtime.BindComputeUniformBuffer(binding_index, buffer, offset, size);
763 ++binding_index;
764 } else {
765 runtime.BindUniformBuffer(buffer, offset, size);
463 } 766 }
767 });
768}
769
770template <class P>
771void BufferCache<P>::BindHostComputeStorageBuffers() {
772 u32 binding_index = 0;
773 ForEachEnabledBit(enabled_compute_storage_buffers, [&](u32 index) {
774 const Binding& binding = compute_storage_buffers[index];
775 Buffer& buffer = slot_buffers[binding.buffer_id];
776 const u32 size = binding.size;
777 SynchronizeBuffer(buffer, binding.cpu_addr, size);
778
779 const u32 offset = buffer.Offset(binding.cpu_addr);
780 const bool is_written = ((written_compute_storage_buffers >> index) & 1) != 0;
781 if constexpr (NEEDS_BIND_STORAGE_INDEX) {
782 runtime.BindComputeStorageBuffer(binding_index, buffer, offset, size, is_written);
783 ++binding_index;
784 } else {
785 runtime.BindStorageBuffer(buffer, offset, size, is_written);
786 }
787 });
788}
464 789
465 return new_buffer; 790template <class P>
791void BufferCache<P>::DoUpdateGraphicsBuffers(bool is_indexed) {
792 if (is_indexed) {
793 UpdateIndexBuffer();
466 } 794 }
795 UpdateVertexBuffers();
796 UpdateTransformFeedbackBuffers();
797 for (size_t stage = 0; stage < NUM_STAGES; ++stage) {
798 UpdateUniformBuffers(stage);
799 UpdateStorageBuffers(stage);
800 }
801}
802
803template <class P>
804void BufferCache<P>::DoUpdateComputeBuffers() {
805 UpdateComputeUniformBuffers();
806 UpdateComputeStorageBuffers();
807}
808
809template <class P>
810void BufferCache<P>::UpdateIndexBuffer() {
 811 // We have to check both the dirty flags and the index count
 812 // because the index count is currently changed without updating the dirty flags
813 const auto& index_array = maxwell3d.regs.index_array;
814 auto& flags = maxwell3d.dirty.flags;
815 if (!flags[Dirty::IndexBuffer] && last_index_count == index_array.count) {
816 return;
817 }
818 flags[Dirty::IndexBuffer] = false;
819 last_index_count = index_array.count;
820
821 const GPUVAddr gpu_addr_begin = index_array.StartAddress();
822 const GPUVAddr gpu_addr_end = index_array.EndAddress();
823 const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr_begin);
824 const u32 address_size = static_cast<u32>(gpu_addr_end - gpu_addr_begin);
825 const u32 draw_size = index_array.count * index_array.FormatSizeInBytes();
826 const u32 size = std::min(address_size, draw_size);
827 if (size == 0 || !cpu_addr) {
828 index_buffer = NULL_BINDING;
829 return;
830 }
831 index_buffer = Binding{
832 .cpu_addr = *cpu_addr,
833 .size = size,
834 .buffer_id = FindBuffer(*cpu_addr, size),
835 };
836}
467 837
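UpdateIndexBuffer above binds the smaller of two sizes: the bytes actually mapped between the index array's start and end addresses, and the bytes the draw will consume (index count times index format size). A tiny sketch of that clamp, with illustrative parameter names:

    #include <algorithm>
    #include <cstdint>

    std::uint32_t EffectiveIndexBufferSize(std::uint64_t gpu_addr_begin, std::uint64_t gpu_addr_end,
                                           std::uint32_t index_count, std::uint32_t format_size) {
        const auto address_size = static_cast<std::uint32_t>(gpu_addr_end - gpu_addr_begin);
        const std::uint32_t draw_size = index_count * format_size;
        return std::min(address_size, draw_size);
    }

    // Example: 0x1000 bytes mapped but only 100 u16 indices drawn -> 200 bytes are bound.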
468 std::shared_ptr<Buffer> MergeBlocks(std::shared_ptr<Buffer> first, 838template <class P>
469 std::shared_ptr<Buffer> second) { 839void BufferCache<P>::UpdateVertexBuffers() {
470 const std::size_t size_1 = first->Size(); 840 auto& flags = maxwell3d.dirty.flags;
471 const std::size_t size_2 = second->Size(); 841 if (!maxwell3d.dirty.flags[Dirty::VertexBuffers]) {
472 const VAddr first_addr = first->CpuAddr(); 842 return;
473 const VAddr second_addr = second->CpuAddr(); 843 }
474 const VAddr new_addr = std::min(first_addr, second_addr); 844 flags[Dirty::VertexBuffers] = false;
475 const std::size_t new_size = size_1 + size_2;
476
477 std::shared_ptr<Buffer> new_buffer = CreateBlock(new_addr, new_size);
478 new_buffer->CopyFrom(*first, 0, new_buffer->Offset(first_addr), size_1);
479 new_buffer->CopyFrom(*second, 0, new_buffer->Offset(second_addr), size_2);
480 QueueDestruction(std::move(first));
481 QueueDestruction(std::move(second));
482 845
483 const VAddr cpu_addr_end = new_addr + new_size - 1; 846 for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) {
484 const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS; 847 UpdateVertexBuffer(index);
485 for (u64 page_start = new_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) {
486 blocks.insert_or_assign(page_start, new_buffer);
487 }
488 return new_buffer;
489 } 848 }
849}
490 850
491 Buffer* GetBlock(VAddr cpu_addr, std::size_t size) { 851template <class P>
492 std::shared_ptr<Buffer> found; 852void BufferCache<P>::UpdateVertexBuffer(u32 index) {
853 if (!maxwell3d.dirty.flags[Dirty::VertexBuffer0 + index]) {
854 return;
855 }
856 const auto& array = maxwell3d.regs.vertex_array[index];
857 const auto& limit = maxwell3d.regs.vertex_array_limit[index];
858 const GPUVAddr gpu_addr_begin = array.StartAddress();
859 const GPUVAddr gpu_addr_end = limit.LimitAddress() + 1;
860 const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr_begin);
861 const u32 address_size = static_cast<u32>(gpu_addr_end - gpu_addr_begin);
862 const u32 size = address_size; // TODO: Analyze stride and number of vertices
863 if (array.enable == 0 || size == 0 || !cpu_addr) {
864 vertex_buffers[index] = NULL_BINDING;
865 return;
866 }
867 vertex_buffers[index] = Binding{
868 .cpu_addr = *cpu_addr,
869 .size = size,
870 .buffer_id = FindBuffer(*cpu_addr, size),
871 };
872}
873
874template <class P>
875void BufferCache<P>::UpdateUniformBuffers(size_t stage) {
876 ForEachEnabledBit(enabled_uniform_buffers[stage], [&](u32 index) {
877 Binding& binding = uniform_buffers[stage][index];
878 if (binding.buffer_id) {
879 // Already updated
880 return;
881 }
882 // Mark as dirty
883 if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) {
884 dirty_uniform_buffers[stage] |= 1U << index;
885 }
886 // Resolve buffer
887 binding.buffer_id = FindBuffer(binding.cpu_addr, binding.size);
888 });
889}
890
891template <class P>
892void BufferCache<P>::UpdateStorageBuffers(size_t stage) {
893 const u32 written_mask = written_storage_buffers[stage];
894 ForEachEnabledBit(enabled_storage_buffers[stage], [&](u32 index) {
895 // Resolve buffer
896 Binding& binding = storage_buffers[stage][index];
897 const BufferId buffer_id = FindBuffer(binding.cpu_addr, binding.size);
898 binding.buffer_id = buffer_id;
899 // Mark buffer as written if needed
900 if (((written_mask >> index) & 1) != 0) {
901 MarkWrittenBuffer(buffer_id, binding.cpu_addr, binding.size);
902 }
903 });
904}
493 905
494 const VAddr cpu_addr_end = cpu_addr + size - 1; 906template <class P>
495 const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS; 907void BufferCache<P>::UpdateTransformFeedbackBuffers() {
496 for (u64 page_start = cpu_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) { 908 if (maxwell3d.regs.tfb_enabled == 0) {
497 auto it = blocks.find(page_start); 909 return;
498 if (it == blocks.end()) { 910 }
499 if (found) { 911 for (u32 index = 0; index < NUM_TRANSFORM_FEEDBACK_BUFFERS; ++index) {
500 found = EnlargeBlock(found); 912 UpdateTransformFeedbackBuffer(index);
501 continue; 913 }
502 } 914}
503 const VAddr start_addr = page_start << BLOCK_PAGE_BITS; 915
504 found = CreateBlock(start_addr, BLOCK_PAGE_SIZE); 916template <class P>
505 blocks.insert_or_assign(page_start, found); 917void BufferCache<P>::UpdateTransformFeedbackBuffer(u32 index) {
506 continue; 918 const auto& binding = maxwell3d.regs.tfb_bindings[index];
507 } 919 const GPUVAddr gpu_addr = binding.Address() + binding.buffer_offset;
508 if (!found) { 920 const u32 size = binding.buffer_size;
509 found = it->second; 921 const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr);
510 continue; 922 if (binding.buffer_enable == 0 || size == 0 || !cpu_addr) {
511 } 923 transform_feedback_buffers[index] = NULL_BINDING;
512 if (found != it->second) { 924 return;
513 found = MergeBlocks(std::move(found), it->second); 925 }
926 const BufferId buffer_id = FindBuffer(*cpu_addr, size);
927 transform_feedback_buffers[index] = Binding{
928 .cpu_addr = *cpu_addr,
929 .size = size,
930 .buffer_id = buffer_id,
931 };
932 MarkWrittenBuffer(buffer_id, *cpu_addr, size);
933}
934
935template <class P>
936void BufferCache<P>::UpdateComputeUniformBuffers() {
937 ForEachEnabledBit(enabled_compute_uniform_buffers, [&](u32 index) {
938 Binding& binding = compute_uniform_buffers[index];
939 binding = NULL_BINDING;
940 const auto& launch_desc = kepler_compute.launch_description;
941 if (((launch_desc.const_buffer_enable_mask >> index) & 1) != 0) {
942 const auto& cbuf = launch_desc.const_buffer_config[index];
943 const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(cbuf.Address());
944 if (cpu_addr) {
945 binding.cpu_addr = *cpu_addr;
946 binding.size = cbuf.size;
514 } 947 }
515 } 948 }
516 return found.get(); 949 binding.buffer_id = FindBuffer(binding.cpu_addr, binding.size);
950 });
951}
952
953template <class P>
954void BufferCache<P>::UpdateComputeStorageBuffers() {
955 ForEachEnabledBit(enabled_compute_storage_buffers, [&](u32 index) {
956 // Resolve buffer
957 Binding& binding = compute_storage_buffers[index];
958 const BufferId buffer_id = FindBuffer(binding.cpu_addr, binding.size);
959 binding.buffer_id = buffer_id;
960 // Mark as written if needed
961 if (((written_compute_storage_buffers >> index) & 1) != 0) {
962 MarkWrittenBuffer(buffer_id, binding.cpu_addr, binding.size);
963 }
964 });
965}
966
967template <class P>
968void BufferCache<P>::MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 size) {
969 Buffer& buffer = slot_buffers[buffer_id];
970 buffer.MarkRegionAsGpuModified(cpu_addr, size);
971
972 const bool is_accuracy_high = Settings::IsGPULevelHigh();
973 const bool is_async = Settings::values.use_asynchronous_gpu_emulation.GetValue();
974 if (!is_accuracy_high || !is_async) {
975 return;
976 }
977 if (std::ranges::find(uncommitted_downloads, buffer_id) != uncommitted_downloads.end()) {
978 // Already inserted
979 return;
517 } 980 }
981 uncommitted_downloads.push_back(buffer_id);
982}
518 983
519 void MarkRegionAsWritten(VAddr start, VAddr end) { 984template <class P>
520 const u64 page_end = end >> WRITE_PAGE_BIT; 985BufferId BufferCache<P>::FindBuffer(VAddr cpu_addr, u32 size) {
521 for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) { 986 if (cpu_addr == 0) {
522 if (const auto [it, inserted] = written_pages.emplace(page_start, 1); !inserted) { 987 return NULL_BUFFER_ID;
523 ++it->second; 988 }
524 } 989 const u64 page = cpu_addr >> PAGE_BITS;
990 const BufferId buffer_id = page_table[page];
991 if (!buffer_id) {
992 return CreateBuffer(cpu_addr, size);
993 }
994 const Buffer& buffer = slot_buffers[buffer_id];
995 if (buffer.IsInBounds(cpu_addr, size)) {
996 return buffer_id;
997 }
998 return CreateBuffer(cpu_addr, size);
999}
1000
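FindBuffer above, and Register/ChangeRegister further down, both lean on a flat page table: every page of the CPU address space maps to the BufferId covering it, and a null id means the page is untracked. The sketch below shows that lookup-and-map shape under assumed constants; PAGE_BITS, the table sizing and the PageTableSketch type are stand-ins, not the cache's actual members:

    #include <cstdint>
    #include <vector>

    constexpr std::uint64_t PAGE_BITS = 16; // assumed 64 KiB pages for this sketch
    using BufferId = std::uint32_t;         // 0 stands in for "no buffer"

    class PageTableSketch {
    public:
        explicit PageTableSketch(std::uint64_t address_space_bytes)
            : pages_(address_space_bytes >> PAGE_BITS, BufferId{0}) {}

        // Registration walks every page the buffer covers, like ChangeRegister<insert>().
        void Map(std::uint64_t cpu_begin, std::uint64_t cpu_end, BufferId id) {
            for (std::uint64_t page = cpu_begin >> PAGE_BITS; page < DivCeilPage(cpu_end); ++page) {
                pages_[page] = id;
            }
        }

        // FindBuffer-style lookup: a single shift and index.
        BufferId Lookup(std::uint64_t cpu_addr) const {
            return pages_[cpu_addr >> PAGE_BITS];
        }

    private:
        static std::uint64_t DivCeilPage(std::uint64_t addr) {
            return (addr + (1ULL << PAGE_BITS) - 1) >> PAGE_BITS;
        }
        std::vector<BufferId> pages_;
    };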
1001template <class P>
1002typename BufferCache<P>::OverlapResult BufferCache<P>::ResolveOverlaps(VAddr cpu_addr,
1003 u32 wanted_size) {
1004 static constexpr int STREAM_LEAP_THRESHOLD = 16;
1005 std::vector<BufferId> overlap_ids;
1006 VAddr begin = cpu_addr;
1007 VAddr end = cpu_addr + wanted_size;
1008 int stream_score = 0;
1009 bool has_stream_leap = false;
1010 for (; cpu_addr >> PAGE_BITS < Common::DivCeil(end, PAGE_SIZE); cpu_addr += PAGE_SIZE) {
1011 const BufferId overlap_id = page_table[cpu_addr >> PAGE_BITS];
1012 if (!overlap_id) {
1013 continue;
1014 }
1015 Buffer& overlap = slot_buffers[overlap_id];
1016 if (overlap.IsPicked()) {
1017 continue;
1018 }
1019 overlap_ids.push_back(overlap_id);
1020 overlap.Pick();
1021 const VAddr overlap_cpu_addr = overlap.CpuAddr();
1022 if (overlap_cpu_addr < begin) {
1023 cpu_addr = begin = overlap_cpu_addr;
1024 }
1025 end = std::max(end, overlap_cpu_addr + overlap.SizeBytes());
1026
1027 stream_score += overlap.StreamScore();
1028 if (stream_score > STREAM_LEAP_THRESHOLD && !has_stream_leap) {
 1029 // When this memory region has been joined repeatedly, we assume it is being used
 1030 // as a stream buffer. Increase the size to avoid constantly recreating buffers.
1031 has_stream_leap = true;
1032 end += PAGE_SIZE * 256;
525 } 1033 }
526 } 1034 }
527 1035 return OverlapResult{
528 void UnmarkRegionAsWritten(VAddr start, VAddr end) { 1036 .ids = std::move(overlap_ids),
529 const u64 page_end = end >> WRITE_PAGE_BIT; 1037 .begin = begin,
530 for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) { 1038 .end = end,
531 auto it = written_pages.find(page_start); 1039 .has_stream_leap = has_stream_leap,
532 if (it != written_pages.end()) { 1040 };
533 if (it->second > 1) { 1041}
534 --it->second; 1042
535 } else { 1043template <class P>
536 written_pages.erase(it); 1044void BufferCache<P>::JoinOverlap(BufferId new_buffer_id, BufferId overlap_id,
537 } 1045 bool accumulate_stream_score) {
538 } 1046 Buffer& new_buffer = slot_buffers[new_buffer_id];
1047 Buffer& overlap = slot_buffers[overlap_id];
1048 if (accumulate_stream_score) {
1049 new_buffer.IncreaseStreamScore(overlap.StreamScore() + 1);
1050 }
1051 std::vector<BufferCopy> copies;
1052 const size_t dst_base_offset = overlap.CpuAddr() - new_buffer.CpuAddr();
1053 overlap.ForEachDownloadRange([&](u64 begin, u64 range_size) {
1054 copies.push_back(BufferCopy{
1055 .src_offset = begin,
1056 .dst_offset = dst_base_offset + begin,
1057 .size = range_size,
1058 });
1059 new_buffer.UnmarkRegionAsCpuModified(begin, range_size);
1060 new_buffer.MarkRegionAsGpuModified(begin, range_size);
1061 });
1062 if (!copies.empty()) {
1063 runtime.CopyBuffer(slot_buffers[new_buffer_id], overlap, copies);
1064 }
1065 ReplaceBufferDownloads(overlap_id, new_buffer_id);
1066 DeleteBuffer(overlap_id);
1067}
1068
1069template <class P>
1070BufferId BufferCache<P>::CreateBuffer(VAddr cpu_addr, u32 wanted_size) {
1071 const OverlapResult overlap = ResolveOverlaps(cpu_addr, wanted_size);
1072 const u32 size = static_cast<u32>(overlap.end - overlap.begin);
1073 const BufferId new_buffer_id = slot_buffers.insert(runtime, rasterizer, overlap.begin, size);
1074 for (const BufferId overlap_id : overlap.ids) {
1075 JoinOverlap(new_buffer_id, overlap_id, !overlap.has_stream_leap);
1076 }
1077 Register(new_buffer_id);
1078 return new_buffer_id;
1079}
1080
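CreateBuffer above first lets ResolveOverlaps widen the requested range to the union of every overlapping buffer, then allocates one buffer for that union and joins the old ones into it. The range arithmetic in isolation, as a sketch; Range and UnionWithOverlaps are illustrative names:

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    struct Range {
        std::uint64_t begin;
        std::uint64_t end; // one past the last byte
    };

    // Widen [cpu_addr, cpu_addr + wanted_size) so it also covers every overlapping range,
    // mirroring how ResolveOverlaps grows begin/end before CreateBuffer allocates the union.
    Range UnionWithOverlaps(std::uint64_t cpu_addr, std::uint32_t wanted_size,
                            const std::vector<Range>& overlaps) {
        Range result{cpu_addr, cpu_addr + wanted_size};
        for (const Range& overlap : overlaps) {
            result.begin = std::min(result.begin, overlap.begin);
            result.end = std::max(result.end, overlap.end);
        }
        return result;
    }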
1081template <class P>
1082void BufferCache<P>::Register(BufferId buffer_id) {
1083 ChangeRegister<true>(buffer_id);
1084}
1085
1086template <class P>
1087void BufferCache<P>::Unregister(BufferId buffer_id) {
1088 ChangeRegister<false>(buffer_id);
1089}
1090
1091template <class P>
1092template <bool insert>
1093void BufferCache<P>::ChangeRegister(BufferId buffer_id) {
1094 const Buffer& buffer = slot_buffers[buffer_id];
1095 const VAddr cpu_addr_begin = buffer.CpuAddr();
1096 const VAddr cpu_addr_end = cpu_addr_begin + buffer.SizeBytes();
1097 const u64 page_begin = cpu_addr_begin / PAGE_SIZE;
1098 const u64 page_end = Common::DivCeil(cpu_addr_end, PAGE_SIZE);
1099 for (u64 page = page_begin; page != page_end; ++page) {
1100 if constexpr (insert) {
1101 page_table[page] = buffer_id;
1102 } else {
1103 page_table[page] = BufferId{};
539 } 1104 }
540 } 1105 }
1106}
541 1107
542 bool IsRegionWritten(VAddr start, VAddr end) const { 1108template <class P>
543 const u64 page_end = end >> WRITE_PAGE_BIT; 1109void BufferCache<P>::SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size) {
544 for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) { 1110 if (buffer.CpuAddr() == 0) {
545 if (written_pages.contains(page_start)) { 1111 return;
546 return true; 1112 }
1113 SynchronizeBufferImpl(buffer, cpu_addr, size);
1114}
1115
1116template <class P>
1117void BufferCache<P>::SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size) {
1118 boost::container::small_vector<BufferCopy, 4> copies;
1119 u64 total_size_bytes = 0;
1120 u64 largest_copy = 0;
1121 buffer.ForEachUploadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) {
1122 copies.push_back(BufferCopy{
1123 .src_offset = total_size_bytes,
1124 .dst_offset = range_offset,
1125 .size = range_size,
1126 });
1127 total_size_bytes += range_size;
1128 largest_copy = std::max(largest_copy, range_size);
1129 });
1130 if (total_size_bytes == 0) {
1131 return;
1132 }
1133 const std::span<BufferCopy> copies_span(copies.data(), copies.size());
1134 UploadMemory(buffer, total_size_bytes, largest_copy, copies_span);
1135}
1136
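SynchronizeBufferImpl above uploads only the dirty ranges: each range becomes a BufferCopy whose source offset is packed back to back so one staging allocation of total_size_bytes can hold them all, while largest_copy sizes the immediate fallback path. A standalone sketch of that packing step; Copy, PackedCopies and PackDirtyRanges are illustrative names:

    #include <algorithm>
    #include <cstdint>
    #include <utility>
    #include <vector>

    struct Copy {
        std::uint64_t src_offset; // offset inside the staging allocation
        std::uint64_t dst_offset; // offset inside the destination buffer
        std::uint64_t size;
    };

    struct PackedCopies {
        std::vector<Copy> copies;
        std::uint64_t total_size = 0;   // how much staging memory to request
        std::uint64_t largest_copy = 0; // sizing hint for the immediate fallback path
    };

    // ranges holds (offset, size) pairs of dirty regions inside the destination buffer.
    PackedCopies PackDirtyRanges(const std::vector<std::pair<std::uint64_t, std::uint64_t>>& ranges) {
        PackedCopies out;
        for (const auto& [offset, size] : ranges) {
            out.copies.push_back(Copy{out.total_size, offset, size});
            out.total_size += size;
            out.largest_copy = std::max(out.largest_copy, size);
        }
        return out;
    }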
1137template <class P>
1138void BufferCache<P>::UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy,
1139 std::span<BufferCopy> copies) {
1140 if constexpr (USE_MEMORY_MAPS) {
1141 MappedUploadMemory(buffer, total_size_bytes, copies);
1142 } else {
1143 ImmediateUploadMemory(buffer, largest_copy, copies);
1144 }
1145}
1146
1147template <class P>
1148void BufferCache<P>::ImmediateUploadMemory(Buffer& buffer, u64 largest_copy,
1149 std::span<const BufferCopy> copies) {
1150 std::span<u8> immediate_buffer;
1151 for (const BufferCopy& copy : copies) {
1152 std::span<const u8> upload_span;
1153 const VAddr cpu_addr = buffer.CpuAddr() + copy.dst_offset;
1154 if (IsRangeGranular(cpu_addr, copy.size)) {
1155 upload_span = std::span(cpu_memory.GetPointer(cpu_addr), copy.size);
1156 } else {
1157 if (immediate_buffer.empty()) {
1158 immediate_buffer = ImmediateBuffer(largest_copy);
547 } 1159 }
1160 cpu_memory.ReadBlockUnsafe(cpu_addr, immediate_buffer.data(), copy.size);
1161 upload_span = immediate_buffer.subspan(0, copy.size);
548 } 1162 }
549 return false; 1163 buffer.ImmediateUpload(copy.dst_offset, upload_span);
550 } 1164 }
551 1165}
552 void QueueDestruction(std::shared_ptr<Buffer> buffer) { 1166
553 buffer->SetEpoch(epoch); 1167template <class P>
554 pending_destruction.push(std::move(buffer)); 1168void BufferCache<P>::MappedUploadMemory(Buffer& buffer, u64 total_size_bytes,
1169 std::span<BufferCopy> copies) {
1170 auto upload_staging = runtime.UploadStagingBuffer(total_size_bytes);
1171 const std::span<u8> staging_pointer = upload_staging.mapped_span;
1172 for (BufferCopy& copy : copies) {
1173 u8* const src_pointer = staging_pointer.data() + copy.src_offset;
1174 const VAddr cpu_addr = buffer.CpuAddr() + copy.dst_offset;
1175 cpu_memory.ReadBlockUnsafe(cpu_addr, src_pointer, copy.size);
1176
1177 // Apply the staging offset
1178 copy.src_offset += upload_staging.offset;
555 } 1179 }
556 1180 runtime.CopyBuffer(buffer, upload_staging.buffer, copies);
557 void MarkForAsyncFlush(MapInterval* map) { 1181}
558 if (!uncommitted_flushes) { 1182
559 uncommitted_flushes = std::make_shared<std::unordered_set<MapInterval*>>(); 1183template <class P>
1184void BufferCache<P>::DeleteBuffer(BufferId buffer_id) {
1185 const auto scalar_replace = [buffer_id](Binding& binding) {
1186 if (binding.buffer_id == buffer_id) {
1187 binding.buffer_id = BufferId{};
1188 }
1189 };
1190 const auto replace = [scalar_replace](std::span<Binding> bindings) {
1191 std::ranges::for_each(bindings, scalar_replace);
1192 };
1193 scalar_replace(index_buffer);
1194 replace(vertex_buffers);
1195 std::ranges::for_each(uniform_buffers, replace);
1196 std::ranges::for_each(storage_buffers, replace);
1197 replace(transform_feedback_buffers);
1198 replace(compute_uniform_buffers);
1199 replace(compute_storage_buffers);
1200 std::erase(cached_write_buffer_ids, buffer_id);
1201
1202 // Mark the whole buffer as CPU written to stop tracking CPU writes
1203 Buffer& buffer = slot_buffers[buffer_id];
1204 buffer.MarkRegionAsCpuModified(buffer.CpuAddr(), buffer.SizeBytes());
1205
1206 Unregister(buffer_id);
1207 delayed_destruction_ring.Push(std::move(slot_buffers[buffer_id]));
1208
1209 NotifyBufferDeletion();
1210}
1211
1212template <class P>
1213void BufferCache<P>::ReplaceBufferDownloads(BufferId old_buffer_id, BufferId new_buffer_id) {
1214 const auto replace = [old_buffer_id, new_buffer_id](std::vector<BufferId>& buffers) {
1215 std::ranges::replace(buffers, old_buffer_id, new_buffer_id);
1216 if (auto it = std::ranges::find(buffers, new_buffer_id); it != buffers.end()) {
1217 buffers.erase(std::remove(it + 1, buffers.end(), new_buffer_id), buffers.end());
560 } 1218 }
561 uncommitted_flushes->insert(map); 1219 };
1220 replace(uncommitted_downloads);
1221 std::ranges::for_each(committed_downloads, replace);
1222}
1223
1224template <class P>
1225void BufferCache<P>::NotifyBufferDeletion() {
1226 if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) {
1227 dirty_uniform_buffers.fill(~u32{0});
562 } 1228 }
1229 auto& flags = maxwell3d.dirty.flags;
1230 flags[Dirty::IndexBuffer] = true;
1231 flags[Dirty::VertexBuffers] = true;
1232 for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) {
1233 flags[Dirty::VertexBuffer0 + index] = true;
1234 }
1235 has_deleted_buffers = true;
1236}
1237
1238template <class P>
1239typename BufferCache<P>::Binding BufferCache<P>::StorageBufferBinding(GPUVAddr ssbo_addr) const {
1240 const GPUVAddr gpu_addr = gpu_memory.Read<u64>(ssbo_addr);
1241 const u32 size = gpu_memory.Read<u32>(ssbo_addr + 8);
1242 const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr);
1243 if (!cpu_addr || size == 0) {
1244 return NULL_BINDING;
1245 }
 1246 // HACK(Rodrigo): This is the number of bytes bound on the host beyond the guest API's range.
 1247 // It exists because some games, such as Astral Chain, operate out of bounds.
1248 // Binding the whole map range would be technically correct, but games have large maps that make
1249 // this approach unaffordable for now.
1250 static constexpr u32 arbitrary_extra_bytes = 0xc000;
1251 const u32 bytes_to_map_end = static_cast<u32>(gpu_memory.BytesToMapEnd(gpu_addr));
1252 const Binding binding{
1253 .cpu_addr = *cpu_addr,
1254 .size = std::min(size + arbitrary_extra_bytes, bytes_to_map_end),
1255 .buffer_id = BufferId{},
1256 };
1257 return binding;
1258}
1259
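StorageBufferBinding above reads the SSBO descriptor that the guest placed in memory: a 64-bit GPU address at offset 0 followed by a 32-bit size at offset 8, which is why it issues a Read<u64> at ssbo_addr and a Read<u32> at ssbo_addr + 8. A sketch of decoding that same 12-byte layout from raw bytes; the extra 0xc000 slack and the map-end clamp applied above are omitted here:

    #include <cstdint>
    #include <cstring>

    struct SsboDescriptor {
        std::uint64_t gpu_addr;
        std::uint32_t size;
    };

    // bytes must point at the 12-byte descriptor selected via cbuf_index/cbuf_offset.
    SsboDescriptor ReadSsboDescriptor(const unsigned char* bytes) {
        SsboDescriptor desc{};
        std::memcpy(&desc.gpu_addr, bytes, sizeof(desc.gpu_addr)); // offset 0: GPU address
        std::memcpy(&desc.size, bytes + 8, sizeof(desc.size));     // offset 8: size in bytes
        return desc;
    }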
1260template <class P>
1261std::span<const u8> BufferCache<P>::ImmediateBufferWithData(VAddr cpu_addr, size_t size) {
1262 u8* const base_pointer = cpu_memory.GetPointer(cpu_addr);
1263 if (IsRangeGranular(cpu_addr, size) ||
1264 base_pointer + size == cpu_memory.GetPointer(cpu_addr + size)) {
1265 return std::span(base_pointer, size);
1266 } else {
1267 const std::span<u8> span = ImmediateBuffer(size);
1268 cpu_memory.ReadBlockUnsafe(cpu_addr, span.data(), size);
1269 return span;
1270 }
1271}
563 1272
564 VideoCore::RasterizerInterface& rasterizer; 1273template <class P>
565 Tegra::MemoryManager& gpu_memory; 1274std::span<u8> BufferCache<P>::ImmediateBuffer(size_t wanted_capacity) {
566 Core::Memory::Memory& cpu_memory; 1275 if (wanted_capacity > immediate_buffer_capacity) {
567 StreamBuffer& stream_buffer; 1276 immediate_buffer_capacity = wanted_capacity;
568 1277 immediate_buffer_alloc = std::make_unique<u8[]>(wanted_capacity);
569 u8* buffer_ptr = nullptr; 1278 }
570 u64 buffer_offset = 0; 1279 return std::span<u8>(immediate_buffer_alloc.get(), wanted_capacity);
571 u64 buffer_offset_base = 0; 1280}
572 1281
573 MapIntervalAllocator mapped_addresses_allocator; 1282template <class P>
574 boost::intrusive::set<MapInterval, boost::intrusive::compare<MapIntervalCompare>> 1283bool BufferCache<P>::HasFastUniformBufferBound(size_t stage, u32 binding_index) const noexcept {
575 mapped_addresses; 1284 if constexpr (IS_OPENGL) {
576 1285 return ((fast_bound_uniform_buffers[stage] >> binding_index) & 1) != 0;
577 std::unordered_map<u64, u32> written_pages; 1286 } else {
578 std::unordered_map<u64, std::shared_ptr<Buffer>> blocks; 1287 // Only OpenGL has fast uniform buffers
579 1288 return false;
580 std::queue<std::shared_ptr<Buffer>> pending_destruction; 1289 }
581 u64 epoch = 0; 1290}
582 u64 modified_ticks = 0;
583
584 std::vector<u8> staging_buffer;
585
586 std::list<MapInterval*> marked_for_unregister;
587
588 std::shared_ptr<std::unordered_set<MapInterval*>> uncommitted_flushes;
589 std::list<std::shared_ptr<std::list<MapInterval*>>> committed_flushes;
590
591 std::recursive_mutex mutex;
592};
593 1291
594} // namespace VideoCommon 1292} // namespace VideoCommon
diff --git a/src/video_core/buffer_cache/map_interval.cpp b/src/video_core/buffer_cache/map_interval.cpp
deleted file mode 100644
index 62587e18a..000000000
--- a/src/video_core/buffer_cache/map_interval.cpp
+++ /dev/null
@@ -1,33 +0,0 @@
1// Copyright 2020 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#include <algorithm>
6#include <array>
7#include <cstddef>
8#include <memory>
9
10#include "video_core/buffer_cache/map_interval.h"
11
12namespace VideoCommon {
13
14MapIntervalAllocator::MapIntervalAllocator() {
15 FillFreeList(first_chunk);
16}
17
18MapIntervalAllocator::~MapIntervalAllocator() = default;
19
20void MapIntervalAllocator::AllocateNewChunk() {
21 *new_chunk = std::make_unique<Chunk>();
22 FillFreeList(**new_chunk);
23 new_chunk = &(*new_chunk)->next;
24}
25
26void MapIntervalAllocator::FillFreeList(Chunk& chunk) {
27 const std::size_t old_size = free_list.size();
28 free_list.resize(old_size + chunk.data.size());
29 std::transform(chunk.data.rbegin(), chunk.data.rend(), free_list.begin() + old_size,
30 [](MapInterval& interval) { return &interval; });
31}
32
33} // namespace VideoCommon
diff --git a/src/video_core/buffer_cache/map_interval.h b/src/video_core/buffer_cache/map_interval.h
deleted file mode 100644
index ef974b08a..000000000
--- a/src/video_core/buffer_cache/map_interval.h
+++ /dev/null
@@ -1,93 +0,0 @@
1// Copyright 2019 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#pragma once
6
7#include <array>
8#include <cstddef>
9#include <memory>
10#include <vector>
11
12#include <boost/intrusive/set_hook.hpp>
13
14#include "common/common_types.h"
15#include "video_core/gpu.h"
16
17namespace VideoCommon {
18
19struct MapInterval : public boost::intrusive::set_base_hook<boost::intrusive::optimize_size<true>> {
20 MapInterval() = default;
21
22 /*implicit*/ MapInterval(VAddr start_) noexcept : start{start_} {}
23
24 explicit MapInterval(VAddr start_, VAddr end_, GPUVAddr gpu_addr_) noexcept
25 : start{start_}, end{end_}, gpu_addr{gpu_addr_} {}
26
27 bool IsInside(VAddr other_start, VAddr other_end) const noexcept {
28 return start <= other_start && other_end <= end;
29 }
30
31 bool Overlaps(VAddr other_start, VAddr other_end) const noexcept {
32 return start < other_end && other_start < end;
33 }
34
35 void MarkAsModified(bool is_modified_, u64 ticks_) noexcept {
36 is_modified = is_modified_;
37 ticks = ticks_;
38 }
39
40 boost::intrusive::set_member_hook<> member_hook_;
41 VAddr start = 0;
42 VAddr end = 0;
43 GPUVAddr gpu_addr = 0;
44 u64 ticks = 0;
45 bool is_written = false;
46 bool is_modified = false;
47 bool is_registered = false;
48 bool is_memory_marked = false;
49 bool is_sync_pending = false;
50};
51
52struct MapIntervalCompare {
53 constexpr bool operator()(const MapInterval& lhs, const MapInterval& rhs) const noexcept {
54 return lhs.start < rhs.start;
55 }
56};
57
58class MapIntervalAllocator {
59public:
60 MapIntervalAllocator();
61 ~MapIntervalAllocator();
62
63 MapInterval* Allocate() {
64 if (free_list.empty()) {
65 AllocateNewChunk();
66 }
67 MapInterval* const interval = free_list.back();
68 free_list.pop_back();
69 return interval;
70 }
71
72 void Release(MapInterval* interval) {
73 free_list.push_back(interval);
74 }
75
76private:
77 struct Chunk {
78 std::unique_ptr<Chunk> next;
79 std::array<MapInterval, 0x8000> data;
80 };
81
82 void AllocateNewChunk();
83
84 void FillFreeList(Chunk& chunk);
85
86 std::vector<MapInterval*> free_list;
87
88 Chunk first_chunk;
89
90 std::unique_ptr<Chunk>* new_chunk = &first_chunk.next;
91};
92
93} // namespace VideoCommon