author    Mai M    2021-06-23 08:03:01 -0400
committer GitHub   2021-06-23 08:03:01 -0400
commit    17fff10e06e7935522a5a69705b9a750761aab79 (patch)
tree      7e7b3ae9fedbc0fed85f6c5c58e92e8d047efd87 /src/video_core/buffer_cache
parent    Merge pull request #6508 from ReinUsesLisp/bootmanager-stop-token (diff)
parent    Reaper: Set minimum cleaning limit on OGL. (diff)
Merge pull request #6465 from FernandoS27/sex-on-the-beach
GPU: Implement a garbage collector for GPU Caches (project Reaper+)
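
The shape of the change is visible in the hunks below: each Buffer now remembers the frame tick at which it was last bound (TouchBuffer), TickFrame() invokes the collector once the cache's total memory use crosses EXPECTED_MEMORY, and the collector walks the buffer slots, downloading and deleting any buffer left untouched for a fixed number of frames, scanning more aggressively once memory use turns CRITICAL. A minimal, self-contained sketch of that eviction policy, using simplified stand-in types rather than yuzu's real Buffer and SlotVector classes:

#include <cstdint>
#include <list>

// Illustrative stand-ins for the cache's real types (not yuzu's classes).
struct Buffer {
    std::uint64_t frame_tick = 0; // frame at which the buffer was last bound
    std::uint64_t size_bytes = 0;
};

class ReaperSketch {
public:
    // Mirrors TouchBuffer(): binding a buffer refreshes its tick.
    void Touch(Buffer& buffer) noexcept { buffer.frame_tick = frame_tick; }

    // Mirrors TickFrame(): collect only once memory use crosses the threshold.
    void TickFrame() {
        if (total_used_memory >= EXPECTED_MEMORY) {
            RunGarbageCollector();
        }
        ++frame_tick;
    }

private:
    void RunGarbageCollector() {
        // Under critical pressure, evict sooner and scan more buffers per frame.
        const bool aggressive = total_used_memory >= CRITICAL_MEMORY;
        const std::uint64_t ticks_to_destroy = aggressive ? 60 : 120;
        int num_iterations = aggressive ? 64 : 32;
        for (auto it = buffers.begin(); it != buffers.end() && num_iterations > 0;
             --num_iterations) {
            if (it->frame_tick + ticks_to_destroy < frame_tick) {
                total_used_memory -= it->size_bytes;
                it = buffers.erase(it); // the real code first downloads GPU-written data
            } else {
                ++it;
            }
        }
    }

    static constexpr std::uint64_t EXPECTED_MEMORY = 512ull << 20; // 512 MiB
    static constexpr std::uint64_t CRITICAL_MEMORY = 1ull << 30;   // 1 GiB
    std::list<Buffer> buffers;
    std::uint64_t frame_tick = 0;
    std::uint64_t total_used_memory = 0;
};

The actual implementation differs mainly in that eviction first writes GPU-modified ranges back to guest memory (DownloadBufferMemory) and defers host-object destruction through delayed_destruction_ring.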
Diffstat (limited to 'src/video_core/buffer_cache')
-rw-r--r--  src/video_core/buffer_cache/buffer_base.h   |  11
-rw-r--r--  src/video_core/buffer_cache/buffer_cache.h  | 161
2 files changed, 127 insertions(+), 45 deletions(-)
diff --git a/src/video_core/buffer_cache/buffer_base.h b/src/video_core/buffer_cache/buffer_base.h
index a39505903..b121d36a3 100644
--- a/src/video_core/buffer_cache/buffer_base.h
+++ b/src/video_core/buffer_cache/buffer_base.h
@@ -256,6 +256,16 @@ public:
         stream_score += score;
     }
 
+    /// Sets the new frame tick
+    void SetFrameTick(u64 new_frame_tick) noexcept {
+        frame_tick = new_frame_tick;
+    }
+
+    /// Returns the current frame tick
+    [[nodiscard]] u64 FrameTick() const noexcept {
+        return frame_tick;
+    }
+
     /// Returns the likeliness of this being a stream buffer
     [[nodiscard]] int StreamScore() const noexcept {
         return stream_score;
@@ -586,6 +596,7 @@ private:
     RasterizerInterface* rasterizer = nullptr;
     VAddr cpu_addr = 0;
     Words words;
+    u64 frame_tick = 0;
     BufferFlagBits flags{};
     int stream_score = 0;
 };
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index d371b842f..6d04d00da 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -16,6 +16,7 @@
 
 #include <boost/container/small_vector.hpp>
 
+#include "common/common_sizes.h"
 #include "common/common_types.h"
 #include "common/div_ceil.h"
 #include "common/microprofile.h"
@@ -65,6 +66,9 @@ class BufferCache {
 
     static constexpr BufferId NULL_BUFFER_ID{0};
 
+    static constexpr u64 EXPECTED_MEMORY = Common::Size_512_MB;
+    static constexpr u64 CRITICAL_MEMORY = Common::Size_1_GB;
+
     using Maxwell = Tegra::Engines::Maxwell3D::Regs;
 
     using Runtime = typename P::Runtime;
@@ -102,6 +106,8 @@ public:
 
     void TickFrame();
 
+    void RunGarbageCollector();
+
     void WriteMemory(VAddr cpu_addr, u64 size);
 
     void CachedWriteMemory(VAddr cpu_addr, u64 size);
@@ -243,6 +249,8 @@ private:
     template <bool insert>
     void ChangeRegister(BufferId buffer_id);
 
+    void TouchBuffer(Buffer& buffer) const noexcept;
+
     bool SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size);
 
     bool SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size);
@@ -255,6 +263,10 @@ private:
 
     void MappedUploadMemory(Buffer& buffer, u64 total_size_bytes, std::span<BufferCopy> copies);
 
+    void DownloadBufferMemory(Buffer& buffer);
+
+    void DownloadBufferMemory(Buffer& buffer, VAddr cpu_addr, u64 size);
+
     void DeleteBuffer(BufferId buffer_id);
 
     void ReplaceBufferDownloads(BufferId old_buffer_id, BufferId new_buffer_id);
@@ -319,6 +331,10 @@ private:
     size_t immediate_buffer_capacity = 0;
     std::unique_ptr<u8[]> immediate_buffer_alloc;
 
+    typename SlotVector<Buffer>::Iterator deletion_iterator;
+    u64 frame_tick = 0;
+    u64 total_used_memory = 0;
+
     std::array<BufferId, ((1ULL << 39) >> PAGE_BITS)> page_table;
 };
 
@@ -332,6 +348,28 @@ BufferCache<P>::BufferCache(VideoCore::RasterizerInterface& rasterizer_,
       gpu_memory{gpu_memory_}, cpu_memory{cpu_memory_}, runtime{runtime_} {
     // Ensure the first slot is used for the null buffer
     void(slot_buffers.insert(runtime, NullBufferParams{}));
+    deletion_iterator = slot_buffers.end();
+}
+
+template <class P>
+void BufferCache<P>::RunGarbageCollector() {
+    const bool aggressive_gc = total_used_memory >= CRITICAL_MEMORY;
+    const u64 ticks_to_destroy = aggressive_gc ? 60 : 120;
+    int num_iterations = aggressive_gc ? 64 : 32;
+    for (; num_iterations > 0; --num_iterations) {
+        if (deletion_iterator == slot_buffers.end()) {
+            deletion_iterator = slot_buffers.begin();
+        }
+        ++deletion_iterator;
+        if (deletion_iterator == slot_buffers.end()) {
+            break;
+        }
+        const auto [buffer_id, buffer] = *deletion_iterator;
+        if (buffer->FrameTick() + ticks_to_destroy < frame_tick) {
+            DownloadBufferMemory(*buffer);
+            DeleteBuffer(buffer_id);
+        }
+    }
 }
 
 template <class P>
@@ -349,6 +387,10 @@ void BufferCache<P>::TickFrame() {
     const bool skip_preferred = hits * 256 < shots * 251;
     uniform_buffer_skip_cache_size = skip_preferred ? DEFAULT_SKIP_CACHE_SIZE : 0;
 
+    if (Settings::values.use_caches_gc.GetValue() && total_used_memory >= EXPECTED_MEMORY) {
+        RunGarbageCollector();
+    }
+    ++frame_tick;
     delayed_destruction_ring.Tick();
 }
 
@@ -371,50 +413,8 @@ void BufferCache<P>::CachedWriteMemory(VAddr cpu_addr, u64 size) {
 
 template <class P>
 void BufferCache<P>::DownloadMemory(VAddr cpu_addr, u64 size) {
-    ForEachBufferInRange(cpu_addr, size, [&](BufferId, Buffer& buffer) {
-        boost::container::small_vector<BufferCopy, 1> copies;
-        u64 total_size_bytes = 0;
-        u64 largest_copy = 0;
-        buffer.ForEachDownloadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) {
-            copies.push_back(BufferCopy{
-                .src_offset = range_offset,
-                .dst_offset = total_size_bytes,
-                .size = range_size,
-            });
-            total_size_bytes += range_size;
-            largest_copy = std::max(largest_copy, range_size);
-        });
-        if (total_size_bytes == 0) {
-            return;
-        }
-        MICROPROFILE_SCOPE(GPU_DownloadMemory);
-
-        if constexpr (USE_MEMORY_MAPS) {
-            auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes);
-            const u8* const mapped_memory = download_staging.mapped_span.data();
-            const std::span<BufferCopy> copies_span(copies.data(), copies.data() + copies.size());
-            for (BufferCopy& copy : copies) {
-                // Modify copies to have the staging offset in mind
-                copy.dst_offset += download_staging.offset;
-            }
-            runtime.CopyBuffer(download_staging.buffer, buffer, copies_span);
-            runtime.Finish();
-            for (const BufferCopy& copy : copies) {
-                const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset;
-                // Undo the modified offset
-                const u64 dst_offset = copy.dst_offset - download_staging.offset;
-                const u8* copy_mapped_memory = mapped_memory + dst_offset;
-                cpu_memory.WriteBlockUnsafe(copy_cpu_addr, copy_mapped_memory, copy.size);
-            }
-        } else {
-            const std::span<u8> immediate_buffer = ImmediateBuffer(largest_copy);
-            for (const BufferCopy& copy : copies) {
-                buffer.ImmediateDownload(copy.src_offset, immediate_buffer.subspan(0, copy.size));
-                const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset;
-                cpu_memory.WriteBlockUnsafe(copy_cpu_addr, immediate_buffer.data(), copy.size);
-            }
-        }
-    });
+    ForEachBufferInRange(cpu_addr, size,
+                         [&](BufferId, Buffer& buffer) { DownloadBufferMemory(buffer); });
 }
 
 template <class P>
@@ -640,6 +640,7 @@ bool BufferCache<P>::IsRegionGpuModified(VAddr addr, size_t size) {
 template <class P>
 void BufferCache<P>::BindHostIndexBuffer() {
     Buffer& buffer = slot_buffers[index_buffer.buffer_id];
+    TouchBuffer(buffer);
     const u32 offset = buffer.Offset(index_buffer.cpu_addr);
     const u32 size = index_buffer.size;
     SynchronizeBuffer(buffer, index_buffer.cpu_addr, size);
@@ -658,6 +659,7 @@ void BufferCache<P>::BindHostVertexBuffers() {
     for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) {
         const Binding& binding = vertex_buffers[index];
         Buffer& buffer = slot_buffers[binding.buffer_id];
+        TouchBuffer(buffer);
         SynchronizeBuffer(buffer, binding.cpu_addr, binding.size);
         if (!flags[Dirty::VertexBuffer0 + index]) {
             continue;
@@ -693,6 +695,7 @@ void BufferCache<P>::BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32
     const VAddr cpu_addr = binding.cpu_addr;
     const u32 size = binding.size;
     Buffer& buffer = slot_buffers[binding.buffer_id];
+    TouchBuffer(buffer);
     const bool use_fast_buffer = binding.buffer_id != NULL_BUFFER_ID &&
                                  size <= uniform_buffer_skip_cache_size &&
                                  !buffer.IsRegionGpuModified(cpu_addr, size);
@@ -744,6 +747,7 @@ void BufferCache<P>::BindHostGraphicsStorageBuffers(size_t stage) {
     ForEachEnabledBit(enabled_storage_buffers[stage], [&](u32 index) {
         const Binding& binding = storage_buffers[stage][index];
         Buffer& buffer = slot_buffers[binding.buffer_id];
+        TouchBuffer(buffer);
         const u32 size = binding.size;
         SynchronizeBuffer(buffer, binding.cpu_addr, size);
 
@@ -766,6 +770,7 @@ void BufferCache<P>::BindHostTransformFeedbackBuffers() {
     for (u32 index = 0; index < NUM_TRANSFORM_FEEDBACK_BUFFERS; ++index) {
         const Binding& binding = transform_feedback_buffers[index];
         Buffer& buffer = slot_buffers[binding.buffer_id];
+        TouchBuffer(buffer);
         const u32 size = binding.size;
         SynchronizeBuffer(buffer, binding.cpu_addr, size);
 
@@ -784,6 +789,7 @@ void BufferCache<P>::BindHostComputeUniformBuffers() {
     ForEachEnabledBit(enabled_compute_uniform_buffers, [&](u32 index) {
         const Binding& binding = compute_uniform_buffers[index];
         Buffer& buffer = slot_buffers[binding.buffer_id];
+        TouchBuffer(buffer);
         const u32 size = binding.size;
         SynchronizeBuffer(buffer, binding.cpu_addr, size);
 
@@ -803,6 +809,7 @@ void BufferCache<P>::BindHostComputeStorageBuffers() {
     ForEachEnabledBit(enabled_compute_storage_buffers, [&](u32 index) {
         const Binding& binding = compute_storage_buffers[index];
         Buffer& buffer = slot_buffers[binding.buffer_id];
+        TouchBuffer(buffer);
         const u32 size = binding.size;
         SynchronizeBuffer(buffer, binding.cpu_addr, size);
 
@@ -1101,6 +1108,7 @@ BufferId BufferCache<P>::CreateBuffer(VAddr cpu_addr, u32 wanted_size) {
     const OverlapResult overlap = ResolveOverlaps(cpu_addr, wanted_size);
     const u32 size = static_cast<u32>(overlap.end - overlap.begin);
     const BufferId new_buffer_id = slot_buffers.insert(runtime, rasterizer, overlap.begin, size);
+    TouchBuffer(slot_buffers[new_buffer_id]);
     for (const BufferId overlap_id : overlap.ids) {
         JoinOverlap(new_buffer_id, overlap_id, !overlap.has_stream_leap);
     }
@@ -1122,8 +1130,14 @@ template <class P>
 template <bool insert>
 void BufferCache<P>::ChangeRegister(BufferId buffer_id) {
     const Buffer& buffer = slot_buffers[buffer_id];
+    const auto size = buffer.SizeBytes();
+    if (insert) {
+        total_used_memory += Common::AlignUp(size, 1024);
+    } else {
+        total_used_memory -= Common::AlignUp(size, 1024);
+    }
     const VAddr cpu_addr_begin = buffer.CpuAddr();
-    const VAddr cpu_addr_end = cpu_addr_begin + buffer.SizeBytes();
+    const VAddr cpu_addr_end = cpu_addr_begin + size;
     const u64 page_begin = cpu_addr_begin / PAGE_SIZE;
     const u64 page_end = Common::DivCeil(cpu_addr_end, PAGE_SIZE);
     for (u64 page = page_begin; page != page_end; ++page) {
@@ -1136,6 +1150,11 @@ void BufferCache<P>::ChangeRegister(BufferId buffer_id) {
 }
 
 template <class P>
+void BufferCache<P>::TouchBuffer(Buffer& buffer) const noexcept {
+    buffer.SetFrameTick(frame_tick);
+}
+
+template <class P>
 bool BufferCache<P>::SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size) {
     if (buffer.CpuAddr() == 0) {
         return true;
@@ -1212,6 +1231,57 @@ void BufferCache<P>::MappedUploadMemory(Buffer& buffer, u64 total_size_bytes,
 }
 
 template <class P>
+void BufferCache<P>::DownloadBufferMemory(Buffer& buffer) {
+    DownloadBufferMemory(buffer, buffer.CpuAddr(), buffer.SizeBytes());
+}
+
+template <class P>
+void BufferCache<P>::DownloadBufferMemory(Buffer& buffer, VAddr cpu_addr, u64 size) {
+    boost::container::small_vector<BufferCopy, 1> copies;
+    u64 total_size_bytes = 0;
+    u64 largest_copy = 0;
+    buffer.ForEachDownloadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) {
+        copies.push_back(BufferCopy{
+            .src_offset = range_offset,
+            .dst_offset = total_size_bytes,
+            .size = range_size,
+        });
+        total_size_bytes += range_size;
+        largest_copy = std::max(largest_copy, range_size);
+    });
+    if (total_size_bytes == 0) {
+        return;
+    }
+    MICROPROFILE_SCOPE(GPU_DownloadMemory);
+
+    if constexpr (USE_MEMORY_MAPS) {
+        auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes);
+        const u8* const mapped_memory = download_staging.mapped_span.data();
+        const std::span<BufferCopy> copies_span(copies.data(), copies.data() + copies.size());
+        for (BufferCopy& copy : copies) {
+            // Modify copies to have the staging offset in mind
+            copy.dst_offset += download_staging.offset;
+        }
+        runtime.CopyBuffer(download_staging.buffer, buffer, copies_span);
+        runtime.Finish();
+        for (const BufferCopy& copy : copies) {
+            const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset;
+            // Undo the modified offset
+            const u64 dst_offset = copy.dst_offset - download_staging.offset;
+            const u8* copy_mapped_memory = mapped_memory + dst_offset;
+            cpu_memory.WriteBlockUnsafe(copy_cpu_addr, copy_mapped_memory, copy.size);
+        }
+    } else {
+        const std::span<u8> immediate_buffer = ImmediateBuffer(largest_copy);
+        for (const BufferCopy& copy : copies) {
+            buffer.ImmediateDownload(copy.src_offset, immediate_buffer.subspan(0, copy.size));
+            const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset;
+            cpu_memory.WriteBlockUnsafe(copy_cpu_addr, immediate_buffer.data(), copy.size);
+        }
+    }
+}
+
+template <class P>
 void BufferCache<P>::DeleteBuffer(BufferId buffer_id) {
     const auto scalar_replace = [buffer_id](Binding& binding) {
         if (binding.buffer_id == buffer_id) {
@@ -1236,6 +1306,7 @@ void BufferCache<P>::DeleteBuffer(BufferId buffer_id) {
 
     Unregister(buffer_id);
     delayed_destruction_ring.Push(std::move(slot_buffers[buffer_id]));
+    slot_buffers.erase(buffer_id);
 
     NotifyBufferDeletion();
 }
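
A closing note on the final hunk: DeleteBuffer() moves the buffer into delayed_destruction_ring before erasing its slot, so the host object is destroyed only several frames later, after any GPU work still referencing it has drained. A rough sketch of such a deferred-destruction ring follows; the types and tick count are illustrative assumptions, not yuzu's actual DelayedDestructionRing:

#include <array>
#include <cstddef>
#include <utility>
#include <vector>

// Illustrative deferred-destruction ring: objects pushed in frame F are
// destroyed once Tick() has been called TICKS more times.
template <typename T, std::size_t TICKS = 3>
class DelayedDestructionRingSketch {
public:
    void Push(T&& object) { buckets[index].push_back(std::move(object)); }

    // Called once per frame (cf. delayed_destruction_ring.Tick() in TickFrame()).
    void Tick() {
        index = (index + 1) % TICKS;
        buckets[index].clear(); // runs destructors TICKS frames after Push()
    }

private:
    std::array<std::vector<T>, TICKS> buckets;
    std::size_t index = 0;
};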