path: root/src/video_core/buffer_cache
Diffstat (limited to 'src/video_core/buffer_cache')
-rw-r--r--  src/video_core/buffer_cache/buffer_block.h      62
-rw-r--r--  src/video_core/buffer_cache/buffer_cache.cpp    13
-rw-r--r--  src/video_core/buffer_cache/buffer_cache.h    1598
-rw-r--r--  src/video_core/buffer_cache/map_interval.cpp    33
-rw-r--r--  src/video_core/buffer_cache/map_interval.h      93
5 files changed, 1132 insertions, 667 deletions
diff --git a/src/video_core/buffer_cache/buffer_block.h b/src/video_core/buffer_cache/buffer_block.h
deleted file mode 100644
index e9306194a..000000000
--- a/src/video_core/buffer_cache/buffer_block.h
+++ /dev/null
@@ -1,62 +0,0 @@
-// Copyright 2019 yuzu Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#pragma once
-
-#include "common/common_types.h"
-
-namespace VideoCommon {
-
-class BufferBlock {
-public:
-    [[nodiscard]] bool Overlaps(VAddr start, VAddr end) const {
-        return (cpu_addr < end) && (cpu_addr_end > start);
-    }
-
-    [[nodiscard]] bool IsInside(VAddr other_start, VAddr other_end) const {
-        return cpu_addr <= other_start && other_end <= cpu_addr_end;
-    }
-
-    [[nodiscard]] std::size_t Offset(VAddr in_addr) const {
-        return static_cast<std::size_t>(in_addr - cpu_addr);
-    }
-
-    [[nodiscard]] VAddr CpuAddr() const {
-        return cpu_addr;
-    }
-
-    [[nodiscard]] VAddr CpuAddrEnd() const {
-        return cpu_addr_end;
-    }
-
-    void SetCpuAddr(VAddr new_addr) {
-        cpu_addr = new_addr;
-        cpu_addr_end = new_addr + size;
-    }
-
-    [[nodiscard]] std::size_t Size() const {
-        return size;
-    }
-
-    [[nodiscard]] u64 Epoch() const {
-        return epoch;
-    }
-
-    void SetEpoch(u64 new_epoch) {
-        epoch = new_epoch;
-    }
-
-protected:
-    explicit BufferBlock(VAddr cpu_addr_, std::size_t size_) : size{size_} {
-        SetCpuAddr(cpu_addr_);
-    }
-
-private:
-    VAddr cpu_addr{};
-    VAddr cpu_addr_end{};
-    std::size_t size{};
-    u64 epoch{};
-};
-
-} // namespace VideoCommon
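For reference, the deleted BufferBlock was essentially a half-open CPU address range [cpu_addr, cpu_addr_end) plus an epoch used to delay destruction. Below is a minimal, standalone sketch of the same interval tests; the plain Interval struct and its field names are illustrative, not part of the commit.

// Half-open interval tests equivalent to BufferBlock::Overlaps / IsInside.
#include <cassert>
#include <cstdint>

using VAddr = std::uint64_t; // assumption: stands in for VAddr from common_types.h

struct Interval {
    VAddr begin{};
    VAddr end{}; // one past the last byte

    // Two half-open ranges intersect when each one starts before the other ends.
    bool Overlaps(VAddr other_begin, VAddr other_end) const {
        return begin < other_end && end > other_begin;
    }

    // [other_begin, other_end) is fully contained in [begin, end).
    bool IsInside(VAddr other_begin, VAddr other_end) const {
        return begin <= other_begin && other_end <= end;
    }
};

int main() {
    const Interval block{0x1000, 0x3000};
    assert(block.Overlaps(0x2FFF, 0x4000));  // shares the last byte of the block
    assert(!block.Overlaps(0x3000, 0x4000)); // adjacent ranges do not overlap
    assert(block.IsInside(0x1000, 0x2000));
    assert(!block.IsInside(0x0FFF, 0x2000)); // starts one byte before the block
}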
diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp
new file mode 100644
index 000000000..ab32294c8
--- /dev/null
+++ b/src/video_core/buffer_cache/buffer_cache.cpp
@@ -0,0 +1,13 @@
+// Copyright 2021 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/microprofile.h"
+
+namespace VideoCommon {
+
+MICROPROFILE_DEFINE(GPU_PrepareBuffers, "GPU", "Prepare buffers", MP_RGB(224, 128, 128));
+MICROPROFILE_DEFINE(GPU_BindUploadBuffers, "GPU", "Bind and upload buffers", MP_RGB(224, 128, 128));
+MICROPROFILE_DEFINE(GPU_DownloadMemory, "GPU", "Download buffers", MP_RGB(224, 128, 128));
+
+} // namespace VideoCommon
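This new translation unit gives the MicroProfile timers a single definition: the templated header only declares them (MICROPROFILE_DECLARE) and times scopes with MICROPROFILE_SCOPE, while this .cpp holds the one MICROPROFILE_DEFINE per timer. A rough sketch of that split, with illustrative file and function names (only the macros themselves come from the commit):

// example_cache.h (illustrative) -- usable from any template instantiation
#pragma once
#include "common/microprofile.h"

MICROPROFILE_DECLARE(GPU_PrepareBuffers);

template <class P>
void PrepareBuffers(P& impl) {
    MICROPROFILE_SCOPE(GPU_PrepareBuffers); // samples time spent in this scope
    impl.Prepare();
}

// example_cache.cpp (illustrative) -- exactly one definition for the whole program
#include "common/microprofile.h"

MICROPROFILE_DEFINE(GPU_PrepareBuffers, "GPU", "Prepare buffers", MP_RGB(224, 128, 128));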
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index 83b9ee871..e4f3c8e35 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -4,591 +4,1231 @@
 
 #pragma once
 
-#include <list>
+#include <algorithm>
+#include <array>
+#include <deque>
 #include <memory>
 #include <mutex>
+#include <span>
 #include <unordered_map>
-#include <unordered_set>
-#include <utility>
 #include <vector>
 
 #include <boost/container/small_vector.hpp>
-#include <boost/icl/interval_set.hpp>
-#include <boost/intrusive/set.hpp>
 
-#include "common/alignment.h"
-#include "common/assert.h"
 #include "common/common_types.h"
-#include "common/logging/log.h"
-#include "core/core.h"
+#include "common/div_ceil.h"
+#include "common/microprofile.h"
+#include "common/scope_exit.h"
 #include "core/memory.h"
 #include "core/settings.h"
-#include "video_core/buffer_cache/buffer_block.h"
-#include "video_core/buffer_cache/map_interval.h"
+#include "video_core/buffer_cache/buffer_base.h"
+#include "video_core/delayed_destruction_ring.h"
+#include "video_core/dirty_flags.h"
+#include "video_core/engines/kepler_compute.h"
+#include "video_core/engines/maxwell_3d.h"
 #include "video_core/memory_manager.h"
 #include "video_core/rasterizer_interface.h"
+#include "video_core/texture_cache/slot_vector.h"
+#include "video_core/texture_cache/types.h"
 
 namespace VideoCommon {
 
-template <typename Buffer, typename BufferType, typename StreamBuffer>
+MICROPROFILE_DECLARE(GPU_PrepareBuffers);
+MICROPROFILE_DECLARE(GPU_BindUploadBuffers);
+MICROPROFILE_DECLARE(GPU_DownloadMemory);
+
+using BufferId = SlotId;
+
+constexpr u32 NUM_VERTEX_BUFFERS = 32;
+constexpr u32 NUM_TRANSFORM_FEEDBACK_BUFFERS = 4;
+constexpr u32 NUM_GRAPHICS_UNIFORM_BUFFERS = 18;
+constexpr u32 NUM_COMPUTE_UNIFORM_BUFFERS = 8;
+constexpr u32 NUM_STORAGE_BUFFERS = 16;
+constexpr u32 NUM_STAGES = 5;
+
+template <typename P>
34class BufferCache { 50class BufferCache {
35 using IntervalSet = boost::icl::interval_set<VAddr>; 51 // Page size for caching purposes.
36 using IntervalType = typename IntervalSet::interval_type; 52 // This is unrelated to the CPU page size and it can be changed as it seems optimal.
37 using VectorMapInterval = boost::container::small_vector<MapInterval*, 1>; 53 static constexpr u32 PAGE_BITS = 16;
54 static constexpr u64 PAGE_SIZE = u64{1} << PAGE_BITS;
38 55
39 static constexpr u64 WRITE_PAGE_BIT = 11; 56 static constexpr bool IS_OPENGL = P::IS_OPENGL;
40 static constexpr u64 BLOCK_PAGE_BITS = 21; 57 static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS =
41 static constexpr u64 BLOCK_PAGE_SIZE = 1ULL << BLOCK_PAGE_BITS; 58 P::HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS;
59 static constexpr bool HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT =
60 P::HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT;
61 static constexpr bool NEEDS_BIND_UNIFORM_INDEX = P::NEEDS_BIND_UNIFORM_INDEX;
62 static constexpr bool NEEDS_BIND_STORAGE_INDEX = P::NEEDS_BIND_STORAGE_INDEX;
63 static constexpr bool USE_MEMORY_MAPS = P::USE_MEMORY_MAPS;
42 64
43public: 65 static constexpr BufferId NULL_BUFFER_ID{0};
44 struct BufferInfo { 66
45 BufferType handle; 67 using Maxwell = Tegra::Engines::Maxwell3D::Regs;
46 u64 offset; 68
47 u64 address; 69 using Runtime = typename P::Runtime;
70 using Buffer = typename P::Buffer;
71
72 struct Empty {};
73
74 struct Binding {
75 VAddr cpu_addr{};
76 u32 size{};
77 BufferId buffer_id;
48 }; 78 };
49 79
50 BufferInfo UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4, 80 static constexpr Binding NULL_BINDING{
51 bool is_written = false, bool use_fast_cbuf = false) { 81 .cpu_addr = 0,
52 std::lock_guard lock{mutex}; 82 .size = 0,
83 .buffer_id = NULL_BUFFER_ID,
84 };
53 85
54 const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); 86public:
55 if (!cpu_addr) { 87 static constexpr size_t SKIP_CACHE_SIZE = 4096;
56 return GetEmptyBuffer(size);
57 }
58 88
59 // Cache management is a big overhead, so only cache entries with a given size. 89 explicit BufferCache(VideoCore::RasterizerInterface& rasterizer_,
60 // TODO: Figure out which size is the best for given games. 90 Tegra::Engines::Maxwell3D& maxwell3d_,
61 constexpr std::size_t max_stream_size = 0x800; 91 Tegra::Engines::KeplerCompute& kepler_compute_,
62 if (use_fast_cbuf || size < max_stream_size) { 92 Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_,
63 if (!is_written && !IsRegionWritten(*cpu_addr, *cpu_addr + size - 1)) { 93 Runtime& runtime_);
64 const bool is_granular = gpu_memory.IsGranularRange(gpu_addr, size);
65 if (use_fast_cbuf) {
66 u8* dest;
67 if (is_granular) {
68 dest = gpu_memory.GetPointer(gpu_addr);
69 } else {
70 staging_buffer.resize(size);
71 dest = staging_buffer.data();
72 gpu_memory.ReadBlockUnsafe(gpu_addr, dest, size);
73 }
74 return ConstBufferUpload(dest, size);
75 }
76 if (is_granular) {
77 u8* const host_ptr = gpu_memory.GetPointer(gpu_addr);
78 return StreamBufferUpload(size, alignment, [host_ptr, size](u8* dest) {
79 std::memcpy(dest, host_ptr, size);
80 });
81 } else {
82 return StreamBufferUpload(size, alignment, [this, gpu_addr, size](u8* dest) {
83 gpu_memory.ReadBlockUnsafe(gpu_addr, dest, size);
84 });
85 }
86 }
87 }
88 94
89 Buffer* const block = GetBlock(*cpu_addr, size); 95 void TickFrame();
90 MapInterval* const map = MapAddress(block, gpu_addr, *cpu_addr, size);
91 if (!map) {
92 return GetEmptyBuffer(size);
93 }
94 if (is_written) {
95 map->MarkAsModified(true, GetModifiedTicks());
96 if (Settings::IsGPULevelHigh() &&
97 Settings::values.use_asynchronous_gpu_emulation.GetValue()) {
98 MarkForAsyncFlush(map);
99 }
100 if (!map->is_written) {
101 map->is_written = true;
102 MarkRegionAsWritten(map->start, map->end - 1);
103 }
104 }
105 96
106 return BufferInfo{block->Handle(), block->Offset(*cpu_addr), block->Address()}; 97 void WriteMemory(VAddr cpu_addr, u64 size);
107 }
108 98
109 /// Uploads from a host memory. Returns the OpenGL buffer where it's located and its offset. 99 void CachedWriteMemory(VAddr cpu_addr, u64 size);
110 BufferInfo UploadHostMemory(const void* raw_pointer, std::size_t size,
111 std::size_t alignment = 4) {
112 std::lock_guard lock{mutex};
113 return StreamBufferUpload(size, alignment, [raw_pointer, size](u8* dest) {
114 std::memcpy(dest, raw_pointer, size);
115 });
116 }
117 100
118 /// Prepares the buffer cache for data uploading 101 void DownloadMemory(VAddr cpu_addr, u64 size);
119 /// @param max_size Maximum number of bytes that will be uploaded
120 /// @return True when a stream buffer invalidation was required, false otherwise
121 void Map(std::size_t max_size) {
122 std::lock_guard lock{mutex};
123 102
124 std::tie(buffer_ptr, buffer_offset_base) = stream_buffer.Map(max_size, 4); 103 void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size);
125 buffer_offset = buffer_offset_base;
126 }
127 104
128 /// Finishes the upload stream 105 void UpdateGraphicsBuffers(bool is_indexed);
129 void Unmap() {
130 std::lock_guard lock{mutex};
131 stream_buffer.Unmap(buffer_offset - buffer_offset_base);
132 }
133 106
134 /// Function called at the end of each frame, inteded for deferred operations 107 void UpdateComputeBuffers();
135 void TickFrame() {
136 ++epoch;
137 108
138 while (!pending_destruction.empty()) { 109 void BindHostGeometryBuffers(bool is_indexed);
139 // Delay at least 4 frames before destruction.
140 // This is due to triple buffering happening on some drivers.
141 static constexpr u64 epochs_to_destroy = 5;
142 if (pending_destruction.front()->Epoch() + epochs_to_destroy > epoch) {
143 break;
144 }
145 pending_destruction.pop();
146 }
147 }
148 110
149 /// Write any cached resources overlapping the specified region back to memory 111 void BindHostStageBuffers(size_t stage);
150 void FlushRegion(VAddr addr, std::size_t size) {
151 std::lock_guard lock{mutex};
152 112
153 VectorMapInterval objects = GetMapsInRange(addr, size); 113 void BindHostComputeBuffers();
154 std::sort(objects.begin(), objects.end(),
155 [](MapInterval* lhs, MapInterval* rhs) { return lhs->ticks < rhs->ticks; });
156 for (MapInterval* object : objects) {
157 if (object->is_modified && object->is_registered) {
158 mutex.unlock();
159 FlushMap(object);
160 mutex.lock();
161 }
162 }
163 }
164 114
165 bool MustFlushRegion(VAddr addr, std::size_t size) { 115 void SetEnabledUniformBuffers(size_t stage, u32 enabled);
166 std::lock_guard lock{mutex};
167 116
168 const VectorMapInterval objects = GetMapsInRange(addr, size); 117 void SetEnabledComputeUniformBuffers(u32 enabled);
169 return std::any_of(objects.cbegin(), objects.cend(), [](const MapInterval* map) {
170 return map->is_modified && map->is_registered;
171 });
172 }
173 118
174 /// Mark the specified region as being invalidated 119 void UnbindGraphicsStorageBuffers(size_t stage);
175 void InvalidateRegion(VAddr addr, u64 size) {
176 std::lock_guard lock{mutex};
177 120
178 for (auto& object : GetMapsInRange(addr, size)) { 121 void BindGraphicsStorageBuffer(size_t stage, size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset,
179 if (object->is_registered) { 122 bool is_written);
180 Unregister(object);
181 }
182 }
183 }
184 123
185 void OnCPUWrite(VAddr addr, std::size_t size) { 124 void UnbindComputeStorageBuffers();
186 std::lock_guard lock{mutex};
187 125
188 for (MapInterval* object : GetMapsInRange(addr, size)) { 126 void BindComputeStorageBuffer(size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset,
189 if (object->is_memory_marked && object->is_registered) { 127 bool is_written);
190 UnmarkMemory(object);
191 object->is_sync_pending = true;
192 marked_for_unregister.emplace_back(object);
193 }
194 }
195 }
196 128
197 void SyncGuestHost() { 129 void FlushCachedWrites();
198 std::lock_guard lock{mutex};
199 130
200 for (auto& object : marked_for_unregister) { 131 /// Return true when there are uncommitted buffers to be downloaded
201 if (object->is_registered) { 132 [[nodiscard]] bool HasUncommittedFlushes() const noexcept;
202 object->is_sync_pending = false; 133
203 Unregister(object); 134 /// Return true when the caller should wait for async downloads
204 } 135 [[nodiscard]] bool ShouldWaitAsyncFlushes() const noexcept;
136
137 /// Commit asynchronous downloads
138 void CommitAsyncFlushes();
139
140 /// Pop asynchronous downloads
141 void PopAsyncFlushes();
142
143 /// Return true when a CPU region is modified from the GPU
144 [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size);
145
146 std::mutex mutex;
147
148private:
149 template <typename Func>
150 static void ForEachEnabledBit(u32 enabled_mask, Func&& func) {
151 for (u32 index = 0; enabled_mask != 0; ++index, enabled_mask >>= 1) {
152 const int disabled_bits = std::countr_zero(enabled_mask);
153 index += disabled_bits;
154 enabled_mask >>= disabled_bits;
155 func(index);
205 } 156 }
206 marked_for_unregister.clear();
207 } 157 }
208 158
209 void CommitAsyncFlushes() { 159 template <typename Func>
210 if (uncommitted_flushes) { 160 void ForEachBufferInRange(VAddr cpu_addr, u64 size, Func&& func) {
211 auto commit_list = std::make_shared<std::list<MapInterval*>>(); 161 const u64 page_end = Common::DivCeil(cpu_addr + size, PAGE_SIZE);
212 for (MapInterval* map : *uncommitted_flushes) { 162 for (u64 page = cpu_addr >> PAGE_BITS; page < page_end;) {
213 if (map->is_registered && map->is_modified) { 163 const BufferId buffer_id = page_table[page];
214 // TODO(Blinkhawk): Implement backend asynchronous flushing 164 if (!buffer_id) {
215 // AsyncFlushMap(map) 165 ++page;
216 commit_list->push_back(map); 166 continue;
217 }
218 }
219 if (!commit_list->empty()) {
220 committed_flushes.push_back(commit_list);
221 } else {
222 committed_flushes.emplace_back();
223 } 167 }
224 } else { 168 Buffer& buffer = slot_buffers[buffer_id];
225 committed_flushes.emplace_back(); 169 func(buffer_id, buffer);
170
171 const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes();
172 page = Common::DivCeil(end_addr, PAGE_SIZE);
226 } 173 }
227 uncommitted_flushes.reset();
228 } 174 }
229 175
230 bool ShouldWaitAsyncFlushes() const { 176 static bool IsRangeGranular(VAddr cpu_addr, size_t size) {
231 return !committed_flushes.empty() && committed_flushes.front() != nullptr; 177 return (cpu_addr & ~Core::Memory::PAGE_MASK) ==
178 ((cpu_addr + size) & ~Core::Memory::PAGE_MASK);
232 } 179 }
233 180
234 bool HasUncommittedFlushes() const { 181 void BindHostIndexBuffer();
235 return uncommitted_flushes != nullptr;
236 }
237 182
238 void PopAsyncFlushes() { 183 void BindHostVertexBuffers();
239 if (committed_flushes.empty()) {
240 return;
241 }
242 auto& flush_list = committed_flushes.front();
243 if (!flush_list) {
244 committed_flushes.pop_front();
245 return;
246 }
247 for (MapInterval* map : *flush_list) {
248 if (map->is_registered) {
249 // TODO(Blinkhawk): Replace this for reading the asynchronous flush
250 FlushMap(map);
251 }
252 }
253 committed_flushes.pop_front();
254 }
255 184
256 virtual BufferInfo GetEmptyBuffer(std::size_t size) = 0; 185 void BindHostGraphicsUniformBuffers(size_t stage);
257 186
258protected: 187 void BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 binding_index, bool needs_bind);
259 explicit BufferCache(VideoCore::RasterizerInterface& rasterizer_,
260 Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_,
261 StreamBuffer& stream_buffer_)
262 : rasterizer{rasterizer_}, gpu_memory{gpu_memory_}, cpu_memory{cpu_memory_},
263 stream_buffer{stream_buffer_} {}
264 188
265 ~BufferCache() = default; 189 void BindHostGraphicsStorageBuffers(size_t stage);
266 190
267 virtual std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) = 0; 191 void BindHostTransformFeedbackBuffers();
268 192
269 virtual BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) { 193 void BindHostComputeUniformBuffers();
270 return {};
271 }
272 194
273 /// Register an object into the cache 195 void BindHostComputeStorageBuffers();
274 MapInterval* Register(MapInterval new_map, bool inherit_written = false) {
275 const VAddr cpu_addr = new_map.start;
276 if (!cpu_addr) {
277 LOG_CRITICAL(HW_GPU, "Failed to register buffer with unmapped gpu_address 0x{:016x}",
278 new_map.gpu_addr);
279 return nullptr;
280 }
281 const std::size_t size = new_map.end - new_map.start;
282 new_map.is_registered = true;
283 rasterizer.UpdatePagesCachedCount(cpu_addr, size, 1);
284 new_map.is_memory_marked = true;
285 if (inherit_written) {
286 MarkRegionAsWritten(new_map.start, new_map.end - 1);
287 new_map.is_written = true;
288 }
289 MapInterval* const storage = mapped_addresses_allocator.Allocate();
290 *storage = new_map;
291 mapped_addresses.insert(*storage);
292 return storage;
293 }
294 196
295 void UnmarkMemory(MapInterval* map) { 197 void DoUpdateGraphicsBuffers(bool is_indexed);
296 if (!map->is_memory_marked) { 198
297 return; 199 void DoUpdateComputeBuffers();
298 } 200
299 const std::size_t size = map->end - map->start; 201 void UpdateIndexBuffer();
300 rasterizer.UpdatePagesCachedCount(map->start, size, -1); 202
301 map->is_memory_marked = false; 203 void UpdateVertexBuffers();
302 } 204
303 205 void UpdateVertexBuffer(u32 index);
304 /// Unregisters an object from the cache 206
305 void Unregister(MapInterval* map) { 207 void UpdateUniformBuffers(size_t stage);
306 UnmarkMemory(map); 208
307 map->is_registered = false; 209 void UpdateStorageBuffers(size_t stage);
308 if (map->is_sync_pending) { 210
309 map->is_sync_pending = false; 211 void UpdateTransformFeedbackBuffers();
310 marked_for_unregister.remove(map); 212
213 void UpdateTransformFeedbackBuffer(u32 index);
214
215 void UpdateComputeUniformBuffers();
216
217 void UpdateComputeStorageBuffers();
218
219 void MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 size);
220
221 [[nodiscard]] BufferId FindBuffer(VAddr cpu_addr, u32 size);
222
223 [[nodiscard]] BufferId CreateBuffer(VAddr cpu_addr, u32 wanted_size);
224
225 void Register(BufferId buffer_id);
226
227 void Unregister(BufferId buffer_id);
228
229 template <bool insert>
230 void ChangeRegister(BufferId buffer_id);
231
232 void SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size);
233
234 void SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size);
235
236 void UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy,
237 std::span<BufferCopy> copies);
238
239 void ImmediateUploadMemory(Buffer& buffer, u64 largest_copy,
240 std::span<const BufferCopy> copies);
241
242 void MappedUploadMemory(Buffer& buffer, u64 total_size_bytes,
243 std::span<const BufferCopy> copies);
244
245 void DeleteBuffer(BufferId buffer_id);
246
247 void ReplaceBufferDownloads(BufferId old_buffer_id, BufferId new_buffer_id);
248
249 void NotifyBufferDeletion();
250
251 [[nodiscard]] Binding StorageBufferBinding(GPUVAddr ssbo_addr) const;
252
253 [[nodiscard]] std::span<const u8> ImmediateBufferWithData(VAddr cpu_addr, size_t size);
254
255 [[nodiscard]] std::span<u8> ImmediateBuffer(size_t wanted_capacity);
256
257 [[nodiscard]] bool HasFastUniformBufferBound(size_t stage, u32 binding_index) const noexcept;
258
259 VideoCore::RasterizerInterface& rasterizer;
260 Tegra::Engines::Maxwell3D& maxwell3d;
261 Tegra::Engines::KeplerCompute& kepler_compute;
262 Tegra::MemoryManager& gpu_memory;
263 Core::Memory::Memory& cpu_memory;
264 Runtime& runtime;
265
266 SlotVector<Buffer> slot_buffers;
267 DelayedDestructionRing<Buffer, 8> delayed_destruction_ring;
268
269 u32 last_index_count = 0;
270
271 Binding index_buffer;
272 std::array<Binding, NUM_VERTEX_BUFFERS> vertex_buffers;
273 std::array<std::array<Binding, NUM_GRAPHICS_UNIFORM_BUFFERS>, NUM_STAGES> uniform_buffers;
274 std::array<std::array<Binding, NUM_STORAGE_BUFFERS>, NUM_STAGES> storage_buffers;
275 std::array<Binding, NUM_TRANSFORM_FEEDBACK_BUFFERS> transform_feedback_buffers;
276
277 std::array<Binding, NUM_COMPUTE_UNIFORM_BUFFERS> compute_uniform_buffers;
278 std::array<Binding, NUM_STORAGE_BUFFERS> compute_storage_buffers;
279
280 std::array<u32, NUM_STAGES> enabled_uniform_buffers{};
281 u32 enabled_compute_uniform_buffers = 0;
282
283 std::array<u32, NUM_STAGES> enabled_storage_buffers{};
284 std::array<u32, NUM_STAGES> written_storage_buffers{};
285 u32 enabled_compute_storage_buffers = 0;
286 u32 written_compute_storage_buffers = 0;
287
288 std::array<u32, NUM_STAGES> fast_bound_uniform_buffers{};
289
290 bool has_deleted_buffers = false;
291
292 std::conditional_t<HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS, std::array<u32, NUM_STAGES>, Empty>
293 dirty_uniform_buffers{};
294
295 std::vector<BufferId> cached_write_buffer_ids;
296
297 // TODO: This data structure is not optimal and it should be reworked
298 std::vector<BufferId> uncommitted_downloads;
299 std::deque<std::vector<BufferId>> committed_downloads;
300
301 size_t immediate_buffer_capacity = 0;
302 std::unique_ptr<u8[]> immediate_buffer_alloc;
303
304 std::array<BufferId, ((1ULL << 39) >> PAGE_BITS)> page_table;
305};
306
307template <class P>
308BufferCache<P>::BufferCache(VideoCore::RasterizerInterface& rasterizer_,
309 Tegra::Engines::Maxwell3D& maxwell3d_,
310 Tegra::Engines::KeplerCompute& kepler_compute_,
311 Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_,
312 Runtime& runtime_)
313 : rasterizer{rasterizer_}, maxwell3d{maxwell3d_}, kepler_compute{kepler_compute_},
314 gpu_memory{gpu_memory_}, cpu_memory{cpu_memory_}, runtime{runtime_} {
315 // Ensure the first slot is used for the null buffer
316 void(slot_buffers.insert(runtime, NullBufferParams{}));
317}
318
319template <class P>
320void BufferCache<P>::TickFrame() {
321 delayed_destruction_ring.Tick();
322}
323
324template <class P>
325void BufferCache<P>::WriteMemory(VAddr cpu_addr, u64 size) {
326 ForEachBufferInRange(cpu_addr, size, [&](BufferId, Buffer& buffer) {
327 buffer.MarkRegionAsCpuModified(cpu_addr, size);
328 });
329}
330
331template <class P>
332void BufferCache<P>::CachedWriteMemory(VAddr cpu_addr, u64 size) {
333 ForEachBufferInRange(cpu_addr, size, [&](BufferId buffer_id, Buffer& buffer) {
334 if (!buffer.HasCachedWrites()) {
335 cached_write_buffer_ids.push_back(buffer_id);
311 } 336 }
312 if (map->is_written) { 337 buffer.CachedCpuWrite(cpu_addr, size);
313 UnmarkRegionAsWritten(map->start, map->end - 1); 338 });
339}
340
341template <class P>
342void BufferCache<P>::DownloadMemory(VAddr cpu_addr, u64 size) {
343 ForEachBufferInRange(cpu_addr, size, [&](BufferId, Buffer& buffer) {
344 boost::container::small_vector<BufferCopy, 1> copies;
345 u64 total_size_bytes = 0;
346 u64 largest_copy = 0;
347 buffer.ForEachDownloadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) {
348 copies.push_back(BufferCopy{
349 .src_offset = range_offset,
350 .dst_offset = total_size_bytes,
351 .size = range_size,
352 });
353 total_size_bytes += range_size;
354 largest_copy = std::max(largest_copy, range_size);
355 });
356 if (total_size_bytes == 0) {
357 return;
314 } 358 }
315 const auto it = mapped_addresses.find(*map); 359 MICROPROFILE_SCOPE(GPU_DownloadMemory);
316 ASSERT(it != mapped_addresses.end()); 360
317 mapped_addresses.erase(it); 361 if constexpr (USE_MEMORY_MAPS) {
318 mapped_addresses_allocator.Release(map); 362 auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes);
319 } 363 const u8* const mapped_memory = download_staging.mapped_span.data();
320 364 const std::span<BufferCopy> copies_span(copies.data(), copies.data() + copies.size());
321private: 365 runtime.CopyBuffer(download_staging.buffer, buffer, copies_span);
322 MapInterval* MapAddress(Buffer* block, GPUVAddr gpu_addr, VAddr cpu_addr, std::size_t size) { 366 runtime.Finish();
323 const VectorMapInterval overlaps = GetMapsInRange(cpu_addr, size); 367 for (const BufferCopy& copy : copies) {
324 if (overlaps.empty()) { 368 const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset;
325 const VAddr cpu_addr_end = cpu_addr + size; 369 const u8* copy_mapped_memory = mapped_memory + copy.dst_offset;
326 if (gpu_memory.IsGranularRange(gpu_addr, size)) { 370 cpu_memory.WriteBlockUnsafe(copy_cpu_addr, copy_mapped_memory, copy.size);
327 u8* const host_ptr = gpu_memory.GetPointer(gpu_addr);
328 block->Upload(block->Offset(cpu_addr), size, host_ptr);
329 } else {
330 staging_buffer.resize(size);
331 gpu_memory.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size);
332 block->Upload(block->Offset(cpu_addr), size, staging_buffer.data());
333 } 371 }
334 return Register(MapInterval(cpu_addr, cpu_addr_end, gpu_addr)); 372 } else {
335 } 373 const std::span<u8> immediate_buffer = ImmediateBuffer(largest_copy);
336 374 for (const BufferCopy& copy : copies) {
337 const VAddr cpu_addr_end = cpu_addr + size; 375 buffer.ImmediateDownload(copy.src_offset, immediate_buffer.subspan(0, copy.size));
338 if (overlaps.size() == 1) { 376 const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset;
339 MapInterval* const current_map = overlaps[0]; 377 cpu_memory.WriteBlockUnsafe(copy_cpu_addr, immediate_buffer.data(), copy.size);
340 if (current_map->IsInside(cpu_addr, cpu_addr_end)) {
341 return current_map;
342 } 378 }
343 } 379 }
344 VAddr new_start = cpu_addr; 380 });
345 VAddr new_end = cpu_addr_end; 381}
346 bool write_inheritance = false; 382
347 bool modified_inheritance = false; 383template <class P>
348 // Calculate new buffer parameters 384void BufferCache<P>::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr,
349 for (MapInterval* overlap : overlaps) { 385 u32 size) {
350 new_start = std::min(overlap->start, new_start); 386 const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr);
351 new_end = std::max(overlap->end, new_end); 387 if (!cpu_addr) {
352 write_inheritance |= overlap->is_written; 388 uniform_buffers[stage][index] = NULL_BINDING;
353 modified_inheritance |= overlap->is_modified; 389 return;
390 }
391 const Binding binding{
392 .cpu_addr = *cpu_addr,
393 .size = size,
394 .buffer_id = BufferId{},
395 };
396 uniform_buffers[stage][index] = binding;
397}
398
399template <class P>
400void BufferCache<P>::UpdateGraphicsBuffers(bool is_indexed) {
401 MICROPROFILE_SCOPE(GPU_PrepareBuffers);
402 do {
403 has_deleted_buffers = false;
404 DoUpdateGraphicsBuffers(is_indexed);
405 } while (has_deleted_buffers);
406}
407
408template <class P>
409void BufferCache<P>::UpdateComputeBuffers() {
410 MICROPROFILE_SCOPE(GPU_PrepareBuffers);
411 do {
412 has_deleted_buffers = false;
413 DoUpdateComputeBuffers();
414 } while (has_deleted_buffers);
415}
416
417template <class P>
418void BufferCache<P>::BindHostGeometryBuffers(bool is_indexed) {
419 MICROPROFILE_SCOPE(GPU_BindUploadBuffers);
420 if (is_indexed) {
421 BindHostIndexBuffer();
422 } else if constexpr (!HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT) {
423 const auto& regs = maxwell3d.regs;
424 if (regs.draw.topology == Maxwell::PrimitiveTopology::Quads) {
425 runtime.BindQuadArrayIndexBuffer(regs.vertex_buffer.first, regs.vertex_buffer.count);
354 } 426 }
355 GPUVAddr new_gpu_addr = gpu_addr + new_start - cpu_addr; 427 }
356 for (auto& overlap : overlaps) { 428 BindHostVertexBuffers();
357 Unregister(overlap); 429 BindHostTransformFeedbackBuffers();
430}
431
432template <class P>
433void BufferCache<P>::BindHostStageBuffers(size_t stage) {
434 MICROPROFILE_SCOPE(GPU_BindUploadBuffers);
435 BindHostGraphicsUniformBuffers(stage);
436 BindHostGraphicsStorageBuffers(stage);
437}
438
439template <class P>
440void BufferCache<P>::BindHostComputeBuffers() {
441 MICROPROFILE_SCOPE(GPU_BindUploadBuffers);
442 BindHostComputeUniformBuffers();
443 BindHostComputeStorageBuffers();
444}
445
446template <class P>
447void BufferCache<P>::SetEnabledUniformBuffers(size_t stage, u32 enabled) {
448 if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) {
449 if (enabled_uniform_buffers[stage] != enabled) {
450 dirty_uniform_buffers[stage] = ~u32{0};
358 } 451 }
359 UpdateBlock(block, new_start, new_end, overlaps); 452 }
360 453 enabled_uniform_buffers[stage] = enabled;
361 const MapInterval new_map{new_start, new_end, new_gpu_addr}; 454}
362 MapInterval* const map = Register(new_map, write_inheritance); 455
363 if (!map) { 456template <class P>
364 return nullptr; 457void BufferCache<P>::SetEnabledComputeUniformBuffers(u32 enabled) {
458 enabled_compute_uniform_buffers = enabled;
459}
460
461template <class P>
462void BufferCache<P>::UnbindGraphicsStorageBuffers(size_t stage) {
463 enabled_storage_buffers[stage] = 0;
464 written_storage_buffers[stage] = 0;
465}
466
467template <class P>
468void BufferCache<P>::BindGraphicsStorageBuffer(size_t stage, size_t ssbo_index, u32 cbuf_index,
469 u32 cbuf_offset, bool is_written) {
470 enabled_storage_buffers[stage] |= 1U << ssbo_index;
471 written_storage_buffers[stage] |= (is_written ? 1U : 0U) << ssbo_index;
472
473 const auto& cbufs = maxwell3d.state.shader_stages[stage];
474 const GPUVAddr ssbo_addr = cbufs.const_buffers[cbuf_index].address + cbuf_offset;
475 storage_buffers[stage][ssbo_index] = StorageBufferBinding(ssbo_addr);
476}
477
478template <class P>
479void BufferCache<P>::UnbindComputeStorageBuffers() {
480 enabled_compute_storage_buffers = 0;
481 written_compute_storage_buffers = 0;
482}
483
484template <class P>
485void BufferCache<P>::BindComputeStorageBuffer(size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset,
486 bool is_written) {
487 enabled_compute_storage_buffers |= 1U << ssbo_index;
488 written_compute_storage_buffers |= (is_written ? 1U : 0U) << ssbo_index;
489
490 const auto& launch_desc = kepler_compute.launch_description;
491 ASSERT(((launch_desc.const_buffer_enable_mask >> cbuf_index) & 1) != 0);
492
493 const auto& cbufs = launch_desc.const_buffer_config;
494 const GPUVAddr ssbo_addr = cbufs[cbuf_index].Address() + cbuf_offset;
495 compute_storage_buffers[ssbo_index] = StorageBufferBinding(ssbo_addr);
496}
497
498template <class P>
499void BufferCache<P>::FlushCachedWrites() {
500 for (const BufferId buffer_id : cached_write_buffer_ids) {
501 slot_buffers[buffer_id].FlushCachedWrites();
502 }
503 cached_write_buffer_ids.clear();
504}
505
506template <class P>
507bool BufferCache<P>::HasUncommittedFlushes() const noexcept {
508 return !uncommitted_downloads.empty();
509}
510
511template <class P>
512bool BufferCache<P>::ShouldWaitAsyncFlushes() const noexcept {
513 return !committed_downloads.empty() && !committed_downloads.front().empty();
514}
515
516template <class P>
517void BufferCache<P>::CommitAsyncFlushes() {
518 // This is intentionally passing the value by copy
519 committed_downloads.push_front(uncommitted_downloads);
520 uncommitted_downloads.clear();
521}
522
523template <class P>
524void BufferCache<P>::PopAsyncFlushes() {
525 if (committed_downloads.empty()) {
526 return;
527 }
528 auto scope_exit_pop_download = detail::ScopeExit([this] { committed_downloads.pop_back(); });
529 const std::span<const BufferId> download_ids = committed_downloads.back();
530 if (download_ids.empty()) {
531 return;
532 }
533 MICROPROFILE_SCOPE(GPU_DownloadMemory);
534
535 boost::container::small_vector<std::pair<BufferCopy, BufferId>, 1> downloads;
536 u64 total_size_bytes = 0;
537 u64 largest_copy = 0;
538 for (const BufferId buffer_id : download_ids) {
539 slot_buffers[buffer_id].ForEachDownloadRange([&](u64 range_offset, u64 range_size) {
540 downloads.push_back({
541 BufferCopy{
542 .src_offset = range_offset,
543 .dst_offset = total_size_bytes,
544 .size = range_size,
545 },
546 buffer_id,
547 });
548 total_size_bytes += range_size;
549 largest_copy = std::max(largest_copy, range_size);
550 });
551 }
552 if (downloads.empty()) {
553 return;
554 }
555 if constexpr (USE_MEMORY_MAPS) {
556 auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes);
557 for (const auto [copy, buffer_id] : downloads) {
558 const std::array copies{copy};
559 runtime.CopyBuffer(download_staging.buffer, slot_buffers[buffer_id], copies);
365 } 560 }
366 if (modified_inheritance) { 561 runtime.Finish();
367 map->MarkAsModified(true, GetModifiedTicks()); 562 for (const auto [copy, buffer_id] : downloads) {
368 if (Settings::IsGPULevelHigh() && 563 const Buffer& buffer = slot_buffers[buffer_id];
369 Settings::values.use_asynchronous_gpu_emulation.GetValue()) { 564 const VAddr cpu_addr = buffer.CpuAddr() + copy.src_offset;
370 MarkForAsyncFlush(map); 565 const u8* read_mapped_memory = download_staging.mapped_span.data() + copy.dst_offset;
371 } 566 cpu_memory.WriteBlockUnsafe(cpu_addr, read_mapped_memory, copy.size);
567 }
568 } else {
569 const std::span<u8> immediate_buffer = ImmediateBuffer(largest_copy);
570 for (const auto [copy, buffer_id] : downloads) {
571 Buffer& buffer = slot_buffers[buffer_id];
572 buffer.ImmediateDownload(copy.src_offset, immediate_buffer.subspan(0, copy.size));
573 const VAddr cpu_addr = buffer.CpuAddr() + copy.src_offset;
574 cpu_memory.WriteBlockUnsafe(cpu_addr, immediate_buffer.data(), copy.size);
372 } 575 }
373 return map;
374 } 576 }
375 577}
376 void UpdateBlock(Buffer* block, VAddr start, VAddr end, const VectorMapInterval& overlaps) { 578
377 const IntervalType base_interval{start, end}; 579template <class P>
378 IntervalSet interval_set{}; 580bool BufferCache<P>::IsRegionGpuModified(VAddr addr, size_t size) {
379 interval_set.add(base_interval); 581 const u64 page_end = Common::DivCeil(addr + size, PAGE_SIZE);
380 for (auto& overlap : overlaps) { 582 for (u64 page = addr >> PAGE_BITS; page < page_end;) {
381 const IntervalType subtract{overlap->start, overlap->end}; 583 const BufferId image_id = page_table[page];
382 interval_set.subtract(subtract); 584 if (!image_id) {
585 ++page;
586 continue;
383 } 587 }
384 for (auto& interval : interval_set) { 588 Buffer& buffer = slot_buffers[image_id];
385 const std::size_t size = interval.upper() - interval.lower(); 589 if (buffer.IsRegionGpuModified(addr, size)) {
386 if (size == 0) { 590 return true;
387 continue;
388 }
389 staging_buffer.resize(size);
390 cpu_memory.ReadBlockUnsafe(interval.lower(), staging_buffer.data(), size);
391 block->Upload(block->Offset(interval.lower()), size, staging_buffer.data());
392 } 591 }
592 const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes();
593 page = Common::DivCeil(end_addr, PAGE_SIZE);
393 } 594 }
394 595 return false;
395 VectorMapInterval GetMapsInRange(VAddr addr, std::size_t size) { 596}
396 VectorMapInterval result; 597
397 if (size == 0) { 598template <class P>
398 return result; 599void BufferCache<P>::BindHostIndexBuffer() {
600 Buffer& buffer = slot_buffers[index_buffer.buffer_id];
601 const u32 offset = buffer.Offset(index_buffer.cpu_addr);
602 const u32 size = index_buffer.size;
603 SynchronizeBuffer(buffer, index_buffer.cpu_addr, size);
604 if constexpr (HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT) {
605 runtime.BindIndexBuffer(buffer, offset, size);
606 } else {
607 runtime.BindIndexBuffer(maxwell3d.regs.draw.topology, maxwell3d.regs.index_array.format,
608 maxwell3d.regs.index_array.first, maxwell3d.regs.index_array.count,
609 buffer, offset, size);
610 }
611}
612
613template <class P>
614void BufferCache<P>::BindHostVertexBuffers() {
615 auto& flags = maxwell3d.dirty.flags;
616 for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) {
617 const Binding& binding = vertex_buffers[index];
618 Buffer& buffer = slot_buffers[binding.buffer_id];
619 SynchronizeBuffer(buffer, binding.cpu_addr, binding.size);
620 if (!flags[Dirty::VertexBuffer0 + index]) {
621 continue;
399 } 622 }
623 flags[Dirty::VertexBuffer0 + index] = false;
624
625 const u32 stride = maxwell3d.regs.vertex_array[index].stride;
626 const u32 offset = buffer.Offset(binding.cpu_addr);
627 runtime.BindVertexBuffer(index, buffer, offset, binding.size, stride);
628 }
629}
400 630
401 const VAddr addr_end = addr + size; 631template <class P>
402 auto it = mapped_addresses.lower_bound(addr); 632void BufferCache<P>::BindHostGraphicsUniformBuffers(size_t stage) {
403 if (it != mapped_addresses.begin()) { 633 u32 dirty = ~0U;
404 --it; 634 if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) {
635 dirty = std::exchange(dirty_uniform_buffers[stage], 0);
636 }
637 u32 binding_index = 0;
638 ForEachEnabledBit(enabled_uniform_buffers[stage], [&](u32 index) {
639 const bool needs_bind = ((dirty >> index) & 1) != 0;
640 BindHostGraphicsUniformBuffer(stage, index, binding_index, needs_bind);
641 if constexpr (NEEDS_BIND_UNIFORM_INDEX) {
642 ++binding_index;
405 } 643 }
406 while (it != mapped_addresses.end() && it->start < addr_end) { 644 });
407 if (it->Overlaps(addr, addr_end)) { 645}
408 result.push_back(&*it); 646
647template <class P>
648void BufferCache<P>::BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 binding_index,
649 bool needs_bind) {
650 const Binding& binding = uniform_buffers[stage][index];
651 const VAddr cpu_addr = binding.cpu_addr;
652 const u32 size = binding.size;
653 Buffer& buffer = slot_buffers[binding.buffer_id];
654 if constexpr (IS_OPENGL) {
655 if (size <= SKIP_CACHE_SIZE && !buffer.IsRegionGpuModified(cpu_addr, size)) {
656 if (runtime.HasFastBufferSubData()) {
657 // Fast path for Nvidia
658 if (!HasFastUniformBufferBound(stage, binding_index)) {
659 // We only have to bind when the currently bound buffer is not the fast version
660 fast_bound_uniform_buffers[stage] |= 1U << binding_index;
661 runtime.BindFastUniformBuffer(stage, binding_index, size);
662 }
663 const auto span = ImmediateBufferWithData(cpu_addr, size);
664 runtime.PushFastUniformBuffer(stage, binding_index, span);
665 } else {
666 // Stream buffer path to avoid stalling on non-Nvidia drivers
667 const auto span = runtime.BindMappedUniformBuffer(stage, binding_index, size);
668 cpu_memory.ReadBlockUnsafe(cpu_addr, span.data(), size);
409 } 669 }
410 ++it; 670 return;
411 } 671 }
412 return result;
413 } 672 }
414 673 // Classic cached path
415 /// Returns a ticks counter used for tracking when cached objects were last modified 674 SynchronizeBuffer(buffer, cpu_addr, size);
416 u64 GetModifiedTicks() { 675 if (!needs_bind && !HasFastUniformBufferBound(stage, binding_index)) {
417 return ++modified_ticks; 676 // Skip binding if it's not needed and if the bound buffer is not the fast version
677 // This exists to avoid instances where the fast buffer is bound and a GPU write happens
678 return;
418 } 679 }
680 fast_bound_uniform_buffers[stage] &= ~(1U << binding_index);
419 681
420 void FlushMap(MapInterval* map) { 682 const u32 offset = buffer.Offset(cpu_addr);
421 const auto it = blocks.find(map->start >> BLOCK_PAGE_BITS); 683 if constexpr (NEEDS_BIND_UNIFORM_INDEX) {
422 ASSERT_OR_EXECUTE(it != blocks.end(), return;); 684 runtime.BindUniformBuffer(stage, binding_index, buffer, offset, size);
423 685 } else {
424 std::shared_ptr<Buffer> block = it->second; 686 runtime.BindUniformBuffer(buffer, offset, size);
425
426 const std::size_t size = map->end - map->start;
427 staging_buffer.resize(size);
428 block->Download(block->Offset(map->start), size, staging_buffer.data());
429 cpu_memory.WriteBlockUnsafe(map->start, staging_buffer.data(), size);
430 map->MarkAsModified(false, 0);
431 } 687 }
688}
689
690template <class P>
691void BufferCache<P>::BindHostGraphicsStorageBuffers(size_t stage) {
692 u32 binding_index = 0;
693 ForEachEnabledBit(enabled_storage_buffers[stage], [&](u32 index) {
694 const Binding& binding = storage_buffers[stage][index];
695 Buffer& buffer = slot_buffers[binding.buffer_id];
696 const u32 size = binding.size;
697 SynchronizeBuffer(buffer, binding.cpu_addr, size);
698
699 const u32 offset = buffer.Offset(binding.cpu_addr);
700 const bool is_written = ((written_storage_buffers[stage] >> index) & 1) != 0;
701 if constexpr (NEEDS_BIND_STORAGE_INDEX) {
702 runtime.BindStorageBuffer(stage, binding_index, buffer, offset, size, is_written);
703 ++binding_index;
704 } else {
705 runtime.BindStorageBuffer(buffer, offset, size, is_written);
706 }
707 });
708}
432 709
433 template <typename Callable> 710template <class P>
434 BufferInfo StreamBufferUpload(std::size_t size, std::size_t alignment, Callable&& callable) { 711void BufferCache<P>::BindHostTransformFeedbackBuffers() {
435 AlignBuffer(alignment); 712 if (maxwell3d.regs.tfb_enabled == 0) {
436 const std::size_t uploaded_offset = buffer_offset; 713 return;
437 callable(buffer_ptr);
438
439 buffer_ptr += size;
440 buffer_offset += size;
441 return BufferInfo{stream_buffer.Handle(), uploaded_offset, stream_buffer.Address()};
442 } 714 }
443 715 for (u32 index = 0; index < NUM_TRANSFORM_FEEDBACK_BUFFERS; ++index) {
444 void AlignBuffer(std::size_t alignment) { 716 const Binding& binding = transform_feedback_buffers[index];
445 // Align the offset, not the mapped pointer 717 Buffer& buffer = slot_buffers[binding.buffer_id];
446 const std::size_t offset_aligned = Common::AlignUp(buffer_offset, alignment); 718 const u32 size = binding.size;
447 buffer_ptr += offset_aligned - buffer_offset; 719 SynchronizeBuffer(buffer, binding.cpu_addr, size);
448 buffer_offset = offset_aligned; 720
721 const u32 offset = buffer.Offset(binding.cpu_addr);
722 runtime.BindTransformFeedbackBuffer(index, buffer, offset, size);
449 } 723 }
724}
450 725
451 std::shared_ptr<Buffer> EnlargeBlock(std::shared_ptr<Buffer> buffer) { 726template <class P>
452 const std::size_t old_size = buffer->Size(); 727void BufferCache<P>::BindHostComputeUniformBuffers() {
453 const std::size_t new_size = old_size + BLOCK_PAGE_SIZE; 728 if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) {
454 const VAddr cpu_addr = buffer->CpuAddr(); 729 // Mark all uniform buffers as dirty
455 std::shared_ptr<Buffer> new_buffer = CreateBlock(cpu_addr, new_size); 730 dirty_uniform_buffers.fill(~u32{0});
456 new_buffer->CopyFrom(*buffer, 0, 0, old_size); 731 }
457 QueueDestruction(std::move(buffer)); 732 u32 binding_index = 0;
458 733 ForEachEnabledBit(enabled_compute_uniform_buffers, [&](u32 index) {
459 const VAddr cpu_addr_end = cpu_addr + new_size - 1; 734 const Binding& binding = compute_uniform_buffers[index];
460 const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS; 735 Buffer& buffer = slot_buffers[binding.buffer_id];
461 for (u64 page_start = cpu_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) { 736 const u32 size = binding.size;
462 blocks.insert_or_assign(page_start, new_buffer); 737 SynchronizeBuffer(buffer, binding.cpu_addr, size);
738
739 const u32 offset = buffer.Offset(binding.cpu_addr);
740 if constexpr (NEEDS_BIND_UNIFORM_INDEX) {
741 runtime.BindComputeUniformBuffer(binding_index, buffer, offset, size);
742 ++binding_index;
743 } else {
744 runtime.BindUniformBuffer(buffer, offset, size);
463 } 745 }
746 });
747}
748
749template <class P>
750void BufferCache<P>::BindHostComputeStorageBuffers() {
751 u32 binding_index = 0;
752 ForEachEnabledBit(enabled_compute_storage_buffers, [&](u32 index) {
753 const Binding& binding = compute_storage_buffers[index];
754 Buffer& buffer = slot_buffers[binding.buffer_id];
755 const u32 size = binding.size;
756 SynchronizeBuffer(buffer, binding.cpu_addr, size);
757
758 const u32 offset = buffer.Offset(binding.cpu_addr);
759 const bool is_written = ((written_compute_storage_buffers >> index) & 1) != 0;
760 if constexpr (NEEDS_BIND_STORAGE_INDEX) {
761 runtime.BindComputeStorageBuffer(binding_index, buffer, offset, size, is_written);
762 ++binding_index;
763 } else {
764 runtime.BindStorageBuffer(buffer, offset, size, is_written);
765 }
766 });
767}
464 768
465 return new_buffer; 769template <class P>
770void BufferCache<P>::DoUpdateGraphicsBuffers(bool is_indexed) {
771 if (is_indexed) {
772 UpdateIndexBuffer();
466 } 773 }
774 UpdateVertexBuffers();
775 UpdateTransformFeedbackBuffers();
776 for (size_t stage = 0; stage < NUM_STAGES; ++stage) {
777 UpdateUniformBuffers(stage);
778 UpdateStorageBuffers(stage);
779 }
780}
781
782template <class P>
783void BufferCache<P>::DoUpdateComputeBuffers() {
784 UpdateComputeUniformBuffers();
785 UpdateComputeStorageBuffers();
786}
787
788template <class P>
789void BufferCache<P>::UpdateIndexBuffer() {
790 // We have to check for the dirty flags and index count
791 // The index count is currently changed without updating the dirty flags
792 const auto& index_array = maxwell3d.regs.index_array;
793 auto& flags = maxwell3d.dirty.flags;
794 if (!flags[Dirty::IndexBuffer] && last_index_count == index_array.count) {
795 return;
796 }
797 flags[Dirty::IndexBuffer] = false;
798 last_index_count = index_array.count;
799
800 const GPUVAddr gpu_addr_begin = index_array.StartAddress();
801 const GPUVAddr gpu_addr_end = index_array.EndAddress();
802 const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr_begin);
803 const u32 address_size = static_cast<u32>(gpu_addr_end - gpu_addr_begin);
804 const u32 draw_size = index_array.count * index_array.FormatSizeInBytes();
805 const u32 size = std::min(address_size, draw_size);
806 if (size == 0 || !cpu_addr) {
807 index_buffer = NULL_BINDING;
808 return;
809 }
810 index_buffer = Binding{
811 .cpu_addr = *cpu_addr,
812 .size = size,
813 .buffer_id = FindBuffer(*cpu_addr, size),
814 };
815}
467 816
468 std::shared_ptr<Buffer> MergeBlocks(std::shared_ptr<Buffer> first, 817template <class P>
469 std::shared_ptr<Buffer> second) { 818void BufferCache<P>::UpdateVertexBuffers() {
470 const std::size_t size_1 = first->Size(); 819 auto& flags = maxwell3d.dirty.flags;
471 const std::size_t size_2 = second->Size(); 820 if (!maxwell3d.dirty.flags[Dirty::VertexBuffers]) {
472 const VAddr first_addr = first->CpuAddr(); 821 return;
473 const VAddr second_addr = second->CpuAddr(); 822 }
474 const VAddr new_addr = std::min(first_addr, second_addr); 823 flags[Dirty::VertexBuffers] = false;
475 const std::size_t new_size = size_1 + size_2;
476
477 std::shared_ptr<Buffer> new_buffer = CreateBlock(new_addr, new_size);
478 new_buffer->CopyFrom(*first, 0, new_buffer->Offset(first_addr), size_1);
479 new_buffer->CopyFrom(*second, 0, new_buffer->Offset(second_addr), size_2);
480 QueueDestruction(std::move(first));
481 QueueDestruction(std::move(second));
482 824
483 const VAddr cpu_addr_end = new_addr + new_size - 1; 825 for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) {
484 const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS; 826 UpdateVertexBuffer(index);
485 for (u64 page_start = new_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) {
486 blocks.insert_or_assign(page_start, new_buffer);
487 }
488 return new_buffer;
489 } 827 }
828}
490 829
491 Buffer* GetBlock(VAddr cpu_addr, std::size_t size) { 830template <class P>
492 std::shared_ptr<Buffer> found; 831void BufferCache<P>::UpdateVertexBuffer(u32 index) {
832 if (!maxwell3d.dirty.flags[Dirty::VertexBuffer0 + index]) {
833 return;
834 }
835 const auto& array = maxwell3d.regs.vertex_array[index];
836 const auto& limit = maxwell3d.regs.vertex_array_limit[index];
837 const GPUVAddr gpu_addr_begin = array.StartAddress();
838 const GPUVAddr gpu_addr_end = limit.LimitAddress() + 1;
839 const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr_begin);
840 const u32 address_size = static_cast<u32>(gpu_addr_end - gpu_addr_begin);
841 const u32 size = address_size; // TODO: Analyze stride and number of vertices
842 if (array.enable == 0 || size == 0 || !cpu_addr) {
843 vertex_buffers[index] = NULL_BINDING;
844 return;
845 }
846 vertex_buffers[index] = Binding{
847 .cpu_addr = *cpu_addr,
848 .size = size,
849 .buffer_id = FindBuffer(*cpu_addr, size),
850 };
851}
852
853template <class P>
854void BufferCache<P>::UpdateUniformBuffers(size_t stage) {
855 ForEachEnabledBit(enabled_uniform_buffers[stage], [&](u32 index) {
856 Binding& binding = uniform_buffers[stage][index];
857 if (binding.buffer_id) {
858 // Already updated
859 return;
860 }
861 // Mark as dirty
862 if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) {
863 dirty_uniform_buffers[stage] |= 1U << index;
864 }
865 // Resolve buffer
866 binding.buffer_id = FindBuffer(binding.cpu_addr, binding.size);
867 });
868}
869
870template <class P>
871void BufferCache<P>::UpdateStorageBuffers(size_t stage) {
872 const u32 written_mask = written_storage_buffers[stage];
873 ForEachEnabledBit(enabled_storage_buffers[stage], [&](u32 index) {
874 // Resolve buffer
875 Binding& binding = storage_buffers[stage][index];
876 const BufferId buffer_id = FindBuffer(binding.cpu_addr, binding.size);
877 binding.buffer_id = buffer_id;
878 // Mark buffer as written if needed
879 if (((written_mask >> index) & 1) != 0) {
880 MarkWrittenBuffer(buffer_id, binding.cpu_addr, binding.size);
881 }
882 });
883}
493 884
494 const VAddr cpu_addr_end = cpu_addr + size - 1; 885template <class P>
495 const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS; 886void BufferCache<P>::UpdateTransformFeedbackBuffers() {
496 for (u64 page_start = cpu_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) { 887 if (maxwell3d.regs.tfb_enabled == 0) {
497 auto it = blocks.find(page_start); 888 return;
498 if (it == blocks.end()) { 889 }
499 if (found) { 890 for (u32 index = 0; index < NUM_TRANSFORM_FEEDBACK_BUFFERS; ++index) {
500 found = EnlargeBlock(found); 891 UpdateTransformFeedbackBuffer(index);
501 continue; 892 }
502 } 893}
503 const VAddr start_addr = page_start << BLOCK_PAGE_BITS; 894
504 found = CreateBlock(start_addr, BLOCK_PAGE_SIZE); 895template <class P>
505 blocks.insert_or_assign(page_start, found); 896void BufferCache<P>::UpdateTransformFeedbackBuffer(u32 index) {
506 continue; 897 const auto& binding = maxwell3d.regs.tfb_bindings[index];
507 } 898 const GPUVAddr gpu_addr = binding.Address() + binding.buffer_offset;
508 if (!found) { 899 const u32 size = binding.buffer_size;
509 found = it->second; 900 const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr);
510 continue; 901 if (binding.buffer_enable == 0 || size == 0 || !cpu_addr) {
511 } 902 transform_feedback_buffers[index] = NULL_BINDING;
512 if (found != it->second) { 903 return;
513 found = MergeBlocks(std::move(found), it->second); 904 }
905 const BufferId buffer_id = FindBuffer(*cpu_addr, size);
906 transform_feedback_buffers[index] = Binding{
907 .cpu_addr = *cpu_addr,
908 .size = size,
909 .buffer_id = buffer_id,
910 };
911 MarkWrittenBuffer(buffer_id, *cpu_addr, size);
912}
913
914template <class P>
915void BufferCache<P>::UpdateComputeUniformBuffers() {
916 ForEachEnabledBit(enabled_compute_uniform_buffers, [&](u32 index) {
917 Binding& binding = compute_uniform_buffers[index];
918 binding = NULL_BINDING;
919 const auto& launch_desc = kepler_compute.launch_description;
920 if (((launch_desc.const_buffer_enable_mask >> index) & 1) != 0) {
921 const auto& cbuf = launch_desc.const_buffer_config[index];
922 const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(cbuf.Address());
923 if (cpu_addr) {
924 binding.cpu_addr = *cpu_addr;
925 binding.size = cbuf.size;
514 } 926 }
515 } 927 }
516 return found.get(); 928 binding.buffer_id = FindBuffer(binding.cpu_addr, binding.size);
929 });
930}
931
932template <class P>
933void BufferCache<P>::UpdateComputeStorageBuffers() {
934 ForEachEnabledBit(enabled_compute_storage_buffers, [&](u32 index) {
935 // Resolve buffer
936 Binding& binding = compute_storage_buffers[index];
937 const BufferId buffer_id = FindBuffer(binding.cpu_addr, binding.size);
938 binding.buffer_id = buffer_id;
939 // Mark as written if needed
940 if (((written_compute_storage_buffers >> index) & 1) != 0) {
941 MarkWrittenBuffer(buffer_id, binding.cpu_addr, binding.size);
942 }
943 });
944}
945
946template <class P>
947void BufferCache<P>::MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 size) {
948 Buffer& buffer = slot_buffers[buffer_id];
949 buffer.MarkRegionAsGpuModified(cpu_addr, size);
950
951 const bool is_accuracy_high = Settings::IsGPULevelHigh();
952 const bool is_async = Settings::values.use_asynchronous_gpu_emulation.GetValue();
953 if (!is_accuracy_high || !is_async) {
954 return;
517 } 955 }
956 if (std::ranges::find(uncommitted_downloads, buffer_id) != uncommitted_downloads.end()) {
957 // Already inserted
958 return;
959 }
960 uncommitted_downloads.push_back(buffer_id);
961}
518
519     void MarkRegionAsWritten(VAddr start, VAddr end) {
520         const u64 page_end = end >> WRITE_PAGE_BIT;
521         for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) {
522             if (const auto [it, inserted] = written_pages.emplace(page_start, 1); !inserted) {
523                 ++it->second;
524             }
525         }
526     }
527
528     void UnmarkRegionAsWritten(VAddr start, VAddr end) {
529         const u64 page_end = end >> WRITE_PAGE_BIT;
530         for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) {
531             auto it = written_pages.find(page_start);
532             if (it != written_pages.end()) {
533                 if (it->second > 1) {
534                     --it->second;
535                 } else {
536                     written_pages.erase(it);
537                 }
538             }
539         }
540     }
541
542     bool IsRegionWritten(VAddr start, VAddr end) const {
543         const u64 page_end = end >> WRITE_PAGE_BIT;
544         for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) {
545             if (written_pages.contains(page_start)) {
546                 return true;
547             }
548         }
549         return false;
550     }
551
552     void QueueDestruction(std::shared_ptr<Buffer> buffer) {
553         buffer->SetEpoch(epoch);
554         pending_destruction.push(std::move(buffer));
555     }
556
557     void MarkForAsyncFlush(MapInterval* map) {
558         if (!uncommitted_flushes) {
559             uncommitted_flushes = std::make_shared<std::unordered_set<MapInterval*>>();
560         }
561         uncommitted_flushes->insert(map);
562     }
563
564     VideoCore::RasterizerInterface& rasterizer;
565     Tegra::MemoryManager& gpu_memory;
566     Core::Memory::Memory& cpu_memory;
567     StreamBuffer& stream_buffer;
568
569     u8* buffer_ptr = nullptr;
570     u64 buffer_offset = 0;
571     u64 buffer_offset_base = 0;
572
573     MapIntervalAllocator mapped_addresses_allocator;
574     boost::intrusive::set<MapInterval, boost::intrusive::compare<MapIntervalCompare>>
575         mapped_addresses;
576
577     std::unordered_map<u64, u32> written_pages;
578     std::unordered_map<u64, std::shared_ptr<Buffer>> blocks;
579
580     std::queue<std::shared_ptr<Buffer>> pending_destruction;
581     u64 epoch = 0;
582     u64 modified_ticks = 0;
583
584     std::vector<u8> staging_buffer;
585
586     std::list<MapInterval*> marked_for_unregister;
587
588     std::shared_ptr<std::unordered_set<MapInterval*>> uncommitted_flushes;
589     std::list<std::shared_ptr<std::list<MapInterval*>>> committed_flushes;
590
591     std::recursive_mutex mutex;
592};
593
594} // namespace VideoCommon
962
963template <class P>
964BufferId BufferCache<P>::FindBuffer(VAddr cpu_addr, u32 size) {
965     if (cpu_addr == 0) {
966         return NULL_BUFFER_ID;
967     }
968     const u64 page = cpu_addr >> PAGE_BITS;
969     const BufferId buffer_id = page_table[page];
970     if (!buffer_id) {
971         return CreateBuffer(cpu_addr, size);
972     }
973     const Buffer& buffer = slot_buffers[buffer_id];
974     if (buffer.IsInBounds(cpu_addr, size)) {
975         return buffer_id;
976     }
977     return CreateBuffer(cpu_addr, size);
978}
979
980template <class P>
981BufferId BufferCache<P>::CreateBuffer(VAddr cpu_addr, u32 wanted_size) {
982     std::vector<BufferId> overlap_ids;
983     VAddr cpu_addr_begin = cpu_addr;
984     VAddr cpu_addr_end = cpu_addr + wanted_size;
985     for (; cpu_addr >> PAGE_BITS < Common::DivCeil(cpu_addr_end, PAGE_SIZE);
986          cpu_addr += PAGE_SIZE) {
987         const BufferId overlap_id = page_table[cpu_addr >> PAGE_BITS];
988         if (!overlap_id) {
989             continue;
990         }
991         Buffer& overlap = slot_buffers[overlap_id];
992         if (overlap.IsPicked()) {
993             continue;
994         }
995         overlap.Pick();
996         overlap_ids.push_back(overlap_id);
997         const VAddr overlap_cpu_addr = overlap.CpuAddr();
998         if (overlap_cpu_addr < cpu_addr_begin) {
999             cpu_addr = cpu_addr_begin = overlap_cpu_addr;
1000         }
1001         cpu_addr_end = std::max(cpu_addr_end, overlap_cpu_addr + overlap.SizeBytes());
1002     }
1003     const u32 size = static_cast<u32>(cpu_addr_end - cpu_addr_begin);
1004     const BufferId new_buffer_id = slot_buffers.insert(runtime, rasterizer, cpu_addr_begin, size);
1005     Buffer& new_buffer = slot_buffers[new_buffer_id];
1006
1007     for (const BufferId overlap_id : overlap_ids) {
1008         Buffer& overlap = slot_buffers[overlap_id];
1009         overlap.Unpick();
1010
1011         std::vector<BufferCopy> copies;
1012         const size_t dst_base_offset = overlap.CpuAddr() - new_buffer.CpuAddr();
1013         overlap.ForEachDownloadRange([&](u64 begin, u64 range_size) {
1014             copies.push_back(BufferCopy{
1015                 .src_offset = begin,
1016                 .dst_offset = dst_base_offset + begin,
1017                 .size = range_size,
1018             });
1019             new_buffer.UnmarkRegionAsCpuModified(begin, range_size);
1020             new_buffer.MarkRegionAsGpuModified(begin, range_size);
1021         });
1022         if (!copies.empty()) {
1023             runtime.CopyBuffer(slot_buffers[new_buffer_id], overlap, copies);
1024         }
1025         ReplaceBufferDownloads(overlap_id, new_buffer_id);
1026         DeleteBuffer(overlap_id);
1027     }
1028     Register(new_buffer_id);
1029     return new_buffer_id;
1030}
1031
1032template <class P>
1033void BufferCache<P>::Register(BufferId buffer_id) {
1034     ChangeRegister<true>(buffer_id);
1035}
1036
1037template <class P>
1038void BufferCache<P>::Unregister(BufferId buffer_id) {
1039     ChangeRegister<false>(buffer_id);
1040}
1041
1042template <class P>
1043template <bool insert>
1044void BufferCache<P>::ChangeRegister(BufferId buffer_id) {
1045     const Buffer& buffer = slot_buffers[buffer_id];
1046     const VAddr cpu_addr_begin = buffer.CpuAddr();
1047     const VAddr cpu_addr_end = cpu_addr_begin + buffer.SizeBytes();
1048     const u64 page_begin = cpu_addr_begin / PAGE_SIZE;
1049     const u64 page_end = Common::DivCeil(cpu_addr_end, PAGE_SIZE);
1050     for (u64 page = page_begin; page != page_end; ++page) {
1051         if constexpr (insert) {
1052             page_table[page] = buffer_id;
1053         } else {
1054             page_table[page] = BufferId{};
1055         }
1056     }
1057}
1058
1059template <class P>
1060void BufferCache<P>::SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size) {
1061     if (buffer.CpuAddr() == 0) {
1062         return;
1063     }
1064     SynchronizeBufferImpl(buffer, cpu_addr, size);
1065}
1066
1067template <class P>
1068void BufferCache<P>::SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size) {
1069     boost::container::small_vector<BufferCopy, 4> copies;
1070     u64 total_size_bytes = 0;
1071     u64 largest_copy = 0;
1072     buffer.ForEachUploadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) {
1073         copies.push_back(BufferCopy{
1074             .src_offset = total_size_bytes,
1075             .dst_offset = range_offset,
1076             .size = range_size,
1077         });
1078         total_size_bytes += range_size;
1079         largest_copy = std::max(largest_copy, range_size);
1080     });
1081     if (total_size_bytes == 0) {
1082         return;
1083     }
1084     const std::span<BufferCopy> copies_span(copies.data(), copies.size());
1085     UploadMemory(buffer, total_size_bytes, largest_copy, copies_span);
1086}
1087
1088template <class P>
1089void BufferCache<P>::UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy,
1090                                   std::span<BufferCopy> copies) {
1091     if constexpr (USE_MEMORY_MAPS) {
1092         MappedUploadMemory(buffer, total_size_bytes, copies);
1093     } else {
1094         ImmediateUploadMemory(buffer, largest_copy, copies);
1095     }
1096}
1097
1098template <class P>
1099void BufferCache<P>::ImmediateUploadMemory(Buffer& buffer, u64 largest_copy,
1100                                            std::span<const BufferCopy> copies) {
1101     std::span<u8> immediate_buffer;
1102     for (const BufferCopy& copy : copies) {
1103         std::span<const u8> upload_span;
1104         const VAddr cpu_addr = buffer.CpuAddr() + copy.dst_offset;
1105         if (IsRangeGranular(cpu_addr, copy.size)) {
1106             upload_span = std::span(cpu_memory.GetPointer(cpu_addr), copy.size);
1107         } else {
1108             if (immediate_buffer.empty()) {
1109                 immediate_buffer = ImmediateBuffer(largest_copy);
1110             }
1111             cpu_memory.ReadBlockUnsafe(cpu_addr, immediate_buffer.data(), copy.size);
1112             upload_span = immediate_buffer.subspan(0, copy.size);
1113         }
1114         buffer.ImmediateUpload(copy.dst_offset, upload_span);
1115     }
1116}
1117
1118template <class P>
1119void BufferCache<P>::MappedUploadMemory(Buffer& buffer, u64 total_size_bytes,
1120                                         std::span<const BufferCopy> copies) {
1121     auto upload_staging = runtime.UploadStagingBuffer(total_size_bytes);
1122     const std::span<u8> staging_pointer = upload_staging.mapped_span;
1123     for (const BufferCopy& copy : copies) {
1124         const VAddr cpu_addr = buffer.CpuAddr() + copy.dst_offset;
1125         u8* const src_pointer = staging_pointer.data() + copy.src_offset;
1126         cpu_memory.ReadBlockUnsafe(cpu_addr, src_pointer, copy.size);
1127     }
1128     runtime.CopyBuffer(buffer, upload_staging.buffer, copies);
1129}
1130
1131template <class P>
1132void BufferCache<P>::DeleteBuffer(BufferId buffer_id) {
1133     const auto scalar_replace = [buffer_id](Binding& binding) {
1134         if (binding.buffer_id == buffer_id) {
1135             binding.buffer_id = BufferId{};
1136         }
1137     };
1138     const auto replace = [scalar_replace](std::span<Binding> bindings) {
1139         std::ranges::for_each(bindings, scalar_replace);
1140     };
1141     scalar_replace(index_buffer);
1142     replace(vertex_buffers);
1143     std::ranges::for_each(uniform_buffers, replace);
1144     std::ranges::for_each(storage_buffers, replace);
1145     replace(transform_feedback_buffers);
1146     replace(compute_uniform_buffers);
1147     replace(compute_storage_buffers);
1148     std::erase(cached_write_buffer_ids, buffer_id);
1149
1150     // Mark the whole buffer as CPU written to stop tracking CPU writes
1151     Buffer& buffer = slot_buffers[buffer_id];
1152     buffer.MarkRegionAsCpuModified(buffer.CpuAddr(), buffer.SizeBytes());
1153
1154     Unregister(buffer_id);
1155     delayed_destruction_ring.Push(std::move(slot_buffers[buffer_id]));
1156
1157     NotifyBufferDeletion();
1158}
1159
1160template <class P>
1161void BufferCache<P>::ReplaceBufferDownloads(BufferId old_buffer_id, BufferId new_buffer_id) {
1162     const auto replace = [old_buffer_id, new_buffer_id](std::vector<BufferId>& buffers) {
1163         std::ranges::replace(buffers, old_buffer_id, new_buffer_id);
1164         if (auto it = std::ranges::find(buffers, new_buffer_id); it != buffers.end()) {
1165             buffers.erase(std::remove(it + 1, buffers.end(), new_buffer_id), buffers.end());
1166         }
1167     };
1168     replace(uncommitted_downloads);
1169     std::ranges::for_each(committed_downloads, replace);
1170}
1171
1172template <class P>
1173void BufferCache<P>::NotifyBufferDeletion() {
1174     if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) {
1175         dirty_uniform_buffers.fill(~u32{0});
1176     }
1177     auto& flags = maxwell3d.dirty.flags;
1178     flags[Dirty::IndexBuffer] = true;
1179     flags[Dirty::VertexBuffers] = true;
1180     for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) {
1181         flags[Dirty::VertexBuffer0 + index] = true;
1182     }
1183     has_deleted_buffers = true;
1184}
1185
1186template <class P>
1187typename BufferCache<P>::Binding BufferCache<P>::StorageBufferBinding(GPUVAddr ssbo_addr) const {
1188     const GPUVAddr gpu_addr = gpu_memory.Read<u64>(ssbo_addr);
1189     const u32 size = gpu_memory.Read<u32>(ssbo_addr + 8);
1190     const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr);
1191     if (!cpu_addr || size == 0) {
1192         return NULL_BINDING;
1193     }
1194     const Binding binding{
1195         .cpu_addr = *cpu_addr,
1196         .size = size,
1197         .buffer_id = BufferId{},
1198     };
1199     return binding;
1200}
1201
1202template <class P>
1203std::span<const u8> BufferCache<P>::ImmediateBufferWithData(VAddr cpu_addr, size_t size) {
1204     u8* const base_pointer = cpu_memory.GetPointer(cpu_addr);
1205     if (IsRangeGranular(cpu_addr, size) ||
1206         base_pointer + size == cpu_memory.GetPointer(cpu_addr + size)) {
1207         return std::span(base_pointer, size);
1208     } else {
1209         const std::span<u8> span = ImmediateBuffer(size);
1210         cpu_memory.ReadBlockUnsafe(cpu_addr, span.data(), size);
1211         return span;
1212     }
1213}
1214
1215template <class P>
1216std::span<u8> BufferCache<P>::ImmediateBuffer(size_t wanted_capacity) {
1217     if (wanted_capacity > immediate_buffer_capacity) {
1218         immediate_buffer_capacity = wanted_capacity;
1219         immediate_buffer_alloc = std::make_unique<u8[]>(wanted_capacity);
1220     }
1221     return std::span<u8>(immediate_buffer_alloc.get(), wanted_capacity);
1222}
1223
1224template <class P>
1225bool BufferCache<P>::HasFastUniformBufferBound(size_t stage, u32 binding_index) const noexcept {
1226     if constexpr (IS_OPENGL) {
1227         return ((fast_bound_uniform_buffers[stage] >> binding_index) & 1) != 0;
1228     } else {
1229         // Only OpenGL has fast uniform buffers
1230         return false;
1231     }
1232}
1233
1234} // namespace VideoCommon
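
The new FindBuffer resolves a CPU address to at most one buffer through a flat page table, and CreateBuffer grows the requested range over every overlapping buffer before copying their GPU-modified ranges into the merged allocation. The standalone sketch below illustrates only that lookup-and-merge idea; it is not part of the diff, and its names and types (ToyBuffer, a std::unordered_map page table, an assumed 64 KiB page size) are simplifications of the cache's real slot vector and page table. Unlike the real CreateBuffer, it does not rescan after the range is extended to the left.

// Standalone sketch only: toy page-table lookup plus "absorb overlapping
// buffers" on creation. Not the cache's real data structures.
#include <algorithm>
#include <cstdint>
#include <unordered_map>
#include <vector>

namespace {

constexpr std::uint64_t PAGE_BITS = 16; // assumed 64 KiB pages for the example
constexpr std::uint64_t PAGE_SIZE = std::uint64_t{1} << PAGE_BITS;

struct ToyBuffer {
    std::uint64_t cpu_addr = 0;
    std::uint64_t size = 0;
};

using BufferId = std::uint32_t; // id 0 plays the role of "no buffer"

std::vector<ToyBuffer> buffers{ToyBuffer{}};             // slot 0 reserved as the null buffer
std::unordered_map<std::uint64_t, BufferId> page_table;  // page -> buffer id

std::uint64_t PageUp(std::uint64_t addr) {
    return (addr + PAGE_SIZE - 1) >> PAGE_BITS;
}

BufferId CreateToyBuffer(std::uint64_t cpu_addr, std::uint64_t wanted_size) {
    std::uint64_t begin = cpu_addr;
    std::uint64_t end = cpu_addr + wanted_size;
    // Grow [begin, end) over any registered buffer that overlaps the request.
    for (std::uint64_t page = begin >> PAGE_BITS; page < PageUp(end); ++page) {
        const auto it = page_table.find(page);
        if (it == page_table.end()) {
            continue;
        }
        const ToyBuffer& overlap = buffers[it->second];
        begin = std::min(begin, overlap.cpu_addr);
        end = std::max(end, overlap.cpu_addr + overlap.size);
    }
    const BufferId new_id = static_cast<BufferId>(buffers.size());
    buffers.push_back(ToyBuffer{begin, end - begin});
    // Point every page of the merged range at the new buffer; absorbed buffers
    // simply stop being referenced here (the real cache copies and deletes them).
    for (std::uint64_t page = begin >> PAGE_BITS; page < PageUp(end); ++page) {
        page_table[page] = new_id;
    }
    return new_id;
}

BufferId FindToyBuffer(std::uint64_t cpu_addr, std::uint64_t size) {
    const auto it = page_table.find(cpu_addr >> PAGE_BITS);
    if (it != page_table.end()) {
        const ToyBuffer& buffer = buffers[it->second];
        if (cpu_addr >= buffer.cpu_addr &&
            cpu_addr + size <= buffer.cpu_addr + buffer.size) {
            return it->second; // request fully contained: reuse the buffer
        }
    }
    return CreateToyBuffer(cpu_addr, size); // otherwise build a (possibly merged) buffer
}

} // namespace

int main() {
    const BufferId small = FindToyBuffer(0x10000, 0x100);
    const BufferId merged = FindToyBuffer(0x10000, 0x20000); // overlaps, forces a merge
    return (small != 0 && merged != 0 && merged != small) ? 0 : 1;
}
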
diff --git a/src/video_core/buffer_cache/map_interval.cpp b/src/video_core/buffer_cache/map_interval.cpp
deleted file mode 100644
index 62587e18a..000000000
--- a/src/video_core/buffer_cache/map_interval.cpp
+++ /dev/null
@@ -1,33 +0,0 @@
1// Copyright 2020 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#include <algorithm>
6#include <array>
7#include <cstddef>
8#include <memory>
9
10#include "video_core/buffer_cache/map_interval.h"
11
12namespace VideoCommon {
13
14MapIntervalAllocator::MapIntervalAllocator() {
15 FillFreeList(first_chunk);
16}
17
18MapIntervalAllocator::~MapIntervalAllocator() = default;
19
20void MapIntervalAllocator::AllocateNewChunk() {
21 *new_chunk = std::make_unique<Chunk>();
22 FillFreeList(**new_chunk);
23 new_chunk = &(*new_chunk)->next;
24}
25
26void MapIntervalAllocator::FillFreeList(Chunk& chunk) {
27 const std::size_t old_size = free_list.size();
28 free_list.resize(old_size + chunk.data.size());
29 std::transform(chunk.data.rbegin(), chunk.data.rend(), free_list.begin() + old_size,
30 [](MapInterval& interval) { return &interval; });
31}
32
33} // namespace VideoCommon
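
The deleted map_interval.cpp implements a chunked pool: MapInterval nodes live in 0x8000-element chunks linked through unique_ptr, and Allocate/Release only pop and push raw pointers on a free list, so memory is never returned until the allocator is destroyed. A generic sketch of the same pattern follows; the PoolAllocator name and the configurable chunk size are illustrative, and only the last-in, first-out reuse behaviour is taken from the deleted code.

// Sketch of a chunked pool with a pointer free list, in the spirit of the
// deleted MapIntervalAllocator; the element type and chunk size are illustrative.
#include <array>
#include <cstddef>
#include <memory>
#include <vector>

template <typename T, std::size_t N = 0x8000>
class PoolAllocator {
public:
    PoolAllocator() {
        FillFreeList(first_chunk);
    }

    T* Allocate() {
        if (free_list.empty()) {
            AllocateNewChunk();
        }
        T* const object = free_list.back();
        free_list.pop_back();
        return object;
    }

    void Release(T* object) {
        free_list.push_back(object); // memory stays owned by its chunk
    }

private:
    struct Chunk {
        std::unique_ptr<Chunk> next;
        std::array<T, N> data;
    };

    void AllocateNewChunk() {
        *new_chunk = std::make_unique<Chunk>();
        FillFreeList(**new_chunk);
        new_chunk = &(*new_chunk)->next;
    }

    void FillFreeList(Chunk& chunk) {
        for (T& object : chunk.data) {
            free_list.push_back(&object);
        }
    }

    std::vector<T*> free_list;
    Chunk first_chunk;
    std::unique_ptr<Chunk>* new_chunk = &first_chunk.next;
};

int main() {
    PoolAllocator<int, 4> pool; // tiny chunk so a second chunk gets allocated
    int* values[6]{};
    for (int*& value : values) {
        value = pool.Allocate();
    }
    pool.Release(values[0]);
    return pool.Allocate() == values[0] ? 0 : 1; // released node is reused LIFO
}
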
diff --git a/src/video_core/buffer_cache/map_interval.h b/src/video_core/buffer_cache/map_interval.h
deleted file mode 100644
index ef974b08a..000000000
--- a/src/video_core/buffer_cache/map_interval.h
+++ /dev/null
@@ -1,93 +0,0 @@
1// Copyright 2019 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#pragma once
6
7#include <array>
8#include <cstddef>
9#include <memory>
10#include <vector>
11
12#include <boost/intrusive/set_hook.hpp>
13
14#include "common/common_types.h"
15#include "video_core/gpu.h"
16
17namespace VideoCommon {
18
19struct MapInterval : public boost::intrusive::set_base_hook<boost::intrusive::optimize_size<true>> {
20 MapInterval() = default;
21
22 /*implicit*/ MapInterval(VAddr start_) noexcept : start{start_} {}
23
24 explicit MapInterval(VAddr start_, VAddr end_, GPUVAddr gpu_addr_) noexcept
25 : start{start_}, end{end_}, gpu_addr{gpu_addr_} {}
26
27 bool IsInside(VAddr other_start, VAddr other_end) const noexcept {
28 return start <= other_start && other_end <= end;
29 }
30
31 bool Overlaps(VAddr other_start, VAddr other_end) const noexcept {
32 return start < other_end && other_start < end;
33 }
34
35 void MarkAsModified(bool is_modified_, u64 ticks_) noexcept {
36 is_modified = is_modified_;
37 ticks = ticks_;
38 }
39
40 boost::intrusive::set_member_hook<> member_hook_;
41 VAddr start = 0;
42 VAddr end = 0;
43 GPUVAddr gpu_addr = 0;
44 u64 ticks = 0;
45 bool is_written = false;
46 bool is_modified = false;
47 bool is_registered = false;
48 bool is_memory_marked = false;
49 bool is_sync_pending = false;
50};
51
52struct MapIntervalCompare {
53 constexpr bool operator()(const MapInterval& lhs, const MapInterval& rhs) const noexcept {
54 return lhs.start < rhs.start;
55 }
56};
57
58class MapIntervalAllocator {
59public:
60 MapIntervalAllocator();
61 ~MapIntervalAllocator();
62
63 MapInterval* Allocate() {
64 if (free_list.empty()) {
65 AllocateNewChunk();
66 }
67 MapInterval* const interval = free_list.back();
68 free_list.pop_back();
69 return interval;
70 }
71
72 void Release(MapInterval* interval) {
73 free_list.push_back(interval);
74 }
75
76private:
77 struct Chunk {
78 std::unique_ptr<Chunk> next;
79 std::array<MapInterval, 0x8000> data;
80 };
81
82 void AllocateNewChunk();
83
84 void FillFreeList(Chunk& chunk);
85
86 std::vector<MapInterval*> free_list;
87
88 Chunk first_chunk;
89
90 std::unique_ptr<Chunk>* new_chunk = &first_chunk.next;
91};
92
93} // namespace VideoCommon
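
MapInterval treats [start, end) as a half-open byte range, so Overlaps and IsInside are the standard interval predicates; the new code's Buffer::IsInBounds check in FindBuffer appears to play the same containment role at buffer granularity. A few standalone asserts spell the predicates out; the Range type here is illustrative only.

// Half-open interval predicates as used by the deleted MapInterval;
// the Range type is illustrative, not part of the diff.
#include <cassert>
#include <cstdint>

struct Range {
    std::uint64_t start = 0;
    std::uint64_t end = 0; // one past the last byte, i.e. [start, end)

    bool Overlaps(std::uint64_t other_start, std::uint64_t other_end) const {
        return start < other_end && other_start < end;
    }

    bool IsInside(std::uint64_t other_start, std::uint64_t other_end) const {
        return start <= other_start && other_end <= end;
    }
};

int main() {
    const Range range{0x1000, 0x2000};
    assert(range.Overlaps(0x1fff, 0x3000));  // one shared byte is enough to overlap
    assert(!range.Overlaps(0x2000, 0x3000)); // touching ranges do not overlap
    assert(range.IsInside(0x1000, 0x2000));  // a range is inside itself
    assert(!range.IsInside(0x0fff, 0x1800)); // extends below the start
    return 0;
}
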