path: root/src/video_core/buffer_cache
author     ReinUsesLisp    2021-01-16 20:48:58 -0300
committer  ReinUsesLisp    2021-02-13 02:17:22 -0300
commit     82c2601555b59a94d7160f2fd686cb63d32dd423 (patch)
tree       cd0ecd865945452fa589b572de614fc487f2f96a /src/video_core/buffer_cache
parent     vulkan_common: Expose interop and headless devices (diff)
video_core: Reimplement the buffer cache
Reimplement the buffer cache using cached bindings and page-level granularity for modification tracking. This also drops the usage of shared pointers and virtual functions from the cache.

- Bindings are cached, making it possible to skip work when the game changes only a few bits between draws.
- OpenGL assembly shaders no longer copy when a region has been modified from the GPU to emulate constant buffers; instead, GL_EXT_memory_object is used to alias sub-buffers within the same allocation.
- OpenGL assembly shaders stream constant buffer data using glProgramBufferParametersIuivNV from NV_parameter_buffer_object. In theory this should save one hash table lookup inside the driver compared to glBufferSubData.
- A new OpenGL stream buffer based on fences is implemented for drivers other than Nvidia's proprietary one, due to their low performance on partial glBufferSubData calls synchronized with 3D rendering (which some games use heavily).
- Most optimizations are now shared between APIs, allowing Vulkan to cache more bindings than before and skip unnecessary work.

This commit adds the necessary infrastructure to use Vulkan objects from OpenGL. Overall, it improves performance and fixes some bugs present in the old cache. There are still some edge cases hit by some games that hurt performance on some vendors; these are planned to be fixed in later commits.
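Two of these ideas surface throughout the new buffer_cache.h below: cached buffers are registered in a flat page table at 64 KiB granularity (PAGE_BITS = 16 over a 39-bit guest address space, with slot 0 reserved for the null buffer), and lookups go through that table instead of the interval sets the old cache kept. What follows is a minimal, stand-alone sketch of that lookup with simplified names rather than the actual yuzu classes (the real cache stores the slots in a std::array member):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    using VAddr = std::uint64_t;

    struct BufferId {
        std::uint32_t index = 0;                     // 0 is reserved for the null buffer
        explicit operator bool() const { return index != 0; }
    };

    constexpr std::uint64_t PAGE_BITS = 16;          // 64 KiB pages, unrelated to the CPU page size
    constexpr std::size_t NUM_PAGES = (std::uint64_t{1} << 39) >> PAGE_BITS;

    class PageTable {
    public:
        PageTable() : slots(NUM_PAGES) {}            // ~32 MiB of BufferId slots on the heap

        // Resolve the buffer that owns an address with a single shift and index.
        BufferId Find(VAddr cpu_addr) const {
            return slots[cpu_addr >> PAGE_BITS];
        }

        // Point every 64 KiB page covered by [begin, end) at the owning buffer.
        void Register(BufferId id, VAddr begin, VAddr end) {
            for (VAddr page = begin >> PAGE_BITS; page <= ((end - 1) >> PAGE_BITS); ++page) {
                slots[page] = id;
            }
        }

    private:
        std::vector<BufferId> slots;
    };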
Diffstat (limited to 'src/video_core/buffer_cache')
-rw-r--r--  src/video_core/buffer_cache/buffer_block.h      62
-rw-r--r--  src/video_core/buffer_cache/buffer_cache.cpp     13
-rw-r--r--  src/video_core/buffer_cache/buffer_cache.h     1598
-rw-r--r--  src/video_core/buffer_cache/map_interval.cpp     33
-rw-r--r--  src/video_core/buffer_cache/map_interval.h       93
5 files changed, 1132 insertions, 667 deletions
diff --git a/src/video_core/buffer_cache/buffer_block.h b/src/video_core/buffer_cache/buffer_block.h
deleted file mode 100644
index e9306194a..000000000
--- a/src/video_core/buffer_cache/buffer_block.h
+++ /dev/null
@@ -1,62 +0,0 @@
-// Copyright 2019 yuzu Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#pragma once
-
-#include "common/common_types.h"
-
-namespace VideoCommon {
-
-class BufferBlock {
-public:
-    [[nodiscard]] bool Overlaps(VAddr start, VAddr end) const {
-        return (cpu_addr < end) && (cpu_addr_end > start);
-    }
-
-    [[nodiscard]] bool IsInside(VAddr other_start, VAddr other_end) const {
-        return cpu_addr <= other_start && other_end <= cpu_addr_end;
-    }
-
-    [[nodiscard]] std::size_t Offset(VAddr in_addr) const {
-        return static_cast<std::size_t>(in_addr - cpu_addr);
-    }
-
-    [[nodiscard]] VAddr CpuAddr() const {
-        return cpu_addr;
-    }
-
-    [[nodiscard]] VAddr CpuAddrEnd() const {
-        return cpu_addr_end;
-    }
-
-    void SetCpuAddr(VAddr new_addr) {
-        cpu_addr = new_addr;
-        cpu_addr_end = new_addr + size;
-    }
-
-    [[nodiscard]] std::size_t Size() const {
-        return size;
-    }
-
-    [[nodiscard]] u64 Epoch() const {
-        return epoch;
-    }
-
-    void SetEpoch(u64 new_epoch) {
-        epoch = new_epoch;
-    }
-
-protected:
-    explicit BufferBlock(VAddr cpu_addr_, std::size_t size_) : size{size_} {
-        SetCpuAddr(cpu_addr_);
-    }
-
-private:
-    VAddr cpu_addr{};
-    VAddr cpu_addr_end{};
-    std::size_t size{};
-    u64 epoch{};
-};
-
-} // namespace VideoCommon
diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp
new file mode 100644
index 000000000..ab32294c8
--- /dev/null
+++ b/src/video_core/buffer_cache/buffer_cache.cpp
@@ -0,0 +1,13 @@
+// Copyright 2021 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/microprofile.h"
+
+namespace VideoCommon {
+
+MICROPROFILE_DEFINE(GPU_PrepareBuffers, "GPU", "Prepare buffers", MP_RGB(224, 128, 128));
+MICROPROFILE_DEFINE(GPU_BindUploadBuffers, "GPU", "Bind and upload buffers", MP_RGB(224, 128, 128));
+MICROPROFILE_DEFINE(GPU_DownloadMemory, "GPU", "Download buffers", MP_RGB(224, 128, 128));
+
+} // namespace VideoCommon
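The new buffer_cache.cpp above exists only to give the MicroProfile timers a single definition; the header that follows declares the same names with MICROPROFILE_DECLARE and wraps its hot paths in MICROPROFILE_SCOPE. A rough, hypothetical illustration of that split (the function and file names here are made up; only the macros and the counter name come from the commit):

    // example_header.h -- declaration, so any includer can open a scope on the timer
    #include "common/microprofile.h"

    MICROPROFILE_DECLARE(GPU_PrepareBuffers);

    inline void UpdateGraphicsBuffersExample() {
        MICROPROFILE_SCOPE(GPU_PrepareBuffers);      // timed until the end of this scope
        // ... binding update work would go here ...
    }

    // example_source.cpp -- the one and only definition, with group, label and color
    #include "common/microprofile.h"

    MICROPROFILE_DEFINE(GPU_PrepareBuffers, "GPU", "Prepare buffers", MP_RGB(224, 128, 128));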
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index 83b9ee871..e4f3c8e35 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -4,591 +4,1231 @@
4 4
5#pragma once 5#pragma once
6 6
7#include <list> 7#include <algorithm>
8#include <array>
9#include <deque>
8#include <memory> 10#include <memory>
9#include <mutex> 11#include <mutex>
12#include <span>
10#include <unordered_map> 13#include <unordered_map>
11#include <unordered_set>
12#include <utility>
13#include <vector> 14#include <vector>
14 15
15#include <boost/container/small_vector.hpp> 16#include <boost/container/small_vector.hpp>
16#include <boost/icl/interval_set.hpp>
17#include <boost/intrusive/set.hpp>
18 17
19#include "common/alignment.h"
20#include "common/assert.h"
21#include "common/common_types.h" 18#include "common/common_types.h"
22#include "common/logging/log.h" 19#include "common/div_ceil.h"
23#include "core/core.h" 20#include "common/microprofile.h"
21#include "common/scope_exit.h"
24#include "core/memory.h" 22#include "core/memory.h"
25#include "core/settings.h" 23#include "core/settings.h"
26#include "video_core/buffer_cache/buffer_block.h" 24#include "video_core/buffer_cache/buffer_base.h"
27#include "video_core/buffer_cache/map_interval.h" 25#include "video_core/delayed_destruction_ring.h"
26#include "video_core/dirty_flags.h"
27#include "video_core/engines/kepler_compute.h"
28#include "video_core/engines/maxwell_3d.h"
28#include "video_core/memory_manager.h" 29#include "video_core/memory_manager.h"
29#include "video_core/rasterizer_interface.h" 30#include "video_core/rasterizer_interface.h"
31#include "video_core/texture_cache/slot_vector.h"
32#include "video_core/texture_cache/types.h"
30 33
31namespace VideoCommon { 34namespace VideoCommon {
32 35
33template <typename Buffer, typename BufferType, typename StreamBuffer> 36MICROPROFILE_DECLARE(GPU_PrepareBuffers);
37MICROPROFILE_DECLARE(GPU_BindUploadBuffers);
38MICROPROFILE_DECLARE(GPU_DownloadMemory);
39
40using BufferId = SlotId;
41
42constexpr u32 NUM_VERTEX_BUFFERS = 32;
43constexpr u32 NUM_TRANSFORM_FEEDBACK_BUFFERS = 4;
44constexpr u32 NUM_GRAPHICS_UNIFORM_BUFFERS = 18;
45constexpr u32 NUM_COMPUTE_UNIFORM_BUFFERS = 8;
46constexpr u32 NUM_STORAGE_BUFFERS = 16;
47constexpr u32 NUM_STAGES = 5;
48
49template <typename P>
34class BufferCache { 50class BufferCache {
35 using IntervalSet = boost::icl::interval_set<VAddr>; 51 // Page size for caching purposes.
36 using IntervalType = typename IntervalSet::interval_type; 52 // This is unrelated to the CPU page size and it can be changed as it seems optimal.
37 using VectorMapInterval = boost::container::small_vector<MapInterval*, 1>; 53 static constexpr u32 PAGE_BITS = 16;
54 static constexpr u64 PAGE_SIZE = u64{1} << PAGE_BITS;
38 55
39 static constexpr u64 WRITE_PAGE_BIT = 11; 56 static constexpr bool IS_OPENGL = P::IS_OPENGL;
40 static constexpr u64 BLOCK_PAGE_BITS = 21; 57 static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS =
41 static constexpr u64 BLOCK_PAGE_SIZE = 1ULL << BLOCK_PAGE_BITS; 58 P::HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS;
59 static constexpr bool HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT =
60 P::HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT;
61 static constexpr bool NEEDS_BIND_UNIFORM_INDEX = P::NEEDS_BIND_UNIFORM_INDEX;
62 static constexpr bool NEEDS_BIND_STORAGE_INDEX = P::NEEDS_BIND_STORAGE_INDEX;
63 static constexpr bool USE_MEMORY_MAPS = P::USE_MEMORY_MAPS;
42 64
43public: 65 static constexpr BufferId NULL_BUFFER_ID{0};
44 struct BufferInfo { 66
45 BufferType handle; 67 using Maxwell = Tegra::Engines::Maxwell3D::Regs;
46 u64 offset; 68
47 u64 address; 69 using Runtime = typename P::Runtime;
70 using Buffer = typename P::Buffer;
71
72 struct Empty {};
73
74 struct Binding {
75 VAddr cpu_addr{};
76 u32 size{};
77 BufferId buffer_id;
48 }; 78 };
49 79
50 BufferInfo UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4, 80 static constexpr Binding NULL_BINDING{
51 bool is_written = false, bool use_fast_cbuf = false) { 81 .cpu_addr = 0,
52 std::lock_guard lock{mutex}; 82 .size = 0,
83 .buffer_id = NULL_BUFFER_ID,
84 };
53 85
54 const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); 86public:
55 if (!cpu_addr) { 87 static constexpr size_t SKIP_CACHE_SIZE = 4096;
56 return GetEmptyBuffer(size);
57 }
58 88
59 // Cache management is a big overhead, so only cache entries with a given size. 89 explicit BufferCache(VideoCore::RasterizerInterface& rasterizer_,
60 // TODO: Figure out which size is the best for given games. 90 Tegra::Engines::Maxwell3D& maxwell3d_,
61 constexpr std::size_t max_stream_size = 0x800; 91 Tegra::Engines::KeplerCompute& kepler_compute_,
62 if (use_fast_cbuf || size < max_stream_size) { 92 Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_,
63 if (!is_written && !IsRegionWritten(*cpu_addr, *cpu_addr + size - 1)) { 93 Runtime& runtime_);
64 const bool is_granular = gpu_memory.IsGranularRange(gpu_addr, size);
65 if (use_fast_cbuf) {
66 u8* dest;
67 if (is_granular) {
68 dest = gpu_memory.GetPointer(gpu_addr);
69 } else {
70 staging_buffer.resize(size);
71 dest = staging_buffer.data();
72 gpu_memory.ReadBlockUnsafe(gpu_addr, dest, size);
73 }
74 return ConstBufferUpload(dest, size);
75 }
76 if (is_granular) {
77 u8* const host_ptr = gpu_memory.GetPointer(gpu_addr);
78 return StreamBufferUpload(size, alignment, [host_ptr, size](u8* dest) {
79 std::memcpy(dest, host_ptr, size);
80 });
81 } else {
82 return StreamBufferUpload(size, alignment, [this, gpu_addr, size](u8* dest) {
83 gpu_memory.ReadBlockUnsafe(gpu_addr, dest, size);
84 });
85 }
86 }
87 }
88 94
89 Buffer* const block = GetBlock(*cpu_addr, size); 95 void TickFrame();
90 MapInterval* const map = MapAddress(block, gpu_addr, *cpu_addr, size);
91 if (!map) {
92 return GetEmptyBuffer(size);
93 }
94 if (is_written) {
95 map->MarkAsModified(true, GetModifiedTicks());
96 if (Settings::IsGPULevelHigh() &&
97 Settings::values.use_asynchronous_gpu_emulation.GetValue()) {
98 MarkForAsyncFlush(map);
99 }
100 if (!map->is_written) {
101 map->is_written = true;
102 MarkRegionAsWritten(map->start, map->end - 1);
103 }
104 }
105 96
106 return BufferInfo{block->Handle(), block->Offset(*cpu_addr), block->Address()}; 97 void WriteMemory(VAddr cpu_addr, u64 size);
107 }
108 98
109 /// Uploads from a host memory. Returns the OpenGL buffer where it's located and its offset. 99 void CachedWriteMemory(VAddr cpu_addr, u64 size);
110 BufferInfo UploadHostMemory(const void* raw_pointer, std::size_t size,
111 std::size_t alignment = 4) {
112 std::lock_guard lock{mutex};
113 return StreamBufferUpload(size, alignment, [raw_pointer, size](u8* dest) {
114 std::memcpy(dest, raw_pointer, size);
115 });
116 }
117 100
118 /// Prepares the buffer cache for data uploading 101 void DownloadMemory(VAddr cpu_addr, u64 size);
119 /// @param max_size Maximum number of bytes that will be uploaded
120 /// @return True when a stream buffer invalidation was required, false otherwise
121 void Map(std::size_t max_size) {
122 std::lock_guard lock{mutex};
123 102
124 std::tie(buffer_ptr, buffer_offset_base) = stream_buffer.Map(max_size, 4); 103 void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size);
125 buffer_offset = buffer_offset_base;
126 }
127 104
128 /// Finishes the upload stream 105 void UpdateGraphicsBuffers(bool is_indexed);
129 void Unmap() {
130 std::lock_guard lock{mutex};
131 stream_buffer.Unmap(buffer_offset - buffer_offset_base);
132 }
133 106
134 /// Function called at the end of each frame, intended for deferred operations 107 void UpdateComputeBuffers();
135 void TickFrame() {
136 ++epoch;
137 108
138 while (!pending_destruction.empty()) { 109 void BindHostGeometryBuffers(bool is_indexed);
139 // Delay at least 4 frames before destruction.
140 // This is due to triple buffering happening on some drivers.
141 static constexpr u64 epochs_to_destroy = 5;
142 if (pending_destruction.front()->Epoch() + epochs_to_destroy > epoch) {
143 break;
144 }
145 pending_destruction.pop();
146 }
147 }
148 110
149 /// Write any cached resources overlapping the specified region back to memory 111 void BindHostStageBuffers(size_t stage);
150 void FlushRegion(VAddr addr, std::size_t size) {
151 std::lock_guard lock{mutex};
152 112
153 VectorMapInterval objects = GetMapsInRange(addr, size); 113 void BindHostComputeBuffers();
154 std::sort(objects.begin(), objects.end(),
155 [](MapInterval* lhs, MapInterval* rhs) { return lhs->ticks < rhs->ticks; });
156 for (MapInterval* object : objects) {
157 if (object->is_modified && object->is_registered) {
158 mutex.unlock();
159 FlushMap(object);
160 mutex.lock();
161 }
162 }
163 }
164 114
165 bool MustFlushRegion(VAddr addr, std::size_t size) { 115 void SetEnabledUniformBuffers(size_t stage, u32 enabled);
166 std::lock_guard lock{mutex};
167 116
168 const VectorMapInterval objects = GetMapsInRange(addr, size); 117 void SetEnabledComputeUniformBuffers(u32 enabled);
169 return std::any_of(objects.cbegin(), objects.cend(), [](const MapInterval* map) {
170 return map->is_modified && map->is_registered;
171 });
172 }
173 118
174 /// Mark the specified region as being invalidated 119 void UnbindGraphicsStorageBuffers(size_t stage);
175 void InvalidateRegion(VAddr addr, u64 size) {
176 std::lock_guard lock{mutex};
177 120
178 for (auto& object : GetMapsInRange(addr, size)) { 121 void BindGraphicsStorageBuffer(size_t stage, size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset,
179 if (object->is_registered) { 122 bool is_written);
180 Unregister(object);
181 }
182 }
183 }
184 123
185 void OnCPUWrite(VAddr addr, std::size_t size) { 124 void UnbindComputeStorageBuffers();
186 std::lock_guard lock{mutex};
187 125
188 for (MapInterval* object : GetMapsInRange(addr, size)) { 126 void BindComputeStorageBuffer(size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset,
189 if (object->is_memory_marked && object->is_registered) { 127 bool is_written);
190 UnmarkMemory(object);
191 object->is_sync_pending = true;
192 marked_for_unregister.emplace_back(object);
193 }
194 }
195 }
196 128
197 void SyncGuestHost() { 129 void FlushCachedWrites();
198 std::lock_guard lock{mutex};
199 130
200 for (auto& object : marked_for_unregister) { 131 /// Return true when there are uncommitted buffers to be downloaded
201 if (object->is_registered) { 132 [[nodiscard]] bool HasUncommittedFlushes() const noexcept;
202 object->is_sync_pending = false; 133
203 Unregister(object); 134 /// Return true when the caller should wait for async downloads
204 } 135 [[nodiscard]] bool ShouldWaitAsyncFlushes() const noexcept;
136
137 /// Commit asynchronous downloads
138 void CommitAsyncFlushes();
139
140 /// Pop asynchronous downloads
141 void PopAsyncFlushes();
142
143 /// Return true when a CPU region is modified from the GPU
144 [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size);
145
146 std::mutex mutex;
147
148private:
149 template <typename Func>
150 static void ForEachEnabledBit(u32 enabled_mask, Func&& func) {
151 for (u32 index = 0; enabled_mask != 0; ++index, enabled_mask >>= 1) {
152 const int disabled_bits = std::countr_zero(enabled_mask);
153 index += disabled_bits;
154 enabled_mask >>= disabled_bits;
155 func(index);
205 } 156 }
206 marked_for_unregister.clear();
207 } 157 }
208 158
209 void CommitAsyncFlushes() { 159 template <typename Func>
210 if (uncommitted_flushes) { 160 void ForEachBufferInRange(VAddr cpu_addr, u64 size, Func&& func) {
211 auto commit_list = std::make_shared<std::list<MapInterval*>>(); 161 const u64 page_end = Common::DivCeil(cpu_addr + size, PAGE_SIZE);
212 for (MapInterval* map : *uncommitted_flushes) { 162 for (u64 page = cpu_addr >> PAGE_BITS; page < page_end;) {
213 if (map->is_registered && map->is_modified) { 163 const BufferId buffer_id = page_table[page];
214 // TODO(Blinkhawk): Implement backend asynchronous flushing 164 if (!buffer_id) {
215 // AsyncFlushMap(map) 165 ++page;
216 commit_list->push_back(map); 166 continue;
217 }
218 }
219 if (!commit_list->empty()) {
220 committed_flushes.push_back(commit_list);
221 } else {
222 committed_flushes.emplace_back();
223 } 167 }
224 } else { 168 Buffer& buffer = slot_buffers[buffer_id];
225 committed_flushes.emplace_back(); 169 func(buffer_id, buffer);
170
171 const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes();
172 page = Common::DivCeil(end_addr, PAGE_SIZE);
226 } 173 }
227 uncommitted_flushes.reset();
228 } 174 }
229 175
230 bool ShouldWaitAsyncFlushes() const { 176 static bool IsRangeGranular(VAddr cpu_addr, size_t size) {
231 return !committed_flushes.empty() && committed_flushes.front() != nullptr; 177 return (cpu_addr & ~Core::Memory::PAGE_MASK) ==
178 ((cpu_addr + size) & ~Core::Memory::PAGE_MASK);
232 } 179 }
233 180
234 bool HasUncommittedFlushes() const { 181 void BindHostIndexBuffer();
235 return uncommitted_flushes != nullptr;
236 }
237 182
238 void PopAsyncFlushes() { 183 void BindHostVertexBuffers();
239 if (committed_flushes.empty()) {
240 return;
241 }
242 auto& flush_list = committed_flushes.front();
243 if (!flush_list) {
244 committed_flushes.pop_front();
245 return;
246 }
247 for (MapInterval* map : *flush_list) {
248 if (map->is_registered) {
249 // TODO(Blinkhawk): Replace this for reading the asynchronous flush
250 FlushMap(map);
251 }
252 }
253 committed_flushes.pop_front();
254 }
255 184
256 virtual BufferInfo GetEmptyBuffer(std::size_t size) = 0; 185 void BindHostGraphicsUniformBuffers(size_t stage);
257 186
258protected: 187 void BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 binding_index, bool needs_bind);
259 explicit BufferCache(VideoCore::RasterizerInterface& rasterizer_,
260 Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_,
261 StreamBuffer& stream_buffer_)
262 : rasterizer{rasterizer_}, gpu_memory{gpu_memory_}, cpu_memory{cpu_memory_},
263 stream_buffer{stream_buffer_} {}
264 188
265 ~BufferCache() = default; 189 void BindHostGraphicsStorageBuffers(size_t stage);
266 190
267 virtual std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) = 0; 191 void BindHostTransformFeedbackBuffers();
268 192
269 virtual BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) { 193 void BindHostComputeUniformBuffers();
270 return {};
271 }
272 194
273 /// Register an object into the cache 195 void BindHostComputeStorageBuffers();
274 MapInterval* Register(MapInterval new_map, bool inherit_written = false) {
275 const VAddr cpu_addr = new_map.start;
276 if (!cpu_addr) {
277 LOG_CRITICAL(HW_GPU, "Failed to register buffer with unmapped gpu_address 0x{:016x}",
278 new_map.gpu_addr);
279 return nullptr;
280 }
281 const std::size_t size = new_map.end - new_map.start;
282 new_map.is_registered = true;
283 rasterizer.UpdatePagesCachedCount(cpu_addr, size, 1);
284 new_map.is_memory_marked = true;
285 if (inherit_written) {
286 MarkRegionAsWritten(new_map.start, new_map.end - 1);
287 new_map.is_written = true;
288 }
289 MapInterval* const storage = mapped_addresses_allocator.Allocate();
290 *storage = new_map;
291 mapped_addresses.insert(*storage);
292 return storage;
293 }
294 196
295 void UnmarkMemory(MapInterval* map) { 197 void DoUpdateGraphicsBuffers(bool is_indexed);
296 if (!map->is_memory_marked) { 198
297 return; 199 void DoUpdateComputeBuffers();
298 } 200
299 const std::size_t size = map->end - map->start; 201 void UpdateIndexBuffer();
300 rasterizer.UpdatePagesCachedCount(map->start, size, -1); 202
301 map->is_memory_marked = false; 203 void UpdateVertexBuffers();
302 } 204
303 205 void UpdateVertexBuffer(u32 index);
304 /// Unregisters an object from the cache 206
305 void Unregister(MapInterval* map) { 207 void UpdateUniformBuffers(size_t stage);
306 UnmarkMemory(map); 208
307 map->is_registered = false; 209 void UpdateStorageBuffers(size_t stage);
308 if (map->is_sync_pending) { 210
309 map->is_sync_pending = false; 211 void UpdateTransformFeedbackBuffers();
310 marked_for_unregister.remove(map); 212
213 void UpdateTransformFeedbackBuffer(u32 index);
214
215 void UpdateComputeUniformBuffers();
216
217 void UpdateComputeStorageBuffers();
218
219 void MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 size);
220
221 [[nodiscard]] BufferId FindBuffer(VAddr cpu_addr, u32 size);
222
223 [[nodiscard]] BufferId CreateBuffer(VAddr cpu_addr, u32 wanted_size);
224
225 void Register(BufferId buffer_id);
226
227 void Unregister(BufferId buffer_id);
228
229 template <bool insert>
230 void ChangeRegister(BufferId buffer_id);
231
232 void SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size);
233
234 void SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size);
235
236 void UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy,
237 std::span<BufferCopy> copies);
238
239 void ImmediateUploadMemory(Buffer& buffer, u64 largest_copy,
240 std::span<const BufferCopy> copies);
241
242 void MappedUploadMemory(Buffer& buffer, u64 total_size_bytes,
243 std::span<const BufferCopy> copies);
244
245 void DeleteBuffer(BufferId buffer_id);
246
247 void ReplaceBufferDownloads(BufferId old_buffer_id, BufferId new_buffer_id);
248
249 void NotifyBufferDeletion();
250
251 [[nodiscard]] Binding StorageBufferBinding(GPUVAddr ssbo_addr) const;
252
253 [[nodiscard]] std::span<const u8> ImmediateBufferWithData(VAddr cpu_addr, size_t size);
254
255 [[nodiscard]] std::span<u8> ImmediateBuffer(size_t wanted_capacity);
256
257 [[nodiscard]] bool HasFastUniformBufferBound(size_t stage, u32 binding_index) const noexcept;
258
259 VideoCore::RasterizerInterface& rasterizer;
260 Tegra::Engines::Maxwell3D& maxwell3d;
261 Tegra::Engines::KeplerCompute& kepler_compute;
262 Tegra::MemoryManager& gpu_memory;
263 Core::Memory::Memory& cpu_memory;
264 Runtime& runtime;
265
266 SlotVector<Buffer> slot_buffers;
267 DelayedDestructionRing<Buffer, 8> delayed_destruction_ring;
268
269 u32 last_index_count = 0;
270
271 Binding index_buffer;
272 std::array<Binding, NUM_VERTEX_BUFFERS> vertex_buffers;
273 std::array<std::array<Binding, NUM_GRAPHICS_UNIFORM_BUFFERS>, NUM_STAGES> uniform_buffers;
274 std::array<std::array<Binding, NUM_STORAGE_BUFFERS>, NUM_STAGES> storage_buffers;
275 std::array<Binding, NUM_TRANSFORM_FEEDBACK_BUFFERS> transform_feedback_buffers;
276
277 std::array<Binding, NUM_COMPUTE_UNIFORM_BUFFERS> compute_uniform_buffers;
278 std::array<Binding, NUM_STORAGE_BUFFERS> compute_storage_buffers;
279
280 std::array<u32, NUM_STAGES> enabled_uniform_buffers{};
281 u32 enabled_compute_uniform_buffers = 0;
282
283 std::array<u32, NUM_STAGES> enabled_storage_buffers{};
284 std::array<u32, NUM_STAGES> written_storage_buffers{};
285 u32 enabled_compute_storage_buffers = 0;
286 u32 written_compute_storage_buffers = 0;
287
288 std::array<u32, NUM_STAGES> fast_bound_uniform_buffers{};
289
290 bool has_deleted_buffers = false;
291
292 std::conditional_t<HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS, std::array<u32, NUM_STAGES>, Empty>
293 dirty_uniform_buffers{};
294
295 std::vector<BufferId> cached_write_buffer_ids;
296
297 // TODO: This data structure is not optimal and it should be reworked
298 std::vector<BufferId> uncommitted_downloads;
299 std::deque<std::vector<BufferId>> committed_downloads;
300
301 size_t immediate_buffer_capacity = 0;
302 std::unique_ptr<u8[]> immediate_buffer_alloc;
303
304 std::array<BufferId, ((1ULL << 39) >> PAGE_BITS)> page_table;
305};
306
307template <class P>
308BufferCache<P>::BufferCache(VideoCore::RasterizerInterface& rasterizer_,
309 Tegra::Engines::Maxwell3D& maxwell3d_,
310 Tegra::Engines::KeplerCompute& kepler_compute_,
311 Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_,
312 Runtime& runtime_)
313 : rasterizer{rasterizer_}, maxwell3d{maxwell3d_}, kepler_compute{kepler_compute_},
314 gpu_memory{gpu_memory_}, cpu_memory{cpu_memory_}, runtime{runtime_} {
315 // Ensure the first slot is used for the null buffer
316 void(slot_buffers.insert(runtime, NullBufferParams{}));
317}
318
319template <class P>
320void BufferCache<P>::TickFrame() {
321 delayed_destruction_ring.Tick();
322}
323
324template <class P>
325void BufferCache<P>::WriteMemory(VAddr cpu_addr, u64 size) {
326 ForEachBufferInRange(cpu_addr, size, [&](BufferId, Buffer& buffer) {
327 buffer.MarkRegionAsCpuModified(cpu_addr, size);
328 });
329}
330
331template <class P>
332void BufferCache<P>::CachedWriteMemory(VAddr cpu_addr, u64 size) {
333 ForEachBufferInRange(cpu_addr, size, [&](BufferId buffer_id, Buffer& buffer) {
334 if (!buffer.HasCachedWrites()) {
335 cached_write_buffer_ids.push_back(buffer_id);
311 } 336 }
312 if (map->is_written) { 337 buffer.CachedCpuWrite(cpu_addr, size);
313 UnmarkRegionAsWritten(map->start, map->end - 1); 338 });
339}
340
341template <class P>
342void BufferCache<P>::DownloadMemory(VAddr cpu_addr, u64 size) {
343 ForEachBufferInRange(cpu_addr, size, [&](BufferId, Buffer& buffer) {
344 boost::container::small_vector<BufferCopy, 1> copies;
345 u64 total_size_bytes = 0;
346 u64 largest_copy = 0;
347 buffer.ForEachDownloadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) {
348 copies.push_back(BufferCopy{
349 .src_offset = range_offset,
350 .dst_offset = total_size_bytes,
351 .size = range_size,
352 });
353 total_size_bytes += range_size;
354 largest_copy = std::max(largest_copy, range_size);
355 });
356 if (total_size_bytes == 0) {
357 return;
314 } 358 }
315 const auto it = mapped_addresses.find(*map); 359 MICROPROFILE_SCOPE(GPU_DownloadMemory);
316 ASSERT(it != mapped_addresses.end()); 360
317 mapped_addresses.erase(it); 361 if constexpr (USE_MEMORY_MAPS) {
318 mapped_addresses_allocator.Release(map); 362 auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes);
319 } 363 const u8* const mapped_memory = download_staging.mapped_span.data();
320 364 const std::span<BufferCopy> copies_span(copies.data(), copies.data() + copies.size());
321private: 365 runtime.CopyBuffer(download_staging.buffer, buffer, copies_span);
322 MapInterval* MapAddress(Buffer* block, GPUVAddr gpu_addr, VAddr cpu_addr, std::size_t size) { 366 runtime.Finish();
323 const VectorMapInterval overlaps = GetMapsInRange(cpu_addr, size); 367 for (const BufferCopy& copy : copies) {
324 if (overlaps.empty()) { 368 const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset;
325 const VAddr cpu_addr_end = cpu_addr + size; 369 const u8* copy_mapped_memory = mapped_memory + copy.dst_offset;
326 if (gpu_memory.IsGranularRange(gpu_addr, size)) { 370 cpu_memory.WriteBlockUnsafe(copy_cpu_addr, copy_mapped_memory, copy.size);
327 u8* const host_ptr = gpu_memory.GetPointer(gpu_addr);
328 block->Upload(block->Offset(cpu_addr), size, host_ptr);
329 } else {
330 staging_buffer.resize(size);
331 gpu_memory.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size);
332 block->Upload(block->Offset(cpu_addr), size, staging_buffer.data());
333 } 371 }
334 return Register(MapInterval(cpu_addr, cpu_addr_end, gpu_addr)); 372 } else {
335 } 373 const std::span<u8> immediate_buffer = ImmediateBuffer(largest_copy);
336 374 for (const BufferCopy& copy : copies) {
337 const VAddr cpu_addr_end = cpu_addr + size; 375 buffer.ImmediateDownload(copy.src_offset, immediate_buffer.subspan(0, copy.size));
338 if (overlaps.size() == 1) { 376 const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset;
339 MapInterval* const current_map = overlaps[0]; 377 cpu_memory.WriteBlockUnsafe(copy_cpu_addr, immediate_buffer.data(), copy.size);
340 if (current_map->IsInside(cpu_addr, cpu_addr_end)) {
341 return current_map;
342 } 378 }
343 } 379 }
344 VAddr new_start = cpu_addr; 380 });
345 VAddr new_end = cpu_addr_end; 381}
346 bool write_inheritance = false; 382
347 bool modified_inheritance = false; 383template <class P>
348 // Calculate new buffer parameters 384void BufferCache<P>::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr,
349 for (MapInterval* overlap : overlaps) { 385 u32 size) {
350 new_start = std::min(overlap->start, new_start); 386 const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr);
351 new_end = std::max(overlap->end, new_end); 387 if (!cpu_addr) {
352 write_inheritance |= overlap->is_written; 388 uniform_buffers[stage][index] = NULL_BINDING;
353 modified_inheritance |= overlap->is_modified; 389 return;
390 }
391 const Binding binding{
392 .cpu_addr = *cpu_addr,
393 .size = size,
394 .buffer_id = BufferId{},
395 };
396 uniform_buffers[stage][index] = binding;
397}
398
399template <class P>
400void BufferCache<P>::UpdateGraphicsBuffers(bool is_indexed) {
401 MICROPROFILE_SCOPE(GPU_PrepareBuffers);
402 do {
403 has_deleted_buffers = false;
404 DoUpdateGraphicsBuffers(is_indexed);
405 } while (has_deleted_buffers);
406}
407
408template <class P>
409void BufferCache<P>::UpdateComputeBuffers() {
410 MICROPROFILE_SCOPE(GPU_PrepareBuffers);
411 do {
412 has_deleted_buffers = false;
413 DoUpdateComputeBuffers();
414 } while (has_deleted_buffers);
415}
416
417template <class P>
418void BufferCache<P>::BindHostGeometryBuffers(bool is_indexed) {
419 MICROPROFILE_SCOPE(GPU_BindUploadBuffers);
420 if (is_indexed) {
421 BindHostIndexBuffer();
422 } else if constexpr (!HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT) {
423 const auto& regs = maxwell3d.regs;
424 if (regs.draw.topology == Maxwell::PrimitiveTopology::Quads) {
425 runtime.BindQuadArrayIndexBuffer(regs.vertex_buffer.first, regs.vertex_buffer.count);
354 } 426 }
355 GPUVAddr new_gpu_addr = gpu_addr + new_start - cpu_addr; 427 }
356 for (auto& overlap : overlaps) { 428 BindHostVertexBuffers();
357 Unregister(overlap); 429 BindHostTransformFeedbackBuffers();
430}
431
432template <class P>
433void BufferCache<P>::BindHostStageBuffers(size_t stage) {
434 MICROPROFILE_SCOPE(GPU_BindUploadBuffers);
435 BindHostGraphicsUniformBuffers(stage);
436 BindHostGraphicsStorageBuffers(stage);
437}
438
439template <class P>
440void BufferCache<P>::BindHostComputeBuffers() {
441 MICROPROFILE_SCOPE(GPU_BindUploadBuffers);
442 BindHostComputeUniformBuffers();
443 BindHostComputeStorageBuffers();
444}
445
446template <class P>
447void BufferCache<P>::SetEnabledUniformBuffers(size_t stage, u32 enabled) {
448 if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) {
449 if (enabled_uniform_buffers[stage] != enabled) {
450 dirty_uniform_buffers[stage] = ~u32{0};
358 } 451 }
359 UpdateBlock(block, new_start, new_end, overlaps); 452 }
360 453 enabled_uniform_buffers[stage] = enabled;
361 const MapInterval new_map{new_start, new_end, new_gpu_addr}; 454}
362 MapInterval* const map = Register(new_map, write_inheritance); 455
363 if (!map) { 456template <class P>
364 return nullptr; 457void BufferCache<P>::SetEnabledComputeUniformBuffers(u32 enabled) {
458 enabled_compute_uniform_buffers = enabled;
459}
460
461template <class P>
462void BufferCache<P>::UnbindGraphicsStorageBuffers(size_t stage) {
463 enabled_storage_buffers[stage] = 0;
464 written_storage_buffers[stage] = 0;
465}
466
467template <class P>
468void BufferCache<P>::BindGraphicsStorageBuffer(size_t stage, size_t ssbo_index, u32 cbuf_index,
469 u32 cbuf_offset, bool is_written) {
470 enabled_storage_buffers[stage] |= 1U << ssbo_index;
471 written_storage_buffers[stage] |= (is_written ? 1U : 0U) << ssbo_index;
472
473 const auto& cbufs = maxwell3d.state.shader_stages[stage];
474 const GPUVAddr ssbo_addr = cbufs.const_buffers[cbuf_index].address + cbuf_offset;
475 storage_buffers[stage][ssbo_index] = StorageBufferBinding(ssbo_addr);
476}
477
478template <class P>
479void BufferCache<P>::UnbindComputeStorageBuffers() {
480 enabled_compute_storage_buffers = 0;
481 written_compute_storage_buffers = 0;
482}
483
484template <class P>
485void BufferCache<P>::BindComputeStorageBuffer(size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset,
486 bool is_written) {
487 enabled_compute_storage_buffers |= 1U << ssbo_index;
488 written_compute_storage_buffers |= (is_written ? 1U : 0U) << ssbo_index;
489
490 const auto& launch_desc = kepler_compute.launch_description;
491 ASSERT(((launch_desc.const_buffer_enable_mask >> cbuf_index) & 1) != 0);
492
493 const auto& cbufs = launch_desc.const_buffer_config;
494 const GPUVAddr ssbo_addr = cbufs[cbuf_index].Address() + cbuf_offset;
495 compute_storage_buffers[ssbo_index] = StorageBufferBinding(ssbo_addr);
496}
497
498template <class P>
499void BufferCache<P>::FlushCachedWrites() {
500 for (const BufferId buffer_id : cached_write_buffer_ids) {
501 slot_buffers[buffer_id].FlushCachedWrites();
502 }
503 cached_write_buffer_ids.clear();
504}
505
506template <class P>
507bool BufferCache<P>::HasUncommittedFlushes() const noexcept {
508 return !uncommitted_downloads.empty();
509}
510
511template <class P>
512bool BufferCache<P>::ShouldWaitAsyncFlushes() const noexcept {
513 return !committed_downloads.empty() && !committed_downloads.front().empty();
514}
515
516template <class P>
517void BufferCache<P>::CommitAsyncFlushes() {
518 // This is intentionally passing the value by copy
519 committed_downloads.push_front(uncommitted_downloads);
520 uncommitted_downloads.clear();
521}
522
523template <class P>
524void BufferCache<P>::PopAsyncFlushes() {
525 if (committed_downloads.empty()) {
526 return;
527 }
528 auto scope_exit_pop_download = detail::ScopeExit([this] { committed_downloads.pop_back(); });
529 const std::span<const BufferId> download_ids = committed_downloads.back();
530 if (download_ids.empty()) {
531 return;
532 }
533 MICROPROFILE_SCOPE(GPU_DownloadMemory);
534
535 boost::container::small_vector<std::pair<BufferCopy, BufferId>, 1> downloads;
536 u64 total_size_bytes = 0;
537 u64 largest_copy = 0;
538 for (const BufferId buffer_id : download_ids) {
539 slot_buffers[buffer_id].ForEachDownloadRange([&](u64 range_offset, u64 range_size) {
540 downloads.push_back({
541 BufferCopy{
542 .src_offset = range_offset,
543 .dst_offset = total_size_bytes,
544 .size = range_size,
545 },
546 buffer_id,
547 });
548 total_size_bytes += range_size;
549 largest_copy = std::max(largest_copy, range_size);
550 });
551 }
552 if (downloads.empty()) {
553 return;
554 }
555 if constexpr (USE_MEMORY_MAPS) {
556 auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes);
557 for (const auto [copy, buffer_id] : downloads) {
558 const std::array copies{copy};
559 runtime.CopyBuffer(download_staging.buffer, slot_buffers[buffer_id], copies);
365 } 560 }
366 if (modified_inheritance) { 561 runtime.Finish();
367 map->MarkAsModified(true, GetModifiedTicks()); 562 for (const auto [copy, buffer_id] : downloads) {
368 if (Settings::IsGPULevelHigh() && 563 const Buffer& buffer = slot_buffers[buffer_id];
369 Settings::values.use_asynchronous_gpu_emulation.GetValue()) { 564 const VAddr cpu_addr = buffer.CpuAddr() + copy.src_offset;
370 MarkForAsyncFlush(map); 565 const u8* read_mapped_memory = download_staging.mapped_span.data() + copy.dst_offset;
371 } 566 cpu_memory.WriteBlockUnsafe(cpu_addr, read_mapped_memory, copy.size);
567 }
568 } else {
569 const std::span<u8> immediate_buffer = ImmediateBuffer(largest_copy);
570 for (const auto [copy, buffer_id] : downloads) {
571 Buffer& buffer = slot_buffers[buffer_id];
572 buffer.ImmediateDownload(copy.src_offset, immediate_buffer.subspan(0, copy.size));
573 const VAddr cpu_addr = buffer.CpuAddr() + copy.src_offset;
574 cpu_memory.WriteBlockUnsafe(cpu_addr, immediate_buffer.data(), copy.size);
372 } 575 }
373 return map;
374 } 576 }
375 577}
376 void UpdateBlock(Buffer* block, VAddr start, VAddr end, const VectorMapInterval& overlaps) { 578
377 const IntervalType base_interval{start, end}; 579template <class P>
378 IntervalSet interval_set{}; 580bool BufferCache<P>::IsRegionGpuModified(VAddr addr, size_t size) {
379 interval_set.add(base_interval); 581 const u64 page_end = Common::DivCeil(addr + size, PAGE_SIZE);
380 for (auto& overlap : overlaps) { 582 for (u64 page = addr >> PAGE_BITS; page < page_end;) {
381 const IntervalType subtract{overlap->start, overlap->end}; 583 const BufferId image_id = page_table[page];
382 interval_set.subtract(subtract); 584 if (!image_id) {
585 ++page;
586 continue;
383 } 587 }
384 for (auto& interval : interval_set) { 588 Buffer& buffer = slot_buffers[image_id];
385 const std::size_t size = interval.upper() - interval.lower(); 589 if (buffer.IsRegionGpuModified(addr, size)) {
386 if (size == 0) { 590 return true;
387 continue;
388 }
389 staging_buffer.resize(size);
390 cpu_memory.ReadBlockUnsafe(interval.lower(), staging_buffer.data(), size);
391 block->Upload(block->Offset(interval.lower()), size, staging_buffer.data());
392 } 591 }
592 const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes();
593 page = Common::DivCeil(end_addr, PAGE_SIZE);
393 } 594 }
394 595 return false;
395 VectorMapInterval GetMapsInRange(VAddr addr, std::size_t size) { 596}
396 VectorMapInterval result; 597
397 if (size == 0) { 598template <class P>
398 return result; 599void BufferCache<P>::BindHostIndexBuffer() {
600 Buffer& buffer = slot_buffers[index_buffer.buffer_id];
601 const u32 offset = buffer.Offset(index_buffer.cpu_addr);
602 const u32 size = index_buffer.size;
603 SynchronizeBuffer(buffer, index_buffer.cpu_addr, size);
604 if constexpr (HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT) {
605 runtime.BindIndexBuffer(buffer, offset, size);
606 } else {
607 runtime.BindIndexBuffer(maxwell3d.regs.draw.topology, maxwell3d.regs.index_array.format,
608 maxwell3d.regs.index_array.first, maxwell3d.regs.index_array.count,
609 buffer, offset, size);
610 }
611}
612
613template <class P>
614void BufferCache<P>::BindHostVertexBuffers() {
615 auto& flags = maxwell3d.dirty.flags;
616 for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) {
617 const Binding& binding = vertex_buffers[index];
618 Buffer& buffer = slot_buffers[binding.buffer_id];
619 SynchronizeBuffer(buffer, binding.cpu_addr, binding.size);
620 if (!flags[Dirty::VertexBuffer0 + index]) {
621 continue;
399 } 622 }
623 flags[Dirty::VertexBuffer0 + index] = false;
624
625 const u32 stride = maxwell3d.regs.vertex_array[index].stride;
626 const u32 offset = buffer.Offset(binding.cpu_addr);
627 runtime.BindVertexBuffer(index, buffer, offset, binding.size, stride);
628 }
629}
400 630
401 const VAddr addr_end = addr + size; 631template <class P>
402 auto it = mapped_addresses.lower_bound(addr); 632void BufferCache<P>::BindHostGraphicsUniformBuffers(size_t stage) {
403 if (it != mapped_addresses.begin()) { 633 u32 dirty = ~0U;
404 --it; 634 if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) {
635 dirty = std::exchange(dirty_uniform_buffers[stage], 0);
636 }
637 u32 binding_index = 0;
638 ForEachEnabledBit(enabled_uniform_buffers[stage], [&](u32 index) {
639 const bool needs_bind = ((dirty >> index) & 1) != 0;
640 BindHostGraphicsUniformBuffer(stage, index, binding_index, needs_bind);
641 if constexpr (NEEDS_BIND_UNIFORM_INDEX) {
642 ++binding_index;
405 } 643 }
406 while (it != mapped_addresses.end() && it->start < addr_end) { 644 });
407 if (it->Overlaps(addr, addr_end)) { 645}
408 result.push_back(&*it); 646
647template <class P>
648void BufferCache<P>::BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 binding_index,
649 bool needs_bind) {
650 const Binding& binding = uniform_buffers[stage][index];
651 const VAddr cpu_addr = binding.cpu_addr;
652 const u32 size = binding.size;
653 Buffer& buffer = slot_buffers[binding.buffer_id];
654 if constexpr (IS_OPENGL) {
655 if (size <= SKIP_CACHE_SIZE && !buffer.IsRegionGpuModified(cpu_addr, size)) {
656 if (runtime.HasFastBufferSubData()) {
657 // Fast path for Nvidia
658 if (!HasFastUniformBufferBound(stage, binding_index)) {
659 // We only have to bind when the currently bound buffer is not the fast version
660 fast_bound_uniform_buffers[stage] |= 1U << binding_index;
661 runtime.BindFastUniformBuffer(stage, binding_index, size);
662 }
663 const auto span = ImmediateBufferWithData(cpu_addr, size);
664 runtime.PushFastUniformBuffer(stage, binding_index, span);
665 } else {
666 // Stream buffer path to avoid stalling on non-Nvidia drivers
667 const auto span = runtime.BindMappedUniformBuffer(stage, binding_index, size);
668 cpu_memory.ReadBlockUnsafe(cpu_addr, span.data(), size);
409 } 669 }
410 ++it; 670 return;
411 } 671 }
412 return result;
413 } 672 }
414 673 // Classic cached path
415 /// Returns a ticks counter used for tracking when cached objects were last modified 674 SynchronizeBuffer(buffer, cpu_addr, size);
416 u64 GetModifiedTicks() { 675 if (!needs_bind && !HasFastUniformBufferBound(stage, binding_index)) {
417 return ++modified_ticks; 676 // Skip binding if it's not needed and if the bound buffer is not the fast version
677 // This exists to avoid instances where the fast buffer is bound and a GPU write happens
678 return;
418 } 679 }
680 fast_bound_uniform_buffers[stage] &= ~(1U << binding_index);
419 681
420 void FlushMap(MapInterval* map) { 682 const u32 offset = buffer.Offset(cpu_addr);
421 const auto it = blocks.find(map->start >> BLOCK_PAGE_BITS); 683 if constexpr (NEEDS_BIND_UNIFORM_INDEX) {
422 ASSERT_OR_EXECUTE(it != blocks.end(), return;); 684 runtime.BindUniformBuffer(stage, binding_index, buffer, offset, size);
423 685 } else {
424 std::shared_ptr<Buffer> block = it->second; 686 runtime.BindUniformBuffer(buffer, offset, size);
425
426 const std::size_t size = map->end - map->start;
427 staging_buffer.resize(size);
428 block->Download(block->Offset(map->start), size, staging_buffer.data());
429 cpu_memory.WriteBlockUnsafe(map->start, staging_buffer.data(), size);
430 map->MarkAsModified(false, 0);
431 } 687 }
688}
689
690template <class P>
691void BufferCache<P>::BindHostGraphicsStorageBuffers(size_t stage) {
692 u32 binding_index = 0;
693 ForEachEnabledBit(enabled_storage_buffers[stage], [&](u32 index) {
694 const Binding& binding = storage_buffers[stage][index];
695 Buffer& buffer = slot_buffers[binding.buffer_id];
696 const u32 size = binding.size;
697 SynchronizeBuffer(buffer, binding.cpu_addr, size);
698
699 const u32 offset = buffer.Offset(binding.cpu_addr);
700 const bool is_written = ((written_storage_buffers[stage] >> index) & 1) != 0;
701 if constexpr (NEEDS_BIND_STORAGE_INDEX) {
702 runtime.BindStorageBuffer(stage, binding_index, buffer, offset, size, is_written);
703 ++binding_index;
704 } else {
705 runtime.BindStorageBuffer(buffer, offset, size, is_written);
706 }
707 });
708}
432 709
433 template <typename Callable> 710template <class P>
434 BufferInfo StreamBufferUpload(std::size_t size, std::size_t alignment, Callable&& callable) { 711void BufferCache<P>::BindHostTransformFeedbackBuffers() {
435 AlignBuffer(alignment); 712 if (maxwell3d.regs.tfb_enabled == 0) {
436 const std::size_t uploaded_offset = buffer_offset; 713 return;
437 callable(buffer_ptr);
438
439 buffer_ptr += size;
440 buffer_offset += size;
441 return BufferInfo{stream_buffer.Handle(), uploaded_offset, stream_buffer.Address()};
442 } 714 }
443 715 for (u32 index = 0; index < NUM_TRANSFORM_FEEDBACK_BUFFERS; ++index) {
444 void AlignBuffer(std::size_t alignment) { 716 const Binding& binding = transform_feedback_buffers[index];
445 // Align the offset, not the mapped pointer 717 Buffer& buffer = slot_buffers[binding.buffer_id];
446 const std::size_t offset_aligned = Common::AlignUp(buffer_offset, alignment); 718 const u32 size = binding.size;
447 buffer_ptr += offset_aligned - buffer_offset; 719 SynchronizeBuffer(buffer, binding.cpu_addr, size);
448 buffer_offset = offset_aligned; 720
721 const u32 offset = buffer.Offset(binding.cpu_addr);
722 runtime.BindTransformFeedbackBuffer(index, buffer, offset, size);
449 } 723 }
724}
450 725
451 std::shared_ptr<Buffer> EnlargeBlock(std::shared_ptr<Buffer> buffer) { 726template <class P>
452 const std::size_t old_size = buffer->Size(); 727void BufferCache<P>::BindHostComputeUniformBuffers() {
453 const std::size_t new_size = old_size + BLOCK_PAGE_SIZE; 728 if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) {
454 const VAddr cpu_addr = buffer->CpuAddr(); 729 // Mark all uniform buffers as dirty
455 std::shared_ptr<Buffer> new_buffer = CreateBlock(cpu_addr, new_size); 730 dirty_uniform_buffers.fill(~u32{0});
456 new_buffer->CopyFrom(*buffer, 0, 0, old_size); 731 }
457 QueueDestruction(std::move(buffer)); 732 u32 binding_index = 0;
458 733 ForEachEnabledBit(enabled_compute_uniform_buffers, [&](u32 index) {
459 const VAddr cpu_addr_end = cpu_addr + new_size - 1; 734 const Binding& binding = compute_uniform_buffers[index];
460 const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS; 735 Buffer& buffer = slot_buffers[binding.buffer_id];
461 for (u64 page_start = cpu_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) { 736 const u32 size = binding.size;
462 blocks.insert_or_assign(page_start, new_buffer); 737 SynchronizeBuffer(buffer, binding.cpu_addr, size);
738
739 const u32 offset = buffer.Offset(binding.cpu_addr);
740 if constexpr (NEEDS_BIND_UNIFORM_INDEX) {
741 runtime.BindComputeUniformBuffer(binding_index, buffer, offset, size);
742 ++binding_index;
743 } else {
744 runtime.BindUniformBuffer(buffer, offset, size);
463 } 745 }
746 });
747}
748
749template <class P>
750void BufferCache<P>::BindHostComputeStorageBuffers() {
751 u32 binding_index = 0;
752 ForEachEnabledBit(enabled_compute_storage_buffers, [&](u32 index) {
753 const Binding& binding = compute_storage_buffers[index];
754 Buffer& buffer = slot_buffers[binding.buffer_id];
755 const u32 size = binding.size;
756 SynchronizeBuffer(buffer, binding.cpu_addr, size);
757
758 const u32 offset = buffer.Offset(binding.cpu_addr);
759 const bool is_written = ((written_compute_storage_buffers >> index) & 1) != 0;
760 if constexpr (NEEDS_BIND_STORAGE_INDEX) {
761 runtime.BindComputeStorageBuffer(binding_index, buffer, offset, size, is_written);
762 ++binding_index;
763 } else {
764 runtime.BindStorageBuffer(buffer, offset, size, is_written);
765 }
766 });
767}
464 768
465 return new_buffer; 769template <class P>
770void BufferCache<P>::DoUpdateGraphicsBuffers(bool is_indexed) {
771 if (is_indexed) {
772 UpdateIndexBuffer();
466 } 773 }
774 UpdateVertexBuffers();
775 UpdateTransformFeedbackBuffers();
776 for (size_t stage = 0; stage < NUM_STAGES; ++stage) {
777 UpdateUniformBuffers(stage);
778 UpdateStorageBuffers(stage);
779 }
780}
781
782template <class P>
783void BufferCache<P>::DoUpdateComputeBuffers() {
784 UpdateComputeUniformBuffers();
785 UpdateComputeStorageBuffers();
786}
787
788template <class P>
789void BufferCache<P>::UpdateIndexBuffer() {
790 // We have to check for the dirty flags and index count
791 // The index count is currently changed without updating the dirty flags
792 const auto& index_array = maxwell3d.regs.index_array;
793 auto& flags = maxwell3d.dirty.flags;
794 if (!flags[Dirty::IndexBuffer] && last_index_count == index_array.count) {
795 return;
796 }
797 flags[Dirty::IndexBuffer] = false;
798 last_index_count = index_array.count;
799
800 const GPUVAddr gpu_addr_begin = index_array.StartAddress();
801 const GPUVAddr gpu_addr_end = index_array.EndAddress();
802 const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr_begin);
803 const u32 address_size = static_cast<u32>(gpu_addr_end - gpu_addr_begin);
804 const u32 draw_size = index_array.count * index_array.FormatSizeInBytes();
805 const u32 size = std::min(address_size, draw_size);
806 if (size == 0 || !cpu_addr) {
807 index_buffer = NULL_BINDING;
808 return;
809 }
810 index_buffer = Binding{
811 .cpu_addr = *cpu_addr,
812 .size = size,
813 .buffer_id = FindBuffer(*cpu_addr, size),
814 };
815}
467 816
468 std::shared_ptr<Buffer> MergeBlocks(std::shared_ptr<Buffer> first, 817template <class P>
469 std::shared_ptr<Buffer> second) { 818void BufferCache<P>::UpdateVertexBuffers() {
470 const std::size_t size_1 = first->Size(); 819 auto& flags = maxwell3d.dirty.flags;
471 const std::size_t size_2 = second->Size(); 820 if (!maxwell3d.dirty.flags[Dirty::VertexBuffers]) {
472 const VAddr first_addr = first->CpuAddr(); 821 return;
473 const VAddr second_addr = second->CpuAddr(); 822 }
474 const VAddr new_addr = std::min(first_addr, second_addr); 823 flags[Dirty::VertexBuffers] = false;
475 const std::size_t new_size = size_1 + size_2;
476
477 std::shared_ptr<Buffer> new_buffer = CreateBlock(new_addr, new_size);
478 new_buffer->CopyFrom(*first, 0, new_buffer->Offset(first_addr), size_1);
479 new_buffer->CopyFrom(*second, 0, new_buffer->Offset(second_addr), size_2);
480 QueueDestruction(std::move(first));
481 QueueDestruction(std::move(second));
482 824
483 const VAddr cpu_addr_end = new_addr + new_size - 1; 825 for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) {
484 const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS; 826 UpdateVertexBuffer(index);
485 for (u64 page_start = new_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) {
486 blocks.insert_or_assign(page_start, new_buffer);
487 }
488 return new_buffer;
489 } 827 }
828}
490 829
491 Buffer* GetBlock(VAddr cpu_addr, std::size_t size) { 830template <class P>
492 std::shared_ptr<Buffer> found; 831void BufferCache<P>::UpdateVertexBuffer(u32 index) {
832 if (!maxwell3d.dirty.flags[Dirty::VertexBuffer0 + index]) {
833 return;
834 }
835 const auto& array = maxwell3d.regs.vertex_array[index];
836 const auto& limit = maxwell3d.regs.vertex_array_limit[index];
837 const GPUVAddr gpu_addr_begin = array.StartAddress();
838 const GPUVAddr gpu_addr_end = limit.LimitAddress() + 1;
839 const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr_begin);
840 const u32 address_size = static_cast<u32>(gpu_addr_end - gpu_addr_begin);
841 const u32 size = address_size; // TODO: Analyze stride and number of vertices
842 if (array.enable == 0 || size == 0 || !cpu_addr) {
843 vertex_buffers[index] = NULL_BINDING;
844 return;
845 }
846 vertex_buffers[index] = Binding{
847 .cpu_addr = *cpu_addr,
848 .size = size,
849 .buffer_id = FindBuffer(*cpu_addr, size),
850 };
851}
852
853template <class P>
854void BufferCache<P>::UpdateUniformBuffers(size_t stage) {
855 ForEachEnabledBit(enabled_uniform_buffers[stage], [&](u32 index) {
856 Binding& binding = uniform_buffers[stage][index];
857 if (binding.buffer_id) {
858 // Already updated
859 return;
860 }
861 // Mark as dirty
862 if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) {
863 dirty_uniform_buffers[stage] |= 1U << index;
864 }
865 // Resolve buffer
866 binding.buffer_id = FindBuffer(binding.cpu_addr, binding.size);
867 });
868}
869
870template <class P>
871void BufferCache<P>::UpdateStorageBuffers(size_t stage) {
872 const u32 written_mask = written_storage_buffers[stage];
873 ForEachEnabledBit(enabled_storage_buffers[stage], [&](u32 index) {
874 // Resolve buffer
875 Binding& binding = storage_buffers[stage][index];
876 const BufferId buffer_id = FindBuffer(binding.cpu_addr, binding.size);
877 binding.buffer_id = buffer_id;
878 // Mark buffer as written if needed
879 if (((written_mask >> index) & 1) != 0) {
880 MarkWrittenBuffer(buffer_id, binding.cpu_addr, binding.size);
881 }
882 });
883}
493 884
494 const VAddr cpu_addr_end = cpu_addr + size - 1; 885template <class P>
495 const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS; 886void BufferCache<P>::UpdateTransformFeedbackBuffers() {
496 for (u64 page_start = cpu_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) { 887 if (maxwell3d.regs.tfb_enabled == 0) {
497 auto it = blocks.find(page_start); 888 return;
498 if (it == blocks.end()) { 889 }
499 if (found) { 890 for (u32 index = 0; index < NUM_TRANSFORM_FEEDBACK_BUFFERS; ++index) {
500 found = EnlargeBlock(found); 891 UpdateTransformFeedbackBuffer(index);
501 continue; 892 }
502 } 893}
503 const VAddr start_addr = page_start << BLOCK_PAGE_BITS; 894
504 found = CreateBlock(start_addr, BLOCK_PAGE_SIZE); 895template <class P>
505 blocks.insert_or_assign(page_start, found); 896void BufferCache<P>::UpdateTransformFeedbackBuffer(u32 index) {
506 continue; 897 const auto& binding = maxwell3d.regs.tfb_bindings[index];
507 } 898 const GPUVAddr gpu_addr = binding.Address() + binding.buffer_offset;
508 if (!found) { 899 const u32 size = binding.buffer_size;
509 found = it->second; 900 const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr);
510 continue; 901 if (binding.buffer_enable == 0 || size == 0 || !cpu_addr) {
511 } 902 transform_feedback_buffers[index] = NULL_BINDING;
512 if (found != it->second) { 903 return;
513 found = MergeBlocks(std::move(found), it->second); 904 }
905 const BufferId buffer_id = FindBuffer(*cpu_addr, size);
906 transform_feedback_buffers[index] = Binding{
907 .cpu_addr = *cpu_addr,
908 .size = size,
909 .buffer_id = buffer_id,
910 };
911 MarkWrittenBuffer(buffer_id, *cpu_addr, size);
912}
913
914template <class P>
915void BufferCache<P>::UpdateComputeUniformBuffers() {
916 ForEachEnabledBit(enabled_compute_uniform_buffers, [&](u32 index) {
917 Binding& binding = compute_uniform_buffers[index];
918 binding = NULL_BINDING;
919 const auto& launch_desc = kepler_compute.launch_description;
920 if (((launch_desc.const_buffer_enable_mask >> index) & 1) != 0) {
921 const auto& cbuf = launch_desc.const_buffer_config[index];
922 const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(cbuf.Address());
923 if (cpu_addr) {
924 binding.cpu_addr = *cpu_addr;
925 binding.size = cbuf.size;
514 } 926 }
515 } 927 }
516 return found.get(); 928 binding.buffer_id = FindBuffer(binding.cpu_addr, binding.size);
929 });
930}
931
932template <class P>
933void BufferCache<P>::UpdateComputeStorageBuffers() {
934 ForEachEnabledBit(enabled_compute_storage_buffers, [&](u32 index) {
935 // Resolve buffer
936 Binding& binding = compute_storage_buffers[index];
937 const BufferId buffer_id = FindBuffer(binding.cpu_addr, binding.size);
938 binding.buffer_id = buffer_id;
939 // Mark as written if needed
940 if (((written_compute_storage_buffers >> index) & 1) != 0) {
941 MarkWrittenBuffer(buffer_id, binding.cpu_addr, binding.size);
942 }
943 });
944}
945
946template <class P>
947void BufferCache<P>::MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 size) {
948 Buffer& buffer = slot_buffers[buffer_id];
949 buffer.MarkRegionAsGpuModified(cpu_addr, size);
950
951 const bool is_accuracy_high = Settings::IsGPULevelHigh();
952 const bool is_async = Settings::values.use_asynchronous_gpu_emulation.GetValue();
953 if (!is_accuracy_high || !is_async) {
954 return;
517 } 955 }
956 if (std::ranges::find(uncommitted_downloads, buffer_id) != uncommitted_downloads.end()) {
957 // Already inserted
958 return;
959 }
960 uncommitted_downloads.push_back(buffer_id);
961}
518 962
519 void MarkRegionAsWritten(VAddr start, VAddr end) { 963template <class P>
520 const u64 page_end = end >> WRITE_PAGE_BIT; 964BufferId BufferCache<P>::FindBuffer(VAddr cpu_addr, u32 size) {
521 for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) { 965 if (cpu_addr == 0) {
522 if (const auto [it, inserted] = written_pages.emplace(page_start, 1); !inserted) { 966 return NULL_BUFFER_ID;
523 ++it->second; 967 }
524 } 968 const u64 page = cpu_addr >> PAGE_BITS;
969 const BufferId buffer_id = page_table[page];
970 if (!buffer_id) {
971 return CreateBuffer(cpu_addr, size);
972 }
973 const Buffer& buffer = slot_buffers[buffer_id];
974 if (buffer.IsInBounds(cpu_addr, size)) {
975 return buffer_id;
976 }
977 return CreateBuffer(cpu_addr, size);
978}
979
980template <class P>
981BufferId BufferCache<P>::CreateBuffer(VAddr cpu_addr, u32 wanted_size) {
982 std::vector<BufferId> overlap_ids;
983 VAddr cpu_addr_begin = cpu_addr;
984 VAddr cpu_addr_end = cpu_addr + wanted_size;
985 for (; cpu_addr >> PAGE_BITS < Common::DivCeil(cpu_addr_end, PAGE_SIZE);
986 cpu_addr += PAGE_SIZE) {
987 const BufferId overlap_id = page_table[cpu_addr >> PAGE_BITS];
988 if (!overlap_id) {
989 continue;
990 }
991 Buffer& overlap = slot_buffers[overlap_id];
992 if (overlap.IsPicked()) {
993 continue;
994 }
995 overlap.Pick();
996 overlap_ids.push_back(overlap_id);
997 const VAddr overlap_cpu_addr = overlap.CpuAddr();
998 if (overlap_cpu_addr < cpu_addr_begin) {
999 cpu_addr = cpu_addr_begin = overlap_cpu_addr;
525 } 1000 }
1001 cpu_addr_end = std::max(cpu_addr_end, overlap_cpu_addr + overlap.SizeBytes());
1002    }
1003    const u32 size = static_cast<u32>(cpu_addr_end - cpu_addr_begin);
1004    const BufferId new_buffer_id = slot_buffers.insert(runtime, rasterizer, cpu_addr_begin, size);
1005    Buffer& new_buffer = slot_buffers[new_buffer_id];
1006
1007    for (const BufferId overlap_id : overlap_ids) {
1008        Buffer& overlap = slot_buffers[overlap_id];
1009        overlap.Unpick();
1010
1011        std::vector<BufferCopy> copies;
1012        const size_t dst_base_offset = overlap.CpuAddr() - new_buffer.CpuAddr();
1013        overlap.ForEachDownloadRange([&](u64 begin, u64 range_size) {
1014            copies.push_back(BufferCopy{
1015 .src_offset = begin,
1016 .dst_offset = dst_base_offset + begin,
1017 .size = range_size,
1018 });
1019 new_buffer.UnmarkRegionAsCpuModified(begin, range_size);
1020 new_buffer.MarkRegionAsGpuModified(begin, range_size);
1021 });
1022 if (!copies.empty()) {
1023 runtime.CopyBuffer(slot_buffers[new_buffer_id], overlap, copies);
1024 }
1025 ReplaceBufferDownloads(overlap_id, new_buffer_id);
1026 DeleteBuffer(overlap_id);
1027 }
1028 Register(new_buffer_id);
1029 return new_buffer_id;
1030}
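Note (not part of the diff): CreateBuffer grows the requested range to the union of every overlapping buffer it picks, allocates one buffer covering that union, copies the overlaps' GPU-modified ranges into it and deletes them. The sketch below isolates the interval-union step as a simplified fixed-point loop; the function above performs the same expansion in a single forward walk over the page table.

// Simplified illustration of the range expansion only.
#include <algorithm>
#include <cstdint>
#include <vector>

struct Range {
    std::uint64_t begin;
    std::uint64_t end; // exclusive
};

// Expands `wanted` until it covers every range it overlaps.
Range MergeWithOverlaps(Range wanted, const std::vector<Range>& existing) {
    bool grew = true;
    while (grew) { // growing may bring the range into contact with more buffers
        grew = false;
        for (const Range& r : existing) {
            const bool overlaps = r.begin < wanted.end && wanted.begin < r.end;
            if (!overlaps || (wanted.begin <= r.begin && r.end <= wanted.end)) {
                continue;
            }
            wanted.begin = std::min(wanted.begin, r.begin);
            wanted.end = std::max(wanted.end, r.end);
            grew = true;
        }
    }
    return wanted;
}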
1031
1032template <class P>
1033void BufferCache<P>::Register(BufferId buffer_id) {
1034 ChangeRegister<true>(buffer_id);
1035}
1036
1037template <class P>
1038void BufferCache<P>::Unregister(BufferId buffer_id) {
1039 ChangeRegister<false>(buffer_id);
1040}
1041
1042template <class P>
1043template <bool insert>
1044void BufferCache<P>::ChangeRegister(BufferId buffer_id) {
1045 const Buffer& buffer = slot_buffers[buffer_id];
1046 const VAddr cpu_addr_begin = buffer.CpuAddr();
1047 const VAddr cpu_addr_end = cpu_addr_begin + buffer.SizeBytes();
1048 const u64 page_begin = cpu_addr_begin / PAGE_SIZE;
1049 const u64 page_end = Common::DivCeil(cpu_addr_end, PAGE_SIZE);
1050 for (u64 page = page_begin; page != page_end; ++page) {
1051 if constexpr (insert) {
1052 page_table[page] = buffer_id;
1053 } else {
1054 page_table[page] = BufferId{};
1055        }
1056    }
1057}
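Note (not part of the diff): registration marks every page the buffer touches, and Common::DivCeil rounds the end address up so a buffer that ends mid-page still claims that last page. A small standalone example of the same arithmetic, with an assumed 64 KiB page size and made-up addresses:

#include <cstdint>
#include <cstdio>

constexpr std::uint64_t PAGE_SIZE = 1ULL << 16; // assumed 64 KiB tracking pages

constexpr std::uint64_t DivCeil(std::uint64_t value, std::uint64_t divisor) {
    return (value + divisor - 1) / divisor;
}

int main() {
    const std::uint64_t cpu_addr_begin = 0x8000'0000;
    const std::uint64_t size_bytes = 0x2'8000; // 2.5 pages
    const std::uint64_t cpu_addr_end = cpu_addr_begin + size_bytes;
    const std::uint64_t page_begin = cpu_addr_begin / PAGE_SIZE;      // 0x8000
    const std::uint64_t page_end = DivCeil(cpu_addr_end, PAGE_SIZE);  // 0x8003
    for (std::uint64_t page = page_begin; page != page_end; ++page) {
        std::printf("registering page 0x%llx\n", static_cast<unsigned long long>(page));
    }
}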
1058
1059template <class P>
1060void BufferCache<P>::SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size) {
1061    if (buffer.CpuAddr() == 0) {
1062        return;
1063    }
1064 SynchronizeBufferImpl(buffer, cpu_addr, size);
1065}
1066
1067template <class P>
1068void BufferCache<P>::SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size) {
1069 boost::container::small_vector<BufferCopy, 4> copies;
1070 u64 total_size_bytes = 0;
1071 u64 largest_copy = 0;
1072 buffer.ForEachUploadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) {
1073 copies.push_back(BufferCopy{
1074 .src_offset = total_size_bytes,
1075 .dst_offset = range_offset,
1076 .size = range_size,
1077 });
1078 total_size_bytes += range_size;
1079 largest_copy = std::max(largest_copy, range_size);
1080 });
1081 if (total_size_bytes == 0) {
1082 return;
1083 }
1084 const std::span<BufferCopy> copies_span(copies.data(), copies.size());
1085 UploadMemory(buffer, total_size_bytes, largest_copy, copies_span);
1086}
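Note (not part of the diff): SynchronizeBufferImpl asks the buffer for its pending upload ranges and packs them back to back into one staging allocation, remembering the total size and the largest single copy. A standalone sketch of that packing, with a plain vector of (offset, size) pairs standing in for the buffer's modification tracking:

#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

struct BufferCopy {
    std::uint64_t src_offset; // position inside the packed staging buffer
    std::uint64_t dst_offset; // position inside the destination buffer
    std::uint64_t size;
};

struct UploadPlan {
    std::vector<BufferCopy> copies;
    std::uint64_t total_size_bytes = 0;
    std::uint64_t largest_copy = 0;
};

UploadPlan PlanUpload(const std::vector<std::pair<std::uint64_t, std::uint64_t>>& dirty_ranges) {
    UploadPlan plan;
    for (const auto& [dst_offset, size] : dirty_ranges) {
        plan.copies.push_back({plan.total_size_bytes, dst_offset, size});
        plan.total_size_bytes += size; // next range starts right after this one
        plan.largest_copy = std::max(plan.largest_copy, size);
    }
    return plan;
}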
1087
1088template <class P>
1089void BufferCache<P>::UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy,
1090 std::span<BufferCopy> copies) {
1091 if constexpr (USE_MEMORY_MAPS) {
1092 MappedUploadMemory(buffer, total_size_bytes, copies);
1093 } else {
1094 ImmediateUploadMemory(buffer, largest_copy, copies);
1095 }
1096}
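Note (not part of the diff): the upload path is chosen at compile time from the runtime traits type P, so the unused branch is never instantiated. A minimal sketch of that if constexpr dispatch; the trait structs here are hypothetical, not the project's actual OpenGL or Vulkan trait types.

#include <cstdio>

struct ImmediateTraits { static constexpr bool USE_MEMORY_MAPS = false; };
struct StagingTraits   { static constexpr bool USE_MEMORY_MAPS = true; };

template <class P>
void Upload() {
    if constexpr (P::USE_MEMORY_MAPS) {
        std::puts("staging path: write into a mapped staging buffer, then copy on the GPU");
    } else {
        std::puts("immediate path: hand each range to the API directly");
    }
}

int main() {
    Upload<ImmediateTraits>();
    Upload<StagingTraits>();
}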
1097
1098template <class P>
1099void BufferCache<P>::ImmediateUploadMemory(Buffer& buffer, u64 largest_copy,
1100 std::span<const BufferCopy> copies) {
1101 std::span<u8> immediate_buffer;
1102 for (const BufferCopy& copy : copies) {
1103 std::span<const u8> upload_span;
1104 const VAddr cpu_addr = buffer.CpuAddr() + copy.dst_offset;
1105 if (IsRangeGranular(cpu_addr, copy.size)) {
1106 upload_span = std::span(cpu_memory.GetPointer(cpu_addr), copy.size);
1107 } else {
1108 if (immediate_buffer.empty()) {
1109 immediate_buffer = ImmediateBuffer(largest_copy);
1110            }
1111 cpu_memory.ReadBlockUnsafe(cpu_addr, immediate_buffer.data(), copy.size);
1112 upload_span = immediate_buffer.subspan(0, copy.size);
1113        }
1114        buffer.ImmediateUpload(copy.dst_offset, upload_span);
1115    }
1116}
1117
1118template <class P>
1119void BufferCache<P>::MappedUploadMemory(Buffer& buffer, u64 total_size_bytes,
1120 std::span<const BufferCopy> copies) {
1121 auto upload_staging = runtime.UploadStagingBuffer(total_size_bytes);
1122 const std::span<u8> staging_pointer = upload_staging.mapped_span;
1123 for (const BufferCopy& copy : copies) {
1124 const VAddr cpu_addr = buffer.CpuAddr() + copy.dst_offset;
1125 u8* const src_pointer = staging_pointer.data() + copy.src_offset;
1126 cpu_memory.ReadBlockUnsafe(cpu_addr, src_pointer, copy.size);
1127    }
1128    runtime.CopyBuffer(buffer, upload_staging.buffer, copies);
1129}
1130
1131template <class P>
1132void BufferCache<P>::DeleteBuffer(BufferId buffer_id) {
1133 const auto scalar_replace = [buffer_id](Binding& binding) {
1134 if (binding.buffer_id == buffer_id) {
1135 binding.buffer_id = BufferId{};
1136 }
1137 };
1138 const auto replace = [scalar_replace](std::span<Binding> bindings) {
1139 std::ranges::for_each(bindings, scalar_replace);
1140 };
1141 scalar_replace(index_buffer);
1142 replace(vertex_buffers);
1143 std::ranges::for_each(uniform_buffers, replace);
1144 std::ranges::for_each(storage_buffers, replace);
1145 replace(transform_feedback_buffers);
1146 replace(compute_uniform_buffers);
1147 replace(compute_storage_buffers);
1148 std::erase(cached_write_buffer_ids, buffer_id);
1149
1150 // Mark the whole buffer as CPU written to stop tracking CPU writes
1151 Buffer& buffer = slot_buffers[buffer_id];
1152 buffer.MarkRegionAsCpuModified(buffer.CpuAddr(), buffer.SizeBytes());
1153
1154 Unregister(buffer_id);
1155 delayed_destruction_ring.Push(std::move(slot_buffers[buffer_id]));
1156
1157 NotifyBufferDeletion();
1158}
1159
1160template <class P>
1161void BufferCache<P>::ReplaceBufferDownloads(BufferId old_buffer_id, BufferId new_buffer_id) {
1162 const auto replace = [old_buffer_id, new_buffer_id](std::vector<BufferId>& buffers) {
1163 std::ranges::replace(buffers, old_buffer_id, new_buffer_id);
1164 if (auto it = std::ranges::find(buffers, new_buffer_id); it != buffers.end()) {
1165 buffers.erase(std::remove(it + 1, buffers.end(), new_buffer_id), buffers.end());
1166        }
1167    };
1168 replace(uncommitted_downloads);
1169 std::ranges::for_each(committed_downloads, replace);
1170}
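Note (not part of the diff): when a buffer is merged away, its queued downloads are retargeted to the replacement and any duplicate entries past the first are dropped, so the merged buffer is only downloaded once. The same replace-then-dedup pattern as a standalone example:

#include <algorithm>
#include <cstdint>
#include <vector>

using BufferId = std::uint32_t;

void RetargetDownloads(std::vector<BufferId>& queue, BufferId old_id, BufferId new_id) {
    std::ranges::replace(queue, old_id, new_id);
    if (const auto it = std::ranges::find(queue, new_id); it != queue.end()) {
        // Keep the first occurrence, drop the rest.
        queue.erase(std::remove(it + 1, queue.end(), new_id), queue.end());
    }
}

int main() {
    std::vector<BufferId> queue{3, 7, 9, 7};
    RetargetDownloads(queue, 7, 9); // {3, 7, 9, 7} -> {3, 9, 9, 9} -> {3, 9}
}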
1171
1172template <class P>
1173void BufferCache<P>::NotifyBufferDeletion() {
1174 if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) {
1175 dirty_uniform_buffers.fill(~u32{0});
1176    }
1177 auto& flags = maxwell3d.dirty.flags;
1178 flags[Dirty::IndexBuffer] = true;
1179 flags[Dirty::VertexBuffers] = true;
1180 for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) {
1181 flags[Dirty::VertexBuffer0 + index] = true;
1182 }
1183 has_deleted_buffers = true;
1184}
1185
1186template <class P>
1187typename BufferCache<P>::Binding BufferCache<P>::StorageBufferBinding(GPUVAddr ssbo_addr) const {
1188 const GPUVAddr gpu_addr = gpu_memory.Read<u64>(ssbo_addr);
1189 const u32 size = gpu_memory.Read<u32>(ssbo_addr + 8);
1190 const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr);
1191 if (!cpu_addr || size == 0) {
1192 return NULL_BINDING;
1193 }
1194 const Binding binding{
1195 .cpu_addr = *cpu_addr,
1196 .size = size,
1197 .buffer_id = BufferId{},
1198 };
1199 return binding;
1200}
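Note (not part of the diff): the storage buffer descriptor read above is a 64-bit GPU address followed by a 32-bit size at offset 8. The standalone example below decodes that layout from a fabricated byte blob, mirroring the two gpu_memory reads:

#include <array>
#include <cstdint>
#include <cstdio>
#include <cstring>

struct SsboDescriptor {
    std::uint64_t gpu_addr;
    std::uint32_t size;
};

SsboDescriptor ReadDescriptor(const std::uint8_t* bytes) {
    SsboDescriptor desc{};
    std::memcpy(&desc.gpu_addr, bytes, sizeof(desc.gpu_addr)); // Read<u64>(ssbo_addr)
    std::memcpy(&desc.size, bytes + 8, sizeof(desc.size));     // Read<u32>(ssbo_addr + 8)
    return desc;
}

int main() {
    std::array<std::uint8_t, 12> bytes{};
    const std::uint64_t gpu_addr = 0x1'2340'0000ULL; // fabricated values
    const std::uint32_t size = 0x1000;
    std::memcpy(bytes.data(), &gpu_addr, sizeof(gpu_addr));
    std::memcpy(bytes.data() + 8, &size, sizeof(size));
    const SsboDescriptor desc = ReadDescriptor(bytes.data());
    std::printf("gpu_addr=0x%llx size=0x%x\n",
                static_cast<unsigned long long>(desc.gpu_addr), desc.size);
}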
1201
1202template <class P>
1203std::span<const u8> BufferCache<P>::ImmediateBufferWithData(VAddr cpu_addr, size_t size) {
1204 u8* const base_pointer = cpu_memory.GetPointer(cpu_addr);
1205 if (IsRangeGranular(cpu_addr, size) ||
1206 base_pointer + size == cpu_memory.GetPointer(cpu_addr + size)) {
1207 return std::span(base_pointer, size);
1208 } else {
1209 const std::span<u8> span = ImmediateBuffer(size);
1210 cpu_memory.ReadBlockUnsafe(cpu_addr, span.data(), size);
1211 return span;
1212 }
1213}
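Note (not part of the diff): ImmediateBufferWithData returns a pointer straight into guest memory when the range is granular (or the backing pointers are contiguous) and only falls back to a scratch copy otherwise. A small sketch of one plausible granularity test, treating a range as granular when it does not cross a host page boundary; the page size and the check itself are assumptions for illustration, not the cache's actual IsRangeGranular.

#include <cstdint>

constexpr std::uint64_t HOST_PAGE_SIZE = 0x1000; // assumed 4 KiB backing pages

constexpr bool IsRangeGranularSketch(std::uint64_t cpu_addr, std::uint64_t size) {
    // True when [cpu_addr, cpu_addr + size) stays inside a single page.
    return (cpu_addr & ~(HOST_PAGE_SIZE - 1)) ==
           ((cpu_addr + size - 1) & ~(HOST_PAGE_SIZE - 1));
}

static_assert(IsRangeGranularSketch(0x1000, 0x800));   // fits in one page
static_assert(!IsRangeGranularSketch(0x1800, 0x1000)); // crosses a boundary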
1214
1215template <class P>
1216std::span<u8> BufferCache<P>::ImmediateBuffer(size_t wanted_capacity) {
1217    if (wanted_capacity > immediate_buffer_capacity) {
1218        immediate_buffer_capacity = wanted_capacity;
1219        immediate_buffer_alloc = std::make_unique<u8[]>(wanted_capacity);
1220    }
1221    return std::span<u8>(immediate_buffer_alloc.get(), wanted_capacity);
1222}
1223
1224template <class P>
1225bool BufferCache<P>::HasFastUniformBufferBound(size_t stage, u32 binding_index) const noexcept {
1226    if constexpr (IS_OPENGL) {
1227        return ((fast_bound_uniform_buffers[stage] >> binding_index) & 1) != 0;
1228    } else {
1229        // Only OpenGL has fast uniform buffers
1230        return false;
1231    }
1232}
1233
1234} // namespace VideoCommon

514            }
515        }
516        return found.get();
517    }
518
519    void MarkRegionAsWritten(VAddr start, VAddr end) {
520        const u64 page_end = end >> WRITE_PAGE_BIT;
521        for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) {
522            if (const auto [it, inserted] = written_pages.emplace(page_start, 1); !inserted) {
523                ++it->second;
524            }
525        }
526    }
527
528    void UnmarkRegionAsWritten(VAddr start, VAddr end) {
529        const u64 page_end = end >> WRITE_PAGE_BIT;
530        for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) {
531            auto it = written_pages.find(page_start);
532            if (it != written_pages.end()) {
533                if (it->second > 1) {
534                    --it->second;
535                } else {
536                    written_pages.erase(it);
537                }
538            }
539        }
540    }
541
542    bool IsRegionWritten(VAddr start, VAddr end) const {
543        const u64 page_end = end >> WRITE_PAGE_BIT;
544        for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) {
545            if (written_pages.contains(page_start)) {
546                return true;
547            }
548        }
549        return false;
550    }
551
552    void QueueDestruction(std::shared_ptr<Buffer> buffer) {
553        buffer->SetEpoch(epoch);
554        pending_destruction.push(std::move(buffer));
555    }
556
557    void MarkForAsyncFlush(MapInterval* map) {
558        if (!uncommitted_flushes) {
559            uncommitted_flushes = std::make_shared<std::unordered_set<MapInterval*>>();
560        }
561        uncommitted_flushes->insert(map);
562    }
563
564    VideoCore::RasterizerInterface& rasterizer;
565    Tegra::MemoryManager& gpu_memory;
566    Core::Memory::Memory& cpu_memory;
567    StreamBuffer& stream_buffer;
568
569    u8* buffer_ptr = nullptr;
570    u64 buffer_offset = 0;
571    u64 buffer_offset_base = 0;
572
573    MapIntervalAllocator mapped_addresses_allocator;
574    boost::intrusive::set<MapInterval, boost::intrusive::compare<MapIntervalCompare>>
575        mapped_addresses;
576
577    std::unordered_map<u64, u32> written_pages;
578    std::unordered_map<u64, std::shared_ptr<Buffer>> blocks;
579
580    std::queue<std::shared_ptr<Buffer>> pending_destruction;
581    u64 epoch = 0;
582    u64 modified_ticks = 0;
583
584    std::vector<u8> staging_buffer;
585
586    std::list<MapInterval*> marked_for_unregister;
587
588    std::shared_ptr<std::unordered_set<MapInterval*>> uncommitted_flushes;
589    std::list<std::shared_ptr<std::list<MapInterval*>>> committed_flushes;
590
591    std::recursive_mutex mutex;
592};
593
594} // namespace VideoCommon
diff --git a/src/video_core/buffer_cache/map_interval.cpp b/src/video_core/buffer_cache/map_interval.cpp
deleted file mode 100644
index 62587e18a..000000000
--- a/src/video_core/buffer_cache/map_interval.cpp
+++ /dev/null
@@ -1,33 +0,0 @@
1// Copyright 2020 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#include <algorithm>
6#include <array>
7#include <cstddef>
8#include <memory>
9
10#include "video_core/buffer_cache/map_interval.h"
11
12namespace VideoCommon {
13
14MapIntervalAllocator::MapIntervalAllocator() {
15 FillFreeList(first_chunk);
16}
17
18MapIntervalAllocator::~MapIntervalAllocator() = default;
19
20void MapIntervalAllocator::AllocateNewChunk() {
21 *new_chunk = std::make_unique<Chunk>();
22 FillFreeList(**new_chunk);
23 new_chunk = &(*new_chunk)->next;
24}
25
26void MapIntervalAllocator::FillFreeList(Chunk& chunk) {
27 const std::size_t old_size = free_list.size();
28 free_list.resize(old_size + chunk.data.size());
29 std::transform(chunk.data.rbegin(), chunk.data.rend(), free_list.begin() + old_size,
30 [](MapInterval& interval) { return &interval; });
31}
32
33} // namespace VideoCommon
diff --git a/src/video_core/buffer_cache/map_interval.h b/src/video_core/buffer_cache/map_interval.h
deleted file mode 100644
index ef974b08a..000000000
--- a/src/video_core/buffer_cache/map_interval.h
+++ /dev/null
@@ -1,93 +0,0 @@
1// Copyright 2019 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#pragma once
6
7#include <array>
8#include <cstddef>
9#include <memory>
10#include <vector>
11
12#include <boost/intrusive/set_hook.hpp>
13
14#include "common/common_types.h"
15#include "video_core/gpu.h"
16
17namespace VideoCommon {
18
19struct MapInterval : public boost::intrusive::set_base_hook<boost::intrusive::optimize_size<true>> {
20 MapInterval() = default;
21
22 /*implicit*/ MapInterval(VAddr start_) noexcept : start{start_} {}
23
24 explicit MapInterval(VAddr start_, VAddr end_, GPUVAddr gpu_addr_) noexcept
25 : start{start_}, end{end_}, gpu_addr{gpu_addr_} {}
26
27 bool IsInside(VAddr other_start, VAddr other_end) const noexcept {
28 return start <= other_start && other_end <= end;
29 }
30
31 bool Overlaps(VAddr other_start, VAddr other_end) const noexcept {
32 return start < other_end && other_start < end;
33 }
34
35 void MarkAsModified(bool is_modified_, u64 ticks_) noexcept {
36 is_modified = is_modified_;
37 ticks = ticks_;
38 }
39
40 boost::intrusive::set_member_hook<> member_hook_;
41 VAddr start = 0;
42 VAddr end = 0;
43 GPUVAddr gpu_addr = 0;
44 u64 ticks = 0;
45 bool is_written = false;
46 bool is_modified = false;
47 bool is_registered = false;
48 bool is_memory_marked = false;
49 bool is_sync_pending = false;
50};
51
52struct MapIntervalCompare {
53 constexpr bool operator()(const MapInterval& lhs, const MapInterval& rhs) const noexcept {
54 return lhs.start < rhs.start;
55 }
56};
57
58class MapIntervalAllocator {
59public:
60 MapIntervalAllocator();
61 ~MapIntervalAllocator();
62
63 MapInterval* Allocate() {
64 if (free_list.empty()) {
65 AllocateNewChunk();
66 }
67 MapInterval* const interval = free_list.back();
68 free_list.pop_back();
69 return interval;
70 }
71
72 void Release(MapInterval* interval) {
73 free_list.push_back(interval);
74 }
75
76private:
77 struct Chunk {
78 std::unique_ptr<Chunk> next;
79 std::array<MapInterval, 0x8000> data;
80 };
81
82 void AllocateNewChunk();
83
84 void FillFreeList(Chunk& chunk);
85
86 std::vector<MapInterval*> free_list;
87
88 Chunk first_chunk;
89
90 std::unique_ptr<Chunk>* new_chunk = &first_chunk.next;
91};
92
93} // namespace VideoCommon
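Note (not part of the removed file): the allocator deleted above is a chunked object pool; intervals live in fixed 0x8000-element chunks that are never individually freed, and allocation just pops a pointer off a free list. A reduced standalone sketch of the same pattern, without the intrusive-set hooks and the chunk-link bookkeeping:

#include <array>
#include <cstddef>
#include <memory>
#include <vector>

template <typename T, std::size_t ChunkSize = 0x8000>
class PoolAllocator {
public:
    T* Allocate() {
        if (free_list.empty()) {
            AllocateNewChunk(); // grow the pool by one chunk
        }
        T* const object = free_list.back();
        free_list.pop_back();
        return object;
    }

    void Release(T* object) {
        free_list.push_back(object); // objects are recycled, never freed
    }

private:
    struct Chunk {
        std::array<T, ChunkSize> data;
    };

    void AllocateNewChunk() {
        Chunk& chunk = *chunks.emplace_back(std::make_unique<Chunk>());
        for (T& object : chunk.data) {
            free_list.push_back(&object);
        }
    }

    std::vector<T*> free_list;
    std::vector<std::unique_ptr<Chunk>> chunks;
};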