path: root/src/video_core/buffer_cache
author     ReinUsesLisp    2021-01-16 20:48:58 -0300
committer  ReinUsesLisp    2021-02-13 02:17:22 -0300
commit     82c2601555b59a94d7160f2fd686cb63d32dd423 (patch)
tree       cd0ecd865945452fa589b572de614fc487f2f96a /src/video_core/buffer_cache
parent     vulkan_common: Expose interop and headless devices (diff)
video_core: Reimplement the buffer cache
Reimplement the buffer cache using cached bindings and page-level granularity for modification tracking. This also drops the usage of shared pointers and virtual functions from the cache.

- Bindings are cached, making it possible to skip work when the game changes only a few bits between draws.
- OpenGL assembly shaders no longer copy when a region has been modified from the GPU to emulate constant buffers; instead, GL_EXT_memory_object is used to alias sub-buffers within the same allocation.
- OpenGL assembly shaders stream constant buffer data using glProgramBufferParametersIuivNV from NV_parameter_buffer_object. In theory this should save one hash table lookup inside the driver compared to glBufferSubData.
- A new OpenGL stream buffer based on fences is implemented for drivers other than Nvidia's proprietary one, due to their low performance on partial glBufferSubData calls synchronized with 3D rendering (which some games use heavily).
- Most optimizations are now shared between APIs, allowing Vulkan to cache more bindings than before and skip unnecessary work.

This commit adds the necessary infrastructure to use Vulkan objects from OpenGL. Overall, it improves performance and fixes some bugs present in the old cache. There are still some edge cases hit by some games that hurt performance on some vendors; these are planned to be fixed in later commits.
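Two of these ideas surface throughout the new buffer_cache.h below: cached buffers are registered in a flat page table at 64 KiB granularity (PAGE_BITS = 16 over a 39-bit guest address space, with slot 0 reserved for the null buffer), and lookups go through that table instead of the interval sets the old cache kept. What follows is a minimal, stand-alone sketch of that lookup with simplified names rather than the actual yuzu classes (the real cache stores the slots in a std::array member):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    using VAddr = std::uint64_t;

    struct BufferId {
        std::uint32_t index = 0;                     // 0 is reserved for the null buffer
        explicit operator bool() const { return index != 0; }
    };

    constexpr std::uint64_t PAGE_BITS = 16;          // 64 KiB pages, unrelated to the CPU page size
    constexpr std::size_t NUM_PAGES = (std::uint64_t{1} << 39) >> PAGE_BITS;

    class PageTable {
    public:
        PageTable() : slots(NUM_PAGES) {}            // ~32 MiB of BufferId slots on the heap

        // Resolve the buffer that owns an address with a single shift and index.
        BufferId Find(VAddr cpu_addr) const {
            return slots[cpu_addr >> PAGE_BITS];
        }

        // Point every 64 KiB page covered by [begin, end) at the owning buffer.
        void Register(BufferId id, VAddr begin, VAddr end) {
            for (VAddr page = begin >> PAGE_BITS; page <= ((end - 1) >> PAGE_BITS); ++page) {
                slots[page] = id;
            }
        }

    private:
        std::vector<BufferId> slots;
    };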
Diffstat (limited to 'src/video_core/buffer_cache')
-rw-r--r--  src/video_core/buffer_cache/buffer_block.h      62
-rw-r--r--  src/video_core/buffer_cache/buffer_cache.cpp     13
-rw-r--r--  src/video_core/buffer_cache/buffer_cache.h     1598
-rw-r--r--  src/video_core/buffer_cache/map_interval.cpp     33
-rw-r--r--  src/video_core/buffer_cache/map_interval.h       93
5 files changed, 1132 insertions, 667 deletions
diff --git a/src/video_core/buffer_cache/buffer_block.h b/src/video_core/buffer_cache/buffer_block.h
deleted file mode 100644
index e9306194a..000000000
--- a/src/video_core/buffer_cache/buffer_block.h
+++ /dev/null
@@ -1,62 +0,0 @@
-// Copyright 2019 yuzu Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#pragma once
-
-#include "common/common_types.h"
-
-namespace VideoCommon {
-
-class BufferBlock {
-public:
-    [[nodiscard]] bool Overlaps(VAddr start, VAddr end) const {
-        return (cpu_addr < end) && (cpu_addr_end > start);
-    }
-
-    [[nodiscard]] bool IsInside(VAddr other_start, VAddr other_end) const {
-        return cpu_addr <= other_start && other_end <= cpu_addr_end;
-    }
-
-    [[nodiscard]] std::size_t Offset(VAddr in_addr) const {
-        return static_cast<std::size_t>(in_addr - cpu_addr);
-    }
-
-    [[nodiscard]] VAddr CpuAddr() const {
-        return cpu_addr;
-    }
-
-    [[nodiscard]] VAddr CpuAddrEnd() const {
-        return cpu_addr_end;
-    }
-
-    void SetCpuAddr(VAddr new_addr) {
-        cpu_addr = new_addr;
-        cpu_addr_end = new_addr + size;
-    }
-
-    [[nodiscard]] std::size_t Size() const {
-        return size;
-    }
-
-    [[nodiscard]] u64 Epoch() const {
-        return epoch;
-    }
-
-    void SetEpoch(u64 new_epoch) {
-        epoch = new_epoch;
-    }
-
-protected:
-    explicit BufferBlock(VAddr cpu_addr_, std::size_t size_) : size{size_} {
-        SetCpuAddr(cpu_addr_);
-    }
-
-private:
-    VAddr cpu_addr{};
-    VAddr cpu_addr_end{};
-    std::size_t size{};
-    u64 epoch{};
-};
-
-} // namespace VideoCommon
diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp
new file mode 100644
index 000000000..ab32294c8
--- /dev/null
+++ b/src/video_core/buffer_cache/buffer_cache.cpp
@@ -0,0 +1,13 @@
+// Copyright 2021 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/microprofile.h"
+
+namespace VideoCommon {
+
+MICROPROFILE_DEFINE(GPU_PrepareBuffers, "GPU", "Prepare buffers", MP_RGB(224, 128, 128));
+MICROPROFILE_DEFINE(GPU_BindUploadBuffers, "GPU", "Bind and upload buffers", MP_RGB(224, 128, 128));
+MICROPROFILE_DEFINE(GPU_DownloadMemory, "GPU", "Download buffers", MP_RGB(224, 128, 128));
+
+} // namespace VideoCommon
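The new buffer_cache.cpp above exists only to give the MicroProfile timers a single definition; the header that follows declares the same names with MICROPROFILE_DECLARE and wraps its hot paths in MICROPROFILE_SCOPE. A rough, hypothetical illustration of that split (the function and file names here are made up; only the macros and the counter name come from the commit):

    // example_header.h -- declaration, so any includer can open a scope on the timer
    #include "common/microprofile.h"

    MICROPROFILE_DECLARE(GPU_PrepareBuffers);

    inline void UpdateGraphicsBuffersExample() {
        MICROPROFILE_SCOPE(GPU_PrepareBuffers);      // timed until the end of this scope
        // ... binding update work would go here ...
    }

    // example_source.cpp -- the one and only definition, with group, label and color
    #include "common/microprofile.h"

    MICROPROFILE_DEFINE(GPU_PrepareBuffers, "GPU", "Prepare buffers", MP_RGB(224, 128, 128));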
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index 83b9ee871..e4f3c8e35 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -4,591 +4,1231 @@
4 4
5#pragma once 5#pragma once
6 6
7#include <list> 7#include <algorithm>
8#include <array>
9#include <deque>
8#include <memory> 10#include <memory>
9#include <mutex> 11#include <mutex>
12#include <span>
10#include <unordered_map> 13#include <unordered_map>
11#include <unordered_set>
12#include <utility>
13#include <vector> 14#include <vector>
14 15
15#include <boost/container/small_vector.hpp> 16#include <boost/container/small_vector.hpp>
16#include <boost/icl/interval_set.hpp>
17#include <boost/intrusive/set.hpp>
18 17
19#include "common/alignment.h"
20#include "common/assert.h"
21#include "common/common_types.h" 18#include "common/common_types.h"
22#include "common/logging/log.h" 19#include "common/div_ceil.h"
23#include "core/core.h" 20#include "common/microprofile.h"
21#include "common/scope_exit.h"
24#include "core/memory.h" 22#include "core/memory.h"
25#include "core/settings.h" 23#include "core/settings.h"
26#include "video_core/buffer_cache/buffer_block.h" 24#include "video_core/buffer_cache/buffer_base.h"
27#include "video_core/buffer_cache/map_interval.h" 25#include "video_core/delayed_destruction_ring.h"
26#include "video_core/dirty_flags.h"
27#include "video_core/engines/kepler_compute.h"
28#include "video_core/engines/maxwell_3d.h"
28#include "video_core/memory_manager.h" 29#include "video_core/memory_manager.h"
29#include "video_core/rasterizer_interface.h" 30#include "video_core/rasterizer_interface.h"
31#include "video_core/texture_cache/slot_vector.h"
32#include "video_core/texture_cache/types.h"
30 33
31namespace VideoCommon { 34namespace VideoCommon {
32 35
33template <typename Buffer, typename BufferType, typename StreamBuffer> 36MICROPROFILE_DECLARE(GPU_PrepareBuffers);
37MICROPROFILE_DECLARE(GPU_BindUploadBuffers);
38MICROPROFILE_DECLARE(GPU_DownloadMemory);
39
40using BufferId = SlotId;
41
42constexpr u32 NUM_VERTEX_BUFFERS = 32;
43constexpr u32 NUM_TRANSFORM_FEEDBACK_BUFFERS = 4;
44constexpr u32 NUM_GRAPHICS_UNIFORM_BUFFERS = 18;
45constexpr u32 NUM_COMPUTE_UNIFORM_BUFFERS = 8;
46constexpr u32 NUM_STORAGE_BUFFERS = 16;
47constexpr u32 NUM_STAGES = 5;
48
49template <typename P>
34class BufferCache { 50class BufferCache {
35 using IntervalSet = boost::icl::interval_set<VAddr>; 51 // Page size for caching purposes.
36 using IntervalType = typename IntervalSet::interval_type; 52 // This is unrelated to the CPU page size and it can be changed as it seems optimal.
37 using VectorMapInterval = boost::container::small_vector<MapInterval*, 1>; 53 static constexpr u32 PAGE_BITS = 16;
54 static constexpr u64 PAGE_SIZE = u64{1} << PAGE_BITS;
38 55
39 static constexpr u64 WRITE_PAGE_BIT = 11; 56 static constexpr bool IS_OPENGL = P::IS_OPENGL;
40 static constexpr u64 BLOCK_PAGE_BITS = 21; 57 static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS =
41 static constexpr u64 BLOCK_PAGE_SIZE = 1ULL << BLOCK_PAGE_BITS; 58 P::HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS;
59 static constexpr bool HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT =
60 P::HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT;
61 static constexpr bool NEEDS_BIND_UNIFORM_INDEX = P::NEEDS_BIND_UNIFORM_INDEX;
62 static constexpr bool NEEDS_BIND_STORAGE_INDEX = P::NEEDS_BIND_STORAGE_INDEX;
63 static constexpr bool USE_MEMORY_MAPS = P::USE_MEMORY_MAPS;
42 64
43public: 65 static constexpr BufferId NULL_BUFFER_ID{0};
44 struct BufferInfo { 66
45 BufferType handle; 67 using Maxwell = Tegra::Engines::Maxwell3D::Regs;
46 u64 offset; 68
47 u64 address; 69 using Runtime = typename P::Runtime;
70 using Buffer = typename P::Buffer;
71
72 struct Empty {};
73
74 struct Binding {
75 VAddr cpu_addr{};
76 u32 size{};
77 BufferId buffer_id;
48 }; 78 };
49 79
50 BufferInfo UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4, 80 static constexpr Binding NULL_BINDING{
51 bool is_written = false, bool use_fast_cbuf = false) { 81 .cpu_addr = 0,
52 std::lock_guard lock{mutex}; 82 .size = 0,
83 .buffer_id = NULL_BUFFER_ID,
84 };
53 85
54 const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); 86public:
55 if (!cpu_addr) { 87 static constexpr size_t SKIP_CACHE_SIZE = 4096;
56 return GetEmptyBuffer(size);
57 }
58 88
59 // Cache management is a big overhead, so only cache entries with a given size. 89 explicit BufferCache(VideoCore::RasterizerInterface& rasterizer_,
60 // TODO: Figure out which size is the best for given games. 90 Tegra::Engines::Maxwell3D& maxwell3d_,
61 constexpr std::size_t max_stream_size = 0x800; 91 Tegra::Engines::KeplerCompute& kepler_compute_,
62 if (use_fast_cbuf || size < max_stream_size) { 92 Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_,
63 if (!is_written && !IsRegionWritten(*cpu_addr, *cpu_addr + size - 1)) { 93 Runtime& runtime_);
64 const bool is_granular = gpu_memory.IsGranularRange(gpu_addr, size);
65 if (use_fast_cbuf) {
66 u8* dest;
67 if (is_granular) {
68 dest = gpu_memory.GetPointer(gpu_addr);
69 } else {
70 staging_buffer.resize(size);
71 dest = staging_buffer.data();
72 gpu_memory.ReadBlockUnsafe(gpu_addr, dest, size);
73 }
74 return ConstBufferUpload(dest, size);
75 }
76 if (is_granular) {
77 u8* const host_ptr = gpu_memory.GetPointer(gpu_addr);
78 return StreamBufferUpload(size, alignment, [host_ptr, size](u8* dest) {
79 std::memcpy(dest, host_ptr, size);
80 });
81 } else {
82 return StreamBufferUpload(size, alignment, [this, gpu_addr, size](u8* dest) {
83 gpu_memory.ReadBlockUnsafe(gpu_addr, dest, size);
84 });
85 }
86 }
87 }
88 94
89 Buffer* const block = GetBlock(*cpu_addr, size); 95 void TickFrame();
90 MapInterval* const map = MapAddress(block, gpu_addr, *cpu_addr, size);
91 if (!map) {
92 return GetEmptyBuffer(size);
93 }
94 if (is_written) {
95 map->MarkAsModified(true, GetModifiedTicks());
96 if (Settings::IsGPULevelHigh() &&
97 Settings::values.use_asynchronous_gpu_emulation.GetValue()) {
98 MarkForAsyncFlush(map);
99 }
100 if (!map->is_written) {
101 map->is_written = true;
102 MarkRegionAsWritten(map->start, map->end - 1);
103 }
104 }
105 96
106 return BufferInfo{block->Handle(), block->Offset(*cpu_addr), block->Address()}; 97 void WriteMemory(VAddr cpu_addr, u64 size);
107 }
108 98
109 /// Uploads from a host memory. Returns the OpenGL buffer where it's located and its offset. 99 void CachedWriteMemory(VAddr cpu_addr, u64 size);
110 BufferInfo UploadHostMemory(const void* raw_pointer, std::size_t size,
111 std::size_t alignment = 4) {
112 std::lock_guard lock{mutex};
113 return StreamBufferUpload(size, alignment, [raw_pointer, size](u8* dest) {
114 std::memcpy(dest, raw_pointer, size);
115 });
116 }
117 100
118 /// Prepares the buffer cache for data uploading 101 void DownloadMemory(VAddr cpu_addr, u64 size);
119 /// @param max_size Maximum number of bytes that will be uploaded
120 /// @return True when a stream buffer invalidation was required, false otherwise
121 void Map(std::size_t max_size) {
122 std::lock_guard lock{mutex};
123 102
124 std::tie(buffer_ptr, buffer_offset_base) = stream_buffer.Map(max_size, 4); 103 void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size);
125 buffer_offset = buffer_offset_base;
126 }
127 104
128 /// Finishes the upload stream 105 void UpdateGraphicsBuffers(bool is_indexed);
129 void Unmap() {
130 std::lock_guard lock{mutex};
131 stream_buffer.Unmap(buffer_offset - buffer_offset_base);
132 }
133 106
134 /// Function called at the end of each frame, intended for deferred operations 107 void UpdateComputeBuffers();
135 void TickFrame() {
136 ++epoch;
137 108
138 while (!pending_destruction.empty()) { 109 void BindHostGeometryBuffers(bool is_indexed);
139 // Delay at least 4 frames before destruction.
140 // This is due to triple buffering happening on some drivers.
141 static constexpr u64 epochs_to_destroy = 5;
142 if (pending_destruction.front()->Epoch() + epochs_to_destroy > epoch) {
143 break;
144 }
145 pending_destruction.pop();
146 }
147 }
148 110
149 /// Write any cached resources overlapping the specified region back to memory 111 void BindHostStageBuffers(size_t stage);
150 void FlushRegion(VAddr addr, std::size_t size) {
151 std::lock_guard lock{mutex};
152 112
153 VectorMapInterval objects = GetMapsInRange(addr, size); 113 void BindHostComputeBuffers();
154 std::sort(objects.begin(), objects.end(),
155 [](MapInterval* lhs, MapInterval* rhs) { return lhs->ticks < rhs->ticks; });
156 for (MapInterval* object : objects) {
157 if (object->is_modified && object->is_registered) {
158 mutex.unlock();
159 FlushMap(object);
160 mutex.lock();
161 }
162 }
163 }
164 114
165 bool MustFlushRegion(VAddr addr, std::size_t size) { 115 void SetEnabledUniformBuffers(size_t stage, u32 enabled);
166 std::lock_guard lock{mutex};
167 116
168 const VectorMapInterval objects = GetMapsInRange(addr, size); 117 void SetEnabledComputeUniformBuffers(u32 enabled);
169 return std::any_of(objects.cbegin(), objects.cend(), [](const MapInterval* map) {
170 return map->is_modified && map->is_registered;
171 });
172 }
173 118
174 /// Mark the specified region as being invalidated 119 void UnbindGraphicsStorageBuffers(size_t stage);
175 void InvalidateRegion(VAddr addr, u64 size) {
176 std::lock_guard lock{mutex};
177 120
178 for (auto& object : GetMapsInRange(addr, size)) { 121 void BindGraphicsStorageBuffer(size_t stage, size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset,
179 if (object->is_registered) { 122 bool is_written);
180 Unregister(object);
181 }
182 }
183 }
184 123
185 void OnCPUWrite(VAddr addr, std::size_t size) { 124 void UnbindComputeStorageBuffers();
186 std::lock_guard lock{mutex};
187 125
188 for (MapInterval* object : GetMapsInRange(addr, size)) { 126 void BindComputeStorageBuffer(size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset,
189 if (object->is_memory_marked && object->is_registered) { 127 bool is_written);
190 UnmarkMemory(object);
191 object->is_sync_pending = true;
192 marked_for_unregister.emplace_back(object);
193 }
194 }
195 }
196 128
197 void SyncGuestHost() { 129 void FlushCachedWrites();
198 std::lock_guard lock{mutex};
199 130
200 for (auto& object : marked_for_unregister) { 131 /// Return true when there are uncommitted buffers to be downloaded
201 if (object->is_registered) { 132 [[nodiscard]] bool HasUncommittedFlushes() const noexcept;
202 object->is_sync_pending = false; 133
203 Unregister(object); 134 /// Return true when the caller should wait for async downloads
204 } 135 [[nodiscard]] bool ShouldWaitAsyncFlushes() const noexcept;
136
137 /// Commit asynchronous downloads
138 void CommitAsyncFlushes();
139
140 /// Pop asynchronous downloads
141 void PopAsyncFlushes();
142
143 /// Return true when a CPU region is modified from the GPU
144 [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size);
145
146 std::mutex mutex;
147
148private:
149 template <typename Func>
150 static void ForEachEnabledBit(u32 enabled_mask, Func&& func) {
151 for (u32 index = 0; enabled_mask != 0; ++index, enabled_mask >>= 1) {
152 const int disabled_bits = std::countr_zero(enabled_mask);
153 index += disabled_bits;
154 enabled_mask >>= disabled_bits;
155 func(index);
205 } 156 }
206 marked_for_unregister.clear();
207 } 157 }
208 158
209 void CommitAsyncFlushes() { 159 template <typename Func>
210 if (uncommitted_flushes) { 160 void ForEachBufferInRange(VAddr cpu_addr, u64 size, Func&& func) {
211 auto commit_list = std::make_shared<std::list<MapInterval*>>(); 161 const u64 page_end = Common::DivCeil(cpu_addr + size, PAGE_SIZE);
212 for (MapInterval* map : *uncommitted_flushes) { 162 for (u64 page = cpu_addr >> PAGE_BITS; page < page_end;) {
213 if (map->is_registered && map->is_modified) { 163 const BufferId buffer_id = page_table[page];
214 // TODO(Blinkhawk): Implement backend asynchronous flushing 164 if (!buffer_id) {
215 // AsyncFlushMap(map) 165 ++page;
216 commit_list->push_back(map); 166 continue;
217 }
218 }
219 if (!commit_list->empty()) {
220 committed_flushes.push_back(commit_list);
221 } else {
222 committed_flushes.emplace_back();
223 } 167 }
224 } else { 168 Buffer& buffer = slot_buffers[buffer_id];
225 committed_flushes.emplace_back(); 169 func(buffer_id, buffer);
170
171 const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes();
172 page = Common::DivCeil(end_addr, PAGE_SIZE);
226 } 173 }
227 uncommitted_flushes.reset();
228 } 174 }
229 175
230 bool ShouldWaitAsyncFlushes() const { 176 static bool IsRangeGranular(VAddr cpu_addr, size_t size) {
231 return !committed_flushes.empty() && committed_flushes.front() != nullptr; 177 return (cpu_addr & ~Core::Memory::PAGE_MASK) ==
178 ((cpu_addr + size) & ~Core::Memory::PAGE_MASK);
232 } 179 }
233 180
234 bool HasUncommittedFlushes() const { 181 void BindHostIndexBuffer();
235 return uncommitted_flushes != nullptr;
236 }
237 182
238 void PopAsyncFlushes() { 183 void BindHostVertexBuffers();
239 if (committed_flushes.empty()) {
240 return;
241 }
242 auto& flush_list = committed_flushes.front();
243 if (!flush_list) {
244 committed_flushes.pop_front();
245 return;
246 }
247 for (MapInterval* map : *flush_list) {
248 if (map->is_registered) {
249 // TODO(Blinkhawk): Replace this for reading the asynchronous flush
250 FlushMap(map);
251 }
252 }
253 committed_flushes.pop_front();
254 }
255 184
256 virtual BufferInfo GetEmptyBuffer(std::size_t size) = 0; 185 void BindHostGraphicsUniformBuffers(size_t stage);
257 186
258protected: 187 void BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 binding_index, bool needs_bind);
259 explicit BufferCache(VideoCore::RasterizerInterface& rasterizer_,
260 Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_,
261 StreamBuffer& stream_buffer_)
262 : rasterizer{rasterizer_}, gpu_memory{gpu_memory_}, cpu_memory{cpu_memory_},
263 stream_buffer{stream_buffer_} {}
264 188
265 ~BufferCache() = default; 189 void BindHostGraphicsStorageBuffers(size_t stage);
266 190
267 virtual std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) = 0; 191 void BindHostTransformFeedbackBuffers();
268 192
269 virtual BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) { 193 void BindHostComputeUniformBuffers();
270 return {};
271 }
272 194
273 /// Register an object into the cache 195 void BindHostComputeStorageBuffers();
274 MapInterval* Register(MapInterval new_map, bool inherit_written = false) {
275 const VAddr cpu_addr = new_map.start;
276 if (!cpu_addr) {
277 LOG_CRITICAL(HW_GPU, "Failed to register buffer with unmapped gpu_address 0x{:016x}",
278 new_map.gpu_addr);
279 return nullptr;
280 }
281 const std::size_t size = new_map.end - new_map.start;
282 new_map.is_registered = true;
283 rasterizer.UpdatePagesCachedCount(cpu_addr, size, 1);
284 new_map.is_memory_marked = true;
285 if (inherit_written) {
286 MarkRegionAsWritten(new_map.start, new_map.end - 1);
287 new_map.is_written = true;
288 }
289 MapInterval* const storage = mapped_addresses_allocator.Allocate();
290 *storage = new_map;
291 mapped_addresses.insert(*storage);
292 return storage;
293 }
294 196
295 void UnmarkMemory(MapInterval* map) { 197 void DoUpdateGraphicsBuffers(bool is_indexed);
296 if (!map->is_memory_marked) { 198
297 return; 199 void DoUpdateComputeBuffers();
298 } 200
299 const std::size_t size = map->end - map->start; 201 void UpdateIndexBuffer();
300 rasterizer.UpdatePagesCachedCount(map->start, size, -1); 202
301 map->is_memory_marked = false; 203 void UpdateVertexBuffers();
302 } 204
303 205 void UpdateVertexBuffer(u32 index);
304 /// Unregisters an object from the cache 206
305 void Unregister(MapInterval* map) { 207 void UpdateUniformBuffers(size_t stage);
306 UnmarkMemory(map); 208
307 map->is_registered = false; 209 void UpdateStorageBuffers(size_t stage);
308 if (map->is_sync_pending) { 210
309 map->is_sync_pending = false; 211 void UpdateTransformFeedbackBuffers();
310 marked_for_unregister.remove(map); 212
213 void UpdateTransformFeedbackBuffer(u32 index);
214
215 void UpdateComputeUniformBuffers();
216
217 void UpdateComputeStorageBuffers();
218
219 void MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 size);
220
221 [[nodiscard]] BufferId FindBuffer(VAddr cpu_addr, u32 size);
222
223 [[nodiscard]] BufferId CreateBuffer(VAddr cpu_addr, u32 wanted_size);
224
225 void Register(BufferId buffer_id);
226
227 void Unregister(BufferId buffer_id);
228
229 template <bool insert>
230 void ChangeRegister(BufferId buffer_id);
231
232 void SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size);
233
234 void SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size);
235
236 void UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy,
237 std::span<BufferCopy> copies);
238
239 void ImmediateUploadMemory(Buffer& buffer, u64 largest_copy,
240 std::span<const BufferCopy> copies);
241
242 void MappedUploadMemory(Buffer& buffer, u64 total_size_bytes,
243 std::span<const BufferCopy> copies);
244
245 void DeleteBuffer(BufferId buffer_id);
246
247 void ReplaceBufferDownloads(BufferId old_buffer_id, BufferId new_buffer_id);
248
249 void NotifyBufferDeletion();
250
251 [[nodiscard]] Binding StorageBufferBinding(GPUVAddr ssbo_addr) const;
252
253 [[nodiscard]] std::span<const u8> ImmediateBufferWithData(VAddr cpu_addr, size_t size);
254
255 [[nodiscard]] std::span<u8> ImmediateBuffer(size_t wanted_capacity);
256
257 [[nodiscard]] bool HasFastUniformBufferBound(size_t stage, u32 binding_index) const noexcept;
258
259 VideoCore::RasterizerInterface& rasterizer;
260 Tegra::Engines::Maxwell3D& maxwell3d;
261 Tegra::Engines::KeplerCompute& kepler_compute;
262 Tegra::MemoryManager& gpu_memory;
263 Core::Memory::Memory& cpu_memory;
264 Runtime& runtime;
265
266 SlotVector<Buffer> slot_buffers;
267 DelayedDestructionRing<Buffer, 8> delayed_destruction_ring;
268
269 u32 last_index_count = 0;
270
271 Binding index_buffer;
272 std::array<Binding, NUM_VERTEX_BUFFERS> vertex_buffers;
273 std::array<std::array<Binding, NUM_GRAPHICS_UNIFORM_BUFFERS>, NUM_STAGES> uniform_buffers;
274 std::array<std::array<Binding, NUM_STORAGE_BUFFERS>, NUM_STAGES> storage_buffers;
275 std::array<Binding, NUM_TRANSFORM_FEEDBACK_BUFFERS> transform_feedback_buffers;
276
277 std::array<Binding, NUM_COMPUTE_UNIFORM_BUFFERS> compute_uniform_buffers;
278 std::array<Binding, NUM_STORAGE_BUFFERS> compute_storage_buffers;
279
280 std::array<u32, NUM_STAGES> enabled_uniform_buffers{};
281 u32 enabled_compute_uniform_buffers = 0;
282
283 std::array<u32, NUM_STAGES> enabled_storage_buffers{};
284 std::array<u32, NUM_STAGES> written_storage_buffers{};
285 u32 enabled_compute_storage_buffers = 0;
286 u32 written_compute_storage_buffers = 0;
287
288 std::array<u32, NUM_STAGES> fast_bound_uniform_buffers{};
289
290 bool has_deleted_buffers = false;
291
292 std::conditional_t<HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS, std::array<u32, NUM_STAGES>, Empty>
293 dirty_uniform_buffers{};
294
295 std::vector<BufferId> cached_write_buffer_ids;
296
297 // TODO: This data structure is not optimal and it should be reworked
298 std::vector<BufferId> uncommitted_downloads;
299 std::deque<std::vector<BufferId>> committed_downloads;
300
301 size_t immediate_buffer_capacity = 0;
302 std::unique_ptr<u8[]> immediate_buffer_alloc;
303
304 std::array<BufferId, ((1ULL << 39) >> PAGE_BITS)> page_table;
305};
306
307template <class P>
308BufferCache<P>::BufferCache(VideoCore::RasterizerInterface& rasterizer_,
309 Tegra::Engines::Maxwell3D& maxwell3d_,
310 Tegra::Engines::KeplerCompute& kepler_compute_,
311 Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_,
312 Runtime& runtime_)
313 : rasterizer{rasterizer_}, maxwell3d{maxwell3d_}, kepler_compute{kepler_compute_},
314 gpu_memory{gpu_memory_}, cpu_memory{cpu_memory_}, runtime{runtime_} {
315 // Ensure the first slot is used for the null buffer
316 void(slot_buffers.insert(runtime, NullBufferParams{}));
317}
318
319template <class P>
320void BufferCache<P>::TickFrame() {
321 delayed_destruction_ring.Tick();
322}
323
324template <class P>
325void BufferCache<P>::WriteMemory(VAddr cpu_addr, u64 size) {
326 ForEachBufferInRange(cpu_addr, size, [&](BufferId, Buffer& buffer) {
327 buffer.MarkRegionAsCpuModified(cpu_addr, size);
328 });
329}
330
331template <class P>
332void BufferCache<P>::CachedWriteMemory(VAddr cpu_addr, u64 size) {
333 ForEachBufferInRange(cpu_addr, size, [&](BufferId buffer_id, Buffer& buffer) {
334 if (!buffer.HasCachedWrites()) {
335 cached_write_buffer_ids.push_back(buffer_id);
311 } 336 }
312 if (map->is_written) { 337 buffer.CachedCpuWrite(cpu_addr, size);
313 UnmarkRegionAsWritten(map->start, map->end - 1); 338 });
339}
340
341template <class P>
342void BufferCache<P>::DownloadMemory(VAddr cpu_addr, u64 size) {
343 ForEachBufferInRange(cpu_addr, size, [&](BufferId, Buffer& buffer) {
344 boost::container::small_vector<BufferCopy, 1> copies;
345 u64 total_size_bytes = 0;
346 u64 largest_copy = 0;
347 buffer.ForEachDownloadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) {
348 copies.push_back(BufferCopy{
349 .src_offset = range_offset,
350 .dst_offset = total_size_bytes,
351 .size = range_size,
352 });
353 total_size_bytes += range_size;
354 largest_copy = std::max(largest_copy, range_size);
355 });
356 if (total_size_bytes == 0) {
357 return;
314 } 358 }
315 const auto it = mapped_addresses.find(*map); 359 MICROPROFILE_SCOPE(GPU_DownloadMemory);
316 ASSERT(it != mapped_addresses.end()); 360
317 mapped_addresses.erase(it); 361 if constexpr (USE_MEMORY_MAPS) {
318 mapped_addresses_allocator.Release(map); 362 auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes);
319 } 363 const u8* const mapped_memory = download_staging.mapped_span.data();
320 364 const std::span<BufferCopy> copies_span(copies.data(), copies.data() + copies.size());
321private: 365 runtime.CopyBuffer(download_staging.buffer, buffer, copies_span);
322 MapInterval* MapAddress(Buffer* block, GPUVAddr gpu_addr, VAddr cpu_addr, std::size_t size) { 366 runtime.Finish();
323 const VectorMapInterval overlaps = GetMapsInRange(cpu_addr, size); 367 for (const BufferCopy& copy : copies) {
324 if (overlaps.empty()) { 368 const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset;
325 const VAddr cpu_addr_end = cpu_addr + size; 369 const u8* copy_mapped_memory = mapped_memory + copy.dst_offset;
326 if (gpu_memory.IsGranularRange(gpu_addr, size)) { 370 cpu_memory.WriteBlockUnsafe(copy_cpu_addr, copy_mapped_memory, copy.size);
327 u8* const host_ptr = gpu_memory.GetPointer(gpu_addr);
328 block->Upload(block->Offset(cpu_addr), size, host_ptr);
329 } else {
330 staging_buffer.resize(size);
331 gpu_memory.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size);
332 block->Upload(block->Offset(cpu_addr), size, staging_buffer.data());
333 } 371 }
334 return Register(MapInterval(cpu_addr, cpu_addr_end, gpu_addr)); 372 } else {
335 } 373 const std::span<u8> immediate_buffer = ImmediateBuffer(largest_copy);
336 374 for (const BufferCopy& copy : copies) {
337 const VAddr cpu_addr_end = cpu_addr + size; 375 buffer.ImmediateDownload(copy.src_offset, immediate_buffer.subspan(0, copy.size));
338 if (overlaps.size() == 1) { 376 const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset;
339 MapInterval* const current_map = overlaps[0]; 377 cpu_memory.WriteBlockUnsafe(copy_cpu_addr, immediate_buffer.data(), copy.size);
340 if (current_map->IsInside(cpu_addr, cpu_addr_end)) {
341 return current_map;
342 } 378 }
343 } 379 }
344 VAddr new_start = cpu_addr; 380 });
345 VAddr new_end = cpu_addr_end; 381}
346 bool write_inheritance = false; 382
347 bool modified_inheritance = false; 383template <class P>
348 // Calculate new buffer parameters 384void BufferCache<P>::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr,
349 for (MapInterval* overlap : overlaps) { 385 u32 size) {
350 new_start = std::min(overlap->start, new_start); 386 const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr);
351 new_end = std::max(overlap->end, new_end); 387 if (!cpu_addr) {
352 write_inheritance |= overlap->is_written; 388 uniform_buffers[stage][index] = NULL_BINDING;
353 modified_inheritance |= overlap->is_modified; 389 return;
390 }
391 const Binding binding{
392 .cpu_addr = *cpu_addr,
393 .size = size,
394 .buffer_id = BufferId{},
395 };
396 uniform_buffers[stage][index] = binding;
397}
398
399template <class P>
400void BufferCache<P>::UpdateGraphicsBuffers(bool is_indexed) {
401 MICROPROFILE_SCOPE(GPU_PrepareBuffers);
402 do {
403 has_deleted_buffers = false;
404 DoUpdateGraphicsBuffers(is_indexed);
405 } while (has_deleted_buffers);
406}
407
408template <class P>
409void BufferCache<P>::UpdateComputeBuffers() {
410 MICROPROFILE_SCOPE(GPU_PrepareBuffers);
411 do {
412 has_deleted_buffers = false;
413 DoUpdateComputeBuffers();
414 } while (has_deleted_buffers);
415}
416
417template <class P>
418void BufferCache<P>::BindHostGeometryBuffers(bool is_indexed) {
419 MICROPROFILE_SCOPE(GPU_BindUploadBuffers);
420 if (is_indexed) {
421 BindHostIndexBuffer();
422 } else if constexpr (!HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT) {
423 const auto& regs = maxwell3d.regs;
424 if (regs.draw.topology == Maxwell::PrimitiveTopology::Quads) {
425 runtime.BindQuadArrayIndexBuffer(regs.vertex_buffer.first, regs.vertex_buffer.count);
354 } 426 }
355 GPUVAddr new_gpu_addr = gpu_addr + new_start - cpu_addr; 427 }
356 for (auto& overlap : overlaps) { 428 BindHostVertexBuffers();
357 Unregister(overlap); 429 BindHostTransformFeedbackBuffers();
430}
431
432template <class P>
433void BufferCache<P>::BindHostStageBuffers(size_t stage) {
434 MICROPROFILE_SCOPE(GPU_BindUploadBuffers);
435 BindHostGraphicsUniformBuffers(stage);
436 BindHostGraphicsStorageBuffers(stage);
437}
438
439template <class P>
440void BufferCache<P>::BindHostComputeBuffers() {
441 MICROPROFILE_SCOPE(GPU_BindUploadBuffers);
442 BindHostComputeUniformBuffers();
443 BindHostComputeStorageBuffers();
444}
445
446template <class P>
447void BufferCache<P>::SetEnabledUniformBuffers(size_t stage, u32 enabled) {
448 if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) {
449 if (enabled_uniform_buffers[stage] != enabled) {
450 dirty_uniform_buffers[stage] = ~u32{0};
358 } 451 }
359 UpdateBlock(block, new_start, new_end, overlaps); 452 }
360 453 enabled_uniform_buffers[stage] = enabled;
361 const MapInterval new_map{new_start, new_end, new_gpu_addr}; 454}
362 MapInterval* const map = Register(new_map, write_inheritance); 455
363 if (!map) { 456template <class P>
364 return nullptr; 457void BufferCache<P>::SetEnabledComputeUniformBuffers(u32 enabled) {
458 enabled_compute_uniform_buffers = enabled;
459}
460
461template <class P>
462void BufferCache<P>::UnbindGraphicsStorageBuffers(size_t stage) {
463 enabled_storage_buffers[stage] = 0;
464 written_storage_buffers[stage] = 0;
465}
466
467template <class P>
468void BufferCache<P>::BindGraphicsStorageBuffer(size_t stage, size_t ssbo_index, u32 cbuf_index,
469 u32 cbuf_offset, bool is_written) {
470 enabled_storage_buffers[stage] |= 1U << ssbo_index;
471 written_storage_buffers[stage] |= (is_written ? 1U : 0U) << ssbo_index;
472
473 const auto& cbufs = maxwell3d.state.shader_stages[stage];
474 const GPUVAddr ssbo_addr = cbufs.const_buffers[cbuf_index].address + cbuf_offset;
475 storage_buffers[stage][ssbo_index] = StorageBufferBinding(ssbo_addr);
476}
477
478template <class P>
479void BufferCache<P>::UnbindComputeStorageBuffers() {
480 enabled_compute_storage_buffers = 0;
481 written_compute_storage_buffers = 0;
482}
483
484template <class P>
485void BufferCache<P>::BindComputeStorageBuffer(size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset,
486 bool is_written) {
487 enabled_compute_storage_buffers |= 1U << ssbo_index;
488 written_compute_storage_buffers |= (is_written ? 1U : 0U) << ssbo_index;
489
490 const auto& launch_desc = kepler_compute.launch_description;
491 ASSERT(((launch_desc.const_buffer_enable_mask >> cbuf_index) & 1) != 0);
492
493 const auto& cbufs = launch_desc.const_buffer_config;
494 const GPUVAddr ssbo_addr = cbufs[cbuf_index].Address() + cbuf_offset;
495 compute_storage_buffers[ssbo_index] = StorageBufferBinding(ssbo_addr);
496}
497
498template <class P>
499void BufferCache<P>::FlushCachedWrites() {
500 for (const BufferId buffer_id : cached_write_buffer_ids) {
501 slot_buffers[buffer_id].FlushCachedWrites();
502 }
503 cached_write_buffer_ids.clear();
504}
505
506template <class P>
507bool BufferCache<P>::HasUncommittedFlushes() const noexcept {
508 return !uncommitted_downloads.empty();
509}
510
511template <class P>
512bool BufferCache<P>::ShouldWaitAsyncFlushes() const noexcept {
513 return !committed_downloads.empty() && !committed_downloads.front().empty();
514}
515
516template <class P>
517void BufferCache<P>::CommitAsyncFlushes() {
518 // This is intentionally passing the value by copy
519 committed_downloads.push_front(uncommitted_downloads);
520 uncommitted_downloads.clear();
521}
522
523template <class P>
524void BufferCache<P>::PopAsyncFlushes() {
525 if (committed_downloads.empty()) {
526 return;
527 }
528 auto scope_exit_pop_download = detail::ScopeExit([this] { committed_downloads.pop_back(); });
529 const std::span<const BufferId> download_ids = committed_downloads.back();
530 if (download_ids.empty()) {
531 return;
532 }
533 MICROPROFILE_SCOPE(GPU_DownloadMemory);
534
535 boost::container::small_vector<std::pair<BufferCopy, BufferId>, 1> downloads;
536 u64 total_size_bytes = 0;
537 u64 largest_copy = 0;
538 for (const BufferId buffer_id : download_ids) {
539 slot_buffers[buffer_id].ForEachDownloadRange([&](u64 range_offset, u64 range_size) {
540 downloads.push_back({
541 BufferCopy{
542 .src_offset = range_offset,
543 .dst_offset = total_size_bytes,
544 .size = range_size,
545 },
546 buffer_id,
547 });
548 total_size_bytes += range_size;
549 largest_copy = std::max(largest_copy, range_size);
550 });
551 }
552 if (downloads.empty()) {
553 return;
554 }
555 if constexpr (USE_MEMORY_MAPS) {
556 auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes);
557 for (const auto [copy, buffer_id] : downloads) {
558 const std::array copies{copy};
559 runtime.CopyBuffer(download_staging.buffer, slot_buffers[buffer_id], copies);
365 } 560 }
366 if (modified_inheritance) { 561 runtime.Finish();
367 map->MarkAsModified(true, GetModifiedTicks()); 562 for (const auto [copy, buffer_id] : downloads) {
368 if (Settings::IsGPULevelHigh() && 563 const Buffer& buffer = slot_buffers[buffer_id];
369 Settings::values.use_asynchronous_gpu_emulation.GetValue()) { 564 const VAddr cpu_addr = buffer.CpuAddr() + copy.src_offset;
370 MarkForAsyncFlush(map); 565 const u8* read_mapped_memory = download_staging.mapped_span.data() + copy.dst_offset;
371 } 566 cpu_memory.WriteBlockUnsafe(cpu_addr, read_mapped_memory, copy.size);
567 }
568 } else {
569 const std::span<u8> immediate_buffer = ImmediateBuffer(largest_copy);
570 for (const auto [copy, buffer_id] : downloads) {
571 Buffer& buffer = slot_buffers[buffer_id];
572 buffer.ImmediateDownload(copy.src_offset, immediate_buffer.subspan(0, copy.size));
573 const VAddr cpu_addr = buffer.CpuAddr() + copy.src_offset;
574 cpu_memory.WriteBlockUnsafe(cpu_addr, immediate_buffer.data(), copy.size);
372 } 575 }
373 return map;
374 } 576 }
375 577}
376 void UpdateBlock(Buffer* block, VAddr start, VAddr end, const VectorMapInterval& overlaps) { 578
377 const IntervalType base_interval{start, end}; 579template <class P>
378 IntervalSet interval_set{}; 580bool BufferCache<P>::IsRegionGpuModified(VAddr addr, size_t size) {
379 interval_set.add(base_interval); 581 const u64 page_end = Common::DivCeil(addr + size, PAGE_SIZE);
380 for (auto& overlap : overlaps) { 582 for (u64 page = addr >> PAGE_BITS; page < page_end;) {
381 const IntervalType subtract{overlap->start, overlap->end}; 583 const BufferId image_id = page_table[page];
382 interval_set.subtract(subtract); 584 if (!image_id) {
585 ++page;
586 continue;
383 } 587 }
384 for (auto& interval : interval_set) { 588 Buffer& buffer = slot_buffers[image_id];
385 const std::size_t size = interval.upper() - interval.lower(); 589 if (buffer.IsRegionGpuModified(addr, size)) {
386 if (size == 0) { 590 return true;
387 continue;
388 }
389 staging_buffer.resize(size);
390 cpu_memory.ReadBlockUnsafe(interval.lower(), staging_buffer.data(), size);
391 block->Upload(block->Offset(interval.lower()), size, staging_buffer.data());
392 } 591 }
592 const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes();
593 page = Common::DivCeil(end_addr, PAGE_SIZE);
393 } 594 }
394 595 return false;
395 VectorMapInterval GetMapsInRange(VAddr addr, std::size_t size) { 596}
396 VectorMapInterval result; 597
397 if (size == 0) { 598template <class P>
398 return result; 599void BufferCache<P>::BindHostIndexBuffer() {
600 Buffer& buffer = slot_buffers[index_buffer.buffer_id];
601 const u32 offset = buffer.Offset(index_buffer.cpu_addr);
602 const u32 size = index_buffer.size;
603 SynchronizeBuffer(buffer, index_buffer.cpu_addr, size);
604 if constexpr (HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT) {
605 runtime.BindIndexBuffer(buffer, offset, size);
606 } else {
607 runtime.BindIndexBuffer(maxwell3d.regs.draw.topology, maxwell3d.regs.index_array.format,
608 maxwell3d.regs.index_array.first, maxwell3d.regs.index_array.count,
609 buffer, offset, size);
610 }
611}
612
613template <class P>
614void BufferCache<P>::BindHostVertexBuffers() {
615 auto& flags = maxwell3d.dirty.flags;
616 for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) {
617 const Binding& binding = vertex_buffers[index];
618 Buffer& buffer = slot_buffers[binding.buffer_id];
619 SynchronizeBuffer(buffer, binding.cpu_addr, binding.size);
620 if (!flags[Dirty::VertexBuffer0 + index]) {
621 continue;
399 } 622 }
623 flags[Dirty::VertexBuffer0 + index] = false;
624
625 const u32 stride = maxwell3d.regs.vertex_array[index].stride;
626 const u32 offset = buffer.Offset(binding.cpu_addr);
627 runtime.BindVertexBuffer(index, buffer, offset, binding.size, stride);
628 }
629}
400 630
401 const VAddr addr_end = addr + size; 631template <class P>
402 auto it = mapped_addresses.lower_bound(addr); 632void BufferCache<P>::BindHostGraphicsUniformBuffers(size_t stage) {
403 if (it != mapped_addresses.begin()) { 633 u32 dirty = ~0U;
404 --it; 634 if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) {
635 dirty = std::exchange(dirty_uniform_buffers[stage], 0);
636 }
637 u32 binding_index = 0;
638 ForEachEnabledBit(enabled_uniform_buffers[stage], [&](u32 index) {
639 const bool needs_bind = ((dirty >> index) & 1) != 0;
640 BindHostGraphicsUniformBuffer(stage, index, binding_index, needs_bind);
641 if constexpr (NEEDS_BIND_UNIFORM_INDEX) {
642 ++binding_index;
405 } 643 }
406 while (it != mapped_addresses.end() && it->start < addr_end) { 644 });
407 if (it->Overlaps(addr, addr_end)) { 645}
408 result.push_back(&*it); 646
647template <class P>
648void BufferCache<P>::BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 binding_index,
649 bool needs_bind) {
650 const Binding& binding = uniform_buffers[stage][index];
651 const VAddr cpu_addr = binding.cpu_addr;
652 const u32 size = binding.size;
653 Buffer& buffer = slot_buffers[binding.buffer_id];
654 if constexpr (IS_OPENGL) {
655 if (size <= SKIP_CACHE_SIZE && !buffer.IsRegionGpuModified(cpu_addr, size)) {
656 if (runtime.HasFastBufferSubData()) {
657 // Fast path for Nvidia
658 if (!HasFastUniformBufferBound(stage, binding_index)) {
659 // We only have to bind when the currently bound buffer is not the fast version
660 fast_bound_uniform_buffers[stage] |= 1U << binding_index;
661 runtime.BindFastUniformBuffer(stage, binding_index, size);
662 }
663 const auto span = ImmediateBufferWithData(cpu_addr, size);
664 runtime.PushFastUniformBuffer(stage, binding_index, span);
665 } else {
666 // Stream buffer path to avoid stalling on non-Nvidia drivers
667 const auto span = runtime.BindMappedUniformBuffer(stage, binding_index, size);
668 cpu_memory.ReadBlockUnsafe(cpu_addr, span.data(), size);
409 } 669 }
410 ++it; 670 return;
411 } 671 }
412 return result;
413 } 672 }
414 673 // Classic cached path
415 /// Returns a ticks counter used for tracking when cached objects were last modified 674 SynchronizeBuffer(buffer, cpu_addr, size);
416 u64 GetModifiedTicks() { 675 if (!needs_bind && !HasFastUniformBufferBound(stage, binding_index)) {
417 return ++modified_ticks; 676 // Skip binding if it's not needed and if the bound buffer is not the fast version
677 // This exists to avoid instances where the fast buffer is bound and a GPU write happens
678 return;
418 } 679 }
680 fast_bound_uniform_buffers[stage] &= ~(1U << binding_index);
419 681
420 void FlushMap(MapInterval* map) { 682 const u32 offset = buffer.Offset(cpu_addr);
421 const auto it = blocks.find(map->start >> BLOCK_PAGE_BITS); 683 if constexpr (NEEDS_BIND_UNIFORM_INDEX) {
422 ASSERT_OR_EXECUTE(it != blocks.end(), return;); 684 runtime.BindUniformBuffer(stage, binding_index, buffer, offset, size);
423 685 } else {
424 std::shared_ptr<Buffer> block = it->second; 686 runtime.BindUniformBuffer(buffer, offset, size);
425
426 const std::size_t size = map->end - map->start;
427 staging_buffer.resize(size);
428 block->Download(block->Offset(map->start), size, staging_buffer.data());
429 cpu_memory.WriteBlockUnsafe(map->start, staging_buffer.data(), size);
430 map->MarkAsModified(false, 0);
431 } 687 }
688}
689
690template <class P>
691void BufferCache<P>::BindHostGraphicsStorageBuffers(size_t stage) {
692 u32 binding_index = 0;
693 ForEachEnabledBit(enabled_storage_buffers[stage], [&](u32 index) {
694 const Binding& binding = storage_buffers[stage][index];
695 Buffer& buffer = slot_buffers[binding.buffer_id];
696 const u32 size = binding.size;
697 SynchronizeBuffer(buffer, binding.cpu_addr, size);
698
699 const u32 offset = buffer.Offset(binding.cpu_addr);
700 const bool is_written = ((written_storage_buffers[stage] >> index) & 1) != 0;
701 if constexpr (NEEDS_BIND_STORAGE_INDEX) {
702 runtime.BindStorageBuffer(stage, binding_index, buffer, offset, size, is_written);
703 ++binding_index;
704 } else {
705 runtime.BindStorageBuffer(buffer, offset, size, is_written);
706 }
707 });
708}
432 709
433 template <typename Callable> 710template <class P>
434 BufferInfo StreamBufferUpload(std::size_t size, std::size_t alignment, Callable&& callable) { 711void BufferCache<P>::BindHostTransformFeedbackBuffers() {
435 AlignBuffer(alignment); 712 if (maxwell3d.regs.tfb_enabled == 0) {
436 const std::size_t uploaded_offset = buffer_offset; 713 return;
437 callable(buffer_ptr);
438
439 buffer_ptr += size;
440 buffer_offset += size;
441 return BufferInfo{stream_buffer.Handle(), uploaded_offset, stream_buffer.Address()};
442 } 714 }
443 715 for (u32 index = 0; index < NUM_TRANSFORM_FEEDBACK_BUFFERS; ++index) {
444 void AlignBuffer(std::size_t alignment) { 716 const Binding& binding = transform_feedback_buffers[index];
445 // Align the offset, not the mapped pointer 717 Buffer& buffer = slot_buffers[binding.buffer_id];
446 const std::size_t offset_aligned = Common::AlignUp(buffer_offset, alignment); 718 const u32 size = binding.size;
447 buffer_ptr += offset_aligned - buffer_offset; 719 SynchronizeBuffer(buffer, binding.cpu_addr, size);
448 buffer_offset = offset_aligned; 720
721 const u32 offset = buffer.Offset(binding.cpu_addr);
722 runtime.BindTransformFeedbackBuffer(index, buffer, offset, size);
449 } 723 }
724}
450 725
451 std::shared_ptr<Buffer> EnlargeBlock(std::shared_ptr<Buffer> buffer) { 726template <class P>
452 const std::size_t old_size = buffer->Size(); 727void BufferCache<P>::BindHostComputeUniformBuffers() {
453 const std::size_t new_size = old_size + BLOCK_PAGE_SIZE; 728 if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) {
454 const VAddr cpu_addr = buffer->CpuAddr(); 729 // Mark all uniform buffers as dirty
455 std::shared_ptr<Buffer> new_buffer = CreateBlock(cpu_addr, new_size); 730 dirty_uniform_buffers.fill(~u32{0});
456 new_buffer->CopyFrom(*buffer, 0, 0, old_size); 731 }
457 QueueDestruction(std::move(buffer)); 732 u32 binding_index = 0;
458 733 ForEachEnabledBit(enabled_compute_uniform_buffers, [&](u32 index) {
459 const VAddr cpu_addr_end = cpu_addr + new_size - 1; 734 const Binding& binding = compute_uniform_buffers[index];
460 const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS; 735 Buffer& buffer = slot_buffers[binding.buffer_id];
461 for (u64 page_start = cpu_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) { 736 const u32 size = binding.size;
462 blocks.insert_or_assign(page_start, new_buffer); 737 SynchronizeBuffer(buffer, binding.cpu_addr, size);
738
739 const u32 offset = buffer.Offset(binding.cpu_addr);
740 if constexpr (NEEDS_BIND_UNIFORM_INDEX) {
741 runtime.BindComputeUniformBuffer(binding_index, buffer, offset, size);
742 ++binding_index;
743 } else {
744 runtime.BindUniformBuffer(buffer, offset, size);
463 } 745 }
746 });
747}
748
749template <class P>
750void BufferCache<P>::BindHostComputeStorageBuffers() {
751 u32 binding_index = 0;
752 ForEachEnabledBit(enabled_compute_storage_buffers, [&](u32 index) {
753 const Binding& binding = compute_storage_buffers[index];
754 Buffer& buffer = slot_buffers[binding.buffer_id];
755 const u32 size = binding.size;
756 SynchronizeBuffer(buffer, binding.cpu_addr, size);
757
758 const u32 offset = buffer.Offset(binding.cpu_addr);
759 const bool is_written = ((written_compute_storage_buffers >> index) & 1) != 0;
760 if constexpr (NEEDS_BIND_STORAGE_INDEX) {
761 runtime.BindComputeStorageBuffer(binding_index, buffer, offset, size, is_written);
762 ++binding_index;
763 } else {
764 runtime.BindStorageBuffer(buffer, offset, size, is_written);
765 }
766 });
767}
464 768
465 return new_buffer; 769template <class P>
770void BufferCache<P>::DoUpdateGraphicsBuffers(bool is_indexed) {
771 if (is_indexed) {
772 UpdateIndexBuffer();
466 } 773 }
774 UpdateVertexBuffers();
775 UpdateTransformFeedbackBuffers();
776 for (size_t stage = 0; stage < NUM_STAGES; ++stage) {
777 UpdateUniformBuffers(stage);
778 UpdateStorageBuffers(stage);
779 }
780}
781
782template <class P>
783void BufferCache<P>::DoUpdateComputeBuffers() {
784 UpdateComputeUniformBuffers();
785 UpdateComputeStorageBuffers();
786}
787
788template <class P>
789void BufferCache<P>::UpdateIndexBuffer() {
790 // We have to check for the dirty flags and index count
791 // The index count is currently changed without updating the dirty flags
792 const auto& index_array = maxwell3d.regs.index_array;
793 auto& flags = maxwell3d.dirty.flags;
794 if (!flags[Dirty::IndexBuffer] && last_index_count == index_array.count) {
795 return;
796 }
797 flags[Dirty::IndexBuffer] = false;
798 last_index_count = index_array.count;
799
800 const GPUVAddr gpu_addr_begin = index_array.StartAddress();
801 const GPUVAddr gpu_addr_end = index_array.EndAddress();
802 const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr_begin);
803 const u32 address_size = static_cast<u32>(gpu_addr_end - gpu_addr_begin);
804 const u32 draw_size = index_array.count * index_array.FormatSizeInBytes();
805 const u32 size = std::min(address_size, draw_size);
806 if (size == 0 || !cpu_addr) {
807 index_buffer = NULL_BINDING;
808 return;
809 }
810 index_buffer = Binding{
811 .cpu_addr = *cpu_addr,
812 .size = size,
813 .buffer_id = FindBuffer(*cpu_addr, size),
814 };
815}
467 816
468 std::shared_ptr<Buffer> MergeBlocks(std::shared_ptr<Buffer> first, 817template <class P>
469 std::shared_ptr<Buffer> second) { 818void BufferCache<P>::UpdateVertexBuffers() {
470 const std::size_t size_1 = first->Size(); 819 auto& flags = maxwell3d.dirty.flags;
471 const std::size_t size_2 = second->Size(); 820 if (!maxwell3d.dirty.flags[Dirty::VertexBuffers]) {
472 const VAddr first_addr = first->CpuAddr(); 821 return;
473 const VAddr second_addr = second->CpuAddr(); 822 }
474 const VAddr new_addr = std::min(first_addr, second_addr); 823 flags[Dirty::VertexBuffers] = false;
475 const std::size_t new_size = size_1 + size_2;
476
477 std::shared_ptr<Buffer> new_buffer = CreateBlock(new_addr, new_size);
478 new_buffer->CopyFrom(*first, 0, new_buffer->Offset(first_addr), size_1);
479 new_buffer->CopyFrom(*second, 0, new_buffer->Offset(second_addr), size_2);
480 QueueDestruction(std::move(first));
481 QueueDestruction(std::move(second));
482 824
483 const VAddr cpu_addr_end = new_addr + new_size - 1; 825 for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) {
484 const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS; 826 UpdateVertexBuffer(index);
485 for (u64 page_start = new_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) {
486 blocks.insert_or_assign(page_start, new_buffer);
487 }
488 return new_buffer;
489 } 827 }
828}
490 829
491 Buffer* GetBlock(VAddr cpu_addr, std::size_t size) { 830template <class P>
492 std::shared_ptr<Buffer> found; 831void BufferCache<P>::UpdateVertexBuffer(u32 index) {
832 if (!maxwell3d.dirty.flags[Dirty::VertexBuffer0 + index]) {
833 return;
834 }
835 const auto& array = maxwell3d.regs.vertex_array[index];
836 const auto& limit = maxwell3d.regs.vertex_array_limit[index];
837 const GPUVAddr gpu_addr_begin = array.StartAddress();
838 const GPUVAddr gpu_addr_end = limit.LimitAddress() + 1;
839 const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr_begin);
840 const u32 address_size = static_cast<u32>(gpu_addr_end - gpu_addr_begin);
841 const u32 size = address_size; // TODO: Analyze stride and number of vertices
842 if (array.enable == 0 || size == 0 || !cpu_addr) {
843 vertex_buffers[index] = NULL_BINDING;
844 return;
845 }
846 vertex_buffers[index] = Binding{
847 .cpu_addr = *cpu_addr,
848 .size = size,
849 .buffer_id = FindBuffer(*cpu_addr, size),
850 };
851}
852
853template <class P>
854void BufferCache<P>::UpdateUniformBuffers(size_t stage) {
855 ForEachEnabledBit(enabled_uniform_buffers[stage], [&](u32 index) {
856 Binding& binding = uniform_buffers[stage][index];
857 if (binding.buffer_id) {
858 // Already updated
859 return;
860 }
861 // Mark as dirty
862 if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) {
863 dirty_uniform_buffers[stage] |= 1U << index;
864 }
865 // Resolve buffer
866 binding.buffer_id = FindBuffer(binding.cpu_addr, binding.size);
867 });
868}
869
870template <class P>
871void BufferCache<P>::UpdateStorageBuffers(size_t stage) {
872 const u32 written_mask = written_storage_buffers[stage];
873 ForEachEnabledBit(enabled_storage_buffers[stage], [&](u32 index) {
874 // Resolve buffer
875 Binding& binding = storage_buffers[stage][index];
876 const BufferId buffer_id = FindBuffer(binding.cpu_addr, binding.size);
877 binding.buffer_id = buffer_id;
878 // Mark buffer as written if needed
879 if (((written_mask >> index) & 1) != 0) {
880 MarkWrittenBuffer(buffer_id, binding.cpu_addr, binding.size);
881 }
882 });
883}
493 884
494 const VAddr cpu_addr_end = cpu_addr + size - 1; 885template <class P>
495 const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS; 886void BufferCache<P>::UpdateTransformFeedbackBuffers() {
496 for (u64 page_start = cpu_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) { 887 if (maxwell3d.regs.tfb_enabled == 0) {
497 auto it = blocks.find(page_start); 888 return;
498 if (it == blocks.end()) { 889 }
499 if (found) { 890 for (u32 index = 0; index < NUM_TRANSFORM_FEEDBACK_BUFFERS; ++index) {
500 found = EnlargeBlock(found); 891 UpdateTransformFeedbackBuffer(index);
501 continue; 892 }
502 } 893}
503 const VAddr start_addr = page_start << BLOCK_PAGE_BITS; 894
504 found = CreateBlock(start_addr, BLOCK_PAGE_SIZE); 895template <class P>
505 blocks.insert_or_assign(page_start, found); 896void BufferCache<P>::UpdateTransformFeedbackBuffer(u32 index) {
506 continue; 897 const auto& binding = maxwell3d.regs.tfb_bindings[index];
507 } 898 const GPUVAddr gpu_addr = binding.Address() + binding.buffer_offset;
508 if (!found) { 899 const u32 size = binding.buffer_size;
509 found = it->second; 900 const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr);
510 continue; 901 if (binding.buffer_enable == 0 || size == 0 || !cpu_addr) {
511 } 902 transform_feedback_buffers[index] = NULL_BINDING;
512 if (found != it->second) { 903 return;
513 found = MergeBlocks(std::move(found), it->second); 904 }
905 const BufferId buffer_id = FindBuffer(*cpu_addr, size);
906 transform_feedback_buffers[index] = Binding{
907 .cpu_addr = *cpu_addr,
908 .size = size,
909 .buffer_id = buffer_id,
910 };
911 MarkWrittenBuffer(buffer_id, *cpu_addr, size);
912}
913
914template <class P>
915void BufferCache<P>::UpdateComputeUniformBuffers() {
916 ForEachEnabledBit(enabled_compute_uniform_buffers, [&](u32 index) {
917 Binding& binding = compute_uniform_buffers[index];
918 binding = NULL_BINDING;
919 const auto& launch_desc = kepler_compute.launch_description;
920 if (((launch_desc.const_buffer_enable_mask >> index) & 1) != 0) {
921 const auto& cbuf = launch_desc.const_buffer_config[index];
922 const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(cbuf.Address());
923 if (cpu_addr) {
924 binding.cpu_addr = *cpu_addr;
925 binding.size = cbuf.size;
514 } 926 }
515 } 927 }
516 return found.get(); 928 binding.buffer_id = FindBuffer(binding.cpu_addr, binding.size);
929 });
930}
931
932template <class P>
933void BufferCache<P>::UpdateComputeStorageBuffers() {
934 ForEachEnabledBit(enabled_compute_storage_buffers, [&](u32 index) {
935 // Resolve buffer
936 Binding& binding = compute_storage_buffers[index];
937 const BufferId buffer_id = FindBuffer(binding.cpu_addr, binding.size);
938 binding.buffer_id = buffer_id;
939 // Mark as written if needed
940 if (((written_compute_storage_buffers >> index) & 1) != 0) {
941 MarkWrittenBuffer(buffer_id, binding.cpu_addr, binding.size);
942 }
943 });
944}
945
946template <class P>
947void BufferCache<P>::MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 size) {
948 Buffer& buffer = slot_buffers[buffer_id];
949 buffer.MarkRegionAsGpuModified(cpu_addr, size);
950
951 const bool is_accuracy_high = Settings::IsGPULevelHigh();
952 const bool is_async = Settings::values.use_asynchronous_gpu_emulation.GetValue();
953 if (!is_accuracy_high || !is_async) {
954 return;
517 } 955 }
956 if (std::ranges::find(uncommitted_downloads, buffer_id) != uncommitted_downloads.end()) {
957 // Already inserted
958 return;
959 }
960 uncommitted_downloads.push_back(buffer_id);
961}
518 962
519 void MarkRegionAsWritten(VAddr start, VAddr end) { 963template <class P>
520 const u64 page_end = end >> WRITE_PAGE_BIT; 964BufferId BufferCache<P>::FindBuffer(VAddr cpu_addr, u32 size) {
521 for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) { 965 if (cpu_addr == 0) {
522 if (const auto [it, inserted] = written_pages.emplace(page_start, 1); !inserted) { 966 return NULL_BUFFER_ID;
523 ++it->second; 967 }
524 } 968 const u64 page = cpu_addr >> PAGE_BITS;
969 const BufferId buffer_id = page_table[page];
970 if (!buffer_id) {
971 return CreateBuffer(cpu_addr, size);
972 }
973 const Buffer& buffer = slot_buffers[buffer_id];
974 if (buffer.IsInBounds(cpu_addr, size)) {
975 return buffer_id;
976 }
977 return CreateBuffer(cpu_addr, size);
978}
979
980template <class P>
981BufferId BufferCache<P>::CreateBuffer(VAddr cpu_addr, u32 wanted_size) {
982 std::vector<BufferId> overlap_ids;
983 VAddr cpu_addr_begin = cpu_addr;
984 VAddr cpu_addr_end = cpu_addr + wanted_size;
985 for (; cpu_addr >> PAGE_BITS < Common::DivCeil(cpu_addr_end, PAGE_SIZE);
986 cpu_addr += PAGE_SIZE) {
987 const BufferId overlap_id = page_table[cpu_addr >> PAGE_BITS];
988 if (!overlap_id) {
989 continue;
990 }
991 Buffer& overlap = slot_buffers[overlap_id];
992 if (overlap.IsPicked()) {
993 continue;
994 }
995 overlap.Pick();
996 overlap_ids.push_back(overlap_id);
997 const VAddr overlap_cpu_addr = overlap.CpuAddr();
998 if (overlap_cpu_addr < cpu_addr_begin) {
999 cpu_addr = cpu_addr_begin = overlap_cpu_addr;
525 } 1000 }
1001 cpu_addr_end = std::max(cpu_addr_end, overlap_cpu_addr + overlap.SizeBytes());
1002    }
1003    const u32 size = static_cast<u32>(cpu_addr_end - cpu_addr_begin);
1004    const BufferId new_buffer_id = slot_buffers.insert(runtime, rasterizer, cpu_addr_begin, size);
1005    Buffer& new_buffer = slot_buffers[new_buffer_id];
1006
1007    for (const BufferId overlap_id : overlap_ids) {
1008        Buffer& overlap = slot_buffers[overlap_id];
1009        overlap.Unpick();
1010
1011        std::vector<BufferCopy> copies;
1012        const size_t dst_base_offset = overlap.CpuAddr() - new_buffer.CpuAddr();
1013        overlap.ForEachDownloadRange([&](u64 begin, u64 range_size) {
1014            copies.push_back(BufferCopy{
1015 .src_offset = begin,
1016 .dst_offset = dst_base_offset + begin,
1017 .size = range_size,
1018 });
1019 new_buffer.UnmarkRegionAsCpuModified(begin, range_size);
1020 new_buffer.MarkRegionAsGpuModified(begin, range_size);
1021 });
1022 if (!copies.empty()) {
1023 runtime.CopyBuffer(slot_buffers[new_buffer_id], overlap, copies);
1024 }
1025 ReplaceBufferDownloads(overlap_id, new_buffer_id);
1026 DeleteBuffer(overlap_id);
1027 }
1028 Register(new_buffer_id);
1029 return new_buffer_id;
1030}
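Note (not part of the diff): CreateBuffer grows the requested range to the union of every overlapping buffer it picks, allocates one buffer covering that union, copies the overlaps' GPU-modified ranges into it and deletes them. The sketch below isolates the interval-union step as a simplified fixed-point loop; the function above performs the same expansion in a single forward walk over the page table.

// Simplified illustration of the range expansion only.
#include <algorithm>
#include <cstdint>
#include <vector>

struct Range {
    std::uint64_t begin;
    std::uint64_t end; // exclusive
};

// Expands `wanted` until it covers every range it overlaps.
Range MergeWithOverlaps(Range wanted, const std::vector<Range>& existing) {
    bool grew = true;
    while (grew) { // growing may bring the range into contact with more buffers
        grew = false;
        for (const Range& r : existing) {
            const bool overlaps = r.begin < wanted.end && wanted.begin < r.end;
            if (!overlaps || (wanted.begin <= r.begin && r.end <= wanted.end)) {
                continue;
            }
            wanted.begin = std::min(wanted.begin, r.begin);
            wanted.end = std::max(wanted.end, r.end);
            grew = true;
        }
    }
    return wanted;
}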
1031
1032template <class P>
1033void BufferCache<P>::Register(BufferId buffer_id) {
1034 ChangeRegister<true>(buffer_id);
1035}
1036
1037template <class P>
1038void BufferCache<P>::Unregister(BufferId buffer_id) {
1039 ChangeRegister<false>(buffer_id);
1040}
1041
1042template <class P>
1043template <bool insert>
1044void BufferCache<P>::ChangeRegister(BufferId buffer_id) {
1045 const Buffer& buffer = slot_buffers[buffer_id];
1046 const VAddr cpu_addr_begin = buffer.CpuAddr();
1047 const VAddr cpu_addr_end = cpu_addr_begin + buffer.SizeBytes();
1048 const u64 page_begin = cpu_addr_begin / PAGE_SIZE;
1049 const u64 page_end = Common::DivCeil(cpu_addr_end, PAGE_SIZE);
1050 for (u64 page = page_begin; page != page_end; ++page) {
1051 if constexpr (insert) {
1052 page_table[page] = buffer_id;
1053 } else {
1054 page_table[page] = BufferId{};
1055        }
1056    }
1057}
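Note (not part of the diff): registration marks every page the buffer touches, and Common::DivCeil rounds the end address up so a buffer that ends mid-page still claims that last page. A small standalone example of the same arithmetic, with an assumed 64 KiB page size and made-up addresses:

#include <cstdint>
#include <cstdio>

constexpr std::uint64_t PAGE_SIZE = 1ULL << 16; // assumed 64 KiB tracking pages

constexpr std::uint64_t DivCeil(std::uint64_t value, std::uint64_t divisor) {
    return (value + divisor - 1) / divisor;
}

int main() {
    const std::uint64_t cpu_addr_begin = 0x8000'0000;
    const std::uint64_t size_bytes = 0x2'8000; // 2.5 pages
    const std::uint64_t cpu_addr_end = cpu_addr_begin + size_bytes;
    const std::uint64_t page_begin = cpu_addr_begin / PAGE_SIZE;      // 0x8000
    const std::uint64_t page_end = DivCeil(cpu_addr_end, PAGE_SIZE);  // 0x8003
    for (std::uint64_t page = page_begin; page != page_end; ++page) {
        std::printf("registering page 0x%llx\n", static_cast<unsigned long long>(page));
    }
}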
1058
1059template <class P>
1060void BufferCache<P>::SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size) {
1061    if (buffer.CpuAddr() == 0) {
1062        return;
1063    }
1064 SynchronizeBufferImpl(buffer, cpu_addr, size);
1065}
1066
1067template <class P>
1068void BufferCache<P>::SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size) {
1069 boost::container::small_vector<BufferCopy, 4> copies;
1070 u64 total_size_bytes = 0;
1071 u64 largest_copy = 0;
1072 buffer.ForEachUploadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) {
1073 copies.push_back(BufferCopy{
1074 .src_offset = total_size_bytes,
1075 .dst_offset = range_offset,
1076 .size = range_size,
1077 });
1078 total_size_bytes += range_size;
1079 largest_copy = std::max(largest_copy, range_size);
1080 });
1081 if (total_size_bytes == 0) {
1082 return;
1083 }
1084 const std::span<BufferCopy> copies_span(copies.data(), copies.size());
1085 UploadMemory(buffer, total_size_bytes, largest_copy, copies_span);
1086}
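Note (not part of the diff): SynchronizeBufferImpl asks the buffer for its pending upload ranges and packs them back to back into one staging allocation, remembering the total size and the largest single copy. A standalone sketch of that packing, with a plain vector of (offset, size) pairs standing in for the buffer's modification tracking:

#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

struct BufferCopy {
    std::uint64_t src_offset; // position inside the packed staging buffer
    std::uint64_t dst_offset; // position inside the destination buffer
    std::uint64_t size;
};

struct UploadPlan {
    std::vector<BufferCopy> copies;
    std::uint64_t total_size_bytes = 0;
    std::uint64_t largest_copy = 0;
};

UploadPlan PlanUpload(const std::vector<std::pair<std::uint64_t, std::uint64_t>>& dirty_ranges) {
    UploadPlan plan;
    for (const auto& [dst_offset, size] : dirty_ranges) {
        plan.copies.push_back({plan.total_size_bytes, dst_offset, size});
        plan.total_size_bytes += size; // next range starts right after this one
        plan.largest_copy = std::max(plan.largest_copy, size);
    }
    return plan;
}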
1087
1088template <class P>
1089void BufferCache<P>::UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy,
1090 std::span<BufferCopy> copies) {
1091 if constexpr (USE_MEMORY_MAPS) {
1092 MappedUploadMemory(buffer, total_size_bytes, copies);
1093 } else {
1094 ImmediateUploadMemory(buffer, largest_copy, copies);
1095 }
1096}
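Note (not part of the diff): the upload path is chosen at compile time from the runtime traits type P, so the unused branch is never instantiated. A minimal sketch of that if constexpr dispatch; the trait structs here are hypothetical, not the project's actual OpenGL or Vulkan trait types.

#include <cstdio>

struct ImmediateTraits { static constexpr bool USE_MEMORY_MAPS = false; };
struct StagingTraits   { static constexpr bool USE_MEMORY_MAPS = true; };

template <class P>
void Upload() {
    if constexpr (P::USE_MEMORY_MAPS) {
        std::puts("staging path: write into a mapped staging buffer, then copy on the GPU");
    } else {
        std::puts("immediate path: hand each range to the API directly");
    }
}

int main() {
    Upload<ImmediateTraits>();
    Upload<StagingTraits>();
}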
1097
1098template <class P>
1099void BufferCache<P>::ImmediateUploadMemory(Buffer& buffer, u64 largest_copy,
1100 std::span<const BufferCopy> copies) {
1101 std::span<u8> immediate_buffer;
1102 for (const BufferCopy& copy : copies) {
1103 std::span<const u8> upload_span;
1104 const VAddr cpu_addr = buffer.CpuAddr() + copy.dst_offset;
1105 if (IsRangeGranular(cpu_addr, copy.size)) {
1106 upload_span = std::span(cpu_memory.GetPointer(cpu_addr), copy.size);
1107 } else {
1108 if (immediate_buffer.empty()) {
1109 immediate_buffer = ImmediateBuffer(largest_copy);
1110            }
1111 cpu_memory.ReadBlockUnsafe(cpu_addr, immediate_buffer.data(), copy.size);
1112 upload_span = immediate_buffer.subspan(0, copy.size);
1113        }
1114        buffer.ImmediateUpload(copy.dst_offset, upload_span);
1115    }
1116}
1117
1118template <class P>
1119void BufferCache<P>::MappedUploadMemory(Buffer& buffer, u64 total_size_bytes,
1120 std::span<const BufferCopy> copies) {
1121 auto upload_staging = runtime.UploadStagingBuffer(total_size_bytes);
1122 const std::span<u8> staging_pointer = upload_staging.mapped_span;
1123 for (const BufferCopy& copy : copies) {
1124 const VAddr cpu_addr = buffer.CpuAddr() + copy.dst_offset;
1125 u8* const src_pointer = staging_pointer.data() + copy.src_offset;
1126 cpu_memory.ReadBlockUnsafe(cpu_addr, src_pointer, copy.size);
1127    }
1128    runtime.CopyBuffer(buffer, upload_staging.buffer, copies);
1129}
1130
1131template <class P>
1132void BufferCache<P>::DeleteBuffer(BufferId buffer_id) {
1133 const auto scalar_replace = [buffer_id](Binding& binding) {
1134 if (binding.buffer_id == buffer_id) {
1135 binding.buffer_id = BufferId{};
1136 }
1137 };
1138 const auto replace = [scalar_replace](std::span<Binding> bindings) {
1139 std::ranges::for_each(bindings, scalar_replace);
1140 };
1141 scalar_replace(index_buffer);
1142 replace(vertex_buffers);
1143 std::ranges::for_each(uniform_buffers, replace);
1144 std::ranges::for_each(storage_buffers, replace);
1145 replace(transform_feedback_buffers);
1146 replace(compute_uniform_buffers);
1147 replace(compute_storage_buffers);
1148 std::erase(cached_write_buffer_ids, buffer_id);
1149
1150 // Mark the whole buffer as CPU written to stop tracking CPU writes
1151 Buffer& buffer = slot_buffers[buffer_id];
1152 buffer.MarkRegionAsCpuModified(buffer.CpuAddr(), buffer.SizeBytes());
1153
1154 Unregister(buffer_id);
1155 delayed_destruction_ring.Push(std::move(slot_buffers[buffer_id]));
1156
1157 NotifyBufferDeletion();
1158}
1159
1160template <class P>
1161void BufferCache<P>::ReplaceBufferDownloads(BufferId old_buffer_id, BufferId new_buffer_id) {
1162 const auto replace = [old_buffer_id, new_buffer_id](std::vector<BufferId>& buffers) {
1163 std::ranges::replace(buffers, old_buffer_id, new_buffer_id);
1164 if (auto it = std::ranges::find(buffers, new_buffer_id); it != buffers.end()) {
1165 buffers.erase(std::remove(it + 1, buffers.end(), new_buffer_id), buffers.end());
1166        }
1167    };
1168 replace(uncommitted_downloads);
1169 std::ranges::for_each(committed_downloads, replace);
1170}
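Note (not part of the diff): when a buffer is merged away, its queued downloads are retargeted to the replacement and any duplicate entries past the first are dropped, so the merged buffer is only downloaded once. The same replace-then-dedup pattern as a standalone example:

#include <algorithm>
#include <cstdint>
#include <vector>

using BufferId = std::uint32_t;

void RetargetDownloads(std::vector<BufferId>& queue, BufferId old_id, BufferId new_id) {
    std::ranges::replace(queue, old_id, new_id);
    if (const auto it = std::ranges::find(queue, new_id); it != queue.end()) {
        // Keep the first occurrence, drop the rest.
        queue.erase(std::remove(it + 1, queue.end(), new_id), queue.end());
    }
}

int main() {
    std::vector<BufferId> queue{3, 7, 9, 7};
    RetargetDownloads(queue, 7, 9); // {3, 7, 9, 7} -> {3, 9, 9, 9} -> {3, 9}
}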
1171
1172template <class P>
1173void BufferCache<P>::NotifyBufferDeletion() {
1174 if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) {
1175 dirty_uniform_buffers.fill(~u32{0});
1176    }
1177 auto& flags = maxwell3d.dirty.flags;
1178 flags[Dirty::IndexBuffer] = true;
1179 flags[Dirty::VertexBuffers] = true;
1180 for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) {
1181 flags[Dirty::VertexBuffer0 + index] = true;
1182 }
1183 has_deleted_buffers = true;
1184}
1185
1186template <class P>
1187typename BufferCache<P>::Binding BufferCache<P>::StorageBufferBinding(GPUVAddr ssbo_addr) const {
1188 const GPUVAddr gpu_addr = gpu_memory.Read<u64>(ssbo_addr);
1189 const u32 size = gpu_memory.Read<u32>(ssbo_addr + 8);
1190 const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr);
1191 if (!cpu_addr || size == 0) {
1192 return NULL_BINDING;
1193 }
1194 const Binding binding{
1195 .cpu_addr = *cpu_addr,
1196 .size = size,
1197 .buffer_id = BufferId{},
1198 };
1199 return binding;
1200}
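Note (not part of the diff): the storage buffer descriptor read above is a 64-bit GPU address followed by a 32-bit size at offset 8. The standalone example below decodes that layout from a fabricated byte blob, mirroring the two gpu_memory reads:

#include <array>
#include <cstdint>
#include <cstdio>
#include <cstring>

struct SsboDescriptor {
    std::uint64_t gpu_addr;
    std::uint32_t size;
};

SsboDescriptor ReadDescriptor(const std::uint8_t* bytes) {
    SsboDescriptor desc{};
    std::memcpy(&desc.gpu_addr, bytes, sizeof(desc.gpu_addr)); // Read<u64>(ssbo_addr)
    std::memcpy(&desc.size, bytes + 8, sizeof(desc.size));     // Read<u32>(ssbo_addr + 8)
    return desc;
}

int main() {
    std::array<std::uint8_t, 12> bytes{};
    const std::uint64_t gpu_addr = 0x1'2340'0000ULL; // fabricated values
    const std::uint32_t size = 0x1000;
    std::memcpy(bytes.data(), &gpu_addr, sizeof(gpu_addr));
    std::memcpy(bytes.data() + 8, &size, sizeof(size));
    const SsboDescriptor desc = ReadDescriptor(bytes.data());
    std::printf("gpu_addr=0x%llx size=0x%x\n",
                static_cast<unsigned long long>(desc.gpu_addr), desc.size);
}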
1201
1202template <class P>
1203std::span<const u8> BufferCache<P>::ImmediateBufferWithData(VAddr cpu_addr, size_t size) {
1204 u8* const base_pointer = cpu_memory.GetPointer(cpu_addr);
1205 if (IsRangeGranular(cpu_addr, size) ||
1206 base_pointer + size == cpu_memory.GetPointer(cpu_addr + size)) {
1207 return std::span(base_pointer, size);
1208 } else {
1209 const std::span<u8> span = ImmediateBuffer(size);
1210 cpu_memory.ReadBlockUnsafe(cpu_addr, span.data(), size);
1211 return span;
1212 }
1213}
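Note (not part of the diff): ImmediateBufferWithData returns a pointer straight into guest memory when the range is granular (or the backing pointers are contiguous) and only falls back to a scratch copy otherwise. A small sketch of one plausible granularity test, treating a range as granular when it does not cross a host page boundary; the page size and the check itself are assumptions for illustration, not the cache's actual IsRangeGranular.

#include <cstdint>

constexpr std::uint64_t HOST_PAGE_SIZE = 0x1000; // assumed 4 KiB backing pages

constexpr bool IsRangeGranularSketch(std::uint64_t cpu_addr, std::uint64_t size) {
    // True when [cpu_addr, cpu_addr + size) stays inside a single page.
    return (cpu_addr & ~(HOST_PAGE_SIZE - 1)) ==
           ((cpu_addr + size - 1) & ~(HOST_PAGE_SIZE - 1));
}

static_assert(IsRangeGranularSketch(0x1000, 0x800));   // fits in one page
static_assert(!IsRangeGranularSketch(0x1800, 0x1000)); // crosses a boundary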
1214
1215template <class P>
1216std::span<u8> BufferCache<P>::ImmediateBuffer(size_t wanted_capacity) {
1217    if (wanted_capacity > immediate_buffer_capacity) {
1218        immediate_buffer_capacity = wanted_capacity;
1219        immediate_buffer_alloc = std::make_unique<u8[]>(wanted_capacity);
1220    }
1221    return std::span<u8>(immediate_buffer_alloc.get(), wanted_capacity);
1222}
1223
1224template <class P>
1225bool BufferCache<P>::HasFastUniformBufferBound(size_t stage, u32 binding_index) const noexcept {
1226    if constexpr (IS_OPENGL) {
1227        return ((fast_bound_uniform_buffers[stage] >> binding_index) & 1) != 0;
1228    } else {
1229        // Only OpenGL has fast uniform buffers
1230        return false;
1231    }
1232}
1233
1234} // namespace VideoCommon

514            }
515        }
516        return found.get();
517    }
518
519    void MarkRegionAsWritten(VAddr start, VAddr end) {
520        const u64 page_end = end >> WRITE_PAGE_BIT;
521        for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) {
522            if (const auto [it, inserted] = written_pages.emplace(page_start, 1); !inserted) {
523                ++it->second;
524            }
525        }
526    }
527
528    void UnmarkRegionAsWritten(VAddr start, VAddr end) {
529        const u64 page_end = end >> WRITE_PAGE_BIT;
530        for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) {
531            auto it = written_pages.find(page_start);
532            if (it != written_pages.end()) {
533                if (it->second > 1) {
534                    --it->second;
535                } else {
536                    written_pages.erase(it);
537                }
538            }
539        }
540    }
541
542    bool IsRegionWritten(VAddr start, VAddr end) const {
543        const u64 page_end = end >> WRITE_PAGE_BIT;
544        for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) {
545            if (written_pages.contains(page_start)) {
546                return true;
547            }
548        }
549        return false;
550    }
551
552    void QueueDestruction(std::shared_ptr<Buffer> buffer) {
553        buffer->SetEpoch(epoch);
554        pending_destruction.push(std::move(buffer));
555    }
556
557    void MarkForAsyncFlush(MapInterval* map) {
558        if (!uncommitted_flushes) {
559            uncommitted_flushes = std::make_shared<std::unordered_set<MapInterval*>>();
560        }
561        uncommitted_flushes->insert(map);
562    }
563
564    VideoCore::RasterizerInterface& rasterizer;
565    Tegra::MemoryManager& gpu_memory;
566    Core::Memory::Memory& cpu_memory;
567    StreamBuffer& stream_buffer;
568
569    u8* buffer_ptr = nullptr;
570    u64 buffer_offset = 0;
571    u64 buffer_offset_base = 0;
572
573    MapIntervalAllocator mapped_addresses_allocator;
574    boost::intrusive::set<MapInterval, boost::intrusive::compare<MapIntervalCompare>>
575        mapped_addresses;
576
577    std::unordered_map<u64, u32> written_pages;
578    std::unordered_map<u64, std::shared_ptr<Buffer>> blocks;
579
580    std::queue<std::shared_ptr<Buffer>> pending_destruction;
581    u64 epoch = 0;
582    u64 modified_ticks = 0;
583
584    std::vector<u8> staging_buffer;
585
586    std::list<MapInterval*> marked_for_unregister;
587
588    std::shared_ptr<std::unordered_set<MapInterval*>> uncommitted_flushes;
589    std::list<std::shared_ptr<std::list<MapInterval*>>> committed_flushes;
590
591    std::recursive_mutex mutex;
592};
593
594} // namespace VideoCommon
diff --git a/src/video_core/buffer_cache/map_interval.cpp b/src/video_core/buffer_cache/map_interval.cpp
deleted file mode 100644
index 62587e18a..000000000
--- a/src/video_core/buffer_cache/map_interval.cpp
+++ /dev/null
@@ -1,33 +0,0 @@
1// Copyright 2020 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#include <algorithm>
6#include <array>
7#include <cstddef>
8#include <memory>
9
10#include "video_core/buffer_cache/map_interval.h"
11
12namespace VideoCommon {
13
14MapIntervalAllocator::MapIntervalAllocator() {
15 FillFreeList(first_chunk);
16}
17
18MapIntervalAllocator::~MapIntervalAllocator() = default;
19
20void MapIntervalAllocator::AllocateNewChunk() {
21 *new_chunk = std::make_unique<Chunk>();
22 FillFreeList(**new_chunk);
23 new_chunk = &(*new_chunk)->next;
24}
25
26void MapIntervalAllocator::FillFreeList(Chunk& chunk) {
27 const std::size_t old_size = free_list.size();
28 free_list.resize(old_size + chunk.data.size());
29 std::transform(chunk.data.rbegin(), chunk.data.rend(), free_list.begin() + old_size,
30 [](MapInterval& interval) { return &interval; });
31}
32
33} // namespace VideoCommon
diff --git a/src/video_core/buffer_cache/map_interval.h b/src/video_core/buffer_cache/map_interval.h
deleted file mode 100644
index ef974b08a..000000000
--- a/src/video_core/buffer_cache/map_interval.h
+++ /dev/null
@@ -1,93 +0,0 @@
1// Copyright 2019 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#pragma once
6
7#include <array>
8#include <cstddef>
9#include <memory>
10#include <vector>
11
12#include <boost/intrusive/set_hook.hpp>
13
14#include "common/common_types.h"
15#include "video_core/gpu.h"
16
17namespace VideoCommon {
18
19struct MapInterval : public boost::intrusive::set_base_hook<boost::intrusive::optimize_size<true>> {
20 MapInterval() = default;
21
22 /*implicit*/ MapInterval(VAddr start_) noexcept : start{start_} {}
23
24 explicit MapInterval(VAddr start_, VAddr end_, GPUVAddr gpu_addr_) noexcept
25 : start{start_}, end{end_}, gpu_addr{gpu_addr_} {}
26
27 bool IsInside(VAddr other_start, VAddr other_end) const noexcept {
28 return start <= other_start && other_end <= end;
29 }
30
31 bool Overlaps(VAddr other_start, VAddr other_end) const noexcept {
32 return start < other_end && other_start < end;
33 }
34
35 void MarkAsModified(bool is_modified_, u64 ticks_) noexcept {
36 is_modified = is_modified_;
37 ticks = ticks_;
38 }
39
40 boost::intrusive::set_member_hook<> member_hook_;
41 VAddr start = 0;
42 VAddr end = 0;
43 GPUVAddr gpu_addr = 0;
44 u64 ticks = 0;
45 bool is_written = false;
46 bool is_modified = false;
47 bool is_registered = false;
48 bool is_memory_marked = false;
49 bool is_sync_pending = false;
50};
51
52struct MapIntervalCompare {
53 constexpr bool operator()(const MapInterval& lhs, const MapInterval& rhs) const noexcept {
54 return lhs.start < rhs.start;
55 }
56};
57
58class MapIntervalAllocator {
59public:
60 MapIntervalAllocator();
61 ~MapIntervalAllocator();
62
63 MapInterval* Allocate() {
64 if (free_list.empty()) {
65 AllocateNewChunk();
66 }
67 MapInterval* const interval = free_list.back();
68 free_list.pop_back();
69 return interval;
70 }
71
72 void Release(MapInterval* interval) {
73 free_list.push_back(interval);
74 }
75
76private:
77 struct Chunk {
78 std::unique_ptr<Chunk> next;
79 std::array<MapInterval, 0x8000> data;
80 };
81
82 void AllocateNewChunk();
83
84 void FillFreeList(Chunk& chunk);
85
86 std::vector<MapInterval*> free_list;
87
88 Chunk first_chunk;
89
90 std::unique_ptr<Chunk>* new_chunk = &first_chunk.next;
91};
92
93} // namespace VideoCommon
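Note (not part of the removed file): the allocator deleted above is a chunked object pool; intervals live in fixed 0x8000-element chunks that are never individually freed, and allocation just pops a pointer off a free list. A reduced standalone sketch of the same pattern, without the intrusive-set hooks and the chunk-link bookkeeping:

#include <array>
#include <cstddef>
#include <memory>
#include <vector>

template <typename T, std::size_t ChunkSize = 0x8000>
class PoolAllocator {
public:
    T* Allocate() {
        if (free_list.empty()) {
            AllocateNewChunk(); // grow the pool by one chunk
        }
        T* const object = free_list.back();
        free_list.pop_back();
        return object;
    }

    void Release(T* object) {
        free_list.push_back(object); // objects are recycled, never freed
    }

private:
    struct Chunk {
        std::array<T, ChunkSize> data;
    };

    void AllocateNewChunk() {
        Chunk& chunk = *chunks.emplace_back(std::make_unique<Chunk>());
        for (T& object : chunk.data) {
            free_list.push_back(&object);
        }
    }

    std::vector<T*> free_list;
    std::vector<std::unique_ptr<Chunk>> chunks;
};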