Diffstat (limited to 'src/video_core/buffer_cache')
 -rw-r--r--  src/video_core/buffer_cache/buffer_block.h   |   62
 -rw-r--r--  src/video_core/buffer_cache/buffer_cache.cpp |   13
 -rw-r--r--  src/video_core/buffer_cache/buffer_cache.h   | 1598
 -rw-r--r--  src/video_core/buffer_cache/map_interval.cpp |   33
 -rw-r--r--  src/video_core/buffer_cache/map_interval.h   |   93
5 files changed, 1132 insertions, 667 deletions
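Context for the changes below: the rewritten buffer_cache.h drops the boost::icl interval-map bookkeeping in favor of a flat page table of slot ids indexed by guest virtual address in 64 KiB steps (see PAGE_BITS and ForEachBufferInRange in the diff). The following standalone sketch illustrates that lookup pattern only; BufferId, Buffer, page_table and DivCeil here are simplified placeholders, not the yuzu classes.

// Minimal sketch of the page-table lookup the new cache uses: guest addresses are
// split into 64 KiB cache pages, each page maps to a buffer slot id, and walking a
// range skips whole buffers at a time. All types below are illustrative stand-ins.
#include <cstddef>
#include <cstdint>
#include <vector>

using VAddr = std::uint64_t;
using BufferId = std::uint32_t; // id 0 plays the role of NULL_BUFFER_ID

constexpr std::uint32_t PAGE_BITS = 16; // cache page size, unrelated to the CPU page size
constexpr std::uint64_t PAGE_SIZE = std::uint64_t{1} << PAGE_BITS;

struct Buffer {
    VAddr cpu_addr{};
    std::uint64_t size_bytes{};
};

// One entry per cache page of the 39-bit guest address space.
std::vector<BufferId> page_table(std::size_t{1} << (39 - PAGE_BITS));
std::vector<Buffer> buffers{Buffer{}}; // slot 0 reserved as the null buffer

constexpr std::uint64_t DivCeil(std::uint64_t n, std::uint64_t d) {
    return (n + d - 1) / d;
}

template <typename Func>
void ForEachBufferInRange(VAddr cpu_addr, std::uint64_t size, Func&& func) {
    const std::uint64_t page_end = DivCeil(cpu_addr + size, PAGE_SIZE);
    for (std::uint64_t page = cpu_addr >> PAGE_BITS; page < page_end;) {
        const BufferId buffer_id = page_table[page];
        if (buffer_id == 0) {
            ++page; // no buffer registered on this page
            continue;
        }
        Buffer& buffer = buffers[buffer_id];
        func(buffer_id, buffer);
        // Jump past the end of the buffer instead of visiting each of its pages.
        page = DivCeil(buffer.cpu_addr + buffer.size_bytes, PAGE_SIZE);
    }
}

int main() {
    // Example: sum the sizes of all cached buffers overlapping a range.
    std::uint64_t total = 0;
    ForEachBufferInRange(0x1000, 0x4000, [&](BufferId, const Buffer& b) { total += b.size_bytes; });
    return 0;
}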
diff --git a/src/video_core/buffer_cache/buffer_block.h b/src/video_core/buffer_cache/buffer_block.h
deleted file mode 100644
index e9306194a..000000000
--- a/src/video_core/buffer_cache/buffer_block.h
+++ /dev/null
@@ -1,62 +0,0 @@
-// Copyright 2019 yuzu Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#pragma once
-
-#include "common/common_types.h"
-
-namespace VideoCommon {
-
-class BufferBlock {
-public:
-    [[nodiscard]] bool Overlaps(VAddr start, VAddr end) const {
-        return (cpu_addr < end) && (cpu_addr_end > start);
-    }
-
-    [[nodiscard]] bool IsInside(VAddr other_start, VAddr other_end) const {
-        return cpu_addr <= other_start && other_end <= cpu_addr_end;
-    }
-
-    [[nodiscard]] std::size_t Offset(VAddr in_addr) const {
-        return static_cast<std::size_t>(in_addr - cpu_addr);
-    }
-
-    [[nodiscard]] VAddr CpuAddr() const {
-        return cpu_addr;
-    }
-
-    [[nodiscard]] VAddr CpuAddrEnd() const {
-        return cpu_addr_end;
-    }
-
-    void SetCpuAddr(VAddr new_addr) {
-        cpu_addr = new_addr;
-        cpu_addr_end = new_addr + size;
-    }
-
-    [[nodiscard]] std::size_t Size() const {
-        return size;
-    }
-
-    [[nodiscard]] u64 Epoch() const {
-        return epoch;
-    }
-
-    void SetEpoch(u64 new_epoch) {
-        epoch = new_epoch;
-    }
-
-protected:
-    explicit BufferBlock(VAddr cpu_addr_, std::size_t size_) : size{size_} {
-        SetCpuAddr(cpu_addr_);
-    }
-
-private:
-    VAddr cpu_addr{};
-    VAddr cpu_addr_end{};
-    std::size_t size{};
-    u64 epoch{};
-};
-
-} // namespace VideoCommon
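For reference, the interval tests the deleted BufferBlock exposed (Overlaps, IsInside) are the usual half-open range predicates. A minimal standalone sketch, independent of the diff:

// Standalone illustration of the half-open [begin, end) interval predicates that
// BufferBlock::Overlaps and BufferBlock::IsInside implemented; VAddr stands in for
// the emulated CPU address type.
#include <cstdint>

using VAddr = std::uint64_t;

constexpr bool Overlaps(VAddr a_begin, VAddr a_end, VAddr b_begin, VAddr b_end) {
    // Two half-open ranges intersect iff each one starts before the other ends.
    return a_begin < b_end && b_begin < a_end;
}

constexpr bool IsInside(VAddr outer_begin, VAddr outer_end, VAddr inner_begin, VAddr inner_end) {
    return outer_begin <= inner_begin && inner_end <= outer_end;
}

// Touching end points do not overlap in the half-open convention.
static_assert(Overlaps(0x1000, 0x2000, 0x1800, 0x2800));
static_assert(!Overlaps(0x1000, 0x2000, 0x2000, 0x3000));
static_assert(IsInside(0x1000, 0x3000, 0x1800, 0x2000));

int main() {
    return 0;
}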
diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp
new file mode 100644
index 000000000..ab32294c8
--- /dev/null
+++ b/src/video_core/buffer_cache/buffer_cache.cpp
@@ -0,0 +1,13 @@
+// Copyright 2021 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/microprofile.h"
+
+namespace VideoCommon {
+
+MICROPROFILE_DEFINE(GPU_PrepareBuffers, "GPU", "Prepare buffers", MP_RGB(224, 128, 128));
+MICROPROFILE_DEFINE(GPU_BindUploadBuffers, "GPU", "Bind and upload buffers", MP_RGB(224, 128, 128));
+MICROPROFILE_DEFINE(GPU_DownloadMemory, "GPU", "Download buffers", MP_RGB(224, 128, 128));
+
+} // namespace VideoCommon
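This new translation unit exists so each timer has exactly one definition, while the templated buffer_cache.h only declares them (MICROPROFILE_DECLARE) and opens scopes at call sites (MICROPROFILE_SCOPE). A rough sketch of that declare/define/scope split, using a hypothetical GPU_ExampleTimer purely for illustration:

// In a header included by many translation units:
#include "common/microprofile.h"
MICROPROFILE_DECLARE(GPU_ExampleTimer);

// In exactly one .cpp (like this new buffer_cache.cpp):
MICROPROFILE_DEFINE(GPU_ExampleTimer, "GPU", "Example pass", MP_RGB(224, 128, 128));

// At the call site, the scope object records everything until the closing brace:
void ExamplePass() {
    MICROPROFILE_SCOPE(GPU_ExampleTimer);
    // ... work being timed ...
}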
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index 83b9ee871..e4f3c8e35 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -4,591 +4,1231 @@
| 4 | 4 | ||
| 5 | #pragma once | 5 | #pragma once |
| 6 | 6 | ||
| 7 | #include <list> | 7 | #include <algorithm> |
| 8 | #include <array> | ||
| 9 | #include <deque> | ||
| 8 | #include <memory> | 10 | #include <memory> |
| 9 | #include <mutex> | 11 | #include <mutex> |
| 12 | #include <span> | ||
| 10 | #include <unordered_map> | 13 | #include <unordered_map> |
| 11 | #include <unordered_set> | ||
| 12 | #include <utility> | ||
| 13 | #include <vector> | 14 | #include <vector> |
| 14 | 15 | ||
| 15 | #include <boost/container/small_vector.hpp> | 16 | #include <boost/container/small_vector.hpp> |
| 16 | #include <boost/icl/interval_set.hpp> | ||
| 17 | #include <boost/intrusive/set.hpp> | ||
| 18 | 17 | ||
| 19 | #include "common/alignment.h" | ||
| 20 | #include "common/assert.h" | ||
| 21 | #include "common/common_types.h" | 18 | #include "common/common_types.h" |
| 22 | #include "common/logging/log.h" | 19 | #include "common/div_ceil.h" |
| 23 | #include "core/core.h" | 20 | #include "common/microprofile.h" |
| 21 | #include "common/scope_exit.h" | ||
| 24 | #include "core/memory.h" | 22 | #include "core/memory.h" |
| 25 | #include "core/settings.h" | 23 | #include "core/settings.h" |
| 26 | #include "video_core/buffer_cache/buffer_block.h" | 24 | #include "video_core/buffer_cache/buffer_base.h" |
| 27 | #include "video_core/buffer_cache/map_interval.h" | 25 | #include "video_core/delayed_destruction_ring.h" |
| 26 | #include "video_core/dirty_flags.h" | ||
| 27 | #include "video_core/engines/kepler_compute.h" | ||
| 28 | #include "video_core/engines/maxwell_3d.h" | ||
| 28 | #include "video_core/memory_manager.h" | 29 | #include "video_core/memory_manager.h" |
| 29 | #include "video_core/rasterizer_interface.h" | 30 | #include "video_core/rasterizer_interface.h" |
| 31 | #include "video_core/texture_cache/slot_vector.h" | ||
| 32 | #include "video_core/texture_cache/types.h" | ||
| 30 | 33 | ||
| 31 | namespace VideoCommon { | 34 | namespace VideoCommon { |
| 32 | 35 | ||
| 33 | template <typename Buffer, typename BufferType, typename StreamBuffer> | 36 | MICROPROFILE_DECLARE(GPU_PrepareBuffers); |
| 37 | MICROPROFILE_DECLARE(GPU_BindUploadBuffers); | ||
| 38 | MICROPROFILE_DECLARE(GPU_DownloadMemory); | ||
| 39 | |||
| 40 | using BufferId = SlotId; | ||
| 41 | |||
| 42 | constexpr u32 NUM_VERTEX_BUFFERS = 32; | ||
| 43 | constexpr u32 NUM_TRANSFORM_FEEDBACK_BUFFERS = 4; | ||
| 44 | constexpr u32 NUM_GRAPHICS_UNIFORM_BUFFERS = 18; | ||
| 45 | constexpr u32 NUM_COMPUTE_UNIFORM_BUFFERS = 8; | ||
| 46 | constexpr u32 NUM_STORAGE_BUFFERS = 16; | ||
| 47 | constexpr u32 NUM_STAGES = 5; | ||
| 48 | |||
| 49 | template <typename P> | ||
| 34 | class BufferCache { | 50 | class BufferCache { |
| 35 | using IntervalSet = boost::icl::interval_set<VAddr>; | 51 | // Page size for caching purposes. |
| 36 | using IntervalType = typename IntervalSet::interval_type; | 52 | // This is unrelated to the CPU page size and it can be changed as it seems optimal. |
| 37 | using VectorMapInterval = boost::container::small_vector<MapInterval*, 1>; | 53 | static constexpr u32 PAGE_BITS = 16; |
| 54 | static constexpr u64 PAGE_SIZE = u64{1} << PAGE_BITS; | ||
| 38 | 55 | ||
| 39 | static constexpr u64 WRITE_PAGE_BIT = 11; | 56 | static constexpr bool IS_OPENGL = P::IS_OPENGL; |
| 40 | static constexpr u64 BLOCK_PAGE_BITS = 21; | 57 | static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS = |
| 41 | static constexpr u64 BLOCK_PAGE_SIZE = 1ULL << BLOCK_PAGE_BITS; | 58 | P::HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS; |
| 59 | static constexpr bool HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT = | ||
| 60 | P::HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT; | ||
| 61 | static constexpr bool NEEDS_BIND_UNIFORM_INDEX = P::NEEDS_BIND_UNIFORM_INDEX; | ||
| 62 | static constexpr bool NEEDS_BIND_STORAGE_INDEX = P::NEEDS_BIND_STORAGE_INDEX; | ||
| 63 | static constexpr bool USE_MEMORY_MAPS = P::USE_MEMORY_MAPS; | ||
| 42 | 64 | ||
| 43 | public: | 65 | static constexpr BufferId NULL_BUFFER_ID{0}; |
| 44 | struct BufferInfo { | 66 | |
| 45 | BufferType handle; | 67 | using Maxwell = Tegra::Engines::Maxwell3D::Regs; |
| 46 | u64 offset; | 68 | |
| 47 | u64 address; | 69 | using Runtime = typename P::Runtime; |
| 70 | using Buffer = typename P::Buffer; | ||
| 71 | |||
| 72 | struct Empty {}; | ||
| 73 | |||
| 74 | struct Binding { | ||
| 75 | VAddr cpu_addr{}; | ||
| 76 | u32 size{}; | ||
| 77 | BufferId buffer_id; | ||
| 48 | }; | 78 | }; |
| 49 | 79 | ||
| 50 | BufferInfo UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4, | 80 | static constexpr Binding NULL_BINDING{ |
| 51 | bool is_written = false, bool use_fast_cbuf = false) { | 81 | .cpu_addr = 0, |
| 52 | std::lock_guard lock{mutex}; | 82 | .size = 0, |
| 83 | .buffer_id = NULL_BUFFER_ID, | ||
| 84 | }; | ||
| 53 | 85 | ||
| 54 | const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); | 86 | public: |
| 55 | if (!cpu_addr) { | 87 | static constexpr size_t SKIP_CACHE_SIZE = 4096; |
| 56 | return GetEmptyBuffer(size); | ||
| 57 | } | ||
| 58 | 88 | ||
| 59 | // Cache management is a big overhead, so only cache entries with a given size. | 89 | explicit BufferCache(VideoCore::RasterizerInterface& rasterizer_, |
| 60 | // TODO: Figure out which size is the best for given games. | 90 | Tegra::Engines::Maxwell3D& maxwell3d_, |
| 61 | constexpr std::size_t max_stream_size = 0x800; | 91 | Tegra::Engines::KeplerCompute& kepler_compute_, |
| 62 | if (use_fast_cbuf || size < max_stream_size) { | 92 | Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_, |
| 63 | if (!is_written && !IsRegionWritten(*cpu_addr, *cpu_addr + size - 1)) { | 93 | Runtime& runtime_); |
| 64 | const bool is_granular = gpu_memory.IsGranularRange(gpu_addr, size); | ||
| 65 | if (use_fast_cbuf) { | ||
| 66 | u8* dest; | ||
| 67 | if (is_granular) { | ||
| 68 | dest = gpu_memory.GetPointer(gpu_addr); | ||
| 69 | } else { | ||
| 70 | staging_buffer.resize(size); | ||
| 71 | dest = staging_buffer.data(); | ||
| 72 | gpu_memory.ReadBlockUnsafe(gpu_addr, dest, size); | ||
| 73 | } | ||
| 74 | return ConstBufferUpload(dest, size); | ||
| 75 | } | ||
| 76 | if (is_granular) { | ||
| 77 | u8* const host_ptr = gpu_memory.GetPointer(gpu_addr); | ||
| 78 | return StreamBufferUpload(size, alignment, [host_ptr, size](u8* dest) { | ||
| 79 | std::memcpy(dest, host_ptr, size); | ||
| 80 | }); | ||
| 81 | } else { | ||
| 82 | return StreamBufferUpload(size, alignment, [this, gpu_addr, size](u8* dest) { | ||
| 83 | gpu_memory.ReadBlockUnsafe(gpu_addr, dest, size); | ||
| 84 | }); | ||
| 85 | } | ||
| 86 | } | ||
| 87 | } | ||
| 88 | 94 | ||
| 89 | Buffer* const block = GetBlock(*cpu_addr, size); | 95 | void TickFrame(); |
| 90 | MapInterval* const map = MapAddress(block, gpu_addr, *cpu_addr, size); | ||
| 91 | if (!map) { | ||
| 92 | return GetEmptyBuffer(size); | ||
| 93 | } | ||
| 94 | if (is_written) { | ||
| 95 | map->MarkAsModified(true, GetModifiedTicks()); | ||
| 96 | if (Settings::IsGPULevelHigh() && | ||
| 97 | Settings::values.use_asynchronous_gpu_emulation.GetValue()) { | ||
| 98 | MarkForAsyncFlush(map); | ||
| 99 | } | ||
| 100 | if (!map->is_written) { | ||
| 101 | map->is_written = true; | ||
| 102 | MarkRegionAsWritten(map->start, map->end - 1); | ||
| 103 | } | ||
| 104 | } | ||
| 105 | 96 | ||
| 106 | return BufferInfo{block->Handle(), block->Offset(*cpu_addr), block->Address()}; | 97 | void WriteMemory(VAddr cpu_addr, u64 size); |
| 107 | } | ||
| 108 | 98 | ||
| 109 | /// Uploads from a host memory. Returns the OpenGL buffer where it's located and its offset. | 99 | void CachedWriteMemory(VAddr cpu_addr, u64 size); |
| 110 | BufferInfo UploadHostMemory(const void* raw_pointer, std::size_t size, | ||
| 111 | std::size_t alignment = 4) { | ||
| 112 | std::lock_guard lock{mutex}; | ||
| 113 | return StreamBufferUpload(size, alignment, [raw_pointer, size](u8* dest) { | ||
| 114 | std::memcpy(dest, raw_pointer, size); | ||
| 115 | }); | ||
| 116 | } | ||
| 117 | 100 | ||
| 118 | /// Prepares the buffer cache for data uploading | 101 | void DownloadMemory(VAddr cpu_addr, u64 size); |
| 119 | /// @param max_size Maximum number of bytes that will be uploaded | ||
| 120 | /// @return True when a stream buffer invalidation was required, false otherwise | ||
| 121 | void Map(std::size_t max_size) { | ||
| 122 | std::lock_guard lock{mutex}; | ||
| 123 | 102 | ||
| 124 | std::tie(buffer_ptr, buffer_offset_base) = stream_buffer.Map(max_size, 4); | 103 | void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size); |
| 125 | buffer_offset = buffer_offset_base; | ||
| 126 | } | ||
| 127 | 104 | ||
| 128 | /// Finishes the upload stream | 105 | void UpdateGraphicsBuffers(bool is_indexed); |
| 129 | void Unmap() { | ||
| 130 | std::lock_guard lock{mutex}; | ||
| 131 | stream_buffer.Unmap(buffer_offset - buffer_offset_base); | ||
| 132 | } | ||
| 133 | 106 | ||
| 134 | /// Function called at the end of each frame, intended for deferred operations | 107 | void UpdateComputeBuffers(); |
| 135 | void TickFrame() { | ||
| 136 | ++epoch; | ||
| 137 | 108 | ||
| 138 | while (!pending_destruction.empty()) { | 109 | void BindHostGeometryBuffers(bool is_indexed); |
| 139 | // Delay at least 4 frames before destruction. | ||
| 140 | // This is due to triple buffering happening on some drivers. | ||
| 141 | static constexpr u64 epochs_to_destroy = 5; | ||
| 142 | if (pending_destruction.front()->Epoch() + epochs_to_destroy > epoch) { | ||
| 143 | break; | ||
| 144 | } | ||
| 145 | pending_destruction.pop(); | ||
| 146 | } | ||
| 147 | } | ||
| 148 | 110 | ||
| 149 | /// Write any cached resources overlapping the specified region back to memory | 111 | void BindHostStageBuffers(size_t stage); |
| 150 | void FlushRegion(VAddr addr, std::size_t size) { | ||
| 151 | std::lock_guard lock{mutex}; | ||
| 152 | 112 | ||
| 153 | VectorMapInterval objects = GetMapsInRange(addr, size); | 113 | void BindHostComputeBuffers(); |
| 154 | std::sort(objects.begin(), objects.end(), | ||
| 155 | [](MapInterval* lhs, MapInterval* rhs) { return lhs->ticks < rhs->ticks; }); | ||
| 156 | for (MapInterval* object : objects) { | ||
| 157 | if (object->is_modified && object->is_registered) { | ||
| 158 | mutex.unlock(); | ||
| 159 | FlushMap(object); | ||
| 160 | mutex.lock(); | ||
| 161 | } | ||
| 162 | } | ||
| 163 | } | ||
| 164 | 114 | ||
| 165 | bool MustFlushRegion(VAddr addr, std::size_t size) { | 115 | void SetEnabledUniformBuffers(size_t stage, u32 enabled); |
| 166 | std::lock_guard lock{mutex}; | ||
| 167 | 116 | ||
| 168 | const VectorMapInterval objects = GetMapsInRange(addr, size); | 117 | void SetEnabledComputeUniformBuffers(u32 enabled); |
| 169 | return std::any_of(objects.cbegin(), objects.cend(), [](const MapInterval* map) { | ||
| 170 | return map->is_modified && map->is_registered; | ||
| 171 | }); | ||
| 172 | } | ||
| 173 | 118 | ||
| 174 | /// Mark the specified region as being invalidated | 119 | void UnbindGraphicsStorageBuffers(size_t stage); |
| 175 | void InvalidateRegion(VAddr addr, u64 size) { | ||
| 176 | std::lock_guard lock{mutex}; | ||
| 177 | 120 | ||
| 178 | for (auto& object : GetMapsInRange(addr, size)) { | 121 | void BindGraphicsStorageBuffer(size_t stage, size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset, |
| 179 | if (object->is_registered) { | 122 | bool is_written); |
| 180 | Unregister(object); | ||
| 181 | } | ||
| 182 | } | ||
| 183 | } | ||
| 184 | 123 | ||
| 185 | void OnCPUWrite(VAddr addr, std::size_t size) { | 124 | void UnbindComputeStorageBuffers(); |
| 186 | std::lock_guard lock{mutex}; | ||
| 187 | 125 | ||
| 188 | for (MapInterval* object : GetMapsInRange(addr, size)) { | 126 | void BindComputeStorageBuffer(size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset, |
| 189 | if (object->is_memory_marked && object->is_registered) { | 127 | bool is_written); |
| 190 | UnmarkMemory(object); | ||
| 191 | object->is_sync_pending = true; | ||
| 192 | marked_for_unregister.emplace_back(object); | ||
| 193 | } | ||
| 194 | } | ||
| 195 | } | ||
| 196 | 128 | ||
| 197 | void SyncGuestHost() { | 129 | void FlushCachedWrites(); |
| 198 | std::lock_guard lock{mutex}; | ||
| 199 | 130 | ||
| 200 | for (auto& object : marked_for_unregister) { | 131 | /// Return true when there are uncommitted buffers to be downloaded |
| 201 | if (object->is_registered) { | 132 | [[nodiscard]] bool HasUncommittedFlushes() const noexcept; |
| 202 | object->is_sync_pending = false; | 133 | |
| 203 | Unregister(object); | 134 | /// Return true when the caller should wait for async downloads |
| 204 | } | 135 | [[nodiscard]] bool ShouldWaitAsyncFlushes() const noexcept; |
| 136 | |||
| 137 | /// Commit asynchronous downloads | ||
| 138 | void CommitAsyncFlushes(); | ||
| 139 | |||
| 140 | /// Pop asynchronous downloads | ||
| 141 | void PopAsyncFlushes(); | ||
| 142 | |||
| 143 | /// Return true when a CPU region is modified from the GPU | ||
| 144 | [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size); | ||
| 145 | |||
| 146 | std::mutex mutex; | ||
| 147 | |||
| 148 | private: | ||
| 149 | template <typename Func> | ||
| 150 | static void ForEachEnabledBit(u32 enabled_mask, Func&& func) { | ||
| 151 | for (u32 index = 0; enabled_mask != 0; ++index, enabled_mask >>= 1) { | ||
| 152 | const int disabled_bits = std::countr_zero(enabled_mask); | ||
| 153 | index += disabled_bits; | ||
| 154 | enabled_mask >>= disabled_bits; | ||
| 155 | func(index); | ||
| 205 | } | 156 | } |
| 206 | marked_for_unregister.clear(); | ||
| 207 | } | 157 | } |
| 208 | 158 | ||
| 209 | void CommitAsyncFlushes() { | 159 | template <typename Func> |
| 210 | if (uncommitted_flushes) { | 160 | void ForEachBufferInRange(VAddr cpu_addr, u64 size, Func&& func) { |
| 211 | auto commit_list = std::make_shared<std::list<MapInterval*>>(); | 161 | const u64 page_end = Common::DivCeil(cpu_addr + size, PAGE_SIZE); |
| 212 | for (MapInterval* map : *uncommitted_flushes) { | 162 | for (u64 page = cpu_addr >> PAGE_BITS; page < page_end;) { |
| 213 | if (map->is_registered && map->is_modified) { | 163 | const BufferId buffer_id = page_table[page]; |
| 214 | // TODO(Blinkhawk): Implement backend asynchronous flushing | 164 | if (!buffer_id) { |
| 215 | // AsyncFlushMap(map) | 165 | ++page; |
| 216 | commit_list->push_back(map); | 166 | continue; |
| 217 | } | ||
| 218 | } | ||
| 219 | if (!commit_list->empty()) { | ||
| 220 | committed_flushes.push_back(commit_list); | ||
| 221 | } else { | ||
| 222 | committed_flushes.emplace_back(); | ||
| 223 | } | 167 | } |
| 224 | } else { | 168 | Buffer& buffer = slot_buffers[buffer_id]; |
| 225 | committed_flushes.emplace_back(); | 169 | func(buffer_id, buffer); |
| 170 | |||
| 171 | const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes(); | ||
| 172 | page = Common::DivCeil(end_addr, PAGE_SIZE); | ||
| 226 | } | 173 | } |
| 227 | uncommitted_flushes.reset(); | ||
| 228 | } | 174 | } |
| 229 | 175 | ||
| 230 | bool ShouldWaitAsyncFlushes() const { | 176 | static bool IsRangeGranular(VAddr cpu_addr, size_t size) { |
| 231 | return !committed_flushes.empty() && committed_flushes.front() != nullptr; | 177 | return (cpu_addr & ~Core::Memory::PAGE_MASK) == |
| 178 | ((cpu_addr + size) & ~Core::Memory::PAGE_MASK); | ||
| 232 | } | 179 | } |
| 233 | 180 | ||
| 234 | bool HasUncommittedFlushes() const { | 181 | void BindHostIndexBuffer(); |
| 235 | return uncommitted_flushes != nullptr; | ||
| 236 | } | ||
| 237 | 182 | ||
| 238 | void PopAsyncFlushes() { | 183 | void BindHostVertexBuffers(); |
| 239 | if (committed_flushes.empty()) { | ||
| 240 | return; | ||
| 241 | } | ||
| 242 | auto& flush_list = committed_flushes.front(); | ||
| 243 | if (!flush_list) { | ||
| 244 | committed_flushes.pop_front(); | ||
| 245 | return; | ||
| 246 | } | ||
| 247 | for (MapInterval* map : *flush_list) { | ||
| 248 | if (map->is_registered) { | ||
| 249 | // TODO(Blinkhawk): Replace this for reading the asynchronous flush | ||
| 250 | FlushMap(map); | ||
| 251 | } | ||
| 252 | } | ||
| 253 | committed_flushes.pop_front(); | ||
| 254 | } | ||
| 255 | 184 | ||
| 256 | virtual BufferInfo GetEmptyBuffer(std::size_t size) = 0; | 185 | void BindHostGraphicsUniformBuffers(size_t stage); |
| 257 | 186 | ||
| 258 | protected: | 187 | void BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 binding_index, bool needs_bind); |
| 259 | explicit BufferCache(VideoCore::RasterizerInterface& rasterizer_, | ||
| 260 | Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_, | ||
| 261 | StreamBuffer& stream_buffer_) | ||
| 262 | : rasterizer{rasterizer_}, gpu_memory{gpu_memory_}, cpu_memory{cpu_memory_}, | ||
| 263 | stream_buffer{stream_buffer_} {} | ||
| 264 | 188 | ||
| 265 | ~BufferCache() = default; | 189 | void BindHostGraphicsStorageBuffers(size_t stage); |
| 266 | 190 | ||
| 267 | virtual std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) = 0; | 191 | void BindHostTransformFeedbackBuffers(); |
| 268 | 192 | ||
| 269 | virtual BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) { | 193 | void BindHostComputeUniformBuffers(); |
| 270 | return {}; | ||
| 271 | } | ||
| 272 | 194 | ||
| 273 | /// Register an object into the cache | 195 | void BindHostComputeStorageBuffers(); |
| 274 | MapInterval* Register(MapInterval new_map, bool inherit_written = false) { | ||
| 275 | const VAddr cpu_addr = new_map.start; | ||
| 276 | if (!cpu_addr) { | ||
| 277 | LOG_CRITICAL(HW_GPU, "Failed to register buffer with unmapped gpu_address 0x{:016x}", | ||
| 278 | new_map.gpu_addr); | ||
| 279 | return nullptr; | ||
| 280 | } | ||
| 281 | const std::size_t size = new_map.end - new_map.start; | ||
| 282 | new_map.is_registered = true; | ||
| 283 | rasterizer.UpdatePagesCachedCount(cpu_addr, size, 1); | ||
| 284 | new_map.is_memory_marked = true; | ||
| 285 | if (inherit_written) { | ||
| 286 | MarkRegionAsWritten(new_map.start, new_map.end - 1); | ||
| 287 | new_map.is_written = true; | ||
| 288 | } | ||
| 289 | MapInterval* const storage = mapped_addresses_allocator.Allocate(); | ||
| 290 | *storage = new_map; | ||
| 291 | mapped_addresses.insert(*storage); | ||
| 292 | return storage; | ||
| 293 | } | ||
| 294 | 196 | ||
| 295 | void UnmarkMemory(MapInterval* map) { | 197 | void DoUpdateGraphicsBuffers(bool is_indexed); |
| 296 | if (!map->is_memory_marked) { | 198 | |
| 297 | return; | 199 | void DoUpdateComputeBuffers(); |
| 298 | } | 200 | |
| 299 | const std::size_t size = map->end - map->start; | 201 | void UpdateIndexBuffer(); |
| 300 | rasterizer.UpdatePagesCachedCount(map->start, size, -1); | 202 | |
| 301 | map->is_memory_marked = false; | 203 | void UpdateVertexBuffers(); |
| 302 | } | 204 | |
| 303 | 205 | void UpdateVertexBuffer(u32 index); | |
| 304 | /// Unregisters an object from the cache | 206 | |
| 305 | void Unregister(MapInterval* map) { | 207 | void UpdateUniformBuffers(size_t stage); |
| 306 | UnmarkMemory(map); | 208 | |
| 307 | map->is_registered = false; | 209 | void UpdateStorageBuffers(size_t stage); |
| 308 | if (map->is_sync_pending) { | 210 | |
| 309 | map->is_sync_pending = false; | 211 | void UpdateTransformFeedbackBuffers(); |
| 310 | marked_for_unregister.remove(map); | 212 | |
| 213 | void UpdateTransformFeedbackBuffer(u32 index); | ||
| 214 | |||
| 215 | void UpdateComputeUniformBuffers(); | ||
| 216 | |||
| 217 | void UpdateComputeStorageBuffers(); | ||
| 218 | |||
| 219 | void MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 size); | ||
| 220 | |||
| 221 | [[nodiscard]] BufferId FindBuffer(VAddr cpu_addr, u32 size); | ||
| 222 | |||
| 223 | [[nodiscard]] BufferId CreateBuffer(VAddr cpu_addr, u32 wanted_size); | ||
| 224 | |||
| 225 | void Register(BufferId buffer_id); | ||
| 226 | |||
| 227 | void Unregister(BufferId buffer_id); | ||
| 228 | |||
| 229 | template <bool insert> | ||
| 230 | void ChangeRegister(BufferId buffer_id); | ||
| 231 | |||
| 232 | void SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size); | ||
| 233 | |||
| 234 | void SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size); | ||
| 235 | |||
| 236 | void UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy, | ||
| 237 | std::span<BufferCopy> copies); | ||
| 238 | |||
| 239 | void ImmediateUploadMemory(Buffer& buffer, u64 largest_copy, | ||
| 240 | std::span<const BufferCopy> copies); | ||
| 241 | |||
| 242 | void MappedUploadMemory(Buffer& buffer, u64 total_size_bytes, | ||
| 243 | std::span<const BufferCopy> copies); | ||
| 244 | |||
| 245 | void DeleteBuffer(BufferId buffer_id); | ||
| 246 | |||
| 247 | void ReplaceBufferDownloads(BufferId old_buffer_id, BufferId new_buffer_id); | ||
| 248 | |||
| 249 | void NotifyBufferDeletion(); | ||
| 250 | |||
| 251 | [[nodiscard]] Binding StorageBufferBinding(GPUVAddr ssbo_addr) const; | ||
| 252 | |||
| 253 | [[nodiscard]] std::span<const u8> ImmediateBufferWithData(VAddr cpu_addr, size_t size); | ||
| 254 | |||
| 255 | [[nodiscard]] std::span<u8> ImmediateBuffer(size_t wanted_capacity); | ||
| 256 | |||
| 257 | [[nodiscard]] bool HasFastUniformBufferBound(size_t stage, u32 binding_index) const noexcept; | ||
| 258 | |||
| 259 | VideoCore::RasterizerInterface& rasterizer; | ||
| 260 | Tegra::Engines::Maxwell3D& maxwell3d; | ||
| 261 | Tegra::Engines::KeplerCompute& kepler_compute; | ||
| 262 | Tegra::MemoryManager& gpu_memory; | ||
| 263 | Core::Memory::Memory& cpu_memory; | ||
| 264 | Runtime& runtime; | ||
| 265 | |||
| 266 | SlotVector<Buffer> slot_buffers; | ||
| 267 | DelayedDestructionRing<Buffer, 8> delayed_destruction_ring; | ||
| 268 | |||
| 269 | u32 last_index_count = 0; | ||
| 270 | |||
| 271 | Binding index_buffer; | ||
| 272 | std::array<Binding, NUM_VERTEX_BUFFERS> vertex_buffers; | ||
| 273 | std::array<std::array<Binding, NUM_GRAPHICS_UNIFORM_BUFFERS>, NUM_STAGES> uniform_buffers; | ||
| 274 | std::array<std::array<Binding, NUM_STORAGE_BUFFERS>, NUM_STAGES> storage_buffers; | ||
| 275 | std::array<Binding, NUM_TRANSFORM_FEEDBACK_BUFFERS> transform_feedback_buffers; | ||
| 276 | |||
| 277 | std::array<Binding, NUM_COMPUTE_UNIFORM_BUFFERS> compute_uniform_buffers; | ||
| 278 | std::array<Binding, NUM_STORAGE_BUFFERS> compute_storage_buffers; | ||
| 279 | |||
| 280 | std::array<u32, NUM_STAGES> enabled_uniform_buffers{}; | ||
| 281 | u32 enabled_compute_uniform_buffers = 0; | ||
| 282 | |||
| 283 | std::array<u32, NUM_STAGES> enabled_storage_buffers{}; | ||
| 284 | std::array<u32, NUM_STAGES> written_storage_buffers{}; | ||
| 285 | u32 enabled_compute_storage_buffers = 0; | ||
| 286 | u32 written_compute_storage_buffers = 0; | ||
| 287 | |||
| 288 | std::array<u32, NUM_STAGES> fast_bound_uniform_buffers{}; | ||
| 289 | |||
| 290 | bool has_deleted_buffers = false; | ||
| 291 | |||
| 292 | std::conditional_t<HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS, std::array<u32, NUM_STAGES>, Empty> | ||
| 293 | dirty_uniform_buffers{}; | ||
| 294 | |||
| 295 | std::vector<BufferId> cached_write_buffer_ids; | ||
| 296 | |||
| 297 | // TODO: This data structure is not optimal and it should be reworked | ||
| 298 | std::vector<BufferId> uncommitted_downloads; | ||
| 299 | std::deque<std::vector<BufferId>> committed_downloads; | ||
| 300 | |||
| 301 | size_t immediate_buffer_capacity = 0; | ||
| 302 | std::unique_ptr<u8[]> immediate_buffer_alloc; | ||
| 303 | |||
| 304 | std::array<BufferId, ((1ULL << 39) >> PAGE_BITS)> page_table; | ||
| 305 | }; | ||
| 306 | |||
| 307 | template <class P> | ||
| 308 | BufferCache<P>::BufferCache(VideoCore::RasterizerInterface& rasterizer_, | ||
| 309 | Tegra::Engines::Maxwell3D& maxwell3d_, | ||
| 310 | Tegra::Engines::KeplerCompute& kepler_compute_, | ||
| 311 | Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_, | ||
| 312 | Runtime& runtime_) | ||
| 313 | : rasterizer{rasterizer_}, maxwell3d{maxwell3d_}, kepler_compute{kepler_compute_}, | ||
| 314 | gpu_memory{gpu_memory_}, cpu_memory{cpu_memory_}, runtime{runtime_} { | ||
| 315 | // Ensure the first slot is used for the null buffer | ||
| 316 | void(slot_buffers.insert(runtime, NullBufferParams{})); | ||
| 317 | } | ||
| 318 | |||
| 319 | template <class P> | ||
| 320 | void BufferCache<P>::TickFrame() { | ||
| 321 | delayed_destruction_ring.Tick(); | ||
| 322 | } | ||
| 323 | |||
| 324 | template <class P> | ||
| 325 | void BufferCache<P>::WriteMemory(VAddr cpu_addr, u64 size) { | ||
| 326 | ForEachBufferInRange(cpu_addr, size, [&](BufferId, Buffer& buffer) { | ||
| 327 | buffer.MarkRegionAsCpuModified(cpu_addr, size); | ||
| 328 | }); | ||
| 329 | } | ||
| 330 | |||
| 331 | template <class P> | ||
| 332 | void BufferCache<P>::CachedWriteMemory(VAddr cpu_addr, u64 size) { | ||
| 333 | ForEachBufferInRange(cpu_addr, size, [&](BufferId buffer_id, Buffer& buffer) { | ||
| 334 | if (!buffer.HasCachedWrites()) { | ||
| 335 | cached_write_buffer_ids.push_back(buffer_id); | ||
| 311 | } | 336 | } |
| 312 | if (map->is_written) { | 337 | buffer.CachedCpuWrite(cpu_addr, size); |
| 313 | UnmarkRegionAsWritten(map->start, map->end - 1); | 338 | }); |
| 339 | } | ||
| 340 | |||
| 341 | template <class P> | ||
| 342 | void BufferCache<P>::DownloadMemory(VAddr cpu_addr, u64 size) { | ||
| 343 | ForEachBufferInRange(cpu_addr, size, [&](BufferId, Buffer& buffer) { | ||
| 344 | boost::container::small_vector<BufferCopy, 1> copies; | ||
| 345 | u64 total_size_bytes = 0; | ||
| 346 | u64 largest_copy = 0; | ||
| 347 | buffer.ForEachDownloadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) { | ||
| 348 | copies.push_back(BufferCopy{ | ||
| 349 | .src_offset = range_offset, | ||
| 350 | .dst_offset = total_size_bytes, | ||
| 351 | .size = range_size, | ||
| 352 | }); | ||
| 353 | total_size_bytes += range_size; | ||
| 354 | largest_copy = std::max(largest_copy, range_size); | ||
| 355 | }); | ||
| 356 | if (total_size_bytes == 0) { | ||
| 357 | return; | ||
| 314 | } | 358 | } |
| 315 | const auto it = mapped_addresses.find(*map); | 359 | MICROPROFILE_SCOPE(GPU_DownloadMemory); |
| 316 | ASSERT(it != mapped_addresses.end()); | 360 | |
| 317 | mapped_addresses.erase(it); | 361 | if constexpr (USE_MEMORY_MAPS) { |
| 318 | mapped_addresses_allocator.Release(map); | 362 | auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes); |
| 319 | } | 363 | const u8* const mapped_memory = download_staging.mapped_span.data(); |
| 320 | 364 | const std::span<BufferCopy> copies_span(copies.data(), copies.data() + copies.size()); | |
| 321 | private: | 365 | runtime.CopyBuffer(download_staging.buffer, buffer, copies_span); |
| 322 | MapInterval* MapAddress(Buffer* block, GPUVAddr gpu_addr, VAddr cpu_addr, std::size_t size) { | 366 | runtime.Finish(); |
| 323 | const VectorMapInterval overlaps = GetMapsInRange(cpu_addr, size); | 367 | for (const BufferCopy& copy : copies) { |
| 324 | if (overlaps.empty()) { | 368 | const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset; |
| 325 | const VAddr cpu_addr_end = cpu_addr + size; | 369 | const u8* copy_mapped_memory = mapped_memory + copy.dst_offset; |
| 326 | if (gpu_memory.IsGranularRange(gpu_addr, size)) { | 370 | cpu_memory.WriteBlockUnsafe(copy_cpu_addr, copy_mapped_memory, copy.size); |
| 327 | u8* const host_ptr = gpu_memory.GetPointer(gpu_addr); | ||
| 328 | block->Upload(block->Offset(cpu_addr), size, host_ptr); | ||
| 329 | } else { | ||
| 330 | staging_buffer.resize(size); | ||
| 331 | gpu_memory.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size); | ||
| 332 | block->Upload(block->Offset(cpu_addr), size, staging_buffer.data()); | ||
| 333 | } | 371 | } |
| 334 | return Register(MapInterval(cpu_addr, cpu_addr_end, gpu_addr)); | 372 | } else { |
| 335 | } | 373 | const std::span<u8> immediate_buffer = ImmediateBuffer(largest_copy); |
| 336 | 374 | for (const BufferCopy& copy : copies) { | |
| 337 | const VAddr cpu_addr_end = cpu_addr + size; | 375 | buffer.ImmediateDownload(copy.src_offset, immediate_buffer.subspan(0, copy.size)); |
| 338 | if (overlaps.size() == 1) { | 376 | const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset; |
| 339 | MapInterval* const current_map = overlaps[0]; | 377 | cpu_memory.WriteBlockUnsafe(copy_cpu_addr, immediate_buffer.data(), copy.size); |
| 340 | if (current_map->IsInside(cpu_addr, cpu_addr_end)) { | ||
| 341 | return current_map; | ||
| 342 | } | 378 | } |
| 343 | } | 379 | } |
| 344 | VAddr new_start = cpu_addr; | 380 | }); |
| 345 | VAddr new_end = cpu_addr_end; | 381 | } |
| 346 | bool write_inheritance = false; | 382 | |
| 347 | bool modified_inheritance = false; | 383 | template <class P> |
| 348 | // Calculate new buffer parameters | 384 | void BufferCache<P>::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, |
| 349 | for (MapInterval* overlap : overlaps) { | 385 | u32 size) { |
| 350 | new_start = std::min(overlap->start, new_start); | 386 | const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); |
| 351 | new_end = std::max(overlap->end, new_end); | 387 | if (!cpu_addr) { |
| 352 | write_inheritance |= overlap->is_written; | 388 | uniform_buffers[stage][index] = NULL_BINDING; |
| 353 | modified_inheritance |= overlap->is_modified; | 389 | return; |
| 390 | } | ||
| 391 | const Binding binding{ | ||
| 392 | .cpu_addr = *cpu_addr, | ||
| 393 | .size = size, | ||
| 394 | .buffer_id = BufferId{}, | ||
| 395 | }; | ||
| 396 | uniform_buffers[stage][index] = binding; | ||
| 397 | } | ||
| 398 | |||
| 399 | template <class P> | ||
| 400 | void BufferCache<P>::UpdateGraphicsBuffers(bool is_indexed) { | ||
| 401 | MICROPROFILE_SCOPE(GPU_PrepareBuffers); | ||
| 402 | do { | ||
| 403 | has_deleted_buffers = false; | ||
| 404 | DoUpdateGraphicsBuffers(is_indexed); | ||
| 405 | } while (has_deleted_buffers); | ||
| 406 | } | ||
| 407 | |||
| 408 | template <class P> | ||
| 409 | void BufferCache<P>::UpdateComputeBuffers() { | ||
| 410 | MICROPROFILE_SCOPE(GPU_PrepareBuffers); | ||
| 411 | do { | ||
| 412 | has_deleted_buffers = false; | ||
| 413 | DoUpdateComputeBuffers(); | ||
| 414 | } while (has_deleted_buffers); | ||
| 415 | } | ||
| 416 | |||
| 417 | template <class P> | ||
| 418 | void BufferCache<P>::BindHostGeometryBuffers(bool is_indexed) { | ||
| 419 | MICROPROFILE_SCOPE(GPU_BindUploadBuffers); | ||
| 420 | if (is_indexed) { | ||
| 421 | BindHostIndexBuffer(); | ||
| 422 | } else if constexpr (!HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT) { | ||
| 423 | const auto& regs = maxwell3d.regs; | ||
| 424 | if (regs.draw.topology == Maxwell::PrimitiveTopology::Quads) { | ||
| 425 | runtime.BindQuadArrayIndexBuffer(regs.vertex_buffer.first, regs.vertex_buffer.count); | ||
| 354 | } | 426 | } |
| 355 | GPUVAddr new_gpu_addr = gpu_addr + new_start - cpu_addr; | 427 | } |
| 356 | for (auto& overlap : overlaps) { | 428 | BindHostVertexBuffers(); |
| 357 | Unregister(overlap); | 429 | BindHostTransformFeedbackBuffers(); |
| 430 | } | ||
| 431 | |||
| 432 | template <class P> | ||
| 433 | void BufferCache<P>::BindHostStageBuffers(size_t stage) { | ||
| 434 | MICROPROFILE_SCOPE(GPU_BindUploadBuffers); | ||
| 435 | BindHostGraphicsUniformBuffers(stage); | ||
| 436 | BindHostGraphicsStorageBuffers(stage); | ||
| 437 | } | ||
| 438 | |||
| 439 | template <class P> | ||
| 440 | void BufferCache<P>::BindHostComputeBuffers() { | ||
| 441 | MICROPROFILE_SCOPE(GPU_BindUploadBuffers); | ||
| 442 | BindHostComputeUniformBuffers(); | ||
| 443 | BindHostComputeStorageBuffers(); | ||
| 444 | } | ||
| 445 | |||
| 446 | template <class P> | ||
| 447 | void BufferCache<P>::SetEnabledUniformBuffers(size_t stage, u32 enabled) { | ||
| 448 | if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) { | ||
| 449 | if (enabled_uniform_buffers[stage] != enabled) { | ||
| 450 | dirty_uniform_buffers[stage] = ~u32{0}; | ||
| 358 | } | 451 | } |
| 359 | UpdateBlock(block, new_start, new_end, overlaps); | 452 | } |
| 360 | 453 | enabled_uniform_buffers[stage] = enabled; | |
| 361 | const MapInterval new_map{new_start, new_end, new_gpu_addr}; | 454 | } |
| 362 | MapInterval* const map = Register(new_map, write_inheritance); | 455 | |
| 363 | if (!map) { | 456 | template <class P> |
| 364 | return nullptr; | 457 | void BufferCache<P>::SetEnabledComputeUniformBuffers(u32 enabled) { |
| 458 | enabled_compute_uniform_buffers = enabled; | ||
| 459 | } | ||
| 460 | |||
| 461 | template <class P> | ||
| 462 | void BufferCache<P>::UnbindGraphicsStorageBuffers(size_t stage) { | ||
| 463 | enabled_storage_buffers[stage] = 0; | ||
| 464 | written_storage_buffers[stage] = 0; | ||
| 465 | } | ||
| 466 | |||
| 467 | template <class P> | ||
| 468 | void BufferCache<P>::BindGraphicsStorageBuffer(size_t stage, size_t ssbo_index, u32 cbuf_index, | ||
| 469 | u32 cbuf_offset, bool is_written) { | ||
| 470 | enabled_storage_buffers[stage] |= 1U << ssbo_index; | ||
| 471 | written_storage_buffers[stage] |= (is_written ? 1U : 0U) << ssbo_index; | ||
| 472 | |||
| 473 | const auto& cbufs = maxwell3d.state.shader_stages[stage]; | ||
| 474 | const GPUVAddr ssbo_addr = cbufs.const_buffers[cbuf_index].address + cbuf_offset; | ||
| 475 | storage_buffers[stage][ssbo_index] = StorageBufferBinding(ssbo_addr); | ||
| 476 | } | ||
| 477 | |||
| 478 | template <class P> | ||
| 479 | void BufferCache<P>::UnbindComputeStorageBuffers() { | ||
| 480 | enabled_compute_storage_buffers = 0; | ||
| 481 | written_compute_storage_buffers = 0; | ||
| 482 | } | ||
| 483 | |||
| 484 | template <class P> | ||
| 485 | void BufferCache<P>::BindComputeStorageBuffer(size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset, | ||
| 486 | bool is_written) { | ||
| 487 | enabled_compute_storage_buffers |= 1U << ssbo_index; | ||
| 488 | written_compute_storage_buffers |= (is_written ? 1U : 0U) << ssbo_index; | ||
| 489 | |||
| 490 | const auto& launch_desc = kepler_compute.launch_description; | ||
| 491 | ASSERT(((launch_desc.const_buffer_enable_mask >> cbuf_index) & 1) != 0); | ||
| 492 | |||
| 493 | const auto& cbufs = launch_desc.const_buffer_config; | ||
| 494 | const GPUVAddr ssbo_addr = cbufs[cbuf_index].Address() + cbuf_offset; | ||
| 495 | compute_storage_buffers[ssbo_index] = StorageBufferBinding(ssbo_addr); | ||
| 496 | } | ||
| 497 | |||
| 498 | template <class P> | ||
| 499 | void BufferCache<P>::FlushCachedWrites() { | ||
| 500 | for (const BufferId buffer_id : cached_write_buffer_ids) { | ||
| 501 | slot_buffers[buffer_id].FlushCachedWrites(); | ||
| 502 | } | ||
| 503 | cached_write_buffer_ids.clear(); | ||
| 504 | } | ||
| 505 | |||
| 506 | template <class P> | ||
| 507 | bool BufferCache<P>::HasUncommittedFlushes() const noexcept { | ||
| 508 | return !uncommitted_downloads.empty(); | ||
| 509 | } | ||
| 510 | |||
| 511 | template <class P> | ||
| 512 | bool BufferCache<P>::ShouldWaitAsyncFlushes() const noexcept { | ||
| 513 | return !committed_downloads.empty() && !committed_downloads.front().empty(); | ||
| 514 | } | ||
| 515 | |||
| 516 | template <class P> | ||
| 517 | void BufferCache<P>::CommitAsyncFlushes() { | ||
| 518 | // This is intentionally passing the value by copy | ||
| 519 | committed_downloads.push_front(uncommitted_downloads); | ||
| 520 | uncommitted_downloads.clear(); | ||
| 521 | } | ||
| 522 | |||
| 523 | template <class P> | ||
| 524 | void BufferCache<P>::PopAsyncFlushes() { | ||
| 525 | if (committed_downloads.empty()) { | ||
| 526 | return; | ||
| 527 | } | ||
| 528 | auto scope_exit_pop_download = detail::ScopeExit([this] { committed_downloads.pop_back(); }); | ||
| 529 | const std::span<const BufferId> download_ids = committed_downloads.back(); | ||
| 530 | if (download_ids.empty()) { | ||
| 531 | return; | ||
| 532 | } | ||
| 533 | MICROPROFILE_SCOPE(GPU_DownloadMemory); | ||
| 534 | |||
| 535 | boost::container::small_vector<std::pair<BufferCopy, BufferId>, 1> downloads; | ||
| 536 | u64 total_size_bytes = 0; | ||
| 537 | u64 largest_copy = 0; | ||
| 538 | for (const BufferId buffer_id : download_ids) { | ||
| 539 | slot_buffers[buffer_id].ForEachDownloadRange([&](u64 range_offset, u64 range_size) { | ||
| 540 | downloads.push_back({ | ||
| 541 | BufferCopy{ | ||
| 542 | .src_offset = range_offset, | ||
| 543 | .dst_offset = total_size_bytes, | ||
| 544 | .size = range_size, | ||
| 545 | }, | ||
| 546 | buffer_id, | ||
| 547 | }); | ||
| 548 | total_size_bytes += range_size; | ||
| 549 | largest_copy = std::max(largest_copy, range_size); | ||
| 550 | }); | ||
| 551 | } | ||
| 552 | if (downloads.empty()) { | ||
| 553 | return; | ||
| 554 | } | ||
| 555 | if constexpr (USE_MEMORY_MAPS) { | ||
| 556 | auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes); | ||
| 557 | for (const auto [copy, buffer_id] : downloads) { | ||
| 558 | const std::array copies{copy}; | ||
| 559 | runtime.CopyBuffer(download_staging.buffer, slot_buffers[buffer_id], copies); | ||
| 365 | } | 560 | } |
| 366 | if (modified_inheritance) { | 561 | runtime.Finish(); |
| 367 | map->MarkAsModified(true, GetModifiedTicks()); | 562 | for (const auto [copy, buffer_id] : downloads) { |
| 368 | if (Settings::IsGPULevelHigh() && | 563 | const Buffer& buffer = slot_buffers[buffer_id]; |
| 369 | Settings::values.use_asynchronous_gpu_emulation.GetValue()) { | 564 | const VAddr cpu_addr = buffer.CpuAddr() + copy.src_offset; |
| 370 | MarkForAsyncFlush(map); | 565 | const u8* read_mapped_memory = download_staging.mapped_span.data() + copy.dst_offset; |
| 371 | } | 566 | cpu_memory.WriteBlockUnsafe(cpu_addr, read_mapped_memory, copy.size); |
| 567 | } | ||
| 568 | } else { | ||
| 569 | const std::span<u8> immediate_buffer = ImmediateBuffer(largest_copy); | ||
| 570 | for (const auto [copy, buffer_id] : downloads) { | ||
| 571 | Buffer& buffer = slot_buffers[buffer_id]; | ||
| 572 | buffer.ImmediateDownload(copy.src_offset, immediate_buffer.subspan(0, copy.size)); | ||
| 573 | const VAddr cpu_addr = buffer.CpuAddr() + copy.src_offset; | ||
| 574 | cpu_memory.WriteBlockUnsafe(cpu_addr, immediate_buffer.data(), copy.size); | ||
| 372 | } | 575 | } |
| 373 | return map; | ||
| 374 | } | 576 | } |
| 375 | 577 | } | |
| 376 | void UpdateBlock(Buffer* block, VAddr start, VAddr end, const VectorMapInterval& overlaps) { | 578 | |
| 377 | const IntervalType base_interval{start, end}; | 579 | template <class P> |
| 378 | IntervalSet interval_set{}; | 580 | bool BufferCache<P>::IsRegionGpuModified(VAddr addr, size_t size) { |
| 379 | interval_set.add(base_interval); | 581 | const u64 page_end = Common::DivCeil(addr + size, PAGE_SIZE); |
| 380 | for (auto& overlap : overlaps) { | 582 | for (u64 page = addr >> PAGE_BITS; page < page_end;) { |
| 381 | const IntervalType subtract{overlap->start, overlap->end}; | 583 | const BufferId image_id = page_table[page]; |
| 382 | interval_set.subtract(subtract); | 584 | if (!image_id) { |
| 585 | ++page; | ||
| 586 | continue; | ||
| 383 | } | 587 | } |
| 384 | for (auto& interval : interval_set) { | 588 | Buffer& buffer = slot_buffers[image_id]; |
| 385 | const std::size_t size = interval.upper() - interval.lower(); | 589 | if (buffer.IsRegionGpuModified(addr, size)) { |
| 386 | if (size == 0) { | 590 | return true; |
| 387 | continue; | ||
| 388 | } | ||
| 389 | staging_buffer.resize(size); | ||
| 390 | cpu_memory.ReadBlockUnsafe(interval.lower(), staging_buffer.data(), size); | ||
| 391 | block->Upload(block->Offset(interval.lower()), size, staging_buffer.data()); | ||
| 392 | } | 591 | } |
| 592 | const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes(); | ||
| 593 | page = Common::DivCeil(end_addr, PAGE_SIZE); | ||
| 393 | } | 594 | } |
| 394 | 595 | return false; | |
| 395 | VectorMapInterval GetMapsInRange(VAddr addr, std::size_t size) { | 596 | } |
| 396 | VectorMapInterval result; | 597 | |
| 397 | if (size == 0) { | 598 | template <class P> |
| 398 | return result; | 599 | void BufferCache<P>::BindHostIndexBuffer() { |
| 600 | Buffer& buffer = slot_buffers[index_buffer.buffer_id]; | ||
| 601 | const u32 offset = buffer.Offset(index_buffer.cpu_addr); | ||
| 602 | const u32 size = index_buffer.size; | ||
| 603 | SynchronizeBuffer(buffer, index_buffer.cpu_addr, size); | ||
| 604 | if constexpr (HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT) { | ||
| 605 | runtime.BindIndexBuffer(buffer, offset, size); | ||
| 606 | } else { | ||
| 607 | runtime.BindIndexBuffer(maxwell3d.regs.draw.topology, maxwell3d.regs.index_array.format, | ||
| 608 | maxwell3d.regs.index_array.first, maxwell3d.regs.index_array.count, | ||
| 609 | buffer, offset, size); | ||
| 610 | } | ||
| 611 | } | ||
| 612 | |||
| 613 | template <class P> | ||
| 614 | void BufferCache<P>::BindHostVertexBuffers() { | ||
| 615 | auto& flags = maxwell3d.dirty.flags; | ||
| 616 | for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) { | ||
| 617 | const Binding& binding = vertex_buffers[index]; | ||
| 618 | Buffer& buffer = slot_buffers[binding.buffer_id]; | ||
| 619 | SynchronizeBuffer(buffer, binding.cpu_addr, binding.size); | ||
| 620 | if (!flags[Dirty::VertexBuffer0 + index]) { | ||
| 621 | continue; | ||
| 399 | } | 622 | } |
| 623 | flags[Dirty::VertexBuffer0 + index] = false; | ||
| 624 | |||
| 625 | const u32 stride = maxwell3d.regs.vertex_array[index].stride; | ||
| 626 | const u32 offset = buffer.Offset(binding.cpu_addr); | ||
| 627 | runtime.BindVertexBuffer(index, buffer, offset, binding.size, stride); | ||
| 628 | } | ||
| 629 | } | ||
| 400 | 630 | ||
| 401 | const VAddr addr_end = addr + size; | 631 | template <class P> |
| 402 | auto it = mapped_addresses.lower_bound(addr); | 632 | void BufferCache<P>::BindHostGraphicsUniformBuffers(size_t stage) { |
| 403 | if (it != mapped_addresses.begin()) { | 633 | u32 dirty = ~0U; |
| 404 | --it; | 634 | if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) { |
| 635 | dirty = std::exchange(dirty_uniform_buffers[stage], 0); | ||
| 636 | } | ||
| 637 | u32 binding_index = 0; | ||
| 638 | ForEachEnabledBit(enabled_uniform_buffers[stage], [&](u32 index) { | ||
| 639 | const bool needs_bind = ((dirty >> index) & 1) != 0; | ||
| 640 | BindHostGraphicsUniformBuffer(stage, index, binding_index, needs_bind); | ||
| 641 | if constexpr (NEEDS_BIND_UNIFORM_INDEX) { | ||
| 642 | ++binding_index; | ||
| 405 | } | 643 | } |
| 406 | while (it != mapped_addresses.end() && it->start < addr_end) { | 644 | }); |
| 407 | if (it->Overlaps(addr, addr_end)) { | 645 | } |
| 408 | result.push_back(&*it); | 646 | |
| 647 | template <class P> | ||
| 648 | void BufferCache<P>::BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 binding_index, | ||
| 649 | bool needs_bind) { | ||
| 650 | const Binding& binding = uniform_buffers[stage][index]; | ||
| 651 | const VAddr cpu_addr = binding.cpu_addr; | ||
| 652 | const u32 size = binding.size; | ||
| 653 | Buffer& buffer = slot_buffers[binding.buffer_id]; | ||
| 654 | if constexpr (IS_OPENGL) { | ||
| 655 | if (size <= SKIP_CACHE_SIZE && !buffer.IsRegionGpuModified(cpu_addr, size)) { | ||
| 656 | if (runtime.HasFastBufferSubData()) { | ||
| 657 | // Fast path for Nvidia | ||
| 658 | if (!HasFastUniformBufferBound(stage, binding_index)) { | ||
| 659 | // We only have to bind when the currently bound buffer is not the fast version | ||
| 660 | fast_bound_uniform_buffers[stage] |= 1U << binding_index; | ||
| 661 | runtime.BindFastUniformBuffer(stage, binding_index, size); | ||
| 662 | } | ||
| 663 | const auto span = ImmediateBufferWithData(cpu_addr, size); | ||
| 664 | runtime.PushFastUniformBuffer(stage, binding_index, span); | ||
| 665 | } else { | ||
| 666 | // Stream buffer path to avoid stalling on non-Nvidia drivers | ||
| 667 | const auto span = runtime.BindMappedUniformBuffer(stage, binding_index, size); | ||
| 668 | cpu_memory.ReadBlockUnsafe(cpu_addr, span.data(), size); | ||
| 409 | } | 669 | } |
| 410 | ++it; | 670 | return; |
| 411 | } | 671 | } |
| 412 | return result; | ||
| 413 | } | 672 | } |
| 414 | 673 | // Classic cached path | |
| 415 | /// Returns a ticks counter used for tracking when cached objects were last modified | 674 | SynchronizeBuffer(buffer, cpu_addr, size); |
| 416 | u64 GetModifiedTicks() { | 675 | if (!needs_bind && !HasFastUniformBufferBound(stage, binding_index)) { |
| 417 | return ++modified_ticks; | 676 | // Skip binding if it's not needed and if the bound buffer is not the fast version |
| 677 | // This exists to avoid instances where the fast buffer is bound and a GPU write happens | ||
| 678 | return; | ||
| 418 | } | 679 | } |
| 680 | fast_bound_uniform_buffers[stage] &= ~(1U << binding_index); | ||
| 419 | 681 | ||
| 420 | void FlushMap(MapInterval* map) { | 682 | const u32 offset = buffer.Offset(cpu_addr); |
| 421 | const auto it = blocks.find(map->start >> BLOCK_PAGE_BITS); | 683 | if constexpr (NEEDS_BIND_UNIFORM_INDEX) { |
| 422 | ASSERT_OR_EXECUTE(it != blocks.end(), return;); | 684 | runtime.BindUniformBuffer(stage, binding_index, buffer, offset, size); |
| 423 | 685 | } else { | |
| 424 | std::shared_ptr<Buffer> block = it->second; | 686 | runtime.BindUniformBuffer(buffer, offset, size); |
| 425 | |||
| 426 | const std::size_t size = map->end - map->start; | ||
| 427 | staging_buffer.resize(size); | ||
| 428 | block->Download(block->Offset(map->start), size, staging_buffer.data()); | ||
| 429 | cpu_memory.WriteBlockUnsafe(map->start, staging_buffer.data(), size); | ||
| 430 | map->MarkAsModified(false, 0); | ||
| 431 | } | 687 | } |
| 688 | } | ||
| 689 | |||
| 690 | template <class P> | ||
| 691 | void BufferCache<P>::BindHostGraphicsStorageBuffers(size_t stage) { | ||
| 692 | u32 binding_index = 0; | ||
| 693 | ForEachEnabledBit(enabled_storage_buffers[stage], [&](u32 index) { | ||
| 694 | const Binding& binding = storage_buffers[stage][index]; | ||
| 695 | Buffer& buffer = slot_buffers[binding.buffer_id]; | ||
| 696 | const u32 size = binding.size; | ||
| 697 | SynchronizeBuffer(buffer, binding.cpu_addr, size); | ||
| 698 | |||
| 699 | const u32 offset = buffer.Offset(binding.cpu_addr); | ||
| 700 | const bool is_written = ((written_storage_buffers[stage] >> index) & 1) != 0; | ||
| 701 | if constexpr (NEEDS_BIND_STORAGE_INDEX) { | ||
| 702 | runtime.BindStorageBuffer(stage, binding_index, buffer, offset, size, is_written); | ||
| 703 | ++binding_index; | ||
| 704 | } else { | ||
| 705 | runtime.BindStorageBuffer(buffer, offset, size, is_written); | ||
| 706 | } | ||
| 707 | }); | ||
| 708 | } | ||
| 432 | 709 | ||
| 433 | template <typename Callable> | 710 | template <class P> |
| 434 | BufferInfo StreamBufferUpload(std::size_t size, std::size_t alignment, Callable&& callable) { | 711 | void BufferCache<P>::BindHostTransformFeedbackBuffers() { |
| 435 | AlignBuffer(alignment); | 712 | if (maxwell3d.regs.tfb_enabled == 0) { |
| 436 | const std::size_t uploaded_offset = buffer_offset; | 713 | return; |
| 437 | callable(buffer_ptr); | ||
| 438 | |||
| 439 | buffer_ptr += size; | ||
| 440 | buffer_offset += size; | ||
| 441 | return BufferInfo{stream_buffer.Handle(), uploaded_offset, stream_buffer.Address()}; | ||
| 442 | } | 714 | } |
| 443 | 715 | for (u32 index = 0; index < NUM_TRANSFORM_FEEDBACK_BUFFERS; ++index) { | |
| 444 | void AlignBuffer(std::size_t alignment) { | 716 | const Binding& binding = transform_feedback_buffers[index]; |
| 445 | // Align the offset, not the mapped pointer | 717 | Buffer& buffer = slot_buffers[binding.buffer_id]; |
| 446 | const std::size_t offset_aligned = Common::AlignUp(buffer_offset, alignment); | 718 | const u32 size = binding.size; |
| 447 | buffer_ptr += offset_aligned - buffer_offset; | 719 | SynchronizeBuffer(buffer, binding.cpu_addr, size); |
| 448 | buffer_offset = offset_aligned; | 720 | |
| 721 | const u32 offset = buffer.Offset(binding.cpu_addr); | ||
| 722 | runtime.BindTransformFeedbackBuffer(index, buffer, offset, size); | ||
| 449 | } | 723 | } |
| 724 | } | ||
| 450 | 725 | ||
| 451 | std::shared_ptr<Buffer> EnlargeBlock(std::shared_ptr<Buffer> buffer) { | 726 | template <class P> |
| 452 | const std::size_t old_size = buffer->Size(); | 727 | void BufferCache<P>::BindHostComputeUniformBuffers() { |
| 453 | const std::size_t new_size = old_size + BLOCK_PAGE_SIZE; | 728 | if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) { |
| 454 | const VAddr cpu_addr = buffer->CpuAddr(); | 729 | // Mark all uniform buffers as dirty |
| 455 | std::shared_ptr<Buffer> new_buffer = CreateBlock(cpu_addr, new_size); | 730 | dirty_uniform_buffers.fill(~u32{0}); |
| 456 | new_buffer->CopyFrom(*buffer, 0, 0, old_size); | 731 | } |
| 457 | QueueDestruction(std::move(buffer)); | 732 | u32 binding_index = 0; |
| 458 | 733 | ForEachEnabledBit(enabled_compute_uniform_buffers, [&](u32 index) { | |
| 459 | const VAddr cpu_addr_end = cpu_addr + new_size - 1; | 734 | const Binding& binding = compute_uniform_buffers[index]; |
| 460 | const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS; | 735 | Buffer& buffer = slot_buffers[binding.buffer_id]; |
| 461 | for (u64 page_start = cpu_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) { | 736 | const u32 size = binding.size; |
| 462 | blocks.insert_or_assign(page_start, new_buffer); | 737 | SynchronizeBuffer(buffer, binding.cpu_addr, size); |
| 738 | |||
| 739 | const u32 offset = buffer.Offset(binding.cpu_addr); | ||
| 740 | if constexpr (NEEDS_BIND_UNIFORM_INDEX) { | ||
| 741 | runtime.BindComputeUniformBuffer(binding_index, buffer, offset, size); | ||
| 742 | ++binding_index; | ||
| 743 | } else { | ||
| 744 | runtime.BindUniformBuffer(buffer, offset, size); | ||
| 463 | } | 745 | } |
| 746 | }); | ||
| 747 | } | ||
| 748 | |||
| 749 | template <class P> | ||
| 750 | void BufferCache<P>::BindHostComputeStorageBuffers() { | ||
| 751 | u32 binding_index = 0; | ||
| 752 | ForEachEnabledBit(enabled_compute_storage_buffers, [&](u32 index) { | ||
| 753 | const Binding& binding = compute_storage_buffers[index]; | ||
| 754 | Buffer& buffer = slot_buffers[binding.buffer_id]; | ||
| 755 | const u32 size = binding.size; | ||
| 756 | SynchronizeBuffer(buffer, binding.cpu_addr, size); | ||
| 757 | |||
| 758 | const u32 offset = buffer.Offset(binding.cpu_addr); | ||
| 759 | const bool is_written = ((written_compute_storage_buffers >> index) & 1) != 0; | ||
| 760 | if constexpr (NEEDS_BIND_STORAGE_INDEX) { | ||
| 761 | runtime.BindComputeStorageBuffer(binding_index, buffer, offset, size, is_written); | ||
| 762 | ++binding_index; | ||
| 763 | } else { | ||
| 764 | runtime.BindStorageBuffer(buffer, offset, size, is_written); | ||
| 765 | } | ||
| 766 | }); | ||
| 767 | } | ||
| 464 | 768 | ||
| 465 | return new_buffer; | 769 | template <class P> |
| 770 | void BufferCache<P>::DoUpdateGraphicsBuffers(bool is_indexed) { | ||
| 771 | if (is_indexed) { | ||
| 772 | UpdateIndexBuffer(); | ||
| 466 | } | 773 | } |
| 774 | UpdateVertexBuffers(); | ||
| 775 | UpdateTransformFeedbackBuffers(); | ||
| 776 | for (size_t stage = 0; stage < NUM_STAGES; ++stage) { | ||
| 777 | UpdateUniformBuffers(stage); | ||
| 778 | UpdateStorageBuffers(stage); | ||
| 779 | } | ||
| 780 | } | ||
| 781 | |||
| 782 | template <class P> | ||
| 783 | void BufferCache<P>::DoUpdateComputeBuffers() { | ||
| 784 | UpdateComputeUniformBuffers(); | ||
| 785 | UpdateComputeStorageBuffers(); | ||
| 786 | } | ||
| 787 | |||
| 788 | template <class P> | ||
| 789 | void BufferCache<P>::UpdateIndexBuffer() { | ||
| 790 | // We have to check for the dirty flags and index count | ||
| 791 | // The index count is currently changed without updating the dirty flags | ||
| 792 | const auto& index_array = maxwell3d.regs.index_array; | ||
| 793 | auto& flags = maxwell3d.dirty.flags; | ||
| 794 | if (!flags[Dirty::IndexBuffer] && last_index_count == index_array.count) { | ||
| 795 | return; | ||
| 796 | } | ||
| 797 | flags[Dirty::IndexBuffer] = false; | ||
| 798 | last_index_count = index_array.count; | ||
| 799 | |||
| 800 | const GPUVAddr gpu_addr_begin = index_array.StartAddress(); | ||
| 801 | const GPUVAddr gpu_addr_end = index_array.EndAddress(); | ||
| 802 | const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr_begin); | ||
| 803 | const u32 address_size = static_cast<u32>(gpu_addr_end - gpu_addr_begin); | ||
| 804 | const u32 draw_size = index_array.count * index_array.FormatSizeInBytes(); | ||
| 805 | const u32 size = std::min(address_size, draw_size); | ||
| 806 | if (size == 0 || !cpu_addr) { | ||
| 807 | index_buffer = NULL_BINDING; | ||
| 808 | return; | ||
| 809 | } | ||
| 810 | index_buffer = Binding{ | ||
| 811 | .cpu_addr = *cpu_addr, | ||
| 812 | .size = size, | ||
| 813 | .buffer_id = FindBuffer(*cpu_addr, size), | ||
| 814 | }; | ||
| 815 | } | ||
| 467 | 816 | ||
| 468 | std::shared_ptr<Buffer> MergeBlocks(std::shared_ptr<Buffer> first, | 817 | template <class P> |
| 469 | std::shared_ptr<Buffer> second) { | 818 | void BufferCache<P>::UpdateVertexBuffers() { |
| 470 | const std::size_t size_1 = first->Size(); | 819 | auto& flags = maxwell3d.dirty.flags; |
| 471 | const std::size_t size_2 = second->Size(); | 820 | if (!maxwell3d.dirty.flags[Dirty::VertexBuffers]) { |
| 472 | const VAddr first_addr = first->CpuAddr(); | 821 | return; |
| 473 | const VAddr second_addr = second->CpuAddr(); | 822 | } |
| 474 | const VAddr new_addr = std::min(first_addr, second_addr); | 823 | flags[Dirty::VertexBuffers] = false; |
| 475 | const std::size_t new_size = size_1 + size_2; | ||
| 476 | |||
| 477 | std::shared_ptr<Buffer> new_buffer = CreateBlock(new_addr, new_size); | ||
| 478 | new_buffer->CopyFrom(*first, 0, new_buffer->Offset(first_addr), size_1); | ||
| 479 | new_buffer->CopyFrom(*second, 0, new_buffer->Offset(second_addr), size_2); | ||
| 480 | QueueDestruction(std::move(first)); | ||
| 481 | QueueDestruction(std::move(second)); | ||
| 482 | 824 | ||
| 483 | const VAddr cpu_addr_end = new_addr + new_size - 1; | 825 | for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) { |
| 484 | const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS; | 826 | UpdateVertexBuffer(index); |
| 485 | for (u64 page_start = new_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) { | ||
| 486 | blocks.insert_or_assign(page_start, new_buffer); | ||
| 487 | } | ||
| 488 | return new_buffer; | ||
| 489 | } | 827 | } |
| 828 | } | ||
| 490 | 829 | ||
| 491 | Buffer* GetBlock(VAddr cpu_addr, std::size_t size) { | 830 | template <class P> |
| 492 | std::shared_ptr<Buffer> found; | 831 | void BufferCache<P>::UpdateVertexBuffer(u32 index) { |
| 832 | if (!maxwell3d.dirty.flags[Dirty::VertexBuffer0 + index]) { | ||
| 833 | return; | ||
| 834 | } | ||
| 835 | const auto& array = maxwell3d.regs.vertex_array[index]; | ||
| 836 | const auto& limit = maxwell3d.regs.vertex_array_limit[index]; | ||
| 837 | const GPUVAddr gpu_addr_begin = array.StartAddress(); | ||
| 838 | const GPUVAddr gpu_addr_end = limit.LimitAddress() + 1; | ||
| 839 | const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr_begin); | ||
| 840 | const u32 address_size = static_cast<u32>(gpu_addr_end - gpu_addr_begin); | ||
| 841 | const u32 size = address_size; // TODO: Analyze stride and number of vertices | ||
| 842 | if (array.enable == 0 || size == 0 || !cpu_addr) { | ||
| 843 | vertex_buffers[index] = NULL_BINDING; | ||
| 844 | return; | ||
| 845 | } | ||
| 846 | vertex_buffers[index] = Binding{ | ||
| 847 | .cpu_addr = *cpu_addr, | ||
| 848 | .size = size, | ||
| 849 | .buffer_id = FindBuffer(*cpu_addr, size), | ||
| 850 | }; | ||
| 851 | } | ||
| 852 | |||
| 853 | template <class P> | ||
| 854 | void BufferCache<P>::UpdateUniformBuffers(size_t stage) { | ||
| 855 | ForEachEnabledBit(enabled_uniform_buffers[stage], [&](u32 index) { | ||
| 856 | Binding& binding = uniform_buffers[stage][index]; | ||
| 857 | if (binding.buffer_id) { | ||
| 858 | // Already updated | ||
| 859 | return; | ||
| 860 | } | ||
| 861 | // Mark as dirty | ||
| 862 | if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) { | ||
| 863 | dirty_uniform_buffers[stage] |= 1U << index; | ||
| 864 | } | ||
| 865 | // Resolve buffer | ||
| 866 | binding.buffer_id = FindBuffer(binding.cpu_addr, binding.size); | ||
| 867 | }); | ||
| 868 | } | ||
| 869 | |||
| 870 | template <class P> | ||
| 871 | void BufferCache<P>::UpdateStorageBuffers(size_t stage) { | ||
| 872 | const u32 written_mask = written_storage_buffers[stage]; | ||
| 873 | ForEachEnabledBit(enabled_storage_buffers[stage], [&](u32 index) { | ||
| 874 | // Resolve buffer | ||
| 875 | Binding& binding = storage_buffers[stage][index]; | ||
| 876 | const BufferId buffer_id = FindBuffer(binding.cpu_addr, binding.size); | ||
| 877 | binding.buffer_id = buffer_id; | ||
| 878 | // Mark buffer as written if needed | ||
| 879 | if (((written_mask >> index) & 1) != 0) { | ||
| 880 | MarkWrittenBuffer(buffer_id, binding.cpu_addr, binding.size); | ||
| 881 | } | ||
| 882 | }); | ||
| 883 | } | ||
| 493 | 884 | ||
| 494 | const VAddr cpu_addr_end = cpu_addr + size - 1; | 885 | template <class P> |
| 495 | const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS; | 886 | void BufferCache<P>::UpdateTransformFeedbackBuffers() { |
| 496 | for (u64 page_start = cpu_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) { | 887 | if (maxwell3d.regs.tfb_enabled == 0) { |
| 497 | auto it = blocks.find(page_start); | 888 | return; |
| 498 | if (it == blocks.end()) { | 889 | } |
| 499 | if (found) { | 890 | for (u32 index = 0; index < NUM_TRANSFORM_FEEDBACK_BUFFERS; ++index) { |
| 500 | found = EnlargeBlock(found); | 891 | UpdateTransformFeedbackBuffer(index); |
| 501 | continue; | 892 | } |
| 502 | } | 893 | } |
| 503 | const VAddr start_addr = page_start << BLOCK_PAGE_BITS; | 894 | |
| 504 | found = CreateBlock(start_addr, BLOCK_PAGE_SIZE); | 895 | template <class P> |
| 505 | blocks.insert_or_assign(page_start, found); | 896 | void BufferCache<P>::UpdateTransformFeedbackBuffer(u32 index) { |
| 506 | continue; | 897 | const auto& binding = maxwell3d.regs.tfb_bindings[index]; |
| 507 | } | 898 | const GPUVAddr gpu_addr = binding.Address() + binding.buffer_offset; |
| 508 | if (!found) { | 899 | const u32 size = binding.buffer_size; |
| 509 | found = it->second; | 900 | const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); |
| 510 | continue; | 901 | if (binding.buffer_enable == 0 || size == 0 || !cpu_addr) { |
| 511 | } | 902 | transform_feedback_buffers[index] = NULL_BINDING; |
| 512 | if (found != it->second) { | 903 | return; |
| 513 | found = MergeBlocks(std::move(found), it->second); | 904 | } |
| 905 | const BufferId buffer_id = FindBuffer(*cpu_addr, size); | ||
| 906 | transform_feedback_buffers[index] = Binding{ | ||
| 907 | .cpu_addr = *cpu_addr, | ||
| 908 | .size = size, | ||
| 909 | .buffer_id = buffer_id, | ||
| 910 | }; | ||
| 911 | MarkWrittenBuffer(buffer_id, *cpu_addr, size); | ||
| 912 | } | ||
| 913 | |||
| 914 | template <class P> | ||
| 915 | void BufferCache<P>::UpdateComputeUniformBuffers() { | ||
| 916 | ForEachEnabledBit(enabled_compute_uniform_buffers, [&](u32 index) { | ||
| 917 | Binding& binding = compute_uniform_buffers[index]; | ||
| 918 | binding = NULL_BINDING; | ||
| 919 | const auto& launch_desc = kepler_compute.launch_description; | ||
| 920 | if (((launch_desc.const_buffer_enable_mask >> index) & 1) != 0) { | ||
| 921 | const auto& cbuf = launch_desc.const_buffer_config[index]; | ||
| 922 | const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(cbuf.Address()); | ||
| 923 | if (cpu_addr) { | ||
| 924 | binding.cpu_addr = *cpu_addr; | ||
| 925 | binding.size = cbuf.size; | ||
| 514 | } | 926 | } |
| 515 | } | 927 | } |
| 516 | return found.get(); | 928 | binding.buffer_id = FindBuffer(binding.cpu_addr, binding.size); |
| 929 | }); | ||
| 930 | } | ||
| 931 | |||
| 932 | template <class P> | ||
| 933 | void BufferCache<P>::UpdateComputeStorageBuffers() { | ||
| 934 | ForEachEnabledBit(enabled_compute_storage_buffers, [&](u32 index) { | ||
| 935 | // Resolve buffer | ||
| 936 | Binding& binding = compute_storage_buffers[index]; | ||
| 937 | const BufferId buffer_id = FindBuffer(binding.cpu_addr, binding.size); | ||
| 938 | binding.buffer_id = buffer_id; | ||
| 939 | // Mark as written if needed | ||
| 940 | if (((written_compute_storage_buffers >> index) & 1) != 0) { | ||
| 941 | MarkWrittenBuffer(buffer_id, binding.cpu_addr, binding.size); | ||
| 942 | } | ||
| 943 | }); | ||
| 944 | } | ||
| 945 | |||
| 946 | template <class P> | ||
| 947 | void BufferCache<P>::MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 size) { | ||
| 948 | Buffer& buffer = slot_buffers[buffer_id]; | ||
| 949 | buffer.MarkRegionAsGpuModified(cpu_addr, size); | ||
| 950 | |||
| 951 | const bool is_accuracy_high = Settings::IsGPULevelHigh(); | ||
| 952 | const bool is_async = Settings::values.use_asynchronous_gpu_emulation.GetValue(); | ||
| 953 | if (!is_accuracy_high || !is_async) { | ||
| 954 | return; | ||
| 517 | } | 955 | } |
| 956 | if (std::ranges::find(uncommitted_downloads, buffer_id) != uncommitted_downloads.end()) { | ||
| 957 | // Already inserted | ||
| 958 | return; | ||
| 959 | } | ||
| 960 | uncommitted_downloads.push_back(buffer_id); | ||
| 961 | } | ||
| 518 | 962 | ||
| 519 | void MarkRegionAsWritten(VAddr start, VAddr end) { | 963 | template <class P> |
| 520 | const u64 page_end = end >> WRITE_PAGE_BIT; | 964 | BufferId BufferCache<P>::FindBuffer(VAddr cpu_addr, u32 size) { |
| 521 | for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) { | 965 | if (cpu_addr == 0) { |
| 522 | if (const auto [it, inserted] = written_pages.emplace(page_start, 1); !inserted) { | 966 | return NULL_BUFFER_ID; |
| 523 | ++it->second; | 967 | } |
| 524 | } | 968 | const u64 page = cpu_addr >> PAGE_BITS; |
| 969 | const BufferId buffer_id = page_table[page]; | ||
| 970 | if (!buffer_id) { | ||
| 971 | return CreateBuffer(cpu_addr, size); | ||
| 972 | } | ||
| 973 | const Buffer& buffer = slot_buffers[buffer_id]; | ||
| 974 | if (buffer.IsInBounds(cpu_addr, size)) { | ||
| 975 | return buffer_id; | ||
| 976 | } | ||
| 977 | return CreateBuffer(cpu_addr, size); | ||
| 978 | } | ||
| 979 | |||
| 980 | template <class P> | ||
| 981 | BufferId BufferCache<P>::CreateBuffer(VAddr cpu_addr, u32 wanted_size) { | ||
| 982 | std::vector<BufferId> overlap_ids; | ||
| 983 | VAddr cpu_addr_begin = cpu_addr; | ||
| 984 | VAddr cpu_addr_end = cpu_addr + wanted_size; | ||
| 985 | for (; cpu_addr >> PAGE_BITS < Common::DivCeil(cpu_addr_end, PAGE_SIZE); | ||
| 986 | cpu_addr += PAGE_SIZE) { | ||
| 987 | const BufferId overlap_id = page_table[cpu_addr >> PAGE_BITS]; | ||
| 988 | if (!overlap_id) { | ||
| 989 | continue; | ||
| 990 | } | ||
| 991 | Buffer& overlap = slot_buffers[overlap_id]; | ||
| 992 | if (overlap.IsPicked()) { | ||
| 993 | continue; | ||
| 994 | } | ||
| 995 | overlap.Pick(); | ||
| 996 | overlap_ids.push_back(overlap_id); | ||
| 997 | const VAddr overlap_cpu_addr = overlap.CpuAddr(); | ||
| 998 | if (overlap_cpu_addr < cpu_addr_begin) { | ||
| 999 | cpu_addr = cpu_addr_begin = overlap_cpu_addr; | ||
| 525 | } | 1000 | } |
| 1001 | cpu_addr_end = std::max(cpu_addr_end, overlap_cpu_addr + overlap.SizeBytes()); | ||
| 526 | } | 1002 | } |
| 527 | 1003 | const u32 size = static_cast<u32>(cpu_addr_end - cpu_addr_begin); | |
| 528 | void UnmarkRegionAsWritten(VAddr start, VAddr end) { | 1004 | const BufferId new_buffer_id = slot_buffers.insert(runtime, rasterizer, cpu_addr_begin, size); |
| 529 | const u64 page_end = end >> WRITE_PAGE_BIT; | 1005 | Buffer& new_buffer = slot_buffers[new_buffer_id]; |
| 530 | for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) { | 1006 | |
| 531 | auto it = written_pages.find(page_start); | 1007 | for (const BufferId overlap_id : overlap_ids) { |
| 532 | if (it != written_pages.end()) { | 1008 | Buffer& overlap = slot_buffers[overlap_id]; |
| 533 | if (it->second > 1) { | 1009 | overlap.Unpick(); |
| 534 | --it->second; | 1010 | |
| 535 | } else { | 1011 | std::vector<BufferCopy> copies; |
| 536 | written_pages.erase(it); | 1012 | const size_t dst_base_offset = overlap.CpuAddr() - new_buffer.CpuAddr(); |
| 537 | } | 1013 | overlap.ForEachDownloadRange([&](u64 begin, u64 range_size) { |
| 538 | } | 1014 | copies.push_back(BufferCopy{ |
| 1015 | .src_offset = begin, | ||
| 1016 | .dst_offset = dst_base_offset + begin, | ||
| 1017 | .size = range_size, | ||
| 1018 | }); | ||
| 1019 | new_buffer.UnmarkRegionAsCpuModified(begin, range_size); | ||
| 1020 | new_buffer.MarkRegionAsGpuModified(begin, range_size); | ||
| 1021 | }); | ||
| 1022 | if (!copies.empty()) { | ||
| 1023 | runtime.CopyBuffer(slot_buffers[new_buffer_id], overlap, copies); | ||
| 1024 | } | ||
| 1025 | ReplaceBufferDownloads(overlap_id, new_buffer_id); | ||
| 1026 | DeleteBuffer(overlap_id); | ||
| 1027 | } | ||
| 1028 | Register(new_buffer_id); | ||
| 1029 | return new_buffer_id; | ||
| 1030 | } | ||
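The overlap loop at the top of CreateBuffer only ever widens the requested range: each newly picked overlap can pull cpu_addr_begin down or push cpu_addr_end up. A conceptual sketch of that widening with illustrative types and values (the real loop also rewinds the page cursor so newly covered pages are scanned too):

    #include <algorithm>
    #include <cstdint>
    #include <span>

    using VAddr = std::uint64_t; // as in common/common_types.h

    struct RangeSketch {
        VAddr begin;
        VAddr end; // one past the last byte
    };

    // Widen the requested range so it also covers every overlapping buffer.
    RangeSketch Widen(RangeSketch request, std::span<const RangeSketch> overlaps) {
        for (const RangeSketch& overlap : overlaps) {
            request.begin = std::min(request.begin, overlap.begin);
            request.end = std::max(request.end, overlap.end);
        }
        return request;
    }

    // Example: a request for [0x18000, 0x1C000) that touches buffers
    // [0x10000, 0x1A000) and [0x1B000, 0x30000) ends up as [0x10000, 0x30000).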
| 1031 | |||
| 1032 | template <class P> | ||
| 1033 | void BufferCache<P>::Register(BufferId buffer_id) { | ||
| 1034 | ChangeRegister<true>(buffer_id); | ||
| 1035 | } | ||
| 1036 | |||
| 1037 | template <class P> | ||
| 1038 | void BufferCache<P>::Unregister(BufferId buffer_id) { | ||
| 1039 | ChangeRegister<false>(buffer_id); | ||
| 1040 | } | ||
| 1041 | |||
| 1042 | template <class P> | ||
| 1043 | template <bool insert> | ||
| 1044 | void BufferCache<P>::ChangeRegister(BufferId buffer_id) { | ||
| 1045 | const Buffer& buffer = slot_buffers[buffer_id]; | ||
| 1046 | const VAddr cpu_addr_begin = buffer.CpuAddr(); | ||
| 1047 | const VAddr cpu_addr_end = cpu_addr_begin + buffer.SizeBytes(); | ||
| 1048 | const u64 page_begin = cpu_addr_begin / PAGE_SIZE; | ||
| 1049 | const u64 page_end = Common::DivCeil(cpu_addr_end, PAGE_SIZE); | ||
| 1050 | for (u64 page = page_begin; page != page_end; ++page) { | ||
| 1051 | if constexpr (insert) { | ||
| 1052 | page_table[page] = buffer_id; | ||
| 1053 | } else { | ||
| 1054 | page_table[page] = BufferId{}; | ||
| 539 | } | 1055 | } |
| 540 | } | 1056 | } |
| 1057 | } | ||
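A worked example of the page range ChangeRegister walks, assuming 64 KiB cache pages (PAGE_BITS = 16 is an assumption; the real constants are defined earlier in the header and are not part of this hunk):

    #include <cstdint>

    using u64 = std::uint64_t;

    // Assumed 64 KiB cache pages. A buffer at 0x12000 spanning 0x23000 bytes
    // ends at 0x35000, so it owns page-table entries 1, 2 and 3.
    constexpr u64 page_size = u64{1} << 16;
    constexpr u64 cpu_addr_begin = 0x12000;
    constexpr u64 cpu_addr_end = cpu_addr_begin + 0x23000; // 0x35000
    constexpr u64 page_begin = cpu_addr_begin / page_size;               // 1
    constexpr u64 page_end = (cpu_addr_end + page_size - 1) / page_size; // DivCeil -> 4
    static_assert(page_begin == 1 && page_end == 4, "the loop writes entries 1..3");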
| 541 | 1058 | ||
| 542 | bool IsRegionWritten(VAddr start, VAddr end) const { | 1059 | template <class P> |
| 543 | const u64 page_end = end >> WRITE_PAGE_BIT; | 1060 | void BufferCache<P>::SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size) { |
| 544 | for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) { | 1061 | if (buffer.CpuAddr() == 0) { |
| 545 | if (written_pages.contains(page_start)) { | 1062 | return; |
| 546 | return true; | 1063 | } |
| 1064 | SynchronizeBufferImpl(buffer, cpu_addr, size); | ||
| 1065 | } | ||
| 1066 | |||
| 1067 | template <class P> | ||
| 1068 | void BufferCache<P>::SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size) { | ||
| 1069 | boost::container::small_vector<BufferCopy, 4> copies; | ||
| 1070 | u64 total_size_bytes = 0; | ||
| 1071 | u64 largest_copy = 0; | ||
| 1072 | buffer.ForEachUploadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) { | ||
| 1073 | copies.push_back(BufferCopy{ | ||
| 1074 | .src_offset = total_size_bytes, | ||
| 1075 | .dst_offset = range_offset, | ||
| 1076 | .size = range_size, | ||
| 1077 | }); | ||
| 1078 | total_size_bytes += range_size; | ||
| 1079 | largest_copy = std::max(largest_copy, range_size); | ||
| 1080 | }); | ||
| 1081 | if (total_size_bytes == 0) { | ||
| 1082 | return; | ||
| 1083 | } | ||
| 1084 | const std::span<BufferCopy> copies_span(copies.data(), copies.size()); | ||
| 1085 | UploadMemory(buffer, total_size_bytes, largest_copy, copies_span); | ||
| 1086 | } | ||
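SynchronizeBufferImpl builds the staging layout while walking the dirty ranges: destination offsets stay where the ranges sit inside the buffer, while source offsets are packed back to back. A self-contained sketch of that packing with made-up range data (ForEachUploadRange itself belongs to the Buffer type and is not shown here):

    #include <cstdint>
    #include <span>
    #include <utility>
    #include <vector>

    using u64 = std::uint64_t;

    struct CopySketch {
        u64 src_offset; // offset inside the staging allocation
        u64 dst_offset; // offset inside the destination buffer
        u64 size;
    };

    // Build the copy plan for a list of (offset, size) dirty ranges.
    std::vector<CopySketch> PlanUploads(std::span<const std::pair<u64, u64>> dirty_ranges) {
        std::vector<CopySketch> copies;
        u64 staging_offset = 0; // becomes total_size_bytes once the walk finishes
        for (const auto& [range_offset, range_size] : dirty_ranges) {
            copies.push_back({.src_offset = staging_offset,
                              .dst_offset = range_offset,
                              .size = range_size});
            staging_offset += range_size;
        }
        return copies;
    }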
| 1087 | |||
| 1088 | template <class P> | ||
| 1089 | void BufferCache<P>::UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy, | ||
| 1090 | std::span<BufferCopy> copies) { | ||
| 1091 | if constexpr (USE_MEMORY_MAPS) { | ||
| 1092 | MappedUploadMemory(buffer, total_size_bytes, copies); | ||
| 1093 | } else { | ||
| 1094 | ImmediateUploadMemory(buffer, largest_copy, copies); | ||
| 1095 | } | ||
| 1096 | } | ||
| 1097 | |||
| 1098 | template <class P> | ||
| 1099 | void BufferCache<P>::ImmediateUploadMemory(Buffer& buffer, u64 largest_copy, | ||
| 1100 | std::span<const BufferCopy> copies) { | ||
| 1101 | std::span<u8> immediate_buffer; | ||
| 1102 | for (const BufferCopy& copy : copies) { | ||
| 1103 | std::span<const u8> upload_span; | ||
| 1104 | const VAddr cpu_addr = buffer.CpuAddr() + copy.dst_offset; | ||
| 1105 | if (IsRangeGranular(cpu_addr, copy.size)) { | ||
| 1106 | upload_span = std::span(cpu_memory.GetPointer(cpu_addr), copy.size); | ||
| 1107 | } else { | ||
| 1108 | if (immediate_buffer.empty()) { | ||
| 1109 | immediate_buffer = ImmediateBuffer(largest_copy); | ||
| 547 | } | 1110 | } |
| 1111 | cpu_memory.ReadBlockUnsafe(cpu_addr, immediate_buffer.data(), copy.size); | ||
| 1112 | upload_span = immediate_buffer.subspan(0, copy.size); | ||
| 548 | } | 1113 | } |
| 549 | return false; | 1114 | buffer.ImmediateUpload(copy.dst_offset, upload_span); |
| 550 | } | 1115 | } |
| 551 | 1116 | } | |
| 552 | void QueueDestruction(std::shared_ptr<Buffer> buffer) { | 1117 | |
| 553 | buffer->SetEpoch(epoch); | 1118 | template <class P> |
| 554 | pending_destruction.push(std::move(buffer)); | 1119 | void BufferCache<P>::MappedUploadMemory(Buffer& buffer, u64 total_size_bytes, |
| 1120 | std::span<const BufferCopy> copies) { | ||
| 1121 | auto upload_staging = runtime.UploadStagingBuffer(total_size_bytes); | ||
| 1122 | const std::span<u8> staging_pointer = upload_staging.mapped_span; | ||
| 1123 | for (const BufferCopy& copy : copies) { | ||
| 1124 | const VAddr cpu_addr = buffer.CpuAddr() + copy.dst_offset; | ||
| 1125 | u8* const src_pointer = staging_pointer.data() + copy.src_offset; | ||
| 1126 | cpu_memory.ReadBlockUnsafe(cpu_addr, src_pointer, copy.size); | ||
| 555 | } | 1127 | } |
| 556 | 1128 | runtime.CopyBuffer(buffer, upload_staging.buffer, copies); | |
| 557 | void MarkForAsyncFlush(MapInterval* map) { | 1129 | } |
| 558 | if (!uncommitted_flushes) { | 1130 | |
| 559 | uncommitted_flushes = std::make_shared<std::unordered_set<MapInterval*>>(); | 1131 | template <class P> |
| 1132 | void BufferCache<P>::DeleteBuffer(BufferId buffer_id) { | ||
| 1133 | const auto scalar_replace = [buffer_id](Binding& binding) { | ||
| 1134 | if (binding.buffer_id == buffer_id) { | ||
| 1135 | binding.buffer_id = BufferId{}; | ||
| 1136 | } | ||
| 1137 | }; | ||
| 1138 | const auto replace = [scalar_replace](std::span<Binding> bindings) { | ||
| 1139 | std::ranges::for_each(bindings, scalar_replace); | ||
| 1140 | }; | ||
| 1141 | scalar_replace(index_buffer); | ||
| 1142 | replace(vertex_buffers); | ||
| 1143 | std::ranges::for_each(uniform_buffers, replace); | ||
| 1144 | std::ranges::for_each(storage_buffers, replace); | ||
| 1145 | replace(transform_feedback_buffers); | ||
| 1146 | replace(compute_uniform_buffers); | ||
| 1147 | replace(compute_storage_buffers); | ||
| 1148 | std::erase(cached_write_buffer_ids, buffer_id); | ||
| 1149 | |||
| 1150 | // Mark the whole buffer as CPU written to stop tracking CPU writes | ||
| 1151 | Buffer& buffer = slot_buffers[buffer_id]; | ||
| 1152 | buffer.MarkRegionAsCpuModified(buffer.CpuAddr(), buffer.SizeBytes()); | ||
| 1153 | |||
| 1154 | Unregister(buffer_id); | ||
| 1155 | delayed_destruction_ring.Push(std::move(slot_buffers[buffer_id])); | ||
| 1156 | |||
| 1157 | NotifyBufferDeletion(); | ||
| 1158 | } | ||
| 1159 | |||
| 1160 | template <class P> | ||
| 1161 | void BufferCache<P>::ReplaceBufferDownloads(BufferId old_buffer_id, BufferId new_buffer_id) { | ||
| 1162 | const auto replace = [old_buffer_id, new_buffer_id](std::vector<BufferId>& buffers) { | ||
| 1163 | std::ranges::replace(buffers, old_buffer_id, new_buffer_id); | ||
| 1164 | if (auto it = std::ranges::find(buffers, new_buffer_id); it != buffers.end()) { | ||
| 1165 | buffers.erase(std::remove(it + 1, buffers.end(), new_buffer_id), buffers.end()); | ||
| 560 | } | 1166 | } |
| 561 | uncommitted_flushes->insert(map); | 1167 | }; |
| 1168 | replace(uncommitted_downloads); | ||
| 1169 | std::ranges::for_each(committed_downloads, replace); | ||
| 1170 | } | ||
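The erase/remove pass inside the replace lambda keeps only the first occurrence of new_buffer_id after the rename. A small self-contained example of that behaviour using stand-in integer IDs:

    #include <algorithm>
    #include <cassert>
    #include <vector>

    int main() {
        // Stand-in IDs: 1 plays old_buffer_id, 2 plays new_buffer_id.
        std::vector<int> downloads{7, 1, 9, 2};
        std::ranges::replace(downloads, 1, 2); // {7, 2, 9, 2}
        if (const auto it = std::ranges::find(downloads, 2); it != downloads.end()) {
            downloads.erase(std::remove(it + 1, downloads.end(), 2), downloads.end());
        }
        assert((downloads == std::vector<int>{7, 2, 9})); // later duplicates dropped
        return 0;
    }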
| 1171 | |||
| 1172 | template <class P> | ||
| 1173 | void BufferCache<P>::NotifyBufferDeletion() { | ||
| 1174 | if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) { | ||
| 1175 | dirty_uniform_buffers.fill(~u32{0}); | ||
| 562 | } | 1176 | } |
| 1177 | auto& flags = maxwell3d.dirty.flags; | ||
| 1178 | flags[Dirty::IndexBuffer] = true; | ||
| 1179 | flags[Dirty::VertexBuffers] = true; | ||
| 1180 | for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) { | ||
| 1181 | flags[Dirty::VertexBuffer0 + index] = true; | ||
| 1182 | } | ||
| 1183 | has_deleted_buffers = true; | ||
| 1184 | } | ||
| 1185 | |||
| 1186 | template <class P> | ||
| 1187 | typename BufferCache<P>::Binding BufferCache<P>::StorageBufferBinding(GPUVAddr ssbo_addr) const { | ||
| 1188 | const GPUVAddr gpu_addr = gpu_memory.Read<u64>(ssbo_addr); | ||
| 1189 | const u32 size = gpu_memory.Read<u32>(ssbo_addr + 8); | ||
| 1190 | const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); | ||
| 1191 | if (!cpu_addr || size == 0) { | ||
| 1192 | return NULL_BINDING; | ||
| 1193 | } | ||
| 1194 | const Binding binding{ | ||
| 1195 | .cpu_addr = *cpu_addr, | ||
| 1196 | .size = size, | ||
| 1197 | .buffer_id = BufferId{}, | ||
| 1198 | }; | ||
| 1199 | return binding; | ||
| 1200 | } | ||
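The two reads at the top of StorageBufferBinding imply a small GPU-visible descriptor: a 64-bit address at offset 0 followed by a 32-bit size at offset 8. The struct below only documents that implied layout; the field names are illustrative and not taken from the Maxwell register headers:

    #include <cstddef> // offsetof
    #include <cstdint>

    using u32 = std::uint32_t;
    using u64 = std::uint64_t;

    struct SsboDescriptorSketch {
        u64 gpu_addr; // gpu_memory.Read<u64>(ssbo_addr)
        u32 size;     // gpu_memory.Read<u32>(ssbo_addr + 8)
    };
    static_assert(offsetof(SsboDescriptorSketch, size) == 8, "size lives 8 bytes in");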
| 1201 | |||
| 1202 | template <class P> | ||
| 1203 | std::span<const u8> BufferCache<P>::ImmediateBufferWithData(VAddr cpu_addr, size_t size) { | ||
| 1204 | u8* const base_pointer = cpu_memory.GetPointer(cpu_addr); | ||
| 1205 | if (IsRangeGranular(cpu_addr, size) || | ||
| 1206 | base_pointer + size == cpu_memory.GetPointer(cpu_addr + size)) { | ||
| 1207 | return std::span(base_pointer, size); | ||
| 1208 | } else { | ||
| 1209 | const std::span<u8> span = ImmediateBuffer(size); | ||
| 1210 | cpu_memory.ReadBlockUnsafe(cpu_addr, span.data(), size); | ||
| 1211 | return span; | ||
| 1212 | } | ||
| 1213 | } | ||
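IsRangeGranular is another helper defined earlier in the header; the zero-copy branch above is only valid when a single GetPointer() covers the whole range. A sketch of such a check, assuming it means "the range does not cross a host memory page" and assuming 4 KiB pages (both are assumptions about the real helper):

    #include <cstddef>
    #include <cstdint>

    using VAddr = std::uint64_t;

    // Assumed 4 KiB host-visible pages; the real constant comes from Core::Memory.
    constexpr std::size_t HOST_PAGE_SIZE = 0x1000;

    // True when [addr, addr + size) stays inside one page, so one pointer is enough.
    constexpr bool IsRangeGranularSketch(VAddr addr, std::size_t size) {
        return (addr & (HOST_PAGE_SIZE - 1)) + size <= HOST_PAGE_SIZE;
    }
    static_assert(IsRangeGranularSketch(0x1000, 0x1000));
    static_assert(!IsRangeGranularSketch(0x1800, 0x1000));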
| 563 | 1214 | ||
| 564 | VideoCore::RasterizerInterface& rasterizer; | 1215 | template <class P> |
| 565 | Tegra::MemoryManager& gpu_memory; | 1216 | std::span<u8> BufferCache<P>::ImmediateBuffer(size_t wanted_capacity) { |
| 566 | Core::Memory::Memory& cpu_memory; | 1217 | if (wanted_capacity > immediate_buffer_capacity) { |
| 567 | StreamBuffer& stream_buffer; | 1218 | immediate_buffer_capacity = wanted_capacity; |
| 568 | 1219 | immediate_buffer_alloc = std::make_unique<u8[]>(wanted_capacity); | |
| 569 | u8* buffer_ptr = nullptr; | 1220 | } |
| 570 | u64 buffer_offset = 0; | 1221 | return std::span<u8>(immediate_buffer_alloc.get(), wanted_capacity); |
| 571 | u64 buffer_offset_base = 0; | 1222 | } |
| 572 | 1223 | ||
| 573 | MapIntervalAllocator mapped_addresses_allocator; | 1224 | template <class P> |
| 574 | boost::intrusive::set<MapInterval, boost::intrusive::compare<MapIntervalCompare>> | 1225 | bool BufferCache<P>::HasFastUniformBufferBound(size_t stage, u32 binding_index) const noexcept { |
| 575 | mapped_addresses; | 1226 | if constexpr (IS_OPENGL) { |
| 576 | 1227 | return ((fast_bound_uniform_buffers[stage] >> binding_index) & 1) != 0; | |
| 577 | std::unordered_map<u64, u32> written_pages; | 1228 | } else { |
| 578 | std::unordered_map<u64, std::shared_ptr<Buffer>> blocks; | 1229 | // Only OpenGL has fast uniform buffers |
| 579 | 1230 | return false; | |
| 580 | std::queue<std::shared_ptr<Buffer>> pending_destruction; | 1231 | } |
| 581 | u64 epoch = 0; | 1232 | } |
| 582 | u64 modified_ticks = 0; | ||
| 583 | |||
| 584 | std::vector<u8> staging_buffer; | ||
| 585 | |||
| 586 | std::list<MapInterval*> marked_for_unregister; | ||
| 587 | |||
| 588 | std::shared_ptr<std::unordered_set<MapInterval*>> uncommitted_flushes; | ||
| 589 | std::list<std::shared_ptr<std::list<MapInterval*>>> committed_flushes; | ||
| 590 | |||
| 591 | std::recursive_mutex mutex; | ||
| 592 | }; | ||
| 593 | 1233 | ||
| 594 | } // namespace VideoCommon | 1234 | } // namespace VideoCommon |
diff --git a/src/video_core/buffer_cache/map_interval.cpp b/src/video_core/buffer_cache/map_interval.cpp
deleted file mode 100644
index 62587e18a..000000000
--- a/src/video_core/buffer_cache/map_interval.cpp
+++ /dev/null
| @@ -1,33 +0,0 @@ | |||
| 1 | // Copyright 2020 yuzu Emulator Project | ||
| 2 | // Licensed under GPLv2 or any later version | ||
| 3 | // Refer to the license.txt file included. | ||
| 4 | |||
| 5 | #include <algorithm> | ||
| 6 | #include <array> | ||
| 7 | #include <cstddef> | ||
| 8 | #include <memory> | ||
| 9 | |||
| 10 | #include "video_core/buffer_cache/map_interval.h" | ||
| 11 | |||
| 12 | namespace VideoCommon { | ||
| 13 | |||
| 14 | MapIntervalAllocator::MapIntervalAllocator() { | ||
| 15 | FillFreeList(first_chunk); | ||
| 16 | } | ||
| 17 | |||
| 18 | MapIntervalAllocator::~MapIntervalAllocator() = default; | ||
| 19 | |||
| 20 | void MapIntervalAllocator::AllocateNewChunk() { | ||
| 21 | *new_chunk = std::make_unique<Chunk>(); | ||
| 22 | FillFreeList(**new_chunk); | ||
| 23 | new_chunk = &(*new_chunk)->next; | ||
| 24 | } | ||
| 25 | |||
| 26 | void MapIntervalAllocator::FillFreeList(Chunk& chunk) { | ||
| 27 | const std::size_t old_size = free_list.size(); | ||
| 28 | free_list.resize(old_size + chunk.data.size()); | ||
| 29 | std::transform(chunk.data.rbegin(), chunk.data.rend(), free_list.begin() + old_size, | ||
| 30 | [](MapInterval& interval) { return &interval; }); | ||
| 31 | } | ||
| 32 | |||
| 33 | } // namespace VideoCommon | ||
diff --git a/src/video_core/buffer_cache/map_interval.h b/src/video_core/buffer_cache/map_interval.h
deleted file mode 100644
index ef974b08a..000000000
--- a/src/video_core/buffer_cache/map_interval.h
+++ /dev/null
| @@ -1,93 +0,0 @@ | |||
| 1 | // Copyright 2019 yuzu Emulator Project | ||
| 2 | // Licensed under GPLv2 or any later version | ||
| 3 | // Refer to the license.txt file included. | ||
| 4 | |||
| 5 | #pragma once | ||
| 6 | |||
| 7 | #include <array> | ||
| 8 | #include <cstddef> | ||
| 9 | #include <memory> | ||
| 10 | #include <vector> | ||
| 11 | |||
| 12 | #include <boost/intrusive/set_hook.hpp> | ||
| 13 | |||
| 14 | #include "common/common_types.h" | ||
| 15 | #include "video_core/gpu.h" | ||
| 16 | |||
| 17 | namespace VideoCommon { | ||
| 18 | |||
| 19 | struct MapInterval : public boost::intrusive::set_base_hook<boost::intrusive::optimize_size<true>> { | ||
| 20 | MapInterval() = default; | ||
| 21 | |||
| 22 | /*implicit*/ MapInterval(VAddr start_) noexcept : start{start_} {} | ||
| 23 | |||
| 24 | explicit MapInterval(VAddr start_, VAddr end_, GPUVAddr gpu_addr_) noexcept | ||
| 25 | : start{start_}, end{end_}, gpu_addr{gpu_addr_} {} | ||
| 26 | |||
| 27 | bool IsInside(VAddr other_start, VAddr other_end) const noexcept { | ||
| 28 | return start <= other_start && other_end <= end; | ||
| 29 | } | ||
| 30 | |||
| 31 | bool Overlaps(VAddr other_start, VAddr other_end) const noexcept { | ||
| 32 | return start < other_end && other_start < end; | ||
| 33 | } | ||
| 34 | |||
| 35 | void MarkAsModified(bool is_modified_, u64 ticks_) noexcept { | ||
| 36 | is_modified = is_modified_; | ||
| 37 | ticks = ticks_; | ||
| 38 | } | ||
| 39 | |||
| 40 | boost::intrusive::set_member_hook<> member_hook_; | ||
| 41 | VAddr start = 0; | ||
| 42 | VAddr end = 0; | ||
| 43 | GPUVAddr gpu_addr = 0; | ||
| 44 | u64 ticks = 0; | ||
| 45 | bool is_written = false; | ||
| 46 | bool is_modified = false; | ||
| 47 | bool is_registered = false; | ||
| 48 | bool is_memory_marked = false; | ||
| 49 | bool is_sync_pending = false; | ||
| 50 | }; | ||
| 51 | |||
| 52 | struct MapIntervalCompare { | ||
| 53 | constexpr bool operator()(const MapInterval& lhs, const MapInterval& rhs) const noexcept { | ||
| 54 | return lhs.start < rhs.start; | ||
| 55 | } | ||
| 56 | }; | ||
| 57 | |||
| 58 | class MapIntervalAllocator { | ||
| 59 | public: | ||
| 60 | MapIntervalAllocator(); | ||
| 61 | ~MapIntervalAllocator(); | ||
| 62 | |||
| 63 | MapInterval* Allocate() { | ||
| 64 | if (free_list.empty()) { | ||
| 65 | AllocateNewChunk(); | ||
| 66 | } | ||
| 67 | MapInterval* const interval = free_list.back(); | ||
| 68 | free_list.pop_back(); | ||
| 69 | return interval; | ||
| 70 | } | ||
| 71 | |||
| 72 | void Release(MapInterval* interval) { | ||
| 73 | free_list.push_back(interval); | ||
| 74 | } | ||
| 75 | |||
| 76 | private: | ||
| 77 | struct Chunk { | ||
| 78 | std::unique_ptr<Chunk> next; | ||
| 79 | std::array<MapInterval, 0x8000> data; | ||
| 80 | }; | ||
| 81 | |||
| 82 | void AllocateNewChunk(); | ||
| 83 | |||
| 84 | void FillFreeList(Chunk& chunk); | ||
| 85 | |||
| 86 | std::vector<MapInterval*> free_list; | ||
| 87 | |||
| 88 | Chunk first_chunk; | ||
| 89 | |||
| 90 | std::unique_ptr<Chunk>* new_chunk = &first_chunk.next; | ||
| 91 | }; | ||
| 92 | |||
| 93 | } // namespace VideoCommon | ||