-rw-r--r--  src/video_core/CMakeLists.txt | 6
-rw-r--r--  src/video_core/buffer_cache/buffer_block.h | 62
-rw-r--r--  src/video_core/buffer_cache/buffer_cache.cpp | 13
-rw-r--r--  src/video_core/buffer_cache/buffer_cache.h | 1598
-rw-r--r--  src/video_core/buffer_cache/map_interval.cpp | 33
-rw-r--r--  src/video_core/buffer_cache/map_interval.h | 93
-rw-r--r--  src/video_core/command_classes/vic.cpp | 3
-rw-r--r--  src/video_core/dirty_flags.cpp | 29
-rw-r--r--  src/video_core/dirty_flags.h | 8
-rw-r--r--  src/video_core/dma_pusher.cpp | 2
-rw-r--r--  src/video_core/engines/kepler_compute.cpp | 1
-rw-r--r--  src/video_core/engines/kepler_memory.cpp | 1
-rw-r--r--  src/video_core/engines/maxwell_3d.cpp | 17
-rw-r--r--  src/video_core/engines/maxwell_3d.h | 12
-rw-r--r--  src/video_core/engines/maxwell_dma.cpp | 3
-rw-r--r--  src/video_core/fence_manager.h | 4
-rw-r--r--  src/video_core/host_shaders/CMakeLists.txt | 1
-rw-r--r--  src/video_core/host_shaders/vulkan_quad_array.comp | 28
-rw-r--r--  src/video_core/host_shaders/vulkan_uint8.comp | 9
-rw-r--r--  src/video_core/rasterizer_interface.h | 5
-rw-r--r--  src/video_core/renderer_opengl/gl_buffer_cache.cpp | 257
-rw-r--r--  src/video_core/renderer_opengl/gl_buffer_cache.h | 168
-rw-r--r--  src/video_core/renderer_opengl/gl_device.cpp | 16
-rw-r--r--  src/video_core/renderer_opengl/gl_device.h | 8
-rw-r--r--  src/video_core/renderer_opengl/gl_fence_manager.cpp | 2
-rw-r--r--  src/video_core/renderer_opengl/gl_fence_manager.h | 9
-rw-r--r--  src/video_core/renderer_opengl/gl_rasterizer.cpp | 574
-rw-r--r--  src/video_core/renderer_opengl/gl_rasterizer.h | 73
-rw-r--r--  src/video_core/renderer_opengl/gl_resource_manager.cpp | 6
-rw-r--r--  src/video_core/renderer_opengl/gl_resource_manager.h | 3
-rw-r--r--  src/video_core/renderer_opengl/gl_shader_decompiler.cpp | 61
-rw-r--r--  src/video_core/renderer_opengl/gl_shader_decompiler.h | 2
-rw-r--r--  src/video_core/renderer_opengl/gl_state_tracker.cpp | 25
-rw-r--r--  src/video_core/renderer_opengl/gl_state_tracker.h | 32
-rw-r--r--  src/video_core/renderer_opengl/gl_stream_buffer.cpp | 94
-rw-r--r--  src/video_core/renderer_opengl/gl_stream_buffer.h | 60
-rw-r--r--  src/video_core/renderer_opengl/gl_texture_cache.cpp | 19
-rw-r--r--  src/video_core/renderer_opengl/gl_texture_cache.h | 21
-rw-r--r--  src/video_core/renderer_opengl/renderer_opengl.cpp | 95
-rw-r--r--  src/video_core/renderer_opengl/renderer_opengl.h | 14
-rw-r--r--  src/video_core/renderer_opengl/util_shaders.cpp | 18
-rw-r--r--  src/video_core/renderer_opengl/util_shaders.h | 3
-rw-r--r--  src/video_core/renderer_vulkan/maxwell_to_vk.cpp | 6
-rw-r--r--  src/video_core/renderer_vulkan/maxwell_to_vk.h | 2
-rw-r--r--  src/video_core/renderer_vulkan/renderer_vulkan.cpp | 2
-rw-r--r--  src/video_core/renderer_vulkan/renderer_vulkan.h | 3
-rw-r--r--  src/video_core/renderer_vulkan/vk_blit_screen.cpp | 9
-rw-r--r--  src/video_core/renderer_vulkan/vk_buffer_cache.cpp | 366
-rw-r--r--  src/video_core/renderer_vulkan/vk_buffer_cache.h | 107
-rw-r--r--  src/video_core/renderer_vulkan/vk_compute_pass.cpp | 97
-rw-r--r--  src/video_core/renderer_vulkan/vk_compute_pass.h | 24
-rw-r--r--  src/video_core/renderer_vulkan/vk_fence_manager.cpp | 4
-rw-r--r--  src/video_core/renderer_vulkan/vk_fence_manager.h | 11
-rw-r--r--  src/video_core/renderer_vulkan/vk_rasterizer.cpp | 664
-rw-r--r--  src/video_core/renderer_vulkan/vk_rasterizer.h | 64
-rw-r--r--  src/video_core/renderer_vulkan/vk_scheduler.cpp | 14
-rw-r--r--  src/video_core/renderer_vulkan/vk_scheduler.h | 26
-rw-r--r--  src/video_core/renderer_vulkan/vk_shader_decompiler.cpp | 3
-rw-r--r--  src/video_core/renderer_vulkan/vk_shader_decompiler.h | 20
-rw-r--r--  src/video_core/renderer_vulkan/vk_state_tracker.cpp | 9
-rw-r--r--  src/video_core/renderer_vulkan/vk_texture_cache.cpp | 131
-rw-r--r--  src/video_core/renderer_vulkan/vk_texture_cache.h | 26
-rw-r--r--  src/video_core/shader/async_shaders.h | 9
-rw-r--r--  src/video_core/shader/decode/other.cpp | 1
-rw-r--r--  src/video_core/shader/shader_ir.h | 5
-rw-r--r--  src/video_core/texture_cache/texture_cache.h | 28
-rw-r--r--  src/video_core/vulkan_common/vulkan_memory_allocator.h | 2
67 files changed, 2514 insertions(+), 2607 deletions(-)
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index dd4c29ed3..9b931976a 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -2,10 +2,8 @@ add_subdirectory(host_shaders)
 
 add_library(video_core STATIC
     buffer_cache/buffer_base.h
-    buffer_cache/buffer_block.h
+    buffer_cache/buffer_cache.cpp
     buffer_cache/buffer_cache.h
-    buffer_cache/map_interval.cpp
-    buffer_cache/map_interval.h
     cdma_pusher.cpp
     cdma_pusher.h
     command_classes/codecs/codec.cpp
@@ -152,8 +150,6 @@ add_library(video_core STATIC
     renderer_vulkan/vk_staging_buffer_pool.h
     renderer_vulkan/vk_state_tracker.cpp
     renderer_vulkan/vk_state_tracker.h
-    renderer_vulkan/vk_stream_buffer.cpp
-    renderer_vulkan/vk_stream_buffer.h
     renderer_vulkan/vk_swapchain.cpp
     renderer_vulkan/vk_swapchain.h
     renderer_vulkan/vk_texture_cache.cpp
diff --git a/src/video_core/buffer_cache/buffer_block.h b/src/video_core/buffer_cache/buffer_block.h
deleted file mode 100644
index e9306194a..000000000
--- a/src/video_core/buffer_cache/buffer_block.h
+++ /dev/null
@@ -1,62 +0,0 @@
-// Copyright 2019 yuzu Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#pragma once
-
-#include "common/common_types.h"
-
-namespace VideoCommon {
-
-class BufferBlock {
-public:
-    [[nodiscard]] bool Overlaps(VAddr start, VAddr end) const {
-        return (cpu_addr < end) && (cpu_addr_end > start);
-    }
-
-    [[nodiscard]] bool IsInside(VAddr other_start, VAddr other_end) const {
-        return cpu_addr <= other_start && other_end <= cpu_addr_end;
-    }
-
-    [[nodiscard]] std::size_t Offset(VAddr in_addr) const {
-        return static_cast<std::size_t>(in_addr - cpu_addr);
-    }
-
-    [[nodiscard]] VAddr CpuAddr() const {
-        return cpu_addr;
-    }
-
-    [[nodiscard]] VAddr CpuAddrEnd() const {
-        return cpu_addr_end;
-    }
-
-    void SetCpuAddr(VAddr new_addr) {
-        cpu_addr = new_addr;
-        cpu_addr_end = new_addr + size;
-    }
-
-    [[nodiscard]] std::size_t Size() const {
-        return size;
-    }
-
-    [[nodiscard]] u64 Epoch() const {
-        return epoch;
-    }
-
-    void SetEpoch(u64 new_epoch) {
-        epoch = new_epoch;
-    }
-
-protected:
-    explicit BufferBlock(VAddr cpu_addr_, std::size_t size_) : size{size_} {
-        SetCpuAddr(cpu_addr_);
-    }
-
-private:
-    VAddr cpu_addr{};
-    VAddr cpu_addr_end{};
-    std::size_t size{};
-    u64 epoch{};
-};
-
-} // namespace VideoCommon
diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp
new file mode 100644
index 000000000..ab32294c8
--- /dev/null
+++ b/src/video_core/buffer_cache/buffer_cache.cpp
@@ -0,0 +1,13 @@
+// Copyright 2021 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/microprofile.h"
+
+namespace VideoCommon {
+
+MICROPROFILE_DEFINE(GPU_PrepareBuffers, "GPU", "Prepare buffers", MP_RGB(224, 128, 128));
+MICROPROFILE_DEFINE(GPU_BindUploadBuffers, "GPU", "Bind and upload buffers", MP_RGB(224, 128, 128));
+MICROPROFILE_DEFINE(GPU_DownloadMemory, "GPU", "Download buffers", MP_RGB(224, 128, 128));
+
+} // namespace VideoCommon
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index 83b9ee871..e4f3c8e35 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -4,591 +4,1231 @@
4 4
5#pragma once 5#pragma once
6 6
7#include <list> 7#include <algorithm>
8#include <array>
9#include <deque>
8#include <memory> 10#include <memory>
9#include <mutex> 11#include <mutex>
12#include <span>
10#include <unordered_map> 13#include <unordered_map>
11#include <unordered_set>
12#include <utility>
13#include <vector> 14#include <vector>
14 15
15#include <boost/container/small_vector.hpp> 16#include <boost/container/small_vector.hpp>
16#include <boost/icl/interval_set.hpp>
17#include <boost/intrusive/set.hpp>
18 17
19#include "common/alignment.h"
20#include "common/assert.h"
21#include "common/common_types.h" 18#include "common/common_types.h"
22#include "common/logging/log.h" 19#include "common/div_ceil.h"
23#include "core/core.h" 20#include "common/microprofile.h"
21#include "common/scope_exit.h"
24#include "core/memory.h" 22#include "core/memory.h"
25#include "core/settings.h" 23#include "core/settings.h"
26#include "video_core/buffer_cache/buffer_block.h" 24#include "video_core/buffer_cache/buffer_base.h"
27#include "video_core/buffer_cache/map_interval.h" 25#include "video_core/delayed_destruction_ring.h"
26#include "video_core/dirty_flags.h"
27#include "video_core/engines/kepler_compute.h"
28#include "video_core/engines/maxwell_3d.h"
28#include "video_core/memory_manager.h" 29#include "video_core/memory_manager.h"
29#include "video_core/rasterizer_interface.h" 30#include "video_core/rasterizer_interface.h"
31#include "video_core/texture_cache/slot_vector.h"
32#include "video_core/texture_cache/types.h"
30 33
31namespace VideoCommon { 34namespace VideoCommon {
32 35
33template <typename Buffer, typename BufferType, typename StreamBuffer> 36MICROPROFILE_DECLARE(GPU_PrepareBuffers);
37MICROPROFILE_DECLARE(GPU_BindUploadBuffers);
38MICROPROFILE_DECLARE(GPU_DownloadMemory);
39
40using BufferId = SlotId;
41
42constexpr u32 NUM_VERTEX_BUFFERS = 32;
43constexpr u32 NUM_TRANSFORM_FEEDBACK_BUFFERS = 4;
44constexpr u32 NUM_GRAPHICS_UNIFORM_BUFFERS = 18;
45constexpr u32 NUM_COMPUTE_UNIFORM_BUFFERS = 8;
46constexpr u32 NUM_STORAGE_BUFFERS = 16;
47constexpr u32 NUM_STAGES = 5;
48
49template <typename P>
34class BufferCache { 50class BufferCache {
35 using IntervalSet = boost::icl::interval_set<VAddr>; 51 // Page size for caching purposes.
36 using IntervalType = typename IntervalSet::interval_type; 52 // This is unrelated to the CPU page size and it can be changed as it seems optimal.
37 using VectorMapInterval = boost::container::small_vector<MapInterval*, 1>; 53 static constexpr u32 PAGE_BITS = 16;
54 static constexpr u64 PAGE_SIZE = u64{1} << PAGE_BITS;
38 55
39 static constexpr u64 WRITE_PAGE_BIT = 11; 56 static constexpr bool IS_OPENGL = P::IS_OPENGL;
40 static constexpr u64 BLOCK_PAGE_BITS = 21; 57 static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS =
41 static constexpr u64 BLOCK_PAGE_SIZE = 1ULL << BLOCK_PAGE_BITS; 58 P::HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS;
59 static constexpr bool HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT =
60 P::HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT;
61 static constexpr bool NEEDS_BIND_UNIFORM_INDEX = P::NEEDS_BIND_UNIFORM_INDEX;
62 static constexpr bool NEEDS_BIND_STORAGE_INDEX = P::NEEDS_BIND_STORAGE_INDEX;
63 static constexpr bool USE_MEMORY_MAPS = P::USE_MEMORY_MAPS;
42 64
43public: 65 static constexpr BufferId NULL_BUFFER_ID{0};
44 struct BufferInfo { 66
45 BufferType handle; 67 using Maxwell = Tegra::Engines::Maxwell3D::Regs;
46 u64 offset; 68
47 u64 address; 69 using Runtime = typename P::Runtime;
70 using Buffer = typename P::Buffer;
71
72 struct Empty {};
73
74 struct Binding {
75 VAddr cpu_addr{};
76 u32 size{};
77 BufferId buffer_id;
48 }; 78 };
49 79
50 BufferInfo UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4, 80 static constexpr Binding NULL_BINDING{
51 bool is_written = false, bool use_fast_cbuf = false) { 81 .cpu_addr = 0,
52 std::lock_guard lock{mutex}; 82 .size = 0,
83 .buffer_id = NULL_BUFFER_ID,
84 };
53 85
54 const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); 86public:
55 if (!cpu_addr) { 87 static constexpr size_t SKIP_CACHE_SIZE = 4096;
56 return GetEmptyBuffer(size);
57 }
58 88
59 // Cache management is a big overhead, so only cache entries with a given size. 89 explicit BufferCache(VideoCore::RasterizerInterface& rasterizer_,
60 // TODO: Figure out which size is the best for given games. 90 Tegra::Engines::Maxwell3D& maxwell3d_,
61 constexpr std::size_t max_stream_size = 0x800; 91 Tegra::Engines::KeplerCompute& kepler_compute_,
62 if (use_fast_cbuf || size < max_stream_size) { 92 Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_,
63 if (!is_written && !IsRegionWritten(*cpu_addr, *cpu_addr + size - 1)) { 93 Runtime& runtime_);
64 const bool is_granular = gpu_memory.IsGranularRange(gpu_addr, size);
65 if (use_fast_cbuf) {
66 u8* dest;
67 if (is_granular) {
68 dest = gpu_memory.GetPointer(gpu_addr);
69 } else {
70 staging_buffer.resize(size);
71 dest = staging_buffer.data();
72 gpu_memory.ReadBlockUnsafe(gpu_addr, dest, size);
73 }
74 return ConstBufferUpload(dest, size);
75 }
76 if (is_granular) {
77 u8* const host_ptr = gpu_memory.GetPointer(gpu_addr);
78 return StreamBufferUpload(size, alignment, [host_ptr, size](u8* dest) {
79 std::memcpy(dest, host_ptr, size);
80 });
81 } else {
82 return StreamBufferUpload(size, alignment, [this, gpu_addr, size](u8* dest) {
83 gpu_memory.ReadBlockUnsafe(gpu_addr, dest, size);
84 });
85 }
86 }
87 }
88 94
89 Buffer* const block = GetBlock(*cpu_addr, size); 95 void TickFrame();
90 MapInterval* const map = MapAddress(block, gpu_addr, *cpu_addr, size);
91 if (!map) {
92 return GetEmptyBuffer(size);
93 }
94 if (is_written) {
95 map->MarkAsModified(true, GetModifiedTicks());
96 if (Settings::IsGPULevelHigh() &&
97 Settings::values.use_asynchronous_gpu_emulation.GetValue()) {
98 MarkForAsyncFlush(map);
99 }
100 if (!map->is_written) {
101 map->is_written = true;
102 MarkRegionAsWritten(map->start, map->end - 1);
103 }
104 }
105 96
106 return BufferInfo{block->Handle(), block->Offset(*cpu_addr), block->Address()}; 97 void WriteMemory(VAddr cpu_addr, u64 size);
107 }
108 98
109 /// Uploads from a host memory. Returns the OpenGL buffer where it's located and its offset. 99 void CachedWriteMemory(VAddr cpu_addr, u64 size);
110 BufferInfo UploadHostMemory(const void* raw_pointer, std::size_t size,
111 std::size_t alignment = 4) {
112 std::lock_guard lock{mutex};
113 return StreamBufferUpload(size, alignment, [raw_pointer, size](u8* dest) {
114 std::memcpy(dest, raw_pointer, size);
115 });
116 }
117 100
118 /// Prepares the buffer cache for data uploading 101 void DownloadMemory(VAddr cpu_addr, u64 size);
119 /// @param max_size Maximum number of bytes that will be uploaded
120 /// @return True when a stream buffer invalidation was required, false otherwise
121 void Map(std::size_t max_size) {
122 std::lock_guard lock{mutex};
123 102
124 std::tie(buffer_ptr, buffer_offset_base) = stream_buffer.Map(max_size, 4); 103 void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size);
125 buffer_offset = buffer_offset_base;
126 }
127 104
128 /// Finishes the upload stream 105 void UpdateGraphicsBuffers(bool is_indexed);
129 void Unmap() {
130 std::lock_guard lock{mutex};
131 stream_buffer.Unmap(buffer_offset - buffer_offset_base);
132 }
133 106
134 /// Function called at the end of each frame, inteded for deferred operations 107 void UpdateComputeBuffers();
135 void TickFrame() {
136 ++epoch;
137 108
138 while (!pending_destruction.empty()) { 109 void BindHostGeometryBuffers(bool is_indexed);
139 // Delay at least 4 frames before destruction.
140 // This is due to triple buffering happening on some drivers.
141 static constexpr u64 epochs_to_destroy = 5;
142 if (pending_destruction.front()->Epoch() + epochs_to_destroy > epoch) {
143 break;
144 }
145 pending_destruction.pop();
146 }
147 }
148 110
149 /// Write any cached resources overlapping the specified region back to memory 111 void BindHostStageBuffers(size_t stage);
150 void FlushRegion(VAddr addr, std::size_t size) {
151 std::lock_guard lock{mutex};
152 112
153 VectorMapInterval objects = GetMapsInRange(addr, size); 113 void BindHostComputeBuffers();
154 std::sort(objects.begin(), objects.end(),
155 [](MapInterval* lhs, MapInterval* rhs) { return lhs->ticks < rhs->ticks; });
156 for (MapInterval* object : objects) {
157 if (object->is_modified && object->is_registered) {
158 mutex.unlock();
159 FlushMap(object);
160 mutex.lock();
161 }
162 }
163 }
164 114
165 bool MustFlushRegion(VAddr addr, std::size_t size) { 115 void SetEnabledUniformBuffers(size_t stage, u32 enabled);
166 std::lock_guard lock{mutex};
167 116
168 const VectorMapInterval objects = GetMapsInRange(addr, size); 117 void SetEnabledComputeUniformBuffers(u32 enabled);
169 return std::any_of(objects.cbegin(), objects.cend(), [](const MapInterval* map) {
170 return map->is_modified && map->is_registered;
171 });
172 }
173 118
174 /// Mark the specified region as being invalidated 119 void UnbindGraphicsStorageBuffers(size_t stage);
175 void InvalidateRegion(VAddr addr, u64 size) {
176 std::lock_guard lock{mutex};
177 120
178 for (auto& object : GetMapsInRange(addr, size)) { 121 void BindGraphicsStorageBuffer(size_t stage, size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset,
179 if (object->is_registered) { 122 bool is_written);
180 Unregister(object);
181 }
182 }
183 }
184 123
185 void OnCPUWrite(VAddr addr, std::size_t size) { 124 void UnbindComputeStorageBuffers();
186 std::lock_guard lock{mutex};
187 125
188 for (MapInterval* object : GetMapsInRange(addr, size)) { 126 void BindComputeStorageBuffer(size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset,
189 if (object->is_memory_marked && object->is_registered) { 127 bool is_written);
190 UnmarkMemory(object);
191 object->is_sync_pending = true;
192 marked_for_unregister.emplace_back(object);
193 }
194 }
195 }
196 128
197 void SyncGuestHost() { 129 void FlushCachedWrites();
198 std::lock_guard lock{mutex};
199 130
200 for (auto& object : marked_for_unregister) { 131 /// Return true when there are uncommitted buffers to be downloaded
201 if (object->is_registered) { 132 [[nodiscard]] bool HasUncommittedFlushes() const noexcept;
202 object->is_sync_pending = false; 133
203 Unregister(object); 134 /// Return true when the caller should wait for async downloads
204 } 135 [[nodiscard]] bool ShouldWaitAsyncFlushes() const noexcept;
136
137 /// Commit asynchronous downloads
138 void CommitAsyncFlushes();
139
140 /// Pop asynchronous downloads
141 void PopAsyncFlushes();
142
143 /// Return true when a CPU region is modified from the GPU
144 [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size);
145
146 std::mutex mutex;
147
148private:
149 template <typename Func>
150 static void ForEachEnabledBit(u32 enabled_mask, Func&& func) {
151 for (u32 index = 0; enabled_mask != 0; ++index, enabled_mask >>= 1) {
152 const int disabled_bits = std::countr_zero(enabled_mask);
153 index += disabled_bits;
154 enabled_mask >>= disabled_bits;
155 func(index);
205 } 156 }
206 marked_for_unregister.clear();
207 } 157 }
208 158
209 void CommitAsyncFlushes() { 159 template <typename Func>
210 if (uncommitted_flushes) { 160 void ForEachBufferInRange(VAddr cpu_addr, u64 size, Func&& func) {
211 auto commit_list = std::make_shared<std::list<MapInterval*>>(); 161 const u64 page_end = Common::DivCeil(cpu_addr + size, PAGE_SIZE);
212 for (MapInterval* map : *uncommitted_flushes) { 162 for (u64 page = cpu_addr >> PAGE_BITS; page < page_end;) {
213 if (map->is_registered && map->is_modified) { 163 const BufferId buffer_id = page_table[page];
214 // TODO(Blinkhawk): Implement backend asynchronous flushing 164 if (!buffer_id) {
215 // AsyncFlushMap(map) 165 ++page;
216 commit_list->push_back(map); 166 continue;
217 }
218 }
219 if (!commit_list->empty()) {
220 committed_flushes.push_back(commit_list);
221 } else {
222 committed_flushes.emplace_back();
223 } 167 }
224 } else { 168 Buffer& buffer = slot_buffers[buffer_id];
225 committed_flushes.emplace_back(); 169 func(buffer_id, buffer);
170
171 const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes();
172 page = Common::DivCeil(end_addr, PAGE_SIZE);
226 } 173 }
227 uncommitted_flushes.reset();
228 } 174 }
229 175
230 bool ShouldWaitAsyncFlushes() const { 176 static bool IsRangeGranular(VAddr cpu_addr, size_t size) {
231 return !committed_flushes.empty() && committed_flushes.front() != nullptr; 177 return (cpu_addr & ~Core::Memory::PAGE_MASK) ==
178 ((cpu_addr + size) & ~Core::Memory::PAGE_MASK);
232 } 179 }
233 180
234 bool HasUncommittedFlushes() const { 181 void BindHostIndexBuffer();
235 return uncommitted_flushes != nullptr;
236 }
237 182
238 void PopAsyncFlushes() { 183 void BindHostVertexBuffers();
239 if (committed_flushes.empty()) {
240 return;
241 }
242 auto& flush_list = committed_flushes.front();
243 if (!flush_list) {
244 committed_flushes.pop_front();
245 return;
246 }
247 for (MapInterval* map : *flush_list) {
248 if (map->is_registered) {
249 // TODO(Blinkhawk): Replace this for reading the asynchronous flush
250 FlushMap(map);
251 }
252 }
253 committed_flushes.pop_front();
254 }
255 184
256 virtual BufferInfo GetEmptyBuffer(std::size_t size) = 0; 185 void BindHostGraphicsUniformBuffers(size_t stage);
257 186
258protected: 187 void BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 binding_index, bool needs_bind);
259 explicit BufferCache(VideoCore::RasterizerInterface& rasterizer_,
260 Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_,
261 StreamBuffer& stream_buffer_)
262 : rasterizer{rasterizer_}, gpu_memory{gpu_memory_}, cpu_memory{cpu_memory_},
263 stream_buffer{stream_buffer_} {}
264 188
265 ~BufferCache() = default; 189 void BindHostGraphicsStorageBuffers(size_t stage);
266 190
267 virtual std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) = 0; 191 void BindHostTransformFeedbackBuffers();
268 192
269 virtual BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) { 193 void BindHostComputeUniformBuffers();
270 return {};
271 }
272 194
273 /// Register an object into the cache 195 void BindHostComputeStorageBuffers();
274 MapInterval* Register(MapInterval new_map, bool inherit_written = false) {
275 const VAddr cpu_addr = new_map.start;
276 if (!cpu_addr) {
277 LOG_CRITICAL(HW_GPU, "Failed to register buffer with unmapped gpu_address 0x{:016x}",
278 new_map.gpu_addr);
279 return nullptr;
280 }
281 const std::size_t size = new_map.end - new_map.start;
282 new_map.is_registered = true;
283 rasterizer.UpdatePagesCachedCount(cpu_addr, size, 1);
284 new_map.is_memory_marked = true;
285 if (inherit_written) {
286 MarkRegionAsWritten(new_map.start, new_map.end - 1);
287 new_map.is_written = true;
288 }
289 MapInterval* const storage = mapped_addresses_allocator.Allocate();
290 *storage = new_map;
291 mapped_addresses.insert(*storage);
292 return storage;
293 }
294 196
295 void UnmarkMemory(MapInterval* map) { 197 void DoUpdateGraphicsBuffers(bool is_indexed);
296 if (!map->is_memory_marked) { 198
297 return; 199 void DoUpdateComputeBuffers();
298 } 200
299 const std::size_t size = map->end - map->start; 201 void UpdateIndexBuffer();
300 rasterizer.UpdatePagesCachedCount(map->start, size, -1); 202
301 map->is_memory_marked = false; 203 void UpdateVertexBuffers();
302 } 204
303 205 void UpdateVertexBuffer(u32 index);
304 /// Unregisters an object from the cache 206
305 void Unregister(MapInterval* map) { 207 void UpdateUniformBuffers(size_t stage);
306 UnmarkMemory(map); 208
307 map->is_registered = false; 209 void UpdateStorageBuffers(size_t stage);
308 if (map->is_sync_pending) { 210
309 map->is_sync_pending = false; 211 void UpdateTransformFeedbackBuffers();
310 marked_for_unregister.remove(map); 212
213 void UpdateTransformFeedbackBuffer(u32 index);
214
215 void UpdateComputeUniformBuffers();
216
217 void UpdateComputeStorageBuffers();
218
219 void MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 size);
220
221 [[nodiscard]] BufferId FindBuffer(VAddr cpu_addr, u32 size);
222
223 [[nodiscard]] BufferId CreateBuffer(VAddr cpu_addr, u32 wanted_size);
224
225 void Register(BufferId buffer_id);
226
227 void Unregister(BufferId buffer_id);
228
229 template <bool insert>
230 void ChangeRegister(BufferId buffer_id);
231
232 void SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size);
233
234 void SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size);
235
236 void UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy,
237 std::span<BufferCopy> copies);
238
239 void ImmediateUploadMemory(Buffer& buffer, u64 largest_copy,
240 std::span<const BufferCopy> copies);
241
242 void MappedUploadMemory(Buffer& buffer, u64 total_size_bytes,
243 std::span<const BufferCopy> copies);
244
245 void DeleteBuffer(BufferId buffer_id);
246
247 void ReplaceBufferDownloads(BufferId old_buffer_id, BufferId new_buffer_id);
248
249 void NotifyBufferDeletion();
250
251 [[nodiscard]] Binding StorageBufferBinding(GPUVAddr ssbo_addr) const;
252
253 [[nodiscard]] std::span<const u8> ImmediateBufferWithData(VAddr cpu_addr, size_t size);
254
255 [[nodiscard]] std::span<u8> ImmediateBuffer(size_t wanted_capacity);
256
257 [[nodiscard]] bool HasFastUniformBufferBound(size_t stage, u32 binding_index) const noexcept;
258
259 VideoCore::RasterizerInterface& rasterizer;
260 Tegra::Engines::Maxwell3D& maxwell3d;
261 Tegra::Engines::KeplerCompute& kepler_compute;
262 Tegra::MemoryManager& gpu_memory;
263 Core::Memory::Memory& cpu_memory;
264 Runtime& runtime;
265
266 SlotVector<Buffer> slot_buffers;
267 DelayedDestructionRing<Buffer, 8> delayed_destruction_ring;
268
269 u32 last_index_count = 0;
270
271 Binding index_buffer;
272 std::array<Binding, NUM_VERTEX_BUFFERS> vertex_buffers;
273 std::array<std::array<Binding, NUM_GRAPHICS_UNIFORM_BUFFERS>, NUM_STAGES> uniform_buffers;
274 std::array<std::array<Binding, NUM_STORAGE_BUFFERS>, NUM_STAGES> storage_buffers;
275 std::array<Binding, NUM_TRANSFORM_FEEDBACK_BUFFERS> transform_feedback_buffers;
276
277 std::array<Binding, NUM_COMPUTE_UNIFORM_BUFFERS> compute_uniform_buffers;
278 std::array<Binding, NUM_STORAGE_BUFFERS> compute_storage_buffers;
279
280 std::array<u32, NUM_STAGES> enabled_uniform_buffers{};
281 u32 enabled_compute_uniform_buffers = 0;
282
283 std::array<u32, NUM_STAGES> enabled_storage_buffers{};
284 std::array<u32, NUM_STAGES> written_storage_buffers{};
285 u32 enabled_compute_storage_buffers = 0;
286 u32 written_compute_storage_buffers = 0;
287
288 std::array<u32, NUM_STAGES> fast_bound_uniform_buffers{};
289
290 bool has_deleted_buffers = false;
291
292 std::conditional_t<HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS, std::array<u32, NUM_STAGES>, Empty>
293 dirty_uniform_buffers{};
294
295 std::vector<BufferId> cached_write_buffer_ids;
296
297 // TODO: This data structure is not optimal and it should be reworked
298 std::vector<BufferId> uncommitted_downloads;
299 std::deque<std::vector<BufferId>> committed_downloads;
300
301 size_t immediate_buffer_capacity = 0;
302 std::unique_ptr<u8[]> immediate_buffer_alloc;
303
304 std::array<BufferId, ((1ULL << 39) >> PAGE_BITS)> page_table;
305};
306
307template <class P>
308BufferCache<P>::BufferCache(VideoCore::RasterizerInterface& rasterizer_,
309 Tegra::Engines::Maxwell3D& maxwell3d_,
310 Tegra::Engines::KeplerCompute& kepler_compute_,
311 Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_,
312 Runtime& runtime_)
313 : rasterizer{rasterizer_}, maxwell3d{maxwell3d_}, kepler_compute{kepler_compute_},
314 gpu_memory{gpu_memory_}, cpu_memory{cpu_memory_}, runtime{runtime_} {
315 // Ensure the first slot is used for the null buffer
316 void(slot_buffers.insert(runtime, NullBufferParams{}));
317}
318
319template <class P>
320void BufferCache<P>::TickFrame() {
321 delayed_destruction_ring.Tick();
322}
323
324template <class P>
325void BufferCache<P>::WriteMemory(VAddr cpu_addr, u64 size) {
326 ForEachBufferInRange(cpu_addr, size, [&](BufferId, Buffer& buffer) {
327 buffer.MarkRegionAsCpuModified(cpu_addr, size);
328 });
329}
330
331template <class P>
332void BufferCache<P>::CachedWriteMemory(VAddr cpu_addr, u64 size) {
333 ForEachBufferInRange(cpu_addr, size, [&](BufferId buffer_id, Buffer& buffer) {
334 if (!buffer.HasCachedWrites()) {
335 cached_write_buffer_ids.push_back(buffer_id);
311 } 336 }
312 if (map->is_written) { 337 buffer.CachedCpuWrite(cpu_addr, size);
313 UnmarkRegionAsWritten(map->start, map->end - 1); 338 });
339}
340
341template <class P>
342void BufferCache<P>::DownloadMemory(VAddr cpu_addr, u64 size) {
343 ForEachBufferInRange(cpu_addr, size, [&](BufferId, Buffer& buffer) {
344 boost::container::small_vector<BufferCopy, 1> copies;
345 u64 total_size_bytes = 0;
346 u64 largest_copy = 0;
347 buffer.ForEachDownloadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) {
348 copies.push_back(BufferCopy{
349 .src_offset = range_offset,
350 .dst_offset = total_size_bytes,
351 .size = range_size,
352 });
353 total_size_bytes += range_size;
354 largest_copy = std::max(largest_copy, range_size);
355 });
356 if (total_size_bytes == 0) {
357 return;
314 } 358 }
315 const auto it = mapped_addresses.find(*map); 359 MICROPROFILE_SCOPE(GPU_DownloadMemory);
316 ASSERT(it != mapped_addresses.end()); 360
317 mapped_addresses.erase(it); 361 if constexpr (USE_MEMORY_MAPS) {
318 mapped_addresses_allocator.Release(map); 362 auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes);
319 } 363 const u8* const mapped_memory = download_staging.mapped_span.data();
320 364 const std::span<BufferCopy> copies_span(copies.data(), copies.data() + copies.size());
321private: 365 runtime.CopyBuffer(download_staging.buffer, buffer, copies_span);
322 MapInterval* MapAddress(Buffer* block, GPUVAddr gpu_addr, VAddr cpu_addr, std::size_t size) { 366 runtime.Finish();
323 const VectorMapInterval overlaps = GetMapsInRange(cpu_addr, size); 367 for (const BufferCopy& copy : copies) {
324 if (overlaps.empty()) { 368 const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset;
325 const VAddr cpu_addr_end = cpu_addr + size; 369 const u8* copy_mapped_memory = mapped_memory + copy.dst_offset;
326 if (gpu_memory.IsGranularRange(gpu_addr, size)) { 370 cpu_memory.WriteBlockUnsafe(copy_cpu_addr, copy_mapped_memory, copy.size);
327 u8* const host_ptr = gpu_memory.GetPointer(gpu_addr);
328 block->Upload(block->Offset(cpu_addr), size, host_ptr);
329 } else {
330 staging_buffer.resize(size);
331 gpu_memory.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size);
332 block->Upload(block->Offset(cpu_addr), size, staging_buffer.data());
333 } 371 }
334 return Register(MapInterval(cpu_addr, cpu_addr_end, gpu_addr)); 372 } else {
335 } 373 const std::span<u8> immediate_buffer = ImmediateBuffer(largest_copy);
336 374 for (const BufferCopy& copy : copies) {
337 const VAddr cpu_addr_end = cpu_addr + size; 375 buffer.ImmediateDownload(copy.src_offset, immediate_buffer.subspan(0, copy.size));
338 if (overlaps.size() == 1) { 376 const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset;
339 MapInterval* const current_map = overlaps[0]; 377 cpu_memory.WriteBlockUnsafe(copy_cpu_addr, immediate_buffer.data(), copy.size);
340 if (current_map->IsInside(cpu_addr, cpu_addr_end)) {
341 return current_map;
342 } 378 }
343 } 379 }
344 VAddr new_start = cpu_addr; 380 });
345 VAddr new_end = cpu_addr_end; 381}
346 bool write_inheritance = false; 382
347 bool modified_inheritance = false; 383template <class P>
348 // Calculate new buffer parameters 384void BufferCache<P>::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr,
349 for (MapInterval* overlap : overlaps) { 385 u32 size) {
350 new_start = std::min(overlap->start, new_start); 386 const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr);
351 new_end = std::max(overlap->end, new_end); 387 if (!cpu_addr) {
352 write_inheritance |= overlap->is_written; 388 uniform_buffers[stage][index] = NULL_BINDING;
353 modified_inheritance |= overlap->is_modified; 389 return;
390 }
391 const Binding binding{
392 .cpu_addr = *cpu_addr,
393 .size = size,
394 .buffer_id = BufferId{},
395 };
396 uniform_buffers[stage][index] = binding;
397}
398
399template <class P>
400void BufferCache<P>::UpdateGraphicsBuffers(bool is_indexed) {
401 MICROPROFILE_SCOPE(GPU_PrepareBuffers);
402 do {
403 has_deleted_buffers = false;
404 DoUpdateGraphicsBuffers(is_indexed);
405 } while (has_deleted_buffers);
406}
407
408template <class P>
409void BufferCache<P>::UpdateComputeBuffers() {
410 MICROPROFILE_SCOPE(GPU_PrepareBuffers);
411 do {
412 has_deleted_buffers = false;
413 DoUpdateComputeBuffers();
414 } while (has_deleted_buffers);
415}
416
417template <class P>
418void BufferCache<P>::BindHostGeometryBuffers(bool is_indexed) {
419 MICROPROFILE_SCOPE(GPU_BindUploadBuffers);
420 if (is_indexed) {
421 BindHostIndexBuffer();
422 } else if constexpr (!HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT) {
423 const auto& regs = maxwell3d.regs;
424 if (regs.draw.topology == Maxwell::PrimitiveTopology::Quads) {
425 runtime.BindQuadArrayIndexBuffer(regs.vertex_buffer.first, regs.vertex_buffer.count);
354 } 426 }
355 GPUVAddr new_gpu_addr = gpu_addr + new_start - cpu_addr; 427 }
356 for (auto& overlap : overlaps) { 428 BindHostVertexBuffers();
357 Unregister(overlap); 429 BindHostTransformFeedbackBuffers();
430}
431
432template <class P>
433void BufferCache<P>::BindHostStageBuffers(size_t stage) {
434 MICROPROFILE_SCOPE(GPU_BindUploadBuffers);
435 BindHostGraphicsUniformBuffers(stage);
436 BindHostGraphicsStorageBuffers(stage);
437}
438
439template <class P>
440void BufferCache<P>::BindHostComputeBuffers() {
441 MICROPROFILE_SCOPE(GPU_BindUploadBuffers);
442 BindHostComputeUniformBuffers();
443 BindHostComputeStorageBuffers();
444}
445
446template <class P>
447void BufferCache<P>::SetEnabledUniformBuffers(size_t stage, u32 enabled) {
448 if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) {
449 if (enabled_uniform_buffers[stage] != enabled) {
450 dirty_uniform_buffers[stage] = ~u32{0};
358 } 451 }
359 UpdateBlock(block, new_start, new_end, overlaps); 452 }
360 453 enabled_uniform_buffers[stage] = enabled;
361 const MapInterval new_map{new_start, new_end, new_gpu_addr}; 454}
362 MapInterval* const map = Register(new_map, write_inheritance); 455
363 if (!map) { 456template <class P>
364 return nullptr; 457void BufferCache<P>::SetEnabledComputeUniformBuffers(u32 enabled) {
458 enabled_compute_uniform_buffers = enabled;
459}
460
461template <class P>
462void BufferCache<P>::UnbindGraphicsStorageBuffers(size_t stage) {
463 enabled_storage_buffers[stage] = 0;
464 written_storage_buffers[stage] = 0;
465}
466
467template <class P>
468void BufferCache<P>::BindGraphicsStorageBuffer(size_t stage, size_t ssbo_index, u32 cbuf_index,
469 u32 cbuf_offset, bool is_written) {
470 enabled_storage_buffers[stage] |= 1U << ssbo_index;
471 written_storage_buffers[stage] |= (is_written ? 1U : 0U) << ssbo_index;
472
473 const auto& cbufs = maxwell3d.state.shader_stages[stage];
474 const GPUVAddr ssbo_addr = cbufs.const_buffers[cbuf_index].address + cbuf_offset;
475 storage_buffers[stage][ssbo_index] = StorageBufferBinding(ssbo_addr);
476}
477
478template <class P>
479void BufferCache<P>::UnbindComputeStorageBuffers() {
480 enabled_compute_storage_buffers = 0;
481 written_compute_storage_buffers = 0;
482}
483
484template <class P>
485void BufferCache<P>::BindComputeStorageBuffer(size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset,
486 bool is_written) {
487 enabled_compute_storage_buffers |= 1U << ssbo_index;
488 written_compute_storage_buffers |= (is_written ? 1U : 0U) << ssbo_index;
489
490 const auto& launch_desc = kepler_compute.launch_description;
491 ASSERT(((launch_desc.const_buffer_enable_mask >> cbuf_index) & 1) != 0);
492
493 const auto& cbufs = launch_desc.const_buffer_config;
494 const GPUVAddr ssbo_addr = cbufs[cbuf_index].Address() + cbuf_offset;
495 compute_storage_buffers[ssbo_index] = StorageBufferBinding(ssbo_addr);
496}
497
498template <class P>
499void BufferCache<P>::FlushCachedWrites() {
500 for (const BufferId buffer_id : cached_write_buffer_ids) {
501 slot_buffers[buffer_id].FlushCachedWrites();
502 }
503 cached_write_buffer_ids.clear();
504}
505
506template <class P>
507bool BufferCache<P>::HasUncommittedFlushes() const noexcept {
508 return !uncommitted_downloads.empty();
509}
510
511template <class P>
512bool BufferCache<P>::ShouldWaitAsyncFlushes() const noexcept {
513 return !committed_downloads.empty() && !committed_downloads.front().empty();
514}
515
516template <class P>
517void BufferCache<P>::CommitAsyncFlushes() {
518 // This is intentionally passing the value by copy
519 committed_downloads.push_front(uncommitted_downloads);
520 uncommitted_downloads.clear();
521}
522
523template <class P>
524void BufferCache<P>::PopAsyncFlushes() {
525 if (committed_downloads.empty()) {
526 return;
527 }
528 auto scope_exit_pop_download = detail::ScopeExit([this] { committed_downloads.pop_back(); });
529 const std::span<const BufferId> download_ids = committed_downloads.back();
530 if (download_ids.empty()) {
531 return;
532 }
533 MICROPROFILE_SCOPE(GPU_DownloadMemory);
534
535 boost::container::small_vector<std::pair<BufferCopy, BufferId>, 1> downloads;
536 u64 total_size_bytes = 0;
537 u64 largest_copy = 0;
538 for (const BufferId buffer_id : download_ids) {
539 slot_buffers[buffer_id].ForEachDownloadRange([&](u64 range_offset, u64 range_size) {
540 downloads.push_back({
541 BufferCopy{
542 .src_offset = range_offset,
543 .dst_offset = total_size_bytes,
544 .size = range_size,
545 },
546 buffer_id,
547 });
548 total_size_bytes += range_size;
549 largest_copy = std::max(largest_copy, range_size);
550 });
551 }
552 if (downloads.empty()) {
553 return;
554 }
555 if constexpr (USE_MEMORY_MAPS) {
556 auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes);
557 for (const auto [copy, buffer_id] : downloads) {
558 const std::array copies{copy};
559 runtime.CopyBuffer(download_staging.buffer, slot_buffers[buffer_id], copies);
365 } 560 }
366 if (modified_inheritance) { 561 runtime.Finish();
367 map->MarkAsModified(true, GetModifiedTicks()); 562 for (const auto [copy, buffer_id] : downloads) {
368 if (Settings::IsGPULevelHigh() && 563 const Buffer& buffer = slot_buffers[buffer_id];
369 Settings::values.use_asynchronous_gpu_emulation.GetValue()) { 564 const VAddr cpu_addr = buffer.CpuAddr() + copy.src_offset;
370 MarkForAsyncFlush(map); 565 const u8* read_mapped_memory = download_staging.mapped_span.data() + copy.dst_offset;
371 } 566 cpu_memory.WriteBlockUnsafe(cpu_addr, read_mapped_memory, copy.size);
567 }
568 } else {
569 const std::span<u8> immediate_buffer = ImmediateBuffer(largest_copy);
570 for (const auto [copy, buffer_id] : downloads) {
571 Buffer& buffer = slot_buffers[buffer_id];
572 buffer.ImmediateDownload(copy.src_offset, immediate_buffer.subspan(0, copy.size));
573 const VAddr cpu_addr = buffer.CpuAddr() + copy.src_offset;
574 cpu_memory.WriteBlockUnsafe(cpu_addr, immediate_buffer.data(), copy.size);
372 } 575 }
373 return map;
374 } 576 }
375 577}
376 void UpdateBlock(Buffer* block, VAddr start, VAddr end, const VectorMapInterval& overlaps) { 578
377 const IntervalType base_interval{start, end}; 579template <class P>
378 IntervalSet interval_set{}; 580bool BufferCache<P>::IsRegionGpuModified(VAddr addr, size_t size) {
379 interval_set.add(base_interval); 581 const u64 page_end = Common::DivCeil(addr + size, PAGE_SIZE);
380 for (auto& overlap : overlaps) { 582 for (u64 page = addr >> PAGE_BITS; page < page_end;) {
381 const IntervalType subtract{overlap->start, overlap->end}; 583 const BufferId image_id = page_table[page];
382 interval_set.subtract(subtract); 584 if (!image_id) {
585 ++page;
586 continue;
383 } 587 }
384 for (auto& interval : interval_set) { 588 Buffer& buffer = slot_buffers[image_id];
385 const std::size_t size = interval.upper() - interval.lower(); 589 if (buffer.IsRegionGpuModified(addr, size)) {
386 if (size == 0) { 590 return true;
387 continue;
388 }
389 staging_buffer.resize(size);
390 cpu_memory.ReadBlockUnsafe(interval.lower(), staging_buffer.data(), size);
391 block->Upload(block->Offset(interval.lower()), size, staging_buffer.data());
392 } 591 }
592 const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes();
593 page = Common::DivCeil(end_addr, PAGE_SIZE);
393 } 594 }
394 595 return false;
395 VectorMapInterval GetMapsInRange(VAddr addr, std::size_t size) { 596}
396 VectorMapInterval result; 597
397 if (size == 0) { 598template <class P>
398 return result; 599void BufferCache<P>::BindHostIndexBuffer() {
600 Buffer& buffer = slot_buffers[index_buffer.buffer_id];
601 const u32 offset = buffer.Offset(index_buffer.cpu_addr);
602 const u32 size = index_buffer.size;
603 SynchronizeBuffer(buffer, index_buffer.cpu_addr, size);
604 if constexpr (HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT) {
605 runtime.BindIndexBuffer(buffer, offset, size);
606 } else {
607 runtime.BindIndexBuffer(maxwell3d.regs.draw.topology, maxwell3d.regs.index_array.format,
608 maxwell3d.regs.index_array.first, maxwell3d.regs.index_array.count,
609 buffer, offset, size);
610 }
611}
612
613template <class P>
614void BufferCache<P>::BindHostVertexBuffers() {
615 auto& flags = maxwell3d.dirty.flags;
616 for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) {
617 const Binding& binding = vertex_buffers[index];
618 Buffer& buffer = slot_buffers[binding.buffer_id];
619 SynchronizeBuffer(buffer, binding.cpu_addr, binding.size);
620 if (!flags[Dirty::VertexBuffer0 + index]) {
621 continue;
399 } 622 }
623 flags[Dirty::VertexBuffer0 + index] = false;
624
625 const u32 stride = maxwell3d.regs.vertex_array[index].stride;
626 const u32 offset = buffer.Offset(binding.cpu_addr);
627 runtime.BindVertexBuffer(index, buffer, offset, binding.size, stride);
628 }
629}
400 630
401 const VAddr addr_end = addr + size; 631template <class P>
402 auto it = mapped_addresses.lower_bound(addr); 632void BufferCache<P>::BindHostGraphicsUniformBuffers(size_t stage) {
403 if (it != mapped_addresses.begin()) { 633 u32 dirty = ~0U;
404 --it; 634 if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) {
635 dirty = std::exchange(dirty_uniform_buffers[stage], 0);
636 }
637 u32 binding_index = 0;
638 ForEachEnabledBit(enabled_uniform_buffers[stage], [&](u32 index) {
639 const bool needs_bind = ((dirty >> index) & 1) != 0;
640 BindHostGraphicsUniformBuffer(stage, index, binding_index, needs_bind);
641 if constexpr (NEEDS_BIND_UNIFORM_INDEX) {
642 ++binding_index;
405 } 643 }
406 while (it != mapped_addresses.end() && it->start < addr_end) { 644 });
407 if (it->Overlaps(addr, addr_end)) { 645}
408 result.push_back(&*it); 646
647template <class P>
648void BufferCache<P>::BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 binding_index,
649 bool needs_bind) {
650 const Binding& binding = uniform_buffers[stage][index];
651 const VAddr cpu_addr = binding.cpu_addr;
652 const u32 size = binding.size;
653 Buffer& buffer = slot_buffers[binding.buffer_id];
654 if constexpr (IS_OPENGL) {
655 if (size <= SKIP_CACHE_SIZE && !buffer.IsRegionGpuModified(cpu_addr, size)) {
656 if (runtime.HasFastBufferSubData()) {
657 // Fast path for Nvidia
658 if (!HasFastUniformBufferBound(stage, binding_index)) {
659 // We only have to bind when the currently bound buffer is not the fast version
660 fast_bound_uniform_buffers[stage] |= 1U << binding_index;
661 runtime.BindFastUniformBuffer(stage, binding_index, size);
662 }
663 const auto span = ImmediateBufferWithData(cpu_addr, size);
664 runtime.PushFastUniformBuffer(stage, binding_index, span);
665 } else {
666 // Stream buffer path to avoid stalling on non-Nvidia drivers
667 const auto span = runtime.BindMappedUniformBuffer(stage, binding_index, size);
668 cpu_memory.ReadBlockUnsafe(cpu_addr, span.data(), size);
409 } 669 }
410 ++it; 670 return;
411 } 671 }
412 return result;
413 } 672 }
414 673 // Classic cached path
415 /// Returns a ticks counter used for tracking when cached objects were last modified 674 SynchronizeBuffer(buffer, cpu_addr, size);
416 u64 GetModifiedTicks() { 675 if (!needs_bind && !HasFastUniformBufferBound(stage, binding_index)) {
417 return ++modified_ticks; 676 // Skip binding if it's not needed and if the bound buffer is not the fast version
677 // This exists to avoid instances where the fast buffer is bound and a GPU write happens
678 return;
418 } 679 }
680 fast_bound_uniform_buffers[stage] &= ~(1U << binding_index);
419 681
420 void FlushMap(MapInterval* map) { 682 const u32 offset = buffer.Offset(cpu_addr);
421 const auto it = blocks.find(map->start >> BLOCK_PAGE_BITS); 683 if constexpr (NEEDS_BIND_UNIFORM_INDEX) {
422 ASSERT_OR_EXECUTE(it != blocks.end(), return;); 684 runtime.BindUniformBuffer(stage, binding_index, buffer, offset, size);
423 685 } else {
424 std::shared_ptr<Buffer> block = it->second; 686 runtime.BindUniformBuffer(buffer, offset, size);
425
426 const std::size_t size = map->end - map->start;
427 staging_buffer.resize(size);
428 block->Download(block->Offset(map->start), size, staging_buffer.data());
429 cpu_memory.WriteBlockUnsafe(map->start, staging_buffer.data(), size);
430 map->MarkAsModified(false, 0);
431 } 687 }
688}
689
690template <class P>
691void BufferCache<P>::BindHostGraphicsStorageBuffers(size_t stage) {
692 u32 binding_index = 0;
693 ForEachEnabledBit(enabled_storage_buffers[stage], [&](u32 index) {
694 const Binding& binding = storage_buffers[stage][index];
695 Buffer& buffer = slot_buffers[binding.buffer_id];
696 const u32 size = binding.size;
697 SynchronizeBuffer(buffer, binding.cpu_addr, size);
698
699 const u32 offset = buffer.Offset(binding.cpu_addr);
700 const bool is_written = ((written_storage_buffers[stage] >> index) & 1) != 0;
701 if constexpr (NEEDS_BIND_STORAGE_INDEX) {
702 runtime.BindStorageBuffer(stage, binding_index, buffer, offset, size, is_written);
703 ++binding_index;
704 } else {
705 runtime.BindStorageBuffer(buffer, offset, size, is_written);
706 }
707 });
708}
432 709
433 template <typename Callable> 710template <class P>
434 BufferInfo StreamBufferUpload(std::size_t size, std::size_t alignment, Callable&& callable) { 711void BufferCache<P>::BindHostTransformFeedbackBuffers() {
435 AlignBuffer(alignment); 712 if (maxwell3d.regs.tfb_enabled == 0) {
436 const std::size_t uploaded_offset = buffer_offset; 713 return;
437 callable(buffer_ptr);
438
439 buffer_ptr += size;
440 buffer_offset += size;
441 return BufferInfo{stream_buffer.Handle(), uploaded_offset, stream_buffer.Address()};
442 } 714 }
443 715 for (u32 index = 0; index < NUM_TRANSFORM_FEEDBACK_BUFFERS; ++index) {
444 void AlignBuffer(std::size_t alignment) { 716 const Binding& binding = transform_feedback_buffers[index];
445 // Align the offset, not the mapped pointer 717 Buffer& buffer = slot_buffers[binding.buffer_id];
446 const std::size_t offset_aligned = Common::AlignUp(buffer_offset, alignment); 718 const u32 size = binding.size;
447 buffer_ptr += offset_aligned - buffer_offset; 719 SynchronizeBuffer(buffer, binding.cpu_addr, size);
448 buffer_offset = offset_aligned; 720
721 const u32 offset = buffer.Offset(binding.cpu_addr);
722 runtime.BindTransformFeedbackBuffer(index, buffer, offset, size);
449 } 723 }
724}
450 725
451 std::shared_ptr<Buffer> EnlargeBlock(std::shared_ptr<Buffer> buffer) { 726template <class P>
452 const std::size_t old_size = buffer->Size(); 727void BufferCache<P>::BindHostComputeUniformBuffers() {
453 const std::size_t new_size = old_size + BLOCK_PAGE_SIZE; 728 if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) {
454 const VAddr cpu_addr = buffer->CpuAddr(); 729 // Mark all uniform buffers as dirty
455 std::shared_ptr<Buffer> new_buffer = CreateBlock(cpu_addr, new_size); 730 dirty_uniform_buffers.fill(~u32{0});
456 new_buffer->CopyFrom(*buffer, 0, 0, old_size); 731 }
457 QueueDestruction(std::move(buffer)); 732 u32 binding_index = 0;
458 733 ForEachEnabledBit(enabled_compute_uniform_buffers, [&](u32 index) {
459 const VAddr cpu_addr_end = cpu_addr + new_size - 1; 734 const Binding& binding = compute_uniform_buffers[index];
460 const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS; 735 Buffer& buffer = slot_buffers[binding.buffer_id];
461 for (u64 page_start = cpu_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) { 736 const u32 size = binding.size;
462 blocks.insert_or_assign(page_start, new_buffer); 737 SynchronizeBuffer(buffer, binding.cpu_addr, size);
738
739 const u32 offset = buffer.Offset(binding.cpu_addr);
740 if constexpr (NEEDS_BIND_UNIFORM_INDEX) {
741 runtime.BindComputeUniformBuffer(binding_index, buffer, offset, size);
742 ++binding_index;
743 } else {
744 runtime.BindUniformBuffer(buffer, offset, size);
463 } 745 }
746 });
747}
748
749template <class P>
750void BufferCache<P>::BindHostComputeStorageBuffers() {
751 u32 binding_index = 0;
752 ForEachEnabledBit(enabled_compute_storage_buffers, [&](u32 index) {
753 const Binding& binding = compute_storage_buffers[index];
754 Buffer& buffer = slot_buffers[binding.buffer_id];
755 const u32 size = binding.size;
756 SynchronizeBuffer(buffer, binding.cpu_addr, size);
757
758 const u32 offset = buffer.Offset(binding.cpu_addr);
759 const bool is_written = ((written_compute_storage_buffers >> index) & 1) != 0;
760 if constexpr (NEEDS_BIND_STORAGE_INDEX) {
761 runtime.BindComputeStorageBuffer(binding_index, buffer, offset, size, is_written);
762 ++binding_index;
763 } else {
764 runtime.BindStorageBuffer(buffer, offset, size, is_written);
765 }
766 });
767}
464 768
465 return new_buffer; 769template <class P>
770void BufferCache<P>::DoUpdateGraphicsBuffers(bool is_indexed) {
771 if (is_indexed) {
772 UpdateIndexBuffer();
466 } 773 }
774 UpdateVertexBuffers();
775 UpdateTransformFeedbackBuffers();
776 for (size_t stage = 0; stage < NUM_STAGES; ++stage) {
777 UpdateUniformBuffers(stage);
778 UpdateStorageBuffers(stage);
779 }
780}
781
782template <class P>
783void BufferCache<P>::DoUpdateComputeBuffers() {
784 UpdateComputeUniformBuffers();
785 UpdateComputeStorageBuffers();
786}
787
788template <class P>
789void BufferCache<P>::UpdateIndexBuffer() {
790 // We have to check for the dirty flags and index count
791 // The index count is currently changed without updating the dirty flags
792 const auto& index_array = maxwell3d.regs.index_array;
793 auto& flags = maxwell3d.dirty.flags;
794 if (!flags[Dirty::IndexBuffer] && last_index_count == index_array.count) {
795 return;
796 }
797 flags[Dirty::IndexBuffer] = false;
798 last_index_count = index_array.count;
799
800 const GPUVAddr gpu_addr_begin = index_array.StartAddress();
801 const GPUVAddr gpu_addr_end = index_array.EndAddress();
802 const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr_begin);
803 const u32 address_size = static_cast<u32>(gpu_addr_end - gpu_addr_begin);
804 const u32 draw_size = index_array.count * index_array.FormatSizeInBytes();
805 const u32 size = std::min(address_size, draw_size);
806 if (size == 0 || !cpu_addr) {
807 index_buffer = NULL_BINDING;
808 return;
809 }
810 index_buffer = Binding{
811 .cpu_addr = *cpu_addr,
812 .size = size,
813 .buffer_id = FindBuffer(*cpu_addr, size),
814 };
815}
467 816
468 std::shared_ptr<Buffer> MergeBlocks(std::shared_ptr<Buffer> first, 817template <class P>
469 std::shared_ptr<Buffer> second) { 818void BufferCache<P>::UpdateVertexBuffers() {
470 const std::size_t size_1 = first->Size(); 819 auto& flags = maxwell3d.dirty.flags;
471 const std::size_t size_2 = second->Size(); 820 if (!maxwell3d.dirty.flags[Dirty::VertexBuffers]) {
472 const VAddr first_addr = first->CpuAddr(); 821 return;
473 const VAddr second_addr = second->CpuAddr(); 822 }
474 const VAddr new_addr = std::min(first_addr, second_addr); 823 flags[Dirty::VertexBuffers] = false;
475 const std::size_t new_size = size_1 + size_2;
476
477 std::shared_ptr<Buffer> new_buffer = CreateBlock(new_addr, new_size);
478 new_buffer->CopyFrom(*first, 0, new_buffer->Offset(first_addr), size_1);
479 new_buffer->CopyFrom(*second, 0, new_buffer->Offset(second_addr), size_2);
480 QueueDestruction(std::move(first));
481 QueueDestruction(std::move(second));
482 824
483 const VAddr cpu_addr_end = new_addr + new_size - 1; 825 for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) {
484 const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS; 826 UpdateVertexBuffer(index);
485 for (u64 page_start = new_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) {
486 blocks.insert_or_assign(page_start, new_buffer);
487 }
488 return new_buffer;
489 } 827 }
828}
490 829
491 Buffer* GetBlock(VAddr cpu_addr, std::size_t size) { 830template <class P>
492 std::shared_ptr<Buffer> found; 831void BufferCache<P>::UpdateVertexBuffer(u32 index) {
832 if (!maxwell3d.dirty.flags[Dirty::VertexBuffer0 + index]) {
833 return;
834 }
835 const auto& array = maxwell3d.regs.vertex_array[index];
836 const auto& limit = maxwell3d.regs.vertex_array_limit[index];
837 const GPUVAddr gpu_addr_begin = array.StartAddress();
838 const GPUVAddr gpu_addr_end = limit.LimitAddress() + 1;
839 const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr_begin);
840 const u32 address_size = static_cast<u32>(gpu_addr_end - gpu_addr_begin);
841 const u32 size = address_size; // TODO: Analyze stride and number of vertices
842 if (array.enable == 0 || size == 0 || !cpu_addr) {
843 vertex_buffers[index] = NULL_BINDING;
844 return;
845 }
846 vertex_buffers[index] = Binding{
847 .cpu_addr = *cpu_addr,
848 .size = size,
849 .buffer_id = FindBuffer(*cpu_addr, size),
850 };
851}
852
853template <class P>
854void BufferCache<P>::UpdateUniformBuffers(size_t stage) {
855 ForEachEnabledBit(enabled_uniform_buffers[stage], [&](u32 index) {
856 Binding& binding = uniform_buffers[stage][index];
857 if (binding.buffer_id) {
858 // Already updated
859 return;
860 }
861 // Mark as dirty
862 if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) {
863 dirty_uniform_buffers[stage] |= 1U << index;
864 }
865 // Resolve buffer
866 binding.buffer_id = FindBuffer(binding.cpu_addr, binding.size);
867 });
868}
869
870template <class P>
871void BufferCache<P>::UpdateStorageBuffers(size_t stage) {
872 const u32 written_mask = written_storage_buffers[stage];
873 ForEachEnabledBit(enabled_storage_buffers[stage], [&](u32 index) {
874 // Resolve buffer
875 Binding& binding = storage_buffers[stage][index];
876 const BufferId buffer_id = FindBuffer(binding.cpu_addr, binding.size);
877 binding.buffer_id = buffer_id;
878 // Mark buffer as written if needed
879 if (((written_mask >> index) & 1) != 0) {
880 MarkWrittenBuffer(buffer_id, binding.cpu_addr, binding.size);
881 }
882 });
883}
884
885 template <class P>
886 void BufferCache<P>::UpdateTransformFeedbackBuffers() {
887     if (maxwell3d.regs.tfb_enabled == 0) {
888         return;
889     }
890     for (u32 index = 0; index < NUM_TRANSFORM_FEEDBACK_BUFFERS; ++index) {
891         UpdateTransformFeedbackBuffer(index);
892     }
893 }
894
895 template <class P>
896 void BufferCache<P>::UpdateTransformFeedbackBuffer(u32 index) {
897     const auto& binding = maxwell3d.regs.tfb_bindings[index];
898     const GPUVAddr gpu_addr = binding.Address() + binding.buffer_offset;
899     const u32 size = binding.buffer_size;
900     const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr);
901     if (binding.buffer_enable == 0 || size == 0 || !cpu_addr) {
902         transform_feedback_buffers[index] = NULL_BINDING;
903         return;
904     }
905     const BufferId buffer_id = FindBuffer(*cpu_addr, size);
906     transform_feedback_buffers[index] = Binding{
907         .cpu_addr = *cpu_addr,
908         .size = size,
909         .buffer_id = buffer_id,
910     };
911     MarkWrittenBuffer(buffer_id, *cpu_addr, size);
912 }
913
914 template <class P>
915 void BufferCache<P>::UpdateComputeUniformBuffers() {
916     ForEachEnabledBit(enabled_compute_uniform_buffers, [&](u32 index) {
917         Binding& binding = compute_uniform_buffers[index];
918         binding = NULL_BINDING;
919         const auto& launch_desc = kepler_compute.launch_description;
920         if (((launch_desc.const_buffer_enable_mask >> index) & 1) != 0) {
921             const auto& cbuf = launch_desc.const_buffer_config[index];
922             const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(cbuf.Address());
923             if (cpu_addr) {
924                 binding.cpu_addr = *cpu_addr;
925                 binding.size = cbuf.size;
926             }
927         }
928         binding.buffer_id = FindBuffer(binding.cpu_addr, binding.size);
929     });
930 }
931
932 template <class P>
933 void BufferCache<P>::UpdateComputeStorageBuffers() {
934     ForEachEnabledBit(enabled_compute_storage_buffers, [&](u32 index) {
935         // Resolve buffer
936         Binding& binding = compute_storage_buffers[index];
937         const BufferId buffer_id = FindBuffer(binding.cpu_addr, binding.size);
938         binding.buffer_id = buffer_id;
939         // Mark as written if needed
940         if (((written_compute_storage_buffers >> index) & 1) != 0) {
941             MarkWrittenBuffer(buffer_id, binding.cpu_addr, binding.size);
942         }
943     });
944 }
945
946 template <class P>
947 void BufferCache<P>::MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 size) {
948     Buffer& buffer = slot_buffers[buffer_id];
949     buffer.MarkRegionAsGpuModified(cpu_addr, size);
950
951     const bool is_accuracy_high = Settings::IsGPULevelHigh();
952     const bool is_async = Settings::values.use_asynchronous_gpu_emulation.GetValue();
953     if (!is_accuracy_high || !is_async) {
954         return;
955     }
956     if (std::ranges::find(uncommitted_downloads, buffer_id) != uncommitted_downloads.end()) {
957         // Already inserted
958         return;
959     }
960     uncommitted_downloads.push_back(buffer_id);
961 }
962
963 template <class P>
964 BufferId BufferCache<P>::FindBuffer(VAddr cpu_addr, u32 size) {
965     if (cpu_addr == 0) {
966         return NULL_BUFFER_ID;
967     }
968     const u64 page = cpu_addr >> PAGE_BITS;
969     const BufferId buffer_id = page_table[page];
970     if (!buffer_id) {
971         return CreateBuffer(cpu_addr, size);
972     }
973     const Buffer& buffer = slot_buffers[buffer_id];
974     if (buffer.IsInBounds(cpu_addr, size)) {
975         return buffer_id;
976     }
977     return CreateBuffer(cpu_addr, size);
978 }
979
980 template <class P>
981 BufferId BufferCache<P>::CreateBuffer(VAddr cpu_addr, u32 wanted_size) {
982     std::vector<BufferId> overlap_ids;
983     VAddr cpu_addr_begin = cpu_addr;
984     VAddr cpu_addr_end = cpu_addr + wanted_size;
985     for (; cpu_addr >> PAGE_BITS < Common::DivCeil(cpu_addr_end, PAGE_SIZE);
986          cpu_addr += PAGE_SIZE) {
987         const BufferId overlap_id = page_table[cpu_addr >> PAGE_BITS];
988         if (!overlap_id) {
989             continue;
990         }
991         Buffer& overlap = slot_buffers[overlap_id];
992         if (overlap.IsPicked()) {
993             continue;
994         }
995         overlap.Pick();
996         overlap_ids.push_back(overlap_id);
997         const VAddr overlap_cpu_addr = overlap.CpuAddr();
998         if (overlap_cpu_addr < cpu_addr_begin) {
999             cpu_addr = cpu_addr_begin = overlap_cpu_addr;
1000         }
1001         cpu_addr_end = std::max(cpu_addr_end, overlap_cpu_addr + overlap.SizeBytes());
1002     }
1003     const u32 size = static_cast<u32>(cpu_addr_end - cpu_addr_begin);
1004     const BufferId new_buffer_id = slot_buffers.insert(runtime, rasterizer, cpu_addr_begin, size);
1005     Buffer& new_buffer = slot_buffers[new_buffer_id];
1006
1007     for (const BufferId overlap_id : overlap_ids) {
1008         Buffer& overlap = slot_buffers[overlap_id];
1009         overlap.Unpick();
1010
1011         std::vector<BufferCopy> copies;
1012         const size_t dst_base_offset = overlap.CpuAddr() - new_buffer.CpuAddr();
1013         overlap.ForEachDownloadRange([&](u64 begin, u64 range_size) {
1014             copies.push_back(BufferCopy{
1015                 .src_offset = begin,
1016                 .dst_offset = dst_base_offset + begin,
1017                 .size = range_size,
1018             });
1019             new_buffer.UnmarkRegionAsCpuModified(begin, range_size);
1020             new_buffer.MarkRegionAsGpuModified(begin, range_size);
1021         });
1022         if (!copies.empty()) {
1023             runtime.CopyBuffer(slot_buffers[new_buffer_id], overlap, copies);
1024         }
1025         ReplaceBufferDownloads(overlap_id, new_buffer_id);
1026         DeleteBuffer(overlap_id);
1027     }
1028     Register(new_buffer_id);
1029     return new_buffer_id;
1030 }
1031
1032 template <class P>
1033 void BufferCache<P>::Register(BufferId buffer_id) {
1034     ChangeRegister<true>(buffer_id);
1035 }
1036
1037 template <class P>
1038 void BufferCache<P>::Unregister(BufferId buffer_id) {
1039     ChangeRegister<false>(buffer_id);
1040 }
1041
1042 template <class P>
1043 template <bool insert>
1044 void BufferCache<P>::ChangeRegister(BufferId buffer_id) {
1045     const Buffer& buffer = slot_buffers[buffer_id];
1046     const VAddr cpu_addr_begin = buffer.CpuAddr();
1047     const VAddr cpu_addr_end = cpu_addr_begin + buffer.SizeBytes();
1048     const u64 page_begin = cpu_addr_begin / PAGE_SIZE;
1049     const u64 page_end = Common::DivCeil(cpu_addr_end, PAGE_SIZE);
1050     for (u64 page = page_begin; page != page_end; ++page) {
1051         if constexpr (insert) {
1052             page_table[page] = buffer_id;
1053         } else {
1054             page_table[page] = BufferId{};
1055         }
1056     }
1057 }
1058
1059 template <class P>
1060 void BufferCache<P>::SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size) {
1061     if (buffer.CpuAddr() == 0) {
1062         return;
1063     }
1064     SynchronizeBufferImpl(buffer, cpu_addr, size);
1065 }
1066
1067 template <class P>
1068 void BufferCache<P>::SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size) {
1069     boost::container::small_vector<BufferCopy, 4> copies;
1070     u64 total_size_bytes = 0;
1071     u64 largest_copy = 0;
1072     buffer.ForEachUploadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) {
1073         copies.push_back(BufferCopy{
1074             .src_offset = total_size_bytes,
1075             .dst_offset = range_offset,
1076             .size = range_size,
1077         });
1078         total_size_bytes += range_size;
1079         largest_copy = std::max(largest_copy, range_size);
1080     });
1081     if (total_size_bytes == 0) {
1082         return;
1083     }
1084     const std::span<BufferCopy> copies_span(copies.data(), copies.size());
1085     UploadMemory(buffer, total_size_bytes, largest_copy, copies_span);
1086 }
1087
1088 template <class P>
1089 void BufferCache<P>::UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy,
1090                                   std::span<BufferCopy> copies) {
1091     if constexpr (USE_MEMORY_MAPS) {
1092         MappedUploadMemory(buffer, total_size_bytes, copies);
1093     } else {
1094         ImmediateUploadMemory(buffer, largest_copy, copies);
1095     }
1096 }
1097
1098 template <class P>
1099 void BufferCache<P>::ImmediateUploadMemory(Buffer& buffer, u64 largest_copy,
1100                                            std::span<const BufferCopy> copies) {
1101     std::span<u8> immediate_buffer;
1102     for (const BufferCopy& copy : copies) {
1103         std::span<const u8> upload_span;
1104         const VAddr cpu_addr = buffer.CpuAddr() + copy.dst_offset;
1105         if (IsRangeGranular(cpu_addr, copy.size)) {
1106             upload_span = std::span(cpu_memory.GetPointer(cpu_addr), copy.size);
1107         } else {
1108             if (immediate_buffer.empty()) {
1109                 immediate_buffer = ImmediateBuffer(largest_copy);
1110             }
1111             cpu_memory.ReadBlockUnsafe(cpu_addr, immediate_buffer.data(), copy.size);
1112             upload_span = immediate_buffer.subspan(0, copy.size);
1113         }
1114         buffer.ImmediateUpload(copy.dst_offset, upload_span);
1115     }
1116 }
1117
1118 template <class P>
1119 void BufferCache<P>::MappedUploadMemory(Buffer& buffer, u64 total_size_bytes,
1120                                         std::span<const BufferCopy> copies) {
1121     auto upload_staging = runtime.UploadStagingBuffer(total_size_bytes);
1122     const std::span<u8> staging_pointer = upload_staging.mapped_span;
1123     for (const BufferCopy& copy : copies) {
1124         const VAddr cpu_addr = buffer.CpuAddr() + copy.dst_offset;
1125         u8* const src_pointer = staging_pointer.data() + copy.src_offset;
1126         cpu_memory.ReadBlockUnsafe(cpu_addr, src_pointer, copy.size);
1127     }
1128     runtime.CopyBuffer(buffer, upload_staging.buffer, copies);
1129 }
1130
1131 template <class P>
1132 void BufferCache<P>::DeleteBuffer(BufferId buffer_id) {
1133     const auto scalar_replace = [buffer_id](Binding& binding) {
1134         if (binding.buffer_id == buffer_id) {
1135             binding.buffer_id = BufferId{};
1136         }
1137     };
1138     const auto replace = [scalar_replace](std::span<Binding> bindings) {
1139         std::ranges::for_each(bindings, scalar_replace);
1140     };
1141     scalar_replace(index_buffer);
1142     replace(vertex_buffers);
1143     std::ranges::for_each(uniform_buffers, replace);
1144     std::ranges::for_each(storage_buffers, replace);
1145     replace(transform_feedback_buffers);
1146     replace(compute_uniform_buffers);
1147     replace(compute_storage_buffers);
1148     std::erase(cached_write_buffer_ids, buffer_id);
1149
1150     // Mark the whole buffer as CPU written to stop tracking CPU writes
1151     Buffer& buffer = slot_buffers[buffer_id];
1152     buffer.MarkRegionAsCpuModified(buffer.CpuAddr(), buffer.SizeBytes());
1153
1154     Unregister(buffer_id);
1155     delayed_destruction_ring.Push(std::move(slot_buffers[buffer_id]));
1156
1157     NotifyBufferDeletion();
1158 }
1159
1160 template <class P>
1161 void BufferCache<P>::ReplaceBufferDownloads(BufferId old_buffer_id, BufferId new_buffer_id) {
1162     const auto replace = [old_buffer_id, new_buffer_id](std::vector<BufferId>& buffers) {
1163         std::ranges::replace(buffers, old_buffer_id, new_buffer_id);
1164         if (auto it = std::ranges::find(buffers, new_buffer_id); it != buffers.end()) {
1165             buffers.erase(std::remove(it + 1, buffers.end(), new_buffer_id), buffers.end());
1166         }
1167     };
1168     replace(uncommitted_downloads);
1169     std::ranges::for_each(committed_downloads, replace);
1170 }
1171
1172 template <class P>
1173 void BufferCache<P>::NotifyBufferDeletion() {
1174     if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) {
1175         dirty_uniform_buffers.fill(~u32{0});
1176     }
1177     auto& flags = maxwell3d.dirty.flags;
1178     flags[Dirty::IndexBuffer] = true;
1179     flags[Dirty::VertexBuffers] = true;
1180     for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) {
1181         flags[Dirty::VertexBuffer0 + index] = true;
1182     }
1183     has_deleted_buffers = true;
1184 }
1185
1186 template <class P>
1187 typename BufferCache<P>::Binding BufferCache<P>::StorageBufferBinding(GPUVAddr ssbo_addr) const {
1188     const GPUVAddr gpu_addr = gpu_memory.Read<u64>(ssbo_addr);
1189     const u32 size = gpu_memory.Read<u32>(ssbo_addr + 8);
1190     const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr);
1191     if (!cpu_addr || size == 0) {
1192         return NULL_BINDING;
1193     }
1194     const Binding binding{
1195         .cpu_addr = *cpu_addr,
1196         .size = size,
1197         .buffer_id = BufferId{},
1198     };
1199     return binding;
1200 }
1201
1202 template <class P>
1203 std::span<const u8> BufferCache<P>::ImmediateBufferWithData(VAddr cpu_addr, size_t size) {
1204     u8* const base_pointer = cpu_memory.GetPointer(cpu_addr);
1205     if (IsRangeGranular(cpu_addr, size) ||
1206         base_pointer + size == cpu_memory.GetPointer(cpu_addr + size)) {
1207         return std::span(base_pointer, size);
1208     } else {
1209         const std::span<u8> span = ImmediateBuffer(size);
1210         cpu_memory.ReadBlockUnsafe(cpu_addr, span.data(), size);
1211         return span;
1212     }
1213 }
1214
1215 template <class P>
1216 std::span<u8> BufferCache<P>::ImmediateBuffer(size_t wanted_capacity) {
1217     if (wanted_capacity > immediate_buffer_capacity) {
1218         immediate_buffer_capacity = wanted_capacity;
1219         immediate_buffer_alloc = std::make_unique<u8[]>(wanted_capacity);
1220     }
1221     return std::span<u8>(immediate_buffer_alloc.get(), wanted_capacity);
1222 }
1223
1224 template <class P>
1225 bool BufferCache<P>::HasFastUniformBufferBound(size_t stage, u32 binding_index) const noexcept {
1226     if constexpr (IS_OPENGL) {
1227         return ((fast_bound_uniform_buffers[stage] >> binding_index) & 1) != 0;
1228     } else {
1229         // Only OpenGL has fast uniform buffers
1230         return false;
1231     }
1232 }
1233
1234 } // namespace VideoCommon

493
494         const VAddr cpu_addr_end = cpu_addr + size - 1;
495         const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS;
496         for (u64 page_start = cpu_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) {
497             auto it = blocks.find(page_start);
498             if (it == blocks.end()) {
499                 if (found) {
500                     found = EnlargeBlock(found);
501                     continue;
502                 }
503                 const VAddr start_addr = page_start << BLOCK_PAGE_BITS;
504                 found = CreateBlock(start_addr, BLOCK_PAGE_SIZE);
505                 blocks.insert_or_assign(page_start, found);
506                 continue;
507             }
508             if (!found) {
509                 found = it->second;
510                 continue;
511             }
512             if (found != it->second) {
513                 found = MergeBlocks(std::move(found), it->second);
514             }
515         }
516         return found.get();
517     }
518
519     void MarkRegionAsWritten(VAddr start, VAddr end) {
520         const u64 page_end = end >> WRITE_PAGE_BIT;
521         for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) {
522             if (const auto [it, inserted] = written_pages.emplace(page_start, 1); !inserted) {
523                 ++it->second;
524             }
525         }
526     }
527
528     void UnmarkRegionAsWritten(VAddr start, VAddr end) {
529         const u64 page_end = end >> WRITE_PAGE_BIT;
530         for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) {
531             auto it = written_pages.find(page_start);
532             if (it != written_pages.end()) {
533                 if (it->second > 1) {
534                     --it->second;
535                 } else {
536                     written_pages.erase(it);
537                 }
538             }
539         }
540     }
541
542     bool IsRegionWritten(VAddr start, VAddr end) const {
543         const u64 page_end = end >> WRITE_PAGE_BIT;
544         for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) {
545             if (written_pages.contains(page_start)) {
546                 return true;
547             }
548         }
549         return false;
550     }
551
552     void QueueDestruction(std::shared_ptr<Buffer> buffer) {
553         buffer->SetEpoch(epoch);
554         pending_destruction.push(std::move(buffer));
555     }
556
557     void MarkForAsyncFlush(MapInterval* map) {
558         if (!uncommitted_flushes) {
559             uncommitted_flushes = std::make_shared<std::unordered_set<MapInterval*>>();
560         }
561         uncommitted_flushes->insert(map);
562     }
563
564     VideoCore::RasterizerInterface& rasterizer;
565     Tegra::MemoryManager& gpu_memory;
566     Core::Memory::Memory& cpu_memory;
567     StreamBuffer& stream_buffer;
568
569     u8* buffer_ptr = nullptr;
570     u64 buffer_offset = 0;
571     u64 buffer_offset_base = 0;
572
573     MapIntervalAllocator mapped_addresses_allocator;
574     boost::intrusive::set<MapInterval, boost::intrusive::compare<MapIntervalCompare>>
575         mapped_addresses;
576
577     std::unordered_map<u64, u32> written_pages;
578     std::unordered_map<u64, std::shared_ptr<Buffer>> blocks;
579
580     std::queue<std::shared_ptr<Buffer>> pending_destruction;
581     u64 epoch = 0;
582     u64 modified_ticks = 0;
583
584     std::vector<u8> staging_buffer;
585
586     std::list<MapInterval*> marked_for_unregister;
587
588     std::shared_ptr<std::unordered_set<MapInterval*>> uncommitted_flushes;
589     std::list<std::shared_ptr<std::list<MapInterval*>>> committed_flushes;
590
591     std::recursive_mutex mutex;
592 };
593
594 } // namespace VideoCommon
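FindBuffer, CreateBuffer, and ChangeRegister in the new buffer_cache.h above all revolve around a flat page table: the begin page rounds down and the end page rounds up, so a buffer that ends mid-page still claims that page. A self-contained sketch of that arithmetic, assuming 64 KiB pages (the actual PAGE_BITS/PAGE_SIZE constants come from buffer_cache.h and are not shown here):

#include <cstdint>

// Round-up division in the spirit of Common::DivCeil used above.
constexpr std::uint64_t DivCeilSketch(std::uint64_t value, std::uint64_t divisor) {
    return (value + divisor - 1) / divisor;
}

constexpr std::uint64_t PAGE_SIZE_SKETCH = 1ULL << 16; // assumption: 64 KiB pages

struct PageRange {
    std::uint64_t begin; // first page covered
    std::uint64_t end;   // one past the last page covered
};

constexpr PageRange PagesOf(std::uint64_t cpu_addr, std::uint64_t size_bytes) {
    return {cpu_addr / PAGE_SIZE_SKETCH, DivCeilSketch(cpu_addr + size_bytes, PAGE_SIZE_SKETCH)};
}

// A one-byte buffer at the start of page 1 registers exactly one page.
static_assert(PagesOf(0x10000, 1).begin == 1 && PagesOf(0x10000, 1).end == 2);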
diff --git a/src/video_core/buffer_cache/map_interval.cpp b/src/video_core/buffer_cache/map_interval.cpp
deleted file mode 100644
index 62587e18a..000000000
--- a/src/video_core/buffer_cache/map_interval.cpp
+++ /dev/null
@@ -1,33 +0,0 @@
1// Copyright 2020 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#include <algorithm>
6#include <array>
7#include <cstddef>
8#include <memory>
9
10#include "video_core/buffer_cache/map_interval.h"
11
12namespace VideoCommon {
13
14MapIntervalAllocator::MapIntervalAllocator() {
15 FillFreeList(first_chunk);
16}
17
18MapIntervalAllocator::~MapIntervalAllocator() = default;
19
20void MapIntervalAllocator::AllocateNewChunk() {
21 *new_chunk = std::make_unique<Chunk>();
22 FillFreeList(**new_chunk);
23 new_chunk = &(*new_chunk)->next;
24}
25
26void MapIntervalAllocator::FillFreeList(Chunk& chunk) {
27 const std::size_t old_size = free_list.size();
28 free_list.resize(old_size + chunk.data.size());
29 std::transform(chunk.data.rbegin(), chunk.data.rend(), free_list.begin() + old_size,
30 [](MapInterval& interval) { return &interval; });
31}
32
33} // namespace VideoCommon
diff --git a/src/video_core/buffer_cache/map_interval.h b/src/video_core/buffer_cache/map_interval.h
deleted file mode 100644
index ef974b08a..000000000
--- a/src/video_core/buffer_cache/map_interval.h
+++ /dev/null
@@ -1,93 +0,0 @@
1// Copyright 2019 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#pragma once
6
7#include <array>
8#include <cstddef>
9#include <memory>
10#include <vector>
11
12#include <boost/intrusive/set_hook.hpp>
13
14#include "common/common_types.h"
15#include "video_core/gpu.h"
16
17namespace VideoCommon {
18
19struct MapInterval : public boost::intrusive::set_base_hook<boost::intrusive::optimize_size<true>> {
20 MapInterval() = default;
21
22 /*implicit*/ MapInterval(VAddr start_) noexcept : start{start_} {}
23
24 explicit MapInterval(VAddr start_, VAddr end_, GPUVAddr gpu_addr_) noexcept
25 : start{start_}, end{end_}, gpu_addr{gpu_addr_} {}
26
27 bool IsInside(VAddr other_start, VAddr other_end) const noexcept {
28 return start <= other_start && other_end <= end;
29 }
30
31 bool Overlaps(VAddr other_start, VAddr other_end) const noexcept {
32 return start < other_end && other_start < end;
33 }
34
35 void MarkAsModified(bool is_modified_, u64 ticks_) noexcept {
36 is_modified = is_modified_;
37 ticks = ticks_;
38 }
39
40 boost::intrusive::set_member_hook<> member_hook_;
41 VAddr start = 0;
42 VAddr end = 0;
43 GPUVAddr gpu_addr = 0;
44 u64 ticks = 0;
45 bool is_written = false;
46 bool is_modified = false;
47 bool is_registered = false;
48 bool is_memory_marked = false;
49 bool is_sync_pending = false;
50};
51
52struct MapIntervalCompare {
53 constexpr bool operator()(const MapInterval& lhs, const MapInterval& rhs) const noexcept {
54 return lhs.start < rhs.start;
55 }
56};
57
58class MapIntervalAllocator {
59public:
60 MapIntervalAllocator();
61 ~MapIntervalAllocator();
62
63 MapInterval* Allocate() {
64 if (free_list.empty()) {
65 AllocateNewChunk();
66 }
67 MapInterval* const interval = free_list.back();
68 free_list.pop_back();
69 return interval;
70 }
71
72 void Release(MapInterval* interval) {
73 free_list.push_back(interval);
74 }
75
76private:
77 struct Chunk {
78 std::unique_ptr<Chunk> next;
79 std::array<MapInterval, 0x8000> data;
80 };
81
82 void AllocateNewChunk();
83
84 void FillFreeList(Chunk& chunk);
85
86 std::vector<MapInterval*> free_list;
87
88 Chunk first_chunk;
89
90 std::unique_ptr<Chunk>* new_chunk = &first_chunk.next;
91};
92
93} // namespace VideoCommon
diff --git a/src/video_core/command_classes/vic.cpp b/src/video_core/command_classes/vic.cpp
index 55e632346..2b7569335 100644
--- a/src/video_core/command_classes/vic.cpp
+++ b/src/video_core/command_classes/vic.cpp
@@ -110,12 +110,10 @@ void Vic::Execute() {
110 converted_frame_buffer.get(), block_height, 0, 0); 110 converted_frame_buffer.get(), block_height, 0, 0);
111 111
112 gpu.MemoryManager().WriteBlock(output_surface_luma_address, swizzled_data.data(), size); 112 gpu.MemoryManager().WriteBlock(output_surface_luma_address, swizzled_data.data(), size);
113 gpu.Maxwell3D().OnMemoryWrite();
114 } else { 113 } else {
115 // send pitch linear frame 114 // send pitch linear frame
116 gpu.MemoryManager().WriteBlock(output_surface_luma_address, converted_frame_buf_addr, 115 gpu.MemoryManager().WriteBlock(output_surface_luma_address, converted_frame_buf_addr,
117 linear_size); 116 linear_size);
118 gpu.Maxwell3D().OnMemoryWrite();
119 } 117 }
120 break; 118 break;
121 } 119 }
@@ -163,7 +161,6 @@ void Vic::Execute() {
163 } 161 }
164 gpu.MemoryManager().WriteBlock(output_surface_chroma_u_address, chroma_buffer.data(), 162 gpu.MemoryManager().WriteBlock(output_surface_chroma_u_address, chroma_buffer.data(),
165 chroma_buffer.size()); 163 chroma_buffer.size());
166 gpu.Maxwell3D().OnMemoryWrite();
167 break; 164 break;
168 } 165 }
169 default: 166 default:
diff --git a/src/video_core/dirty_flags.cpp b/src/video_core/dirty_flags.cpp
index b1eaac00c..7149af290 100644
--- a/src/video_core/dirty_flags.cpp
+++ b/src/video_core/dirty_flags.cpp
@@ -12,13 +12,30 @@
12#define NUM(field_name) (sizeof(::Tegra::Engines::Maxwell3D::Regs::field_name) / (sizeof(u32))) 12#define NUM(field_name) (sizeof(::Tegra::Engines::Maxwell3D::Regs::field_name) / (sizeof(u32)))
13 13
14namespace VideoCommon::Dirty { 14namespace VideoCommon::Dirty {
15 15namespace {
16using Tegra::Engines::Maxwell3D; 16using Tegra::Engines::Maxwell3D;
17 17
18void SetupDirtyRenderTargets(Tegra::Engines::Maxwell3D::DirtyState::Tables& tables) { 18void SetupDirtyVertexBuffers(Maxwell3D::DirtyState::Tables& tables) {
19 static constexpr std::size_t num_array = 3;
20 for (std::size_t i = 0; i < Maxwell3D::Regs::NumVertexArrays; ++i) {
21 const std::size_t array_offset = OFF(vertex_array) + i * NUM(vertex_array[0]);
22 const std::size_t limit_offset = OFF(vertex_array_limit) + i * NUM(vertex_array_limit[0]);
23
24 FillBlock(tables, array_offset, num_array, VertexBuffer0 + i, VertexBuffers);
25 FillBlock(tables, limit_offset, NUM(vertex_array_limit), VertexBuffer0 + i, VertexBuffers);
26 }
27}
28
29void SetupIndexBuffer(Maxwell3D::DirtyState::Tables& tables) {
30 FillBlock(tables[0], OFF(index_array), NUM(index_array), IndexBuffer);
31}
32
33void SetupDirtyDescriptors(Maxwell3D::DirtyState::Tables& tables) {
19 FillBlock(tables[0], OFF(tic), NUM(tic), Descriptors); 34 FillBlock(tables[0], OFF(tic), NUM(tic), Descriptors);
20 FillBlock(tables[0], OFF(tsc), NUM(tsc), Descriptors); 35 FillBlock(tables[0], OFF(tsc), NUM(tsc), Descriptors);
36}
21 37
38void SetupDirtyRenderTargets(Maxwell3D::DirtyState::Tables& tables) {
22 static constexpr std::size_t num_per_rt = NUM(rt[0]); 39 static constexpr std::size_t num_per_rt = NUM(rt[0]);
23 static constexpr std::size_t begin = OFF(rt); 40 static constexpr std::size_t begin = OFF(rt);
24 static constexpr std::size_t num = num_per_rt * Maxwell3D::Regs::NumRenderTargets; 41 static constexpr std::size_t num = num_per_rt * Maxwell3D::Regs::NumRenderTargets;
@@ -41,5 +58,13 @@ void SetupDirtyRenderTargets(Tegra::Engines::Maxwell3D::DirtyState::Tables& tabl
41 FillBlock(table, OFF(zeta), NUM(zeta), flag); 58 FillBlock(table, OFF(zeta), NUM(zeta), flag);
42 } 59 }
43} 60}
61} // Anonymous namespace
62
63void SetupDirtyFlags(Maxwell3D::DirtyState::Tables& tables) {
64 SetupDirtyVertexBuffers(tables);
65 SetupIndexBuffer(tables);
66 SetupDirtyDescriptors(tables);
67 SetupDirtyRenderTargets(tables);
68}
44 69
45} // namespace VideoCommon::Dirty 70} // namespace VideoCommon::Dirty
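The new setup functions populate per-register dirty tables: every register index inside a vertex array or index buffer block is mapped to the flag that must be raised when that register is written. A minimal sketch of what a FillBlock-style fill amounts to (the table size and flag type here are assumptions, not the engine's real definitions):

#include <array>
#include <cstddef>
#include <cstdint>

using DirtyTable = std::array<std::uint8_t, 0x300>; // size is an assumption

// Map registers [begin, begin + num) to `flag`, so a later write to any of
// them marks that flag dirty.
void FillBlockSketch(DirtyTable& table, std::size_t begin, std::size_t num, std::uint8_t flag) {
    for (std::size_t i = begin; i < begin + num; ++i) {
        table[i] = flag;
    }
}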
diff --git a/src/video_core/dirty_flags.h b/src/video_core/dirty_flags.h
index 875527ddd..702688ace 100644
--- a/src/video_core/dirty_flags.h
+++ b/src/video_core/dirty_flags.h
@@ -30,6 +30,12 @@ enum : u8 {
30 ColorBuffer7, 30 ColorBuffer7,
31 ZetaBuffer, 31 ZetaBuffer,
32 32
33 VertexBuffers,
34 VertexBuffer0,
35 VertexBuffer31 = VertexBuffer0 + 31,
36
37 IndexBuffer,
38
33 LastCommonEntry, 39 LastCommonEntry,
34}; 40};
35 41
@@ -47,6 +53,6 @@ void FillBlock(Tegra::Engines::Maxwell3D::DirtyState::Tables& tables, std::size_
47 FillBlock(tables[1], begin, num, index_b); 53 FillBlock(tables[1], begin, num, index_b);
48} 54}
49 55
50void SetupDirtyRenderTargets(Tegra::Engines::Maxwell3D::DirtyState::Tables& tables); 56void SetupDirtyFlags(Tegra::Engines::Maxwell3D::DirtyState::Tables& tables);
51 57
52} // namespace VideoCommon::Dirty 58} // namespace VideoCommon::Dirty
diff --git a/src/video_core/dma_pusher.cpp b/src/video_core/dma_pusher.cpp
index 2c8b20024..8b33c04ab 100644
--- a/src/video_core/dma_pusher.cpp
+++ b/src/video_core/dma_pusher.cpp
@@ -23,8 +23,6 @@ void DmaPusher::DispatchCalls() {
23 MICROPROFILE_SCOPE(DispatchCalls); 23 MICROPROFILE_SCOPE(DispatchCalls);
24 24
25 gpu.SyncGuestHost(); 25 gpu.SyncGuestHost();
26 // On entering GPU code, assume all memory may be touched by the ARM core.
27 gpu.Maxwell3D().OnMemoryWrite();
28 26
29 dma_pushbuffer_subindex = 0; 27 dma_pushbuffer_subindex = 0;
30 28
diff --git a/src/video_core/engines/kepler_compute.cpp b/src/video_core/engines/kepler_compute.cpp
index ed29fc7ac..a9b75091e 100644
--- a/src/video_core/engines/kepler_compute.cpp
+++ b/src/video_core/engines/kepler_compute.cpp
@@ -39,7 +39,6 @@ void KeplerCompute::CallMethod(u32 method, u32 method_argument, bool is_last_cal
39 case KEPLER_COMPUTE_REG_INDEX(data_upload): { 39 case KEPLER_COMPUTE_REG_INDEX(data_upload): {
40 upload_state.ProcessData(method_argument, is_last_call); 40 upload_state.ProcessData(method_argument, is_last_call);
41 if (is_last_call) { 41 if (is_last_call) {
42 system.GPU().Maxwell3D().OnMemoryWrite();
43 } 42 }
44 break; 43 break;
45 } 44 }
diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp
index 9911140e9..560551157 100644
--- a/src/video_core/engines/kepler_memory.cpp
+++ b/src/video_core/engines/kepler_memory.cpp
@@ -33,7 +33,6 @@ void KeplerMemory::CallMethod(u32 method, u32 method_argument, bool is_last_call
33 case KEPLERMEMORY_REG_INDEX(data): { 33 case KEPLERMEMORY_REG_INDEX(data): {
34 upload_state.ProcessData(method_argument, is_last_call); 34 upload_state.ProcessData(method_argument, is_last_call);
35 if (is_last_call) { 35 if (is_last_call) {
36 system.GPU().Maxwell3D().OnMemoryWrite();
37 } 36 }
38 break; 37 break;
39 } 38 }
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index d6ba9da5c..75517a4f7 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -223,7 +223,6 @@ void Maxwell3D::ProcessMethodCall(u32 method, u32 argument, u32 nonshadow_argume
223 case MAXWELL3D_REG_INDEX(data_upload): 223 case MAXWELL3D_REG_INDEX(data_upload):
224 upload_state.ProcessData(argument, is_last_call); 224 upload_state.ProcessData(argument, is_last_call);
225 if (is_last_call) { 225 if (is_last_call) {
226 OnMemoryWrite();
227 } 226 }
228 return; 227 return;
229 case MAXWELL3D_REG_INDEX(fragment_barrier): 228 case MAXWELL3D_REG_INDEX(fragment_barrier):
@@ -570,17 +569,18 @@ std::optional<u64> Maxwell3D::GetQueryResult() {
570 } 569 }
571} 570}
572 571
573void Maxwell3D::ProcessCBBind(std::size_t stage_index) { 572void Maxwell3D::ProcessCBBind(size_t stage_index) {
574 // Bind the buffer currently in CB_ADDRESS to the specified index in the desired shader stage. 573 // Bind the buffer currently in CB_ADDRESS to the specified index in the desired shader stage.
575 auto& shader = state.shader_stages[stage_index]; 574 const auto& bind_data = regs.cb_bind[stage_index];
576 auto& bind_data = regs.cb_bind[stage_index]; 575 auto& buffer = state.shader_stages[stage_index].const_buffers[bind_data.index];
577
578 ASSERT(bind_data.index < Regs::MaxConstBuffers);
579 auto& buffer = shader.const_buffers[bind_data.index];
580
581 buffer.enabled = bind_data.valid.Value() != 0; 576 buffer.enabled = bind_data.valid.Value() != 0;
582 buffer.address = regs.const_buffer.BufferAddress(); 577 buffer.address = regs.const_buffer.BufferAddress();
583 buffer.size = regs.const_buffer.cb_size; 578 buffer.size = regs.const_buffer.cb_size;
579
580 const bool is_enabled = bind_data.valid.Value() != 0;
581 const GPUVAddr gpu_addr = is_enabled ? regs.const_buffer.BufferAddress() : 0;
582 const u32 size = is_enabled ? regs.const_buffer.cb_size : 0;
583 rasterizer->BindGraphicsUniformBuffer(stage_index, bind_data.index, gpu_addr, size);
584} 584}
585 585
586void Maxwell3D::ProcessCBData(u32 value) { 586void Maxwell3D::ProcessCBData(u32 value) {
@@ -635,7 +635,6 @@ void Maxwell3D::FinishCBData() {
635 635
636 const u32 id = cb_data_state.id; 636 const u32 id = cb_data_state.id;
637 memory_manager.WriteBlock(address, cb_data_state.buffer[id].data(), size); 637 memory_manager.WriteBlock(address, cb_data_state.buffer[id].data(), size);
638 OnMemoryWrite();
639 638
640 cb_data_state.id = null_cb_data; 639 cb_data_state.id = null_cb_data;
641 cb_data_state.current = null_cb_data; 640 cb_data_state.current = null_cb_data;
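ProcessCBBind now forwards every constant buffer bind to the rasterizer, passing a zero address and size when the slot is disabled. A hypothetical receiver is sketched below; the real implementations live in the rasterizer changes elsewhere in this commit, and all names and array sizes here are illustrative only:

#include <cstddef>
#include <cstdint>

using GPUVAddr = std::uint64_t;
using u32 = std::uint32_t;

struct UniformBinding {
    GPUVAddr gpu_addr = 0;
    u32 size = 0;
};

// Hypothetical rasterizer stub: 5 graphics stages x 18 slots is an assumption.
struct RasterizerSketch {
    UniformBinding bindings[5][18];

    void BindGraphicsUniformBuffer(std::size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) {
        // A disabled constant buffer arrives as (gpu_addr == 0, size == 0).
        bindings[stage][index] = UniformBinding{gpu_addr, size};
    }
};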
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index cc94d2678..ffed42a29 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -1314,8 +1314,7 @@ public:
1314 1314
1315 GPUVAddr LimitAddress() const { 1315 GPUVAddr LimitAddress() const {
1316 return static_cast<GPUVAddr>((static_cast<GPUVAddr>(limit_high) << 32) | 1316 return static_cast<GPUVAddr>((static_cast<GPUVAddr>(limit_high) << 32) |
1317 limit_low) + 1317 limit_low);
1318 1;
1319 } 1318 }
1320 } vertex_array_limit[NumVertexArrays]; 1319 } vertex_array_limit[NumVertexArrays];
1321 1320
@@ -1403,6 +1402,7 @@ public:
1403 }; 1402 };
1404 1403
1405 std::array<ShaderStageInfo, Regs::MaxShaderStage> shader_stages; 1404 std::array<ShaderStageInfo, Regs::MaxShaderStage> shader_stages;
1405
1406 u32 current_instance = 0; ///< Current instance to be used to simulate instanced rendering. 1406 u32 current_instance = 0; ///< Current instance to be used to simulate instanced rendering.
1407 }; 1407 };
1408 1408
@@ -1452,11 +1452,6 @@ public:
1452 return *rasterizer; 1452 return *rasterizer;
1453 } 1453 }
1454 1454
1455 /// Notify a memory write has happened.
1456 void OnMemoryWrite() {
1457 dirty.flags |= dirty.on_write_stores;
1458 }
1459
1460 enum class MMEDrawMode : u32 { 1455 enum class MMEDrawMode : u32 {
1461 Undefined, 1456 Undefined,
1462 Array, 1457 Array,
@@ -1478,7 +1473,6 @@ public:
1478 using Tables = std::array<Table, 2>; 1473 using Tables = std::array<Table, 2>;
1479 1474
1480 Flags flags; 1475 Flags flags;
1481 Flags on_write_stores;
1482 Tables tables{}; 1476 Tables tables{};
1483 } dirty; 1477 } dirty;
1484 1478
@@ -1541,7 +1535,7 @@ private:
1541 void FinishCBData(); 1535 void FinishCBData();
1542 1536
1543 /// Handles a write to the CB_BIND register. 1537 /// Handles a write to the CB_BIND register.
1544 void ProcessCBBind(std::size_t stage_index); 1538 void ProcessCBBind(size_t stage_index);
1545 1539
1546 /// Handles a write to the VERTEX_END_GL register, triggering a draw. 1540 /// Handles a write to the VERTEX_END_GL register, triggering a draw.
1547 void DrawArrays(); 1541 void DrawArrays();
diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp
index ba750748c..a2f19559f 100644
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -60,9 +60,6 @@ void MaxwellDMA::Launch() {
60 return; 60 return;
61 } 61 }
62 62
63 // All copies here update the main memory, so mark all rasterizer states as invalid.
64 system.GPU().Maxwell3D().OnMemoryWrite();
65
66 if (is_src_pitch && is_dst_pitch) { 63 if (is_src_pitch && is_dst_pitch) {
67 CopyPitchToPitch(); 64 CopyPitchToPitch();
68 } else { 65 } else {
diff --git a/src/video_core/fence_manager.h b/src/video_core/fence_manager.h
index 3512283ff..f055b61e9 100644
--- a/src/video_core/fence_manager.h
+++ b/src/video_core/fence_manager.h
@@ -143,22 +143,26 @@ private:
143 } 143 }
144 144
145 bool ShouldWait() const { 145 bool ShouldWait() const {
146 std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex};
146 return texture_cache.ShouldWaitAsyncFlushes() || buffer_cache.ShouldWaitAsyncFlushes() || 147 return texture_cache.ShouldWaitAsyncFlushes() || buffer_cache.ShouldWaitAsyncFlushes() ||
147 query_cache.ShouldWaitAsyncFlushes(); 148 query_cache.ShouldWaitAsyncFlushes();
148 } 149 }
149 150
150 bool ShouldFlush() const { 151 bool ShouldFlush() const {
152 std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex};
151 return texture_cache.HasUncommittedFlushes() || buffer_cache.HasUncommittedFlushes() || 153 return texture_cache.HasUncommittedFlushes() || buffer_cache.HasUncommittedFlushes() ||
152 query_cache.HasUncommittedFlushes(); 154 query_cache.HasUncommittedFlushes();
153 } 155 }
154 156
155 void PopAsyncFlushes() { 157 void PopAsyncFlushes() {
158 std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex};
156 texture_cache.PopAsyncFlushes(); 159 texture_cache.PopAsyncFlushes();
157 buffer_cache.PopAsyncFlushes(); 160 buffer_cache.PopAsyncFlushes();
158 query_cache.PopAsyncFlushes(); 161 query_cache.PopAsyncFlushes();
159 } 162 }
160 163
161 void CommitAsyncFlushes() { 164 void CommitAsyncFlushes() {
165 std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex};
162 texture_cache.CommitAsyncFlushes(); 166 texture_cache.CommitAsyncFlushes();
163 buffer_cache.CommitAsyncFlushes(); 167 buffer_cache.CommitAsyncFlushes();
164 query_cache.CommitAsyncFlushes(); 168 query_cache.CommitAsyncFlushes();
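The fence manager now takes both cache mutexes before querying or popping flushes. std::scoped_lock acquires multiple mutexes with a deadlock-avoidance algorithm, which is the property relied on above; a standalone sketch (the mutex types here are assumptions):

#include <mutex>

struct CachesSketch {
    std::mutex buffer_mutex;
    std::mutex texture_mutex;

    bool ShouldWait() {
        // Both mutexes are locked together, so two threads that name them in
        // different orders cannot deadlock against each other.
        std::scoped_lock lock{buffer_mutex, texture_mutex};
        return false; // placeholder predicate
    }
};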
diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt
index 28f2b8614..970120acc 100644
--- a/src/video_core/host_shaders/CMakeLists.txt
+++ b/src/video_core/host_shaders/CMakeLists.txt
@@ -12,7 +12,6 @@ set(SHADER_FILES
12 vulkan_blit_depth_stencil.frag 12 vulkan_blit_depth_stencil.frag
13 vulkan_present.frag 13 vulkan_present.frag
14 vulkan_present.vert 14 vulkan_present.vert
15 vulkan_quad_array.comp
16 vulkan_quad_indexed.comp 15 vulkan_quad_indexed.comp
17 vulkan_uint8.comp 16 vulkan_uint8.comp
18) 17)
diff --git a/src/video_core/host_shaders/vulkan_quad_array.comp b/src/video_core/host_shaders/vulkan_quad_array.comp
deleted file mode 100644
index 212f4e998..000000000
--- a/src/video_core/host_shaders/vulkan_quad_array.comp
+++ /dev/null
@@ -1,28 +0,0 @@
1// Copyright 2019 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#version 460 core
6
7layout (local_size_x = 1024) in;
8
9layout (std430, set = 0, binding = 0) buffer OutputBuffer {
10 uint output_indexes[];
11};
12
13layout (push_constant) uniform PushConstants {
14 uint first;
15};
16
17void main() {
18 uint primitive = gl_GlobalInvocationID.x;
19 if (primitive * 6 >= output_indexes.length()) {
20 return;
21 }
22
23 const uint quad_map[6] = uint[](0, 1, 2, 0, 2, 3);
24 for (uint vertex = 0; vertex < 6; ++vertex) {
25 uint index = first + primitive * 4 + quad_map[vertex];
26 output_indexes[primitive * 6 + vertex] = index;
27 }
28}
diff --git a/src/video_core/host_shaders/vulkan_uint8.comp b/src/video_core/host_shaders/vulkan_uint8.comp
index ad74d7af9..872291670 100644
--- a/src/video_core/host_shaders/vulkan_uint8.comp
+++ b/src/video_core/host_shaders/vulkan_uint8.comp
@@ -16,9 +16,16 @@ layout (std430, set = 0, binding = 1) writeonly buffer OutputBuffer {
16 uint16_t output_indexes[]; 16 uint16_t output_indexes[];
17}; 17};
18 18
19uint AssembleIndex(uint id) {
20 // Most primitive restart indices are 0xFF
21 // Hardcode this to 0xFF for now
22 uint index = uint(input_indexes[id]);
23 return index == 0xFF ? 0xFFFF : index;
24}
25
19void main() { 26void main() {
20 uint id = gl_GlobalInvocationID.x; 27 uint id = gl_GlobalInvocationID.x;
21 if (id < input_indexes.length()) { 28 if (id < input_indexes.length()) {
22 output_indexes[id] = uint16_t(input_indexes[id]); 29 output_indexes[id] = uint16_t(AssembleIndex(id));
23 } 30 }
24} 31}
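The shader change widens 8-bit indices to 16 bits while promoting the 0xFF primitive-restart marker to 0xFFFF. An equivalent CPU-side sketch of the same transform:

#include <cstdint>
#include <vector>

std::vector<std::uint16_t> WidenUint8Indices(const std::vector<std::uint8_t>& input) {
    std::vector<std::uint16_t> output;
    output.reserve(input.size());
    for (const std::uint8_t index : input) {
        // 0xFF is treated as the restart index and becomes 0xFFFF.
        output.push_back(index == 0xFF ? 0xFFFF : static_cast<std::uint16_t>(index));
    }
    return output;
}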
diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h
index 0cb0f387d..50491b758 100644
--- a/src/video_core/rasterizer_interface.h
+++ b/src/video_core/rasterizer_interface.h
@@ -7,6 +7,7 @@
7#include <atomic> 7#include <atomic>
8#include <functional> 8#include <functional>
9#include <optional> 9#include <optional>
10#include <span>
10#include "common/common_types.h" 11#include "common/common_types.h"
11#include "video_core/engines/fermi_2d.h" 12#include "video_core/engines/fermi_2d.h"
12#include "video_core/gpu.h" 13#include "video_core/gpu.h"
@@ -49,6 +50,10 @@ public:
49 /// Records a GPU query and caches it 50 /// Records a GPU query and caches it
50 virtual void Query(GPUVAddr gpu_addr, QueryType type, std::optional<u64> timestamp) = 0; 51 virtual void Query(GPUVAddr gpu_addr, QueryType type, std::optional<u64> timestamp) = 0;
51 52
 53 /// Signal a uniform buffer binding
54 virtual void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr,
55 u32 size) = 0;
56
52 /// Signal a GPU based semaphore as a fence 57 /// Signal a GPU based semaphore as a fence
53 virtual void SignalSemaphore(GPUVAddr addr, u32 value) = 0; 58 virtual void SignalSemaphore(GPUVAddr addr, u32 value) = 0;
54 59
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
index 5772cad87..889ad6c56 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
@@ -2,98 +2,235 @@
2// Licensed under GPLv2 or any later version 2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included. 3// Refer to the license.txt file included.
4 4
5#include <memory> 5#include <span>
6 6
7#include <glad/glad.h>
8
9#include "common/assert.h"
10#include "common/microprofile.h"
11#include "video_core/buffer_cache/buffer_cache.h" 7#include "video_core/buffer_cache/buffer_cache.h"
12#include "video_core/engines/maxwell_3d.h"
13#include "video_core/rasterizer_interface.h"
14#include "video_core/renderer_opengl/gl_buffer_cache.h" 8#include "video_core/renderer_opengl/gl_buffer_cache.h"
15#include "video_core/renderer_opengl/gl_device.h" 9#include "video_core/renderer_opengl/gl_device.h"
16#include "video_core/renderer_opengl/gl_rasterizer.h" 10#include "video_core/vulkan_common/vulkan_device.h"
17#include "video_core/renderer_opengl/gl_resource_manager.h" 11#include "video_core/vulkan_common/vulkan_instance.h"
12#include "video_core/vulkan_common/vulkan_library.h"
13#include "video_core/vulkan_common/vulkan_memory_allocator.h"
18 14
19namespace OpenGL { 15namespace OpenGL {
16namespace {
17struct BindlessSSBO {
18 GLuint64EXT address;
19 GLsizei length;
20 GLsizei padding;
21};
22static_assert(sizeof(BindlessSSBO) == sizeof(GLuint) * 4);
23
24constexpr std::array PROGRAM_LUT{
25 GL_VERTEX_PROGRAM_NV, GL_TESS_CONTROL_PROGRAM_NV, GL_TESS_EVALUATION_PROGRAM_NV,
26 GL_GEOMETRY_PROGRAM_NV, GL_FRAGMENT_PROGRAM_NV,
27};
28} // Anonymous namespace
29
30Buffer::Buffer(BufferCacheRuntime&, VideoCommon::NullBufferParams null_params)
31 : VideoCommon::BufferBase<VideoCore::RasterizerInterface>(null_params) {}
32
33Buffer::Buffer(BufferCacheRuntime& runtime, VideoCore::RasterizerInterface& rasterizer_,
34 VAddr cpu_addr_, u64 size_bytes_)
35 : VideoCommon::BufferBase<VideoCore::RasterizerInterface>(rasterizer_, cpu_addr_, size_bytes_) {
36 buffer.Create();
37 const std::string name = fmt::format("Buffer 0x{:x}", CpuAddr());
38 glObjectLabel(GL_BUFFER, buffer.handle, static_cast<GLsizei>(name.size()), name.data());
39 if (runtime.device.UseAssemblyShaders()) {
40 CreateMemoryObjects(runtime);
41 glNamedBufferStorageMemEXT(buffer.handle, SizeBytes(), memory_commit.ExportOpenGLHandle(),
42 memory_commit.Offset());
43 } else {
44 glNamedBufferData(buffer.handle, SizeBytes(), nullptr, GL_DYNAMIC_DRAW);
45 }
46 if (runtime.has_unified_vertex_buffers) {
47 glGetNamedBufferParameterui64vNV(buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &address);
48 }
49}
20 50
21using Maxwell = Tegra::Engines::Maxwell3D::Regs; 51void Buffer::ImmediateUpload(size_t offset, std::span<const u8> data) noexcept {
52 glNamedBufferSubData(buffer.handle, static_cast<GLintptr>(offset),
53 static_cast<GLsizeiptr>(data.size_bytes()), data.data());
54}
22 55
23MICROPROFILE_DEFINE(OpenGL_Buffer_Download, "OpenGL", "Buffer Download", MP_RGB(192, 192, 128)); 56void Buffer::ImmediateDownload(size_t offset, std::span<u8> data) noexcept {
57 glGetNamedBufferSubData(buffer.handle, static_cast<GLintptr>(offset),
58 static_cast<GLsizeiptr>(data.size_bytes()), data.data());
59}
24 60
25Buffer::Buffer(const Device& device_, VAddr cpu_addr_, std::size_t size_) 61void Buffer::MakeResident(GLenum access) noexcept {
26 : BufferBlock{cpu_addr_, size_} { 62 // Abuse GLenum's order to exit early
27 gl_buffer.Create(); 63 // GL_NONE (default) < GL_READ_ONLY < GL_READ_WRITE
28 glNamedBufferData(gl_buffer.handle, static_cast<GLsizeiptr>(size_), nullptr, GL_DYNAMIC_DRAW); 64 if (access <= current_residency_access || buffer.handle == 0) {
29 if (device_.UseAssemblyShaders() || device_.HasVertexBufferUnifiedMemory()) { 65 return;
30 glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_WRITE); 66 }
31 glGetNamedBufferParameterui64vNV(gl_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address); 67 if (std::exchange(current_residency_access, access) != GL_NONE) {
68 // If the buffer is already resident, remove its residency before promoting it
69 glMakeNamedBufferNonResidentNV(buffer.handle);
32 } 70 }
71 glMakeNamedBufferResidentNV(buffer.handle, access);
33} 72}
34 73
35Buffer::~Buffer() = default; 74GLuint Buffer::SubBuffer(u32 offset) {
75 if (offset == 0) {
76 return buffer.handle;
77 }
78 for (const auto& [sub_buffer, sub_offset] : subs) {
79 if (sub_offset == offset) {
80 return sub_buffer.handle;
81 }
82 }
83 OGLBuffer sub_buffer;
84 sub_buffer.Create();
85 glNamedBufferStorageMemEXT(sub_buffer.handle, SizeBytes() - offset,
86 memory_commit.ExportOpenGLHandle(), memory_commit.Offset() + offset);
87 return subs.emplace_back(std::move(sub_buffer), offset).first.handle;
88}
36 89
37void Buffer::Upload(std::size_t offset, std::size_t data_size, const u8* data) { 90void Buffer::CreateMemoryObjects(BufferCacheRuntime& runtime) {
38 glNamedBufferSubData(Handle(), static_cast<GLintptr>(offset), 91 auto& allocator = runtime.vulkan_memory_allocator;
39 static_cast<GLsizeiptr>(data_size), data); 92 auto& device = runtime.vulkan_device->GetLogical();
93 auto vulkan_buffer = device.CreateBuffer(VkBufferCreateInfo{
94 .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
95 .pNext = nullptr,
96 .flags = 0,
97 .size = SizeBytes(),
98 .usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT |
99 VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT |
100 VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT |
101 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT |
102 VK_BUFFER_USAGE_VERTEX_BUFFER_BIT,
103 .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
104 .queueFamilyIndexCount = 0,
105 .pQueueFamilyIndices = nullptr,
106 });
107 const VkMemoryRequirements requirements = device.GetBufferMemoryRequirements(*vulkan_buffer);
108 memory_commit = allocator->Commit(requirements, Vulkan::MemoryUsage::DeviceLocal);
40} 109}
41 110
42void Buffer::Download(std::size_t offset, std::size_t data_size, u8* data) { 111BufferCacheRuntime::BufferCacheRuntime(const Device& device_, const Vulkan::Device* vulkan_device_,
43 MICROPROFILE_SCOPE(OpenGL_Buffer_Download); 112 Vulkan::MemoryAllocator* vulkan_memory_allocator_)
44 const GLsizeiptr gl_size = static_cast<GLsizeiptr>(data_size); 113 : device{device_}, vulkan_device{vulkan_device_},
45 const GLintptr gl_offset = static_cast<GLintptr>(offset); 114 vulkan_memory_allocator{vulkan_memory_allocator_},
46 if (read_buffer.handle == 0) { 115 stream_buffer{device.HasFastBufferSubData() ? std::nullopt
47 read_buffer.Create(); 116 : std::make_optional<StreamBuffer>()} {
48 glNamedBufferData(read_buffer.handle, static_cast<GLsizeiptr>(Size()), nullptr, 117 GLint gl_max_attributes;
49 GL_STREAM_READ); 118 glGetIntegerv(GL_MAX_VERTEX_ATTRIBS, &gl_max_attributes);
119 max_attributes = static_cast<u32>(gl_max_attributes);
120 use_assembly_shaders = device.UseAssemblyShaders();
121 has_unified_vertex_buffers = device.HasVertexBufferUnifiedMemory();
122
123 for (auto& stage_uniforms : fast_uniforms) {
124 for (OGLBuffer& buffer : stage_uniforms) {
125 buffer.Create();
126 glNamedBufferData(buffer.handle, BufferCache::SKIP_CACHE_SIZE, nullptr, GL_STREAM_DRAW);
127 }
50 } 128 }
51 glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT);
52 glCopyNamedBufferSubData(gl_buffer.handle, read_buffer.handle, gl_offset, gl_offset, gl_size);
53 glGetNamedBufferSubData(read_buffer.handle, gl_offset, gl_size, data);
54} 129}
55 130
56void Buffer::CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset, 131void BufferCacheRuntime::CopyBuffer(Buffer& dst_buffer, Buffer& src_buffer,
57 std::size_t copy_size) { 132 std::span<const VideoCommon::BufferCopy> copies) {
58 glCopyNamedBufferSubData(src.Handle(), Handle(), static_cast<GLintptr>(src_offset), 133 for (const VideoCommon::BufferCopy& copy : copies) {
59 static_cast<GLintptr>(dst_offset), static_cast<GLsizeiptr>(copy_size)); 134 glCopyNamedBufferSubData(
135 src_buffer.Handle(), dst_buffer.Handle(), static_cast<GLintptr>(copy.src_offset),
136 static_cast<GLintptr>(copy.dst_offset), static_cast<GLsizeiptr>(copy.size));
137 }
60} 138}
61 139
62OGLBufferCache::OGLBufferCache(VideoCore::RasterizerInterface& rasterizer_, 140void BufferCacheRuntime::BindIndexBuffer(Buffer& buffer, u32 offset, u32 size) {
63 Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_, 141 if (has_unified_vertex_buffers) {
64 const Device& device_, OGLStreamBuffer& stream_buffer_, 142 buffer.MakeResident(GL_READ_ONLY);
65 StateTracker& state_tracker) 143 glBufferAddressRangeNV(GL_ELEMENT_ARRAY_ADDRESS_NV, 0, buffer.HostGpuAddr() + offset,
66 : GenericBufferCache{rasterizer_, gpu_memory_, cpu_memory_, stream_buffer_}, device{device_} { 144 static_cast<GLsizeiptr>(size));
67 if (!device.HasFastBufferSubData()) { 145 } else {
68 return; 146 glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, buffer.Handle());
147 index_buffer_offset = offset;
69 } 148 }
149}
70 150
71 static constexpr GLsizeiptr size = static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize); 151void BufferCacheRuntime::BindVertexBuffer(u32 index, Buffer& buffer, u32 offset, u32 size,
72 glCreateBuffers(static_cast<GLsizei>(std::size(cbufs)), std::data(cbufs)); 152 u32 stride) {
73 for (const GLuint cbuf : cbufs) { 153 if (index >= max_attributes) {
74 glNamedBufferData(cbuf, size, nullptr, GL_STREAM_DRAW); 154 return;
155 }
156 if (has_unified_vertex_buffers) {
157 buffer.MakeResident(GL_READ_ONLY);
158 glBindVertexBuffer(index, 0, 0, static_cast<GLsizei>(stride));
159 glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, index,
160 buffer.HostGpuAddr() + offset, static_cast<GLsizeiptr>(size));
161 } else {
162 glBindVertexBuffer(index, buffer.Handle(), static_cast<GLintptr>(offset),
163 static_cast<GLsizei>(stride));
75 } 164 }
76} 165}
77 166
78OGLBufferCache::~OGLBufferCache() { 167void BufferCacheRuntime::BindUniformBuffer(size_t stage, u32 binding_index, Buffer& buffer,
79 glDeleteBuffers(static_cast<GLsizei>(std::size(cbufs)), std::data(cbufs)); 168 u32 offset, u32 size) {
169 if (use_assembly_shaders) {
170 const GLuint sub_buffer = buffer.SubBuffer(offset);
171 glBindBufferRangeNV(PABO_LUT[stage], binding_index, sub_buffer, 0,
172 static_cast<GLsizeiptr>(size));
173 } else {
174 const GLuint base_binding = device.GetBaseBindings(stage).uniform_buffer;
175 const GLuint binding = base_binding + binding_index;
176 glBindBufferRange(GL_UNIFORM_BUFFER, binding, buffer.Handle(),
177 static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size));
178 }
80} 179}
81 180
82std::shared_ptr<Buffer> OGLBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) { 181void BufferCacheRuntime::BindComputeUniformBuffer(u32 binding_index, Buffer& buffer, u32 offset,
83 return std::make_shared<Buffer>(device, cpu_addr, size); 182 u32 size) {
183 if (use_assembly_shaders) {
184 glBindBufferRangeNV(GL_COMPUTE_PROGRAM_PARAMETER_BUFFER_NV, binding_index,
185 buffer.SubBuffer(offset), 0, static_cast<GLsizeiptr>(size));
186 } else {
187 glBindBufferRange(GL_UNIFORM_BUFFER, binding_index, buffer.Handle(),
188 static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size));
189 }
84} 190}
85 191
86OGLBufferCache::BufferInfo OGLBufferCache::GetEmptyBuffer(std::size_t) { 192void BufferCacheRuntime::BindStorageBuffer(size_t stage, u32 binding_index, Buffer& buffer,
87 return {0, 0, 0}; 193 u32 offset, u32 size, bool is_written) {
194 if (use_assembly_shaders) {
195 const BindlessSSBO ssbo{
196 .address = buffer.HostGpuAddr() + offset,
197 .length = static_cast<GLsizei>(size),
198 .padding = 0,
199 };
200 buffer.MakeResident(is_written ? GL_READ_WRITE : GL_READ_ONLY);
201 glProgramLocalParametersI4uivNV(PROGRAM_LUT[stage], binding_index, 1,
202 reinterpret_cast<const GLuint*>(&ssbo));
203 } else {
204 const GLuint base_binding = device.GetBaseBindings(stage).shader_storage_buffer;
205 const GLuint binding = base_binding + binding_index;
206 glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, buffer.Handle(),
207 static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size));
208 }
88} 209}
89 210
90OGLBufferCache::BufferInfo OGLBufferCache::ConstBufferUpload(const void* raw_pointer, 211void BufferCacheRuntime::BindComputeStorageBuffer(u32 binding_index, Buffer& buffer, u32 offset,
91 std::size_t size) { 212 u32 size, bool is_written) {
92 DEBUG_ASSERT(cbuf_cursor < std::size(cbufs)); 213 if (use_assembly_shaders) {
93 const GLuint cbuf = cbufs[cbuf_cursor++]; 214 const BindlessSSBO ssbo{
215 .address = buffer.HostGpuAddr() + offset,
216 .length = static_cast<GLsizei>(size),
217 .padding = 0,
218 };
219 buffer.MakeResident(is_written ? GL_READ_WRITE : GL_READ_ONLY);
220 glProgramLocalParametersI4uivNV(GL_COMPUTE_PROGRAM_NV, binding_index, 1,
221 reinterpret_cast<const GLuint*>(&ssbo));
222 } else if (size == 0) {
223 glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding_index, 0, 0, 0);
224 } else {
225 glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding_index, buffer.Handle(),
226 static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size));
227 }
228}
94 229
95 glNamedBufferSubData(cbuf, 0, static_cast<GLsizeiptr>(size), raw_pointer); 230void BufferCacheRuntime::BindTransformFeedbackBuffer(u32 index, Buffer& buffer, u32 offset,
96 return {cbuf, 0, 0}; 231 u32 size) {
232 glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER, index, buffer.Handle(),
233 static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size));
97} 234}
98 235
99} // namespace OpenGL 236} // namespace OpenGL
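Buffer::MakeResident above exits early when the requested access is not stronger than the current one, relying on the numeric ordering of the GLenum values. The ordering it depends on, restated as a standalone check (constants copied here for illustration; real code should use the glad headers):

#include <cstdint>

constexpr std::uint32_t NONE_SKETCH = 0;            // GL_NONE
constexpr std::uint32_t READ_ONLY_SKETCH = 0x88B8;  // GL_READ_ONLY
constexpr std::uint32_t READ_WRITE_SKETCH = 0x88BA; // GL_READ_WRITE

// GL_NONE < GL_READ_ONLY < GL_READ_WRITE, so "access <= current" means the
// buffer is already resident with equal or stronger access.
static_assert(NONE_SKETCH < READ_ONLY_SKETCH && READ_ONLY_SKETCH < READ_WRITE_SKETCH);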
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h
index 17ee90316..f4d8871a9 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.h
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.h
@@ -5,79 +5,167 @@
5#pragma once 5#pragma once
6 6
7#include <array> 7#include <array>
8#include <memory> 8#include <span>
9 9
10#include "common/alignment.h"
10#include "common/common_types.h" 11#include "common/common_types.h"
12#include "common/dynamic_library.h"
11#include "video_core/buffer_cache/buffer_cache.h" 13#include "video_core/buffer_cache/buffer_cache.h"
12#include "video_core/engines/maxwell_3d.h" 14#include "video_core/rasterizer_interface.h"
15#include "video_core/renderer_opengl/gl_device.h"
13#include "video_core/renderer_opengl/gl_resource_manager.h" 16#include "video_core/renderer_opengl/gl_resource_manager.h"
14#include "video_core/renderer_opengl/gl_stream_buffer.h" 17#include "video_core/renderer_opengl/gl_stream_buffer.h"
18#include "video_core/vulkan_common/vulkan_device.h"
19#include "video_core/vulkan_common/vulkan_memory_allocator.h"
15 20
16namespace Core { 21namespace Vulkan {
17class System; 22class Device;
18} 23class MemoryAllocator;
24} // namespace Vulkan
19 25
20namespace OpenGL { 26namespace OpenGL {
21 27
22class Device; 28class BufferCacheRuntime;
23class OGLStreamBuffer;
24class RasterizerOpenGL;
25class StateTracker;
26 29
27class Buffer : public VideoCommon::BufferBlock { 30class Buffer : public VideoCommon::BufferBase<VideoCore::RasterizerInterface> {
28public: 31public:
29 explicit Buffer(const Device& device_, VAddr cpu_addr_, std::size_t size_); 32 explicit Buffer(BufferCacheRuntime&, VideoCore::RasterizerInterface& rasterizer, VAddr cpu_addr,
30 ~Buffer(); 33 u64 size_bytes);
34 explicit Buffer(BufferCacheRuntime&, VideoCommon::NullBufferParams);
31 35
32 void Upload(std::size_t offset, std::size_t data_size, const u8* data); 36 void ImmediateUpload(size_t offset, std::span<const u8> data) noexcept;
33 37
34 void Download(std::size_t offset, std::size_t data_size, u8* data); 38 void ImmediateDownload(size_t offset, std::span<u8> data) noexcept;
35 39
36 void CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset, 40 void MakeResident(GLenum access) noexcept;
37 std::size_t copy_size);
38 41
39 GLuint Handle() const noexcept { 42 [[nodiscard]] GLuint SubBuffer(u32 offset);
40 return gl_buffer.handle; 43
44 [[nodiscard]] GLuint64EXT HostGpuAddr() const noexcept {
45 return address;
41 } 46 }
42 47
43 u64 Address() const noexcept { 48 [[nodiscard]] GLuint Handle() const noexcept {
44 return gpu_address; 49 return buffer.handle;
45 } 50 }
46 51
47private: 52private:
48 OGLBuffer gl_buffer; 53 void CreateMemoryObjects(BufferCacheRuntime& runtime);
49 OGLBuffer read_buffer; 54
50 u64 gpu_address = 0; 55 GLuint64EXT address = 0;
56 Vulkan::MemoryCommit memory_commit;
57 OGLBuffer buffer;
58 GLenum current_residency_access = GL_NONE;
59 std::vector<std::pair<OGLBuffer, u32>> subs;
51}; 60};
52 61
53using GenericBufferCache = VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>; 62class BufferCacheRuntime {
54class OGLBufferCache final : public GenericBufferCache { 63 friend Buffer;
64
55public: 65public:
56 explicit OGLBufferCache(VideoCore::RasterizerInterface& rasterizer, 66 static constexpr u8 INVALID_BINDING = std::numeric_limits<u8>::max();
57 Tegra::MemoryManager& gpu_memory, Core::Memory::Memory& cpu_memory, 67
58 const Device& device, OGLStreamBuffer& stream_buffer, 68 explicit BufferCacheRuntime(const Device& device_, const Vulkan::Device* vulkan_device_,
59 StateTracker& state_tracker); 69 Vulkan::MemoryAllocator* vulkan_memory_allocator_);
60 ~OGLBufferCache(); 70
71 void CopyBuffer(Buffer& dst_buffer, Buffer& src_buffer,
72 std::span<const VideoCommon::BufferCopy> copies);
73
74 void BindIndexBuffer(Buffer& buffer, u32 offset, u32 size);
75
76 void BindVertexBuffer(u32 index, Buffer& buffer, u32 offset, u32 size, u32 stride);
77
78 void BindUniformBuffer(size_t stage, u32 binding_index, Buffer& buffer, u32 offset, u32 size);
79
80 void BindComputeUniformBuffer(u32 binding_index, Buffer& buffer, u32 offset, u32 size);
81
82 void BindStorageBuffer(size_t stage, u32 binding_index, Buffer& buffer, u32 offset, u32 size,
83 bool is_written);
84
85 void BindComputeStorageBuffer(u32 binding_index, Buffer& buffer, u32 offset, u32 size,
86 bool is_written);
61 87
62 BufferInfo GetEmptyBuffer(std::size_t) override; 88 void BindTransformFeedbackBuffer(u32 index, Buffer& buffer, u32 offset, u32 size);
63 89
64 void Acquire() noexcept { 90 void BindFastUniformBuffer(size_t stage, u32 binding_index, u32 size) {
65 cbuf_cursor = 0; 91 if (use_assembly_shaders) {
92 const GLuint handle = fast_uniforms[stage][binding_index].handle;
93 const GLsizeiptr gl_size = static_cast<GLsizeiptr>(size);
94 glBindBufferRangeNV(PABO_LUT[stage], binding_index, handle, 0, gl_size);
95 } else {
96 const GLuint base_binding = device.GetBaseBindings(stage).uniform_buffer;
97 const GLuint binding = base_binding + binding_index;
98 glBindBufferRange(GL_UNIFORM_BUFFER, binding,
99 fast_uniforms[stage][binding_index].handle, 0,
100 static_cast<GLsizeiptr>(size));
101 }
66 } 102 }
67 103
68protected: 104 void PushFastUniformBuffer(size_t stage, u32 binding_index, std::span<const u8> data) {
69 std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) override; 105 if (use_assembly_shaders) {
106 glProgramBufferParametersIuivNV(
107 PABO_LUT[stage], binding_index, 0,
108 static_cast<GLsizei>(data.size_bytes() / sizeof(GLuint)),
109 reinterpret_cast<const GLuint*>(data.data()));
110 } else {
111 glNamedBufferSubData(fast_uniforms[stage][binding_index].handle, 0,
112 static_cast<GLsizeiptr>(data.size_bytes()), data.data());
113 }
114 }
115
116 std::span<u8> BindMappedUniformBuffer(size_t stage, u32 binding_index, u32 size) noexcept {
117 const auto [mapped_span, offset] = stream_buffer->Request(static_cast<size_t>(size));
118 const GLuint base_binding = device.GetBaseBindings(stage).uniform_buffer;
119 const GLuint binding = base_binding + binding_index;
120 glBindBufferRange(GL_UNIFORM_BUFFER, binding, stream_buffer->Handle(),
121 static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size));
122 return mapped_span;
123 }
70 124
71 BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) override; 125 [[nodiscard]] const GLvoid* IndexOffset() const noexcept {
126 return reinterpret_cast<const GLvoid*>(static_cast<uintptr_t>(index_buffer_offset));
127 }
128
129 [[nodiscard]] bool HasFastBufferSubData() const noexcept {
130 return device.HasFastBufferSubData();
131 }
72 132
73private: 133private:
74 static constexpr std::size_t NUM_CBUFS = Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers * 134 static constexpr std::array PABO_LUT{
75 Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram; 135 GL_VERTEX_PROGRAM_PARAMETER_BUFFER_NV, GL_TESS_CONTROL_PROGRAM_PARAMETER_BUFFER_NV,
136 GL_TESS_EVALUATION_PROGRAM_PARAMETER_BUFFER_NV, GL_GEOMETRY_PROGRAM_PARAMETER_BUFFER_NV,
137 GL_FRAGMENT_PROGRAM_PARAMETER_BUFFER_NV,
138 };
76 139
77 const Device& device; 140 const Device& device;
141 const Vulkan::Device* vulkan_device;
142 Vulkan::MemoryAllocator* vulkan_memory_allocator;
143 std::optional<StreamBuffer> stream_buffer;
144
145 u32 max_attributes = 0;
78 146
79 std::size_t cbuf_cursor = 0; 147 bool use_assembly_shaders = false;
80 std::array<GLuint, NUM_CBUFS> cbufs{}; 148 bool has_unified_vertex_buffers = false;
149
150 std::array<std::array<OGLBuffer, VideoCommon::NUM_GRAPHICS_UNIFORM_BUFFERS>,
151 VideoCommon::NUM_STAGES>
152 fast_uniforms;
153
154 u32 index_buffer_offset = 0;
155};
156
157struct BufferCacheParams {
158 using Runtime = OpenGL::BufferCacheRuntime;
159 using Buffer = OpenGL::Buffer;
160
161 static constexpr bool IS_OPENGL = true;
162 static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS = true;
163 static constexpr bool HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT = true;
164 static constexpr bool NEEDS_BIND_UNIFORM_INDEX = true;
165 static constexpr bool NEEDS_BIND_STORAGE_INDEX = true;
166 static constexpr bool USE_MEMORY_MAPS = false;
81}; 167};
82 168
169using BufferCache = VideoCommon::BufferCache<BufferCacheParams>;
170
83} // namespace OpenGL 171} // namespace OpenGL
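The header now ends by describing the backend to the shared cache: BufferCacheParams names the runtime and buffer types plus a set of constexpr capability flags, and BufferCache is simply VideoCommon::BufferCache instantiated with it. The sketch below is a stripped-down illustration of that traits pattern; GenericBufferCache, FakeRuntime and MyParams are invented names rather than the real VideoCommon template, and only show how constexpr flags in a params struct select backend behaviour at compile time.

#include <cstddef>
#include <cstdint>

// Invented stand-ins for the types a params struct would name.
struct FakeRuntime {
    void BindUniformBufferIndexed(std::size_t stage, std::uint32_t index) {
        // An OpenGL-style backend would issue a glBindBufferRange-like call here.
        (void)stage;
        (void)index;
    }
};
struct FakeBuffer {};

// Shared cache whose behaviour is chosen from compile-time traits.
template <class P>
class GenericBufferCache {
public:
    using Runtime = typename P::Runtime;
    using Buffer = typename P::Buffer;

    explicit GenericBufferCache(Runtime& runtime_) : runtime{runtime_} {}

    void BindGraphicsUniformBuffer(std::size_t stage, std::uint32_t index) {
        if constexpr (P::NEEDS_BIND_UNIFORM_INDEX) {
            // Indexed-binding backends (OpenGL) go through the runtime directly.
            runtime.BindUniformBufferIndexed(stage, index);
        } else {
            // A descriptor-based backend would record the binding instead.
        }
    }

private:
    Runtime& runtime;
};

// Shaped like BufferCacheParams above, with invented values.
struct MyParams {
    using Runtime = FakeRuntime;
    using Buffer = FakeBuffer;
    static constexpr bool NEEDS_BIND_UNIFORM_INDEX = true;
    static constexpr bool USE_MEMORY_MAPS = false;
};

using MyBufferCache = GenericBufferCache<MyParams>;

The real template carries far more (staging, tracking, downloads), but constexpr-flag dispatch of this kind is presumably what toggles such as IS_OPENGL and USE_MEMORY_MAPS feed.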
diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp
index 04c267ee4..0f492f006 100644
--- a/src/video_core/renderer_opengl/gl_device.cpp
+++ b/src/video_core/renderer_opengl/gl_device.cpp
@@ -21,9 +21,7 @@
21#include "video_core/renderer_opengl/gl_resource_manager.h" 21#include "video_core/renderer_opengl/gl_resource_manager.h"
22 22
23namespace OpenGL { 23namespace OpenGL {
24
25namespace { 24namespace {
26
27// One uniform block is reserved for emulation purposes 25// One uniform block is reserved for emulation purposes
28constexpr u32 ReservedUniformBlocks = 1; 26constexpr u32 ReservedUniformBlocks = 1;
29 27
@@ -197,11 +195,13 @@ bool IsASTCSupported() {
197 const bool nsight = std::getenv("NVTX_INJECTION64_PATH") || std::getenv("NSIGHT_LAUNCHED"); 195 const bool nsight = std::getenv("NVTX_INJECTION64_PATH") || std::getenv("NSIGHT_LAUNCHED");
198 return nsight || HasExtension(extensions, "GL_EXT_debug_tool"); 196 return nsight || HasExtension(extensions, "GL_EXT_debug_tool");
199} 197}
200
201} // Anonymous namespace 198} // Anonymous namespace
202 199
203Device::Device() 200Device::Device(bool has_vulkan_instance) {
204 : max_uniform_buffers{BuildMaxUniformBuffers()}, base_bindings{BuildBaseBindings()} { 201 if (!GLAD_GL_VERSION_4_6) {
202 LOG_ERROR(Render_OpenGL, "OpenGL 4.6 is not available");
203 throw std::runtime_error{"Insufficient version"};
204 }
205 const std::string_view vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR)); 205 const std::string_view vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR));
206 const std::string_view version = reinterpret_cast<const char*>(glGetString(GL_VERSION)); 206 const std::string_view version = reinterpret_cast<const char*>(glGetString(GL_VERSION));
207 const std::vector extensions = GetExtensions(); 207 const std::vector extensions = GetExtensions();
@@ -217,6 +217,9 @@ Device::Device()
217 "Beta driver 443.24 is known to have issues. There might be performance issues."); 217 "Beta driver 443.24 is known to have issues. There might be performance issues.");
218 disable_fast_buffer_sub_data = true; 218 disable_fast_buffer_sub_data = true;
219 } 219 }
220
221 max_uniform_buffers = BuildMaxUniformBuffers();
222 base_bindings = BuildBaseBindings();
220 uniform_buffer_alignment = GetInteger<size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT); 223 uniform_buffer_alignment = GetInteger<size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT);
221 shader_storage_alignment = GetInteger<size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT); 224 shader_storage_alignment = GetInteger<size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT);
222 max_vertex_attributes = GetInteger<u32>(GL_MAX_VERTEX_ATTRIBS); 225 max_vertex_attributes = GetInteger<u32>(GL_MAX_VERTEX_ATTRIBS);
@@ -243,7 +246,8 @@ Device::Device()
243 246
244 use_assembly_shaders = Settings::values.use_assembly_shaders.GetValue() && 247 use_assembly_shaders = Settings::values.use_assembly_shaders.GetValue() &&
245 GLAD_GL_NV_gpu_program5 && GLAD_GL_NV_compute_program5 && 248 GLAD_GL_NV_gpu_program5 && GLAD_GL_NV_compute_program5 &&
246 GLAD_GL_NV_transform_feedback && GLAD_GL_NV_transform_feedback2; 249 GLAD_GL_NV_transform_feedback && GLAD_GL_NV_transform_feedback2 &&
250 has_vulkan_instance;
247 251
248 use_asynchronous_shaders = Settings::values.use_asynchronous_shaders.GetValue(); 252 use_asynchronous_shaders = Settings::values.use_asynchronous_shaders.GetValue();
249 use_driver_cache = is_nvidia; 253 use_driver_cache = is_nvidia;
diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h
index 9141de635..eb62ae52d 100644
--- a/src/video_core/renderer_opengl/gl_device.h
+++ b/src/video_core/renderer_opengl/gl_device.h
@@ -10,18 +10,16 @@
10 10
11namespace OpenGL { 11namespace OpenGL {
12 12
13static constexpr u32 EmulationUniformBlockBinding = 0; 13class Device {
14
15class Device final {
16public: 14public:
17 struct BaseBindings final { 15 struct BaseBindings {
18 u32 uniform_buffer{}; 16 u32 uniform_buffer{};
19 u32 shader_storage_buffer{}; 17 u32 shader_storage_buffer{};
20 u32 sampler{}; 18 u32 sampler{};
21 u32 image{}; 19 u32 image{};
22 }; 20 };
23 21
24 explicit Device(); 22 explicit Device(bool has_vulkan_instance);
25 explicit Device(std::nullptr_t); 23 explicit Device(std::nullptr_t);
26 24
27 u32 GetMaxUniformBuffers(Tegra::Engines::ShaderType shader_type) const noexcept { 25 u32 GetMaxUniformBuffers(Tegra::Engines::ShaderType shader_type) const noexcept {
diff --git a/src/video_core/renderer_opengl/gl_fence_manager.cpp b/src/video_core/renderer_opengl/gl_fence_manager.cpp
index 3e9c922f5..151290101 100644
--- a/src/video_core/renderer_opengl/gl_fence_manager.cpp
+++ b/src/video_core/renderer_opengl/gl_fence_manager.cpp
@@ -47,7 +47,7 @@ void GLInnerFence::Wait() {
47 47
48FenceManagerOpenGL::FenceManagerOpenGL(VideoCore::RasterizerInterface& rasterizer_, 48FenceManagerOpenGL::FenceManagerOpenGL(VideoCore::RasterizerInterface& rasterizer_,
49 Tegra::GPU& gpu_, TextureCache& texture_cache_, 49 Tegra::GPU& gpu_, TextureCache& texture_cache_,
50 OGLBufferCache& buffer_cache_, QueryCache& query_cache_) 50 BufferCache& buffer_cache_, QueryCache& query_cache_)
51 : GenericFenceManager{rasterizer_, gpu_, texture_cache_, buffer_cache_, query_cache_} {} 51 : GenericFenceManager{rasterizer_, gpu_, texture_cache_, buffer_cache_, query_cache_} {}
52 52
53Fence FenceManagerOpenGL::CreateFence(u32 value, bool is_stubbed) { 53Fence FenceManagerOpenGL::CreateFence(u32 value, bool is_stubbed) {
diff --git a/src/video_core/renderer_opengl/gl_fence_manager.h b/src/video_core/renderer_opengl/gl_fence_manager.h
index 30dbee613..e714aa115 100644
--- a/src/video_core/renderer_opengl/gl_fence_manager.h
+++ b/src/video_core/renderer_opengl/gl_fence_manager.h
@@ -32,14 +32,13 @@ private:
32}; 32};
33 33
34using Fence = std::shared_ptr<GLInnerFence>; 34using Fence = std::shared_ptr<GLInnerFence>;
35using GenericFenceManager = 35using GenericFenceManager = VideoCommon::FenceManager<Fence, TextureCache, BufferCache, QueryCache>;
36 VideoCommon::FenceManager<Fence, TextureCache, OGLBufferCache, QueryCache>;
37 36
38class FenceManagerOpenGL final : public GenericFenceManager { 37class FenceManagerOpenGL final : public GenericFenceManager {
39public: 38public:
40 explicit FenceManagerOpenGL(VideoCore::RasterizerInterface& rasterizer_, Tegra::GPU& gpu_, 39 explicit FenceManagerOpenGL(VideoCore::RasterizerInterface& rasterizer, Tegra::GPU& gpu,
41 TextureCache& texture_cache_, OGLBufferCache& buffer_cache_, 40 TextureCache& texture_cache, BufferCache& buffer_cache,
42 QueryCache& query_cache_); 41 QueryCache& query_cache);
43 42
44protected: 43protected:
45 Fence CreateFence(u32 value, bool is_stubbed) override; 44 Fence CreateFence(u32 value, bool is_stubbed) override;
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index ea4ca9a82..52499ee4c 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -44,17 +44,10 @@ using VideoCore::Surface::PixelFormat;
44using VideoCore::Surface::SurfaceTarget; 44using VideoCore::Surface::SurfaceTarget;
45using VideoCore::Surface::SurfaceType; 45using VideoCore::Surface::SurfaceType;
46 46
47MICROPROFILE_DEFINE(OpenGL_VAO, "OpenGL", "Vertex Format Setup", MP_RGB(128, 128, 192));
48MICROPROFILE_DEFINE(OpenGL_VB, "OpenGL", "Vertex Buffer Setup", MP_RGB(128, 128, 192));
49MICROPROFILE_DEFINE(OpenGL_Shader, "OpenGL", "Shader Setup", MP_RGB(128, 128, 192));
50MICROPROFILE_DEFINE(OpenGL_UBO, "OpenGL", "Const Buffer Setup", MP_RGB(128, 128, 192));
51MICROPROFILE_DEFINE(OpenGL_Index, "OpenGL", "Index Buffer Setup", MP_RGB(128, 128, 192));
52MICROPROFILE_DEFINE(OpenGL_Texture, "OpenGL", "Texture Setup", MP_RGB(128, 128, 192));
53MICROPROFILE_DEFINE(OpenGL_Framebuffer, "OpenGL", "Framebuffer Setup", MP_RGB(128, 128, 192));
54MICROPROFILE_DEFINE(OpenGL_Drawing, "OpenGL", "Drawing", MP_RGB(128, 128, 192)); 47MICROPROFILE_DEFINE(OpenGL_Drawing, "OpenGL", "Drawing", MP_RGB(128, 128, 192));
48MICROPROFILE_DEFINE(OpenGL_Clears, "OpenGL", "Clears", MP_RGB(128, 128, 192));
55MICROPROFILE_DEFINE(OpenGL_Blits, "OpenGL", "Blits", MP_RGB(128, 128, 192)); 49MICROPROFILE_DEFINE(OpenGL_Blits, "OpenGL", "Blits", MP_RGB(128, 128, 192));
56MICROPROFILE_DEFINE(OpenGL_CacheManagement, "OpenGL", "Cache Mgmt", MP_RGB(100, 255, 100)); 50MICROPROFILE_DEFINE(OpenGL_CacheManagement, "OpenGL", "Cache Management", MP_RGB(100, 255, 100));
57MICROPROFILE_DEFINE(OpenGL_PrimitiveAssembly, "OpenGL", "Prim Asmbl", MP_RGB(255, 100, 100));
58 51
59namespace { 52namespace {
60 53
@@ -101,20 +94,6 @@ TextureHandle GetTextureInfo(const Engine& engine, bool via_header_index, const
101 return TextureHandle(engine.AccessConstBuffer32(shader_type, buffer, offset), via_header_index); 94 return TextureHandle(engine.AccessConstBuffer32(shader_type, buffer, offset), via_header_index);
102} 95}
103 96
104std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buffer,
105 const ConstBufferEntry& entry) {
106 if (!entry.IsIndirect()) {
107 return entry.GetSize();
108 }
109 if (buffer.size > Maxwell::MaxConstBufferSize) {
110 LOG_WARNING(Render_OpenGL, "Indirect constbuffer size {} exceeds maximum {}", buffer.size,
111 Maxwell::MaxConstBufferSize);
112 return Maxwell::MaxConstBufferSize;
113 }
114
115 return buffer.size;
116}
117
118/// Translates hardware transform feedback indices 97/// Translates hardware transform feedback indices
119/// @param location Hardware location 98/// @param location Hardware location
120/// @return Pair of ARB_transform_feedback3 token stream first and third arguments 99/// @return Pair of ARB_transform_feedback3 token stream first and third arguments
@@ -147,14 +126,6 @@ void oglEnable(GLenum cap, bool state) {
147 (state ? glEnable : glDisable)(cap); 126 (state ? glEnable : glDisable)(cap);
148} 127}
149 128
150void UpdateBindlessSSBOs(GLenum target, const BindlessSSBO* ssbos, size_t num_ssbos) {
151 if (num_ssbos == 0) {
152 return;
153 }
154 glProgramLocalParametersI4uivNV(target, 0, static_cast<GLsizei>(num_ssbos),
155 reinterpret_cast<const GLuint*>(ssbos));
156}
157
158ImageViewType ImageViewTypeFromEntry(const SamplerEntry& entry) { 129ImageViewType ImageViewTypeFromEntry(const SamplerEntry& entry) {
159 if (entry.is_buffer) { 130 if (entry.is_buffer) {
160 return ImageViewType::Buffer; 131 return ImageViewType::Buffer;
@@ -196,49 +167,35 @@ ImageViewType ImageViewTypeFromEntry(const ImageEntry& entry) {
196 167
197RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& emu_window_, Tegra::GPU& gpu_, 168RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& emu_window_, Tegra::GPU& gpu_,
198 Core::Memory::Memory& cpu_memory_, const Device& device_, 169 Core::Memory::Memory& cpu_memory_, const Device& device_,
170 const Vulkan::Device* vulkan_device,
171 Vulkan::MemoryAllocator* vulkan_memory_allocator,
199 ScreenInfo& screen_info_, ProgramManager& program_manager_, 172 ScreenInfo& screen_info_, ProgramManager& program_manager_,
200 StateTracker& state_tracker_) 173 StateTracker& state_tracker_)
201 : RasterizerAccelerated(cpu_memory_), gpu(gpu_), maxwell3d(gpu.Maxwell3D()), 174 : RasterizerAccelerated(cpu_memory_), gpu(gpu_), maxwell3d(gpu.Maxwell3D()),
202 kepler_compute(gpu.KeplerCompute()), gpu_memory(gpu.MemoryManager()), device(device_), 175 kepler_compute(gpu.KeplerCompute()), gpu_memory(gpu.MemoryManager()), device(device_),
203 screen_info(screen_info_), program_manager(program_manager_), state_tracker(state_tracker_), 176 screen_info(screen_info_), program_manager(program_manager_), state_tracker(state_tracker_),
204 stream_buffer(device, state_tracker),
205 texture_cache_runtime(device, program_manager, state_tracker), 177 texture_cache_runtime(device, program_manager, state_tracker),
206 texture_cache(texture_cache_runtime, *this, maxwell3d, kepler_compute, gpu_memory), 178 texture_cache(texture_cache_runtime, *this, maxwell3d, kepler_compute, gpu_memory),
179 buffer_cache_runtime(device, vulkan_device, vulkan_memory_allocator),
180 buffer_cache(*this, maxwell3d, kepler_compute, gpu_memory, cpu_memory_, buffer_cache_runtime),
207 shader_cache(*this, emu_window_, gpu, maxwell3d, kepler_compute, gpu_memory, device), 181 shader_cache(*this, emu_window_, gpu, maxwell3d, kepler_compute, gpu_memory, device),
208 query_cache(*this, maxwell3d, gpu_memory), 182 query_cache(*this, maxwell3d, gpu_memory),
209 buffer_cache(*this, gpu_memory, cpu_memory_, device, stream_buffer, state_tracker),
210 fence_manager(*this, gpu, texture_cache, buffer_cache, query_cache), 183 fence_manager(*this, gpu, texture_cache, buffer_cache, query_cache),
211 async_shaders(emu_window_) { 184 async_shaders(emu_window_) {
212 unified_uniform_buffer.Create();
213 glNamedBufferStorage(unified_uniform_buffer.handle, TOTAL_CONST_BUFFER_BYTES, nullptr, 0);
214
215 if (device.UseAssemblyShaders()) {
216 glCreateBuffers(static_cast<GLsizei>(staging_cbufs.size()), staging_cbufs.data());
217 for (const GLuint cbuf : staging_cbufs) {
218 glNamedBufferStorage(cbuf, static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize),
219 nullptr, 0);
220 }
221 }
222 if (device.UseAsynchronousShaders()) { 185 if (device.UseAsynchronousShaders()) {
223 async_shaders.AllocateWorkers(); 186 async_shaders.AllocateWorkers();
224 } 187 }
225} 188}
226 189
227RasterizerOpenGL::~RasterizerOpenGL() { 190RasterizerOpenGL::~RasterizerOpenGL() = default;
228 if (device.UseAssemblyShaders()) {
229 glDeleteBuffers(static_cast<GLsizei>(staging_cbufs.size()), staging_cbufs.data());
230 }
231}
232 191
233void RasterizerOpenGL::SetupVertexFormat() { 192void RasterizerOpenGL::SyncVertexFormats() {
234 auto& flags = maxwell3d.dirty.flags; 193 auto& flags = maxwell3d.dirty.flags;
235 if (!flags[Dirty::VertexFormats]) { 194 if (!flags[Dirty::VertexFormats]) {
236 return; 195 return;
237 } 196 }
238 flags[Dirty::VertexFormats] = false; 197 flags[Dirty::VertexFormats] = false;
239 198
240 MICROPROFILE_SCOPE(OpenGL_VAO);
241
242 // Use the vertex array as-is, assumes that the data is formatted correctly for OpenGL. Enables 199 // Use the vertex array as-is, assumes that the data is formatted correctly for OpenGL. Enables
243 // the first 16 vertex attributes always, as we don't know which ones are actually used until 200 // the first 16 vertex attributes always, as we don't know which ones are actually used until
244 // shader time. Note, Tegra technically supports 32, but we're capping this to 16 for now to 201 // shader time. Note, Tegra technically supports 32, but we're capping this to 16 for now to
@@ -274,55 +231,7 @@ void RasterizerOpenGL::SetupVertexFormat() {
274 } 231 }
275} 232}
276 233
277void RasterizerOpenGL::SetupVertexBuffer() { 234void RasterizerOpenGL::SyncVertexInstances() {
278 auto& flags = maxwell3d.dirty.flags;
279 if (!flags[Dirty::VertexBuffers]) {
280 return;
281 }
282 flags[Dirty::VertexBuffers] = false;
283
284 MICROPROFILE_SCOPE(OpenGL_VB);
285
286 const bool use_unified_memory = device.HasVertexBufferUnifiedMemory();
287
288 // Upload all guest vertex arrays sequentially to our buffer
289 const auto& regs = maxwell3d.regs;
290 for (std::size_t index = 0; index < NUM_SUPPORTED_VERTEX_BINDINGS; ++index) {
291 if (!flags[Dirty::VertexBuffer0 + index]) {
292 continue;
293 }
294 flags[Dirty::VertexBuffer0 + index] = false;
295
296 const auto& vertex_array = regs.vertex_array[index];
297 if (!vertex_array.IsEnabled()) {
298 continue;
299 }
300
301 const GPUVAddr start = vertex_array.StartAddress();
302 const GPUVAddr end = regs.vertex_array_limit[index].LimitAddress();
303 ASSERT(end >= start);
304
305 const GLuint gl_index = static_cast<GLuint>(index);
306 const u64 size = end - start;
307 if (size == 0) {
308 glBindVertexBuffer(gl_index, 0, 0, vertex_array.stride);
309 if (use_unified_memory) {
310 glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, gl_index, 0, 0);
311 }
312 continue;
313 }
314 const auto info = buffer_cache.UploadMemory(start, size);
315 if (use_unified_memory) {
316 glBindVertexBuffer(gl_index, 0, 0, vertex_array.stride);
317 glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, gl_index,
318 info.address + info.offset, size);
319 } else {
320 glBindVertexBuffer(gl_index, info.handle, info.offset, vertex_array.stride);
321 }
322 }
323}
324
325void RasterizerOpenGL::SetupVertexInstances() {
326 auto& flags = maxwell3d.dirty.flags; 235 auto& flags = maxwell3d.dirty.flags;
327 if (!flags[Dirty::VertexInstances]) { 236 if (!flags[Dirty::VertexInstances]) {
328 return; 237 return;
@@ -343,17 +252,7 @@ void RasterizerOpenGL::SetupVertexInstances() {
343 } 252 }
344} 253}
345 254
346GLintptr RasterizerOpenGL::SetupIndexBuffer() { 255void RasterizerOpenGL::SetupShaders(bool is_indexed) {
347 MICROPROFILE_SCOPE(OpenGL_Index);
348 const auto& regs = maxwell3d.regs;
349 const std::size_t size = CalculateIndexBufferSize();
350 const auto info = buffer_cache.UploadMemory(regs.index_array.IndexStart(), size);
351 glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, info.handle);
352 return info.offset;
353}
354
355void RasterizerOpenGL::SetupShaders() {
356 MICROPROFILE_SCOPE(OpenGL_Shader);
357 u32 clip_distances = 0; 256 u32 clip_distances = 0;
358 257
359 std::array<Shader*, Maxwell::MaxShaderStage> shaders{}; 258 std::array<Shader*, Maxwell::MaxShaderStage> shaders{};
@@ -410,11 +309,19 @@ void RasterizerOpenGL::SetupShaders() {
410 const size_t stage = index == 0 ? 0 : index - 1; 309 const size_t stage = index == 0 ? 0 : index - 1;
411 shaders[stage] = shader; 310 shaders[stage] = shader;
412 311
413 SetupDrawConstBuffers(stage, shader);
414 SetupDrawGlobalMemory(stage, shader);
415 SetupDrawTextures(shader, stage); 312 SetupDrawTextures(shader, stage);
416 SetupDrawImages(shader, stage); 313 SetupDrawImages(shader, stage);
417 314
315 buffer_cache.SetEnabledUniformBuffers(stage, shader->GetEntries().enabled_uniform_buffers);
316
317 buffer_cache.UnbindGraphicsStorageBuffers(stage);
318 u32 ssbo_index = 0;
319 for (const auto& buffer : shader->GetEntries().global_memory_entries) {
320 buffer_cache.BindGraphicsStorageBuffer(stage, ssbo_index, buffer.cbuf_index,
321 buffer.cbuf_offset, buffer.is_written);
322 ++ssbo_index;
323 }
324
418 // Workaround for Intel drivers. 325 // Workaround for Intel drivers.
419 // When a clip distance is enabled but not set in the shader it crops parts of the screen 326 // When a clip distance is enabled but not set in the shader it crops parts of the screen
420 // (sometimes it's half the screen, sometimes three quarters). To avoid this, enable the 327 // (sometimes it's half the screen, sometimes three quarters). To avoid this, enable the
@@ -430,43 +337,26 @@ void RasterizerOpenGL::SetupShaders() {
430 SyncClipEnabled(clip_distances); 337 SyncClipEnabled(clip_distances);
431 maxwell3d.dirty.flags[Dirty::Shaders] = false; 338 maxwell3d.dirty.flags[Dirty::Shaders] = false;
432 339
340 buffer_cache.UpdateGraphicsBuffers(is_indexed);
341
433 const std::span indices_span(image_view_indices.data(), image_view_indices.size()); 342 const std::span indices_span(image_view_indices.data(), image_view_indices.size());
434 texture_cache.FillGraphicsImageViews(indices_span, image_view_ids); 343 texture_cache.FillGraphicsImageViews(indices_span, image_view_ids);
435 344
345 buffer_cache.BindHostGeometryBuffers(is_indexed);
346
436 size_t image_view_index = 0; 347 size_t image_view_index = 0;
437 size_t texture_index = 0; 348 size_t texture_index = 0;
438 size_t image_index = 0; 349 size_t image_index = 0;
439 for (size_t stage = 0; stage < Maxwell::MaxShaderStage; ++stage) { 350 for (size_t stage = 0; stage < Maxwell::MaxShaderStage; ++stage) {
440 const Shader* const shader = shaders[stage]; 351 const Shader* const shader = shaders[stage];
441 if (shader) { 352 if (!shader) {
442 const auto base = device.GetBaseBindings(stage);
443 BindTextures(shader->GetEntries(), base.sampler, base.image, image_view_index,
444 texture_index, image_index);
445 }
446 }
447}
448
449std::size_t RasterizerOpenGL::CalculateVertexArraysSize() const {
450 const auto& regs = maxwell3d.regs;
451
452 std::size_t size = 0;
453 for (u32 index = 0; index < Maxwell::NumVertexArrays; ++index) {
454 if (!regs.vertex_array[index].IsEnabled())
455 continue; 353 continue;
456 354 }
457 const GPUVAddr start = regs.vertex_array[index].StartAddress(); 355 buffer_cache.BindHostStageBuffers(stage);
458 const GPUVAddr end = regs.vertex_array_limit[index].LimitAddress(); 356 const auto& base = device.GetBaseBindings(stage);
459 357 BindTextures(shader->GetEntries(), base.sampler, base.image, image_view_index,
460 size += end - start; 358 texture_index, image_index);
461 ASSERT(end >= start);
462 } 359 }
463
464 return size;
465}
466
467std::size_t RasterizerOpenGL::CalculateIndexBufferSize() const {
468 return static_cast<std::size_t>(maxwell3d.regs.index_array.count) *
469 static_cast<std::size_t>(maxwell3d.regs.index_array.FormatSizeInBytes());
470} 360}
471 361
472void RasterizerOpenGL::LoadDiskResources(u64 title_id, const std::atomic_bool& stop_loading, 362void RasterizerOpenGL::LoadDiskResources(u64 title_id, const std::atomic_bool& stop_loading,
@@ -475,6 +365,7 @@ void RasterizerOpenGL::LoadDiskResources(u64 title_id, const std::atomic_bool& s
475} 365}
476 366
477void RasterizerOpenGL::Clear() { 367void RasterizerOpenGL::Clear() {
368 MICROPROFILE_SCOPE(OpenGL_Clears);
478 if (!maxwell3d.ShouldExecute()) { 369 if (!maxwell3d.ShouldExecute()) {
479 return; 370 return;
480 } 371 }
@@ -525,11 +416,9 @@ void RasterizerOpenGL::Clear() {
525 } 416 }
526 UNIMPLEMENTED_IF(regs.clear_flags.viewport); 417 UNIMPLEMENTED_IF(regs.clear_flags.viewport);
527 418
528 { 419 std::scoped_lock lock{texture_cache.mutex};
529 auto lock = texture_cache.AcquireLock(); 420 texture_cache.UpdateRenderTargets(true);
530 texture_cache.UpdateRenderTargets(true); 421 state_tracker.BindFramebuffer(texture_cache.GetFramebuffer()->Handle());
531 state_tracker.BindFramebuffer(texture_cache.GetFramebuffer()->Handle());
532 }
533 422
534 if (use_color) { 423 if (use_color) {
535 glClearBufferfv(GL_COLOR, regs.clear_buffers.RT, regs.clear_color); 424 glClearBufferfv(GL_COLOR, regs.clear_buffers.RT, regs.clear_color);
@@ -541,7 +430,6 @@ void RasterizerOpenGL::Clear() {
541 } else if (use_stencil) { 430 } else if (use_stencil) {
542 glClearBufferiv(GL_STENCIL, 0, &regs.clear_stencil); 431 glClearBufferiv(GL_STENCIL, 0, &regs.clear_stencil);
543 } 432 }
544
545 ++num_queued_commands; 433 ++num_queued_commands;
546} 434}
547 435
@@ -550,75 +438,12 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
550 438
551 query_cache.UpdateCounters(); 439 query_cache.UpdateCounters();
552 440
553 SyncViewport(); 441 SyncState();
554 SyncRasterizeEnable();
555 SyncPolygonModes();
556 SyncColorMask();
557 SyncFragmentColorClampState();
558 SyncMultiSampleState();
559 SyncDepthTestState();
560 SyncDepthClamp();
561 SyncStencilTestState();
562 SyncBlendState();
563 SyncLogicOpState();
564 SyncCullMode();
565 SyncPrimitiveRestart();
566 SyncScissorTest();
567 SyncPointState();
568 SyncLineState();
569 SyncPolygonOffset();
570 SyncAlphaTest();
571 SyncFramebufferSRGB();
572
573 buffer_cache.Acquire();
574 current_cbuf = 0;
575
576 std::size_t buffer_size = CalculateVertexArraysSize();
577
578 // Add space for index buffer
579 if (is_indexed) {
580 buffer_size = Common::AlignUp(buffer_size, 4) + CalculateIndexBufferSize();
581 }
582
583 // Uniform space for the 5 shader stages
584 buffer_size =
585 Common::AlignUp<std::size_t>(buffer_size, 4) +
586 (sizeof(MaxwellUniformData) + device.GetUniformBufferAlignment()) * Maxwell::MaxShaderStage;
587
588 // Add space for at least 18 constant buffers
589 buffer_size += Maxwell::MaxConstBuffers *
590 (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment());
591
592 // Prepare the vertex array.
593 buffer_cache.Map(buffer_size);
594
595 // Prepare vertex array format.
596 SetupVertexFormat();
597
598 // Upload vertex and index data.
599 SetupVertexBuffer();
600 SetupVertexInstances();
601 GLintptr index_buffer_offset = 0;
602 if (is_indexed) {
603 index_buffer_offset = SetupIndexBuffer();
604 }
605
606 // Setup emulation uniform buffer.
607 if (!device.UseAssemblyShaders()) {
608 MaxwellUniformData ubo;
609 ubo.SetFromRegs(maxwell3d);
610 const auto info =
611 buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment());
612 glBindBufferRange(GL_UNIFORM_BUFFER, EmulationUniformBlockBinding, info.handle, info.offset,
613 static_cast<GLsizeiptr>(sizeof(ubo)));
614 }
615 442
616 // Setup shaders and their used resources. 443 // Setup shaders and their used resources.
617 auto lock = texture_cache.AcquireLock(); 444 std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex};
618 SetupShaders(); 445 SetupShaders(is_indexed);
619 446
620 // Signal the buffer cache that we are not going to upload more things.
621 buffer_cache.Unmap();
622 texture_cache.UpdateRenderTargets(false); 447 texture_cache.UpdateRenderTargets(false);
623 state_tracker.BindFramebuffer(texture_cache.GetFramebuffer()->Handle()); 448 state_tracker.BindFramebuffer(texture_cache.GetFramebuffer()->Handle());
624 program_manager.BindGraphicsPipeline(); 449 program_manager.BindGraphicsPipeline();
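Draw() now takes both cache locks in a single statement, std::scoped_lock{buffer_cache.mutex, texture_cache.mutex}, instead of the old per-cache AcquireLock() helpers. A minimal, self-contained sketch of why the combined form matters: std::scoped_lock acquires multiple mutexes through a deadlock-avoidance algorithm, so two threads that name the mutexes in different orders still cannot deadlock. The Cache type and function names below are illustrative only.

#include <mutex>
#include <thread>

struct Cache {
    std::mutex mutex;
};

Cache buffer_cache;
Cache texture_cache;

void DrawLikePath() {
    // Locks both mutexes atomically; the argument order does not matter.
    std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex};
    // ... update buffers and textures for a draw ...
}

void FlushLikePath() {
    // Same pair, opposite order: still safe with std::scoped_lock.
    std::scoped_lock lock{texture_cache.mutex, buffer_cache.mutex};
    // ... flush both caches ...
}

int main() {
    std::thread draw_thread{DrawLikePath};
    std::thread flush_thread{FlushLikePath};
    draw_thread.join();
    flush_thread.join();
    return 0;
}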
@@ -632,7 +457,7 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
632 if (is_indexed) { 457 if (is_indexed) {
633 const GLint base_vertex = static_cast<GLint>(maxwell3d.regs.vb_element_base); 458 const GLint base_vertex = static_cast<GLint>(maxwell3d.regs.vb_element_base);
634 const GLsizei num_vertices = static_cast<GLsizei>(maxwell3d.regs.index_array.count); 459 const GLsizei num_vertices = static_cast<GLsizei>(maxwell3d.regs.index_array.count);
635 const GLvoid* offset = reinterpret_cast<const GLvoid*>(index_buffer_offset); 460 const GLvoid* const offset = buffer_cache_runtime.IndexOffset();
636 const GLenum format = MaxwellToGL::IndexFormat(maxwell3d.regs.index_array.format); 461 const GLenum format = MaxwellToGL::IndexFormat(maxwell3d.regs.index_array.format);
637 if (num_instances == 1 && base_instance == 0 && base_vertex == 0) { 462 if (num_instances == 1 && base_instance == 0 && base_vertex == 0) {
638 glDrawElements(primitive_mode, num_vertices, format, offset); 463 glDrawElements(primitive_mode, num_vertices, format, offset);
@@ -672,22 +497,22 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
672} 497}
673 498
674void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) { 499void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
675 buffer_cache.Acquire();
676 current_cbuf = 0;
677
678 Shader* const kernel = shader_cache.GetComputeKernel(code_addr); 500 Shader* const kernel = shader_cache.GetComputeKernel(code_addr);
679 501
680 auto lock = texture_cache.AcquireLock(); 502 std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex};
681 BindComputeTextures(kernel); 503 BindComputeTextures(kernel);
682 504
683 const size_t buffer_size = Tegra::Engines::KeplerCompute::NumConstBuffers * 505 const auto& entries = kernel->GetEntries();
684 (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment()); 506 buffer_cache.SetEnabledComputeUniformBuffers(entries.enabled_uniform_buffers);
685 buffer_cache.Map(buffer_size); 507 buffer_cache.UnbindComputeStorageBuffers();
686 508 u32 ssbo_index = 0;
687 SetupComputeConstBuffers(kernel); 509 for (const auto& buffer : entries.global_memory_entries) {
688 SetupComputeGlobalMemory(kernel); 510 buffer_cache.BindComputeStorageBuffer(ssbo_index, buffer.cbuf_index, buffer.cbuf_offset,
689 511 buffer.is_written);
690 buffer_cache.Unmap(); 512 ++ssbo_index;
513 }
514 buffer_cache.UpdateComputeBuffers();
515 buffer_cache.BindHostComputeBuffers();
691 516
692 const auto& launch_desc = kepler_compute.launch_description; 517 const auto& launch_desc = kepler_compute.launch_description;
693 glDispatchCompute(launch_desc.grid_dim_x, launch_desc.grid_dim_y, launch_desc.grid_dim_z); 518 glDispatchCompute(launch_desc.grid_dim_x, launch_desc.grid_dim_y, launch_desc.grid_dim_z);
@@ -703,6 +528,12 @@ void RasterizerOpenGL::Query(GPUVAddr gpu_addr, VideoCore::QueryType type,
703 query_cache.Query(gpu_addr, type, timestamp); 528 query_cache.Query(gpu_addr, type, timestamp);
704} 529}
705 530
531void RasterizerOpenGL::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr,
532 u32 size) {
533 std::scoped_lock lock{buffer_cache.mutex};
534 buffer_cache.BindGraphicsUniformBuffer(stage, index, gpu_addr, size);
535}
536
706void RasterizerOpenGL::FlushAll() {} 537void RasterizerOpenGL::FlushAll() {}
707 538
708void RasterizerOpenGL::FlushRegion(VAddr addr, u64 size) { 539void RasterizerOpenGL::FlushRegion(VAddr addr, u64 size) {
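BindGraphicsUniformBuffer is a new RasterizerInterface entry point: the command processor can now tell the rasterizer which constant buffer the guest bound for a stage, rather than the rasterizer re-deriving every binding at draw time. The call site itself is not shown in this section (the diffstat lists maxwell_3d.cpp and rasterizer_interface.h), so the sketch below is only a guess at the caller's shape; RasterizerInterface here is reduced to the single virtual added by this change.

#include <cstddef>
#include <cstdint>

using GPUVAddr = std::uint64_t;

// Reduced interface: just the entry point added in this change.
class RasterizerInterface {
public:
    virtual ~RasterizerInterface() = default;
    virtual void BindGraphicsUniformBuffer(std::size_t stage, std::uint32_t index,
                                           GPUVAddr gpu_addr, std::uint32_t size) = 0;
};

// Hypothetical engine-side handler: when the guest selects constant buffer
// `index` for shader `stage`, forward its GPU address and size to the rasterizer.
void OnGuestConstBufferBound(RasterizerInterface& rasterizer, std::size_t stage,
                             std::uint32_t index, GPUVAddr gpu_addr, std::uint32_t size) {
    rasterizer.BindGraphicsUniformBuffer(stage, index, gpu_addr, size);
}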
@@ -711,19 +542,23 @@ void RasterizerOpenGL::FlushRegion(VAddr addr, u64 size) {
711 return; 542 return;
712 } 543 }
713 { 544 {
714 auto lock = texture_cache.AcquireLock(); 545 std::scoped_lock lock{texture_cache.mutex};
715 texture_cache.DownloadMemory(addr, size); 546 texture_cache.DownloadMemory(addr, size);
716 } 547 }
717 buffer_cache.FlushRegion(addr, size); 548 {
549 std::scoped_lock lock{buffer_cache.mutex};
550 buffer_cache.DownloadMemory(addr, size);
551 }
718 query_cache.FlushRegion(addr, size); 552 query_cache.FlushRegion(addr, size);
719} 553}
720 554
721bool RasterizerOpenGL::MustFlushRegion(VAddr addr, u64 size) { 555bool RasterizerOpenGL::MustFlushRegion(VAddr addr, u64 size) {
556 std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex};
722 if (!Settings::IsGPULevelHigh()) { 557 if (!Settings::IsGPULevelHigh()) {
723 return buffer_cache.MustFlushRegion(addr, size); 558 return buffer_cache.IsRegionGpuModified(addr, size);
724 } 559 }
725 return texture_cache.IsRegionGpuModified(addr, size) || 560 return texture_cache.IsRegionGpuModified(addr, size) ||
726 buffer_cache.MustFlushRegion(addr, size); 561 buffer_cache.IsRegionGpuModified(addr, size);
727} 562}
728 563
729void RasterizerOpenGL::InvalidateRegion(VAddr addr, u64 size) { 564void RasterizerOpenGL::InvalidateRegion(VAddr addr, u64 size) {
@@ -732,11 +567,14 @@ void RasterizerOpenGL::InvalidateRegion(VAddr addr, u64 size) {
732 return; 567 return;
733 } 568 }
734 { 569 {
735 auto lock = texture_cache.AcquireLock(); 570 std::scoped_lock lock{texture_cache.mutex};
736 texture_cache.WriteMemory(addr, size); 571 texture_cache.WriteMemory(addr, size);
737 } 572 }
573 {
574 std::scoped_lock lock{buffer_cache.mutex};
575 buffer_cache.WriteMemory(addr, size);
576 }
738 shader_cache.InvalidateRegion(addr, size); 577 shader_cache.InvalidateRegion(addr, size);
739 buffer_cache.InvalidateRegion(addr, size);
740 query_cache.InvalidateRegion(addr, size); 578 query_cache.InvalidateRegion(addr, size);
741} 579}
742 580
@@ -745,26 +583,35 @@ void RasterizerOpenGL::OnCPUWrite(VAddr addr, u64 size) {
745 if (addr == 0 || size == 0) { 583 if (addr == 0 || size == 0) {
746 return; 584 return;
747 } 585 }
586 shader_cache.OnCPUWrite(addr, size);
748 { 587 {
749 auto lock = texture_cache.AcquireLock(); 588 std::scoped_lock lock{texture_cache.mutex};
750 texture_cache.WriteMemory(addr, size); 589 texture_cache.WriteMemory(addr, size);
751 } 590 }
752 shader_cache.OnCPUWrite(addr, size); 591 {
753 buffer_cache.OnCPUWrite(addr, size); 592 std::scoped_lock lock{buffer_cache.mutex};
593 buffer_cache.CachedWriteMemory(addr, size);
594 }
754} 595}
755 596
756void RasterizerOpenGL::SyncGuestHost() { 597void RasterizerOpenGL::SyncGuestHost() {
757 MICROPROFILE_SCOPE(OpenGL_CacheManagement); 598 MICROPROFILE_SCOPE(OpenGL_CacheManagement);
758 buffer_cache.SyncGuestHost();
759 shader_cache.SyncGuestHost(); 599 shader_cache.SyncGuestHost();
600 {
601 std::scoped_lock lock{buffer_cache.mutex};
602 buffer_cache.FlushCachedWrites();
603 }
760} 604}
761 605
762void RasterizerOpenGL::UnmapMemory(VAddr addr, u64 size) { 606void RasterizerOpenGL::UnmapMemory(VAddr addr, u64 size) {
763 { 607 {
764 auto lock = texture_cache.AcquireLock(); 608 std::scoped_lock lock{texture_cache.mutex};
765 texture_cache.UnmapMemory(addr, size); 609 texture_cache.UnmapMemory(addr, size);
766 } 610 }
767 buffer_cache.OnCPUWrite(addr, size); 611 {
612 std::scoped_lock lock{buffer_cache.mutex};
613 buffer_cache.WriteMemory(addr, size);
614 }
768 shader_cache.OnCPUWrite(addr, size); 615 shader_cache.OnCPUWrite(addr, size);
769} 616}
770 617
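These hunks split guest memory writes into two flavours: InvalidateRegion and UnmapMemory call WriteMemory immediately, OnCPUWrite only records the range through CachedWriteMemory, and SyncGuestHost later applies the backlog with FlushCachedWrites. The sketch below models that deferral with an invented TinyWriteTracker; the real cache presumably tracks intervals and pages rather than a flat vector, so treat this as an outline of the call contract only.

#include <cstdint>
#include <utility>
#include <vector>

using VAddr = std::uint64_t;

class TinyWriteTracker {
public:
    // Eager path (InvalidateRegion / UnmapMemory style): act on the write now.
    void WriteMemory(VAddr addr, std::uint64_t size) {
        MarkDirty(addr, size);
    }

    // Deferred path (OnCPUWrite style): only remember the range.
    void CachedWriteMemory(VAddr addr, std::uint64_t size) {
        cached_writes.emplace_back(addr, size);
    }

    // Applied later (SyncGuestHost style): replay everything that was deferred.
    void FlushCachedWrites() {
        for (const auto& [addr, size] : cached_writes) {
            MarkDirty(addr, size);
        }
        cached_writes.clear();
    }

private:
    void MarkDirty(VAddr addr, std::uint64_t size) {
        dirty_regions.emplace_back(addr, size); // real code walks interval sets here
    }

    std::vector<std::pair<VAddr, std::uint64_t>> cached_writes;
    std::vector<std::pair<VAddr, std::uint64_t>> dirty_regions;
};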
@@ -799,14 +646,7 @@ void RasterizerOpenGL::FlushAndInvalidateRegion(VAddr addr, u64 size) {
799} 646}
800 647
801void RasterizerOpenGL::WaitForIdle() { 648void RasterizerOpenGL::WaitForIdle() {
802 // Place a barrier on everything that is not framebuffer related. 649 glMemoryBarrier(GL_ALL_BARRIER_BITS);
803 // This is related to another flag that is not currently implemented.
804 glMemoryBarrier(GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT | GL_ELEMENT_ARRAY_BARRIER_BIT |
805 GL_UNIFORM_BARRIER_BIT | GL_TEXTURE_FETCH_BARRIER_BIT |
806 GL_SHADER_IMAGE_ACCESS_BARRIER_BIT | GL_COMMAND_BARRIER_BIT |
807 GL_PIXEL_BUFFER_BARRIER_BIT | GL_TEXTURE_UPDATE_BARRIER_BIT |
808 GL_BUFFER_UPDATE_BARRIER_BIT | GL_TRANSFORM_FEEDBACK_BARRIER_BIT |
809 GL_SHADER_STORAGE_BARRIER_BIT | GL_QUERY_BUFFER_BARRIER_BIT);
810} 650}
811 651
812void RasterizerOpenGL::FragmentBarrier() { 652void RasterizerOpenGL::FragmentBarrier() {
@@ -831,18 +671,21 @@ void RasterizerOpenGL::TickFrame() {
831 num_queued_commands = 0; 671 num_queued_commands = 0;
832 672
833 fence_manager.TickFrame(); 673 fence_manager.TickFrame();
834 buffer_cache.TickFrame();
835 { 674 {
836 auto lock = texture_cache.AcquireLock(); 675 std::scoped_lock lock{texture_cache.mutex};
837 texture_cache.TickFrame(); 676 texture_cache.TickFrame();
838 } 677 }
678 {
679 std::scoped_lock lock{buffer_cache.mutex};
680 buffer_cache.TickFrame();
681 }
839} 682}
840 683
841bool RasterizerOpenGL::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surface& src, 684bool RasterizerOpenGL::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surface& src,
842 const Tegra::Engines::Fermi2D::Surface& dst, 685 const Tegra::Engines::Fermi2D::Surface& dst,
843 const Tegra::Engines::Fermi2D::Config& copy_config) { 686 const Tegra::Engines::Fermi2D::Config& copy_config) {
844 MICROPROFILE_SCOPE(OpenGL_Blits); 687 MICROPROFILE_SCOPE(OpenGL_Blits);
845 auto lock = texture_cache.AcquireLock(); 688 std::scoped_lock lock{texture_cache.mutex};
846 texture_cache.BlitImage(dst, src, copy_config); 689 texture_cache.BlitImage(dst, src, copy_config);
847 return true; 690 return true;
848} 691}
@@ -854,7 +697,7 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config,
854 } 697 }
855 MICROPROFILE_SCOPE(OpenGL_CacheManagement); 698 MICROPROFILE_SCOPE(OpenGL_CacheManagement);
856 699
857 auto lock = texture_cache.AcquireLock(); 700 std::scoped_lock lock{texture_cache.mutex};
858 ImageView* const image_view{texture_cache.TryFindFramebufferImageView(framebuffer_addr)}; 701 ImageView* const image_view{texture_cache.TryFindFramebufferImageView(framebuffer_addr)};
859 if (!image_view) { 702 if (!image_view) {
860 return false; 703 return false;
@@ -921,166 +764,6 @@ void RasterizerOpenGL::BindTextures(const ShaderEntries& entries, GLuint base_te
921 } 764 }
922} 765}
923 766
924void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, Shader* shader) {
925 static constexpr std::array PARAMETER_LUT{
926 GL_VERTEX_PROGRAM_PARAMETER_BUFFER_NV, GL_TESS_CONTROL_PROGRAM_PARAMETER_BUFFER_NV,
927 GL_TESS_EVALUATION_PROGRAM_PARAMETER_BUFFER_NV, GL_GEOMETRY_PROGRAM_PARAMETER_BUFFER_NV,
928 GL_FRAGMENT_PROGRAM_PARAMETER_BUFFER_NV,
929 };
930 MICROPROFILE_SCOPE(OpenGL_UBO);
931 const auto& stages = maxwell3d.state.shader_stages;
932 const auto& shader_stage = stages[stage_index];
933 const auto& entries = shader->GetEntries();
934 const bool use_unified = entries.use_unified_uniforms;
935 const std::size_t base_unified_offset = stage_index * NUM_CONST_BUFFERS_BYTES_PER_STAGE;
936
937 const auto base_bindings = device.GetBaseBindings(stage_index);
938 u32 binding = device.UseAssemblyShaders() ? 0 : base_bindings.uniform_buffer;
939 for (const auto& entry : entries.const_buffers) {
940 const u32 index = entry.GetIndex();
941 const auto& buffer = shader_stage.const_buffers[index];
942 SetupConstBuffer(PARAMETER_LUT[stage_index], binding, buffer, entry, use_unified,
943 base_unified_offset + index * Maxwell::MaxConstBufferSize);
944 ++binding;
945 }
946 if (use_unified) {
947 const u32 index = static_cast<u32>(base_bindings.shader_storage_buffer +
948 entries.global_memory_entries.size());
949 glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, unified_uniform_buffer.handle,
950 base_unified_offset, NUM_CONST_BUFFERS_BYTES_PER_STAGE);
951 }
952}
953
954void RasterizerOpenGL::SetupComputeConstBuffers(Shader* kernel) {
955 MICROPROFILE_SCOPE(OpenGL_UBO);
956 const auto& launch_desc = kepler_compute.launch_description;
957 const auto& entries = kernel->GetEntries();
958 const bool use_unified = entries.use_unified_uniforms;
959
960 u32 binding = 0;
961 for (const auto& entry : entries.const_buffers) {
962 const auto& config = launch_desc.const_buffer_config[entry.GetIndex()];
963 const std::bitset<8> mask = launch_desc.const_buffer_enable_mask.Value();
964 Tegra::Engines::ConstBufferInfo buffer;
965 buffer.address = config.Address();
966 buffer.size = config.size;
967 buffer.enabled = mask[entry.GetIndex()];
968 SetupConstBuffer(GL_COMPUTE_PROGRAM_PARAMETER_BUFFER_NV, binding, buffer, entry,
969 use_unified, entry.GetIndex() * Maxwell::MaxConstBufferSize);
970 ++binding;
971 }
972 if (use_unified) {
973 const GLuint index = static_cast<GLuint>(entries.global_memory_entries.size());
974 glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, unified_uniform_buffer.handle, 0,
975 NUM_CONST_BUFFERS_BYTES_PER_STAGE);
976 }
977}
978
979void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding,
980 const Tegra::Engines::ConstBufferInfo& buffer,
981 const ConstBufferEntry& entry, bool use_unified,
982 std::size_t unified_offset) {
983 if (!buffer.enabled) {
984 // Set values to zero to unbind buffers
985 if (device.UseAssemblyShaders()) {
986 glBindBufferRangeNV(stage, entry.GetIndex(), 0, 0, 0);
987 } else {
988 glBindBufferRange(GL_UNIFORM_BUFFER, binding, 0, 0, sizeof(float));
989 }
990 return;
991 }
992
993 // Align the actual size so it ends up being a multiple of vec4 to meet the OpenGL std140
994 // UBO alignment requirements.
995 const std::size_t size = Common::AlignUp(GetConstBufferSize(buffer, entry), sizeof(GLvec4));
996
997 const bool fast_upload = !use_unified && device.HasFastBufferSubData();
998
999 const std::size_t alignment = use_unified ? 4 : device.GetUniformBufferAlignment();
1000 const GPUVAddr gpu_addr = buffer.address;
1001 auto info = buffer_cache.UploadMemory(gpu_addr, size, alignment, false, fast_upload);
1002
1003 if (device.UseAssemblyShaders()) {
1004 UNIMPLEMENTED_IF(use_unified);
1005 if (info.offset != 0) {
1006 const GLuint staging_cbuf = staging_cbufs[current_cbuf++];
1007 glCopyNamedBufferSubData(info.handle, staging_cbuf, info.offset, 0, size);
1008 info.handle = staging_cbuf;
1009 info.offset = 0;
1010 }
1011 glBindBufferRangeNV(stage, binding, info.handle, info.offset, size);
1012 return;
1013 }
1014
1015 if (use_unified) {
1016 glCopyNamedBufferSubData(info.handle, unified_uniform_buffer.handle, info.offset,
1017 unified_offset, size);
1018 } else {
1019 glBindBufferRange(GL_UNIFORM_BUFFER, binding, info.handle, info.offset, size);
1020 }
1021}
1022
1023void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, Shader* shader) {
1024 static constexpr std::array TARGET_LUT = {
1025 GL_VERTEX_PROGRAM_NV, GL_TESS_CONTROL_PROGRAM_NV, GL_TESS_EVALUATION_PROGRAM_NV,
1026 GL_GEOMETRY_PROGRAM_NV, GL_FRAGMENT_PROGRAM_NV,
1027 };
1028 const auto& cbufs{maxwell3d.state.shader_stages[stage_index]};
1029 const auto& entries{shader->GetEntries().global_memory_entries};
1030
1031 std::array<BindlessSSBO, 32> ssbos;
1032 ASSERT(entries.size() < ssbos.size());
1033
1034 const bool assembly_shaders = device.UseAssemblyShaders();
1035 u32 binding = assembly_shaders ? 0 : device.GetBaseBindings(stage_index).shader_storage_buffer;
1036 for (const auto& entry : entries) {
1037 const GPUVAddr addr{cbufs.const_buffers[entry.cbuf_index].address + entry.cbuf_offset};
1038 const GPUVAddr gpu_addr{gpu_memory.Read<u64>(addr)};
1039 const u32 size{gpu_memory.Read<u32>(addr + 8)};
1040 SetupGlobalMemory(binding, entry, gpu_addr, size, &ssbos[binding]);
1041 ++binding;
1042 }
1043 if (assembly_shaders) {
1044 UpdateBindlessSSBOs(TARGET_LUT[stage_index], ssbos.data(), entries.size());
1045 }
1046}
1047
1048void RasterizerOpenGL::SetupComputeGlobalMemory(Shader* kernel) {
1049 const auto& cbufs{kepler_compute.launch_description.const_buffer_config};
1050 const auto& entries{kernel->GetEntries().global_memory_entries};
1051
1052 std::array<BindlessSSBO, 32> ssbos;
1053 ASSERT(entries.size() < ssbos.size());
1054
1055 u32 binding = 0;
1056 for (const auto& entry : entries) {
1057 const GPUVAddr addr{cbufs[entry.cbuf_index].Address() + entry.cbuf_offset};
1058 const GPUVAddr gpu_addr{gpu_memory.Read<u64>(addr)};
1059 const u32 size{gpu_memory.Read<u32>(addr + 8)};
1060 SetupGlobalMemory(binding, entry, gpu_addr, size, &ssbos[binding]);
1061 ++binding;
1062 }
1063 if (device.UseAssemblyShaders()) {
1064 UpdateBindlessSSBOs(GL_COMPUTE_PROGRAM_NV, ssbos.data(), ssbos.size());
1065 }
1066}
1067
1068void RasterizerOpenGL::SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry,
1069 GPUVAddr gpu_addr, size_t size, BindlessSSBO* ssbo) {
1070 const size_t alignment{device.GetShaderStorageBufferAlignment()};
1071 const auto info = buffer_cache.UploadMemory(gpu_addr, size, alignment, entry.is_written);
1072 if (device.UseAssemblyShaders()) {
1073 *ssbo = BindlessSSBO{
1074 .address = static_cast<GLuint64EXT>(info.address + info.offset),
1075 .length = static_cast<GLsizei>(size),
1076 .padding = 0,
1077 };
1078 } else {
1079 glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, info.handle, info.offset,
1080 static_cast<GLsizeiptr>(size));
1081 }
1082}
1083
1084void RasterizerOpenGL::SetupDrawTextures(const Shader* shader, size_t stage_index) { 767void RasterizerOpenGL::SetupDrawTextures(const Shader* shader, size_t stage_index) {
1085 const bool via_header_index = 768 const bool via_header_index =
1086 maxwell3d.regs.sampler_index == Maxwell::SamplerIndex::ViaHeaderIndex; 769 maxwell3d.regs.sampler_index == Maxwell::SamplerIndex::ViaHeaderIndex;
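The deleted SetupDrawGlobalMemory and SetupComputeGlobalMemory spell out how a global-memory entry was resolved: the constant buffer chosen by cbuf_index stores, at cbuf_offset, a 64-bit GPU address followed by a 32-bit size, and the result was bound either as a bindless NV SSBO or with glBindBufferRange. The sketch below condenses just that addressing step; MemoryManager is a stand-in with dummy reads, and the shared buffer cache now presumably performs the equivalent lookup internally when BindGraphicsStorageBuffer or BindComputeStorageBuffer hands it (cbuf_index, cbuf_offset, is_written).

#include <cstdint>

using GPUVAddr = std::uint64_t;

// Stand-in for the GPU memory manager the deleted code reads from.
struct MemoryManager {
    std::uint64_t Read64(GPUVAddr) const { return 0; } // dummy
    std::uint32_t Read32(GPUVAddr) const { return 0; } // dummy
};

struct SsboDescriptor {
    GPUVAddr gpu_addr;
    std::uint32_t size;
};

// cbuf_base is the address of the constant buffer selected by entry.cbuf_index;
// cbuf_offset points at a {u64 address, u32 size} pair describing the SSBO.
SsboDescriptor ResolveGlobalMemory(const MemoryManager& gpu_memory, GPUVAddr cbuf_base,
                                   std::uint32_t cbuf_offset) {
    const GPUVAddr pointer_addr = cbuf_base + cbuf_offset;
    return SsboDescriptor{
        .gpu_addr = gpu_memory.Read64(pointer_addr),
        .size = gpu_memory.Read32(pointer_addr + 8),
    };
}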
@@ -1128,6 +811,30 @@ void RasterizerOpenGL::SetupComputeImages(const Shader* shader) {
1128 } 811 }
1129} 812}
1130 813
814void RasterizerOpenGL::SyncState() {
815 SyncViewport();
816 SyncRasterizeEnable();
817 SyncPolygonModes();
818 SyncColorMask();
819 SyncFragmentColorClampState();
820 SyncMultiSampleState();
821 SyncDepthTestState();
822 SyncDepthClamp();
823 SyncStencilTestState();
824 SyncBlendState();
825 SyncLogicOpState();
826 SyncCullMode();
827 SyncPrimitiveRestart();
828 SyncScissorTest();
829 SyncPointState();
830 SyncLineState();
831 SyncPolygonOffset();
832 SyncAlphaTest();
833 SyncFramebufferSRGB();
834 SyncVertexFormats();
835 SyncVertexInstances();
836}
837
1131void RasterizerOpenGL::SyncViewport() { 838void RasterizerOpenGL::SyncViewport() {
1132 auto& flags = maxwell3d.dirty.flags; 839 auto& flags = maxwell3d.dirty.flags;
1133 const auto& regs = maxwell3d.regs; 840 const auto& regs = maxwell3d.regs;
@@ -1163,9 +870,11 @@ void RasterizerOpenGL::SyncViewport() {
1163 if (regs.screen_y_control.y_negate != 0) { 870 if (regs.screen_y_control.y_negate != 0) {
1164 flip_y = !flip_y; 871 flip_y = !flip_y;
1165 } 872 }
1166 glClipControl(flip_y ? GL_UPPER_LEFT : GL_LOWER_LEFT, 873 const bool is_zero_to_one = regs.depth_mode == Maxwell::DepthMode::ZeroToOne;
1167 regs.depth_mode == Maxwell::DepthMode::ZeroToOne ? GL_ZERO_TO_ONE 874 const GLenum origin = flip_y ? GL_UPPER_LEFT : GL_LOWER_LEFT;
1168 : GL_NEGATIVE_ONE_TO_ONE); 875 const GLenum depth = is_zero_to_one ? GL_ZERO_TO_ONE : GL_NEGATIVE_ONE_TO_ONE;
876 state_tracker.ClipControl(origin, depth);
877 state_tracker.SetYNegate(regs.screen_y_control.y_negate != 0);
1169 } 878 }
1170 879
1171 if (dirty_viewport) { 880 if (dirty_viewport) {
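SyncViewport now funnels glClipControl through the state tracker (state_tracker.ClipControl(origin, depth)) instead of calling GL directly, the usual shape for a redundant-state filter. The sketch below shows that idea with an invented ClipControlTracker; it is not the actual StateTracker from gl_state_tracker.h, only the caching behaviour one would expect such a wrapper to have.

#include <glad/glad.h>

// Invented redundant-state filter in the spirit of StateTracker::ClipControl.
class ClipControlTracker {
public:
    void ClipControl(GLenum origin, GLenum depth) {
        if (origin == current_origin && depth == current_depth) {
            return; // nothing changed, skip the driver call
        }
        current_origin = origin;
        current_depth = depth;
        glClipControl(origin, depth);
    }

private:
    // OpenGL defaults: lower-left origin, [-1, 1] clip depth.
    GLenum current_origin = GL_LOWER_LEFT;
    GLenum current_depth = GL_NEGATIVE_ONE_TO_ONE;
};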
@@ -1649,36 +1358,13 @@ void RasterizerOpenGL::BeginTransformFeedback(GLenum primitive_mode) {
1649 if (regs.tfb_enabled == 0) { 1358 if (regs.tfb_enabled == 0) {
1650 return; 1359 return;
1651 } 1360 }
1652
1653 if (device.UseAssemblyShaders()) { 1361 if (device.UseAssemblyShaders()) {
1654 SyncTransformFeedback(); 1362 SyncTransformFeedback();
1655 } 1363 }
1656
1657 UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationControl) || 1364 UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationControl) ||
1658 regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationEval) || 1365 regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationEval) ||
1659 regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::Geometry)); 1366 regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::Geometry));
1660 1367 UNIMPLEMENTED_IF(primitive_mode != GL_POINTS);
1661 for (std::size_t index = 0; index < Maxwell::NumTransformFeedbackBuffers; ++index) {
1662 const auto& binding = regs.tfb_bindings[index];
1663 if (!binding.buffer_enable) {
1664 if (enabled_transform_feedback_buffers[index]) {
1665 glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER, static_cast<GLuint>(index), 0, 0,
1666 0);
1667 }
1668 enabled_transform_feedback_buffers[index] = false;
1669 continue;
1670 }
1671 enabled_transform_feedback_buffers[index] = true;
1672
1673 auto& tfb_buffer = transform_feedback_buffers[index];
1674 tfb_buffer.Create();
1675
1676 const GLuint handle = tfb_buffer.handle;
1677 const std::size_t size = binding.buffer_size;
1678 glNamedBufferData(handle, static_cast<GLsizeiptr>(size), nullptr, GL_STREAM_COPY);
1679 glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER, static_cast<GLuint>(index), handle, 0,
1680 static_cast<GLsizeiptr>(size));
1681 }
1682 1368
1683 // We may have to call BeginTransformFeedbackNV here since they seem to call different 1369 // We may have to call BeginTransformFeedbackNV here since they seem to call different
1684 // implementations on Nvidia's driver (the pointer is different) but we are using 1370 // implementations on Nvidia's driver (the pointer is different) but we are using
@@ -1692,23 +1378,7 @@ void RasterizerOpenGL::EndTransformFeedback() {
1692 if (regs.tfb_enabled == 0) { 1378 if (regs.tfb_enabled == 0) {
1693 return; 1379 return;
1694 } 1380 }
1695
1696 glEndTransformFeedback(); 1381 glEndTransformFeedback();
1697
1698 for (std::size_t index = 0; index < Maxwell::NumTransformFeedbackBuffers; ++index) {
1699 const auto& binding = regs.tfb_bindings[index];
1700 if (!binding.buffer_enable) {
1701 continue;
1702 }
1703 UNIMPLEMENTED_IF(binding.buffer_offset != 0);
1704
1705 const GLuint handle = transform_feedback_buffers[index].handle;
1706 const GPUVAddr gpu_addr = binding.Address();
1707 const std::size_t size = binding.buffer_size;
1708 const auto info = buffer_cache.UploadMemory(gpu_addr, size, 4, true);
1709 glCopyNamedBufferSubData(handle, info.handle, 0, info.offset,
1710 static_cast<GLsizeiptr>(size));
1711 }
1712} 1382}
1713 1383
1714} // namespace OpenGL 1384} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index 82e03e677..31d69a94c 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -30,7 +30,6 @@
30#include "video_core/renderer_opengl/gl_shader_decompiler.h" 30#include "video_core/renderer_opengl/gl_shader_decompiler.h"
31#include "video_core/renderer_opengl/gl_shader_manager.h" 31#include "video_core/renderer_opengl/gl_shader_manager.h"
32#include "video_core/renderer_opengl/gl_state_tracker.h" 32#include "video_core/renderer_opengl/gl_state_tracker.h"
33#include "video_core/renderer_opengl/gl_stream_buffer.h"
34#include "video_core/renderer_opengl/gl_texture_cache.h" 33#include "video_core/renderer_opengl/gl_texture_cache.h"
35#include "video_core/shader/async_shaders.h" 34#include "video_core/shader/async_shaders.h"
36#include "video_core/textures/texture.h" 35#include "video_core/textures/texture.h"
@@ -47,6 +46,11 @@ namespace Tegra {
47class MemoryManager; 46class MemoryManager;
48} 47}
49 48
49namespace Vulkan {
50class Device;
51class MemoryAllocator;
52} // namespace Vulkan
53
50namespace OpenGL { 54namespace OpenGL {
51 55
52struct ScreenInfo; 56struct ScreenInfo;
@@ -63,6 +67,8 @@ class RasterizerOpenGL : public VideoCore::RasterizerAccelerated {
63public: 67public:
64 explicit RasterizerOpenGL(Core::Frontend::EmuWindow& emu_window_, Tegra::GPU& gpu_, 68 explicit RasterizerOpenGL(Core::Frontend::EmuWindow& emu_window_, Tegra::GPU& gpu_,
65 Core::Memory::Memory& cpu_memory_, const Device& device_, 69 Core::Memory::Memory& cpu_memory_, const Device& device_,
70 const Vulkan::Device* vulkan_device,
71 Vulkan::MemoryAllocator* vulkan_memory_allocator,
66 ScreenInfo& screen_info_, ProgramManager& program_manager_, 72 ScreenInfo& screen_info_, ProgramManager& program_manager_,
67 StateTracker& state_tracker_); 73 StateTracker& state_tracker_);
68 ~RasterizerOpenGL() override; 74 ~RasterizerOpenGL() override;
@@ -72,6 +78,7 @@ public:
72 void DispatchCompute(GPUVAddr code_addr) override; 78 void DispatchCompute(GPUVAddr code_addr) override;
73 void ResetCounter(VideoCore::QueryType type) override; 79 void ResetCounter(VideoCore::QueryType type) override;
74 void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) override; 80 void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) override;
81 void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override;
75 void FlushAll() override; 82 void FlushAll() override;
76 void FlushRegion(VAddr addr, u64 size) override; 83 void FlushRegion(VAddr addr, u64 size) override;
77 bool MustFlushRegion(VAddr addr, u64 size) override; 84 bool MustFlushRegion(VAddr addr, u64 size) override;
@@ -119,27 +126,6 @@ private:
119 void BindTextures(const ShaderEntries& entries, GLuint base_texture, GLuint base_image, 126 void BindTextures(const ShaderEntries& entries, GLuint base_texture, GLuint base_image,
120 size_t& image_view_index, size_t& texture_index, size_t& image_index); 127 size_t& image_view_index, size_t& texture_index, size_t& image_index);
121 128
122 /// Configures the current constbuffers to use for the draw command.
123 void SetupDrawConstBuffers(std::size_t stage_index, Shader* shader);
124
125 /// Configures the current constbuffers to use for the kernel invocation.
126 void SetupComputeConstBuffers(Shader* kernel);
127
128 /// Configures a constant buffer.
129 void SetupConstBuffer(GLenum stage, u32 binding, const Tegra::Engines::ConstBufferInfo& buffer,
130 const ConstBufferEntry& entry, bool use_unified,
131 std::size_t unified_offset);
132
133 /// Configures the current global memory entries to use for the draw command.
134 void SetupDrawGlobalMemory(std::size_t stage_index, Shader* shader);
135
136 /// Configures the current global memory entries to use for the kernel invocation.
137 void SetupComputeGlobalMemory(Shader* kernel);
138
139 /// Configures a global memory buffer.
140 void SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry, GPUVAddr gpu_addr,
141 size_t size, BindlessSSBO* ssbo);
142
143 /// Configures the current textures to use for the draw command. 129 /// Configures the current textures to use for the draw command.
144 void SetupDrawTextures(const Shader* shader, size_t stage_index); 130 void SetupDrawTextures(const Shader* shader, size_t stage_index);
145 131
@@ -152,6 +138,9 @@ private:
152 /// Configures images in a compute shader. 138 /// Configures images in a compute shader.
153 void SetupComputeImages(const Shader* shader); 139 void SetupComputeImages(const Shader* shader);
154 140
141 /// Syncs state to match guest's
142 void SyncState();
143
155 /// Syncs the viewport and depth range to match the guest state 144 /// Syncs the viewport and depth range to match the guest state
156 void SyncViewport(); 145 void SyncViewport();
157 146
@@ -215,6 +204,12 @@ private:
215 /// Syncs the framebuffer sRGB state to match the guest state 204 /// Syncs the framebuffer sRGB state to match the guest state
216 void SyncFramebufferSRGB(); 205 void SyncFramebufferSRGB();
217 206
207 /// Syncs vertex formats to match the guest state
208 void SyncVertexFormats();
209
210 /// Syncs vertex instances to match the guest state
211 void SyncVertexInstances();
212
218 /// Syncs transform feedback state to match guest state 213 /// Syncs transform feedback state to match guest state
219 /// @note Only valid on assembly shaders 214 /// @note Only valid on assembly shaders
220 void SyncTransformFeedback(); 215 void SyncTransformFeedback();
@@ -225,19 +220,7 @@ private:
225 /// End a transform feedback 220 /// End a transform feedback
226 void EndTransformFeedback(); 221 void EndTransformFeedback();
227 222
228 std::size_t CalculateVertexArraysSize() const; 223 void SetupShaders(bool is_indexed);
229
230 std::size_t CalculateIndexBufferSize() const;
231
232 /// Updates the current vertex format
233 void SetupVertexFormat();
234
235 void SetupVertexBuffer();
236 void SetupVertexInstances();
237
238 GLintptr SetupIndexBuffer();
239
240 void SetupShaders();
241 224
242 Tegra::GPU& gpu; 225 Tegra::GPU& gpu;
243 Tegra::Engines::Maxwell3D& maxwell3d; 226 Tegra::Engines::Maxwell3D& maxwell3d;
@@ -249,12 +232,12 @@ private:
249 ProgramManager& program_manager; 232 ProgramManager& program_manager;
250 StateTracker& state_tracker; 233 StateTracker& state_tracker;
251 234
252 OGLStreamBuffer stream_buffer;
253 TextureCacheRuntime texture_cache_runtime; 235 TextureCacheRuntime texture_cache_runtime;
254 TextureCache texture_cache; 236 TextureCache texture_cache;
237 BufferCacheRuntime buffer_cache_runtime;
238 BufferCache buffer_cache;
255 ShaderCacheOpenGL shader_cache; 239 ShaderCacheOpenGL shader_cache;
256 QueryCache query_cache; 240 QueryCache query_cache;
257 OGLBufferCache buffer_cache;
258 FenceManagerOpenGL fence_manager; 241 FenceManagerOpenGL fence_manager;
259 242
260 VideoCommon::Shader::AsyncShaders async_shaders; 243 VideoCommon::Shader::AsyncShaders async_shaders;
@@ -262,20 +245,8 @@ private:
262 boost::container::static_vector<u32, MAX_IMAGE_VIEWS> image_view_indices; 245 boost::container::static_vector<u32, MAX_IMAGE_VIEWS> image_view_indices;
263 std::array<ImageViewId, MAX_IMAGE_VIEWS> image_view_ids; 246 std::array<ImageViewId, MAX_IMAGE_VIEWS> image_view_ids;
264 boost::container::static_vector<GLuint, MAX_TEXTURES> sampler_handles; 247 boost::container::static_vector<GLuint, MAX_TEXTURES> sampler_handles;
265 std::array<GLuint, MAX_TEXTURES> texture_handles; 248 std::array<GLuint, MAX_TEXTURES> texture_handles{};
266 std::array<GLuint, MAX_IMAGES> image_handles; 249 std::array<GLuint, MAX_IMAGES> image_handles{};
267
268 std::array<OGLBuffer, Tegra::Engines::Maxwell3D::Regs::NumTransformFeedbackBuffers>
269 transform_feedback_buffers;
270 std::bitset<Tegra::Engines::Maxwell3D::Regs::NumTransformFeedbackBuffers>
271 enabled_transform_feedback_buffers;
272
273 static constexpr std::size_t NUM_CONSTANT_BUFFERS =
274 Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers *
275 Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram;
276 std::array<GLuint, NUM_CONSTANT_BUFFERS> staging_cbufs{};
277 std::size_t current_cbuf = 0;
278 OGLBuffer unified_uniform_buffer;
279 250
280 /// Number of commands queued to the OpenGL driver. Resetted on flush. 251 /// Number of commands queued to the OpenGL driver. Resetted on flush.
281 std::size_t num_queued_commands = 0; 252 std::size_t num_queued_commands = 0;
diff --git a/src/video_core/renderer_opengl/gl_resource_manager.cpp b/src/video_core/renderer_opengl/gl_resource_manager.cpp
index 0e34a0f20..3428e5e21 100644
--- a/src/video_core/renderer_opengl/gl_resource_manager.cpp
+++ b/src/video_core/renderer_opengl/gl_resource_manager.cpp
@@ -171,12 +171,6 @@ void OGLBuffer::Release() {
171 handle = 0; 171 handle = 0;
172} 172}
173 173
174void OGLBuffer::MakeStreamCopy(std::size_t buffer_size) {
175 ASSERT_OR_EXECUTE((handle != 0 && buffer_size != 0), { return; });
176
177 glNamedBufferData(handle, buffer_size, nullptr, GL_STREAM_COPY);
178}
179
180void OGLSync::Create() { 174void OGLSync::Create() {
181 if (handle != 0) 175 if (handle != 0)
182 return; 176 return;
diff --git a/src/video_core/renderer_opengl/gl_resource_manager.h b/src/video_core/renderer_opengl/gl_resource_manager.h
index f48398669..552d79db4 100644
--- a/src/video_core/renderer_opengl/gl_resource_manager.h
+++ b/src/video_core/renderer_opengl/gl_resource_manager.h
@@ -234,9 +234,6 @@ public:
234 /// Deletes the internal OpenGL resource 234 /// Deletes the internal OpenGL resource
235 void Release(); 235 void Release();
236 236
237 // Converts the buffer into a stream copy buffer with a fixed size
238 void MakeStreamCopy(std::size_t buffer_size);
239
240 GLuint handle = 0; 237 GLuint handle = 0;
241}; 238};
242 239
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index c35b71b6b..ac78d344c 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -64,7 +64,7 @@ using TextureIR = std::variant<TextureOffset, TextureDerivates, TextureArgument>
64constexpr u32 MAX_CONSTBUFFER_SCALARS = static_cast<u32>(Maxwell::MaxConstBufferSize) / sizeof(u32); 64constexpr u32 MAX_CONSTBUFFER_SCALARS = static_cast<u32>(Maxwell::MaxConstBufferSize) / sizeof(u32);
65constexpr u32 MAX_CONSTBUFFER_ELEMENTS = MAX_CONSTBUFFER_SCALARS / sizeof(u32); 65constexpr u32 MAX_CONSTBUFFER_ELEMENTS = MAX_CONSTBUFFER_SCALARS / sizeof(u32);
66 66
67constexpr std::string_view CommonDeclarations = R"(#define ftoi floatBitsToInt 67constexpr std::string_view COMMON_DECLARATIONS = R"(#define ftoi floatBitsToInt
68#define ftou floatBitsToUint 68#define ftou floatBitsToUint
69#define itof intBitsToFloat 69#define itof intBitsToFloat
70#define utof uintBitsToFloat 70#define utof uintBitsToFloat
@@ -77,10 +77,6 @@ bvec2 HalfFloatNanComparison(bvec2 comparison, vec2 pair1, vec2 pair2) {{
77 77
78const float fswzadd_modifiers_a[] = float[4](-1.0f, 1.0f, -1.0f, 0.0f ); 78const float fswzadd_modifiers_a[] = float[4](-1.0f, 1.0f, -1.0f, 0.0f );
79const float fswzadd_modifiers_b[] = float[4](-1.0f, -1.0f, 1.0f, -1.0f ); 79const float fswzadd_modifiers_b[] = float[4](-1.0f, -1.0f, 1.0f, -1.0f );
80
81layout (std140, binding = {}) uniform vs_config {{
82 float y_direction;
83}};
84)"; 80)";
85 81
86class ShaderWriter final { 82class ShaderWriter final {
@@ -402,13 +398,6 @@ std::string FlowStackTopName(MetaStackClass stack) {
402 return fmt::format("{}_flow_stack_top", GetFlowStackPrefix(stack)); 398 return fmt::format("{}_flow_stack_top", GetFlowStackPrefix(stack));
403} 399}
404 400
405bool UseUnifiedUniforms(const Device& device, const ShaderIR& ir, ShaderType stage) {
406 const u32 num_ubos = static_cast<u32>(ir.GetConstantBuffers().size());
407 // We waste one UBO for emulation
408 const u32 num_available_ubos = device.GetMaxUniformBuffers(stage) - 1;
409 return num_ubos > num_available_ubos;
410}
411
412struct GenericVaryingDescription { 401struct GenericVaryingDescription {
413 std::string name; 402 std::string name;
414 u8 first_element = 0; 403 u8 first_element = 0;
@@ -420,9 +409,8 @@ public:
420 explicit GLSLDecompiler(const Device& device_, const ShaderIR& ir_, const Registry& registry_, 409 explicit GLSLDecompiler(const Device& device_, const ShaderIR& ir_, const Registry& registry_,
421 ShaderType stage_, std::string_view identifier_, 410 ShaderType stage_, std::string_view identifier_,
422 std::string_view suffix_) 411 std::string_view suffix_)
423 : device{device_}, ir{ir_}, registry{registry_}, stage{stage_}, identifier{identifier_}, 412 : device{device_}, ir{ir_}, registry{registry_}, stage{stage_},
424 suffix{suffix_}, header{ir.GetHeader()}, use_unified_uniforms{ 413 identifier{identifier_}, suffix{suffix_}, header{ir.GetHeader()} {
425 UseUnifiedUniforms(device_, ir_, stage_)} {
426 if (stage != ShaderType::Compute) { 414 if (stage != ShaderType::Compute) {
427 transform_feedback = BuildTransformFeedback(registry.GetGraphicsInfo()); 415 transform_feedback = BuildTransformFeedback(registry.GetGraphicsInfo());
428 } 416 }
@@ -516,7 +504,8 @@ private:
516 if (!identifier.empty()) { 504 if (!identifier.empty()) {
517 code.AddLine("// {}", identifier); 505 code.AddLine("// {}", identifier);
518 } 506 }
519 code.AddLine("#version 440 {}", ir.UsesLegacyVaryings() ? "compatibility" : "core"); 507 const bool use_compatibility = ir.UsesLegacyVaryings() || ir.UsesYNegate();
508 code.AddLine("#version 440 {}", use_compatibility ? "compatibility" : "core");
520 code.AddLine("#extension GL_ARB_separate_shader_objects : enable"); 509 code.AddLine("#extension GL_ARB_separate_shader_objects : enable");
521 if (device.HasShaderBallot()) { 510 if (device.HasShaderBallot()) {
522 code.AddLine("#extension GL_ARB_shader_ballot : require"); 511 code.AddLine("#extension GL_ARB_shader_ballot : require");
@@ -542,7 +531,7 @@ private:
542 531
543 code.AddNewLine(); 532 code.AddNewLine();
544 533
545 code.AddLine(CommonDeclarations, EmulationUniformBlockBinding); 534 code.AddLine(COMMON_DECLARATIONS);
546 } 535 }
547 536
548 void DeclareVertex() { 537 void DeclareVertex() {
@@ -865,17 +854,6 @@ private:
865 } 854 }
866 855
867 void DeclareConstantBuffers() { 856 void DeclareConstantBuffers() {
868 if (use_unified_uniforms) {
869 const u32 binding = device.GetBaseBindings(stage).shader_storage_buffer +
870 static_cast<u32>(ir.GetGlobalMemory().size());
871 code.AddLine("layout (std430, binding = {}) readonly buffer UnifiedUniforms {{",
872 binding);
873 code.AddLine(" uint cbufs[];");
874 code.AddLine("}};");
875 code.AddNewLine();
876 return;
877 }
878
879 u32 binding = device.GetBaseBindings(stage).uniform_buffer; 857 u32 binding = device.GetBaseBindings(stage).uniform_buffer;
880 for (const auto& [index, info] : ir.GetConstantBuffers()) { 858 for (const auto& [index, info] : ir.GetConstantBuffers()) {
881 const u32 num_elements = Common::DivCeil(info.GetSize(), 4 * sizeof(u32)); 859 const u32 num_elements = Common::DivCeil(info.GetSize(), 4 * sizeof(u32));
@@ -1081,29 +1059,17 @@ private:
1081 1059
1082 if (const auto cbuf = std::get_if<CbufNode>(&*node)) { 1060 if (const auto cbuf = std::get_if<CbufNode>(&*node)) {
1083 const Node offset = cbuf->GetOffset(); 1061 const Node offset = cbuf->GetOffset();
1084 const u32 base_unified_offset = cbuf->GetIndex() * MAX_CONSTBUFFER_SCALARS;
1085 1062
1086 if (const auto immediate = std::get_if<ImmediateNode>(&*offset)) { 1063 if (const auto immediate = std::get_if<ImmediateNode>(&*offset)) {
1087 // Direct access 1064 // Direct access
1088 const u32 offset_imm = immediate->GetValue(); 1065 const u32 offset_imm = immediate->GetValue();
1089 ASSERT_MSG(offset_imm % 4 == 0, "Unaligned cbuf direct access"); 1066 ASSERT_MSG(offset_imm % 4 == 0, "Unaligned cbuf direct access");
1090 if (use_unified_uniforms) { 1067 return {fmt::format("{}[{}][{}]", GetConstBuffer(cbuf->GetIndex()),
1091 return {fmt::format("cbufs[{}]", base_unified_offset + offset_imm / 4), 1068 offset_imm / (4 * 4), (offset_imm / 4) % 4),
1092 Type::Uint};
1093 } else {
1094 return {fmt::format("{}[{}][{}]", GetConstBuffer(cbuf->GetIndex()),
1095 offset_imm / (4 * 4), (offset_imm / 4) % 4),
1096 Type::Uint};
1097 }
1098 }
1099
1100 // Indirect access
1101 if (use_unified_uniforms) {
1102 return {fmt::format("cbufs[{} + ({} >> 2)]", base_unified_offset,
1103 Visit(offset).AsUint()),
1104 Type::Uint}; 1069 Type::Uint};
1105 } 1070 }
1106 1071
1072 // Indirect access
1107 const std::string final_offset = code.GenerateTemporary(); 1073 const std::string final_offset = code.GenerateTemporary();
1108 code.AddLine("uint {} = {} >> 2;", final_offset, Visit(offset).AsUint()); 1074 code.AddLine("uint {} = {} >> 2;", final_offset, Visit(offset).AsUint());
1109 1075
@@ -2293,7 +2259,6 @@ private:
2293 } 2259 }
2294 } 2260 }
2295 } 2261 }
2296
2297 if (header.ps.omap.depth) { 2262 if (header.ps.omap.depth) {
2298 // The depth output is always 2 registers after the last color output, and current_reg 2263 // The depth output is always 2 registers after the last color output, and current_reg
2299 // already contains one past the last color register. 2264 // already contains one past the last color register.
@@ -2337,7 +2302,8 @@ private:
2337 } 2302 }
2338 2303
2339 Expression YNegate(Operation operation) { 2304 Expression YNegate(Operation operation) {
2340 return {"y_direction", Type::Float}; 2305 // Y_NEGATE is mapped to this uniform value
2306 return {"gl_FrontMaterial.ambient.a", Type::Float};
2341 } 2307 }
2342 2308
2343 template <u32 element> 2309 template <u32 element>
@@ -2787,7 +2753,6 @@ private:
2787 const std::string_view identifier; 2753 const std::string_view identifier;
2788 const std::string_view suffix; 2754 const std::string_view suffix;
2789 const Header header; 2755 const Header header;
2790 const bool use_unified_uniforms;
2791 std::unordered_map<u8, VaryingTFB> transform_feedback; 2756 std::unordered_map<u8, VaryingTFB> transform_feedback;
2792 2757
2793 ShaderWriter code; 2758 ShaderWriter code;
@@ -3003,8 +2968,10 @@ ShaderEntries MakeEntries(const Device& device, const ShaderIR& ir, ShaderType s
3003 for (std::size_t i = 0; i < std::size(clip_distances); ++i) { 2968 for (std::size_t i = 0; i < std::size(clip_distances); ++i) {
3004 entries.clip_distances = (clip_distances[i] ? 1U : 0U) << i; 2969 entries.clip_distances = (clip_distances[i] ? 1U : 0U) << i;
3005 } 2970 }
2971 for (const auto& buffer : entries.const_buffers) {
2972 entries.enabled_uniform_buffers |= 1U << buffer.GetIndex();
2973 }
3006 entries.shader_length = ir.GetLength(); 2974 entries.shader_length = ir.GetLength();
3007 entries.use_unified_uniforms = UseUnifiedUniforms(device, ir, stage);
3008 return entries; 2975 return entries;
3009} 2976}
3010 2977
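The direct-access path above addresses each constant buffer as an array of uvec4s: a byte offset maps to element offset_imm / 16 and component (offset_imm / 4) % 4. A worked check of that arithmetic with an illustrative offset (the value 36 is not taken from the diff):

    // offset_imm must be 4-byte aligned, per the ASSERT_MSG in the decompiler.
    constexpr unsigned offset_imm = 36;
    static_assert(offset_imm % 4 == 0);
    static_assert(offset_imm / (4 * 4) == 2); // uvec4 element  -> generates "...[2]"
    static_assert((offset_imm / 4) % 4 == 1); // component      -> generates "...[2][1]"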
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.h b/src/video_core/renderer_opengl/gl_shader_decompiler.h
index be68994bb..0397a000c 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.h
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.h
@@ -55,7 +55,7 @@ struct ShaderEntries {
55 std::vector<ImageEntry> images; 55 std::vector<ImageEntry> images;
56 std::size_t shader_length{}; 56 std::size_t shader_length{};
57 u32 clip_distances{}; 57 u32 clip_distances{};
58 bool use_unified_uniforms{}; 58 u32 enabled_uniform_buffers{};
59}; 59};
60 60
61ShaderEntries MakeEntries(const Device& device, const VideoCommon::Shader::ShaderIR& ir, 61ShaderEntries MakeEntries(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
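enabled_uniform_buffers replaces the removed use_unified_uniforms flag: MakeEntries now ORs 1U << index for every declared constant buffer, giving consumers a per-stage bitmask of used UBO slots. A sketch of how such a mask can be walked (illustration only; the real consumer lives in the GL buffer cache and rasterizer, outside this hunk):

    #include <bit>
    #include <cstdint>

    // Invoke fn(index) for every set bit, lowest index first (C++20 <bit>).
    template <typename Fn>
    void ForEachEnabledUniformBuffer(std::uint32_t mask, Fn&& fn) {
        while (mask != 0) {
            fn(std::countr_zero(mask)); // index of the lowest set bit
            mask &= mask - 1;           // clear that bit
        }
    }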
diff --git a/src/video_core/renderer_opengl/gl_state_tracker.cpp b/src/video_core/renderer_opengl/gl_state_tracker.cpp
index 60e6fa39f..dbdf5230f 100644
--- a/src/video_core/renderer_opengl/gl_state_tracker.cpp
+++ b/src/video_core/renderer_opengl/gl_state_tracker.cpp
@@ -36,16 +36,10 @@ void SetupDirtyColorMasks(Tables& tables) {
36 FillBlock(tables[1], OFF(color_mask), NUM(color_mask), ColorMasks); 36 FillBlock(tables[1], OFF(color_mask), NUM(color_mask), ColorMasks);
37} 37}
38 38
39void SetupDirtyVertexArrays(Tables& tables) { 39void SetupDirtyVertexInstances(Tables& tables) {
40 static constexpr std::size_t num_array = 3;
41 static constexpr std::size_t instance_base_offset = 3; 40 static constexpr std::size_t instance_base_offset = 3;
42 for (std::size_t i = 0; i < Regs::NumVertexArrays; ++i) { 41 for (std::size_t i = 0; i < Regs::NumVertexArrays; ++i) {
43 const std::size_t array_offset = OFF(vertex_array) + i * NUM(vertex_array[0]); 42 const std::size_t array_offset = OFF(vertex_array) + i * NUM(vertex_array[0]);
44 const std::size_t limit_offset = OFF(vertex_array_limit) + i * NUM(vertex_array_limit[0]);
45
46 FillBlock(tables, array_offset, num_array, VertexBuffer0 + i, VertexBuffers);
47 FillBlock(tables, limit_offset, NUM(vertex_array_limit), VertexBuffer0 + i, VertexBuffers);
48
49 const std::size_t instance_array_offset = array_offset + instance_base_offset; 43 const std::size_t instance_array_offset = array_offset + instance_base_offset;
50 tables[0][instance_array_offset] = static_cast<u8>(VertexInstance0 + i); 44 tables[0][instance_array_offset] = static_cast<u8>(VertexInstance0 + i);
51 tables[1][instance_array_offset] = VertexInstances; 45 tables[1][instance_array_offset] = VertexInstances;
@@ -217,11 +211,11 @@ void SetupDirtyMisc(Tables& tables) {
217StateTracker::StateTracker(Tegra::GPU& gpu) : flags{gpu.Maxwell3D().dirty.flags} { 211StateTracker::StateTracker(Tegra::GPU& gpu) : flags{gpu.Maxwell3D().dirty.flags} {
218 auto& dirty = gpu.Maxwell3D().dirty; 212 auto& dirty = gpu.Maxwell3D().dirty;
219 auto& tables = dirty.tables; 213 auto& tables = dirty.tables;
220 SetupDirtyRenderTargets(tables); 214 SetupDirtyFlags(tables);
221 SetupDirtyColorMasks(tables); 215 SetupDirtyColorMasks(tables);
222 SetupDirtyViewports(tables); 216 SetupDirtyViewports(tables);
223 SetupDirtyScissors(tables); 217 SetupDirtyScissors(tables);
224 SetupDirtyVertexArrays(tables); 218 SetupDirtyVertexInstances(tables);
225 SetupDirtyVertexFormat(tables); 219 SetupDirtyVertexFormat(tables);
226 SetupDirtyShaders(tables); 220 SetupDirtyShaders(tables);
227 SetupDirtyPolygonModes(tables); 221 SetupDirtyPolygonModes(tables);
@@ -241,19 +235,6 @@ StateTracker::StateTracker(Tegra::GPU& gpu) : flags{gpu.Maxwell3D().dirty.flags}
241 SetupDirtyClipControl(tables); 235 SetupDirtyClipControl(tables);
242 SetupDirtyDepthClampEnabled(tables); 236 SetupDirtyDepthClampEnabled(tables);
243 SetupDirtyMisc(tables); 237 SetupDirtyMisc(tables);
244
245 auto& store = dirty.on_write_stores;
246 store[VertexBuffers] = true;
247 for (std::size_t i = 0; i < Regs::NumVertexArrays; ++i) {
248 store[VertexBuffer0 + i] = true;
249 }
250}
251
252void StateTracker::InvalidateStreamBuffer() {
253 flags[Dirty::VertexBuffers] = true;
254 for (int index = Dirty::VertexBuffer0; index <= Dirty::VertexBuffer31; ++index) {
255 flags[index] = true;
256 }
257} 238}
258 239
259} // namespace OpenGL 240} // namespace OpenGL
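SetupDirtyVertexInstances keeps the two-level layout used by the other Setup* helpers: tables[0] maps a register offset to its fine-grained flag (VertexInstance0 + i) and tables[1] to the coarse group flag (VertexInstances). A simplified model of how such tables are consumed when the engine sees a register write (assumed mechanism, not the literal Maxwell3D code):

    #include <array>
    #include <bitset>
    #include <cstddef>
    #include <cstdint>

    struct DirtyState {
        std::array<std::array<std::uint8_t, 0x4000>, 2> tables{}; // indexed by register offset
        std::bitset<256> flags;

        void OnRegisterWrite(std::size_t reg_offset) {
            for (const auto& table : tables) {
                // Entries left at zero point at a flag slot nothing reads in this sketch.
                flags[table[reg_offset]] = true;
            }
        }
    };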
diff --git a/src/video_core/renderer_opengl/gl_state_tracker.h b/src/video_core/renderer_opengl/gl_state_tracker.h
index 574615d3c..94c905116 100644
--- a/src/video_core/renderer_opengl/gl_state_tracker.h
+++ b/src/video_core/renderer_opengl/gl_state_tracker.h
@@ -28,10 +28,6 @@ enum : u8 {
28 VertexFormat0, 28 VertexFormat0,
29 VertexFormat31 = VertexFormat0 + 31, 29 VertexFormat31 = VertexFormat0 + 31,
30 30
31 VertexBuffers,
32 VertexBuffer0,
33 VertexBuffer31 = VertexBuffer0 + 31,
34
35 VertexInstances, 31 VertexInstances,
36 VertexInstance0, 32 VertexInstance0,
37 VertexInstance31 = VertexInstance0 + 31, 33 VertexInstance31 = VertexInstance0 + 31,
@@ -92,8 +88,6 @@ class StateTracker {
92public: 88public:
93 explicit StateTracker(Tegra::GPU& gpu); 89 explicit StateTracker(Tegra::GPU& gpu);
94 90
95 void InvalidateStreamBuffer();
96
97 void BindIndexBuffer(GLuint new_index_buffer) { 91 void BindIndexBuffer(GLuint new_index_buffer) {
98 if (index_buffer == new_index_buffer) { 92 if (index_buffer == new_index_buffer) {
99 return; 93 return;
@@ -110,13 +104,32 @@ public:
110 glBindFramebuffer(GL_DRAW_FRAMEBUFFER, framebuffer); 104 glBindFramebuffer(GL_DRAW_FRAMEBUFFER, framebuffer);
111 } 105 }
112 106
107 void ClipControl(GLenum new_origin, GLenum new_depth) {
108 if (new_origin == origin && new_depth == depth) {
109 return;
110 }
111 origin = new_origin;
112 depth = new_depth;
113 glClipControl(origin, depth);
114 }
115
116 void SetYNegate(bool new_y_negate) {
117 if (new_y_negate == y_negate) {
118 return;
119 }
120 // Y_NEGATE is mapped to gl_FrontMaterial.ambient.a
121 y_negate = new_y_negate;
122 const std::array ambient{0.0f, 0.0f, 0.0f, y_negate ? -1.0f : 1.0f};
123 glMaterialfv(GL_FRONT, GL_AMBIENT, ambient.data());
124 }
125
113 void NotifyScreenDrawVertexArray() { 126 void NotifyScreenDrawVertexArray() {
114 flags[OpenGL::Dirty::VertexFormats] = true; 127 flags[OpenGL::Dirty::VertexFormats] = true;
115 flags[OpenGL::Dirty::VertexFormat0 + 0] = true; 128 flags[OpenGL::Dirty::VertexFormat0 + 0] = true;
116 flags[OpenGL::Dirty::VertexFormat0 + 1] = true; 129 flags[OpenGL::Dirty::VertexFormat0 + 1] = true;
117 130
118 flags[OpenGL::Dirty::VertexBuffers] = true; 131 flags[VideoCommon::Dirty::VertexBuffers] = true;
119 flags[OpenGL::Dirty::VertexBuffer0] = true; 132 flags[VideoCommon::Dirty::VertexBuffer0] = true;
120 133
121 flags[OpenGL::Dirty::VertexInstances] = true; 134 flags[OpenGL::Dirty::VertexInstances] = true;
122 flags[OpenGL::Dirty::VertexInstance0 + 0] = true; 135 flags[OpenGL::Dirty::VertexInstance0 + 0] = true;
@@ -202,6 +215,9 @@ private:
202 215
203 GLuint framebuffer = 0; 216 GLuint framebuffer = 0;
204 GLuint index_buffer = 0; 217 GLuint index_buffer = 0;
218 GLenum origin = GL_LOWER_LEFT;
219 GLenum depth = GL_NEGATIVE_ONE_TO_ONE;
220 bool y_negate = false;
205}; 221};
206 222
207} // namespace OpenGL 223} // namespace OpenGL
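ClipControl and SetYNegate follow the same pattern as BindFramebuffer and BindIndexBuffer above: cache the last value and skip the GL call when it has not changed, so callers can invoke them unconditionally per draw or per present. A hedged usage sketch (the function name and the source of y_negate are placeholders, not the actual rasterizer code):

    // Called once per draw; the tracker elides redundant glClipControl and
    // glMaterialfv calls, so this stays cheap.
    void SetupDrawClipControl(OpenGL::StateTracker& state_tracker, bool y_negate) {
        state_tracker.ClipControl(GL_UPPER_LEFT, GL_ZERO_TO_ONE); // example values
        state_tracker.SetYNegate(y_negate);
    }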
diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.cpp b/src/video_core/renderer_opengl/gl_stream_buffer.cpp
index e0819cdf2..bfb992a79 100644
--- a/src/video_core/renderer_opengl/gl_stream_buffer.cpp
+++ b/src/video_core/renderer_opengl/gl_stream_buffer.cpp
@@ -1,70 +1,64 @@
1// Copyright 2018 Citra Emulator Project 1// Copyright 2021 yuzu Emulator Project
2// Licensed under GPLv2 or any later version 2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included. 3// Refer to the license.txt file included.
4 4
5#include <tuple> 5#include <array>
6#include <vector> 6#include <memory>
7#include <span>
8
9#include <glad/glad.h>
7 10
8#include "common/alignment.h" 11#include "common/alignment.h"
9#include "common/assert.h" 12#include "common/assert.h"
10#include "common/microprofile.h"
11#include "video_core/renderer_opengl/gl_device.h"
12#include "video_core/renderer_opengl/gl_state_tracker.h"
13#include "video_core/renderer_opengl/gl_stream_buffer.h" 13#include "video_core/renderer_opengl/gl_stream_buffer.h"
14 14
15MICROPROFILE_DEFINE(OpenGL_StreamBuffer, "OpenGL", "Stream Buffer Orphaning",
16 MP_RGB(128, 128, 192));
17
18namespace OpenGL { 15namespace OpenGL {
19 16
20OGLStreamBuffer::OGLStreamBuffer(const Device& device, StateTracker& state_tracker_) 17StreamBuffer::StreamBuffer() {
21 : state_tracker{state_tracker_} { 18 static constexpr GLenum flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_COHERENT_BIT;
22 gl_buffer.Create(); 19 buffer.Create();
23 20 glObjectLabel(GL_BUFFER, buffer.handle, -1, "Stream Buffer");
24 static constexpr GLbitfield flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT; 21 glNamedBufferStorage(buffer.handle, STREAM_BUFFER_SIZE, nullptr, flags);
25 glNamedBufferStorage(gl_buffer.handle, BUFFER_SIZE, nullptr, flags); 22 mapped_pointer =
26 mapped_ptr = static_cast<u8*>( 23 static_cast<u8*>(glMapNamedBufferRange(buffer.handle, 0, STREAM_BUFFER_SIZE, flags));
27 glMapNamedBufferRange(gl_buffer.handle, 0, BUFFER_SIZE, flags | GL_MAP_FLUSH_EXPLICIT_BIT)); 24 for (OGLSync& sync : fences) {
28 25 sync.Create();
29 if (device.UseAssemblyShaders() || device.HasVertexBufferUnifiedMemory()) {
30 glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_ONLY);
31 glGetNamedBufferParameterui64vNV(gl_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address);
32 } 26 }
33} 27}
34 28
35OGLStreamBuffer::~OGLStreamBuffer() { 29std::pair<std::span<u8>, size_t> StreamBuffer::Request(size_t size) noexcept {
36 glUnmapNamedBuffer(gl_buffer.handle); 30 ASSERT(size < REGION_SIZE);
37 gl_buffer.Release(); 31 for (size_t region = Region(used_iterator), region_end = Region(iterator); region < region_end;
38} 32 ++region) {
39 33 fences[region].Create();
40std::pair<u8*, GLintptr> OGLStreamBuffer::Map(GLsizeiptr size, GLintptr alignment) {
41 ASSERT(size <= BUFFER_SIZE);
42 ASSERT(alignment <= BUFFER_SIZE);
43 mapped_size = size;
44
45 if (alignment > 0) {
46 buffer_pos = Common::AlignUp<std::size_t>(buffer_pos, alignment);
47 } 34 }
35 used_iterator = iterator;
48 36
49 if (buffer_pos + size > BUFFER_SIZE) { 37 for (size_t region = Region(free_iterator) + 1,
50 MICROPROFILE_SCOPE(OpenGL_StreamBuffer); 38 region_end = std::min(Region(iterator + size) + 1, NUM_SYNCS);
51 glInvalidateBufferData(gl_buffer.handle); 39 region < region_end; ++region) {
52 state_tracker.InvalidateStreamBuffer(); 40 glClientWaitSync(fences[region].handle, 0, GL_TIMEOUT_IGNORED);
53 41 fences[region].Release();
54 buffer_pos = 0;
55 } 42 }
56 43 if (iterator + size > free_iterator) {
57 return std::make_pair(mapped_ptr + buffer_pos, buffer_pos); 44 free_iterator = iterator + size;
58}
59
60void OGLStreamBuffer::Unmap(GLsizeiptr size) {
61 ASSERT(size <= mapped_size);
62
63 if (size > 0) {
64 glFlushMappedNamedBufferRange(gl_buffer.handle, buffer_pos, size);
65 } 45 }
66 46 if (iterator + size > STREAM_BUFFER_SIZE) {
67 buffer_pos += size; 47 for (size_t region = Region(used_iterator); region < NUM_SYNCS; ++region) {
48 fences[region].Create();
49 }
50 used_iterator = 0;
51 iterator = 0;
52 free_iterator = size;
53
54 for (size_t region = 0, region_end = Region(size); region <= region_end; ++region) {
55 glClientWaitSync(fences[region].handle, 0, GL_TIMEOUT_IGNORED);
56 fences[region].Release();
57 }
58 }
59 const size_t offset = iterator;
60 iterator = Common::AlignUp(iterator + size, MAX_ALIGNMENT);
61 return {std::span(mapped_pointer + offset, size), offset};
68} 62}
69 63
70} // namespace OpenGL 64} // namespace OpenGL
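With the constants declared in the header below (STREAM_BUFFER_SIZE = 64 MiB, NUM_SYNCS = 16), every fence in the fences array guards a 4 MiB slice and Region() is a plain division. A worked check of that arithmetic:

    #include <cstddef>

    constexpr std::size_t STREAM_BUFFER_SIZE = 64 * 1024 * 1024;
    constexpr std::size_t NUM_SYNCS = 16;
    constexpr std::size_t REGION_SIZE = STREAM_BUFFER_SIZE / NUM_SYNCS;
    static_assert(REGION_SIZE == 4 * 1024 * 1024); // 4 MiB guarded per OGLSync

    constexpr std::size_t Region(std::size_t offset) { return offset / REGION_SIZE; }
    // A request that advances the iterator from 3 MiB to 9 MiB touches regions 0, 1 and 2.
    static_assert(Region(3 * 1024 * 1024) == 0);
    static_assert(Region(9 * 1024 * 1024 - 1) == 2);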
diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.h b/src/video_core/renderer_opengl/gl_stream_buffer.h
index dd9cf67eb..6dbb6bfba 100644
--- a/src/video_core/renderer_opengl/gl_stream_buffer.h
+++ b/src/video_core/renderer_opengl/gl_stream_buffer.h
@@ -1,9 +1,12 @@
1// Copyright 2018 Citra Emulator Project 1// Copyright 2021 yuzu Emulator Project
2// Licensed under GPLv2 or any later version 2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included. 3// Refer to the license.txt file included.
4 4
5#pragma once 5#pragma once
6 6
7#include <array>
8#include <memory>
9#include <span>
7#include <utility> 10#include <utility>
8 11
9#include <glad/glad.h> 12#include <glad/glad.h>
@@ -13,48 +16,35 @@
13 16
14namespace OpenGL { 17namespace OpenGL {
15 18
16class Device; 19class StreamBuffer {
17class StateTracker; 20 static constexpr size_t STREAM_BUFFER_SIZE = 64 * 1024 * 1024;
21 static constexpr size_t NUM_SYNCS = 16;
22 static constexpr size_t REGION_SIZE = STREAM_BUFFER_SIZE / NUM_SYNCS;
23 static constexpr size_t MAX_ALIGNMENT = 256;
24 static_assert(STREAM_BUFFER_SIZE % MAX_ALIGNMENT == 0);
25 static_assert(STREAM_BUFFER_SIZE % NUM_SYNCS == 0);
26 static_assert(REGION_SIZE % MAX_ALIGNMENT == 0);
18 27
19class OGLStreamBuffer : private NonCopyable {
20public: 28public:
21 explicit OGLStreamBuffer(const Device& device, StateTracker& state_tracker_); 29 explicit StreamBuffer();
22 ~OGLStreamBuffer();
23
24 /*
25 * Allocates a linear chunk of memory in the GPU buffer with at least "size" bytes
26 * and the optional alignment requirement.
27 * If the buffer is full, the whole buffer is reallocated which invalidates old chunks.
28 * The return values are the pointer to the new chunk, and the offset within the buffer.
29 * The actual used size must be specified on unmapping the chunk.
30 */
31 std::pair<u8*, GLintptr> Map(GLsizeiptr size, GLintptr alignment = 0);
32
33 void Unmap(GLsizeiptr size);
34
35 GLuint Handle() const {
36 return gl_buffer.handle;
37 }
38 30
39 u64 Address() const { 31 [[nodiscard]] std::pair<std::span<u8>, size_t> Request(size_t size) noexcept;
40 return gpu_address;
41 }
42 32
43 GLsizeiptr Size() const noexcept { 33 [[nodiscard]] GLuint Handle() const noexcept {
44 return BUFFER_SIZE; 34 return buffer.handle;
45 } 35 }
46 36
47private: 37private:
48 static constexpr GLsizeiptr BUFFER_SIZE = 256 * 1024 * 1024; 38 [[nodiscard]] static size_t Region(size_t offset) noexcept {
49 39 return offset / REGION_SIZE;
50 StateTracker& state_tracker; 40 }
51
52 OGLBuffer gl_buffer;
53 41
54 GLuint64EXT gpu_address = 0; 42 size_t iterator = 0;
55 GLintptr buffer_pos = 0; 43 size_t used_iterator = 0;
56 GLsizeiptr mapped_size = 0; 44 size_t free_iterator = 0;
57 u8* mapped_ptr = nullptr; 45 u8* mapped_pointer = nullptr;
46 OGLBuffer buffer;
47 std::array<OGLSync, NUM_SYNCS> fences;
58}; 48};
59 49
60} // namespace OpenGL 50} // namespace OpenGL
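Request returns a writable span inside the persistently and coherently mapped buffer plus the offset of that span within it; the caller fills the span and then sources GL commands from Handle() at the returned offset. A minimal usage sketch under those assumptions (not the actual buffer-cache call site):

    #include <cstdint>
    #include <cstring>

    #include <glad/glad.h>

    #include "video_core/renderer_opengl/gl_stream_buffer.h"

    // Copy size bytes of guest data through the stream buffer into a device-local buffer.
    void UploadThroughStreamBuffer(OpenGL::StreamBuffer& stream, GLuint dst_buffer,
                                   const std::uint8_t* src, std::size_t size) {
        const auto [span, offset] = stream.Request(size);
        std::memcpy(span.data(), src, size);
        // GL_MAP_COHERENT_BIT makes the write visible without an explicit flush.
        glCopyNamedBufferSubData(stream.Handle(), dst_buffer, static_cast<GLintptr>(offset), 0,
                                 static_cast<GLsizeiptr>(size));
    }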
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp
index 546cb6d00..37572ab28 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp
@@ -398,9 +398,6 @@ void AttachTexture(GLuint fbo, GLenum attachment, const ImageView* image_view) {
398 398
399} // Anonymous namespace 399} // Anonymous namespace
400 400
401ImageBufferMap::ImageBufferMap(GLuint handle_, u8* map, size_t size, OGLSync* sync_)
402 : span(map, size), sync{sync_}, handle{handle_} {}
403
404ImageBufferMap::~ImageBufferMap() { 401ImageBufferMap::~ImageBufferMap() {
405 if (sync) { 402 if (sync) {
406 sync->Create(); 403 sync->Create();
@@ -487,11 +484,11 @@ void TextureCacheRuntime::Finish() {
487 glFinish(); 484 glFinish();
488} 485}
489 486
490ImageBufferMap TextureCacheRuntime::MapUploadBuffer(size_t size) { 487ImageBufferMap TextureCacheRuntime::UploadStagingBuffer(size_t size) {
491 return upload_buffers.RequestMap(size, true); 488 return upload_buffers.RequestMap(size, true);
492} 489}
493 490
494ImageBufferMap TextureCacheRuntime::MapDownloadBuffer(size_t size) { 491ImageBufferMap TextureCacheRuntime::DownloadStagingBuffer(size_t size) {
495 return download_buffers.RequestMap(size, false); 492 return download_buffers.RequestMap(size, false);
496} 493}
497 494
@@ -596,7 +593,11 @@ ImageBufferMap TextureCacheRuntime::StagingBuffers::RequestMap(size_t requested_
596 bool insert_fence) { 593 bool insert_fence) {
597 const size_t index = RequestBuffer(requested_size); 594 const size_t index = RequestBuffer(requested_size);
598 OGLSync* const sync = insert_fence ? &syncs[index] : nullptr; 595 OGLSync* const sync = insert_fence ? &syncs[index] : nullptr;
599 return ImageBufferMap(buffers[index].handle, maps[index], requested_size, sync); 596 return ImageBufferMap{
597 .mapped_span = std::span(maps[index], requested_size),
598 .sync = sync,
599 .buffer = buffers[index].handle,
600 };
600} 601}
601 602
602size_t TextureCacheRuntime::StagingBuffers::RequestBuffer(size_t requested_size) { 603size_t TextureCacheRuntime::StagingBuffers::RequestBuffer(size_t requested_size) {
@@ -711,7 +712,7 @@ Image::Image(TextureCacheRuntime& runtime, const VideoCommon::ImageInfo& info_,
711 712
712void Image::UploadMemory(const ImageBufferMap& map, size_t buffer_offset, 713void Image::UploadMemory(const ImageBufferMap& map, size_t buffer_offset,
713 std::span<const VideoCommon::BufferImageCopy> copies) { 714 std::span<const VideoCommon::BufferImageCopy> copies) {
714 glBindBuffer(GL_PIXEL_UNPACK_BUFFER, map.Handle()); 715 glBindBuffer(GL_PIXEL_UNPACK_BUFFER, map.buffer);
715 glFlushMappedBufferRange(GL_PIXEL_UNPACK_BUFFER, buffer_offset, unswizzled_size_bytes); 716 glFlushMappedBufferRange(GL_PIXEL_UNPACK_BUFFER, buffer_offset, unswizzled_size_bytes);
716 717
717 glPixelStorei(GL_UNPACK_ALIGNMENT, 1); 718 glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
@@ -735,7 +736,7 @@ void Image::UploadMemory(const ImageBufferMap& map, size_t buffer_offset,
735void Image::UploadMemory(const ImageBufferMap& map, size_t buffer_offset, 736void Image::UploadMemory(const ImageBufferMap& map, size_t buffer_offset,
736 std::span<const VideoCommon::BufferCopy> copies) { 737 std::span<const VideoCommon::BufferCopy> copies) {
737 for (const VideoCommon::BufferCopy& copy : copies) { 738 for (const VideoCommon::BufferCopy& copy : copies) {
738 glCopyNamedBufferSubData(map.Handle(), buffer.handle, copy.src_offset + buffer_offset, 739 glCopyNamedBufferSubData(map.buffer, buffer.handle, copy.src_offset + buffer_offset,
739 copy.dst_offset, copy.size); 740 copy.dst_offset, copy.size);
740 } 741 }
741} 742}
@@ -744,7 +745,7 @@ void Image::DownloadMemory(ImageBufferMap& map, size_t buffer_offset,
744 std::span<const VideoCommon::BufferImageCopy> copies) { 745 std::span<const VideoCommon::BufferImageCopy> copies) {
745 glMemoryBarrier(GL_PIXEL_BUFFER_BARRIER_BIT); // TODO: Move this to its own API 746 glMemoryBarrier(GL_PIXEL_BUFFER_BARRIER_BIT); // TODO: Move this to its own API
746 747
747 glBindBuffer(GL_PIXEL_PACK_BUFFER, map.Handle()); 748 glBindBuffer(GL_PIXEL_PACK_BUFFER, map.buffer);
748 glPixelStorei(GL_PACK_ALIGNMENT, 1); 749 glPixelStorei(GL_PACK_ALIGNMENT, 1);
749 750
750 u32 current_row_length = std::numeric_limits<u32>::max(); 751 u32 current_row_length = std::numeric_limits<u32>::max();
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h
index 15b7c3676..60d08d6d6 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.h
+++ b/src/video_core/renderer_opengl/gl_texture_cache.h
@@ -31,23 +31,12 @@ using VideoCommon::NUM_RT;
31using VideoCommon::Offset2D; 31using VideoCommon::Offset2D;
32using VideoCommon::RenderTargets; 32using VideoCommon::RenderTargets;
33 33
34class ImageBufferMap { 34struct ImageBufferMap {
35public:
36 explicit ImageBufferMap(GLuint handle, u8* map, size_t size, OGLSync* sync);
37 ~ImageBufferMap(); 35 ~ImageBufferMap();
38 36
39 GLuint Handle() const noexcept { 37 std::span<u8> mapped_span;
40 return handle;
41 }
42
43 std::span<u8> Span() const noexcept {
44 return span;
45 }
46
47private:
48 std::span<u8> span;
49 OGLSync* sync; 38 OGLSync* sync;
50 GLuint handle; 39 GLuint buffer;
51}; 40};
52 41
53struct FormatProperties { 42struct FormatProperties {
@@ -69,9 +58,9 @@ public:
69 58
70 void Finish(); 59 void Finish();
71 60
72 ImageBufferMap MapUploadBuffer(size_t size); 61 ImageBufferMap UploadStagingBuffer(size_t size);
73 62
74 ImageBufferMap MapDownloadBuffer(size_t size); 63 ImageBufferMap DownloadStagingBuffer(size_t size);
75 64
76 void CopyImage(Image& dst, Image& src, std::span<const VideoCommon::ImageCopy> copies); 65 void CopyImage(Image& dst, Image& src, std::span<const VideoCommon::ImageCopy> copies);
77 66
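ImageBufferMap is now a plain aggregate: UploadStagingBuffer returns one whose mapped_span the caller fills before handing the map to Image::UploadMemory, and the destructor re-arms the fence (the sync->Create() call in gl_texture_cache.cpp above) so the runtime knows when that staging slot may be recycled. A rough illustration of the flow, with placeholder data (not the texture cache's real call site):

    #include <cstdint>
    #include <cstring>
    #include <span>

    #include "video_core/renderer_opengl/gl_texture_cache.h"

    void UploadImage(OpenGL::TextureCacheRuntime& runtime, OpenGL::Image& image,
                     std::span<const std::uint8_t> guest_data,
                     std::span<const VideoCommon::BufferImageCopy> copies) {
        OpenGL::ImageBufferMap map = runtime.UploadStagingBuffer(guest_data.size());
        std::memcpy(map.mapped_span.data(), guest_data.data(), guest_data.size());
        image.UploadMemory(map, 0, copies); // buffer_offset = 0 in this simple case
    } // ~ImageBufferMap() creates the OGLSync that guards reuse of this staging buffer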
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index 7eb5ab17a..8fcb86581 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -27,11 +27,14 @@
27#include "video_core/renderer_opengl/gl_shader_manager.h" 27#include "video_core/renderer_opengl/gl_shader_manager.h"
28#include "video_core/renderer_opengl/renderer_opengl.h" 28#include "video_core/renderer_opengl/renderer_opengl.h"
29#include "video_core/textures/decoders.h" 29#include "video_core/textures/decoders.h"
30#include "video_core/vulkan_common/vulkan_debug_callback.h"
31#include "video_core/vulkan_common/vulkan_device.h"
32#include "video_core/vulkan_common/vulkan_instance.h"
33#include "video_core/vulkan_common/vulkan_library.h"
34#include "video_core/vulkan_common/vulkan_memory_allocator.h"
30 35
31namespace OpenGL { 36namespace OpenGL {
32
33namespace { 37namespace {
34
35constexpr GLint PositionLocation = 0; 38constexpr GLint PositionLocation = 0;
36constexpr GLint TexCoordLocation = 1; 39constexpr GLint TexCoordLocation = 1;
37constexpr GLint ModelViewMatrixLocation = 0; 40constexpr GLint ModelViewMatrixLocation = 0;
@@ -125,25 +128,98 @@ void APIENTRY DebugHandler(GLenum source, GLenum type, GLuint id, GLenum severit
125 } 128 }
126} 129}
127 130
131Vulkan::vk::PhysicalDevice FindPhysicalDevice(Vulkan::vk::Instance& instance) {
132 using namespace Vulkan;
133 using UUID = std::array<GLubyte, GL_UUID_SIZE_EXT>;
134
135 GLint num_device_uuids;
136 glGetIntegerv(GL_NUM_DEVICE_UUIDS_EXT, &num_device_uuids);
137 std::vector<UUID> device_uuids(num_device_uuids);
138 for (GLint index = 0; index < num_device_uuids; ++index) {
139 glGetUnsignedBytei_vEXT(GL_DEVICE_UUID_EXT, 0, device_uuids[index].data());
140 }
141 UUID driver_uuid;
142 glGetUnsignedBytevEXT(GL_DRIVER_UUID_EXT, driver_uuid.data());
143
144 for (const VkPhysicalDevice raw_physical_device : instance.EnumeratePhysicalDevices()) {
145 VkPhysicalDeviceIDProperties device_id_properties{};
146 device_id_properties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES;
147
148 VkPhysicalDeviceProperties2KHR properties{
149 .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR,
150 .pNext = &device_id_properties,
151 .properties{},
152 };
153 vk::PhysicalDevice physical_device(raw_physical_device, instance.Dispatch());
154 physical_device.GetProperties2KHR(properties);
155 if (!std::ranges::equal(device_id_properties.driverUUID, driver_uuid)) {
156 continue;
157 }
158 const auto it =
159 std::ranges::find_if(device_uuids, [&device_id_properties, driver_uuid](UUID uuid) {
160 return std::ranges::equal(device_id_properties.deviceUUID, uuid);
161 });
162 if (it != device_uuids.end()) {
163 return physical_device;
164 }
165 }
166 throw vk::Exception(VK_ERROR_INCOMPATIBLE_DRIVER);
167}
128} // Anonymous namespace 168} // Anonymous namespace
129 169
170struct VulkanObjects {
171 static std::unique_ptr<VulkanObjects> TryCreate() {
172 if (!GLAD_GL_EXT_memory_object) {
173 // Interop is not present
174 return nullptr;
175 }
176 const std::string_view vendor{reinterpret_cast<const char*>(glGetString(GL_VENDOR))};
177 if (vendor == "ATI Technologies Inc.") {
178 // Avoid using GL_EXT_memory_object on AMD, as it makes the GL driver crash
179 return nullptr;
180 }
181 if (!Settings::values.use_assembly_shaders.GetValue()) {
182 // We only need interop when assembly shaders are enabled
183 return nullptr;
184 }
185#ifdef __linux__
186 LOG_WARNING(Render_OpenGL, "Interop doesn't work on Linux at the moment");
187 return nullptr;
188#endif
189 try {
190 return std::make_unique<VulkanObjects>();
191 } catch (const Vulkan::vk::Exception& exception) {
192 LOG_ERROR(Render_OpenGL, "Failed to initialize Vulkan objects with error: {}",
193 exception.what());
194 return nullptr;
195 }
196 }
197
198 Common::DynamicLibrary library{Vulkan::OpenLibrary()};
199 Vulkan::vk::InstanceDispatch dld;
200 Vulkan::vk::Instance instance{Vulkan::CreateInstance(library, dld, VK_API_VERSION_1_1)};
201 Vulkan::Device device{*instance, FindPhysicalDevice(instance), nullptr, dld};
202 Vulkan::MemoryAllocator memory_allocator{device, true};
203};
204
130RendererOpenGL::RendererOpenGL(Core::TelemetrySession& telemetry_session_, 205RendererOpenGL::RendererOpenGL(Core::TelemetrySession& telemetry_session_,
131 Core::Frontend::EmuWindow& emu_window_, 206 Core::Frontend::EmuWindow& emu_window_,
132 Core::Memory::Memory& cpu_memory_, Tegra::GPU& gpu_, 207 Core::Memory::Memory& cpu_memory_, Tegra::GPU& gpu_,
133 std::unique_ptr<Core::Frontend::GraphicsContext> context_) 208 std::unique_ptr<Core::Frontend::GraphicsContext> context_)
134 : RendererBase{emu_window_, std::move(context_)}, telemetry_session{telemetry_session_}, 209 : RendererBase{emu_window_, std::move(context_)}, telemetry_session{telemetry_session_},
135 emu_window{emu_window_}, cpu_memory{cpu_memory_}, gpu{gpu_}, program_manager{device}, 210 emu_window{emu_window_}, cpu_memory{cpu_memory_}, gpu{gpu_},
136 rasterizer{emu_window, gpu, cpu_memory, device, screen_info, program_manager, state_tracker} { 211 vulkan_objects{VulkanObjects::TryCreate()}, device{vulkan_objects != nullptr},
212 state_tracker{gpu}, program_manager{device},
213 rasterizer(emu_window, gpu, cpu_memory, device,
214 vulkan_objects ? &vulkan_objects->device : nullptr,
215 vulkan_objects ? &vulkan_objects->memory_allocator : nullptr, screen_info,
216 program_manager, state_tracker) {
137 if (Settings::values.renderer_debug && GLAD_GL_KHR_debug) { 217 if (Settings::values.renderer_debug && GLAD_GL_KHR_debug) {
138 glEnable(GL_DEBUG_OUTPUT); 218 glEnable(GL_DEBUG_OUTPUT);
139 glEnable(GL_DEBUG_OUTPUT_SYNCHRONOUS); 219 glEnable(GL_DEBUG_OUTPUT_SYNCHRONOUS);
140 glDebugMessageCallback(DebugHandler, nullptr); 220 glDebugMessageCallback(DebugHandler, nullptr);
141 } 221 }
142 AddTelemetryFields(); 222 AddTelemetryFields();
143
144 if (!GLAD_GL_VERSION_4_6) {
145 throw std::runtime_error{"OpenGL 4.3 is not available"};
146 }
147 InitOpenGLObjects(); 223 InitOpenGLObjects();
148} 224}
149 225
@@ -280,6 +356,7 @@ void RendererOpenGL::InitOpenGLObjects() {
280 // Enable unified vertex attributes and query vertex buffer address when the driver supports it 356 // Enable unified vertex attributes and query vertex buffer address when the driver supports it
281 if (device.HasVertexBufferUnifiedMemory()) { 357 if (device.HasVertexBufferUnifiedMemory()) {
282 glEnableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV); 358 glEnableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV);
359 glEnableClientState(GL_ELEMENT_ARRAY_UNIFIED_NV);
283 360
284 glMakeNamedBufferResidentNV(vertex_buffer.handle, GL_READ_ONLY); 361 glMakeNamedBufferResidentNV(vertex_buffer.handle, GL_READ_ONLY);
285 glGetNamedBufferParameterui64vNV(vertex_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, 362 glGetNamedBufferParameterui64vNV(vertex_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV,
@@ -412,6 +489,7 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {
412 489
413 program_manager.BindHostPipeline(pipeline.handle); 490 program_manager.BindHostPipeline(pipeline.handle);
414 491
492 state_tracker.ClipControl(GL_LOWER_LEFT, GL_ZERO_TO_ONE);
415 glEnable(GL_CULL_FACE); 493 glEnable(GL_CULL_FACE);
416 if (screen_info.display_srgb) { 494 if (screen_info.display_srgb) {
417 glEnable(GL_FRAMEBUFFER_SRGB); 495 glEnable(GL_FRAMEBUFFER_SRGB);
@@ -430,7 +508,6 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {
430 glCullFace(GL_BACK); 508 glCullFace(GL_BACK);
431 glFrontFace(GL_CW); 509 glFrontFace(GL_CW);
432 glColorMaski(0, GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE); 510 glColorMaski(0, GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE);
433 glClipControl(GL_LOWER_LEFT, GL_ZERO_TO_ONE);
434 glViewportIndexedf(0, 0.0f, 0.0f, static_cast<GLfloat>(layout.width), 511 glViewportIndexedf(0, 0.0f, 0.0f, static_cast<GLfloat>(layout.width),
435 static_cast<GLfloat>(layout.height)); 512 static_cast<GLfloat>(layout.height));
436 glDepthRangeIndexed(0, 0.0, 0.0); 513 glDepthRangeIndexed(0, 0.0, 0.0);
diff --git a/src/video_core/renderer_opengl/renderer_opengl.h b/src/video_core/renderer_opengl/renderer_opengl.h
index e043a0ccb..f210190dd 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.h
+++ b/src/video_core/renderer_opengl/renderer_opengl.h
@@ -38,6 +38,8 @@ class GPU;
38 38
39namespace OpenGL { 39namespace OpenGL {
40 40
41struct VulkanObjects;
42
41/// Structure used for storing information about the textures for the Switch screen 43/// Structure used for storing information about the textures for the Switch screen
42struct TextureInfo { 44struct TextureInfo {
43 OGLTexture resource; 45 OGLTexture resource;
@@ -99,8 +101,11 @@ private:
99 Core::Memory::Memory& cpu_memory; 101 Core::Memory::Memory& cpu_memory;
100 Tegra::GPU& gpu; 102 Tegra::GPU& gpu;
101 103
102 const Device device; 104 std::unique_ptr<VulkanObjects> vulkan_objects;
103 StateTracker state_tracker{gpu}; 105 Device device;
106 StateTracker state_tracker;
107 ProgramManager program_manager;
108 RasterizerOpenGL rasterizer;
104 109
105 // OpenGL object IDs 110 // OpenGL object IDs
106 OGLSampler present_sampler; 111 OGLSampler present_sampler;
@@ -116,11 +121,6 @@ private:
116 /// Display information for Switch screen 121 /// Display information for Switch screen
117 ScreenInfo screen_info; 122 ScreenInfo screen_info;
118 123
119 /// Global dummy shader pipeline
120 ProgramManager program_manager;
121
122 RasterizerOpenGL rasterizer;
123
124 /// OpenGL framebuffer data 124 /// OpenGL framebuffer data
125 std::vector<u8> gl_framebuffer_data; 125 std::vector<u8> gl_framebuffer_data;
126 126
diff --git a/src/video_core/renderer_opengl/util_shaders.cpp b/src/video_core/renderer_opengl/util_shaders.cpp
index eb849cbf2..aeb36551c 100644
--- a/src/video_core/renderer_opengl/util_shaders.cpp
+++ b/src/video_core/renderer_opengl/util_shaders.cpp
@@ -71,7 +71,7 @@ void UtilShaders::BlockLinearUpload2D(Image& image, const ImageBufferMap& map, s
71 static constexpr GLuint BINDING_OUTPUT_IMAGE = 0; 71 static constexpr GLuint BINDING_OUTPUT_IMAGE = 0;
72 72
73 program_manager.BindHostCompute(block_linear_unswizzle_2d_program.handle); 73 program_manager.BindHostCompute(block_linear_unswizzle_2d_program.handle);
74 glFlushMappedNamedBufferRange(map.Handle(), buffer_offset, image.guest_size_bytes); 74 glFlushMappedNamedBufferRange(map.buffer, buffer_offset, image.guest_size_bytes);
75 glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_SWIZZLE_BUFFER, swizzle_table_buffer.handle); 75 glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_SWIZZLE_BUFFER, swizzle_table_buffer.handle);
76 76
77 const GLenum store_format = StoreFormat(BytesPerBlock(image.info.format)); 77 const GLenum store_format = StoreFormat(BytesPerBlock(image.info.format));
@@ -91,8 +91,8 @@ void UtilShaders::BlockLinearUpload2D(Image& image, const ImageBufferMap& map, s
91 glUniform1ui(5, params.x_shift); 91 glUniform1ui(5, params.x_shift);
92 glUniform1ui(6, params.block_height); 92 glUniform1ui(6, params.block_height);
93 glUniform1ui(7, params.block_height_mask); 93 glUniform1ui(7, params.block_height_mask);
94 glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.Handle(), 94 glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.buffer, input_offset,
95 input_offset, image.guest_size_bytes - swizzle.buffer_offset); 95 image.guest_size_bytes - swizzle.buffer_offset);
96 glBindImageTexture(BINDING_OUTPUT_IMAGE, image.Handle(), swizzle.level, GL_TRUE, 0, 96 glBindImageTexture(BINDING_OUTPUT_IMAGE, image.Handle(), swizzle.level, GL_TRUE, 0,
97 GL_WRITE_ONLY, store_format); 97 GL_WRITE_ONLY, store_format);
98 glDispatchCompute(num_dispatches_x, num_dispatches_y, image.info.resources.layers); 98 glDispatchCompute(num_dispatches_x, num_dispatches_y, image.info.resources.layers);
@@ -108,7 +108,7 @@ void UtilShaders::BlockLinearUpload3D(Image& image, const ImageBufferMap& map, s
108 static constexpr GLuint BINDING_INPUT_BUFFER = 1; 108 static constexpr GLuint BINDING_INPUT_BUFFER = 1;
109 static constexpr GLuint BINDING_OUTPUT_IMAGE = 0; 109 static constexpr GLuint BINDING_OUTPUT_IMAGE = 0;
110 110
111 glFlushMappedNamedBufferRange(map.Handle(), buffer_offset, image.guest_size_bytes); 111 glFlushMappedNamedBufferRange(map.buffer, buffer_offset, image.guest_size_bytes);
112 program_manager.BindHostCompute(block_linear_unswizzle_3d_program.handle); 112 program_manager.BindHostCompute(block_linear_unswizzle_3d_program.handle);
113 glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_SWIZZLE_BUFFER, swizzle_table_buffer.handle); 113 glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_SWIZZLE_BUFFER, swizzle_table_buffer.handle);
114 114
@@ -132,8 +132,8 @@ void UtilShaders::BlockLinearUpload3D(Image& image, const ImageBufferMap& map, s
132 glUniform1ui(7, params.block_height_mask); 132 glUniform1ui(7, params.block_height_mask);
133 glUniform1ui(8, params.block_depth); 133 glUniform1ui(8, params.block_depth);
134 glUniform1ui(9, params.block_depth_mask); 134 glUniform1ui(9, params.block_depth_mask);
135 glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.Handle(), 135 glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.buffer, input_offset,
136 input_offset, image.guest_size_bytes - swizzle.buffer_offset); 136 image.guest_size_bytes - swizzle.buffer_offset);
137 glBindImageTexture(BINDING_OUTPUT_IMAGE, image.Handle(), swizzle.level, GL_TRUE, 0, 137 glBindImageTexture(BINDING_OUTPUT_IMAGE, image.Handle(), swizzle.level, GL_TRUE, 0,
138 GL_WRITE_ONLY, store_format); 138 GL_WRITE_ONLY, store_format);
139 glDispatchCompute(num_dispatches_x, num_dispatches_y, num_dispatches_z); 139 glDispatchCompute(num_dispatches_x, num_dispatches_y, num_dispatches_z);
@@ -159,7 +159,7 @@ void UtilShaders::PitchUpload(Image& image, const ImageBufferMap& map, size_t bu
159 "Non-power of two images are not implemented"); 159 "Non-power of two images are not implemented");
160 160
161 program_manager.BindHostCompute(pitch_unswizzle_program.handle); 161 program_manager.BindHostCompute(pitch_unswizzle_program.handle);
162 glFlushMappedNamedBufferRange(map.Handle(), buffer_offset, image.guest_size_bytes); 162 glFlushMappedNamedBufferRange(map.buffer, buffer_offset, image.guest_size_bytes);
163 glUniform2ui(LOC_ORIGIN, 0, 0); 163 glUniform2ui(LOC_ORIGIN, 0, 0);
164 glUniform2i(LOC_DESTINATION, 0, 0); 164 glUniform2i(LOC_DESTINATION, 0, 0);
165 glUniform1ui(LOC_BYTES_PER_BLOCK, bytes_per_block); 165 glUniform1ui(LOC_BYTES_PER_BLOCK, bytes_per_block);
@@ -172,8 +172,8 @@ void UtilShaders::PitchUpload(Image& image, const ImageBufferMap& map, size_t bu
172 const u32 num_dispatches_x = Common::DivCeil(num_tiles.width, WORKGROUP_SIZE.width); 172 const u32 num_dispatches_x = Common::DivCeil(num_tiles.width, WORKGROUP_SIZE.width);
173 const u32 num_dispatches_y = Common::DivCeil(num_tiles.height, WORKGROUP_SIZE.height); 173 const u32 num_dispatches_y = Common::DivCeil(num_tiles.height, WORKGROUP_SIZE.height);
174 174
175 glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.Handle(), 175 glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.buffer, input_offset,
176 input_offset, image.guest_size_bytes - swizzle.buffer_offset); 176 image.guest_size_bytes - swizzle.buffer_offset);
177 glDispatchCompute(num_dispatches_x, num_dispatches_y, 1); 177 glDispatchCompute(num_dispatches_x, num_dispatches_y, 1);
178 } 178 }
179 program_manager.RestoreGuestCompute(); 179 program_manager.RestoreGuestCompute();
diff --git a/src/video_core/renderer_opengl/util_shaders.h b/src/video_core/renderer_opengl/util_shaders.h
index 359997255..bec026bc3 100644
--- a/src/video_core/renderer_opengl/util_shaders.h
+++ b/src/video_core/renderer_opengl/util_shaders.h
@@ -15,9 +15,10 @@
15namespace OpenGL { 15namespace OpenGL {
16 16
17class Image; 17class Image;
18class ImageBufferMap;
19class ProgramManager; 18class ProgramManager;
20 19
20struct ImageBufferMap;
21
21class UtilShaders { 22class UtilShaders {
22public: 23public:
23 explicit UtilShaders(ProgramManager& program_manager); 24 explicit UtilShaders(ProgramManager& program_manager);
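The forward declaration switches from class to struct to match the new definition of ImageBufferMap in gl_texture_cache.h. Mixing the two keywords is accepted by conforming compilers, but MSVC diagnoses the mismatch (warning C4099), so keeping them consistent is the tidier choice:

    struct ImageBufferMap;   // matches the definition: no C4099 on MSVC
    // class ImageBufferMap; // still legal C++, but MSVC would warn about the mismatch
    void Upload(const ImageBufferMap& map);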
diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
index 85121d9fd..19aaf034f 100644
--- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
+++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
@@ -531,13 +531,9 @@ VkCompareOp ComparisonOp(Maxwell::ComparisonOp comparison) {
531 return {}; 531 return {};
532} 532}
533 533
534VkIndexType IndexFormat(const Device& device, Maxwell::IndexFormat index_format) { 534VkIndexType IndexFormat(Maxwell::IndexFormat index_format) {
535 switch (index_format) { 535 switch (index_format) {
536 case Maxwell::IndexFormat::UnsignedByte: 536 case Maxwell::IndexFormat::UnsignedByte:
537 if (!device.IsExtIndexTypeUint8Supported()) {
538 UNIMPLEMENTED_MSG("Native uint8 indices are not supported on this device");
539 return VK_INDEX_TYPE_UINT16;
540 }
541 return VK_INDEX_TYPE_UINT8_EXT; 537 return VK_INDEX_TYPE_UINT8_EXT;
542 case Maxwell::IndexFormat::UnsignedShort: 538 case Maxwell::IndexFormat::UnsignedShort:
543 return VK_INDEX_TYPE_UINT16; 539 return VK_INDEX_TYPE_UINT16;
diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.h b/src/video_core/renderer_vulkan/maxwell_to_vk.h
index 7c34b47dc..e3e06ba38 100644
--- a/src/video_core/renderer_vulkan/maxwell_to_vk.h
+++ b/src/video_core/renderer_vulkan/maxwell_to_vk.h
@@ -53,7 +53,7 @@ VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttrib
53 53
54VkCompareOp ComparisonOp(Maxwell::ComparisonOp comparison); 54VkCompareOp ComparisonOp(Maxwell::ComparisonOp comparison);
55 55
56VkIndexType IndexFormat(const Device& device, Maxwell::IndexFormat index_format); 56VkIndexType IndexFormat(Maxwell::IndexFormat index_format);
57 57
58VkStencilOp StencilOp(Maxwell::StencilOp stencil_op); 58VkStencilOp StencilOp(Maxwell::StencilOp stencil_op);
59 59
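IndexFormat no longer needs the Device because the fallback for missing VK_EXT_index_type_uint8 moves out of the translation layer: the Vulkan buffer cache below owns a uint8_pass compute pass that appears to widen 8-bit index buffers on the GPU before they are bound. A host-side sketch of the equivalent conversion, purely for illustration (the real path runs in a compute shader, not on the CPU):

    #include <algorithm>
    #include <cstdint>
    #include <span>
    #include <vector>

    // Widen u8 indices to u16 so the draw can use VK_INDEX_TYPE_UINT16 instead.
    std::vector<std::uint16_t> WidenIndices(std::span<const std::uint8_t> src) {
        std::vector<std::uint16_t> dst(src.size());
        std::ranges::copy(src, dst.begin()); // each element converts u8 -> u16
        return dst;
    }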
diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.cpp b/src/video_core/renderer_vulkan/renderer_vulkan.cpp
index 6909576cb..1cc720ddd 100644
--- a/src/video_core/renderer_vulkan/renderer_vulkan.cpp
+++ b/src/video_core/renderer_vulkan/renderer_vulkan.cpp
@@ -107,7 +107,7 @@ RendererVulkan::RendererVulkan(Core::TelemetrySession& telemetry_session_,
107 debug_callback(Settings::values.renderer_debug ? CreateDebugCallback(instance) : nullptr), 107 debug_callback(Settings::values.renderer_debug ? CreateDebugCallback(instance) : nullptr),
108 surface(CreateSurface(instance, render_window)), 108 surface(CreateSurface(instance, render_window)),
109 device(CreateDevice(instance, dld, *surface)), 109 device(CreateDevice(instance, dld, *surface)),
110 memory_allocator(device), 110 memory_allocator(device, false),
111 state_tracker(gpu), 111 state_tracker(gpu),
112 scheduler(device, state_tracker), 112 scheduler(device, state_tracker),
113 swapchain(*surface, device, scheduler, render_window.GetFramebufferLayout().width, 113 swapchain(*surface, device, scheduler, render_window.GetFramebufferLayout().width,
diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.h b/src/video_core/renderer_vulkan/renderer_vulkan.h
index 1efaf3b77..72071316c 100644
--- a/src/video_core/renderer_vulkan/renderer_vulkan.h
+++ b/src/video_core/renderer_vulkan/renderer_vulkan.h
@@ -58,12 +58,11 @@ private:
58 vk::InstanceDispatch dld; 58 vk::InstanceDispatch dld;
59 59
60 vk::Instance instance; 60 vk::Instance instance;
61 61 vk::DebugUtilsMessenger debug_callback;
62 vk::SurfaceKHR surface; 62 vk::SurfaceKHR surface;
63 63
64 VKScreenInfo screen_info; 64 VKScreenInfo screen_info;
65 65
66 vk::DebugUtilsMessenger debug_callback;
67 Device device; 66 Device device;
68 MemoryAllocator memory_allocator; 67 MemoryAllocator memory_allocator;
69 StateTracker state_tracker; 68 StateTracker state_tracker;
diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.cpp b/src/video_core/renderer_vulkan/vk_blit_screen.cpp
index df8992528..a1a32aabe 100644
--- a/src/video_core/renderer_vulkan/vk_blit_screen.cpp
+++ b/src/video_core/renderer_vulkan/vk_blit_screen.cpp
@@ -148,8 +148,8 @@ VkSemaphore VKBlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, bool
148 SetUniformData(data, framebuffer); 148 SetUniformData(data, framebuffer);
149 SetVertexData(data, framebuffer); 149 SetVertexData(data, framebuffer);
150 150
151 const std::span<u8> map = buffer_commit.Map(); 151 const std::span<u8> mapped_span = buffer_commit.Map();
152 std::memcpy(map.data(), &data, sizeof(data)); 152 std::memcpy(mapped_span.data(), &data, sizeof(data));
153 153
154 if (!use_accelerated) { 154 if (!use_accelerated) {
155 const u64 image_offset = GetRawImageOffset(framebuffer, image_index); 155 const u64 image_offset = GetRawImageOffset(framebuffer, image_index);
@@ -162,8 +162,8 @@ VkSemaphore VKBlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, bool
162 constexpr u32 block_height_log2 = 4; 162 constexpr u32 block_height_log2 = 4;
163 const u32 bytes_per_pixel = GetBytesPerPixel(framebuffer); 163 const u32 bytes_per_pixel = GetBytesPerPixel(framebuffer);
164 Tegra::Texture::UnswizzleTexture( 164 Tegra::Texture::UnswizzleTexture(
165 map.subspan(image_offset, size_bytes), std::span(host_ptr, size_bytes), bytes_per_pixel, 165 mapped_span.subspan(image_offset, size_bytes), std::span(host_ptr, size_bytes),
166 framebuffer.width, framebuffer.height, 1, block_height_log2, 0); 166 bytes_per_pixel, framebuffer.width, framebuffer.height, 1, block_height_log2, 0);
167 167
168 const VkBufferImageCopy copy{ 168 const VkBufferImageCopy copy{
169 .bufferOffset = image_offset, 169 .bufferOffset = image_offset,
@@ -263,7 +263,6 @@ VkSemaphore VKBlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, bool
263 cmdbuf.Draw(4, 1, 0, 0); 263 cmdbuf.Draw(4, 1, 0, 0);
264 cmdbuf.EndRenderPass(); 264 cmdbuf.EndRenderPass();
265 }); 265 });
266
267 return *semaphores[image_index]; 266 return *semaphores[image_index];
268} 267}
269 268
diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
index d8ad40a0f..48fc5d966 100644
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
@@ -3,188 +3,276 @@
3// Refer to the license.txt file included. 3// Refer to the license.txt file included.
4 4
5#include <algorithm> 5#include <algorithm>
6#include <array>
6#include <cstring> 7#include <cstring>
7#include <memory> 8#include <span>
9#include <vector>
8 10
9#include "core/core.h"
10#include "video_core/buffer_cache/buffer_cache.h" 11#include "video_core/buffer_cache/buffer_cache.h"
12#include "video_core/renderer_vulkan/maxwell_to_vk.h"
11#include "video_core/renderer_vulkan/vk_buffer_cache.h" 13#include "video_core/renderer_vulkan/vk_buffer_cache.h"
12#include "video_core/renderer_vulkan/vk_scheduler.h" 14#include "video_core/renderer_vulkan/vk_scheduler.h"
13#include "video_core/renderer_vulkan/vk_stream_buffer.h" 15#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h"
16#include "video_core/renderer_vulkan/vk_update_descriptor.h"
14#include "video_core/vulkan_common/vulkan_device.h" 17#include "video_core/vulkan_common/vulkan_device.h"
18#include "video_core/vulkan_common/vulkan_memory_allocator.h"
15#include "video_core/vulkan_common/vulkan_wrapper.h" 19#include "video_core/vulkan_common/vulkan_wrapper.h"
16 20
17namespace Vulkan { 21namespace Vulkan {
18
19namespace { 22namespace {
23VkBufferCopy MakeBufferCopy(const VideoCommon::BufferCopy& copy) {
24 return VkBufferCopy{
25 .srcOffset = copy.src_offset,
26 .dstOffset = copy.dst_offset,
27 .size = copy.size,
28 };
29}
20 30
21constexpr VkBufferUsageFlags BUFFER_USAGE = 31VkIndexType IndexTypeFromNumElements(const Device& device, u32 num_elements) {
22 VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT | 32 if (num_elements <= 0xff && device.IsExtIndexTypeUint8Supported()) {
23 VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; 33 return VK_INDEX_TYPE_UINT8_EXT;
24 34 }
25constexpr VkPipelineStageFlags UPLOAD_PIPELINE_STAGE = 35 if (num_elements <= 0xffff) {
26 VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_VERTEX_INPUT_BIT | 36 return VK_INDEX_TYPE_UINT16;
27 VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | 37 }
28 VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; 38 return VK_INDEX_TYPE_UINT32;
29 39}
30constexpr VkAccessFlags UPLOAD_ACCESS_BARRIERS =
31 VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_UNIFORM_READ_BIT |
32 VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT | VK_ACCESS_INDEX_READ_BIT;
33 40
34constexpr VkAccessFlags TRANSFORM_FEEDBACK_WRITE_ACCESS = 41size_t BytesPerIndex(VkIndexType index_type) {
35 VK_ACCESS_TRANSFORM_FEEDBACK_WRITE_BIT_EXT | VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT; 42 switch (index_type) {
43 case VK_INDEX_TYPE_UINT8_EXT:
44 return 1;
45 case VK_INDEX_TYPE_UINT16:
46 return 2;
47 case VK_INDEX_TYPE_UINT32:
48 return 4;
49 default:
50 UNREACHABLE_MSG("Invalid index type={}", index_type);
51 return 1;
52 }
53}
36 54
55template <typename T>
56std::array<T, 6> MakeQuadIndices(u32 quad, u32 first) {
57 std::array<T, 6> indices{0, 1, 2, 0, 2, 3};
58 std::ranges::transform(indices, indices.begin(),
59 [quad, first](u32 index) { return first + index + quad * 4; });
60 return indices;
61}
37} // Anonymous namespace 62} // Anonymous namespace
38 63
39Buffer::Buffer(const Device& device_, MemoryAllocator& memory_allocator, VKScheduler& scheduler_, 64Buffer::Buffer(BufferCacheRuntime&, VideoCommon::NullBufferParams null_params)
40 StagingBufferPool& staging_pool_, VAddr cpu_addr_, std::size_t size_) 65 : VideoCommon::BufferBase<VideoCore::RasterizerInterface>(null_params) {}
41 : BufferBlock{cpu_addr_, size_}, device{device_}, scheduler{scheduler_}, staging_pool{ 66
42 staging_pool_} { 67Buffer::Buffer(BufferCacheRuntime& runtime, VideoCore::RasterizerInterface& rasterizer_,
43 buffer = device.GetLogical().CreateBuffer(VkBufferCreateInfo{ 68 VAddr cpu_addr_, u64 size_bytes_)
69 : VideoCommon::BufferBase<VideoCore::RasterizerInterface>(rasterizer_, cpu_addr_, size_bytes_) {
70 buffer = runtime.device.GetLogical().CreateBuffer(VkBufferCreateInfo{
44 .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, 71 .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
45 .pNext = nullptr, 72 .pNext = nullptr,
46 .flags = 0, 73 .flags = 0,
47 .size = static_cast<VkDeviceSize>(size_), 74 .size = SizeBytes(),
48 .usage = BUFFER_USAGE | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, 75 .usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT |
76 VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT |
77 VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT |
78 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT |
79 VK_BUFFER_USAGE_VERTEX_BUFFER_BIT,
49 .sharingMode = VK_SHARING_MODE_EXCLUSIVE, 80 .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
50 .queueFamilyIndexCount = 0, 81 .queueFamilyIndexCount = 0,
51 .pQueueFamilyIndices = nullptr, 82 .pQueueFamilyIndices = nullptr,
52 }); 83 });
53 commit = memory_allocator.Commit(buffer, MemoryUsage::DeviceLocal); 84 if (runtime.device.HasDebuggingToolAttached()) {
85 buffer.SetObjectNameEXT(fmt::format("Buffer 0x{:x}", CpuAddr()).c_str());
86 }
87 commit = runtime.memory_allocator.Commit(buffer, MemoryUsage::DeviceLocal);
54} 88}
55 89
56Buffer::~Buffer() = default; 90BufferCacheRuntime::BufferCacheRuntime(const Device& device_, MemoryAllocator& memory_allocator_,
91 VKScheduler& scheduler_, StagingBufferPool& staging_pool_,
92 VKUpdateDescriptorQueue& update_descriptor_queue_,
93 VKDescriptorPool& descriptor_pool)
94 : device{device_}, memory_allocator{memory_allocator_}, scheduler{scheduler_},
95 staging_pool{staging_pool_}, update_descriptor_queue{update_descriptor_queue_},
96 uint8_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue),
97 quad_index_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue) {}
57 98
58void Buffer::Upload(std::size_t offset, std::size_t data_size, const u8* data) { 99StagingBufferRef BufferCacheRuntime::UploadStagingBuffer(size_t size) {
59 const auto& staging = staging_pool.Request(data_size, MemoryUsage::Upload); 100 return staging_pool.Request(size, MemoryUsage::Upload);
60 std::memcpy(staging.mapped_span.data(), data, data_size); 101}
61 102
62 scheduler.RequestOutsideRenderPassOperationContext(); 103StagingBufferRef BufferCacheRuntime::DownloadStagingBuffer(size_t size) {
104 return staging_pool.Request(size, MemoryUsage::Download);
105}
63 106
64 const VkBuffer handle = Handle(); 107void BufferCacheRuntime::Finish() {
65 scheduler.Record([staging = staging.buffer, handle, offset, data_size, 108 scheduler.Finish();
66 &device = device](vk::CommandBuffer cmdbuf) { 109}
67 const VkBufferMemoryBarrier read_barrier{ 110
68 .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, 111void BufferCacheRuntime::CopyBuffer(VkBuffer dst_buffer, VkBuffer src_buffer,
69 .pNext = nullptr, 112 std::span<const VideoCommon::BufferCopy> copies) {
70 .srcAccessMask = 113 static constexpr VkMemoryBarrier READ_BARRIER{
71 VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_TRANSFER_WRITE_BIT | 114 .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
72 VK_ACCESS_HOST_WRITE_BIT | 115 .pNext = nullptr,
73 (device.IsExtTransformFeedbackSupported() ? TRANSFORM_FEEDBACK_WRITE_ACCESS : 0), 116 .srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT,
74 .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT, 117 .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT,
75 .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, 118 };
76 .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, 119 static constexpr VkMemoryBarrier WRITE_BARRIER{
77 .buffer = handle, 120 .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
78 .offset = offset, 121 .pNext = nullptr,
79 .size = data_size, 122 .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
80 }; 123 .dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT,
81 const VkBufferMemoryBarrier write_barrier{ 124 };
82 .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, 125 // Measured on a popular game, the number of copies never exceeds the small_vector's inline capacity (3) once data is warmed up
83 .pNext = nullptr, 126 boost::container::small_vector<VkBufferCopy, 3> vk_copies(copies.size());
84 .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, 127 std::ranges::transform(copies, vk_copies.begin(), MakeBufferCopy);
85 .dstAccessMask = UPLOAD_ACCESS_BARRIERS, 128 scheduler.RequestOutsideRenderPassOperationContext();
86 .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, 129 scheduler.Record([src_buffer, dst_buffer, vk_copies](vk::CommandBuffer cmdbuf) {
87 .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
88 .buffer = handle,
89 .offset = offset,
90 .size = data_size,
91 };
92 cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 130 cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
93 0, read_barrier); 131 0, READ_BARRIER);
94 cmdbuf.CopyBuffer(staging, handle, VkBufferCopy{0, offset, data_size}); 132 cmdbuf.CopyBuffer(src_buffer, dst_buffer, vk_copies);
95 cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, UPLOAD_PIPELINE_STAGE, 0, 133 cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
96 write_barrier); 134 0, WRITE_BARRIER);
97 }); 135 });
98} 136}
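
As a usage sketch of the staging path above, assuming an already constructed BufferCacheRuntime and that the requested staging mapping starts at offset 0 of staging.buffer (which is how ReserveQuadArrayLUT below uses it), an upload could be wired roughly like this; the helper name is hypothetical and the project headers (vk_buffer_cache.h, common_types.h) are assumed to be available:

#include <array>
#include <cstring>
#include <span>

// Hypothetical helper, not part of this commit: stages 'data' and copies it into 'dst_buffer'
// through the UploadStagingBuffer + CopyBuffer path shown above.
void UploadThroughStaging(Vulkan::BufferCacheRuntime& runtime, VkBuffer dst_buffer,
                          std::span<const u8> data, u64 dst_offset) {
    const auto staging = runtime.UploadStagingBuffer(data.size());
    std::memcpy(staging.mapped_span.data(), data.data(), data.size());
    const std::array copies{VideoCommon::BufferCopy{
        .src_offset = 0, // assumption: the request maps the start of staging.buffer
        .dst_offset = dst_offset,
        .size = data.size(),
    }};
    // Note the runtime takes the destination buffer first, matching the definition above.
    runtime.CopyBuffer(dst_buffer, staging.buffer, copies);
}
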
99 137
100void Buffer::Download(std::size_t offset, std::size_t data_size, u8* data) { 138void BufferCacheRuntime::BindIndexBuffer(PrimitiveTopology topology, IndexFormat index_format,
101 auto staging = staging_pool.Request(data_size, MemoryUsage::Download); 139 u32 base_vertex, u32 num_indices, VkBuffer buffer,
102 scheduler.RequestOutsideRenderPassOperationContext(); 140 u32 offset, [[maybe_unused]] u32 size) {
141 VkIndexType index_type = MaxwellToVK::IndexFormat(index_format);
142 if (topology == PrimitiveTopology::Quads) {
143 index_type = VK_INDEX_TYPE_UINT32;
144 std::tie(buffer, offset) =
145 quad_index_pass.Assemble(index_format, num_indices, base_vertex, buffer, offset);
146 } else if (index_type == VK_INDEX_TYPE_UINT8_EXT && !device.IsExtIndexTypeUint8Supported()) {
147 index_type = VK_INDEX_TYPE_UINT16;
148 std::tie(buffer, offset) = uint8_pass.Assemble(num_indices, buffer, offset);
149 }
150 scheduler.Record([buffer, offset, index_type](vk::CommandBuffer cmdbuf) {
151 cmdbuf.BindIndexBuffer(buffer, offset, index_type);
152 });
153}
103 154
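
The fallback selection in BindIndexBuffer can be summarised with a small standalone sketch; the enum and function below are local stand-ins for the Vulkan types, purely illustrative:

#include <cstdio>

enum class IndexType { Uint8, Uint16, Uint32 };

static IndexType Select(IndexType requested, bool is_quads, bool has_uint8_ext) {
    if (is_quads) {
        return IndexType::Uint32; // the quad passes always emit 32-bit indices
    }
    if (requested == IndexType::Uint8 && !has_uint8_ext) {
        return IndexType::Uint16; // Uint8Pass widens the indices on the GPU
    }
    return requested;
}

int main() {
    std::printf("%d\n", static_cast<int>(Select(IndexType::Uint8, false, false))); // 1 (Uint16)
    std::printf("%d\n", static_cast<int>(Select(IndexType::Uint16, true, true)));  // 2 (Uint32)
}
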
104 const VkBuffer handle = Handle(); 155void BufferCacheRuntime::BindQuadArrayIndexBuffer(u32 first, u32 count) {
105 scheduler.Record( 156 ReserveQuadArrayLUT(first + count, true);
106 [staging = staging.buffer, handle, offset, data_size](vk::CommandBuffer cmdbuf) {
107 const VkBufferMemoryBarrier barrier{
108 .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
109 .pNext = nullptr,
110 .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
111 .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT,
112 .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
113 .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
114 .buffer = handle,
115 .offset = offset,
116 .size = data_size,
117 };
118
119 cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_VERTEX_SHADER_BIT |
120 VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT |
121 VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
122 VK_PIPELINE_STAGE_TRANSFER_BIT, 0, {}, barrier, {});
123 cmdbuf.CopyBuffer(handle, staging, VkBufferCopy{offset, 0, data_size});
124 });
125 scheduler.Finish();
126 157
127 std::memcpy(data, staging.mapped_span.data(), data_size); 158 // The LUT holds the quad-to-triangle index pattern replicated four times, one copy per possible 'first % 4' value,
159 // so applying a 'first' offset reduces to selecting the copy given by the modulus plus a whole-quad offset within it.
160 const VkIndexType index_type = quad_array_lut_index_type;
161 const size_t sub_first_offset = static_cast<size_t>(first % 4) * (current_num_indices / 4);
162 const size_t offset = (sub_first_offset + first / 4) * 6ULL * BytesPerIndex(index_type);
163 scheduler.Record([buffer = *quad_array_lut, index_type, offset](vk::CommandBuffer cmdbuf) {
164 cmdbuf.BindIndexBuffer(buffer, offset, index_type);
165 });
128} 166}
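
For a concrete feel of the offset computation above, the following standalone sketch evaluates it for a uint16 LUT sized for 1024 indices (256 quads per copy) and a draw whose first vertex is 5:

#include <cstddef>
#include <cstdio>

int main() {
    const std::size_t bytes_per_index = 2;        // VK_INDEX_TYPE_UINT16
    const std::size_t current_num_indices = 1024; // 256 quads per first-offset copy
    const std::size_t first = 5;                  // first % 4 == 1, first / 4 == 1
    // Jump to copy 1 (whose indices are already shifted by +1), then skip one whole quad.
    const std::size_t sub_first_offset = (first % 4) * (current_num_indices / 4);
    const std::size_t offset = (sub_first_offset + first / 4) * 6 * bytes_per_index;
    std::printf("bind offset = %zu bytes\n", offset); // (256 + 1) * 6 * 2 = 3084
}
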
129 167
130void Buffer::CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset, 168void BufferCacheRuntime::BindVertexBuffer(u32 index, VkBuffer buffer, u32 offset, u32 size,
131 std::size_t copy_size) { 169 u32 stride) {
132 scheduler.RequestOutsideRenderPassOperationContext(); 170 if (device.IsExtExtendedDynamicStateSupported()) {
171 scheduler.Record([index, buffer, offset, size, stride](vk::CommandBuffer cmdbuf) {
172 const VkDeviceSize vk_offset = offset;
173 const VkDeviceSize vk_size = buffer != VK_NULL_HANDLE ? size : VK_WHOLE_SIZE;
174 const VkDeviceSize vk_stride = stride;
175 cmdbuf.BindVertexBuffers2EXT(index, 1, &buffer, &vk_offset, &vk_size, &vk_stride);
176 });
177 } else {
178 scheduler.Record([index, buffer, offset](vk::CommandBuffer cmdbuf) {
179 cmdbuf.BindVertexBuffer(index, buffer, offset);
180 });
181 }
182}
133 183
134 const VkBuffer dst_buffer = Handle(); 184void BufferCacheRuntime::BindTransformFeedbackBuffer(u32 index, VkBuffer buffer, u32 offset,
135 scheduler.Record([src_buffer = src.Handle(), dst_buffer, src_offset, dst_offset, 185 u32 size) {
136 copy_size](vk::CommandBuffer cmdbuf) { 186 if (!device.IsExtTransformFeedbackSupported()) {
137 cmdbuf.CopyBuffer(src_buffer, dst_buffer, VkBufferCopy{src_offset, dst_offset, copy_size}); 187 // Already logged in the rasterizer
138 188 return;
139 std::array<VkBufferMemoryBarrier, 2> barriers; 189 }
140 barriers[0].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; 190 scheduler.Record([index, buffer, offset, size](vk::CommandBuffer cmdbuf) {
141 barriers[0].pNext = nullptr; 191 const VkDeviceSize vk_offset = offset;
142 barriers[0].srcAccessMask = VK_ACCESS_TRANSFER_READ_BIT; 192 const VkDeviceSize vk_size = size;
143 barriers[0].dstAccessMask = VK_ACCESS_SHADER_WRITE_BIT; 193 cmdbuf.BindTransformFeedbackBuffersEXT(index, 1, &buffer, &vk_offset, &vk_size);
144 barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
145 barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
146 barriers[0].buffer = src_buffer;
147 barriers[0].offset = src_offset;
148 barriers[0].size = copy_size;
149 barriers[1].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
150 barriers[1].pNext = nullptr;
151 barriers[1].srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
152 barriers[1].dstAccessMask = UPLOAD_ACCESS_BARRIERS;
153 barriers[1].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
154 barriers[1].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
155 barriers[1].buffer = dst_buffer;
156 barriers[1].offset = dst_offset;
157 barriers[1].size = copy_size;
158 cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, UPLOAD_PIPELINE_STAGE, 0, {},
159 barriers, {});
160 }); 194 });
161} 195}
162 196
163VKBufferCache::VKBufferCache(VideoCore::RasterizerInterface& rasterizer_, 197void BufferCacheRuntime::BindBuffer(VkBuffer buffer, u32 offset, u32 size) {
164 Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_, 198 update_descriptor_queue.AddBuffer(buffer, offset, size);
165 const Device& device_, MemoryAllocator& memory_allocator_,
166 VKScheduler& scheduler_, VKStreamBuffer& stream_buffer_,
167 StagingBufferPool& staging_pool_)
168 : VideoCommon::BufferCache<Buffer, VkBuffer, VKStreamBuffer>{rasterizer_, gpu_memory_,
169 cpu_memory_, stream_buffer_},
170 device{device_}, memory_allocator{memory_allocator_}, scheduler{scheduler_},
171 staging_pool{staging_pool_} {}
172
173VKBufferCache::~VKBufferCache() = default;
174
175std::shared_ptr<Buffer> VKBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) {
176 return std::make_shared<Buffer>(device, memory_allocator, scheduler, staging_pool, cpu_addr,
177 size);
178} 199}
179 200
180VKBufferCache::BufferInfo VKBufferCache::GetEmptyBuffer(std::size_t size) { 201void BufferCacheRuntime::ReserveQuadArrayLUT(u32 num_indices, bool wait_for_idle) {
181 size = std::max(size, std::size_t(4)); 202 if (num_indices <= current_num_indices) {
182 const auto& empty = staging_pool.Request(size, MemoryUsage::DeviceLocal); 203 return;
204 }
205 if (wait_for_idle) {
206 scheduler.Finish();
207 }
208 current_num_indices = num_indices;
209 quad_array_lut_index_type = IndexTypeFromNumElements(device, num_indices);
210
211 const u32 num_quads = num_indices / 4;
212 const u32 num_triangle_indices = num_quads * 6;
213 const u32 num_first_offset_copies = 4;
214 const size_t bytes_per_index = BytesPerIndex(quad_array_lut_index_type);
215 const size_t size_bytes = num_triangle_indices * bytes_per_index * num_first_offset_copies;
216 quad_array_lut = device.GetLogical().CreateBuffer(VkBufferCreateInfo{
217 .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
218 .pNext = nullptr,
219 .flags = 0,
220 .size = size_bytes,
221 .usage = VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT,
222 .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
223 .queueFamilyIndexCount = 0,
224 .pQueueFamilyIndices = nullptr,
225 });
226 if (device.HasDebuggingToolAttached()) {
227 quad_array_lut.SetObjectNameEXT("Quad LUT");
228 }
229 quad_array_lut_commit = memory_allocator.Commit(quad_array_lut, MemoryUsage::DeviceLocal);
230
231 const StagingBufferRef staging = staging_pool.Request(size_bytes, MemoryUsage::Upload);
232 u8* staging_data = staging.mapped_span.data();
233 const size_t quad_size = bytes_per_index * 6;
234 for (u32 first = 0; first < num_first_offset_copies; ++first) {
235 for (u32 quad = 0; quad < num_quads; ++quad) {
236 switch (quad_array_lut_index_type) {
237 case VK_INDEX_TYPE_UINT8_EXT:
238 std::memcpy(staging_data, MakeQuadIndices<u8>(quad, first).data(), quad_size);
239 break;
240 case VK_INDEX_TYPE_UINT16:
241 std::memcpy(staging_data, MakeQuadIndices<u16>(quad, first).data(), quad_size);
242 break;
243 case VK_INDEX_TYPE_UINT32:
244 std::memcpy(staging_data, MakeQuadIndices<u32>(quad, first).data(), quad_size);
245 break;
246 default:
247 UNREACHABLE();
248 break;
249 }
250 staging_data += quad_size;
251 }
252 }
183 scheduler.RequestOutsideRenderPassOperationContext(); 253 scheduler.RequestOutsideRenderPassOperationContext();
184 scheduler.Record([size, buffer = empty.buffer](vk::CommandBuffer cmdbuf) { 254 scheduler.Record([src_buffer = staging.buffer, dst_buffer = *quad_array_lut,
185 cmdbuf.FillBuffer(buffer, 0, size, 0); 255 size_bytes](vk::CommandBuffer cmdbuf) {
256 const VkBufferCopy copy{
257 .srcOffset = 0,
258 .dstOffset = 0,
259 .size = size_bytes,
260 };
261 const VkBufferMemoryBarrier write_barrier{
262 .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
263 .pNext = nullptr,
264 .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
265 .dstAccessMask = VK_ACCESS_INDEX_READ_BIT,
266 .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
267 .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
268 .buffer = dst_buffer,
269 .offset = 0,
270 .size = size_bytes,
271 };
272 cmdbuf.CopyBuffer(src_buffer, dst_buffer, copy);
273 cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_VERTEX_INPUT_BIT,
274 0, write_barrier);
186 }); 275 });
187 return {empty.buffer, 0, 0};
188} 276}
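
The resulting LUT layout can be reproduced host-side with a short standalone sketch; for num_indices = 8 (two quads) and uint16 indices it writes four copies of the expanded pattern, each copy shifted by its own first-offset, mirroring the staging fill loop above:

#include <array>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const std::uint32_t num_quads = 2; // num_indices = 8
    const std::uint32_t num_first_offset_copies = 4;
    std::vector<std::uint16_t> lut;
    for (std::uint32_t first = 0; first < num_first_offset_copies; ++first) {
        for (std::uint32_t quad = 0; quad < num_quads; ++quad) {
            for (std::uint16_t index : std::array<std::uint16_t, 6>{0, 1, 2, 0, 2, 3}) {
                lut.push_back(static_cast<std::uint16_t>(first + index + quad * 4));
            }
        }
    }
    // 4 copies * 2 quads * 6 indices = 48 entries; copy 1 begins with 1 2 3 1 3 4.
    std::printf("entries = %zu, copy 1 starts at element %u\n", lut.size(), num_quads * 6);
}
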
189 277
190} // namespace Vulkan 278} // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.h b/src/video_core/renderer_vulkan/vk_buffer_cache.h
index 41d577510..d232e1f2d 100644
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.h
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h
@@ -4,69 +4,112 @@
4 4
5#pragma once 5#pragma once
6 6
7#include <memory>
8
9#include "common/common_types.h"
10#include "video_core/buffer_cache/buffer_cache.h" 7#include "video_core/buffer_cache/buffer_cache.h"
8#include "video_core/engines/maxwell_3d.h"
9#include "video_core/renderer_vulkan/vk_compute_pass.h"
11#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" 10#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h"
12#include "video_core/renderer_vulkan/vk_stream_buffer.h"
13#include "video_core/vulkan_common/vulkan_memory_allocator.h" 11#include "video_core/vulkan_common/vulkan_memory_allocator.h"
14#include "video_core/vulkan_common/vulkan_wrapper.h" 12#include "video_core/vulkan_common/vulkan_wrapper.h"
15 13
16namespace Vulkan { 14namespace Vulkan {
17 15
18class Device; 16class Device;
17class VKDescriptorPool;
19class VKScheduler; 18class VKScheduler;
19class VKUpdateDescriptorQueue;
20 20
21class Buffer final : public VideoCommon::BufferBlock { 21class BufferCacheRuntime;
22public:
23 explicit Buffer(const Device& device, MemoryAllocator& memory_allocator, VKScheduler& scheduler,
24 StagingBufferPool& staging_pool, VAddr cpu_addr_, std::size_t size_);
25 ~Buffer();
26
27 void Upload(std::size_t offset, std::size_t data_size, const u8* data);
28
29 void Download(std::size_t offset, std::size_t data_size, u8* data);
30 22
31 void CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset, 23class Buffer : public VideoCommon::BufferBase<VideoCore::RasterizerInterface> {
32 std::size_t copy_size); 24public:
25 explicit Buffer(BufferCacheRuntime&, VideoCommon::NullBufferParams null_params);
26 explicit Buffer(BufferCacheRuntime& runtime, VideoCore::RasterizerInterface& rasterizer_,
27 VAddr cpu_addr_, u64 size_bytes_);
33 28
34 VkBuffer Handle() const { 29 [[nodiscard]] VkBuffer Handle() const noexcept {
35 return *buffer; 30 return *buffer;
36 } 31 }
37 32
38 u64 Address() const { 33 operator VkBuffer() const noexcept {
39 return 0; 34 return *buffer;
40 } 35 }
41 36
42private: 37private:
43 const Device& device;
44 VKScheduler& scheduler;
45 StagingBufferPool& staging_pool;
46
47 vk::Buffer buffer; 38 vk::Buffer buffer;
48 MemoryCommit commit; 39 MemoryCommit commit;
49}; 40};
50 41
51class VKBufferCache final : public VideoCommon::BufferCache<Buffer, VkBuffer, VKStreamBuffer> { 42class BufferCacheRuntime {
43 friend Buffer;
44
45 using PrimitiveTopology = Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology;
46 using IndexFormat = Tegra::Engines::Maxwell3D::Regs::IndexFormat;
47
52public: 48public:
53 explicit VKBufferCache(VideoCore::RasterizerInterface& rasterizer, 49 explicit BufferCacheRuntime(const Device& device_, MemoryAllocator& memory_manager_,
54 Tegra::MemoryManager& gpu_memory, Core::Memory::Memory& cpu_memory, 50 VKScheduler& scheduler_, StagingBufferPool& staging_pool_,
55 const Device& device, MemoryAllocator& memory_allocator, 51 VKUpdateDescriptorQueue& update_descriptor_queue_,
56 VKScheduler& scheduler, VKStreamBuffer& stream_buffer, 52 VKDescriptorPool& descriptor_pool);
57 StagingBufferPool& staging_pool); 53
58 ~VKBufferCache(); 54 void Finish();
55
56 [[nodiscard]] StagingBufferRef UploadStagingBuffer(size_t size);
57
58 [[nodiscard]] StagingBufferRef DownloadStagingBuffer(size_t size);
59 59
60 BufferInfo GetEmptyBuffer(std::size_t size) override; 60 void CopyBuffer(VkBuffer src_buffer, VkBuffer dst_buffer,
61 std::span<const VideoCommon::BufferCopy> copies);
61 62
62protected: 63 void BindIndexBuffer(PrimitiveTopology topology, IndexFormat index_format, u32 num_indices,
63 std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) override; 64 u32 base_vertex, VkBuffer buffer, u32 offset, u32 size);
65
66 void BindQuadArrayIndexBuffer(u32 first, u32 count);
67
68 void BindVertexBuffer(u32 index, VkBuffer buffer, u32 offset, u32 size, u32 stride);
69
70 void BindTransformFeedbackBuffer(u32 index, VkBuffer buffer, u32 offset, u32 size);
71
72 void BindUniformBuffer(VkBuffer buffer, u32 offset, u32 size) {
73 BindBuffer(buffer, offset, size);
74 }
75
76 void BindStorageBuffer(VkBuffer buffer, u32 offset, u32 size,
77 [[maybe_unused]] bool is_written) {
78 BindBuffer(buffer, offset, size);
79 }
64 80
65private: 81private:
82 void BindBuffer(VkBuffer buffer, u32 offset, u32 size);
83
84 void ReserveQuadArrayLUT(u32 num_indices, bool wait_for_idle);
85
66 const Device& device; 86 const Device& device;
67 MemoryAllocator& memory_allocator; 87 MemoryAllocator& memory_allocator;
68 VKScheduler& scheduler; 88 VKScheduler& scheduler;
69 StagingBufferPool& staging_pool; 89 StagingBufferPool& staging_pool;
90 VKUpdateDescriptorQueue& update_descriptor_queue;
91
92 vk::Buffer quad_array_lut;
93 MemoryCommit quad_array_lut_commit;
94 VkIndexType quad_array_lut_index_type{};
95 u32 current_num_indices = 0;
96
97 Uint8Pass uint8_pass;
98 QuadIndexedPass quad_index_pass;
70}; 99};
71 100
101struct BufferCacheParams {
102 using Runtime = Vulkan::BufferCacheRuntime;
103 using Buffer = Vulkan::Buffer;
104
105 static constexpr bool IS_OPENGL = false;
106 static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS = false;
107 static constexpr bool HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT = false;
108 static constexpr bool NEEDS_BIND_UNIFORM_INDEX = false;
109 static constexpr bool NEEDS_BIND_STORAGE_INDEX = false;
110 static constexpr bool USE_MEMORY_MAPS = true;
111};
112
113using BufferCache = VideoCommon::BufferCache<BufferCacheParams>;
114
72} // namespace Vulkan 115} // namespace Vulkan
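
Roughly, these pieces compose as sketched below; the construction arguments mirror the RasterizerVulkan changes later in this diff, and the snippet is illustrative wiring (it assumes the usual video_core objects already exist) rather than a drop-in block. USE_MEMORY_MAPS = true is what routes the generic cache through the UploadStagingBuffer/DownloadStagingBuffer pair implemented by the runtime.

// Illustrative wiring only; assumes device, memory_allocator, scheduler, staging_pool,
// update_descriptor_queue, descriptor_pool, rasterizer, maxwell3d, kepler_compute,
// gpu_memory and cpu_memory are already constructed (see vk_rasterizer.cpp below).
Vulkan::BufferCacheRuntime buffer_cache_runtime(device, memory_allocator, scheduler, staging_pool,
                                                update_descriptor_queue, descriptor_pool);
Vulkan::BufferCache buffer_cache(rasterizer, maxwell3d, kepler_compute, gpu_memory, cpu_memory,
                                 buffer_cache_runtime);

// Per compute dispatch the rasterizer drives the generic cache, which calls back into the
// runtime's Bind*/Copy* hooks (mirroring RasterizerVulkan::DispatchCompute in this diff):
buffer_cache.SetEnabledComputeUniformBuffers(entries.enabled_uniform_buffers);
buffer_cache.UnbindComputeStorageBuffers();
buffer_cache.BindComputeStorageBuffer(ssbo_index, cbuf_index, cbuf_offset, is_written);
buffer_cache.UpdateComputeBuffers();
buffer_cache.BindHostComputeBuffers();
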
diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp
index 5eb6a54be..a4fdcdf81 100644
--- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp
+++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp
@@ -10,7 +10,6 @@
10#include "common/alignment.h" 10#include "common/alignment.h"
11#include "common/assert.h" 11#include "common/assert.h"
12#include "common/common_types.h" 12#include "common/common_types.h"
13#include "video_core/host_shaders/vulkan_quad_array_comp_spv.h"
14#include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h" 13#include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h"
15#include "video_core/host_shaders/vulkan_uint8_comp_spv.h" 14#include "video_core/host_shaders/vulkan_uint8_comp_spv.h"
16#include "video_core/renderer_vulkan/vk_compute_pass.h" 15#include "video_core/renderer_vulkan/vk_compute_pass.h"
@@ -22,30 +21,7 @@
22#include "video_core/vulkan_common/vulkan_wrapper.h" 21#include "video_core/vulkan_common/vulkan_wrapper.h"
23 22
24namespace Vulkan { 23namespace Vulkan {
25
26namespace { 24namespace {
27
28VkDescriptorSetLayoutBinding BuildQuadArrayPassDescriptorSetLayoutBinding() {
29 return {
30 .binding = 0,
31 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
32 .descriptorCount = 1,
33 .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
34 .pImmutableSamplers = nullptr,
35 };
36}
37
38VkDescriptorUpdateTemplateEntryKHR BuildQuadArrayPassDescriptorUpdateTemplateEntry() {
39 return {
40 .dstBinding = 0,
41 .dstArrayElement = 0,
42 .descriptorCount = 1,
43 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
44 .offset = 0,
45 .stride = sizeof(DescriptorUpdateEntry),
46 };
47}
48
49VkPushConstantRange BuildComputePushConstantRange(std::size_t size) { 25VkPushConstantRange BuildComputePushConstantRange(std::size_t size) {
50 return { 26 return {
51 .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, 27 .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
@@ -162,55 +138,6 @@ VkDescriptorSet VKComputePass::CommitDescriptorSet(
162 return set; 138 return set;
163} 139}
164 140
165QuadArrayPass::QuadArrayPass(const Device& device_, VKScheduler& scheduler_,
166 VKDescriptorPool& descriptor_pool_,
167 StagingBufferPool& staging_buffer_pool_,
168 VKUpdateDescriptorQueue& update_descriptor_queue_)
169 : VKComputePass(device_, descriptor_pool_, BuildQuadArrayPassDescriptorSetLayoutBinding(),
170 BuildQuadArrayPassDescriptorUpdateTemplateEntry(),
171 BuildComputePushConstantRange(sizeof(u32)), VULKAN_QUAD_ARRAY_COMP_SPV),
172 scheduler{scheduler_}, staging_buffer_pool{staging_buffer_pool_},
173 update_descriptor_queue{update_descriptor_queue_} {}
174
175QuadArrayPass::~QuadArrayPass() = default;
176
177std::pair<VkBuffer, VkDeviceSize> QuadArrayPass::Assemble(u32 num_vertices, u32 first) {
178 const u32 num_triangle_vertices = (num_vertices / 4) * 6;
179 const std::size_t staging_size = num_triangle_vertices * sizeof(u32);
180 const auto staging_ref = staging_buffer_pool.Request(staging_size, MemoryUsage::DeviceLocal);
181
182 update_descriptor_queue.Acquire();
183 update_descriptor_queue.AddBuffer(staging_ref.buffer, 0, staging_size);
184 const VkDescriptorSet set = CommitDescriptorSet(update_descriptor_queue);
185
186 scheduler.RequestOutsideRenderPassOperationContext();
187
188 ASSERT(num_vertices % 4 == 0);
189 const u32 num_quads = num_vertices / 4;
190 scheduler.Record([layout = *layout, pipeline = *pipeline, buffer = staging_ref.buffer,
191 num_quads, first, set](vk::CommandBuffer cmdbuf) {
192 constexpr u32 dispatch_size = 1024;
193 cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, pipeline);
194 cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, layout, 0, set, {});
195 cmdbuf.PushConstants(layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(first), &first);
196 cmdbuf.Dispatch(Common::AlignUp(num_quads, dispatch_size) / dispatch_size, 1, 1);
197
198 VkBufferMemoryBarrier barrier;
199 barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
200 barrier.pNext = nullptr;
201 barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
202 barrier.dstAccessMask = VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT;
203 barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
204 barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
205 barrier.buffer = buffer;
206 barrier.offset = 0;
207 barrier.size = static_cast<VkDeviceSize>(num_quads) * 6 * sizeof(u32);
208 cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
209 VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, {}, {barrier}, {});
210 });
211 return {staging_ref.buffer, 0};
212}
213
214Uint8Pass::Uint8Pass(const Device& device, VKScheduler& scheduler_, 141Uint8Pass::Uint8Pass(const Device& device, VKScheduler& scheduler_,
215 VKDescriptorPool& descriptor_pool, StagingBufferPool& staging_buffer_pool_, 142 VKDescriptorPool& descriptor_pool, StagingBufferPool& staging_buffer_pool_,
216 VKUpdateDescriptorQueue& update_descriptor_queue_) 143 VKUpdateDescriptorQueue& update_descriptor_queue_)
@@ -221,18 +148,18 @@ Uint8Pass::Uint8Pass(const Device& device, VKScheduler& scheduler_,
221 148
222Uint8Pass::~Uint8Pass() = default; 149Uint8Pass::~Uint8Pass() = default;
223 150
224std::pair<VkBuffer, u64> Uint8Pass::Assemble(u32 num_vertices, VkBuffer src_buffer, 151std::pair<VkBuffer, u32> Uint8Pass::Assemble(u32 num_vertices, VkBuffer src_buffer,
225 u64 src_offset) { 152 u32 src_offset) {
226 const u32 staging_size = static_cast<u32>(num_vertices * sizeof(u16)); 153 const u32 staging_size = static_cast<u32>(num_vertices * sizeof(u16));
227 const auto staging_ref = staging_buffer_pool.Request(staging_size, MemoryUsage::DeviceLocal); 154 const auto staging = staging_buffer_pool.Request(staging_size, MemoryUsage::DeviceLocal);
228 155
229 update_descriptor_queue.Acquire(); 156 update_descriptor_queue.Acquire();
230 update_descriptor_queue.AddBuffer(src_buffer, src_offset, num_vertices); 157 update_descriptor_queue.AddBuffer(src_buffer, src_offset, num_vertices);
231 update_descriptor_queue.AddBuffer(staging_ref.buffer, 0, staging_size); 158 update_descriptor_queue.AddBuffer(staging.buffer, 0, staging_size);
232 const VkDescriptorSet set = CommitDescriptorSet(update_descriptor_queue); 159 const VkDescriptorSet set = CommitDescriptorSet(update_descriptor_queue);
233 160
234 scheduler.RequestOutsideRenderPassOperationContext(); 161 scheduler.RequestOutsideRenderPassOperationContext();
235 scheduler.Record([layout = *layout, pipeline = *pipeline, buffer = staging_ref.buffer, set, 162 scheduler.Record([layout = *layout, pipeline = *pipeline, buffer = staging.buffer, set,
236 num_vertices](vk::CommandBuffer cmdbuf) { 163 num_vertices](vk::CommandBuffer cmdbuf) {
237 constexpr u32 dispatch_size = 1024; 164 constexpr u32 dispatch_size = 1024;
238 cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); 165 cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, pipeline);
@@ -252,7 +179,7 @@ std::pair<VkBuffer, u64> Uint8Pass::Assemble(u32 num_vertices, VkBuffer src_buff
252 cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 179 cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
253 VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, {}, barrier, {}); 180 VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, {}, barrier, {});
254 }); 181 });
255 return {staging_ref.buffer, 0}; 182 return {staging.buffer, 0};
256} 183}
257 184
258QuadIndexedPass::QuadIndexedPass(const Device& device_, VKScheduler& scheduler_, 185QuadIndexedPass::QuadIndexedPass(const Device& device_, VKScheduler& scheduler_,
@@ -267,9 +194,9 @@ QuadIndexedPass::QuadIndexedPass(const Device& device_, VKScheduler& scheduler_,
267 194
268QuadIndexedPass::~QuadIndexedPass() = default; 195QuadIndexedPass::~QuadIndexedPass() = default;
269 196
270std::pair<VkBuffer, u64> QuadIndexedPass::Assemble( 197std::pair<VkBuffer, u32> QuadIndexedPass::Assemble(
271 Tegra::Engines::Maxwell3D::Regs::IndexFormat index_format, u32 num_vertices, u32 base_vertex, 198 Tegra::Engines::Maxwell3D::Regs::IndexFormat index_format, u32 num_vertices, u32 base_vertex,
272 VkBuffer src_buffer, u64 src_offset) { 199 VkBuffer src_buffer, u32 src_offset) {
273 const u32 index_shift = [index_format] { 200 const u32 index_shift = [index_format] {
274 switch (index_format) { 201 switch (index_format) {
275 case Tegra::Engines::Maxwell3D::Regs::IndexFormat::UnsignedByte: 202 case Tegra::Engines::Maxwell3D::Regs::IndexFormat::UnsignedByte:
@@ -286,15 +213,15 @@ std::pair<VkBuffer, u64> QuadIndexedPass::Assemble(
286 const u32 num_tri_vertices = (num_vertices / 4) * 6; 213 const u32 num_tri_vertices = (num_vertices / 4) * 6;
287 214
288 const std::size_t staging_size = num_tri_vertices * sizeof(u32); 215 const std::size_t staging_size = num_tri_vertices * sizeof(u32);
289 const auto staging_ref = staging_buffer_pool.Request(staging_size, MemoryUsage::DeviceLocal); 216 const auto staging = staging_buffer_pool.Request(staging_size, MemoryUsage::DeviceLocal);
290 217
291 update_descriptor_queue.Acquire(); 218 update_descriptor_queue.Acquire();
292 update_descriptor_queue.AddBuffer(src_buffer, src_offset, input_size); 219 update_descriptor_queue.AddBuffer(src_buffer, src_offset, input_size);
293 update_descriptor_queue.AddBuffer(staging_ref.buffer, 0, staging_size); 220 update_descriptor_queue.AddBuffer(staging.buffer, 0, staging_size);
294 const VkDescriptorSet set = CommitDescriptorSet(update_descriptor_queue); 221 const VkDescriptorSet set = CommitDescriptorSet(update_descriptor_queue);
295 222
296 scheduler.RequestOutsideRenderPassOperationContext(); 223 scheduler.RequestOutsideRenderPassOperationContext();
297 scheduler.Record([layout = *layout, pipeline = *pipeline, buffer = staging_ref.buffer, set, 224 scheduler.Record([layout = *layout, pipeline = *pipeline, buffer = staging.buffer, set,
298 num_tri_vertices, base_vertex, index_shift](vk::CommandBuffer cmdbuf) { 225 num_tri_vertices, base_vertex, index_shift](vk::CommandBuffer cmdbuf) {
299 static constexpr u32 dispatch_size = 1024; 226 static constexpr u32 dispatch_size = 1024;
300 const std::array push_constants = {base_vertex, index_shift}; 227 const std::array push_constants = {base_vertex, index_shift};
@@ -317,7 +244,7 @@ std::pair<VkBuffer, u64> QuadIndexedPass::Assemble(
317 cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 244 cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
318 VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, {}, barrier, {}); 245 VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, {}, barrier, {});
319 }); 246 });
320 return {staging_ref.buffer, 0}; 247 return {staging.buffer, 0};
321} 248}
322 249
323} // namespace Vulkan 250} // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.h b/src/video_core/renderer_vulkan/vk_compute_pass.h
index f5c6f5f17..4904019f5 100644
--- a/src/video_core/renderer_vulkan/vk_compute_pass.h
+++ b/src/video_core/renderer_vulkan/vk_compute_pass.h
@@ -41,22 +41,6 @@ private:
41 vk::ShaderModule module; 41 vk::ShaderModule module;
42}; 42};
43 43
44class QuadArrayPass final : public VKComputePass {
45public:
46 explicit QuadArrayPass(const Device& device_, VKScheduler& scheduler_,
47 VKDescriptorPool& descriptor_pool_,
48 StagingBufferPool& staging_buffer_pool_,
49 VKUpdateDescriptorQueue& update_descriptor_queue_);
50 ~QuadArrayPass();
51
52 std::pair<VkBuffer, VkDeviceSize> Assemble(u32 num_vertices, u32 first);
53
54private:
55 VKScheduler& scheduler;
56 StagingBufferPool& staging_buffer_pool;
57 VKUpdateDescriptorQueue& update_descriptor_queue;
58};
59
60class Uint8Pass final : public VKComputePass { 44class Uint8Pass final : public VKComputePass {
61public: 45public:
62 explicit Uint8Pass(const Device& device_, VKScheduler& scheduler_, 46 explicit Uint8Pass(const Device& device_, VKScheduler& scheduler_,
@@ -64,7 +48,9 @@ public:
64 VKUpdateDescriptorQueue& update_descriptor_queue_); 48 VKUpdateDescriptorQueue& update_descriptor_queue_);
65 ~Uint8Pass(); 49 ~Uint8Pass();
66 50
67 std::pair<VkBuffer, u64> Assemble(u32 num_vertices, VkBuffer src_buffer, u64 src_offset); 51 /// Assembles uint8 indices into a uint16 index buffer
52 /// Returns a pair of the staging buffer and the offset where the assembled data begins
53 std::pair<VkBuffer, u32> Assemble(u32 num_vertices, VkBuffer src_buffer, u32 src_offset);
68 54
69private: 55private:
70 VKScheduler& scheduler; 56 VKScheduler& scheduler;
@@ -80,9 +66,9 @@ public:
80 VKUpdateDescriptorQueue& update_descriptor_queue_); 66 VKUpdateDescriptorQueue& update_descriptor_queue_);
81 ~QuadIndexedPass(); 67 ~QuadIndexedPass();
82 68
83 std::pair<VkBuffer, u64> Assemble(Tegra::Engines::Maxwell3D::Regs::IndexFormat index_format, 69 std::pair<VkBuffer, u32> Assemble(Tegra::Engines::Maxwell3D::Regs::IndexFormat index_format,
84 u32 num_vertices, u32 base_vertex, VkBuffer src_buffer, 70 u32 num_vertices, u32 base_vertex, VkBuffer src_buffer,
85 u64 src_offset); 71 u32 src_offset);
86 72
87private: 73private:
88 VKScheduler& scheduler; 74 VKScheduler& scheduler;
diff --git a/src/video_core/renderer_vulkan/vk_fence_manager.cpp b/src/video_core/renderer_vulkan/vk_fence_manager.cpp
index 6cd00884d..3bec48d14 100644
--- a/src/video_core/renderer_vulkan/vk_fence_manager.cpp
+++ b/src/video_core/renderer_vulkan/vk_fence_manager.cpp
@@ -45,8 +45,8 @@ void InnerFence::Wait() {
45} 45}
46 46
47VKFenceManager::VKFenceManager(VideoCore::RasterizerInterface& rasterizer_, Tegra::GPU& gpu_, 47VKFenceManager::VKFenceManager(VideoCore::RasterizerInterface& rasterizer_, Tegra::GPU& gpu_,
48 Tegra::MemoryManager& memory_manager_, TextureCache& texture_cache_, 48 TextureCache& texture_cache_, BufferCache& buffer_cache_,
49 VKBufferCache& buffer_cache_, VKQueryCache& query_cache_, 49 VKQueryCache& query_cache_, const Device& device_,
50 VKScheduler& scheduler_) 50 VKScheduler& scheduler_)
51 : GenericFenceManager{rasterizer_, gpu_, texture_cache_, buffer_cache_, query_cache_}, 51 : GenericFenceManager{rasterizer_, gpu_, texture_cache_, buffer_cache_, query_cache_},
52 scheduler{scheduler_} {} 52 scheduler{scheduler_} {}
diff --git a/src/video_core/renderer_vulkan/vk_fence_manager.h b/src/video_core/renderer_vulkan/vk_fence_manager.h
index 9c5e5aa8f..2f8322d29 100644
--- a/src/video_core/renderer_vulkan/vk_fence_manager.h
+++ b/src/video_core/renderer_vulkan/vk_fence_manager.h
@@ -22,7 +22,6 @@ class RasterizerInterface;
22namespace Vulkan { 22namespace Vulkan {
23 23
24class Device; 24class Device;
25class VKBufferCache;
26class VKQueryCache; 25class VKQueryCache;
27class VKScheduler; 26class VKScheduler;
28 27
@@ -45,14 +44,14 @@ private:
45using Fence = std::shared_ptr<InnerFence>; 44using Fence = std::shared_ptr<InnerFence>;
46 45
47using GenericFenceManager = 46using GenericFenceManager =
48 VideoCommon::FenceManager<Fence, TextureCache, VKBufferCache, VKQueryCache>; 47 VideoCommon::FenceManager<Fence, TextureCache, BufferCache, VKQueryCache>;
49 48
50class VKFenceManager final : public GenericFenceManager { 49class VKFenceManager final : public GenericFenceManager {
51public: 50public:
52 explicit VKFenceManager(VideoCore::RasterizerInterface& rasterizer_, Tegra::GPU& gpu_, 51 explicit VKFenceManager(VideoCore::RasterizerInterface& rasterizer, Tegra::GPU& gpu,
53 Tegra::MemoryManager& memory_manager_, TextureCache& texture_cache_, 52 TextureCache& texture_cache, BufferCache& buffer_cache,
54 VKBufferCache& buffer_cache_, VKQueryCache& query_cache_, 53 VKQueryCache& query_cache, const Device& device,
55 VKScheduler& scheduler_); 54 VKScheduler& scheduler);
56 55
57protected: 56protected:
58 Fence CreateFence(u32 value, bool is_stubbed) override; 57 Fence CreateFence(u32 value, bool is_stubbed) override;
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index f0a111829..684d4e3a6 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -8,8 +8,6 @@
8#include <mutex> 8#include <mutex>
9#include <vector> 9#include <vector>
10 10
11#include <boost/container/static_vector.hpp>
12
13#include "common/alignment.h" 11#include "common/alignment.h"
14#include "common/assert.h" 12#include "common/assert.h"
15#include "common/logging/log.h" 13#include "common/logging/log.h"
@@ -24,7 +22,6 @@
24#include "video_core/renderer_vulkan/maxwell_to_vk.h" 22#include "video_core/renderer_vulkan/maxwell_to_vk.h"
25#include "video_core/renderer_vulkan/renderer_vulkan.h" 23#include "video_core/renderer_vulkan/renderer_vulkan.h"
26#include "video_core/renderer_vulkan/vk_buffer_cache.h" 24#include "video_core/renderer_vulkan/vk_buffer_cache.h"
27#include "video_core/renderer_vulkan/vk_compute_pass.h"
28#include "video_core/renderer_vulkan/vk_compute_pipeline.h" 25#include "video_core/renderer_vulkan/vk_compute_pipeline.h"
29#include "video_core/renderer_vulkan/vk_descriptor_pool.h" 26#include "video_core/renderer_vulkan/vk_descriptor_pool.h"
30#include "video_core/renderer_vulkan/vk_graphics_pipeline.h" 27#include "video_core/renderer_vulkan/vk_graphics_pipeline.h"
@@ -50,15 +47,16 @@ MICROPROFILE_DEFINE(Vulkan_WaitForWorker, "Vulkan", "Wait for worker", MP_RGB(25
50MICROPROFILE_DEFINE(Vulkan_Drawing, "Vulkan", "Record drawing", MP_RGB(192, 128, 128)); 47MICROPROFILE_DEFINE(Vulkan_Drawing, "Vulkan", "Record drawing", MP_RGB(192, 128, 128));
51MICROPROFILE_DEFINE(Vulkan_Compute, "Vulkan", "Record compute", MP_RGB(192, 128, 128)); 48MICROPROFILE_DEFINE(Vulkan_Compute, "Vulkan", "Record compute", MP_RGB(192, 128, 128));
52MICROPROFILE_DEFINE(Vulkan_Clearing, "Vulkan", "Record clearing", MP_RGB(192, 128, 128)); 49MICROPROFILE_DEFINE(Vulkan_Clearing, "Vulkan", "Record clearing", MP_RGB(192, 128, 128));
53MICROPROFILE_DEFINE(Vulkan_Geometry, "Vulkan", "Setup geometry", MP_RGB(192, 128, 128));
54MICROPROFILE_DEFINE(Vulkan_ConstBuffers, "Vulkan", "Setup constant buffers", MP_RGB(192, 128, 128));
55MICROPROFILE_DEFINE(Vulkan_GlobalBuffers, "Vulkan", "Setup global buffers", MP_RGB(192, 128, 128));
56MICROPROFILE_DEFINE(Vulkan_RenderTargets, "Vulkan", "Setup render targets", MP_RGB(192, 128, 128));
57MICROPROFILE_DEFINE(Vulkan_Textures, "Vulkan", "Setup textures", MP_RGB(192, 128, 128));
58MICROPROFILE_DEFINE(Vulkan_Images, "Vulkan", "Setup images", MP_RGB(192, 128, 128));
59MICROPROFILE_DEFINE(Vulkan_PipelineCache, "Vulkan", "Pipeline cache", MP_RGB(192, 128, 128)); 50MICROPROFILE_DEFINE(Vulkan_PipelineCache, "Vulkan", "Pipeline cache", MP_RGB(192, 128, 128));
60 51
61namespace { 52namespace {
53struct DrawParams {
54 u32 base_instance;
55 u32 num_instances;
56 u32 base_vertex;
57 u32 num_vertices;
58 bool is_indexed;
59};
62 60
63constexpr auto COMPUTE_SHADER_INDEX = static_cast<size_t>(Tegra::Engines::ShaderType::Compute); 61constexpr auto COMPUTE_SHADER_INDEX = static_cast<size_t>(Tegra::Engines::ShaderType::Compute);
64 62
@@ -67,7 +65,6 @@ VkViewport GetViewportState(const Device& device, const Maxwell& regs, size_t in
67 const float width = src.scale_x * 2.0f; 65 const float width = src.scale_x * 2.0f;
68 const float height = src.scale_y * 2.0f; 66 const float height = src.scale_y * 2.0f;
69 const float reduce_z = regs.depth_mode == Maxwell::DepthMode::MinusOneToOne ? 1.0f : 0.0f; 67 const float reduce_z = regs.depth_mode == Maxwell::DepthMode::MinusOneToOne ? 1.0f : 0.0f;
70
71 VkViewport viewport{ 68 VkViewport viewport{
72 .x = src.translate_x - src.scale_x, 69 .x = src.translate_x - src.scale_x,
73 .y = src.translate_y - src.scale_y, 70 .y = src.translate_y - src.scale_y,
@@ -76,12 +73,10 @@ VkViewport GetViewportState(const Device& device, const Maxwell& regs, size_t in
76 .minDepth = src.translate_z - src.scale_z * reduce_z, 73 .minDepth = src.translate_z - src.scale_z * reduce_z,
77 .maxDepth = src.translate_z + src.scale_z, 74 .maxDepth = src.translate_z + src.scale_z,
78 }; 75 };
79
80 if (!device.IsExtDepthRangeUnrestrictedSupported()) { 76 if (!device.IsExtDepthRangeUnrestrictedSupported()) {
81 viewport.minDepth = std::clamp(viewport.minDepth, 0.0f, 1.0f); 77 viewport.minDepth = std::clamp(viewport.minDepth, 0.0f, 1.0f);
82 viewport.maxDepth = std::clamp(viewport.maxDepth, 0.0f, 1.0f); 78 viewport.maxDepth = std::clamp(viewport.maxDepth, 0.0f, 1.0f);
83 } 79 }
84
85 return viewport; 80 return viewport;
86} 81}
87 82
@@ -146,13 +141,6 @@ TextureHandle GetTextureInfo(const Engine& engine, bool via_header_index, const
146 return TextureHandle(engine.AccessConstBuffer32(shader_type, buffer, offset), via_header_index); 141 return TextureHandle(engine.AccessConstBuffer32(shader_type, buffer, offset), via_header_index);
147} 142}
148 143
149template <size_t N>
150std::array<VkDeviceSize, N> ExpandStrides(const std::array<u16, N>& strides) {
151 std::array<VkDeviceSize, N> expanded;
152 std::copy(strides.begin(), strides.end(), expanded.begin());
153 return expanded;
154}
155
156ImageViewType ImageViewTypeFromEntry(const SamplerEntry& entry) { 144ImageViewType ImageViewTypeFromEntry(const SamplerEntry& entry) {
157 if (entry.is_buffer) { 145 if (entry.is_buffer) {
158 return ImageViewType::e2D; 146 return ImageViewType::e2D;
@@ -221,190 +209,25 @@ void PushImageDescriptors(const ShaderEntries& entries, TextureCache& texture_ca
221 } 209 }
222} 210}
223 211
224} // Anonymous namespace 212DrawParams MakeDrawParams(const Maxwell& regs, u32 num_instances, bool is_instanced,
225 213 bool is_indexed) {
226class BufferBindings final { 214 DrawParams params{
227public: 215 .base_instance = regs.vb_base_instance,
228 void AddVertexBinding(VkBuffer buffer, VkDeviceSize offset, VkDeviceSize size, u32 stride) { 216 .num_instances = is_instanced ? num_instances : 1,
229 vertex.buffers[vertex.num_buffers] = buffer; 217 .base_vertex = is_indexed ? regs.vb_element_base : regs.vertex_buffer.first,
230 vertex.offsets[vertex.num_buffers] = offset; 218 .num_vertices = is_indexed ? regs.index_array.count : regs.vertex_buffer.count,
231 vertex.sizes[vertex.num_buffers] = size; 219 .is_indexed = is_indexed,
232 vertex.strides[vertex.num_buffers] = static_cast<u16>(stride); 220 };
233 ++vertex.num_buffers; 221 if (regs.draw.topology == Maxwell::PrimitiveTopology::Quads) {
234 } 222 // 6 triangle vertices per quad, base vertex is part of the index
235 223 // See BindQuadArrayIndexBuffer for more details
236 void SetIndexBinding(VkBuffer buffer, VkDeviceSize offset, VkIndexType type) { 224 params.num_vertices = (params.num_vertices / 4) * 6;
237 index.buffer = buffer; 225 params.base_vertex = 0;
238 index.offset = offset; 226 params.is_indexed = true;
239 index.type = type;
240 }
241
242 void Bind(const Device& device, VKScheduler& scheduler) const {
243 // Use this large switch case to avoid dispatching more memory in the record lambda than
244 // what we need. It looks horrible, but it's the best we can do on standard C++.
245 switch (vertex.num_buffers) {
246 case 0:
247 return BindStatic<0>(device, scheduler);
248 case 1:
249 return BindStatic<1>(device, scheduler);
250 case 2:
251 return BindStatic<2>(device, scheduler);
252 case 3:
253 return BindStatic<3>(device, scheduler);
254 case 4:
255 return BindStatic<4>(device, scheduler);
256 case 5:
257 return BindStatic<5>(device, scheduler);
258 case 6:
259 return BindStatic<6>(device, scheduler);
260 case 7:
261 return BindStatic<7>(device, scheduler);
262 case 8:
263 return BindStatic<8>(device, scheduler);
264 case 9:
265 return BindStatic<9>(device, scheduler);
266 case 10:
267 return BindStatic<10>(device, scheduler);
268 case 11:
269 return BindStatic<11>(device, scheduler);
270 case 12:
271 return BindStatic<12>(device, scheduler);
272 case 13:
273 return BindStatic<13>(device, scheduler);
274 case 14:
275 return BindStatic<14>(device, scheduler);
276 case 15:
277 return BindStatic<15>(device, scheduler);
278 case 16:
279 return BindStatic<16>(device, scheduler);
280 case 17:
281 return BindStatic<17>(device, scheduler);
282 case 18:
283 return BindStatic<18>(device, scheduler);
284 case 19:
285 return BindStatic<19>(device, scheduler);
286 case 20:
287 return BindStatic<20>(device, scheduler);
288 case 21:
289 return BindStatic<21>(device, scheduler);
290 case 22:
291 return BindStatic<22>(device, scheduler);
292 case 23:
293 return BindStatic<23>(device, scheduler);
294 case 24:
295 return BindStatic<24>(device, scheduler);
296 case 25:
297 return BindStatic<25>(device, scheduler);
298 case 26:
299 return BindStatic<26>(device, scheduler);
300 case 27:
301 return BindStatic<27>(device, scheduler);
302 case 28:
303 return BindStatic<28>(device, scheduler);
304 case 29:
305 return BindStatic<29>(device, scheduler);
306 case 30:
307 return BindStatic<30>(device, scheduler);
308 case 31:
309 return BindStatic<31>(device, scheduler);
310 case 32:
311 return BindStatic<32>(device, scheduler);
312 }
313 UNREACHABLE();
314 }
315
316private:
317 // Some of these fields are intentionally left uninitialized to avoid initializing them twice.
318 struct {
319 size_t num_buffers = 0;
320 std::array<VkBuffer, Maxwell::NumVertexArrays> buffers;
321 std::array<VkDeviceSize, Maxwell::NumVertexArrays> offsets;
322 std::array<VkDeviceSize, Maxwell::NumVertexArrays> sizes;
323 std::array<u16, Maxwell::NumVertexArrays> strides;
324 } vertex;
325
326 struct {
327 VkBuffer buffer = nullptr;
328 VkDeviceSize offset;
329 VkIndexType type;
330 } index;
331
332 template <size_t N>
333 void BindStatic(const Device& device, VKScheduler& scheduler) const {
334 if (device.IsExtExtendedDynamicStateSupported()) {
335 if (index.buffer) {
336 BindStatic<N, true, true>(scheduler);
337 } else {
338 BindStatic<N, false, true>(scheduler);
339 }
340 } else {
341 if (index.buffer) {
342 BindStatic<N, true, false>(scheduler);
343 } else {
344 BindStatic<N, false, false>(scheduler);
345 }
346 }
347 }
348
349 template <size_t N, bool is_indexed, bool has_extended_dynamic_state>
350 void BindStatic(VKScheduler& scheduler) const {
351 static_assert(N <= Maxwell::NumVertexArrays);
352 if constexpr (N == 0) {
353 return;
354 }
355
356 std::array<VkBuffer, N> buffers;
357 std::array<VkDeviceSize, N> offsets;
358 std::copy(vertex.buffers.begin(), vertex.buffers.begin() + N, buffers.begin());
359 std::copy(vertex.offsets.begin(), vertex.offsets.begin() + N, offsets.begin());
360
361 if constexpr (has_extended_dynamic_state) {
362 // With extended dynamic states we can specify the length and stride of a vertex buffer
363 std::array<VkDeviceSize, N> sizes;
364 std::array<u16, N> strides;
365 std::copy(vertex.sizes.begin(), vertex.sizes.begin() + N, sizes.begin());
366 std::copy(vertex.strides.begin(), vertex.strides.begin() + N, strides.begin());
367
368 if constexpr (is_indexed) {
369 scheduler.Record(
370 [buffers, offsets, sizes, strides, index = index](vk::CommandBuffer cmdbuf) {
371 cmdbuf.BindIndexBuffer(index.buffer, index.offset, index.type);
372 cmdbuf.BindVertexBuffers2EXT(0, static_cast<u32>(N), buffers.data(),
373 offsets.data(), sizes.data(),
374 ExpandStrides(strides).data());
375 });
376 } else {
377 scheduler.Record([buffers, offsets, sizes, strides](vk::CommandBuffer cmdbuf) {
378 cmdbuf.BindVertexBuffers2EXT(0, static_cast<u32>(N), buffers.data(),
379 offsets.data(), sizes.data(),
380 ExpandStrides(strides).data());
381 });
382 }
383 return;
384 }
385
386 if constexpr (is_indexed) {
387 // Indexed draw
388 scheduler.Record([buffers, offsets, index = index](vk::CommandBuffer cmdbuf) {
389 cmdbuf.BindIndexBuffer(index.buffer, index.offset, index.type);
390 cmdbuf.BindVertexBuffers(0, static_cast<u32>(N), buffers.data(), offsets.data());
391 });
392 } else {
393 // Array draw
394 scheduler.Record([buffers, offsets](vk::CommandBuffer cmdbuf) {
395 cmdbuf.BindVertexBuffers(0, static_cast<u32>(N), buffers.data(), offsets.data());
396 });
397 }
398 }
399};
400
401void RasterizerVulkan::DrawParameters::Draw(vk::CommandBuffer cmdbuf) const {
402 if (is_indexed) {
403 cmdbuf.DrawIndexed(num_vertices, num_instances, 0, base_vertex, base_instance);
404 } else {
405 cmdbuf.Draw(num_vertices, num_instances, base_vertex, base_instance);
406 } 227 }
228 return params;
407} 229}
230} // Anonymous namespace
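
The quad fix-up in MakeDrawParams can be checked with a standalone sketch: a non-indexed quad draw of 8 vertices starting at base vertex 16 becomes an indexed draw of 12 indices with base vertex 0, since the original offset is baked into the quad LUT index buffer instead (names below are local to the example):

#include <cstdint>
#include <cstdio>

struct DrawParams {
    std::uint32_t base_vertex;
    std::uint32_t num_vertices;
    bool is_indexed;
};

int main() {
    DrawParams params{.base_vertex = 16, .num_vertices = 8, .is_indexed = false};
    // Quads topology: every 4 quad vertices expand to 6 triangle indices and the draw is
    // forced onto the indexed path, as in MakeDrawParams above.
    params.num_vertices = (params.num_vertices / 4) * 6; // 8 quad vertices -> 12 indices
    params.base_vertex = 0;
    params.is_indexed = true;
    std::printf("num_vertices=%u base_vertex=%u indexed=%d\n", params.num_vertices,
                params.base_vertex, params.is_indexed);
}
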
408 231
409RasterizerVulkan::RasterizerVulkan(Core::Frontend::EmuWindow& emu_window_, Tegra::GPU& gpu_, 232RasterizerVulkan::RasterizerVulkan(Core::Frontend::EmuWindow& emu_window_, Tegra::GPU& gpu_,
410 Tegra::MemoryManager& gpu_memory_, 233 Tegra::MemoryManager& gpu_memory_,
@@ -414,21 +237,19 @@ RasterizerVulkan::RasterizerVulkan(Core::Frontend::EmuWindow& emu_window_, Tegra
414 : RasterizerAccelerated{cpu_memory_}, gpu{gpu_}, 237 : RasterizerAccelerated{cpu_memory_}, gpu{gpu_},
415 gpu_memory{gpu_memory_}, maxwell3d{gpu.Maxwell3D()}, kepler_compute{gpu.KeplerCompute()}, 238 gpu_memory{gpu_memory_}, maxwell3d{gpu.Maxwell3D()}, kepler_compute{gpu.KeplerCompute()},
416 screen_info{screen_info_}, device{device_}, memory_allocator{memory_allocator_}, 239 screen_info{screen_info_}, device{device_}, memory_allocator{memory_allocator_},
417 state_tracker{state_tracker_}, scheduler{scheduler_}, stream_buffer(device, scheduler), 240 state_tracker{state_tracker_}, scheduler{scheduler_},
418 staging_pool(device, memory_allocator, scheduler), descriptor_pool(device, scheduler), 241 staging_pool(device, memory_allocator, scheduler), descriptor_pool(device, scheduler),
419 update_descriptor_queue(device, scheduler), 242 update_descriptor_queue(device, scheduler),
420 blit_image(device, scheduler, state_tracker, descriptor_pool), 243 blit_image(device, scheduler, state_tracker, descriptor_pool),
421 quad_array_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue),
422 quad_indexed_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue),
423 uint8_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue),
424 texture_cache_runtime{device, scheduler, memory_allocator, staging_pool, blit_image}, 244 texture_cache_runtime{device, scheduler, memory_allocator, staging_pool, blit_image},
425 texture_cache(texture_cache_runtime, *this, maxwell3d, kepler_compute, gpu_memory), 245 texture_cache(texture_cache_runtime, *this, maxwell3d, kepler_compute, gpu_memory),
246 buffer_cache_runtime(device, memory_allocator, scheduler, staging_pool,
247 update_descriptor_queue, descriptor_pool),
248 buffer_cache(*this, maxwell3d, kepler_compute, gpu_memory, cpu_memory_, buffer_cache_runtime),
426 pipeline_cache(*this, gpu, maxwell3d, kepler_compute, gpu_memory, device, scheduler, 249 pipeline_cache(*this, gpu, maxwell3d, kepler_compute, gpu_memory, device, scheduler,
427 descriptor_pool, update_descriptor_queue), 250 descriptor_pool, update_descriptor_queue),
428 buffer_cache(*this, gpu_memory, cpu_memory_, device, memory_allocator, scheduler,
429 stream_buffer, staging_pool),
430 query_cache{*this, maxwell3d, gpu_memory, device, scheduler}, 251 query_cache{*this, maxwell3d, gpu_memory, device, scheduler},
431 fence_manager(*this, gpu, gpu_memory, texture_cache, buffer_cache, query_cache, scheduler), 252 fence_manager(*this, gpu, texture_cache, buffer_cache, query_cache, device, scheduler),
432 wfi_event(device.GetLogical().CreateEvent()), async_shaders(emu_window_) { 253 wfi_event(device.GetLogical().CreateEvent()), async_shaders(emu_window_) {
433 scheduler.SetQueryCache(query_cache); 254 scheduler.SetQueryCache(query_cache);
434 if (device.UseAsynchronousShaders()) { 255 if (device.UseAsynchronousShaders()) {
@@ -449,22 +270,14 @@ void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) {
449 GraphicsPipelineCacheKey key; 270 GraphicsPipelineCacheKey key;
450 key.fixed_state.Fill(maxwell3d.regs, device.IsExtExtendedDynamicStateSupported()); 271 key.fixed_state.Fill(maxwell3d.regs, device.IsExtExtendedDynamicStateSupported());
451 272
452 buffer_cache.Map(CalculateGraphicsStreamBufferSize(is_indexed)); 273 std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex};
453
454 BufferBindings buffer_bindings;
455 const DrawParameters draw_params =
456 SetupGeometry(key.fixed_state, buffer_bindings, is_indexed, is_instanced);
457 274
458 auto lock = texture_cache.AcquireLock();
459 texture_cache.SynchronizeGraphicsDescriptors(); 275 texture_cache.SynchronizeGraphicsDescriptors();
460
461 texture_cache.UpdateRenderTargets(false); 276 texture_cache.UpdateRenderTargets(false);
462 277
463 const auto shaders = pipeline_cache.GetShaders(); 278 const auto shaders = pipeline_cache.GetShaders();
464 key.shaders = GetShaderAddresses(shaders); 279 key.shaders = GetShaderAddresses(shaders);
465 SetupShaderDescriptors(shaders); 280 SetupShaderDescriptors(shaders, is_indexed);
466
467 buffer_cache.Unmap();
468 281
469 const Framebuffer* const framebuffer = texture_cache.GetFramebuffer(); 282 const Framebuffer* const framebuffer = texture_cache.GetFramebuffer();
470 key.renderpass = framebuffer->RenderPass(); 283 key.renderpass = framebuffer->RenderPass();
@@ -476,22 +289,29 @@ void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) {
476 return; 289 return;
477 } 290 }
478 291
479 buffer_bindings.Bind(device, scheduler);
480
481 BeginTransformFeedback(); 292 BeginTransformFeedback();
482 293
483 scheduler.RequestRenderpass(framebuffer); 294 scheduler.RequestRenderpass(framebuffer);
484 scheduler.BindGraphicsPipeline(pipeline->GetHandle()); 295 scheduler.BindGraphicsPipeline(pipeline->GetHandle());
485 UpdateDynamicStates(); 296 UpdateDynamicStates();
486 297
487 const auto pipeline_layout = pipeline->GetLayout(); 298 const auto& regs = maxwell3d.regs;
488 const auto descriptor_set = pipeline->CommitDescriptorSet(); 299 const u32 num_instances = maxwell3d.mme_draw.instance_count;
300 const DrawParams draw_params = MakeDrawParams(regs, num_instances, is_instanced, is_indexed);
301 const VkPipelineLayout pipeline_layout = pipeline->GetLayout();
302 const VkDescriptorSet descriptor_set = pipeline->CommitDescriptorSet();
489 scheduler.Record([pipeline_layout, descriptor_set, draw_params](vk::CommandBuffer cmdbuf) { 303 scheduler.Record([pipeline_layout, descriptor_set, draw_params](vk::CommandBuffer cmdbuf) {
490 if (descriptor_set) { 304 if (descriptor_set) {
491 cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline_layout, 305 cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline_layout,
492 DESCRIPTOR_SET, descriptor_set, {}); 306 DESCRIPTOR_SET, descriptor_set, nullptr);
307 }
308 if (draw_params.is_indexed) {
309 cmdbuf.DrawIndexed(draw_params.num_vertices, draw_params.num_instances, 0,
310 draw_params.base_vertex, draw_params.base_instance);
311 } else {
312 cmdbuf.Draw(draw_params.num_vertices, draw_params.num_instances,
313 draw_params.base_vertex, draw_params.base_instance);
493 } 314 }
494 draw_params.Draw(cmdbuf);
495 }); 315 });
496 316
497 EndTransformFeedback(); 317 EndTransformFeedback();
@@ -515,7 +335,7 @@ void RasterizerVulkan::Clear() {
515 return; 335 return;
516 } 336 }
517 337
518 auto lock = texture_cache.AcquireLock(); 338 std::scoped_lock lock{texture_cache.mutex};
519 texture_cache.UpdateRenderTargets(true); 339 texture_cache.UpdateRenderTargets(true);
520 const Framebuffer* const framebuffer = texture_cache.GetFramebuffer(); 340 const Framebuffer* const framebuffer = texture_cache.GetFramebuffer();
521 const VkExtent2D render_area = framebuffer->RenderArea(); 341 const VkExtent2D render_area = framebuffer->RenderArea();
@@ -559,7 +379,6 @@ void RasterizerVulkan::Clear() {
559 if (use_stencil) { 379 if (use_stencil) {
560 aspect_flags |= VK_IMAGE_ASPECT_STENCIL_BIT; 380 aspect_flags |= VK_IMAGE_ASPECT_STENCIL_BIT;
561 } 381 }
562
563 scheduler.Record([clear_depth = regs.clear_depth, clear_stencil = regs.clear_stencil, 382 scheduler.Record([clear_depth = regs.clear_depth, clear_stencil = regs.clear_stencil,
564 clear_rect, aspect_flags](vk::CommandBuffer cmdbuf) { 383 clear_rect, aspect_flags](vk::CommandBuffer cmdbuf) {
565 VkClearAttachment attachment; 384 VkClearAttachment attachment;
@@ -580,12 +399,11 @@ void RasterizerVulkan::DispatchCompute(GPUVAddr code_addr) {
580 auto& pipeline = pipeline_cache.GetComputePipeline({ 399 auto& pipeline = pipeline_cache.GetComputePipeline({
581 .shader = code_addr, 400 .shader = code_addr,
582 .shared_memory_size = launch_desc.shared_alloc, 401 .shared_memory_size = launch_desc.shared_alloc,
583 .workgroup_size = 402 .workgroup_size{
584 { 403 launch_desc.block_dim_x,
585 launch_desc.block_dim_x, 404 launch_desc.block_dim_y,
586 launch_desc.block_dim_y, 405 launch_desc.block_dim_z,
587 launch_desc.block_dim_z, 406 },
588 },
589 }); 407 });
590 408
591 // Compute dispatches can't be executed inside a renderpass 409 // Compute dispatches can't be executed inside a renderpass
@@ -594,10 +412,21 @@ void RasterizerVulkan::DispatchCompute(GPUVAddr code_addr) {
594 image_view_indices.clear(); 412 image_view_indices.clear();
595 sampler_handles.clear(); 413 sampler_handles.clear();
596 414
597 auto lock = texture_cache.AcquireLock(); 415 std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex};
598 texture_cache.SynchronizeComputeDescriptors();
599 416
600 const auto& entries = pipeline.GetEntries(); 417 const auto& entries = pipeline.GetEntries();
418 buffer_cache.SetEnabledComputeUniformBuffers(entries.enabled_uniform_buffers);
419 buffer_cache.UnbindComputeStorageBuffers();
420 u32 ssbo_index = 0;
421 for (const auto& buffer : entries.global_buffers) {
422 buffer_cache.BindComputeStorageBuffer(ssbo_index, buffer.cbuf_index, buffer.cbuf_offset,
423 buffer.is_written);
424 ++ssbo_index;
425 }
426 buffer_cache.UpdateComputeBuffers();
427
428 texture_cache.SynchronizeComputeDescriptors();
429
601 SetupComputeUniformTexels(entries); 430 SetupComputeUniformTexels(entries);
602 SetupComputeTextures(entries); 431 SetupComputeTextures(entries);
603 SetupComputeStorageTexels(entries); 432 SetupComputeStorageTexels(entries);
@@ -606,20 +435,15 @@ void RasterizerVulkan::DispatchCompute(GPUVAddr code_addr) {
606 const std::span indices_span(image_view_indices.data(), image_view_indices.size()); 435 const std::span indices_span(image_view_indices.data(), image_view_indices.size());
607 texture_cache.FillComputeImageViews(indices_span, image_view_ids); 436 texture_cache.FillComputeImageViews(indices_span, image_view_ids);
608 437
609 buffer_cache.Map(CalculateComputeStreamBufferSize());
610
611 update_descriptor_queue.Acquire(); 438 update_descriptor_queue.Acquire();
612 439
613 SetupComputeConstBuffers(entries); 440 buffer_cache.BindHostComputeBuffers();
614 SetupComputeGlobalBuffers(entries);
615 441
616 ImageViewId* image_view_id_ptr = image_view_ids.data(); 442 ImageViewId* image_view_id_ptr = image_view_ids.data();
617 VkSampler* sampler_ptr = sampler_handles.data(); 443 VkSampler* sampler_ptr = sampler_handles.data();
618 PushImageDescriptors(entries, texture_cache, update_descriptor_queue, image_view_id_ptr, 444 PushImageDescriptors(entries, texture_cache, update_descriptor_queue, image_view_id_ptr,
619 sampler_ptr); 445 sampler_ptr);
620 446
621 buffer_cache.Unmap();
622
623 const VkPipeline pipeline_handle = pipeline.GetHandle(); 447 const VkPipeline pipeline_handle = pipeline.GetHandle();
624 const VkPipelineLayout pipeline_layout = pipeline.GetLayout(); 448 const VkPipelineLayout pipeline_layout = pipeline.GetLayout();
625 const VkDescriptorSet descriptor_set = pipeline.CommitDescriptorSet(); 449 const VkDescriptorSet descriptor_set = pipeline.CommitDescriptorSet();
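DispatchCompute now declares its buffer needs to the shared buffer cache up front (the enabled uniform buffers plus one storage-buffer slot per global buffer), lets UpdateComputeBuffers() stage everything, and only then binds with BindHostComputeBuffers(). Condensed into one helper, assuming a GlobalBufferEntry shaped like the struct this patch introduces in vk_shader_decompiler.h:

    #include <cstdint>
    #include <vector>

    struct GlobalBufferEntry {
        uint32_t cbuf_index;
        uint32_t cbuf_offset;
        bool is_written;
    };

    // Register compute buffer usage with a buffer cache exposing the methods used above.
    template <class BufferCache>
    void RegisterComputeBuffers(BufferCache& buffer_cache, uint32_t enabled_uniform_buffers,
                                const std::vector<GlobalBufferEntry>& global_buffers) {
        buffer_cache.SetEnabledComputeUniformBuffers(enabled_uniform_buffers);
        buffer_cache.UnbindComputeStorageBuffers();
        uint32_t ssbo_index = 0;
        for (const GlobalBufferEntry& buffer : global_buffers) {
            // Each SSBO address/size is read indirectly from the given const buffer slot.
            buffer_cache.BindComputeStorageBuffer(ssbo_index++, buffer.cbuf_index,
                                                  buffer.cbuf_offset, buffer.is_written);
        }
        buffer_cache.UpdateComputeBuffers(); // uploads/updates before descriptors are pushed
    }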
@@ -644,6 +468,11 @@ void RasterizerVulkan::Query(GPUVAddr gpu_addr, VideoCore::QueryType type,
644 query_cache.Query(gpu_addr, type, timestamp); 468 query_cache.Query(gpu_addr, type, timestamp);
645} 469}
646 470
471void RasterizerVulkan::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr,
472 u32 size) {
473 buffer_cache.BindGraphicsUniformBuffer(stage, index, gpu_addr, size);
474}
475
647void RasterizerVulkan::FlushAll() {} 476void RasterizerVulkan::FlushAll() {}
648 477
649void RasterizerVulkan::FlushRegion(VAddr addr, u64 size) { 478void RasterizerVulkan::FlushRegion(VAddr addr, u64 size) {
@@ -651,19 +480,23 @@ void RasterizerVulkan::FlushRegion(VAddr addr, u64 size) {
651 return; 480 return;
652 } 481 }
653 { 482 {
654 auto lock = texture_cache.AcquireLock(); 483 std::scoped_lock lock{texture_cache.mutex};
655 texture_cache.DownloadMemory(addr, size); 484 texture_cache.DownloadMemory(addr, size);
656 } 485 }
657 buffer_cache.FlushRegion(addr, size); 486 {
487 std::scoped_lock lock{buffer_cache.mutex};
488 buffer_cache.DownloadMemory(addr, size);
489 }
658 query_cache.FlushRegion(addr, size); 490 query_cache.FlushRegion(addr, size);
659} 491}
660 492
661bool RasterizerVulkan::MustFlushRegion(VAddr addr, u64 size) { 493bool RasterizerVulkan::MustFlushRegion(VAddr addr, u64 size) {
494 std::scoped_lock lock{texture_cache.mutex, buffer_cache.mutex};
662 if (!Settings::IsGPULevelHigh()) { 495 if (!Settings::IsGPULevelHigh()) {
663 return buffer_cache.MustFlushRegion(addr, size); 496 return buffer_cache.IsRegionGpuModified(addr, size);
664 } 497 }
665 return texture_cache.IsRegionGpuModified(addr, size) || 498 return texture_cache.IsRegionGpuModified(addr, size) ||
666 buffer_cache.MustFlushRegion(addr, size); 499 buffer_cache.IsRegionGpuModified(addr, size);
667} 500}
668 501
669void RasterizerVulkan::InvalidateRegion(VAddr addr, u64 size) { 502void RasterizerVulkan::InvalidateRegion(VAddr addr, u64 size) {
@@ -671,11 +504,14 @@ void RasterizerVulkan::InvalidateRegion(VAddr addr, u64 size) {
671 return; 504 return;
672 } 505 }
673 { 506 {
674 auto lock = texture_cache.AcquireLock(); 507 std::scoped_lock lock{texture_cache.mutex};
675 texture_cache.WriteMemory(addr, size); 508 texture_cache.WriteMemory(addr, size);
676 } 509 }
510 {
511 std::scoped_lock lock{buffer_cache.mutex};
512 buffer_cache.WriteMemory(addr, size);
513 }
677 pipeline_cache.InvalidateRegion(addr, size); 514 pipeline_cache.InvalidateRegion(addr, size);
678 buffer_cache.InvalidateRegion(addr, size);
679 query_cache.InvalidateRegion(addr, size); 515 query_cache.InvalidateRegion(addr, size);
680} 516}
681 517
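Cache locking is now explicit: each cache exposes a public mutex and callers lock exactly what they touch. When both caches are consulted under one lock (as MustFlushRegion does above), std::scoped_lock's multi-mutex constructor avoids lock-order deadlocks; when they are touched independently, two small scopes suffice. A minimal sketch of the combined case (placeholder types, not the real caches):

    #include <cstdint>
    #include <mutex>

    // Placeholder caches standing in for the real texture/buffer caches.
    struct TextureCache { std::mutex mutex; bool IsRegionGpuModified(uint64_t, uint64_t) { return false; } };
    struct BufferCache  { std::mutex mutex; bool IsRegionGpuModified(uint64_t, uint64_t) { return false; } };

    bool MustFlush(TextureCache& texture_cache, BufferCache& buffer_cache,
                   uint64_t addr, uint64_t size) {
        // std::scoped_lock acquires both mutexes with a deadlock-avoidance algorithm,
        // so the argument order does not matter.
        std::scoped_lock lock{texture_cache.mutex, buffer_cache.mutex};
        return texture_cache.IsRegionGpuModified(addr, size) ||
               buffer_cache.IsRegionGpuModified(addr, size);
    }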
@@ -683,25 +519,34 @@ void RasterizerVulkan::OnCPUWrite(VAddr addr, u64 size) {
683 if (addr == 0 || size == 0) { 519 if (addr == 0 || size == 0) {
684 return; 520 return;
685 } 521 }
522 pipeline_cache.OnCPUWrite(addr, size);
686 { 523 {
687 auto lock = texture_cache.AcquireLock(); 524 std::scoped_lock lock{texture_cache.mutex};
688 texture_cache.WriteMemory(addr, size); 525 texture_cache.WriteMemory(addr, size);
689 } 526 }
690 pipeline_cache.OnCPUWrite(addr, size); 527 {
691 buffer_cache.OnCPUWrite(addr, size); 528 std::scoped_lock lock{buffer_cache.mutex};
529 buffer_cache.CachedWriteMemory(addr, size);
530 }
692} 531}
693 532
694void RasterizerVulkan::SyncGuestHost() { 533void RasterizerVulkan::SyncGuestHost() {
695 buffer_cache.SyncGuestHost();
696 pipeline_cache.SyncGuestHost(); 534 pipeline_cache.SyncGuestHost();
535 {
536 std::scoped_lock lock{buffer_cache.mutex};
537 buffer_cache.FlushCachedWrites();
538 }
697} 539}
698 540
699void RasterizerVulkan::UnmapMemory(VAddr addr, u64 size) { 541void RasterizerVulkan::UnmapMemory(VAddr addr, u64 size) {
700 { 542 {
701 auto lock = texture_cache.AcquireLock(); 543 std::scoped_lock lock{texture_cache.mutex};
702 texture_cache.UnmapMemory(addr, size); 544 texture_cache.UnmapMemory(addr, size);
703 } 545 }
704 buffer_cache.OnCPUWrite(addr, size); 546 {
547 std::scoped_lock lock{buffer_cache.mutex};
548 buffer_cache.WriteMemory(addr, size);
549 }
705 pipeline_cache.OnCPUWrite(addr, size); 550 pipeline_cache.OnCPUWrite(addr, size);
706} 551}
707 552
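OnCPUWrite now defers buffer invalidation (CachedWriteMemory) and SyncGuestHost drains it (FlushCachedWrites), instead of invalidating on every guest write. One plausible shape for that deferral, purely as an illustration and not the cache's actual implementation:

    #include <cstdint>
    #include <vector>

    // Hypothetical deferred-write tracker: writes are only recorded on the CPU-write path
    // and applied in one batch when guest and host are synchronized.
    class CachedWrites {
    public:
        void CachedWriteMemory(uint64_t addr, uint64_t size) {
            pending_.push_back({addr, size});
        }

        template <class Invalidate>
        void FlushCachedWrites(Invalidate&& invalidate) {
            for (const auto& range : pending_) {
                invalidate(range.addr, range.size); // e.g. mark overlapping host buffers dirty
            }
            pending_.clear();
        }

    private:
        struct Range { uint64_t addr; uint64_t size; };
        std::vector<Range> pending_;
    };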
@@ -774,18 +619,21 @@ void RasterizerVulkan::TickFrame() {
774 draw_counter = 0; 619 draw_counter = 0;
775 update_descriptor_queue.TickFrame(); 620 update_descriptor_queue.TickFrame();
776 fence_manager.TickFrame(); 621 fence_manager.TickFrame();
777 buffer_cache.TickFrame();
778 staging_pool.TickFrame(); 622 staging_pool.TickFrame();
779 { 623 {
780 auto lock = texture_cache.AcquireLock(); 624 std::scoped_lock lock{texture_cache.mutex};
781 texture_cache.TickFrame(); 625 texture_cache.TickFrame();
782 } 626 }
627 {
628 std::scoped_lock lock{buffer_cache.mutex};
629 buffer_cache.TickFrame();
630 }
783} 631}
784 632
785bool RasterizerVulkan::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surface& src, 633bool RasterizerVulkan::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surface& src,
786 const Tegra::Engines::Fermi2D::Surface& dst, 634 const Tegra::Engines::Fermi2D::Surface& dst,
787 const Tegra::Engines::Fermi2D::Config& copy_config) { 635 const Tegra::Engines::Fermi2D::Config& copy_config) {
788 auto lock = texture_cache.AcquireLock(); 636 std::scoped_lock lock{texture_cache.mutex};
789 texture_cache.BlitImage(dst, src, copy_config); 637 texture_cache.BlitImage(dst, src, copy_config);
790 return true; 638 return true;
791} 639}
@@ -795,13 +643,11 @@ bool RasterizerVulkan::AccelerateDisplay(const Tegra::FramebufferConfig& config,
795 if (!framebuffer_addr) { 643 if (!framebuffer_addr) {
796 return false; 644 return false;
797 } 645 }
798 646 std::scoped_lock lock{texture_cache.mutex};
799 auto lock = texture_cache.AcquireLock();
800 ImageView* const image_view = texture_cache.TryFindFramebufferImageView(framebuffer_addr); 647 ImageView* const image_view = texture_cache.TryFindFramebufferImageView(framebuffer_addr);
801 if (!image_view) { 648 if (!image_view) {
802 return false; 649 return false;
803 } 650 }
804
805 screen_info.image_view = image_view->Handle(VideoCommon::ImageViewType::e2D); 651 screen_info.image_view = image_view->Handle(VideoCommon::ImageViewType::e2D);
806 screen_info.width = image_view->size.width; 652 screen_info.width = image_view->size.width;
807 screen_info.height = image_view->size.height; 653 screen_info.height = image_view->size.height;
@@ -830,29 +676,8 @@ void RasterizerVulkan::FlushWork() {
830 draw_counter = 0; 676 draw_counter = 0;
831} 677}
832 678
833RasterizerVulkan::DrawParameters RasterizerVulkan::SetupGeometry(FixedPipelineState& fixed_state,
834 BufferBindings& buffer_bindings,
835 bool is_indexed,
836 bool is_instanced) {
837 MICROPROFILE_SCOPE(Vulkan_Geometry);
838
839 const auto& regs = maxwell3d.regs;
840
841 SetupVertexArrays(buffer_bindings);
842
843 const u32 base_instance = regs.vb_base_instance;
844 const u32 num_instances = is_instanced ? maxwell3d.mme_draw.instance_count : 1;
845 const u32 base_vertex = is_indexed ? regs.vb_element_base : regs.vertex_buffer.first;
846 const u32 num_vertices = is_indexed ? regs.index_array.count : regs.vertex_buffer.count;
847
848 DrawParameters params{base_instance, num_instances, base_vertex, num_vertices, is_indexed};
849 SetupIndexBuffer(buffer_bindings, params, is_indexed);
850
851 return params;
852}
853
854void RasterizerVulkan::SetupShaderDescriptors( 679void RasterizerVulkan::SetupShaderDescriptors(
855 const std::array<Shader*, Maxwell::MaxShaderProgram>& shaders) { 680 const std::array<Shader*, Maxwell::MaxShaderProgram>& shaders, bool is_indexed) {
856 image_view_indices.clear(); 681 image_view_indices.clear();
857 sampler_handles.clear(); 682 sampler_handles.clear();
858 for (size_t stage = 0; stage < Maxwell::MaxShaderStage; ++stage) { 683 for (size_t stage = 0; stage < Maxwell::MaxShaderStage; ++stage) {
@@ -860,15 +685,27 @@ void RasterizerVulkan::SetupShaderDescriptors(
860 if (!shader) { 685 if (!shader) {
861 continue; 686 continue;
862 } 687 }
863 const auto& entries = shader->GetEntries(); 688 const ShaderEntries& entries = shader->GetEntries();
864 SetupGraphicsUniformTexels(entries, stage); 689 SetupGraphicsUniformTexels(entries, stage);
865 SetupGraphicsTextures(entries, stage); 690 SetupGraphicsTextures(entries, stage);
866 SetupGraphicsStorageTexels(entries, stage); 691 SetupGraphicsStorageTexels(entries, stage);
867 SetupGraphicsImages(entries, stage); 692 SetupGraphicsImages(entries, stage);
693
694 buffer_cache.SetEnabledUniformBuffers(stage, entries.enabled_uniform_buffers);
695 buffer_cache.UnbindGraphicsStorageBuffers(stage);
696 u32 ssbo_index = 0;
697 for (const auto& buffer : entries.global_buffers) {
698 buffer_cache.BindGraphicsStorageBuffer(stage, ssbo_index, buffer.cbuf_index,
699 buffer.cbuf_offset, buffer.is_written);
700 ++ssbo_index;
701 }
868 } 702 }
869 const std::span indices_span(image_view_indices.data(), image_view_indices.size()); 703 const std::span indices_span(image_view_indices.data(), image_view_indices.size());
704 buffer_cache.UpdateGraphicsBuffers(is_indexed);
870 texture_cache.FillGraphicsImageViews(indices_span, image_view_ids); 705 texture_cache.FillGraphicsImageViews(indices_span, image_view_ids);
871 706
707 buffer_cache.BindHostGeometryBuffers(is_indexed);
708
872 update_descriptor_queue.Acquire(); 709 update_descriptor_queue.Acquire();
873 710
874 ImageViewId* image_view_id_ptr = image_view_ids.data(); 711 ImageViewId* image_view_id_ptr = image_view_ids.data();
@@ -879,11 +716,9 @@ void RasterizerVulkan::SetupShaderDescriptors(
879 if (!shader) { 716 if (!shader) {
880 continue; 717 continue;
881 } 718 }
882 const auto& entries = shader->GetEntries(); 719 buffer_cache.BindHostStageBuffers(stage);
883 SetupGraphicsConstBuffers(entries, stage); 720 PushImageDescriptors(shader->GetEntries(), texture_cache, update_descriptor_queue,
884 SetupGraphicsGlobalBuffers(entries, stage); 721 image_view_id_ptr, sampler_ptr);
885 PushImageDescriptors(entries, texture_cache, update_descriptor_queue, image_view_id_ptr,
886 sampler_ptr);
887 } 722 }
888} 723}
889 724
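SetupShaderDescriptors is now two passes with the buffer work batched in between: pass one only declares, per stage, which uniform buffers are enabled and where each storage buffer lives; UpdateGraphicsBuffers()/BindHostGeometryBuffers() then do the uploads and geometry binding once; pass two binds per-stage buffers while the image descriptors are pushed. A condensed sketch of just the buffer flow (cache and shader types are template placeholders; the method names are the ones used above):

    #include <cstddef>
    #include <cstdint>

    template <class BufferCache, class Shaders>
    void RegisterThenBind(BufferCache& buffer_cache, const Shaders& shaders, bool is_indexed) {
        for (size_t stage = 0; stage < shaders.size(); ++stage) { // pass 1: declare needs
            if (!shaders[stage]) {
                continue;
            }
            const auto& entries = shaders[stage]->GetEntries();
            buffer_cache.SetEnabledUniformBuffers(stage, entries.enabled_uniform_buffers);
            buffer_cache.UnbindGraphicsStorageBuffers(stage);
            uint32_t ssbo_index = 0;
            for (const auto& buffer : entries.global_buffers) {
                buffer_cache.BindGraphicsStorageBuffer(stage, ssbo_index++, buffer.cbuf_index,
                                                       buffer.cbuf_offset, buffer.is_written);
            }
        }
        buffer_cache.UpdateGraphicsBuffers(is_indexed);   // stage uploads once for all stages
        buffer_cache.BindHostGeometryBuffers(is_indexed); // vertex/index and related buffers
        for (size_t stage = 0; stage < shaders.size(); ++stage) { // pass 2: bind per stage
            if (!shaders[stage]) {
                continue;
            }
            buffer_cache.BindHostStageBuffers(stage); // then push image/sampler descriptors
        }
    }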
@@ -916,27 +751,11 @@ void RasterizerVulkan::BeginTransformFeedback() {
916 LOG_ERROR(Render_Vulkan, "Transform feedbacks used but not supported"); 751 LOG_ERROR(Render_Vulkan, "Transform feedbacks used but not supported");
917 return; 752 return;
918 } 753 }
919
920 UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationControl) || 754 UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationControl) ||
921 regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationEval) || 755 regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationEval) ||
922 regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::Geometry)); 756 regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::Geometry));
923 757 scheduler.Record(
924 UNIMPLEMENTED_IF(regs.tfb_bindings[1].buffer_enable); 758 [](vk::CommandBuffer cmdbuf) { cmdbuf.BeginTransformFeedbackEXT(0, 0, nullptr, nullptr); });
925 UNIMPLEMENTED_IF(regs.tfb_bindings[2].buffer_enable);
926 UNIMPLEMENTED_IF(regs.tfb_bindings[3].buffer_enable);
927
928 const auto& binding = regs.tfb_bindings[0];
929 UNIMPLEMENTED_IF(binding.buffer_enable == 0);
930 UNIMPLEMENTED_IF(binding.buffer_offset != 0);
931
932 const GPUVAddr gpu_addr = binding.Address();
933 const VkDeviceSize size = static_cast<VkDeviceSize>(binding.buffer_size);
934 const auto info = buffer_cache.UploadMemory(gpu_addr, size, 4, true);
935
936 scheduler.Record([buffer = info.handle, offset = info.offset, size](vk::CommandBuffer cmdbuf) {
937 cmdbuf.BindTransformFeedbackBuffersEXT(0, 1, &buffer, &offset, &size);
938 cmdbuf.BeginTransformFeedbackEXT(0, 0, nullptr, nullptr);
939 });
940} 759}
941 760
942void RasterizerVulkan::EndTransformFeedback() { 761void RasterizerVulkan::EndTransformFeedback() {
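With the explicit transform-feedback buffer upload removed here (buffer binding is now owned by the shared buffer cache), BeginTransformFeedback only has to open the feedback region, and no counter buffers are used. The raw VK_EXT_transform_feedback calls behind the recorded lambdas, shown with prototypes for brevity (in practice the entry points come from the device dispatch table):

    #include <vulkan/vulkan.h>

    // The 0/nullptr arguments mean no counter buffers are read or written.
    void BeginTransformFeedback(VkCommandBuffer cmdbuf) {
        vkCmdBeginTransformFeedbackEXT(cmdbuf, 0, 0, nullptr, nullptr);
    }

    void EndTransformFeedback(VkCommandBuffer cmdbuf) {
        vkCmdEndTransformFeedbackEXT(cmdbuf, 0, 0, nullptr, nullptr);
    }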
@@ -947,104 +766,11 @@ void RasterizerVulkan::EndTransformFeedback() {
947 if (!device.IsExtTransformFeedbackSupported()) { 766 if (!device.IsExtTransformFeedbackSupported()) {
948 return; 767 return;
949 } 768 }
950
951 scheduler.Record( 769 scheduler.Record(
952 [](vk::CommandBuffer cmdbuf) { cmdbuf.EndTransformFeedbackEXT(0, 0, nullptr, nullptr); }); 770 [](vk::CommandBuffer cmdbuf) { cmdbuf.EndTransformFeedbackEXT(0, 0, nullptr, nullptr); });
953} 771}
954 772
955void RasterizerVulkan::SetupVertexArrays(BufferBindings& buffer_bindings) {
956 const auto& regs = maxwell3d.regs;
957
958 for (size_t index = 0; index < Maxwell::NumVertexArrays; ++index) {
959 const auto& vertex_array = regs.vertex_array[index];
960 if (!vertex_array.IsEnabled()) {
961 continue;
962 }
963 const GPUVAddr start{vertex_array.StartAddress()};
964 const GPUVAddr end{regs.vertex_array_limit[index].LimitAddress()};
965
966 ASSERT(end >= start);
967 const size_t size = end - start;
968 if (size == 0) {
969 buffer_bindings.AddVertexBinding(DefaultBuffer(), 0, DEFAULT_BUFFER_SIZE, 0);
970 continue;
971 }
972 const auto info = buffer_cache.UploadMemory(start, size);
973 buffer_bindings.AddVertexBinding(info.handle, info.offset, size, vertex_array.stride);
974 }
975}
976
977void RasterizerVulkan::SetupIndexBuffer(BufferBindings& buffer_bindings, DrawParameters& params,
978 bool is_indexed) {
979 if (params.num_vertices == 0) {
980 return;
981 }
982 const auto& regs = maxwell3d.regs;
983 switch (regs.draw.topology) {
984 case Maxwell::PrimitiveTopology::Quads: {
985 if (!params.is_indexed) {
986 const auto [buffer, offset] =
987 quad_array_pass.Assemble(params.num_vertices, params.base_vertex);
988 buffer_bindings.SetIndexBinding(buffer, offset, VK_INDEX_TYPE_UINT32);
989 params.base_vertex = 0;
990 params.num_vertices = params.num_vertices * 6 / 4;
991 params.is_indexed = true;
992 break;
993 }
994 const GPUVAddr gpu_addr = regs.index_array.IndexStart();
995 const auto info = buffer_cache.UploadMemory(gpu_addr, CalculateIndexBufferSize());
996 VkBuffer buffer = info.handle;
997 u64 offset = info.offset;
998 std::tie(buffer, offset) = quad_indexed_pass.Assemble(
999 regs.index_array.format, params.num_vertices, params.base_vertex, buffer, offset);
1000
1001 buffer_bindings.SetIndexBinding(buffer, offset, VK_INDEX_TYPE_UINT32);
1002 params.num_vertices = (params.num_vertices / 4) * 6;
1003 params.base_vertex = 0;
1004 break;
1005 }
1006 default: {
1007 if (!is_indexed) {
1008 break;
1009 }
1010 const GPUVAddr gpu_addr = regs.index_array.IndexStart();
1011 const auto info = buffer_cache.UploadMemory(gpu_addr, CalculateIndexBufferSize());
1012 VkBuffer buffer = info.handle;
1013 u64 offset = info.offset;
1014
1015 auto format = regs.index_array.format;
1016 const bool is_uint8 = format == Maxwell::IndexFormat::UnsignedByte;
1017 if (is_uint8 && !device.IsExtIndexTypeUint8Supported()) {
1018 std::tie(buffer, offset) = uint8_pass.Assemble(params.num_vertices, buffer, offset);
1019 format = Maxwell::IndexFormat::UnsignedShort;
1020 }
1021
1022 buffer_bindings.SetIndexBinding(buffer, offset, MaxwellToVK::IndexFormat(device, format));
1023 break;
1024 }
1025 }
1026}
1027
1028void RasterizerVulkan::SetupGraphicsConstBuffers(const ShaderEntries& entries, size_t stage) {
1029 MICROPROFILE_SCOPE(Vulkan_ConstBuffers);
1030 const auto& shader_stage = maxwell3d.state.shader_stages[stage];
1031 for (const auto& entry : entries.const_buffers) {
1032 SetupConstBuffer(entry, shader_stage.const_buffers[entry.GetIndex()]);
1033 }
1034}
1035
1036void RasterizerVulkan::SetupGraphicsGlobalBuffers(const ShaderEntries& entries, size_t stage) {
1037 MICROPROFILE_SCOPE(Vulkan_GlobalBuffers);
1038 const auto& cbufs{maxwell3d.state.shader_stages[stage]};
1039
1040 for (const auto& entry : entries.global_buffers) {
1041 const auto addr = cbufs.const_buffers[entry.GetCbufIndex()].address + entry.GetCbufOffset();
1042 SetupGlobalBuffer(entry, addr);
1043 }
1044}
1045
1046void RasterizerVulkan::SetupGraphicsUniformTexels(const ShaderEntries& entries, size_t stage) { 773void RasterizerVulkan::SetupGraphicsUniformTexels(const ShaderEntries& entries, size_t stage) {
1047 MICROPROFILE_SCOPE(Vulkan_Textures);
1048 const auto& regs = maxwell3d.regs; 774 const auto& regs = maxwell3d.regs;
1049 const bool via_header_index = regs.sampler_index == Maxwell::SamplerIndex::ViaHeaderIndex; 775 const bool via_header_index = regs.sampler_index == Maxwell::SamplerIndex::ViaHeaderIndex;
1050 for (const auto& entry : entries.uniform_texels) { 776 for (const auto& entry : entries.uniform_texels) {
@@ -1054,7 +780,6 @@ void RasterizerVulkan::SetupGraphicsUniformTexels(const ShaderEntries& entries,
1054} 780}
1055 781
1056void RasterizerVulkan::SetupGraphicsTextures(const ShaderEntries& entries, size_t stage) { 782void RasterizerVulkan::SetupGraphicsTextures(const ShaderEntries& entries, size_t stage) {
1057 MICROPROFILE_SCOPE(Vulkan_Textures);
1058 const auto& regs = maxwell3d.regs; 783 const auto& regs = maxwell3d.regs;
1059 const bool via_header_index = regs.sampler_index == Maxwell::SamplerIndex::ViaHeaderIndex; 784 const bool via_header_index = regs.sampler_index == Maxwell::SamplerIndex::ViaHeaderIndex;
1060 for (const auto& entry : entries.samplers) { 785 for (const auto& entry : entries.samplers) {
@@ -1070,7 +795,6 @@ void RasterizerVulkan::SetupGraphicsTextures(const ShaderEntries& entries, size_
1070} 795}
1071 796
1072void RasterizerVulkan::SetupGraphicsStorageTexels(const ShaderEntries& entries, size_t stage) { 797void RasterizerVulkan::SetupGraphicsStorageTexels(const ShaderEntries& entries, size_t stage) {
1073 MICROPROFILE_SCOPE(Vulkan_Textures);
1074 const auto& regs = maxwell3d.regs; 798 const auto& regs = maxwell3d.regs;
1075 const bool via_header_index = regs.sampler_index == Maxwell::SamplerIndex::ViaHeaderIndex; 799 const bool via_header_index = regs.sampler_index == Maxwell::SamplerIndex::ViaHeaderIndex;
1076 for (const auto& entry : entries.storage_texels) { 800 for (const auto& entry : entries.storage_texels) {
@@ -1080,7 +804,6 @@ void RasterizerVulkan::SetupGraphicsStorageTexels(const ShaderEntries& entries,
1080} 804}
1081 805
1082void RasterizerVulkan::SetupGraphicsImages(const ShaderEntries& entries, size_t stage) { 806void RasterizerVulkan::SetupGraphicsImages(const ShaderEntries& entries, size_t stage) {
1083 MICROPROFILE_SCOPE(Vulkan_Images);
1084 const auto& regs = maxwell3d.regs; 807 const auto& regs = maxwell3d.regs;
1085 const bool via_header_index = regs.sampler_index == Maxwell::SamplerIndex::ViaHeaderIndex; 808 const bool via_header_index = regs.sampler_index == Maxwell::SamplerIndex::ViaHeaderIndex;
1086 for (const auto& entry : entries.images) { 809 for (const auto& entry : entries.images) {
@@ -1089,32 +812,7 @@ void RasterizerVulkan::SetupGraphicsImages(const ShaderEntries& entries, size_t
1089 } 812 }
1090} 813}
1091 814
1092void RasterizerVulkan::SetupComputeConstBuffers(const ShaderEntries& entries) {
1093 MICROPROFILE_SCOPE(Vulkan_ConstBuffers);
1094 const auto& launch_desc = kepler_compute.launch_description;
1095 for (const auto& entry : entries.const_buffers) {
1096 const auto& config = launch_desc.const_buffer_config[entry.GetIndex()];
1097 const std::bitset<8> mask = launch_desc.const_buffer_enable_mask.Value();
1098 const Tegra::Engines::ConstBufferInfo info{
1099 .address = config.Address(),
1100 .size = config.size,
1101 .enabled = mask[entry.GetIndex()],
1102 };
1103 SetupConstBuffer(entry, info);
1104 }
1105}
1106
1107void RasterizerVulkan::SetupComputeGlobalBuffers(const ShaderEntries& entries) {
1108 MICROPROFILE_SCOPE(Vulkan_GlobalBuffers);
1109 const auto& cbufs{kepler_compute.launch_description.const_buffer_config};
1110 for (const auto& entry : entries.global_buffers) {
1111 const auto addr{cbufs[entry.GetCbufIndex()].Address() + entry.GetCbufOffset()};
1112 SetupGlobalBuffer(entry, addr);
1113 }
1114}
1115
1116void RasterizerVulkan::SetupComputeUniformTexels(const ShaderEntries& entries) { 815void RasterizerVulkan::SetupComputeUniformTexels(const ShaderEntries& entries) {
1117 MICROPROFILE_SCOPE(Vulkan_Textures);
1118 const bool via_header_index = kepler_compute.launch_description.linked_tsc; 816 const bool via_header_index = kepler_compute.launch_description.linked_tsc;
1119 for (const auto& entry : entries.uniform_texels) { 817 for (const auto& entry : entries.uniform_texels) {
1120 const TextureHandle handle = 818 const TextureHandle handle =
@@ -1124,7 +822,6 @@ void RasterizerVulkan::SetupComputeUniformTexels(const ShaderEntries& entries) {
1124} 822}
1125 823
1126void RasterizerVulkan::SetupComputeTextures(const ShaderEntries& entries) { 824void RasterizerVulkan::SetupComputeTextures(const ShaderEntries& entries) {
1127 MICROPROFILE_SCOPE(Vulkan_Textures);
1128 const bool via_header_index = kepler_compute.launch_description.linked_tsc; 825 const bool via_header_index = kepler_compute.launch_description.linked_tsc;
1129 for (const auto& entry : entries.samplers) { 826 for (const auto& entry : entries.samplers) {
1130 for (size_t index = 0; index < entry.size; ++index) { 827 for (size_t index = 0; index < entry.size; ++index) {
@@ -1139,7 +836,6 @@ void RasterizerVulkan::SetupComputeTextures(const ShaderEntries& entries) {
1139} 836}
1140 837
1141void RasterizerVulkan::SetupComputeStorageTexels(const ShaderEntries& entries) { 838void RasterizerVulkan::SetupComputeStorageTexels(const ShaderEntries& entries) {
1142 MICROPROFILE_SCOPE(Vulkan_Textures);
1143 const bool via_header_index = kepler_compute.launch_description.linked_tsc; 839 const bool via_header_index = kepler_compute.launch_description.linked_tsc;
1144 for (const auto& entry : entries.storage_texels) { 840 for (const auto& entry : entries.storage_texels) {
1145 const TextureHandle handle = 841 const TextureHandle handle =
@@ -1149,7 +845,6 @@ void RasterizerVulkan::SetupComputeStorageTexels(const ShaderEntries& entries) {
1149} 845}
1150 846
1151void RasterizerVulkan::SetupComputeImages(const ShaderEntries& entries) { 847void RasterizerVulkan::SetupComputeImages(const ShaderEntries& entries) {
1152 MICROPROFILE_SCOPE(Vulkan_Images);
1153 const bool via_header_index = kepler_compute.launch_description.linked_tsc; 848 const bool via_header_index = kepler_compute.launch_description.linked_tsc;
1154 for (const auto& entry : entries.images) { 849 for (const auto& entry : entries.images) {
1155 const TextureHandle handle = 850 const TextureHandle handle =
@@ -1158,42 +853,6 @@ void RasterizerVulkan::SetupComputeImages(const ShaderEntries& entries) {
1158 } 853 }
1159} 854}
1160 855
1161void RasterizerVulkan::SetupConstBuffer(const ConstBufferEntry& entry,
1162 const Tegra::Engines::ConstBufferInfo& buffer) {
1163 if (!buffer.enabled) {
1164 // Set values to zero to unbind buffers
1165 update_descriptor_queue.AddBuffer(DefaultBuffer(), 0, DEFAULT_BUFFER_SIZE);
1166 return;
1167 }
1168 // Align the size to avoid bad std140 interactions
1169 const size_t size = Common::AlignUp(CalculateConstBufferSize(entry, buffer), 4 * sizeof(float));
1170 ASSERT(size <= MaxConstbufferSize);
1171
1172 const u64 alignment = device.GetUniformBufferAlignment();
1173 const auto info = buffer_cache.UploadMemory(buffer.address, size, alignment);
1174 update_descriptor_queue.AddBuffer(info.handle, info.offset, size);
1175}
1176
1177void RasterizerVulkan::SetupGlobalBuffer(const GlobalBufferEntry& entry, GPUVAddr address) {
1178 const u64 actual_addr = gpu_memory.Read<u64>(address);
1179 const u32 size = gpu_memory.Read<u32>(address + 8);
1180
1181 if (size == 0) {
1182 // Sometimes global memory pointers don't have a proper size. Upload a dummy entry
1183 // because Vulkan doesn't like empty buffers.
1184 // Note: Do *not* use DefaultBuffer() here, storage buffers can be written breaking the
1185 // default buffer.
1186 static constexpr size_t dummy_size = 4;
1187 const auto info = buffer_cache.GetEmptyBuffer(dummy_size);
1188 update_descriptor_queue.AddBuffer(info.handle, info.offset, dummy_size);
1189 return;
1190 }
1191
1192 const auto info = buffer_cache.UploadMemory(
1193 actual_addr, size, device.GetStorageBufferAlignment(), entry.IsWritten());
1194 update_descriptor_queue.AddBuffer(info.handle, info.offset, size);
1195}
1196
1197void RasterizerVulkan::UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs) { 856void RasterizerVulkan::UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs) {
1198 if (!state_tracker.TouchViewports()) { 857 if (!state_tracker.TouchViewports()) {
1199 return; 858 return;
@@ -1206,7 +865,8 @@ void RasterizerVulkan::UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& reg
1206 GetViewportState(device, regs, 8), GetViewportState(device, regs, 9), 865 GetViewportState(device, regs, 8), GetViewportState(device, regs, 9),
1207 GetViewportState(device, regs, 10), GetViewportState(device, regs, 11), 866 GetViewportState(device, regs, 10), GetViewportState(device, regs, 11),
1208 GetViewportState(device, regs, 12), GetViewportState(device, regs, 13), 867 GetViewportState(device, regs, 12), GetViewportState(device, regs, 13),
1209 GetViewportState(device, regs, 14), GetViewportState(device, regs, 15)}; 868 GetViewportState(device, regs, 14), GetViewportState(device, regs, 15),
869 };
1210 scheduler.Record([viewports](vk::CommandBuffer cmdbuf) { cmdbuf.SetViewport(0, viewports); }); 870 scheduler.Record([viewports](vk::CommandBuffer cmdbuf) { cmdbuf.SetViewport(0, viewports); });
1211} 871}
1212 872
@@ -1214,13 +874,14 @@ void RasterizerVulkan::UpdateScissorsState(Tegra::Engines::Maxwell3D::Regs& regs
1214 if (!state_tracker.TouchScissors()) { 874 if (!state_tracker.TouchScissors()) {
1215 return; 875 return;
1216 } 876 }
1217 const std::array scissors = { 877 const std::array scissors{
1218 GetScissorState(regs, 0), GetScissorState(regs, 1), GetScissorState(regs, 2), 878 GetScissorState(regs, 0), GetScissorState(regs, 1), GetScissorState(regs, 2),
1219 GetScissorState(regs, 3), GetScissorState(regs, 4), GetScissorState(regs, 5), 879 GetScissorState(regs, 3), GetScissorState(regs, 4), GetScissorState(regs, 5),
1220 GetScissorState(regs, 6), GetScissorState(regs, 7), GetScissorState(regs, 8), 880 GetScissorState(regs, 6), GetScissorState(regs, 7), GetScissorState(regs, 8),
1221 GetScissorState(regs, 9), GetScissorState(regs, 10), GetScissorState(regs, 11), 881 GetScissorState(regs, 9), GetScissorState(regs, 10), GetScissorState(regs, 11),
1222 GetScissorState(regs, 12), GetScissorState(regs, 13), GetScissorState(regs, 14), 882 GetScissorState(regs, 12), GetScissorState(regs, 13), GetScissorState(regs, 14),
1223 GetScissorState(regs, 15)}; 883 GetScissorState(regs, 15),
884 };
1224 scheduler.Record([scissors](vk::CommandBuffer cmdbuf) { cmdbuf.SetScissor(0, scissors); }); 885 scheduler.Record([scissors](vk::CommandBuffer cmdbuf) { cmdbuf.SetScissor(0, scissors); });
1225} 886}
1226 887
@@ -1385,73 +1046,4 @@ void RasterizerVulkan::UpdateStencilTestEnable(Tegra::Engines::Maxwell3D::Regs&
1385 }); 1046 });
1386} 1047}
1387 1048
1388size_t RasterizerVulkan::CalculateGraphicsStreamBufferSize(bool is_indexed) const {
1389 size_t size = CalculateVertexArraysSize();
1390 if (is_indexed) {
1391 size = Common::AlignUp(size, 4) + CalculateIndexBufferSize();
1392 }
1393 size += Maxwell::MaxConstBuffers * (MaxConstbufferSize + device.GetUniformBufferAlignment());
1394 return size;
1395}
1396
1397size_t RasterizerVulkan::CalculateComputeStreamBufferSize() const {
1398 return Tegra::Engines::KeplerCompute::NumConstBuffers *
1399 (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment());
1400}
1401
1402size_t RasterizerVulkan::CalculateVertexArraysSize() const {
1403 const auto& regs = maxwell3d.regs;
1404
1405 size_t size = 0;
1406 for (u32 index = 0; index < Maxwell::NumVertexArrays; ++index) {
1407 // This implementation assumes that all attributes are used in the shader.
1408 const GPUVAddr start{regs.vertex_array[index].StartAddress()};
1409 const GPUVAddr end{regs.vertex_array_limit[index].LimitAddress()};
1410 DEBUG_ASSERT(end >= start);
1411
1412 size += (end - start) * regs.vertex_array[index].enable;
1413 }
1414 return size;
1415}
1416
1417size_t RasterizerVulkan::CalculateIndexBufferSize() const {
1418 return static_cast<size_t>(maxwell3d.regs.index_array.count) *
1419 static_cast<size_t>(maxwell3d.regs.index_array.FormatSizeInBytes());
1420}
1421
1422size_t RasterizerVulkan::CalculateConstBufferSize(
1423 const ConstBufferEntry& entry, const Tegra::Engines::ConstBufferInfo& buffer) const {
1424 if (entry.IsIndirect()) {
1425 // Buffer is accessed indirectly, so upload the entire thing
1426 return buffer.size;
1427 } else {
1428 // Buffer is accessed directly, upload just what we use
1429 return entry.GetSize();
1430 }
1431}
1432
1433VkBuffer RasterizerVulkan::DefaultBuffer() {
1434 if (default_buffer) {
1435 return *default_buffer;
1436 }
1437 default_buffer = device.GetLogical().CreateBuffer({
1438 .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
1439 .pNext = nullptr,
1440 .flags = 0,
1441 .size = DEFAULT_BUFFER_SIZE,
1442 .usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT |
1443 VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT,
1444 .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
1445 .queueFamilyIndexCount = 0,
1446 .pQueueFamilyIndices = nullptr,
1447 });
1448 default_buffer_commit = memory_allocator.Commit(default_buffer, MemoryUsage::DeviceLocal);
1449
1450 scheduler.RequestOutsideRenderPassOperationContext();
1451 scheduler.Record([buffer = *default_buffer](vk::CommandBuffer cmdbuf) {
1452 cmdbuf.FillBuffer(buffer, 0, DEFAULT_BUFFER_SIZE, 0);
1453 });
1454 return *default_buffer;
1455}
1456
1457} // namespace Vulkan 1049} // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h
index 8e261b9bd..7fc6741da 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.h
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.h
@@ -18,14 +18,12 @@
18#include "video_core/renderer_vulkan/blit_image.h" 18#include "video_core/renderer_vulkan/blit_image.h"
19#include "video_core/renderer_vulkan/fixed_pipeline_state.h" 19#include "video_core/renderer_vulkan/fixed_pipeline_state.h"
20#include "video_core/renderer_vulkan/vk_buffer_cache.h" 20#include "video_core/renderer_vulkan/vk_buffer_cache.h"
21#include "video_core/renderer_vulkan/vk_compute_pass.h"
22#include "video_core/renderer_vulkan/vk_descriptor_pool.h" 21#include "video_core/renderer_vulkan/vk_descriptor_pool.h"
23#include "video_core/renderer_vulkan/vk_fence_manager.h" 22#include "video_core/renderer_vulkan/vk_fence_manager.h"
24#include "video_core/renderer_vulkan/vk_pipeline_cache.h" 23#include "video_core/renderer_vulkan/vk_pipeline_cache.h"
25#include "video_core/renderer_vulkan/vk_query_cache.h" 24#include "video_core/renderer_vulkan/vk_query_cache.h"
26#include "video_core/renderer_vulkan/vk_scheduler.h" 25#include "video_core/renderer_vulkan/vk_scheduler.h"
27#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" 26#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h"
28#include "video_core/renderer_vulkan/vk_stream_buffer.h"
29#include "video_core/renderer_vulkan/vk_texture_cache.h" 27#include "video_core/renderer_vulkan/vk_texture_cache.h"
30#include "video_core/renderer_vulkan/vk_update_descriptor.h" 28#include "video_core/renderer_vulkan/vk_update_descriptor.h"
31#include "video_core/shader/async_shaders.h" 29#include "video_core/shader/async_shaders.h"
@@ -49,7 +47,6 @@ namespace Vulkan {
49struct VKScreenInfo; 47struct VKScreenInfo;
50 48
51class StateTracker; 49class StateTracker;
52class BufferBindings;
53 50
54class RasterizerVulkan final : public VideoCore::RasterizerAccelerated { 51class RasterizerVulkan final : public VideoCore::RasterizerAccelerated {
55public: 52public:
@@ -65,6 +62,7 @@ public:
65 void DispatchCompute(GPUVAddr code_addr) override; 62 void DispatchCompute(GPUVAddr code_addr) override;
66 void ResetCounter(VideoCore::QueryType type) override; 63 void ResetCounter(VideoCore::QueryType type) override;
67 void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) override; 64 void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) override;
65 void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override;
68 void FlushAll() override; 66 void FlushAll() override;
69 void FlushRegion(VAddr addr, u64 size) override; 67 void FlushRegion(VAddr addr, u64 size) override;
70 bool MustFlushRegion(VAddr addr, u64 size) override; 68 bool MustFlushRegion(VAddr addr, u64 size) override;
@@ -107,24 +105,11 @@ private:
107 105
108 static constexpr VkDeviceSize DEFAULT_BUFFER_SIZE = 4 * sizeof(float); 106 static constexpr VkDeviceSize DEFAULT_BUFFER_SIZE = 4 * sizeof(float);
109 107
110 struct DrawParameters {
111 void Draw(vk::CommandBuffer cmdbuf) const;
112
113 u32 base_instance = 0;
114 u32 num_instances = 0;
115 u32 base_vertex = 0;
116 u32 num_vertices = 0;
117 bool is_indexed = 0;
118 };
119
120 void FlushWork(); 108 void FlushWork();
121 109
122 /// Setups geometry buffers and state.
123 DrawParameters SetupGeometry(FixedPipelineState& fixed_state, BufferBindings& buffer_bindings,
124 bool is_indexed, bool is_instanced);
125
126 /// Setup descriptors in the graphics pipeline. 110 /// Setup descriptors in the graphics pipeline.
127 void SetupShaderDescriptors(const std::array<Shader*, Maxwell::MaxShaderProgram>& shaders); 111 void SetupShaderDescriptors(const std::array<Shader*, Maxwell::MaxShaderProgram>& shaders,
112 bool is_indexed);
128 113
129 void UpdateDynamicStates(); 114 void UpdateDynamicStates();
130 115
@@ -132,16 +117,6 @@ private:
132 117
133 void EndTransformFeedback(); 118 void EndTransformFeedback();
134 119
135 void SetupVertexArrays(BufferBindings& buffer_bindings);
136
137 void SetupIndexBuffer(BufferBindings& buffer_bindings, DrawParameters& params, bool is_indexed);
138
139 /// Setup constant buffers in the graphics pipeline.
140 void SetupGraphicsConstBuffers(const ShaderEntries& entries, std::size_t stage);
141
142 /// Setup global buffers in the graphics pipeline.
143 void SetupGraphicsGlobalBuffers(const ShaderEntries& entries, std::size_t stage);
144
145 /// Setup uniform texels in the graphics pipeline. 120 /// Setup uniform texels in the graphics pipeline.
146 void SetupGraphicsUniformTexels(const ShaderEntries& entries, std::size_t stage); 121 void SetupGraphicsUniformTexels(const ShaderEntries& entries, std::size_t stage);
147 122
@@ -154,12 +129,6 @@ private:
154 /// Setup images in the graphics pipeline. 129 /// Setup images in the graphics pipeline.
155 void SetupGraphicsImages(const ShaderEntries& entries, std::size_t stage); 130 void SetupGraphicsImages(const ShaderEntries& entries, std::size_t stage);
156 131
157 /// Setup constant buffers in the compute pipeline.
158 void SetupComputeConstBuffers(const ShaderEntries& entries);
159
160 /// Setup global buffers in the compute pipeline.
161 void SetupComputeGlobalBuffers(const ShaderEntries& entries);
162
163 /// Setup texel buffers in the compute pipeline. 132 /// Setup texel buffers in the compute pipeline.
164 void SetupComputeUniformTexels(const ShaderEntries& entries); 133 void SetupComputeUniformTexels(const ShaderEntries& entries);
165 134
@@ -172,11 +141,6 @@ private:
172 /// Setup images in the compute pipeline. 141 /// Setup images in the compute pipeline.
173 void SetupComputeImages(const ShaderEntries& entries); 142 void SetupComputeImages(const ShaderEntries& entries);
174 143
175 void SetupConstBuffer(const ConstBufferEntry& entry,
176 const Tegra::Engines::ConstBufferInfo& buffer);
177
178 void SetupGlobalBuffer(const GlobalBufferEntry& entry, GPUVAddr address);
179
180 void UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs); 144 void UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs);
181 void UpdateScissorsState(Tegra::Engines::Maxwell3D::Regs& regs); 145 void UpdateScissorsState(Tegra::Engines::Maxwell3D::Regs& regs);
182 void UpdateDepthBias(Tegra::Engines::Maxwell3D::Regs& regs); 146 void UpdateDepthBias(Tegra::Engines::Maxwell3D::Regs& regs);
@@ -193,19 +157,6 @@ private:
193 void UpdateStencilOp(Tegra::Engines::Maxwell3D::Regs& regs); 157 void UpdateStencilOp(Tegra::Engines::Maxwell3D::Regs& regs);
194 void UpdateStencilTestEnable(Tegra::Engines::Maxwell3D::Regs& regs); 158 void UpdateStencilTestEnable(Tegra::Engines::Maxwell3D::Regs& regs);
195 159
196 size_t CalculateGraphicsStreamBufferSize(bool is_indexed) const;
197
198 size_t CalculateComputeStreamBufferSize() const;
199
200 size_t CalculateVertexArraysSize() const;
201
202 size_t CalculateIndexBufferSize() const;
203
204 size_t CalculateConstBufferSize(const ConstBufferEntry& entry,
205 const Tegra::Engines::ConstBufferInfo& buffer) const;
206
207 VkBuffer DefaultBuffer();
208
209 Tegra::GPU& gpu; 160 Tegra::GPU& gpu;
210 Tegra::MemoryManager& gpu_memory; 161 Tegra::MemoryManager& gpu_memory;
211 Tegra::Engines::Maxwell3D& maxwell3d; 162 Tegra::Engines::Maxwell3D& maxwell3d;
@@ -217,24 +168,19 @@ private:
217 StateTracker& state_tracker; 168 StateTracker& state_tracker;
218 VKScheduler& scheduler; 169 VKScheduler& scheduler;
219 170
220 VKStreamBuffer stream_buffer;
221 StagingBufferPool staging_pool; 171 StagingBufferPool staging_pool;
222 VKDescriptorPool descriptor_pool; 172 VKDescriptorPool descriptor_pool;
223 VKUpdateDescriptorQueue update_descriptor_queue; 173 VKUpdateDescriptorQueue update_descriptor_queue;
224 BlitImageHelper blit_image; 174 BlitImageHelper blit_image;
225 QuadArrayPass quad_array_pass;
226 QuadIndexedPass quad_indexed_pass;
227 Uint8Pass uint8_pass;
228 175
229 TextureCacheRuntime texture_cache_runtime; 176 TextureCacheRuntime texture_cache_runtime;
230 TextureCache texture_cache; 177 TextureCache texture_cache;
178 BufferCacheRuntime buffer_cache_runtime;
179 BufferCache buffer_cache;
231 VKPipelineCache pipeline_cache; 180 VKPipelineCache pipeline_cache;
232 VKBufferCache buffer_cache;
233 VKQueryCache query_cache; 181 VKQueryCache query_cache;
234 VKFenceManager fence_manager; 182 VKFenceManager fence_manager;
235 183
236 vk::Buffer default_buffer;
237 MemoryCommit default_buffer_commit;
238 vk::Event wfi_event; 184 vk::Event wfi_event;
239 VideoCommon::Shader::AsyncShaders async_shaders; 185 VideoCommon::Shader::AsyncShaders async_shaders;
240 186
diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp
index 66004f9c0..f35c120b0 100644
--- a/src/video_core/renderer_vulkan/vk_scheduler.cpp
+++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp
@@ -52,18 +52,6 @@ VKScheduler::~VKScheduler() {
52 worker_thread.join(); 52 worker_thread.join();
53} 53}
54 54
55u64 VKScheduler::CurrentTick() const noexcept {
56 return master_semaphore->CurrentTick();
57}
58
59bool VKScheduler::IsFree(u64 tick) const noexcept {
60 return master_semaphore->IsFree(tick);
61}
62
63void VKScheduler::Wait(u64 tick) {
64 master_semaphore->Wait(tick);
65}
66
67void VKScheduler::Flush(VkSemaphore semaphore) { 55void VKScheduler::Flush(VkSemaphore semaphore) {
68 SubmitExecution(semaphore); 56 SubmitExecution(semaphore);
69 AllocateNewContext(); 57 AllocateNewContext();
@@ -269,7 +257,7 @@ void VKScheduler::EndRenderPass() {
269 cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | 257 cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT |
270 VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT | 258 VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT |
271 VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, 259 VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,
272 VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT, 0, nullptr, nullptr, 260 VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, nullptr, nullptr,
273 vk::Span(barriers.data(), num_images)); 261 vk::Span(barriers.data(), num_images));
274 }); 262 });
275 state.renderpass = nullptr; 263 state.renderpass = nullptr;
diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h
index 15f2987eb..3ce48e9d2 100644
--- a/src/video_core/renderer_vulkan/vk_scheduler.h
+++ b/src/video_core/renderer_vulkan/vk_scheduler.h
@@ -14,6 +14,7 @@
14#include "common/alignment.h" 14#include "common/alignment.h"
15#include "common/common_types.h" 15#include "common/common_types.h"
16#include "common/threadsafe_queue.h" 16#include "common/threadsafe_queue.h"
17#include "video_core/renderer_vulkan/vk_master_semaphore.h"
17#include "video_core/vulkan_common/vulkan_wrapper.h" 18#include "video_core/vulkan_common/vulkan_wrapper.h"
18 19
19namespace Vulkan { 20namespace Vulkan {
@@ -21,7 +22,6 @@ namespace Vulkan {
21class CommandPool; 22class CommandPool;
22class Device; 23class Device;
23class Framebuffer; 24class Framebuffer;
24class MasterSemaphore;
25class StateTracker; 25class StateTracker;
26class VKQueryCache; 26class VKQueryCache;
27 27
@@ -32,15 +32,6 @@ public:
32 explicit VKScheduler(const Device& device, StateTracker& state_tracker); 32 explicit VKScheduler(const Device& device, StateTracker& state_tracker);
33 ~VKScheduler(); 33 ~VKScheduler();
34 34
35 /// Returns the current command buffer tick.
36 [[nodiscard]] u64 CurrentTick() const noexcept;
37
38 /// Returns true when a tick has been triggered by the GPU.
39 [[nodiscard]] bool IsFree(u64 tick) const noexcept;
40
41 /// Waits for the given tick to trigger on the GPU.
42 void Wait(u64 tick);
43
44 /// Sends the current execution context to the GPU. 35 /// Sends the current execution context to the GPU.
45 void Flush(VkSemaphore semaphore = nullptr); 36 void Flush(VkSemaphore semaphore = nullptr);
46 37
@@ -82,6 +73,21 @@ public:
82 (void)chunk->Record(command); 73 (void)chunk->Record(command);
83 } 74 }
84 75
76 /// Returns the current command buffer tick.
77 [[nodiscard]] u64 CurrentTick() const noexcept {
78 return master_semaphore->CurrentTick();
79 }
80
81 /// Returns true when a tick has been triggered by the GPU.
82 [[nodiscard]] bool IsFree(u64 tick) const noexcept {
83 return master_semaphore->IsFree(tick);
84 }
85
86 /// Waits for the given tick to trigger on the GPU.
87 void Wait(u64 tick) {
88 master_semaphore->Wait(tick);
89 }
90
85 /// Returns the master timeline semaphore. 91 /// Returns the master timeline semaphore.
86 [[nodiscard]] MasterSemaphore& GetMasterSemaphore() const noexcept { 92 [[nodiscard]] MasterSemaphore& GetMasterSemaphore() const noexcept {
87 return *master_semaphore; 93 return *master_semaphore;
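CurrentTick/IsFree/Wait become inline forwarders to the MasterSemaphore, which sits on top of a Vulkan timeline semaphore. As a self-contained illustration of that trio (an assumed wrapper, not yuzu's class), using the core Vulkan 1.2 timeline API:

    #include <cstdint>
    #include <vulkan/vulkan.h>

    // Illustrative timeline-semaphore ticker: CurrentTick is the CPU-side counter,
    // IsFree/Wait query or block on the GPU-side counter value.
    class TimelineTicker {
    public:
        TimelineTicker(VkDevice device, VkSemaphore timeline) : device_{device}, timeline_{timeline} {}

        uint64_t CurrentTick() const noexcept {
            return current_tick_;
        }

        bool IsFree(uint64_t tick) const {
            uint64_t value = 0;
            vkGetSemaphoreCounterValue(device_, timeline_, &value);
            return value >= tick;
        }

        void Wait(uint64_t tick) const {
            const VkSemaphoreWaitInfo wait_info{
                .sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO,
                .pNext = nullptr,
                .flags = 0,
                .semaphoreCount = 1,
                .pSemaphores = &timeline_,
                .pValues = &tick,
            };
            vkWaitSemaphores(device_, &wait_info, UINT64_MAX);
        }

        uint64_t NextTick() noexcept {
            return ++current_tick_; // bumped once per queue submission
        }

    private:
        VkDevice device_;
        VkSemaphore timeline_;
        uint64_t current_tick_ = 0;
    };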
diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
index 61d52b961..e165a6987 100644
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
@@ -3127,6 +3127,9 @@ ShaderEntries GenerateShaderEntries(const VideoCommon::Shader::ShaderIR& ir) {
3127 entries.attributes.insert(GetGenericAttributeLocation(attribute)); 3127 entries.attributes.insert(GetGenericAttributeLocation(attribute));
3128 } 3128 }
3129 } 3129 }
3130 for (const auto& buffer : entries.const_buffers) {
3131 entries.enabled_uniform_buffers |= 1U << buffer.GetIndex();
3132 }
3130 entries.clip_distances = ir.GetClipDistances(); 3133 entries.clip_distances = ir.GetClipDistances();
3131 entries.shader_length = ir.GetLength(); 3134 entries.shader_length = ir.GetLength();
3132 entries.uses_warps = ir.UsesWarps(); 3135 entries.uses_warps = ir.UsesWarps();
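enabled_uniform_buffers packs "const buffer i is referenced" into bit i, which is what the rasterizer hands to SetEnabledUniformBuffers/SetEnabledComputeUniformBuffers. Walking the set bits on the consumer side is cheap; a minimal C++20 sketch (not the decompiler's or buffer cache's code):

    #include <bit>
    #include <cstdint>

    // Visit every enabled uniform-buffer index encoded in the mask (bit i set => cbuf i in use).
    template <class Func>
    void ForEachEnabledUniformBuffer(uint32_t enabled_mask, Func&& func) {
        while (enabled_mask != 0) {
            func(std::countr_zero(enabled_mask)); // index of the lowest set bit
            enabled_mask &= enabled_mask - 1;     // clear that bit
        }
    }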
diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.h b/src/video_core/renderer_vulkan/vk_shader_decompiler.h
index 26381e444..5d94132a5 100644
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.h
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.h
@@ -39,24 +39,7 @@ private:
39 u32 index{}; 39 u32 index{};
40}; 40};
41 41
42class GlobalBufferEntry { 42struct GlobalBufferEntry {
43public:
44 constexpr explicit GlobalBufferEntry(u32 cbuf_index_, u32 cbuf_offset_, bool is_written_)
45 : cbuf_index{cbuf_index_}, cbuf_offset{cbuf_offset_}, is_written{is_written_} {}
46
47 constexpr u32 GetCbufIndex() const {
48 return cbuf_index;
49 }
50
51 constexpr u32 GetCbufOffset() const {
52 return cbuf_offset;
53 }
54
55 constexpr bool IsWritten() const {
56 return is_written;
57 }
58
59private:
60 u32 cbuf_index{}; 43 u32 cbuf_index{};
61 u32 cbuf_offset{}; 44 u32 cbuf_offset{};
62 bool is_written{}; 45 bool is_written{};
@@ -78,6 +61,7 @@ struct ShaderEntries {
78 std::set<u32> attributes; 61 std::set<u32> attributes;
79 std::array<bool, Maxwell::NumClipDistances> clip_distances{}; 62 std::array<bool, Maxwell::NumClipDistances> clip_distances{};
80 std::size_t shader_length{}; 63 std::size_t shader_length{};
64 u32 enabled_uniform_buffers{};
81 bool uses_warps{}; 65 bool uses_warps{};
82}; 66};
83 67
diff --git a/src/video_core/renderer_vulkan/vk_state_tracker.cpp b/src/video_core/renderer_vulkan/vk_state_tracker.cpp
index 1779a2e30..e81fad007 100644
--- a/src/video_core/renderer_vulkan/vk_state_tracker.cpp
+++ b/src/video_core/renderer_vulkan/vk_state_tracker.cpp
@@ -30,15 +30,18 @@ using Table = Maxwell3D::DirtyState::Table;
30using Flags = Maxwell3D::DirtyState::Flags; 30using Flags = Maxwell3D::DirtyState::Flags;
31 31
32Flags MakeInvalidationFlags() { 32Flags MakeInvalidationFlags() {
33 static constexpr std::array INVALIDATION_FLAGS{ 33 static constexpr int INVALIDATION_FLAGS[]{
34 Viewports, Scissors, DepthBias, BlendConstants, DepthBounds, 34 Viewports, Scissors, DepthBias, BlendConstants, DepthBounds,
35 StencilProperties, CullMode, DepthBoundsEnable, DepthTestEnable, DepthWriteEnable, 35 StencilProperties, CullMode, DepthBoundsEnable, DepthTestEnable, DepthWriteEnable,
36 DepthCompareOp, FrontFace, StencilOp, StencilTestEnable, 36 DepthCompareOp, FrontFace, StencilOp, StencilTestEnable, VertexBuffers,
37 }; 37 };
38 Flags flags{}; 38 Flags flags{};
39 for (const int flag : INVALIDATION_FLAGS) { 39 for (const int flag : INVALIDATION_FLAGS) {
40 flags[flag] = true; 40 flags[flag] = true;
41 } 41 }
42 for (int index = VertexBuffer0; index <= VertexBuffer31; ++index) {
43 flags[index] = true;
44 }
42 return flags; 45 return flags;
43} 46}
44 47
@@ -130,7 +133,7 @@ void SetupDirtyStencilTestEnable(Tables& tables) {
130StateTracker::StateTracker(Tegra::GPU& gpu) 133StateTracker::StateTracker(Tegra::GPU& gpu)
131 : flags{gpu.Maxwell3D().dirty.flags}, invalidation_flags{MakeInvalidationFlags()} { 134 : flags{gpu.Maxwell3D().dirty.flags}, invalidation_flags{MakeInvalidationFlags()} {
132 auto& tables = gpu.Maxwell3D().dirty.tables; 135 auto& tables = gpu.Maxwell3D().dirty.tables;
133 SetupDirtyRenderTargets(tables); 136 SetupDirtyFlags(tables);
134 SetupDirtyViewports(tables); 137 SetupDirtyViewports(tables);
135 SetupDirtyScissors(tables); 138 SetupDirtyScissors(tables);
136 SetupDirtyDepthBias(tables); 139 SetupDirtyDepthBias(tables);
diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
index aa7c5d7c6..1eeb45ca9 100644
--- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
@@ -426,46 +426,47 @@ constexpr VkBorderColor ConvertBorderColor(const std::array<float, 4>& color) {
426void CopyBufferToImage(vk::CommandBuffer cmdbuf, VkBuffer src_buffer, VkImage image, 426void CopyBufferToImage(vk::CommandBuffer cmdbuf, VkBuffer src_buffer, VkImage image,
427 VkImageAspectFlags aspect_mask, bool is_initialized, 427 VkImageAspectFlags aspect_mask, bool is_initialized,
428 std::span<const VkBufferImageCopy> copies) { 428 std::span<const VkBufferImageCopy> copies) {
429 static constexpr VkAccessFlags ACCESS_FLAGS = VK_ACCESS_SHADER_WRITE_BIT | 429 static constexpr VkAccessFlags WRITE_ACCESS_FLAGS =
430 VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | 430 VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
431 VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT; 431 VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;
432 static constexpr VkAccessFlags READ_ACCESS_FLAGS = VK_ACCESS_SHADER_READ_BIT |
433 VK_ACCESS_COLOR_ATTACHMENT_READ_BIT |
434 VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT;
432 const VkImageMemoryBarrier read_barrier{ 435 const VkImageMemoryBarrier read_barrier{
433 .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, 436 .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
434 .pNext = nullptr, 437 .pNext = nullptr,
435 .srcAccessMask = ACCESS_FLAGS, 438 .srcAccessMask = WRITE_ACCESS_FLAGS,
436 .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, 439 .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
437 .oldLayout = is_initialized ? VK_IMAGE_LAYOUT_GENERAL : VK_IMAGE_LAYOUT_UNDEFINED, 440 .oldLayout = is_initialized ? VK_IMAGE_LAYOUT_GENERAL : VK_IMAGE_LAYOUT_UNDEFINED,
438 .newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 441 .newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
439 .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, 442 .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
440 .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, 443 .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
441 .image = image, 444 .image = image,
442 .subresourceRange = 445 .subresourceRange{
443 { 446 .aspectMask = aspect_mask,
444 .aspectMask = aspect_mask, 447 .baseMipLevel = 0,
445 .baseMipLevel = 0, 448 .levelCount = VK_REMAINING_MIP_LEVELS,
446 .levelCount = VK_REMAINING_MIP_LEVELS, 449 .baseArrayLayer = 0,
447 .baseArrayLayer = 0, 450 .layerCount = VK_REMAINING_ARRAY_LAYERS,
448 .layerCount = VK_REMAINING_ARRAY_LAYERS, 451 },
449 },
450 }; 452 };
451 const VkImageMemoryBarrier write_barrier{ 453 const VkImageMemoryBarrier write_barrier{
452 .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, 454 .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
453 .pNext = nullptr, 455 .pNext = nullptr,
454 .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, 456 .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
455 .dstAccessMask = ACCESS_FLAGS, 457 .dstAccessMask = WRITE_ACCESS_FLAGS | READ_ACCESS_FLAGS,
456 .oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 458 .oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
457 .newLayout = VK_IMAGE_LAYOUT_GENERAL, 459 .newLayout = VK_IMAGE_LAYOUT_GENERAL,
458 .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, 460 .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
459 .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, 461 .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
460 .image = image, 462 .image = image,
461 .subresourceRange = 463 .subresourceRange{
462 { 464 .aspectMask = aspect_mask,
463 .aspectMask = aspect_mask, 465 .baseMipLevel = 0,
464 .baseMipLevel = 0, 466 .levelCount = VK_REMAINING_MIP_LEVELS,
465 .levelCount = VK_REMAINING_MIP_LEVELS, 467 .baseArrayLayer = 0,
466 .baseArrayLayer = 0, 468 .layerCount = VK_REMAINING_ARRAY_LAYERS,
467 .layerCount = VK_REMAINING_ARRAY_LAYERS, 469 },
468 },
469 }; 470 };
470 cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 0, 471 cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 0,
471 read_barrier); 472 read_barrier);
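The single ACCESS_FLAGS constant is split because the two barriers need different masks: before the copy only earlier writes must be made available (reads leave nothing to flush), while after the copy the transfer write must be made visible to both subsequent reads and writes. Restated in isolation:

    #include <vulkan/vulkan.h>

    // Pre-copy barrier:  srcAccessMask = WRITE_ACCESS_FLAGS, dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT
    // Post-copy barrier: srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
    //                    dstAccessMask = WRITE_ACCESS_FLAGS | READ_ACCESS_FLAGS
    constexpr VkAccessFlags WRITE_ACCESS_FLAGS =
        VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
        VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;
    constexpr VkAccessFlags READ_ACCESS_FLAGS =
        VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_READ_BIT |
        VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT;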
@@ -569,20 +570,12 @@ void TextureCacheRuntime::Finish() {
569 scheduler.Finish(); 570 scheduler.Finish();
570} 571}
571 572
572ImageBufferMap TextureCacheRuntime::MapUploadBuffer(size_t size) { 573StagingBufferRef TextureCacheRuntime::UploadStagingBuffer(size_t size) {
573 const auto staging_ref = staging_buffer_pool.Request(size, MemoryUsage::Upload); 574 return staging_buffer_pool.Request(size, MemoryUsage::Upload);
574 return {
575 .handle = staging_ref.buffer,
576 .span = staging_ref.mapped_span,
577 };
578} 575}
579 576
580ImageBufferMap TextureCacheRuntime::MapDownloadBuffer(size_t size) { 577StagingBufferRef TextureCacheRuntime::DownloadStagingBuffer(size_t size) {
581 const auto staging_ref = staging_buffer_pool.Request(size, MemoryUsage::Download); 578 return staging_buffer_pool.Request(size, MemoryUsage::Download);
582 return {
583 .handle = staging_ref.buffer,
584 .span = staging_ref.mapped_span,
585 };
586} 579}
587 580
588void TextureCacheRuntime::BlitImage(Framebuffer* dst_framebuffer, ImageView& dst, ImageView& src, 581void TextureCacheRuntime::BlitImage(Framebuffer* dst_framebuffer, ImageView& dst, ImageView& src,
@@ -754,7 +747,7 @@ void TextureCacheRuntime::CopyImage(Image& dst, Image& src,
754 .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | 747 .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
755 VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | 748 VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT |
756 VK_ACCESS_TRANSFER_WRITE_BIT, 749 VK_ACCESS_TRANSFER_WRITE_BIT,
757 .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, 750 .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT,
758 .oldLayout = VK_IMAGE_LAYOUT_GENERAL, 751 .oldLayout = VK_IMAGE_LAYOUT_GENERAL,
759 .newLayout = VK_IMAGE_LAYOUT_GENERAL, 752 .newLayout = VK_IMAGE_LAYOUT_GENERAL,
760 .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, 753 .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
@@ -765,12 +758,9 @@ void TextureCacheRuntime::CopyImage(Image& dst, Image& src,
765 VkImageMemoryBarrier{ 758 VkImageMemoryBarrier{
766 .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, 759 .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
767 .pNext = nullptr, 760 .pNext = nullptr,
768 .srcAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT | 761 .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
769 VK_ACCESS_COLOR_ATTACHMENT_READ_BIT |
770 VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
771 VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT |
772 VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | 762 VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT |
773 VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, 763 VK_ACCESS_TRANSFER_WRITE_BIT,
774 .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, 764 .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
775 .oldLayout = VK_IMAGE_LAYOUT_GENERAL, 765 .oldLayout = VK_IMAGE_LAYOUT_GENERAL,
776 .newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 766 .newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
@@ -828,12 +818,12 @@ Image::Image(TextureCacheRuntime& runtime, const ImageInfo& info_, GPUVAddr gpu_
828 } 818 }
829} 819}
830 820
831void Image::UploadMemory(const ImageBufferMap& map, size_t buffer_offset, 821void Image::UploadMemory(const StagingBufferRef& map, size_t buffer_offset,
832 std::span<const BufferImageCopy> copies) { 822 std::span<const BufferImageCopy> copies) {
833 // TODO: Move this to another API 823 // TODO: Move this to another API
834 scheduler->RequestOutsideRenderPassOperationContext(); 824 scheduler->RequestOutsideRenderPassOperationContext();
835 std::vector vk_copies = TransformBufferImageCopies(copies, buffer_offset, aspect_mask); 825 std::vector vk_copies = TransformBufferImageCopies(copies, buffer_offset, aspect_mask);
836 const VkBuffer src_buffer = map.handle; 826 const VkBuffer src_buffer = map.buffer;
837 const VkImage vk_image = *image; 827 const VkImage vk_image = *image;
838 const VkImageAspectFlags vk_aspect_mask = aspect_mask; 828 const VkImageAspectFlags vk_aspect_mask = aspect_mask;
839 const bool is_initialized = std::exchange(initialized, true); 829 const bool is_initialized = std::exchange(initialized, true);
@@ -843,12 +833,12 @@ void Image::UploadMemory(const ImageBufferMap& map, size_t buffer_offset,
843 }); 833 });
844} 834}
845 835
846void Image::UploadMemory(const ImageBufferMap& map, size_t buffer_offset, 836void Image::UploadMemory(const StagingBufferRef& map, size_t buffer_offset,
847 std::span<const VideoCommon::BufferCopy> copies) { 837 std::span<const VideoCommon::BufferCopy> copies) {
848 // TODO: Move this to another API 838 // TODO: Move this to another API
849 scheduler->RequestOutsideRenderPassOperationContext(); 839 scheduler->RequestOutsideRenderPassOperationContext();
850 std::vector vk_copies = TransformBufferCopies(copies, buffer_offset); 840 std::vector vk_copies = TransformBufferCopies(copies, buffer_offset);
851 const VkBuffer src_buffer = map.handle; 841 const VkBuffer src_buffer = map.buffer;
852 const VkBuffer dst_buffer = *buffer; 842 const VkBuffer dst_buffer = *buffer;
853 scheduler->Record([src_buffer, dst_buffer, vk_copies](vk::CommandBuffer cmdbuf) { 843 scheduler->Record([src_buffer, dst_buffer, vk_copies](vk::CommandBuffer cmdbuf) {
854 // TODO: Barriers 844 // TODO: Barriers
@@ -856,13 +846,58 @@ void Image::UploadMemory(const ImageBufferMap& map, size_t buffer_offset,
856 }); 846 });
857} 847}
858 848
859void Image::DownloadMemory(const ImageBufferMap& map, size_t buffer_offset, 849void Image::DownloadMemory(const StagingBufferRef& map, size_t buffer_offset,
860 std::span<const BufferImageCopy> copies) { 850 std::span<const BufferImageCopy> copies) {
861 std::vector vk_copies = TransformBufferImageCopies(copies, buffer_offset, aspect_mask); 851 std::vector vk_copies = TransformBufferImageCopies(copies, buffer_offset, aspect_mask);
862 scheduler->Record([buffer = map.handle, image = *image, aspect_mask = aspect_mask, 852 scheduler->Record([buffer = map.buffer, image = *image, aspect_mask = aspect_mask,
863 vk_copies](vk::CommandBuffer cmdbuf) { 853 vk_copies](vk::CommandBuffer cmdbuf) {
864 // TODO: Barriers 854 const VkImageMemoryBarrier read_barrier{
865 cmdbuf.CopyImageToBuffer(image, VK_IMAGE_LAYOUT_GENERAL, buffer, vk_copies); 855 .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
856 .pNext = nullptr,
857 .srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT,
858 .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT,
859 .oldLayout = VK_IMAGE_LAYOUT_GENERAL,
860 .newLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
861 .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
862 .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
863 .image = image,
864 .subresourceRange{
865 .aspectMask = aspect_mask,
866 .baseMipLevel = 0,
867 .levelCount = VK_REMAINING_MIP_LEVELS,
868 .baseArrayLayer = 0,
869 .layerCount = VK_REMAINING_ARRAY_LAYERS,
870 },
871 };
872 const VkImageMemoryBarrier image_write_barrier{
873 .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
874 .pNext = nullptr,
875 .srcAccessMask = 0,
876 .dstAccessMask = VK_ACCESS_MEMORY_WRITE_BIT,
877 .oldLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
878 .newLayout = VK_IMAGE_LAYOUT_GENERAL,
879 .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
880 .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
881 .image = image,
882 .subresourceRange{
883 .aspectMask = aspect_mask,
884 .baseMipLevel = 0,
885 .levelCount = VK_REMAINING_MIP_LEVELS,
886 .baseArrayLayer = 0,
887 .layerCount = VK_REMAINING_ARRAY_LAYERS,
888 },
889 };
890 const VkMemoryBarrier memory_write_barrier{
891 .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
892 .pNext = nullptr,
893 .srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT,
894 .dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT,
895 };
896 cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
897 0, read_barrier);
898 cmdbuf.CopyImageToBuffer(image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, buffer, vk_copies);
899 cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
900 0, memory_write_barrier, nullptr, image_write_barrier);
866 }); 901 });
867} 902}
868 903
@@ -1127,7 +1162,7 @@ Framebuffer::Framebuffer(TextureCacheRuntime& runtime, std::span<ImageView*, NUM
1127 .pAttachments = attachments.data(), 1162 .pAttachments = attachments.data(),
1128 .width = key.size.width, 1163 .width = key.size.width,
1129 .height = key.size.height, 1164 .height = key.size.height,
1130 .layers = static_cast<u32>(num_layers), 1165 .layers = static_cast<u32>(std::max(num_layers, 1)),
1131 }); 1166 });
1132 if (runtime.device.HasDebuggingToolAttached()) { 1167 if (runtime.device.HasDebuggingToolAttached()) {
1133 framebuffer.SetObjectNameEXT(VideoCommon::Name(key).c_str()); 1168 framebuffer.SetObjectNameEXT(VideoCommon::Name(key).c_str());
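
[Editor's note] The DownloadMemory hunk above replaces the old "TODO: Barriers" copy with an explicit sequence: transition the image GENERAL -> TRANSFER_SRC_OPTIMAL, copy into the staging buffer, then transition back to GENERAL while a global memory barrier makes the transfer write visible to subsequent work. Below is a minimal sketch of the same sequence written against the raw Vulkan C API instead of yuzu's vk::CommandBuffer wrapper; the function name RecordImageDownload and the single-region signature are illustrative, not part of the commit.

#include <vulkan/vulkan.h>

void RecordImageDownload(VkCommandBuffer cmdbuf, VkImage image, VkBuffer staging,
                         VkImageAspectFlags aspect_mask, const VkBufferImageCopy& region) {
    const VkImageSubresourceRange whole_image{aspect_mask, 0, VK_REMAINING_MIP_LEVELS, 0,
                                              VK_REMAINING_ARRAY_LAYERS};
    // Make every prior write visible to the transfer stage and move to the optimal copy layout.
    const VkImageMemoryBarrier read_barrier{
        .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
        .srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT,
        .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT,
        .oldLayout = VK_IMAGE_LAYOUT_GENERAL,
        .newLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .image = image,
        .subresourceRange = whole_image,
    };
    vkCmdPipelineBarrier(cmdbuf, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
                         0, 0, nullptr, 0, nullptr, 1, &read_barrier);
    vkCmdCopyImageToBuffer(cmdbuf, image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, staging, 1, &region);
    // Restore GENERAL and publish the transfer write to every later stage.
    const VkImageMemoryBarrier restore_barrier{
        .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
        .srcAccessMask = 0,
        .dstAccessMask = VK_ACCESS_MEMORY_WRITE_BIT,
        .oldLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
        .newLayout = VK_IMAGE_LAYOUT_GENERAL,
        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .image = image,
        .subresourceRange = whole_image,
    };
    const VkMemoryBarrier memory_barrier{
        .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
        .srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT,
        .dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT,
    };
    vkCmdPipelineBarrier(cmdbuf, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
                         0, 1, &memory_barrier, 0, nullptr, 1, &restore_barrier);
}
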
diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h
index 8d29361a1..4558c3297 100644
--- a/src/video_core/renderer_vulkan/vk_texture_cache.h
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.h
@@ -7,6 +7,7 @@
 #include <compare>
 #include <span>

+#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h"
 #include "video_core/texture_cache/texture_cache.h"
 #include "video_core/vulkan_common/vulkan_memory_allocator.h"
 #include "video_core/vulkan_common/vulkan_wrapper.h"
@@ -53,19 +54,6 @@ struct hash<Vulkan::RenderPassKey> {

 namespace Vulkan {

-struct ImageBufferMap {
-    [[nodiscard]] VkBuffer Handle() const noexcept {
-        return handle;
-    }
-
-    [[nodiscard]] std::span<u8> Span() const noexcept {
-        return span;
-    }
-
-    VkBuffer handle;
-    std::span<u8> span;
-};
-
 struct TextureCacheRuntime {
     const Device& device;
     VKScheduler& scheduler;
@@ -76,9 +64,9 @@ struct TextureCacheRuntime {

     void Finish();

-    [[nodiscard]] ImageBufferMap MapUploadBuffer(size_t size);
+    [[nodiscard]] StagingBufferRef UploadStagingBuffer(size_t size);

-    [[nodiscard]] ImageBufferMap MapDownloadBuffer(size_t size);
+    [[nodiscard]] StagingBufferRef DownloadStagingBuffer(size_t size);

     void BlitImage(Framebuffer* dst_framebuffer, ImageView& dst, ImageView& src,
                    const std::array<Offset2D, 2>& dst_region,
@@ -94,7 +82,7 @@ struct TextureCacheRuntime {
         return false;
     }

-    void AccelerateImageUpload(Image&, const ImageBufferMap&, size_t,
+    void AccelerateImageUpload(Image&, const StagingBufferRef&, size_t,
                                std::span<const VideoCommon::SwizzleParameters>) {
         UNREACHABLE();
     }
@@ -112,13 +100,13 @@ public:
     explicit Image(TextureCacheRuntime&, const VideoCommon::ImageInfo& info, GPUVAddr gpu_addr,
                    VAddr cpu_addr);

-    void UploadMemory(const ImageBufferMap& map, size_t buffer_offset,
+    void UploadMemory(const StagingBufferRef& map, size_t buffer_offset,
                       std::span<const VideoCommon::BufferImageCopy> copies);

-    void UploadMemory(const ImageBufferMap& map, size_t buffer_offset,
+    void UploadMemory(const StagingBufferRef& map, size_t buffer_offset,
                       std::span<const VideoCommon::BufferCopy> copies);

-    void DownloadMemory(const ImageBufferMap& map, size_t buffer_offset,
+    void DownloadMemory(const StagingBufferRef& map, size_t buffer_offset,
                         std::span<const VideoCommon::BufferImageCopy> copies);

     [[nodiscard]] VkImage Handle() const noexcept {
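
[Editor's note] ImageBufferMap's replacement, StagingBufferRef, comes from vk_staging_buffer_pool.h, which this header now includes. A minimal sketch of the shape implied by the call sites in this diff (map.buffer, map.mapped_span); the actual definition may carry additional pool bookkeeping.

#include <span>

#include "common/common_types.h"
#include "video_core/vulkan_common/vulkan_wrapper.h"

namespace Vulkan {

struct StagingBufferRef {
    VkBuffer buffer;           // handle recorded into vkCmdCopy* commands
    std::span<u8> mapped_span; // host-visible bytes used for swizzling/deswizzling
};

} // namespace Vulkan
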
diff --git a/src/video_core/shader/async_shaders.h b/src/video_core/shader/async_shaders.h
index 0dbb1a31f..7fdff6e56 100644
--- a/src/video_core/shader/async_shaders.h
+++ b/src/video_core/shader/async_shaders.h
@@ -9,16 +9,7 @@
 #include <shared_mutex>
 #include <thread>

-// This header includes both Vulkan and OpenGL headers, this has to be fixed
-// Unfortunately, including OpenGL will include Windows.h that defines macros that can cause issues.
-// Forcefully include glad early and undefine macros
 #include <glad/glad.h>
-#ifdef CreateEvent
-#undef CreateEvent
-#endif
-#ifdef CreateSemaphore
-#undef CreateSemaphore
-#endif

 #include "common/common_types.h"
 #include "video_core/renderer_opengl/gl_device.h"
diff --git a/src/video_core/shader/decode/other.cpp b/src/video_core/shader/decode/other.cpp
index d3ea07aac..5f88537bc 100644
--- a/src/video_core/shader/decode/other.cpp
+++ b/src/video_core/shader/decode/other.cpp
@@ -76,6 +76,7 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
     case SystemVariable::InvocationId:
         return Operation(OperationCode::InvocationId);
     case SystemVariable::Ydirection:
+        uses_y_negate = true;
         return Operation(OperationCode::YNegate);
     case SystemVariable::InvocationInfo:
         LOG_WARNING(HW_GPU, "S2R instruction with InvocationInfo is incomplete");
diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h
index 0c6ab0f07..1cd7c14d7 100644
--- a/src/video_core/shader/shader_ir.h
+++ b/src/video_core/shader/shader_ir.h
@@ -139,6 +139,10 @@ public:
         return uses_legacy_varyings;
     }

+    bool UsesYNegate() const {
+        return uses_y_negate;
+    }
+
     bool UsesWarps() const {
         return uses_warps;
     }
@@ -465,6 +469,7 @@ private:
     bool uses_instance_id{};
     bool uses_vertex_id{};
     bool uses_legacy_varyings{};
+    bool uses_y_negate{};
     bool uses_warps{};
     bool uses_indexed_samplers{};

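
[Editor's note] The new uses_y_negate flag is set only by the S2R Ydirection case in decode/other.cpp above and surfaced through UsesYNegate(). A hypothetical consumer on the backend side would gate y-direction state on it along these lines; EmitYDirectionSetup is illustrative and not an API introduced by this commit.

#include "video_core/shader/shader_ir.h"

// Illustrative only: skip emitting y-direction/viewport-flip state when the shader never
// reads SystemVariable::Ydirection.
void EmitYDirectionSetup(const VideoCommon::Shader::ShaderIR& ir) {
    if (!ir.UsesYNegate()) {
        return;
    }
    // ... emit or bind the y_direction constant here ...
}
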
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index d1080300f..f336b705f 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -103,9 +103,6 @@ public:
     /// Notify the cache that a new frame has been queued
     void TickFrame();

-    /// Return an unique mutually exclusive lock for the cache
-    [[nodiscard]] std::unique_lock<std::mutex> AcquireLock();
-
     /// Return a constant reference to the given image view id
     [[nodiscard]] const ImageView& GetImageView(ImageViewId id) const noexcept;

@@ -179,6 +176,8 @@ public:
     /// Return true when a CPU region is modified from the GPU
     [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size);

+    std::mutex mutex;
+
 private:
     /// Iterate over all page indices in a range
     template <typename Func>
@@ -212,8 +211,8 @@ private:
     void RefreshContents(Image& image);

     /// Upload data from guest to an image
-    template <typename MapBuffer>
-    void UploadImageContents(Image& image, MapBuffer& map, size_t buffer_offset);
+    template <typename StagingBuffer>
+    void UploadImageContents(Image& image, StagingBuffer& staging_buffer, size_t buffer_offset);

     /// Find or create an image view from a guest descriptor
     [[nodiscard]] ImageViewId FindImageView(const TICEntry& config);
@@ -325,8 +324,6 @@ private:

     RenderTargets render_targets;

-    std::mutex mutex;
-
     std::unordered_map<TICEntry, ImageViewId> image_views;
     std::unordered_map<TSCEntry, SamplerId> samplers;
     std::unordered_map<RenderTargets, FramebufferId> framebuffers;
@@ -386,11 +383,6 @@ void TextureCache<P>::TickFrame() {
 }

 template <class P>
-std::unique_lock<std::mutex> TextureCache<P>::AcquireLock() {
-    return std::unique_lock{mutex};
-}
-
-template <class P>
 const typename P::ImageView& TextureCache<P>::GetImageView(ImageViewId id) const noexcept {
     return slot_image_views[id];
 }
@@ -598,11 +590,11 @@ void TextureCache<P>::DownloadMemory(VAddr cpu_addr, size_t size) {
     });
     for (const ImageId image_id : images) {
         Image& image = slot_images[image_id];
-        auto map = runtime.MapDownloadBuffer(image.unswizzled_size_bytes);
+        auto map = runtime.DownloadStagingBuffer(image.unswizzled_size_bytes);
         const auto copies = FullDownloadCopies(image.info);
         image.DownloadMemory(map, 0, copies);
         runtime.Finish();
-        SwizzleImage(gpu_memory, image.gpu_addr, image.info, copies, map.Span());
+        SwizzleImage(gpu_memory, image.gpu_addr, image.info, copies, map.mapped_span);
     }
 }

@@ -757,7 +749,7 @@ void TextureCache<P>::PopAsyncFlushes() {
     for (const ImageId image_id : download_ids) {
         total_size_bytes += slot_images[image_id].unswizzled_size_bytes;
     }
-    auto download_map = runtime.MapDownloadBuffer(total_size_bytes);
+    auto download_map = runtime.DownloadStagingBuffer(total_size_bytes);
     size_t buffer_offset = 0;
     for (const ImageId image_id : download_ids) {
         Image& image = slot_images[image_id];
@@ -769,7 +761,7 @@ void TextureCache<P>::PopAsyncFlushes() {
     runtime.Finish();

     buffer_offset = 0;
-    const std::span<u8> download_span = download_map.Span();
+    const std::span<u8> download_span = download_map.mapped_span;
     for (const ImageId image_id : download_ids) {
         const ImageBase& image = slot_images[image_id];
         const auto copies = FullDownloadCopies(image.info);
@@ -806,7 +798,7 @@ void TextureCache<P>::RefreshContents(Image& image) {
         LOG_WARNING(HW_GPU, "MSAA image uploads are not implemented");
         return;
     }
-    auto map = runtime.MapUploadBuffer(MapSizeBytes(image));
+    auto map = runtime.UploadStagingBuffer(MapSizeBytes(image));
     UploadImageContents(image, map, 0);
     runtime.InsertUploadMemoryBarrier();
 }
@@ -814,7 +806,7 @@ void TextureCache<P>::RefreshContents(Image& image) {
 template <class P>
 template <typename MapBuffer>
 void TextureCache<P>::UploadImageContents(Image& image, MapBuffer& map, size_t buffer_offset) {
-    const std::span<u8> mapped_span = map.Span().subspan(buffer_offset);
+    const std::span<u8> mapped_span = map.mapped_span.subspan(buffer_offset);
     const GPUVAddr gpu_addr = image.gpu_addr;

     if (True(image.flags & ImageFlagBits::AcceleratedUpload)) {
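
[Editor's note] With AcquireLock() removed and the mutex promoted to a public member, callers are expected to lock the cache themselves around each operation. A minimal sketch of that calling pattern, assuming the cache's public WriteMemory() declared in this header; the helper name is illustrative, not part of the commit.

#include <mutex>

#include "common/common_types.h"

// Lock the cache's public mutex for the duration of the call, as callers elsewhere in this
// change set do instead of calling the removed AcquireLock().
template <class TextureCache>
void NotifyCpuWrite(TextureCache& texture_cache, VAddr addr, u64 size) {
    std::scoped_lock lock{texture_cache.mutex};
    texture_cache.WriteMemory(addr, size);
}
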
diff --git a/src/video_core/vulkan_common/vulkan_memory_allocator.h b/src/video_core/vulkan_common/vulkan_memory_allocator.h
index 86393310a..d1ce29450 100644
--- a/src/video_core/vulkan_common/vulkan_memory_allocator.h
+++ b/src/video_core/vulkan_common/vulkan_memory_allocator.h
@@ -78,7 +78,7 @@ public:
      *
      * @throw vk::Exception on failure
      */
-    explicit MemoryAllocator(const Device& device_, bool export_allocations_ = false);
+    explicit MemoryAllocator(const Device& device_, bool export_allocations_);
     ~MemoryAllocator();

     MemoryAllocator& operator=(const MemoryAllocator&) = delete;