author     ReinUsesLisp  2021-01-16 16:20:18 -0300
committer  ReinUsesLisp  2021-02-13 02:17:24 -0300
commit     35df1d1864ba721ea7b1cebf9a106dd771cde4f5 (patch)
tree       034a8281294246e2a8eea92d1937607ad00ed428 /src/video_core/renderer_vulkan
parent     vulkan_device: Enable robustBufferAccess (diff)
vk_staging_buffer_pool: Add stream buffer for small uploads
This uses a ring buffer, similar to OpenGL's stream buffer, for small uploads. It stops us from allocating several small buffers, reducing memory fragmentation and improving cache locality. It uses dedicated allocations when possible.
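For context, here is a minimal sketch of the ring-buffer scheme described above. It is illustrative only and simplifies the logic added in this commit: GpuScheduler and StreamRing are hypothetical stand-ins for yuzu's VKScheduler and StagingBufferPool, and free_iterator tracking and the 8 MiB request cap are omitted.

// Illustrative sketch only (not the code added by this commit): a ring buffer
// split into NUM_SYNCS regions. Each region remembers the GPU tick in which it
// was last written; before handing out space that overlaps a region, the
// allocator waits for that tick.
#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdint>

class GpuScheduler {
public:
    std::uint64_t CurrentTick() const noexcept { return tick; }
    void Wait(std::uint64_t /*target*/) { /* block until the GPU has reached 'target' */ }
    void NextTick() noexcept { ++tick; }

private:
    std::uint64_t tick = 1;
};

class StreamRing {
public:
    static constexpr std::size_t NUM_SYNCS = 16;
    static constexpr std::size_t SIZE = 128 * 1024 * 1024;
    static constexpr std::size_t REGION = SIZE / NUM_SYNCS;

    explicit StreamRing(GpuScheduler& scheduler_) : scheduler{scheduler_} {}

    // Returns an offset with at least 'size' free bytes, stalling only when the
    // GPU may still be reading the regions that are about to be overwritten.
    std::size_t Allocate(std::size_t size) {
        // Stamp the regions written since the last allocation with the current tick.
        for (std::size_t r = Region(used_iterator); r <= LastRegion(iterator); ++r) {
            sync_ticks[r] = scheduler.CurrentTick();
        }
        used_iterator = iterator;
        if (iterator + size > SIZE) {
            // Wrap around to the start of the buffer.
            iterator = used_iterator = 0;
        }
        // Wait for the regions the new allocation will occupy.
        for (std::size_t r = Region(iterator); r <= LastRegion(iterator + size); ++r) {
            scheduler.Wait(sync_ticks[r]);
        }
        const std::size_t offset = iterator;
        iterator += size;
        return offset;
    }

private:
    static std::size_t Region(std::size_t offset) noexcept { return offset / REGION; }
    static std::size_t LastRegion(std::size_t end) noexcept {
        return std::min(end / REGION, NUM_SYNCS - 1);
    }

    GpuScheduler& scheduler;
    std::size_t iterator = 0;      // next free byte
    std::size_t used_iterator = 0; // first byte not yet stamped with a tick
    std::array<std::uint64_t, NUM_SYNCS> sync_ticks{};
};

The actual GetStreamBuffer in vk_staging_buffer_pool.cpp below additionally tracks a free_iterator, aligns returned offsets to MAX_ALIGNMENT, and only services uploads of up to MAX_STREAM_BUFFER_REQUEST_SIZE (8 MiB); larger or non-upload requests still go through the existing per-size staging buffer cache.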
Diffstat (limited to 'src/video_core/renderer_vulkan')
-rw-r--r--  src/video_core/renderer_vulkan/vk_buffer_cache.cpp          23
-rw-r--r--  src/video_core/renderer_vulkan/vk_compute_pass.cpp          61
-rw-r--r--  src/video_core/renderer_vulkan/vk_compute_pass.h              9
-rw-r--r--  src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp  142
-rw-r--r--  src/video_core/renderer_vulkan/vk_staging_buffer_pool.h     20
-rw-r--r--  src/video_core/renderer_vulkan/vk_texture_cache.cpp         14
-rw-r--r--  src/video_core/renderer_vulkan/vk_texture_cache.h             9
7 files changed, 214 insertions, 64 deletions
diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
index 48fc5d966..4f1e4ec28 100644
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
@@ -138,17 +138,18 @@ void BufferCacheRuntime::CopyBuffer(VkBuffer dst_buffer, VkBuffer src_buffer,
 void BufferCacheRuntime::BindIndexBuffer(PrimitiveTopology topology, IndexFormat index_format,
                                          u32 base_vertex, u32 num_indices, VkBuffer buffer,
                                          u32 offset, [[maybe_unused]] u32 size) {
-    VkIndexType index_type = MaxwellToVK::IndexFormat(index_format);
+    VkIndexType vk_index_type = MaxwellToVK::IndexFormat(index_format);
+    VkDeviceSize vk_offset = offset;
     if (topology == PrimitiveTopology::Quads) {
-        index_type = VK_INDEX_TYPE_UINT32;
-        std::tie(buffer, offset) =
+        vk_index_type = VK_INDEX_TYPE_UINT32;
+        std::tie(buffer, vk_offset) =
             quad_index_pass.Assemble(index_format, num_indices, base_vertex, buffer, offset);
-    } else if (index_type == VK_INDEX_TYPE_UINT8_EXT && !device.IsExtIndexTypeUint8Supported()) {
-        index_type = VK_INDEX_TYPE_UINT16;
-        std::tie(buffer, offset) = uint8_pass.Assemble(num_indices, buffer, offset);
+    } else if (vk_index_type == VK_INDEX_TYPE_UINT8_EXT && !device.IsExtIndexTypeUint8Supported()) {
+        vk_index_type = VK_INDEX_TYPE_UINT16;
+        std::tie(buffer, vk_offset) = uint8_pass.Assemble(num_indices, buffer, offset);
     }
-    scheduler.Record([buffer, offset, index_type](vk::CommandBuffer cmdbuf) {
-        cmdbuf.BindIndexBuffer(buffer, offset, index_type);
+    scheduler.Record([buffer, vk_offset, vk_index_type](vk::CommandBuffer cmdbuf) {
+        cmdbuf.BindIndexBuffer(buffer, vk_offset, vk_index_type);
     });
 }
 
@@ -251,10 +252,10 @@ void BufferCacheRuntime::ReserveQuadArrayLUT(u32 num_indices, bool wait_for_idle
         }
     }
     scheduler.RequestOutsideRenderPassOperationContext();
-    scheduler.Record([src_buffer = staging.buffer, dst_buffer = *quad_array_lut,
-                      size_bytes](vk::CommandBuffer cmdbuf) {
+    scheduler.Record([src_buffer = staging.buffer, src_offset = staging.offset,
+                      dst_buffer = *quad_array_lut, size_bytes](vk::CommandBuffer cmdbuf) {
         const VkBufferCopy copy{
-            .srcOffset = 0,
+            .srcOffset = src_offset,
             .dstOffset = 0,
             .size = size_bytes,
         };
diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp
index a4fdcdf81..2f9a7b028 100644
--- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp
+++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp
@@ -10,6 +10,7 @@
 #include "common/alignment.h"
 #include "common/assert.h"
 #include "common/common_types.h"
+#include "common/div_ceil.h"
 #include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h"
 #include "video_core/host_shaders/vulkan_uint8_comp_spv.h"
 #include "video_core/renderer_vulkan/vk_compute_pass.h"
@@ -148,38 +149,33 @@ Uint8Pass::Uint8Pass(const Device& device, VKScheduler& scheduler_,
 
 Uint8Pass::~Uint8Pass() = default;
 
-std::pair<VkBuffer, u32> Uint8Pass::Assemble(u32 num_vertices, VkBuffer src_buffer,
-                                             u32 src_offset) {
+std::pair<VkBuffer, VkDeviceSize> Uint8Pass::Assemble(u32 num_vertices, VkBuffer src_buffer,
+                                                      u32 src_offset) {
     const u32 staging_size = static_cast<u32>(num_vertices * sizeof(u16));
     const auto staging = staging_buffer_pool.Request(staging_size, MemoryUsage::DeviceLocal);
 
     update_descriptor_queue.Acquire();
     update_descriptor_queue.AddBuffer(src_buffer, src_offset, num_vertices);
-    update_descriptor_queue.AddBuffer(staging.buffer, 0, staging_size);
+    update_descriptor_queue.AddBuffer(staging.buffer, staging.offset, staging_size);
     const VkDescriptorSet set = CommitDescriptorSet(update_descriptor_queue);
 
     scheduler.RequestOutsideRenderPassOperationContext();
     scheduler.Record([layout = *layout, pipeline = *pipeline, buffer = staging.buffer, set,
                       num_vertices](vk::CommandBuffer cmdbuf) {
-        constexpr u32 dispatch_size = 1024;
+        static constexpr u32 DISPATCH_SIZE = 1024;
+        static constexpr VkMemoryBarrier WRITE_BARRIER{
+            .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
+            .pNext = nullptr,
+            .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
+            .dstAccessMask = VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT,
+        };
         cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, pipeline);
         cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, layout, 0, set, {});
-        cmdbuf.Dispatch(Common::AlignUp(num_vertices, dispatch_size) / dispatch_size, 1, 1);
-
-        VkBufferMemoryBarrier barrier;
-        barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
-        barrier.pNext = nullptr;
-        barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
-        barrier.dstAccessMask = VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT;
-        barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
-        barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
-        barrier.buffer = buffer;
-        barrier.offset = 0;
-        barrier.size = static_cast<VkDeviceSize>(num_vertices * sizeof(u16));
+        cmdbuf.Dispatch(Common::DivCeil(num_vertices, DISPATCH_SIZE), 1, 1);
         cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
-                               VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, {}, barrier, {});
+                               VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, WRITE_BARRIER);
     });
-    return {staging.buffer, 0};
+    return {staging.buffer, staging.offset};
 }
 
 QuadIndexedPass::QuadIndexedPass(const Device& device_, VKScheduler& scheduler_,
@@ -194,7 +190,7 @@ QuadIndexedPass::QuadIndexedPass(const Device& device_, VKScheduler& scheduler_,
 
 QuadIndexedPass::~QuadIndexedPass() = default;
 
-std::pair<VkBuffer, u32> QuadIndexedPass::Assemble(
+std::pair<VkBuffer, VkDeviceSize> QuadIndexedPass::Assemble(
     Tegra::Engines::Maxwell3D::Regs::IndexFormat index_format, u32 num_vertices, u32 base_vertex,
     VkBuffer src_buffer, u32 src_offset) {
     const u32 index_shift = [index_format] {
@@ -217,34 +213,29 @@ std::pair<VkBuffer, u32> QuadIndexedPass::Assemble(
 
     update_descriptor_queue.Acquire();
     update_descriptor_queue.AddBuffer(src_buffer, src_offset, input_size);
-    update_descriptor_queue.AddBuffer(staging.buffer, 0, staging_size);
+    update_descriptor_queue.AddBuffer(staging.buffer, staging.offset, staging_size);
     const VkDescriptorSet set = CommitDescriptorSet(update_descriptor_queue);
 
     scheduler.RequestOutsideRenderPassOperationContext();
     scheduler.Record([layout = *layout, pipeline = *pipeline, buffer = staging.buffer, set,
                       num_tri_vertices, base_vertex, index_shift](vk::CommandBuffer cmdbuf) {
-        static constexpr u32 dispatch_size = 1024;
+        static constexpr u32 DISPATCH_SIZE = 1024;
+        static constexpr VkMemoryBarrier WRITE_BARRIER{
+            .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
+            .pNext = nullptr,
+            .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
+            .dstAccessMask = VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT,
+        };
         const std::array push_constants = {base_vertex, index_shift};
         cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, pipeline);
         cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, layout, 0, set, {});
         cmdbuf.PushConstants(layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_constants),
                              &push_constants);
-        cmdbuf.Dispatch(Common::AlignUp(num_tri_vertices, dispatch_size) / dispatch_size, 1, 1);
-
-        VkBufferMemoryBarrier barrier;
-        barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
-        barrier.pNext = nullptr;
-        barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
-        barrier.dstAccessMask = VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT;
-        barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
-        barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
-        barrier.buffer = buffer;
-        barrier.offset = 0;
-        barrier.size = static_cast<VkDeviceSize>(num_tri_vertices * sizeof(u32));
+        cmdbuf.Dispatch(Common::DivCeil(num_tri_vertices, DISPATCH_SIZE), 1, 1);
         cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
-                               VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, {}, barrier, {});
+                               VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, WRITE_BARRIER);
     });
-    return {staging.buffer, 0};
+    return {staging.buffer, staging.offset};
 }
 
 } // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.h b/src/video_core/renderer_vulkan/vk_compute_pass.h
index 4904019f5..17d781d99 100644
--- a/src/video_core/renderer_vulkan/vk_compute_pass.h
+++ b/src/video_core/renderer_vulkan/vk_compute_pass.h
@@ -50,7 +50,8 @@ public:
 
     /// Assemble uint8 indices into an uint16 index buffer
     /// Returns a pair with the staging buffer, and the offset where the assembled data is
-    std::pair<VkBuffer, u32> Assemble(u32 num_vertices, VkBuffer src_buffer, u32 src_offset);
+    std::pair<VkBuffer, VkDeviceSize> Assemble(u32 num_vertices, VkBuffer src_buffer,
+                                               u32 src_offset);
 
 private:
     VKScheduler& scheduler;
@@ -66,9 +67,9 @@ public:
                              VKUpdateDescriptorQueue& update_descriptor_queue_);
     ~QuadIndexedPass();
 
-    std::pair<VkBuffer, u32> Assemble(Tegra::Engines::Maxwell3D::Regs::IndexFormat index_format,
-                                      u32 num_vertices, u32 base_vertex, VkBuffer src_buffer,
-                                      u32 src_offset);
+    std::pair<VkBuffer, VkDeviceSize> Assemble(
+        Tegra::Engines::Maxwell3D::Regs::IndexFormat index_format, u32 num_vertices,
+        u32 base_vertex, VkBuffer src_buffer, u32 src_offset);
 
 private:
     VKScheduler& scheduler;
diff --git a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp
index 97fd41cc1..275d740b8 100644
--- a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp
+++ b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp
@@ -8,6 +8,7 @@
 
 #include <fmt/format.h>
 
+#include "common/alignment.h"
 #include "common/assert.h"
 #include "common/bit_util.h"
 #include "common/common_types.h"
@@ -17,14 +18,117 @@
 #include "video_core/vulkan_common/vulkan_wrapper.h"
 
 namespace Vulkan {
+namespace {
+// Maximum potential alignment of a Vulkan buffer
+constexpr VkDeviceSize MAX_ALIGNMENT = 256;
+// Maximum size to put elements in the stream buffer
+constexpr VkDeviceSize MAX_STREAM_BUFFER_REQUEST_SIZE = 8 * 1024 * 1024;
+// Stream buffer size in bytes
+constexpr VkDeviceSize STREAM_BUFFER_SIZE = 128 * 1024 * 1024;
+constexpr VkDeviceSize REGION_SIZE = STREAM_BUFFER_SIZE / StagingBufferPool::NUM_SYNCS;
+
+constexpr VkMemoryPropertyFlags HOST_FLAGS =
+    VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
+constexpr VkMemoryPropertyFlags STREAM_FLAGS = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | HOST_FLAGS;
+
+bool IsStreamHeap(VkMemoryHeap heap) noexcept {
+    return STREAM_BUFFER_SIZE < (heap.size * 2) / 3;
+}
+
+std::optional<u32> FindMemoryTypeIndex(const VkPhysicalDeviceMemoryProperties& props, u32 type_mask,
+                                       VkMemoryPropertyFlags flags) noexcept {
+    for (u32 type_index = 0; type_index < props.memoryTypeCount; ++type_index) {
+        if (((type_mask >> type_index) & 1) == 0) {
+            // Memory type is incompatible
+            continue;
+        }
+        const VkMemoryType& memory_type = props.memoryTypes[type_index];
+        if ((memory_type.propertyFlags & flags) != flags) {
+            // Memory type doesn't have the flags we want
+            continue;
+        }
+        if (!IsStreamHeap(props.memoryHeaps[memory_type.heapIndex])) {
+            // Memory heap is not suitable for streaming
+            continue;
+        }
+        // Success!
+        return type_index;
+    }
+    return std::nullopt;
+}
+
+u32 FindMemoryTypeIndex(const VkPhysicalDeviceMemoryProperties& props, u32 type_mask) {
+    // Try to find a DEVICE_LOCAL_BIT type, Nvidia and AMD have a dedicated heap for this
+    std::optional<u32> type = FindMemoryTypeIndex(props, type_mask, STREAM_FLAGS);
+    if (type) {
+        return *type;
+    }
+    // Otherwise try without the DEVICE_LOCAL_BIT
+    type = FindMemoryTypeIndex(props, type_mask, HOST_FLAGS);
+    if (type) {
+        return *type;
+    }
+    // This should never happen, and in case it does, signal it as an out of memory situation
+    throw vk::Exception(VK_ERROR_OUT_OF_DEVICE_MEMORY);
+}
+
+size_t Region(size_t iterator) noexcept {
+    return iterator / REGION_SIZE;
+}
+} // Anonymous namespace
 
 StagingBufferPool::StagingBufferPool(const Device& device_, MemoryAllocator& memory_allocator_,
                                      VKScheduler& scheduler_)
-    : device{device_}, memory_allocator{memory_allocator_}, scheduler{scheduler_} {}
+    : device{device_}, memory_allocator{memory_allocator_}, scheduler{scheduler_} {
+    const vk::Device& dev = device.GetLogical();
+    stream_buffer = dev.CreateBuffer(VkBufferCreateInfo{
+        .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
+        .pNext = nullptr,
+        .flags = 0,
+        .size = STREAM_BUFFER_SIZE,
+        .usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT,
+        .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
+        .queueFamilyIndexCount = 0,
+        .pQueueFamilyIndices = nullptr,
+    });
+    if (device.HasDebuggingToolAttached()) {
+        stream_buffer.SetObjectNameEXT("Stream Buffer");
+    }
+    VkMemoryDedicatedRequirements dedicated_reqs{
+        .sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS,
+        .pNext = nullptr,
+        .prefersDedicatedAllocation = VK_FALSE,
+        .requiresDedicatedAllocation = VK_FALSE,
+    };
+    const auto requirements = dev.GetBufferMemoryRequirements(*stream_buffer, &dedicated_reqs);
+    const bool make_dedicated = dedicated_reqs.prefersDedicatedAllocation == VK_TRUE ||
+                                dedicated_reqs.requiresDedicatedAllocation == VK_TRUE;
+    const VkMemoryDedicatedAllocateInfo dedicated_info{
+        .sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO,
+        .pNext = nullptr,
+        .image = nullptr,
+        .buffer = *stream_buffer,
+    };
+    const auto memory_properties = device.GetPhysical().GetMemoryProperties();
+    stream_memory = dev.AllocateMemory(VkMemoryAllocateInfo{
+        .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
+        .pNext = make_dedicated ? &dedicated_info : nullptr,
+        .allocationSize = requirements.size,
+        .memoryTypeIndex = FindMemoryTypeIndex(memory_properties, requirements.memoryTypeBits),
+    });
+    if (device.HasDebuggingToolAttached()) {
+        stream_memory.SetObjectNameEXT("Stream Buffer Memory");
+    }
+    stream_buffer.BindMemory(*stream_memory, 0);
+    stream_pointer = stream_memory.Map(0, STREAM_BUFFER_SIZE);
+}
 
 StagingBufferPool::~StagingBufferPool() = default;
 
 StagingBufferRef StagingBufferPool::Request(size_t size, MemoryUsage usage) {
+    if (usage == MemoryUsage::Upload && size <= MAX_STREAM_BUFFER_REQUEST_SIZE) {
+        return GetStreamBuffer(size);
+    }
     if (const std::optional<StagingBufferRef> ref = TryGetReservedBuffer(size, usage)) {
         return *ref;
     }
@@ -39,6 +143,42 @@ void StagingBufferPool::TickFrame() {
     ReleaseCache(MemoryUsage::Download);
 }
 
+StagingBufferRef StagingBufferPool::GetStreamBuffer(size_t size) {
+    for (size_t region = Region(used_iterator), region_end = Region(iterator); region < region_end;
+         ++region) {
+        sync_ticks[region] = scheduler.CurrentTick();
+    }
+    used_iterator = iterator;
+
+    for (size_t region = Region(free_iterator) + 1,
+                region_end = std::min(Region(iterator + size) + 1, NUM_SYNCS);
+         region < region_end; ++region) {
+        scheduler.Wait(sync_ticks[region]);
+    }
+    if (iterator + size > free_iterator) {
+        free_iterator = iterator + size;
+    }
+    if (iterator + size > STREAM_BUFFER_SIZE) {
+        for (size_t region = Region(used_iterator); region < NUM_SYNCS; ++region) {
+            sync_ticks[region] = scheduler.CurrentTick();
+        }
+        used_iterator = 0;
+        iterator = 0;
+        free_iterator = size;
+
+        for (size_t region = 0, region_end = Region(size); region <= region_end; ++region) {
+            scheduler.Wait(sync_ticks[region]);
+        }
+    }
+    const size_t offset = iterator;
+    iterator = Common::AlignUp(iterator + size, MAX_ALIGNMENT);
+    return StagingBufferRef{
+        .buffer = *stream_buffer,
+        .offset = static_cast<VkDeviceSize>(offset),
+        .mapped_span = std::span<u8>(stream_pointer + offset, size),
+    };
+}
+
 std::optional<StagingBufferRef> StagingBufferPool::TryGetReservedBuffer(size_t size,
                                                                         MemoryUsage usage) {
     StagingBuffers& cache_level = GetCache(usage)[Common::Log2Ceil64(size)];
diff --git a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h
index d42918a47..4ed99c0df 100644
--- a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h
+++ b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h
@@ -19,11 +19,14 @@ class VKScheduler;
 
 struct StagingBufferRef {
     VkBuffer buffer;
+    VkDeviceSize offset;
     std::span<u8> mapped_span;
 };
 
 class StagingBufferPool {
 public:
+    static constexpr size_t NUM_SYNCS = 16;
+
     explicit StagingBufferPool(const Device& device, MemoryAllocator& memory_allocator,
                                VKScheduler& scheduler);
     ~StagingBufferPool();
@@ -33,6 +36,11 @@ public:
     void TickFrame();
 
 private:
+    struct StreamBufferCommit {
+        size_t upper_bound;
+        u64 tick;
+    };
+
     struct StagingBuffer {
         vk::Buffer buffer;
         MemoryCommit commit;
@@ -42,6 +50,7 @@ private:
         StagingBufferRef Ref() const noexcept {
             return {
                 .buffer = *buffer,
+                .offset = 0,
                 .mapped_span = mapped_span,
             };
         }
@@ -56,6 +65,8 @@ private:
     static constexpr size_t NUM_LEVELS = sizeof(size_t) * CHAR_BIT;
     using StagingBuffersCache = std::array<StagingBuffers, NUM_LEVELS>;
 
+    StagingBufferRef GetStreamBuffer(size_t size);
+
     std::optional<StagingBufferRef> TryGetReservedBuffer(size_t size, MemoryUsage usage);
 
     StagingBufferRef CreateStagingBuffer(size_t size, MemoryUsage usage);
@@ -70,6 +81,15 @@ private:
     MemoryAllocator& memory_allocator;
     VKScheduler& scheduler;
 
+    vk::Buffer stream_buffer;
+    vk::DeviceMemory stream_memory;
+    u8* stream_pointer = nullptr;
+
+    size_t iterator = 0;
+    size_t used_iterator = 0;
+    size_t free_iterator = 0;
+    std::array<u64, NUM_SYNCS> sync_ticks{};
+
     StagingBuffersCache device_local_cache;
     StagingBuffersCache upload_cache;
     StagingBuffersCache download_cache;
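As a usage note (a sketch, not code from this commit): callers now read the offset out of the returned StagingBufferRef instead of assuming the staging data starts at byte zero, which is exactly the pattern the buffer cache and the texture cache diffs below follow. UploadBytes is an invented helper name, and the CopyBuffer recording call is assumed from the wrapper used elsewhere in the renderer; the types and members it touches all appear in the diffs above.

// Hypothetical caller (UploadBytes is not part of this commit). It requests
// stream space, writes through the mapped span, and records a copy whose
// srcOffset is the offset handed back by the pool.
void UploadBytes(StagingBufferPool& pool, VKScheduler& scheduler, VkBuffer dst_buffer,
                 std::span<const u8> data) {
    const StagingBufferRef staging = pool.Request(data.size_bytes(), MemoryUsage::Upload);
    std::memcpy(staging.mapped_span.data(), data.data(), data.size_bytes());
    scheduler.Record([src_buffer = staging.buffer, src_offset = staging.offset, dst_buffer,
                      size = data.size_bytes()](vk::CommandBuffer cmdbuf) {
        const VkBufferCopy copy{
            .srcOffset = src_offset, // no longer always 0: stream requests are suballocations
            .dstOffset = 0,
            .size = size,
        };
        cmdbuf.CopyBuffer(src_buffer, dst_buffer, copy);
    });
}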
diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
index 1eeb45ca9..22a1014a9 100644
--- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
@@ -818,11 +818,10 @@ Image::Image(TextureCacheRuntime& runtime, const ImageInfo& info_, GPUVAddr gpu_
     }
 }
 
-void Image::UploadMemory(const StagingBufferRef& map, size_t buffer_offset,
-                         std::span<const BufferImageCopy> copies) {
+void Image::UploadMemory(const StagingBufferRef& map, std::span<const BufferImageCopy> copies) {
     // TODO: Move this to another API
     scheduler->RequestOutsideRenderPassOperationContext();
-    std::vector vk_copies = TransformBufferImageCopies(copies, buffer_offset, aspect_mask);
+    std::vector vk_copies = TransformBufferImageCopies(copies, map.offset, aspect_mask);
     const VkBuffer src_buffer = map.buffer;
     const VkImage vk_image = *image;
     const VkImageAspectFlags vk_aspect_mask = aspect_mask;
@@ -833,11 +832,11 @@ void Image::UploadMemory(const StagingBufferRef& map, size_t buffer_offset,
     });
 }
 
-void Image::UploadMemory(const StagingBufferRef& map, size_t buffer_offset,
+void Image::UploadMemory(const StagingBufferRef& map,
                          std::span<const VideoCommon::BufferCopy> copies) {
     // TODO: Move this to another API
     scheduler->RequestOutsideRenderPassOperationContext();
-    std::vector vk_copies = TransformBufferCopies(copies, buffer_offset);
+    std::vector vk_copies = TransformBufferCopies(copies, map.offset);
     const VkBuffer src_buffer = map.buffer;
     const VkBuffer dst_buffer = *buffer;
     scheduler->Record([src_buffer, dst_buffer, vk_copies](vk::CommandBuffer cmdbuf) {
@@ -846,9 +845,8 @@ void Image::UploadMemory(const StagingBufferRef& map, size_t buffer_offset,
     });
 }
 
-void Image::DownloadMemory(const StagingBufferRef& map, size_t buffer_offset,
-                           std::span<const BufferImageCopy> copies) {
-    std::vector vk_copies = TransformBufferImageCopies(copies, buffer_offset, aspect_mask);
+void Image::DownloadMemory(const StagingBufferRef& map, std::span<const BufferImageCopy> copies) {
+    std::vector vk_copies = TransformBufferImageCopies(copies, map.offset, aspect_mask);
     scheduler->Record([buffer = map.buffer, image = *image, aspect_mask = aspect_mask,
                        vk_copies](vk::CommandBuffer cmdbuf) {
         const VkImageMemoryBarrier read_barrier{
diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h
index 4558c3297..b08c23459 100644
--- a/src/video_core/renderer_vulkan/vk_texture_cache.h
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.h
@@ -82,7 +82,7 @@ struct TextureCacheRuntime {
         return false;
     }
 
-    void AccelerateImageUpload(Image&, const StagingBufferRef&, size_t,
+    void AccelerateImageUpload(Image&, const StagingBufferRef&,
                                std::span<const VideoCommon::SwizzleParameters>) {
         UNREACHABLE();
     }
@@ -100,13 +100,12 @@ public:
     explicit Image(TextureCacheRuntime&, const VideoCommon::ImageInfo& info, GPUVAddr gpu_addr,
                    VAddr cpu_addr);
 
-    void UploadMemory(const StagingBufferRef& map, size_t buffer_offset,
+    void UploadMemory(const StagingBufferRef& map,
                       std::span<const VideoCommon::BufferImageCopy> copies);
 
-    void UploadMemory(const StagingBufferRef& map, size_t buffer_offset,
-                      std::span<const VideoCommon::BufferCopy> copies);
+    void UploadMemory(const StagingBufferRef& map, std::span<const VideoCommon::BufferCopy> copies);
 
-    void DownloadMemory(const StagingBufferRef& map, size_t buffer_offset,
+    void DownloadMemory(const StagingBufferRef& map,
                         std::span<const VideoCommon::BufferImageCopy> copies);
 
     [[nodiscard]] VkImage Handle() const noexcept {