From b780d5b5c580a65a670de73140b743072efc0fd2 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Tue, 13 Jul 2021 03:33:08 +0200 Subject: DMAEngine: Accelerate BufferClear --- src/video_core/buffer_cache/buffer_cache.h | 65 ++++++++++++++++++++-- src/video_core/engines/maxwell_dma.cpp | 6 +- src/video_core/engines/maxwell_dma.h | 2 + src/video_core/renderer_opengl/gl_buffer_cache.cpp | 6 ++ src/video_core/renderer_opengl/gl_buffer_cache.h | 2 + src/video_core/renderer_opengl/gl_rasterizer.cpp | 5 ++ src/video_core/renderer_opengl/gl_rasterizer.h | 2 + src/video_core/renderer_vulkan/vk_buffer_cache.cpp | 24 ++++++++ src/video_core/renderer_vulkan/vk_buffer_cache.h | 2 + src/video_core/renderer_vulkan/vk_rasterizer.cpp | 5 ++ src/video_core/renderer_vulkan/vk_rasterizer.h | 2 + 11 files changed, 115 insertions(+), 6 deletions(-) (limited to 'src') diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index 2871682f6..5f5a59bba 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -164,11 +164,16 @@ public: /// Pop asynchronous downloads void PopAsyncFlushes(); - [[nodiscard]] bool DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 amount); + bool DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 amount); + + bool DMAClear(GPUVAddr src_address, u64 amount, u32 value); /// Return true when a CPU region is modified from the GPU [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size); + /// Return true when a region is registered on the cache + [[nodiscard]] bool IsRegionRegistered(VAddr addr, size_t size); + /// Return true when a CPU region is modified from the CPU [[nodiscard]] bool IsRegionCpuModified(VAddr addr, size_t size); @@ -469,8 +474,8 @@ bool BufferCache
::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am if (!cpu_src_address || !cpu_dest_address) { return false; } - const bool source_dirty = IsRegionGpuModified(*cpu_src_address, amount); - const bool dest_dirty = IsRegionGpuModified(*cpu_dest_address, amount); + const bool source_dirty = IsRegionRegistered(*cpu_src_address, amount); + const bool dest_dirty = IsRegionRegistered(*cpu_dest_address, amount); if (!source_dirty && !dest_dirty) { return false; } @@ -515,7 +520,7 @@ bool BufferCache
::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am
}
runtime.CopyBuffer(dest_buffer, src_buffer, copies);
- if (source_dirty) {
+ if (IsRegionGpuModified(*cpu_src_address, amount)) {
dest_buffer.MarkRegionAsGpuModified(*cpu_dest_address, amount);
}
std::vector ::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am
return true;
}
+template ::DMAClear(GPUVAddr dst_address, u64 amount, u32 value) {
+ const std::optional ::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr,
u32 size) {
@@ -781,6 +817,27 @@ bool BufferCache ::IsRegionGpuModified(VAddr addr, size_t size) {
return false;
}
+/// Returns true when at least one buffer registered in page_table overlaps the
+/// range [addr, addr + size), false otherwise.
+template ::IsRegionRegistered(VAddr addr, size_t size) {
+    const VAddr end_addr = addr + size;
+    const u64 page_end = Common::DivCeil(end_addr, PAGE_SIZE);
+    for (u64 page = addr >> PAGE_BITS; page < page_end;) {
+        const BufferId buffer_id = page_table[page];
+        if (!buffer_id) {
+            // No buffer is registered on this page; try the next one.
+            ++page;
+            continue;
+        }
+        Buffer& buffer = slot_buffers[buffer_id];
+        const VAddr buf_start_addr = buffer.CpuAddr();
+        const VAddr buf_end_addr = buf_start_addr + buffer.SizeBytes();
+        if (buf_start_addr < end_addr && addr < buf_end_addr) {
+            return true;
+        }
+        // This buffer misses the queried range, so resume at the first page past the
+        // buffer. Skipping to the end of the queried range instead would terminate the
+        // loop and hide any buffer registered on the remaining pages.
+        // Always advance by at least one page so the scan is guaranteed to terminate.
+        const u64 next_page = Common::DivCeil(buf_end_addr, PAGE_SIZE);
+        page = next_page > page ? next_page : page + 1;
+    }
+    return false;
+}
+
template ::IsRegionCpuModified(VAddr addr, size_t size) {
const u64 page_end = Common::DivCeil(addr + size, PAGE_SIZE);
diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp
index 24481952b..81becb88a 100644
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -87,9 +87,11 @@ void MaxwellDMA::CopyPitchToPitch() {
// TODO: allow multisized components.
if (is_buffer_clear) {
ASSERT(regs.remap_const.component_size_minus_one == 3);
+ accelerate.BufferClear(regs.offset_out, regs.line_length_in, regs.remap_consta_value);
std::vector ::DownloadMemory(VAddr cpu_addr, u64 size) {
});
}
+template ::ClearDownload(IntervalType subtract_interval) {
+    // Remove the interval from every committed range set, then from the
+    // uncommitted set. The containers are distinct, so the order is irrelevant.
+    for (auto& committed_set : committed_ranges) {
+        committed_set.subtract(subtract_interval);
+    }
+    uncommitted_ranges.subtract(subtract_interval);
+}
+
template ::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 amount) {
const std::optional ::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am
}
const IntervalType subtract_interval{*cpu_dest_address, *cpu_dest_address + amount};
- uncommitted_ranges.subtract(subtract_interval);
- for (auto& interval_set : committed_ranges) {
- interval_set.subtract(subtract_interval);
- }
+ ClearDownload(subtract_interval);
BufferId buffer_a;
BufferId buffer_b;
@@ -496,7 +503,6 @@ bool BufferCache ::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am
auto& src_buffer = slot_buffers[buffer_a];
auto& dest_buffer = slot_buffers[buffer_b];
SynchronizeBuffer(src_buffer, *cpu_src_address, static_cast ::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am
ForEachWrittenRange(*cpu_src_address, amount, mirror);
// This subtraction in this order is important for overlapping copies.
common_ranges.subtract(subtract_interval);
+ bool atleast_1_download = tmp_intervals.size() != 0;
for (const IntervalType add_interval : tmp_intervals) {
common_ranges.add(add_interval);
}
+ if (dest_buffer.HasCachedWrites()) {
+ dest_buffer.FlushCachedWrites();
+ }
runtime.CopyBuffer(dest_buffer, src_buffer, copies);
- if (IsRegionGpuModified(*cpu_src_address, amount)) {
+ dest_buffer.UnmarkRegionAsCpuModified(*cpu_dest_address, amount);
+ if (atleast_1_download) {
dest_buffer.MarkRegionAsGpuModified(*cpu_dest_address, amount);
}
std::vector ::DMAClear(GPUVAddr dst_address, u64 amount, u32 value) {
}
const IntervalType subtract_interval{*cpu_dst_address, *cpu_dst_address + amount * sizeof(u32)};
- uncommitted_ranges.subtract(subtract_interval);
- for (auto& interval_set : committed_ranges) {
- interval_set.subtract(subtract_interval);
- }
+ ClearDownload(subtract_interval);
common_ranges.subtract(subtract_interval);
const size_t size = amount * sizeof(u32);
@@ -557,6 +565,7 @@ bool BufferCache ::DMAClear(GPUVAddr dst_address, u64 amount, u32 value) {
auto& dest_buffer = slot_buffers[buffer];
const u32 offset = static_cast ::DownloadBufferMemory(Buffer& buffer, VAddr cpu_addr, u64 si
const VAddr end_address = start_address + range_size;
ForEachWrittenRange(start_address, range_size, add_download);
const IntervalType subtract_interval{start_address, end_address};
+ ClearDownload(subtract_interval);
common_ranges.subtract(subtract_interval);
});
if (total_size_bytes == 0) {
--
cgit v1.2.3
From 495b8e31b55ac7617c19d8dec216b1e08f415a2f Mon Sep 17 00:00:00 2001
From: Fernando Sahmkow
Date: Wed, 14 Jul 2021 16:44:53 +0200
Subject: DMAEngine: Revert flushing from Pitch to BlockLinear.
---
src/video_core/engines/maxwell_dma.cpp | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)
(limited to 'src')
diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp
index 81becb88a..0ae6692f9 100644
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -181,8 +181,13 @@ void MaxwellDMA::CopyPitchToBlockLinear() {
write_buffer.resize(dst_size);
}
- memory_manager.ReadBlock(regs.offset_in, read_buffer.data(), src_size);
- memory_manager.ReadBlock(regs.offset_out, write_buffer.data(), dst_size);
+ if (Settings::IsGPULevelExtreme()) {
+ memory_manager.ReadBlock(regs.offset_in, read_buffer.data(), src_size);
+ memory_manager.ReadBlock(regs.offset_out, write_buffer.data(), dst_size);
+ } else {
+ memory_manager.ReadBlockUnsafe(regs.offset_in, read_buffer.data(), src_size);
+ memory_manager.ReadBlockUnsafe(regs.offset_out, write_buffer.data(), dst_size);
+ }
// If the input is linear and the output is tiled, swizzle the input and copy it over.
if (regs.dst_params.block_size.depth > 0) {
--
cgit v1.2.3
From a0eb3f8a3ee511e29ee362687d5f7e2df2e281f5 Mon Sep 17 00:00:00 2001
From: Fernando Sahmkow
Date: Wed, 14 Jul 2021 18:25:33 +0200
Subject: Buffer Cache: Fixes to DMA Copy.
---
src/video_core/buffer_cache/buffer_cache.h | 13 +++++++------
1 file changed, 7 insertions(+), 6 deletions(-)
(limited to 'src')
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index 4def8f076..9399bcfea 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -172,7 +172,7 @@ public:
[[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size);
/// Return true when a region is registered on the cache
- [[nodiscard]] bool IsRegionRegistered(VAddr addr, size_t size);
+ [[nodiscard]] bool IsRegionRegistered(VAddr addr, size_t size) const;
/// Return true when a CPU region is modified from the CPU
[[nodiscard]] bool IsRegionCpuModified(VAddr addr, size_t size);
@@ -503,6 +503,11 @@ bool BufferCache ::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am
auto& src_buffer = slot_buffers[buffer_a];
auto& dest_buffer = slot_buffers[buffer_b];
SynchronizeBuffer(src_buffer, *cpu_src_address, static_cast ::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am
common_ranges.add(add_interval);
}
- if (dest_buffer.HasCachedWrites()) {
- dest_buffer.FlushCachedWrites();
- }
runtime.CopyBuffer(dest_buffer, src_buffer, copies);
- dest_buffer.UnmarkRegionAsCpuModified(*cpu_dest_address, amount);
if (atleast_1_download) {
dest_buffer.MarkRegionAsGpuModified(*cpu_dest_address, amount);
}
@@ -827,7 +828,7 @@ bool BufferCache ::IsRegionGpuModified(VAddr addr, size_t size) {
}
template ::IsRegionRegistered(VAddr addr, size_t size) {
+bool BufferCache ::IsRegionRegistered(VAddr addr, size_t size) const {
const VAddr end_addr = addr + size;
const u64 page_end = Common::DivCeil(end_addr, PAGE_SIZE);
for (u64 page = addr >> PAGE_BITS; page < page_end;) {
--
cgit v1.2.3
From 1a95a7cdd9d8ffb6fd83396a5b3d4d93c2cb79fb Mon Sep 17 00:00:00 2001
From: Fernando Sahmkow
Date: Wed, 14 Jul 2021 18:39:31 +0200
Subject: GPUMemoryManager: Force immediate invalidation when writing block.
---
src/video_core/memory_manager.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
(limited to 'src')
diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp
index d2b9d5f2b..2bc97ec30 100644
--- a/src/video_core/memory_manager.cpp
+++ b/src/video_core/memory_manager.cpp
@@ -327,7 +327,7 @@ void MemoryManager::WriteBlock(GPUVAddr gpu_dest_addr, const void* src_buffer, s
// Invalidate must happen on the rasterizer interface, such that memory is always
// synchronous when it is written (even when in asynchronous GPU mode).
- rasterizer->InvalidateRegion(dest_addr, copy_amount);
+ rasterizer->UnmapMemory(dest_addr, copy_amount);
system.Memory().WriteBlockUnsafe(dest_addr, src_buffer, copy_amount);
}
--
cgit v1.2.3
From 1ae4b684fff380035b468086586159a231237ed7 Mon Sep 17 00:00:00 2001
From: Fernando Sahmkow
Date: Wed, 14 Jul 2021 19:04:45 +0200
Subject: Buffer cache: Fixes, Clang and Feedback.
---
src/video_core/buffer_cache/buffer_cache.h | 14 ++++----------
src/video_core/engines/maxwell_dma.cpp | 5 +++++
src/video_core/memory_manager.cpp | 2 +-
3 files changed, 10 insertions(+), 11 deletions(-)
(limited to 'src')
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index 9399bcfea..7373cb62d 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -172,7 +172,7 @@ public:
[[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size);
/// Return true when a region is registered on the cache
- [[nodiscard]] bool IsRegionRegistered(VAddr addr, size_t size) const;
+ [[nodiscard]] bool IsRegionRegistered(VAddr addr, size_t size);
/// Return true when a CPU region is modified from the CPU
[[nodiscard]] bool IsRegionCpuModified(VAddr addr, size_t size);
@@ -503,10 +503,6 @@ bool BufferCache ::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am
auto& src_buffer = slot_buffers[buffer_a];
auto& dest_buffer = slot_buffers[buffer_b];
SynchronizeBuffer(src_buffer, *cpu_src_address, static_cast ::DMAClear(GPUVAddr dst_address, u64 amount, u32 value) {
return false;
}
- const IntervalType subtract_interval{*cpu_dst_address, *cpu_dst_address + amount * sizeof(u32)};
+ const size_t size = amount * sizeof(u32);
+ const IntervalType subtract_interval{*cpu_dst_address, *cpu_dst_address + size};
ClearDownload(subtract_interval);
common_ranges.subtract(subtract_interval);
- const size_t size = amount * sizeof(u32);
BufferId buffer;
do {
has_deleted_buffers = false;
buffer = FindBuffer(*cpu_dst_address, static_cast ::IsRegionGpuModified(VAddr addr, size_t size) {
}
template ::IsRegionRegistered(VAddr addr, size_t size) const {
+bool BufferCache ::IsRegionRegistered(VAddr addr, size_t size) {
const VAddr end_addr = addr + size;
const u64 page_end = Common::DivCeil(end_addr, PAGE_SIZE);
for (u64 page = addr >> PAGE_BITS; page < page_end;) {
diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp
index 0ae6692f9..c51776466 100644
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -4,6 +4,7 @@
#include "common/assert.h"
#include "common/logging/log.h"
+#include "common/microprofile.h"
#include "common/settings.h"
#include "core/core.h"
#include "video_core/engines/maxwell_3d.h"
@@ -12,6 +13,9 @@
#include "video_core/renderer_base.h"
#include "video_core/textures/decoders.h"
+MICROPROFILE_DECLARE(GPU_DMAEngine);
+MICROPROFILE_DEFINE(GPU_DMAEngine, "GPU", "DMA Engine", MP_RGB(224, 224, 128));
+
namespace Tegra::Engines {
using namespace Texture;
@@ -43,6 +47,7 @@ void MaxwellDMA::CallMultiMethod(u32 method, const u32* base_start, u32 amount,
}
void MaxwellDMA::Launch() {
+ MICROPROFILE_SCOPE(GPU_DMAEngine);
LOG_TRACE(Render_OpenGL, "DMA copy 0x{:x} -> 0x{:x}", static_cast