summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorGravatar liamwhite2023-06-27 11:21:10 -0400
committerGravatar GitHub2023-06-27 11:21:10 -0400
commitdafbc86366f8bcd9153949db4d141ec489928f81 (patch)
tree8496cb635eef6967a99d0b7dfad41419a24b5efd /src
parentMerge pull request #10925 from t895/fs-agony (diff)
parentOpenGL: Limit lmem warmup to NVIDIA (diff)
downloadyuzu-dafbc86366f8bcd9153949db4d141ec489928f81.tar.gz
yuzu-dafbc86366f8bcd9153949db4d141ec489928f81.tar.xz
yuzu-dafbc86366f8bcd9153949db4d141ec489928f81.zip
Merge pull request #10916 from ameerj/lolmem
OpenGL: Add Local Memory warmup shader for Nvidia
Diffstat (limited to 'src')
-rw-r--r--src/shader_recompiler/backend/glasm/emit_glasm.cpp2
-rw-r--r--src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp4
-rw-r--r--src/shader_recompiler/shader_info.h1
-rw-r--r--src/video_core/host_shaders/CMakeLists.txt1
-rw-r--r--src/video_core/host_shaders/opengl_lmem_warmup.comp47
-rw-r--r--src/video_core/renderer_opengl/gl_compute_pipeline.cpp1
-rw-r--r--src/video_core/renderer_opengl/gl_compute_pipeline.h5
-rw-r--r--src/video_core/renderer_opengl/gl_device.cpp1
-rw-r--r--src/video_core/renderer_opengl/gl_device.h5
-rw-r--r--src/video_core/renderer_opengl/gl_graphics_pipeline.cpp1
-rw-r--r--src/video_core/renderer_opengl/gl_graphics_pipeline.h5
-rw-r--r--src/video_core/renderer_opengl/gl_rasterizer.cpp6
-rw-r--r--src/video_core/renderer_opengl/gl_shader_manager.cpp13
-rw-r--r--src/video_core/renderer_opengl/gl_shader_manager.h3
14 files changed, 94 insertions, 1 deletions
diff --git a/src/shader_recompiler/backend/glasm/emit_glasm.cpp b/src/shader_recompiler/backend/glasm/emit_glasm.cpp
index fd4a61a4d..b795c0179 100644
--- a/src/shader_recompiler/backend/glasm/emit_glasm.cpp
+++ b/src/shader_recompiler/backend/glasm/emit_glasm.cpp
@@ -461,7 +461,7 @@ std::string EmitGLASM(const Profile& profile, const RuntimeInfo& runtime_info, I
461 header += fmt::format("R{},", index); 461 header += fmt::format("R{},", index);
462 } 462 }
463 if (program.local_memory_size > 0) { 463 if (program.local_memory_size > 0) {
464 header += fmt::format("lmem[{}],", program.local_memory_size); 464 header += fmt::format("lmem[{}],", Common::DivCeil(program.local_memory_size, 4U));
465 } 465 }
466 if (program.info.uses_fswzadd) { 466 if (program.info.uses_fswzadd) {
467 header += "FSWZA[4],FSWZB[4],"; 467 header += "FSWZA[4],FSWZB[4],";
diff --git a/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp b/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp
index 5a4195217..70292686f 100644
--- a/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp
+++ b/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp
@@ -424,6 +424,10 @@ void VisitUsages(Info& info, IR::Inst& inst) {
424 info.used_constant_buffer_types |= IR::Type::U32 | IR::Type::U32x2; 424 info.used_constant_buffer_types |= IR::Type::U32 | IR::Type::U32x2;
425 info.used_storage_buffer_types |= IR::Type::U32 | IR::Type::U32x2 | IR::Type::U32x4; 425 info.used_storage_buffer_types |= IR::Type::U32 | IR::Type::U32x2 | IR::Type::U32x4;
426 break; 426 break;
427 case IR::Opcode::LoadLocal:
428 case IR::Opcode::WriteLocal:
429 info.uses_local_memory = true;
430 break;
427 default: 431 default:
428 break; 432 break;
429 } 433 }
diff --git a/src/shader_recompiler/shader_info.h b/src/shader_recompiler/shader_info.h
index d308db942..b4b4afd37 100644
--- a/src/shader_recompiler/shader_info.h
+++ b/src/shader_recompiler/shader_info.h
@@ -172,6 +172,7 @@ struct Info {
172 bool stores_indexed_attributes{}; 172 bool stores_indexed_attributes{};
173 173
174 bool stores_global_memory{}; 174 bool stores_global_memory{};
175 bool uses_local_memory{};
175 176
176 bool uses_fp16{}; 177 bool uses_fp16{};
177 bool uses_fp64{}; 178 bool uses_fp64{};
diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt
index 2442c3c29..e61d9af80 100644
--- a/src/video_core/host_shaders/CMakeLists.txt
+++ b/src/video_core/host_shaders/CMakeLists.txt
@@ -33,6 +33,7 @@ set(SHADER_FILES
33 opengl_fidelityfx_fsr.frag 33 opengl_fidelityfx_fsr.frag
34 opengl_fidelityfx_fsr_easu.frag 34 opengl_fidelityfx_fsr_easu.frag
35 opengl_fidelityfx_fsr_rcas.frag 35 opengl_fidelityfx_fsr_rcas.frag
36 opengl_lmem_warmup.comp
36 opengl_present.frag 37 opengl_present.frag
37 opengl_present.vert 38 opengl_present.vert
38 opengl_present_scaleforce.frag 39 opengl_present_scaleforce.frag
diff --git a/src/video_core/host_shaders/opengl_lmem_warmup.comp b/src/video_core/host_shaders/opengl_lmem_warmup.comp
new file mode 100644
index 000000000..518268477
--- /dev/null
+++ b/src/video_core/host_shaders/opengl_lmem_warmup.comp
@@ -0,0 +1,47 @@
1// SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project
2// SPDX-License-Identifier: GPL-2.0-or-later
3
4// This shader is a workaround for a quirk in NVIDIA OpenGL drivers
5// Shaders using local memory see a great performance benefit if a shader that was dispatched
6// before it had more local memory allocated.
7// This shader allocates the maximum local memory allowed on NVIDIA drivers to ensure that
8// subsequent shaders see the performance boost.
9
10// NOTE: This shader does no actual meaningful work and returns immediately,
11// it is simply a means to have the driver expect a shader using lots of local memory.
12
13#version 450
14
15layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
16
17layout(location = 0) uniform uint uniform_data;
18
19layout(binding = 0, rgba8) uniform writeonly restrict image2DArray dest_image;
20
21#define MAX_LMEM_SIZE 4080 // Size chosen to avoid errors in Nvidia's GLSL compiler
22#define NUM_LMEM_CONSTANTS 1
23#define ARRAY_SIZE MAX_LMEM_SIZE - NUM_LMEM_CONSTANTS
24
25uint lmem_0[ARRAY_SIZE];
26const uvec4 constant_values[NUM_LMEM_CONSTANTS] = uvec4[](uvec4(0));
27
28void main() {
29 const uint global_id = gl_GlobalInvocationID.x;
30 if (global_id <= 128) {
31 // Since the shader is called with a dispatch of 1x1x1
32 // This should always be the case, and this shader will not actually execute
33 return;
34 }
35 for (uint t = 0; t < uniform_data; t++) {
36 const uint offset = (t * uniform_data);
37 lmem_0[offset] = t;
38 }
39 const uint offset = (gl_GlobalInvocationID.y * uniform_data + gl_GlobalInvocationID.x);
40 const uint value = lmem_0[offset];
41 const uint const_value = constant_values[offset / 4][offset % 4];
42 const uvec4 color = uvec4(value + const_value);
43
44 // A "side-effect" is needed so the variables don't get optimized out,
45 // but this should never execute so there should be no clobbering of previously bound state.
46 imageStore(dest_image, ivec3(gl_GlobalInvocationID), color);
47}
diff --git a/src/video_core/renderer_opengl/gl_compute_pipeline.cpp b/src/video_core/renderer_opengl/gl_compute_pipeline.cpp
index 3151c0db8..f9ca55c36 100644
--- a/src/video_core/renderer_opengl/gl_compute_pipeline.cpp
+++ b/src/video_core/renderer_opengl/gl_compute_pipeline.cpp
@@ -63,6 +63,7 @@ ComputePipeline::ComputePipeline(const Device& device, TextureCache& texture_cac
63 writes_global_memory = !use_storage_buffers && 63 writes_global_memory = !use_storage_buffers &&
64 std::ranges::any_of(info.storage_buffers_descriptors, 64 std::ranges::any_of(info.storage_buffers_descriptors,
65 [](const auto& desc) { return desc.is_written; }); 65 [](const auto& desc) { return desc.is_written; });
66 uses_local_memory = info.uses_local_memory;
66 if (force_context_flush) { 67 if (force_context_flush) {
67 std::scoped_lock lock{built_mutex}; 68 std::scoped_lock lock{built_mutex};
68 built_fence.Create(); 69 built_fence.Create();
diff --git a/src/video_core/renderer_opengl/gl_compute_pipeline.h b/src/video_core/renderer_opengl/gl_compute_pipeline.h
index 9bcc72b59..c26b4fa5e 100644
--- a/src/video_core/renderer_opengl/gl_compute_pipeline.h
+++ b/src/video_core/renderer_opengl/gl_compute_pipeline.h
@@ -59,6 +59,10 @@ public:
59 return writes_global_memory; 59 return writes_global_memory;
60 } 60 }
61 61
62 [[nodiscard]] bool UsesLocalMemory() const noexcept {
63 return uses_local_memory;
64 }
65
62 void SetEngine(Tegra::Engines::KeplerCompute* kepler_compute_, 66 void SetEngine(Tegra::Engines::KeplerCompute* kepler_compute_,
63 Tegra::MemoryManager* gpu_memory_) { 67 Tegra::MemoryManager* gpu_memory_) {
64 kepler_compute = kepler_compute_; 68 kepler_compute = kepler_compute_;
@@ -84,6 +88,7 @@ private:
84 88
85 bool use_storage_buffers{}; 89 bool use_storage_buffers{};
86 bool writes_global_memory{}; 90 bool writes_global_memory{};
91 bool uses_local_memory{};
87 92
88 std::mutex built_mutex; 93 std::mutex built_mutex;
89 std::condition_variable built_condvar; 94 std::condition_variable built_condvar;
diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp
index 03d234f2f..33e63c17d 100644
--- a/src/video_core/renderer_opengl/gl_device.cpp
+++ b/src/video_core/renderer_opengl/gl_device.cpp
@@ -194,6 +194,7 @@ Device::Device(Core::Frontend::EmuWindow& emu_window) {
194 has_bool_ref_bug = true; 194 has_bool_ref_bug = true;
195 } 195 }
196 } 196 }
197 has_lmem_perf_bug = is_nvidia;
197 198
198 strict_context_required = emu_window.StrictContextRequired(); 199 strict_context_required = emu_window.StrictContextRequired();
199 // Blocks AMD and Intel OpenGL drivers on Windows from using asynchronous shader compilation. 200 // Blocks AMD and Intel OpenGL drivers on Windows from using asynchronous shader compilation.
diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h
index ad27264e5..a5a6bbbba 100644
--- a/src/video_core/renderer_opengl/gl_device.h
+++ b/src/video_core/renderer_opengl/gl_device.h
@@ -192,6 +192,10 @@ public:
192 return supports_conditional_barriers; 192 return supports_conditional_barriers;
193 } 193 }
194 194
195 bool HasLmemPerfBug() const {
196 return has_lmem_perf_bug;
197 }
198
195private: 199private:
196 static bool TestVariableAoffi(); 200 static bool TestVariableAoffi();
197 static bool TestPreciseBug(); 201 static bool TestPreciseBug();
@@ -238,6 +242,7 @@ private:
238 bool can_report_memory{}; 242 bool can_report_memory{};
239 bool strict_context_required{}; 243 bool strict_context_required{};
240 bool supports_conditional_barriers{}; 244 bool supports_conditional_barriers{};
245 bool has_lmem_perf_bug{};
241 246
242 std::string vendor_name; 247 std::string vendor_name;
243}; 248};
diff --git a/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp b/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp
index c58f760b8..23a48c6fe 100644
--- a/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp
+++ b/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp
@@ -215,6 +215,7 @@ GraphicsPipeline::GraphicsPipeline(const Device& device, TextureCache& texture_c
215 215
216 writes_global_memory |= std::ranges::any_of( 216 writes_global_memory |= std::ranges::any_of(
217 info.storage_buffers_descriptors, [](const auto& desc) { return desc.is_written; }); 217 info.storage_buffers_descriptors, [](const auto& desc) { return desc.is_written; });
218 uses_local_memory |= info.uses_local_memory;
218 } 219 }
219 ASSERT(num_textures <= MAX_TEXTURES); 220 ASSERT(num_textures <= MAX_TEXTURES);
220 ASSERT(num_images <= MAX_IMAGES); 221 ASSERT(num_images <= MAX_IMAGES);
diff --git a/src/video_core/renderer_opengl/gl_graphics_pipeline.h b/src/video_core/renderer_opengl/gl_graphics_pipeline.h
index 7bab3be0a..7b3d7eae8 100644
--- a/src/video_core/renderer_opengl/gl_graphics_pipeline.h
+++ b/src/video_core/renderer_opengl/gl_graphics_pipeline.h
@@ -98,6 +98,10 @@ public:
98 return writes_global_memory; 98 return writes_global_memory;
99 } 99 }
100 100
101 [[nodiscard]] bool UsesLocalMemory() const noexcept {
102 return uses_local_memory;
103 }
104
101 [[nodiscard]] bool IsBuilt() noexcept; 105 [[nodiscard]] bool IsBuilt() noexcept;
102 106
103 template <typename Spec> 107 template <typename Spec>
@@ -146,6 +150,7 @@ private:
146 150
147 bool use_storage_buffers{}; 151 bool use_storage_buffers{};
148 bool writes_global_memory{}; 152 bool writes_global_memory{};
153 bool uses_local_memory{};
149 154
150 static constexpr std::size_t XFB_ENTRY_STRIDE = 3; 155 static constexpr std::size_t XFB_ENTRY_STRIDE = 3;
151 GLsizei num_xfb_attribs{}; 156 GLsizei num_xfb_attribs{};
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index fc711c44a..edf527f2d 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -222,6 +222,9 @@ void RasterizerOpenGL::PrepareDraw(bool is_indexed, Func&& draw_func) {
222 gpu.TickWork(); 222 gpu.TickWork();
223 223
224 std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex}; 224 std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex};
225 if (pipeline->UsesLocalMemory()) {
226 program_manager.LocalMemoryWarmup();
227 }
225 pipeline->SetEngine(maxwell3d, gpu_memory); 228 pipeline->SetEngine(maxwell3d, gpu_memory);
226 pipeline->Configure(is_indexed); 229 pipeline->Configure(is_indexed);
227 230
@@ -371,6 +374,9 @@ void RasterizerOpenGL::DispatchCompute() {
371 if (!pipeline) { 374 if (!pipeline) {
372 return; 375 return;
373 } 376 }
377 if (pipeline->UsesLocalMemory()) {
378 program_manager.LocalMemoryWarmup();
379 }
374 pipeline->SetEngine(kepler_compute, gpu_memory); 380 pipeline->SetEngine(kepler_compute, gpu_memory);
375 pipeline->Configure(); 381 pipeline->Configure();
376 const auto& qmd{kepler_compute->launch_description}; 382 const auto& qmd{kepler_compute->launch_description};
diff --git a/src/video_core/renderer_opengl/gl_shader_manager.cpp b/src/video_core/renderer_opengl/gl_shader_manager.cpp
index 98841ae65..03d4b9d06 100644
--- a/src/video_core/renderer_opengl/gl_shader_manager.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_manager.cpp
@@ -3,7 +3,9 @@
3 3
4#include <glad/glad.h> 4#include <glad/glad.h>
5 5
6#include "video_core/host_shaders/opengl_lmem_warmup_comp.h"
6#include "video_core/renderer_opengl/gl_shader_manager.h" 7#include "video_core/renderer_opengl/gl_shader_manager.h"
8#include "video_core/renderer_opengl/gl_shader_util.h"
7 9
8namespace OpenGL { 10namespace OpenGL {
9 11
@@ -17,6 +19,10 @@ ProgramManager::ProgramManager(const Device& device) {
17 if (device.UseAssemblyShaders()) { 19 if (device.UseAssemblyShaders()) {
18 glEnable(GL_COMPUTE_PROGRAM_NV); 20 glEnable(GL_COMPUTE_PROGRAM_NV);
19 } 21 }
22 if (device.HasLmemPerfBug()) {
23 lmem_warmup_program =
24 CreateProgram(HostShaders::OPENGL_LMEM_WARMUP_COMP, GL_COMPUTE_SHADER);
25 }
20} 26}
21 27
22void ProgramManager::BindComputeProgram(GLuint program) { 28void ProgramManager::BindComputeProgram(GLuint program) {
@@ -98,6 +104,13 @@ void ProgramManager::BindAssemblyPrograms(std::span<const OGLAssemblyProgram, NU
98 104
99void ProgramManager::RestoreGuestCompute() {} 105void ProgramManager::RestoreGuestCompute() {}
100 106
107void ProgramManager::LocalMemoryWarmup() {
108 if (lmem_warmup_program.handle != 0) {
109 BindComputeProgram(lmem_warmup_program.handle);
110 glDispatchCompute(1, 1, 1);
111 }
112}
113
101void ProgramManager::BindPipeline() { 114void ProgramManager::BindPipeline() {
102 if (!is_pipeline_bound) { 115 if (!is_pipeline_bound) {
103 is_pipeline_bound = true; 116 is_pipeline_bound = true;
diff --git a/src/video_core/renderer_opengl/gl_shader_manager.h b/src/video_core/renderer_opengl/gl_shader_manager.h
index 07ffab77f..852d8c88e 100644
--- a/src/video_core/renderer_opengl/gl_shader_manager.h
+++ b/src/video_core/renderer_opengl/gl_shader_manager.h
@@ -30,6 +30,8 @@ public:
30 30
31 void RestoreGuestCompute(); 31 void RestoreGuestCompute();
32 32
33 void LocalMemoryWarmup();
34
33private: 35private:
34 void BindPipeline(); 36 void BindPipeline();
35 37
@@ -44,6 +46,7 @@ private:
44 u32 current_stage_mask = 0; 46 u32 current_stage_mask = 0;
45 std::array<GLuint, NUM_STAGES> current_programs{}; 47 std::array<GLuint, NUM_STAGES> current_programs{};
46 GLuint current_assembly_compute_program = 0; 48 GLuint current_assembly_compute_program = 0;
49 OGLProgram lmem_warmup_program;
47}; 50};
48 51
49} // namespace OpenGL 52} // namespace OpenGL