diff options
| author | 2023-06-27 11:21:10 -0400 | |
|---|---|---|
| committer | 2023-06-27 11:21:10 -0400 | |
| commit | dafbc86366f8bcd9153949db4d141ec489928f81 (patch) | |
| tree | 8496cb635eef6967a99d0b7dfad41419a24b5efd /src | |
| parent | Merge pull request #10925 from t895/fs-agony (diff) | |
| parent | OpenGL: Limit lmem warmup to NVIDIA (diff) | |
| download | yuzu-dafbc86366f8bcd9153949db4d141ec489928f81.tar.gz yuzu-dafbc86366f8bcd9153949db4d141ec489928f81.tar.xz yuzu-dafbc86366f8bcd9153949db4d141ec489928f81.zip | |
Merge pull request #10916 from ameerj/lolmem
OpenGL: Add Local Memory warmup shader for Nvidia
Diffstat (limited to 'src')
14 files changed, 94 insertions, 1 deletions
diff --git a/src/shader_recompiler/backend/glasm/emit_glasm.cpp b/src/shader_recompiler/backend/glasm/emit_glasm.cpp index fd4a61a4d..b795c0179 100644 --- a/src/shader_recompiler/backend/glasm/emit_glasm.cpp +++ b/src/shader_recompiler/backend/glasm/emit_glasm.cpp | |||
| @@ -461,7 +461,7 @@ std::string EmitGLASM(const Profile& profile, const RuntimeInfo& runtime_info, I | |||
| 461 | header += fmt::format("R{},", index); | 461 | header += fmt::format("R{},", index); |
| 462 | } | 462 | } |
| 463 | if (program.local_memory_size > 0) { | 463 | if (program.local_memory_size > 0) { |
| 464 | header += fmt::format("lmem[{}],", program.local_memory_size); | 464 | header += fmt::format("lmem[{}],", Common::DivCeil(program.local_memory_size, 4U)); |
| 465 | } | 465 | } |
| 466 | if (program.info.uses_fswzadd) { | 466 | if (program.info.uses_fswzadd) { |
| 467 | header += "FSWZA[4],FSWZB[4],"; | 467 | header += "FSWZA[4],FSWZB[4],"; |
diff --git a/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp b/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp index 5a4195217..70292686f 100644 --- a/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp +++ b/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp | |||
| @@ -424,6 +424,10 @@ void VisitUsages(Info& info, IR::Inst& inst) { | |||
| 424 | info.used_constant_buffer_types |= IR::Type::U32 | IR::Type::U32x2; | 424 | info.used_constant_buffer_types |= IR::Type::U32 | IR::Type::U32x2; |
| 425 | info.used_storage_buffer_types |= IR::Type::U32 | IR::Type::U32x2 | IR::Type::U32x4; | 425 | info.used_storage_buffer_types |= IR::Type::U32 | IR::Type::U32x2 | IR::Type::U32x4; |
| 426 | break; | 426 | break; |
| 427 | case IR::Opcode::LoadLocal: | ||
| 428 | case IR::Opcode::WriteLocal: | ||
| 429 | info.uses_local_memory = true; | ||
| 430 | break; | ||
| 427 | default: | 431 | default: |
| 428 | break; | 432 | break; |
| 429 | } | 433 | } |
diff --git a/src/shader_recompiler/shader_info.h b/src/shader_recompiler/shader_info.h index d308db942..b4b4afd37 100644 --- a/src/shader_recompiler/shader_info.h +++ b/src/shader_recompiler/shader_info.h | |||
| @@ -172,6 +172,7 @@ struct Info { | |||
| 172 | bool stores_indexed_attributes{}; | 172 | bool stores_indexed_attributes{}; |
| 173 | 173 | ||
| 174 | bool stores_global_memory{}; | 174 | bool stores_global_memory{}; |
| 175 | bool uses_local_memory{}; | ||
| 175 | 176 | ||
| 176 | bool uses_fp16{}; | 177 | bool uses_fp16{}; |
| 177 | bool uses_fp64{}; | 178 | bool uses_fp64{}; |
diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt index 2442c3c29..e61d9af80 100644 --- a/src/video_core/host_shaders/CMakeLists.txt +++ b/src/video_core/host_shaders/CMakeLists.txt | |||
| @@ -33,6 +33,7 @@ set(SHADER_FILES | |||
| 33 | opengl_fidelityfx_fsr.frag | 33 | opengl_fidelityfx_fsr.frag |
| 34 | opengl_fidelityfx_fsr_easu.frag | 34 | opengl_fidelityfx_fsr_easu.frag |
| 35 | opengl_fidelityfx_fsr_rcas.frag | 35 | opengl_fidelityfx_fsr_rcas.frag |
| 36 | opengl_lmem_warmup.comp | ||
| 36 | opengl_present.frag | 37 | opengl_present.frag |
| 37 | opengl_present.vert | 38 | opengl_present.vert |
| 38 | opengl_present_scaleforce.frag | 39 | opengl_present_scaleforce.frag |
diff --git a/src/video_core/host_shaders/opengl_lmem_warmup.comp b/src/video_core/host_shaders/opengl_lmem_warmup.comp new file mode 100644 index 000000000..518268477 --- /dev/null +++ b/src/video_core/host_shaders/opengl_lmem_warmup.comp | |||
| @@ -0,0 +1,47 @@ | |||
| 1 | // SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project | ||
| 2 | // SPDX-License-Identifier: GPL-2.0-or-later | ||
| 3 | |||
| 4 | // This shader is a workaround for a quirk in NVIDIA OpenGL drivers. | ||
| 5 | // Shaders using local memory see a great performance benefit if a shader that was dispatched | ||
| 6 | // before them had more local memory allocated. | ||
| 7 | // This shader allocates the maximum local memory allowed on NVIDIA drivers to ensure that | ||
| 8 | // subsequent shaders see the performance boost. | ||
| 9 | |||
| 10 | // NOTE: This shader does no actual meaningful work and returns immediately; | ||
| 11 | // it is simply a means to have the driver expect a shader using lots of local memory. | ||
| 12 | |||
| 13 | #version 450 | ||
| 14 | |||
| 15 | layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in; | ||
| 16 | |||
| 17 | layout(location = 0) uniform uint uniform_data; | ||
| 18 | |||
| 19 | layout(binding = 0, rgba8) uniform writeonly restrict image2DArray dest_image; | ||
| 20 | |||
| 21 | #define MAX_LMEM_SIZE 4080 // Size chosen to avoid errors in Nvidia's GLSL compiler | ||
| 22 | #define NUM_LMEM_CONSTANTS 1 | ||
| 23 | #define ARRAY_SIZE MAX_LMEM_SIZE - NUM_LMEM_CONSTANTS | ||
| 24 | |||
| 25 | uint lmem_0[ARRAY_SIZE]; | ||
| 26 | const uvec4 constant_values[NUM_LMEM_CONSTANTS] = uvec4[](uvec4(0)); | ||
| 27 | |||
| 28 | void main() { | ||
| 29 | const uint global_id = gl_GlobalInvocationID.x; | ||
| 30 | if (global_id <= 128) { | ||
| 31 | // The local size is 1x1x1 and the shader is dispatched as 1x1x1, so global_id is 0 here. | ||
| 32 | // This branch should therefore always be taken, and the code below never actually executes. | ||
| 33 | return; | ||
| 34 | } | ||
| 35 | for (uint t = 0; t < uniform_data; t++) { | ||
| 36 | const uint offset = (t * uniform_data); | ||
| 37 | lmem_0[offset] = t; | ||
| 38 | } | ||
| 39 | const uint offset = (gl_GlobalInvocationID.y * uniform_data + gl_GlobalInvocationID.x); | ||
| 40 | const uint value = lmem_0[offset]; | ||
| 41 | const uint const_value = constant_values[offset / 4][offset % 4]; | ||
| 42 | const uvec4 color = uvec4(value + const_value); | ||
| 43 | |||
| 44 | // The image store is a "side-effect" that keeps lmem_0 and the values read from it from being | ||
| 45 | // optimized out; it should never execute, so previously bound image state should not be clobbered. | ||
| 46 | imageStore(dest_image, ivec3(gl_GlobalInvocationID), color); | ||
| 47 | } | ||
diff --git a/src/video_core/renderer_opengl/gl_compute_pipeline.cpp b/src/video_core/renderer_opengl/gl_compute_pipeline.cpp index 3151c0db8..f9ca55c36 100644 --- a/src/video_core/renderer_opengl/gl_compute_pipeline.cpp +++ b/src/video_core/renderer_opengl/gl_compute_pipeline.cpp | |||
| @@ -63,6 +63,7 @@ ComputePipeline::ComputePipeline(const Device& device, TextureCache& texture_cac | |||
| 63 | writes_global_memory = !use_storage_buffers && | 63 | writes_global_memory = !use_storage_buffers && |
| 64 | std::ranges::any_of(info.storage_buffers_descriptors, | 64 | std::ranges::any_of(info.storage_buffers_descriptors, |
| 65 | [](const auto& desc) { return desc.is_written; }); | 65 | [](const auto& desc) { return desc.is_written; }); |
| 66 | uses_local_memory = info.uses_local_memory; | ||
| 66 | if (force_context_flush) { | 67 | if (force_context_flush) { |
| 67 | std::scoped_lock lock{built_mutex}; | 68 | std::scoped_lock lock{built_mutex}; |
| 68 | built_fence.Create(); | 69 | built_fence.Create(); |
diff --git a/src/video_core/renderer_opengl/gl_compute_pipeline.h b/src/video_core/renderer_opengl/gl_compute_pipeline.h index 9bcc72b59..c26b4fa5e 100644 --- a/src/video_core/renderer_opengl/gl_compute_pipeline.h +++ b/src/video_core/renderer_opengl/gl_compute_pipeline.h | |||
| @@ -59,6 +59,10 @@ public: | |||
| 59 | return writes_global_memory; | 59 | return writes_global_memory; |
| 60 | } | 60 | } |
| 61 | 61 | ||
| 62 | [[nodiscard]] bool UsesLocalMemory() const noexcept { | ||
| 63 | return uses_local_memory; | ||
| 64 | } | ||
| 65 | |||
| 62 | void SetEngine(Tegra::Engines::KeplerCompute* kepler_compute_, | 66 | void SetEngine(Tegra::Engines::KeplerCompute* kepler_compute_, |
| 63 | Tegra::MemoryManager* gpu_memory_) { | 67 | Tegra::MemoryManager* gpu_memory_) { |
| 64 | kepler_compute = kepler_compute_; | 68 | kepler_compute = kepler_compute_; |
| @@ -84,6 +88,7 @@ private: | |||
| 84 | 88 | ||
| 85 | bool use_storage_buffers{}; | 89 | bool use_storage_buffers{}; |
| 86 | bool writes_global_memory{}; | 90 | bool writes_global_memory{}; |
| 91 | bool uses_local_memory{}; | ||
| 87 | 92 | ||
| 88 | std::mutex built_mutex; | 93 | std::mutex built_mutex; |
| 89 | std::condition_variable built_condvar; | 94 | std::condition_variable built_condvar; |
diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp index 03d234f2f..33e63c17d 100644 --- a/src/video_core/renderer_opengl/gl_device.cpp +++ b/src/video_core/renderer_opengl/gl_device.cpp | |||
| @@ -194,6 +194,7 @@ Device::Device(Core::Frontend::EmuWindow& emu_window) { | |||
| 194 | has_bool_ref_bug = true; | 194 | has_bool_ref_bug = true; |
| 195 | } | 195 | } |
| 196 | } | 196 | } |
| 197 | has_lmem_perf_bug = is_nvidia; | ||
| 197 | 198 | ||
| 198 | strict_context_required = emu_window.StrictContextRequired(); | 199 | strict_context_required = emu_window.StrictContextRequired(); |
| 199 | // Blocks AMD and Intel OpenGL drivers on Windows from using asynchronous shader compilation. | 200 | // Blocks AMD and Intel OpenGL drivers on Windows from using asynchronous shader compilation. |
diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h index ad27264e5..a5a6bbbba 100644 --- a/src/video_core/renderer_opengl/gl_device.h +++ b/src/video_core/renderer_opengl/gl_device.h | |||
| @@ -192,6 +192,10 @@ public: | |||
| 192 | return supports_conditional_barriers; | 192 | return supports_conditional_barriers; |
| 193 | } | 193 | } |
| 194 | 194 | ||
| 195 | bool HasLmemPerfBug() const { | ||
| 196 | return has_lmem_perf_bug; | ||
| 197 | } | ||
| 198 | |||
| 195 | private: | 199 | private: |
| 196 | static bool TestVariableAoffi(); | 200 | static bool TestVariableAoffi(); |
| 197 | static bool TestPreciseBug(); | 201 | static bool TestPreciseBug(); |
| @@ -238,6 +242,7 @@ private: | |||
| 238 | bool can_report_memory{}; | 242 | bool can_report_memory{}; |
| 239 | bool strict_context_required{}; | 243 | bool strict_context_required{}; |
| 240 | bool supports_conditional_barriers{}; | 244 | bool supports_conditional_barriers{}; |
| 245 | bool has_lmem_perf_bug{}; | ||
| 241 | 246 | ||
| 242 | std::string vendor_name; | 247 | std::string vendor_name; |
| 243 | }; | 248 | }; |
diff --git a/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp b/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp index c58f760b8..23a48c6fe 100644 --- a/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp +++ b/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp | |||
| @@ -215,6 +215,7 @@ GraphicsPipeline::GraphicsPipeline(const Device& device, TextureCache& texture_c | |||
| 215 | 215 | ||
| 216 | writes_global_memory |= std::ranges::any_of( | 216 | writes_global_memory |= std::ranges::any_of( |
| 217 | info.storage_buffers_descriptors, [](const auto& desc) { return desc.is_written; }); | 217 | info.storage_buffers_descriptors, [](const auto& desc) { return desc.is_written; }); |
| 218 | uses_local_memory |= info.uses_local_memory; | ||
| 218 | } | 219 | } |
| 219 | ASSERT(num_textures <= MAX_TEXTURES); | 220 | ASSERT(num_textures <= MAX_TEXTURES); |
| 220 | ASSERT(num_images <= MAX_IMAGES); | 221 | ASSERT(num_images <= MAX_IMAGES); |
diff --git a/src/video_core/renderer_opengl/gl_graphics_pipeline.h b/src/video_core/renderer_opengl/gl_graphics_pipeline.h index 7bab3be0a..7b3d7eae8 100644 --- a/src/video_core/renderer_opengl/gl_graphics_pipeline.h +++ b/src/video_core/renderer_opengl/gl_graphics_pipeline.h | |||
| @@ -98,6 +98,10 @@ public: | |||
| 98 | return writes_global_memory; | 98 | return writes_global_memory; |
| 99 | } | 99 | } |
| 100 | 100 | ||
| 101 | [[nodiscard]] bool UsesLocalMemory() const noexcept { | ||
| 102 | return uses_local_memory; | ||
| 103 | } | ||
| 104 | |||
| 101 | [[nodiscard]] bool IsBuilt() noexcept; | 105 | [[nodiscard]] bool IsBuilt() noexcept; |
| 102 | 106 | ||
| 103 | template <typename Spec> | 107 | template <typename Spec> |
| @@ -146,6 +150,7 @@ private: | |||
| 146 | 150 | ||
| 147 | bool use_storage_buffers{}; | 151 | bool use_storage_buffers{}; |
| 148 | bool writes_global_memory{}; | 152 | bool writes_global_memory{}; |
| 153 | bool uses_local_memory{}; | ||
| 149 | 154 | ||
| 150 | static constexpr std::size_t XFB_ENTRY_STRIDE = 3; | 155 | static constexpr std::size_t XFB_ENTRY_STRIDE = 3; |
| 151 | GLsizei num_xfb_attribs{}; | 156 | GLsizei num_xfb_attribs{}; |
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index fc711c44a..edf527f2d 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp | |||
| @@ -222,6 +222,9 @@ void RasterizerOpenGL::PrepareDraw(bool is_indexed, Func&& draw_func) { | |||
| 222 | gpu.TickWork(); | 222 | gpu.TickWork(); |
| 223 | 223 | ||
| 224 | std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex}; | 224 | std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex}; |
| 225 | if (pipeline->UsesLocalMemory()) { | ||
| 226 | program_manager.LocalMemoryWarmup(); | ||
| 227 | } | ||
| 225 | pipeline->SetEngine(maxwell3d, gpu_memory); | 228 | pipeline->SetEngine(maxwell3d, gpu_memory); |
| 226 | pipeline->Configure(is_indexed); | 229 | pipeline->Configure(is_indexed); |
| 227 | 230 | ||
| @@ -371,6 +374,9 @@ void RasterizerOpenGL::DispatchCompute() { | |||
| 371 | if (!pipeline) { | 374 | if (!pipeline) { |
| 372 | return; | 375 | return; |
| 373 | } | 376 | } |
| 377 | if (pipeline->UsesLocalMemory()) { | ||
| 378 | program_manager.LocalMemoryWarmup(); | ||
| 379 | } | ||
| 374 | pipeline->SetEngine(kepler_compute, gpu_memory); | 380 | pipeline->SetEngine(kepler_compute, gpu_memory); |
| 375 | pipeline->Configure(); | 381 | pipeline->Configure(); |
| 376 | const auto& qmd{kepler_compute->launch_description}; | 382 | const auto& qmd{kepler_compute->launch_description}; |
diff --git a/src/video_core/renderer_opengl/gl_shader_manager.cpp b/src/video_core/renderer_opengl/gl_shader_manager.cpp index 98841ae65..03d4b9d06 100644 --- a/src/video_core/renderer_opengl/gl_shader_manager.cpp +++ b/src/video_core/renderer_opengl/gl_shader_manager.cpp | |||
| @@ -3,7 +3,9 @@ | |||
| 3 | 3 | ||
| 4 | #include <glad/glad.h> | 4 | #include <glad/glad.h> |
| 5 | 5 | ||
| 6 | #include "video_core/host_shaders/opengl_lmem_warmup_comp.h" | ||
| 6 | #include "video_core/renderer_opengl/gl_shader_manager.h" | 7 | #include "video_core/renderer_opengl/gl_shader_manager.h" |
| 8 | #include "video_core/renderer_opengl/gl_shader_util.h" | ||
| 7 | 9 | ||
| 8 | namespace OpenGL { | 10 | namespace OpenGL { |
| 9 | 11 | ||
| @@ -17,6 +19,10 @@ ProgramManager::ProgramManager(const Device& device) { | |||
| 17 | if (device.UseAssemblyShaders()) { | 19 | if (device.UseAssemblyShaders()) { |
| 18 | glEnable(GL_COMPUTE_PROGRAM_NV); | 20 | glEnable(GL_COMPUTE_PROGRAM_NV); |
| 19 | } | 21 | } |
| 22 | if (device.HasLmemPerfBug()) { | ||
| 23 | lmem_warmup_program = | ||
| 24 | CreateProgram(HostShaders::OPENGL_LMEM_WARMUP_COMP, GL_COMPUTE_SHADER); | ||
| 25 | } | ||
| 20 | } | 26 | } |
| 21 | 27 | ||
| 22 | void ProgramManager::BindComputeProgram(GLuint program) { | 28 | void ProgramManager::BindComputeProgram(GLuint program) { |
| @@ -98,6 +104,13 @@ void ProgramManager::BindAssemblyPrograms(std::span<const OGLAssemblyProgram, NU | |||
| 98 | 104 | ||
| 99 | void ProgramManager::RestoreGuestCompute() {} | 105 | void ProgramManager::RestoreGuestCompute() {} |
| 100 | 106 | ||
| 107 | void ProgramManager::LocalMemoryWarmup() { | ||
| 108 | if (lmem_warmup_program.handle != 0) { | ||
| 109 | BindComputeProgram(lmem_warmup_program.handle); | ||
| 110 | glDispatchCompute(1, 1, 1); | ||
| 111 | } | ||
| 112 | } | ||
| 113 | |||
| 101 | void ProgramManager::BindPipeline() { | 114 | void ProgramManager::BindPipeline() { |
| 102 | if (!is_pipeline_bound) { | 115 | if (!is_pipeline_bound) { |
| 103 | is_pipeline_bound = true; | 116 | is_pipeline_bound = true; |
diff --git a/src/video_core/renderer_opengl/gl_shader_manager.h b/src/video_core/renderer_opengl/gl_shader_manager.h index 07ffab77f..852d8c88e 100644 --- a/src/video_core/renderer_opengl/gl_shader_manager.h +++ b/src/video_core/renderer_opengl/gl_shader_manager.h | |||
| @@ -30,6 +30,8 @@ public: | |||
| 30 | 30 | ||
| 31 | void RestoreGuestCompute(); | 31 | void RestoreGuestCompute(); |
| 32 | 32 | ||
| 33 | void LocalMemoryWarmup(); | ||
| 34 | |||
| 33 | private: | 35 | private: |
| 34 | void BindPipeline(); | 36 | void BindPipeline(); |
| 35 | 37 | ||
| @@ -44,6 +46,7 @@ private: | |||
| 44 | u32 current_stage_mask = 0; | 46 | u32 current_stage_mask = 0; |
| 45 | std::array<GLuint, NUM_STAGES> current_programs{}; | 47 | std::array<GLuint, NUM_STAGES> current_programs{}; |
| 46 | GLuint current_assembly_compute_program = 0; | 48 | GLuint current_assembly_compute_program = 0; |
| 49 | OGLProgram lmem_warmup_program; | ||
| 47 | }; | 50 | }; |
| 48 | 51 | ||
| 49 | } // namespace OpenGL | 52 | } // namespace OpenGL |