author     bunnei                                    2020-06-02 14:58:50 -0400
committer  GitHub                                    2020-06-02 14:58:50 -0400
commit     597d8b4bd457ff0aa1293ff4ac7761e2eefc9150 (patch)
tree       f16a61443b947bf5cc336d51e32111c8f67eb6b3 /src
parent     Merge pull request #4016 from ReinUsesLisp/invocation-info (diff)
parent     glsl: Squash constant buffers into a single SSBO when we hit the limit (diff)
Merge pull request #4006 from ReinUsesLisp/squash-ubos
glsl: Squash constant buffers into a single SSBO when we hit the limit
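In short: when a shader stage declares more constant buffers than the host driver exposes uniform-block bindings for (see UseUnifiedUniforms in gl_shader_decompiler.cpp below), the decompiler stops emitting one std140 UBO per constant buffer and instead reads every constant buffer out of a single std430 SSBO that the rasterizer fills per stage, one fixed-size slot per buffer. A rough GLSL sketch of the unified path; the UnifiedUniforms block and the cbufs[] array come from the hunks below, while the binding index shown here is illustrative:

    // Emitted instead of the usual per-cbuf "layout (std140) uniform ..." blocks.
    layout (std430, binding = 2) readonly buffer UnifiedUniforms {
        uint cbufs[];
    };

    // A read of 32-bit word W from constant buffer C then becomes
    //     cbufs[C * MAX_CONSTBUFFER_SCALARS + W]
    // where MAX_CONSTBUFFER_SCALARS = Maxwell::MaxConstBufferSize / sizeof(u32),
    // i.e. each constant buffer occupies a fixed-size slot inside the SSBO.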
Diffstat (limited to 'src')
-rw-r--r--  src/video_core/renderer_opengl/gl_device.cpp             46
-rw-r--r--  src/video_core/renderer_opengl/gl_device.h                7
-rw-r--r--  src/video_core/renderer_opengl/gl_rasterizer.cpp         78
-rw-r--r--  src/video_core/renderer_opengl/gl_rasterizer.h            4
-rw-r--r--  src/video_core/renderer_opengl/gl_shader_cache.cpp       12
-rw-r--r--  src/video_core/renderer_opengl/gl_shader_decompiler.cpp  99
-rw-r--r--  src/video_core/renderer_opengl/gl_shader_decompiler.h     6
7 files changed, 173 insertions(+), 79 deletions(-)
diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp
index e1b245288..d20547c04 100644
--- a/src/video_core/renderer_opengl/gl_device.cpp
+++ b/src/video_core/renderer_opengl/gl_device.cpp
@@ -6,6 +6,7 @@
 #include <array>
 #include <cstddef>
 #include <cstring>
+#include <limits>
 #include <optional>
 #include <vector>
 
@@ -26,24 +27,27 @@ constexpr u32 ReservedUniformBlocks = 1;
 
 constexpr u32 NumStages = 5;
 
-constexpr std::array LimitUBOs = {GL_MAX_VERTEX_UNIFORM_BLOCKS, GL_MAX_TESS_CONTROL_UNIFORM_BLOCKS,
-                                  GL_MAX_TESS_EVALUATION_UNIFORM_BLOCKS,
-                                  GL_MAX_GEOMETRY_UNIFORM_BLOCKS, GL_MAX_FRAGMENT_UNIFORM_BLOCKS};
+constexpr std::array LimitUBOs = {
+    GL_MAX_VERTEX_UNIFORM_BLOCKS, GL_MAX_TESS_CONTROL_UNIFORM_BLOCKS,
+    GL_MAX_TESS_EVALUATION_UNIFORM_BLOCKS, GL_MAX_GEOMETRY_UNIFORM_BLOCKS,
+    GL_MAX_FRAGMENT_UNIFORM_BLOCKS, GL_MAX_COMPUTE_UNIFORM_BLOCKS};
 
 constexpr std::array LimitSSBOs = {
     GL_MAX_VERTEX_SHADER_STORAGE_BLOCKS, GL_MAX_TESS_CONTROL_SHADER_STORAGE_BLOCKS,
     GL_MAX_TESS_EVALUATION_SHADER_STORAGE_BLOCKS, GL_MAX_GEOMETRY_SHADER_STORAGE_BLOCKS,
-    GL_MAX_FRAGMENT_SHADER_STORAGE_BLOCKS};
+    GL_MAX_FRAGMENT_SHADER_STORAGE_BLOCKS, GL_MAX_COMPUTE_SHADER_STORAGE_BLOCKS};
 
-constexpr std::array LimitSamplers = {
-    GL_MAX_VERTEX_TEXTURE_IMAGE_UNITS, GL_MAX_TESS_CONTROL_TEXTURE_IMAGE_UNITS,
-    GL_MAX_TESS_EVALUATION_TEXTURE_IMAGE_UNITS, GL_MAX_GEOMETRY_TEXTURE_IMAGE_UNITS,
-    GL_MAX_TEXTURE_IMAGE_UNITS};
+constexpr std::array LimitSamplers = {GL_MAX_VERTEX_TEXTURE_IMAGE_UNITS,
+                                      GL_MAX_TESS_CONTROL_TEXTURE_IMAGE_UNITS,
+                                      GL_MAX_TESS_EVALUATION_TEXTURE_IMAGE_UNITS,
+                                      GL_MAX_GEOMETRY_TEXTURE_IMAGE_UNITS,
+                                      GL_MAX_TEXTURE_IMAGE_UNITS,
+                                      GL_MAX_COMPUTE_TEXTURE_IMAGE_UNITS};
 
-constexpr std::array LimitImages = {GL_MAX_VERTEX_IMAGE_UNIFORMS,
-                                    GL_MAX_TESS_CONTROL_IMAGE_UNIFORMS,
-                                    GL_MAX_TESS_EVALUATION_IMAGE_UNIFORMS,
-                                    GL_MAX_GEOMETRY_IMAGE_UNIFORMS, GL_MAX_FRAGMENT_IMAGE_UNIFORMS};
+constexpr std::array LimitImages = {
+    GL_MAX_VERTEX_IMAGE_UNIFORMS, GL_MAX_TESS_CONTROL_IMAGE_UNIFORMS,
+    GL_MAX_TESS_EVALUATION_IMAGE_UNIFORMS, GL_MAX_GEOMETRY_IMAGE_UNIFORMS,
+    GL_MAX_FRAGMENT_IMAGE_UNIFORMS, GL_MAX_COMPUTE_IMAGE_UNIFORMS};
 
 template <typename T>
 T GetInteger(GLenum pname) {
@@ -85,6 +89,13 @@ u32 Extract(u32& base, u32& num, u32 amount, std::optional<GLenum> limit = {}) {
     return std::exchange(base, base + amount);
 }
 
+std::array<u32, Tegra::Engines::MaxShaderTypes> BuildMaxUniformBuffers() noexcept {
+    std::array<u32, Tegra::Engines::MaxShaderTypes> max;
+    std::transform(LimitUBOs.begin(), LimitUBOs.end(), max.begin(),
+                   [](GLenum pname) { return GetInteger<u32>(pname); });
+    return max;
+}
+
 std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindings() noexcept {
     std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> bindings;
 
@@ -159,7 +170,8 @@ bool IsASTCSupported() {
 
 } // Anonymous namespace
 
-Device::Device() : base_bindings{BuildBaseBindings()} {
+Device::Device()
+    : max_uniform_buffers{BuildMaxUniformBuffers()}, base_bindings{BuildBaseBindings()} {
     const std::string_view vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR));
     const auto renderer = reinterpret_cast<const char*>(glGetString(GL_RENDERER));
     const std::vector extensions = GetExtensions();
@@ -194,7 +206,9 @@ Device::Device() : base_bindings{BuildBaseBindings()} {
 }
 
 Device::Device(std::nullptr_t) {
-    uniform_buffer_alignment = 0;
+    max_uniform_buffers.fill(std::numeric_limits<u32>::max());
+    uniform_buffer_alignment = 4;
+    shader_storage_alignment = 4;
     max_vertex_attributes = 16;
     max_varyings = 15;
     has_warp_intrinsics = true;
@@ -202,8 +216,6 @@ Device::Device(std::nullptr_t) {
     has_vertex_viewport_layer = true;
     has_image_load_formatted = true;
     has_variable_aoffi = true;
-    has_component_indexing_bug = false;
-    has_precise_bug = false;
 }
 
 bool Device::TestVariableAoffi() {
diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h
index 683ed9002..98cca0254 100644
--- a/src/video_core/renderer_opengl/gl_device.h
+++ b/src/video_core/renderer_opengl/gl_device.h
@@ -24,6 +24,10 @@ public:
     explicit Device();
     explicit Device(std::nullptr_t);
 
+    u32 GetMaxUniformBuffers(Tegra::Engines::ShaderType shader_type) const noexcept {
+        return max_uniform_buffers[static_cast<std::size_t>(shader_type)];
+    }
+
     const BaseBindings& GetBaseBindings(std::size_t stage_index) const noexcept {
         return base_bindings[stage_index];
     }
@@ -92,7 +96,8 @@ private:
     static bool TestVariableAoffi();
     static bool TestPreciseBug();
 
-    std::array<BaseBindings, Tegra::Engines::MaxShaderTypes> base_bindings;
+    std::array<u32, Tegra::Engines::MaxShaderTypes> max_uniform_buffers{};
+    std::array<BaseBindings, Tegra::Engines::MaxShaderTypes> base_bindings{};
     std::size_t uniform_buffer_alignment{};
     std::size_t shader_storage_alignment{};
     u32 max_vertex_attributes{};
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 3c421dd16..55e79aaf6 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -54,6 +54,12 @@ MICROPROFILE_DEFINE(OpenGL_PrimitiveAssembly, "OpenGL", "Prim Asmbl", MP_RGB(255
 
 namespace {
 
+constexpr std::size_t NUM_CONST_BUFFERS_PER_STAGE = 18;
+constexpr std::size_t NUM_CONST_BUFFERS_BYTES_PER_STAGE =
+    NUM_CONST_BUFFERS_PER_STAGE * Maxwell::MaxConstBufferSize;
+constexpr std::size_t TOTAL_CONST_BUFFER_BYTES =
+    NUM_CONST_BUFFERS_BYTES_PER_STAGE * Maxwell::MaxShaderStage;
+
 constexpr std::size_t NumSupportedVertexAttributes = 16;
 
 template <typename Engine, typename Entry>
@@ -104,6 +110,9 @@ RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWind
       screen_info{info}, program_manager{program_manager}, state_tracker{state_tracker} {
     CheckExtensions();
 
+    unified_uniform_buffer.Create();
+    glNamedBufferStorage(unified_uniform_buffer.handle, TOTAL_CONST_BUFFER_BYTES, nullptr, 0);
+
     if (device.UseAssemblyShaders()) {
         glCreateBuffers(static_cast<GLsizei>(staging_cbufs.size()), staging_cbufs.data());
         for (const GLuint cbuf : staging_cbufs) {
@@ -842,34 +851,56 @@ void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, const Shad
     MICROPROFILE_SCOPE(OpenGL_UBO);
     const auto& stages = system.GPU().Maxwell3D().state.shader_stages;
     const auto& shader_stage = stages[stage_index];
+    const auto& entries = shader->GetEntries();
+    const bool use_unified = entries.use_unified_uniforms;
+    const std::size_t base_unified_offset = stage_index * NUM_CONST_BUFFERS_BYTES_PER_STAGE;
 
-    u32 binding =
-        device.UseAssemblyShaders() ? 0 : device.GetBaseBindings(stage_index).uniform_buffer;
-    for (const auto& entry : shader->GetEntries().const_buffers) {
-        const auto& buffer = shader_stage.const_buffers[entry.GetIndex()];
-        SetupConstBuffer(PARAMETER_LUT[stage_index], binding++, buffer, entry);
+    const auto base_bindings = device.GetBaseBindings(stage_index);
+    u32 binding = device.UseAssemblyShaders() ? 0 : base_bindings.uniform_buffer;
+    for (const auto& entry : entries.const_buffers) {
+        const u32 index = entry.GetIndex();
+        const auto& buffer = shader_stage.const_buffers[index];
+        SetupConstBuffer(PARAMETER_LUT[stage_index], binding, buffer, entry, use_unified,
+                         base_unified_offset + index * Maxwell::MaxConstBufferSize);
+        ++binding;
+    }
+    if (use_unified) {
+        const u32 index = static_cast<u32>(base_bindings.shader_storage_buffer +
+                                           entries.global_memory_entries.size());
+        glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, unified_uniform_buffer.handle,
+                          base_unified_offset, NUM_CONST_BUFFERS_BYTES_PER_STAGE);
     }
 }
 
 void RasterizerOpenGL::SetupComputeConstBuffers(const Shader& kernel) {
     MICROPROFILE_SCOPE(OpenGL_UBO);
     const auto& launch_desc = system.GPU().KeplerCompute().launch_description;
+    const auto& entries = kernel->GetEntries();
+    const bool use_unified = entries.use_unified_uniforms;
 
     u32 binding = 0;
-    for (const auto& entry : kernel->GetEntries().const_buffers) {
+    for (const auto& entry : entries.const_buffers) {
         const auto& config = launch_desc.const_buffer_config[entry.GetIndex()];
         const std::bitset<8> mask = launch_desc.const_buffer_enable_mask.Value();
         Tegra::Engines::ConstBufferInfo buffer;
         buffer.address = config.Address();
         buffer.size = config.size;
         buffer.enabled = mask[entry.GetIndex()];
-        SetupConstBuffer(GL_COMPUTE_PROGRAM_PARAMETER_BUFFER_NV, binding++, buffer, entry);
+        SetupConstBuffer(GL_COMPUTE_PROGRAM_PARAMETER_BUFFER_NV, binding, buffer, entry,
+                         use_unified, entry.GetIndex() * Maxwell::MaxConstBufferSize);
+        ++binding;
+    }
+    if (use_unified) {
+        const GLuint index = static_cast<GLuint>(entries.global_memory_entries.size());
+        glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, unified_uniform_buffer.handle, 0,
+                          NUM_CONST_BUFFERS_BYTES_PER_STAGE);
     }
 }
 
 void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding,
                                         const Tegra::Engines::ConstBufferInfo& buffer,
-                                        const ConstBufferEntry& entry) {
+                                        const ConstBufferEntry& entry, bool use_unified,
+                                        std::size_t unified_offset) {
     if (!buffer.enabled) {
         // Set values to zero to unbind buffers
         if (device.UseAssemblyShaders()) {
@@ -885,20 +916,29 @@ void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding,
     // UBO alignment requirements.
     const std::size_t size = Common::AlignUp(GetConstBufferSize(buffer, entry), sizeof(GLvec4));
 
-    const auto alignment = device.GetUniformBufferAlignment();
-    auto [cbuf, offset] = buffer_cache.UploadMemory(buffer.address, size, alignment, false,
-                                                    device.HasFastBufferSubData());
-    if (!device.UseAssemblyShaders()) {
-        glBindBufferRange(GL_UNIFORM_BUFFER, binding, cbuf, offset, size);
+    const bool fast_upload = !use_unified && device.HasFastBufferSubData();
+
+    const std::size_t alignment = use_unified ? 4 : device.GetUniformBufferAlignment();
+    const GPUVAddr gpu_addr = buffer.address;
+    auto [cbuf, offset] = buffer_cache.UploadMemory(gpu_addr, size, alignment, false, fast_upload);
+
+    if (device.UseAssemblyShaders()) {
+        UNIMPLEMENTED_IF(use_unified);
+        if (offset != 0) {
+            const GLuint staging_cbuf = staging_cbufs[current_cbuf++];
+            glCopyNamedBufferSubData(cbuf, staging_cbuf, offset, 0, size);
+            cbuf = staging_cbuf;
+            offset = 0;
+        }
+        glBindBufferRangeNV(stage, binding, cbuf, offset, size);
         return;
     }
-    if (offset != 0) {
-        const GLuint staging_cbuf = staging_cbufs[current_cbuf++];
-        glCopyNamedBufferSubData(cbuf, staging_cbuf, offset, 0, size);
-        cbuf = staging_cbuf;
-        offset = 0;
+
+    if (use_unified) {
+        glCopyNamedBufferSubData(cbuf, unified_uniform_buffer.handle, offset, unified_offset, size);
+    } else {
+        glBindBufferRange(GL_UNIFORM_BUFFER, binding, cbuf, offset, size);
     }
-    glBindBufferRangeNV(stage, binding, cbuf, offset, size);
 }
 
 void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader) {
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index 87f7fe159..f5dc56a0e 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -107,7 +107,8 @@ private:
 
     /// Configures a constant buffer.
     void SetupConstBuffer(GLenum stage, u32 binding, const Tegra::Engines::ConstBufferInfo& buffer,
-                          const ConstBufferEntry& entry);
+                          const ConstBufferEntry& entry, bool use_unified,
+                          std::size_t unified_offset);
 
     /// Configures the current global memory entries to use for the draw command.
     void SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader);
@@ -253,6 +254,7 @@ private:
         Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram;
     std::array<GLuint, NUM_CONSTANT_BUFFERS> staging_cbufs{};
     std::size_t current_cbuf = 0;
+    OGLBuffer unified_uniform_buffer;
 
     /// Number of commands queued to the OpenGL driver. Reseted on flush.
     std::size_t num_queued_commands = 0;
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp
index 4cd0f36cf..a991ca64a 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -241,8 +241,9 @@ Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params,
     entry.bindless_samplers = registry->GetBindlessSamplers();
     params.disk_cache.SaveEntry(std::move(entry));
 
-    return std::shared_ptr<CachedShader>(new CachedShader(
-        params.cpu_addr, size_in_bytes, std::move(registry), MakeEntries(ir), std::move(program)));
+    return std::shared_ptr<CachedShader>(
+        new CachedShader(params.cpu_addr, size_in_bytes, std::move(registry),
+                         MakeEntries(params.device, ir, shader_type), std::move(program)));
 }
 
 Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code) {
@@ -265,8 +266,9 @@ Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, Prog
     entry.bindless_samplers = registry->GetBindlessSamplers();
     params.disk_cache.SaveEntry(std::move(entry));
 
-    return std::shared_ptr<CachedShader>(new CachedShader(
-        params.cpu_addr, size_in_bytes, std::move(registry), MakeEntries(ir), std::move(program)));
+    return std::shared_ptr<CachedShader>(
+        new CachedShader(params.cpu_addr, size_in_bytes, std::move(registry),
+                         MakeEntries(params.device, ir, ShaderType::Compute), std::move(program)));
 }
 
 Shader CachedShader::CreateFromCache(const ShaderParameters& params,
@@ -348,7 +350,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
         PrecompiledShader shader;
         shader.program = std::move(program);
         shader.registry = std::move(registry);
-        shader.entries = MakeEntries(ir);
+        shader.entries = MakeEntries(device, ir, entry.type);
 
         std::scoped_lock lock{mutex};
         if (callback) {
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index 9cb115959..502b95973 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -61,8 +61,8 @@ struct TextureDerivates {};
 using TextureArgument = std::pair<Type, Node>;
 using TextureIR = std::variant<TextureOffset, TextureDerivates, TextureArgument>;
 
-constexpr u32 MAX_CONSTBUFFER_ELEMENTS =
-    static_cast<u32>(Maxwell::MaxConstBufferSize) / (4 * sizeof(float));
+constexpr u32 MAX_CONSTBUFFER_SCALARS = static_cast<u32>(Maxwell::MaxConstBufferSize) / sizeof(u32);
+constexpr u32 MAX_CONSTBUFFER_ELEMENTS = MAX_CONSTBUFFER_SCALARS / sizeof(u32);
 
 constexpr std::string_view CommonDeclarations = R"(#define ftoi floatBitsToInt
 #define ftou floatBitsToUint
@@ -402,6 +402,13 @@ std::string FlowStackTopName(MetaStackClass stack) {
     return fmt::format("{}_flow_stack_top", GetFlowStackPrefix(stack));
 }
 
+bool UseUnifiedUniforms(const Device& device, const ShaderIR& ir, ShaderType stage) {
+    const u32 num_ubos = static_cast<u32>(ir.GetConstantBuffers().size());
+    // We waste one UBO for emulation
+    const u32 num_available_ubos = device.GetMaxUniformBuffers(stage) - 1;
+    return num_ubos > num_available_ubos;
+}
+
 struct GenericVaryingDescription {
     std::string name;
     u8 first_element = 0;
@@ -412,8 +419,9 @@ class GLSLDecompiler final {
 public:
     explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry,
                             ShaderType stage, std::string_view identifier, std::string_view suffix)
-        : device{device}, ir{ir}, registry{registry}, stage{stage},
-          identifier{identifier}, suffix{suffix}, header{ir.GetHeader()} {
+        : device{device}, ir{ir}, registry{registry}, stage{stage}, identifier{identifier},
+          suffix{suffix}, header{ir.GetHeader()}, use_unified_uniforms{
+                                                      UseUnifiedUniforms(device, ir, stage)} {
         if (stage != ShaderType::Compute) {
             transform_feedback = BuildTransformFeedback(registry.GetGraphicsInfo());
         }
@@ -834,12 +842,24 @@ private:
     }
 
     void DeclareConstantBuffers() {
+        if (use_unified_uniforms) {
+            const u32 binding = device.GetBaseBindings(stage).shader_storage_buffer +
+                                static_cast<u32>(ir.GetGlobalMemory().size());
+            code.AddLine("layout (std430, binding = {}) readonly buffer UnifiedUniforms {{",
+                         binding);
+            code.AddLine("    uint cbufs[];");
+            code.AddLine("}};");
+            code.AddNewLine();
+            return;
+        }
+
         u32 binding = device.GetBaseBindings(stage).uniform_buffer;
-        for (const auto& buffers : ir.GetConstantBuffers()) {
-            const auto index = buffers.first;
+        for (const auto [index, info] : ir.GetConstantBuffers()) {
+            const u32 num_elements = Common::AlignUp(info.GetSize(), 4) / 4;
+            const u32 size = info.IsIndirect() ? MAX_CONSTBUFFER_ELEMENTS : num_elements;
             code.AddLine("layout (std140, binding = {}) uniform {} {{", binding++,
                          GetConstBufferBlock(index));
-            code.AddLine("    uvec4 {}[{}];", GetConstBuffer(index), MAX_CONSTBUFFER_ELEMENTS);
+            code.AddLine("    uvec4 {}[{}];", GetConstBuffer(index), size);
             code.AddLine("}};");
             code.AddNewLine();
         }
@@ -1038,42 +1058,51 @@ private:
 
         if (const auto cbuf = std::get_if<CbufNode>(&*node)) {
             const Node offset = cbuf->GetOffset();
+            const u32 base_unified_offset = cbuf->GetIndex() * MAX_CONSTBUFFER_SCALARS;
+
             if (const auto immediate = std::get_if<ImmediateNode>(&*offset)) {
                 // Direct access
                 const u32 offset_imm = immediate->GetValue();
                 ASSERT_MSG(offset_imm % 4 == 0, "Unaligned cbuf direct access");
-                return {fmt::format("{}[{}][{}]", GetConstBuffer(cbuf->GetIndex()),
-                                    offset_imm / (4 * 4), (offset_imm / 4) % 4),
-                        Type::Uint};
+                if (use_unified_uniforms) {
+                    return {fmt::format("cbufs[{}]", base_unified_offset + offset_imm / 4),
+                            Type::Uint};
+                } else {
+                    return {fmt::format("{}[{}][{}]", GetConstBuffer(cbuf->GetIndex()),
+                                        offset_imm / (4 * 4), (offset_imm / 4) % 4),
+                            Type::Uint};
+                }
             }
 
-            if (std::holds_alternative<OperationNode>(*offset)) {
-                // Indirect access
-                const std::string final_offset = code.GenerateTemporary();
-                code.AddLine("uint {} = {} >> 2;", final_offset, Visit(offset).AsUint());
+            // Indirect access
+            if (use_unified_uniforms) {
+                return {fmt::format("cbufs[{} + ({} >> 2)]", base_unified_offset,
+                                    Visit(offset).AsUint()),
+                        Type::Uint};
+            }
 
-                if (!device.HasComponentIndexingBug()) {
-                    return {fmt::format("{}[{} >> 2][{} & 3]", GetConstBuffer(cbuf->GetIndex()),
-                                        final_offset, final_offset),
-                            Type::Uint};
-                }
+            const std::string final_offset = code.GenerateTemporary();
+            code.AddLine("uint {} = {} >> 2;", final_offset, Visit(offset).AsUint());
 
-                // AMD's proprietary GLSL compiler emits ill code for variable component access.
-                // To bypass this driver bug generate 4 ifs, one per each component.
-                const std::string pack = code.GenerateTemporary();
-                code.AddLine("uvec4 {} = {}[{} >> 2];", pack, GetConstBuffer(cbuf->GetIndex()),
-                             final_offset);
-
-                const std::string result = code.GenerateTemporary();
-                code.AddLine("uint {};", result);
-                for (u32 swizzle = 0; swizzle < 4; ++swizzle) {
-                    code.AddLine("if (({} & 3) == {}) {} = {}{};", final_offset, swizzle, result,
-                                 pack, GetSwizzle(swizzle));
-                }
-                return {result, Type::Uint};
+            if (!device.HasComponentIndexingBug()) {
+                return {fmt::format("{}[{} >> 2][{} & 3]", GetConstBuffer(cbuf->GetIndex()),
+                                    final_offset, final_offset),
+                        Type::Uint};
             }
 
-            UNREACHABLE_MSG("Unmanaged offset node type");
+            // AMD's proprietary GLSL compiler emits ill code for variable component access.
+            // To bypass this driver bug generate 4 ifs, one per each component.
+            const std::string pack = code.GenerateTemporary();
+            code.AddLine("uvec4 {} = {}[{} >> 2];", pack, GetConstBuffer(cbuf->GetIndex()),
+                         final_offset);
+
+            const std::string result = code.GenerateTemporary();
+            code.AddLine("uint {};", result);
+            for (u32 swizzle = 0; swizzle < 4; ++swizzle) {
+                code.AddLine("if (({} & 3) == {}) {} = {}{};", final_offset, swizzle, result, pack,
+                             GetSwizzle(swizzle));
+            }
+            return {result, Type::Uint};
         }
 
         if (const auto gmem = std::get_if<GmemNode>(&*node)) {
@@ -2710,6 +2739,7 @@ private:
     const std::string_view identifier;
     const std::string_view suffix;
     const Header header;
+    const bool use_unified_uniforms;
     std::unordered_map<u8, VaryingTFB> transform_feedback;
 
     ShaderWriter code;
@@ -2905,7 +2935,7 @@ void GLSLDecompiler::DecompileAST() {
 
 } // Anonymous namespace
 
-ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir) {
+ShaderEntries MakeEntries(const Device& device, const ShaderIR& ir, ShaderType stage) {
     ShaderEntries entries;
     for (const auto& cbuf : ir.GetConstantBuffers()) {
         entries.const_buffers.emplace_back(cbuf.second.GetMaxOffset(), cbuf.second.IsIndirect(),
@@ -2926,6 +2956,7 @@ ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir) {
         entries.clip_distances = (clip_distances[i] ? 1U : 0U) << i;
     }
     entries.shader_length = ir.GetLength();
+    entries.use_unified_uniforms = UseUnifiedUniforms(device, ir, stage);
     return entries;
 }
 
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.h b/src/video_core/renderer_opengl/gl_shader_decompiler.h
index e8a178764..451c9689a 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.h
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.h
@@ -53,11 +53,13 @@ struct ShaderEntries {
     std::vector<GlobalMemoryEntry> global_memory_entries;
     std::vector<SamplerEntry> samplers;
     std::vector<ImageEntry> images;
-    u32 clip_distances{};
     std::size_t shader_length{};
+    u32 clip_distances{};
+    bool use_unified_uniforms{};
 };
 
-ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir);
+ShaderEntries MakeEntries(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
+                          Tegra::Engines::ShaderType stage);
 
 std::string DecompileShader(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
                             const VideoCommon::Shader::Registry& registry,