| author | 2020-06-02 14:58:50 -0400 |
|---|---|
| committer | 2020-06-02 14:58:50 -0400 |
| commit | 597d8b4bd457ff0aa1293ff4ac7761e2eefc9150 (patch) |
| tree | f16a61443b947bf5cc336d51e32111c8f67eb6b3 /src |
| parent | Merge pull request #4016 from ReinUsesLisp/invocation-info (diff) |
| parent | glsl: Squash constant buffers into a single SSBO when we hit the limit (diff) |
Merge pull request #4006 from ReinUsesLisp/squash-ubos
glsl: Squash constant buffers into a single SSBO when we hit the limit
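The idea, roughly: each Maxwell constant buffer normally becomes its own `std140` uniform block, but a stage can use more constant buffers than the driver exposes uniform-block bindings for. When that happens, this merge makes the decompiler read every constant buffer of the stage out of one `readonly std430` SSBO instead, and the device now queries `GL_MAX_*_UNIFORM_BLOCKS` per stage so that decision can be made per shader. A minimal GLSL sketch of the two interfaces is below; the block names, binding numbers and array size are illustrative, not literal decompiler output.

```glsl
#version 430

// Regular path: one std140 uniform block per constant buffer, packed as uvec4s.
layout (std140, binding = 0) uniform cbuf_block_3 {
    uvec4 cbuf3[4096];
};

// Fallback path from this merge: all of the stage's constant buffers are read
// from a single readonly std430 SSBO holding a flat array of 32-bit words.
layout (std430, binding = 2) readonly buffer UnifiedUniforms {
    uint cbufs[];
};

void main() {}
```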
Diffstat (limited to 'src')
| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | src/video_core/renderer_opengl/gl_device.cpp | 46 |
| -rw-r--r-- | src/video_core/renderer_opengl/gl_device.h | 7 |
| -rw-r--r-- | src/video_core/renderer_opengl/gl_rasterizer.cpp | 78 |
| -rw-r--r-- | src/video_core/renderer_opengl/gl_rasterizer.h | 4 |
| -rw-r--r-- | src/video_core/renderer_opengl/gl_shader_cache.cpp | 12 |
| -rw-r--r-- | src/video_core/renderer_opengl/gl_shader_decompiler.cpp | 99 |
| -rw-r--r-- | src/video_core/renderer_opengl/gl_shader_decompiler.h | 6 |

7 files changed, 173 insertions, 79 deletions
diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp
index e1b245288..d20547c04 100644
--- a/src/video_core/renderer_opengl/gl_device.cpp
+++ b/src/video_core/renderer_opengl/gl_device.cpp
@@ -6,6 +6,7 @@
 #include <array>
 #include <cstddef>
 #include <cstring>
+#include <limits>
 #include <optional>
 #include <vector>
 
@@ -26,24 +27,27 @@ constexpr u32 ReservedUniformBlocks = 1;
 
 constexpr u32 NumStages = 5;
 
-constexpr std::array LimitUBOs = {GL_MAX_VERTEX_UNIFORM_BLOCKS, GL_MAX_TESS_CONTROL_UNIFORM_BLOCKS,
-                                  GL_MAX_TESS_EVALUATION_UNIFORM_BLOCKS,
-                                  GL_MAX_GEOMETRY_UNIFORM_BLOCKS, GL_MAX_FRAGMENT_UNIFORM_BLOCKS};
+constexpr std::array LimitUBOs = {
+    GL_MAX_VERTEX_UNIFORM_BLOCKS, GL_MAX_TESS_CONTROL_UNIFORM_BLOCKS,
+    GL_MAX_TESS_EVALUATION_UNIFORM_BLOCKS, GL_MAX_GEOMETRY_UNIFORM_BLOCKS,
+    GL_MAX_FRAGMENT_UNIFORM_BLOCKS, GL_MAX_COMPUTE_UNIFORM_BLOCKS};
 
 constexpr std::array LimitSSBOs = {
     GL_MAX_VERTEX_SHADER_STORAGE_BLOCKS, GL_MAX_TESS_CONTROL_SHADER_STORAGE_BLOCKS,
     GL_MAX_TESS_EVALUATION_SHADER_STORAGE_BLOCKS, GL_MAX_GEOMETRY_SHADER_STORAGE_BLOCKS,
-    GL_MAX_FRAGMENT_SHADER_STORAGE_BLOCKS};
+    GL_MAX_FRAGMENT_SHADER_STORAGE_BLOCKS, GL_MAX_COMPUTE_SHADER_STORAGE_BLOCKS};
 
-constexpr std::array LimitSamplers = {
-    GL_MAX_VERTEX_TEXTURE_IMAGE_UNITS, GL_MAX_TESS_CONTROL_TEXTURE_IMAGE_UNITS,
-    GL_MAX_TESS_EVALUATION_TEXTURE_IMAGE_UNITS, GL_MAX_GEOMETRY_TEXTURE_IMAGE_UNITS,
-    GL_MAX_TEXTURE_IMAGE_UNITS};
+constexpr std::array LimitSamplers = {GL_MAX_VERTEX_TEXTURE_IMAGE_UNITS,
+                                      GL_MAX_TESS_CONTROL_TEXTURE_IMAGE_UNITS,
+                                      GL_MAX_TESS_EVALUATION_TEXTURE_IMAGE_UNITS,
+                                      GL_MAX_GEOMETRY_TEXTURE_IMAGE_UNITS,
+                                      GL_MAX_TEXTURE_IMAGE_UNITS,
+                                      GL_MAX_COMPUTE_TEXTURE_IMAGE_UNITS};
 
-constexpr std::array LimitImages = {GL_MAX_VERTEX_IMAGE_UNIFORMS,
-                                    GL_MAX_TESS_CONTROL_IMAGE_UNIFORMS,
-                                    GL_MAX_TESS_EVALUATION_IMAGE_UNIFORMS,
-                                    GL_MAX_GEOMETRY_IMAGE_UNIFORMS, GL_MAX_FRAGMENT_IMAGE_UNIFORMS};
+constexpr std::array LimitImages = {
+    GL_MAX_VERTEX_IMAGE_UNIFORMS, GL_MAX_TESS_CONTROL_IMAGE_UNIFORMS,
+    GL_MAX_TESS_EVALUATION_IMAGE_UNIFORMS, GL_MAX_GEOMETRY_IMAGE_UNIFORMS,
+    GL_MAX_FRAGMENT_IMAGE_UNIFORMS, GL_MAX_COMPUTE_IMAGE_UNIFORMS};
 
 template <typename T>
 T GetInteger(GLenum pname) {
@@ -85,6 +89,13 @@ u32 Extract(u32& base, u32& num, u32 amount, std::optional<GLenum> limit = {}) {
     return std::exchange(base, base + amount);
 }
 
+std::array<u32, Tegra::Engines::MaxShaderTypes> BuildMaxUniformBuffers() noexcept {
+    std::array<u32, Tegra::Engines::MaxShaderTypes> max;
+    std::transform(LimitUBOs.begin(), LimitUBOs.end(), max.begin(),
+                   [](GLenum pname) { return GetInteger<u32>(pname); });
+    return max;
+}
+
 std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindings() noexcept {
     std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> bindings;
 
@@ -159,7 +170,8 @@ bool IsASTCSupported() {
 
 } // Anonymous namespace
 
-Device::Device() : base_bindings{BuildBaseBindings()} {
+Device::Device()
+    : max_uniform_buffers{BuildMaxUniformBuffers()}, base_bindings{BuildBaseBindings()} {
     const std::string_view vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR));
     const auto renderer = reinterpret_cast<const char*>(glGetString(GL_RENDERER));
     const std::vector extensions = GetExtensions();
@@ -194,7 +206,9 @@ Device::Device() : base_bindings{BuildBaseBindings()} {
 }
 
 Device::Device(std::nullptr_t) {
-    uniform_buffer_alignment = 0;
+    max_uniform_buffers.fill(std::numeric_limits<u32>::max());
+    uniform_buffer_alignment = 4;
+    shader_storage_alignment = 4;
     max_vertex_attributes = 16;
     max_varyings = 15;
     has_warp_intrinsics = true;
@@ -202,8 +216,6 @@ Device::Device(std::nullptr_t) {
     has_vertex_viewport_layer = true;
     has_image_load_formatted = true;
    has_variable_aoffi = true;
-    has_component_indexing_bug = false;
-    has_precise_bug = false;
 }
 
 bool Device::TestVariableAoffi() {
diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h
index 683ed9002..98cca0254 100644
--- a/src/video_core/renderer_opengl/gl_device.h
+++ b/src/video_core/renderer_opengl/gl_device.h
@@ -24,6 +24,10 @@ public:
     explicit Device();
     explicit Device(std::nullptr_t);
 
+    u32 GetMaxUniformBuffers(Tegra::Engines::ShaderType shader_type) const noexcept {
+        return max_uniform_buffers[static_cast<std::size_t>(shader_type)];
+    }
+
     const BaseBindings& GetBaseBindings(std::size_t stage_index) const noexcept {
         return base_bindings[stage_index];
     }
@@ -92,7 +96,8 @@ private:
     static bool TestVariableAoffi();
     static bool TestPreciseBug();
 
-    std::array<BaseBindings, Tegra::Engines::MaxShaderTypes> base_bindings;
+    std::array<u32, Tegra::Engines::MaxShaderTypes> max_uniform_buffers{};
+    std::array<BaseBindings, Tegra::Engines::MaxShaderTypes> base_bindings{};
     std::size_t uniform_buffer_alignment{};
     std::size_t shader_storage_alignment{};
     u32 max_vertex_attributes{};
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 3c421dd16..55e79aaf6 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -54,6 +54,12 @@ MICROPROFILE_DEFINE(OpenGL_PrimitiveAssembly, "OpenGL", "Prim Asmbl", MP_RGB(255
 
 namespace {
 
+constexpr std::size_t NUM_CONST_BUFFERS_PER_STAGE = 18;
+constexpr std::size_t NUM_CONST_BUFFERS_BYTES_PER_STAGE =
+    NUM_CONST_BUFFERS_PER_STAGE * Maxwell::MaxConstBufferSize;
+constexpr std::size_t TOTAL_CONST_BUFFER_BYTES =
+    NUM_CONST_BUFFERS_BYTES_PER_STAGE * Maxwell::MaxShaderStage;
+
 constexpr std::size_t NumSupportedVertexAttributes = 16;
 
 template <typename Engine, typename Entry>
@@ -104,6 +110,9 @@ RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWind
       screen_info{info}, program_manager{program_manager}, state_tracker{state_tracker} {
     CheckExtensions();
 
+    unified_uniform_buffer.Create();
+    glNamedBufferStorage(unified_uniform_buffer.handle, TOTAL_CONST_BUFFER_BYTES, nullptr, 0);
+
     if (device.UseAssemblyShaders()) {
         glCreateBuffers(static_cast<GLsizei>(staging_cbufs.size()), staging_cbufs.data());
         for (const GLuint cbuf : staging_cbufs) {
@@ -842,34 +851,56 @@ void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, const Shad
     MICROPROFILE_SCOPE(OpenGL_UBO);
     const auto& stages = system.GPU().Maxwell3D().state.shader_stages;
     const auto& shader_stage = stages[stage_index];
+    const auto& entries = shader->GetEntries();
+    const bool use_unified = entries.use_unified_uniforms;
+    const std::size_t base_unified_offset = stage_index * NUM_CONST_BUFFERS_BYTES_PER_STAGE;
 
-    u32 binding =
-        device.UseAssemblyShaders() ? 0 : device.GetBaseBindings(stage_index).uniform_buffer;
-    for (const auto& entry : shader->GetEntries().const_buffers) {
-        const auto& buffer = shader_stage.const_buffers[entry.GetIndex()];
-        SetupConstBuffer(PARAMETER_LUT[stage_index], binding++, buffer, entry);
+    const auto base_bindings = device.GetBaseBindings(stage_index);
+    u32 binding = device.UseAssemblyShaders() ? 0 : base_bindings.uniform_buffer;
+    for (const auto& entry : entries.const_buffers) {
+        const u32 index = entry.GetIndex();
+        const auto& buffer = shader_stage.const_buffers[index];
+        SetupConstBuffer(PARAMETER_LUT[stage_index], binding, buffer, entry, use_unified,
+                         base_unified_offset + index * Maxwell::MaxConstBufferSize);
+        ++binding;
+    }
+    if (use_unified) {
+        const u32 index = static_cast<u32>(base_bindings.shader_storage_buffer +
+                                           entries.global_memory_entries.size());
+        glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, unified_uniform_buffer.handle,
+                          base_unified_offset, NUM_CONST_BUFFERS_BYTES_PER_STAGE);
     }
 }
 
 void RasterizerOpenGL::SetupComputeConstBuffers(const Shader& kernel) {
     MICROPROFILE_SCOPE(OpenGL_UBO);
     const auto& launch_desc = system.GPU().KeplerCompute().launch_description;
+    const auto& entries = kernel->GetEntries();
+    const bool use_unified = entries.use_unified_uniforms;
 
     u32 binding = 0;
-    for (const auto& entry : kernel->GetEntries().const_buffers) {
+    for (const auto& entry : entries.const_buffers) {
         const auto& config = launch_desc.const_buffer_config[entry.GetIndex()];
         const std::bitset<8> mask = launch_desc.const_buffer_enable_mask.Value();
         Tegra::Engines::ConstBufferInfo buffer;
         buffer.address = config.Address();
         buffer.size = config.size;
         buffer.enabled = mask[entry.GetIndex()];
-        SetupConstBuffer(GL_COMPUTE_PROGRAM_PARAMETER_BUFFER_NV, binding++, buffer, entry);
+        SetupConstBuffer(GL_COMPUTE_PROGRAM_PARAMETER_BUFFER_NV, binding, buffer, entry,
+                         use_unified, entry.GetIndex() * Maxwell::MaxConstBufferSize);
+        ++binding;
+    }
+    if (use_unified) {
+        const GLuint index = static_cast<GLuint>(entries.global_memory_entries.size());
+        glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, unified_uniform_buffer.handle, 0,
+                          NUM_CONST_BUFFERS_BYTES_PER_STAGE);
     }
 }
 
 void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding,
                                         const Tegra::Engines::ConstBufferInfo& buffer,
-                                        const ConstBufferEntry& entry) {
+                                        const ConstBufferEntry& entry, bool use_unified,
+                                        std::size_t unified_offset) {
     if (!buffer.enabled) {
         // Set values to zero to unbind buffers
         if (device.UseAssemblyShaders()) {
@@ -885,20 +916,29 @@ void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding,
     // UBO alignment requirements.
     const std::size_t size = Common::AlignUp(GetConstBufferSize(buffer, entry), sizeof(GLvec4));
 
-    const auto alignment = device.GetUniformBufferAlignment();
-    auto [cbuf, offset] = buffer_cache.UploadMemory(buffer.address, size, alignment, false,
-                                                    device.HasFastBufferSubData());
-    if (!device.UseAssemblyShaders()) {
-        glBindBufferRange(GL_UNIFORM_BUFFER, binding, cbuf, offset, size);
+    const bool fast_upload = !use_unified && device.HasFastBufferSubData();
+
+    const std::size_t alignment = use_unified ? 4 : device.GetUniformBufferAlignment();
+    const GPUVAddr gpu_addr = buffer.address;
+    auto [cbuf, offset] = buffer_cache.UploadMemory(gpu_addr, size, alignment, false, fast_upload);
+
+    if (device.UseAssemblyShaders()) {
+        UNIMPLEMENTED_IF(use_unified);
+        if (offset != 0) {
+            const GLuint staging_cbuf = staging_cbufs[current_cbuf++];
+            glCopyNamedBufferSubData(cbuf, staging_cbuf, offset, 0, size);
+            cbuf = staging_cbuf;
+            offset = 0;
+        }
+        glBindBufferRangeNV(stage, binding, cbuf, offset, size);
         return;
     }
-    if (offset != 0) {
-        const GLuint staging_cbuf = staging_cbufs[current_cbuf++];
-        glCopyNamedBufferSubData(cbuf, staging_cbuf, offset, 0, size);
-        cbuf = staging_cbuf;
-        offset = 0;
+
+    if (use_unified) {
+        glCopyNamedBufferSubData(cbuf, unified_uniform_buffer.handle, offset, unified_offset, size);
+    } else {
+        glBindBufferRange(GL_UNIFORM_BUFFER, binding, cbuf, offset, size);
     }
-    glBindBufferRangeNV(stage, binding, cbuf, offset, size);
 }
 
 void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader) {
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index 87f7fe159..f5dc56a0e 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -107,7 +107,8 @@ private:
 
     /// Configures a constant buffer.
     void SetupConstBuffer(GLenum stage, u32 binding, const Tegra::Engines::ConstBufferInfo& buffer,
-                          const ConstBufferEntry& entry);
+                          const ConstBufferEntry& entry, bool use_unified,
+                          std::size_t unified_offset);
 
     /// Configures the current global memory entries to use for the draw command.
     void SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader);
@@ -253,6 +254,7 @@ private:
         Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram;
     std::array<GLuint, NUM_CONSTANT_BUFFERS> staging_cbufs{};
     std::size_t current_cbuf = 0;
+    OGLBuffer unified_uniform_buffer;
 
     /// Number of commands queued to the OpenGL driver. Reseted on flush.
     std::size_t num_queued_commands = 0;
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp
index 4cd0f36cf..a991ca64a 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -241,8 +241,9 @@ Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params,
     entry.bindless_samplers = registry->GetBindlessSamplers();
     params.disk_cache.SaveEntry(std::move(entry));
 
-    return std::shared_ptr<CachedShader>(new CachedShader(
-        params.cpu_addr, size_in_bytes, std::move(registry), MakeEntries(ir), std::move(program)));
+    return std::shared_ptr<CachedShader>(
+        new CachedShader(params.cpu_addr, size_in_bytes, std::move(registry),
+                         MakeEntries(params.device, ir, shader_type), std::move(program)));
 }
 
 Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code) {
@@ -265,8 +266,9 @@ Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, Prog
     entry.bindless_samplers = registry->GetBindlessSamplers();
     params.disk_cache.SaveEntry(std::move(entry));
 
-    return std::shared_ptr<CachedShader>(new CachedShader(
-        params.cpu_addr, size_in_bytes, std::move(registry), MakeEntries(ir), std::move(program)));
+    return std::shared_ptr<CachedShader>(
+        new CachedShader(params.cpu_addr, size_in_bytes, std::move(registry),
+                         MakeEntries(params.device, ir, ShaderType::Compute), std::move(program)));
 }
 
 Shader CachedShader::CreateFromCache(const ShaderParameters& params,
@@ -348,7 +350,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
         PrecompiledShader shader;
         shader.program = std::move(program);
         shader.registry = std::move(registry);
-        shader.entries = MakeEntries(ir);
+        shader.entries = MakeEntries(device, ir, entry.type);
 
         std::scoped_lock lock{mutex};
         if (callback) {
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index 9cb115959..502b95973 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -61,8 +61,8 @@ struct TextureDerivates {};
 using TextureArgument = std::pair<Type, Node>;
 using TextureIR = std::variant<TextureOffset, TextureDerivates, TextureArgument>;
 
-constexpr u32 MAX_CONSTBUFFER_ELEMENTS =
-    static_cast<u32>(Maxwell::MaxConstBufferSize) / (4 * sizeof(float));
+constexpr u32 MAX_CONSTBUFFER_SCALARS = static_cast<u32>(Maxwell::MaxConstBufferSize) / sizeof(u32);
+constexpr u32 MAX_CONSTBUFFER_ELEMENTS = MAX_CONSTBUFFER_SCALARS / sizeof(u32);
 
 constexpr std::string_view CommonDeclarations = R"(#define ftoi floatBitsToInt
 #define ftou floatBitsToUint
@@ -402,6 +402,13 @@ std::string FlowStackTopName(MetaStackClass stack) {
     return fmt::format("{}_flow_stack_top", GetFlowStackPrefix(stack));
 }
 
+bool UseUnifiedUniforms(const Device& device, const ShaderIR& ir, ShaderType stage) {
+    const u32 num_ubos = static_cast<u32>(ir.GetConstantBuffers().size());
+    // We waste one UBO for emulation
+    const u32 num_available_ubos = device.GetMaxUniformBuffers(stage) - 1;
+    return num_ubos > num_available_ubos;
+}
+
 struct GenericVaryingDescription {
     std::string name;
     u8 first_element = 0;
@@ -412,8 +419,9 @@ class GLSLDecompiler final {
 public:
     explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry,
                             ShaderType stage, std::string_view identifier, std::string_view suffix)
-        : device{device}, ir{ir}, registry{registry}, stage{stage},
-          identifier{identifier}, suffix{suffix}, header{ir.GetHeader()} {
+        : device{device}, ir{ir}, registry{registry}, stage{stage}, identifier{identifier},
+          suffix{suffix}, header{ir.GetHeader()}, use_unified_uniforms{
+                                                      UseUnifiedUniforms(device, ir, stage)} {
         if (stage != ShaderType::Compute) {
             transform_feedback = BuildTransformFeedback(registry.GetGraphicsInfo());
         }
@@ -834,12 +842,24 @@
     }
 
     void DeclareConstantBuffers() {
+        if (use_unified_uniforms) {
+            const u32 binding = device.GetBaseBindings(stage).shader_storage_buffer +
+                                static_cast<u32>(ir.GetGlobalMemory().size());
+            code.AddLine("layout (std430, binding = {}) readonly buffer UnifiedUniforms {{",
+                         binding);
+            code.AddLine("    uint cbufs[];");
+            code.AddLine("}};");
+            code.AddNewLine();
+            return;
+        }
+
         u32 binding = device.GetBaseBindings(stage).uniform_buffer;
-        for (const auto& buffers : ir.GetConstantBuffers()) {
-            const auto index = buffers.first;
+        for (const auto [index, info] : ir.GetConstantBuffers()) {
+            const u32 num_elements = Common::AlignUp(info.GetSize(), 4) / 4;
+            const u32 size = info.IsIndirect() ? MAX_CONSTBUFFER_ELEMENTS : num_elements;
             code.AddLine("layout (std140, binding = {}) uniform {} {{", binding++,
                          GetConstBufferBlock(index));
-            code.AddLine("    uvec4 {}[{}];", GetConstBuffer(index), MAX_CONSTBUFFER_ELEMENTS);
+            code.AddLine("    uvec4 {}[{}];", GetConstBuffer(index), size);
             code.AddLine("}};");
             code.AddNewLine();
         }
@@ -1038,42 +1058,51 @@
 
         if (const auto cbuf = std::get_if<CbufNode>(&*node)) {
             const Node offset = cbuf->GetOffset();
+            const u32 base_unified_offset = cbuf->GetIndex() * MAX_CONSTBUFFER_SCALARS;
+
             if (const auto immediate = std::get_if<ImmediateNode>(&*offset)) {
                 // Direct access
                 const u32 offset_imm = immediate->GetValue();
                 ASSERT_MSG(offset_imm % 4 == 0, "Unaligned cbuf direct access");
-                return {fmt::format("{}[{}][{}]", GetConstBuffer(cbuf->GetIndex()),
-                                    offset_imm / (4 * 4), (offset_imm / 4) % 4),
-                        Type::Uint};
+                if (use_unified_uniforms) {
+                    return {fmt::format("cbufs[{}]", base_unified_offset + offset_imm / 4),
+                            Type::Uint};
+                } else {
+                    return {fmt::format("{}[{}][{}]", GetConstBuffer(cbuf->GetIndex()),
+                                        offset_imm / (4 * 4), (offset_imm / 4) % 4),
+                            Type::Uint};
+                }
             }
 
-            if (std::holds_alternative<OperationNode>(*offset)) {
-                // Indirect access
-                const std::string final_offset = code.GenerateTemporary();
-                code.AddLine("uint {} = {} >> 2;", final_offset, Visit(offset).AsUint());
+            // Indirect access
+            if (use_unified_uniforms) {
+                return {fmt::format("cbufs[{} + ({} >> 2)]", base_unified_offset,
+                                    Visit(offset).AsUint()),
+                        Type::Uint};
+            }
 
-                if (!device.HasComponentIndexingBug()) {
-                    return {fmt::format("{}[{} >> 2][{} & 3]", GetConstBuffer(cbuf->GetIndex()),
-                                        final_offset, final_offset),
-                            Type::Uint};
-                }
+            const std::string final_offset = code.GenerateTemporary();
+            code.AddLine("uint {} = {} >> 2;", final_offset, Visit(offset).AsUint());
 
-                // AMD's proprietary GLSL compiler emits ill code for variable component access.
-                // To bypass this driver bug generate 4 ifs, one per each component.
-                const std::string pack = code.GenerateTemporary();
-                code.AddLine("uvec4 {} = {}[{} >> 2];", pack, GetConstBuffer(cbuf->GetIndex()),
-                             final_offset);
-
-                const std::string result = code.GenerateTemporary();
-                code.AddLine("uint {};", result);
-                for (u32 swizzle = 0; swizzle < 4; ++swizzle) {
-                    code.AddLine("if (({} & 3) == {}) {} = {}{};", final_offset, swizzle, result,
-                                 pack, GetSwizzle(swizzle));
-                }
-                return {result, Type::Uint};
+            if (!device.HasComponentIndexingBug()) {
+                return {fmt::format("{}[{} >> 2][{} & 3]", GetConstBuffer(cbuf->GetIndex()),
+                                    final_offset, final_offset),
+                        Type::Uint};
             }
 
-            UNREACHABLE_MSG("Unmanaged offset node type");
+            // AMD's proprietary GLSL compiler emits ill code for variable component access.
+            // To bypass this driver bug generate 4 ifs, one per each component.
+            const std::string pack = code.GenerateTemporary();
+            code.AddLine("uvec4 {} = {}[{} >> 2];", pack, GetConstBuffer(cbuf->GetIndex()),
+                         final_offset);
+
+            const std::string result = code.GenerateTemporary();
+            code.AddLine("uint {};", result);
+            for (u32 swizzle = 0; swizzle < 4; ++swizzle) {
+                code.AddLine("if (({} & 3) == {}) {} = {}{};", final_offset, swizzle, result, pack,
+                             GetSwizzle(swizzle));
+            }
+            return {result, Type::Uint};
         }
 
         if (const auto gmem = std::get_if<GmemNode>(&*node)) {
@@ -2710,6 +2739,7 @@
     const std::string_view identifier;
     const std::string_view suffix;
     const Header header;
+    const bool use_unified_uniforms;
     std::unordered_map<u8, VaryingTFB> transform_feedback;
 
     ShaderWriter code;
@@ -2905,7 +2935,7 @@ void GLSLDecompiler::DecompileAST() {
 
 } // Anonymous namespace
 
-ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir) {
+ShaderEntries MakeEntries(const Device& device, const ShaderIR& ir, ShaderType stage) {
     ShaderEntries entries;
     for (const auto& cbuf : ir.GetConstantBuffers()) {
         entries.const_buffers.emplace_back(cbuf.second.GetMaxOffset(), cbuf.second.IsIndirect(),
@@ -2926,6 +2956,7 @@ ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir) {
         entries.clip_distances = (clip_distances[i] ? 1U : 0U) << i;
     }
     entries.shader_length = ir.GetLength();
+    entries.use_unified_uniforms = UseUnifiedUniforms(device, ir, stage);
     return entries;
 }
 
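For reference, when `use_unified_uniforms` is set the two access forms generated by the hunk above lower to flat indexing into `cbufs[]`: the buffer's base word offset is the compile-time constant `index * MAX_CONSTBUFFER_SCALARS`, immediate byte offsets are divided by 4 at decompile time, and dynamic byte offsets are shifted right by 2 in the shader. A hand-written GLSL approximation follows; the cbuf index, byte offsets and the 16384-word stride are made-up example values, not taken from a real title.

```glsl
#version 430

layout (std430, binding = 2) readonly buffer UnifiedUniforms {
    uint cbufs[];
};

layout (location = 0) out vec4 color;

void main() {
    // Example: constant buffer 5, assuming 16384 32-bit words per buffer.
    const uint base = 5u * 16384u;

    // Direct access: immediate byte offset 0x20 folded to a word index.
    uint direct = cbufs[base + (0x20u / 4u)];

    // Indirect access: a runtime byte offset is converted with a right shift.
    uint dynamic_byte_offset = uint(gl_FragCoord.x);
    uint indirect = cbufs[base + (dynamic_byte_offset >> 2)];

    color = vec4(uintBitsToFloat(direct), uintBitsToFloat(indirect), 0.0, 1.0);
}
```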
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.h b/src/video_core/renderer_opengl/gl_shader_decompiler.h
index e8a178764..451c9689a 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.h
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.h
@@ -53,11 +53,13 @@ struct ShaderEntries {
     std::vector<GlobalMemoryEntry> global_memory_entries;
     std::vector<SamplerEntry> samplers;
     std::vector<ImageEntry> images;
-    u32 clip_distances{};
     std::size_t shader_length{};
+    u32 clip_distances{};
+    bool use_unified_uniforms{};
 };
 
-ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir);
+ShaderEntries MakeEntries(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
+                          Tegra::Engines::ShaderType stage);
 
 std::string DecompileShader(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
                             const VideoCommon::Shader::Registry& registry,