diff options
| author | 2019-04-16 22:15:17 -0400 | |
|---|---|---|
| committer | 2019-04-16 22:15:17 -0400 | |
| commit | 1b83f255c290fd83562502f019799ad86a85b8a8 (patch) | |
| tree | 07fc355ae298cc71153d2653283723137ce9c958 /src | |
| parent | Merge pull request #2376 from lioncash/const (diff) | |
| parent | shader_ir: Implement STG, keep track of global memory usage and flush (diff) | |
| download | yuzu-1b83f255c290fd83562502f019799ad86a85b8a8.tar.gz yuzu-1b83f255c290fd83562502f019799ad86a85b8a8.tar.xz yuzu-1b83f255c290fd83562502f019799ad86a85b8a8.zip | |
Merge pull request #2092 from ReinUsesLisp/stg
shader/memory: Implement STG and global memory flushing
Diffstat (limited to 'src')
| -rw-r--r-- | src/video_core/engines/shader_bytecode.h | 6 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/gl_global_cache.cpp | 42 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/gl_global_cache.h | 16 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/gl_rasterizer.cpp | 4 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/gl_rasterizer.h | 4 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/gl_shader_decompiler.cpp | 36 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/gl_shader_decompiler.h | 15 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/gl_shader_disk_cache.cpp | 13 | ||||
| -rw-r--r-- | src/video_core/renderer_vulkan/vk_shader_decompiler.cpp | 14 | ||||
| -rw-r--r-- | src/video_core/shader/decode/memory.cpp | 109 | ||||
| -rw-r--r-- | src/video_core/shader/shader_ir.h | 16 |
11 files changed, 186 insertions, 89 deletions
diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h index 2e1e96c81..acf475289 100644 --- a/src/video_core/engines/shader_bytecode.h +++ b/src/video_core/engines/shader_bytecode.h | |||
| @@ -792,6 +792,12 @@ union Instruction { | |||
| 792 | } ldg; | 792 | } ldg; |
| 793 | 793 | ||
| 794 | union { | 794 | union { |
| 795 | BitField<48, 3, UniformType> type; | ||
| 796 | BitField<46, 2, u64> cache_mode; | ||
| 797 | BitField<20, 24, s64> immediate_offset; | ||
| 798 | } stg; | ||
| 799 | |||
| 800 | union { | ||
| 795 | BitField<0, 3, u64> pred0; | 801 | BitField<0, 3, u64> pred0; |
| 796 | BitField<3, 3, u64> pred3; | 802 | BitField<3, 3, u64> pred3; |
| 797 | BitField<7, 1, u64> abs_a; | 803 | BitField<7, 1, u64> abs_a; |
diff --git a/src/video_core/renderer_opengl/gl_global_cache.cpp b/src/video_core/renderer_opengl/gl_global_cache.cpp index 8d9ee81f1..ea4a593af 100644 --- a/src/video_core/renderer_opengl/gl_global_cache.cpp +++ b/src/video_core/renderer_opengl/gl_global_cache.cpp | |||
| @@ -14,28 +14,28 @@ | |||
| 14 | 14 | ||
| 15 | namespace OpenGL { | 15 | namespace OpenGL { |
| 16 | 16 | ||
| 17 | CachedGlobalRegion::CachedGlobalRegion(VAddr cpu_addr, u32 size, u8* host_ptr) | 17 | CachedGlobalRegion::CachedGlobalRegion(VAddr cpu_addr, u8* host_ptr, u32 size, u32 max_size) |
| 18 | : RasterizerCacheObject{host_ptr}, cpu_addr{cpu_addr}, size{size} { | 18 | : RasterizerCacheObject{host_ptr}, cpu_addr{cpu_addr}, host_ptr{host_ptr}, size{size}, |
| 19 | max_size{max_size} { | ||
| 19 | buffer.Create(); | 20 | buffer.Create(); |
| 20 | // Bind and unbind the buffer so it gets allocated by the driver | ||
| 21 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, buffer.handle); | ||
| 22 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0); | ||
| 23 | LabelGLObject(GL_BUFFER, buffer.handle, cpu_addr, "GlobalMemory"); | 21 | LabelGLObject(GL_BUFFER, buffer.handle, cpu_addr, "GlobalMemory"); |
| 24 | } | 22 | } |
| 25 | 23 | ||
| 26 | void CachedGlobalRegion::Reload(u32 size_) { | 24 | CachedGlobalRegion::~CachedGlobalRegion() = default; |
| 27 | constexpr auto max_size = static_cast<u32>(RasterizerOpenGL::MaxGlobalMemorySize); | ||
| 28 | 25 | ||
| 26 | void CachedGlobalRegion::Reload(u32 size_) { | ||
| 29 | size = size_; | 27 | size = size_; |
| 30 | if (size > max_size) { | 28 | if (size > max_size) { |
| 31 | size = max_size; | 29 | size = max_size; |
| 32 | LOG_CRITICAL(HW_GPU, "Global region size {} exceeded the expected size {}!", size_, | 30 | LOG_CRITICAL(HW_GPU, "Global region size {} exceeded the supported size {}!", size_, |
| 33 | max_size); | 31 | max_size); |
| 34 | } | 32 | } |
| 33 | glNamedBufferData(buffer.handle, size, host_ptr, GL_STREAM_DRAW); | ||
| 34 | } | ||
| 35 | 35 | ||
| 36 | // TODO(Rodrigo): Get rid of Memory::GetPointer with a staging buffer | 36 | void CachedGlobalRegion::Flush() { |
| 37 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, buffer.handle); | 37 | LOG_DEBUG(Render_OpenGL, "Flushing {} bytes to CPU memory address 0x{:16}", size, cpu_addr); |
| 38 | glBufferData(GL_SHADER_STORAGE_BUFFER, size, GetHostPtr(), GL_DYNAMIC_DRAW); | 38 | glGetNamedBufferSubData(buffer.handle, 0, static_cast<GLsizeiptr>(size), host_ptr); |
| 39 | } | 39 | } |
| 40 | 40 | ||
| 41 | GlobalRegion GlobalRegionCacheOpenGL::TryGetReservedGlobalRegion(CacheAddr addr, u32 size) const { | 41 | GlobalRegion GlobalRegionCacheOpenGL::TryGetReservedGlobalRegion(CacheAddr addr, u32 size) const { |
| @@ -46,14 +46,16 @@ GlobalRegion GlobalRegionCacheOpenGL::TryGetReservedGlobalRegion(CacheAddr addr, | |||
| 46 | return search->second; | 46 | return search->second; |
| 47 | } | 47 | } |
| 48 | 48 | ||
| 49 | GlobalRegion GlobalRegionCacheOpenGL::GetUncachedGlobalRegion(GPUVAddr addr, u32 size, | 49 | GlobalRegion GlobalRegionCacheOpenGL::GetUncachedGlobalRegion(GPUVAddr addr, u8* host_ptr, |
| 50 | u8* host_ptr) { | 50 | u32 size) { |
| 51 | GlobalRegion region{TryGetReservedGlobalRegion(ToCacheAddr(host_ptr), size)}; | 51 | GlobalRegion region{TryGetReservedGlobalRegion(ToCacheAddr(host_ptr), size)}; |
| 52 | if (!region) { | 52 | if (!region) { |
| 53 | // No reserved surface available, create a new one and reserve it | 53 | // No reserved surface available, create a new one and reserve it |
| 54 | auto& memory_manager{Core::System::GetInstance().GPU().MemoryManager()}; | 54 | auto& memory_manager{Core::System::GetInstance().GPU().MemoryManager()}; |
| 55 | const auto cpu_addr = *memory_manager.GpuToCpuAddress(addr); | 55 | const auto cpu_addr{memory_manager.GpuToCpuAddress(addr)}; |
| 56 | region = std::make_shared<CachedGlobalRegion>(cpu_addr, size, host_ptr); | 56 | ASSERT(cpu_addr); |
| 57 | |||
| 58 | region = std::make_shared<CachedGlobalRegion>(*cpu_addr, host_ptr, size, max_ssbo_size); | ||
| 57 | ReserveGlobalRegion(region); | 59 | ReserveGlobalRegion(region); |
| 58 | } | 60 | } |
| 59 | region->Reload(size); | 61 | region->Reload(size); |
| @@ -65,7 +67,11 @@ void GlobalRegionCacheOpenGL::ReserveGlobalRegion(GlobalRegion region) { | |||
| 65 | } | 67 | } |
| 66 | 68 | ||
| 67 | GlobalRegionCacheOpenGL::GlobalRegionCacheOpenGL(RasterizerOpenGL& rasterizer) | 69 | GlobalRegionCacheOpenGL::GlobalRegionCacheOpenGL(RasterizerOpenGL& rasterizer) |
| 68 | : RasterizerCache{rasterizer} {} | 70 | : RasterizerCache{rasterizer} { |
| 71 | GLint max_ssbo_size_; | ||
| 72 | glGetIntegerv(GL_MAX_SHADER_STORAGE_BLOCK_SIZE, &max_ssbo_size_); | ||
| 73 | max_ssbo_size = static_cast<u32>(max_ssbo_size_); | ||
| 74 | } | ||
| 69 | 75 | ||
| 70 | GlobalRegion GlobalRegionCacheOpenGL::GetGlobalRegion( | 76 | GlobalRegion GlobalRegionCacheOpenGL::GetGlobalRegion( |
| 71 | const GLShader::GlobalMemoryEntry& global_region, | 77 | const GLShader::GlobalMemoryEntry& global_region, |
| @@ -73,7 +79,7 @@ GlobalRegion GlobalRegionCacheOpenGL::GetGlobalRegion( | |||
| 73 | 79 | ||
| 74 | auto& gpu{Core::System::GetInstance().GPU()}; | 80 | auto& gpu{Core::System::GetInstance().GPU()}; |
| 75 | auto& memory_manager{gpu.MemoryManager()}; | 81 | auto& memory_manager{gpu.MemoryManager()}; |
| 76 | const auto cbufs{gpu.Maxwell3D().state.shader_stages[static_cast<u64>(stage)]}; | 82 | const auto cbufs{gpu.Maxwell3D().state.shader_stages[static_cast<std::size_t>(stage)]}; |
| 77 | const auto addr{cbufs.const_buffers[global_region.GetCbufIndex()].address + | 83 | const auto addr{cbufs.const_buffers[global_region.GetCbufIndex()].address + |
| 78 | global_region.GetCbufOffset()}; | 84 | global_region.GetCbufOffset()}; |
| 79 | const auto actual_addr{memory_manager.Read<u64>(addr)}; | 85 | const auto actual_addr{memory_manager.Read<u64>(addr)}; |
| @@ -85,7 +91,7 @@ GlobalRegion GlobalRegionCacheOpenGL::GetGlobalRegion( | |||
| 85 | 91 | ||
| 86 | if (!region) { | 92 | if (!region) { |
| 87 | // No global region found - create a new one | 93 | // No global region found - create a new one |
| 88 | region = GetUncachedGlobalRegion(actual_addr, size, host_ptr); | 94 | region = GetUncachedGlobalRegion(actual_addr, host_ptr, size); |
| 89 | Register(region); | 95 | Register(region); |
| 90 | } | 96 | } |
| 91 | 97 | ||
diff --git a/src/video_core/renderer_opengl/gl_global_cache.h b/src/video_core/renderer_opengl/gl_global_cache.h index 5a21ab66f..196e6e278 100644 --- a/src/video_core/renderer_opengl/gl_global_cache.h +++ b/src/video_core/renderer_opengl/gl_global_cache.h | |||
| @@ -19,7 +19,7 @@ namespace OpenGL { | |||
| 19 | 19 | ||
| 20 | namespace GLShader { | 20 | namespace GLShader { |
| 21 | class GlobalMemoryEntry; | 21 | class GlobalMemoryEntry; |
| 22 | } // namespace GLShader | 22 | } |
| 23 | 23 | ||
| 24 | class RasterizerOpenGL; | 24 | class RasterizerOpenGL; |
| 25 | class CachedGlobalRegion; | 25 | class CachedGlobalRegion; |
| @@ -27,7 +27,8 @@ using GlobalRegion = std::shared_ptr<CachedGlobalRegion>; | |||
| 27 | 27 | ||
| 28 | class CachedGlobalRegion final : public RasterizerCacheObject { | 28 | class CachedGlobalRegion final : public RasterizerCacheObject { |
| 29 | public: | 29 | public: |
| 30 | explicit CachedGlobalRegion(VAddr cpu_addr, u32 size, u8* host_ptr); | 30 | explicit CachedGlobalRegion(VAddr cpu_addr, u8* host_ptr, u32 size, u32 max_size); |
| 31 | ~CachedGlobalRegion(); | ||
| 31 | 32 | ||
| 32 | VAddr GetCpuAddr() const override { | 33 | VAddr GetCpuAddr() const override { |
| 33 | return cpu_addr; | 34 | return cpu_addr; |
| @@ -45,14 +46,14 @@ public: | |||
| 45 | /// Reloads the global region from guest memory | 46 | /// Reloads the global region from guest memory |
| 46 | void Reload(u32 size_); | 47 | void Reload(u32 size_); |
| 47 | 48 | ||
| 48 | // TODO(Rodrigo): When global memory is written (STG), implement flushing | 49 | void Flush() override; |
| 49 | void Flush() override { | ||
| 50 | UNIMPLEMENTED(); | ||
| 51 | } | ||
| 52 | 50 | ||
| 53 | private: | 51 | private: |
| 54 | VAddr cpu_addr{}; | 52 | VAddr cpu_addr{}; |
| 53 | u8* host_ptr{}; | ||
| 55 | u32 size{}; | 54 | u32 size{}; |
| 55 | u32 max_size{}; | ||
| 56 | |||
| 56 | OGLBuffer buffer; | 57 | OGLBuffer buffer; |
| 57 | }; | 58 | }; |
| 58 | 59 | ||
| @@ -66,10 +67,11 @@ public: | |||
| 66 | 67 | ||
| 67 | private: | 68 | private: |
| 68 | GlobalRegion TryGetReservedGlobalRegion(CacheAddr addr, u32 size) const; | 69 | GlobalRegion TryGetReservedGlobalRegion(CacheAddr addr, u32 size) const; |
| 69 | GlobalRegion GetUncachedGlobalRegion(GPUVAddr addr, u32 size, u8* host_ptr); | 70 | GlobalRegion GetUncachedGlobalRegion(GPUVAddr addr, u8* host_ptr, u32 size); |
| 70 | void ReserveGlobalRegion(GlobalRegion region); | 71 | void ReserveGlobalRegion(GlobalRegion region); |
| 71 | 72 | ||
| 72 | std::unordered_map<CacheAddr, GlobalRegion> reserve; | 73 | std::unordered_map<CacheAddr, GlobalRegion> reserve; |
| 74 | u32 max_ssbo_size{}; | ||
| 73 | }; | 75 | }; |
| 74 | 76 | ||
| 75 | } // namespace OpenGL | 77 | } // namespace OpenGL |
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index d250d5cbb..ea42fd060 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp | |||
| @@ -756,6 +756,7 @@ void RasterizerOpenGL::FlushRegion(CacheAddr addr, u64 size) { | |||
| 756 | return; | 756 | return; |
| 757 | } | 757 | } |
| 758 | res_cache.FlushRegion(addr, size); | 758 | res_cache.FlushRegion(addr, size); |
| 759 | global_cache.FlushRegion(addr, size); | ||
| 759 | } | 760 | } |
| 760 | 761 | ||
| 761 | void RasterizerOpenGL::InvalidateRegion(CacheAddr addr, u64 size) { | 762 | void RasterizerOpenGL::InvalidateRegion(CacheAddr addr, u64 size) { |
| @@ -953,6 +954,9 @@ void RasterizerOpenGL::SetupGlobalRegions(Tegra::Engines::Maxwell3D::Regs::Shade | |||
| 953 | for (std::size_t bindpoint = 0; bindpoint < entries.size(); ++bindpoint) { | 954 | for (std::size_t bindpoint = 0; bindpoint < entries.size(); ++bindpoint) { |
| 954 | const auto& entry{entries[bindpoint]}; | 955 | const auto& entry{entries[bindpoint]}; |
| 955 | const auto& region{global_cache.GetGlobalRegion(entry, stage)}; | 956 | const auto& region{global_cache.GetGlobalRegion(entry, stage)}; |
| 957 | if (entry.IsWritten()) { | ||
| 958 | region->MarkAsModified(true, global_cache); | ||
| 959 | } | ||
| 956 | bind_ssbo_pushbuffer.Push(region->GetBufferHandle(), 0, | 960 | bind_ssbo_pushbuffer.Push(region->GetBufferHandle(), 0, |
| 957 | static_cast<GLsizeiptr>(region->GetSizeInBytes())); | 961 | static_cast<GLsizeiptr>(region->GetSizeInBytes())); |
| 958 | } | 962 | } |
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index e4c64ae71..d4c2cf80e 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h | |||
| @@ -71,10 +71,6 @@ public: | |||
| 71 | static_assert(MaxConstbufferSize % sizeof(GLvec4) == 0, | 71 | static_assert(MaxConstbufferSize % sizeof(GLvec4) == 0, |
| 72 | "The maximum size of a constbuffer must be a multiple of the size of GLvec4"); | 72 | "The maximum size of a constbuffer must be a multiple of the size of GLvec4"); |
| 73 | 73 | ||
| 74 | static constexpr std::size_t MaxGlobalMemorySize = 0x10000; | ||
| 75 | static_assert(MaxGlobalMemorySize % sizeof(float) == 0, | ||
| 76 | "The maximum size of a global memory must be a multiple of the size of float"); | ||
| 77 | |||
| 78 | private: | 74 | private: |
| 79 | class SamplerInfo { | 75 | class SamplerInfo { |
| 80 | public: | 76 | public: |
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp index 28e490b3c..445048daf 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp | |||
| @@ -45,8 +45,6 @@ using TextureIR = std::variant<TextureAoffi, TextureArgument>; | |||
| 45 | enum : u32 { POSITION_VARYING_LOCATION = 0, GENERIC_VARYING_START_LOCATION = 1 }; | 45 | enum : u32 { POSITION_VARYING_LOCATION = 0, GENERIC_VARYING_START_LOCATION = 1 }; |
| 46 | constexpr u32 MAX_CONSTBUFFER_ELEMENTS = | 46 | constexpr u32 MAX_CONSTBUFFER_ELEMENTS = |
| 47 | static_cast<u32>(RasterizerOpenGL::MaxConstbufferSize) / (4 * sizeof(float)); | 47 | static_cast<u32>(RasterizerOpenGL::MaxConstbufferSize) / (4 * sizeof(float)); |
| 48 | constexpr u32 MAX_GLOBALMEMORY_ELEMENTS = | ||
| 49 | static_cast<u32>(RasterizerOpenGL::MaxGlobalMemorySize) / sizeof(float); | ||
| 50 | 48 | ||
| 51 | class ShaderWriter { | 49 | class ShaderWriter { |
| 52 | public: | 50 | public: |
| @@ -208,8 +206,10 @@ public: | |||
| 208 | for (const auto& sampler : ir.GetSamplers()) { | 206 | for (const auto& sampler : ir.GetSamplers()) { |
| 209 | entries.samplers.emplace_back(sampler); | 207 | entries.samplers.emplace_back(sampler); |
| 210 | } | 208 | } |
| 211 | for (const auto& gmem : ir.GetGlobalMemoryBases()) { | 209 | for (const auto& gmem_pair : ir.GetGlobalMemory()) { |
| 212 | entries.global_memory_entries.emplace_back(gmem.cbuf_index, gmem.cbuf_offset); | 210 | const auto& [base, usage] = gmem_pair; |
| 211 | entries.global_memory_entries.emplace_back(base.cbuf_index, base.cbuf_offset, | ||
| 212 | usage.is_read, usage.is_written); | ||
| 213 | } | 213 | } |
| 214 | entries.clip_distances = ir.GetClipDistances(); | 214 | entries.clip_distances = ir.GetClipDistances(); |
| 215 | entries.shader_length = ir.GetLength(); | 215 | entries.shader_length = ir.GetLength(); |
| @@ -380,12 +380,22 @@ private: | |||
| 380 | } | 380 | } |
| 381 | 381 | ||
| 382 | void DeclareGlobalMemory() { | 382 | void DeclareGlobalMemory() { |
| 383 | for (const auto& entry : ir.GetGlobalMemoryBases()) { | 383 | for (const auto& gmem : ir.GetGlobalMemory()) { |
| 384 | const auto& [base, usage] = gmem; | ||
| 385 | |||
| 386 | // Since we don't know how the shader will use the shader, hint the driver to disable as | ||
| 387 | // much optimizations as possible | ||
| 388 | std::string qualifier = "coherent volatile"; | ||
| 389 | if (usage.is_read && !usage.is_written) | ||
| 390 | qualifier += " readonly"; | ||
| 391 | else if (usage.is_written && !usage.is_read) | ||
| 392 | qualifier += " writeonly"; | ||
| 393 | |||
| 384 | const std::string binding = | 394 | const std::string binding = |
| 385 | fmt::format("GMEM_BINDING_{}_{}", entry.cbuf_index, entry.cbuf_offset); | 395 | fmt::format("GMEM_BINDING_{}_{}", base.cbuf_index, base.cbuf_offset); |
| 386 | code.AddLine("layout (std430, binding = " + binding + ") buffer " + | 396 | code.AddLine("layout (std430, binding = " + binding + ") " + qualifier + " buffer " + |
| 387 | GetGlobalMemoryBlock(entry) + " {"); | 397 | GetGlobalMemoryBlock(base) + " {"); |
| 388 | code.AddLine(" float " + GetGlobalMemory(entry) + "[MAX_GLOBALMEMORY_ELEMENTS];"); | 398 | code.AddLine(" float " + GetGlobalMemory(base) + "[];"); |
| 389 | code.AddLine("};"); | 399 | code.AddLine("};"); |
| 390 | code.AddNewLine(); | 400 | code.AddNewLine(); |
| 391 | } | 401 | } |
| @@ -868,6 +878,12 @@ private: | |||
| 868 | } else if (const auto lmem = std::get_if<LmemNode>(dest)) { | 878 | } else if (const auto lmem = std::get_if<LmemNode>(dest)) { |
| 869 | target = GetLocalMemory() + "[ftou(" + Visit(lmem->GetAddress()) + ") / 4]"; | 879 | target = GetLocalMemory() + "[ftou(" + Visit(lmem->GetAddress()) + ") / 4]"; |
| 870 | 880 | ||
| 881 | } else if (const auto gmem = std::get_if<GmemNode>(dest)) { | ||
| 882 | const std::string real = Visit(gmem->GetRealAddress()); | ||
| 883 | const std::string base = Visit(gmem->GetBaseAddress()); | ||
| 884 | const std::string final_offset = "(ftou(" + real + ") - ftou(" + base + ")) / 4"; | ||
| 885 | target = fmt::format("{}[{}]", GetGlobalMemory(gmem->GetDescriptor()), final_offset); | ||
| 886 | |||
| 871 | } else { | 887 | } else { |
| 872 | UNREACHABLE_MSG("Assign called without a proper target"); | 888 | UNREACHABLE_MSG("Assign called without a proper target"); |
| 873 | } | 889 | } |
| @@ -1621,9 +1637,7 @@ private: | |||
| 1621 | 1637 | ||
| 1622 | std::string GetCommonDeclarations() { | 1638 | std::string GetCommonDeclarations() { |
| 1623 | const auto cbuf = std::to_string(MAX_CONSTBUFFER_ELEMENTS); | 1639 | const auto cbuf = std::to_string(MAX_CONSTBUFFER_ELEMENTS); |
| 1624 | const auto gmem = std::to_string(MAX_GLOBALMEMORY_ELEMENTS); | ||
| 1625 | return "#define MAX_CONSTBUFFER_ELEMENTS " + cbuf + "\n" + | 1640 | return "#define MAX_CONSTBUFFER_ELEMENTS " + cbuf + "\n" + |
| 1626 | "#define MAX_GLOBALMEMORY_ELEMENTS " + gmem + "\n" + | ||
| 1627 | "#define ftoi floatBitsToInt\n" | 1641 | "#define ftoi floatBitsToInt\n" |
| 1628 | "#define ftou floatBitsToUint\n" | 1642 | "#define ftou floatBitsToUint\n" |
| 1629 | "#define itof intBitsToFloat\n" | 1643 | "#define itof intBitsToFloat\n" |
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.h b/src/video_core/renderer_opengl/gl_shader_decompiler.h index 4e04ab2f8..55b3d4d7b 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.h +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.h | |||
| @@ -39,8 +39,9 @@ private: | |||
| 39 | 39 | ||
| 40 | class GlobalMemoryEntry { | 40 | class GlobalMemoryEntry { |
| 41 | public: | 41 | public: |
| 42 | explicit GlobalMemoryEntry(u32 cbuf_index, u32 cbuf_offset) | 42 | explicit GlobalMemoryEntry(u32 cbuf_index, u32 cbuf_offset, bool is_read, bool is_written) |
| 43 | : cbuf_index{cbuf_index}, cbuf_offset{cbuf_offset} {} | 43 | : cbuf_index{cbuf_index}, cbuf_offset{cbuf_offset}, is_read{is_read}, is_written{ |
| 44 | is_written} {} | ||
| 44 | 45 | ||
| 45 | u32 GetCbufIndex() const { | 46 | u32 GetCbufIndex() const { |
| 46 | return cbuf_index; | 47 | return cbuf_index; |
| @@ -50,9 +51,19 @@ public: | |||
| 50 | return cbuf_offset; | 51 | return cbuf_offset; |
| 51 | } | 52 | } |
| 52 | 53 | ||
| 54 | bool IsRead() const { | ||
| 55 | return is_read; | ||
| 56 | } | ||
| 57 | |||
| 58 | bool IsWritten() const { | ||
| 59 | return is_written; | ||
| 60 | } | ||
| 61 | |||
| 53 | private: | 62 | private: |
| 54 | u32 cbuf_index{}; | 63 | u32 cbuf_index{}; |
| 55 | u32 cbuf_offset{}; | 64 | u32 cbuf_offset{}; |
| 65 | bool is_read{}; | ||
| 66 | bool is_written{}; | ||
| 56 | }; | 67 | }; |
| 57 | 68 | ||
| 58 | struct ShaderEntries { | 69 | struct ShaderEntries { |
diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp index 8a43eb157..d5890a375 100644 --- a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp | |||
| @@ -337,11 +337,16 @@ std::optional<ShaderDiskCacheDecompiled> ShaderDiskCacheOpenGL::LoadDecompiledEn | |||
| 337 | for (u32 i = 0; i < global_memory_count; ++i) { | 337 | for (u32 i = 0; i < global_memory_count; ++i) { |
| 338 | u32 cbuf_index{}; | 338 | u32 cbuf_index{}; |
| 339 | u32 cbuf_offset{}; | 339 | u32 cbuf_offset{}; |
| 340 | u8 is_read{}; | ||
| 341 | u8 is_written{}; | ||
| 340 | if (file.ReadBytes(&cbuf_index, sizeof(u32)) != sizeof(u32) || | 342 | if (file.ReadBytes(&cbuf_index, sizeof(u32)) != sizeof(u32) || |
| 341 | file.ReadBytes(&cbuf_offset, sizeof(u32)) != sizeof(u32)) { | 343 | file.ReadBytes(&cbuf_offset, sizeof(u32)) != sizeof(u32) || |
| 344 | file.ReadBytes(&is_read, sizeof(u8)) != sizeof(u8) || | ||
| 345 | file.ReadBytes(&is_written, sizeof(u8)) != sizeof(u8)) { | ||
| 342 | return {}; | 346 | return {}; |
| 343 | } | 347 | } |
| 344 | entry.entries.global_memory_entries.emplace_back(cbuf_index, cbuf_offset); | 348 | entry.entries.global_memory_entries.emplace_back(cbuf_index, cbuf_offset, is_read != 0, |
| 349 | is_written != 0); | ||
| 345 | } | 350 | } |
| 346 | 351 | ||
| 347 | for (auto& clip_distance : entry.entries.clip_distances) { | 352 | for (auto& clip_distance : entry.entries.clip_distances) { |
| @@ -397,7 +402,9 @@ bool ShaderDiskCacheOpenGL::SaveDecompiledFile(FileUtil::IOFile& file, u64 uniqu | |||
| 397 | return false; | 402 | return false; |
| 398 | for (const auto& gmem : entries.global_memory_entries) { | 403 | for (const auto& gmem : entries.global_memory_entries) { |
| 399 | if (file.WriteObject(static_cast<u32>(gmem.GetCbufIndex())) != 1 || | 404 | if (file.WriteObject(static_cast<u32>(gmem.GetCbufIndex())) != 1 || |
| 400 | file.WriteObject(static_cast<u32>(gmem.GetCbufOffset())) != 1) { | 405 | file.WriteObject(static_cast<u32>(gmem.GetCbufOffset())) != 1 || |
| 406 | file.WriteObject(static_cast<u8>(gmem.IsRead() ? 1 : 0)) != 1 || | ||
| 407 | file.WriteObject(static_cast<u8>(gmem.IsWritten() ? 1 : 0)) != 1) { | ||
| 401 | return false; | 408 | return false; |
| 402 | } | 409 | } |
| 403 | } | 410 | } |
diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp index e0a6f5e87..25500f9a3 100644 --- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp +++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp | |||
| @@ -191,8 +191,9 @@ public: | |||
| 191 | for (const auto& cbuf : ir.GetConstantBuffers()) { | 191 | for (const auto& cbuf : ir.GetConstantBuffers()) { |
| 192 | entries.const_buffers.emplace_back(cbuf.second, cbuf.first); | 192 | entries.const_buffers.emplace_back(cbuf.second, cbuf.first); |
| 193 | } | 193 | } |
| 194 | for (const auto& gmem : ir.GetGlobalMemoryBases()) { | 194 | for (const auto& gmem_pair : ir.GetGlobalMemory()) { |
| 195 | entries.global_buffers.emplace_back(gmem.cbuf_index, gmem.cbuf_offset); | 195 | const auto& [base, usage] = gmem_pair; |
| 196 | entries.global_buffers.emplace_back(base.cbuf_index, base.cbuf_offset); | ||
| 196 | } | 197 | } |
| 197 | for (const auto& sampler : ir.GetSamplers()) { | 198 | for (const auto& sampler : ir.GetSamplers()) { |
| 198 | entries.samplers.emplace_back(sampler); | 199 | entries.samplers.emplace_back(sampler); |
| @@ -225,7 +226,7 @@ private: | |||
| 225 | return current_binding; | 226 | return current_binding; |
| 226 | }; | 227 | }; |
| 227 | const_buffers_base_binding = Allocate(ir.GetConstantBuffers().size()); | 228 | const_buffers_base_binding = Allocate(ir.GetConstantBuffers().size()); |
| 228 | global_buffers_base_binding = Allocate(ir.GetGlobalMemoryBases().size()); | 229 | global_buffers_base_binding = Allocate(ir.GetGlobalMemory().size()); |
| 229 | samplers_base_binding = Allocate(ir.GetSamplers().size()); | 230 | samplers_base_binding = Allocate(ir.GetSamplers().size()); |
| 230 | 231 | ||
| 231 | ASSERT_MSG(binding_iterator - binding_base < STAGE_BINDING_STRIDE, | 232 | ASSERT_MSG(binding_iterator - binding_base < STAGE_BINDING_STRIDE, |
| @@ -390,14 +391,15 @@ private: | |||
| 390 | 391 | ||
| 391 | void DeclareGlobalBuffers() { | 392 | void DeclareGlobalBuffers() { |
| 392 | u32 binding = global_buffers_base_binding; | 393 | u32 binding = global_buffers_base_binding; |
| 393 | for (const auto& entry : ir.GetGlobalMemoryBases()) { | 394 | for (const auto& entry : ir.GetGlobalMemory()) { |
| 395 | const auto [base, usage] = entry; | ||
| 394 | const Id id = OpVariable(t_gmem_ssbo, spv::StorageClass::StorageBuffer); | 396 | const Id id = OpVariable(t_gmem_ssbo, spv::StorageClass::StorageBuffer); |
| 395 | AddGlobalVariable( | 397 | AddGlobalVariable( |
| 396 | Name(id, fmt::format("gmem_{}_{}", entry.cbuf_index, entry.cbuf_offset))); | 398 | Name(id, fmt::format("gmem_{}_{}", base.cbuf_index, base.cbuf_offset))); |
| 397 | 399 | ||
| 398 | Decorate(id, spv::Decoration::Binding, binding++); | 400 | Decorate(id, spv::Decoration::Binding, binding++); |
| 399 | Decorate(id, spv::Decoration::DescriptorSet, DESCRIPTOR_SET); | 401 | Decorate(id, spv::Decoration::DescriptorSet, DESCRIPTOR_SET); |
| 400 | global_buffers.emplace(entry, id); | 402 | global_buffers.emplace(base, id); |
| 401 | } | 403 | } |
| 402 | } | 404 | } |
| 403 | 405 | ||
diff --git a/src/video_core/shader/decode/memory.cpp b/src/video_core/shader/decode/memory.cpp index ea3c71eed..ff19ada55 100644 --- a/src/video_core/shader/decode/memory.cpp +++ b/src/video_core/shader/decode/memory.cpp | |||
| @@ -18,6 +18,23 @@ using Tegra::Shader::Instruction; | |||
| 18 | using Tegra::Shader::OpCode; | 18 | using Tegra::Shader::OpCode; |
| 19 | using Tegra::Shader::Register; | 19 | using Tegra::Shader::Register; |
| 20 | 20 | ||
| 21 | namespace { | ||
| 22 | u32 GetUniformTypeElementsCount(Tegra::Shader::UniformType uniform_type) { | ||
| 23 | switch (uniform_type) { | ||
| 24 | case Tegra::Shader::UniformType::Single: | ||
| 25 | return 1; | ||
| 26 | case Tegra::Shader::UniformType::Double: | ||
| 27 | return 2; | ||
| 28 | case Tegra::Shader::UniformType::Quad: | ||
| 29 | case Tegra::Shader::UniformType::UnsignedQuad: | ||
| 30 | return 4; | ||
| 31 | default: | ||
| 32 | UNIMPLEMENTED_MSG("Unimplemented size={}!", static_cast<u32>(uniform_type)); | ||
| 33 | return 1; | ||
| 34 | } | ||
| 35 | } | ||
| 36 | } // namespace | ||
| 37 | |||
| 21 | u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { | 38 | u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { |
| 22 | const Instruction instr = {program_code[pc]}; | 39 | const Instruction instr = {program_code[pc]}; |
| 23 | const auto opcode = OpCode::Decode(instr); | 40 | const auto opcode = OpCode::Decode(instr); |
| @@ -126,45 +143,15 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { | |||
| 126 | break; | 143 | break; |
| 127 | } | 144 | } |
| 128 | case OpCode::Id::LDG: { | 145 | case OpCode::Id::LDG: { |
| 129 | const u32 count = [&]() { | 146 | const auto [real_address_base, base_address, descriptor] = |
| 130 | switch (instr.ldg.type) { | 147 | TrackAndGetGlobalMemory(bb, GetRegister(instr.gpr8), |
| 131 | case Tegra::Shader::UniformType::Single: | 148 | static_cast<u32>(instr.ldg.immediate_offset.Value()), false); |
| 132 | return 1; | ||
| 133 | case Tegra::Shader::UniformType::Double: | ||
| 134 | return 2; | ||
| 135 | case Tegra::Shader::UniformType::Quad: | ||
| 136 | case Tegra::Shader::UniformType::UnsignedQuad: | ||
| 137 | return 4; | ||
| 138 | default: | ||
| 139 | UNIMPLEMENTED_MSG("Unimplemented LDG size!"); | ||
| 140 | return 1; | ||
| 141 | } | ||
| 142 | }(); | ||
| 143 | |||
| 144 | const Node addr_register = GetRegister(instr.gpr8); | ||
| 145 | const Node base_address = | ||
| 146 | TrackCbuf(addr_register, global_code, static_cast<s64>(global_code.size())); | ||
| 147 | const auto cbuf = std::get_if<CbufNode>(base_address); | ||
| 148 | ASSERT(cbuf != nullptr); | ||
| 149 | const auto cbuf_offset_imm = std::get_if<ImmediateNode>(cbuf->GetOffset()); | ||
| 150 | ASSERT(cbuf_offset_imm != nullptr); | ||
| 151 | const auto cbuf_offset = cbuf_offset_imm->GetValue(); | ||
| 152 | |||
| 153 | bb.push_back(Comment( | ||
| 154 | fmt::format("Base address is c[0x{:x}][0x{:x}]", cbuf->GetIndex(), cbuf_offset))); | ||
| 155 | |||
| 156 | const GlobalMemoryBase descriptor{cbuf->GetIndex(), cbuf_offset}; | ||
| 157 | used_global_memory_bases.insert(descriptor); | ||
| 158 | |||
| 159 | const Node immediate_offset = | ||
| 160 | Immediate(static_cast<u32>(instr.ldg.immediate_offset.Value())); | ||
| 161 | const Node base_real_address = | ||
| 162 | Operation(OperationCode::UAdd, NO_PRECISE, immediate_offset, addr_register); | ||
| 163 | 149 | ||
| 150 | const u32 count = GetUniformTypeElementsCount(instr.ldg.type); | ||
| 164 | for (u32 i = 0; i < count; ++i) { | 151 | for (u32 i = 0; i < count; ++i) { |
| 165 | const Node it_offset = Immediate(i * 4); | 152 | const Node it_offset = Immediate(i * 4); |
| 166 | const Node real_address = | 153 | const Node real_address = |
| 167 | Operation(OperationCode::UAdd, NO_PRECISE, base_real_address, it_offset); | 154 | Operation(OperationCode::UAdd, NO_PRECISE, real_address_base, it_offset); |
| 168 | const Node gmem = StoreNode(GmemNode(real_address, base_address, descriptor)); | 155 | const Node gmem = StoreNode(GmemNode(real_address, base_address, descriptor)); |
| 169 | 156 | ||
| 170 | SetTemporal(bb, i, gmem); | 157 | SetTemporal(bb, i, gmem); |
| @@ -174,6 +161,28 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { | |||
| 174 | } | 161 | } |
| 175 | break; | 162 | break; |
| 176 | } | 163 | } |
| 164 | case OpCode::Id::STG: { | ||
| 165 | const auto [real_address_base, base_address, descriptor] = | ||
| 166 | TrackAndGetGlobalMemory(bb, GetRegister(instr.gpr8), | ||
| 167 | static_cast<u32>(instr.stg.immediate_offset.Value()), true); | ||
| 168 | |||
| 169 | // Encode in temporary registers like this: real_base_address, {registers_to_be_written...} | ||
| 170 | SetTemporal(bb, 0, real_address_base); | ||
| 171 | |||
| 172 | const u32 count = GetUniformTypeElementsCount(instr.stg.type); | ||
| 173 | for (u32 i = 0; i < count; ++i) { | ||
| 174 | SetTemporal(bb, i + 1, GetRegister(instr.gpr0.Value() + i)); | ||
| 175 | } | ||
| 176 | for (u32 i = 0; i < count; ++i) { | ||
| 177 | const Node it_offset = Immediate(i * 4); | ||
| 178 | const Node real_address = | ||
| 179 | Operation(OperationCode::UAdd, NO_PRECISE, real_address_base, it_offset); | ||
| 180 | const Node gmem = StoreNode(GmemNode(real_address, base_address, descriptor)); | ||
| 181 | |||
| 182 | bb.push_back(Operation(OperationCode::Assign, gmem, GetTemporal(i + 1))); | ||
| 183 | } | ||
| 184 | break; | ||
| 185 | } | ||
| 177 | case OpCode::Id::ST_A: { | 186 | case OpCode::Id::ST_A: { |
| 178 | UNIMPLEMENTED_IF_MSG(instr.gpr8.Value() != Register::ZeroIndex, | 187 | UNIMPLEMENTED_IF_MSG(instr.gpr8.Value() != Register::ZeroIndex, |
| 179 | "Indirect attribute loads are not supported"); | 188 | "Indirect attribute loads are not supported"); |
| @@ -236,4 +245,34 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { | |||
| 236 | return pc; | 245 | return pc; |
| 237 | } | 246 | } |
| 238 | 247 | ||
| 248 | std::tuple<Node, Node, GlobalMemoryBase> ShaderIR::TrackAndGetGlobalMemory(NodeBlock& bb, | ||
| 249 | Node addr_register, | ||
| 250 | u32 immediate_offset, | ||
| 251 | bool is_write) { | ||
| 252 | const Node base_address{ | ||
| 253 | TrackCbuf(addr_register, global_code, static_cast<s64>(global_code.size()))}; | ||
| 254 | const auto cbuf = std::get_if<CbufNode>(base_address); | ||
| 255 | ASSERT(cbuf != nullptr); | ||
| 256 | const auto cbuf_offset_imm = std::get_if<ImmediateNode>(cbuf->GetOffset()); | ||
| 257 | ASSERT(cbuf_offset_imm != nullptr); | ||
| 258 | const auto cbuf_offset = cbuf_offset_imm->GetValue(); | ||
| 259 | |||
| 260 | bb.push_back( | ||
| 261 | Comment(fmt::format("Base address is c[0x{:x}][0x{:x}]", cbuf->GetIndex(), cbuf_offset))); | ||
| 262 | |||
| 263 | const GlobalMemoryBase descriptor{cbuf->GetIndex(), cbuf_offset}; | ||
| 264 | const auto& [entry, is_new] = used_global_memory.try_emplace(descriptor); | ||
| 265 | auto& usage = entry->second; | ||
| 266 | if (is_write) { | ||
| 267 | usage.is_written = true; | ||
| 268 | } else { | ||
| 269 | usage.is_read = true; | ||
| 270 | } | ||
| 271 | |||
| 272 | const auto real_address = | ||
| 273 | Operation(OperationCode::UAdd, NO_PRECISE, Immediate(immediate_offset), addr_register); | ||
| 274 | |||
| 275 | return {real_address, base_address, descriptor}; | ||
| 276 | } | ||
| 277 | |||
| 239 | } // namespace VideoCommon::Shader | 278 | } // namespace VideoCommon::Shader |
diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h index 4888998d3..1afab08c0 100644 --- a/src/video_core/shader/shader_ir.h +++ b/src/video_core/shader/shader_ir.h | |||
| @@ -276,6 +276,11 @@ struct GlobalMemoryBase { | |||
| 276 | } | 276 | } |
| 277 | }; | 277 | }; |
| 278 | 278 | ||
| 279 | struct GlobalMemoryUsage { | ||
| 280 | bool is_read{}; | ||
| 281 | bool is_written{}; | ||
| 282 | }; | ||
| 283 | |||
| 279 | struct MetaArithmetic { | 284 | struct MetaArithmetic { |
| 280 | bool precise{}; | 285 | bool precise{}; |
| 281 | }; | 286 | }; |
| @@ -578,8 +583,8 @@ public: | |||
| 578 | return used_clip_distances; | 583 | return used_clip_distances; |
| 579 | } | 584 | } |
| 580 | 585 | ||
| 581 | const std::set<GlobalMemoryBase>& GetGlobalMemoryBases() const { | 586 | const std::map<GlobalMemoryBase, GlobalMemoryUsage>& GetGlobalMemory() const { |
| 582 | return used_global_memory_bases; | 587 | return used_global_memory; |
| 583 | } | 588 | } |
| 584 | 589 | ||
| 585 | std::size_t GetLength() const { | 590 | std::size_t GetLength() const { |
| @@ -781,6 +786,11 @@ private: | |||
| 781 | 786 | ||
| 782 | std::pair<Node, s64> TrackRegister(const GprNode* tracked, const NodeBlock& code, s64 cursor); | 787 | std::pair<Node, s64> TrackRegister(const GprNode* tracked, const NodeBlock& code, s64 cursor); |
| 783 | 788 | ||
| 789 | std::tuple<Node, Node, GlobalMemoryBase> TrackAndGetGlobalMemory(NodeBlock& bb, | ||
| 790 | Node addr_register, | ||
| 791 | u32 immediate_offset, | ||
| 792 | bool is_write); | ||
| 793 | |||
| 784 | template <typename... T> | 794 | template <typename... T> |
| 785 | Node Operation(OperationCode code, const T*... operands) { | 795 | Node Operation(OperationCode code, const T*... operands) { |
| 786 | return StoreNode(OperationNode(code, operands...)); | 796 | return StoreNode(OperationNode(code, operands...)); |
| @@ -834,7 +844,7 @@ private: | |||
| 834 | std::map<u32, ConstBuffer> used_cbufs; | 844 | std::map<u32, ConstBuffer> used_cbufs; |
| 835 | std::set<Sampler> used_samplers; | 845 | std::set<Sampler> used_samplers; |
| 836 | std::array<bool, Tegra::Engines::Maxwell3D::Regs::NumClipDistances> used_clip_distances{}; | 846 | std::array<bool, Tegra::Engines::Maxwell3D::Regs::NumClipDistances> used_clip_distances{}; |
| 837 | std::set<GlobalMemoryBase> used_global_memory_bases; | 847 | std::map<GlobalMemoryBase, GlobalMemoryUsage> used_global_memory; |
| 838 | 848 | ||
| 839 | Tegra::Shader::Header header; | 849 | Tegra::Shader::Header header; |
| 840 | }; | 850 | }; |