author     ReinUsesLisp  2020-06-25 17:12:33 -0300
committer  ReinUsesLisp  2020-07-18 01:59:57 -0300
commit     a8a2526128970dbe47bc25c28b8d2bfb52ac4a26
tree       e092baf3d1e1ea99baf805e2d9e1285810ded2d7  /src
parent     Merge pull request #4273 from ogniK5377/async-shaders-prod
gl_arb_decompiler: Use NV_shader_buffer_{load,store} on assembly shaders
NV_shader_buffer_{load,store} is a 2010 extension that allows GL applications to use what Vulkan calls physical pointers, which are essentially raw C pointers. In GLASM they are exposed through the LOAD, STORE and ATOM instructions.

Up until now, assembly shaders were using NV_shader_storage_buffer_object. That extension works fine, but it has a (probably unintended) limitation: SSBO bindings are shared by all shader stages, which forced us to restrict global memory usage to a single stage at a time.

In contrast, with NV_shader_buffer_{load,store} we can pass GPU addresses to the shader through local parameters (GLASM's equivalent of uniform constants, comparable to push constants on Vulkan). Local parameters have the advantage of being per stage, allowing us to generate code without worrying about binding overlaps.
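For readers unfamiliar with the mechanism, the following is a minimal sketch of the host-side pattern this change relies on, assuming a context that exposes NV_shader_buffer_load and NV_gpu_program5. The helper name and its parameters are illustrative only and are not part of this patch; error checking is omitted.

// Sketch only: make a buffer resident, query its GPU address, and hand that
// address to a GLASM fragment program as a program local parameter.
#include <array>
#include <glad/glad.h> // any loader that exposes the NV extension entry points

void BindGlobalMemoryAddress(GLuint buffer, GLuint64EXT offset) {
    // NV_shader_buffer_load: the buffer must be resident before its address is valid.
    glMakeNamedBufferResidentNV(buffer, GL_READ_WRITE);

    GLuint64EXT gpu_address = 0;
    glGetNamedBufferParameterui64vNV(buffer, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address);
    gpu_address += offset;

    // Local parameters are uvec4 sized, so one vector packs two 64-bit addresses;
    // this is why the decompiler halves the binding index when unpacking with PK64.U.
    const std::array<GLuint64EXT, 2> packed{gpu_address, 0};
    glProgramLocalParametersI4uivNV(GL_FRAGMENT_PROGRAM_NV, 0, 1,
                                    reinterpret_cast<const GLuint*>(packed.data()));
}

On the shader side, the decompiled GLASM unpacks the parameter with PK64.U and dereferences it with LOAD.U32 / STORE.U32 / ATOM, as seen in GlobalMemoryPointer() in the diff below.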
Diffstat (limited to 'src')
-rw-r--r--  src/video_core/renderer_opengl/gl_arb_decompiler.cpp  |  84
-rw-r--r--  src/video_core/renderer_opengl/gl_buffer_cache.cpp    |   2
-rw-r--r--  src/video_core/renderer_opengl/gl_rasterizer.cpp      | 103
-rw-r--r--  src/video_core/renderer_opengl/gl_rasterizer.h        |   4
-rw-r--r--  src/video_core/renderer_opengl/gl_shader_manager.cpp  |  71
-rw-r--r--  src/video_core/renderer_opengl/gl_shader_manager.h    |  17
-rw-r--r--  src/video_core/renderer_opengl/gl_stream_buffer.cpp   |   2
7 files changed, 173 insertions(+), 110 deletions(-)
diff --git a/src/video_core/renderer_opengl/gl_arb_decompiler.cpp b/src/video_core/renderer_opengl/gl_arb_decompiler.cpp
index eb5158407..4489abf61 100644
--- a/src/video_core/renderer_opengl/gl_arb_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_arb_decompiler.cpp
@@ -185,10 +185,6 @@ std::string TextureType(const MetaTexture& meta) {
     return type;
 }
 
-std::string GlobalMemoryName(const GlobalMemoryBase& base) {
-    return fmt::format("gmem{}_{}", base.cbuf_index, base.cbuf_offset);
-}
-
 class ARBDecompiler final {
 public:
     explicit ARBDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry,
@@ -199,6 +195,8 @@ public:
     }
 
 private:
+    void DefineGlobalMemory();
+
     void DeclareHeader();
     void DeclareVertex();
     void DeclareGeometry();
@@ -228,6 +226,7 @@ private:
 
     std::pair<std::string, std::size_t> BuildCoords(Operation);
     std::string BuildAoffi(Operation);
+    std::string GlobalMemoryPointer(const GmemNode& gmem);
     void Exit();
 
     std::string Assign(Operation);
@@ -378,10 +377,8 @@ private:
         std::string address;
         std::string_view opname;
         if (const auto gmem = std::get_if<GmemNode>(&*operation[0])) {
-            AddLine("SUB.U {}, {}, {};", temporary, Visit(gmem->GetRealAddress()),
-                    Visit(gmem->GetBaseAddress()));
-            address = fmt::format("{}[{}]", GlobalMemoryName(gmem->GetDescriptor()), temporary);
-            opname = "ATOMB";
+            address = GlobalMemoryPointer(*gmem);
+            opname = "ATOM";
         } else if (const auto smem = std::get_if<SmemNode>(&*operation[0])) {
             address = fmt::format("shared_mem[{}]", Visit(smem->GetAddress()));
             opname = "ATOMS";
@@ -456,9 +453,13 @@ private:
         shader_source += '\n';
     }
 
-    std::string AllocTemporary() {
-        max_temporaries = std::max(max_temporaries, num_temporaries + 1);
-        return fmt::format("T{}.x", num_temporaries++);
+    std::string AllocLongVectorTemporary() {
+        max_long_temporaries = std::max(max_long_temporaries, num_long_temporaries + 1);
+        return fmt::format("L{}", num_long_temporaries++);
+    }
+
+    std::string AllocLongTemporary() {
+        return fmt::format("{}.x", AllocLongVectorTemporary());
     }
 
     std::string AllocVectorTemporary() {
@@ -466,8 +467,13 @@ private:
         return fmt::format("T{}", num_temporaries++);
     }
 
+    std::string AllocTemporary() {
+        return fmt::format("{}.x", AllocVectorTemporary());
+    }
+
     void ResetTemporaries() noexcept {
         num_temporaries = 0;
+        num_long_temporaries = 0;
     }
 
     const Device& device;
@@ -478,6 +484,11 @@ private:
     std::size_t num_temporaries = 0;
     std::size_t max_temporaries = 0;
 
+    std::size_t num_long_temporaries = 0;
+    std::size_t max_long_temporaries = 0;
+
+    std::map<GlobalMemoryBase, u32> global_memory_names;
+
     std::string shader_source;
 
     static constexpr std::string_view ADD_F32 = "ADD.F32";
@@ -784,6 +795,8 @@ private:
 ARBDecompiler::ARBDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry,
                              ShaderType stage, std::string_view identifier)
     : device{device}, ir{ir}, registry{registry}, stage{stage} {
+    DefineGlobalMemory();
+
     AddLine("TEMP RC;");
     AddLine("TEMP FSWZA[4];");
     AddLine("TEMP FSWZB[4];");
@@ -829,12 +842,20 @@ std::string_view HeaderStageName(ShaderType stage) {
     }
 }
 
+void ARBDecompiler::DefineGlobalMemory() {
+    u32 binding = 0;
+    for (const auto& pair : ir.GetGlobalMemory()) {
+        const GlobalMemoryBase base = pair.first;
+        global_memory_names.emplace(base, binding);
+        ++binding;
+    }
+}
+
 void ARBDecompiler::DeclareHeader() {
     AddLine("!!NV{}5.0", HeaderStageName(stage));
     // Enabling this allows us to cheat on some instructions like TXL with SHADOWARRAY2D
     AddLine("OPTION NV_internal;");
     AddLine("OPTION NV_gpu_program_fp64;");
-    AddLine("OPTION NV_shader_storage_buffer;");
     AddLine("OPTION NV_shader_thread_group;");
     if (ir.UsesWarps() && device.HasWarpIntrinsics()) {
         AddLine("OPTION NV_shader_thread_shuffle;");
@@ -951,11 +972,10 @@ void ARBDecompiler::DeclareLocalMemory() {
 }
 
 void ARBDecompiler::DeclareGlobalMemory() {
-    u32 binding = 0; // device.GetBaseBindings(stage).shader_storage_buffer;
-    for (const auto& pair : ir.GetGlobalMemory()) {
-        const auto& base = pair.first;
-        AddLine("STORAGE {}[] = {{ program.storage[{}] }};", GlobalMemoryName(base), binding);
-        ++binding;
+    const std::size_t num_entries = ir.GetGlobalMemory().size();
+    if (num_entries > 0) {
+        const std::size_t num_vectors = Common::AlignUp(num_entries, 2) / 2;
+        AddLine("PARAM c[{}] = {{ program.local[0..{}] }};", num_vectors, num_vectors - 1);
     }
 }
 
@@ -977,6 +997,9 @@ void ARBDecompiler::DeclareTemporaries() {
     for (std::size_t i = 0; i < max_temporaries; ++i) {
         AddLine("TEMP T{};", i);
     }
+    for (std::size_t i = 0; i < max_long_temporaries; ++i) {
+        AddLine("LONG TEMP L{};", i);
+    }
 }
 
 void ARBDecompiler::DeclarePredicates() {
@@ -1339,10 +1362,7 @@ std::string ARBDecompiler::Visit(const Node& node) {
 
     if (const auto gmem = std::get_if<GmemNode>(&*node)) {
         std::string temporary = AllocTemporary();
-        AddLine("SUB.U {}, {}, {};", temporary, Visit(gmem->GetRealAddress()),
-                Visit(gmem->GetBaseAddress()));
-        AddLine("LDB.U32 {}, {}[{}];", temporary, GlobalMemoryName(gmem->GetDescriptor()),
-                temporary);
+        AddLine("LOAD.U32 {}, {};", temporary, GlobalMemoryPointer(*gmem));
         return temporary;
     }
 
@@ -1419,6 +1439,22 @@ std::string ARBDecompiler::BuildAoffi(Operation operation) {
     return fmt::format(", offset({})", temporary);
 }
 
+std::string ARBDecompiler::GlobalMemoryPointer(const GmemNode& gmem) {
+    const u32 binding = global_memory_names.at(gmem.GetDescriptor());
+    const char result_swizzle = binding % 2 == 0 ? 'x' : 'y';
+
+    const std::string pointer = AllocLongVectorTemporary();
+    std::string temporary = AllocTemporary();
+
+    const u32 local_index = binding / 2;
+    AddLine("PK64.U {}, c[{}];", pointer, local_index);
+    AddLine("SUB.U {}, {}, {};", temporary, Visit(gmem.GetRealAddress()),
+            Visit(gmem.GetBaseAddress()));
+    AddLine("CVT.U64.U32 {}.z, {};", pointer, temporary);
+    AddLine("ADD.U64 {}.x, {}.{}, {}.z;", pointer, pointer, result_swizzle, pointer);
+    return fmt::format("{}.x", pointer);
+}
+
 void ARBDecompiler::Exit() {
     if (stage != ShaderType::Fragment) {
         AddLine("RET;");
@@ -1515,11 +1551,7 @@ std::string ARBDecompiler::Assign(Operation operation) {
         ResetTemporaries();
         return {};
     } else if (const auto gmem = std::get_if<GmemNode>(&*dest)) {
-        const std::string temporary = AllocTemporary();
-        AddLine("SUB.U {}, {}, {};", temporary, Visit(gmem->GetRealAddress()),
-                Visit(gmem->GetBaseAddress()));
-        AddLine("STB.U32 {}, {}[{}];", Visit(src), GlobalMemoryName(gmem->GetDescriptor()),
-                temporary);
+        AddLine("STORE.U32 {}, {};", Visit(src), GlobalMemoryPointer(*gmem));
         ResetTemporaries();
         return {};
     } else {
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
index e461e4c70..e866d8f2f 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
@@ -26,7 +26,7 @@ Buffer::Buffer(const Device& device, VAddr cpu_addr, std::size_t size)
     : VideoCommon::BufferBlock{cpu_addr, size} {
     gl_buffer.Create();
     glNamedBufferData(gl_buffer.handle, static_cast<GLsizeiptr>(size), nullptr, GL_DYNAMIC_DRAW);
-    if (device.HasVertexBufferUnifiedMemory()) {
+    if (device.UseAssemblyShaders() || device.HasVertexBufferUnifiedMemory()) {
         glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_WRITE);
         glGetNamedBufferParameterui64vNV(gl_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address);
     }
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index c3fad563c..03e82c599 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -139,6 +139,18 @@ void oglEnable(GLenum cap, bool state) {
     (state ? glEnable : glDisable)(cap);
 }
 
+void UpdateBindlessPointers(GLenum target, GLuint64EXT* pointers, std::size_t num_entries) {
+    if (num_entries == 0) {
+        return;
+    }
+    if (num_entries % 2 == 1) {
+        pointers[num_entries] = 0;
+    }
+    const GLsizei num_vectors = static_cast<GLsizei>((num_entries + 1) / 2);
+    glProgramLocalParametersI4uivNV(target, 0, num_vectors,
+                                    reinterpret_cast<const GLuint*>(pointers));
+}
+
 } // Anonymous namespace
 
 RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window,
@@ -324,7 +336,6 @@ GLintptr RasterizerOpenGL::SetupIndexBuffer() {
 void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
     MICROPROFILE_SCOPE(OpenGL_Shader);
     auto& gpu = system.GPU().Maxwell3D();
-    std::size_t num_ssbos = 0;
     u32 clip_distances = 0;
 
     for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) {
@@ -347,29 +358,13 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
         }
 
         // Currently this stages are not supported in the OpenGL backend.
-        // Todo(Blinkhawk): Port tesselation shaders from Vulkan to OpenGL
-        if (program == Maxwell::ShaderProgram::TesselationControl) {
+        // TODO(Blinkhawk): Port tesselation shaders from Vulkan to OpenGL
+        if (program == Maxwell::ShaderProgram::TesselationControl ||
+            program == Maxwell::ShaderProgram::TesselationEval) {
             continue;
-        } else if (program == Maxwell::ShaderProgram::TesselationEval) {
-            continue;
-        }
-
-        Shader* shader = shader_cache.GetStageProgram(program, async_shaders);
-
-        if (device.UseAssemblyShaders()) {
-            // Check for ARB limitation. We only have 16 SSBOs per context state. To workaround this
-            // all stages share the same bindings.
-            const std::size_t num_stage_ssbos = shader->GetEntries().global_memory_entries.size();
-            ASSERT_MSG(num_stage_ssbos == 0 || num_ssbos == 0, "SSBOs on more than one stage");
-            num_ssbos += num_stage_ssbos;
         }
 
-        // Stage indices are 0 - 5
-        const std::size_t stage = index == 0 ? 0 : index - 1;
-        SetupDrawConstBuffers(stage, shader);
-        SetupDrawGlobalMemory(stage, shader);
-        SetupDrawTextures(stage, shader);
-        SetupDrawImages(stage, shader);
+        Shader* const shader = shader_cache.GetStageProgram(program, async_shaders);
 
         const GLuint program_handle = shader->IsBuilt() ? shader->GetHandle() : 0;
         switch (program) {
@@ -388,6 +383,13 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
                               shader_config.enable.Value(), shader_config.offset);
         }
 
+        // Stage indices are 0 - 5
+        const std::size_t stage = index == 0 ? 0 : index - 1;
+        SetupDrawConstBuffers(stage, shader);
+        SetupDrawGlobalMemory(stage, shader);
+        SetupDrawTextures(stage, shader);
+        SetupDrawImages(stage, shader);
+
         // Workaround for Intel drivers.
         // When a clip distance is enabled but not set in the shader it crops parts of the screen
         // (sometimes it's half the screen, sometimes three quarters). To avoid this, enable the
@@ -749,6 +751,8 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
     current_cbuf = 0;
 
     auto kernel = shader_cache.GetComputeKernel(code_addr);
+    program_manager.BindCompute(kernel->GetHandle());
+
     SetupComputeTextures(kernel);
     SetupComputeImages(kernel);
 
@@ -763,7 +767,6 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
     buffer_cache.Unmap();
 
     const auto& launch_desc = system.GPU().KeplerCompute().launch_description;
-    program_manager.BindCompute(kernel->GetHandle());
     glDispatchCompute(launch_desc.grid_dim_x, launch_desc.grid_dim_y, launch_desc.grid_dim_z);
     ++num_queued_commands;
 }
@@ -1023,40 +1026,66 @@ void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding,
 }
 
 void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, Shader* shader) {
+    static constexpr std::array TARGET_LUT = {
+        GL_VERTEX_PROGRAM_NV,   GL_TESS_CONTROL_PROGRAM_NV, GL_TESS_EVALUATION_PROGRAM_NV,
+        GL_GEOMETRY_PROGRAM_NV, GL_FRAGMENT_PROGRAM_NV,
+    };
+
     auto& gpu{system.GPU()};
     auto& memory_manager{gpu.MemoryManager()};
-    const auto cbufs{gpu.Maxwell3D().state.shader_stages[stage_index]};
+    const auto& cbufs{gpu.Maxwell3D().state.shader_stages[stage_index]};
+    const auto& entries{shader->GetEntries().global_memory_entries};
+
+    std::array<GLuint64EXT, 32> pointers;
+    ASSERT(entries.size() < pointers.size());
 
-    u32 binding =
-        device.UseAssemblyShaders() ? 0 : device.GetBaseBindings(stage_index).shader_storage_buffer;
-    for (const auto& entry : shader->GetEntries().global_memory_entries) {
+    const bool assembly_shaders = device.UseAssemblyShaders();
+    u32 binding = assembly_shaders ? 0 : device.GetBaseBindings(stage_index).shader_storage_buffer;
+    for (const auto& entry : entries) {
         const GPUVAddr addr{cbufs.const_buffers[entry.cbuf_index].address + entry.cbuf_offset};
         const GPUVAddr gpu_addr{memory_manager.Read<u64>(addr)};
         const u32 size{memory_manager.Read<u32>(addr + 8)};
-        SetupGlobalMemory(binding++, entry, gpu_addr, size);
+        SetupGlobalMemory(binding, entry, gpu_addr, size, &pointers[binding]);
+        ++binding;
+    }
+    if (assembly_shaders) {
+        UpdateBindlessPointers(TARGET_LUT[stage_index], pointers.data(), entries.size());
     }
 }
 
 void RasterizerOpenGL::SetupComputeGlobalMemory(Shader* kernel) {
     auto& gpu{system.GPU()};
     auto& memory_manager{gpu.MemoryManager()};
-    const auto cbufs{gpu.KeplerCompute().launch_description.const_buffer_config};
+    const auto& cbufs{gpu.KeplerCompute().launch_description.const_buffer_config};
+    const auto& entries{kernel->GetEntries().global_memory_entries};
+
+    std::array<GLuint64EXT, 32> pointers;
+    ASSERT(entries.size() < pointers.size());
 
     u32 binding = 0;
-    for (const auto& entry : kernel->GetEntries().global_memory_entries) {
-        const auto addr{cbufs[entry.cbuf_index].Address() + entry.cbuf_offset};
-        const auto gpu_addr{memory_manager.Read<u64>(addr)};
-        const auto size{memory_manager.Read<u32>(addr + 8)};
-        SetupGlobalMemory(binding++, entry, gpu_addr, size);
+    for (const auto& entry : entries) {
+        const GPUVAddr addr{cbufs[entry.cbuf_index].Address() + entry.cbuf_offset};
+        const GPUVAddr gpu_addr{memory_manager.Read<u64>(addr)};
+        const u32 size{memory_manager.Read<u32>(addr + 8)};
+        SetupGlobalMemory(binding, entry, gpu_addr, size, &pointers[binding]);
+        ++binding;
+    }
+    if (device.UseAssemblyShaders()) {
+        UpdateBindlessPointers(GL_COMPUTE_PROGRAM_NV, pointers.data(), entries.size());
     }
 }
 
 void RasterizerOpenGL::SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry,
-                                         GPUVAddr gpu_addr, std::size_t size) {
-    const auto alignment{device.GetShaderStorageBufferAlignment()};
+                                         GPUVAddr gpu_addr, std::size_t size,
+                                         GLuint64EXT* pointer) {
+    const std::size_t alignment{device.GetShaderStorageBufferAlignment()};
     const auto info = buffer_cache.UploadMemory(gpu_addr, size, alignment, entry.is_written);
-    glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, info.handle, info.offset,
-                      static_cast<GLsizeiptr>(size));
+    if (device.UseAssemblyShaders()) {
+        *pointer = info.address + info.offset;
+    } else {
+        glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, info.handle, info.offset,
+                          static_cast<GLsizeiptr>(size));
+    }
 }
 
 void RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, Shader* shader) {
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index a95646936..ccc6f50f6 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -124,9 +124,9 @@ private:
     /// Configures the current global memory entries to use for the kernel invocation.
     void SetupComputeGlobalMemory(Shader* kernel);
 
-    /// Configures a constant buffer.
+    /// Configures a global memory buffer.
     void SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry, GPUVAddr gpu_addr,
-                           std::size_t size);
+                           std::size_t size, GLuint64EXT* pointer);
 
     /// Configures the current textures to use for the draw command.
     void SetupDrawTextures(std::size_t stage_index, Shader* shader);
diff --git a/src/video_core/renderer_opengl/gl_shader_manager.cpp b/src/video_core/renderer_opengl/gl_shader_manager.cpp
index 8e754fa90..691c6c79b 100644
--- a/src/video_core/renderer_opengl/gl_shader_manager.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_manager.cpp
@@ -11,8 +11,30 @@
 
 namespace OpenGL {
 
-ProgramManager::ProgramManager(const Device& device) {
-    use_assembly_programs = device.UseAssemblyShaders();
+namespace {
+
+void BindProgram(GLenum stage, GLuint current, GLuint old, bool& enabled) {
+    if (current == old) {
+        return;
+    }
+    if (current == 0) {
+        if (enabled) {
+            enabled = false;
+            glDisable(stage);
+        }
+        return;
+    }
+    if (!enabled) {
+        enabled = true;
+        glEnable(stage);
+    }
+    glBindProgramARB(stage, current);
+}
+
+} // Anonymous namespace
+
+ProgramManager::ProgramManager(const Device& device)
+    : use_assembly_programs{device.UseAssemblyShaders()} {
     if (use_assembly_programs) {
         glEnable(GL_COMPUTE_PROGRAM_NV);
     } else {
@@ -33,9 +55,7 @@ void ProgramManager::BindCompute(GLuint program) {
 }
 
 void ProgramManager::BindGraphicsPipeline() {
-    if (use_assembly_programs) {
-        UpdateAssemblyPrograms();
-    } else {
+    if (!use_assembly_programs) {
         UpdateSourcePrograms();
     }
 }
@@ -63,32 +83,25 @@ void ProgramManager::RestoreGuestPipeline() {
     }
 }
 
-void ProgramManager::UpdateAssemblyPrograms() {
-    const auto update_state = [](GLenum stage, bool& enabled, GLuint current, GLuint old) {
-        if (current == old) {
-            return;
-        }
-        if (current == 0) {
-            if (enabled) {
-                enabled = false;
-                glDisable(stage);
-            }
-            return;
-        }
-        if (!enabled) {
-            enabled = true;
-            glEnable(stage);
-        }
-        glBindProgramARB(stage, current);
-    };
+void ProgramManager::UseVertexShader(GLuint program) {
+    if (use_assembly_programs) {
+        BindProgram(GL_VERTEX_PROGRAM_NV, program, current_state.vertex, vertex_enabled);
+    }
+    current_state.vertex = program;
+}
 
-    update_state(GL_VERTEX_PROGRAM_NV, vertex_enabled, current_state.vertex, old_state.vertex);
-    update_state(GL_GEOMETRY_PROGRAM_NV, geometry_enabled, current_state.geometry,
-                 old_state.geometry);
-    update_state(GL_FRAGMENT_PROGRAM_NV, fragment_enabled, current_state.fragment,
-                 old_state.fragment);
+void ProgramManager::UseGeometryShader(GLuint program) {
+    if (use_assembly_programs) {
+        BindProgram(GL_GEOMETRY_PROGRAM_NV, program, current_state.vertex, geometry_enabled);
+    }
+    current_state.geometry = program;
+}
 
-    old_state = current_state;
+void ProgramManager::UseFragmentShader(GLuint program) {
+    if (use_assembly_programs) {
+        BindProgram(GL_FRAGMENT_PROGRAM_NV, program, current_state.vertex, fragment_enabled);
+    }
+    current_state.fragment = program;
 }
 
 void ProgramManager::UpdateSourcePrograms() {
diff --git a/src/video_core/renderer_opengl/gl_shader_manager.h b/src/video_core/renderer_opengl/gl_shader_manager.h
index 0f03b4f12..950e0dfcb 100644
--- a/src/video_core/renderer_opengl/gl_shader_manager.h
+++ b/src/video_core/renderer_opengl/gl_shader_manager.h
@@ -45,17 +45,9 @@ public:
     /// Rewinds BindHostPipeline state changes.
     void RestoreGuestPipeline();
 
-    void UseVertexShader(GLuint program) {
-        current_state.vertex = program;
-    }
-
-    void UseGeometryShader(GLuint program) {
-        current_state.geometry = program;
-    }
-
-    void UseFragmentShader(GLuint program) {
-        current_state.fragment = program;
-    }
+    void UseVertexShader(GLuint program);
+    void UseGeometryShader(GLuint program);
+    void UseFragmentShader(GLuint program);
 
 private:
     struct PipelineState {
@@ -64,9 +56,6 @@ private:
         GLuint fragment = 0;
     };
 
-    /// Update NV_gpu_program5 programs.
-    void UpdateAssemblyPrograms();
-
     /// Update GLSL programs.
     void UpdateSourcePrograms();
 
diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.cpp b/src/video_core/renderer_opengl/gl_stream_buffer.cpp
index 3655ff629..887995cf4 100644
--- a/src/video_core/renderer_opengl/gl_stream_buffer.cpp
+++ b/src/video_core/renderer_opengl/gl_stream_buffer.cpp
@@ -35,7 +35,7 @@ OGLStreamBuffer::OGLStreamBuffer(const Device& device, GLsizeiptr size, bool ver
     mapped_ptr = static_cast<u8*>(
         glMapNamedBufferRange(gl_buffer.handle, 0, buffer_size, flags | GL_MAP_FLUSH_EXPLICIT_BIT));
 
-    if (device.HasVertexBufferUnifiedMemory()) {
+    if (device.UseAssemblyShaders() || device.HasVertexBufferUnifiedMemory()) {
         glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_ONLY);
         glGetNamedBufferParameterui64vNV(gl_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address);
     }