summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
m---------externals/sirit0
-rw-r--r--src/core/file_sys/system_archive/system_version.cpp14
-rw-r--r--src/core/hle/service/hid/controllers/keyboard.cpp7
-rw-r--r--src/core/hle/service/nifm/nifm.cpp3
-rw-r--r--src/core/settings.cpp1
-rw-r--r--src/core/settings.h1
-rw-r--r--src/core/telemetry_session.cpp1
-rw-r--r--src/video_core/CMakeLists.txt1
-rw-r--r--src/video_core/buffer_cache/buffer_cache.h227
-rw-r--r--src/video_core/buffer_cache/map_interval.cpp33
-rw-r--r--src/video_core/buffer_cache/map_interval.h133
-rw-r--r--src/video_core/rasterizer_cache.h58
-rw-r--r--src/video_core/renderer_opengl/gl_buffer_cache.cpp1
-rw-r--r--src/video_core/renderer_opengl/gl_device.cpp7
-rw-r--r--src/video_core/renderer_opengl/gl_device.h5
-rw-r--r--src/video_core/renderer_opengl/gl_fence_manager.cpp1
-rw-r--r--src/video_core/renderer_opengl/gl_rasterizer.cpp100
-rw-r--r--src/video_core/renderer_opengl/gl_rasterizer.h16
-rw-r--r--src/video_core/renderer_opengl/gl_resource_manager.cpp9
-rw-r--r--src/video_core/renderer_opengl/gl_resource_manager.h16
-rw-r--r--src/video_core/renderer_opengl/gl_shader_cache.cpp101
-rw-r--r--src/video_core/renderer_opengl/gl_shader_cache.h15
-rw-r--r--src/video_core/renderer_opengl/gl_shader_decompiler.cpp37
-rw-r--r--src/video_core/renderer_opengl/gl_shader_manager.cpp106
-rw-r--r--src/video_core/renderer_opengl/gl_shader_manager.h56
-rw-r--r--src/video_core/renderer_opengl/renderer_opengl.cpp16
-rw-r--r--src/video_core/renderer_opengl/renderer_opengl.h5
-rw-r--r--src/video_core/renderer_vulkan/maxwell_to_vk.cpp4
-rw-r--r--src/video_core/renderer_vulkan/vk_buffer_cache.cpp1
-rw-r--r--src/video_core/renderer_vulkan/vk_device.cpp1
-rw-r--r--src/video_core/renderer_vulkan/vk_fence_manager.h1
-rw-r--r--src/video_core/renderer_vulkan/vk_pipeline_cache.cpp3
-rw-r--r--src/video_core/renderer_vulkan/vk_rasterizer.cpp4
-rw-r--r--src/video_core/renderer_vulkan/vk_shader_decompiler.cpp47
-rw-r--r--src/video_core/shader/decode/memory.cpp3
-rw-r--r--src/video_core/shader/decode/other.cpp26
-rw-r--r--src/video_core/shader/node.h6
-rw-r--r--src/yuzu/configuration/config.cpp4
-rw-r--r--src/yuzu/configuration/configure_graphics_advanced.cpp6
-rw-r--r--src/yuzu/configuration/configure_graphics_advanced.ui10
-rw-r--r--src/yuzu/discord_impl.cpp2
-rw-r--r--src/yuzu_cmd/config.cpp2
-rw-r--r--src/yuzu_cmd/default_ini.h4
43 files changed, 775 insertions, 319 deletions
diff --git a/externals/sirit b/externals/sirit
Subproject 414fc4dbd28d8fe48f735a0c389db8a234f733c Subproject a62c5bbc100a5e5a31ea0ccc4a78d8fa6a4167c
diff --git a/src/core/file_sys/system_archive/system_version.cpp b/src/core/file_sys/system_archive/system_version.cpp
index 6e22f97b0..aa313de66 100644
--- a/src/core/file_sys/system_archive/system_version.cpp
+++ b/src/core/file_sys/system_archive/system_version.cpp
@@ -12,17 +12,17 @@ namespace SystemVersionData {
12// This section should reflect the best system version to describe yuzu's HLE api. 12// This section should reflect the best system version to describe yuzu's HLE api.
13// TODO(DarkLordZach): Update when HLE gets better. 13// TODO(DarkLordZach): Update when HLE gets better.
14 14
15constexpr u8 VERSION_MAJOR = 5; 15constexpr u8 VERSION_MAJOR = 10;
16constexpr u8 VERSION_MINOR = 1; 16constexpr u8 VERSION_MINOR = 0;
17constexpr u8 VERSION_MICRO = 0; 17constexpr u8 VERSION_MICRO = 2;
18 18
19constexpr u8 REVISION_MAJOR = 3; 19constexpr u8 REVISION_MAJOR = 1;
20constexpr u8 REVISION_MINOR = 0; 20constexpr u8 REVISION_MINOR = 0;
21 21
22constexpr char PLATFORM_STRING[] = "NX"; 22constexpr char PLATFORM_STRING[] = "NX";
23constexpr char VERSION_HASH[] = "23f9df53e25709d756e0c76effcb2473bd3447dd"; 23constexpr char VERSION_HASH[] = "f90143fa8bbc061d4f68c35f95f04f8080c0ecdc";
24constexpr char DISPLAY_VERSION[] = "5.1.0"; 24constexpr char DISPLAY_VERSION[] = "10.0.2";
25constexpr char DISPLAY_TITLE[] = "NintendoSDK Firmware for NX 5.1.0-3.0"; 25constexpr char DISPLAY_TITLE[] = "NintendoSDK Firmware for NX 10.0.2-1.0";
26 26
27} // namespace SystemVersionData 27} // namespace SystemVersionData
28 28
diff --git a/src/core/hle/service/hid/controllers/keyboard.cpp b/src/core/hle/service/hid/controllers/keyboard.cpp
index 358cb9329..9a8d354ba 100644
--- a/src/core/hle/service/hid/controllers/keyboard.cpp
+++ b/src/core/hle/service/hid/controllers/keyboard.cpp
@@ -38,10 +38,11 @@ void Controller_Keyboard::OnUpdate(const Core::Timing::CoreTiming& core_timing,
38 cur_entry.sampling_number = last_entry.sampling_number + 1; 38 cur_entry.sampling_number = last_entry.sampling_number + 1;
39 cur_entry.sampling_number2 = cur_entry.sampling_number; 39 cur_entry.sampling_number2 = cur_entry.sampling_number;
40 40
41 cur_entry.key.fill(0);
42 cur_entry.modifier = 0;
43
41 for (std::size_t i = 0; i < keyboard_keys.size(); ++i) { 44 for (std::size_t i = 0; i < keyboard_keys.size(); ++i) {
42 for (std::size_t k = 0; k < KEYS_PER_BYTE; ++k) { 45 cur_entry.key[i / KEYS_PER_BYTE] |= (keyboard_keys[i]->GetStatus() << (i % KEYS_PER_BYTE));
43 cur_entry.key[i / KEYS_PER_BYTE] |= (keyboard_keys[i]->GetStatus() << k);
44 }
45 } 46 }
46 47
47 for (std::size_t i = 0; i < keyboard_mods.size(); ++i) { 48 for (std::size_t i = 0; i < keyboard_mods.size(); ++i) {
diff --git a/src/core/hle/service/nifm/nifm.cpp b/src/core/hle/service/nifm/nifm.cpp
index 767158444..01ddcdbd6 100644
--- a/src/core/hle/service/nifm/nifm.cpp
+++ b/src/core/hle/service/nifm/nifm.cpp
@@ -177,7 +177,8 @@ private:
177 void CreateTemporaryNetworkProfile(Kernel::HLERequestContext& ctx) { 177 void CreateTemporaryNetworkProfile(Kernel::HLERequestContext& ctx) {
178 LOG_DEBUG(Service_NIFM, "called"); 178 LOG_DEBUG(Service_NIFM, "called");
179 179
180 ASSERT_MSG(ctx.GetReadBufferSize() == 0x17c, "NetworkProfileData is not the correct size"); 180 ASSERT_MSG(ctx.GetReadBufferSize() == 0x17c,
181 "SfNetworkProfileData is not the correct size");
181 u128 uuid{}; 182 u128 uuid{};
182 auto buffer = ctx.ReadBuffer(); 183 auto buffer = ctx.ReadBuffer();
183 std::memcpy(&uuid, buffer.data() + 8, sizeof(u128)); 184 std::memcpy(&uuid, buffer.data() + 8, sizeof(u128));
diff --git a/src/core/settings.cpp b/src/core/settings.cpp
index da53cde05..4edff9cd8 100644
--- a/src/core/settings.cpp
+++ b/src/core/settings.cpp
@@ -112,6 +112,7 @@ void LogSettings() {
112 LogSetting("Renderer_UseAsynchronousGpuEmulation", 112 LogSetting("Renderer_UseAsynchronousGpuEmulation",
113 Settings::values.use_asynchronous_gpu_emulation); 113 Settings::values.use_asynchronous_gpu_emulation);
114 LogSetting("Renderer_UseVsync", Settings::values.use_vsync); 114 LogSetting("Renderer_UseVsync", Settings::values.use_vsync);
115 LogSetting("Renderer_UseAssemblyShaders", Settings::values.use_assembly_shaders);
115 LogSetting("Renderer_AnisotropicFilteringLevel", Settings::values.max_anisotropy); 116 LogSetting("Renderer_AnisotropicFilteringLevel", Settings::values.max_anisotropy);
116 LogSetting("Audio_OutputEngine", Settings::values.sink_id); 117 LogSetting("Audio_OutputEngine", Settings::values.sink_id);
117 LogSetting("Audio_EnableAudioStretching", Settings::values.enable_audio_stretching); 118 LogSetting("Audio_EnableAudioStretching", Settings::values.enable_audio_stretching);
diff --git a/src/core/settings.h b/src/core/settings.h
index c1266b341..78eb33737 100644
--- a/src/core/settings.h
+++ b/src/core/settings.h
@@ -446,6 +446,7 @@ struct Values {
446 GPUAccuracy gpu_accuracy; 446 GPUAccuracy gpu_accuracy;
447 bool use_asynchronous_gpu_emulation; 447 bool use_asynchronous_gpu_emulation;
448 bool use_vsync; 448 bool use_vsync;
449 bool use_assembly_shaders;
449 bool force_30fps_mode; 450 bool force_30fps_mode;
450 bool use_fast_gpu_time; 451 bool use_fast_gpu_time;
451 452
diff --git a/src/core/telemetry_session.cpp b/src/core/telemetry_session.cpp
index 1c3b03a1c..c781b3cfc 100644
--- a/src/core/telemetry_session.cpp
+++ b/src/core/telemetry_session.cpp
@@ -201,6 +201,7 @@ void TelemetrySession::AddInitialInfo(Loader::AppLoader& app_loader) {
201 AddField(field_type, "Renderer_UseAsynchronousGpuEmulation", 201 AddField(field_type, "Renderer_UseAsynchronousGpuEmulation",
202 Settings::values.use_asynchronous_gpu_emulation); 202 Settings::values.use_asynchronous_gpu_emulation);
203 AddField(field_type, "Renderer_UseVsync", Settings::values.use_vsync); 203 AddField(field_type, "Renderer_UseVsync", Settings::values.use_vsync);
204 AddField(field_type, "Renderer_UseAssemblyShaders", Settings::values.use_assembly_shaders);
204 AddField(field_type, "System_UseDockedMode", Settings::values.use_docked_mode); 205 AddField(field_type, "System_UseDockedMode", Settings::values.use_docked_mode);
205} 206}
206 207
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index d23c53843..f00c71dae 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -1,6 +1,7 @@
1add_library(video_core STATIC 1add_library(video_core STATIC
2 buffer_cache/buffer_block.h 2 buffer_cache/buffer_block.h
3 buffer_cache/buffer_cache.h 3 buffer_cache/buffer_cache.h
4 buffer_cache/map_interval.cpp
4 buffer_cache/map_interval.h 5 buffer_cache/map_interval.h
5 dirty_flags.cpp 6 dirty_flags.cpp
6 dirty_flags.h 7 dirty_flags.h
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index 56e570994..d9a4a1b4d 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -12,11 +12,12 @@
12#include <utility> 12#include <utility>
13#include <vector> 13#include <vector>
14 14
15#include <boost/icl/interval_map.hpp> 15#include <boost/container/small_vector.hpp>
16#include <boost/icl/interval_set.hpp> 16#include <boost/icl/interval_set.hpp>
17#include <boost/range/iterator_range.hpp> 17#include <boost/intrusive/set.hpp>
18 18
19#include "common/alignment.h" 19#include "common/alignment.h"
20#include "common/assert.h"
20#include "common/common_types.h" 21#include "common/common_types.h"
21#include "common/logging/log.h" 22#include "common/logging/log.h"
22#include "core/core.h" 23#include "core/core.h"
@@ -29,10 +30,12 @@
29 30
30namespace VideoCommon { 31namespace VideoCommon {
31 32
32using MapInterval = std::shared_ptr<MapIntervalBase>;
33
34template <typename OwnerBuffer, typename BufferType, typename StreamBuffer> 33template <typename OwnerBuffer, typename BufferType, typename StreamBuffer>
35class BufferCache { 34class BufferCache {
35 using IntervalSet = boost::icl::interval_set<VAddr>;
36 using IntervalType = typename IntervalSet::interval_type;
37 using VectorMapInterval = boost::container::small_vector<MapInterval*, 1>;
38
36public: 39public:
37 using BufferInfo = std::pair<BufferType, u64>; 40 using BufferInfo = std::pair<BufferType, u64>;
38 41
@@ -40,14 +43,12 @@ public:
40 bool is_written = false, bool use_fast_cbuf = false) { 43 bool is_written = false, bool use_fast_cbuf = false) {
41 std::lock_guard lock{mutex}; 44 std::lock_guard lock{mutex};
42 45
43 const std::optional<VAddr> cpu_addr_opt = 46 const auto& memory_manager = system.GPU().MemoryManager();
44 system.GPU().MemoryManager().GpuToCpuAddress(gpu_addr); 47 const std::optional<VAddr> cpu_addr_opt = memory_manager.GpuToCpuAddress(gpu_addr);
45
46 if (!cpu_addr_opt) { 48 if (!cpu_addr_opt) {
47 return {GetEmptyBuffer(size), 0}; 49 return {GetEmptyBuffer(size), 0};
48 } 50 }
49 51 const VAddr cpu_addr = *cpu_addr_opt;
50 VAddr cpu_addr = *cpu_addr_opt;
51 52
52 // Cache management is a big overhead, so only cache entries with a given size. 53 // Cache management is a big overhead, so only cache entries with a given size.
53 // TODO: Figure out which size is the best for given games. 54 // TODO: Figure out which size is the best for given games.
@@ -77,16 +78,19 @@ public:
77 } 78 }
78 } 79 }
79 80
80 auto block = GetBlock(cpu_addr, size); 81 OwnerBuffer block = GetBlock(cpu_addr, size);
81 auto map = MapAddress(block, gpu_addr, cpu_addr, size); 82 MapInterval* const map = MapAddress(block, gpu_addr, cpu_addr, size);
83 if (!map) {
84 return {GetEmptyBuffer(size), 0};
85 }
82 if (is_written) { 86 if (is_written) {
83 map->MarkAsModified(true, GetModifiedTicks()); 87 map->MarkAsModified(true, GetModifiedTicks());
84 if (Settings::IsGPULevelHigh() && Settings::values.use_asynchronous_gpu_emulation) { 88 if (Settings::IsGPULevelHigh() && Settings::values.use_asynchronous_gpu_emulation) {
85 MarkForAsyncFlush(map); 89 MarkForAsyncFlush(map);
86 } 90 }
87 if (!map->IsWritten()) { 91 if (!map->is_written) {
88 map->MarkAsWritten(true); 92 map->is_written = true;
89 MarkRegionAsWritten(map->GetStart(), map->GetEnd() - 1); 93 MarkRegionAsWritten(map->start, map->end - 1);
90 } 94 }
91 } 95 }
92 96
@@ -132,12 +136,11 @@ public:
132 void FlushRegion(VAddr addr, std::size_t size) { 136 void FlushRegion(VAddr addr, std::size_t size) {
133 std::lock_guard lock{mutex}; 137 std::lock_guard lock{mutex};
134 138
135 std::vector<MapInterval> objects = GetMapsInRange(addr, size); 139 VectorMapInterval objects = GetMapsInRange(addr, size);
136 std::sort(objects.begin(), objects.end(), [](const MapInterval& a, const MapInterval& b) { 140 std::sort(objects.begin(), objects.end(),
137 return a->GetModificationTick() < b->GetModificationTick(); 141 [](MapInterval* lhs, MapInterval* rhs) { return lhs->ticks < rhs->ticks; });
138 }); 142 for (MapInterval* object : objects) {
139 for (auto& object : objects) { 143 if (object->is_modified && object->is_registered) {
140 if (object->IsModified() && object->IsRegistered()) {
141 mutex.unlock(); 144 mutex.unlock();
142 FlushMap(object); 145 FlushMap(object);
143 mutex.lock(); 146 mutex.lock();
@@ -148,9 +151,9 @@ public:
148 bool MustFlushRegion(VAddr addr, std::size_t size) { 151 bool MustFlushRegion(VAddr addr, std::size_t size) {
149 std::lock_guard lock{mutex}; 152 std::lock_guard lock{mutex};
150 153
151 const std::vector<MapInterval> objects = GetMapsInRange(addr, size); 154 const VectorMapInterval objects = GetMapsInRange(addr, size);
152 return std::any_of(objects.cbegin(), objects.cend(), [](const MapInterval& map) { 155 return std::any_of(objects.cbegin(), objects.cend(), [](const MapInterval* map) {
153 return map->IsModified() && map->IsRegistered(); 156 return map->is_modified && map->is_registered;
154 }); 157 });
155 } 158 }
156 159
@@ -158,9 +161,8 @@ public:
158 void InvalidateRegion(VAddr addr, u64 size) { 161 void InvalidateRegion(VAddr addr, u64 size) {
159 std::lock_guard lock{mutex}; 162 std::lock_guard lock{mutex};
160 163
161 std::vector<MapInterval> objects = GetMapsInRange(addr, size); 164 for (auto& object : GetMapsInRange(addr, size)) {
162 for (auto& object : objects) { 165 if (object->is_registered) {
163 if (object->IsRegistered()) {
164 Unregister(object); 166 Unregister(object);
165 } 167 }
166 } 168 }
@@ -169,10 +171,10 @@ public:
169 void OnCPUWrite(VAddr addr, std::size_t size) { 171 void OnCPUWrite(VAddr addr, std::size_t size) {
170 std::lock_guard lock{mutex}; 172 std::lock_guard lock{mutex};
171 173
172 for (const auto& object : GetMapsInRange(addr, size)) { 174 for (MapInterval* object : GetMapsInRange(addr, size)) {
173 if (object->IsMemoryMarked() && object->IsRegistered()) { 175 if (object->is_memory_marked && object->is_registered) {
174 UnmarkMemory(object); 176 UnmarkMemory(object);
175 object->SetSyncPending(true); 177 object->is_sync_pending = true;
176 marked_for_unregister.emplace_back(object); 178 marked_for_unregister.emplace_back(object);
177 } 179 }
178 } 180 }
@@ -181,9 +183,9 @@ public:
181 void SyncGuestHost() { 183 void SyncGuestHost() {
182 std::lock_guard lock{mutex}; 184 std::lock_guard lock{mutex};
183 185
184 for (const auto& object : marked_for_unregister) { 186 for (auto& object : marked_for_unregister) {
185 if (object->IsRegistered()) { 187 if (object->is_registered) {
186 object->SetSyncPending(false); 188 object->is_sync_pending = false;
187 Unregister(object); 189 Unregister(object);
188 } 190 }
189 } 191 }
@@ -192,9 +194,9 @@ public:
192 194
193 void CommitAsyncFlushes() { 195 void CommitAsyncFlushes() {
194 if (uncommitted_flushes) { 196 if (uncommitted_flushes) {
195 auto commit_list = std::make_shared<std::list<MapInterval>>(); 197 auto commit_list = std::make_shared<std::list<MapInterval*>>();
196 for (auto& map : *uncommitted_flushes) { 198 for (MapInterval* map : *uncommitted_flushes) {
197 if (map->IsRegistered() && map->IsModified()) { 199 if (map->is_registered && map->is_modified) {
198 // TODO(Blinkhawk): Implement backend asynchronous flushing 200 // TODO(Blinkhawk): Implement backend asynchronous flushing
199 // AsyncFlushMap(map) 201 // AsyncFlushMap(map)
200 commit_list->push_back(map); 202 commit_list->push_back(map);
@@ -228,8 +230,8 @@ public:
228 committed_flushes.pop_front(); 230 committed_flushes.pop_front();
229 return; 231 return;
230 } 232 }
231 for (MapInterval& map : *flush_list) { 233 for (MapInterval* map : *flush_list) {
232 if (map->IsRegistered()) { 234 if (map->is_registered) {
233 // TODO(Blinkhawk): Replace this for reading the asynchronous flush 235 // TODO(Blinkhawk): Replace this for reading the asynchronous flush
234 FlushMap(map); 236 FlushMap(map);
235 } 237 }
@@ -265,61 +267,60 @@ protected:
265 } 267 }
266 268
267 /// Register an object into the cache 269 /// Register an object into the cache
268 void Register(const MapInterval& new_map, bool inherit_written = false) { 270 MapInterval* Register(MapInterval new_map, bool inherit_written = false) {
269 const VAddr cpu_addr = new_map->GetStart(); 271 const VAddr cpu_addr = new_map.start;
270 if (!cpu_addr) { 272 if (!cpu_addr) {
271 LOG_CRITICAL(HW_GPU, "Failed to register buffer with unmapped gpu_address 0x{:016x}", 273 LOG_CRITICAL(HW_GPU, "Failed to register buffer with unmapped gpu_address 0x{:016x}",
272 new_map->GetGpuAddress()); 274 new_map.gpu_addr);
273 return; 275 return nullptr;
274 } 276 }
275 const std::size_t size = new_map->GetEnd() - new_map->GetStart(); 277 const std::size_t size = new_map.end - new_map.start;
276 new_map->MarkAsRegistered(true); 278 new_map.is_registered = true;
277 const IntervalType interval{new_map->GetStart(), new_map->GetEnd()};
278 mapped_addresses.insert({interval, new_map});
279 rasterizer.UpdatePagesCachedCount(cpu_addr, size, 1); 279 rasterizer.UpdatePagesCachedCount(cpu_addr, size, 1);
280 new_map->SetMemoryMarked(true); 280 new_map.is_memory_marked = true;
281 if (inherit_written) { 281 if (inherit_written) {
282 MarkRegionAsWritten(new_map->GetStart(), new_map->GetEnd() - 1); 282 MarkRegionAsWritten(new_map.start, new_map.end - 1);
283 new_map->MarkAsWritten(true); 283 new_map.is_written = true;
284 } 284 }
285 MapInterval* const storage = mapped_addresses_allocator.Allocate();
286 *storage = new_map;
287 mapped_addresses.insert(*storage);
288 return storage;
285 } 289 }
286 290
287 void UnmarkMemory(const MapInterval& map) { 291 void UnmarkMemory(MapInterval* map) {
288 if (!map->IsMemoryMarked()) { 292 if (!map->is_memory_marked) {
289 return; 293 return;
290 } 294 }
291 const std::size_t size = map->GetEnd() - map->GetStart(); 295 const std::size_t size = map->end - map->start;
292 rasterizer.UpdatePagesCachedCount(map->GetStart(), size, -1); 296 rasterizer.UpdatePagesCachedCount(map->start, size, -1);
293 map->SetMemoryMarked(false); 297 map->is_memory_marked = false;
294 } 298 }
295 299
296 /// Unregisters an object from the cache 300 /// Unregisters an object from the cache
297 void Unregister(const MapInterval& map) { 301 void Unregister(MapInterval* map) {
298 UnmarkMemory(map); 302 UnmarkMemory(map);
299 map->MarkAsRegistered(false); 303 map->is_registered = false;
300 if (map->IsSyncPending()) { 304 if (map->is_sync_pending) {
305 map->is_sync_pending = false;
301 marked_for_unregister.remove(map); 306 marked_for_unregister.remove(map);
302 map->SetSyncPending(false);
303 } 307 }
304 if (map->IsWritten()) { 308 if (map->is_written) {
305 UnmarkRegionAsWritten(map->GetStart(), map->GetEnd() - 1); 309 UnmarkRegionAsWritten(map->start, map->end - 1);
306 } 310 }
307 const IntervalType delete_interval{map->GetStart(), map->GetEnd()}; 311 const auto it = mapped_addresses.find(*map);
308 mapped_addresses.erase(delete_interval); 312 ASSERT(it != mapped_addresses.end());
313 mapped_addresses.erase(it);
314 mapped_addresses_allocator.Release(map);
309 } 315 }
310 316
311private: 317private:
312 MapInterval CreateMap(const VAddr start, const VAddr end, const GPUVAddr gpu_addr) { 318 MapInterval* MapAddress(const OwnerBuffer& block, GPUVAddr gpu_addr, VAddr cpu_addr,
313 return std::make_shared<MapIntervalBase>(start, end, gpu_addr); 319 std::size_t size) {
314 } 320 const VectorMapInterval overlaps = GetMapsInRange(cpu_addr, size);
315
316 MapInterval MapAddress(const OwnerBuffer& block, const GPUVAddr gpu_addr, const VAddr cpu_addr,
317 const std::size_t size) {
318 std::vector<MapInterval> overlaps = GetMapsInRange(cpu_addr, size);
319 if (overlaps.empty()) { 321 if (overlaps.empty()) {
320 auto& memory_manager = system.GPU().MemoryManager(); 322 auto& memory_manager = system.GPU().MemoryManager();
321 const VAddr cpu_addr_end = cpu_addr + size; 323 const VAddr cpu_addr_end = cpu_addr + size;
322 MapInterval new_map = CreateMap(cpu_addr, cpu_addr_end, gpu_addr);
323 if (memory_manager.IsGranularRange(gpu_addr, size)) { 324 if (memory_manager.IsGranularRange(gpu_addr, size)) {
324 u8* host_ptr = memory_manager.GetPointer(gpu_addr); 325 u8* host_ptr = memory_manager.GetPointer(gpu_addr);
325 UploadBlockData(block, block->GetOffset(cpu_addr), size, host_ptr); 326 UploadBlockData(block, block->GetOffset(cpu_addr), size, host_ptr);
@@ -328,13 +329,12 @@ private:
328 memory_manager.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size); 329 memory_manager.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size);
329 UploadBlockData(block, block->GetOffset(cpu_addr), size, staging_buffer.data()); 330 UploadBlockData(block, block->GetOffset(cpu_addr), size, staging_buffer.data());
330 } 331 }
331 Register(new_map); 332 return Register(MapInterval(cpu_addr, cpu_addr_end, gpu_addr));
332 return new_map;
333 } 333 }
334 334
335 const VAddr cpu_addr_end = cpu_addr + size; 335 const VAddr cpu_addr_end = cpu_addr + size;
336 if (overlaps.size() == 1) { 336 if (overlaps.size() == 1) {
337 MapInterval& current_map = overlaps[0]; 337 MapInterval* const current_map = overlaps[0];
338 if (current_map->IsInside(cpu_addr, cpu_addr_end)) { 338 if (current_map->IsInside(cpu_addr, cpu_addr_end)) {
339 return current_map; 339 return current_map;
340 } 340 }
@@ -344,35 +344,39 @@ private:
344 bool write_inheritance = false; 344 bool write_inheritance = false;
345 bool modified_inheritance = false; 345 bool modified_inheritance = false;
346 // Calculate new buffer parameters 346 // Calculate new buffer parameters
347 for (auto& overlap : overlaps) { 347 for (MapInterval* overlap : overlaps) {
348 new_start = std::min(overlap->GetStart(), new_start); 348 new_start = std::min(overlap->start, new_start);
349 new_end = std::max(overlap->GetEnd(), new_end); 349 new_end = std::max(overlap->end, new_end);
350 write_inheritance |= overlap->IsWritten(); 350 write_inheritance |= overlap->is_written;
351 modified_inheritance |= overlap->IsModified(); 351 modified_inheritance |= overlap->is_modified;
352 } 352 }
353 GPUVAddr new_gpu_addr = gpu_addr + new_start - cpu_addr; 353 GPUVAddr new_gpu_addr = gpu_addr + new_start - cpu_addr;
354 for (auto& overlap : overlaps) { 354 for (auto& overlap : overlaps) {
355 Unregister(overlap); 355 Unregister(overlap);
356 } 356 }
357 UpdateBlock(block, new_start, new_end, overlaps); 357 UpdateBlock(block, new_start, new_end, overlaps);
358 MapInterval new_map = CreateMap(new_start, new_end, new_gpu_addr); 358
359 const MapInterval new_map{new_start, new_end, new_gpu_addr};
360 MapInterval* const map = Register(new_map, write_inheritance);
361 if (!map) {
362 return nullptr;
363 }
359 if (modified_inheritance) { 364 if (modified_inheritance) {
360 new_map->MarkAsModified(true, GetModifiedTicks()); 365 map->MarkAsModified(true, GetModifiedTicks());
361 if (Settings::IsGPULevelHigh() && Settings::values.use_asynchronous_gpu_emulation) { 366 if (Settings::IsGPULevelHigh() && Settings::values.use_asynchronous_gpu_emulation) {
362 MarkForAsyncFlush(new_map); 367 MarkForAsyncFlush(map);
363 } 368 }
364 } 369 }
365 Register(new_map, write_inheritance); 370 return map;
366 return new_map;
367 } 371 }
368 372
369 void UpdateBlock(const OwnerBuffer& block, VAddr start, VAddr end, 373 void UpdateBlock(const OwnerBuffer& block, VAddr start, VAddr end,
370 std::vector<MapInterval>& overlaps) { 374 const VectorMapInterval& overlaps) {
371 const IntervalType base_interval{start, end}; 375 const IntervalType base_interval{start, end};
372 IntervalSet interval_set{}; 376 IntervalSet interval_set{};
373 interval_set.add(base_interval); 377 interval_set.add(base_interval);
374 for (auto& overlap : overlaps) { 378 for (auto& overlap : overlaps) {
375 const IntervalType subtract{overlap->GetStart(), overlap->GetEnd()}; 379 const IntervalType subtract{overlap->start, overlap->end};
376 interval_set.subtract(subtract); 380 interval_set.subtract(subtract);
377 } 381 }
378 for (auto& interval : interval_set) { 382 for (auto& interval : interval_set) {
@@ -386,18 +390,24 @@ private:
386 } 390 }
387 } 391 }
388 392
389 std::vector<MapInterval> GetMapsInRange(VAddr addr, std::size_t size) { 393 VectorMapInterval GetMapsInRange(VAddr addr, std::size_t size) {
394 VectorMapInterval result;
390 if (size == 0) { 395 if (size == 0) {
391 return {}; 396 return result;
392 } 397 }
393 398
394 std::vector<MapInterval> objects{}; 399 const VAddr addr_end = addr + size;
395 const IntervalType interval{addr, addr + size}; 400 auto it = mapped_addresses.lower_bound(addr);
396 for (auto& pair : boost::make_iterator_range(mapped_addresses.equal_range(interval))) { 401 if (it != mapped_addresses.begin()) {
397 objects.push_back(pair.second); 402 --it;
398 } 403 }
399 404 while (it != mapped_addresses.end() && it->start < addr_end) {
400 return objects; 405 if (it->Overlaps(addr, addr_end)) {
406 result.push_back(&*it);
407 }
408 ++it;
409 }
410 return result;
401 } 411 }
402 412
403 /// Returns a ticks counter used for tracking when cached objects were last modified 413 /// Returns a ticks counter used for tracking when cached objects were last modified
@@ -405,12 +415,12 @@ private:
405 return ++modified_ticks; 415 return ++modified_ticks;
406 } 416 }
407 417
408 void FlushMap(MapInterval map) { 418 void FlushMap(MapInterval* map) {
409 std::size_t size = map->GetEnd() - map->GetStart(); 419 const std::size_t size = map->end - map->start;
410 OwnerBuffer block = blocks[map->GetStart() >> block_page_bits]; 420 OwnerBuffer block = blocks[map->start >> block_page_bits];
411 staging_buffer.resize(size); 421 staging_buffer.resize(size);
412 DownloadBlockData(block, block->GetOffset(map->GetStart()), size, staging_buffer.data()); 422 DownloadBlockData(block, block->GetOffset(map->start), size, staging_buffer.data());
413 system.Memory().WriteBlockUnsafe(map->GetStart(), staging_buffer.data(), size); 423 system.Memory().WriteBlockUnsafe(map->start, staging_buffer.data(), size);
414 map->MarkAsModified(false, 0); 424 map->MarkAsModified(false, 0);
415 } 425 }
416 426
@@ -515,7 +525,7 @@ private:
515 } else { 525 } else {
516 written_pages[page_start] = 1; 526 written_pages[page_start] = 1;
517 } 527 }
518 page_start++; 528 ++page_start;
519 } 529 }
520 } 530 }
521 531
@@ -531,7 +541,7 @@ private:
531 written_pages.erase(it); 541 written_pages.erase(it);
532 } 542 }
533 } 543 }
534 page_start++; 544 ++page_start;
535 } 545 }
536 } 546 }
537 547
@@ -542,14 +552,14 @@ private:
542 if (written_pages.count(page_start) > 0) { 552 if (written_pages.count(page_start) > 0) {
543 return true; 553 return true;
544 } 554 }
545 page_start++; 555 ++page_start;
546 } 556 }
547 return false; 557 return false;
548 } 558 }
549 559
550 void MarkForAsyncFlush(MapInterval& map) { 560 void MarkForAsyncFlush(MapInterval* map) {
551 if (!uncommitted_flushes) { 561 if (!uncommitted_flushes) {
552 uncommitted_flushes = std::make_shared<std::unordered_set<MapInterval>>(); 562 uncommitted_flushes = std::make_shared<std::unordered_set<MapInterval*>>();
553 } 563 }
554 uncommitted_flushes->insert(map); 564 uncommitted_flushes->insert(map);
555 } 565 }
@@ -566,10 +576,9 @@ private:
566 u64 buffer_offset = 0; 576 u64 buffer_offset = 0;
567 u64 buffer_offset_base = 0; 577 u64 buffer_offset_base = 0;
568 578
569 using IntervalSet = boost::icl::interval_set<VAddr>; 579 MapIntervalAllocator mapped_addresses_allocator;
570 using IntervalCache = boost::icl::interval_map<VAddr, MapInterval>; 580 boost::intrusive::set<MapInterval, boost::intrusive::compare<MapIntervalCompare>>
571 using IntervalType = typename IntervalCache::interval_type; 581 mapped_addresses;
572 IntervalCache mapped_addresses;
573 582
574 static constexpr u64 write_page_bit = 11; 583 static constexpr u64 write_page_bit = 11;
575 std::unordered_map<u64, u32> written_pages; 584 std::unordered_map<u64, u32> written_pages;
@@ -583,10 +592,10 @@ private:
583 u64 modified_ticks = 0; 592 u64 modified_ticks = 0;
584 593
585 std::vector<u8> staging_buffer; 594 std::vector<u8> staging_buffer;
586 std::list<MapInterval> marked_for_unregister; 595 std::list<MapInterval*> marked_for_unregister;
587 596
588 std::shared_ptr<std::unordered_set<MapInterval>> uncommitted_flushes{}; 597 std::shared_ptr<std::unordered_set<MapInterval*>> uncommitted_flushes;
589 std::list<std::shared_ptr<std::list<MapInterval>>> committed_flushes; 598 std::list<std::shared_ptr<std::list<MapInterval*>>> committed_flushes;
590 599
591 std::recursive_mutex mutex; 600 std::recursive_mutex mutex;
592}; 601};
diff --git a/src/video_core/buffer_cache/map_interval.cpp b/src/video_core/buffer_cache/map_interval.cpp
new file mode 100644
index 000000000..62587e18a
--- /dev/null
+++ b/src/video_core/buffer_cache/map_interval.cpp
@@ -0,0 +1,33 @@
1// Copyright 2020 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#include <algorithm>
6#include <array>
7#include <cstddef>
8#include <memory>
9
10#include "video_core/buffer_cache/map_interval.h"
11
12namespace VideoCommon {
13
14MapIntervalAllocator::MapIntervalAllocator() {
15 FillFreeList(first_chunk);
16}
17
18MapIntervalAllocator::~MapIntervalAllocator() = default;
19
20void MapIntervalAllocator::AllocateNewChunk() {
21 *new_chunk = std::make_unique<Chunk>();
22 FillFreeList(**new_chunk);
23 new_chunk = &(*new_chunk)->next;
24}
25
26void MapIntervalAllocator::FillFreeList(Chunk& chunk) {
27 const std::size_t old_size = free_list.size();
28 free_list.resize(old_size + chunk.data.size());
29 std::transform(chunk.data.rbegin(), chunk.data.rend(), free_list.begin() + old_size,
30 [](MapInterval& interval) { return &interval; });
31}
32
33} // namespace VideoCommon
diff --git a/src/video_core/buffer_cache/map_interval.h b/src/video_core/buffer_cache/map_interval.h
index 29d8b26f3..fe0bcd1d8 100644
--- a/src/video_core/buffer_cache/map_interval.h
+++ b/src/video_core/buffer_cache/map_interval.h
@@ -4,104 +4,89 @@
4 4
5#pragma once 5#pragma once
6 6
7#include <array>
8#include <cstddef>
9#include <memory>
10#include <vector>
11
12#include <boost/intrusive/set_hook.hpp>
13
7#include "common/common_types.h" 14#include "common/common_types.h"
8#include "video_core/gpu.h" 15#include "video_core/gpu.h"
9 16
10namespace VideoCommon { 17namespace VideoCommon {
11 18
12class MapIntervalBase { 19struct MapInterval : public boost::intrusive::set_base_hook<boost::intrusive::optimize_size<true>> {
13public: 20 MapInterval() = default;
14 MapIntervalBase(const VAddr start, const VAddr end, const GPUVAddr gpu_addr)
15 : start{start}, end{end}, gpu_addr{gpu_addr} {}
16
17 void SetCpuAddress(VAddr new_cpu_addr) {
18 cpu_addr = new_cpu_addr;
19 }
20
21 VAddr GetCpuAddress() const {
22 return cpu_addr;
23 }
24
25 GPUVAddr GetGpuAddress() const {
26 return gpu_addr;
27 }
28
29 bool IsInside(const VAddr other_start, const VAddr other_end) const {
30 return (start <= other_start && other_end <= end);
31 }
32
33 bool operator==(const MapIntervalBase& rhs) const {
34 return std::tie(start, end) == std::tie(rhs.start, rhs.end);
35 }
36
37 bool operator!=(const MapIntervalBase& rhs) const {
38 return !operator==(rhs);
39 }
40 21
41 void MarkAsRegistered(const bool registered) { 22 /*implicit*/ MapInterval(VAddr start_) noexcept : start{start_} {}
42 is_registered = registered;
43 }
44 23
45 bool IsRegistered() const { 24 explicit MapInterval(VAddr start_, VAddr end_, GPUVAddr gpu_addr_) noexcept
46 return is_registered; 25 : start{start_}, end{end_}, gpu_addr{gpu_addr_} {}
47 }
48 26
49 void SetMemoryMarked(bool is_memory_marked_) { 27 bool IsInside(VAddr other_start, VAddr other_end) const noexcept {
50 is_memory_marked = is_memory_marked_; 28 return start <= other_start && other_end <= end;
51 } 29 }
52 30
53 bool IsMemoryMarked() const { 31 bool Overlaps(VAddr other_start, VAddr other_end) const noexcept {
54 return is_memory_marked; 32 return start < other_end && other_start < end;
55 } 33 }
56 34
57 void SetSyncPending(bool is_sync_pending_) { 35 void MarkAsModified(bool is_modified_, u64 ticks_) noexcept {
58 is_sync_pending = is_sync_pending_; 36 is_modified = is_modified_;
59 } 37 ticks = ticks_;
38 }
39
40 boost::intrusive::set_member_hook<> member_hook_;
41 VAddr start = 0;
42 VAddr end = 0;
43 GPUVAddr gpu_addr = 0;
44 u64 ticks = 0;
45 bool is_written = false;
46 bool is_modified = false;
47 bool is_registered = false;
48 bool is_memory_marked = false;
49 bool is_sync_pending = false;
50};
60 51
61 bool IsSyncPending() const { 52struct MapIntervalCompare {
62 return is_sync_pending; 53 constexpr bool operator()(const MapInterval& lhs, const MapInterval& rhs) const noexcept {
54 return lhs.start < rhs.start;
63 } 55 }
56};
64 57
65 VAddr GetStart() const { 58class MapIntervalAllocator {
66 return start; 59public:
67 } 60 MapIntervalAllocator();
61 ~MapIntervalAllocator();
68 62
69 VAddr GetEnd() const { 63 MapInterval* Allocate() {
70 return end; 64 if (free_list.empty()) {
65 AllocateNewChunk();
66 }
67 MapInterval* const interval = free_list.back();
68 free_list.pop_back();
69 return interval;
71 } 70 }
72 71
73 void MarkAsModified(const bool is_modified_, const u64 tick) { 72 void Release(MapInterval* interval) {
74 is_modified = is_modified_; 73 free_list.push_back(interval);
75 ticks = tick;
76 } 74 }
77 75
78 bool IsModified() const { 76private:
79 return is_modified; 77 struct Chunk {
80 } 78 std::unique_ptr<Chunk> next;
79 std::array<MapInterval, 0x8000> data;
80 };
81 81
82 u64 GetModificationTick() const { 82 void AllocateNewChunk();
83 return ticks;
84 }
85 83
86 void MarkAsWritten(const bool is_written_) { 84 void FillFreeList(Chunk& chunk);
87 is_written = is_written_;
88 }
89 85
90 bool IsWritten() const { 86 std::vector<MapInterval*> free_list;
91 return is_written; 87 std::unique_ptr<Chunk>* new_chunk = &first_chunk.next;
92 }
93 88
94private: 89 Chunk first_chunk;
95 VAddr start;
96 VAddr end;
97 GPUVAddr gpu_addr;
98 VAddr cpu_addr{};
99 bool is_written{};
100 bool is_modified{};
101 bool is_registered{};
102 bool is_memory_marked{};
103 bool is_sync_pending{};
104 u64 ticks{};
105}; 90};
106 91
107} // namespace VideoCommon 92} // namespace VideoCommon
diff --git a/src/video_core/rasterizer_cache.h b/src/video_core/rasterizer_cache.h
index 22987751e..096ee337c 100644
--- a/src/video_core/rasterizer_cache.h
+++ b/src/video_core/rasterizer_cache.h
@@ -56,9 +56,27 @@ public:
56 last_modified_ticks = cache.GetModifiedTicks(); 56 last_modified_ticks = cache.GetModifiedTicks();
57 } 57 }
58 58
59 void SetMemoryMarked(bool is_memory_marked_) {
60 is_memory_marked = is_memory_marked_;
61 }
62
63 bool IsMemoryMarked() const {
64 return is_memory_marked;
65 }
66
67 void SetSyncPending(bool is_sync_pending_) {
68 is_sync_pending = is_sync_pending_;
69 }
70
71 bool IsSyncPending() const {
72 return is_sync_pending;
73 }
74
59private: 75private:
60 bool is_registered{}; ///< Whether the object is currently registered with the cache 76 bool is_registered{}; ///< Whether the object is currently registered with the cache
61 bool is_dirty{}; ///< Whether the object is dirty (out of sync with guest memory) 77 bool is_dirty{}; ///< Whether the object is dirty (out of sync with guest memory)
78 bool is_memory_marked{}; ///< Whether the object is marking rasterizer memory.
79 bool is_sync_pending{}; ///< Whether the object is pending deletion.
62 u64 last_modified_ticks{}; ///< When the object was last modified, used for in-order flushing 80 u64 last_modified_ticks{}; ///< When the object was last modified, used for in-order flushing
63 VAddr cpu_addr{}; ///< Cpu address memory, unique from emulated virtual address space 81 VAddr cpu_addr{}; ///< Cpu address memory, unique from emulated virtual address space
64}; 82};
@@ -94,6 +112,30 @@ public:
94 } 112 }
95 } 113 }
96 114
115 void OnCPUWrite(VAddr addr, std::size_t size) {
116 std::lock_guard lock{mutex};
117
118 for (const auto& object : GetSortedObjectsFromRegion(addr, size)) {
119 if (object->IsRegistered()) {
120 UnmarkMemory(object);
121 object->SetSyncPending(true);
122 marked_for_unregister.emplace_back(object);
123 }
124 }
125 }
126
127 void SyncGuestHost() {
128 std::lock_guard lock{mutex};
129
130 for (const auto& object : marked_for_unregister) {
131 if (object->IsRegistered()) {
132 object->SetSyncPending(false);
133 Unregister(object);
134 }
135 }
136 marked_for_unregister.clear();
137 }
138
97 /// Invalidates everything in the cache 139 /// Invalidates everything in the cache
98 void InvalidateAll() { 140 void InvalidateAll() {
99 std::lock_guard lock{mutex}; 141 std::lock_guard lock{mutex};
@@ -120,19 +162,32 @@ protected:
120 interval_cache.add({GetInterval(object), ObjectSet{object}}); 162 interval_cache.add({GetInterval(object), ObjectSet{object}});
121 map_cache.insert({object->GetCpuAddr(), object}); 163 map_cache.insert({object->GetCpuAddr(), object});
122 rasterizer.UpdatePagesCachedCount(object->GetCpuAddr(), object->GetSizeInBytes(), 1); 164 rasterizer.UpdatePagesCachedCount(object->GetCpuAddr(), object->GetSizeInBytes(), 1);
165 object->SetMemoryMarked(true);
123 } 166 }
124 167
125 /// Unregisters an object from the cache 168 /// Unregisters an object from the cache
126 virtual void Unregister(const T& object) { 169 virtual void Unregister(const T& object) {
127 std::lock_guard lock{mutex}; 170 std::lock_guard lock{mutex};
128 171
172 UnmarkMemory(object);
129 object->SetIsRegistered(false); 173 object->SetIsRegistered(false);
130 rasterizer.UpdatePagesCachedCount(object->GetCpuAddr(), object->GetSizeInBytes(), -1); 174 if (object->IsSyncPending()) {
175 marked_for_unregister.remove(object);
176 object->SetSyncPending(false);
177 }
131 const VAddr addr = object->GetCpuAddr(); 178 const VAddr addr = object->GetCpuAddr();
132 interval_cache.subtract({GetInterval(object), ObjectSet{object}}); 179 interval_cache.subtract({GetInterval(object), ObjectSet{object}});
133 map_cache.erase(addr); 180 map_cache.erase(addr);
134 } 181 }
135 182
183 void UnmarkMemory(const T& object) {
184 if (!object->IsMemoryMarked()) {
185 return;
186 }
187 rasterizer.UpdatePagesCachedCount(object->GetCpuAddr(), object->GetSizeInBytes(), -1);
188 object->SetMemoryMarked(false);
189 }
190
136 /// Returns a ticks counter used for tracking when cached objects were last modified 191 /// Returns a ticks counter used for tracking when cached objects were last modified
137 u64 GetModifiedTicks() { 192 u64 GetModifiedTicks() {
138 std::lock_guard lock{mutex}; 193 std::lock_guard lock{mutex};
@@ -194,4 +249,5 @@ private:
194 IntervalCache interval_cache; ///< Cache of objects 249 IntervalCache interval_cache; ///< Cache of objects
195 u64 modified_ticks{}; ///< Counter of cache state ticks, used for in-order flushing 250 u64 modified_ticks{}; ///< Counter of cache state ticks, used for in-order flushing
196 VideoCore::RasterizerInterface& rasterizer; 251 VideoCore::RasterizerInterface& rasterizer;
252 std::list<T> marked_for_unregister;
197}; 253};
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
index d2cab50bd..9964ea894 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
@@ -8,6 +8,7 @@
8 8
9#include "common/assert.h" 9#include "common/assert.h"
10#include "common/microprofile.h" 10#include "common/microprofile.h"
11#include "video_core/buffer_cache/buffer_cache.h"
11#include "video_core/engines/maxwell_3d.h" 12#include "video_core/engines/maxwell_3d.h"
12#include "video_core/rasterizer_interface.h" 13#include "video_core/rasterizer_interface.h"
13#include "video_core/renderer_opengl/gl_buffer_cache.h" 14#include "video_core/renderer_opengl/gl_buffer_cache.h"
diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp
index d83dca25a..466a911db 100644
--- a/src/video_core/renderer_opengl/gl_device.cpp
+++ b/src/video_core/renderer_opengl/gl_device.cpp
@@ -13,6 +13,7 @@
13 13
14#include "common/logging/log.h" 14#include "common/logging/log.h"
15#include "common/scope_exit.h" 15#include "common/scope_exit.h"
16#include "core/settings.h"
16#include "video_core/renderer_opengl/gl_device.h" 17#include "video_core/renderer_opengl/gl_device.h"
17#include "video_core/renderer_opengl/gl_resource_manager.h" 18#include "video_core/renderer_opengl/gl_resource_manager.h"
18 19
@@ -183,10 +184,16 @@ Device::Device() : base_bindings{BuildBaseBindings()} {
183 has_precise_bug = TestPreciseBug(); 184 has_precise_bug = TestPreciseBug();
184 has_broken_compute = is_intel_proprietary; 185 has_broken_compute = is_intel_proprietary;
185 has_fast_buffer_sub_data = is_nvidia; 186 has_fast_buffer_sub_data = is_nvidia;
187 use_assembly_shaders = Settings::values.use_assembly_shaders && GLAD_GL_NV_gpu_program5 &&
188 GLAD_GL_NV_compute_program5;
186 189
187 LOG_INFO(Render_OpenGL, "Renderer_VariableAOFFI: {}", has_variable_aoffi); 190 LOG_INFO(Render_OpenGL, "Renderer_VariableAOFFI: {}", has_variable_aoffi);
188 LOG_INFO(Render_OpenGL, "Renderer_ComponentIndexingBug: {}", has_component_indexing_bug); 191 LOG_INFO(Render_OpenGL, "Renderer_ComponentIndexingBug: {}", has_component_indexing_bug);
189 LOG_INFO(Render_OpenGL, "Renderer_PreciseBug: {}", has_precise_bug); 192 LOG_INFO(Render_OpenGL, "Renderer_PreciseBug: {}", has_precise_bug);
193
194 if (Settings::values.use_assembly_shaders && !use_assembly_shaders) {
195 LOG_ERROR(Render_OpenGL, "Assembly shaders enabled but not supported");
196 }
190} 197}
191 198
192Device::Device(std::nullptr_t) { 199Device::Device(std::nullptr_t) {
diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h
index a55050cb5..e915dbd86 100644
--- a/src/video_core/renderer_opengl/gl_device.h
+++ b/src/video_core/renderer_opengl/gl_device.h
@@ -88,6 +88,10 @@ public:
88 return has_fast_buffer_sub_data; 88 return has_fast_buffer_sub_data;
89 } 89 }
90 90
91 bool UseAssemblyShaders() const {
92 return use_assembly_shaders;
93 }
94
91private: 95private:
92 static bool TestVariableAoffi(); 96 static bool TestVariableAoffi();
93 static bool TestPreciseBug(); 97 static bool TestPreciseBug();
@@ -107,6 +111,7 @@ private:
107 bool has_precise_bug{}; 111 bool has_precise_bug{};
108 bool has_broken_compute{}; 112 bool has_broken_compute{};
109 bool has_fast_buffer_sub_data{}; 113 bool has_fast_buffer_sub_data{};
114 bool use_assembly_shaders{};
110}; 115};
111 116
112} // namespace OpenGL 117} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_fence_manager.cpp b/src/video_core/renderer_opengl/gl_fence_manager.cpp
index 99ddcb3f8..ec5421afa 100644
--- a/src/video_core/renderer_opengl/gl_fence_manager.cpp
+++ b/src/video_core/renderer_opengl/gl_fence_manager.cpp
@@ -4,6 +4,7 @@
4 4
5#include "common/assert.h" 5#include "common/assert.h"
6 6
7#include "video_core/renderer_opengl/gl_buffer_cache.h"
7#include "video_core/renderer_opengl/gl_fence_manager.h" 8#include "video_core/renderer_opengl/gl_fence_manager.h"
8 9
9namespace OpenGL { 10namespace OpenGL {
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 69dcf952f..8116a5daa 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -94,17 +94,30 @@ void oglEnable(GLenum cap, bool state) {
94} // Anonymous namespace 94} // Anonymous namespace
95 95
96RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window, 96RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window,
97 ScreenInfo& info, GLShader::ProgramManager& program_manager, 97 const Device& device, ScreenInfo& info,
98 StateTracker& state_tracker) 98 ProgramManager& program_manager, StateTracker& state_tracker)
99 : RasterizerAccelerated{system.Memory()}, texture_cache{system, *this, device, state_tracker}, 99 : RasterizerAccelerated{system.Memory()}, device{device}, texture_cache{system, *this, device,
100 state_tracker},
100 shader_cache{*this, system, emu_window, device}, query_cache{system, *this}, 101 shader_cache{*this, system, emu_window, device}, query_cache{system, *this},
101 buffer_cache{*this, system, device, STREAM_BUFFER_SIZE}, 102 buffer_cache{*this, system, device, STREAM_BUFFER_SIZE},
102 fence_manager{system, *this, texture_cache, buffer_cache, query_cache}, system{system}, 103 fence_manager{system, *this, texture_cache, buffer_cache, query_cache}, system{system},
103 screen_info{info}, program_manager{program_manager}, state_tracker{state_tracker} { 104 screen_info{info}, program_manager{program_manager}, state_tracker{state_tracker} {
104 CheckExtensions(); 105 CheckExtensions();
106
107 if (device.UseAssemblyShaders()) {
108 glCreateBuffers(static_cast<GLsizei>(staging_cbufs.size()), staging_cbufs.data());
109 for (const GLuint cbuf : staging_cbufs) {
110 glNamedBufferStorage(cbuf, static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize),
111 nullptr, 0);
112 }
113 }
105} 114}
106 115
107RasterizerOpenGL::~RasterizerOpenGL() {} 116RasterizerOpenGL::~RasterizerOpenGL() {
117 if (device.UseAssemblyShaders()) {
118 glDeleteBuffers(static_cast<GLsizei>(staging_cbufs.size()), staging_cbufs.data());
119 }
120}
108 121
109void RasterizerOpenGL::CheckExtensions() { 122void RasterizerOpenGL::CheckExtensions() {
110 if (!GLAD_GL_ARB_texture_filter_anisotropic && !GLAD_GL_EXT_texture_filter_anisotropic) { 123 if (!GLAD_GL_ARB_texture_filter_anisotropic && !GLAD_GL_EXT_texture_filter_anisotropic) {
@@ -230,6 +243,7 @@ GLintptr RasterizerOpenGL::SetupIndexBuffer() {
230void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { 243void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
231 MICROPROFILE_SCOPE(OpenGL_Shader); 244 MICROPROFILE_SCOPE(OpenGL_Shader);
232 auto& gpu = system.GPU().Maxwell3D(); 245 auto& gpu = system.GPU().Maxwell3D();
246 std::size_t num_ssbos = 0;
233 u32 clip_distances = 0; 247 u32 clip_distances = 0;
234 248
235 for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) { 249 for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) {
@@ -261,6 +275,14 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
261 275
262 Shader shader{shader_cache.GetStageProgram(program)}; 276 Shader shader{shader_cache.GetStageProgram(program)};
263 277
278 if (device.UseAssemblyShaders()) {
279 // Check for ARB limitation. We only have 16 SSBOs per context state. To workaround this
280 // all stages share the same bindings.
281 const std::size_t num_stage_ssbos = shader->GetEntries().global_memory_entries.size();
282 ASSERT_MSG(num_stage_ssbos == 0 || num_ssbos == 0, "SSBOs on more than one stage");
283 num_ssbos += num_stage_ssbos;
284 }
285
264 // Stage indices are 0 - 5 286 // Stage indices are 0 - 5
265 const std::size_t stage = index == 0 ? 0 : index - 1; 287 const std::size_t stage = index == 0 ? 0 : index - 1;
266 SetupDrawConstBuffers(stage, shader); 288 SetupDrawConstBuffers(stage, shader);
@@ -526,6 +548,7 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
526 SyncFramebufferSRGB(); 548 SyncFramebufferSRGB();
527 549
528 buffer_cache.Acquire(); 550 buffer_cache.Acquire();
551 current_cbuf = 0;
529 552
530 std::size_t buffer_size = CalculateVertexArraysSize(); 553 std::size_t buffer_size = CalculateVertexArraysSize();
531 554
@@ -535,9 +558,9 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
535 } 558 }
536 559
537 // Uniform space for the 5 shader stages 560 // Uniform space for the 5 shader stages
538 buffer_size = Common::AlignUp<std::size_t>(buffer_size, 4) + 561 buffer_size =
539 (sizeof(GLShader::MaxwellUniformData) + device.GetUniformBufferAlignment()) * 562 Common::AlignUp<std::size_t>(buffer_size, 4) +
540 Maxwell::MaxShaderStage; 563 (sizeof(MaxwellUniformData) + device.GetUniformBufferAlignment()) * Maxwell::MaxShaderStage;
541 564
542 // Add space for at least 18 constant buffers 565 // Add space for at least 18 constant buffers
543 buffer_size += Maxwell::MaxConstBuffers * 566 buffer_size += Maxwell::MaxConstBuffers *
@@ -558,12 +581,14 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
558 } 581 }
559 582
560 // Setup emulation uniform buffer. 583 // Setup emulation uniform buffer.
561 GLShader::MaxwellUniformData ubo; 584 if (!device.UseAssemblyShaders()) {
562 ubo.SetFromRegs(gpu); 585 MaxwellUniformData ubo;
563 const auto [buffer, offset] = 586 ubo.SetFromRegs(gpu);
564 buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment()); 587 const auto [buffer, offset] =
565 glBindBufferRange(GL_UNIFORM_BUFFER, EmulationUniformBlockBinding, buffer, offset, 588 buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment());
566 static_cast<GLsizeiptr>(sizeof(ubo))); 589 glBindBufferRange(GL_UNIFORM_BUFFER, EmulationUniformBlockBinding, buffer, offset,
590 static_cast<GLsizeiptr>(sizeof(ubo)));
591 }
567 592
568 // Setup shaders and their used resources. 593 // Setup shaders and their used resources.
569 texture_cache.GuardSamplers(true); 594 texture_cache.GuardSamplers(true);
@@ -635,11 +660,11 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
635 } 660 }
636 661
637 buffer_cache.Acquire(); 662 buffer_cache.Acquire();
663 current_cbuf = 0;
638 664
639 auto kernel = shader_cache.GetComputeKernel(code_addr); 665 auto kernel = shader_cache.GetComputeKernel(code_addr);
640 SetupComputeTextures(kernel); 666 SetupComputeTextures(kernel);
641 SetupComputeImages(kernel); 667 SetupComputeImages(kernel);
642 program_manager.BindComputeShader(kernel->GetHandle());
643 668
644 const std::size_t buffer_size = 669 const std::size_t buffer_size =
645 Tegra::Engines::KeplerCompute::NumConstBuffers * 670 Tegra::Engines::KeplerCompute::NumConstBuffers *
@@ -652,6 +677,7 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
652 buffer_cache.Unmap(); 677 buffer_cache.Unmap();
653 678
654 const auto& launch_desc = system.GPU().KeplerCompute().launch_description; 679 const auto& launch_desc = system.GPU().KeplerCompute().launch_description;
680 program_manager.BindCompute(kernel->GetHandle());
655 glDispatchCompute(launch_desc.grid_dim_x, launch_desc.grid_dim_y, launch_desc.grid_dim_z); 681 glDispatchCompute(launch_desc.grid_dim_x, launch_desc.grid_dim_y, launch_desc.grid_dim_z);
656 ++num_queued_commands; 682 ++num_queued_commands;
657} 683}
@@ -701,15 +727,15 @@ void RasterizerOpenGL::OnCPUWrite(VAddr addr, u64 size) {
701 return; 727 return;
702 } 728 }
703 texture_cache.OnCPUWrite(addr, size); 729 texture_cache.OnCPUWrite(addr, size);
704 shader_cache.InvalidateRegion(addr, size); 730 shader_cache.OnCPUWrite(addr, size);
705 buffer_cache.OnCPUWrite(addr, size); 731 buffer_cache.OnCPUWrite(addr, size);
706 query_cache.InvalidateRegion(addr, size);
707} 732}
708 733
709void RasterizerOpenGL::SyncGuestHost() { 734void RasterizerOpenGL::SyncGuestHost() {
710 MICROPROFILE_SCOPE(OpenGL_CacheManagement); 735 MICROPROFILE_SCOPE(OpenGL_CacheManagement);
711 texture_cache.SyncGuestHost(); 736 texture_cache.SyncGuestHost();
712 buffer_cache.SyncGuestHost(); 737 buffer_cache.SyncGuestHost();
738 shader_cache.SyncGuestHost();
713} 739}
714 740
715void RasterizerOpenGL::SignalSemaphore(GPUVAddr addr, u32 value) { 741void RasterizerOpenGL::SignalSemaphore(GPUVAddr addr, u32 value) {
@@ -812,14 +838,20 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config,
812} 838}
813 839
814void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, const Shader& shader) { 840void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, const Shader& shader) {
841 static constexpr std::array PARAMETER_LUT = {
842 GL_VERTEX_PROGRAM_PARAMETER_BUFFER_NV, GL_TESS_CONTROL_PROGRAM_PARAMETER_BUFFER_NV,
843 GL_TESS_EVALUATION_PROGRAM_PARAMETER_BUFFER_NV, GL_GEOMETRY_PROGRAM_PARAMETER_BUFFER_NV,
844 GL_FRAGMENT_PROGRAM_PARAMETER_BUFFER_NV};
845
815 MICROPROFILE_SCOPE(OpenGL_UBO); 846 MICROPROFILE_SCOPE(OpenGL_UBO);
816 const auto& stages = system.GPU().Maxwell3D().state.shader_stages; 847 const auto& stages = system.GPU().Maxwell3D().state.shader_stages;
817 const auto& shader_stage = stages[stage_index]; 848 const auto& shader_stage = stages[stage_index];
818 849
819 u32 binding = device.GetBaseBindings(stage_index).uniform_buffer; 850 u32 binding =
851 device.UseAssemblyShaders() ? 0 : device.GetBaseBindings(stage_index).uniform_buffer;
820 for (const auto& entry : shader->GetEntries().const_buffers) { 852 for (const auto& entry : shader->GetEntries().const_buffers) {
821 const auto& buffer = shader_stage.const_buffers[entry.GetIndex()]; 853 const auto& buffer = shader_stage.const_buffers[entry.GetIndex()];
822 SetupConstBuffer(binding++, buffer, entry); 854 SetupConstBuffer(PARAMETER_LUT[stage_index], binding++, buffer, entry);
823 } 855 }
824} 856}
825 857
@@ -835,16 +867,21 @@ void RasterizerOpenGL::SetupComputeConstBuffers(const Shader& kernel) {
835 buffer.address = config.Address(); 867 buffer.address = config.Address();
836 buffer.size = config.size; 868 buffer.size = config.size;
837 buffer.enabled = mask[entry.GetIndex()]; 869 buffer.enabled = mask[entry.GetIndex()];
838 SetupConstBuffer(binding++, buffer, entry); 870 SetupConstBuffer(GL_COMPUTE_PROGRAM_PARAMETER_BUFFER_NV, binding++, buffer, entry);
839 } 871 }
840} 872}
841 873
842void RasterizerOpenGL::SetupConstBuffer(u32 binding, const Tegra::Engines::ConstBufferInfo& buffer, 874void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding,
875 const Tegra::Engines::ConstBufferInfo& buffer,
843 const ConstBufferEntry& entry) { 876 const ConstBufferEntry& entry) {
844 if (!buffer.enabled) { 877 if (!buffer.enabled) {
845 // Set values to zero to unbind buffers 878 // Set values to zero to unbind buffers
846 glBindBufferRange(GL_UNIFORM_BUFFER, binding, buffer_cache.GetEmptyBuffer(sizeof(float)), 0, 879 if (device.UseAssemblyShaders()) {
847 sizeof(float)); 880 glBindBufferRangeNV(stage, entry.GetIndex(), 0, 0, 0);
881 } else {
882 glBindBufferRange(GL_UNIFORM_BUFFER, binding,
883 buffer_cache.GetEmptyBuffer(sizeof(float)), 0, sizeof(float));
884 }
848 return; 885 return;
849 } 886 }
850 887
@@ -853,9 +890,19 @@ void RasterizerOpenGL::SetupConstBuffer(u32 binding, const Tegra::Engines::Const
853 const std::size_t size = Common::AlignUp(GetConstBufferSize(buffer, entry), sizeof(GLvec4)); 890 const std::size_t size = Common::AlignUp(GetConstBufferSize(buffer, entry), sizeof(GLvec4));
854 891
855 const auto alignment = device.GetUniformBufferAlignment(); 892 const auto alignment = device.GetUniformBufferAlignment();
856 const auto [cbuf, offset] = buffer_cache.UploadMemory(buffer.address, size, alignment, false, 893 auto [cbuf, offset] = buffer_cache.UploadMemory(buffer.address, size, alignment, false,
857 device.HasFastBufferSubData()); 894 device.HasFastBufferSubData());
858 glBindBufferRange(GL_UNIFORM_BUFFER, binding, cbuf, offset, size); 895 if (!device.UseAssemblyShaders()) {
896 glBindBufferRange(GL_UNIFORM_BUFFER, binding, cbuf, offset, size);
897 return;
898 }
899 if (offset != 0) {
900 const GLuint staging_cbuf = staging_cbufs[current_cbuf++];
901 glCopyNamedBufferSubData(cbuf, staging_cbuf, offset, 0, size);
902 cbuf = staging_cbuf;
903 offset = 0;
904 }
905 glBindBufferRangeNV(stage, binding, cbuf, offset, size);
859} 906}
860 907
861void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader) { 908void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader) {
@@ -863,7 +910,8 @@ void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shad
863 auto& memory_manager{gpu.MemoryManager()}; 910 auto& memory_manager{gpu.MemoryManager()};
864 const auto cbufs{gpu.Maxwell3D().state.shader_stages[stage_index]}; 911 const auto cbufs{gpu.Maxwell3D().state.shader_stages[stage_index]};
865 912
866 u32 binding = device.GetBaseBindings(stage_index).shader_storage_buffer; 913 u32 binding =
914 device.UseAssemblyShaders() ? 0 : device.GetBaseBindings(stage_index).shader_storage_buffer;
867 for (const auto& entry : shader->GetEntries().global_memory_entries) { 915 for (const auto& entry : shader->GetEntries().global_memory_entries) {
868 const GPUVAddr addr{cbufs.const_buffers[entry.cbuf_index].address + entry.cbuf_offset}; 916 const GPUVAddr addr{cbufs.const_buffers[entry.cbuf_index].address + entry.cbuf_offset};
869 const GPUVAddr gpu_addr{memory_manager.Read<u64>(addr)}; 917 const GPUVAddr gpu_addr{memory_manager.Read<u64>(addr)};
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index b94c65907..87f7fe159 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -56,8 +56,8 @@ struct DrawParameters;
56class RasterizerOpenGL : public VideoCore::RasterizerAccelerated { 56class RasterizerOpenGL : public VideoCore::RasterizerAccelerated {
57public: 57public:
58 explicit RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window, 58 explicit RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window,
59 ScreenInfo& info, GLShader::ProgramManager& program_manager, 59 const Device& device, ScreenInfo& info,
60 StateTracker& state_tracker); 60 ProgramManager& program_manager, StateTracker& state_tracker);
61 ~RasterizerOpenGL() override; 61 ~RasterizerOpenGL() override;
62 62
63 void Draw(bool is_indexed, bool is_instanced) override; 63 void Draw(bool is_indexed, bool is_instanced) override;
@@ -106,7 +106,7 @@ private:
106 void SetupComputeConstBuffers(const Shader& kernel); 106 void SetupComputeConstBuffers(const Shader& kernel);
107 107
108 /// Configures a constant buffer. 108 /// Configures a constant buffer.
109 void SetupConstBuffer(u32 binding, const Tegra::Engines::ConstBufferInfo& buffer, 109 void SetupConstBuffer(GLenum stage, u32 binding, const Tegra::Engines::ConstBufferInfo& buffer,
110 const ConstBufferEntry& entry); 110 const ConstBufferEntry& entry);
111 111
112 /// Configures the current global memory entries to use for the draw command. 112 /// Configures the current global memory entries to use for the draw command.
@@ -224,7 +224,7 @@ private:
224 224
225 void SetupShaders(GLenum primitive_mode); 225 void SetupShaders(GLenum primitive_mode);
226 226
227 const Device device; 227 const Device& device;
228 228
229 TextureCacheOpenGL texture_cache; 229 TextureCacheOpenGL texture_cache;
230 ShaderCacheOpenGL shader_cache; 230 ShaderCacheOpenGL shader_cache;
@@ -236,7 +236,7 @@ private:
236 236
237 Core::System& system; 237 Core::System& system;
238 ScreenInfo& screen_info; 238 ScreenInfo& screen_info;
239 GLShader::ProgramManager& program_manager; 239 ProgramManager& program_manager;
240 StateTracker& state_tracker; 240 StateTracker& state_tracker;
241 241
242 static constexpr std::size_t STREAM_BUFFER_SIZE = 128 * 1024 * 1024; 242 static constexpr std::size_t STREAM_BUFFER_SIZE = 128 * 1024 * 1024;
@@ -248,6 +248,12 @@ private:
248 std::bitset<Tegra::Engines::Maxwell3D::Regs::NumTransformFeedbackBuffers> 248 std::bitset<Tegra::Engines::Maxwell3D::Regs::NumTransformFeedbackBuffers>
249 enabled_transform_feedback_buffers; 249 enabled_transform_feedback_buffers;
250 250
251 static constexpr std::size_t NUM_CONSTANT_BUFFERS =
252 Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers *
253 Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram;
254 std::array<GLuint, NUM_CONSTANT_BUFFERS> staging_cbufs{};
255 std::size_t current_cbuf = 0;
256
251 /// Number of commands queued to the OpenGL driver. Reseted on flush. 257 /// Number of commands queued to the OpenGL driver. Reseted on flush.
252 std::size_t num_queued_commands = 0; 258 std::size_t num_queued_commands = 0;
253 259
diff --git a/src/video_core/renderer_opengl/gl_resource_manager.cpp b/src/video_core/renderer_opengl/gl_resource_manager.cpp
index 97803d480..a787e27d2 100644
--- a/src/video_core/renderer_opengl/gl_resource_manager.cpp
+++ b/src/video_core/renderer_opengl/gl_resource_manager.cpp
@@ -125,6 +125,15 @@ void OGLProgram::Release() {
125 handle = 0; 125 handle = 0;
126} 126}
127 127
128void OGLAssemblyProgram::Release() {
129 if (handle == 0) {
130 return;
131 }
132 MICROPROFILE_SCOPE(OpenGL_ResourceDeletion);
133 glDeleteProgramsARB(1, &handle);
134 handle = 0;
135}
136
128void OGLPipeline::Create() { 137void OGLPipeline::Create() {
129 if (handle != 0) 138 if (handle != 0)
130 return; 139 return;
diff --git a/src/video_core/renderer_opengl/gl_resource_manager.h b/src/video_core/renderer_opengl/gl_resource_manager.h
index de93f4212..f8b322227 100644
--- a/src/video_core/renderer_opengl/gl_resource_manager.h
+++ b/src/video_core/renderer_opengl/gl_resource_manager.h
@@ -167,6 +167,22 @@ public:
167 GLuint handle = 0; 167 GLuint handle = 0;
168}; 168};
169 169
170class OGLAssemblyProgram : private NonCopyable {
171public:
172 OGLAssemblyProgram() = default;
173
174 OGLAssemblyProgram(OGLAssemblyProgram&& o) noexcept : handle(std::exchange(o.handle, 0)) {}
175
176 ~OGLAssemblyProgram() {
177 Release();
178 }
179
180 /// Deletes the internal OpenGL resource
181 void Release();
182
183 GLuint handle = 0;
184};
185
170class OGLPipeline : private NonCopyable { 186class OGLPipeline : private NonCopyable {
171public: 187public:
172 OGLPipeline() = default; 188 OGLPipeline() = default;
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp
index 9759a7078..4cd0f36cf 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -97,6 +97,24 @@ constexpr ShaderType GetShaderType(Maxwell::ShaderProgram program_type) {
97 return {}; 97 return {};
98} 98}
99 99
100constexpr GLenum AssemblyEnum(ShaderType shader_type) {
101 switch (shader_type) {
102 case ShaderType::Vertex:
103 return GL_VERTEX_PROGRAM_NV;
104 case ShaderType::TesselationControl:
105 return GL_TESS_CONTROL_PROGRAM_NV;
106 case ShaderType::TesselationEval:
107 return GL_TESS_EVALUATION_PROGRAM_NV;
108 case ShaderType::Geometry:
109 return GL_GEOMETRY_PROGRAM_NV;
110 case ShaderType::Fragment:
111 return GL_FRAGMENT_PROGRAM_NV;
112 case ShaderType::Compute:
113 return GL_COMPUTE_PROGRAM_NV;
114 }
115 return {};
116}
117
100std::string MakeShaderID(u64 unique_identifier, ShaderType shader_type) { 118std::string MakeShaderID(u64 unique_identifier, ShaderType shader_type) {
101 return fmt::format("{}{:016X}", GetShaderTypeName(shader_type), unique_identifier); 119 return fmt::format("{}{:016X}", GetShaderTypeName(shader_type), unique_identifier);
102} 120}
@@ -120,18 +138,43 @@ std::shared_ptr<Registry> MakeRegistry(const ShaderDiskCacheEntry& entry) {
120 return registry; 138 return registry;
121} 139}
122 140
123std::shared_ptr<OGLProgram> BuildShader(const Device& device, ShaderType shader_type, 141ProgramSharedPtr BuildShader(const Device& device, ShaderType shader_type, u64 unique_identifier,
124 u64 unique_identifier, const ShaderIR& ir, 142 const ShaderIR& ir, const Registry& registry,
125 const Registry& registry, bool hint_retrievable = false) { 143 bool hint_retrievable = false) {
126 const std::string shader_id = MakeShaderID(unique_identifier, shader_type); 144 const std::string shader_id = MakeShaderID(unique_identifier, shader_type);
127 LOG_INFO(Render_OpenGL, "{}", shader_id); 145 LOG_INFO(Render_OpenGL, "{}", shader_id);
128 146
129 const std::string glsl = DecompileShader(device, ir, registry, shader_type, shader_id); 147 auto program = std::make_shared<ProgramHandle>();
130 OGLShader shader; 148
131 shader.Create(glsl.c_str(), GetGLShaderType(shader_type)); 149 if (device.UseAssemblyShaders()) {
150 const std::string arb = "Not implemented";
151
152 GLuint& arb_prog = program->assembly_program.handle;
153
154// Commented out functions signal OpenGL errors but are compatible with apitrace.
155// Use them only to capture and replay on apitrace.
156#if 0
157 glGenProgramsNV(1, &arb_prog);
158 glLoadProgramNV(AssemblyEnum(shader_type), arb_prog, static_cast<GLsizei>(arb.size()),
159 reinterpret_cast<const GLubyte*>(arb.data()));
160#else
161 glGenProgramsARB(1, &arb_prog);
162 glNamedProgramStringEXT(arb_prog, AssemblyEnum(shader_type), GL_PROGRAM_FORMAT_ASCII_ARB,
163 static_cast<GLsizei>(arb.size()), arb.data());
164#endif
165 const auto err = reinterpret_cast<const char*>(glGetString(GL_PROGRAM_ERROR_STRING_NV));
166 if (err && *err) {
167 LOG_CRITICAL(Render_OpenGL, "{}", err);
168 LOG_INFO(Render_OpenGL, "\n{}", arb);
169 }
170 } else {
171 const std::string glsl = DecompileShader(device, ir, registry, shader_type, shader_id);
172 OGLShader shader;
173 shader.Create(glsl.c_str(), GetGLShaderType(shader_type));
174
175 program->source_program.Create(true, hint_retrievable, shader.handle);
176 }
132 177
133 auto program = std::make_shared<OGLProgram>();
134 program->Create(true, hint_retrievable, shader.handle);
135 return program; 178 return program;
136} 179}
137 180
@@ -153,15 +196,22 @@ std::unordered_set<GLenum> GetSupportedFormats() {
153 196
154CachedShader::CachedShader(VAddr cpu_addr, std::size_t size_in_bytes, 197CachedShader::CachedShader(VAddr cpu_addr, std::size_t size_in_bytes,
155 std::shared_ptr<VideoCommon::Shader::Registry> registry, 198 std::shared_ptr<VideoCommon::Shader::Registry> registry,
156 ShaderEntries entries, std::shared_ptr<OGLProgram> program) 199 ShaderEntries entries, ProgramSharedPtr program_)
157 : RasterizerCacheObject{cpu_addr}, registry{std::move(registry)}, entries{std::move(entries)}, 200 : RasterizerCacheObject{cpu_addr}, registry{std::move(registry)}, entries{std::move(entries)},
158 size_in_bytes{size_in_bytes}, program{std::move(program)} {} 201 size_in_bytes{size_in_bytes}, program{std::move(program_)} {
202 // Assign either the assembly program or source program. We can't have both.
203 handle = program->assembly_program.handle;
204 if (handle == 0) {
205 handle = program->source_program.handle;
206 }
207 ASSERT(handle != 0);
208}
159 209
160CachedShader::~CachedShader() = default; 210CachedShader::~CachedShader() = default;
161 211
162GLuint CachedShader::GetHandle() const { 212GLuint CachedShader::GetHandle() const {
163 DEBUG_ASSERT(registry->IsConsistent()); 213 DEBUG_ASSERT(registry->IsConsistent());
164 return program->handle; 214 return handle;
165} 215}
166 216
167Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params, 217Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params,
@@ -239,7 +289,11 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
239 return; 289 return;
240 } 290 }
241 291
242 const std::vector gl_cache = disk_cache.LoadPrecompiled(); 292 std::vector<ShaderDiskCachePrecompiled> gl_cache;
293 if (!device.UseAssemblyShaders()) {
294 // Only load precompiled cache when we are not using assembly shaders
295 gl_cache = disk_cache.LoadPrecompiled();
296 }
243 const auto supported_formats = GetSupportedFormats(); 297 const auto supported_formats = GetSupportedFormats();
244 298
245 // Track if precompiled cache was altered during loading to know if we have to 299 // Track if precompiled cache was altered during loading to know if we have to
@@ -278,7 +332,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
278 auto registry = MakeRegistry(entry); 332 auto registry = MakeRegistry(entry);
279 const ShaderIR ir(entry.code, main_offset, COMPILER_SETTINGS, *registry); 333 const ShaderIR ir(entry.code, main_offset, COMPILER_SETTINGS, *registry);
280 334
281 std::shared_ptr<OGLProgram> program; 335 ProgramSharedPtr program;
282 if (precompiled_entry) { 336 if (precompiled_entry) {
283 // If the shader is precompiled, attempt to load it with 337 // If the shader is precompiled, attempt to load it with
284 program = GeneratePrecompiledProgram(entry, *precompiled_entry, supported_formats); 338 program = GeneratePrecompiledProgram(entry, *precompiled_entry, supported_formats);
@@ -332,6 +386,11 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
332 return; 386 return;
333 } 387 }
334 388
389 if (device.UseAssemblyShaders()) {
390 // Don't store precompiled binaries for assembly shaders.
391 return;
392 }
393
335 // TODO(Rodrigo): Do state tracking for transferable shaders and do a dummy draw 394 // TODO(Rodrigo): Do state tracking for transferable shaders and do a dummy draw
336 // before precompiling them 395 // before precompiling them
337 396
@@ -339,7 +398,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
339 const u64 id = (*transferable)[i].unique_identifier; 398 const u64 id = (*transferable)[i].unique_identifier;
340 const auto it = find_precompiled(id); 399 const auto it = find_precompiled(id);
341 if (it == gl_cache.end()) { 400 if (it == gl_cache.end()) {
342 const GLuint program = runtime_cache.at(id).program->handle; 401 const GLuint program = runtime_cache.at(id).program->source_program.handle;
343 disk_cache.SavePrecompiled(id, program); 402 disk_cache.SavePrecompiled(id, program);
344 precompiled_cache_altered = true; 403 precompiled_cache_altered = true;
345 } 404 }
@@ -350,7 +409,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
350 } 409 }
351} 410}
352 411
353std::shared_ptr<OGLProgram> ShaderCacheOpenGL::GeneratePrecompiledProgram( 412ProgramSharedPtr ShaderCacheOpenGL::GeneratePrecompiledProgram(
354 const ShaderDiskCacheEntry& entry, const ShaderDiskCachePrecompiled& precompiled_entry, 413 const ShaderDiskCacheEntry& entry, const ShaderDiskCachePrecompiled& precompiled_entry,
355 const std::unordered_set<GLenum>& supported_formats) { 414 const std::unordered_set<GLenum>& supported_formats) {
356 if (supported_formats.find(precompiled_entry.binary_format) == supported_formats.end()) { 415 if (supported_formats.find(precompiled_entry.binary_format) == supported_formats.end()) {
@@ -358,15 +417,15 @@ std::shared_ptr<OGLProgram> ShaderCacheOpenGL::GeneratePrecompiledProgram(
358 return {}; 417 return {};
359 } 418 }
360 419
361 auto program = std::make_shared<OGLProgram>(); 420 auto program = std::make_shared<ProgramHandle>();
362 program->handle = glCreateProgram(); 421 GLuint& handle = program->source_program.handle;
363 glProgramParameteri(program->handle, GL_PROGRAM_SEPARABLE, GL_TRUE); 422 handle = glCreateProgram();
364 glProgramBinary(program->handle, precompiled_entry.binary_format, 423 glProgramParameteri(handle, GL_PROGRAM_SEPARABLE, GL_TRUE);
365 precompiled_entry.binary.data(), 424 glProgramBinary(handle, precompiled_entry.binary_format, precompiled_entry.binary.data(),
366 static_cast<GLsizei>(precompiled_entry.binary.size())); 425 static_cast<GLsizei>(precompiled_entry.binary.size()));
367 426
368 GLint link_status; 427 GLint link_status;
369 glGetProgramiv(program->handle, GL_LINK_STATUS, &link_status); 428 glGetProgramiv(handle, GL_LINK_STATUS, &link_status);
370 if (link_status == GL_FALSE) { 429 if (link_status == GL_FALSE) {
371 LOG_INFO(Render_OpenGL, "Precompiled cache rejected by the driver, removing"); 430 LOG_INFO(Render_OpenGL, "Precompiled cache rejected by the driver, removing");
372 return {}; 431 return {};
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h
index 91690b470..b2ae8d7f9 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_cache.h
@@ -43,8 +43,14 @@ struct UnspecializedShader;
43using Shader = std::shared_ptr<CachedShader>; 43using Shader = std::shared_ptr<CachedShader>;
44using Maxwell = Tegra::Engines::Maxwell3D::Regs; 44using Maxwell = Tegra::Engines::Maxwell3D::Regs;
45 45
46struct ProgramHandle {
47 OGLProgram source_program;
48 OGLAssemblyProgram assembly_program;
49};
50using ProgramSharedPtr = std::shared_ptr<ProgramHandle>;
51
46struct PrecompiledShader { 52struct PrecompiledShader {
47 std::shared_ptr<OGLProgram> program; 53 ProgramSharedPtr program;
48 std::shared_ptr<VideoCommon::Shader::Registry> registry; 54 std::shared_ptr<VideoCommon::Shader::Registry> registry;
49 ShaderEntries entries; 55 ShaderEntries entries;
50}; 56};
@@ -87,12 +93,13 @@ public:
87private: 93private:
88 explicit CachedShader(VAddr cpu_addr, std::size_t size_in_bytes, 94 explicit CachedShader(VAddr cpu_addr, std::size_t size_in_bytes,
89 std::shared_ptr<VideoCommon::Shader::Registry> registry, 95 std::shared_ptr<VideoCommon::Shader::Registry> registry,
90 ShaderEntries entries, std::shared_ptr<OGLProgram> program); 96 ShaderEntries entries, ProgramSharedPtr program);
91 97
92 std::shared_ptr<VideoCommon::Shader::Registry> registry; 98 std::shared_ptr<VideoCommon::Shader::Registry> registry;
93 ShaderEntries entries; 99 ShaderEntries entries;
94 std::size_t size_in_bytes = 0; 100 std::size_t size_in_bytes = 0;
95 std::shared_ptr<OGLProgram> program; 101 ProgramSharedPtr program;
102 GLuint handle = 0;
96}; 103};
97 104
98class ShaderCacheOpenGL final : public RasterizerCache<Shader> { 105class ShaderCacheOpenGL final : public RasterizerCache<Shader> {
@@ -115,7 +122,7 @@ protected:
115 void FlushObjectInner(const Shader& object) override {} 122 void FlushObjectInner(const Shader& object) override {}
116 123
117private: 124private:
118 std::shared_ptr<OGLProgram> GeneratePrecompiledProgram( 125 ProgramSharedPtr GeneratePrecompiledProgram(
119 const ShaderDiskCacheEntry& entry, const ShaderDiskCachePrecompiled& precompiled_entry, 126 const ShaderDiskCacheEntry& entry, const ShaderDiskCachePrecompiled& precompiled_entry,
120 const std::unordered_set<GLenum>& supported_formats); 127 const std::unordered_set<GLenum>& supported_formats);
121 128
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index 960ebf1a1..253484968 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -1538,7 +1538,9 @@ private:
1538 Expression target; 1538 Expression target;
1539 if (const auto gpr = std::get_if<GprNode>(&*dest)) { 1539 if (const auto gpr = std::get_if<GprNode>(&*dest)) {
1540 if (gpr->GetIndex() == Register::ZeroIndex) { 1540 if (gpr->GetIndex() == Register::ZeroIndex) {
1541 // Writing to Register::ZeroIndex is a no op 1541 // Writing to Register::ZeroIndex is a no op but we still have to visit the source
1542 // as it might have side effects.
1543 code.AddLine("{};", Visit(src).GetCode());
1542 return {}; 1544 return {};
1543 } 1545 }
1544 target = {GetRegister(gpr->GetIndex()), Type::Float}; 1546 target = {GetRegister(gpr->GetIndex()), Type::Float};
@@ -2309,6 +2311,18 @@ private:
2309 return {"gl_SubGroupInvocationARB", Type::Uint}; 2311 return {"gl_SubGroupInvocationARB", Type::Uint};
2310 } 2312 }
2311 2313
2314 template <const std::string_view& comparison>
2315 Expression ThreadMask(Operation) {
2316 if (device.HasWarpIntrinsics()) {
2317 return {fmt::format("gl_Thread{}MaskNV", comparison), Type::Uint};
2318 }
2319 if (device.HasShaderBallot()) {
2320 return {fmt::format("uint(gl_SubGroup{}MaskARB)", comparison), Type::Uint};
2321 }
2322 LOG_ERROR(Render_OpenGL, "Thread mask intrinsics are required by the shader");
2323 return {"0U", Type::Uint};
2324 }
2325
2312 Expression ShuffleIndexed(Operation operation) { 2326 Expression ShuffleIndexed(Operation operation) {
2313 std::string value = VisitOperand(operation, 0).AsFloat(); 2327 std::string value = VisitOperand(operation, 0).AsFloat();
2314 2328
@@ -2321,6 +2335,15 @@ private:
2321 return {fmt::format("readInvocationARB({}, {})", value, index), Type::Float}; 2335 return {fmt::format("readInvocationARB({}, {})", value, index), Type::Float};
2322 } 2336 }
2323 2337
2338 Expression Barrier(Operation) {
2339 if (!ir.IsDecompiled()) {
2340 LOG_ERROR(Render_OpenGL, "barrier() used but shader is not decompiled");
2341 return {};
2342 }
2343 code.AddLine("barrier();");
2344 return {};
2345 }
2346
2324 Expression MemoryBarrierGL(Operation) { 2347 Expression MemoryBarrierGL(Operation) {
2325 code.AddLine("memoryBarrier();"); 2348 code.AddLine("memoryBarrier();");
2326 return {}; 2349 return {};
@@ -2337,6 +2360,12 @@ private:
2337 static constexpr std::string_view NotEqual = "!="; 2360 static constexpr std::string_view NotEqual = "!=";
2338 static constexpr std::string_view GreaterEqual = ">="; 2361 static constexpr std::string_view GreaterEqual = ">=";
2339 2362
2363 static constexpr std::string_view Eq = "Eq";
2364 static constexpr std::string_view Ge = "Ge";
2365 static constexpr std::string_view Gt = "Gt";
2366 static constexpr std::string_view Le = "Le";
2367 static constexpr std::string_view Lt = "Lt";
2368
2340 static constexpr std::string_view Add = "Add"; 2369 static constexpr std::string_view Add = "Add";
2341 static constexpr std::string_view Min = "Min"; 2370 static constexpr std::string_view Min = "Min";
2342 static constexpr std::string_view Max = "Max"; 2371 static constexpr std::string_view Max = "Max";
@@ -2554,8 +2583,14 @@ private:
2554 &GLSLDecompiler::VoteEqual, 2583 &GLSLDecompiler::VoteEqual,
2555 2584
2556 &GLSLDecompiler::ThreadId, 2585 &GLSLDecompiler::ThreadId,
2586 &GLSLDecompiler::ThreadMask<Func::Eq>,
2587 &GLSLDecompiler::ThreadMask<Func::Ge>,
2588 &GLSLDecompiler::ThreadMask<Func::Gt>,
2589 &GLSLDecompiler::ThreadMask<Func::Le>,
2590 &GLSLDecompiler::ThreadMask<Func::Lt>,
2557 &GLSLDecompiler::ShuffleIndexed, 2591 &GLSLDecompiler::ShuffleIndexed,
2558 2592
2593 &GLSLDecompiler::Barrier,
2559 &GLSLDecompiler::MemoryBarrierGL, 2594 &GLSLDecompiler::MemoryBarrierGL,
2560 }; 2595 };
2561 static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount)); 2596 static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount));
diff --git a/src/video_core/renderer_opengl/gl_shader_manager.cpp b/src/video_core/renderer_opengl/gl_shader_manager.cpp
index 9c7b0adbd..96605db84 100644
--- a/src/video_core/renderer_opengl/gl_shader_manager.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_manager.cpp
@@ -6,45 +6,105 @@
6 6
7#include "common/common_types.h" 7#include "common/common_types.h"
8#include "video_core/engines/maxwell_3d.h" 8#include "video_core/engines/maxwell_3d.h"
9#include "video_core/renderer_opengl/gl_device.h"
9#include "video_core/renderer_opengl/gl_shader_manager.h" 10#include "video_core/renderer_opengl/gl_shader_manager.h"
10 11
11namespace OpenGL::GLShader { 12namespace OpenGL {
12 13
13ProgramManager::ProgramManager() = default; 14ProgramManager::ProgramManager(const Device& device) {
15 use_assembly_programs = device.UseAssemblyShaders();
16 if (use_assembly_programs) {
17 glEnable(GL_COMPUTE_PROGRAM_NV);
18 } else {
19 graphics_pipeline.Create();
20 glBindProgramPipeline(graphics_pipeline.handle);
21 }
22}
14 23
15ProgramManager::~ProgramManager() = default; 24ProgramManager::~ProgramManager() = default;
16 25
17void ProgramManager::Create() { 26void ProgramManager::BindCompute(GLuint program) {
18 graphics_pipeline.Create(); 27 if (use_assembly_programs) {
19 glBindProgramPipeline(graphics_pipeline.handle); 28 glBindProgramARB(GL_COMPUTE_PROGRAM_NV, program);
29 } else {
30 is_graphics_bound = false;
31 glUseProgram(program);
32 }
20} 33}
21 34
22void ProgramManager::BindGraphicsPipeline() { 35void ProgramManager::BindGraphicsPipeline() {
23 if (!is_graphics_bound) { 36 if (use_assembly_programs) {
24 is_graphics_bound = true; 37 UpdateAssemblyPrograms();
25 glUseProgram(0); 38 } else {
39 UpdateSourcePrograms();
26 } 40 }
41}
27 42
28 // Avoid updating the pipeline when values have no changed 43void ProgramManager::BindHostPipeline(GLuint pipeline) {
29 if (old_state == current_state) { 44 if (use_assembly_programs) {
30 return; 45 if (geometry_enabled) {
46 geometry_enabled = false;
47 old_state.geometry = 0;
48 glDisable(GL_GEOMETRY_PROGRAM_NV);
49 }
31 } 50 }
51 glBindProgramPipeline(pipeline);
52}
32 53
33 // Workaround for AMD bug 54void ProgramManager::RestoreGuestPipeline() {
34 static constexpr GLenum all_used_stages{GL_VERTEX_SHADER_BIT | GL_GEOMETRY_SHADER_BIT | 55 if (use_assembly_programs) {
35 GL_FRAGMENT_SHADER_BIT}; 56 glBindProgramPipeline(0);
36 const GLuint handle = graphics_pipeline.handle; 57 } else {
37 glUseProgramStages(handle, all_used_stages, 0); 58 glBindProgramPipeline(graphics_pipeline.handle);
38 glUseProgramStages(handle, GL_VERTEX_SHADER_BIT, current_state.vertex_shader); 59 }
39 glUseProgramStages(handle, GL_GEOMETRY_SHADER_BIT, current_state.geometry_shader); 60}
40 glUseProgramStages(handle, GL_FRAGMENT_SHADER_BIT, current_state.fragment_shader); 61
62void ProgramManager::UpdateAssemblyPrograms() {
63 const auto update_state = [](GLenum stage, bool& enabled, GLuint current, GLuint old) {
64 if (current == old) {
65 return;
66 }
67 if (current == 0) {
68 if (enabled) {
69 enabled = false;
70 glDisable(stage);
71 }
72 return;
73 }
74 if (!enabled) {
75 enabled = true;
76 glEnable(stage);
77 }
78 glBindProgramARB(stage, current);
79 };
80
81 update_state(GL_VERTEX_PROGRAM_NV, vertex_enabled, current_state.vertex, old_state.vertex);
82 update_state(GL_GEOMETRY_PROGRAM_NV, geometry_enabled, current_state.geometry,
83 old_state.geometry);
84 update_state(GL_FRAGMENT_PROGRAM_NV, fragment_enabled, current_state.fragment,
85 old_state.fragment);
41 86
42 old_state = current_state; 87 old_state = current_state;
43} 88}
44 89
45void ProgramManager::BindComputeShader(GLuint program) { 90void ProgramManager::UpdateSourcePrograms() {
46 is_graphics_bound = false; 91 if (!is_graphics_bound) {
47 glUseProgram(program); 92 is_graphics_bound = true;
93 glUseProgram(0);
94 }
95
96 const GLuint handle = graphics_pipeline.handle;
97 const auto update_state = [handle](GLenum stage, GLuint current, GLuint old) {
98 if (current == old) {
99 return;
100 }
101 glUseProgramStages(handle, stage, current);
102 };
103 update_state(GL_VERTEX_SHADER_BIT, current_state.vertex, old_state.vertex);
104 update_state(GL_GEOMETRY_SHADER_BIT, current_state.geometry, old_state.geometry);
105 update_state(GL_FRAGMENT_SHADER_BIT, current_state.fragment, old_state.fragment);
106
107 old_state = current_state;
48} 108}
49 109
50void MaxwellUniformData::SetFromRegs(const Tegra::Engines::Maxwell3D& maxwell) { 110void MaxwellUniformData::SetFromRegs(const Tegra::Engines::Maxwell3D& maxwell) {
@@ -54,4 +114,4 @@ void MaxwellUniformData::SetFromRegs(const Tegra::Engines::Maxwell3D& maxwell) {
54 y_direction = regs.screen_y_control.y_negate == 0 ? 1.0f : -1.0f; 114 y_direction = regs.screen_y_control.y_negate == 0 ? 1.0f : -1.0f;
55} 115}
56 116
57} // namespace OpenGL::GLShader 117} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_shader_manager.h b/src/video_core/renderer_opengl/gl_shader_manager.h
index d2e47f2a9..0f03b4f12 100644
--- a/src/video_core/renderer_opengl/gl_shader_manager.h
+++ b/src/video_core/renderer_opengl/gl_shader_manager.h
@@ -11,7 +11,9 @@
11#include "video_core/renderer_opengl/gl_resource_manager.h" 11#include "video_core/renderer_opengl/gl_resource_manager.h"
12#include "video_core/renderer_opengl/maxwell_to_gl.h" 12#include "video_core/renderer_opengl/maxwell_to_gl.h"
13 13
14namespace OpenGL::GLShader { 14namespace OpenGL {
15
16class Device;
15 17
16/// Uniform structure for the Uniform Buffer Object, all vectors must be 16-byte aligned 18/// Uniform structure for the Uniform Buffer Object, all vectors must be 16-byte aligned
17/// @note Always keep a vec4 at the end. The GL spec is not clear whether the alignment at 19/// @note Always keep a vec4 at the end. The GL spec is not clear whether the alignment at
@@ -28,50 +30,58 @@ static_assert(sizeof(MaxwellUniformData) < 16384,
28 30
29class ProgramManager { 31class ProgramManager {
30public: 32public:
31 explicit ProgramManager(); 33 explicit ProgramManager(const Device& device);
32 ~ProgramManager(); 34 ~ProgramManager();
33 35
34 void Create(); 36 /// Binds a compute program
37 void BindCompute(GLuint program);
35 38
36 /// Updates the graphics pipeline and binds it. 39 /// Updates bound programs.
37 void BindGraphicsPipeline(); 40 void BindGraphicsPipeline();
38 41
39 /// Binds a compute shader. 42 /// Binds an OpenGL pipeline object unsynchronized with the guest state.
40 void BindComputeShader(GLuint program); 43 void BindHostPipeline(GLuint pipeline);
44
45 /// Rewinds BindHostPipeline state changes.
46 void RestoreGuestPipeline();
41 47
42 void UseVertexShader(GLuint program) { 48 void UseVertexShader(GLuint program) {
43 current_state.vertex_shader = program; 49 current_state.vertex = program;
44 } 50 }
45 51
46 void UseGeometryShader(GLuint program) { 52 void UseGeometryShader(GLuint program) {
47 current_state.geometry_shader = program; 53 current_state.geometry = program;
48 } 54 }
49 55
50 void UseFragmentShader(GLuint program) { 56 void UseFragmentShader(GLuint program) {
51 current_state.fragment_shader = program; 57 current_state.fragment = program;
52 } 58 }
53 59
54private: 60private:
55 struct PipelineState { 61 struct PipelineState {
56 bool operator==(const PipelineState& rhs) const noexcept { 62 GLuint vertex = 0;
57 return vertex_shader == rhs.vertex_shader && fragment_shader == rhs.fragment_shader && 63 GLuint geometry = 0;
58 geometry_shader == rhs.geometry_shader; 64 GLuint fragment = 0;
59 }
60
61 bool operator!=(const PipelineState& rhs) const noexcept {
62 return !operator==(rhs);
63 }
64
65 GLuint vertex_shader = 0;
66 GLuint fragment_shader = 0;
67 GLuint geometry_shader = 0;
68 }; 65 };
69 66
67 /// Update NV_gpu_program5 programs.
68 void UpdateAssemblyPrograms();
69
70 /// Update GLSL programs.
71 void UpdateSourcePrograms();
72
70 OGLPipeline graphics_pipeline; 73 OGLPipeline graphics_pipeline;
71 OGLPipeline compute_pipeline; 74
72 PipelineState current_state; 75 PipelineState current_state;
73 PipelineState old_state; 76 PipelineState old_state;
77
78 bool use_assembly_programs = false;
79
74 bool is_graphics_bound = true; 80 bool is_graphics_bound = true;
81
82 bool vertex_enabled = false;
83 bool geometry_enabled = false;
84 bool fragment_enabled = false;
75}; 85};
76 86
77} // namespace OpenGL::GLShader 87} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index b2a179746..6b489e6db 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -316,7 +316,7 @@ public:
316RendererOpenGL::RendererOpenGL(Core::Frontend::EmuWindow& emu_window, Core::System& system, 316RendererOpenGL::RendererOpenGL(Core::Frontend::EmuWindow& emu_window, Core::System& system,
317 Core::Frontend::GraphicsContext& context) 317 Core::Frontend::GraphicsContext& context)
318 : RendererBase{emu_window}, emu_window{emu_window}, system{system}, context{context}, 318 : RendererBase{emu_window}, emu_window{emu_window}, system{system}, context{context},
319 has_debug_tool{HasDebugTool()} {} 319 program_manager{device}, has_debug_tool{HasDebugTool()} {}
320 320
321RendererOpenGL::~RendererOpenGL() = default; 321RendererOpenGL::~RendererOpenGL() = default;
322 322
@@ -468,8 +468,9 @@ void RendererOpenGL::InitOpenGLObjects() {
468 vertex_program.Create(true, false, vertex_shader.handle); 468 vertex_program.Create(true, false, vertex_shader.handle);
469 fragment_program.Create(true, false, fragment_shader.handle); 469 fragment_program.Create(true, false, fragment_shader.handle);
470 470
471 // Create program pipeline 471 pipeline.Create();
472 program_manager.Create(); 472 glUseProgramStages(pipeline.handle, GL_VERTEX_SHADER_BIT, vertex_program.handle);
473 glUseProgramStages(pipeline.handle, GL_FRAGMENT_SHADER_BIT, fragment_program.handle);
473 474
474 // Generate VBO handle for drawing 475 // Generate VBO handle for drawing
475 vertex_buffer.Create(); 476 vertex_buffer.Create();
@@ -508,7 +509,7 @@ void RendererOpenGL::CreateRasterizer() {
508 if (rasterizer) { 509 if (rasterizer) {
509 return; 510 return;
510 } 511 }
511 rasterizer = std::make_unique<RasterizerOpenGL>(system, emu_window, screen_info, 512 rasterizer = std::make_unique<RasterizerOpenGL>(system, emu_window, device, screen_info,
512 program_manager, state_tracker); 513 program_manager, state_tracker);
513} 514}
514 515
@@ -620,10 +621,7 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {
620 state_tracker.NotifyClipControl(); 621 state_tracker.NotifyClipControl();
621 state_tracker.NotifyAlphaTest(); 622 state_tracker.NotifyAlphaTest();
622 623
623 program_manager.UseVertexShader(vertex_program.handle); 624 program_manager.BindHostPipeline(pipeline.handle);
624 program_manager.UseGeometryShader(0);
625 program_manager.UseFragmentShader(fragment_program.handle);
626 program_manager.BindGraphicsPipeline();
627 625
628 glEnable(GL_CULL_FACE); 626 glEnable(GL_CULL_FACE);
629 if (screen_info.display_srgb) { 627 if (screen_info.display_srgb) {
@@ -665,6 +663,8 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {
665 663
666 glClear(GL_COLOR_BUFFER_BIT); 664 glClear(GL_COLOR_BUFFER_BIT);
667 glDrawArrays(GL_TRIANGLE_STRIP, 0, 4); 665 glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
666
667 program_manager.RestoreGuestPipeline();
668} 668}
669 669
670bool RendererOpenGL::TryPresent(int timeout_ms) { 670bool RendererOpenGL::TryPresent(int timeout_ms) {
diff --git a/src/video_core/renderer_opengl/renderer_opengl.h b/src/video_core/renderer_opengl/renderer_opengl.h
index 50b647661..61bf507f4 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.h
+++ b/src/video_core/renderer_opengl/renderer_opengl.h
@@ -9,6 +9,7 @@
9#include "common/common_types.h" 9#include "common/common_types.h"
10#include "common/math_util.h" 10#include "common/math_util.h"
11#include "video_core/renderer_base.h" 11#include "video_core/renderer_base.h"
12#include "video_core/renderer_opengl/gl_device.h"
12#include "video_core/renderer_opengl/gl_resource_manager.h" 13#include "video_core/renderer_opengl/gl_resource_manager.h"
13#include "video_core/renderer_opengl/gl_shader_manager.h" 14#include "video_core/renderer_opengl/gl_shader_manager.h"
14#include "video_core/renderer_opengl/gl_state_tracker.h" 15#include "video_core/renderer_opengl/gl_state_tracker.h"
@@ -95,6 +96,7 @@ private:
95 Core::Frontend::EmuWindow& emu_window; 96 Core::Frontend::EmuWindow& emu_window;
96 Core::System& system; 97 Core::System& system;
97 Core::Frontend::GraphicsContext& context; 98 Core::Frontend::GraphicsContext& context;
99 const Device device;
98 100
99 StateTracker state_tracker{system}; 101 StateTracker state_tracker{system};
100 102
@@ -102,13 +104,14 @@ private:
102 OGLBuffer vertex_buffer; 104 OGLBuffer vertex_buffer;
103 OGLProgram vertex_program; 105 OGLProgram vertex_program;
104 OGLProgram fragment_program; 106 OGLProgram fragment_program;
107 OGLPipeline pipeline;
105 OGLFramebuffer screenshot_framebuffer; 108 OGLFramebuffer screenshot_framebuffer;
106 109
107 /// Display information for Switch screen 110 /// Display information for Switch screen
108 ScreenInfo screen_info; 111 ScreenInfo screen_info;
109 112
110 /// Global dummy shader pipeline 113 /// Global dummy shader pipeline
111 GLShader::ProgramManager program_manager; 114 ProgramManager program_manager;
112 115
113 /// OpenGL framebuffer data 116 /// OpenGL framebuffer data
114 std::vector<u8> gl_framebuffer_data; 117 std::vector<u8> gl_framebuffer_data;
diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
index 12be691a5..2871035f5 100644
--- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
+++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
@@ -142,7 +142,7 @@ struct FormatTuple {
142 {VK_FORMAT_BC6H_UFLOAT_BLOCK}, // BC6H_UF16 142 {VK_FORMAT_BC6H_UFLOAT_BLOCK}, // BC6H_UF16
143 {VK_FORMAT_BC6H_SFLOAT_BLOCK}, // BC6H_SF16 143 {VK_FORMAT_BC6H_SFLOAT_BLOCK}, // BC6H_SF16
144 {VK_FORMAT_ASTC_4x4_UNORM_BLOCK}, // ASTC_2D_4X4 144 {VK_FORMAT_ASTC_4x4_UNORM_BLOCK}, // ASTC_2D_4X4
145 {VK_FORMAT_B8G8R8A8_UNORM}, // BGRA8 145 {VK_FORMAT_B8G8R8A8_UNORM, Attachable}, // BGRA8
146 {VK_FORMAT_R32G32B32A32_SFLOAT, Attachable | Storage}, // RGBA32F 146 {VK_FORMAT_R32G32B32A32_SFLOAT, Attachable | Storage}, // RGBA32F
147 {VK_FORMAT_R32G32_SFLOAT, Attachable | Storage}, // RG32F 147 {VK_FORMAT_R32G32_SFLOAT, Attachable | Storage}, // RG32F
148 {VK_FORMAT_R32_SFLOAT, Attachable | Storage}, // R32F 148 {VK_FORMAT_R32_SFLOAT, Attachable | Storage}, // R32F
@@ -168,7 +168,7 @@ struct FormatTuple {
168 {VK_FORMAT_ASTC_8x8_UNORM_BLOCK}, // ASTC_2D_8X8 168 {VK_FORMAT_ASTC_8x8_UNORM_BLOCK}, // ASTC_2D_8X8
169 {VK_FORMAT_UNDEFINED}, // ASTC_2D_8X5 169 {VK_FORMAT_UNDEFINED}, // ASTC_2D_8X5
170 {VK_FORMAT_UNDEFINED}, // ASTC_2D_5X4 170 {VK_FORMAT_UNDEFINED}, // ASTC_2D_5X4
171 {VK_FORMAT_UNDEFINED}, // BGRA8_SRGB 171 {VK_FORMAT_B8G8R8A8_SRGB, Attachable}, // BGRA8_SRGB
172 {VK_FORMAT_BC1_RGBA_SRGB_BLOCK}, // DXT1_SRGB 172 {VK_FORMAT_BC1_RGBA_SRGB_BLOCK}, // DXT1_SRGB
173 {VK_FORMAT_BC2_SRGB_BLOCK}, // DXT23_SRGB 173 {VK_FORMAT_BC2_SRGB_BLOCK}, // DXT23_SRGB
174 {VK_FORMAT_BC3_SRGB_BLOCK}, // DXT45_SRGB 174 {VK_FORMAT_BC3_SRGB_BLOCK}, // DXT45_SRGB
diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
index 5b494da8c..5f33d9e40 100644
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
@@ -7,6 +7,7 @@
7#include <memory> 7#include <memory>
8 8
9#include "core/core.h" 9#include "core/core.h"
10#include "video_core/buffer_cache/buffer_cache.h"
10#include "video_core/renderer_vulkan/vk_buffer_cache.h" 11#include "video_core/renderer_vulkan/vk_buffer_cache.h"
11#include "video_core/renderer_vulkan/vk_device.h" 12#include "video_core/renderer_vulkan/vk_device.h"
12#include "video_core/renderer_vulkan/vk_scheduler.h" 13#include "video_core/renderer_vulkan/vk_scheduler.h"
diff --git a/src/video_core/renderer_vulkan/vk_device.cpp b/src/video_core/renderer_vulkan/vk_device.cpp
index f0c491d00..750e5a0ca 100644
--- a/src/video_core/renderer_vulkan/vk_device.cpp
+++ b/src/video_core/renderer_vulkan/vk_device.cpp
@@ -104,6 +104,7 @@ std::unordered_map<VkFormat, VkFormatProperties> GetFormatProperties(
104 VK_FORMAT_R16_SFLOAT, 104 VK_FORMAT_R16_SFLOAT,
105 VK_FORMAT_R16G16B16A16_SFLOAT, 105 VK_FORMAT_R16G16B16A16_SFLOAT,
106 VK_FORMAT_B8G8R8A8_UNORM, 106 VK_FORMAT_B8G8R8A8_UNORM,
107 VK_FORMAT_B8G8R8A8_SRGB,
107 VK_FORMAT_R4G4B4A4_UNORM_PACK16, 108 VK_FORMAT_R4G4B4A4_UNORM_PACK16,
108 VK_FORMAT_D32_SFLOAT, 109 VK_FORMAT_D32_SFLOAT,
109 VK_FORMAT_D16_UNORM, 110 VK_FORMAT_D16_UNORM,
diff --git a/src/video_core/renderer_vulkan/vk_fence_manager.h b/src/video_core/renderer_vulkan/vk_fence_manager.h
index 04d07fe6a..043fe7947 100644
--- a/src/video_core/renderer_vulkan/vk_fence_manager.h
+++ b/src/video_core/renderer_vulkan/vk_fence_manager.h
@@ -7,6 +7,7 @@
7#include <memory> 7#include <memory>
8 8
9#include "video_core/fence_manager.h" 9#include "video_core/fence_manager.h"
10#include "video_core/renderer_vulkan/vk_buffer_cache.h"
10#include "video_core/renderer_vulkan/wrapper.h" 11#include "video_core/renderer_vulkan/wrapper.h"
11 12
12namespace Core { 13namespace Core {
diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
index fe45ed269..a5c7b7945 100644
--- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
@@ -329,8 +329,7 @@ VKPipelineCache::DecompileShaders(const GraphicsPipelineCacheKey& key) {
329 329
330 const GPUVAddr gpu_addr = GetShaderAddress(system, program_enum); 330 const GPUVAddr gpu_addr = GetShaderAddress(system, program_enum);
331 const auto cpu_addr = memory_manager.GpuToCpuAddress(gpu_addr); 331 const auto cpu_addr = memory_manager.GpuToCpuAddress(gpu_addr);
332 ASSERT(cpu_addr); 332 const auto shader = cpu_addr ? TryGet(*cpu_addr) : null_shader;
333 const auto shader = TryGet(*cpu_addr);
334 ASSERT(shader); 333 ASSERT(shader);
335 334
336 const std::size_t stage = index == 0 ? 0 : index - 1; // Stage indices are 0 - 5 335 const std::size_t stage = index == 0 ? 0 : index - 1; // Stage indices are 0 - 5
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index 17a2efe8e..be5b77fae 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -532,14 +532,14 @@ void RasterizerVulkan::OnCPUWrite(VAddr addr, u64 size) {
532 return; 532 return;
533 } 533 }
534 texture_cache.OnCPUWrite(addr, size); 534 texture_cache.OnCPUWrite(addr, size);
535 pipeline_cache.InvalidateRegion(addr, size); 535 pipeline_cache.OnCPUWrite(addr, size);
536 buffer_cache.OnCPUWrite(addr, size); 536 buffer_cache.OnCPUWrite(addr, size);
537 query_cache.InvalidateRegion(addr, size);
538} 537}
539 538
540void RasterizerVulkan::SyncGuestHost() { 539void RasterizerVulkan::SyncGuestHost() {
541 texture_cache.SyncGuestHost(); 540 texture_cache.SyncGuestHost();
542 buffer_cache.SyncGuestHost(); 541 buffer_cache.SyncGuestHost();
542 pipeline_cache.SyncGuestHost();
543} 543}
544 544
545void RasterizerVulkan::SignalSemaphore(GPUVAddr addr, u32 value) { 545void RasterizerVulkan::SignalSemaphore(GPUVAddr addr, u32 value) {
diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
index 167e20e91..890f34a2c 100644
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
@@ -515,6 +515,16 @@ private:
515 void DeclareCommon() { 515 void DeclareCommon() {
516 thread_id = 516 thread_id =
517 DeclareInputBuiltIn(spv::BuiltIn::SubgroupLocalInvocationId, t_in_uint, "thread_id"); 517 DeclareInputBuiltIn(spv::BuiltIn::SubgroupLocalInvocationId, t_in_uint, "thread_id");
518 thread_masks[0] =
519 DeclareInputBuiltIn(spv::BuiltIn::SubgroupEqMask, t_in_uint4, "thread_eq_mask");
520 thread_masks[1] =
521 DeclareInputBuiltIn(spv::BuiltIn::SubgroupGeMask, t_in_uint4, "thread_ge_mask");
522 thread_masks[2] =
523 DeclareInputBuiltIn(spv::BuiltIn::SubgroupGtMask, t_in_uint4, "thread_gt_mask");
524 thread_masks[3] =
525 DeclareInputBuiltIn(spv::BuiltIn::SubgroupLeMask, t_in_uint4, "thread_le_mask");
526 thread_masks[4] =
527 DeclareInputBuiltIn(spv::BuiltIn::SubgroupLtMask, t_in_uint4, "thread_lt_mask");
518 } 528 }
519 529
520 void DeclareVertex() { 530 void DeclareVertex() {
@@ -1071,8 +1081,7 @@ private:
1071 1081
1072 void VisitBasicBlock(const NodeBlock& bb) { 1082 void VisitBasicBlock(const NodeBlock& bb) {
1073 for (const auto& node : bb) { 1083 for (const auto& node : bb) {
1074 [[maybe_unused]] const Type type = Visit(node).type; 1084 Visit(node);
1075 ASSERT(type == Type::Void);
1076 } 1085 }
1077 } 1086 }
1078 1087
@@ -1362,7 +1371,9 @@ private:
1362 Expression target{}; 1371 Expression target{};
1363 if (const auto gpr = std::get_if<GprNode>(&*dest)) { 1372 if (const auto gpr = std::get_if<GprNode>(&*dest)) {
1364 if (gpr->GetIndex() == Register::ZeroIndex) { 1373 if (gpr->GetIndex() == Register::ZeroIndex) {
1365 // Writing to Register::ZeroIndex is a no op 1374 // Writing to Register::ZeroIndex is a no op but we still have to visit its source
1375 // because it might have side effects.
1376 Visit(src);
1366 return {}; 1377 return {};
1367 } 1378 }
1368 target = {registers.at(gpr->GetIndex()), Type::Float}; 1379 target = {registers.at(gpr->GetIndex()), Type::Float};
@@ -2175,12 +2186,35 @@ private:
2175 return {OpLoad(t_uint, thread_id), Type::Uint}; 2186 return {OpLoad(t_uint, thread_id), Type::Uint};
2176 } 2187 }
2177 2188
2189 template <std::size_t index>
2190 Expression ThreadMask(Operation) {
2191 // TODO(Rodrigo): Handle devices with different warp sizes
2192 const Id mask = thread_masks[index];
2193 return {OpLoad(t_uint, AccessElement(t_in_uint, mask, 0)), Type::Uint};
2194 }
2195
2178 Expression ShuffleIndexed(Operation operation) { 2196 Expression ShuffleIndexed(Operation operation) {
2179 const Id value = AsFloat(Visit(operation[0])); 2197 const Id value = AsFloat(Visit(operation[0]));
2180 const Id index = AsUint(Visit(operation[1])); 2198 const Id index = AsUint(Visit(operation[1]));
2181 return {OpSubgroupReadInvocationKHR(t_float, value, index), Type::Float}; 2199 return {OpSubgroupReadInvocationKHR(t_float, value, index), Type::Float};
2182 } 2200 }
2183 2201
2202 Expression Barrier(Operation) {
2203 if (!ir.IsDecompiled()) {
2204 LOG_ERROR(Render_Vulkan, "OpBarrier used by shader is not decompiled");
2205 return {};
2206 }
2207
2208 const auto scope = spv::Scope::Workgroup;
2209 const auto memory = spv::Scope::Workgroup;
2210 const auto semantics =
2211 spv::MemorySemanticsMask::WorkgroupMemory | spv::MemorySemanticsMask::AcquireRelease;
2212 OpControlBarrier(Constant(t_uint, static_cast<u32>(scope)),
2213 Constant(t_uint, static_cast<u32>(memory)),
2214 Constant(t_uint, static_cast<u32>(semantics)));
2215 return {};
2216 }
2217
2184 Expression MemoryBarrierGL(Operation) { 2218 Expression MemoryBarrierGL(Operation) {
2185 const auto scope = spv::Scope::Device; 2219 const auto scope = spv::Scope::Device;
2186 const auto semantics = 2220 const auto semantics =
@@ -2639,8 +2673,14 @@ private:
2639 &SPIRVDecompiler::Vote<&Module::OpSubgroupAllEqualKHR>, 2673 &SPIRVDecompiler::Vote<&Module::OpSubgroupAllEqualKHR>,
2640 2674
2641 &SPIRVDecompiler::ThreadId, 2675 &SPIRVDecompiler::ThreadId,
2676 &SPIRVDecompiler::ThreadMask<0>, // Eq
2677 &SPIRVDecompiler::ThreadMask<1>, // Ge
2678 &SPIRVDecompiler::ThreadMask<2>, // Gt
2679 &SPIRVDecompiler::ThreadMask<3>, // Le
2680 &SPIRVDecompiler::ThreadMask<4>, // Lt
2642 &SPIRVDecompiler::ShuffleIndexed, 2681 &SPIRVDecompiler::ShuffleIndexed,
2643 2682
2683 &SPIRVDecompiler::Barrier,
2644 &SPIRVDecompiler::MemoryBarrierGL, 2684 &SPIRVDecompiler::MemoryBarrierGL,
2645 }; 2685 };
2646 static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount)); 2686 static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount));
@@ -2763,6 +2803,7 @@ private:
2763 Id workgroup_id{}; 2803 Id workgroup_id{};
2764 Id local_invocation_id{}; 2804 Id local_invocation_id{};
2765 Id thread_id{}; 2805 Id thread_id{};
2806 std::array<Id, 5> thread_masks{}; // eq, ge, gt, le, lt
2766 2807
2767 VertexIndices in_indices; 2808 VertexIndices in_indices;
2768 VertexIndices out_indices; 2809 VertexIndices out_indices;
diff --git a/src/video_core/shader/decode/memory.cpp b/src/video_core/shader/decode/memory.cpp
index 9392f065b..63adbc4a3 100644
--- a/src/video_core/shader/decode/memory.cpp
+++ b/src/video_core/shader/decode/memory.cpp
@@ -387,7 +387,6 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
387 } 387 }
388 case OpCode::Id::RED: { 388 case OpCode::Id::RED: {
389 UNIMPLEMENTED_IF_MSG(instr.red.type != GlobalAtomicType::U32); 389 UNIMPLEMENTED_IF_MSG(instr.red.type != GlobalAtomicType::U32);
390 UNIMPLEMENTED_IF_MSG(instr.red.operation != AtomicOp::Add);
391 const auto [real_address, base_address, descriptor] = 390 const auto [real_address, base_address, descriptor] =
392 TrackGlobalMemory(bb, instr, true, true); 391 TrackGlobalMemory(bb, instr, true, true);
393 if (!real_address || !base_address) { 392 if (!real_address || !base_address) {
@@ -396,7 +395,7 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
396 } 395 }
397 Node gmem = MakeNode<GmemNode>(real_address, base_address, descriptor); 396 Node gmem = MakeNode<GmemNode>(real_address, base_address, descriptor);
398 Node value = GetRegister(instr.gpr0); 397 Node value = GetRegister(instr.gpr0);
399 bb.push_back(Operation(OperationCode::ReduceIAdd, move(gmem), move(value))); 398 bb.push_back(Operation(GetAtomOperation(instr.red.operation), move(gmem), move(value)));
400 break; 399 break;
401 } 400 }
402 case OpCode::Id::ATOM: { 401 case OpCode::Id::ATOM: {
diff --git a/src/video_core/shader/decode/other.cpp b/src/video_core/shader/decode/other.cpp
index d4f95b18c..694b325e1 100644
--- a/src/video_core/shader/decode/other.cpp
+++ b/src/video_core/shader/decode/other.cpp
@@ -109,6 +109,27 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
109 return Operation(OperationCode::WorkGroupIdY); 109 return Operation(OperationCode::WorkGroupIdY);
110 case SystemVariable::CtaIdZ: 110 case SystemVariable::CtaIdZ:
111 return Operation(OperationCode::WorkGroupIdZ); 111 return Operation(OperationCode::WorkGroupIdZ);
112 case SystemVariable::EqMask:
113 case SystemVariable::LtMask:
114 case SystemVariable::LeMask:
115 case SystemVariable::GtMask:
116 case SystemVariable::GeMask:
117 uses_warps = true;
118 switch (instr.sys20) {
119 case SystemVariable::EqMask:
120 return Operation(OperationCode::ThreadEqMask);
121 case SystemVariable::LtMask:
122 return Operation(OperationCode::ThreadLtMask);
123 case SystemVariable::LeMask:
124 return Operation(OperationCode::ThreadLeMask);
125 case SystemVariable::GtMask:
126 return Operation(OperationCode::ThreadGtMask);
127 case SystemVariable::GeMask:
128 return Operation(OperationCode::ThreadGeMask);
129 default:
130 UNREACHABLE();
131 return Immediate(0u);
132 }
112 default: 133 default:
113 UNIMPLEMENTED_MSG("Unhandled system move: {}", 134 UNIMPLEMENTED_MSG("Unhandled system move: {}",
114 static_cast<u32>(instr.sys20.Value())); 135 static_cast<u32>(instr.sys20.Value()));
@@ -272,6 +293,11 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
272 SetRegister(bb, instr.gpr0, GetRegister(instr.gpr8)); 293 SetRegister(bb, instr.gpr0, GetRegister(instr.gpr8));
273 break; 294 break;
274 } 295 }
296 case OpCode::Id::BAR: {
297 UNIMPLEMENTED_IF_MSG(instr.value != 0xF0A81B8000070000ULL, "BAR is not BAR.SYNC 0x0");
298 bb.push_back(Operation(OperationCode::Barrier));
299 break;
300 }
275 case OpCode::Id::MEMBAR: { 301 case OpCode::Id::MEMBAR: {
276 UNIMPLEMENTED_IF(instr.membar.type != Tegra::Shader::MembarType::GL); 302 UNIMPLEMENTED_IF(instr.membar.type != Tegra::Shader::MembarType::GL);
277 UNIMPLEMENTED_IF(instr.membar.unknown != Tegra::Shader::MembarUnknown::Default); 303 UNIMPLEMENTED_IF(instr.membar.unknown != Tegra::Shader::MembarUnknown::Default);
diff --git a/src/video_core/shader/node.h b/src/video_core/shader/node.h
index f75b62240..c06512413 100644
--- a/src/video_core/shader/node.h
+++ b/src/video_core/shader/node.h
@@ -226,8 +226,14 @@ enum class OperationCode {
226 VoteEqual, /// (bool) -> bool 226 VoteEqual, /// (bool) -> bool
227 227
228 ThreadId, /// () -> uint 228 ThreadId, /// () -> uint
229 ThreadEqMask, /// () -> uint
230 ThreadGeMask, /// () -> uint
231 ThreadGtMask, /// () -> uint
232 ThreadLeMask, /// () -> uint
233 ThreadLtMask, /// () -> uint
229 ShuffleIndexed, /// (uint value, uint index) -> uint 234 ShuffleIndexed, /// (uint value, uint index) -> uint
230 235
236 Barrier, /// () -> void
231 MemoryBarrierGL, /// () -> void 237 MemoryBarrierGL, /// () -> void
232 238
233 Amount, 239 Amount,
diff --git a/src/yuzu/configuration/config.cpp b/src/yuzu/configuration/config.cpp
index 27775701d..b08b87426 100644
--- a/src/yuzu/configuration/config.cpp
+++ b/src/yuzu/configuration/config.cpp
@@ -643,6 +643,8 @@ void Config::ReadRendererValues() {
643 Settings::values.use_asynchronous_gpu_emulation = 643 Settings::values.use_asynchronous_gpu_emulation =
644 ReadSetting(QStringLiteral("use_asynchronous_gpu_emulation"), false).toBool(); 644 ReadSetting(QStringLiteral("use_asynchronous_gpu_emulation"), false).toBool();
645 Settings::values.use_vsync = ReadSetting(QStringLiteral("use_vsync"), true).toBool(); 645 Settings::values.use_vsync = ReadSetting(QStringLiteral("use_vsync"), true).toBool();
646 Settings::values.use_assembly_shaders =
647 ReadSetting(QStringLiteral("use_assembly_shaders"), false).toBool();
646 Settings::values.use_fast_gpu_time = 648 Settings::values.use_fast_gpu_time =
647 ReadSetting(QStringLiteral("use_fast_gpu_time"), true).toBool(); 649 ReadSetting(QStringLiteral("use_fast_gpu_time"), true).toBool();
648 Settings::values.force_30fps_mode = 650 Settings::values.force_30fps_mode =
@@ -1090,6 +1092,8 @@ void Config::SaveRendererValues() {
1090 WriteSetting(QStringLiteral("use_asynchronous_gpu_emulation"), 1092 WriteSetting(QStringLiteral("use_asynchronous_gpu_emulation"),
1091 Settings::values.use_asynchronous_gpu_emulation, false); 1093 Settings::values.use_asynchronous_gpu_emulation, false);
1092 WriteSetting(QStringLiteral("use_vsync"), Settings::values.use_vsync, true); 1094 WriteSetting(QStringLiteral("use_vsync"), Settings::values.use_vsync, true);
1095 WriteSetting(QStringLiteral("use_assembly_shaders"), Settings::values.use_assembly_shaders,
1096 false);
1093 WriteSetting(QStringLiteral("use_fast_gpu_time"), Settings::values.use_fast_gpu_time, true); 1097 WriteSetting(QStringLiteral("use_fast_gpu_time"), Settings::values.use_fast_gpu_time, true);
1094 WriteSetting(QStringLiteral("force_30fps_mode"), Settings::values.force_30fps_mode, false); 1098 WriteSetting(QStringLiteral("force_30fps_mode"), Settings::values.force_30fps_mode, false);
1095 1099
diff --git a/src/yuzu/configuration/configure_graphics_advanced.cpp b/src/yuzu/configuration/configure_graphics_advanced.cpp
index 5bb2ae555..37aadf7f8 100644
--- a/src/yuzu/configuration/configure_graphics_advanced.cpp
+++ b/src/yuzu/configuration/configure_graphics_advanced.cpp
@@ -12,6 +12,9 @@ ConfigureGraphicsAdvanced::ConfigureGraphicsAdvanced(QWidget* parent)
12 12
13 ui->setupUi(this); 13 ui->setupUi(this);
14 14
15 // TODO: Remove this after assembly shaders are fully integrated
16 ui->use_assembly_shaders->setVisible(false);
17
15 SetConfiguration(); 18 SetConfiguration();
16} 19}
17 20
@@ -22,6 +25,8 @@ void ConfigureGraphicsAdvanced::SetConfiguration() {
22 ui->gpu_accuracy->setCurrentIndex(static_cast<int>(Settings::values.gpu_accuracy)); 25 ui->gpu_accuracy->setCurrentIndex(static_cast<int>(Settings::values.gpu_accuracy));
23 ui->use_vsync->setEnabled(runtime_lock); 26 ui->use_vsync->setEnabled(runtime_lock);
24 ui->use_vsync->setChecked(Settings::values.use_vsync); 27 ui->use_vsync->setChecked(Settings::values.use_vsync);
28 ui->use_assembly_shaders->setEnabled(runtime_lock);
29 ui->use_assembly_shaders->setChecked(Settings::values.use_assembly_shaders);
25 ui->use_fast_gpu_time->setChecked(Settings::values.use_fast_gpu_time); 30 ui->use_fast_gpu_time->setChecked(Settings::values.use_fast_gpu_time);
26 ui->force_30fps_mode->setEnabled(runtime_lock); 31 ui->force_30fps_mode->setEnabled(runtime_lock);
27 ui->force_30fps_mode->setChecked(Settings::values.force_30fps_mode); 32 ui->force_30fps_mode->setChecked(Settings::values.force_30fps_mode);
@@ -33,6 +38,7 @@ void ConfigureGraphicsAdvanced::ApplyConfiguration() {
33 auto gpu_accuracy = static_cast<Settings::GPUAccuracy>(ui->gpu_accuracy->currentIndex()); 38 auto gpu_accuracy = static_cast<Settings::GPUAccuracy>(ui->gpu_accuracy->currentIndex());
34 Settings::values.gpu_accuracy = gpu_accuracy; 39 Settings::values.gpu_accuracy = gpu_accuracy;
35 Settings::values.use_vsync = ui->use_vsync->isChecked(); 40 Settings::values.use_vsync = ui->use_vsync->isChecked();
41 Settings::values.use_assembly_shaders = ui->use_assembly_shaders->isChecked();
36 Settings::values.use_fast_gpu_time = ui->use_fast_gpu_time->isChecked(); 42 Settings::values.use_fast_gpu_time = ui->use_fast_gpu_time->isChecked();
37 Settings::values.force_30fps_mode = ui->force_30fps_mode->isChecked(); 43 Settings::values.force_30fps_mode = ui->force_30fps_mode->isChecked();
38 Settings::values.max_anisotropy = ui->anisotropic_filtering_combobox->currentIndex(); 44 Settings::values.max_anisotropy = ui->anisotropic_filtering_combobox->currentIndex();
diff --git a/src/yuzu/configuration/configure_graphics_advanced.ui b/src/yuzu/configuration/configure_graphics_advanced.ui
index 770b80c50..0021607ac 100644
--- a/src/yuzu/configuration/configure_graphics_advanced.ui
+++ b/src/yuzu/configuration/configure_graphics_advanced.ui
@@ -63,6 +63,16 @@
63 </widget> 63 </widget>
64 </item> 64 </item>
65 <item> 65 <item>
66 <widget class="QCheckBox" name="use_assembly_shaders">
67 <property name="toolTip">
68 <string>Enabling this reduces shader stutter. Enables OpenGL assembly shaders on supported Nvidia devices (NV_gpu_program5 is required). This feature is experimental.</string>
69 </property>
70 <property name="text">
71 <string>Use assembly shaders (experimental, Nvidia OpenGL only)</string>
72 </property>
73 </widget>
74 </item>
75 <item>
66 <widget class="QCheckBox" name="force_30fps_mode"> 76 <widget class="QCheckBox" name="force_30fps_mode">
67 <property name="text"> 77 <property name="text">
68 <string>Force 30 FPS mode</string> 78 <string>Force 30 FPS mode</string>
diff --git a/src/yuzu/discord_impl.cpp b/src/yuzu/discord_impl.cpp
index ea0079353..a93733b26 100644
--- a/src/yuzu/discord_impl.cpp
+++ b/src/yuzu/discord_impl.cpp
@@ -18,7 +18,7 @@ DiscordImpl::DiscordImpl() {
18 18
19 // The number is the client ID for yuzu, it's used for images and the 19 // The number is the client ID for yuzu, it's used for images and the
20 // application name 20 // application name
21 Discord_Initialize("471872241299226636", &handlers, 1, nullptr); 21 Discord_Initialize("712465656758665259", &handlers, 1, nullptr);
22} 22}
23 23
24DiscordImpl::~DiscordImpl() { 24DiscordImpl::~DiscordImpl() {
diff --git a/src/yuzu_cmd/config.cpp b/src/yuzu_cmd/config.cpp
index 2348e6e0d..c20d48c42 100644
--- a/src/yuzu_cmd/config.cpp
+++ b/src/yuzu_cmd/config.cpp
@@ -397,6 +397,8 @@ void Config::ReadValues() {
397 sdl2_config->GetBoolean("Renderer", "use_asynchronous_gpu_emulation", false); 397 sdl2_config->GetBoolean("Renderer", "use_asynchronous_gpu_emulation", false);
398 Settings::values.use_vsync = 398 Settings::values.use_vsync =
399 static_cast<u16>(sdl2_config->GetInteger("Renderer", "use_vsync", 1)); 399 static_cast<u16>(sdl2_config->GetInteger("Renderer", "use_vsync", 1));
400 Settings::values.use_assembly_shaders =
401 sdl2_config->GetBoolean("Renderer", "use_assembly_shaders", false);
400 Settings::values.use_fast_gpu_time = 402 Settings::values.use_fast_gpu_time =
401 sdl2_config->GetBoolean("Renderer", "use_fast_gpu_time", true); 403 sdl2_config->GetBoolean("Renderer", "use_fast_gpu_time", true);
402 404
diff --git a/src/yuzu_cmd/default_ini.h b/src/yuzu_cmd/default_ini.h
index ae94b51c4..abc6e6e65 100644
--- a/src/yuzu_cmd/default_ini.h
+++ b/src/yuzu_cmd/default_ini.h
@@ -134,6 +134,10 @@ max_anisotropy =
134# 0 (default): Off, 1: On 134# 0 (default): Off, 1: On
135use_vsync = 135use_vsync =
136 136
137# Whether to use OpenGL assembly shaders or not. NV_gpu_program5 is required.
138# 0 (default): Off, 1: On
139use_assembly_shaders =
140
137# Turns on the frame limiter, which will limit frames output to the target game speed 141# Turns on the frame limiter, which will limit frames output to the target game speed
138# 0: Off, 1: On (default) 142# 0: Off, 1: On (default)
139use_frame_limit = 143use_frame_limit =