-rw-r--r--  .gitmodules | 6
-rw-r--r--  CMakeLists.txt | 14
-rw-r--r--  CMakeModules/GenerateSCMRev.cmake | 2
-rw-r--r--  externals/CMakeLists.txt | 18
m---------  externals/libressl | 0
m---------  externals/sirit | 0
m---------  externals/xbyak | 0
-rw-r--r--  src/common/CMakeLists.txt | 8
-rw-r--r--  src/common/memory_detect.cpp | 60
-rw-r--r--  src/common/memory_detect.h | 22
-rw-r--r--  src/common/x64/xbyak_abi.h | 266
-rw-r--r--  src/common/x64/xbyak_util.h | 47
-rw-r--r--  src/core/file_sys/patch_manager.cpp | 34
-rw-r--r--  src/core/file_sys/patch_manager.h | 5
-rw-r--r--  src/core/hle/kernel/process.cpp | 6
-rw-r--r--  src/core/hle/kernel/readable_event.cpp | 2
-rw-r--r--  src/core/hle/kernel/resource_limit.cpp | 6
-rw-r--r--  src/core/hle/service/hid/hid.cpp | 13
-rw-r--r--  src/core/hle/service/hid/hid.h | 1
-rw-r--r--  src/core/hle/service/nifm/nifm.cpp | 3
-rw-r--r--  src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.cpp | 25
-rw-r--r--  src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.h | 18
-rw-r--r--  src/core/settings.cpp | 1
-rw-r--r--  src/core/settings.h | 4
-rw-r--r--  src/core/telemetry_session.cpp | 1
-rw-r--r--  src/video_core/CMakeLists.txt | 16
-rw-r--r--  src/video_core/buffer_cache/buffer_block.h | 27
-rw-r--r--  src/video_core/buffer_cache/buffer_cache.h | 468
-rw-r--r--  src/video_core/buffer_cache/map_interval.cpp | 33
-rw-r--r--  src/video_core/buffer_cache/map_interval.h | 133
-rw-r--r--  src/video_core/engines/const_buffer_engine_interface.h | 1
-rw-r--r--  src/video_core/engines/kepler_compute.cpp | 5
-rw-r--r--  src/video_core/engines/kepler_compute.h | 2
-rw-r--r--  src/video_core/engines/maxwell_3d.cpp | 39
-rw-r--r--  src/video_core/engines/maxwell_3d.h | 21
-rw-r--r--  src/video_core/macro/macro.cpp | 45
-rw-r--r--  src/video_core/macro/macro.h | 128
-rw-r--r--  src/video_core/macro/macro_interpreter.cpp (renamed from src/video_core/macro_interpreter.cpp) | 198
-rw-r--r--  src/video_core/macro/macro_interpreter.h (renamed from src/video_core/macro_interpreter.h) | 51
-rw-r--r--  src/video_core/macro/macro_jit_x64.cpp | 640
-rw-r--r--  src/video_core/macro/macro_jit_x64.h | 100
-rw-r--r--  src/video_core/rasterizer_cache.cpp | 7
-rw-r--r--  src/video_core/rasterizer_cache.h | 197
-rw-r--r--  src/video_core/renderer_opengl/gl_arb_decompiler.cpp | 2074
-rw-r--r--  src/video_core/renderer_opengl/gl_arb_decompiler.h | 29
-rw-r--r--  src/video_core/renderer_opengl/gl_buffer_cache.cpp | 22
-rw-r--r--  src/video_core/renderer_opengl/gl_buffer_cache.h | 19
-rw-r--r--  src/video_core/renderer_opengl/gl_device.cpp | 95
-rw-r--r--  src/video_core/renderer_opengl/gl_device.h | 22
-rw-r--r--  src/video_core/renderer_opengl/gl_fence_manager.cpp | 1
-rw-r--r--  src/video_core/renderer_opengl/gl_rasterizer.cpp | 336
-rw-r--r--  src/video_core/renderer_opengl/gl_rasterizer.h | 41
-rw-r--r--  src/video_core/renderer_opengl/gl_resource_manager.cpp | 9
-rw-r--r--  src/video_core/renderer_opengl/gl_resource_manager.h | 16
-rw-r--r--  src/video_core/renderer_opengl/gl_shader_cache.cpp | 184
-rw-r--r--  src/video_core/renderer_opengl/gl_shader_cache.h | 64
-rw-r--r--  src/video_core/renderer_opengl/gl_shader_decompiler.cpp | 160
-rw-r--r--  src/video_core/renderer_opengl/gl_shader_decompiler.h | 6
-rw-r--r--  src/video_core/renderer_opengl/gl_shader_disk_cache.cpp | 64
-rw-r--r--  src/video_core/renderer_opengl/gl_shader_disk_cache.h | 1
-rw-r--r--  src/video_core/renderer_opengl/gl_shader_manager.cpp | 110
-rw-r--r--  src/video_core/renderer_opengl/gl_shader_manager.h | 56
-rw-r--r--  src/video_core/renderer_opengl/gl_stream_buffer.cpp | 8
-rw-r--r--  src/video_core/renderer_opengl/gl_stream_buffer.h | 11
-rw-r--r--  src/video_core/renderer_opengl/gl_texture_cache.cpp | 123
-rw-r--r--  src/video_core/renderer_opengl/gl_texture_cache.h | 32
-rw-r--r--  src/video_core/renderer_opengl/renderer_opengl.cpp | 19
-rw-r--r--  src/video_core/renderer_opengl/renderer_opengl.h | 5
-rw-r--r--  src/video_core/renderer_vulkan/fixed_pipeline_state.cpp | 3
-rw-r--r--  src/video_core/renderer_vulkan/maxwell_to_vk.cpp | 6
-rw-r--r--  src/video_core/renderer_vulkan/vk_buffer_cache.cpp | 23
-rw-r--r--  src/video_core/renderer_vulkan/vk_buffer_cache.h | 17
-rw-r--r--  src/video_core/renderer_vulkan/vk_compute_pipeline.cpp | 3
-rw-r--r--  src/video_core/renderer_vulkan/vk_descriptor_pool.cpp | 1
-rw-r--r--  src/video_core/renderer_vulkan/vk_device.cpp | 142
-rw-r--r--  src/video_core/renderer_vulkan/vk_fence_manager.h | 1
-rw-r--r--  src/video_core/renderer_vulkan/vk_pipeline_cache.cpp | 93
-rw-r--r--  src/video_core/renderer_vulkan/vk_pipeline_cache.h | 33
-rw-r--r--  src/video_core/renderer_vulkan/vk_rasterizer.cpp | 92
-rw-r--r--  src/video_core/renderer_vulkan/vk_rasterizer.h | 18
-rw-r--r--  src/video_core/renderer_vulkan/vk_shader_decompiler.cpp | 221
-rw-r--r--  src/video_core/renderer_vulkan/vk_shader_decompiler.h | 12
-rw-r--r--  src/video_core/renderer_vulkan/vk_stream_buffer.h | 2
-rw-r--r--  src/video_core/renderer_vulkan/vk_texture_cache.cpp | 109
-rw-r--r--  src/video_core/renderer_vulkan/vk_texture_cache.h | 33
-rw-r--r--  src/video_core/shader/decode/memory.cpp | 3
-rw-r--r--  src/video_core/shader/decode/other.cpp | 42
-rw-r--r--  src/video_core/shader/decode/texture.cpp | 55
-rw-r--r--  src/video_core/shader/node.h | 84
-rw-r--r--  src/video_core/shader/node_helper.h | 2
-rw-r--r--  src/video_core/shader/registry.cpp | 20
-rw-r--r--  src/video_core/shader/registry.h | 35
-rw-r--r--  src/video_core/shader/shader_ir.h | 14
-rw-r--r--  src/video_core/shader/track.cpp | 78
-rw-r--r--  src/video_core/shader_cache.h | 228
-rw-r--r--  src/video_core/texture_cache/format_lookup_table.cpp | 3
-rw-r--r--  src/video_core/texture_cache/surface_base.cpp | 7
-rw-r--r--  src/video_core/texture_cache/surface_base.h | 13
-rw-r--r--  src/video_core/texture_cache/surface_params.cpp | 19
-rw-r--r--  src/video_core/texture_cache/texture_cache.h | 247
-rw-r--r--  src/yuzu/bootmanager.cpp | 3
-rw-r--r--  src/yuzu/configuration/config.cpp | 17
-rw-r--r--  src/yuzu/configuration/configure_debug.cpp | 3
-rw-r--r--  src/yuzu/configuration/configure_debug.ui | 13
-rw-r--r--  src/yuzu/configuration/configure_graphics.cpp | 45
-rw-r--r--  src/yuzu/configuration/configure_graphics.ui | 40
-rw-r--r--  src/yuzu/configuration/configure_graphics_advanced.cpp | 3
-rw-r--r--  src/yuzu/configuration/configure_graphics_advanced.ui | 10
-rw-r--r--  src/yuzu/configuration/configure_input_player.cpp | 3
-rw-r--r--  src/yuzu/discord_impl.cpp | 2
-rw-r--r--  src/yuzu/main.cpp | 10
-rw-r--r--  src/yuzu_cmd/config.cpp | 6
-rw-r--r--  src/yuzu_cmd/default_ini.h | 11
-rw-r--r--  src/yuzu_cmd/emu_window/emu_window_sdl2_gl.cpp | 3
-rw-r--r--  src/yuzu_tester/config.cpp | 2
-rw-r--r--  src/yuzu_tester/default_ini.h | 5
116 files changed, 6159 insertions, 1856 deletions
diff --git a/.gitmodules b/.gitmodules
index bf3b80d59..9ba8fe207 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -13,6 +13,9 @@
 [submodule "soundtouch"]
     path = externals/soundtouch
     url = https://github.com/citra-emu/ext-soundtouch.git
+[submodule "libressl"]
+    path = externals/libressl
+    url = https://github.com/citra-emu/ext-libressl-portable.git
 [submodule "discord-rpc"]
     path = externals/discord-rpc
     url = https://github.com/discordapp/discord-rpc.git
@@ -28,3 +31,6 @@
 [submodule "libzip"]
     path = externals/libzip/libzip
     url = https://github.com/nih-at/libzip.git
+[submodule "xbyak"]
+    path = externals/xbyak
+    url = https://github.com/herumi/xbyak.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 61321bf0a..b71071271 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.11)
+cmake_minimum_required(VERSION 3.15)
 
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules")
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/externals/cmake-modules")
@@ -13,7 +13,7 @@ project(yuzu)
 option(ENABLE_SDL2 "Enable the SDL2 frontend" ON)
 
 option(ENABLE_QT "Enable the Qt frontend" ON)
-CMAKE_DEPENDENT_OPTION(YUZU_USE_BUNDLED_QT "Download bundled Qt binaries" OFF "ENABLE_QT;MSVC" OFF)
+CMAKE_DEPENDENT_OPTION(YUZU_USE_BUNDLED_QT "Download bundled Qt binaries" ON "ENABLE_QT;MSVC" OFF)
 
 option(ENABLE_WEB_SERVICE "Enable web services (telemetry, etc.)" ON)
 
@@ -152,7 +152,6 @@ macro(yuzu_find_packages)
         "Boost 1.71 boost/1.72.0"
         "Catch2 2.11 catch2/2.11.0"
         "fmt 6.2 fmt/6.2.0"
-        "OpenSSL 1.1 openssl/1.1.1f"
         # can't use until https://github.com/bincrafters/community/issues/1173
         #"libzip 1.5 libzip/1.5.2@bincrafters/stable"
         "lz4 1.8 lz4/1.9.2"
@@ -312,15 +311,6 @@ elseif (TARGET Boost::boost)
     add_library(boost ALIAS Boost::boost)
 endif()
 
-if (NOT TARGET OpenSSL::SSL)
-    set_target_properties(OpenSSL::OpenSSL PROPERTIES IMPORTED_GLOBAL TRUE)
-    add_library(OpenSSL::SSL ALIAS OpenSSL::OpenSSL)
-endif()
-if (NOT TARGET OpenSSL::Crypto)
-    set_target_properties(OpenSSL::OpenSSL PROPERTIES IMPORTED_GLOBAL TRUE)
-    add_library(OpenSSL::Crypto ALIAS OpenSSL::OpenSSL)
-endif()
-
 if (TARGET sdl2::sdl2)
     # imported from the conan generated sdl2Config.cmake
     set_target_properties(sdl2::sdl2 PROPERTIES IMPORTED_GLOBAL TRUE)
diff --git a/CMakeModules/GenerateSCMRev.cmake b/CMakeModules/GenerateSCMRev.cmake
index 83e4e9df2..311ba1c2e 100644
--- a/CMakeModules/GenerateSCMRev.cmake
+++ b/CMakeModules/GenerateSCMRev.cmake
@@ -51,6 +51,8 @@ endif()
 # The variable SRC_DIR must be passed into the script (since it uses the current build directory for all values of CMAKE_*_DIR)
 set(VIDEO_CORE "${SRC_DIR}/src/video_core")
 set(HASH_FILES
+    "${VIDEO_CORE}/renderer_opengl/gl_arb_decompiler.cpp"
+    "${VIDEO_CORE}/renderer_opengl/gl_arb_decompiler.h"
     "${VIDEO_CORE}/renderer_opengl/gl_shader_cache.cpp"
     "${VIDEO_CORE}/renderer_opengl/gl_shader_cache.h"
     "${VIDEO_CORE}/renderer_opengl/gl_shader_decompiler.cpp"
diff --git a/externals/CMakeLists.txt b/externals/CMakeLists.txt
index 0b40cd1b0..b80b27605 100644
--- a/externals/CMakeLists.txt
+++ b/externals/CMakeLists.txt
@@ -4,6 +4,13 @@ list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/CMakeModules")
 list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/externals/find-modules")
 include(DownloadExternals)
 
+# xbyak
+if (ARCHITECTURE_x86 OR ARCHITECTURE_x86_64)
+    add_library(xbyak INTERFACE)
+    target_include_directories(xbyak SYSTEM INTERFACE ./xbyak/xbyak)
+    target_compile_definitions(xbyak INTERFACE XBYAK_NO_OP_NAMES)
+endif()
+
 # Catch
 add_library(catch-single-include INTERFACE)
 target_include_directories(catch-single-include INTERFACE catch/single_include)
@@ -66,6 +73,15 @@ if (NOT LIBZIP_FOUND)
 endif()
 
 if (ENABLE_WEB_SERVICE)
+    # LibreSSL
+    set(LIBRESSL_SKIP_INSTALL ON CACHE BOOL "")
+    add_subdirectory(libressl EXCLUDE_FROM_ALL)
+    target_include_directories(ssl INTERFACE ./libressl/include)
+    target_compile_definitions(ssl PRIVATE -DHAVE_INET_NTOP)
+    get_directory_property(OPENSSL_LIBRARIES
+        DIRECTORY libressl
+        DEFINITION OPENSSL_LIBS)
+
     # lurlparser
     add_subdirectory(lurlparser EXCLUDE_FROM_ALL)
 
@@ -73,5 +89,5 @@ if (ENABLE_WEB_SERVICE)
     add_library(httplib INTERFACE)
     target_include_directories(httplib INTERFACE ./httplib)
     target_compile_definitions(httplib INTERFACE -DCPPHTTPLIB_OPENSSL_SUPPORT)
-    target_link_libraries(httplib INTERFACE OpenSSL::SSL OpenSSL::Crypto)
+    target_link_libraries(httplib INTERFACE ${OPENSSL_LIBRARIES})
 endif()
diff --git a/externals/libressl b/externals/libressl
new file mode 160000
+Subproject 7d01cb01cb1a926ecb4c9c98b107ef3c26f59df
diff --git a/externals/sirit b/externals/sirit
-Subproject 414fc4dbd28d8fe48f735a0c389db8a234f733c
+Subproject eefca56afd49379bdebc97ded8b480839f93088
diff --git a/externals/xbyak b/externals/xbyak
new file mode 160000
+Subproject 82b70e665918efc2ee348091742fd0237b3b68c
diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt
index e6769a5f3..0a3e2f4d1 100644
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -32,6 +32,8 @@ add_custom_command(OUTPUT scm_rev.cpp
     DEPENDS
         # WARNING! It was too much work to try and make a common location for this list,
         # so if you need to change it, please update CMakeModules/GenerateSCMRev.cmake as well
+        "${VIDEO_CORE}/renderer_opengl/gl_arb_decompiler.cpp"
+        "${VIDEO_CORE}/renderer_opengl/gl_arb_decompiler.h"
         "${VIDEO_CORE}/renderer_opengl/gl_shader_cache.cpp"
         "${VIDEO_CORE}/renderer_opengl/gl_shader_cache.h"
         "${VIDEO_CORE}/renderer_opengl/gl_shader_decompiler.cpp"
@@ -123,6 +125,8 @@ add_library(common STATIC
     lz4_compression.cpp
     lz4_compression.h
     math_util.h
+    memory_detect.cpp
+    memory_detect.h
     memory_hook.cpp
     memory_hook.h
     microprofile.cpp
@@ -169,10 +173,12 @@ if(ARCHITECTURE_x86_64)
         PRIVATE
             x64/cpu_detect.cpp
            x64/cpu_detect.h
+            x64/xbyak_abi.h
+            x64/xbyak_util.h
     )
 endif()
 
 create_target_directory_groups(common)
 
 target_link_libraries(common PUBLIC Boost::boost fmt::fmt microprofile)
-target_link_libraries(common PRIVATE lz4::lz4 zstd::zstd)
+target_link_libraries(common PRIVATE lz4::lz4 zstd::zstd xbyak)
diff --git a/src/common/memory_detect.cpp b/src/common/memory_detect.cpp
new file mode 100644
index 000000000..3fdc309a2
--- /dev/null
+++ b/src/common/memory_detect.cpp
@@ -0,0 +1,60 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#ifdef _WIN32
+// clang-format off
+#include <windows.h>
+#include <sysinfoapi.h>
+// clang-format on
+#else
+#include <sys/types.h>
+#ifdef __APPLE__
+#include <sys/sysctl.h>
+#else
+#include <sys/sysinfo.h>
+#endif
+#endif
+
+#include "common/memory_detect.h"
+
+namespace Common {
+
+// Detects the RAM and Swapfile sizes
+static MemoryInfo Detect() {
+    MemoryInfo mem_info{};
+
+#ifdef _WIN32
+    MEMORYSTATUSEX memorystatus;
+    memorystatus.dwLength = sizeof(memorystatus);
+    GlobalMemoryStatusEx(&memorystatus);
+    mem_info.TotalPhysicalMemory = memorystatus.ullTotalPhys;
+    mem_info.TotalSwapMemory = memorystatus.ullTotalPageFile - mem_info.TotalPhysicalMemory;
+#elif defined(__APPLE__)
+    u64 ramsize;
+    struct xsw_usage vmusage;
+    std::size_t sizeof_ramsize = sizeof(ramsize);
+    std::size_t sizeof_vmusage = sizeof(vmusage);
+    // hw and vm are defined in sysctl.h
+    // https://github.com/apple/darwin-xnu/blob/master/bsd/sys/sysctl.h#L471
+    // sysctlbyname(const char *, void *, size_t *, void *, size_t);
+    sysctlbyname("hw.memsize", &ramsize, &sizeof_ramsize, NULL, 0);
+    sysctlbyname("vm.swapusage", &vmusage, &sizeof_vmusage, NULL, 0);
+    mem_info.TotalPhysicalMemory = ramsize;
+    mem_info.TotalSwapMemory = vmusage.xsu_total;
+#else
+    struct sysinfo meminfo;
+    sysinfo(&meminfo);
+    mem_info.TotalPhysicalMemory = meminfo.totalram;
+    mem_info.TotalSwapMemory = meminfo.totalswap;
+#endif
+
+    return mem_info;
+}
+
+const MemoryInfo& GetMemInfo() {
+    static MemoryInfo mem_info = Detect();
+    return mem_info;
+}
+
+} // namespace Common
\ No newline at end of file
diff --git a/src/common/memory_detect.h b/src/common/memory_detect.h
new file mode 100644
index 000000000..a73c0f3f4
--- /dev/null
+++ b/src/common/memory_detect.h
@@ -0,0 +1,22 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include "common/common_types.h"
+
+namespace Common {
+
+struct MemoryInfo {
+    u64 TotalPhysicalMemory{};
+    u64 TotalSwapMemory{};
+};
+
+/**
+ * Gets the memory info of the host system
+ * @return Reference to a MemoryInfo struct with the physical and swap memory sizes in bytes
+ */
+const MemoryInfo& GetMemInfo();
+
+} // namespace Common
\ No newline at end of file
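
Note: Common::GetMemInfo() runs Detect() once behind a function-local static, so repeated calls are cheap. A minimal caller sketch (the threshold and the policy are illustrative, not part of this commit):

    #include "common/memory_detect.h"

    // Hypothetical policy: treat hosts with < 4 GiB of RAM as low-memory.
    const auto& mem_info = Common::GetMemInfo();
    const bool low_memory_host =
        mem_info.TotalPhysicalMemory < 4ULL * 1024 * 1024 * 1024;
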
diff --git a/src/common/x64/xbyak_abi.h b/src/common/x64/xbyak_abi.h
new file mode 100644
index 000000000..794da8a52
--- /dev/null
+++ b/src/common/x64/xbyak_abi.h
@@ -0,0 +1,266 @@
+// Copyright 2016 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <bitset>
+#include <initializer_list>
+#include <xbyak.h>
+#include "common/assert.h"
+
+namespace Common::X64 {
+
+inline int RegToIndex(const Xbyak::Reg& reg) {
+    using Kind = Xbyak::Reg::Kind;
+    ASSERT_MSG((reg.getKind() & (Kind::REG | Kind::XMM)) != 0,
+               "RegSet only supports GPRs and XMM registers.");
+    ASSERT_MSG(reg.getIdx() < 16, "RegSet only supports XMM0-15.");
+    return reg.getIdx() + (reg.getKind() == Kind::REG ? 0 : 16);
+}
+
+inline Xbyak::Reg64 IndexToReg64(int reg_index) {
+    ASSERT(reg_index < 16);
+    return Xbyak::Reg64(reg_index);
+}
+
+inline Xbyak::Xmm IndexToXmm(int reg_index) {
+    ASSERT(reg_index >= 16 && reg_index < 32);
+    return Xbyak::Xmm(reg_index - 16);
+}
+
+inline Xbyak::Reg IndexToReg(int reg_index) {
+    if (reg_index < 16) {
+        return IndexToReg64(reg_index);
+    } else {
+        return IndexToXmm(reg_index);
+    }
+}
+
+inline std::bitset<32> BuildRegSet(std::initializer_list<Xbyak::Reg> regs) {
+    std::bitset<32> bits;
+    for (const Xbyak::Reg& reg : regs) {
+        bits[RegToIndex(reg)] = true;
+    }
+    return bits;
+}
+
+const std::bitset<32> ABI_ALL_GPRS(0x0000FFFF);
+const std::bitset<32> ABI_ALL_XMMS(0xFFFF0000);
+
+#ifdef _WIN32
+
+// Microsoft x64 ABI
+const Xbyak::Reg ABI_RETURN = Xbyak::util::rax;
+const Xbyak::Reg ABI_PARAM1 = Xbyak::util::rcx;
+const Xbyak::Reg ABI_PARAM2 = Xbyak::util::rdx;
+const Xbyak::Reg ABI_PARAM3 = Xbyak::util::r8;
+const Xbyak::Reg ABI_PARAM4 = Xbyak::util::r9;
+
+const std::bitset<32> ABI_ALL_CALLER_SAVED = BuildRegSet({
+    // GPRs
+    Xbyak::util::rcx,
+    Xbyak::util::rdx,
+    Xbyak::util::r8,
+    Xbyak::util::r9,
+    Xbyak::util::r10,
+    Xbyak::util::r11,
+    // XMMs
+    Xbyak::util::xmm0,
+    Xbyak::util::xmm1,
+    Xbyak::util::xmm2,
+    Xbyak::util::xmm3,
+    Xbyak::util::xmm4,
+    Xbyak::util::xmm5,
+});
+
+const std::bitset<32> ABI_ALL_CALLEE_SAVED = BuildRegSet({
+    // GPRs
+    Xbyak::util::rbx,
+    Xbyak::util::rsi,
+    Xbyak::util::rdi,
+    Xbyak::util::rbp,
+    Xbyak::util::r12,
+    Xbyak::util::r13,
+    Xbyak::util::r14,
+    Xbyak::util::r15,
+    // XMMs
+    Xbyak::util::xmm6,
+    Xbyak::util::xmm7,
+    Xbyak::util::xmm8,
+    Xbyak::util::xmm9,
+    Xbyak::util::xmm10,
+    Xbyak::util::xmm11,
+    Xbyak::util::xmm12,
+    Xbyak::util::xmm13,
+    Xbyak::util::xmm14,
+    Xbyak::util::xmm15,
+});
+
+constexpr size_t ABI_SHADOW_SPACE = 0x20;
+
+#else
+
+// System V x86-64 ABI
+const Xbyak::Reg ABI_RETURN = Xbyak::util::rax;
+const Xbyak::Reg ABI_PARAM1 = Xbyak::util::rdi;
+const Xbyak::Reg ABI_PARAM2 = Xbyak::util::rsi;
+const Xbyak::Reg ABI_PARAM3 = Xbyak::util::rdx;
+const Xbyak::Reg ABI_PARAM4 = Xbyak::util::rcx;
+
+const std::bitset<32> ABI_ALL_CALLER_SAVED = BuildRegSet({
+    // GPRs
+    Xbyak::util::rcx,
+    Xbyak::util::rdx,
+    Xbyak::util::rdi,
+    Xbyak::util::rsi,
+    Xbyak::util::r8,
+    Xbyak::util::r9,
+    Xbyak::util::r10,
+    Xbyak::util::r11,
+    // XMMs
+    Xbyak::util::xmm0,
+    Xbyak::util::xmm1,
+    Xbyak::util::xmm2,
+    Xbyak::util::xmm3,
+    Xbyak::util::xmm4,
+    Xbyak::util::xmm5,
+    Xbyak::util::xmm6,
+    Xbyak::util::xmm7,
+    Xbyak::util::xmm8,
+    Xbyak::util::xmm9,
+    Xbyak::util::xmm10,
+    Xbyak::util::xmm11,
+    Xbyak::util::xmm12,
+    Xbyak::util::xmm13,
+    Xbyak::util::xmm14,
+    Xbyak::util::xmm15,
+});
+
+const std::bitset<32> ABI_ALL_CALLEE_SAVED = BuildRegSet({
+    // GPRs
+    Xbyak::util::rbx,
+    Xbyak::util::rbp,
+    Xbyak::util::r12,
+    Xbyak::util::r13,
+    Xbyak::util::r14,
+    Xbyak::util::r15,
+});
+
+constexpr size_t ABI_SHADOW_SPACE = 0;
+
+#endif
+
+inline void ABI_CalculateFrameSize(std::bitset<32> regs, size_t rsp_alignment,
+                                   size_t needed_frame_size, s32* out_subtraction,
+                                   s32* out_xmm_offset) {
+    const auto count = (regs & ABI_ALL_GPRS).count();
+    rsp_alignment -= count * 8;
+    size_t subtraction = 0;
+    const auto xmm_count = (regs & ABI_ALL_XMMS).count();
+    if (xmm_count) {
+        // If we have any XMMs to save, we must align the stack here.
+        subtraction = rsp_alignment & 0xF;
+    }
+    subtraction += 0x10 * xmm_count;
+    size_t xmm_base_subtraction = subtraction;
+    subtraction += needed_frame_size;
+    subtraction += ABI_SHADOW_SPACE;
+    // Final alignment.
+    rsp_alignment -= subtraction;
+    subtraction += rsp_alignment & 0xF;
+
+    *out_subtraction = (s32)subtraction;
+    *out_xmm_offset = (s32)(subtraction - xmm_base_subtraction);
+}
+
+inline size_t ABI_PushRegistersAndAdjustStack(Xbyak::CodeGenerator& code, std::bitset<32> regs,
+                                              size_t rsp_alignment, size_t needed_frame_size = 0) {
+    s32 subtraction, xmm_offset;
+    ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size, &subtraction, &xmm_offset);
+    for (std::size_t i = 0; i < regs.size(); ++i) {
+        if (regs[i] && ABI_ALL_GPRS[i]) {
+            code.push(IndexToReg64(static_cast<int>(i)));
+        }
+    }
+    if (subtraction != 0) {
+        code.sub(code.rsp, subtraction);
+    }
+
+    for (std::size_t i = 0; i < regs.size(); ++i) {
+        if (regs[i] && ABI_ALL_XMMS[i]) {
+            code.movaps(code.xword[code.rsp + xmm_offset], IndexToXmm(static_cast<int>(i)));
+            xmm_offset += 0x10;
+        }
+    }
+
+    return ABI_SHADOW_SPACE;
+}
+
+inline void ABI_PopRegistersAndAdjustStack(Xbyak::CodeGenerator& code, std::bitset<32> regs,
+                                           size_t rsp_alignment, size_t needed_frame_size = 0) {
+    s32 subtraction, xmm_offset;
+    ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size, &subtraction, &xmm_offset);
+
+    for (std::size_t i = 0; i < regs.size(); ++i) {
+        if (regs[i] && ABI_ALL_XMMS[i]) {
+            code.movaps(IndexToXmm(static_cast<int>(i)), code.xword[code.rsp + xmm_offset]);
+            xmm_offset += 0x10;
+        }
+    }
+
+    if (subtraction != 0) {
+        code.add(code.rsp, subtraction);
+    }
+
+    // GPRs need to be popped in reverse order
+    for (int i = 15; i >= 0; i--) {
+        if (regs[i]) {
+            code.pop(IndexToReg64(i));
+        }
+    }
+}
+
+inline size_t ABI_PushRegistersAndAdjustStackGPS(Xbyak::CodeGenerator& code, std::bitset<32> regs,
+                                                 size_t rsp_alignment,
+                                                 size_t needed_frame_size = 0) {
+    s32 subtraction, xmm_offset;
+    ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size, &subtraction, &xmm_offset);
+
+    for (std::size_t i = 0; i < regs.size(); ++i) {
+        if (regs[i] && ABI_ALL_GPRS[i]) {
+            code.push(IndexToReg64(static_cast<int>(i)));
+        }
+    }
+
+    if (subtraction != 0) {
+        code.sub(code.rsp, subtraction);
+    }
+
+    return ABI_SHADOW_SPACE;
+}
+
+inline void ABI_PopRegistersAndAdjustStackGPS(Xbyak::CodeGenerator& code, std::bitset<32> regs,
+                                              size_t rsp_alignment, size_t needed_frame_size = 0) {
+    s32 subtraction, xmm_offset;
+    ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size, &subtraction, &xmm_offset);
+
+    if (subtraction != 0) {
+        code.add(code.rsp, subtraction);
+    }
+
+    // GPRs need to be popped in reverse order
+    for (int i = 15; i >= 0; i--) {
+        if (regs[i]) {
+            code.pop(IndexToReg64(i));
+        }
+    }
+}
+
+} // namespace Common::X64
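
The push/pop helpers above are intended to be used in matched pairs around generated code that clobbers saved registers. A minimal sketch using the helpers from this header (the register choice and body are illustrative):

    Xbyak::CodeGenerator code;
    const std::bitset<32> to_save =
        Common::X64::BuildRegSet({Xbyak::util::rbx, Xbyak::util::xmm6});
    // 8 = stack misalignment at function entry, caused by the caller's
    // implicit push of the return address.
    Common::X64::ABI_PushRegistersAndAdjustStack(code, to_save, 8);
    // ... emit a body that may clobber rbx and xmm6 ...
    Common::X64::ABI_PopRegistersAndAdjustStack(code, to_save, 8);
    code.ret();
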
diff --git a/src/common/x64/xbyak_util.h b/src/common/x64/xbyak_util.h
new file mode 100644
index 000000000..df17f8cbe
--- /dev/null
+++ b/src/common/x64/xbyak_util.h
@@ -0,0 +1,47 @@
+// Copyright 2016 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <type_traits>
+#include <xbyak.h>
+#include "common/x64/xbyak_abi.h"
+
+namespace Common::X64 {
+
+// Constants for use with cmpps/cmpss
+enum {
+    CMP_EQ = 0,
+    CMP_LT = 1,
+    CMP_LE = 2,
+    CMP_UNORD = 3,
+    CMP_NEQ = 4,
+    CMP_NLT = 5,
+    CMP_NLE = 6,
+    CMP_ORD = 7,
+};
+
+constexpr bool IsWithin2G(uintptr_t ref, uintptr_t target) {
+    const u64 distance = target - (ref + 5);
+    return !(distance >= 0x8000'0000ULL && distance <= ~0x8000'0000ULL);
+}
+
+inline bool IsWithin2G(const Xbyak::CodeGenerator& code, uintptr_t target) {
+    return IsWithin2G(reinterpret_cast<uintptr_t>(code.getCurr()), target);
+}
+
+template <typename T>
+inline void CallFarFunction(Xbyak::CodeGenerator& code, const T f) {
+    static_assert(std::is_pointer_v<T>, "Argument must be a (function) pointer.");
+    size_t addr = reinterpret_cast<size_t>(f);
+    if (IsWithin2G(code, addr)) {
+        code.call(f);
+    } else {
+        // ABI_RETURN is a safe temp register to use before a call
+        code.mov(ABI_RETURN, addr);
+        code.call(ABI_RETURN);
+    }
+}
+
+} // namespace Common::X64
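
IsWithin2G mirrors the reach of an x86-64 rel32 call: the 32-bit displacement is measured from the end of the 5-byte call instruction, hence the ref + 5. A hypothetical use when emitting a call out to a host function:

    static void HostCallback() {}

    void EmitHostCall(Xbyak::CodeGenerator& code) {
        // Emits `call rel32` when the target is reachable, otherwise
        // `mov rax, imm64; call rax` (rax is ABI_RETURN).
        Common::X64::CallFarFunction(code, &HostCallback);
    }
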
diff --git a/src/core/file_sys/patch_manager.cpp b/src/core/file_sys/patch_manager.cpp
index b93aa6935..c47ff863e 100644
--- a/src/core/file_sys/patch_manager.cpp
+++ b/src/core/file_sys/patch_manager.cpp
@@ -10,6 +10,7 @@
 #include "common/file_util.h"
 #include "common/hex_util.h"
 #include "common/logging/log.h"
+#include "common/string_util.h"
 #include "core/core.h"
 #include "core/file_sys/content_archive.h"
 #include "core/file_sys/control_metadata.h"
@@ -48,6 +49,23 @@ std::string FormatTitleVersion(u32 version, TitleVersionFormat format) {
     return fmt::format("v{}.{}.{}", bytes[3], bytes[2], bytes[1]);
 }
 
+std::shared_ptr<VfsDirectory> FindSubdirectoryCaseless(const std::shared_ptr<VfsDirectory> dir,
+                                                       std::string_view name) {
+#ifdef _WIN32
+    return dir->GetSubdirectory(name);
+#else
+    const auto subdirs = dir->GetSubdirectories();
+    for (const auto& subdir : subdirs) {
+        std::string dir_name = Common::ToLower(subdir->GetName());
+        if (dir_name == name) {
+            return subdir;
+        }
+    }
+
+    return nullptr;
+#endif
+}
+
 PatchManager::PatchManager(u64 title_id) : title_id(title_id) {}
 
 PatchManager::~PatchManager() = default;
@@ -104,7 +122,7 @@ VirtualDir PatchManager::PatchExeFS(VirtualDir exefs) const {
         if (std::find(disabled.begin(), disabled.end(), subdir->GetName()) != disabled.end())
             continue;
 
-        auto exefs_dir = subdir->GetSubdirectory("exefs");
+        auto exefs_dir = FindSubdirectoryCaseless(subdir, "exefs");
         if (exefs_dir != nullptr)
             layers.push_back(std::move(exefs_dir));
     }
@@ -130,7 +148,7 @@ std::vector<VirtualFile> PatchManager::CollectPatches(const std::vector<VirtualD
         if (std::find(disabled.cbegin(), disabled.cend(), subdir->GetName()) != disabled.cend())
             continue;
 
-        auto exefs_dir = subdir->GetSubdirectory("exefs");
+        auto exefs_dir = FindSubdirectoryCaseless(subdir, "exefs");
         if (exefs_dir != nullptr) {
             for (const auto& file : exefs_dir->GetFiles()) {
                 if (file->GetExtension() == "ips") {
@@ -295,7 +313,7 @@ std::vector<Core::Memory::CheatEntry> PatchManager::CreateCheatList(
             continue;
         }
 
-        auto cheats_dir = subdir->GetSubdirectory("cheats");
+        auto cheats_dir = FindSubdirectoryCaseless(subdir, "cheats");
         if (cheats_dir != nullptr) {
             auto res = ReadCheatFileFromFolder(system, title_id, build_id_, cheats_dir, true);
             if (res.has_value()) {
@@ -340,11 +358,11 @@ static void ApplyLayeredFS(VirtualFile& romfs, u64 title_id, ContentRecordType t
             continue;
         }
 
-        auto romfs_dir = subdir->GetSubdirectory("romfs");
+        auto romfs_dir = FindSubdirectoryCaseless(subdir, "romfs");
         if (romfs_dir != nullptr)
            layers.push_back(std::move(romfs_dir));
 
-        auto ext_dir = subdir->GetSubdirectory("romfs_ext");
+        auto ext_dir = FindSubdirectoryCaseless(subdir, "romfs_ext");
         if (ext_dir != nullptr)
             layers_ext.push_back(std::move(ext_dir));
     }
@@ -470,7 +488,7 @@ std::map<std::string, std::string, std::less<>> PatchManager::GetPatchVersionNam
     for (const auto& mod : mod_dir->GetSubdirectories()) {
         std::string types;
 
-        const auto exefs_dir = mod->GetSubdirectory("exefs");
+        const auto exefs_dir = FindSubdirectoryCaseless(mod, "exefs");
         if (IsDirValidAndNonEmpty(exefs_dir)) {
             bool ips = false;
             bool ipswitch = false;
@@ -494,9 +512,9 @@ std::map<std::string, std::string, std::less<>> PatchManager::GetPatchVersionNam
             if (layeredfs)
                 AppendCommaIfNotEmpty(types, "LayeredExeFS");
         }
-        if (IsDirValidAndNonEmpty(mod->GetSubdirectory("romfs")))
+        if (IsDirValidAndNonEmpty(FindSubdirectoryCaseless(mod, "romfs")))
             AppendCommaIfNotEmpty(types, "LayeredFS");
-        if (IsDirValidAndNonEmpty(mod->GetSubdirectory("cheats")))
+        if (IsDirValidAndNonEmpty(FindSubdirectoryCaseless(mod, "cheats")))
             AppendCommaIfNotEmpty(types, "Cheats");
 
         if (types.empty())
diff --git a/src/core/file_sys/patch_manager.h b/src/core/file_sys/patch_manager.h
index ec6db524d..f4cb918dd 100644
--- a/src/core/file_sys/patch_manager.h
+++ b/src/core/file_sys/patch_manager.h
@@ -29,6 +29,11 @@ enum class TitleVersionFormat : u8 {
 std::string FormatTitleVersion(u32 version,
                                TitleVersionFormat format = TitleVersionFormat::ThreeElements);
 
+// Returns the subdirectory of dir whose name matches `name` case-insensitively, or nullptr if
+// dir contains no such subdirectory.
+std::shared_ptr<VfsDirectory> FindSubdirectoryCaseless(const std::shared_ptr<VfsDirectory> dir,
+                                                       std::string_view name);
+
 // A centralized class to manage patches to games.
 class PatchManager {
 public:
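
Note that the non-Windows branch of FindSubdirectoryCaseless lowercases only the on-disk directory name, so the `name` argument is expected to be lowercase already; every call site in this patch passes a lowercase literal. A usage sketch (the directory variable is hypothetical):

    // Matches "exefs", "ExeFS", "EXEFS", ... on case-sensitive filesystems.
    if (const auto exefs_dir = FindSubdirectoryCaseless(mod_dir, "exefs")) {
        // apply the layer
    }
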
diff --git a/src/core/hle/kernel/process.cpp b/src/core/hle/kernel/process.cpp
index 36724569f..c4c5199b1 100644
--- a/src/core/hle/kernel/process.cpp
+++ b/src/core/hle/kernel/process.cpp
@@ -132,7 +132,8 @@ std::shared_ptr<ResourceLimit> Process::GetResourceLimit() const {
 
 u64 Process::GetTotalPhysicalMemoryAvailable() const {
     const u64 capacity{resource_limit->GetCurrentResourceValue(ResourceType::PhysicalMemory) +
-                       page_table->GetTotalHeapSize() + image_size + main_thread_stack_size};
+                       page_table->GetTotalHeapSize() + GetSystemResourceSize() + image_size +
+                       main_thread_stack_size};
 
     if (capacity < memory_usage_capacity) {
         return capacity;
@@ -146,7 +147,8 @@ u64 Process::GetTotalPhysicalMemoryAvailableWithoutSystemResource() const {
 }
 
 u64 Process::GetTotalPhysicalMemoryUsed() const {
-    return image_size + main_thread_stack_size + page_table->GetTotalHeapSize();
+    return image_size + main_thread_stack_size + page_table->GetTotalHeapSize() +
+           GetSystemResourceSize();
 }
 
 u64 Process::GetTotalPhysicalMemoryUsedWithoutSystemResource() const {
diff --git a/src/core/hle/kernel/readable_event.cpp b/src/core/hle/kernel/readable_event.cpp
index 00860fcbd..ef5e19e63 100644
--- a/src/core/hle/kernel/readable_event.cpp
+++ b/src/core/hle/kernel/readable_event.cpp
@@ -38,7 +38,7 @@ void ReadableEvent::Clear() {
 
 ResultCode ReadableEvent::Reset() {
     if (!is_signaled) {
-        LOG_ERROR(Kernel, "Handle is not signaled! object_id={}, object_type={}, object_name={}",
+        LOG_TRACE(Kernel, "Handle is not signaled! object_id={}, object_type={}, object_name={}",
                   GetObjectId(), GetTypeName(), GetName());
         return ERR_INVALID_STATE;
     }
diff --git a/src/core/hle/kernel/resource_limit.cpp b/src/core/hle/kernel/resource_limit.cpp
index d9beaa3a4..212e442f4 100644
--- a/src/core/hle/kernel/resource_limit.cpp
+++ b/src/core/hle/kernel/resource_limit.cpp
@@ -24,13 +24,9 @@ bool ResourceLimit::Reserve(ResourceType resource, s64 amount, u64 timeout) {
     const std::size_t index{ResourceTypeToIndex(resource)};
 
     s64 new_value = current[index] + amount;
-    while (new_value > limit[index] && available[index] + amount <= limit[index]) {
+    if (new_value > limit[index] && available[index] + amount <= limit[index]) {
         // TODO(bunnei): This is wrong for multicore, we should wait the calling thread for timeout
         new_value = current[index] + amount;
-
-        if (timeout >= 0) {
-            break;
-        }
     }
 
     if (new_value <= limit[index]) {
diff --git a/src/core/hle/service/hid/hid.cpp b/src/core/hle/service/hid/hid.cpp
index c84cb1483..72a050de2 100644
--- a/src/core/hle/service/hid/hid.cpp
+++ b/src/core/hle/service/hid/hid.cpp
@@ -161,7 +161,7 @@ Hid::Hid(Core::System& system) : ServiceFramework("hid"), system(system) {
         {40, nullptr, "AcquireXpadIdEventHandle"},
         {41, nullptr, "ReleaseXpadIdEventHandle"},
         {51, &Hid::ActivateXpad, "ActivateXpad"},
-        {55, nullptr, "GetXpadIds"},
+        {55, &Hid::GetXpadIDs, "GetXpadIds"},
         {56, nullptr, "ActivateJoyXpad"},
         {58, nullptr, "GetJoyXpadLifoHandle"},
         {59, nullptr, "GetJoyXpadIds"},
@@ -319,6 +319,17 @@ void Hid::ActivateXpad(Kernel::HLERequestContext& ctx) {
     rb.Push(RESULT_SUCCESS);
 }
 
+void Hid::GetXpadIDs(Kernel::HLERequestContext& ctx) {
+    IPC::RequestParser rp{ctx};
+    const auto applet_resource_user_id{rp.Pop<u64>()};
+
+    LOG_DEBUG(Service_HID, "(STUBBED) called, applet_resource_user_id={}", applet_resource_user_id);
+
+    IPC::ResponseBuilder rb{ctx, 3};
+    rb.Push(RESULT_SUCCESS);
+    rb.Push(0);
+}
+
 void Hid::ActivateDebugPad(Kernel::HLERequestContext& ctx) {
     IPC::RequestParser rp{ctx};
     const auto applet_resource_user_id{rp.Pop<u64>()};
diff --git a/src/core/hle/service/hid/hid.h b/src/core/hle/service/hid/hid.h
index c8ed4ad8b..d481a75f8 100644
--- a/src/core/hle/service/hid/hid.h
+++ b/src/core/hle/service/hid/hid.h
@@ -86,6 +86,7 @@ public:
 private:
     void CreateAppletResource(Kernel::HLERequestContext& ctx);
     void ActivateXpad(Kernel::HLERequestContext& ctx);
+    void GetXpadIDs(Kernel::HLERequestContext& ctx);
     void ActivateDebugPad(Kernel::HLERequestContext& ctx);
     void ActivateTouchScreen(Kernel::HLERequestContext& ctx);
     void ActivateMouse(Kernel::HLERequestContext& ctx);
diff --git a/src/core/hle/service/nifm/nifm.cpp b/src/core/hle/service/nifm/nifm.cpp
index 767158444..01ddcdbd6 100644
--- a/src/core/hle/service/nifm/nifm.cpp
+++ b/src/core/hle/service/nifm/nifm.cpp
@@ -177,7 +177,8 @@ private:
     void CreateTemporaryNetworkProfile(Kernel::HLERequestContext& ctx) {
         LOG_DEBUG(Service_NIFM, "called");
 
-        ASSERT_MSG(ctx.GetReadBufferSize() == 0x17c, "NetworkProfileData is not the correct size");
+        ASSERT_MSG(ctx.GetReadBufferSize() == 0x17c,
+                   "SfNetworkProfileData is not the correct size");
         u128 uuid{};
         auto buffer = ctx.ReadBuffer();
         std::memcpy(&uuid, buffer.data() + 8, sizeof(u128));
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.cpp b/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.cpp
index cc2192e5c..0d913334e 100644
--- a/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.cpp
@@ -25,7 +25,7 @@ u32 nvhost_ctrl_gpu::ioctl(Ioctl command, const std::vector<u8>& input,
     case IoctlCommand::IocGetCharacteristicsCommand:
         return GetCharacteristics(input, output, output2, version);
     case IoctlCommand::IocGetTPCMasksCommand:
-        return GetTPCMasks(input, output);
+        return GetTPCMasks(input, output, output2, version);
     case IoctlCommand::IocGetActiveSlotMaskCommand:
         return GetActiveSlotMask(input, output);
     case IoctlCommand::IocZcullGetCtxSizeCommand:
@@ -98,17 +98,22 @@ u32 nvhost_ctrl_gpu::GetCharacteristics(const std::vector<u8>& input, std::vecto
     return 0;
 }
 
-u32 nvhost_ctrl_gpu::GetTPCMasks(const std::vector<u8>& input, std::vector<u8>& output) {
+u32 nvhost_ctrl_gpu::GetTPCMasks(const std::vector<u8>& input, std::vector<u8>& output,
+                                 std::vector<u8>& output2, IoctlVersion version) {
     IoctlGpuGetTpcMasksArgs params{};
     std::memcpy(&params, input.data(), input.size());
-    LOG_INFO(Service_NVDRV, "called, mask=0x{:X}, mask_buf_addr=0x{:X}", params.mask_buf_size,
-             params.mask_buf_addr);
-    // TODO(ogniK): Confirm value on hardware
-    if (params.mask_buf_size)
-        params.tpc_mask_size = 4 * 1; // 4 * num_gpc
-    else
-        params.tpc_mask_size = 0;
-    std::memcpy(output.data(), &params, sizeof(params));
+    LOG_DEBUG(Service_NVDRV, "called, mask_buffer_size=0x{:X}", params.mask_buffer_size);
+    if (params.mask_buffer_size != 0) {
+        params.tcp_mask = 3;
+    }
+
+    if (version == IoctlVersion::Version3) {
+        std::memcpy(output.data(), input.data(), output.size());
+        std::memcpy(output2.data(), &params.tcp_mask, output2.size());
+    } else {
+        std::memcpy(output.data(), &params, output.size());
+    }
+
     return 0;
 }
 
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.h b/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.h
index 07b644ec5..ef60f72ce 100644
--- a/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.h
+++ b/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.h
@@ -92,16 +92,11 @@ private:
                   "IoctlCharacteristics is incorrect size");
 
     struct IoctlGpuGetTpcMasksArgs {
-        /// [in]  TPC mask buffer size reserved by userspace. Should be at least
-        /// sizeof(__u32) * fls(gpc_mask) to receive TPC mask for each GPC.
-        /// [out] full kernel buffer size
-        u32_le mask_buf_size;
-        u32_le reserved;
-
-        /// [in]  pointer to TPC mask buffer. It will receive one 32-bit TPC mask per GPC or 0 if
-        /// GPC is not enabled or not present. This parameter is ignored if mask_buf_size is 0.
-        u64_le mask_buf_addr;
-        u64_le tpc_mask_size; // Nintendo add this?
+        u32_le mask_buffer_size{};
+        INSERT_PADDING_WORDS(1);
+        u64_le mask_buffer_address{};
+        u32_le tcp_mask{};
+        INSERT_PADDING_WORDS(1);
     };
     static_assert(sizeof(IoctlGpuGetTpcMasksArgs) == 24,
                   "IoctlGpuGetTpcMasksArgs is incorrect size");
@@ -166,7 +161,8 @@ private:
 
     u32 GetCharacteristics(const std::vector<u8>& input, std::vector<u8>& output,
                            std::vector<u8>& output2, IoctlVersion version);
-    u32 GetTPCMasks(const std::vector<u8>& input, std::vector<u8>& output);
+    u32 GetTPCMasks(const std::vector<u8>& input, std::vector<u8>& output, std::vector<u8>& output2,
+                    IoctlVersion version);
     u32 GetActiveSlotMask(const std::vector<u8>& input, std::vector<u8>& output);
     u32 ZCullGetCtxSize(const std::vector<u8>& input, std::vector<u8>& output);
     u32 ZCullGetInfo(const std::vector<u8>& input, std::vector<u8>& output);
diff --git a/src/core/settings.cpp b/src/core/settings.cpp
index da53cde05..4edff9cd8 100644
--- a/src/core/settings.cpp
+++ b/src/core/settings.cpp
@@ -112,6 +112,7 @@ void LogSettings() {
     LogSetting("Renderer_UseAsynchronousGpuEmulation",
                Settings::values.use_asynchronous_gpu_emulation);
     LogSetting("Renderer_UseVsync", Settings::values.use_vsync);
+    LogSetting("Renderer_UseAssemblyShaders", Settings::values.use_assembly_shaders);
     LogSetting("Renderer_AnisotropicFilteringLevel", Settings::values.max_anisotropy);
     LogSetting("Audio_OutputEngine", Settings::values.sink_id);
     LogSetting("Audio_EnableAudioStretching", Settings::values.enable_audio_stretching);
diff --git a/src/core/settings.h b/src/core/settings.h
index c1266b341..33e1e06cd 100644
--- a/src/core/settings.h
+++ b/src/core/settings.h
@@ -437,7 +437,7 @@ struct Values {
     bool renderer_debug;
     int vulkan_device;
 
-    float resolution_factor;
+    u16 resolution_factor{1};
     int aspect_ratio;
     int max_anisotropy;
     bool use_frame_limit;
@@ -446,6 +446,7 @@
     GPUAccuracy gpu_accuracy;
     bool use_asynchronous_gpu_emulation;
     bool use_vsync;
+    bool use_assembly_shaders;
     bool force_30fps_mode;
     bool use_fast_gpu_time;
 
@@ -473,6 +474,7 @@
     bool reporting_services;
     bool quest_flag;
     bool disable_cpu_opt;
+    bool disable_macro_jit;
 
     // BCAT
     std::string bcat_backend;
diff --git a/src/core/telemetry_session.cpp b/src/core/telemetry_session.cpp
index 1c3b03a1c..c781b3cfc 100644
--- a/src/core/telemetry_session.cpp
+++ b/src/core/telemetry_session.cpp
@@ -201,6 +201,7 @@ void TelemetrySession::AddInitialInfo(Loader::AppLoader& app_loader) {
     AddField(field_type, "Renderer_UseAsynchronousGpuEmulation",
              Settings::values.use_asynchronous_gpu_emulation);
     AddField(field_type, "Renderer_UseVsync", Settings::values.use_vsync);
+    AddField(field_type, "Renderer_UseAssemblyShaders", Settings::values.use_assembly_shaders);
     AddField(field_type, "System_UseDockedMode", Settings::values.use_docked_mode);
 }
 
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index d23c53843..099bb446e 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -1,6 +1,7 @@
 add_library(video_core STATIC
     buffer_cache/buffer_block.h
     buffer_cache/buffer_cache.h
+    buffer_cache/map_interval.cpp
     buffer_cache/map_interval.h
     dirty_flags.cpp
     dirty_flags.h
@@ -24,6 +25,12 @@ add_library(video_core STATIC
     engines/shader_bytecode.h
     engines/shader_header.h
     engines/shader_type.h
+    macro/macro.cpp
+    macro/macro.h
+    macro/macro_interpreter.cpp
+    macro/macro_interpreter.h
+    macro/macro_jit_x64.cpp
+    macro/macro_jit_x64.h
     fence_manager.h
     gpu.cpp
     gpu.h
@@ -35,8 +42,6 @@ add_library(video_core STATIC
     gpu_thread.h
     guest_driver.cpp
     guest_driver.h
-    macro_interpreter.cpp
-    macro_interpreter.h
     memory_manager.cpp
     memory_manager.h
     morton.cpp
@@ -44,11 +49,11 @@ add_library(video_core STATIC
     query_cache.h
     rasterizer_accelerated.cpp
     rasterizer_accelerated.h
-    rasterizer_cache.cpp
-    rasterizer_cache.h
     rasterizer_interface.h
     renderer_base.cpp
     renderer_base.h
+    renderer_opengl/gl_arb_decompiler.cpp
+    renderer_opengl/gl_arb_decompiler.h
     renderer_opengl/gl_buffer_cache.cpp
     renderer_opengl/gl_buffer_cache.h
     renderer_opengl/gl_device.cpp
@@ -88,6 +93,7 @@ add_library(video_core STATIC
     renderer_opengl/utils.h
     sampler_cache.cpp
     sampler_cache.h
+    shader_cache.h
     shader/decode/arithmetic.cpp
     shader/decode/arithmetic_immediate.cpp
     shader/decode/bfe.cpp
@@ -228,7 +234,7 @@ endif()
 create_target_directory_groups(video_core)
 
 target_link_libraries(video_core PUBLIC common core)
-target_link_libraries(video_core PRIVATE glad)
+target_link_libraries(video_core PRIVATE glad xbyak)
 
 if (ENABLE_VULKAN)
     target_include_directories(video_core PRIVATE sirit ../../externals/Vulkan-Headers/include)
diff --git a/src/video_core/buffer_cache/buffer_block.h b/src/video_core/buffer_cache/buffer_block.h
index e35ee0b67..e64170e66 100644
--- a/src/video_core/buffer_cache/buffer_block.h
+++ b/src/video_core/buffer_cache/buffer_block.h
@@ -15,48 +15,47 @@ namespace VideoCommon {
 
 class BufferBlock {
 public:
-    bool Overlaps(const VAddr start, const VAddr end) const {
+    bool Overlaps(VAddr start, VAddr end) const {
         return (cpu_addr < end) && (cpu_addr_end > start);
     }
 
-    bool IsInside(const VAddr other_start, const VAddr other_end) const {
+    bool IsInside(VAddr other_start, VAddr other_end) const {
         return cpu_addr <= other_start && other_end <= cpu_addr_end;
     }
 
-    std::size_t GetOffset(const VAddr in_addr) {
+    std::size_t Offset(VAddr in_addr) const {
         return static_cast<std::size_t>(in_addr - cpu_addr);
     }
 
-    VAddr GetCpuAddr() const {
+    VAddr CpuAddr() const {
         return cpu_addr;
     }
 
-    VAddr GetCpuAddrEnd() const {
+    VAddr CpuAddrEnd() const {
         return cpu_addr_end;
     }
 
-    void SetCpuAddr(const VAddr new_addr) {
+    void SetCpuAddr(VAddr new_addr) {
         cpu_addr = new_addr;
         cpu_addr_end = new_addr + size;
     }
 
-    std::size_t GetSize() const {
+    std::size_t Size() const {
         return size;
     }
 
-    void SetEpoch(u64 new_epoch) {
-        epoch = new_epoch;
+    u64 Epoch() const {
+        return epoch;
     }
 
-    u64 GetEpoch() {
-        return epoch;
+    void SetEpoch(u64 new_epoch) {
+        epoch = new_epoch;
     }
 
 protected:
-    explicit BufferBlock(VAddr cpu_addr, const std::size_t size) : size{size} {
-        SetCpuAddr(cpu_addr);
+    explicit BufferBlock(VAddr cpu_addr_, std::size_t size_) : size{size_} {
+        SetCpuAddr(cpu_addr_);
     }
-    ~BufferBlock() = default;
 
 private:
     VAddr cpu_addr{};
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index 56e570994..308d8b55f 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -12,11 +12,12 @@
12#include <utility> 12#include <utility>
13#include <vector> 13#include <vector>
14 14
15#include <boost/icl/interval_map.hpp> 15#include <boost/container/small_vector.hpp>
16#include <boost/icl/interval_set.hpp> 16#include <boost/icl/interval_set.hpp>
17#include <boost/range/iterator_range.hpp> 17#include <boost/intrusive/set.hpp>
18 18
19#include "common/alignment.h" 19#include "common/alignment.h"
20#include "common/assert.h"
20#include "common/common_types.h" 21#include "common/common_types.h"
21#include "common/logging/log.h" 22#include "common/logging/log.h"
22#include "core/core.h" 23#include "core/core.h"
@@ -29,10 +30,16 @@
29 30
30namespace VideoCommon { 31namespace VideoCommon {
31 32
32using MapInterval = std::shared_ptr<MapIntervalBase>; 33template <typename Buffer, typename BufferType, typename StreamBuffer>
33
34template <typename OwnerBuffer, typename BufferType, typename StreamBuffer>
35class BufferCache { 34class BufferCache {
35 using IntervalSet = boost::icl::interval_set<VAddr>;
36 using IntervalType = typename IntervalSet::interval_type;
37 using VectorMapInterval = boost::container::small_vector<MapInterval*, 1>;
38
39 static constexpr u64 WRITE_PAGE_BIT = 11;
40 static constexpr u64 BLOCK_PAGE_BITS = 21;
41 static constexpr u64 BLOCK_PAGE_SIZE = 1ULL << BLOCK_PAGE_BITS;
42
36public: 43public:
37 using BufferInfo = std::pair<BufferType, u64>; 44 using BufferInfo = std::pair<BufferType, u64>;
38 45
@@ -40,14 +47,12 @@ public:
40 bool is_written = false, bool use_fast_cbuf = false) { 47 bool is_written = false, bool use_fast_cbuf = false) {
41 std::lock_guard lock{mutex}; 48 std::lock_guard lock{mutex};
42 49
43 const std::optional<VAddr> cpu_addr_opt = 50 const auto& memory_manager = system.GPU().MemoryManager();
44 system.GPU().MemoryManager().GpuToCpuAddress(gpu_addr); 51 const std::optional<VAddr> cpu_addr_opt = memory_manager.GpuToCpuAddress(gpu_addr);
45
46 if (!cpu_addr_opt) { 52 if (!cpu_addr_opt) {
47 return {GetEmptyBuffer(size), 0}; 53 return {GetEmptyBuffer(size), 0};
48 } 54 }
49 55 const VAddr cpu_addr = *cpu_addr_opt;
50 VAddr cpu_addr = *cpu_addr_opt;
51 56
52 // Cache management is a big overhead, so only cache entries with a given size. 57 // Cache management is a big overhead, so only cache entries with a given size.
53 // TODO: Figure out which size is the best for given games. 58 // TODO: Figure out which size is the best for given games.
@@ -55,76 +60,91 @@ public:
55 if (use_fast_cbuf || size < max_stream_size) { 60 if (use_fast_cbuf || size < max_stream_size) {
56 if (!is_written && !IsRegionWritten(cpu_addr, cpu_addr + size - 1)) { 61 if (!is_written && !IsRegionWritten(cpu_addr, cpu_addr + size - 1)) {
57 auto& memory_manager = system.GPU().MemoryManager(); 62 auto& memory_manager = system.GPU().MemoryManager();
63 const bool is_granular = memory_manager.IsGranularRange(gpu_addr, size);
58 if (use_fast_cbuf) { 64 if (use_fast_cbuf) {
59 if (memory_manager.IsGranularRange(gpu_addr, size)) { 65 u8* dest;
60 const auto host_ptr = memory_manager.GetPointer(gpu_addr); 66 if (is_granular) {
61 return ConstBufferUpload(host_ptr, size); 67 dest = memory_manager.GetPointer(gpu_addr);
62 } else { 68 } else {
63 staging_buffer.resize(size); 69 staging_buffer.resize(size);
64 memory_manager.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size); 70 dest = staging_buffer.data();
65 return ConstBufferUpload(staging_buffer.data(), size); 71 memory_manager.ReadBlockUnsafe(gpu_addr, dest, size);
66 } 72 }
73 return ConstBufferUpload(dest, size);
74 }
75 if (is_granular) {
76 u8* const host_ptr = memory_manager.GetPointer(gpu_addr);
77 return StreamBufferUpload(size, alignment, [host_ptr, size](u8* dest) {
78 std::memcpy(dest, host_ptr, size);
79 });
67 } else { 80 } else {
68 if (memory_manager.IsGranularRange(gpu_addr, size)) { 81 return StreamBufferUpload(
69 const auto host_ptr = memory_manager.GetPointer(gpu_addr); 82 size, alignment, [&memory_manager, gpu_addr, size](u8* dest) {
70 return StreamBufferUpload(host_ptr, size, alignment); 83 memory_manager.ReadBlockUnsafe(gpu_addr, dest, size);
71 } else { 84 });
72 staging_buffer.resize(size);
73 memory_manager.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size);
74 return StreamBufferUpload(staging_buffer.data(), size, alignment);
75 }
76 } 85 }
77 } 86 }
78 } 87 }
79 88
80 auto block = GetBlock(cpu_addr, size); 89 Buffer* const block = GetBlock(cpu_addr, size);
81 auto map = MapAddress(block, gpu_addr, cpu_addr, size); 90 MapInterval* const map = MapAddress(block, gpu_addr, cpu_addr, size);
91 if (!map) {
92 return {GetEmptyBuffer(size), 0};
93 }
82 if (is_written) { 94 if (is_written) {
83 map->MarkAsModified(true, GetModifiedTicks()); 95 map->MarkAsModified(true, GetModifiedTicks());
84 if (Settings::IsGPULevelHigh() && Settings::values.use_asynchronous_gpu_emulation) { 96 if (Settings::IsGPULevelHigh() && Settings::values.use_asynchronous_gpu_emulation) {
85 MarkForAsyncFlush(map); 97 MarkForAsyncFlush(map);
86 } 98 }
87 if (!map->IsWritten()) { 99 if (!map->is_written) {
88 map->MarkAsWritten(true); 100 map->is_written = true;
89 MarkRegionAsWritten(map->GetStart(), map->GetEnd() - 1); 101 MarkRegionAsWritten(map->start, map->end - 1);
90 } 102 }
91 } 103 }
92 104
93 return {ToHandle(block), static_cast<u64>(block->GetOffset(cpu_addr))}; 105 return {block->Handle(), static_cast<u64>(block->Offset(cpu_addr))};
94 } 106 }
95 107
96 /// Uploads from host memory. Returns the buffer where the data is located and its offset. 108 /// Uploads from host memory. Returns the buffer where the data is located and its offset.
97 BufferInfo UploadHostMemory(const void* raw_pointer, std::size_t size, 109 BufferInfo UploadHostMemory(const void* raw_pointer, std::size_t size,
98 std::size_t alignment = 4) { 110 std::size_t alignment = 4) {
99 std::lock_guard lock{mutex}; 111 std::lock_guard lock{mutex};
100 return StreamBufferUpload(raw_pointer, size, alignment); 112 return StreamBufferUpload(size, alignment, [raw_pointer, size](u8* dest) {
113 std::memcpy(dest, raw_pointer, size);
114 });
101 } 115 }
102 116
103 void Map(std::size_t max_size) { 117 /// Prepares the buffer cache for data uploading
118 /// @param max_size Maximum number of bytes that will be uploaded
119 /// @return True when a stream buffer invalidation was required, false otherwise
120 bool Map(std::size_t max_size) {
104 std::lock_guard lock{mutex}; 121 std::lock_guard lock{mutex};
105 122
123 bool invalidated;
106 std::tie(buffer_ptr, buffer_offset_base, invalidated) = stream_buffer->Map(max_size, 4); 124 std::tie(buffer_ptr, buffer_offset_base, invalidated) = stream_buffer->Map(max_size, 4);
107 buffer_offset = buffer_offset_base; 125 buffer_offset = buffer_offset_base;
126
127 return invalidated;
108 } 128 }
109 129
110 /// Finishes the upload stream, returns true on bindings invalidation. 130 /// Finishes the upload stream
111 bool Unmap() { 131 void Unmap() {
112 std::lock_guard lock{mutex}; 132 std::lock_guard lock{mutex};
113
114 stream_buffer->Unmap(buffer_offset - buffer_offset_base); 133 stream_buffer->Unmap(buffer_offset - buffer_offset_base);
115 return std::exchange(invalidated, false);
116 } 134 }
117 135
136 /// Function called at the end of each frame, intended for deferred operations
118 void TickFrame() { 137 void TickFrame() {
119 ++epoch; 138 ++epoch;
139
120 while (!pending_destruction.empty()) { 140 while (!pending_destruction.empty()) {
121 // Delay at least 4 frames before destruction. 141 // Delay at least 4 frames before destruction.
122 // This is due to triple buffering happening on some drivers. 142 // This is due to triple buffering happening on some drivers.
123 static constexpr u64 epochs_to_destroy = 5; 143 static constexpr u64 epochs_to_destroy = 5;
124 if (pending_destruction.front()->GetEpoch() + epochs_to_destroy > epoch) { 144 if (pending_destruction.front()->Epoch() + epochs_to_destroy > epoch) {
125 break; 145 break;
126 } 146 }
127 pending_destruction.pop_front(); 147 pending_destruction.pop();
128 } 148 }
129 } 149 }
130 150
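
The TickFrame change pairs with QueueDestruction later in this header: a retiring buffer is stamped with the current epoch and pushed onto a FIFO, and it is only dropped once enough frames have elapsed that no in-flight driver work can still reference it. A minimal sketch of the same fencing pattern, with illustrative names rather than the cache's own types:

    #include <cstdint>
    #include <memory>
    #include <queue>

    struct Resource {
        uint64_t epoch = 0; // frame counter recorded when the resource retired
    };

    class DeferredDeleter {
    public:
        // Stamp the resource with the current epoch and queue it (QueueDestruction).
        void Queue(std::shared_ptr<Resource> resource, uint64_t current_epoch) {
            resource->epoch = current_epoch;
            pending.push(std::move(resource));
        }

        // Called once per frame (TickFrame): destroy entries at least `delay`
        // epochs old; FIFO order guarantees the front is always the oldest.
        void TickFrame(uint64_t current_epoch, uint64_t delay = 5) {
            while (!pending.empty() && pending.front()->epoch + delay <= current_epoch) {
                pending.pop(); // the last shared_ptr reference dies here
            }
        }

    private:
        std::queue<std::shared_ptr<Resource>> pending;
    };
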
@@ -132,12 +152,11 @@ public:
132 void FlushRegion(VAddr addr, std::size_t size) { 152 void FlushRegion(VAddr addr, std::size_t size) {
133 std::lock_guard lock{mutex}; 153 std::lock_guard lock{mutex};
134 154
135 std::vector<MapInterval> objects = GetMapsInRange(addr, size); 155 VectorMapInterval objects = GetMapsInRange(addr, size);
136 std::sort(objects.begin(), objects.end(), [](const MapInterval& a, const MapInterval& b) { 156 std::sort(objects.begin(), objects.end(),
137 return a->GetModificationTick() < b->GetModificationTick(); 157 [](MapInterval* lhs, MapInterval* rhs) { return lhs->ticks < rhs->ticks; });
138 }); 158 for (MapInterval* object : objects) {
139 for (auto& object : objects) { 159 if (object->is_modified && object->is_registered) {
140 if (object->IsModified() && object->IsRegistered()) {
141 mutex.unlock(); 160 mutex.unlock();
142 FlushMap(object); 161 FlushMap(object);
143 mutex.lock(); 162 mutex.lock();
@@ -148,9 +167,9 @@ public:
148 bool MustFlushRegion(VAddr addr, std::size_t size) { 167 bool MustFlushRegion(VAddr addr, std::size_t size) {
149 std::lock_guard lock{mutex}; 168 std::lock_guard lock{mutex};
150 169
151 const std::vector<MapInterval> objects = GetMapsInRange(addr, size); 170 const VectorMapInterval objects = GetMapsInRange(addr, size);
152 return std::any_of(objects.cbegin(), objects.cend(), [](const MapInterval& map) { 171 return std::any_of(objects.cbegin(), objects.cend(), [](const MapInterval* map) {
153 return map->IsModified() && map->IsRegistered(); 172 return map->is_modified && map->is_registered;
154 }); 173 });
155 } 174 }
156 175
@@ -158,9 +177,8 @@ public:
158 void InvalidateRegion(VAddr addr, u64 size) { 177 void InvalidateRegion(VAddr addr, u64 size) {
159 std::lock_guard lock{mutex}; 178 std::lock_guard lock{mutex};
160 179
161 std::vector<MapInterval> objects = GetMapsInRange(addr, size); 180 for (auto& object : GetMapsInRange(addr, size)) {
162 for (auto& object : objects) { 181 if (object->is_registered) {
163 if (object->IsRegistered()) {
164 Unregister(object); 182 Unregister(object);
165 } 183 }
166 } 184 }
@@ -169,10 +187,10 @@ public:
169 void OnCPUWrite(VAddr addr, std::size_t size) { 187 void OnCPUWrite(VAddr addr, std::size_t size) {
170 std::lock_guard lock{mutex}; 188 std::lock_guard lock{mutex};
171 189
172 for (const auto& object : GetMapsInRange(addr, size)) { 190 for (MapInterval* object : GetMapsInRange(addr, size)) {
173 if (object->IsMemoryMarked() && object->IsRegistered()) { 191 if (object->is_memory_marked && object->is_registered) {
174 UnmarkMemory(object); 192 UnmarkMemory(object);
175 object->SetSyncPending(true); 193 object->is_sync_pending = true;
176 marked_for_unregister.emplace_back(object); 194 marked_for_unregister.emplace_back(object);
177 } 195 }
178 } 196 }
@@ -181,9 +199,9 @@ public:
181 void SyncGuestHost() { 199 void SyncGuestHost() {
182 std::lock_guard lock{mutex}; 200 std::lock_guard lock{mutex};
183 201
184 for (const auto& object : marked_for_unregister) { 202 for (auto& object : marked_for_unregister) {
185 if (object->IsRegistered()) { 203 if (object->is_registered) {
186 object->SetSyncPending(false); 204 object->is_sync_pending = false;
187 Unregister(object); 205 Unregister(object);
188 } 206 }
189 } 207 }
@@ -192,9 +210,9 @@ public:
192 210
193 void CommitAsyncFlushes() { 211 void CommitAsyncFlushes() {
194 if (uncommitted_flushes) { 212 if (uncommitted_flushes) {
195 auto commit_list = std::make_shared<std::list<MapInterval>>(); 213 auto commit_list = std::make_shared<std::list<MapInterval*>>();
196 for (auto& map : *uncommitted_flushes) { 214 for (MapInterval* map : *uncommitted_flushes) {
197 if (map->IsRegistered() && map->IsModified()) { 215 if (map->is_registered && map->is_modified) {
198 // TODO(Blinkhawk): Implement backend asynchronous flushing 216 // TODO(Blinkhawk): Implement backend asynchronous flushing
199 // AsyncFlushMap(map) 217 // AsyncFlushMap(map)
200 commit_list->push_back(map); 218 commit_list->push_back(map);
@@ -228,8 +246,8 @@ public:
228 committed_flushes.pop_front(); 246 committed_flushes.pop_front();
229 return; 247 return;
230 } 248 }
231 for (MapInterval& map : *flush_list) { 249 for (MapInterval* map : *flush_list) {
232 if (map->IsRegistered()) { 250 if (map->is_registered) {
233 // TODO(Blinkhawk): Replace this with reading the asynchronous flush 251 // TODO(Blinkhawk): Replace this with reading the asynchronous flush
234 FlushMap(map); 252 FlushMap(map);
235 } 253 }
@@ -241,23 +259,21 @@ public:
241 259
242protected: 260protected:
243 explicit BufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system, 261 explicit BufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system,
244 std::unique_ptr<StreamBuffer> stream_buffer) 262 std::unique_ptr<StreamBuffer> stream_buffer_)
245 : rasterizer{rasterizer}, system{system}, stream_buffer{std::move(stream_buffer)}, 263 : rasterizer{rasterizer}, system{system}, stream_buffer{std::move(stream_buffer_)},
246 stream_buffer_handle{this->stream_buffer->GetHandle()} {} 264 stream_buffer_handle{stream_buffer->Handle()} {}
247 265
248 ~BufferCache() = default; 266 ~BufferCache() = default;
249 267
250 virtual BufferType ToHandle(const OwnerBuffer& storage) = 0; 268 virtual std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) = 0;
251
252 virtual OwnerBuffer CreateBlock(VAddr cpu_addr, std::size_t size) = 0;
253 269
254 virtual void UploadBlockData(const OwnerBuffer& buffer, std::size_t offset, std::size_t size, 270 virtual void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
255 const u8* data) = 0; 271 const u8* data) = 0;
256 272
257 virtual void DownloadBlockData(const OwnerBuffer& buffer, std::size_t offset, std::size_t size, 273 virtual void DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
258 u8* data) = 0; 274 u8* data) = 0;
259 275
260 virtual void CopyBlock(const OwnerBuffer& src, const OwnerBuffer& dst, std::size_t src_offset, 276 virtual void CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
261 std::size_t dst_offset, std::size_t size) = 0; 277 std::size_t dst_offset, std::size_t size) = 0;
262 278
263 virtual BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) { 279 virtual BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) {
@@ -265,76 +281,74 @@ protected:
265 } 281 }
266 282
267 /// Register an object into the cache 283 /// Register an object into the cache
268 void Register(const MapInterval& new_map, bool inherit_written = false) { 284 MapInterval* Register(MapInterval new_map, bool inherit_written = false) {
269 const VAddr cpu_addr = new_map->GetStart(); 285 const VAddr cpu_addr = new_map.start;
270 if (!cpu_addr) { 286 if (!cpu_addr) {
271 LOG_CRITICAL(HW_GPU, "Failed to register buffer with unmapped gpu_address 0x{:016x}", 287 LOG_CRITICAL(HW_GPU, "Failed to register buffer with unmapped gpu_address 0x{:016x}",
272 new_map->GetGpuAddress()); 288 new_map.gpu_addr);
273 return; 289 return nullptr;
274 } 290 }
275 const std::size_t size = new_map->GetEnd() - new_map->GetStart(); 291 const std::size_t size = new_map.end - new_map.start;
276 new_map->MarkAsRegistered(true); 292 new_map.is_registered = true;
277 const IntervalType interval{new_map->GetStart(), new_map->GetEnd()};
278 mapped_addresses.insert({interval, new_map});
279 rasterizer.UpdatePagesCachedCount(cpu_addr, size, 1); 293 rasterizer.UpdatePagesCachedCount(cpu_addr, size, 1);
280 new_map->SetMemoryMarked(true); 294 new_map.is_memory_marked = true;
281 if (inherit_written) { 295 if (inherit_written) {
282 MarkRegionAsWritten(new_map->GetStart(), new_map->GetEnd() - 1); 296 MarkRegionAsWritten(new_map.start, new_map.end - 1);
283 new_map->MarkAsWritten(true); 297 new_map.is_written = true;
284 } 298 }
299 MapInterval* const storage = mapped_addresses_allocator.Allocate();
300 *storage = new_map;
301 mapped_addresses.insert(*storage);
302 return storage;
285 } 303 }
286 304
287 void UnmarkMemory(const MapInterval& map) { 305 void UnmarkMemory(MapInterval* map) {
288 if (!map->IsMemoryMarked()) { 306 if (!map->is_memory_marked) {
289 return; 307 return;
290 } 308 }
291 const std::size_t size = map->GetEnd() - map->GetStart(); 309 const std::size_t size = map->end - map->start;
292 rasterizer.UpdatePagesCachedCount(map->GetStart(), size, -1); 310 rasterizer.UpdatePagesCachedCount(map->start, size, -1);
293 map->SetMemoryMarked(false); 311 map->is_memory_marked = false;
294 } 312 }
295 313
296 /// Unregisters an object from the cache 314 /// Unregisters an object from the cache
297 void Unregister(const MapInterval& map) { 315 void Unregister(MapInterval* map) {
298 UnmarkMemory(map); 316 UnmarkMemory(map);
299 map->MarkAsRegistered(false); 317 map->is_registered = false;
300 if (map->IsSyncPending()) { 318 if (map->is_sync_pending) {
319 map->is_sync_pending = false;
301 marked_for_unregister.remove(map); 320 marked_for_unregister.remove(map);
302 map->SetSyncPending(false);
303 } 321 }
304 if (map->IsWritten()) { 322 if (map->is_written) {
305 UnmarkRegionAsWritten(map->GetStart(), map->GetEnd() - 1); 323 UnmarkRegionAsWritten(map->start, map->end - 1);
306 } 324 }
307 const IntervalType delete_interval{map->GetStart(), map->GetEnd()}; 325 const auto it = mapped_addresses.find(*map);
308 mapped_addresses.erase(delete_interval); 326 ASSERT(it != mapped_addresses.end());
327 mapped_addresses.erase(it);
328 mapped_addresses_allocator.Release(map);
309 } 329 }
310 330
311private: 331private:
312 MapInterval CreateMap(const VAddr start, const VAddr end, const GPUVAddr gpu_addr) { 332 MapInterval* MapAddress(const Buffer* block, GPUVAddr gpu_addr, VAddr cpu_addr,
313 return std::make_shared<MapIntervalBase>(start, end, gpu_addr); 333 std::size_t size) {
314 } 334 const VectorMapInterval overlaps = GetMapsInRange(cpu_addr, size);
315
316 MapInterval MapAddress(const OwnerBuffer& block, const GPUVAddr gpu_addr, const VAddr cpu_addr,
317 const std::size_t size) {
318 std::vector<MapInterval> overlaps = GetMapsInRange(cpu_addr, size);
319 if (overlaps.empty()) { 335 if (overlaps.empty()) {
320 auto& memory_manager = system.GPU().MemoryManager(); 336 auto& memory_manager = system.GPU().MemoryManager();
321 const VAddr cpu_addr_end = cpu_addr + size; 337 const VAddr cpu_addr_end = cpu_addr + size;
322 MapInterval new_map = CreateMap(cpu_addr, cpu_addr_end, gpu_addr);
323 if (memory_manager.IsGranularRange(gpu_addr, size)) { 338 if (memory_manager.IsGranularRange(gpu_addr, size)) {
324 u8* host_ptr = memory_manager.GetPointer(gpu_addr); 339 u8* host_ptr = memory_manager.GetPointer(gpu_addr);
325 UploadBlockData(block, block->GetOffset(cpu_addr), size, host_ptr); 340 UploadBlockData(*block, block->Offset(cpu_addr), size, host_ptr);
326 } else { 341 } else {
327 staging_buffer.resize(size); 342 staging_buffer.resize(size);
328 memory_manager.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size); 343 memory_manager.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size);
329 UploadBlockData(block, block->GetOffset(cpu_addr), size, staging_buffer.data()); 344 UploadBlockData(*block, block->Offset(cpu_addr), size, staging_buffer.data());
330 } 345 }
331 Register(new_map); 346 return Register(MapInterval(cpu_addr, cpu_addr_end, gpu_addr));
332 return new_map;
333 } 347 }
334 348
335 const VAddr cpu_addr_end = cpu_addr + size; 349 const VAddr cpu_addr_end = cpu_addr + size;
336 if (overlaps.size() == 1) { 350 if (overlaps.size() == 1) {
337 MapInterval& current_map = overlaps[0]; 351 MapInterval* const current_map = overlaps[0];
338 if (current_map->IsInside(cpu_addr, cpu_addr_end)) { 352 if (current_map->IsInside(cpu_addr, cpu_addr_end)) {
339 return current_map; 353 return current_map;
340 } 354 }
@@ -344,60 +358,70 @@ private:
344 bool write_inheritance = false; 358 bool write_inheritance = false;
345 bool modified_inheritance = false; 359 bool modified_inheritance = false;
346 // Calculate new buffer parameters 360 // Calculate new buffer parameters
347 for (auto& overlap : overlaps) { 361 for (MapInterval* overlap : overlaps) {
348 new_start = std::min(overlap->GetStart(), new_start); 362 new_start = std::min(overlap->start, new_start);
349 new_end = std::max(overlap->GetEnd(), new_end); 363 new_end = std::max(overlap->end, new_end);
350 write_inheritance |= overlap->IsWritten(); 364 write_inheritance |= overlap->is_written;
351 modified_inheritance |= overlap->IsModified(); 365 modified_inheritance |= overlap->is_modified;
352 } 366 }
353 GPUVAddr new_gpu_addr = gpu_addr + new_start - cpu_addr; 367 GPUVAddr new_gpu_addr = gpu_addr + new_start - cpu_addr;
354 for (auto& overlap : overlaps) { 368 for (auto& overlap : overlaps) {
355 Unregister(overlap); 369 Unregister(overlap);
356 } 370 }
357 UpdateBlock(block, new_start, new_end, overlaps); 371 UpdateBlock(block, new_start, new_end, overlaps);
358 MapInterval new_map = CreateMap(new_start, new_end, new_gpu_addr); 372
373 const MapInterval new_map{new_start, new_end, new_gpu_addr};
374 MapInterval* const map = Register(new_map, write_inheritance);
375 if (!map) {
376 return nullptr;
377 }
359 if (modified_inheritance) { 378 if (modified_inheritance) {
360 new_map->MarkAsModified(true, GetModifiedTicks()); 379 map->MarkAsModified(true, GetModifiedTicks());
361 if (Settings::IsGPULevelHigh() && Settings::values.use_asynchronous_gpu_emulation) { 380 if (Settings::IsGPULevelHigh() && Settings::values.use_asynchronous_gpu_emulation) {
362 MarkForAsyncFlush(new_map); 381 MarkForAsyncFlush(map);
363 } 382 }
364 } 383 }
365 Register(new_map, write_inheritance); 384 return map;
366 return new_map;
367 } 385 }
368 386
369 void UpdateBlock(const OwnerBuffer& block, VAddr start, VAddr end, 387 void UpdateBlock(const Buffer* block, VAddr start, VAddr end,
370 std::vector<MapInterval>& overlaps) { 388 const VectorMapInterval& overlaps) {
371 const IntervalType base_interval{start, end}; 389 const IntervalType base_interval{start, end};
372 IntervalSet interval_set{}; 390 IntervalSet interval_set{};
373 interval_set.add(base_interval); 391 interval_set.add(base_interval);
374 for (auto& overlap : overlaps) { 392 for (auto& overlap : overlaps) {
375 const IntervalType subtract{overlap->GetStart(), overlap->GetEnd()}; 393 const IntervalType subtract{overlap->start, overlap->end};
376 interval_set.subtract(subtract); 394 interval_set.subtract(subtract);
377 } 395 }
378 for (auto& interval : interval_set) { 396 for (auto& interval : interval_set) {
379 std::size_t size = interval.upper() - interval.lower(); 397 const std::size_t size = interval.upper() - interval.lower();
380 if (size > 0) { 398 if (size == 0) {
381 staging_buffer.resize(size); 399 continue;
382 system.Memory().ReadBlockUnsafe(interval.lower(), staging_buffer.data(), size);
383 UploadBlockData(block, block->GetOffset(interval.lower()), size,
384 staging_buffer.data());
385 } 400 }
401 staging_buffer.resize(size);
402 system.Memory().ReadBlockUnsafe(interval.lower(), staging_buffer.data(), size);
403 UploadBlockData(*block, block->Offset(interval.lower()), size, staging_buffer.data());
386 } 404 }
387 } 405 }
388 406
389 std::vector<MapInterval> GetMapsInRange(VAddr addr, std::size_t size) { 407 VectorMapInterval GetMapsInRange(VAddr addr, std::size_t size) {
408 VectorMapInterval result;
390 if (size == 0) { 409 if (size == 0) {
391 return {}; 410 return result;
392 } 411 }
393 412
394 std::vector<MapInterval> objects{}; 413 const VAddr addr_end = addr + size;
395 const IntervalType interval{addr, addr + size}; 414 auto it = mapped_addresses.lower_bound(addr);
396 for (auto& pair : boost::make_iterator_range(mapped_addresses.equal_range(interval))) { 415 if (it != mapped_addresses.begin()) {
397 objects.push_back(pair.second); 416 --it;
398 } 417 }
399 418 while (it != mapped_addresses.end() && it->start < addr_end) {
400 return objects; 419 if (it->Overlaps(addr, addr_end)) {
420 result.push_back(&*it);
421 }
422 ++it;
423 }
424 return result;
401 } 425 }
402 426
403 /// Returns a ticks counter used for tracking when cached objects were last modified 427 /// Returns a ticks counter used for tracking when cached objects were last modified
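
GetMapsInRange replaces the ICL interval-map query with a direct walk of the ordered set: lower_bound(addr) finds the first interval starting at or after addr, stepping back one element catches a predecessor that begins before addr but reaches into the range, and the forward scan stops once start reaches addr_end. Stepping back a single element suffices only because registered maps never overlap each other (MapAddress unregisters and merges overlaps before registering). A sketch of the same query over a plain std::map keyed by interval start, assuming that same non-overlap invariant:

    #include <cstdint>
    #include <map>
    #include <vector>

    struct Interval {
        uint64_t start, end; // half-open [start, end)
    };

    std::vector<const Interval*> FindOverlaps(const std::map<uint64_t, Interval>& intervals,
                                              uint64_t addr, uint64_t size) {
        std::vector<const Interval*> result;
        const uint64_t addr_end = addr + size;
        auto it = intervals.lower_bound(addr);
        if (it != intervals.begin()) {
            --it; // one predecessor may start before addr yet extend into the range
        }
        for (; it != intervals.end() && it->second.start < addr_end; ++it) {
            const Interval& interval = it->second;
            if (interval.start < addr_end && addr < interval.end) { // Overlaps()
                result.push_back(&interval);
            }
        }
        return result;
    }
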
@@ -405,20 +429,24 @@ private:
405 return ++modified_ticks; 429 return ++modified_ticks;
406 } 430 }
407 431
408 void FlushMap(MapInterval map) { 432 void FlushMap(MapInterval* map) {
409 std::size_t size = map->GetEnd() - map->GetStart(); 433 const auto it = blocks.find(map->start >> BLOCK_PAGE_BITS);
410 OwnerBuffer block = blocks[map->GetStart() >> block_page_bits]; 434 ASSERT_OR_EXECUTE(it != blocks.end(), return;);
435
436 std::shared_ptr<Buffer> block = it->second;
437
438 const std::size_t size = map->end - map->start;
411 staging_buffer.resize(size); 439 staging_buffer.resize(size);
412 DownloadBlockData(block, block->GetOffset(map->GetStart()), size, staging_buffer.data()); 440 DownloadBlockData(*block, block->Offset(map->start), size, staging_buffer.data());
413 system.Memory().WriteBlockUnsafe(map->GetStart(), staging_buffer.data(), size); 441 system.Memory().WriteBlockUnsafe(map->start, staging_buffer.data(), size);
414 map->MarkAsModified(false, 0); 442 map->MarkAsModified(false, 0);
415 } 443 }
416 444
417 BufferInfo StreamBufferUpload(const void* raw_pointer, std::size_t size, 445 template <typename Callable>
418 std::size_t alignment) { 446 BufferInfo StreamBufferUpload(std::size_t size, std::size_t alignment, Callable&& callable) {
419 AlignBuffer(alignment); 447 AlignBuffer(alignment);
420 const std::size_t uploaded_offset = buffer_offset; 448 const std::size_t uploaded_offset = buffer_offset;
421 std::memcpy(buffer_ptr, raw_pointer, size); 449 callable(buffer_ptr);
422 450
423 buffer_ptr += size; 451 buffer_ptr += size;
424 buffer_offset += size; 452 buffer_offset += size;
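
StreamBufferUpload now receives a writer callable instead of a source pointer, so each call site decides how the mapped destination is filled: a memcpy from an existing host pointer when the range is granular, or ReadBlockUnsafe directly into the stream buffer otherwise, eliminating the staging-buffer round trip. A reduced sketch of the pattern, with illustrative names:

    #include <cstddef>
    #include <cstdint>
    #include <cstring>
    #include <utility>

    // Hands the mapped destination to a caller-supplied writer, so the caller
    // can memcpy, read guest memory in place, or generate data directly.
    template <typename Writer>
    uint64_t StreamUpload(uint8_t*& cursor, uint64_t& offset, std::size_t size,
                          Writer&& writer) {
        const uint64_t uploaded_offset = offset;
        std::forward<Writer>(writer)(cursor); // fill `size` bytes at the cursor
        cursor += size;
        offset += size;
        return uploaded_offset;
    }

    // Usage, mirroring the two call sites above:
    //   StreamUpload(ptr, off, n, [src, n](uint8_t* dst) { std::memcpy(dst, src, n); });
    //   StreamUpload(ptr, off, n, [&mm, addr, n](uint8_t* dst) { mm.ReadBlockUnsafe(addr, dst, n); });
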
@@ -432,97 +460,89 @@ private:
432 buffer_offset = offset_aligned; 460 buffer_offset = offset_aligned;
433 } 461 }
434 462
435 OwnerBuffer EnlargeBlock(OwnerBuffer buffer) { 463 std::shared_ptr<Buffer> EnlargeBlock(std::shared_ptr<Buffer> buffer) {
436 const std::size_t old_size = buffer->GetSize(); 464 const std::size_t old_size = buffer->Size();
437 const std::size_t new_size = old_size + block_page_size; 465 const std::size_t new_size = old_size + BLOCK_PAGE_SIZE;
438 const VAddr cpu_addr = buffer->GetCpuAddr(); 466 const VAddr cpu_addr = buffer->CpuAddr();
439 OwnerBuffer new_buffer = CreateBlock(cpu_addr, new_size); 467 std::shared_ptr<Buffer> new_buffer = CreateBlock(cpu_addr, new_size);
440 CopyBlock(buffer, new_buffer, 0, 0, old_size); 468 CopyBlock(*buffer, *new_buffer, 0, 0, old_size);
441 buffer->SetEpoch(epoch); 469 QueueDestruction(std::move(buffer));
442 pending_destruction.push_back(buffer); 470
443 const VAddr cpu_addr_end = cpu_addr + new_size - 1; 471 const VAddr cpu_addr_end = cpu_addr + new_size - 1;
444 u64 page_start = cpu_addr >> block_page_bits; 472 const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS;
445 const u64 page_end = cpu_addr_end >> block_page_bits; 473 for (u64 page_start = cpu_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) {
446 while (page_start <= page_end) { 474 blocks.insert_or_assign(page_start, new_buffer);
447 blocks[page_start] = new_buffer;
448 ++page_start;
449 } 475 }
476
450 return new_buffer; 477 return new_buffer;
451 } 478 }
452 479
453 OwnerBuffer MergeBlocks(OwnerBuffer first, OwnerBuffer second) { 480 std::shared_ptr<Buffer> MergeBlocks(std::shared_ptr<Buffer> first,
454 const std::size_t size_1 = first->GetSize(); 481 std::shared_ptr<Buffer> second) {
455 const std::size_t size_2 = second->GetSize(); 482 const std::size_t size_1 = first->Size();
456 const VAddr first_addr = first->GetCpuAddr(); 483 const std::size_t size_2 = second->Size();
457 const VAddr second_addr = second->GetCpuAddr(); 484 const VAddr first_addr = first->CpuAddr();
485 const VAddr second_addr = second->CpuAddr();
458 const VAddr new_addr = std::min(first_addr, second_addr); 486 const VAddr new_addr = std::min(first_addr, second_addr);
459 const std::size_t new_size = size_1 + size_2; 487 const std::size_t new_size = size_1 + size_2;
460 OwnerBuffer new_buffer = CreateBlock(new_addr, new_size); 488
461 CopyBlock(first, new_buffer, 0, new_buffer->GetOffset(first_addr), size_1); 489 std::shared_ptr<Buffer> new_buffer = CreateBlock(new_addr, new_size);
462 CopyBlock(second, new_buffer, 0, new_buffer->GetOffset(second_addr), size_2); 490 CopyBlock(*first, *new_buffer, 0, new_buffer->Offset(first_addr), size_1);
463 first->SetEpoch(epoch); 491 CopyBlock(*second, *new_buffer, 0, new_buffer->Offset(second_addr), size_2);
464 second->SetEpoch(epoch); 492 QueueDestruction(std::move(first));
465 pending_destruction.push_back(first); 493 QueueDestruction(std::move(second));
466 pending_destruction.push_back(second); 494
467 const VAddr cpu_addr_end = new_addr + new_size - 1; 495 const VAddr cpu_addr_end = new_addr + new_size - 1;
468 u64 page_start = new_addr >> block_page_bits; 496 const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS;
469 const u64 page_end = cpu_addr_end >> block_page_bits; 497 for (u64 page_start = new_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) {
470 while (page_start <= page_end) { 498 blocks.insert_or_assign(page_start, new_buffer);
471 blocks[page_start] = new_buffer;
472 ++page_start;
473 } 499 }
474 return new_buffer; 500 return new_buffer;
475 } 501 }
476 502
477 OwnerBuffer GetBlock(const VAddr cpu_addr, const std::size_t size) { 503 Buffer* GetBlock(VAddr cpu_addr, std::size_t size) {
478 OwnerBuffer found; 504 std::shared_ptr<Buffer> found;
505
479 const VAddr cpu_addr_end = cpu_addr + size - 1; 506 const VAddr cpu_addr_end = cpu_addr + size - 1;
480 u64 page_start = cpu_addr >> block_page_bits; 507 const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS;
481 const u64 page_end = cpu_addr_end >> block_page_bits; 508 for (u64 page_start = cpu_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) {
482 while (page_start <= page_end) {
483 auto it = blocks.find(page_start); 509 auto it = blocks.find(page_start);
484 if (it == blocks.end()) { 510 if (it == blocks.end()) {
485 if (found) { 511 if (found) {
486 found = EnlargeBlock(found); 512 found = EnlargeBlock(found);
487 } else { 513 continue;
488 const VAddr start_addr = (page_start << block_page_bits);
489 found = CreateBlock(start_addr, block_page_size);
490 blocks[page_start] = found;
491 }
492 } else {
493 if (found) {
494 if (found == it->second) {
495 ++page_start;
496 continue;
497 }
498 found = MergeBlocks(found, it->second);
499 } else {
500 found = it->second;
501 } 514 }
515 const VAddr start_addr = page_start << BLOCK_PAGE_BITS;
516 found = CreateBlock(start_addr, BLOCK_PAGE_SIZE);
517 blocks.insert_or_assign(page_start, found);
518 continue;
519 }
520 if (!found) {
521 found = it->second;
522 continue;
523 }
524 if (found != it->second) {
525 found = MergeBlocks(std::move(found), it->second);
502 } 526 }
503 ++page_start;
504 } 527 }
505 return found; 528 return found.get();
506 } 529 }
507 530
508 void MarkRegionAsWritten(const VAddr start, const VAddr end) { 531 void MarkRegionAsWritten(VAddr start, VAddr end) {
509 u64 page_start = start >> write_page_bit; 532 const u64 page_end = end >> WRITE_PAGE_BIT;
510 const u64 page_end = end >> write_page_bit; 533 for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) {
511 while (page_start <= page_end) {
512 auto it = written_pages.find(page_start); 534 auto it = written_pages.find(page_start);
513 if (it != written_pages.end()) { 535 if (it != written_pages.end()) {
514 it->second = it->second + 1; 536 it->second = it->second + 1;
515 } else { 537 } else {
516 written_pages[page_start] = 1; 538 written_pages.insert_or_assign(page_start, 1);
517 } 539 }
518 page_start++;
519 } 540 }
520 } 541 }
521 542
522 void UnmarkRegionAsWritten(const VAddr start, const VAddr end) { 543 void UnmarkRegionAsWritten(VAddr start, VAddr end) {
523 u64 page_start = start >> write_page_bit; 544 const u64 page_end = end >> WRITE_PAGE_BIT;
524 const u64 page_end = end >> write_page_bit; 545 for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) {
525 while (page_start <= page_end) {
526 auto it = written_pages.find(page_start); 546 auto it = written_pages.find(page_start);
527 if (it != written_pages.end()) { 547 if (it != written_pages.end()) {
528 if (it->second > 1) { 548 if (it->second > 1) {
@@ -531,25 +551,27 @@ private:
531 written_pages.erase(it); 551 written_pages.erase(it);
532 } 552 }
533 } 553 }
534 page_start++;
535 } 554 }
536 } 555 }
537 556
538 bool IsRegionWritten(const VAddr start, const VAddr end) const { 557 bool IsRegionWritten(VAddr start, VAddr end) const {
539 u64 page_start = start >> write_page_bit; 558 const u64 page_end = end >> WRITE_PAGE_BIT;
540 const u64 page_end = end >> write_page_bit; 559 for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) {
541 while (page_start <= page_end) {
542 if (written_pages.count(page_start) > 0) { 560 if (written_pages.count(page_start) > 0) {
543 return true; 561 return true;
544 } 562 }
545 page_start++;
546 } 563 }
547 return false; 564 return false;
548 } 565 }
549 566
550 void MarkForAsyncFlush(MapInterval& map) { 567 void QueueDestruction(std::shared_ptr<Buffer> buffer) {
568 buffer->SetEpoch(epoch);
569 pending_destruction.push(std::move(buffer));
570 }
571
572 void MarkForAsyncFlush(MapInterval* map) {
551 if (!uncommitted_flushes) { 573 if (!uncommitted_flushes) {
552 uncommitted_flushes = std::make_shared<std::unordered_set<MapInterval>>(); 574 uncommitted_flushes = std::make_shared<std::unordered_set<MapInterval*>>();
553 } 575 }
554 uncommitted_flushes->insert(map); 576 uncommitted_flushes->insert(map);
555 } 577 }
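
The written-page helpers above keep a reference count per 2 KiB page (WRITE_PAGE_BIT = 11), so nested writable maps over the same page stay marked until every one of them is unregistered. A standalone sketch of the same bookkeeping, assuming inclusive [start, end] ranges as in the cache:

    #include <cstdint>
    #include <unordered_map>

    constexpr uint64_t WRITE_PAGE_BIT = 11; // 2 KiB pages

    class WrittenPages {
    public:
        void Mark(uint64_t start, uint64_t end) {
            for (uint64_t page = start >> WRITE_PAGE_BIT; page <= (end >> WRITE_PAGE_BIT); ++page) {
                ++pages[page]; // operator[] value-initializes new counters to zero
            }
        }

        void Unmark(uint64_t start, uint64_t end) {
            for (uint64_t page = start >> WRITE_PAGE_BIT; page <= (end >> WRITE_PAGE_BIT); ++page) {
                const auto it = pages.find(page);
                if (it != pages.end() && --it->second == 0) {
                    pages.erase(it);
                }
            }
        }

        bool IsWritten(uint64_t start, uint64_t end) const {
            for (uint64_t page = start >> WRITE_PAGE_BIT; page <= (end >> WRITE_PAGE_BIT); ++page) {
                if (pages.count(page) > 0) {
                    return true;
                }
            }
            return false;
        }

    private:
        std::unordered_map<uint64_t, uint32_t> pages;
    };
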
@@ -558,35 +580,29 @@ private:
558 Core::System& system; 580 Core::System& system;
559 581
560 std::unique_ptr<StreamBuffer> stream_buffer; 582 std::unique_ptr<StreamBuffer> stream_buffer;
561 BufferType stream_buffer_handle{}; 583 BufferType stream_buffer_handle;
562
563 bool invalidated = false;
564 584
565 u8* buffer_ptr = nullptr; 585 u8* buffer_ptr = nullptr;
566 u64 buffer_offset = 0; 586 u64 buffer_offset = 0;
567 u64 buffer_offset_base = 0; 587 u64 buffer_offset_base = 0;
568 588
569 using IntervalSet = boost::icl::interval_set<VAddr>; 589 MapIntervalAllocator mapped_addresses_allocator;
570 using IntervalCache = boost::icl::interval_map<VAddr, MapInterval>; 590 boost::intrusive::set<MapInterval, boost::intrusive::compare<MapIntervalCompare>>
571 using IntervalType = typename IntervalCache::interval_type; 591 mapped_addresses;
572 IntervalCache mapped_addresses;
573 592
574 static constexpr u64 write_page_bit = 11;
575 std::unordered_map<u64, u32> written_pages; 593 std::unordered_map<u64, u32> written_pages;
594 std::unordered_map<u64, std::shared_ptr<Buffer>> blocks;
576 595
577 static constexpr u64 block_page_bits = 21; 596 std::queue<std::shared_ptr<Buffer>> pending_destruction;
578 static constexpr u64 block_page_size = 1ULL << block_page_bits;
579 std::unordered_map<u64, OwnerBuffer> blocks;
580
581 std::list<OwnerBuffer> pending_destruction;
582 u64 epoch = 0; 597 u64 epoch = 0;
583 u64 modified_ticks = 0; 598 u64 modified_ticks = 0;
584 599
585 std::vector<u8> staging_buffer; 600 std::vector<u8> staging_buffer;
586 std::list<MapInterval> marked_for_unregister;
587 601
588 std::shared_ptr<std::unordered_set<MapInterval>> uncommitted_flushes{}; 602 std::list<MapInterval*> marked_for_unregister;
589 std::list<std::shared_ptr<std::list<MapInterval>>> committed_flushes; 603
604 std::shared_ptr<std::unordered_set<MapInterval*>> uncommitted_flushes;
605 std::list<std::shared_ptr<std::list<MapInterval*>>> committed_flushes;
590 606
591 std::recursive_mutex mutex; 607 std::recursive_mutex mutex;
592}; 608};
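
Summing up the block management in this header: blocks is a page table at 2 MiB granularity (BLOCK_PAGE_BITS = 21) mapping each page of a cached range to its backing Buffer, and GetBlock walks a request's pages, creating, enlarging, or merging blocks until one Buffer spans the whole range. A reduced sketch of the page-registration loop shared by EnlargeBlock and MergeBlocks, with a simplified Block type standing in for the backend Buffer:

    #include <cstddef>
    #include <cstdint>
    #include <memory>
    #include <unordered_map>

    constexpr uint64_t BLOCK_PAGE_BITS = 21; // each page covers 2 MiB
    constexpr uint64_t BLOCK_PAGE_SIZE = 1ULL << BLOCK_PAGE_BITS;

    struct Block { // simplified stand-in for the backend Buffer
        uint64_t cpu_addr;
        std::size_t size;
    };

    using BlockMap = std::unordered_map<uint64_t, std::shared_ptr<Block>>;

    // Points every 2 MiB page covered by `block` at it, mirroring the loops
    // at the end of EnlargeBlock and MergeBlocks.
    void RegisterBlockPages(BlockMap& blocks, const std::shared_ptr<Block>& block) {
        const uint64_t end_addr = block->cpu_addr + block->size - 1;
        const uint64_t page_end = end_addr >> BLOCK_PAGE_BITS;
        for (uint64_t page = block->cpu_addr >> BLOCK_PAGE_BITS; page <= page_end; ++page) {
            blocks.insert_or_assign(page, block);
        }
    }
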
diff --git a/src/video_core/buffer_cache/map_interval.cpp b/src/video_core/buffer_cache/map_interval.cpp
new file mode 100644
index 000000000..62587e18a
--- /dev/null
+++ b/src/video_core/buffer_cache/map_interval.cpp
@@ -0,0 +1,33 @@
1// Copyright 2020 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#include <algorithm>
6#include <array>
7#include <cstddef>
8#include <memory>
9
10#include "video_core/buffer_cache/map_interval.h"
11
12namespace VideoCommon {
13
14MapIntervalAllocator::MapIntervalAllocator() {
15 FillFreeList(first_chunk);
16}
17
18MapIntervalAllocator::~MapIntervalAllocator() = default;
19
20void MapIntervalAllocator::AllocateNewChunk() {
21 *new_chunk = std::make_unique<Chunk>();
22 FillFreeList(**new_chunk);
23 new_chunk = &(*new_chunk)->next;
24}
25
26void MapIntervalAllocator::FillFreeList(Chunk& chunk) {
27 const std::size_t old_size = free_list.size();
28 free_list.resize(old_size + chunk.data.size());
29 std::transform(chunk.data.rbegin(), chunk.data.rend(), free_list.begin() + old_size,
30 [](MapInterval& interval) { return &interval; });
31}
32
33} // namespace VideoCommon
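
The new allocator trades per-interval shared_ptr allocations for a chunked pool: MapInterval nodes live in 0x8000-element arrays chained through a unique_ptr list, and Allocate/Release only push and pop raw pointers on a free list. A hypothetical usage sketch (the addresses are placeholders):

    #include "video_core/buffer_cache/map_interval.h"

    void Example() {
        VideoCommon::MapIntervalAllocator pool;

        // Pops a node off the free list; a fresh 0x8000-entry chunk is
        // allocated only when the list runs dry.
        VideoCommon::MapInterval* const map = pool.Allocate();
        *map = VideoCommon::MapInterval(/*start_=*/0x10000, /*end_=*/0x12000,
                                        /*gpu_addr_=*/0x200000);

        // ... insert into the intrusive set, use, then erase from the set ...

        pool.Release(map); // returns the node to the free list; nothing is freed
    }
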
diff --git a/src/video_core/buffer_cache/map_interval.h b/src/video_core/buffer_cache/map_interval.h
index 29d8b26f3..fe0bcd1d8 100644
--- a/src/video_core/buffer_cache/map_interval.h
+++ b/src/video_core/buffer_cache/map_interval.h
@@ -4,104 +4,89 @@
4 4
5#pragma once 5#pragma once
6 6
7#include <array>
8#include <cstddef>
9#include <memory>
10#include <vector>
11
12#include <boost/intrusive/set_hook.hpp>
13
7#include "common/common_types.h" 14#include "common/common_types.h"
8#include "video_core/gpu.h" 15#include "video_core/gpu.h"
9 16
10namespace VideoCommon { 17namespace VideoCommon {
11 18
12class MapIntervalBase { 19struct MapInterval : public boost::intrusive::set_base_hook<boost::intrusive::optimize_size<true>> {
13public: 20 MapInterval() = default;
14 MapIntervalBase(const VAddr start, const VAddr end, const GPUVAddr gpu_addr)
15 : start{start}, end{end}, gpu_addr{gpu_addr} {}
16
17 void SetCpuAddress(VAddr new_cpu_addr) {
18 cpu_addr = new_cpu_addr;
19 }
20
21 VAddr GetCpuAddress() const {
22 return cpu_addr;
23 }
24
25 GPUVAddr GetGpuAddress() const {
26 return gpu_addr;
27 }
28
29 bool IsInside(const VAddr other_start, const VAddr other_end) const {
30 return (start <= other_start && other_end <= end);
31 }
32
33 bool operator==(const MapIntervalBase& rhs) const {
34 return std::tie(start, end) == std::tie(rhs.start, rhs.end);
35 }
36
37 bool operator!=(const MapIntervalBase& rhs) const {
38 return !operator==(rhs);
39 }
40 21
41 void MarkAsRegistered(const bool registered) { 22 /*implicit*/ MapInterval(VAddr start_) noexcept : start{start_} {}
42 is_registered = registered;
43 }
44 23
45 bool IsRegistered() const { 24 explicit MapInterval(VAddr start_, VAddr end_, GPUVAddr gpu_addr_) noexcept
46 return is_registered; 25 : start{start_}, end{end_}, gpu_addr{gpu_addr_} {}
47 }
48 26
49 void SetMemoryMarked(bool is_memory_marked_) { 27 bool IsInside(VAddr other_start, VAddr other_end) const noexcept {
50 is_memory_marked = is_memory_marked_; 28 return start <= other_start && other_end <= end;
51 } 29 }
52 30
53 bool IsMemoryMarked() const { 31 bool Overlaps(VAddr other_start, VAddr other_end) const noexcept {
54 return is_memory_marked; 32 return start < other_end && other_start < end;
55 } 33 }
56 34
57 void SetSyncPending(bool is_sync_pending_) { 35 void MarkAsModified(bool is_modified_, u64 ticks_) noexcept {
58 is_sync_pending = is_sync_pending_; 36 is_modified = is_modified_;
59 } 37 ticks = ticks_;
38 }
39
40 boost::intrusive::set_member_hook<> member_hook_;
41 VAddr start = 0;
42 VAddr end = 0;
43 GPUVAddr gpu_addr = 0;
44 u64 ticks = 0;
45 bool is_written = false;
46 bool is_modified = false;
47 bool is_registered = false;
48 bool is_memory_marked = false;
49 bool is_sync_pending = false;
50};
60 51
61 bool IsSyncPending() const { 52struct MapIntervalCompare {
62 return is_sync_pending; 53 constexpr bool operator()(const MapInterval& lhs, const MapInterval& rhs) const noexcept {
54 return lhs.start < rhs.start;
63 } 55 }
56};
64 57
65 VAddr GetStart() const { 58class MapIntervalAllocator {
66 return start; 59public:
67 } 60 MapIntervalAllocator();
61 ~MapIntervalAllocator();
68 62
69 VAddr GetEnd() const { 63 MapInterval* Allocate() {
70 return end; 64 if (free_list.empty()) {
65 AllocateNewChunk();
66 }
67 MapInterval* const interval = free_list.back();
68 free_list.pop_back();
69 return interval;
71 } 70 }
72 71
73 void MarkAsModified(const bool is_modified_, const u64 tick) { 72 void Release(MapInterval* interval) {
74 is_modified = is_modified_; 73 free_list.push_back(interval);
75 ticks = tick;
76 } 74 }
77 75
78 bool IsModified() const { 76private:
79 return is_modified; 77 struct Chunk {
80 } 78 std::unique_ptr<Chunk> next;
79 std::array<MapInterval, 0x8000> data;
80 };
81 81
82 u64 GetModificationTick() const { 82 void AllocateNewChunk();
83 return ticks;
84 }
85 83
86 void MarkAsWritten(const bool is_written_) { 84 void FillFreeList(Chunk& chunk);
87 is_written = is_written_;
88 }
89 85
90 bool IsWritten() const { 86 std::vector<MapInterval*> free_list;
91 return is_written; 87 std::unique_ptr<Chunk>* new_chunk = &first_chunk.next;
92 }
93 88
94private: 89 Chunk first_chunk;
95 VAddr start;
96 VAddr end;
97 GPUVAddr gpu_addr;
98 VAddr cpu_addr{};
99 bool is_written{};
100 bool is_modified{};
101 bool is_registered{};
102 bool is_memory_marked{};
103 bool is_sync_pending{};
104 u64 ticks{};
105}; 90};
106 91
107} // namespace VideoCommon 92} // namespace VideoCommon
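
MapInterval now derives from an intrusive set hook, so the cache's mapped_addresses set threads its tree links through the nodes themselves: insertion allocates nothing, and &*it stays valid for as long as the node is registered. The implicit MapInterval(VAddr) constructor is what allows lower_bound to be called with a bare address. A minimal sketch of the idiom, assuming only what the header above declares:

    #include <cstdint>
    #include <boost/intrusive/set.hpp>

    struct Node : boost::intrusive::set_base_hook<boost::intrusive::optimize_size<true>> {
        explicit Node(uint64_t start_) : start{start_} {}
        uint64_t start;
    };

    struct NodeCompare {
        bool operator()(const Node& lhs, const Node& rhs) const {
            return lhs.start < rhs.start;
        }
    };

    int main() {
        // Node storage is owned elsewhere; the set only links the hooks together.
        Node a{0x1000};
        Node b{0x3000};
        boost::intrusive::set<Node, boost::intrusive::compare<NodeCompare>> set;
        set.insert(a);
        set.insert(b);

        // Probe with a temporary key object, as lower_bound(addr) does above.
        const auto it = set.lower_bound(Node{0x2000}); // points at b
        (void)it;

        set.clear(); // unlink before the nodes go out of scope
    }
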
diff --git a/src/video_core/engines/const_buffer_engine_interface.h b/src/video_core/engines/const_buffer_engine_interface.h
index ebe139504..f46e81bb7 100644
--- a/src/video_core/engines/const_buffer_engine_interface.h
+++ b/src/video_core/engines/const_buffer_engine_interface.h
@@ -93,6 +93,7 @@ public:
93 virtual SamplerDescriptor AccessBoundSampler(ShaderType stage, u64 offset) const = 0; 93 virtual SamplerDescriptor AccessBoundSampler(ShaderType stage, u64 offset) const = 0;
94 virtual SamplerDescriptor AccessBindlessSampler(ShaderType stage, u64 const_buffer, 94 virtual SamplerDescriptor AccessBindlessSampler(ShaderType stage, u64 const_buffer,
95 u64 offset) const = 0; 95 u64 offset) const = 0;
96 virtual SamplerDescriptor AccessSampler(u32 handle) const = 0;
96 virtual u32 GetBoundBuffer() const = 0; 97 virtual u32 GetBoundBuffer() const = 0;
97 98
98 virtual VideoCore::GuestDriverProfile& AccessGuestDriverProfile() = 0; 99 virtual VideoCore::GuestDriverProfile& AccessGuestDriverProfile() = 0;
diff --git a/src/video_core/engines/kepler_compute.cpp b/src/video_core/engines/kepler_compute.cpp
index f6237fc6a..a82b06a38 100644
--- a/src/video_core/engines/kepler_compute.cpp
+++ b/src/video_core/engines/kepler_compute.cpp
@@ -92,8 +92,11 @@ SamplerDescriptor KeplerCompute::AccessBindlessSampler(ShaderType stage, u64 con
92 ASSERT(stage == ShaderType::Compute); 92 ASSERT(stage == ShaderType::Compute);
93 const auto& tex_info_buffer = launch_description.const_buffer_config[const_buffer]; 93 const auto& tex_info_buffer = launch_description.const_buffer_config[const_buffer];
94 const GPUVAddr tex_info_address = tex_info_buffer.Address() + offset; 94 const GPUVAddr tex_info_address = tex_info_buffer.Address() + offset;
95 return AccessSampler(memory_manager.Read<u32>(tex_info_address));
96}
95 97
96 const Texture::TextureHandle tex_handle{memory_manager.Read<u32>(tex_info_address)}; 98SamplerDescriptor KeplerCompute::AccessSampler(u32 handle) const {
99 const Texture::TextureHandle tex_handle{handle};
97 const Texture::FullTextureInfo tex_info = GetTextureInfo(tex_handle); 100 const Texture::FullTextureInfo tex_info = GetTextureInfo(tex_handle);
98 SamplerDescriptor result = SamplerDescriptor::FromTIC(tex_info.tic); 101 SamplerDescriptor result = SamplerDescriptor::FromTIC(tex_info.tic);
99 result.is_shadow.Assign(tex_info.tsc.depth_compare_enabled.Value()); 102 result.is_shadow.Assign(tex_info.tsc.depth_compare_enabled.Value());
diff --git a/src/video_core/engines/kepler_compute.h b/src/video_core/engines/kepler_compute.h
index 18ceedfaf..b7f668d88 100644
--- a/src/video_core/engines/kepler_compute.h
+++ b/src/video_core/engines/kepler_compute.h
@@ -219,6 +219,8 @@ public:
219 SamplerDescriptor AccessBindlessSampler(ShaderType stage, u64 const_buffer, 219 SamplerDescriptor AccessBindlessSampler(ShaderType stage, u64 const_buffer,
220 u64 offset) const override; 220 u64 offset) const override;
221 221
222 SamplerDescriptor AccessSampler(u32 handle) const override;
223
222 u32 GetBoundBuffer() const override { 224 u32 GetBoundBuffer() const override {
223 return regs.tex_cb_index; 225 return regs.tex_cb_index;
224 } 226 }
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index 024c9e43b..ea3c8a963 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -25,9 +25,8 @@ constexpr u32 MacroRegistersStart = 0xE00;
25Maxwell3D::Maxwell3D(Core::System& system, VideoCore::RasterizerInterface& rasterizer, 25Maxwell3D::Maxwell3D(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
26 MemoryManager& memory_manager) 26 MemoryManager& memory_manager)
27 : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager}, 27 : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager},
28 macro_interpreter{*this}, upload_state{memory_manager, regs.upload} { 28 macro_engine{GetMacroEngine(*this)}, upload_state{memory_manager, regs.upload} {
29 dirty.flags.flip(); 29 dirty.flags.flip();
30
31 InitializeRegisterDefaults(); 30 InitializeRegisterDefaults();
32} 31}
33 32
@@ -106,7 +105,11 @@ void Maxwell3D::InitializeRegisterDefaults() {
106 regs.rasterize_enable = 1; 105 regs.rasterize_enable = 1;
107 regs.rt_separate_frag_data = 1; 106 regs.rt_separate_frag_data = 1;
108 regs.framebuffer_srgb = 1; 107 regs.framebuffer_srgb = 1;
108 regs.line_width_aliased = 1.0f;
109 regs.line_width_smooth = 1.0f;
109 regs.front_face = Maxwell3D::Regs::FrontFace::ClockWise; 110 regs.front_face = Maxwell3D::Regs::FrontFace::ClockWise;
111 regs.polygon_mode_back = Maxwell3D::Regs::PolygonMode::Fill;
112 regs.polygon_mode_front = Maxwell3D::Regs::PolygonMode::Fill;
110 113
111 shadow_state = regs; 114 shadow_state = regs;
112 115
@@ -116,7 +119,7 @@ void Maxwell3D::InitializeRegisterDefaults() {
116 mme_inline[MAXWELL3D_REG_INDEX(index_array.count)] = true; 119 mme_inline[MAXWELL3D_REG_INDEX(index_array.count)] = true;
117} 120}
118 121
119void Maxwell3D::CallMacroMethod(u32 method, std::size_t num_parameters, const u32* parameters) { 122void Maxwell3D::CallMacroMethod(u32 method, const std::vector<u32>& parameters) {
120 // Reset the current macro. 123 // Reset the current macro.
121 executing_macro = 0; 124 executing_macro = 0;
122 125
@@ -125,7 +128,7 @@ void Maxwell3D::CallMacroMethod(u32 method, std::size_t num_parameters, const u3
125 ((method - MacroRegistersStart) >> 1) % static_cast<u32>(macro_positions.size()); 128 ((method - MacroRegistersStart) >> 1) % static_cast<u32>(macro_positions.size());
126 129
127 // Execute the current macro. 130 // Execute the current macro.
128 macro_interpreter.Execute(macro_positions[entry], num_parameters, parameters); 131 macro_engine->Execute(macro_positions[entry], parameters);
129 if (mme_draw.current_mode != MMEDrawMode::Undefined) { 132 if (mme_draw.current_mode != MMEDrawMode::Undefined) {
130 FlushMMEInlineDraw(); 133 FlushMMEInlineDraw();
131 } 134 }
@@ -161,7 +164,7 @@ void Maxwell3D::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
161 164
162 // Call the macro when there are no more parameters in the command buffer 165 // Call the macro when there are no more parameters in the command buffer
163 if (is_last_call) { 166 if (is_last_call) {
164 CallMacroMethod(executing_macro, macro_params.size(), macro_params.data()); 167 CallMacroMethod(executing_macro, macro_params);
165 macro_params.clear(); 168 macro_params.clear();
166 } 169 }
167 return; 170 return;
@@ -197,7 +200,7 @@ void Maxwell3D::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
197 break; 200 break;
198 } 201 }
199 case MAXWELL3D_REG_INDEX(macros.data): { 202 case MAXWELL3D_REG_INDEX(macros.data): {
200 ProcessMacroUpload(arg); 203 macro_engine->AddCode(regs.macros.upload_address, arg);
201 break; 204 break;
202 } 205 }
203 case MAXWELL3D_REG_INDEX(macros.bind): { 206 case MAXWELL3D_REG_INDEX(macros.bind): {
@@ -306,7 +309,7 @@ void Maxwell3D::CallMultiMethod(u32 method, const u32* base_start, u32 amount,
306 309
307 // Call the macro when there are no more parameters in the command buffer 310 // Call the macro when there are no more parameters in the command buffer
308 if (amount == methods_pending) { 311 if (amount == methods_pending) {
309 CallMacroMethod(executing_macro, macro_params.size(), macro_params.data()); 312 CallMacroMethod(executing_macro, macro_params);
310 macro_params.clear(); 313 macro_params.clear();
311 } 314 }
312 return; 315 return;
@@ -420,9 +423,7 @@ void Maxwell3D::FlushMMEInlineDraw() {
420} 423}
421 424
422void Maxwell3D::ProcessMacroUpload(u32 data) { 425void Maxwell3D::ProcessMacroUpload(u32 data) {
423 ASSERT_MSG(regs.macros.upload_address < macro_memory.size(), 426 macro_engine->AddCode(regs.macros.upload_address++, data);
424 "upload_address exceeded macro_memory size!");
425 macro_memory[regs.macros.upload_address++] = data;
426} 427}
427 428
428void Maxwell3D::ProcessMacroBind(u32 data) { 429void Maxwell3D::ProcessMacroBind(u32 data) {
@@ -457,8 +458,9 @@ void Maxwell3D::StampQueryResult(u64 payload, bool long_query) {
457 458
458void Maxwell3D::ProcessQueryGet() { 459void Maxwell3D::ProcessQueryGet() {
459 // TODO(Subv): Support the other query units. 460 // TODO(Subv): Support the other query units.
460 ASSERT_MSG(regs.query.query_get.unit == Regs::QueryUnit::Crop, 461 if (regs.query.query_get.unit != Regs::QueryUnit::Crop) {
461 "Units other than CROP are unimplemented"); 462 LOG_DEBUG(HW_GPU, "Units other than CROP are unimplemented");
463 }
462 464
463 switch (regs.query.query_get.operation) { 465 switch (regs.query.query_get.operation) {
464 case Regs::QueryOperation::Release: 466 case Regs::QueryOperation::Release:
@@ -534,8 +536,8 @@ void Maxwell3D::ProcessCounterReset() {
534 rasterizer.ResetCounter(QueryType::SamplesPassed); 536 rasterizer.ResetCounter(QueryType::SamplesPassed);
535 break; 537 break;
536 default: 538 default:
537 LOG_WARNING(Render_OpenGL, "Unimplemented counter reset={}", 539 LOG_DEBUG(Render_OpenGL, "Unimplemented counter reset={}",
538 static_cast<int>(regs.counter_reset)); 540 static_cast<int>(regs.counter_reset));
539 break; 541 break;
540 } 542 }
541} 543}
@@ -592,8 +594,8 @@ std::optional<u64> Maxwell3D::GetQueryResult() {
592 system.GPU().GetTicks()); 594 system.GPU().GetTicks());
593 return {}; 595 return {};
594 default: 596 default:
595 UNIMPLEMENTED_MSG("Unimplemented query select type {}", 597 LOG_DEBUG(HW_GPU, "Unimplemented query select type {}",
596 static_cast<u32>(regs.query.query_get.select.Value())); 598 static_cast<u32>(regs.query.query_get.select.Value()));
597 return 1; 599 return 1;
598 } 600 }
599} 601}
@@ -738,8 +740,11 @@ SamplerDescriptor Maxwell3D::AccessBindlessSampler(ShaderType stage, u64 const_b
738 const auto& shader = state.shader_stages[static_cast<std::size_t>(stage)]; 740 const auto& shader = state.shader_stages[static_cast<std::size_t>(stage)];
739 const auto& tex_info_buffer = shader.const_buffers[const_buffer]; 741 const auto& tex_info_buffer = shader.const_buffers[const_buffer];
740 const GPUVAddr tex_info_address = tex_info_buffer.address + offset; 742 const GPUVAddr tex_info_address = tex_info_buffer.address + offset;
743 return AccessSampler(memory_manager.Read<u32>(tex_info_address));
744}
741 745
742 const Texture::TextureHandle tex_handle{memory_manager.Read<u32>(tex_info_address)}; 746SamplerDescriptor Maxwell3D::AccessSampler(u32 handle) const {
747 const Texture::TextureHandle tex_handle{handle};
743 const Texture::FullTextureInfo tex_info = GetTextureInfo(tex_handle); 748 const Texture::FullTextureInfo tex_info = GetTextureInfo(tex_handle);
744 SamplerDescriptor result = SamplerDescriptor::FromTIC(tex_info.tic); 749 SamplerDescriptor result = SamplerDescriptor::FromTIC(tex_info.tic);
745 result.is_shadow.Assign(tex_info.tsc.depth_compare_enabled.Value()); 750 result.is_shadow.Assign(tex_info.tsc.depth_compare_enabled.Value());
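
CallMacroMethod now forwards the whole parameter vector to the pluggable macro engine, but the slot computation is unchanged: methods at and above MacroRegistersStart (0xE00) appear to come in pairs, one offset starting the macro call and the companion offset streaming arguments, hence the divide-by-two before the modulo. A worked illustration with an assumed slot count (the real table size lives in maxwell_3d.h):

    #include <cstdint>

    constexpr uint32_t MacroRegistersStart = 0xE00;
    constexpr uint32_t NumMacroSlots = 8; // assumed here purely for the example

    uint32_t MacroSlot(uint32_t method) {
        // Two consecutive methods map to one slot, so halve the offset first.
        return ((method - MacroRegistersStart) >> 1) % NumMacroSlots;
    }

    // MacroSlot(0xE00) == 0, MacroSlot(0xE01) == 0,
    // MacroSlot(0xE02) == 1, MacroSlot(0xE03) == 1, ...
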
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index 05dd6b39b..d5fe25065 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -23,7 +23,7 @@
23#include "video_core/engines/engine_upload.h" 23#include "video_core/engines/engine_upload.h"
24#include "video_core/engines/shader_type.h" 24#include "video_core/engines/shader_type.h"
25#include "video_core/gpu.h" 25#include "video_core/gpu.h"
26#include "video_core/macro_interpreter.h" 26#include "video_core/macro/macro.h"
27#include "video_core/textures/texture.h" 27#include "video_core/textures/texture.h"
28 28
29namespace Core { 29namespace Core {
@@ -598,6 +598,7 @@ public:
598 BitField<4, 3, u32> block_height; 598 BitField<4, 3, u32> block_height;
599 BitField<8, 3, u32> block_depth; 599 BitField<8, 3, u32> block_depth;
600 BitField<12, 1, InvMemoryLayout> type; 600 BitField<12, 1, InvMemoryLayout> type;
601 BitField<16, 1, u32> is_3d;
601 } memory_layout; 602 } memory_layout;
602 union { 603 union {
603 BitField<0, 16, u32> layers; 604 BitField<0, 16, u32> layers;
@@ -1403,6 +1404,8 @@ public:
1403 SamplerDescriptor AccessBindlessSampler(ShaderType stage, u64 const_buffer, 1404 SamplerDescriptor AccessBindlessSampler(ShaderType stage, u64 const_buffer,
1404 u64 offset) const override; 1405 u64 offset) const override;
1405 1406
1407 SamplerDescriptor AccessSampler(u32 handle) const override;
1408
1406 u32 GetBoundBuffer() const override { 1409 u32 GetBoundBuffer() const override {
1407 return regs.tex_cb_index; 1410 return regs.tex_cb_index;
1408 } 1411 }
@@ -1411,15 +1414,6 @@ public:
1411 1414
1412 const VideoCore::GuestDriverProfile& AccessGuestDriverProfile() const override; 1415 const VideoCore::GuestDriverProfile& AccessGuestDriverProfile() const override;
1413 1416
1414 /// Memory for macro code - it's undetermined how big this is, however 1MB is much larger than
1415 /// we've seen used.
1416 using MacroMemory = std::array<u32, 0x40000>;
1417
1418 /// Gets a reference to macro memory.
1419 const MacroMemory& GetMacroMemory() const {
1420 return macro_memory;
1421 }
1422
1423 bool ShouldExecute() const { 1417 bool ShouldExecute() const {
1424 return execute_on; 1418 return execute_on;
1425 } 1419 }
@@ -1468,16 +1462,13 @@ private:
1468 1462
1469 std::array<bool, Regs::NUM_REGS> mme_inline{}; 1463 std::array<bool, Regs::NUM_REGS> mme_inline{};
1470 1464
1471 /// Memory for macro code
1472 MacroMemory macro_memory;
1473
1474 /// Macro method that is currently being executed / being fed parameters. 1465 /// Macro method that is currently being executed / being fed parameters.
1475 u32 executing_macro = 0; 1466 u32 executing_macro = 0;
1476 /// Parameters that have been submitted to the macro call so far. 1467 /// Parameters that have been submitted to the macro call so far.
1477 std::vector<u32> macro_params; 1468 std::vector<u32> macro_params;
1478 1469
1479 /// Interpreter for the macro codes uploaded to the GPU. 1470 /// Interpreter for the macro codes uploaded to the GPU.
1480 MacroInterpreter macro_interpreter; 1471 std::unique_ptr<MacroEngine> macro_engine;
1481 1472
1482 static constexpr u32 null_cb_data = 0xFFFFFFFF; 1473 static constexpr u32 null_cb_data = 0xFFFFFFFF;
1483 struct { 1474 struct {
@@ -1506,7 +1497,7 @@ private:
1506 * @param num_parameters Number of arguments 1497 * @param num_parameters Number of arguments
1507 * @param parameters Arguments to the method call 1498 * @param parameters Arguments to the method call
1508 */ 1499 */
1509 void CallMacroMethod(u32 method, std::size_t num_parameters, const u32* parameters); 1500 void CallMacroMethod(u32 method, const std::vector<u32>& parameters);
1510 1501
1511 /// Handles writes to the macro uploading register. 1502 /// Handles writes to the macro uploading register.
1512 void ProcessMacroUpload(u32 data); 1503 void ProcessMacroUpload(u32 data);
diff --git a/src/video_core/macro/macro.cpp b/src/video_core/macro/macro.cpp
new file mode 100644
index 000000000..89077a2d8
--- /dev/null
+++ b/src/video_core/macro/macro.cpp
@@ -0,0 +1,45 @@
1// Copyright 2020 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#include "common/assert.h"
6#include "common/logging/log.h"
7#include "core/settings.h"
8#include "video_core/macro/macro.h"
9#include "video_core/macro/macro_interpreter.h"
10#include "video_core/macro/macro_jit_x64.h"
11
12namespace Tegra {
13
14void MacroEngine::AddCode(u32 method, u32 data) {
15 uploaded_macro_code[method].push_back(data);
16}
17
18void MacroEngine::Execute(u32 method, const std::vector<u32>& parameters) {
19 auto compiled_macro = macro_cache.find(method);
20 if (compiled_macro != macro_cache.end()) {
21 compiled_macro->second->Execute(parameters, method);
22 } else {
23 // Macro not compiled, check if it's uploaded and if so, compile it
24 auto macro_code = uploaded_macro_code.find(method);
25 if (macro_code == uploaded_macro_code.end()) {
26 UNREACHABLE_MSG("Macro 0x{0:x} was not uploaded", method);
27 return;
28 }
29 macro_cache[method] = Compile(macro_code->second);
30 macro_cache[method]->Execute(parameters, method);
31 }
32}
33
34std::unique_ptr<MacroEngine> GetMacroEngine(Engines::Maxwell3D& maxwell3d) {
35 if (Settings::values.disable_macro_jit) {
36 return std::make_unique<MacroInterpreter>(maxwell3d);
37 }
38#ifdef ARCHITECTURE_x86_64
39 return std::make_unique<MacroJITx64>(maxwell3d);
40#else
41 return std::make_unique<MacroInterpreter>(maxwell3d);
42#endif
43}
44
45} // namespace Tegra
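
macro.cpp gives MacroEngine a compile-on-first-use cache: AddCode accumulates uploaded code words per starting method, and Execute compiles a macro the first time that method is called, reusing the CachedMacro afterwards. A hypothetical driver of this flow (the code words are placeholders, not a meaningful macro program):

    #include <vector>
    #include "video_core/macro/macro.h"

    void RunTwice(Tegra::MacroEngine& engine) {
        // Upload code words for the macro starting at method 0.
        engine.AddCode(/*method=*/0, 0x00000301);
        engine.AddCode(/*method=*/0, 0x00000091);

        // The first Execute compiles and caches; the second reuses the cache.
        const std::vector<u32> parameters{1, 2, 3};
        engine.Execute(/*method=*/0, parameters);
        engine.Execute(/*method=*/0, parameters);
    }
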
diff --git a/src/video_core/macro/macro.h b/src/video_core/macro/macro.h
new file mode 100644
index 000000000..b76ed891f
--- /dev/null
+++ b/src/video_core/macro/macro.h
@@ -0,0 +1,128 @@
1// Copyright 2020 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#pragma once
6
7#include <memory>
8#include <unordered_map>
9#include <vector>
10#include "common/bit_field.h"
11#include "common/common_types.h"
12
13namespace Tegra {
14namespace Engines {
15class Maxwell3D;
16}
17namespace Macro {
18constexpr std::size_t NUM_MACRO_REGISTERS = 8;
19enum class Operation : u32 {
20 ALU = 0,
21 AddImmediate = 1,
22 ExtractInsert = 2,
23 ExtractShiftLeftImmediate = 3,
24 ExtractShiftLeftRegister = 4,
25 Read = 5,
26 Unused = 6, // This operation doesn't seem to be a valid encoding.
27 Branch = 7,
28};
29
30enum class ALUOperation : u32 {
31 Add = 0,
32 AddWithCarry = 1,
33 Subtract = 2,
34 SubtractWithBorrow = 3,
35 // Operations 4-7 don't seem to be valid encodings.
36 Xor = 8,
37 Or = 9,
38 And = 10,
39 AndNot = 11,
40 Nand = 12
41};
42
43enum class ResultOperation : u32 {
44 IgnoreAndFetch = 0,
45 Move = 1,
46 MoveAndSetMethod = 2,
47 FetchAndSend = 3,
48 MoveAndSend = 4,
49 FetchAndSetMethod = 5,
50 MoveAndSetMethodFetchAndSend = 6,
51 MoveAndSetMethodSend = 7
52};
53
54enum class BranchCondition : u32 {
55 Zero = 0,
56 NotZero = 1,
57};
58
59union Opcode {
60 u32 raw;
61 BitField<0, 3, Operation> operation;
62 BitField<4, 3, ResultOperation> result_operation;
63 BitField<4, 1, BranchCondition> branch_condition;
64 // If set on a branch, then the branch doesn't have a delay slot.
65 BitField<5, 1, u32> branch_annul;
66 BitField<7, 1, u32> is_exit;
67 BitField<8, 3, u32> dst;
68 BitField<11, 3, u32> src_a;
69 BitField<14, 3, u32> src_b;
70 // The signed immediate overlaps the second source operand and the alu operation.
71 BitField<14, 18, s32> immediate;
72
73 BitField<17, 5, ALUOperation> alu_operation;
74
75 // Bitfield instruction fields
76 BitField<17, 5, u32> bf_src_bit;
77 BitField<22, 5, u32> bf_size;
78 BitField<27, 5, u32> bf_dst_bit;
79
80 u32 GetBitfieldMask() const {
81 return (1 << bf_size) - 1;
82 }
83
84 s32 GetBranchTarget() const {
85 return static_cast<s32>(immediate * sizeof(u32));
86 }
87};
88
89union MethodAddress {
90 u32 raw;
91 BitField<0, 12, u32> address;
92 BitField<12, 6, u32> increment;
93};
94
95} // namespace Macro
96
97class CachedMacro {
98public:
99 virtual ~CachedMacro() = default;
100 /**
101 * Executes the macro code with the specified input parameters.
102 * @param parameters The input parameters of the macro
103 * @param method The method that invoked this macro
104 */
105 virtual void Execute(const std::vector<u32>& parameters, u32 method) = 0;
106};
107
108class MacroEngine {
109public:
110 virtual ~MacroEngine() = default;
111
112 // Stores uploaded macro code so it can be compiled when first called.
113 void AddCode(u32 method, u32 data);
114
115 // Compiles the macro if it's not already in the cache, then executes the compiled macro
116 void Execute(u32 method, const std::vector<u32>& parameters);
117
118protected:
119 virtual std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) = 0;
120
121private:
122 std::unordered_map<u32, std::unique_ptr<CachedMacro>> macro_cache;
123 std::unordered_map<u32, std::vector<u32>> uploaded_macro_code;
124};
125
126std::unique_ptr<MacroEngine> GetMacroEngine(Engines::Maxwell3D& maxwell3d);
127
128} // namespace Tegra
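
For readers without the BitField helper at hand, the Opcode layout above can be decoded with plain shifts and masks. A stand-alone sketch using a made-up raw word (0x00011847 is illustrative, not from any real macro):

    #include <cstdint>
    #include <cstdio>

    int main() {
        const uint32_t raw = 0x00011847; // hypothetical: Branch if r3 == 0, target pc + 16 bytes
        const uint32_t operation = raw & 0x7;                       // bits 0-2: 7 = Branch
        const uint32_t branch_condition = (raw >> 4) & 0x1;         // bit 4: 0 = Zero
        const uint32_t dst = (raw >> 8) & 0x7;                      // bits 8-10
        const uint32_t src_a = (raw >> 11) & 0x7;                   // bits 11-13: 3
        const int32_t immediate = static_cast<int32_t>(raw) >> 14;  // bits 14-31, sign-extended: 4
        // Branch target is immediate * sizeof(u32) = 16 bytes, matching GetBranchTarget().
        std::printf("op=%u cond=%u dst=%u src_a=%u imm=%d\n", operation, branch_condition, dst,
                    src_a, immediate);
    }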
diff --git a/src/video_core/macro_interpreter.cpp b/src/video_core/macro/macro_interpreter.cpp
index 947364928..5edff27aa 100644
--- a/src/video_core/macro_interpreter.cpp
+++ b/src/video_core/macro/macro_interpreter.cpp
@@ -1,4 +1,4 @@
1// Copyright 2018 yuzu Emulator Project 1// Copyright 2020 yuzu Emulator Project
2// Licensed under GPLv2 or any later version 2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included. 3// Refer to the license.txt file included.
4 4
@@ -6,109 +6,46 @@
6#include "common/logging/log.h" 6#include "common/logging/log.h"
7#include "common/microprofile.h" 7#include "common/microprofile.h"
8#include "video_core/engines/maxwell_3d.h" 8#include "video_core/engines/maxwell_3d.h"
9#include "video_core/macro_interpreter.h" 9#include "video_core/macro/macro_interpreter.h"
10 10
11MICROPROFILE_DEFINE(MacroInterp, "GPU", "Execute macro interpreter", MP_RGB(128, 128, 192)); 11MICROPROFILE_DEFINE(MacroInterp, "GPU", "Execute macro interpreter", MP_RGB(128, 128, 192));
12 12
13namespace Tegra { 13namespace Tegra {
14namespace { 14MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {}
15enum class Operation : u32 {
16 ALU = 0,
17 AddImmediate = 1,
18 ExtractInsert = 2,
19 ExtractShiftLeftImmediate = 3,
20 ExtractShiftLeftRegister = 4,
21 Read = 5,
22 Unused = 6, // This operation doesn't seem to be a valid encoding.
23 Branch = 7,
24};
25} // Anonymous namespace
26
27enum class MacroInterpreter::ALUOperation : u32 {
28 Add = 0,
29 AddWithCarry = 1,
30 Subtract = 2,
31 SubtractWithBorrow = 3,
32 // Operations 4-7 don't seem to be valid encodings.
33 Xor = 8,
34 Or = 9,
35 And = 10,
36 AndNot = 11,
37 Nand = 12
38};
39
40enum class MacroInterpreter::ResultOperation : u32 {
41 IgnoreAndFetch = 0,
42 Move = 1,
43 MoveAndSetMethod = 2,
44 FetchAndSend = 3,
45 MoveAndSend = 4,
46 FetchAndSetMethod = 5,
47 MoveAndSetMethodFetchAndSend = 6,
48 MoveAndSetMethodSend = 7
49};
50
51enum class MacroInterpreter::BranchCondition : u32 {
52 Zero = 0,
53 NotZero = 1,
54};
55
56union MacroInterpreter::Opcode {
57 u32 raw;
58 BitField<0, 3, Operation> operation;
59 BitField<4, 3, ResultOperation> result_operation;
60 BitField<4, 1, BranchCondition> branch_condition;
61 // If set on a branch, then the branch doesn't have a delay slot.
62 BitField<5, 1, u32> branch_annul;
63 BitField<7, 1, u32> is_exit;
64 BitField<8, 3, u32> dst;
65 BitField<11, 3, u32> src_a;
66 BitField<14, 3, u32> src_b;
67 // The signed immediate overlaps the second source operand and the alu operation.
68 BitField<14, 18, s32> immediate;
69
70 BitField<17, 5, ALUOperation> alu_operation;
71
72 // Bitfield instructions data
73 BitField<17, 5, u32> bf_src_bit;
74 BitField<22, 5, u32> bf_size;
75 BitField<27, 5, u32> bf_dst_bit;
76
77 u32 GetBitfieldMask() const {
78 return (1 << bf_size) - 1;
79 }
80 15
81 s32 GetBranchTarget() const { 16std::unique_ptr<CachedMacro> MacroInterpreter::Compile(const std::vector<u32>& code) {
82 return static_cast<s32>(immediate * sizeof(u32)); 17 return std::make_unique<MacroInterpreterImpl>(maxwell3d, code);
83 } 18}
84};
85 19
86MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {} 20MacroInterpreterImpl::MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d,
21 const std::vector<u32>& code)
22 : maxwell3d(maxwell3d), code(code) {}
87 23
88void MacroInterpreter::Execute(u32 offset, std::size_t num_parameters, const u32* parameters) { 24void MacroInterpreterImpl::Execute(const std::vector<u32>& parameters, u32 method) {
89 MICROPROFILE_SCOPE(MacroInterp); 25 MICROPROFILE_SCOPE(MacroInterp);
90 Reset(); 26 Reset();
91 27
92 registers[1] = parameters[0]; 28 registers[1] = parameters[0];
29 num_parameters = parameters.size();
93 30
94 if (num_parameters > parameters_capacity) { 31 if (num_parameters > parameters_capacity) {
95 parameters_capacity = num_parameters; 32 parameters_capacity = num_parameters;
96 this->parameters = std::make_unique<u32[]>(num_parameters); 33 this->parameters = std::make_unique<u32[]>(num_parameters);
97 } 34 }
98 std::memcpy(this->parameters.get(), parameters, num_parameters * sizeof(u32)); 35 std::memcpy(this->parameters.get(), parameters.data(), num_parameters * sizeof(u32));
99 this->num_parameters = num_parameters; 36 this->num_parameters = num_parameters;
100 37
101 // Execute the code until we hit an exit condition. 38 // Execute the code until we hit an exit condition.
102 bool keep_executing = true; 39 bool keep_executing = true;
103 while (keep_executing) { 40 while (keep_executing) {
104 keep_executing = Step(offset, false); 41 keep_executing = Step(false);
105 } 42 }
106 43
107 // Assert that the macro used all the input parameters 44 // Assert that the macro used all the input parameters
108 ASSERT(next_parameter_index == num_parameters); 45 ASSERT(next_parameter_index == num_parameters);
109} 46}
110 47
111void MacroInterpreter::Reset() { 48void MacroInterpreterImpl::Reset() {
112 registers = {}; 49 registers = {};
113 pc = 0; 50 pc = 0;
114 delayed_pc = {}; 51 delayed_pc = {};
@@ -120,10 +57,10 @@ void MacroInterpreter::Reset() {
120 carry_flag = false; 57 carry_flag = false;
121} 58}
122 59
123bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) { 60bool MacroInterpreterImpl::Step(bool is_delay_slot) {
124 u32 base_address = pc; 61 u32 base_address = pc;
125 62
126 Opcode opcode = GetOpcode(offset); 63 Macro::Opcode opcode = GetOpcode();
127 pc += 4; 64 pc += 4;
128 65
129 // Update the program counter if we were delayed 66 // Update the program counter if we were delayed
@@ -134,18 +71,18 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
134 } 71 }
135 72
136 switch (opcode.operation) { 73 switch (opcode.operation) {
137 case Operation::ALU: { 74 case Macro::Operation::ALU: {
138 u32 result = GetALUResult(opcode.alu_operation, GetRegister(opcode.src_a), 75 u32 result = GetALUResult(opcode.alu_operation, GetRegister(opcode.src_a),
139 GetRegister(opcode.src_b)); 76 GetRegister(opcode.src_b));
140 ProcessResult(opcode.result_operation, opcode.dst, result); 77 ProcessResult(opcode.result_operation, opcode.dst, result);
141 break; 78 break;
142 } 79 }
143 case Operation::AddImmediate: { 80 case Macro::Operation::AddImmediate: {
144 ProcessResult(opcode.result_operation, opcode.dst, 81 ProcessResult(opcode.result_operation, opcode.dst,
145 GetRegister(opcode.src_a) + opcode.immediate); 82 GetRegister(opcode.src_a) + opcode.immediate);
146 break; 83 break;
147 } 84 }
148 case Operation::ExtractInsert: { 85 case Macro::Operation::ExtractInsert: {
149 u32 dst = GetRegister(opcode.src_a); 86 u32 dst = GetRegister(opcode.src_a);
150 u32 src = GetRegister(opcode.src_b); 87 u32 src = GetRegister(opcode.src_b);
151 88
@@ -155,7 +92,7 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
155 ProcessResult(opcode.result_operation, opcode.dst, dst); 92 ProcessResult(opcode.result_operation, opcode.dst, dst);
156 break; 93 break;
157 } 94 }
158 case Operation::ExtractShiftLeftImmediate: { 95 case Macro::Operation::ExtractShiftLeftImmediate: {
159 u32 dst = GetRegister(opcode.src_a); 96 u32 dst = GetRegister(opcode.src_a);
160 u32 src = GetRegister(opcode.src_b); 97 u32 src = GetRegister(opcode.src_b);
161 98
@@ -164,7 +101,7 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
164 ProcessResult(opcode.result_operation, opcode.dst, result); 101 ProcessResult(opcode.result_operation, opcode.dst, result);
165 break; 102 break;
166 } 103 }
167 case Operation::ExtractShiftLeftRegister: { 104 case Macro::Operation::ExtractShiftLeftRegister: {
168 u32 dst = GetRegister(opcode.src_a); 105 u32 dst = GetRegister(opcode.src_a);
169 u32 src = GetRegister(opcode.src_b); 106 u32 src = GetRegister(opcode.src_b);
170 107
@@ -173,12 +110,12 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
173 ProcessResult(opcode.result_operation, opcode.dst, result); 110 ProcessResult(opcode.result_operation, opcode.dst, result);
174 break; 111 break;
175 } 112 }
176 case Operation::Read: { 113 case Macro::Operation::Read: {
177 u32 result = Read(GetRegister(opcode.src_a) + opcode.immediate); 114 u32 result = Read(GetRegister(opcode.src_a) + opcode.immediate);
178 ProcessResult(opcode.result_operation, opcode.dst, result); 115 ProcessResult(opcode.result_operation, opcode.dst, result);
179 break; 116 break;
180 } 117 }
181 case Operation::Branch: { 118 case Macro::Operation::Branch: {
182 ASSERT_MSG(!is_delay_slot, "Executing a branch in a delay slot is not valid"); 119 ASSERT_MSG(!is_delay_slot, "Executing a branch in a delay slot is not valid");
183 u32 value = GetRegister(opcode.src_a); 120 u32 value = GetRegister(opcode.src_a);
184 bool taken = EvaluateBranchCondition(opcode.branch_condition, value); 121 bool taken = EvaluateBranchCondition(opcode.branch_condition, value);
@@ -191,7 +128,7 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
191 128
192 delayed_pc = base_address + opcode.GetBranchTarget(); 129 delayed_pc = base_address + opcode.GetBranchTarget();
193 // Execute one more instruction due to the delay slot. 130 // Execute one more instruction due to the delay slot.
194 return Step(offset, true); 131 return Step(true);
195 } 132 }
196 break; 133 break;
197 } 134 }
@@ -204,51 +141,44 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
204 // cause an exit if it's executed inside a delay slot. 141 // cause an exit if it's executed inside a delay slot.
205 if (opcode.is_exit && !is_delay_slot) { 142 if (opcode.is_exit && !is_delay_slot) {
206 // Exit has a delay slot, execute the next instruction 143 // Exit has a delay slot, execute the next instruction
207 Step(offset, true); 144 Step(true);
208 return false; 145 return false;
209 } 146 }
210 147
211 return true; 148 return true;
212} 149}
213 150
214MacroInterpreter::Opcode MacroInterpreter::GetOpcode(u32 offset) const { 151u32 MacroInterpreterImpl::GetALUResult(Macro::ALUOperation operation, u32 src_a, u32 src_b) {
215 const auto& macro_memory{maxwell3d.GetMacroMemory()};
216 ASSERT((pc % sizeof(u32)) == 0);
217 ASSERT((pc + offset) < macro_memory.size() * sizeof(u32));
218 return {macro_memory[offset + pc / sizeof(u32)]};
219}
220
221u32 MacroInterpreter::GetALUResult(ALUOperation operation, u32 src_a, u32 src_b) {
222 switch (operation) { 152 switch (operation) {
223 case ALUOperation::Add: { 153 case Macro::ALUOperation::Add: {
224 const u64 result{static_cast<u64>(src_a) + src_b}; 154 const u64 result{static_cast<u64>(src_a) + src_b};
225 carry_flag = result > 0xffffffff; 155 carry_flag = result > 0xffffffff;
226 return static_cast<u32>(result); 156 return static_cast<u32>(result);
227 } 157 }
228 case ALUOperation::AddWithCarry: { 158 case Macro::ALUOperation::AddWithCarry: {
229 const u64 result{static_cast<u64>(src_a) + src_b + (carry_flag ? 1ULL : 0ULL)}; 159 const u64 result{static_cast<u64>(src_a) + src_b + (carry_flag ? 1ULL : 0ULL)};
230 carry_flag = result > 0xffffffff; 160 carry_flag = result > 0xffffffff;
231 return static_cast<u32>(result); 161 return static_cast<u32>(result);
232 } 162 }
233 case ALUOperation::Subtract: { 163 case Macro::ALUOperation::Subtract: {
234 const u64 result{static_cast<u64>(src_a) - src_b}; 164 const u64 result{static_cast<u64>(src_a) - src_b};
235 carry_flag = result < 0x100000000; 165 carry_flag = result < 0x100000000;
236 return static_cast<u32>(result); 166 return static_cast<u32>(result);
237 } 167 }
238 case ALUOperation::SubtractWithBorrow: { 168 case Macro::ALUOperation::SubtractWithBorrow: {
239 const u64 result{static_cast<u64>(src_a) - src_b - (carry_flag ? 0ULL : 1ULL)}; 169 const u64 result{static_cast<u64>(src_a) - src_b - (carry_flag ? 0ULL : 1ULL)};
240 carry_flag = result < 0x100000000; 170 carry_flag = result < 0x100000000;
241 return static_cast<u32>(result); 171 return static_cast<u32>(result);
242 } 172 }
243 case ALUOperation::Xor: 173 case Macro::ALUOperation::Xor:
244 return src_a ^ src_b; 174 return src_a ^ src_b;
245 case ALUOperation::Or: 175 case Macro::ALUOperation::Or:
246 return src_a | src_b; 176 return src_a | src_b;
247 case ALUOperation::And: 177 case Macro::ALUOperation::And:
248 return src_a & src_b; 178 return src_a & src_b;
249 case ALUOperation::AndNot: 179 case Macro::ALUOperation::AndNot:
250 return src_a & ~src_b; 180 return src_a & ~src_b;
251 case ALUOperation::Nand: 181 case Macro::ALUOperation::Nand:
252 return ~(src_a & src_b); 182 return ~(src_a & src_b);
253 183
254 default: 184 default:
@@ -257,43 +187,43 @@ u32 MacroInterpreter::GetALUResult(ALUOperation operation, u32 src_a, u32 src_b)
257 } 187 }
258} 188}
259 189
260void MacroInterpreter::ProcessResult(ResultOperation operation, u32 reg, u32 result) { 190void MacroInterpreterImpl::ProcessResult(Macro::ResultOperation operation, u32 reg, u32 result) {
261 switch (operation) { 191 switch (operation) {
262 case ResultOperation::IgnoreAndFetch: 192 case Macro::ResultOperation::IgnoreAndFetch:
263 // Fetch parameter and ignore result. 193 // Fetch parameter and ignore result.
264 SetRegister(reg, FetchParameter()); 194 SetRegister(reg, FetchParameter());
265 break; 195 break;
266 case ResultOperation::Move: 196 case Macro::ResultOperation::Move:
267 // Move result. 197 // Move result.
268 SetRegister(reg, result); 198 SetRegister(reg, result);
269 break; 199 break;
270 case ResultOperation::MoveAndSetMethod: 200 case Macro::ResultOperation::MoveAndSetMethod:
271 // Move result and use as Method Address. 201 // Move result and use as Method Address.
272 SetRegister(reg, result); 202 SetRegister(reg, result);
273 SetMethodAddress(result); 203 SetMethodAddress(result);
274 break; 204 break;
275 case ResultOperation::FetchAndSend: 205 case Macro::ResultOperation::FetchAndSend:
276 // Fetch parameter and send result. 206 // Fetch parameter and send result.
277 SetRegister(reg, FetchParameter()); 207 SetRegister(reg, FetchParameter());
278 Send(result); 208 Send(result);
279 break; 209 break;
280 case ResultOperation::MoveAndSend: 210 case Macro::ResultOperation::MoveAndSend:
281 // Move and send result. 211 // Move and send result.
282 SetRegister(reg, result); 212 SetRegister(reg, result);
283 Send(result); 213 Send(result);
284 break; 214 break;
285 case ResultOperation::FetchAndSetMethod: 215 case Macro::ResultOperation::FetchAndSetMethod:
286 // Fetch parameter and use result as Method Address. 216 // Fetch parameter and use result as Method Address.
287 SetRegister(reg, FetchParameter()); 217 SetRegister(reg, FetchParameter());
288 SetMethodAddress(result); 218 SetMethodAddress(result);
289 break; 219 break;
290 case ResultOperation::MoveAndSetMethodFetchAndSend: 220 case Macro::ResultOperation::MoveAndSetMethodFetchAndSend:
291 // Move result and use as Method Address, then fetch and send parameter. 221 // Move result and use as Method Address, then fetch and send parameter.
292 SetRegister(reg, result); 222 SetRegister(reg, result);
293 SetMethodAddress(result); 223 SetMethodAddress(result);
294 Send(FetchParameter()); 224 Send(FetchParameter());
295 break; 225 break;
296 case ResultOperation::MoveAndSetMethodSend: 226 case Macro::ResultOperation::MoveAndSetMethodSend:
297 // Move result and use as Method Address, then send bits 12:17 of result. 227 // Move result and use as Method Address, then send bits 12:17 of result.
298 SetRegister(reg, result); 228 SetRegister(reg, result);
299 SetMethodAddress(result); 229 SetMethodAddress(result);
@@ -304,16 +234,28 @@ void MacroInterpreter::ProcessResult(ResultOperation operation, u32 reg, u32 res
304 } 234 }
305} 235}
306 236
307u32 MacroInterpreter::FetchParameter() { 237bool MacroInterpreterImpl::EvaluateBranchCondition(Macro::BranchCondition cond, u32 value) const {
308 ASSERT(next_parameter_index < num_parameters); 238 switch (cond) {
309 return parameters[next_parameter_index++]; 239 case Macro::BranchCondition::Zero:
240 return value == 0;
241 case Macro::BranchCondition::NotZero:
242 return value != 0;
243 }
244 UNREACHABLE();
245 return true;
310} 246}
311 247
312u32 MacroInterpreter::GetRegister(u32 register_id) const { 248Macro::Opcode MacroInterpreterImpl::GetOpcode() const {
249 ASSERT((pc % sizeof(u32)) == 0);
250 ASSERT(pc < code.size() * sizeof(u32));
251 return {code[pc / sizeof(u32)]};
252}
253
254u32 MacroInterpreterImpl::GetRegister(u32 register_id) const {
313 return registers.at(register_id); 255 return registers.at(register_id);
314} 256}
315 257
316void MacroInterpreter::SetRegister(u32 register_id, u32 value) { 258void MacroInterpreterImpl::SetRegister(u32 register_id, u32 value) {
317 // Register 0 is hardwired as the zero register. 259 // Register 0 is hardwired as the zero register.
318 // Ensure no writes to it actually occur. 260 // Ensure no writes to it actually occur.
319 if (register_id == 0) { 261 if (register_id == 0) {
@@ -323,30 +265,24 @@ void MacroInterpreter::SetRegister(u32 register_id, u32 value) {
323 registers.at(register_id) = value; 265 registers.at(register_id) = value;
324} 266}
325 267
326void MacroInterpreter::SetMethodAddress(u32 address) { 268void MacroInterpreterImpl::SetMethodAddress(u32 address) {
327 method_address.raw = address; 269 method_address.raw = address;
328} 270}
329 271
330void MacroInterpreter::Send(u32 value) { 272void MacroInterpreterImpl::Send(u32 value) {
331 maxwell3d.CallMethodFromMME(method_address.address, value); 273 maxwell3d.CallMethodFromMME(method_address.address, value);
332 // Increment the method address by the method increment. 274 // Increment the method address by the method increment.
333 method_address.address.Assign(method_address.address.Value() + 275 method_address.address.Assign(method_address.address.Value() +
334 method_address.increment.Value()); 276 method_address.increment.Value());
335} 277}
336 278
337u32 MacroInterpreter::Read(u32 method) const { 279u32 MacroInterpreterImpl::Read(u32 method) const {
338 return maxwell3d.GetRegisterValue(method); 280 return maxwell3d.GetRegisterValue(method);
339} 281}
340 282
341bool MacroInterpreter::EvaluateBranchCondition(BranchCondition cond, u32 value) const { 283u32 MacroInterpreterImpl::FetchParameter() {
342 switch (cond) { 284 ASSERT(next_parameter_index < num_parameters);
343 case BranchCondition::Zero: 285 return parameters[next_parameter_index++];
344 return value == 0;
345 case BranchCondition::NotZero:
346 return value != 0;
347 }
348 UNREACHABLE();
349 return true;
350} 286}
351 287
352} // namespace Tegra 288} // namespace Tegra
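
The Add/AddWithCarry cases above compute in 64 bits and derive the carry flag from whether the 32-bit result overflowed. A stand-alone sketch of that rule:

    #include <cassert>
    #include <cstdint>

    int main() {
        bool carry = false;
        // Mirrors GetALUResult's Add case: widen to 64 bits, carry = overflow of 32 bits.
        auto add = [&](uint32_t a, uint32_t b) {
            const uint64_t r = static_cast<uint64_t>(a) + b;
            carry = r > 0xffffffff;
            return static_cast<uint32_t>(r);
        };
        assert(add(0xffffffff, 1) == 0 && carry); // wraps to 0 and sets carry
        assert(add(2, 3) == 5 && !carry);         // no overflow clears carry
    }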
diff --git a/src/video_core/macro_interpreter.h b/src/video_core/macro/macro_interpreter.h
index 631146d89..90217fc89 100644
--- a/src/video_core/macro_interpreter.h
+++ b/src/video_core/macro/macro_interpreter.h
@@ -1,44 +1,37 @@
1// Copyright 2018 yuzu Emulator Project 1// Copyright 2020 yuzu Emulator Project
2// Licensed under GPLv2 or any later version 2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included. 3// Refer to the license.txt file included.
4 4
5#pragma once 5#pragma once
6
7#include <array> 6#include <array>
8#include <optional> 7#include <optional>
9 8#include <vector>
10#include "common/bit_field.h" 9#include "common/bit_field.h"
11#include "common/common_types.h" 10#include "common/common_types.h"
11#include "video_core/macro/macro.h"
12 12
13namespace Tegra { 13namespace Tegra {
14namespace Engines { 14namespace Engines {
15class Maxwell3D; 15class Maxwell3D;
16} 16}
17 17
18class MacroInterpreter final { 18class MacroInterpreter final : public MacroEngine {
19public: 19public:
20 explicit MacroInterpreter(Engines::Maxwell3D& maxwell3d); 20 explicit MacroInterpreter(Engines::Maxwell3D& maxwell3d);
21 21
22 /** 22protected:
23 * Executes the macro code with the specified input parameters. 23 std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) override;
24 * @param offset Offset to start execution at.
25 * @param parameters The parameters of the macro.
26 */
27 void Execute(u32 offset, std::size_t num_parameters, const u32* parameters);
28 24
29private: 25private:
30 enum class ALUOperation : u32; 26 Engines::Maxwell3D& maxwell3d;
31 enum class BranchCondition : u32; 27};
32 enum class ResultOperation : u32;
33
34 union Opcode;
35 28
36 union MethodAddress { 29class MacroInterpreterImpl : public CachedMacro {
37 u32 raw; 30public:
38 BitField<0, 12, u32> address; 31 MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& code);
39 BitField<12, 6, u32> increment; 32 void Execute(const std::vector<u32>& parameters, u32 method) override;
40 };
41 33
34private:
42 /// Resets the execution engine state, zeroing registers, etc. 35 /// Resets the execution engine state, zeroing registers, etc.
43 void Reset(); 36 void Reset();
44 37
@@ -49,20 +42,20 @@ private:
49 * @param is_delay_slot Whether the current step is being executed due to a delay slot in a 42 * @param is_delay_slot Whether the current step is being executed due to a delay slot in a
50 * previous instruction. 43 * previous instruction.
51 */ 44 */
52 bool Step(u32 offset, bool is_delay_slot); 45 bool Step(bool is_delay_slot);
53 46
54 /// Calculates the result of an ALU operation. src_a OP src_b; 47 /// Calculates the result of an ALU operation. src_a OP src_b;
55 u32 GetALUResult(ALUOperation operation, u32 src_a, u32 src_b); 48 u32 GetALUResult(Macro::ALUOperation operation, u32 src_a, u32 src_b);
56 49
57 /// Performs the result operation on the input result and stores it in the specified register 50 /// Performs the result operation on the input result and stores it in the specified register
58 /// (if necessary). 51 /// (if necessary).
59 void ProcessResult(ResultOperation operation, u32 reg, u32 result); 52 void ProcessResult(Macro::ResultOperation operation, u32 reg, u32 result);
60 53
61 /// Evaluates the branch condition and returns whether the branch should be taken or not. 54 /// Evaluates the branch condition and returns whether the branch should be taken or not.
62 bool EvaluateBranchCondition(BranchCondition cond, u32 value) const; 55 bool EvaluateBranchCondition(Macro::BranchCondition cond, u32 value) const;
63 56
64 /// Reads an opcode at the current program counter location. 57 /// Reads an opcode at the current program counter location.
65 Opcode GetOpcode(u32 offset) const; 58 Macro::Opcode GetOpcode() const;
66 59
67 /// Returns the specified register's value. Register 0 is hardcoded to always return 0. 60 /// Returns the specified register's value. Register 0 is hardcoded to always return 0.
68 u32 GetRegister(u32 register_id) const; 61 u32 GetRegister(u32 register_id) const;
@@ -89,13 +82,11 @@ private:
89 /// Program counter to execute at after the delay slot is executed. 82 /// Program counter to execute at after the delay slot is executed.
90 std::optional<u32> delayed_pc; 83 std::optional<u32> delayed_pc;
91 84
92 static constexpr std::size_t NumMacroRegisters = 8;
93
94 /// General purpose macro registers. 85 /// General purpose macro registers.
95 std::array<u32, NumMacroRegisters> registers = {}; 86 std::array<u32, Macro::NUM_MACRO_REGISTERS> registers = {};
96 87
97 /// Method address to use for the next Send instruction. 88 /// Method address to use for the next Send instruction.
98 MethodAddress method_address = {}; 89 Macro::MethodAddress method_address = {};
99 90
100 /// Input parameters of the current macro. 91 /// Input parameters of the current macro.
101 std::unique_ptr<u32[]> parameters; 92 std::unique_ptr<u32[]> parameters;
@@ -105,5 +96,7 @@ private:
105 u32 next_parameter_index = 0; 96 u32 next_parameter_index = 0;
106 97
107 bool carry_flag = false; 98 bool carry_flag = false;
99 const std::vector<u32>& code;
108}; 100};
101
109} // namespace Tegra 102} // namespace Tegra
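
The delayed_pc member above models the hardware's branch delay slot: a taken branch records its target, one more instruction executes, and only then does the PC move. A simplified stand-alone sketch of that discipline (the real interpreter instead recurses into Step for the slot):

    #include <cstdint>
    #include <optional>

    int main() {
        std::optional<uint32_t> delayed_pc;
        uint32_t pc = 0;
        auto step = [&](bool take_branch, uint32_t target) {
            pc += 4;          // advance past the current instruction
            if (delayed_pc) { // a branch was taken on the previous step
                pc = *delayed_pc;
                delayed_pc.reset();
            }
            if (take_branch) {
                delayed_pc = target; // the jump lands after one more step
            }
        };
        step(true, 0x40); // branch at 0x0: delay slot now pending
        step(false, 0);   // instruction at 0x4 (the delay slot) still executes
        return pc == 0x40 ? 0 : 1;
    }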
diff --git a/src/video_core/macro/macro_jit_x64.cpp b/src/video_core/macro/macro_jit_x64.cpp
new file mode 100644
index 000000000..11c1cc3be
--- /dev/null
+++ b/src/video_core/macro/macro_jit_x64.cpp
@@ -0,0 +1,640 @@
1// Copyright 2020 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#include "common/assert.h"
6#include "common/logging/log.h"
7#include "common/microprofile.h"
8#include "common/x64/xbyak_util.h"
9#include "video_core/engines/maxwell_3d.h"
10#include "video_core/macro/macro_interpreter.h"
11#include "video_core/macro/macro_jit_x64.h"
12
13MICROPROFILE_DEFINE(MacroJitCompile, "GPU", "Compile macro JIT", MP_RGB(173, 255, 47));
14MICROPROFILE_DEFINE(MacroJitExecute, "GPU", "Execute macro JIT", MP_RGB(255, 255, 0));
15
16namespace Tegra {
17static const Xbyak::Reg64 PARAMETERS = Xbyak::util::r9;
18static const Xbyak::Reg64 REGISTERS = Xbyak::util::r10;
19static const Xbyak::Reg64 STATE = Xbyak::util::r11;
20static const Xbyak::Reg64 NEXT_PARAMETER = Xbyak::util::r12;
21static const Xbyak::Reg32 RESULT = Xbyak::util::r13d;
22static const Xbyak::Reg64 RESULT_64 = Xbyak::util::r13;
23static const Xbyak::Reg32 METHOD_ADDRESS = Xbyak::util::r14d;
24static const Xbyak::Reg64 METHOD_ADDRESS_64 = Xbyak::util::r14;
25static const Xbyak::Reg64 BRANCH_HOLDER = Xbyak::util::r15;
26
27static const std::bitset<32> PERSISTENT_REGISTERS = Common::X64::BuildRegSet({
28 PARAMETERS,
29 REGISTERS,
30 STATE,
31 NEXT_PARAMETER,
32 RESULT,
33 METHOD_ADDRESS,
34 BRANCH_HOLDER,
35});
36
37MacroJITx64::MacroJITx64(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {}
38
39std::unique_ptr<CachedMacro> MacroJITx64::Compile(const std::vector<u32>& code) {
40 return std::make_unique<MacroJITx64Impl>(maxwell3d, code);
41}
42
43MacroJITx64Impl::MacroJITx64Impl(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& code)
44 : Xbyak::CodeGenerator(MAX_CODE_SIZE), code(code), maxwell3d(maxwell3d) {
45 Compile();
46}
47
48MacroJITx64Impl::~MacroJITx64Impl() = default;
49
50void MacroJITx64Impl::Execute(const std::vector<u32>& parameters, u32 method) {
51 MICROPROFILE_SCOPE(MacroJitExecute);
52 ASSERT_OR_EXECUTE(program != nullptr, { return; });
53 JITState state{};
54 state.maxwell3d = &maxwell3d;
55 state.registers = {};
56 state.parameters = parameters.data();
57 program(&state);
58}
59
60void MacroJITx64Impl::Compile_ALU(Macro::Opcode opcode) {
61 const bool is_a_zero = opcode.src_a == 0;
62 const bool is_b_zero = opcode.src_b == 0;
63 const bool valid_operation = !is_a_zero && !is_b_zero;
64 const bool is_move_operation = !is_a_zero && is_b_zero;
65 const bool has_zero_register = is_a_zero || is_b_zero;
66
67 Xbyak::Reg64 src_a;
68 Xbyak::Reg32 src_b;
69
70 if (!optimizer.zero_reg_skip) {
71 src_a = Compile_GetRegister(opcode.src_a, RESULT_64);
72 src_b = Compile_GetRegister(opcode.src_b, ebx);
73 } else {
74 if (!is_a_zero) {
75 src_a = Compile_GetRegister(opcode.src_a, RESULT_64);
76 }
77 if (!is_b_zero) {
78 src_b = Compile_GetRegister(opcode.src_b, ebx);
79 }
80 }
81 Xbyak::Label skip_carry{};
82
83 bool has_emitted = false;
84
85 switch (opcode.alu_operation) {
86 case Macro::ALUOperation::Add:
87 if (optimizer.zero_reg_skip) {
88 if (valid_operation) {
89 add(src_a, src_b);
90 }
91 } else {
92 add(src_a, src_b);
93 }
94
95 if (!optimizer.can_skip_carry) {
96 setc(byte[STATE + offsetof(JITState, carry_flag)]);
97 }
98 break;
99 case Macro::ALUOperation::AddWithCarry:
100 bt(dword[STATE + offsetof(JITState, carry_flag)], 0);
101 adc(src_a, src_b);
102 setc(byte[STATE + offsetof(JITState, carry_flag)]);
103 break;
104 case Macro::ALUOperation::Subtract:
105 if (optimizer.zero_reg_skip) {
106 if (valid_operation) {
107 sub(src_a, src_b);
108 has_emitted = true;
109 }
110 } else {
111 sub(src_a, src_b);
112 has_emitted = true;
113 }
114 if (!optimizer.can_skip_carry && has_emitted) {
115 setc(byte[STATE + offsetof(JITState, carry_flag)]);
116 }
117 break;
118 case Macro::ALUOperation::SubtractWithBorrow:
119 bt(dword[STATE + offsetof(JITState, carry_flag)], 0);
120 sbb(src_a, src_b);
121 setc(byte[STATE + offsetof(JITState, carry_flag)]);
122 break;
123 case Macro::ALUOperation::Xor:
124 if (optimizer.zero_reg_skip) {
125 if (valid_operation) {
126 xor_(src_a, src_b);
127 }
128 } else {
129 xor_(src_a, src_b);
130 }
131 break;
132 case Macro::ALUOperation::Or:
133 if (optimizer.zero_reg_skip) {
134 if (valid_operation) {
135 or_(src_a, src_b);
136 }
137 } else {
138 or_(src_a, src_b);
139 }
140 break;
141 case Macro::ALUOperation::And:
142 if (optimizer.zero_reg_skip) {
143 if (!has_zero_register) {
144 and_(src_a, src_b);
145 }
146 } else {
147 and_(src_a, src_b);
148 }
149 break;
150 case Macro::ALUOperation::AndNot:
151 if (optimizer.zero_reg_skip) {
152 if (!is_a_zero) {
153 not_(src_b);
154 and_(src_a, src_b);
155 }
156 } else {
157 not_(src_b);
158 and_(src_a, src_b);
159 }
160 break;
161 case Macro::ALUOperation::Nand:
162 if (optimizer.zero_reg_skip) {
163 if (!is_a_zero) {
164 and_(src_a, src_b);
165 not_(src_a);
166 }
167 } else {
168 and_(src_a, src_b);
169 not_(src_a);
170 }
171 break;
172 default:
173 UNIMPLEMENTED_MSG("Unimplemented ALU operation {}",
174 static_cast<std::size_t>(opcode.alu_operation.Value()));
175 break;
176 }
177 Compile_ProcessResult(opcode.result_operation, opcode.dst);
178}
179
180void MacroJITx64Impl::Compile_AddImmediate(Macro::Opcode opcode) {
181 if (optimizer.skip_dummy_addimmediate) {
182 // Games tend to use this as an exit instruction placeholder: it encodes an instruction
183 // that does nothing. In our case we can simply not emit anything.
184 if (opcode.result_operation == Macro::ResultOperation::Move && opcode.dst == 0) {
185 return;
186 }
187 }
188 // Check for redundant moves
189 if (optimizer.optimize_for_method_move &&
190 opcode.result_operation == Macro::ResultOperation::MoveAndSetMethod) {
191 if (next_opcode.has_value()) {
192 const auto next = *next_opcode;
193 if (next.result_operation == Macro::ResultOperation::MoveAndSetMethod) {
194 return;
195 }
196 }
197 }
198 if (optimizer.zero_reg_skip && opcode.src_a == 0) {
199 if (opcode.immediate == 0) {
200 xor_(RESULT, RESULT);
201 } else {
202 mov(RESULT, opcode.immediate);
203 }
204 } else {
205 auto result = Compile_GetRegister(opcode.src_a, RESULT);
206 if (opcode.immediate > 1) { // > 1 so an immediate of 2 still emits an add (0 stays a no-op)
207 add(result, opcode.immediate);
208 } else if (opcode.immediate == 1) {
209 inc(result);
210 } else if (opcode.immediate < 0) {
211 sub(result, opcode.immediate * -1);
212 }
213 }
214 Compile_ProcessResult(opcode.result_operation, opcode.dst);
215}
216
217void MacroJITx64Impl::Compile_ExtractInsert(Macro::Opcode opcode) {
218 auto dst = Compile_GetRegister(opcode.src_a, RESULT);
219 auto src = Compile_GetRegister(opcode.src_b, eax);
220
221 if (opcode.bf_src_bit != 0 && opcode.bf_src_bit != 31) {
222 shr(src, opcode.bf_src_bit);
223 } else if (opcode.bf_src_bit == 31) {
224 xor_(src, src);
225 }
226 // Don't bother masking the whole register since we're using a 32-bit register
227 if (opcode.bf_size != 31 && opcode.bf_size != 0) {
228 and_(src, opcode.GetBitfieldMask());
229 } else if (opcode.bf_size == 0) {
230 xor_(src, src);
231 }
232 if (opcode.bf_dst_bit != 31 && opcode.bf_dst_bit != 0) {
233 shl(src, opcode.bf_dst_bit);
234 } else if (opcode.bf_dst_bit == 31) {
235 xor_(src, src);
236 }
237
238 const u32 mask = ~(opcode.GetBitfieldMask() << opcode.bf_dst_bit);
239 if (mask != 0xffffffff) {
240 and_(dst, mask);
241 }
242 or_(dst, src);
243 Compile_ProcessResult(opcode.result_operation, opcode.dst);
244}
245
246void MacroJITx64Impl::Compile_ExtractShiftLeftImmediate(Macro::Opcode opcode) {
247 auto dst = Compile_GetRegister(opcode.src_a, eax);
248 auto src = Compile_GetRegister(opcode.src_b, RESULT);
249
250 shr(src, al);
251 if (opcode.bf_size != 0 && opcode.bf_size != 31) {
252 and_(src, opcode.GetBitfieldMask());
253 } else if (opcode.bf_size == 0) {
254 xor_(src, src);
255 }
256
257 if (opcode.bf_dst_bit != 0 && opcode.bf_dst_bit != 31) {
258 shl(src, opcode.bf_dst_bit);
259 } else if (opcode.bf_dst_bit == 31) {
260 xor_(src, src);
261 }
262 Compile_ProcessResult(opcode.result_operation, opcode.dst);
263}
264
265void MacroJITx64Impl::Compile_ExtractShiftLeftRegister(Macro::Opcode opcode) {
266 auto dst = Compile_GetRegister(opcode.src_a, eax);
267 auto src = Compile_GetRegister(opcode.src_b, RESULT);
268
269 if (opcode.bf_src_bit != 0) {
270 shr(src, opcode.bf_src_bit);
271 }
272
273 if (opcode.bf_size != 31) {
274 and_(src, opcode.GetBitfieldMask());
275 }
276 shl(src, al);
277 Compile_ProcessResult(opcode.result_operation, opcode.dst);
278}
279
280static u32 Read(Engines::Maxwell3D* maxwell3d, u32 method) {
281 return maxwell3d->GetRegisterValue(method);
282}
283
284static void Send(Engines::Maxwell3D* maxwell3d, Macro::MethodAddress method_address, u32 value) {
285 maxwell3d->CallMethodFromMME(method_address.address, value);
286}
287
288void MacroJITx64Impl::Compile_Read(Macro::Opcode opcode) {
289 if (optimizer.zero_reg_skip && opcode.src_a == 0) {
290 if (opcode.immediate == 0) {
291 xor_(RESULT, RESULT);
292 } else {
293 mov(RESULT, opcode.immediate);
294 }
295 } else {
296 auto result = Compile_GetRegister(opcode.src_a, RESULT);
297 if (opcode.immediate > 1) { // > 1 so an immediate of 2 still emits an add (0 stays a no-op)
298 add(result, opcode.immediate);
299 } else if (opcode.immediate == 1) {
300 inc(result);
301 } else if (opcode.immediate < 0) {
302 sub(result, opcode.immediate * -1);
303 }
304 }
305 Common::X64::ABI_PushRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0);
306 mov(Common::X64::ABI_PARAM1, qword[STATE]);
307 mov(Common::X64::ABI_PARAM2, RESULT);
308 Common::X64::CallFarFunction(*this, &Read);
309 Common::X64::ABI_PopRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0);
310 mov(RESULT, Common::X64::ABI_RETURN.cvt32());
311 Compile_ProcessResult(opcode.result_operation, opcode.dst);
312}
313
314void Tegra::MacroJITx64Impl::Compile_Send(Xbyak::Reg32 value) {
315 Common::X64::ABI_PushRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0);
316 mov(Common::X64::ABI_PARAM1, qword[STATE]);
317 mov(Common::X64::ABI_PARAM2, METHOD_ADDRESS);
318 mov(Common::X64::ABI_PARAM3, value);
319 Common::X64::CallFarFunction(*this, &Send);
320 Common::X64::ABI_PopRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0);
321
322 Xbyak::Label dont_process{};
323 // Get increment
324 test(METHOD_ADDRESS, 0x3f000);
325 // If zero, method address doesn't update
326 je(dont_process);
327
328 mov(ecx, METHOD_ADDRESS);
329 and_(METHOD_ADDRESS, 0xfff);
330 shr(ecx, 12);
331 and_(ecx, 0x3f);
332 lea(eax, ptr[rcx + METHOD_ADDRESS_64]);
333 sal(ecx, 12);
334 or_(eax, ecx);
335
336 mov(METHOD_ADDRESS, eax);
337
338 L(dont_process);
339}
340
341void Tegra::MacroJITx64Impl::Compile_Branch(Macro::Opcode opcode) {
342 ASSERT_MSG(!is_delay_slot, "Executing a branch in a delay slot is not valid");
343 const s32 jump_address =
344 static_cast<s32>(pc) + static_cast<s32>(opcode.GetBranchTarget() / sizeof(s32));
345
346 Xbyak::Label end;
347 auto value = Compile_GetRegister(opcode.src_a, eax);
348 test(value, value);
349 if (optimizer.has_delayed_pc) {
350 switch (opcode.branch_condition) {
351 case Macro::BranchCondition::Zero:
352 jne(end, T_NEAR);
353 break;
354 case Macro::BranchCondition::NotZero:
355 je(end, T_NEAR);
356 break;
357 }
358
359 if (opcode.branch_annul) {
360 xor_(BRANCH_HOLDER, BRANCH_HOLDER);
361 jmp(labels[jump_address], T_NEAR);
362 } else {
363 Xbyak::Label handle_post_exit{};
364 Xbyak::Label skip{};
365 jmp(skip, T_NEAR);
366 if (opcode.is_exit) {
367 L(handle_post_exit);
368 // Execute 1 instruction
369 mov(BRANCH_HOLDER, end_of_code);
370 // Jump to next instruction to skip delay slot check
371 jmp(labels[jump_address], T_NEAR);
372 } else {
373 L(handle_post_exit);
374 xor_(BRANCH_HOLDER, BRANCH_HOLDER);
375 jmp(labels[jump_address], T_NEAR);
376 }
377 L(skip);
378 mov(BRANCH_HOLDER, handle_post_exit);
379 jmp(delay_skip[pc], T_NEAR);
380 }
381 } else {
382 switch (opcode.branch_condition) {
383 case Macro::BranchCondition::Zero:
384 je(labels[jump_address], T_NEAR);
385 break;
386 case Macro::BranchCondition::NotZero:
387 jne(labels[jump_address], T_NEAR);
388 break;
389 }
390 }
391
392 L(end);
393}
394
395void Tegra::MacroJITx64Impl::Optimizer_ScanFlags() {
396 optimizer.can_skip_carry = true;
397 optimizer.has_delayed_pc = false;
398 for (auto raw_op : code) {
399 Macro::Opcode op{};
400 op.raw = raw_op;
401
402 if (op.operation == Macro::Operation::ALU) {
403 // Scan for any ALU operations which actually use the carry flag; if none exist in
404 // the current code, we can skip emitting the carry-flag handling operations
405 if (op.alu_operation == Macro::ALUOperation::AddWithCarry ||
406 op.alu_operation == Macro::ALUOperation::SubtractWithBorrow) {
407 optimizer.can_skip_carry = false;
408 }
409 }
410
411 if (op.operation == Macro::Operation::Branch) {
412 if (!op.branch_annul) {
413 optimizer.has_delayed_pc = true;
414 }
415 }
416 }
417}
418
419void MacroJITx64Impl::Compile() {
420 MICROPROFILE_SCOPE(MacroJitCompile);
421 bool keep_executing = true;
422 labels.fill(Xbyak::Label());
423
424 Common::X64::ABI_PushRegistersAndAdjustStackGPS(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8);
425 // JIT state
426 mov(STATE, Common::X64::ABI_PARAM1);
427 mov(PARAMETERS, qword[Common::X64::ABI_PARAM1 +
428 static_cast<Xbyak::uint32>(offsetof(JITState, parameters))]);
429 mov(REGISTERS, Common::X64::ABI_PARAM1);
430 add(REGISTERS, static_cast<Xbyak::uint32>(offsetof(JITState, registers)));
431 xor_(RESULT, RESULT);
432 xor_(METHOD_ADDRESS, METHOD_ADDRESS);
433 xor_(NEXT_PARAMETER, NEXT_PARAMETER);
434 xor_(BRANCH_HOLDER, BRANCH_HOLDER);
435
436 mov(dword[REGISTERS + 4], Compile_FetchParameter());
437
438 // Track GetRegister calls on the zero register and treat them as no-ops
439 optimizer.zero_reg_skip = true;
440
441 // AddImmediate tends to be used as a NOP instruction; if we detect this we can
442 // skip the entire code path and not emit anything
443 optimizer.skip_dummy_addimmediate = true;
444
445 // SMO tends to emit a lot of unnecessary method moves; we can mitigate this by only
446 // emitting one if our register isn't "dirty"
447 optimizer.optimize_for_method_move = true;
448
449 // Check to see if we can skip emitting certain instructions
450 Optimizer_ScanFlags();
451
452 const u32 op_count = static_cast<u32>(code.size());
453 for (u32 i = 0; i < op_count; i++) {
454 if (i < op_count - 1) {
455 pc = i + 1;
456 next_opcode = GetOpCode();
457 } else {
458 next_opcode = {};
459 }
460 pc = i;
461 Compile_NextInstruction();
462 }
463
464 L(end_of_code);
465
466 Common::X64::ABI_PopRegistersAndAdjustStackGPS(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8);
467 ret();
468 ready();
469 program = getCode<ProgramType>();
470}
471
472bool MacroJITx64Impl::Compile_NextInstruction() {
473 const auto opcode = GetOpCode();
474 if (labels[pc].getAddress()) {
475 return false;
476 }
477
478 L(labels[pc]);
479
480 switch (opcode.operation) {
481 case Macro::Operation::ALU:
482 Compile_ALU(opcode);
483 break;
484 case Macro::Operation::AddImmediate:
485 Compile_AddImmediate(opcode);
486 break;
487 case Macro::Operation::ExtractInsert:
488 Compile_ExtractInsert(opcode);
489 break;
490 case Macro::Operation::ExtractShiftLeftImmediate:
491 Compile_ExtractShiftLeftImmediate(opcode);
492 break;
493 case Macro::Operation::ExtractShiftLeftRegister:
494 Compile_ExtractShiftLeftRegister(opcode);
495 break;
496 case Macro::Operation::Read:
497 Compile_Read(opcode);
498 break;
499 case Macro::Operation::Branch:
500 Compile_Branch(opcode);
501 break;
502 default:
503 UNIMPLEMENTED_MSG("Unimplemented opcode {}", opcode.operation.Value());
504 break;
505 }
506
507 if (optimizer.has_delayed_pc) {
508 if (opcode.is_exit) {
509 mov(rax, end_of_code);
510 test(BRANCH_HOLDER, BRANCH_HOLDER);
511 cmove(BRANCH_HOLDER, rax);
512 // Jump to next instruction to skip delay slot check
513 je(labels[pc + 1], T_NEAR);
514 } else {
515 // TODO(ogniK): Optimize delay slot branching
516 Xbyak::Label no_delay_slot{};
517 test(BRANCH_HOLDER, BRANCH_HOLDER);
518 je(no_delay_slot, T_NEAR);
519 mov(rax, BRANCH_HOLDER);
520 xor_(BRANCH_HOLDER, BRANCH_HOLDER);
521 jmp(rax);
522 L(no_delay_slot);
523 }
524 L(delay_skip[pc]);
525 if (opcode.is_exit) {
526 return false;
527 }
528 } else {
529 test(BRANCH_HOLDER, BRANCH_HOLDER);
530 jne(end_of_code, T_NEAR);
531 if (opcode.is_exit) {
532 inc(BRANCH_HOLDER);
533 return false;
534 }
535 }
536 return true;
537}
538
539Xbyak::Reg32 Tegra::MacroJITx64Impl::Compile_FetchParameter() {
540 mov(eax, dword[PARAMETERS + NEXT_PARAMETER * sizeof(u32)]);
541 inc(NEXT_PARAMETER);
542 return eax;
543}
544
545Xbyak::Reg32 MacroJITx64Impl::Compile_GetRegister(u32 index, Xbyak::Reg32 dst) {
546 if (index == 0) {
547 // Register 0 is always zero
548 xor_(dst, dst);
549 } else {
550 mov(dst, dword[REGISTERS + index * sizeof(u32)]);
551 }
552
553 return dst;
554}
555
556Xbyak::Reg64 Tegra::MacroJITx64Impl::Compile_GetRegister(u32 index, Xbyak::Reg64 dst) {
557 if (index == 0) {
558 // Register 0 is always zero
559 xor_(dst, dst);
560 } else {
561 mov(dst, dword[REGISTERS + index * sizeof(u32)]);
562 }
563
564 return dst;
565}
566
567void Tegra::MacroJITx64Impl::Compile_WriteCarry(Xbyak::Reg64 dst) {
568 // ecx becomes the carry value: 1 when the upper 32 bits of dst are non-zero
569 xor_(ecx, ecx);
570 shr(dst, 32);
571 setne(cl);
572 mov(dword[STATE + offsetof(JITState, carry_flag)], ecx);
573}
574
575void MacroJITx64Impl::Compile_ProcessResult(Macro::ResultOperation operation, u32 reg) {
576 auto SetRegister = [=](u32 reg, Xbyak::Reg32 result) {
577 // Register 0 is supposed to always return 0. NOP is implemented as a store to the zero
578 // register.
579 if (reg == 0) {
580 return;
581 }
582 mov(dword[REGISTERS + reg * sizeof(u32)], result);
583 };
584 auto SetMethodAddress = [=](Xbyak::Reg32 reg) { mov(METHOD_ADDRESS, reg); };
585
586 switch (operation) {
587 case Macro::ResultOperation::IgnoreAndFetch:
588 SetRegister(reg, Compile_FetchParameter());
589 break;
590 case Macro::ResultOperation::Move:
591 SetRegister(reg, RESULT);
592 break;
593 case Macro::ResultOperation::MoveAndSetMethod:
594 SetRegister(reg, RESULT);
595 SetMethodAddress(RESULT);
596 break;
597 case Macro::ResultOperation::FetchAndSend:
598 // Fetch parameter and send result.
599 SetRegister(reg, Compile_FetchParameter());
600 Compile_Send(RESULT);
601 break;
602 case Macro::ResultOperation::MoveAndSend:
603 // Move and send result.
604 SetRegister(reg, RESULT);
605 Compile_Send(RESULT);
606 break;
607 case Macro::ResultOperation::FetchAndSetMethod:
608 // Fetch parameter and use result as Method Address.
609 SetRegister(reg, Compile_FetchParameter());
610 SetMethodAddress(RESULT);
611 break;
612 case Macro::ResultOperation::MoveAndSetMethodFetchAndSend:
613 // Move result and use as Method Address, then fetch and send parameter.
614 SetRegister(reg, RESULT);
615 SetMethodAddress(RESULT);
616 Compile_Send(Compile_FetchParameter());
617 break;
618 case Macro::ResultOperation::MoveAndSetMethodSend:
619 // Move result and use as Method Address, then send bits 12:17 of result.
620 SetRegister(reg, RESULT);
621 SetMethodAddress(RESULT);
622 shr(RESULT, 12);
623 and_(RESULT, 0b111111);
624 Compile_Send(RESULT);
625 break;
626 default:
627 UNIMPLEMENTED_MSG("Unimplemented macro operation {}", static_cast<std::size_t>(operation));
628 }
629}
630
631Macro::Opcode MacroJITx64Impl::GetOpCode() const {
632 ASSERT(pc < code.size());
633 return {code[pc]};
634}
635
636std::bitset<32> MacroJITx64Impl::PersistentCallerSavedRegs() const {
637 return PERSISTENT_REGISTERS & Common::X64::ABI_ALL_CALLER_SAVED;
638}
639
640} // namespace Tegra
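
The increment handling emitted by Compile_Send is easier to follow as straight C++. A stand-alone sketch of what the generated mov/and/shr/lea/sal/or sequence computes on the packed method address (the function name advance is illustrative):

    #include <cstdint>
    #include <cstdio>

    // Packed method address: bits 0-11 hold the address, bits 12-17 the increment.
    uint32_t advance(uint32_t method_address) {
        const uint32_t increment = (method_address >> 12) & 0x3f;
        if (increment == 0) {
            return method_address; // the je dont_process path: nothing updates
        }
        const uint32_t address = method_address & 0xfff;
        // Note this ORs (not field-inserts) the increment back in, exactly as the
        // emitted lea/sal/or does; an address overflowing 12 bits would collide.
        return (address + increment) | (increment << 12);
    }

    int main() {
        // increment=2, address=0x100: the next Send targets 0x102, increment preserved.
        std::printf("0x%x\n", advance((2u << 12) | 0x100));
    }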
diff --git a/src/video_core/macro/macro_jit_x64.h b/src/video_core/macro/macro_jit_x64.h
new file mode 100644
index 000000000..71f738b9a
--- /dev/null
+++ b/src/video_core/macro/macro_jit_x64.h
@@ -0,0 +1,100 @@
1// Copyright 2020 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#pragma once
6
7#include <array>
8#include <bitset>
9#include <xbyak.h>
10#include "common/bit_field.h"
11#include "common/common_types.h"
12#include "common/x64/xbyak_abi.h"
13#include "video_core/macro/macro.h"
14
15namespace Tegra {
16
17namespace Engines {
18class Maxwell3D;
19}
20
21/// MAX_CODE_SIZE is arbitrarily chosen, based on the macro sizes seen in currently booting games
22constexpr size_t MAX_CODE_SIZE = 0x10000;
23
24class MacroJITx64 final : public MacroEngine {
25public:
26 explicit MacroJITx64(Engines::Maxwell3D& maxwell3d);
27
28protected:
29 std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) override;
30
31private:
32 Engines::Maxwell3D& maxwell3d;
33};
34
35class MacroJITx64Impl : public Xbyak::CodeGenerator, public CachedMacro {
36public:
37 MacroJITx64Impl(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& code);
38 ~MacroJITx64Impl();
39
40 void Execute(const std::vector<u32>& parameters, u32 method) override;
41
42 void Compile_ALU(Macro::Opcode opcode);
43 void Compile_AddImmediate(Macro::Opcode opcode);
44 void Compile_ExtractInsert(Macro::Opcode opcode);
45 void Compile_ExtractShiftLeftImmediate(Macro::Opcode opcode);
46 void Compile_ExtractShiftLeftRegister(Macro::Opcode opcode);
47 void Compile_Read(Macro::Opcode opcode);
48 void Compile_Branch(Macro::Opcode opcode);
49
50private:
51 void Optimizer_ScanFlags();
52
53 void Compile();
54 bool Compile_NextInstruction();
55
56 Xbyak::Reg32 Compile_FetchParameter();
57 Xbyak::Reg32 Compile_GetRegister(u32 index, Xbyak::Reg32 dst);
58 Xbyak::Reg64 Compile_GetRegister(u32 index, Xbyak::Reg64 dst);
59 void Compile_WriteCarry(Xbyak::Reg64 dst);
60
61 void Compile_ProcessResult(Macro::ResultOperation operation, u32 reg);
62 void Compile_Send(Xbyak::Reg32 value);
63
64 Macro::Opcode GetOpCode() const;
65 std::bitset<32> PersistentCallerSavedRegs() const;
66
67 struct JITState {
68 Engines::Maxwell3D* maxwell3d{};
69 std::array<u32, Macro::NUM_MACRO_REGISTERS> registers{};
70 const u32* parameters{};
71 u32 carry_flag{};
72 };
73 static_assert(offsetof(JITState, maxwell3d) == 0, "Maxwell3D is not at 0x0");
74 using ProgramType = void (*)(JITState*);
75
76 struct OptimizerState {
77 bool can_skip_carry{};
78 bool has_delayed_pc{};
79 bool zero_reg_skip{};
80 bool skip_dummy_addimmediate{};
81 bool optimize_for_method_move{};
82 };
83 OptimizerState optimizer{};
84
85 std::optional<Macro::Opcode> next_opcode{};
86 ProgramType program{nullptr};
87
88 std::array<Xbyak::Label, MAX_CODE_SIZE> labels;
89 std::array<Xbyak::Label, MAX_CODE_SIZE> delay_skip;
90 Xbyak::Label end_of_code{};
91
92 bool is_delay_slot{};
93 u32 pc{};
94 std::optional<u32> delayed_pc;
95
96 const std::vector<u32>& code;
97 Engines::Maxwell3D& maxwell3d;
98};
99
100} // namespace Tegra
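
The static_assert on offsetof(JITState, maxwell3d) exists because the generated code loads the engine pointer with a zero-displacement read (mov ABI_PARAM1, qword[STATE] in Compile_Read and Compile_Send). A minimal stand-alone illustration of the invariant, with made-up type names:

    #include <cstddef>

    struct Engine; // stand-in for Engines::Maxwell3D

    struct State {
        Engine* engine{};         // must stay first: the JIT reads it as *(Engine**)state
        unsigned registers[8]{};
    };

    // Mirrors the static_assert in the header; reordering members would break the JIT.
    static_assert(offsetof(State, engine) == 0, "engine is not at 0x0");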
diff --git a/src/video_core/rasterizer_cache.cpp b/src/video_core/rasterizer_cache.cpp
deleted file mode 100644
index 093b2cdf4..000000000
--- a/src/video_core/rasterizer_cache.cpp
+++ /dev/null
@@ -1,7 +0,0 @@
1// Copyright 2018 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#include "video_core/rasterizer_cache.h"
6
7RasterizerCacheObject::~RasterizerCacheObject() = default;
diff --git a/src/video_core/rasterizer_cache.h b/src/video_core/rasterizer_cache.h
deleted file mode 100644
index 22987751e..000000000
--- a/src/video_core/rasterizer_cache.h
+++ /dev/null
@@ -1,197 +0,0 @@
1// Copyright 2018 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#pragma once
6
7#include <mutex>
8#include <set>
9#include <unordered_map>
10
11#include <boost/icl/interval_map.hpp>
12#include <boost/range/iterator_range_core.hpp>
13
14#include "common/common_types.h"
15#include "core/settings.h"
16#include "video_core/gpu.h"
17#include "video_core/rasterizer_interface.h"
18
19class RasterizerCacheObject {
20public:
21 explicit RasterizerCacheObject(const VAddr cpu_addr) : cpu_addr{cpu_addr} {}
22
23 virtual ~RasterizerCacheObject();
24
25 VAddr GetCpuAddr() const {
26 return cpu_addr;
27 }
28
29 /// Gets the size of the shader in guest memory, required for cache management
30 virtual std::size_t GetSizeInBytes() const = 0;
31
32 /// Sets whether the cached object should be considered registered
33 void SetIsRegistered(bool registered) {
34 is_registered = registered;
35 }
36
37 /// Returns true if the cached object is registered
38 bool IsRegistered() const {
39 return is_registered;
40 }
41
42 /// Returns true if the cached object is dirty
43 bool IsDirty() const {
44 return is_dirty;
45 }
46
47 /// Returns ticks from when this cached object was last modified
48 u64 GetLastModifiedTicks() const {
49 return last_modified_ticks;
50 }
51
52 /// Marks an object as recently modified, used to specify whether it is clean or dirty
53 template <class T>
54 void MarkAsModified(bool dirty, T& cache) {
55 is_dirty = dirty;
56 last_modified_ticks = cache.GetModifiedTicks();
57 }
58
59private:
60 bool is_registered{}; ///< Whether the object is currently registered with the cache
61 bool is_dirty{}; ///< Whether the object is dirty (out of sync with guest memory)
62 u64 last_modified_ticks{}; ///< When the object was last modified, used for in-order flushing
63 VAddr cpu_addr{}; ///< Cpu address memory, unique from emulated virtual address space
64};
65
66template <class T>
67class RasterizerCache : NonCopyable {
68 friend class RasterizerCacheObject;
69
70public:
71 explicit RasterizerCache(VideoCore::RasterizerInterface& rasterizer) : rasterizer{rasterizer} {}
72
73 /// Write any cached resources overlapping the specified region back to memory
74 void FlushRegion(VAddr addr, std::size_t size) {
75 std::lock_guard lock{mutex};
76
77 const auto& objects{GetSortedObjectsFromRegion(addr, size)};
78 for (auto& object : objects) {
79 FlushObject(object);
80 }
81 }
82
83 /// Mark the specified region as being invalidated
84 void InvalidateRegion(VAddr addr, u64 size) {
85 std::lock_guard lock{mutex};
86
87 const auto& objects{GetSortedObjectsFromRegion(addr, size)};
88 for (auto& object : objects) {
89 if (!object->IsRegistered()) {
90 // Skip duplicates
91 continue;
92 }
93 Unregister(object);
94 }
95 }
96
97 /// Invalidates everything in the cache
98 void InvalidateAll() {
99 std::lock_guard lock{mutex};
100
101 while (interval_cache.begin() != interval_cache.end()) {
102 Unregister(*interval_cache.begin()->second.begin());
103 }
104 }
105
106protected:
107 /// Tries to get an object from the cache with the specified cache address
108 T TryGet(VAddr addr) const {
109 const auto iter = map_cache.find(addr);
110 if (iter != map_cache.end())
111 return iter->second;
112 return nullptr;
113 }
114
115 /// Register an object into the cache
116 virtual void Register(const T& object) {
117 std::lock_guard lock{mutex};
118
119 object->SetIsRegistered(true);
120 interval_cache.add({GetInterval(object), ObjectSet{object}});
121 map_cache.insert({object->GetCpuAddr(), object});
122 rasterizer.UpdatePagesCachedCount(object->GetCpuAddr(), object->GetSizeInBytes(), 1);
123 }
124
125 /// Unregisters an object from the cache
126 virtual void Unregister(const T& object) {
127 std::lock_guard lock{mutex};
128
129 object->SetIsRegistered(false);
130 rasterizer.UpdatePagesCachedCount(object->GetCpuAddr(), object->GetSizeInBytes(), -1);
131 const VAddr addr = object->GetCpuAddr();
132 interval_cache.subtract({GetInterval(object), ObjectSet{object}});
133 map_cache.erase(addr);
134 }
135
136 /// Returns a ticks counter used for tracking when cached objects were last modified
137 u64 GetModifiedTicks() {
138 std::lock_guard lock{mutex};
139
140 return ++modified_ticks;
141 }
142
143 virtual void FlushObjectInner(const T& object) = 0;
144
145 /// Flushes the specified object, updating appropriate cache state as needed
146 void FlushObject(const T& object) {
147 std::lock_guard lock{mutex};
148
149 if (!object->IsDirty()) {
150 return;
151 }
152 FlushObjectInner(object);
153 object->MarkAsModified(false, *this);
154 }
155
156 std::recursive_mutex mutex;
157
158private:
159 /// Returns a list of cached objects from the specified memory region, ordered by access time
160 std::vector<T> GetSortedObjectsFromRegion(VAddr addr, u64 size) {
161 if (size == 0) {
162 return {};
163 }
164
165 std::vector<T> objects;
166 const ObjectInterval interval{addr, addr + size};
167 for (auto& pair : boost::make_iterator_range(interval_cache.equal_range(interval))) {
168 for (auto& cached_object : pair.second) {
169 if (!cached_object) {
170 continue;
171 }
172 objects.push_back(cached_object);
173 }
174 }
175
176 std::sort(objects.begin(), objects.end(), [](const T& a, const T& b) -> bool {
177 return a->GetLastModifiedTicks() < b->GetLastModifiedTicks();
178 });
179
180 return objects;
181 }
182
183 using ObjectSet = std::set<T>;
184 using ObjectCache = std::unordered_map<VAddr, T>;
185 using IntervalCache = boost::icl::interval_map<VAddr, ObjectSet>;
186 using ObjectInterval = typename IntervalCache::interval_type;
187
188 static auto GetInterval(const T& object) {
189 return ObjectInterval::right_open(object->GetCpuAddr(),
190 object->GetCpuAddr() + object->GetSizeInBytes());
191 }
192
193 ObjectCache map_cache;
194 IntervalCache interval_cache; ///< Cache of objects
195 u64 modified_ticks{}; ///< Counter of cache state ticks, used for in-order flushing
196 VideoCore::RasterizerInterface& rasterizer;
197};
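
The interval_cache in the file removed above is a boost::icl::interval_map from address ranges to sets of objects; FlushRegion and InvalidateRegion walk every set overlapping the queried range. A stand-alone sketch of that lookup pattern, with toy int ids in place of cached objects:

    #include <iostream>
    #include <set>
    #include <boost/icl/interval_map.hpp>
    #include <boost/range/iterator_range_core.hpp>

    int main() {
        using Cache = boost::icl::interval_map<unsigned, std::set<int>>;
        Cache cache;
        // Register two overlapping "objects" over their address ranges.
        cache.add({Cache::interval_type::right_open(0x1000, 0x2000), {1}});
        cache.add({Cache::interval_type::right_open(0x1800, 0x2800), {2}});
        // Query a range inside the overlap: both objects are visited.
        const auto query = Cache::interval_type::right_open(0x1900, 0x1a00);
        for (auto& pair : boost::make_iterator_range(cache.equal_range(query))) {
            for (const int id : pair.second) {
                std::cout << id << '\n'; // prints 1 then 2
            }
        }
    }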
diff --git a/src/video_core/renderer_opengl/gl_arb_decompiler.cpp b/src/video_core/renderer_opengl/gl_arb_decompiler.cpp
new file mode 100644
index 000000000..1e96b0310
--- /dev/null
+++ b/src/video_core/renderer_opengl/gl_arb_decompiler.cpp
@@ -0,0 +1,2074 @@
1// Copyright 2020 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#include <algorithm>
6#include <array>
7#include <cstddef>
8#include <string>
9#include <string_view>
10#include <utility>
11#include <variant>
12
13#include <fmt/format.h>
14
15#include "common/alignment.h"
16#include "common/assert.h"
17#include "common/common_types.h"
18#include "video_core/renderer_opengl/gl_arb_decompiler.h"
19#include "video_core/renderer_opengl/gl_device.h"
20#include "video_core/shader/registry.h"
21#include "video_core/shader/shader_ir.h"
22
23// Predicates in the decompiled code follow the convention that -1 means true and 0 means false.
24// GLASM lacks booleans, so they have to be implemented as integers.
25// Using -1 for true is useful because both CMP.S and NOT.U can negate it, and CMP.S can be used to
26// select between two values, because -1 will be evaluated as true and 0 as false.
27
28namespace OpenGL {
29
30namespace {
31
32using Tegra::Engines::ShaderType;
33using Tegra::Shader::Attribute;
34using Tegra::Shader::PixelImap;
35using Tegra::Shader::Register;
36using namespace VideoCommon::Shader;
37using Operation = const OperationNode&;
38
39constexpr std::array INTERNAL_FLAG_NAMES = {"ZERO", "SIGN", "CARRY", "OVERFLOW"};
40
41char Swizzle(std::size_t component) {
42 ASSERT(component < 4);
43 return component["xyzw"]; // index-into-literal trick, equivalent to "xyzw"[component]
44}
45
46constexpr bool IsGenericAttribute(Attribute::Index index) {
47 return index >= Attribute::Index::Attribute_0 && index <= Attribute::Index::Attribute_31;
48}
49
50u32 GetGenericAttributeIndex(Attribute::Index index) {
51 ASSERT(IsGenericAttribute(index));
52 return static_cast<u32>(index) - static_cast<u32>(Attribute::Index::Attribute_0);
53}
54
55std::string_view Modifiers(Operation operation) {
56 const auto meta = std::get_if<MetaArithmetic>(&operation.GetMeta());
57 if (meta && meta->precise) {
58 return ".PREC";
59 }
60 return "";
61}
62
63std::string_view GetInputFlags(PixelImap attribute) {
64 switch (attribute) {
65 case PixelImap::Perspective:
66 return "";
67 case PixelImap::Constant:
68 return "FLAT ";
69 case PixelImap::ScreenLinear:
70 return "NOPERSPECTIVE ";
71 case PixelImap::Unused:
72 break;
73 }
74 UNIMPLEMENTED_MSG("Unknown attribute usage index={}", static_cast<int>(attribute));
75 return {};
76}
77
78std::string_view ImageType(Tegra::Shader::ImageType image_type) {
79 switch (image_type) {
80 case Tegra::Shader::ImageType::Texture1D:
81 return "1D";
82 case Tegra::Shader::ImageType::TextureBuffer:
83 return "BUFFER";
84 case Tegra::Shader::ImageType::Texture1DArray:
85 return "ARRAY1D";
86 case Tegra::Shader::ImageType::Texture2D:
87 return "2D";
88 case Tegra::Shader::ImageType::Texture2DArray:
89 return "ARRAY2D";
90 case Tegra::Shader::ImageType::Texture3D:
91 return "3D";
92 }
93 UNREACHABLE();
94 return {};
95}
96
97std::string_view StackName(MetaStackClass stack) {
98 switch (stack) {
99 case MetaStackClass::Ssy:
100 return "SSY";
101 case MetaStackClass::Pbk:
102 return "PBK";
103 }
104 UNREACHABLE();
105 return "";
106}
107
108std::string_view PrimitiveDescription(Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology topology) {
109 switch (topology) {
110 case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::Points:
111 return "POINTS";
112 case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::Lines:
113 case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::LineStrip:
114 return "LINES";
115 case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::LinesAdjacency:
116 case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::LineStripAdjacency:
117 return "LINES_ADJACENCY";
118 case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::Triangles:
119 case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::TriangleStrip:
120 case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::TriangleFan:
121 return "TRIANGLES";
122 case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::TrianglesAdjacency:
123 case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::TriangleStripAdjacency:
124 return "TRIANGLES_ADJACENCY";
125 default:
126 UNIMPLEMENTED_MSG("topology={}", static_cast<int>(topology));
127 return "POINTS";
128 }
129}
130
131std::string_view TopologyName(Tegra::Shader::OutputTopology topology) {
132 switch (topology) {
133 case Tegra::Shader::OutputTopology::PointList:
134 return "POINTS";
135 case Tegra::Shader::OutputTopology::LineStrip:
136 return "LINE_STRIP";
137 case Tegra::Shader::OutputTopology::TriangleStrip:
138 return "TRIANGLE_STRIP";
139 default:
140 UNIMPLEMENTED_MSG("Unknown output topology: {}", static_cast<u32>(topology));
141 return "points";
142 }
143}
144
145std::string_view StageInputName(ShaderType stage) {
146 switch (stage) {
147 case ShaderType::Vertex:
148 case ShaderType::Geometry:
149 return "vertex";
150 case ShaderType::Fragment:
151 return "fragment";
152 case ShaderType::Compute:
153 return "invocation";
154 default:
155 UNREACHABLE();
156 return "";
157 }
158}
159
160std::string TextureType(const MetaTexture& meta) {
161 if (meta.sampler.is_buffer) {
162 return "BUFFER";
163 }
164 std::string type;
165 if (meta.sampler.is_shadow) {
166 type += "SHADOW";
167 }
168 if (meta.sampler.is_array) {
169 type += "ARRAY";
170 }
171 type += [&meta] {
172 switch (meta.sampler.type) {
173 case Tegra::Shader::TextureType::Texture1D:
174 return "1D";
175 case Tegra::Shader::TextureType::Texture2D:
176 return "2D";
177 case Tegra::Shader::TextureType::Texture3D:
178 return "3D";
179 case Tegra::Shader::TextureType::TextureCube:
180 return "CUBE";
181 }
182 UNREACHABLE();
183 return "2D";
184 }();
185 return type;
186}
187
188std::string GlobalMemoryName(const GlobalMemoryBase& base) {
189 return fmt::format("gmem{}_{}", base.cbuf_index, base.cbuf_offset);
190}
191
192class ARBDecompiler final {
193public:
194 explicit ARBDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry,
195 ShaderType stage, std::string_view identifier);
196
197 std::string Code() const {
198 return shader_source;
199 }
200
201private:
202 void DeclareHeader();
203 void DeclareVertex();
204 void DeclareGeometry();
205 void DeclareFragment();
206 void DeclareCompute();
207 void DeclareInputAttributes();
208 void DeclareOutputAttributes();
209 void DeclareLocalMemory();
210 void DeclareGlobalMemory();
211 void DeclareConstantBuffers();
212 void DeclareRegisters();
213 void DeclareTemporaries();
214 void DeclarePredicates();
215 void DeclareInternalFlags();
216
217 void InitializeVariables();
218
219 void DecompileAST();
220 void DecompileBranchMode();
221
222 void VisitAST(const ASTNode& node);
223 std::string VisitExpression(const Expr& node);
224
225 void VisitBlock(const NodeBlock& bb);
226
227 std::string Visit(const Node& node);
228
229 std::pair<std::string, std::size_t> BuildCoords(Operation);
230 std::string BuildAoffi(Operation);
231 void Exit();
232
233 std::string Assign(Operation);
234 std::string Select(Operation);
235 std::string FClamp(Operation);
236 std::string FCastHalf0(Operation);
237 std::string FCastHalf1(Operation);
238 std::string FSqrt(Operation);
239 std::string FSwizzleAdd(Operation);
240 std::string HAdd2(Operation);
241 std::string HMul2(Operation);
242 std::string HFma2(Operation);
243 std::string HAbsolute(Operation);
244 std::string HNegate(Operation);
245 std::string HClamp(Operation);
246 std::string HCastFloat(Operation);
247 std::string HUnpack(Operation);
248 std::string HMergeF32(Operation);
249 std::string HMergeH0(Operation);
250 std::string HMergeH1(Operation);
251 std::string HPack2(Operation);
252 std::string LogicalAssign(Operation);
253 std::string LogicalPick2(Operation);
254 std::string LogicalAnd2(Operation);
255 std::string FloatOrdered(Operation);
256 std::string FloatUnordered(Operation);
257 std::string LogicalAddCarry(Operation);
258 std::string Texture(Operation);
259 std::string TextureGather(Operation);
260 std::string TextureQueryDimensions(Operation);
261 std::string TextureQueryLod(Operation);
262 std::string TexelFetch(Operation);
263 std::string TextureGradient(Operation);
264 std::string ImageLoad(Operation);
265 std::string ImageStore(Operation);
266 std::string Branch(Operation);
267 std::string BranchIndirect(Operation);
268 std::string PushFlowStack(Operation);
269 std::string PopFlowStack(Operation);
270 std::string Exit(Operation);
271 std::string Discard(Operation);
272 std::string EmitVertex(Operation);
273 std::string EndPrimitive(Operation);
274 std::string InvocationId(Operation);
275 std::string YNegate(Operation);
276 std::string ThreadId(Operation);
277 std::string ShuffleIndexed(Operation);
278 std::string Barrier(Operation);
279 std::string MemoryBarrierGroup(Operation);
280 std::string MemoryBarrierGlobal(Operation);
281
282 template <const std::string_view& op>
283 std::string Unary(Operation operation) {
284 const std::string temporary = AllocTemporary();
285 AddLine("{}{} {}, {};", op, Modifiers(operation), temporary, Visit(operation[0]));
286 return temporary;
287 }
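    // Illustrative emission, assuming the operand visits to "R0.x" and T0 is the next free
    // temporary: Unary<COS_F32> appends "COS.F32 T0.x, R0.x;" and returns "T0.x".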
288
289 template <const std::string_view& op>
290 std::string Binary(Operation operation) {
291 const std::string temporary = AllocTemporary();
292 AddLine("{}{} {}, {}, {};", op, Modifiers(operation), temporary, Visit(operation[0]),
293 Visit(operation[1]));
294 return temporary;
295 }
296
297 template <const std::string_view& op>
298 std::string Trinary(Operation operation) {
299 const std::string temporary = AllocTemporary();
300 AddLine("{}{} {}, {}, {}, {};", op, Modifiers(operation), temporary, Visit(operation[0]),
301 Visit(operation[1]), Visit(operation[2]));
302 return temporary;
303 }
304
305 template <const std::string_view& op, bool unordered>
306 std::string FloatComparison(Operation operation) {
307 const std::string temporary = AllocTemporary();
308 AddLine("TRUNC.U.CC RC.x, {};", Binary<op>(operation));
309 AddLine("MOV.S {}, 0;", temporary);
310 AddLine("MOV.S {} (NE.x), -1;", temporary);
311
312 const std::string op_a = Visit(operation[0]);
313 const std::string op_b = Visit(operation[1]);
314 if constexpr (unordered) {
315 AddLine("SNE.F RC.x, {}, {};", op_a, op_a);
316 AddLine("TRUNC.U.CC RC.x, RC.x;");
317 AddLine("MOV.S {} (NE.x), -1;", temporary);
318 AddLine("SNE.F RC.x, {}, {};", op_b, op_b);
319 AddLine("TRUNC.U.CC RC.x, RC.x;");
320 AddLine("MOV.S {} (NE.x), -1;", temporary);
321 } else if (op == SNE_F) {
322 AddLine("SNE.F RC.x, {}, {};", op_a, op_a);
323 AddLine("TRUNC.U.CC RC.x, RC.x;");
324 AddLine("MOV.S {} (NE.x), 0;", temporary);
325 AddLine("SNE.F RC.x, {}, {};", op_b, op_b);
326 AddLine("TRUNC.U.CC RC.x, RC.x;");
327 AddLine("MOV.S {} (NE.x), 0;", temporary);
328 }
329 return temporary;
330 }
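    // The unordered variants force the result to true when either operand is NaN, exploiting
    // the identity that only NaN satisfies x != x (SNE.F x, x is nonzero exactly for NaN). The
    // ordered SNE case conversely forces the result back to false for NaN operands, since SNE
    // natively reports NaN operands as unequal.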
331
332 template <const std::string_view& op, bool is_nan>
333 std::string HalfComparison(Operation operation) {
334 const std::string tmp1 = AllocVectorTemporary();
335 const std::string tmp2 = AllocVectorTemporary();
336 const std::string op_a = Visit(operation[0]);
337 const std::string op_b = Visit(operation[1]);
338 AddLine("UP2H.F {}, {};", tmp1, op_a);
339 AddLine("UP2H.F {}, {};", tmp2, op_b);
340 AddLine("{} {}, {}, {};", op, tmp1, tmp1, tmp2);
341 AddLine("TRUNC.U.CC RC.xy, {};", tmp1);
342 AddLine("MOV.S {}.xy, {{0, 0, 0, 0}};", tmp1);
343 AddLine("MOV.S {}.x (NE.x), -1;", tmp1);
344 AddLine("MOV.S {}.y (NE.y), -1;", tmp1);
345 if constexpr (is_nan) {
346 AddLine("MOVC.F RC.x, {};", op_a);
347 AddLine("MOV.S {}.x (NAN.x), -1;", tmp1);
348 AddLine("MOVC.F RC.x, {};", op_b);
349 AddLine("MOV.S {}.y (NAN.x), -1;", tmp1);
350 }
351 return tmp1;
352 }
353
354 template <const std::string_view& op, const std::string_view& type>
355 std::string AtomicImage(Operation operation) {
356 const auto& meta = std::get<MetaImage>(operation.GetMeta());
357 const u32 image_id = device.GetBaseBindings(stage).image + meta.image.index;
358 const std::size_t num_coords = operation.GetOperandsCount();
359 const std::size_t num_values = meta.values.size();
360
361 const std::string coord = AllocVectorTemporary();
362 const std::string value = AllocVectorTemporary();
363 for (std::size_t i = 0; i < num_coords; ++i) {
364 AddLine("MOV.S {}.{}, {};", coord, Swizzle(i), Visit(operation[i]));
365 }
366 for (std::size_t i = 0; i < num_values; ++i) {
367 AddLine("MOV.F {}.{}, {};", value, Swizzle(i), Visit(meta.values[i]));
368 }
369
370 const std::string result = coord;
371 AddLine("ATOMIM.{}.{} {}.x, {}, {}, image[{}], {};", op, type, result, value, coord,
372 image_id, ImageType(meta.image.type));
373 return fmt::format("{}.x", result);
374 }
375
376 template <const std::string_view& op, const std::string_view& type>
377 std::string Atomic(Operation operation) {
378 const std::string temporary = AllocTemporary();
379 std::string address;
380 std::string_view opname;
381 if (const auto gmem = std::get_if<GmemNode>(&*operation[0])) {
382 AddLine("SUB.U {}, {}, {};", temporary, Visit(gmem->GetRealAddress()),
383 Visit(gmem->GetBaseAddress()));
384 address = fmt::format("{}[{}]", GlobalMemoryName(gmem->GetDescriptor()), temporary);
385 opname = "ATOMB";
386 } else if (const auto smem = std::get_if<SmemNode>(&*operation[0])) {
387 address = fmt::format("shared_mem[{}]", Visit(smem->GetAddress()));
388 opname = "ATOMS";
389 } else {
390 UNREACHABLE();
391 return "{0, 0, 0, 0}";
392 }
393 AddLine("{}.{}.{} {}, {}, {};", opname, op, type, temporary, Visit(operation[1]), address);
394 return temporary;
395 }
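    // Illustrative emission for Atomic<ADD, U32> on shared memory, assuming the address visits
    // to "R0.x" and the value to "R1.x":
    //   ATOMS.ADD.U32 T0.x, R1.x, shared_mem[R0.x];
    // with the previous memory value returned in T0.x.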
396
397 template <char type>
398 std::string Negate(Operation operation) {
399 const std::string temporary = AllocTemporary();
400 if constexpr (type == 'F') {
401 AddLine("MOV.F32 {}, -{};", temporary, Visit(operation[0]));
402 } else {
403 AddLine("MOV.{} {}, -{};", type, temporary, Visit(operation[0]));
404 }
405 return temporary;
406 }
407
408 template <char type>
409 std::string Absolute(Operation operation) {
410 const std::string temporary = AllocTemporary();
411 AddLine("MOV.{} {}, |{}|;", type, temporary, Visit(operation[0]));
412 return temporary;
413 }
414
415 template <char type>
416 std::string BitfieldInsert(Operation operation) {
417 const std::string temporary = AllocVectorTemporary();
418 AddLine("MOV.{} {}.x, {};", type, temporary, Visit(operation[3]));
419 AddLine("MOV.{} {}.y, {};", type, temporary, Visit(operation[2]));
420 AddLine("BFI.{} {}.x, {}, {}, {};", type, temporary, temporary, Visit(operation[1]),
421 Visit(operation[0]));
422 return fmt::format("{}.x", temporary);
423 }
424
425 template <char type>
426 std::string BitfieldExtract(Operation operation) {
427 const std::string temporary = AllocVectorTemporary();
428 AddLine("MOV.{} {}.x, {};", type, temporary, Visit(operation[2]));
429 AddLine("MOV.{} {}.y, {};", type, temporary, Visit(operation[1]));
430 AddLine("BFE.{} {}.x, {}, {};", type, temporary, temporary, Visit(operation[0]));
431 return fmt::format("{}.x", temporary);
432 }
433
434 template <char swizzle>
435 std::string LocalInvocationId(Operation) {
436 return fmt::format("invocation.localid.{}", swizzle);
437 }
438
439 template <char swizzle>
440 std::string WorkGroupId(Operation) {
441 return fmt::format("invocation.groupid.{}", swizzle);
442 }
443
444 template <char c1, char c2>
445 std::string ThreadMask(Operation) {
446 return fmt::format("{}.thread{}{}mask", StageInputName(stage), c1, c2);
447 }
448
449 template <typename... Args>
450 void AddExpression(std::string_view text, Args&&... args) {
451 shader_source += fmt::format(text, std::forward<Args>(args)...);
452 }
453
454 template <typename... Args>
455 void AddLine(std::string_view text, Args&&... args) {
456 AddExpression(text, std::forward<Args>(args)...);
457 shader_source += '\n';
458 }
459
460 std::string AllocTemporary() {
461 max_temporaries = std::max(max_temporaries, num_temporaries + 1);
462 return fmt::format("T{}.x", num_temporaries++);
463 }
464
465 std::string AllocVectorTemporary() {
466 max_temporaries = std::max(max_temporaries, num_temporaries + 1);
467 return fmt::format("T{}", num_temporaries++);
468 }
469
470 void ResetTemporaries() noexcept {
471 num_temporaries = 0;
472 }
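    // Temporaries are handed out linearly within a statement and recycled with
    // ResetTemporaries(); max_temporaries records the high-water mark so that
    // DeclareTemporaries() can later emit a matching "TEMP T{i};" for every slot used.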
473
474 const Device& device;
475 const ShaderIR& ir;
476 const Registry& registry;
477 const ShaderType stage;
478
479 std::size_t num_temporaries = 0;
480 std::size_t max_temporaries = 0;
481
482 std::string shader_source;
483
484 static constexpr std::string_view ADD_F32 = "ADD.F32";
485 static constexpr std::string_view ADD_S = "ADD.S";
486 static constexpr std::string_view ADD_U = "ADD.U";
487 static constexpr std::string_view MUL_F32 = "MUL.F32";
488 static constexpr std::string_view MUL_S = "MUL.S";
489 static constexpr std::string_view MUL_U = "MUL.U";
490 static constexpr std::string_view DIV_F32 = "DIV.F32";
491 static constexpr std::string_view DIV_S = "DIV.S";
492 static constexpr std::string_view DIV_U = "DIV.U";
493 static constexpr std::string_view MAD_F32 = "MAD.F32";
494 static constexpr std::string_view RSQ_F32 = "RSQ.F32";
495 static constexpr std::string_view COS_F32 = "COS.F32";
496 static constexpr std::string_view SIN_F32 = "SIN.F32";
497 static constexpr std::string_view EX2_F32 = "EX2.F32";
498 static constexpr std::string_view LG2_F32 = "LG2.F32";
499 static constexpr std::string_view SLT_F = "SLT.F32";
500 static constexpr std::string_view SLT_S = "SLT.S";
501 static constexpr std::string_view SLT_U = "SLT.U";
502 static constexpr std::string_view SEQ_F = "SEQ.F32";
503 static constexpr std::string_view SEQ_S = "SEQ.S";
504 static constexpr std::string_view SEQ_U = "SEQ.U";
505 static constexpr std::string_view SLE_F = "SLE.F32";
506 static constexpr std::string_view SLE_S = "SLE.S";
507 static constexpr std::string_view SLE_U = "SLE.U";
508 static constexpr std::string_view SGT_F = "SGT.F32";
509 static constexpr std::string_view SGT_S = "SGT.S";
510 static constexpr std::string_view SGT_U = "SGT.U";
511 static constexpr std::string_view SNE_F = "SNE.F32";
512 static constexpr std::string_view SNE_S = "SNE.S";
513 static constexpr std::string_view SNE_U = "SNE.U";
514 static constexpr std::string_view SGE_F = "SGE.F32";
515 static constexpr std::string_view SGE_S = "SGE.S";
516 static constexpr std::string_view SGE_U = "SGE.U";
517 static constexpr std::string_view AND_S = "AND.S";
518 static constexpr std::string_view AND_U = "AND.U";
519 static constexpr std::string_view TRUNC_F = "TRUNC.F";
520 static constexpr std::string_view TRUNC_S = "TRUNC.S";
521 static constexpr std::string_view TRUNC_U = "TRUNC.U";
522 static constexpr std::string_view SHL_S = "SHL.S";
523 static constexpr std::string_view SHL_U = "SHL.U";
524 static constexpr std::string_view SHR_S = "SHR.S";
525 static constexpr std::string_view SHR_U = "SHR.U";
526 static constexpr std::string_view OR_S = "OR.S";
527 static constexpr std::string_view OR_U = "OR.U";
528 static constexpr std::string_view XOR_S = "XOR.S";
529 static constexpr std::string_view XOR_U = "XOR.U";
530 static constexpr std::string_view NOT_S = "NOT.S";
531 static constexpr std::string_view NOT_U = "NOT.U";
532 static constexpr std::string_view BTC_S = "BTC.S";
533 static constexpr std::string_view BTC_U = "BTC.U";
534 static constexpr std::string_view BTFM_S = "BTFM.S";
535 static constexpr std::string_view BTFM_U = "BTFM.U";
536 static constexpr std::string_view ROUND_F = "ROUND.F";
537 static constexpr std::string_view CEIL_F = "CEIL.F";
538 static constexpr std::string_view FLR_F = "FLR.F";
539 static constexpr std::string_view I2F_S = "I2F.S";
540 static constexpr std::string_view I2F_U = "I2F.U";
541 static constexpr std::string_view MIN_F = "MIN.F";
542 static constexpr std::string_view MIN_S = "MIN.S";
543 static constexpr std::string_view MIN_U = "MIN.U";
544 static constexpr std::string_view MAX_F = "MAX.F";
545 static constexpr std::string_view MAX_S = "MAX.S";
546 static constexpr std::string_view MAX_U = "MAX.U";
547 static constexpr std::string_view MOV_U = "MOV.U";
548 static constexpr std::string_view TGBALLOT_U = "TGBALLOT.U";
549 static constexpr std::string_view TGALL_U = "TGALL.U";
550 static constexpr std::string_view TGANY_U = "TGANY.U";
551 static constexpr std::string_view TGEQ_U = "TGEQ.U";
552 static constexpr std::string_view EXCH = "EXCH";
553 static constexpr std::string_view ADD = "ADD";
554 static constexpr std::string_view MIN = "MIN";
555 static constexpr std::string_view MAX = "MAX";
556 static constexpr std::string_view AND = "AND";
557 static constexpr std::string_view OR = "OR";
558 static constexpr std::string_view XOR = "XOR";
559 static constexpr std::string_view U32 = "U32";
560 static constexpr std::string_view S32 = "S32";
561
562 static constexpr std::size_t NUM_ENTRIES = static_cast<std::size_t>(OperationCode::Amount);
563 using DecompilerType = std::string (ARBDecompiler::*)(Operation);
564 static constexpr std::array<DecompilerType, NUM_ENTRIES> OPERATION_DECOMPILERS = {
565 &ARBDecompiler::Assign,
566
567 &ARBDecompiler::Select,
568
569 &ARBDecompiler::Binary<ADD_F32>,
570 &ARBDecompiler::Binary<MUL_F32>,
571 &ARBDecompiler::Binary<DIV_F32>,
572 &ARBDecompiler::Trinary<MAD_F32>,
573 &ARBDecompiler::Negate<'F'>,
574 &ARBDecompiler::Absolute<'F'>,
575 &ARBDecompiler::FClamp,
576 &ARBDecompiler::FCastHalf0,
577 &ARBDecompiler::FCastHalf1,
578 &ARBDecompiler::Binary<MIN_F>,
579 &ARBDecompiler::Binary<MAX_F>,
580 &ARBDecompiler::Unary<COS_F32>,
581 &ARBDecompiler::Unary<SIN_F32>,
582 &ARBDecompiler::Unary<EX2_F32>,
583 &ARBDecompiler::Unary<LG2_F32>,
584 &ARBDecompiler::Unary<RSQ_F32>,
585 &ARBDecompiler::FSqrt,
586 &ARBDecompiler::Unary<ROUND_F>,
587 &ARBDecompiler::Unary<FLR_F>,
588 &ARBDecompiler::Unary<CEIL_F>,
589 &ARBDecompiler::Unary<TRUNC_F>,
590 &ARBDecompiler::Unary<I2F_S>,
591 &ARBDecompiler::Unary<I2F_U>,
592 &ARBDecompiler::FSwizzleAdd,
593
594 &ARBDecompiler::Binary<ADD_S>,
595 &ARBDecompiler::Binary<MUL_S>,
596 &ARBDecompiler::Binary<DIV_S>,
597 &ARBDecompiler::Negate<'S'>,
598 &ARBDecompiler::Absolute<'S'>,
599 &ARBDecompiler::Binary<MIN_S>,
600 &ARBDecompiler::Binary<MAX_S>,
601
602 &ARBDecompiler::Unary<TRUNC_S>,
603 &ARBDecompiler::Unary<MOV_U>,
604 &ARBDecompiler::Binary<SHL_S>,
605 &ARBDecompiler::Binary<SHR_U>,
606 &ARBDecompiler::Binary<SHR_S>,
607 &ARBDecompiler::Binary<AND_S>,
608 &ARBDecompiler::Binary<OR_S>,
609 &ARBDecompiler::Binary<XOR_S>,
610 &ARBDecompiler::Unary<NOT_S>,
611 &ARBDecompiler::BitfieldInsert<'S'>,
612 &ARBDecompiler::BitfieldExtract<'S'>,
613 &ARBDecompiler::Unary<BTC_S>,
614 &ARBDecompiler::Unary<BTFM_S>,
615
616 &ARBDecompiler::Binary<ADD_U>,
617 &ARBDecompiler::Binary<MUL_U>,
618 &ARBDecompiler::Binary<DIV_U>,
619 &ARBDecompiler::Binary<MIN_U>,
620 &ARBDecompiler::Binary<MAX_U>,
621 &ARBDecompiler::Unary<TRUNC_U>,
622 &ARBDecompiler::Unary<MOV_U>,
623 &ARBDecompiler::Binary<SHL_U>,
624 &ARBDecompiler::Binary<SHR_U>,
625 &ARBDecompiler::Binary<SHR_U>,
626 &ARBDecompiler::Binary<AND_U>,
627 &ARBDecompiler::Binary<OR_U>,
628 &ARBDecompiler::Binary<XOR_U>,
629 &ARBDecompiler::Unary<NOT_U>,
630 &ARBDecompiler::BitfieldInsert<'U'>,
631 &ARBDecompiler::BitfieldExtract<'U'>,
632 &ARBDecompiler::Unary<BTC_U>,
633 &ARBDecompiler::Unary<BTFM_U>,
634
635 &ARBDecompiler::HAdd2,
636 &ARBDecompiler::HMul2,
637 &ARBDecompiler::HFma2,
638 &ARBDecompiler::HAbsolute,
639 &ARBDecompiler::HNegate,
640 &ARBDecompiler::HClamp,
641 &ARBDecompiler::HCastFloat,
642 &ARBDecompiler::HUnpack,
643 &ARBDecompiler::HMergeF32,
644 &ARBDecompiler::HMergeH0,
645 &ARBDecompiler::HMergeH1,
646 &ARBDecompiler::HPack2,
647
648 &ARBDecompiler::LogicalAssign,
649 &ARBDecompiler::Binary<AND_U>,
650 &ARBDecompiler::Binary<OR_U>,
651 &ARBDecompiler::Binary<XOR_U>,
652 &ARBDecompiler::Unary<NOT_U>,
653 &ARBDecompiler::LogicalPick2,
654 &ARBDecompiler::LogicalAnd2,
655
656 &ARBDecompiler::FloatComparison<SLT_F, false>,
657 &ARBDecompiler::FloatComparison<SEQ_F, false>,
658 &ARBDecompiler::FloatComparison<SLE_F, false>,
659 &ARBDecompiler::FloatComparison<SGT_F, false>,
660 &ARBDecompiler::FloatComparison<SNE_F, false>,
661 &ARBDecompiler::FloatComparison<SGE_F, false>,
662 &ARBDecompiler::FloatOrdered,
663 &ARBDecompiler::FloatUnordered,
664 &ARBDecompiler::FloatComparison<SLT_F, true>,
665 &ARBDecompiler::FloatComparison<SEQ_F, true>,
666 &ARBDecompiler::FloatComparison<SLE_F, true>,
667 &ARBDecompiler::FloatComparison<SGT_F, true>,
668 &ARBDecompiler::FloatComparison<SNE_F, true>,
669 &ARBDecompiler::FloatComparison<SGE_F, true>,
670
671 &ARBDecompiler::Binary<SLT_S>,
672 &ARBDecompiler::Binary<SEQ_S>,
673 &ARBDecompiler::Binary<SLE_S>,
674 &ARBDecompiler::Binary<SGT_S>,
675 &ARBDecompiler::Binary<SNE_S>,
676 &ARBDecompiler::Binary<SGE_S>,
677
678 &ARBDecompiler::Binary<SLT_U>,
679 &ARBDecompiler::Binary<SEQ_U>,
680 &ARBDecompiler::Binary<SLE_U>,
681 &ARBDecompiler::Binary<SGT_U>,
682 &ARBDecompiler::Binary<SNE_U>,
683 &ARBDecompiler::Binary<SGE_U>,
684
685 &ARBDecompiler::LogicalAddCarry,
686
687 &ARBDecompiler::HalfComparison<SLT_F, false>,
688 &ARBDecompiler::HalfComparison<SEQ_F, false>,
689 &ARBDecompiler::HalfComparison<SLE_F, false>,
690 &ARBDecompiler::HalfComparison<SGT_F, false>,
691 &ARBDecompiler::HalfComparison<SNE_F, false>,
692 &ARBDecompiler::HalfComparison<SGE_F, false>,
693 &ARBDecompiler::HalfComparison<SLT_F, true>,
694 &ARBDecompiler::HalfComparison<SEQ_F, true>,
695 &ARBDecompiler::HalfComparison<SLE_F, true>,
696 &ARBDecompiler::HalfComparison<SGT_F, true>,
697 &ARBDecompiler::HalfComparison<SNE_F, true>,
698 &ARBDecompiler::HalfComparison<SGE_F, true>,
699
700 &ARBDecompiler::Texture,
701 &ARBDecompiler::Texture,
702 &ARBDecompiler::TextureGather,
703 &ARBDecompiler::TextureQueryDimensions,
704 &ARBDecompiler::TextureQueryLod,
705 &ARBDecompiler::TexelFetch,
706 &ARBDecompiler::TextureGradient,
707
708 &ARBDecompiler::ImageLoad,
709 &ARBDecompiler::ImageStore,
710
711 &ARBDecompiler::AtomicImage<ADD, U32>,
712 &ARBDecompiler::AtomicImage<AND, U32>,
713 &ARBDecompiler::AtomicImage<OR, U32>,
714 &ARBDecompiler::AtomicImage<XOR, U32>,
715 &ARBDecompiler::AtomicImage<EXCH, U32>,
716
717 &ARBDecompiler::Atomic<EXCH, U32>,
718 &ARBDecompiler::Atomic<ADD, U32>,
719 &ARBDecompiler::Atomic<MIN, U32>,
720 &ARBDecompiler::Atomic<MAX, U32>,
721 &ARBDecompiler::Atomic<AND, U32>,
722 &ARBDecompiler::Atomic<OR, U32>,
723 &ARBDecompiler::Atomic<XOR, U32>,
724
725 &ARBDecompiler::Atomic<EXCH, S32>,
726 &ARBDecompiler::Atomic<ADD, S32>,
727 &ARBDecompiler::Atomic<MIN, S32>,
728 &ARBDecompiler::Atomic<MAX, S32>,
729 &ARBDecompiler::Atomic<AND, S32>,
730 &ARBDecompiler::Atomic<OR, S32>,
731 &ARBDecompiler::Atomic<XOR, S32>,
732
733 &ARBDecompiler::Atomic<ADD, U32>,
734 &ARBDecompiler::Atomic<MIN, U32>,
735 &ARBDecompiler::Atomic<MAX, U32>,
736 &ARBDecompiler::Atomic<AND, U32>,
737 &ARBDecompiler::Atomic<OR, U32>,
738 &ARBDecompiler::Atomic<XOR, U32>,
739
740 &ARBDecompiler::Atomic<ADD, S32>,
741 &ARBDecompiler::Atomic<MIN, S32>,
742 &ARBDecompiler::Atomic<MAX, S32>,
743 &ARBDecompiler::Atomic<AND, S32>,
744 &ARBDecompiler::Atomic<OR, S32>,
745 &ARBDecompiler::Atomic<XOR, S32>,
746
747 &ARBDecompiler::Branch,
748 &ARBDecompiler::BranchIndirect,
749 &ARBDecompiler::PushFlowStack,
750 &ARBDecompiler::PopFlowStack,
751 &ARBDecompiler::Exit,
752 &ARBDecompiler::Discard,
753
754 &ARBDecompiler::EmitVertex,
755 &ARBDecompiler::EndPrimitive,
756
757 &ARBDecompiler::InvocationId,
758 &ARBDecompiler::YNegate,
759 &ARBDecompiler::LocalInvocationId<'x'>,
760 &ARBDecompiler::LocalInvocationId<'y'>,
761 &ARBDecompiler::LocalInvocationId<'z'>,
762 &ARBDecompiler::WorkGroupId<'x'>,
763 &ARBDecompiler::WorkGroupId<'y'>,
764 &ARBDecompiler::WorkGroupId<'z'>,
765
766 &ARBDecompiler::Unary<TGBALLOT_U>,
767 &ARBDecompiler::Unary<TGALL_U>,
768 &ARBDecompiler::Unary<TGANY_U>,
769 &ARBDecompiler::Unary<TGEQ_U>,
770
771 &ARBDecompiler::ThreadId,
772 &ARBDecompiler::ThreadMask<'e', 'q'>,
773 &ARBDecompiler::ThreadMask<'g', 'e'>,
774 &ARBDecompiler::ThreadMask<'g', 't'>,
775 &ARBDecompiler::ThreadMask<'l', 'e'>,
776 &ARBDecompiler::ThreadMask<'l', 't'>,
777 &ARBDecompiler::ShuffleIndexed,
778
779 &ARBDecompiler::Barrier,
780 &ARBDecompiler::MemoryBarrierGroup,
781 &ARBDecompiler::MemoryBarrierGlobal,
782 };
783};
784
785ARBDecompiler::ARBDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry,
786 ShaderType stage, std::string_view identifier)
787 : device{device}, ir{ir}, registry{registry}, stage{stage} {
788 AddLine("TEMP RC;");
789 AddLine("TEMP FSWZA[4];");
790 AddLine("TEMP FSWZB[4];");
791 if (ir.IsDecompiled()) {
792 DecompileAST();
793 } else {
794 DecompileBranchMode();
795 }
796 AddLine("END");
797
798 const std::string code = std::move(shader_source);
799 DeclareHeader();
800 DeclareVertex();
801 DeclareGeometry();
802 DeclareFragment();
803 DeclareCompute();
804 DeclareInputAttributes();
805 DeclareOutputAttributes();
806 DeclareLocalMemory();
807 DeclareGlobalMemory();
808 DeclareConstantBuffers();
809 DeclareRegisters();
810 DeclareTemporaries();
811 DeclarePredicates();
812 DeclareInternalFlags();
813
814 shader_source += code;
815}
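// Note the two-pass layout above: the program body is generated first so that register,
// predicate, and temporary usage is fully known, then moved aside while the declarations are
// emitted, and finally appended after them.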
816
817std::string_view HeaderStageName(ShaderType stage) {
818 switch (stage) {
819 case ShaderType::Vertex:
820 return "vp";
821 case ShaderType::Geometry:
822 return "gp";
823 case ShaderType::Fragment:
824 return "fp";
825 case ShaderType::Compute:
826 return "cp";
827 default:
828 UNREACHABLE();
829 return "";
830 }
831}
832
833void ARBDecompiler::DeclareHeader() {
834 AddLine("!!NV{}5.0", HeaderStageName(stage));
835 // Enabling this allows us to cheat on some instructions like TXL with SHADOWARRAY2D
836 AddLine("OPTION NV_internal;");
837 AddLine("OPTION NV_gpu_program_fp64;");
838 AddLine("OPTION NV_shader_storage_buffer;");
839 AddLine("OPTION NV_shader_thread_group;");
840 if (ir.UsesWarps() && device.HasWarpIntrinsics()) {
841 AddLine("OPTION NV_shader_thread_shuffle;");
842 }
843 if (stage == ShaderType::Vertex) {
844 if (device.HasNvViewportArray2()) {
845 AddLine("OPTION NV_viewport_array2;");
846 }
847 }
848 if (stage == ShaderType::Fragment) {
849 AddLine("OPTION ARB_draw_buffers;");
850 }
851 if (device.HasImageLoadFormatted()) {
852 AddLine("OPTION EXT_shader_image_load_formatted;");
853 }
854}
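// Illustrative header for a fragment shader that uses no warp intrinsics, on a device that
// supports formatted image loads:
//   !!NVfp5.0
//   OPTION NV_internal;
//   OPTION NV_gpu_program_fp64;
//   OPTION NV_shader_storage_buffer;
//   OPTION NV_shader_thread_group;
//   OPTION ARB_draw_buffers;
//   OPTION EXT_shader_image_load_formatted;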
855
856void ARBDecompiler::DeclareVertex() {
857 if (stage != ShaderType::Vertex) {
858 return;
859 }
860 AddLine("OUTPUT result_clip[] = {{ result.clip[0..7] }};");
861}
862
863void ARBDecompiler::DeclareGeometry() {
864 if (stage != ShaderType::Geometry) {
865 return;
866 }
867 const auto& info = registry.GetGraphicsInfo();
868 const auto& header = ir.GetHeader();
869 AddLine("PRIMITIVE_IN {};", PrimitiveDescription(info.primitive_topology));
870 AddLine("PRIMITIVE_OUT {};", TopologyName(header.common3.output_topology));
871 AddLine("VERTICES_OUT {};", header.common4.max_output_vertices.Value());
872 AddLine("ATTRIB vertex_position = vertex.position;");
873}
874
875void ARBDecompiler::DeclareFragment() {
876 if (stage != ShaderType::Fragment) {
877 return;
878 }
879 AddLine("OUTPUT result_color7 = result.color[7];");
880 AddLine("OUTPUT result_color6 = result.color[6];");
881 AddLine("OUTPUT result_color5 = result.color[5];");
882 AddLine("OUTPUT result_color4 = result.color[4];");
883 AddLine("OUTPUT result_color3 = result.color[3];");
884 AddLine("OUTPUT result_color2 = result.color[2];");
885 AddLine("OUTPUT result_color1 = result.color[1];");
886 AddLine("OUTPUT result_color0 = result.color;");
887}
888
889void ARBDecompiler::DeclareCompute() {
890 if (stage != ShaderType::Compute) {
891 return;
892 }
893 const ComputeInfo& info = registry.GetComputeInfo();
894 AddLine("GROUP_SIZE {} {} {};", info.workgroup_size[0], info.workgroup_size[1],
895 info.workgroup_size[2]);
896 if (info.shared_memory_size_in_words > 0) {
897 const u32 size_in_bytes = info.shared_memory_size_in_words * 4;
898 AddLine("SHARED_MEMORY {};", size_in_bytes);
899 AddLine("SHARED shared_mem[] = {{program.sharedmem}};");
900 }
901}
902
903void ARBDecompiler::DeclareInputAttributes() {
904 if (stage == ShaderType::Compute) {
905 return;
906 }
907 const std::string_view stage_name = StageInputName(stage);
908 for (const auto attribute : ir.GetInputAttributes()) {
909 if (!IsGenericAttribute(attribute)) {
910 continue;
911 }
912 const u32 index = GetGenericAttributeIndex(attribute);
913
914 std::string_view suffix;
915 if (stage == ShaderType::Fragment) {
916 const auto input_mode{ir.GetHeader().ps.GetPixelImap(index)};
917 if (input_mode == PixelImap::Unused) {
918 continue;
919 }
920 suffix = GetInputFlags(input_mode);
921 }
922 AddLine("{}ATTRIB in_attr{}[] = {{ {}.attrib[{}..{}] }};", suffix, index, stage_name, index,
923 index);
924 }
925}
926
927void ARBDecompiler::DeclareOutputAttributes() {
928 if (stage == ShaderType::Compute) {
929 return;
930 }
931 for (const auto attribute : ir.GetOutputAttributes()) {
932 if (!IsGenericAttribute(attribute)) {
933 continue;
934 }
935 const u32 index = GetGenericAttributeIndex(attribute);
936 AddLine("OUTPUT out_attr{}[] = {{ result.attrib[{}..{}] }};", index, index, index);
937 }
938}
939
940void ARBDecompiler::DeclareLocalMemory() {
941 u64 size = 0;
942 if (stage == ShaderType::Compute) {
943 size = registry.GetComputeInfo().local_memory_size_in_words * 4ULL;
944 } else {
945 size = ir.GetHeader().GetLocalMemorySize();
946 }
947 if (size == 0) {
948 return;
949 }
950 const u64 element_count = Common::AlignUp(size, 4) / 4;
951 AddLine("TEMP lmem[{}];", element_count);
952}
953
954void ARBDecompiler::DeclareGlobalMemory() {
955 u32 binding = 0; // device.GetBaseBindings(stage).shader_storage_buffer;
956 for (const auto& pair : ir.GetGlobalMemory()) {
957 const auto& base = pair.first;
958 AddLine("STORAGE {}[] = {{ program.storage[{}] }};", GlobalMemoryName(base), binding);
959 ++binding;
960 }
961}
962
963void ARBDecompiler::DeclareConstantBuffers() {
964 u32 binding = 0;
965 for (const auto& cbuf : ir.GetConstantBuffers()) {
966 AddLine("CBUFFER cbuf{}[] = {{ program.buffer[{}] }};", cbuf.first, binding);
967 ++binding;
968 }
969}
970
971void ARBDecompiler::DeclareRegisters() {
972 for (const u32 gpr : ir.GetRegisters()) {
973 AddLine("TEMP R{};", gpr);
974 }
975}
976
977void ARBDecompiler::DeclareTemporaries() {
978 for (std::size_t i = 0; i < max_temporaries; ++i) {
979 AddLine("TEMP T{};", i);
980 }
981}
982
983void ARBDecompiler::DeclarePredicates() {
984 for (const Tegra::Shader::Pred pred : ir.GetPredicates()) {
985 AddLine("TEMP P{};", static_cast<u64>(pred));
986 }
987}
988
989void ARBDecompiler::DeclareInternalFlags() {
990 for (const char* name : INTERNAL_FLAG_NAMES) {
991 AddLine("TEMP {};", name);
992 }
993}
994
995void ARBDecompiler::InitializeVariables() {
996 AddLine("MOV.F32 FSWZA[0], -1;");
997 AddLine("MOV.F32 FSWZA[1], 1;");
998 AddLine("MOV.F32 FSWZA[2], -1;");
999 AddLine("MOV.F32 FSWZA[3], 0;");
1000 AddLine("MOV.F32 FSWZB[0], -1;");
1001 AddLine("MOV.F32 FSWZB[1], -1;");
1002 AddLine("MOV.F32 FSWZB[2], 1;");
1003 AddLine("MOV.F32 FSWZB[3], -1;");
1004
1005 if (stage == ShaderType::Vertex || stage == ShaderType::Geometry) {
1006 AddLine("MOV.F result.position, {{0, 0, 0, 1}};");
1007 }
1008 for (const auto attribute : ir.GetOutputAttributes()) {
1009 if (!IsGenericAttribute(attribute)) {
1010 continue;
1011 }
1012 const u32 index = GetGenericAttributeIndex(attribute);
1013 AddLine("MOV.F result.attrib[{}], {{0, 0, 0, 1}};", index);
1014 }
1015 for (const u32 gpr : ir.GetRegisters()) {
1016 AddLine("MOV.F R{}, {{0, 0, 0, 0}};", gpr);
1017 }
1018 for (const Tegra::Shader::Pred pred : ir.GetPredicates()) {
1019 AddLine("MOV.U P{}, {{0, 0, 0, 0}};", static_cast<u64>(pred));
1020 }
1021}
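// FSWZA and FSWZB are the per-lane factor tables consumed by FSwizzleAdd: each operand is
// multiplied by the entry selected with the lane's 2-bit mask, yielding the -1/0/1 sign
// pattern of the FSWZADD instruction.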
1022
1023void ARBDecompiler::DecompileAST() {
1024 const u32 num_flow_variables = ir.GetASTNumVariables();
1025 for (u32 i = 0; i < num_flow_variables; ++i) {
1026 AddLine("TEMP F{};", i);
1027 }
1028 for (u32 i = 0; i < num_flow_variables; ++i) {
1029 AddLine("MOV.U F{}, {{0, 0, 0, 0}};", i);
1030 }
1031
1032 InitializeVariables();
1033
1034 VisitAST(ir.GetASTProgram());
1035}
1036
1037void ARBDecompiler::DecompileBranchMode() {
1038 static constexpr u32 FLOW_STACK_SIZE = 20;
1039 if (!ir.IsFlowStackDisabled()) {
1040 AddLine("TEMP SSY[{}];", FLOW_STACK_SIZE);
1041 AddLine("TEMP PBK[{}];", FLOW_STACK_SIZE);
1042 AddLine("TEMP SSY_TOP;");
1043 AddLine("TEMP PBK_TOP;");
1044 }
1045
1046 AddLine("TEMP PC;");
1047
1048 if (!ir.IsFlowStackDisabled()) {
1049 AddLine("MOV.U SSY_TOP.x, 0;");
1050 AddLine("MOV.U PBK_TOP.x, 0;");
1051 }
1052
1053 InitializeVariables();
1054
1055 const auto basic_block_end = ir.GetBasicBlocks().end();
1056 auto basic_block_it = ir.GetBasicBlocks().begin();
1057 const u32 first_address = basic_block_it->first;
1058 AddLine("MOV.U PC.x, {};", first_address);
1059
1060 AddLine("REP;");
1061
1062 std::size_t num_blocks = 0;
1063 while (basic_block_it != basic_block_end) {
1064 const auto& [address, bb] = *basic_block_it;
1065 ++num_blocks;
1066
1067 AddLine("SEQ.S.CC RC.x, PC.x, {};", address);
1068 AddLine("IF NE.x;");
1069
1070 VisitBlock(bb);
1071
1072 ++basic_block_it;
1073
1074 if (basic_block_it != basic_block_end) {
1075 const auto op = std::get_if<OperationNode>(&*bb[bb.size() - 1]);
1076 if (!op || op->GetCode() != OperationCode::Branch) {
1077 const u32 next_address = basic_block_it->first;
1078 AddLine("MOV.U PC.x, {};", next_address);
1079 AddLine("CONT;");
1080 }
1081 }
1082
1083 AddLine("ELSE;");
1084 }
1085 AddLine("RET;");
1086 while (num_blocks--) {
1087 AddLine("ENDIF;");
1088 }
1089
1090 AddLine("ENDREP;");
1091}
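// Illustrative shape of the emitted trampoline, assuming basic blocks at addresses 0 and 16:
//   MOV.U PC.x, 0;
//   REP;
//   SEQ.S.CC RC.x, PC.x, 0;
//   IF NE.x;
//   <block 0>          (a block that does not branch sets PC to the next address and CONT)
//   ELSE;
//   SEQ.S.CC RC.x, PC.x, 16;
//   IF NE.x;
//   <block 16>
//   ELSE;
//   RET;
//   ENDIF;
//   ENDIF;
//   ENDREP;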
1092
1093void ARBDecompiler::VisitAST(const ASTNode& node) {
1094 if (const auto ast = std::get_if<ASTProgram>(&*node->GetInnerData())) {
1095 for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) {
1096 VisitAST(current);
1097 }
1098 } else if (const auto ast = std::get_if<ASTIfThen>(&*node->GetInnerData())) {
1099 const std::string condition = VisitExpression(ast->condition);
1100 ResetTemporaries();
1101
1102 AddLine("MOVC.U RC.x, {};", condition);
1103 AddLine("IF NE.x;");
1104 for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) {
1105 VisitAST(current);
1106 }
1107 AddLine("ENDIF;");
1108 } else if (const auto ast = std::get_if<ASTIfElse>(&*node->GetInnerData())) {
1109 AddLine("ELSE;");
1110 for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) {
1111 VisitAST(current);
1112 }
1113 } else if (const auto ast = std::get_if<ASTBlockDecoded>(&*node->GetInnerData())) {
1114 VisitBlock(ast->nodes);
1115 } else if (const auto ast = std::get_if<ASTVarSet>(&*node->GetInnerData())) {
1116 AddLine("MOV.U F{}, {};", ast->index, VisitExpression(ast->condition));
1117 ResetTemporaries();
1118 } else if (const auto ast = std::get_if<ASTDoWhile>(&*node->GetInnerData())) {
1119 const std::string condition = VisitExpression(ast->condition);
1120 ResetTemporaries();
1121 AddLine("REP;");
1122 for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) {
1123 VisitAST(current);
1124 }
1125 AddLine("MOVC.U RC.x, {};", condition);
1126 AddLine("BRK (NE.x);");
1127 AddLine("ENDREP;");
1128 } else if (const auto ast = std::get_if<ASTReturn>(&*node->GetInnerData())) {
1129 const bool is_true = ExprIsTrue(ast->condition);
1130 if (!is_true) {
1131 AddLine("MOVC.U RC.x, {};", VisitExpression(ast->condition));
1132 AddLine("IF NE.x;");
1133 ResetTemporaries();
1134 }
1135 if (ast->kills) {
1136 AddLine("KIL TR;");
1137 } else {
1138 Exit();
1139 }
1140 if (!is_true) {
1141 AddLine("ENDIF;");
1142 }
1143 } else if (const auto ast = std::get_if<ASTBreak>(&*node->GetInnerData())) {
1144 if (ExprIsTrue(ast->condition)) {
1145 AddLine("BRK;");
1146 } else {
1147 AddLine("MOVC.U RC.x, {};", VisitExpression(ast->condition));
1148 AddLine("BRK (NE.x);");
1149 ResetTemporaries();
1150 }
1151 } else if (std::holds_alternative<ASTLabel>(*node->GetInnerData())) {
1152 // Nothing to do
1153 } else {
1154 UNREACHABLE();
1155 }
1156}
1157
1158std::string ARBDecompiler::VisitExpression(const Expr& node) {
1159 if (const auto expr = std::get_if<ExprAnd>(&*node)) {
1160 const std::string result = AllocTemporary();
1161 AddLine("AND.U {}, {}, {};", result, VisitExpression(expr->operand1),
1162 VisitExpression(expr->operand2));
1163 return result;
1164 }
1165 if (const auto expr = std::get_if<ExprOr>(&*node)) {
1166 const std::string result = AllocTemporary();
1167 AddLine("OR.U {}, {}, {};", result, VisitExpression(expr->operand1),
1168 VisitExpression(expr->operand2));
1169 return result;
1170 }
1171 if (const auto expr = std::get_if<ExprNot>(&*node)) {
1172 const std::string result = AllocTemporary();
1173 AddLine("CMP.S {}, {}, 0, -1;", result, VisitExpression(expr->operand1));
1174 return result;
1175 }
1176 if (const auto expr = std::get_if<ExprPredicate>(&*node)) {
1177 return fmt::format("P{}.x", static_cast<u64>(expr->predicate));
1178 }
1179 if (const auto expr = std::get_if<ExprCondCode>(&*node)) {
1180 return Visit(ir.GetConditionCode(expr->cc));
1181 }
1182 if (const auto expr = std::get_if<ExprVar>(&*node)) {
1183 return fmt::format("F{}.x", expr->var_index);
1184 }
1185 if (const auto expr = std::get_if<ExprBoolean>(&*node)) {
1186 return expr->value ? "0xffffffff" : "0";
1187 }
1188 if (const auto expr = std::get_if<ExprGprEqual>(&*node)) {
1189 const std::string result = AllocTemporary();
1190 AddLine("SEQ.U {}, R{}.x, {};", result, expr->gpr, expr->value);
1191 return result;
1192 }
1193 UNREACHABLE();
1194 return "0";
1195}
1196
1197void ARBDecompiler::VisitBlock(const NodeBlock& bb) {
1198 for (const auto& node : bb) {
1199 Visit(node);
1200 }
1201}
1202
1203std::string ARBDecompiler::Visit(const Node& node) {
1204 if (const auto operation = std::get_if<OperationNode>(&*node)) {
1205 if (const auto amend_index = operation->GetAmendIndex()) {
1206 Visit(ir.GetAmendNode(*amend_index));
1207 }
1208 const std::size_t index = static_cast<std::size_t>(operation->GetCode());
1209 if (index >= OPERATION_DECOMPILERS.size()) {
1210 UNREACHABLE_MSG("Out of bounds operation: {}", index);
1211 return {};
1212 }
1213 const auto decompiler = OPERATION_DECOMPILERS[index];
1214 if (decompiler == nullptr) {
1215 UNREACHABLE_MSG("Undefined operation: {}", index);
1216 return {};
1217 }
1218 return (this->*decompiler)(*operation);
1219 }
1220
1221 if (const auto gpr = std::get_if<GprNode>(&*node)) {
1222 const u32 index = gpr->GetIndex();
1223 if (index == Register::ZeroIndex) {
1224 return "{0, 0, 0, 0}.x";
1225 }
1226 return fmt::format("R{}.x", index);
1227 }
1228
1229 if (const auto cv = std::get_if<CustomVarNode>(&*node)) {
1230 return fmt::format("CV{}.x", cv->GetIndex());
1231 }
1232
1233 if (const auto immediate = std::get_if<ImmediateNode>(&*node)) {
1234 const std::string temporary = AllocTemporary();
1235 AddLine("MOV.U {}, {};", temporary, immediate->GetValue());
1236 return temporary;
1237 }
1238
1239 if (const auto predicate = std::get_if<PredicateNode>(&*node)) {
1240 const std::string temporary = AllocTemporary();
1241 switch (const auto index = predicate->GetIndex(); index) {
1242 case Tegra::Shader::Pred::UnusedIndex:
1243 AddLine("MOV.S {}, -1;", temporary);
1244 break;
1245 case Tegra::Shader::Pred::NeverExecute:
1246 AddLine("MOV.S {}, 0;", temporary);
1247 break;
1248 default:
1249 AddLine("MOV.S {}, P{}.x;", temporary, static_cast<u64>(index));
1250 break;
1251 }
1252 if (predicate->IsNegated()) {
1253 AddLine("CMP.S {}, {}, 0, -1;", temporary, temporary);
1254 }
1255 return temporary;
1256 }
1257
1258 if (const auto abuf = std::get_if<AbufNode>(&*node)) {
1259 if (abuf->IsPhysicalBuffer()) {
1260 UNIMPLEMENTED_MSG("Physical buffers are not implemented");
1261 return "{0, 0, 0, 0}.x";
1262 }
1263
1264 const auto buffer_index = [this, &abuf]() -> std::string {
1265 if (stage != ShaderType::Geometry) {
1266 return "";
1267 }
1268 return fmt::format("[{}]", Visit(abuf->GetBuffer()));
1269 };
1270
1271 const Attribute::Index index = abuf->GetIndex();
1272 const u32 element = abuf->GetElement();
1273 const char swizzle = Swizzle(element);
1274 switch (index) {
1275 case Attribute::Index::Position: {
1276 if (stage == ShaderType::Geometry) {
1277 return fmt::format("{}_position[{}].{}", StageInputName(stage),
1278 Visit(abuf->GetBuffer()), swizzle);
1279 } else {
1280 return fmt::format("{}.position.{}", StageInputName(stage), swizzle);
1281 }
1282 }
1283 case Attribute::Index::TessCoordInstanceIDVertexID:
1284 ASSERT(stage == ShaderType::Vertex);
1285 switch (element) {
1286 case 2:
1287 return "vertex.instance";
1288 case 3:
1289 return "vertex.id";
1290 }
1291 UNIMPLEMENTED_MSG("Unmanaged TessCoordInstanceIDVertexID element={}", element);
1292 break;
1293 case Attribute::Index::PointCoord:
1294 switch (element) {
1295 case 0:
1296 return "fragment.pointcoord.x";
1297 case 1:
1298 return "fragment.pointcoord.y";
1299 }
1300 UNIMPLEMENTED();
1301 break;
1302 case Attribute::Index::FrontFacing: {
1303 ASSERT(stage == ShaderType::Fragment);
1304 ASSERT(element == 3);
1305 const std::string temporary = AllocVectorTemporary();
1306 AddLine("SGT.S RC.x, fragment.facing, {{0, 0, 0, 0}};");
1307 AddLine("MOV.U.CC RC.x, -RC;");
1308 AddLine("MOV.S {}.x, 0;", temporary);
1309 AddLine("MOV.S {}.x (NE.x), -1;", temporary);
1310 return fmt::format("{}.x", temporary);
1311 }
1312 default:
1313 if (IsGenericAttribute(index)) {
1314 if (stage == ShaderType::Geometry) {
1315 return fmt::format("in_attr{}[{}][0].{}", GetGenericAttributeIndex(index),
1316 Visit(abuf->GetBuffer()), swizzle);
1317 } else {
1318 return fmt::format("{}.attrib[{}].{}", StageInputName(stage),
1319 GetGenericAttributeIndex(index), swizzle);
1320 }
1321 }
1322 UNIMPLEMENTED_MSG("Unimplemented input attribute={}", static_cast<int>(index));
1323 break;
1324 }
1325 return "{0, 0, 0, 0}.x";
1326 }
1327
1328 if (const auto cbuf = std::get_if<CbufNode>(&*node)) {
1329 std::string offset_string;
1330 const auto& offset = cbuf->GetOffset();
1331 if (const auto imm = std::get_if<ImmediateNode>(&*offset)) {
1332 offset_string = std::to_string(imm->GetValue());
1333 } else {
1334 offset_string = Visit(offset);
1335 }
1336 const std::string temporary = AllocTemporary();
1337 AddLine("LDC.F32 {}, cbuf{}[{}];", temporary, cbuf->GetIndex(), offset_string);
1338 return temporary;
1339 }
1340
1341 if (const auto gmem = std::get_if<GmemNode>(&*node)) {
1342 const std::string temporary = AllocTemporary();
1343 AddLine("SUB.U {}, {}, {};", temporary, Visit(gmem->GetRealAddress()),
1344 Visit(gmem->GetBaseAddress()));
1345 AddLine("LDB.U32 {}, {}[{}];", temporary, GlobalMemoryName(gmem->GetDescriptor()),
1346 temporary);
1347 return temporary;
1348 }
1349
1350 if (const auto lmem = std::get_if<LmemNode>(&*node)) {
1351 const std::string temporary = Visit(lmem->GetAddress());
1352 AddLine("SHR.U {}, {}, 2;", temporary, temporary);
1353 AddLine("MOV.U {}, lmem[{}].x;", temporary, temporary);
1354 return temporary;
1355 }
1356
1357 if (const auto smem = std::get_if<SmemNode>(&*node)) {
1358 const std::string temporary = Visit(smem->GetAddress());
1359 AddLine("LDS.U32 {}, shared_mem[{}];", temporary, temporary);
1360 return temporary;
1361 }
1362
1363 if (const auto internal_flag = std::get_if<InternalFlagNode>(&*node)) {
1364 const std::size_t index = static_cast<std::size_t>(internal_flag->GetFlag());
1365 return fmt::format("{}.x", INTERNAL_FLAG_NAMES[index]);
1366 }
1367
1368 if (const auto conditional = std::get_if<ConditionalNode>(&*node)) {
1369 if (const auto amend_index = conditional->GetAmendIndex()) {
1370 Visit(ir.GetAmendNode(*amend_index));
1371 }
1372 AddLine("MOVC.U RC.x, {};", Visit(conditional->GetCondition()));
1373 AddLine("IF NE.x;");
1374 VisitBlock(conditional->GetCode());
1375 AddLine("ENDIF;");
1376 return {};
1377 }
1378
1379 if (const auto cmt = std::get_if<CommentNode>(&*node)) {
1380 // Emitting this would generate invalid code: GLASM has no comment syntax.
1381 // AddLine("// {}", cmt->GetText());
1382 return {};
1383 }
1384
1385 UNIMPLEMENTED();
1386 return {};
1387}
1388
1389std::pair<std::string, std::size_t> ARBDecompiler::BuildCoords(Operation operation) {
1390 const auto& meta = std::get<MetaTexture>(operation.GetMeta());
1391 UNIMPLEMENTED_IF(meta.sampler.is_indexed);
1392 UNIMPLEMENTED_IF(meta.sampler.is_shadow && meta.sampler.is_array &&
1393 meta.sampler.type == Tegra::Shader::TextureType::TextureCube);
1394
1395 const std::size_t count = operation.GetOperandsCount();
1396 std::string temporary = AllocVectorTemporary();
1397 std::size_t i = 0;
1398 for (; i < count; ++i) {
1399 AddLine("MOV.F {}.{}, {};", temporary, Swizzle(i), Visit(operation[i]));
1400 }
1401 if (meta.sampler.is_array) {
1402 AddLine("I2F.S {}.{}, {};", temporary, Swizzle(i++), Visit(meta.array));
1403 }
1404 if (meta.sampler.is_shadow) {
1405 AddLine("MOV.F {}.{}, {};", temporary, Swizzle(i++), Visit(meta.depth_compare));
1406 }
1407 return {std::move(temporary), i};
1408}
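// Illustrative: for a 2D shadow array sampler, the x and y coordinates land in .x and .y, the
// integer layer is converted with I2F into .z, and the depth-compare reference goes in .w,
// returning e.g. {"T0", 4}.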
1409
1410std::string ARBDecompiler::BuildAoffi(Operation operation) {
1411 const auto& meta = std::get<MetaTexture>(operation.GetMeta());
1412 if (meta.aoffi.empty()) {
1413 return {};
1414 }
1415 const std::string temporary = AllocVectorTemporary();
1416 std::size_t i = 0;
1417 for (auto& node : meta.aoffi) {
1418 AddLine("MOV.S {}.{}, {};", temporary, Swizzle(i++), Visit(node));
1419 }
1420 return fmt::format(", offset({})", temporary);
1421}
1422
1423void ARBDecompiler::Exit() {
1424 if (stage != ShaderType::Fragment) {
1425 AddLine("RET;");
1426 return;
1427 }
1428
1429 const auto safe_get_register = [this](u32 reg) -> std::string {
1430 // TODO(Rodrigo): Replace with contains once C++20 releases
1431 const auto& used_registers = ir.GetRegisters();
1432 if (used_registers.find(reg) != used_registers.end()) {
1433 return fmt::format("R{}.x", reg);
1434 }
1435 return "{0, 0, 0, 0}.x";
1436 };
1437
1438 const auto& header = ir.GetHeader();
1439 u32 current_reg = 0;
1440 for (u32 rt = 0; rt < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets; ++rt) {
1441 for (u32 component = 0; component < 4; ++component) {
1442 if (!header.ps.IsColorComponentOutputEnabled(rt, component)) {
1443 continue;
1444 }
1445 AddLine("MOV.F result_color{}.{}, {};", rt, Swizzle(component),
1446 safe_get_register(current_reg));
1447 ++current_reg;
1448 }
1449 }
1450 if (header.ps.omap.depth) {
1451 AddLine("MOV.F result.depth.z, {};", safe_get_register(current_reg + 1));
1452 }
1453
1454 AddLine("RET;");
1455}
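// Render-target outputs are packed linearly: each color component enabled in the PS header
// consumes the next register (R0, R1, ...), and the depth output, when written, is taken from
// the register one past the last color register (current_reg + 1).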
1456
1457std::string ARBDecompiler::Assign(Operation operation) {
1458 const Node& dest = operation[0];
1459 const Node& src = operation[1];
1460
1461 std::string dest_name;
1462 if (const auto gpr = std::get_if<GprNode>(&*dest)) {
1463 if (gpr->GetIndex() == Register::ZeroIndex) {
1464 // Writing to Register::ZeroIndex is a no op
1465 return {};
1466 }
1467 dest_name = fmt::format("R{}.x", gpr->GetIndex());
1468 } else if (const auto abuf = std::get_if<AbufNode>(&*dest)) {
1469 const u32 element = abuf->GetElement();
1470 const char swizzle = Swizzle(element);
1471 switch (const Attribute::Index index = abuf->GetIndex()) {
1472 case Attribute::Index::Position:
1473 dest_name = fmt::format("result.position.{}", swizzle);
1474 break;
1475 case Attribute::Index::LayerViewportPointSize:
1476 switch (element) {
1477 case 0:
1478 UNIMPLEMENTED();
1479 return {};
1480 case 1:
1481 case 2:
1482 if (!device.HasNvViewportArray2()) {
1483 LOG_ERROR(
1484 Render_OpenGL,
1485 "NV_viewport_array2 is missing. Maxwell gen 2 or better is required.");
1486 return {};
1487 }
1488 dest_name = element == 1 ? "result.layer.x" : "result.viewport.x";
1489 break;
1490 case 3:
1491 dest_name = "result.pointsize.x";
1492 break;
1493 }
1494 break;
1495 case Attribute::Index::ClipDistances0123:
1496 dest_name = fmt::format("result.clip[{}].x", element);
1497 break;
1498 case Attribute::Index::ClipDistances4567:
1499 dest_name = fmt::format("result.clip[{}].x", element + 4);
1500 break;
1501 default:
1502 if (!IsGenericAttribute(index)) {
1503 UNREACHABLE();
1504 return {};
1505 }
1506 dest_name =
1507 fmt::format("result.attrib[{}].{}", GetGenericAttributeIndex(index), swizzle);
1508 break;
1509 }
1510 } else if (const auto lmem = std::get_if<LmemNode>(&*dest)) {
1511 const std::string address = Visit(lmem->GetAddress());
1512 AddLine("SHR.U {}, {}, 2;", address, address);
1513 dest_name = fmt::format("lmem[{}].x", address);
1514 } else if (const auto smem = std::get_if<SmemNode>(&*dest)) {
1515 AddLine("STS.U32 {}, shared_mem[{}];", Visit(src), Visit(smem->GetAddress()));
1516 ResetTemporaries();
1517 return {};
1518 } else if (const auto gmem = std::get_if<GmemNode>(&*dest)) {
1519 const std::string temporary = AllocTemporary();
1520 AddLine("SUB.U {}, {}, {};", temporary, Visit(gmem->GetRealAddress()),
1521 Visit(gmem->GetBaseAddress()));
1522 AddLine("STB.U32 {}, {}[{}];", Visit(src), GlobalMemoryName(gmem->GetDescriptor()),
1523 temporary);
1524 ResetTemporaries();
1525 return {};
1526 } else {
1527 UNREACHABLE();
1528 ResetTemporaries();
1529 return {};
1530 }
1531
1532 AddLine("MOV.U {}, {};", dest_name, Visit(src));
1533 ResetTemporaries();
1534 return {};
1535}
1536
1537std::string ARBDecompiler::Select(Operation operation) {
1538 const std::string temporary = AllocTemporary();
1539 AddLine("CMP.S {}, {}, {}, {};", temporary, Visit(operation[0]), Visit(operation[1]),
1540 Visit(operation[2]));
1541 return temporary;
1542}
1543
1544std::string ARBDecompiler::FClamp(Operation operation) {
1545 // 1.0f in hex, replace with std::bit_cast on C++20
1546 static constexpr u32 POSITIVE_ONE = 0x3f800000;
1547
1548 const std::string temporary = AllocTemporary();
1549 const Node& value = operation[0];
1550 const Node& low = operation[1];
1551 const Node& high = operation[2];
1552 const auto imm_low = std::get_if<ImmediateNode>(&*low);
1553 const auto imm_high = std::get_if<ImmediateNode>(&*high);
1554 if (imm_low && imm_high && imm_low->GetValue() == 0 && imm_high->GetValue() == POSITIVE_ONE) {
1555 AddLine("MOV.F32.SAT {}, {};", temporary, Visit(value));
1556 } else {
1557 AddLine("MIN.F {}, {}, {};", temporary, Visit(value), Visit(high));
1558 AddLine("MAX.F {}, {}, {};", temporary, temporary, Visit(low));
1559 }
1560 return temporary;
1561}
1562
1563std::string ARBDecompiler::FCastHalf0(Operation operation) {
1564 const std::string temporary = AllocVectorTemporary();
1565 AddLine("UP2H.F {}.x, {};", temporary, Visit(operation[0]));
1566 return fmt::format("{}.x", temporary);
1567}
1568
1569std::string ARBDecompiler::FCastHalf1(Operation operation) {
1570 const std::string temporary = AllocVectorTemporary();
1571 AddLine("UP2H.F {}.y, {};", temporary, Visit(operation[0]));
1572 AddLine("MOV {}.x, {}.y;", temporary, temporary);
1573 return fmt::format("{}.x", temporary);
1574}
1575
1576std::string ARBDecompiler::FSqrt(Operation operation) {
1577 const std::string temporary = AllocTemporary();
1578 AddLine("RSQ.F32 {}, {};", temporary, Visit(operation[0]));
1579 AddLine("RCP.F32 {}, {};", temporary, temporary);
1580 return temporary;
1581}
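// sqrt(x) is derived as 1 / rsqrt(x), i.e. RSQ.F32 followed by RCP.F32, in place of a
// dedicated square-root instruction.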
1582
1583std::string ARBDecompiler::FSwizzleAdd(Operation operation) {
1584 const std::string temporary = AllocVectorTemporary();
1585 if (!device.HasWarpIntrinsics()) {
1586 LOG_ERROR(Render_OpenGL,
1587 "NV_shader_thread_shuffle is missing. Kepler or better is required.");
1588 AddLine("ADD.F {}.x, {}, {};", temporary, Visit(operation[0]), Visit(operation[1]));
1589 return fmt::format("{}.x", temporary);
1590 }
1591 const std::string lut = AllocVectorTemporary();
1592 AddLine("AND.U {}.z, {}.threadid, 3;", temporary, StageInputName(stage));
1593 AddLine("SHL.U {}.z, {}.z, 1;", temporary, temporary);
1594 AddLine("SHR.U {}.z, {}, {}.z;", temporary, Visit(operation[2]), temporary);
1595 AddLine("AND.U {}.z, {}.z, 3;", temporary, temporary);
1596 AddLine("MUL.F32 {}.x, {}, FSWZA[{}.z];", temporary, Visit(operation[0]), temporary);
1597 AddLine("MUL.F32 {}.y, {}, FSWZB[{}.z];", temporary, Visit(operation[1]), temporary);
1598 AddLine("ADD.F32 {}.x, {}.x, {}.y;", temporary, temporary, temporary);
1599 return fmt::format("{}.x", temporary);
1600}
1601
1602std::string ARBDecompiler::HAdd2(Operation operation) {
1603 const std::string tmp1 = AllocVectorTemporary();
1604 const std::string tmp2 = AllocVectorTemporary();
1605 AddLine("UP2H.F {}.xy, {};", tmp1, Visit(operation[0]));
1606 AddLine("UP2H.F {}.xy, {};", tmp2, Visit(operation[1]));
1607 AddLine("ADD.F16 {}, {}, {};", tmp1, tmp1, tmp2);
1608 AddLine("PK2H.F {}.x, {};", tmp1, tmp1);
1609 return fmt::format("{}.x", tmp1);
1610}
1611
1612std::string ARBDecompiler::HMul2(Operation operation) {
1613 const std::string tmp1 = AllocVectorTemporary();
1614 const std::string tmp2 = AllocVectorTemporary();
1615 AddLine("UP2H.F {}.xy, {};", tmp1, Visit(operation[0]));
1616 AddLine("UP2H.F {}.xy, {};", tmp2, Visit(operation[1]));
1617 AddLine("MUL.F16 {}, {}, {};", tmp1, tmp1, tmp2);
1618 AddLine("PK2H.F {}.x, {};", tmp1, tmp1);
1619 return fmt::format("{}.x", tmp1);
1620}
1621
1622std::string ARBDecompiler::HFma2(Operation operation) {
1623 const std::string tmp1 = AllocVectorTemporary();
1624 const std::string tmp2 = AllocVectorTemporary();
1625 const std::string tmp3 = AllocVectorTemporary();
1626 AddLine("UP2H.F {}.xy, {};", tmp1, Visit(operation[0]));
1627 AddLine("UP2H.F {}.xy, {};", tmp2, Visit(operation[1]));
1628 AddLine("UP2H.F {}.xy, {};", tmp3, Visit(operation[2]));
1629 AddLine("MAD.F16 {}, {}, {}, {};", tmp1, tmp1, tmp2, tmp3);
1630 AddLine("PK2H.F {}.x, {};", tmp1, tmp1);
1631 return fmt::format("{}.x", tmp1);
1632}
1633
1634std::string ARBDecompiler::HAbsolute(Operation operation) {
1635 const std::string temporary = AllocVectorTemporary();
1636 AddLine("UP2H.F {}.xy, {};", temporary, Visit(operation[0]));
1637 AddLine("PK2H.F {}.x, |{}|;", temporary, temporary);
1638 return fmt::format("{}.x", temporary);
1639}
1640
1641std::string ARBDecompiler::HNegate(Operation operation) {
1642 const std::string temporary = AllocVectorTemporary();
1643 AddLine("UP2H.F {}.xy, {};", temporary, Visit(operation[0]));
1644 AddLine("MOVC.S RC.x, {};", Visit(operation[1]));
1645 AddLine("MOV.F {}.x (NE.x), -{}.x;", temporary, temporary);
1646 AddLine("MOVC.S RC.x, {};", Visit(operation[2]));
1647 AddLine("MOV.F {}.y (NE.x), -{}.y;", temporary, temporary);
1648 AddLine("PK2H.F {}.x, {};", temporary, temporary);
1649 return fmt::format("{}.x", temporary);
1650}
1651
1652std::string ARBDecompiler::HClamp(Operation operation) {
1653 const std::string tmp1 = AllocVectorTemporary();
1654 const std::string tmp2 = AllocVectorTemporary();
1655 AddLine("UP2H.F {}.xy, {};", tmp1, Visit(operation[0]));
1656 AddLine("MOV.U {}.x, {};", tmp2, Visit(operation[1]));
1657 AddLine("MOV.U {}.y, {}.x;", tmp2, tmp2);
1658 AddLine("MAX.F {}, {}, {};", tmp1, tmp1, tmp2);
1659 AddLine("MOV.U {}.x, {};", tmp2, Visit(operation[2]));
1660 AddLine("MOV.U {}.y, {}.x;", tmp2, tmp2);
1661 AddLine("MIN.F {}, {}, {};", tmp1, tmp1, tmp2);
1662 AddLine("PK2H.F {}.x, {};", tmp1, tmp1);
1663 return fmt::format("{}.x", tmp1);
1664}
1665
1666std::string ARBDecompiler::HCastFloat(Operation operation) {
1667 const std::string temporary = AllocVectorTemporary();
1668 AddLine("MOV.F {}.y, {{0, 0, 0, 0}};", temporary);
1669 AddLine("MOV.F {}.x, {};", temporary, Visit(operation[0]));
1670 AddLine("PK2H.F {}.x, {};", temporary, temporary);
1671 return fmt::format("{}.x", temporary);
1672}
1673
1674std::string ARBDecompiler::HUnpack(Operation operation) {
1675 const std::string operand = Visit(operation[0]);
1676 switch (std::get<Tegra::Shader::HalfType>(operation.GetMeta())) {
1677 case Tegra::Shader::HalfType::H0_H1:
1678 return operand;
1679 case Tegra::Shader::HalfType::F32: {
1680 const std::string temporary = AllocVectorTemporary();
1681 AddLine("MOV.U {}.x, {};", temporary, operand);
1682 AddLine("MOV.U {}.y, {}.x;", temporary, temporary);
1683 AddLine("PK2H.F {}.x, {};", temporary, temporary);
1684 return fmt::format("{}.x", temporary);
1685 }
1686 case Tegra::Shader::HalfType::H0_H0: {
1687 const std::string temporary = AllocVectorTemporary();
1688 AddLine("UP2H.F {}.xy, {};", temporary, operand);
1689 AddLine("MOV.U {}.y, {}.x;", temporary, temporary);
1690 AddLine("PK2H.F {}.x, {};", temporary, temporary);
1691 return fmt::format("{}.x", temporary);
1692 }
1693 case Tegra::Shader::HalfType::H1_H1: {
1694 const std::string temporary = AllocVectorTemporary();
1695 AddLine("UP2H.F {}.xy, {};", temporary, operand);
1696 AddLine("MOV.U {}.x, {}.y;", temporary, temporary);
1697 AddLine("PK2H.F {}.x, {};", temporary, temporary);
1698 return fmt::format("{}.x", temporary);
1699 }
1700 }
1701 UNREACHABLE();
1702 return "{0, 0, 0, 0}.x";
1703}
1704
1705std::string ARBDecompiler::HMergeF32(Operation operation) {
1706 const std::string temporary = AllocVectorTemporary();
1707 AddLine("UP2H.F {}.xy, {};", temporary, Visit(operation[0]));
1708 return fmt::format("{}.x", temporary);
1709}
1710
1711std::string ARBDecompiler::HMergeH0(Operation operation) {
1712 const std::string temporary = AllocVectorTemporary();
1713 AddLine("UP2H.F {}.xy, {};", temporary, Visit(operation[0]));
1714 AddLine("UP2H.F {}.zw, {};", temporary, Visit(operation[1]));
1715 AddLine("MOV.U {}.x, {}.z;", temporary, temporary);
1716 AddLine("PK2H.F {}.x, {};", temporary, temporary);
1717 return fmt::format("{}.x", temporary);
1718}
1719
1720std::string ARBDecompiler::HMergeH1(Operation operation) {
1721 const std::string temporary = AllocVectorTemporary();
1722 AddLine("UP2H.F {}.xy, {};", temporary, Visit(operation[0]));
1723 AddLine("UP2H.F {}.zw, {};", temporary, Visit(operation[1]));
1724 AddLine("MOV.U {}.y, {}.w;", temporary, temporary);
1725 AddLine("PK2H.F {}.x, {};", temporary, temporary);
1726 return fmt::format("{}.x", temporary);
1727}
1728
1729std::string ARBDecompiler::HPack2(Operation operation) {
1730 const std::string temporary = AllocVectorTemporary();
1731 AddLine("MOV.U {}.x, {};", temporary, Visit(operation[0]));
1732 AddLine("MOV.U {}.y, {};", temporary, Visit(operation[1]));
1733 AddLine("PK2H.F {}.x, {};", temporary, temporary);
1734 return fmt::format("{}.x", temporary);
1735}
1736
1737std::string ARBDecompiler::LogicalAssign(Operation operation) {
1738 const Node& dest = operation[0];
1739 const Node& src = operation[1];
1740
1741 std::string target;
1742
1743 if (const auto pred = std::get_if<PredicateNode>(&*dest)) {
1744 ASSERT_MSG(!pred->IsNegated(), "Negating logical assignment");
1745
1746 const Tegra::Shader::Pred index = pred->GetIndex();
1747 switch (index) {
1748 case Tegra::Shader::Pred::NeverExecute:
1749 case Tegra::Shader::Pred::UnusedIndex:
1750 // Writing to these predicates is a no-op
1751 return {};
1752 }
1753 target = fmt::format("P{}.x", static_cast<u64>(index));
1754 } else if (const auto internal_flag = std::get_if<InternalFlagNode>(&*dest)) {
1755 const std::size_t index = static_cast<std::size_t>(internal_flag->GetFlag());
1756 target = fmt::format("{}.x", INTERNAL_FLAG_NAMES[index]);
1757 } else {
1758 UNREACHABLE();
1759 ResetTemporaries();
1760 return {};
1761 }
1762
1763 AddLine("MOV.U {}, {};", target, Visit(src));
1764 ResetTemporaries();
1765 return {};
1766}
1767
1768std::string ARBDecompiler::LogicalPick2(Operation operation) {
1769 const std::string temporary = AllocTemporary();
1770 const u32 index = std::get<ImmediateNode>(*operation[1]).GetValue();
1771 AddLine("MOV.U {}, {}.{};", temporary, Visit(operation[0]), Swizzle(index));
1772 return temporary;
1773}
1774
1775std::string ARBDecompiler::LogicalAnd2(Operation operation) {
1776 const std::string temporary = AllocTemporary();
1777 const std::string op = Visit(operation[0]);
1778 AddLine("AND.U {}, {}.x, {}.y;", temporary, op, op);
1779 return temporary;
1780}
1781
1782std::string ARBDecompiler::FloatOrdered(Operation operation) {
1783 const std::string temporary = AllocTemporary();
1784 AddLine("MOVC.F32 RC.x, {};", Visit(operation[0]));
1785 AddLine("MOVC.F32 RC.y, {};", Visit(operation[1]));
1786 AddLine("MOV.S {}, -1;", temporary);
1787 AddLine("MOV.S {} (NAN.x), 0;", temporary);
1788 AddLine("MOV.S {} (NAN.y), 0;", temporary);
1789 return temporary;
1790}
1791
1792std::string ARBDecompiler::FloatUnordered(Operation operation) {
1793 const std::string temporary = AllocTemporary();
1794 AddLine("MOVC.F32 RC.x, {};", Visit(operation[0]));
1795 AddLine("MOVC.F32 RC.y, {};", Visit(operation[1]));
1796 AddLine("MOV.S {}, 0;", temporary);
1797 AddLine("MOV.S {} (NAN.x), -1;", temporary);
1798 AddLine("MOV.S {} (NAN.y), -1;", temporary);
1799 return temporary;
1800}
1801
1802std::string ARBDecompiler::LogicalAddCarry(Operation operation) {
1803 const std::string temporary = AllocTemporary();
1804 AddLine("ADDC.U RC, {}, {};", Visit(operation[0]), Visit(operation[1]));
1805 AddLine("MOV.S {}, 0;", temporary);
1806 AddLine("IF CF.x;");
1807 AddLine("MOV.S {}, -1;", temporary);
1808 AddLine("ENDIF;");
1809 return temporary;
1810}
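
The same check can be modelled on the host: ADDC.U sets the carry flag exactly when the 32-bit sum wraps, and the predicate encoding used throughout this decompiler is -1 for true and 0 for false. A minimal sketch:

#include <cstdint>

std::int32_t AddCarryFlag(std::uint32_t a, std::uint32_t b) {
    const std::uint32_t sum = a + b; // unsigned overflow is well defined
    return sum < a ? -1 : 0;         // carry iff the sum wrapped
}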
1811
1812std::string ARBDecompiler::Texture(Operation operation) {
1813 const auto& meta = std::get<MetaTexture>(operation.GetMeta());
1814 const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index;
1815 const auto [temporary, swizzle] = BuildCoords(operation);
1816
1817 std::string_view opcode = "TEX";
1818 std::string extra;
1819 if (meta.bias) {
1820 ASSERT(!meta.lod);
1821 opcode = "TXB";
1822
1823 if (swizzle < 4) {
1824 AddLine("MOV.F {}.w, {};", temporary, Visit(meta.bias));
1825 } else {
1826 const std::string bias = AllocTemporary();
1827 AddLine("MOV.F {}, {};", bias, Visit(meta.bias));
1828 extra = fmt::format(" {},", bias);
1829 }
1830 }
1831 if (meta.lod) {
1832 ASSERT(!meta.bias);
1833 opcode = "TXL";
1834
1835 if (swizzle < 4) {
1836 AddLine("MOV.F {}.w, {};", temporary, Visit(meta.lod));
1837 } else {
1838 const std::string lod = AllocTemporary();
1839 AddLine("MOV.F {}, {};", lod, Visit(meta.lod));
1840 extra = fmt::format(" {},", lod);
1841 }
1842 }
1843
1844 AddLine("{}.F {}, {},{} texture[{}], {}{};", opcode, temporary, temporary, extra, sampler_id,
1845 TextureType(meta), BuildAoffi(operation));
1846 AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element));
1847 return fmt::format("{}.x", temporary);
1848}
1849
1850std::string ARBDecompiler::TextureGather(Operation operation) {
1851 const auto& meta = std::get<MetaTexture>(operation.GetMeta());
1852 const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index;
1853 const auto [temporary, swizzle] = BuildCoords(operation);
1854
1855 std::string comp;
1856 if (!meta.sampler.is_shadow) {
1857 const auto& immediate = std::get<ImmediateNode>(*meta.component);
1858 comp = fmt::format(".{}", Swizzle(immediate.GetValue()));
1859 }
1860
1861 AddLine("TXG.F {}, {}, texture[{}]{}, {}{};", temporary, temporary, sampler_id, comp,
1862 TextureType(meta), BuildAoffi(operation));
1863 AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element));
1864 return fmt::format("{}.x", temporary);
1865}
1866
1867std::string ARBDecompiler::TextureQueryDimensions(Operation operation) {
1868 const auto& meta = std::get<MetaTexture>(operation.GetMeta());
1869 const std::string temporary = AllocVectorTemporary();
1870 const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index;
1871
1872 ASSERT(!meta.sampler.is_array);
1873
1874 const std::string lod = operation.GetOperandsCount() > 0 ? Visit(operation[0]) : "0";
1875 AddLine("TXQ {}, {}, texture[{}], {};", temporary, lod, sampler_id, TextureType(meta));
1876 AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element));
1877 return fmt::format("{}.x", temporary);
1878}
1879
1880std::string ARBDecompiler::TextureQueryLod(Operation operation) {
1881 const auto& meta = std::get<MetaTexture>(operation.GetMeta());
1882 const std::string temporary = AllocVectorTemporary();
1883 const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index;
1884
1885 ASSERT(!meta.sampler.is_array);
1886
1887 const std::size_t count = operation.GetOperandsCount();
1888 for (std::size_t i = 0; i < count; ++i) {
1889 AddLine("MOV.F {}.{}, {};", temporary, Swizzle(i), Visit(operation[i]));
1890 }
1891 AddLine("LOD.F {}, {}, texture[{}], {};", temporary, temporary, sampler_id, TextureType(meta));
1892 AddLine("MUL.F32 {}, {}, {{256, 256, 0, 0}};", temporary, temporary);
1893 AddLine("TRUNC.S {}, {};", temporary, temporary);
1894 AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element));
1895 return fmt::format("{}.x", temporary);
1896}
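
The MUL.F32 by {256, 256} followed by TRUNC.S converts the raw LOD pair into 8.8 fixed point, which is the format guest shaders expect back from this query. The equivalent host-side conversion, as a sketch:

#include <cstdint>

std::int32_t ToFixed88(float lod) {
    return static_cast<std::int32_t>(lod * 256.0f); // TRUNC.S rounds toward zero
}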
1897
1898std::string ARBDecompiler::TexelFetch(Operation operation) {
1899 const auto& meta = std::get<MetaTexture>(operation.GetMeta());
1900 const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index;
1901 const auto [temporary, swizzle] = BuildCoords(operation);
1902
1903 if (!meta.sampler.is_buffer) {
1904 ASSERT(swizzle < 4);
1905 AddLine("MOV.F {}.w, {};", temporary, Visit(meta.lod));
1906 }
1907 AddLine("TXF.F {}, {}, texture[{}], {}{};", temporary, temporary, sampler_id, TextureType(meta),
1908 BuildAoffi(operation));
1909 AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element));
1910 return fmt::format("{}.x", temporary);
1911}
1912
1913std::string ARBDecompiler::TextureGradient(Operation operation) {
1914 const auto& meta = std::get<MetaTexture>(operation.GetMeta());
1915 const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index;
1916 const std::string ddx = AllocVectorTemporary();
1917 const std::string ddy = AllocVectorTemporary();
1918 const std::string coord = BuildCoords(operation).first;
1919
1920 const std::size_t num_components = meta.derivates.size() / 2;
1921 for (std::size_t index = 0; index < num_components; ++index) {
1922 const char swizzle = Swizzle(index);
1923 AddLine("MOV.F {}.{}, {};", ddx, swizzle, Visit(meta.derivates[index * 2]));
1924 AddLine("MOV.F {}.{}, {};", ddy, swizzle, Visit(meta.derivates[index * 2 + 1]));
1925 }
1926
1927 const std::string_view result = coord;
1928 AddLine("TXD.F {}, {}, {}, {}, texture[{}], {}{};", result, coord, ddx, ddy, sampler_id,
1929 TextureType(meta), BuildAoffi(operation));
1930 AddLine("MOV.F {}.x, {}.{};", result, result, Swizzle(meta.element));
1931 return fmt::format("{}.x", result);
1932}
1933
1934std::string ARBDecompiler::ImageLoad(Operation operation) {
1935 const auto& meta = std::get<MetaImage>(operation.GetMeta());
1936 const u32 image_id = device.GetBaseBindings(stage).image + meta.image.index;
1937 const std::size_t count = operation.GetOperandsCount();
1938 const std::string_view type = ImageType(meta.image.type);
1939
1940 const std::string temporary = AllocVectorTemporary();
1941 for (std::size_t i = 0; i < count; ++i) {
1942 AddLine("MOV.S {}.{}, {};", temporary, Swizzle(i), Visit(operation[i]));
1943 }
1944 AddLine("LOADIM.F {}, {}, image[{}], {};", temporary, temporary, image_id, type);
1945 AddLine("MOV.F {}.x, {}.{};", temporary, temporary, Swizzle(meta.element));
1946 return fmt::format("{}.x", temporary);
1947}
1948
1949std::string ARBDecompiler::ImageStore(Operation operation) {
1950 const auto& meta = std::get<MetaImage>(operation.GetMeta());
1951 const u32 image_id = device.GetBaseBindings(stage).image + meta.image.index;
1952 const std::size_t num_coords = operation.GetOperandsCount();
1953 const std::size_t num_values = meta.values.size();
1954 const std::string_view type = ImageType(meta.image.type);
1955
1956 const std::string coord = AllocVectorTemporary();
1957 const std::string value = AllocVectorTemporary();
1958 for (std::size_t i = 0; i < num_coords; ++i) {
1959 AddLine("MOV.S {}.{}, {};", coord, Swizzle(i), Visit(operation[i]));
1960 }
1961 for (std::size_t i = 0; i < num_values; ++i) {
1962 AddLine("MOV.F {}.{}, {};", value, Swizzle(i), Visit(meta.values[i]));
1963 }
1964 AddLine("STOREIM.F image[{}], {}, {}, {};", image_id, value, coord, type);
1965 return {};
1966}
1967
1968std::string ARBDecompiler::Branch(Operation operation) {
1969 const auto target = std::get<ImmediateNode>(*operation[0]);
1970 AddLine("MOV.U PC.x, {};", target.GetValue());
1971 AddLine("CONT;");
1972 return {};
1973}
1974
1975std::string ARBDecompiler::BranchIndirect(Operation operation) {
1976 AddLine("MOV.U PC.x, {};", Visit(operation[0]));
1977 AddLine("CONT;");
1978 return {};
1979}
1980
1981std::string ARBDecompiler::PushFlowStack(Operation operation) {
1982 const auto stack = std::get<MetaStackClass>(operation.GetMeta());
1983 const u32 target = std::get<ImmediateNode>(*operation[0]).GetValue();
1984 const std::string_view stack_name = StackName(stack);
1985 AddLine("MOV.U {}[{}_TOP.x].x, {};", stack_name, stack_name, target);
1986 AddLine("ADD.S {}_TOP.x, {}_TOP.x, 1;", stack_name, stack_name);
1987 return {};
1988}
1989
1990std::string ARBDecompiler::PopFlowStack(Operation operation) {
1991 const auto stack = std::get<MetaStackClass>(operation.GetMeta());
1992 const std::string_view stack_name = StackName(stack);
1993 AddLine("SUB.S {}_TOP.x, {}_TOP.x, 1;", stack_name, stack_name);
1994 AddLine("MOV.U PC.x, {}[{}_TOP.x].x;", stack_name, stack_name);
1995 AddLine("CONT;");
1996 return {};
1997}
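
PushFlowStack and PopFlowStack emulate the hardware SSY/PBK stacks: a push stores a jump target at <stack>_TOP and increments it, a pop decrements it and loads the saved target into PC before CONT re-enters the dispatch loop. A host-side model of the two operations (the capacity and names are illustrative, not the decompiler's):

#include <array>
#include <cstddef>
#include <cstdint>

struct FlowStack {
    std::array<std::uint32_t, 20> targets{};
    std::size_t top = 0;

    void Push(std::uint32_t target) { targets[top++] = target; } // MOV + ADD.S
    std::uint32_t Pop() { return targets[--top]; }               // SUB.S + MOV
};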
1998
1999std::string ARBDecompiler::Exit(Operation) {
2000 Exit();
2001 return {};
2002}
2003
2004std::string ARBDecompiler::Discard(Operation) {
2005 AddLine("KIL TR;");
2006 return {};
2007}
2008
2009std::string ARBDecompiler::EmitVertex(Operation) {
2010 AddLine("EMIT;");
2011 return {};
2012}
2013
2014std::string ARBDecompiler::EndPrimitive(Operation) {
2015 AddLine("ENDPRIM;");
2016 return {};
2017}
2018
2019std::string ARBDecompiler::InvocationId(Operation) {
2020 return "primitive.invocation";
2021}
2022
2023std::string ARBDecompiler::YNegate(Operation) {
2024 LOG_WARNING(Render_OpenGL, "(STUBBED)");
2025 const std::string temporary = AllocTemporary();
2026 AddLine("MOV.F {}, 1;", temporary);
2027 return temporary;
2028}
2029
2030std::string ARBDecompiler::ThreadId(Operation) {
2031 return fmt::format("{}.threadid", StageInputName(stage));
2032}
2033
2034std::string ARBDecompiler::ShuffleIndexed(Operation operation) {
2035 if (!device.HasWarpIntrinsics()) {
2036 LOG_ERROR(Render_OpenGL,
2037 "NV_shader_thread_shuffle is missing. Kepler or better is required.");
2038 return Visit(operation[0]);
2039 }
2040 const std::string temporary = AllocVectorTemporary();
2041 AddLine("SHFIDX.U {}, {}, {}, {{31, 0, 0, 0}};", temporary, Visit(operation[0]),
2042 Visit(operation[1]));
2043 AddLine("MOV.U {}.x, {}.y;", temporary, temporary);
2044 return fmt::format("{}.x", temporary);
2045}
2046
2047std::string ARBDecompiler::Barrier(Operation) {
2048 if (!ir.IsDecompiled()) {
2049 LOG_ERROR(Render_OpenGL, "BAR used but shader is not decompiled");
2050 return {};
2051 }
2052 AddLine("BAR;");
2053 return {};
2054}
2055
2056std::string ARBDecompiler::MemoryBarrierGroup(Operation) {
2057 AddLine("MEMBAR.CTA;");
2058 return {};
2059}
2060
2061std::string ARBDecompiler::MemoryBarrierGlobal(Operation) {
2062 AddLine("MEMBAR;");
2063 return {};
2064}
2065
2066} // Anonymous namespace
2067
2068std::string DecompileAssemblyShader(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
2069 const VideoCommon::Shader::Registry& registry,
2070 Tegra::Engines::ShaderType stage, std::string_view identifier) {
2071 return ARBDecompiler(device, ir, registry, stage, identifier).Code();
2072}
2073
2074} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_arb_decompiler.h b/src/video_core/renderer_opengl/gl_arb_decompiler.h
new file mode 100644
index 000000000..6afc87220
--- /dev/null
+++ b/src/video_core/renderer_opengl/gl_arb_decompiler.h
@@ -0,0 +1,29 @@
1// Copyright 2020 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#pragma once
6
7#include <string>
8#include <string_view>
9
10#include "common/common_types.h"
11
12namespace Tegra::Engines {
13enum class ShaderType : u32;
14}
15
16namespace VideoCommon::Shader {
17class ShaderIR;
18class Registry;
19} // namespace VideoCommon::Shader
20
21namespace OpenGL {
22
23class Device;
24
25std::string DecompileAssemblyShader(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
26 const VideoCommon::Shader::Registry& registry,
27 Tegra::Engines::ShaderType stage, std::string_view identifier);
28
29} // namespace OpenGL
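
A sketch of how a caller might drive this entry point; the surrounding objects come from the shader cache, and the stage, identifier and include paths here are assumptions for illustration:

#include <string>

#include "video_core/engines/shader_type.h"
#include "video_core/renderer_opengl/gl_arb_decompiler.h"

std::string BuildVertexProgram(const OpenGL::Device& device,
                               const VideoCommon::Shader::ShaderIR& ir,
                               const VideoCommon::Shader::Registry& registry) {
    // Returns a complete NV_gpu_program5 listing for the vertex stage.
    return OpenGL::DecompileAssemblyShader(device, ir, registry,
                                           Tegra::Engines::ShaderType::Vertex,
                                           "example shader");
}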
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
index d2cab50bd..ad0577a4f 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
@@ -8,6 +8,7 @@
8 8
9#include "common/assert.h" 9#include "common/assert.h"
10#include "common/microprofile.h" 10#include "common/microprofile.h"
11#include "video_core/buffer_cache/buffer_cache.h"
11#include "video_core/engines/maxwell_3d.h" 12#include "video_core/engines/maxwell_3d.h"
12#include "video_core/rasterizer_interface.h" 13#include "video_core/rasterizer_interface.h"
13#include "video_core/renderer_opengl/gl_buffer_cache.h" 14#include "video_core/renderer_opengl/gl_buffer_cache.h"
@@ -21,13 +22,12 @@ using Maxwell = Tegra::Engines::Maxwell3D::Regs;
21 22
22MICROPROFILE_DEFINE(OpenGL_Buffer_Download, "OpenGL", "Buffer Download", MP_RGB(192, 192, 128)); 23MICROPROFILE_DEFINE(OpenGL_Buffer_Download, "OpenGL", "Buffer Download", MP_RGB(192, 192, 128));
23 24
24CachedBufferBlock::CachedBufferBlock(VAddr cpu_addr, const std::size_t size) 25Buffer::Buffer(VAddr cpu_addr, const std::size_t size) : VideoCommon::BufferBlock{cpu_addr, size} {
25 : VideoCommon::BufferBlock{cpu_addr, size} {
26 gl_buffer.Create(); 26 gl_buffer.Create();
27 glNamedBufferData(gl_buffer.handle, static_cast<GLsizeiptr>(size), nullptr, GL_DYNAMIC_DRAW); 27 glNamedBufferData(gl_buffer.handle, static_cast<GLsizeiptr>(size), nullptr, GL_DYNAMIC_DRAW);
28} 28}
29 29
30CachedBufferBlock::~CachedBufferBlock() = default; 30Buffer::~Buffer() = default;
31 31
32OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system, 32OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system,
33 const Device& device, std::size_t stream_size) 33 const Device& device, std::size_t stream_size)
@@ -47,12 +47,8 @@ OGLBufferCache::~OGLBufferCache() {
47 glDeleteBuffers(static_cast<GLsizei>(std::size(cbufs)), std::data(cbufs)); 47 glDeleteBuffers(static_cast<GLsizei>(std::size(cbufs)), std::data(cbufs));
48} 48}
49 49
50Buffer OGLBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) { 50std::shared_ptr<Buffer> OGLBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) {
51 return std::make_shared<CachedBufferBlock>(cpu_addr, size); 51 return std::make_shared<Buffer>(cpu_addr, size);
52}
53
54GLuint OGLBufferCache::ToHandle(const Buffer& buffer) {
55 return buffer->GetHandle();
56} 52}
57 53
58GLuint OGLBufferCache::GetEmptyBuffer(std::size_t) { 54GLuint OGLBufferCache::GetEmptyBuffer(std::size_t) {
@@ -61,7 +57,7 @@ GLuint OGLBufferCache::GetEmptyBuffer(std::size_t) {
61 57
62void OGLBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, 58void OGLBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
63 const u8* data) { 59 const u8* data) {
64 glNamedBufferSubData(buffer->GetHandle(), static_cast<GLintptr>(offset), 60 glNamedBufferSubData(buffer.Handle(), static_cast<GLintptr>(offset),
65 static_cast<GLsizeiptr>(size), data); 61 static_cast<GLsizeiptr>(size), data);
66} 62}
67 63
@@ -69,20 +65,20 @@ void OGLBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset,
69 u8* data) { 65 u8* data) {
70 MICROPROFILE_SCOPE(OpenGL_Buffer_Download); 66 MICROPROFILE_SCOPE(OpenGL_Buffer_Download);
71 glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT); 67 glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT);
72 glGetNamedBufferSubData(buffer->GetHandle(), static_cast<GLintptr>(offset), 68 glGetNamedBufferSubData(buffer.Handle(), static_cast<GLintptr>(offset),
73 static_cast<GLsizeiptr>(size), data); 69 static_cast<GLsizeiptr>(size), data);
74} 70}
75 71
76void OGLBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset, 72void OGLBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
77 std::size_t dst_offset, std::size_t size) { 73 std::size_t dst_offset, std::size_t size) {
78 glCopyNamedBufferSubData(src->GetHandle(), dst->GetHandle(), static_cast<GLintptr>(src_offset), 74 glCopyNamedBufferSubData(src.Handle(), dst.Handle(), static_cast<GLintptr>(src_offset),
79 static_cast<GLintptr>(dst_offset), static_cast<GLsizeiptr>(size)); 75 static_cast<GLintptr>(dst_offset), static_cast<GLsizeiptr>(size));
80} 76}
81 77
82OGLBufferCache::BufferInfo OGLBufferCache::ConstBufferUpload(const void* raw_pointer, 78OGLBufferCache::BufferInfo OGLBufferCache::ConstBufferUpload(const void* raw_pointer,
83 std::size_t size) { 79 std::size_t size) {
84 DEBUG_ASSERT(cbuf_cursor < std::size(cbufs)); 80 DEBUG_ASSERT(cbuf_cursor < std::size(cbufs));
85 const GLuint& cbuf = cbufs[cbuf_cursor++]; 81 const GLuint cbuf = cbufs[cbuf_cursor++];
86 glNamedBufferSubData(cbuf, 0, static_cast<GLsizeiptr>(size), raw_pointer); 82 glNamedBufferSubData(cbuf, 0, static_cast<GLsizeiptr>(size), raw_pointer);
87 return {cbuf, 0}; 83 return {cbuf, 0};
88} 84}
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h
index a9e86cfc7..a49aaf9c4 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.h
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.h
@@ -10,7 +10,6 @@
10#include "common/common_types.h" 10#include "common/common_types.h"
11#include "video_core/buffer_cache/buffer_cache.h" 11#include "video_core/buffer_cache/buffer_cache.h"
12#include "video_core/engines/maxwell_3d.h" 12#include "video_core/engines/maxwell_3d.h"
13#include "video_core/rasterizer_cache.h"
14#include "video_core/renderer_opengl/gl_resource_manager.h" 13#include "video_core/renderer_opengl/gl_resource_manager.h"
15#include "video_core/renderer_opengl/gl_stream_buffer.h" 14#include "video_core/renderer_opengl/gl_stream_buffer.h"
16 15
@@ -24,17 +23,12 @@ class Device;
24class OGLStreamBuffer; 23class OGLStreamBuffer;
25class RasterizerOpenGL; 24class RasterizerOpenGL;
26 25
27class CachedBufferBlock; 26class Buffer : public VideoCommon::BufferBlock {
28
29using Buffer = std::shared_ptr<CachedBufferBlock>;
30using GenericBufferCache = VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>;
31
32class CachedBufferBlock : public VideoCommon::BufferBlock {
33public: 27public:
34 explicit CachedBufferBlock(VAddr cpu_addr, const std::size_t size); 28 explicit Buffer(VAddr cpu_addr, const std::size_t size);
35 ~CachedBufferBlock(); 29 ~Buffer();
36 30
37 GLuint GetHandle() const { 31 GLuint Handle() const {
38 return gl_buffer.handle; 32 return gl_buffer.handle;
39 } 33 }
40 34
@@ -42,6 +36,7 @@ private:
42 OGLBuffer gl_buffer; 36 OGLBuffer gl_buffer;
43}; 37};
44 38
39using GenericBufferCache = VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>;
45class OGLBufferCache final : public GenericBufferCache { 40class OGLBufferCache final : public GenericBufferCache {
46public: 41public:
47 explicit OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system, 42 explicit OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system,
@@ -55,9 +50,7 @@ public:
55 } 50 }
56 51
57protected: 52protected:
58 Buffer CreateBlock(VAddr cpu_addr, std::size_t size) override; 53 std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) override;
59
60 GLuint ToHandle(const Buffer& buffer) override;
61 54
62 void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, 55 void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
63 const u8* data) override; 56 const u8* data) override;
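
ConstBufferUpload above hands out pre-created constant buffers through cbuf_cursor, which Acquire() resets at the start of each draw or dispatch. The pattern in isolation (handle type and capacity are placeholders):

#include <array>
#include <cstddef>

template <typename Handle, std::size_t N>
class HandleRing {
public:
    // Returns the next pre-created handle; the caller guarantees that no
    // more than N handles are taken between Reset() calls.
    Handle Next() { return handles[cursor++]; }
    void Reset() { cursor = 0; }

    std::array<Handle, N> handles{};

private:
    std::size_t cursor = 0;
};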
diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp
index d83dca25a..e245e27ec 100644
--- a/src/video_core/renderer_opengl/gl_device.cpp
+++ b/src/video_core/renderer_opengl/gl_device.cpp
@@ -6,6 +6,7 @@
6#include <array> 6#include <array>
7#include <cstddef> 7#include <cstddef>
8#include <cstring> 8#include <cstring>
9#include <limits>
9#include <optional> 10#include <optional>
10#include <vector> 11#include <vector>
11 12
@@ -13,6 +14,7 @@
13 14
14#include "common/logging/log.h" 15#include "common/logging/log.h"
15#include "common/scope_exit.h" 16#include "common/scope_exit.h"
17#include "core/settings.h"
16#include "video_core/renderer_opengl/gl_device.h" 18#include "video_core/renderer_opengl/gl_device.h"
17#include "video_core/renderer_opengl/gl_resource_manager.h" 19#include "video_core/renderer_opengl/gl_resource_manager.h"
18 20
@@ -25,24 +27,27 @@ constexpr u32 ReservedUniformBlocks = 1;
25 27
26constexpr u32 NumStages = 5; 28constexpr u32 NumStages = 5;
27 29
28constexpr std::array LimitUBOs = {GL_MAX_VERTEX_UNIFORM_BLOCKS, GL_MAX_TESS_CONTROL_UNIFORM_BLOCKS, 30constexpr std::array LimitUBOs = {
29 GL_MAX_TESS_EVALUATION_UNIFORM_BLOCKS, 31 GL_MAX_VERTEX_UNIFORM_BLOCKS, GL_MAX_TESS_CONTROL_UNIFORM_BLOCKS,
30 GL_MAX_GEOMETRY_UNIFORM_BLOCKS, GL_MAX_FRAGMENT_UNIFORM_BLOCKS}; 32 GL_MAX_TESS_EVALUATION_UNIFORM_BLOCKS, GL_MAX_GEOMETRY_UNIFORM_BLOCKS,
33 GL_MAX_FRAGMENT_UNIFORM_BLOCKS, GL_MAX_COMPUTE_UNIFORM_BLOCKS};
31 34
32constexpr std::array LimitSSBOs = { 35constexpr std::array LimitSSBOs = {
33 GL_MAX_VERTEX_SHADER_STORAGE_BLOCKS, GL_MAX_TESS_CONTROL_SHADER_STORAGE_BLOCKS, 36 GL_MAX_VERTEX_SHADER_STORAGE_BLOCKS, GL_MAX_TESS_CONTROL_SHADER_STORAGE_BLOCKS,
34 GL_MAX_TESS_EVALUATION_SHADER_STORAGE_BLOCKS, GL_MAX_GEOMETRY_SHADER_STORAGE_BLOCKS, 37 GL_MAX_TESS_EVALUATION_SHADER_STORAGE_BLOCKS, GL_MAX_GEOMETRY_SHADER_STORAGE_BLOCKS,
35 GL_MAX_FRAGMENT_SHADER_STORAGE_BLOCKS}; 38 GL_MAX_FRAGMENT_SHADER_STORAGE_BLOCKS, GL_MAX_COMPUTE_SHADER_STORAGE_BLOCKS};
36 39
37constexpr std::array LimitSamplers = { 40constexpr std::array LimitSamplers = {GL_MAX_VERTEX_TEXTURE_IMAGE_UNITS,
38 GL_MAX_VERTEX_TEXTURE_IMAGE_UNITS, GL_MAX_TESS_CONTROL_TEXTURE_IMAGE_UNITS, 41 GL_MAX_TESS_CONTROL_TEXTURE_IMAGE_UNITS,
39 GL_MAX_TESS_EVALUATION_TEXTURE_IMAGE_UNITS, GL_MAX_GEOMETRY_TEXTURE_IMAGE_UNITS, 42 GL_MAX_TESS_EVALUATION_TEXTURE_IMAGE_UNITS,
40 GL_MAX_TEXTURE_IMAGE_UNITS}; 43 GL_MAX_GEOMETRY_TEXTURE_IMAGE_UNITS,
44 GL_MAX_TEXTURE_IMAGE_UNITS,
45 GL_MAX_COMPUTE_TEXTURE_IMAGE_UNITS};
41 46
42constexpr std::array LimitImages = {GL_MAX_VERTEX_IMAGE_UNIFORMS, 47constexpr std::array LimitImages = {
43 GL_MAX_TESS_CONTROL_IMAGE_UNIFORMS, 48 GL_MAX_VERTEX_IMAGE_UNIFORMS, GL_MAX_TESS_CONTROL_IMAGE_UNIFORMS,
44 GL_MAX_TESS_EVALUATION_IMAGE_UNIFORMS, 49 GL_MAX_TESS_EVALUATION_IMAGE_UNIFORMS, GL_MAX_GEOMETRY_IMAGE_UNIFORMS,
45 GL_MAX_GEOMETRY_IMAGE_UNIFORMS, GL_MAX_FRAGMENT_IMAGE_UNIFORMS}; 50 GL_MAX_FRAGMENT_IMAGE_UNIFORMS, GL_MAX_COMPUTE_IMAGE_UNIFORMS};
46 51
47template <typename T> 52template <typename T>
48T GetInteger(GLenum pname) { 53T GetInteger(GLenum pname) {
@@ -84,6 +89,13 @@ u32 Extract(u32& base, u32& num, u32 amount, std::optional<GLenum> limit = {}) {
84 return std::exchange(base, base + amount); 89 return std::exchange(base, base + amount);
85} 90}
86 91
92std::array<u32, Tegra::Engines::MaxShaderTypes> BuildMaxUniformBuffers() noexcept {
93 std::array<u32, Tegra::Engines::MaxShaderTypes> max;
94 std::transform(LimitUBOs.begin(), LimitUBOs.end(), max.begin(),
95 [](GLenum pname) { return GetInteger<u32>(pname); });
96 return max;
97}
98
87std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindings() noexcept { 99std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindings() noexcept {
88 std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> bindings; 100 std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> bindings;
89 101
@@ -132,6 +144,7 @@ std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindin
132} 144}
133 145
134bool IsASTCSupported() { 146bool IsASTCSupported() {
147 static constexpr std::array targets = {GL_TEXTURE_2D, GL_TEXTURE_2D_ARRAY};
135 static constexpr std::array formats = { 148 static constexpr std::array formats = {
136 GL_COMPRESSED_RGBA_ASTC_4x4_KHR, GL_COMPRESSED_RGBA_ASTC_5x4_KHR, 149 GL_COMPRESSED_RGBA_ASTC_4x4_KHR, GL_COMPRESSED_RGBA_ASTC_5x4_KHR,
137 GL_COMPRESSED_RGBA_ASTC_5x5_KHR, GL_COMPRESSED_RGBA_ASTC_6x5_KHR, 150 GL_COMPRESSED_RGBA_ASTC_5x5_KHR, GL_COMPRESSED_RGBA_ASTC_6x5_KHR,
@@ -148,25 +161,43 @@ bool IsASTCSupported() {
148 GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x8_KHR, GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x10_KHR, 161 GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x8_KHR, GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x10_KHR,
149 GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x10_KHR, GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x12_KHR, 162 GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x10_KHR, GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x12_KHR,
150 }; 163 };
151 return std::find_if_not(formats.begin(), formats.end(), [](GLenum format) { 164 static constexpr std::array required_support = {
152 GLint supported; 165 GL_VERTEX_TEXTURE, GL_TESS_CONTROL_TEXTURE, GL_TESS_EVALUATION_TEXTURE,
153 glGetInternalformativ(GL_TEXTURE_2D, format, GL_INTERNALFORMAT_SUPPORTED, 1, 166 GL_GEOMETRY_TEXTURE, GL_FRAGMENT_TEXTURE, GL_COMPUTE_TEXTURE,
154 &supported); 167 };
155 return supported == GL_TRUE; 168
156 }) == formats.end(); 169 for (const GLenum target : targets) {
170 for (const GLenum format : formats) {
171 for (const GLenum support : required_support) {
172 GLint value;
 173 glGetInternalformativ(target, format, support, 1, &value);
174 if (value != GL_FULL_SUPPORT) {
175 return false;
176 }
177 }
178 }
179 }
180 return true;
157} 181}
158 182
159} // Anonymous namespace 183} // Anonymous namespace
160 184
161Device::Device() : base_bindings{BuildBaseBindings()} { 185Device::Device()
186 : max_uniform_buffers{BuildMaxUniformBuffers()}, base_bindings{BuildBaseBindings()} {
162 const std::string_view vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR)); 187 const std::string_view vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR));
163 const auto renderer = reinterpret_cast<const char*>(glGetString(GL_RENDERER)); 188 const std::string_view version = reinterpret_cast<const char*>(glGetString(GL_VERSION));
164 const std::vector extensions = GetExtensions(); 189 const std::vector extensions = GetExtensions();
165 190
166 const bool is_nvidia = vendor == "NVIDIA Corporation"; 191 const bool is_nvidia = vendor == "NVIDIA Corporation";
167 const bool is_amd = vendor == "ATI Technologies Inc."; 192 const bool is_amd = vendor == "ATI Technologies Inc.";
168 const bool is_intel = vendor == "Intel"; 193
169 const bool is_intel_proprietary = is_intel && std::strstr(renderer, "Mesa") == nullptr; 194 bool disable_fast_buffer_sub_data = false;
195 if (is_nvidia && version == "4.6.0 NVIDIA 443.24") {
196 LOG_WARNING(
197 Render_OpenGL,
198 "Beta driver 443.24 is known to have issues. There might be performance issues.");
199 disable_fast_buffer_sub_data = true;
200 }
170 201
171 uniform_buffer_alignment = GetInteger<std::size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT); 202 uniform_buffer_alignment = GetInteger<std::size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT);
172 shader_storage_alignment = GetInteger<std::size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT); 203 shader_storage_alignment = GetInteger<std::size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT);
@@ -181,16 +212,25 @@ Device::Device() : base_bindings{BuildBaseBindings()} {
181 has_variable_aoffi = TestVariableAoffi(); 212 has_variable_aoffi = TestVariableAoffi();
182 has_component_indexing_bug = is_amd; 213 has_component_indexing_bug = is_amd;
183 has_precise_bug = TestPreciseBug(); 214 has_precise_bug = TestPreciseBug();
184 has_broken_compute = is_intel_proprietary; 215 has_fast_buffer_sub_data = is_nvidia && !disable_fast_buffer_sub_data;
185 has_fast_buffer_sub_data = is_nvidia; 216 has_nv_viewport_array2 = GLAD_GL_NV_viewport_array2;
217 use_assembly_shaders = Settings::values.use_assembly_shaders && GLAD_GL_NV_gpu_program5 &&
218 GLAD_GL_NV_compute_program5 && GLAD_GL_NV_transform_feedback &&
219 GLAD_GL_NV_transform_feedback2;
186 220
187 LOG_INFO(Render_OpenGL, "Renderer_VariableAOFFI: {}", has_variable_aoffi); 221 LOG_INFO(Render_OpenGL, "Renderer_VariableAOFFI: {}", has_variable_aoffi);
188 LOG_INFO(Render_OpenGL, "Renderer_ComponentIndexingBug: {}", has_component_indexing_bug); 222 LOG_INFO(Render_OpenGL, "Renderer_ComponentIndexingBug: {}", has_component_indexing_bug);
189 LOG_INFO(Render_OpenGL, "Renderer_PreciseBug: {}", has_precise_bug); 223 LOG_INFO(Render_OpenGL, "Renderer_PreciseBug: {}", has_precise_bug);
224
225 if (Settings::values.use_assembly_shaders && !use_assembly_shaders) {
226 LOG_ERROR(Render_OpenGL, "Assembly shaders enabled but not supported");
227 }
190} 228}
191 229
192Device::Device(std::nullptr_t) { 230Device::Device(std::nullptr_t) {
193 uniform_buffer_alignment = 0; 231 max_uniform_buffers.fill(std::numeric_limits<u32>::max());
232 uniform_buffer_alignment = 4;
233 shader_storage_alignment = 4;
194 max_vertex_attributes = 16; 234 max_vertex_attributes = 16;
195 max_varyings = 15; 235 max_varyings = 15;
196 has_warp_intrinsics = true; 236 has_warp_intrinsics = true;
@@ -198,9 +238,6 @@ Device::Device(std::nullptr_t) {
198 has_vertex_viewport_layer = true; 238 has_vertex_viewport_layer = true;
199 has_image_load_formatted = true; 239 has_image_load_formatted = true;
200 has_variable_aoffi = true; 240 has_variable_aoffi = true;
201 has_component_indexing_bug = false;
202 has_broken_compute = false;
203 has_precise_bug = false;
204} 241}
205 242
206bool Device::TestVariableAoffi() { 243bool Device::TestVariableAoffi() {
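
BuildMaxUniformBuffers and BuildBaseBindings both reduce to the same idiom: map an array of GL limit enums to their queried values. Stripped of GL state, the idiom looks like this (the query callback stands in for GetInteger):

#include <algorithm>
#include <array>
#include <cstddef>

template <std::size_t N, typename Query>
std::array<unsigned, N> QueryLimits(const std::array<unsigned, N>& pnames, Query&& query) {
    std::array<unsigned, N> limits{};
    std::transform(pnames.begin(), pnames.end(), limits.begin(), query);
    return limits;
}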
diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h
index a55050cb5..145347943 100644
--- a/src/video_core/renderer_opengl/gl_device.h
+++ b/src/video_core/renderer_opengl/gl_device.h
@@ -24,6 +24,10 @@ public:
24 explicit Device(); 24 explicit Device();
25 explicit Device(std::nullptr_t); 25 explicit Device(std::nullptr_t);
26 26
27 u32 GetMaxUniformBuffers(Tegra::Engines::ShaderType shader_type) const noexcept {
28 return max_uniform_buffers[static_cast<std::size_t>(shader_type)];
29 }
30
27 const BaseBindings& GetBaseBindings(std::size_t stage_index) const noexcept { 31 const BaseBindings& GetBaseBindings(std::size_t stage_index) const noexcept {
28 return base_bindings[stage_index]; 32 return base_bindings[stage_index];
29 } 33 }
@@ -80,19 +84,24 @@ public:
80 return has_precise_bug; 84 return has_precise_bug;
81 } 85 }
82 86
83 bool HasBrokenCompute() const {
84 return has_broken_compute;
85 }
86
87 bool HasFastBufferSubData() const { 87 bool HasFastBufferSubData() const {
88 return has_fast_buffer_sub_data; 88 return has_fast_buffer_sub_data;
89 } 89 }
90 90
91 bool HasNvViewportArray2() const {
92 return has_nv_viewport_array2;
93 }
94
95 bool UseAssemblyShaders() const {
96 return use_assembly_shaders;
97 }
98
91private: 99private:
92 static bool TestVariableAoffi(); 100 static bool TestVariableAoffi();
93 static bool TestPreciseBug(); 101 static bool TestPreciseBug();
94 102
95 std::array<BaseBindings, Tegra::Engines::MaxShaderTypes> base_bindings; 103 std::array<u32, Tegra::Engines::MaxShaderTypes> max_uniform_buffers{};
104 std::array<BaseBindings, Tegra::Engines::MaxShaderTypes> base_bindings{};
96 std::size_t uniform_buffer_alignment{}; 105 std::size_t uniform_buffer_alignment{};
97 std::size_t shader_storage_alignment{}; 106 std::size_t shader_storage_alignment{};
98 u32 max_vertex_attributes{}; 107 u32 max_vertex_attributes{};
@@ -105,8 +114,9 @@ private:
105 bool has_variable_aoffi{}; 114 bool has_variable_aoffi{};
106 bool has_component_indexing_bug{}; 115 bool has_component_indexing_bug{};
107 bool has_precise_bug{}; 116 bool has_precise_bug{};
108 bool has_broken_compute{};
109 bool has_fast_buffer_sub_data{}; 117 bool has_fast_buffer_sub_data{};
118 bool has_nv_viewport_array2{};
119 bool use_assembly_shaders{};
110}; 120};
111 121
112} // namespace OpenGL 122} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_fence_manager.cpp b/src/video_core/renderer_opengl/gl_fence_manager.cpp
index 99ddcb3f8..ec5421afa 100644
--- a/src/video_core/renderer_opengl/gl_fence_manager.cpp
+++ b/src/video_core/renderer_opengl/gl_fence_manager.cpp
@@ -4,6 +4,7 @@
4 4
5#include "common/assert.h" 5#include "common/assert.h"
6 6
7#include "video_core/renderer_opengl/gl_buffer_cache.h"
7#include "video_core/renderer_opengl/gl_fence_manager.h" 8#include "video_core/renderer_opengl/gl_fence_manager.h"
8 9
9namespace OpenGL { 10namespace OpenGL {
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 69dcf952f..2d6c11320 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -30,6 +30,7 @@
30#include "video_core/renderer_opengl/gl_shader_cache.h" 30#include "video_core/renderer_opengl/gl_shader_cache.h"
31#include "video_core/renderer_opengl/maxwell_to_gl.h" 31#include "video_core/renderer_opengl/maxwell_to_gl.h"
32#include "video_core/renderer_opengl/renderer_opengl.h" 32#include "video_core/renderer_opengl/renderer_opengl.h"
33#include "video_core/shader_cache.h"
33 34
34namespace OpenGL { 35namespace OpenGL {
35 36
@@ -54,15 +55,33 @@ MICROPROFILE_DEFINE(OpenGL_PrimitiveAssembly, "OpenGL", "Prim Asmbl", MP_RGB(255
54 55
55namespace { 56namespace {
56 57
58constexpr std::size_t NUM_CONST_BUFFERS_PER_STAGE = 18;
59constexpr std::size_t NUM_CONST_BUFFERS_BYTES_PER_STAGE =
60 NUM_CONST_BUFFERS_PER_STAGE * Maxwell::MaxConstBufferSize;
61constexpr std::size_t TOTAL_CONST_BUFFER_BYTES =
62 NUM_CONST_BUFFERS_BYTES_PER_STAGE * Maxwell::MaxShaderStage;
63
57constexpr std::size_t NumSupportedVertexAttributes = 16; 64constexpr std::size_t NumSupportedVertexAttributes = 16;
58 65
59template <typename Engine, typename Entry> 66template <typename Engine, typename Entry>
60Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry& entry, 67Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry& entry,
61 ShaderType shader_type, std::size_t index = 0) { 68 ShaderType shader_type, std::size_t index = 0) {
69 if constexpr (std::is_same_v<Entry, SamplerEntry>) {
70 if (entry.is_separated) {
71 const u32 buffer_1 = entry.buffer;
72 const u32 buffer_2 = entry.secondary_buffer;
73 const u32 offset_1 = entry.offset;
74 const u32 offset_2 = entry.secondary_offset;
75 const u32 handle_1 = engine.AccessConstBuffer32(shader_type, buffer_1, offset_1);
76 const u32 handle_2 = engine.AccessConstBuffer32(shader_type, buffer_2, offset_2);
77 return engine.GetTextureInfo(handle_1 | handle_2);
78 }
79 }
62 if (entry.is_bindless) { 80 if (entry.is_bindless) {
63 const auto tex_handle = engine.AccessConstBuffer32(shader_type, entry.buffer, entry.offset); 81 const u32 handle = engine.AccessConstBuffer32(shader_type, entry.buffer, entry.offset);
64 return engine.GetTextureInfo(tex_handle); 82 return engine.GetTextureInfo(handle);
65 } 83 }
84
66 const auto& gpu_profile = engine.AccessGuestDriverProfile(); 85 const auto& gpu_profile = engine.AccessGuestDriverProfile();
67 const u32 offset = entry.offset + static_cast<u32>(index * gpu_profile.GetTextureHandlerSize()); 86 const u32 offset = entry.offset + static_cast<u32>(index * gpu_profile.GetTextureHandlerSize());
68 if constexpr (std::is_same_v<Engine, Tegra::Engines::Maxwell3D>) { 87 if constexpr (std::is_same_v<Engine, Tegra::Engines::Maxwell3D>) {
@@ -87,6 +106,34 @@ std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buffer,
87 return buffer.size; 106 return buffer.size;
88} 107}
89 108
109/// Translates hardware transform feedback indices
110/// @param location Hardware location
111/// @return Pair of ARB_transform_feedback3 token stream first and third arguments
112/// @note Read https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_transform_feedback3.txt
113std::pair<GLint, GLint> TransformFeedbackEnum(u8 location) {
114 const u8 index = location / 4;
115 if (index >= 8 && index <= 39) {
116 return {GL_GENERIC_ATTRIB_NV, index - 8};
117 }
118 if (index >= 48 && index <= 55) {
119 return {GL_TEXTURE_COORD_NV, index - 48};
120 }
121 switch (index) {
122 case 7:
123 return {GL_POSITION, 0};
124 case 40:
125 return {GL_PRIMARY_COLOR_NV, 0};
126 case 41:
127 return {GL_SECONDARY_COLOR_NV, 0};
128 case 42:
129 return {GL_BACK_PRIMARY_COLOR_NV, 0};
130 case 43:
131 return {GL_BACK_SECONDARY_COLOR_NV, 0};
132 }
133 UNIMPLEMENTED_MSG("index={}", static_cast<int>(index));
134 return {GL_POSITION, 0};
135}
136
90void oglEnable(GLenum cap, bool state) { 137void oglEnable(GLenum cap, bool state) {
91 (state ? glEnable : glDisable)(cap); 138 (state ? glEnable : glDisable)(cap);
92} 139}
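
Spot checks for TransformFeedbackEnum above. Locations count 32-bit components, so dividing by 4 recovers the attribute index used in the ARB_transform_feedback3 table (a hypothetical harness; the function itself lives in an anonymous namespace):

// location 28  -> index 7  -> {GL_POSITION, 0}
// location 32  -> index 8  -> {GL_GENERIC_ATTRIB_NV, 0}
// location 192 -> index 48 -> {GL_TEXTURE_COORD_NV, 0}
static_assert(28 / 4 == 7 && 32 / 4 == 8 && 192 / 4 == 48);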
@@ -94,17 +141,33 @@ void oglEnable(GLenum cap, bool state) {
94} // Anonymous namespace 141} // Anonymous namespace
95 142
96RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window, 143RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window,
97 ScreenInfo& info, GLShader::ProgramManager& program_manager, 144 const Device& device, ScreenInfo& info,
98 StateTracker& state_tracker) 145 ProgramManager& program_manager, StateTracker& state_tracker)
99 : RasterizerAccelerated{system.Memory()}, texture_cache{system, *this, device, state_tracker}, 146 : RasterizerAccelerated{system.Memory()}, device{device}, texture_cache{system, *this, device,
147 state_tracker},
100 shader_cache{*this, system, emu_window, device}, query_cache{system, *this}, 148 shader_cache{*this, system, emu_window, device}, query_cache{system, *this},
101 buffer_cache{*this, system, device, STREAM_BUFFER_SIZE}, 149 buffer_cache{*this, system, device, STREAM_BUFFER_SIZE},
102 fence_manager{system, *this, texture_cache, buffer_cache, query_cache}, system{system}, 150 fence_manager{system, *this, texture_cache, buffer_cache, query_cache}, system{system},
103 screen_info{info}, program_manager{program_manager}, state_tracker{state_tracker} { 151 screen_info{info}, program_manager{program_manager}, state_tracker{state_tracker} {
104 CheckExtensions(); 152 CheckExtensions();
153
154 unified_uniform_buffer.Create();
155 glNamedBufferStorage(unified_uniform_buffer.handle, TOTAL_CONST_BUFFER_BYTES, nullptr, 0);
156
157 if (device.UseAssemblyShaders()) {
158 glCreateBuffers(static_cast<GLsizei>(staging_cbufs.size()), staging_cbufs.data());
159 for (const GLuint cbuf : staging_cbufs) {
160 glNamedBufferStorage(cbuf, static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize),
161 nullptr, 0);
162 }
163 }
105} 164}
106 165
107RasterizerOpenGL::~RasterizerOpenGL() {} 166RasterizerOpenGL::~RasterizerOpenGL() {
167 if (device.UseAssemblyShaders()) {
168 glDeleteBuffers(static_cast<GLsizei>(staging_cbufs.size()), staging_cbufs.data());
169 }
170}
108 171
109void RasterizerOpenGL::CheckExtensions() { 172void RasterizerOpenGL::CheckExtensions() {
110 if (!GLAD_GL_ARB_texture_filter_anisotropic && !GLAD_GL_EXT_texture_filter_anisotropic) { 173 if (!GLAD_GL_ARB_texture_filter_anisotropic && !GLAD_GL_EXT_texture_filter_anisotropic) {
@@ -230,6 +293,7 @@ GLintptr RasterizerOpenGL::SetupIndexBuffer() {
230void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { 293void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
231 MICROPROFILE_SCOPE(OpenGL_Shader); 294 MICROPROFILE_SCOPE(OpenGL_Shader);
232 auto& gpu = system.GPU().Maxwell3D(); 295 auto& gpu = system.GPU().Maxwell3D();
296 std::size_t num_ssbos = 0;
233 u32 clip_distances = 0; 297 u32 clip_distances = 0;
234 298
235 for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) { 299 for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) {
@@ -259,7 +323,15 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
259 continue; 323 continue;
260 } 324 }
261 325
262 Shader shader{shader_cache.GetStageProgram(program)}; 326 Shader* const shader = shader_cache.GetStageProgram(program);
327
328 if (device.UseAssemblyShaders()) {
329 // Check for ARB limitation. We only have 16 SSBOs per context state. To workaround this
330 // all stages share the same bindings.
331 const std::size_t num_stage_ssbos = shader->GetEntries().global_memory_entries.size();
332 ASSERT_MSG(num_stage_ssbos == 0 || num_ssbos == 0, "SSBOs on more than one stage");
333 num_ssbos += num_stage_ssbos;
334 }
263 335
264 // Stage indices are 0 - 5 336 // Stage indices are 0 - 5
265 const std::size_t stage = index == 0 ? 0 : index - 1; 337 const std::size_t stage = index == 0 ? 0 : index - 1;
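
The assertion above models the ARB constraint: with assembly shaders, all stages share one set of 16 SSBO bindings, so at most one stage per draw may use global memory. The check in isolation (the per-stage counts are illustrative inputs for the five graphics stages):

#include <array>
#include <cstddef>

bool SsbosOnAtMostOneStage(const std::array<std::size_t, 5>& per_stage_ssbos) {
    std::size_t stages_using_ssbos = 0;
    for (const std::size_t count : per_stage_ssbos) {
        stages_using_ssbos += count != 0 ? 1 : 0;
    }
    return stages_using_ssbos <= 1;
}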
@@ -526,6 +598,7 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
526 SyncFramebufferSRGB(); 598 SyncFramebufferSRGB();
527 599
528 buffer_cache.Acquire(); 600 buffer_cache.Acquire();
601 current_cbuf = 0;
529 602
530 std::size_t buffer_size = CalculateVertexArraysSize(); 603 std::size_t buffer_size = CalculateVertexArraysSize();
531 604
@@ -535,16 +608,25 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
535 } 608 }
536 609
537 // Uniform space for the 5 shader stages 610 // Uniform space for the 5 shader stages
538 buffer_size = Common::AlignUp<std::size_t>(buffer_size, 4) + 611 buffer_size =
539 (sizeof(GLShader::MaxwellUniformData) + device.GetUniformBufferAlignment()) * 612 Common::AlignUp<std::size_t>(buffer_size, 4) +
540 Maxwell::MaxShaderStage; 613 (sizeof(MaxwellUniformData) + device.GetUniformBufferAlignment()) * Maxwell::MaxShaderStage;
541 614
542 // Add space for at least 18 constant buffers 615 // Add space for at least 18 constant buffers
543 buffer_size += Maxwell::MaxConstBuffers * 616 buffer_size += Maxwell::MaxConstBuffers *
544 (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment()); 617 (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment());
545 618
546 // Prepare the vertex array. 619 // Prepare the vertex array.
547 buffer_cache.Map(buffer_size); 620 const bool invalidated = buffer_cache.Map(buffer_size);
621
622 if (invalidated) {
623 // When the stream buffer has been invalidated, we have to consider vertex buffers as dirty
624 auto& dirty = gpu.dirty.flags;
625 dirty[Dirty::VertexBuffers] = true;
626 for (int index = Dirty::VertexBuffer0; index <= Dirty::VertexBuffer31; ++index) {
627 dirty[index] = true;
628 }
629 }
548 630
549 // Prepare vertex array format. 631 // Prepare vertex array format.
550 SetupVertexFormat(); 632 SetupVertexFormat();
@@ -558,12 +640,14 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
558 } 640 }
559 641
560 // Setup emulation uniform buffer. 642 // Setup emulation uniform buffer.
561 GLShader::MaxwellUniformData ubo; 643 if (!device.UseAssemblyShaders()) {
562 ubo.SetFromRegs(gpu); 644 MaxwellUniformData ubo;
563 const auto [buffer, offset] = 645 ubo.SetFromRegs(gpu);
564 buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment()); 646 const auto [buffer, offset] =
565 glBindBufferRange(GL_UNIFORM_BUFFER, EmulationUniformBlockBinding, buffer, offset, 647 buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment());
566 static_cast<GLsizeiptr>(sizeof(ubo))); 648 glBindBufferRange(GL_UNIFORM_BUFFER, EmulationUniformBlockBinding, buffer, offset,
649 static_cast<GLsizeiptr>(sizeof(ubo)));
650 }
567 651
568 // Setup shaders and their used resources. 652 // Setup shaders and their used resources.
569 texture_cache.GuardSamplers(true); 653 texture_cache.GuardSamplers(true);
@@ -630,16 +714,12 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
630} 714}
631 715
632void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) { 716void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
633 if (device.HasBrokenCompute()) {
634 return;
635 }
636
637 buffer_cache.Acquire(); 717 buffer_cache.Acquire();
718 current_cbuf = 0;
638 719
639 auto kernel = shader_cache.GetComputeKernel(code_addr); 720 auto kernel = shader_cache.GetComputeKernel(code_addr);
640 SetupComputeTextures(kernel); 721 SetupComputeTextures(kernel);
641 SetupComputeImages(kernel); 722 SetupComputeImages(kernel);
642 program_manager.BindComputeShader(kernel->GetHandle());
643 723
644 const std::size_t buffer_size = 724 const std::size_t buffer_size =
645 Tegra::Engines::KeplerCompute::NumConstBuffers * 725 Tegra::Engines::KeplerCompute::NumConstBuffers *
@@ -652,6 +732,7 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
652 buffer_cache.Unmap(); 732 buffer_cache.Unmap();
653 733
654 const auto& launch_desc = system.GPU().KeplerCompute().launch_description; 734 const auto& launch_desc = system.GPU().KeplerCompute().launch_description;
735 program_manager.BindCompute(kernel->GetHandle());
655 glDispatchCompute(launch_desc.grid_dim_x, launch_desc.grid_dim_y, launch_desc.grid_dim_z); 736 glDispatchCompute(launch_desc.grid_dim_x, launch_desc.grid_dim_y, launch_desc.grid_dim_z);
656 ++num_queued_commands; 737 ++num_queued_commands;
657} 738}
@@ -701,15 +782,15 @@ void RasterizerOpenGL::OnCPUWrite(VAddr addr, u64 size) {
701 return; 782 return;
702 } 783 }
703 texture_cache.OnCPUWrite(addr, size); 784 texture_cache.OnCPUWrite(addr, size);
704 shader_cache.InvalidateRegion(addr, size); 785 shader_cache.OnCPUWrite(addr, size);
705 buffer_cache.OnCPUWrite(addr, size); 786 buffer_cache.OnCPUWrite(addr, size);
706 query_cache.InvalidateRegion(addr, size);
707} 787}
708 788
709void RasterizerOpenGL::SyncGuestHost() { 789void RasterizerOpenGL::SyncGuestHost() {
710 MICROPROFILE_SCOPE(OpenGL_CacheManagement); 790 MICROPROFILE_SCOPE(OpenGL_CacheManagement);
711 texture_cache.SyncGuestHost(); 791 texture_cache.SyncGuestHost();
712 buffer_cache.SyncGuestHost(); 792 buffer_cache.SyncGuestHost();
793 shader_cache.SyncGuestHost();
713} 794}
714 795
715void RasterizerOpenGL::SignalSemaphore(GPUVAddr addr, u32 value) { 796void RasterizerOpenGL::SignalSemaphore(GPUVAddr addr, u32 value) {
@@ -811,40 +892,73 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config,
811 return true; 892 return true;
812} 893}
813 894
814void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, const Shader& shader) { 895void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, Shader* shader) {
896 static constexpr std::array PARAMETER_LUT = {
897 GL_VERTEX_PROGRAM_PARAMETER_BUFFER_NV, GL_TESS_CONTROL_PROGRAM_PARAMETER_BUFFER_NV,
898 GL_TESS_EVALUATION_PROGRAM_PARAMETER_BUFFER_NV, GL_GEOMETRY_PROGRAM_PARAMETER_BUFFER_NV,
899 GL_FRAGMENT_PROGRAM_PARAMETER_BUFFER_NV};
900
815 MICROPROFILE_SCOPE(OpenGL_UBO); 901 MICROPROFILE_SCOPE(OpenGL_UBO);
816 const auto& stages = system.GPU().Maxwell3D().state.shader_stages; 902 const auto& stages = system.GPU().Maxwell3D().state.shader_stages;
817 const auto& shader_stage = stages[stage_index]; 903 const auto& shader_stage = stages[stage_index];
904 const auto& entries = shader->GetEntries();
905 const bool use_unified = entries.use_unified_uniforms;
906 const std::size_t base_unified_offset = stage_index * NUM_CONST_BUFFERS_BYTES_PER_STAGE;
818 907
819 u32 binding = device.GetBaseBindings(stage_index).uniform_buffer; 908 const auto base_bindings = device.GetBaseBindings(stage_index);
820 for (const auto& entry : shader->GetEntries().const_buffers) { 909 u32 binding = device.UseAssemblyShaders() ? 0 : base_bindings.uniform_buffer;
821 const auto& buffer = shader_stage.const_buffers[entry.GetIndex()]; 910 for (const auto& entry : entries.const_buffers) {
822 SetupConstBuffer(binding++, buffer, entry); 911 const u32 index = entry.GetIndex();
912 const auto& buffer = shader_stage.const_buffers[index];
913 SetupConstBuffer(PARAMETER_LUT[stage_index], binding, buffer, entry, use_unified,
914 base_unified_offset + index * Maxwell::MaxConstBufferSize);
915 ++binding;
916 }
917 if (use_unified) {
918 const u32 index = static_cast<u32>(base_bindings.shader_storage_buffer +
919 entries.global_memory_entries.size());
920 glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, unified_uniform_buffer.handle,
+                          base_unified_offset, NUM_CONST_BUFFERS_BYTES_PER_STAGE);
     }
 }
 
-void RasterizerOpenGL::SetupComputeConstBuffers(const Shader& kernel) {
+void RasterizerOpenGL::SetupComputeConstBuffers(Shader* kernel) {
     MICROPROFILE_SCOPE(OpenGL_UBO);
     const auto& launch_desc = system.GPU().KeplerCompute().launch_description;
+    const auto& entries = kernel->GetEntries();
+    const bool use_unified = entries.use_unified_uniforms;
 
     u32 binding = 0;
-    for (const auto& entry : kernel->GetEntries().const_buffers) {
+    for (const auto& entry : entries.const_buffers) {
         const auto& config = launch_desc.const_buffer_config[entry.GetIndex()];
         const std::bitset<8> mask = launch_desc.const_buffer_enable_mask.Value();
         Tegra::Engines::ConstBufferInfo buffer;
         buffer.address = config.Address();
         buffer.size = config.size;
         buffer.enabled = mask[entry.GetIndex()];
-        SetupConstBuffer(binding++, buffer, entry);
+        SetupConstBuffer(GL_COMPUTE_PROGRAM_PARAMETER_BUFFER_NV, binding, buffer, entry,
+                         use_unified, entry.GetIndex() * Maxwell::MaxConstBufferSize);
+        ++binding;
+    }
+    if (use_unified) {
+        const GLuint index = static_cast<GLuint>(entries.global_memory_entries.size());
+        glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, unified_uniform_buffer.handle, 0,
+                          NUM_CONST_BUFFERS_BYTES_PER_STAGE);
     }
 }
 
-void RasterizerOpenGL::SetupConstBuffer(u32 binding, const Tegra::Engines::ConstBufferInfo& buffer,
-                                        const ConstBufferEntry& entry) {
+void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding,
+                                        const Tegra::Engines::ConstBufferInfo& buffer,
+                                        const ConstBufferEntry& entry, bool use_unified,
+                                        std::size_t unified_offset) {
     if (!buffer.enabled) {
         // Set values to zero to unbind buffers
-        glBindBufferRange(GL_UNIFORM_BUFFER, binding, buffer_cache.GetEmptyBuffer(sizeof(float)), 0,
-                          sizeof(float));
+        if (device.UseAssemblyShaders()) {
+            glBindBufferRangeNV(stage, entry.GetIndex(), 0, 0, 0);
+        } else {
+            glBindBufferRange(GL_UNIFORM_BUFFER, binding,
+                              buffer_cache.GetEmptyBuffer(sizeof(float)), 0, sizeof(float));
+        }
         return;
     }
 
@@ -852,18 +966,38 @@ void RasterizerOpenGL::SetupConstBuffer(u32 binding, const Tegra::Engines::Const
     // UBO alignment requirements.
     const std::size_t size = Common::AlignUp(GetConstBufferSize(buffer, entry), sizeof(GLvec4));
 
-    const auto alignment = device.GetUniformBufferAlignment();
-    const auto [cbuf, offset] = buffer_cache.UploadMemory(buffer.address, size, alignment, false,
-                                                          device.HasFastBufferSubData());
-    glBindBufferRange(GL_UNIFORM_BUFFER, binding, cbuf, offset, size);
+    const bool fast_upload = !use_unified && device.HasFastBufferSubData();
+
+    const std::size_t alignment = use_unified ? 4 : device.GetUniformBufferAlignment();
+    const GPUVAddr gpu_addr = buffer.address;
+    auto [cbuf, offset] = buffer_cache.UploadMemory(gpu_addr, size, alignment, false, fast_upload);
+
+    if (device.UseAssemblyShaders()) {
+        UNIMPLEMENTED_IF(use_unified);
+        if (offset != 0) {
+            const GLuint staging_cbuf = staging_cbufs[current_cbuf++];
+            glCopyNamedBufferSubData(cbuf, staging_cbuf, offset, 0, size);
+            cbuf = staging_cbuf;
+            offset = 0;
+        }
+        glBindBufferRangeNV(stage, binding, cbuf, offset, size);
+        return;
+    }
+
+    if (use_unified) {
+        glCopyNamedBufferSubData(cbuf, unified_uniform_buffer.handle, offset, unified_offset, size);
+    } else {
+        glBindBufferRange(GL_UNIFORM_BUFFER, binding, cbuf, offset, size);
+    }
 }
 
-void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader) {
+void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, Shader* shader) {
     auto& gpu{system.GPU()};
     auto& memory_manager{gpu.MemoryManager()};
     const auto cbufs{gpu.Maxwell3D().state.shader_stages[stage_index]};
 
-    u32 binding = device.GetBaseBindings(stage_index).shader_storage_buffer;
+    u32 binding =
+        device.UseAssemblyShaders() ? 0 : device.GetBaseBindings(stage_index).shader_storage_buffer;
     for (const auto& entry : shader->GetEntries().global_memory_entries) {
         const GPUVAddr addr{cbufs.const_buffers[entry.cbuf_index].address + entry.cbuf_offset};
         const GPUVAddr gpu_addr{memory_manager.Read<u64>(addr)};
@@ -872,7 +1006,7 @@ void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shad
     }
 }
 
-void RasterizerOpenGL::SetupComputeGlobalMemory(const Shader& kernel) {
+void RasterizerOpenGL::SetupComputeGlobalMemory(Shader* kernel) {
     auto& gpu{system.GPU()};
     auto& memory_manager{gpu.MemoryManager()};
     const auto cbufs{gpu.KeplerCompute().launch_description.const_buffer_config};
@@ -895,7 +1029,7 @@ void RasterizerOpenGL::SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& e
                       static_cast<GLsizeiptr>(size));
 }
 
-void RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, const Shader& shader) {
+void RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, Shader* shader) {
     MICROPROFILE_SCOPE(OpenGL_Texture);
     const auto& maxwell3d = system.GPU().Maxwell3D();
     u32 binding = device.GetBaseBindings(stage_index).sampler;
@@ -908,7 +1042,7 @@ void RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, const Shader&
     }
 }
 
-void RasterizerOpenGL::SetupComputeTextures(const Shader& kernel) {
+void RasterizerOpenGL::SetupComputeTextures(Shader* kernel) {
     MICROPROFILE_SCOPE(OpenGL_Texture);
     const auto& compute = system.GPU().KeplerCompute();
     u32 binding = 0;
@@ -929,19 +1063,15 @@ void RasterizerOpenGL::SetupTexture(u32 binding, const Tegra::Texture::FullTextu
         glBindTextureUnit(binding, 0);
         return;
     }
-    glBindTextureUnit(binding, view->GetTexture());
-
-    if (view->GetSurfaceParams().IsBuffer()) {
-        return;
+    const GLuint handle = view->GetTexture(texture.tic.x_source, texture.tic.y_source,
+                                           texture.tic.z_source, texture.tic.w_source);
+    glBindTextureUnit(binding, handle);
+    if (!view->GetSurfaceParams().IsBuffer()) {
+        glBindSampler(binding, sampler_cache.GetSampler(texture.tsc));
     }
-    // Apply swizzle to textures that are not buffers.
-    view->ApplySwizzle(texture.tic.x_source, texture.tic.y_source, texture.tic.z_source,
-                       texture.tic.w_source);
-
-    glBindSampler(binding, sampler_cache.GetSampler(texture.tsc));
 }
 
-void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, const Shader& shader) {
+void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, Shader* shader) {
     const auto& maxwell3d = system.GPU().Maxwell3D();
     u32 binding = device.GetBaseBindings(stage_index).image;
     for (const auto& entry : shader->GetEntries().images) {
@@ -951,7 +1081,7 @@ void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, const Shader& sh
     }
 }
 
-void RasterizerOpenGL::SetupComputeImages(const Shader& shader) {
+void RasterizerOpenGL::SetupComputeImages(Shader* shader) {
     const auto& compute = system.GPU().KeplerCompute();
     u32 binding = 0;
     for (const auto& entry : shader->GetEntries().images) {
@@ -967,14 +1097,11 @@ void RasterizerOpenGL::SetupImage(u32 binding, const Tegra::Texture::TICEntry& t
         glBindImageTexture(binding, 0, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8);
         return;
     }
-    if (!tic.IsBuffer()) {
-        view->ApplySwizzle(tic.x_source, tic.y_source, tic.z_source, tic.w_source);
-    }
     if (entry.is_written) {
         view->MarkAsModified(texture_cache.Tick());
     }
-    glBindImageTexture(binding, view->GetTexture(), 0, GL_TRUE, 0, GL_READ_WRITE,
-                       view->GetFormat());
+    const GLuint handle = view->GetTexture(tic.x_source, tic.y_source, tic.z_source, tic.w_source);
+    glBindImageTexture(binding, handle, 0, GL_TRUE, 0, GL_READ_WRITE, view->GetFormat());
 }
 
 void RasterizerOpenGL::SyncViewport() {
@@ -983,6 +1110,26 @@ void RasterizerOpenGL::SyncViewport() {
     const auto& regs = gpu.regs;
 
     const bool dirty_viewport = flags[Dirty::Viewports];
+    const bool dirty_clip_control = flags[Dirty::ClipControl];
+
+    if (dirty_clip_control || flags[Dirty::FrontFace]) {
+        flags[Dirty::FrontFace] = false;
+
+        GLenum mode = MaxwellToGL::FrontFace(regs.front_face);
+        if (regs.screen_y_control.triangle_rast_flip != 0 &&
+            regs.viewport_transform[0].scale_y < 0.0f) {
+            switch (mode) {
+            case GL_CW:
+                mode = GL_CCW;
+                break;
+            case GL_CCW:
+                mode = GL_CW;
+                break;
+            }
+        }
+        glFrontFace(mode);
+    }
+
     if (dirty_viewport || flags[Dirty::ClipControl]) {
         flags[Dirty::ClipControl] = false;
 
@@ -1080,11 +1227,6 @@ void RasterizerOpenGL::SyncCullMode() {
             glDisable(GL_CULL_FACE);
         }
     }
-
-    if (flags[Dirty::FrontFace]) {
-        flags[Dirty::FrontFace] = false;
-        glFrontFace(MaxwellToGL::FrontFace(regs.front_face));
-    }
 }
 
 void RasterizerOpenGL::SyncPrimitiveRestart() {
@@ -1455,12 +1597,70 @@ void RasterizerOpenGL::SyncFramebufferSRGB() {
     oglEnable(GL_FRAMEBUFFER_SRGB, gpu.regs.framebuffer_srgb);
 }
 
+void RasterizerOpenGL::SyncTransformFeedback() {
+    // TODO(Rodrigo): Inject SKIP_COMPONENTS*_NV when required. An unimplemented message will signal
+    // when this is required.
+    const auto& regs = system.GPU().Maxwell3D().regs;
+
+    static constexpr std::size_t STRIDE = 3;
+    std::array<GLint, 128 * STRIDE * Maxwell::NumTransformFeedbackBuffers> attribs;
+    std::array<GLint, Maxwell::NumTransformFeedbackBuffers> streams;
+
+    GLint* cursor = attribs.data();
+    GLint* current_stream = streams.data();
+
+    for (std::size_t feedback = 0; feedback < Maxwell::NumTransformFeedbackBuffers; ++feedback) {
+        const auto& layout = regs.tfb_layouts[feedback];
+        UNIMPLEMENTED_IF_MSG(layout.stride != layout.varying_count * 4, "Stride padding");
+        if (layout.varying_count == 0) {
+            continue;
+        }
+
+        *current_stream = static_cast<GLint>(feedback);
+        if (current_stream != streams.data()) {
+            // When stepping one stream, push the expected token
+            cursor[0] = GL_NEXT_BUFFER_NV;
+            cursor[1] = 0;
+            cursor[2] = 0;
+            cursor += STRIDE;
+        }
+        ++current_stream;
+
+        const auto& locations = regs.tfb_varying_locs[feedback];
+        std::optional<u8> current_index;
+        for (u32 offset = 0; offset < layout.varying_count; ++offset) {
+            const u8 location = locations[offset];
+            const u8 index = location / 4;
+
+            if (current_index == index) {
+                // Increase number of components of the previous attachment
+                ++cursor[-2];
+                continue;
+            }
+            current_index = index;
+
+            std::tie(cursor[0], cursor[2]) = TransformFeedbackEnum(location);
+            cursor[1] = 1;
+            cursor += STRIDE;
+        }
+    }
+
+    const GLsizei num_attribs = static_cast<GLsizei>((cursor - attribs.data()) / STRIDE);
+    const GLsizei num_strides = static_cast<GLsizei>(current_stream - streams.data());
+    glTransformFeedbackStreamAttribsNV(num_attribs, attribs.data(), num_strides, streams.data(),
+                                       GL_INTERLEAVED_ATTRIBS);
+}
+
 void RasterizerOpenGL::BeginTransformFeedback(GLenum primitive_mode) {
     const auto& regs = system.GPU().Maxwell3D().regs;
     if (regs.tfb_enabled == 0) {
         return;
     }
 
+    if (device.UseAssemblyShaders()) {
+        SyncTransformFeedback();
+    }
+
     UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationControl) ||
                      regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationEval) ||
                      regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::Geometry));
@@ -1487,6 +1687,10 @@ void RasterizerOpenGL::BeginTransformFeedback(GLenum primitive_mode) {
                           static_cast<GLsizeiptr>(size));
     }
 
+    // We may have to call BeginTransformFeedbackNV here since they seem to call different
+    // implementations on Nvidia's driver (the pointer is different) but we are using
+    // ARB_transform_feedback3 features with NV_transform_feedback interactions and the ARB
+    // extension doesn't define BeginTransformFeedback (without NV) interactions. It just works.
     glBeginTransformFeedback(GL_POINTS);
 }
 
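Note on the SyncViewport hunk above: the patch moves glFrontFace out of SyncCullMode and
inverts the winding only when regs.screen_y_control.triangle_rast_flip is set while the
first viewport has a negative Y scale. A minimal standalone sketch of that rule, for
reference only; the function name FlipWinding is illustrative and not part of the patch:

    #include <glad/glad.h> // assumed GL loader, as used by the renderer

    // Mirrors the condition in SyncViewport: invert winding only when the guest
    // requests a rasterization flip and viewport 0 mirrors the Y axis.
    GLenum FlipWinding(GLenum front_face, bool triangle_rast_flip, float viewport_scale_y) {
        if (!triangle_rast_flip || viewport_scale_y >= 0.0f) {
            return front_face; // no flip: keep the guest's winding
        }
        return front_face == GL_CW ? GL_CCW : GL_CW;
    }
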
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index b94c65907..4f082592f 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -19,7 +19,6 @@
 #include "video_core/engines/const_buffer_info.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/rasterizer_accelerated.h"
-#include "video_core/rasterizer_cache.h"
 #include "video_core/rasterizer_interface.h"
 #include "video_core/renderer_opengl/gl_buffer_cache.h"
 #include "video_core/renderer_opengl/gl_device.h"
@@ -56,8 +55,8 @@ struct DrawParameters;
 class RasterizerOpenGL : public VideoCore::RasterizerAccelerated {
 public:
     explicit RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window,
-                              ScreenInfo& info, GLShader::ProgramManager& program_manager,
-                              StateTracker& state_tracker);
+                              const Device& device, ScreenInfo& info,
+                              ProgramManager& program_manager, StateTracker& state_tracker);
     ~RasterizerOpenGL() override;
 
     void Draw(bool is_indexed, bool is_instanced) override;
@@ -100,40 +99,41 @@ private:
     void ConfigureClearFramebuffer(bool using_color, bool using_depth_stencil);
 
     /// Configures the current constbuffers to use for the draw command.
-    void SetupDrawConstBuffers(std::size_t stage_index, const Shader& shader);
+    void SetupDrawConstBuffers(std::size_t stage_index, Shader* shader);
 
     /// Configures the current constbuffers to use for the kernel invocation.
-    void SetupComputeConstBuffers(const Shader& kernel);
+    void SetupComputeConstBuffers(Shader* kernel);
 
     /// Configures a constant buffer.
-    void SetupConstBuffer(u32 binding, const Tegra::Engines::ConstBufferInfo& buffer,
-                          const ConstBufferEntry& entry);
+    void SetupConstBuffer(GLenum stage, u32 binding, const Tegra::Engines::ConstBufferInfo& buffer,
+                          const ConstBufferEntry& entry, bool use_unified,
+                          std::size_t unified_offset);
 
     /// Configures the current global memory entries to use for the draw command.
-    void SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader);
+    void SetupDrawGlobalMemory(std::size_t stage_index, Shader* shader);
 
     /// Configures the current global memory entries to use for the kernel invocation.
-    void SetupComputeGlobalMemory(const Shader& kernel);
+    void SetupComputeGlobalMemory(Shader* kernel);
 
     /// Configures a constant buffer.
     void SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry, GPUVAddr gpu_addr,
                            std::size_t size);
 
     /// Configures the current textures to use for the draw command.
-    void SetupDrawTextures(std::size_t stage_index, const Shader& shader);
+    void SetupDrawTextures(std::size_t stage_index, Shader* shader);
 
     /// Configures the textures used in a compute shader.
-    void SetupComputeTextures(const Shader& kernel);
+    void SetupComputeTextures(Shader* kernel);
 
     /// Configures a texture.
     void SetupTexture(u32 binding, const Tegra::Texture::FullTextureInfo& texture,
                       const SamplerEntry& entry);
 
     /// Configures images in a graphics shader.
-    void SetupDrawImages(std::size_t stage_index, const Shader& shader);
+    void SetupDrawImages(std::size_t stage_index, Shader* shader);
 
     /// Configures images in a compute shader.
-    void SetupComputeImages(const Shader& shader);
+    void SetupComputeImages(Shader* shader);
 
     /// Configures an image.
     void SetupImage(u32 binding, const Tegra::Texture::TICEntry& tic, const ImageEntry& entry);
@@ -201,6 +201,10 @@ private:
     /// Syncs the framebuffer sRGB state to match the guest state
     void SyncFramebufferSRGB();
 
+    /// Syncs transform feedback state to match guest state
+    /// @note Only valid on assembly shaders
+    void SyncTransformFeedback();
+
     /// Begin a transform feedback
     void BeginTransformFeedback(GLenum primitive_mode);
 
@@ -224,7 +228,7 @@ private:
 
     void SetupShaders(GLenum primitive_mode);
 
-    const Device device;
+    const Device& device;
 
     TextureCacheOpenGL texture_cache;
     ShaderCacheOpenGL shader_cache;
@@ -236,7 +240,7 @@ private:
 
     Core::System& system;
     ScreenInfo& screen_info;
-    GLShader::ProgramManager& program_manager;
+    ProgramManager& program_manager;
     StateTracker& state_tracker;
 
     static constexpr std::size_t STREAM_BUFFER_SIZE = 128 * 1024 * 1024;
@@ -248,6 +252,13 @@ private:
     std::bitset<Tegra::Engines::Maxwell3D::Regs::NumTransformFeedbackBuffers>
         enabled_transform_feedback_buffers;
 
+    static constexpr std::size_t NUM_CONSTANT_BUFFERS =
+        Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers *
+        Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram;
+    std::array<GLuint, NUM_CONSTANT_BUFFERS> staging_cbufs{};
+    std::size_t current_cbuf = 0;
+    OGLBuffer unified_uniform_buffer;
+
     /// Number of commands queued to the OpenGL driver. Reseted on flush.
     std::size_t num_queued_commands = 0;
 
diff --git a/src/video_core/renderer_opengl/gl_resource_manager.cpp b/src/video_core/renderer_opengl/gl_resource_manager.cpp
index 97803d480..a787e27d2 100644
--- a/src/video_core/renderer_opengl/gl_resource_manager.cpp
+++ b/src/video_core/renderer_opengl/gl_resource_manager.cpp
@@ -125,6 +125,15 @@ void OGLProgram::Release() {
     handle = 0;
 }
 
+void OGLAssemblyProgram::Release() {
+    if (handle == 0) {
+        return;
+    }
+    MICROPROFILE_SCOPE(OpenGL_ResourceDeletion);
+    glDeleteProgramsARB(1, &handle);
+    handle = 0;
+}
+
 void OGLPipeline::Create() {
     if (handle != 0)
         return;
diff --git a/src/video_core/renderer_opengl/gl_resource_manager.h b/src/video_core/renderer_opengl/gl_resource_manager.h
index de93f4212..f8b322227 100644
--- a/src/video_core/renderer_opengl/gl_resource_manager.h
+++ b/src/video_core/renderer_opengl/gl_resource_manager.h
@@ -167,6 +167,22 @@ public:
     GLuint handle = 0;
 };
 
+class OGLAssemblyProgram : private NonCopyable {
+public:
+    OGLAssemblyProgram() = default;
+
+    OGLAssemblyProgram(OGLAssemblyProgram&& o) noexcept : handle(std::exchange(o.handle, 0)) {}
+
+    ~OGLAssemblyProgram() {
+        Release();
+    }
+
+    /// Deletes the internal OpenGL resource
+    void Release();
+
+    GLuint handle = 0;
+};
+
 class OGLPipeline : private NonCopyable {
 public:
     OGLPipeline() = default;
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp
index 9759a7078..46e780a06 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -20,6 +20,7 @@
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/engines/shader_type.h"
 #include "video_core/memory_manager.h"
+#include "video_core/renderer_opengl/gl_arb_decompiler.h"
 #include "video_core/renderer_opengl/gl_rasterizer.h"
 #include "video_core/renderer_opengl/gl_shader_cache.h"
 #include "video_core/renderer_opengl/gl_shader_decompiler.h"
@@ -29,6 +30,7 @@
 #include "video_core/shader/memory_util.h"
 #include "video_core/shader/registry.h"
 #include "video_core/shader/shader_ir.h"
+#include "video_core/shader_cache.h"
 
 namespace OpenGL {
 
@@ -97,6 +99,24 @@ constexpr ShaderType GetShaderType(Maxwell::ShaderProgram program_type) {
     return {};
 }
 
+constexpr GLenum AssemblyEnum(ShaderType shader_type) {
+    switch (shader_type) {
+    case ShaderType::Vertex:
+        return GL_VERTEX_PROGRAM_NV;
+    case ShaderType::TesselationControl:
+        return GL_TESS_CONTROL_PROGRAM_NV;
+    case ShaderType::TesselationEval:
+        return GL_TESS_EVALUATION_PROGRAM_NV;
+    case ShaderType::Geometry:
+        return GL_GEOMETRY_PROGRAM_NV;
+    case ShaderType::Fragment:
+        return GL_FRAGMENT_PROGRAM_NV;
+    case ShaderType::Compute:
+        return GL_COMPUTE_PROGRAM_NV;
+    }
+    return {};
+}
+
 std::string MakeShaderID(u64 unique_identifier, ShaderType shader_type) {
     return fmt::format("{}{:016X}", GetShaderTypeName(shader_type), unique_identifier);
 }
@@ -120,18 +140,44 @@ std::shared_ptr<Registry> MakeRegistry(const ShaderDiskCacheEntry& entry) {
     return registry;
 }
 
-std::shared_ptr<OGLProgram> BuildShader(const Device& device, ShaderType shader_type,
-                                        u64 unique_identifier, const ShaderIR& ir,
-                                        const Registry& registry, bool hint_retrievable = false) {
+ProgramSharedPtr BuildShader(const Device& device, ShaderType shader_type, u64 unique_identifier,
                             const ShaderIR& ir, const Registry& registry,
                             bool hint_retrievable = false) {
     const std::string shader_id = MakeShaderID(unique_identifier, shader_type);
     LOG_INFO(Render_OpenGL, "{}", shader_id);
 
-    const std::string glsl = DecompileShader(device, ir, registry, shader_type, shader_id);
-    OGLShader shader;
-    shader.Create(glsl.c_str(), GetGLShaderType(shader_type));
+    auto program = std::make_shared<ProgramHandle>();
+
+    if (device.UseAssemblyShaders()) {
+        const std::string arb =
+            DecompileAssemblyShader(device, ir, registry, shader_type, shader_id);
+
+        GLuint& arb_prog = program->assembly_program.handle;
+
+// Commented out functions signal OpenGL errors but are compatible with apitrace.
+// Use them only to capture and replay on apitrace.
+#if 0
+        glGenProgramsNV(1, &arb_prog);
+        glLoadProgramNV(AssemblyEnum(shader_type), arb_prog, static_cast<GLsizei>(arb.size()),
+                        reinterpret_cast<const GLubyte*>(arb.data()));
+#else
+        glGenProgramsARB(1, &arb_prog);
+        glNamedProgramStringEXT(arb_prog, AssemblyEnum(shader_type), GL_PROGRAM_FORMAT_ASCII_ARB,
+                                static_cast<GLsizei>(arb.size()), arb.data());
+#endif
+        const auto err = reinterpret_cast<const char*>(glGetString(GL_PROGRAM_ERROR_STRING_NV));
+        if (err && *err) {
+            LOG_CRITICAL(Render_OpenGL, "{}", err);
+            LOG_INFO(Render_OpenGL, "\n{}", arb);
+        }
+    } else {
+        const std::string glsl = DecompileShader(device, ir, registry, shader_type, shader_id);
+        OGLShader shader;
+        shader.Create(glsl.c_str(), GetGLShaderType(shader_type));
+
+        program->source_program.Create(true, hint_retrievable, shader.handle);
+    }
 
-    auto program = std::make_shared<OGLProgram>();
-    program->Create(true, hint_retrievable, shader.handle);
     return program;
 }
 
@@ -151,22 +197,26 @@ std::unordered_set<GLenum> GetSupportedFormats() {
 
 } // Anonymous namespace
 
-CachedShader::CachedShader(VAddr cpu_addr, std::size_t size_in_bytes,
-                           std::shared_ptr<VideoCommon::Shader::Registry> registry,
-                           ShaderEntries entries, std::shared_ptr<OGLProgram> program)
-    : RasterizerCacheObject{cpu_addr}, registry{std::move(registry)}, entries{std::move(entries)},
-      size_in_bytes{size_in_bytes}, program{std::move(program)} {}
+Shader::Shader(std::shared_ptr<VideoCommon::Shader::Registry> registry_, ShaderEntries entries_,
+               ProgramSharedPtr program_)
+    : registry{std::move(registry_)}, entries{std::move(entries_)}, program{std::move(program_)} {
+    handle = program->assembly_program.handle;
+    if (handle == 0) {
+        handle = program->source_program.handle;
+    }
+    ASSERT(handle != 0);
+}
 
-CachedShader::~CachedShader() = default;
+Shader::~Shader() = default;
 
-GLuint CachedShader::GetHandle() const {
+GLuint Shader::GetHandle() const {
     DEBUG_ASSERT(registry->IsConsistent());
-    return program->handle;
+    return handle;
 }
 
-Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params,
-                                           Maxwell::ShaderProgram program_type, ProgramCode code,
-                                           ProgramCode code_b) {
+std::unique_ptr<Shader> Shader::CreateStageFromMemory(const ShaderParameters& params,
                                                      Maxwell::ShaderProgram program_type,
                                                      ProgramCode code, ProgramCode code_b) {
     const auto shader_type = GetShaderType(program_type);
     const std::size_t size_in_bytes = code.size() * sizeof(u64);
 
@@ -191,11 +241,12 @@ Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params,
     entry.bindless_samplers = registry->GetBindlessSamplers();
     params.disk_cache.SaveEntry(std::move(entry));
 
-    return std::shared_ptr<CachedShader>(new CachedShader(
-        params.cpu_addr, size_in_bytes, std::move(registry), MakeEntries(ir), std::move(program)));
+    return std::unique_ptr<Shader>(new Shader(
+        std::move(registry), MakeEntries(params.device, ir, shader_type), std::move(program)));
 }
 
-Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code) {
+std::unique_ptr<Shader> Shader::CreateKernelFromMemory(const ShaderParameters& params,
                                                       ProgramCode code) {
     const std::size_t size_in_bytes = code.size() * sizeof(u64);
 
     auto& engine = params.system.GPU().KeplerCompute();
@@ -215,22 +266,23 @@ Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, Prog
     entry.bindless_samplers = registry->GetBindlessSamplers();
     params.disk_cache.SaveEntry(std::move(entry));
 
-    return std::shared_ptr<CachedShader>(new CachedShader(
-        params.cpu_addr, size_in_bytes, std::move(registry), MakeEntries(ir), std::move(program)));
+    return std::unique_ptr<Shader>(new Shader(std::move(registry),
+                                              MakeEntries(params.device, ir, ShaderType::Compute),
+                                              std::move(program)));
 }
 
-Shader CachedShader::CreateFromCache(const ShaderParameters& params,
-                                     const PrecompiledShader& precompiled_shader,
-                                     std::size_t size_in_bytes) {
-    return std::shared_ptr<CachedShader>(
-        new CachedShader(params.cpu_addr, size_in_bytes, precompiled_shader.registry,
-                         precompiled_shader.entries, precompiled_shader.program));
+std::unique_ptr<Shader> Shader::CreateFromCache(const ShaderParameters& params,
                                                const PrecompiledShader& precompiled_shader) {
    return std::unique_ptr<Shader>(new Shader(
        precompiled_shader.registry, precompiled_shader.entries, precompiled_shader.program));
 }
 
 ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system,
                                      Core::Frontend::EmuWindow& emu_window, const Device& device)
-    : RasterizerCache{rasterizer}, system{system}, emu_window{emu_window}, device{device},
-      disk_cache{system} {}
+    : VideoCommon::ShaderCache<Shader>{rasterizer}, system{system},
+      emu_window{emu_window}, device{device}, disk_cache{system} {}
+
+ShaderCacheOpenGL::~ShaderCacheOpenGL() = default;
 
 void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
                                       const VideoCore::DiskResourceLoadCallback& callback) {
@@ -239,7 +291,11 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
         return;
     }
 
-    const std::vector gl_cache = disk_cache.LoadPrecompiled();
+    std::vector<ShaderDiskCachePrecompiled> gl_cache;
+    if (!device.UseAssemblyShaders()) {
+        // Only load precompiled cache when we are not using assembly shaders
+        gl_cache = disk_cache.LoadPrecompiled();
+    }
     const auto supported_formats = GetSupportedFormats();
 
     // Track if precompiled cache was altered during loading to know if we have to
@@ -278,7 +334,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
         auto registry = MakeRegistry(entry);
         const ShaderIR ir(entry.code, main_offset, COMPILER_SETTINGS, *registry);
 
-        std::shared_ptr<OGLProgram> program;
+        ProgramSharedPtr program;
         if (precompiled_entry) {
             // If the shader is precompiled, attempt to load it with
             program = GeneratePrecompiledProgram(entry, *precompiled_entry, supported_formats);
@@ -294,7 +350,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
             PrecompiledShader shader;
             shader.program = std::move(program);
             shader.registry = std::move(registry);
-            shader.entries = MakeEntries(ir);
+            shader.entries = MakeEntries(device, ir, entry.type);
 
             std::scoped_lock lock{mutex};
             if (callback) {
@@ -332,6 +388,11 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
         return;
     }
 
+    if (device.UseAssemblyShaders()) {
+        // Don't store precompiled binaries for assembly shaders.
+        return;
+    }
+
     // TODO(Rodrigo): Do state tracking for transferable shaders and do a dummy draw
     // before precompiling them
 
@@ -339,7 +400,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
         const u64 id = (*transferable)[i].unique_identifier;
         const auto it = find_precompiled(id);
         if (it == gl_cache.end()) {
-            const GLuint program = runtime_cache.at(id).program->handle;
+            const GLuint program = runtime_cache.at(id).program->source_program.handle;
             disk_cache.SavePrecompiled(id, program);
             precompiled_cache_altered = true;
         }
@@ -350,7 +411,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
     }
 }
 
-std::shared_ptr<OGLProgram> ShaderCacheOpenGL::GeneratePrecompiledProgram(
+ProgramSharedPtr ShaderCacheOpenGL::GeneratePrecompiledProgram(
     const ShaderDiskCacheEntry& entry, const ShaderDiskCachePrecompiled& precompiled_entry,
     const std::unordered_set<GLenum>& supported_formats) {
     if (supported_formats.find(precompiled_entry.binary_format) == supported_formats.end()) {
@@ -358,15 +419,15 @@ std::shared_ptr<OGLProgram> ShaderCacheOpenGL::GeneratePrecompiledProgram(
         return {};
     }
 
-    auto program = std::make_shared<OGLProgram>();
-    program->handle = glCreateProgram();
-    glProgramParameteri(program->handle, GL_PROGRAM_SEPARABLE, GL_TRUE);
-    glProgramBinary(program->handle, precompiled_entry.binary_format,
-                    precompiled_entry.binary.data(),
-                    static_cast<GLsizei>(precompiled_entry.binary.size()));
+    auto program = std::make_shared<ProgramHandle>();
+    GLuint& handle = program->source_program.handle;
+    handle = glCreateProgram();
+    glProgramParameteri(handle, GL_PROGRAM_SEPARABLE, GL_TRUE);
+    glProgramBinary(handle, precompiled_entry.binary_format, precompiled_entry.binary.data(),
+                    static_cast<GLsizei>(precompiled_entry.binary.size()));
 
     GLint link_status;
-    glGetProgramiv(program->handle, GL_LINK_STATUS, &link_status);
+    glGetProgramiv(handle, GL_LINK_STATUS, &link_status);
     if (link_status == GL_FALSE) {
         LOG_INFO(Render_OpenGL, "Precompiled cache rejected by the driver, removing");
         return {};
@@ -375,7 +436,7 @@ std::shared_ptr<OGLProgram> ShaderCacheOpenGL::GeneratePrecompiledProgram(
     return program;
 }
 
-Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
+Shader* ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
     if (!system.GPU().Maxwell3D().dirty.flags[Dirty::Shaders]) {
         return last_shaders[static_cast<std::size_t>(program)];
     }
@@ -385,8 +446,7 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
 
     // Look up shader in the cache based on address
     const auto cpu_addr{memory_manager.GpuToCpuAddress(address)};
-    Shader shader{cpu_addr ? TryGet(*cpu_addr) : null_shader};
-    if (shader) {
+    if (Shader* const shader{cpu_addr ? TryGet(*cpu_addr) : null_shader.get()}) {
         return last_shaders[static_cast<std::size_t>(program)] = shader;
     }
 
@@ -407,30 +467,29 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
     const ShaderParameters params{system,    disk_cache, device,
                                   *cpu_addr, host_ptr,   unique_identifier};
 
+    std::unique_ptr<Shader> shader;
     const auto found = runtime_cache.find(unique_identifier);
     if (found == runtime_cache.end()) {
-        shader = CachedShader::CreateStageFromMemory(params, program, std::move(code),
-                                                     std::move(code_b));
+        shader = Shader::CreateStageFromMemory(params, program, std::move(code), std::move(code_b));
     } else {
-        const std::size_t size_in_bytes = code.size() * sizeof(u64);
-        shader = CachedShader::CreateFromCache(params, found->second, size_in_bytes);
+        shader = Shader::CreateFromCache(params, found->second);
     }
 
+    Shader* const result = shader.get();
     if (cpu_addr) {
-        Register(shader);
+        Register(std::move(shader), *cpu_addr, code.size() * sizeof(u64));
    } else {
-        null_shader = shader;
+        null_shader = std::move(shader);
    }
 
-    return last_shaders[static_cast<std::size_t>(program)] = shader;
+    return last_shaders[static_cast<std::size_t>(program)] = result;
 }
 
-Shader ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) {
+Shader* ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) {
     auto& memory_manager{system.GPU().MemoryManager()};
     const auto cpu_addr{memory_manager.GpuToCpuAddress(code_addr)};
 
-    auto kernel = cpu_addr ? TryGet(*cpu_addr) : null_kernel;
-    if (kernel) {
+    if (Shader* const kernel = cpu_addr ? TryGet(*cpu_addr) : null_kernel.get()) {
         return kernel;
     }
 
@@ -442,20 +501,21 @@ Shader ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) {
     const ShaderParameters params{system,    disk_cache, device,
                                   *cpu_addr, host_ptr,   unique_identifier};
 
+    std::unique_ptr<Shader> kernel;
     const auto found = runtime_cache.find(unique_identifier);
     if (found == runtime_cache.end()) {
-        kernel = CachedShader::CreateKernelFromMemory(params, std::move(code));
+        kernel = Shader::CreateKernelFromMemory(params, std::move(code));
    } else {
-        const std::size_t size_in_bytes = code.size() * sizeof(u64);
-        kernel = CachedShader::CreateFromCache(params, found->second, size_in_bytes);
+        kernel = Shader::CreateFromCache(params, found->second);
    }
 
+    Shader* const result = kernel.get();
     if (cpu_addr) {
-        Register(kernel);
+        Register(std::move(kernel), *cpu_addr, code.size() * sizeof(u64));
    } else {
-        null_kernel = kernel;
+        null_kernel = std::move(kernel);
    }
-    return kernel;
+    return result;
 }
 
 } // namespace OpenGL
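Note on the Shader changes above: GetHandle() no longer dereferences an OGLProgram; the
constructor resolves the handle once, preferring the assembly program when one was built
and falling back to the linked GLSL program otherwise. The same rule in isolation, as a
hedged sketch; ProgramHandleView and ResolveHandle are illustrative names, not from the patch:

    #include <glad/glad.h> // assumed GL loader

    struct ProgramHandleView {
        GLuint source_program;   // glCreateProgram handle, 0 when unused
        GLuint assembly_program; // glGenProgramsARB handle, 0 when unused
    };

    // Matches Shader::Shader: prefer the assembly program, else the GLSL
    // program; exactly one of the two is expected to be non-zero.
    GLuint ResolveHandle(const ProgramHandleView& program) {
        return program.assembly_program != 0 ? program.assembly_program
                                             : program.source_program;
    }
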
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h
index 91690b470..6848f1388 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_cache.h
@@ -18,12 +18,12 @@
 
 #include "common/common_types.h"
 #include "video_core/engines/shader_type.h"
-#include "video_core/rasterizer_cache.h"
 #include "video_core/renderer_opengl/gl_resource_manager.h"
 #include "video_core/renderer_opengl/gl_shader_decompiler.h"
 #include "video_core/renderer_opengl/gl_shader_disk_cache.h"
 #include "video_core/shader/registry.h"
 #include "video_core/shader/shader_ir.h"
+#include "video_core/shader_cache.h"
 
 namespace Core {
 class System;
@@ -35,16 +35,20 @@ class EmuWindow;
 
 namespace OpenGL {
 
-class CachedShader;
 class Device;
 class RasterizerOpenGL;
 struct UnspecializedShader;
 
-using Shader = std::shared_ptr<CachedShader>;
 using Maxwell = Tegra::Engines::Maxwell3D::Regs;
 
+struct ProgramHandle {
+    OGLProgram source_program;
+    OGLAssemblyProgram assembly_program;
+};
+using ProgramSharedPtr = std::shared_ptr<ProgramHandle>;
+
 struct PrecompiledShader {
-    std::shared_ptr<OGLProgram> program;
+    ProgramSharedPtr program;
     std::shared_ptr<VideoCommon::Shader::Registry> registry;
     ShaderEntries entries;
 };
@@ -58,64 +62,56 @@ struct ShaderParameters {
     u64 unique_identifier;
 };
 
-class CachedShader final : public RasterizerCacheObject {
+class Shader final {
 public:
-    ~CachedShader();
+    ~Shader();
 
     /// Gets the GL program handle for the shader
     GLuint GetHandle() const;
 
-    /// Returns the size in bytes of the shader
-    std::size_t GetSizeInBytes() const override {
-        return size_in_bytes;
-    }
-
     /// Gets the shader entries for the shader
     const ShaderEntries& GetEntries() const {
         return entries;
     }
 
-    static Shader CreateStageFromMemory(const ShaderParameters& params,
-                                        Maxwell::ShaderProgram program_type,
-                                        ProgramCode program_code, ProgramCode program_code_b);
-    static Shader CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code);
+    static std::unique_ptr<Shader> CreateStageFromMemory(const ShaderParameters& params,
+                                                         Maxwell::ShaderProgram program_type,
+                                                         ProgramCode program_code,
+                                                         ProgramCode program_code_b);
+    static std::unique_ptr<Shader> CreateKernelFromMemory(const ShaderParameters& params,
+                                                          ProgramCode code);
 
-    static Shader CreateFromCache(const ShaderParameters& params,
-                                  const PrecompiledShader& precompiled_shader,
-                                  std::size_t size_in_bytes);
+    static std::unique_ptr<Shader> CreateFromCache(const ShaderParameters& params,
+                                                   const PrecompiledShader& precompiled_shader);
 
 private:
-    explicit CachedShader(VAddr cpu_addr, std::size_t size_in_bytes,
-                          std::shared_ptr<VideoCommon::Shader::Registry> registry,
-                          ShaderEntries entries, std::shared_ptr<OGLProgram> program);
+    explicit Shader(std::shared_ptr<VideoCommon::Shader::Registry> registry, ShaderEntries entries,
+                    ProgramSharedPtr program);
 
     std::shared_ptr<VideoCommon::Shader::Registry> registry;
     ShaderEntries entries;
-    std::size_t size_in_bytes = 0;
-    std::shared_ptr<OGLProgram> program;
+    ProgramSharedPtr program;
+    GLuint handle = 0;
 };
 
-class ShaderCacheOpenGL final : public RasterizerCache<Shader> {
+class ShaderCacheOpenGL final : public VideoCommon::ShaderCache<Shader> {
 public:
     explicit ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system,
                                Core::Frontend::EmuWindow& emu_window, const Device& device);
+    ~ShaderCacheOpenGL() override;
 
     /// Loads disk cache for the current game
     void LoadDiskCache(const std::atomic_bool& stop_loading,
                        const VideoCore::DiskResourceLoadCallback& callback);
 
     /// Gets the current specified shader stage program
-    Shader GetStageProgram(Maxwell::ShaderProgram program);
+    Shader* GetStageProgram(Maxwell::ShaderProgram program);
 
     /// Gets a compute kernel in the passed address
-    Shader GetComputeKernel(GPUVAddr code_addr);
-
-protected:
-    // We do not have to flush this cache as things in it are never modified by us.
-    void FlushObjectInner(const Shader& object) override {}
+    Shader* GetComputeKernel(GPUVAddr code_addr);
 
 private:
-    std::shared_ptr<OGLProgram> GeneratePrecompiledProgram(
+    ProgramSharedPtr GeneratePrecompiledProgram(
         const ShaderDiskCacheEntry& entry, const ShaderDiskCachePrecompiled& precompiled_entry,
         const std::unordered_set<GLenum>& supported_formats);
 
@@ -125,10 +121,10 @@ private:
     ShaderDiskCacheOpenGL disk_cache;
     std::unordered_map<u64, PrecompiledShader> runtime_cache;
 
-    Shader null_shader{};
-    Shader null_kernel{};
+    std::unique_ptr<Shader> null_shader;
+    std::unique_ptr<Shader> null_kernel;
 
-    std::array<Shader, Maxwell::MaxShaderProgram> last_shaders;
+    std::array<Shader*, Maxwell::MaxShaderProgram> last_shaders{};
 };
 
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index 960ebf1a1..d6e30b321 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -61,8 +61,8 @@ struct TextureDerivates {};
61using TextureArgument = std::pair<Type, Node>; 61using TextureArgument = std::pair<Type, Node>;
62using TextureIR = std::variant<TextureOffset, TextureDerivates, TextureArgument>; 62using TextureIR = std::variant<TextureOffset, TextureDerivates, TextureArgument>;
63 63
64constexpr u32 MAX_CONSTBUFFER_ELEMENTS = 64constexpr u32 MAX_CONSTBUFFER_SCALARS = static_cast<u32>(Maxwell::MaxConstBufferSize) / sizeof(u32);
65 static_cast<u32>(Maxwell::MaxConstBufferSize) / (4 * sizeof(float)); 65constexpr u32 MAX_CONSTBUFFER_ELEMENTS = MAX_CONSTBUFFER_SCALARS / sizeof(u32);
66 66
67constexpr std::string_view CommonDeclarations = R"(#define ftoi floatBitsToInt 67constexpr std::string_view CommonDeclarations = R"(#define ftoi floatBitsToInt
68#define ftou floatBitsToUint 68#define ftou floatBitsToUint
@@ -402,6 +402,13 @@ std::string FlowStackTopName(MetaStackClass stack) {
402 return fmt::format("{}_flow_stack_top", GetFlowStackPrefix(stack)); 402 return fmt::format("{}_flow_stack_top", GetFlowStackPrefix(stack));
403} 403}
404 404
405bool UseUnifiedUniforms(const Device& device, const ShaderIR& ir, ShaderType stage) {
406 const u32 num_ubos = static_cast<u32>(ir.GetConstantBuffers().size());
407 // We waste one UBO for emulation
408 const u32 num_available_ubos = device.GetMaxUniformBuffers(stage) - 1;
409 return num_ubos > num_available_ubos;
410}
411
405struct GenericVaryingDescription { 412struct GenericVaryingDescription {
406 std::string name; 413 std::string name;
407 u8 first_element = 0; 414 u8 first_element = 0;
@@ -412,8 +419,9 @@ class GLSLDecompiler final {
412public: 419public:
413 explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry, 420 explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry,
414 ShaderType stage, std::string_view identifier, std::string_view suffix) 421 ShaderType stage, std::string_view identifier, std::string_view suffix)
415 : device{device}, ir{ir}, registry{registry}, stage{stage}, 422 : device{device}, ir{ir}, registry{registry}, stage{stage}, identifier{identifier},
416 identifier{identifier}, suffix{suffix}, header{ir.GetHeader()} { 423 suffix{suffix}, header{ir.GetHeader()}, use_unified_uniforms{
424 UseUnifiedUniforms(device, ir, stage)} {
417 if (stage != ShaderType::Compute) { 425 if (stage != ShaderType::Compute) {
418 transform_feedback = BuildTransformFeedback(registry.GetGraphicsInfo()); 426 transform_feedback = BuildTransformFeedback(registry.GetGraphicsInfo());
419 } 427 }
@@ -618,7 +626,9 @@ private:
618 break; 626 break;
619 } 627 }
620 } 628 }
621 if (stage != ShaderType::Vertex || device.HasVertexViewportLayer()) { 629
630 if (stage != ShaderType::Geometry &&
631 (stage != ShaderType::Vertex || device.HasVertexViewportLayer())) {
622 if (ir.UsesLayer()) { 632 if (ir.UsesLayer()) {
623 code.AddLine("int gl_Layer;"); 633 code.AddLine("int gl_Layer;");
624 } 634 }
@@ -647,6 +657,16 @@ private:
647 --code.scope; 657 --code.scope;
648 code.AddLine("}};"); 658 code.AddLine("}};");
649 code.AddNewLine(); 659 code.AddNewLine();
660
661 if (stage == ShaderType::Geometry) {
662 if (ir.UsesLayer()) {
663 code.AddLine("out int gl_Layer;");
664 }
665 if (ir.UsesViewportIndex()) {
666 code.AddLine("out int gl_ViewportIndex;");
667 }
668 }
669 code.AddNewLine();
650 } 670 }
651 671
652 void DeclareRegisters() { 672 void DeclareRegisters() {
@@ -834,12 +854,24 @@ private:
834 } 854 }
835 855
836 void DeclareConstantBuffers() { 856 void DeclareConstantBuffers() {
857 if (use_unified_uniforms) {
858 const u32 binding = device.GetBaseBindings(stage).shader_storage_buffer +
859 static_cast<u32>(ir.GetGlobalMemory().size());
860 code.AddLine("layout (std430, binding = {}) readonly buffer UnifiedUniforms {{",
861 binding);
862 code.AddLine(" uint cbufs[];");
863 code.AddLine("}};");
864 code.AddNewLine();
865 return;
866 }
867
837 u32 binding = device.GetBaseBindings(stage).uniform_buffer; 868 u32 binding = device.GetBaseBindings(stage).uniform_buffer;
838 for (const auto& buffers : ir.GetConstantBuffers()) { 869 for (const auto [index, info] : ir.GetConstantBuffers()) {
839 const auto index = buffers.first; 870 const u32 num_elements = Common::AlignUp(info.GetSize(), 4) / 4;
871 const u32 size = info.IsIndirect() ? MAX_CONSTBUFFER_ELEMENTS : num_elements;
840 code.AddLine("layout (std140, binding = {}) uniform {} {{", binding++, 872 code.AddLine("layout (std140, binding = {}) uniform {} {{", binding++,
841 GetConstBufferBlock(index)); 873 GetConstBufferBlock(index));
842 code.AddLine(" uvec4 {}[{}];", GetConstBuffer(index), MAX_CONSTBUFFER_ELEMENTS); 874 code.AddLine(" uvec4 {}[{}];", GetConstBuffer(index), size);
843 code.AddLine("}};"); 875 code.AddLine("}};");
844 code.AddNewLine(); 876 code.AddNewLine();
845 } 877 }
@@ -1038,42 +1070,51 @@ private:
1038 1070
1039 if (const auto cbuf = std::get_if<CbufNode>(&*node)) { 1071 if (const auto cbuf = std::get_if<CbufNode>(&*node)) {
1040 const Node offset = cbuf->GetOffset(); 1072 const Node offset = cbuf->GetOffset();
1073 const u32 base_unified_offset = cbuf->GetIndex() * MAX_CONSTBUFFER_SCALARS;
1074
1041 if (const auto immediate = std::get_if<ImmediateNode>(&*offset)) { 1075 if (const auto immediate = std::get_if<ImmediateNode>(&*offset)) {
1042 // Direct access 1076 // Direct access
1043 const u32 offset_imm = immediate->GetValue(); 1077 const u32 offset_imm = immediate->GetValue();
1044 ASSERT_MSG(offset_imm % 4 == 0, "Unaligned cbuf direct access"); 1078 ASSERT_MSG(offset_imm % 4 == 0, "Unaligned cbuf direct access");
1045 return {fmt::format("{}[{}][{}]", GetConstBuffer(cbuf->GetIndex()), 1079 if (use_unified_uniforms) {
1046 offset_imm / (4 * 4), (offset_imm / 4) % 4), 1080 return {fmt::format("cbufs[{}]", base_unified_offset + offset_imm / 4),
1047 Type::Uint}; 1081 Type::Uint};
1082 } else {
1083 return {fmt::format("{}[{}][{}]", GetConstBuffer(cbuf->GetIndex()),
1084 offset_imm / (4 * 4), (offset_imm / 4) % 4),
1085 Type::Uint};
1086 }
1048 } 1087 }
1049 1088
1050 if (std::holds_alternative<OperationNode>(*offset)) { 1089 // Indirect access
1051 // Indirect access 1090 if (use_unified_uniforms) {
1052 const std::string final_offset = code.GenerateTemporary(); 1091 return {fmt::format("cbufs[{} + ({} >> 2)]", base_unified_offset,
1053 code.AddLine("uint {} = {} >> 2;", final_offset, Visit(offset).AsUint()); 1092 Visit(offset).AsUint()),
1093 Type::Uint};
1094 }
1054 1095
1055 if (!device.HasComponentIndexingBug()) { 1096 const std::string final_offset = code.GenerateTemporary();
1056 return {fmt::format("{}[{} >> 2][{} & 3]", GetConstBuffer(cbuf->GetIndex()), 1097 code.AddLine("uint {} = {} >> 2;", final_offset, Visit(offset).AsUint());
1057 final_offset, final_offset),
1058 Type::Uint};
1059 }
1060 1098
1061 // AMD's proprietary GLSL compiler emits ill code for variable component access. 1099 if (!device.HasComponentIndexingBug()) {
1062 // To bypass this driver bug generate 4 ifs, one per each component. 1100 return {fmt::format("{}[{} >> 2][{} & 3]", GetConstBuffer(cbuf->GetIndex()),
1063 const std::string pack = code.GenerateTemporary(); 1101 final_offset, final_offset),
1064 code.AddLine("uvec4 {} = {}[{} >> 2];", pack, GetConstBuffer(cbuf->GetIndex()), 1102 Type::Uint};
1065 final_offset);
1066
1067 const std::string result = code.GenerateTemporary();
1068 code.AddLine("uint {};", result);
1069 for (u32 swizzle = 0; swizzle < 4; ++swizzle) {
1070 code.AddLine("if (({} & 3) == {}) {} = {}{};", final_offset, swizzle, result,
1071 pack, GetSwizzle(swizzle));
1072 }
1073 return {result, Type::Uint};
1074 } 1103 }
1075 1104
1076 UNREACHABLE_MSG("Unmanaged offset node type"); 1105 // AMD's proprietary GLSL compiler emits ill code for variable component access.
1106 // To bypass this driver bug generate 4 ifs, one per each component.
1107 const std::string pack = code.GenerateTemporary();
1108 code.AddLine("uvec4 {} = {}[{} >> 2];", pack, GetConstBuffer(cbuf->GetIndex()),
1109 final_offset);
1110
1111 const std::string result = code.GenerateTemporary();
1112 code.AddLine("uint {};", result);
1113 for (u32 swizzle = 0; swizzle < 4; ++swizzle) {
1114 code.AddLine("if (({} & 3) == {}) {} = {}{};", final_offset, swizzle, result, pack,
1115 GetSwizzle(swizzle));
1116 }
1117 return {result, Type::Uint};
1077 } 1118 }
1078 1119
1079 if (const auto gmem = std::get_if<GmemNode>(&*node)) { 1120 if (const auto gmem = std::get_if<GmemNode>(&*node)) {
@@ -1538,7 +1579,9 @@ private:
         Expression target;
         if (const auto gpr = std::get_if<GprNode>(&*dest)) {
             if (gpr->GetIndex() == Register::ZeroIndex) {
-                // Writing to Register::ZeroIndex is a no op
+                // Writing to Register::ZeroIndex is a no op but we still have to visit the source
+                // as it might have side effects.
+                code.AddLine("{};", Visit(src).GetCode());
                 return {};
             }
             target = {GetRegister(gpr->GetIndex()), Type::Float};
@@ -2309,6 +2352,18 @@ private:
         return {"gl_SubGroupInvocationARB", Type::Uint};
     }
 
+    template <const std::string_view& comparison>
+    Expression ThreadMask(Operation) {
+        if (device.HasWarpIntrinsics()) {
+            return {fmt::format("gl_Thread{}MaskNV", comparison), Type::Uint};
+        }
+        if (device.HasShaderBallot()) {
+            return {fmt::format("uint(gl_SubGroup{}MaskARB)", comparison), Type::Uint};
+        }
+        LOG_ERROR(Render_OpenGL, "Thread mask intrinsics are required by the shader");
+        return {"0U", Type::Uint};
+    }
+
     Expression ShuffleIndexed(Operation operation) {
         std::string value = VisitOperand(operation, 0).AsFloat();
 
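The ThreadMask template added above takes a reference to a constexpr std::string_view as a non-type template parameter, so one function template stamps out the Eq/Ge/Gt/Le/Lt variants and splices the comparison name into the GLSL intrinsic. A self-contained sketch of the same mechanism; Func mirrors the constants added further down in this diff, trimmed to two entries:

    #include <cstdio>
    #include <string>
    #include <string_view>

    struct Func {
        static constexpr std::string_view Eq = "Eq";
        static constexpr std::string_view Lt = "Lt";
    };

    // The string_view must have static storage duration to be usable as a
    // reference non-type template parameter.
    template <const std::string_view& comparison>
    std::string ThreadMaskName() {
        return "gl_Thread" + std::string(comparison) + "MaskNV";
    }

    int main() {
        std::puts(ThreadMaskName<Func::Eq>().c_str()); // prints gl_ThreadEqMaskNV
        std::puts(ThreadMaskName<Func::Lt>().c_str()); // prints gl_ThreadLtMaskNV
    }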
@@ -2321,7 +2376,21 @@ private:
         return {fmt::format("readInvocationARB({}, {})", value, index), Type::Float};
     }
 
-    Expression MemoryBarrierGL(Operation) {
+    Expression Barrier(Operation) {
+        if (!ir.IsDecompiled()) {
+            LOG_ERROR(Render_OpenGL, "barrier() used but shader is not decompiled");
+            return {};
+        }
+        code.AddLine("barrier();");
+        return {};
+    }
+
+    Expression MemoryBarrierGroup(Operation) {
+        code.AddLine("groupMemoryBarrier();");
+        return {};
+    }
+
+    Expression MemoryBarrierGlobal(Operation) {
         code.AddLine("memoryBarrier();");
         return {};
     }
@@ -2337,6 +2406,12 @@ private:
     static constexpr std::string_view NotEqual = "!=";
     static constexpr std::string_view GreaterEqual = ">=";
 
+    static constexpr std::string_view Eq = "Eq";
+    static constexpr std::string_view Ge = "Ge";
+    static constexpr std::string_view Gt = "Gt";
+    static constexpr std::string_view Le = "Le";
+    static constexpr std::string_view Lt = "Lt";
+
     static constexpr std::string_view Add = "Add";
     static constexpr std::string_view Min = "Min";
     static constexpr std::string_view Max = "Max";
@@ -2554,9 +2629,16 @@ private:
         &GLSLDecompiler::VoteEqual,
 
         &GLSLDecompiler::ThreadId,
+        &GLSLDecompiler::ThreadMask<Func::Eq>,
+        &GLSLDecompiler::ThreadMask<Func::Ge>,
+        &GLSLDecompiler::ThreadMask<Func::Gt>,
+        &GLSLDecompiler::ThreadMask<Func::Le>,
+        &GLSLDecompiler::ThreadMask<Func::Lt>,
         &GLSLDecompiler::ShuffleIndexed,
 
-        &GLSLDecompiler::MemoryBarrierGL,
+        &GLSLDecompiler::Barrier,
+        &GLSLDecompiler::MemoryBarrierGroup,
+        &GLSLDecompiler::MemoryBarrierGlobal,
     };
     static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount));
 
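operation_decompilers above is an array of pointer-to-member functions indexed by OperationCode, which is why the new thread-mask and barrier handlers must be inserted at exactly the positions of their opcodes, and why the static_assert against OperationCode::Amount is kept: it catches a table that drifted out of sync with the enum. A reduced model of the dispatch pattern, with hypothetical opcodes:

    #include <array>
    #include <cstddef>
    #include <cstdio>

    enum class OpCode { Add, Mul, Amount };

    class Decompiler {
    public:
        void Visit(OpCode op) {
            // Index the table with the opcode and invoke through `this`.
            (this->*decompilers[static_cast<std::size_t>(op)])();
        }

    private:
        void Add() { std::puts("add"); }
        void Mul() { std::puts("mul"); }

        using Fn = void (Decompiler::*)();
        static constexpr std::array<Fn, 2> decompilers{&Decompiler::Add, &Decompiler::Mul};
        static_assert(decompilers.size() == static_cast<std::size_t>(OpCode::Amount));
    };

    int main() {
        Decompiler{}.Visit(OpCode::Mul); // prints "mul"
    }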
@@ -2669,6 +2751,7 @@ private:
     const std::string_view identifier;
     const std::string_view suffix;
     const Header header;
+    const bool use_unified_uniforms;
     std::unordered_map<u8, VaryingTFB> transform_feedback;
 
     ShaderWriter code;
@@ -2864,7 +2947,7 @@ void GLSLDecompiler::DecompileAST() {
 
 } // Anonymous namespace
 
-ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir) {
+ShaderEntries MakeEntries(const Device& device, const ShaderIR& ir, ShaderType stage) {
     ShaderEntries entries;
     for (const auto& cbuf : ir.GetConstantBuffers()) {
         entries.const_buffers.emplace_back(cbuf.second.GetMaxOffset(), cbuf.second.IsIndirect(),
@@ -2885,6 +2968,7 @@ ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir)
         entries.clip_distances = (clip_distances[i] ? 1U : 0U) << i;
     }
     entries.shader_length = ir.GetLength();
+    entries.use_unified_uniforms = UseUnifiedUniforms(device, ir, stage);
     return entries;
 }
 
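The clip_distances field written above is a u32 bitmask with one bit per clip distance. Note that the loop in the hunk assigns the shifted flag on every pass; a mask like this is conventionally accumulated with |=, so treat the sketch below as the intended packing scheme rather than a verbatim copy of the decompiler:

    #include <array>
    #include <cstddef>
    #include <cstdint>

    // Build a bitmask with bit i set when clip distance i is used.
    constexpr std::uint32_t PackClipDistances(const std::array<bool, 8>& used) {
        std::uint32_t mask = 0;
        for (std::size_t i = 0; i < used.size(); ++i) {
            mask |= (used[i] ? 1U : 0U) << i;
        }
        return mask;
    }

    static_assert(PackClipDistances({true, false, true}) == 0b101);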
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.h b/src/video_core/renderer_opengl/gl_shader_decompiler.h
index e8a178764..451c9689a 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.h
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.h
@@ -53,11 +53,13 @@ struct ShaderEntries {
     std::vector<GlobalMemoryEntry> global_memory_entries;
     std::vector<SamplerEntry> samplers;
     std::vector<ImageEntry> images;
-    u32 clip_distances{};
     std::size_t shader_length{};
+    u32 clip_distances{};
+    bool use_unified_uniforms{};
 };
 
-ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir);
+ShaderEntries MakeEntries(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
+                          Tegra::Engines::ShaderType stage);
 
 std::string DecompileShader(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
                             const VideoCommon::Shader::Registry& registry,
diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
index 9e95a122b..653c3f2f9 100644
--- a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
@@ -29,6 +29,8 @@ using VideoCommon::Shader::KeyMap;
 
 namespace {
 
+using VideoCommon::Shader::SeparateSamplerKey;
+
 using ShaderCacheVersionHash = std::array<u8, 64>;
 
 struct ConstBufferKey {
@@ -37,18 +39,26 @@ struct ConstBufferKey {
     u32 offset = 0;
     u32 value = 0;
 };
 
-struct BoundSamplerKey {
+struct BoundSamplerEntry {
     u32 offset = 0;
     Tegra::Engines::SamplerDescriptor sampler;
 };
 
-struct BindlessSamplerKey {
+struct SeparateSamplerEntry {
+    u32 cbuf1 = 0;
+    u32 cbuf2 = 0;
+    u32 offset1 = 0;
+    u32 offset2 = 0;
+    Tegra::Engines::SamplerDescriptor sampler;
+};
+
+struct BindlessSamplerEntry {
     u32 cbuf = 0;
     u32 offset = 0;
     Tegra::Engines::SamplerDescriptor sampler;
 };
 
-constexpr u32 NativeVersion = 20;
+constexpr u32 NativeVersion = 21;
 
 ShaderCacheVersionHash GetShaderCacheVersionHash() {
     ShaderCacheVersionHash hash{};
@@ -87,12 +97,14 @@ bool ShaderDiskCacheEntry::Load(FileUtil::IOFile& file) {
     u32 texture_handler_size_value;
     u32 num_keys;
     u32 num_bound_samplers;
+    u32 num_separate_samplers;
     u32 num_bindless_samplers;
     if (file.ReadArray(&unique_identifier, 1) != 1 || file.ReadArray(&bound_buffer, 1) != 1 ||
         file.ReadArray(&is_texture_handler_size_known, 1) != 1 ||
         file.ReadArray(&texture_handler_size_value, 1) != 1 ||
         file.ReadArray(&graphics_info, 1) != 1 || file.ReadArray(&compute_info, 1) != 1 ||
         file.ReadArray(&num_keys, 1) != 1 || file.ReadArray(&num_bound_samplers, 1) != 1 ||
+        file.ReadArray(&num_separate_samplers, 1) != 1 ||
         file.ReadArray(&num_bindless_samplers, 1) != 1) {
         return false;
     }
@@ -101,23 +113,32 @@ bool ShaderDiskCacheEntry::Load(FileUtil::IOFile& file) {
     }
 
     std::vector<ConstBufferKey> flat_keys(num_keys);
-    std::vector<BoundSamplerKey> flat_bound_samplers(num_bound_samplers);
-    std::vector<BindlessSamplerKey> flat_bindless_samplers(num_bindless_samplers);
+    std::vector<BoundSamplerEntry> flat_bound_samplers(num_bound_samplers);
+    std::vector<SeparateSamplerEntry> flat_separate_samplers(num_separate_samplers);
+    std::vector<BindlessSamplerEntry> flat_bindless_samplers(num_bindless_samplers);
     if (file.ReadArray(flat_keys.data(), flat_keys.size()) != flat_keys.size() ||
         file.ReadArray(flat_bound_samplers.data(), flat_bound_samplers.size()) !=
             flat_bound_samplers.size() ||
+        file.ReadArray(flat_separate_samplers.data(), flat_separate_samplers.size()) !=
+            flat_separate_samplers.size() ||
         file.ReadArray(flat_bindless_samplers.data(), flat_bindless_samplers.size()) !=
             flat_bindless_samplers.size()) {
         return false;
     }
-    for (const auto& key : flat_keys) {
-        keys.insert({{key.cbuf, key.offset}, key.value});
+    for (const auto& entry : flat_keys) {
+        keys.insert({{entry.cbuf, entry.offset}, entry.value});
     }
-    for (const auto& key : flat_bound_samplers) {
-        bound_samplers.emplace(key.offset, key.sampler);
+    for (const auto& entry : flat_bound_samplers) {
+        bound_samplers.emplace(entry.offset, entry.sampler);
     }
-    for (const auto& key : flat_bindless_samplers) {
-        bindless_samplers.insert({{key.cbuf, key.offset}, key.sampler});
+    for (const auto& entry : flat_separate_samplers) {
+        SeparateSamplerKey key;
+        key.buffers = {entry.cbuf1, entry.cbuf2};
+        key.offsets = {entry.offset1, entry.offset2};
+        separate_samplers.emplace(key, entry.sampler);
+    }
+    for (const auto& entry : flat_bindless_samplers) {
+        bindless_samplers.insert({{entry.cbuf, entry.offset}, entry.sampler});
     }
 
     return true;
@@ -142,6 +163,7 @@ bool ShaderDiskCacheEntry::Save(FileUtil::IOFile& file) const {
         file.WriteObject(graphics_info) != 1 || file.WriteObject(compute_info) != 1 ||
         file.WriteObject(static_cast<u32>(keys.size())) != 1 ||
         file.WriteObject(static_cast<u32>(bound_samplers.size())) != 1 ||
+        file.WriteObject(static_cast<u32>(separate_samplers.size())) != 1 ||
        file.WriteObject(static_cast<u32>(bindless_samplers.size())) != 1) {
         return false;
     }
@@ -152,22 +174,34 @@ bool ShaderDiskCacheEntry::Save(FileUtil::IOFile& file) const {
         flat_keys.push_back(ConstBufferKey{address.first, address.second, value});
     }
 
-    std::vector<BoundSamplerKey> flat_bound_samplers;
+    std::vector<BoundSamplerEntry> flat_bound_samplers;
     flat_bound_samplers.reserve(bound_samplers.size());
     for (const auto& [address, sampler] : bound_samplers) {
-        flat_bound_samplers.push_back(BoundSamplerKey{address, sampler});
+        flat_bound_samplers.push_back(BoundSamplerEntry{address, sampler});
+    }
+
+    std::vector<SeparateSamplerEntry> flat_separate_samplers;
+    flat_separate_samplers.reserve(separate_samplers.size());
+    for (const auto& [key, sampler] : separate_samplers) {
+        SeparateSamplerEntry entry;
+        std::tie(entry.cbuf1, entry.cbuf2) = key.buffers;
+        std::tie(entry.offset1, entry.offset2) = key.offsets;
+        entry.sampler = sampler;
+        flat_separate_samplers.push_back(entry);
     }
 
-    std::vector<BindlessSamplerKey> flat_bindless_samplers;
+    std::vector<BindlessSamplerEntry> flat_bindless_samplers;
     flat_bindless_samplers.reserve(bindless_samplers.size());
     for (const auto& [address, sampler] : bindless_samplers) {
         flat_bindless_samplers.push_back(
-            BindlessSamplerKey{address.first, address.second, sampler});
+            BindlessSamplerEntry{address.first, address.second, sampler});
     }
 
     return file.WriteArray(flat_keys.data(), flat_keys.size()) == flat_keys.size() &&
            file.WriteArray(flat_bound_samplers.data(), flat_bound_samplers.size()) ==
               flat_bound_samplers.size() &&
+           file.WriteArray(flat_separate_samplers.data(), flat_separate_samplers.size()) ==
+               flat_separate_samplers.size() &&
           file.WriteArray(flat_bindless_samplers.data(), flat_bindless_samplers.size()) ==
               flat_bindless_samplers.size();
 }
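Load and Save above shuttle data through plain *Entry structs with raw ReadArray/WriteArray calls, which is only sound for trivially copyable types with a stable layout; that is also why the map-friendly SeparateSamplerKey is flattened into SeparateSamplerEntry before it touches the file, and why NativeVersion is bumped from 20 to 21 so that old cache files are rejected instead of misread. A minimal sketch of the guarantees worth asserting for such on-disk records (the struct is illustrative):

    #include <cstdint>
    #include <type_traits>

    struct SeparateSamplerRecord {
        std::uint32_t cbuf1 = 0;
        std::uint32_t cbuf2 = 0;
        std::uint32_t offset1 = 0;
        std::uint32_t offset2 = 0;
    };

    // Raw file I/O over structs assumes no pointers, no vtables and no padding surprises.
    static_assert(std::is_trivially_copyable_v<SeparateSamplerRecord>);
    static_assert(sizeof(SeparateSamplerRecord) == 4 * sizeof(std::uint32_t));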
diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.h b/src/video_core/renderer_opengl/gl_shader_disk_cache.h
index d5be52e40..a79cef0e9 100644
--- a/src/video_core/renderer_opengl/gl_shader_disk_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.h
@@ -57,6 +57,7 @@ struct ShaderDiskCacheEntry {
     VideoCommon::Shader::ComputeInfo compute_info;
     VideoCommon::Shader::KeyMap keys;
     VideoCommon::Shader::BoundSamplerMap bound_samplers;
+    VideoCommon::Shader::SeparateSamplerMap separate_samplers;
     VideoCommon::Shader::BindlessSamplerMap bindless_samplers;
 };
 
diff --git a/src/video_core/renderer_opengl/gl_shader_manager.cpp b/src/video_core/renderer_opengl/gl_shader_manager.cpp
index 9c7b0adbd..8e754fa90 100644
--- a/src/video_core/renderer_opengl/gl_shader_manager.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_manager.cpp
@@ -6,45 +6,109 @@
 
 #include "common/common_types.h"
 #include "video_core/engines/maxwell_3d.h"
+#include "video_core/renderer_opengl/gl_device.h"
 #include "video_core/renderer_opengl/gl_shader_manager.h"
 
-namespace OpenGL::GLShader {
+namespace OpenGL {
 
-ProgramManager::ProgramManager() = default;
+ProgramManager::ProgramManager(const Device& device) {
+    use_assembly_programs = device.UseAssemblyShaders();
+    if (use_assembly_programs) {
+        glEnable(GL_COMPUTE_PROGRAM_NV);
+    } else {
+        graphics_pipeline.Create();
+        glBindProgramPipeline(graphics_pipeline.handle);
+    }
+}
 
 ProgramManager::~ProgramManager() = default;
 
-void ProgramManager::Create() {
-    graphics_pipeline.Create();
-    glBindProgramPipeline(graphics_pipeline.handle);
+void ProgramManager::BindCompute(GLuint program) {
+    if (use_assembly_programs) {
+        glBindProgramARB(GL_COMPUTE_PROGRAM_NV, program);
+    } else {
+        is_graphics_bound = false;
+        glUseProgram(program);
+    }
 }
 
 void ProgramManager::BindGraphicsPipeline() {
-    if (!is_graphics_bound) {
-        is_graphics_bound = true;
-        glUseProgram(0);
+    if (use_assembly_programs) {
+        UpdateAssemblyPrograms();
+    } else {
+        UpdateSourcePrograms();
     }
+}
 
-    // Avoid updating the pipeline when values have no changed
-    if (old_state == current_state) {
-        return;
+void ProgramManager::BindHostPipeline(GLuint pipeline) {
+    if (use_assembly_programs) {
+        if (geometry_enabled) {
+            geometry_enabled = false;
+            old_state.geometry = 0;
+            glDisable(GL_GEOMETRY_PROGRAM_NV);
+        }
+    } else {
+        if (!is_graphics_bound) {
+            glUseProgram(0);
+        }
     }
+    glBindProgramPipeline(pipeline);
+}
 
-    // Workaround for AMD bug
-    static constexpr GLenum all_used_stages{GL_VERTEX_SHADER_BIT | GL_GEOMETRY_SHADER_BIT |
-                                            GL_FRAGMENT_SHADER_BIT};
-    const GLuint handle = graphics_pipeline.handle;
-    glUseProgramStages(handle, all_used_stages, 0);
-    glUseProgramStages(handle, GL_VERTEX_SHADER_BIT, current_state.vertex_shader);
-    glUseProgramStages(handle, GL_GEOMETRY_SHADER_BIT, current_state.geometry_shader);
-    glUseProgramStages(handle, GL_FRAGMENT_SHADER_BIT, current_state.fragment_shader);
+void ProgramManager::RestoreGuestPipeline() {
+    if (use_assembly_programs) {
+        glBindProgramPipeline(0);
+    } else {
+        glBindProgramPipeline(graphics_pipeline.handle);
+    }
+}
+
+void ProgramManager::UpdateAssemblyPrograms() {
+    const auto update_state = [](GLenum stage, bool& enabled, GLuint current, GLuint old) {
+        if (current == old) {
+            return;
+        }
+        if (current == 0) {
+            if (enabled) {
+                enabled = false;
+                glDisable(stage);
+            }
+            return;
+        }
+        if (!enabled) {
+            enabled = true;
+            glEnable(stage);
+        }
+        glBindProgramARB(stage, current);
+    };
+
+    update_state(GL_VERTEX_PROGRAM_NV, vertex_enabled, current_state.vertex, old_state.vertex);
+    update_state(GL_GEOMETRY_PROGRAM_NV, geometry_enabled, current_state.geometry,
+                 old_state.geometry);
+    update_state(GL_FRAGMENT_PROGRAM_NV, fragment_enabled, current_state.fragment,
+                 old_state.fragment);
 
     old_state = current_state;
 }
 
-void ProgramManager::BindComputeShader(GLuint program) {
-    is_graphics_bound = false;
-    glUseProgram(program);
+void ProgramManager::UpdateSourcePrograms() {
+    if (!is_graphics_bound) {
+        is_graphics_bound = true;
+        glUseProgram(0);
+    }
+
+    const GLuint handle = graphics_pipeline.handle;
+    const auto update_state = [handle](GLenum stage, GLuint current, GLuint old) {
+        if (current == old) {
+            return;
+        }
+        glUseProgramStages(handle, stage, current);
+    };
+    update_state(GL_VERTEX_SHADER_BIT, current_state.vertex, old_state.vertex);
+    update_state(GL_GEOMETRY_SHADER_BIT, current_state.geometry, old_state.geometry);
+    update_state(GL_FRAGMENT_SHADER_BIT, current_state.fragment, old_state.fragment);
+
+    old_state = current_state;
 }
 
 void MaxwellUniformData::SetFromRegs(const Tegra::Engines::Maxwell3D& maxwell) {
@@ -54,4 +118,4 @@ void MaxwellUniformData::SetFromRegs(const Tegra::Engines::Maxwell3D& maxwell) {
     y_direction = regs.screen_y_control.y_negate == 0 ? 1.0f : -1.0f;
 }
 
-} // namespace OpenGL::GLShader
+} // namespace OpenGL
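Both UpdateAssemblyPrograms and UpdateSourcePrograms follow the same redundant-call-elision pattern: keep the last bound handles in old_state, compare per stage, and only issue a GL call when a handle actually changed. The source-program path distilled into a standalone sketch, with the GL call replaced by a stub:

    #include <cstdio>

    using GLuint = unsigned int;

    struct PipelineState {
        GLuint vertex = 0;
        GLuint geometry = 0;
        GLuint fragment = 0;
    };

    // Stand-in for glUseProgramStages; real code would call into GL here.
    void UseStage(const char* stage, GLuint program) {
        std::printf("bind %s -> %u\n", stage, program);
    }

    void Update(PipelineState& old_state, const PipelineState& current) {
        const auto update = [](const char* stage, GLuint cur, GLuint old) {
            if (cur != old) {
                UseStage(stage, cur); // skipped entirely when nothing changed
            }
        };
        update("vertex", current.vertex, old_state.vertex);
        update("geometry", current.geometry, old_state.geometry);
        update("fragment", current.fragment, old_state.fragment);
        old_state = current;
    }

    int main() {
        PipelineState old_state;
        const PipelineState current{3, 0, 7};
        Update(old_state, current); // binds vertex and fragment only
        Update(old_state, current); // second call issues no binds at all
    }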
diff --git a/src/video_core/renderer_opengl/gl_shader_manager.h b/src/video_core/renderer_opengl/gl_shader_manager.h
index d2e47f2a9..0f03b4f12 100644
--- a/src/video_core/renderer_opengl/gl_shader_manager.h
+++ b/src/video_core/renderer_opengl/gl_shader_manager.h
@@ -11,7 +11,9 @@
 #include "video_core/renderer_opengl/gl_resource_manager.h"
 #include "video_core/renderer_opengl/maxwell_to_gl.h"
 
-namespace OpenGL::GLShader {
+namespace OpenGL {
+
+class Device;
 
 /// Uniform structure for the Uniform Buffer Object, all vectors must be 16-byte aligned
 /// @note Always keep a vec4 at the end. The GL spec is not clear whether the alignment at
@@ -28,50 +30,58 @@ static_assert(sizeof(MaxwellUniformData) < 16384,
 
 class ProgramManager {
 public:
-    explicit ProgramManager();
+    explicit ProgramManager(const Device& device);
     ~ProgramManager();
 
-    void Create();
+    /// Binds a compute program
+    void BindCompute(GLuint program);
 
-    /// Updates the graphics pipeline and binds it.
+    /// Updates bound programs.
     void BindGraphicsPipeline();
 
-    /// Binds a compute shader.
-    void BindComputeShader(GLuint program);
+    /// Binds an OpenGL pipeline object unsynchronized with the guest state.
+    void BindHostPipeline(GLuint pipeline);
+
+    /// Rewinds BindHostPipeline state changes.
+    void RestoreGuestPipeline();
 
     void UseVertexShader(GLuint program) {
-        current_state.vertex_shader = program;
+        current_state.vertex = program;
     }
 
     void UseGeometryShader(GLuint program) {
-        current_state.geometry_shader = program;
+        current_state.geometry = program;
     }
 
     void UseFragmentShader(GLuint program) {
-        current_state.fragment_shader = program;
+        current_state.fragment = program;
     }
 
 private:
     struct PipelineState {
-        bool operator==(const PipelineState& rhs) const noexcept {
-            return vertex_shader == rhs.vertex_shader && fragment_shader == rhs.fragment_shader &&
-                   geometry_shader == rhs.geometry_shader;
-        }
-
-        bool operator!=(const PipelineState& rhs) const noexcept {
-            return !operator==(rhs);
-        }
-
-        GLuint vertex_shader = 0;
-        GLuint fragment_shader = 0;
-        GLuint geometry_shader = 0;
+        GLuint vertex = 0;
+        GLuint geometry = 0;
+        GLuint fragment = 0;
     };
 
+    /// Update NV_gpu_program5 programs.
+    void UpdateAssemblyPrograms();
+
+    /// Update GLSL programs.
+    void UpdateSourcePrograms();
+
     OGLPipeline graphics_pipeline;
-    OGLPipeline compute_pipeline;
+
     PipelineState current_state;
     PipelineState old_state;
+
+    bool use_assembly_programs = false;
+
     bool is_graphics_bound = true;
+
+    bool vertex_enabled = false;
+    bool geometry_enabled = false;
+    bool fragment_enabled = false;
 };
 
-} // namespace OpenGL::GLShader
+} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.cpp b/src/video_core/renderer_opengl/gl_stream_buffer.cpp
index 6ec328c53..932a2f69e 100644
--- a/src/video_core/renderer_opengl/gl_stream_buffer.cpp
+++ b/src/video_core/renderer_opengl/gl_stream_buffer.cpp
@@ -49,14 +49,6 @@ OGLStreamBuffer::~OGLStreamBuffer() {
     gl_buffer.Release();
 }
 
-GLuint OGLStreamBuffer::GetHandle() const {
-    return gl_buffer.handle;
-}
-
-GLsizeiptr OGLStreamBuffer::GetSize() const {
-    return buffer_size;
-}
-
 std::tuple<u8*, GLintptr, bool> OGLStreamBuffer::Map(GLsizeiptr size, GLintptr alignment) {
     ASSERT(size <= buffer_size);
     ASSERT(alignment <= buffer_size);
diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.h b/src/video_core/renderer_opengl/gl_stream_buffer.h
index f8383cbd4..866da3594 100644
--- a/src/video_core/renderer_opengl/gl_stream_buffer.h
+++ b/src/video_core/renderer_opengl/gl_stream_buffer.h
@@ -17,9 +17,6 @@ public:
                              bool use_persistent = true);
     ~OGLStreamBuffer();
 
-    GLuint GetHandle() const;
-    GLsizeiptr GetSize() const;
-
     /*
      * Allocates a linear chunk of memory in the GPU buffer with at least "size" bytes
      * and the optional alignment requirement.
@@ -32,6 +29,14 @@ public:
 
     void Unmap(GLsizeiptr size);
 
+    GLuint Handle() const {
+        return gl_buffer.handle;
+    }
+
+    GLsizeiptr Size() const {
+        return buffer_size;
+    }
+
 private:
     OGLBuffer gl_buffer;
 
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp
index 94fbd2a22..61505879b 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp
@@ -35,7 +35,7 @@ MICROPROFILE_DEFINE(OpenGL_Texture_Buffer_Copy, "OpenGL", "Texture Buffer Copy",
 namespace {
 
 struct FormatTuple {
-    GLint internal_format;
+    GLenum internal_format;
     GLenum format = GL_NONE;
     GLenum type = GL_NONE;
 };
@@ -238,6 +238,12 @@ OGLTexture CreateTexture(const SurfaceParams& params, GLenum target, GLenum inte
     return texture;
 }
 
+constexpr u32 EncodeSwizzle(SwizzleSource x_source, SwizzleSource y_source, SwizzleSource z_source,
+                            SwizzleSource w_source) {
+    return (static_cast<u32>(x_source) << 24) | (static_cast<u32>(y_source) << 16) |
+           (static_cast<u32>(z_source) << 8) | static_cast<u32>(w_source);
+}
+
 } // Anonymous namespace
 
 CachedSurface::CachedSurface(const GPUVAddr gpu_addr, const SurfaceParams& params,
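EncodeSwizzle above packs the four swizzle sources into one u32, one byte per component with x in the most significant byte; the packed value later doubles as the key of the per-view cache. A standalone sketch with a matching decoder; both the enum values and the decode helper are illustrative additions, not taken from the diff:

    #include <cstdint>

    enum class SwizzleSource : std::uint32_t { Zero = 0, R = 2, G = 3, B = 4, A = 5 };

    constexpr std::uint32_t EncodeSwizzle(SwizzleSource x, SwizzleSource y, SwizzleSource z,
                                          SwizzleSource w) {
        return (static_cast<std::uint32_t>(x) << 24) | (static_cast<std::uint32_t>(y) << 16) |
               (static_cast<std::uint32_t>(z) << 8) | static_cast<std::uint32_t>(w);
    }

    // Inverse helper: extract component i, where i = 0 selects x.
    constexpr SwizzleSource DecodeSwizzle(std::uint32_t packed, unsigned i) {
        return static_cast<SwizzleSource>((packed >> (24 - 8 * i)) & 0xff);
    }

    static_assert(DecodeSwizzle(EncodeSwizzle(SwizzleSource::R, SwizzleSource::G,
                                              SwizzleSource::B, SwizzleSource::A),
                                1) == SwizzleSource::G);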
@@ -257,9 +263,14 @@ CachedSurface::CachedSurface(const GPUVAddr gpu_addr, const SurfaceParams& param
     target = GetTextureTarget(params.target);
     texture = CreateTexture(params, target, internal_format, texture_buffer);
     DecorateSurfaceName();
-    main_view = CreateViewInner(
-        ViewParams(params.target, 0, params.is_layered ? params.depth : 1, 0, params.num_levels),
-        true);
+
+    u32 num_layers = 1;
+    if (params.is_layered || params.target == SurfaceTarget::Texture3D) {
+        num_layers = params.depth;
+    }
+
+    main_view =
+        CreateViewInner(ViewParams(params.target, 0, num_layers, 0, params.num_levels), true);
 }
 
 CachedSurface::~CachedSurface() = default;
@@ -381,7 +392,7 @@ void CachedSurface::DecorateSurfaceName() {
 }
 
 void CachedSurfaceView::DecorateViewName(GPUVAddr gpu_addr, std::string prefix) {
-    LabelGLObject(GL_TEXTURE, texture_view.handle, gpu_addr, prefix);
+    LabelGLObject(GL_TEXTURE, main_view.handle, gpu_addr, prefix);
 }
 
 View CachedSurface::CreateView(const ViewParams& view_key) {
@@ -397,32 +408,33 @@ View CachedSurface::CreateViewInner(const ViewParams& view_key, const bool is_pr
 }
 
 CachedSurfaceView::CachedSurfaceView(CachedSurface& surface, const ViewParams& params,
-                                     const bool is_proxy)
-    : VideoCommon::ViewBase(params), surface{surface}, is_proxy{is_proxy} {
-    target = GetTextureTarget(params.target);
-    format = GetFormatTuple(surface.GetSurfaceParams().pixel_format).internal_format;
+                                     bool is_proxy)
+    : VideoCommon::ViewBase(params), surface{surface}, format{surface.internal_format},
+      target{GetTextureTarget(params.target)}, is_proxy{is_proxy} {
     if (!is_proxy) {
-        texture_view = CreateTextureView();
+        main_view = CreateTextureView();
     }
-    swizzle = EncodeSwizzle(SwizzleSource::R, SwizzleSource::G, SwizzleSource::B, SwizzleSource::A);
 }
 
 CachedSurfaceView::~CachedSurfaceView() = default;
 
-void CachedSurfaceView::Attach(GLenum attachment, GLenum target) const {
+void CachedSurfaceView::Attach(GLenum attachment, GLenum fb_target) const {
     ASSERT(params.num_levels == 1);
 
+    if (params.target == SurfaceTarget::Texture3D) {
+        if (params.num_layers > 1) {
+            ASSERT(params.base_layer == 0);
+            glFramebufferTexture(fb_target, attachment, surface.texture.handle, params.base_level);
+        } else {
+            glFramebufferTexture3D(fb_target, attachment, target, surface.texture.handle,
+                                   params.base_level, params.base_layer);
+        }
+        return;
+    }
+
     if (params.num_layers > 1) {
-        // Layered framebuffer attachments
         UNIMPLEMENTED_IF(params.base_layer != 0);
-
-        switch (params.target) {
-        case SurfaceTarget::Texture2DArray:
-            glFramebufferTexture(target, attachment, GetTexture(), 0);
-            break;
-        default:
-            UNIMPLEMENTED();
-        }
+        glFramebufferTexture(fb_target, attachment, GetTexture(), 0);
         return;
     }
 
@@ -430,16 +442,16 @@ void CachedSurfaceView::Attach(GLenum attachment, GLenum target) const {
     const GLuint texture = surface.GetTexture();
     switch (surface.GetSurfaceParams().target) {
     case SurfaceTarget::Texture1D:
-        glFramebufferTexture1D(target, attachment, view_target, texture, params.base_level);
+        glFramebufferTexture1D(fb_target, attachment, view_target, texture, params.base_level);
         break;
     case SurfaceTarget::Texture2D:
-        glFramebufferTexture2D(target, attachment, view_target, texture, params.base_level);
+        glFramebufferTexture2D(fb_target, attachment, view_target, texture, params.base_level);
         break;
     case SurfaceTarget::Texture1DArray:
     case SurfaceTarget::Texture2DArray:
     case SurfaceTarget::TextureCubemap:
     case SurfaceTarget::TextureCubeArray:
-        glFramebufferTextureLayer(target, attachment, texture, params.base_level,
+        glFramebufferTextureLayer(fb_target, attachment, texture, params.base_level,
                                   params.base_layer);
         break;
     default:
@@ -447,35 +459,62 @@ void CachedSurfaceView::Attach(GLenum attachment, GLenum target) const {
     }
 }
 
-void CachedSurfaceView::ApplySwizzle(SwizzleSource x_source, SwizzleSource y_source,
-                                     SwizzleSource z_source, SwizzleSource w_source) {
-    u32 new_swizzle = EncodeSwizzle(x_source, y_source, z_source, w_source);
-    if (new_swizzle == swizzle)
-        return;
-    swizzle = new_swizzle;
-    const std::array gl_swizzle = {GetSwizzleSource(x_source), GetSwizzleSource(y_source),
-                                   GetSwizzleSource(z_source), GetSwizzleSource(w_source)};
-    const GLuint handle = GetTexture();
-    const PixelFormat format = surface.GetSurfaceParams().pixel_format;
-    switch (format) {
+GLuint CachedSurfaceView::GetTexture(SwizzleSource x_source, SwizzleSource y_source,
+                                     SwizzleSource z_source, SwizzleSource w_source) {
+    if (GetSurfaceParams().IsBuffer()) {
+        return GetTexture();
+    }
+    const u32 new_swizzle = EncodeSwizzle(x_source, y_source, z_source, w_source);
+    if (current_swizzle == new_swizzle) {
+        return current_view;
+    }
+    current_swizzle = new_swizzle;
+
+    const auto [entry, is_cache_miss] = view_cache.try_emplace(new_swizzle);
+    OGLTextureView& view = entry->second;
+    if (!is_cache_miss) {
+        current_view = view.handle;
+        return view.handle;
+    }
+    view = CreateTextureView();
+    current_view = view.handle;
+
+    std::array swizzle{x_source, y_source, z_source, w_source};
+
+    switch (const PixelFormat format = GetSurfaceParams().pixel_format) {
     case PixelFormat::Z24S8:
     case PixelFormat::Z32FS8:
     case PixelFormat::S8Z24:
-        glTextureParameteri(handle, GL_DEPTH_STENCIL_TEXTURE_MODE,
+        UNIMPLEMENTED_IF(x_source != SwizzleSource::R && x_source != SwizzleSource::G);
+        glTextureParameteri(view.handle, GL_DEPTH_STENCIL_TEXTURE_MODE,
                             GetComponent(format, x_source == SwizzleSource::R));
-        break;
-    default:
-        glTextureParameteriv(handle, GL_TEXTURE_SWIZZLE_RGBA, gl_swizzle.data());
+
+        // Make sure we sample the first component
+        std::transform(swizzle.begin(), swizzle.end(), swizzle.begin(), [](SwizzleSource value) {
+            return value == SwizzleSource::G ? SwizzleSource::R : value;
+        });
+        [[fallthrough]];
+    default: {
+        const std::array gl_swizzle = {GetSwizzleSource(swizzle[0]), GetSwizzleSource(swizzle[1]),
+                                       GetSwizzleSource(swizzle[2]), GetSwizzleSource(swizzle[3])};
+        glTextureParameteriv(view.handle, GL_TEXTURE_SWIZZLE_RGBA, gl_swizzle.data());
         break;
     }
+    }
+    return view.handle;
 }
 
 OGLTextureView CachedSurfaceView::CreateTextureView() const {
     OGLTextureView texture_view;
     texture_view.Create();
 
-    glTextureView(texture_view.handle, target, surface.texture.handle, format, params.base_level,
-                  params.num_levels, params.base_layer, params.num_layers);
+    if (target == GL_TEXTURE_3D) {
+        glTextureView(texture_view.handle, target, surface.texture.handle, format,
+                      params.base_level, params.num_levels, 0, 1);
+    } else {
+        glTextureView(texture_view.handle, target, surface.texture.handle, format,
+                      params.base_level, params.num_levels, params.base_layer, params.num_layers);
+    }
     ApplyTextureDefaults(surface.GetSurfaceParams(), texture_view.handle);
 
     return texture_view;
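The reworked GetTexture(x, y, z, w) above keeps one OpenGL texture view per packed swizzle. try_emplace performs lookup and slot reservation in a single hash walk, so only a genuine miss pays for CreateTextureView(). The same caching shape in miniature, with the expensive construction stubbed out:

    #include <cstdint>
    #include <cstdio>
    #include <string>
    #include <unordered_map>

    std::string CreateExpensiveView(std::uint32_t key) {
        std::printf("building view for %u\n", static_cast<unsigned>(key)); // once per key
        return "view-" + std::to_string(key);
    }

    std::unordered_map<std::uint32_t, std::string> view_cache;

    const std::string& GetView(std::uint32_t key) {
        const auto [it, is_cache_miss] = view_cache.try_emplace(key);
        if (is_cache_miss) {
            it->second = CreateExpensiveView(key); // fill the default-constructed slot
        }
        return it->second;
    }

    int main() {
        GetView(7); // builds the view
        GetView(7); // cache hit, no build
    }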
@@ -518,8 +557,8 @@
                                   const Tegra::Engines::Fermi2D::Config& copy_config) {
     const auto& src_params{src_view->GetSurfaceParams()};
     const auto& dst_params{dst_view->GetSurfaceParams()};
-    UNIMPLEMENTED_IF(src_params.target == SurfaceTarget::Texture3D);
-    UNIMPLEMENTED_IF(dst_params.target == SurfaceTarget::Texture3D);
+    UNIMPLEMENTED_IF(src_params.depth != 1);
+    UNIMPLEMENTED_IF(dst_params.depth != 1);
 
     state_tracker.NotifyScissor0();
     state_tracker.NotifyFramebuffer();
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h
index 02d9981a1..bfc4ddf5d 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.h
+++ b/src/video_core/renderer_opengl/gl_texture_cache.h
@@ -80,10 +80,12 @@ public:
     explicit CachedSurfaceView(CachedSurface& surface, const ViewParams& params, bool is_proxy);
     ~CachedSurfaceView();
 
-    /// Attaches this texture view to the current bound GL_DRAW_FRAMEBUFFER
-    void Attach(GLenum attachment, GLenum target) const;
+    /// @brief Attaches this texture view to the currently bound fb_target framebuffer
+    /// @param attachment Attachment to bind textures to
+    /// @param fb_target Framebuffer target to attach to (e.g. DRAW_FRAMEBUFFER)
+    void Attach(GLenum attachment, GLenum fb_target) const;
 
-    void ApplySwizzle(Tegra::Texture::SwizzleSource x_source,
-                      Tegra::Texture::SwizzleSource y_source,
-                      Tegra::Texture::SwizzleSource z_source,
-                      Tegra::Texture::SwizzleSource w_source);
+    GLuint GetTexture(Tegra::Texture::SwizzleSource x_source,
+                      Tegra::Texture::SwizzleSource y_source,
+                      Tegra::Texture::SwizzleSource z_source,
+                      Tegra::Texture::SwizzleSource w_source);
@@ -98,7 +100,7 @@ public:
         if (is_proxy) {
             return surface.GetTexture();
         }
-        return texture_view.handle;
+        return main_view.handle;
     }
 
     GLenum GetFormat() const {
@@ -110,23 +112,19 @@ public:
     }
 
 private:
-    u32 EncodeSwizzle(Tegra::Texture::SwizzleSource x_source,
-                      Tegra::Texture::SwizzleSource y_source,
-                      Tegra::Texture::SwizzleSource z_source,
-                      Tegra::Texture::SwizzleSource w_source) const {
-        return (static_cast<u32>(x_source) << 24) | (static_cast<u32>(y_source) << 16) |
-               (static_cast<u32>(z_source) << 8) | static_cast<u32>(w_source);
-    }
-
     OGLTextureView CreateTextureView() const;
 
     CachedSurface& surface;
-    GLenum target{};
-    GLenum format{};
+    const GLenum format;
+    const GLenum target;
+    const bool is_proxy;
+
+    std::unordered_map<u32, OGLTextureView> view_cache;
+    OGLTextureView main_view;
 
-    OGLTextureView texture_view;
-    u32 swizzle{};
-    bool is_proxy{};
+    // Use an invalid default so it always fails the comparison test
+    u32 current_swizzle = 0xffffffff;
+    GLuint current_view = 0;
 };
 
 class TextureCacheOpenGL final : public TextureCacheBase {
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index b2a179746..6214fcbc3 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -316,7 +316,7 @@ public:
 RendererOpenGL::RendererOpenGL(Core::Frontend::EmuWindow& emu_window, Core::System& system,
                                Core::Frontend::GraphicsContext& context)
     : RendererBase{emu_window}, emu_window{emu_window}, system{system}, context{context},
-      has_debug_tool{HasDebugTool()} {}
+      program_manager{device}, has_debug_tool{HasDebugTool()} {}
 
 RendererOpenGL::~RendererOpenGL() = default;
 
@@ -468,8 +468,9 @@ void RendererOpenGL::InitOpenGLObjects() {
     vertex_program.Create(true, false, vertex_shader.handle);
     fragment_program.Create(true, false, fragment_shader.handle);
 
-    // Create program pipeline
-    program_manager.Create();
+    pipeline.Create();
+    glUseProgramStages(pipeline.handle, GL_VERTEX_SHADER_BIT, vertex_program.handle);
+    glUseProgramStages(pipeline.handle, GL_FRAGMENT_SHADER_BIT, fragment_program.handle);
 
     // Generate VBO handle for drawing
     vertex_buffer.Create();
@@ -508,7 +509,7 @@ void RendererOpenGL::CreateRasterizer() {
     if (rasterizer) {
         return;
     }
-    rasterizer = std::make_unique<RasterizerOpenGL>(system, emu_window, screen_info,
+    rasterizer = std::make_unique<RasterizerOpenGL>(system, emu_window, device, screen_info,
                                                     program_manager, state_tracker);
 }
 
@@ -620,10 +621,7 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {
     state_tracker.NotifyClipControl();
     state_tracker.NotifyAlphaTest();
 
-    program_manager.UseVertexShader(vertex_program.handle);
-    program_manager.UseGeometryShader(0);
-    program_manager.UseFragmentShader(fragment_program.handle);
-    program_manager.BindGraphicsPipeline();
+    program_manager.BindHostPipeline(pipeline.handle);
 
     glEnable(GL_CULL_FACE);
     if (screen_info.display_srgb) {
@@ -665,6 +663,8 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {
 
     glClear(GL_COLOR_BUFFER_BIT);
     glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
+
+    program_manager.RestoreGuestPipeline();
 }
 
 bool RendererOpenGL::TryPresent(int timeout_ms) {
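DrawScreen now brackets its presentation draw between BindHostPipeline and RestoreGuestPipeline so host-only bindings never leak into the tracked guest state. The same contract can be expressed as an RAII guard, sketched here under that assumption; ProgramManagerStub stands in for the real class:

    #include <cstdio>

    struct ProgramManagerStub {
        void BindHostPipeline(unsigned pipeline) {
            std::printf("host pipeline %u bound\n", pipeline);
        }
        void RestoreGuestPipeline() {
            std::puts("guest pipeline restored");
        }
    };

    // RAII wrapper: the restore can no longer be forgotten on early returns.
    class HostPipelineScope {
    public:
        HostPipelineScope(ProgramManagerStub& pm, unsigned pipeline) : pm{pm} {
            pm.BindHostPipeline(pipeline);
        }
        ~HostPipelineScope() {
            pm.RestoreGuestPipeline();
        }

    private:
        ProgramManagerStub& pm;
    };

    int main() {
        ProgramManagerStub pm;
        HostPipelineScope scope{pm, 42};
        std::puts("draw presentation quad");
    } // restore runs here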
@@ -751,8 +751,9 @@ void RendererOpenGL::RenderScreenshot() {
 }
 
 bool RendererOpenGL::Init() {
-    if (GLAD_GL_KHR_debug) {
+    if (Settings::values.renderer_debug && GLAD_GL_KHR_debug) {
         glEnable(GL_DEBUG_OUTPUT);
+        glEnable(GL_DEBUG_OUTPUT_SYNCHRONOUS);
         glDebugMessageCallback(DebugHandler, nullptr);
     }
 
diff --git a/src/video_core/renderer_opengl/renderer_opengl.h b/src/video_core/renderer_opengl/renderer_opengl.h
index 50b647661..61bf507f4 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.h
+++ b/src/video_core/renderer_opengl/renderer_opengl.h
@@ -9,6 +9,7 @@
 #include "common/common_types.h"
 #include "common/math_util.h"
 #include "video_core/renderer_base.h"
+#include "video_core/renderer_opengl/gl_device.h"
 #include "video_core/renderer_opengl/gl_resource_manager.h"
 #include "video_core/renderer_opengl/gl_shader_manager.h"
 #include "video_core/renderer_opengl/gl_state_tracker.h"
@@ -95,6 +96,7 @@ private:
     Core::Frontend::EmuWindow& emu_window;
     Core::System& system;
     Core::Frontend::GraphicsContext& context;
+    const Device device;
 
     StateTracker state_tracker{system};
 
@@ -102,13 +104,14 @@ private:
     OGLBuffer vertex_buffer;
     OGLProgram vertex_program;
     OGLProgram fragment_program;
+    OGLPipeline pipeline;
     OGLFramebuffer screenshot_framebuffer;
 
     /// Display information for Switch screen
     ScreenInfo screen_info;
 
     /// Global dummy shader pipeline
-    GLShader::ProgramManager program_manager;
+    ProgramManager program_manager;
 
     /// OpenGL framebuffer data
     std::vector<u8> gl_framebuffer_data;
diff --git a/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp b/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp
index 568744e3c..424278816 100644
--- a/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp
+++ b/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp
@@ -71,8 +71,7 @@ void FixedPipelineState::Rasterizer::Fill(const Maxwell& regs) noexcept {
     const u32 topology_index = static_cast<u32>(regs.draw.topology.Value());
 
     u32 packed_front_face = PackFrontFace(regs.front_face);
-    if (regs.screen_y_control.triangle_rast_flip != 0 &&
-        regs.viewport_transform[0].scale_y > 0.0f) {
+    if (regs.screen_y_control.triangle_rast_flip != 0) {
         // Flip front face
         packed_front_face = 1 - packed_front_face;
     }
diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
index 12be691a5..62e950d31 100644
--- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
+++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
@@ -142,14 +142,14 @@ struct FormatTuple {
     {VK_FORMAT_BC6H_UFLOAT_BLOCK},                         // BC6H_UF16
     {VK_FORMAT_BC6H_SFLOAT_BLOCK},                         // BC6H_SF16
     {VK_FORMAT_ASTC_4x4_UNORM_BLOCK},                      // ASTC_2D_4X4
-    {VK_FORMAT_B8G8R8A8_UNORM},                            // BGRA8
+    {VK_FORMAT_B8G8R8A8_UNORM, Attachable},                // BGRA8
     {VK_FORMAT_R32G32B32A32_SFLOAT, Attachable | Storage}, // RGBA32F
     {VK_FORMAT_R32G32_SFLOAT, Attachable | Storage},       // RG32F
     {VK_FORMAT_R32_SFLOAT, Attachable | Storage},          // R32F
     {VK_FORMAT_R16_SFLOAT, Attachable | Storage},          // R16F
     {VK_FORMAT_R16_UNORM, Attachable | Storage},           // R16U
     {VK_FORMAT_UNDEFINED},                                 // R16S
-    {VK_FORMAT_UNDEFINED},                                 // R16UI
+    {VK_FORMAT_R16_UINT, Attachable | Storage},            // R16UI
     {VK_FORMAT_UNDEFINED},                                 // R16I
     {VK_FORMAT_R16G16_UNORM, Attachable | Storage},        // RG16
     {VK_FORMAT_R16G16_SFLOAT, Attachable | Storage},       // RG16F
@@ -168,7 +168,7 @@ struct FormatTuple {
     {VK_FORMAT_ASTC_8x8_UNORM_BLOCK},      // ASTC_2D_8X8
     {VK_FORMAT_UNDEFINED},                 // ASTC_2D_8X5
     {VK_FORMAT_UNDEFINED},                 // ASTC_2D_5X4
-    {VK_FORMAT_UNDEFINED},                 // BGRA8_SRGB
+    {VK_FORMAT_B8G8R8A8_SRGB, Attachable}, // BGRA8_SRGB
     {VK_FORMAT_BC1_RGBA_SRGB_BLOCK},       // DXT1_SRGB
     {VK_FORMAT_BC2_SRGB_BLOCK},            // DXT23_SRGB
     {VK_FORMAT_BC3_SRGB_BLOCK},            // DXT45_SRGB
diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
index 5b494da8c..1fde38328 100644
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
@@ -7,6 +7,7 @@
 #include <memory>
 
 #include "core/core.h"
+#include "video_core/buffer_cache/buffer_cache.h"
 #include "video_core/renderer_vulkan/vk_buffer_cache.h"
 #include "video_core/renderer_vulkan/vk_device.h"
 #include "video_core/renderer_vulkan/vk_scheduler.h"
@@ -36,8 +37,8 @@ std::unique_ptr<VKStreamBuffer> CreateStreamBuffer(const VKDevice& device, VKSch
 
 } // Anonymous namespace
 
-CachedBufferBlock::CachedBufferBlock(const VKDevice& device, VKMemoryManager& memory_manager,
-                                     VAddr cpu_addr, std::size_t size)
+Buffer::Buffer(const VKDevice& device, VKMemoryManager& memory_manager, VAddr cpu_addr,
+               std::size_t size)
     : VideoCommon::BufferBlock{cpu_addr, size} {
     VkBufferCreateInfo ci;
     ci.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
@@ -53,7 +54,7 @@ CachedBufferBlock::CachedBufferBlock(const VKDevice& device, VKMemoryManager& me
     buffer.commit = memory_manager.Commit(buffer.handle, false);
 }
 
-CachedBufferBlock::~CachedBufferBlock() = default;
+Buffer::~Buffer() = default;
 
 VKBufferCache::VKBufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system,
                              const VKDevice& device, VKMemoryManager& memory_manager,
@@ -66,12 +67,8 @@ VKBufferCache::VKBufferCache(VideoCore::RasterizerInterface& rasterizer, Core::S
 
 VKBufferCache::~VKBufferCache() = default;
 
-Buffer VKBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) {
-    return std::make_shared<CachedBufferBlock>(device, memory_manager, cpu_addr, size);
-}
-
-VkBuffer VKBufferCache::ToHandle(const Buffer& buffer) {
-    return buffer->GetHandle();
+std::shared_ptr<Buffer> VKBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) {
+    return std::make_shared<Buffer>(device, memory_manager, cpu_addr, size);
 }
 
 VkBuffer VKBufferCache::GetEmptyBuffer(std::size_t size) {
@@ -90,7 +87,7 @@ void VKBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, st
     std::memcpy(staging.commit->Map(size), data, size);
 
     scheduler.RequestOutsideRenderPassOperationContext();
-    scheduler.Record([staging = *staging.handle, buffer = buffer->GetHandle(), offset,
+    scheduler.Record([staging = *staging.handle, buffer = buffer.Handle(), offset,
                       size](vk::CommandBuffer cmdbuf) {
         cmdbuf.CopyBuffer(staging, buffer, VkBufferCopy{0, offset, size});
 
@@ -113,7 +110,7 @@ void VKBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset,
                                       u8* data) {
     const auto& staging = staging_pool.GetUnusedBuffer(size, true);
     scheduler.RequestOutsideRenderPassOperationContext();
-    scheduler.Record([staging = *staging.handle, buffer = buffer->GetHandle(), offset,
+    scheduler.Record([staging = *staging.handle, buffer = buffer.Handle(), offset,
                       size](vk::CommandBuffer cmdbuf) {
         VkBufferMemoryBarrier barrier;
         barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
@@ -140,8 +137,8 @@ void VKBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset,
 void VKBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
                               std::size_t dst_offset, std::size_t size) {
     scheduler.RequestOutsideRenderPassOperationContext();
-    scheduler.Record([src_buffer = src->GetHandle(), dst_buffer = dst->GetHandle(), src_offset,
-                      dst_offset, size](vk::CommandBuffer cmdbuf) {
+    scheduler.Record([src_buffer = src.Handle(), dst_buffer = dst.Handle(), src_offset, dst_offset,
+                      size](vk::CommandBuffer cmdbuf) {
         cmdbuf.CopyBuffer(src_buffer, dst_buffer, VkBufferCopy{src_offset, dst_offset, size});
 
         std::array<VkBufferMemoryBarrier, 2> barriers;
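Replacing the CachedBufferBlock class and its shared_ptr alias with a concrete Buffer class changes call sites from buffer->GetHandle() to buffer.Handle(): ownership still lives in a shared_ptr<Buffer> at creation time, but users now receive the block by reference and pay no extra indirection per access. A compressed sketch of the shape, with stand-in types:

    #include <memory>

    struct VkBufferT;
    using VkBuffer = VkBufferT*; // stand-in for the Vulkan handle type

    // The block itself is now the Buffer...
    class Buffer {
    public:
        VkBuffer Handle() const {
            return handle;
        }

    private:
        VkBuffer handle = nullptr;
    };

    // ...and shared ownership is explicit at the creation boundary.
    std::shared_ptr<Buffer> CreateBlock() {
        return std::make_shared<Buffer>();
    }

    int main() {
        const std::shared_ptr<Buffer> block = CreateBlock();
        const Buffer& buffer = *block;  // callers receive the object by reference
        VkBuffer raw = buffer.Handle(); // no shared_ptr dereference at use sites
        (void)raw;
    }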
diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.h b/src/video_core/renderer_vulkan/vk_buffer_cache.h
index a54583e7d..9ebbef835 100644
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.h
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h
@@ -8,7 +8,6 @@
 
 #include "common/common_types.h"
 #include "video_core/buffer_cache/buffer_cache.h"
-#include "video_core/rasterizer_cache.h"
 #include "video_core/renderer_vulkan/vk_memory_manager.h"
 #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h"
 #include "video_core/renderer_vulkan/vk_stream_buffer.h"
@@ -24,13 +23,13 @@ class VKDevice;
 class VKMemoryManager;
 class VKScheduler;
 
-class CachedBufferBlock final : public VideoCommon::BufferBlock {
+class Buffer final : public VideoCommon::BufferBlock {
 public:
-    explicit CachedBufferBlock(const VKDevice& device, VKMemoryManager& memory_manager,
-                               VAddr cpu_addr, std::size_t size);
-    ~CachedBufferBlock();
+    explicit Buffer(const VKDevice& device, VKMemoryManager& memory_manager, VAddr cpu_addr,
+                    std::size_t size);
+    ~Buffer();
 
-    VkBuffer GetHandle() const {
+    VkBuffer Handle() const {
         return *buffer.handle;
     }
 
@@ -38,8 +37,6 @@ private:
     VKBuffer buffer;
 };
 
-using Buffer = std::shared_ptr<CachedBufferBlock>;
-
 class VKBufferCache final : public VideoCommon::BufferCache<Buffer, VkBuffer, VKStreamBuffer> {
 public:
     explicit VKBufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system,
@@ -50,9 +47,7 @@ public:
     VkBuffer GetEmptyBuffer(std::size_t size) override;
 
 protected:
-    VkBuffer ToHandle(const Buffer& buffer) override;
-
-    Buffer CreateBlock(VAddr cpu_addr, std::size_t size) override;
+    std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) override;
 
     void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
                          const u8* data) override;
diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp
index 8e1b46277..281bf9ac3 100644
--- a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp
+++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp
@@ -53,8 +53,9 @@ vk::DescriptorSetLayout VKComputePipeline::CreateDescriptorSetLayout() const {
     };
     add_bindings(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, entries.const_buffers.size());
     add_bindings(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, entries.global_buffers.size());
-    add_bindings(VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, entries.texel_buffers.size());
+    add_bindings(VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, entries.uniform_texels.size());
     add_bindings(VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, entries.samplers.size());
+    add_bindings(VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER, entries.storage_texels.size());
     add_bindings(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, entries.images.size());
 
     VkDescriptorSetLayoutCreateInfo ci;
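The add_bindings helper appends one layout binding per declared resource, walking a single binding counter across all descriptor types so the set layout stays in lockstep with the shader's binding decorations. A self-contained approximation of the pattern (counts and the compute-only stage flag are illustrative):

    #include <cstdint>
    #include <vector>
    #include <vulkan/vulkan.h>

    std::vector<VkDescriptorSetLayoutBinding> MakeComputeBindings(
        std::uint32_t num_uniform_texels, std::uint32_t num_storage_texels) {
        std::vector<VkDescriptorSetLayoutBinding> bindings;
        std::uint32_t binding = 0;
        const auto add_bindings = [&](VkDescriptorType type, std::uint32_t count) {
            // One binding slot per resource, at consecutive indices.
            for (std::uint32_t i = 0; i < count; ++i) {
                VkDescriptorSetLayoutBinding entry{};
                entry.binding = binding++;
                entry.descriptorType = type;
                entry.descriptorCount = 1;
                entry.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
                bindings.push_back(entry);
            }
        };
        add_bindings(VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, num_uniform_texels);
        add_bindings(VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER, num_storage_texels);
        return bindings;
    }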
diff --git a/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp b/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp
index 890fd52cf..9259b618d 100644
--- a/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp
+++ b/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp
@@ -42,6 +42,7 @@ vk::DescriptorPool* VKDescriptorPool::AllocateNewPool() {
         {VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, num_sets * 60},
         {VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, num_sets * 64},
         {VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, num_sets * 64},
+        {VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER, num_sets * 64},
         {VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, num_sets * 40}};
 
     VkDescriptorPoolCreateInfo ci;
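Reserving pool capacity for the new storage texel descriptors follows the usual Vulkan recipe. Something like the following, where num_sets and the per-type multiplier of 64 mirror the values above but are otherwise arbitrary:

    #include <array>
    #include <cstdint>
    #include <vulkan/vulkan.h>

    VkDescriptorPool CreatePool(VkDevice device, std::uint32_t num_sets) {
        const std::array<VkDescriptorPoolSize, 2> sizes{{
            {VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, num_sets * 64},
            {VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER, num_sets * 64},
        }};
        VkDescriptorPoolCreateInfo ci{};
        ci.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO;
        ci.maxSets = num_sets;
        ci.poolSizeCount = static_cast<std::uint32_t>(sizes.size());
        ci.pPoolSizes = sizes.data();
        VkDescriptorPool pool = VK_NULL_HANDLE;
        vkCreateDescriptorPool(device, &ci, nullptr, &pool); // error handling omitted
        return pool;
    }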
diff --git a/src/video_core/renderer_vulkan/vk_device.cpp b/src/video_core/renderer_vulkan/vk_device.cpp
index f0c491d00..9fd8ac3f6 100644
--- a/src/video_core/renderer_vulkan/vk_device.cpp
+++ b/src/video_core/renderer_vulkan/vk_device.cpp
@@ -73,75 +73,79 @@ VkFormatFeatureFlags GetFormatFeatures(VkFormatProperties properties, FormatType
 
 std::unordered_map<VkFormat, VkFormatProperties> GetFormatProperties(
     vk::PhysicalDevice physical, const vk::InstanceDispatch& dld) {
-    static constexpr std::array formats{VK_FORMAT_A8B8G8R8_UNORM_PACK32,
-                                        VK_FORMAT_A8B8G8R8_UINT_PACK32,
-                                        VK_FORMAT_A8B8G8R8_SNORM_PACK32,
-                                        VK_FORMAT_A8B8G8R8_SRGB_PACK32,
-                                        VK_FORMAT_B5G6R5_UNORM_PACK16,
-                                        VK_FORMAT_A2B10G10R10_UNORM_PACK32,
-                                        VK_FORMAT_A1R5G5B5_UNORM_PACK16,
-                                        VK_FORMAT_R32G32B32A32_SFLOAT,
-                                        VK_FORMAT_R32G32B32A32_UINT,
-                                        VK_FORMAT_R32G32_SFLOAT,
-                                        VK_FORMAT_R32G32_UINT,
-                                        VK_FORMAT_R16G16B16A16_UINT,
-                                        VK_FORMAT_R16G16B16A16_SNORM,
-                                        VK_FORMAT_R16G16B16A16_UNORM,
-                                        VK_FORMAT_R16G16_UNORM,
-                                        VK_FORMAT_R16G16_SNORM,
-                                        VK_FORMAT_R16G16_SFLOAT,
-                                        VK_FORMAT_R16_UNORM,
-                                        VK_FORMAT_R8G8B8A8_SRGB,
-                                        VK_FORMAT_R8G8_UNORM,
-                                        VK_FORMAT_R8G8_SNORM,
-                                        VK_FORMAT_R8G8_UINT,
-                                        VK_FORMAT_R8_UNORM,
-                                        VK_FORMAT_R8_UINT,
-                                        VK_FORMAT_B10G11R11_UFLOAT_PACK32,
-                                        VK_FORMAT_R32_SFLOAT,
-                                        VK_FORMAT_R32_UINT,
-                                        VK_FORMAT_R32_SINT,
-                                        VK_FORMAT_R16_SFLOAT,
-                                        VK_FORMAT_R16G16B16A16_SFLOAT,
-                                        VK_FORMAT_B8G8R8A8_UNORM,
-                                        VK_FORMAT_R4G4B4A4_UNORM_PACK16,
-                                        VK_FORMAT_D32_SFLOAT,
-                                        VK_FORMAT_D16_UNORM,
-                                        VK_FORMAT_D16_UNORM_S8_UINT,
-                                        VK_FORMAT_D24_UNORM_S8_UINT,
-                                        VK_FORMAT_D32_SFLOAT_S8_UINT,
-                                        VK_FORMAT_BC1_RGBA_UNORM_BLOCK,
-                                        VK_FORMAT_BC2_UNORM_BLOCK,
-                                        VK_FORMAT_BC3_UNORM_BLOCK,
-                                        VK_FORMAT_BC4_UNORM_BLOCK,
-                                        VK_FORMAT_BC5_UNORM_BLOCK,
-                                        VK_FORMAT_BC5_SNORM_BLOCK,
-                                        VK_FORMAT_BC7_UNORM_BLOCK,
-                                        VK_FORMAT_BC6H_UFLOAT_BLOCK,
-                                        VK_FORMAT_BC6H_SFLOAT_BLOCK,
-                                        VK_FORMAT_BC1_RGBA_SRGB_BLOCK,
-                                        VK_FORMAT_BC2_SRGB_BLOCK,
-                                        VK_FORMAT_BC3_SRGB_BLOCK,
-                                        VK_FORMAT_BC7_SRGB_BLOCK,
-                                        VK_FORMAT_ASTC_4x4_SRGB_BLOCK,
-                                        VK_FORMAT_ASTC_8x8_SRGB_BLOCK,
-                                        VK_FORMAT_ASTC_8x5_SRGB_BLOCK,
-                                        VK_FORMAT_ASTC_5x4_SRGB_BLOCK,
-                                        VK_FORMAT_ASTC_5x5_UNORM_BLOCK,
-                                        VK_FORMAT_ASTC_5x5_SRGB_BLOCK,
-                                        VK_FORMAT_ASTC_10x8_UNORM_BLOCK,
-                                        VK_FORMAT_ASTC_10x8_SRGB_BLOCK,
-                                        VK_FORMAT_ASTC_6x6_UNORM_BLOCK,
-                                        VK_FORMAT_ASTC_6x6_SRGB_BLOCK,
-                                        VK_FORMAT_ASTC_10x10_UNORM_BLOCK,
-                                        VK_FORMAT_ASTC_10x10_SRGB_BLOCK,
-                                        VK_FORMAT_ASTC_12x12_UNORM_BLOCK,
-                                        VK_FORMAT_ASTC_12x12_SRGB_BLOCK,
-                                        VK_FORMAT_ASTC_8x6_UNORM_BLOCK,
-                                        VK_FORMAT_ASTC_8x6_SRGB_BLOCK,
-                                        VK_FORMAT_ASTC_6x5_UNORM_BLOCK,
-                                        VK_FORMAT_ASTC_6x5_SRGB_BLOCK,
-                                        VK_FORMAT_E5B9G9R9_UFLOAT_PACK32};
+    static constexpr std::array formats{
+        VK_FORMAT_A8B8G8R8_UNORM_PACK32,
+        VK_FORMAT_A8B8G8R8_UINT_PACK32,
+        VK_FORMAT_A8B8G8R8_SNORM_PACK32,
+        VK_FORMAT_A8B8G8R8_SRGB_PACK32,
+        VK_FORMAT_B5G6R5_UNORM_PACK16,
+        VK_FORMAT_A2B10G10R10_UNORM_PACK32,
+        VK_FORMAT_A1R5G5B5_UNORM_PACK16,
+        VK_FORMAT_R32G32B32A32_SFLOAT,
+        VK_FORMAT_R32G32B32A32_UINT,
+        VK_FORMAT_R32G32_SFLOAT,
+        VK_FORMAT_R32G32_UINT,
+        VK_FORMAT_R16G16B16A16_UINT,
+        VK_FORMAT_R16G16B16A16_SNORM,
+        VK_FORMAT_R16G16B16A16_UNORM,
+        VK_FORMAT_R16G16_UNORM,
+        VK_FORMAT_R16G16_SNORM,
+        VK_FORMAT_R16G16_SFLOAT,
+        VK_FORMAT_R16_UNORM,
+        VK_FORMAT_R16_UINT,
+        VK_FORMAT_R8G8B8A8_SRGB,
+        VK_FORMAT_R8G8_UNORM,
+        VK_FORMAT_R8G8_SNORM,
+        VK_FORMAT_R8G8_UINT,
+        VK_FORMAT_R8_UNORM,
+        VK_FORMAT_R8_UINT,
+        VK_FORMAT_B10G11R11_UFLOAT_PACK32,
+        VK_FORMAT_R32_SFLOAT,
+        VK_FORMAT_R32_UINT,
+        VK_FORMAT_R32_SINT,
+        VK_FORMAT_R16_SFLOAT,
+        VK_FORMAT_R16G16B16A16_SFLOAT,
+        VK_FORMAT_B8G8R8A8_UNORM,
+        VK_FORMAT_B8G8R8A8_SRGB,
+        VK_FORMAT_R4G4B4A4_UNORM_PACK16,
+        VK_FORMAT_D32_SFLOAT,
+        VK_FORMAT_D16_UNORM,
+        VK_FORMAT_D16_UNORM_S8_UINT,
+        VK_FORMAT_D24_UNORM_S8_UINT,
+        VK_FORMAT_D32_SFLOAT_S8_UINT,
+        VK_FORMAT_BC1_RGBA_UNORM_BLOCK,
+        VK_FORMAT_BC2_UNORM_BLOCK,
+        VK_FORMAT_BC3_UNORM_BLOCK,
+        VK_FORMAT_BC4_UNORM_BLOCK,
+        VK_FORMAT_BC5_UNORM_BLOCK,
+        VK_FORMAT_BC5_SNORM_BLOCK,
+        VK_FORMAT_BC7_UNORM_BLOCK,
+        VK_FORMAT_BC6H_UFLOAT_BLOCK,
+        VK_FORMAT_BC6H_SFLOAT_BLOCK,
+        VK_FORMAT_BC1_RGBA_SRGB_BLOCK,
+        VK_FORMAT_BC2_SRGB_BLOCK,
+        VK_FORMAT_BC3_SRGB_BLOCK,
+        VK_FORMAT_BC7_SRGB_BLOCK,
+        VK_FORMAT_ASTC_4x4_SRGB_BLOCK,
+        VK_FORMAT_ASTC_8x8_SRGB_BLOCK,
+        VK_FORMAT_ASTC_8x5_SRGB_BLOCK,
+        VK_FORMAT_ASTC_5x4_SRGB_BLOCK,
+        VK_FORMAT_ASTC_5x5_UNORM_BLOCK,
+        VK_FORMAT_ASTC_5x5_SRGB_BLOCK,
+        VK_FORMAT_ASTC_10x8_UNORM_BLOCK,
+        VK_FORMAT_ASTC_10x8_SRGB_BLOCK,
+        VK_FORMAT_ASTC_6x6_UNORM_BLOCK,
+        VK_FORMAT_ASTC_6x6_SRGB_BLOCK,
+        VK_FORMAT_ASTC_10x10_UNORM_BLOCK,
+        VK_FORMAT_ASTC_10x10_SRGB_BLOCK,
+        VK_FORMAT_ASTC_12x12_UNORM_BLOCK,
+        VK_FORMAT_ASTC_12x12_SRGB_BLOCK,
+        VK_FORMAT_ASTC_8x6_UNORM_BLOCK,
+        VK_FORMAT_ASTC_8x6_SRGB_BLOCK,
+        VK_FORMAT_ASTC_6x5_UNORM_BLOCK,
+        VK_FORMAT_ASTC_6x5_SRGB_BLOCK,
+        VK_FORMAT_E5B9G9R9_UFLOAT_PACK32,
+    };
     std::unordered_map<VkFormat, VkFormatProperties> format_properties;
     for (const auto format : formats) {
         format_properties.emplace(format, physical.GetFormatProperties(format));
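GetFormatProperties builds a one-time map from format to VkFormatProperties so later support checks are hash lookups instead of repeated driver queries. A trimmed-down sketch of the same pattern against the raw Vulkan C API (the real table covers every format listed above):

    #include <unordered_map>
    #include <vulkan/vulkan.h>

    using FormatPropertiesMap = std::unordered_map<VkFormat, VkFormatProperties>;

    FormatPropertiesMap QueryFormatProperties(VkPhysicalDevice physical) {
        // Two representative formats; the real table is much longer.
        constexpr VkFormat formats[] = {
            VK_FORMAT_R16_UINT,
            VK_FORMAT_B8G8R8A8_SRGB,
        };
        FormatPropertiesMap properties;
        for (const VkFormat format : formats) {
            VkFormatProperties props{};
            vkGetPhysicalDeviceFormatProperties(physical, format, &props);
            properties.emplace(format, props);
        }
        return properties;
    }

    bool SupportsSampledImage(const FormatPropertiesMap& map, VkFormat format) {
        const auto it = map.find(format);
        return it != map.end() &&
               (it->second.optimalTilingFeatures & VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT) != 0;
    }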
diff --git a/src/video_core/renderer_vulkan/vk_fence_manager.h b/src/video_core/renderer_vulkan/vk_fence_manager.h
index 04d07fe6a..043fe7947 100644
--- a/src/video_core/renderer_vulkan/vk_fence_manager.h
+++ b/src/video_core/renderer_vulkan/vk_fence_manager.h
@@ -7,6 +7,7 @@
 #include <memory>
 
 #include "video_core/fence_manager.h"
+#include "video_core/renderer_vulkan/vk_buffer_cache.h"
 #include "video_core/renderer_vulkan/wrapper.h"
 
 namespace Core {
diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
index fe45ed269..ea66e621e 100644
--- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
@@ -27,6 +27,7 @@
 #include "video_core/renderer_vulkan/wrapper.h"
 #include "video_core/shader/compiler_settings.h"
 #include "video_core/shader/memory_util.h"
+#include "video_core/shader_cache.h"
 
 namespace Vulkan {
 
@@ -45,6 +46,7 @@ constexpr VkDescriptorType UNIFORM_BUFFER = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER;
 constexpr VkDescriptorType STORAGE_BUFFER = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
 constexpr VkDescriptorType UNIFORM_TEXEL_BUFFER = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER;
 constexpr VkDescriptorType COMBINED_IMAGE_SAMPLER = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
+constexpr VkDescriptorType STORAGE_TEXEL_BUFFER = VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER;
 constexpr VkDescriptorType STORAGE_IMAGE = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE;
 
 constexpr VideoCommon::Shader::CompilerSettings compiler_settings{
@@ -104,8 +106,9 @@ u32 FillDescriptorLayout(const ShaderEntries& entries,
     u32 binding = base_binding;
     AddBindings<UNIFORM_BUFFER>(bindings, binding, flags, entries.const_buffers);
     AddBindings<STORAGE_BUFFER>(bindings, binding, flags, entries.global_buffers);
-    AddBindings<UNIFORM_TEXEL_BUFFER>(bindings, binding, flags, entries.texel_buffers);
+    AddBindings<UNIFORM_TEXEL_BUFFER>(bindings, binding, flags, entries.uniform_texels);
     AddBindings<COMBINED_IMAGE_SAMPLER>(bindings, binding, flags, entries.samplers);
+    AddBindings<STORAGE_TEXEL_BUFFER>(bindings, binding, flags, entries.storage_texels);
     AddBindings<STORAGE_IMAGE>(bindings, binding, flags, entries.images);
     return binding;
 }
@@ -130,19 +133,18 @@ bool ComputePipelineCacheKey::operator==(const ComputePipelineCacheKey& rhs) con
     return std::memcmp(&rhs, this, sizeof *this) == 0;
 }
 
-CachedShader::CachedShader(Core::System& system, Tegra::Engines::ShaderType stage,
-                           GPUVAddr gpu_addr, VAddr cpu_addr, ProgramCode program_code,
-                           u32 main_offset)
-    : RasterizerCacheObject{cpu_addr}, gpu_addr{gpu_addr}, program_code{std::move(program_code)},
+Shader::Shader(Core::System& system, Tegra::Engines::ShaderType stage, GPUVAddr gpu_addr,
+               VideoCommon::Shader::ProgramCode program_code, u32 main_offset)
+    : gpu_addr{gpu_addr}, program_code{std::move(program_code)},
       registry{stage, GetEngine(system, stage)}, shader_ir{this->program_code, main_offset,
                                                            compiler_settings, registry},
       entries{GenerateShaderEntries(shader_ir)} {}
 
-CachedShader::~CachedShader() = default;
+Shader::~Shader() = default;
 
-Tegra::Engines::ConstBufferEngineInterface& CachedShader::GetEngine(
-    Core::System& system, Tegra::Engines::ShaderType stage) {
-    if (stage == Tegra::Engines::ShaderType::Compute) {
+Tegra::Engines::ConstBufferEngineInterface& Shader::GetEngine(Core::System& system,
+                                                              Tegra::Engines::ShaderType stage) {
+    if (stage == ShaderType::Compute) {
         return system.GPU().KeplerCompute();
     } else {
         return system.GPU().Maxwell3D();
@@ -154,16 +156,16 @@ VKPipelineCache::VKPipelineCache(Core::System& system, RasterizerVulkan& rasteri
                                  VKDescriptorPool& descriptor_pool,
                                  VKUpdateDescriptorQueue& update_descriptor_queue,
                                  VKRenderPassCache& renderpass_cache)
-    : RasterizerCache{rasterizer}, system{system}, device{device}, scheduler{scheduler},
-      descriptor_pool{descriptor_pool}, update_descriptor_queue{update_descriptor_queue},
-      renderpass_cache{renderpass_cache} {}
+    : VideoCommon::ShaderCache<Shader>{rasterizer}, system{system}, device{device},
+      scheduler{scheduler}, descriptor_pool{descriptor_pool},
+      update_descriptor_queue{update_descriptor_queue}, renderpass_cache{renderpass_cache} {}
 
 VKPipelineCache::~VKPipelineCache() = default;
 
-std::array<Shader, Maxwell::MaxShaderProgram> VKPipelineCache::GetShaders() {
+std::array<Shader*, Maxwell::MaxShaderProgram> VKPipelineCache::GetShaders() {
     const auto& gpu = system.GPU().Maxwell3D();
 
-    std::array<Shader, Maxwell::MaxShaderProgram> shaders;
+    std::array<Shader*, Maxwell::MaxShaderProgram> shaders{};
     for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) {
         const auto program{static_cast<Maxwell::ShaderProgram>(index)};
 
@@ -176,24 +178,28 @@ std::array<Shader, Maxwell::MaxShaderProgram> VKPipelineCache::GetShaders() {
         const GPUVAddr program_addr{GetShaderAddress(system, program)};
         const std::optional cpu_addr = memory_manager.GpuToCpuAddress(program_addr);
         ASSERT(cpu_addr);
-        auto shader = cpu_addr ? TryGet(*cpu_addr) : null_shader;
-        if (!shader) {
+
+        Shader* result = cpu_addr ? TryGet(*cpu_addr) : null_shader.get();
+        if (!result) {
             const auto host_ptr{memory_manager.GetPointer(program_addr)};
 
             // No shader found - create a new one
             constexpr u32 stage_offset = STAGE_MAIN_OFFSET;
-            const auto stage = static_cast<Tegra::Engines::ShaderType>(index == 0 ? 0 : index - 1);
+            const auto stage = static_cast<ShaderType>(index == 0 ? 0 : index - 1);
             ProgramCode code = GetShaderCode(memory_manager, program_addr, host_ptr, false);
+            const std::size_t size_in_bytes = code.size() * sizeof(u64);
+
+            auto shader = std::make_unique<Shader>(system, stage, program_addr, std::move(code),
+                                                   stage_offset);
+            result = shader.get();
 
-            shader = std::make_shared<CachedShader>(system, stage, program_addr, *cpu_addr,
-                                                    std::move(code), stage_offset);
             if (cpu_addr) {
-                Register(shader);
+                Register(std::move(shader), *cpu_addr, size_in_bytes);
             } else {
-                null_shader = shader;
+                null_shader = std::move(shader);
             }
         }
-        shaders[index] = std::move(shader);
+        shaders[index] = result;
     }
     return last_shaders = shaders;
 }
@@ -234,19 +240,22 @@ VKComputePipeline& VKPipelineCache::GetComputePipeline(const ComputePipelineCach
     const auto cpu_addr = memory_manager.GpuToCpuAddress(program_addr);
     ASSERT(cpu_addr);
 
-    auto shader = cpu_addr ? TryGet(*cpu_addr) : null_kernel;
+    Shader* shader = cpu_addr ? TryGet(*cpu_addr) : null_kernel.get();
     if (!shader) {
         // No shader found - create a new one
         const auto host_ptr = memory_manager.GetPointer(program_addr);
 
         ProgramCode code = GetShaderCode(memory_manager, program_addr, host_ptr, true);
-        shader = std::make_shared<CachedShader>(system, Tegra::Engines::ShaderType::Compute,
-                                                program_addr, *cpu_addr, std::move(code),
-                                                KERNEL_MAIN_OFFSET);
+        const std::size_t size_in_bytes = code.size() * sizeof(u64);
+
+        auto shader_info = std::make_unique<Shader>(system, ShaderType::Compute, program_addr,
+                                                    std::move(code), KERNEL_MAIN_OFFSET);
+        shader = shader_info.get();
+
         if (cpu_addr) {
-            Register(shader);
+            Register(std::move(shader_info), *cpu_addr, size_in_bytes);
         } else {
-            null_kernel = shader;
+            null_kernel = std::move(shader_info);
         }
     }
 
@@ -262,7 +271,7 @@ VKComputePipeline& VKPipelineCache::GetComputePipeline(const ComputePipelineCach
     return *entry;
 }
 
-void VKPipelineCache::Unregister(const Shader& shader) {
+void VKPipelineCache::OnShaderRemoval(Shader* shader) {
     bool finished = false;
     const auto Finish = [&] {
         // TODO(Rodrigo): Instead of finishing here, wait for the fences that use this pipeline and
@@ -294,8 +303,6 @@ void VKPipelineCache::Unregister(const Shader& shader) {
         Finish();
         it = compute_cache.erase(it);
     }
-
-    RasterizerCache::Unregister(shader);
 }
 
 std::pair<SPIRVProgram, std::vector<VkDescriptorSetLayoutBinding>>
@@ -312,7 +319,9 @@ VKPipelineCache::DecompileShaders(const GraphicsPipelineCacheKey& key) {
         ASSERT(point_size != 0.0f);
     }
     for (std::size_t i = 0; i < Maxwell::NumVertexAttributes; ++i) {
-        specialization.attribute_types[i] = fixed_state.vertex_input.attributes[i].Type();
+        const auto& attribute = fixed_state.vertex_input.attributes[i];
+        specialization.enabled_attributes[i] = attribute.enabled.Value() != 0;
+        specialization.attribute_types[i] = attribute.Type();
     }
     specialization.ndc_minus_one_to_one = fixed_state.rasterizer.ndc_minus_one_to_one;
 
@@ -328,13 +337,11 @@ VKPipelineCache::DecompileShaders(const GraphicsPipelineCacheKey& key) {
         }
 
         const GPUVAddr gpu_addr = GetShaderAddress(system, program_enum);
-        const auto cpu_addr = memory_manager.GpuToCpuAddress(gpu_addr);
-        ASSERT(cpu_addr);
-        const auto shader = TryGet(*cpu_addr);
-        ASSERT(shader);
+        const std::optional<VAddr> cpu_addr = memory_manager.GpuToCpuAddress(gpu_addr);
+        Shader* const shader = cpu_addr ? TryGet(*cpu_addr) : null_shader.get();
 
         const std::size_t stage = index == 0 ? 0 : index - 1; // Stage indices are 0 - 5
-        const auto program_type = GetShaderType(program_enum);
+        const ShaderType program_type = GetShaderType(program_enum);
         const auto& entries = shader->GetEntries();
         program[stage] = {
             Decompile(device, shader->GetIR(), program_type, shader->GetRegistry(), specialization),
@@ -376,16 +383,17 @@ void AddEntry(std::vector<VkDescriptorUpdateTemplateEntry>& template_entries, u3
         return;
     }
 
-    if constexpr (descriptor_type == UNIFORM_TEXEL_BUFFER) {
-        // Nvidia has a bug where updating multiple uniform texels at once causes the driver to
-        // crash.
+    if constexpr (descriptor_type == UNIFORM_TEXEL_BUFFER ||
+                  descriptor_type == STORAGE_TEXEL_BUFFER) {
+        // Nvidia has a bug where updating multiple texels at once causes the driver to crash.
+        // Note: Fixed in driver Windows 443.24, Linux 440.66.15
         for (u32 i = 0; i < count; ++i) {
             VkDescriptorUpdateTemplateEntry& entry = template_entries.emplace_back();
             entry.dstBinding = binding + i;
             entry.dstArrayElement = 0;
             entry.descriptorCount = 1;
             entry.descriptorType = descriptor_type;
-            entry.offset = offset + i * entry_size;
+            entry.offset = static_cast<std::size_t>(offset + i * entry_size);
             entry.stride = entry_size;
         }
     } else if (count > 0) {
@@ -406,8 +414,9 @@ void FillDescriptorUpdateTemplateEntries(
     std::vector<VkDescriptorUpdateTemplateEntryKHR>& template_entries) {
     AddEntry<UNIFORM_BUFFER>(template_entries, offset, binding, entries.const_buffers);
     AddEntry<STORAGE_BUFFER>(template_entries, offset, binding, entries.global_buffers);
-    AddEntry<UNIFORM_TEXEL_BUFFER>(template_entries, offset, binding, entries.texel_buffers);
+    AddEntry<UNIFORM_TEXEL_BUFFER>(template_entries, offset, binding, entries.uniform_texels);
     AddEntry<COMBINED_IMAGE_SAMPLER>(template_entries, offset, binding, entries.samplers);
+    AddEntry<STORAGE_TEXEL_BUFFER>(template_entries, offset, binding, entries.storage_texels);
     AddEntry<STORAGE_IMAGE>(template_entries, offset, binding, entries.images);
 }
 
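AddEntry's texel-buffer branch is a driver workaround: instead of one template entry with descriptorCount equal to the array size, it emits one entry per descriptor so affected Nvidia drivers never update several texel buffers in one batch. The same idea in isolation, as a hypothetical helper rather than the yuzu function:

    #include <cstddef>
    #include <cstdint>
    #include <vector>
    #include <vulkan/vulkan.h>

    void AddTexelBufferEntries(std::vector<VkDescriptorUpdateTemplateEntry>& entries,
                               VkDescriptorType type, std::uint32_t first_binding,
                               std::uint32_t count, std::size_t base_offset,
                               std::size_t entry_size) {
        for (std::uint32_t i = 0; i < count; ++i) {
            VkDescriptorUpdateTemplateEntry entry{};
            entry.dstBinding = first_binding + i; // one binding per descriptor
            entry.dstArrayElement = 0;
            entry.descriptorCount = 1;            // never more than one at a time
            entry.descriptorType = type;
            entry.offset = base_offset + i * entry_size;
            entry.stride = entry_size;
            entries.push_back(entry);
        }
    }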
diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.h b/src/video_core/renderer_vulkan/vk_pipeline_cache.h
index 0b5796fef..0a36e5112 100644
--- a/src/video_core/renderer_vulkan/vk_pipeline_cache.h
+++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.h
@@ -17,7 +17,6 @@
 #include "common/common_types.h"
 #include "video_core/engines/const_buffer_engine_interface.h"
 #include "video_core/engines/maxwell_3d.h"
-#include "video_core/rasterizer_cache.h"
 #include "video_core/renderer_vulkan/fixed_pipeline_state.h"
 #include "video_core/renderer_vulkan/vk_graphics_pipeline.h"
 #include "video_core/renderer_vulkan/vk_renderpass_cache.h"
@@ -26,6 +25,7 @@
 #include "video_core/shader/memory_util.h"
 #include "video_core/shader/registry.h"
 #include "video_core/shader/shader_ir.h"
+#include "video_core/shader_cache.h"
 
 namespace Core {
 class System;
@@ -41,8 +41,6 @@ class VKFence;
 class VKScheduler;
 class VKUpdateDescriptorQueue;
 
-class CachedShader;
-using Shader = std::shared_ptr<CachedShader>;
 using Maxwell = Tegra::Engines::Maxwell3D::Regs;
 
 struct GraphicsPipelineCacheKey {
@@ -102,21 +100,16 @@ struct hash<Vulkan::ComputePipelineCacheKey> {
 
 namespace Vulkan {
 
-class CachedShader final : public RasterizerCacheObject {
+class Shader {
 public:
-    explicit CachedShader(Core::System& system, Tegra::Engines::ShaderType stage, GPUVAddr gpu_addr,
-                          VAddr cpu_addr, VideoCommon::Shader::ProgramCode program_code,
-                          u32 main_offset);
-    ~CachedShader();
+    explicit Shader(Core::System& system, Tegra::Engines::ShaderType stage, GPUVAddr gpu_addr,
+                    VideoCommon::Shader::ProgramCode program_code, u32 main_offset);
+    ~Shader();
 
     GPUVAddr GetGpuAddr() const {
         return gpu_addr;
     }
 
-    std::size_t GetSizeInBytes() const override {
-        return program_code.size() * sizeof(u64);
-    }
-
     VideoCommon::Shader::ShaderIR& GetIR() {
         return shader_ir;
     }
@@ -144,25 +137,23 @@ private:
     ShaderEntries entries;
 };
 
-class VKPipelineCache final : public RasterizerCache<Shader> {
+class VKPipelineCache final : public VideoCommon::ShaderCache<Shader> {
 public:
     explicit VKPipelineCache(Core::System& system, RasterizerVulkan& rasterizer,
                              const VKDevice& device, VKScheduler& scheduler,
                              VKDescriptorPool& descriptor_pool,
                              VKUpdateDescriptorQueue& update_descriptor_queue,
                              VKRenderPassCache& renderpass_cache);
-    ~VKPipelineCache();
+    ~VKPipelineCache() override;
 
-    std::array<Shader, Maxwell::MaxShaderProgram> GetShaders();
+    std::array<Shader*, Maxwell::MaxShaderProgram> GetShaders();
 
     VKGraphicsPipeline& GetGraphicsPipeline(const GraphicsPipelineCacheKey& key);
 
     VKComputePipeline& GetComputePipeline(const ComputePipelineCacheKey& key);
 
 protected:
-    void Unregister(const Shader& shader) override;
-
-    void FlushObjectInner(const Shader& object) override {}
+    void OnShaderRemoval(Shader* shader) final;
 
 private:
     std::pair<SPIRVProgram, std::vector<VkDescriptorSetLayoutBinding>> DecompileShaders(
@@ -175,10 +166,10 @@ private:
     VKUpdateDescriptorQueue& update_descriptor_queue;
     VKRenderPassCache& renderpass_cache;
 
-    Shader null_shader{};
-    Shader null_kernel{};
+    std::unique_ptr<Shader> null_shader;
+    std::unique_ptr<Shader> null_kernel;
 
-    std::array<Shader, Maxwell::MaxShaderProgram> last_shaders;
+    std::array<Shader*, Maxwell::MaxShaderProgram> last_shaders{};
 
     GraphicsPipelineCacheKey last_graphics_key;
     VKGraphicsPipeline* last_graphics_pipeline = nullptr;
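The move from RasterizerCache<std::shared_ptr<CachedShader>> to VideoCommon::ShaderCache<Shader> changes the ownership model: the cache holds each shader in a unique_ptr keyed by CPU address and hands out raw pointers, as GetShaders and GetComputePipeline above now do. A simplified sketch of that contract, with stand-in types rather than the real ShaderCache interface:

    #include <cstddef>
    #include <memory>
    #include <unordered_map>

    using VAddr = unsigned long long;

    struct ShaderInfo {}; // stand-in for the Shader class

    class ShaderCacheSketch {
    public:
        ShaderInfo* TryGet(VAddr addr) const {
            const auto it = storage.find(addr);
            return it != storage.end() ? it->second.get() : nullptr;
        }
        void Register(std::unique_ptr<ShaderInfo> shader, VAddr addr, std::size_t size) {
            storage.emplace(addr, std::move(shader)); // cache takes sole ownership
            // `size` would feed invalidation bookkeeping in the real cache.
        }
    private:
        std::unordered_map<VAddr, std::unique_ptr<ShaderInfo>> storage;
    };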
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index 17a2efe8e..184b2238a 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -38,6 +38,7 @@
 #include "video_core/renderer_vulkan/vk_texture_cache.h"
 #include "video_core/renderer_vulkan/vk_update_descriptor.h"
 #include "video_core/renderer_vulkan/wrapper.h"
+#include "video_core/shader_cache.h"
 
 namespace Vulkan {
 
@@ -98,7 +99,7 @@ VkRect2D GetScissorState(const Maxwell& regs, std::size_t index) {
 }
 
 std::array<GPUVAddr, Maxwell::MaxShaderProgram> GetShaderAddresses(
-    const std::array<Shader, Maxwell::MaxShaderProgram>& shaders) {
+    const std::array<Shader*, Maxwell::MaxShaderProgram>& shaders) {
     std::array<GPUVAddr, Maxwell::MaxShaderProgram> addresses;
     for (std::size_t i = 0; i < std::size(addresses); ++i) {
         addresses[i] = shaders[i] ? shaders[i]->GetGpuAddr() : 0;
@@ -117,6 +118,17 @@ template <typename Engine, typename Entry>
 Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry& entry,
                                                std::size_t stage, std::size_t index = 0) {
     const auto stage_type = static_cast<Tegra::Engines::ShaderType>(stage);
+    if constexpr (std::is_same_v<Entry, SamplerEntry>) {
+        if (entry.is_separated) {
+            const u32 buffer_1 = entry.buffer;
+            const u32 buffer_2 = entry.secondary_buffer;
+            const u32 offset_1 = entry.offset;
+            const u32 offset_2 = entry.secondary_offset;
+            const u32 handle_1 = engine.AccessConstBuffer32(stage_type, buffer_1, offset_1);
+            const u32 handle_2 = engine.AccessConstBuffer32(stage_type, buffer_2, offset_2);
+            return engine.GetTextureInfo(handle_1 | handle_2);
+        }
+    }
     if (entry.is_bindless) {
         const auto tex_handle = engine.AccessConstBuffer32(stage_type, entry.buffer, entry.offset);
         return engine.GetTextureInfo(tex_handle);
@@ -468,8 +480,9 @@ void RasterizerVulkan::DispatchCompute(GPUVAddr code_addr) {
     const auto& entries = pipeline.GetEntries();
     SetupComputeConstBuffers(entries);
     SetupComputeGlobalBuffers(entries);
-    SetupComputeTexelBuffers(entries);
+    SetupComputeUniformTexels(entries);
     SetupComputeTextures(entries);
+    SetupComputeStorageTexels(entries);
     SetupComputeImages(entries);
 
     buffer_cache.Unmap();
@@ -532,14 +545,14 @@ void RasterizerVulkan::OnCPUWrite(VAddr addr, u64 size) {
         return;
     }
     texture_cache.OnCPUWrite(addr, size);
-    pipeline_cache.InvalidateRegion(addr, size);
+    pipeline_cache.OnCPUWrite(addr, size);
     buffer_cache.OnCPUWrite(addr, size);
-    query_cache.InvalidateRegion(addr, size);
 }
 
 void RasterizerVulkan::SyncGuestHost() {
     texture_cache.SyncGuestHost();
     buffer_cache.SyncGuestHost();
+    pipeline_cache.SyncGuestHost();
 }
 
 void RasterizerVulkan::SignalSemaphore(GPUVAddr addr, u32 value) {
@@ -715,7 +728,7 @@ std::tuple<VkFramebuffer, VkExtent2D> RasterizerVulkan::ConfigureFramebuffers(
         if (!view) {
             return false;
         }
-        key.views.push_back(view->GetHandle());
+        key.views.push_back(view->GetAttachment());
         key.width = std::min(key.width, view->GetWidth());
         key.height = std::min(key.height, view->GetHeight());
         key.layers = std::min(key.layers, view->GetNumLayers());
@@ -775,20 +788,21 @@ RasterizerVulkan::DrawParameters RasterizerVulkan::SetupGeometry(FixedPipelineSt
 }
 
 void RasterizerVulkan::SetupShaderDescriptors(
-    const std::array<Shader, Maxwell::MaxShaderProgram>& shaders) {
+    const std::array<Shader*, Maxwell::MaxShaderProgram>& shaders) {
     texture_cache.GuardSamplers(true);
 
     for (std::size_t stage = 0; stage < Maxwell::MaxShaderStage; ++stage) {
         // Skip VertexA stage
-        const auto& shader = shaders[stage + 1];
+        Shader* const shader = shaders[stage + 1];
         if (!shader) {
             continue;
         }
         const auto& entries = shader->GetEntries();
         SetupGraphicsConstBuffers(entries, stage);
         SetupGraphicsGlobalBuffers(entries, stage);
-        SetupGraphicsTexelBuffers(entries, stage);
+        SetupGraphicsUniformTexels(entries, stage);
         SetupGraphicsTextures(entries, stage);
+        SetupGraphicsStorageTexels(entries, stage);
         SetupGraphicsImages(entries, stage);
     }
     texture_cache.GuardSamplers(false);
@@ -838,6 +852,10 @@ void RasterizerVulkan::BeginTransformFeedback() {
     if (regs.tfb_enabled == 0) {
         return;
     }
+    if (!device.IsExtTransformFeedbackSupported()) {
+        LOG_ERROR(Render_Vulkan, "Transform feedbacks used but not supported");
+        return;
+    }
 
     UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationControl) ||
                      regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationEval) ||
@@ -866,6 +884,9 @@ void RasterizerVulkan::EndTransformFeedback() {
     if (regs.tfb_enabled == 0) {
         return;
     }
+    if (!device.IsExtTransformFeedbackSupported()) {
+        return;
+    }
 
     scheduler.Record(
         [](vk::CommandBuffer cmdbuf) { cmdbuf.EndTransformFeedbackEXT(0, 0, nullptr, nullptr); });
@@ -877,14 +898,10 @@ void RasterizerVulkan::SetupVertexArrays(FixedPipelineState::VertexInput& vertex
 
     for (std::size_t index = 0; index < Maxwell::NumVertexAttributes; ++index) {
         const auto& attrib = regs.vertex_attrib_format[index];
-        if (!attrib.IsValid()) {
+        if (attrib.IsConstant()) {
             vertex_input.SetAttribute(index, false, 0, 0, {}, {});
             continue;
         }
-
-        [[maybe_unused]] const auto& buffer = regs.vertex_array[attrib.buffer];
-        ASSERT(buffer.IsEnabled());
-
         vertex_input.SetAttribute(index, true, attrib.buffer, attrib.offset, attrib.type.Value(),
                                   attrib.size.Value());
     }
@@ -980,12 +997,12 @@ void RasterizerVulkan::SetupGraphicsGlobalBuffers(const ShaderEntries& entries,
     }
 }
 
-void RasterizerVulkan::SetupGraphicsTexelBuffers(const ShaderEntries& entries, std::size_t stage) {
+void RasterizerVulkan::SetupGraphicsUniformTexels(const ShaderEntries& entries, std::size_t stage) {
     MICROPROFILE_SCOPE(Vulkan_Textures);
     const auto& gpu = system.GPU().Maxwell3D();
-    for (const auto& entry : entries.texel_buffers) {
+    for (const auto& entry : entries.uniform_texels) {
         const auto image = GetTextureInfo(gpu, entry, stage).tic;
-        SetupTexelBuffer(image, entry);
+        SetupUniformTexels(image, entry);
     }
 }
 
@@ -1000,6 +1017,15 @@ void RasterizerVulkan::SetupGraphicsTextures(const ShaderEntries& entries, std::
     }
 }
 
+void RasterizerVulkan::SetupGraphicsStorageTexels(const ShaderEntries& entries, std::size_t stage) {
+    MICROPROFILE_SCOPE(Vulkan_Textures);
+    const auto& gpu = system.GPU().Maxwell3D();
+    for (const auto& entry : entries.storage_texels) {
+        const auto image = GetTextureInfo(gpu, entry, stage).tic;
+        SetupStorageTexel(image, entry);
+    }
+}
+
 void RasterizerVulkan::SetupGraphicsImages(const ShaderEntries& entries, std::size_t stage) {
     MICROPROFILE_SCOPE(Vulkan_Images);
     const auto& gpu = system.GPU().Maxwell3D();
@@ -1032,12 +1058,12 @@ void RasterizerVulkan::SetupComputeGlobalBuffers(const ShaderEntries& entries) {
     }
 }
 
-void RasterizerVulkan::SetupComputeTexelBuffers(const ShaderEntries& entries) {
+void RasterizerVulkan::SetupComputeUniformTexels(const ShaderEntries& entries) {
     MICROPROFILE_SCOPE(Vulkan_Textures);
     const auto& gpu = system.GPU().KeplerCompute();
-    for (const auto& entry : entries.texel_buffers) {
+    for (const auto& entry : entries.uniform_texels) {
         const auto image = GetTextureInfo(gpu, entry, ComputeShaderIndex).tic;
-        SetupTexelBuffer(image, entry);
+        SetupUniformTexels(image, entry);
     }
 }
 
@@ -1052,6 +1078,15 @@ void RasterizerVulkan::SetupComputeTextures(const ShaderEntries& entries) {
     }
 }
 
+void RasterizerVulkan::SetupComputeStorageTexels(const ShaderEntries& entries) {
+    MICROPROFILE_SCOPE(Vulkan_Textures);
+    const auto& gpu = system.GPU().KeplerCompute();
+    for (const auto& entry : entries.storage_texels) {
+        const auto image = GetTextureInfo(gpu, entry, ComputeShaderIndex).tic;
+        SetupStorageTexel(image, entry);
+    }
+}
+
 void RasterizerVulkan::SetupComputeImages(const ShaderEntries& entries) {
     MICROPROFILE_SCOPE(Vulkan_Images);
     const auto& gpu = system.GPU().KeplerCompute();
@@ -1101,8 +1136,8 @@ void RasterizerVulkan::SetupGlobalBuffer(const GlobalBufferEntry& entry, GPUVAdd
     update_descriptor_queue.AddBuffer(buffer, offset, size);
 }
 
-void RasterizerVulkan::SetupTexelBuffer(const Tegra::Texture::TICEntry& tic,
-                                        const TexelBufferEntry& entry) {
+void RasterizerVulkan::SetupUniformTexels(const Tegra::Texture::TICEntry& tic,
+                                          const UniformTexelEntry& entry) {
     const auto view = texture_cache.GetTextureSurface(tic, entry);
     ASSERT(view->IsBufferView());
 
@@ -1114,8 +1149,8 @@ void RasterizerVulkan::SetupTexture(const Tegra::Texture::FullTextureInfo& textu
     auto view = texture_cache.GetTextureSurface(texture.tic, entry);
     ASSERT(!view->IsBufferView());
 
-    const auto image_view = view->GetHandle(texture.tic.x_source, texture.tic.y_source,
-                                            texture.tic.z_source, texture.tic.w_source);
+    const VkImageView image_view = view->GetImageView(texture.tic.x_source, texture.tic.y_source,
+                                                      texture.tic.z_source, texture.tic.w_source);
     const auto sampler = sampler_cache.GetSampler(texture.tsc);
     update_descriptor_queue.AddSampledImage(sampler, image_view);
 
@@ -1124,6 +1159,14 @@ void RasterizerVulkan::SetupTexture(const Tegra::Texture::FullTextureInfo& textu
     sampled_views.push_back(ImageView{std::move(view), image_layout});
 }
 
+void RasterizerVulkan::SetupStorageTexel(const Tegra::Texture::TICEntry& tic,
+                                         const StorageTexelEntry& entry) {
+    const auto view = texture_cache.GetImageSurface(tic, entry);
+    ASSERT(view->IsBufferView());
+
+    update_descriptor_queue.AddTexelBuffer(view->GetBufferView());
+}
+
 void RasterizerVulkan::SetupImage(const Tegra::Texture::TICEntry& tic, const ImageEntry& entry) {
     auto view = texture_cache.GetImageSurface(tic, entry);
 
@@ -1133,7 +1176,8 @@ void RasterizerVulkan::SetupImage(const Tegra::Texture::TICEntry& tic, const Ima
 
     UNIMPLEMENTED_IF(tic.IsBuffer());
 
-    const auto image_view = view->GetHandle(tic.x_source, tic.y_source, tic.z_source, tic.w_source);
+    const VkImageView image_view =
+        view->GetImageView(tic.x_source, tic.y_source, tic.z_source, tic.w_source);
     update_descriptor_queue.AddImage(image_view);
 
     const auto image_layout = update_descriptor_queue.GetLastImageLayout();
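The is_separated branch added to GetTextureInfo earlier in this file handles separated texture/sampler handles: the two halves are read from different const buffer slots and OR-ed into one Maxwell handle, which works because the texture and sampler indices occupy disjoint bit ranges. Roughly, with a reader callback standing in for engine.AccessConstBuffer32:

    #include <cstdint>
    #include <functional>

    using ConstBufferReader =
        std::function<std::uint32_t(std::uint32_t buffer, std::uint32_t offset)>;

    std::uint32_t CombineSeparatedHandle(const ConstBufferReader& read,
                                         std::uint32_t texture_buffer,
                                         std::uint32_t texture_offset,
                                         std::uint32_t sampler_buffer,
                                         std::uint32_t sampler_offset) {
        const std::uint32_t texture_word = read(texture_buffer, texture_offset);
        const std::uint32_t sampler_word = read(sampler_buffer, sampler_offset);
        return texture_word | sampler_word; // the two halves occupy disjoint bits
    }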
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h
index 0ed0e48c6..c8c187606 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.h
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.h
@@ -168,7 +168,7 @@ private:
                       bool is_indexed, bool is_instanced);
 
     /// Setup descriptors in the graphics pipeline.
-    void SetupShaderDescriptors(const std::array<Shader, Maxwell::MaxShaderProgram>& shaders);
+    void SetupShaderDescriptors(const std::array<Shader*, Maxwell::MaxShaderProgram>& shaders);
 
     void SetupImageTransitions(Texceptions texceptions,
                                const std::array<View, Maxwell::NumRenderTargets>& color_attachments,
@@ -193,12 +193,15 @@ private:
     /// Setup global buffers in the graphics pipeline.
     void SetupGraphicsGlobalBuffers(const ShaderEntries& entries, std::size_t stage);
 
-    /// Setup texel buffers in the graphics pipeline.
-    void SetupGraphicsTexelBuffers(const ShaderEntries& entries, std::size_t stage);
+    /// Setup uniform texels in the graphics pipeline.
+    void SetupGraphicsUniformTexels(const ShaderEntries& entries, std::size_t stage);
 
     /// Setup textures in the graphics pipeline.
     void SetupGraphicsTextures(const ShaderEntries& entries, std::size_t stage);
 
+    /// Setup storage texels in the graphics pipeline.
+    void SetupGraphicsStorageTexels(const ShaderEntries& entries, std::size_t stage);
+
     /// Setup images in the graphics pipeline.
     void SetupGraphicsImages(const ShaderEntries& entries, std::size_t stage);
 
@@ -209,11 +212,14 @@ private:
     void SetupComputeGlobalBuffers(const ShaderEntries& entries);
 
     /// Setup texel buffers in the compute pipeline.
-    void SetupComputeTexelBuffers(const ShaderEntries& entries);
+    void SetupComputeUniformTexels(const ShaderEntries& entries);
 
     /// Setup textures in the compute pipeline.
     void SetupComputeTextures(const ShaderEntries& entries);
 
+    /// Setup storage texels in the compute pipeline.
+    void SetupComputeStorageTexels(const ShaderEntries& entries);
+
     /// Setup images in the compute pipeline.
     void SetupComputeImages(const ShaderEntries& entries);
 
@@ -222,10 +228,12 @@ private:
 
     void SetupGlobalBuffer(const GlobalBufferEntry& entry, GPUVAddr address);
 
-    void SetupTexelBuffer(const Tegra::Texture::TICEntry& image, const TexelBufferEntry& entry);
+    void SetupUniformTexels(const Tegra::Texture::TICEntry& image, const UniformTexelEntry& entry);
 
     void SetupTexture(const Tegra::Texture::FullTextureInfo& texture, const SamplerEntry& entry);
 
+    void SetupStorageTexel(const Tegra::Texture::TICEntry& tic, const StorageTexelEntry& entry);
+
     void SetupImage(const Tegra::Texture::TICEntry& tic, const ImageEntry& entry);
 
     void UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs);
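The renames in this header track the two Vulkan descriptor types involved: uniform texels map to read-only VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER bindings and storage texels to writable VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER bindings; both bind a VkBufferView rather than an image view. A small sketch of building such a view — the fields shown are the standard ones, nothing yuzu-specific:

    #include <vulkan/vulkan.h>

    VkBufferViewCreateInfo MakeTexelBufferView(VkBuffer buffer, VkFormat format,
                                               VkDeviceSize offset, VkDeviceSize range) {
        VkBufferViewCreateInfo ci{};
        ci.sType = VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO;
        ci.buffer = buffer; // backing buffer needs the matching TEXEL_BUFFER usage flag
        ci.format = format; // texel format the shader will see
        ci.offset = offset;
        ci.range = range;
        return ci;
    }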
diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
index 167e20e91..97429cc59 100644
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
@@ -400,8 +400,9 @@ private:
         u32 binding = specialization.base_binding;
         binding = DeclareConstantBuffers(binding);
         binding = DeclareGlobalBuffers(binding);
-        binding = DeclareTexelBuffers(binding);
+        binding = DeclareUniformTexels(binding);
         binding = DeclareSamplers(binding);
+        binding = DeclareStorageTexels(binding);
         binding = DeclareImages(binding);
 
         const Id main = OpFunction(t_void, {}, TypeFunction(t_void));
@@ -515,6 +516,16 @@ private:
     void DeclareCommon() {
         thread_id =
             DeclareInputBuiltIn(spv::BuiltIn::SubgroupLocalInvocationId, t_in_uint, "thread_id");
+        thread_masks[0] =
+            DeclareInputBuiltIn(spv::BuiltIn::SubgroupEqMask, t_in_uint4, "thread_eq_mask");
+        thread_masks[1] =
+            DeclareInputBuiltIn(spv::BuiltIn::SubgroupGeMask, t_in_uint4, "thread_ge_mask");
+        thread_masks[2] =
+            DeclareInputBuiltIn(spv::BuiltIn::SubgroupGtMask, t_in_uint4, "thread_gt_mask");
+        thread_masks[3] =
+            DeclareInputBuiltIn(spv::BuiltIn::SubgroupLeMask, t_in_uint4, "thread_le_mask");
+        thread_masks[4] =
+            DeclareInputBuiltIn(spv::BuiltIn::SubgroupLtMask, t_in_uint4, "thread_lt_mask");
     }
 
     void DeclareVertex() {
@@ -731,8 +742,10 @@ private:
             if (!IsGenericAttribute(index)) {
                 continue;
             }
-
             const u32 location = GetGenericAttributeLocation(index);
+            if (!IsAttributeEnabled(location)) {
+                continue;
+            }
             const auto type_descriptor = GetAttributeType(location);
             Id type;
             if (IsInputAttributeArray()) {
@@ -877,7 +890,7 @@ private:
         return binding;
     }
 
-    u32 DeclareTexelBuffers(u32 binding) {
+    u32 DeclareUniformTexels(u32 binding) {
         for (const auto& sampler : ir.GetSamplers()) {
             if (!sampler.is_buffer) {
                 continue;
@@ -898,7 +911,7 @@ private:
             Decorate(id, spv::Decoration::Binding, binding++);
             Decorate(id, spv::Decoration::DescriptorSet, DESCRIPTOR_SET);
 
-            texel_buffers.emplace(sampler.index, TexelBuffer{image_type, id});
+            uniform_texels.emplace(sampler.index, TexelBuffer{image_type, id});
         }
         return binding;
     }
@@ -933,31 +946,48 @@ private:
         return binding;
     }
 
-    u32 DeclareImages(u32 binding) {
+    u32 DeclareStorageTexels(u32 binding) {
         for (const auto& image : ir.GetImages()) {
-            const auto [dim, arrayed] = GetImageDim(image);
-            constexpr int depth = 0;
-            constexpr bool ms = false;
-            constexpr int sampled = 2; // This won't be accessed with a sampler
-            constexpr auto format = spv::ImageFormat::Unknown;
-            const Id image_type = TypeImage(t_uint, dim, depth, arrayed, ms, sampled, format, {});
-            const Id pointer_type = TypePointer(spv::StorageClass::UniformConstant, image_type);
-            const Id id = OpVariable(pointer_type, spv::StorageClass::UniformConstant);
-            AddGlobalVariable(Name(id, fmt::format("image_{}", image.index)));
-
-            Decorate(id, spv::Decoration::Binding, binding++);
-            Decorate(id, spv::Decoration::DescriptorSet, DESCRIPTOR_SET);
-            if (image.is_read && !image.is_written) {
-                Decorate(id, spv::Decoration::NonWritable);
-            } else if (image.is_written && !image.is_read) {
-                Decorate(id, spv::Decoration::NonReadable);
+            if (image.type != Tegra::Shader::ImageType::TextureBuffer) {
+                continue;
             }
+            DeclareImage(image, binding);
+        }
+        return binding;
+    }
 
-            images.emplace(image.index, StorageImage{image_type, id});
+    u32 DeclareImages(u32 binding) {
+        for (const auto& image : ir.GetImages()) {
+            if (image.type == Tegra::Shader::ImageType::TextureBuffer) {
+                continue;
+            }
+            DeclareImage(image, binding);
         }
         return binding;
     }
 
+    void DeclareImage(const Image& image, u32& binding) {
+        const auto [dim, arrayed] = GetImageDim(image);
+        constexpr int depth = 0;
+        constexpr bool ms = false;
+        constexpr int sampled = 2; // This won't be accessed with a sampler
+        const auto format = image.is_atomic ? spv::ImageFormat::R32ui : spv::ImageFormat::Unknown;
+        const Id image_type = TypeImage(t_uint, dim, depth, arrayed, ms, sampled, format, {});
+        const Id pointer_type = TypePointer(spv::StorageClass::UniformConstant, image_type);
+        const Id id = OpVariable(pointer_type, spv::StorageClass::UniformConstant);
+        AddGlobalVariable(Name(id, fmt::format("image_{}", image.index)));
+
+        Decorate(id, spv::Decoration::Binding, binding++);
+        Decorate(id, spv::Decoration::DescriptorSet, DESCRIPTOR_SET);
+        if (image.is_read && !image.is_written) {
+            Decorate(id, spv::Decoration::NonWritable);
+        } else if (image.is_written && !image.is_read) {
+            Decorate(id, spv::Decoration::NonReadable);
+        }
+
+        images.emplace(image.index, StorageImage{image_type, id});
+    }
+
     bool IsRenderTargetEnabled(u32 rt) const {
         for (u32 component = 0; component < 4; ++component) {
             if (header.ps.IsColorComponentOutputEnabled(rt, component)) {
@@ -976,6 +1006,10 @@ private:
         return stage == ShaderType::TesselationControl;
     }
 
+    bool IsAttributeEnabled(u32 location) const {
+        return stage != ShaderType::Vertex || specialization.enabled_attributes[location];
+    }
+
     u32 GetNumInputVertices() const {
         switch (stage) {
         case ShaderType::Geometry:
@@ -1071,8 +1105,7 @@ private:
 
     void VisitBasicBlock(const NodeBlock& bb) {
         for (const auto& node : bb) {
-            [[maybe_unused]] const Type type = Visit(node).type;
-            ASSERT(type == Type::Void);
+            Visit(node);
         }
     }
 
@@ -1192,16 +1225,20 @@ private:
1192 UNIMPLEMENTED_MSG("Unmanaged FrontFacing element={}", element); 1225 UNIMPLEMENTED_MSG("Unmanaged FrontFacing element={}", element);
1193 return {v_float_zero, Type::Float}; 1226 return {v_float_zero, Type::Float};
1194 default: 1227 default:
1195 if (IsGenericAttribute(attribute)) { 1228 if (!IsGenericAttribute(attribute)) {
1196 const u32 location = GetGenericAttributeLocation(attribute); 1229 break;
1197 const auto type_descriptor = GetAttributeType(location);
1198 const Type type = type_descriptor.type;
1199 const Id attribute_id = input_attributes.at(attribute);
1200 const std::vector elements = {element};
1201 const Id pointer = ArrayPass(type_descriptor.scalar, attribute_id, elements);
1202 return {OpLoad(GetTypeDefinition(type), pointer), type};
1203 } 1230 }
1204 break; 1231 const u32 location = GetGenericAttributeLocation(attribute);
1232 if (!IsAttributeEnabled(location)) {
1233 // Disabled attributes (also known as constant attributes) always return zero.
1234 return {v_float_zero, Type::Float};
1235 }
1236 const auto type_descriptor = GetAttributeType(location);
1237 const Type type = type_descriptor.type;
1238 const Id attribute_id = input_attributes.at(attribute);
1239 const std::vector elements = {element};
1240 const Id pointer = ArrayPass(type_descriptor.scalar, attribute_id, elements);
1241 return {OpLoad(GetTypeDefinition(type), pointer), type};
1205 } 1242 }
1206 UNIMPLEMENTED_MSG("Unhandled input attribute: {}", static_cast<u32>(attribute)); 1243 UNIMPLEMENTED_MSG("Unhandled input attribute: {}", static_cast<u32>(attribute));
1207 return {v_float_zero, Type::Float}; 1244 return {v_float_zero, Type::Float};
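The two hunks above cooperate: IsAttributeEnabled consults the host-provided specialization bitset, but only for vertex shaders, and the attribute read site folds disabled (constant) attributes to zero instead of loading them. A self-contained sketch of the rule, with the stage enum and specialization struct reduced to what the check needs:

    #include <bitset>
    #include <cstddef>
    #include <cstdio>

    enum class ShaderType { Vertex, Fragment };

    struct Specialization {
        std::bitset<32> enabled_attributes;
    };

    bool IsAttributeEnabled(ShaderType stage, const Specialization& spec, std::size_t location) {
        // Only vertex shaders consult the bitset; every other stage reads unconditionally
        return stage != ShaderType::Vertex || spec.enabled_attributes[location];
    }

    int main() {
        Specialization spec;
        spec.enabled_attributes[0] = true; // host enabled only location 0
        std::printf("%d\n", IsAttributeEnabled(ShaderType::Vertex, spec, 1));   // 0: folds to zero
        std::printf("%d\n", IsAttributeEnabled(ShaderType::Fragment, spec, 1)); // 1: always enabled
    }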
@@ -1237,7 +1274,7 @@ private:
1237 } else { 1274 } else {
1238 UNREACHABLE_MSG("Unmanaged offset node type"); 1275 UNREACHABLE_MSG("Unmanaged offset node type");
1239 } 1276 }
1240 pointer = OpAccessChain(t_cbuf_float, buffer_id, Constant(t_uint, 0), buffer_index, 1277 pointer = OpAccessChain(t_cbuf_float, buffer_id, v_uint_zero, buffer_index,
1241 buffer_element); 1278 buffer_element);
1242 } 1279 }
1243 return {OpLoad(t_float, pointer), Type::Float}; 1280 return {OpLoad(t_float, pointer), Type::Float};
@@ -1362,7 +1399,9 @@ private:
1362 Expression target{}; 1399 Expression target{};
1363 if (const auto gpr = std::get_if<GprNode>(&*dest)) { 1400 if (const auto gpr = std::get_if<GprNode>(&*dest)) {
1364 if (gpr->GetIndex() == Register::ZeroIndex) { 1401 if (gpr->GetIndex() == Register::ZeroIndex) {
1365 // Writing to Register::ZeroIndex is a no op 1402 // Writing to Register::ZeroIndex is a no op but we still have to visit its source
1403 // because it might have side effects.
1404 Visit(src);
1366 return {}; 1405 return {};
1367 } 1406 }
1368 target = {registers.at(gpr->GetIndex()), Type::Float}; 1407 target = {registers.at(gpr->GetIndex()), Type::Float};
@@ -1590,7 +1629,7 @@ private:
1590 1629
1591 const Id result = OpIAddCarry(TypeStruct({t_uint, t_uint}), op_a, op_b); 1630 const Id result = OpIAddCarry(TypeStruct({t_uint, t_uint}), op_a, op_b);
1592 const Id carry = OpCompositeExtract(t_uint, result, 1); 1631 const Id carry = OpCompositeExtract(t_uint, result, 1);
1593 return {OpINotEqual(t_bool, carry, Constant(t_uint, 0)), Type::Bool}; 1632 return {OpINotEqual(t_bool, carry, v_uint_zero), Type::Bool};
1594 } 1633 }
1595 1634
1596 Expression LogicalAssign(Operation operation) { 1635 Expression LogicalAssign(Operation operation) {
@@ -1653,7 +1692,7 @@ private:
1653 const auto& meta = std::get<MetaTexture>(operation.GetMeta()); 1692 const auto& meta = std::get<MetaTexture>(operation.GetMeta());
1654 const u32 index = meta.sampler.index; 1693 const u32 index = meta.sampler.index;
1655 if (meta.sampler.is_buffer) { 1694 if (meta.sampler.is_buffer) {
1656 const auto& entry = texel_buffers.at(index); 1695 const auto& entry = uniform_texels.at(index);
1657 return OpLoad(entry.image_type, entry.image); 1696 return OpLoad(entry.image_type, entry.image);
1658 } else { 1697 } else {
1659 const auto& entry = sampled_images.at(index); 1698 const auto& entry = sampled_images.at(index);
@@ -1930,39 +1969,20 @@ private:
1930 return {}; 1969 return {};
1931 } 1970 }
1932 1971
1933 Expression AtomicImageAdd(Operation operation) { 1972 template <Id (Module::*func)(Id, Id, Id, Id, Id)>
1934 UNIMPLEMENTED(); 1973 Expression AtomicImage(Operation operation) {
1935 return {}; 1974 const auto& meta{std::get<MetaImage>(operation.GetMeta())};
1936 } 1975 ASSERT(meta.values.size() == 1);
1937
1938 Expression AtomicImageMin(Operation operation) {
1939 UNIMPLEMENTED();
1940 return {};
1941 }
1942
1943 Expression AtomicImageMax(Operation operation) {
1944 UNIMPLEMENTED();
1945 return {};
1946 }
1947
1948 Expression AtomicImageAnd(Operation operation) {
1949 UNIMPLEMENTED();
1950 return {};
1951 }
1952
1953 Expression AtomicImageOr(Operation operation) {
1954 UNIMPLEMENTED();
1955 return {};
1956 }
1957 1976
1958 Expression AtomicImageXor(Operation operation) { 1977 const Id coordinate = GetCoordinates(operation, Type::Int);
1959 UNIMPLEMENTED(); 1978 const Id image = images.at(meta.image.index).image;
1960 return {}; 1979 const Id sample = v_uint_zero;
1961 } 1980 const Id pointer = OpImageTexelPointer(t_image_uint, image, coordinate, sample);
1962 1981
1963 Expression AtomicImageExchange(Operation operation) { 1982 const Id scope = Constant(t_uint, static_cast<u32>(spv::Scope::Device));
1964 UNIMPLEMENTED(); 1983 const Id semantics = v_uint_zero;
1965 return {}; 1984 const Id value = AsUint(Visit(meta.values[0]));
1985 return {(this->*func)(t_uint, pointer, scope, semantics, value), Type::Uint};
1966 } 1986 }
1967 1987
1968 template <Id (Module::*func)(Id, Id, Id, Id, Id)> 1988 template <Id (Module::*func)(Id, Id, Id, Id, Id)>
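The rewrite above collapses five UNIMPLEMENTED stubs into one AtomicImage template: the SPIR-V instruction to emit (OpAtomicIAdd, OpAtomicAnd, and so on) becomes a non-type template parameter, a pointer to the sirit Module member function, and the shared body builds the texel pointer and Device-scope operands once. A minimal sketch of the dispatch technique itself, with toy types standing in for sirit's API:

    #include <cstdio>

    struct Module {
        int OpAtomicIAdd(int a, int b) { return a + b; }
        int OpAtomicAnd(int a, int b) { return a & b; }
    };

    template <int (Module::*func)(int, int)>
    int Atomic(Module& mod, int a, int b) {
        return (mod.*func)(a, b); // one body; the operation is chosen at compile time
    }

    int main() {
        Module mod;
        std::printf("%d\n", Atomic<&Module::OpAtomicIAdd>(mod, 2, 3)); // 5
        std::printf("%d\n", Atomic<&Module::OpAtomicAnd>(mod, 6, 3));  // 2
    }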
@@ -1977,7 +1997,7 @@ private:
1977 return {v_float_zero, Type::Float}; 1997 return {v_float_zero, Type::Float};
1978 } 1998 }
1979 const Id scope = Constant(t_uint, static_cast<u32>(spv::Scope::Device)); 1999 const Id scope = Constant(t_uint, static_cast<u32>(spv::Scope::Device));
1980 const Id semantics = Constant(t_uint, 0); 2000 const Id semantics = v_uint_zero;
1981 const Id value = AsUint(Visit(operation[1])); 2001 const Id value = AsUint(Visit(operation[1]));
1982 2002
1983 return {(this->*func)(t_uint, pointer, scope, semantics, value), Type::Uint}; 2003 return {(this->*func)(t_uint, pointer, scope, semantics, value), Type::Uint};
@@ -2175,14 +2195,37 @@ private:
2175 return {OpLoad(t_uint, thread_id), Type::Uint}; 2195 return {OpLoad(t_uint, thread_id), Type::Uint};
2176 } 2196 }
2177 2197
2198 template <std::size_t index>
2199 Expression ThreadMask(Operation) {
2200 // TODO(Rodrigo): Handle devices with different warp sizes
2201 const Id mask = thread_masks[index];
2202 return {OpLoad(t_uint, AccessElement(t_in_uint, mask, 0)), Type::Uint};
2203 }
2204
2178 Expression ShuffleIndexed(Operation operation) { 2205 Expression ShuffleIndexed(Operation operation) {
2179 const Id value = AsFloat(Visit(operation[0])); 2206 const Id value = AsFloat(Visit(operation[0]));
2180 const Id index = AsUint(Visit(operation[1])); 2207 const Id index = AsUint(Visit(operation[1]));
2181 return {OpSubgroupReadInvocationKHR(t_float, value, index), Type::Float}; 2208 return {OpSubgroupReadInvocationKHR(t_float, value, index), Type::Float};
2182 } 2209 }
2183 2210
2184 Expression MemoryBarrierGL(Operation) { 2211 Expression Barrier(Operation) {
2185 const auto scope = spv::Scope::Device; 2212 if (!ir.IsDecompiled()) {
2213 LOG_ERROR(Render_Vulkan, "OpBarrier used in a shader that is not decompiled");
2214 return {};
2215 }
2216
2217 const auto scope = spv::Scope::Workgroup;
2218 const auto memory = spv::Scope::Workgroup;
2219 const auto semantics =
2220 spv::MemorySemanticsMask::WorkgroupMemory | spv::MemorySemanticsMask::AcquireRelease;
2221 OpControlBarrier(Constant(t_uint, static_cast<u32>(scope)),
2222 Constant(t_uint, static_cast<u32>(memory)),
2223 Constant(t_uint, static_cast<u32>(semantics)));
2224 return {};
2225 }
2226
2227 template <spv::Scope scope>
2228 Expression MemoryBarrier(Operation) {
2186 const auto semantics = 2229 const auto semantics =
2187 spv::MemorySemanticsMask::AcquireRelease | spv::MemorySemanticsMask::UniformMemory | 2230 spv::MemorySemanticsMask::AcquireRelease | spv::MemorySemanticsMask::UniformMemory |
2188 spv::MemorySemanticsMask::WorkgroupMemory | 2231 spv::MemorySemanticsMask::WorkgroupMemory |
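Barrier emits OpControlBarrier with Workgroup execution and memory scope plus WorkgroupMemory|AcquireRelease semantics (the GLSL barrier() contract), while MemoryBarrier becomes a template over the memory scope so the Workgroup and Device variants share a body. A small sketch of the literal operands this produces, assuming the enum values from the SPIR-V specification (Workgroup = 2, AcquireRelease = 0x8, WorkgroupMemory = 0x100):

    #include <cstdint>
    #include <cstdio>

    // Assumed SPIR-V constants; they are not defined anywhere in this diff
    enum class Scope : std::uint32_t { Device = 1, Workgroup = 2 };
    enum class Semantics : std::uint32_t {
        AcquireRelease = 0x8,
        WorkgroupMemory = 0x100,
    };

    int main() {
        const auto scope = static_cast<std::uint32_t>(Scope::Workgroup);
        const auto semantics = static_cast<std::uint32_t>(Semantics::WorkgroupMemory) |
                               static_cast<std::uint32_t>(Semantics::AcquireRelease);
        // barrier() -> OpControlBarrier %exec %mem %sem = 2, 2, 0x108
        std::printf("OpControlBarrier execution=%u memory=%u semantics=0x%x\n", scope, scope,
                    semantics);
    }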
@@ -2578,11 +2621,11 @@ private:
2578 2621
2579 &SPIRVDecompiler::ImageLoad, 2622 &SPIRVDecompiler::ImageLoad,
2580 &SPIRVDecompiler::ImageStore, 2623 &SPIRVDecompiler::ImageStore,
2581 &SPIRVDecompiler::AtomicImageAdd, 2624 &SPIRVDecompiler::AtomicImage<&Module::OpAtomicIAdd>,
2582 &SPIRVDecompiler::AtomicImageAnd, 2625 &SPIRVDecompiler::AtomicImage<&Module::OpAtomicAnd>,
2583 &SPIRVDecompiler::AtomicImageOr, 2626 &SPIRVDecompiler::AtomicImage<&Module::OpAtomicOr>,
2584 &SPIRVDecompiler::AtomicImageXor, 2627 &SPIRVDecompiler::AtomicImage<&Module::OpAtomicXor>,
2585 &SPIRVDecompiler::AtomicImageExchange, 2628 &SPIRVDecompiler::AtomicImage<&Module::OpAtomicExchange>,
2586 2629
2587 &SPIRVDecompiler::Atomic<&Module::OpAtomicExchange>, 2630 &SPIRVDecompiler::Atomic<&Module::OpAtomicExchange>,
2588 &SPIRVDecompiler::Atomic<&Module::OpAtomicIAdd>, 2631 &SPIRVDecompiler::Atomic<&Module::OpAtomicIAdd>,
@@ -2639,9 +2682,16 @@ private:
2639 &SPIRVDecompiler::Vote<&Module::OpSubgroupAllEqualKHR>, 2682 &SPIRVDecompiler::Vote<&Module::OpSubgroupAllEqualKHR>,
2640 2683
2641 &SPIRVDecompiler::ThreadId, 2684 &SPIRVDecompiler::ThreadId,
2685 &SPIRVDecompiler::ThreadMask<0>, // Eq
2686 &SPIRVDecompiler::ThreadMask<1>, // Ge
2687 &SPIRVDecompiler::ThreadMask<2>, // Gt
2688 &SPIRVDecompiler::ThreadMask<3>, // Le
2689 &SPIRVDecompiler::ThreadMask<4>, // Lt
2642 &SPIRVDecompiler::ShuffleIndexed, 2690 &SPIRVDecompiler::ShuffleIndexed,
2643 2691
2644 &SPIRVDecompiler::MemoryBarrierGL, 2692 &SPIRVDecompiler::Barrier,
2693 &SPIRVDecompiler::MemoryBarrier<spv::Scope::Workgroup>,
2694 &SPIRVDecompiler::MemoryBarrier<spv::Scope::Device>,
2645 }; 2695 };
2646 static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount)); 2696 static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount));
2647 2697
@@ -2717,8 +2767,11 @@ private:
2717 Decorate(TypeStruct(t_gmem_array), spv::Decoration::Block), 0, spv::Decoration::Offset, 0); 2767 Decorate(TypeStruct(t_gmem_array), spv::Decoration::Block), 0, spv::Decoration::Offset, 0);
2718 const Id t_gmem_ssbo = TypePointer(spv::StorageClass::StorageBuffer, t_gmem_struct); 2768 const Id t_gmem_ssbo = TypePointer(spv::StorageClass::StorageBuffer, t_gmem_struct);
2719 2769
2770 const Id t_image_uint = TypePointer(spv::StorageClass::Image, t_uint);
2771
2720 const Id v_float_zero = Constant(t_float, 0.0f); 2772 const Id v_float_zero = Constant(t_float, 0.0f);
2721 const Id v_float_one = Constant(t_float, 1.0f); 2773 const Id v_float_one = Constant(t_float, 1.0f);
2774 const Id v_uint_zero = Constant(t_uint, 0);
2722 2775
2723 // Nvidia uses these defaults for varyings (e.g. position and generic attributes) 2776 // Nvidia uses these defaults for varyings (e.g. position and generic attributes)
2724 const Id v_varying_default = 2777 const Id v_varying_default =
@@ -2743,15 +2796,16 @@ private:
2743 std::unordered_map<u8, GenericVaryingDescription> output_attributes; 2796 std::unordered_map<u8, GenericVaryingDescription> output_attributes;
2744 std::map<u32, Id> constant_buffers; 2797 std::map<u32, Id> constant_buffers;
2745 std::map<GlobalMemoryBase, Id> global_buffers; 2798 std::map<GlobalMemoryBase, Id> global_buffers;
2746 std::map<u32, TexelBuffer> texel_buffers; 2799 std::map<u32, TexelBuffer> uniform_texels;
2747 std::map<u32, SampledImage> sampled_images; 2800 std::map<u32, SampledImage> sampled_images;
2801 std::map<u32, TexelBuffer> storage_texels;
2748 std::map<u32, StorageImage> images; 2802 std::map<u32, StorageImage> images;
2749 2803
2804 std::array<Id, Maxwell::NumRenderTargets> frag_colors{};
2750 Id instance_index{}; 2805 Id instance_index{};
2751 Id vertex_index{}; 2806 Id vertex_index{};
2752 Id base_instance{}; 2807 Id base_instance{};
2753 Id base_vertex{}; 2808 Id base_vertex{};
2754 std::array<Id, Maxwell::NumRenderTargets> frag_colors{};
2755 Id frag_depth{}; 2809 Id frag_depth{};
2756 Id frag_coord{}; 2810 Id frag_coord{};
2757 Id front_facing{}; 2811 Id front_facing{};
@@ -2763,6 +2817,7 @@ private:
2763 Id workgroup_id{}; 2817 Id workgroup_id{};
2764 Id local_invocation_id{}; 2818 Id local_invocation_id{};
2765 Id thread_id{}; 2819 Id thread_id{};
2820 std::array<Id, 5> thread_masks{}; // eq, ge, gt, le, lt
2766 2821
2767 VertexIndices in_indices; 2822 VertexIndices in_indices;
2768 VertexIndices out_indices; 2823 VertexIndices out_indices;
@@ -3006,13 +3061,17 @@ ShaderEntries GenerateShaderEntries(const VideoCommon::Shader::ShaderIR& ir) {
3006 } 3061 }
3007 for (const auto& sampler : ir.GetSamplers()) { 3062 for (const auto& sampler : ir.GetSamplers()) {
3008 if (sampler.is_buffer) { 3063 if (sampler.is_buffer) {
3009 entries.texel_buffers.emplace_back(sampler); 3064 entries.uniform_texels.emplace_back(sampler);
3010 } else { 3065 } else {
3011 entries.samplers.emplace_back(sampler); 3066 entries.samplers.emplace_back(sampler);
3012 } 3067 }
3013 } 3068 }
3014 for (const auto& image : ir.GetImages()) { 3069 for (const auto& image : ir.GetImages()) {
3015 entries.images.emplace_back(image); 3070 if (image.type == Tegra::Shader::ImageType::TextureBuffer) {
3071 entries.storage_texels.emplace_back(image);
3072 } else {
3073 entries.images.emplace_back(image);
3074 }
3016 } 3075 }
3017 for (const auto& attribute : ir.GetInputAttributes()) { 3076 for (const auto& attribute : ir.GetInputAttributes()) {
3018 if (IsGenericAttribute(attribute)) { 3077 if (IsGenericAttribute(attribute)) {
diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.h b/src/video_core/renderer_vulkan/vk_shader_decompiler.h
index f4c05ac3c..2b0e90396 100644
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.h
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.h
@@ -21,8 +21,9 @@ class VKDevice;
21namespace Vulkan { 21namespace Vulkan {
22 22
23using Maxwell = Tegra::Engines::Maxwell3D::Regs; 23using Maxwell = Tegra::Engines::Maxwell3D::Regs;
24using TexelBufferEntry = VideoCommon::Shader::Sampler; 24using UniformTexelEntry = VideoCommon::Shader::Sampler;
25using SamplerEntry = VideoCommon::Shader::Sampler; 25using SamplerEntry = VideoCommon::Shader::Sampler;
26using StorageTexelEntry = VideoCommon::Shader::Image;
26using ImageEntry = VideoCommon::Shader::Image; 27using ImageEntry = VideoCommon::Shader::Image;
27 28
28constexpr u32 DESCRIPTOR_SET = 0; 29constexpr u32 DESCRIPTOR_SET = 0;
@@ -66,13 +67,15 @@ private:
66struct ShaderEntries { 67struct ShaderEntries {
67 u32 NumBindings() const { 68 u32 NumBindings() const {
68 return static_cast<u32>(const_buffers.size() + global_buffers.size() + 69 return static_cast<u32>(const_buffers.size() + global_buffers.size() +
69 texel_buffers.size() + samplers.size() + images.size()); 70 uniform_texels.size() + samplers.size() + storage_texels.size() +
71 images.size());
70 } 72 }
71 73
72 std::vector<ConstBufferEntry> const_buffers; 74 std::vector<ConstBufferEntry> const_buffers;
73 std::vector<GlobalBufferEntry> global_buffers; 75 std::vector<GlobalBufferEntry> global_buffers;
74 std::vector<TexelBufferEntry> texel_buffers; 76 std::vector<UniformTexelEntry> uniform_texels;
75 std::vector<SamplerEntry> samplers; 77 std::vector<SamplerEntry> samplers;
78 std::vector<StorageTexelEntry> storage_texels;
76 std::vector<ImageEntry> images; 79 std::vector<ImageEntry> images;
77 std::set<u32> attributes; 80 std::set<u32> attributes;
78 std::array<bool, Maxwell::NumClipDistances> clip_distances{}; 81 std::array<bool, Maxwell::NumClipDistances> clip_distances{};
@@ -88,7 +91,8 @@ struct Specialization final {
88 u32 shared_memory_size{}; 91 u32 shared_memory_size{};
89 92
90 // Graphics specific 93 // Graphics specific
91 std::optional<float> point_size{}; 94 std::optional<float> point_size;
95 std::bitset<Maxwell::NumVertexAttributes> enabled_attributes;
92 std::array<Maxwell::VertexAttribute::Type, Maxwell::NumVertexAttributes> attribute_types{}; 96 std::array<Maxwell::VertexAttribute::Type, Maxwell::NumVertexAttributes> attribute_types{};
93 bool ndc_minus_one_to_one{}; 97 bool ndc_minus_one_to_one{};
94}; 98};
diff --git a/src/video_core/renderer_vulkan/vk_stream_buffer.h b/src/video_core/renderer_vulkan/vk_stream_buffer.h
index dfddf7ad6..c765c60a0 100644
--- a/src/video_core/renderer_vulkan/vk_stream_buffer.h
+++ b/src/video_core/renderer_vulkan/vk_stream_buffer.h
@@ -35,7 +35,7 @@ public:
35 /// Ensures that "size" bytes of memory are available to the GPU, potentially recording a copy. 35 /// Ensures that "size" bytes of memory are available to the GPU, potentially recording a copy.
36 void Unmap(u64 size); 36 void Unmap(u64 size);
37 37
38 VkBuffer GetHandle() const { 38 VkBuffer Handle() const {
39 return *buffer; 39 return *buffer;
40 } 40 }
41 41
diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
index 55f43e61b..430031665 100644
--- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
@@ -100,8 +100,8 @@ vk::Buffer CreateBuffer(const VKDevice& device, const SurfaceParams& params,
100 ci.pNext = nullptr; 100 ci.pNext = nullptr;
101 ci.flags = 0; 101 ci.flags = 0;
102 ci.size = static_cast<VkDeviceSize>(host_memory_size); 102 ci.size = static_cast<VkDeviceSize>(host_memory_size);
103 ci.usage = VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | 103 ci.usage = VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT |
104 VK_BUFFER_USAGE_TRANSFER_DST_BIT; 104 VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT;
105 ci.sharingMode = VK_SHARING_MODE_EXCLUSIVE; 105 ci.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
106 ci.queueFamilyIndexCount = 0; 106 ci.queueFamilyIndexCount = 0;
107 ci.pQueueFamilyIndices = nullptr; 107 ci.pQueueFamilyIndices = nullptr;
@@ -167,6 +167,7 @@ VkImageCreateInfo GenerateImageCreateInfo(const VKDevice& device, const SurfaceP
167 ci.extent = {params.width, params.height, 1}; 167 ci.extent = {params.width, params.height, 1};
168 break; 168 break;
169 case SurfaceTarget::Texture3D: 169 case SurfaceTarget::Texture3D:
170 ci.flags |= VK_IMAGE_CREATE_2D_ARRAY_COMPATIBLE_BIT;
170 ci.extent = {params.width, params.height, params.depth}; 171 ci.extent = {params.width, params.height, params.depth};
171 break; 172 break;
172 case SurfaceTarget::TextureBuffer: 173 case SurfaceTarget::TextureBuffer:
@@ -176,6 +177,12 @@ VkImageCreateInfo GenerateImageCreateInfo(const VKDevice& device, const SurfaceP
176 return ci; 177 return ci;
177} 178}
178 179
180u32 EncodeSwizzle(Tegra::Texture::SwizzleSource x_source, Tegra::Texture::SwizzleSource y_source,
181 Tegra::Texture::SwizzleSource z_source, Tegra::Texture::SwizzleSource w_source) {
182 return (static_cast<u32>(x_source) << 24) | (static_cast<u32>(y_source) << 16) |
183 (static_cast<u32>(z_source) << 8) | static_cast<u32>(w_source);
184}
185
179} // Anonymous namespace 186} // Anonymous namespace
180 187
181CachedSurface::CachedSurface(Core::System& system, const VKDevice& device, 188CachedSurface::CachedSurface(Core::System& system, const VKDevice& device,
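EncodeSwizzle, now shared through the anonymous namespace, packs the four swizzle sources into a single u32 so the view cache can key on one integer compare instead of four enum compares. A worked standalone example (the enum values here are illustrative, not necessarily Tegra's exact encoding):

    #include <cstdint>
    #include <cstdio>

    enum class SwizzleSource : std::uint32_t { Zero = 0, R = 2, G = 3, B = 4, A = 5 };

    std::uint32_t EncodeSwizzle(SwizzleSource x, SwizzleSource y, SwizzleSource z,
                                SwizzleSource w) {
        // x lands in the top byte, w in the bottom byte
        return (static_cast<std::uint32_t>(x) << 24) | (static_cast<std::uint32_t>(y) << 16) |
               (static_cast<std::uint32_t>(z) << 8) | static_cast<std::uint32_t>(w);
    }

    int main() {
        std::printf("0x%08x\n", EncodeSwizzle(SwizzleSource::R, SwizzleSource::G,
                                              SwizzleSource::B, SwizzleSource::A)); // 0x02030405
    }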
@@ -203,9 +210,11 @@ CachedSurface::CachedSurface(Core::System& system, const VKDevice& device,
203 } 210 }
204 211
205 // TODO(Rodrigo): Move this to a virtual function. 212 // TODO(Rodrigo): Move this to a virtual function.
206 main_view = CreateViewInner( 213 u32 num_layers = 1;
207 ViewParams(params.target, 0, static_cast<u32>(params.GetNumLayers()), 0, params.num_levels), 214 if (params.is_layered || params.target == SurfaceTarget::Texture3D) {
208 true); 215 num_layers = params.depth;
216 }
217 main_view = CreateView(ViewParams(params.target, 0, num_layers, 0, params.num_levels));
209} 218}
210 219
211CachedSurface::~CachedSurface() = default; 220CachedSurface::~CachedSurface() = default;
@@ -253,12 +262,8 @@ void CachedSurface::DecorateSurfaceName() {
253} 262}
254 263
255View CachedSurface::CreateView(const ViewParams& params) { 264View CachedSurface::CreateView(const ViewParams& params) {
256 return CreateViewInner(params, false);
257}
258
259View CachedSurface::CreateViewInner(const ViewParams& params, bool is_proxy) {
260 // TODO(Rodrigo): Add name decorations 265 // TODO(Rodrigo): Add name decorations
261 return views[params] = std::make_shared<CachedSurfaceView>(device, *this, params, is_proxy); 266 return views[params] = std::make_shared<CachedSurfaceView>(device, *this, params);
262} 267}
263 268
264void CachedSurface::UploadBuffer(const std::vector<u8>& staging_buffer) { 269void CachedSurface::UploadBuffer(const std::vector<u8>& staging_buffer) {
@@ -342,38 +347,44 @@ VkImageSubresourceRange CachedSurface::GetImageSubresourceRange() const {
342} 347}
343 348
344CachedSurfaceView::CachedSurfaceView(const VKDevice& device, CachedSurface& surface, 349CachedSurfaceView::CachedSurfaceView(const VKDevice& device, CachedSurface& surface,
345 const ViewParams& params, bool is_proxy) 350 const ViewParams& params)
346 : VideoCommon::ViewBase{params}, params{surface.GetSurfaceParams()}, 351 : VideoCommon::ViewBase{params}, params{surface.GetSurfaceParams()},
347 image{surface.GetImageHandle()}, buffer_view{surface.GetBufferViewHandle()}, 352 image{surface.GetImageHandle()}, buffer_view{surface.GetBufferViewHandle()},
348 aspect_mask{surface.GetAspectMask()}, device{device}, surface{surface}, 353 aspect_mask{surface.GetAspectMask()}, device{device}, surface{surface},
349 base_layer{params.base_layer}, num_layers{params.num_layers}, base_level{params.base_level}, 354 base_level{params.base_level}, num_levels{params.num_levels},
350 num_levels{params.num_levels}, image_view_type{image ? GetImageViewType(params.target) 355 image_view_type{image ? GetImageViewType(params.target) : VK_IMAGE_VIEW_TYPE_1D} {
351 : VK_IMAGE_VIEW_TYPE_1D} {} 356 if (image_view_type == VK_IMAGE_VIEW_TYPE_3D) {
357 base_layer = 0;
358 num_layers = 1;
359 base_slice = params.base_layer;
360 num_slices = params.num_layers;
361 } else {
362 base_layer = params.base_layer;
363 num_layers = params.num_layers;
364 }
365}
352 366
353CachedSurfaceView::~CachedSurfaceView() = default; 367CachedSurfaceView::~CachedSurfaceView() = default;
354 368
355VkImageView CachedSurfaceView::GetHandle(SwizzleSource x_source, SwizzleSource y_source, 369VkImageView CachedSurfaceView::GetImageView(SwizzleSource x_source, SwizzleSource y_source,
356 SwizzleSource z_source, SwizzleSource w_source) { 370 SwizzleSource z_source, SwizzleSource w_source) {
357 const u32 swizzle = EncodeSwizzle(x_source, y_source, z_source, w_source); 371 const u32 new_swizzle = EncodeSwizzle(x_source, y_source, z_source, w_source);
358 if (last_image_view && last_swizzle == swizzle) { 372 if (last_image_view && last_swizzle == new_swizzle) {
359 return last_image_view; 373 return last_image_view;
360 } 374 }
361 last_swizzle = swizzle; 375 last_swizzle = new_swizzle;
362 376
363 const auto [entry, is_cache_miss] = view_cache.try_emplace(swizzle); 377 const auto [entry, is_cache_miss] = view_cache.try_emplace(new_swizzle);
364 auto& image_view = entry->second; 378 auto& image_view = entry->second;
365 if (!is_cache_miss) { 379 if (!is_cache_miss) {
366 return last_image_view = *image_view; 380 return last_image_view = *image_view;
367 } 381 }
368 382
369 auto swizzle_x = MaxwellToVK::SwizzleSource(x_source); 383 std::array swizzle{MaxwellToVK::SwizzleSource(x_source), MaxwellToVK::SwizzleSource(y_source),
370 auto swizzle_y = MaxwellToVK::SwizzleSource(y_source); 384 MaxwellToVK::SwizzleSource(z_source), MaxwellToVK::SwizzleSource(w_source)};
371 auto swizzle_z = MaxwellToVK::SwizzleSource(z_source);
372 auto swizzle_w = MaxwellToVK::SwizzleSource(w_source);
373
374 if (params.pixel_format == VideoCore::Surface::PixelFormat::A1B5G5R5U) { 385 if (params.pixel_format == VideoCore::Surface::PixelFormat::A1B5G5R5U) {
375 // A1B5G5R5 is implemented as A1R5G5B5, we have to change the swizzle here. 386 // A1B5G5R5 is implemented as A1R5G5B5, we have to change the swizzle here.
376 std::swap(swizzle_x, swizzle_z); 387 std::swap(swizzle[0], swizzle[2]);
377 } 388 }
378 389
379 // Games can sample depth or stencil values on textures. This is decided by the swizzle value on 390 // Games can sample depth or stencil values on textures. This is decided by the swizzle value on
@@ -395,11 +406,16 @@ VkImageView CachedSurfaceView::GetHandle(SwizzleSource x_source, SwizzleSource y
395 UNIMPLEMENTED(); 406 UNIMPLEMENTED();
396 } 407 }
397 408
398 // Vulkan doesn't seem to understand swizzling of a depth stencil image, use identity 409 // Make sure we sample the first component
399 swizzle_x = VK_COMPONENT_SWIZZLE_R; 410 std::transform(
400 swizzle_y = VK_COMPONENT_SWIZZLE_G; 411 swizzle.begin(), swizzle.end(), swizzle.begin(), [](VkComponentSwizzle component) {
401 swizzle_z = VK_COMPONENT_SWIZZLE_B; 412 return component == VK_COMPONENT_SWIZZLE_G ? VK_COMPONENT_SWIZZLE_R : component;
402 swizzle_w = VK_COMPONENT_SWIZZLE_A; 413 });
414 }
415
416 if (image_view_type == VK_IMAGE_VIEW_TYPE_3D) {
417 ASSERT(base_slice == 0);
418 ASSERT(num_slices == params.depth);
403 } 419 }
404 420
405 VkImageViewCreateInfo ci; 421 VkImageViewCreateInfo ci;
@@ -409,7 +425,7 @@ VkImageView CachedSurfaceView::GetHandle(SwizzleSource x_source, SwizzleSource y
409 ci.image = surface.GetImageHandle(); 425 ci.image = surface.GetImageHandle();
410 ci.viewType = image_view_type; 426 ci.viewType = image_view_type;
411 ci.format = surface.GetImage().GetFormat(); 427 ci.format = surface.GetImage().GetFormat();
412 ci.components = {swizzle_x, swizzle_y, swizzle_z, swizzle_w}; 428 ci.components = {swizzle[0], swizzle[1], swizzle[2], swizzle[3]};
413 ci.subresourceRange.aspectMask = aspect; 429 ci.subresourceRange.aspectMask = aspect;
414 ci.subresourceRange.baseMipLevel = base_level; 430 ci.subresourceRange.baseMipLevel = base_level;
415 ci.subresourceRange.levelCount = num_levels; 431 ci.subresourceRange.levelCount = num_levels;
@@ -420,6 +436,35 @@ VkImageView CachedSurfaceView::GetHandle(SwizzleSource x_source, SwizzleSource y
420 return last_image_view = *image_view; 436 return last_image_view = *image_view;
421} 437}
422 438
439VkImageView CachedSurfaceView::GetAttachment() {
440 if (render_target) {
441 return *render_target;
442 }
443
444 VkImageViewCreateInfo ci;
445 ci.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO;
446 ci.pNext = nullptr;
447 ci.flags = 0;
448 ci.image = surface.GetImageHandle();
449 ci.format = surface.GetImage().GetFormat();
450 ci.components = {VK_COMPONENT_SWIZZLE_IDENTITY, VK_COMPONENT_SWIZZLE_IDENTITY,
451 VK_COMPONENT_SWIZZLE_IDENTITY, VK_COMPONENT_SWIZZLE_IDENTITY};
452 ci.subresourceRange.aspectMask = aspect_mask;
453 ci.subresourceRange.baseMipLevel = base_level;
454 ci.subresourceRange.levelCount = num_levels;
455 if (image_view_type == VK_IMAGE_VIEW_TYPE_3D) {
456 ci.viewType = num_slices > 1 ? VK_IMAGE_VIEW_TYPE_2D_ARRAY : VK_IMAGE_VIEW_TYPE_2D;
457 ci.subresourceRange.baseArrayLayer = base_slice;
458 ci.subresourceRange.layerCount = num_slices;
459 } else {
460 ci.viewType = image_view_type;
461 ci.subresourceRange.baseArrayLayer = base_layer;
462 ci.subresourceRange.layerCount = num_layers;
463 }
464 render_target = device.GetLogical().CreateImageView(ci);
465 return *render_target;
466}
467
423VKTextureCache::VKTextureCache(Core::System& system, VideoCore::RasterizerInterface& rasterizer, 468VKTextureCache::VKTextureCache(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
424 const VKDevice& device, VKResourceManager& resource_manager, 469 const VKDevice& device, VKResourceManager& resource_manager,
425 VKMemoryManager& memory_manager, VKScheduler& scheduler, 470 VKMemoryManager& memory_manager, VKScheduler& scheduler,
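The thread running through this file: VK_IMAGE_CREATE_2D_ARRAY_COMPATIBLE_BIT lets a 3D image be viewed as a 2D array, so for VK_IMAGE_VIEW_TYPE_3D the view constructor reinterprets the incoming layer range as a depth-slice range, and GetAttachment later builds a 2D or 2D-array view from those slices. A standalone restatement of the remapping:

    #include <cstdio>

    struct ViewRange {
        unsigned base_layer = 0, num_layers = 0;
        unsigned base_slice = 0, num_slices = 0;
    };

    ViewRange Remap(bool is_3d, unsigned base, unsigned count) {
        ViewRange range;
        if (is_3d) {
            range.num_layers = 1; // a 3D image has exactly one array layer
            range.base_slice = base;
            range.num_slices = count; // the "layers" were really depth slices
        } else {
            range.base_layer = base;
            range.num_layers = count;
        }
        return range;
    }

    int main() {
        const ViewRange range = Remap(true, 0, 32);
        std::printf("layers=%u slices=%u\n", range.num_layers, range.num_slices); // layers=1 slices=32
    }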
diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h
index f211ccb1e..807e26c8a 100644
--- a/src/video_core/renderer_vulkan/vk_texture_cache.h
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.h
@@ -91,7 +91,6 @@ protected:
91 void DecorateSurfaceName(); 91 void DecorateSurfaceName();
92 92
93 View CreateView(const ViewParams& params) override; 93 View CreateView(const ViewParams& params) override;
94 View CreateViewInner(const ViewParams& params, bool is_proxy);
95 94
96private: 95private:
97 void UploadBuffer(const std::vector<u8>& staging_buffer); 96 void UploadBuffer(const std::vector<u8>& staging_buffer);
@@ -120,23 +119,20 @@ private:
120class CachedSurfaceView final : public VideoCommon::ViewBase { 119class CachedSurfaceView final : public VideoCommon::ViewBase {
121public: 120public:
122 explicit CachedSurfaceView(const VKDevice& device, CachedSurface& surface, 121 explicit CachedSurfaceView(const VKDevice& device, CachedSurface& surface,
123 const ViewParams& params, bool is_proxy); 122 const ViewParams& params);
124 ~CachedSurfaceView(); 123 ~CachedSurfaceView();
125 124
126 VkImageView GetHandle(Tegra::Texture::SwizzleSource x_source, 125 VkImageView GetImageView(Tegra::Texture::SwizzleSource x_source,
127 Tegra::Texture::SwizzleSource y_source, 126 Tegra::Texture::SwizzleSource y_source,
128 Tegra::Texture::SwizzleSource z_source, 127 Tegra::Texture::SwizzleSource z_source,
129 Tegra::Texture::SwizzleSource w_source); 128 Tegra::Texture::SwizzleSource w_source);
129
130 VkImageView GetAttachment();
130 131
131 bool IsSameSurface(const CachedSurfaceView& rhs) const { 132 bool IsSameSurface(const CachedSurfaceView& rhs) const {
132 return &surface == &rhs.surface; 133 return &surface == &rhs.surface;
133 } 134 }
134 135
135 VkImageView GetHandle() {
136 return GetHandle(Tegra::Texture::SwizzleSource::R, Tegra::Texture::SwizzleSource::G,
137 Tegra::Texture::SwizzleSource::B, Tegra::Texture::SwizzleSource::A);
138 }
139
140 u32 GetWidth() const { 136 u32 GetWidth() const {
141 return params.GetMipWidth(base_level); 137 return params.GetMipWidth(base_level);
142 } 138 }
@@ -180,14 +176,6 @@ public:
180 } 176 }
181 177
182private: 178private:
183 static u32 EncodeSwizzle(Tegra::Texture::SwizzleSource x_source,
184 Tegra::Texture::SwizzleSource y_source,
185 Tegra::Texture::SwizzleSource z_source,
186 Tegra::Texture::SwizzleSource w_source) {
187 return (static_cast<u32>(x_source) << 24) | (static_cast<u32>(y_source) << 16) |
188 (static_cast<u32>(z_source) << 8) | static_cast<u32>(w_source);
189 }
190
191 // Store a copy of these values to avoid double dereference when reading them 179 // Store a copy of these values to avoid double dereference when reading them
192 const SurfaceParams params; 180 const SurfaceParams params;
193 const VkImage image; 181 const VkImage image;
@@ -196,15 +184,18 @@ private:
196 184
197 const VKDevice& device; 185 const VKDevice& device;
198 CachedSurface& surface; 186 CachedSurface& surface;
199 const u32 base_layer;
200 const u32 num_layers;
201 const u32 base_level; 187 const u32 base_level;
202 const u32 num_levels; 188 const u32 num_levels;
203 const VkImageViewType image_view_type; 189 const VkImageViewType image_view_type;
190 u32 base_layer = 0;
191 u32 num_layers = 0;
192 u32 base_slice = 0;
193 u32 num_slices = 0;
204 194
205 VkImageView last_image_view = nullptr; 195 VkImageView last_image_view = nullptr;
206 u32 last_swizzle = 0; 196 u32 last_swizzle = 0;
207 197
198 vk::ImageView render_target;
208 std::unordered_map<u32, vk::ImageView> view_cache; 199 std::unordered_map<u32, vk::ImageView> view_cache;
209}; 200};
210 201
diff --git a/src/video_core/shader/decode/memory.cpp b/src/video_core/shader/decode/memory.cpp
index 9392f065b..63adbc4a3 100644
--- a/src/video_core/shader/decode/memory.cpp
+++ b/src/video_core/shader/decode/memory.cpp
@@ -387,7 +387,6 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
387 } 387 }
388 case OpCode::Id::RED: { 388 case OpCode::Id::RED: {
389 UNIMPLEMENTED_IF_MSG(instr.red.type != GlobalAtomicType::U32); 389 UNIMPLEMENTED_IF_MSG(instr.red.type != GlobalAtomicType::U32);
390 UNIMPLEMENTED_IF_MSG(instr.red.operation != AtomicOp::Add);
391 const auto [real_address, base_address, descriptor] = 390 const auto [real_address, base_address, descriptor] =
392 TrackGlobalMemory(bb, instr, true, true); 391 TrackGlobalMemory(bb, instr, true, true);
393 if (!real_address || !base_address) { 392 if (!real_address || !base_address) {
@@ -396,7 +395,7 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
396 } 395 }
397 Node gmem = MakeNode<GmemNode>(real_address, base_address, descriptor); 396 Node gmem = MakeNode<GmemNode>(real_address, base_address, descriptor);
398 Node value = GetRegister(instr.gpr0); 397 Node value = GetRegister(instr.gpr0);
399 bb.push_back(Operation(OperationCode::ReduceIAdd, move(gmem), move(value))); 398 bb.push_back(Operation(GetAtomOperation(instr.red.operation), move(gmem), move(value)));
400 break; 399 break;
401 } 400 }
402 case OpCode::Id::ATOM: { 401 case OpCode::Id::ATOM: {
diff --git a/src/video_core/shader/decode/other.cpp b/src/video_core/shader/decode/other.cpp
index d4f95b18c..c0a8f233f 100644
--- a/src/video_core/shader/decode/other.cpp
+++ b/src/video_core/shader/decode/other.cpp
@@ -83,7 +83,7 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
83 return Operation(OperationCode::YNegate); 83 return Operation(OperationCode::YNegate);
84 case SystemVariable::InvocationInfo: 84 case SystemVariable::InvocationInfo:
85 LOG_WARNING(HW_GPU, "S2R instruction with InvocationInfo is incomplete"); 85 LOG_WARNING(HW_GPU, "S2R instruction with InvocationInfo is incomplete");
86 return Immediate(0U); 86 return Immediate(0x00ff'0000U);
87 case SystemVariable::WscaleFactorXY: 87 case SystemVariable::WscaleFactorXY:
88 UNIMPLEMENTED_MSG("S2R WscaleFactorXY is not implemented"); 88 UNIMPLEMENTED_MSG("S2R WscaleFactorXY is not implemented");
89 return Immediate(0U); 89 return Immediate(0U);
@@ -109,6 +109,27 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
109 return Operation(OperationCode::WorkGroupIdY); 109 return Operation(OperationCode::WorkGroupIdY);
110 case SystemVariable::CtaIdZ: 110 case SystemVariable::CtaIdZ:
111 return Operation(OperationCode::WorkGroupIdZ); 111 return Operation(OperationCode::WorkGroupIdZ);
112 case SystemVariable::EqMask:
113 case SystemVariable::LtMask:
114 case SystemVariable::LeMask:
115 case SystemVariable::GtMask:
116 case SystemVariable::GeMask:
117 uses_warps = true;
118 switch (instr.sys20) {
119 case SystemVariable::EqMask:
120 return Operation(OperationCode::ThreadEqMask);
121 case SystemVariable::LtMask:
122 return Operation(OperationCode::ThreadLtMask);
123 case SystemVariable::LeMask:
124 return Operation(OperationCode::ThreadLeMask);
125 case SystemVariable::GtMask:
126 return Operation(OperationCode::ThreadGtMask);
127 case SystemVariable::GeMask:
128 return Operation(OperationCode::ThreadGeMask);
129 default:
130 UNREACHABLE();
131 return Immediate(0u);
132 }
112 default: 133 default:
113 UNIMPLEMENTED_MSG("Unhandled system move: {}", 134 UNIMPLEMENTED_MSG("Unhandled system move: {}",
114 static_cast<u32>(instr.sys20.Value())); 135 static_cast<u32>(instr.sys20.Value()));
@@ -272,10 +293,25 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
272 SetRegister(bb, instr.gpr0, GetRegister(instr.gpr8)); 293 SetRegister(bb, instr.gpr0, GetRegister(instr.gpr8));
273 break; 294 break;
274 } 295 }
296 case OpCode::Id::BAR: {
297 UNIMPLEMENTED_IF_MSG(instr.value != 0xF0A81B8000070000ULL, "BAR is not BAR.SYNC 0x0");
298 bb.push_back(Operation(OperationCode::Barrier));
299 break;
300 }
275 case OpCode::Id::MEMBAR: { 301 case OpCode::Id::MEMBAR: {
276 UNIMPLEMENTED_IF(instr.membar.type != Tegra::Shader::MembarType::GL);
277 UNIMPLEMENTED_IF(instr.membar.unknown != Tegra::Shader::MembarUnknown::Default); 302 UNIMPLEMENTED_IF(instr.membar.unknown != Tegra::Shader::MembarUnknown::Default);
278 bb.push_back(Operation(OperationCode::MemoryBarrierGL)); 303 const OperationCode type = [instr] {
304 switch (instr.membar.type) {
305 case Tegra::Shader::MembarType::CTA:
306 return OperationCode::MemoryBarrierGroup;
307 case Tegra::Shader::MembarType::GL:
308 return OperationCode::MemoryBarrierGlobal;
309 default:
310 UNIMPLEMENTED_MSG("MEMBAR type={}", static_cast<int>(instr.membar.type.Value()));
311 return OperationCode::MemoryBarrierGlobal;
312 }
313 }();
314 bb.push_back(Operation(type));
279 break; 315 break;
280 } 316 }
281 case OpCode::Id::DEPBAR: { 317 case OpCode::Id::DEPBAR: {
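The MEMBAR arm now distinguishes CTA (workgroup) barriers from GL (global) ones and initializes the operation code through an immediately invoked lambda, which keeps the result const and confines the switch. A minimal sketch of the idiom (enum members and the fallback choice are illustrative):

    #include <cstdio>

    enum class MembarType { CTA, GL, SYS };
    enum class OperationCode { MemoryBarrierGroup, MemoryBarrierGlobal };

    OperationCode Translate(MembarType membar) {
        const OperationCode type = [membar] {
            switch (membar) {
            case MembarType::CTA:
                return OperationCode::MemoryBarrierGroup;
            case MembarType::GL:
                return OperationCode::MemoryBarrierGlobal;
            default: // unimplemented types fall back to the strongest barrier
                return OperationCode::MemoryBarrierGlobal;
            }
        }();
        return type;
    }

    int main() {
        std::printf("%d\n", static_cast<int>(Translate(MembarType::CTA))); // 0
    }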
diff --git a/src/video_core/shader/decode/texture.cpp b/src/video_core/shader/decode/texture.cpp
index 8f0bb996e..29ebf65ba 100644
--- a/src/video_core/shader/decode/texture.cpp
+++ b/src/video_core/shader/decode/texture.cpp
@@ -357,13 +357,11 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {
357 return pc; 357 return pc;
358} 358}
359 359
360ShaderIR::SamplerInfo ShaderIR::GetSamplerInfo(SamplerInfo info, u32 offset, 360ShaderIR::SamplerInfo ShaderIR::GetSamplerInfo(
361 std::optional<u32> buffer) { 361 SamplerInfo info, std::optional<Tegra::Engines::SamplerDescriptor> sampler) {
362 if (info.IsComplete()) { 362 if (info.IsComplete()) {
363 return info; 363 return info;
364 } 364 }
365 const auto sampler = buffer ? registry.ObtainBindlessSampler(*buffer, offset)
366 : registry.ObtainBoundSampler(offset);
367 if (!sampler) { 365 if (!sampler) {
368 LOG_WARNING(HW_GPU, "Unknown sampler info"); 366 LOG_WARNING(HW_GPU, "Unknown sampler info");
369 info.type = info.type.value_or(Tegra::Shader::TextureType::Texture2D); 367 info.type = info.type.value_or(Tegra::Shader::TextureType::Texture2D);
@@ -381,8 +379,8 @@ ShaderIR::SamplerInfo ShaderIR::GetSamplerInfo(SamplerInfo info, u32 offset,
381 379
382std::optional<Sampler> ShaderIR::GetSampler(Tegra::Shader::Sampler sampler, 380std::optional<Sampler> ShaderIR::GetSampler(Tegra::Shader::Sampler sampler,
383 SamplerInfo sampler_info) { 381 SamplerInfo sampler_info) {
384 const auto offset = static_cast<u32>(sampler.index.Value()); 382 const u32 offset = static_cast<u32>(sampler.index.Value());
385 const auto info = GetSamplerInfo(sampler_info, offset); 383 const auto info = GetSamplerInfo(sampler_info, registry.ObtainBoundSampler(offset));
386 384
387 // If this sampler has already been used, return the existing mapping. 385 // If this sampler has already been used, return the existing mapping.
388 const auto it = std::find_if(used_samplers.begin(), used_samplers.end(), 386 const auto it = std::find_if(used_samplers.begin(), used_samplers.end(),
@@ -404,20 +402,19 @@ std::optional<Sampler> ShaderIR::GetBindlessSampler(Tegra::Shader::Register reg,
404 const Node sampler_register = GetRegister(reg); 402 const Node sampler_register = GetRegister(reg);
405 const auto [base_node, tracked_sampler_info] = 403 const auto [base_node, tracked_sampler_info] =
406 TrackBindlessSampler(sampler_register, global_code, static_cast<s64>(global_code.size())); 404 TrackBindlessSampler(sampler_register, global_code, static_cast<s64>(global_code.size()));
407 ASSERT(base_node != nullptr); 405 if (!base_node) {
408 if (base_node == nullptr) { 406 UNREACHABLE();
409 return std::nullopt; 407 return std::nullopt;
410 } 408 }
411 409
412 if (const auto bindless_sampler_info = 410 if (const auto sampler_info = std::get_if<BindlessSamplerNode>(&*tracked_sampler_info)) {
413 std::get_if<BindlessSamplerNode>(&*tracked_sampler_info)) { 411 const u32 buffer = sampler_info->index;
414 const u32 buffer = bindless_sampler_info->GetIndex(); 412 const u32 offset = sampler_info->offset;
415 const u32 offset = bindless_sampler_info->GetOffset(); 413 info = GetSamplerInfo(info, registry.ObtainBindlessSampler(buffer, offset));
416 info = GetSamplerInfo(info, offset, buffer);
417 414
418 // If this sampler has already been used, return the existing mapping. 415 // If this sampler has already been used, return the existing mapping.
419 const auto it = std::find_if(used_samplers.begin(), used_samplers.end(), 416 const auto it = std::find_if(used_samplers.begin(), used_samplers.end(),
420 [buffer = buffer, offset = offset](const Sampler& entry) { 417 [buffer, offset](const Sampler& entry) {
421 return entry.buffer == buffer && entry.offset == offset; 418 return entry.buffer == buffer && entry.offset == offset;
422 }); 419 });
423 if (it != used_samplers.end()) { 420 if (it != used_samplers.end()) {
@@ -431,10 +428,32 @@ std::optional<Sampler> ShaderIR::GetBindlessSampler(Tegra::Shader::Register reg,
431 return used_samplers.emplace_back(next_index, offset, buffer, *info.type, *info.is_array, 428 return used_samplers.emplace_back(next_index, offset, buffer, *info.type, *info.is_array,
432 *info.is_shadow, *info.is_buffer, false); 429 *info.is_shadow, *info.is_buffer, false);
433 } 430 }
434 if (const auto array_sampler_info = std::get_if<ArraySamplerNode>(&*tracked_sampler_info)) { 431 if (const auto sampler_info = std::get_if<SeparateSamplerNode>(&*tracked_sampler_info)) {
435 const u32 base_offset = array_sampler_info->GetBaseOffset() / 4; 432 const std::pair indices = sampler_info->indices;
436 index_var = GetCustomVariable(array_sampler_info->GetIndexVar()); 433 const std::pair offsets = sampler_info->offsets;
437 info = GetSamplerInfo(info, base_offset); 434 info = GetSamplerInfo(info, registry.ObtainSeparateSampler(indices, offsets));
435
436 // Try to use an already created sampler if it exists
437 const auto it = std::find_if(
438 used_samplers.begin(), used_samplers.end(), [indices, offsets](const Sampler& entry) {
439 return offsets == std::pair{entry.offset, entry.secondary_offset} &&
440 indices == std::pair{entry.buffer, entry.secondary_buffer};
441 });
442 if (it != used_samplers.end()) {
443 ASSERT(it->is_separated && it->type == info.type && it->is_array == info.is_array &&
444 it->is_shadow == info.is_shadow && it->is_buffer == info.is_buffer);
445 return *it;
446 }
447
448 // Otherwise create a new mapping for this sampler
449 const u32 next_index = static_cast<u32>(used_samplers.size());
450 return used_samplers.emplace_back(next_index, offsets, indices, *info.type, *info.is_array,
451 *info.is_shadow, *info.is_buffer);
452 }
453 if (const auto sampler_info = std::get_if<ArraySamplerNode>(&*tracked_sampler_info)) {
454 const u32 base_offset = sampler_info->base_offset / 4;
455 index_var = GetCustomVariable(sampler_info->bindless_var);
456 info = GetSamplerInfo(info, registry.ObtainBoundSampler(base_offset));
438 457
439 // If this sampler has already been used, return the existing mapping. 458 // If this sampler has already been used, return the existing mapping.
440 const auto it = std::find_if( 459 const auto it = std::find_if(
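GetSamplerInfo no longer performs the registry lookup itself: every call site obtains the descriptor it wants (bound, bindless, or now separate) and passes it in as an optional, so the helper only fills in fields that are still missing. A simplified sketch of that flow with a single field (types reduced for illustration):

    #include <cstdio>
    #include <optional>

    struct SamplerDescriptor {
        bool is_shadow;
    };

    struct SamplerInfo {
        std::optional<bool> is_shadow;
        bool IsComplete() const {
            return is_shadow.has_value();
        }
    };

    SamplerInfo GetSamplerInfo(SamplerInfo info, std::optional<SamplerDescriptor> sampler) {
        if (info.IsComplete()) {
            return info; // nothing left to query
        }
        if (!sampler) {
            info.is_shadow = info.is_shadow.value_or(false); // unknown: conservative default
            return info;
        }
        info.is_shadow = info.is_shadow.value_or(sampler->is_shadow);
        return info;
    }

    int main() {
        const SamplerInfo info = GetSamplerInfo({}, SamplerDescriptor{true});
        std::printf("is_shadow=%d\n", *info.is_shadow); // 1
    }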
diff --git a/src/video_core/shader/node.h b/src/video_core/shader/node.h
index f75b62240..8f230d57a 100644
--- a/src/video_core/shader/node.h
+++ b/src/video_core/shader/node.h
@@ -226,9 +226,16 @@ enum class OperationCode {
226 VoteEqual, /// (bool) -> bool 226 VoteEqual, /// (bool) -> bool
227 227
228 ThreadId, /// () -> uint 228 ThreadId, /// () -> uint
229 ThreadEqMask, /// () -> uint
230 ThreadGeMask, /// () -> uint
231 ThreadGtMask, /// () -> uint
232 ThreadLeMask, /// () -> uint
233 ThreadLtMask, /// () -> uint
229 ShuffleIndexed, /// (uint value, uint index) -> uint 234 ShuffleIndexed, /// (uint value, uint index) -> uint
230 235
231 MemoryBarrierGL, /// () -> void 236 Barrier, /// () -> void
237 MemoryBarrierGroup, /// () -> void
238 MemoryBarrierGlobal, /// () -> void
232 239
233 Amount, 240 Amount,
234}; 241};
@@ -268,10 +275,11 @@ using Node = std::shared_ptr<NodeData>;
268using Node4 = std::array<Node, 4>; 275using Node4 = std::array<Node, 4>;
269using NodeBlock = std::vector<Node>; 276using NodeBlock = std::vector<Node>;
270 277
271class BindlessSamplerNode; 278struct ArraySamplerNode;
272class ArraySamplerNode; 279struct BindlessSamplerNode;
280struct SeparateSamplerNode;
273 281
274using TrackSamplerData = std::variant<BindlessSamplerNode, ArraySamplerNode>; 282using TrackSamplerData = std::variant<BindlessSamplerNode, SeparateSamplerNode, ArraySamplerNode>;
275using TrackSampler = std::shared_ptr<TrackSamplerData>; 283using TrackSampler = std::shared_ptr<TrackSamplerData>;
276 284
277struct Sampler { 285struct Sampler {
@@ -281,63 +289,51 @@ struct Sampler {
281 : index{index}, offset{offset}, type{type}, is_array{is_array}, is_shadow{is_shadow}, 289 : index{index}, offset{offset}, type{type}, is_array{is_array}, is_shadow{is_shadow},
282 is_buffer{is_buffer}, is_indexed{is_indexed} {} 290 is_buffer{is_buffer}, is_indexed{is_indexed} {}
283 291
292 /// Separate sampler constructor
293 constexpr explicit Sampler(u32 index, std::pair<u32, u32> offsets, std::pair<u32, u32> buffers,
294 Tegra::Shader::TextureType type, bool is_array, bool is_shadow,
295 bool is_buffer)
296 : index{index}, offset{offsets.first}, secondary_offset{offsets.second},
297 buffer{buffers.first}, secondary_buffer{buffers.second}, type{type}, is_array{is_array},
298 is_shadow{is_shadow}, is_buffer{is_buffer}, is_separated{true} {}
299
284 /// Bindless samplers constructor 300 /// Bindless samplers constructor
285 constexpr explicit Sampler(u32 index, u32 offset, u32 buffer, Tegra::Shader::TextureType type, 301 constexpr explicit Sampler(u32 index, u32 offset, u32 buffer, Tegra::Shader::TextureType type,
286 bool is_array, bool is_shadow, bool is_buffer, bool is_indexed) 302 bool is_array, bool is_shadow, bool is_buffer, bool is_indexed)
287 : index{index}, offset{offset}, buffer{buffer}, type{type}, is_array{is_array}, 303 : index{index}, offset{offset}, buffer{buffer}, type{type}, is_array{is_array},
288 is_shadow{is_shadow}, is_buffer{is_buffer}, is_bindless{true}, is_indexed{is_indexed} {} 304 is_shadow{is_shadow}, is_buffer{is_buffer}, is_bindless{true}, is_indexed{is_indexed} {}
289 305
290 u32 index = 0; ///< Emulated index given for this sampler. 306 u32 index = 0; ///< Emulated index given for this sampler.
291 u32 offset = 0; ///< Offset in the const buffer from where the sampler is being read. 307 u32 offset = 0; ///< Offset in the const buffer from where the sampler is being read.
292 u32 buffer = 0; ///< Buffer where the bindless sampler is being read (unused on bound samplers). 308 u32 secondary_offset = 0; ///< Secondary offset in the const buffer.
293 u32 size = 1; ///< Size of the sampler. 309 u32 buffer = 0; ///< Buffer where the bindless sampler is read.
310 u32 secondary_buffer = 0; ///< Secondary buffer where the bindless sampler is read.
311 u32 size = 1; ///< Size of the sampler.
294 312
295 Tegra::Shader::TextureType type{}; ///< The type used to sample this texture (Texture2D, etc) 313 Tegra::Shader::TextureType type{}; ///< The type used to sample this texture (Texture2D, etc)
296 bool is_array = false; ///< Whether the texture is being sampled as an array texture or not. 314 bool is_array = false; ///< Whether the texture is being sampled as an array texture or not.
297 bool is_shadow = false; ///< Whether the texture is being sampled as a depth texture or not. 315 bool is_shadow = false; ///< Whether the texture is being sampled as a depth texture or not.
298 bool is_buffer = false; ///< Whether the texture is a texture buffer without sampler. 316 bool is_buffer = false; ///< Whether the texture is a texture buffer without sampler.
299 bool is_bindless = false; ///< Whether this sampler belongs to a bindless texture or not. 317 bool is_bindless = false; ///< Whether this sampler belongs to a bindless texture or not.
300 bool is_indexed = false; ///< Whether this sampler is an indexed array of textures. 318 bool is_indexed = false; ///< Whether this sampler is an indexed array of textures.
319 bool is_separated = false; ///< Whether the image and sampler is separated or not.
301}; 320};
302 321
303/// Represents a tracked bindless sampler into a direct const buffer 322/// Represents a tracked bindless sampler into a direct const buffer
304class ArraySamplerNode final { 323struct ArraySamplerNode {
305public:
306 explicit ArraySamplerNode(u32 index, u32 base_offset, u32 bindless_var)
307 : index{index}, base_offset{base_offset}, bindless_var{bindless_var} {}
308
309 constexpr u32 GetIndex() const {
310 return index;
311 }
312
313 constexpr u32 GetBaseOffset() const {
314 return base_offset;
315 }
316
317 constexpr u32 GetIndexVar() const {
318 return bindless_var;
319 }
320
321private:
322 u32 index; 324 u32 index;
323 u32 base_offset; 325 u32 base_offset;
324 u32 bindless_var; 326 u32 bindless_var;
325}; 327};
326 328
327/// Represents a tracked bindless sampler into a direct const buffer 329/// Represents a tracked separate sampler image pair that was folded statically
328class BindlessSamplerNode final { 330struct SeparateSamplerNode {
329public: 331 std::pair<u32, u32> indices;
330 explicit BindlessSamplerNode(u32 index, u32 offset) : index{index}, offset{offset} {} 332 std::pair<u32, u32> offsets;
331 333};
332 constexpr u32 GetIndex() const {
333 return index;
334 }
335
336 constexpr u32 GetOffset() const {
337 return offset;
338 }
339 334
340private: 335/// Represents a tracked bindless sampler into a direct const buffer
336struct BindlessSamplerNode {
341 u32 index; 337 u32 index;
342 u32 offset; 338 u32 offset;
343}; 339};
diff --git a/src/video_core/shader/node_helper.h b/src/video_core/shader/node_helper.h
index 11231bbea..1e0886185 100644
--- a/src/video_core/shader/node_helper.h
+++ b/src/video_core/shader/node_helper.h
@@ -48,7 +48,7 @@ Node MakeNode(Args&&... args) {
48template <typename T, typename... Args> 48template <typename T, typename... Args>
49TrackSampler MakeTrackSampler(Args&&... args) { 49TrackSampler MakeTrackSampler(Args&&... args) {
50 static_assert(std::is_convertible_v<T, TrackSamplerData>); 50 static_assert(std::is_convertible_v<T, TrackSamplerData>);
51 return std::make_shared<TrackSamplerData>(T(std::forward<Args>(args)...)); 51 return std::make_shared<TrackSamplerData>(T{std::forward<Args>(args)...});
52} 52}
53 53
54template <typename... Args> 54template <typename... Args>
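Turning the sampler nodes into plain aggregates is what allows the T(...) to T{...} change here: aggregates have no user-declared constructors, so brace initialization is the form that accepts them (while still working for types that do have constructors), and callers now read fields directly instead of going through getters. A compilable sketch of the whole pattern, with field widths simplified:

    #include <cstdio>
    #include <memory>
    #include <type_traits>
    #include <utility>
    #include <variant>

    struct ArraySamplerNode {
        unsigned index, base_offset, bindless_var;
    };
    struct SeparateSamplerNode {
        std::pair<unsigned, unsigned> indices, offsets;
    };
    struct BindlessSamplerNode {
        unsigned index, offset;
    };

    using TrackSamplerData =
        std::variant<BindlessSamplerNode, SeparateSamplerNode, ArraySamplerNode>;
    using TrackSampler = std::shared_ptr<TrackSamplerData>;

    template <typename T, typename... Args>
    TrackSampler MakeTrackSampler(Args&&... args) {
        static_assert(std::is_convertible_v<T, TrackSamplerData>);
        // Brace init so aggregates without constructors are accepted
        return std::make_shared<TrackSamplerData>(T{std::forward<Args>(args)...});
    }

    int main() {
        const TrackSampler track =
            MakeTrackSampler<SeparateSamplerNode>(std::pair{1u, 2u}, std::pair{8u, 16u});
        if (const auto* node = std::get_if<SeparateSamplerNode>(&*track)) {
            std::printf("buffers=(%u,%u) offsets=(%u,%u)\n", node->indices.first,
                        node->indices.second, node->offsets.first, node->offsets.second);
        }
    }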
diff --git a/src/video_core/shader/registry.cpp b/src/video_core/shader/registry.cpp
index af70b3f35..cdf274e54 100644
--- a/src/video_core/shader/registry.cpp
+++ b/src/video_core/shader/registry.cpp
@@ -93,6 +93,26 @@ std::optional<SamplerDescriptor> Registry::ObtainBoundSampler(u32 offset) {
93 return value; 93 return value;
94} 94}
95 95
96std::optional<Tegra::Engines::SamplerDescriptor> Registry::ObtainSeparateSampler(
97 std::pair<u32, u32> buffers, std::pair<u32, u32> offsets) {
98 SeparateSamplerKey key;
99 key.buffers = buffers;
100 key.offsets = offsets;
101 const auto iter = separate_samplers.find(key);
102 if (iter != separate_samplers.end()) {
103 return iter->second;
104 }
105 if (!engine) {
106 return std::nullopt;
107 }
108
109 const u32 handle_1 = engine->AccessConstBuffer32(stage, key.buffers.first, key.offsets.first);
110 const u32 handle_2 = engine->AccessConstBuffer32(stage, key.buffers.second, key.offsets.second);
111 const SamplerDescriptor value = engine->AccessSampler(handle_1 | handle_2);
112 separate_samplers.emplace(key, value);
113 return value;
114}
115
96std::optional<Tegra::Engines::SamplerDescriptor> Registry::ObtainBindlessSampler(u32 buffer, 116std::optional<Tegra::Engines::SamplerDescriptor> Registry::ObtainBindlessSampler(u32 buffer,
97 u32 offset) { 117 u32 offset) {
98 const std::pair key = {buffer, offset}; 118 const std::pair key = {buffer, offset};
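ObtainSeparateSampler ORs the two const-buffer words before calling AccessSampler because the texture (TIC) index and the sampler (TSC) index of a Maxwell handle occupy disjoint bit ranges, so a statically folded texture/sampler pair combines into one handle. A worked example under an assumed 20-bit split (the exact layout is not spelled out in this diff):

    #include <cstdint>
    #include <cstdio>

    int main() {
        // Assumption: TIC index in bits 0-19, TSC index in bits 20-31
        const std::uint32_t texture_word = 0x0000'0123u;  // first cbuf slot
        const std::uint32_t sampler_word = 0x0045u << 20; // second cbuf slot
        const std::uint32_t handle = texture_word | sampler_word;
        std::printf("handle=0x%08x tic=%u tsc=%u\n", handle, handle & 0xFFFFFu, handle >> 20);
    }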
diff --git a/src/video_core/shader/registry.h b/src/video_core/shader/registry.h
index 0c80d35fd..231206765 100644
--- a/src/video_core/shader/registry.h
+++ b/src/video_core/shader/registry.h
@@ -19,8 +19,39 @@
19 19
20namespace VideoCommon::Shader { 20namespace VideoCommon::Shader {
21 21
22struct SeparateSamplerKey {
23 std::pair<u32, u32> buffers;
24 std::pair<u32, u32> offsets;
25};
26
27} // namespace VideoCommon::Shader
28
29namespace std {
30
31template <>
32struct hash<VideoCommon::Shader::SeparateSamplerKey> {
33 std::size_t operator()(const VideoCommon::Shader::SeparateSamplerKey& key) const noexcept {
34 return std::hash<u32>{}(key.buffers.first ^ key.buffers.second ^ key.offsets.first ^
35 key.offsets.second);
36 }
37};
38
39template <>
40struct equal_to<VideoCommon::Shader::SeparateSamplerKey> {
41 bool operator()(const VideoCommon::Shader::SeparateSamplerKey& lhs,
42 const VideoCommon::Shader::SeparateSamplerKey& rhs) const noexcept {
43 return lhs.buffers == rhs.buffers && lhs.offsets == rhs.offsets;
44 }
45};
46
47} // namespace std
48
49namespace VideoCommon::Shader {
50
22using KeyMap = std::unordered_map<std::pair<u32, u32>, u32, Common::PairHash>; 51using KeyMap = std::unordered_map<std::pair<u32, u32>, u32, Common::PairHash>;
23using BoundSamplerMap = std::unordered_map<u32, Tegra::Engines::SamplerDescriptor>; 52using BoundSamplerMap = std::unordered_map<u32, Tegra::Engines::SamplerDescriptor>;
53using SeparateSamplerMap =
54 std::unordered_map<SeparateSamplerKey, Tegra::Engines::SamplerDescriptor>;
24using BindlessSamplerMap = 55using BindlessSamplerMap =
25 std::unordered_map<std::pair<u32, u32>, Tegra::Engines::SamplerDescriptor, Common::PairHash>; 56 std::unordered_map<std::pair<u32, u32>, Tegra::Engines::SamplerDescriptor, Common::PairHash>;
26 57
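SeparateSamplerKey needs explicit std::hash and std::equal_to specializations before SeparateSamplerMap can use it as an unordered_map key. The XOR fold is cheap but collision-prone (swapping the buffer and offset words hashes identically); equal_to keeps lookups correct regardless, collisions only cost extra probing. A self-contained restatement that compiles and runs:

    #include <cstddef>
    #include <cstdio>
    #include <functional>
    #include <unordered_map>
    #include <utility>

    struct SeparateSamplerKey {
        std::pair<unsigned, unsigned> buffers;
        std::pair<unsigned, unsigned> offsets;
    };

    template <>
    struct std::hash<SeparateSamplerKey> {
        std::size_t operator()(const SeparateSamplerKey& key) const noexcept {
            // Cheap XOR fold; equal_to below disambiguates colliding keys
            return std::hash<unsigned>{}(key.buffers.first ^ key.buffers.second ^
                                         key.offsets.first ^ key.offsets.second);
        }
    };

    template <>
    struct std::equal_to<SeparateSamplerKey> {
        bool operator()(const SeparateSamplerKey& lhs,
                        const SeparateSamplerKey& rhs) const noexcept {
            return lhs.buffers == rhs.buffers && lhs.offsets == rhs.offsets;
        }
    };

    int main() {
        std::unordered_map<SeparateSamplerKey, int> map;
        map.emplace(SeparateSamplerKey{{1, 2}, {8, 16}}, 42);
        std::printf("%d\n", map.at(SeparateSamplerKey{{1, 2}, {8, 16}})); // 42
    }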
@@ -73,6 +104,9 @@ public:
73 104
74 std::optional<Tegra::Engines::SamplerDescriptor> ObtainBoundSampler(u32 offset); 105 std::optional<Tegra::Engines::SamplerDescriptor> ObtainBoundSampler(u32 offset);
75 106
107 std::optional<Tegra::Engines::SamplerDescriptor> ObtainSeparateSampler(
108 std::pair<u32, u32> buffers, std::pair<u32, u32> offsets);
109
76 std::optional<Tegra::Engines::SamplerDescriptor> ObtainBindlessSampler(u32 buffer, u32 offset); 110 std::optional<Tegra::Engines::SamplerDescriptor> ObtainBindlessSampler(u32 buffer, u32 offset);
77 111
78 /// Inserts a key. 112 /// Inserts a key.
@@ -128,6 +162,7 @@ private:
128 Tegra::Engines::ConstBufferEngineInterface* engine = nullptr; 162 Tegra::Engines::ConstBufferEngineInterface* engine = nullptr;
129 KeyMap keys; 163 KeyMap keys;
130 BoundSamplerMap bound_samplers; 164 BoundSamplerMap bound_samplers;
165 SeparateSamplerMap separate_samplers;
131 BindlessSamplerMap bindless_samplers; 166 BindlessSamplerMap bindless_samplers;
132 u32 bound_buffer; 167 u32 bound_buffer;
133 GraphicsInfo graphics_info; 168 GraphicsInfo graphics_info;
diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h
index 15ae152f2..3a98b2104 100644
--- a/src/video_core/shader/shader_ir.h
+++ b/src/video_core/shader/shader_ir.h
@@ -330,8 +330,8 @@ private:
330 OperationCode GetPredicateCombiner(Tegra::Shader::PredOperation operation); 330 OperationCode GetPredicateCombiner(Tegra::Shader::PredOperation operation);
331 331
332 /// Queries the missing sampler info from the execution context. 332 /// Queries the missing sampler info from the execution context.
333 SamplerInfo GetSamplerInfo(SamplerInfo info, u32 offset, 333 SamplerInfo GetSamplerInfo(SamplerInfo info,
334 std::optional<u32> buffer = std::nullopt); 334 std::optional<Tegra::Engines::SamplerDescriptor> sampler);
335 335
336 /// Accesses a texture sampler. 336 /// Accesses a texture sampler.
337 std::optional<Sampler> GetSampler(Tegra::Shader::Sampler sampler, SamplerInfo info); 337 std::optional<Sampler> GetSampler(Tegra::Shader::Sampler sampler, SamplerInfo info);
@@ -409,8 +409,14 @@ private:
409 409
410 std::tuple<Node, u32, u32> TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor) const; 410 std::tuple<Node, u32, u32> TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor) const;
411 411
412 std::tuple<Node, TrackSampler> TrackBindlessSampler(Node tracked, const NodeBlock& code, 412 std::pair<Node, TrackSampler> TrackBindlessSampler(Node tracked, const NodeBlock& code,
413 s64 cursor); 413 s64 cursor);
414
415 std::pair<Node, TrackSampler> HandleBindlessIndirectRead(const CbufNode& cbuf,
416 const OperationNode& operation,
417 Node gpr, Node base_offset,
418 Node tracked, const NodeBlock& code,
419 s64 cursor);
414 420
415 std::optional<u32> TrackImmediate(Node tracked, const NodeBlock& code, s64 cursor) const; 421 std::optional<u32> TrackImmediate(Node tracked, const NodeBlock& code, s64 cursor) const;
416 422
diff --git a/src/video_core/shader/track.cpp b/src/video_core/shader/track.cpp
index eb97bfd41..d5ed81442 100644
--- a/src/video_core/shader/track.cpp
+++ b/src/video_core/shader/track.cpp
@@ -14,6 +14,7 @@
14namespace VideoCommon::Shader { 14namespace VideoCommon::Shader {
15 15
16namespace { 16namespace {
17
17std::pair<Node, s64> FindOperation(const NodeBlock& code, s64 cursor, 18std::pair<Node, s64> FindOperation(const NodeBlock& code, s64 cursor,
18 OperationCode operation_code) { 19 OperationCode operation_code) {
19 for (; cursor >= 0; --cursor) { 20 for (; cursor >= 0; --cursor) {
@@ -63,7 +64,8 @@ bool AmendNodeCv(std::size_t amend_index, Node node) {
63 if (const auto operation = std::get_if<OperationNode>(&*node)) { 64 if (const auto operation = std::get_if<OperationNode>(&*node)) {
64 operation->SetAmendIndex(amend_index); 65 operation->SetAmendIndex(amend_index);
65 return true; 66 return true;
66 } else if (const auto conditional = std::get_if<ConditionalNode>(&*node)) { 67 }
68 if (const auto conditional = std::get_if<ConditionalNode>(&*node)) {
67 conditional->SetAmendIndex(amend_index); 69 conditional->SetAmendIndex(amend_index);
68 return true; 70 return true;
69 } 71 }
@@ -72,40 +74,27 @@ bool AmendNodeCv(std::size_t amend_index, Node node) {
72 74
73} // Anonymous namespace 75} // Anonymous namespace
74 76
75std::tuple<Node, TrackSampler> ShaderIR::TrackBindlessSampler(Node tracked, const NodeBlock& code, 77std::pair<Node, TrackSampler> ShaderIR::TrackBindlessSampler(Node tracked, const NodeBlock& code,
76 s64 cursor) { 78 s64 cursor) {
77 if (const auto cbuf = std::get_if<CbufNode>(&*tracked)) { 79 if (const auto cbuf = std::get_if<CbufNode>(&*tracked)) {
80 const u32 cbuf_index = cbuf->GetIndex();
81
78 // Constant buffer found, test if it's an immediate 82 // Constant buffer found, test if it's an immediate
79 const auto& offset = cbuf->GetOffset(); 83 const auto& offset = cbuf->GetOffset();
80 if (const auto immediate = std::get_if<ImmediateNode>(&*offset)) { 84 if (const auto immediate = std::get_if<ImmediateNode>(&*offset)) {
81 auto track = 85 auto track = MakeTrackSampler<BindlessSamplerNode>(cbuf_index, immediate->GetValue());
82 MakeTrackSampler<BindlessSamplerNode>(cbuf->GetIndex(), immediate->GetValue());
83 return {tracked, track}; 86 return {tracked, track};
84 } 87 }
85 if (const auto operation = std::get_if<OperationNode>(&*offset)) { 88 if (const auto operation = std::get_if<OperationNode>(&*offset)) {
86 const u32 bound_buffer = registry.GetBoundBuffer(); 89 const u32 bound_buffer = registry.GetBoundBuffer();
87 if (bound_buffer != cbuf->GetIndex()) { 90 if (bound_buffer != cbuf_index) {
88 return {}; 91 return {};
89 } 92 }
90 const auto pair = DecoupleIndirectRead(*operation); 93 if (const std::optional pair = DecoupleIndirectRead(*operation)) {
91 if (!pair) { 94 auto [gpr, base_offset] = *pair;
92 return {}; 95 return HandleBindlessIndirectRead(*cbuf, *operation, gpr, base_offset, tracked,
96 code, cursor);
93 } 97 }
94 auto [gpr, base_offset] = *pair;
95 const auto offset_inm = std::get_if<ImmediateNode>(&*base_offset);
96 const auto& gpu_driver = registry.AccessGuestDriverProfile();
97 const u32 bindless_cv = NewCustomVariable();
98 Node op =
99 Operation(OperationCode::UDiv, gpr, Immediate(gpu_driver.GetTextureHandlerSize()));
100
101 const Node cv_node = GetCustomVariable(bindless_cv);
102 Node amend_op = Operation(OperationCode::Assign, cv_node, std::move(op));
103 const std::size_t amend_index = DeclareAmend(std::move(amend_op));
104 AmendNodeCv(amend_index, code[cursor]);
105 // TODO Implement Bindless Index custom variable
106 auto track = MakeTrackSampler<ArraySamplerNode>(cbuf->GetIndex(),
107 offset_inm->GetValue(), bindless_cv);
108 return {tracked, track};
109 } 98 }
110 return {}; 99 return {};
111 } 100 }
@@ -122,10 +111,23 @@ std::tuple<Node, TrackSampler> ShaderIR::TrackBindlessSampler(Node tracked, cons
122 return TrackBindlessSampler(source, code, new_cursor); 111 return TrackBindlessSampler(source, code, new_cursor);
123 } 112 }
124 if (const auto operation = std::get_if<OperationNode>(&*tracked)) { 113 if (const auto operation = std::get_if<OperationNode>(&*tracked)) {
125 for (std::size_t i = operation->GetOperandsCount(); i > 0; --i) { 114 const OperationNode& op = *operation;
126 if (auto found = TrackBindlessSampler((*operation)[i - 1], code, cursor); 115
127 std::get<0>(found)) { 116 const OperationCode opcode = operation->GetCode();
128 // Cbuf found in operand. 117 if (opcode == OperationCode::IBitwiseOr || opcode == OperationCode::UBitwiseOr) {
118 ASSERT(op.GetOperandsCount() == 2);
119 auto [node_a, index_a, offset_a] = TrackCbuf(op[0], code, cursor);
120 auto [node_b, index_b, offset_b] = TrackCbuf(op[1], code, cursor);
121 if (node_a && node_b) {
122 auto track = MakeTrackSampler<SeparateSamplerNode>(std::pair{index_a, index_b},
123 std::pair{offset_a, offset_b});
124 return {tracked, std::move(track)};
125 }
126 }
127 std::size_t i = op.GetOperandsCount();
128 while (i--) {
129 if (auto found = TrackBindlessSampler(op[i], code, cursor); std::get<0>(found)) {
130 // Constant buffer found in operand.
129 return found; 131 return found;
130 } 132 }
131 } 133 }
@@ -139,6 +141,26 @@ std::tuple<Node, TrackSampler> ShaderIR::TrackBindlessSampler(Node tracked, cons
139 return {}; 141 return {};
140} 142}
141 143
144std::pair<Node, TrackSampler> ShaderIR::HandleBindlessIndirectRead(
145 const CbufNode& cbuf, const OperationNode& operation, Node gpr, Node base_offset, Node tracked,
146 const NodeBlock& code, s64 cursor) {
147 const auto offset_imm = std::get<ImmediateNode>(*base_offset);
148 const auto& gpu_driver = registry.AccessGuestDriverProfile();
149 const u32 bindless_cv = NewCustomVariable();
150 const u32 texture_handler_size = gpu_driver.GetTextureHandlerSize();
151 Node op = Operation(OperationCode::UDiv, gpr, Immediate(texture_handler_size));
152
153 Node cv_node = GetCustomVariable(bindless_cv);
154 Node amend_op = Operation(OperationCode::Assign, std::move(cv_node), std::move(op));
155 const std::size_t amend_index = DeclareAmend(std::move(amend_op));
156 AmendNodeCv(amend_index, code[cursor]);
157
158 // TODO: Implement bindless index custom variable
159 auto track =
160 MakeTrackSampler<ArraySamplerNode>(cbuf.GetIndex(), offset_imm.GetValue(), bindless_cv);
161 return {tracked, track};
162}
163
142std::tuple<Node, u32, u32> ShaderIR::TrackCbuf(Node tracked, const NodeBlock& code, 164std::tuple<Node, u32, u32> ShaderIR::TrackCbuf(Node tracked, const NodeBlock& code,
143 s64 cursor) const { 165 s64 cursor) const {
144 if (const auto cbuf = std::get_if<CbufNode>(&*tracked)) { 166 if (const auto cbuf = std::get_if<CbufNode>(&*tracked)) {
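For reference, the UDiv amend emitted above recovers a sampler index from a register that holds a byte offset into the driver's texture handle array. A minimal standalone sketch of that arithmetic, assuming a hypothetical 4-byte handler size (the real value is queried via GetTextureHandlerSize() on the guest driver profile):

    #include <cstdint>
    #include <initializer_list>
    #include <iostream>

    int main() {
        // Assumed value for illustration; the emulator reads it from the
        // guest driver profile at runtime.
        const std::uint32_t texture_handler_size = 4;
        for (std::uint32_t byte_offset : {0u, 4u, 8u, 12u}) {
            // Same division the amend operation assigns to the custom variable
            std::cout << "byte offset " << byte_offset << " -> bindless index "
                      << byte_offset / texture_handler_size << '\n';
        }
    }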
diff --git a/src/video_core/shader_cache.h b/src/video_core/shader_cache.h
new file mode 100644
index 000000000..a23c23886
--- /dev/null
+++ b/src/video_core/shader_cache.h
@@ -0,0 +1,228 @@
1// Copyright 2020 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#pragma once
6
7#include <algorithm>
8#include <memory>
9#include <mutex>
10#include <unordered_map>
11#include <utility>
12#include <vector>
13
14#include "common/assert.h"
15#include "common/common_types.h"
16#include "video_core/rasterizer_interface.h"
17
18namespace VideoCommon {
19
20template <class T>
21class ShaderCache {
22 static constexpr u64 PAGE_SHIFT = 14;
23
24 struct Entry {
25 VAddr addr_start;
26 VAddr addr_end;
27 T* data;
28
29 bool is_memory_marked = true;
30
31 constexpr bool Overlaps(VAddr start, VAddr end) const noexcept {
32 return start < addr_end && addr_start < end;
33 }
34 };
35
36public:
37 virtual ~ShaderCache() = default;
38
39 /// @brief Removes shaders inside a given region
40 /// @note Checks for ranges
41 /// @param addr Start address of the invalidation
42 /// @param size Number of bytes of the invalidation
43 void InvalidateRegion(VAddr addr, std::size_t size) {
44 std::scoped_lock lock{invalidation_mutex};
45 InvalidatePagesInRegion(addr, size);
46 RemovePendingShaders();
47 }
48
49 /// @brief Unmarks a memory region as cached and marks it for removal
50 /// @param addr Start address of the CPU write operation
51 /// @param size Number of bytes of the CPU write operation
52 void OnCPUWrite(VAddr addr, std::size_t size) {
53 std::lock_guard lock{invalidation_mutex};
54 InvalidatePagesInRegion(addr, size);
55 }
56
57 /// @brief Flushes delayed removal operations
58 void SyncGuestHost() {
59 std::scoped_lock lock{invalidation_mutex};
60 RemovePendingShaders();
61 }
62
63 /// @brief Tries to obtain a cached shader starting in a given address
64 /// @note Doesn't check for ranges; the given address has to be the start of the shader
65 /// @param addr Start address of the shader; the lookup is exact, not by region
66 /// @return Pointer to a valid shader, nullptr when nothing is found
67 T* TryGet(VAddr addr) const {
68 std::scoped_lock lock{lookup_mutex};
69
70 const auto it = lookup_cache.find(addr);
71 if (it == lookup_cache.end()) {
72 return nullptr;
73 }
74 return it->second->data;
75 }
76
77protected:
78 explicit ShaderCache(VideoCore::RasterizerInterface& rasterizer_) : rasterizer{rasterizer_} {}
79
80 /// @brief Registers a given entry in the cache
81 /// @param data Shader to store in the cache
82 /// @param addr Start address of the shader that will be registered
83 /// @param size Size in bytes of the shader
84 void Register(std::unique_ptr<T> data, VAddr addr, std::size_t size) {
85 std::scoped_lock lock{invalidation_mutex, lookup_mutex};
86
87 const VAddr addr_end = addr + size;
88 Entry* const entry = NewEntry(addr, addr_end, data.get());
89
90 const u64 page_end = addr_end >> PAGE_SHIFT;
91 for (u64 page = addr >> PAGE_SHIFT; page <= page_end; ++page) {
92 invalidation_cache[page].push_back(entry);
93 }
94
95 storage.push_back(std::move(data));
96
97 rasterizer.UpdatePagesCachedCount(addr, size, 1);
98 }
99
100 /// @brief Called when a shader is going to be removed
101 /// @param shader Shader that will be removed
102 /// @pre invalidation_mutex is locked
103 /// @pre lookup_mutex is locked
104 virtual void OnShaderRemoval([[maybe_unused]] T* shader) {}
105
106private:
107 /// @brief Invalidate pages in a given region
108 /// @pre invalidation_mutex is locked
109 void InvalidatePagesInRegion(VAddr addr, std::size_t size) {
110 const VAddr addr_end = addr + size;
111 const u64 page_end = addr_end >> PAGE_SHIFT;
112 for (u64 page = addr >> PAGE_SHIFT; page <= page_end; ++page) {
113 const auto it = invalidation_cache.find(page);
114 if (it == invalidation_cache.end()) {
115 continue;
116 }
117
118 std::vector<Entry*>& entries = it->second;
119 InvalidatePageEntries(entries, addr, addr_end);
120
121 // If there's nothing else in this page, remove it to avoid overpopulating the hash map.
122 if (entries.empty()) {
123 invalidation_cache.erase(it);
124 }
125 }
126 }
127
128 /// @brief Remove shaders marked for deletion
129 /// @pre invalidation_mutex is locked
130 void RemovePendingShaders() {
131 if (marked_for_removal.empty()) {
132 return;
133 }
134 std::scoped_lock lock{lookup_mutex};
135
136 std::vector<T*> removed_shaders;
137 removed_shaders.reserve(marked_for_removal.size());
138
139 for (Entry* const entry : marked_for_removal) {
140 if (lookup_cache.erase(entry->addr_start) > 0) {
141 removed_shaders.push_back(entry->data);
142 }
143 }
144 marked_for_removal.clear();
145
146 if (!removed_shaders.empty()) {
147 RemoveShadersFromStorage(std::move(removed_shaders));
148 }
149 }
150
151 /// @brief Invalidates entries in a given range for the passed page
152 /// @param entries Vector of entries in the page; it will be modified on overlaps
153 /// @param addr Start address of the invalidation
154 /// @param addr_end Non-inclusive end address of the invalidation
155 /// @pre invalidation_mutex is locked
156 void InvalidatePageEntries(std::vector<Entry*>& entries, VAddr addr, VAddr addr_end) {
157 auto it = entries.begin();
158 while (it != entries.end()) {
159 Entry* const entry = *it;
160 if (!entry->Overlaps(addr, addr_end)) {
161 ++it;
162 continue;
163 }
164 UnmarkMemory(entry);
165 marked_for_removal.push_back(entry);
166
167 it = entries.erase(it);
168 }
169 }
170
171 /// @brief Unmarks an entry from the rasterizer cache
172 /// @param entry Entry to unmark from memory
173 void UnmarkMemory(Entry* entry) {
174 if (!entry->is_memory_marked) {
175 return;
176 }
177 entry->is_memory_marked = false;
178
179 const VAddr addr = entry->addr_start;
180 const std::size_t size = entry->addr_end - addr;
181 rasterizer.UpdatePagesCachedCount(addr, size, -1);
182 }
183
184 /// @brief Removes a vector of shaders from a list
185 /// @param removed_shaders Shaders to be removed from the storage; it can contain duplicates
186 /// @pre invalidation_mutex is locked
187 /// @pre lookup_mutex is locked
188 void RemoveShadersFromStorage(std::vector<T*> removed_shaders) {
189 // Remove duplicates
190 std::sort(removed_shaders.begin(), removed_shaders.end());
191 removed_shaders.erase(std::unique(removed_shaders.begin(), removed_shaders.end()),
192 removed_shaders.end());
193
194 // Now that there are no duplicates, we can notify removals
195 for (T* const shader : removed_shaders) {
196 OnShaderRemoval(shader);
197 }
198
199 // Remove them from the cache
200 const auto is_removed = [&removed_shaders](std::unique_ptr<T>& shader) {
201 return std::find(removed_shaders.begin(), removed_shaders.end(), shader.get()) !=
202 removed_shaders.end();
203 };
204 storage.erase(std::remove_if(storage.begin(), storage.end(), is_removed), storage.end());
205 }
206
207 /// @brief Creates a new entry in the lookup cache and returns its pointer
208 /// @pre lookup_mutex is locked
209 Entry* NewEntry(VAddr addr, VAddr addr_end, T* data) {
210 auto entry = std::make_unique<Entry>(Entry{addr, addr_end, data});
211 Entry* const entry_pointer = entry.get();
212
213 lookup_cache.emplace(addr, std::move(entry));
214 return entry_pointer;
215 }
216
217 VideoCore::RasterizerInterface& rasterizer;
218
219 mutable std::mutex lookup_mutex;
220 std::mutex invalidation_mutex;
221
222 std::unordered_map<u64, std::unique_ptr<Entry>> lookup_cache;
223 std::unordered_map<u64, std::vector<Entry*>> invalidation_cache;
224 std::vector<std::unique_ptr<T>> storage;
225 std::vector<Entry*> marked_for_removal;
226};
227
228} // namespace VideoCommon
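A standalone model (not the yuzu class) of the page bucketing the new ShaderCache uses: with PAGE_SHIFT = 14 every bucket spans 16 KiB, Register() lists an entry in every bucket it touches, and InvalidatePagesInRegion() only has to visit the buckets covered by the written range:

    #include <cstdint>
    #include <iostream>
    #include <unordered_map>
    #include <vector>

    int main() {
        constexpr std::uint64_t PAGE_SHIFT = 14; // 16 KiB pages, as above
        struct Entry {
            std::uint64_t start, end;
        };
        std::unordered_map<std::uint64_t, std::vector<Entry>> buckets;

        const Entry shader{0x10000, 0x18000}; // a 32 KiB shader
        for (std::uint64_t page = shader.start >> PAGE_SHIFT;
             page <= (shader.end >> PAGE_SHIFT); ++page) {
            buckets[page].push_back(shader);
        }
        // The entry lands in pages 4, 5 and 6, so a write anywhere in that
        // range finds it without scanning the whole cache.
        std::cout << "buckets holding the entry: " << buckets.size() << '\n';
    }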
diff --git a/src/video_core/texture_cache/format_lookup_table.cpp b/src/video_core/texture_cache/format_lookup_table.cpp
index 7032e0059..f476f03b0 100644
--- a/src/video_core/texture_cache/format_lookup_table.cpp
+++ b/src/video_core/texture_cache/format_lookup_table.cpp
@@ -41,7 +41,7 @@ struct Table {
41 ComponentType alpha_component; 41 ComponentType alpha_component;
42 bool is_srgb; 42 bool is_srgb;
43}; 43};
44constexpr std::array<Table, 77> DefinitionTable = {{ 44constexpr std::array<Table, 78> DefinitionTable = {{
45 {TextureFormat::A8R8G8B8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ABGR8U}, 45 {TextureFormat::A8R8G8B8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ABGR8U},
46 {TextureFormat::A8R8G8B8, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::ABGR8S}, 46 {TextureFormat::A8R8G8B8, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::ABGR8S},
47 {TextureFormat::A8R8G8B8, C, UINT, UINT, UINT, UINT, PixelFormat::ABGR8UI}, 47 {TextureFormat::A8R8G8B8, C, UINT, UINT, UINT, UINT, PixelFormat::ABGR8UI},
@@ -98,6 +98,7 @@ constexpr std::array<Table, 77> DefinitionTable = {{
98 {TextureFormat::ZF32, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::Z32F}, 98 {TextureFormat::ZF32, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::Z32F},
99 {TextureFormat::Z16, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::Z16}, 99 {TextureFormat::Z16, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::Z16},
100 {TextureFormat::S8Z24, C, UINT, UNORM, UNORM, UNORM, PixelFormat::S8Z24}, 100 {TextureFormat::S8Z24, C, UINT, UNORM, UNORM, UNORM, PixelFormat::S8Z24},
101 {TextureFormat::G24R8, C, UINT, UNORM, UNORM, UNORM, PixelFormat::S8Z24},
101 {TextureFormat::ZF32_X24S8, C, FLOAT, UINT, UNORM, UNORM, PixelFormat::Z32FS8}, 102 {TextureFormat::ZF32_X24S8, C, FLOAT, UINT, UNORM, UNORM, PixelFormat::Z32FS8},
102 103
103 {TextureFormat::DXT1, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::DXT1}, 104 {TextureFormat::DXT1, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::DXT1},
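As a toy illustration of why the declared size had to grow from 77 to 78 alongside the new G24R8 row: the definition table is a fixed-size constexpr array matched by linear scan. Hypothetical, trimmed-down names, not the yuzu types:

    #include <array>
    #include <optional>

    enum class TextureFormat { A8R8G8B8, G24R8 };
    enum class PixelFormat { ABGR8U, S8Z24 };

    struct Row {
        TextureFormat texture;
        PixelFormat pixel;
    };

    // Adding a row means bumping the declared size, exactly as in the diff
    constexpr std::array<Row, 2> kTable{{
        {TextureFormat::A8R8G8B8, PixelFormat::ABGR8U},
        {TextureFormat::G24R8, PixelFormat::S8Z24},
    }};

    constexpr std::optional<PixelFormat> Lookup(TextureFormat format) {
        for (const Row& row : kTable) {
            if (row.texture == format) {
                return row.pixel;
            }
        }
        return std::nullopt;
    }

    static_assert(*Lookup(TextureFormat::G24R8) == PixelFormat::S8Z24);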
diff --git a/src/video_core/texture_cache/surface_base.cpp b/src/video_core/texture_cache/surface_base.cpp
index 715f39d0d..94d3a6ae5 100644
--- a/src/video_core/texture_cache/surface_base.cpp
+++ b/src/video_core/texture_cache/surface_base.cpp
@@ -248,12 +248,11 @@ void SurfaceBaseImpl::FlushBuffer(Tegra::MemoryManager& memory_manager,
248 248
249 // Use an extra temporary buffer 249 // Use an extra temporary buffer
250 auto& tmp_buffer = staging_cache.GetBuffer(1); 250 auto& tmp_buffer = staging_cache.GetBuffer(1);
251 // Special case for 3D Texture Segments
252 const bool must_read_current_data =
253 params.block_depth > 0 && params.target == VideoCore::Surface::SurfaceTarget::Texture2D;
254 tmp_buffer.resize(guest_memory_size); 251 tmp_buffer.resize(guest_memory_size);
255 host_ptr = tmp_buffer.data(); 252 host_ptr = tmp_buffer.data();
256 if (must_read_current_data) { 253
254 if (params.target == SurfaceTarget::Texture3D) {
255 // Special case for 3D texture segments
257 memory_manager.ReadBlockUnsafe(gpu_addr, host_ptr, guest_memory_size); 256 memory_manager.ReadBlockUnsafe(gpu_addr, host_ptr, guest_memory_size);
258 } 257 }
259 258
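The rewritten condition above pre-reads the guest range only for 3D textures, presumably because flushing a single segment rewrites just part of the destination range, so the temporary buffer has to start as a copy of the existing guest bytes (a read-modify-write). A minimal illustration of that pattern, independent of the yuzu types:

    #include <cstring>
    #include <vector>

    int main() {
        std::vector<unsigned char> guest(16, 0xAA); // existing guest memory
        std::vector<unsigned char> tmp(guest.size());
        std::memcpy(tmp.data(), guest.data(), guest.size()); // ReadBlockUnsafe analogue
        std::memset(tmp.data() + 4, 0x55, 4);                // flush touches a sub-range only
        std::memcpy(guest.data(), tmp.data(), tmp.size());   // write the whole buffer back
    }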
diff --git a/src/video_core/texture_cache/surface_base.h b/src/video_core/texture_cache/surface_base.h
index 79e10ffbb..173f2edba 100644
--- a/src/video_core/texture_cache/surface_base.h
+++ b/src/video_core/texture_cache/surface_base.h
@@ -217,8 +217,8 @@ public:
217 } 217 }
218 218
219 bool IsProtected() const { 219 bool IsProtected() const {
220 // Only 3D Slices are to be protected 220 // Only 3D slices are to be protected
221 return is_target && params.block_depth > 0; 221 return is_target && params.target == SurfaceTarget::Texture3D;
222 } 222 }
223 223
224 bool IsRenderTarget() const { 224 bool IsRenderTarget() const {
@@ -250,6 +250,11 @@ public:
250 return GetView(ViewParams(overview_params.target, 0, num_layers, 0, params.num_levels)); 250 return GetView(ViewParams(overview_params.target, 0, num_layers, 0, params.num_levels));
251 } 251 }
252 252
253 TView Emplace3DView(u32 slice, u32 depth, u32 base_level, u32 num_levels) {
254 return GetView(ViewParams(VideoCore::Surface::SurfaceTarget::Texture3D, slice, depth,
255 base_level, num_levels));
256 }
257
253 std::optional<TView> EmplaceIrregularView(const SurfaceParams& view_params, 258 std::optional<TView> EmplaceIrregularView(const SurfaceParams& view_params,
254 const GPUVAddr view_addr, 259 const GPUVAddr view_addr,
255 const std::size_t candidate_size, const u32 mipmap, 260 const std::size_t candidate_size, const u32 mipmap,
@@ -272,8 +277,8 @@ public:
272 std::optional<TView> EmplaceView(const SurfaceParams& view_params, const GPUVAddr view_addr, 277 std::optional<TView> EmplaceView(const SurfaceParams& view_params, const GPUVAddr view_addr,
273 const std::size_t candidate_size) { 278 const std::size_t candidate_size) {
274 if (params.target == SurfaceTarget::Texture3D || 279 if (params.target == SurfaceTarget::Texture3D ||
275 (params.num_levels == 1 && !params.is_layered) || 280 view_params.target == SurfaceTarget::Texture3D ||
276 view_params.target == SurfaceTarget::Texture3D) { 281 (params.num_levels == 1 && !params.is_layered)) {
277 return {}; 282 return {};
278 } 283 }
279 const auto layer_mipmap{GetLayerMipmap(view_addr)}; 284 const auto layer_mipmap{GetLayerMipmap(view_addr)};
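Spelling out the reordered EmplaceView guard with hypothetical names: a view is refused when either side is a 3D texture (those now go through the dedicated 3D path) or when the surface has a single level and no layers, since there is nothing left to subdivide:

    enum class Target { Texture2D, Texture2DArray, Texture3D };

    // Mirrors the condition only; not the yuzu signature
    constexpr bool CanEmplaceView(Target surface, Target view, unsigned num_levels,
                                  bool is_layered) {
        if (surface == Target::Texture3D || view == Target::Texture3D) {
            return false;
        }
        return num_levels > 1 || is_layered;
    }

    static_assert(!CanEmplaceView(Target::Texture3D, Target::Texture2D, 4, true));
    static_assert(CanEmplaceView(Target::Texture2D, Target::Texture2D, 2, false));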
diff --git a/src/video_core/texture_cache/surface_params.cpp b/src/video_core/texture_cache/surface_params.cpp
index 884fabffe..0b2b2b8c4 100644
--- a/src/video_core/texture_cache/surface_params.cpp
+++ b/src/video_core/texture_cache/surface_params.cpp
@@ -215,10 +215,19 @@ SurfaceParams SurfaceParams::CreateForFramebuffer(Core::System& system, std::siz
215 params.num_levels = 1; 215 params.num_levels = 1;
216 params.emulated_levels = 1; 216 params.emulated_levels = 1;
217 217
218 const bool is_layered = config.layers > 1 && params.block_depth == 0; 218 if (config.memory_layout.is_3d != 0) {
219 params.is_layered = is_layered; 219 params.depth = config.layers.Value();
220 params.depth = is_layered ? config.layers.Value() : 1; 220 params.is_layered = false;
221 params.target = is_layered ? SurfaceTarget::Texture2DArray : SurfaceTarget::Texture2D; 221 params.target = SurfaceTarget::Texture3D;
222 } else if (config.layers > 1) {
223 params.depth = config.layers.Value();
224 params.is_layered = true;
225 params.target = SurfaceTarget::Texture2DArray;
226 } else {
227 params.depth = 1;
228 params.is_layered = false;
229 params.target = SurfaceTarget::Texture2D;
230 }
222 return params; 231 return params;
223} 232}
224 233
@@ -237,7 +246,7 @@ SurfaceParams SurfaceParams::CreateForFermiCopySurface(
237 params.width = config.width; 246 params.width = config.width;
238 params.height = config.height; 247 params.height = config.height;
239 params.pitch = config.pitch; 248 params.pitch = config.pitch;
240 // TODO(Rodrigo): Try to guess the surface target from depth and layer parameters 249 // TODO(Rodrigo): Try to guess texture arrays from parameters
241 params.target = SurfaceTarget::Texture2D; 250 params.target = SurfaceTarget::Texture2D;
242 params.depth = 1; 251 params.depth = 1;
243 params.num_levels = 1; 252 params.num_levels = 1;
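The new branch order in CreateForFramebuffer can be read as a small classifier: a 3D memory layout wins over the layer count, then multiple layers select a 2D array, and anything else is a plain 2D target. A sketch under hypothetical names:

    enum class Target { Texture2D, Texture2DArray, Texture3D };

    constexpr Target ClassifyFramebuffer(bool is_3d, unsigned layers) {
        if (is_3d) {
            return Target::Texture3D; // depth comes from the layer count
        }
        if (layers > 1) {
            return Target::Texture2DArray;
        }
        return Target::Texture2D;
    }

    static_assert(ClassifyFramebuffer(true, 6) == Target::Texture3D);
    static_assert(ClassifyFramebuffer(false, 6) == Target::Texture2DArray);
    static_assert(ClassifyFramebuffer(false, 1) == Target::Texture2D);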
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index d6efc34b2..b543fc8c0 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -14,6 +14,7 @@
14#include <unordered_map> 14#include <unordered_map>
15#include <vector> 15#include <vector>
16 16
17#include <boost/container/small_vector.hpp>
17#include <boost/icl/interval_map.hpp> 18#include <boost/icl/interval_map.hpp>
18#include <boost/range/iterator_range.hpp> 19#include <boost/range/iterator_range.hpp>
19 20
@@ -53,6 +54,7 @@ using RenderTargetConfig = Tegra::Engines::Maxwell3D::Regs::RenderTargetConfig;
53 54
54template <typename TSurface, typename TView> 55template <typename TSurface, typename TView>
55class TextureCache { 56class TextureCache {
57 using VectorSurface = boost::container::small_vector<TSurface, 1>;
56 58
57public: 59public:
58 void InvalidateRegion(VAddr addr, std::size_t size) { 60 void InvalidateRegion(VAddr addr, std::size_t size) {
@@ -296,30 +298,30 @@ public:
296 const GPUVAddr src_gpu_addr = src_config.Address(); 298 const GPUVAddr src_gpu_addr = src_config.Address();
297 const GPUVAddr dst_gpu_addr = dst_config.Address(); 299 const GPUVAddr dst_gpu_addr = dst_config.Address();
298 DeduceBestBlit(src_params, dst_params, src_gpu_addr, dst_gpu_addr); 300 DeduceBestBlit(src_params, dst_params, src_gpu_addr, dst_gpu_addr);
299 const std::optional<VAddr> dst_cpu_addr = 301
300 system.GPU().MemoryManager().GpuToCpuAddress(dst_gpu_addr); 302 const auto& memory_manager = system.GPU().MemoryManager();
301 const std::optional<VAddr> src_cpu_addr = 303 const std::optional<VAddr> dst_cpu_addr = memory_manager.GpuToCpuAddress(dst_gpu_addr);
302 system.GPU().MemoryManager().GpuToCpuAddress(src_gpu_addr); 304 const std::optional<VAddr> src_cpu_addr = memory_manager.GpuToCpuAddress(src_gpu_addr);
303 std::pair<TSurface, TView> dst_surface = 305 std::pair dst_surface = GetSurface(dst_gpu_addr, *dst_cpu_addr, dst_params, true, false);
304 GetSurface(dst_gpu_addr, *dst_cpu_addr, dst_params, true, false); 306 TView src_surface = GetSurface(src_gpu_addr, *src_cpu_addr, src_params, true, false).second;
305 std::pair<TSurface, TView> src_surface = 307 ImageBlit(src_surface, dst_surface.second, copy_config);
306 GetSurface(src_gpu_addr, *src_cpu_addr, src_params, true, false);
307 ImageBlit(src_surface.second, dst_surface.second, copy_config);
308 dst_surface.first->MarkAsModified(true, Tick()); 308 dst_surface.first->MarkAsModified(true, Tick());
309 } 309 }
310 310
311 TSurface TryFindFramebufferSurface(VAddr addr) { 311 TSurface TryFindFramebufferSurface(VAddr addr) const {
312 if (!addr) { 312 if (!addr) {
313 return nullptr; 313 return nullptr;
314 } 314 }
315 const VAddr page = addr >> registry_page_bits; 315 const VAddr page = addr >> registry_page_bits;
316 std::vector<TSurface>& list = registry[page]; 316 const auto it = registry.find(page);
317 for (auto& surface : list) { 317 if (it == registry.end()) {
318 if (surface->GetCpuAddr() == addr) { 318 return nullptr;
319 return surface;
320 }
321 } 319 }
322 return nullptr; 320 const auto& list = it->second;
321 const auto found = std::find_if(list.begin(), list.end(), [addr](const auto& surface) {
322 return surface->GetCpuAddr() == addr;
323 });
324 return found != list.end() ? *found : nullptr;
323 } 325 }
324 326
325 u64 Tick() { 327 u64 Tick() {
@@ -498,18 +500,18 @@ private:
498 * @param untopological Indicates to the recycler that the texture has no way 500 * @param untopological Indicates to the recycler that the texture has no way
499 * to match the overlaps due to topological reasons. 501 * to match the overlaps due to topological reasons.
500 **/ 502 **/
501 RecycleStrategy PickStrategy(std::vector<TSurface>& overlaps, const SurfaceParams& params, 503 RecycleStrategy PickStrategy(VectorSurface& overlaps, const SurfaceParams& params,
502 const GPUVAddr gpu_addr, const MatchTopologyResult untopological) { 504 const GPUVAddr gpu_addr, const MatchTopologyResult untopological) {
503 if (Settings::IsGPULevelExtreme()) { 505 if (Settings::IsGPULevelExtreme()) {
504 return RecycleStrategy::Flush; 506 return RecycleStrategy::Flush;
505 } 507 }
506 // 3D Textures decision 508 // 3D Textures decision
507 if (params.block_depth > 1 || params.target == SurfaceTarget::Texture3D) { 509 if (params.target == SurfaceTarget::Texture3D) {
508 return RecycleStrategy::Flush; 510 return RecycleStrategy::Flush;
509 } 511 }
510 for (const auto& s : overlaps) { 512 for (const auto& s : overlaps) {
511 const auto& s_params = s->GetSurfaceParams(); 513 const auto& s_params = s->GetSurfaceParams();
512 if (s_params.block_depth > 1 || s_params.target == SurfaceTarget::Texture3D) { 514 if (s_params.target == SurfaceTarget::Texture3D) {
513 return RecycleStrategy::Flush; 515 return RecycleStrategy::Flush;
514 } 516 }
515 } 517 }
@@ -538,9 +540,8 @@ private:
538 * @param untopological Indicates to the recycler that the texture has no way to match the 540 * @param untopological Indicates to the recycler that the texture has no way to match the
539 * overlaps due to topological reasons. 541 * overlaps due to topological reasons.
540 **/ 542 **/
541 std::pair<TSurface, TView> RecycleSurface(std::vector<TSurface>& overlaps, 543 std::pair<TSurface, TView> RecycleSurface(VectorSurface& overlaps, const SurfaceParams& params,
542 const SurfaceParams& params, const GPUVAddr gpu_addr, 544 const GPUVAddr gpu_addr, const bool preserve_contents,
543 const bool preserve_contents,
544 const MatchTopologyResult untopological) { 545 const MatchTopologyResult untopological) {
545 const bool do_load = preserve_contents && Settings::IsGPULevelExtreme(); 546 const bool do_load = preserve_contents && Settings::IsGPULevelExtreme();
546 for (auto& surface : overlaps) { 547 for (auto& surface : overlaps) {
@@ -650,47 +651,65 @@ private:
650 * @param params The parameters on the new surface. 651 * @param params The parameters on the new surface.
651 * @param gpu_addr The starting address of the new surface. 652 * @param gpu_addr The starting address of the new surface.
652 **/ 653 **/
653 std::optional<std::pair<TSurface, TView>> TryReconstructSurface(std::vector<TSurface>& overlaps, 654 std::optional<std::pair<TSurface, TView>> TryReconstructSurface(VectorSurface& overlaps,
654 const SurfaceParams& params, 655 const SurfaceParams& params,
655 const GPUVAddr gpu_addr) { 656 GPUVAddr gpu_addr) {
656 if (params.target == SurfaceTarget::Texture3D) { 657 if (params.target == SurfaceTarget::Texture3D) {
657 return {}; 658 return std::nullopt;
658 } 659 }
659 bool modified = false; 660 const auto test_modified = [](TSurface& surface) { return surface->IsModified(); };
660 TSurface new_surface = GetUncachedSurface(gpu_addr, params); 661 TSurface new_surface = GetUncachedSurface(gpu_addr, params);
661 u32 passed_tests = 0; 662
663 if (std::none_of(overlaps.begin(), overlaps.end(), test_modified)) {
664 LoadSurface(new_surface);
665 for (const auto& surface : overlaps) {
666 Unregister(surface);
667 }
668 Register(new_surface);
669 return {{new_surface, new_surface->GetMainView()}};
670 }
671
672 std::size_t passed_tests = 0;
662 for (auto& surface : overlaps) { 673 for (auto& surface : overlaps) {
663 const SurfaceParams& src_params = surface->GetSurfaceParams(); 674 const SurfaceParams& src_params = surface->GetSurfaceParams();
664 if (src_params.is_layered || src_params.num_levels > 1) { 675 const auto mipmap_layer{new_surface->GetLayerMipmap(surface->GetGpuAddr())};
665 // We send this cases to recycle as they are more complex to handle
666 return {};
667 }
668 const std::size_t candidate_size = surface->GetSizeInBytes();
669 auto mipmap_layer{new_surface->GetLayerMipmap(surface->GetGpuAddr())};
670 if (!mipmap_layer) { 676 if (!mipmap_layer) {
671 continue; 677 continue;
672 } 678 }
673 const auto [layer, mipmap] = *mipmap_layer; 679 const auto [base_layer, base_mipmap] = *mipmap_layer;
674 if (new_surface->GetMipmapSize(mipmap) != candidate_size) { 680 if (new_surface->GetMipmapSize(base_mipmap) != surface->GetMipmapSize(0)) {
675 continue; 681 continue;
676 } 682 }
677 modified |= surface->IsModified(); 683 ++passed_tests;
678 // Now we got all the data set up 684
679 const u32 width = SurfaceParams::IntersectWidth(src_params, params, 0, mipmap); 685 // Copy all mipmaps and layers
680 const u32 height = SurfaceParams::IntersectHeight(src_params, params, 0, mipmap); 686 const u32 block_width = params.GetDefaultBlockWidth();
681 const CopyParams copy_params(0, 0, 0, 0, 0, layer, 0, mipmap, width, height, 1); 687 const u32 block_height = params.GetDefaultBlockHeight();
682 passed_tests++; 688 for (u32 mipmap = base_mipmap; mipmap < base_mipmap + src_params.num_levels; ++mipmap) {
683 ImageCopy(surface, new_surface, copy_params); 689 const u32 width = SurfaceParams::IntersectWidth(src_params, params, 0, mipmap);
690 const u32 height = SurfaceParams::IntersectHeight(src_params, params, 0, mipmap);
691 if (width < block_width || height < block_height) {
692 // Current APIs forbid copying small compressed textures, avoid errors
693 break;
694 }
695 const CopyParams copy_params(0, 0, 0, 0, 0, base_layer, 0, mipmap, width, height,
696 src_params.depth);
697 ImageCopy(surface, new_surface, copy_params);
698 }
684 } 699 }
685 if (passed_tests == 0) { 700 if (passed_tests == 0) {
686 return {}; 701 return std::nullopt;
702 }
703 if (Settings::IsGPULevelExtreme() && passed_tests != overlaps.size()) {
687 // In Accurate GPU all tests should pass, else we recycle 704 // In Accurate GPU all tests should pass, else we recycle
688 } else if (Settings::IsGPULevelExtreme() && passed_tests != overlaps.size()) { 705 return std::nullopt;
689 return {};
690 } 706 }
707
708 const bool modified = std::any_of(overlaps.begin(), overlaps.end(), test_modified);
691 for (const auto& surface : overlaps) { 709 for (const auto& surface : overlaps) {
692 Unregister(surface); 710 Unregister(surface);
693 } 711 }
712
694 new_surface->MarkAsModified(modified, Tick()); 713 new_surface->MarkAsModified(modified, Tick());
695 Register(new_surface); 714 Register(new_surface);
696 return {{new_surface, new_surface->GetMainView()}}; 715 return {{new_surface, new_surface->GetMainView()}};
@@ -708,53 +727,11 @@ private:
708 * @param preserve_contents Indicates that the new surface should be loaded from memory or 727 * @param preserve_contents Indicates that the new surface should be loaded from memory or
709 * left blank. 728 * left blank.
710 */ 729 */
711 std::optional<std::pair<TSurface, TView>> Manage3DSurfaces(std::vector<TSurface>& overlaps, 730 std::optional<std::pair<TSurface, TView>> Manage3DSurfaces(VectorSurface& overlaps,
712 const SurfaceParams& params, 731 const SurfaceParams& params,
713 const GPUVAddr gpu_addr, 732 GPUVAddr gpu_addr, VAddr cpu_addr,
714 const VAddr cpu_addr,
715 bool preserve_contents) { 733 bool preserve_contents) {
716 if (params.target == SurfaceTarget::Texture3D) { 734 if (params.target != SurfaceTarget::Texture3D) {
717 bool failed = false;
718 if (params.num_levels > 1) {
719 // We can't handle mipmaps in 3D textures yet, better fallback to LLE approach
720 return std::nullopt;
721 }
722 TSurface new_surface = GetUncachedSurface(gpu_addr, params);
723 bool modified = false;
724 for (auto& surface : overlaps) {
725 const SurfaceParams& src_params = surface->GetSurfaceParams();
726 if (src_params.target != SurfaceTarget::Texture2D) {
727 failed = true;
728 break;
729 }
730 if (src_params.height != params.height) {
731 failed = true;
732 break;
733 }
734 if (src_params.block_depth != params.block_depth ||
735 src_params.block_height != params.block_height) {
736 failed = true;
737 break;
738 }
739 const u32 offset = static_cast<u32>(surface->GetCpuAddr() - cpu_addr);
740 const auto offsets = params.GetBlockOffsetXYZ(offset);
741 const auto z = std::get<2>(offsets);
742 modified |= surface->IsModified();
743 const CopyParams copy_params(0, 0, 0, 0, 0, z, 0, 0, params.width, params.height,
744 1);
745 ImageCopy(surface, new_surface, copy_params);
746 }
747 if (failed) {
748 return std::nullopt;
749 }
750 for (const auto& surface : overlaps) {
751 Unregister(surface);
752 }
753 new_surface->MarkAsModified(modified, Tick());
754 Register(new_surface);
755 auto view = new_surface->GetMainView();
756 return {{std::move(new_surface), view}};
757 } else {
758 for (const auto& surface : overlaps) { 735 for (const auto& surface : overlaps) {
759 if (!surface->MatchTarget(params.target)) { 736 if (!surface->MatchTarget(params.target)) {
760 if (overlaps.size() == 1 && surface->GetCpuAddr() == cpu_addr) { 737 if (overlaps.size() == 1 && surface->GetCpuAddr() == cpu_addr) {
@@ -770,11 +747,60 @@ private:
770 continue; 747 continue;
771 } 748 }
772 if (surface->MatchesStructure(params) == MatchStructureResult::FullMatch) { 749 if (surface->MatchesStructure(params) == MatchStructureResult::FullMatch) {
773 return {{surface, surface->GetMainView()}}; 750 return std::make_pair(surface, surface->GetMainView());
774 } 751 }
775 } 752 }
776 return InitializeSurface(gpu_addr, params, preserve_contents); 753 return InitializeSurface(gpu_addr, params, preserve_contents);
777 } 754 }
755
756 if (params.num_levels > 1) {
757 // We can't handle mipmaps in 3D textures yet, better fallback to LLE approach
758 return std::nullopt;
759 }
760
761 if (overlaps.size() == 1) {
762 const auto& surface = overlaps[0];
763 const SurfaceParams& overlap_params = surface->GetSurfaceParams();
764 // Don't attempt to render to textures with more than one level for now
765 // The texture has to be at or to the right of the surface address if we want to render to it
766 if (overlap_params.num_levels == 1 && cpu_addr >= surface->GetCpuAddr()) {
767 const u32 offset = static_cast<u32>(cpu_addr - surface->GetCpuAddr());
768 const u32 slice = std::get<2>(params.GetBlockOffsetXYZ(offset));
769 if (slice < overlap_params.depth) {
770 auto view = surface->Emplace3DView(slice, params.depth, 0, 1);
771 return std::make_pair(std::move(surface), std::move(view));
772 }
773 }
774 }
775
776 TSurface new_surface = GetUncachedSurface(gpu_addr, params);
777 bool modified = false;
778
779 for (auto& surface : overlaps) {
780 const SurfaceParams& src_params = surface->GetSurfaceParams();
781 if (src_params.target != SurfaceTarget::Texture2D ||
782 src_params.height != params.height ||
783 src_params.block_depth != params.block_depth ||
784 src_params.block_height != params.block_height) {
785 return std::nullopt;
786 }
787 modified |= surface->IsModified();
788
789 const u32 offset = static_cast<u32>(surface->GetCpuAddr() - cpu_addr);
790 const u32 slice = std::get<2>(params.GetBlockOffsetXYZ(offset));
791 const u32 width = params.width;
792 const u32 height = params.height;
793 const CopyParams copy_params(0, 0, 0, 0, 0, slice, 0, 0, width, height, 1);
794 ImageCopy(surface, new_surface, copy_params);
795 }
796 for (const auto& surface : overlaps) {
797 Unregister(surface);
798 }
799 new_surface->MarkAsModified(modified, Tick());
800 Register(new_surface);
801
802 TView view = new_surface->GetMainView();
803 return std::make_pair(std::move(new_surface), std::move(view));
778 } 804 }
779 805
780 /** 806 /**
@@ -810,7 +836,7 @@ private:
810 TSurface& current_surface = iter->second; 836 TSurface& current_surface = iter->second;
811 const auto topological_result = current_surface->MatchesTopology(params); 837 const auto topological_result = current_surface->MatchesTopology(params);
812 if (topological_result != MatchTopologyResult::FullMatch) { 838 if (topological_result != MatchTopologyResult::FullMatch) {
813 std::vector<TSurface> overlaps{current_surface}; 839 VectorSurface overlaps{current_surface};
814 return RecycleSurface(overlaps, params, gpu_addr, preserve_contents, 840 return RecycleSurface(overlaps, params, gpu_addr, preserve_contents,
815 topological_result); 841 topological_result);
816 } 842 }
@@ -852,7 +878,7 @@ private:
852 } 878 }
853 } 879 }
854 880
855 // Check if it's a 3D texture 881 // Manage 3D textures
856 if (params.block_depth > 0) { 882 if (params.block_depth > 0) {
857 auto surface = 883 auto surface =
858 Manage3DSurfaces(overlaps, params, gpu_addr, cpu_addr, preserve_contents); 884 Manage3DSurfaces(overlaps, params, gpu_addr, cpu_addr, preserve_contents);
@@ -868,12 +894,9 @@ private:
868 // two things either the candidate surface is a supertexture of the overlap 894 // two things either the candidate surface is a supertexture of the overlap
869 // or they don't match in any known way. 895 // or they don't match in any known way.
870 if (!current_surface->IsInside(gpu_addr, gpu_addr + candidate_size)) { 896 if (!current_surface->IsInside(gpu_addr, gpu_addr + candidate_size)) {
871 if (current_surface->GetGpuAddr() == gpu_addr) { 897 const std::optional view = TryReconstructSurface(overlaps, params, gpu_addr);
872 std::optional<std::pair<TSurface, TView>> view = 898 if (view) {
873 TryReconstructSurface(overlaps, params, gpu_addr); 899 return *view;
874 if (view) {
875 return *view;
876 }
877 } 900 }
878 return RecycleSurface(overlaps, params, gpu_addr, preserve_contents, 901 return RecycleSurface(overlaps, params, gpu_addr, preserve_contents,
879 MatchTopologyResult::FullMatch); 902 MatchTopologyResult::FullMatch);
@@ -991,7 +1014,9 @@ private:
991 params.target = target; 1014 params.target = target;
992 params.is_tiled = false; 1015 params.is_tiled = false;
993 params.srgb_conversion = false; 1016 params.srgb_conversion = false;
994 params.is_layered = false; 1017 params.is_layered =
1018 target == SurfaceTarget::Texture1DArray || target == SurfaceTarget::Texture2DArray ||
1019 target == SurfaceTarget::TextureCubemap || target == SurfaceTarget::TextureCubeArray;
995 params.block_width = 0; 1020 params.block_width = 0;
996 params.block_height = 0; 1021 params.block_height = 0;
997 params.block_depth = 0; 1022 params.block_depth = 0;
@@ -1124,23 +1149,25 @@ private:
1124 } 1149 }
1125 } 1150 }
1126 1151
1127 std::vector<TSurface> GetSurfacesInRegion(const VAddr cpu_addr, const std::size_t size) { 1152 VectorSurface GetSurfacesInRegion(const VAddr cpu_addr, const std::size_t size) {
1128 if (size == 0) { 1153 if (size == 0) {
1129 return {}; 1154 return {};
1130 } 1155 }
1131 const VAddr cpu_addr_end = cpu_addr + size; 1156 const VAddr cpu_addr_end = cpu_addr + size;
1132 VAddr start = cpu_addr >> registry_page_bits;
1133 const VAddr end = (cpu_addr_end - 1) >> registry_page_bits; 1157 const VAddr end = (cpu_addr_end - 1) >> registry_page_bits;
1134 std::vector<TSurface> surfaces; 1158 VectorSurface surfaces;
1135 while (start <= end) { 1159 for (VAddr start = cpu_addr >> registry_page_bits; start <= end; ++start) {
1136 std::vector<TSurface>& list = registry[start]; 1160 const auto it = registry.find(start);
1137 for (auto& surface : list) { 1161 if (it == registry.end()) {
1138 if (!surface->IsPicked() && surface->Overlaps(cpu_addr, cpu_addr_end)) { 1162 continue;
1139 surface->MarkAsPicked(true); 1163 }
1140 surfaces.push_back(surface); 1164 for (auto& surface : it->second) {
1165 if (surface->IsPicked() || !surface->Overlaps(cpu_addr, cpu_addr_end)) {
1166 continue;
1141 } 1167 }
1168 surface->MarkAsPicked(true);
1169 surfaces.push_back(surface);
1142 } 1170 }
1143 start++;
1144 } 1171 }
1145 for (auto& surface : surfaces) { 1172 for (auto& surface : surfaces) {
1146 surface->MarkAsPicked(false); 1173 surface->MarkAsPicked(false);
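The single-overlap fast path in Manage3DSurfaces maps a 2D overlap's CPU address to a Z slice of the 3D surface through GetBlockOffsetXYZ. A simplified stand-in for that mapping, assuming a linear layout where one slice is pitch * height bytes (the real code handles block-linear tiling):

    #include <cstdint>
    #include <iostream>

    int main() {
        const std::uint64_t surface_base = 0x80000000;
        const std::uint64_t pitch = 1024;
        const std::uint64_t height = 256;
        const std::uint64_t bytes_per_slice = pitch * height;

        // A 2D overlap starting three slices into the 3D surface
        const std::uint64_t overlap_addr = surface_base + 3 * bytes_per_slice;
        const auto slice =
            static_cast<std::uint32_t>((overlap_addr - surface_base) / bytes_per_slice);
        std::cout << "2D overlap maps to slice " << slice << '\n'; // prints 3
    }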
diff --git a/src/yuzu/bootmanager.cpp b/src/yuzu/bootmanager.cpp
index 1adf8932b..1f5e43043 100644
--- a/src/yuzu/bootmanager.cpp
+++ b/src/yuzu/bootmanager.cpp
@@ -106,6 +106,9 @@ public:
106 format.setVersion(4, 3); 106 format.setVersion(4, 3);
107 format.setProfile(QSurfaceFormat::CompatibilityProfile); 107 format.setProfile(QSurfaceFormat::CompatibilityProfile);
108 format.setOption(QSurfaceFormat::FormatOption::DeprecatedFunctions); 108 format.setOption(QSurfaceFormat::FormatOption::DeprecatedFunctions);
109 if (Settings::values.renderer_debug) {
110 format.setOption(QSurfaceFormat::FormatOption::DebugContext);
111 }
109 // TODO: expose a setting for buffer value (ie default/single/double/triple) 112 // TODO: expose a setting for buffer value (ie default/single/double/triple)
110 format.setSwapBehavior(QSurfaceFormat::DefaultSwapBehavior); 113 format.setSwapBehavior(QSurfaceFormat::DefaultSwapBehavior);
111 format.setSwapInterval(0); 114 format.setSwapInterval(0);
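For context, requesting a debug-capable context through Qt is a one-flag change on the surface format, which is what the hunk above does when the renderer_debug setting is enabled. A hedged sketch using the standard QSurfaceFormat API:

    #include <QSurfaceFormat>

    QSurfaceFormat MakeFormat(bool renderer_debug) {
        QSurfaceFormat format;
        format.setVersion(4, 3);
        format.setProfile(QSurfaceFormat::CompatibilityProfile);
        if (renderer_debug) {
            // Asks the driver for a context where KHR_debug output can be enabled
            format.setOption(QSurfaceFormat::FormatOption::DebugContext);
        }
        return format;
    }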
diff --git a/src/yuzu/configuration/config.cpp b/src/yuzu/configuration/config.cpp
index 27775701d..32c81dc70 100644
--- a/src/yuzu/configuration/config.cpp
+++ b/src/yuzu/configuration/config.cpp
@@ -533,6 +533,8 @@ void Config::ReadDebuggingValues() {
533 Settings::values.quest_flag = ReadSetting(QStringLiteral("quest_flag"), false).toBool(); 533 Settings::values.quest_flag = ReadSetting(QStringLiteral("quest_flag"), false).toBool();
534 Settings::values.disable_cpu_opt = 534 Settings::values.disable_cpu_opt =
535 ReadSetting(QStringLiteral("disable_cpu_opt"), false).toBool(); 535 ReadSetting(QStringLiteral("disable_cpu_opt"), false).toBool();
536 Settings::values.disable_macro_jit =
537 ReadSetting(QStringLiteral("disable_macro_jit"), false).toBool();
536 538
537 qt_config->endGroup(); 539 qt_config->endGroup();
538} 540}
@@ -629,13 +631,11 @@ void Config::ReadRendererValues() {
629 static_cast<Settings::RendererBackend>(ReadSetting(QStringLiteral("backend"), 0).toInt()); 631 static_cast<Settings::RendererBackend>(ReadSetting(QStringLiteral("backend"), 0).toInt());
630 Settings::values.renderer_debug = ReadSetting(QStringLiteral("debug"), false).toBool(); 632 Settings::values.renderer_debug = ReadSetting(QStringLiteral("debug"), false).toBool();
631 Settings::values.vulkan_device = ReadSetting(QStringLiteral("vulkan_device"), 0).toInt(); 633 Settings::values.vulkan_device = ReadSetting(QStringLiteral("vulkan_device"), 0).toInt();
632 Settings::values.resolution_factor =
633 ReadSetting(QStringLiteral("resolution_factor"), 1.0).toFloat();
634 Settings::values.aspect_ratio = ReadSetting(QStringLiteral("aspect_ratio"), 0).toInt(); 634 Settings::values.aspect_ratio = ReadSetting(QStringLiteral("aspect_ratio"), 0).toInt();
635 Settings::values.max_anisotropy = ReadSetting(QStringLiteral("max_anisotropy"), 0).toInt(); 635 Settings::values.max_anisotropy = ReadSetting(QStringLiteral("max_anisotropy"), 0).toInt();
636 Settings::values.use_frame_limit = 636 Settings::values.use_frame_limit =
637 ReadSetting(QStringLiteral("use_frame_limit"), true).toBool(); 637 ReadSetting(QStringLiteral("use_frame_limit"), true).toBool();
638 Settings::values.frame_limit = ReadSetting(QStringLiteral("frame_limit"), 100).toInt(); 638 Settings::values.frame_limit = ReadSetting(QStringLiteral("frame_limit"), 100).toUInt();
639 Settings::values.use_disk_shader_cache = 639 Settings::values.use_disk_shader_cache =
640 ReadSetting(QStringLiteral("use_disk_shader_cache"), true).toBool(); 640 ReadSetting(QStringLiteral("use_disk_shader_cache"), true).toBool();
641 const int gpu_accuracy_level = ReadSetting(QStringLiteral("gpu_accuracy"), 0).toInt(); 641 const int gpu_accuracy_level = ReadSetting(QStringLiteral("gpu_accuracy"), 0).toInt();
@@ -643,6 +643,8 @@ void Config::ReadRendererValues() {
643 Settings::values.use_asynchronous_gpu_emulation = 643 Settings::values.use_asynchronous_gpu_emulation =
644 ReadSetting(QStringLiteral("use_asynchronous_gpu_emulation"), false).toBool(); 644 ReadSetting(QStringLiteral("use_asynchronous_gpu_emulation"), false).toBool();
645 Settings::values.use_vsync = ReadSetting(QStringLiteral("use_vsync"), true).toBool(); 645 Settings::values.use_vsync = ReadSetting(QStringLiteral("use_vsync"), true).toBool();
646 Settings::values.use_assembly_shaders =
647 ReadSetting(QStringLiteral("use_assembly_shaders"), false).toBool();
646 Settings::values.use_fast_gpu_time = 648 Settings::values.use_fast_gpu_time =
647 ReadSetting(QStringLiteral("use_fast_gpu_time"), true).toBool(); 649 ReadSetting(QStringLiteral("use_fast_gpu_time"), true).toBool();
648 Settings::values.force_30fps_mode = 650 Settings::values.force_30fps_mode =
@@ -718,8 +720,6 @@ void Config::ReadUIValues() {
718 .toString(); 720 .toString();
719 UISettings::values.enable_discord_presence = 721 UISettings::values.enable_discord_presence =
720 ReadSetting(QStringLiteral("enable_discord_presence"), true).toBool(); 722 ReadSetting(QStringLiteral("enable_discord_presence"), true).toBool();
721 UISettings::values.screenshot_resolution_factor =
722 static_cast<u16>(ReadSetting(QStringLiteral("screenshot_resolution_factor"), 0).toUInt());
723 UISettings::values.select_user_on_boot = 723 UISettings::values.select_user_on_boot =
724 ReadSetting(QStringLiteral("select_user_on_boot"), false).toBool(); 724 ReadSetting(QStringLiteral("select_user_on_boot"), false).toBool();
725 725
@@ -1009,6 +1009,7 @@ void Config::SaveDebuggingValues() {
1009 WriteSetting(QStringLiteral("dump_nso"), Settings::values.dump_nso, false); 1009 WriteSetting(QStringLiteral("dump_nso"), Settings::values.dump_nso, false);
1010 WriteSetting(QStringLiteral("quest_flag"), Settings::values.quest_flag, false); 1010 WriteSetting(QStringLiteral("quest_flag"), Settings::values.quest_flag, false);
1011 WriteSetting(QStringLiteral("disable_cpu_opt"), Settings::values.disable_cpu_opt, false); 1011 WriteSetting(QStringLiteral("disable_cpu_opt"), Settings::values.disable_cpu_opt, false);
1012 WriteSetting(QStringLiteral("disable_macro_jit"), Settings::values.disable_macro_jit, false);
1012 1013
1013 qt_config->endGroup(); 1014 qt_config->endGroup();
1014} 1015}
@@ -1077,8 +1078,6 @@ void Config::SaveRendererValues() {
1077 WriteSetting(QStringLiteral("backend"), static_cast<int>(Settings::values.renderer_backend), 0); 1078 WriteSetting(QStringLiteral("backend"), static_cast<int>(Settings::values.renderer_backend), 0);
1078 WriteSetting(QStringLiteral("debug"), Settings::values.renderer_debug, false); 1079 WriteSetting(QStringLiteral("debug"), Settings::values.renderer_debug, false);
1079 WriteSetting(QStringLiteral("vulkan_device"), Settings::values.vulkan_device, 0); 1080 WriteSetting(QStringLiteral("vulkan_device"), Settings::values.vulkan_device, 0);
1080 WriteSetting(QStringLiteral("resolution_factor"),
1081 static_cast<double>(Settings::values.resolution_factor), 1.0);
1082 WriteSetting(QStringLiteral("aspect_ratio"), Settings::values.aspect_ratio, 0); 1081 WriteSetting(QStringLiteral("aspect_ratio"), Settings::values.aspect_ratio, 0);
1083 WriteSetting(QStringLiteral("max_anisotropy"), Settings::values.max_anisotropy, 0); 1082 WriteSetting(QStringLiteral("max_anisotropy"), Settings::values.max_anisotropy, 0);
1084 WriteSetting(QStringLiteral("use_frame_limit"), Settings::values.use_frame_limit, true); 1083 WriteSetting(QStringLiteral("use_frame_limit"), Settings::values.use_frame_limit, true);
@@ -1090,6 +1089,8 @@ void Config::SaveRendererValues() {
1090 WriteSetting(QStringLiteral("use_asynchronous_gpu_emulation"), 1089 WriteSetting(QStringLiteral("use_asynchronous_gpu_emulation"),
1091 Settings::values.use_asynchronous_gpu_emulation, false); 1090 Settings::values.use_asynchronous_gpu_emulation, false);
1092 WriteSetting(QStringLiteral("use_vsync"), Settings::values.use_vsync, true); 1091 WriteSetting(QStringLiteral("use_vsync"), Settings::values.use_vsync, true);
1092 WriteSetting(QStringLiteral("use_assembly_shaders"), Settings::values.use_assembly_shaders,
1093 false);
1093 WriteSetting(QStringLiteral("use_fast_gpu_time"), Settings::values.use_fast_gpu_time, true); 1094 WriteSetting(QStringLiteral("use_fast_gpu_time"), Settings::values.use_fast_gpu_time, true);
1094 WriteSetting(QStringLiteral("force_30fps_mode"), Settings::values.force_30fps_mode, false); 1095 WriteSetting(QStringLiteral("force_30fps_mode"), Settings::values.force_30fps_mode, false);
1095 1096
@@ -1152,8 +1153,6 @@ void Config::SaveUIValues() {
1152 QString::fromUtf8(UISettings::themes[0].second)); 1153 QString::fromUtf8(UISettings::themes[0].second));
1153 WriteSetting(QStringLiteral("enable_discord_presence"), 1154 WriteSetting(QStringLiteral("enable_discord_presence"),
1154 UISettings::values.enable_discord_presence, true); 1155 UISettings::values.enable_discord_presence, true);
1155 WriteSetting(QStringLiteral("screenshot_resolution_factor"),
1156 UISettings::values.screenshot_resolution_factor, 0);
1157 WriteSetting(QStringLiteral("select_user_on_boot"), UISettings::values.select_user_on_boot, 1156 WriteSetting(QStringLiteral("select_user_on_boot"), UISettings::values.select_user_on_boot,
1158 false); 1157 false);
1159 1158
diff --git a/src/yuzu/configuration/configure_debug.cpp b/src/yuzu/configuration/configure_debug.cpp
index c2026763e..2c77441fd 100644
--- a/src/yuzu/configuration/configure_debug.cpp
+++ b/src/yuzu/configuration/configure_debug.cpp
@@ -39,6 +39,8 @@ void ConfigureDebug::SetConfiguration() {
39 ui->disable_cpu_opt->setChecked(Settings::values.disable_cpu_opt); 39 ui->disable_cpu_opt->setChecked(Settings::values.disable_cpu_opt);
40 ui->enable_graphics_debugging->setEnabled(!Core::System::GetInstance().IsPoweredOn()); 40 ui->enable_graphics_debugging->setEnabled(!Core::System::GetInstance().IsPoweredOn());
41 ui->enable_graphics_debugging->setChecked(Settings::values.renderer_debug); 41 ui->enable_graphics_debugging->setChecked(Settings::values.renderer_debug);
42 ui->disable_macro_jit->setEnabled(!Core::System::GetInstance().IsPoweredOn());
43 ui->disable_macro_jit->setChecked(Settings::values.disable_macro_jit);
42} 44}
43 45
44void ConfigureDebug::ApplyConfiguration() { 46void ConfigureDebug::ApplyConfiguration() {
@@ -51,6 +53,7 @@ void ConfigureDebug::ApplyConfiguration() {
51 Settings::values.quest_flag = ui->quest_flag->isChecked(); 53 Settings::values.quest_flag = ui->quest_flag->isChecked();
52 Settings::values.disable_cpu_opt = ui->disable_cpu_opt->isChecked(); 54 Settings::values.disable_cpu_opt = ui->disable_cpu_opt->isChecked();
53 Settings::values.renderer_debug = ui->enable_graphics_debugging->isChecked(); 55 Settings::values.renderer_debug = ui->enable_graphics_debugging->isChecked();
56 Settings::values.disable_macro_jit = ui->disable_macro_jit->isChecked();
54 Debugger::ToggleConsole(); 57 Debugger::ToggleConsole();
55 Log::Filter filter; 58 Log::Filter filter;
56 filter.ParseFilterString(Settings::values.log_filter); 59 filter.ParseFilterString(Settings::values.log_filter);
diff --git a/src/yuzu/configuration/configure_debug.ui b/src/yuzu/configuration/configure_debug.ui
index e0d4c4a44..46f0208c6 100644
--- a/src/yuzu/configuration/configure_debug.ui
+++ b/src/yuzu/configuration/configure_debug.ui
@@ -148,6 +148,19 @@
148 </property> 148 </property>
149 </widget> 149 </widget>
150 </item> 150 </item>
151 <item>
152 <widget class="QCheckBox" name="disable_macro_jit">
153 <property name="enabled">
154 <bool>true</bool>
155 </property>
156 <property name="whatsThis">
157 <string>When checked, it disables the macro Just In Time compiler. Enabling this makes games run slower</string>
158 </property>
159 <property name="text">
160 <string>Disable Macro JIT</string>
161 </property>
162 </widget>
163 </item>
151 </layout> 164 </layout>
152 </widget> 165 </widget>
153 </item> 166 </item>
diff --git a/src/yuzu/configuration/configure_graphics.cpp b/src/yuzu/configuration/configure_graphics.cpp
index ea667caef..304625cd7 100644
--- a/src/yuzu/configuration/configure_graphics.cpp
+++ b/src/yuzu/configuration/configure_graphics.cpp
@@ -19,47 +19,6 @@
19#include "video_core/renderer_vulkan/renderer_vulkan.h" 19#include "video_core/renderer_vulkan/renderer_vulkan.h"
20#endif 20#endif
21 21
22namespace {
23enum class Resolution : int {
24 Auto,
25 Scale1x,
26 Scale2x,
27 Scale3x,
28 Scale4x,
29};
30
31float ToResolutionFactor(Resolution option) {
32 switch (option) {
33 case Resolution::Auto:
34 return 0.f;
35 case Resolution::Scale1x:
36 return 1.f;
37 case Resolution::Scale2x:
38 return 2.f;
39 case Resolution::Scale3x:
40 return 3.f;
41 case Resolution::Scale4x:
42 return 4.f;
43 }
44 return 0.f;
45}
46
47Resolution FromResolutionFactor(float factor) {
48 if (factor == 0.f) {
49 return Resolution::Auto;
50 } else if (factor == 1.f) {
51 return Resolution::Scale1x;
52 } else if (factor == 2.f) {
53 return Resolution::Scale2x;
54 } else if (factor == 3.f) {
55 return Resolution::Scale3x;
56 } else if (factor == 4.f) {
57 return Resolution::Scale4x;
58 }
59 return Resolution::Auto;
60}
61} // Anonymous namespace
62
63ConfigureGraphics::ConfigureGraphics(QWidget* parent) 22ConfigureGraphics::ConfigureGraphics(QWidget* parent)
64 : QWidget(parent), ui(new Ui::ConfigureGraphics) { 23 : QWidget(parent), ui(new Ui::ConfigureGraphics) {
65 vulkan_device = Settings::values.vulkan_device; 24 vulkan_device = Settings::values.vulkan_device;
@@ -99,8 +58,6 @@ void ConfigureGraphics::SetConfiguration() {
99 58
100 ui->api->setEnabled(runtime_lock); 59 ui->api->setEnabled(runtime_lock);
101 ui->api->setCurrentIndex(static_cast<int>(Settings::values.renderer_backend)); 60 ui->api->setCurrentIndex(static_cast<int>(Settings::values.renderer_backend));
102 ui->resolution_factor_combobox->setCurrentIndex(
103 static_cast<int>(FromResolutionFactor(Settings::values.resolution_factor)));
104 ui->aspect_ratio_combobox->setCurrentIndex(Settings::values.aspect_ratio); 61 ui->aspect_ratio_combobox->setCurrentIndex(Settings::values.aspect_ratio);
105 ui->use_disk_shader_cache->setEnabled(runtime_lock); 62 ui->use_disk_shader_cache->setEnabled(runtime_lock);
106 ui->use_disk_shader_cache->setChecked(Settings::values.use_disk_shader_cache); 63 ui->use_disk_shader_cache->setChecked(Settings::values.use_disk_shader_cache);
@@ -114,8 +71,6 @@ void ConfigureGraphics::SetConfiguration() {
114void ConfigureGraphics::ApplyConfiguration() { 71void ConfigureGraphics::ApplyConfiguration() {
115 Settings::values.renderer_backend = GetCurrentGraphicsBackend(); 72 Settings::values.renderer_backend = GetCurrentGraphicsBackend();
116 Settings::values.vulkan_device = vulkan_device; 73 Settings::values.vulkan_device = vulkan_device;
117 Settings::values.resolution_factor =
118 ToResolutionFactor(static_cast<Resolution>(ui->resolution_factor_combobox->currentIndex()));
119 Settings::values.aspect_ratio = ui->aspect_ratio_combobox->currentIndex(); 74 Settings::values.aspect_ratio = ui->aspect_ratio_combobox->currentIndex();
120 Settings::values.use_disk_shader_cache = ui->use_disk_shader_cache->isChecked(); 75 Settings::values.use_disk_shader_cache = ui->use_disk_shader_cache->isChecked();
121 Settings::values.use_asynchronous_gpu_emulation = 76 Settings::values.use_asynchronous_gpu_emulation =
diff --git a/src/yuzu/configuration/configure_graphics.ui b/src/yuzu/configuration/configure_graphics.ui
index c816d6108..6e75447a5 100644
--- a/src/yuzu/configuration/configure_graphics.ui
+++ b/src/yuzu/configuration/configure_graphics.ui
@@ -85,46 +85,6 @@
85 </widget> 85 </widget>
86 </item> 86 </item>
87 <item> 87 <item>
88 <layout class="QHBoxLayout" name="horizontalLayout_2">
89 <item>
90 <widget class="QLabel" name="label">
91 <property name="text">
92 <string>Internal Resolution:</string>
93 </property>
94 </widget>
95 </item>
96 <item>
97 <widget class="QComboBox" name="resolution_factor_combobox">
98 <item>
99 <property name="text">
100 <string>Auto (Window Size)</string>
101 </property>
102 </item>
103 <item>
104 <property name="text">
105 <string>Native (1280x720)</string>
106 </property>
107 </item>
108 <item>
109 <property name="text">
110 <string>2x Native (2560x1440)</string>
111 </property>
112 </item>
113 <item>
114 <property name="text">
115 <string>3x Native (3840x2160)</string>
116 </property>
117 </item>
118 <item>
119 <property name="text">
120 <string>4x Native (5120x2880)</string>
121 </property>
122 </item>
123 </widget>
124 </item>
125 </layout>
126 </item>
127 <item>
128 <layout class="QHBoxLayout" name="horizontalLayout_6"> 88 <layout class="QHBoxLayout" name="horizontalLayout_6">
129 <item> 89 <item>
130 <widget class="QLabel" name="ar_label"> 90 <widget class="QLabel" name="ar_label">
diff --git a/src/yuzu/configuration/configure_graphics_advanced.cpp b/src/yuzu/configuration/configure_graphics_advanced.cpp
index 5bb2ae555..be5006ad3 100644
--- a/src/yuzu/configuration/configure_graphics_advanced.cpp
+++ b/src/yuzu/configuration/configure_graphics_advanced.cpp
@@ -22,6 +22,8 @@ void ConfigureGraphicsAdvanced::SetConfiguration() {
     ui->gpu_accuracy->setCurrentIndex(static_cast<int>(Settings::values.gpu_accuracy));
     ui->use_vsync->setEnabled(runtime_lock);
     ui->use_vsync->setChecked(Settings::values.use_vsync);
+    ui->use_assembly_shaders->setEnabled(runtime_lock);
+    ui->use_assembly_shaders->setChecked(Settings::values.use_assembly_shaders);
     ui->use_fast_gpu_time->setChecked(Settings::values.use_fast_gpu_time);
     ui->force_30fps_mode->setEnabled(runtime_lock);
     ui->force_30fps_mode->setChecked(Settings::values.force_30fps_mode);
@@ -33,6 +35,7 @@ void ConfigureGraphicsAdvanced::ApplyConfiguration() {
     auto gpu_accuracy = static_cast<Settings::GPUAccuracy>(ui->gpu_accuracy->currentIndex());
     Settings::values.gpu_accuracy = gpu_accuracy;
     Settings::values.use_vsync = ui->use_vsync->isChecked();
+    Settings::values.use_assembly_shaders = ui->use_assembly_shaders->isChecked();
     Settings::values.use_fast_gpu_time = ui->use_fast_gpu_time->isChecked();
     Settings::values.force_30fps_mode = ui->force_30fps_mode->isChecked();
     Settings::values.max_anisotropy = ui->anisotropic_filtering_combobox->currentIndex();
diff --git a/src/yuzu/configuration/configure_graphics_advanced.ui b/src/yuzu/configuration/configure_graphics_advanced.ui
index 770b80c50..0021607ac 100644
--- a/src/yuzu/configuration/configure_graphics_advanced.ui
+++ b/src/yuzu/configuration/configure_graphics_advanced.ui
@@ -63,6 +63,16 @@
     </widget>
    </item>
    <item>
+    <widget class="QCheckBox" name="use_assembly_shaders">
+     <property name="toolTip">
+      <string>Enabling this reduces shader stutter. Enables OpenGL assembly shaders on supported Nvidia devices (NV_gpu_program5 is required). This feature is experimental.</string>
+     </property>
+     <property name="text">
+      <string>Use assembly shaders (experimental, Nvidia OpenGL only)</string>
+     </property>
+    </widget>
+   </item>
+   <item>
     <widget class="QCheckBox" name="force_30fps_mode">
      <property name="text">
       <string>Force 30 FPS mode</string>
diff --git a/src/yuzu/configuration/configure_input_player.cpp b/src/yuzu/configuration/configure_input_player.cpp
index e4eb5594b..a05fa64ba 100644
--- a/src/yuzu/configuration/configure_input_player.cpp
+++ b/src/yuzu/configuration/configure_input_player.cpp
@@ -480,7 +480,9 @@ void ConfigureInputPlayer::RestoreDefaults() {
             SetAnalogButton(params, analogs_param[analog_id], analog_sub_buttons[sub_button_id]);
         }
     }
+
     UpdateButtonLabels();
+    ApplyConfiguration();
 }
 
 void ConfigureInputPlayer::ClearAll() {
@@ -505,6 +507,7 @@ void ConfigureInputPlayer::ClearAll() {
     }
 
     UpdateButtonLabels();
+    ApplyConfiguration();
 }
 
 void ConfigureInputPlayer::UpdateButtonLabels() {
diff --git a/src/yuzu/discord_impl.cpp b/src/yuzu/discord_impl.cpp
index ea0079353..a93733b26 100644
--- a/src/yuzu/discord_impl.cpp
+++ b/src/yuzu/discord_impl.cpp
@@ -18,7 +18,7 @@ DiscordImpl::DiscordImpl() {
 
     // The number is the client ID for yuzu, it's used for images and the
     // application name
-    Discord_Initialize("471872241299226636", &handlers, 1, nullptr);
+    Discord_Initialize("712465656758665259", &handlers, 1, nullptr);
 }
 
 DiscordImpl::~DiscordImpl() {
diff --git a/src/yuzu/main.cpp b/src/yuzu/main.cpp
index 0b291c7d0..4119d7907 100644
--- a/src/yuzu/main.cpp
+++ b/src/yuzu/main.cpp
@@ -65,6 +65,7 @@ static FileSys::VirtualFile VfsDirectoryCreateFileWrapper(const FileSys::Virtual
 #include "common/logging/backend.h"
 #include "common/logging/filter.h"
 #include "common/logging/log.h"
+#include "common/memory_detect.h"
 #include "common/microprofile.h"
 #include "common/scm_rev.h"
 #include "common/scope_exit.h"
@@ -219,6 +220,10 @@ GMainWindow::GMainWindow()
     LOG_INFO(Frontend, "Host CPU: {}", Common::GetCPUCaps().cpu_string);
 #endif
     LOG_INFO(Frontend, "Host OS: {}", QSysInfo::prettyProductName().toStdString());
+    LOG_INFO(Frontend, "Host RAM: {:.2f} GB",
+             Common::GetMemInfo().TotalPhysicalMemory / 1024.0f / 1024 / 1024);
+    LOG_INFO(Frontend, "Host Swap: {:.2f} GB",
+             Common::GetMemInfo().TotalSwapMemory / 1024.0f / 1024 / 1024);
     UpdateWindowTitle();
 
     show();
@@ -684,10 +689,7 @@ void GMainWindow::InitializeHotkeys() {
         Settings::values.use_frame_limit = !Settings::values.use_frame_limit;
         UpdateStatusBar();
     });
-    // TODO: Remove this comment/static whenever the next major release of
-    // MSVC occurs and we make it a requirement (see:
-    // https://developercommunity.visualstudio.com/content/problem/93922/constexprs-are-trying-to-be-captured-in-lambda-fun.html)
-    static constexpr u16 SPEED_LIMIT_STEP = 5;
+    constexpr u16 SPEED_LIMIT_STEP = 5;
     connect(hotkey_registry.GetHotkey(main_window, QStringLiteral("Increase Speed Limit"), this),
             &QShortcut::activated, this, [&] {
                 if (Settings::values.frame_limit < 9999 - SPEED_LIMIT_STEP) {
diff --git a/src/yuzu_cmd/config.cpp b/src/yuzu_cmd/config.cpp
index 2348e6e0d..659b9f701 100644
--- a/src/yuzu_cmd/config.cpp
+++ b/src/yuzu_cmd/config.cpp
@@ -380,8 +380,6 @@ void Config::ReadValues() {
     Settings::values.renderer_debug = sdl2_config->GetBoolean("Renderer", "debug", false);
     Settings::values.vulkan_device = sdl2_config->GetInteger("Renderer", "vulkan_device", 0);
 
-    Settings::values.resolution_factor =
-        static_cast<float>(sdl2_config->GetReal("Renderer", "resolution_factor", 1.0));
     Settings::values.aspect_ratio =
         static_cast<int>(sdl2_config->GetInteger("Renderer", "aspect_ratio", 0));
     Settings::values.max_anisotropy =
@@ -397,6 +395,8 @@ void Config::ReadValues() {
         sdl2_config->GetBoolean("Renderer", "use_asynchronous_gpu_emulation", false);
     Settings::values.use_vsync =
         static_cast<u16>(sdl2_config->GetInteger("Renderer", "use_vsync", 1));
+    Settings::values.use_assembly_shaders =
+        sdl2_config->GetBoolean("Renderer", "use_assembly_shaders", false);
     Settings::values.use_fast_gpu_time =
         sdl2_config->GetBoolean("Renderer", "use_fast_gpu_time", true);
 
402 402
@@ -430,6 +430,8 @@ void Config::ReadValues() {
     Settings::values.quest_flag = sdl2_config->GetBoolean("Debugging", "quest_flag", false);
     Settings::values.disable_cpu_opt =
         sdl2_config->GetBoolean("Debugging", "disable_cpu_opt", false);
+    Settings::values.disable_macro_jit =
+        sdl2_config->GetBoolean("Debugging", "disable_macro_jit", false);
 
     const auto title_list = sdl2_config->Get("AddOns", "title_ids", "");
     std::stringstream ss(title_list);
diff --git a/src/yuzu_cmd/default_ini.h b/src/yuzu_cmd/default_ini.h
index ae94b51c4..45c07ed5d 100644
--- a/src/yuzu_cmd/default_ini.h
+++ b/src/yuzu_cmd/default_ini.h
@@ -117,11 +117,6 @@ use_hw_renderer =
 # 0: Interpreter (slow), 1 (default): JIT (fast)
 use_shader_jit =
 
-# Resolution scale factor
-# 0: Auto (scales resolution to window size), 1: Native Switch screen resolution, Otherwise a scale
-# factor for the Switch resolution
-resolution_factor =
-
 # Aspect ratio
 # 0: Default (16:9), 1: Force 4:3, 2: Force 21:9, 3: Stretch to Window
 aspect_ratio =
@@ -134,6 +129,10 @@ max_anisotropy =
 # 0 (default): Off, 1: On
 use_vsync =
 
+# Whether to use OpenGL assembly shaders or not. NV_gpu_program5 is required.
+# 0 (default): Off, 1: On
+use_assembly_shaders =
+
 # Turns on the frame limiter, which will limit frames output to the target game speed
 # 0: Off, 1: On (default)
 use_frame_limit =
@@ -287,6 +286,8 @@ quest_flag =
 # Determines whether or not JIT CPU optimizations are enabled
 # false: Optimizations Enabled, true: Optimizations Disabled
 disable_cpu_opt =
+# Enables/Disables the macro JIT compiler
+disable_macro_jit=false
 
 [WebService]
 # Whether or not to enable telemetry
292# Whether or not to enable telemetry 293# Whether or not to enable telemetry
diff --git a/src/yuzu_cmd/emu_window/emu_window_sdl2_gl.cpp b/src/yuzu_cmd/emu_window/emu_window_sdl2_gl.cpp
index 411e7e647..09cc0a3b5 100644
--- a/src/yuzu_cmd/emu_window/emu_window_sdl2_gl.cpp
+++ b/src/yuzu_cmd/emu_window/emu_window_sdl2_gl.cpp
@@ -98,6 +98,9 @@ EmuWindow_SDL2_GL::EmuWindow_SDL2_GL(Core::System& system, bool fullscreen)
     SDL_GL_SetAttribute(SDL_GL_BLUE_SIZE, 8);
     SDL_GL_SetAttribute(SDL_GL_ALPHA_SIZE, 0);
     SDL_GL_SetAttribute(SDL_GL_SHARE_WITH_CURRENT_CONTEXT, 1);
+    if (Settings::values.renderer_debug) {
+        SDL_GL_SetAttribute(SDL_GL_CONTEXT_FLAGS, SDL_GL_CONTEXT_DEBUG_FLAG);
+    }
     SDL_GL_SetSwapInterval(0);
 
     std::string window_title = fmt::format("yuzu {} | {}-{}", Common::g_build_fullname,
diff --git a/src/yuzu_tester/config.cpp b/src/yuzu_tester/config.cpp
index 3be58b15d..1566c2e3f 100644
--- a/src/yuzu_tester/config.cpp
+++ b/src/yuzu_tester/config.cpp
@@ -116,8 +116,6 @@ void Config::ReadValues() {
     Settings::values.use_multi_core = sdl2_config->GetBoolean("Core", "use_multi_core", false);
 
     // Renderer
-    Settings::values.resolution_factor =
-        static_cast<float>(sdl2_config->GetReal("Renderer", "resolution_factor", 1.0));
     Settings::values.aspect_ratio =
         static_cast<int>(sdl2_config->GetInteger("Renderer", "aspect_ratio", 0));
     Settings::values.max_anisotropy =
diff --git a/src/yuzu_tester/default_ini.h b/src/yuzu_tester/default_ini.h
index ca203b64d..41bbbbf60 100644
--- a/src/yuzu_tester/default_ini.h
+++ b/src/yuzu_tester/default_ini.h
@@ -21,11 +21,6 @@ use_hw_renderer =
 # 0: Interpreter (slow), 1 (default): JIT (fast)
 use_shader_jit =
 
-# Resolution scale factor
-# 0: Auto (scales resolution to window size), 1: Native Switch screen resolution, Otherwise a scale
-# factor for the Switch resolution
-resolution_factor =
-
 # Aspect ratio
 # 0: Default (16:9), 1: Force 4:3, 2: Force 21:9, 3: Stretch to Window
 aspect_ratio =