Diffstat (limited to 'src')
-rw-r--r--  src/common/CMakeLists.txt | 8
-rw-r--r--  src/common/page_table.cpp | 12
-rw-r--r--  src/common/page_table.h | 15
-rw-r--r--  src/core/hle/service/time/time_zone_content_manager.cpp | 2
-rw-r--r--  src/core/settings.h | 1
-rw-r--r--  src/input_common/udp/udp.cpp | 3
-rw-r--r--  src/video_core/CMakeLists.txt | 8
-rw-r--r--  src/video_core/dirty_flags.cpp | 8
-rw-r--r--  src/video_core/dirty_flags.h | 2
-rw-r--r--  src/video_core/engines/const_buffer_engine_interface.h | 67
-rw-r--r--  src/video_core/engines/kepler_compute.cpp | 10
-rw-r--r--  src/video_core/engines/maxwell_3d.cpp | 2
-rw-r--r--  src/video_core/engines/maxwell_3d.h | 67
-rw-r--r--  src/video_core/engines/shader_bytecode.h | 11
-rw-r--r--  src/video_core/gpu.h | 1
-rw-r--r--  src/video_core/guest_driver.cpp | 7
-rw-r--r--  src/video_core/guest_driver.h | 21
-rw-r--r--  src/video_core/memory_manager.h | 2
-rw-r--r--  src/video_core/morton.cpp | 2
-rw-r--r--  src/video_core/rasterizer_interface.h | 1
-rw-r--r--  src/video_core/renderer_opengl/gl_rasterizer.cpp | 160
-rw-r--r--  src/video_core/renderer_opengl/gl_rasterizer.h | 26
-rw-r--r--  src/video_core/renderer_opengl/gl_shader_cache.cpp | 510
-rw-r--r--  src/video_core/renderer_opengl/gl_shader_cache.h | 99
-rw-r--r--  src/video_core/renderer_opengl/gl_shader_decompiler.cpp | 426
-rw-r--r--  src/video_core/renderer_opengl/gl_shader_decompiler.h | 22
-rw-r--r--  src/video_core/renderer_opengl/gl_shader_disk_cache.cpp | 404
-rw-r--r--  src/video_core/renderer_opengl/gl_shader_disk_cache.h | 153
-rw-r--r--  src/video_core/renderer_opengl/gl_shader_gen.cpp | 109
-rw-r--r--  src/video_core/renderer_opengl/gl_shader_gen.h | 34
-rw-r--r--  src/video_core/renderer_opengl/gl_state_tracker.cpp | 11
-rw-r--r--  src/video_core/renderer_opengl/gl_state_tracker.h | 11
-rw-r--r--  src/video_core/renderer_opengl/gl_texture_cache.cpp | 1
-rw-r--r--  src/video_core/renderer_opengl/maxwell_to_gl.h | 13
-rw-r--r--  src/video_core/renderer_opengl/renderer_opengl.cpp | 290
-rw-r--r--  src/video_core/renderer_vulkan/maxwell_to_vk.cpp | 9
-rw-r--r--  src/video_core/renderer_vulkan/vk_device.cpp | 48
-rw-r--r--  src/video_core/renderer_vulkan/vk_device.h | 45
-rw-r--r--  src/video_core/renderer_vulkan/vk_pipeline_cache.cpp | 27
-rw-r--r--  src/video_core/renderer_vulkan/vk_pipeline_cache.h | 14
-rw-r--r--  src/video_core/renderer_vulkan/vk_rasterizer.cpp | 132
-rw-r--r--  src/video_core/renderer_vulkan/vk_rasterizer.h | 5
-rw-r--r--  src/video_core/renderer_vulkan/vk_shader_decompiler.cpp | 138
-rw-r--r--  src/video_core/renderer_vulkan/vk_shader_decompiler.h | 13
-rw-r--r--  src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp | 2
-rw-r--r--  src/video_core/renderer_vulkan/vk_state_tracker.cpp | 2
-rw-r--r--  src/video_core/renderer_vulkan/vk_texture_cache.cpp | 6
-rw-r--r--  src/video_core/shader/const_buffer_locker.cpp | 126
-rw-r--r--  src/video_core/shader/const_buffer_locker.h | 103
-rw-r--r--  src/video_core/shader/control_flow.cpp | 13
-rw-r--r--  src/video_core/shader/control_flow.h | 3
-rw-r--r--  src/video_core/shader/decode.cpp | 22
-rw-r--r--  src/video_core/shader/decode/bfe.cpp | 69
-rw-r--r--  src/video_core/shader/decode/texture.cpp | 5
-rw-r--r--  src/video_core/shader/node_helper.cpp | 2
-rw-r--r--  src/video_core/shader/registry.cpp | 161
-rw-r--r--  src/video_core/shader/registry.h | 137
-rw-r--r--  src/video_core/shader/shader_ir.cpp | 5
-rw-r--r--  src/video_core/shader/shader_ir.h | 6
-rw-r--r--  src/video_core/shader/track.cpp | 18
-rw-r--r--  src/video_core/shader/transform_feedback.cpp | 115
-rw-r--r--  src/video_core/shader/transform_feedback.h | 23
-rw-r--r--  src/video_core/surface.cpp | 2
-rw-r--r--  src/video_core/surface.h | 142
-rw-r--r--  src/video_core/texture_cache/format_lookup_table.cpp | 3
-rw-r--r--  src/video_core/texture_cache/surface_params.cpp | 6
-rw-r--r--  src/video_core/texture_cache/texture_cache.h | 41
-rw-r--r--  src/video_core/textures/astc.cpp | 1074
-rw-r--r--  src/yuzu/configuration/config.cpp | 4
-rw-r--r--  src/yuzu/configuration/configure_system.cpp | 2
-rw-r--r--  src/yuzu/loading_screen.cpp | 17
71 files changed, 2658 insertions(+), 2401 deletions(-)
diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt
index 9afc6105d..fbebed715 100644
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -38,8 +38,6 @@ add_custom_command(OUTPUT scm_rev.cpp
         "${VIDEO_CORE}/renderer_opengl/gl_shader_decompiler.h"
         "${VIDEO_CORE}/renderer_opengl/gl_shader_disk_cache.cpp"
         "${VIDEO_CORE}/renderer_opengl/gl_shader_disk_cache.h"
-        "${VIDEO_CORE}/renderer_opengl/gl_shader_gen.cpp"
-        "${VIDEO_CORE}/renderer_opengl/gl_shader_gen.h"
         "${VIDEO_CORE}/shader/decode/arithmetic.cpp"
         "${VIDEO_CORE}/shader/decode/arithmetic_half.cpp"
         "${VIDEO_CORE}/shader/decode/arithmetic_half_immediate.cpp"
@@ -72,8 +70,6 @@ add_custom_command(OUTPUT scm_rev.cpp
         "${VIDEO_CORE}/shader/ast.h"
         "${VIDEO_CORE}/shader/compiler_settings.cpp"
         "${VIDEO_CORE}/shader/compiler_settings.h"
-        "${VIDEO_CORE}/shader/const_buffer_locker.cpp"
-        "${VIDEO_CORE}/shader/const_buffer_locker.h"
         "${VIDEO_CORE}/shader/control_flow.cpp"
         "${VIDEO_CORE}/shader/control_flow.h"
         "${VIDEO_CORE}/shader/decode.cpp"
@@ -82,9 +78,13 @@ add_custom_command(OUTPUT scm_rev.cpp
         "${VIDEO_CORE}/shader/node.h"
         "${VIDEO_CORE}/shader/node_helper.cpp"
         "${VIDEO_CORE}/shader/node_helper.h"
+        "${VIDEO_CORE}/shader/registry.cpp"
+        "${VIDEO_CORE}/shader/registry.h"
         "${VIDEO_CORE}/shader/shader_ir.cpp"
         "${VIDEO_CORE}/shader/shader_ir.h"
         "${VIDEO_CORE}/shader/track.cpp"
+        "${VIDEO_CORE}/shader/transform_feedback.cpp"
+        "${VIDEO_CORE}/shader/transform_feedback.h"
         # and also check that the scm_rev files haven't changed
         "${CMAKE_CURRENT_SOURCE_DIR}/scm_rev.cpp.in"
         "${CMAKE_CURRENT_SOURCE_DIR}/scm_rev.h"
diff --git a/src/common/page_table.cpp b/src/common/page_table.cpp
index 69b7abc54..566b57b62 100644
--- a/src/common/page_table.cpp
+++ b/src/common/page_table.cpp
@@ -16,7 +16,6 @@ void PageTable::Resize(std::size_t address_space_width_in_bits) {

     pointers.resize(num_page_table_entries);
     attributes.resize(num_page_table_entries);
-    backing_addr.resize(num_page_table_entries);

     // The default is a 39-bit address space, which causes an initial 1GB allocation size. If the
     // vector size is subsequently decreased (via resize), the vector might not automatically
@@ -25,6 +24,17 @@ void PageTable::Resize(std::size_t address_space_width_in_bits) {

     pointers.shrink_to_fit();
     attributes.shrink_to_fit();
+}
+
+BackingPageTable::BackingPageTable(std::size_t page_size_in_bits) : PageTable{page_size_in_bits} {}
+
+BackingPageTable::~BackingPageTable() = default;
+
+void BackingPageTable::Resize(std::size_t address_space_width_in_bits) {
+    PageTable::Resize(address_space_width_in_bits);
+    const std::size_t num_page_table_entries = 1ULL
+                                               << (address_space_width_in_bits - page_size_in_bits);
+    backing_addr.resize(num_page_table_entries);
     backing_addr.shrink_to_fit();
 }

diff --git a/src/common/page_table.h b/src/common/page_table.h
index 8b8ff0bb8..dbc272ab7 100644
--- a/src/common/page_table.h
+++ b/src/common/page_table.h
@@ -76,9 +76,20 @@ struct PageTable {
      */
     std::vector<PageType> attributes;

-    std::vector<u64> backing_addr;
-
     const std::size_t page_size_in_bits{};
 };

+/**
+ * A more advanced Page Table with the ability to save a backing address when using it
+ * depends on another MMU.
+ */
+struct BackingPageTable : PageTable {
+    explicit BackingPageTable(std::size_t page_size_in_bits);
+    ~BackingPageTable();
+
+    void Resize(std::size_t address_space_width_in_bits);
+
+    std::vector<u64> backing_addr;
+};
+
 } // namespace Common
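As a side note, here is a minimal usage sketch of the new struct (my own illustration, not part of the commit; the page size, address-space width and addresses are made up). BackingPageTable behaves like PageTable, but its Resize() also grows backing_addr, so a GPU-style MMU can record the host address backing each mapped page.

    #include "common/page_table.h"

    void Example() {
        Common::BackingPageTable table{16}; // hypothetical 64 KiB pages (16 page bits)
        table.Resize(40);                   // hypothetical 40-bit address space

        // pointers, attributes and backing_addr now all hold 1 << (40 - 16) entries,
        // so a backing (host) address can be stored per page:
        table.backing_addr[0x1234] = 0x0000'7f00'0000'0000ULL;
    }
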
diff --git a/src/core/hle/service/time/time_zone_content_manager.cpp b/src/core/hle/service/time/time_zone_content_manager.cpp
index 57b1a2bca..78d4acd95 100644
--- a/src/core/hle/service/time/time_zone_content_manager.cpp
+++ b/src/core/hle/service/time/time_zone_content_manager.cpp
@@ -53,7 +53,7 @@ static std::vector<std::string> BuildLocationNameCache(Core::System& system) {
         return {};
     }

-    std::vector<char> raw_data(binary_list->GetSize());
+    std::vector<char> raw_data(binary_list->GetSize() + 1);
     binary_list->ReadBytes<char>(raw_data.data(), binary_list->GetSize());

     std::stringstream data_stream{raw_data.data()};
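For context, a small sketch of why the extra element matters (my reading of the change, with a hypothetical helper name): raw_data.data() is handed to std::stringstream as a const char*, so the buffer must be NUL-terminated; value-initialized std::vector<char> elements are zero, so sizing the vector one past the payload guarantees a terminator.

    #include <algorithm>
    #include <cstddef>
    #include <sstream>
    #include <string>
    #include <vector>

    std::string FirstLine(const char* bytes, std::size_t size) {
        std::vector<char> raw(size + 1);              // the extra element stays '\0'
        std::copy(bytes, bytes + size, raw.begin());
        std::stringstream stream{raw.data()};         // safe: NUL-terminated C string
        std::string line;
        std::getline(stream, line);
        return line;
    }
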
diff --git a/src/core/settings.h b/src/core/settings.h
index cb5979e6f..12e2cc9e7 100644
--- a/src/core/settings.h
+++ b/src/core/settings.h
@@ -387,6 +387,7 @@ struct Values {

     s32 current_user;
     s32 language_index;
+    s32 sound_index;

     // Controls
     std::array<PlayerInput, 10> players;
diff --git a/src/input_common/udp/udp.cpp b/src/input_common/udp/udp.cpp
index ca99cc22f..8c6ef1394 100644
--- a/src/input_common/udp/udp.cpp
+++ b/src/input_common/udp/udp.cpp
@@ -3,6 +3,7 @@
 // Refer to the license.txt file included.

 #include <mutex>
+#include <optional>
 #include <tuple>

 #include "common/param_package.h"
@@ -44,7 +45,7 @@ public:
     std::unique_ptr<Input::TouchDevice> Create(const Common::ParamPackage& params) override {
         {
             std::lock_guard guard(status->update_mutex);
-            status->touch_calibration.emplace();
+            status->touch_calibration = DeviceStatus::CalibrationData{};
             // These default values work well for DS4 but probably not other touch inputs
             status->touch_calibration->min_x = params.Get("min_x", 100);
             status->touch_calibration->min_y = params.Get("min_y", 50);
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 14f3b4569..91df062d7 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -65,8 +65,6 @@ add_library(video_core STATIC
     renderer_opengl/gl_shader_decompiler.h
     renderer_opengl/gl_shader_disk_cache.cpp
     renderer_opengl/gl_shader_disk_cache.h
-    renderer_opengl/gl_shader_gen.cpp
-    renderer_opengl/gl_shader_gen.h
     renderer_opengl/gl_shader_manager.cpp
     renderer_opengl/gl_shader_manager.h
     renderer_opengl/gl_shader_util.cpp
@@ -118,8 +116,6 @@ add_library(video_core STATIC
     shader/ast.h
     shader/compiler_settings.cpp
     shader/compiler_settings.h
-    shader/const_buffer_locker.cpp
-    shader/const_buffer_locker.h
     shader/control_flow.cpp
     shader/control_flow.h
     shader/decode.cpp
@@ -128,9 +124,13 @@ add_library(video_core STATIC
     shader/node_helper.cpp
     shader/node_helper.h
     shader/node.h
+    shader/registry.cpp
+    shader/registry.h
     shader/shader_ir.cpp
     shader/shader_ir.h
     shader/track.cpp
+    shader/transform_feedback.cpp
+    shader/transform_feedback.h
     surface.cpp
     surface.h
     texture_cache/format_lookup_table.cpp
diff --git a/src/video_core/dirty_flags.cpp b/src/video_core/dirty_flags.cpp
index 4429f3405..e16075993 100644
--- a/src/video_core/dirty_flags.cpp
+++ b/src/video_core/dirty_flags.cpp
@@ -15,14 +15,6 @@ namespace VideoCommon::Dirty {

 using Tegra::Engines::Maxwell3D;

-void SetupCommonOnWriteStores(Tegra::Engines::Maxwell3D::DirtyState::Flags& store) {
-    store[RenderTargets] = true;
-    store[ZetaBuffer] = true;
-    for (std::size_t i = 0; i < Maxwell3D::Regs::NumRenderTargets; ++i) {
-        store[ColorBuffer0 + i] = true;
-    }
-}
-
 void SetupDirtyRenderTargets(Tegra::Engines::Maxwell3D::DirtyState::Tables& tables) {
     static constexpr std::size_t num_per_rt = NUM(rt[0]);
     static constexpr std::size_t begin = OFF(rt);
diff --git a/src/video_core/dirty_flags.h b/src/video_core/dirty_flags.h
index 0dbafd3ef..3f6c1d83a 100644
--- a/src/video_core/dirty_flags.h
+++ b/src/video_core/dirty_flags.h
@@ -44,8 +44,6 @@ void FillBlock(Tegra::Engines::Maxwell3D::DirtyState::Tables& tables, std::size_
     FillBlock(tables[1], begin, num, index_b);
 }

-void SetupCommonOnWriteStores(Tegra::Engines::Maxwell3D::DirtyState::Flags& store);
-
 void SetupDirtyRenderTargets(Tegra::Engines::Maxwell3D::DirtyState::Tables& tables);

 } // namespace VideoCommon::Dirty
diff --git a/src/video_core/engines/const_buffer_engine_interface.h b/src/video_core/engines/const_buffer_engine_interface.h
index d56a47710..724ee0fd6 100644
--- a/src/video_core/engines/const_buffer_engine_interface.h
+++ b/src/video_core/engines/const_buffer_engine_interface.h
@@ -16,11 +16,12 @@ namespace Tegra::Engines {

 struct SamplerDescriptor {
     union {
-        BitField<0, 20, Tegra::Shader::TextureType> texture_type;
-        BitField<20, 1, u32> is_array;
-        BitField<21, 1, u32> is_buffer;
-        BitField<22, 1, u32> is_shadow;
-        u32 raw{};
+        u32 raw = 0;
+        BitField<0, 2, Tegra::Shader::TextureType> texture_type;
+        BitField<2, 3, Tegra::Texture::ComponentType> component_type;
+        BitField<5, 1, u32> is_array;
+        BitField<6, 1, u32> is_buffer;
+        BitField<7, 1, u32> is_shadow;
     };

     bool operator==(const SamplerDescriptor& rhs) const noexcept {
@@ -31,68 +32,48 @@ struct SamplerDescriptor {
         return !operator==(rhs);
     }

-    static SamplerDescriptor FromTicTexture(Tegra::Texture::TextureType tic_texture_type) {
+    static SamplerDescriptor FromTIC(const Tegra::Texture::TICEntry& tic) {
+        using Tegra::Shader::TextureType;
         SamplerDescriptor result;
-        switch (tic_texture_type) {
+
+        // This is going to be used to determine the shading language type.
+        // Because of that we don't care about all component types on color textures.
+        result.component_type.Assign(tic.r_type.Value());
+
+        switch (tic.texture_type.Value()) {
         case Tegra::Texture::TextureType::Texture1D:
-            result.texture_type.Assign(Tegra::Shader::TextureType::Texture1D);
-            result.is_array.Assign(0);
-            result.is_buffer.Assign(0);
-            result.is_shadow.Assign(0);
+            result.texture_type.Assign(TextureType::Texture1D);
             return result;
         case Tegra::Texture::TextureType::Texture2D:
-            result.texture_type.Assign(Tegra::Shader::TextureType::Texture2D);
-            result.is_array.Assign(0);
-            result.is_buffer.Assign(0);
-            result.is_shadow.Assign(0);
+            result.texture_type.Assign(TextureType::Texture2D);
             return result;
         case Tegra::Texture::TextureType::Texture3D:
-            result.texture_type.Assign(Tegra::Shader::TextureType::Texture3D);
-            result.is_array.Assign(0);
-            result.is_buffer.Assign(0);
-            result.is_shadow.Assign(0);
+            result.texture_type.Assign(TextureType::Texture3D);
             return result;
         case Tegra::Texture::TextureType::TextureCubemap:
-            result.texture_type.Assign(Tegra::Shader::TextureType::TextureCube);
-            result.is_array.Assign(0);
-            result.is_buffer.Assign(0);
-            result.is_shadow.Assign(0);
+            result.texture_type.Assign(TextureType::TextureCube);
             return result;
         case Tegra::Texture::TextureType::Texture1DArray:
-            result.texture_type.Assign(Tegra::Shader::TextureType::Texture1D);
+            result.texture_type.Assign(TextureType::Texture1D);
             result.is_array.Assign(1);
-            result.is_buffer.Assign(0);
-            result.is_shadow.Assign(0);
             return result;
         case Tegra::Texture::TextureType::Texture2DArray:
-            result.texture_type.Assign(Tegra::Shader::TextureType::Texture2D);
+            result.texture_type.Assign(TextureType::Texture2D);
             result.is_array.Assign(1);
-            result.is_buffer.Assign(0);
-            result.is_shadow.Assign(0);
             return result;
         case Tegra::Texture::TextureType::Texture1DBuffer:
-            result.texture_type.Assign(Tegra::Shader::TextureType::Texture1D);
-            result.is_array.Assign(0);
+            result.texture_type.Assign(TextureType::Texture1D);
             result.is_buffer.Assign(1);
-            result.is_shadow.Assign(0);
             return result;
         case Tegra::Texture::TextureType::Texture2DNoMipmap:
-            result.texture_type.Assign(Tegra::Shader::TextureType::Texture2D);
-            result.is_array.Assign(0);
-            result.is_buffer.Assign(0);
-            result.is_shadow.Assign(0);
+            result.texture_type.Assign(TextureType::Texture2D);
             return result;
         case Tegra::Texture::TextureType::TextureCubeArray:
-            result.texture_type.Assign(Tegra::Shader::TextureType::TextureCube);
+            result.texture_type.Assign(TextureType::TextureCube);
             result.is_array.Assign(1);
-            result.is_buffer.Assign(0);
-            result.is_shadow.Assign(0);
             return result;
         default:
-            result.texture_type.Assign(Tegra::Shader::TextureType::Texture2D);
-            result.is_array.Assign(0);
-            result.is_buffer.Assign(0);
-            result.is_shadow.Assign(0);
+            result.texture_type.Assign(TextureType::Texture2D);
             return result;
         }
     }
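As an aside, a stand-alone sketch (mine, not from the commit) of the new bit packing: texture_type now occupies bits 0-1, component_type bits 2-4, and the is_array/is_buffer/is_shadow flags bits 5-7, so the whole descriptor fits in the low byte of raw.

    #include <cstdint>

    // Hand-rolled equivalent of the BitField layout above, for illustration only.
    std::uint32_t PackSamplerDescriptor(std::uint32_t texture_type, std::uint32_t component_type,
                                        bool is_array, bool is_buffer, bool is_shadow) {
        std::uint32_t raw = 0;
        raw |= texture_type & 0b11u;                      // bits 0-1
        raw |= (component_type & 0b111u) << 2;            // bits 2-4
        raw |= static_cast<std::uint32_t>(is_array) << 5; // bit 5
        raw |= static_cast<std::uint32_t>(is_buffer) << 6; // bit 6
        raw |= static_cast<std::uint32_t>(is_shadow) << 7; // bit 7
        return raw;
    }
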
diff --git a/src/video_core/engines/kepler_compute.cpp b/src/video_core/engines/kepler_compute.cpp
index ae52afa79..368c75a66 100644
--- a/src/video_core/engines/kepler_compute.cpp
+++ b/src/video_core/engines/kepler_compute.cpp
@@ -89,7 +89,7 @@ SamplerDescriptor KeplerCompute::AccessBindlessSampler(ShaderType stage, u64 con

     const Texture::TextureHandle tex_handle{memory_manager.Read<u32>(tex_info_address)};
     const Texture::FullTextureInfo tex_info = GetTextureInfo(tex_handle);
-    SamplerDescriptor result = SamplerDescriptor::FromTicTexture(tex_info.tic.texture_type.Value());
+    SamplerDescriptor result = SamplerDescriptor::FromTIC(tex_info.tic);
     result.is_shadow.Assign(tex_info.tsc.depth_compare_enabled.Value());
     return result;
 }
@@ -119,14 +119,6 @@ Texture::TICEntry KeplerCompute::GetTICEntry(u32 tic_index) const {
     Texture::TICEntry tic_entry;
     memory_manager.ReadBlockUnsafe(tic_address_gpu, &tic_entry, sizeof(Texture::TICEntry));

-    const auto r_type{tic_entry.r_type.Value()};
-    const auto g_type{tic_entry.g_type.Value()};
-    const auto b_type{tic_entry.b_type.Value()};
-    const auto a_type{tic_entry.a_type.Value()};
-
-    // TODO(Subv): Different data types for separate components are not supported
-    DEBUG_ASSERT(r_type == g_type && r_type == b_type && r_type == a_type);
-
     return tic_entry;
 }

diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index 89050361e..ce536e29b 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -638,7 +638,7 @@ SamplerDescriptor Maxwell3D::AccessBindlessSampler(ShaderType stage, u64 const_b

     const Texture::TextureHandle tex_handle{memory_manager.Read<u32>(tex_info_address)};
     const Texture::FullTextureInfo tex_info = GetTextureInfo(tex_handle);
-    SamplerDescriptor result = SamplerDescriptor::FromTicTexture(tex_info.tic.texture_type.Value());
+    SamplerDescriptor result = SamplerDescriptor::FromTIC(tex_info.tic);
     result.is_shadow.Assign(tex_info.tsc.depth_compare_enabled.Value());
     return result;
 }
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index 491cff370..8a9e9992e 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -67,6 +67,7 @@ public:
     static constexpr std::size_t NumVaryings = 31;
     static constexpr std::size_t NumImages = 8; // TODO(Rodrigo): Investigate this number
     static constexpr std::size_t NumClipDistances = 8;
+    static constexpr std::size_t NumTransformFeedbackBuffers = 4;
     static constexpr std::size_t MaxShaderProgram = 6;
     static constexpr std::size_t MaxShaderStage = 5;
     // Maximum number of const buffers per shader stage.
@@ -524,6 +525,12 @@ public:
         FractionalEven = 2,
     };

+    enum class PolygonMode : u32 {
+        Point = 0x1b00,
+        Line = 0x1b01,
+        Fill = 0x1b02,
+    };
+
     struct RenderTargetConfig {
         u32 address_high;
         u32 address_low;
@@ -621,6 +628,29 @@ public:
         float depth_range_far;
     };

+    struct TransformFeedbackBinding {
+        u32 buffer_enable;
+        u32 address_high;
+        u32 address_low;
+        s32 buffer_size;
+        s32 buffer_offset;
+        INSERT_UNION_PADDING_WORDS(3);
+
+        GPUVAddr Address() const {
+            return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
+                                         address_low);
+        }
+    };
+    static_assert(sizeof(TransformFeedbackBinding) == 32);
+
+    struct TransformFeedbackLayout {
+        u32 stream;
+        u32 varying_count;
+        u32 stride;
+        INSERT_UNION_PADDING_WORDS(1);
+    };
+    static_assert(sizeof(TransformFeedbackLayout) == 16);
+
     bool IsShaderConfigEnabled(std::size_t index) const {
         // The VertexB is always enabled.
         if (index == static_cast<std::size_t>(Regs::ShaderProgram::VertexB)) {
@@ -629,6 +659,10 @@ public:
         return shader_config[index].enable != 0;
     }

+    bool IsShaderConfigEnabled(Regs::ShaderProgram type) const {
+        return IsShaderConfigEnabled(static_cast<std::size_t>(type));
+    }
+
     union {
         struct {
             INSERT_UNION_PADDING_WORDS(0x45);
@@ -677,7 +711,13 @@ public:

             u32 rasterize_enable;

-            INSERT_UNION_PADDING_WORDS(0xF1);
+            std::array<TransformFeedbackBinding, NumTransformFeedbackBuffers> tfb_bindings;
+
+            INSERT_UNION_PADDING_WORDS(0xC0);
+
+            std::array<TransformFeedbackLayout, NumTransformFeedbackBuffers> tfb_layouts;
+
+            INSERT_UNION_PADDING_WORDS(0x1);

             u32 tfb_enabled;

@@ -705,7 +745,12 @@ public:

             s32 clear_stencil;

-            INSERT_UNION_PADDING_WORDS(0x7);
+            INSERT_UNION_PADDING_WORDS(0x2);
+
+            PolygonMode polygon_mode_front;
+            PolygonMode polygon_mode_back;
+
+            INSERT_UNION_PADDING_WORDS(0x3);

             u32 polygon_offset_point_enable;
             u32 polygon_offset_line_enable;
@@ -764,7 +809,11 @@ public:
                 BitField<12, 4, u32> viewport;
             } clear_flags;

-            INSERT_UNION_PADDING_WORDS(0x19);
+            INSERT_UNION_PADDING_WORDS(0x10);
+
+            u32 fill_rectangle;
+
+            INSERT_UNION_PADDING_WORDS(0x8);

             std::array<VertexAttribute, NumVertexAttributes> vertex_attrib_format;

@@ -1187,7 +1236,11 @@ public:

             u32 tex_cb_index;

-            INSERT_UNION_PADDING_WORDS(0x395);
+            INSERT_UNION_PADDING_WORDS(0x7D);
+
+            std::array<std::array<u8, 128>, NumTransformFeedbackBuffers> tfb_varying_locs;
+
+            INSERT_UNION_PADDING_WORDS(0x298);

             struct {
                 /// Compressed address of a buffer that holds information about bound SSBOs.
@@ -1413,6 +1466,8 @@ ASSERT_REG_POSITION(tess_mode, 0xC8);
 ASSERT_REG_POSITION(tess_level_outer, 0xC9);
 ASSERT_REG_POSITION(tess_level_inner, 0xCD);
 ASSERT_REG_POSITION(rasterize_enable, 0xDF);
+ASSERT_REG_POSITION(tfb_bindings, 0xE0);
+ASSERT_REG_POSITION(tfb_layouts, 0x1C0);
 ASSERT_REG_POSITION(tfb_enabled, 0x1D1);
 ASSERT_REG_POSITION(rt, 0x200);
 ASSERT_REG_POSITION(viewport_transform, 0x280);
@@ -1422,6 +1477,8 @@ ASSERT_REG_POSITION(depth_mode, 0x35F);
 ASSERT_REG_POSITION(clear_color[0], 0x360);
 ASSERT_REG_POSITION(clear_depth, 0x364);
 ASSERT_REG_POSITION(clear_stencil, 0x368);
+ASSERT_REG_POSITION(polygon_mode_front, 0x36B);
+ASSERT_REG_POSITION(polygon_mode_back, 0x36C);
 ASSERT_REG_POSITION(polygon_offset_point_enable, 0x370);
 ASSERT_REG_POSITION(polygon_offset_line_enable, 0x371);
 ASSERT_REG_POSITION(polygon_offset_fill_enable, 0x372);
@@ -1435,6 +1492,7 @@ ASSERT_REG_POSITION(rt_separate_frag_data, 0x3EB);
 ASSERT_REG_POSITION(depth_bounds, 0x3E7);
 ASSERT_REG_POSITION(zeta, 0x3F8);
 ASSERT_REG_POSITION(clear_flags, 0x43E);
+ASSERT_REG_POSITION(fill_rectangle, 0x44F);
 ASSERT_REG_POSITION(vertex_attrib_format, 0x458);
 ASSERT_REG_POSITION(rt_control, 0x487);
 ASSERT_REG_POSITION(zeta_width, 0x48a);
@@ -1508,6 +1566,7 @@ ASSERT_REG_POSITION(firmware, 0x8C0);
 ASSERT_REG_POSITION(const_buffer, 0x8E0);
 ASSERT_REG_POSITION(cb_bind[0], 0x904);
 ASSERT_REG_POSITION(tex_cb_index, 0x982);
+ASSERT_REG_POSITION(tfb_varying_locs, 0xA00);
 ASSERT_REG_POSITION(ssbo_info, 0xD18);
 ASSERT_REG_POSITION(tex_info_buffers.address[0], 0xD2A);
 ASSERT_REG_POSITION(tex_info_buffers.size[0], 0xD2F);
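A quick consistency check of the new register map, written out as a sketch of mine (not in the commit): the asserted word offsets follow from the struct sizes, since each TransformFeedbackBinding is 8 words and each TransformFeedbackLayout is 4 words.

    // 4 bindings of 8 words starting at 0xE0, then 0xC0 padding words -> layouts at 0x1C0;
    // 4 layouts of 4 words plus 1 padding word -> tfb_enabled at 0x1D1.
    static_assert(0xE0 + 4 * (32 / 4) + 0xC0 == 0x1C0, "tfb_layouts offset");
    static_assert(0x1C0 + 4 * (16 / 4) + 0x1 == 0x1D1, "tfb_enabled offset");
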
diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h
index c9bc83cd7..eba42deb4 100644
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@@ -911,14 +911,9 @@ union Instruction {
     } fadd32i;

     union {
-        BitField<20, 8, u64> shift_position;
-        BitField<28, 8, u64> shift_length;
-        BitField<48, 1, u64> negate_b;
-        BitField<49, 1, u64> negate_a;
-
-        u64 GetLeftShiftValue() const {
-            return 32 - (shift_position + shift_length);
-        }
+        BitField<40, 1, u64> brev;
+        BitField<47, 1, u64> rd_cc;
+        BitField<48, 1, u64> is_signed;
     } bfe;

     union {
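For orientation, a sketch of what the BFE fields drive, based on my general understanding of the instruction rather than on anything in this commit: the offset operand packs the field position in its low byte and the field width in the next byte, is_signed selects sign extension of the extracted field, and brev bit-reverses the source before extraction. The helper below is hypothetical and simplified (it ignores CC flag updates).

    #include <cstdint>

    // Simplified software model of a bit-field extract.
    std::int64_t BitfieldExtract(std::uint32_t value, std::uint32_t packed, bool is_signed,
                                 bool brev) {
        if (brev) {
            std::uint32_t reversed = 0;
            for (int i = 0; i < 32; ++i) {
                reversed |= ((value >> i) & 1u) << (31 - i);
            }
            value = reversed;
        }
        const std::uint32_t position = packed & 0xFF;       // assumed: position in bits [7:0]
        const std::uint32_t width = (packed >> 8) & 0xFF;   // assumed: width in bits [15:8]
        if (width == 0 || position >= 32) {
            return 0;
        }
        const std::uint32_t mask = width >= 32 ? ~0u : (1u << width) - 1u;
        const std::uint32_t field = (value >> position) & mask;
        if (is_signed && width < 32 && (field >> (width - 1)) != 0) {
            return static_cast<std::int64_t>(field) - (std::int64_t{1} << width); // sign extend
        }
        return static_cast<std::int64_t>(field);
    }
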
diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index ba8c9d665..64acb17df 100644
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -39,6 +39,7 @@ enum class RenderTargetFormat : u32 {
     RGBA32_FLOAT = 0xC0,
     RGBA32_UINT = 0xC2,
     RGBA16_UNORM = 0xC6,
+    RGBA16_SNORM = 0xC7,
     RGBA16_UINT = 0xC9,
     RGBA16_FLOAT = 0xCA,
     RG32_FLOAT = 0xCB,
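In passing, and only as my own refresher rather than anything this commit defines: an SNORM render target stores each channel as a signed integer mapped onto [-1, 1], e.g. a 16-bit channel decodes as below.

    #include <algorithm>
    #include <cstdint>

    // Decode one RGBA16_SNORM channel to a float in [-1.0, 1.0].
    float DecodeSnorm16(std::int16_t raw) {
        return std::max(static_cast<float>(raw) / 32767.0f, -1.0f);
    }
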
diff --git a/src/video_core/guest_driver.cpp b/src/video_core/guest_driver.cpp
index 6adef459e..f058f2744 100644
--- a/src/video_core/guest_driver.cpp
+++ b/src/video_core/guest_driver.cpp
@@ -4,13 +4,15 @@

 #include <algorithm>
 #include <limits>
+#include <vector>

+#include "common/common_types.h"
 #include "video_core/guest_driver.h"

 namespace VideoCore {

-void GuestDriverProfile::DeduceTextureHandlerSize(std::vector<u32>&& bound_offsets) {
-    if (texture_handler_size_deduced) {
+void GuestDriverProfile::DeduceTextureHandlerSize(std::vector<u32> bound_offsets) {
+    if (texture_handler_size) {
         return;
     }
     const std::size_t size = bound_offsets.size();
@@ -29,7 +31,6 @@ void GuestDriverProfile::DeduceTextureHandlerSize(std::vector<u32>&& bound_offse
     if (min_val > 2) {
         return;
     }
-    texture_handler_size_deduced = true;
     texture_handler_size = min_texture_handler_size * min_val;
 }

diff --git a/src/video_core/guest_driver.h b/src/video_core/guest_driver.h
index fc1917347..99450777e 100644
--- a/src/video_core/guest_driver.h
+++ b/src/video_core/guest_driver.h
@@ -4,6 +4,7 @@

 #pragma once

+#include <optional>
 #include <vector>

 #include "common/common_types.h"
@@ -17,25 +18,29 @@ namespace VideoCore {
  */
 class GuestDriverProfile {
 public:
-    void DeduceTextureHandlerSize(std::vector<u32>&& bound_offsets);
+    explicit GuestDriverProfile() = default;
+    explicit GuestDriverProfile(std::optional<u32> texture_handler_size)
+        : texture_handler_size{texture_handler_size} {}
+
+    void DeduceTextureHandlerSize(std::vector<u32> bound_offsets);

     u32 GetTextureHandlerSize() const {
-        return texture_handler_size;
+        return texture_handler_size.value_or(default_texture_handler_size);
     }

-    bool TextureHandlerSizeKnown() const {
-        return texture_handler_size_deduced;
+    bool IsTextureHandlerSizeKnown() const {
+        return texture_handler_size.has_value();
     }

 private:
     // Minimum size of texture handler any driver can use.
     static constexpr u32 min_texture_handler_size = 4;
-    // This goes with Vulkan and OpenGL standards but Nvidia GPUs can easily
-    // use 4 bytes instead. Thus, certain drivers may squish the size.
+
+    // This goes with Vulkan and OpenGL standards but Nvidia GPUs can easily use 4 bytes instead.
+    // Thus, certain drivers may squish the size.
     static constexpr u32 default_texture_handler_size = 8;

-    u32 texture_handler_size = default_texture_handler_size;
-    bool texture_handler_size_deduced = false;
+    std::optional<u32> texture_handler_size = default_texture_handler_size;
 };

 } // namespace VideoCore
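As a side illustration (my own sketch, not code from the commit) of the pattern this header switches to: one std::optional replaces the old value-plus-flag pair, so the "known" state can no longer drift out of sync with the stored value.

    #include <cstdint>
    #include <optional>

    struct HandlerSize {
        std::optional<std::uint32_t> value;                     // empty means "not deduced yet"

        std::uint32_t Get() const { return value.value_or(8); } // fall back to a default
        bool IsKnown() const { return value.has_value(); }
        void Deduce(std::uint32_t deduced) { value = deduced; }
    };
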
diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h
index aea010087..073bdb491 100644
--- a/src/video_core/memory_manager.h
+++ b/src/video_core/memory_manager.h
@@ -174,7 +174,7 @@ private:
     /// End of address space, based on address space in bits.
     static constexpr GPUVAddr address_space_end{1ULL << address_space_width};

-    Common::PageTable page_table{page_bits};
+    Common::BackingPageTable page_table{page_bits};
     VMAMap vma_map;
     VideoCore::RasterizerInterface& rasterizer;

diff --git a/src/video_core/morton.cpp b/src/video_core/morton.cpp
index f2c83266e..6d522c318 100644
--- a/src/video_core/morton.cpp
+++ b/src/video_core/morton.cpp
@@ -51,6 +51,7 @@ static constexpr ConversionArray morton_to_linear_fns = {
     MortonCopy<true, PixelFormat::R8UI>,
     MortonCopy<true, PixelFormat::RGBA16F>,
     MortonCopy<true, PixelFormat::RGBA16U>,
+    MortonCopy<true, PixelFormat::RGBA16S>,
     MortonCopy<true, PixelFormat::RGBA16UI>,
     MortonCopy<true, PixelFormat::R11FG11FB10F>,
     MortonCopy<true, PixelFormat::RGBA32UI>,
@@ -131,6 +132,7 @@ static constexpr ConversionArray linear_to_morton_fns = {
     MortonCopy<false, PixelFormat::R8U>,
     MortonCopy<false, PixelFormat::R8UI>,
     MortonCopy<false, PixelFormat::RGBA16F>,
+    MortonCopy<false, PixelFormat::RGBA16S>,
     MortonCopy<false, PixelFormat::RGBA16U>,
     MortonCopy<false, PixelFormat::RGBA16UI>,
     MortonCopy<false, PixelFormat::R11FG11FB10F>,
diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h
index 3e4514b94..1a68e3caa 100644
--- a/src/video_core/rasterizer_interface.h
+++ b/src/video_core/rasterizer_interface.h
@@ -25,7 +25,6 @@ constexpr std::size_t NumQueryTypes = 1;

 enum class LoadCallbackStage {
     Prepare,
-    Decompile,
     Build,
     Complete,
 };
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 4e4138573..063f41327 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -28,7 +28,6 @@
 #include "video_core/renderer_opengl/gl_query_cache.h"
 #include "video_core/renderer_opengl/gl_rasterizer.h"
 #include "video_core/renderer_opengl/gl_shader_cache.h"
-#include "video_core/renderer_opengl/gl_shader_gen.h"
 #include "video_core/renderer_opengl/maxwell_to_gl.h"
 #include "video_core/renderer_opengl/renderer_opengl.h"

@@ -76,7 +75,7 @@ Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry
 }

 std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buffer,
-                               const GLShader::ConstBufferEntry& entry) {
+                               const ConstBufferEntry& entry) {
     if (!entry.IsIndirect()) {
         return entry.GetSize();
     }
@@ -94,10 +93,6 @@ void oglEnable(GLenum cap, bool state) {
     (state ? glEnable : glDisable)(cap);
 }

-void oglEnablei(GLenum cap, bool state, GLuint index) {
-    (state ? glEnablei : glDisablei)(cap, index);
-}
-
 } // Anonymous namespace

 RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window,
@@ -272,9 +267,7 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
         SetupDrawTextures(stage, shader);
         SetupDrawImages(stage, shader);

-        const ProgramVariant variant(primitive_mode);
-        const auto program_handle = shader->GetHandle(variant);
-
+        const GLuint program_handle = shader->GetHandle();
         switch (program) {
         case Maxwell::ShaderProgram::VertexA:
         case Maxwell::ShaderProgram::VertexB:
@@ -295,7 +288,7 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
         // When a clip distance is enabled but not set in the shader it crops parts of the screen
         // (sometimes it's half the screen, sometimes three quarters). To avoid this, enable the
         // clip distances only when it's written by a shader stage.
-        clip_distances |= shader->GetShaderEntries().clip_distances;
+        clip_distances |= shader->GetEntries().clip_distances;

         // When VertexA is enabled, we have dual vertex shaders
         if (program == Maxwell::ShaderProgram::VertexA) {
@@ -481,12 +474,12 @@ void RasterizerOpenGL::Clear() {
 void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
     MICROPROFILE_SCOPE(OpenGL_Drawing);
     auto& gpu = system.GPU().Maxwell3D();
-    const auto& regs = gpu.regs;

     query_cache.UpdateCounters();

     SyncViewport();
     SyncRasterizeEnable();
+    SyncPolygonModes();
     SyncColorMask();
     SyncFragmentColorClampState();
     SyncMultiSampleState();
@@ -498,7 +491,6 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
     SyncCullMode();
     SyncPrimitiveRestart();
     SyncScissorTest();
-    SyncTransformFeedback();
     SyncPointState();
     SyncPolygonOffset();
     SyncAlphaTest();
@@ -532,7 +524,7 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
     // Upload vertex and index data.
     SetupVertexBuffer();
     SetupVertexInstances();
-    GLintptr index_buffer_offset;
+    GLintptr index_buffer_offset = 0;
     if (is_indexed) {
         index_buffer_offset = SetupIndexBuffer();
     }
@@ -558,7 +550,7 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
     ConfigureFramebuffers();

     // Signal the buffer cache that we are not going to upload more things.
-    const bool invalidate = buffer_cache.Unmap();
+    buffer_cache.Unmap();

     // Now that we are no longer uploading data, we can safely bind the buffers to OpenGL.
     vertex_array_pushbuffer.Bind();
@@ -571,7 +563,7 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
         glTextureBarrier();
     }

-    ++num_queued_commands;
+    BeginTransformFeedback(primitive_mode);

     const GLuint base_instance = static_cast<GLuint>(gpu.regs.vb_base_instance);
     const GLsizei num_instances =
@@ -610,6 +602,10 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
                                  num_instances, base_instance);
         }
     }
+
+    EndTransformFeedback();
+
+    ++num_queued_commands;
 }

 void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
@@ -622,12 +618,7 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
     auto kernel = shader_cache.GetComputeKernel(code_addr);
     SetupComputeTextures(kernel);
     SetupComputeImages(kernel);
-
-    const auto& launch_desc = system.GPU().KeplerCompute().launch_description;
-    const ProgramVariant variant(launch_desc.block_dim_x, launch_desc.block_dim_y,
-                                 launch_desc.block_dim_z, launch_desc.shared_alloc,
-                                 launch_desc.local_pos_alloc);
-    program_manager.BindComputeShader(kernel->GetHandle(variant));
+    program_manager.BindComputeShader(kernel->GetHandle());

     const std::size_t buffer_size =
         Tegra::Engines::KeplerCompute::NumConstBuffers *
@@ -645,6 +636,7 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
     bind_ubo_pushbuffer.Bind();
     bind_ssbo_pushbuffer.Bind();

+    const auto& launch_desc = system.GPU().KeplerCompute().launch_description;
     glDispatchCompute(launch_desc.grid_dim_x, launch_desc.grid_dim_y, launch_desc.grid_dim_z);
     ++num_queued_commands;
 }
@@ -749,7 +741,7 @@ void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, const Shad
     const auto& shader_stage = stages[stage_index];

     u32 binding = device.GetBaseBindings(stage_index).uniform_buffer;
-    for (const auto& entry : shader->GetShaderEntries().const_buffers) {
+    for (const auto& entry : shader->GetEntries().const_buffers) {
         const auto& buffer = shader_stage.const_buffers[entry.GetIndex()];
         SetupConstBuffer(binding++, buffer, entry);
     }
@@ -760,7 +752,7 @@ void RasterizerOpenGL::SetupComputeConstBuffers(const Shader& kernel) {
     const auto& launch_desc = system.GPU().KeplerCompute().launch_description;

     u32 binding = 0;
-    for (const auto& entry : kernel->GetShaderEntries().const_buffers) {
+    for (const auto& entry : kernel->GetEntries().const_buffers) {
         const auto& config = launch_desc.const_buffer_config[entry.GetIndex()];
         const std::bitset<8> mask = launch_desc.const_buffer_enable_mask.Value();
         Tegra::Engines::ConstBufferInfo buffer;
@@ -772,7 +764,7 @@ void RasterizerOpenGL::SetupComputeConstBuffers(const Shader& kernel) {
 }

 void RasterizerOpenGL::SetupConstBuffer(u32 binding, const Tegra::Engines::ConstBufferInfo& buffer,
-                                        const GLShader::ConstBufferEntry& entry) {
+                                        const ConstBufferEntry& entry) {
     if (!buffer.enabled) {
         // Set values to zero to unbind buffers
         bind_ubo_pushbuffer.Push(binding, buffer_cache.GetEmptyBuffer(sizeof(float)), 0,
@@ -796,7 +788,7 @@ void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shad
     const auto cbufs{gpu.Maxwell3D().state.shader_stages[stage_index]};

     u32 binding = device.GetBaseBindings(stage_index).shader_storage_buffer;
-    for (const auto& entry : shader->GetShaderEntries().global_memory_entries) {
+    for (const auto& entry : shader->GetEntries().global_memory_entries) {
         const auto addr{cbufs.const_buffers[entry.GetCbufIndex()].address + entry.GetCbufOffset()};
         const auto gpu_addr{memory_manager.Read<u64>(addr)};
         const auto size{memory_manager.Read<u32>(addr + 8)};
@@ -810,7 +802,7 @@ void RasterizerOpenGL::SetupComputeGlobalMemory(const Shader& kernel) {
     const auto cbufs{gpu.KeplerCompute().launch_description.const_buffer_config};

     u32 binding = 0;
-    for (const auto& entry : kernel->GetShaderEntries().global_memory_entries) {
+    for (const auto& entry : kernel->GetEntries().global_memory_entries) {
         const auto addr{cbufs[entry.GetCbufIndex()].Address() + entry.GetCbufOffset()};
         const auto gpu_addr{memory_manager.Read<u64>(addr)};
         const auto size{memory_manager.Read<u32>(addr + 8)};
@@ -818,7 +810,7 @@ void RasterizerOpenGL::SetupComputeGlobalMemory(const Shader& kernel) {
     }
 }

-void RasterizerOpenGL::SetupGlobalMemory(u32 binding, const GLShader::GlobalMemoryEntry& entry,
+void RasterizerOpenGL::SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry,
                                          GPUVAddr gpu_addr, std::size_t size) {
     const auto alignment{device.GetShaderStorageBufferAlignment()};
     const auto [ssbo, buffer_offset] =
@@ -830,7 +822,7 @@ void RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, const Shader&
     MICROPROFILE_SCOPE(OpenGL_Texture);
     const auto& maxwell3d = system.GPU().Maxwell3D();
     u32 binding = device.GetBaseBindings(stage_index).sampler;
-    for (const auto& entry : shader->GetShaderEntries().samplers) {
+    for (const auto& entry : shader->GetEntries().samplers) {
         const auto shader_type = static_cast<ShaderType>(stage_index);
         for (std::size_t i = 0; i < entry.Size(); ++i) {
             const auto texture = GetTextureInfo(maxwell3d, entry, shader_type, i);
@@ -843,7 +835,7 @@ void RasterizerOpenGL::SetupComputeTextures(const Shader& kernel) {
     MICROPROFILE_SCOPE(OpenGL_Texture);
     const auto& compute = system.GPU().KeplerCompute();
     u32 binding = 0;
-    for (const auto& entry : kernel->GetShaderEntries().samplers) {
+    for (const auto& entry : kernel->GetEntries().samplers) {
         for (std::size_t i = 0; i < entry.Size(); ++i) {
             const auto texture = GetTextureInfo(compute, entry, ShaderType::Compute, i);
             SetupTexture(binding++, texture, entry);
@@ -852,7 +844,7 @@ void RasterizerOpenGL::SetupComputeTextures(const Shader& kernel) {
 }

 void RasterizerOpenGL::SetupTexture(u32 binding, const Tegra::Texture::FullTextureInfo& texture,
-                                    const GLShader::SamplerEntry& entry) {
+                                    const SamplerEntry& entry) {
     const auto view = texture_cache.GetTextureSurface(texture.tic, entry);
     if (!view) {
         // Can occur when texture addr is null or its memory is unmapped/invalid
@@ -875,7 +867,7 @@ void RasterizerOpenGL::SetupTexture(u32 binding, const Tegra::Texture::FullTextu
 void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, const Shader& shader) {
     const auto& maxwell3d = system.GPU().Maxwell3D();
     u32 binding = device.GetBaseBindings(stage_index).image;
-    for (const auto& entry : shader->GetShaderEntries().images) {
+    for (const auto& entry : shader->GetEntries().images) {
         const auto shader_type = static_cast<Tegra::Engines::ShaderType>(stage_index);
         const auto tic = GetTextureInfo(maxwell3d, entry, shader_type).tic;
         SetupImage(binding++, tic, entry);
@@ -885,14 +877,14 @@ void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, const Shader& sh
 void RasterizerOpenGL::SetupComputeImages(const Shader& shader) {
     const auto& compute = system.GPU().KeplerCompute();
     u32 binding = 0;
-    for (const auto& entry : shader->GetShaderEntries().images) {
+    for (const auto& entry : shader->GetEntries().images) {
         const auto tic = GetTextureInfo(compute, entry, Tegra::Engines::ShaderType::Compute).tic;
         SetupImage(binding++, tic, entry);
     }
 }

 void RasterizerOpenGL::SetupImage(u32 binding, const Tegra::Texture::TICEntry& tic,
-                                  const GLShader::ImageEntry& entry) {
+                                  const ImageEntry& entry) {
     const auto view = texture_cache.GetImageSurface(tic, entry);
     if (!view) {
         glBindImageTexture(binding, 0, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8);
@@ -1096,6 +1088,45 @@ void RasterizerOpenGL::SyncRasterizeEnable() {
     oglEnable(GL_RASTERIZER_DISCARD, gpu.regs.rasterize_enable == 0);
 }

+void RasterizerOpenGL::SyncPolygonModes() {
+    auto& gpu = system.GPU().Maxwell3D();
+    auto& flags = gpu.dirty.flags;
+    if (!flags[Dirty::PolygonModes]) {
+        return;
+    }
+    flags[Dirty::PolygonModes] = false;
+
+    if (gpu.regs.fill_rectangle) {
+        if (!GLAD_GL_NV_fill_rectangle) {
+            LOG_ERROR(Render_OpenGL, "GL_NV_fill_rectangle used and not supported");
+            glPolygonMode(GL_FRONT_AND_BACK, GL_FILL);
+            return;
+        }
+
+        flags[Dirty::PolygonModeFront] = true;
+        flags[Dirty::PolygonModeBack] = true;
+        glPolygonMode(GL_FRONT_AND_BACK, GL_FILL_RECTANGLE_NV);
+        return;
+    }
+
+    if (gpu.regs.polygon_mode_front == gpu.regs.polygon_mode_back) {
+        flags[Dirty::PolygonModeFront] = false;
+        flags[Dirty::PolygonModeBack] = false;
+        glPolygonMode(GL_FRONT_AND_BACK, MaxwellToGL::PolygonMode(gpu.regs.polygon_mode_front));
+        return;
+    }
+
+    if (flags[Dirty::PolygonModeFront]) {
+        flags[Dirty::PolygonModeFront] = false;
+        glPolygonMode(GL_FRONT, MaxwellToGL::PolygonMode(gpu.regs.polygon_mode_front));
+    }
+
+    if (flags[Dirty::PolygonModeBack]) {
+        flags[Dirty::PolygonModeBack] = false;
+        glPolygonMode(GL_BACK, MaxwellToGL::PolygonMode(gpu.regs.polygon_mode_back));
+    }
+}
+
 void RasterizerOpenGL::SyncColorMask() {
     auto& gpu = system.GPU().Maxwell3D();
     auto& flags = gpu.dirty.flags;
@@ -1257,11 +1288,6 @@ void RasterizerOpenGL::SyncScissorTest() {
     }
 }
 
-void RasterizerOpenGL::SyncTransformFeedback() {
-    const auto& regs = system.GPU().Maxwell3D().regs;
-    UNIMPLEMENTED_IF_MSG(regs.tfb_enabled != 0, "Transform feedbacks are not implemented");
-}
-
 void RasterizerOpenGL::SyncPointState() {
     auto& gpu = system.GPU().Maxwell3D();
     auto& flags = gpu.dirty.flags;
@@ -1337,4 +1363,62 @@ void RasterizerOpenGL::SyncFramebufferSRGB() {
     oglEnable(GL_FRAMEBUFFER_SRGB, gpu.regs.framebuffer_srgb);
 }
 
+void RasterizerOpenGL::BeginTransformFeedback(GLenum primitive_mode) {
+    const auto& regs = system.GPU().Maxwell3D().regs;
+    if (regs.tfb_enabled == 0) {
+        return;
+    }
+
+    UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationControl) ||
+                     regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationEval) ||
+                     regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::Geometry));
+
+    for (std::size_t index = 0; index < Maxwell::NumTransformFeedbackBuffers; ++index) {
+        const auto& binding = regs.tfb_bindings[index];
+        if (!binding.buffer_enable) {
+            if (enabled_transform_feedback_buffers[index]) {
+                glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER, static_cast<GLuint>(index), 0, 0,
+                                  0);
+            }
+            enabled_transform_feedback_buffers[index] = false;
+            continue;
+        }
+        enabled_transform_feedback_buffers[index] = true;
+
+        auto& tfb_buffer = transform_feedback_buffers[index];
+        tfb_buffer.Create();
+
+        const GLuint handle = tfb_buffer.handle;
+        const std::size_t size = binding.buffer_size;
+        glNamedBufferData(handle, static_cast<GLsizeiptr>(size), nullptr, GL_STREAM_COPY);
+        glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER, static_cast<GLuint>(index), handle, 0,
+                          static_cast<GLsizeiptr>(size));
+    }
+
+    glBeginTransformFeedback(GL_POINTS);
+}
+
+void RasterizerOpenGL::EndTransformFeedback() {
+    const auto& regs = system.GPU().Maxwell3D().regs;
+    if (regs.tfb_enabled == 0) {
+        return;
+    }
+
+    glEndTransformFeedback();
+
+    for (std::size_t index = 0; index < Maxwell::NumTransformFeedbackBuffers; ++index) {
+        const auto& binding = regs.tfb_bindings[index];
+        if (!binding.buffer_enable) {
+            continue;
+        }
+        UNIMPLEMENTED_IF(binding.buffer_offset != 0);
+
+        const GLuint handle = transform_feedback_buffers[index].handle;
+        const GPUVAddr gpu_addr = binding.Address();
+        const std::size_t size = binding.buffer_size;
+        const auto [dest_buffer, offset] = buffer_cache.UploadMemory(gpu_addr, size, 4, true);
+        glCopyNamedBufferSubData(handle, *dest_buffer, 0, offset, static_cast<GLsizeiptr>(size));
+    }
+}
+
 } // namespace OpenGL
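Note: the BeginTransformFeedback/EndTransformFeedback pair added above is meant to bracket a single draw: the begin hook (re)allocates and binds one buffer per enabled tfb_bindings slot, and the end hook copies the captured bytes back through the buffer cache. As committed, capture is always started with GL_POINTS. The following standalone sketch only illustrates the underlying GL pattern; buffer_size, vertex_count, dest_buffer and dest_offset are placeholders, not code from this change.

    // Illustrative only: one capture slot, immediate-style setup.
    GLuint tfb_buffer = 0;
    glCreateBuffers(1, &tfb_buffer);
    glNamedBufferData(tfb_buffer, buffer_size, nullptr, GL_STREAM_COPY);
    glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER, 0, tfb_buffer, 0, buffer_size);

    glBeginTransformFeedback(GL_POINTS); // the commit currently always records as points
    glDrawArrays(GL_POINTS, 0, vertex_count);
    glEndTransformFeedback();

    // Mirrors EndTransformFeedback(): copy the captured data to its destination buffer.
    glCopyNamedBufferSubData(tfb_buffer, dest_buffer, 0, dest_offset,
                             static_cast<GLsizeiptr>(buffer_size));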
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index b24c6661b..2d3be2437 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -98,7 +98,7 @@ private:
 
     /// Configures a constant buffer.
     void SetupConstBuffer(u32 binding, const Tegra::Engines::ConstBufferInfo& buffer,
-                          const GLShader::ConstBufferEntry& entry);
+                          const ConstBufferEntry& entry);
 
     /// Configures the current global memory entries to use for the draw command.
     void SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader);
@@ -107,7 +107,7 @@ private:
     void SetupComputeGlobalMemory(const Shader& kernel);
 
     /// Configures a constant buffer.
-    void SetupGlobalMemory(u32 binding, const GLShader::GlobalMemoryEntry& entry, GPUVAddr gpu_addr,
+    void SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry, GPUVAddr gpu_addr,
                            std::size_t size);
 
     /// Configures the current textures to use for the draw command.
@@ -118,7 +118,7 @@ private:
 
     /// Configures a texture.
     void SetupTexture(u32 binding, const Tegra::Texture::FullTextureInfo& texture,
-                      const GLShader::SamplerEntry& entry);
+                      const SamplerEntry& entry);
 
     /// Configures images in a graphics shader.
     void SetupDrawImages(std::size_t stage_index, const Shader& shader);
@@ -127,8 +127,7 @@ private:
     void SetupComputeImages(const Shader& shader);
 
     /// Configures an image.
-    void SetupImage(u32 binding, const Tegra::Texture::TICEntry& tic,
-                    const GLShader::ImageEntry& entry);
+    void SetupImage(u32 binding, const Tegra::Texture::TICEntry& tic, const ImageEntry& entry);
 
     /// Syncs the viewport and depth range to match the guest state
     void SyncViewport();
@@ -169,15 +168,15 @@ private:
     /// Syncs the scissor test state to match the guest state
     void SyncScissorTest();
 
-    /// Syncs the transform feedback state to match the guest state
-    void SyncTransformFeedback();
-
     /// Syncs the point state to match the guest state
     void SyncPointState();
 
     /// Syncs the rasterizer enable state to match the guest state
     void SyncRasterizeEnable();
 
+    /// Syncs polygon modes to match the guest state
+    void SyncPolygonModes();
+
     /// Syncs Color Mask
     void SyncColorMask();
 
@@ -190,6 +189,12 @@ private:
     /// Syncs the framebuffer sRGB state to match the guest state
     void SyncFramebufferSRGB();
 
+    /// Begin a transform feedback
+    void BeginTransformFeedback(GLenum primitive_mode);
+
+    /// End a transform feedback
+    void EndTransformFeedback();
+
     /// Check for extension that are not strictly required but are needed for correct emulation
     void CheckExtensions();
 
@@ -227,6 +232,11 @@ private:
     BindBuffersRangePushBuffer bind_ubo_pushbuffer{GL_UNIFORM_BUFFER};
     BindBuffersRangePushBuffer bind_ssbo_pushbuffer{GL_SHADER_STORAGE_BUFFER};
 
+    std::array<OGLBuffer, Tegra::Engines::Maxwell3D::Regs::NumTransformFeedbackBuffers>
+        transform_feedback_buffers;
+    std::bitset<Tegra::Engines::Maxwell3D::Regs::NumTransformFeedbackBuffers>
+        enabled_transform_feedback_buffers;
+
     /// Number of commands queued to the OpenGL driver. Reseted on flush.
     std::size_t num_queued_commands = 0;
 
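Note: the two new members work as a pair — one OGLBuffer per transform feedback slot plus a bitset recording which slots the previous draw left bound, so a slot is only explicitly unbound when it was actually enabled before. A minimal standalone sketch of that bookkeeping pattern, with generic types rather than the emulator's:

    #include <array>
    #include <bitset>
    #include <cstddef>

    constexpr std::size_t NumSlots = 4; // stands in for NumTransformFeedbackBuffers

    struct Buffer {
        unsigned handle = 0; // stands in for OGLBuffer
    };

    std::array<Buffer, NumSlots> buffers;
    std::bitset<NumSlots> enabled;

    void SyncSlot(std::size_t index, bool enable) {
        if (!enable) {
            if (enabled[index]) {
                // Slot was bound last time: clear the binding here (GL call elided).
            }
            enabled[index] = false;
            return;
        }
        enabled[index] = true;
        // (Re)create buffers[index] and bind it here.
    }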
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp
index 4cb89db8c..e3d31c3eb 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -2,12 +2,16 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#include <atomic>
+#include <functional>
 #include <mutex>
 #include <optional>
 #include <string>
 #include <thread>
 #include <unordered_set>
+
 #include <boost/functional/hash.hpp>
+
 #include "common/alignment.h"
 #include "common/assert.h"
 #include "common/logging/log.h"
@@ -24,13 +28,14 @@
 #include "video_core/renderer_opengl/gl_shader_disk_cache.h"
 #include "video_core/renderer_opengl/gl_state_tracker.h"
 #include "video_core/renderer_opengl/utils.h"
+#include "video_core/shader/registry.h"
 #include "video_core/shader/shader_ir.h"
 
 namespace OpenGL {
 
 using Tegra::Engines::ShaderType;
-using VideoCommon::Shader::ConstBufferLocker;
 using VideoCommon::Shader::ProgramCode;
+using VideoCommon::Shader::Registry;
 using VideoCommon::Shader::ShaderIR;
 
 namespace {
@@ -56,7 +61,7 @@ constexpr bool IsSchedInstruction(std::size_t offset, std::size_t main_offset) {
 }
 
 /// Calculates the size of a program stream
-std::size_t CalculateProgramSize(const GLShader::ProgramCode& program) {
+std::size_t CalculateProgramSize(const ProgramCode& program) {
     constexpr std::size_t start_offset = 10;
     // This is the encoded version of BRA that jumps to itself. All Nvidia
     // shaders end with one.
@@ -109,32 +114,9 @@ constexpr GLenum GetGLShaderType(ShaderType shader_type) {
     }
 }
 
-/// Describes primitive behavior on geometry shaders
-constexpr std::pair<const char*, u32> GetPrimitiveDescription(GLenum primitive_mode) {
-    switch (primitive_mode) {
-    case GL_POINTS:
-        return {"points", 1};
-    case GL_LINES:
-    case GL_LINE_STRIP:
-        return {"lines", 2};
-    case GL_LINES_ADJACENCY:
-    case GL_LINE_STRIP_ADJACENCY:
-        return {"lines_adjacency", 4};
-    case GL_TRIANGLES:
-    case GL_TRIANGLE_STRIP:
-    case GL_TRIANGLE_FAN:
-        return {"triangles", 3};
-    case GL_TRIANGLES_ADJACENCY:
-    case GL_TRIANGLE_STRIP_ADJACENCY:
-        return {"triangles_adjacency", 6};
-    default:
-        return {"points", 1};
-    }
-}
-
 /// Hashes one (or two) program streams
 u64 GetUniqueIdentifier(ShaderType shader_type, bool is_a, const ProgramCode& code,
-                        const ProgramCode& code_b) {
+                        const ProgramCode& code_b = {}) {
     u64 unique_identifier = boost::hash_value(code);
     if (is_a) {
         // VertexA programs include two programs
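Note: the new default argument lets compute kernels call GetUniqueIdentifier() without passing an empty second stream (see GetComputeKernel later in this diff). The identifier itself is a Boost container hash of the raw code words. A standalone sketch of that hashing idea — not the emulator's exact function, which also receives the shader type and a VertexA flag:

    #include <cstdint>
    #include <vector>
    #include <boost/functional/hash.hpp>

    using ProgramCode = std::vector<std::uint64_t>;

    std::uint64_t HashPrograms(const ProgramCode& code, const ProgramCode& code_b = {}) {
        std::size_t seed = boost::hash_value(code); // hash of the primary stream
        if (!code_b.empty()) {
            boost::hash_combine(seed, code_b);      // fold in the second stream when present
        }
        return static_cast<std::uint64_t>(seed);
    }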
@@ -143,24 +125,6 @@ u64 GetUniqueIdentifier(ShaderType shader_type, bool is_a, const ProgramCode& co
     return unique_identifier;
 }
 
-/// Creates an unspecialized program from code streams
-std::string GenerateGLSL(const Device& device, ShaderType shader_type, const ShaderIR& ir,
-                         const std::optional<ShaderIR>& ir_b) {
-    switch (shader_type) {
-    case ShaderType::Vertex:
-        return GLShader::GenerateVertexShader(device, ir, ir_b ? &*ir_b : nullptr);
-    case ShaderType::Geometry:
-        return GLShader::GenerateGeometryShader(device, ir);
-    case ShaderType::Fragment:
-        return GLShader::GenerateFragmentShader(device, ir);
-    case ShaderType::Compute:
-        return GLShader::GenerateComputeShader(device, ir);
-    default:
-        UNIMPLEMENTED_MSG("Unimplemented shader_type={}", static_cast<u32>(shader_type));
-        return {};
-    }
-}
-
 constexpr const char* GetShaderTypeName(ShaderType shader_type) {
     switch (shader_type) {
     case ShaderType::Vertex:
@@ -196,102 +160,38 @@ constexpr ShaderType GetShaderType(Maxwell::ShaderProgram program_type) {
     return {};
 }
 
-std::string GetShaderId(u64 unique_identifier, ShaderType shader_type) {
+std::string MakeShaderID(u64 unique_identifier, ShaderType shader_type) {
     return fmt::format("{}{:016X}", GetShaderTypeName(shader_type), unique_identifier);
 }
 
-Tegra::Engines::ConstBufferEngineInterface& GetConstBufferEngineInterface(Core::System& system,
-                                                                          ShaderType shader_type) {
-    if (shader_type == ShaderType::Compute) {
-        return system.GPU().KeplerCompute();
-    } else {
-        return system.GPU().Maxwell3D();
-    }
-}
-
-std::unique_ptr<ConstBufferLocker> MakeLocker(Core::System& system, ShaderType shader_type) {
-    return std::make_unique<ConstBufferLocker>(shader_type,
-                                               GetConstBufferEngineInterface(system, shader_type));
-}
-
-void FillLocker(ConstBufferLocker& locker, const ShaderDiskCacheUsage& usage) {
-    locker.SetBoundBuffer(usage.bound_buffer);
-    for (const auto& key : usage.keys) {
-        const auto [buffer, offset] = key.first;
-        locker.InsertKey(buffer, offset, key.second);
+std::shared_ptr<Registry> MakeRegistry(const ShaderDiskCacheEntry& entry) {
+    const VideoCore::GuestDriverProfile guest_profile{entry.texture_handler_size};
+    const VideoCommon::Shader::SerializedRegistryInfo info{guest_profile, entry.bound_buffer,
+                                                           entry.graphics_info, entry.compute_info};
+    const auto registry = std::make_shared<Registry>(entry.type, info);
+    for (const auto& [address, value] : entry.keys) {
+        const auto [buffer, offset] = address;
+        registry->InsertKey(buffer, offset, value);
     }
-    for (const auto& [offset, sampler] : usage.bound_samplers) {
-        locker.InsertBoundSampler(offset, sampler);
+    for (const auto& [offset, sampler] : entry.bound_samplers) {
+        registry->InsertBoundSampler(offset, sampler);
     }
-    for (const auto& [key, sampler] : usage.bindless_samplers) {
+    for (const auto& [key, sampler] : entry.bindless_samplers) {
         const auto [buffer, offset] = key;
-        locker.InsertBindlessSampler(buffer, offset, sampler);
+        registry->InsertBindlessSampler(buffer, offset, sampler);
     }
+    return registry;
 }
 
-CachedProgram BuildShader(const Device& device, u64 unique_identifier, ShaderType shader_type,
-                          const ProgramCode& code, const ProgramCode& code_b,
-                          ConstBufferLocker& locker, const ProgramVariant& variant,
-                          bool hint_retrievable = false) {
-    LOG_INFO(Render_OpenGL, "called. {}", GetShaderId(unique_identifier, shader_type));
-
-    const bool is_compute = shader_type == ShaderType::Compute;
-    const u32 main_offset = is_compute ? KERNEL_MAIN_OFFSET : STAGE_MAIN_OFFSET;
-    const ShaderIR ir(code, main_offset, COMPILER_SETTINGS, locker);
-    std::optional<ShaderIR> ir_b;
-    if (!code_b.empty()) {
-        ir_b.emplace(code_b, main_offset, COMPILER_SETTINGS, locker);
-    }
-
-    std::string source = fmt::format(R"(// {}
-#version 430 core
-#extension GL_ARB_separate_shader_objects : enable
-)",
-                                     GetShaderId(unique_identifier, shader_type));
-    if (device.HasShaderBallot()) {
-        source += "#extension GL_ARB_shader_ballot : require\n";
-    }
-    if (device.HasVertexViewportLayer()) {
-        source += "#extension GL_ARB_shader_viewport_layer_array : require\n";
-    }
-    if (device.HasImageLoadFormatted()) {
-        source += "#extension GL_EXT_shader_image_load_formatted : require\n";
-    }
-    if (device.HasWarpIntrinsics()) {
-        source += "#extension GL_NV_gpu_shader5 : require\n"
-                  "#extension GL_NV_shader_thread_group : require\n"
-                  "#extension GL_NV_shader_thread_shuffle : require\n";
-    }
-    // This pragma stops Nvidia's driver from over optimizing math (probably using fp16 operations)
-    // on places where we don't want to.
-    // Thanks to Ryujinx for finding this workaround.
-    source += "#pragma optionNV(fastmath off)\n";
-
-    if (shader_type == ShaderType::Geometry) {
-        const auto [glsl_topology, max_vertices] = GetPrimitiveDescription(variant.primitive_mode);
-        source += fmt::format("#define MAX_VERTEX_INPUT {}\n", max_vertices);
-        source += fmt::format("layout ({}) in;\n", glsl_topology);
-    }
-    if (shader_type == ShaderType::Compute) {
-        if (variant.local_memory_size > 0) {
-            source += fmt::format("#define LOCAL_MEMORY_SIZE {}\n",
-                                  Common::AlignUp(variant.local_memory_size, 4) / 4);
-        }
-        source +=
-            fmt::format("layout (local_size_x = {}, local_size_y = {}, local_size_z = {}) in;\n",
-                        variant.block_x, variant.block_y, variant.block_z);
-
-        if (variant.shared_memory_size > 0) {
-            // shared_memory_size is described in number of words
-            source += fmt::format("shared uint smem[{}];\n", variant.shared_memory_size);
-        }
-    }
-
-    source += '\n';
-    source += GenerateGLSL(device, shader_type, ir, ir_b);
+std::shared_ptr<OGLProgram> BuildShader(const Device& device, ShaderType shader_type,
+                                        u64 unique_identifier, const ShaderIR& ir,
+                                        const Registry& registry, bool hint_retrievable = false) {
+    const std::string shader_id = MakeShaderID(unique_identifier, shader_type);
+    LOG_INFO(Render_OpenGL, "{}", shader_id);
 
+    const std::string glsl = DecompileShader(device, ir, registry, shader_type, shader_id);
     OGLShader shader;
-    shader.Create(source.c_str(), GetGLShaderType(shader_type));
+    shader.Create(glsl.c_str(), GetGLShaderType(shader_type));
 
     auto program = std::make_shared<OGLProgram>();
     program->Create(true, hint_retrievable, shader.handle);
@@ -299,7 +199,7 @@ CachedProgram BuildShader(const Device& device, u64 unique_identifier, ShaderTyp
 }
 
 std::unordered_set<GLenum> GetSupportedFormats() {
-    GLint num_formats{};
+    GLint num_formats;
     glGetIntegerv(GL_NUM_PROGRAM_BINARY_FORMATS, &num_formats);
 
     std::vector<GLint> formats(num_formats);
@@ -314,115 +214,82 @@ std::unordered_set<GLenum> GetSupportedFormats() {
 
 } // Anonymous namespace
 
-CachedShader::CachedShader(const ShaderParameters& params, ShaderType shader_type,
-                           GLShader::ShaderEntries entries, ProgramCode code, ProgramCode code_b)
-    : RasterizerCacheObject{params.host_ptr}, system{params.system},
-      disk_cache{params.disk_cache}, device{params.device}, cpu_addr{params.cpu_addr},
-      unique_identifier{params.unique_identifier}, shader_type{shader_type},
-      entries{std::move(entries)}, code{std::move(code)}, code_b{std::move(code_b)} {
-    if (!params.precompiled_variants) {
-        return;
-    }
-    for (const auto& pair : *params.precompiled_variants) {
-        auto locker = MakeLocker(system, shader_type);
-        const auto& usage = pair->first;
-        FillLocker(*locker, usage);
-
-        std::unique_ptr<LockerVariant>* locker_variant = nullptr;
-        const auto it =
-            std::find_if(locker_variants.begin(), locker_variants.end(), [&](const auto& variant) {
-                return variant->locker->HasEqualKeys(*locker);
-            });
-        if (it == locker_variants.end()) {
-            locker_variant = &locker_variants.emplace_back();
-            *locker_variant = std::make_unique<LockerVariant>();
-            locker_variant->get()->locker = std::move(locker);
-        } else {
-            locker_variant = &*it;
-        }
-        locker_variant->get()->programs.emplace(usage.variant, pair->second);
-    }
+CachedShader::CachedShader(const u8* host_ptr, VAddr cpu_addr, std::size_t size_in_bytes,
+                           std::shared_ptr<VideoCommon::Shader::Registry> registry,
+                           ShaderEntries entries, std::shared_ptr<OGLProgram> program)
+    : RasterizerCacheObject{host_ptr}, registry{std::move(registry)}, entries{std::move(entries)},
+      cpu_addr{cpu_addr}, size_in_bytes{size_in_bytes}, program{std::move(program)} {}
+
+CachedShader::~CachedShader() = default;
+
+GLuint CachedShader::GetHandle() const {
+    DEBUG_ASSERT(registry->IsConsistent());
+    return program->handle;
 }
 
 Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params,
                                            Maxwell::ShaderProgram program_type, ProgramCode code,
                                            ProgramCode code_b) {
     const auto shader_type = GetShaderType(program_type);
-    params.disk_cache.SaveRaw(
-        ShaderDiskCacheRaw(params.unique_identifier, shader_type, code, code_b));
+    const std::size_t size_in_bytes = code.size() * sizeof(u64);
 
-    ConstBufferLocker locker(shader_type, params.system.GPU().Maxwell3D());
-    const ShaderIR ir(code, STAGE_MAIN_OFFSET, COMPILER_SETTINGS, locker);
+    auto registry = std::make_shared<Registry>(shader_type, params.system.GPU().Maxwell3D());
+    const ShaderIR ir(code, STAGE_MAIN_OFFSET, COMPILER_SETTINGS, *registry);
     // TODO(Rodrigo): Handle VertexA shaders
     // std::optional<ShaderIR> ir_b;
     // if (!code_b.empty()) {
     //     ir_b.emplace(code_b, STAGE_MAIN_OFFSET);
     // }
-    return std::shared_ptr<CachedShader>(new CachedShader(
-        params, shader_type, GLShader::GetEntries(ir), std::move(code), std::move(code_b)));
+    auto program = BuildShader(params.device, shader_type, params.unique_identifier, ir, *registry);
+
+    ShaderDiskCacheEntry entry;
+    entry.type = shader_type;
+    entry.code = std::move(code);
+    entry.code_b = std::move(code_b);
+    entry.unique_identifier = params.unique_identifier;
+    entry.bound_buffer = registry->GetBoundBuffer();
+    entry.graphics_info = registry->GetGraphicsInfo();
+    entry.keys = registry->GetKeys();
+    entry.bound_samplers = registry->GetBoundSamplers();
+    entry.bindless_samplers = registry->GetBindlessSamplers();
+    params.disk_cache.SaveEntry(std::move(entry));
+
+    return std::shared_ptr<CachedShader>(new CachedShader(params.host_ptr, params.cpu_addr,
+                                                          size_in_bytes, std::move(registry),
+                                                          MakeEntries(ir), std::move(program)));
 }
 
 Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code) {
-    params.disk_cache.SaveRaw(
-        ShaderDiskCacheRaw(params.unique_identifier, ShaderType::Compute, code));
-
-    ConstBufferLocker locker(Tegra::Engines::ShaderType::Compute,
-                             params.system.GPU().KeplerCompute());
-    const ShaderIR ir(code, KERNEL_MAIN_OFFSET, COMPILER_SETTINGS, locker);
-    return std::shared_ptr<CachedShader>(new CachedShader(
-        params, ShaderType::Compute, GLShader::GetEntries(ir), std::move(code), {}));
+    const std::size_t size_in_bytes = code.size() * sizeof(u64);
+
+    auto& engine = params.system.GPU().KeplerCompute();
+    auto registry = std::make_shared<Registry>(ShaderType::Compute, engine);
+    const ShaderIR ir(code, KERNEL_MAIN_OFFSET, COMPILER_SETTINGS, *registry);
+    const u64 uid = params.unique_identifier;
+    auto program = BuildShader(params.device, ShaderType::Compute, uid, ir, *registry);
+
+    ShaderDiskCacheEntry entry;
+    entry.type = ShaderType::Compute;
+    entry.code = std::move(code);
+    entry.unique_identifier = uid;
+    entry.bound_buffer = registry->GetBoundBuffer();
+    entry.compute_info = registry->GetComputeInfo();
+    entry.keys = registry->GetKeys();
+    entry.bound_samplers = registry->GetBoundSamplers();
+    entry.bindless_samplers = registry->GetBindlessSamplers();
+    params.disk_cache.SaveEntry(std::move(entry));
+
+    return std::shared_ptr<CachedShader>(new CachedShader(params.host_ptr, params.cpu_addr,
+                                                          size_in_bytes, std::move(registry),
+                                                          MakeEntries(ir), std::move(program)));
 }
 
 Shader CachedShader::CreateFromCache(const ShaderParameters& params,
-                                     const UnspecializedShader& unspecialized) {
-    return std::shared_ptr<CachedShader>(new CachedShader(params, unspecialized.type,
-                                                          unspecialized.entries, unspecialized.code,
-                                                          unspecialized.code_b));
-}
-
-GLuint CachedShader::GetHandle(const ProgramVariant& variant) {
-    EnsureValidLockerVariant();
-
-    const auto [entry, is_cache_miss] = curr_locker_variant->programs.try_emplace(variant);
-    auto& program = entry->second;
-    if (!is_cache_miss) {
-        return program->handle;
-    }
-
-    program = BuildShader(device, unique_identifier, shader_type, code, code_b,
-                          *curr_locker_variant->locker, variant);
-    disk_cache.SaveUsage(GetUsage(variant, *curr_locker_variant->locker));
-
-    LabelGLObject(GL_PROGRAM, program->handle, cpu_addr);
-    return program->handle;
-}
-
-bool CachedShader::EnsureValidLockerVariant() {
-    const auto previous_variant = curr_locker_variant;
-    if (curr_locker_variant && !curr_locker_variant->locker->IsConsistent()) {
-        curr_locker_variant = nullptr;
-    }
-    if (!curr_locker_variant) {
-        for (auto& variant : locker_variants) {
-            if (variant->locker->IsConsistent()) {
-                curr_locker_variant = variant.get();
-            }
-        }
-    }
-    if (!curr_locker_variant) {
-        auto& new_variant = locker_variants.emplace_back();
-        new_variant = std::make_unique<LockerVariant>();
-        new_variant->locker = MakeLocker(system, shader_type);
-        curr_locker_variant = new_variant.get();
-    }
-    return previous_variant == curr_locker_variant;
-}
-
-ShaderDiskCacheUsage CachedShader::GetUsage(const ProgramVariant& variant,
-                                            const ConstBufferLocker& locker) const {
-    return ShaderDiskCacheUsage{unique_identifier, variant,
-                                locker.GetBoundBuffer(), locker.GetKeys(),
-                                locker.GetBoundSamplers(), locker.GetBindlessSamplers()};
+                                     const PrecompiledShader& precompiled_shader,
+                                     std::size_t size_in_bytes) {
+    return std::shared_ptr<CachedShader>(new CachedShader(
+        params.host_ptr, params.cpu_addr, size_in_bytes, precompiled_shader.registry,
+        precompiled_shader.entries, precompiled_shader.program));
 }
 
@@ -432,16 +299,12 @@ ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System&
 
 void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
                                       const VideoCore::DiskResourceLoadCallback& callback) {
-    const auto transferable = disk_cache.LoadTransferable();
+    const std::optional transferable = disk_cache.LoadTransferable();
     if (!transferable) {
         return;
     }
-    const auto [raws, shader_usages] = *transferable;
-    if (!GenerateUnspecializedShaders(stop_loading, callback, raws) || stop_loading) {
-        return;
-    }
 
-    const auto dumps = disk_cache.LoadPrecompiled();
+    const std::vector gl_cache = disk_cache.LoadPrecompiled();
     const auto supported_formats = GetSupportedFormats();
 
     // Track if precompiled cache was altered during loading to know if we have to
@@ -450,77 +313,82 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
 
     // Inform the frontend about shader build initialization
     if (callback) {
-        callback(VideoCore::LoadCallbackStage::Build, 0, shader_usages.size());
+        callback(VideoCore::LoadCallbackStage::Build, 0, transferable->size());
     }
 
     std::mutex mutex;
     std::size_t built_shaders = 0; // It doesn't have be atomic since it's used behind a mutex
-    std::atomic_bool compilation_failed = false;
+    std::atomic_bool gl_cache_failed = false;
 
-    const auto Worker = [&](Core::Frontend::GraphicsContext* context, std::size_t begin,
-                            std::size_t end, const std::vector<ShaderDiskCacheUsage>& shader_usages,
-                            const ShaderDumpsMap& dumps) {
+    const auto find_precompiled = [&gl_cache](u64 id) {
+        return std::find_if(gl_cache.begin(), gl_cache.end(),
+                            [id](const auto& entry) { return entry.unique_identifier == id; });
+    };
+
+    const auto worker = [&](Core::Frontend::GraphicsContext* context, std::size_t begin,
+                            std::size_t end) {
         context->MakeCurrent();
         SCOPE_EXIT({ return context->DoneCurrent(); });
 
         for (std::size_t i = begin; i < end; ++i) {
-            if (stop_loading || compilation_failed) {
+            if (stop_loading) {
                 return;
             }
-            const auto& usage{shader_usages[i]};
-            const auto& unspecialized{unspecialized_shaders.at(usage.unique_identifier)};
-            const auto dump{dumps.find(usage)};
-
-            CachedProgram shader;
-            if (dump != dumps.end()) {
-                // If the shader is dumped, attempt to load it with
-                shader = GeneratePrecompiledProgram(dump->second, supported_formats);
-                if (!shader) {
-                    compilation_failed = true;
-                    return;
+            const auto& entry = (*transferable)[i];
+            const u64 uid = entry.unique_identifier;
+            const auto it = find_precompiled(uid);
+            const auto precompiled_entry = it != gl_cache.end() ? &*it : nullptr;
+
+            const bool is_compute = entry.type == ShaderType::Compute;
+            const u32 main_offset = is_compute ? KERNEL_MAIN_OFFSET : STAGE_MAIN_OFFSET;
+            auto registry = MakeRegistry(entry);
+            const ShaderIR ir(entry.code, main_offset, COMPILER_SETTINGS, *registry);
+
+            std::shared_ptr<OGLProgram> program;
+            if (precompiled_entry) {
+                // If the shader is precompiled, attempt to load it with
+                program = GeneratePrecompiledProgram(entry, *precompiled_entry, supported_formats);
+                if (!program) {
+                    gl_cache_failed = true;
                 }
             }
-            if (!shader) {
-                auto locker{MakeLocker(system, unspecialized.type)};
-                FillLocker(*locker, usage);
-
-                shader = BuildShader(device, usage.unique_identifier, unspecialized.type,
-                                     unspecialized.code, unspecialized.code_b, *locker,
-                                     usage.variant, true);
+            if (!program) {
+                // Otherwise compile it from GLSL
+                program = BuildShader(device, entry.type, uid, ir, *registry, true);
             }
 
+            PrecompiledShader shader;
+            shader.program = std::move(program);
+            shader.registry = std::move(registry);
+            shader.entries = MakeEntries(ir);
+
             std::scoped_lock lock{mutex};
             if (callback) {
                 callback(VideoCore::LoadCallbackStage::Build, ++built_shaders,
-                         shader_usages.size());
+                         transferable->size());
             }
-
-            precompiled_programs.emplace(usage, std::move(shader));
-
-            // TODO(Rodrigo): Is there a better way to do this?
-            precompiled_variants[usage.unique_identifier].push_back(
-                precompiled_programs.find(usage));
+            runtime_cache.emplace(entry.unique_identifier, std::move(shader));
         }
     };
 
     const auto num_workers{static_cast<std::size_t>(std::thread::hardware_concurrency() + 1ULL)};
-    const std::size_t bucket_size{shader_usages.size() / num_workers};
+    const std::size_t bucket_size{transferable->size() / num_workers};
     std::vector<std::unique_ptr<Core::Frontend::GraphicsContext>> contexts(num_workers);
     std::vector<std::thread> threads(num_workers);
     for (std::size_t i = 0; i < num_workers; ++i) {
         const bool is_last_worker = i + 1 == num_workers;
         const std::size_t start{bucket_size * i};
-        const std::size_t end{is_last_worker ? shader_usages.size() : start + bucket_size};
+        const std::size_t end{is_last_worker ? transferable->size() : start + bucket_size};
 
         // On some platforms the shared context has to be created from the GUI thread
         contexts[i] = emu_window.CreateSharedContext();
-        threads[i] = std::thread(Worker, contexts[i].get(), start, end, shader_usages, dumps);
+        threads[i] = std::thread(worker, contexts[i].get(), start, end);
     }
     for (auto& thread : threads) {
         thread.join();
     }
 
-    if (compilation_failed) {
+    if (gl_cache_failed) {
         // Invalidate the precompiled cache if a shader dumped shader was rejected
         disk_cache.InvalidatePrecompiled();
         precompiled_cache_altered = true;
@@ -533,11 +401,12 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
     // TODO(Rodrigo): Do state tracking for transferable shaders and do a dummy draw
     // before precompiling them
 
-    for (std::size_t i = 0; i < shader_usages.size(); ++i) {
-        const auto& usage{shader_usages[i]};
-        if (dumps.find(usage) == dumps.end()) {
-            const auto& program{precompiled_programs.at(usage)};
-            disk_cache.SaveDump(usage, program->handle);
+    for (std::size_t i = 0; i < transferable->size(); ++i) {
+        const u64 id = (*transferable)[i].unique_identifier;
+        const auto it = find_precompiled(id);
+        if (it == gl_cache.end()) {
+            const GLuint program = runtime_cache.at(id).program->handle;
+            disk_cache.SavePrecompiled(id, program);
             precompiled_cache_altered = true;
         }
     }
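Note: the loop above only writes a program binary back for entries that were not already precompiled, so the precompiled cache grows incrementally. The surrounding loader splits the transferable list into contiguous buckets, one per worker thread, with the last worker taking the remainder. That partitioning scheme, reduced to a standalone sketch with generic work items:

    #include <cstddef>
    #include <thread>
    #include <vector>

    void BuildAll(std::size_t num_items) {
        const auto worker = [](std::size_t begin, std::size_t end) {
            for (std::size_t i = begin; i < end; ++i) {
                // Build item i (shader compilation in the real code).
            }
        };

        const std::size_t num_workers = std::thread::hardware_concurrency() + 1;
        const std::size_t bucket_size = num_items / num_workers;
        std::vector<std::thread> threads;
        threads.reserve(num_workers);
        for (std::size_t i = 0; i < num_workers; ++i) {
            const bool is_last_worker = i + 1 == num_workers;
            const std::size_t start = bucket_size * i;
            const std::size_t end = is_last_worker ? num_items : start + bucket_size;
            threads.emplace_back(worker, start, end); // last bucket absorbs the remainder
        }
        for (auto& thread : threads) {
            thread.join();
        }
    }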
@@ -547,80 +416,29 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
     }
 }
 
-const PrecompiledVariants* ShaderCacheOpenGL::GetPrecompiledVariants(u64 unique_identifier) const {
-    const auto it = precompiled_variants.find(unique_identifier);
-    return it == precompiled_variants.end() ? nullptr : &it->second;
-}
-
-CachedProgram ShaderCacheOpenGL::GeneratePrecompiledProgram(
-    const ShaderDiskCacheDump& dump, const std::unordered_set<GLenum>& supported_formats) {
-    if (supported_formats.find(dump.binary_format) == supported_formats.end()) {
-        LOG_INFO(Render_OpenGL, "Precompiled cache entry with unsupported format - removing");
+std::shared_ptr<OGLProgram> ShaderCacheOpenGL::GeneratePrecompiledProgram(
+    const ShaderDiskCacheEntry& entry, const ShaderDiskCachePrecompiled& precompiled_entry,
+    const std::unordered_set<GLenum>& supported_formats) {
+    if (supported_formats.find(precompiled_entry.binary_format) == supported_formats.end()) {
+        LOG_INFO(Render_OpenGL, "Precompiled cache entry with unsupported format, removing");
         return {};
     }
 
-    CachedProgram shader = std::make_shared<OGLProgram>();
-    shader->handle = glCreateProgram();
-    glProgramParameteri(shader->handle, GL_PROGRAM_SEPARABLE, GL_TRUE);
-    glProgramBinary(shader->handle, dump.binary_format, dump.binary.data(),
-                    static_cast<GLsizei>(dump.binary.size()));
-
-    GLint link_status{};
-    glGetProgramiv(shader->handle, GL_LINK_STATUS, &link_status);
+    auto program = std::make_shared<OGLProgram>();
+    program->handle = glCreateProgram();
+    glProgramParameteri(program->handle, GL_PROGRAM_SEPARABLE, GL_TRUE);
+    glProgramBinary(program->handle, precompiled_entry.binary_format,
+                    precompiled_entry.binary.data(),
+                    static_cast<GLsizei>(precompiled_entry.binary.size()));
+
+    GLint link_status;
+    glGetProgramiv(program->handle, GL_LINK_STATUS, &link_status);
     if (link_status == GL_FALSE) {
-        LOG_INFO(Render_OpenGL, "Precompiled cache rejected by the driver - removing");
+        LOG_INFO(Render_OpenGL, "Precompiled cache rejected by the driver, removing");
         return {};
     }
 
-    return shader;
-}
-
-bool ShaderCacheOpenGL::GenerateUnspecializedShaders(
-    const std::atomic_bool& stop_loading, const VideoCore::DiskResourceLoadCallback& callback,
-    const std::vector<ShaderDiskCacheRaw>& raws) {
-    if (callback) {
-        callback(VideoCore::LoadCallbackStage::Decompile, 0, raws.size());
-    }
-
-    for (std::size_t i = 0; i < raws.size(); ++i) {
-        if (stop_loading) {
-            return false;
-        }
-        const auto& raw{raws[i]};
-        const u64 unique_identifier{raw.GetUniqueIdentifier()};
-        const u64 calculated_hash{
-            GetUniqueIdentifier(raw.GetType(), raw.HasProgramA(), raw.GetCode(), raw.GetCodeB())};
-        if (unique_identifier != calculated_hash) {
-            LOG_ERROR(Render_OpenGL,
-                      "Invalid hash in entry={:016x} (obtained hash={:016x}) - "
-                      "removing shader cache",
-                      raw.GetUniqueIdentifier(), calculated_hash);
-            disk_cache.InvalidateTransferable();
-            return false;
-        }
-
-        const u32 main_offset =
-            raw.GetType() == ShaderType::Compute ? KERNEL_MAIN_OFFSET : STAGE_MAIN_OFFSET;
-        ConstBufferLocker locker(raw.GetType());
-        const ShaderIR ir(raw.GetCode(), main_offset, COMPILER_SETTINGS, locker);
-        // TODO(Rodrigo): Handle VertexA shaders
-        // std::optional<ShaderIR> ir_b;
-        // if (raw.HasProgramA()) {
-        //     ir_b.emplace(raw.GetProgramCodeB(), main_offset);
-        // }
-
-        UnspecializedShader unspecialized;
-        unspecialized.entries = GLShader::GetEntries(ir);
-        unspecialized.type = raw.GetType();
-        unspecialized.code = raw.GetCode();
-        unspecialized.code_b = raw.GetCodeB();
-        unspecialized_shaders.emplace(raw.GetUniqueIdentifier(), unspecialized);
-
-        if (callback) {
-            callback(VideoCore::LoadCallbackStage::Decompile, i, raws.size());
-        }
-    }
-    return true;
+    return program;
 }
 
 Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
@@ -648,17 +466,17 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
 
     const auto unique_identifier = GetUniqueIdentifier(
         GetShaderType(program), program == Maxwell::ShaderProgram::VertexA, code, code_b);
-    const auto precompiled_variants = GetPrecompiledVariants(unique_identifier);
     const auto cpu_addr{*memory_manager.GpuToCpuAddress(address)};
-    const ShaderParameters params{system, disk_cache, precompiled_variants, device,
+    const ShaderParameters params{system, disk_cache, device,
                                   cpu_addr, host_ptr, unique_identifier};
 
-    const auto found = unspecialized_shaders.find(unique_identifier);
-    if (found == unspecialized_shaders.end()) {
+    const auto found = runtime_cache.find(unique_identifier);
+    if (found == runtime_cache.end()) {
         shader = CachedShader::CreateStageFromMemory(params, program, std::move(code),
                                                      std::move(code_b));
     } else {
-        shader = CachedShader::CreateFromCache(params, found->second);
+        const std::size_t size_in_bytes = code.size() * sizeof(u64);
+        shader = CachedShader::CreateFromCache(params, found->second, size_in_bytes);
     }
     Register(shader);
 
@@ -673,19 +491,19 @@ Shader ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) {
         return kernel;
     }
 
-    // No kernel found - create a new one
+    // No kernel found, create a new one
     auto code{GetShaderCode(memory_manager, code_addr, host_ptr)};
-    const auto unique_identifier{GetUniqueIdentifier(ShaderType::Compute, false, code, {})};
-    const auto precompiled_variants = GetPrecompiledVariants(unique_identifier);
+    const auto unique_identifier{GetUniqueIdentifier(ShaderType::Compute, false, code)};
     const auto cpu_addr{*memory_manager.GpuToCpuAddress(code_addr)};
-    const ShaderParameters params{system, disk_cache, precompiled_variants, device,
+    const ShaderParameters params{system, disk_cache, device,
                                   cpu_addr, host_ptr, unique_identifier};
 
-    const auto found = unspecialized_shaders.find(unique_identifier);
-    if (found == unspecialized_shaders.end()) {
+    const auto found = runtime_cache.find(unique_identifier);
+    if (found == runtime_cache.end()) {
         kernel = CachedShader::CreateKernelFromMemory(params, std::move(code));
     } else {
-        kernel = CachedShader::CreateFromCache(params, found->second);
+        const std::size_t size_in_bytes = code.size() * sizeof(u64);
+        kernel = CachedShader::CreateFromCache(params, found->second, size_in_bytes);
     }
 
     Register(kernel);
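Note: GetStageProgram and GetComputeKernel now share the same flow — hash the guest code, look it up in runtime_cache, and only decode and compile on a miss; on a hit the registry, entries and program captured at disk-cache load time are reused. The precompiled path ultimately rests on GL program binaries. The SavePrecompiled internals are not part of this diff, so the helper below is only an illustrative sketch of how a linked program can be serialized for such a cache:

    #include <vector>
    #include <glad/glad.h>

    std::vector<GLbyte> DumpProgramBinary(GLuint program, GLenum& binary_format) {
        GLint binary_length = 0;
        glGetProgramiv(program, GL_PROGRAM_BINARY_LENGTH, &binary_length);

        std::vector<GLbyte> binary(binary_length);
        GLsizei written = 0;
        glGetProgramBinary(program, binary_length, &written, &binary_format, binary.data());
        binary.resize(written);
        return binary;
    }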
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h
index 7b1470db3..4935019fc 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_cache.h
@@ -22,7 +22,7 @@
 #include "video_core/renderer_opengl/gl_resource_manager.h"
 #include "video_core/renderer_opengl/gl_shader_decompiler.h"
 #include "video_core/renderer_opengl/gl_shader_disk_cache.h"
-#include "video_core/shader/const_buffer_locker.h"
+#include "video_core/shader/registry.h"
 #include "video_core/shader/shader_ir.h"
 
 namespace Core {
@@ -41,22 +41,17 @@ class RasterizerOpenGL;
 struct UnspecializedShader;
 
 using Shader = std::shared_ptr<CachedShader>;
-using CachedProgram = std::shared_ptr<OGLProgram>;
 using Maxwell = Tegra::Engines::Maxwell3D::Regs;
-using PrecompiledPrograms = std::unordered_map<ShaderDiskCacheUsage, CachedProgram>;
-using PrecompiledVariants = std::vector<PrecompiledPrograms::iterator>;
-
-struct UnspecializedShader {
-    GLShader::ShaderEntries entries;
-    Tegra::Engines::ShaderType type;
-    ProgramCode code;
-    ProgramCode code_b;
+
+struct PrecompiledShader {
+    std::shared_ptr<OGLProgram> program;
+    std::shared_ptr<VideoCommon::Shader::Registry> registry;
+    ShaderEntries entries;
 };
 
 struct ShaderParameters {
     Core::System& system;
     ShaderDiskCacheOpenGL& disk_cache;
-    const PrecompiledVariants* precompiled_variants;
     const Device& device;
     VAddr cpu_addr;
     u8* host_ptr;
@@ -65,61 +60,45 @@ struct ShaderParameters {
 
 class CachedShader final : public RasterizerCacheObject {
 public:
-    static Shader CreateStageFromMemory(const ShaderParameters& params,
-                                        Maxwell::ShaderProgram program_type,
-                                        ProgramCode program_code, ProgramCode program_code_b);
-    static Shader CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code);
+    ~CachedShader();
 
-    static Shader CreateFromCache(const ShaderParameters& params,
-                                  const UnspecializedShader& unspecialized);
+    /// Gets the GL program handle for the shader
+    GLuint GetHandle() const;
 
+    /// Returns the guest CPU address of the shader
     VAddr GetCpuAddr() const override {
         return cpu_addr;
     }
 
+    /// Returns the size in bytes of the shader
     std::size_t GetSizeInBytes() const override {
-        return code.size() * sizeof(u64);
+        return size_in_bytes;
     }
 
     /// Gets the shader entries for the shader
-    const GLShader::ShaderEntries& GetShaderEntries() const {
+    const ShaderEntries& GetEntries() const {
         return entries;
     }
 
-    /// Gets the GL program handle for the shader
-    GLuint GetHandle(const ProgramVariant& variant);
-
-private:
-    struct LockerVariant {
-        std::unique_ptr<VideoCommon::Shader::ConstBufferLocker> locker;
-        std::unordered_map<ProgramVariant, CachedProgram> programs;
-    };
-
-    explicit CachedShader(const ShaderParameters& params, Tegra::Engines::ShaderType shader_type,
-                          GLShader::ShaderEntries entries, ProgramCode program_code,
-                          ProgramCode program_code_b);
-
-    bool EnsureValidLockerVariant();
-
-    ShaderDiskCacheUsage GetUsage(const ProgramVariant& variant,
-                                  const VideoCommon::Shader::ConstBufferLocker& locker) const;
-
-    Core::System& system;
-    ShaderDiskCacheOpenGL& disk_cache;
-    const Device& device;
-
-    VAddr cpu_addr{};
-
-    u64 unique_identifier{};
-    Tegra::Engines::ShaderType shader_type{};
-
-    GLShader::ShaderEntries entries;
+    static Shader CreateStageFromMemory(const ShaderParameters& params,
+                                        Maxwell::ShaderProgram program_type,
+                                        ProgramCode program_code, ProgramCode program_code_b);
+    static Shader CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code);
 
-    ProgramCode code;
-    ProgramCode code_b;
+    static Shader CreateFromCache(const ShaderParameters& params,
+                                  const PrecompiledShader& precompiled_shader,
+                                  std::size_t size_in_bytes);
 
-    LockerVariant* curr_locker_variant = nullptr;
-    std::vector<std::unique_ptr<LockerVariant>> locker_variants;
+private:
+    explicit CachedShader(const u8* host_ptr, VAddr cpu_addr, std::size_t size_in_bytes,
+                          std::shared_ptr<VideoCommon::Shader::Registry> registry,
+                          ShaderEntries entries, std::shared_ptr<OGLProgram> program);
+
+    std::shared_ptr<VideoCommon::Shader::Registry> registry;
+    ShaderEntries entries;
+    VAddr cpu_addr = 0;
+    std::size_t size_in_bytes = 0;
+    std::shared_ptr<OGLProgram> program;
 };
 
 class ShaderCacheOpenGL final : public RasterizerCache<Shader> {
@@ -142,25 +121,15 @@ protected:
     void FlushObjectInner(const Shader& object) override {}
 
 private:
-    bool GenerateUnspecializedShaders(const std::atomic_bool& stop_loading,
-                                      const VideoCore::DiskResourceLoadCallback& callback,
-                                      const std::vector<ShaderDiskCacheRaw>& raws);
-
-    CachedProgram GeneratePrecompiledProgram(const ShaderDiskCacheDump& dump,
-                                             const std::unordered_set<GLenum>& supported_formats);
-
-    const PrecompiledVariants* GetPrecompiledVariants(u64 unique_identifier) const;
+    std::shared_ptr<OGLProgram> GeneratePrecompiledProgram(
+        const ShaderDiskCacheEntry& entry, const ShaderDiskCachePrecompiled& precompiled_entry,
+        const std::unordered_set<GLenum>& supported_formats);
 
     Core::System& system;
     Core::Frontend::EmuWindow& emu_window;
     const Device& device;
-
     ShaderDiskCacheOpenGL disk_cache;
-
-    PrecompiledPrograms precompiled_programs;
-    std::unordered_map<u64, PrecompiledVariants> precompiled_variants;
-
-    std::unordered_map<u64, UnspecializedShader> unspecialized_shaders;
+    std::unordered_map<u64, PrecompiledShader> runtime_cache;
 
     std::array<Shader, Maxwell::MaxShaderProgram> last_shaders;
 };
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index 3a41ed30c..2c38f57fd 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -23,8 +23,9 @@
 #include "video_core/shader/ast.h"
 #include "video_core/shader/node.h"
 #include "video_core/shader/shader_ir.h"
+#include "video_core/shader/transform_feedback.h"
 
-namespace OpenGL::GLShader {
+namespace OpenGL {
 
 namespace {
 
@@ -36,6 +37,8 @@ using Tegra::Shader::IpaInterpMode;
 using Tegra::Shader::IpaMode;
 using Tegra::Shader::IpaSampleMode;
 using Tegra::Shader::Register;
+using VideoCommon::Shader::BuildTransformFeedback;
+using VideoCommon::Shader::Registry;
 
 using namespace std::string_literals;
 using namespace VideoCommon::Shader;
@@ -48,6 +51,11 @@ class ExprDecompiler;
 
 enum class Type { Void, Bool, Bool2, Float, Int, Uint, HalfFloat };
 
+constexpr std::array FLOAT_TYPES{"float", "vec2", "vec3", "vec4"};
+
+constexpr std::string_view INPUT_ATTRIBUTE_NAME = "in_attr";
+constexpr std::string_view OUTPUT_ATTRIBUTE_NAME = "out_attr";
+
 struct TextureOffset {};
 struct TextureDerivates {};
 using TextureArgument = std::pair<Type, Node>;
@@ -56,6 +64,25 @@ using TextureIR = std::variant<TextureOffset, TextureDerivates, TextureArgument>
 constexpr u32 MAX_CONSTBUFFER_ELEMENTS =
     static_cast<u32>(Maxwell::MaxConstBufferSize) / (4 * sizeof(float));
 
+constexpr std::string_view CommonDeclarations = R"(#define ftoi floatBitsToInt
+#define ftou floatBitsToUint
+#define itof intBitsToFloat
+#define utof uintBitsToFloat
+
+bvec2 HalfFloatNanComparison(bvec2 comparison, vec2 pair1, vec2 pair2) {{
+    bvec2 is_nan1 = isnan(pair1);
+    bvec2 is_nan2 = isnan(pair2);
+    return bvec2(comparison.x || is_nan1.x || is_nan2.x, comparison.y || is_nan1.y || is_nan2.y);
+}}
+
+const float fswzadd_modifiers_a[] = float[4](-1.0f, 1.0f, -1.0f, 0.0f );
+const float fswzadd_modifiers_b[] = float[4](-1.0f, -1.0f, 1.0f, -1.0f );
+
+layout (std140, binding = {}) uniform vs_config {{
+    float y_direction;
+}};
+)";
+
 class ShaderWriter final {
 public:
     void AddExpression(std::string_view text) {
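Note: CommonDeclarations above is consumed by fmt, so literal GLSL braces are escaped as {{ and }} while the single {} placeholder receives the emulation uniform block binding when the header is emitted (see DeclareHeader() later in this diff). A small standalone illustration of that escaping, assuming only that fmt is available:

    #include <string>
    #include <fmt/format.h>

    std::string MakeVsConfigBlock(int binding) {
        // "{{" and "}}" produce literal braces; "{}" is replaced by the binding index.
        return fmt::format("layout (std140, binding = {}) uniform vs_config {{\n"
                           "    float y_direction;\n"
                           "}};\n",
                           binding);
    }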
@@ -269,12 +296,41 @@ const char* GetImageTypeDeclaration(Tegra::Shader::ImageType image_type) {
     }
 }
 
+/// Describes primitive behavior on geometry shaders
+std::pair<const char*, u32> GetPrimitiveDescription(Maxwell::PrimitiveTopology topology) {
+    switch (topology) {
+    case Maxwell::PrimitiveTopology::Points:
+        return {"points", 1};
+    case Maxwell::PrimitiveTopology::Lines:
+    case Maxwell::PrimitiveTopology::LineStrip:
+        return {"lines", 2};
+    case Maxwell::PrimitiveTopology::LinesAdjacency:
+    case Maxwell::PrimitiveTopology::LineStripAdjacency:
+        return {"lines_adjacency", 4};
+    case Maxwell::PrimitiveTopology::Triangles:
+    case Maxwell::PrimitiveTopology::TriangleStrip:
+    case Maxwell::PrimitiveTopology::TriangleFan:
+        return {"triangles", 3};
+    case Maxwell::PrimitiveTopology::TrianglesAdjacency:
+    case Maxwell::PrimitiveTopology::TriangleStripAdjacency:
+        return {"triangles_adjacency", 6};
+    default:
+        UNIMPLEMENTED_MSG("topology={}", static_cast<int>(topology));
+        return {"points", 1};
+    }
+}
+
 /// Generates code to use for a swizzle operation.
-constexpr const char* GetSwizzle(u32 element) {
+constexpr const char* GetSwizzle(std::size_t element) {
     constexpr std::array swizzle = {".x", ".y", ".z", ".w"};
     return swizzle.at(element);
 }
 
+constexpr const char* GetColorSwizzle(std::size_t element) {
+    constexpr std::array swizzle = {".r", ".g", ".b", ".a"};
+    return swizzle.at(element);
+}
+
 /// Translate topology
 std::string GetTopologyName(Tegra::Shader::OutputTopology topology) {
     switch (topology) {
@@ -337,15 +393,66 @@ std::string FlowStackTopName(MetaStackClass stack) {
     return fmt::format("{}_flow_stack_top", GetFlowStackPrefix(stack));
 }
 
-[[deprecated]] constexpr bool IsVertexShader(ShaderType stage) {
-    return stage == ShaderType::Vertex;
-}
+struct GenericVaryingDescription {
+    std::string name;
+    u8 first_element = 0;
+    bool is_scalar = false;
+};
 
 class GLSLDecompiler final {
 public:
-    explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, ShaderType stage,
-                            std::string suffix)
-        : device{device}, ir{ir}, stage{stage}, suffix{suffix}, header{ir.GetHeader()} {}
+    explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry,
+                            ShaderType stage, std::string_view identifier, std::string_view suffix)
+        : device{device}, ir{ir}, registry{registry}, stage{stage},
+          identifier{identifier}, suffix{suffix}, header{ir.GetHeader()} {
+        if (stage != ShaderType::Compute) {
+            transform_feedback = BuildTransformFeedback(registry.GetGraphicsInfo());
+        }
+    }
+
+    void Decompile() {
+        DeclareHeader();
+        DeclareVertex();
+        DeclareGeometry();
+        DeclareFragment();
+        DeclareCompute();
+        DeclareInputAttributes();
+        DeclareOutputAttributes();
+        DeclareImages();
+        DeclareSamplers();
+        DeclareGlobalMemory();
+        DeclareConstantBuffers();
+        DeclareLocalMemory();
+        DeclareRegisters();
+        DeclarePredicates();
+        DeclareInternalFlags();
+        DeclareCustomVariables();
+        DeclarePhysicalAttributeReader();
+
+        code.AddLine("void main() {{");
+        ++code.scope;
+
+        if (stage == ShaderType::Vertex) {
+            code.AddLine("gl_Position = vec4(0.0f, 0.0f, 0.0f, 1.0f);");
+        }
+
+        if (ir.IsDecompiled()) {
+            DecompileAST();
+        } else {
+            DecompileBranchMode();
+        }
+
+        --code.scope;
+        code.AddLine("}}");
+    }
+
+    std::string GetResult() {
+        return code.GetResult();
+    }
+
+private:
+    friend class ASTDecompiler;
+    friend class ExprDecompiler;
 
     void DecompileBranchMode() {
         // VM's program counter
@@ -387,46 +494,40 @@ public:
387 494
388 void DecompileAST(); 495 void DecompileAST();
389 496
390 void Decompile() { 497 void DeclareHeader() {
391 DeclareVertex(); 498 if (!identifier.empty()) {
392 DeclareGeometry(); 499 code.AddLine("// {}", identifier);
393 DeclareRegisters(); 500 }
394 DeclareCustomVariables(); 501 code.AddLine("#version 440 core");
395 DeclarePredicates(); 502 code.AddLine("#extension GL_ARB_separate_shader_objects : enable");
396 DeclareLocalMemory(); 503 if (device.HasShaderBallot()) {
397 DeclareInternalFlags(); 504 code.AddLine("#extension GL_ARB_shader_ballot : require");
398 DeclareInputAttributes(); 505 }
399 DeclareOutputAttributes(); 506 if (device.HasVertexViewportLayer()) {
400 DeclareConstantBuffers(); 507 code.AddLine("#extension GL_ARB_shader_viewport_layer_array : require");
401 DeclareGlobalMemory();
402 DeclareSamplers();
403 DeclareImages();
404 DeclarePhysicalAttributeReader();
405
406 code.AddLine("void execute_{}() {{", suffix);
407 ++code.scope;
408
409 if (ir.IsDecompiled()) {
410 DecompileAST();
411 } else {
412 DecompileBranchMode();
413 } 508 }
509 if (device.HasImageLoadFormatted()) {
510 code.AddLine("#extension GL_EXT_shader_image_load_formatted : require");
511 }
512 if (device.HasWarpIntrinsics()) {
513 code.AddLine("#extension GL_NV_gpu_shader5 : require");
514 code.AddLine("#extension GL_NV_shader_thread_group : require");
515 code.AddLine("#extension GL_NV_shader_thread_shuffle : require");
516 }
517        // This pragma stops Nvidia's driver from over-optimizing math (probably using fp16
518        // operations) in places where we don't want it to.
519 // Thanks to Ryujinx for finding this workaround.
520 code.AddLine("#pragma optionNV(fastmath off)");
414 521
415 --code.scope; 522 code.AddNewLine();
416 code.AddLine("}}");
417 }
418 523
419 std::string GetResult() { 524 code.AddLine(CommonDeclarations, EmulationUniformBlockBinding);
420 return code.GetResult();
421 } 525 }
422 526
423private:
424 friend class ASTDecompiler;
425 friend class ExprDecompiler;
426
427 void DeclareVertex() { 527 void DeclareVertex() {
428 if (!IsVertexShader(stage)) 528 if (stage != ShaderType::Vertex) {
429 return; 529 return;
530 }
430 531
431 DeclareVertexRedeclarations(); 532 DeclareVertexRedeclarations();
432 } 533 }
@@ -436,9 +537,15 @@ private:
436 return; 537 return;
437 } 538 }
438 539
540 const auto& info = registry.GetGraphicsInfo();
541 const auto input_topology = info.primitive_topology;
542 const auto [glsl_topology, max_vertices] = GetPrimitiveDescription(input_topology);
543 max_input_vertices = max_vertices;
544 code.AddLine("layout ({}) in;", glsl_topology);
545
439 const auto topology = GetTopologyName(header.common3.output_topology); 546 const auto topology = GetTopologyName(header.common3.output_topology);
440 const auto max_vertices = header.common4.max_output_vertices.Value(); 547 const auto max_output_vertices = header.common4.max_output_vertices.Value();
441 code.AddLine("layout ({}, max_vertices = {}) out;", topology, max_vertices); 548 code.AddLine("layout ({}, max_vertices = {}) out;", topology, max_output_vertices);
442 code.AddNewLine(); 549 code.AddNewLine();
443 550
444 code.AddLine("in gl_PerVertex {{"); 551 code.AddLine("in gl_PerVertex {{");
@@ -450,11 +557,40 @@ private:
450 DeclareVertexRedeclarations(); 557 DeclareVertexRedeclarations();
451 } 558 }
452 559
560 void DeclareFragment() {
561 if (stage != ShaderType::Fragment) {
562 return;
563 }
564 for (u32 rt = 0; rt < Maxwell::NumRenderTargets; ++rt) {
565 code.AddLine("layout (location = {}) out vec4 frag_color{};", rt, rt);
566 }
567 }
568
569 void DeclareCompute() {
570 if (stage != ShaderType::Compute) {
571 return;
572 }
573 const auto& info = registry.GetComputeInfo();
574 if (const u32 size = info.shared_memory_size_in_words; size > 0) {
575 code.AddLine("shared uint smem[{}];", size);
576 code.AddNewLine();
577 }
578 code.AddLine("layout (local_size_x = {}, local_size_y = {}, local_size_z = {}) in;",
579 info.workgroup_size[0], info.workgroup_size[1], info.workgroup_size[2]);
580 code.AddNewLine();
581 }
582
453 void DeclareVertexRedeclarations() { 583 void DeclareVertexRedeclarations() {
454 code.AddLine("out gl_PerVertex {{"); 584 code.AddLine("out gl_PerVertex {{");
455 ++code.scope; 585 ++code.scope;
456 586
457 code.AddLine("vec4 gl_Position;"); 587 auto pos_xfb = GetTransformFeedbackDecoration(Attribute::Index::Position);
588 if (!pos_xfb.empty()) {
589 pos_xfb = fmt::format("layout ({}) ", pos_xfb);
590 }
591 const char* pos_type =
592 FLOAT_TYPES.at(GetNumComponents(Attribute::Index::Position).value_or(4) - 1);
593 code.AddLine("{}{} gl_Position;", pos_xfb, pos_type);
458 594
459 for (const auto attribute : ir.GetOutputAttributes()) { 595 for (const auto attribute : ir.GetOutputAttributes()) {
460 if (attribute == Attribute::Index::ClipDistances0123 || 596 if (attribute == Attribute::Index::ClipDistances0123 ||
@@ -463,14 +599,14 @@ private:
463 break; 599 break;
464 } 600 }
465 } 601 }
466 if (!IsVertexShader(stage) || device.HasVertexViewportLayer()) { 602 if (stage != ShaderType::Vertex || device.HasVertexViewportLayer()) {
467 if (ir.UsesLayer()) { 603 if (ir.UsesLayer()) {
468 code.AddLine("int gl_Layer;"); 604 code.AddLine("int gl_Layer;");
469 } 605 }
470 if (ir.UsesViewportIndex()) { 606 if (ir.UsesViewportIndex()) {
471 code.AddLine("int gl_ViewportIndex;"); 607 code.AddLine("int gl_ViewportIndex;");
472 } 608 }
473 } else if ((ir.UsesLayer() || ir.UsesViewportIndex()) && IsVertexShader(stage) && 609 } else if ((ir.UsesLayer() || ir.UsesViewportIndex()) && stage == ShaderType::Vertex &&
474 !device.HasVertexViewportLayer()) { 610 !device.HasVertexViewportLayer()) {
475 LOG_ERROR( 611 LOG_ERROR(
476 Render_OpenGL, 612 Render_OpenGL,
@@ -525,18 +661,16 @@ private:
525 } 661 }
526 662
527 void DeclareLocalMemory() { 663 void DeclareLocalMemory() {
664 u64 local_memory_size = 0;
528 if (stage == ShaderType::Compute) { 665 if (stage == ShaderType::Compute) {
529 code.AddLine("#ifdef LOCAL_MEMORY_SIZE"); 666 local_memory_size = registry.GetComputeInfo().local_memory_size_in_words * 4ULL;
530 code.AddLine("uint {}[LOCAL_MEMORY_SIZE];", GetLocalMemory()); 667 } else {
531 code.AddLine("#endif"); 668 local_memory_size = header.GetLocalMemorySize();
532 return;
533 } 669 }
534
535 const u64 local_memory_size = header.GetLocalMemorySize();
536 if (local_memory_size == 0) { 670 if (local_memory_size == 0) {
537 return; 671 return;
538 } 672 }
539 const auto element_count = Common::AlignUp(local_memory_size, 4) / 4; 673 const u64 element_count = Common::AlignUp(local_memory_size, 4) / 4;
540 code.AddLine("uint {}[{}];", GetLocalMemory(), element_count); 674 code.AddLine("uint {}[{}];", GetLocalMemory(), element_count);
541 code.AddNewLine(); 675 code.AddNewLine();
542 } 676 }
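For reference, the element count above rounds the byte size up to whole 32-bit words. A standalone sketch of the same arithmetic, not part of this change; AlignUp below is a local stand-in for Common::AlignUp and the 10-byte size is an arbitrary example:

#include <cstdint>
#include <iostream>

// Local stand-in for Common::AlignUp: round value up to the next multiple of align.
constexpr std::uint64_t AlignUp(std::uint64_t value, std::uint64_t align) {
    return (value + align - 1) / align * align;
}

int main() {
    const std::uint64_t local_memory_size = 10; // bytes, example value
    const std::uint64_t element_count = AlignUp(local_memory_size, 4) / 4;
    std::cout << element_count << '\n'; // prints 3: ten bytes need three uint elements
    return 0;
}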
@@ -589,7 +723,7 @@ private:
589 void DeclareInputAttribute(Attribute::Index index, bool skip_unused) { 723 void DeclareInputAttribute(Attribute::Index index, bool skip_unused) {
590 const u32 location{GetGenericAttributeIndex(index)}; 724 const u32 location{GetGenericAttributeIndex(index)};
591 725
592 std::string name{GetInputAttribute(index)}; 726 std::string name{GetGenericInputAttribute(index)};
593 if (stage == ShaderType::Geometry) { 727 if (stage == ShaderType::Geometry) {
594 name = "gs_" + name + "[]"; 728 name = "gs_" + name + "[]";
595 } 729 }
@@ -626,9 +760,59 @@ private:
626 } 760 }
627 } 761 }
628 762
763 std::optional<std::size_t> GetNumComponents(Attribute::Index index, u8 element = 0) const {
764 const u8 location = static_cast<u8>(static_cast<u32>(index) * 4 + element);
765 const auto it = transform_feedback.find(location);
766 if (it == transform_feedback.end()) {
767 return {};
768 }
769 return it->second.components;
770 }
771
772 std::string GetTransformFeedbackDecoration(Attribute::Index index, u8 element = 0) const {
773 const u8 location = static_cast<u8>(static_cast<u32>(index) * 4 + element);
774 const auto it = transform_feedback.find(location);
775 if (it == transform_feedback.end()) {
776 return {};
777 }
778
779 const VaryingTFB& tfb = it->second;
780 return fmt::format("xfb_buffer = {}, xfb_offset = {}, xfb_stride = {}", tfb.buffer,
781 tfb.offset, tfb.stride);
782 }
783
629 void DeclareOutputAttribute(Attribute::Index index) { 784 void DeclareOutputAttribute(Attribute::Index index) {
630 const u32 location{GetGenericAttributeIndex(index)}; 785 static constexpr std::string_view swizzle = "xyzw";
631 code.AddLine("layout (location = {}) out vec4 {};", location, GetOutputAttribute(index)); 786 u8 element = 0;
787 while (element < 4) {
788 auto xfb = GetTransformFeedbackDecoration(index, element);
789 if (!xfb.empty()) {
790 xfb = fmt::format(", {}", xfb);
791 }
792 const std::size_t remainder = 4 - element;
793 const std::size_t num_components = GetNumComponents(index, element).value_or(remainder);
794 const char* const type = FLOAT_TYPES.at(num_components - 1);
795
796 const u32 location = GetGenericAttributeIndex(index);
797
798 GenericVaryingDescription description;
799 description.first_element = static_cast<u8>(element);
800 description.is_scalar = num_components == 1;
801 description.name = AppendSuffix(location, OUTPUT_ATTRIBUTE_NAME);
802 if (element != 0 || num_components != 4) {
803 const std::string_view name_swizzle = swizzle.substr(element, num_components);
804 description.name = fmt::format("{}_{}", description.name, name_swizzle);
805 }
806 for (std::size_t i = 0; i < num_components; ++i) {
807 const u8 offset = static_cast<u8>(location * 4 + element + i);
808 varying_description.insert({offset, description});
809 }
810
811 code.AddLine("layout (location = {}, component = {}{}) out {} {};", location, element,
812 xfb, type, description.name);
813
814 element = static_cast<u8>(static_cast<std::size_t>(element) + num_components);
815 }
632 } 816 }
633 817
634 void DeclareConstantBuffers() { 818 void DeclareConstantBuffers() {
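The transform feedback and varying bookkeeping above keys everything by a flat component index: each generic attribute spans four components, so an (attribute, element) pair packs into attribute * 4 + element. A standalone sketch of that packing, not part of this change; the example values are arbitrary:

#include <cstdint>
#include <iostream>

// Flat component key: generic attribute index times four, plus the element (0..3 for .x/.y/.z/.w).
constexpr std::uint8_t ComponentKey(std::uint32_t attribute_index, std::uint8_t element) {
    return static_cast<std::uint8_t>(attribute_index * 4 + element);
}

int main() {
    std::cout << static_cast<int>(ComponentKey(2, 2)) << '\n'; // .z of attribute 2 -> key 10
    return 0;
}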
@@ -925,7 +1109,8 @@ private:
925 // TODO(Rodrigo): Guard geometry inputs against out of bound reads. Some games 1109 // TODO(Rodrigo): Guard geometry inputs against out of bound reads. Some games
926 // set an 0x80000000 index for those and the shader fails to build. Find out why 1110 // set an 0x80000000 index for those and the shader fails to build. Find out why
927 // this happens and what's its intent. 1111 // this happens and what's its intent.
928 return fmt::format("gs_{}[{} % MAX_VERTEX_INPUT]", name, Visit(buffer).AsUint()); 1112 return fmt::format("gs_{}[{} % {}]", name, Visit(buffer).AsUint(),
1113 max_input_vertices.value());
929 } 1114 }
930 return std::string(name); 1115 return std::string(name);
931 }; 1116 };
@@ -959,7 +1144,7 @@ private:
959 // TODO(Subv): Find out what the values are for the first two elements when inside a 1144 // TODO(Subv): Find out what the values are for the first two elements when inside a
960 // vertex shader, and what's the value of the fourth element when inside a Tess Eval 1145 // vertex shader, and what's the value of the fourth element when inside a Tess Eval
961 // shader. 1146 // shader.
962 ASSERT(IsVertexShader(stage)); 1147 ASSERT(stage == ShaderType::Vertex);
963 switch (element) { 1148 switch (element) {
964 case 2: 1149 case 2:
965 // Config pack's first value is instance_id. 1150 // Config pack's first value is instance_id.
@@ -980,7 +1165,7 @@ private:
980 return {"0", Type::Int}; 1165 return {"0", Type::Int};
981 default: 1166 default:
982 if (IsGenericAttribute(attribute)) { 1167 if (IsGenericAttribute(attribute)) {
983 return {GeometryPass(GetInputAttribute(attribute)) + GetSwizzle(element), 1168 return {GeometryPass(GetGenericInputAttribute(attribute)) + GetSwizzle(element),
984 Type::Float}; 1169 Type::Float};
985 } 1170 }
986 break; 1171 break;
@@ -1030,12 +1215,12 @@ private:
1030 UNIMPLEMENTED(); 1215 UNIMPLEMENTED();
1031 return {}; 1216 return {};
1032 case 1: 1217 case 1:
1033 if (IsVertexShader(stage) && !device.HasVertexViewportLayer()) { 1218 if (stage == ShaderType::Vertex && !device.HasVertexViewportLayer()) {
1034 return {}; 1219 return {};
1035 } 1220 }
1036 return {{"gl_Layer", Type::Int}}; 1221 return {{"gl_Layer", Type::Int}};
1037 case 2: 1222 case 2:
1038 if (IsVertexShader(stage) && !device.HasVertexViewportLayer()) { 1223 if (stage == ShaderType::Vertex && !device.HasVertexViewportLayer()) {
1039 return {}; 1224 return {};
1040 } 1225 }
1041 return {{"gl_ViewportIndex", Type::Int}}; 1226 return {{"gl_ViewportIndex", Type::Int}};
@@ -1049,8 +1234,7 @@ private:
1049 return {{fmt::format("gl_ClipDistance[{}]", abuf->GetElement() + 4), Type::Float}}; 1234 return {{fmt::format("gl_ClipDistance[{}]", abuf->GetElement() + 4), Type::Float}};
1050 default: 1235 default:
1051 if (IsGenericAttribute(attribute)) { 1236 if (IsGenericAttribute(attribute)) {
1052 return { 1237 return {{GetGenericOutputAttribute(attribute, abuf->GetElement()), Type::Float}};
1053 {GetOutputAttribute(attribute) + GetSwizzle(abuf->GetElement()), Type::Float}};
1054 } 1238 }
1055 UNIMPLEMENTED_MSG("Unhandled output attribute: {}", static_cast<u32>(attribute)); 1239 UNIMPLEMENTED_MSG("Unhandled output attribute: {}", static_cast<u32>(attribute));
1056 return {}; 1240 return {};
@@ -1822,16 +2006,19 @@ private:
1822 expr += GetSampler(meta->sampler); 2006 expr += GetSampler(meta->sampler);
1823 expr += ", "; 2007 expr += ", ";
1824 2008
1825 expr += constructors.at(operation.GetOperandsCount() - 1); 2009 expr += constructors.at(operation.GetOperandsCount() + (meta->array ? 1 : 0) - 1);
1826 expr += '('; 2010 expr += '(';
1827 for (std::size_t i = 0; i < count; ++i) { 2011 for (std::size_t i = 0; i < count; ++i) {
1828 expr += VisitOperand(operation, i).AsInt(); 2012 if (i > 0) {
1829 const std::size_t next = i + 1;
1830 if (next == count)
1831 expr += ')';
1832 else if (next < count)
1833 expr += ", "; 2013 expr += ", ";
2014 }
2015 expr += VisitOperand(operation, i).AsInt();
2016 }
2017 if (meta->array) {
2018 expr += ", ";
2019 expr += Visit(meta->array).AsInt();
1834 } 2020 }
2021 expr += ')';
1835 2022
1836 if (meta->lod && !meta->sampler.IsBuffer()) { 2023 if (meta->lod && !meta->sampler.IsBuffer()) {
1837 expr += ", "; 2024 expr += ", ";
@@ -1945,7 +2132,7 @@ private:
1945 // TODO(Subv): Figure out how dual-source blending is configured in the Switch. 2132 // TODO(Subv): Figure out how dual-source blending is configured in the Switch.
1946 for (u32 component = 0; component < 4; ++component) { 2133 for (u32 component = 0; component < 4; ++component) {
1947 if (header.ps.IsColorComponentOutputEnabled(render_target, component)) { 2134 if (header.ps.IsColorComponentOutputEnabled(render_target, component)) {
1948 code.AddLine("FragColor{}[{}] = {};", render_target, component, 2135 code.AddLine("frag_color{}{} = {};", render_target, GetColorSwizzle(component),
1949 SafeGetRegister(current_reg).AsFloat()); 2136 SafeGetRegister(current_reg).AsFloat());
1950 ++current_reg; 2137 ++current_reg;
1951 } 2138 }
@@ -2261,27 +2448,34 @@ private:
2261 static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount)); 2448 static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount));
2262 2449
2263 std::string GetRegister(u32 index) const { 2450 std::string GetRegister(u32 index) const {
2264 return GetDeclarationWithSuffix(index, "gpr"); 2451 return AppendSuffix(index, "gpr");
2265 } 2452 }
2266 2453
2267 std::string GetCustomVariable(u32 index) const { 2454 std::string GetCustomVariable(u32 index) const {
2268 return GetDeclarationWithSuffix(index, "custom_var"); 2455 return AppendSuffix(index, "custom_var");
2269 } 2456 }
2270 2457
2271 std::string GetPredicate(Tegra::Shader::Pred pred) const { 2458 std::string GetPredicate(Tegra::Shader::Pred pred) const {
2272 return GetDeclarationWithSuffix(static_cast<u32>(pred), "pred"); 2459 return AppendSuffix(static_cast<u32>(pred), "pred");
2273 } 2460 }
2274 2461
2275 std::string GetInputAttribute(Attribute::Index attribute) const { 2462 std::string GetGenericInputAttribute(Attribute::Index attribute) const {
2276 return GetDeclarationWithSuffix(GetGenericAttributeIndex(attribute), "input_attr"); 2463 return AppendSuffix(GetGenericAttributeIndex(attribute), INPUT_ATTRIBUTE_NAME);
2277 } 2464 }
2278 2465
2279 std::string GetOutputAttribute(Attribute::Index attribute) const { 2466 std::unordered_map<u8, GenericVaryingDescription> varying_description;
2280 return GetDeclarationWithSuffix(GetGenericAttributeIndex(attribute), "output_attr"); 2467
2468 std::string GetGenericOutputAttribute(Attribute::Index attribute, std::size_t element) const {
2469 const u8 offset = static_cast<u8>(GetGenericAttributeIndex(attribute) * 4 + element);
2470 const auto& description = varying_description.at(offset);
2471 if (description.is_scalar) {
2472 return description.name;
2473 }
2474 return fmt::format("{}[{}]", description.name, element - description.first_element);
2281 } 2475 }
2282 2476
2283 std::string GetConstBuffer(u32 index) const { 2477 std::string GetConstBuffer(u32 index) const {
2284 return GetDeclarationWithSuffix(index, "cbuf"); 2478 return AppendSuffix(index, "cbuf");
2285 } 2479 }
2286 2480
2287 std::string GetGlobalMemory(const GlobalMemoryBase& descriptor) const { 2481 std::string GetGlobalMemory(const GlobalMemoryBase& descriptor) const {
@@ -2294,11 +2488,15 @@ private:
2294 } 2488 }
2295 2489
2296 std::string GetConstBufferBlock(u32 index) const { 2490 std::string GetConstBufferBlock(u32 index) const {
2297 return GetDeclarationWithSuffix(index, "cbuf_block"); 2491 return AppendSuffix(index, "cbuf_block");
2298 } 2492 }
2299 2493
2300 std::string GetLocalMemory() const { 2494 std::string GetLocalMemory() const {
2301 return "lmem_" + suffix; 2495 if (suffix.empty()) {
2496 return "lmem";
2497 } else {
2498 return "lmem_" + std::string{suffix};
2499 }
2302 } 2500 }
2303 2501
2304 std::string GetInternalFlag(InternalFlag flag) const { 2502 std::string GetInternalFlag(InternalFlag flag) const {
@@ -2307,23 +2505,31 @@ private:
2307 const auto index = static_cast<u32>(flag); 2505 const auto index = static_cast<u32>(flag);
2308 ASSERT(index < static_cast<u32>(InternalFlag::Amount)); 2506 ASSERT(index < static_cast<u32>(InternalFlag::Amount));
2309 2507
2310 return fmt::format("{}_{}", InternalFlagNames[index], suffix); 2508 if (suffix.empty()) {
2509 return InternalFlagNames[index];
2510 } else {
2511 return fmt::format("{}_{}", InternalFlagNames[index], suffix);
2512 }
2311 } 2513 }
2312 2514
2313 std::string GetSampler(const Sampler& sampler) const { 2515 std::string GetSampler(const Sampler& sampler) const {
2314 return GetDeclarationWithSuffix(static_cast<u32>(sampler.GetIndex()), "sampler"); 2516 return AppendSuffix(static_cast<u32>(sampler.GetIndex()), "sampler");
2315 } 2517 }
2316 2518
2317 std::string GetImage(const Image& image) const { 2519 std::string GetImage(const Image& image) const {
2318 return GetDeclarationWithSuffix(static_cast<u32>(image.GetIndex()), "image"); 2520 return AppendSuffix(static_cast<u32>(image.GetIndex()), "image");
2319 } 2521 }
2320 2522
2321 std::string GetDeclarationWithSuffix(u32 index, std::string_view name) const { 2523 std::string AppendSuffix(u32 index, std::string_view name) const {
2322 return fmt::format("{}_{}_{}", name, index, suffix); 2524 if (suffix.empty()) {
2525 return fmt::format("{}{}", name, index);
2526 } else {
2527 return fmt::format("{}{}_{}", name, index, suffix);
2528 }
2323 } 2529 }
2324 2530
2325 u32 GetNumPhysicalInputAttributes() const { 2531 u32 GetNumPhysicalInputAttributes() const {
2326 return IsVertexShader(stage) ? GetNumPhysicalAttributes() : GetNumPhysicalVaryings(); 2532 return stage == ShaderType::Vertex ? GetNumPhysicalAttributes() : GetNumPhysicalVaryings();
2327 } 2533 }
2328 2534
2329 u32 GetNumPhysicalAttributes() const { 2535 u32 GetNumPhysicalAttributes() const {
@@ -2334,17 +2540,31 @@ private:
2334 return std::min<u32>(device.GetMaxVaryings(), Maxwell::NumVaryings); 2540 return std::min<u32>(device.GetMaxVaryings(), Maxwell::NumVaryings);
2335 } 2541 }
2336 2542
2543 bool IsRenderTargetEnabled(u32 render_target) const {
2544 for (u32 component = 0; component < 4; ++component) {
2545 if (header.ps.IsColorComponentOutputEnabled(render_target, component)) {
2546 return true;
2547 }
2548 }
2549 return false;
2550 }
2551
2337 const Device& device; 2552 const Device& device;
2338 const ShaderIR& ir; 2553 const ShaderIR& ir;
2554 const Registry& registry;
2339 const ShaderType stage; 2555 const ShaderType stage;
2340 const std::string suffix; 2556 const std::string_view identifier;
2557 const std::string_view suffix;
2341 const Header header; 2558 const Header header;
2559 std::unordered_map<u8, VaryingTFB> transform_feedback;
2342 2560
2343 ShaderWriter code; 2561 ShaderWriter code;
2562
2563 std::optional<u32> max_input_vertices;
2344}; 2564};
2345 2565
2346std::string GetFlowVariable(u32 i) { 2566std::string GetFlowVariable(u32 index) {
2347 return fmt::format("flow_var_{}", i); 2567 return fmt::format("flow_var{}", index);
2348} 2568}
2349 2569
2350class ExprDecompiler { 2570class ExprDecompiler {
@@ -2531,7 +2751,7 @@ void GLSLDecompiler::DecompileAST() {
2531 2751
2532} // Anonymous namespace 2752} // Anonymous namespace
2533 2753
2534ShaderEntries GetEntries(const VideoCommon::Shader::ShaderIR& ir) { 2754ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir) {
2535 ShaderEntries entries; 2755 ShaderEntries entries;
2536 for (const auto& cbuf : ir.GetConstantBuffers()) { 2756 for (const auto& cbuf : ir.GetConstantBuffers()) {
2537 entries.const_buffers.emplace_back(cbuf.second.GetMaxOffset(), cbuf.second.IsIndirect(), 2757 entries.const_buffers.emplace_back(cbuf.second.GetMaxOffset(), cbuf.second.IsIndirect(),
@@ -2555,28 +2775,12 @@ ShaderEntries GetEntries(const VideoCommon::Shader::ShaderIR& ir) {
2555 return entries; 2775 return entries;
2556} 2776}
2557 2777
2558std::string GetCommonDeclarations() { 2778std::string DecompileShader(const Device& device, const ShaderIR& ir, const Registry& registry,
2559 return R"(#define ftoi floatBitsToInt 2779 ShaderType stage, std::string_view identifier,
2560#define ftou floatBitsToUint 2780 std::string_view suffix) {
2561#define itof intBitsToFloat 2781 GLSLDecompiler decompiler(device, ir, registry, stage, identifier, suffix);
2562#define utof uintBitsToFloat
2563
2564bvec2 HalfFloatNanComparison(bvec2 comparison, vec2 pair1, vec2 pair2) {
2565 bvec2 is_nan1 = isnan(pair1);
2566 bvec2 is_nan2 = isnan(pair2);
2567 return bvec2(comparison.x || is_nan1.x || is_nan2.x, comparison.y || is_nan1.y || is_nan2.y);
2568}
2569
2570const float fswzadd_modifiers_a[] = float[4](-1.0f, 1.0f, -1.0f, 0.0f );
2571const float fswzadd_modifiers_b[] = float[4](-1.0f, -1.0f, 1.0f, -1.0f );
2572)";
2573}
2574
2575std::string Decompile(const Device& device, const ShaderIR& ir, ShaderType stage,
2576 const std::string& suffix) {
2577 GLSLDecompiler decompiler(device, ir, stage, suffix);
2578 decompiler.Decompile(); 2782 decompiler.Decompile();
2579 return decompiler.GetResult(); 2783 return decompiler.GetResult();
2580} 2784}
2581 2785
2582} // namespace OpenGL::GLShader 2786} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.h b/src/video_core/renderer_opengl/gl_shader_decompiler.h
index 0f692c1db..e7dbd810c 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.h
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.h
@@ -6,22 +6,18 @@
6 6
7#include <array> 7#include <array>
8#include <string> 8#include <string>
9#include <string_view>
9#include <utility> 10#include <utility>
10#include <vector> 11#include <vector>
11#include "common/common_types.h" 12#include "common/common_types.h"
12#include "video_core/engines/maxwell_3d.h" 13#include "video_core/engines/maxwell_3d.h"
13#include "video_core/engines/shader_type.h" 14#include "video_core/engines/shader_type.h"
15#include "video_core/shader/registry.h"
14#include "video_core/shader/shader_ir.h" 16#include "video_core/shader/shader_ir.h"
15 17
16namespace VideoCommon::Shader {
17class ShaderIR;
18}
19
20namespace OpenGL { 18namespace OpenGL {
21class Device;
22}
23 19
24namespace OpenGL::GLShader { 20class Device;
25 21
26using Maxwell = Tegra::Engines::Maxwell3D::Regs; 22using Maxwell = Tegra::Engines::Maxwell3D::Regs;
27using SamplerEntry = VideoCommon::Shader::Sampler; 23using SamplerEntry = VideoCommon::Shader::Sampler;
@@ -78,11 +74,11 @@ struct ShaderEntries {
78 std::size_t shader_length{}; 74 std::size_t shader_length{};
79}; 75};
80 76
81ShaderEntries GetEntries(const VideoCommon::Shader::ShaderIR& ir); 77ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir);
82
83std::string GetCommonDeclarations();
84 78
85std::string Decompile(const Device& device, const VideoCommon::Shader::ShaderIR& ir, 79std::string DecompileShader(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
86 Tegra::Engines::ShaderType stage, const std::string& suffix); 80 const VideoCommon::Shader::Registry& registry,
81 Tegra::Engines::ShaderType stage, std::string_view identifier,
82 std::string_view suffix = {});
87 83
88} // namespace OpenGL::GLShader 84} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
index 1fc204f6f..9e95a122b 100644
--- a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
@@ -31,32 +31,24 @@ namespace {
31 31
32using ShaderCacheVersionHash = std::array<u8, 64>; 32using ShaderCacheVersionHash = std::array<u8, 64>;
33 33
34enum class TransferableEntryKind : u32 {
35 Raw,
36 Usage,
37};
38
39struct ConstBufferKey { 34struct ConstBufferKey {
40 u32 cbuf{}; 35 u32 cbuf = 0;
41 u32 offset{}; 36 u32 offset = 0;
42 u32 value{}; 37 u32 value = 0;
43}; 38};
44 39
45struct BoundSamplerKey { 40struct BoundSamplerKey {
46 u32 offset{}; 41 u32 offset = 0;
47 Tegra::Engines::SamplerDescriptor sampler{}; 42 Tegra::Engines::SamplerDescriptor sampler;
48}; 43};
49 44
50struct BindlessSamplerKey { 45struct BindlessSamplerKey {
51 u32 cbuf{}; 46 u32 cbuf = 0;
52 u32 offset{}; 47 u32 offset = 0;
53 Tegra::Engines::SamplerDescriptor sampler{}; 48 Tegra::Engines::SamplerDescriptor sampler;
54}; 49};
55 50
56constexpr u32 NativeVersion = 12; 51constexpr u32 NativeVersion = 20;
57
58// Making sure sizes doesn't change by accident
59static_assert(sizeof(ProgramVariant) == 20);
60 52
61ShaderCacheVersionHash GetShaderCacheVersionHash() { 53ShaderCacheVersionHash GetShaderCacheVersionHash() {
62 ShaderCacheVersionHash hash{}; 54 ShaderCacheVersionHash hash{};
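The NativeVersion bump above intentionally invalidates existing transferable caches: the file begins with a u32 version and anything that does not match is discarded rather than migrated. A standalone sketch of that gate using plain iostreams, not part of this change; kNativeVersion mirrors the value introduced here and the path handling is simplified:

#include <cstdint>
#include <fstream>

constexpr std::uint32_t kNativeVersion = 20; // matches the value introduced above

// Reads the leading version word and reports whether the cache file is usable.
// A mismatch means the caller should delete the stale file instead of migrating it.
bool IsTransferableUsable(std::ifstream& file) {
    std::uint32_t version = 0;
    if (!file.read(reinterpret_cast<char*>(&version), sizeof(version))) {
        return false;
    }
    return version == kNativeVersion;
}

int main() {
    std::ifstream file("transferable.bin", std::ios::binary); // example path
    return IsTransferableUsable(file) ? 0 : 1;
}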
@@ -67,61 +59,124 @@ ShaderCacheVersionHash GetShaderCacheVersionHash() {
67 59
68} // Anonymous namespace 60} // Anonymous namespace
69 61
70ShaderDiskCacheRaw::ShaderDiskCacheRaw(u64 unique_identifier, ShaderType type, ProgramCode code, 62ShaderDiskCacheEntry::ShaderDiskCacheEntry() = default;
71 ProgramCode code_b)
72 : unique_identifier{unique_identifier}, type{type}, code{std::move(code)}, code_b{std::move(
73 code_b)} {}
74 63
75ShaderDiskCacheRaw::ShaderDiskCacheRaw() = default; 64ShaderDiskCacheEntry::~ShaderDiskCacheEntry() = default;
76 65
77ShaderDiskCacheRaw::~ShaderDiskCacheRaw() = default; 66bool ShaderDiskCacheEntry::Load(FileUtil::IOFile& file) {
78 67 if (file.ReadBytes(&type, sizeof(u32)) != sizeof(u32)) {
79bool ShaderDiskCacheRaw::Load(FileUtil::IOFile& file) {
80 if (file.ReadBytes(&unique_identifier, sizeof(u64)) != sizeof(u64) ||
81 file.ReadBytes(&type, sizeof(u32)) != sizeof(u32)) {
82 return false; 68 return false;
83 } 69 }
84 u32 code_size{}; 70 u32 code_size;
85 u32 code_size_b{}; 71 u32 code_size_b;
86 if (file.ReadBytes(&code_size, sizeof(u32)) != sizeof(u32) || 72 if (file.ReadBytes(&code_size, sizeof(u32)) != sizeof(u32) ||
87 file.ReadBytes(&code_size_b, sizeof(u32)) != sizeof(u32)) { 73 file.ReadBytes(&code_size_b, sizeof(u32)) != sizeof(u32)) {
88 return false; 74 return false;
89 } 75 }
90
91 code.resize(code_size); 76 code.resize(code_size);
92 code_b.resize(code_size_b); 77 code_b.resize(code_size_b);
93 78
94 if (file.ReadArray(code.data(), code_size) != code_size) 79 if (file.ReadArray(code.data(), code_size) != code_size) {
95 return false; 80 return false;
96 81 }
97 if (HasProgramA() && file.ReadArray(code_b.data(), code_size_b) != code_size_b) { 82 if (HasProgramA() && file.ReadArray(code_b.data(), code_size_b) != code_size_b) {
98 return false; 83 return false;
99 } 84 }
85
86 u8 is_texture_handler_size_known;
87 u32 texture_handler_size_value;
88 u32 num_keys;
89 u32 num_bound_samplers;
90 u32 num_bindless_samplers;
91 if (file.ReadArray(&unique_identifier, 1) != 1 || file.ReadArray(&bound_buffer, 1) != 1 ||
92 file.ReadArray(&is_texture_handler_size_known, 1) != 1 ||
93 file.ReadArray(&texture_handler_size_value, 1) != 1 ||
94 file.ReadArray(&graphics_info, 1) != 1 || file.ReadArray(&compute_info, 1) != 1 ||
95 file.ReadArray(&num_keys, 1) != 1 || file.ReadArray(&num_bound_samplers, 1) != 1 ||
96 file.ReadArray(&num_bindless_samplers, 1) != 1) {
97 return false;
98 }
99 if (is_texture_handler_size_known) {
100 texture_handler_size = texture_handler_size_value;
101 }
102
103 std::vector<ConstBufferKey> flat_keys(num_keys);
104 std::vector<BoundSamplerKey> flat_bound_samplers(num_bound_samplers);
105 std::vector<BindlessSamplerKey> flat_bindless_samplers(num_bindless_samplers);
106 if (file.ReadArray(flat_keys.data(), flat_keys.size()) != flat_keys.size() ||
107 file.ReadArray(flat_bound_samplers.data(), flat_bound_samplers.size()) !=
108 flat_bound_samplers.size() ||
109 file.ReadArray(flat_bindless_samplers.data(), flat_bindless_samplers.size()) !=
110 flat_bindless_samplers.size()) {
111 return false;
112 }
113 for (const auto& key : flat_keys) {
114 keys.insert({{key.cbuf, key.offset}, key.value});
115 }
116 for (const auto& key : flat_bound_samplers) {
117 bound_samplers.emplace(key.offset, key.sampler);
118 }
119 for (const auto& key : flat_bindless_samplers) {
120 bindless_samplers.insert({{key.cbuf, key.offset}, key.sampler});
121 }
122
100 return true; 123 return true;
101} 124}
102 125
103bool ShaderDiskCacheRaw::Save(FileUtil::IOFile& file) const { 126bool ShaderDiskCacheEntry::Save(FileUtil::IOFile& file) const {
104 if (file.WriteObject(unique_identifier) != 1 || file.WriteObject(static_cast<u32>(type)) != 1 || 127 if (file.WriteObject(static_cast<u32>(type)) != 1 ||
105 file.WriteObject(static_cast<u32>(code.size())) != 1 || 128 file.WriteObject(static_cast<u32>(code.size())) != 1 ||
106 file.WriteObject(static_cast<u32>(code_b.size())) != 1) { 129 file.WriteObject(static_cast<u32>(code_b.size())) != 1) {
107 return false; 130 return false;
108 } 131 }
109 132 if (file.WriteArray(code.data(), code.size()) != code.size()) {
110 if (file.WriteArray(code.data(), code.size()) != code.size())
111 return false; 133 return false;
112 134 }
113 if (HasProgramA() && file.WriteArray(code_b.data(), code_b.size()) != code_b.size()) { 135 if (HasProgramA() && file.WriteArray(code_b.data(), code_b.size()) != code_b.size()) {
114 return false; 136 return false;
115 } 137 }
116 return true; 138
139 if (file.WriteObject(unique_identifier) != 1 || file.WriteObject(bound_buffer) != 1 ||
140 file.WriteObject(static_cast<u8>(texture_handler_size.has_value())) != 1 ||
141 file.WriteObject(texture_handler_size.value_or(0)) != 1 ||
142 file.WriteObject(graphics_info) != 1 || file.WriteObject(compute_info) != 1 ||
143 file.WriteObject(static_cast<u32>(keys.size())) != 1 ||
144 file.WriteObject(static_cast<u32>(bound_samplers.size())) != 1 ||
145 file.WriteObject(static_cast<u32>(bindless_samplers.size())) != 1) {
146 return false;
147 }
148
149 std::vector<ConstBufferKey> flat_keys;
150 flat_keys.reserve(keys.size());
151 for (const auto& [address, value] : keys) {
152 flat_keys.push_back(ConstBufferKey{address.first, address.second, value});
153 }
154
155 std::vector<BoundSamplerKey> flat_bound_samplers;
156 flat_bound_samplers.reserve(bound_samplers.size());
157 for (const auto& [address, sampler] : bound_samplers) {
158 flat_bound_samplers.push_back(BoundSamplerKey{address, sampler});
159 }
160
161 std::vector<BindlessSamplerKey> flat_bindless_samplers;
162 flat_bindless_samplers.reserve(bindless_samplers.size());
163 for (const auto& [address, sampler] : bindless_samplers) {
164 flat_bindless_samplers.push_back(
165 BindlessSamplerKey{address.first, address.second, sampler});
166 }
167
168 return file.WriteArray(flat_keys.data(), flat_keys.size()) == flat_keys.size() &&
169 file.WriteArray(flat_bound_samplers.data(), flat_bound_samplers.size()) ==
170 flat_bound_samplers.size() &&
171 file.WriteArray(flat_bindless_samplers.data(), flat_bindless_samplers.size()) ==
172 flat_bindless_samplers.size();
117} 173}
118 174
119ShaderDiskCacheOpenGL::ShaderDiskCacheOpenGL(Core::System& system) : system{system} {} 175ShaderDiskCacheOpenGL::ShaderDiskCacheOpenGL(Core::System& system) : system{system} {}
120 176
121ShaderDiskCacheOpenGL::~ShaderDiskCacheOpenGL() = default; 177ShaderDiskCacheOpenGL::~ShaderDiskCacheOpenGL() = default;
122 178
123std::optional<std::pair<std::vector<ShaderDiskCacheRaw>, std::vector<ShaderDiskCacheUsage>>> 179std::optional<std::vector<ShaderDiskCacheEntry>> ShaderDiskCacheOpenGL::LoadTransferable() {
124ShaderDiskCacheOpenGL::LoadTransferable() {
125 // Skip games without title id 180 // Skip games without title id
126 const bool has_title_id = system.CurrentProcess()->GetTitleID() != 0; 181 const bool has_title_id = system.CurrentProcess()->GetTitleID() != 0;
127 if (!Settings::values.use_disk_shader_cache || !has_title_id) { 182 if (!Settings::values.use_disk_shader_cache || !has_title_id) {
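One detail of the entry format above worth noting: texture_handler_size is serialized as a u8 presence flag followed by a u32 value that only matters when the flag is set. A standalone sketch of that encoding, not part of this change:

#include <cstdint>
#include <iostream>
#include <optional>

struct EncodedSize {
    std::uint8_t is_known = 0; // presence flag
    std::uint32_t value = 0;   // only meaningful when is_known != 0
};

EncodedSize Pack(std::optional<std::uint32_t> size) {
    return {static_cast<std::uint8_t>(size.has_value() ? 1 : 0), size.value_or(0)};
}

std::optional<std::uint32_t> Unpack(const EncodedSize& encoded) {
    if (encoded.is_known == 0) {
        return std::nullopt;
    }
    return encoded.value;
}

int main() {
    const EncodedSize encoded = Pack(std::nullopt);
    std::cout << (Unpack(encoded).has_value() ? "known" : "unknown") << '\n';
    return 0;
}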
@@ -130,17 +185,14 @@ ShaderDiskCacheOpenGL::LoadTransferable() {
130 185
131 FileUtil::IOFile file(GetTransferablePath(), "rb"); 186 FileUtil::IOFile file(GetTransferablePath(), "rb");
132 if (!file.IsOpen()) { 187 if (!file.IsOpen()) {
133 LOG_INFO(Render_OpenGL, "No transferable shader cache found for game with title id={}", 188 LOG_INFO(Render_OpenGL, "No transferable shader cache found");
134 GetTitleID());
135 is_usable = true; 189 is_usable = true;
136 return {}; 190 return {};
137 } 191 }
138 192
139 u32 version{}; 193 u32 version{};
140 if (file.ReadBytes(&version, sizeof(version)) != sizeof(version)) { 194 if (file.ReadBytes(&version, sizeof(version)) != sizeof(version)) {
141 LOG_ERROR(Render_OpenGL, 195 LOG_ERROR(Render_OpenGL, "Failed to get transferable cache version, skipping it");
142 "Failed to get transferable cache version for title id={}, skipping",
143 GetTitleID());
144 return {}; 196 return {};
145 } 197 }
146 198
@@ -158,105 +210,42 @@ ShaderDiskCacheOpenGL::LoadTransferable() {
158 } 210 }
159 211
160 // Version is valid, load the shaders 212 // Version is valid, load the shaders
161 constexpr const char error_loading[] = "Failed to load transferable raw entry, skipping"; 213 std::vector<ShaderDiskCacheEntry> entries;
162 std::vector<ShaderDiskCacheRaw> raws;
163 std::vector<ShaderDiskCacheUsage> usages;
164 while (file.Tell() < file.GetSize()) { 214 while (file.Tell() < file.GetSize()) {
165 TransferableEntryKind kind{}; 215 ShaderDiskCacheEntry& entry = entries.emplace_back();
166 if (file.ReadBytes(&kind, sizeof(u32)) != sizeof(u32)) { 216 if (!entry.Load(file)) {
167 LOG_ERROR(Render_OpenGL, "Failed to read transferable file, skipping"); 217 LOG_ERROR(Render_OpenGL, "Failed to load transferable raw entry, skipping");
168 return {};
169 }
170
171 switch (kind) {
172 case TransferableEntryKind::Raw: {
173 ShaderDiskCacheRaw entry;
174 if (!entry.Load(file)) {
175 LOG_ERROR(Render_OpenGL, error_loading);
176 return {};
177 }
178 transferable.insert({entry.GetUniqueIdentifier(), {}});
179 raws.push_back(std::move(entry));
180 break;
181 }
182 case TransferableEntryKind::Usage: {
183 ShaderDiskCacheUsage usage;
184
185 u32 num_keys{};
186 u32 num_bound_samplers{};
187 u32 num_bindless_samplers{};
188 if (file.ReadArray(&usage.unique_identifier, 1) != 1 ||
189 file.ReadArray(&usage.variant, 1) != 1 ||
190 file.ReadArray(&usage.bound_buffer, 1) != 1 || file.ReadArray(&num_keys, 1) != 1 ||
191 file.ReadArray(&num_bound_samplers, 1) != 1 ||
192 file.ReadArray(&num_bindless_samplers, 1) != 1) {
193 LOG_ERROR(Render_OpenGL, error_loading);
194 return {};
195 }
196
197 std::vector<ConstBufferKey> keys(num_keys);
198 std::vector<BoundSamplerKey> bound_samplers(num_bound_samplers);
199 std::vector<BindlessSamplerKey> bindless_samplers(num_bindless_samplers);
200 if (file.ReadArray(keys.data(), keys.size()) != keys.size() ||
201 file.ReadArray(bound_samplers.data(), bound_samplers.size()) !=
202 bound_samplers.size() ||
203 file.ReadArray(bindless_samplers.data(), bindless_samplers.size()) !=
204 bindless_samplers.size()) {
205 LOG_ERROR(Render_OpenGL, error_loading);
206 return {};
207 }
208 for (const auto& key : keys) {
209 usage.keys.insert({{key.cbuf, key.offset}, key.value});
210 }
211 for (const auto& key : bound_samplers) {
212 usage.bound_samplers.emplace(key.offset, key.sampler);
213 }
214 for (const auto& key : bindless_samplers) {
215 usage.bindless_samplers.insert({{key.cbuf, key.offset}, key.sampler});
216 }
217
218 usages.push_back(std::move(usage));
219 break;
220 }
221 default:
222 LOG_ERROR(Render_OpenGL, "Unknown transferable shader cache entry kind={}, skipping",
223 static_cast<u32>(kind));
224 return {}; 218 return {};
225 } 219 }
226 } 220 }
227 221
228 is_usable = true; 222 is_usable = true;
229 return {{std::move(raws), std::move(usages)}}; 223 return {std::move(entries)};
230} 224}
231 225
232std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump> 226std::vector<ShaderDiskCachePrecompiled> ShaderDiskCacheOpenGL::LoadPrecompiled() {
233ShaderDiskCacheOpenGL::LoadPrecompiled() {
234 if (!is_usable) { 227 if (!is_usable) {
235 return {}; 228 return {};
236 } 229 }
237 230
238 std::string path = GetPrecompiledPath(); 231 FileUtil::IOFile file(GetPrecompiledPath(), "rb");
239 FileUtil::IOFile file(path, "rb");
240 if (!file.IsOpen()) { 232 if (!file.IsOpen()) {
241 LOG_INFO(Render_OpenGL, "No precompiled shader cache found for game with title id={}", 233 LOG_INFO(Render_OpenGL, "No precompiled shader cache found");
242 GetTitleID());
243 return {}; 234 return {};
244 } 235 }
245 236
246 const auto result = LoadPrecompiledFile(file); 237 if (const auto result = LoadPrecompiledFile(file)) {
247 if (!result) { 238 return *result;
248 LOG_INFO(Render_OpenGL,
249 "Failed to load precompiled cache for game with title id={}, removing",
250 GetTitleID());
251 file.Close();
252 InvalidatePrecompiled();
253 return {};
254 } 239 }
255 return *result; 240
241 LOG_INFO(Render_OpenGL, "Failed to load precompiled cache");
242 file.Close();
243 InvalidatePrecompiled();
244 return {};
256} 245}
257 246
258std::optional<std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump>> 247std::optional<std::vector<ShaderDiskCachePrecompiled>> ShaderDiskCacheOpenGL::LoadPrecompiledFile(
259ShaderDiskCacheOpenGL::LoadPrecompiledFile(FileUtil::IOFile& file) { 248 FileUtil::IOFile& file) {
260 // Read compressed file from disk and decompress to virtual precompiled cache file 249 // Read compressed file from disk and decompress to virtual precompiled cache file
261 std::vector<u8> compressed(file.GetSize()); 250 std::vector<u8> compressed(file.GetSize());
262 file.ReadBytes(compressed.data(), compressed.size()); 251 file.ReadBytes(compressed.data(), compressed.size());
@@ -275,58 +264,22 @@ ShaderDiskCacheOpenGL::LoadPrecompiledFile(FileUtil::IOFile& file) {
275 return {}; 264 return {};
276 } 265 }
277 266
278 ShaderDumpsMap dumps; 267 std::vector<ShaderDiskCachePrecompiled> entries;
279 while (precompiled_cache_virtual_file_offset < precompiled_cache_virtual_file.GetSize()) { 268 while (precompiled_cache_virtual_file_offset < precompiled_cache_virtual_file.GetSize()) {
280 u32 num_keys{}; 269 u32 binary_size;
281 u32 num_bound_samplers{}; 270 auto& entry = entries.emplace_back();
282 u32 num_bindless_samplers{}; 271 if (!LoadObjectFromPrecompiled(entry.unique_identifier) ||
283 ShaderDiskCacheUsage usage; 272 !LoadObjectFromPrecompiled(entry.binary_format) ||
284 if (!LoadObjectFromPrecompiled(usage.unique_identifier) || 273 !LoadObjectFromPrecompiled(binary_size)) {
285 !LoadObjectFromPrecompiled(usage.variant) ||
286 !LoadObjectFromPrecompiled(usage.bound_buffer) ||
287 !LoadObjectFromPrecompiled(num_keys) ||
288 !LoadObjectFromPrecompiled(num_bound_samplers) ||
289 !LoadObjectFromPrecompiled(num_bindless_samplers)) {
290 return {};
291 }
292 std::vector<ConstBufferKey> keys(num_keys);
293 std::vector<BoundSamplerKey> bound_samplers(num_bound_samplers);
294 std::vector<BindlessSamplerKey> bindless_samplers(num_bindless_samplers);
295 if (!LoadArrayFromPrecompiled(keys.data(), keys.size()) ||
296 !LoadArrayFromPrecompiled(bound_samplers.data(), bound_samplers.size()) !=
297 bound_samplers.size() ||
298 !LoadArrayFromPrecompiled(bindless_samplers.data(), bindless_samplers.size()) !=
299 bindless_samplers.size()) {
300 return {};
301 }
302 for (const auto& key : keys) {
303 usage.keys.insert({{key.cbuf, key.offset}, key.value});
304 }
305 for (const auto& key : bound_samplers) {
306 usage.bound_samplers.emplace(key.offset, key.sampler);
307 }
308 for (const auto& key : bindless_samplers) {
309 usage.bindless_samplers.insert({{key.cbuf, key.offset}, key.sampler});
310 }
311
312 ShaderDiskCacheDump dump;
313 if (!LoadObjectFromPrecompiled(dump.binary_format)) {
314 return {};
315 }
316
317 u32 binary_length{};
318 if (!LoadObjectFromPrecompiled(binary_length)) {
319 return {}; 274 return {};
320 } 275 }
321 276
322 dump.binary.resize(binary_length); 277 entry.binary.resize(binary_size);
323 if (!LoadArrayFromPrecompiled(dump.binary.data(), dump.binary.size())) { 278 if (!LoadArrayFromPrecompiled(entry.binary.data(), entry.binary.size())) {
324 return {}; 279 return {};
325 } 280 }
326
327 dumps.emplace(std::move(usage), dump);
328 } 281 }
329 return dumps; 282 return entries;
330} 283}
331 284
332void ShaderDiskCacheOpenGL::InvalidateTransferable() { 285void ShaderDiskCacheOpenGL::InvalidateTransferable() {
@@ -346,13 +299,13 @@ void ShaderDiskCacheOpenGL::InvalidatePrecompiled() {
346 } 299 }
347} 300}
348 301
349void ShaderDiskCacheOpenGL::SaveRaw(const ShaderDiskCacheRaw& entry) { 302void ShaderDiskCacheOpenGL::SaveEntry(const ShaderDiskCacheEntry& entry) {
350 if (!is_usable) { 303 if (!is_usable) {
351 return; 304 return;
352 } 305 }
353 306
354 const u64 id = entry.GetUniqueIdentifier(); 307 const u64 id = entry.unique_identifier;
355 if (transferable.find(id) != transferable.end()) { 308 if (stored_transferable.find(id) != stored_transferable.end()) {
356 // The shader already exists 309 // The shader already exists
357 return; 310 return;
358 } 311 }
@@ -361,71 +314,17 @@ void ShaderDiskCacheOpenGL::SaveRaw(const ShaderDiskCacheRaw& entry) {
361 if (!file.IsOpen()) { 314 if (!file.IsOpen()) {
362 return; 315 return;
363 } 316 }
364 if (file.WriteObject(TransferableEntryKind::Raw) != 1 || !entry.Save(file)) { 317 if (!entry.Save(file)) {
365 LOG_ERROR(Render_OpenGL, "Failed to save raw transferable cache entry, removing"); 318 LOG_ERROR(Render_OpenGL, "Failed to save raw transferable cache entry, removing");
366 file.Close(); 319 file.Close();
367 InvalidateTransferable(); 320 InvalidateTransferable();
368 return; 321 return;
369 } 322 }
370 transferable.insert({id, {}});
371}
372 323
373void ShaderDiskCacheOpenGL::SaveUsage(const ShaderDiskCacheUsage& usage) { 324 stored_transferable.insert(id);
374 if (!is_usable) {
375 return;
376 }
377
378 const auto it = transferable.find(usage.unique_identifier);
379 ASSERT_MSG(it != transferable.end(), "Saving shader usage without storing raw previously");
380
381 auto& usages{it->second};
382 if (usages.find(usage) != usages.end()) {
383 // Skip this variant since the shader is already stored.
384 return;
385 }
386 usages.insert(usage);
387
388 FileUtil::IOFile file = AppendTransferableFile();
389 if (!file.IsOpen())
390 return;
391 const auto Close = [&] {
392 LOG_ERROR(Render_OpenGL, "Failed to save usage transferable cache entry, removing");
393 file.Close();
394 InvalidateTransferable();
395 };
396
397 if (file.WriteObject(TransferableEntryKind::Usage) != 1 ||
398 file.WriteObject(usage.unique_identifier) != 1 || file.WriteObject(usage.variant) != 1 ||
399 file.WriteObject(usage.bound_buffer) != 1 ||
400 file.WriteObject(static_cast<u32>(usage.keys.size())) != 1 ||
401 file.WriteObject(static_cast<u32>(usage.bound_samplers.size())) != 1 ||
402 file.WriteObject(static_cast<u32>(usage.bindless_samplers.size())) != 1) {
403 Close();
404 return;
405 }
406 for (const auto& [pair, value] : usage.keys) {
407 const auto [cbuf, offset] = pair;
408 if (file.WriteObject(ConstBufferKey{cbuf, offset, value}) != 1) {
409 Close();
410 return;
411 }
412 }
413 for (const auto& [offset, sampler] : usage.bound_samplers) {
414 if (file.WriteObject(BoundSamplerKey{offset, sampler}) != 1) {
415 Close();
416 return;
417 }
418 }
419 for (const auto& [pair, sampler] : usage.bindless_samplers) {
420 const auto [cbuf, offset] = pair;
421 if (file.WriteObject(BindlessSamplerKey{cbuf, offset, sampler}) != 1) {
422 Close();
423 return;
424 }
425 }
426} 325}
427 326
428void ShaderDiskCacheOpenGL::SaveDump(const ShaderDiskCacheUsage& usage, GLuint program) { 327void ShaderDiskCacheOpenGL::SavePrecompiled(u64 unique_identifier, GLuint program) {
429 if (!is_usable) { 328 if (!is_usable) {
430 return; 329 return;
431 } 330 }
@@ -437,51 +336,19 @@ void ShaderDiskCacheOpenGL::SaveDump(const ShaderDiskCacheUsage& usage, GLuint p
437 SavePrecompiledHeaderToVirtualPrecompiledCache(); 336 SavePrecompiledHeaderToVirtualPrecompiledCache();
438 } 337 }
439 338
440 GLint binary_length{}; 339 GLint binary_length;
441 glGetProgramiv(program, GL_PROGRAM_BINARY_LENGTH, &binary_length); 340 glGetProgramiv(program, GL_PROGRAM_BINARY_LENGTH, &binary_length);
442 341
443 GLenum binary_format{}; 342 GLenum binary_format;
444 std::vector<u8> binary(binary_length); 343 std::vector<u8> binary(binary_length);
445 glGetProgramBinary(program, binary_length, nullptr, &binary_format, binary.data()); 344 glGetProgramBinary(program, binary_length, nullptr, &binary_format, binary.data());
446 345
447 const auto Close = [&] { 346 if (!SaveObjectToPrecompiled(unique_identifier) || !SaveObjectToPrecompiled(binary_format) ||
347 !SaveObjectToPrecompiled(static_cast<u32>(binary.size())) ||
348 !SaveArrayToPrecompiled(binary.data(), binary.size())) {
448 LOG_ERROR(Render_OpenGL, "Failed to save binary program file in shader={:016X}, removing", 349 LOG_ERROR(Render_OpenGL, "Failed to save binary program file in shader={:016X}, removing",
449 usage.unique_identifier); 350 unique_identifier);
450 InvalidatePrecompiled(); 351 InvalidatePrecompiled();
451 };
452
453 if (!SaveObjectToPrecompiled(usage.unique_identifier) ||
454 !SaveObjectToPrecompiled(usage.variant) || !SaveObjectToPrecompiled(usage.bound_buffer) ||
455 !SaveObjectToPrecompiled(static_cast<u32>(usage.keys.size())) ||
456 !SaveObjectToPrecompiled(static_cast<u32>(usage.bound_samplers.size())) ||
457 !SaveObjectToPrecompiled(static_cast<u32>(usage.bindless_samplers.size()))) {
458 Close();
459 return;
460 }
461 for (const auto& [pair, value] : usage.keys) {
462 const auto [cbuf, offset] = pair;
463 if (SaveObjectToPrecompiled(ConstBufferKey{cbuf, offset, value}) != 1) {
464 Close();
465 return;
466 }
467 }
468 for (const auto& [offset, sampler] : usage.bound_samplers) {
469 if (SaveObjectToPrecompiled(BoundSamplerKey{offset, sampler}) != 1) {
470 Close();
471 return;
472 }
473 }
474 for (const auto& [pair, sampler] : usage.bindless_samplers) {
475 const auto [cbuf, offset] = pair;
476 if (SaveObjectToPrecompiled(BindlessSamplerKey{cbuf, offset, sampler}) != 1) {
477 Close();
478 return;
479 }
480 }
481 if (!SaveObjectToPrecompiled(static_cast<u32>(binary_format)) ||
482 !SaveObjectToPrecompiled(static_cast<u32>(binary_length)) ||
483 !SaveArrayToPrecompiled(binary.data(), binary.size())) {
484 Close();
485 } 352 }
486} 353}
487 354
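SavePrecompiled above stores whatever binary the driver hands back for the linked program. A standalone sketch of that retrieval, not part of this change; it assumes a current OpenGL 4.1 or newer context, and the glad include is just one possible way to get the GL declarations:

#include <cstddef>
#include <vector>

#include <glad/glad.h> // any loader exposing glGetProgramiv/glGetProgramBinary works

// Returns the driver-specific program binary and reports its format through binary_format.
std::vector<unsigned char> DumpProgramBinary(GLuint program, GLenum& binary_format) {
    GLint binary_length = 0;
    glGetProgramiv(program, GL_PROGRAM_BINARY_LENGTH, &binary_length);

    std::vector<unsigned char> binary(static_cast<std::size_t>(binary_length));
    glGetProgramBinary(program, binary_length, nullptr, &binary_format, binary.data());
    return binary;
}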
@@ -534,7 +401,6 @@ void ShaderDiskCacheOpenGL::SaveVirtualPrecompiledFile() {
534 if (file.WriteBytes(compressed.data(), compressed.size()) != compressed.size()) { 401 if (file.WriteBytes(compressed.data(), compressed.size()) != compressed.size()) {
535 LOG_ERROR(Render_OpenGL, "Failed to write precompiled cache version in path={}", 402 LOG_ERROR(Render_OpenGL, "Failed to write precompiled cache version in path={}",
536 precompiled_path); 403 precompiled_path);
537 return;
538 } 404 }
539} 405}
540 406
diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.h b/src/video_core/renderer_opengl/gl_shader_disk_cache.h
index ef2371f6d..d5be52e40 100644
--- a/src/video_core/renderer_opengl/gl_shader_disk_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.h
@@ -19,8 +19,7 @@
19#include "common/common_types.h" 19#include "common/common_types.h"
20#include "core/file_sys/vfs_vector.h" 20#include "core/file_sys/vfs_vector.h"
21#include "video_core/engines/shader_type.h" 21#include "video_core/engines/shader_type.h"
22#include "video_core/renderer_opengl/gl_shader_gen.h" 22#include "video_core/shader/registry.h"
23#include "video_core/shader/const_buffer_locker.h"
24 23
25namespace Core { 24namespace Core {
26class System; 25class System;
@@ -32,139 +31,39 @@ class IOFile;
32 31
33namespace OpenGL { 32namespace OpenGL {
34 33
35struct ShaderDiskCacheUsage;
36struct ShaderDiskCacheDump;
37
38using ProgramCode = std::vector<u64>; 34using ProgramCode = std::vector<u64>;
39using ShaderDumpsMap = std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump>;
40
41/// Describes the different variants a program can be compiled with.
42struct ProgramVariant final {
43 ProgramVariant() = default;
44
45 /// Graphics constructor.
46 explicit constexpr ProgramVariant(GLenum primitive_mode) noexcept
47 : primitive_mode{primitive_mode} {}
48
49 /// Compute constructor.
50 explicit constexpr ProgramVariant(u32 block_x, u32 block_y, u32 block_z, u32 shared_memory_size,
51 u32 local_memory_size) noexcept
52 : block_x{block_x}, block_y{static_cast<u16>(block_y)}, block_z{static_cast<u16>(block_z)},
53 shared_memory_size{shared_memory_size}, local_memory_size{local_memory_size} {}
54
55 // Graphics specific parameters.
56 GLenum primitive_mode{};
57
58 // Compute specific parameters.
59 u32 block_x{};
60 u16 block_y{};
61 u16 block_z{};
62 u32 shared_memory_size{};
63 u32 local_memory_size{};
64
65 bool operator==(const ProgramVariant& rhs) const noexcept {
66 return std::tie(primitive_mode, block_x, block_y, block_z, shared_memory_size,
67 local_memory_size) == std::tie(rhs.primitive_mode, rhs.block_x, rhs.block_y,
68 rhs.block_z, rhs.shared_memory_size,
69 rhs.local_memory_size);
70 }
71
72 bool operator!=(const ProgramVariant& rhs) const noexcept {
73 return !operator==(rhs);
74 }
75};
76static_assert(std::is_trivially_copyable_v<ProgramVariant>);
77
78/// Describes how a shader is used.
79struct ShaderDiskCacheUsage {
80 u64 unique_identifier{};
81 ProgramVariant variant;
82 u32 bound_buffer{};
83 VideoCommon::Shader::KeyMap keys;
84 VideoCommon::Shader::BoundSamplerMap bound_samplers;
85 VideoCommon::Shader::BindlessSamplerMap bindless_samplers;
86
87 bool operator==(const ShaderDiskCacheUsage& rhs) const {
88 return std::tie(unique_identifier, variant, keys, bound_samplers, bindless_samplers) ==
89 std::tie(rhs.unique_identifier, rhs.variant, rhs.keys, rhs.bound_samplers,
90 rhs.bindless_samplers);
91 }
92
93 bool operator!=(const ShaderDiskCacheUsage& rhs) const {
94 return !operator==(rhs);
95 }
96};
97
98} // namespace OpenGL
99
100namespace std {
101
102template <>
103struct hash<OpenGL::ProgramVariant> {
104 std::size_t operator()(const OpenGL::ProgramVariant& variant) const noexcept {
105 return (static_cast<std::size_t>(variant.primitive_mode) << 6) ^
106 static_cast<std::size_t>(variant.block_x) ^
107 (static_cast<std::size_t>(variant.block_y) << 32) ^
108 (static_cast<std::size_t>(variant.block_z) << 48) ^
109 (static_cast<std::size_t>(variant.shared_memory_size) << 16) ^
110 (static_cast<std::size_t>(variant.local_memory_size) << 36);
111 }
112};
113
114template <>
115struct hash<OpenGL::ShaderDiskCacheUsage> {
116 std::size_t operator()(const OpenGL::ShaderDiskCacheUsage& usage) const noexcept {
117 return static_cast<std::size_t>(usage.unique_identifier) ^
118 std::hash<OpenGL::ProgramVariant>{}(usage.variant);
119 }
120};
121
122} // namespace std
123
124namespace OpenGL {
125 35
126/// Describes a shader how it's used by the guest GPU 36/// Describes a shader and how it's used by the guest GPU
127class ShaderDiskCacheRaw { 37struct ShaderDiskCacheEntry {
128public: 38 ShaderDiskCacheEntry();
129 explicit ShaderDiskCacheRaw(u64 unique_identifier, Tegra::Engines::ShaderType type, 39 ~ShaderDiskCacheEntry();
130 ProgramCode code, ProgramCode code_b = {});
131 ShaderDiskCacheRaw();
132 ~ShaderDiskCacheRaw();
133 40
134 bool Load(FileUtil::IOFile& file); 41 bool Load(FileUtil::IOFile& file);
135 42
136 bool Save(FileUtil::IOFile& file) const; 43 bool Save(FileUtil::IOFile& file) const;
137 44
138 u64 GetUniqueIdentifier() const {
139 return unique_identifier;
140 }
141
142 bool HasProgramA() const { 45 bool HasProgramA() const {
143 return !code.empty() && !code_b.empty(); 46 return !code.empty() && !code_b.empty();
144 } 47 }
145 48
146 Tegra::Engines::ShaderType GetType() const {
147 return type;
148 }
149
150 const ProgramCode& GetCode() const {
151 return code;
152 }
153
154 const ProgramCode& GetCodeB() const {
155 return code_b;
156 }
157
158private:
159 u64 unique_identifier{};
160 Tegra::Engines::ShaderType type{}; 49 Tegra::Engines::ShaderType type{};
161 ProgramCode code; 50 ProgramCode code;
162 ProgramCode code_b; 51 ProgramCode code_b;
52
53 u64 unique_identifier = 0;
54 std::optional<u32> texture_handler_size;
55 u32 bound_buffer = 0;
56 VideoCommon::Shader::GraphicsInfo graphics_info;
57 VideoCommon::Shader::ComputeInfo compute_info;
58 VideoCommon::Shader::KeyMap keys;
59 VideoCommon::Shader::BoundSamplerMap bound_samplers;
60 VideoCommon::Shader::BindlessSamplerMap bindless_samplers;
163}; 61};
164 62
165/// Contains an OpenGL dumped binary program 63/// Contains an OpenGL dumped binary program
166struct ShaderDiskCacheDump { 64struct ShaderDiskCachePrecompiled {
167 GLenum binary_format{}; 65 u64 unique_identifier = 0;
66 GLenum binary_format = 0;
168 std::vector<u8> binary; 67 std::vector<u8> binary;
169}; 68};
170 69
@@ -174,11 +73,10 @@ public:
174 ~ShaderDiskCacheOpenGL(); 73 ~ShaderDiskCacheOpenGL();
175 74
176 /// Loads transferable cache. If file has a old version or on failure, it deletes the file. 75 /// Loads transferable cache. If file has a old version or on failure, it deletes the file.
177 std::optional<std::pair<std::vector<ShaderDiskCacheRaw>, std::vector<ShaderDiskCacheUsage>>> 76 std::optional<std::vector<ShaderDiskCacheEntry>> LoadTransferable();
178 LoadTransferable();
179 77
180 /// Loads current game's precompiled cache. Invalidates on failure. 78 /// Loads current game's precompiled cache. Invalidates on failure.
181 std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump> LoadPrecompiled(); 79 std::vector<ShaderDiskCachePrecompiled> LoadPrecompiled();
182 80
183 /// Removes the transferable (and precompiled) cache file. 81 /// Removes the transferable (and precompiled) cache file.
184 void InvalidateTransferable(); 82 void InvalidateTransferable();
@@ -187,21 +85,18 @@ public:
187 void InvalidatePrecompiled(); 85 void InvalidatePrecompiled();
188 86
189 /// Saves a raw dump to the transferable file. Checks for collisions. 87 /// Saves a raw dump to the transferable file. Checks for collisions.
190 void SaveRaw(const ShaderDiskCacheRaw& entry); 88 void SaveEntry(const ShaderDiskCacheEntry& entry);
191
192 /// Saves shader usage to the transferable file. Does not check for collisions.
193 void SaveUsage(const ShaderDiskCacheUsage& usage);
194 89
195 /// Saves a dump entry to the precompiled file. Does not check for collisions. 90 /// Saves a dump entry to the precompiled file. Does not check for collisions.
196 void SaveDump(const ShaderDiskCacheUsage& usage, GLuint program); 91 void SavePrecompiled(u64 unique_identifier, GLuint program);
197 92
198 /// Serializes virtual precompiled shader cache file to real file 93 /// Serializes virtual precompiled shader cache file to real file
199 void SaveVirtualPrecompiledFile(); 94 void SaveVirtualPrecompiledFile();
200 95
201private: 96private:
202 /// Loads the transferable cache. Returns empty on failure. 97 /// Loads the transferable cache. Returns empty on failure.
203 std::optional<std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump>> 98 std::optional<std::vector<ShaderDiskCachePrecompiled>> LoadPrecompiledFile(
204 LoadPrecompiledFile(FileUtil::IOFile& file); 99 FileUtil::IOFile& file);
205 100
206 /// Opens the current game's transferable file and writes its header if it doesn't exist 101 FileUtil::IOFile AppendTransferableFile() const;
207 FileUtil::IOFile AppendTransferableFile() const; 102 FileUtil::IOFile AppendTransferableFile() const;
@@ -270,7 +165,7 @@ private:
270 std::size_t precompiled_cache_virtual_file_offset = 0; 165 std::size_t precompiled_cache_virtual_file_offset = 0;
271 166
272 // Stored transferable shaders 167 // Stored transferable shaders
273 std::unordered_map<u64, std::unordered_set<ShaderDiskCacheUsage>> transferable; 168 std::unordered_set<u64> stored_transferable;
274 169
275 // The cache has been loaded at boot 170 // The cache has been loaded at boot
276 bool is_usable{}; 171 bool is_usable{};
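For context, the std::hash specializations removed above combine key fields with the usual shift-and-XOR pattern so each field lands in a distinct bit range. A minimal, self-contained sketch of the same pattern on a hypothetical two-field key (field and type names here are illustrative, not the project's):

#include <cstddef>
#include <cstdint>
#include <functional>
#include <unordered_map>

struct CacheKey {
    std::uint64_t unique_identifier = 0; // hypothetical key fields
    std::uint32_t variant_bits = 0;
};

inline bool operator==(const CacheKey& lhs, const CacheKey& rhs) noexcept {
    return lhs.unique_identifier == rhs.unique_identifier && lhs.variant_bits == rhs.variant_bits;
}

namespace std {
template <>
struct hash<CacheKey> {
    std::size_t operator()(const CacheKey& key) const noexcept {
        // Shift the narrower field into the upper bits before XOR-ing so both fields
        // contribute to different parts of the hash, mirroring the removed specialization.
        const std::uint64_t combined =
            key.unique_identifier ^ (static_cast<std::uint64_t>(key.variant_bits) << 32);
        return static_cast<std::size_t>(combined);
    }
};
} // namespace std

// The key can now index the standard hashed containers, e.g.:
// std::unordered_map<CacheKey, int> precompiled;

With this change the transferable bookkeeping only needs the 64-bit unique identifier itself (the stored_transferable set above), so the custom hashes for ProgramVariant and ShaderDiskCacheUsage can be dropped.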
diff --git a/src/video_core/renderer_opengl/gl_shader_gen.cpp b/src/video_core/renderer_opengl/gl_shader_gen.cpp
deleted file mode 100644
index 34946fb47..000000000
--- a/src/video_core/renderer_opengl/gl_shader_gen.cpp
+++ /dev/null
@@ -1,109 +0,0 @@
1// Copyright 2018 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#include <string>
6
7#include <fmt/format.h>
8
9#include "video_core/engines/maxwell_3d.h"
10#include "video_core/engines/shader_type.h"
11#include "video_core/renderer_opengl/gl_device.h"
12#include "video_core/renderer_opengl/gl_shader_decompiler.h"
13#include "video_core/renderer_opengl/gl_shader_gen.h"
14#include "video_core/shader/shader_ir.h"
15
16namespace OpenGL::GLShader {
17
18using Tegra::Engines::Maxwell3D;
19using Tegra::Engines::ShaderType;
20using VideoCommon::Shader::CompileDepth;
21using VideoCommon::Shader::CompilerSettings;
22using VideoCommon::Shader::ProgramCode;
23using VideoCommon::Shader::ShaderIR;
24
25std::string GenerateVertexShader(const Device& device, const ShaderIR& ir, const ShaderIR* ir_b) {
26 std::string out = GetCommonDeclarations();
27 out += fmt::format(R"(
28layout (std140, binding = {}) uniform vs_config {{
29 float y_direction;
30}};
31
32)",
33 EmulationUniformBlockBinding);
34 out += Decompile(device, ir, ShaderType::Vertex, "vertex");
35 if (ir_b) {
36 out += Decompile(device, *ir_b, ShaderType::Vertex, "vertex_b");
37 }
38
39 out += R"(
40void main() {
41 gl_Position = vec4(0.0f, 0.0f, 0.0f, 1.0f);
42 execute_vertex();
43)";
44 if (ir_b) {
45 out += " execute_vertex_b();";
46 }
47 out += "}\n";
48 return out;
49}
50
51std::string GenerateGeometryShader(const Device& device, const ShaderIR& ir) {
52 std::string out = GetCommonDeclarations();
53 out += fmt::format(R"(
54layout (std140, binding = {}) uniform gs_config {{
55 float y_direction;
56}};
57
58)",
59 EmulationUniformBlockBinding);
60 out += Decompile(device, ir, ShaderType::Geometry, "geometry");
61
62 out += R"(
63void main() {
64 execute_geometry();
65}
66)";
67 return out;
68}
69
70std::string GenerateFragmentShader(const Device& device, const ShaderIR& ir) {
71 std::string out = GetCommonDeclarations();
72 out += fmt::format(R"(
73layout (location = 0) out vec4 FragColor0;
74layout (location = 1) out vec4 FragColor1;
75layout (location = 2) out vec4 FragColor2;
76layout (location = 3) out vec4 FragColor3;
77layout (location = 4) out vec4 FragColor4;
78layout (location = 5) out vec4 FragColor5;
79layout (location = 6) out vec4 FragColor6;
80layout (location = 7) out vec4 FragColor7;
81
82layout (std140, binding = {}) uniform fs_config {{
83 float y_direction;
84}};
85
86)",
87 EmulationUniformBlockBinding);
88 out += Decompile(device, ir, ShaderType::Fragment, "fragment");
89
90 out += R"(
91void main() {
92 execute_fragment();
93}
94)";
95 return out;
96}
97
98std::string GenerateComputeShader(const Device& device, const ShaderIR& ir) {
99 std::string out = GetCommonDeclarations();
100 out += Decompile(device, ir, ShaderType::Compute, "compute");
101 out += R"(
102void main() {
103 execute_compute();
104}
105)";
106 return out;
107}
108
109} // namespace OpenGL::GLShader
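The deleted generator built each GLSL source by formatting a small prologue (the emulation uniform block at a fixed binding point), appending the decompiled stage body, and closing with a main() stub that calls the stage's entry point. A condensed sketch of that assembly pattern, assuming only fmt and an already-decompiled body string:

#include <string>

#include <fmt/format.h>

std::string BuildVertexSource(const std::string& decompiled_body, int binding) {
    // Formatted prologue: the emulation uniform block at the requested binding point.
    std::string out = fmt::format(R"(layout (std140, binding = {}) uniform vs_config {{
    float y_direction;
}};

)",
                                  binding);
    // Decompiled shader body, e.g. the output of Decompile(device, ir, ShaderType::Vertex, ...),
    // which defines execute_vertex().
    out += decompiled_body;
    // Fixed epilogue calling into the decompiled entry point.
    out += R"(
void main() {
    gl_Position = vec4(0.0f, 0.0f, 0.0f, 1.0f);
    execute_vertex();
}
)";
    return out;
}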
diff --git a/src/video_core/renderer_opengl/gl_shader_gen.h b/src/video_core/renderer_opengl/gl_shader_gen.h
deleted file mode 100644
index cba2be9f9..000000000
--- a/src/video_core/renderer_opengl/gl_shader_gen.h
+++ /dev/null
@@ -1,34 +0,0 @@
1// Copyright 2018 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#pragma once
6
7#include <vector>
8
9#include "common/common_types.h"
10#include "video_core/renderer_opengl/gl_shader_decompiler.h"
11#include "video_core/shader/shader_ir.h"
12
13namespace OpenGL {
14class Device;
15}
16
17namespace OpenGL::GLShader {
18
19using VideoCommon::Shader::ProgramCode;
20using VideoCommon::Shader::ShaderIR;
21
22/// Generates the GLSL vertex shader program source code for the given VS program
23std::string GenerateVertexShader(const Device& device, const ShaderIR& ir, const ShaderIR* ir_b);
24
25/// Generates the GLSL geometry shader program source code for the given GS program
26std::string GenerateGeometryShader(const Device& device, const ShaderIR& ir);
27
28/// Generates the GLSL fragment shader program source code for the given FS program
29std::string GenerateFragmentShader(const Device& device, const ShaderIR& ir);
30
31/// Generates the GLSL compute shader program source code for the given CS program
32std::string GenerateComputeShader(const Device& device, const ShaderIR& ir);
33
34} // namespace OpenGL::GLShader
diff --git a/src/video_core/renderer_opengl/gl_state_tracker.cpp b/src/video_core/renderer_opengl/gl_state_tracker.cpp
index 1e43c9ec0..255ac3147 100644
--- a/src/video_core/renderer_opengl/gl_state_tracker.cpp
+++ b/src/video_core/renderer_opengl/gl_state_tracker.cpp
@@ -94,6 +94,15 @@ void SetupDirtyShaders(Tables& tables) {
94 Shaders); 94 Shaders);
95} 95}
96 96
97void SetupDirtyPolygonModes(Tables& tables) {
98 tables[0][OFF(polygon_mode_front)] = PolygonModeFront;
99 tables[0][OFF(polygon_mode_back)] = PolygonModeBack;
100
101 tables[1][OFF(polygon_mode_front)] = PolygonModes;
102 tables[1][OFF(polygon_mode_back)] = PolygonModes;
103 tables[0][OFF(fill_rectangle)] = PolygonModes;
104}
105
97void SetupDirtyDepthTest(Tables& tables) { 106void SetupDirtyDepthTest(Tables& tables) {
98 auto& table = tables[0]; 107 auto& table = tables[0];
99 table[OFF(depth_test_enable)] = DepthTest; 108 table[OFF(depth_test_enable)] = DepthTest;
@@ -211,6 +220,7 @@ void StateTracker::Initialize() {
211 SetupDirtyVertexArrays(tables); 220 SetupDirtyVertexArrays(tables);
212 SetupDirtyVertexFormat(tables); 221 SetupDirtyVertexFormat(tables);
213 SetupDirtyShaders(tables); 222 SetupDirtyShaders(tables);
223 SetupDirtyPolygonModes(tables);
214 SetupDirtyDepthTest(tables); 224 SetupDirtyDepthTest(tables);
215 SetupDirtyStencilTest(tables); 225 SetupDirtyStencilTest(tables);
216 SetupDirtyAlphaTest(tables); 226 SetupDirtyAlphaTest(tables);
@@ -228,7 +238,6 @@ void StateTracker::Initialize() {
228 SetupDirtyMisc(tables); 238 SetupDirtyMisc(tables);
229 239
230 auto& store = dirty.on_write_stores; 240 auto& store = dirty.on_write_stores;
231 SetupCommonOnWriteStores(store);
232 store[VertexBuffers] = true; 241 store[VertexBuffers] = true;
233 for (std::size_t i = 0; i < Regs::NumVertexArrays; ++i) { 242 for (std::size_t i = 0; i < Regs::NumVertexArrays; ++i) {
234 store[VertexBuffer0 + i] = true; 243 store[VertexBuffer0 + i] = true;
diff --git a/src/video_core/renderer_opengl/gl_state_tracker.h b/src/video_core/renderer_opengl/gl_state_tracker.h
index e08482911..b882d75c3 100644
--- a/src/video_core/renderer_opengl/gl_state_tracker.h
+++ b/src/video_core/renderer_opengl/gl_state_tracker.h
@@ -59,6 +59,10 @@ enum : u8 {
59 Shaders, 59 Shaders,
60 ClipDistances, 60 ClipDistances,
61 61
62 PolygonModes,
63 PolygonModeFront,
64 PolygonModeBack,
65
62 ColorMask, 66 ColorMask,
63 FrontFace, 67 FrontFace,
64 CullTest, 68 CullTest,
@@ -111,6 +115,13 @@ public:
111 flags[OpenGL::Dirty::VertexInstance0 + 1] = true; 115 flags[OpenGL::Dirty::VertexInstance0 + 1] = true;
112 } 116 }
113 117
118 void NotifyPolygonModes() {
119 auto& flags = system.GPU().Maxwell3D().dirty.flags;
120 flags[OpenGL::Dirty::PolygonModes] = true;
121 flags[OpenGL::Dirty::PolygonModeFront] = true;
122 flags[OpenGL::Dirty::PolygonModeBack] = true;
123 }
124
114 void NotifyViewport0() { 125 void NotifyViewport0() {
115 auto& flags = system.GPU().Maxwell3D().dirty.flags; 126 auto& flags = system.GPU().Maxwell3D().dirty.flags;
116 flags[OpenGL::Dirty::Viewports] = true; 127 flags[OpenGL::Dirty::Viewports] = true;
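The polygon-mode entries added above plug into the renderer's dirty-flag scheme: a table maps Maxwell register offsets to flags, a register write raises the corresponding flag, and the rasterizer re-applies only flagged state on the next draw. A small, self-contained sketch of that idea (register count, flag names and table layout are illustrative):

#include <array>
#include <bitset>
#include <cstddef>

enum DirtyFlag : std::size_t {
    NothingDirty, // index 0 means "no flag registered for this offset"
    PolygonModeFront,
    PolygonModeBack,
    PolygonModes,
    NumFlags,
};

constexpr std::size_t NUM_REGISTERS = 0x1000; // illustrative register-file size
using DirtyTable = std::array<std::size_t, NUM_REGISTERS>; // register offset -> flag index

// Called for every register write the guest performs.
void OnRegisterWrite(const DirtyTable& table, std::bitset<NumFlags>& flags, std::size_t offset) {
    if (const std::size_t flag = table[offset]; flag != NothingDirty) {
        flags[flag] = true;
    }
}

// Called once per draw: only re-apply state whose flag is set, then clear it.
template <typename ApplyFn>
void SyncIfDirty(std::bitset<NumFlags>& flags, std::size_t flag, ApplyFn&& apply) {
    if (!flags[flag]) {
        return;
    }
    flags[flag] = false;
    apply();
}

NotifyPolygonModes simply raises all three flags at once, so the next guest draw resynchronizes glPolygonMode after it has been touched outside the tracked path.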
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp
index 2d3838a7a..f424e3000 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp
@@ -53,6 +53,7 @@ constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> tex_format
53 {GL_R8UI, GL_RED_INTEGER, GL_UNSIGNED_BYTE, false}, // R8UI 53 {GL_R8UI, GL_RED_INTEGER, GL_UNSIGNED_BYTE, false}, // R8UI
54 {GL_RGBA16F, GL_RGBA, GL_HALF_FLOAT, false}, // RGBA16F 54 {GL_RGBA16F, GL_RGBA, GL_HALF_FLOAT, false}, // RGBA16F
55 {GL_RGBA16, GL_RGBA, GL_UNSIGNED_SHORT, false}, // RGBA16U 55 {GL_RGBA16, GL_RGBA, GL_UNSIGNED_SHORT, false}, // RGBA16U
56 {GL_RGBA16_SNORM, GL_RGBA, GL_SHORT, false}, // RGBA16S
56 {GL_RGBA16UI, GL_RGBA_INTEGER, GL_UNSIGNED_SHORT, false}, // RGBA16UI 57 {GL_RGBA16UI, GL_RGBA_INTEGER, GL_UNSIGNED_SHORT, false}, // RGBA16UI
57 {GL_R11F_G11F_B10F, GL_RGB, GL_UNSIGNED_INT_10F_11F_11F_REV, false}, // R11FG11FB10F 58 {GL_R11F_G11F_B10F, GL_RGB, GL_UNSIGNED_INT_10F_11F_11F_REV, false}, // R11FG11FB10F
58 {GL_RGBA32UI, GL_RGBA_INTEGER, GL_UNSIGNED_INT, false}, // RGBA32UI 59 {GL_RGBA32UI, GL_RGBA_INTEGER, GL_UNSIGNED_INT, false}, // RGBA32UI
diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h
index 494e38e7a..89f0e04ef 100644
--- a/src/video_core/renderer_opengl/maxwell_to_gl.h
+++ b/src/video_core/renderer_opengl/maxwell_to_gl.h
@@ -488,5 +488,18 @@ inline GLenum LogicOp(Maxwell::LogicOperation operation) {
488 return GL_COPY; 488 return GL_COPY;
489} 489}
490 490
491inline GLenum PolygonMode(Maxwell::PolygonMode polygon_mode) {
492 switch (polygon_mode) {
493 case Maxwell::PolygonMode::Point:
494 return GL_POINT;
495 case Maxwell::PolygonMode::Line:
496 return GL_LINE;
497 case Maxwell::PolygonMode::Fill:
498 return GL_FILL;
499 }
500 UNREACHABLE_MSG("Invalid polygon mode={}", static_cast<int>(polygon_mode));
501 return GL_FILL;
502}
503
491} // namespace MaxwellToGL 504} // namespace MaxwellToGL
492} // namespace OpenGL 505} // namespace OpenGL
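One note on consuming this helper: core-profile OpenGL only accepts GL_FRONT_AND_BACK in glPolygonMode, so distinct front and back modes can only be honoured on a compatibility context (which this renderer appears to target, given the GL_ALPHA_TEST calls elsewhere in this change). A hedged sketch of how a rasterizer might apply the translated modes (the helper name is illustrative):

#include <glad/glad.h>

// 'front' and 'back' are the results of MaxwellToGL::PolygonMode().
void SyncPolygonModes(GLenum front, GLenum back) {
    if (front == back) {
        // A single call covers both faces and is valid in every profile.
        glPolygonMode(GL_FRONT_AND_BACK, front);
    } else {
        // Separate faces require the compatibility profile.
        glPolygonMode(GL_FRONT, front);
        glPolygonMode(GL_BACK, back);
    }
}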
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index c05677cd9..fca5e3ec0 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -5,8 +5,11 @@
5#include <algorithm> 5#include <algorithm>
6#include <cstddef> 6#include <cstddef>
7#include <cstdlib> 7#include <cstdlib>
8#include <cstring>
8#include <memory> 9#include <memory>
10
9#include <glad/glad.h> 11#include <glad/glad.h>
12
10#include "common/assert.h" 13#include "common/assert.h"
11#include "common/logging/log.h" 14#include "common/logging/log.h"
12#include "common/microprofile.h" 15#include "common/microprofile.h"
@@ -25,6 +28,8 @@
25 28
26namespace OpenGL { 29namespace OpenGL {
27 30
31namespace {
32
28// If the size of this is too small, it ends up creating a soft cap on FPS as the renderer will have 33// If the size of this is too small, it ends up creating a soft cap on FPS as the renderer will have
29// to wait on available presentation frames. 34// to wait on available presentation frames.
30constexpr std::size_t SWAP_CHAIN_SIZE = 3; 35constexpr std::size_t SWAP_CHAIN_SIZE = 3;
@@ -41,124 +46,6 @@ struct Frame {
41 bool is_srgb{}; /// Framebuffer is sRGB or RGB 46 bool is_srgb{}; /// Framebuffer is sRGB or RGB
42}; 47};
43 48
44/**
45 * For smooth Vsync rendering, we want to always present the latest frame that the core generates,
46 * but also make sure that rendering happens at the pace that the frontend dictates. This is a
47 * helper class that the renderer uses to sync frames between the render thread and the presentation
48 * thread
49 */
50class FrameMailbox {
51public:
52 std::mutex swap_chain_lock;
53 std::condition_variable present_cv;
54 std::array<Frame, SWAP_CHAIN_SIZE> swap_chain{};
55 std::queue<Frame*> free_queue;
56 std::deque<Frame*> present_queue;
57 Frame* previous_frame{};
58
59 FrameMailbox() {
60 for (auto& frame : swap_chain) {
61 free_queue.push(&frame);
62 }
63 }
64
65 ~FrameMailbox() {
66 // lock the mutex and clear out the present and free_queues and notify any people who are
67 // blocked to prevent deadlock on shutdown
68 std::scoped_lock lock{swap_chain_lock};
69 std::queue<Frame*>().swap(free_queue);
70 present_queue.clear();
71 present_cv.notify_all();
72 }
73
74 void ReloadPresentFrame(Frame* frame, u32 height, u32 width) {
75 frame->present.Release();
76 frame->present.Create();
77 GLint previous_draw_fbo{};
78 glGetIntegerv(GL_DRAW_FRAMEBUFFER_BINDING, &previous_draw_fbo);
79 glBindFramebuffer(GL_FRAMEBUFFER, frame->present.handle);
80 glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER,
81 frame->color.handle);
82 if (glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) {
83 LOG_CRITICAL(Render_OpenGL, "Failed to recreate present FBO!");
84 }
85 glBindFramebuffer(GL_DRAW_FRAMEBUFFER, previous_draw_fbo);
86 frame->color_reloaded = false;
87 }
88
89 void ReloadRenderFrame(Frame* frame, u32 width, u32 height) {
90 // Recreate the color texture attachment
91 frame->color.Release();
92 frame->color.Create();
93 const GLenum internal_format = frame->is_srgb ? GL_SRGB8 : GL_RGB8;
94 glNamedRenderbufferStorage(frame->color.handle, internal_format, width, height);
95
96 // Recreate the FBO for the render target
97 frame->render.Release();
98 frame->render.Create();
99 glBindFramebuffer(GL_FRAMEBUFFER, frame->render.handle);
100 glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER,
101 frame->color.handle);
102 if (glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) {
103 LOG_CRITICAL(Render_OpenGL, "Failed to recreate render FBO!");
104 }
105
106 frame->width = width;
107 frame->height = height;
108 frame->color_reloaded = true;
109 }
110
111 Frame* GetRenderFrame() {
112 std::unique_lock lock{swap_chain_lock};
113
114 // If there are no free frames, we will reuse the oldest render frame
115 if (free_queue.empty()) {
116 auto frame = present_queue.back();
117 present_queue.pop_back();
118 return frame;
119 }
120
121 Frame* frame = free_queue.front();
122 free_queue.pop();
123 return frame;
124 }
125
126 void ReleaseRenderFrame(Frame* frame) {
127 std::unique_lock lock{swap_chain_lock};
128 present_queue.push_front(frame);
129 present_cv.notify_one();
130 }
131
132 Frame* TryGetPresentFrame(int timeout_ms) {
133 std::unique_lock lock{swap_chain_lock};
134 // wait for new entries in the present_queue
135 present_cv.wait_for(lock, std::chrono::milliseconds(timeout_ms),
136 [&] { return !present_queue.empty(); });
137 if (present_queue.empty()) {
138 // timed out waiting for a frame to draw so return the previous frame
139 return previous_frame;
140 }
141
142 // free the previous frame and add it back to the free queue
143 if (previous_frame) {
144 free_queue.push(previous_frame);
145 }
146
147 // the newest entries are pushed to the front of the queue
148 Frame* frame = present_queue.front();
149 present_queue.pop_front();
150 // remove all old entries from the present queue and move them back to the free_queue
151 for (auto f : present_queue) {
152 free_queue.push(f);
153 }
154 present_queue.clear();
155 previous_frame = frame;
156 return frame;
157 }
158};
159
160namespace {
161
162constexpr char VERTEX_SHADER[] = R"( 49constexpr char VERTEX_SHADER[] = R"(
163#version 430 core 50#version 430 core
164 51
@@ -211,6 +98,24 @@ struct ScreenRectVertex {
211 std::array<GLfloat, 2> tex_coord; 98 std::array<GLfloat, 2> tex_coord;
212}; 99};
213 100
101/// Returns true if any debug tool is attached
102bool HasDebugTool() {
103 const bool nsight = std::getenv("NVTX_INJECTION64_PATH") || std::getenv("NSIGHT_LAUNCHED");
104 if (nsight) {
105 return true;
106 }
107
108 GLint num_extensions;
109 glGetIntegerv(GL_NUM_EXTENSIONS, &num_extensions);
110 for (GLuint index = 0; index < static_cast<GLuint>(num_extensions); ++index) {
111 const auto name = reinterpret_cast<const char*>(glGetStringi(GL_EXTENSIONS, index));
112 if (!std::strcmp(name, "GL_EXT_debug_tool")) {
113 return true;
114 }
115 }
116 return false;
117}
118
214/** 119/**
215 * Defines a 1:1 pixel orthographic projection matrix with (0,0) on the top-left 120 * corner and (width, height) on the lower-bottom.
216 * corner and (width, height) on the lower-bottom. 121 * corner and (width, height) on the lower-bottom.
@@ -294,6 +199,153 @@ void APIENTRY DebugHandler(GLenum source, GLenum type, GLuint id, GLenum severit
294 199
295} // Anonymous namespace 200} // Anonymous namespace
296 201
202/**
203 * For smooth Vsync rendering, we want to always present the latest frame that the core generates,
204 * but also make sure that rendering happens at the pace that the frontend dictates. This is a
205 * helper class that the renderer uses to sync frames between the render thread and the presentation
206 * thread
207 */
208class FrameMailbox {
209public:
210 std::mutex swap_chain_lock;
211 std::condition_variable present_cv;
212 std::array<Frame, SWAP_CHAIN_SIZE> swap_chain{};
213 std::queue<Frame*> free_queue;
214 std::deque<Frame*> present_queue;
215 Frame* previous_frame{};
216
217 FrameMailbox() : has_debug_tool{HasDebugTool()} {
218 for (auto& frame : swap_chain) {
219 free_queue.push(&frame);
220 }
221 }
222
223 ~FrameMailbox() {
224 // lock the mutex and clear out the present and free_queues and notify any people who are
225 // blocked to prevent deadlock on shutdown
226 std::scoped_lock lock{swap_chain_lock};
227 std::queue<Frame*>().swap(free_queue);
228 present_queue.clear();
229 present_cv.notify_all();
230 }
231
232 void ReloadPresentFrame(Frame* frame, u32 height, u32 width) {
233 frame->present.Release();
234 frame->present.Create();
235 GLint previous_draw_fbo{};
236 glGetIntegerv(GL_DRAW_FRAMEBUFFER_BINDING, &previous_draw_fbo);
237 glBindFramebuffer(GL_FRAMEBUFFER, frame->present.handle);
238 glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER,
239 frame->color.handle);
240 if (glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) {
241 LOG_CRITICAL(Render_OpenGL, "Failed to recreate present FBO!");
242 }
243 glBindFramebuffer(GL_DRAW_FRAMEBUFFER, previous_draw_fbo);
244 frame->color_reloaded = false;
245 }
246
247 void ReloadRenderFrame(Frame* frame, u32 width, u32 height) {
248 // Recreate the color texture attachment
249 frame->color.Release();
250 frame->color.Create();
251 const GLenum internal_format = frame->is_srgb ? GL_SRGB8 : GL_RGB8;
252 glNamedRenderbufferStorage(frame->color.handle, internal_format, width, height);
253
254 // Recreate the FBO for the render target
255 frame->render.Release();
256 frame->render.Create();
257 glBindFramebuffer(GL_FRAMEBUFFER, frame->render.handle);
258 glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER,
259 frame->color.handle);
260 if (glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) {
261 LOG_CRITICAL(Render_OpenGL, "Failed to recreate render FBO!");
262 }
263
264 frame->width = width;
265 frame->height = height;
266 frame->color_reloaded = true;
267 }
268
269 Frame* GetRenderFrame() {
270 std::unique_lock lock{swap_chain_lock};
271
272 // If there are no free frames, we will reuse the oldest render frame
273 if (free_queue.empty()) {
274 auto frame = present_queue.back();
275 present_queue.pop_back();
276 return frame;
277 }
278
279 Frame* frame = free_queue.front();
280 free_queue.pop();
281 return frame;
282 }
283
284 void ReleaseRenderFrame(Frame* frame) {
285 std::unique_lock lock{swap_chain_lock};
286 present_queue.push_front(frame);
287 present_cv.notify_one();
288
289 DebugNotifyNextFrame();
290 }
291
292 Frame* TryGetPresentFrame(int timeout_ms) {
293 DebugWaitForNextFrame();
294
295 std::unique_lock lock{swap_chain_lock};
296 // wait for new entries in the present_queue
297 present_cv.wait_for(lock, std::chrono::milliseconds(timeout_ms),
298 [&] { return !present_queue.empty(); });
299 if (present_queue.empty()) {
300 // timed out waiting for a frame to draw so return the previous frame
301 return previous_frame;
302 }
303
304 // free the previous frame and add it back to the free queue
305 if (previous_frame) {
306 free_queue.push(previous_frame);
307 }
308
309 // the newest entries are pushed to the front of the queue
310 Frame* frame = present_queue.front();
311 present_queue.pop_front();
312 // remove all old entries from the present queue and move them back to the free_queue
313 for (auto f : present_queue) {
314 free_queue.push(f);
315 }
316 present_queue.clear();
317 previous_frame = frame;
318 return frame;
319 }
320
321private:
322 std::mutex debug_synch_mutex;
323 std::condition_variable debug_synch_condition;
324 std::atomic_int frame_for_debug{};
325 const bool has_debug_tool; // When true, using a GPU debugger, so keep frames in lock-step
326
327 /// Signal that a new frame is available (called from GPU thread)
328 void DebugNotifyNextFrame() {
329 if (!has_debug_tool) {
330 return;
331 }
332 frame_for_debug++;
333 std::lock_guard lock{debug_synch_mutex};
334 debug_synch_condition.notify_one();
335 }
336
337 /// Wait for a new frame to be available (called from presentation thread)
338 void DebugWaitForNextFrame() {
339 if (!has_debug_tool) {
340 return;
341 }
342 const int last_frame = frame_for_debug;
343 std::unique_lock lock{debug_synch_mutex};
344 debug_synch_condition.wait(lock,
345 [this, last_frame] { return frame_for_debug > last_frame; });
346 }
347};
348
297RendererOpenGL::RendererOpenGL(Core::Frontend::EmuWindow& emu_window, Core::System& system) 349RendererOpenGL::RendererOpenGL(Core::Frontend::EmuWindow& emu_window, Core::System& system)
298 : VideoCore::RendererBase{emu_window}, emu_window{emu_window}, system{system}, 350 : VideoCore::RendererBase{emu_window}, emu_window{emu_window}, system{system},
299 frame_mailbox{std::make_unique<FrameMailbox>()} {} 351 frame_mailbox{std::make_unique<FrameMailbox>()} {}
@@ -576,6 +628,7 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {
576 628
577 // TODO: Signal state tracker about these changes 629 // TODO: Signal state tracker about these changes
578 state_tracker.NotifyScreenDrawVertexArray(); 630 state_tracker.NotifyScreenDrawVertexArray();
631 state_tracker.NotifyPolygonModes();
579 state_tracker.NotifyViewport0(); 632 state_tracker.NotifyViewport0();
580 state_tracker.NotifyScissor0(); 633 state_tracker.NotifyScissor0();
581 state_tracker.NotifyColorMask0(); 634 state_tracker.NotifyColorMask0();
@@ -611,6 +664,7 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {
611 glDisable(GL_ALPHA_TEST); 664 glDisable(GL_ALPHA_TEST);
612 glDisablei(GL_BLEND, 0); 665 glDisablei(GL_BLEND, 0);
613 glDisablei(GL_SCISSOR_TEST, 0); 666 glDisablei(GL_SCISSOR_TEST, 0);
667 glPolygonMode(GL_FRONT_AND_BACK, GL_FILL);
614 glCullFace(GL_BACK); 668 glCullFace(GL_BACK);
615 glFrontFace(GL_CW); 669 glFrontFace(GL_CW);
616 glColorMaski(0, GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE); 670 glColorMaski(0, GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE);
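The FrameMailbox moved above is the heart of the presentation path: the render thread always publishes its newest frame without ever blocking, while the presentation thread takes the newest frame available, recycles anything older, and re-shows the previous frame on timeout. A condensed, std-only sketch of that hand-off (the Frame payload and all names are placeholders, not the project's API):

#include <array>
#include <chrono>
#include <condition_variable>
#include <deque>
#include <mutex>
#include <queue>

struct Frame {
    int id = 0; // stands in for the renderbuffer/FBO handles of the real Frame
};

class Mailbox {
public:
    Mailbox() {
        for (Frame& frame : storage) {
            free_frames.push(&frame);
        }
    }

    // Render thread: grab a slot, stealing the oldest pending frame if none is free,
    // so rendering never waits on presentation.
    Frame* AcquireForRender() {
        std::unique_lock lock{mutex};
        if (free_frames.empty()) {
            Frame* frame = pending.back();
            pending.pop_back();
            return frame;
        }
        Frame* frame = free_frames.front();
        free_frames.pop();
        return frame;
    }

    // Render thread: publish a finished frame; newest entries go to the front.
    void Publish(Frame* frame) {
        std::unique_lock lock{mutex};
        pending.push_front(frame);
        cv.notify_one();
    }

    // Presentation thread: take the newest pending frame, recycle older ones, or fall
    // back to the previously shown frame if nothing arrived before the timeout.
    Frame* AcquireForPresent(std::chrono::milliseconds timeout) {
        std::unique_lock lock{mutex};
        cv.wait_for(lock, timeout, [this] { return !pending.empty(); });
        if (pending.empty()) {
            return previous;
        }
        if (previous != nullptr) {
            free_frames.push(previous);
        }
        Frame* frame = pending.front();
        pending.pop_front();
        for (Frame* stale : pending) {
            free_frames.push(stale);
        }
        pending.clear();
        previous = frame;
        return frame;
    }

private:
    std::mutex mutex;
    std::condition_variable cv;
    std::array<Frame, 3> storage{};
    std::queue<Frame*> free_frames;
    std::deque<Frame*> pending;
    Frame* previous = nullptr;
};

The behavioural addition in this change is the lock-step path: when HasDebugTool() detects Nsight or GL_EXT_debug_tool, ReleaseRenderFrame and TryGetPresentFrame additionally notify and wait on a frame counter, so an attached GPU debugger sees exactly one rendered frame per presented frame.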
diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
index df3ac707c..f93447610 100644
--- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
+++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
@@ -125,6 +125,7 @@ struct FormatTuple {
125 {vk::Format::eR8Uint, Attachable | Storage}, // R8UI 125 {vk::Format::eR8Uint, Attachable | Storage}, // R8UI
126 {vk::Format::eR16G16B16A16Sfloat, Attachable | Storage}, // RGBA16F 126 {vk::Format::eR16G16B16A16Sfloat, Attachable | Storage}, // RGBA16F
127 {vk::Format::eR16G16B16A16Unorm, Attachable | Storage}, // RGBA16U 127 {vk::Format::eR16G16B16A16Unorm, Attachable | Storage}, // RGBA16U
128 {vk::Format::eR16G16B16A16Snorm, Attachable | Storage}, // RGBA16S
128 {vk::Format::eR16G16B16A16Uint, Attachable | Storage}, // RGBA16UI 129 {vk::Format::eR16G16B16A16Uint, Attachable | Storage}, // RGBA16UI
129 {vk::Format::eB10G11R11UfloatPack32, Attachable | Storage}, // R11FG11FB10F 130 {vk::Format::eB10G11R11UfloatPack32, Attachable | Storage}, // R11FG11FB10F
130 {vk::Format::eR32G32B32A32Uint, Attachable | Storage}, // RGBA32UI 131 {vk::Format::eR32G32B32A32Uint, Attachable | Storage}, // RGBA32UI
@@ -256,6 +257,8 @@ vk::ShaderStageFlagBits ShaderStage(Tegra::Engines::ShaderType stage) {
256 return vk::ShaderStageFlagBits::eGeometry; 257 return vk::ShaderStageFlagBits::eGeometry;
257 case Tegra::Engines::ShaderType::Fragment: 258 case Tegra::Engines::ShaderType::Fragment:
258 return vk::ShaderStageFlagBits::eFragment; 259 return vk::ShaderStageFlagBits::eFragment;
260 case Tegra::Engines::ShaderType::Compute:
261 return vk::ShaderStageFlagBits::eCompute;
259 } 262 }
260 UNIMPLEMENTED_MSG("Unimplemented shader stage={}", static_cast<u32>(stage)); 263 UNIMPLEMENTED_MSG("Unimplemented shader stage={}", static_cast<u32>(stage));
261 return {}; 264 return {};
@@ -331,6 +334,8 @@ vk::Format VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttr
331 return vk::Format::eR16G16B16Unorm; 334 return vk::Format::eR16G16B16Unorm;
332 case Maxwell::VertexAttribute::Size::Size_16_16_16_16: 335 case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
333 return vk::Format::eR16G16B16A16Unorm; 336 return vk::Format::eR16G16B16A16Unorm;
337 case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
338 return vk::Format::eA2B10G10R10UnormPack32;
334 default: 339 default:
335 break; 340 break;
336 } 341 }
@@ -364,6 +369,10 @@ vk::Format VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttr
364 return vk::Format::eR8G8B8A8Uint; 369 return vk::Format::eR8G8B8A8Uint;
365 case Maxwell::VertexAttribute::Size::Size_32: 370 case Maxwell::VertexAttribute::Size::Size_32:
366 return vk::Format::eR32Uint; 371 return vk::Format::eR32Uint;
372 case Maxwell::VertexAttribute::Size::Size_32_32:
373 return vk::Format::eR32G32Uint;
374 case Maxwell::VertexAttribute::Size::Size_32_32_32:
375 return vk::Format::eR32G32B32Uint;
367 case Maxwell::VertexAttribute::Size::Size_32_32_32_32: 376 case Maxwell::VertexAttribute::Size::Size_32_32_32_32:
368 return vk::Format::eR32G32B32A32Uint; 377 return vk::Format::eR32G32B32A32Uint;
369 default: 378 default:
diff --git a/src/video_core/renderer_vulkan/vk_device.cpp b/src/video_core/renderer_vulkan/vk_device.cpp
index 886bde3b9..28d2fbc4f 100644
--- a/src/video_core/renderer_vulkan/vk_device.cpp
+++ b/src/video_core/renderer_vulkan/vk_device.cpp
@@ -107,8 +107,7 @@ bool VKDevice::Create(const vk::DispatchLoaderDynamic& dldi, vk::Instance instan
107 features.occlusionQueryPrecise = true; 107 features.occlusionQueryPrecise = true;
108 features.fragmentStoresAndAtomics = true; 108 features.fragmentStoresAndAtomics = true;
109 features.shaderImageGatherExtended = true; 109 features.shaderImageGatherExtended = true;
110 features.shaderStorageImageReadWithoutFormat = 110 features.shaderStorageImageReadWithoutFormat = is_formatless_image_load_supported;
111 is_shader_storage_img_read_without_format_supported;
112 features.shaderStorageImageWriteWithoutFormat = true; 111 features.shaderStorageImageWriteWithoutFormat = true;
113 features.textureCompressionASTC_LDR = is_optimal_astc_supported; 112 features.textureCompressionASTC_LDR = is_optimal_astc_supported;
114 113
@@ -148,6 +147,15 @@ bool VKDevice::Create(const vk::DispatchLoaderDynamic& dldi, vk::Instance instan
148 LOG_INFO(Render_Vulkan, "Device doesn't support uint8 indexes"); 147 LOG_INFO(Render_Vulkan, "Device doesn't support uint8 indexes");
149 } 148 }
150 149
150 vk::PhysicalDeviceTransformFeedbackFeaturesEXT transform_feedback;
151 if (ext_transform_feedback) {
152 transform_feedback.transformFeedback = true;
153 transform_feedback.geometryStreams = true;
154 SetNext(next, transform_feedback);
155 } else {
156 LOG_INFO(Render_Vulkan, "Device doesn't support transform feedback");
157 }
158
151 if (!ext_depth_range_unrestricted) { 159 if (!ext_depth_range_unrestricted) {
152 LOG_INFO(Render_Vulkan, "Device doesn't support depth range unrestricted"); 160 LOG_INFO(Render_Vulkan, "Device doesn't support depth range unrestricted");
153 } 161 }
@@ -385,7 +393,7 @@ std::vector<const char*> VKDevice::LoadExtensions(const vk::DispatchLoaderDynami
385 } 393 }
386 }; 394 };
387 395
388 extensions.reserve(14); 396 extensions.reserve(15);
389 extensions.push_back(VK_KHR_SWAPCHAIN_EXTENSION_NAME); 397 extensions.push_back(VK_KHR_SWAPCHAIN_EXTENSION_NAME);
390 extensions.push_back(VK_KHR_16BIT_STORAGE_EXTENSION_NAME); 398 extensions.push_back(VK_KHR_16BIT_STORAGE_EXTENSION_NAME);
391 extensions.push_back(VK_KHR_8BIT_STORAGE_EXTENSION_NAME); 399 extensions.push_back(VK_KHR_8BIT_STORAGE_EXTENSION_NAME);
@@ -397,18 +405,22 @@ std::vector<const char*> VKDevice::LoadExtensions(const vk::DispatchLoaderDynami
397 405
398 [[maybe_unused]] const bool nsight = 406 [[maybe_unused]] const bool nsight =
399 std::getenv("NVTX_INJECTION64_PATH") || std::getenv("NSIGHT_LAUNCHED"); 407 std::getenv("NVTX_INJECTION64_PATH") || std::getenv("NSIGHT_LAUNCHED");
400 bool khr_shader_float16_int8{}; 408 bool has_khr_shader_float16_int8{};
401 bool ext_subgroup_size_control{}; 409 bool has_ext_subgroup_size_control{};
410 bool has_ext_transform_feedback{};
402 for (const auto& extension : physical.enumerateDeviceExtensionProperties(nullptr, dldi)) { 411 for (const auto& extension : physical.enumerateDeviceExtensionProperties(nullptr, dldi)) {
403 Test(extension, khr_uniform_buffer_standard_layout, 412 Test(extension, khr_uniform_buffer_standard_layout,
404 VK_KHR_UNIFORM_BUFFER_STANDARD_LAYOUT_EXTENSION_NAME, true); 413 VK_KHR_UNIFORM_BUFFER_STANDARD_LAYOUT_EXTENSION_NAME, true);
405 Test(extension, khr_shader_float16_int8, VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME, false); 414 Test(extension, has_khr_shader_float16_int8, VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME,
415 false);
406 Test(extension, ext_depth_range_unrestricted, 416 Test(extension, ext_depth_range_unrestricted,
407 VK_EXT_DEPTH_RANGE_UNRESTRICTED_EXTENSION_NAME, true); 417 VK_EXT_DEPTH_RANGE_UNRESTRICTED_EXTENSION_NAME, true);
408 Test(extension, ext_index_type_uint8, VK_EXT_INDEX_TYPE_UINT8_EXTENSION_NAME, true); 418 Test(extension, ext_index_type_uint8, VK_EXT_INDEX_TYPE_UINT8_EXTENSION_NAME, true);
409 Test(extension, ext_shader_viewport_index_layer, 419 Test(extension, ext_shader_viewport_index_layer,
410 VK_EXT_SHADER_VIEWPORT_INDEX_LAYER_EXTENSION_NAME, true); 420 VK_EXT_SHADER_VIEWPORT_INDEX_LAYER_EXTENSION_NAME, true);
411 Test(extension, ext_subgroup_size_control, VK_EXT_SUBGROUP_SIZE_CONTROL_EXTENSION_NAME, 421 Test(extension, has_ext_subgroup_size_control, VK_EXT_SUBGROUP_SIZE_CONTROL_EXTENSION_NAME,
422 false);
423 Test(extension, has_ext_transform_feedback, VK_EXT_TRANSFORM_FEEDBACK_EXTENSION_NAME,
412 false); 424 false);
413 if (Settings::values.renderer_debug) { 425 if (Settings::values.renderer_debug) {
414 Test(extension, nv_device_diagnostic_checkpoints, 426 Test(extension, nv_device_diagnostic_checkpoints,
@@ -416,13 +428,13 @@ std::vector<const char*> VKDevice::LoadExtensions(const vk::DispatchLoaderDynami
416 } 428 }
417 } 429 }
418 430
419 if (khr_shader_float16_int8) { 431 if (has_khr_shader_float16_int8) {
420 is_float16_supported = 432 is_float16_supported =
421 GetFeatures<vk::PhysicalDeviceFloat16Int8FeaturesKHR>(physical, dldi).shaderFloat16; 433 GetFeatures<vk::PhysicalDeviceFloat16Int8FeaturesKHR>(physical, dldi).shaderFloat16;
422 extensions.push_back(VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME); 434 extensions.push_back(VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME);
423 } 435 }
424 436
425 if (ext_subgroup_size_control) { 437 if (has_ext_subgroup_size_control) {
426 const auto features = 438 const auto features =
427 GetFeatures<vk::PhysicalDeviceSubgroupSizeControlFeaturesEXT>(physical, dldi); 439 GetFeatures<vk::PhysicalDeviceSubgroupSizeControlFeaturesEXT>(physical, dldi);
428 const auto properties = 440 const auto properties =
@@ -439,6 +451,20 @@ std::vector<const char*> VKDevice::LoadExtensions(const vk::DispatchLoaderDynami
439 is_warp_potentially_bigger = true; 451 is_warp_potentially_bigger = true;
440 } 452 }
441 453
454 if (has_ext_transform_feedback) {
455 const auto features =
456 GetFeatures<vk::PhysicalDeviceTransformFeedbackFeaturesEXT>(physical, dldi);
457 const auto properties =
458 GetProperties<vk::PhysicalDeviceTransformFeedbackPropertiesEXT>(physical, dldi);
459
460 if (features.transformFeedback && features.geometryStreams &&
461 properties.maxTransformFeedbackStreams >= 4 && properties.maxTransformFeedbackBuffers &&
462 properties.transformFeedbackQueries && properties.transformFeedbackDraw) {
463 extensions.push_back(VK_EXT_TRANSFORM_FEEDBACK_EXTENSION_NAME);
464 ext_transform_feedback = true;
465 }
466 }
467
442 return extensions; 468 return extensions;
443} 469}
444 470
@@ -467,8 +493,7 @@ void VKDevice::SetupFamilies(const vk::DispatchLoaderDynamic& dldi, vk::SurfaceK
467 493
468void VKDevice::SetupFeatures(const vk::DispatchLoaderDynamic& dldi) { 494void VKDevice::SetupFeatures(const vk::DispatchLoaderDynamic& dldi) {
469 const auto supported_features{physical.getFeatures(dldi)}; 495 const auto supported_features{physical.getFeatures(dldi)};
470 is_shader_storage_img_read_without_format_supported = 496 is_formatless_image_load_supported = supported_features.shaderStorageImageReadWithoutFormat;
471 supported_features.shaderStorageImageReadWithoutFormat;
472 is_optimal_astc_supported = IsOptimalAstcSupported(supported_features, dldi); 497 is_optimal_astc_supported = IsOptimalAstcSupported(supported_features, dldi);
473} 498}
474 499
@@ -510,6 +535,7 @@ std::unordered_map<vk::Format, vk::FormatProperties> VKDevice::GetFormatProperti
510 vk::Format::eR32G32Sfloat, 535 vk::Format::eR32G32Sfloat,
511 vk::Format::eR32G32Uint, 536 vk::Format::eR32G32Uint,
512 vk::Format::eR16G16B16A16Uint, 537 vk::Format::eR16G16B16A16Uint,
538 vk::Format::eR16G16B16A16Snorm,
513 vk::Format::eR16G16B16A16Unorm, 539 vk::Format::eR16G16B16A16Unorm,
514 vk::Format::eR16G16Unorm, 540 vk::Format::eR16G16Unorm,
515 vk::Format::eR16G16Snorm, 541 vk::Format::eR16G16Snorm,
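The transform-feedback wiring above follows the usual Vulkan extension negotiation: check that the extension is advertised, query its feature and property structs through the pNext chains of vkGetPhysicalDeviceFeatures2/Properties2, and only enable it when the limits are sufficient. A raw C-API sketch of the query step with the same thresholds LoadExtensions uses (helper name is illustrative; assumes a Vulkan 1.1 instance and that the extension was found in the device's extension list):

#include <vulkan/vulkan.h>

bool QueryTransformFeedback(VkPhysicalDevice physical,
                            VkPhysicalDeviceTransformFeedbackFeaturesEXT& tfb_features) {
    tfb_features = {};
    tfb_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TRANSFORM_FEEDBACK_FEATURES_EXT;

    VkPhysicalDeviceFeatures2 features2{};
    features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
    features2.pNext = &tfb_features;
    vkGetPhysicalDeviceFeatures2(physical, &features2);

    VkPhysicalDeviceTransformFeedbackPropertiesEXT tfb_properties{};
    tfb_properties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TRANSFORM_FEEDBACK_PROPERTIES_EXT;
    VkPhysicalDeviceProperties2 properties2{};
    properties2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2;
    properties2.pNext = &tfb_properties;
    vkGetPhysicalDeviceProperties2(physical, &properties2);

    // Same thresholds as VKDevice::LoadExtensions: four streams plus draw and query support.
    return tfb_features.transformFeedback && tfb_features.geometryStreams &&
           tfb_properties.maxTransformFeedbackStreams >= 4 &&
           tfb_properties.maxTransformFeedbackBuffers > 0 &&
           tfb_properties.transformFeedbackQueries && tfb_properties.transformFeedbackDraw;
}

If this returns true, VK_EXT_TRANSFORM_FEEDBACK_EXTENSION_NAME goes into the enabled extension list and the filled feature struct is linked into VkDeviceCreateInfo::pNext, which is what SetNext does above.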
diff --git a/src/video_core/renderer_vulkan/vk_device.h b/src/video_core/renderer_vulkan/vk_device.h
index 2c27ad730..6e656517f 100644
--- a/src/video_core/renderer_vulkan/vk_device.h
+++ b/src/video_core/renderer_vulkan/vk_device.h
@@ -122,11 +122,6 @@ public:
122 return properties.limits.maxPushConstantsSize; 122 return properties.limits.maxPushConstantsSize;
123 } 123 }
124 124
125 /// Returns true if Shader storage Image Read Without Format supported.
126 bool IsShaderStorageImageReadWithoutFormatSupported() const {
127 return is_shader_storage_img_read_without_format_supported;
128 }
129
130 /// Returns true if ASTC is natively supported. 125 /// Returns true if ASTC is natively supported.
131 bool IsOptimalAstcSupported() const { 126 bool IsOptimalAstcSupported() const {
132 return is_optimal_astc_supported; 127 return is_optimal_astc_supported;
@@ -147,6 +142,11 @@ public:
147 return (guest_warp_stages & stage) != vk::ShaderStageFlags{}; 142 return (guest_warp_stages & stage) != vk::ShaderStageFlags{};
148 } 143 }
149 144
145 /// Returns true if formatless image load is supported.
146 bool IsFormatlessImageLoadSupported() const {
147 return is_formatless_image_load_supported;
148 }
149
150 /// Returns true if the device supports VK_KHR_uniform_buffer_standard_layout. 150 bool IsKhrUniformBufferStandardLayoutSupported() const {
151 bool IsKhrUniformBufferStandardLayoutSupported() const { 151 bool IsKhrUniformBufferStandardLayoutSupported() const {
152 return khr_uniform_buffer_standard_layout; 152 return khr_uniform_buffer_standard_layout;
@@ -167,6 +167,11 @@ public:
167 return ext_shader_viewport_index_layer; 167 return ext_shader_viewport_index_layer;
168 } 168 }
169 169
170 /// Returns true if the device supports VK_EXT_transform_feedback.
171 bool IsExtTransformFeedbackSupported() const {
172 return ext_transform_feedback;
173 }
174
170 /// Returns true if the device supports VK_NV_device_diagnostic_checkpoints. 175 /// Returns true if the device supports VK_NV_device_diagnostic_checkpoints.
171 bool IsNvDeviceDiagnosticCheckpoints() const { 176 bool IsNvDeviceDiagnosticCheckpoints() const {
172 return nv_device_diagnostic_checkpoints; 177 return nv_device_diagnostic_checkpoints;
@@ -214,26 +219,26 @@ private:
214 static std::unordered_map<vk::Format, vk::FormatProperties> GetFormatProperties( 219 static std::unordered_map<vk::Format, vk::FormatProperties> GetFormatProperties(
215 const vk::DispatchLoaderDynamic& dldi, vk::PhysicalDevice physical); 220 const vk::DispatchLoaderDynamic& dldi, vk::PhysicalDevice physical);
216 221
217 const vk::PhysicalDevice physical; ///< Physical device. 222 const vk::PhysicalDevice physical; ///< Physical device.
218 vk::DispatchLoaderDynamic dld; ///< Device function pointers. 223 vk::DispatchLoaderDynamic dld; ///< Device function pointers.
219 vk::PhysicalDeviceProperties properties; ///< Device properties. 224 vk::PhysicalDeviceProperties properties; ///< Device properties.
220 UniqueDevice logical; ///< Logical device. 225 UniqueDevice logical; ///< Logical device.
221 vk::Queue graphics_queue; ///< Main graphics queue. 226 vk::Queue graphics_queue; ///< Main graphics queue.
222 vk::Queue present_queue; ///< Main present queue. 227 vk::Queue present_queue; ///< Main present queue.
223 u32 graphics_family{}; ///< Main graphics queue family index. 228 u32 graphics_family{}; ///< Main graphics queue family index.
224 u32 present_family{}; ///< Main present queue family index. 229 u32 present_family{}; ///< Main present queue family index.
225 vk::DriverIdKHR driver_id{}; ///< Driver ID. 230 vk::DriverIdKHR driver_id{}; ///< Driver ID.
226 vk::ShaderStageFlags guest_warp_stages{}; ///< Stages where the guest warp size can be forced. 231 vk::ShaderStageFlags guest_warp_stages{}; ///< Stages where the guest warp size can be forced.
227 bool is_optimal_astc_supported{}; ///< Support for native ASTC. 232 bool is_optimal_astc_supported{}; ///< Support for native ASTC.
228 bool is_float16_supported{}; ///< Support for float16 arithmetics. 233 bool is_float16_supported{}; ///< Support for float16 arithmetics.
229 bool is_warp_potentially_bigger{}; ///< Host warp size can be bigger than guest. 234 bool is_warp_potentially_bigger{}; ///< Host warp size can be bigger than guest.
235 bool is_formatless_image_load_supported{}; ///< Support for shader image read without format.
230 bool khr_uniform_buffer_standard_layout{}; ///< Support for std430 on UBOs. 236 bool khr_uniform_buffer_standard_layout{}; ///< Support for std430 on UBOs.
231 bool ext_index_type_uint8{}; ///< Support for VK_EXT_index_type_uint8. 237 bool ext_index_type_uint8{}; ///< Support for VK_EXT_index_type_uint8.
232 bool ext_depth_range_unrestricted{}; ///< Support for VK_EXT_depth_range_unrestricted. 238 bool ext_depth_range_unrestricted{}; ///< Support for VK_EXT_depth_range_unrestricted.
233 bool ext_shader_viewport_index_layer{}; ///< Support for VK_EXT_shader_viewport_index_layer. 239 bool ext_shader_viewport_index_layer{}; ///< Support for VK_EXT_shader_viewport_index_layer.
240 bool ext_transform_feedback{}; ///< Support for VK_EXT_transform_feedback.
234 bool nv_device_diagnostic_checkpoints{}; ///< Support for VK_NV_device_diagnostic_checkpoints. 241 bool nv_device_diagnostic_checkpoints{}; ///< Support for VK_NV_device_diagnostic_checkpoints.
235 bool is_shader_storage_img_read_without_format_supported{}; ///< Support for shader storage
236 ///< image read without format
237 242
238 // Telemetry parameters 243 // Telemetry parameters
239 std::string vendor_name; ///< Device's driver name. 244 std::string vendor_name; ///< Device's driver name.
diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
index 144e1e007..557b9d662 100644
--- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
@@ -161,8 +161,8 @@ CachedShader::CachedShader(Core::System& system, Tegra::Engines::ShaderType stag
161 GPUVAddr gpu_addr, VAddr cpu_addr, u8* host_ptr, 161 GPUVAddr gpu_addr, VAddr cpu_addr, u8* host_ptr,
162 ProgramCode program_code, u32 main_offset) 162 ProgramCode program_code, u32 main_offset)
163 : RasterizerCacheObject{host_ptr}, gpu_addr{gpu_addr}, cpu_addr{cpu_addr}, 163 : RasterizerCacheObject{host_ptr}, gpu_addr{gpu_addr}, cpu_addr{cpu_addr},
164 program_code{std::move(program_code)}, locker{stage, GetEngine(system, stage)}, 164 program_code{std::move(program_code)}, registry{stage, GetEngine(system, stage)},
165 shader_ir{this->program_code, main_offset, compiler_settings, locker}, 165 shader_ir{this->program_code, main_offset, compiler_settings, registry},
166 entries{GenerateShaderEntries(shader_ir)} {} 166 entries{GenerateShaderEntries(shader_ir)} {}
167 167
168CachedShader::~CachedShader() = default; 168CachedShader::~CachedShader() = default;
@@ -179,10 +179,11 @@ Tegra::Engines::ConstBufferEngineInterface& CachedShader::GetEngine(
179VKPipelineCache::VKPipelineCache(Core::System& system, RasterizerVulkan& rasterizer, 179VKPipelineCache::VKPipelineCache(Core::System& system, RasterizerVulkan& rasterizer,
180 const VKDevice& device, VKScheduler& scheduler, 180 const VKDevice& device, VKScheduler& scheduler,
181 VKDescriptorPool& descriptor_pool, 181 VKDescriptorPool& descriptor_pool,
182 VKUpdateDescriptorQueue& update_descriptor_queue) 182 VKUpdateDescriptorQueue& update_descriptor_queue,
183 VKRenderPassCache& renderpass_cache)
183 : RasterizerCache{rasterizer}, system{system}, device{device}, scheduler{scheduler}, 184 : RasterizerCache{rasterizer}, system{system}, device{device}, scheduler{scheduler},
184 descriptor_pool{descriptor_pool}, update_descriptor_queue{update_descriptor_queue}, 185 descriptor_pool{descriptor_pool}, update_descriptor_queue{update_descriptor_queue},
185 renderpass_cache(device) {} 186 renderpass_cache{renderpass_cache} {}
186 187
187VKPipelineCache::~VKPipelineCache() = default; 188VKPipelineCache::~VKPipelineCache() = default;
188 189
@@ -191,7 +192,6 @@ std::array<Shader, Maxwell::MaxShaderProgram> VKPipelineCache::GetShaders() {
191 192
192 std::array<Shader, Maxwell::MaxShaderProgram> shaders; 193 std::array<Shader, Maxwell::MaxShaderProgram> shaders;
193 for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) { 194 for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) {
194 const auto& shader_config = gpu.regs.shader_config[index];
195 const auto program{static_cast<Maxwell::ShaderProgram>(index)}; 195 const auto program{static_cast<Maxwell::ShaderProgram>(index)};
196 196
197 // Skip stages that are not enabled 197 // Skip stages that are not enabled
@@ -273,9 +273,9 @@ VKComputePipeline& VKPipelineCache::GetComputePipeline(const ComputePipelineCach
273 specialization.workgroup_size = key.workgroup_size; 273 specialization.workgroup_size = key.workgroup_size;
274 specialization.shared_memory_size = key.shared_memory_size; 274 specialization.shared_memory_size = key.shared_memory_size;
275 275
276 const SPIRVShader spirv_shader{ 276 const SPIRVShader spirv_shader{Decompile(device, shader->GetIR(), ShaderType::Compute,
277 Decompile(device, shader->GetIR(), ShaderType::Compute, specialization), 277 shader->GetRegistry(), specialization),
278 shader->GetEntries()}; 278 shader->GetEntries()};
279 entry = std::make_unique<VKComputePipeline>(device, scheduler, descriptor_pool, 279 entry = std::make_unique<VKComputePipeline>(device, scheduler, descriptor_pool,
280 update_descriptor_queue, spirv_shader); 280 update_descriptor_queue, spirv_shader);
281 return *entry; 281 return *entry;
@@ -324,8 +324,7 @@ VKPipelineCache::DecompileShaders(const GraphicsPipelineCacheKey& key) {
324 const auto& gpu = system.GPU().Maxwell3D(); 324 const auto& gpu = system.GPU().Maxwell3D();
325 325
326 Specialization specialization; 326 Specialization specialization;
327 specialization.primitive_topology = fixed_state.input_assembly.topology; 327 if (fixed_state.input_assembly.topology == Maxwell::PrimitiveTopology::Points) {
328 if (specialization.primitive_topology == Maxwell::PrimitiveTopology::Points) {
329 ASSERT(fixed_state.input_assembly.point_size != 0.0f); 328 ASSERT(fixed_state.input_assembly.point_size != 0.0f);
330 specialization.point_size = fixed_state.input_assembly.point_size; 329 specialization.point_size = fixed_state.input_assembly.point_size;
331 } 330 }
@@ -333,9 +332,6 @@ VKPipelineCache::DecompileShaders(const GraphicsPipelineCacheKey& key) {
333 specialization.attribute_types[i] = fixed_state.vertex_input.attributes[i].type; 332 specialization.attribute_types[i] = fixed_state.vertex_input.attributes[i].type;
334 } 333 }
335 specialization.ndc_minus_one_to_one = fixed_state.rasterizer.ndc_minus_one_to_one; 334 specialization.ndc_minus_one_to_one = fixed_state.rasterizer.ndc_minus_one_to_one;
336 specialization.tessellation.primitive = fixed_state.tessellation.primitive;
337 specialization.tessellation.spacing = fixed_state.tessellation.spacing;
338 specialization.tessellation.clockwise = fixed_state.tessellation.clockwise;
339 335
340 SPIRVProgram program; 336 SPIRVProgram program;
341 std::vector<vk::DescriptorSetLayoutBinding> bindings; 337 std::vector<vk::DescriptorSetLayoutBinding> bindings;
@@ -356,8 +352,9 @@ VKPipelineCache::DecompileShaders(const GraphicsPipelineCacheKey& key) {
356 const std::size_t stage = index == 0 ? 0 : index - 1; // Stage indices are 0 - 5 352 const std::size_t stage = index == 0 ? 0 : index - 1; // Stage indices are 0 - 5
357 const auto program_type = GetShaderType(program_enum); 353 const auto program_type = GetShaderType(program_enum);
358 const auto& entries = shader->GetEntries(); 354 const auto& entries = shader->GetEntries();
359 program[stage] = {Decompile(device, shader->GetIR(), program_type, specialization), 355 program[stage] = {
360 entries}; 356 Decompile(device, shader->GetIR(), program_type, shader->GetRegistry(), specialization),
357 entries};
361 358
362 if (program_enum == Maxwell::ShaderProgram::VertexA) { 359 if (program_enum == Maxwell::ShaderProgram::VertexA) {
363 // VertexB was combined with VertexA, so we skip the VertexB iteration 360 // VertexB was combined with VertexA, so we skip the VertexB iteration
diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.h b/src/video_core/renderer_vulkan/vk_pipeline_cache.h
index 92a670cc7..c4c112290 100644
--- a/src/video_core/renderer_vulkan/vk_pipeline_cache.h
+++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.h
@@ -25,7 +25,7 @@
25#include "video_core/renderer_vulkan/vk_renderpass_cache.h" 25#include "video_core/renderer_vulkan/vk_renderpass_cache.h"
26#include "video_core/renderer_vulkan/vk_resource_manager.h" 26#include "video_core/renderer_vulkan/vk_resource_manager.h"
27#include "video_core/renderer_vulkan/vk_shader_decompiler.h" 27#include "video_core/renderer_vulkan/vk_shader_decompiler.h"
28#include "video_core/shader/const_buffer_locker.h" 28#include "video_core/shader/registry.h"
29#include "video_core/shader/shader_ir.h" 29#include "video_core/shader/shader_ir.h"
30#include "video_core/surface.h" 30#include "video_core/surface.h"
31 31
@@ -132,6 +132,10 @@ public:
132 return shader_ir; 132 return shader_ir;
133 } 133 }
134 134
135 const VideoCommon::Shader::Registry& GetRegistry() const {
136 return registry;
137 }
138
135 const VideoCommon::Shader::ShaderIR& GetIR() const { 139 const VideoCommon::Shader::ShaderIR& GetIR() const {
136 return shader_ir; 140 return shader_ir;
137 } 141 }
@@ -147,7 +151,7 @@ private:
147 GPUVAddr gpu_addr{}; 151 GPUVAddr gpu_addr{};
148 VAddr cpu_addr{}; 152 VAddr cpu_addr{};
149 ProgramCode program_code; 153 ProgramCode program_code;
150 VideoCommon::Shader::ConstBufferLocker locker; 154 VideoCommon::Shader::Registry registry;
151 VideoCommon::Shader::ShaderIR shader_ir; 155 VideoCommon::Shader::ShaderIR shader_ir;
152 ShaderEntries entries; 156 ShaderEntries entries;
153}; 157};
@@ -157,7 +161,8 @@ public:
157 explicit VKPipelineCache(Core::System& system, RasterizerVulkan& rasterizer, 161 explicit VKPipelineCache(Core::System& system, RasterizerVulkan& rasterizer,
158 const VKDevice& device, VKScheduler& scheduler, 162 const VKDevice& device, VKScheduler& scheduler,
159 VKDescriptorPool& descriptor_pool, 163 VKDescriptorPool& descriptor_pool,
160 VKUpdateDescriptorQueue& update_descriptor_queue); 164 VKUpdateDescriptorQueue& update_descriptor_queue,
165 VKRenderPassCache& renderpass_cache);
161 ~VKPipelineCache(); 166 ~VKPipelineCache();
162 167
163 std::array<Shader, Maxwell::MaxShaderProgram> GetShaders(); 168 std::array<Shader, Maxwell::MaxShaderProgram> GetShaders();
@@ -180,8 +185,7 @@ private:
180 VKScheduler& scheduler; 185 VKScheduler& scheduler;
181 VKDescriptorPool& descriptor_pool; 186 VKDescriptorPool& descriptor_pool;
182 VKUpdateDescriptorQueue& update_descriptor_queue; 187 VKUpdateDescriptorQueue& update_descriptor_queue;
183 188 VKRenderPassCache& renderpass_cache;
184 VKRenderPassCache renderpass_cache;
185 189
186 std::array<Shader, Maxwell::MaxShaderProgram> last_shaders; 190 std::array<Shader, Maxwell::MaxShaderProgram> last_shaders;
187 191
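The Registry that replaces ConstBufferLocker keeps the same job under a clearer name: while the IR is built it records the engine state the decompiler had to sample (the keyed const-buffer values and bound/bindless samplers that also appear in ShaderDiskCacheEntry earlier in this change), so a later run can verify that state still matches before trusting a disk-cached shader. A conceptual sketch of that record-and-revalidate idea, with illustrative types only:

#include <cstdint>
#include <map>
#include <utility>

// (buffer index, offset) -> value observed while decoding the shader.
using KeyMap = std::map<std::pair<std::uint32_t, std::uint32_t>, std::uint32_t>;

class MiniRegistry {
public:
    // Record a const-buffer value the decompiler depended on.
    void InsertKey(std::uint32_t buffer, std::uint32_t offset, std::uint32_t value) {
        keys.emplace(std::make_pair(buffer, offset), value);
    }

    const KeyMap& GetKeys() const {
        return keys;
    }

    // Before reusing a disk-cached shader, confirm the engine still reports the same
    // values; if anything drifted, the cached entry cannot be trusted and is rebuilt.
    bool IsConsistentWith(const KeyMap& current) const {
        for (const auto& [key, value] : keys) {
            const auto it = current.find(key);
            if (it == current.end() || it->second != value) {
                return false;
            }
        }
        return true;
    }

private:
    KeyMap keys;
};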
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index 2bcb17b56..58c69b786 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -287,12 +287,13 @@ RasterizerVulkan::RasterizerVulkan(Core::System& system, Core::Frontend::EmuWind
287 screen_info{screen_info}, device{device}, resource_manager{resource_manager}, 287 screen_info{screen_info}, device{device}, resource_manager{resource_manager},
288 memory_manager{memory_manager}, state_tracker{state_tracker}, scheduler{scheduler}, 288 memory_manager{memory_manager}, state_tracker{state_tracker}, scheduler{scheduler},
289 staging_pool(device, memory_manager, scheduler), descriptor_pool(device), 289 staging_pool(device, memory_manager, scheduler), descriptor_pool(device),
290 update_descriptor_queue(device, scheduler), 290 update_descriptor_queue(device, scheduler), renderpass_cache(device),
291 quad_array_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue), 291 quad_array_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue),
292 uint8_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue), 292 uint8_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue),
293 texture_cache(system, *this, device, resource_manager, memory_manager, scheduler, 293 texture_cache(system, *this, device, resource_manager, memory_manager, scheduler,
294 staging_pool), 294 staging_pool),
295 pipeline_cache(system, *this, device, scheduler, descriptor_pool, update_descriptor_queue), 295 pipeline_cache(system, *this, device, scheduler, descriptor_pool, update_descriptor_queue,
296 renderpass_cache),
296 buffer_cache(*this, system, device, memory_manager, scheduler, staging_pool), 297 buffer_cache(*this, system, device, memory_manager, scheduler, staging_pool),
297 sampler_cache(device), query_cache(system, *this, device, scheduler) { 298 sampler_cache(device), query_cache(system, *this, device, scheduler) {
298 scheduler.SetQueryCache(query_cache); 299 scheduler.SetQueryCache(query_cache);
@@ -347,6 +348,8 @@ void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) {
347 [&pipeline](auto cmdbuf, auto& dld) { cmdbuf.setCheckpointNV(&pipeline, dld); }); 348 [&pipeline](auto cmdbuf, auto& dld) { cmdbuf.setCheckpointNV(&pipeline, dld); });
348 } 349 }
349 350
351 BeginTransformFeedback();
352
350 const auto pipeline_layout = pipeline.GetLayout(); 353 const auto pipeline_layout = pipeline.GetLayout();
351 const auto descriptor_set = pipeline.CommitDescriptorSet(); 354 const auto descriptor_set = pipeline.CommitDescriptorSet();
352 scheduler.Record([pipeline_layout, descriptor_set, draw_params](auto cmdbuf, auto& dld) { 355 scheduler.Record([pipeline_layout, descriptor_set, draw_params](auto cmdbuf, auto& dld) {
@@ -356,18 +359,23 @@ void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) {
356 } 359 }
357 draw_params.Draw(cmdbuf, dld); 360 draw_params.Draw(cmdbuf, dld);
358 }); 361 });
362
363 EndTransformFeedback();
359} 364}
360 365
361void RasterizerVulkan::Clear() { 366void RasterizerVulkan::Clear() {
362 MICROPROFILE_SCOPE(Vulkan_Clearing); 367 MICROPROFILE_SCOPE(Vulkan_Clearing);
363 368
364 query_cache.UpdateCounters();
365
366 const auto& gpu = system.GPU().Maxwell3D(); 369 const auto& gpu = system.GPU().Maxwell3D();
367 if (!system.GPU().Maxwell3D().ShouldExecute()) { 370 if (!system.GPU().Maxwell3D().ShouldExecute()) {
368 return; 371 return;
369 } 372 }
370 373
374 sampled_views.clear();
375 image_views.clear();
376
377 query_cache.UpdateCounters();
378
371 const auto& regs = gpu.regs; 379 const auto& regs = gpu.regs;
372 const bool use_color = regs.clear_buffers.R || regs.clear_buffers.G || regs.clear_buffers.B || 380 const bool use_color = regs.clear_buffers.R || regs.clear_buffers.G || regs.clear_buffers.B ||
373 regs.clear_buffers.A; 381 regs.clear_buffers.A;
@@ -376,52 +384,54 @@ void RasterizerVulkan::Clear() {
376 if (!use_color && !use_depth && !use_stencil) { 384 if (!use_color && !use_depth && !use_stencil) {
377 return; 385 return;
378 } 386 }
379 // Clearing images requires to be out of a renderpass
380 scheduler.RequestOutsideRenderPassOperationContext();
381 387
382 // TODO(Rodrigo): Implement clears rendering a quad or using beginning a renderpass. 388 [[maybe_unused]] const auto texceptions = UpdateAttachments();
389 DEBUG_ASSERT(texceptions.none());
390 SetupImageTransitions(0, color_attachments, zeta_attachment);
383 391
384 if (use_color) { 392 const vk::RenderPass renderpass = renderpass_cache.GetRenderPass(GetRenderPassParams(0));
385 View color_view; 393 const auto [framebuffer, render_area] = ConfigureFramebuffers(renderpass);
386 { 394 scheduler.RequestRenderpass({renderpass, framebuffer, {{0, 0}, render_area}, 0, nullptr});
387 MICROPROFILE_SCOPE(Vulkan_RenderTargets); 395
388 color_view = texture_cache.GetColorBufferSurface(regs.clear_buffers.RT.Value(), false); 396 const auto& scissor = regs.scissor_test[0];
389 } 397 const vk::Offset2D scissor_offset(scissor.min_x, scissor.min_y);
398 vk::Extent2D scissor_extent{scissor.max_x - scissor.min_x, scissor.max_y - scissor.min_y};
399 scissor_extent.width = std::min(scissor_extent.width, render_area.width);
400 scissor_extent.height = std::min(scissor_extent.height, render_area.height);
390 401
391 color_view->Transition(vk::ImageLayout::eTransferDstOptimal, 402 const u32 layer = regs.clear_buffers.layer;
392 vk::PipelineStageFlagBits::eTransfer, 403 const vk::ClearRect clear_rect({scissor_offset, scissor_extent}, layer, 1);
393 vk::AccessFlagBits::eTransferWrite);
394 404
405 if (use_color) {
395 const std::array clear_color = {regs.clear_color[0], regs.clear_color[1], 406 const std::array clear_color = {regs.clear_color[0], regs.clear_color[1],
396 regs.clear_color[2], regs.clear_color[3]}; 407 regs.clear_color[2], regs.clear_color[3]};
397 const vk::ClearColorValue clear(clear_color); 408 const vk::ClearValue clear_value{clear_color};
398 scheduler.Record([image = color_view->GetImage(), 409 const u32 color_attachment = regs.clear_buffers.RT;
399 subresource = color_view->GetImageSubresourceRange(), 410 scheduler.Record([color_attachment, clear_value, clear_rect](auto cmdbuf, auto& dld) {
400 clear](auto cmdbuf, auto& dld) { 411 const vk::ClearAttachment attachment(vk::ImageAspectFlagBits::eColor, color_attachment,
401 cmdbuf.clearColorImage(image, vk::ImageLayout::eTransferDstOptimal, clear, subresource, 412 clear_value);
402 dld); 413 cmdbuf.clearAttachments(1, &attachment, 1, &clear_rect, dld);
403 }); 414 });
404 } 415 }
405 if (use_depth || use_stencil) {
406 View zeta_surface;
407 {
408 MICROPROFILE_SCOPE(Vulkan_RenderTargets);
409 zeta_surface = texture_cache.GetDepthBufferSurface(false);
410 }
411 416
412 zeta_surface->Transition(vk::ImageLayout::eTransferDstOptimal, 417 if (!use_depth && !use_stencil) {
413 vk::PipelineStageFlagBits::eTransfer, 418 return;
414 vk::AccessFlagBits::eTransferWrite); 419 }
415 420 vk::ImageAspectFlags aspect_flags;
416 const vk::ClearDepthStencilValue clear(regs.clear_depth, 421 if (use_depth) {
417 static_cast<u32>(regs.clear_stencil)); 422 aspect_flags |= vk::ImageAspectFlagBits::eDepth;
418 scheduler.Record([image = zeta_surface->GetImage(),
419 subresource = zeta_surface->GetImageSubresourceRange(),
420 clear](auto cmdbuf, auto& dld) {
421 cmdbuf.clearDepthStencilImage(image, vk::ImageLayout::eTransferDstOptimal, clear,
422 subresource, dld);
423 });
424 } 423 }
424 if (use_stencil) {
425 aspect_flags |= vk::ImageAspectFlagBits::eStencil;
426 }
427
428 scheduler.Record([clear_depth = regs.clear_depth, clear_stencil = regs.clear_stencil,
429 clear_rect, aspect_flags](auto cmdbuf, auto& dld) {
430 const vk::ClearDepthStencilValue clear_zeta(clear_depth, clear_stencil);
431 const vk::ClearValue clear_value{clear_zeta};
432 const vk::ClearAttachment attachment(aspect_flags, 0, clear_value);
433 cmdbuf.clearAttachments(1, &attachment, 1, &clear_rect, dld);
434 });
425} 435}
426 436
427void RasterizerVulkan::DispatchCompute(GPUVAddr code_addr) { 437void RasterizerVulkan::DispatchCompute(GPUVAddr code_addr) {
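The rewritten Clear() above no longer transitions the target images and records clearColorImage/clearDepthStencilImage outside a render pass; it now requests a render pass and records clearAttachments with a rect derived from the first scissor register. A minimal sketch of that pattern in plain Vulkan-Hpp, detached from yuzu's scheduler (the function and parameter names here are illustrative, not part of the patch):

#include <array>
#include <cstdint>
#include <vulkan/vulkan.hpp>

// Clears one color attachment and, optionally, the depth/stencil attachment of
// the render pass currently active on cmdbuf, mirroring the clearAttachments
// path added above. Error handling is omitted.
void ClearInsideRenderPass(vk::CommandBuffer cmdbuf, const vk::DispatchLoaderDynamic& dld,
                           std::uint32_t color_attachment, const std::array<float, 4>& color,
                           bool clear_depth, bool clear_stencil, float depth, std::uint32_t stencil,
                           vk::Rect2D rect, std::uint32_t layer) {
    const vk::ClearRect clear_rect(rect, layer, 1);

    const vk::ClearValue color_value{vk::ClearColorValue{color}};
    const vk::ClearAttachment color_clear(vk::ImageAspectFlagBits::eColor, color_attachment,
                                          color_value);
    cmdbuf.clearAttachments(1, &color_clear, 1, &clear_rect, dld);

    vk::ImageAspectFlags aspect;
    if (clear_depth) {
        aspect |= vk::ImageAspectFlagBits::eDepth;
    }
    if (clear_stencil) {
        aspect |= vk::ImageAspectFlagBits::eStencil;
    }
    if (aspect) {
        // The attachment index is only meaningful for the color aspect, so 0 is fine here.
        const vk::ClearValue zeta_value{vk::ClearDepthStencilValue(depth, stencil)};
        const vk::ClearAttachment zeta_clear(aspect, 0, zeta_value);
        cmdbuf.clearAttachments(1, &zeta_clear, 1, &clear_rect, dld);
    }
}

Because the clear stays inside the render pass, no extra image layout transitions or render pass breaks are needed, which is what the removed RequestOutsideRenderPassOperationContext call used to pay for.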
@@ -538,8 +548,6 @@ bool RasterizerVulkan::AccelerateDisplay(const Tegra::FramebufferConfig& config,
538 548
539 // Verify that the cached surface is the same size and format as the requested framebuffer 549 // Verify that the cached surface is the same size and format as the requested framebuffer
540 const auto& params{surface->GetSurfaceParams()}; 550 const auto& params{surface->GetSurfaceParams()};
541 const auto& pixel_format{
542 VideoCore::Surface::PixelFormatFromGPUPixelFormat(config.pixel_format)};
543 ASSERT_MSG(params.width == config.width, "Framebuffer width is different"); 551 ASSERT_MSG(params.width == config.width, "Framebuffer width is different");
544 ASSERT_MSG(params.height == config.height, "Framebuffer height is different"); 552 ASSERT_MSG(params.height == config.height, "Framebuffer height is different");
545 553
@@ -738,6 +746,44 @@ void RasterizerVulkan::UpdateDynamicStates() {
738 UpdateStencilFaces(regs); 746 UpdateStencilFaces(regs);
739} 747}
740 748
749void RasterizerVulkan::BeginTransformFeedback() {
750 const auto& regs = system.GPU().Maxwell3D().regs;
751 if (regs.tfb_enabled == 0) {
752 return;
753 }
754
755 UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationControl) ||
756 regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationEval) ||
757 regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::Geometry));
758
759 UNIMPLEMENTED_IF(regs.tfb_bindings[1].buffer_enable);
760 UNIMPLEMENTED_IF(regs.tfb_bindings[2].buffer_enable);
761 UNIMPLEMENTED_IF(regs.tfb_bindings[3].buffer_enable);
762
763 const auto& binding = regs.tfb_bindings[0];
764 UNIMPLEMENTED_IF(binding.buffer_enable == 0);
765 UNIMPLEMENTED_IF(binding.buffer_offset != 0);
766
767 const GPUVAddr gpu_addr = binding.Address();
768 const std::size_t size = binding.buffer_size;
769 const auto [buffer, offset] = buffer_cache.UploadMemory(gpu_addr, size, 4, true);
770
771 scheduler.Record([buffer = *buffer, offset = offset, size](auto cmdbuf, auto& dld) {
772 cmdbuf.bindTransformFeedbackBuffersEXT(0, {buffer}, {offset}, {size}, dld);
773 cmdbuf.beginTransformFeedbackEXT(0, {}, {}, dld);
774 });
775}
776
777void RasterizerVulkan::EndTransformFeedback() {
778 const auto& regs = system.GPU().Maxwell3D().regs;
779 if (regs.tfb_enabled == 0) {
780 return;
781 }
782
783 scheduler.Record(
784 [](auto cmdbuf, auto& dld) { cmdbuf.endTransformFeedbackEXT(0, {}, {}, dld); });
785}
786
741void RasterizerVulkan::SetupVertexArrays(FixedPipelineState::VertexInput& vertex_input, 787void RasterizerVulkan::SetupVertexArrays(FixedPipelineState::VertexInput& vertex_input,
742 BufferBindings& buffer_bindings) { 788 BufferBindings& buffer_bindings) {
743 const auto& regs = system.GPU().Maxwell3D().regs; 789 const auto& regs = system.GPU().Maxwell3D().regs;
@@ -1109,7 +1155,7 @@ std::size_t RasterizerVulkan::CalculateVertexArraysSize() const {
1109 // This implementation assumes that all attributes are used in the shader. 1155 // This implementation assumes that all attributes are used in the shader.
1110 const GPUVAddr start{regs.vertex_array[index].StartAddress()}; 1156 const GPUVAddr start{regs.vertex_array[index].StartAddress()};
1111 const GPUVAddr end{regs.vertex_array_limit[index].LimitAddress()}; 1157 const GPUVAddr end{regs.vertex_array_limit[index].LimitAddress()};
1112 DEBUG_ASSERT(end > start); 1158 DEBUG_ASSERT(end >= start);
1113 1159
1114 size += (end - start + 1) * regs.vertex_array[index].enable; 1160 size += (end - start + 1) * regs.vertex_array[index].enable;
1115 } 1161 }
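For context on the new Begin/EndTransformFeedback pair, the underlying VK_EXT_transform_feedback sequence is bind, begin, draw, end, recorded while the graphics pipeline carrying the Xfb decorations is bound. A sketch with illustrative names; buffer creation, counter buffers and the extension/feature checks are assumed to have happened elsewhere:

#include <cstdint>
#include <vulkan/vulkan.hpp>

// Captures the vertex outputs of one draw into xfb_buffer via transform
// feedback binding 0, mirroring the calls recorded by the scheduler above.
void DrawWithTransformFeedback(vk::CommandBuffer cmdbuf, const vk::DispatchLoaderDynamic& dld,
                               vk::Buffer xfb_buffer, vk::DeviceSize offset, vk::DeviceSize size,
                               std::uint32_t num_vertices) {
    // Bind the capture buffer to transform feedback binding 0.
    cmdbuf.bindTransformFeedbackBuffersEXT(0, {xfb_buffer}, {offset}, {size}, dld);

    // No counter buffers are used, matching the patch.
    cmdbuf.beginTransformFeedbackEXT(0, {}, {}, dld);

    // Outputs of draws recorded between begin/end are written to the bound
    // buffer according to the pipeline's XfbBuffer/XfbStride/Offset decorations.
    cmdbuf.draw(num_vertices, 1, 0, 0, dld);

    cmdbuf.endTransformFeedbackEXT(0, {}, {}, dld);
}

BeginTransformFeedback above additionally rejects the cases the decompiler cannot decorate yet (tessellation and geometry stages, bindings 1 to 3, non-zero buffer offsets), which is why it is guarded with UNIMPLEMENTED_IF checks.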
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h
index 96ea05f0a..3185868e9 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.h
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.h
@@ -169,6 +169,10 @@ private:
169 169
170 void UpdateDynamicStates(); 170 void UpdateDynamicStates();
171 171
172 void BeginTransformFeedback();
173
174 void EndTransformFeedback();
175
172 bool WalkAttachmentOverlaps(const CachedSurfaceView& attachment); 176 bool WalkAttachmentOverlaps(const CachedSurfaceView& attachment);
173 177
174 void SetupVertexArrays(FixedPipelineState::VertexInput& vertex_input, 178 void SetupVertexArrays(FixedPipelineState::VertexInput& vertex_input,
@@ -249,6 +253,7 @@ private:
249 VKStagingBufferPool staging_pool; 253 VKStagingBufferPool staging_pool;
250 VKDescriptorPool descriptor_pool; 254 VKDescriptorPool descriptor_pool;
251 VKUpdateDescriptorQueue update_descriptor_queue; 255 VKUpdateDescriptorQueue update_descriptor_queue;
256 VKRenderPassCache renderpass_cache;
252 QuadArrayPass quad_array_pass; 257 QuadArrayPass quad_array_pass;
253 Uint8Pass uint8_pass; 258 Uint8Pass uint8_pass;
254 259
diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
index cfcca5af0..51ecb5567 100644
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
@@ -5,7 +5,9 @@
5#include <functional> 5#include <functional>
6#include <limits> 6#include <limits>
7#include <map> 7#include <map>
8#include <optional>
8#include <type_traits> 9#include <type_traits>
10#include <unordered_map>
9#include <utility> 11#include <utility>
10 12
11#include <fmt/format.h> 13#include <fmt/format.h>
@@ -24,6 +26,7 @@
24#include "video_core/renderer_vulkan/vk_shader_decompiler.h" 26#include "video_core/renderer_vulkan/vk_shader_decompiler.h"
25#include "video_core/shader/node.h" 27#include "video_core/shader/node.h"
26#include "video_core/shader/shader_ir.h" 28#include "video_core/shader/shader_ir.h"
29#include "video_core/shader/transform_feedback.h"
27 30
28namespace Vulkan { 31namespace Vulkan {
29 32
@@ -93,6 +96,12 @@ struct VertexIndices {
93 std::optional<u32> clip_distances; 96 std::optional<u32> clip_distances;
94}; 97};
95 98
99struct GenericVaryingDescription {
100 Id id = nullptr;
101 u32 first_element = 0;
102 bool is_scalar = false;
103};
104
96spv::Dim GetSamplerDim(const Sampler& sampler) { 105spv::Dim GetSamplerDim(const Sampler& sampler) {
97 ASSERT(!sampler.IsBuffer()); 106 ASSERT(!sampler.IsBuffer());
98 switch (sampler.GetType()) { 107 switch (sampler.GetType()) {
@@ -266,9 +275,13 @@ bool IsPrecise(Operation operand) {
266class SPIRVDecompiler final : public Sirit::Module { 275class SPIRVDecompiler final : public Sirit::Module {
267public: 276public:
268 explicit SPIRVDecompiler(const VKDevice& device, const ShaderIR& ir, ShaderType stage, 277 explicit SPIRVDecompiler(const VKDevice& device, const ShaderIR& ir, ShaderType stage,
269 const Specialization& specialization) 278 const Registry& registry, const Specialization& specialization)
270 : Module(0x00010300), device{device}, ir{ir}, stage{stage}, header{ir.GetHeader()}, 279 : Module(0x00010300), device{device}, ir{ir}, stage{stage}, header{ir.GetHeader()},
271 specialization{specialization} { 280 registry{registry}, specialization{specialization} {
281 if (stage != ShaderType::Compute) {
282 transform_feedback = BuildTransformFeedback(registry.GetGraphicsInfo());
283 }
284
272 AddCapability(spv::Capability::Shader); 285 AddCapability(spv::Capability::Shader);
273 AddCapability(spv::Capability::UniformAndStorageBuffer16BitAccess); 286 AddCapability(spv::Capability::UniformAndStorageBuffer16BitAccess);
274 AddCapability(spv::Capability::ImageQuery); 287 AddCapability(spv::Capability::ImageQuery);
@@ -286,6 +299,15 @@ public:
286 AddExtension("SPV_KHR_variable_pointers"); 299 AddExtension("SPV_KHR_variable_pointers");
287 AddExtension("SPV_KHR_shader_draw_parameters"); 300 AddExtension("SPV_KHR_shader_draw_parameters");
288 301
302 if (!transform_feedback.empty()) {
303 if (device.IsExtTransformFeedbackSupported()) {
304 AddCapability(spv::Capability::TransformFeedback);
305 } else {
 306 LOG_ERROR(Render_Vulkan, "Shader requires transform feedback but it is not "
 307 "supported on this device");

308 }
309 }
310
289 if (ir.UsesLayer() || ir.UsesViewportIndex()) { 311 if (ir.UsesLayer() || ir.UsesViewportIndex()) {
290 if (ir.UsesViewportIndex()) { 312 if (ir.UsesViewportIndex()) {
291 AddCapability(spv::Capability::MultiViewport); 313 AddCapability(spv::Capability::MultiViewport);
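The TransformFeedback capability is only advertised when the device reports the extension, hence the new IsExtTransformFeedbackSupported query. As a rough sketch of how such a query can be answered on the Vulkan side (illustrative only, this is not the code in vk_device.cpp), the EXT feature struct is chained into a features2 query:

#include <vulkan/vulkan.hpp>

// Returns true when the physical device exposes the transform feedback
// feature. Enabling VK_EXT_transform_feedback at device creation is still
// required and not shown here.
bool SupportsTransformFeedback(vk::PhysicalDevice physical, const vk::DispatchLoaderDynamic& dld) {
    vk::PhysicalDeviceTransformFeedbackFeaturesEXT tfb_features;
    vk::PhysicalDeviceFeatures2 features;
    features.pNext = &tfb_features;
    physical.getFeatures2(&features, dld);
    return tfb_features.transformFeedback == VK_TRUE;
}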
@@ -296,7 +318,7 @@ public:
296 } 318 }
297 } 319 }
298 320
299 if (device.IsShaderStorageImageReadWithoutFormatSupported()) { 321 if (device.IsFormatlessImageLoadSupported()) {
300 AddCapability(spv::Capability::StorageImageReadWithoutFormat); 322 AddCapability(spv::Capability::StorageImageReadWithoutFormat);
301 } 323 }
302 324
@@ -318,25 +340,29 @@ public:
318 AddExecutionMode(main, spv::ExecutionMode::OutputVertices, 340 AddExecutionMode(main, spv::ExecutionMode::OutputVertices,
319 header.common2.threads_per_input_primitive); 341 header.common2.threads_per_input_primitive);
320 break; 342 break;
321 case ShaderType::TesselationEval: 343 case ShaderType::TesselationEval: {
344 const auto& info = registry.GetGraphicsInfo();
322 AddCapability(spv::Capability::Tessellation); 345 AddCapability(spv::Capability::Tessellation);
323 AddEntryPoint(spv::ExecutionModel::TessellationEvaluation, main, "main", interfaces); 346 AddEntryPoint(spv::ExecutionModel::TessellationEvaluation, main, "main", interfaces);
324 AddExecutionMode(main, GetExecutionMode(specialization.tessellation.primitive)); 347 AddExecutionMode(main, GetExecutionMode(info.tessellation_primitive));
325 AddExecutionMode(main, GetExecutionMode(specialization.tessellation.spacing)); 348 AddExecutionMode(main, GetExecutionMode(info.tessellation_spacing));
326 AddExecutionMode(main, specialization.tessellation.clockwise 349 AddExecutionMode(main, info.tessellation_clockwise
327 ? spv::ExecutionMode::VertexOrderCw 350 ? spv::ExecutionMode::VertexOrderCw
328 : spv::ExecutionMode::VertexOrderCcw); 351 : spv::ExecutionMode::VertexOrderCcw);
329 break; 352 break;
330 case ShaderType::Geometry: 353 }
354 case ShaderType::Geometry: {
355 const auto& info = registry.GetGraphicsInfo();
331 AddCapability(spv::Capability::Geometry); 356 AddCapability(spv::Capability::Geometry);
332 AddEntryPoint(spv::ExecutionModel::Geometry, main, "main", interfaces); 357 AddEntryPoint(spv::ExecutionModel::Geometry, main, "main", interfaces);
333 AddExecutionMode(main, GetExecutionMode(specialization.primitive_topology)); 358 AddExecutionMode(main, GetExecutionMode(info.primitive_topology));
334 AddExecutionMode(main, GetExecutionMode(header.common3.output_topology)); 359 AddExecutionMode(main, GetExecutionMode(header.common3.output_topology));
335 AddExecutionMode(main, spv::ExecutionMode::OutputVertices, 360 AddExecutionMode(main, spv::ExecutionMode::OutputVertices,
336 header.common4.max_output_vertices); 361 header.common4.max_output_vertices);
337 // TODO(Rodrigo): Where can we get this info from? 362 // TODO(Rodrigo): Where can we get this info from?
338 AddExecutionMode(main, spv::ExecutionMode::Invocations, 1U); 363 AddExecutionMode(main, spv::ExecutionMode::Invocations, 1U);
339 break; 364 break;
365 }
340 case ShaderType::Fragment: 366 case ShaderType::Fragment:
341 AddEntryPoint(spv::ExecutionModel::Fragment, main, "main", interfaces); 367 AddEntryPoint(spv::ExecutionModel::Fragment, main, "main", interfaces);
342 AddExecutionMode(main, spv::ExecutionMode::OriginUpperLeft); 368 AddExecutionMode(main, spv::ExecutionMode::OriginUpperLeft);
@@ -545,7 +571,8 @@ private:
545 if (stage != ShaderType::Geometry) { 571 if (stage != ShaderType::Geometry) {
546 return; 572 return;
547 } 573 }
548 const u32 num_input = GetNumPrimitiveTopologyVertices(specialization.primitive_topology); 574 const auto& info = registry.GetGraphicsInfo();
575 const u32 num_input = GetNumPrimitiveTopologyVertices(info.primitive_topology);
549 DeclareInputVertexArray(num_input); 576 DeclareInputVertexArray(num_input);
550 DeclareOutputVertex(); 577 DeclareOutputVertex();
551 } 578 }
@@ -742,12 +769,34 @@ private:
742 } 769 }
743 770
744 void DeclareOutputAttributes() { 771 void DeclareOutputAttributes() {
772 if (stage == ShaderType::Compute || stage == ShaderType::Fragment) {
773 return;
774 }
775
776 UNIMPLEMENTED_IF(registry.GetGraphicsInfo().tfb_enabled && stage != ShaderType::Vertex);
745 for (const auto index : ir.GetOutputAttributes()) { 777 for (const auto index : ir.GetOutputAttributes()) {
746 if (!IsGenericAttribute(index)) { 778 if (!IsGenericAttribute(index)) {
747 continue; 779 continue;
748 } 780 }
749 const u32 location = GetGenericAttributeLocation(index); 781 DeclareOutputAttribute(index);
750 Id type = t_float4; 782 }
783 }
784
785 void DeclareOutputAttribute(Attribute::Index index) {
786 static constexpr std::string_view swizzle = "xyzw";
787
788 const u32 location = GetGenericAttributeLocation(index);
789 u8 element = 0;
790 while (element < 4) {
791 const std::size_t remainder = 4 - element;
792
793 std::size_t num_components = remainder;
794 const std::optional tfb = GetTransformFeedbackInfo(index, element);
795 if (tfb) {
796 num_components = tfb->components;
797 }
798
799 Id type = GetTypeVectorDefinitionLut(Type::Float).at(num_components - 1);
751 Id varying_default = v_varying_default; 800 Id varying_default = v_varying_default;
752 if (IsOutputAttributeArray()) { 801 if (IsOutputAttributeArray()) {
753 const u32 num = GetNumOutputVertices(); 802 const u32 num = GetNumOutputVertices();
@@ -760,13 +809,45 @@ private:
760 } 809 }
761 type = TypePointer(spv::StorageClass::Output, type); 810 type = TypePointer(spv::StorageClass::Output, type);
762 811
812 std::string name = fmt::format("out_attr{}", location);
813 if (num_components < 4 || element > 0) {
814 name = fmt::format("{}_{}", name, swizzle.substr(element, num_components));
815 }
816
763 const Id id = OpVariable(type, spv::StorageClass::Output, varying_default); 817 const Id id = OpVariable(type, spv::StorageClass::Output, varying_default);
764 Name(AddGlobalVariable(id), fmt::format("out_attr{}", location)); 818 Name(AddGlobalVariable(id), name);
765 output_attributes.emplace(index, id); 819
820 GenericVaryingDescription description;
821 description.id = id;
822 description.first_element = element;
823 description.is_scalar = num_components == 1;
824 for (u32 i = 0; i < num_components; ++i) {
825 const u8 offset = static_cast<u8>(static_cast<u32>(index) * 4 + element + i);
826 output_attributes.emplace(offset, description);
827 }
766 interfaces.push_back(id); 828 interfaces.push_back(id);
767 829
768 Decorate(id, spv::Decoration::Location, location); 830 Decorate(id, spv::Decoration::Location, location);
831 if (element > 0) {
832 Decorate(id, spv::Decoration::Component, static_cast<u32>(element));
833 }
834 if (tfb && device.IsExtTransformFeedbackSupported()) {
835 Decorate(id, spv::Decoration::XfbBuffer, static_cast<u32>(tfb->buffer));
836 Decorate(id, spv::Decoration::XfbStride, static_cast<u32>(tfb->stride));
837 Decorate(id, spv::Decoration::Offset, static_cast<u32>(tfb->offset));
838 }
839
840 element = static_cast<u8>(static_cast<std::size_t>(element) + num_components);
841 }
842 }
843
844 std::optional<VaryingTFB> GetTransformFeedbackInfo(Attribute::Index index, u8 element = 0) {
845 const u8 location = static_cast<u8>(static_cast<u32>(index) * 4 + element);
846 const auto it = transform_feedback.find(location);
847 if (it == transform_feedback.end()) {
848 return {};
769 } 849 }
850 return it->second;
770 } 851 }
771 852
772 u32 DeclareConstantBuffers(u32 binding) { 853 u32 DeclareConstantBuffers(u32 binding) {
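DeclareOutputAttribute above no longer declares every generic varying as a single vec4: it walks the four components and carves out contiguous pieces whose sizes come from the transform feedback map, so each piece can carry its own Location/Component and XfbBuffer/XfbStride/Offset decorations. The splitting logic in isolation (VaryingTFB here is an illustrative stand-in for the struct declared in transform_feedback.h, with only the components field used):

#include <cstddef>
#include <cstdint>
#include <unordered_map>
#include <vector>

struct VaryingTFB {
    std::size_t components = 0; // captured components starting at this location and element
};

// Returns the component counts of the pieces a four-component varying at
// "location" is split into, e.g. {2, 1, 1} when elements 0-1 form one capture
// and elements 2 and 3 stand alone.
std::vector<std::size_t> SplitVarying(const std::unordered_map<std::uint8_t, VaryingTFB>& tfb,
                                      std::uint32_t location) {
    std::vector<std::size_t> pieces;
    std::uint32_t element = 0;
    while (element < 4) {
        std::size_t num_components = 4 - element; // default: the rest of the vector
        const auto it = tfb.find(static_cast<std::uint8_t>(location * 4 + element));
        if (it != tfb.end()) {
            num_components = it->second.components; // clamp to the captured slice
        }
        pieces.push_back(num_components);
        element += static_cast<std::uint32_t>(num_components);
    }
    return pieces;
}

Every component covered by a piece maps back to the same GenericVaryingDescription, which is how the output-attribute lookup later finds the right scalar or sub-vector to store into.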
@@ -898,7 +979,7 @@ private:
898 u32 GetNumInputVertices() const { 979 u32 GetNumInputVertices() const {
899 switch (stage) { 980 switch (stage) {
900 case ShaderType::Geometry: 981 case ShaderType::Geometry:
901 return GetNumPrimitiveTopologyVertices(specialization.primitive_topology); 982 return GetNumPrimitiveTopologyVertices(registry.GetGraphicsInfo().primitive_topology);
902 case ShaderType::TesselationControl: 983 case ShaderType::TesselationControl:
903 case ShaderType::TesselationEval: 984 case ShaderType::TesselationEval:
904 return NumInputPatches; 985 return NumInputPatches;
@@ -1346,8 +1427,14 @@ private:
1346 } 1427 }
1347 default: 1428 default:
1348 if (IsGenericAttribute(attribute)) { 1429 if (IsGenericAttribute(attribute)) {
1349 const Id composite = output_attributes.at(attribute); 1430 const u8 offset = static_cast<u8>(static_cast<u8>(attribute) * 4 + element);
1350 return {ArrayPass(t_out_float, composite, {element}), Type::Float}; 1431 const GenericVaryingDescription description = output_attributes.at(offset);
1432 const Id composite = description.id;
1433 std::vector<u32> indices;
1434 if (!description.is_scalar) {
1435 indices.push_back(element - description.first_element);
1436 }
1437 return {ArrayPass(t_out_float, composite, indices), Type::Float};
1351 } 1438 }
1352 UNIMPLEMENTED_MSG("Unhandled output attribute: {}", 1439 UNIMPLEMENTED_MSG("Unhandled output attribute: {}",
1353 static_cast<u32>(attribute)); 1440 static_cast<u32>(attribute));
@@ -1793,7 +1880,7 @@ private:
1793 } 1880 }
1794 1881
1795 Expression ImageLoad(Operation operation) { 1882 Expression ImageLoad(Operation operation) {
1796 if (!device.IsShaderStorageImageReadWithoutFormatSupported()) { 1883 if (!device.IsFormatlessImageLoadSupported()) {
1797 return {v_float_zero, Type::Float}; 1884 return {v_float_zero, Type::Float};
1798 } 1885 }
1799 1886
@@ -2258,11 +2345,11 @@ private:
2258 std::array<Id, 4> GetTypeVectorDefinitionLut(Type type) const { 2345 std::array<Id, 4> GetTypeVectorDefinitionLut(Type type) const {
2259 switch (type) { 2346 switch (type) {
2260 case Type::Float: 2347 case Type::Float:
2261 return {nullptr, t_float2, t_float3, t_float4}; 2348 return {t_float, t_float2, t_float3, t_float4};
2262 case Type::Int: 2349 case Type::Int:
2263 return {nullptr, t_int2, t_int3, t_int4}; 2350 return {t_int, t_int2, t_int3, t_int4};
2264 case Type::Uint: 2351 case Type::Uint:
2265 return {nullptr, t_uint2, t_uint3, t_uint4}; 2352 return {t_uint, t_uint2, t_uint3, t_uint4};
2266 default: 2353 default:
2267 UNIMPLEMENTED(); 2354 UNIMPLEMENTED();
2268 return {}; 2355 return {};
@@ -2495,7 +2582,9 @@ private:
2495 const ShaderIR& ir; 2582 const ShaderIR& ir;
2496 const ShaderType stage; 2583 const ShaderType stage;
2497 const Tegra::Shader::Header header; 2584 const Tegra::Shader::Header header;
2585 const Registry& registry;
2498 const Specialization& specialization; 2586 const Specialization& specialization;
2587 std::unordered_map<u8, VaryingTFB> transform_feedback;
2499 2588
2500 const Id t_void = Name(TypeVoid(), "void"); 2589 const Id t_void = Name(TypeVoid(), "void");
2501 2590
@@ -2584,7 +2673,7 @@ private:
2584 Id shared_memory{}; 2673 Id shared_memory{};
2585 std::array<Id, INTERNAL_FLAGS_COUNT> internal_flags{}; 2674 std::array<Id, INTERNAL_FLAGS_COUNT> internal_flags{};
2586 std::map<Attribute::Index, Id> input_attributes; 2675 std::map<Attribute::Index, Id> input_attributes;
2587 std::map<Attribute::Index, Id> output_attributes; 2676 std::unordered_map<u8, GenericVaryingDescription> output_attributes;
2588 std::map<u32, Id> constant_buffers; 2677 std::map<u32, Id> constant_buffers;
2589 std::map<GlobalMemoryBase, Id> global_buffers; 2678 std::map<GlobalMemoryBase, Id> global_buffers;
2590 std::map<u32, TexelBuffer> texel_buffers; 2679 std::map<u32, TexelBuffer> texel_buffers;
@@ -2870,8 +2959,9 @@ ShaderEntries GenerateShaderEntries(const VideoCommon::Shader::ShaderIR& ir) {
2870} 2959}
2871 2960
2872std::vector<u32> Decompile(const VKDevice& device, const VideoCommon::Shader::ShaderIR& ir, 2961std::vector<u32> Decompile(const VKDevice& device, const VideoCommon::Shader::ShaderIR& ir,
2873 ShaderType stage, const Specialization& specialization) { 2962 ShaderType stage, const VideoCommon::Shader::Registry& registry,
2874 return SPIRVDecompiler(device, ir, stage, specialization).Assemble(); 2963 const Specialization& specialization) {
2964 return SPIRVDecompiler(device, ir, stage, registry, specialization).Assemble();
2875} 2965}
2876 2966
2877} // namespace Vulkan 2967} // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.h b/src/video_core/renderer_vulkan/vk_shader_decompiler.h
index f5dc14d9e..ffea4709e 100644
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.h
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.h
@@ -15,6 +15,7 @@
15#include "common/common_types.h" 15#include "common/common_types.h"
16#include "video_core/engines/maxwell_3d.h" 16#include "video_core/engines/maxwell_3d.h"
17#include "video_core/engines/shader_type.h" 17#include "video_core/engines/shader_type.h"
18#include "video_core/shader/registry.h"
18#include "video_core/shader/shader_ir.h" 19#include "video_core/shader/shader_ir.h"
19 20
20namespace Vulkan { 21namespace Vulkan {
@@ -91,17 +92,9 @@ struct Specialization final {
91 u32 shared_memory_size{}; 92 u32 shared_memory_size{};
92 93
93 // Graphics specific 94 // Graphics specific
94 Maxwell::PrimitiveTopology primitive_topology{};
95 std::optional<float> point_size{}; 95 std::optional<float> point_size{};
96 std::array<Maxwell::VertexAttribute::Type, Maxwell::NumVertexAttributes> attribute_types{}; 96 std::array<Maxwell::VertexAttribute::Type, Maxwell::NumVertexAttributes> attribute_types{};
97 bool ndc_minus_one_to_one{}; 97 bool ndc_minus_one_to_one{};
98
99 // Tessellation specific
100 struct {
101 Maxwell::TessellationPrimitive primitive{};
102 Maxwell::TessellationSpacing spacing{};
103 bool clockwise{};
104 } tessellation;
105}; 98};
106// Old gcc versions don't consider this trivially copyable. 99// Old gcc versions don't consider this trivially copyable.
107// static_assert(std::is_trivially_copyable_v<Specialization>); 100// static_assert(std::is_trivially_copyable_v<Specialization>);
@@ -114,6 +107,8 @@ struct SPIRVShader {
114ShaderEntries GenerateShaderEntries(const VideoCommon::Shader::ShaderIR& ir); 107ShaderEntries GenerateShaderEntries(const VideoCommon::Shader::ShaderIR& ir);
115 108
116std::vector<u32> Decompile(const VKDevice& device, const VideoCommon::Shader::ShaderIR& ir, 109std::vector<u32> Decompile(const VKDevice& device, const VideoCommon::Shader::ShaderIR& ir,
117 Tegra::Engines::ShaderType stage, const Specialization& specialization); 110 Tegra::Engines::ShaderType stage,
111 const VideoCommon::Shader::Registry& registry,
112 const Specialization& specialization);
118 113
119} // namespace Vulkan 114} // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp
index d9ea3cc21..374959f82 100644
--- a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp
+++ b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp
@@ -100,7 +100,6 @@ void VKStagingBufferPool::ReleaseCache(bool host_visible) {
100} 100}
101 101
102u64 VKStagingBufferPool::ReleaseLevel(StagingBuffersCache& cache, std::size_t log2) { 102u64 VKStagingBufferPool::ReleaseLevel(StagingBuffersCache& cache, std::size_t log2) {
103 static constexpr u64 epochs_to_destroy = 180;
104 static constexpr std::size_t deletions_per_tick = 16; 103 static constexpr std::size_t deletions_per_tick = 16;
105 104
106 auto& staging = cache[log2]; 105 auto& staging = cache[log2];
@@ -108,6 +107,7 @@ u64 VKStagingBufferPool::ReleaseLevel(StagingBuffersCache& cache, std::size_t lo
108 const std::size_t old_size = entries.size(); 107 const std::size_t old_size = entries.size();
109 108
110 const auto is_deleteable = [this](const auto& entry) { 109 const auto is_deleteable = [this](const auto& entry) {
110 static constexpr u64 epochs_to_destroy = 180;
111 return entry.last_epoch + epochs_to_destroy < epoch && !entry.watch.IsUsed(); 111 return entry.last_epoch + epochs_to_destroy < epoch && !entry.watch.IsUsed();
112 }; 112 };
113 const std::size_t begin_offset = staging.delete_index; 113 const std::size_t begin_offset = staging.delete_index;
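The epochs_to_destroy constant only moves into the predicate that uses it; the reclamation rule itself is unchanged: an entry is destroyed once it has gone unused for a fixed number of scheduler epochs. The rule in isolation (is_used stands in for the fence watch the real pool consults):

#include <cstdint>

// True when a cached staging buffer may be destroyed: it was last touched at
// least epochs_to_destroy epochs ago and no in-flight work still references it.
bool IsDeleteable(std::uint64_t last_epoch, std::uint64_t current_epoch, bool is_used) {
    static constexpr std::uint64_t epochs_to_destroy = 180;
    return last_epoch + epochs_to_destroy < current_epoch && !is_used;
}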
diff --git a/src/video_core/renderer_vulkan/vk_state_tracker.cpp b/src/video_core/renderer_vulkan/vk_state_tracker.cpp
index d74e68b63..94a89e388 100644
--- a/src/video_core/renderer_vulkan/vk_state_tracker.cpp
+++ b/src/video_core/renderer_vulkan/vk_state_tracker.cpp
@@ -90,8 +90,6 @@ void StateTracker::Initialize() {
90 SetupDirtyBlendConstants(tables); 90 SetupDirtyBlendConstants(tables);
91 SetupDirtyDepthBounds(tables); 91 SetupDirtyDepthBounds(tables);
92 SetupDirtyStencilProperties(tables); 92 SetupDirtyStencilProperties(tables);
93
94 SetupCommonOnWriteStores(dirty.on_write_stores);
95} 93}
96 94
97void StateTracker::InvalidateCommandBufferState() { 95void StateTracker::InvalidateCommandBufferState() {
diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
index 73d92a5ae..26175921b 100644
--- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
@@ -52,6 +52,9 @@ vk::ImageType SurfaceTargetToImage(SurfaceTarget target) {
52 return vk::ImageType::e2D; 52 return vk::ImageType::e2D;
53 case SurfaceTarget::Texture3D: 53 case SurfaceTarget::Texture3D:
54 return vk::ImageType::e3D; 54 return vk::ImageType::e3D;
55 case SurfaceTarget::TextureBuffer:
56 UNREACHABLE();
57 return {};
55 } 58 }
56 UNREACHABLE_MSG("Unknown texture target={}", static_cast<u32>(target)); 59 UNREACHABLE_MSG("Unknown texture target={}", static_cast<u32>(target));
57 return {}; 60 return {};
@@ -273,7 +276,6 @@ void CachedSurface::UploadImage(const std::vector<u8>& staging_buffer) {
273 276
274 for (u32 level = 0; level < params.num_levels; ++level) { 277 for (u32 level = 0; level < params.num_levels; ++level) {
275 vk::BufferImageCopy copy = GetBufferImageCopy(level); 278 vk::BufferImageCopy copy = GetBufferImageCopy(level);
276 const auto& dld = device.GetDispatchLoader();
277 if (image->GetAspectMask() == 279 if (image->GetAspectMask() ==
278 (vk::ImageAspectFlagBits::eDepth | vk::ImageAspectFlagBits::eStencil)) { 280 (vk::ImageAspectFlagBits::eDepth | vk::ImageAspectFlagBits::eStencil)) {
279 vk::BufferImageCopy depth = copy; 281 vk::BufferImageCopy depth = copy;
@@ -422,7 +424,6 @@ void VKTextureCache::ImageCopy(Surface& src_surface, Surface& dst_surface,
422 dst_base_layer, num_layers, copy_params.dest_level, 1, vk::PipelineStageFlagBits::eTransfer, 424 dst_base_layer, num_layers, copy_params.dest_level, 1, vk::PipelineStageFlagBits::eTransfer,
423 vk::AccessFlagBits::eTransferWrite, vk::ImageLayout::eTransferDstOptimal); 425 vk::AccessFlagBits::eTransferWrite, vk::ImageLayout::eTransferDstOptimal);
424 426
425 const auto& dld{device.GetDispatchLoader()};
426 const vk::ImageSubresourceLayers src_subresource( 427 const vk::ImageSubresourceLayers src_subresource(
427 src_surface->GetAspectMask(), copy_params.source_level, copy_params.source_z, num_layers); 428 src_surface->GetAspectMask(), copy_params.source_level, copy_params.source_z, num_layers);
428 const vk::ImageSubresourceLayers dst_subresource( 429 const vk::ImageSubresourceLayers dst_subresource(
@@ -458,7 +459,6 @@ void VKTextureCache::ImageBlit(View& src_view, View& dst_view,
458 dst_view->GetImageSubresourceLayers(), {dst_top_left, dst_bot_right}); 459 dst_view->GetImageSubresourceLayers(), {dst_top_left, dst_bot_right});
459 const bool is_linear = copy_config.filter == Tegra::Engines::Fermi2D::Filter::Linear; 460 const bool is_linear = copy_config.filter == Tegra::Engines::Fermi2D::Filter::Linear;
460 461
461 const auto& dld{device.GetDispatchLoader()};
462 scheduler.Record([src_image = src_view->GetImage(), dst_image = dst_view->GetImage(), blit, 462 scheduler.Record([src_image = src_view->GetImage(), dst_image = dst_view->GetImage(), blit,
463 is_linear](auto cmdbuf, auto& dld) { 463 is_linear](auto cmdbuf, auto& dld) {
464 cmdbuf.blitImage(src_image, vk::ImageLayout::eTransferSrcOptimal, dst_image, 464 cmdbuf.blitImage(src_image, vk::ImageLayout::eTransferSrcOptimal, dst_image,
diff --git a/src/video_core/shader/const_buffer_locker.cpp b/src/video_core/shader/const_buffer_locker.cpp
deleted file mode 100644
index 0638be8cb..000000000
--- a/src/video_core/shader/const_buffer_locker.cpp
+++ /dev/null
@@ -1,126 +0,0 @@
1// Copyright 2019 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#include <algorithm>
6#include <tuple>
7
8#include "common/common_types.h"
9#include "video_core/engines/maxwell_3d.h"
10#include "video_core/engines/shader_type.h"
11#include "video_core/shader/const_buffer_locker.h"
12
13namespace VideoCommon::Shader {
14
15using Tegra::Engines::SamplerDescriptor;
16
17ConstBufferLocker::ConstBufferLocker(Tegra::Engines::ShaderType shader_stage)
18 : stage{shader_stage} {}
19
20ConstBufferLocker::ConstBufferLocker(Tegra::Engines::ShaderType shader_stage,
21 Tegra::Engines::ConstBufferEngineInterface& engine)
22 : stage{shader_stage}, engine{&engine} {}
23
24ConstBufferLocker::~ConstBufferLocker() = default;
25
26std::optional<u32> ConstBufferLocker::ObtainKey(u32 buffer, u32 offset) {
27 const std::pair<u32, u32> key = {buffer, offset};
28 const auto iter = keys.find(key);
29 if (iter != keys.end()) {
30 return iter->second;
31 }
32 if (!engine) {
33 return std::nullopt;
34 }
35 const u32 value = engine->AccessConstBuffer32(stage, buffer, offset);
36 keys.emplace(key, value);
37 return value;
38}
39
40std::optional<SamplerDescriptor> ConstBufferLocker::ObtainBoundSampler(u32 offset) {
41 const u32 key = offset;
42 const auto iter = bound_samplers.find(key);
43 if (iter != bound_samplers.end()) {
44 return iter->second;
45 }
46 if (!engine) {
47 return std::nullopt;
48 }
49 const SamplerDescriptor value = engine->AccessBoundSampler(stage, offset);
50 bound_samplers.emplace(key, value);
51 return value;
52}
53
54std::optional<Tegra::Engines::SamplerDescriptor> ConstBufferLocker::ObtainBindlessSampler(
55 u32 buffer, u32 offset) {
56 const std::pair key = {buffer, offset};
57 const auto iter = bindless_samplers.find(key);
58 if (iter != bindless_samplers.end()) {
59 return iter->second;
60 }
61 if (!engine) {
62 return std::nullopt;
63 }
64 const SamplerDescriptor value = engine->AccessBindlessSampler(stage, buffer, offset);
65 bindless_samplers.emplace(key, value);
66 return value;
67}
68
69std::optional<u32> ConstBufferLocker::ObtainBoundBuffer() {
70 if (bound_buffer_saved) {
71 return bound_buffer;
72 }
73 if (!engine) {
74 return std::nullopt;
75 }
76 bound_buffer_saved = true;
77 bound_buffer = engine->GetBoundBuffer();
78 return bound_buffer;
79}
80
81void ConstBufferLocker::InsertKey(u32 buffer, u32 offset, u32 value) {
82 keys.insert_or_assign({buffer, offset}, value);
83}
84
85void ConstBufferLocker::InsertBoundSampler(u32 offset, SamplerDescriptor sampler) {
86 bound_samplers.insert_or_assign(offset, sampler);
87}
88
89void ConstBufferLocker::InsertBindlessSampler(u32 buffer, u32 offset, SamplerDescriptor sampler) {
90 bindless_samplers.insert_or_assign({buffer, offset}, sampler);
91}
92
93void ConstBufferLocker::SetBoundBuffer(u32 buffer) {
94 bound_buffer_saved = true;
95 bound_buffer = buffer;
96}
97
98bool ConstBufferLocker::IsConsistent() const {
99 if (!engine) {
100 return false;
101 }
102 return std::all_of(keys.begin(), keys.end(),
103 [this](const auto& pair) {
104 const auto [cbuf, offset] = pair.first;
105 const auto value = pair.second;
106 return value == engine->AccessConstBuffer32(stage, cbuf, offset);
107 }) &&
108 std::all_of(bound_samplers.begin(), bound_samplers.end(),
109 [this](const auto& sampler) {
110 const auto [key, value] = sampler;
111 return value == engine->AccessBoundSampler(stage, key);
112 }) &&
113 std::all_of(bindless_samplers.begin(), bindless_samplers.end(),
114 [this](const auto& sampler) {
115 const auto [cbuf, offset] = sampler.first;
116 const auto value = sampler.second;
117 return value == engine->AccessBindlessSampler(stage, cbuf, offset);
118 });
119}
120
121bool ConstBufferLocker::HasEqualKeys(const ConstBufferLocker& rhs) const {
122 return std::tie(keys, bound_samplers, bindless_samplers) ==
123 std::tie(rhs.keys, rhs.bound_samplers, rhs.bindless_samplers);
124}
125
126} // namespace VideoCommon::Shader
diff --git a/src/video_core/shader/const_buffer_locker.h b/src/video_core/shader/const_buffer_locker.h
deleted file mode 100644
index d3ea11087..000000000
--- a/src/video_core/shader/const_buffer_locker.h
+++ /dev/null
@@ -1,103 +0,0 @@
1// Copyright 2019 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#pragma once
6
7#include <optional>
8#include <unordered_map>
9#include "common/common_types.h"
10#include "common/hash.h"
11#include "video_core/engines/const_buffer_engine_interface.h"
12#include "video_core/engines/shader_type.h"
13#include "video_core/guest_driver.h"
14
15namespace VideoCommon::Shader {
16
17using KeyMap = std::unordered_map<std::pair<u32, u32>, u32, Common::PairHash>;
18using BoundSamplerMap = std::unordered_map<u32, Tegra::Engines::SamplerDescriptor>;
19using BindlessSamplerMap =
20 std::unordered_map<std::pair<u32, u32>, Tegra::Engines::SamplerDescriptor, Common::PairHash>;
21
22/**
23 * The ConstBufferLocker is a class use to interface the 3D and compute engines with the shader
24 * compiler. with it, the shader can obtain required data from GPU state and store it for disk
25 * shader compilation.
26 */
27class ConstBufferLocker {
28public:
29 explicit ConstBufferLocker(Tegra::Engines::ShaderType shader_stage);
30
31 explicit ConstBufferLocker(Tegra::Engines::ShaderType shader_stage,
32 Tegra::Engines::ConstBufferEngineInterface& engine);
33
34 ~ConstBufferLocker();
35
36 /// Retrieves a key from the locker, if it's registered, it will give the registered value, if
37 /// not it will obtain it from maxwell3d and register it.
38 std::optional<u32> ObtainKey(u32 buffer, u32 offset);
39
40 std::optional<Tegra::Engines::SamplerDescriptor> ObtainBoundSampler(u32 offset);
41
42 std::optional<Tegra::Engines::SamplerDescriptor> ObtainBindlessSampler(u32 buffer, u32 offset);
43
44 std::optional<u32> ObtainBoundBuffer();
45
46 /// Inserts a key.
47 void InsertKey(u32 buffer, u32 offset, u32 value);
48
49 /// Inserts a bound sampler key.
50 void InsertBoundSampler(u32 offset, Tegra::Engines::SamplerDescriptor sampler);
51
52 /// Inserts a bindless sampler key.
53 void InsertBindlessSampler(u32 buffer, u32 offset, Tegra::Engines::SamplerDescriptor sampler);
54
55 /// Set the bound buffer for this locker.
56 void SetBoundBuffer(u32 buffer);
57
58 /// Checks keys and samplers against engine's current const buffers. Returns true if they are
59 /// the same value, false otherwise;
60 bool IsConsistent() const;
61
62 /// Returns true if the keys are equal to the other ones in the locker.
63 bool HasEqualKeys(const ConstBufferLocker& rhs) const;
64
65 /// Gives an getter to the const buffer keys in the database.
66 const KeyMap& GetKeys() const {
67 return keys;
68 }
69
70 /// Gets samplers database.
71 const BoundSamplerMap& GetBoundSamplers() const {
72 return bound_samplers;
73 }
74
75 /// Gets bindless samplers database.
76 const BindlessSamplerMap& GetBindlessSamplers() const {
77 return bindless_samplers;
78 }
79
80 /// Gets bound buffer used on this shader
81 u32 GetBoundBuffer() const {
82 return bound_buffer;
83 }
84
85 /// Obtains access to the guest driver's profile.
86 VideoCore::GuestDriverProfile* AccessGuestDriverProfile() const {
87 if (engine) {
88 return &engine->AccessGuestDriverProfile();
89 }
90 return nullptr;
91 }
92
93private:
94 const Tegra::Engines::ShaderType stage;
95 Tegra::Engines::ConstBufferEngineInterface* engine = nullptr;
96 KeyMap keys;
97 BoundSamplerMap bound_samplers;
98 BindlessSamplerMap bindless_samplers;
99 bool bound_buffer_saved{};
100 u32 bound_buffer{};
101};
102
103} // namespace VideoCommon::Shader
diff --git a/src/video_core/shader/control_flow.cpp b/src/video_core/shader/control_flow.cpp
index 0229733b6..2e2711350 100644
--- a/src/video_core/shader/control_flow.cpp
+++ b/src/video_core/shader/control_flow.cpp
@@ -13,6 +13,7 @@
13#include "common/common_types.h" 13#include "common/common_types.h"
14#include "video_core/shader/ast.h" 14#include "video_core/shader/ast.h"
15#include "video_core/shader/control_flow.h" 15#include "video_core/shader/control_flow.h"
16#include "video_core/shader/registry.h"
16#include "video_core/shader/shader_ir.h" 17#include "video_core/shader/shader_ir.h"
17 18
18namespace VideoCommon::Shader { 19namespace VideoCommon::Shader {
@@ -64,11 +65,11 @@ struct BlockInfo {
64}; 65};
65 66
66struct CFGRebuildState { 67struct CFGRebuildState {
67 explicit CFGRebuildState(const ProgramCode& program_code, u32 start, ConstBufferLocker& locker) 68 explicit CFGRebuildState(const ProgramCode& program_code, u32 start, Registry& registry)
68 : program_code{program_code}, locker{locker}, start{start} {} 69 : program_code{program_code}, registry{registry}, start{start} {}
69 70
70 const ProgramCode& program_code; 71 const ProgramCode& program_code;
71 ConstBufferLocker& locker; 72 Registry& registry;
72 u32 start{}; 73 u32 start{};
73 std::vector<BlockInfo> block_info; 74 std::vector<BlockInfo> block_info;
74 std::list<u32> inspect_queries; 75 std::list<u32> inspect_queries;
@@ -438,7 +439,7 @@ std::pair<ParseResult, ParseInfo> ParseCode(CFGRebuildState& state, u32 address)
438 const s32 pc_target = offset + result.relative_position; 439 const s32 pc_target = offset + result.relative_position;
439 std::vector<CaseBranch> branches; 440 std::vector<CaseBranch> branches;
440 for (u32 i = 0; i < result.entries; i++) { 441 for (u32 i = 0; i < result.entries; i++) {
441 auto key = state.locker.ObtainKey(result.buffer, result.offset + i * 4); 442 auto key = state.registry.ObtainKey(result.buffer, result.offset + i * 4);
442 if (!key) { 443 if (!key) {
443 return {ParseResult::AbnormalFlow, parse_info}; 444 return {ParseResult::AbnormalFlow, parse_info};
444 } 445 }
@@ -656,14 +657,14 @@ void DecompileShader(CFGRebuildState& state) {
656 657
657std::unique_ptr<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u32 start_address, 658std::unique_ptr<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u32 start_address,
658 const CompilerSettings& settings, 659 const CompilerSettings& settings,
659 ConstBufferLocker& locker) { 660 Registry& registry) {
660 auto result_out = std::make_unique<ShaderCharacteristics>(); 661 auto result_out = std::make_unique<ShaderCharacteristics>();
661 if (settings.depth == CompileDepth::BruteForce) { 662 if (settings.depth == CompileDepth::BruteForce) {
662 result_out->settings.depth = CompileDepth::BruteForce; 663 result_out->settings.depth = CompileDepth::BruteForce;
663 return result_out; 664 return result_out;
664 } 665 }
665 666
666 CFGRebuildState state{program_code, start_address, locker}; 667 CFGRebuildState state{program_code, start_address, registry};
667 // Inspect Code and generate blocks 668 // Inspect Code and generate blocks
668 state.labels.clear(); 669 state.labels.clear();
669 state.labels.emplace(start_address); 670 state.labels.emplace(start_address);
diff --git a/src/video_core/shader/control_flow.h b/src/video_core/shader/control_flow.h
index 5304998b9..62a3510d8 100644
--- a/src/video_core/shader/control_flow.h
+++ b/src/video_core/shader/control_flow.h
@@ -12,6 +12,7 @@
12#include "video_core/engines/shader_bytecode.h" 12#include "video_core/engines/shader_bytecode.h"
13#include "video_core/shader/ast.h" 13#include "video_core/shader/ast.h"
14#include "video_core/shader/compiler_settings.h" 14#include "video_core/shader/compiler_settings.h"
15#include "video_core/shader/registry.h"
15#include "video_core/shader/shader_ir.h" 16#include "video_core/shader/shader_ir.h"
16 17
17namespace VideoCommon::Shader { 18namespace VideoCommon::Shader {
@@ -111,6 +112,6 @@ struct ShaderCharacteristics {
111 112
112std::unique_ptr<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u32 start_address, 113std::unique_ptr<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u32 start_address,
113 const CompilerSettings& settings, 114 const CompilerSettings& settings,
114 ConstBufferLocker& locker); 115 Registry& registry);
115 116
116} // namespace VideoCommon::Shader 117} // namespace VideoCommon::Shader
diff --git a/src/video_core/shader/decode.cpp b/src/video_core/shader/decode.cpp
index 6b697ed5d..87ac9ac6c 100644
--- a/src/video_core/shader/decode.cpp
+++ b/src/video_core/shader/decode.cpp
@@ -34,13 +34,9 @@ constexpr bool IsSchedInstruction(u32 offset, u32 main_offset) {
34 return (absolute_offset % SchedPeriod) == 0; 34 return (absolute_offset % SchedPeriod) == 0;
35} 35}
36 36
37void DeduceTextureHandlerSize(VideoCore::GuestDriverProfile* gpu_driver, 37void DeduceTextureHandlerSize(VideoCore::GuestDriverProfile& gpu_driver,
38 const std::list<Sampler>& used_samplers) { 38 const std::list<Sampler>& used_samplers) {
39 if (gpu_driver == nullptr) { 39 if (gpu_driver.IsTextureHandlerSizeKnown() || used_samplers.size() <= 1) {
40 LOG_CRITICAL(HW_GPU, "GPU driver profile has not been created yet");
41 return;
42 }
43 if (gpu_driver->TextureHandlerSizeKnown() || used_samplers.size() <= 1) {
44 return; 40 return;
45 } 41 }
46 u32 count{}; 42 u32 count{};
@@ -53,17 +49,13 @@ void DeduceTextureHandlerSize(VideoCore::GuestDriverProfile* gpu_driver,
53 bound_offsets.emplace_back(sampler.GetOffset()); 49 bound_offsets.emplace_back(sampler.GetOffset());
54 } 50 }
55 if (count > 1) { 51 if (count > 1) {
56 gpu_driver->DeduceTextureHandlerSize(std::move(bound_offsets)); 52 gpu_driver.DeduceTextureHandlerSize(std::move(bound_offsets));
57 } 53 }
58} 54}
59 55
60std::optional<u32> TryDeduceSamplerSize(const Sampler& sampler_to_deduce, 56std::optional<u32> TryDeduceSamplerSize(const Sampler& sampler_to_deduce,
61 VideoCore::GuestDriverProfile* gpu_driver, 57 VideoCore::GuestDriverProfile& gpu_driver,
62 const std::list<Sampler>& used_samplers) { 58 const std::list<Sampler>& used_samplers) {
63 if (gpu_driver == nullptr) {
64 LOG_CRITICAL(HW_GPU, "GPU Driver profile has not been created yet");
65 return std::nullopt;
66 }
67 const u32 base_offset = sampler_to_deduce.GetOffset(); 59 const u32 base_offset = sampler_to_deduce.GetOffset();
68 u32 max_offset{std::numeric_limits<u32>::max()}; 60 u32 max_offset{std::numeric_limits<u32>::max()};
69 for (const auto& sampler : used_samplers) { 61 for (const auto& sampler : used_samplers) {
@@ -77,7 +69,7 @@ std::optional<u32> TryDeduceSamplerSize(const Sampler& sampler_to_deduce,
77 if (max_offset == std::numeric_limits<u32>::max()) { 69 if (max_offset == std::numeric_limits<u32>::max()) {
78 return std::nullopt; 70 return std::nullopt;
79 } 71 }
80 return ((max_offset - base_offset) * 4) / gpu_driver->GetTextureHandlerSize(); 72 return ((max_offset - base_offset) * 4) / gpu_driver.GetTextureHandlerSize();
81} 73}
82 74
83} // Anonymous namespace 75} // Anonymous namespace
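TryDeduceSamplerSize above estimates how many array elements an indexed sampler spans by measuring the gap to the next known sampler offset in texture handler units. The same computation detached from the Sampler and GuestDriverProfile types (names are illustrative; the offsets are the 32-bit word offsets the real samplers report):

#include <algorithm>
#include <cstdint>
#include <limits>
#include <optional>
#include <vector>

std::optional<std::uint32_t> DeduceIndexedSamplerSize(std::uint32_t base_offset,
                                                      const std::vector<std::uint32_t>& used_offsets,
                                                      std::uint32_t texture_handler_size) {
    // Find the closest sampler that lives after the indexed one.
    std::uint32_t max_offset = std::numeric_limits<std::uint32_t>::max();
    for (const std::uint32_t offset : used_offsets) {
        if (offset > base_offset) {
            max_offset = std::min(max_offset, offset);
        }
    }
    if (max_offset == std::numeric_limits<std::uint32_t>::max()) {
        return std::nullopt; // nothing bounds the array, give up
    }
    // Offsets are in words, handler sizes in bytes, hence the factor of 4.
    return ((max_offset - base_offset) * 4) / texture_handler_size;
}

DeduceTextureHandlerSize works the other way around, presumably inferring the handler stride from the collected bound offsets; since the profile is always available now, both helpers take it by reference and the null-pointer checks were dropped.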
@@ -149,7 +141,7 @@ void ShaderIR::Decode() {
149 std::memcpy(&header, program_code.data(), sizeof(Tegra::Shader::Header)); 141 std::memcpy(&header, program_code.data(), sizeof(Tegra::Shader::Header));
150 142
151 decompiled = false; 143 decompiled = false;
152 auto info = ScanFlow(program_code, main_offset, settings, locker); 144 auto info = ScanFlow(program_code, main_offset, settings, registry);
153 auto& shader_info = *info; 145 auto& shader_info = *info;
154 coverage_begin = shader_info.start; 146 coverage_begin = shader_info.start;
155 coverage_end = shader_info.end; 147 coverage_end = shader_info.end;
@@ -364,7 +356,7 @@ u32 ShaderIR::DecodeInstr(NodeBlock& bb, u32 pc) {
364 356
365void ShaderIR::PostDecode() { 357void ShaderIR::PostDecode() {
366 // Deduce texture handler size if needed 358 // Deduce texture handler size if needed
367 auto gpu_driver = locker.AccessGuestDriverProfile(); 359 auto gpu_driver = registry.AccessGuestDriverProfile();
368 DeduceTextureHandlerSize(gpu_driver, used_samplers); 360 DeduceTextureHandlerSize(gpu_driver, used_samplers);
369 // Deduce Indexed Samplers 361 // Deduce Indexed Samplers
370 if (!uses_indexed_samplers) { 362 if (!uses_indexed_samplers) {
diff --git a/src/video_core/shader/decode/bfe.cpp b/src/video_core/shader/decode/bfe.cpp
index e02bcd097..8e3b46e8e 100644
--- a/src/video_core/shader/decode/bfe.cpp
+++ b/src/video_core/shader/decode/bfe.cpp
@@ -17,33 +17,60 @@ u32 ShaderIR::DecodeBfe(NodeBlock& bb, u32 pc) {
17 const Instruction instr = {program_code[pc]}; 17 const Instruction instr = {program_code[pc]};
18 const auto opcode = OpCode::Decode(instr); 18 const auto opcode = OpCode::Decode(instr);
19 19
20 UNIMPLEMENTED_IF(instr.bfe.negate_b);
21
22 Node op_a = GetRegister(instr.gpr8); 20 Node op_a = GetRegister(instr.gpr8);
23 op_a = GetOperandAbsNegInteger(op_a, false, instr.bfe.negate_a, false); 21 Node op_b = [&] {
24 22 switch (opcode->get().GetId()) {
25 switch (opcode->get().GetId()) { 23 case OpCode::Id::BFE_R:
26 case OpCode::Id::BFE_IMM: { 24 return GetRegister(instr.gpr20);
27 UNIMPLEMENTED_IF_MSG(instr.generates_cc, 25 case OpCode::Id::BFE_C:
28 "Condition codes generation in BFE is not implemented"); 26 return GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset());
27 case OpCode::Id::BFE_IMM:
28 return Immediate(instr.alu.GetSignedImm20_20());
29 default:
30 UNREACHABLE();
31 return Immediate(0);
32 }
33 }();
29 34
 30 const Node inner_shift_imm = Immediate(static_cast<u32>(instr.bfe.GetLeftShiftValue())); 35 UNIMPLEMENTED_IF_MSG(instr.bfe.rd_cc, "Condition codes in BFE are not implemented");
31 const Node outer_shift_imm =
32 Immediate(static_cast<u32>(instr.bfe.GetLeftShiftValue() + instr.bfe.shift_position));
33 36
34 const Node inner_shift = 37 const bool is_signed = instr.bfe.is_signed;
35 Operation(OperationCode::ILogicalShiftLeft, NO_PRECISE, op_a, inner_shift_imm);
36 const Node outer_shift =
37 Operation(OperationCode::ILogicalShiftRight, NO_PRECISE, inner_shift, outer_shift_imm);
38 38
39 SetInternalFlagsFromInteger(bb, outer_shift, instr.generates_cc); 39 // using reverse parallel method in
40 SetRegister(bb, instr.gpr0, outer_shift); 40 // https://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel
41 break; 41 // note for later if possible to implement faster method.
42 } 42 if (instr.bfe.brev) {
43 default: 43 const auto swap = [&](u32 s, u32 mask) {
44 UNIMPLEMENTED_MSG("Unhandled BFE instruction: {}", opcode->get().GetName()); 44 Node v1 =
45 SignedOperation(OperationCode::ILogicalShiftRight, is_signed, op_a, Immediate(s));
46 if (mask != 0) {
47 v1 = SignedOperation(OperationCode::IBitwiseAnd, is_signed, std::move(v1),
48 Immediate(mask));
49 }
50 Node v2 = op_a;
51 if (mask != 0) {
52 v2 = SignedOperation(OperationCode::IBitwiseAnd, is_signed, std::move(v2),
53 Immediate(mask));
54 }
55 v2 = SignedOperation(OperationCode::ILogicalShiftLeft, is_signed, std::move(v2),
56 Immediate(s));
57 return SignedOperation(OperationCode::IBitwiseOr, is_signed, std::move(v1),
58 std::move(v2));
59 };
60 op_a = swap(1, 0x55555555U);
61 op_a = swap(2, 0x33333333U);
62 op_a = swap(4, 0x0F0F0F0FU);
63 op_a = swap(8, 0x00FF00FFU);
64 op_a = swap(16, 0);
45 } 65 }
46 66
67 const auto offset = SignedOperation(OperationCode::IBitfieldExtract, is_signed, op_b,
68 Immediate(0), Immediate(8));
69 const auto bits = SignedOperation(OperationCode::IBitfieldExtract, is_signed, op_b,
70 Immediate(8), Immediate(8));
71 auto result = SignedOperation(OperationCode::IBitfieldExtract, is_signed, op_a, offset, bits);
72 SetRegister(bb, instr.gpr0, std::move(result));
73
47 return pc; 74 return pc;
48} 75}
49 76
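The BRV path above builds the classic reverse-parallel bit reverse out of shader IR nodes. For reference, the host-side equivalent of the exact mask/shift sequence it emits:

#include <cstdint>

// Reverses the bits of a 32-bit value by swapping progressively larger groups,
// the ReverseParallel method referenced in the comment above.
std::uint32_t ReverseBits32(std::uint32_t v) {
    v = ((v >> 1) & 0x55555555U) | ((v & 0x55555555U) << 1);  // swap odd and even bits
    v = ((v >> 2) & 0x33333333U) | ((v & 0x33333333U) << 2);  // swap bit pairs
    v = ((v >> 4) & 0x0F0F0F0FU) | ((v & 0x0F0F0F0FU) << 4);  // swap nibbles
    v = ((v >> 8) & 0x00FF00FFU) | ((v & 0x00FF00FFU) << 8);  // swap bytes
    v = (v >> 16) | (v << 16);                                // swap halves
    return v;
}

After the optional reverse, the decoder unpacks the extraction parameters from the second operand: bits [0, 8) hold the start position and bits [8, 16) the field width, which feed a single IBitfieldExtract or UBitfieldExtract operation.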
diff --git a/src/video_core/shader/decode/texture.cpp b/src/video_core/shader/decode/texture.cpp
index bee7d8cad..48350e042 100644
--- a/src/video_core/shader/decode/texture.cpp
+++ b/src/video_core/shader/decode/texture.cpp
@@ -12,6 +12,7 @@
12#include "common/logging/log.h" 12#include "common/logging/log.h"
13#include "video_core/engines/shader_bytecode.h" 13#include "video_core/engines/shader_bytecode.h"
14#include "video_core/shader/node_helper.h" 14#include "video_core/shader/node_helper.h"
15#include "video_core/shader/registry.h"
15#include "video_core/shader/shader_ir.h" 16#include "video_core/shader/shader_ir.h"
16 17
17namespace VideoCommon::Shader { 18namespace VideoCommon::Shader {
@@ -359,8 +360,8 @@ ShaderIR::SamplerInfo ShaderIR::GetSamplerInfo(std::optional<SamplerInfo> sample
359 if (sampler_info) { 360 if (sampler_info) {
360 return *sampler_info; 361 return *sampler_info;
361 } 362 }
362 const auto sampler = 363 const auto sampler = buffer ? registry.ObtainBindlessSampler(*buffer, offset)
363 buffer ? locker.ObtainBindlessSampler(*buffer, offset) : locker.ObtainBoundSampler(offset); 364 : registry.ObtainBoundSampler(offset);
364 if (!sampler) { 365 if (!sampler) {
365 LOG_WARNING(HW_GPU, "Unknown sampler info"); 366 LOG_WARNING(HW_GPU, "Unknown sampler info");
366 return SamplerInfo{TextureType::Texture2D, false, false, false}; 367 return SamplerInfo{TextureType::Texture2D, false, false, false};
diff --git a/src/video_core/shader/node_helper.cpp b/src/video_core/shader/node_helper.cpp
index b3dcd291c..76c56abb5 100644
--- a/src/video_core/shader/node_helper.cpp
+++ b/src/video_core/shader/node_helper.cpp
@@ -68,6 +68,8 @@ OperationCode SignedToUnsignedCode(OperationCode operation_code, bool is_signed)
68 return OperationCode::UBitwiseXor; 68 return OperationCode::UBitwiseXor;
69 case OperationCode::IBitwiseNot: 69 case OperationCode::IBitwiseNot:
70 return OperationCode::UBitwiseNot; 70 return OperationCode::UBitwiseNot;
71 case OperationCode::IBitfieldExtract:
72 return OperationCode::UBitfieldExtract;
71 case OperationCode::IBitfieldInsert: 73 case OperationCode::IBitfieldInsert:
72 return OperationCode::UBitfieldInsert; 74 return OperationCode::UBitfieldInsert;
73 case OperationCode::IBitCount: 75 case OperationCode::IBitCount:
diff --git a/src/video_core/shader/registry.cpp b/src/video_core/shader/registry.cpp
new file mode 100644
index 000000000..af70b3f35
--- /dev/null
+++ b/src/video_core/shader/registry.cpp
@@ -0,0 +1,161 @@
1// Copyright 2019 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#include <algorithm>
6#include <tuple>
7
8#include "common/assert.h"
9#include "common/common_types.h"
10#include "video_core/engines/kepler_compute.h"
11#include "video_core/engines/maxwell_3d.h"
12#include "video_core/engines/shader_type.h"
13#include "video_core/shader/registry.h"
14
15namespace VideoCommon::Shader {
16
17using Tegra::Engines::ConstBufferEngineInterface;
18using Tegra::Engines::SamplerDescriptor;
19using Tegra::Engines::ShaderType;
20
21namespace {
22
23GraphicsInfo MakeGraphicsInfo(ShaderType shader_stage, ConstBufferEngineInterface& engine) {
24 if (shader_stage == ShaderType::Compute) {
25 return {};
26 }
27 auto& graphics = static_cast<Tegra::Engines::Maxwell3D&>(engine);
28
29 GraphicsInfo info;
30 info.tfb_layouts = graphics.regs.tfb_layouts;
31 info.tfb_varying_locs = graphics.regs.tfb_varying_locs;
32 info.primitive_topology = graphics.regs.draw.topology;
33 info.tessellation_primitive = graphics.regs.tess_mode.prim;
34 info.tessellation_spacing = graphics.regs.tess_mode.spacing;
35 info.tfb_enabled = graphics.regs.tfb_enabled;
36 info.tessellation_clockwise = graphics.regs.tess_mode.cw;
37 return info;
38}
39
40ComputeInfo MakeComputeInfo(ShaderType shader_stage, ConstBufferEngineInterface& engine) {
41 if (shader_stage != ShaderType::Compute) {
42 return {};
43 }
44 auto& compute = static_cast<Tegra::Engines::KeplerCompute&>(engine);
45 const auto& launch = compute.launch_description;
46
47 ComputeInfo info;
48 info.workgroup_size = {launch.block_dim_x, launch.block_dim_y, launch.block_dim_z};
49 info.local_memory_size_in_words = launch.local_pos_alloc;
50 info.shared_memory_size_in_words = launch.shared_alloc;
51 return info;
52}
53
54} // Anonymous namespace
55
56Registry::Registry(Tegra::Engines::ShaderType shader_stage, const SerializedRegistryInfo& info)
57 : stage{shader_stage}, stored_guest_driver_profile{info.guest_driver_profile},
58 bound_buffer{info.bound_buffer}, graphics_info{info.graphics}, compute_info{info.compute} {}
59
60Registry::Registry(Tegra::Engines::ShaderType shader_stage,
61 Tegra::Engines::ConstBufferEngineInterface& engine)
62 : stage{shader_stage}, engine{&engine}, bound_buffer{engine.GetBoundBuffer()},
63 graphics_info{MakeGraphicsInfo(shader_stage, engine)}, compute_info{MakeComputeInfo(
64 shader_stage, engine)} {}
65
66Registry::~Registry() = default;
67
68std::optional<u32> Registry::ObtainKey(u32 buffer, u32 offset) {
69 const std::pair<u32, u32> key = {buffer, offset};
70 const auto iter = keys.find(key);
71 if (iter != keys.end()) {
72 return iter->second;
73 }
74 if (!engine) {
75 return std::nullopt;
76 }
77 const u32 value = engine->AccessConstBuffer32(stage, buffer, offset);
78 keys.emplace(key, value);
79 return value;
80}
81
82std::optional<SamplerDescriptor> Registry::ObtainBoundSampler(u32 offset) {
83 const u32 key = offset;
84 const auto iter = bound_samplers.find(key);
85 if (iter != bound_samplers.end()) {
86 return iter->second;
87 }
88 if (!engine) {
89 return std::nullopt;
90 }
91 const SamplerDescriptor value = engine->AccessBoundSampler(stage, offset);
92 bound_samplers.emplace(key, value);
93 return value;
94}
95
96std::optional<Tegra::Engines::SamplerDescriptor> Registry::ObtainBindlessSampler(u32 buffer,
97 u32 offset) {
98 const std::pair key = {buffer, offset};
99 const auto iter = bindless_samplers.find(key);
100 if (iter != bindless_samplers.end()) {
101 return iter->second;
102 }
103 if (!engine) {
104 return std::nullopt;
105 }
106 const SamplerDescriptor value = engine->AccessBindlessSampler(stage, buffer, offset);
107 bindless_samplers.emplace(key, value);
108 return value;
109}
110
111void Registry::InsertKey(u32 buffer, u32 offset, u32 value) {
112 keys.insert_or_assign({buffer, offset}, value);
113}
114
115void Registry::InsertBoundSampler(u32 offset, SamplerDescriptor sampler) {
116 bound_samplers.insert_or_assign(offset, sampler);
117}
118
119void Registry::InsertBindlessSampler(u32 buffer, u32 offset, SamplerDescriptor sampler) {
120 bindless_samplers.insert_or_assign({buffer, offset}, sampler);
121}
122
123bool Registry::IsConsistent() const {
124 if (!engine) {
125 return true;
126 }
127 return std::all_of(keys.begin(), keys.end(),
128 [this](const auto& pair) {
129 const auto [cbuf, offset] = pair.first;
130 const auto value = pair.second;
131 return value == engine->AccessConstBuffer32(stage, cbuf, offset);
132 }) &&
133 std::all_of(bound_samplers.begin(), bound_samplers.end(),
134 [this](const auto& sampler) {
135 const auto [key, value] = sampler;
136 return value == engine->AccessBoundSampler(stage, key);
137 }) &&
138 std::all_of(bindless_samplers.begin(), bindless_samplers.end(),
139 [this](const auto& sampler) {
140 const auto [cbuf, offset] = sampler.first;
141 const auto value = sampler.second;
142 return value == engine->AccessBindlessSampler(stage, cbuf, offset);
143 });
144}
145
146bool Registry::HasEqualKeys(const Registry& rhs) const {
147 return std::tie(keys, bound_samplers, bindless_samplers) ==
148 std::tie(rhs.keys, rhs.bound_samplers, rhs.bindless_samplers);
149}
150
151const GraphicsInfo& Registry::GetGraphicsInfo() const {
152 ASSERT(stage != Tegra::Engines::ShaderType::Compute);
153 return graphics_info;
154}
155
156const ComputeInfo& Registry::GetComputeInfo() const {
157 ASSERT(stage == Tegra::Engines::ShaderType::Compute);
158 return compute_info;
159}
160
161} // namespace VideoCommon::Shader
diff --git a/src/video_core/shader/registry.h b/src/video_core/shader/registry.h
new file mode 100644
index 000000000..0c80d35fd
--- /dev/null
+++ b/src/video_core/shader/registry.h
@@ -0,0 +1,137 @@
1// Copyright 2019 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#pragma once
6
7#include <array>
8#include <optional>
9#include <type_traits>
10#include <unordered_map>
11#include <utility>
12
13#include "common/common_types.h"
14#include "common/hash.h"
15#include "video_core/engines/const_buffer_engine_interface.h"
16#include "video_core/engines/maxwell_3d.h"
17#include "video_core/engines/shader_type.h"
18#include "video_core/guest_driver.h"
19
20namespace VideoCommon::Shader {
21
22using KeyMap = std::unordered_map<std::pair<u32, u32>, u32, Common::PairHash>;
23using BoundSamplerMap = std::unordered_map<u32, Tegra::Engines::SamplerDescriptor>;
24using BindlessSamplerMap =
25 std::unordered_map<std::pair<u32, u32>, Tegra::Engines::SamplerDescriptor, Common::PairHash>;
26
27struct GraphicsInfo {
28 using Maxwell = Tegra::Engines::Maxwell3D::Regs;
29
30 std::array<Maxwell::TransformFeedbackLayout, Maxwell::NumTransformFeedbackBuffers>
31 tfb_layouts{};
32 std::array<std::array<u8, 128>, Maxwell::NumTransformFeedbackBuffers> tfb_varying_locs{};
33 Maxwell::PrimitiveTopology primitive_topology{};
34 Maxwell::TessellationPrimitive tessellation_primitive{};
35 Maxwell::TessellationSpacing tessellation_spacing{};
36 bool tfb_enabled = false;
37 bool tessellation_clockwise = false;
38};
39static_assert(std::is_trivially_copyable_v<GraphicsInfo> &&
40 std::is_standard_layout_v<GraphicsInfo>);
41
42struct ComputeInfo {
43 std::array<u32, 3> workgroup_size{};
44 u32 shared_memory_size_in_words = 0;
45 u32 local_memory_size_in_words = 0;
46};
47static_assert(std::is_trivially_copyable_v<ComputeInfo> && std::is_standard_layout_v<ComputeInfo>);
48
49struct SerializedRegistryInfo {
50 VideoCore::GuestDriverProfile guest_driver_profile;
51 u32 bound_buffer = 0;
52 GraphicsInfo graphics;
53 ComputeInfo compute;
54};
55
56/**
57 * The Registry is a class used to interface the 3D and compute engines with the shader compiler.
58 * With it, the shader can obtain required data from GPU state and store it for disk shader
59 * compilation.
60 */
61class Registry {
62public:
63 explicit Registry(Tegra::Engines::ShaderType shader_stage, const SerializedRegistryInfo& info);
64
65 explicit Registry(Tegra::Engines::ShaderType shader_stage,
66 Tegra::Engines::ConstBufferEngineInterface& engine);
67
68 ~Registry();
69
70 /// Retrieves a key from the registry. If it is already registered, the stored value is
71 /// returned; otherwise it is read from the engine and registered.
72 std::optional<u32> ObtainKey(u32 buffer, u32 offset);
73
74 std::optional<Tegra::Engines::SamplerDescriptor> ObtainBoundSampler(u32 offset);
75
76 std::optional<Tegra::Engines::SamplerDescriptor> ObtainBindlessSampler(u32 buffer, u32 offset);
77
78 /// Inserts a key.
79 void InsertKey(u32 buffer, u32 offset, u32 value);
80
81 /// Inserts a bound sampler key.
82 void InsertBoundSampler(u32 offset, Tegra::Engines::SamplerDescriptor sampler);
83
84 /// Inserts a bindless sampler key.
85 void InsertBindlessSampler(u32 buffer, u32 offset, Tegra::Engines::SamplerDescriptor sampler);
86
87 /// Checks the stored keys and samplers against the engine's current state.
88 /// Returns true if they all still match, false otherwise.
89 bool IsConsistent() const;
90
91 /// Returns true if the keys and samplers are equal to those in the given registry.
92 bool HasEqualKeys(const Registry& rhs) const;
93
94 /// Returns graphics information from this shader
95 const GraphicsInfo& GetGraphicsInfo() const;
96
97 /// Returns compute information from this shader
98 const ComputeInfo& GetComputeInfo() const;
99
100 /// Gets the const buffer keys database.
101 const KeyMap& GetKeys() const {
102 return keys;
103 }
104
105 /// Gets the bound samplers database.
106 const BoundSamplerMap& GetBoundSamplers() const {
107 return bound_samplers;
108 }
109
110 /// Gets the bindless samplers database.
111 const BindlessSamplerMap& GetBindlessSamplers() const {
112 return bindless_samplers;
113 }
114
115 /// Gets the bound buffer used by this shader.
116 u32 GetBoundBuffer() const {
117 return bound_buffer;
118 }
119
120 /// Obtains access to the guest driver's profile.
121 VideoCore::GuestDriverProfile& AccessGuestDriverProfile() {
122 return engine ? engine->AccessGuestDriverProfile() : stored_guest_driver_profile;
123 }
124
125private:
126 const Tegra::Engines::ShaderType stage;
127 VideoCore::GuestDriverProfile stored_guest_driver_profile;
128 Tegra::Engines::ConstBufferEngineInterface* engine = nullptr;
129 KeyMap keys;
130 BoundSamplerMap bound_samplers;
131 BindlessSamplerMap bindless_samplers;
132 u32 bound_buffer;
133 GraphicsInfo graphics_info;
134 ComputeInfo compute_info;
135};
136
137} // namespace VideoCommon::Shader
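A minimal usage sketch of the class declared above, assuming a graphics context; maxwell3d and the buffer/offset pair are placeholders rather than values taken from this change:

// Illustrative sketch only: 'maxwell3d' stands in for a live Maxwell3D engine reference.
VideoCommon::Shader::Registry registry(Tegra::Engines::ShaderType::Vertex, maxwell3d);

// The first read for a (buffer, offset) pair goes through the engine and is memoized.
const std::optional<u32> key = registry.ObtainKey(/*buffer=*/1, /*offset=*/0x20);

// Later, IsConsistent() re-reads the recorded locations and reports whether the
// cached values still match the current engine state.
const bool usable = registry.IsConsistent();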
diff --git a/src/video_core/shader/shader_ir.cpp b/src/video_core/shader/shader_ir.cpp
index 3a5d280a9..425927777 100644
--- a/src/video_core/shader/shader_ir.cpp
+++ b/src/video_core/shader/shader_ir.cpp
@@ -11,6 +11,7 @@
11#include "common/logging/log.h" 11#include "common/logging/log.h"
12#include "video_core/engines/shader_bytecode.h" 12#include "video_core/engines/shader_bytecode.h"
13#include "video_core/shader/node_helper.h" 13#include "video_core/shader/node_helper.h"
14#include "video_core/shader/registry.h"
14#include "video_core/shader/shader_ir.h" 15#include "video_core/shader/shader_ir.h"
15 16
16namespace VideoCommon::Shader { 17namespace VideoCommon::Shader {
@@ -24,8 +25,8 @@ using Tegra::Shader::PredOperation;
24using Tegra::Shader::Register; 25using Tegra::Shader::Register;
25 26
26ShaderIR::ShaderIR(const ProgramCode& program_code, u32 main_offset, CompilerSettings settings, 27ShaderIR::ShaderIR(const ProgramCode& program_code, u32 main_offset, CompilerSettings settings,
27 ConstBufferLocker& locker) 28 Registry& registry)
28 : program_code{program_code}, main_offset{main_offset}, settings{settings}, locker{locker} { 29 : program_code{program_code}, main_offset{main_offset}, settings{settings}, registry{registry} {
29 Decode(); 30 Decode();
30 PostDecode(); 31 PostDecode();
31} 32}
diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h
index b0851c3be..dde036b40 100644
--- a/src/video_core/shader/shader_ir.h
+++ b/src/video_core/shader/shader_ir.h
@@ -18,8 +18,8 @@
18#include "video_core/engines/shader_header.h" 18#include "video_core/engines/shader_header.h"
19#include "video_core/shader/ast.h" 19#include "video_core/shader/ast.h"
20#include "video_core/shader/compiler_settings.h" 20#include "video_core/shader/compiler_settings.h"
21#include "video_core/shader/const_buffer_locker.h"
22#include "video_core/shader/node.h" 21#include "video_core/shader/node.h"
22#include "video_core/shader/registry.h"
23 23
24namespace VideoCommon::Shader { 24namespace VideoCommon::Shader {
25 25
@@ -69,7 +69,7 @@ struct GlobalMemoryUsage {
69class ShaderIR final { 69class ShaderIR final {
70public: 70public:
71 explicit ShaderIR(const ProgramCode& program_code, u32 main_offset, CompilerSettings settings, 71 explicit ShaderIR(const ProgramCode& program_code, u32 main_offset, CompilerSettings settings,
72 ConstBufferLocker& locker); 72 Registry& registry);
73 ~ShaderIR(); 73 ~ShaderIR();
74 74
75 const std::map<u32, NodeBlock>& GetBasicBlocks() const { 75 const std::map<u32, NodeBlock>& GetBasicBlocks() const {
@@ -414,7 +414,7 @@ private:
414 const ProgramCode& program_code; 414 const ProgramCode& program_code;
415 const u32 main_offset; 415 const u32 main_offset;
416 const CompilerSettings settings; 416 const CompilerSettings settings;
417 ConstBufferLocker& locker; 417 Registry& registry;
418 418
419 bool decompiled{}; 419 bool decompiled{};
420 bool disable_flow_stack{}; 420 bool disable_flow_stack{};
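For context, a hedged sketch of how the renamed parameter is threaded into ShaderIR; program_code, main_offset, settings and registry are placeholders for whatever the shader cache passes in:

// Sketch only: all four arguments are assumed to exist at the call site.
VideoCommon::Shader::ShaderIR ir(program_code, main_offset, settings, registry);
const auto& blocks = ir.GetBasicBlocks();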
diff --git a/src/video_core/shader/track.cpp b/src/video_core/shader/track.cpp
index 15e22b9fa..10739b37d 100644
--- a/src/video_core/shader/track.cpp
+++ b/src/video_core/shader/track.cpp
@@ -81,26 +81,20 @@ std::tuple<Node, TrackSampler> ShaderIR::TrackBindlessSampler(Node tracked, cons
81 MakeTrackSampler<BindlessSamplerNode>(cbuf->GetIndex(), immediate->GetValue()); 81 MakeTrackSampler<BindlessSamplerNode>(cbuf->GetIndex(), immediate->GetValue());
82 return {tracked, track}; 82 return {tracked, track};
83 } else if (const auto operation = std::get_if<OperationNode>(&*offset)) { 83 } else if (const auto operation = std::get_if<OperationNode>(&*offset)) {
84 auto bound_buffer = locker.ObtainBoundBuffer(); 84 const u32 bound_buffer = registry.GetBoundBuffer();
85 if (!bound_buffer) { 85 if (bound_buffer != cbuf->GetIndex()) {
86 return {}; 86 return {};
87 } 87 }
88 if (*bound_buffer != cbuf->GetIndex()) { 88 const auto pair = DecoupleIndirectRead(*operation);
89 return {};
90 }
91 auto pair = DecoupleIndirectRead(*operation);
92 if (!pair) { 89 if (!pair) {
93 return {}; 90 return {};
94 } 91 }
95 auto [gpr, base_offset] = *pair; 92 auto [gpr, base_offset] = *pair;
96 const auto offset_inm = std::get_if<ImmediateNode>(&*base_offset); 93 const auto offset_inm = std::get_if<ImmediateNode>(&*base_offset);
97 auto gpu_driver = locker.AccessGuestDriverProfile(); 94 const auto& gpu_driver = registry.AccessGuestDriverProfile();
98 if (gpu_driver == nullptr) {
99 return {};
100 }
101 const u32 bindless_cv = NewCustomVariable(); 95 const u32 bindless_cv = NewCustomVariable();
102 const Node op = Operation(OperationCode::UDiv, NO_PRECISE, gpr, 96 const Node op =
103 Immediate(gpu_driver->GetTextureHandlerSize())); 97 Operation(OperationCode::UDiv, gpr, Immediate(gpu_driver.GetTextureHandlerSize()));
104 98
105 const Node cv_node = GetCustomVariable(bindless_cv); 99 const Node cv_node = GetCustomVariable(bindless_cv);
106 Node amend_op = Operation(OperationCode::Assign, cv_node, std::move(op)); 100 Node amend_op = Operation(OperationCode::Assign, cv_node, std::move(op));
diff --git a/src/video_core/shader/transform_feedback.cpp b/src/video_core/shader/transform_feedback.cpp
new file mode 100644
index 000000000..22a933761
--- /dev/null
+++ b/src/video_core/shader/transform_feedback.cpp
@@ -0,0 +1,115 @@
1// Copyright 2020 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#include <algorithm>
6#include <array>
7#include <unordered_map>
8
9#include "common/assert.h"
10#include "common/common_types.h"
11#include "video_core/engines/maxwell_3d.h"
12#include "video_core/shader/registry.h"
13#include "video_core/shader/transform_feedback.h"
14
15namespace VideoCommon::Shader {
16
17namespace {
18
19using Maxwell = Tegra::Engines::Maxwell3D::Regs;
20
21// TODO(Rodrigo): Change this to constexpr std::unordered_set in C++20
22
23/// Attribute offsets that describe a vector
24constexpr std::array VECTORS = {
25 28, // gl_Position
26 32, // Generic 0
27 36, // Generic 1
28 40, // Generic 2
29 44, // Generic 3
30 48, // Generic 4
31 52, // Generic 5
32 56, // Generic 6
33 60, // Generic 7
34 64, // Generic 8
35 68, // Generic 9
36 72, // Generic 10
37 76, // Generic 11
38 80, // Generic 12
39 84, // Generic 13
40 88, // Generic 14
41 92, // Generic 15
42 96, // Generic 16
43 100, // Generic 17
44 104, // Generic 18
45 108, // Generic 19
46 112, // Generic 20
47 116, // Generic 21
48 120, // Generic 22
49 124, // Generic 23
50 128, // Generic 24
51 132, // Generic 25
52 136, // Generic 26
53 140, // Generic 27
54 144, // Generic 28
55 148, // Generic 29
56 152, // Generic 30
57 156, // Generic 31
58 160, // gl_FrontColor
59 164, // gl_FrontSecondaryColor
60 160, // gl_BackColor
61 164, // gl_BackSecondaryColor
62 192, // gl_TexCoord[0]
63 196, // gl_TexCoord[1]
64 200, // gl_TexCoord[2]
65 204, // gl_TexCoord[3]
66 208, // gl_TexCoord[4]
67 212, // gl_TexCoord[5]
68 216, // gl_TexCoord[6]
69 220, // gl_TexCoord[7]
70};
71} // namespace
72
73std::unordered_map<u8, VaryingTFB> BuildTransformFeedback(const GraphicsInfo& info) {
74
75 std::unordered_map<u8, VaryingTFB> tfb;
76
77 for (std::size_t buffer = 0; buffer < Maxwell::NumTransformFeedbackBuffers; ++buffer) {
78 const auto& locations = info.tfb_varying_locs[buffer];
79 const auto& layout = info.tfb_layouts[buffer];
80 const std::size_t varying_count = layout.varying_count;
81
82 std::size_t highest = 0;
83
84 for (std::size_t offset = 0; offset < varying_count; ++offset) {
85 const std::size_t base_offset = offset;
86 const u8 location = locations[offset];
87
88 VaryingTFB varying;
89 varying.buffer = layout.stream;
90 varying.stride = layout.stride;
91 varying.offset = offset * sizeof(u32);
92 varying.components = 1;
93
94 if (std::find(VECTORS.begin(), VECTORS.end(), location / 4 * 4) != VECTORS.end()) {
95 UNIMPLEMENTED_IF_MSG(location % 4 != 0, "Unaligned TFB");
96
97 const u8 base_index = location / 4;
98 while (offset + 1 < varying_count && base_index == locations[offset + 1] / 4) {
99 ++offset;
100 ++varying.components;
101 }
102 }
103
104 [[maybe_unused]] const bool inserted = tfb.emplace(location, varying).second;
105 UNIMPLEMENTED_IF_MSG(!inserted, "Varying already stored");
106
107 highest = std::max(highest, (base_offset + varying.components) * sizeof(u32));
108 }
109
110 UNIMPLEMENTED_IF(highest != layout.stride);
111 }
112 return tfb;
113}
114
115} // namespace VideoCommon::Shader
diff --git a/src/video_core/shader/transform_feedback.h b/src/video_core/shader/transform_feedback.h
new file mode 100644
index 000000000..77d05f64c
--- /dev/null
+++ b/src/video_core/shader/transform_feedback.h
@@ -0,0 +1,23 @@
1// Copyright 2020 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#pragma once
6
7#include <unordered_map>
8
9#include "common/common_types.h"
10#include "video_core/shader/registry.h"
11
12namespace VideoCommon::Shader {
13
14struct VaryingTFB {
15 std::size_t buffer;
16 std::size_t stride;
17 std::size_t offset;
18 std::size_t components;
19};
20
21std::unordered_map<u8, VaryingTFB> BuildTransformFeedback(const GraphicsInfo& info);
22
23} // namespace VideoCommon::Shader
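A hedged example of consuming BuildTransformFeedback; the GraphicsInfo would normally come from Registry::GetGraphicsInfo() on a graphics-stage registry, and the loop body is illustrative only:

// Sketch only: 'registry' is assumed to be an existing graphics-stage Registry.
const VideoCommon::Shader::GraphicsInfo& info = registry.GetGraphicsInfo();
const std::unordered_map<u8, VideoCommon::Shader::VaryingTFB> varyings =
    VideoCommon::Shader::BuildTransformFeedback(info);

for (const auto& [location, tfb] : varyings) {
    // 'tfb' describes the buffer, stride, byte offset and component count of the
    // varying recorded at this location.
}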
diff --git a/src/video_core/surface.cpp b/src/video_core/surface.cpp
index 9707c353d..cc7181229 100644
--- a/src/video_core/surface.cpp
+++ b/src/video_core/surface.cpp
@@ -111,6 +111,8 @@ PixelFormat PixelFormatFromRenderTargetFormat(Tegra::RenderTargetFormat format)
111 return PixelFormat::RGBA16F; 111 return PixelFormat::RGBA16F;
112 case Tegra::RenderTargetFormat::RGBA16_UNORM: 112 case Tegra::RenderTargetFormat::RGBA16_UNORM:
113 return PixelFormat::RGBA16U; 113 return PixelFormat::RGBA16U;
114 case Tegra::RenderTargetFormat::RGBA16_SNORM:
115 return PixelFormat::RGBA16S;
114 case Tegra::RenderTargetFormat::RGBA16_UINT: 116 case Tegra::RenderTargetFormat::RGBA16_UINT:
115 return PixelFormat::RGBA16UI; 117 return PixelFormat::RGBA16UI;
116 case Tegra::RenderTargetFormat::RGBA32_FLOAT: 118 case Tegra::RenderTargetFormat::RGBA32_FLOAT:
diff --git a/src/video_core/surface.h b/src/video_core/surface.h
index d88109e5a..ae8817465 100644
--- a/src/video_core/surface.h
+++ b/src/video_core/surface.h
@@ -25,82 +25,83 @@ enum class PixelFormat {
25 R8UI = 7, 25 R8UI = 7,
26 RGBA16F = 8, 26 RGBA16F = 8,
27 RGBA16U = 9, 27 RGBA16U = 9,
28 RGBA16UI = 10, 28 RGBA16S = 10,
29 R11FG11FB10F = 11, 29 RGBA16UI = 11,
30 RGBA32UI = 12, 30 R11FG11FB10F = 12,
31 DXT1 = 13, 31 RGBA32UI = 13,
32 DXT23 = 14, 32 DXT1 = 14,
33 DXT45 = 15, 33 DXT23 = 15,
34 DXN1 = 16, // This is also known as BC4 34 DXT45 = 16,
35 DXN2UNORM = 17, 35 DXN1 = 17, // This is also known as BC4
36 DXN2SNORM = 18, 36 DXN2UNORM = 18,
37 BC7U = 19, 37 DXN2SNORM = 19,
38 BC6H_UF16 = 20, 38 BC7U = 20,
39 BC6H_SF16 = 21, 39 BC6H_UF16 = 21,
40 ASTC_2D_4X4 = 22, 40 BC6H_SF16 = 22,
41 BGRA8 = 23, 41 ASTC_2D_4X4 = 23,
42 RGBA32F = 24, 42 BGRA8 = 24,
43 RG32F = 25, 43 RGBA32F = 25,
44 R32F = 26, 44 RG32F = 26,
45 R16F = 27, 45 R32F = 27,
46 R16U = 28, 46 R16F = 28,
47 R16S = 29, 47 R16U = 29,
48 R16UI = 30, 48 R16S = 30,
49 R16I = 31, 49 R16UI = 31,
50 RG16 = 32, 50 R16I = 32,
51 RG16F = 33, 51 RG16 = 33,
52 RG16UI = 34, 52 RG16F = 34,
53 RG16I = 35, 53 RG16UI = 35,
54 RG16S = 36, 54 RG16I = 36,
55 RGB32F = 37, 55 RG16S = 37,
56 RGBA8_SRGB = 38, 56 RGB32F = 38,
57 RG8U = 39, 57 RGBA8_SRGB = 39,
58 RG8S = 40, 58 RG8U = 40,
59 RG32UI = 41, 59 RG8S = 41,
60 RGBX16F = 42, 60 RG32UI = 42,
61 R32UI = 43, 61 RGBX16F = 43,
62 R32I = 44, 62 R32UI = 44,
63 ASTC_2D_8X8 = 45, 63 R32I = 45,
64 ASTC_2D_8X5 = 46, 64 ASTC_2D_8X8 = 46,
65 ASTC_2D_5X4 = 47, 65 ASTC_2D_8X5 = 47,
66 BGRA8_SRGB = 48, 66 ASTC_2D_5X4 = 48,
67 DXT1_SRGB = 49, 67 BGRA8_SRGB = 49,
68 DXT23_SRGB = 50, 68 DXT1_SRGB = 50,
69 DXT45_SRGB = 51, 69 DXT23_SRGB = 51,
70 BC7U_SRGB = 52, 70 DXT45_SRGB = 52,
71 R4G4B4A4U = 53, 71 BC7U_SRGB = 53,
72 ASTC_2D_4X4_SRGB = 54, 72 R4G4B4A4U = 54,
73 ASTC_2D_8X8_SRGB = 55, 73 ASTC_2D_4X4_SRGB = 55,
74 ASTC_2D_8X5_SRGB = 56, 74 ASTC_2D_8X8_SRGB = 56,
75 ASTC_2D_5X4_SRGB = 57, 75 ASTC_2D_8X5_SRGB = 57,
76 ASTC_2D_5X5 = 58, 76 ASTC_2D_5X4_SRGB = 58,
77 ASTC_2D_5X5_SRGB = 59, 77 ASTC_2D_5X5 = 59,
78 ASTC_2D_10X8 = 60, 78 ASTC_2D_5X5_SRGB = 60,
79 ASTC_2D_10X8_SRGB = 61, 79 ASTC_2D_10X8 = 61,
80 ASTC_2D_6X6 = 62, 80 ASTC_2D_10X8_SRGB = 62,
81 ASTC_2D_6X6_SRGB = 63, 81 ASTC_2D_6X6 = 63,
82 ASTC_2D_10X10 = 64, 82 ASTC_2D_6X6_SRGB = 64,
83 ASTC_2D_10X10_SRGB = 65, 83 ASTC_2D_10X10 = 65,
84 ASTC_2D_12X12 = 66, 84 ASTC_2D_10X10_SRGB = 66,
85 ASTC_2D_12X12_SRGB = 67, 85 ASTC_2D_12X12 = 67,
86 ASTC_2D_8X6 = 68, 86 ASTC_2D_12X12_SRGB = 68,
87 ASTC_2D_8X6_SRGB = 69, 87 ASTC_2D_8X6 = 69,
88 ASTC_2D_6X5 = 70, 88 ASTC_2D_8X6_SRGB = 70,
89 ASTC_2D_6X5_SRGB = 71, 89 ASTC_2D_6X5 = 71,
90 E5B9G9R9F = 72, 90 ASTC_2D_6X5_SRGB = 72,
91 E5B9G9R9F = 73,
91 92
92 MaxColorFormat, 93 MaxColorFormat,
93 94
94 // Depth formats 95 // Depth formats
95 Z32F = 73, 96 Z32F = 74,
96 Z16 = 74, 97 Z16 = 75,
97 98
98 MaxDepthFormat, 99 MaxDepthFormat,
99 100
100 // DepthStencil formats 101 // DepthStencil formats
101 Z24S8 = 75, 102 Z24S8 = 76,
102 S8Z24 = 76, 103 S8Z24 = 77,
103 Z32FS8 = 77, 104 Z32FS8 = 78,
104 105
105 MaxDepthStencilFormat, 106 MaxDepthStencilFormat,
106 107
@@ -138,6 +139,7 @@ constexpr std::array<u32, MaxPixelFormat> compression_factor_shift_table = {{
138 0, // R8UI 139 0, // R8UI
139 0, // RGBA16F 140 0, // RGBA16F
140 0, // RGBA16U 141 0, // RGBA16U
142 0, // RGBA16S
141 0, // RGBA16UI 143 0, // RGBA16UI
142 0, // R11FG11FB10F 144 0, // R11FG11FB10F
143 0, // RGBA32UI 145 0, // RGBA32UI
@@ -235,6 +237,7 @@ constexpr std::array<u32, MaxPixelFormat> block_width_table = {{
235 1, // R8UI 237 1, // R8UI
236 1, // RGBA16F 238 1, // RGBA16F
237 1, // RGBA16U 239 1, // RGBA16U
240 1, // RGBA16S
238 1, // RGBA16UI 241 1, // RGBA16UI
239 1, // R11FG11FB10F 242 1, // R11FG11FB10F
240 1, // RGBA32UI 243 1, // RGBA32UI
@@ -324,6 +327,7 @@ constexpr std::array<u32, MaxPixelFormat> block_height_table = {{
324 1, // R8UI 327 1, // R8UI
325 1, // RGBA16F 328 1, // RGBA16F
326 1, // RGBA16U 329 1, // RGBA16U
330 1, // RGBA16S
327 1, // RGBA16UI 331 1, // RGBA16UI
328 1, // R11FG11FB10F 332 1, // R11FG11FB10F
329 1, // RGBA32UI 333 1, // RGBA32UI
@@ -413,6 +417,7 @@ constexpr std::array<u32, MaxPixelFormat> bpp_table = {{
413 8, // R8UI 417 8, // R8UI
414 64, // RGBA16F 418 64, // RGBA16F
415 64, // RGBA16U 419 64, // RGBA16U
420 64, // RGBA16S
416 64, // RGBA16UI 421 64, // RGBA16UI
417 32, // R11FG11FB10F 422 32, // R11FG11FB10F
418 128, // RGBA32UI 423 128, // RGBA32UI
@@ -517,6 +522,7 @@ constexpr std::array<SurfaceCompression, MaxPixelFormat> compression_type_table
517 SurfaceCompression::None, // R8UI 522 SurfaceCompression::None, // R8UI
518 SurfaceCompression::None, // RGBA16F 523 SurfaceCompression::None, // RGBA16F
519 SurfaceCompression::None, // RGBA16U 524 SurfaceCompression::None, // RGBA16U
525 SurfaceCompression::None, // RGBA16S
520 SurfaceCompression::None, // RGBA16UI 526 SurfaceCompression::None, // RGBA16UI
521 SurfaceCompression::None, // R11FG11FB10F 527 SurfaceCompression::None, // R11FG11FB10F
522 SurfaceCompression::None, // RGBA32UI 528 SurfaceCompression::None, // RGBA32UI
diff --git a/src/video_core/texture_cache/format_lookup_table.cpp b/src/video_core/texture_cache/format_lookup_table.cpp
index cc3ad8417..e151c26c4 100644
--- a/src/video_core/texture_cache/format_lookup_table.cpp
+++ b/src/video_core/texture_cache/format_lookup_table.cpp
@@ -41,7 +41,7 @@ struct Table {
41 ComponentType alpha_component; 41 ComponentType alpha_component;
42 bool is_srgb; 42 bool is_srgb;
43}; 43};
44constexpr std::array<Table, 75> DefinitionTable = {{ 44constexpr std::array<Table, 76> DefinitionTable = {{
45 {TextureFormat::A8R8G8B8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ABGR8U}, 45 {TextureFormat::A8R8G8B8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ABGR8U},
46 {TextureFormat::A8R8G8B8, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::ABGR8S}, 46 {TextureFormat::A8R8G8B8, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::ABGR8S},
47 {TextureFormat::A8R8G8B8, C, UINT, UINT, UINT, UINT, PixelFormat::ABGR8UI}, 47 {TextureFormat::A8R8G8B8, C, UINT, UINT, UINT, UINT, PixelFormat::ABGR8UI},
@@ -61,6 +61,7 @@ constexpr std::array<Table, 75> DefinitionTable = {{
61 {TextureFormat::G8R8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::RG8U}, 61 {TextureFormat::G8R8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::RG8U},
62 {TextureFormat::G8R8, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::RG8S}, 62 {TextureFormat::G8R8, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::RG8S},
63 63
64 {TextureFormat::R16_G16_B16_A16, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::RGBA16S},
64 {TextureFormat::R16_G16_B16_A16, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::RGBA16U}, 65 {TextureFormat::R16_G16_B16_A16, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::RGBA16U},
65 {TextureFormat::R16_G16_B16_A16, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::RGBA16F}, 66 {TextureFormat::R16_G16_B16_A16, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::RGBA16F},
66 {TextureFormat::R16_G16_B16_A16, C, UINT, UINT, UINT, UINT, PixelFormat::RGBA16UI}, 67 {TextureFormat::R16_G16_B16_A16, C, UINT, UINT, UINT, UINT, PixelFormat::RGBA16UI},
diff --git a/src/video_core/texture_cache/surface_params.cpp b/src/video_core/texture_cache/surface_params.cpp
index f00839313..9931c5ef7 100644
--- a/src/video_core/texture_cache/surface_params.cpp
+++ b/src/video_core/texture_cache/surface_params.cpp
@@ -113,8 +113,10 @@ SurfaceParams SurfaceParams::CreateForTexture(const FormatLookupTable& lookup_ta
113 params.height = tic.Height(); 113 params.height = tic.Height();
114 params.depth = tic.Depth(); 114 params.depth = tic.Depth();
115 params.pitch = params.is_tiled ? 0 : tic.Pitch(); 115 params.pitch = params.is_tiled ? 0 : tic.Pitch();
116 if (params.target == SurfaceTarget::TextureCubemap || 116 if (params.target == SurfaceTarget::Texture2D && params.depth > 1) {
117 params.target == SurfaceTarget::TextureCubeArray) { 117 params.depth = 1;
118 } else if (params.target == SurfaceTarget::TextureCubemap ||
119 params.target == SurfaceTarget::TextureCubeArray) {
118 params.depth *= 6; 120 params.depth *= 6;
119 } 121 }
120 params.num_levels = tic.max_mip_level + 1; 122 params.num_levels = tic.max_mip_level + 1;
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index 51373b687..6cdbe63d0 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -104,6 +104,11 @@ public:
104 if (!cache_addr) { 104 if (!cache_addr) {
105 return GetNullSurface(SurfaceParams::ExpectedTarget(entry)); 105 return GetNullSurface(SurfaceParams::ExpectedTarget(entry));
106 } 106 }
107
108 if (!IsTypeCompatible(tic.texture_type, entry)) {
109 return GetNullSurface(SurfaceParams::ExpectedTarget(entry));
110 }
111
107 const auto params{SurfaceParams::CreateForTexture(format_lookup_table, tic, entry)}; 112 const auto params{SurfaceParams::CreateForTexture(format_lookup_table, tic, entry)};
108 const auto [surface, view] = GetSurface(gpu_addr, cache_addr, params, true, false); 113 const auto [surface, view] = GetSurface(gpu_addr, cache_addr, params, true, false);
109 if (guard_samplers) { 114 if (guard_samplers) {
@@ -914,13 +919,15 @@ private:
914 params.width = 1; 919 params.width = 1;
915 params.height = 1; 920 params.height = 1;
916 params.depth = 1; 921 params.depth = 1;
922 if (target == SurfaceTarget::TextureCubemap || target == SurfaceTarget::TextureCubeArray) {
923 params.depth = 6;
924 }
917 params.pitch = 4; 925 params.pitch = 4;
918 params.num_levels = 1; 926 params.num_levels = 1;
919 params.emulated_levels = 1; 927 params.emulated_levels = 1;
920 params.pixel_format = VideoCore::Surface::PixelFormat::RGBA16F; 928 params.pixel_format = VideoCore::Surface::PixelFormat::R8U;
921 params.type = VideoCore::Surface::SurfaceType::ColorTexture; 929 params.type = VideoCore::Surface::SurfaceType::ColorTexture;
922 auto surface = CreateSurface(0ULL, params); 930 auto surface = CreateSurface(0ULL, params);
923 invalid_memory.clear();
924 invalid_memory.resize(surface->GetHostSizeInBytes(), 0U); 931 invalid_memory.resize(surface->GetHostSizeInBytes(), 0U);
925 surface->UploadTexture(invalid_memory); 932 surface->UploadTexture(invalid_memory);
926 surface->MarkAsModified(false, Tick()); 933 surface->MarkAsModified(false, Tick());
@@ -1082,6 +1089,36 @@ private:
1082 return siblings_table[static_cast<std::size_t>(format)]; 1089 return siblings_table[static_cast<std::size_t>(format)];
1083 } 1090 }
1084 1091
1092 /// Returns true if the shader sampler entry is compatible with the TIC texture type.
1093 static bool IsTypeCompatible(Tegra::Texture::TextureType tic_type,
1094 const VideoCommon::Shader::Sampler& entry) {
1095 const auto shader_type = entry.GetType();
1096 switch (tic_type) {
1097 case Tegra::Texture::TextureType::Texture1D:
1098 case Tegra::Texture::TextureType::Texture1DArray:
1099 return shader_type == Tegra::Shader::TextureType::Texture1D;
1100 case Tegra::Texture::TextureType::Texture1DBuffer:
1101 // TODO(Rodrigo): Assume as valid for now
1102 return true;
1103 case Tegra::Texture::TextureType::Texture2D:
1104 case Tegra::Texture::TextureType::Texture2DNoMipmap:
1105 return shader_type == Tegra::Shader::TextureType::Texture2D;
1106 case Tegra::Texture::TextureType::Texture2DArray:
1107 return shader_type == Tegra::Shader::TextureType::Texture2D ||
1108 shader_type == Tegra::Shader::TextureType::TextureCube;
1109 case Tegra::Texture::TextureType::Texture3D:
1110 return shader_type == Tegra::Shader::TextureType::Texture3D;
1111 case Tegra::Texture::TextureType::TextureCubeArray:
1112 case Tegra::Texture::TextureType::TextureCubemap:
1113 if (shader_type == Tegra::Shader::TextureType::TextureCube) {
1114 return true;
1115 }
1116 return shader_type == Tegra::Shader::TextureType::Texture2D && entry.IsArray();
1117 }
1118 UNREACHABLE();
1119 return true;
1120 }
1121
1085 struct FramebufferTargetInfo { 1122 struct FramebufferTargetInfo {
1086 TSurface target; 1123 TSurface target;
1087 TView view; 1124 TView view;
diff --git a/src/video_core/textures/astc.cpp b/src/video_core/textures/astc.cpp
index 33bd31865..062b4f252 100644
--- a/src/video_core/textures/astc.cpp
+++ b/src/video_core/textures/astc.cpp
@@ -17,26 +17,37 @@
17 17
18#include <algorithm> 18#include <algorithm>
19#include <cassert> 19#include <cassert>
20#include <cstdint>
21#include <cstring> 20#include <cstring>
22#include <vector> 21#include <vector>
23 22
23#include "common/common_types.h"
24
24#include "video_core/textures/astc.h" 25#include "video_core/textures/astc.h"
25 26
27namespace {
28
29/// Count the number of bits set in a number.
30constexpr u32 Popcnt(u32 n) {
31 u32 c = 0;
32 for (; n; c++) {
33 n &= n - 1;
34 }
35 return c;
36}
37
38} // Anonymous namespace
39
26class InputBitStream { 40class InputBitStream {
27public: 41public:
28 explicit InputBitStream(const unsigned char* ptr, int start_offset = 0) 42 explicit InputBitStream(const u8* ptr, std::size_t start_offset = 0)
29 : m_CurByte(ptr), m_NextBit(start_offset % 8) {} 43 : m_CurByte(ptr), m_NextBit(start_offset % 8) {}
30 44
31 ~InputBitStream() = default; 45 std::size_t GetBitsRead() const {
32
33 int GetBitsRead() const {
34 return m_BitsRead; 46 return m_BitsRead;
35 } 47 }
36 48
37 int ReadBit() { 49 u32 ReadBit() {
38 50 u32 bit = *m_CurByte >> m_NextBit++;
39 int bit = *m_CurByte >> m_NextBit++;
40 while (m_NextBit >= 8) { 51 while (m_NextBit >= 8) {
41 m_NextBit -= 8; 52 m_NextBit -= 8;
42 m_CurByte++; 53 m_CurByte++;
@@ -46,57 +57,66 @@ public:
46 return bit & 1; 57 return bit & 1;
47 } 58 }
48 59
49 unsigned int ReadBits(unsigned int nBits) { 60 u32 ReadBits(std::size_t nBits) {
50 unsigned int ret = 0; 61 u32 ret = 0;
51 for (unsigned int i = 0; i < nBits; i++) { 62 for (std::size_t i = 0; i < nBits; ++i) {
63 ret |= (ReadBit() & 1) << i;
64 }
65 return ret;
66 }
67
68 template <std::size_t nBits>
69 u32 ReadBits() {
70 u32 ret = 0;
71 for (std::size_t i = 0; i < nBits; ++i) {
52 ret |= (ReadBit() & 1) << i; 72 ret |= (ReadBit() & 1) << i;
53 } 73 }
54 return ret; 74 return ret;
55 } 75 }
56 76
57private: 77private:
58 const unsigned char* m_CurByte; 78 const u8* m_CurByte;
59 int m_NextBit = 0; 79 std::size_t m_NextBit = 0;
60 int m_BitsRead = 0; 80 std::size_t m_BitsRead = 0;
61}; 81};
62 82
63class OutputBitStream { 83class OutputBitStream {
64public: 84public:
65 explicit OutputBitStream(unsigned char* ptr, int nBits = 0, int start_offset = 0) 85 explicit OutputBitStream(u8* ptr, s32 nBits = 0, s32 start_offset = 0)
66 : m_NumBits(nBits), m_CurByte(ptr), m_NextBit(start_offset % 8) {} 86 : m_NumBits(nBits), m_CurByte(ptr), m_NextBit(start_offset % 8) {}
67 87
68 ~OutputBitStream() = default; 88 ~OutputBitStream() = default;
69 89
70 int GetBitsWritten() const { 90 s32 GetBitsWritten() const {
71 return m_BitsWritten; 91 return m_BitsWritten;
72 } 92 }
73 93
74 void WriteBitsR(unsigned int val, unsigned int nBits) { 94 void WriteBitsR(u32 val, u32 nBits) {
75 for (unsigned int i = 0; i < nBits; i++) { 95 for (u32 i = 0; i < nBits; i++) {
76 WriteBit((val >> (nBits - i - 1)) & 1); 96 WriteBit((val >> (nBits - i - 1)) & 1);
77 } 97 }
78 } 98 }
79 99
80 void WriteBits(unsigned int val, unsigned int nBits) { 100 void WriteBits(u32 val, u32 nBits) {
81 for (unsigned int i = 0; i < nBits; i++) { 101 for (u32 i = 0; i < nBits; i++) {
82 WriteBit((val >> i) & 1); 102 WriteBit((val >> i) & 1);
83 } 103 }
84 } 104 }
85 105
86private: 106private:
87 void WriteBit(int b) { 107 void WriteBit(s32 b) {
88 108
89 if (done) 109 if (done)
90 return; 110 return;
91 111
92 const unsigned int mask = 1 << m_NextBit++; 112 const u32 mask = 1 << m_NextBit++;
93 113
94 // clear the bit 114 // clear the bit
95 *m_CurByte &= static_cast<unsigned char>(~mask); 115 *m_CurByte &= static_cast<u8>(~mask);
96 116
97 // Write the bit, if necessary 117 // Write the bit, if necessary
98 if (b) 118 if (b)
99 *m_CurByte |= static_cast<unsigned char>(mask); 119 *m_CurByte |= static_cast<u8>(mask);
100 120
101 // Next byte? 121 // Next byte?
102 if (m_NextBit >= 8) { 122 if (m_NextBit >= 8) {
@@ -107,10 +127,10 @@ private:
107 done = done || ++m_BitsWritten >= m_NumBits; 127 done = done || ++m_BitsWritten >= m_NumBits;
108 } 128 }
109 129
110 int m_BitsWritten = 0; 130 s32 m_BitsWritten = 0;
111 const int m_NumBits; 131 const s32 m_NumBits;
112 unsigned char* m_CurByte; 132 u8* m_CurByte;
113 int m_NextBit = 0; 133 s32 m_NextBit = 0;
114 134
115 bool done = false; 135 bool done = false;
116}; 136};
@@ -123,20 +143,20 @@ public:
123 Bits(const Bits&) = delete; 143 Bits(const Bits&) = delete;
124 Bits& operator=(const Bits&) = delete; 144 Bits& operator=(const Bits&) = delete;
125 145
126 uint8_t operator[](uint32_t bitPos) const { 146 u8 operator[](u32 bitPos) const {
127 return static_cast<uint8_t>((m_Bits >> bitPos) & 1); 147 return static_cast<u8>((m_Bits >> bitPos) & 1);
128 } 148 }
129 149
130 IntType operator()(uint32_t start, uint32_t end) const { 150 IntType operator()(u32 start, u32 end) const {
131 if (start == end) { 151 if (start == end) {
132 return (*this)[start]; 152 return (*this)[start];
133 } else if (start > end) { 153 } else if (start > end) {
134 uint32_t t = start; 154 u32 t = start;
135 start = end; 155 start = end;
136 end = t; 156 end = t;
137 } 157 }
138 158
139 uint64_t mask = (1 << (end - start + 1)) - 1; 159 u64 mask = (1 << (end - start + 1)) - 1;
140 return (m_Bits >> start) & static_cast<IntType>(mask); 160 return (m_Bits >> start) & static_cast<IntType>(mask);
141 } 161 }
142 162
@@ -144,273 +164,236 @@ private:
144 const IntType& m_Bits; 164 const IntType& m_Bits;
145}; 165};
146 166
147enum EIntegerEncoding { eIntegerEncoding_JustBits, eIntegerEncoding_Quint, eIntegerEncoding_Trit }; 167enum class IntegerEncoding { JustBits, Qus32, Trit };
148
149class IntegerEncodedValue {
150private:
151 const EIntegerEncoding m_Encoding;
152 const uint32_t m_NumBits;
153 uint32_t m_BitValue;
154 union {
155 uint32_t m_QuintValue;
156 uint32_t m_TritValue;
157 };
158 168
159public: 169struct IntegerEncodedValue {
160 // Jank, but we're not doing any heavy lifting in this class, so it's 170 constexpr IntegerEncodedValue() = default;
161 // probably OK. It allows us to use these in std::vectors...
162 IntegerEncodedValue& operator=(const IntegerEncodedValue& other) {
163 new (this) IntegerEncodedValue(other);
164 return *this;
165 }
166 171
167 IntegerEncodedValue(EIntegerEncoding encoding, uint32_t numBits) 172 constexpr IntegerEncodedValue(IntegerEncoding encoding_, u32 num_bits_)
168 : m_Encoding(encoding), m_NumBits(numBits) {} 173 : encoding{encoding_}, num_bits{num_bits_} {}
169 174
170 EIntegerEncoding GetEncoding() const { 175 constexpr bool MatchesEncoding(const IntegerEncodedValue& other) const {
171 return m_Encoding; 176 return encoding == other.encoding && num_bits == other.num_bits;
172 }
173 uint32_t BaseBitLength() const {
174 return m_NumBits;
175 }
176
177 uint32_t GetBitValue() const {
178 return m_BitValue;
179 }
180 void SetBitValue(uint32_t val) {
181 m_BitValue = val;
182 }
183
184 uint32_t GetTritValue() const {
185 return m_TritValue;
186 }
187 void SetTritValue(uint32_t val) {
188 m_TritValue = val;
189 }
190
191 uint32_t GetQuintValue() const {
192 return m_QuintValue;
193 }
194 void SetQuintValue(uint32_t val) {
195 m_QuintValue = val;
196 }
197
198 bool MatchesEncoding(const IntegerEncodedValue& other) const {
199 return m_Encoding == other.m_Encoding && m_NumBits == other.m_NumBits;
200 } 177 }
201 178
202 // Returns the number of bits required to encode nVals values. 179 // Returns the number of bits required to encode nVals values.
203 uint32_t GetBitLength(uint32_t nVals) const { 180 u32 GetBitLength(u32 nVals) const {
204 uint32_t totalBits = m_NumBits * nVals; 181 u32 totalBits = num_bits * nVals;
205 if (m_Encoding == eIntegerEncoding_Trit) { 182 if (encoding == IntegerEncoding::Trit) {
206 totalBits += (nVals * 8 + 4) / 5; 183 totalBits += (nVals * 8 + 4) / 5;
207 } else if (m_Encoding == eIntegerEncoding_Quint) { 184 } else if (encoding == IntegerEncoding::Qus32) {
208 totalBits += (nVals * 7 + 2) / 3; 185 totalBits += (nVals * 7 + 2) / 3;
209 } 186 }
210 return totalBits; 187 return totalBits;
211 } 188 }
212 189
213 // Count the number of bits set in a number. 190 IntegerEncoding encoding{};
214 static inline uint32_t Popcnt(uint32_t n) { 191 u32 num_bits = 0;
215 uint32_t c; 192 u32 bit_value = 0;
216 for (c = 0; n; c++) { 193 union {
217 n &= n - 1; 194 u32 qus32_value = 0;
195 u32 trit_value;
196 };
197};
198
199static void DecodeTritBlock(InputBitStream& bits, std::vector<IntegerEncodedValue>& result,
200 u32 nBitsPerValue) {
201 // Implement the algorithm in section C.2.12
202 u32 m[5];
203 u32 t[5];
204 u32 T;
205
206 // Read the trit encoded block according to
207 // table C.2.14
208 m[0] = bits.ReadBits(nBitsPerValue);
209 T = bits.ReadBits<2>();
210 m[1] = bits.ReadBits(nBitsPerValue);
211 T |= bits.ReadBits<2>() << 2;
212 m[2] = bits.ReadBits(nBitsPerValue);
213 T |= bits.ReadBit() << 4;
214 m[3] = bits.ReadBits(nBitsPerValue);
215 T |= bits.ReadBits<2>() << 5;
216 m[4] = bits.ReadBits(nBitsPerValue);
217 T |= bits.ReadBit() << 7;
218
219 u32 C = 0;
220
221 Bits<u32> Tb(T);
222 if (Tb(2, 4) == 7) {
223 C = (Tb(5, 7) << 2) | Tb(0, 1);
224 t[4] = t[3] = 2;
225 } else {
226 C = Tb(0, 4);
227 if (Tb(5, 6) == 3) {
228 t[4] = 2;
229 t[3] = Tb[7];
230 } else {
231 t[4] = Tb[7];
232 t[3] = Tb(5, 6);
218 } 233 }
219 return c;
220 } 234 }
221 235
222 // Returns a new instance of this struct that corresponds to the 236 Bits<u32> Cb(C);
223 // can take no more than maxval values 237 if (Cb(0, 1) == 3) {
224 static IntegerEncodedValue CreateEncoding(uint32_t maxVal) { 238 t[2] = 2;
225 while (maxVal > 0) { 239 t[1] = Cb[4];
226 uint32_t check = maxVal + 1; 240 t[0] = (Cb[3] << 1) | (Cb[2] & ~Cb[3]);
227 241 } else if (Cb(2, 3) == 3) {
228 // Is maxVal a power of two? 242 t[2] = 2;
229 if (!(check & (check - 1))) { 243 t[1] = 2;
230 return IntegerEncodedValue(eIntegerEncoding_JustBits, Popcnt(maxVal)); 244 t[0] = Cb(0, 1);
231 } 245 } else {
232 246 t[2] = Cb[4];
233 // Is maxVal of the type 3*2^n - 1? 247 t[1] = Cb(2, 3);
234 if ((check % 3 == 0) && !((check / 3) & ((check / 3) - 1))) { 248 t[0] = (Cb[1] << 1) | (Cb[0] & ~Cb[1]);
235 return IntegerEncodedValue(eIntegerEncoding_Trit, Popcnt(check / 3 - 1)); 249 }
236 }
237 250
238 // Is maxVal of the type 5*2^n - 1? 251 for (std::size_t i = 0; i < 5; ++i) {
239 if ((check % 5 == 0) && !((check / 5) & ((check / 5) - 1))) { 252 IntegerEncodedValue& val = result.emplace_back(IntegerEncoding::Trit, nBitsPerValue);
240 return IntegerEncodedValue(eIntegerEncoding_Quint, Popcnt(check / 5 - 1)); 253 val.bit_value = m[i];
241 } 254 val.trit_value = t[i];
255 }
256}
242 257
243 // Apparently it can't be represented with a bounded integer sequence... 258static void DecodeQus32Block(InputBitStream& bits, std::vector<IntegerEncodedValue>& result,
244 // just iterate. 259 u32 nBitsPerValue) {
245 maxVal--; 260 // Implement the algorithm in section C.2.12
261 u32 m[3];
262 u32 q[3];
263 u32 Q;
264
265 // Read the trit encoded block according to
266 // table C.2.15
267 m[0] = bits.ReadBits(nBitsPerValue);
268 Q = bits.ReadBits<3>();
269 m[1] = bits.ReadBits(nBitsPerValue);
270 Q |= bits.ReadBits<2>() << 3;
271 m[2] = bits.ReadBits(nBitsPerValue);
272 Q |= bits.ReadBits<2>() << 5;
273
274 Bits<u32> Qb(Q);
275 if (Qb(1, 2) == 3 && Qb(5, 6) == 0) {
276 q[0] = q[1] = 4;
277 q[2] = (Qb[0] << 2) | ((Qb[4] & ~Qb[0]) << 1) | (Qb[3] & ~Qb[0]);
278 } else {
279 u32 C = 0;
280 if (Qb(1, 2) == 3) {
281 q[2] = 4;
282 C = (Qb(3, 4) << 3) | ((~Qb(5, 6) & 3) << 1) | Qb[0];
283 } else {
284 q[2] = Qb(5, 6);
285 C = Qb(0, 4);
246 } 286 }
247 return IntegerEncodedValue(eIntegerEncoding_JustBits, 0);
248 }
249
250 // Fills result with the values that are encoded in the given
251 // bitstream. We must know beforehand what the maximum possible
252 // value is, and how many values we're decoding.
253 static void DecodeIntegerSequence(std::vector<IntegerEncodedValue>& result,
254 InputBitStream& bits, uint32_t maxRange, uint32_t nValues) {
255 // Determine encoding parameters
256 IntegerEncodedValue val = IntegerEncodedValue::CreateEncoding(maxRange);
257
258 // Start decoding
259 uint32_t nValsDecoded = 0;
260 while (nValsDecoded < nValues) {
261 switch (val.GetEncoding()) {
262 case eIntegerEncoding_Quint:
263 DecodeQuintBlock(bits, result, val.BaseBitLength());
264 nValsDecoded += 3;
265 break;
266 287
267 case eIntegerEncoding_Trit: 288 Bits<u32> Cb(C);
268 DecodeTritBlock(bits, result, val.BaseBitLength()); 289 if (Cb(0, 2) == 5) {
269 nValsDecoded += 5; 290 q[1] = 4;
270 break; 291 q[0] = Cb(3, 4);
271 292 } else {
272 case eIntegerEncoding_JustBits: 293 q[1] = Cb(3, 4);
273 val.SetBitValue(bits.ReadBits(val.BaseBitLength())); 294 q[0] = Cb(0, 2);
274 result.push_back(val);
275 nValsDecoded++;
276 break;
277 }
278 } 295 }
279 } 296 }
280 297
281private: 298 for (std::size_t i = 0; i < 3; ++i) {
282 static void DecodeTritBlock(InputBitStream& bits, std::vector<IntegerEncodedValue>& result, 299 IntegerEncodedValue& val = result.emplace_back(IntegerEncoding::Qus32, nBitsPerValue);
283 uint32_t nBitsPerValue) { 300 val.bit_value = m[i];
284 // Implement the algorithm in section C.2.12 301 val.qus32_value = q[i];
285 uint32_t m[5]; 302 }
286 uint32_t t[5]; 303}
287 uint32_t T; 304
 288                                                    305// Returns a new instance of this struct that corresponds to an encoding that
 289 // Read the trit encoded block according to 306// can take values no larger than maxVal.
290 // table C.2.14 307static constexpr IntegerEncodedValue CreateEncoding(u32 maxVal) {
291 m[0] = bits.ReadBits(nBitsPerValue); 308 while (maxVal > 0) {
292 T = bits.ReadBits(2); 309 u32 check = maxVal + 1;
293 m[1] = bits.ReadBits(nBitsPerValue); 310
294 T |= bits.ReadBits(2) << 2; 311 // Is maxVal a power of two?
295 m[2] = bits.ReadBits(nBitsPerValue); 312 if (!(check & (check - 1))) {
296 T |= bits.ReadBit() << 4; 313 return IntegerEncodedValue(IntegerEncoding::JustBits, Popcnt(maxVal));
297 m[3] = bits.ReadBits(nBitsPerValue);
298 T |= bits.ReadBits(2) << 5;
299 m[4] = bits.ReadBits(nBitsPerValue);
300 T |= bits.ReadBit() << 7;
301
302 uint32_t C = 0;
303
304 Bits<uint32_t> Tb(T);
305 if (Tb(2, 4) == 7) {
306 C = (Tb(5, 7) << 2) | Tb(0, 1);
307 t[4] = t[3] = 2;
308 } else {
309 C = Tb(0, 4);
310 if (Tb(5, 6) == 3) {
311 t[4] = 2;
312 t[3] = Tb[7];
313 } else {
314 t[4] = Tb[7];
315 t[3] = Tb(5, 6);
316 }
317 } 314 }
318 315
319 Bits<uint32_t> Cb(C); 316 // Is maxVal of the type 3*2^n - 1?
320 if (Cb(0, 1) == 3) { 317 if ((check % 3 == 0) && !((check / 3) & ((check / 3) - 1))) {
321 t[2] = 2; 318 return IntegerEncodedValue(IntegerEncoding::Trit, Popcnt(check / 3 - 1));
322 t[1] = Cb[4];
323 t[0] = (Cb[3] << 1) | (Cb[2] & ~Cb[3]);
324 } else if (Cb(2, 3) == 3) {
325 t[2] = 2;
326 t[1] = 2;
327 t[0] = Cb(0, 1);
328 } else {
329 t[2] = Cb[4];
330 t[1] = Cb(2, 3);
331 t[0] = (Cb[1] << 1) | (Cb[0] & ~Cb[1]);
332 } 319 }
333 320
334 for (uint32_t i = 0; i < 5; i++) { 321 // Is maxVal of the type 5*2^n - 1?
335 IntegerEncodedValue val(eIntegerEncoding_Trit, nBitsPerValue); 322 if ((check % 5 == 0) && !((check / 5) & ((check / 5) - 1))) {
336 val.SetBitValue(m[i]); 323 return IntegerEncodedValue(IntegerEncoding::Qus32, Popcnt(check / 5 - 1));
337 val.SetTritValue(t[i]);
338 result.push_back(val);
339 } 324 }
325
326 // Apparently it can't be represented with a bounded integer sequence...
327 // just iterate.
328 maxVal--;
340 } 329 }
330 return IntegerEncodedValue(IntegerEncoding::JustBits, 0);
331}
341 332
342 static void DecodeQuintBlock(InputBitStream& bits, std::vector<IntegerEncodedValue>& result, 333static constexpr std::array<IntegerEncodedValue, 256> MakeEncodedValues() {
343 uint32_t nBitsPerValue) { 334 std::array<IntegerEncodedValue, 256> encodings{};
344 // Implement the algorithm in section C.2.12 335 for (std::size_t i = 0; i < encodings.size(); ++i) {
345 uint32_t m[3]; 336 encodings[i] = CreateEncoding(static_cast<u32>(i));
346 uint32_t q[3]; 337 }
347 uint32_t Q; 338 return encodings;
348 339}
349 // Read the trit encoded block according to
350 // table C.2.15
351 m[0] = bits.ReadBits(nBitsPerValue);
352 Q = bits.ReadBits(3);
353 m[1] = bits.ReadBits(nBitsPerValue);
354 Q |= bits.ReadBits(2) << 3;
355 m[2] = bits.ReadBits(nBitsPerValue);
356 Q |= bits.ReadBits(2) << 5;
357
358 Bits<uint32_t> Qb(Q);
359 if (Qb(1, 2) == 3 && Qb(5, 6) == 0) {
360 q[0] = q[1] = 4;
361 q[2] = (Qb[0] << 2) | ((Qb[4] & ~Qb[0]) << 1) | (Qb[3] & ~Qb[0]);
362 } else {
363 uint32_t C = 0;
364 if (Qb(1, 2) == 3) {
365 q[2] = 4;
366 C = (Qb(3, 4) << 3) | ((~Qb(5, 6) & 3) << 1) | Qb[0];
367 } else {
368 q[2] = Qb(5, 6);
369 C = Qb(0, 4);
370 }
371 340
372 Bits<uint32_t> Cb(C); 341static constexpr std::array EncodingsValues = MakeEncodedValues();
373 if (Cb(0, 2) == 5) { 342
374 q[1] = 4; 343// Fills result with the values that are encoded in the given
375 q[0] = Cb(3, 4); 344// bitstream. We must know beforehand what the maximum possible
376 } else { 345// value is, and how many values we're decoding.
377 q[1] = Cb(3, 4); 346static void DecodeIntegerSequence(std::vector<IntegerEncodedValue>& result, InputBitStream& bits,
378 q[0] = Cb(0, 2); 347 u32 maxRange, u32 nValues) {
379 } 348 // Determine encoding parameters
380 } 349 IntegerEncodedValue val = EncodingsValues[maxRange];
350
351 // Start decoding
352 u32 nValsDecoded = 0;
353 while (nValsDecoded < nValues) {
354 switch (val.encoding) {
355 case IntegerEncoding::Qus32:
356 DecodeQus32Block(bits, result, val.num_bits);
357 nValsDecoded += 3;
358 break;
359
360 case IntegerEncoding::Trit:
361 DecodeTritBlock(bits, result, val.num_bits);
362 nValsDecoded += 5;
363 break;
381 364
382 for (uint32_t i = 0; i < 3; i++) { 365 case IntegerEncoding::JustBits:
383 IntegerEncodedValue val(eIntegerEncoding_Quint, nBitsPerValue); 366 val.bit_value = bits.ReadBits(val.num_bits);
384 val.m_BitValue = m[i];
385 val.m_QuintValue = q[i];
386 result.push_back(val); 367 result.push_back(val);
368 nValsDecoded++;
369 break;
387 } 370 }
388 } 371 }
389}; 372}
390 373
391namespace ASTCC { 374namespace ASTCC {
392 375
393struct TexelWeightParams { 376struct TexelWeightParams {
394 uint32_t m_Width = 0; 377 u32 m_Width = 0;
395 uint32_t m_Height = 0; 378 u32 m_Height = 0;
396 bool m_bDualPlane = false; 379 bool m_bDualPlane = false;
397 uint32_t m_MaxWeight = 0; 380 u32 m_MaxWeight = 0;
398 bool m_bError = false; 381 bool m_bError = false;
399 bool m_bVoidExtentLDR = false; 382 bool m_bVoidExtentLDR = false;
400 bool m_bVoidExtentHDR = false; 383 bool m_bVoidExtentHDR = false;
401 384
402 uint32_t GetPackedBitSize() const { 385 u32 GetPackedBitSize() const {
403 // How many indices do we have? 386 // How many indices do we have?
404 uint32_t nIdxs = m_Height * m_Width; 387 u32 nIdxs = m_Height * m_Width;
405 if (m_bDualPlane) { 388 if (m_bDualPlane) {
406 nIdxs *= 2; 389 nIdxs *= 2;
407 } 390 }
408 391
409 return IntegerEncodedValue::CreateEncoding(m_MaxWeight).GetBitLength(nIdxs); 392 return EncodingsValues[m_MaxWeight].GetBitLength(nIdxs);
410 } 393 }
411 394
412 uint32_t GetNumWeightValues() const { 395 u32 GetNumWeightValues() const {
413 uint32_t ret = m_Width * m_Height; 396 u32 ret = m_Width * m_Height;
414 if (m_bDualPlane) { 397 if (m_bDualPlane) {
415 ret *= 2; 398 ret *= 2;
416 } 399 }
@@ -422,7 +405,7 @@ static TexelWeightParams DecodeBlockInfo(InputBitStream& strm) {
422 TexelWeightParams params; 405 TexelWeightParams params;
423 406
424 // Read the entire block mode all at once 407 // Read the entire block mode all at once
425 uint16_t modeBits = static_cast<uint16_t>(strm.ReadBits(11)); 408 u16 modeBits = static_cast<u16>(strm.ReadBits<11>());
426 409
427 // Does this match the void extent block mode? 410 // Does this match the void extent block mode?
428 if ((modeBits & 0x01FF) == 0x1FC) { 411 if ((modeBits & 0x01FF) == 0x1FC) {
@@ -457,7 +440,7 @@ static TexelWeightParams DecodeBlockInfo(InputBitStream& strm) {
457 // of the block mode. Layout is determined by a number 440 // of the block mode. Layout is determined by a number
458 // between 0 and 9 corresponding to table C.2.8 of the 441 // between 0 and 9 corresponding to table C.2.8 of the
459 // ASTC spec. 442 // ASTC spec.
460 uint32_t layout = 0; 443 u32 layout = 0;
461 444
462 if ((modeBits & 0x1) || (modeBits & 0x2)) { 445 if ((modeBits & 0x1) || (modeBits & 0x2)) {
463 // layout is in [0-4] 446 // layout is in [0-4]
@@ -509,7 +492,7 @@ static TexelWeightParams DecodeBlockInfo(InputBitStream& strm) {
509 assert(layout < 10); 492 assert(layout < 10);
510 493
511 // Determine R 494 // Determine R
512 uint32_t R = !!(modeBits & 0x10); 495 u32 R = !!(modeBits & 0x10);
513 if (layout < 5) { 496 if (layout < 5) {
514 R |= (modeBits & 0x3) << 1; 497 R |= (modeBits & 0x3) << 1;
515 } else { 498 } else {
@@ -520,54 +503,54 @@ static TexelWeightParams DecodeBlockInfo(InputBitStream& strm) {
520 // Determine width & height 503 // Determine width & height
521 switch (layout) { 504 switch (layout) {
522 case 0: { 505 case 0: {
523 uint32_t A = (modeBits >> 5) & 0x3; 506 u32 A = (modeBits >> 5) & 0x3;
524 uint32_t B = (modeBits >> 7) & 0x3; 507 u32 B = (modeBits >> 7) & 0x3;
525 params.m_Width = B + 4; 508 params.m_Width = B + 4;
526 params.m_Height = A + 2; 509 params.m_Height = A + 2;
527 break; 510 break;
528 } 511 }
529 512
530 case 1: { 513 case 1: {
531 uint32_t A = (modeBits >> 5) & 0x3; 514 u32 A = (modeBits >> 5) & 0x3;
532 uint32_t B = (modeBits >> 7) & 0x3; 515 u32 B = (modeBits >> 7) & 0x3;
533 params.m_Width = B + 8; 516 params.m_Width = B + 8;
534 params.m_Height = A + 2; 517 params.m_Height = A + 2;
535 break; 518 break;
536 } 519 }
537 520
538 case 2: { 521 case 2: {
539 uint32_t A = (modeBits >> 5) & 0x3; 522 u32 A = (modeBits >> 5) & 0x3;
540 uint32_t B = (modeBits >> 7) & 0x3; 523 u32 B = (modeBits >> 7) & 0x3;
541 params.m_Width = A + 2; 524 params.m_Width = A + 2;
542 params.m_Height = B + 8; 525 params.m_Height = B + 8;
543 break; 526 break;
544 } 527 }
545 528
546 case 3: { 529 case 3: {
547 uint32_t A = (modeBits >> 5) & 0x3; 530 u32 A = (modeBits >> 5) & 0x3;
548 uint32_t B = (modeBits >> 7) & 0x1; 531 u32 B = (modeBits >> 7) & 0x1;
549 params.m_Width = A + 2; 532 params.m_Width = A + 2;
550 params.m_Height = B + 6; 533 params.m_Height = B + 6;
551 break; 534 break;
552 } 535 }
553 536
554 case 4: { 537 case 4: {
555 uint32_t A = (modeBits >> 5) & 0x3; 538 u32 A = (modeBits >> 5) & 0x3;
556 uint32_t B = (modeBits >> 7) & 0x1; 539 u32 B = (modeBits >> 7) & 0x1;
557 params.m_Width = B + 2; 540 params.m_Width = B + 2;
558 params.m_Height = A + 2; 541 params.m_Height = A + 2;
559 break; 542 break;
560 } 543 }
561 544
562 case 5: { 545 case 5: {
563 uint32_t A = (modeBits >> 5) & 0x3; 546 u32 A = (modeBits >> 5) & 0x3;
564 params.m_Width = 12; 547 params.m_Width = 12;
565 params.m_Height = A + 2; 548 params.m_Height = A + 2;
566 break; 549 break;
567 } 550 }
568 551
569 case 6: { 552 case 6: {
570 uint32_t A = (modeBits >> 5) & 0x3; 553 u32 A = (modeBits >> 5) & 0x3;
571 params.m_Width = A + 2; 554 params.m_Width = A + 2;
572 params.m_Height = 12; 555 params.m_Height = 12;
573 break; 556 break;
@@ -586,15 +569,15 @@ static TexelWeightParams DecodeBlockInfo(InputBitStream& strm) {
586 } 569 }
587 570
588 case 9: { 571 case 9: {
589 uint32_t A = (modeBits >> 5) & 0x3; 572 u32 A = (modeBits >> 5) & 0x3;
590 uint32_t B = (modeBits >> 9) & 0x3; 573 u32 B = (modeBits >> 9) & 0x3;
591 params.m_Width = A + 6; 574 params.m_Width = A + 6;
592 params.m_Height = B + 6; 575 params.m_Height = B + 6;
593 break; 576 break;
594 } 577 }
595 578
596 default: 579 default:
597 assert(!"Don't know this layout..."); 580 assert(false && "Don't know this layout...");
598 params.m_bError = true; 581 params.m_bError = true;
599 break; 582 break;
600 } 583 }
@@ -605,10 +588,10 @@ static TexelWeightParams DecodeBlockInfo(InputBitStream& strm) {
605 bool H = (layout != 9) && (modeBits & 0x200); 588 bool H = (layout != 9) && (modeBits & 0x200);
606 589
607 if (H) { 590 if (H) {
608 const uint32_t maxWeights[6] = {9, 11, 15, 19, 23, 31}; 591 const u32 maxWeights[6] = {9, 11, 15, 19, 23, 31};
609 params.m_MaxWeight = maxWeights[R - 2]; 592 params.m_MaxWeight = maxWeights[R - 2];
610 } else { 593 } else {
611 const uint32_t maxWeights[6] = {1, 2, 3, 4, 5, 7}; 594 const u32 maxWeights[6] = {1, 2, 3, 4, 5, 7};
612 params.m_MaxWeight = maxWeights[R - 2]; 595 params.m_MaxWeight = maxWeights[R - 2];
613 } 596 }
614 597
@@ -617,32 +600,32 @@ static TexelWeightParams DecodeBlockInfo(InputBitStream& strm) {
617 return params; 600 return params;
618} 601}
619 602
620static void FillVoidExtentLDR(InputBitStream& strm, uint32_t* const outBuf, uint32_t blockWidth, 603static void FillVoidExtentLDR(InputBitStream& strm, u32* const outBuf, u32 blockWidth,
621 uint32_t blockHeight) { 604 u32 blockHeight) {
622 // Don't actually care about the void extent, just read the bits... 605 // Don't actually care about the void extent, just read the bits...
623 for (int i = 0; i < 4; ++i) { 606 for (s32 i = 0; i < 4; ++i) {
624 strm.ReadBits(13); 607 strm.ReadBits<13>();
625 } 608 }
626 609
627 // Decode the RGBA components and renormalize them to the range [0, 255] 610 // Decode the RGBA components and renormalize them to the range [0, 255]
628 uint16_t r = static_cast<uint16_t>(strm.ReadBits(16)); 611 u16 r = static_cast<u16>(strm.ReadBits<16>());
629 uint16_t g = static_cast<uint16_t>(strm.ReadBits(16)); 612 u16 g = static_cast<u16>(strm.ReadBits<16>());
630 uint16_t b = static_cast<uint16_t>(strm.ReadBits(16)); 613 u16 b = static_cast<u16>(strm.ReadBits<16>());
631 uint16_t a = static_cast<uint16_t>(strm.ReadBits(16)); 614 u16 a = static_cast<u16>(strm.ReadBits<16>());
632 615
633 uint32_t rgba = (r >> 8) | (g & 0xFF00) | (static_cast<uint32_t>(b) & 0xFF00) << 8 | 616 u32 rgba = (r >> 8) | (g & 0xFF00) | (static_cast<u32>(b) & 0xFF00) << 8 |
634 (static_cast<uint32_t>(a) & 0xFF00) << 16; 617 (static_cast<u32>(a) & 0xFF00) << 16;
635 618
636 for (uint32_t j = 0; j < blockHeight; j++) { 619 for (u32 j = 0; j < blockHeight; j++) {
637 for (uint32_t i = 0; i < blockWidth; i++) { 620 for (u32 i = 0; i < blockWidth; i++) {
638 outBuf[j * blockWidth + i] = rgba; 621 outBuf[j * blockWidth + i] = rgba;
639 } 622 }
640 } 623 }
641} 624}
642 625
643static void FillError(uint32_t* outBuf, uint32_t blockWidth, uint32_t blockHeight) { 626static void FillError(u32* outBuf, u32 blockWidth, u32 blockHeight) {
644 for (uint32_t j = 0; j < blockHeight; j++) { 627 for (u32 j = 0; j < blockHeight; j++) {
645 for (uint32_t i = 0; i < blockWidth; i++) { 628 for (u32 i = 0; i < blockWidth; i++) {
646 outBuf[j * blockWidth + i] = 0xFFFF00FF; 629 outBuf[j * blockWidth + i] = 0xFFFF00FF;
647 } 630 }
648 } 631 }
@@ -651,18 +634,18 @@ static void FillError(uint32_t* outBuf, uint32_t blockWidth, uint32_t blockHeigh
651// Replicates low numBits such that [(toBit - 1):(toBit - 1 - fromBit)] 634// Replicates low numBits such that [(toBit - 1):(toBit - 1 - fromBit)]
652// is the same as [(numBits - 1):0] and repeats all the way down. 635// is the same as [(numBits - 1):0] and repeats all the way down.
653template <typename IntType> 636template <typename IntType>
654static IntType Replicate(const IntType& val, uint32_t numBits, uint32_t toBit) { 637static IntType Replicate(IntType val, u32 numBits, u32 toBit) {
655 if (numBits == 0) 638 if (numBits == 0)
656 return 0; 639 return 0;
657 if (toBit == 0) 640 if (toBit == 0)
658 return 0; 641 return 0;
659 IntType v = val & static_cast<IntType>((1 << numBits) - 1); 642 IntType v = val & static_cast<IntType>((1 << numBits) - 1);
660 IntType res = v; 643 IntType res = v;
661 uint32_t reslen = numBits; 644 u32 reslen = numBits;
662 while (reslen < toBit) { 645 while (reslen < toBit) {
663 uint32_t comp = 0; 646 u32 comp = 0;
664 if (numBits > toBit - reslen) { 647 if (numBits > toBit - reslen) {
665 uint32_t newshift = toBit - reslen; 648 u32 newshift = toBit - reslen;
666 comp = numBits - newshift; 649 comp = numBits - newshift;
667 numBits = newshift; 650 numBits = newshift;
668 } 651 }
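For readers following the Replicate helper above, here is a minimal standalone sketch of the same bit-replication idea for the common "expand to 8 bits" case; ReplicateTo8 is an illustrative name, not code from the diff, and it assumes 0 < numBits <= 8.

#include <cassert>
#include <cstdint>

// Repeat an n-bit value until 8 bits are filled, truncating the last copy.
static std::uint32_t ReplicateTo8(std::uint32_t v, unsigned numBits) {
    std::uint32_t result = 0;
    int filled = 0;
    while (filled < 8) {
        const int shift = 8 - filled - static_cast<int>(numBits);
        result |= (shift >= 0) ? (v << shift) : (v >> -shift);
        filled += static_cast<int>(numBits);
    }
    return result & 0xFF;
}

int main() {
    assert(ReplicateTo8(0x1F, 5) == 0xFF); // 11111 -> 11111111
    assert(ReplicateTo8(0x10, 5) == 0x84); // 10000 -> 10000100, same as Replicate(0x10, 5, 8)
}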
@@ -675,14 +658,14 @@ static IntType Replicate(const IntType& val, uint32_t numBits, uint32_t toBit) {
675 658
676class Pixel { 659class Pixel {
677protected: 660protected:
678 using ChannelType = int16_t; 661 using ChannelType = s16;
679 uint8_t m_BitDepth[4] = {8, 8, 8, 8}; 662 u8 m_BitDepth[4] = {8, 8, 8, 8};
680 int16_t color[4] = {}; 663 s16 color[4] = {};
681 664
682public: 665public:
683 Pixel() = default; 666 Pixel() = default;
684 Pixel(uint32_t a, uint32_t r, uint32_t g, uint32_t b, unsigned bitDepth = 8) 667 Pixel(u32 a, u32 r, u32 g, u32 b, u32 bitDepth = 8)
685 : m_BitDepth{uint8_t(bitDepth), uint8_t(bitDepth), uint8_t(bitDepth), uint8_t(bitDepth)}, 668 : m_BitDepth{u8(bitDepth), u8(bitDepth), u8(bitDepth), u8(bitDepth)},
686 color{static_cast<ChannelType>(a), static_cast<ChannelType>(r), 669 color{static_cast<ChannelType>(a), static_cast<ChannelType>(r),
687 static_cast<ChannelType>(g), static_cast<ChannelType>(b)} {} 670 static_cast<ChannelType>(g), static_cast<ChannelType>(b)} {}
688 671
@@ -691,22 +674,22 @@ public:
691 // significant bits when going from larger to smaller bit depth 674 // significant bits when going from larger to smaller bit depth
692 // or by repeating the most significant bits when going from 675 // or by repeating the most significant bits when going from
693 // smaller to larger bit depths. 676 // smaller to larger bit depths.
694 void ChangeBitDepth(const uint8_t (&depth)[4]) { 677 void ChangeBitDepth(const u8 (&depth)[4]) {
695 for (uint32_t i = 0; i < 4; i++) { 678 for (u32 i = 0; i < 4; i++) {
696 Component(i) = ChangeBitDepth(Component(i), m_BitDepth[i], depth[i]); 679 Component(i) = ChangeBitDepth(Component(i), m_BitDepth[i], depth[i]);
697 m_BitDepth[i] = depth[i]; 680 m_BitDepth[i] = depth[i];
698 } 681 }
699 } 682 }
700 683
701 template <typename IntType> 684 template <typename IntType>
702 static float ConvertChannelToFloat(IntType channel, uint8_t bitDepth) { 685 static float ConvertChannelToFloat(IntType channel, u8 bitDepth) {
703 float denominator = static_cast<float>((1 << bitDepth) - 1); 686 float denominator = static_cast<float>((1 << bitDepth) - 1);
704 return static_cast<float>(channel) / denominator; 687 return static_cast<float>(channel) / denominator;
705 } 688 }
706 689
707 // Changes the bit depth of a single component. See the comment 690 // Changes the bit depth of a single component. See the comment
708 // above for how we do this. 691 // above for how we do this.
709 static ChannelType ChangeBitDepth(Pixel::ChannelType val, uint8_t oldDepth, uint8_t newDepth) { 692 static ChannelType ChangeBitDepth(Pixel::ChannelType val, u8 oldDepth, u8 newDepth) {
710 assert(newDepth <= 8); 693 assert(newDepth <= 8);
711 assert(oldDepth <= 8); 694 assert(oldDepth <= 8);
712 695
@@ -722,16 +705,15 @@ public:
722 if (newDepth == 0) { 705 if (newDepth == 0) {
723 return 0xFF; 706 return 0xFF;
724 } else { 707 } else {
725 uint8_t bitsWasted = static_cast<uint8_t>(oldDepth - newDepth); 708 u8 bitsWasted = static_cast<u8>(oldDepth - newDepth);
726 uint16_t v = static_cast<uint16_t>(val); 709 u16 v = static_cast<u16>(val);
727 v = static_cast<uint16_t>((v + (1 << (bitsWasted - 1))) >> bitsWasted); 710 v = static_cast<u16>((v + (1 << (bitsWasted - 1))) >> bitsWasted);
728 v = ::std::min<uint16_t>(::std::max<uint16_t>(0, v), 711 v = ::std::min<u16>(::std::max<u16>(0, v), static_cast<u16>((1 << newDepth) - 1));
729 static_cast<uint16_t>((1 << newDepth) - 1)); 712 return static_cast<u8>(v);
730 return static_cast<uint8_t>(v);
731 } 713 }
732 } 714 }
733 715
734 assert(!"We shouldn't get here."); 716 assert(false && "We shouldn't get here.");
735 return 0; 717 return 0;
736 } 718 }
737 719
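A small worked check of the oldDepth > newDepth branch above, rounding to the nearest representable level and clamping to the new maximum; ReduceDepth is an illustrative stand-in, not code from the diff, and it assumes oldDepth > newDepth.

#include <cassert>
#include <cstdint>

static std::uint8_t ReduceDepth(std::uint16_t v, unsigned oldDepth, unsigned newDepth) {
    const unsigned wasted = oldDepth - newDepth;
    // Add half of the dropped range before shifting so the result rounds to nearest.
    const std::uint16_t r = static_cast<std::uint16_t>((v + (1u << (wasted - 1))) >> wasted);
    const std::uint16_t maxVal = static_cast<std::uint16_t>((1u << newDepth) - 1);
    return static_cast<std::uint8_t>(r > maxVal ? maxVal : r);
}

int main() {
    assert(ReduceDepth(200, 8, 5) == 25); // 200/255 is roughly 25/31
    assert(ReduceDepth(255, 8, 5) == 31); // saturates at the new maximum
}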
@@ -759,15 +741,15 @@ public:
759 ChannelType& B() { 741 ChannelType& B() {
760 return color[3]; 742 return color[3];
761 } 743 }
762 const ChannelType& Component(uint32_t idx) const { 744 const ChannelType& Component(u32 idx) const {
763 return color[idx]; 745 return color[idx];
764 } 746 }
765 ChannelType& Component(uint32_t idx) { 747 ChannelType& Component(u32 idx) {
766 return color[idx]; 748 return color[idx];
767 } 749 }
768 750
769 void GetBitDepth(uint8_t (&outDepth)[4]) const { 751 void GetBitDepth(u8 (&outDepth)[4]) const {
770 for (int i = 0; i < 4; i++) { 752 for (s32 i = 0; i < 4; i++) {
771 outDepth[i] = m_BitDepth[i]; 753 outDepth[i] = m_BitDepth[i];
772 } 754 }
773 } 755 }
@@ -776,12 +758,12 @@ public:
776 // and then pack each channel into an R8G8B8A8 32-bit integer. We assume 758 // and then pack each channel into an R8G8B8A8 32-bit integer. We assume
777 // that the architecture is little-endian, so the alpha channel will end 759 // that the architecture is little-endian, so the alpha channel will end
778 // up in the most-significant byte. 760 // up in the most-significant byte.
779 uint32_t Pack() const { 761 u32 Pack() const {
780 Pixel eightBit(*this); 762 Pixel eightBit(*this);
781 const uint8_t eightBitDepth[4] = {8, 8, 8, 8}; 763 const u8 eightBitDepth[4] = {8, 8, 8, 8};
782 eightBit.ChangeBitDepth(eightBitDepth); 764 eightBit.ChangeBitDepth(eightBitDepth);
783 765
784 uint32_t r = 0; 766 u32 r = 0;
785 r |= eightBit.A(); 767 r |= eightBit.A();
786 r <<= 8; 768 r <<= 8;
787 r |= eightBit.B(); 769 r |= eightBit.B();
@@ -794,7 +776,7 @@ public:
794 776
795 // Clamps the pixel to the range [0,255] 777 // Clamps the pixel to the range [0,255]
796 void ClampByte() { 778 void ClampByte() {
797 for (uint32_t i = 0; i < 4; i++) { 779 for (u32 i = 0; i < 4; i++) {
798 color[i] = (color[i] < 0) ? 0 : ((color[i] > 255) ? 255 : color[i]); 780 color[i] = (color[i] < 0) ? 0 : ((color[i] > 255) ? 255 : color[i]);
799 } 781 }
800 } 782 }
@@ -804,24 +786,24 @@ public:
804 } 786 }
805}; 787};
806 788
807static void DecodeColorValues(uint32_t* out, uint8_t* data, const uint32_t* modes, 789static void DecodeColorValues(u32* out, u8* data, const u32* modes, const u32 nPartitions,
808 const uint32_t nPartitions, const uint32_t nBitsForColorData) { 790 const u32 nBitsForColorData) {
809 // First figure out how many color values we have 791 // First figure out how many color values we have
810 uint32_t nValues = 0; 792 u32 nValues = 0;
811 for (uint32_t i = 0; i < nPartitions; i++) { 793 for (u32 i = 0; i < nPartitions; i++) {
812 nValues += ((modes[i] >> 2) + 1) << 1; 794 nValues += ((modes[i] >> 2) + 1) << 1;
813 } 795 }
814 796
815 // Then based on the number of values and the remaining number of bits, 797 // Then based on the number of values and the remaining number of bits,
816 // figure out the max value for each of them... 798 // figure out the max value for each of them...
817 uint32_t range = 256; 799 u32 range = 256;
818 while (--range > 0) { 800 while (--range > 0) {
819 IntegerEncodedValue val = IntegerEncodedValue::CreateEncoding(range); 801 IntegerEncodedValue val = EncodingsValues[range];
820 uint32_t bitLength = val.GetBitLength(nValues); 802 u32 bitLength = val.GetBitLength(nValues);
821 if (bitLength <= nBitsForColorData) { 803 if (bitLength <= nBitsForColorData) {
822 // Find the smallest possible range that matches the given encoding 804 // Find the smallest possible range that matches the given encoding
823 while (--range > 0) { 805 while (--range > 0) {
824 IntegerEncodedValue newval = IntegerEncodedValue::CreateEncoding(range); 806 IntegerEncodedValue newval = EncodingsValues[range];
825 if (!newval.MatchesEncoding(val)) { 807 if (!newval.MatchesEncoding(val)) {
826 break; 808 break;
827 } 809 }
@@ -835,12 +817,14 @@ static void DecodeColorValues(uint32_t* out, uint8_t* data, const uint32_t* mode
835 817
836 // We now have enough to decode our integer sequence. 818 // We now have enough to decode our integer sequence.
837 std::vector<IntegerEncodedValue> decodedColorValues; 819 std::vector<IntegerEncodedValue> decodedColorValues;
820 decodedColorValues.reserve(32);
821
838 InputBitStream colorStream(data); 822 InputBitStream colorStream(data);
839 IntegerEncodedValue::DecodeIntegerSequence(decodedColorValues, colorStream, range, nValues); 823 DecodeIntegerSequence(decodedColorValues, colorStream, range, nValues);
840 824
841 // Once we have the decoded values, we need to dequantize them to the 0-255 range 825 // Once we have the decoded values, we need to dequantize them to the 0-255 range
842 // This procedure is outlined in ASTC spec C.2.13 826 // This procedure is outlined in ASTC spec C.2.13
843 uint32_t outIdx = 0; 827 u32 outIdx = 0;
844 for (auto itr = decodedColorValues.begin(); itr != decodedColorValues.end(); ++itr) { 828 for (auto itr = decodedColorValues.begin(); itr != decodedColorValues.end(); ++itr) {
845 // Have we already decoded all that we need? 829 // Have we already decoded all that we need?
846 if (outIdx >= nValues) { 830 if (outIdx >= nValues) {
@@ -848,25 +832,25 @@ static void DecodeColorValues(uint32_t* out, uint8_t* data, const uint32_t* mode
848 } 832 }
849 833
850 const IntegerEncodedValue& val = *itr; 834 const IntegerEncodedValue& val = *itr;
851 uint32_t bitlen = val.BaseBitLength(); 835 u32 bitlen = val.num_bits;
852 uint32_t bitval = val.GetBitValue(); 836 u32 bitval = val.bit_value;
853 837
854 assert(bitlen >= 1); 838 assert(bitlen >= 1);
855 839
856 uint32_t A = 0, B = 0, C = 0, D = 0; 840 u32 A = 0, B = 0, C = 0, D = 0;
857 // A is just the lsb replicated 9 times. 841 // A is just the lsb replicated 9 times.
858 A = Replicate(bitval & 1, 1, 9); 842 A = Replicate(bitval & 1, 1, 9);
859 843
860 switch (val.GetEncoding()) { 844 switch (val.encoding) {
861 // Replicate bits 845 // Replicate bits
862 case eIntegerEncoding_JustBits: 846 case IntegerEncoding::JustBits:
863 out[outIdx++] = Replicate(bitval, bitlen, 8); 847 out[outIdx++] = Replicate(bitval, bitlen, 8);
864 break; 848 break;
865 849
866 // Use algorithm in C.2.13 850 // Use algorithm in C.2.13
867 case eIntegerEncoding_Trit: { 851 case IntegerEncoding::Trit: {
868 852
869 D = val.GetTritValue(); 853 D = val.trit_value;
870 854
871 switch (bitlen) { 855 switch (bitlen) {
872 case 1: { 856 case 1: {
@@ -876,48 +860,48 @@ static void DecodeColorValues(uint32_t* out, uint8_t* data, const uint32_t* mode
876 case 2: { 860 case 2: {
877 C = 93; 861 C = 93;
878 // B = b000b0bb0 862 // B = b000b0bb0
879 uint32_t b = (bitval >> 1) & 1; 863 u32 b = (bitval >> 1) & 1;
880 B = (b << 8) | (b << 4) | (b << 2) | (b << 1); 864 B = (b << 8) | (b << 4) | (b << 2) | (b << 1);
881 } break; 865 } break;
882 866
883 case 3: { 867 case 3: {
884 C = 44; 868 C = 44;
885 // B = cb000cbcb 869 // B = cb000cbcb
886 uint32_t cb = (bitval >> 1) & 3; 870 u32 cb = (bitval >> 1) & 3;
887 B = (cb << 7) | (cb << 2) | cb; 871 B = (cb << 7) | (cb << 2) | cb;
888 } break; 872 } break;
889 873
890 case 4: { 874 case 4: {
891 C = 22; 875 C = 22;
892 // B = dcb000dcb 876 // B = dcb000dcb
893 uint32_t dcb = (bitval >> 1) & 7; 877 u32 dcb = (bitval >> 1) & 7;
894 B = (dcb << 6) | dcb; 878 B = (dcb << 6) | dcb;
895 } break; 879 } break;
896 880
897 case 5: { 881 case 5: {
898 C = 11; 882 C = 11;
899 // B = edcb000ed 883 // B = edcb000ed
900 uint32_t edcb = (bitval >> 1) & 0xF; 884 u32 edcb = (bitval >> 1) & 0xF;
901 B = (edcb << 5) | (edcb >> 2); 885 B = (edcb << 5) | (edcb >> 2);
902 } break; 886 } break;
903 887
904 case 6: { 888 case 6: {
905 C = 5; 889 C = 5;
906 // B = fedcb000f 890 // B = fedcb000f
907 uint32_t fedcb = (bitval >> 1) & 0x1F; 891 u32 fedcb = (bitval >> 1) & 0x1F;
908 B = (fedcb << 4) | (fedcb >> 4); 892 B = (fedcb << 4) | (fedcb >> 4);
909 } break; 893 } break;
910 894
911 default: 895 default:
912 assert(!"Unsupported trit encoding for color values!"); 896 assert(false && "Unsupported trit encoding for color values!");
913 break; 897 break;
914 } // switch(bitlen) 898 } // switch(bitlen)
915 } // case eIntegerEncoding_Trit 899 } // case IntegerEncoding::Trit
916 break; 900 break;
917 901
918 case eIntegerEncoding_Quint: { 902 case IntegerEncoding::Qus32: {
919 903
920 D = val.GetQuintValue(); 904 D = val.qus32_value;
921 905
922 switch (bitlen) { 906 switch (bitlen) {
923 case 1: { 907 case 1: {
@@ -927,41 +911,41 @@ static void DecodeColorValues(uint32_t* out, uint8_t* data, const uint32_t* mode
927 case 2: { 911 case 2: {
928 C = 54; 912 C = 54;
929 // B = b0000bb00 913 // B = b0000bb00
930 uint32_t b = (bitval >> 1) & 1; 914 u32 b = (bitval >> 1) & 1;
931 B = (b << 8) | (b << 3) | (b << 2); 915 B = (b << 8) | (b << 3) | (b << 2);
932 } break; 916 } break;
933 917
934 case 3: { 918 case 3: {
935 C = 26; 919 C = 26;
936 // B = cb0000cbc 920 // B = cb0000cbc
937 uint32_t cb = (bitval >> 1) & 3; 921 u32 cb = (bitval >> 1) & 3;
938 B = (cb << 7) | (cb << 1) | (cb >> 1); 922 B = (cb << 7) | (cb << 1) | (cb >> 1);
939 } break; 923 } break;
940 924
941 case 4: { 925 case 4: {
942 C = 13; 926 C = 13;
943 // B = dcb0000dc 927 // B = dcb0000dc
944 uint32_t dcb = (bitval >> 1) & 7; 928 u32 dcb = (bitval >> 1) & 7;
945 B = (dcb << 6) | (dcb >> 1); 929 B = (dcb << 6) | (dcb >> 1);
946 } break; 930 } break;
947 931
948 case 5: { 932 case 5: {
949 C = 6; 933 C = 6;
950 // B = edcb0000e 934 // B = edcb0000e
951 uint32_t edcb = (bitval >> 1) & 0xF; 935 u32 edcb = (bitval >> 1) & 0xF;
952 B = (edcb << 5) | (edcb >> 3); 936 B = (edcb << 5) | (edcb >> 3);
953 } break; 937 } break;
954 938
955 default: 939 default:
956 assert(!"Unsupported quint encoding for color values!"); 940 assert(false && "Unsupported quint encoding for color values!");
957 break; 941 break;
958 } // switch(bitlen) 942 } // switch(bitlen)
959 } // case eIntegerEncoding_Quint 943 } // case IntegerEncoding::Qus32
960 break; 944 break;
961 } // switch(val.GetEncoding()) 945 } // switch(val.encoding)
962 946
963 if (val.GetEncoding() != eIntegerEncoding_JustBits) { 947 if (val.encoding != IntegerEncoding::JustBits) {
964 uint32_t T = D * C + B; 948 u32 T = D * C + B;
965 T ^= A; 949 T ^= A;
966 T = (A & 0x80) | (T >> 2); 950 T = (A & 0x80) | (T >> 2);
967 out[outIdx++] = T; 951 out[outIdx++] = T;
@@ -969,31 +953,31 @@ static void DecodeColorValues(uint32_t* out, uint8_t* data, const uint32_t* mode
969 } 953 }
970 954
971 // Make sure that each of our values is in the proper range... 955 // Make sure that each of our values is in the proper range...
972 for (uint32_t i = 0; i < nValues; i++) { 956 for (u32 i = 0; i < nValues; i++) {
973 assert(out[i] <= 255); 957 assert(out[i] <= 255);
974 } 958 }
975} 959}
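The range search at the top of DecodeColorValues relies on how the ASTC integer sequence encoding classifies a quantization range: per the specification, a maximum value of 2^n - 1 needs n plain bits, 3*2^n - 1 needs a trit plus n bits, and 5*2^n - 1 needs a quint plus n bits. A hedged sketch of that classification follows; ClassifyRange and RangeEncoding are illustrative names, not part of the diff.

#include <cstdio>

struct RangeEncoding {
    bool uses_trit;
    bool uses_quint;
    unsigned num_bits;
};

// Returns false when max_value is not a representable ASTC range.
static bool ClassifyRange(unsigned max_value, RangeEncoding& out) {
    for (unsigned bits = 0; bits <= 8; ++bits) {
        if (max_value == (1u << bits) - 1) {
            out = {false, false, bits};
            return true;
        }
        if (max_value == 3u * (1u << bits) - 1) {
            out = {true, false, bits};
            return true;
        }
        if (max_value == 5u * (1u << bits) - 1) {
            out = {false, true, bits};
            return true;
        }
    }
    return false;
}

int main() {
    RangeEncoding enc{};
    if (ClassifyRange(11, enc)) {
        // 11 = 3 * 2^2 - 1: one trit plus two extra bits per value.
        std::printf("trit=%d quint=%d bits=%u\n", enc.uses_trit, enc.uses_quint, enc.num_bits);
    }
}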
976 960
977static uint32_t UnquantizeTexelWeight(const IntegerEncodedValue& val) { 961static u32 UnquantizeTexelWeight(const IntegerEncodedValue& val) {
978 uint32_t bitval = val.GetBitValue(); 962 u32 bitval = val.bit_value;
979 uint32_t bitlen = val.BaseBitLength(); 963 u32 bitlen = val.num_bits;
980 964
981 uint32_t A = Replicate(bitval & 1, 1, 7); 965 u32 A = Replicate(bitval & 1, 1, 7);
982 uint32_t B = 0, C = 0, D = 0; 966 u32 B = 0, C = 0, D = 0;
983 967
984 uint32_t result = 0; 968 u32 result = 0;
985 switch (val.GetEncoding()) { 969 switch (val.encoding) {
986 case eIntegerEncoding_JustBits: 970 case IntegerEncoding::JustBits:
987 result = Replicate(bitval, bitlen, 6); 971 result = Replicate(bitval, bitlen, 6);
988 break; 972 break;
989 973
990 case eIntegerEncoding_Trit: { 974 case IntegerEncoding::Trit: {
991 D = val.GetTritValue(); 975 D = val.trit_value;
992 assert(D < 3); 976 assert(D < 3);
993 977
994 switch (bitlen) { 978 switch (bitlen) {
995 case 0: { 979 case 0: {
996 uint32_t results[3] = {0, 32, 63}; 980 u32 results[3] = {0, 32, 63};
997 result = results[D]; 981 result = results[D];
998 } break; 982 } break;
999 983
@@ -1003,29 +987,29 @@ static uint32_t UnquantizeTexelWeight(const IntegerEncodedValue& val) {
1003 987
1004 case 2: { 988 case 2: {
1005 C = 23; 989 C = 23;
1006 uint32_t b = (bitval >> 1) & 1; 990 u32 b = (bitval >> 1) & 1;
1007 B = (b << 6) | (b << 2) | b; 991 B = (b << 6) | (b << 2) | b;
1008 } break; 992 } break;
1009 993
1010 case 3: { 994 case 3: {
1011 C = 11; 995 C = 11;
1012 uint32_t cb = (bitval >> 1) & 3; 996 u32 cb = (bitval >> 1) & 3;
1013 B = (cb << 5) | cb; 997 B = (cb << 5) | cb;
1014 } break; 998 } break;
1015 999
1016 default: 1000 default:
1017 assert(!"Invalid trit encoding for texel weight"); 1001 assert(false && "Invalid trit encoding for texel weight");
1018 break; 1002 break;
1019 } 1003 }
1020 } break; 1004 } break;
1021 1005
1022 case eIntegerEncoding_Quint: { 1006 case IntegerEncoding::Qus32: {
1023 D = val.GetQuintValue(); 1007 D = val.qus32_value;
1024 assert(D < 5); 1008 assert(D < 5);
1025 1009
1026 switch (bitlen) { 1010 switch (bitlen) {
1027 case 0: { 1011 case 0: {
1028 uint32_t results[5] = {0, 16, 32, 47, 63}; 1012 u32 results[5] = {0, 16, 32, 47, 63};
1029 result = results[D]; 1013 result = results[D];
1030 } break; 1014 } break;
1031 1015
@@ -1035,18 +1019,18 @@ static uint32_t UnquantizeTexelWeight(const IntegerEncodedValue& val) {
1035 1019
1036 case 2: { 1020 case 2: {
1037 C = 13; 1021 C = 13;
1038 uint32_t b = (bitval >> 1) & 1; 1022 u32 b = (bitval >> 1) & 1;
1039 B = (b << 6) | (b << 1); 1023 B = (b << 6) | (b << 1);
1040 } break; 1024 } break;
1041 1025
1042 default: 1026 default:
1043 assert(!"Invalid quint encoding for texel weight"); 1027 assert(false && "Invalid quint encoding for texel weight");
1044 break; 1028 break;
1045 } 1029 }
1046 } break; 1030 } break;
1047 } 1031 }
1048 1032
1049 if (val.GetEncoding() != eIntegerEncoding_JustBits && bitlen > 0) { 1033 if (val.encoding != IntegerEncoding::JustBits && bitlen > 0) {
1050 // Decode the value... 1034 // Decode the value...
1051 result = D * C + B; 1035 result = D * C + B;
1052 result ^= A; 1036 result ^= A;
@@ -1063,12 +1047,11 @@ static uint32_t UnquantizeTexelWeight(const IntegerEncodedValue& val) {
1063 return result; 1047 return result;
1064} 1048}
1065 1049
1066static void UnquantizeTexelWeights(uint32_t out[2][144], 1050static void UnquantizeTexelWeights(u32 out[2][144], const std::vector<IntegerEncodedValue>& weights,
1067 const std::vector<IntegerEncodedValue>& weights, 1051 const TexelWeightParams& params, const u32 blockWidth,
1068 const TexelWeightParams& params, const uint32_t blockWidth, 1052 const u32 blockHeight) {
1069 const uint32_t blockHeight) { 1053 u32 weightIdx = 0;
1070 uint32_t weightIdx = 0; 1054 u32 unquantized[2][144];
1071 uint32_t unquantized[2][144];
1072 1055
1073 for (auto itr = weights.begin(); itr != weights.end(); ++itr) { 1056 for (auto itr = weights.begin(); itr != weights.end(); ++itr) {
1074 unquantized[0][weightIdx] = UnquantizeTexelWeight(*itr); 1057 unquantized[0][weightIdx] = UnquantizeTexelWeight(*itr);
@@ -1086,34 +1069,34 @@ static void UnquantizeTexelWeights(uint32_t out[2][144],
1086 } 1069 }
1087 1070
1088 // Do infill if necessary (Section C.2.18) ... 1071 // Do infill if necessary (Section C.2.18) ...
1089 uint32_t Ds = (1024 + (blockWidth / 2)) / (blockWidth - 1); 1072 u32 Ds = (1024 + (blockWidth / 2)) / (blockWidth - 1);
1090 uint32_t Dt = (1024 + (blockHeight / 2)) / (blockHeight - 1); 1073 u32 Dt = (1024 + (blockHeight / 2)) / (blockHeight - 1);
1091 1074
1092 const uint32_t kPlaneScale = params.m_bDualPlane ? 2U : 1U; 1075 const u32 kPlaneScale = params.m_bDualPlane ? 2U : 1U;
1093 for (uint32_t plane = 0; plane < kPlaneScale; plane++) 1076 for (u32 plane = 0; plane < kPlaneScale; plane++)
1094 for (uint32_t t = 0; t < blockHeight; t++) 1077 for (u32 t = 0; t < blockHeight; t++)
1095 for (uint32_t s = 0; s < blockWidth; s++) { 1078 for (u32 s = 0; s < blockWidth; s++) {
1096 uint32_t cs = Ds * s; 1079 u32 cs = Ds * s;
1097 uint32_t ct = Dt * t; 1080 u32 ct = Dt * t;
1098 1081
1099 uint32_t gs = (cs * (params.m_Width - 1) + 32) >> 6; 1082 u32 gs = (cs * (params.m_Width - 1) + 32) >> 6;
1100 uint32_t gt = (ct * (params.m_Height - 1) + 32) >> 6; 1083 u32 gt = (ct * (params.m_Height - 1) + 32) >> 6;
1101 1084
1102 uint32_t js = gs >> 4; 1085 u32 js = gs >> 4;
1103 uint32_t fs = gs & 0xF; 1086 u32 fs = gs & 0xF;
1104 1087
1105 uint32_t jt = gt >> 4; 1088 u32 jt = gt >> 4;
1106 uint32_t ft = gt & 0x0F; 1089 u32 ft = gt & 0x0F;
1107 1090
1108 uint32_t w11 = (fs * ft + 8) >> 4; 1091 u32 w11 = (fs * ft + 8) >> 4;
1109 uint32_t w10 = ft - w11; 1092 u32 w10 = ft - w11;
1110 uint32_t w01 = fs - w11; 1093 u32 w01 = fs - w11;
1111 uint32_t w00 = 16 - fs - ft + w11; 1094 u32 w00 = 16 - fs - ft + w11;
1112 1095
1113 uint32_t v0 = js + jt * params.m_Width; 1096 u32 v0 = js + jt * params.m_Width;
1114 1097
1115#define FIND_TEXEL(tidx, bidx) \ 1098#define FIND_TEXEL(tidx, bidx) \
1116 uint32_t p##bidx = 0; \ 1099 u32 p##bidx = 0; \
1117 do { \ 1100 do { \
1118 if ((tidx) < (params.m_Width * params.m_Height)) { \ 1101 if ((tidx) < (params.m_Width * params.m_Height)) { \
1119 p##bidx = unquantized[plane][(tidx)]; \ 1102 p##bidx = unquantized[plane][(tidx)]; \
@@ -1133,7 +1116,7 @@ static void UnquantizeTexelWeights(uint32_t out[2][144],
1133} 1116}
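A standalone check of the C.2.18 infill arithmetic in UnquantizeTexelWeights above: for any fractional offsets fs and ft in [0, 15], the four bilinear weights always sum to 16, which is what makes the fixed-point blend of the four neighbouring texel weights exact. Not code from the diff, just a verification of the shown formulas.

#include <cassert>
#include <cstdint>

int main() {
    for (std::uint32_t fs = 0; fs < 16; ++fs) {
        for (std::uint32_t ft = 0; ft < 16; ++ft) {
            const std::uint32_t w11 = (fs * ft + 8) >> 4;
            const std::uint32_t w10 = ft - w11;
            const std::uint32_t w01 = fs - w11;
            const std::uint32_t w00 = 16 - fs - ft + w11;
            assert(w00 + w01 + w10 + w11 == 16);
        }
    }
}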
1134 1117
1135// Transfers a bit as described in C.2.14 1118// Transfers a bit as described in C.2.14
1136static inline void BitTransferSigned(int32_t& a, int32_t& b) { 1119static inline void BitTransferSigned(s32& a, s32& b) {
1137 b >>= 1; 1120 b >>= 1;
1138 b |= a & 0x80; 1121 b |= a & 0x80;
1139 a >>= 1; 1122 a >>= 1;
@@ -1144,14 +1127,14 @@ static inline void BitTransferSigned(int32_t& a, int32_t& b) {
1144 1127
1145// Adds more precision to the blue channel as described 1128// Adds more precision to the blue channel as described
1146// in C.2.14 1129// in C.2.14
1147static inline Pixel BlueContract(int32_t a, int32_t r, int32_t g, int32_t b) { 1130static inline Pixel BlueContract(s32 a, s32 r, s32 g, s32 b) {
1148 return Pixel(static_cast<int16_t>(a), static_cast<int16_t>((r + b) >> 1), 1131 return Pixel(static_cast<s16>(a), static_cast<s16>((r + b) >> 1),
1149 static_cast<int16_t>((g + b) >> 1), static_cast<int16_t>(b)); 1132 static_cast<s16>((g + b) >> 1), static_cast<s16>(b));
1150} 1133}
1151 1134
1152// Partition selection functions as specified in 1135// Partition selection functions as specified in
1153// C.2.21 1136// C.2.21
1154static inline uint32_t hash52(uint32_t p) { 1137static inline u32 hash52(u32 p) {
1155 p ^= p >> 15; 1138 p ^= p >> 15;
1156 p -= p << 17; 1139 p -= p << 17;
1157 p += p << 7; 1140 p += p << 7;
@@ -1165,8 +1148,7 @@ static inline uint32_t hash52(uint32_t p) {
1165 return p; 1148 return p;
1166} 1149}
1167 1150
1168static uint32_t SelectPartition(int32_t seed, int32_t x, int32_t y, int32_t z, 1151static u32 SelectPartition(s32 seed, s32 x, s32 y, s32 z, s32 partitionCount, s32 smallBlock) {
1169 int32_t partitionCount, int32_t smallBlock) {
1170 if (1 == partitionCount) 1152 if (1 == partitionCount)
1171 return 0; 1153 return 0;
1172 1154
@@ -1178,34 +1160,34 @@ static uint32_t SelectPartition(int32_t seed, int32_t x, int32_t y, int32_t z,
1178 1160
1179 seed += (partitionCount - 1) * 1024; 1161 seed += (partitionCount - 1) * 1024;
1180 1162
1181 uint32_t rnum = hash52(static_cast<uint32_t>(seed)); 1163 u32 rnum = hash52(static_cast<u32>(seed));
1182 uint8_t seed1 = static_cast<uint8_t>(rnum & 0xF); 1164 u8 seed1 = static_cast<u8>(rnum & 0xF);
1183 uint8_t seed2 = static_cast<uint8_t>((rnum >> 4) & 0xF); 1165 u8 seed2 = static_cast<u8>((rnum >> 4) & 0xF);
1184 uint8_t seed3 = static_cast<uint8_t>((rnum >> 8) & 0xF); 1166 u8 seed3 = static_cast<u8>((rnum >> 8) & 0xF);
1185 uint8_t seed4 = static_cast<uint8_t>((rnum >> 12) & 0xF); 1167 u8 seed4 = static_cast<u8>((rnum >> 12) & 0xF);
1186 uint8_t seed5 = static_cast<uint8_t>((rnum >> 16) & 0xF); 1168 u8 seed5 = static_cast<u8>((rnum >> 16) & 0xF);
1187 uint8_t seed6 = static_cast<uint8_t>((rnum >> 20) & 0xF); 1169 u8 seed6 = static_cast<u8>((rnum >> 20) & 0xF);
1188 uint8_t seed7 = static_cast<uint8_t>((rnum >> 24) & 0xF); 1170 u8 seed7 = static_cast<u8>((rnum >> 24) & 0xF);
1189 uint8_t seed8 = static_cast<uint8_t>((rnum >> 28) & 0xF); 1171 u8 seed8 = static_cast<u8>((rnum >> 28) & 0xF);
1190 uint8_t seed9 = static_cast<uint8_t>((rnum >> 18) & 0xF); 1172 u8 seed9 = static_cast<u8>((rnum >> 18) & 0xF);
1191 uint8_t seed10 = static_cast<uint8_t>((rnum >> 22) & 0xF); 1173 u8 seed10 = static_cast<u8>((rnum >> 22) & 0xF);
1192 uint8_t seed11 = static_cast<uint8_t>((rnum >> 26) & 0xF); 1174 u8 seed11 = static_cast<u8>((rnum >> 26) & 0xF);
1193 uint8_t seed12 = static_cast<uint8_t>(((rnum >> 30) | (rnum << 2)) & 0xF); 1175 u8 seed12 = static_cast<u8>(((rnum >> 30) | (rnum << 2)) & 0xF);
1194 1176
1195 seed1 = static_cast<uint8_t>(seed1 * seed1); 1177 seed1 = static_cast<u8>(seed1 * seed1);
1196 seed2 = static_cast<uint8_t>(seed2 * seed2); 1178 seed2 = static_cast<u8>(seed2 * seed2);
1197 seed3 = static_cast<uint8_t>(seed3 * seed3); 1179 seed3 = static_cast<u8>(seed3 * seed3);
1198 seed4 = static_cast<uint8_t>(seed4 * seed4); 1180 seed4 = static_cast<u8>(seed4 * seed4);
1199 seed5 = static_cast<uint8_t>(seed5 * seed5); 1181 seed5 = static_cast<u8>(seed5 * seed5);
1200 seed6 = static_cast<uint8_t>(seed6 * seed6); 1182 seed6 = static_cast<u8>(seed6 * seed6);
1201 seed7 = static_cast<uint8_t>(seed7 * seed7); 1183 seed7 = static_cast<u8>(seed7 * seed7);
1202 seed8 = static_cast<uint8_t>(seed8 * seed8); 1184 seed8 = static_cast<u8>(seed8 * seed8);
1203 seed9 = static_cast<uint8_t>(seed9 * seed9); 1185 seed9 = static_cast<u8>(seed9 * seed9);
1204 seed10 = static_cast<uint8_t>(seed10 * seed10); 1186 seed10 = static_cast<u8>(seed10 * seed10);
1205 seed11 = static_cast<uint8_t>(seed11 * seed11); 1187 seed11 = static_cast<u8>(seed11 * seed11);
1206 seed12 = static_cast<uint8_t>(seed12 * seed12); 1188 seed12 = static_cast<u8>(seed12 * seed12);
1207 1189
1208 int32_t sh1, sh2, sh3; 1190 s32 sh1, sh2, sh3;
1209 if (seed & 1) { 1191 if (seed & 1) {
1210 sh1 = (seed & 2) ? 4 : 5; 1192 sh1 = (seed & 2) ? 4 : 5;
1211 sh2 = (partitionCount == 3) ? 6 : 5; 1193 sh2 = (partitionCount == 3) ? 6 : 5;
@@ -1215,23 +1197,23 @@ static uint32_t SelectPartition(int32_t seed, int32_t x, int32_t y, int32_t z,
1215 } 1197 }
1216 sh3 = (seed & 0x10) ? sh1 : sh2; 1198 sh3 = (seed & 0x10) ? sh1 : sh2;
1217 1199
1218 seed1 = static_cast<uint8_t>(seed1 >> sh1); 1200 seed1 = static_cast<u8>(seed1 >> sh1);
1219 seed2 = static_cast<uint8_t>(seed2 >> sh2); 1201 seed2 = static_cast<u8>(seed2 >> sh2);
1220 seed3 = static_cast<uint8_t>(seed3 >> sh1); 1202 seed3 = static_cast<u8>(seed3 >> sh1);
1221 seed4 = static_cast<uint8_t>(seed4 >> sh2); 1203 seed4 = static_cast<u8>(seed4 >> sh2);
1222 seed5 = static_cast<uint8_t>(seed5 >> sh1); 1204 seed5 = static_cast<u8>(seed5 >> sh1);
1223 seed6 = static_cast<uint8_t>(seed6 >> sh2); 1205 seed6 = static_cast<u8>(seed6 >> sh2);
1224 seed7 = static_cast<uint8_t>(seed7 >> sh1); 1206 seed7 = static_cast<u8>(seed7 >> sh1);
1225 seed8 = static_cast<uint8_t>(seed8 >> sh2); 1207 seed8 = static_cast<u8>(seed8 >> sh2);
1226 seed9 = static_cast<uint8_t>(seed9 >> sh3); 1208 seed9 = static_cast<u8>(seed9 >> sh3);
1227 seed10 = static_cast<uint8_t>(seed10 >> sh3); 1209 seed10 = static_cast<u8>(seed10 >> sh3);
1228 seed11 = static_cast<uint8_t>(seed11 >> sh3); 1210 seed11 = static_cast<u8>(seed11 >> sh3);
1229 seed12 = static_cast<uint8_t>(seed12 >> sh3); 1211 seed12 = static_cast<u8>(seed12 >> sh3);
1230 1212
1231 int32_t a = seed1 * x + seed2 * y + seed11 * z + (rnum >> 14); 1213 s32 a = seed1 * x + seed2 * y + seed11 * z + (rnum >> 14);
1232 int32_t b = seed3 * x + seed4 * y + seed12 * z + (rnum >> 10); 1214 s32 b = seed3 * x + seed4 * y + seed12 * z + (rnum >> 10);
1233 int32_t c = seed5 * x + seed6 * y + seed9 * z + (rnum >> 6); 1215 s32 c = seed5 * x + seed6 * y + seed9 * z + (rnum >> 6);
1234 int32_t d = seed7 * x + seed8 * y + seed10 * z + (rnum >> 2); 1216 s32 d = seed7 * x + seed8 * y + seed10 * z + (rnum >> 2);
1235 1217
1236 a &= 0x3F; 1218 a &= 0x3F;
1237 b &= 0x3F; 1219 b &= 0x3F;
@@ -1252,27 +1234,26 @@ static uint32_t SelectPartition(int32_t seed, int32_t x, int32_t y, int32_t z,
1252 return 3; 1234 return 3;
1253} 1235}
1254 1236
1255static inline uint32_t Select2DPartition(int32_t seed, int32_t x, int32_t y, int32_t partitionCount, 1237static inline u32 Select2DPartition(s32 seed, s32 x, s32 y, s32 partitionCount, s32 smallBlock) {
1256 int32_t smallBlock) {
1257 return SelectPartition(seed, x, y, 0, partitionCount, smallBlock); 1238 return SelectPartition(seed, x, y, 0, partitionCount, smallBlock);
1258} 1239}
1259 1240
1260// Section C.2.14 1241// Section C.2.14
1261static void ComputeEndpoints(Pixel& ep1, Pixel& ep2, const uint32_t*& colorValues, 1242static void ComputeEndpos32s(Pixel& ep1, Pixel& ep2, const u32*& colorValues,
1262 uint32_t colorEndpointMode) { 1243 u32 colorEndpos32Mode) {
1263#define READ_UINT_VALUES(N) \ 1244#define READ_UINT_VALUES(N) \
1264 uint32_t v[N]; \ 1245 u32 v[N]; \
1265 for (uint32_t i = 0; i < N; i++) { \ 1246 for (u32 i = 0; i < N; i++) { \
1266 v[i] = *(colorValues++); \ 1247 v[i] = *(colorValues++); \
1267 } 1248 }
1268 1249
1269#define READ_INT_VALUES(N) \ 1250#define READ_INT_VALUES(N) \
1270 int32_t v[N]; \ 1251 s32 v[N]; \
1271 for (uint32_t i = 0; i < N; i++) { \ 1252 for (u32 i = 0; i < N; i++) { \
1272 v[i] = static_cast<int32_t>(*(colorValues++)); \ 1253 v[i] = static_cast<s32>(*(colorValues++)); \
1273 } 1254 }
1274 1255
1275 switch (colorEndpointMode) { 1256 switch (colorEndpos32Mode) {
1276 case 0: { 1257 case 0: {
1277 READ_UINT_VALUES(2) 1258 READ_UINT_VALUES(2)
1278 ep1 = Pixel(0xFF, v[0], v[0], v[0]); 1259 ep1 = Pixel(0xFF, v[0], v[0], v[0]);
@@ -1281,8 +1262,8 @@ static void ComputeEndpoints(Pixel& ep1, Pixel& ep2, const uint32_t*& colorValue
1281 1262
1282 case 1: { 1263 case 1: {
1283 READ_UINT_VALUES(2) 1264 READ_UINT_VALUES(2)
1284 uint32_t L0 = (v[0] >> 2) | (v[1] & 0xC0); 1265 u32 L0 = (v[0] >> 2) | (v[1] & 0xC0);
1285 uint32_t L1 = std::max(L0 + (v[1] & 0x3F), 0xFFU); 1266 u32 L1 = std::max(L0 + (v[1] & 0x3F), 0xFFU);
1286 ep1 = Pixel(0xFF, L0, L0, L0); 1267 ep1 = Pixel(0xFF, L0, L0, L0);
1287 ep2 = Pixel(0xFF, L1, L1, L1); 1268 ep2 = Pixel(0xFF, L1, L1, L1);
1288 } break; 1269 } break;
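Mode 1 above is the LDR luminance base-plus-offset endpoint mode. Below is a hedged sketch of how the two decoded color values unpack according to the ASTC specification, which clamps the offset result so it never exceeds 0xFF; LuminanceDelta is an illustrative name, not code from the diff.

#include <algorithm>
#include <cassert>
#include <cstdint>

static void LuminanceDelta(std::uint32_t v0, std::uint32_t v1, std::uint32_t& L0,
                           std::uint32_t& L1) {
    L0 = (v0 >> 2) | (v1 & 0xC0);           // high bits of the base come from the second value
    L1 = std::min(L0 + (v1 & 0x3F), 0xFFu); // base plus 6-bit offset, clamped upward
}

int main() {
    std::uint32_t L0 = 0, L1 = 0;
    LuminanceDelta(0x80, 0x10, L0, L1);
    assert(L0 == 0x20 && L1 == 0x30); // base 0x20, offset 0x10
    LuminanceDelta(0xFF, 0xFF, L0, L1);
    assert(L0 == 0xFF && L1 == 0xFF); // saturated case
}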
@@ -1371,7 +1352,7 @@ static void ComputeEndpoints(Pixel& ep1, Pixel& ep2, const uint32_t*& colorValue
1371 } break; 1352 } break;
1372 1353
1373 default: 1354 default:
1374 assert(!"Unsupported color endpoint mode (is it HDR?)"); 1355 assert(false && "Unsupported color endpoint mode (is it HDR?)");
1375 break; 1356 break;
1376 } 1357 }
1377 1358
@@ -1379,14 +1360,14 @@ static void ComputeEndpoints(Pixel& ep1, Pixel& ep2, const uint32_t*& colorValue
1379#undef READ_INT_VALUES 1360#undef READ_INT_VALUES
1380} 1361}
1381 1362
1382static void DecompressBlock(const uint8_t inBuf[16], const uint32_t blockWidth, 1363static void DecompressBlock(const u8 inBuf[16], const u32 blockWidth, const u32 blockHeight,
1383 const uint32_t blockHeight, uint32_t* outBuf) { 1364 u32* outBuf) {
1384 InputBitStream strm(inBuf); 1365 InputBitStream strm(inBuf);
1385 TexelWeightParams weightParams = DecodeBlockInfo(strm); 1366 TexelWeightParams weightParams = DecodeBlockInfo(strm);
1386 1367
1387 // Was there an error? 1368 // Was there an error?
1388 if (weightParams.m_bError) { 1369 if (weightParams.m_bError) {
1389 assert(!"Invalid block mode"); 1370 assert(false && "Invalid block mode");
1390 FillError(outBuf, blockWidth, blockHeight); 1371 FillError(outBuf, blockWidth, blockHeight);
1391 return; 1372 return;
1392 } 1373 }
@@ -1397,63 +1378,63 @@ static void DecompressBlock(const uint8_t inBuf[16], const uint32_t blockWidth,
1397 } 1378 }
1398 1379
1399 if (weightParams.m_bVoidExtentHDR) { 1380 if (weightParams.m_bVoidExtentHDR) {
1400 assert(!"HDR void extent blocks are unsupported!"); 1381 assert(false && "HDR void extent blocks are unsupported!");
1401 FillError(outBuf, blockWidth, blockHeight); 1382 FillError(outBuf, blockWidth, blockHeight);
1402 return; 1383 return;
1403 } 1384 }
1404 1385
1405 if (weightParams.m_Width > blockWidth) { 1386 if (weightParams.m_Width > blockWidth) {
1406 assert(!"Texel weight grid width should be smaller than block width"); 1387 assert(false && "Texel weight grid width should be smaller than block width");
1407 FillError(outBuf, blockWidth, blockHeight); 1388 FillError(outBuf, blockWidth, blockHeight);
1408 return; 1389 return;
1409 } 1390 }
1410 1391
1411 if (weightParams.m_Height > blockHeight) { 1392 if (weightParams.m_Height > blockHeight) {
1412 assert(!"Texel weight grid height should be smaller than block height"); 1393 assert(false && "Texel weight grid height should be smaller than block height");
1413 FillError(outBuf, blockWidth, blockHeight); 1394 FillError(outBuf, blockWidth, blockHeight);
1414 return; 1395 return;
1415 } 1396 }
1416 1397
1417 // Read num partitions 1398 // Read num partitions
1418 uint32_t nPartitions = strm.ReadBits(2) + 1; 1399 u32 nPartitions = strm.ReadBits<2>() + 1;
1419 assert(nPartitions <= 4); 1400 assert(nPartitions <= 4);
1420 1401
1421 if (nPartitions == 4 && weightParams.m_bDualPlane) { 1402 if (nPartitions == 4 && weightParams.m_bDualPlane) {
1422 assert(!"Dual plane mode is incompatible with four partition blocks"); 1403 assert(false && "Dual plane mode is incompatible with four partition blocks");
1423 FillError(outBuf, blockWidth, blockHeight); 1404 FillError(outBuf, blockWidth, blockHeight);
1424 return; 1405 return;
1425 } 1406 }
1426 1407
1427 // Based on the number of partitions, read the color endpoint mode for 1408 // Based on the number of partitions, read the color endpoint mode for
1428 // each partition. 1409 // each partition.
1429 1410
1430 // Determine partitions, partition index, and color endpoint modes 1411 // Determine partitions, partition index, and color endpoint modes
1431 int32_t planeIdx = -1; 1412 s32 planeIdx = -1;
1432 uint32_t partitionIndex; 1413 u32 partitionIndex;
1433 uint32_t colorEndpointMode[4] = {0, 0, 0, 0}; 1414 u32 colorEndpos32Mode[4] = {0, 0, 0, 0};
1434 1415
1435 // Define color data. 1416 // Define color data.
1436 uint8_t colorEndpointData[16]; 1417 u8 colorEndpos32Data[16];
1437 memset(colorEndpointData, 0, sizeof(colorEndpointData)); 1418 memset(colorEndpos32Data, 0, sizeof(colorEndpos32Data));
1438 OutputBitStream colorEndpointStream(colorEndpointData, 16 * 8, 0); 1419 OutputBitStream colorEndpos32Stream(colorEndpos32Data, 16 * 8, 0);
1439 1420
1440 // Read extra config data... 1421 // Read extra config data...
1441 uint32_t baseCEM = 0; 1422 u32 baseCEM = 0;
1442 if (nPartitions == 1) { 1423 if (nPartitions == 1) {
1443 colorEndpointMode[0] = strm.ReadBits(4); 1424 colorEndpos32Mode[0] = strm.ReadBits<4>();
1444 partitionIndex = 0; 1425 partitionIndex = 0;
1445 } else { 1426 } else {
1446 partitionIndex = strm.ReadBits(10); 1427 partitionIndex = strm.ReadBits<10>();
1447 baseCEM = strm.ReadBits(6); 1428 baseCEM = strm.ReadBits<6>();
1448 } 1429 }
1449 uint32_t baseMode = (baseCEM & 3); 1430 u32 baseMode = (baseCEM & 3);
1450 1431
1451 // Remaining bits are color endpoint data... 1432 // Remaining bits are color endpoint data...
1452 uint32_t nWeightBits = weightParams.GetPackedBitSize(); 1433 u32 nWeightBits = weightParams.GetPackedBitSize();
1453 int32_t remainingBits = 128 - nWeightBits - strm.GetBitsRead(); 1434 s32 remainingBits = 128 - nWeightBits - static_cast<s32>(strm.GetBitsRead());
1454 1435
1455 // Consider extra bits prior to texel data... 1436 // Consider extra bits prior to texel data...
1456 uint32_t extraCEMbits = 0; 1437 u32 extraCEMbits = 0;
1457 if (baseMode) { 1438 if (baseMode) {
1458 switch (nPartitions) { 1439 switch (nPartitions) {
1459 case 2: 1440 case 2:
@@ -1473,18 +1454,18 @@ static void DecompressBlock(const uint8_t inBuf[16], const uint32_t blockWidth,
1473 remainingBits -= extraCEMbits; 1454 remainingBits -= extraCEMbits;
1474 1455
1475 // Do we have a dual plane situation? 1456 // Do we have a dual plane situation?
1476 uint32_t planeSelectorBits = 0; 1457 u32 planeSelectorBits = 0;
1477 if (weightParams.m_bDualPlane) { 1458 if (weightParams.m_bDualPlane) {
1478 planeSelectorBits = 2; 1459 planeSelectorBits = 2;
1479 } 1460 }
1480 remainingBits -= planeSelectorBits; 1461 remainingBits -= planeSelectorBits;
1481 1462
1482 // Read color data... 1463 // Read color data...
1483 uint32_t colorDataBits = remainingBits; 1464 u32 colorDataBits = remainingBits;
1484 while (remainingBits > 0) { 1465 while (remainingBits > 0) {
1485 uint32_t nb = std::min(remainingBits, 8); 1466 u32 nb = std::min(remainingBits, 8);
1486 uint32_t b = strm.ReadBits(nb); 1467 u32 b = strm.ReadBits(nb);
1487 colorEndpointStream.WriteBits(b, nb); 1468 colorEndpos32Stream.WriteBits(b, nb);
1488 remainingBits -= 8; 1469 remainingBits -= 8;
1489 } 1470 }
1490 1471
@@ -1493,64 +1474,64 @@ static void DecompressBlock(const uint8_t inBuf[16], const uint32_t blockWidth,
1493 1474
1494 // Read the rest of the CEM 1475 // Read the rest of the CEM
1495 if (baseMode) { 1476 if (baseMode) {
1496 uint32_t extraCEM = strm.ReadBits(extraCEMbits); 1477 u32 extraCEM = strm.ReadBits(extraCEMbits);
1497 uint32_t CEM = (extraCEM << 6) | baseCEM; 1478 u32 CEM = (extraCEM << 6) | baseCEM;
1498 CEM >>= 2; 1479 CEM >>= 2;
1499 1480
1500 bool C[4] = {0}; 1481 bool C[4] = {0};
1501 for (uint32_t i = 0; i < nPartitions; i++) { 1482 for (u32 i = 0; i < nPartitions; i++) {
1502 C[i] = CEM & 1; 1483 C[i] = CEM & 1;
1503 CEM >>= 1; 1484 CEM >>= 1;
1504 } 1485 }
1505 1486
1506 uint8_t M[4] = {0}; 1487 u8 M[4] = {0};
1507 for (uint32_t i = 0; i < nPartitions; i++) { 1488 for (u32 i = 0; i < nPartitions; i++) {
1508 M[i] = CEM & 3; 1489 M[i] = CEM & 3;
1509 CEM >>= 2; 1490 CEM >>= 2;
1510 assert(M[i] <= 3); 1491 assert(M[i] <= 3);
1511 } 1492 }
1512 1493
1513 for (uint32_t i = 0; i < nPartitions; i++) { 1494 for (u32 i = 0; i < nPartitions; i++) {
1514 colorEndpointMode[i] = baseMode; 1495 colorEndpos32Mode[i] = baseMode;
1515 if (!(C[i])) 1496 if (!(C[i]))
1516 colorEndpointMode[i] -= 1; 1497 colorEndpos32Mode[i] -= 1;
1517 colorEndpointMode[i] <<= 2; 1498 colorEndpos32Mode[i] <<= 2;
1518 colorEndpointMode[i] |= M[i]; 1499 colorEndpos32Mode[i] |= M[i];
1519 } 1500 }
1520 } else if (nPartitions > 1) { 1501 } else if (nPartitions > 1) {
1521 uint32_t CEM = baseCEM >> 2; 1502 u32 CEM = baseCEM >> 2;
1522 for (uint32_t i = 0; i < nPartitions; i++) { 1503 for (u32 i = 0; i < nPartitions; i++) {
1523 colorEndpointMode[i] = CEM; 1504 colorEndpos32Mode[i] = CEM;
1524 } 1505 }
1525 } 1506 }
1526 1507
1527 // Make sure everything up till here is sane. 1508 // Make sure everything up till here is sane.
1528 for (uint32_t i = 0; i < nPartitions; i++) { 1509 for (u32 i = 0; i < nPartitions; i++) {
1529 assert(colorEndpointMode[i] < 16); 1510 assert(colorEndpos32Mode[i] < 16);
1530 } 1511 }
1531 assert(strm.GetBitsRead() + weightParams.GetPackedBitSize() == 128); 1512 assert(strm.GetBitsRead() + weightParams.GetPackedBitSize() == 128);
1532 1513
1533 // Decode both color data and texel weight data 1514 // Decode both color data and texel weight data
1534 uint32_t colorValues[32]; // Four values, two endpoints, four maximum partitions 1515 u32 colorValues[32]; // Four values, two endpoints, four maximum partitions
1535 DecodeColorValues(colorValues, colorEndpointData, colorEndpointMode, nPartitions, 1516 DecodeColorValues(colorValues, colorEndpos32Data, colorEndpos32Mode, nPartitions,
1536 colorDataBits); 1517 colorDataBits);
1537 1518
1538 Pixel endpoints[4][2]; 1519 Pixel endpos32s[4][2];
1539 const uint32_t* colorValuesPtr = colorValues; 1520 const u32* colorValuesPtr = colorValues;
1540 for (uint32_t i = 0; i < nPartitions; i++) { 1521 for (u32 i = 0; i < nPartitions; i++) {
1541 ComputeEndpoints(endpoints[i][0], endpoints[i][1], colorValuesPtr, colorEndpointMode[i]); 1522 ComputeEndpos32s(endpos32s[i][0], endpos32s[i][1], colorValuesPtr, colorEndpos32Mode[i]);
1542 } 1523 }
1543 1524
1544 // Read the texel weight data.. 1525 // Read the texel weight data..
1545 uint8_t texelWeightData[16]; 1526 u8 texelWeightData[16];
1546 memcpy(texelWeightData, inBuf, sizeof(texelWeightData)); 1527 memcpy(texelWeightData, inBuf, sizeof(texelWeightData));
1547 1528
1548 // Reverse everything 1529 // Reverse everything
1549 for (uint32_t i = 0; i < 8; i++) { 1530 for (u32 i = 0; i < 8; i++) {
1550// Taken from http://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64Bits 1531// Taken from http://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64Bits
1551#define REVERSE_BYTE(b) (((b)*0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32 1532#define REVERSE_BYTE(b) (((b)*0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32
1552 unsigned char a = static_cast<unsigned char>(REVERSE_BYTE(texelWeightData[i])); 1533 u8 a = static_cast<u8>(REVERSE_BYTE(texelWeightData[i]));
1553 unsigned char b = static_cast<unsigned char>(REVERSE_BYTE(texelWeightData[15 - i])); 1534 u8 b = static_cast<u8>(REVERSE_BYTE(texelWeightData[15 - i]));
1554#undef REVERSE_BYTE 1535#undef REVERSE_BYTE
1555 1536
1556 texelWeightData[i] = b; 1537 texelWeightData[i] = b;
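The REVERSE_BYTE macro above is the 4-operation byte reversal from the linked Stanford bit hacks page. Below is a standalone check, not part of the diff, comparing it against a naive bit-by-bit reversal for every possible byte value.

#include <cassert>
#include <cstdint>

static std::uint8_t NaiveReverse(std::uint8_t b) {
    std::uint8_t r = 0;
    for (int i = 0; i < 8; ++i) {
        r = static_cast<std::uint8_t>((r << 1) | ((b >> i) & 1));
    }
    return r;
}

int main() {
    for (unsigned b = 0; b < 256; ++b) {
        const std::uint8_t fast = static_cast<std::uint8_t>(
            ((b * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32);
        assert(fast == NaiveReverse(static_cast<std::uint8_t>(b)));
    }
}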
@@ -1558,50 +1539,51 @@ static void DecompressBlock(const uint8_t inBuf[16], const uint32_t blockWidth,
1558 } 1539 }
1559 1540
1560 // Make sure that higher non-texel bits are set to zero 1541 // Make sure that higher non-texel bits are set to zero
1561 const uint32_t clearByteStart = (weightParams.GetPackedBitSize() >> 3) + 1; 1542 const u32 clearByteStart = (weightParams.GetPackedBitSize() >> 3) + 1;
1562 texelWeightData[clearByteStart - 1] = 1543 texelWeightData[clearByteStart - 1] =
1563 texelWeightData[clearByteStart - 1] & 1544 texelWeightData[clearByteStart - 1] &
1564 static_cast<uint8_t>((1 << (weightParams.GetPackedBitSize() % 8)) - 1); 1545 static_cast<u8>((1 << (weightParams.GetPackedBitSize() % 8)) - 1);
1565 memset(texelWeightData + clearByteStart, 0, 16 - clearByteStart); 1546 memset(texelWeightData + clearByteStart, 0, 16 - clearByteStart);
1566 1547
1567 std::vector<IntegerEncodedValue> texelWeightValues; 1548 std::vector<IntegerEncodedValue> texelWeightValues;
1549 texelWeightValues.reserve(64);
1550
1568 InputBitStream weightStream(texelWeightData); 1551 InputBitStream weightStream(texelWeightData);
1569 1552
1570 IntegerEncodedValue::DecodeIntegerSequence(texelWeightValues, weightStream, 1553 DecodeIntegerSequence(texelWeightValues, weightStream, weightParams.m_MaxWeight,
1571 weightParams.m_MaxWeight, 1554 weightParams.GetNumWeightValues());
1572 weightParams.GetNumWeightValues());
1573 1555
1574 // Blocks can be at most 12x12, so we can have as many as 144 weights 1556 // Blocks can be at most 12x12, so we can have as many as 144 weights
1575 uint32_t weights[2][144]; 1557 u32 weights[2][144];
1576 UnquantizeTexelWeights(weights, texelWeightValues, weightParams, blockWidth, blockHeight); 1558 UnquantizeTexelWeights(weights, texelWeightValues, weightParams, blockWidth, blockHeight);
1577 1559
1578 // Now that we have endpoints and weights, we can interpolate and generate 1560 // Now that we have endpoints and weights, we can interpolate and generate
1579 // the proper decoding... 1561 // the proper decoding...
1580 for (uint32_t j = 0; j < blockHeight; j++) 1562 for (u32 j = 0; j < blockHeight; j++)
1581 for (uint32_t i = 0; i < blockWidth; i++) { 1563 for (u32 i = 0; i < blockWidth; i++) {
1582 uint32_t partition = Select2DPartition(partitionIndex, i, j, nPartitions, 1564 u32 partition = Select2DPartition(partitionIndex, i, j, nPartitions,
1583 (blockHeight * blockWidth) < 32); 1565 (blockHeight * blockWidth) < 32);
1584 assert(partition < nPartitions); 1566 assert(partition < nPartitions);
1585 1567
1586 Pixel p; 1568 Pixel p;
1587 for (uint32_t c = 0; c < 4; c++) { 1569 for (u32 c = 0; c < 4; c++) {
1588 uint32_t C0 = endpoints[partition][0].Component(c); 1570 u32 C0 = endpos32s[partition][0].Component(c);
1589 C0 = Replicate(C0, 8, 16); 1571 C0 = Replicate(C0, 8, 16);
1590 uint32_t C1 = endpoints[partition][1].Component(c); 1572 u32 C1 = endpos32s[partition][1].Component(c);
1591 C1 = Replicate(C1, 8, 16); 1573 C1 = Replicate(C1, 8, 16);
1592 1574
1593 uint32_t plane = 0; 1575 u32 plane = 0;
1594 if (weightParams.m_bDualPlane && (((planeIdx + 1) & 3) == c)) { 1576 if (weightParams.m_bDualPlane && (((planeIdx + 1) & 3) == c)) {
1595 plane = 1; 1577 plane = 1;
1596 } 1578 }
1597 1579
1598 uint32_t weight = weights[plane][j * blockWidth + i]; 1580 u32 weight = weights[plane][j * blockWidth + i];
1599 uint32_t C = (C0 * (64 - weight) + C1 * weight + 32) / 64; 1581 u32 C = (C0 * (64 - weight) + C1 * weight + 32) / 64;
1600 if (C == 65535) { 1582 if (C == 65535) {
1601 p.Component(c) = 255; 1583 p.Component(c) = 255;
1602 } else { 1584 } else {
1603 double Cf = static_cast<double>(C); 1585 double Cf = static_cast<double>(C);
1604 p.Component(c) = static_cast<uint16_t>(255.0 * (Cf / 65536.0) + 0.5); 1586 p.Component(c) = static_cast<u16>(255.0 * (Cf / 65536.0) + 0.5);
1605 } 1587 }
1606 } 1588 }
1607 1589
@@ -1613,26 +1595,26 @@ static void DecompressBlock(const uint8_t inBuf[16], const uint32_t blockWidth,
1613 1595
1614namespace Tegra::Texture::ASTC { 1596namespace Tegra::Texture::ASTC {
1615 1597
1616std::vector<uint8_t> Decompress(const uint8_t* data, uint32_t width, uint32_t height, 1598std::vector<u8> Decompress(const u8* data, u32 width, u32 height, u32 depth, u32 block_width,
1617 uint32_t depth, uint32_t block_width, uint32_t block_height) { 1599 u32 block_height) {
1618 uint32_t blockIdx = 0; 1600 u32 blockIdx = 0;
1619 std::size_t depth_offset = 0; 1601 std::size_t depth_offset = 0;
1620 std::vector<uint8_t> outData(height * width * depth * 4); 1602 std::vector<u8> outData(height * width * depth * 4);
1621 for (uint32_t k = 0; k < depth; k++) { 1603 for (u32 k = 0; k < depth; k++) {
1622 for (uint32_t j = 0; j < height; j += block_height) { 1604 for (u32 j = 0; j < height; j += block_height) {
1623 for (uint32_t i = 0; i < width; i += block_width) { 1605 for (u32 i = 0; i < width; i += block_width) {
1624 1606
1625 const uint8_t* blockPtr = data + blockIdx * 16; 1607 const u8* blockPtr = data + blockIdx * 16;
1626 1608
1627 // Blocks can be at most 12x12 1609 // Blocks can be at most 12x12
1628 uint32_t uncompData[144]; 1610 u32 uncompData[144];
1629 ASTCC::DecompressBlock(blockPtr, block_width, block_height, uncompData); 1611 ASTCC::DecompressBlock(blockPtr, block_width, block_height, uncompData);
1630 1612
1631 uint32_t decompWidth = std::min(block_width, width - i); 1613 u32 decompWidth = std::min(block_width, width - i);
1632 uint32_t decompHeight = std::min(block_height, height - j); 1614 u32 decompHeight = std::min(block_height, height - j);
1633 1615
1634 uint8_t* outRow = depth_offset + outData.data() + (j * width + i) * 4; 1616 u8* outRow = depth_offset + outData.data() + (j * width + i) * 4;
1635 for (uint32_t jj = 0; jj < decompHeight; jj++) { 1617 for (u32 jj = 0; jj < decompHeight; jj++) {
1636 memcpy(outRow + jj * width * 4, uncompData + jj * block_width, decompWidth * 4); 1618 memcpy(outRow + jj * width * 4, uncompData + jj * block_width, decompWidth * 4);
1637 } 1619 }
1638 1620
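A hedged usage sketch of the new Decompress signature. The declaration below spells out the u8/u32 aliases with <cstdint> types, assumes linking against video_core, and the 64x64 texture with 8x8 blocks is an illustrative choice, not something taken from the diff.

#include <cstdint>
#include <vector>

namespace Tegra::Texture::ASTC {
std::vector<std::uint8_t> Decompress(const std::uint8_t* data, std::uint32_t width,
                                     std::uint32_t height, std::uint32_t depth,
                                     std::uint32_t block_width, std::uint32_t block_height);
}

std::vector<std::uint8_t> DecodeExample(const std::vector<std::uint8_t>& astc_data) {
    // A 64x64 RGBA8 surface stored as 8x8 ASTC blocks: 8 * 8 = 64 blocks of 16 bytes each,
    // so astc_data is expected to hold at least 1024 bytes. The returned buffer holds
    // width * height * depth * 4 bytes of RGBA8 texels.
    return Tegra::Texture::ASTC::Decompress(astc_data.data(), 64, 64, 1, 8, 8);
}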
diff --git a/src/yuzu/configuration/config.cpp b/src/yuzu/configuration/config.cpp
index c38860628..e26af33b3 100644
--- a/src/yuzu/configuration/config.cpp
+++ b/src/yuzu/configuration/config.cpp
@@ -698,6 +698,8 @@ void Config::ReadSystemValues() {
698 Settings::values.custom_rtc = std::nullopt; 698 Settings::values.custom_rtc = std::nullopt;
699 } 699 }
700 700
701 Settings::values.sound_index = ReadSetting(QStringLiteral("sound_index"), 1).toInt();
702
701 qt_config->endGroup(); 703 qt_config->endGroup();
702} 704}
703 705
@@ -1125,6 +1127,8 @@ void Config::SaveSystemValues() {
1125 Settings::values.custom_rtc.value_or(std::chrono::seconds{}).count()), 1127 Settings::values.custom_rtc.value_or(std::chrono::seconds{}).count()),
1126 0); 1128 0);
1127 1129
1130 WriteSetting(QStringLiteral("sound_index"), Settings::values.sound_index, 1);
1131
1128 qt_config->endGroup(); 1132 qt_config->endGroup();
1129} 1133}
1130 1134
diff --git a/src/yuzu/configuration/configure_system.cpp b/src/yuzu/configuration/configure_system.cpp
index e1b52f8d9..f9a5b4fbe 100644
--- a/src/yuzu/configuration/configure_system.cpp
+++ b/src/yuzu/configuration/configure_system.cpp
@@ -56,6 +56,7 @@ void ConfigureSystem::SetConfiguration() {
56 enabled = !Core::System::GetInstance().IsPoweredOn(); 56 enabled = !Core::System::GetInstance().IsPoweredOn();
57 57
58 ui->combo_language->setCurrentIndex(Settings::values.language_index); 58 ui->combo_language->setCurrentIndex(Settings::values.language_index);
59 ui->combo_sound->setCurrentIndex(Settings::values.sound_index);
59 60
60 ui->rng_seed_checkbox->setChecked(Settings::values.rng_seed.has_value()); 61 ui->rng_seed_checkbox->setChecked(Settings::values.rng_seed.has_value());
61 ui->rng_seed_edit->setEnabled(Settings::values.rng_seed.has_value()); 62 ui->rng_seed_edit->setEnabled(Settings::values.rng_seed.has_value());
@@ -81,6 +82,7 @@ void ConfigureSystem::ApplyConfiguration() {
81 } 82 }
82 83
83 Settings::values.language_index = ui->combo_language->currentIndex(); 84 Settings::values.language_index = ui->combo_language->currentIndex();
85 Settings::values.sound_index = ui->combo_sound->currentIndex();
84 86
85 if (ui->rng_seed_checkbox->isChecked()) { 87 if (ui->rng_seed_checkbox->isChecked()) {
86 Settings::values.rng_seed = ui->rng_seed_edit->text().toULongLong(nullptr, 16); 88 Settings::values.rng_seed = ui->rng_seed_edit->text().toULongLong(nullptr, 16);
diff --git a/src/yuzu/loading_screen.cpp b/src/yuzu/loading_screen.cpp
index 4f2bfab48..2a6483370 100644
--- a/src/yuzu/loading_screen.cpp
+++ b/src/yuzu/loading_screen.cpp
@@ -34,18 +34,6 @@ constexpr char PROGRESSBAR_STYLE_PREPARE[] = R"(
34QProgressBar {} 34QProgressBar {}
35QProgressBar::chunk {})"; 35QProgressBar::chunk {})";
36 36
37constexpr char PROGRESSBAR_STYLE_DECOMPILE[] = R"(
38QProgressBar {
39 background-color: black;
40 border: 2px solid white;
41 border-radius: 4px;
42 padding: 2px;
43}
44QProgressBar::chunk {
45 background-color: #0ab9e6;
46 width: 1px;
47})";
48
49constexpr char PROGRESSBAR_STYLE_BUILD[] = R"( 37constexpr char PROGRESSBAR_STYLE_BUILD[] = R"(
50QProgressBar { 38QProgressBar {
51 background-color: black; 39 background-color: black;
@@ -100,13 +88,11 @@ LoadingScreen::LoadingScreen(QWidget* parent)
100 88
101 stage_translations = { 89 stage_translations = {
102 {VideoCore::LoadCallbackStage::Prepare, tr("Loading...")}, 90 {VideoCore::LoadCallbackStage::Prepare, tr("Loading...")},
103 {VideoCore::LoadCallbackStage::Decompile, tr("Preparing Shaders %1 / %2")},
104 {VideoCore::LoadCallbackStage::Build, tr("Loading Shaders %1 / %2")}, 91 {VideoCore::LoadCallbackStage::Build, tr("Loading Shaders %1 / %2")},
105 {VideoCore::LoadCallbackStage::Complete, tr("Launching...")}, 92 {VideoCore::LoadCallbackStage::Complete, tr("Launching...")},
106 }; 93 };
107 progressbar_style = { 94 progressbar_style = {
108 {VideoCore::LoadCallbackStage::Prepare, PROGRESSBAR_STYLE_PREPARE}, 95 {VideoCore::LoadCallbackStage::Prepare, PROGRESSBAR_STYLE_PREPARE},
109 {VideoCore::LoadCallbackStage::Decompile, PROGRESSBAR_STYLE_DECOMPILE},
110 {VideoCore::LoadCallbackStage::Build, PROGRESSBAR_STYLE_BUILD}, 96 {VideoCore::LoadCallbackStage::Build, PROGRESSBAR_STYLE_BUILD},
111 {VideoCore::LoadCallbackStage::Complete, PROGRESSBAR_STYLE_COMPLETE}, 97 {VideoCore::LoadCallbackStage::Complete, PROGRESSBAR_STYLE_COMPLETE},
112 }; 98 };
@@ -192,8 +178,7 @@ void LoadingScreen::OnLoadProgress(VideoCore::LoadCallbackStage stage, std::size
192 } 178 }
193 179
194 // update labels and progress bar 180 // update labels and progress bar
195 if (stage == VideoCore::LoadCallbackStage::Decompile || 181 if (stage == VideoCore::LoadCallbackStage::Build) {
196 stage == VideoCore::LoadCallbackStage::Build) {
197 ui->stage->setText(stage_translations[stage].arg(value).arg(total)); 182 ui->stage->setText(stage_translations[stage].arg(value).arg(total));
198 } else { 183 } else {
199 ui->stage->setText(stage_translations[stage]); 184 ui->stage->setText(stage_translations[stage]);